Optimize surrogate decoding. (#894)
Use a slightly different approach to recognizing surrogate pairs, which can avoid some duplicate computations. Slight speedup measured locally when with `compile js` and `compile exe`.
diff --git a/pkgs/characters/CHANGELOG.md b/pkgs/characters/CHANGELOG.md
index 1992446..f8782bf 100644
--- a/pkgs/characters/CHANGELOG.md
+++ b/pkgs/characters/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 1.4.1-wip
+## 1.4.1
-- Run `dart format` with the new style.
+* Run `dart format` with the new style.
+* Performance improvement for non-BMP characters.
## 1.4.0
diff --git a/pkgs/characters/lib/src/characters_impl.dart b/pkgs/characters/lib/src/characters_impl.dart
index e869433..a901b51 100644
--- a/pkgs/characters/lib/src/characters_impl.dart
+++ b/pkgs/characters/lib/src/characters_impl.dart
@@ -509,15 +509,16 @@
var index = _end;
while (index < _string.length) {
var char = _string.codeUnitAt(index);
+ var surrogate = char ^ 0xD800;
var category = categoryControl;
var nextIndex = index + 1;
- if (char & 0xFC00 != 0xD800) {
+ if (surrogate > 0x3FF) {
category = low(char);
} else if (nextIndex < _string.length) {
- var nextChar = _string.codeUnitAt(nextIndex);
- if (nextChar & 0xFC00 == 0xDC00) {
+ var nextSurrogate = _string.codeUnitAt(nextIndex) ^ 0xDC00;
+ if (nextSurrogate <= 0x3FF) {
nextIndex += 1;
- category = high(char, nextChar);
+ category = high(surrogate, nextSurrogate);
}
}
state = move(state, category);
diff --git a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart
index 4e8c9e8..ec373d7 100644
--- a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart
+++ b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart
@@ -76,16 +76,17 @@
void step() {
assert(cursor < end);
var char = base.codeUnitAt(cursor++);
- if (char & 0xFC00 != 0xD800) {
+ var surrogate = char ^ 0xD800;
+ if (surrogate > 0x3FF) {
state = move(state, low(char));
return;
}
// The category of an unpaired lead surrogate is Control.
int category;
- int nextChar;
+ int nextSurrogate;
if (cursor < end &&
- (nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
- category = high(char, nextChar);
+ (nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
+ category = high(surrogate, nextSurrogate);
cursor++;
} else {
category = categoryControl;
@@ -112,28 +113,33 @@
}
var cursorBefore = cursor - 1;
var prevChar = base.codeUnitAt(cursorBefore);
- int prevCategory;
- if (prevChar & 0xF800 != 0xD800) {
+ var prevSurrogate = prevChar ^ 0xD800;
+ if (prevSurrogate > 0x7FF) {
// Not surrogate.
- prevCategory = low(prevChar);
- } else if (prevChar & 0xFC00 == 0xD800) {
- // Lead surrogate. Check for a following tail surrogate.
- int tailChar;
- if (cursor < end &&
- (tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
- cursor += 1;
- prevCategory = high(prevChar, tailChar);
+ var prevCategory = low(prevChar);
+ state = move(stateCAny, prevCategory);
+ return cursorBefore;
+ }
+ int prevCategory;
+ if (prevSurrogate > 0x3FF) {
+ // Tail surrogate, check for prior lead surrogate.
+ int leadSurrogate;
+ var leadIndex = cursorBefore - 1;
+ prevSurrogate &= 0x3FF;
+ if (leadIndex >= start &&
+ (leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) {
+ prevCategory = high(leadSurrogate, prevSurrogate);
+ cursorBefore = leadIndex;
} else {
prevCategory = categoryControl;
}
} else {
- // Tail surrogate, check for prior lead surrogate.
- int leadChar;
- var leadIndex = cursorBefore - 1;
- if (leadIndex >= start &&
- (leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) {
- prevCategory = high(leadChar, prevChar);
- cursorBefore = leadIndex;
+ // Lead surrogate. Check for a following tail surrogate.
+ int tailSurrogate;
+ if (cursor < end &&
+ (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
+ cursor += 1;
+ prevCategory = high(prevSurrogate, tailSurrogate);
} else {
prevCategory = categoryControl;
}
@@ -206,7 +212,8 @@
void step() {
assert(cursor > start);
var char = base.codeUnitAt(--cursor);
- if (char & 0xFC00 != 0xDC00) {
+ var surrogate = char ^ 0xDC00;
+ if (surrogate > 0x3FF) {
var category = low(char);
state = moveBack(state, category);
return;
@@ -214,10 +221,10 @@
// Found tail surrogate, check for prior lead surrogate.
// The category of an unpaired tail surrogate is Control.
int category;
- int prevChar;
+ int prevSurrogate;
if (cursor >= start &&
- (prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) {
- category = high(prevChar, char);
+ (prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) {
+ category = high(prevSurrogate, surrogate);
} else {
category = categoryControl;
cursor++;
@@ -342,21 +349,23 @@
if (start < index && index < end) {
var cursorBefore = index;
var nextChar = text.codeUnitAt(index);
+ var nextSurrogate = nextChar ^ 0xD800;
var category = categoryControl;
- if (nextChar & 0xF800 != 0xD800) {
+ if (nextSurrogate > 0x7FF) {
category = low(nextChar);
- } else if (nextChar & 0xFC00 == 0xD800) {
+ } else if (nextSurrogate <= 0x3FF) {
var indexAfter = index + 1;
if (indexAfter < end) {
- var secondChar = text.codeUnitAt(indexAfter);
- if (secondChar & 0xFC00 == 0xDC00) {
- category = high(nextChar, secondChar);
+ var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00;
+ if (secondSurrogate <= 0x3FF) {
+ category = high(nextSurrogate, secondSurrogate);
}
}
} else {
- var prevChar = text.codeUnitAt(index - 1);
- if (prevChar & 0xFC00 == 0xD800) {
- category = high(prevChar, nextChar);
+ var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800;
+ nextSurrogate &= 0x3FF;
+ if (prevSurrogate <= 0x3FF) {
+ category = high(prevSurrogate, nextSurrogate);
cursorBefore -= 1;
}
}
diff --git a/pkgs/characters/lib/src/grapheme_clusters/table.dart b/pkgs/characters/lib/src/grapheme_clusters/table.dart
index fce9c85..79d3365 100644
--- a/pkgs/characters/lib/src/grapheme_clusters/table.dart
+++ b/pkgs/characters/lib/src/grapheme_clusters/table.dart
@@ -1130,6 +1130,7 @@
@pragma('vm:prefer-inline')
@pragma('wasm:prefer-inline')
int low(int codeUnit) {
+ assert(codeUnit <= 0xFFFF);
var chunkStart = _start.codeUnitAt(codeUnit >> 5);
var index = chunkStart + (codeUnit & 31);
return _data.codeUnitAt(index);
@@ -1139,10 +1140,11 @@
@pragma('vm:prefer-inline')
@pragma('wasm:prefer-inline')
int high(int lead, int tail) {
- var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8);
- var chunkStart = _start.codeUnitAt(offset >> 8);
- var index = chunkStart + (tail & 255);
- return _data.codeUnitAt(index);
+ assert(lead <= 0x3FF && tail <= 0x3FF);
+ var chunkIndex = (tail >> 8) + (lead << 2);
+ var byteIndex = tail & 255;
+ var chunkStart = _start.codeUnitAt(2048 + chunkIndex);
+ return _data.codeUnitAt(chunkStart + byteIndex);
}
const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey'
diff --git a/pkgs/characters/pubspec.yaml b/pkgs/characters/pubspec.yaml
index 0ab15fb..3d36c17 100644
--- a/pkgs/characters/pubspec.yaml
+++ b/pkgs/characters/pubspec.yaml
@@ -1,5 +1,5 @@
name: characters
-version: 1.4.1-wip
+version: 1.4.1
description: >-
String replacement with operations that are Unicode/grapheme cluster aware.
repository: https://github.com/dart-lang/core/tree/main/pkgs/characters
@@ -14,4 +14,4 @@
dev_dependencies:
dart_flutter_team_lints: ^3.1.0
- test: ^1.16.6
+ test: ^1.16.0
diff --git a/pkgs/characters/test/characters_test.dart b/pkgs/characters/test/characters_test.dart
index d259cbc..9f39961 100644
--- a/pkgs/characters/test/characters_test.dart
+++ b/pkgs/characters/test/characters_test.dart
@@ -115,28 +115,29 @@
var zwj = '\u200d'; // U+200D, ZWJ
var rainbow = '\u{1f308}'; // U+1F308, Rainbow. Category Pictogram
- var rbflag = '$flag$white$zwj$rainbow';
- var string = '-$rbflag-';
+ var rainbowFlag = '$flag$white$zwj$rainbow';
+ var string = '-$rainbowFlag-';
var range = CharacterRange.at(string, 1);
expect(range.isEmpty, true);
expect(range.moveNext(), true);
- expect(range.current, rbflag);
+ expect(range.current, rainbowFlag);
range = range = CharacterRange.at(string, 2);
expect(range.isEmpty, false);
- expect(range.current, rbflag);
+ expect(range.current, rainbowFlag);
range = range = CharacterRange.at(string, 0, 2);
expect(range.isEmpty, false);
- expect(range.current, '-$rbflag');
+ expect(range.current, '-$rainbowFlag');
range = range = CharacterRange.at(string, 0, 2);
expect(range.isEmpty, false);
- expect(range.current, '-$rbflag');
+ expect(range.current, '-$rainbowFlag');
- range = range = CharacterRange.at(string, 2, '-$rbflag'.length - 1);
+ range =
+ range = CharacterRange.at(string, 2, '-$rainbowFlag'.length - 1);
expect(range.isEmpty, false);
- expect(range.current, rbflag);
+ expect(range.current, rainbowFlag);
expect(range.stringBeforeLength, 1);
range = range = CharacterRange.at(string, 0, string.length);
diff --git a/pkgs/characters/test/src/unicode_tests.dart b/pkgs/characters/test/src/unicode_tests.dart
index 5c01dda..3fc50ee 100644
--- a/pkgs/characters/test/src/unicode_tests.dart
+++ b/pkgs/characters/test/src/unicode_tests.dart
@@ -31,7 +31,7 @@
int categoryOf(int codePoint) {
if (codePoint < 0x10000) return low(codePoint);
var nonBmpOffset = codePoint - 0x10000;
- return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff));
+ return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff);
}
String partCategories(List<String> parts) {
diff --git a/pkgs/characters/tool/benchmark.dart b/pkgs/characters/tool/benchmark.dart
index a77b811..2cf28cd 100644
--- a/pkgs/characters/tool/benchmark.dart
+++ b/pkgs/characters/tool/benchmark.dart
@@ -10,15 +10,14 @@
import '../test/src/various_tests.dart';
// Low-level benchmark of the grapheme cluster step functions.
+// Use ../benchmark/benchmark.dart for the more high-level `Characters`
+// methods.
void main(List<String> args) {
var count = 5;
if (args.isNotEmpty) {
count = int.parse(args[0]);
}
- var gcsf = 0;
- var gcsb = 0;
-
var text = genesis +
hangul +
genesis +
@@ -28,66 +27,94 @@
recJoin(zalgo);
var codeUnits = text.length;
var codePoints = text.runes.length;
- for (var i = 0; i < count; i++) {
- gcsf = benchForward(text, i, codePoints, codeUnits);
- gcsb = benchBackward(text, i, codePoints, codeUnits);
- }
- print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.');
- if (gcsf != gcsb) {
+ // Warmup.
+ var gcSumForward = benchForward(text, -1, codePoints, codeUnits, 150);
+ var gcSumBackwards = benchBackward(text, -1, codePoints, codeUnits, 150);
+ if (gcSumForward != gcSumBackwards) {
print(
'ERROR: Did not count the same number of grapheme clusters: '
- '$gcsf forward vs. $gcsb backward.',
+ '$gcSumForward forward vs. $gcSumBackwards backward.',
+ );
+ return;
+ }
+
+ for (var i = 0; i < count; i++) {
+ gcSumForward = benchForward(text, i, codePoints, codeUnits, 1500);
+ gcSumBackwards = benchBackward(text, i, codePoints, codeUnits, 1500);
+ }
+ print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.');
+ if (gcSumForward != gcSumBackwards) {
+ print(
+ 'ERROR: Did not count the same number of grapheme clusters: '
+ '$gcSumForward forward vs. $gcSumBackwards backward.',
);
} else {
- print('Total: $gcsf gc, $codePoints cp, $codeUnits cu');
- print('Avg ${(codePoints / gcsf).toStringAsFixed(3)} cp/gc');
- print('Avg ${(codeUnits / gcsf).toStringAsFixed(3)} cu/gc');
+ var surrogates = codeUnits - codePoints;
+ print(
+ 'Total: $gcSumForward gc, $codePoints cp, $codeUnits cu, '
+ '$surrogates surrogates '
+ '(${(surrogates / codePoints * 100).toStringAsFixed(3)}%)',
+ );
+ print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc');
+ print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc');
}
}
String recJoin(Iterable<List<String>> texts) =>
texts.map((x) => x.join('')).join('\n');
-int benchForward(String text, int i, int cp, int cu) {
+int benchForward(String text, int round, int cp, int cu, int limit) {
var n = 0;
+ var step = 10;
var gc = 0;
var e = 0;
var sw = Stopwatch()..start();
do {
- var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
- while (breaks.nextBreak() >= 0) {
- gc++;
+ for (var i = 0; i < step; i++) {
+ var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
+ while (breaks.nextBreak() >= 0) {
+ gc++;
+ }
}
e = sw.elapsedMilliseconds;
- n++;
- } while (e < 2000);
- print(
- 'Forward #$i: ${(gc / e).round()} gc/ms, '
- '${(n * cp / e).round()} cp/ms, '
- '${(n * cu / e).round()} cu/ms, '
- '$n rounds',
- );
+ n += step;
+ step += step;
+ } while (e < limit);
+ if (limit > 500) {
+ print(
+ 'Forward #$round: ${(gc / e).round()} gc/ms, '
+ '${(n * cp / e).round()} cp/ms, '
+ '${(n * cu / e).round()} cu/ms, '
+ '$n rounds in $e ms',
+ );
+ }
return gc ~/ n;
}
-int benchBackward(String text, int i, int cp, int cu) {
+int benchBackward(String text, int round, int cp, int cu, int limit) {
var n = 0;
+ var step = 10;
var gc = 0;
var e = 0;
var sw = Stopwatch()..start();
do {
- var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
- while (breaks.nextBreak() >= 0) {
- gc++;
+ for (var i = 0; i < step; i++) {
+ var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
+ while (breaks.nextBreak() >= 0) {
+ gc++;
+ }
}
e = sw.elapsedMilliseconds;
- n++;
- } while (e < 2000);
- print(
- 'Backward #$i: ${(gc / e).round()} gc/ms, '
- '${(n * cp / e).round()} cp/ms, '
- '${(n * cu / e).round()} cu/ms, '
- '$n rounds',
- );
+ n += step;
+ step += step;
+ } while (e < limit);
+ if (limit > 500) {
+ print(
+ 'Backward #$round: ${(gc / e).round()} gc/ms, '
+ '${(n * cp / e).round()} cp/ms, '
+ '${(n * cu / e).round()} cu/ms, '
+ '$n rounds in $e ms',
+ );
+ }
return gc ~/ n;
}
diff --git a/pkgs/characters/tool/bin/generate_tables.dart b/pkgs/characters/tool/bin/generate_tables.dart
index f348276..a829246 100644
--- a/pkgs/characters/tool/bin/generate_tables.dart
+++ b/pkgs/characters/tool/bin/generate_tables.dart
@@ -307,6 +307,7 @@
'''
$preferInline
int $name(int codeUnit) {
+ assert(codeUnit <= 0xFFFF);
var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1});
var index = chunkStart + (codeUnit & ${chunkSize - 1});
return $dataName.codeUnitAt(index);
@@ -320,27 +321,34 @@
int startOffset,
int chunkSize,
) {
- if (chunkSize == 1024) {
+ var shift = chunkSize.bitLength - 1;
+ assert(shift <= 10);
+ if (shift == 10) {
return '''
$preferInline
int $name(int lead, int tail) {
- var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead));
- var index = chunkStart + (0x3ff & tail);
- return $dataName.codeUnitAt(index);
+ assert(lead <= 0x3FF && tail <= 0x3FF);
+ var chunkStart = $startName.codeUnitAt($startOffset + lead);
+ return $dataName.codeUnitAt(chunkStart + tail);
}
''';
}
- var shift = chunkSize.bitLength - 1;
- var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
- return '''
+ if (shift < 10) {
+ return '''
$preferInline
int $name(int lead, int tail) {
- var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
- var chunkStart = $startName.codeUnitAt(offset >> $shift);
- var index = chunkStart + ($indexVar & ${chunkSize - 1});
- return $dataName.codeUnitAt(index);
+ assert(lead <= 0x3FF && tail <= 0x3FF);
+ var chunkIndex = (tail >> $shift) + (lead << ${10 - shift});
+ var byteIndex = tail & ${chunkSize - 1};
+ var chunkStart = $startName.codeUnitAt($startOffset + chunkIndex);
+ return $dataName.codeUnitAt(chunkStart + byteIndex);
}
''';
+ }
+ // Add code if shift > 10 ever becomes optimal for table size.
+ // Fx: chunkIndex = lead >> ${20 - shift};
+ // byteIndex = tail + ((lead & ${(chunkSize >> 10) - 1}) << 10);
+ throw UnimplementedError('No code for chunk sizes > 10 bits');
}
// -----------------------------------------------------------------------------
diff --git a/pkgs/characters/tool/src/grapheme_category_loader.dart b/pkgs/characters/tool/src/grapheme_category_loader.dart
index 1f436f8..2b5c177 100644
--- a/pkgs/characters/tool/src/grapheme_category_loader.dart
+++ b/pkgs/characters/tool/src/grapheme_category_loader.dart
@@ -291,9 +291,6 @@
}
// --------------------------------------------------------------------
-// TODO: Use a sparse table?
-// Likely not worth it.
-
/// Fixed length table for Unicode properties.
class UnicodePropertyTable {
static const int _unicodeCodePoints = 0x110000;
diff --git a/pkgs/characters/tool/src/string_literal_writer.dart b/pkgs/characters/tool/src/string_literal_writer.dart
index 9259b4e..96ca93f 100644
--- a/pkgs/characters/tool/src/string_literal_writer.dart
+++ b/pkgs/characters/tool/src/string_literal_writer.dart
@@ -43,12 +43,12 @@
/// Adds a single UTF-16 code unit.
void add(int codeUnit) {
// Always escape: `\n`, `\r`, `'`, `$` and `\`, plus anything the user wants.
- if (_escape(codeUnit) ||
- codeUnit == 0x24 ||
- codeUnit == 0x27 ||
- codeUnit == 0x5c ||
- codeUnit == 0x0a ||
- codeUnit == 0x0d) {
+ if (_escape(codeUnit) || // Anything the user wants encoded.
+ codeUnit == 0x24 /* $ */ ||
+ codeUnit == 0x27 /* ' */ ||
+ codeUnit == 0x5c /* \ */ ||
+ codeUnit == 0x0a /* \n */ ||
+ codeUnit == 0x0d /* \r */) {
_writeEscape(codeUnit);
return;
}
@@ -59,6 +59,9 @@
buffer.writeCharCode(codeUnit);
}
+ /// Writes an escape for the [codeUnit].
+ ///
+ /// Is only called for characters that need escaping.
void _writeEscape(int codeUnit) {
var replacement = _escapeCache[codeUnit];
if (replacement == null) {
@@ -83,9 +86,8 @@
replacement = r'\$';
} else if (codeUnit == "'".codeUnitAt(0)) {
replacement = r"\'";
- }
- if (codeUnit == r''.codeUnitAt(0)) {
- replacement = r'\';
+ } else if (codeUnit == r'\'.codeUnitAt(0)) {
+ replacement = r'\\';
} else {
replacement = r'\x' + codeUnit.toRadixString(16);
}