Optimize surrogate decoding. (#894)

Use a slightly different approach to recognizing surrogate pairs, which can avoid some duplicate computations. Slight speedup measured locally when with `compile js` and `compile exe`.
diff --git a/pkgs/characters/CHANGELOG.md b/pkgs/characters/CHANGELOG.md
index 1992446..f8782bf 100644
--- a/pkgs/characters/CHANGELOG.md
+++ b/pkgs/characters/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 1.4.1-wip
+## 1.4.1
 
-- Run `dart format` with the new style.
+* Run `dart format` with the new style.
+* Performance improvement for non-BMP characters.
 
 ## 1.4.0
 
diff --git a/pkgs/characters/lib/src/characters_impl.dart b/pkgs/characters/lib/src/characters_impl.dart
index e869433..a901b51 100644
--- a/pkgs/characters/lib/src/characters_impl.dart
+++ b/pkgs/characters/lib/src/characters_impl.dart
@@ -509,15 +509,16 @@
       var index = _end;
       while (index < _string.length) {
         var char = _string.codeUnitAt(index);
+        var surrogate = char ^ 0xD800;
         var category = categoryControl;
         var nextIndex = index + 1;
-        if (char & 0xFC00 != 0xD800) {
+        if (surrogate > 0x3FF) {
           category = low(char);
         } else if (nextIndex < _string.length) {
-          var nextChar = _string.codeUnitAt(nextIndex);
-          if (nextChar & 0xFC00 == 0xDC00) {
+          var nextSurrogate = _string.codeUnitAt(nextIndex) ^ 0xDC00;
+          if (nextSurrogate <= 0x3FF) {
             nextIndex += 1;
-            category = high(char, nextChar);
+            category = high(surrogate, nextSurrogate);
           }
         }
         state = move(state, category);
diff --git a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart
index 4e8c9e8..ec373d7 100644
--- a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart
+++ b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart
@@ -76,16 +76,17 @@
   void step() {
     assert(cursor < end);
     var char = base.codeUnitAt(cursor++);
-    if (char & 0xFC00 != 0xD800) {
+    var surrogate = char ^ 0xD800;
+    if (surrogate > 0x3FF) {
       state = move(state, low(char));
       return;
     }
     // The category of an unpaired lead surrogate is Control.
     int category;
-    int nextChar;
+    int nextSurrogate;
     if (cursor < end &&
-        (nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
-      category = high(char, nextChar);
+        (nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
+      category = high(surrogate, nextSurrogate);
       cursor++;
     } else {
       category = categoryControl;
@@ -112,28 +113,33 @@
     }
     var cursorBefore = cursor - 1;
     var prevChar = base.codeUnitAt(cursorBefore);
-    int prevCategory;
-    if (prevChar & 0xF800 != 0xD800) {
+    var prevSurrogate = prevChar ^ 0xD800;
+    if (prevSurrogate > 0x7FF) {
       // Not surrogate.
-      prevCategory = low(prevChar);
-    } else if (prevChar & 0xFC00 == 0xD800) {
-      // Lead surrogate. Check for a following tail surrogate.
-      int tailChar;
-      if (cursor < end &&
-          (tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
-        cursor += 1;
-        prevCategory = high(prevChar, tailChar);
+      var prevCategory = low(prevChar);
+      state = move(stateCAny, prevCategory);
+      return cursorBefore;
+    }
+    int prevCategory;
+    if (prevSurrogate > 0x3FF) {
+      // Tail surrogate, check for prior lead surrogate.
+      int leadSurrogate;
+      var leadIndex = cursorBefore - 1;
+      prevSurrogate &= 0x3FF;
+      if (leadIndex >= start &&
+          (leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) {
+        prevCategory = high(leadSurrogate, prevSurrogate);
+        cursorBefore = leadIndex;
       } else {
         prevCategory = categoryControl;
       }
     } else {
-      // Tail surrogate, check for prior lead surrogate.
-      int leadChar;
-      var leadIndex = cursorBefore - 1;
-      if (leadIndex >= start &&
-          (leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) {
-        prevCategory = high(leadChar, prevChar);
-        cursorBefore = leadIndex;
+      // Lead surrogate. Check for a following tail surrogate.
+      int tailSurrogate;
+      if (cursor < end &&
+          (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
+        cursor += 1;
+        prevCategory = high(prevSurrogate, tailSurrogate);
       } else {
         prevCategory = categoryControl;
       }
@@ -206,7 +212,8 @@
   void step() {
     assert(cursor > start);
     var char = base.codeUnitAt(--cursor);
-    if (char & 0xFC00 != 0xDC00) {
+    var surrogate = char ^ 0xDC00;
+    if (surrogate > 0x3FF) {
       var category = low(char);
       state = moveBack(state, category);
       return;
@@ -214,10 +221,10 @@
     // Found tail surrogate, check for prior lead surrogate.
     // The category of an unpaired tail surrogate is Control.
     int category;
-    int prevChar;
+    int prevSurrogate;
     if (cursor >= start &&
-        (prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) {
-      category = high(prevChar, char);
+        (prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) {
+      category = high(prevSurrogate, surrogate);
     } else {
       category = categoryControl;
       cursor++;
@@ -342,21 +349,23 @@
   if (start < index && index < end) {
     var cursorBefore = index;
     var nextChar = text.codeUnitAt(index);
+    var nextSurrogate = nextChar ^ 0xD800;
     var category = categoryControl;
-    if (nextChar & 0xF800 != 0xD800) {
+    if (nextSurrogate > 0x7FF) {
       category = low(nextChar);
-    } else if (nextChar & 0xFC00 == 0xD800) {
+    } else if (nextSurrogate <= 0x3FF) {
       var indexAfter = index + 1;
       if (indexAfter < end) {
-        var secondChar = text.codeUnitAt(indexAfter);
-        if (secondChar & 0xFC00 == 0xDC00) {
-          category = high(nextChar, secondChar);
+        var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00;
+        if (secondSurrogate <= 0x3FF) {
+          category = high(nextSurrogate, secondSurrogate);
         }
       }
     } else {
-      var prevChar = text.codeUnitAt(index - 1);
-      if (prevChar & 0xFC00 == 0xD800) {
-        category = high(prevChar, nextChar);
+      var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800;
+      nextSurrogate &= 0x3FF;
+      if (prevSurrogate <= 0x3FF) {
+        category = high(prevSurrogate, nextSurrogate);
         cursorBefore -= 1;
       }
     }
diff --git a/pkgs/characters/lib/src/grapheme_clusters/table.dart b/pkgs/characters/lib/src/grapheme_clusters/table.dart
index fce9c85..79d3365 100644
--- a/pkgs/characters/lib/src/grapheme_clusters/table.dart
+++ b/pkgs/characters/lib/src/grapheme_clusters/table.dart
@@ -1130,6 +1130,7 @@
 @pragma('vm:prefer-inline')
 @pragma('wasm:prefer-inline')
 int low(int codeUnit) {
+  assert(codeUnit <= 0xFFFF);
   var chunkStart = _start.codeUnitAt(codeUnit >> 5);
   var index = chunkStart + (codeUnit & 31);
   return _data.codeUnitAt(index);
@@ -1139,10 +1140,11 @@
 @pragma('vm:prefer-inline')
 @pragma('wasm:prefer-inline')
 int high(int lead, int tail) {
-  var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8);
-  var chunkStart = _start.codeUnitAt(offset >> 8);
-  var index = chunkStart + (tail & 255);
-  return _data.codeUnitAt(index);
+  assert(lead <= 0x3FF && tail <= 0x3FF);
+  var chunkIndex = (tail >> 8) + (lead << 2);
+  var byteIndex = tail & 255;
+  var chunkStart = _start.codeUnitAt(2048 + chunkIndex);
+  return _data.codeUnitAt(chunkStart + byteIndex);
 }
 
 const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey'
diff --git a/pkgs/characters/pubspec.yaml b/pkgs/characters/pubspec.yaml
index 0ab15fb..3d36c17 100644
--- a/pkgs/characters/pubspec.yaml
+++ b/pkgs/characters/pubspec.yaml
@@ -1,5 +1,5 @@
 name: characters
-version: 1.4.1-wip
+version: 1.4.1
 description: >-
   String replacement with operations that are Unicode/grapheme cluster aware.
 repository: https://github.com/dart-lang/core/tree/main/pkgs/characters
@@ -14,4 +14,4 @@
 
 dev_dependencies:
   dart_flutter_team_lints: ^3.1.0
-  test: ^1.16.6
+  test: ^1.16.0
diff --git a/pkgs/characters/test/characters_test.dart b/pkgs/characters/test/characters_test.dart
index d259cbc..9f39961 100644
--- a/pkgs/characters/test/characters_test.dart
+++ b/pkgs/characters/test/characters_test.dart
@@ -115,28 +115,29 @@
         var zwj = '\u200d'; // U+200D, ZWJ
         var rainbow = '\u{1f308}'; // U+1F308, Rainbow. Category Pictogram
 
-        var rbflag = '$flag$white$zwj$rainbow';
-        var string = '-$rbflag-';
+        var rainbowFlag = '$flag$white$zwj$rainbow';
+        var string = '-$rainbowFlag-';
         var range = CharacterRange.at(string, 1);
         expect(range.isEmpty, true);
         expect(range.moveNext(), true);
-        expect(range.current, rbflag);
+        expect(range.current, rainbowFlag);
 
         range = range = CharacterRange.at(string, 2);
         expect(range.isEmpty, false);
-        expect(range.current, rbflag);
+        expect(range.current, rainbowFlag);
 
         range = range = CharacterRange.at(string, 0, 2);
         expect(range.isEmpty, false);
-        expect(range.current, '-$rbflag');
+        expect(range.current, '-$rainbowFlag');
 
         range = range = CharacterRange.at(string, 0, 2);
         expect(range.isEmpty, false);
-        expect(range.current, '-$rbflag');
+        expect(range.current, '-$rainbowFlag');
 
-        range = range = CharacterRange.at(string, 2, '-$rbflag'.length - 1);
+        range =
+            range = CharacterRange.at(string, 2, '-$rainbowFlag'.length - 1);
         expect(range.isEmpty, false);
-        expect(range.current, rbflag);
+        expect(range.current, rainbowFlag);
         expect(range.stringBeforeLength, 1);
 
         range = range = CharacterRange.at(string, 0, string.length);
diff --git a/pkgs/characters/test/src/unicode_tests.dart b/pkgs/characters/test/src/unicode_tests.dart
index 5c01dda..3fc50ee 100644
--- a/pkgs/characters/test/src/unicode_tests.dart
+++ b/pkgs/characters/test/src/unicode_tests.dart
@@ -31,7 +31,7 @@
 int categoryOf(int codePoint) {
   if (codePoint < 0x10000) return low(codePoint);
   var nonBmpOffset = codePoint - 0x10000;
-  return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff));
+  return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff);
 }
 
 String partCategories(List<String> parts) {
diff --git a/pkgs/characters/tool/benchmark.dart b/pkgs/characters/tool/benchmark.dart
index a77b811..2cf28cd 100644
--- a/pkgs/characters/tool/benchmark.dart
+++ b/pkgs/characters/tool/benchmark.dart
@@ -10,15 +10,14 @@
 import '../test/src/various_tests.dart';
 
 // Low-level benchmark of the grapheme cluster step functions.
+// Use ../benchmark/benchmark.dart for the more high-level `Characters`
+// methods.
 
 void main(List<String> args) {
   var count = 5;
   if (args.isNotEmpty) {
     count = int.parse(args[0]);
   }
-  var gcsf = 0;
-  var gcsb = 0;
-
   var text = genesis +
       hangul +
       genesis +
@@ -28,66 +27,94 @@
       recJoin(zalgo);
   var codeUnits = text.length;
   var codePoints = text.runes.length;
-  for (var i = 0; i < count; i++) {
-    gcsf = benchForward(text, i, codePoints, codeUnits);
-    gcsb = benchBackward(text, i, codePoints, codeUnits);
-  }
-  print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.');
-  if (gcsf != gcsb) {
+  // Warmup.
+  var gcSumForward = benchForward(text, -1, codePoints, codeUnits, 150);
+  var gcSumBackwards = benchBackward(text, -1, codePoints, codeUnits, 150);
+  if (gcSumForward != gcSumBackwards) {
     print(
       'ERROR: Did not count the same number of grapheme clusters: '
-      '$gcsf forward vs. $gcsb backward.',
+      '$gcSumForward forward vs. $gcSumBackwards backward.',
+    );
+    return;
+  }
+
+  for (var i = 0; i < count; i++) {
+    gcSumForward = benchForward(text, i, codePoints, codeUnits, 1500);
+    gcSumBackwards = benchBackward(text, i, codePoints, codeUnits, 1500);
+  }
+  print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.');
+  if (gcSumForward != gcSumBackwards) {
+    print(
+      'ERROR: Did not count the same number of grapheme clusters: '
+      '$gcSumForward forward vs. $gcSumBackwards backward.',
     );
   } else {
-    print('Total: $gcsf gc, $codePoints cp, $codeUnits cu');
-    print('Avg ${(codePoints / gcsf).toStringAsFixed(3)} cp/gc');
-    print('Avg ${(codeUnits / gcsf).toStringAsFixed(3)} cu/gc');
+    var surrogates = codeUnits - codePoints;
+    print(
+      'Total: $gcSumForward gc, $codePoints cp, $codeUnits cu, '
+      '$surrogates surrogates '
+      '(${(surrogates / codePoints * 100).toStringAsFixed(3)}%)',
+    );
+    print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc');
+    print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc');
   }
 }
 
 String recJoin(Iterable<List<String>> texts) =>
     texts.map((x) => x.join('')).join('\n');
 
-int benchForward(String text, int i, int cp, int cu) {
+int benchForward(String text, int round, int cp, int cu, int limit) {
   var n = 0;
+  var step = 10;
   var gc = 0;
   var e = 0;
   var sw = Stopwatch()..start();
   do {
-    var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
-    while (breaks.nextBreak() >= 0) {
-      gc++;
+    for (var i = 0; i < step; i++) {
+      var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
+      while (breaks.nextBreak() >= 0) {
+        gc++;
+      }
     }
     e = sw.elapsedMilliseconds;
-    n++;
-  } while (e < 2000);
-  print(
-    'Forward  #$i: ${(gc / e).round()} gc/ms, '
-    '${(n * cp / e).round()} cp/ms, '
-    '${(n * cu / e).round()} cu/ms, '
-    '$n rounds',
-  );
+    n += step;
+    step += step;
+  } while (e < limit);
+  if (limit > 500) {
+    print(
+      'Forward  #$round: ${(gc / e).round()} gc/ms, '
+      '${(n * cp / e).round()} cp/ms, '
+      '${(n * cu / e).round()} cu/ms, '
+      '$n rounds in $e ms',
+    );
+  }
   return gc ~/ n;
 }
 
-int benchBackward(String text, int i, int cp, int cu) {
+int benchBackward(String text, int round, int cp, int cu, int limit) {
   var n = 0;
+  var step = 10;
   var gc = 0;
   var e = 0;
   var sw = Stopwatch()..start();
   do {
-    var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
-    while (breaks.nextBreak() >= 0) {
-      gc++;
+    for (var i = 0; i < step; i++) {
+      var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
+      while (breaks.nextBreak() >= 0) {
+        gc++;
+      }
     }
     e = sw.elapsedMilliseconds;
-    n++;
-  } while (e < 2000);
-  print(
-    'Backward #$i: ${(gc / e).round()} gc/ms, '
-    '${(n * cp / e).round()} cp/ms, '
-    '${(n * cu / e).round()} cu/ms, '
-    '$n rounds',
-  );
+    n += step;
+    step += step;
+  } while (e < limit);
+  if (limit > 500) {
+    print(
+      'Backward #$round: ${(gc / e).round()} gc/ms, '
+      '${(n * cp / e).round()} cp/ms, '
+      '${(n * cu / e).round()} cu/ms, '
+      '$n rounds in $e ms',
+    );
+  }
   return gc ~/ n;
 }
diff --git a/pkgs/characters/tool/bin/generate_tables.dart b/pkgs/characters/tool/bin/generate_tables.dart
index f348276..a829246 100644
--- a/pkgs/characters/tool/bin/generate_tables.dart
+++ b/pkgs/characters/tool/bin/generate_tables.dart
@@ -307,6 +307,7 @@
     '''
 $preferInline
 int $name(int codeUnit) {
+  assert(codeUnit <= 0xFFFF);
   var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1});
   var index = chunkStart + (codeUnit & ${chunkSize - 1});
   return $dataName.codeUnitAt(index);
@@ -320,27 +321,34 @@
   int startOffset,
   int chunkSize,
 ) {
-  if (chunkSize == 1024) {
+  var shift = chunkSize.bitLength - 1;
+  assert(shift <= 10);
+  if (shift == 10) {
     return '''
 $preferInline
 int $name(int lead, int tail) {
-  var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead));
-  var index = chunkStart + (0x3ff & tail);
-  return $dataName.codeUnitAt(index);
+  assert(lead <= 0x3FF && tail <= 0x3FF);
+  var chunkStart = $startName.codeUnitAt($startOffset + lead);
+  return $dataName.codeUnitAt(chunkStart + tail);
 }
 ''';
   }
-  var shift = chunkSize.bitLength - 1;
-  var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
-  return '''
+  if (shift < 10) {
+    return '''
 $preferInline
 int $name(int lead, int tail) {
-  var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
-  var chunkStart = $startName.codeUnitAt(offset >> $shift);
-  var index = chunkStart + ($indexVar & ${chunkSize - 1});
-  return $dataName.codeUnitAt(index);
+  assert(lead <= 0x3FF && tail <= 0x3FF);
+  var chunkIndex = (tail >> $shift) + (lead << ${10 - shift});
+  var byteIndex = tail & ${chunkSize - 1};
+  var chunkStart = $startName.codeUnitAt($startOffset + chunkIndex);
+  return $dataName.codeUnitAt(chunkStart + byteIndex);
 }
 ''';
+  }
+  // Add code if shift > 10 ever becomes optimal for table size.
+  // Fx: chunkIndex = lead >> ${20 - shift};
+  //     byteIndex = tail + ((lead & ${(chunkSize >> 10) - 1}) << 10);
+  throw UnimplementedError('No code for chunk sizes > 10 bits');
 }
 
 // -----------------------------------------------------------------------------
diff --git a/pkgs/characters/tool/src/grapheme_category_loader.dart b/pkgs/characters/tool/src/grapheme_category_loader.dart
index 1f436f8..2b5c177 100644
--- a/pkgs/characters/tool/src/grapheme_category_loader.dart
+++ b/pkgs/characters/tool/src/grapheme_category_loader.dart
@@ -291,9 +291,6 @@
 }
 
 // --------------------------------------------------------------------
-// TODO: Use a sparse table?
-// Likely not worth it.
-
 /// Fixed length table for Unicode properties.
 class UnicodePropertyTable {
   static const int _unicodeCodePoints = 0x110000;
diff --git a/pkgs/characters/tool/src/string_literal_writer.dart b/pkgs/characters/tool/src/string_literal_writer.dart
index 9259b4e..96ca93f 100644
--- a/pkgs/characters/tool/src/string_literal_writer.dart
+++ b/pkgs/characters/tool/src/string_literal_writer.dart
@@ -43,12 +43,12 @@
   /// Adds a single UTF-16 code unit.
   void add(int codeUnit) {
     // Always escape: `\n`, `\r`, `'`, `$` and `\`, plus anything the user wants.
-    if (_escape(codeUnit) ||
-        codeUnit == 0x24 ||
-        codeUnit == 0x27 ||
-        codeUnit == 0x5c ||
-        codeUnit == 0x0a ||
-        codeUnit == 0x0d) {
+    if (_escape(codeUnit) || // Anything the user wants encoded.
+        codeUnit == 0x24 /* $ */ ||
+        codeUnit == 0x27 /* ' */ ||
+        codeUnit == 0x5c /* \ */ ||
+        codeUnit == 0x0a /* \n */ ||
+        codeUnit == 0x0d /* \r */) {
       _writeEscape(codeUnit);
       return;
     }
@@ -59,6 +59,9 @@
     buffer.writeCharCode(codeUnit);
   }
 
+  /// Writes an escape for the [codeUnit].
+  ///
+  /// Is only called for characters that need escaping.
   void _writeEscape(int codeUnit) {
     var replacement = _escapeCache[codeUnit];
     if (replacement == null) {
@@ -83,9 +86,8 @@
           replacement = r'\$';
         } else if (codeUnit == "'".codeUnitAt(0)) {
           replacement = r"\'";
-        }
-        if (codeUnit == r''.codeUnitAt(0)) {
-          replacement = r'\';
+        } else if (codeUnit == r'\'.codeUnitAt(0)) {
+          replacement = r'\\';
         } else {
           replacement = r'\x' + codeUnit.toRadixString(16);
         }