Add better support for dealing with supplementary-plane code points (#46)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb4d297..5963ce8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,17 @@
+## 1.2.0
+
+* Add better support for reading code points in Unicode supplementary planes:
+
+  * Added `StringScanner.readCodePoint()`, which consumes an entire Unicode code
+    point even if it's represented by two UTF-16 code units.
+
+  * Added `StringScanner.peekCodePoint()`, which returns an entire Unicode code
+    point even if it's represented by two UTF-16 code units.
+
+  * `StringScanner.scanChar()` and `StringScanner.expectChar()` will now
+    properly consume two UTF-16 code units if they're passed Unicode code points
+    in a supplementary plane.
+
 ## 1.1.1
 
 * Populate the pubspec `repository` field.
diff --git a/lib/src/eager_span_scanner.dart b/lib/src/eager_span_scanner.dart
index d27a818..3bf5416 100644
--- a/lib/src/eager_span_scanner.dart
+++ b/lib/src/eager_span_scanner.dart
@@ -5,6 +5,7 @@
 import 'charcode.dart';
 import 'line_scanner.dart';
 import 'span_scanner.dart';
+import 'utils.dart';
 
 // TODO(nweiz): Currently this duplicates code in line_scanner.dart. Once
 // sdk#23770 is fully complete, we should move the shared code into a mixin.
@@ -90,7 +91,7 @@
       _line += 1;
       _column = 0;
     } else {
-      _column += 1;
+      _column += inSupplementaryPlane(character) ? 2 : 1;
     }
   }
 
diff --git a/lib/src/line_scanner.dart b/lib/src/line_scanner.dart
index af6b6e7..4f0673c 100644
--- a/lib/src/line_scanner.dart
+++ b/lib/src/line_scanner.dart
@@ -4,6 +4,7 @@
 
 import 'charcode.dart';
 import 'string_scanner.dart';
+import 'utils.dart';
 
 // Note that much of this code is duplicated in eager_span_scanner.dart.
 
@@ -95,7 +96,7 @@
       _line += 1;
       _column = 0;
     } else {
-      _column += 1;
+      _column += inSupplementaryPlane(character) ? 2 : 1;
     }
   }
 
diff --git a/lib/src/string_scanner.dart b/lib/src/string_scanner.dart
index d254b04..de566a5 100644
--- a/lib/src/string_scanner.dart
+++ b/lib/src/string_scanner.dart
@@ -90,16 +90,35 @@
 
   /// If the next character in the string is [character], consumes it.
   ///
+  /// If [character] is a Unicode code point in a supplementary plane, this will
+  /// consume two code units. Dart's string representation is UTF-16, which
+  /// represents supplementary-plane code points as two code units.
+  ///
   /// Returns whether or not [character] was consumed.
   bool scanChar(int character) {
-    if (isDone) return false;
-    if (string.codeUnitAt(_position) != character) return false;
-    _position++;
-    return true;
+    if (inSupplementaryPlane(character)) {
+      if (_position + 1 >= string.length ||
+          string.codeUnitAt(_position) != highSurrogate(character) ||
+          string.codeUnitAt(_position + 1) != lowSurrogate(character)) {
+        return false;
+      } else {
+        _position += 2;
+        return true;
+      }
+    } else {
+      if (isDone) return false;
+      if (string.codeUnitAt(_position) != character) return false;
+      _position++;
+      return true;
+    }
   }
 
   /// If the next character in the string is [character], consumes it.
   ///
+  /// If [character] is a Unicode code point in a supplementary plane, this will
+  /// consume two code units. Dart's string representation is UTF-16, which
+  /// represents supplementary-plane code points as two code units.
+  ///
   /// If [character] could not be consumed, throws a [FormatException]
   /// describing the position of the failure. [name] is used in this error as
   /// the expected name of the character being matched; if it's `null`, the
@@ -120,6 +139,43 @@
     _fail(name);
   }
 
+  /// Consumes a single Unicode code point and returns it.
+  ///
+  /// This works like [readChar], except that it automatically handles UTF-16
+  /// surrogate pairs. Specifically, if the next two code units form a surrogate
+  /// pair, consumes them both and returns the corresponding Unicode code point.
+  ///
+  /// If the next two code units are not a surrogate pair, the next code unit is
+  /// returned as-is, even if it's an unpaired surrogate.
+  int readCodePoint() {
+    final first = readChar();
+    if (!isHighSurrogate(first)) return first;
+
+    final next = peekChar();
+    if (next == null || !isLowSurrogate(next)) return first;
+
+    readChar();
+    return decodeSurrogatePair(first, next);
+  }
+
+  /// Returns the Unicode code point immediately after [position].
+  ///
+  /// This works like [peekChar], except that it automatically handles UTF-16
+  /// surrogate pairs. Specifically, if the next two code units form a surrogate
+  /// pair, returns the corresponding Unicode code point.
+  ///
+  /// If the next two code units are not a surrogate pair, the next code unit is
+  /// returned as-is, even if it's an unpaired surrogate.
+  int? peekCodePoint() {
+    final first = peekChar();
+    if (first == null || !isHighSurrogate(first)) return first;
+
+    final next = peekChar(1);
+    if (next == null || !isLowSurrogate(next)) return first;
+
+    return decodeSurrogatePair(first, next);
+  }
+
   /// If [pattern] matches at the current position of the string, scans forward
   /// until the end of the match.
   ///
diff --git a/lib/src/utils.dart b/lib/src/utils.dart
index 52dfb63..39891a1 100644
--- a/lib/src/utils.dart
+++ b/lib/src/utils.dart
@@ -29,3 +29,67 @@
         'the string.');
   }
 }
+
+// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF
+// for documentation on how UTF-16 encoding works and definitions of various
+// related terms.
+
+/// The inclusive lower bound of Unicode's supplementary planes.
+const _supplementaryPlaneLowerBound = 0x10000;
+
+/// The inclusive upper bound of Unicode's supplementary planes.
+const _supplementaryPlaneUpperBound = 0x10FFFF;
+
+/// The inclusive lower bound of the UTF-16 high surrogate block.
+const _highSurrogateLowerBound = 0xD800;
+
+/// The inclusive lower bound of the UTF-16 low surrogate block.
+const _lowSurrogateLowerBound = 0xDC00;
+
+/// The number of low bits in each code unit of a surrogate pair that goes into
+/// determining which code point it encodes.
+const _surrogateBits = 10;
+
+/// A bit mask that covers the lower [_surrogateBits] bits of an integer, which
+/// can be used to extract the payload of a surrogate code unit or the bits of
+/// a code point's offset that are encoded by its low surrogate.
+const _surrogateValueMask = (1 << _surrogateBits) - 1;
+
+/// Returns whether [codePoint] is in a Unicode supplementary plane, and thus
+/// must be represented as a surrogate pair in UTF-16.
+bool inSupplementaryPlane(int codePoint) =>
+    codePoint >= _supplementaryPlaneLowerBound &&
+    codePoint <= _supplementaryPlaneUpperBound;
+
+/// Returns whether [codeUnit] is a UTF-16 high surrogate (0xD800 to 0xDBFF).
+bool isHighSurrogate(int codeUnit) =>
+    (codeUnit & ~_surrogateValueMask) == _highSurrogateLowerBound;
+
+/// Returns whether [codeUnit] is a UTF-16 low surrogate (0xDC00 to 0xDFFF).
+bool isLowSurrogate(int codeUnit) =>
+    (codeUnit >> _surrogateBits) == (_lowSurrogateLowerBound >> _surrogateBits);
+
+/// Returns the high (leading) surrogate of the UTF-16 pair that encodes the
+/// supplementary-plane [codePoint].
+int highSurrogate(int codePoint) {
+  assert(inSupplementaryPlane(codePoint));
+  return ((codePoint - _supplementaryPlaneLowerBound) >> _surrogateBits) +
+      _highSurrogateLowerBound;
+}
+
+/// Returns the low (trailing) surrogate of the UTF-16 pair that encodes the
+/// supplementary-plane [codePoint].
+int lowSurrogate(int codePoint) {
+  assert(inSupplementaryPlane(codePoint));
+  return ((codePoint - _supplementaryPlaneLowerBound) & _surrogateValueMask) +
+      _lowSurrogateLowerBound;
+}
+
+/// Converts a UTF-16 surrogate pair into the Unicode code point it represents.
+int decodeSurrogatePair(int highSurrogate, int lowSurrogate) {
+  assert(isHighSurrogate(highSurrogate));
+  assert(isLowSurrogate(lowSurrogate));
+  return _supplementaryPlaneLowerBound +
+      (((highSurrogate & _surrogateValueMask) << _surrogateBits) |
+          (lowSurrogate & _surrogateValueMask));
+}
diff --git a/pubspec.yaml b/pubspec.yaml
index 421a227..c38f3bf 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -1,5 +1,5 @@
 name: string_scanner
-version: 1.1.1
+version: 1.2.0
 description: A class for parsing strings using a sequence of patterns.
 repository: https://github.com/dart-lang/string_scanner
 
diff --git a/test/line_scanner_test.dart b/test/line_scanner_test.dart
index a3deff1..d31d313 100644
--- a/test/line_scanner_test.dart
+++ b/test/line_scanner_test.dart
@@ -81,6 +81,39 @@
     });
   });
 
+  group('readCodePoint()', () {
+    test('on a non-newline character increases the column but not the line',
+        () {
+      scanner.readCodePoint();
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(1));
+    });
+
+    test('consuming a newline resets the column and increases the line', () {
+      scanner.expect('foo');
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(3));
+
+      scanner.readCodePoint();
+      expect(scanner.line, equals(1));
+      expect(scanner.column, equals(0));
+    });
+
+    test("consuming halfway through a CR LF doesn't count as a line", () {
+      scanner.expect('foo\nbar');
+      expect(scanner.line, equals(1));
+      expect(scanner.column, equals(3));
+
+      scanner.readCodePoint();
+      expect(scanner.line, equals(1));
+      expect(scanner.column, equals(4));
+
+      scanner.readCodePoint();
+      expect(scanner.line, equals(2));
+      expect(scanner.column, equals(0));
+    });
+  });
+
   group('scanChar()', () {
     test('on a non-newline character increases the column but not the line',
         () {
@@ -114,6 +147,59 @@
     });
   });
 
+  group('before a surrogate pair', () {
+    final codePoint = '\uD83D\uDC6D'.runes.first;
+    const highSurrogate = 0xD83D;
+
+    late LineScanner scanner;
+    setUp(() {
+      scanner = LineScanner('foo: \uD83D\uDC6D');
+      expect(scanner.scan('foo: '), isTrue);
+    });
+
+    test('readChar returns the high surrogate and moves into the pair', () {
+      expect(scanner.readChar(), equals(highSurrogate));
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(6));
+      expect(scanner.position, equals(6));
+    });
+
+    test('readCodePoint returns the code unit and moves past the pair', () {
+      expect(scanner.readCodePoint(), equals(codePoint));
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(7));
+      expect(scanner.position, equals(7));
+    });
+
+    test('scanChar with the high surrogate moves into the pair', () {
+      expect(scanner.scanChar(highSurrogate), isTrue);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(6));
+      expect(scanner.position, equals(6));
+    });
+
+    test('scanChar with the code point moves past the pair', () {
+      expect(scanner.scanChar(codePoint), isTrue);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(7));
+      expect(scanner.position, equals(7));
+    });
+
+    test('expectChar with the high surrogate moves into the pair', () {
+      scanner.expectChar(highSurrogate);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(6));
+      expect(scanner.position, equals(6));
+    });
+
+    test('expectChar with the code point moves past the pair', () {
+      scanner.expectChar(codePoint);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(7));
+      expect(scanner.position, equals(7));
+    });
+  });
+
   group('position=', () {
     test('forward through newlines sets the line and column', () {
       scanner.position = 10; // "foo\nbar\r\nb"
diff --git a/test/span_scanner_test.dart b/test/span_scanner_test.dart
index 828745f..0e20c36 100644
--- a/test/span_scanner_test.dart
+++ b/test/span_scanner_test.dart
@@ -139,15 +139,6 @@
       expect(span.text, equals('o\nbar\nba'));
     });
 
-    test('.spanFrom() handles surrogate pairs correctly', () {
-      scanner = create('fo\u{12345}o');
-      scanner.scan('fo');
-      final state = scanner.state;
-      scanner.scan('\u{12345}o');
-      final span = scanner.spanFrom(state);
-      expect(span.text, equals('\u{12345}o'));
-    });
-
     test('.emptySpan returns an empty span at the current location', () {
       scanner.scan('foo\nba');
 
@@ -164,5 +155,64 @@
 
       expect(span.text, equals(''));
     });
+
+    group('before a surrogate pair', () {
+      final codePoint = '\uD83D\uDC6D'.runes.first;
+      const highSurrogate = 0xD83D;
+
+      late SpanScanner scanner;
+      setUp(() {
+        scanner = create('foo: \uD83D\uDC6D bar');
+        expect(scanner.scan('foo: '), isTrue);
+      });
+
+      test('readChar returns the high surrogate and moves into the pair', () {
+        expect(scanner.readChar(), equals(highSurrogate));
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(6));
+        expect(scanner.position, equals(6));
+      });
+
+      test('readCodePoint returns the code unit and moves past the pair', () {
+        expect(scanner.readCodePoint(), equals(codePoint));
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(7));
+        expect(scanner.position, equals(7));
+      });
+
+      test('scanChar with the high surrogate moves into the pair', () {
+        expect(scanner.scanChar(highSurrogate), isTrue);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(6));
+        expect(scanner.position, equals(6));
+      });
+
+      test('scanChar with the code point moves past the pair', () {
+        expect(scanner.scanChar(codePoint), isTrue);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(7));
+        expect(scanner.position, equals(7));
+      });
+
+      test('expectChar with the high surrogate moves into the pair', () {
+        scanner.expectChar(highSurrogate);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(6));
+        expect(scanner.position, equals(6));
+      });
+
+      test('expectChar with the code point moves past the pair', () {
+        scanner.expectChar(codePoint);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(7));
+        expect(scanner.position, equals(7));
+      });
+
+      test('spanFrom covers the surrogate pair', () {
+        final state = scanner.state;
+        scanner.scan('\uD83D\uDC6D b');
+        expect(scanner.spanFrom(state).text, equals('\uD83D\uDC6D b'));
+      });
+    });
   });
 }
diff --git a/test/string_scanner_test.dart b/test/string_scanner_test.dart
index 34176b8..36a737e 100644
--- a/test/string_scanner_test.dart
+++ b/test/string_scanner_test.dart
@@ -36,12 +36,24 @@
       expect(scanner.position, equals(0));
     });
 
+    test("readCodePoint fails and doesn't change the state", () {
+      expect(scanner.readCodePoint, throwsFormatException);
+      expect(scanner.lastMatch, isNull);
+      expect(scanner.position, equals(0));
+    });
+
     test("peekChar returns null and doesn't change the state", () {
       expect(scanner.peekChar(), isNull);
       expect(scanner.lastMatch, isNull);
       expect(scanner.position, equals(0));
     });
 
+    test("peekCodePoint returns null and doesn't change the state", () {
+      expect(scanner.peekCodePoint(), isNull);
+      expect(scanner.lastMatch, isNull);
+      expect(scanner.position, equals(0));
+    });
+
     test("scanChar returns false and doesn't change the state", () {
       expect(scanner.scanChar($f), isFalse);
       expect(scanner.lastMatch, isNull);
@@ -118,6 +130,12 @@
       expect(scanner.position, equals(1));
     });
 
+    test('readCodePoint returns the first character and moves forward', () {
+      expect(scanner.readCodePoint(), equals(0x66));
+      expect(scanner.lastMatch, isNull);
+      expect(scanner.position, equals(1));
+    });
+
     test('peekChar returns the first character', () {
       expect(scanner.peekChar(), equals(0x66));
       expect(scanner.lastMatch, isNull);
@@ -130,6 +148,12 @@
       expect(scanner.position, equals(0));
     });
 
+    test('peekCodePoint returns the first character', () {
+      expect(scanner.peekCodePoint(), equals(0x66));
+      expect(scanner.lastMatch, isNull);
+      expect(scanner.position, equals(0));
+    });
+
     test('a matching scanChar returns true moves forward', () {
       expect(scanner.scanChar($f), isTrue);
       expect(scanner.lastMatch, isNull);
@@ -275,6 +299,13 @@
       expect(scanner.position, equals(4));
     });
 
+    test('readCodePoint returns the first character and unsets the last match',
+        () {
+      expect(scanner.readCodePoint(), equals($space));
+      expect(scanner.lastMatch, isNull);
+      expect(scanner.position, equals(4));
+    });
+
     test('a matching scanChar returns true and unsets the last match', () {
       expect(scanner.scanChar($space), isTrue);
       expect(scanner.lastMatch, isNull);
@@ -314,12 +345,24 @@
       expect(scanner.position, equals(7));
     });
 
+    test("readCodePoint fails and doesn't change the state", () {
+      expect(scanner.readCodePoint, throwsFormatException);
+      expect(scanner.lastMatch, isNotNull);
+      expect(scanner.position, equals(7));
+    });
+
     test("peekChar returns null and doesn't change the state", () {
       expect(scanner.peekChar(), isNull);
       expect(scanner.lastMatch, isNotNull);
       expect(scanner.position, equals(7));
     });
 
+    test("peekCodePoint returns null and doesn't change the state", () {
+      expect(scanner.peekCodePoint(), isNull);
+      expect(scanner.lastMatch, isNotNull);
+      expect(scanner.position, equals(7));
+    });
+
     test("scanChar returns false and doesn't change the state", () {
       expect(scanner.scanChar($f), isFalse);
       expect(scanner.lastMatch, isNotNull);
@@ -393,6 +436,111 @@
     });
   });
 
+  group('before a surrogate pair', () {
+    final codePoint = '\uD83D\uDC6D'.runes.first;
+    const highSurrogate = 0xD83D;
+
+    late StringScanner scanner;
+    setUp(() {
+      scanner = StringScanner('foo: \uD83D\uDC6D');
+      expect(scanner.scan('foo: '), isTrue);
+    });
+
+    test('readChar returns the high surrogate and moves into the pair', () {
+      expect(scanner.readChar(), equals(highSurrogate));
+      expect(scanner.position, equals(6));
+    });
+
+    test('readCodePoint returns the code unit and moves past the pair', () {
+      expect(scanner.readCodePoint(), equals(codePoint));
+      expect(scanner.position, equals(7));
+    });
+
+    test('peekChar returns the high surrogate', () {
+      expect(scanner.peekChar(), equals(highSurrogate));
+      expect(scanner.position, equals(5));
+    });
+
+    test('peekCodePoint returns the code unit', () {
+      expect(scanner.peekCodePoint(), equals(codePoint));
+      expect(scanner.position, equals(5));
+    });
+
+    test('scanChar with the high surrogate moves into the pair', () {
+      expect(scanner.scanChar(highSurrogate), isTrue);
+      expect(scanner.position, equals(6));
+    });
+
+    test('scanChar with the code point moves past the pair', () {
+      expect(scanner.scanChar(codePoint), isTrue);
+      expect(scanner.position, equals(7));
+    });
+
+    test('expectChar with the high surrogate moves into the pair', () {
+      scanner.expectChar(highSurrogate);
+      expect(scanner.position, equals(6));
+    });
+
+    test('expectChar with the code point moves past the pair', () {
+      scanner.expectChar(codePoint);
+      expect(scanner.position, equals(7));
+    });
+  });
+
+  group('before an invalid surrogate pair', () {
+    // This surrogate pair is invalid because U+E000 is just outside the range
+    // of low surrogates. If it were interpreted as a surrogate pair anyway, the
+    // value would be U+110000, which is outside of the Unicode gamut.
+    const codePoint = 0x110000;
+    const highSurrogate = 0xD800;
+
+    late StringScanner scanner;
+    setUp(() {
+      scanner = StringScanner('foo: \uD800\uE000');
+      expect(scanner.scan('foo: '), isTrue);
+    });
+
+    test('readChar returns the high surrogate and moves into the pair', () {
+      expect(scanner.readChar(), equals(highSurrogate));
+      expect(scanner.position, equals(6));
+    });
+
+    test('readCodePoint returns the high surrogate and moves into the pair',
+        () {
+      expect(scanner.readCodePoint(), equals(highSurrogate));
+      expect(scanner.position, equals(6)); // Only one code unit is consumed.
+    });
+
+    test('peekChar returns the high surrogate', () {
+      expect(scanner.peekChar(), equals(highSurrogate));
+      expect(scanner.position, equals(5));
+    });
+
+    test('peekCodePoint returns the high surrogate', () {
+      expect(scanner.peekCodePoint(), equals(highSurrogate));
+      expect(scanner.position, equals(5));
+    });
+
+    test('scanChar with the high surrogate moves into the pair', () {
+      expect(scanner.scanChar(highSurrogate), isTrue);
+      expect(scanner.position, equals(6));
+    });
+
+    test('scanChar with the fake code point returns false', () {
+      expect(scanner.scanChar(codePoint), isFalse);
+      expect(scanner.position, equals(5));
+    });
+
+    test('expectChar with the high surrogate moves into the pair', () {
+      scanner.expectChar(highSurrogate);
+      expect(scanner.position, equals(6));
+    });
+
+    test('expectChar with the fake code point fails', () {
+      expect(() => scanner.expectChar(codePoint), throwsRangeError);
+    });
+  });
+
   group('a scanner constructed with a custom position', () {
     test('starts scanning from that position', () {
       final scanner = StringScanner('foo bar', position: 1);