Add better support for dealing with supplementary-plane code points (#46)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb4d297..5963ce8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,17 @@
+## 1.2.0
+
+* Add better support for reading code points in the Unicode supplementary plane:
+
+ * Added `StringScanner.readCodePoint()`, which consumes an entire Unicode code
+ point even if it's represented by two UTF-16 code units.
+
+ * Added `StringScanner.peekCodePoint()`, which returns an entire Unicode code
+ point even if it's represented by two UTF-16 code units.
+
+ * `StringScanner.scanChar()` and `StringScanner.expectChar()` will now
+ properly consume two UTF-16 code units if they're passed Unicode code points
+ in the supplementary plane.
+
## 1.1.1
* Populate the pubspec `repository` field.
diff --git a/lib/src/eager_span_scanner.dart b/lib/src/eager_span_scanner.dart
index d27a818..3bf5416 100644
--- a/lib/src/eager_span_scanner.dart
+++ b/lib/src/eager_span_scanner.dart
@@ -5,6 +5,7 @@
import 'charcode.dart';
import 'line_scanner.dart';
import 'span_scanner.dart';
+import 'utils.dart';
// TODO(nweiz): Currently this duplicates code in line_scanner.dart. Once
// sdk#23770 is fully complete, we should move the shared code into a mixin.
@@ -90,7 +91,7 @@
_line += 1;
_column = 0;
} else {
- _column += 1;
+ _column += inSupplementaryPlane(character) ? 2 : 1;
}
}
diff --git a/lib/src/line_scanner.dart b/lib/src/line_scanner.dart
index af6b6e7..4f0673c 100644
--- a/lib/src/line_scanner.dart
+++ b/lib/src/line_scanner.dart
@@ -4,6 +4,7 @@
import 'charcode.dart';
import 'string_scanner.dart';
+import 'utils.dart';
// Note that much of this code is duplicated in eager_span_scanner.dart.
@@ -95,7 +96,7 @@
_line += 1;
_column = 0;
} else {
- _column += 1;
+ _column += inSupplementaryPlane(character) ? 2 : 1;
}
}
diff --git a/lib/src/string_scanner.dart b/lib/src/string_scanner.dart
index d254b04..de566a5 100644
--- a/lib/src/string_scanner.dart
+++ b/lib/src/string_scanner.dart
@@ -90,16 +90,35 @@
/// If the next character in the string is [character], consumes it.
///
+ /// If [character] is a Unicode code point in a supplementary plane, this will
+ /// consume two code units. Dart's string representation is UTF-16, which
+ /// represents supplementary-plane code points as two code units.
+ ///
/// Returns whether or not [character] was consumed.
bool scanChar(int character) {
- if (isDone) return false;
- if (string.codeUnitAt(_position) != character) return false;
- _position++;
- return true;
+ if (inSupplementaryPlane(character)) {
+ if (_position + 1 >= string.length ||
+ string.codeUnitAt(_position) != highSurrogate(character) ||
+ string.codeUnitAt(_position + 1) != lowSurrogate(character)) {
+ return false;
+ } else {
+ _position += 2;
+ return true;
+ }
+ } else {
+ if (isDone) return false;
+ if (string.codeUnitAt(_position) != character) return false;
+ _position++;
+ return true;
+ }
}
/// If the next character in the string is [character], consumes it.
///
+ /// If [character] is a Unicode code point in a supplementary plane, this will
+ /// consume two code units. Dart's string representation is UTF-16, which
+ /// represents supplementary-plane code points as two code units.
+ ///
/// If [character] could not be consumed, throws a [FormatException]
/// describing the position of the failure. [name] is used in this error as
/// the expected name of the character being matched; if it's `null`, the
@@ -120,6 +139,43 @@
_fail(name);
}
+ /// Consumes a single Unicode code point and returns it.
+ ///
+ /// This works like [readChar], except that it automatically handles UTF-16
+ /// surrogate pairs. Specifically, if the next two code units form a surrogate
+ /// pair, consumes them both and returns the corresponding Unicode code point.
+ ///
+ /// If the next two code units are not a surrogate pair, the next code unit
+ /// is returned as-is, even if it's an unpaired surrogate.
+ int readCodePoint() {
+ final first = readChar();
+ if (!isHighSurrogate(first)) return first;
+
+ final next = peekChar();
+ if (next == null || !isLowSurrogate(next)) return first;
+
+ readChar();
+ return decodeSurrogatePair(first, next);
+ }
+
+ /// Returns the Unicode code point immediately after [position].
+ ///
+ /// This works like [peekChar], except that it automatically handles UTF-16
+ /// surrogate pairs. Specifically, if the next two code units form a surrogate
+ /// pair, returns the corresponding Unicode code point.
+ ///
+ /// If the next two code units are not a surrogate pair, the next code unit
+ /// is returned as-is, even if it's an unpaired surrogate.
+ int? peekCodePoint() {
+ final first = peekChar();
+ if (first == null || !isHighSurrogate(first)) return first;
+
+ final next = peekChar(1);
+ if (next == null || !isLowSurrogate(next)) return first;
+
+ return decodeSurrogatePair(first, next);
+ }
+
/// If [pattern] matches at the current position of the string, scans forward
/// until the end of the match.
///
diff --git a/lib/src/utils.dart b/lib/src/utils.dart
index 52dfb63..39891a1 100644
--- a/lib/src/utils.dart
+++ b/lib/src/utils.dart
@@ -29,3 +29,67 @@
'the string.');
}
}
+
+// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF
+// for documentation on how UTF-16 encoding works and definitions of various
+// related terms.
+
+/// The inclusive lower bound of Unicode's supplementary plane.
+const _supplementaryPlaneLowerBound = 0x10000;
+
+/// The inclusive upper bound of Unicode's supplementary plane.
+const _supplementaryPlaneUpperBound = 0x10FFFF;
+
+/// The inclusive lower bound of the UTF-16 high surrogate block.
+const _highSurrogateLowerBound = 0xD800;
+
+/// The inclusive lower bound of the UTF-16 low surrogate block.
+const _lowSurrogateLowerBound = 0xDC00;
+
+/// The number of low bits in each code unit of a surrogate pair that goes into
+/// determining which code point it encodes.
+const _surrogateBits = 10;
+
+/// A bit mask that covers the lower [_surrogateBits] bits of an integer, which
+/// can be used to extract the payload of a surrogate code unit or the
+/// low-surrogate payload of an offset supplementary-plane code point.
+const _surrogateValueMask = (1 << _surrogateBits) - 1;
+
+/// Returns whether [codePoint] is in a Unicode supplementary plane (U+10000 to
+/// U+10FFFF), and thus must be represented as a surrogate pair in UTF-16.
+bool inSupplementaryPlane(int codePoint) =>
+ codePoint >= _supplementaryPlaneLowerBound &&
+ codePoint <= _supplementaryPlaneUpperBound;
+
+/// Returns whether [codeUnit] is a UTF-16 high surrogate (0xD800 to 0xDBFF).
+bool isHighSurrogate(int codeUnit) =>
+ (codeUnit & ~_surrogateValueMask) == _highSurrogateLowerBound;
+
+/// Returns whether [codeUnit] is a UTF-16 low surrogate (0xDC00 to 0xDFFF).
+bool isLowSurrogate(int codeUnit) =>
+ (codeUnit >> _surrogateBits) == (_lowSurrogateLowerBound >> _surrogateBits);
+
+/// Returns the high surrogate needed to encode [codePoint], which is asserted
+/// to be in the supplementary plane.
+int highSurrogate(int codePoint) {
+ assert(inSupplementaryPlane(codePoint));
+ return ((codePoint - _supplementaryPlaneLowerBound) >> _surrogateBits) +
+ _highSurrogateLowerBound;
+}
+
+/// Returns the low surrogate needed to encode [codePoint], which is asserted
+/// to be in the supplementary plane.
+int lowSurrogate(int codePoint) {
+ assert(inSupplementaryPlane(codePoint));
+ return ((codePoint - _supplementaryPlaneLowerBound) & _surrogateValueMask) +
+ _lowSurrogateLowerBound;
+}
+
+/// Converts a UTF-16 surrogate pair into the Unicode code point it represents.
+int decodeSurrogatePair(int highSurrogate, int lowSurrogate) {
+ assert(isHighSurrogate(highSurrogate));
+ assert(isLowSurrogate(lowSurrogate));
+ return _supplementaryPlaneLowerBound +
+ (((highSurrogate & _surrogateValueMask) << _surrogateBits) |
+ (lowSurrogate & _surrogateValueMask));
+}
diff --git a/pubspec.yaml b/pubspec.yaml
index 421a227..c38f3bf 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -1,5 +1,5 @@
name: string_scanner
-version: 1.1.1
+version: 1.2.0
description: A class for parsing strings using a sequence of patterns.
repository: https://github.com/dart-lang/string_scanner
diff --git a/test/line_scanner_test.dart b/test/line_scanner_test.dart
index a3deff1..d31d313 100644
--- a/test/line_scanner_test.dart
+++ b/test/line_scanner_test.dart
@@ -81,6 +81,39 @@
});
});
+ group('readCodePoint()', () {
+ test('on a non-newline character increases the column but not the line',
+ () {
+ scanner.readCodePoint();
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(1));
+ });
+
+ test('consuming a newline resets the column and increases the line', () {
+ scanner.expect('foo');
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(3));
+
+ scanner.readCodePoint();
+ expect(scanner.line, equals(1));
+ expect(scanner.column, equals(0));
+ });
+
+ test("consuming halfway through a CR LF doesn't count as a line", () {
+ scanner.expect('foo\nbar');
+ expect(scanner.line, equals(1));
+ expect(scanner.column, equals(3));
+
+ scanner.readCodePoint();
+ expect(scanner.line, equals(1));
+ expect(scanner.column, equals(4));
+
+ scanner.readCodePoint();
+ expect(scanner.line, equals(2));
+ expect(scanner.column, equals(0));
+ });
+ });
+
group('scanChar()', () {
test('on a non-newline character increases the column but not the line',
() {
@@ -114,6 +147,59 @@
});
});
+ group('before a surrogate pair', () {
+ final codePoint = '\uD83D\uDC6D'.runes.first;
+ const highSurrogate = 0xD83D;
+
+ late LineScanner scanner;
+ setUp(() {
+ scanner = LineScanner('foo: \uD83D\uDC6D');
+ expect(scanner.scan('foo: '), isTrue);
+ });
+
+ test('readChar returns the high surrogate and moves into the pair', () {
+ expect(scanner.readChar(), equals(highSurrogate));
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(6));
+ expect(scanner.position, equals(6));
+ });
+
+ test('readCodePoint returns the code point and moves past the pair', () {
+ expect(scanner.readCodePoint(), equals(codePoint));
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(7));
+ expect(scanner.position, equals(7));
+ });
+
+ test('scanChar with the high surrogate moves into the pair', () {
+ expect(scanner.scanChar(highSurrogate), isTrue);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(6));
+ expect(scanner.position, equals(6));
+ });
+
+ test('scanChar with the code point moves past the pair', () {
+ expect(scanner.scanChar(codePoint), isTrue);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(7));
+ expect(scanner.position, equals(7));
+ });
+
+ test('expectChar with the high surrogate moves into the pair', () {
+ scanner.expectChar(highSurrogate);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(6));
+ expect(scanner.position, equals(6));
+ });
+
+ test('expectChar with the code point moves past the pair', () {
+ scanner.expectChar(codePoint);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(7));
+ expect(scanner.position, equals(7));
+ });
+ });
+
group('position=', () {
test('forward through newlines sets the line and column', () {
scanner.position = 10; // "foo\nbar\r\nb"
diff --git a/test/span_scanner_test.dart b/test/span_scanner_test.dart
index 828745f..0e20c36 100644
--- a/test/span_scanner_test.dart
+++ b/test/span_scanner_test.dart
@@ -139,15 +139,6 @@
expect(span.text, equals('o\nbar\nba'));
});
- test('.spanFrom() handles surrogate pairs correctly', () {
- scanner = create('fo\u{12345}o');
- scanner.scan('fo');
- final state = scanner.state;
- scanner.scan('\u{12345}o');
- final span = scanner.spanFrom(state);
- expect(span.text, equals('\u{12345}o'));
- });
-
test('.emptySpan returns an empty span at the current location', () {
scanner.scan('foo\nba');
@@ -164,5 +155,64 @@
expect(span.text, equals(''));
});
+
+ group('before a surrogate pair', () {
+ final codePoint = '\uD83D\uDC6D'.runes.first;
+ const highSurrogate = 0xD83D;
+
+ late SpanScanner scanner;
+ setUp(() {
+ scanner = create('foo: \uD83D\uDC6D bar');
+ expect(scanner.scan('foo: '), isTrue);
+ });
+
+ test('readChar returns the high surrogate and moves into the pair', () {
+ expect(scanner.readChar(), equals(highSurrogate));
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(6));
+ expect(scanner.position, equals(6));
+ });
+
+ test('readCodePoint returns the code point and moves past the pair', () {
+ expect(scanner.readCodePoint(), equals(codePoint));
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(7));
+ expect(scanner.position, equals(7));
+ });
+
+ test('scanChar with the high surrogate moves into the pair', () {
+ expect(scanner.scanChar(highSurrogate), isTrue);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(6));
+ expect(scanner.position, equals(6));
+ });
+
+ test('scanChar with the code point moves past the pair', () {
+ expect(scanner.scanChar(codePoint), isTrue);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(7));
+ expect(scanner.position, equals(7));
+ });
+
+ test('expectChar with the high surrogate moves into the pair', () {
+ scanner.expectChar(highSurrogate);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(6));
+ expect(scanner.position, equals(6));
+ });
+
+ test('expectChar with the code point moves past the pair', () {
+ scanner.expectChar(codePoint);
+ expect(scanner.line, equals(0));
+ expect(scanner.column, equals(7));
+ expect(scanner.position, equals(7));
+ });
+
+ test('spanFrom covers the surrogate pair', () {
+ final state = scanner.state;
+ scanner.scan('\uD83D\uDC6D b');
+ expect(scanner.spanFrom(state).text, equals('\uD83D\uDC6D b'));
+ });
+ });
});
}
diff --git a/test/string_scanner_test.dart b/test/string_scanner_test.dart
index 34176b8..36a737e 100644
--- a/test/string_scanner_test.dart
+++ b/test/string_scanner_test.dart
@@ -36,12 +36,24 @@
expect(scanner.position, equals(0));
});
+ test("readCodePoint fails and doesn't change the state", () {
+ expect(scanner.readCodePoint, throwsFormatException);
+ expect(scanner.lastMatch, isNull);
+ expect(scanner.position, equals(0));
+ });
+
test("peekChar returns null and doesn't change the state", () {
expect(scanner.peekChar(), isNull);
expect(scanner.lastMatch, isNull);
expect(scanner.position, equals(0));
});
+ test("peekCodePoint returns null and doesn't change the state", () {
+ expect(scanner.peekCodePoint(), isNull);
+ expect(scanner.lastMatch, isNull);
+ expect(scanner.position, equals(0));
+ });
+
test("scanChar returns false and doesn't change the state", () {
expect(scanner.scanChar($f), isFalse);
expect(scanner.lastMatch, isNull);
@@ -118,6 +130,12 @@
expect(scanner.position, equals(1));
});
+ test('readCodePoint returns the first character and moves forward', () {
+ expect(scanner.readCodePoint(), equals(0x66));
+ expect(scanner.lastMatch, isNull);
+ expect(scanner.position, equals(1));
+ });
+
test('peekChar returns the first character', () {
expect(scanner.peekChar(), equals(0x66));
expect(scanner.lastMatch, isNull);
@@ -130,6 +148,12 @@
expect(scanner.position, equals(0));
});
+ test('peekCodePoint returns the first character', () {
+ expect(scanner.peekCodePoint(), equals(0x66));
+ expect(scanner.lastMatch, isNull);
+ expect(scanner.position, equals(0));
+ });
+
test('a matching scanChar returns true moves forward', () {
expect(scanner.scanChar($f), isTrue);
expect(scanner.lastMatch, isNull);
@@ -275,6 +299,13 @@
expect(scanner.position, equals(4));
});
+ test('readCodePoint returns the first character and unsets the last match',
+ () {
+ expect(scanner.readCodePoint(), equals($space));
+ expect(scanner.lastMatch, isNull);
+ expect(scanner.position, equals(4));
+ });
+
test('a matching scanChar returns true and unsets the last match', () {
expect(scanner.scanChar($space), isTrue);
expect(scanner.lastMatch, isNull);
@@ -314,12 +345,24 @@
expect(scanner.position, equals(7));
});
+ test("readCodePoint fails and doesn't change the state", () {
+ expect(scanner.readCodePoint, throwsFormatException);
+ expect(scanner.lastMatch, isNotNull);
+ expect(scanner.position, equals(7));
+ });
+
test("peekChar returns null and doesn't change the state", () {
expect(scanner.peekChar(), isNull);
expect(scanner.lastMatch, isNotNull);
expect(scanner.position, equals(7));
});
+ test("peekCodePoint returns null and doesn't change the state", () {
+ expect(scanner.peekCodePoint(), isNull);
+ expect(scanner.lastMatch, isNotNull);
+ expect(scanner.position, equals(7));
+ });
+
test("scanChar returns false and doesn't change the state", () {
expect(scanner.scanChar($f), isFalse);
expect(scanner.lastMatch, isNotNull);
@@ -393,6 +436,111 @@
});
});
+ group('before a surrogate pair', () {
+ final codePoint = '\uD83D\uDC6D'.runes.first;
+ const highSurrogate = 0xD83D;
+
+ late StringScanner scanner;
+ setUp(() {
+ scanner = StringScanner('foo: \uD83D\uDC6D');
+ expect(scanner.scan('foo: '), isTrue);
+ });
+
+ test('readChar returns the high surrogate and moves into the pair', () {
+ expect(scanner.readChar(), equals(highSurrogate));
+ expect(scanner.position, equals(6));
+ });
+
+ test('readCodePoint returns the code point and moves past the pair', () {
+ expect(scanner.readCodePoint(), equals(codePoint));
+ expect(scanner.position, equals(7));
+ });
+
+ test('peekChar returns the high surrogate', () {
+ expect(scanner.peekChar(), equals(highSurrogate));
+ expect(scanner.position, equals(5));
+ });
+
+ test('peekCodePoint returns the code point', () {
+ expect(scanner.peekCodePoint(), equals(codePoint));
+ expect(scanner.position, equals(5));
+ });
+
+ test('scanChar with the high surrogate moves into the pair', () {
+ expect(scanner.scanChar(highSurrogate), isTrue);
+ expect(scanner.position, equals(6));
+ });
+
+ test('scanChar with the code point moves past the pair', () {
+ expect(scanner.scanChar(codePoint), isTrue);
+ expect(scanner.position, equals(7));
+ });
+
+ test('expectChar with the high surrogate moves into the pair', () {
+ scanner.expectChar(highSurrogate);
+ expect(scanner.position, equals(6));
+ });
+
+ test('expectChar with the code point moves past the pair', () {
+ scanner.expectChar(codePoint);
+ expect(scanner.position, equals(7));
+ });
+ });
+
+ group('before an invalid surrogate pair', () {
+ // This surrogate pair is invalid because U+E000 is just outside the range
+ // of low surrogates. If it were interpreted as a surrogate pair anyway, the
+ // value would be U+110000, which is outside of the Unicode gamut.
+ const codePoint = 0x110000;
+ const highSurrogate = 0xD800;
+
+ late StringScanner scanner;
+ setUp(() {
+ scanner = StringScanner('foo: \uD800\uE000');
+ expect(scanner.scan('foo: '), isTrue);
+ });
+
+ test('readChar returns the high surrogate and moves into the pair', () {
+ expect(scanner.readChar(), equals(highSurrogate));
+ expect(scanner.position, equals(6));
+ });
+
+ test('readCodePoint returns the high surrogate and moves into the pair',
+ () {
+ expect(scanner.readCodePoint(), equals(highSurrogate));
+ expect(scanner.position, equals(6));
+ });
+
+ test('peekChar returns the high surrogate', () {
+ expect(scanner.peekChar(), equals(highSurrogate));
+ expect(scanner.position, equals(5));
+ });
+
+ test('peekCodePoint returns the high surrogate', () {
+ expect(scanner.peekCodePoint(), equals(highSurrogate));
+ expect(scanner.position, equals(5));
+ });
+
+ test('scanChar with the high surrogate moves into the pair', () {
+ expect(scanner.scanChar(highSurrogate), isTrue);
+ expect(scanner.position, equals(6));
+ });
+
+ test('scanChar with the fake code point returns false', () {
+ expect(scanner.scanChar(codePoint), isFalse);
+ expect(scanner.position, equals(5));
+ });
+
+ test('expectChar with the high surrogate moves into the pair', () {
+ scanner.expectChar(highSurrogate);
+ expect(scanner.position, equals(6));
+ });
+
+ test('expectChar with the fake code point fails', () {
+ expect(() => scanner.expectChar(codePoint), throwsRangeError);
+ });
+ });
+
group('a scanner constructed with a custom position', () {
test('starts scanning from that position', () {
final scanner = StringScanner('foo bar', position: 1);