Handle surrogate pairs during scanning (dart-lang/yaml#159)
Change back to readChar() whenever possible; remove the need to decode the surrogate for further checking
diff --git a/pkgs/yaml/CHANGELOG.md b/pkgs/yaml/CHANGELOG.md
index 37e5660..cd800a8 100644
--- a/pkgs/yaml/CHANGELOG.md
+++ b/pkgs/yaml/CHANGELOG.md
@@ -1,6 +1,7 @@
## 3.1.3-wip
* Require Dart 3.4
+* Fix UTF-16 surrogate pair handling in plain scalars.
## 3.1.2
diff --git a/pkgs/yaml/lib/src/scanner.dart b/pkgs/yaml/lib/src/scanner.dart
index 4bf0b93..4b155e1 100644
--- a/pkgs/yaml/lib/src/scanner.dart
+++ b/pkgs/yaml/lib/src/scanner.dart
@@ -253,7 +253,7 @@
null => false,
LF || CR || BOM => false,
TAB || NEL => true,
- _ => _isStandardCharacter(char),
+ _ => _isStandardCharacterAt(0),
};
}
@@ -267,7 +267,7 @@
null => false,
LF || CR || BOM || SP => false,
NEL => true,
- _ => _isStandardCharacter(char),
+ _ => _isStandardCharacterAt(0),
};
}
@@ -614,9 +614,9 @@
// Consume the indicator token.
var start = _scanner.state;
- _scanner.readChar();
- _scanner.readChar();
- _scanner.readChar();
+ _scanner.readCodePoint();
+ _scanner.readCodePoint();
+ _scanner.readCodePoint();
_tokens.add(Token(type, _scanner.spanFrom(start)));
}
@@ -732,7 +732,7 @@
/// The span of the new token is the current character.
void _addCharToken(TokenType type) {
var start = _scanner.state;
- _scanner.readChar();
+ _scanner.readCodePoint();
_tokens.add(Token(type, _scanner.spanFrom(start)));
}
@@ -836,7 +836,7 @@
// libyaml doesn't support unknown directives, but the spec says to ignore
// them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
while (!_isBreakOrEnd) {
- _scanner.readChar();
+ _scanner.readCodePoint();
}
return null;
@@ -866,7 +866,7 @@
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
var start = _scanner.position;
while (_isNonSpace) {
- _scanner.readChar();
+ _scanner.readCodePoint();
}
var name = _scanner.substring(start);
@@ -941,13 +941,13 @@
var start = _scanner.state;
// Eat the indicator character.
- _scanner.readChar();
+ _scanner.readCodePoint();
// libyaml only allows word characters in anchor names, but the spec
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
var startPosition = _scanner.position;
while (_isAnchorChar) {
- _scanner.readChar();
+ _scanner.readCodePoint();
}
var name = _scanner.substring(startPosition);
@@ -1032,7 +1032,7 @@
buffer.write(_scanner.substring(start));
if (_scanner.peekChar() == EXCLAMATION) {
- buffer.writeCharCode(_scanner.readChar());
+ buffer.writeCharCode(_scanner.readCodePoint());
} else {
// It's either the '!' tag or not really a tag handle. If it's a %TAG
// directive, it's an error. If it's a tag token, it must be part of a
@@ -1083,7 +1083,7 @@
var start = _scanner.state;
// Eat the indicator '|' or '>'.
- _scanner.readChar();
+ _scanner.readCodePoint();
// Check for a chomping indicator.
var chomping = _Chomping.clip;
@@ -1091,7 +1091,7 @@
var char = _scanner.peekChar();
if (char == PLUS || char == HYPHEN) {
chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
- _scanner.readChar();
+ _scanner.readCodePoint();
// Check for an indentation indicator.
if (_isDigit) {
@@ -1101,7 +1101,7 @@
_scanner.spanFrom(start));
}
- increment = _scanner.readChar() - NUMBER_0;
+ increment = _scanner.readCodePoint() - NUMBER_0;
}
} else if (_isDigit) {
// Do the same as above, but in the opposite order.
@@ -1110,12 +1110,12 @@
_scanner.spanFrom(start));
}
- increment = _scanner.readChar() - NUMBER_0;
+ increment = _scanner.readCodePoint() - NUMBER_0;
char = _scanner.peekChar();
if (char == PLUS || char == HYPHEN) {
chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
- _scanner.readChar();
+ _scanner.readCodePoint();
}
}
@@ -1182,7 +1182,7 @@
var startPosition = _scanner.position;
while (!_isBreakOrEnd) {
- _scanner.readChar();
+ _scanner.readCodePoint();
}
buffer.write(_scanner.substring(startPosition));
end = _scanner.state;
@@ -1373,7 +1373,7 @@
buffer.writeCharCode(value);
}
} else {
- buffer.writeCharCode(_scanner.readChar());
+ buffer.writeCharCode(_scanner.readCodePoint());
}
}
@@ -1462,7 +1462,7 @@
// 1.2's. We use [_isPlainChar] instead of libyaml's character here.
var startPosition = _scanner.position;
while (_isPlainChar) {
- _scanner.readChar();
+ _scanner.readCodePoint();
}
buffer.write(_scanner.substring(startPosition));
end = _scanner.state;
@@ -1587,15 +1587,28 @@
_inBlockContext,
SP || TAB || LF || CR || BOM => false,
NEL => true,
- _ => _isStandardCharacter(char)
+ _ => _isStandardCharacterAt(offset)
};
}
+ bool _isStandardCharacterAt(int offset) {
+ var first = _scanner.peekChar(offset);
+ if (first == null) return false;
+
+ if (isHighSurrogate(first)) {
+ var next = _scanner.peekChar(offset + 1);
+ // A surrogate pair encodes code points from U+010000 to U+10FFFF, so it
+ // must be a standard character.
+ return next != null && isLowSurrogate(next);
+ }
+
+ return _isStandardCharacter(first);
+ }
+
bool _isStandardCharacter(int char) =>
- (char >= 0x00020 && char <= 0x00007E) ||
- (char >= 0x000A0 && char <= 0x00D7FF) ||
- (char >= 0x0E000 && char <= 0x00FFFD) ||
- (char >= 0x10000 && char <= 0x10FFFF);
+ (char >= 0x0020 && char <= 0x007E) ||
+ (char >= 0x00A0 && char <= 0xD7FF) ||
+ (char >= 0xE000 && char <= 0xFFFD);
/// Returns the hexidecimal value of [char].
int _asHex(int char) {
diff --git a/pkgs/yaml/lib/src/utils.dart b/pkgs/yaml/lib/src/utils.dart
index db4612c..d9e20d1 100644
--- a/pkgs/yaml/lib/src/utils.dart
+++ b/pkgs/yaml/lib/src/utils.dart
@@ -43,3 +43,9 @@
if (span != null) message = span.message(message);
print(message);
};
+
+/// Whether [codeUnit] is a UTF-16 high surrogate.
+bool isHighSurrogate(int codeUnit) => codeUnit >>> 10 == 0x36;
+
+/// Whether [codeUnit] is a UTF-16 low surrogate.
+bool isLowSurrogate(int codeUnit) => codeUnit >>> 10 == 0x37;
diff --git a/pkgs/yaml/pubspec.yaml b/pkgs/yaml/pubspec.yaml
index 09b6541..be7d165 100644
--- a/pkgs/yaml/pubspec.yaml
+++ b/pkgs/yaml/pubspec.yaml
@@ -12,7 +12,7 @@
dependencies:
collection: ^1.15.0
source_span: ^1.8.0
- string_scanner: ^1.1.0
+ string_scanner: ^1.2.0
dev_dependencies:
dart_flutter_team_lints: ^3.0.0
diff --git a/pkgs/yaml/test/yaml_test.dart b/pkgs/yaml/test/yaml_test.dart
index bb35ba4..3b5b77d 100644
--- a/pkgs/yaml/test/yaml_test.dart
+++ b/pkgs/yaml/test/yaml_test.dart
@@ -420,20 +420,25 @@
test('[Example 2.17]', () {
expectYamlLoads({
- 'unicode': 'Sosa did fine.\u263A',
+ 'unicode': 'Sosa did fine.\u263A \u{1F680}',
'control': '\b1998\t1999\t2000\n',
'hex esc': '\r\n is \r\n',
'single': '"Howdy!" he cried.',
'quoted': " # Not a 'comment'.",
- 'tie-fighter': '|\\-*-/|'
+ 'tie-fighter': '|\\-*-/|',
+ 'surrogate-pair': 'I \u{D83D}\u{DE03} ️Dart!',
+ 'key-\u{D83D}\u{DD11}': 'Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!',
}, """
- unicode: "Sosa did fine.\\u263A"
+ unicode: "Sosa did fine.\\u263A \\U0001F680"
control: "\\b1998\\t1999\\t2000\\n"
hex esc: "\\x0d\\x0a is \\r\\n"
single: '"Howdy!" he cried.'
quoted: ' # Not a ''comment''.'
- tie-fighter: '|\\-*-/|'""");
+ tie-fighter: '|\\-*-/|'
+
+ surrogate-pair: I \u{D83D}\u{DE03} ️Dart!
+ key-\u{D83D}\u{DD11}: Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!""");
});
test('[Example 2.18]', () {