Fix spans generated for HTML with higher-plane unicode characters (#109)
diff --git a/.travis.yml b/.travis.yml
index 337ceca..0033332 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,7 @@
dart:
- dev
- - 2.0.0
+ - 2.3.0
dart_task:
- test: -p vm
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0208179..9bd9d29 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.14.0+3
+
+- Fix spans generated for HTML with higher-plane unicode characters (e.g. emojis)
+
## 0.14.0+2
- Support `package:css` `>=0.13.2 <0.17.0`.
diff --git a/lib/src/html_input_stream.dart b/lib/src/html_input_stream.dart
index 42b1741..4590c5d 100644
--- a/lib/src/html_input_stream.dart
+++ b/lib/src/html_input_stream.dart
@@ -33,7 +33,7 @@
List<int> _rawBytes;
/// Raw UTF-16 codes, used if a Dart String is passed in.
- Iterable<int> _rawChars;
+ List<int> _rawChars;
Queue<String> errors;
@@ -66,7 +66,7 @@
this.sourceUrl])
: charEncodingName = codecName(encoding) {
if (source is String) {
- _rawChars = source.runes.toList();
+ _rawChars = source.codeUnits;
charEncodingName = 'utf-8';
charEncodingCertain = true;
} else if (source is List<int>) {
@@ -96,17 +96,27 @@
}
bool skipNewline = false;
- for (var c in _rawChars) {
+ bool wasSurrogatePair = false;
+ for (int i = 0; i < _rawChars.length; i++) {
+ int c = _rawChars[i];
if (skipNewline) {
skipNewline = false;
if (c == NEWLINE) continue;
}
- if (_invalidUnicode(c)) errors.add('invalid-codepoint');
+ final isSurrogatePair = _isSurrogatePair(_rawChars, i);
+ if (!isSurrogatePair && !wasSurrogatePair) {
+ if (_invalidUnicode(c)) {
+ errors.add('invalid-codepoint');
- if (0xD800 <= c && c <= 0xDFFF) {
- c = 0xFFFD;
- } else if (c == RETURN) {
+ if (0xD800 <= c && c <= 0xDFFF) {
+ c = 0xFFFD;
+ }
+ }
+ }
+ wasSurrogatePair = isSurrogatePair;
+
+ if (c == RETURN) {
skipNewline = true;
c = NEWLINE;
}
@@ -203,21 +213,38 @@
/// EOF when EOF is reached.
String char() {
if (_offset >= _chars.length) return eof;
- return String.fromCharCodes([_chars[_offset++]]);
+ return _isSurrogatePair(_chars, _offset)
+ ? String.fromCharCodes([_chars[_offset++], _chars[_offset++]])
+ : String.fromCharCodes([_chars[_offset++]]);
}
String peekChar() {
if (_offset >= _chars.length) return eof;
- return String.fromCharCodes([_chars[_offset]]);
+ return _isSurrogatePair(_chars, _offset)
+ ? String.fromCharCodes([_chars[_offset], _chars[_offset + 1]])
+ : String.fromCharCodes([_chars[_offset]]);
}
+ // Whether the code units at [i] and [i + 1] of [chars] form a surrogate pair.
+ bool _isSurrogatePair(List<int> chars, int i) {
+ return i + 1 < chars.length &&
+ _isLeadSurrogate(chars[i]) &&
+ _isTrailSurrogate(chars[i + 1]);
+ }
+
+ // Whether [code] (a 16-bit unsigned integer) is a UTF-16 lead surrogate.
+ bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800;
+
+ // Whether [code] (a 16-bit unsigned integer) is a UTF-16 trail surrogate.
+ bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00;
+
/// Returns a string of characters from the stream up to but not
/// including any character in 'characters' or EOF.
String charsUntil(String characters, [bool opposite = false]) {
int start = _offset;
String c;
while ((c = peekChar()) != null && characters.contains(c) == opposite) {
- _offset++;
+ _offset += c.codeUnits.length;
}
return String.fromCharCodes(_chars.sublist(start, _offset));
@@ -227,7 +254,7 @@
// Only one character is allowed to be ungotten at once - it must
// be consumed again before any further call to unget
if (ch != null) {
- _offset--;
+ _offset -= ch.codeUnits.length;
assert(peekChar() == ch);
}
}
@@ -304,18 +331,18 @@
bytes[offset + 2] == 0xBF;
}
-/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
+/// Decodes the [bytes] with the provided [encoding] and returns a list for
/// the UTF-16 code units. Supports the major unicode encodings as well as ascii and
/// windows-1252 encodings.
-Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
+List<int> _decodeBytes(String encoding, List<int> bytes) {
switch (encoding) {
case 'ascii':
- return ascii.decode(bytes).runes;
+ return ascii.decode(bytes).codeUnits;
case 'utf-8':
// NOTE: To match the behavior of the other decode functions, we eat the
// UTF-8 BOM here. This is the default behavior of `utf8.decode`.
- return utf8.decode(bytes).runes;
+ return utf8.decode(bytes).codeUnits;
default:
throw ArgumentError('Encoding $encoding not supported');
diff --git a/pubspec.yaml b/pubspec.yaml
index fcd9ca0..b5eaa1a 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -6,7 +6,7 @@
homepage: https://github.com/dart-lang/html
environment:
- sdk: '>=2.0.0 <3.0.0'
+ sdk: '>=2.3.0 <3.0.0'
dependencies:
csslib: '>=0.13.2 <0.17.0'
diff --git a/test/data/tokenizer/unicodeCharsSurrogates.test b/test/data/tokenizer/unicodeCharsSurrogates.test
new file mode 100644
index 0000000..9b56a98
--- /dev/null
+++ b/test/data/tokenizer/unicodeCharsSurrogates.test
@@ -0,0 +1,24 @@
+{"tests" : [
+{"description": "Unicode surrogate (emoji)",
+"input": "\uD83D\uDC3C",
+"output":[["Character", "\uD83D\uDC3C"]]},
+
+{"description": "Unicode surrogate (emoji) prefixed by characters",
+"input": "before\uD83D\uDC3C",
+"output":[["Character", "before\uD83D\uDC3C"]]},
+
+{"description": "Unicode surrogate (emoji) suffixed by characters",
+"input": "\uD83D\uDC3Cafter",
+"output":[["Character", "\uD83D\uDC3Cafter"]]},
+
+{"description":"Quoted attribute with surrogate unicode content",
+"generateSpans": true,
+"input":"<a href='\uD83D\uDC3C'/>",
+"output":[["StartTag","a",{"href":"\uD83D\uDC3C"},true,0,14]]},
+
+{"description":"Surrogate unicode content followed by attribute",
+"generateSpans": true,
+"input":"\uD83D\uDC3C<a href='b'/>",
+"output":[["Character", "\uD83D\uDC3C", 0, 2],["StartTag","a",{"href":"b"},true,2,15]]}
+]
+}
\ No newline at end of file
diff --git a/test/tokenizer_test.dart b/test/tokenizer_test.dart
index ccb9d8c..3d2aeb3 100644
--- a/test/tokenizer_test.dart
+++ b/test/tokenizer_test.dart
@@ -16,16 +16,20 @@
class TokenizerTestParser {
final String _state;
final String _lastStartTag;
+ final bool _generateSpans;
List outputTokens;
- TokenizerTestParser(String initialState, [String lastStartTag])
+ TokenizerTestParser(String initialState,
+ [String lastStartTag, bool generateSpans = false])
: _state = initialState,
- _lastStartTag = lastStartTag;
+ _lastStartTag = lastStartTag,
+ _generateSpans = generateSpans;
List parse(String str) {
// Note: we need to pass bytes to the tokenizer if we want it to handle BOM.
var bytes = utf8.encode(str);
- var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8');
+ var tokenizer =
+ HtmlTokenizer(bytes, encoding: 'utf-8', generateSpans: _generateSpans);
outputTokens = [];
// Note: we can't get a closure of the state method. However, we can
@@ -68,20 +72,21 @@
}
void processDoctype(DoctypeToken token) {
- outputTokens.add(
+ addOutputToken(token,
["DOCTYPE", token.name, token.publicId, token.systemId, token.correct]);
}
void processStartTag(StartTagToken token) {
- outputTokens.add(["StartTag", token.name, token.data, token.selfClosing]);
+ addOutputToken(
+ token, ["StartTag", token.name, token.data, token.selfClosing]);
}
void processEndTag(EndTagToken token) {
- outputTokens.add(["EndTag", token.name, token.selfClosing]);
+ addOutputToken(token, ["EndTag", token.name, token.selfClosing]);
}
void processComment(StringToken token) {
- outputTokens.add(["Comment", token.data]);
+ addOutputToken(token, ["Comment", token.data]);
}
void processSpaceCharacters(StringToken token) {
@@ -89,7 +94,7 @@
}
void processCharacters(StringToken token) {
- outputTokens.add(["Character", token.data]);
+ addOutputToken(token, ["Character", token.data]);
}
void processEOF(token) {}
@@ -98,7 +103,15 @@
// TODO(jmesserly): when debugging test failures it can be useful to add
// logging here like `print('ParseError $token');`. It would be nice to
// use the actual logging library.
- outputTokens.add(["ParseError", token.data]);
+ addOutputToken(token, ["ParseError", token.data]);
+ }
+
+ void addOutputToken(Token token, List array) {
+ outputTokens.add([
+ ...array,
+ if (token.span != null && _generateSpans) token.span.start.offset,
+ if (token.span != null && _generateSpans) token.span.end.offset,
+ ]);
}
}
@@ -138,16 +151,18 @@
void expectTokensMatch(
List expectedTokens, List receivedTokens, bool ignoreErrorOrder,
[bool ignoreErrors = false, String message]) {
- var checkSelfClosing = false;
+ // If the 'selfClosing' attribute is not included in the expected test tokens,
+ // remove it from the received token.
+ var removeSelfClosing = false;
for (var token in expectedTokens) {
- if (token[0] == "StartTag" && token.length == 4 ||
- token[0] == "EndTag" && token.length == 3) {
- checkSelfClosing = true;
+ if (token[0] == "StartTag" && token.length == 3 ||
+ token[0] == "EndTag" && token.length == 2) {
+ removeSelfClosing = true;
break;
}
}
- if (!checkSelfClosing) {
+ if (removeSelfClosing) {
for (var token in receivedTokens) {
if (token[0] == "StartTag" || token[0] == "EndTag") {
token.removeLast();
@@ -182,8 +197,8 @@
if (!testInfo.containsKey('lastStartTag')) {
testInfo['lastStartTag'] = null;
}
- var parser =
- TokenizerTestParser(testInfo['initialState'], testInfo['lastStartTag']);
+ var parser = TokenizerTestParser(testInfo['initialState'],
+ testInfo['lastStartTag'], testInfo['generateSpans'] ?? false);
var tokens = parser.parse(testInfo['input']);
tokens = concatenateCharacterTokens(tokens);
var received = normalizeTokens(tokens);