Fix spans generated for HTML with higher-plane unicode characters (#109)

commit: 2b392a4b8d739edeac3552dff3f614219a733955 [log] [tgz]
author: cvolzke4 <45087979+cvolzke4@users.noreply.github.com> Fri Sep 20 09:06:20 2019 +1000
committer: Nicholas Shahan <nshahan@google.com> Thu Sep 19 16:06:20 2019 -0700
tree: 03a867819af1d72e181a77f4612d7c7c82493fc0
parent: d37f5887e327a245abb9fc3675446dd96b172957 [diff]
diff --git a/.travis.yml b/.travis.yml
index 337ceca..0033332 100644
--- a/.travis.yml
+++ b/.travis.yml

@@ -2,7 +2,7 @@
 
 dart:
   - dev
-  - 2.0.0
+  - 2.3.0
 
 dart_task:
   - test: -p vm

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0208179..9bd9d29 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -1,3 +1,7 @@
+## 0.14.0+3
+
+- Fix spans generated for HTML with higher-plane unicode characters (e.g. emojis)
+
 ## 0.14.0+2
 
 - Support `package:css` `>=0.13.2 <0.17.0`.

diff --git a/lib/src/html_input_stream.dart b/lib/src/html_input_stream.dart
index 42b1741..4590c5d 100644
--- a/lib/src/html_input_stream.dart
+++ b/lib/src/html_input_stream.dart

@@ -33,7 +33,7 @@
   List<int> _rawBytes;
 
   /// Raw UTF-16 codes, used if a Dart String is passed in.
-  Iterable<int> _rawChars;
+  List<int> _rawChars;
 
   Queue<String> errors;
 
@@ -66,7 +66,7 @@
       this.sourceUrl])
       : charEncodingName = codecName(encoding) {
     if (source is String) {
-      _rawChars = source.runes.toList();
+      _rawChars = source.codeUnits;
       charEncodingName = 'utf-8';
       charEncodingCertain = true;
     } else if (source is List<int>) {
@@ -96,17 +96,27 @@
     }
 
     bool skipNewline = false;
-    for (var c in _rawChars) {
+    bool wasSurrogatePair = false;
+    for (int i = 0; i < _rawChars.length; i++) {
+      int c = _rawChars[i];
       if (skipNewline) {
         skipNewline = false;
         if (c == NEWLINE) continue;
       }
 
-      if (_invalidUnicode(c)) errors.add('invalid-codepoint');
+      final isSurrogatePair = _isSurrogatePair(_rawChars, i);
+      if (!isSurrogatePair && !wasSurrogatePair) {
+        if (_invalidUnicode(c)) {
+          errors.add('invalid-codepoint');
 
-      if (0xD800 <= c && c <= 0xDFFF) {
-        c = 0xFFFD;
-      } else if (c == RETURN) {
+          if (0xD800 <= c && c <= 0xDFFF) {
+            c = 0xFFFD;
+          }
+        }
+      }
+      wasSurrogatePair = isSurrogatePair;
+
+      if (c == RETURN) {
         skipNewline = true;
         c = NEWLINE;
       }
@@ -203,21 +213,38 @@
   /// EOF when EOF is reached.
   String char() {
     if (_offset >= _chars.length) return eof;
-    return String.fromCharCodes([_chars[_offset++]]);
+    return _isSurrogatePair(_chars, _offset)
+        ? String.fromCharCodes([_chars[_offset++], _chars[_offset++]])
+        : String.fromCharCodes([_chars[_offset++]]);
   }
 
   String peekChar() {
     if (_offset >= _chars.length) return eof;
-    return String.fromCharCodes([_chars[_offset]]);
+    return _isSurrogatePair(_chars, _offset)
+        ? String.fromCharCodes([_chars[_offset], _chars[_offset + 1]])
+        : String.fromCharCodes([_chars[_offset]]);
   }
 
+  // Whether the current and next chars indicate a surrogate pair.
+  bool _isSurrogatePair(List<int> chars, int i) {
+    return i + 1 < chars.length &&
+        _isLeadSurrogate(chars[i]) &&
+        _isTrailSurrogate(chars[i + 1]);
+  }
+
+  // Whether the code (a 16-bit unsigned integer) is a UTF-16 lead surrogate.
+  bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800;
+
+  // Whether the code (a 16-bit unsigned integer) is a UTF-16 trail surrogate.
+  bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00;
+
   /// Returns a string of characters from the stream up to but not
   /// including any character in 'characters' or EOF.
   String charsUntil(String characters, [bool opposite = false]) {
     int start = _offset;
     String c;
     while ((c = peekChar()) != null && characters.contains(c) == opposite) {
-      _offset++;
+      _offset += c.codeUnits.length;
     }
 
     return String.fromCharCodes(_chars.sublist(start, _offset));
@@ -227,7 +254,7 @@
     // Only one character is allowed to be ungotten at once - it must
     // be consumed again before any further call to unget
     if (ch != null) {
-      _offset--;
+      _offset -= ch.codeUnits.length;
       assert(peekChar() == ch);
     }
   }
@@ -304,18 +331,18 @@
       bytes[offset + 2] == 0xBF;
 }
 
-/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
+/// Decodes the [bytes] with the provided [encoding] and returns a list for
 /// the codepoints. Supports the major unicode encodings as well as ascii and
 /// and windows-1252 encodings.
-Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
+List<int> _decodeBytes(String encoding, List<int> bytes) {
   switch (encoding) {
     case 'ascii':
-      return ascii.decode(bytes).runes;
+      return ascii.decode(bytes).codeUnits;
 
     case 'utf-8':
       // NOTE: To match the behavior of the other decode functions, we eat the
       // UTF-8 BOM here. This is the default behavior of `utf8.decode`.
-      return utf8.decode(bytes).runes;
+      return utf8.decode(bytes).codeUnits;
 
     default:
       throw ArgumentError('Encoding $encoding not supported');

diff --git a/pubspec.yaml b/pubspec.yaml
index fcd9ca0..b5eaa1a 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml

@@ -6,7 +6,7 @@
 homepage: https://github.com/dart-lang/html
 
 environment:
-  sdk: '>=2.0.0 <3.0.0'
+  sdk: '>=2.3.0 <3.0.0'
 
 dependencies:
   csslib: '>=0.13.2 <0.17.0'

diff --git a/test/data/tokenizer/unicodeCharsSurrogates.test b/test/data/tokenizer/unicodeCharsSurrogates.test
new file mode 100644
index 0000000..9b56a98
--- /dev/null
+++ b/test/data/tokenizer/unicodeCharsSurrogates.test

@@ -0,0 +1,24 @@
+{"tests" : [
+{"description": "Unicode surrogate (emoji)",
+"input": "\uD83D\uDC3C",
+"output":[["Character", "\uD83D\uDC3C"]]},
+
+{"description": "Unicode surrogate (emoji) prefixed by characters",
+"input": "before\uD83D\uDC3C",
+"output":[["Character", "before\uD83D\uDC3C"]]},
+
+{"description": "Unicode surrogate (emoji) suffixed by characters",
+"input": "\uD83D\uDC3Cafter",
+"output":[["Character", "\uD83D\uDC3Cafter"]]},
+
+{"description":"Quoted attribute with surrogate unicode content",
+"generateSpans": true,
+"input":"<a href='\uD83D\uDC3C'/>",
+"output":[["StartTag","a",{"href":"\uD83D\uDC3C"},true,0,14]]},
+
+{"description":"Surrogate unicode content followed by attribute",
+"generateSpans": true,
+"input":"\uD83D\uDC3C<a href='b'/>",
+"output":[["Character", "\uD83D\uDC3C", 0, 2],["StartTag","a",{"href":"b"},true,2,15]]}
+]
+}
\ No newline at end of file

diff --git a/test/tokenizer_test.dart b/test/tokenizer_test.dart
index ccb9d8c..3d2aeb3 100644
--- a/test/tokenizer_test.dart
+++ b/test/tokenizer_test.dart

@@ -16,16 +16,20 @@
 class TokenizerTestParser {
   final String _state;
   final String _lastStartTag;
+  final bool _generateSpans;
   List outputTokens;
 
-  TokenizerTestParser(String initialState, [String lastStartTag])
+  TokenizerTestParser(String initialState,
+      [String lastStartTag, bool generateSpans = false])
       : _state = initialState,
-        _lastStartTag = lastStartTag;
+        _lastStartTag = lastStartTag,
+        _generateSpans = generateSpans;
 
   List parse(String str) {
     // Note: we need to pass bytes to the tokenizer if we want it to handle BOM.
     var bytes = utf8.encode(str);
-    var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8');
+    var tokenizer =
+        HtmlTokenizer(bytes, encoding: 'utf-8', generateSpans: _generateSpans);
     outputTokens = [];
 
     // Note: we can't get a closure of the state method. However, we can
@@ -68,20 +72,21 @@
   }
 
   void processDoctype(DoctypeToken token) {
-    outputTokens.add(
+    addOutputToken(token,
         ["DOCTYPE", token.name, token.publicId, token.systemId, token.correct]);
   }
 
   void processStartTag(StartTagToken token) {
-    outputTokens.add(["StartTag", token.name, token.data, token.selfClosing]);
+    addOutputToken(
+        token, ["StartTag", token.name, token.data, token.selfClosing]);
   }
 
   void processEndTag(EndTagToken token) {
-    outputTokens.add(["EndTag", token.name, token.selfClosing]);
+    addOutputToken(token, ["EndTag", token.name, token.selfClosing]);
   }
 
   void processComment(StringToken token) {
-    outputTokens.add(["Comment", token.data]);
+    addOutputToken(token, ["Comment", token.data]);
   }
 
   void processSpaceCharacters(StringToken token) {
@@ -89,7 +94,7 @@
   }
 
   void processCharacters(StringToken token) {
-    outputTokens.add(["Character", token.data]);
+    addOutputToken(token, ["Character", token.data]);
   }
 
   void processEOF(token) {}
@@ -98,7 +103,15 @@
     // TODO(jmesserly): when debugging test failures it can be useful to add
     // logging here like `print('ParseError $token');`. It would be nice to
     // use the actual logging library.
-    outputTokens.add(["ParseError", token.data]);
+    addOutputToken(token, ["ParseError", token.data]);
+  }
+
+  void addOutputToken(Token token, List array) {
+    outputTokens.add([
+      ...array,
+      if (token.span != null && _generateSpans) token.span.start.offset,
+      if (token.span != null && _generateSpans) token.span.end.offset,
+    ]);
   }
 }
 
@@ -138,16 +151,18 @@
 void expectTokensMatch(
     List expectedTokens, List receivedTokens, bool ignoreErrorOrder,
     [bool ignoreErrors = false, String message]) {
-  var checkSelfClosing = false;
+  // If the 'selfClosing' attribute is not included in the expected test tokens,
+  // remove it from the received token.
+  var removeSelfClosing = false;
   for (var token in expectedTokens) {
-    if (token[0] == "StartTag" && token.length == 4 ||
-        token[0] == "EndTag" && token.length == 3) {
-      checkSelfClosing = true;
+    if (token[0] == "StartTag" && token.length == 3 ||
+        token[0] == "EndTag" && token.length == 2) {
+      removeSelfClosing = true;
       break;
     }
   }
 
-  if (!checkSelfClosing) {
+  if (removeSelfClosing) {
     for (var token in receivedTokens) {
       if (token[0] == "StartTag" || token[0] == "EndTag") {
         token.removeLast();
@@ -182,8 +197,8 @@
   if (!testInfo.containsKey('lastStartTag')) {
     testInfo['lastStartTag'] = null;
   }
-  var parser =
-      TokenizerTestParser(testInfo['initialState'], testInfo['lastStartTag']);
+  var parser = TokenizerTestParser(testInfo['initialState'],
+      testInfo['lastStartTag'], testInfo['generateSpans'] ?? false);
   var tokens = parser.parse(testInfo['input']);
   tokens = concatenateCharacterTokens(tokens);
   var received = normalizeTokens(tokens);
commit	2b392a4b8d739edeac3552dff3f614219a733955	[log] [tgz]
author	cvolzke4 <45087979+cvolzke4@users.noreply.github.com>	Fri Sep 20 09:06:20 2019 +1000
committer	Nicholas Shahan <nshahan@google.com>	Thu Sep 19 16:06:20 2019 -0700
tree	03a867819af1d72e181a77f4612d7c7c82493fc0
parent	d37f5887e327a245abb9fc3675446dd96b172957 [diff]