Handle surrogate pairs during scanning (dart-lang/yaml#159)

Change back to readChar() whenever possible; remove the need to decode the surrogate for further checking
diff --git a/pkgs/yaml/CHANGELOG.md b/pkgs/yaml/CHANGELOG.md
index 37e5660..cd800a8 100644
--- a/pkgs/yaml/CHANGELOG.md
+++ b/pkgs/yaml/CHANGELOG.md
@@ -1,6 +1,7 @@
 ## 3.1.3-wip
 
 * Require Dart 3.4
+* Fix UTF-16 surrogate pair handling in plain scalars.
 
 ## 3.1.2
 
diff --git a/pkgs/yaml/lib/src/scanner.dart b/pkgs/yaml/lib/src/scanner.dart
index 4bf0b93..4b155e1 100644
--- a/pkgs/yaml/lib/src/scanner.dart
+++ b/pkgs/yaml/lib/src/scanner.dart
@@ -253,7 +253,7 @@
       null => false,
       LF || CR || BOM => false,
       TAB || NEL => true,
-      _ => _isStandardCharacter(char),
+      _ => _isStandardCharacterAt(0),
     };
   }
 
@@ -267,7 +267,7 @@
       null => false,
       LF || CR || BOM || SP => false,
       NEL => true,
-      _ => _isStandardCharacter(char),
+      _ => _isStandardCharacterAt(0),
     };
   }
 
@@ -614,9 +614,9 @@
 
     // Consume the indicator token.
     var start = _scanner.state;
-    _scanner.readChar();
-    _scanner.readChar();
-    _scanner.readChar();
+    _scanner.readCodePoint();
+    _scanner.readCodePoint();
+    _scanner.readCodePoint();
 
     _tokens.add(Token(type, _scanner.spanFrom(start)));
   }
@@ -732,7 +732,7 @@
   /// The span of the new token is the current character.
   void _addCharToken(TokenType type) {
     var start = _scanner.state;
-    _scanner.readChar();
+    _scanner.readCodePoint();
     _tokens.add(Token(type, _scanner.spanFrom(start)));
   }
 
@@ -836,7 +836,7 @@
       // libyaml doesn't support unknown directives, but the spec says to ignore
       // them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
       while (!_isBreakOrEnd) {
-        _scanner.readChar();
+        _scanner.readCodePoint();
       }
 
       return null;
@@ -866,7 +866,7 @@
     // disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
     var start = _scanner.position;
     while (_isNonSpace) {
-      _scanner.readChar();
+      _scanner.readCodePoint();
     }
 
     var name = _scanner.substring(start);
@@ -941,13 +941,13 @@
     var start = _scanner.state;
 
     // Eat the indicator character.
-    _scanner.readChar();
+    _scanner.readCodePoint();
 
     // libyaml only allows word characters in anchor names, but the spec
     // disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
     var startPosition = _scanner.position;
     while (_isAnchorChar) {
-      _scanner.readChar();
+      _scanner.readCodePoint();
     }
     var name = _scanner.substring(startPosition);
 
@@ -1032,7 +1032,7 @@
     buffer.write(_scanner.substring(start));
 
     if (_scanner.peekChar() == EXCLAMATION) {
-      buffer.writeCharCode(_scanner.readChar());
+      buffer.writeCharCode(_scanner.readCodePoint());
     } else {
       // It's either the '!' tag or not really a tag handle. If it's a %TAG
       // directive, it's an error. If it's a tag token, it must be part of a
@@ -1083,7 +1083,7 @@
     var start = _scanner.state;
 
     // Eat the indicator '|' or '>'.
-    _scanner.readChar();
+    _scanner.readCodePoint();
 
     // Check for a chomping indicator.
     var chomping = _Chomping.clip;
@@ -1091,7 +1091,7 @@
     var char = _scanner.peekChar();
     if (char == PLUS || char == HYPHEN) {
       chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
-      _scanner.readChar();
+      _scanner.readCodePoint();
 
       // Check for an indentation indicator.
       if (_isDigit) {
@@ -1101,7 +1101,7 @@
               _scanner.spanFrom(start));
         }
 
-        increment = _scanner.readChar() - NUMBER_0;
+        increment = _scanner.readCodePoint() - NUMBER_0;
       }
     } else if (_isDigit) {
       // Do the same as above, but in the opposite order.
@@ -1110,12 +1110,12 @@
             _scanner.spanFrom(start));
       }
 
-      increment = _scanner.readChar() - NUMBER_0;
+      increment = _scanner.readCodePoint() - NUMBER_0;
 
       char = _scanner.peekChar();
       if (char == PLUS || char == HYPHEN) {
         chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
-        _scanner.readChar();
+        _scanner.readCodePoint();
       }
     }
 
@@ -1182,7 +1182,7 @@
 
       var startPosition = _scanner.position;
       while (!_isBreakOrEnd) {
-        _scanner.readChar();
+        _scanner.readCodePoint();
       }
       buffer.write(_scanner.substring(startPosition));
       end = _scanner.state;
@@ -1373,7 +1373,7 @@
             buffer.writeCharCode(value);
           }
         } else {
-          buffer.writeCharCode(_scanner.readChar());
+          buffer.writeCharCode(_scanner.readCodePoint());
         }
       }
 
@@ -1462,7 +1462,7 @@
       // 1.2's. We use [_isPlainChar] instead of libyaml's character here.
       var startPosition = _scanner.position;
       while (_isPlainChar) {
-        _scanner.readChar();
+        _scanner.readCodePoint();
       }
       buffer.write(_scanner.substring(startPosition));
       end = _scanner.state;
@@ -1587,15 +1587,28 @@
         _inBlockContext,
       SP || TAB || LF || CR || BOM => false,
       NEL => true,
-      _ => _isStandardCharacter(char)
+      _ => _isStandardCharacterAt(offset)
     };
   }
 
+  bool _isStandardCharacterAt(int offset) {
+    var first = _scanner.peekChar(offset);
+    if (first == null) return false;
+
+    if (isHighSurrogate(first)) {
+      var next = _scanner.peekChar(offset + 1);
+      // A surrogate pair encodes code points from U+010000 to U+10FFFF, so it
+      // must be a standard character.
+      return next != null && isLowSurrogate(next);
+    }
+
+    return _isStandardCharacter(first);
+  }
+
   bool _isStandardCharacter(int char) =>
-      (char >= 0x00020 && char <= 0x00007E) ||
-      (char >= 0x000A0 && char <= 0x00D7FF) ||
-      (char >= 0x0E000 && char <= 0x00FFFD) ||
-      (char >= 0x10000 && char <= 0x10FFFF);
+      (char >= 0x0020 && char <= 0x007E) ||
+      (char >= 0x00A0 && char <= 0xD7FF) ||
+      (char >= 0xE000 && char <= 0xFFFD);
 
   /// Returns the hexidecimal value of [char].
   int _asHex(int char) {
diff --git a/pkgs/yaml/lib/src/utils.dart b/pkgs/yaml/lib/src/utils.dart
index db4612c..d9e20d1 100644
--- a/pkgs/yaml/lib/src/utils.dart
+++ b/pkgs/yaml/lib/src/utils.dart
@@ -43,3 +43,9 @@
   if (span != null) message = span.message(message);
   print(message);
 };
+
+/// Whether [codeUnit] is a UTF-16 high surrogate.
+bool isHighSurrogate(int codeUnit) => codeUnit >>> 10 == 0x36;
+
+/// Whether [codeUnit] is a UTF-16 low surrogate.
+bool isLowSurrogate(int codeUnit) => codeUnit >>> 10 == 0x37;
diff --git a/pkgs/yaml/pubspec.yaml b/pkgs/yaml/pubspec.yaml
index 09b6541..be7d165 100644
--- a/pkgs/yaml/pubspec.yaml
+++ b/pkgs/yaml/pubspec.yaml
@@ -12,7 +12,7 @@
 dependencies:
   collection: ^1.15.0
   source_span: ^1.8.0
-  string_scanner: ^1.1.0
+  string_scanner: ^1.2.0
 
 dev_dependencies:
   dart_flutter_team_lints: ^3.0.0
diff --git a/pkgs/yaml/test/yaml_test.dart b/pkgs/yaml/test/yaml_test.dart
index bb35ba4..3b5b77d 100644
--- a/pkgs/yaml/test/yaml_test.dart
+++ b/pkgs/yaml/test/yaml_test.dart
@@ -420,20 +420,25 @@
 
     test('[Example 2.17]', () {
       expectYamlLoads({
-        'unicode': 'Sosa did fine.\u263A',
+        'unicode': 'Sosa did fine.\u263A \u{1F680}',
         'control': '\b1998\t1999\t2000\n',
         'hex esc': '\r\n is \r\n',
         'single': '"Howdy!" he cried.',
         'quoted': " # Not a 'comment'.",
-        'tie-fighter': '|\\-*-/|'
+        'tie-fighter': '|\\-*-/|',
+        'surrogate-pair': 'I \u{D83D}\u{DE03}  ️Dart!',
+        'key-\u{D83D}\u{DD11}': 'Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!',
       }, """
-        unicode: "Sosa did fine.\\u263A"
+        unicode: "Sosa did fine.\\u263A \\U0001F680"
         control: "\\b1998\\t1999\\t2000\\n"
         hex esc: "\\x0d\\x0a is \\r\\n"
 
         single: '"Howdy!" he cried.'
         quoted: ' # Not a ''comment''.'
-        tie-fighter: '|\\-*-/|'""");
+        tie-fighter: '|\\-*-/|'
+        
+        surrogate-pair: I \u{D83D}\u{DE03}  ️Dart!
+        key-\u{D83D}\u{DD11}: Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!""");
     });
 
     test('[Example 2.18]', () {