[dart2js] Take care not to split surrogate pairs when writing output
dart2js writes the JavaScript output file in chunks. With `--utf8`, the
output can contain code points that are represented as a surrogate pair
(two code units) in the UTF-16 string. If the chunking splits the
surrogate pair, then the lower-level UTF-8 encoder does not see the
other element of the pair, so it encodes the illegal unpaired surrogate
as U+FFFD � (REPLACEMENT CHARACTER).
The code point U+10000 ("𐀀") is encoded as the surrogate pair U+D800
followed by U+DC00, which normally is emitted as the UTF8 bytes
... F0 90 80 80 ...
If split, each surrogate is converted to U+FFFD, resulting in bytes
... EF BF BD EF BF BD ...
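This change avoids splitting the surrogate pair by adjusting the index
at which the string is split: if a chunk would end on a lead surrogate,
the cut is moved back by one code unit so that the whole pair is written
with the next chunk.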
Change-Id: I9629bf07d391005934c99d1dd649c55c4c58c3bc
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/249066
Reviewed-by: Mayank Patke <fishythefish@google.com>
Commit-Queue: Stephen Adams <sra@google.com>
Reviewed-by: Joshua Litt <joshualitt@google.com>
diff --git a/pkg/compiler/lib/src/source_file_provider.dart b/pkg/compiler/lib/src/source_file_provider.dart
index b28f72a..41fe730 100644
--- a/pkg/compiler/lib/src/source_file_provider.dart
+++ b/pkg/compiler/lib/src/source_file_provider.dart
@@ -8,7 +8,6 @@
import 'dart:async';
import 'dart:io';
-import 'dart:math' as math;
import 'dart:typed_data';
import 'package:front_end/src/api_unstable/dart2js.dart' as fe;
@@ -385,9 +384,22 @@
int offset = 0;
while (offset < data.length) {
- output.writeStringSync(
- data.substring(offset, math.min(offset + chunkSize, data.length)));
- offset += chunkSize;
+ String chunk;
+ int cut = offset + chunkSize;
+ if (cut < data.length) {
+ // Don't break the string in the middle of a code point encoded as a
+ // surrogate pair (two code units), since `writeStringSync` will encode
+ // the unpaired surrogates as U+FFFD REPLACEMENT CHARACTER.
+ int lastCodeUnit = data.codeUnitAt(cut - 1);
+ if (_isLeadSurrogate(lastCodeUnit)) {
+ cut -= 1;
+ }
+ chunk = data.substring(offset, cut);
+ } else {
+ chunk = offset == 0 ? data : data.substring(offset);
+ }
+ output.writeStringSync(chunk);
+ offset += chunk.length;
}
charactersWritten += data.length;
}
@@ -406,6 +418,8 @@
return _OutputSinkWrapper(writeStringSync, onDone);
}
+ static bool _isLeadSurrogate(int codeUnit) => (codeUnit & 0xFC00) == 0xD800;
+
@override
api.BinaryOutputSink createBinarySink(Uri uri) {
uri = Uri.base.resolveUri(uri);