UTF-8 decoder using a state machine.
Two-pass decoder: the first pass scans through the input to compute
the length of the resulting string and which decoder to use, and the
second pass does the actual decoding.
The same decoder is used for both one-shot and chunked decoding, and
both with and without allowMalformed. If there is an error in the input
and allowMalformed is true, it starts over with a general decoder that
supports malformed input and allocates space as it goes along.
JS targets go directly to the general decoder, as the two-pass approach
is not beneficial on those targets.
Three pieces of the decoder are designed to be pluggable by patches to
optimize the performance further:
- scan, running the first pass of the conversion.
- decode8, decoding Latin1 data into a OneByteString.
- decode16, decoding arbitrary data into a TwoByteString.
The new decoder improves decoding speed, especially for complex input
(many multi-byte characters). Observed speed increases are approximately:
- dart2js: up to 40%
- VM JIT: up to 260%
- VM AOT: up to 130%
The constant overhead of calling the UTF-8 decoder is also significantly
reduced for dart2js.
Code size for dart2js is slightly reduced compared to the old decoder.
ASCII inputs currently see a slight speed decrease for VM targets, which
will be fixed in https://dart-review.googlesource.com/c/sdk/+/145460
This is part of the implementation of the breaking change described at
https://github.com/dart-lang/sdk/issues/41100
Closes https://github.com/dart-lang/sdk/issues/28832
Closes https://github.com/dart-lang/sdk/issues/31954
Ideas for further improvements to the decoder are collected in
https://github.com/dart-lang/sdk/issues/41734
Change-Id: I3c5bb84e8d6783231680a9d34d6c38e8a28ab112
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/142025
Reviewed-by: Stephen Adams <sra@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
diff --git a/sdk/lib/_internal/js_dev_runtime/patch/convert_patch.dart b/sdk/lib/_internal/js_dev_runtime/patch/convert_patch.dart
index 6b85e67..cdc7370 100644
--- a/sdk/lib/_internal/js_dev_runtime/patch/convert_patch.dart
+++ b/sdk/lib/_internal/js_dev_runtime/patch/convert_patch.dart
@@ -500,11 +500,17 @@
}
@patch
-int _scanOneByteCharacters(List<int> units, int from, int endIndex) {
- final to = endIndex;
- for (var i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
+class _Utf8Decoder {
+ @patch
+ _Utf8Decoder(this.allowMalformed) : _state = beforeBom;
+
+ @patch
+ String convertSingle(List<int> codeUnits, int start, int maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, true);
}
- return to - from;
+
+ @patch
+ String convertChunked(List<int> codeUnits, int start, int maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, false);
+ }
}
diff --git a/sdk/lib/_internal/js_runtime/lib/convert_patch.dart b/sdk/lib/_internal/js_runtime/lib/convert_patch.dart
index 421fd51..c2dc264 100644
--- a/sdk/lib/_internal/js_runtime/lib/convert_patch.dart
+++ b/sdk/lib/_internal/js_runtime/lib/convert_patch.dart
@@ -494,11 +494,17 @@
}
@patch
-int _scanOneByteCharacters(List<int> units, int from, int endIndex) {
- final to = endIndex;
- for (var i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
+class _Utf8Decoder {
+ @patch
+ _Utf8Decoder(this.allowMalformed) : _state = beforeBom;
+
+ @patch
+ String convertSingle(List<int> codeUnits, int start, int maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, true);
}
- return to - from;
+
+ @patch
+ String convertChunked(List<int> codeUnits, int start, int maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, false);
+ }
}
diff --git a/sdk/lib/_internal/vm/lib/convert_patch.dart b/sdk/lib/_internal/vm/lib/convert_patch.dart
index f1e46cf..973214a 100644
--- a/sdk/lib/_internal/vm/lib/convert_patch.dart
+++ b/sdk/lib/_internal/vm/lib/convert_patch.dart
@@ -1855,24 +1855,417 @@
}
@patch
-int _scanOneByteCharacters(List<int> units, int from, int endIndex) {
- final to = endIndex;
+class _Utf8Decoder {
+ /// Flags indicating presence of the various kinds of bytes in the input.
+ int _scanFlags = 0;
- // Special case for _Uint8ArrayView.
- if (units is Uint8List) {
- if (from >= 0 && to >= 0 && to <= units.length) {
- for (int i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
- }
- return to - from;
+ /// How many bytes of the BOM have been read so far. Set to -1 when the BOM
+ /// has been skipped (or was not present).
+ int _bomIndex = 0;
+
+ // Table for the scanning phase, which quickly scans through the input.
+ //
+ // Each input byte is looked up in the table, providing a size and some flags.
+ // The sizes are summed, and the flags are or'ed together.
+ //
+ // The resulting size and flags indicate:
+ // A) How many UTF-16 code units will be emitted by the decoding of this
+ // input. This can be used to allocate a string of the correct length up
+ // front.
+ // B) Which decoder and resulting string representation is appropriate. There
+ // are three cases:
+ // 1) Pure ASCII (flags == 0): The input can simply be put into a
+ // OneByteString without further decoding.
+ // 2) Latin1 (flags == (flagLatin1 | flagExtension)): The result can be
+ // represented by a OneByteString, and the decoder can assume that only
+ // Latin1 characters are present.
+ // 3) Arbitrary input (otherwise): Needs a full-featured decoder. Output
+ // can be represented by a TwoByteString.
+
+ static const int sizeMask = 0x03;
+ static const int flagsMask = 0x3C;
+
+ static const int flagExtension = 1 << 2;
+ static const int flagLatin1 = 1 << 3;
+ static const int flagNonLatin1 = 1 << 4;
+ static const int flagIllegal = 1 << 5;
+
+ // ASCII 'A' = 64 + (1);
+ // Extension 'D' = 64 + (0 | flagExtension);
+ // Latin1 'I' = 64 + (1 | flagLatin1);
+ // BMP 'Q' = 64 + (1 | flagNonLatin1);
+ // Non-BMP 'R' = 64 + (2 | flagNonLatin1);
+ // Illegal 'a' = 64 + (1 | flagIllegal);
+ // Illegal 'b' = 64 + (2 | flagIllegal);
+ static const String scanTable = ""
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 00-1F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 20-3F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 40-5F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 60-7F
+ "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD" // 80-9F
+ "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD" // A0-BF
+ "aaIIQQQQQQQQQQQQQQQQQQQQQQQQQQQQ" // C0-DF
+ "QQQQQQQQQQQQQQQQRRRRRbbbbbbbbbbb" // E0-FF
+ ;
+
+ // The VM decoder handles BOM explicitly instead of via the state machine.
+ @patch
+ _Utf8Decoder(this.allowMalformed) : _state = initial;
+
+ @patch
+ String convertSingle(List<int> codeUnits, int start, int maybeEnd) {
+ int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
+
+ // Have bytes as Uint8List.
+ Uint8List bytes;
+ int errorOffset;
+ if (codeUnits is Uint8List) {
+ bytes = codeUnits;
+ errorOffset = 0;
+ } else {
+ bytes = _makeUint8List(codeUnits, start, end);
+ errorOffset = start;
+ end -= start;
+ start = 0;
}
+
+ // Skip initial BOM.
+ start = skipBomSingle(bytes, start, end);
+
+ // Special case empty input.
+ if (start == end) return "";
+
+ // Scan input to determine size and appropriate decoder.
+ int size = scan(bytes, start, end);
+ int flags = _scanFlags;
+
+ if (flags == 0) {
+ // Pure ASCII.
+ assert(size == end - start);
+ // TODO(dartbug.com/41703): String.fromCharCodes has a lot of overhead
+ // checking types and ranges, which is redundant in this case. Find a
+ // more direct way to do the conversion.
+ return String.fromCharCodes(bytes, start, end);
+ }
+
+ String result;
+ if (flags == (flagLatin1 | flagExtension)) {
+ // Latin1.
+ result = decode8(bytes, start, end, size);
+ } else {
+ // Arbitrary Unicode.
+ result = decode16(bytes, start, end, size);
+ }
+ if (_state == accept) {
+ return result;
+ }
+
+ if (!allowMalformed) {
+ if (!isErrorState(_state)) {
+ // Unfinished sequence.
+ _state = errorUnfinished;
+ _charOrIndex = end;
+ }
+ final String message = errorDescription(_state);
+ throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
+ }
+
+ // Start over on slow path.
+ _state = initial;
+ result = decodeGeneral(bytes, start, end, true);
+ assert(!isErrorState(_state));
+ return result;
}
- // Fall through to normal case.
- for (var i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
+ @patch
+ String convertChunked(List<int> codeUnits, int start, int maybeEnd) {
+ int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
+
+ // Have bytes as Uint8List.
+ Uint8List bytes;
+ int errorOffset;
+ if (codeUnits is Uint8List) {
+ bytes = codeUnits;
+ errorOffset = 0;
+ } else {
+ bytes = _makeUint8List(codeUnits, start, end);
+ errorOffset = start;
+ end -= start;
+ start = 0;
+ }
+
+ // Skip initial BOM.
+ start = skipBomChunked(bytes, start, end);
+
+ // Special case empty input.
+ if (start == end) return "";
+
+ // Scan input to determine size and appropriate decoder.
+ int size = scan(bytes, start, end);
+ int flags = _scanFlags;
+
+ // Adjust scan flags and size based on carry-over state.
+ switch (_state) {
+ case IA:
+ break;
+ case X1:
+ flags |= _charOrIndex < (0x100 >> 6) ? flagLatin1 : flagNonLatin1;
+ if (end - start >= 1) {
+ size += _charOrIndex < (0x10000 >> 6) ? 1 : 2;
+ }
+ break;
+ case X2:
+ flags |= flagNonLatin1;
+ if (end - start >= 2) {
+ size += _charOrIndex < (0x10000 >> 12) ? 1 : 2;
+ }
+ break;
+ case TO:
+ case TS:
+ flags |= flagNonLatin1;
+ if (end - start >= 2) size += 1;
+ break;
+ case X3:
+ case QO:
+ case QR:
+ flags |= flagNonLatin1;
+ if (end - start >= 3) size += 2;
+ break;
+ }
+
+ if (flags == 0) {
+ // Pure ASCII.
+ assert(_state == accept);
+ assert(size == end - start);
+ // TODO(dartbug.com/41703): String.fromCharCodes has a lot of overhead
+ // checking types and ranges, which is redundant in this case. Find a
+ // more direct way to do the conversion.
+ return String.fromCharCodes(bytes, start, end);
+ }
+
+ // Do not include any final, incomplete character in size.
+ int extensionCount = 0;
+ int i = end - 1;
+ while (i >= start && (bytes[i] & 0xC0) == 0x80) {
+ extensionCount++;
+ i--;
+ }
+ if (i >= start && bytes[i] >= ((~0x3F >> extensionCount) & 0xFF)) {
+ size -= bytes[i] >= 0xF0 ? 2 : 1;
+ }
+
+ final int carryOverState = _state;
+ final int carryOverChar = _charOrIndex;
+ String result;
+ if (flags == (flagLatin1 | flagExtension)) {
+ // Latin1.
+ result = decode8(bytes, start, end, size);
+ } else {
+ // Arbitrary Unicode.
+ result = decode16(bytes, start, end, size);
+ }
+ if (!isErrorState(_state)) {
+ return result;
+ }
+ assert(_bomIndex == -1);
+
+ if (!allowMalformed) {
+ final String message = errorDescription(_state);
+ _state = initial; // Ready for more input.
+ throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
+ }
+
+ // Start over on slow path.
+ _state = carryOverState;
+ _charOrIndex = carryOverChar;
+ result = decodeGeneral(bytes, start, end, false);
+ assert(!isErrorState(_state));
+ return result;
}
- return to - from;
+
+ @pragma("vm:prefer-inline")
+ int skipBomSingle(Uint8List bytes, int start, int end) {
+ if (end - start >= 3 &&
+ bytes[start] == 0xEF &&
+ bytes[start + 1] == 0xBB &&
+ bytes[start + 2] == 0xBF) {
+ return start + 3;
+ }
+ return start;
+ }
+
+ @pragma("vm:prefer-inline")
+ int skipBomChunked(Uint8List bytes, int start, int end) {
+ assert(start <= end);
+ int bomIndex = _bomIndex;
+ // Already skipped?
+ if (bomIndex == -1) return start;
+
+ const bomValues = <int>[0xEF, 0xBB, 0xBF];
+ int i = start;
+ while (bomIndex < 3) {
+ if (i == end) {
+ // Unfinished BOM.
+ _bomIndex = bomIndex;
+ return start;
+ }
+ if (bytes[i++] != bomValues[bomIndex++]) {
+ // No BOM.
+ _bomIndex = -1;
+ return start;
+ }
+ }
+ // Complete BOM.
+ _bomIndex = -1;
+ _state = initial;
+ return i;
+ }
+
+ // Scanning functions to compute the size of the resulting string and flags
+ // (written to _scanFlags) indicating which decoder to use.
+ // TODO(dartbug.com/41702): Intrinsify this function.
+ int scan(Uint8List bytes, int start, int end) {
+ _scanFlags = 0;
+ for (int i = start; i < end; i++) {
+ if (bytes[i] > 127) return i - start + scan2(bytes, i, end);
+ }
+ return end - start;
+ }
+
+ int scan2(Uint8List bytes, int start, int end) {
+ final String scanTable = _Utf8Decoder.scanTable;
+ int size = 0;
+ int flags = 0;
+ for (int i = start; i < end; i++) {
+ int t = scanTable.codeUnitAt(bytes[i]);
+ size += t & sizeMask;
+ flags |= t;
+ }
+ _scanFlags = flags & flagsMask;
+ return size;
+ }
+
+ String decode8(Uint8List bytes, int start, int end, int size) {
+ assert(start < end);
+ // TODO(dartbug.com/41704): Allocate an uninitialized _OneByteString and
+ // write characters to it using _setAt.
+ Uint8List chars = Uint8List(size);
+ int i = start;
+ int j = 0;
+ if (_state == X1) {
+ // Half-way through 2-byte sequence
+ assert(_charOrIndex == 2 || _charOrIndex == 3);
+ final int e = bytes[i++] ^ 0x80;
+ if (e >= 0x40) {
+ _state = errorMissingExtension;
+ _charOrIndex = i - 1;
+ return "";
+ }
+ chars[j++] = (_charOrIndex << 6) | e;
+ _state = accept;
+ }
+ assert(_state == accept);
+ while (i < end) {
+ int byte = bytes[i++];
+ if (byte >= 0x80) {
+ if (byte < 0xC0) {
+ _state = errorUnexpectedExtension;
+ _charOrIndex = i - 1;
+ return "";
+ }
+ assert(byte == 0xC2 || byte == 0xC3);
+ if (i == end) {
+ _state = X1;
+ _charOrIndex = byte & 0x1F;
+ break;
+ }
+ final int e = bytes[i++] ^ 0x80;
+ if (e >= 0x40) {
+ _state = errorMissingExtension;
+ _charOrIndex = i - 1;
+ return "";
+ }
+ byte = (byte << 6) | e;
+ }
+ chars[j++] = byte;
+ }
+ // Output size must match, unless we are doing single conversion and are
+ // inside an unfinished sequence (which will trigger an error later).
+ assert(_bomIndex == 0 && _state != accept
+ ? (j == size - 1 || j == size - 2)
+ : (j == size));
+ return String.fromCharCodes(chars);
+ }
+
+ String decode16(Uint8List bytes, int start, int end, int size) {
+ assert(start < end);
+ final String typeTable = _Utf8Decoder.typeTable;
+ final String transitionTable = _Utf8Decoder.transitionTable;
+ // TODO(dartbug.com/41704): Allocate an uninitialized _TwoByteString and
+ // write characters to it using _setAt.
+ Uint16List chars = Uint16List(size);
+ int i = start;
+ int j = 0;
+ int state = _state;
+ int char;
+
+ // First byte
+ assert(!isErrorState(state));
+ final int byte = bytes[i++];
+ final int type = typeTable.codeUnitAt(byte) & typeMask;
+ if (state == accept) {
+ char = byte & (shiftedByteMask >> type);
+ state = transitionTable.codeUnitAt(type);
+ } else {
+ char = (byte & 0x3F) | (_charOrIndex << 6);
+ state = transitionTable.codeUnitAt(state + type);
+ }
+
+ while (i < end) {
+ final int byte = bytes[i++];
+ final int type = typeTable.codeUnitAt(byte) & typeMask;
+ if (state == accept) {
+ if (char >= 0x10000) {
+ assert(char < 0x110000);
+ chars[j++] = 0xD7C0 + (char >> 10);
+ chars[j++] = 0xDC00 + (char & 0x3FF);
+ } else {
+ chars[j++] = char;
+ }
+ char = byte & (shiftedByteMask >> type);
+ state = transitionTable.codeUnitAt(type);
+ } else if (isErrorState(state)) {
+ _state = state;
+ _charOrIndex = i - 2;
+ return "";
+ } else {
+ char = (byte & 0x3F) | (char << 6);
+ state = transitionTable.codeUnitAt(state + type);
+ }
+ }
+
+ // Final write?
+ if (state == accept) {
+ if (char >= 0x10000) {
+ assert(char < 0x110000);
+ chars[j++] = 0xD7C0 + (char >> 10);
+ chars[j++] = 0xDC00 + (char & 0x3FF);
+ } else {
+ chars[j++] = char;
+ }
+ } else if (isErrorState(state)) {
+ _state = state;
+ _charOrIndex = end - 1;
+ return "";
+ }
+
+ _state = state;
+ _charOrIndex = char;
+ // Output size must match, unless we are doing single conversion and are
+ // inside an unfinished sequence (which will trigger an error later).
+ assert(_bomIndex == 0 && _state != accept
+ ? (j == size - 1 || j == size - 2)
+ : (j == size));
+ return String.fromCharCodes(chars);
+ }
}
diff --git a/sdk/lib/convert/string_conversion.dart b/sdk/lib/convert/string_conversion.dart
index 086ed79..6c7d965 100644
--- a/sdk/lib/convert/string_conversion.dart
+++ b/sdk/lib/convert/string_conversion.dart
@@ -256,12 +256,13 @@
class _Utf8StringSinkAdapter extends ByteConversionSink {
final _Utf8Decoder _decoder;
final Sink _sink;
+ final StringSink _stringSink;
- _Utf8StringSinkAdapter(this._sink, StringSink stringSink, bool allowMalformed)
- : _decoder = _Utf8Decoder(stringSink, allowMalformed);
+ _Utf8StringSinkAdapter(this._sink, this._stringSink, bool allowMalformed)
+ : _decoder = _Utf8Decoder(allowMalformed);
void close() {
- _decoder.close();
+ _decoder.flush(_stringSink);
if (_sink != null) _sink.close();
}
@@ -271,7 +272,7 @@
void addSlice(
List<int> codeUnits, int startIndex, int endIndex, bool isLast) {
- _decoder.convert(codeUnits, startIndex, endIndex);
+ _stringSink.write(_decoder.convertChunked(codeUnits, startIndex, endIndex));
if (isLast) close();
}
}
@@ -289,11 +290,11 @@
_Utf8ConversionSink._(
this._chunkedSink, StringBuffer stringBuffer, bool allowMalformed)
- : _decoder = _Utf8Decoder(stringBuffer, allowMalformed),
+ : _decoder = _Utf8Decoder(allowMalformed),
_buffer = stringBuffer;
void close() {
- _decoder.close();
+ _decoder.flush(_buffer);
if (_buffer.isNotEmpty) {
var accumulated = _buffer.toString();
_buffer.clear();
@@ -308,7 +309,7 @@
}
void addSlice(List<int> chunk, int startIndex, int endIndex, bool isLast) {
- _decoder.convert(chunk, startIndex, endIndex);
+ _buffer.write(_decoder.convertChunked(chunk, startIndex, endIndex));
if (_buffer.isNotEmpty) {
var accumulated = _buffer.toString();
_chunkedSink.addSlice(accumulated, 0, accumulated.length, isLast);
diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart
index a302e82..3d08ad7 100644
--- a/sdk/lib/convert/utf.dart
+++ b/sdk/lib/convert/utf.dart
@@ -58,12 +58,19 @@
/// was used to instantiate `this`.
String decode(List<int> codeUnits, {bool allowMalformed}) {
allowMalformed ??= _allowMalformed;
- return Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);
+ // Switch between const objects to avoid allocation.
+ Utf8Decoder decoder = allowMalformed
+ ? const Utf8Decoder(allowMalformed: true)
+ : const Utf8Decoder(allowMalformed: false);
+ return decoder.convert(codeUnits);
}
Utf8Encoder get encoder => const Utf8Encoder();
Utf8Decoder get decoder {
- return Utf8Decoder(allowMalformed: _allowMalformed);
+ // Switch between const objects to avoid allocation.
+ return _allowMalformed
+ ? const Utf8Decoder(allowMalformed: true)
+ : const Utf8Decoder(allowMalformed: false);
}
}
@@ -311,29 +318,7 @@
return result;
}
- var length = codeUnits.length;
- end = RangeError.checkValidRange(start, end, length);
-
- // Fast case for ASCII strings avoids StringBuffer/_Utf8Decoder.
- int oneBytes = _scanOneByteCharacters(codeUnits, start, end);
- StringBuffer buffer;
- bool isFirstCharacter = true;
- if (oneBytes > 0) {
- var firstPart = String.fromCharCodes(codeUnits, start, start + oneBytes);
- start += oneBytes;
- if (start == end) {
- return firstPart;
- }
- buffer = StringBuffer(firstPart);
- isFirstCharacter = false;
- }
-
- buffer ??= StringBuffer();
- var decoder = _Utf8Decoder(buffer, _allowMalformed);
- decoder._isFirstCharacter = isFirstCharacter;
- decoder.convert(codeUnits, start, end);
- decoder.flush(codeUnits, end);
- return buffer.toString();
+ return _Utf8Decoder(_allowMalformed).convertSingle(codeUnits, start, end);
}
/// Starts a chunked conversion.
@@ -379,185 +364,314 @@
0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) |
(tail & _SURROGATE_VALUE_MASK);
-/// Decodes UTF-8.
-///
-/// The decoder handles chunked input.
-// TODO(floitsch): make this class public.
class _Utf8Decoder {
- final bool _allowMalformed;
- final StringSink _stringSink;
- bool _isFirstCharacter = true;
- int _value = 0;
- int _expectedUnits = 0;
- int _extraUnits = 0;
+ /// Decode malformed UTF-8 as replacement characters (instead of throwing)?
+ final bool allowMalformed;
- _Utf8Decoder(this._stringSink, this._allowMalformed);
+ /// Decoder DFA state.
+ int _state;
- bool get hasPartialInput => _expectedUnits > 0;
+ /// Partially decoded character. Meaning depends on state. Not used when in
+ /// the initial/accept state. When in an error state, contains the index into
+ /// the input of the error.
+ int _charOrIndex = 0;
- // Limits of one through four byte encodings.
- static const List<int> _LIMITS = <int>[
- _ONE_BYTE_LIMIT,
- _TWO_BYTE_LIMIT,
- _THREE_BYTE_LIMIT,
- _FOUR_BYTE_LIMIT
- ];
+ // State machine for UTF-8 decoding, based on this decoder by Björn Höhrmann:
+ // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ //
+ // One iteration in the state machine proceeds as:
+ //
+ // type = typeTable[byte];
+ // char = (state != accept)
+ // ? (byte & 0x3F) | (char << 6)
+ // : byte & (shiftedByteMask >> type);
+ // state = transitionTable[state + type];
+ //
+ // After each iteration, if state == accept, char is output as a character.
- void close() {
- flush();
+ // Mask to AND with the type read from the table.
+ static const int typeMask = 0x1F;
+ // Mask shifted right by byte type to mask first byte of sequence.
+ static const int shiftedByteMask = 0xF0FE;
+
+ // Byte types.
+ // 'A' = ASCII, 00-7F
+ // 'B' = 2-byte, C2-DF
+ // 'C' = 3-byte, E1-EC, EE
+ // 'D' = 3-byte (possibly surrogate), ED
+ // 'E' = Illegal, C0-C1, F5+
+ // 'F' = Low extension, 80-8F
+ // 'G' = Mid extension, 90-9F
+ // 'H' = High extension, A0-BA, BC-BE
+ // 'I' = Second byte of BOM, BB
+ // 'J' = Third byte of BOM, BF
+ // 'K' = 3-byte (possibly overlong), E0
+ // 'L' = First byte of BOM, EF
+ // 'M' = 4-byte (possibly out-of-range), F4
+ // 'N' = 4-byte, F1-F3
+ // 'O' = 4-byte (possibly overlong), F0
+ static const String typeTable = ""
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 00-1F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 20-3F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 40-5F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 60-7F
+ "FFFFFFFFFFFFFFFFGGGGGGGGGGGGGGGG" // 80-9F
+ "HHHHHHHHHHHHHHHHHHHHHHHHHHHIHHHJ" // A0-BF
+ "EEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" // C0-DF
+ "KCCCCCCCCCCCCDCLONNNMEEEEEEEEEEE" // E0-FF
+ ;
+
+ // States (offsets into transition table).
+ static const int IA = 0x00; // Initial / Accept
+ static const int BB = 0x10; // Before BOM
+ static const int AB = 0x20; // After BOM
+ static const int X1 = 0x30; // Expecting one extension byte
+ static const int X2 = 0x3A; // Expecting two extension bytes
+ static const int X3 = 0x44; // Expecting three extension bytes
+ static const int TO = 0x4E; // Possibly overlong 3-byte
+ static const int TS = 0x58; // Possibly surrogate
+ static const int QO = 0x62; // Possibly overlong 4-byte
+ static const int QR = 0x6C; // Possibly out-of-range 4-byte
+ static const int B1 = 0x76; // One byte into BOM
+ static const int B2 = 0x80; // Two bytes into BOM
+ static const int E1 = 0x41; // Error: Missing extension byte
+ static const int E2 = 0x43; // Error: Unexpected extension byte
+ static const int E3 = 0x45; // Error: Invalid byte
+ static const int E4 = 0x47; // Error: Overlong encoding
+ static const int E5 = 0x49; // Error: Out of range
+ static const int E6 = 0x4B; // Error: Surrogate
+ static const int E7 = 0x4D; // Error: Unfinished
+
+ // Character equivalents for states.
+ static const String _IA = '\u0000';
+ static const String _BB = '\u0010';
+ static const String _AB = '\u0020';
+ static const String _X1 = '\u0030';
+ static const String _X2 = '\u003A';
+ static const String _X3 = '\u0044';
+ static const String _TO = '\u004E';
+ static const String _TS = '\u0058';
+ static const String _QO = '\u0062';
+ static const String _QR = '\u006C';
+ static const String _B1 = '\u0076';
+ static const String _B2 = '\u0080';
+ static const String _E1 = '\u0041';
+ static const String _E2 = '\u0043';
+ static const String _E3 = '\u0045';
+ static const String _E4 = '\u0047';
+ static const String _E5 = '\u0049';
+ static const String _E6 = '\u004B';
+ static const String _E7 = '\u004D';
+
+ // Transition table of the state machine. Maps state and byte type
+ // to next state.
+ static const String transitionTable = " "
+ // A B C D E F G H I J K L M N O
+ "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // IA
+ "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_B1$_QR$_X3$_QO " // BB
+ "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // AB
+ "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_IA" // Overlap 5 E1s X1
+ "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_X1$_X1" // Overlap 5 E1s X2
+ "$_E1$_E1$_E1$_E1$_E1$_X2$_X2$_X2$_X2$_X2" // Overlap 5 E1s X3
+ "$_E1$_E1$_E1$_E1$_E1$_E4$_E4$_X1$_X1$_X1" // Overlap 5 E1s TO
+ "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_E6$_E6$_E6" // Overlap 5 E1s TS
+ "$_E1$_E1$_E1$_E1$_E1$_E4$_X2$_X2$_X2$_X2" // Overlap 5 E1s QO
+ "$_E1$_E1$_E1$_E1$_E1$_X2$_E5$_E5$_E5$_E5" // Overlap 5 E1s QR
+ "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_B2$_X1" // Overlap 5 E1s B1
+ "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_AB$_E1$_E1$_E1$_E1$_E1" // B2
+ ;
+
+ // Aliases for states.
+ static const int initial = IA;
+ static const int accept = IA;
+ static const int beforeBom = BB;
+ static const int afterBom = AB;
+ static const int errorMissingExtension = E1;
+ static const int errorUnexpectedExtension = E2;
+ static const int errorInvalid = E3;
+ static const int errorOverlong = E4;
+ static const int errorOutOfRange = E5;
+ static const int errorSurrogate = E6;
+ static const int errorUnfinished = E7;
+
+ static bool isErrorState(int state) => (state & 1) != 0;
+
+ static String errorDescription(int state) {
+ switch (state) {
+ case errorMissingExtension:
+ return "Missing extension byte";
+ case errorUnexpectedExtension:
+ return "Unexpected extension byte";
+ case errorInvalid:
+ return "Invalid UTF-8 byte";
+ case errorOverlong:
+ return "Overlong encoding";
+ case errorOutOfRange:
+ return "Out of unicode range";
+ case errorSurrogate:
+ return "Encoded surrogate";
+ case errorUnfinished:
+ return "Unfinished UTF-8 octet sequence";
+ default:
+ return "";
+ }
+ }
+
+ external _Utf8Decoder(bool allowMalformed);
+
+ external String convertSingle(List<int> codeUnits, int start, int maybeEnd);
+
+ external String convertChunked(List<int> codeUnits, int start, int maybeEnd);
+
+ String convertGeneral(
+ List<int> codeUnits, int start, int maybeEnd, bool single) {
+ int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
+
+ if (start == end) return "";
+
+ // Have bytes as Uint8List.
+ Uint8List bytes;
+ int errorOffset;
+ if (codeUnits is Uint8List) {
+ bytes = codeUnits;
+ errorOffset = 0;
+ } else {
+ bytes = _makeUint8List(codeUnits, start, end);
+ errorOffset = start;
+ end -= start;
+ start = 0;
+ }
+
+ String result = decodeGeneral(bytes, start, end, single);
+ if (isErrorState(_state)) {
+ String message = errorDescription(_state);
+ _state = initial; // Ready for more input.
+ throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
+ }
+ return result;
}
/// Flushes this decoder as if closed.
///
/// This method throws if the input was partial and the decoder was
/// constructed with `allowMalformed` set to `false`.
- ///
- /// The [source] and [offset] of the current position may be provided,
- /// and are included in the exception if one is thrown.
- void flush([List<int> source, int offset]) {
- if (hasPartialInput) {
- if (!_allowMalformed) {
- throw FormatException(
- "Unfinished UTF-8 octet sequence", source, offset);
- }
- _stringSink.writeCharCode(unicodeReplacementCharacterRune);
- _value = 0;
- _expectedUnits = 0;
- _extraUnits = 0;
+ void flush(StringSink sink) {
+ final int state = _state;
+ _state = initial;
+ if (state <= afterBom) {
+ return;
+ }
+ // Unfinished sequence.
+ if (allowMalformed) {
+ sink.writeCharCode(unicodeReplacementCharacterRune);
+ } else {
+ throw FormatException(errorDescription(errorUnfinished), null, null);
}
}
- void convert(List<int> codeUnits, int startIndex, int endIndex) {
- var value = _value;
- var expectedUnits = _expectedUnits;
- var extraUnits = _extraUnits;
- _value = 0;
- _expectedUnits = 0;
- _extraUnits = 0;
-
- var i = startIndex;
+ String decodeGeneral(Uint8List bytes, int start, int end, bool single) {
+ final String typeTable = _Utf8Decoder.typeTable;
+ final String transitionTable = _Utf8Decoder.transitionTable;
+ int state = _state;
+ int char = _charOrIndex;
+ final StringBuffer buffer = StringBuffer();
+ int i = start;
+ int byte = bytes[i++];
loop:
while (true) {
multibyte:
- if (expectedUnits > 0) {
- do {
- if (i == endIndex) {
- break loop;
- }
- var unit = codeUnits[i];
- if ((unit & 0xC0) != 0x80) {
- expectedUnits = 0;
- if (!_allowMalformed) {
- throw FormatException(
- "Bad UTF-8 encoding 0x${unit.toRadixString(16)}",
- codeUnits,
- i);
+ while (true) {
+ int type = typeTable.codeUnitAt(byte) & typeMask;
+ char = (state <= afterBom)
+ ? byte & (shiftedByteMask >> type)
+ : (byte & 0x3F) | (char << 6);
+ state = transitionTable.codeUnitAt(state + type);
+ if (state == accept) {
+ buffer.writeCharCode(char);
+ if (i == end) break loop;
+ break multibyte;
+ } else if (isErrorState(state)) {
+ if (allowMalformed) {
+ switch (state) {
+ case errorInvalid:
+ case errorUnexpectedExtension:
+ // A single byte that can't start a sequence.
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ break;
+ case errorMissingExtension:
+ // Unfinished sequence followed by a byte that can start a
+ // sequence.
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ // Re-parse offending byte.
+ i -= 1;
+ break;
+ default:
+ // Unfinished sequence followed by a byte that can't start a
+ // sequence.
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ break;
}
- _isFirstCharacter = false;
- _stringSink.writeCharCode(unicodeReplacementCharacterRune);
- break multibyte;
+ state = initial;
} else {
- value = (value << 6) | (unit & 0x3f);
- expectedUnits--;
- i++;
+ _state = state;
+ _charOrIndex = i - 1;
+ return "";
}
- } while (expectedUnits > 0);
- if (value <= _LIMITS[extraUnits - 1]) {
- // Overly long encoding. The value could be encoded with a shorter
- // encoding.
- if (!_allowMalformed) {
- throw FormatException(
- "Overlong encoding of 0x${value.toRadixString(16)}",
- codeUnits,
- i - extraUnits - 1);
- }
- expectedUnits = extraUnits = 0;
- value = unicodeReplacementCharacterRune;
}
- if (value > _FOUR_BYTE_LIMIT) {
- if (!_allowMalformed) {
- throw FormatException(
- "Character outside valid Unicode range: "
- "0x${value.toRadixString(16)}",
- codeUnits,
- i - extraUnits - 1);
- }
- value = unicodeReplacementCharacterRune;
- }
- if (!_isFirstCharacter || value != unicodeBomCharacterRune) {
- _stringSink.writeCharCode(value);
- }
- _isFirstCharacter = false;
+ if (i == end) break loop;
+ byte = bytes[i++];
}
- while (i < endIndex) {
- var oneBytes = _scanOneByteCharacters(codeUnits, i, endIndex);
- if (oneBytes > 0) {
- _isFirstCharacter = false;
- assert(i + oneBytes <= endIndex);
- _stringSink.write(String.fromCharCodes(codeUnits, i, i + oneBytes));
-
- i += oneBytes;
- if (i == endIndex) break;
- }
- var unit = codeUnits[i++];
- // TODO(floitsch): the way we test we could potentially allow
- // units that are too large, if they happen to have the
- // right bit-pattern. (Same is true for the multibyte loop above).
- // TODO(floitsch): optimize this loop. See:
- // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
- if (unit < 0) {
- // TODO(floitsch): should this be unit <= 0 ?
- if (!_allowMalformed) {
- throw FormatException(
- "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}",
- codeUnits,
- i - 1);
+ final int markStart = i;
+ byte = bytes[i++];
+ if (byte < 128) {
+ int markEnd = end;
+ while (i < end) {
+ byte = bytes[i++];
+ if (byte >= 128) {
+ markEnd = i - 1;
+ break;
}
- _stringSink.writeCharCode(unicodeReplacementCharacterRune);
+ }
+ assert(markStart < markEnd);
+ if (markEnd - markStart < 20) {
+ for (int m = markStart; m < markEnd; m++) {
+ buffer.writeCharCode(bytes[m]);
+ }
} else {
- assert(unit > _ONE_BYTE_LIMIT);
- if ((unit & 0xE0) == 0xC0) {
- value = unit & 0x1F;
- expectedUnits = extraUnits = 1;
- continue loop;
- }
- if ((unit & 0xF0) == 0xE0) {
- value = unit & 0x0F;
- expectedUnits = extraUnits = 2;
- continue loop;
- }
- // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
- if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
- value = unit & 0x07;
- expectedUnits = extraUnits = 3;
- continue loop;
- }
- if (!_allowMalformed) {
- throw FormatException(
- "Bad UTF-8 encoding 0x${unit.toRadixString(16)}",
- codeUnits,
- i - 1);
- }
- value = unicodeReplacementCharacterRune;
- expectedUnits = extraUnits = 0;
- _isFirstCharacter = false;
- _stringSink.writeCharCode(value);
+ buffer.write(String.fromCharCodes(bytes, markStart, markEnd));
}
+ if (markEnd == end) break loop;
}
- break loop;
}
- if (expectedUnits > 0) {
- _value = value;
- _expectedUnits = expectedUnits;
- _extraUnits = extraUnits;
+
+ if (single && state > afterBom) {
+ // Unfinished sequence.
+ if (allowMalformed) {
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ } else {
+ _state = errorUnfinished;
+ _charOrIndex = end;
+ return "";
+ }
}
+ _state = state;
+ _charOrIndex = char;
+ return buffer.toString();
+ }
+
+ static Uint8List _makeUint8List(List<int> codeUnits, int start, int end) {
+ final int length = end - start;
+ final Uint8List bytes = Uint8List(length);
+ for (int i = 0; i < length; i++) {
+ int b = codeUnits[start + i];
+ if ((b & ~0xFF) != 0) {
+ // Replace invalid byte values by FF, which is also invalid.
+ b = 0xFF;
+ }
+ bytes[i] = b;
+ }
+ return bytes;
}
}
-
-// Returns the number of bytes in [units] starting at offset [from] which have
-// the leftmost bit set to 0.
-//
-// To increase performance of this critical method we have a special variant of
-// it implemented in the VM's patch files, which is why we make it external.
-external int _scanOneByteCharacters(List<int> units, int from, int endIndex);
diff --git a/sdk/lib/internal/internal.dart b/sdk/lib/internal/internal.dart
index b3ecbe7..d044b03 100644
--- a/sdk/lib/internal/internal.dart
+++ b/sdk/lib/internal/internal.dart
@@ -20,16 +20,17 @@
import 'dart:core' hide Symbol;
import 'dart:core' as core;
import 'dart:math' show Random;
+import 'dart:typed_data' show Uint8List;
part 'async_cast.dart';
part 'cast.dart';
part 'errors.dart';
part 'iterable.dart';
part 'list.dart';
+part 'linked_list.dart';
part 'print.dart';
part 'sort.dart';
part 'symbol.dart';
-part 'linked_list.dart';
// Powers of 10 up to 10^22 are representable as doubles.
// Powers of 10 above that are only approximate due to lack of precission.
diff --git a/sdk_nnbd/lib/_internal/js_dev_runtime/patch/convert_patch.dart b/sdk_nnbd/lib/_internal/js_dev_runtime/patch/convert_patch.dart
index a17be68..1b79fe8 100644
--- a/sdk_nnbd/lib/_internal/js_dev_runtime/patch/convert_patch.dart
+++ b/sdk_nnbd/lib/_internal/js_dev_runtime/patch/convert_patch.dart
@@ -496,11 +496,17 @@
}
@patch
-int _scanOneByteCharacters(List<int> units, int from, int endIndex) {
- final to = endIndex;
- for (var i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
+class _Utf8Decoder {
+ @patch
+ _Utf8Decoder(this.allowMalformed) : _state = beforeBom;
+
+ @patch
+ String convertSingle(List<int> codeUnits, int start, int? maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, true);
}
- return to - from;
+
+ @patch
+ String convertChunked(List<int> codeUnits, int start, int? maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, false);
+ }
}
diff --git a/sdk_nnbd/lib/_internal/js_runtime/lib/convert_patch.dart b/sdk_nnbd/lib/_internal/js_runtime/lib/convert_patch.dart
index 44e8ab3..070946a 100644
--- a/sdk_nnbd/lib/_internal/js_runtime/lib/convert_patch.dart
+++ b/sdk_nnbd/lib/_internal/js_runtime/lib/convert_patch.dart
@@ -494,11 +494,17 @@
}
@patch
-int _scanOneByteCharacters(List<int> units, int from, int endIndex) {
- final to = endIndex;
- for (var i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
+class _Utf8Decoder {
+ @patch
+ _Utf8Decoder(this.allowMalformed) : _state = beforeBom;
+
+ @patch
+ String convertSingle(List<int> codeUnits, int start, int? maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, true);
}
- return to - from;
+
+ @patch
+ String convertChunked(List<int> codeUnits, int start, int? maybeEnd) {
+ return convertGeneral(codeUnits, start, maybeEnd, false);
+ }
}
diff --git a/sdk_nnbd/lib/_internal/vm/lib/convert_patch.dart b/sdk_nnbd/lib/_internal/vm/lib/convert_patch.dart
index 374221c..ae3d66a 100644
--- a/sdk_nnbd/lib/_internal/vm/lib/convert_patch.dart
+++ b/sdk_nnbd/lib/_internal/vm/lib/convert_patch.dart
@@ -1859,24 +1859,417 @@
}
@patch
-int _scanOneByteCharacters(List<int> units, int from, int endIndex) {
- final to = endIndex;
+class _Utf8Decoder {
+ /// Flags indicating presence of the various kinds of bytes in the input.
+ int _scanFlags = 0;
- // Special case for _Uint8ArrayView.
- if (units is Uint8List) {
- if (from >= 0 && to >= 0 && to <= units.length) {
- for (int i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
- }
- return to - from;
+ /// How many bytes of the BOM have been read so far. Set to -1 when the BOM
+ /// has been skipped (or was not present).
+ int _bomIndex = 0;
+
+ // Table for the scanning phase, which quickly scans through the input.
+ //
+ // Each input byte is looked up in the table, providing a size and some flags.
+ // The sizes are summed, and the flags are or'ed together.
+ //
+ // The resulting size and flags indicate:
+ // A) How many UTF-16 code units will be emitted by the decoding of this
+ // input. This can be used to allocate a string of the correct length up
+ // front.
+ // B) Which decoder and resulting string representation is appropriate. There
+ // are three cases:
+ // 1) Pure ASCII (flags == 0): The input can simply be put into a
+ // OneByteString without further decoding.
+ // 2) Latin1 (flags == (flagLatin1 | flagExtension)): The result can be
+ // represented by a OneByteString, and the decoder can assume that only
+ // Latin1 characters are present.
+ // 3) Arbitrary input (otherwise): Needs a full-featured decoder. Output
+ // can be represented by a TwoByteString.
+
+ static const int sizeMask = 0x03;
+ static const int flagsMask = 0x3C;
+
+ static const int flagExtension = 1 << 2;
+ static const int flagLatin1 = 1 << 3;
+ static const int flagNonLatin1 = 1 << 4;
+ static const int flagIllegal = 1 << 5;
+
+ // ASCII 'A' = 64 + (1);
+ // Extension 'D' = 64 + (0 | flagExtension);
+ // Latin1 'I' = 64 + (1 | flagLatin1);
+ // BMP 'Q' = 64 + (1 | flagNonLatin1);
+ // Non-BMP 'R' = 64 + (2 | flagNonLatin1);
+ // Illegal 'a' = 64 + (1 | flagIllegal);
+ // Illegal 'b' = 64 + (2 | flagIllegal);
+ static const String scanTable = ""
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 00-1F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 20-3F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 40-5F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 60-7F
+ "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD" // 80-9F
+ "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD" // A0-BF
+ "aaIIQQQQQQQQQQQQQQQQQQQQQQQQQQQQ" // C0-DF
+ "QQQQQQQQQQQQQQQQRRRRRbbbbbbbbbbb" // E0-FF
+ ;
+
+ // The VM decoder handles BOM explicitly instead of via the state machine.
+ @patch
+ _Utf8Decoder(this.allowMalformed) : _state = initial;
+
+ @patch
+ String convertSingle(List<int> codeUnits, int start, int? maybeEnd) {
+ int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
+
+ // Have bytes as Uint8List.
+ Uint8List bytes;
+ int errorOffset;
+ if (codeUnits is Uint8List) {
+ bytes = codeUnits;
+ errorOffset = 0;
+ } else {
+ bytes = _makeUint8List(codeUnits, start, end);
+ errorOffset = start;
+ end -= start;
+ start = 0;
}
+
+ // Skip initial BOM.
+ start = skipBomSingle(bytes, start, end);
+
+ // Special case empty input.
+ if (start == end) return "";
+
+ // Scan input to determine size and appropriate decoder.
+ int size = scan(bytes, start, end);
+ int flags = _scanFlags;
+
+ if (flags == 0) {
+ // Pure ASCII.
+ assert(size == end - start);
+ // TODO(dartbug.com/41703): String.fromCharCodes has a lot of overhead
+ // checking types and ranges, which is redundant in this case. Find a
+ // more direct way to do the conversion.
+ return String.fromCharCodes(bytes, start, end);
+ }
+
+ String result;
+ if (flags == (flagLatin1 | flagExtension)) {
+ // Latin1.
+ result = decode8(bytes, start, end, size);
+ } else {
+ // Arbitrary Unicode.
+ result = decode16(bytes, start, end, size);
+ }
+ if (_state == accept) {
+ return result;
+ }
+
+ if (!allowMalformed) {
+ if (!isErrorState(_state)) {
+ // Unfinished sequence.
+ _state = errorUnfinished;
+ _charOrIndex = end;
+ }
+ final String message = errorDescription(_state);
+ throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
+ }
+
+ // Start over on slow path.
+ _state = initial;
+ result = decodeGeneral(bytes, start, end, true);
+ assert(!isErrorState(_state));
+ return result;
}
- // Fall through to normal case.
- for (var i = from; i < to; i++) {
- final unit = units[i];
- if ((unit & _ONE_BYTE_LIMIT) != unit) return i - from;
+ @patch
+ String convertChunked(List<int> codeUnits, int start, int? maybeEnd) {
+ int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
+
+ // Have bytes as Uint8List.
+ Uint8List bytes;
+ int errorOffset;
+ if (codeUnits is Uint8List) {
+ bytes = codeUnits;
+ errorOffset = 0;
+ } else {
+ bytes = _makeUint8List(codeUnits, start, end);
+ errorOffset = start;
+ end -= start;
+ start = 0;
+ }
+
+ // Skip initial BOM.
+ start = skipBomChunked(bytes, start, end);
+
+ // Special case empty input.
+ if (start == end) return "";
+
+ // Scan input to determine size and appropriate decoder.
+ int size = scan(bytes, start, end);
+ int flags = _scanFlags;
+
+ // Adjust scan flags and size based on carry-over state.
+ switch (_state) {
+ case IA:
+ break;
+ case X1:
+ flags |= _charOrIndex < (0x100 >> 6) ? flagLatin1 : flagNonLatin1;
+ if (end - start >= 1) {
+ size += _charOrIndex < (0x10000 >> 6) ? 1 : 2;
+ }
+ break;
+ case X2:
+ flags |= flagNonLatin1;
+ if (end - start >= 2) {
+ size += _charOrIndex < (0x10000 >> 12) ? 1 : 2;
+ }
+ break;
+ case TO:
+ case TS:
+ flags |= flagNonLatin1;
+ if (end - start >= 2) size += 1;
+ break;
+ case X3:
+ case QO:
+ case QR:
+ flags |= flagNonLatin1;
+ if (end - start >= 3) size += 2;
+ break;
+ }
+
+ if (flags == 0) {
+ // Pure ASCII.
+ assert(_state == accept);
+ assert(size == end - start);
+ // TODO(dartbug.com/41703): String.fromCharCodes has a lot of overhead
+ // checking types and ranges, which is redundant in this case. Find a
+ // more direct way to do the conversion.
+ return String.fromCharCodes(bytes, start, end);
+ }
+
+ // Do not include any final, incomplete character in size.
+ int extensionCount = 0;
+ int i = end - 1;
+ while (i >= start && (bytes[i] & 0xC0) == 0x80) {
+ extensionCount++;
+ i--;
+ }
+ if (i >= start && bytes[i] >= ((~0x3F >> extensionCount) & 0xFF)) {
+ size -= bytes[i] >= 0xF0 ? 2 : 1;
+ }
+
+ final int carryOverState = _state;
+ final int carryOverChar = _charOrIndex;
+ String result;
+ if (flags == (flagLatin1 | flagExtension)) {
+ // Latin1.
+ result = decode8(bytes, start, end, size);
+ } else {
+ // Arbitrary Unicode.
+ result = decode16(bytes, start, end, size);
+ }
+ if (!isErrorState(_state)) {
+ return result;
+ }
+ assert(_bomIndex == -1);
+
+ if (!allowMalformed) {
+ final String message = errorDescription(_state);
+ _state = initial; // Ready for more input.
+ throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
+ }
+
+ // Start over on slow path.
+ _state = carryOverState;
+ _charOrIndex = carryOverChar;
+ result = decodeGeneral(bytes, start, end, false);
+ assert(!isErrorState(_state));
+ return result;
}
- return to - from;
+
+ @pragma("vm:prefer-inline")
+ int skipBomSingle(Uint8List bytes, int start, int end) {
+ if (end - start >= 3 &&
+ bytes[start] == 0xEF &&
+ bytes[start + 1] == 0xBB &&
+ bytes[start + 2] == 0xBF) {
+ return start + 3;
+ }
+ return start;
+ }
+
+ @pragma("vm:prefer-inline")
+ int skipBomChunked(Uint8List bytes, int start, int end) {
+ assert(start <= end);
+ int bomIndex = _bomIndex;
+ // Already skipped?
+ if (bomIndex == -1) return start;
+
+ const bomValues = <int>[0xEF, 0xBB, 0xBF];
+ int i = start;
+ while (bomIndex < 3) {
+ if (i == end) {
+ // Unfinished BOM.
+ _bomIndex = bomIndex;
+ return start;
+ }
+ if (bytes[i++] != bomValues[bomIndex++]) {
+ // No BOM.
+ _bomIndex = -1;
+ return start;
+ }
+ }
+ // Complete BOM.
+ _bomIndex = -1;
+ _state = initial;
+ return i;
+ }
+
+ // Scanning functions to compute the size of the resulting string and flags
+ // (written to _scanFlags) indicating which decoder to use.
+ // TODO(dartbug.com/41702): Intrinsify this function.
+ int scan(Uint8List bytes, int start, int end) {
+ _scanFlags = 0;
+ for (int i = start; i < end; i++) {
+ if (bytes[i] > 127) return i - start + scan2(bytes, i, end);
+ }
+ return end - start;
+ }
+
+ int scan2(Uint8List bytes, int start, int end) {
+ final String scanTable = _Utf8Decoder.scanTable;
+ int size = 0;
+ int flags = 0;
+ for (int i = start; i < end; i++) {
+ int t = scanTable.codeUnitAt(bytes[i]);
+ size += t & sizeMask;
+ flags |= t;
+ }
+ _scanFlags = flags & flagsMask;
+ return size;
+ }
+
+ String decode8(Uint8List bytes, int start, int end, int size) {
+ assert(start < end);
+ // TODO(dartbug.com/41704): Allocate an uninitialized _OneByteString and
+ // write characters to it using _setAt.
+ Uint8List chars = Uint8List(size);
+ int i = start;
+ int j = 0;
+ if (_state == X1) {
+ // Half-way through a 2-byte sequence.
+ assert(_charOrIndex == 2 || _charOrIndex == 3);
+ final int e = bytes[i++] ^ 0x80;
+ if (e >= 0x40) {
+ _state = errorMissingExtension;
+ _charOrIndex = i - 1;
+ return "";
+ }
+ chars[j++] = (_charOrIndex << 6) | e;
+ _state = accept;
+ }
+ assert(_state == accept);
+ while (i < end) {
+ int byte = bytes[i++];
+ if (byte >= 0x80) {
+ if (byte < 0xC0) {
+ _state = errorUnexpectedExtension;
+ _charOrIndex = i - 1;
+ return "";
+ }
+ assert(byte == 0xC2 || byte == 0xC3);
+ if (i == end) {
+ _state = X1;
+ _charOrIndex = byte & 0x1F;
+ break;
+ }
+ final int e = bytes[i++] ^ 0x80;
+ if (e >= 0x40) {
+ _state = errorMissingExtension;
+ _charOrIndex = i - 1;
+ return "";
+ }
+ byte = (byte << 6) | e;
+ }
+ chars[j++] = byte;
+ }
+ // Output size must match, unless we are doing single conversion and are
+ // inside an unfinished sequence (which will trigger an error later).
+ assert(_bomIndex == 0 && _state != accept
+ ? (j == size - 1 || j == size - 2)
+ : (j == size));
+ return String.fromCharCodes(chars);
+ }
+
+ String decode16(Uint8List bytes, int start, int end, int size) {
+ assert(start < end);
+ final String typeTable = _Utf8Decoder.typeTable;
+ final String transitionTable = _Utf8Decoder.transitionTable;
+ // TODO(dartbug.com/41704): Allocate an uninitialized _TwoByteString and
+ // write characters to it using _setAt.
+ Uint16List chars = Uint16List(size);
+ int i = start;
+ int j = 0;
+ int state = _state;
+ int char;
+
+ // First byte
+ assert(!isErrorState(state));
+ final int byte = bytes[i++];
+ final int type = typeTable.codeUnitAt(byte) & typeMask;
+ if (state == accept) {
+ char = byte & (shiftedByteMask >> type);
+ state = transitionTable.codeUnitAt(type);
+ } else {
+ char = (byte & 0x3F) | (_charOrIndex << 6);
+ state = transitionTable.codeUnitAt(state + type);
+ }
+
+ while (i < end) {
+ final int byte = bytes[i++];
+ final int type = typeTable.codeUnitAt(byte) & typeMask;
+ if (state == accept) {
+ if (char >= 0x10000) {
+ assert(char < 0x110000);
+ chars[j++] = 0xD7C0 + (char >> 10);
+ chars[j++] = 0xDC00 + (char & 0x3FF);
+ } else {
+ chars[j++] = char;
+ }
+ char = byte & (shiftedByteMask >> type);
+ state = transitionTable.codeUnitAt(type);
+ } else if (isErrorState(state)) {
+ _state = state;
+ _charOrIndex = i - 2;
+ return "";
+ } else {
+ char = (byte & 0x3F) | (char << 6);
+ state = transitionTable.codeUnitAt(state + type);
+ }
+ }
+
+ // Final write?
+ if (state == accept) {
+ if (char >= 0x10000) {
+ assert(char < 0x110000);
+ chars[j++] = 0xD7C0 + (char >> 10);
+ chars[j++] = 0xDC00 + (char & 0x3FF);
+ } else {
+ chars[j++] = char;
+ }
+ } else if (isErrorState(state)) {
+ _state = state;
+ _charOrIndex = end - 1;
+ return "";
+ }
+
+ _state = state;
+ _charOrIndex = char;
+ // Output size must match, unless we are doing single conversion and are
+ // inside an unfinished sequence (which will trigger an error later).
+ assert(_bomIndex == 0 && _state != accept
+ ? (j == size - 1 || j == size - 2)
+ : (j == size));
+ return String.fromCharCodes(chars);
+ }
}
diff --git a/sdk_nnbd/lib/convert/string_conversion.dart b/sdk_nnbd/lib/convert/string_conversion.dart
index 1d5f038..2315330 100644
--- a/sdk_nnbd/lib/convert/string_conversion.dart
+++ b/sdk_nnbd/lib/convert/string_conversion.dart
@@ -258,12 +258,13 @@
class _Utf8StringSinkAdapter extends ByteConversionSink {
final _Utf8Decoder _decoder;
final Sink<Object?> _sink;
+ final StringSink _stringSink;
- _Utf8StringSinkAdapter(this._sink, StringSink stringSink, bool allowMalformed)
- : _decoder = _Utf8Decoder(stringSink, allowMalformed);
+ _Utf8StringSinkAdapter(this._sink, this._stringSink, bool allowMalformed)
+ : _decoder = _Utf8Decoder(allowMalformed);
void close() {
- _decoder.close();
+ _decoder.flush(_stringSink);
_sink.close();
}
@@ -273,7 +274,7 @@
void addSlice(
List<int> codeUnits, int startIndex, int endIndex, bool isLast) {
- _decoder.convert(codeUnits, startIndex, endIndex);
+ _stringSink.write(_decoder.convertChunked(codeUnits, startIndex, endIndex));
if (isLast) close();
}
}
@@ -291,11 +292,11 @@
_Utf8ConversionSink._(
this._chunkedSink, StringBuffer stringBuffer, bool allowMalformed)
- : _decoder = _Utf8Decoder(stringBuffer, allowMalformed),
+ : _decoder = _Utf8Decoder(allowMalformed),
_buffer = stringBuffer;
void close() {
- _decoder.close();
+ _decoder.flush(_buffer);
if (_buffer.isNotEmpty) {
var accumulated = _buffer.toString();
_buffer.clear();
@@ -310,7 +311,7 @@
}
void addSlice(List<int> chunk, int startIndex, int endIndex, bool isLast) {
- _decoder.convert(chunk, startIndex, endIndex);
+ _buffer.write(_decoder.convertChunked(chunk, startIndex, endIndex));
if (_buffer.isNotEmpty) {
var accumulated = _buffer.toString();
_chunkedSink.addSlice(accumulated, 0, accumulated.length, isLast);
diff --git a/sdk_nnbd/lib/convert/utf.dart b/sdk_nnbd/lib/convert/utf.dart
index 99ccc90..f495ac0 100644
--- a/sdk_nnbd/lib/convert/utf.dart
+++ b/sdk_nnbd/lib/convert/utf.dart
@@ -55,13 +55,19 @@
/// If [allowMalformed] is not given, it defaults to the `allowMalformed` that
/// was used to instantiate `this`.
String decode(List<int> codeUnits, {bool? allowMalformed}) {
- return Utf8Decoder(allowMalformed: allowMalformed ?? _allowMalformed)
- .convert(codeUnits);
+ // Switch between const objects to avoid allocation.
+ Utf8Decoder decoder = allowMalformed ?? _allowMalformed
+ ? const Utf8Decoder(allowMalformed: true)
+ : const Utf8Decoder(allowMalformed: false);
+ return decoder.convert(codeUnits);
}
Utf8Encoder get encoder => const Utf8Encoder();
Utf8Decoder get decoder {
- return Utf8Decoder(allowMalformed: _allowMalformed);
+ // Switch between const objects to avoid allocation.
+ return _allowMalformed
+ ? const Utf8Decoder(allowMalformed: true)
+ : const Utf8Decoder(allowMalformed: false);
}
}
@@ -313,33 +319,7 @@
return result;
}
- var length = codeUnits.length;
- end = RangeError.checkValidRange(start, end, length);
- // TODO(38725): Remove workaround when assignment promotion is implemented
- if (end == null) {
- throw RangeError("Invalid range");
- }
-
- // Fast case for ASCII strings avoids StringBuffer/_Utf8Decoder.
- int oneBytes = _scanOneByteCharacters(codeUnits, start, end);
- StringBuffer buffer;
- bool isFirstCharacter = true;
- if (oneBytes > 0) {
- var firstPart = String.fromCharCodes(codeUnits, start, start + oneBytes);
- start += oneBytes;
- if (start == end) {
- return firstPart;
- }
- buffer = StringBuffer(firstPart);
- isFirstCharacter = false;
- } else {
- buffer = StringBuffer();
- }
- var decoder = _Utf8Decoder(buffer, _allowMalformed);
- decoder._isFirstCharacter = isFirstCharacter;
- decoder.convert(codeUnits, start, end);
- decoder.flush(codeUnits, end);
- return buffer.toString();
+ return _Utf8Decoder(_allowMalformed).convertSingle(codeUnits, start, end);
}
/// Starts a chunked conversion.
@@ -385,185 +365,314 @@
0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) |
(tail & _SURROGATE_VALUE_MASK);
-/// Decodes UTF-8.
-///
-/// The decoder handles chunked input.
-// TODO(floitsch): make this class public.
class _Utf8Decoder {
- final bool _allowMalformed;
- final StringSink _stringSink;
- bool _isFirstCharacter = true;
- int _value = 0;
- int _expectedUnits = 0;
- int _extraUnits = 0;
+ /// Decode malformed UTF-8 as replacement characters (instead of throwing)?
+ final bool allowMalformed;
- _Utf8Decoder(this._stringSink, this._allowMalformed);
+ /// Decoder DFA state.
+ int _state;
- bool get hasPartialInput => _expectedUnits > 0;
+ /// Partially decoded character. Meaning depends on state. Not used when in
+ /// the initial/accept state. When in an error state, contains the index into
+ /// the input of the error.
+ int _charOrIndex = 0;
- // Limits of one through four byte encodings.
- static const List<int> _LIMITS = <int>[
- _ONE_BYTE_LIMIT,
- _TWO_BYTE_LIMIT,
- _THREE_BYTE_LIMIT,
- _FOUR_BYTE_LIMIT
- ];
+ // State machine for UTF-8 decoding, based on this decoder by Björn Höhrmann:
+ // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ //
+ // One iteration in the state machine proceeds as:
+ //
+ // type = typeTable[byte];
+ // char = (state != accept)
+ // ? (byte & 0x3F) | (char << 6)
+ // : byte & (shiftedByteMask >> type);
+ // state = transitionTable[state + type];
+ //
+ // After each iteration, if state == accept, char is output as a character.
- void close() {
- flush();
+ // Mask to bitwise-AND with the type read from the table.
+ static const int typeMask = 0x1F;
+ // Mask shifted right by byte type to mask first byte of sequence.
+ static const int shiftedByteMask = 0xF0FE;
+
+ // Byte types.
+ // 'A' = ASCII, 00-7F
+ // 'B' = 2-byte, C2-DF
+ // 'C' = 3-byte, E1-EC, EE
+ // 'D' = 3-byte (possibly surrogate), ED
+ // 'E' = Illegal, C0-C1, F5+
+ // 'F' = Low extension, 80-8F
+ // 'G' = Mid extension, 90-9F
+ // 'H' = High extension, A0-BA, BC-BE
+ // 'I' = Second byte of BOM, BB
+ // 'J' = Third byte of BOM, BF
+ // 'K' = 3-byte (possibly overlong), E0
+ // 'L' = First byte of BOM, EF
+ // 'M' = 4-byte (possibly out-of-range), F4
+ // 'N' = 4-byte, F1-F3
+ // 'O' = 4-byte (possibly overlong), F0
+ static const String typeTable = ""
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 00-1F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 20-3F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 40-5F
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 60-7F
+ "FFFFFFFFFFFFFFFFGGGGGGGGGGGGGGGG" // 80-9F
+ "HHHHHHHHHHHHHHHHHHHHHHHHHHHIHHHJ" // A0-BF
+ "EEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" // C0-DF
+ "KCCCCCCCCCCCCDCLONNNMEEEEEEEEEEE" // E0-FF
+ ;
+
+ // States (offsets into transition table).
+ static const int IA = 0x00; // Initial / Accept
+ static const int BB = 0x10; // Before BOM
+ static const int AB = 0x20; // After BOM
+ static const int X1 = 0x30; // Expecting one extension byte
+ static const int X2 = 0x3A; // Expecting two extension bytes
+ static const int X3 = 0x44; // Expecting three extension bytes
+ static const int TO = 0x4E; // Possibly overlong 3-byte
+ static const int TS = 0x58; // Possibly surrogate
+ static const int QO = 0x62; // Possibly overlong 4-byte
+ static const int QR = 0x6C; // Possibly out-of-range 4-byte
+ static const int B1 = 0x76; // One byte into BOM
+ static const int B2 = 0x80; // Two bytes into BOM
+ static const int E1 = 0x41; // Error: Missing extension byte
+ static const int E2 = 0x43; // Error: Unexpected extension byte
+ static const int E3 = 0x45; // Error: Invalid byte
+ static const int E4 = 0x47; // Error: Overlong encoding
+ static const int E5 = 0x49; // Error: Out of range
+ static const int E6 = 0x4B; // Error: Surrogate
+ static const int E7 = 0x4D; // Error: Unfinished
+
+ // Character equivalents for states.
+ static const String _IA = '\u0000';
+ static const String _BB = '\u0010';
+ static const String _AB = '\u0020';
+ static const String _X1 = '\u0030';
+ static const String _X2 = '\u003A';
+ static const String _X3 = '\u0044';
+ static const String _TO = '\u004E';
+ static const String _TS = '\u0058';
+ static const String _QO = '\u0062';
+ static const String _QR = '\u006C';
+ static const String _B1 = '\u0076';
+ static const String _B2 = '\u0080';
+ static const String _E1 = '\u0041';
+ static const String _E2 = '\u0043';
+ static const String _E3 = '\u0045';
+ static const String _E4 = '\u0047';
+ static const String _E5 = '\u0049';
+ static const String _E6 = '\u004B';
+ static const String _E7 = '\u004D';
+
+ // Transition table of the state machine. Maps state and byte type
+ // to next state.
+ static const String transitionTable = " "
+ // A B C D E F G H I J K L M N O
+ "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // IA
+ "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_B1$_QR$_X3$_QO " // BB
+ "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // AB
+ "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_IA" // Overlap 5 E1s X1
+ "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_X1$_X1" // Overlap 5 E1s X2
+ "$_E1$_E1$_E1$_E1$_E1$_X2$_X2$_X2$_X2$_X2" // Overlap 5 E1s X3
+ "$_E1$_E1$_E1$_E1$_E1$_E4$_E4$_X1$_X1$_X1" // Overlap 5 E1s TO
+ "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_E6$_E6$_E6" // Overlap 5 E1s TS
+ "$_E1$_E1$_E1$_E1$_E1$_E4$_X2$_X2$_X2$_X2" // Overlap 5 E1s QO
+ "$_E1$_E1$_E1$_E1$_E1$_X2$_E5$_E5$_E5$_E5" // Overlap 5 E1s QR
+ "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_B2$_X1" // Overlap 5 E1s B1
+ "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_AB$_E1$_E1$_E1$_E1$_E1" // B2
+ ;
+
+ // Aliases for states.
+ static const int initial = IA;
+ static const int accept = IA;
+ static const int beforeBom = BB;
+ static const int afterBom = AB;
+ static const int errorMissingExtension = E1;
+ static const int errorUnexpectedExtension = E2;
+ static const int errorInvalid = E3;
+ static const int errorOverlong = E4;
+ static const int errorOutOfRange = E5;
+ static const int errorSurrogate = E6;
+ static const int errorUnfinished = E7;
+
+ static bool isErrorState(int state) => (state & 1) != 0;
+
+ static String errorDescription(int state) {
+ switch (state) {
+ case errorMissingExtension:
+ return "Missing extension byte";
+ case errorUnexpectedExtension:
+ return "Unexpected extension byte";
+ case errorInvalid:
+ return "Invalid UTF-8 byte";
+ case errorOverlong:
+ return "Overlong encoding";
+ case errorOutOfRange:
+ return "Out of unicode range";
+ case errorSurrogate:
+ return "Encoded surrogate";
+ case errorUnfinished:
+ return "Unfinished UTF-8 octet sequence";
+ default:
+ return "";
+ }
+ }
+
+ external _Utf8Decoder(bool allowMalformed);
+
+ external String convertSingle(List<int> codeUnits, int start, int? maybeEnd);
+
+ external String convertChunked(List<int> codeUnits, int start, int? maybeEnd);
+
+ String convertGeneral(
+ List<int> codeUnits, int start, int? maybeEnd, bool single) {
+ int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
+
+ if (start == end) return "";
+
+ // Have bytes as Uint8List.
+ Uint8List bytes;
+ int errorOffset;
+ if (codeUnits is Uint8List) {
+ bytes = codeUnits;
+ errorOffset = 0;
+ } else {
+ bytes = _makeUint8List(codeUnits, start, end);
+ errorOffset = start;
+ end -= start;
+ start = 0;
+ }
+
+ String result = decodeGeneral(bytes, start, end, single);
+ if (isErrorState(_state)) {
+ String message = errorDescription(_state);
+ _state = initial; // Ready for more input.
+ throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
+ }
+ return result;
}
/// Flushes this decoder as if closed.
///
/// This method throws if the input was partial and the decoder was
/// constructed with `allowMalformed` set to `false`.
- ///
- /// The [source] and [offset] of the current position may be provided,
- /// and are included in the exception if one is thrown.
- void flush([List<int>? source, int? offset]) {
- if (hasPartialInput) {
- if (!_allowMalformed) {
- throw FormatException(
- "Unfinished UTF-8 octet sequence", source, offset);
- }
- _stringSink.writeCharCode(unicodeReplacementCharacterRune);
- _value = 0;
- _expectedUnits = 0;
- _extraUnits = 0;
+ void flush(StringSink sink) {
+ final int state = _state;
+ _state = initial;
+ if (state <= afterBom) {
+ return;
+ }
+ // Unfinished sequence.
+ if (allowMalformed) {
+ sink.writeCharCode(unicodeReplacementCharacterRune);
+ } else {
+ throw FormatException(errorDescription(errorUnfinished), null, null);
}
}
- void convert(List<int> codeUnits, int startIndex, int endIndex) {
- var value = _value;
- var expectedUnits = _expectedUnits;
- var extraUnits = _extraUnits;
- _value = 0;
- _expectedUnits = 0;
- _extraUnits = 0;
-
- var i = startIndex;
+ String decodeGeneral(Uint8List bytes, int start, int end, bool single) {
+ final String typeTable = _Utf8Decoder.typeTable;
+ final String transitionTable = _Utf8Decoder.transitionTable;
+ int state = _state;
+ int char = _charOrIndex;
+ final StringBuffer buffer = StringBuffer();
+ int i = start;
+ int byte = bytes[i++];
loop:
while (true) {
multibyte:
- if (expectedUnits > 0) {
- do {
- if (i == endIndex) {
- break loop;
- }
- var unit = codeUnits[i];
- if ((unit & 0xC0) != 0x80) {
- expectedUnits = 0;
- if (!_allowMalformed) {
- throw FormatException(
- "Bad UTF-8 encoding 0x${unit.toRadixString(16)}",
- codeUnits,
- i);
+ while (true) {
+ int type = typeTable.codeUnitAt(byte) & typeMask;
+ char = (state <= afterBom)
+ ? byte & (shiftedByteMask >> type)
+ : (byte & 0x3F) | (char << 6);
+ state = transitionTable.codeUnitAt(state + type);
+ if (state == accept) {
+ buffer.writeCharCode(char);
+ if (i == end) break loop;
+ break multibyte;
+ } else if (isErrorState(state)) {
+ if (allowMalformed) {
+ switch (state) {
+ case errorInvalid:
+ case errorUnexpectedExtension:
+ // A single byte that can't start a sequence.
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ break;
+ case errorMissingExtension:
+ // Unfinished sequence followed by a byte that can start a
+ // sequence.
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ // Re-parse offending byte.
+ i -= 1;
+ break;
+ default:
+ // Unfinished sequence followed by a byte that can't start a
+ // sequence.
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ break;
}
- _isFirstCharacter = false;
- _stringSink.writeCharCode(unicodeReplacementCharacterRune);
- break multibyte;
+ state = initial;
} else {
- value = (value << 6) | (unit & 0x3f);
- expectedUnits--;
- i++;
+ _state = state;
+ _charOrIndex = i - 1;
+ return "";
}
- } while (expectedUnits > 0);
- if (value <= _LIMITS[extraUnits - 1]) {
- // Overly long encoding. The value could be encoded with a shorter
- // encoding.
- if (!_allowMalformed) {
- throw FormatException(
- "Overlong encoding of 0x${value.toRadixString(16)}",
- codeUnits,
- i - extraUnits - 1);
- }
- expectedUnits = extraUnits = 0;
- value = unicodeReplacementCharacterRune;
}
- if (value > _FOUR_BYTE_LIMIT) {
- if (!_allowMalformed) {
- throw FormatException(
- "Character outside valid Unicode range: "
- "0x${value.toRadixString(16)}",
- codeUnits,
- i - extraUnits - 1);
- }
- value = unicodeReplacementCharacterRune;
- }
- if (!_isFirstCharacter || value != unicodeBomCharacterRune) {
- _stringSink.writeCharCode(value);
- }
- _isFirstCharacter = false;
+ if (i == end) break loop;
+ byte = bytes[i++];
}
- while (i < endIndex) {
- var oneBytes = _scanOneByteCharacters(codeUnits, i, endIndex);
- if (oneBytes > 0) {
- _isFirstCharacter = false;
- assert(i + oneBytes <= endIndex);
- _stringSink.write(String.fromCharCodes(codeUnits, i, i + oneBytes));
-
- i += oneBytes;
- if (i == endIndex) break;
- }
- var unit = codeUnits[i++];
- // TODO(floitsch): the way we test we could potentially allow
- // units that are too large, if they happen to have the
- // right bit-pattern. (Same is true for the multibyte loop above).
- // TODO(floitsch): optimize this loop. See:
- // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
- if (unit < 0) {
- // TODO(floitsch): should this be unit <= 0 ?
- if (!_allowMalformed) {
- throw FormatException(
- "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}",
- codeUnits,
- i - 1);
+ final int markStart = i;
+ byte = bytes[i++];
+ if (byte < 128) {
+ int markEnd = end;
+ while (i < end) {
+ byte = bytes[i++];
+ if (byte >= 128) {
+ markEnd = i - 1;
+ break;
}
- _stringSink.writeCharCode(unicodeReplacementCharacterRune);
+ }
+ assert(markStart < markEnd);
+ if (markEnd - markStart < 20) {
+ for (int m = markStart; m < markEnd; m++) {
+ buffer.writeCharCode(bytes[m]);
+ }
} else {
- assert(unit > _ONE_BYTE_LIMIT);
- if ((unit & 0xE0) == 0xC0) {
- value = unit & 0x1F;
- expectedUnits = extraUnits = 1;
- continue loop;
- }
- if ((unit & 0xF0) == 0xE0) {
- value = unit & 0x0F;
- expectedUnits = extraUnits = 2;
- continue loop;
- }
- // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
- if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
- value = unit & 0x07;
- expectedUnits = extraUnits = 3;
- continue loop;
- }
- if (!_allowMalformed) {
- throw FormatException(
- "Bad UTF-8 encoding 0x${unit.toRadixString(16)}",
- codeUnits,
- i - 1);
- }
- value = unicodeReplacementCharacterRune;
- expectedUnits = extraUnits = 0;
- _isFirstCharacter = false;
- _stringSink.writeCharCode(value);
+ buffer.write(String.fromCharCodes(bytes, markStart, markEnd));
}
+ if (markEnd == end) break loop;
}
- break loop;
}
- if (expectedUnits > 0) {
- _value = value;
- _expectedUnits = expectedUnits;
- _extraUnits = extraUnits;
+
+ if (single && state > afterBom) {
+ // Unfinished sequence.
+ if (allowMalformed) {
+ buffer.writeCharCode(unicodeReplacementCharacterRune);
+ } else {
+ _state = errorUnfinished;
+ _charOrIndex = end;
+ return "";
+ }
}
+ _state = state;
+ _charOrIndex = char;
+ return buffer.toString();
+ }
+
+ static Uint8List _makeUint8List(List<int> codeUnits, int start, int end) {
+ final int length = end - start;
+ final Uint8List bytes = Uint8List(length);
+ for (int i = 0; i < length; i++) {
+ int b = codeUnits[start + i];
+ if ((b & ~0xFF) != 0) {
+ // Replace invalid byte values by FF, which is also invalid.
+ b = 0xFF;
+ }
+ bytes[i] = b;
+ }
+ return bytes;
}
}
-
-// Returns the number of bytes in [units] starting at offset [from] which have
-// the leftmost bit set to 0.
-//
-// To increase performance of this critical method we have a special variant of
-// it implemented in the VM's patch files, which is why we make it external.
-external int _scanOneByteCharacters(List<int> units, int from, int endIndex);
diff --git a/sdk_nnbd/lib/internal/internal.dart b/sdk_nnbd/lib/internal/internal.dart
index bec8b0c..dad3b05 100644
--- a/sdk_nnbd/lib/internal/internal.dart
+++ b/sdk_nnbd/lib/internal/internal.dart
@@ -18,16 +18,17 @@
import 'dart:core' hide Symbol;
import 'dart:core' as core show Symbol;
import 'dart:math' show Random;
+import 'dart:typed_data' show Uint8List;
part 'async_cast.dart';
part 'cast.dart';
part 'errors.dart';
part 'iterable.dart';
part 'list.dart';
+part 'linked_list.dart';
part 'print.dart';
part 'sort.dart';
part 'symbol.dart';
-part 'linked_list.dart';
// Returns true iff `null as T` will succeed based on the
// execution mode.