// sdk/lib/convert/utf.dart - sdk.git - Git at Google

 // Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
 // for details. All rights reserved. Use of this source code is governed by a
 // BSD-style license that can be found in the LICENSE file.

 part of dart.convert;

 /// The Unicode Replacement character `U+FFFD` (�).
 ///
 /// The UTF-8 encoder in this library writes it in place of unpaired
 /// surrogates, and the decoder writes it for malformed input when
 /// `allowMalformed` is `true`.
 const int unicodeReplacementCharacterRune = 0xFFFD;

 /// The Unicode Byte Order Marker (BOM) character `U+FEFF`.
 ///
 /// A UTF-8 encoding of this character at the start of the input is
 /// discarded when decoding.
 const int unicodeBomCharacterRune = 0xFEFF;

 /// An instance of the default implementation of the [Utf8Codec].
 ///
 /// This instance provides convenient access to the most common UTF-8
 /// use cases.
 ///
 /// It is created with the default `allowMalformed: false`, so decoding
 /// malformed input throws a [FormatException] unless `allowMalformed: true`
 /// is passed to [Utf8Codec.decode].
 ///
 /// Examples:
 /// ```dart
 /// var encoded = utf8.encode("Îñţérñåţîöñåļîžåţîờñ");
 /// var decoded = utf8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
 ///                            0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);
 /// // decoded == "blåbærgrød"
 /// ```
 const Utf8Codec utf8 = Utf8Codec();

 /// A codec between Dart strings and UTF-8.
 ///
 /// Encoding turns a string into UTF-8 code units (bytes); decoding turns
 /// UTF-8 code units back into a string.
 class Utf8Codec extends Encoding {
   final bool _allowMalformed;

   /// Creates a [Utf8Codec].
   ///
   /// The optional [allowMalformed] flag selects how [decoder] and [decode]
   /// treat invalid or unterminated octet sequences.
   ///
   /// When it is `true` (and not overridden in the call to [decode]), such
   /// sequences are replaced with the Unicode Replacement character `U+FFFD`
   /// (�). When it is `false`, a [FormatException] is thrown instead.
   const Utf8Codec({bool allowMalformed = false})
       : _allowMalformed = allowMalformed;

   /// The name of this codec, "utf-8".
   String get name {
     return "utf-8";
   }

   /// Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   /// the string they represent.
   ///
   /// A leading encoding of [unicodeBomCharacterRune] in [codeUnits] is
   /// dropped from the result.
   ///
   /// With [allowMalformed] set to `true`, invalid or unterminated sequences
   /// become the Unicode Replacement character `U+FFFD` (�); with `false`, a
   /// [FormatException] is thrown.
   ///
   /// When [allowMalformed] is omitted, the value given to the constructor of
   /// `this` is used.
   String decode(List<int> codeUnits, {bool? allowMalformed}) {
     final lenient = allowMalformed ?? _allowMalformed;
     return _decoderFor(lenient).convert(codeUnits);
   }

   /// The UTF-8 encoder, a canonical const instance.
   Utf8Encoder get encoder {
     return const Utf8Encoder();
   }

   /// A UTF-8 decoder using the `allowMalformed` setting of this codec.
   Utf8Decoder get decoder => _decoderFor(_allowMalformed);

   // Chooses between two const decoders so that no object is allocated.
   static Utf8Decoder _decoderFor(bool allowMalformed) {
     if (allowMalformed) {
       return const Utf8Decoder(allowMalformed: true);
     }
     return const Utf8Decoder(allowMalformed: false);
   }
 }

 /// This class converts strings to their UTF-8 code units (a list of
 /// unsigned 8-bit integers).
 class Utf8Encoder extends Converter<String, List<int>> {
   const Utf8Encoder();

   /// Converts [string] to its UTF-8 code units (a list of
   /// unsigned 8-bit integers).
   ///
   /// If [start] and [end] are provided, only the substring
   /// `string.substring(start, end)` is converted.
   ///
   /// Any unpaired surrogate character (`U+D800`-`U+DFFF`) in the input string
   /// is encoded as a Unicode Replacement character `U+FFFD` (�).
   ///
   /// Throws a [RangeError] if [start] and [end] do not describe a valid range
   /// of [string].
   Uint8List convert(String string, [int start = 0, int? end]) {
     // The assignment promotes [end] to a non-nullable `int`, so no separate
     // null check is needed afterwards.
     end = RangeError.checkValidRange(start, end, string.length);
     var length = end - start;
     if (length == 0) return Uint8List(0);
     // Create a new encoder with a length that is guaranteed to be big enough.
     // A single code unit uses at most 3 bytes, a surrogate pair at most 4.
     var encoder = _Utf8Encoder.withBufferSize(length * 3);
     var endPosition = encoder._fillBuffer(string, start, end);
     assert(endPosition >= end - 1);
     if (endPosition != end) {
       // Encoding skipped the last code unit.
       // That can only happen if the last code unit is a lead surrogate.
       assert(_isLeadSurrogate(string.codeUnitAt(end - 1)));
       // Write a replacement character to represent the unpaired surrogate.
       encoder._writeReplacementCharacter();
     }
     // Only the bytes actually written are returned; the buffer itself was
     // sized for the worst case.
     return encoder._buffer.sublist(0, encoder._bufferIndex);
   }

   /// Starts a chunked conversion.
   ///
   /// The converter works more efficiently if the given [sink] is a
   /// [ByteConversionSink]; any other sink is wrapped in one.
   StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
     return _Utf8EncoderSink(
         sink is ByteConversionSink ? sink : ByteConversionSink.from(sink));
   }

   // Override the base-classes bind, to provide a better type.
   Stream<List<int>> bind(Stream<String> stream) => super.bind(stream);
 }

 /// Low-level encoder that writes the UTF-8 form of UTF-16 code units into a
 /// fixed-size byte buffer.
 // TODO(floitsch): make this class public.
 class _Utf8Encoder {
   /// A lead surrogate carried over between slices, or 0 when none is pending.
   ///
   /// Only [_Utf8EncoderSink] reads and writes this field.
   int _carry = 0;

   /// Position in [_buffer] at which the next byte is stored.
   int _bufferIndex = 0;

   /// Destination for the encoded bytes.
   final Uint8List _buffer;

   static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

   _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

   _Utf8Encoder.withBufferSize(int bufferSize)
       : _buffer = _createBuffer(bufferSize);

   /// Hook letting an implementation choose how the byte storage is created.
   static Uint8List _createBuffer(int size) {
     return Uint8List(size);
   }

   /// Appends the three-byte encoding of U+FFFD to [_buffer].
   ///
   /// This is what an unpaired surrogate is encoded as.
   void _writeReplacementCharacter() {
     _buffer[_bufferIndex++] = 0xEF;
     _buffer[_bufferIndex++] = 0xBF;
     _buffer[_bufferIndex++] = 0xBD;
   }

   /// Encodes [leadingSurrogate], pairing it with [nextCodeUnit] if possible.
   ///
   /// Returns `true` when [nextCodeUnit] is a tail surrogate: the pair is then
   /// written as one four-byte sequence and [nextCodeUnit] is consumed.
   ///
   /// Returns `false` otherwise: a replacement character is written for the
   /// lone lead surrogate and [nextCodeUnit] has not been written. Passing 0
   /// as [nextCodeUnit] is therefore a safe way to flush a lone lead surrogate.
   bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
     if (!_isTailSurrogate(nextCodeUnit)) {
       // Unpaired lead surrogate.
       _writeReplacementCharacter();
       return false;
     }
     final codePoint = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
     // Anything that needs two UTF-16 code units needs four UTF-8 bytes.
     assert(codePoint > _THREE_BYTE_LIMIT);
     assert(codePoint <= _FOUR_BYTE_LIMIT);
     _buffer[_bufferIndex++] = 0xF0 | (codePoint >> 18);
     _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 12) & 0x3f);
     _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 6) & 0x3f);
     _buffer[_bufferIndex++] = 0x80 | (codePoint & 0x3f);
     return true;
   }

   /// Encodes code units of [str] from [start] towards [end] until either the
   /// input or the space in [_buffer] runs out.
   ///
   /// A lead surrogate in the very last position is never encoded here; the
   /// caller is responsible for it.
   ///
   /// Returns the index of the first code unit that has not been encoded.
   int _fillBuffer(String str, int start, int end) {
     var limit = end;
     if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
       // Leave a trailing lead surrogate for the caller.
       limit = end - 1;
     }
     var position = start;
     while (position < limit) {
       final unit = str.codeUnitAt(position);
       final available = _buffer.length - _bufferIndex;
       if (unit <= _ONE_BYTE_LIMIT) {
         // ASCII is represented identically in UTF-8 and UTF-16.
         if (available < 1) break;
         _buffer[_bufferIndex++] = unit;
       } else if (_isLeadSurrogate(unit)) {
         if (available < 4) break;
         // Reading one past [position] is safe: a lead surrogate in the last
         // position was excluded from the loop range above.
         final following = str.codeUnitAt(position + 1);
         if (_writeSurrogate(unit, following)) {
           // The tail surrogate was consumed as well.
           position++;
         }
       } else if (_isTailSurrogate(unit)) {
         if (available < 3) break;
         // Unpaired tail surrogate.
         _writeReplacementCharacter();
       } else if (unit <= _TWO_BYTE_LIMIT) {
         if (available < 2) break;
         _buffer[_bufferIndex++] = 0xC0 | (unit >> 6);
         _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
       } else {
         assert(unit <= _THREE_BYTE_LIMIT);
         if (available < 3) break;
         _buffer[_bufferIndex++] = 0xE0 | (unit >> 12);
         _buffer[_bufferIndex++] = 0x80 | ((unit >> 6) & 0x3f);
         _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
       }
       position++;
     }
     return position;
   }
 }

 /// Chunked string-to-UTF-8 encoder that forwards the encoded bytes (unsigned
 /// 8-bit integers) to a [ByteConversionSink].
 class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
   final ByteConversionSink _sink;

   _Utf8EncoderSink(this._sink);

   void close() {
     if (_carry == 0) {
       _sink.close();
       return;
     }
     // A lead surrogate is still pending. Flushing it through addSlice ends
     // up calling close again, at which point the carry has been cleared.
     addSlice("", 0, 0, true);
   }

   void addSlice(String str, int start, int end, bool isLast) {
     _bufferIndex = 0;

     final sliceIsEmpty = start == end;
     if (sliceIsEmpty && !isLast) return;

     final pendingLead = _carry;
     if (pendingLead != 0) {
       // Try to pair the surrogate left over from the previous slice with the
       // first code unit of this one; 0 stands for "nothing follows".
       var following = 0;
       if (sliceIsEmpty) {
         assert(isLast);
       } else {
         following = str.codeUnitAt(start);
       }
       final paired = _writeSurrogate(pendingLead, following);
       // Pairing can only succeed when the slice had a code unit to offer.
       assert(!paired || !sliceIsEmpty);
       if (paired) start++;
       _carry = 0;
     }
     do {
       start = _fillBuffer(str, start, end);
       final finalSlice = isLast && (start == end);
       if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {
         final canReplaceNow = isLast && _bufferIndex < _buffer.length - 3;
         if (canReplaceNow) {
           // The buffer still has room for the replacement character that
           // stands in for the final, incomplete surrogate.
           _writeReplacementCharacter();
         } else {
           // Keep it as carry instead. When isLast is true, close flushes
           // the carry.
           _carry = str.codeUnitAt(start);
         }
         start++;
       }
       _sink.addSlice(_buffer, 0, _bufferIndex, finalSlice);
       _bufferIndex = 0;
     } while (start < end);
     if (isLast) close();
   }

   // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
   // needs to deal with malformed input.
 }

 /// Converter from UTF-8 code units (lists of unsigned 8-bit integers) to
 /// strings.
 class Utf8Decoder extends Converter<List<int>, String> {
   final bool _allowMalformed;

   /// Creates a [Utf8Decoder].
   ///
   /// The optional [allowMalformed] flag selects how [convert] treats invalid
   /// or unterminated character sequences.
   ///
   /// When it is `true`, [convert] substitutes the Unicode Replacement
   /// character `U+FFFD` (�) for such sequences. When it is `false`, [convert]
   /// throws a [FormatException].
   const Utf8Decoder({bool allowMalformed = false})
       : _allowMalformed = allowMalformed;

   /// Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to
   /// the string they represent.
   ///
   /// Only the code units from [start] up to, but not including, [end] are
   /// used. When [end] is omitted, it defaults to `codeUnits.length`.
   ///
   /// A leading encoding of [unicodeBomCharacterRune] in [codeUnits] is
   /// dropped from the result.
   String convert(List<int> codeUnits, [int start = 0, int? end]) {
     // The implementation gets the first chance to handle the conversion, so
     // it can specialize on the type of codeUnits. A null result means it
     // declined, and the general decoder takes over.
     return _convertIntercepted(_allowMalformed, codeUnits, start, end) ??
         _Utf8Decoder(_allowMalformed).convertSingle(codeUnits, start, end);
   }

   /// Starts a chunked conversion.
   ///
   /// The converter works more efficiently if the given [sink] is a
   /// [StringConversionSink]; any other sink is wrapped in one.
   ByteConversionSink startChunkedConversion(Sink<String> sink) {
     final StringConversionSink stringSink =
         sink is StringConversionSink ? sink : StringConversionSink.from(sink);
     return stringSink.asUtf8Sink(_allowMalformed);
   }

   // Overrides the base class's bind only to give it a more precise type.
   Stream<String> bind(Stream<List<int>> stream) {
     return super.bind(stream);
   }

   external Converter<List<int>, T> fuse<T>(Converter<String, T> next);

   external static String? _convertIntercepted(
       bool allowMalformed, List<int> codeUnits, int start, int? end);
 }

 // UTF-8 constants: the largest value that fits in a sequence of each length.
 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

 // UTF-16 constants: the tag mask selects the bits identifying a surrogate,
 // the value mask selects its 10 payload bits.
 const int _SURROGATE_TAG_MASK = 0xFC00;
 const int _SURROGATE_VALUE_MASK = 0x3FF;
 const int _LEAD_SURROGATE_MIN = 0xD800;
 const int _TAIL_SURROGATE_MIN = 0xDC00;

 /// Whether [codeUnit] carries the tag bits of a lead surrogate.
 bool _isLeadSurrogate(int codeUnit) {
   final tag = codeUnit & _SURROGATE_TAG_MASK;
   return tag == _LEAD_SURROGATE_MIN;
 }

 /// Whether [codeUnit] carries the tag bits of a tail surrogate.
 bool _isTailSurrogate(int codeUnit) {
   final tag = codeUnit & _SURROGATE_TAG_MASK;
   return tag == _TAIL_SURROGATE_MIN;
 }

 /// Combines the payload bits of [lead] and [tail] into the code point the
 /// surrogate pair stands for.
 int _combineSurrogatePair(int lead, int tail) {
   final highBits = (lead & _SURROGATE_VALUE_MASK) << 10;
   final lowBits = tail & _SURROGATE_VALUE_MASK;
   return (0x10000 + highBits) | lowBits;
 }

 /// UTF-8 decoder driven by a table-based state machine.
 ///
 /// The machine state ([_state], [_charOrIndex]) is stored on the instance at
 /// the end of [decodeGeneral] and read back at the start of the next call.
 class _Utf8Decoder {
   /// Decode malformed UTF-8 as replacement characters (instead of throwing)?
   final bool allowMalformed;

   /// Decoder DFA state.
   ///
   /// Has no initializer here; NOTE(review): presumably set by the external
   /// constructor - confirm against the platform implementation.
   int _state;

   /// Partially decoded character. Meaning depends on state. Not used when in
   /// the initial/accept state. When in an error state, contains the index into
   /// the input of the error.
   int _charOrIndex = 0;

   // State machine for UTF-8 decoding, based on this decoder by Björn Höhrmann:
   // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
   //
   // One iteration in the state machine proceeds as:
   //
   // type = typeTable[byte];
   // char = (state != accept)
   //     ? (byte & 0x3F) | (char << 6)
   //     : byte & (shiftedByteMask >> type);
   // state = transitionTable[state + type];
   //
   // After each iteration, if state == accept, char is output as a character.

   // Mask to and on the type read from the table.
   // It maps the letters 'A'..'O' of [typeTable] to the numbers 1..15.
   static const int typeMask = 0x1F;
   // Mask shifted right by byte type to mask first byte of sequence.
   static const int shiftedByteMask = 0xF0FE;

   // Byte types.
   // 'A' = ASCII, 00-7F
   // 'B' = 2-byte, C2-DF
   // 'C' = 3-byte, E1-EC, EE
   // 'D' = 3-byte (possibly surrogate), ED
   // 'E' = Illegal, C0-C1, F5+
   // 'F' = Low extension, 80-8F
   // 'G' = Mid extension, 90-9F
   // 'H' = High extension, A0-BA, BC-BE
   // 'I' = Second byte of BOM, BB
   // 'J' = Third byte of BOM, BF
   // 'K' = 3-byte (possibly overlong), E0
   // 'L' = First byte of BOM, EF
   // 'M' = 4-byte (possibly out-of-range), F4
   // 'N' = 4-byte, F1-F3
   // 'O' = 4-byte (possibly overlong), F0
   //
   // The table has one letter per byte value (256 in total), indexed by the
   // byte itself.
   static const String typeTable = ""
       "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 00-1F
       "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 20-3F
       "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 40-5F
       "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 60-7F
       "FFFFFFFFFFFFFFFFGGGGGGGGGGGGGGGG" // 80-9F
       "HHHHHHHHHHHHHHHHHHHHHHHHHHHIHHHJ" // A0-BF
       "EEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" // C0-DF
       "KCCCCCCCCCCCCDCLONNNMEEEEEEEEEEE" // E0-FF
       ;

   // States (offsets into transition table).
   //
   // Non-error states are even and error states are odd, which is what
   // [isErrorState] tests. The error values are not row offsets; they only
   // ever appear as entries of the transition table.
   static const int IA = 0x00; // Initial / Accept
   static const int BB = 0x10; // Before BOM
   static const int AB = 0x20; // After BOM
   static const int X1 = 0x30; // Expecting one extension byte
   static const int X2 = 0x3A; // Expecting two extension bytes
   static const int X3 = 0x44; // Expecting three extension bytes
   static const int TO = 0x4E; // Possibly overlong 3-byte
   static const int TS = 0x58; // Possibly surrogate
   static const int QO = 0x62; // Possibly overlong 4-byte
   static const int QR = 0x6C; // Possibly out-of-range 4-byte
   static const int B1 = 0x76; // One byte into BOM
   static const int B2 = 0x80; // Two bytes into BOM
   static const int E1 = 0x41; // Error: Missing extension byte
   static const int E2 = 0x43; // Error: Unexpected extension byte
   static const int E3 = 0x45; // Error: Invalid byte
   static const int E4 = 0x47; // Error: Overlong encoding
   static const int E5 = 0x49; // Error: Out of range
   static const int E6 = 0x4B; // Error: Surrogate
   static const int E7 = 0x4D; // Error: Unfinished

   // Character equivalents for states.
   // Each is the one-character string whose code unit equals the state value
   // above, so the states can be interpolated into [transitionTable].
   static const String _IA = '\u0000';
   static const String _BB = '\u0010';
   static const String _AB = '\u0020';
   static const String _X1 = '\u0030';
   static const String _X2 = '\u003A';
   static const String _X3 = '\u0044';
   static const String _TO = '\u004E';
   static const String _TS = '\u0058';
   static const String _QO = '\u0062';
   static const String _QR = '\u006C';
   static const String _B1 = '\u0076';
   static const String _B2 = '\u0080';
   static const String _E1 = '\u0041';
   static const String _E2 = '\u0043';
   static const String _E3 = '\u0045';
   static const String _E4 = '\u0047';
   static const String _E5 = '\u0049';
   static const String _E6 = '\u004B';
   static const String _E7 = '\u004D';

   // Transition table of the state machine. Maps state and byte type
   // to next state.
   //
   // Layout: the entry for (state, type) is at index `state + type`, with
   // types numbered 1..15, so index 0 is a filler character. The first three
   // rows hold 15 entries plus one filler each (row stride 0x10). The
   // following rows hold only the 10 entries for types A-J (row stride 0x0A);
   // a lookup for types K-O in such a row lands on the first five entries of
   // the next row, which are always E1. The last row spells those five E1
   // entries out because no row follows it.
   static const String transitionTable = " "
       // A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
       "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // IA
       "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_B1$_QR$_X3$_QO " // BB
       "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // AB
       "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_IA" // Overlap 5 E1s        X1
       "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_X1$_X1" // Overlap 5 E1s        X2
       "$_E1$_E1$_E1$_E1$_E1$_X2$_X2$_X2$_X2$_X2" // Overlap 5 E1s        X3
       "$_E1$_E1$_E1$_E1$_E1$_E4$_E4$_X1$_X1$_X1" // Overlap 5 E1s        TO
       "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_E6$_E6$_E6" // Overlap 5 E1s        TS
       "$_E1$_E1$_E1$_E1$_E1$_E4$_X2$_X2$_X2$_X2" // Overlap 5 E1s        QO
       "$_E1$_E1$_E1$_E1$_E1$_X2$_E5$_E5$_E5$_E5" // Overlap 5 E1s        QR
       "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_B2$_X1" // Overlap 5 E1s        B1
       "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_AB$_E1$_E1$_E1$_E1$_E1" //  B2
       ;

   // Aliases for states.
   static const int initial = IA;
   static const int accept = IA;
   static const int beforeBom = BB;
   static const int afterBom = AB;
   static const int errorMissingExtension = E1;
   static const int errorUnexpectedExtension = E2;
   static const int errorInvalid = E3;
   static const int errorOverlong = E4;
   static const int errorOutOfRange = E5;
   static const int errorSurrogate = E6;
   static const int errorUnfinished = E7;

   /// Whether [state] is one of the error states.
   ///
   /// Error states are exactly the odd state values.
   @pragma("vm:prefer-inline")
   static bool isErrorState(int state) => (state & 1) != 0;

   /// Returns the message used in a [FormatException] for the error [state].
   ///
   /// Returns the empty string if [state] is not one of the error states.
   static String errorDescription(int state) {
     switch (state) {
       case errorMissingExtension:
         return "Missing extension byte";
       case errorUnexpectedExtension:
         return "Unexpected extension byte";
       case errorInvalid:
         return "Invalid UTF-8 byte";
       case errorOverlong:
         return "Overlong encoding";
       case errorOutOfRange:
         return "Out of unicode range";
       case errorSurrogate:
         return "Encoded surrogate";
       case errorUnfinished:
         return "Unfinished UTF-8 octet sequence";
       default:
         return "";
     }
   }

   // The constructor and the two entry points below are `external`: their
   // bodies are supplied outside this file.
   external _Utf8Decoder(bool allowMalformed);

   external String convertSingle(List<int> codeUnits, int start, int? maybeEnd);

   external String convertChunked(List<int> codeUnits, int start, int? maybeEnd);

   /// Decodes the range of [codeUnits] from [start] to [maybeEnd].
   ///
   /// The range is validated with [RangeError.checkValidRange]. An empty range
   /// yields the empty string without touching the decoder state.
   ///
   /// When [single] is `true`, a sequence left unfinished at the end of the
   /// range is treated as malformed; when `false`, it is kept in the decoder
   /// state for the next call.
   ///
   /// Throws a [FormatException] if decoding ended in an error state. The
   /// reported offset refers to [codeUnits], and the decoder state is reset
   /// to [initial] before throwing.
   String convertGeneral(
       List<int> codeUnits, int start, int? maybeEnd, bool single) {
     int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);

     if (start == end) return "";

     // Have bytes as Uint8List.
     Uint8List bytes;
     int errorOffset;
     if (codeUnits is Uint8List) {
       bytes = codeUnits;
       errorOffset = 0;
     } else {
       // The copy holds only the requested range, so indices are rebased to
       // start at 0 and errorOffset remembers the shift for error reporting.
       bytes = _makeUint8List(codeUnits, start, end);
       errorOffset = start;
       end -= start;
       start = 0;
     }

     String result = _convertRecursive(bytes, start, end, single);
     if (isErrorState(_state)) {
       String message = errorDescription(_state);
       _state = initial; // Ready for more input.
       // In an error state, _charOrIndex holds the index of the error in bytes.
       throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
     }
     return result;
   }

   /// Decodes `bytes[start..end)`, splitting ranges of more than 1000 bytes
   /// in half and concatenating the results of the halves.
   ///
   /// If decoding the first half ends in an error state, its result is
   /// returned immediately and the second half is not decoded.
   String _convertRecursive(Uint8List bytes, int start, int end, bool single) {
     // Chunk long strings to avoid a pathological case of JS repeated string
     // concatenation.
     if (end - start > 1000) {
       int mid = (start + end) ~/ 2;
       // The first half is never the end of the input, hence `single: false`.
       String s1 = _convertRecursive(bytes, start, mid, false);
       if (isErrorState(_state)) return s1;
       String s2 = _convertRecursive(bytes, mid, end, single);
       return s1 + s2;
     }
     return decodeGeneral(bytes, start, end, single);
   }

   /// Flushes this decoder as if closed.
   ///
   /// This method throws if the input was partial and the decoder was
   /// constructed with `allowMalformed` set to `false`.
   void flush(StringSink sink) {
     final int state = _state;
     _state = initial;
     // States up to afterBom mean that no sequence is in progress.
     if (state <= afterBom) {
       return;
     }
     // Unfinished sequence.
     if (allowMalformed) {
       sink.writeCharCode(unicodeReplacementCharacterRune);
     } else {
       throw FormatException(errorDescription(errorUnfinished), null, null);
     }
   }

   /// Runs the state machine over `bytes[start..end)` and returns the decoded
   /// text.
   ///
   /// Requires `start < end`: the first byte is read unconditionally.
   ///
   /// Decoding resumes from [_state] and [_charOrIndex], and both are updated
   /// before returning. On malformed input with [allowMalformed] `false`, the
   /// method stores the error state in [_state] and the index of the error in
   /// [_charOrIndex], then returns the empty string; the caller is expected to
   /// check [_state].
   ///
   /// When [single] is `true`, a sequence still unfinished at [end] is
   /// malformed.
   String decodeGeneral(Uint8List bytes, int start, int end, bool single) {
     final String typeTable = _Utf8Decoder.typeTable;
     final String transitionTable = _Utf8Decoder.transitionTable;
     int state = _state;
     int char = _charOrIndex;
     final StringBuffer buffer = StringBuffer();
     int i = start;
     int byte = bytes[i++];
     loop:
     while (true) {
       // Feeds bytes to the state machine until a character is accepted or
       // the input ends.
       multibyte:
       while (true) {
         int type = typeTable.codeUnitAt(byte) & typeMask;
         // In a state with no sequence in progress, the byte starts a new
         // character; otherwise its low six bits extend the current one.
         char = (state <= afterBom)
             ? byte & (shiftedByteMask >> type)
             : (byte & 0x3F) | (char << 6);
         state = transitionTable.codeUnitAt(state + type);
         if (state == accept) {
           buffer.writeCharCode(char);
           if (i == end) break loop;
           break multibyte;
         } else if (isErrorState(state)) {
           if (allowMalformed) {
             switch (state) {
               case errorInvalid:
               case errorUnexpectedExtension:
                 // A single byte that can't start a sequence.
                 buffer.writeCharCode(unicodeReplacementCharacterRune);
                 break;
               case errorMissingExtension:
                 // Unfinished sequence followed by a byte that can start a
                 // sequence.
                 buffer.writeCharCode(unicodeReplacementCharacterRune);
                 // Re-parse offending byte.
                 i -= 1;
                 break;
               default:
                 // Unfinished sequence followed by a byte that can't start a
                 // sequence.
                 buffer.writeCharCode(unicodeReplacementCharacterRune);
                 buffer.writeCharCode(unicodeReplacementCharacterRune);
                 break;
             }
             state = initial;
           } else {
             // Record the error state and the index of the offending byte.
             _state = state;
             _charOrIndex = i - 1;
             return "";
           }
         }
         if (i == end) break loop;
         byte = bytes[i++];
       }

       // Fast path for a run of ASCII bytes following an accepted character.
       // Reaching this point implies i < end, so the read below is in range.
       final int markStart = i;
       byte = bytes[i++];
       if (byte < 128) {
         int markEnd = end;
         while (i < end) {
           byte = bytes[i++];
           if (byte >= 128) {
             // `byte` now holds the first non-ASCII byte; the state machine
             // processes it on the next iteration of the outer loop.
             markEnd = i - 1;
             break;
           }
         }
         assert(markStart < markEnd);
         // Runs shorter than 20 bytes are written one character at a time;
         // longer runs are converted to a string in one call.
         if (markEnd - markStart < 20) {
           for (int m = markStart; m < markEnd; m++) {
             buffer.writeCharCode(bytes[m]);
           }
         } else {
           buffer.write(String.fromCharCodes(bytes, markStart, markEnd));
         }
         if (markEnd == end) break loop;
       }
       // Otherwise `byte` is non-ASCII and goes through the state machine.
     }

     if (single && state > afterBom) {
       // Unfinished sequence.
       if (allowMalformed) {
         buffer.writeCharCode(unicodeReplacementCharacterRune);
       } else {
         _state = errorUnfinished;
         _charOrIndex = end;
         return "";
       }
     }
     _state = state;
     _charOrIndex = char;
     return buffer.toString();
   }

   /// Copies `codeUnits[start..end)` into a new [Uint8List].
   ///
   /// Values with any bit set outside the low eight bits are stored as `0xFF`.
   static Uint8List _makeUint8List(List<int> codeUnits, int start, int end) {
     final int length = end - start;
     final Uint8List bytes = Uint8List(length);
     for (int i = 0; i < length; i++) {
       int b = codeUnits[start + i];
       if ((b & ~0xFF) != 0) {
         // Replace invalid byte values by FF, which is also invalid.
         b = 0xFF;
       }
       bytes[i] = b;
     }
     return bytes;
   }
 }