| // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| part of dart.convert; |
| |
| /// The Unicode Replacement character `U+FFFD` (�). |
| const int unicodeReplacementCharacterRune = 0xFFFD; |
| |
| /// The Unicode Byte Order Marker (BOM) character `U+FEFF`. |
| const int unicodeBomCharacterRune = 0xFEFF; |
| |
| /// An instance of the default implementation of the [Utf8Codec]. |
| /// |
| /// This instance provides a convenient access to the most common UTF-8 |
| /// use cases. |
| /// |
| /// Examples: |
| /// |
| /// var encoded = utf8.encode("Îñţérñåţîöñåļîžåţîờñ"); |
| /// var decoded = utf8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, |
| /// 0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]); |
| const Utf8Codec utf8 = Utf8Codec(); |
| |
| /// A [Utf8Codec] encodes strings to utf-8 code units (bytes) and decodes |
| /// UTF-8 code units to strings. |
| class Utf8Codec extends Encoding { |
| final bool _allowMalformed; |
| |
| /// Instantiates a new [Utf8Codec]. |
| /// |
| /// The optional [allowMalformed] argument defines how [decoder] (and [decode]) |
| /// deal with invalid or unterminated character sequences. |
| /// |
| /// If it is `true` (and not overridden at the method invocation) [decode] and |
| /// the [decoder] replace invalid (or unterminated) octet |
| /// sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
| /// they throw a [FormatException]. |
| const Utf8Codec({bool allowMalformed = false}) |
| : _allowMalformed = allowMalformed; |
| |
| /// The name of this codec, "utf-8". |
| String get name => "utf-8"; |
| |
| /// Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| /// corresponding string. |
| /// |
| /// If the [codeUnits] start with the encoding of a |
| /// [unicodeBomCharacterRune], that character is discarded. |
| /// |
| /// If [allowMalformed] is `true` the decoder replaces invalid (or |
| /// unterminated) character sequences with the Unicode Replacement character |
| /// `U+FFFD` (�). Otherwise it throws a [FormatException]. |
| /// |
| /// If [allowMalformed] is not given, it defaults to the `allowMalformed` that |
| /// was used to instantiate `this`. |
| String decode(List<int> codeUnits, {bool allowMalformed}) { |
| allowMalformed ??= _allowMalformed; |
| return Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); |
| } |
| |
| Utf8Encoder get encoder => const Utf8Encoder(); |
| Utf8Decoder get decoder { |
| return Utf8Decoder(allowMalformed: _allowMalformed); |
| } |
| } |
| |
| /// This class converts strings to their UTF-8 code units (a list of |
| /// unsigned 8-bit integers). |
| class Utf8Encoder extends Converter<String, List<int>> { |
| const Utf8Encoder(); |
| |
| /// Converts [string] to its UTF-8 code units (a list of |
| /// unsigned 8-bit integers). |
| /// |
| /// If [start] and [end] are provided, only the substring |
| /// `string.substring(start, end)` is converted. |
| List<int> convert(String string, [int start = 0, int end]) { |
| var stringLength = string.length; |
| end = RangeError.checkValidRange(start, end, stringLength); |
| var length = end - start; |
| if (length == 0) return Uint8List(0); |
| // Create a new encoder with a length that is guaranteed to be big enough. |
| // A single code unit uses at most 3 bytes, a surrogate pair at most 4. |
| var encoder = _Utf8Encoder.withBufferSize(length * 3); |
| var endPosition = encoder._fillBuffer(string, start, end); |
| assert(endPosition >= end - 1); |
| if (endPosition != end) { |
| // Encoding skipped the last code unit. |
| // That can only happen if the last code unit is a leadsurrogate. |
| // Force encoding of the lead surrogate by itself. |
| var lastCodeUnit = string.codeUnitAt(end - 1); |
| assert(_isLeadSurrogate(lastCodeUnit)); |
| // We use a non-surrogate as `nextUnit` so that _writeSurrogate just |
| // writes the lead-surrogate. |
| var wasCombined = encoder._writeSurrogate(lastCodeUnit, 0); |
| assert(!wasCombined); |
| } |
| return encoder._buffer.sublist(0, encoder._bufferIndex); |
| } |
| |
| /// Starts a chunked conversion. |
| /// |
| /// The converter works more efficiently if the given [sink] is a |
| /// [ByteConversionSink]. |
| StringConversionSink startChunkedConversion(Sink<List<int>> sink) { |
| return _Utf8EncoderSink( |
| sink is ByteConversionSink ? sink : ByteConversionSink.from(sink)); |
| } |
| |
| // Override the base-classes bind, to provide a better type. |
| Stream<List<int>> bind(Stream<String> stream) => super.bind(stream); |
| } |
| |
| /// This class encodes Strings to UTF-8 code units (unsigned 8 bit integers). |
| // TODO(floitsch): make this class public. |
| class _Utf8Encoder { |
| int _carry = 0; |
| int _bufferIndex = 0; |
| final List<int> _buffer; |
| |
| static const _DEFAULT_BYTE_BUFFER_SIZE = 1024; |
| |
| _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE); |
| |
| _Utf8Encoder.withBufferSize(int bufferSize) |
| : _buffer = _createBuffer(bufferSize); |
| |
| /// Allow an implementation to pick the most efficient way of storing bytes. |
| static List<int> _createBuffer(int size) => Uint8List(size); |
| |
| /// Tries to combine the given [leadingSurrogate] with the [nextCodeUnit] and |
| /// writes it to [_buffer]. |
| /// |
| /// Returns true if the [nextCodeUnit] was combined with the |
| /// [leadingSurrogate]. If it wasn't then nextCodeUnit was not a trailing |
| /// surrogate and has not been written yet. |
| /// |
| /// It is safe to pass 0 for [nextCodeUnit] in which case only the leading |
| /// surrogate is written. |
| bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) { |
| if (_isTailSurrogate(nextCodeUnit)) { |
| var rune = _combineSurrogatePair(leadingSurrogate, nextCodeUnit); |
| // If the rune is encoded with 2 code-units then it must be encoded |
| // with 4 bytes in UTF-8. |
| assert(rune > _THREE_BYTE_LIMIT); |
| assert(rune <= _FOUR_BYTE_LIMIT); |
| _buffer[_bufferIndex++] = 0xF0 | (rune >> 18); |
| _buffer[_bufferIndex++] = 0x80 | ((rune >> 12) & 0x3f); |
| _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f); |
| _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f); |
| return true; |
| } else { |
| // TODO(floitsch): allow to throw on malformed strings. |
| // Encode the half-surrogate directly into UTF-8. This yields |
| // invalid UTF-8, but we started out with invalid UTF-16. |
| |
| // Surrogates are always encoded in 3 bytes in UTF-8. |
| _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12); |
| _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f); |
| _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f); |
| return false; |
| } |
| } |
| |
| /// Fills the [_buffer] with as many characters as possible. |
| /// |
| /// Does not encode any trailing lead-surrogate. This must be done by the |
| /// caller. |
| /// |
| /// Returns the position in the string. The returned index points to the |
| /// first code unit that hasn't been encoded. |
| int _fillBuffer(String str, int start, int end) { |
| if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) { |
| // Don't handle a trailing lead-surrogate in this loop. The caller has |
| // to deal with those. |
| end--; |
| } |
| int stringIndex; |
| for (stringIndex = start; stringIndex < end; stringIndex++) { |
| var codeUnit = str.codeUnitAt(stringIndex); |
| // ASCII has the same representation in UTF-8 and UTF-16. |
| if (codeUnit <= _ONE_BYTE_LIMIT) { |
| if (_bufferIndex >= _buffer.length) break; |
| _buffer[_bufferIndex++] = codeUnit; |
| } else if (_isLeadSurrogate(codeUnit)) { |
| if (_bufferIndex + 3 >= _buffer.length) break; |
| // Note that it is safe to read the next code unit. We decremented |
| // [end] above when the last valid code unit was a leading surrogate. |
| var nextCodeUnit = str.codeUnitAt(stringIndex + 1); |
| var wasCombined = _writeSurrogate(codeUnit, nextCodeUnit); |
| if (wasCombined) stringIndex++; |
| } else { |
| var rune = codeUnit; |
| if (rune <= _TWO_BYTE_LIMIT) { |
| if (_bufferIndex + 1 >= _buffer.length) break; |
| _buffer[_bufferIndex++] = 0xC0 | (rune >> 6); |
| _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f); |
| } else { |
| assert(rune <= _THREE_BYTE_LIMIT); |
| if (_bufferIndex + 2 >= _buffer.length) break; |
| _buffer[_bufferIndex++] = 0xE0 | (rune >> 12); |
| _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f); |
| _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f); |
| } |
| } |
| } |
| return stringIndex; |
| } |
| } |
| |
| /// This class encodes chunked strings to UTF-8 code units (unsigned 8-bit |
| /// integers). |
| class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { |
| final ByteConversionSink _sink; |
| |
| _Utf8EncoderSink(this._sink); |
| |
| void close() { |
| if (_carry != 0) { |
| // addSlice will call close again, but then the carry must be equal to 0. |
| addSlice("", 0, 0, true); |
| return; |
| } |
| _sink.close(); |
| } |
| |
| void addSlice(String str, int start, int end, bool isLast) { |
| _bufferIndex = 0; |
| |
| if (start == end && !isLast) { |
| return; |
| } |
| |
| if (_carry != 0) { |
| var nextCodeUnit = 0; |
| if (start != end) { |
| nextCodeUnit = str.codeUnitAt(start); |
| } else { |
| assert(isLast); |
| } |
| var wasCombined = _writeSurrogate(_carry, nextCodeUnit); |
| // Either we got a non-empty string, or we must not have been combined. |
| assert(!wasCombined || start != end); |
| if (wasCombined) start++; |
| _carry = 0; |
| } |
| do { |
| start = _fillBuffer(str, start, end); |
| var isLastSlice = isLast && (start == end); |
| if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { |
| if (isLast && _bufferIndex < _buffer.length - 3) { |
| // There is still space for the last incomplete surrogate. |
| // We use a non-surrogate as second argument. This way the |
| // function will just add the surrogate-half to the buffer. |
| var hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0); |
| assert(!hasBeenCombined); |
| } else { |
| // Otherwise store it in the carry. If isLast is true, then |
| // close will flush the last carry. |
| _carry = str.codeUnitAt(start); |
| } |
| start++; |
| } |
| _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice); |
| _bufferIndex = 0; |
| } while (start < end); |
| if (isLast) close(); |
| } |
| |
| // TODO(floitsch): implement asUtf8Sink. Sligthly complicated because it |
| // needs to deal with malformed input. |
| } |
| |
| /// This class converts UTF-8 code units (lists of unsigned 8-bit integers) |
| /// to a string. |
| class Utf8Decoder extends Converter<List<int>, String> { |
| final bool _allowMalformed; |
| |
| /// Instantiates a new [Utf8Decoder]. |
| /// |
| /// The optional [allowMalformed] argument defines how [convert] deals |
| /// with invalid or unterminated character sequences. |
| /// |
| /// If it is `true` [convert] replaces invalid (or unterminated) character |
| /// sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
| /// it throws a [FormatException]. |
| const Utf8Decoder({bool allowMalformed = false}) |
| : _allowMalformed = allowMalformed; |
| |
| /// Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| /// corresponding string. |
| /// |
| /// Uses the code units from [start] to, but no including, [end]. |
| /// If [end] is omitted, it defaults to `codeUnits.length`. |
| /// |
| /// If the [codeUnits] start with the encoding of a |
| /// [unicodeBomCharacterRune], that character is discarded. |
| String convert(List<int> codeUnits, [int start = 0, int end]) { |
| // Allow the implementation to intercept and specialize based on the type |
| // of codeUnits. |
| var result = _convertIntercepted(_allowMalformed, codeUnits, start, end); |
| if (result != null) { |
| return result; |
| } |
| |
| var length = codeUnits.length; |
| end = RangeError.checkValidRange(start, end, length); |
| |
| // Fast case for ASCII strings avoids StringBuffer/_Utf8Decoder. |
| int oneBytes = _scanOneByteCharacters(codeUnits, start, end); |
| StringBuffer buffer; |
| bool isFirstCharacter = true; |
| if (oneBytes > 0) { |
| var firstPart = String.fromCharCodes(codeUnits, start, start + oneBytes); |
| start += oneBytes; |
| if (start == end) { |
| return firstPart; |
| } |
| buffer = StringBuffer(firstPart); |
| isFirstCharacter = false; |
| } |
| |
| buffer ??= StringBuffer(); |
| var decoder = _Utf8Decoder(buffer, _allowMalformed); |
| decoder._isFirstCharacter = isFirstCharacter; |
| decoder.convert(codeUnits, start, end); |
| decoder.flush(codeUnits, end); |
| return buffer.toString(); |
| } |
| |
| /// Starts a chunked conversion. |
| /// |
| /// The converter works more efficiently if the given [sink] is a |
| /// [StringConversionSink]. |
| ByteConversionSink startChunkedConversion(Sink<String> sink) { |
| StringConversionSink stringSink; |
| if (sink is StringConversionSink) { |
| stringSink = sink; |
| } else { |
| stringSink = StringConversionSink.from(sink); |
| } |
| return stringSink.asUtf8Sink(_allowMalformed); |
| } |
| |
| // Override the base-classes bind, to provide a better type. |
| Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); |
| |
| external Converter<List<int>, T> fuse<T>(Converter<String, T> next); |
| |
| external static String _convertIntercepted( |
| bool allowMalformed, List<int> codeUnits, int start, int end); |
| } |
| |
| // UTF-8 constants. |
| const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
| const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
| const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
| const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
| |
| // UTF-16 constants. |
| const int _SURROGATE_TAG_MASK = 0xFC00; |
| const int _SURROGATE_VALUE_MASK = 0x3FF; |
| const int _LEAD_SURROGATE_MIN = 0xD800; |
| const int _TAIL_SURROGATE_MIN = 0xDC00; |
| |
| bool _isLeadSurrogate(int codeUnit) => |
| (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; |
| bool _isTailSurrogate(int codeUnit) => |
| (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; |
| int _combineSurrogatePair(int lead, int tail) => |
| 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) | |
| (tail & _SURROGATE_VALUE_MASK); |
| |
| /// Decodes UTF-8. |
| /// |
| /// The decoder handles chunked input. |
| // TODO(floitsch): make this class public. |
| class _Utf8Decoder { |
| final bool _allowMalformed; |
| final StringSink _stringSink; |
| bool _isFirstCharacter = true; |
| int _value = 0; |
| int _expectedUnits = 0; |
| int _extraUnits = 0; |
| |
| _Utf8Decoder(this._stringSink, this._allowMalformed); |
| |
| bool get hasPartialInput => _expectedUnits > 0; |
| |
| // Limits of one through four byte encodings. |
| static const List<int> _LIMITS = <int>[ |
| _ONE_BYTE_LIMIT, |
| _TWO_BYTE_LIMIT, |
| _THREE_BYTE_LIMIT, |
| _FOUR_BYTE_LIMIT |
| ]; |
| |
| void close() { |
| flush(); |
| } |
| |
| /// Flushes this decoder as if closed. |
| /// |
| /// This method throws if the input was partial and the decoder was |
| /// constructed with `allowMalformed` set to `false`. |
| /// |
| /// The [source] and [offset] of the current position may be provided, |
| /// and are included in the exception if one is thrown. |
| void flush([List<int> source, int offset]) { |
| if (hasPartialInput) { |
| if (!_allowMalformed) { |
| throw FormatException( |
| "Unfinished UTF-8 octet sequence", source, offset); |
| } |
| _stringSink.writeCharCode(unicodeReplacementCharacterRune); |
| _value = 0; |
| _expectedUnits = 0; |
| _extraUnits = 0; |
| } |
| } |
| |
| void convert(List<int> codeUnits, int startIndex, int endIndex) { |
| var value = _value; |
| var expectedUnits = _expectedUnits; |
| var extraUnits = _extraUnits; |
| _value = 0; |
| _expectedUnits = 0; |
| _extraUnits = 0; |
| |
| var i = startIndex; |
| loop: |
| while (true) { |
| multibyte: |
| if (expectedUnits > 0) { |
| do { |
| if (i == endIndex) { |
| break loop; |
| } |
| var unit = codeUnits[i]; |
| if ((unit & 0xC0) != 0x80) { |
| expectedUnits = 0; |
| if (!_allowMalformed) { |
| throw FormatException( |
| "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", |
| codeUnits, |
| i); |
| } |
| _isFirstCharacter = false; |
| _stringSink.writeCharCode(unicodeReplacementCharacterRune); |
| break multibyte; |
| } else { |
| value = (value << 6) | (unit & 0x3f); |
| expectedUnits--; |
| i++; |
| } |
| } while (expectedUnits > 0); |
| if (value <= _LIMITS[extraUnits - 1]) { |
| // Overly long encoding. The value could be encoded with a shorter |
| // encoding. |
| if (!_allowMalformed) { |
| throw FormatException( |
| "Overlong encoding of 0x${value.toRadixString(16)}", |
| codeUnits, |
| i - extraUnits - 1); |
| } |
| expectedUnits = extraUnits = 0; |
| value = unicodeReplacementCharacterRune; |
| } |
| if (value > _FOUR_BYTE_LIMIT) { |
| if (!_allowMalformed) { |
| throw FormatException( |
| "Character outside valid Unicode range: " |
| "0x${value.toRadixString(16)}", |
| codeUnits, |
| i - extraUnits - 1); |
| } |
| value = unicodeReplacementCharacterRune; |
| } |
| if (!_isFirstCharacter || value != unicodeBomCharacterRune) { |
| _stringSink.writeCharCode(value); |
| } |
| _isFirstCharacter = false; |
| } |
| |
| while (i < endIndex) { |
| var oneBytes = _scanOneByteCharacters(codeUnits, i, endIndex); |
| if (oneBytes > 0) { |
| _isFirstCharacter = false; |
| assert(i + oneBytes <= endIndex); |
| _stringSink.write(String.fromCharCodes(codeUnits, i, i + oneBytes)); |
| |
| i += oneBytes; |
| if (i == endIndex) break; |
| } |
| var unit = codeUnits[i++]; |
| // TODO(floitsch): the way we test we could potentially allow |
| // units that are too large, if they happen to have the |
| // right bit-pattern. (Same is true for the multibyte loop above). |
| // TODO(floitsch): optimize this loop. See: |
| // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 |
| if (unit < 0) { |
| // TODO(floitsch): should this be unit <= 0 ? |
| if (!_allowMalformed) { |
| throw FormatException( |
| "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}", |
| codeUnits, |
| i - 1); |
| } |
| _stringSink.writeCharCode(unicodeReplacementCharacterRune); |
| } else { |
| assert(unit > _ONE_BYTE_LIMIT); |
| if ((unit & 0xE0) == 0xC0) { |
| value = unit & 0x1F; |
| expectedUnits = extraUnits = 1; |
| continue loop; |
| } |
| if ((unit & 0xF0) == 0xE0) { |
| value = unit & 0x0F; |
| expectedUnits = extraUnits = 2; |
| continue loop; |
| } |
| // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
| if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
| value = unit & 0x07; |
| expectedUnits = extraUnits = 3; |
| continue loop; |
| } |
| if (!_allowMalformed) { |
| throw FormatException( |
| "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", |
| codeUnits, |
| i - 1); |
| } |
| value = unicodeReplacementCharacterRune; |
| expectedUnits = extraUnits = 0; |
| _isFirstCharacter = false; |
| _stringSink.writeCharCode(value); |
| } |
| } |
| break loop; |
| } |
| if (expectedUnits > 0) { |
| _value = value; |
| _expectedUnits = expectedUnits; |
| _extraUnits = extraUnits; |
| } |
| } |
| } |
| |
| // Returns the number of bytes in [units] starting at offset [from] which have |
| // the leftmost bit set to 0. |
| // |
| // To increase performance of this critical method we have a special variant of |
| // it implemented in the VM's patch files, which is why we make it external. |
| external int _scanOneByteCharacters(List<int> units, int from, int endIndex); |