| // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| part of dart.utf; |
| |
// Largest code points that fit in a one-, two- and three-byte UTF-8 sequence
// (7, 11 and 16 payload bits respectively).
const int _UTF8_ONE_BYTE_MAX = 0x7F;
const int _UTF8_TWO_BYTE_MAX = 0x7FF;
const int _UTF8_THREE_BYTE_MAX = 0xFFFF;

// Selects the six payload bits carried by a continuation byte.
const int _UTF8_LO_SIX_BIT_MASK = 0x3F;

// Marker bits of the lead byte of a two- to six-byte sequence:
// 110xxxxx, 1110xxxx, 11110xxx, 111110xx and 1111110x.
const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xC0;
const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xE0;
const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xF0;
const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xF8;
const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xFC;

// Select the payload bits of the lead byte of a two-, three- and four-byte
// sequence (5, 4 and 3 bits respectively).
const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1F;
const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0x0F;
const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x07;

// Exclusive upper bound for a lead byte: 0xFE and above never start a
// sequence.
const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xFE;
// Marker bits of a continuation byte (10xxxxxx).
const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
| |
/**
 * Wraps the UTF-8 encoded [bytes] in an [IterableUtf8Decoder], so that the
 * consumer decodes only as much of the input as it actually iterates over.
 *
 * [offset] and [length] select the part of [bytes] to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError for malformed input instead
 * of a replacement code point.
 */
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
| |
/**
 * Decodes a List of UTF-8 encoded [bytes] into a String.
 *
 * [offset] is the index of the first byte to decode and [length] limits how
 * many bytes are decoded. [replacementCodepoint] overrides the default
 * Unicode replacement character; pass null to get an ArgumentError for
 * malformed input instead of a replacement code point.
 */
String decodeUtf8(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder =
      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
| |
/**
 * Encodes [str] as a sequence of UTF-8 bytes.
 *
 * The string is first converted to code points, which are then encoded by
 * [codepointsToUtf8].
 */
List<int> encodeUtf8(String str) {
  var codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}
| |
/**
 * Writes the [bytes] continuation bytes of a multi-byte sequence whose lead
 * byte lives at [buffer] index [offset].
 *
 * The slots `offset + bytes` down to `offset + 1` are filled, last slot
 * first, each with the next six low-order bits of [value]. Returns what is
 * left of [value] after those bits have been shifted out; the caller stores
 * it in the lead byte.
 */
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  int remaining = value;
  for (int slot = offset + bytes; slot > offset; slot--) {
    buffer[slot] =
        _UTF8_SUBSEQUENT_BYTE_BASE | (remaining & _UTF8_LO_SIX_BIT_MASK);
    remaining >>= 6;
  }
  return remaining;
}
| |
/**
 * Encodes the given [codepoints] as UTF-8 code units.
 *
 * [offset] and [length] select the part of [codepoints] to encode. A value
 * that is negative or above UNICODE_VALID_RANGE_MAX is encoded as the three
 * bytes 0xef 0xbf 0xbd. The result is a fixed-length list sized exactly for
 * the encoded output.
 */
List<int> codepointsToUtf8(
    List<int> codepoints, [int offset = 0, int length]) {
  _ListRange source = new _ListRange(codepoints, offset, length);

  // First pass: count the bytes so the output list can be allocated once.
  int byteCount = 0;
  for (int codepoint in source) {
    if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) {
      // Out-of-range values are written as a three-byte substitute.
      byteCount += 3;
    } else {
      byteCount += codepoint <= _UTF8_ONE_BYTE_MAX
          ? 1
          : codepoint <= _UTF8_TWO_BYTE_MAX
              ? 2
              : codepoint <= _UTF8_THREE_BYTE_MAX ? 3 : 4;
    }
  }

  // Second pass: write the bytes.
  List<int> output = new List<int>(byteCount);
  int cursor = 0;
  for (int codepoint in source) {
    if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) {
      output.setRange(cursor, cursor + 3, [0xef, 0xbf, 0xbd]);
      cursor += 3;
      continue;
    }
    if (codepoint <= _UTF8_ONE_BYTE_MAX) {
      output[cursor++] = codepoint;
      continue;
    }

    // Multi-byte sequence: choose the number of continuation bytes together
    // with the marker and payload mask of the lead byte.
    int trailing;
    int leadBase;
    int leadMask;
    if (codepoint <= _UTF8_TWO_BYTE_MAX) {
      trailing = 1;
      leadBase = _UTF8_FIRST_BYTE_OF_TWO_BASE;
      leadMask = _UTF8_FIRST_BYTE_OF_TWO_MASK;
    } else if (codepoint <= _UTF8_THREE_BYTE_MAX) {
      trailing = 2;
      leadBase = _UTF8_FIRST_BYTE_OF_THREE_BASE;
      leadMask = _UTF8_FIRST_BYTE_OF_THREE_MASK;
    } else {
      trailing = 3;
      leadBase = _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      leadMask = _UTF8_FIRST_BYTE_OF_FOUR_MASK;
    }

    // The continuation bytes are written first; the bits left over go into
    // the lead byte.
    int leadBits = _addToEncoding(cursor, trailing, codepoint, output);
    output[cursor] = leadBase | (leadMask & leadBits);
    cursor += trailing + 1;
  }
  return output;
}
| |
// UTF-8 fixes the byte order itself, so unlike the UTF-16 and UTF-32
// decoders there is no byte-order handling here.
/**
 * Decodes [utf8EncodedBytes] into a list of code points.
 *
 * [offset] and [length] select the part of the input to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError for malformed input instead
 * of a replacement code point.
 */
List<int> utf8ToCodepoints(
    List<int> utf8EncodedBytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder = new Utf8Decoder(
      utf8EncodedBytes, offset, length, replacementCodepoint);
  return decoder.decodeRest();
}
| |
/**
 * The Iterable returned by [decodeUtf8AsIterable] and its variants.
 *
 * Every request for an [iterator] creates a fresh [Utf8Decoder] over the same
 * bytes, and that decoder translates bytes only as the caller advances it.
 * Results are not cached.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  /** The UTF-8 encoded input. */
  final List<int> bytes;

  /** Start of the range of [bytes] handed to the decoder. */
  final int offset;

  /**
   * Length of the range of [bytes] handed to the decoder; null by default.
   *
   * NOTE: this field overrides the `length` getter inherited from Iterable,
   * so it yields the constructor argument, not the number of decoded code
   * points.
   */
  final int length;

  /** Code point substituted for malformed input; see [Utf8Decoder]. */
  final int replacementCodepoint;

  IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Utf8Decoder get iterator {
    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  }
}
| |
/**
 * An iterator of Unicode code points decoded from UTF-8 encoded bytes.
 *
 * The constructor parameters can set an offset into the list of bytes, limit
 * the number of bytes to be decoded, and override the default Unicode
 * replacement code point. When [replacementCodepoint] is null, malformed
 * input raises an ArgumentError instead of being replaced.
 *
 * A sequence is treated as malformed when it is truncated, overlong, longer
 * than four bytes, encodes a value between UNICODE_UTF16_RESERVED_LO and
 * UNICODE_UTF16_RESERVED_HI, or encodes a value above
 * UNICODE_VALID_RANGE_MAX. Stray continuation bytes, negative values and
 * values of 0xfe and above are malformed as well.
 */
class Utf8Decoder implements Iterator<int> {
  final _ListRangeIterator utf8EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
          UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;

  /** Creates a decoder that reads from the iterator of [source]. */
  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
          UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decodes the remainder of the input of this decoder into a [List<int>]
   * of code points.
   */
  List<int> decodeRest() {
    // Every decoded code point consumes at least one input unit, so the
    // number of remaining units is an upper bound for the result length.
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int i = 0;
    while (moveNext()) {
      codepoints[i++] = current;
    }
    if (i == codepoints.length) {
      return codepoints;
    } else {
      // Multi-byte sequences produced fewer code points than input units;
      // copy into a list of the exact size.
      List<int> truncCodepoints = new List<int>(i);
      truncCodepoints.setRange(0, i, codepoints);
      return truncCodepoints;
    }
  }

  int get current => _current;

  /**
   * Decodes the next code point and makes it available through [current].
   *
   * Returns false once the input is exhausted. For malformed input,
   * [current] becomes [replacementCodepoint], or an ArgumentError is thrown
   * when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;
    int additionalBytes = 0;

    if (value < 0) {
      return _replaceOrThrow();
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation byte cannot start a sequence.
      return _replaceOrThrow();
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      // Five- and six-byte forms are consumed as a unit here and rejected
      // by the checks further down.
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      return _replaceOrThrow();
    }

    // Fold in the continuation bytes; j counts how many were accepted.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      bool isContinuation = nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE;
      if (!isContinuation) {
        // The sequence is cut short. Whatever interrupted it (an ASCII
        // byte, a new lead byte or an invalid unit) is not part of it, so
        // hand it back to be decoded by the next call. Swallowing it here
        // would silently drop a possibly valid character from the output.
        utf8EncodedBytesIterator.backup();
        break;
      }
      value = (value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK);
      j++;
    }

    bool validSequence = (j == additionalBytes && (
        value < UNICODE_UTF16_RESERVED_LO ||
        value > UNICODE_UTF16_RESERVED_HI));
    // Only two- to four-byte sequences can pass; each must encode a value
    // too large for the next shorter form.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
    if (validSequence && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    return _replaceOrThrow(j);
  }

  /**
   * Reports malformed input: sets [current] to [replacementCodepoint] and
   * returns true, or throws an ArgumentError when [replacementCodepoint] is
   * null. [backtrack] is subtracted from the iterator position in the error
   * message.
   */
  bool _replaceOrThrow([int backtrack = 0]) {
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    }
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - backtrack}");
  }
}