|  | // Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file | 
|  | // for details. All rights reserved. Use of this source code is governed by a | 
|  | // BSD-style license that can be found in the LICENSE file. | 
|  |  | 
|  | part of dart.utf; | 
|  |  | 
// Largest code point that is encoded with one, two and three bytes
// respectively. Anything above the three-byte maximum needs four bytes.
const int _UTF8_ONE_BYTE_MAX = 0x7F;
const int _UTF8_TWO_BYTE_MAX = 0x7FF;
const int _UTF8_THREE_BYTE_MAX = 0xFFFF;

// Selects the six payload bits carried by a continuation byte.
const int _UTF8_LO_SIX_BIT_MASK = 0x3F;

// Smallest lead byte of a sequence of the given total length. The decoder
// compares against these to work out how many continuation bytes follow; the
// encoder ORs the two-, three- and four-byte values into the lead byte.
// Five- and six-byte lead bytes are recognized by the decoder only so that
// the whole sequence can be consumed before it is rejected.
const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xC0;
const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xE0;
const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xF0;
const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xF8;
const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xFC;

// Payload bits available in the lead byte of a two-, three- and four-byte
// sequence.
const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1F;
const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xF;
const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;

// Exclusive upper bound for lead bytes: values at or above this never start
// a sequence.
const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xFE;
// Marker bits of a continuation byte; also the smallest continuation byte.
const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
|  |  | 
/**
 * Wraps [bytes] in an [IterableUtf8Decoder] so that the UTF-8 data is decoded
 * lazily: a consumer that stops iterating early only pays for the code points
 * it actually read.
 *
 * [offset] and [length] select the part of [bytes] to decode and are passed
 * through unchanged, as is [replacementCodepoint]. Pass `null` as
 * [replacementCodepoint] to get an [ArgumentError] on malformed input instead
 * of a substituted code point.
 */
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
|  |  | 
/**
 * Decodes a list of UTF-8 encoded bytes into a [String].
 *
 * [offset] and [length] select the part of [bytes] to decode and are handed
 * to [Utf8Decoder] unchanged. Malformed input is replaced by
 * [replacementCodepoint]; pass `null` for it to have an [ArgumentError]
 * thrown instead.
 */
String decodeUtf8(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder =
      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
|  |  | 
/**
 * Encodes [str] as a list of UTF-8 bytes.
 *
 * The string is first turned into code points with [stringToCodepoints],
 * which are then encoded by [codepointsToUtf8].
 */
List<int> encodeUtf8(String str) {
  List<int> codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}
|  |  | 
/**
 * Writes the [bytes] continuation bytes of a multi-byte sequence into
 * [buffer] and returns what is left of [value] for the lead byte.
 *
 * The slots `offset + 1` through `offset + bytes` are filled from last to
 * first, each receiving the next six low-order bits of [value] combined with
 * the continuation marker. The slot at [offset] itself is not written; the
 * caller stores the lead byte there using the returned high-order bits.
 */
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  for (int slot = offset + bytes; slot > offset; slot--) {
    int payload = value & _UTF8_LO_SIX_BIT_MASK;
    buffer[slot] = _UTF8_SUBSEQUENT_BYTE_BASE | payload;
    value >>= 6;
  }
  return value;
}
|  |  | 
/**
 * Encodes Unicode code points as UTF-8 bytes.
 *
 * [offset] and [length] select the part of [codepoints] to encode; both are
 * handed to [_ListRange] unchanged. Returns a new list holding the encoded
 * bytes.
 *
 * Values that cannot be represented in well-formed UTF-8 are encoded as
 * U+FFFD (the bytes 0xEF 0xBF 0xBD). This covers negative values, values
 * above [UNICODE_VALID_RANGE_MAX], and values in the UTF-16 surrogate range
 * [UNICODE_UTF16_RESERVED_LO] to [UNICODE_UTF16_RESERVED_HI]. The decoder in
 * this library rejects surrogates, so encoding them literally would produce
 * output it cannot read back.
 */
List<int> codepointsToUtf8(
    List<int> codepoints, [int offset = 0, int length]) {
  _ListRange source = new _ListRange(codepoints, offset, length);

  // Pass 1: compute the exact output size so the result is allocated once.
  int encodedLength = 0;
  for (int value in source) {
    encodedLength += _utf8EncodedLength(value);
  }

  // Pass 2: emit the bytes.
  List<int> encoded = new List<int>(encodedLength);
  int insertAt = 0;
  for (int value in source) {
    int byteCount = _utf8EncodedLength(value);
    if (!_isEncodableCodepoint(value)) {
      // U+FFFD REPLACEMENT CHARACTER.
      encoded[insertAt] = 0xef;
      encoded[insertAt + 1] = 0xbf;
      encoded[insertAt + 2] = 0xbd;
    } else if (byteCount == 1) {
      encoded[insertAt] = value;
    } else if (byteCount == 2) {
      // _addToEncoding fills in the continuation bytes and returns the bits
      // that belong in the lead byte.
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | (
          _UTF8_FIRST_BYTE_OF_TWO_MASK &
          _addToEncoding(insertAt, 1, value, encoded));
    } else if (byteCount == 3) {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | (
          _UTF8_FIRST_BYTE_OF_THREE_MASK &
          _addToEncoding(insertAt, 2, value, encoded));
    } else {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | (
          _UTF8_FIRST_BYTE_OF_FOUR_MASK &
          _addToEncoding(insertAt, 3, value, encoded));
    }
    insertAt += byteCount;
  }
  return encoded;
}

/**
 * Whether [value] can be encoded as itself: it is within
 * 0..[UNICODE_VALID_RANGE_MAX] and outside the UTF-16 surrogate range.
 */
bool _isEncodableCodepoint(int value) {
  if (value < 0 || value > UNICODE_VALID_RANGE_MAX) return false;
  return value < UNICODE_UTF16_RESERVED_LO ||
      value > UNICODE_UTF16_RESERVED_HI;
}

/**
 * Number of bytes [codepointsToUtf8] emits for [value]. Values that are
 * replaced by U+FFFD count as three bytes.
 */
int _utf8EncodedLength(int value) {
  if (!_isEncodableCodepoint(value)) return 3;
  if (value <= _UTF8_ONE_BYTE_MAX) return 1;
  if (value <= _UTF8_TWO_BYTE_MAX) return 2;
  if (value <= _UTF8_THREE_BYTE_MAX) return 3;
  return 4;
}
|  |  | 
/**
 * Decodes UTF-8 encoded bytes into a list of Unicode code points.
 *
 * [offset] and [length] select the part of [utf8EncodedBytes] to decode and
 * are handed to [Utf8Decoder] unchanged. Malformed input is replaced by
 * [replacementCodepoint]; pass `null` for it to have an [ArgumentError]
 * thrown instead.
 */
// UTF-8 fixes the byte order, so unlike the UTF-16 and UTF-32 decoders there
// is no byte-order handling to follow here.
List<int> utf8ToCodepoints(
    List<int> utf8EncodedBytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder = new Utf8Decoder(
      utf8EncodedBytes, offset, length, replacementCodepoint);
  return decoder.decodeRest();
}
|  |  | 
/**
 * The [Iterable] handed out by [decodeUtf8AsIterable].
 *
 * It stores only the input bytes and the decoding options. Every request for
 * an [iterator] builds a new [Utf8Decoder] over the same input, so bytes are
 * translated only as the iterator is advanced and nothing is cached between
 * iterations.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  /** The UTF-8 encoded input. */
  final List<int> bytes;

  /** Passed to [Utf8Decoder] as the start of the range to decode. */
  final int offset;

  /**
   * Passed to [Utf8Decoder] as the length of the range to decode.
   */
  // NOTE(review): this field shares its name with Iterable.length, so reading
  // `length` on an instance yields this constructor argument (null by
  // default), not the number of decoded code points -- confirm callers do not
  // rely on the Iterable meaning.
  final int length;

  /** Substituted for malformed input; null makes the decoder throw. */
  final int replacementCodepoint;

  IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Utf8Decoder get iterator {
    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  }
}
|  |  | 
/**
 * An [Iterator] over the Unicode code points encoded in a list of UTF-8
 * bytes.
 *
 * The constructor's `offset` and `length` arguments select the part of the
 * byte list to decode; they are handed to [_ListRange] unchanged.
 *
 * Malformed input yields [replacementCodepoint] in place of the bad sequence.
 * If [replacementCodepoint] is `null` an [ArgumentError] is thrown instead.
 * The following are treated as malformed:
 *
 * * negative values, stray continuation bytes and bytes of 0xFE or above;
 * * sequences that end early or are interrupted by a non-continuation byte;
 * * overlong encodings and five- or six-byte sequences;
 * * UTF-16 surrogate code points and values above [UNICODE_VALID_RANGE_MAX].
 *
 * When a sequence is interrupted by a byte that can itself start a sequence
 * (an ASCII byte or a lead byte), that byte is left unconsumed and is decoded
 * by the next call to [moveNext], so valid data following a damaged sequence
 * is not lost.
 */
class Utf8Decoder implements Iterator<int> {
  final _ListRangeIterator utf8EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;


  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decodes the remainder of the input and returns the code points as a
   * [List<int>] whose length equals the number of code points decoded.
   */
  List<int> decodeRest() {
    // Each code point consumes at least one byte, so the iterator's
    // `remaining` count is used as an upper bound for the result size.
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int count = 0;
    while (moveNext()) {
      codepoints[count++] = current;
    }
    if (count == codepoints.length) {
      return codepoints;
    }
    // Multi-byte sequences produced fewer code points than bytes; trim.
    List<int> truncCodepoints = new List<int>(count);
    for (int k = 0; k < count; k++) {
      truncCodepoints[k] = codepoints[k];
    }
    return truncCodepoints;
  }

  /** The code point produced by the last successful [moveNext], or null. */
  int get current => _current;

  /**
   * Decodes the next code point into [current].
   *
   * Returns false once the input is exhausted. Throws an [ArgumentError] on
   * malformed input when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;
    int additionalBytes = 0;

    // Classify the first byte. For lead bytes, subtracting the base strips
    // the length marker and leaves the payload bits in `value`.
    if (value < 0) {
      return _replaceOrThrow(0);
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation byte with no lead byte in front of it.
      return _replaceOrThrow(0);
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      // Five- and six-byte forms are consumed as a unit and then rejected by
      // the overlong check below, which only accepts up to four bytes.
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      return _replaceOrThrow(0);
    }

    // Fold in the continuation bytes; `j` counts how many were accepted.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      if (nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
        value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
      } else {
        // Not a continuation byte. Any non-negative value here is either an
        // ASCII byte or a lead-range byte, so step back and let the next
        // moveNext() decode it on its own instead of swallowing it together
        // with the broken sequence.
        if (nextValue >= 0) {
          utf8EncodedBytesIterator.backup();
        }
        break;
      }
      j++;
    }

    bool validSequence = (j == additionalBytes && (
        value < UNICODE_UTF16_RESERVED_LO ||
        value > UNICODE_UTF16_RESERVED_HI));
    // The value must need as many bytes as were used to encode it.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
    if (validSequence && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    return _replaceOrThrow(j);
  }

  /**
   * Handles malformed input: makes [replacementCodepoint] the current value
   * and returns true, or throws an [ArgumentError] when no replacement is
   * configured. [backtrack] is subtracted from the iterator position that is
   * reported in the error message.
   */
  bool _replaceOrThrow(int backtrack) {
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    }
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - backtrack}");
  }
}