blob: 43a294429139a408ab0b12ecd42d9997aae444c6 [file] [log] [blame]
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
/**
* Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and
* UTF-32.
*/
library dart.utf;
import "dart:async";
import "dart:collection";
part "utf_stream.dart";
part "utf8.dart";
part "utf16.dart";
part "utf32.dart";
// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints that make up [str].
 *
 * [String.codeUnits] yields 16-bit UTF-16 code units on every Dart
 * implementation, so the units are decoded into codepoints here rather than
 * being returned as-is.
 */
List<int> stringToCodepoints(String str) =>
    _utf16CodeUnitsToCodepoints(str.codeUnits);
/**
 * Builds a string out of the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function simply
 * forwards to it.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
/**
* Invalid codepoints or encodings may be substituted with the value U+fffd.
*/
const int UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xfffd;
/** The byte order mark (BOM) codepoint, U+feff. */
const int UNICODE_BOM = 0xfeff;
/** The low-order byte (0xff) of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_LO = 0xff;
/** The high-order byte (0xfe) of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_HI = 0xfe;
/** Mask selecting the least significant byte (bits 0-7) of a value. */
const int UNICODE_BYTE_ZERO_MASK = 0xff;
/** Mask selecting the second least significant byte (bits 8-15) of a value. */
const int UNICODE_BYTE_ONE_MASK = 0xff00;
/** The largest valid Unicode codepoint, U+10ffff. */
const int UNICODE_VALID_RANGE_MAX = 0x10ffff;
/**
* The largest codepoint that fits in a single 16-bit code unit. Codepoints
* above this value are encoded as a surrogate pair.
*/
const int UNICODE_PLANE_ONE_MAX = 0xffff;
/** First value of the range reserved for UTF-16 surrogates (inclusive). */
const int UNICODE_UTF16_RESERVED_LO = 0xd800;
/** Last value of the range reserved for UTF-16 surrogates (inclusive). */
const int UNICODE_UTF16_RESERVED_HI = 0xdfff;
/**
* Value subtracted from a codepoint above [UNICODE_PLANE_ONE_MAX] before it is
* split into a surrogate pair, and added back when a pair is merged.
*/
const int UNICODE_UTF16_OFFSET = 0x10000;
/** Base value of the first (high) code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xd800;
/** Base value of the second (low) code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xdc00;
/**
* Mask selecting the upper 10 bits of the 20-bit offset value; these bits go
* into the first code unit of a surrogate pair.
*/
const int UNICODE_UTF16_HI_MASK = 0xffc00;
/**
* Mask selecting the lower 10 bits of the 20-bit offset value; these bits go
* into the second code unit of a surrogate pair.
*/
const int UNICODE_UTF16_LO_MASK = 0x3ff;
/**
 * Encodes a range of Unicode codepoints as UTF-16 code units.
 *
 * The range starts at [offset] within [codepoints] and spans [length] items
 * (through the end of the list when [length] is omitted). Codepoints outside
 * the surrogate range and no larger than [UNICODE_PLANE_ONE_MAX] are emitted
 * as a single code unit; larger codepoints up to [UNICODE_VALID_RANGE_MAX]
 * are emitted as a surrogate pair. Anything else (negative values, values in
 * the surrogate range, values beyond the valid range) is replaced by
 * [replacementCodepoint], or causes an [ArgumentError] when
 * [replacementCodepoint] is null.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(codepoints, offset, length);

  // First pass: work out how many code units the output needs so that a
  // fixed-size list can be allocated up front.
  int unitCount = 0;
  for (int codepoint in range) {
    bool fitsOneUnit =
        (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) ||
        (codepoint > UNICODE_UTF16_RESERVED_HI &&
            codepoint <= UNICODE_PLANE_ONE_MAX);
    bool needsPair = !fitsOneUnit &&
        codepoint > UNICODE_PLANE_ONE_MAX &&
        codepoint <= UNICODE_VALID_RANGE_MAX;
    // Invalid values occupy one slot as well (the replacement codepoint).
    unitCount += needsPair ? 2 : 1;
  }

  // Second pass: emit the code units.
  List<int> units = new List<int>(unitCount);
  int next = 0;
  for (int codepoint in range) {
    bool fitsOneUnit =
        (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) ||
        (codepoint > UNICODE_UTF16_RESERVED_HI &&
            codepoint <= UNICODE_PLANE_ONE_MAX);
    if (fitsOneUnit) {
      units[next++] = codepoint;
      continue;
    }

    if (codepoint > UNICODE_PLANE_ONE_MAX &&
        codepoint <= UNICODE_VALID_RANGE_MAX) {
      // Split the 20-bit offset value into two 10-bit halves.
      int base = codepoint - UNICODE_UTF16_OFFSET;
      int highBits = (base & UNICODE_UTF16_HI_MASK) >> 10;
      int lowBits = base & UNICODE_UTF16_LO_MASK;
      units[next++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + highBits;
      units[next++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + lowBits;
      continue;
    }

    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[next++] = replacementCodepoint;
  }
  return units;
}
/**
 * Decodes a range of UTF-16 code units into Unicode codepoints.
 *
 * The range starts at [offset] within [utf16CodeUnits] and spans [length]
 * items (through the end of the list when [length] is omitted). Decoding is
 * delegated to [Utf16CodeUnitDecoder], which receives [replacementCodepoint].
 * The returned list is sized to exactly the number of decoded codepoints.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRangeIterator units =
      (new _ListRange(utf16CodeUnits, offset, length)).iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Allocate one slot per remaining code unit; this is an upper bound on the
  // number of codepoints, since a surrogate pair collapses into one.
  List<int> decoded = new List<int>(units.remaining);
  int count = 0;
  for (; decoder.moveNext(); count++) {
    decoded[count] = decoder.current;
  }

  if (count == decoded.length) return decoded;

  // Fewer codepoints than code units: copy into a right-sized list.
  List<int> trimmed = new List<int>(count);
  trimmed.setRange(0, count, decoded);
  return trimmed;
}
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 * The parameters can override the default Unicode replacement character. Set
 * the replacementCharacter to null to throw an ArgumentError
 * rather than replace the bad value.
 *
 * A code unit that cannot be decoded (a negative value, a value above
 * [UNICODE_PLANE_ONE_MAX], a low surrogate with no preceding high surrogate,
 * or a high surrogate that is not followed by a low surrogate) produces
 * exactly one replacement codepoint. The code unit following an unpaired high
 * surrogate is never discarded: it is decoded on the next call to [moveNext].
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  /** The source of UTF-16 code units being decoded. */
  final _ListRangeIterator utf16CodeUnitIterator;

  /**
   * The codepoint emitted in place of undecodable input, or null to throw an
   * [ArgumentError] instead.
   */
  final int replacementCodepoint;

  int _current = null;

  /**
   * Creates a decoder over [length] code units of [utf16CodeUnits] starting
   * at [offset] (through the end of the list when [length] is omitted).
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
          UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /** Creates a decoder that reads from an existing list-range iterator. */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  /**
   * The most recently decoded codepoint, or null before the first successful
   * [moveNext] and after the input is exhausted.
   */
  int get current => _current;

  /**
   * Decodes the next codepoint. Returns false when the input is exhausted.
   *
   * Throws an [ArgumentError] on undecodable input when
   * [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;
    if (value < 0) {
      _current = _replacementOrThrow();
    } else if (value < UNICODE_UTF16_RESERVED_LO ||
        (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
      // Not a surrogate: the code unit is the codepoint.
      _current = value;
    } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.moveNext()) {
      // High surrogate with at least one more code unit available.
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // Valid pair: merge the two 10-bit halves.
        _current = UNICODE_UTF16_OFFSET +
            ((value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10) +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
      } else {
        // Unpaired high surrogate. Un-read the following code unit whatever
        // it is, so that it is decoded by the next call instead of being
        // silently dropped; only the high surrogate itself is replaced.
        utf16CodeUnitIterator.backup();
        _current = _replacementOrThrow();
      }
    } else {
      // Lone low surrogate, high surrogate at end of input, or a value that
      // does not fit in 16 bits.
      _current = _replacementOrThrow();
    }
    return true;
  }

  /**
   * Returns [replacementCodepoint], or throws an [ArgumentError] naming the
   * iterator's current position when no replacement is configured.
   */
  int _replacementOrThrow() {
    if (replacementCodepoint != null) return replacementCodepoint;
    throw new ArgumentError(
        "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
  }
}
/**
 * _ListRange is an internal type used to create a lightweight Iterable on a
 * range within a source list. DO NOT MODIFY the underlying list while
 * iterating over it. The results of doing so are undefined.
 *
 * The range begins at the given offset and spans the given length; when no
 * length is supplied it extends to the end of the source list. The
 * constructor throws a [RangeError] if the offset is negative or beyond the
 * end of the list, if the length is negative, or if the range runs past the
 * end of the list.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class _ListRange extends IterableBase {
  final List _source;
  final int _offset;
  final int _length;

  _ListRange(source, [offset = 0, length]) :
      this._source = source,
      this._offset = offset,
      this._length = (length == null ? source.length - offset : length) {
    final int available = _source.length;
    if (_offset < 0 || available < _offset) {
      throw new RangeError.value(_offset);
    }
    // _length is always set by the initializer list, so only its sign needs
    // checking here.
    if (_length < 0) {
      throw new RangeError.value(_length);
    }
    final int end = _length + _offset;
    if (end > available) {
      throw new RangeError.value(end);
    }
  }

  /** A fresh iterator positioned just before the first item of the range. */
  _ListRangeIterator get iterator {
    return new _ListRangeIteratorImpl(_source, _offset, _offset + _length);
  }

  /** The number of items in the range. */
  int get length {
    return _length;
  }
}
/**
 * An iterator over a list range that offers more than the standard
 * [Iterator] protocol: callers can query the current position, ask how many
 * items remain, and move backward or forward within the range.
 */
abstract class _ListRangeIterator implements Iterator<int> {
  // Standard Iterator protocol.
  int get current;
  bool moveNext();

  /** The iterator's current position. */
  int get position;

  /** The count of items remaining after the current position. */
  int get remaining;

  /** Moves the iterator backward by [by] items. */
  void backup([by]);

  /** Moves the iterator forward by [count] items. */
  void skip([count]);
}
/**
 * A [_ListRangeIterator] backed directly by an index into the source list.
 *
 * The index starts one before the first item of the range, so the first call
 * to [moveNext] lands on the first item. None of the members bounds-check the
 * index: [backup] and [skip] adjust it blindly, and [current] reads whatever
 * the index points at.
 */
class _ListRangeIteratorImpl implements _ListRangeIterator {
  final List<int> _source;
  int _offset;
  final int _end;

  _ListRangeIteratorImpl(this._source, int offset, this._end)
      : _offset = offset - 1;

  /** The source item at the current index. */
  int get current {
    return _source[_offset];
  }

  /** Advances by one; returns whether the index is still before the end. */
  bool moveNext() {
    _offset += 1;
    return _offset < _end;
  }

  /** The current index into the source list. */
  int get position {
    return _offset;
  }

  /** Moves the index back by [by] items (one by default). */
  void backup([int by = 1]) {
    _offset = _offset - by;
  }

  /** The number of items between the current index and the end. */
  int get remaining {
    return _end - (_offset + 1);
  }

  /** Moves the index forward by [count] items (one by default). */
  void skip([int count = 1]) {
    _offset = _offset + count;
  }
}