blob: a1cedcece985c193b05a446ef78147a3bfc7d212 [file] [log] [blame]
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode code points of [str], in order.
 */
List<int> stringToCodepoints(String str) {
  // str.charCodes yields 16-bit code units on every Dart implementation, so
  // they have to be decoded into code points here. The reverse direction is
  // different: "new String.fromCharCodes" accepts code points on the VM but
  // not on dart2js (dartbug.com/1357).
  final List<int> codeUnits = str.charCodes;
  return _utf16CodeUnitsToCodepoints(codeUnits);
}
/**
 * Builds a string from the Unicode code points in [codepoints].
 */
String codepointsToString(List<int> codepoints) {
  // TODO _is16BitCodeUnit() is used to work around a bug with dart2js
  // (http://code.google.com/p/dart/issues/detail?id=1357). Consider
  // removing after this issue is resolved.
  //
  // When the platform only understands 16-bit code units, hand it UTF-16
  // code units; otherwise pass the code points through untouched.
  final List<int> charCodes = _is16BitCodeUnit()
      ? _codepointsToUtf16CodeUnits(codepoints)
      : codepoints;
  return new String.fromCharCodes(charCodes);
}
/*
 * Cached answer of the 16-bit code unit probe performed by
 * _is16BitCodeUnit(). Stays null until the probe has run once.
 */
bool _test16BitCodeUnit = null;

// TODO _is16BitCodeUnit() is used to work around a bug with dart2js
// (http://code.google.com/p/dart/issues/detail?id=1357). Consider
// removing after this issue is resolved.
//
// Reports whether String.fromCharCodes treats its input as 16-bit code
// units. The probe runs on the first call only; later calls return the
// cached result.
bool _is16BitCodeUnit() {
  if (_test16BitCodeUnit != null) {
    return _test16BitCodeUnit;
  }
  // 0xD11E is 0x1D11E with everything above the low 16 bits dropped. The two
  // strings can only be equal if the platform discarded those upper bits.
  final String fromAstral = new String.fromCharCodes([0x1D11E]);
  final String fromLow16Bits = new String.fromCharCodes([0xD11E]);
  _test16BitCodeUnit = (fromAstral == fromLow16Bits);
  return _test16BitCodeUnit;
}
/**
 * Invalid codepoints or encodings may be substituted with the value U+fffd.
 */
const int UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xFFFD;

/** The byte order mark, U+FEFF. */
const int UNICODE_BOM = 0xFEFF;

/** Low-order byte of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_LO = 0xFF;

/** High-order byte of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_HI = 0xFE;

/** Mask selecting bits 0-7 (the lowest byte) of a value. */
const int UNICODE_BYTE_ZERO_MASK = 0xFF;

/** Mask selecting bits 8-15 (the second-lowest byte) of a value. */
const int UNICODE_BYTE_ONE_MASK = 0xFF00;

/** The largest valid Unicode code point, U+10FFFF. */
const int UNICODE_VALID_RANGE_MAX = 0x10FFFF;

/** The largest code point that fits in a single 16-bit code unit. */
const int UNICODE_PLANE_ONE_MAX = 0xFFFF;

/** First value of the range reserved for UTF-16 surrogates. */
const int UNICODE_UTF16_RESERVED_LO = 0xD800;

/** Last value of the range reserved for UTF-16 surrogates. */
const int UNICODE_UTF16_RESERVED_HI = 0xDFFF;

/**
 * Amount subtracted from a code point above [UNICODE_PLANE_ONE_MAX] before it
 * is split into a surrogate pair (and added back when a pair is merged).
 */
const int UNICODE_UTF16_OFFSET = 0x10000;

/** Base value of the first (high) code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xD800;

/** Base value of the second (low) code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xDC00;

/** Mask selecting the upper ten bits of a 20-bit surrogate payload. */
const int UNICODE_UTF16_HI_MASK = 0xFFC00;

/** Mask selecting the lower ten bits of a 20-bit surrogate payload. */
const int UNICODE_UTF16_LO_MASK = 0x3FF;
/**
 * Encodes code points as UTF-16 code units.
 *
 * Only the [length] entries of [codepoints] starting at [offset] are encoded;
 * a null [length] means "through the end of the list". Values that are
 * negative, inside the surrogate range, or above [UNICODE_VALID_RANGE_MAX]
 * are written as [replacementCodepoint]. If [replacementCodepoint] is null
 * such a value raises an ArgumentError instead.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(codepoints, offset, length);

  // Sizing pass: only code points above U+FFFF need two code units. Every
  // other value, valid or not, takes exactly one slot in the output.
  int unitCount = 0;
  for (int codepoint in range) {
    bool needsPair = codepoint > UNICODE_PLANE_ONE_MAX &&
        codepoint <= UNICODE_VALID_RANGE_MAX;
    unitCount += needsPair ? 2 : 1;
  }

  // Encoding pass.
  List<int> units = new List<int>(unitCount);
  int out = 0;
  for (int codepoint in range) {
    bool belowSurrogates =
        codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO;
    bool aboveSurrogates = codepoint > UNICODE_UTF16_RESERVED_HI &&
        codepoint <= UNICODE_PLANE_ONE_MAX;
    if (belowSurrogates || aboveSurrogates) {
      // Fits in one code unit as-is.
      units[out++] = codepoint;
      continue;
    }

    if (codepoint > UNICODE_PLANE_ONE_MAX &&
        codepoint <= UNICODE_VALID_RANGE_MAX) {
      // Split the 20-bit payload into a high/low surrogate pair.
      int payload = codepoint - UNICODE_UTF16_OFFSET;
      int highTenBits = (payload & UNICODE_UTF16_HI_MASK) >> 10;
      int lowTenBits = payload & UNICODE_UTF16_LO_MASK;
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + highTenBits;
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + lowTenBits;
      continue;
    }

    // Not encodable: substitute, or fail when substitution is disabled.
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[out++] = replacementCodepoint;
  }
  return units;
}
/**
 * Decodes UTF-16 code units into code points.
 *
 * Only the [length] entries of [utf16CodeUnits] starting at [offset] are
 * decoded; a null [length] means "through the end of the list".
 * [replacementCodepoint] is handed to the [Utf16CodeUnitDecoder] that does
 * the actual decoding.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(utf16CodeUnits, offset, length);
  _ListRangeIterator units = range.iterator();
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Allocate one slot per remaining code unit. Merged surrogate pairs use
  // fewer slots, in which case the result is trimmed below.
  List<int> decoded = new List<int>(units.remaining);
  int count = 0;
  while (decoder.hasNext) {
    decoded[count] = decoder.next();
    count++;
  }

  if (count == decoded.length) {
    return decoded;
  }
  List<int> trimmed = new List<int>(count);
  trimmed.setRange(0, count, decoded);
  return trimmed;
}
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 * The parameters can override the default Unicode replacement character. Set
 * the replacementCharacter to null to throw an ArgumentError
 * rather than replace the bad value.
 *
 * An unpaired surrogate produces exactly one replacement (or error) and never
 * consumes the code unit that follows it: [0xD800, 0x41] decodes to
 * U+FFFD followed by 0x41.
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  /** Source of the UTF-16 code units being decoded. */
  final _ListRangeIterator utf16CodeUnitIterator;

  /** Substitute for undecodable input; null means "throw instead". */
  final int replacementCodepoint;

  /**
   * Decodes the [length] entries of [utf16CodeUnits] starting at [offset].
   * A null [length] means "through the end of the list".
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
          UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator = (new _ListRange(utf16CodeUnits, offset, length))
          .iterator();

  /** Decodes whatever is left in an existing code unit iterator. */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> iterator() => this;

  bool get hasNext => utf16CodeUnitIterator.hasNext;

  /**
   * Returns the next code point, consuming one code unit, or two when they
   * form a valid surrogate pair.
   *
   * Throws an ArgumentError on undecodable input when [replacementCodepoint]
   * is null.
   */
  int next() {
    int value = utf16CodeUnitIterator.next();

    if (value < 0) {
      return _replaceOrThrow();
    }

    if (value < UNICODE_UTF16_RESERVED_LO ||
        (value > UNICODE_UTF16_RESERVED_HI &&
            value <= UNICODE_PLANE_ONE_MAX)) {
      // transfer directly
      return value;
    }

    if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.hasNext) {
      // High surrogate with something after it: try to merge a pair.
      int nextValue = utf16CodeUnitIterator.next();
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;
        value += UNICODE_UTF16_OFFSET +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
        return value;
      }
      // The high surrogate is unpaired. The code unit just read is not part
      // of it, so un-read it: it will be decoded on its own by the next call
      // instead of being silently dropped.
      utf16CodeUnitIterator.backup();
      return _replaceOrThrow();
    }

    // Lone low surrogate, high surrogate at end of input, or a value that
    // does not fit in 16 bits.
    return _replaceOrThrow();
  }

  /**
   * Yields [replacementCodepoint], or throws an ArgumentError naming the
   * iterator's current position when no replacement is configured.
   */
  int _replaceOrThrow() {
    if (replacementCodepoint != null) {
      return replacementCodepoint;
    }
    throw new ArgumentError(
        "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
  }
}
/**
 * _ListRange is an internal type used to create a lightweight Iterable over a
 * range within a source list. DO NOT MODIFY the underlying list while
 * iterating over it. The results of doing so are undefined.
 */
class _ListRange implements Iterable {
  final List _source;
  final int _offset;
  final int _length;

  /**
   * Covers [length] elements of [source] starting at [offset]. A null
   * [length] means "everything from [offset] to the end of [source]".
   *
   * Throws a RangeError if the offset or length is negative, or if the range
   * extends past the end of [source].
   */
  _ListRange(source, [offset = 0, length]) :
      this._source = source,
      this._offset = offset,
      this._length = (length == null ? source.length - offset : length) {
    final int sourceLength = _source.length;
    if (_offset < 0 || sourceLength < _offset) {
      throw new RangeError.value(_offset);
    }
    // _length is never null here: the initializer above always resolves it.
    if (_length < 0) {
      throw new RangeError.value(_length);
    }
    final int end = _offset + _length;
    if (end > sourceLength) {
      throw new RangeError.value(end);
    }
  }

  /** Returns a fresh iterator positioned at the start of the range. */
  _ListRangeIterator iterator() {
    return new _ListRangeIteratorImpl(_source, _offset, _offset + _length);
  }

  /** Number of elements in the range. */
  int get length {
    return _length;
  }
}
/**
 * The _ListRangeIterator provides more capabilities than a standard iterator,
 * including the ability to get the current position, count remaining items,
 * and move forward/backward within the iterator.
 */
abstract class _ListRangeIterator implements Iterator<int> {
  /**
   * Whether another element is available. Declared as a getter (not a field)
   * so that implementations are not required to supply a setter.
   */
  bool get hasNext;

  /** Returns the current element and advances past it. */
  int next();

  /** Index in the underlying list of the element [next] would return. */
  int get position;

  /** Moves the iterator backward by [by] elements. */
  void backup([int by]);

  /** Number of elements left before the end of the range. */
  int get remaining;

  /** Moves the iterator forward by [count] elements without reading them. */
  void skip([int count]);
}
/**
 * Index-based [_ListRangeIterator] over the elements of a list from a start
 * offset up to, but not including, an end offset. No bounds checks are made
 * by [backup] or [skip].
 */
class _ListRangeIteratorImpl implements _ListRangeIterator {
  final List<int> _source;
  int _offset;
  final int _end;

  _ListRangeIteratorImpl(this._source, this._offset, this._end);

  bool get hasNext {
    return _offset < _end;
  }

  int next() {
    // Advance before reading so the position moves even if the read throws.
    final int index = _offset;
    _offset = index + 1;
    return _source[index];
  }

  int get position {
    return _offset;
  }

  void backup([int by = 1]) {
    _offset = _offset - by;
  }

  int get remaining {
    return _end - _offset;
  }

  void skip([int count = 1]) {
    _offset = _offset + count;
  }
}