// blob: 6120056ba642474b30f6fc9cf5b6299e813e96f4 [file] [log] [blame]
import 'utf.dart';
// TODO(jmesserly): this function is conspicuously absent from dart:utf.
/// Returns true if [bytes] starts with a UTF-8 byte order mark (`EF BB BF`).
///
/// Since UTF-8 doesn't have byte order, "byte order mark" is somewhat of a
/// misnomer, but it is used in HTML to detect the UTF-8 encoding.
///
/// The inspected window begins at [offset] and spans [length] bytes; when
/// [length] is omitted the window extends to the end of [bytes]. The window is
/// clamped to the end of [bytes], and if fewer than three bytes are available
/// in it the result is false.
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int? length]) {
  var end = length != null ? offset + length : bytes.length;
  // Clamp to the real data so an over-long [length] yields `false` instead of
  // an out-of-range read in the comparisons below.
  if (end > bytes.length) end = bytes.length;
  return (offset + 3) <= end &&
      bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
// file, but dart:utf does not expose stream-based decoders yet.
/// Decodes [bytes] according to [encoding] and returns the resulting code
/// points.
///
/// Two encoding names are recognized:
///
/// * `'ascii'`: every byte is checked to be at most 127 and [bytes] itself is
///   returned. A larger byte raises a [FormatException].
/// * `'utf-8'`: a leading UTF-8 byte order mark, if present, is skipped and
///   the remainder is handed to [decodeUtf8AsIterable].
///
/// Any other [encoding] raises an [ArgumentError].
Iterable<int> decodeBytes(String encoding, List<int> bytes) {
  if (encoding == 'ascii') {
    // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
    for (final byte in bytes) {
      if (byte > 127) {
        // TODO(jmesserly): ideally this would be DecoderException, like the
        // one thrown in runtime/bin/string_stream.dart, but we don't want to
        // depend on dart:io.
        throw FormatException("Illegal ASCII character $byte");
      }
    }
    return bytes;
  }

  if (encoding == 'utf-8') {
    // NOTE: to match the behavior of the other decode functions, the UTF-8
    // BOM is consumed here rather than surfaced as a code point.
    final skip = hasUtf8Bom(bytes) ? 3 : 0;
    return decodeUtf8AsIterable(bytes, skip, bytes.length - skip);
  }

  throw ArgumentError('Encoding $encoding not supported');
}
/// Returns the code points for the [input].
///
/// This works like [String.codeUnits] except that each valid UTF-16 surrogate
/// pair is combined into a single code point. A surrogate that is not part of
/// a valid pair is passed through unchanged as its own element.
///
/// The result is a fresh, growable list.
List<int> toCodepoints(String input) {
  // [String.runes] applies exactly this pairing rule, so the core library can
  // do the decoding instead of a hand-written surrogate loop.
  return input.runes.toList();
}