Remove support for deprecated encodings and parser_console.dart library (#93)
diff --git a/.travis.yml b/.travis.yml
index 714aa86..337ceca 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,9 +1,12 @@
language: dart
+
dart:
- dev
+ - 2.0.0
+
dart_task:
- test: -p vm
- - test: -p chrome,firefox
+ - test: -p chrome
- dartanalyzer: --fatal-warnings --fatal-infos .
matrix:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14eb41c..2ddbae7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## 0.14.0
+
+*BREAKING CHANGES*
+
+- Drop support for encodings other than UTF-8 and ASCII.
+- Removed `parser_console.dart` library.
+
## 0.13.4+1
* Fixes to readme and pubspec.
diff --git a/lib/parser_console.dart b/lib/parser_console.dart
deleted file mode 100644
index 28dee14..0000000
--- a/lib/parser_console.dart
+++ /dev/null
@@ -1,42 +0,0 @@
-/// This library adds `dart:io` support to the HTML5 parser. Call
-/// [initDartIOSupport] before calling the [parse] methods and they will accept
-/// a [RandomAccessFile] as input, in addition to the other input types.
-library parser_console;
-
-import 'dart:io';
-import 'parser.dart';
-import 'src/inputstream.dart' as inputstream;
-
-/// Adds support to the [HtmlParser] for running on a console VM. In particular
-/// this means it will be able to handle `dart:io` and [RandomAccessFile]s as
-/// input to the various [parse] methods.
-void useConsole() {
- inputstream.consoleSupport = _ConsoleSupport();
-}
-
-class _ConsoleSupport extends inputstream.ConsoleSupport {
- List<int> bytesFromFile(source) {
- if (source is! RandomAccessFile) return null;
- return readAllBytesFromFile(source);
- }
-}
-
-// TODO(jmesserly): this should be `RandomAccessFile.readAllBytes`.
-/// Synchronously reads all bytes from the [file].
-List<int> readAllBytesFromFile(RandomAccessFile file) {
- int length = file.lengthSync();
- var bytes = List<int>(length);
-
- int bytesRead = 0;
- while (bytesRead < length) {
- int read = file.readIntoSync(bytes, bytesRead, length - bytesRead);
- if (read <= 0) {
- // This could happen if, for example, the file was resized while
- // we're reading. Just shrink the bytes array and move on.
- bytes = bytes.sublist(0, bytesRead);
- break;
- }
- bytesRead += read;
- }
- return bytes;
-}
diff --git a/lib/src/char_encodings.dart b/lib/src/char_encodings.dart
index ba10a4a..6120056 100644
--- a/lib/src/char_encodings.dart
+++ b/lib/src/char_encodings.dart
@@ -1,8 +1,4 @@
-/// Decodes bytes using the correct name. See [decodeBytes].
-library char_encodings;
-
-import 'dart:collection';
-import 'package:utf/utf.dart';
+import 'utf.dart';
// TODO(jmesserly): this function is conspicuously absent from dart:utf.
/// Returns true if the [bytes] starts with a UTF-8 byte order mark.
@@ -21,15 +17,9 @@
/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
/// the codepoints. Supports the major unicode encodings as well as ascii and
/// and windows-1252 encodings.
-Iterable<int> decodeBytes(String encoding, List<int> bytes,
- [int offset = 0,
- int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- if (length == null) length = bytes.length;
- final replace = replacementCodepoint;
+Iterable<int> decodeBytes(String encoding, List<int> bytes) {
switch (encoding) {
case 'ascii':
- bytes = bytes.sublist(offset, offset + length);
// TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
for (int byte in bytes) {
if (byte > 127) {
@@ -41,32 +31,18 @@
}
return bytes;
- case 'windows-1252':
- case 'cp1252':
- return decodeWindows1252AsIterable(bytes, offset, length, replace);
-
case 'utf-8':
// NOTE: to match the behavior of the other decode functions, we eat the
// utf-8 BOM here.
- if (hasUtf8Bom(bytes, offset, length)) {
+
+ var offset = 0;
+ var length = bytes.length;
+
+ if (hasUtf8Bom(bytes)) {
offset += 3;
length -= 3;
}
- return decodeUtf8AsIterable(bytes, offset, length, replace);
-
- case 'utf-16':
- return decodeUtf16AsIterable(bytes, offset, length, replace);
- case 'utf-16-be':
- return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
- case 'utf-16-le':
- return decodeUtf16leAsIterable(bytes, offset, length, true, replace);
-
- case 'utf-32':
- return decodeUtf32AsIterable(bytes, offset, length, replace);
- case 'utf-32-be':
- return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
- case 'utf-32-le':
- return decodeUtf32leAsIterable(bytes, offset, length, true, replace);
+ return decodeUtf8AsIterable(bytes, offset, length);
default:
throw ArgumentError('Encoding $encoding not supported');
@@ -94,135 +70,3 @@
}
return newCodes;
}
-
-/// Decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) bytes as
-/// an iterable. Thus, the consumer can only convert as much of the input as
-/// needed. Set the [replacementCharacter] to null to throw an [ArgumentError]
-/// rather than replace the bad value.
-IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
- [int offset = 0,
- int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- return IterableWindows1252Decoder(
- bytes, offset, length, replacementCodepoint);
-}
-
-/// Return type of [decodeWindows1252AsIterable] and variants. The Iterable type
-/// provides an iterator on demand and the iterator will only translate bytes
-/// as requested by the user of the iterator. (Note: results are not cached.)
-class IterableWindows1252Decoder extends IterableBase<int> {
- final List<int> bytes;
- final int offset;
- final int length;
- final int replacementCodepoint;
-
- IterableWindows1252Decoder(this.bytes,
- [this.offset = 0,
- this.length,
- this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
-
- Windows1252Decoder get iterator =>
- Windows1252Decoder(bytes, offset, length, replacementCodepoint);
-}
-
-/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
-/// The parameters can set an offset into a list of bytes (as int), limit the
-/// length of the values to be decoded, and override the default Unicode
-/// replacement character. Set the replacementCharacter to null to throw an
-/// ArgumentError rather than replace the bad value. The return value
-/// from this method can be used as an Iterable (e.g. in a for-loop).
-class Windows1252Decoder implements Iterator<int> {
- final int replacementCodepoint;
- final List<int> _bytes;
- int _offset;
- final int _length;
-
- Windows1252Decoder(List<int> bytes,
- [int offset = 0,
- int length,
- this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
- : _bytes = bytes,
- _offset = offset - 1,
- _length = length == null ? bytes.length : length;
-
- bool get _inRange => _offset >= 0 && _offset < _length;
- int get current => _inRange ? _mapChar(_bytes[_offset]) : null;
-
- bool moveNext() {
- _offset++;
- return _inRange;
- }
-
- int _mapChar(int char) {
- // TODO(jmesserly): this is duplicating entitiesWindows1252 and
- // replacementCharacters from constants.dart
- switch (char) {
- case 0x80:
- return 0x20AC; // EURO SIGN
- case 0x82:
- return 0x201A; // SINGLE LOW-9 QUOTATION MARK
- case 0x83:
- return 0x0192; // LATIN SMALL LETTER F WITH HOOK
- case 0x84:
- return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
- case 0x85:
- return 0x2026; // HORIZONTAL ELLIPSIS
- case 0x86:
- return 0x2020; // DAGGER
- case 0x87:
- return 0x2021; // DOUBLE DAGGER
- case 0x88:
- return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
- case 0x89:
- return 0x2030; // PER MILLE SIGN
- case 0x8A:
- return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
- case 0x8B:
- return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
- case 0x8C:
- return 0x0152; // LATIN CAPITAL LIGATURE OE
- case 0x8E:
- return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
- case 0x91:
- return 0x2018; // LEFT SINGLE QUOTATION MARK
- case 0x92:
- return 0x2019; // RIGHT SINGLE QUOTATION MARK
- case 0x93:
- return 0x201C; // LEFT DOUBLE QUOTATION MARK
- case 0x94:
- return 0x201D; // RIGHT DOUBLE QUOTATION MARK
- case 0x95:
- return 0x2022; // BULLET
- case 0x96:
- return 0x2013; // EN DASH
- case 0x97:
- return 0x2014; // EM DASH
- case 0x98:
- return 0x02DC; // SMALL TILDE
- case 0x99:
- return 0x2122; // TRADE MARK SIGN
- case 0x9A:
- return 0x0161; // LATIN SMALL LETTER S WITH CARON
- case 0x9B:
- return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
- case 0x9C:
- return 0x0153; // LATIN SMALL LIGATURE OE
- case 0x9E:
- return 0x017E; // LATIN SMALL LETTER Z WITH CARON
- case 0x9F:
- return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
-
- case 0x81:
- case 0x8D:
- case 0x8F:
- case 0x90:
- case 0x9D:
- if (replacementCodepoint == null) {
- throw ArgumentError(
- "Invalid windows-1252 code point $char at $_offset");
- }
- return replacementCodepoint;
- }
- return char;
- }
-}
diff --git a/lib/src/constants.dart b/lib/src/constants.dart
index fc0c6a1..34addfc 100644
--- a/lib/src/constants.dart
+++ b/lib/src/constants.dart
@@ -29,10 +29,10 @@
"Entity used with illegal number (windows-1252 reference).",
"cant-convert-numeric-entity":
"Numeric entity couldn't be converted to character "
- "(codepoint U+%(charAsInt)08x).",
+ "(codepoint U+%(charAsInt)08x).",
"illegal-codepoint-for-numeric-entity":
"Numeric entity represents an illegal codepoint: "
- "U+%(charAsInt)08x.",
+ "U+%(charAsInt)08x.",
"numeric-entity-without-semicolon": "Numeric entity didn't end with ';'.",
"expected-numeric-entity-but-got-eof":
"Numeric entity expected. Got end of file instead.",
@@ -46,7 +46,7 @@
"Expected tag name. Got '>' instead.",
"expected-tag-name-but-got-question-mark":
"Expected tag name. Got '?' instead. (HTML doesn't "
- "support processing instructions.)",
+ "support processing instructions.)",
"expected-tag-name": "Expected tag name. Got something else instead",
"expected-closing-tag-but-got-right-bracket":
"Expected closing tag. Got '>' instead. Ignoring '</>'.",
@@ -133,7 +133,7 @@
"missing-end-tags": "Missing end tags (%(name)s).",
"unexpected-start-tag-implies-end-tag":
"Unexpected start tag (%(startName)s) "
- "implies end tag (%(endName)s).",
+ "implies end tag (%(endName)s).",
"unexpected-start-tag-treated-as":
"Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
"deprecated-tag": "Unexpected start tag %(name)s. Don't use it!",
@@ -165,7 +165,7 @@
"unexpected-form-in-table": "Unexpected form in table context.",
"unexpected-start-tag-implies-table-voodoo":
"Unexpected start tag (%(name)s) in "
- "table context caused voodoo mode.",
+ "table context caused voodoo mode.",
"unexpected-end-tag-implies-table-voodoo": "Unexpected end tag (%(name)s) in "
"table context caused voodoo mode.",
"unexpected-cell-in-table-body": "Unexpected table cell start tag (%(name)s) "
@@ -180,12 +180,12 @@
"Unexpected end tag (%(name)s) in the table row phase. Ignored.",
"unexpected-select-in-select":
"Unexpected select start tag in the select phase "
- "treated as select end tag.",
+ "treated as select end tag.",
"unexpected-input-in-select":
"Unexpected input start tag in the select phase.",
"unexpected-start-tag-in-select":
"Unexpected start tag token (%(name)s in the select phase. "
- "Ignored.",
+ "Ignored.",
"unexpected-end-tag-in-select":
"Unexpected end tag (%(name)s) in the select phase. Ignored.",
"unexpected-table-element-start-tag-in-select-in-table":
@@ -204,7 +204,7 @@
" in the frameset phase. Ignored.",
"unexpected-frameset-in-frameset-innerhtml":
"Unexpected end tag token (frameset) "
- "in the frameset phase (innerHTML).",
+ "in the frameset phase (innerHTML).",
"unexpected-end-tag-in-frameset": "Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored.",
"unexpected-char-after-frameset": "Unexpected non-space characters in the "
diff --git a/lib/src/encoding_parser.dart b/lib/src/encoding_parser.dart
index d61e76a..d0f40d6 100644
--- a/lib/src/encoding_parser.dart
+++ b/lib/src/encoding_parser.dart
@@ -1,7 +1,5 @@
-library encoding_parser;
-
import 'constants.dart';
-import 'inputstream.dart';
+import 'html_input_stream.dart';
// TODO(jmesserly): I converted StopIteration to StateError("No more elements").
// Seems strange to throw this from outside of an iterator though.
@@ -10,15 +8,15 @@
/// raised.
class EncodingBytes {
final String _bytes;
- int _position = -1;
+ int __position = -1;
EncodingBytes(this._bytes);
- int get length => _bytes.length;
+ int get _length => _bytes.length;
- String next() {
- var p = _position = _position + 1;
- if (p >= length) {
+ String _next() {
+ var p = __position = __position + 1;
+ if (p >= _length) {
throw StateError("No more elements");
} else if (p < 0) {
throw RangeError(p);
@@ -26,59 +24,59 @@
return _bytes[p];
}
- String previous() {
- var p = _position;
- if (p >= length) {
+ String _previous() {
+ var p = __position;
+ if (p >= _length) {
throw StateError("No more elements");
} else if (p < 0) {
throw RangeError(p);
}
- _position = p = p - 1;
+ __position = p = p - 1;
return _bytes[p];
}
- set position(int value) {
- if (_position >= length) {
+ set _position(int value) {
+ if (__position >= _length) {
throw StateError("No more elements");
}
- _position = value;
+ __position = value;
}
- int get position {
- if (_position >= length) {
+ int get _position {
+ if (__position >= _length) {
throw StateError("No more elements");
}
- if (_position >= 0) {
- return _position;
+ if (__position >= 0) {
+ return __position;
} else {
return 0;
}
}
- String get currentByte => _bytes[position];
+ String get _currentByte => _bytes[_position];
/// Skip past a list of characters. Defaults to skipping [isWhitespace].
- String skipChars([CharPreciate skipChars]) {
+ String _skipChars([_CharPredicate skipChars]) {
if (skipChars == null) skipChars = isWhitespace;
- var p = position; // use property for the error-checking
- while (p < length) {
+ var p = _position; // use property for the error-checking
+ while (p < _length) {
var c = _bytes[p];
if (!skipChars(c)) {
- _position = p;
+ __position = p;
return c;
}
p += 1;
}
- _position = p;
+ __position = p;
return null;
}
- String skipUntil(CharPreciate untilChars) {
- var p = position;
- while (p < length) {
+ String _skipUntil(_CharPredicate untilChars) {
+ var p = _position;
+ while (p < _length) {
var c = _bytes[p];
if (untilChars(c)) {
- _position = p;
+ __position = p;
return c;
}
p += 1;
@@ -89,14 +87,14 @@
/// Look for a sequence of bytes at the start of a string. If the bytes
/// are found return true and advance the position to the byte after the
/// match. Otherwise return false and leave the position alone.
- bool matchBytes(String bytes) {
- var p = position;
+ bool _matchBytes(String bytes) {
+ var p = _position;
if (_bytes.length < p + bytes.length) {
return false;
}
var data = _bytes.substring(p, p + bytes.length);
if (data == bytes) {
- position += bytes.length;
+ _position += bytes.length;
return true;
}
return false;
@@ -104,19 +102,19 @@
/// Look for the next sequence of bytes matching a given sequence. If
/// a match is found advance the position to the last byte of the match
- bool jumpTo(String bytes) {
- var newPosition = _bytes.indexOf(bytes, position);
+ bool _jumpTo(String bytes) {
+ var newPosition = _bytes.indexOf(bytes, _position);
if (newPosition >= 0) {
- _position = newPosition + bytes.length - 1;
+ __position = newPosition + bytes.length - 1;
return true;
} else {
throw StateError("No more elements");
}
}
- String slice(int start, [int end]) {
- if (end == null) end = length;
- if (end < 0) end += length;
+ String _slice(int start, [int end]) {
+ if (end == null) end = _length;
+ if (end < 0) end += _length;
return _bytes.substring(start, end);
}
}
@@ -126,68 +124,69 @@
class _DispatchEntry {
final String pattern;
final _MethodHandler handler;
+
_DispatchEntry(this.pattern, this.handler);
}
/// Mini parser for detecting character encoding from meta elements.
class EncodingParser {
- final EncodingBytes data;
- String encoding;
+ final EncodingBytes _data;
+ String _encoding;
/// [bytes] - the data to work on for encoding detection.
EncodingParser(List<int> bytes)
// Note: this is intentionally interpreting bytes as codepoints.
- : data = EncodingBytes(String.fromCharCodes(bytes).toLowerCase());
+ : _data = EncodingBytes(String.fromCharCodes(bytes).toLowerCase());
String getEncoding() {
final methodDispatch = [
- _DispatchEntry("<!--", handleComment),
- _DispatchEntry("<meta", handleMeta),
- _DispatchEntry("</", handlePossibleEndTag),
- _DispatchEntry("<!", handleOther),
- _DispatchEntry("<?", handleOther),
- _DispatchEntry("<", handlePossibleStartTag),
+ _DispatchEntry("<!--", _handleComment),
+ _DispatchEntry("<meta", _handleMeta),
+ _DispatchEntry("</", _handlePossibleEndTag),
+ _DispatchEntry("<!", _handleOther),
+ _DispatchEntry("<?", _handleOther),
+ _DispatchEntry("<", _handlePossibleStartTag),
];
try {
for (;;) {
for (var dispatch in methodDispatch) {
- if (data.matchBytes(dispatch.pattern)) {
+ if (_data._matchBytes(dispatch.pattern)) {
var keepParsing = dispatch.handler();
if (keepParsing) break;
// We found an encoding. Stop.
- return encoding;
+ return _encoding;
}
}
- data.position += 1;
+ _data._position += 1;
}
} on StateError catch (_) {
// Catch this here to match behavior of Python's StopIteration
// TODO(jmesserly): refactor to not use exceptions
}
- return encoding;
+ return _encoding;
}
/// Skip over comments.
- bool handleComment() => data.jumpTo("-->");
+ bool _handleComment() => _data._jumpTo("-->");
- bool handleMeta() {
- if (!isWhitespace(data.currentByte)) {
+ bool _handleMeta() {
+ if (!isWhitespace(_data._currentByte)) {
// if we have <meta not followed by a space so just keep going
return true;
}
// We have a valid meta element we want to search for attributes
while (true) {
// Try to find the next attribute after the current position
- var attr = getAttribute();
+ var attr = _getAttribute();
if (attr == null) return true;
if (attr[0] == "charset") {
var tentativeEncoding = attr[1];
var codec = codecName(tentativeEncoding);
if (codec != null) {
- encoding = codec;
+ _encoding = codec;
return false;
}
} else if (attr[0] == "content") {
@@ -195,54 +194,54 @@
var tentativeEncoding = contentParser.parse();
var codec = codecName(tentativeEncoding);
if (codec != null) {
- encoding = codec;
+ _encoding = codec;
return false;
}
}
}
}
- bool handlePossibleStartTag() => handlePossibleTag(false);
+ bool _handlePossibleStartTag() => _handlePossibleTag(false);
- bool handlePossibleEndTag() {
- data.next();
- return handlePossibleTag(true);
+ bool _handlePossibleEndTag() {
+ _data._next();
+ return _handlePossibleTag(true);
}
- bool handlePossibleTag(bool endTag) {
- if (!isLetter(data.currentByte)) {
+ bool _handlePossibleTag(bool endTag) {
+ if (!isLetter(_data._currentByte)) {
//If the next byte is not an ascii letter either ignore this
//fragment (possible start tag case) or treat it according to
//handleOther
if (endTag) {
- data.previous();
- handleOther();
+ _data._previous();
+ _handleOther();
}
return true;
}
- var c = data.skipUntil(isSpaceOrAngleBracket);
+ var c = _data._skipUntil(_isSpaceOrAngleBracket);
if (c == "<") {
// return to the first step in the overall "two step" algorithm
// reprocessing the < byte
- data.previous();
+ _data._previous();
} else {
//Read all attributes
- var attr = getAttribute();
+ var attr = _getAttribute();
while (attr != null) {
- attr = getAttribute();
+ attr = _getAttribute();
}
}
return true;
}
- bool handleOther() => data.jumpTo(">");
+ bool _handleOther() => _data._jumpTo(">");
/// Return a name,value pair for the next attribute in the stream,
/// if one is found, or null
- List<String> getAttribute() {
+ List<String> _getAttribute() {
// Step 1 (skip chars)
- var c = data.skipChars((x) => x == "/" || isWhitespace(x));
+ var c = _data._skipChars((x) => x == "/" || isWhitespace(x));
// Step 2
if (c == ">" || c == null) {
return null;
@@ -258,8 +257,8 @@
break;
} else if (isWhitespace(c)) {
// Step 6!
- c = data.skipChars();
- c = data.next();
+ c = _data._skipChars();
+ c = _data._next();
break;
} else if (c == "/" || c == ">") {
return [attrName.join(), ""];
@@ -269,27 +268,27 @@
attrName.add(c);
}
// Step 5
- c = data.next();
+ c = _data._next();
}
// Step 7
if (c != "=") {
- data.previous();
+ _data._previous();
return [attrName.join(), ""];
}
// Step 8
- data.next();
+ _data._next();
// Step 9
- c = data.skipChars();
+ c = _data._skipChars();
// Step 10
if (c == "'" || c == '"') {
// 10.1
var quoteChar = c;
while (true) {
// 10.2
- c = data.next();
+ c = _data._next();
if (c == quoteChar) {
// 10.3
- data.next();
+ _data._next();
return [attrName.join(), attrValue.join()];
} else if (isLetter(c)) {
// 10.4
@@ -310,8 +309,8 @@
}
// Step 11
while (true) {
- c = data.next();
- if (isSpaceOrAngleBracket(c)) {
+ c = _data._next();
+ if (_isSpaceOrAngleBracket(c)) {
return [attrName.join(), attrValue.join()];
} else if (c == null) {
return null;
@@ -333,34 +332,34 @@
try {
// Check if the attr name is charset
// otherwise return
- data.jumpTo("charset");
- data.position += 1;
- data.skipChars();
- if (data.currentByte != "=") {
+ data._jumpTo("charset");
+ data._position += 1;
+ data._skipChars();
+ if (data._currentByte != "=") {
// If there is no = sign keep looking for attrs
return null;
}
- data.position += 1;
- data.skipChars();
+ data._position += 1;
+ data._skipChars();
// Look for an encoding between matching quote marks
- if (data.currentByte == '"' || data.currentByte == "'") {
- var quoteMark = data.currentByte;
- data.position += 1;
- var oldPosition = data.position;
- if (data.jumpTo(quoteMark)) {
- return data.slice(oldPosition, data.position);
+ if (data._currentByte == '"' || data._currentByte == "'") {
+ var quoteMark = data._currentByte;
+ data._position += 1;
+ var oldPosition = data._position;
+ if (data._jumpTo(quoteMark)) {
+ return data._slice(oldPosition, data._position);
} else {
return null;
}
} else {
// Unquoted value
- var oldPosition = data.position;
+ var oldPosition = data._position;
try {
- data.skipUntil(isWhitespace);
- return data.slice(oldPosition, data.position);
+ data._skipUntil(isWhitespace);
+ return data._slice(oldPosition, data._position);
} on StateError catch (_) {
//Return the whole remaining value
- return data.slice(oldPosition);
+ return data._slice(oldPosition);
}
}
} on StateError catch (_) {
@@ -369,8 +368,8 @@
}
}
-bool isSpaceOrAngleBracket(String char) {
+bool _isSpaceOrAngleBracket(String char) {
return char == ">" || char == "<" || isWhitespace(char);
}
-typedef CharPreciate = bool Function(String char);
+typedef _CharPredicate = bool Function(String char);
diff --git a/lib/src/inputstream.dart b/lib/src/html_input_stream.dart
similarity index 85%
rename from lib/src/inputstream.dart
rename to lib/src/html_input_stream.dart
index dbcf98b..84d2f13 100644
--- a/lib/src/inputstream.dart
+++ b/lib/src/html_input_stream.dart
@@ -1,21 +1,12 @@
-library inputstream;
-
import 'dart:collection';
-import 'package:utf/utf.dart';
+
import 'package:source_span/source_span.dart';
+
import 'char_encodings.dart';
import 'constants.dart';
import 'encoding_parser.dart';
import 'utils.dart';
-/// Hooks to call into dart:io without directly referencing it.
-class ConsoleSupport {
- List<int> bytesFromFile(source) => null;
-}
-
-// TODO(jmesserly): use lazy init here when supported.
-ConsoleSupport consoleSupport = ConsoleSupport();
-
/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
@@ -26,7 +17,7 @@
static const int numBytesMeta = 512;
/// Encoding to use if no other information can be found.
- static const String defaultEncoding = 'windows-1252';
+ static const String defaultEncoding = 'utf-8';
/// The name of the character encoding.
String charEncodingName;
@@ -81,18 +72,8 @@
} else if (source is List<int>) {
_rawBytes = source;
} else {
- // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
- // but it's necessary because of how the UTF decoders work.
- _rawBytes = consoleSupport.bytesFromFile(source);
-
- if (_rawBytes == null) {
- // TODO(jmesserly): we should accept some kind of stream API too.
- // Unfortunately dart:io InputStream is async only, which won't work.
- throw ArgumentError("'source' must be a String or "
- "List<int> (of bytes). You can also pass a RandomAccessFile if you"
- "`import 'package:html/parser_console.dart'` and call "
- "`useConsole()`.");
- }
+ throw ArgumentError.value(
+ source, 'source', 'Must be a String or List<int>.');
}
// Detect encoding iff no explicit "transport level" encoding is supplied
@@ -121,7 +102,7 @@
if (c == NEWLINE) continue;
}
- if (invalidUnicode(c)) errors.add('invalid-codepoint');
+ if (_invalidUnicode(c)) errors.add('invalid-codepoint');
if (0xD800 <= c && c <= 0xDFFF) {
c = 0xFFFD;
@@ -199,14 +180,6 @@
if (hasUtf8Bom(_rawBytes)) {
return 'utf-8';
}
- // Note: we don't need to remember whether it was big or little endian
- // because the decoder will do that later. It will also eat the BOM for us.
- if (hasUtf16Bom(_rawBytes)) {
- return 'utf-16';
- }
- if (hasUtf32Bom(_rawBytes)) {
- return 'utf-32';
- }
return null;
}
@@ -262,7 +235,7 @@
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
-bool invalidUnicode(int c) {
+bool _invalidUnicode(int c) {
if (0x0001 <= c && c <= 0x0008) return true;
if (0x000E <= c && c <= 0x001F) return true;
if (0x007F <= c && c <= 0x009F) return true;
diff --git a/lib/src/tokenizer.dart b/lib/src/tokenizer.dart
index 48d6365..638663e 100644
--- a/lib/src/tokenizer.dart
+++ b/lib/src/tokenizer.dart
@@ -3,7 +3,7 @@
import 'dart:collection';
import 'package:html/parser.dart' show HtmlParser;
import 'constants.dart';
-import 'inputstream.dart';
+import 'html_input_stream.dart';
import 'token.dart';
import 'utils.dart';
diff --git a/lib/src/utf.dart b/lib/src/utf.dart
new file mode 100644
index 0000000..a635db8
--- /dev/null
+++ b/lib/src/utf.dart
@@ -0,0 +1,237 @@
+// Large portions of this code were taken from https://github.com/dart-lang/utf
+
+import "dart:collection";
+
+const int _replacementCodepoint = 0xfffd;
+
+const int _UNICODE_VALID_RANGE_MAX = 0x10ffff;
+const int _UNICODE_UTF16_RESERVED_LO = 0xd800;
+const int _UNICODE_UTF16_RESERVED_HI = 0xdfff;
+
+const int _UTF8_ONE_BYTE_MAX = 0x7f;
+const int _UTF8_TWO_BYTE_MAX = 0x7ff;
+const int _UTF8_THREE_BYTE_MAX = 0xffff;
+
+const int _UTF8_LO_SIX_BIT_MASK = 0x3f;
+
+const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;
+const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;
+const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;
+const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;
+const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;
+
+const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;
+
+/// Decodes the UTF-8 [bytes] in the range starting at [offset] and spanning
+/// [length] bytes as a lazy iterable of codepoints, so the consumer converts
+/// only as much input as needed. Invalid sequences are replaced with U+FFFD.
+Iterable<int> decodeUtf8AsIterable(List<int> bytes, int offset, int length) =>
+ _IterableUtf8Decoder(bytes, offset, length);
+
+/// Return type of [decodeUtf8AsIterable] and variants. The Iterable type
+/// provides an iterator on demand and the iterator will only translate bytes
+/// as requested by the user of the iterator. (Note: results are not cached.)
+// TODO(floitsch): Consider removing the extend and switch to implements since
+// that's cheaper to allocate.
+class _IterableUtf8Decoder extends IterableBase<int> {
+ final List<int> bytes;
+ final int offset;
+ final int length;
+
+ _IterableUtf8Decoder(this.bytes, this.offset, this.length);
+
+ _Utf8Decoder get iterator => _Utf8Decoder(bytes, offset, length);
+}
+
+/// Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The
+/// parameters can set an offset into a list of bytes (as int) and limit the
+/// length of the values to be decoded. Invalid sequences always decode to the
+/// Unicode replacement character (U+FFFD); the replacement codepoint is a
+/// fixed constant and can no longer be overridden or set to null. Instances
+/// are obtained through the iterable returned by [decodeUtf8AsIterable].
+class _Utf8Decoder implements Iterator<int> {
+ final _ListRangeIterator utf8EncodedBytesIterator;
+ int _current;
+
+ _Utf8Decoder(List<int> utf8EncodedBytes, int offset, int length)
+ : utf8EncodedBytesIterator =
+ (_ListRange(utf8EncodedBytes, offset, length)).iterator;
+
+ _Utf8Decoder._fromListRangeIterator(_ListRange source)
+ : utf8EncodedBytesIterator = source.iterator;
+
+ /// Decode the remainder of the characters in this decoder
+ /// into a [List<int>].
+ List<int> decodeRest() {
+ List<int> codepoints = List<int>(utf8EncodedBytesIterator.remaining);
+ int i = 0;
+ while (moveNext()) {
+ codepoints[i++] = current;
+ }
+ if (i == codepoints.length) {
+ return codepoints;
+ } else {
+ List<int> truncCodepoints = List<int>(i);
+ truncCodepoints.setRange(0, i, codepoints);
+ return truncCodepoints;
+ }
+ }
+
+ int get current => _current;
+
+ bool moveNext() {
+ _current = null;
+
+ if (!utf8EncodedBytesIterator.moveNext()) return false;
+
+ int value = utf8EncodedBytesIterator.current;
+ int additionalBytes = 0;
+
+ if (value < 0) {
+ if (_replacementCodepoint != null) {
+ _current = _replacementCodepoint;
+ return true;
+ } else {
+ throw ArgumentError(
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
+ }
+ } else if (value <= _UTF8_ONE_BYTE_MAX) {
+ _current = value;
+ return true;
+ } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
+ if (_replacementCodepoint != null) {
+ _current = _replacementCodepoint;
+ return true;
+ } else {
+ throw ArgumentError(
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
+ }
+ } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
+ value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
+ additionalBytes = 1;
+ } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
+ value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
+ additionalBytes = 2;
+ } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
+ value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
+ additionalBytes = 3;
+ } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
+ value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
+ additionalBytes = 4;
+ } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
+ value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
+ additionalBytes = 5;
+ } else if (_replacementCodepoint != null) {
+ _current = _replacementCodepoint;
+ return true;
+ } else {
+ throw ArgumentError(
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
+ }
+ int j = 0;
+ while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
+ int nextValue = utf8EncodedBytesIterator.current;
+ if (nextValue > _UTF8_ONE_BYTE_MAX &&
+ nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
+ value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
+ } else {
+ // if sequence-starting code unit, reposition cursor to start here
+ if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
+ utf8EncodedBytesIterator.backup();
+ }
+ break;
+ }
+ j++;
+ }
+ bool validSequence = (j == additionalBytes &&
+ (value < _UNICODE_UTF16_RESERVED_LO ||
+ value > _UNICODE_UTF16_RESERVED_HI));
+ bool nonOverlong = (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
+ (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
+ (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
+ bool inRange = value <= _UNICODE_VALID_RANGE_MAX;
+ if (validSequence && nonOverlong && inRange) {
+ _current = value;
+ return true;
+ } else if (_replacementCodepoint != null) {
+ _current = _replacementCodepoint;
+ return true;
+ } else {
+ throw ArgumentError(
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
+ }
+ }
+}
+
+/// _ListRange is an internal type used to create a lightweight Iterable on a
+/// range within a source list. DO NOT MODIFY the underlying list while
+/// iterating over it. The results of doing so are undefined.
+// TODO(floitsch): Consider removing the extend and switch to implements since
+// that's cheaper to allocate.
+class _ListRange extends IterableBase<int> {
+ final List<int> _source;
+ final int _offset;
+ final int _length;
+
+ _ListRange(List<int> source, [int offset = 0, int length])
+ : _source = source,
+ _offset = offset,
+ _length = (length == null ? source.length - offset : length) {
+ if (_offset < 0 || _offset > _source.length) {
+ throw RangeError.value(_offset);
+ }
+ if (_length != null && (_length < 0)) {
+ throw RangeError.value(_length);
+ }
+ if (_length + _offset > _source.length) {
+ throw RangeError.value(_length + _offset);
+ }
+ }
+
+ _ListRangeIterator get iterator =>
+ _ListRangeIteratorImpl(_source, _offset, _offset + _length);
+
+ int get length => _length;
+}
+
+/// The ListRangeIterator provides more capabilities than a standard iterator,
+/// including the ability to get the current position, count remaining items,
+/// and move forward/backward within the iterator.
+abstract class _ListRangeIterator implements Iterator<int> {
+ bool moveNext();
+
+ int get current;
+
+ int get position;
+
+ void backup([int by]);
+
+ int get remaining;
+
+ void skip([int count]);
+}
+
+class _ListRangeIteratorImpl implements _ListRangeIterator {
+ final List<int> _source;
+ int _offset;
+ final int _end;
+
+ _ListRangeIteratorImpl(this._source, int offset, this._end)
+ : _offset = offset - 1;
+
+ int get current => _source[_offset];
+
+ bool moveNext() => ++_offset < _end;
+
+ int get position => _offset;
+
+ void backup([int by = 1]) {
+ _offset -= by;
+ }
+
+ int get remaining => _end - _offset - 1;
+
+ void skip([int count = 1]) {
+ _offset += count;
+ }
+}
diff --git a/pubspec.yaml b/pubspec.yaml
index b3cf488..92e502e 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -1,5 +1,5 @@
name: html
-version: 0.13.4+1
+version: 0.14.0-dev
description: APIs for parsing and manipulating HTML content outside the browser.
author: Dart Team <misc@dartlang.org>
@@ -11,9 +11,9 @@
dependencies:
csslib: '>=0.13.2 <0.15.0'
source_span: '>=1.0.0 <2.0.0'
- utf: '>=0.9.0 <0.10.0'
dev_dependencies:
path: ^1.6.2
pedantic: ^1.3.0
test: ^1.3.0
+ utf: '>=0.9.0 <0.10.0'
diff --git a/test/data/parser_feature/raw_file.html b/test/data/parser_feature/raw_file.html
deleted file mode 100644
index bcdbf76..0000000
--- a/test/data/parser_feature/raw_file.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<!doctype html>
-<html>
-<body>
-Hello world!
-</body>
-</html>
diff --git a/test/parser_feature_test.dart b/test/parser_feature_test.dart
index 2591a2d..0889f44 100644
--- a/test/parser_feature_test.dart
+++ b/test/parser_feature_test.dart
@@ -1,13 +1,13 @@
/// Additional feature tests that aren't based on test data.
library parser_feature_test;
-import 'package:test/test.dart';
import 'package:html/dom.dart';
import 'package:html/parser.dart';
import 'package:html/src/constants.dart';
import 'package:html/src/encoding_parser.dart';
import 'package:html/src/treebuilder.dart';
import 'package:source_span/source_span.dart';
+import 'package:test/test.dart';
main() {
_testElementSpans();
diff --git a/test/parser_test.dart b/test/parser_test.dart
index 1289f61..1db1586 100644
--- a/test/parser_test.dart
+++ b/test/parser_test.dart
@@ -2,13 +2,12 @@
library parser_test;
import 'dart:convert';
-import 'dart:io';
-import 'package:path/path.dart' as pathos;
-import 'package:test/test.dart';
+
import 'package:html/dom.dart';
import 'package:html/parser.dart';
-import 'package:html/parser_console.dart' as parser_console;
-import 'package:html/src/inputstream.dart' as inputstream;
+import 'package:path/path.dart' as pathos;
+import 'package:test/test.dart';
+
import 'support.dart';
// Run the parse error checks
@@ -71,16 +70,6 @@
}
void main() {
- test('dart:io', () {
- // ensure IO support is unregistered
- expect(inputstream.consoleSupport,
- const TypeMatcher<inputstream.ConsoleSupport>());
- var file = File('$testDataDir/parser_feature/raw_file.html').openSync();
- expect(() => parse(file), throwsA(const TypeMatcher<ArgumentError>()));
- parser_console.useConsole();
- expect(parse(file).body.innerHtml.trim(), 'Hello world!');
- });
-
for (var path in getDataFiles('tree-construction')) {
if (!path.endsWith('.dat')) continue;