Fixes encoding parser to handle whitespace correctly.
Caught this while investigating an unused local. In Python, the iterator would advance, but in the Dart port it did not.
diff --git a/lib/src/encoding_parser.dart b/lib/src/encoding_parser.dart
index 04ffa0c..fd64ff9 100644
--- a/lib/src/encoding_parser.dart
+++ b/lib/src/encoding_parser.dart
@@ -1,6 +1,5 @@
library encoding_parser;
-import 'dart:collection';
import 'constants.dart';
import 'inputstream.dart';
@@ -9,14 +8,12 @@
/// String-like object with an associated position and various extra methods
/// If the position is ever greater than the string length then an exception is
/// raised.
-class EncodingBytes extends IterableBase<String> {
+class EncodingBytes {
final String _bytes;
int _position = -1;
EncodingBytes(this._bytes);
- Iterator<String> get iterator => _bytes.split('').iterator;
-
int get length => _bytes.length;
String next() {
@@ -145,25 +142,21 @@
];
try {
- for (var byte in data) {
- var keepParsing = true;
+ for (;;) {
for (var dispatch in methodDispatch) {
if (data.matchBytes(dispatch[0])) {
- try {
- keepParsing = dispatch[1]();
- break;
- } on StateError catch (e) {
- keepParsing = false;
- break;
- }
+ var keepParsing = dispatch[1]();
+ if (keepParsing) break;
+
+ // We found an encoding. Stop.
+ return encoding;
}
}
- if (!keepParsing) {
- break;
- }
+ data.position += 1;
}
} on StateError catch (e) {
// Catch this here to match behavior of Python's StopIteration
+ // TODO(jmesserly): refactor to not use exceptions
}
return encoding;
}
diff --git a/test/parser_feature_test.dart b/test/parser_feature_test.dart
index 1beea2a..538113d 100644
--- a/test/parser_feature_test.dart
+++ b/test/parser_feature_test.dart
@@ -5,6 +5,7 @@
import 'package:html/dom.dart';
import 'package:html/parser.dart';
import 'package:html/src/constants.dart';
+import 'package:html/src/encoding_parser.dart';
import 'package:html/src/treebuilder.dart';
main() {
@@ -291,4 +292,29 @@
expect(c.text, 'qux');
expect(e.text, 'bar');
});
+
+ group('Encoding pre-parser', () {
+ getEncoding(s) => new EncodingParser(s.codeUnits).getEncoding();
+
+ test('gets encoding from meta charset', () {
+ expect(getEncoding('<meta charset="utf-16">'), 'utf-16');
+ });
+
+ test('gets encoding from meta in head', () {
+ expect(getEncoding('<head><meta charset="utf-16">'), 'utf-16');
+ });
+
+ test('skips comments', () {
+ expect(getEncoding('<!--comment--><meta charset="utf-16">'), 'utf-16');
+ });
+
+ test('stops if no match', () {
+ // unterminated tag: the closing '>' is missing
+ expect(getEncoding('<meta charset="utf-16"'), null);
+ });
+
+ test('ignores whitespace', () {
+ expect(getEncoding(' <meta charset="utf-16">'), 'utf-16');
+ });
+ });
}