lib/src/html_input_stream.dart - html - Git at Google

 import 'dart:collection';
 import 'dart:convert' show ascii, utf8;

 import 'package:source_span/source_span.dart';

 import 'constants.dart';
 import 'encoding_parser.dart';
 import 'utils.dart';

 /// Provides a unicode stream of characters to the HtmlTokenizer.
 ///
 /// This class takes care of character encoding and removing or replacing
 /// incorrect byte-sequences and also provides column and line tracking.
 ///
 /// The whole input is decoded and normalized up front by [reset]; [char],
 /// [peekChar] and [charsUntil] then read from the resulting code point list.
 class HtmlInputStream {
   /// Number of bytes to use when looking for a meta element with
   /// encoding information.
   static const int numBytesMeta = 512;

   /// Encoding to use if no other information can be found.
   static const String defaultEncoding = 'utf-8';

   /// The name of the character encoding.
   String charEncodingName;

   /// True if we are certain about [charEncodingName], false for tentative.
   bool charEncodingCertain = true;

   /// Value of the `generateSpans` constructor argument.
   final bool generateSpans;

   /// Location where the contents of the stream were found.
   final String sourceUrl;

   /// The undecoded input, or null when the stream was built from a String.
   List<int> _rawBytes;

   /// Code points waiting to be normalized by [reset].
   ///
   /// Kept for the lifetime of the stream when a Dart String was passed in;
   /// otherwise decoded from [_rawBytes] on demand and released afterwards.
   Iterable<int> _rawChars;

   /// Problems found while normalizing the input (e.g. 'invalid-codepoint').
   Queue<String> errors;

   /// Source file built from the normalized code points.
   SourceFile fileInfo;

   /// Offsets into [_chars] at which each line starts.
   List<int> _lineStarts;

   /// The normalized code points that the read methods serve.
   List<int> _chars;

   /// Index into [_chars] of the next code point to be read.
   int _offset;

   /// Creates a stream over [source] and immediately decodes and normalizes it.
   ///
   /// [source] must be either a [String] or a [List<int>] containing the raw
   /// bytes; any other value throws an [ArgumentError].
   ///
   /// The optional [encoding] parameter must be a string that indicates
   /// the encoding. If [codecName] recognizes it, that encoding will be used,
   /// regardless of any BOM or later declaration (such as in a meta
   /// element). It is ignored for [String] sources, which are always treated
   /// as 'utf-8' with a certain encoding.
   ///
   /// [parseMeta] - Look for a <meta> element containing encoding information
   /// when no usable [encoding] was given.
   HtmlInputStream(source,
       [String encoding,
       bool parseMeta = true,
       this.generateSpans = false,
       this.sourceUrl])
       : charEncodingName = codecName(encoding) {
     if (source is String) {
       _rawChars = source.runes.toList();
       charEncodingName = 'utf-8';
       charEncodingCertain = true;
     } else if (source is List<int>) {
       _rawBytes = source;
     } else {
       throw ArgumentError.value(
           source, 'source', 'Must be a String or List<int>.');
     }

     // Detect encoding iff no explicit "transport level" encoding is supplied
     if (charEncodingName == null) {
       detectEncoding(parseMeta);
     }

     reset();
   }

   /// Rewinds the stream to the start and rebuilds the normalized buffer.
   ///
   /// Clears [errors], decodes [_rawBytes] with [charEncodingName] when no
   /// decoded characters are cached, and then normalizes every code point:
   /// CR and CRLF become a single LF, surrogate code points become U+FFFD, and
   /// each code point rejected by `_invalidUnicode` adds 'invalid-codepoint'
   /// to [errors].
   void reset() {
     errors = Queue<String>();

     _offset = 0;
     _lineStarts = <int>[0];
     _chars = <int>[];

     _rawChars ??= _decodeBytes(charEncodingName, _rawBytes);

     // Set after a RETURN so that the NEWLINE of a CRLF pair is dropped.
     var skipNewline = false;
     for (var c in _rawChars) {
       if (skipNewline) {
         skipNewline = false;
         if (c == NEWLINE) continue;
       }

       if (_invalidUnicode(c)) errors.add('invalid-codepoint');

       if (0xD800 <= c && c <= 0xDFFF) {
         c = 0xFFFD;
       } else if (c == RETURN) {
         skipNewline = true;
         c = NEWLINE;
       }

       _chars.add(c);
       if (c == NEWLINE) _lineStarts.add(_chars.length);
     }

     // Free decoded characters if they aren't needed anymore.
     if (_rawBytes != null) _rawChars = null;

     // TODO(sigmund): Don't parse the file at all if spans aren't being
     // generated.
     fileInfo = SourceFile.decoded(_chars, url: sourceUrl);
   }

   /// Works out [charEncodingName] from the raw bytes.
   ///
   /// Tries, in order: a byte order mark (certain), a meta declaration when
   /// [parseMeta] is true (tentative), and finally [defaultEncoding]
   /// (tentative).
   void detectEncoding([bool parseMeta = true]) {
     // First look for a BOM. The BOM itself is not consumed here.
     charEncodingName = detectBOM();
     charEncodingCertain = true;

     // If there is no BOM need to look for meta elements with encoding
     // information
     if (charEncodingName == null && parseMeta) {
       charEncodingName = detectEncodingMeta();
       charEncodingCertain = false;
     }
     // If all else fails use the default encoding
     if (charEncodingName == null) {
       charEncodingCertain = false;
       charEncodingName = defaultEncoding;
     }

     // Substitute for equivalent encodings:
     // NOTE(review): `_decodeBytes` in this file has no 'windows-1252' case,
     // so decoding such input currently throws an ArgumentError.
     if (charEncodingName.toLowerCase() == 'iso-8859-1') {
       charEncodingName = 'windows-1252';
     }
   }

   /// Switches the stream to [newEncoding].
   ///
   /// Names that [codecName] does not recognize are ignored. If the resolved
   /// name equals the current one the encoding is just marked certain.
   /// Otherwise the input is decoded again from the start and a
   /// `ReparseException` is thrown.
   ///
   /// Throws a [StateError] when the stream was created from a String.
   void changeEncoding(String newEncoding) {
     if (_rawBytes == null) {
       // We should never get here -- if encoding is certain we won't try to
       // change it.
       throw StateError('cannot change encoding when parsing a String.');
     }

     newEncoding = codecName(newEncoding);
     if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
       newEncoding = 'utf-8';
     }
     if (newEncoding == null) {
       return;
     } else if (newEncoding == charEncodingName) {
       charEncodingCertain = true;
     } else {
       // Remember the previous name before overwriting it so the message
       // below reports both encodings.
       final oldEncoding = charEncodingName;
       charEncodingName = newEncoding;
       charEncodingCertain = true;
       _rawChars = null;
       reset();
       throw ReparseException(
           'Encoding changed from $oldEncoding to $newEncoding');
     }
   }

   /// Attempts to detect a BOM at the start of the stream. If
   /// an encoding can be determined from the BOM return the name of the
   /// encoding otherwise return null.
   String detectBOM() {
     // Only the UTF-8 BOM is recognized.
     if (_hasUtf8Bom(_rawBytes)) {
       return 'utf-8';
     }
     return null;
   }

   /// Report the encoding declared by the meta element.
   ///
   /// Only the first [numBytesMeta] bytes are examined. A declared UTF-16
   /// variant is reported as 'utf-8'.
   String detectEncodingMeta() {
     var parser = EncodingParser(slice(_rawBytes, 0, numBytesMeta));
     var encoding = parser.getEncoding();

     if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
       encoding = 'utf-8';
     }

     return encoding;
   }

   /// Returns the current offset in the stream, i.e. the number of codepoints
   /// since the start of the file.
   int get position => _offset;

   /// Reads one character from the stream and advances past it. Returns
   /// `eof` when the end of the input has been reached.
   String char() {
     if (_offset >= _chars.length) return eof;
     return String.fromCharCode(_chars[_offset++]);
   }

   /// Returns the next character without consuming it, or `eof` at the end
   /// of the input.
   String peekChar() {
     if (_offset >= _chars.length) return eof;
     return String.fromCharCode(_chars[_offset]);
   }

   /// Consumes and returns a run of characters from the stream.
   ///
   /// By default the run extends up to but not including the first character
   /// contained in [characters]. When [opposite] is true the run instead
   /// consists of characters that are contained in [characters]. The run
   /// always stops at the end of the input.
   String charsUntil(String characters, [bool opposite = false]) {
     final start = _offset;
     while (_offset < _chars.length &&
         characters.contains(String.fromCharCode(_chars[_offset])) ==
             opposite) {
       _offset++;
     }

     // Build the result straight from the buffer; no intermediate copy.
     return String.fromCharCodes(_chars, start, _offset);
   }

   /// Steps back one character so that [ch] will be read again.
   ///
   /// Does nothing when [ch] is null.
   void unget(String ch) {
     // Only one character is allowed to be ungotten at once - it must
     // be consumed again before any further call to unget
     if (ch != null) {
       _offset--;
       assert(peekChar() == ch);
     }
   }
 }

 // TODO(jmesserly): the Python code used a regex to check for this. But
 // Dart doesn't let you create a regexp with invalid characters.
 /// Returns true if the code point [c] is one that the input stream reports
 /// as 'invalid-codepoint'.
 bool _invalidUnicode(int c) {
   // Control characters U+0001-U+0008, U+000B and U+000E-U+001F.
   final isLowControl = (c >= 0x0001 && c <= 0x0008) ||
       c == 0x000B ||
       (c >= 0x000E && c <= 0x001F);
   if (isLowControl) return true;

   // U+007F through U+009F.
   if (c >= 0x007F && c <= 0x009F) return true;

   // Surrogate code points.
   if (c >= 0xD800 && c <= 0xDFFF) return true;

   // U+FDD0 through U+FDEF.
   if (c >= 0xFDD0 && c <= 0xFDEF) return true;

   // The last two code points (xxFFFE and xxFFFF) of each of the 17 planes.
   return c >= 0xFFFE && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE;
 }

 /// Looks up the codec name for [encoding] in the `encodings` table.
 ///
 /// ASCII whitespace and punctuation are removed from [encoding] and the
 /// remainder is lower-cased before the lookup. Returns null when [encoding]
 /// is null or when the table has no entry for the resulting key.
 String codecName(String encoding) {
   if (encoding == null) return null;

   final punctuationOrSpace = RegExp(
       "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
   final stripped = encoding.replaceAll(punctuationOrSpace, '');
   return encodings[stripped.toLowerCase()];
 }

 /// Returns true if [bytes] has a UTF-8 byte order mark at [offset].
 /// Since UTF-8 doesn't have byte order, it's somewhat of a misnomer, but it is
 /// used in HTML to detect the UTF-8 encoding.
 ///
 /// When [length] is given, only the `length` bytes starting at [offset] are
 /// considered. The window never extends past the end of [bytes], so a
 /// [length] that overshoots the list yields false instead of a [RangeError].
 bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
   var end = bytes.length;
   // Clamp the requested window to the data that actually exists.
   if (length != null && offset + length < end) end = offset + length;
   return (offset + 3) <= end &&
       bytes[offset] == 0xEF &&
       bytes[offset + 1] == 0xBB &&
       bytes[offset + 2] == 0xBF;
 }

 /// Decodes [bytes] using the named [encoding] and returns the resulting
 /// code points.
 ///
 /// Only 'ascii' and 'utf-8' are handled; any other [encoding] (including
 /// null) throws an [ArgumentError].
 Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
   if (encoding == 'ascii') {
     final text = ascii.decode(bytes);
     return text.runes;
   }

   if (encoding == 'utf-8') {
     // NOTE: To match the behavior of the other decode functions, we eat the
     // UTF-8 BOM here. This is the default behavior of `utf8.decode`.
     final text = utf8.decode(bytes);
     return text.runes;
   }

   throw ArgumentError('Encoding $encoding not supported');
 }
	import 'dart:collection';
	import 'dart:convert' show ascii, utf8;

	import 'package:source_span/source_span.dart';

	import 'constants.dart';
	import 'encoding_parser.dart';
	import 'utils.dart';

	/// Provides a unicode stream of characters to the HtmlTokenizer.
	///
	/// This class takes care of character encoding and removing or replacing
	/// incorrect byte-sequences and also provides column and line tracking.
	///
	/// The whole input is decoded and normalized up front by [reset]; [char],
	/// [peekChar] and [charsUntil] then read from the resulting code point list.
	class HtmlInputStream {
	  /// Number of bytes to use when looking for a meta element with
	  /// encoding information.
	  static const int numBytesMeta = 512;

	  /// Encoding to use if no other information can be found.
	  static const String defaultEncoding = 'utf-8';

	  /// The name of the character encoding.
	  String charEncodingName;

	  /// True if we are certain about [charEncodingName], false for tentative.
	  bool charEncodingCertain = true;

	  /// Value of the `generateSpans` constructor argument.
	  final bool generateSpans;

	  /// Location where the contents of the stream were found.
	  final String sourceUrl;

	  /// The undecoded input, or null when the stream was built from a String.
	  List<int> _rawBytes;

	  /// Code points waiting to be normalized by [reset].
	  ///
	  /// Kept for the lifetime of the stream when a Dart String was passed in;
	  /// otherwise decoded from [_rawBytes] on demand and released afterwards.
	  Iterable<int> _rawChars;

	  /// Problems found while normalizing the input (e.g. 'invalid-codepoint').
	  Queue<String> errors;

	  /// Source file built from the normalized code points.
	  SourceFile fileInfo;

	  /// Offsets into [_chars] at which each line starts.
	  List<int> _lineStarts;

	  /// The normalized code points that the read methods serve.
	  List<int> _chars;

	  /// Index into [_chars] of the next code point to be read.
	  int _offset;

	  /// Creates a stream over [source] and immediately decodes and normalizes it.
	  ///
	  /// [source] must be either a [String] or a [List<int>] containing the raw
	  /// bytes; any other value throws an [ArgumentError].
	  ///
	  /// The optional [encoding] parameter must be a string that indicates
	  /// the encoding. If [codecName] recognizes it, that encoding will be used,
	  /// regardless of any BOM or later declaration (such as in a meta
	  /// element). It is ignored for [String] sources, which are always treated
	  /// as 'utf-8' with a certain encoding.
	  ///
	  /// [parseMeta] - Look for a <meta> element containing encoding information
	  /// when no usable [encoding] was given.
	  HtmlInputStream(source,
	      [String encoding,
	      bool parseMeta = true,
	      this.generateSpans = false,
	      this.sourceUrl])
	      : charEncodingName = codecName(encoding) {
	    if (source is String) {
	      _rawChars = source.runes.toList();
	      charEncodingName = 'utf-8';
	      charEncodingCertain = true;
	    } else if (source is List<int>) {
	      _rawBytes = source;
	    } else {
	      throw ArgumentError.value(
	          source, 'source', 'Must be a String or List<int>.');
	    }

	    // Detect encoding iff no explicit "transport level" encoding is supplied
	    if (charEncodingName == null) {
	      detectEncoding(parseMeta);
	    }

	    reset();
	  }

	  /// Rewinds the stream to the start and rebuilds the normalized buffer.
	  ///
	  /// Clears [errors], decodes [_rawBytes] with [charEncodingName] when no
	  /// decoded characters are cached, and then normalizes every code point:
	  /// CR and CRLF become a single LF, surrogate code points become U+FFFD, and
	  /// each code point rejected by `_invalidUnicode` adds 'invalid-codepoint'
	  /// to [errors].
	  void reset() {
	    errors = Queue<String>();

	    _offset = 0;
	    _lineStarts = <int>[0];
	    _chars = <int>[];

	    _rawChars ??= _decodeBytes(charEncodingName, _rawBytes);

	    // Set after a RETURN so that the NEWLINE of a CRLF pair is dropped.
	    var skipNewline = false;
	    for (var c in _rawChars) {
	      if (skipNewline) {
	        skipNewline = false;
	        if (c == NEWLINE) continue;
	      }

	      if (_invalidUnicode(c)) errors.add('invalid-codepoint');

	      if (0xD800 <= c && c <= 0xDFFF) {
	        c = 0xFFFD;
	      } else if (c == RETURN) {
	        skipNewline = true;
	        c = NEWLINE;
	      }

	      _chars.add(c);
	      if (c == NEWLINE) _lineStarts.add(_chars.length);
	    }

	    // Free decoded characters if they aren't needed anymore.
	    if (_rawBytes != null) _rawChars = null;

	    // TODO(sigmund): Don't parse the file at all if spans aren't being
	    // generated.
	    fileInfo = SourceFile.decoded(_chars, url: sourceUrl);
	  }

	  /// Works out [charEncodingName] from the raw bytes.
	  ///
	  /// Tries, in order: a byte order mark (certain), a meta declaration when
	  /// [parseMeta] is true (tentative), and finally [defaultEncoding]
	  /// (tentative).
	  void detectEncoding([bool parseMeta = true]) {
	    // First look for a BOM. The BOM itself is not consumed here.
	    charEncodingName = detectBOM();
	    charEncodingCertain = true;

	    // If there is no BOM need to look for meta elements with encoding
	    // information
	    if (charEncodingName == null && parseMeta) {
	      charEncodingName = detectEncodingMeta();
	      charEncodingCertain = false;
	    }
	    // If all else fails use the default encoding
	    if (charEncodingName == null) {
	      charEncodingCertain = false;
	      charEncodingName = defaultEncoding;
	    }

	    // Substitute for equivalent encodings:
	    // NOTE(review): `_decodeBytes` in this file has no 'windows-1252' case,
	    // so decoding such input currently throws an ArgumentError.
	    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
	      charEncodingName = 'windows-1252';
	    }
	  }

	  /// Switches the stream to [newEncoding].
	  ///
	  /// Names that [codecName] does not recognize are ignored. If the resolved
	  /// name equals the current one the encoding is just marked certain.
	  /// Otherwise the input is decoded again from the start and a
	  /// `ReparseException` is thrown.
	  ///
	  /// Throws a [StateError] when the stream was created from a String.
	  void changeEncoding(String newEncoding) {
	    if (_rawBytes == null) {
	      // We should never get here -- if encoding is certain we won't try to
	      // change it.
	      throw StateError('cannot change encoding when parsing a String.');
	    }

	    newEncoding = codecName(newEncoding);
	    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
	      newEncoding = 'utf-8';
	    }
	    if (newEncoding == null) {
	      return;
	    } else if (newEncoding == charEncodingName) {
	      charEncodingCertain = true;
	    } else {
	      // Remember the previous name before overwriting it so the message
	      // below reports both encodings.
	      final oldEncoding = charEncodingName;
	      charEncodingName = newEncoding;
	      charEncodingCertain = true;
	      _rawChars = null;
	      reset();
	      throw ReparseException(
	          'Encoding changed from $oldEncoding to $newEncoding');
	    }
	  }

	  /// Attempts to detect a BOM at the start of the stream. If
	  /// an encoding can be determined from the BOM return the name of the
	  /// encoding otherwise return null.
	  String detectBOM() {
	    // Only the UTF-8 BOM is recognized.
	    if (_hasUtf8Bom(_rawBytes)) {
	      return 'utf-8';
	    }
	    return null;
	  }

	  /// Report the encoding declared by the meta element.
	  ///
	  /// Only the first [numBytesMeta] bytes are examined. A declared UTF-16
	  /// variant is reported as 'utf-8'.
	  String detectEncodingMeta() {
	    var parser = EncodingParser(slice(_rawBytes, 0, numBytesMeta));
	    var encoding = parser.getEncoding();

	    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
	      encoding = 'utf-8';
	    }

	    return encoding;
	  }

	  /// Returns the current offset in the stream, i.e. the number of codepoints
	  /// since the start of the file.
	  int get position => _offset;

	  /// Reads one character from the stream and advances past it. Returns
	  /// `eof` when the end of the input has been reached.
	  String char() {
	    if (_offset >= _chars.length) return eof;
	    return String.fromCharCode(_chars[_offset++]);
	  }

	  /// Returns the next character without consuming it, or `eof` at the end
	  /// of the input.
	  String peekChar() {
	    if (_offset >= _chars.length) return eof;
	    return String.fromCharCode(_chars[_offset]);
	  }

	  /// Consumes and returns a run of characters from the stream.
	  ///
	  /// By default the run extends up to but not including the first character
	  /// contained in [characters]. When [opposite] is true the run instead
	  /// consists of characters that are contained in [characters]. The run
	  /// always stops at the end of the input.
	  String charsUntil(String characters, [bool opposite = false]) {
	    final start = _offset;
	    while (_offset < _chars.length &&
	        characters.contains(String.fromCharCode(_chars[_offset])) ==
	            opposite) {
	      _offset++;
	    }

	    // Build the result straight from the buffer; no intermediate copy.
	    return String.fromCharCodes(_chars, start, _offset);
	  }

	  /// Steps back one character so that [ch] will be read again.
	  ///
	  /// Does nothing when [ch] is null.
	  void unget(String ch) {
	    // Only one character is allowed to be ungotten at once - it must
	    // be consumed again before any further call to unget
	    if (ch != null) {
	      _offset--;
	      assert(peekChar() == ch);
	    }
	  }
	}

	// TODO(jmesserly): the Python code used a regex to check for this. But
	// Dart doesn't let you create a regexp with invalid characters.
	/// Returns true if the code point [c] is one that the input stream reports
	/// as 'invalid-codepoint'.
	bool _invalidUnicode(int c) {
	  // Control characters U+0001-U+0008, U+000B and U+000E-U+001F.
	  final isLowControl = (c >= 0x0001 && c <= 0x0008) ||
	      c == 0x000B ||
	      (c >= 0x000E && c <= 0x001F);
	  if (isLowControl) return true;

	  // U+007F through U+009F.
	  if (c >= 0x007F && c <= 0x009F) return true;

	  // Surrogate code points.
	  if (c >= 0xD800 && c <= 0xDFFF) return true;

	  // U+FDD0 through U+FDEF.
	  if (c >= 0xFDD0 && c <= 0xFDEF) return true;

	  // The last two code points (xxFFFE and xxFFFF) of each of the 17 planes.
	  return c >= 0xFFFE && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE;
	}

	/// Looks up the codec name for [encoding] in the `encodings` table.
	///
	/// ASCII whitespace and punctuation are removed from [encoding] and the
	/// remainder is lower-cased before the lookup. Returns null when [encoding]
	/// is null or when the table has no entry for the resulting key.
	String codecName(String encoding) {
	  if (encoding == null) return null;

	  final punctuationOrSpace = RegExp(
	      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
	  final stripped = encoding.replaceAll(punctuationOrSpace, '');
	  return encodings[stripped.toLowerCase()];
	}

	/// Returns true if [bytes] has a UTF-8 byte order mark at [offset].
	/// Since UTF-8 doesn't have byte order, it's somewhat of a misnomer, but it is
	/// used in HTML to detect the UTF-8 encoding.
	///
	/// When [length] is given, only the `length` bytes starting at [offset] are
	/// considered. The window never extends past the end of [bytes], so a
	/// [length] that overshoots the list yields false instead of a [RangeError].
	bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
	  var end = bytes.length;
	  // Clamp the requested window to the data that actually exists.
	  if (length != null && offset + length < end) end = offset + length;
	  return (offset + 3) <= end &&
	      bytes[offset] == 0xEF &&
	      bytes[offset + 1] == 0xBB &&
	      bytes[offset + 2] == 0xBF;
	}

	/// Decodes [bytes] using the named [encoding] and returns the resulting
	/// code points.
	///
	/// Only 'ascii' and 'utf-8' are handled; any other [encoding] (including
	/// null) throws an [ArgumentError].
	Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
	  if (encoding == 'ascii') {
	    final text = ascii.decode(bytes);
	    return text.runes;
	  }

	  if (encoding == 'utf-8') {
	    // NOTE: To match the behavior of the other decode functions, we eat the
	    // UTF-8 BOM here. This is the default behavior of `utf8.decode`.
	    final text = utf8.decode(bytes);
	    return text.runes;
	  }

	  throw ArgumentError('Encoding $encoding not supported');
	}