// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
import 'package:collection/collection.dart';
import 'package:string_scanner/string_scanner.dart';
import 'package:source_span/source_span.dart';
import 'style.dart';
import 'token.dart';
import 'utils.dart';
import 'yaml_exception.dart';
/// A scanner that reads a string of Unicode characters and emits [Token]s.
///
/// This is based on the libyaml scanner, available at
/// https://github.com/yaml/libyaml/blob/master/src/scanner.c. The license for
/// that is available in ../../libyaml-license.txt.
class Scanner {
// Code points of the characters the scanner tests for. Each constant is the
// Unicode code point of the character its name describes.

// Whitespace and line breaks.
static const TAB = 0x9;
static const LF = 0xA;
static const CR = 0xD;
static const SP = 0x20;

// ASCII punctuation and YAML indicator characters.
static const DOLLAR = 0x24;
static const LEFT_PAREN = 0x28;
static const RIGHT_PAREN = 0x29;
static const PLUS = 0x2B;
static const COMMA = 0x2C;
static const HYPHEN = 0x2D;
static const PERIOD = 0x2E;
static const QUESTION = 0x3F;
static const COLON = 0x3A;
static const SEMICOLON = 0x3B;
static const EQUALS = 0x3D;
static const LEFT_SQUARE = 0x5B;
static const RIGHT_SQUARE = 0x5D;
static const LEFT_CURLY = 0x7B;
static const RIGHT_CURLY = 0x7D;
static const HASH = 0x23;
static const AMPERSAND = 0x26;
static const ASTERISK = 0x2A;
static const EXCLAMATION = 0x21;
static const VERTICAL_BAR = 0x7C;
static const LEFT_ANGLE = 0x3C;
static const RIGHT_ANGLE = 0x3E;
static const SINGLE_QUOTE = 0x27;
static const DOUBLE_QUOTE = 0x22;
static const PERCENT = 0x25;
static const AT = 0x40;
static const GRAVE_ACCENT = 0x60;
static const TILDE = 0x7E;

// ASCII control characters and a few more punctuation characters.
static const NULL = 0x0;
static const BELL = 0x7;
static const BACKSPACE = 0x8;
static const VERTICAL_TAB = 0xB;
static const FORM_FEED = 0xC;
static const ESCAPE = 0x1B;
static const SLASH = 0x2F;
static const BACKSLASH = 0x5C;
static const UNDERSCORE = 0x5F;

// Non-ASCII characters: next line, no-break space, line/paragraph separators
// and the byte-order mark.
static const NEL = 0x85;
static const NBSP = 0xA0;
static const LINE_SEPARATOR = 0x2028;
static const PARAGRAPH_SEPARATOR = 0x2029;
static const BOM = 0xFEFF;

// Bounds of the decimal digit range '0'..'9'.
static const NUMBER_0 = 0x30;
static const NUMBER_9 = 0x39;

// Lowercase ASCII letters.
static const LETTER_A = 0x61;
static const LETTER_B = 0x62;
static const LETTER_E = 0x65;
static const LETTER_F = 0x66;
static const LETTER_N = 0x6E;
static const LETTER_R = 0x72;
static const LETTER_T = 0x74;
static const LETTER_U = 0x75;
static const LETTER_V = 0x76;
static const LETTER_X = 0x78;
static const LETTER_Z = 0x7A;

// Uppercase ASCII letters.
static const LETTER_CAP_A = 0x41;
static const LETTER_CAP_F = 0x46;
static const LETTER_CAP_L = 0x4C;
static const LETTER_CAP_N = 0x4E;
static const LETTER_CAP_P = 0x50;
static const LETTER_CAP_U = 0x55;
static const LETTER_CAP_X = 0x58;
static const LETTER_CAP_Z = 0x5A;
/// The underlying [SpanScanner] used to read characters from the source text.
///
/// This is also used to track line and column information and to generate
/// [SourceSpan]s.
final SpanScanner _scanner;
/// Whether this scanner has produced a [TokenType.STREAM_START] token
/// indicating the beginning of the YAML stream.
var _streamStartProduced = false;
/// Whether this scanner has produced a [TokenType.STREAM_END] token
/// indicating the end of the YAML stream.
var _streamEndProduced = false;
/// The queue of tokens yet to be emitted.
///
/// These are queued up in advance so that [TokenType.KEY] tokens can be
/// inserted once the scanner determines that a series of tokens represents a
/// mapping key.
final _tokens = new QueueList<Token>();
/// The number of tokens that have been emitted.
///
/// This doesn't count tokens still waiting in [_tokens]. Token numbers stored
/// in simple keys are converted to queue indices by subtracting this value.
var _tokensParsed = 0;
/// Whether the next token in [_tokens] is ready to be returned.
///
/// It might not be ready if there may still be a [TokenType.KEY] inserted
/// before it.
var _tokenAvailable = false;
/// The stack of indent levels for the current nested block contexts.
///
/// The YAML spec specifies that the initial indentation level is -1 spaces.
final _indents = <int>[-1];
/// Whether a simple key is allowed in this context.
///
/// A simple key refers to any mapping key that doesn't have an explicit "?".
var _simpleKeyAllowed = true;
/// The stack of potential simple keys for each level of flow nesting.
///
/// Entries in this list may be `null`, indicating that there is no valid
/// simple key for the associated level of nesting.
///
/// When a ":" is parsed and there's a simple key available, a [TokenType.KEY]
/// token is inserted in [_tokens] before that key's token. This allows the
/// parser to tell that the key is intended to be a mapping key.
final _simpleKeys = <_SimpleKey>[null];
/// The current indentation level, i.e. the top of the [_indents] stack.
int get _indent {
  return _indents.last;
}
/// Whether the scanner is currently positioned in a block-level structure (as
/// opposed to a flow-level one).
///
/// This is the case exactly when [_simpleKeys] holds a single entry.
bool get _inBlockContext {
  final flowDepth = _simpleKeys.length;
  return flowDepth == 1;
}
/// Whether the scanner is at a line break or has consumed the whole source.
bool get _isBreakOrEnd {
  if (_scanner.isDone) return true;
  return _isBreak;
}
/// Whether the character at the current position is a line break.
bool get _isBreak {
  return _isBreakAt(0);
}
/// Whether the character at the current position is whitespace, or the
/// scanner is at the end of the source.
bool get _isBlankOrEnd {
  return _isBlankOrEndAt(0);
}
/// Whether the character at the current position is a space or a tab.
bool get _isBlank {
  return _isBlankAt(0);
}
/// Whether the current character is a valid tag name character.
///
/// See the `ns-tag-char` production in the YAML 1.2 specification. Always
/// false at the end of the source.
// NOTE(review): the URI character list documented in [_scanTagUri] also
// mentions characters such as ';', '?', '&', '*' and '_' that are not accepted
// here — confirm whether `case` labels for them are missing.
bool get _isTagChar {
  var char = _scanner.peekChar();
  if (char == null) return false;
  switch (char) {
    case HYPHEN:
    case SLASH:
    case COLON:
    case AT:
    case EQUALS:
    case PLUS:
    case DOLLAR:
    case PERIOD:
    case TILDE:
      return true;
    default:
      // Anything that isn't one of the punctuation characters above must be
      // an ASCII letter or digit. Without the `default` label this test was
      // unreachable behind the `return true` above.
      return (char >= NUMBER_0 && char <= NUMBER_9) ||
          (char >= LETTER_A && char <= LETTER_Z) ||
          (char >= LETTER_CAP_A && char <= LETTER_CAP_Z);
  }
}
/// Whether the current character is a valid anchor name character.
///
/// An anchor character is any non-space character (see [_isNonSpace]) that is
/// not a flow indicator. See the `ns-anchor-char` production in the YAML 1.2
/// specification.
bool get _isAnchorChar {
  if (!_isNonSpace) return false;
  switch (_scanner.peekChar()) {
    // Flow indicators terminate an anchor name. [_scanAnchor] relies on this:
    // it accepts ',', ']' and '}' as characters that may follow a name.
    case COMMA:
    case LEFT_SQUARE:
    case RIGHT_SQUARE:
    case LEFT_CURLY:
    case RIGHT_CURLY:
      return false;
    default:
      return true;
  }
}
/// Whether the character at the current position is a decimal digit.
///
/// Always false at the end of the source.
bool get _isDigit {
  final codeUnit = _scanner.peekChar();
  if (codeUnit == null) return false;
  return codeUnit >= NUMBER_0 && codeUnit <= NUMBER_9;
}
/// Whether the character at the current position is a hexadecimal digit
/// (`0`-`9`, `a`-`f` or `A`-`F`).
///
/// Always false at the end of the source.
bool get _isHex {
  final codeUnit = _scanner.peekChar();
  if (codeUnit == null) return false;
  if (codeUnit >= NUMBER_0 && codeUnit <= NUMBER_9) return true;
  if (codeUnit >= LETTER_A && codeUnit <= LETTER_F) return true;
  return codeUnit >= LETTER_CAP_A && codeUnit <= LETTER_CAP_F;
}
/// Whether the character at the current position is a plain character.
///
/// See the `ns-plain-char(c)` production in the YAML 1.2 specification.
bool get _isPlainChar {
  return _isPlainCharAt(0);
}
/// Whether the character at the current position is a printable character
/// other than a line break or byte-order mark.
///
/// See the `nb-char` production in the YAML 1.2 specification. Always false
/// at the end of the source.
bool get _isNonBreak {
  var char = _scanner.peekChar();
  if (char == null) return false;
  switch (char) {
    case LF:
    case CR:
    case BOM:
      return false;
    case TAB:
    case NEL:
      return true;
    default:
      // The remaining printable ranges. Without the `default` label this test
      // was unreachable behind the `return true` above.
      return (char >= 0x00020 && char <= 0x00007E) ||
          (char >= 0x000A0 && char <= 0x00D7FF) ||
          (char >= 0x0E000 && char <= 0x00FFFD) ||
          (char >= 0x10000 && char <= 0x10FFFF);
  }
}
/// Whether the character at the current position is a printable character
/// other than whitespace.
///
/// See the `ns-char` production in the YAML 1.2 specification. Always false
/// at the end of the source.
bool get _isNonSpace {
  var char = _scanner.peekChar();
  if (char == null) return false;
  switch (char) {
    case LF:
    case CR:
    case BOM:
    case SP:
      return false;
    case NEL:
      return true;
    default:
      // The remaining printable ranges; a tab (0x9) falls outside all of them
      // and is therefore rejected. Without the `default` label this test was
      // unreachable behind the `return true` above.
      return (char >= 0x00020 && char <= 0x00007E) ||
          (char >= 0x000A0 && char <= 0x00D7FF) ||
          (char >= 0x0E000 && char <= 0x00FFFD) ||
          (char >= 0x10000 && char <= 0x10FFFF);
  }
}
/// Whether the current character begins a document indicator (`---` or
/// `...`).
///
/// An indicator must start in column 0 and be followed by whitespace or the
/// end of the source. If one is found, the scanner's last match is set to it.
bool get _isDocumentIndicator {
  if (_scanner.column != 0) return false;
  if (!_isBlankOrEndAt(3)) return false;
  if (_scanner.matches('---')) return true;
  return _scanner.matches('...');
}
/// Creates a scanner that scans [source].
///
/// [sourceUrl] can be a String or a [Uri]; it is passed through unchanged to
/// the underlying [SpanScanner].
Scanner(String source, {sourceUrl})
: _scanner = new SpanScanner.eager(source, sourceUrl: sourceUrl);
/// Consumes and returns the next token.
///
/// Throws a [StateError] if the [TokenType.STREAM_END] token has already been
/// returned.
Token scan() {
  if (_streamEndProduced) throw new StateError("Out of tokens.");
  if (!_tokenAvailable) _fetchMoreTokens();

  var token = _tokens.removeFirst();
  _tokenAvailable = false;

  // Keep the emitted-token count in step with the queue: simple keys store
  // absolute token numbers that are converted to queue indices by subtracting
  // [_tokensParsed].
  _tokensParsed++;

  _streamEndProduced = token is Token && token.type == TokenType.STREAM_END;
  return token;
}
/// Consumes the next token and returns the one after that.
///
/// The returned token is only peeked at, not consumed. Returns `null` if the
/// consumed token was the end of the stream.
Token advance() {
  scan();
  return peek();
}
/// Returns the next token without consuming it.
///
/// Returns `null` once the end-of-stream token has been consumed.
Token peek() {
  if (!_streamEndProduced) {
    if (!_tokenAvailable) _fetchMoreTokens();
    return _tokens.first;
  }
  return null;
}
/// Ensures that [_tokens] contains at least one token which can be returned.
///
/// Marks the head of the queue as ready by setting [_tokenAvailable].
// NOTE(review): this method looks truncated in this excerpt — no call that
// actually fetches a token, no `break` for the simple-key check and no closing
// braces are visible. Confirm against the complete file.
void _fetchMoreTokens() {
while (true) {
if (_tokens.isNotEmpty) {
// If there are no more tokens to fetch, break.
if (_tokens.last.type == TokenType.STREAM_END) break;
// If the current token could be a simple key, we need to scan more
// tokens until we determine whether it is or not. Otherwise we might
// not emit the `KEY` token before we emit the value of the key.
if (!_simpleKeys
.any((key) => key != null && key.tokenNumber == _tokensParsed)) {
_tokenAvailable = true;
/// The dispatcher for token fetchers.
///
/// Inspects the character(s) at the current position and calls the matching
/// `_fetch*` method.
// NOTE(review): only fragments of this method are visible in this excerpt —
// most branch bodies, `case` labels, `return`s and closing braces are absent.
// The comments below describe the visible lines only.
void _fetchNextToken() {
// Has the stream-start token been produced yet?
if (!_streamStartProduced) {
// Is the whole source consumed?
if (_scanner.isDone) {
// Directives and document indicators are only recognized in column 0.
if (_scanner.column == 0) {
if (_scanner.peekChar() == PERCENT) {
if (_isBlankOrEndAt(3)) {
if (_scanner.matches('---')) {
if (_scanner.matches('...')) {
// Dispatch on the indicator character.
switch (_scanner.peekChar()) {
case COMMA:
_fetchAnchor(anchor: false);
_fetchAnchor(anchor: true);
_fetchFlowScalar(singleQuote: true);
_fetchFlowScalar(singleQuote: false);
// Block scalars are rejected outside the block context.
if (!_inBlockContext) _invalidScalarCharacter();
_fetchBlockScalar(literal: true);
if (!_inBlockContext) _invalidScalarCharacter();
_fetchBlockScalar(literal: false);
case AT:
// These characters may sometimes begin plain scalars.
case HYPHEN:
if (_isPlainCharAt(1)) {
} else {
if (_isPlainCharAt(1)) {
} else {
case COLON:
if (!_inBlockContext && _tokens.isNotEmpty) {
// If a colon follows a "JSON-like" value (an explicit map or list, or
// a quoted string) it isn't required to have whitespace after it
// since it unambiguously describes a map.
var token = _tokens.last;
if (token.type == TokenType.FLOW_SEQUENCE_END ||
token.type == TokenType.FLOW_MAPPING_END ||
(token.type == TokenType.SCALAR &&
(token as ScalarToken).style.isQuoted)) {
if (_isPlainCharAt(1)) {
} else {
// Anything else must at least be a printable, non-break character.
if (!_isNonBreak) _invalidScalarCharacter();
/// Reports the character at the current position as disallowed.
///
/// Delegates to the underlying scanner's `error`, covering one character.
void _invalidScalarCharacter() {
  _scanner.error("Unexpected character.", length: 1);
}
/// Checks the list of potential simple keys and removes the positions that
/// cannot contain simple keys anymore.
///
/// Throws a [YamlException] if a key that has gone stale was required.
void _staleSimpleKeys() {
  for (var index = 0; index < _simpleKeys.length; index++) {
    var candidate = _simpleKeys[index];

    // libyaml requires that all simple keys be a single line and no longer
    // than 1024 characters. However, in section 7.4.2 of the spec these
    // restrictions are only applied when the curly braces are omitted. It's
    // difficult to retain enough context to know which keys need to have the
    // restriction placed on them, so for now we go the other direction and
    // allow everything but multiline simple keys in a block context.
    var stale = candidate != null &&
        _inBlockContext &&
        candidate.line != _scanner.line;
    if (!stale) continue;

    if (candidate.required) {
      throw new YamlException("Expected ':'.", _scanner.emptySpan);
    }
    _simpleKeys[index] = null;
  }
}
/// Checks if a simple key may start at the current position and saves it if
/// so.
///
/// Does nothing when [_simpleKeyAllowed] is false.
// NOTE(review): the `_SimpleKey` constructor call below looks truncated in
// this excerpt. Other code reads `line`, `column` and `location` from simple
// keys, but no such arguments are passed here. Closing braces are missing too.
void _saveSimpleKey() {
// A simple key is required at the current position if the scanner is in the
// block context and the current column coincides with the indentation
// level.
var required = _inBlockContext && _indent == _scanner.column;
// A simple key is required only when it is the first token in the current
// line. Therefore it is always allowed. But we add a check anyway.
assert(_simpleKeyAllowed || !required);
if (!_simpleKeyAllowed) return;
// If the current position may start a simple key, save it. Its token number
// is the absolute index the next queued token will have.
_simpleKeys[_simpleKeys.length - 1] = new _SimpleKey(
_tokensParsed + _tokens.length,
required: required);
/// Removes a potential simple key at the current flow level.
///
/// Throws a [YamlException] if the key being removed was required.
// NOTE(review): the `throw` below is cut off in this excerpt — its span
// argument and the closing braces are not visible.
void _removeSimpleKey() {
var key = _simpleKeys.last;
if (key != null && key.required) {
throw new YamlException("Could not find expected ':' for simple key.",
// Clear the slot for the current flow level.
_simpleKeys[_simpleKeys.length - 1] = null;
/// Increases the flow level and resizes the simple key list.
///
/// The flow level is represented by the length of [_simpleKeys] (see
/// [_inBlockContext]), so entering a flow collection pushes an empty slot for
/// that level's potential simple key.
void _increaseFlowLevel() {
  _simpleKeys.add(null);
}
/// Decreases the flow level.
///
/// Does nothing in the block context, so the base entry of [_simpleKeys] is
/// never removed. Otherwise pops the innermost flow level's simple key slot.
void _decreaseFlowLevel() {
  if (_inBlockContext) return;
  _simpleKeys.removeLast();
}
/// Pushes [column] onto the indentation stack if it is greater than
/// [_indent], and emits a token of the given [type] at [location].
///
/// Does nothing in the flow context, or when the current indentation is
/// already at or beyond [column].
///
/// If [tokenNumber] is provided, the new token is inserted into [_tokens] at
/// that absolute token number; otherwise it is added at the end.
void _rollIndent(int column, TokenType type, SourceLocation location,
    {int tokenNumber}) {
  if (!_inBlockContext) return;
  if (_indent != -1 && _indent >= column) return;

  // Push the new indentation level; it becomes the current level.
  _indents.add(column);

  // Create a token and insert it into the queue.
  var token = new Token(type, location.pointSpan());
  if (tokenNumber == null) {
    _tokens.add(token);
  } else {
    // Token numbers are absolute; convert to an index into the queue.
    _tokens.insert(tokenNumber - _tokensParsed, token);
  }
}
/// Pops indentation levels from [_indents] until the current level becomes
/// less than or equal to [column].
///
/// For each popped level, appends a [TokenType.BLOCK_END] token. Does nothing
/// in the flow context.
void _unrollIndent(int column) {
  if (!_inBlockContext) return;
  while (_indent > column) {
    _tokens.add(new Token(TokenType.BLOCK_END, _scanner.emptySpan));
    // Pop the level just closed; without this the loop could never end.
    _indents.removeLast();
  }
}
/// Pops indentation levels from [_indents] until the current level resets to
/// -1.
///
/// For each indentation level, appends a [TokenType.BLOCK_END] token.
void _resetIndent() {
  _unrollIndent(-1);
}
/// Produces a [TokenType.STREAM_START] token and records that it was
/// produced.
void _fetchStreamStart() {
  // Much of libyaml's initialization logic here is done in variable
  // initializers instead.
  var streamStart = new Token(TokenType.STREAM_START, _scanner.emptySpan);
  _streamStartProduced = true;
  _tokens.add(streamStart);
}
/// Produces a [TokenType.STREAM_END] token.
///
/// The token has an empty span at the current position.
// NOTE(review): nothing here closes open block levels or drops a pending
// simple key — confirm whether such calls are missing from this excerpt. The
// closing brace is missing as well.
void _fetchStreamEnd() {
_simpleKeyAllowed = false;
_tokens.add(new Token(TokenType.STREAM_END, _scanner.emptySpan));
/// Produces a [TokenType.VERSION_DIRECTIVE] or [TokenType.TAG_DIRECTIVE]
/// token.
///
/// Nothing is queued when [_scanDirective] returns `null`.
// NOTE(review): the closing brace is not visible in this excerpt.
void _fetchDirective() {
_simpleKeyAllowed = false;
var directive = _scanDirective();
if (directive != null) _tokens.add(directive);
/// Produces a [TokenType.DOCUMENT_START] or [TokenType.DOCUMENT_END] token.
///
/// [type] selects which of the two tokens is produced.
// NOTE(review): the statements that consume the indicator characters are not
// visible between `start` and `spanFrom(start)`, so as shown the token's span
// is empty. Confirm against the complete file.
void _fetchDocumentIndicator(TokenType type) {
_simpleKeyAllowed = false;
// Consume the indicator token.
var start = _scanner.state;
_tokens.add(new Token(type, _scanner.spanFrom(start)));
/// Produces a [TokenType.FLOW_SEQUENCE_START] or
/// [TokenType.FLOW_MAPPING_START] token.
// NOTE(review): only the flag update is visible in this excerpt — the code
// that emits the token is not shown.
void _fetchFlowCollectionStart(TokenType type) {
// A simple key may follow the opening '[' or '{'.
_simpleKeyAllowed = true;
/// Produces a [TokenType.FLOW_SEQUENCE_END] or [TokenType.FLOW_MAPPING_END]
/// token.
// NOTE(review): only the flag update is visible in this excerpt — the code
// that emits the token is not shown.
void _fetchFlowCollectionEnd(TokenType type) {
// No simple key can follow the closing ']' or '}'.
_simpleKeyAllowed = false;
/// Produces a [TokenType.FLOW_ENTRY] token.
// NOTE(review): only the flag update is visible in this excerpt — the code
// that emits the token is not shown.
void _fetchFlowEntry() {
// A simple key may follow the ','.
_simpleKeyAllowed = true;
/// Produces a [TokenType.BLOCK_ENTRY] token.
///
/// In the block context, throws a [YamlException] if a simple key is not
/// allowed at the current position.
// NOTE(review): the call that takes the argument list starting with
// `_scanner.column` (presumably `_rollIndent(`), the code that emits the
// token, and the closing braces are not visible in this excerpt.
void _fetchBlockEntry() {
if (_inBlockContext) {
if (!_simpleKeyAllowed) {
throw new YamlException(
"Block sequence entries are not allowed here.", _scanner.emptySpan);
_scanner.column, TokenType.BLOCK_SEQUENCE_START, _scanner.location);
} else {
// It is an error for the '-' indicator to occur in the flow context, but
// we let the Parser detect and report it because it's able to point to
// the context.
_simpleKeyAllowed = true;
/// Produces the [TokenType.KEY] token.
///
/// In the block context, throws a [YamlException] if a simple key is not
/// allowed at the current position.
// NOTE(review): the call that takes the argument list starting with
// `_scanner.column` (presumably `_rollIndent(`), the code that emits the
// token, and the closing braces are not visible in this excerpt.
void _fetchKey() {
if (_inBlockContext) {
if (!_simpleKeyAllowed) {
throw new YamlException(
"Mapping keys are not allowed here.", _scanner.emptySpan);
_scanner.column, TokenType.BLOCK_MAPPING_START, _scanner.location);
// Simple keys are allowed after `?` in a block context.
_simpleKeyAllowed = _inBlockContext;
/// Produces the [TokenType.VALUE] token.
///
/// Three cases are visible: a pending simple key (a [TokenType.KEY] token is
/// inserted before it), a ':' in the block context without a simple key, and
/// a ':' with an empty key while simple keys are allowed.
// NOTE(review): several lines are missing from this excerpt — the calls that
// take the `BLOCK_MAPPING_START` argument lists (presumably `_rollIndent(`),
// the rest of the exception message, the code that emits the VALUE token and
// the closing braces.
void _fetchValue() {
var simpleKey = _simpleKeys.last;
if (simpleKey != null) {
// Add a [TokenType.KEY] directive before the first token of the simple
// key so the parser knows that it's part of a key/value pair.
_tokens.insert(simpleKey.tokenNumber - _tokensParsed,
new Token(TokenType.KEY, simpleKey.location.pointSpan()));
// In the block context, we may need to add the
// [TokenType.BLOCK_MAPPING_START] token.
simpleKey.column, TokenType.BLOCK_MAPPING_START, simpleKey.location,
tokenNumber: simpleKey.tokenNumber);
// Remove the simple key.
_simpleKeys[_simpleKeys.length - 1] = null;
// A simple key cannot follow another simple key.
_simpleKeyAllowed = false;
} else if (_inBlockContext) {
if (!_simpleKeyAllowed) {
throw new YamlException(
"Mapping values are not allowed here. Did you miss a colon "
// If we're here, we've found the ':' indicator following a complex key.
_scanner.column, TokenType.BLOCK_MAPPING_START, _scanner.location);
_simpleKeyAllowed = true;
} else if (_simpleKeyAllowed) {
// If we're here, we've found the ':' indicator with an empty key. This
// behavior differs from libyaml, which disallows empty implicit keys.
_simpleKeyAllowed = false;
/// Adds a token with [type] to [_tokens].
///
/// The span of the new token is the current character, which is consumed.
void _addCharToken(TokenType type) {
  var start = _scanner.state;
  // Consume the character so the span below covers it rather than being
  // empty.
  _scanner.readChar();
  _tokens.add(new Token(type, _scanner.spanFrom(start)));
}
/// Produces a [TokenType.ALIAS] or [TokenType.ANCHOR] token.
///
/// [anchor] is forwarded to [_scanAnchor], which creates an anchor token when
/// it is true and an alias token otherwise.
// NOTE(review): the closing brace is not visible in this excerpt.
void _fetchAnchor({bool anchor: true}) {
// A simple key cannot follow an anchor or alias.
_simpleKeyAllowed = false;
_tokens.add(_scanAnchor(anchor: anchor));
/// Produces a [TokenType.TAG] token.
// NOTE(review): only the flag update is visible in this excerpt — the code
// that scans the tag and queues the token is not shown.
void _fetchTag() {
// A simple key cannot follow a tag.
_simpleKeyAllowed = false;
/// Produces a [TokenType.SCALAR] token with style [ScalarStyle.LITERAL] or
/// [ScalarStyle.FOLDED].
///
/// [literal] is forwarded to [_scanBlockScalar], which selects the style.
// NOTE(review): the closing brace is not visible in this excerpt.
void _fetchBlockScalar({bool literal: false}) {
// A simple key may follow a block scalar.
_simpleKeyAllowed = true;
_tokens.add(_scanBlockScalar(literal: literal));
/// Produces a [TokenType.SCALAR] token with style [ScalarStyle.SINGLE_QUOTED]
/// or [ScalarStyle.DOUBLE_QUOTED].
///
/// [singleQuote] is forwarded to [_scanFlowScalar], which selects the style.
// NOTE(review): the closing brace is not visible in this excerpt.
void _fetchFlowScalar({bool singleQuote: false}) {
// A simple key cannot follow a flow scalar.
_simpleKeyAllowed = false;
_tokens.add(_scanFlowScalar(singleQuote: singleQuote));
/// Produces a [TokenType.SCALAR] token with style [ScalarStyle.PLAIN].
// NOTE(review): only the flag update is visible in this excerpt — the code
// that scans the scalar and queues the token is not shown.
void _fetchPlainScalar() {
// A simple key cannot follow a plain scalar.
_simpleKeyAllowed = false;
/// Eats whitespace and comments until the next token is found.
// NOTE(review): the statements that consume characters (whitespace, comment
// text, line breaks), the final `break` and the closing braces are not
// visible in this excerpt.
void _scanToNextToken() {
// Set once a line break has been seen.
var afterLineBreak = false;
while (true) {
// Allow the BOM to start a line.
if (_scanner.column == 0) _scanner.scan("\uFEFF");
// Eat whitespace.
// libyaml disallows tabs after "-", "?", or ":", but the spec allows
// them. See section 6.2 of the YAML 1.2 spec.
// Tabs are accepted here except in the block context after a line break.
while (_scanner.peekChar() == SP ||
((!_inBlockContext || !afterLineBreak) &&
_scanner.peekChar() == TAB)) {
if (_scanner.peekChar() == TAB) {
_scanner.error("Tab characters are not allowed as indentation.",
length: 1);
// Eat a comment until a line break.
// If we're at a line break, eat it.
if (_isBreak) {
// In the block context, a new line may start a simple key.
if (_inBlockContext) _simpleKeyAllowed = true;
afterLineBreak = true;
} else {
// Otherwise we've found a token.
/// Scans a version directive (`%YAML`) or tag directive (`%TAG`) token.
///
///     %YAML 1.2 # a comment \n
///     ^^^^^^^^^^^^^^^^^^^^^^^^
///     %TAG !yaml! tag:yaml.org,2002: \n
///     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
///
/// Returns `null` (after issuing a warning) for a directive with an unknown
/// name.
// NOTE(review): the statements that consume characters ('%', the rest of an
// unknown directive's line, trailing blanks and comments), the span argument
// of the final exception and the closing braces are not visible in this
// excerpt.
Token _scanDirective() {
var start = _scanner.state;
// Eat '%'.
var token;
var name = _scanDirectiveName();
if (name == "YAML") {
token = _scanVersionDirectiveValue(start);
} else if (name == "TAG") {
token = _scanTagDirectiveValue(start);
} else {
warn("Warning: unknown directive.", _scanner.spanFrom(start));
// libyaml doesn't support unknown directives, but the spec says to ignore
// them and warn.
while (!_isBreakOrEnd) {
return null;
// Eat the rest of the line, including any comments.
if (!_isBreakOrEnd) {
throw new YamlException("Expected comment or line break after directive.",
return token;
/// Scans a directive name.
///
///     %YAML 1.2 # a comment \n
///      ^^^^
///     %TAG !yaml! tag:yaml.org,2002: \n
///      ^^^
///
/// Returns the run of non-space characters at the current position. Throws a
/// [YamlException] if that run is empty or is not followed by whitespace or
/// the end of the source.
String _scanDirectiveName() {
  // libyaml only allows word characters in directive names, but the spec
  // disagrees, so every non-space character is accepted.
  var start = _scanner.position;
  while (_isNonSpace) {
    // Consume the character; without this the loop could never end.
    _scanner.readChar();
  }

  var name = _scanner.substring(start);
  if (name.isEmpty) {
    throw new YamlException("Expected directive name.", _scanner.emptySpan);
  } else if (!_isBlankOrEnd) {
    throw new YamlException(
        "Unexpected character in directive name.", _scanner.emptySpan);
  }

  return name;
}
/// Scans the value of a version directive.
///
///     %YAML 1.2 # a comment \n
///          ^^^^
///
/// [start] is the scanner state at the beginning of the directive; the
/// returned token's span runs from there to the current position.
// NOTE(review): no code that skips the blanks before the version or consumes
// the '.' between the major and minor numbers is visible in this excerpt.
// The closing brace is missing too.
Token _scanVersionDirectiveValue(LineScannerState start) {
var major = _scanVersionDirectiveNumber();
var minor = _scanVersionDirectiveNumber();
return new VersionDirectiveToken(_scanner.spanFrom(start), major, minor);
/// Scans one component (major or minor) of a version directive's number.
///
///     %YAML 1.2 # a comment \n
///           ^
///     %YAML 1.2 # a comment \n
///             ^
///
/// Returns the run of decimal digits at the current position as an integer.
/// Throws a [YamlException] if there are no digits.
int _scanVersionDirectiveNumber() {
  var start = _scanner.position;
  while (_isDigit) {
    // Consume the digit; without this the loop could never end.
    _scanner.readChar();
  }

  var number = _scanner.substring(start);
  if (number.isEmpty) {
    throw new YamlException("Expected version number.", _scanner.emptySpan);
  }

  return int.parse(number);
}
/// Scans the value of a tag directive.
///
///     %TAG !yaml! tag:yaml.org,2002: \n
///         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
///
/// [start] is the scanner state at the beginning of the directive; the
/// returned token's span runs from there to the current position. Throws a
/// [YamlException] if the handle or the prefix is not followed by whitespace
/// (or, for the prefix, the end of the source).
// NOTE(review): no code that skips the blanks before the handle or between
// the handle and the prefix is visible in this excerpt. Closing braces are
// missing too.
Token _scanTagDirectiveValue(LineScannerState start) {
var handle = _scanTagHandle(directive: true);
if (!_isBlank) {
throw new YamlException("Expected whitespace.", _scanner.emptySpan);
var prefix = _scanTagUri();
if (!_isBlankOrEnd) {
throw new YamlException("Expected whitespace.", _scanner.emptySpan);
return new TagDirectiveToken(_scanner.spanFrom(start), handle, prefix);
/// Scans a [TokenType.ANCHOR] or [TokenType.ALIAS] token.
///
/// Returns an [AnchorToken] if [anchor] is true and an [AliasToken]
/// otherwise. Throws a [YamlException] if the name is empty or is followed by
/// a character that may not terminate it.
Token _scanAnchor({bool anchor: true}) {
  var start = _scanner.state;

  // Eat the indicator character.
  _scanner.readChar();

  // libyaml only allows word characters in anchor names, but the spec
  // disagrees, so every anchor character is accepted.
  var startPosition = _scanner.position;
  while (_isAnchorChar) {
    // Consume the character; without this the loop could never end.
    _scanner.readChar();
  }
  var name = _scanner.substring(startPosition);

  // The name must be non-empty and followed by whitespace, the end of the
  // source, or one of the indicator characters listed below.
  var next = _scanner.peekChar();
  if (name.isEmpty ||
      (!_isBlankOrEnd &&
          next != QUESTION &&
          next != COLON &&
          next != COMMA &&
          next != RIGHT_SQUARE &&
          next != RIGHT_CURLY &&
          next != PERCENT &&
          next != AT &&
          next != GRAVE_ACCENT)) {
    throw new YamlException(
        "Expected alphanumeric character.", _scanner.emptySpan);
  }

  if (anchor) {
    return new AnchorToken(_scanner.spanFrom(start), name);
  } else {
    return new AliasToken(_scanner.spanFrom(start), name);
  }
}
/// Scans a [TokenType.TAG] token.
///
/// Handles the canonical `!<uri>` form as well as the `!suffix` and
/// `!handle!suffix` shorthand forms, and returns a [TagToken] carrying the
/// handle and the suffix.
// NOTE(review): the statements that consume "!<" and the closing '>' and the
// closing braces are not visible in this excerpt.
Token _scanTag() {
var handle;
var suffix;
var start = _scanner.state;
// Check if the tag is in the canonical form.
if (_scanner.peekChar(1) == LEFT_ANGLE) {
// Eat '!<'.
handle = '';
suffix = _scanTagUri();
} else {
// The tag has either the '!suffix' or the '!handle!suffix' form.
// First, try to scan a handle.
handle = _scanTagHandle();
if (handle.length > 1 && handle.startsWith('!') && handle.endsWith('!')) {
// A complete '!handle!' was scanned; the suffix follows it.
suffix = _scanTagUri(flowSeparators: false);
} else {
// What was scanned is really the start of the suffix.
suffix = _scanTagUri(head: handle, flowSeparators: false);
// There was no explicit handle.
if (suffix.isEmpty) {
// This is the special '!' tag.
handle = null;
suffix = '!';
} else {
handle = '!';
// libyaml insists on whitespace after a tag, but example 7.2 of the YAML
// spec indicates that it's not required.
return new TagToken(_scanner.spanFrom(start), handle, suffix);
/// Scans a tag handle.
///
/// [directive] indicates whether the handle is part of a `%TAG` directive, in
/// which case a handle other than `!` must end with '!'.
// NOTE(review): the statements that consume characters and append the scanned
// text to `buffer`, and the closing braces, are not visible in this excerpt.
String _scanTagHandle({bool directive: false}) {
// The result always starts with '!'.
var buffer = new StringBuffer('!');
// libyaml only allows word characters in tags, but the spec disagrees.
var start = _scanner.position;
while (_isTagChar) {
if (_scanner.peekChar() == EXCLAMATION) {
} else {
// It's either the '!' tag or not really a tag handle. If it's a %TAG
// directive, it's an error. If it's a tag token, it must be part of a
// URI.
if (directive && buffer.toString() != '!') _scanner.expect('!');
return buffer.toString();
/// Scans a tag URI.
///
/// [head] is the initial portion of the tag that's already been scanned,
/// including its leading '!'. Everything after that '!' is prepended to the
/// result.
///
/// [flowSeparators] indicates whether the tag URI can contain flow
/// separators.
///
/// Returns the percent-decoded URI text.
String _scanTagUri({String head, bool flowSeparators: true}) {
  var length = head == null ? 0 : head.length;
  var buffer = new StringBuffer();

  // Copy the head if needed.
  //
  // Note that we don't copy the leading '!' character.
  if (length > 1) buffer.write(head.substring(1));

  // The set of characters that may appear in URI is as follows:
  //
  //      '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
  //      '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
  //      '%'.
  //
  // In a shorthand tag annotation, the flow separators ',', '[', and ']' are
  // disallowed.
  var start = _scanner.position;
  var char = _scanner.peekChar();
  while (_isTagChar ||
      (flowSeparators &&
          (char == COMMA || char == LEFT_SQUARE || char == RIGHT_SQUARE))) {
    // Consume the character; without this the loop could never end.
    _scanner.readChar();
    char = _scanner.peekChar();
  }

  // Append what was just scanned to the head. Previously the buffer was
  // filled with the head and then discarded, losing the head.
  buffer.write(_scanner.substring(start));

  // libyaml manually decodes the URL, but we don't have to do that.
  return Uri.decodeFull(buffer.toString());
}
/// Scans a block scalar.
///
/// Returns a [ScalarToken] with style [ScalarStyle.LITERAL] if [literal] is
/// true and [ScalarStyle.FOLDED] otherwise. The header may carry a chomping
/// indicator ('+' or '-') and an indentation indicator (a non-zero digit), in
/// either order.
// NOTE(review): many lines are missing from this excerpt — the statements
// that consume characters (the '|' or '>' indicator, the chomping indicator,
// trailing blanks and comments, the content characters), the writes of
// content and breaks to `buffer`, the span arguments of two exceptions and
// all closing braces. The comments below describe the visible lines only.
Token _scanBlockScalar({bool literal: false}) {
var start = _scanner.state;
// Eat the indicator '|' or '>'.
// Check for a chomping indicator.
var chomping = _Chomping.CLIP;
var increment = 0;
var char = _scanner.peekChar();
if (char == PLUS || char == HYPHEN) {
chomping = char == PLUS ? _Chomping.KEEP : _Chomping.STRIP;
// Check for an indentation indicator.
if (_isDigit) {
// Check that the indentation is greater than 0.
if (_scanner.peekChar() == NUMBER_0) {
throw new YamlException(
"0 may not be used as an indentation indicator.",
// Convert the digit character to its numeric value.
increment = _scanner.readChar() - NUMBER_0;
} else if (_isDigit) {
// Do the same as above, but in the opposite order.
if (_scanner.peekChar() == NUMBER_0) {
throw new YamlException(
"0 may not be used as an indentation indicator.",
increment = _scanner.readChar() - NUMBER_0;
char = _scanner.peekChar();
if (char == PLUS || char == HYPHEN) {
chomping = char == PLUS ? _Chomping.KEEP : _Chomping.STRIP;
// Eat whitespace and comments to the end of the line.
// Check if we're at the end of the line.
if (!_isBreakOrEnd) {
throw new YamlException(
"Expected comment or line break.", _scanner.emptySpan);
// If the block scalar has an explicit indentation indicator, add that to
// the current indentation to get the indentation level for the scalar's
// contents.
var indent = 0;
if (increment != 0) {
indent = _indent >= 0 ? _indent + increment : increment;
// Scan the leading line breaks to determine the indentation level if
// needed.
var pair = _scanBlockScalarBreaks(indent);
indent = pair.first;
var trailingBreaks = pair.last;
// Scan the block scalar contents.
var buffer = new StringBuffer();
var leadingBreak = '';
var leadingBlank = false;
var trailingBlank = false;
var end = _scanner.state;
while (_scanner.column == indent && !_scanner.isDone) {
// Check for a document indicator. libyaml doesn't do this, but the spec
// mandates it. See example 9.5 of the YAML spec.
if (_isDocumentIndicator) break;
// We are at the beginning of a non-empty line.
// Is there trailing whitespace?
trailingBlank = _isBlank;
// Check if we need to fold the leading line break.
if (!literal &&
leadingBreak.isNotEmpty &&
!leadingBlank &&
!trailingBlank) {
// Do we need to join the lines with a space?
if (trailingBreaks.isEmpty) buffer.writeCharCode(SP);
} else {
leadingBreak = '';
// Append the remaining line breaks.
// Is there leading whitespace?
leadingBlank = _isBlank;
var startPosition = _scanner.position;
while (!_isBreakOrEnd) {
end = _scanner.state;
// libyaml always reads a line here, but this breaks on block scalars at
// the end of the document that end without newlines. See example 8.1 of
// the YAML spec.
if (!_scanner.isDone) leadingBreak = _readLine();
// Eat the following indentation and spaces.
var pair = _scanBlockScalarBreaks(indent);
indent = pair.first;
trailingBreaks = pair.last;
// Chomp the tail: STRIP drops the final break, CLIP keeps only it, and KEEP
// also keeps the trailing breaks.
if (chomping != _Chomping.STRIP) buffer.write(leadingBreak);
if (chomping == _Chomping.KEEP) buffer.write(trailingBreaks);
return new ScalarToken(_scanner.spanFrom(start, end), buffer.toString(),
literal ? ScalarStyle.LITERAL : ScalarStyle.FOLDED);
/// Scans indentation spaces and line breaks for a block scalar.
///
/// Determines the indentation level if needed ([indent] is 0 when it is not
/// yet known). Returns the new indentation level and the text of the line
/// breaks.
// NOTE(review): the statements that consume spaces, read line breaks into
// `breaks`, and the closing braces are not visible in this excerpt.
Pair<int, String> _scanBlockScalarBreaks(int indent) {
// The deepest column reached so far.
var maxIndent = 0;
var breaks = new StringBuffer();
while (true) {
// Skip indentation: any number of spaces while the indentation is unknown,
// otherwise at most up to the known level.
while ((indent == 0 || _scanner.column < indent) &&
_scanner.peekChar() == SP) {
if (_scanner.column > maxIndent) maxIndent = _scanner.column;
// libyaml throws an error here if a tab character is detected, but the
// spec treats tabs like any other non-space character. See example 8.2 of
// the YAML spec.
if (!_isBreak) break;
if (indent == 0) {
// Auto-detect the indentation, but never less than one level deeper than
// the enclosing block.
indent = maxIndent;
if (indent < _indent + 1) indent = _indent + 1;
// libyaml forces indent to be at least 1 here, but that doesn't seem to
// be supported by the spec.
return new Pair(indent, breaks.toString());
/// Scans a quoted scalar.
///
/// Returns a [ScalarToken] with style [ScalarStyle.SINGLE_QUOTED] if
/// [singleQuote] is true and [ScalarStyle.DOUBLE_QUOTED] otherwise. Escape
/// sequences are only processed in double-quoted scalars.
// NOTE(review): many lines are missing from this excerpt — the statements
// that consume characters (quotes, escapes, whitespace), the writes to
// `buffer`/`whitespace`/`trailingBreaks`, several `case` labels and `break`s
// in the escape switch, the span arguments of two exceptions and all closing
// braces. The comments below describe the visible lines only.
Token _scanFlowScalar({bool singleQuote: false}) {
var start = _scanner.state;
var buffer = new StringBuffer();
// Eat the left quote.
while (true) {
// Check that there are no document indicators at the beginning of the
// line.
if (_isDocumentIndicator) {
_scanner.error("Unexpected document indicator.");
if (_scanner.isDone) {
throw new YamlException("Unexpected end of file.", _scanner.emptySpan);
var leadingBlanks = false;
// Consume the non-blank characters.
while (!_isBlankOrEnd) {
var char = _scanner.peekChar();
if (singleQuote &&
char == SINGLE_QUOTE &&
_scanner.peekChar(1) == SINGLE_QUOTE) {
// An escaped single quote.
} else if (char == (singleQuote ? SINGLE_QUOTE : DOUBLE_QUOTE)) {
// The closing quote.
} else if (!singleQuote && char == BACKSLASH && _isBreakAt(1)) {
// An escaped newline.
leadingBlanks = true;
} else if (!singleQuote && char == BACKSLASH) {
var escapeStart = _scanner.state;
// An escape sequence.
// The number of hex digits a numeric escape expects; null for the others.
var codeLength = null;
switch (_scanner.peekChar(1)) {
case NUMBER_0:
case LETTER_A:
case LETTER_B:
case LETTER_T:
case TAB:
case LETTER_N:
case LETTER_V:
case LETTER_F:
case LETTER_R:
case LETTER_E:
case SP:
case SLASH:
// libyaml doesn't support an escaped forward slash, but it was
// added in YAML 1.2. See section 5.7 of the YAML spec.
case LETTER_X:
codeLength = 2;
case LETTER_U:
codeLength = 4;
codeLength = 8;
throw new YamlException(
"Unknown escape character.", _scanner.spanFrom(escapeStart));
if (codeLength != null) {
// Accumulate the hex digits into a code point, four bits at a time.
var value = 0;
for (var i = 0; i < codeLength; i++) {
if (!_isHex) {
throw new YamlException(
"Expected $codeLength-digit hexidecimal number.",
value = (value << 4) + _asHex(_scanner.readChar());
// Check the value and write the character. Surrogates and values beyond
// U+10FFFF are rejected.
if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) {
throw new YamlException("Invalid Unicode character escape code.",
} else {
// Check if we're at the end of a scalar.
if (_scanner.peekChar() == (singleQuote ? SINGLE_QUOTE : DOUBLE_QUOTE)) {
var whitespace = new StringBuffer();
var leadingBreak = '';
var trailingBreaks = new StringBuffer();
// Consume blanks and line breaks.
while (_isBlank || _isBreak) {
if (_isBlank) {
// Consume a space or a tab.
if (!leadingBlanks) {
} else {
} else {
// Check if it's a first line break.
if (!leadingBlanks) {
leadingBreak = _readLine();
leadingBlanks = true;
} else {
// Join the whitespace or fold line breaks.
if (leadingBlanks) {
if (leadingBreak.isNotEmpty && trailingBreaks.isEmpty) {
} else {
} else {
// Eat the right quote.
return new ScalarToken(_scanner.spanFrom(start), buffer.toString(),
singleQuote ? ScalarStyle.SINGLE_QUOTED : ScalarStyle.DOUBLE_QUOTED);
/// Scans a plain scalar.
///
/// Returns a [ScalarToken] with style [ScalarStyle.PLAIN] whose span ends
/// after the last plain character scanned. Sets [_simpleKeyAllowed] if the
/// scalar was followed by a line break.
// NOTE(review): many lines are missing from this excerpt — the statements
// that consume characters, the writes to `buffer` and `whitespace`, and all
// closing braces. The comments below describe the visible lines only.
Token _scanPlainScalar() {
var start = _scanner.state;
var end = _scanner.state;
var buffer = new StringBuffer();
var leadingBreak = '';
var trailingBreaks = '';
var whitespace = new StringBuffer();
// Continuation lines must be indented deeper than the enclosing block.
var indent = _indent + 1;
while (true) {
// Check for a document indicator.
if (_isDocumentIndicator) break;
// Check for a comment.
if (_scanner.peekChar() == HASH) break;
if (_isPlainChar) {
// Join the whitespace or fold line breaks.
if (leadingBreak.isNotEmpty) {
if (trailingBreaks.isEmpty) {
} else {
leadingBreak = '';
trailingBreaks = '';
} else {
// libyaml's notion of valid identifiers differs substantially from YAML
// 1.2's. We use [_isPlainChar] instead of libyaml's character here.
var startPosition = _scanner.position;
while (_isPlainChar) {
// Remember where the scalar's content ends, excluding trailing whitespace.
end = _scanner.state;
// Is it the end?
if (!_isBlank && !_isBreak) break;
while (_isBlank || _isBreak) {
if (_isBlank) {
// Check for a tab character messing up the indentation.
if (leadingBreak.isNotEmpty &&
_scanner.column < indent &&
_scanner.peekChar() == TAB) {
_scanner.error("Expected a space but found a tab.", length: 1);
if (leadingBreak.isEmpty) {
} else {
} else {
// Check if it's a first line break.
if (leadingBreak.isEmpty) {
leadingBreak = _readLine();
} else {
trailingBreaks = _readLine();
// Check the indentation level.
if (_inBlockContext && _scanner.column < indent) break;
// Allow a simple key after a plain scalar with leading blanks.
if (leadingBreak.isNotEmpty) _simpleKeyAllowed = true;
return new ScalarToken(
_scanner.spanFrom(start, end), buffer.toString(), ScalarStyle.PLAIN);
/// Moves past the current line break, if there is one.
///
/// A CR LF pair is consumed as a single break. Does nothing when the next
/// character is neither CR nor LF.
void _skipLine() {
  var char = _scanner.peekChar();
  if (char != CR && char != LF) return;

  // Consume the break character itself. Without this the scanner never
  // advances, and the peek below would still see the CR rather than the
  // character after it.
  _scanner.readChar();
  if (char == CR && _scanner.peekChar() == LF) _scanner.readChar();
}
/// Moves past the current line break and returns a newline.
///
/// CR LF, CR, and LF are all normalized to `"\n"`. Throws a [YamlException]
/// if the scanner is not positioned at a CR or LF.
String _readLine() {
  var char = _scanner.peekChar();

  // libyaml supports NEL, PS, and LS characters as line separators, but this
  // is explicitly forbidden in section 5.4 of the YAML spec.
  if (char != CR && char != LF) {
    throw new YamlException("Expected newline.", _scanner.emptySpan);
  }

  // Consume the break character before looking for the LF half of a CR LF
  // pair; otherwise the peek below would still see the CR.
  _scanner.readChar();

  // CR LF | CR | LF -> LF
  if (char == CR && _scanner.peekChar() == LF) _scanner.readChar();
  return "\n";
}
/// Whether the character that `_scanner.peekChar(offset)` reports is a space
/// or a tab.
bool _isBlankAt(int offset) {
  switch (_scanner.peekChar(offset)) {
    case SP:
    case TAB:
      return true;
    default:
      return false;
  }
}
/// Whether the character that `_scanner.peekChar(offset)` reports is a line
/// break, i.e. CR or LF.
///
/// Libyaml also treats NEL, LS, and PS as line breaks, but that is contrary
/// to the spec, so they are deliberately not matched here.
bool _isBreakAt(int offset) {
  switch (_scanner.peekChar(offset)) {
    case CR:
    case LF:
      return true;
    default:
      return false;
  }
}
/// Whether the character at [offset] is whitespace (space, tab, CR, or LF) or
/// lies past the end of the source.
bool _isBlankOrEndAt(int offset) {
  final next = _scanner.peekChar(offset);

  // A null peek means there is no character at that offset.
  if (next == null) return true;

  final isInlineBlank = next == SP || next == TAB;
  return isInlineBlank || next == CR || next == LF;
}
/// Whether the character at [offset] may appear inside a plain scalar.
///
/// Two characters depend on their neighbours:
///
/// * a colon counts only if the character after it is plain-safe;
/// * a hash counts only if the character before it is neither a space nor a
///   tab.
///
/// Every other character is judged by [_isPlainSafeAt] alone.
bool _isPlainCharAt(int offset) {
  final current = _scanner.peekChar(offset);

  if (current == COLON) return _isPlainSafeAt(offset + 1);

  if (current == HASH) {
    final before = _scanner.peekChar(offset - 1);
    return !(before == SP || before == TAB);
  }

  return _isPlainSafeAt(offset);
}
/// Returns whether the character at [offset] is a plain-safe character.
///
/// Flow indicators (`,`, `[`, `]`, `{`, `}`) are plain-safe only in a block
/// context. Space, tab, LF, CR, and the BOM are never plain-safe; NEL always
/// is. Any other character is plain-safe when it falls in one of the
/// printable ranges checked below; a position past the end of the source
/// (null) is not.
bool _isPlainSafeAt(int offset) {
  var char = _scanner.peekChar(offset);
  switch (char) {
    case COMMA:
    case LEFT_SQUARE:
    case RIGHT_SQUARE:
    case LEFT_CURLY:
    case RIGHT_CURLY:
      // These characters are delimiters in a flow context and thus are only
      // safe in a block context.
      return _inBlockContext;
    case SP:
    case TAB:
    case LF:
    case CR:
    case BOM:
      return false;
    case NEL:
      return true;
    default:
      // Printable ranges, excluding C0/C1 controls, surrogates, and the
      // 0xFFFE/0xFFFF code points.
      return char != null &&
          ((char >= 0x00020 && char <= 0x00007E) ||
              (char >= 0x000A0 && char <= 0x00D7FF) ||
              (char >= 0x0E000 && char <= 0x00FFFD) ||
              (char >= 0x10000 && char <= 0x10FFFF));
  }
}
/// Returns the hexadecimal value of [char].
///
/// No validation is performed here; the result is only meaningful when
/// [char] is the code unit of a hexadecimal digit.
int _asHex(int char) {
  final isDecimalDigit = char <= NUMBER_9;
  if (isDecimalDigit) return char - NUMBER_0;

  // Letters map to 10..15; pick the base letter matching the case.
  final letterBase = char <= LETTER_CAP_F ? LETTER_CAP_A : LETTER_A;
  return 10 + char - letterBase;
}
/// Moves the scanner past any blank characters.
///
/// Stops at the first position where [_isBlank] is false.
void _skipBlanks() {
  while (_isBlank) {
    // Consume the blank; without this the loop would never terminate.
    _scanner.readChar();
  }
}
/// Moves the scanner past a comment, if one starts at the current position.
///
/// A comment starts at a `#` and extends up to, but not including, the next
/// position where [_isBreakOrEnd] is true.
void _skipComment() {
  if (_scanner.peekChar() != HASH) return;
  while (!_isBreakOrEnd) {
    // Consume the comment character; without this the loop would never
    // terminate.
    _scanner.readChar();
  }
}
/// Bookkeeping for a position in the source that may turn out to start a
/// simple key.
class _SimpleKey {
  /// The index of the token that begins the simple key.
  ///
  /// Counted across every token emitted so far, not as an index into
  /// [_tokens].
  final int tokenNumber;

  /// Where in the source the simple key begins.
  ///
  /// Used when reporting errors and when deciding whether the key is still on
  /// the current line.
  final SourceLocation location;

  /// The line on which the key appears.
  ///
  /// Stored separately because reading it from [location] needs a binary
  /// search, whereas this field is O(1).
  final int line;

  /// The column on which the key appears.
  ///
  /// Stored separately because reading it from [location] needs a binary
  /// search, whereas this field is O(1).
  final int column;

  /// Whether this key must exist for the document to be scanned.
  final bool required;

  _SimpleKey(this.tokenNumber, this.line, this.column, this.location,
      {this.required});
}
/// An enum of chomping indicators that describe how to handle trailing
/// whitespace for a block scalar.
///
/// See the block chomping indicator section of the YAML 1.2 spec.
class _Chomping {
  /// All trailing whitespace is discarded.
  static const STRIP = const _Chomping("STRIP");

  /// A single trailing newline is retained.
  static const CLIP = const _Chomping("CLIP");

  /// All trailing whitespace is preserved.
  static const KEEP = const _Chomping("KEEP");

  /// The name of this indicator, as returned by [toString].
  final String name;

  const _Chomping(this.name);

  @override
  String toString() => name;
}