blob: 8666550226fc05515a60986e10e4a6af0d112bcc [file] [log] [blame]
// Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library PegParser;
/*
* The following functions are combinators for building Rules.
*
* A rule is one of the following
* - A String which matches the string literally.
* - A Symbol which matches the symbol's definition.
* - A list of rules with an optional reducing function, which matches a sequence.
* - The result of calling one of the combinators.
*
* Some rules are 'value-generating' rules, they return an 'abstract syntax
* tree' with the match. If a rule is not value-generating [:null:] is the
* value.
*
* A Symbol is always a value-generating rule. If the value is not required, use
* [:SKIP(aSymbol):] in place of [:aSymbol:].
*
* A String is not a value-generating rule but can be converted into one by
* using [:TEXT('string'):] in place of [:'string':].
*
* A list or sequence is value-generating depending on the subrules. The
* sequence is value-generating if any of the subrules are value-generating or
* if there is a reducing function. If no reducing function is given, the value
* returned depends on the number of value-generating subrules. If there is
* only one value generating subrule, that provideds the value for the sequence.
* If there are more, then the value is a list of the values of the
* value-generating subrules.
*/
/**
* Matches one character by a predicate on the character code.
* If [spec] is an int, that character is matched.
* If [spec] is a function it is used
*
* Example [: CHARCODE((code) => 48 <= code && code <= 57) :] recognizes an
* ASCII digit.
*
* CHARCODE does not generate a value.
*/
_Rule CHARCODE(spec, [name]) {
if (spec is int)
return new _CharCodeRule((code) => code == spec, name);
else
return new _CharCodeRule(spec, name);
}
/**
* Matches one of the [characters].
*
* CHAR does not generate a value.
*/
_Rule CHAR([characters]) {
if (characters == null)
return const _AnyCharRule();
if (characters is int)
return CHARCODE(characters);
// Find the range of character codes and construct an array of flags for codes
// within the range.
List<int> codes = characters.codeUnits.toList();
codes.sort((a, b) => a < b ? -1 : a > b ? 1 : 0);
int lo = codes[0];
int hi = codes[codes.length - 1];
if (lo == hi)
return CHARCODE(lo);
int len = hi - lo + 1;
var flags = new List<bool>(len);
for (int i = 0; i < len; ++i)
flags[i] = false;
for (int code in codes)
flags[code - lo] = true;
return CHARCODE((code) => code >= lo && code <= hi && flags[code - lo]);
}
/**
* Matches the end of the input.
*
* END does not generate a value.
*/
_Rule get END => new _EndOfInputRule();
/**
* Throws an exception.
*/
_Rule ERROR(String message) => new _ErrorRule(message);
/**
* Matches [rule] but does not consume the input. Useful for matching a right
* context.
*
* AT does not generate a value.
*/
_Rule AT(rule) => new _ContextRule(_compile(rule));
/**
* Matches when [rule] does not match. No input is consumed.
*
* NOT does not generate a value.
*/
_Rule NOT(rule) => new _NegativeContextRule(_compile(rule));
/**
* Matches [rule] but generates no value even if [rule] generates a value.
*
* SKIP never generates a value.
*/
_Rule SKIP(rule) => new _SkipRule(_compile(rule));
/**
* Matches [rule] in a lexical context where whitespace is not automatically
* skipped. Useful for matching what would normally be considered to be tokens.
* [name] is a user-friendly description of what is being matched and is used in
* error messages.
*
* LEX(rule)
* LEX(name, rule)
*
* LEX does not generate a value. If a value is required, wrap LEX with TEXT.
*/
_Rule LEX(arg1, [arg2]) {
if (arg2 == null)
return new _LexicalRule(arg1 is String ? arg1 : null, _compile(arg1));
else
return new _LexicalRule(arg1, _compile(arg2));
}
/**
* Matches [rule] and generates a value from the matched text. If the [rule]
* matches, then TEXT(rule) matches and has a value derived from the string
* fragment that was matched. The default derived value is the string fragment.
*
* TEXT always generates a value.
*/
_Rule TEXT(rule, [extractor]) =>
new _TextValueRule(_compile(rule),
extractor == null
? (string, start, end) => string.substring(start, end)
: extractor);
/**
* Matches an optional rule.
*
* MAYBE is a value generating matcher.
*
* If [rule] is value generating then the value is the value generated by [rule]
* if it matches, and [:null:] if it does not.
*
* If [rule] is not value generatinge then the value is [:true:] if [rule]
* matches and [:false:] if it does not.
*/
_Rule MAYBE(rule) => new _OptionalRule(_compile(rule));
/**
* MANY(rule) matches [rule] [min] or more times.
* [min] must be 0 or 1.
* If [separator] is provided it is used to match a separator between matches of
* [rule].
*
* MANY is a value generating matcher. The value is a list of the matches of
* [rule]. The list may be empty if [:min == 0:].
*/
_Rule MANY(rule, {separator: null, int min: 1}) {
assert(0 <= min && min <= 1);
return new _RepeatRule(_compile(rule), _compileOptional(separator), min);
}
/**
* Matches [rule] zero or more times. Shorthand for [:MANY(rule, min:0):]
* TODO: retire min: parameter?
*
* MANY0 is a value generating matcher.
*/
_Rule MANY0(rule, [separator = null]) {
return new _RepeatRule(_compile(rule), _compileOptional(separator), 0);
}
/**
* Matches [rules] in order until one succeeds.
*
* OR is value-generating.
*/
_Rule OR([a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z]) =>
_compileMultiRule(
(a is List && b == null) // Backward compat. OR([a, b]) => OR(a, b).
? a
: _unspread(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z),
false,
(compiledRules, valueCount, reducer) =>
new _ChoiceRule(compiledRules));
_Rule SEQ([a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z]) =>
_compile(_unspread(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z));
/**
* Matches [rule]
*/
_Rule MEMO(rule) => new _MemoRule(_compile(rule));
_Rule TAG(tag, rule) => _compile([rule, (ast) => [tag, ast]]);
class ParseError implements Exception {
const ParseError(String this._message);
String toString() => _message;
final String _message;
}
/**
* A grammar is a collection of symbols and rules that may be used to parse an
* input.
*/
class Grammar {
Map<String, Symbol> _symbols;
/** This rule may be set by the user to define whitespace. */
_Rule _whitespace;
_Rule get whitespace => _whitespace;
void set whitespace(rule) { _whitespace = _compile(rule); }
Grammar() {
_symbols = new Map<String, Symbol>();
whitespace = CHAR(' \t\r\n');
}
/**
* operator [] is used to find or create symbols. Symbols may appear in rules
* to define recursive rules.
*/
Symbol operator [](String name) {
if (_symbols.containsKey(name))
return _symbols[name];
Symbol s = new Symbol(name, this);
_symbols[name] = s;
return s;
}
/**
* Parses the input string and returns the parsed AST, or throws an exception
* if the input can't be parsed.
*/
parse(root, String text) {
for (var symbol in _symbols.values)
if (symbol._rule == null)
print('${symbol.name} is undefined');
var state = new _ParserState(text, whitespace: whitespace);
var match = _compile(root).match(state, 0);
if (match == null)
return diagnose(state);
var pos = match[0];
pos = _skip_whitespace(state, pos);
if (pos == state._end)
return match[1];
// TODO: Make this complain about expecting end of file.
return diagnose(state);
}
diagnose(state) {
var message = 'unexpected error';
if (!state.max_rule.isEmpty) {
var s = new Set();
for (var rule in state.max_rule)
s.add(rule.description());
var tokens = new List<String>.from(s);
tokens.sort((a, b) =>
a.startsWith("'") == b.startsWith("'")
? a.compareTo(b)
: a.startsWith("'") ? 1 : -1);
var expected = tokens.join(' or ');
var found = state.max_pos == state._end ? 'end of file'
: "'${state._text[state.max_pos]}'";
message = 'Expected $expected but found $found';
}
int start = state.max_pos;
int end = start;
while (start >= 1 && state._text[start - 1] != '\n') --start;
while (end < state._text.length && state._text[end] != '\n') ++end;
var line = state._text.substring(start, end);
var indicator = '';
for (var i = 0; i < line.length && start + i < state.max_pos; i++)
indicator = ' $indicator';
indicator = '$indicator^';
// TODO: Convert to an exception.
print(message);
print(line);
print(indicator);
return null;
}
}
class Symbol {
final String name;
final Grammar grammar;
_Rule _rule;
Symbol(this.name, this.grammar);
void set def(rule) {
assert(_rule == null); // Assign once.
_rule = _compile(rule);
}
toString() => _rule == null ? '<$name>' : '<$name = $_rule>';
}
class _ParserState {
_ParserState(this._text, {_Rule whitespace}) {
_end = this._text.length;
whitespaceRule = whitespace;
max_rule = [];
}
String _text;
int _end;
//
bool inWhitespaceMode = false;
_Rule whitespaceRule = null;
// Used for constructing an error message.
int inhibitExpectedTrackingDepth = 0;
int max_pos = 0;
var max_rule;
}
/**
* An interface tag for rules. If this tag is on a rule, then the description()
* of the rule is something sensible to put in a message.
*/
abstract class _Expectable {
String description();
}
class _Rule {
const _Rule();
// Returns null for a match failure or [pos, ast] for success.
match(_ParserState state, int pos) {
if (! state.inWhitespaceMode) {
pos = _skip_whitespace(state, pos);
}
return matchAfterWS(state, pos);
}
// Faster entry point for matching a sub-rule that is matched to the start
// position of the super-rule. Whitespace has already been skipped so no need
// to try to skip it again.
matchAfterWS(_ParserState state, int pos) {
if (state.inhibitExpectedTrackingDepth == 0) {
// Track position for possible error messaging
if (pos > state.max_pos) {
// Store position and the rule.
state.max_pos = pos;
if (this is _Expectable) {
state.max_rule = [this];
} else {
state.max_rule = [];
}
} else if (pos == state.max_pos) {
if (this is _Expectable) {
state.max_rule.add(this);
}
}
}
// Delegate the matching logic to the the specialized function.
return _match(state, pos);
}
// Overridden in subclasses to match the rule.
_match(_ParserState state, int pos) => null;
// Does the rule generate a value (AST) with the match?
bool get generatesValue => false;
get defaultValue => null;
}
int _skip_whitespace(state, pos) {
// Returns the next non-whitespace position.
// This is done by matching the optional whitespaceRule with the current text.
if (state.whitespaceRule == null)
return pos;
state.inWhitespaceMode = true;
state.inhibitExpectedTrackingDepth++;
while (true) {
var match = state.whitespaceRule.match(state, pos);
if (match == null)
break;
pos = match[0];
}
state.inWhitespaceMode = false;
state.inhibitExpectedTrackingDepth--;
return pos;
}
_Rule _compileOptional(rule) {
return rule == null ? null : _compile(rule);
}
_Rule _compile(rule) {
if (rule is _Rule)
return rule;
if (rule is String)
return new _StringRule(rule);
if (rule is Symbol)
return new _SymbolRule(rule);
if (rule is RegExp)
return new _RegExpRule(rule);
if (rule is List) {
return _compileMultiRule(
rule, true,
(compiledRules, valueCount, reducer) =>
new _SequenceRule(compiledRules, valueCount, reducer));
}
throw new Exception('Cannot compile rule: $rule');
}
class _EndOfInputRule extends _Rule {
_match(_ParserState state, int pos) {
if (pos == state._end)
return [pos, null];
return null;
}
toString() => 'END';
}
class _ErrorRule extends _Rule {
String message;
_ErrorRule(String this.message);
_match(_ParserState state, int pos) {
throw new ParseError(message);
}
toString() => 'ERROR($message)';
}
class _CharCodeRule extends _Rule {
Function _predicate;
var _name;
_CharCodeRule(this._predicate, this._name);
_match(_ParserState state, int pos) {
if (pos == state._end)
return null;
int code = state._text.codeUnitAt(pos);
if (_predicate(code))
return [pos + 1, null];
return null;
}
toString() => _name == null ? 'CHARCODE($_predicate)' : 'CHARCODE($_name)';
}
class _AnyCharRule extends _Rule {
const _AnyCharRule();
_match(_ParserState state, int pos) {
if (pos == state._end)
return null;
return [pos + 1, null];
}
toString() => 'CHAR()';
}
class _SymbolRule extends _Rule {
final Symbol _symbol;
_SymbolRule(Symbol this._symbol);
_match(_ParserState state, int pos) {
if (_symbol._rule == null)
throw new Exception("Symbol '${_symbol.name}' is undefined");
return _symbol._rule.match(state, pos);
}
bool get generatesValue => true;
toString() => '<${_symbol.name}>';
}
class _SkipRule extends _Rule {
// A rule that has no value.
_Rule _rule;
_SkipRule(_Rule this._rule);
_match(_ParserState state, int pos) {
var match = _rule.matchAfterWS(state, pos);
if (match == null)
return null;
return [match[0], null];
}
toString() => 'TOKEN($_rule)';
}
class _StringRule extends _Rule implements _Expectable {
final String _string;
int _len;
_StringRule(this._string) {
_len = _string.length;
}
_match(_ParserState state, int pos) {
if (pos + _len > state._end)
return null;
for (int i = 0; i < _len; i++) {
if (state._text.codeUnitAt(pos + i) != _string.codeUnitAt(i))
return null;
}
return [pos + _len, null];
}
//get defaultValue => _string;
toString() => '"$_string"';
description() => "'$_string'";
}
class _RegExpRule extends _Rule {
RegExp _re;
_RegExpRule(this._re) {
// There is no convenient way to match an anchored substring.
throw new Exception('RegExp matching not supported');
}
toString() => '"$_re"';
}
class _LexicalRule extends _Rule implements _Expectable {
final String _name;
final _Rule _rule;
_LexicalRule(String this._name, _Rule this._rule);
_match(_ParserState state, int pos) {
state.inWhitespaceMode = true;
state.inhibitExpectedTrackingDepth++;
var match = _rule.matchAfterWS(state, pos);
state.inhibitExpectedTrackingDepth--;
state.inWhitespaceMode = false;
return match;
}
toString() => _name;
description() => _name == null ? '?' : _name;
}
class _TextValueRule extends _Rule {
final _Rule _rule;
final _extract; // Function
_TextValueRule(_Rule this._rule, Function this._extract);
_match(_ParserState state, int pos) {
var match = _rule.matchAfterWS(state, pos);
if (match == null) {
return null;
}
var endPos = match[0];
return [endPos, _extract(state._text, pos, endPos)];
}
bool get generatesValue => true;
toString() => 'TEXT($_rule)';
}
_Rule _compileMultiRule(List rules,
bool allowReducer,
finish(compiledRules, valueCount, reducer)) {
int valueCount = 0;
List compiledRules = new List<_Rule>();
Function reducer;
for (var rule in rules) {
if (reducer != null)
throw new Exception('Reducer must be last in sequence: $rule');
if (rule is Function) {
if (allowReducer)
reducer = rule;
else
throw new Exception('Bad rule: "$rule"');
} else {
_Rule compiledRule = _compile(rule);
if (compiledRule.generatesValue)
++valueCount;
compiledRules.add(compiledRule);
}
}
return finish(compiledRules, valueCount, reducer);
}
String _formatMultiRule(String functor, List rules) {
var sb = new StringBuffer(functor);
sb.write('(');
var separator = '';
for (var rule in rules) {
sb.write(separator);
sb.write(rule);
separator = ',';
}
sb.write(')');
return sb.toString();
}
class _SequenceRule extends _Rule {
// This rule matches the component rules in order.
final List<_Rule> _rules;
final int _generatingSubRules;
final Function _reducer;
bool _generatesValue;
_SequenceRule(List<_Rule> this._rules,
int this._generatingSubRules,
Function this._reducer) {
_generatesValue = _generatingSubRules > 0 || _reducer != null;
}
_match(state, pos) {
var sequence = [];
for (var rule in _rules) {
var match = rule.match(state, pos);
if (match == null)
return null;
if (rule.generatesValue) {
var ast = match[1];
sequence.add(ast);
}
pos = match[0];
}
if (_reducer == null) {
if (_generatingSubRules == 0)
return [pos, null];
if (_generatingSubRules == 1)
return [pos, sequence[0]];
return [pos, sequence];
} else {
return [pos, _apply(_reducer, sequence)];
}
}
bool get generatesValue => _generatesValue;
toString() => _formatMultiRule('SEQ', _rules);
}
class _ChoiceRule extends _Rule {
// This rule matches the first component rule that matches.
List<_Rule> _rules;
_ChoiceRule(List<_Rule> this._rules);
_match(state, pos) {
for (var rule in _rules) {
var match = rule.match(state, pos);
if (match != null) {
/*
if (!rule.generatesValue) {
var value = rule.defaultValue;
if (value != null)
return [match[0], value];
}
*/
return match;
}
}
return null;
}
bool get generatesValue => true;
toString() => _formatMultiRule('OR', _rules);
}
class _OptionalRule extends _Rule {
_Rule _rule;
_OptionalRule(_Rule this._rule);
_match(_ParserState state, int pos) {
var match = _rule.match(state, pos);
if (_rule.generatesValue)
return match == null
? [pos, null]
: match;
return match == null
? [pos, false]
: [match[0], true];
}
bool get generatesValue => true;
toString() => 'MAYBE($_rule)';
}
class _ContextRule extends _Rule {
_Rule _rule;
_ContextRule(_Rule this._rule);
_match(_ParserState state, int pos) {
// TODO: protect error state.
var match = _rule._match(state, pos);
if (match == null)
return null;
return [pos, null];
}
toString() => 'AT($_rule)';
}
class _NegativeContextRule extends _Rule {
_Rule _rule;
_NegativeContextRule(_Rule this._rule);
_match(_ParserState state, int pos) {
// TODO: protect error state.
var match = _rule._match(state, pos);
if (match == null)
return [pos, null];
return null;
}
toString() => 'NOT($_rule)';
}
class _RepeatRule extends _Rule {
// Matches zero, one or more items.
_Rule _rule;
_Rule _separator;
int _min;
_RepeatRule(this._rule, this._separator, this._min);
_match(state, pos) {
// First match.
var match = _rule.match(state, pos);
if (match == null)
if (_min == 0)
return [pos, []];
else
return null;
pos = match[0];
var result = [match[1]];
// Subsequent matches:
while (true) {
var newPos = pos;
if (_separator != null) {
match = _separator.match(state, pos);
if (match == null)
return [pos, result];
newPos = match[0];
}
match = _rule.match(state, newPos);
if (match == null)
return [pos, result];
pos = match[0];
result.add(match[1]);
}
}
bool get generatesValue => true;
toString() => 'MANY(min:$_min, $_rule${_separator==null?'':", sep: $_separator"})';
}
class _MemoRule extends _Rule {
final _Rule _rule;
var parseInstance;
// A map from position to result. Can this be replaced with something
// smaller?
// TODO: figure out how to discard the map and parseInstance after parsing.
Map<int,Object> map;
_MemoRule(this._rule);
_match(state, pos) {
// See if we are still parsing the same input. Relies on the fact that the
// input is a string and strings are immutable.
if (!identical(parseInstance, state._text)) {
map = new Map<int,Object>();
parseInstance = state._text;
}
// TODO: does this have to check or preserve parse state (like
// inWhitespaceMode, error position info etc?)
// Stored result can be null (memoized failure).
if (map.containsKey(pos)) {
return map[pos];
}
var match = _rule.match(state, pos);
map[pos] = match;
return match;
}
bool get generatesValue => _rule.generatesValue;
toString() => 'MEMO($_rule)';
}
_apply(fn, List args) {
switch (args.length) {
case 0: return fn();
case 1: return fn(args[0]);
case 2: return fn(args[0], args[1]);
case 3: return fn(args[0], args[1], args[2]);
case 4: return fn(args[0], args[1], args[2], args[3]);
case 5: return fn(args[0], args[1], args[2], args[3], args[4]);
case 6: return fn(args[0], args[1], args[2], args[3], args[4],
args[5]);
case 7: return fn(args[0], args[1], args[2], args[3], args[4],
args[5], args[6]);
case 8: return fn(args[0], args[1], args[2], args[3], args[4],
args[5], args[6], args[7]);
case 9: return fn(args[0], args[1], args[2], args[3], args[4],
args[5], args[6], args[7], args[8]);
case 10: return fn(args[0], args[1], args[2], args[3], args[4],
args[5], args[6], args[7], args[8], args[9]);
default:
throw new Exception('Too many arguments in _apply: $args');
}
}
List _unspread(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z) {
List list = new List();
add(element) { if (element != null) list.add(element); }
add(a);
add(b);
add(c);
add(d);
add(e);
add(f);
add(g);
add(h);
add(i);
add(j);
add(k);
add(l);
add(m);
add(n);
add(o);
add(p);
add(q);
add(r);
add(s);
add(t);
add(u);
add(v);
add(w);
add(x);
add(y);
add(z);
return list;
}