| // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| interface Scanner { |
| Token tokenize(); |
| } |
| |
| /** |
| * Common base class for a Dart scanner. |
| */ |
| class AbstractScanner<T> implements Scanner { |
| abstract int advance(); |
| abstract int nextByte(); |
| abstract int peek(); |
| abstract int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); |
| abstract void appendPrecenceToken(PrecedenceInfo info); |
| abstract void appendStringToken(PrecedenceInfo info, String value); |
| abstract void appendByteStringToken(PrecedenceInfo info, T value); |
| abstract void appendKeywordToken(Keyword keyword); |
| abstract void appendWhiteSpace(int next); |
| abstract void appendEofToken(); |
| abstract T asciiString(int start, int offset); |
| abstract T utf8String(int start, int offset); |
| abstract Token firstToken(); |
| abstract void beginToken(); |
| abstract void addToCharOffset(int offset); |
| abstract int get charOffset(); |
| abstract int get byteOffset(); |
| abstract void appendBeginGroup(PrecedenceInfo info, String value); |
| abstract int appendEndGroup(PrecedenceInfo info, String value, int openKind); |
| abstract void appendGt(PrecedenceInfo info, String value); |
| abstract void appendGtGt(PrecedenceInfo info, String value); |
| abstract void appendGtGtGt(PrecedenceInfo info, String value); |
| abstract void discardOpenLt(); |
| |
| // TODO(ahe): Move this class to implementation. |
| |
| Token tokenize() { |
| int next = advance(); |
| while (next !== $EOF) { |
| next = bigSwitch(next); |
| } |
| appendEofToken(); |
| return firstToken(); |
| } |
| |
| int bigSwitch(int next) { |
| beginToken(); |
| if (next === $TAB || next === $LF || next === $CR || next === $SPACE) { |
| appendWhiteSpace(next); |
| return advance(); |
| } |
| |
| if ($a <= next && next <= $z) { |
| return tokenizeKeywordOrIdentifier(next, true); |
| } |
| |
| if (($A <= next && next <= $Z) || next === $_ || next === $$) { |
| return tokenizeIdentifier(next, byteOffset, true); |
| } |
| |
| if (next === $LT) { |
| return tokenizeLessThan(next); |
| } |
| |
| if (next === $GT) { |
| return tokenizeGreaterThan(next); |
| } |
| |
| if (next === $EQ) { |
| return tokenizeEquals(next); |
| } |
| |
| if (next === $BANG) { |
| return tokenizeExclamation(next); |
| } |
| |
| if (next === $PLUS) { |
| return tokenizePlus(next); |
| } |
| |
| if (next === $MINUS) { |
| return tokenizeMinus(next); |
| } |
| |
| if (next === $STAR) { |
| return tokenizeMultiply(next); |
| } |
| |
| if (next === $PERCENT) { |
| return tokenizePercent(next); |
| } |
| |
| if (next === $AMPERSAND) { |
| return tokenizeAmpersand(next); |
| } |
| |
| if (next === $BAR) { |
| return tokenizeBar(next); |
| } |
| |
| if (next === $CARET) { |
| return tokenizeCaret(next); |
| } |
| |
| if (next === $OPEN_SQUARE_BRACKET) { |
| return tokenizeOpenSquareBracket(next); |
| } |
| |
| if (next === $TILDE) { |
| return tokenizeTilde(next); |
| } |
| |
| if (next === $BACKSLASH) { |
| appendPrecenceToken(BACKSLASH_INFO); |
| return advance(); |
| } |
| |
| if (next === $HASH) { |
| return tokenizeTag(next); |
| } |
| |
| if (next === $OPEN_PAREN) { |
| appendBeginGroup(OPEN_PAREN_INFO, "("); |
| return advance(); |
| } |
| |
| if (next === $CLOSE_PAREN) { |
| return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); |
| } |
| |
| if (next === $COMMA) { |
| appendPrecenceToken(COMMA_INFO); |
| return advance(); |
| } |
| |
| if (next === $COLON) { |
| appendPrecenceToken(COLON_INFO); |
| return advance(); |
| } |
| |
| if (next === $SEMICOLON) { |
| appendPrecenceToken(SEMICOLON_INFO); |
| discardOpenLt(); |
| return advance(); |
| } |
| |
| if (next === $QUESTION) { |
| appendPrecenceToken(QUESTION_INFO); |
| return advance(); |
| } |
| |
| if (next === $CLOSE_SQUARE_BRACKET) { |
| return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", |
| OPEN_SQUARE_BRACKET_TOKEN); |
| } |
| |
| if (next === $BACKPING) { |
| appendPrecenceToken(BACKPING_INFO); |
| return advance(); |
| } |
| |
| if (next === $OPEN_CURLY_BRACKET) { |
| appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); |
| return advance(); |
| } |
| |
| if (next === $CLOSE_CURLY_BRACKET) { |
| return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", |
| OPEN_CURLY_BRACKET_TOKEN); |
| } |
| |
| if (next === $SLASH) { |
| return tokenizeSlashOrComment(next); |
| } |
| |
| if (next === $AT) { |
| return tokenizeRawString(next); |
| } |
| |
| if (next === $DQ || next === $SQ) { |
| return tokenizeString(next, byteOffset, false); |
| } |
| |
| if (next === $PERIOD) { |
| return tokenizeDotOrNumber(next); |
| } |
| |
| if (next === $0) { |
| return tokenizeHexOrNumber(next); |
| } |
| |
| // TODO(ahe): Would a range check be faster? |
| if (next === $1 || next === $2 || next === $3 || next === $4 || next === $5 |
| || next === $6 || next === $7 || next === $8 || next === $9) { |
| return tokenizeNumber(next); |
| } |
| |
| if (next === $EOF) { |
| return $EOF; |
| } |
| if (next < 0x1f) { |
| throw new MalformedInputException("illegal character $next", charOffset); |
| } |
| |
| // The following are non-ASCII characters. |
| |
| if (next === $NBSP) { |
| appendWhiteSpace(next); |
| return advance(); |
| } |
| |
| return tokenizeIdentifier(next, byteOffset, true); |
| } |
| |
| int tokenizeTag(int next) { |
| // # or #!.*[\n\r] |
| if (byteOffset === 0) { |
| if (peek() === $BANG) { |
| do { |
| next = advance(); |
| } while (next !== $LF && next !== $CR && next !== $EOF); |
| return next; |
| } |
| } |
| appendPrecenceToken(HASH_INFO); |
| return advance(); |
| } |
| |
| int tokenizeTilde(int next) { |
| // ~ ~/ ~/= |
| next = advance(); |
| if (next === $SLASH) { |
| return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); |
| } else { |
| appendPrecenceToken(TILDE_INFO); |
| return next; |
| } |
| } |
| |
| int tokenizeOpenSquareBracket(int next) { |
| // [ [] []= |
| next = advance(); |
| if (next === $CLOSE_SQUARE_BRACKET) { |
| return select($EQ, INDEX_EQ_INFO, INDEX_INFO); |
| } else { |
| appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); |
| return next; |
| } |
| } |
| |
| int tokenizeCaret(int next) { |
| // ^ ^= |
| return select($EQ, CARET_EQ_INFO, CARET_INFO); |
| } |
| |
| int tokenizeBar(int next) { |
| // | || |= |
| next = advance(); |
| if (next === $BAR) { |
| appendPrecenceToken(BAR_BAR_INFO); |
| return advance(); |
| } else if (next === $EQ) { |
| appendPrecenceToken(BAR_EQ_INFO); |
| return advance(); |
| } else { |
| appendPrecenceToken(BAR_INFO); |
| return next; |
| } |
| } |
| |
| int tokenizeAmpersand(int next) { |
| // && &= & |
| next = advance(); |
| if (next === $AMPERSAND) { |
| appendPrecenceToken(AMPERSAND_AMPERSAND_INFO); |
| return advance(); |
| } else if (next === $EQ) { |
| appendPrecenceToken(AMPERSAND_EQ_INFO); |
| return advance(); |
| } else { |
| appendPrecenceToken(AMPERSAND_INFO); |
| return next; |
| } |
| } |
| |
| int tokenizePercent(int next) { |
| // % %= |
| return select($EQ, PERCENT_EQ_INFO, PERCENT_INFO); |
| } |
| |
| int tokenizeMultiply(int next) { |
| // * *= |
| return select($EQ, STAR_EQ_INFO, STAR_INFO); |
| } |
| |
| int tokenizeMinus(int next) { |
| // - -- -= |
| next = advance(); |
| if (next === $MINUS) { |
| appendPrecenceToken(MINUS_MINUS_INFO); |
| return advance(); |
| } else if (next === $EQ) { |
| appendPrecenceToken(MINUS_EQ_INFO); |
| return advance(); |
| } else { |
| appendPrecenceToken(MINUS_INFO); |
| return next; |
| } |
| } |
| |
| |
| int tokenizePlus(int next) { |
| // + ++ += |
| next = advance(); |
| if ($PLUS === next) { |
| appendPrecenceToken(PLUS_PLUS_INFO); |
| return advance(); |
| } else if ($EQ === next) { |
| appendPrecenceToken(PLUS_EQ_INFO); |
| return advance(); |
| } else { |
| appendPrecenceToken(PLUS_INFO); |
| return next; |
| } |
| } |
| |
| int tokenizeExclamation(int next) { |
| // ! != !== |
| next = advance(); |
| if (next === $EQ) { |
| return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); |
| } |
| appendPrecenceToken(BANG_INFO); |
| return next; |
| } |
| |
| int tokenizeEquals(int next) { |
| // = == === |
| next = advance(); |
| if (next === $EQ) { |
| return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); |
| } else if (next === $GT) { |
| appendPrecenceToken(FUNCTION_INFO); |
| return advance(); |
| } |
| appendPrecenceToken(EQ_INFO); |
| return next; |
| } |
| |
| int tokenizeGreaterThan(int next) { |
| // > >= >> >>= >>> >>>= |
| next = advance(); |
| if ($EQ === next) { |
| appendPrecenceToken(GT_EQ_INFO); |
| return advance(); |
| } else if ($GT === next) { |
| next = advance(); |
| if ($EQ === next) { |
| appendPrecenceToken(GT_GT_EQ_INFO); |
| return advance(); |
| } else if ($GT === next) { |
| next = advance(); |
| if (next === $EQ) { |
| appendPrecenceToken(GT_GT_GT_EQ_INFO); |
| return advance(); |
| } else { |
| appendGtGtGt(GT_GT_GT_INFO, ">>>"); |
| return next; |
| } |
| } else { |
| appendGtGt(GT_GT_INFO, ">>"); |
| return next; |
| } |
| } else { |
| appendGt(GT_INFO, ">"); |
| return next; |
| } |
| } |
| |
| int tokenizeLessThan(int next) { |
| // < <= << <<= |
| next = advance(); |
| if ($EQ === next) { |
| appendPrecenceToken(LT_EQ_INFO); |
| return advance(); |
| } else if ($LT === next) { |
| return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); |
| } else { |
| appendBeginGroup(LT_INFO, "<"); |
| return next; |
| } |
| } |
| |
| int tokenizeNumber(int next) { |
| int start = byteOffset; |
| while (true) { |
| next = advance(); |
| if ($0 <= next && next <= $9) { |
| continue; |
| } else if (next === $PERIOD) { |
| return tokenizeFractionPart(advance(), start); |
| } else if (next === $e || next === $E || next === $d || next === $D) { |
| return tokenizeFractionPart(next, start); |
| } else { |
| appendByteStringToken(INT_INFO, asciiString(start, 0)); |
| return next; |
| } |
| } |
| } |
| |
| int tokenizeHexOrNumber(int next) { |
| int x = peek(); |
| if (x === $x || x === $X) { |
| advance(); |
| return tokenizeHex(x); |
| } |
| return tokenizeNumber(next); |
| } |
| |
| int tokenizeHex(int next) { |
| int start = byteOffset - 1; |
| bool hasDigits = false; |
| while (true) { |
| next = advance(); |
| if (($0 <= next && next <= $9) |
| || ($A <= next && next <= $F) |
| || ($a <= next && next <= $f)) { |
| hasDigits = true; |
| } else { |
| if (!hasDigits) { |
| throw new MalformedInputException("hex digit expected", charOffset); |
| } |
| appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); |
| return next; |
| } |
| } |
| } |
| |
| int tokenizeDotOrNumber(int next) { |
| int start = byteOffset; |
| next = advance(); |
| if (($0 <= next && next <= $9)) { |
| return tokenizeFractionPart(next, start); |
| } else if ($PERIOD === next) { |
| return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
| } else { |
| appendPrecenceToken(PERIOD_INFO); |
| return next; |
| } |
| } |
| |
| int tokenizeFractionPart(int next, int start) { |
| bool done = false; |
| bool hasDigit = false; |
| LOOP: while (!done) { |
| if ($0 <= next && next <= $9) { |
| hasDigit = true; |
| } else if ($e === next || $E === next) { |
| hasDigit = true; |
| next = tokenizeExponent(advance()); |
| done = true; |
| continue LOOP; |
| } else { |
| done = true; |
| continue LOOP; |
| } |
| next = advance(); |
| } |
| if (!hasDigit) { |
| appendByteStringToken(INT_INFO, asciiString(start, -1)); |
| // TODO(ahe): Wrong offset for the period. |
| appendPrecenceToken(PERIOD_INFO); |
| return bigSwitch(next); |
| } |
| if (next === $d || next === $D) { |
| next = advance(); |
| } |
| appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); |
| return next; |
| } |
| |
| int tokenizeExponent(int next) { |
| if (next === $PLUS || next === $MINUS) { |
| next = advance(); |
| } |
| bool hasDigits = false; |
| while (true) { |
| if ($0 <= next && next <= $9) { |
| hasDigits = true; |
| } else { |
| if (!hasDigits) { |
| throw new MalformedInputException("digit expected", charOffset); |
| } |
| return next; |
| } |
| next = advance(); |
| } |
| } |
| |
| int tokenizeSlashOrComment(int next) { |
| next = advance(); |
| if ($STAR === next) { |
| return tokenizeMultiLineComment(next); |
| } else if ($SLASH === next) { |
| return tokenizeSingleLineComment(next); |
| } else if ($EQ === next) { |
| appendPrecenceToken(SLASH_EQ_INFO); |
| return advance(); |
| } else { |
| appendPrecenceToken(SLASH_INFO); |
| return next; |
| } |
| } |
| |
| int tokenizeSingleLineComment(int next) { |
| while (true) { |
| next = advance(); |
| if ($LF === next || $CR === next || $EOF === next) { |
| return next; |
| } |
| } |
| } |
| |
| int tokenizeMultiLineComment(int next) { |
| int nesting = 1; |
| next = advance(); |
| while (true) { |
| if ($EOF === next) { |
| // TODO(ahe): Report error. |
| return next; |
| } else if ($STAR === next) { |
| next = advance(); |
| if ($SLASH === next) { |
| --nesting; |
| if (0 === nesting) { |
| return advance(); |
| } else { |
| next = advance(); |
| } |
| } |
| } else if ($SLASH === next) { |
| next = advance(); |
| if ($STAR === next) { |
| next = advance(); |
| ++nesting; |
| } |
| } else { |
| next = advance(); |
| } |
| } |
| } |
| |
| int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { |
| KeywordState state = KeywordState.KEYWORD_STATE; |
| int start = byteOffset; |
| while (state !== null && $a <= next && next <= $z) { |
| state = state.next(next); |
| next = advance(); |
| } |
| if (state === null || state.keyword === null) { |
| return tokenizeIdentifier(next, start, allowDollar); |
| } |
| if (($A <= next && next <= $Z) || |
| ($0 <= next && next <= $9) || |
| next === $_ || |
| next === $$) { |
| return tokenizeIdentifier(next, start, allowDollar); |
| } else if (next < 128) { |
| appendKeywordToken(state.keyword); |
| return next; |
| } else { |
| return tokenizeIdentifier(next, start, allowDollar); |
| } |
| } |
| |
| int tokenizeIdentifier(int next, int start, bool allowDollar) { |
| bool isAscii = true; |
| while (true) { |
| if (($a <= next && next <= $z) || |
| ($A <= next && next <= $Z) || |
| ($0 <= next && next <= $9) || |
| next === $_ || |
| (next === $$ && allowDollar)) { |
| next = advance(); |
| } else if (next < 128) { |
| if (isAscii) { |
| appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0)); |
| } else { |
| appendByteStringToken(IDENTIFIER_INFO, utf8String(start, -1)); |
| } |
| return next; |
| } else { |
| int nonAsciiStart = byteOffset; |
| do { |
| next = nextByte(); |
| } while (next > 127); |
| String string = utf8String(nonAsciiStart, -1).slowToString(); |
| isAscii = false; |
| int byteLength = nonAsciiStart - byteOffset; |
| addToCharOffset(string.length - byteLength); |
| } |
| } |
| } |
| |
| int tokenizeRawString(int next) { |
| int start = byteOffset; |
| next = advance(); |
| if (next === $DQ || next === $SQ) { |
| return tokenizeString(next, start, true); |
| } else { |
| throw new MalformedInputException("expected ' or \"", charOffset); |
| } |
| } |
| |
| int tokenizeString(int next, int start, bool raw) { |
| int quoteChar = next; |
| next = advance(); |
| if (quoteChar === next) { |
| next = advance(); |
| if (quoteChar === next) { |
| // Multiline string. |
| return tokenizeMultiLineString(quoteChar, start, raw); |
| } else { |
| // Empty string. |
| appendByteStringToken(STRING_INFO, utf8String(start, -1)); |
| return next; |
| } |
| } |
| if (raw) { |
| return tokenizeSingleLineRawString(next, quoteChar, start); |
| } else { |
| return tokenizeSingleLineString(next, quoteChar, start); |
| } |
| } |
| |
| static bool isHexDigit(int character) { |
| if ($0 <= character && character <= $9) return true; |
| character |= 0x20; |
| return ($a <= character && character <= $f); |
| } |
| |
| int tokenizeSingleLineString(int next, int quoteChar, int start) { |
| while (next !== quoteChar) { |
| if (next === $BACKSLASH) { |
| next = advance(); |
| } else if (next === $$) { |
| next = tokenizeStringInterpolation(start); |
| start = byteOffset; |
| continue; |
| } |
| if (next <= $CR && (next === $LF || next === $CR || next === $EOF)) { |
| throw new MalformedInputException("unterminated string literal", |
| charOffset); |
| } |
| next = advance(); |
| } |
| appendByteStringToken(STRING_INFO, utf8String(start, 0)); |
| return advance(); |
| } |
| |
| int tokenizeStringInterpolation(int start) { |
| beginToken(); |
| int next = advance(); |
| if (next === $OPEN_CURLY_BRACKET) { |
| return tokenizeInterpolatedExpression(next, start); |
| } else { |
| return tokenizeInterpolatedIdentifier(next, start); |
| } |
| } |
| |
| int tokenizeInterpolatedExpression(int next, int start) { |
| appendByteStringToken(STRING_INFO, utf8String(start, -2)); |
| appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); |
| next = advance(); |
| while (next !== $EOF && next !== $STX) { |
| next = bigSwitch(next); |
| } |
| if (next === $EOF) return next; |
| return advance(); |
| } |
| |
| int tokenizeInterpolatedIdentifier(int next, int start) { |
| appendByteStringToken(STRING_INFO, utf8String(start, -2)); |
| appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); |
| next = tokenizeKeywordOrIdentifier(next, false); |
| appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", OPEN_CURLY_BRACKET_TOKEN); |
| return next; |
| } |
| |
| int tokenizeSingleLineRawString(int next, int quoteChar, int start) { |
| next = advance(); |
| while (next != $EOF) { |
| if (next === quoteChar) { |
| appendByteStringToken(STRING_INFO, utf8String(start, 0)); |
| return advance(); |
| } else if (next === $LF || next === $CR) { |
| throw new MalformedInputException("unterminated string literal", |
| charOffset); |
| } |
| next = advance(); |
| } |
| throw new MalformedInputException("unterminated string literal", |
| charOffset); |
| } |
| |
| int tokenizeMultiLineRawString(int quoteChar, int start) { |
| int next = advance(); |
| outer: while (next !== $EOF) { |
| while (next !== quoteChar) { |
| next = advance(); |
| if (next === $EOF) break outer; |
| } |
| next = advance(); |
| if (next === quoteChar) { |
| next = advance(); |
| if (next === quoteChar) { |
| appendByteStringToken(STRING_INFO, utf8String(start, 0)); |
| return advance(); |
| } |
| } |
| } |
| throw new MalformedInputException("unterminated string literal", |
| charOffset); |
| } |
| |
| int tokenizeMultiLineString(int quoteChar, int start, bool raw) { |
| if (raw) return tokenizeMultiLineRawString(quoteChar, start); |
| int next = advance(); |
| while (next !== $EOF) { |
| if (next === $$) { |
| next = tokenizeStringInterpolation(start); |
| start = byteOffset; |
| continue; |
| } |
| if (next === quoteChar) { |
| next = advance(); |
| if (next === quoteChar) { |
| next = advance(); |
| if (next === quoteChar) { |
| appendByteStringToken(STRING_INFO, utf8String(start, 0)); |
| return advance(); |
| } |
| } |
| continue; |
| } |
| if (next === $BACKSLASH) { |
| next = advance(); |
| if (next === $EOF) break; |
| } |
| next = advance(); |
| } |
| throw new MalformedInputException("unterminated string literal", |
| charOffset); |
| } |
| } |
| |
| class MalformedInputException { |
| final String message; |
| final position; |
| MalformedInputException(this.message, this.position); |
| toString() => message; |
| } |