blob: 08dcc8c45f0932e0f9362ff2f1b065b1179b928c [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "sky/engine/core/css/parser/MediaQueryTokenizer.h"
namespace blink {
#include "gen/sky/core/MediaQueryTokenizerCodepoints.cpp"
}
#include "sky/engine/core/css/parser/MediaQueryInputStream.h"
#include "sky/engine/core/html/parser/HTMLParserIdioms.h"
#include "sky/engine/wtf/text/StringBuilder.h"
#include "sky/engine/wtf/unicode/CharacterNames.h"
namespace blink {
// http://dev.w3.org/csswg/css-syntax/#name-start-code-point
// Returns true for an ASCII letter, an underscore, or any non-ASCII
// character.
static bool isNameStart(UChar c)
{
    return isASCIIAlpha(c) || c == '_' || !isASCII(c);
}
// http://dev.w3.org/csswg/css-syntax/#name-code-point
// Returns true for anything isNameStart() accepts, plus ASCII digits and '-'.
static bool isNameChar(UChar c)
{
    if (c == '-' || isASCIIDigit(c))
        return true;
    return isNameStart(c);
}
// http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
// Returns true when |first| is a backslash and |second| is neither a newline
// nor the end-of-file marker.
//
// This tokenizer does not run the spec's input preprocessing step, so '\r'
// and '\f' can still appear in the stream. They are therefore rejected here
// in the same way as '\n'.
static bool twoCharsAreValidEscape(UChar first, UChar second)
{
    if (first != '\\')
        return false;
    if (second == '\n' || second == '\r' || second == '\f')
        return false;
    return second != kEndOfFileMarker;
}
// Builds a tokenizer whose m_input member is initialized from |inputStream|.
MediaQueryTokenizer::MediaQueryTokenizer(MediaQueryInputStream& inputStream)
    : m_input(inputStream)
{
}
// Hands |c| back to the input stream through pushBack().
void MediaQueryTokenizer::reconsume(UChar c)
{
    m_input.pushBack(c);
}
// Reads the next input character, advances the stream, and returns the
// character that was read.
UChar MediaQueryTokenizer::consume()
{
    const UChar next = m_input.nextInputChar();
    m_input.advance();
    return next;
}
// Advances the input stream by |offset|; nothing is returned.
void MediaQueryTokenizer::consume(unsigned offset)
{
    m_input.advance(offset);
}
// Skips the whitespace that follows and returns one WhitespaceToken for the
// whole run. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::whiteSpace(UChar cc)
{
    // The exact whitespace is discarded here, which makes tokenization lossy;
    // it could be recorded instead if that is ever needed.
    consumeUntilNonWhitespace();
    return MediaQueryToken(WhitespaceToken);
}
// Removes the last entry of |blockStack| when it equals |type|.
// Returns true if an entry was removed; returns false, leaving the stack
// untouched, when the stack is empty or its last entry differs from |type|.
static bool popIfBlockMatches(Vector<MediaQueryTokenType>& blockStack, MediaQueryTokenType type)
{
    if (blockStack.isEmpty() || blockStack.last() != type)
        return false;
    blockStack.removeLast();
    return true;
}
// Records |type| on the block stack and returns a token of that type flagged
// as BlockStart.
MediaQueryToken MediaQueryTokenizer::blockStart(MediaQueryTokenType type)
{
    m_blockStack.append(type);
    return MediaQueryToken(type, MediaQueryToken::BlockStart);
}
// Variant for named tokens: |blockType| is what gets recorded on the block
// stack, while the returned BlockStart token carries |type| and |name|.
MediaQueryToken MediaQueryTokenizer::blockStart(MediaQueryTokenType blockType, MediaQueryTokenType type, String name)
{
    m_blockStack.append(blockType);
    return MediaQueryToken(type, name, MediaQueryToken::BlockStart);
}
// Returns a token of |type|. If the last entry of the block stack equals
// |startType|, that entry is popped and the token is flagged as BlockEnd;
// otherwise the stack is left alone and the token carries no block flag.
MediaQueryToken MediaQueryTokenizer::blockEnd(MediaQueryTokenType type, MediaQueryTokenType startType)
{
    if (!popIfBlockMatches(m_blockStack, startType))
        return MediaQueryToken(type);
    return MediaQueryToken(type, MediaQueryToken::BlockEnd);
}
// Opens a parenthesis block. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::leftParenthesis(UChar cc)
{
    return blockStart(LeftParenthesisToken);
}
// Closes a block that was opened with LeftParenthesisToken, if one is on top
// of the block stack. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::rightParenthesis(UChar cc)
{
    return blockEnd(RightParenthesisToken, LeftParenthesisToken);
}
// Opens a bracket block. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::leftBracket(UChar cc)
{
    return blockStart(LeftBracketToken);
}
// Closes a block that was opened with LeftBracketToken, if one is on top of
// the block stack. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::rightBracket(UChar cc)
{
    return blockEnd(RightBracketToken, LeftBracketToken);
}
// Opens a brace block. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::leftBrace(UChar cc)
{
    return blockStart(LeftBraceToken);
}
// Closes a block that was opened with LeftBraceToken, if one is on top of
// the block stack. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::rightBrace(UChar cc)
{
    return blockEnd(RightBraceToken, LeftBraceToken);
}
// Handles an already-consumed |cc| (by its name, '+' or '.'). When |cc| plus
// the upcoming input starts a number, |cc| is pushed back and a numeric token
// is consumed; otherwise |cc| becomes a delimiter token.
MediaQueryToken MediaQueryTokenizer::plusOrFullStop(UChar cc)
{
    if (!nextCharsAreNumber(cc))
        return MediaQueryToken(DelimiterToken, cc);
    reconsume(cc);
    return consumeNumericToken();
}
// Returns a delimiter token carrying |cc|.
MediaQueryToken MediaQueryTokenizer::asterisk(UChar cc)
{
    return MediaQueryToken(DelimiterToken, cc);
}
// Returns a CommaToken. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::comma(UChar cc)
{
    return MediaQueryToken(CommaToken);
}
// Handles an already-consumed |cc| (by its name, '-'). The number check is
// made first, then the identifier check; in either case |cc| is pushed back
// before the matching token is consumed. If neither check succeeds, |cc|
// becomes a delimiter token.
MediaQueryToken MediaQueryTokenizer::hyphenMinus(UChar cc)
{
    const bool startsNumber = nextCharsAreNumber(cc);
    if (startsNumber || nextCharsAreIdentifier(cc)) {
        reconsume(cc);
        return startsNumber ? consumeNumericToken() : consumeIdentLikeToken();
    }
    return MediaQueryToken(DelimiterToken, cc);
}
// Handles an already-consumed |cc| (by its name, '/'). If a '*' follows, the
// comment body is consumed: a terminated comment yields a CommentToken and an
// unterminated one yields an EOFToken. Without a following '*', |cc| becomes
// a delimiter token.
MediaQueryToken MediaQueryTokenizer::solidus(UChar cc)
{
    if (!consumeIfNext('*'))
        return MediaQueryToken(DelimiterToken, cc);
    // Creating tokens for CSS comments is an intentional deviation from the spec.
    if (consumeUntilCommentEndFound())
        return MediaQueryToken(CommentToken);
    return MediaQueryToken(EOFToken);
}
// Returns a ColonToken. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::colon(UChar cc)
{
    return MediaQueryToken(ColonToken);
}
// Returns a SemicolonToken. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::semiColon(UChar cc)
{
    return MediaQueryToken(SemicolonToken);
}
// Handles an already-consumed |cc| (by its name, '\\'). When |cc| and the
// next input character form a valid escape, |cc| is pushed back and an
// ident-like token is consumed; otherwise |cc| becomes a delimiter token.
MediaQueryToken MediaQueryTokenizer::reverseSolidus(UChar cc)
{
    if (!twoCharsAreValidEscape(cc, m_input.nextInputChar()))
        return MediaQueryToken(DelimiterToken, cc);
    reconsume(cc);
    return consumeIdentLikeToken();
}
// Pushes |cc| back onto the input and consumes a numeric token starting
// from it.
MediaQueryToken MediaQueryTokenizer::asciiDigit(UChar cc)
{
    reconsume(cc);
    return consumeNumericToken();
}
// Pushes |cc| back onto the input and consumes an ident-like token starting
// from it.
MediaQueryToken MediaQueryTokenizer::nameStart(UChar cc)
{
    reconsume(cc);
    return consumeIdentLikeToken();
}
// Consumes a string token, using the already-consumed |cc| as the character
// that ends the string.
MediaQueryToken MediaQueryTokenizer::stringStart(UChar cc)
{
    return consumeStringTokenUntil(cc);
}
// Returns an EOFToken. |cc| is not used.
MediaQueryToken MediaQueryTokenizer::endOfFile(UChar cc)
{
    return MediaQueryToken(EOFToken);
}
// Tokenizes |string| and appends every token, the final EOFToken included,
// to |outTokens|. An empty |string| appends nothing at all.
void MediaQueryTokenizer::tokenize(String string, Vector<MediaQueryToken>& outTokens)
{
    // The spec asks for a preprocessing pass at this point, see
    // http://dev.w3.org/csswg/css-syntax/#input-preprocessing
    //
    // It is skipped because:
    // * HTML spaces are used, and those accept \r and \f as white space
    // * white spaces are not counted
    // * consumeEscape swaps NULLs for replacement characters
    if (string.isEmpty())
        return;

    MediaQueryInputStream input(string);
    MediaQueryTokenizer tokenizer(input);
    for (bool reachedEnd = false; !reachedEnd;) {
        MediaQueryToken token = tokenizer.nextToken();
        reachedEnd = token.type() == EOFToken;
        outTokens.append(token);
    }
}
// Consumes one character and produces the token that starts with it.
// Non-ASCII characters always go to nameStart(). ASCII characters are looked
// up in the codePoints table; an entry without a handler produces a delimiter
// token for the character.
MediaQueryToken MediaQueryTokenizer::nextToken()
{
    // The CSS Syntax spec, unlike the HTMLTokenizer, describes a stateless
    // tokenizer with fixed-size look-ahead. A stateful model with one state
    // per "next 3 codepoints are X" case would make incremental tokenization
    // of partial sources easier, but for now the spec is followed exactly.
    UChar cc = consume();
    if (!isASCII(cc))
        return nameStart(cc);

    ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
    CodePoint handler = codePoints[cc];
    if (!handler)
        return MediaQueryToken(DelimiterToken, cc);
    return (this->*handler)(cc);
}
// Reads an optional sign character located at |offset| in |input|.
// Returns -1 for '-', and 1 for '+' or when no sign is present. |offset| is
// incremented past the sign when one is found and left unchanged otherwise.
//
// Both sign characters are looked up with peek(offset), so the function is
// correct for any starting |offset|. The previous code tested '+' with
// nextInputChar(), which only agrees with peek(offset) when |offset| is 0.
static int getSign(MediaQueryInputStream& input, unsigned& offset)
{
    const UChar candidate = input.peek(offset);
    if (candidate == '+') {
        ++offset;
        return 1;
    }
    if (candidate == '-') {
        ++offset;
        return -1;
    }
    return 1;
}
// Skips the run of ASCII digits starting at |offset| and returns the value
// that getUInt() reports for that span. On return |offset| is the position
// where skipWhilePredicate() stopped.
static unsigned long long getInteger(MediaQueryInputStream& input, unsigned& offset)
{
    const unsigned firstDigitPos = offset;
    offset = input.skipWhilePredicate<isASCIIDigit>(offset);
    return input.getUInt(firstDigitPos, offset);
}
// Reads an optional fractional part ('.' followed by at least one ASCII
// digit) located at |offset| in |input|.
//
// Returns what getDouble() reports for the span that starts at the '.' and
// ends after the last digit. When there is no fraction, getDouble(0, 0) is
// returned and |offset| is left untouched.
//
// |digitsNumber| receives the length of the consumed span, which counts the
// leading '.', so it is 0 exactly when no fraction was found.
//
// The look-ahead uses offset + 1 rather than ++offset, so a '.' that is not
// followed by a digit is no longer swallowed as part of the number.
static double getFraction(MediaQueryInputStream& input, unsigned& offset, unsigned& digitsNumber)
{
    unsigned fractionStartPos = 0;
    unsigned fractionEndPos = 0;
    if (input.peek(offset) == '.' && isASCIIDigit(input.peek(offset + 1))) {
        fractionStartPos = offset;
        offset = input.skipWhilePredicate<isASCIIDigit>(offset + 1);
        fractionEndPos = offset;
    }
    digitsNumber = fractionEndPos - fractionStartPos;
    return input.getDouble(fractionStartPos, fractionEndPos);
}
// Reads an optional exponent ('e' or 'E', an optional sign, then at least
// one ASCII digit) located at |offset| in |input|.
//
// Returns what getUInt() reports for the span of exponent digits. |offset|
// and |sign| are only modified once at least one digit has been found:
// |offset| then points past the last digit, and |sign| is set to -1 for a
// '-' sign. An 'e' that is not followed by digits leaves both untouched, so
// a rejected "e-" can no longer leak a negative sign to the caller.
static unsigned long long getExponent(MediaQueryInputStream& input, unsigned& offset, int& sign)
{
    unsigned exponentStartPos = 0;
    unsigned exponentEndPos = 0;
    if (input.peek(offset) == 'E' || input.peek(offset) == 'e') {
        // Scan with a local cursor so nothing has to be rolled back.
        unsigned cursor = offset + 1;
        bool negative = false;
        if (input.peek(cursor) == '+') {
            ++cursor;
        } else if (input.peek(cursor) == '-') {
            negative = true;
            ++cursor;
        }
        exponentStartPos = cursor;
        exponentEndPos = input.skipWhilePredicate<isASCIIDigit>(cursor);
        if (exponentEndPos != exponentStartPos) {
            offset = exponentEndPos;
            if (negative)
                sign = -1;
        }
    }
    return input.getUInt(exponentStartPos, exponentEndPos);
}
// This method merges the following spec sections for efficiency
// http://www.w3.org/TR/css3-syntax/#consume-a-number
// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
//
// Parses sign, integer part, fraction and exponent by look-ahead, then
// advances the input past everything that was parsed. The returned
// NumberToken carries the computed value and its numeric type.
//
// As the spec requires, the type is NumberValueType when either a fraction
// or an exponent was consumed, and IntegerValueType otherwise. Previously
// only a fraction switched the type, so "1e-2" was reported as an integer.
MediaQueryToken MediaQueryTokenizer::consumeNumber()
{
    ASSERT(nextCharsAreNumber());
    unsigned offset = 0;
    int exponentSign = 1;
    unsigned fractionDigits = 0;

    const int sign = getSign(m_input, offset);
    const unsigned long long integerPart = getInteger(m_input, offset);
    const double fractionPart = getFraction(m_input, offset, fractionDigits);

    // getExponent() only moves |offset| when it accepts an exponent, so a
    // change in |offset| tells us whether one was consumed.
    const unsigned offsetBeforeExponent = offset;
    const unsigned long long exponentPart = getExponent(m_input, offset, exponentSign);
    const bool hasExponent = offset != offsetBeforeExponent;

    const double exponent = pow(10, exponentSign * static_cast<double>(exponentPart));
    const double value = sign * (static_cast<double>(integerPart) + fractionPart) * exponent;

    m_input.advance(offset);

    const NumericValueType type = (fractionDigits > 0 || hasExponent) ? NumberValueType : IntegerValueType;
    return MediaQueryToken(NumberToken, value, type);
}
// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
// Consumes a number. If an identifier follows, the token is converted to a
// dimension with that identifier as its unit; failing that, a following '%'
// is consumed and the token is converted to a percentage.
MediaQueryToken MediaQueryTokenizer::consumeNumericToken()
{
    MediaQueryToken number = consumeNumber();
    if (nextCharsAreIdentifier()) {
        number.convertToDimensionWithUnit(consumeName());
        return number;
    }
    if (consumeIfNext('%'))
        number.convertToPercentage();
    return number;
}
// http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token
// Consumes a name. If a '(' follows directly, it is consumed too and the
// result is a FunctionToken that opens a parenthesis block; otherwise the
// result is an IdentToken.
MediaQueryToken MediaQueryTokenizer::consumeIdentLikeToken()
{
    String name = consumeName();
    if (!consumeIfNext('('))
        return MediaQueryToken(IdentToken, name);
    return blockStart(LeftParenthesisToken, FunctionToken, name);
}
// Returns true for '\r', '\n' and '\f'.
static bool isNewLine(UChar cc)
{
    // \r and \f have to be checked as well, since there is no preprocessing
    // stage that would have normalized them.
    switch (cc) {
    case '\r':
    case '\n':
    case '\f':
        return true;
    default:
        return false;
    }
}
// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
// Accumulates characters until |endingCodePoint| or the end-of-file marker
// is read and returns them as a StringToken. An unescaped newline ends the
// scan early: it is pushed back and a BadStringToken is returned.
MediaQueryToken MediaQueryTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
{
    StringBuilder output;
    for (;;) {
        const UChar cc = consume();
        if (cc == kEndOfFileMarker) {
            // Pushing the marker back deviates from the spec, but is required
            // to avoid consuming past the EOF.
            reconsume(cc);
            return MediaQueryToken(StringToken, output.toString());
        }
        if (cc == endingCodePoint)
            return MediaQueryToken(StringToken, output.toString());
        if (isNewLine(cc)) {
            reconsume(cc);
            return MediaQueryToken(BadStringToken);
        }
        if (cc != '\\') {
            output.append(cc);
            continue;
        }

        // A backslash was read: decide by what comes after it.
        const UChar following = m_input.nextInputChar();
        if (following == kEndOfFileMarker)
            continue; // The next iteration reads the marker and finishes.
        if (isNewLine(following))
            consume(); // Escaped newline: dropped from the output.
        else
            output.append(consumeEscape());
    }
}
// Consumes input characters for as long as the next one is an HTML space.
void MediaQueryTokenizer::consumeUntilNonWhitespace()
{
    // HTML space is used instead of CSS space because no preprocessing is done.
    for (;;) {
        if (!isHTMLSpace<UChar>(m_input.nextInputChar()))
            return;
        consume();
    }
}
// Consumes characters until a '*' directly followed by '/' has been read.
// Returns true once that terminator has been consumed, or false if the
// end-of-file marker is read first.
bool MediaQueryTokenizer::consumeUntilCommentEndFound()
{
    UChar c = consume();
    while (c != kEndOfFileMarker) {
        const bool sawAsterisk = c == '*';
        c = consume();
        if (sawAsterisk && c == '/')
            return true;
    }
    return false;
}
// Consumes the next input character only when it equals |character|.
// Returns whether a character was consumed.
bool MediaQueryTokenizer::consumeIfNext(UChar character)
{
    if (m_input.nextInputChar() != character)
        return false;
    consume();
    return true;
}
// http://www.w3.org/TR/css3-syntax/#consume-a-name
// Collects name characters and decoded escapes into a string. The first
// character that is neither is pushed back, and the collected string is
// returned.
String MediaQueryTokenizer::consumeName()
{
    // FIXME: Is this as efficient as it can be?
    // Escape chars may appear, which AFAICT mandates building a copy.
    StringBuilder result;
    UChar cc = consume();
    while (true) {
        if (isNameChar(cc))
            result.append(cc);
        else if (twoCharsAreValidEscape(cc, m_input.nextInputChar()))
            result.append(consumeEscape());
        else
            break;
        cc = consume();
    }
    reconsume(cc);
    return result.toString();
}
// http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
// Consumes the body of an escape (the backslash has already been consumed by
// the caller) and returns the character it stands for.
//
// Hex escapes: up to six hex digits are read, plus one trailing whitespace
// character if present, as the spec requires. The replacement character is
// returned when the value is zero, is a surrogate (0xD800-0xDFFF), or does
// not fit the UChar return type. UChar is assumed to be a 16-bit UTF-16 code
// unit, so values above 0xFFFF are replaced rather than silently truncated
// to an unrelated character.
//
// Any other character is returned as-is, except the end-of-file marker,
// which is mapped to the replacement character.
UChar MediaQueryTokenizer::consumeEscape()
{
    UChar cc = consume();
    ASSERT(cc != '\n');
    if (isASCIIHexDigit(cc)) {
        unsigned consumedHexDigits = 1;
        StringBuilder hexChars;
        hexChars.append(cc);
        while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.nextInputChar())) {
            cc = consume();
            hexChars.append(cc);
            consumedHexDigits++;
        }
        // HTML space is used here, like elsewhere in this file, because no
        // preprocessing is done.
        if (isHTMLSpace<UChar>(m_input.nextInputChar()))
            consume();
        bool ok = false;
        const unsigned codePoint = hexChars.toString().toUIntStrict(&ok, 16);
        const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (!ok || !codePoint || isSurrogate || codePoint > 0xFFFF)
            return WTF::Unicode::replacementCharacter;
        return static_cast<UChar>(codePoint);
    }
    // Replaces NULLs with replacement characters, since we do not perform preprocessing
    if (cc == kEndOfFileMarker)
        return WTF::Unicode::replacementCharacter;
    return cc;
}
// Checks whether the next input character and the one at peek(1) form a
// valid escape. Always false when leftChars() reports fewer than one
// character.
bool MediaQueryTokenizer::nextTwoCharsAreValidEscape()
{
    return m_input.leftChars() >= 1
        && twoCharsAreValidEscape(m_input.nextInputChar(), m_input.peek(1));
}
// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
// |first| has already been consumed; the characters after it are inspected
// by look-ahead only. Returns true for a digit, for '+' or '-' followed by a
// digit or by '.' and a digit, and for '.' followed by a digit.
bool MediaQueryTokenizer::nextCharsAreNumber(UChar first)
{
    const UChar second = m_input.nextInputChar();
    if (isASCIIDigit(first))
        return true;
    switch (first) {
    case '+':
    case '-':
        return isASCIIDigit(second) || (second == '.' && isASCIIDigit(m_input.peek(1)));
    case '.':
        return isASCIIDigit(second);
    default:
        return false;
    }
}
bool MediaQueryTokenizer::nextCharsAreNumber()
{
UChar first = consume();
bool areNumber = nextCharsAreNumber(first);
reconsume(first);
return areNumber;
}
// http://www.w3.org/TR/css3-syntax/#would-start-an-identifier
// |first| has already been consumed; the characters after it are inspected
// by look-ahead only. Returns true when |first| is a name-start character or
// starts a valid escape, or when |first| is '-' and what follows is a
// name-start character or a valid escape.
bool MediaQueryTokenizer::nextCharsAreIdentifier(UChar first)
{
    const UChar second = m_input.nextInputChar();
    if (isNameStart(first) || twoCharsAreValidEscape(first, second))
        return true;
    if (first != '-')
        return false;
    return isNameStart(second) || nextTwoCharsAreValidEscape();
}
bool MediaQueryTokenizer::nextCharsAreIdentifier()
{
UChar first = consume();
bool areIdentifier = nextCharsAreIdentifier(first);
reconsume(first);
return areIdentifier;
}
} // namespace blink