blob: 42cedd4d3469e01eba9ceee433280c6d063d7e91 [file] [log] [blame]
// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
#ifndef VM_REGEXP_PARSER_H_
#define VM_REGEXP_PARSER_H_
// SNIP
namespace dart {
// SNIP
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder: public ZoneObject {
public:
explicit RegExpBuilder(Zone* zone);
void AddCharacter(uc16 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType type);
RegExpTree* ToRegExp();
private:
void FlushCharacters();
void FlushText();
void FlushTerms();
Zone* zone() const { return zone_; }
Zone* zone_;
bool pending_empty_;
ZoneList<uc16>* characters_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
#ifdef DEBUG
enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};
class RegExpParser BASE_EMBEDDED {
public:
RegExpParser(FlatStringReader* in,
Handle<String>* error,
bool multiline_mode,
Zone* zone);
static bool ParseRegExp(FlatStringReader* input,
bool multiline,
RegExpCompileData* result,
Zone* zone);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
RegExpTree* ParseCharacterClass();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handle specially.
uc32 ParseClassCharacterEscape();
// Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is.
bool ParseHexEscape(int length, uc32* value);
uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);
CharacterRange ParseClassAtom(uc16* char_class);
RegExpTree* ReportError(Vector<const char> message);
void Advance();
void Advance(int dist);
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
static const int kMaxCaptures = 1 << 16;
static const uc32 kEndMarker = (1 << 21);
private:
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAHEAD,
NEGATIVE_LOOKAHEAD,
GROUPING
};
class RegExpParserState : public ZoneObject {
public:
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
int disjunction_capture_index,
Zone* zone)
: previous_state_(previous_state),
builder_(new(zone) RegExpBuilder(zone)),
group_type_(group_type),
disjunction_capture_index_(disjunction_capture_index) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
bool IsSubexpression() { return previous_state_ != NULL; }
// RegExpBuilder building this regexp's AST.
RegExpBuilder* builder() { return builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() { return group_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() { return disjunction_capture_index_; }
private:
// Linked list implementation of stack of states.
RegExpParserState* previous_state_;
// Builder for the stored disjunction.
RegExpBuilder* builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
SubexpressionType group_type_;
// Stored disjunction's capture index (if any).
int disjunction_capture_index_;
};
Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; }
uc32 current() { return current_; }
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
Isolate* isolate_;
Zone* zone_;
Handle<String>* error_;
ZoneList<RegExpCapture*>* captures_;
FlatStringReader* in_;
uc32 current_;
int next_pos_;
// The capture count is only valid after we have scanned for captures.
int capture_count_;
bool has_more_;
bool multiline_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool failed_;
};
// SNIP
} // namespace dart
#endif // VM_REGEXP_PARSER_H_