runtime/vm/regexp_parser.h - sdk - Git at Google

 // Copyright (c) 2014, the Dart project authors.  Please see the AUTHORS file
 // for details. All rights reserved. Use of this source code is governed by a
 // BSD-style license that can be found in the LICENSE file.

 #ifndef RUNTIME_VM_REGEXP_PARSER_H_
 #define RUNTIME_VM_REGEXP_PARSER_H_

 #include "vm/allocation.h"
 #include "vm/growable_array.h"
 #include "vm/regexp_ast.h"

 namespace dart {

 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
 class RegExpBuilder : public ZoneAllocated {
  public:
   explicit RegExpBuilder(RegExpFlags flags);

   void AddCharacter(uint16_t character);
   void AddUnicodeCharacter(uint32_t character);
   void AddEscapedUnicodeCharacter(uint32_t character);
   // "Adds" an empty expression. Does nothing except consume a
   // following quantifier
   void AddEmpty();
   void AddCharacterClass(RegExpCharacterClass* cc);
   void AddCharacterClassForDesugaring(uint32_t c);
   void AddAtom(RegExpTree* tree);
   void AddTerm(RegExpTree* tree);
   void AddAssertion(RegExpTree* tree);
   void NewAlternative();  // '|'
   // Attempt to add a quantifier to the last atom added. The return value
   // denotes whether the attempt succeeded, since some atoms like lookbehind
   // cannot be quantified.
   bool AddQuantifierToAtom(intptr_t min,
                            intptr_t max,
                            RegExpQuantifier::QuantifierType type);
   RegExpTree* ToRegExp();
   RegExpFlags flags() const { return flags_; }
   bool ignore_case() const { return flags_.IgnoreCase(); }
   bool is_multi_line() const { return flags_.IsMultiLine(); }
   bool is_dot_all() const { return flags_.IsDotAll(); }

  private:
   static constexpr uint16_t kNoPendingSurrogate = 0;
   void AddLeadSurrogate(uint16_t lead_surrogate);
   void AddTrailSurrogate(uint16_t trail_surrogate);
   void FlushPendingSurrogate();
   void FlushCharacters();
   void FlushText();
   void FlushTerms();
   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
   bool NeedsDesugaringForIgnoreCase(uint32_t c);

   Zone* zone() const { return zone_; }
   bool is_unicode() const { return flags_.IsUnicode(); }

   Zone* zone_;
   bool pending_empty_;
   RegExpFlags flags_;
   ZoneGrowableArray<uint16_t>* characters_;
   uint16_t pending_surrogate_;
   GrowableArray<RegExpTree*> terms_;
   GrowableArray<RegExpTree*> text_;
   GrowableArray<RegExpTree*> alternatives_;
 #ifdef DEBUG
   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
 #define LAST(x) last_added_ = x;
 #else
 #define LAST(x)
 #endif
 };

 using RegExpCaptureName = ZoneGrowableArray<uint16_t>;

 class RegExpParser : public ValueObject {
  public:
   RegExpParser(const String& in, String* error, RegExpFlags regexp_flags);

   static void ParseRegExp(const String& input,
                           RegExpFlags regexp_flags,
                           RegExpCompileData* result);

   RegExpTree* ParsePattern();
   RegExpTree* ParseDisjunction();
   RegExpTree* ParseGroup();

   // Parses a {...,...} quantifier and stores the range in the given
   // out parameters.
   bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out);

   // Parses and returns a single escaped character.  The character
   // must not be 'b' or 'B' since they are usually handle specially.
   uint32_t ParseClassCharacterEscape();

   // Checks whether the following is a length-digit hexadecimal number,
   // and sets the value if it is.
   bool ParseHexEscape(intptr_t length, uint32_t* value);
   bool ParseUnicodeEscape(uint32_t* value);
   bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value);

   // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or
   // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the
   // result in the given out parameters. If the shorthand is used,
   // nothing will be added to name_2.
   bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
                               ZoneGrowableArray<char>* name_2);
   // Adds the specified unicode property to the provided character range.
   bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to,
                              bool negate,
                              ZoneGrowableArray<char>* name_1,
                              ZoneGrowableArray<char>* name_2);
   // Returns a regexp node that corresponds to one of these unicode
   // property sequences: "Any", "ASCII", "Assigned".
   RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1);
   RegExpTree* ParseCharacterClass(const RegExpBuilder* builder);

   uint32_t ParseOctalLiteral();

   // Tries to parse the input as a back reference.  If successful it
   // stores the result in the output parameter and returns true.  If
   // it fails it will push back the characters read so the same characters
   // can be reparsed.
   bool ParseBackReferenceIndex(intptr_t* index_out);

   // Attempts to parse a possible escape within a character class.
   bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
                         bool add_unicode_case_equivalents,
                         uint32_t* char_out);
   void ReportError(const char* message);
   void Advance();
   void Advance(intptr_t dist);
   void Reset(intptr_t pos);

   // Reports whether the pattern might be used as a literal search string.
   // Only use if the result of the parse is a single atom node.
   bool simple();
   bool contains_anchor() { return contains_anchor_; }
   void set_contains_anchor() { contains_anchor_ = true; }
   intptr_t captures_started() { return captures_started_; }
   intptr_t position() { return next_pos_ - 1; }
   bool is_unicode() const { return top_level_flags_.IsUnicode(); }

   static bool IsSyntaxCharacterOrSlash(uint32_t c);

   static constexpr intptr_t kMaxCaptures = 1 << 16;
   static constexpr uint32_t kEndMarker = (1 << 21);

  private:
   enum SubexpressionType {
     INITIAL,
     CAPTURE,  // All positive values represent captures.
     POSITIVE_LOOKAROUND,
     NEGATIVE_LOOKAROUND,
     GROUPING
   };

   class RegExpParserState : public ZoneAllocated {
    public:
     RegExpParserState(RegExpParserState* previous_state,
                       SubexpressionType group_type,
                       RegExpLookaround::Type lookaround_type,
                       intptr_t disjunction_capture_index,
                       const RegExpCaptureName* capture_name,
                       RegExpFlags flags,
                       Zone* zone)
         : previous_state_(previous_state),
           builder_(new(zone) RegExpBuilder(flags)),
           group_type_(group_type),
           lookaround_type_(lookaround_type),
           disjunction_capture_index_(disjunction_capture_index),
           capture_name_(capture_name) {}
     // Parser state of containing expression, if any.
     RegExpParserState* previous_state() { return previous_state_; }
     bool IsSubexpression() { return previous_state_ != nullptr; }
     // RegExpBuilder building this regexp's AST.
     RegExpBuilder* builder() { return builder_; }
     // Type of regexp being parsed (parenthesized group or entire regexp).
     SubexpressionType group_type() { return group_type_; }
     // Lookahead or lookbehind.
     RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
     // Index in captures array of first capture in this sub-expression, if any.
     // Also the capture index of this sub-expression itself, if group_type
     // is CAPTURE.
     intptr_t capture_index() { return disjunction_capture_index_; }
     const RegExpCaptureName* capture_name() const { return capture_name_; }

     bool IsNamedCapture() const { return capture_name_ != nullptr; }

     // Check whether the parser is inside a capture group with the given index.
     bool IsInsideCaptureGroup(intptr_t index);
     // Check whether the parser is inside a capture group with the given name.
     bool IsInsideCaptureGroup(const RegExpCaptureName* name);

    private:
     // Linked list implementation of stack of states.
     RegExpParserState* previous_state_;
     // Builder for the stored disjunction.
     RegExpBuilder* builder_;
     // Stored disjunction type (capture, look-ahead or grouping), if any.
     SubexpressionType group_type_;
     // Stored read direction.
     const RegExpLookaround::Type lookaround_type_;
     // Stored disjunction's capture index (if any).
     intptr_t disjunction_capture_index_;
     // Stored capture name (if any).
     const RegExpCaptureName* const capture_name_;
   };

   // Return the 1-indexed RegExpCapture object, allocate if necessary.
   RegExpCapture* GetCapture(intptr_t index);

   // Creates a new named capture at the specified index. Must be called exactly
   // once for each named capture. Fails if a capture with the same name is
   // encountered.
   void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index);

   // Parses the name of a capture group (?<name>pattern). The name must adhere
   // to IdentifierName in the ECMAScript standard.
   const RegExpCaptureName* ParseCaptureGroupName();

   bool ParseNamedBackReference(RegExpBuilder* builder,
                                RegExpParserState* state);
   RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
   intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name);

   // After the initial parsing pass, patch corresponding RegExpCapture objects
   // into all RegExpBackReferences. This is done after initial parsing in order
   // to avoid complicating cases in which references come before the capture.
   void PatchNamedBackReferences();

   ArrayPtr CreateCaptureNameMap();

   // Returns true iff the pattern contains named captures. May call
   // ScanForCaptures to look ahead at the remaining pattern.
   bool HasNamedCaptures();

   Zone* zone() { return zone_; }

   uint32_t current() { return current_; }
   bool has_more() { return has_more_; }
   bool has_next() { return next_pos_ < in().Length(); }
   uint32_t Next();
   uint32_t ReadNext(bool update_position);
   const String& in() { return in_; }
   void ScanForCaptures();

   Zone* zone_;
   ZoneGrowableArray<RegExpCapture*>* captures_;
   ZoneGrowableArray<RegExpCapture*>* named_captures_;
   ZoneGrowableArray<RegExpBackReference*>* named_back_references_;
   const String& in_;
   uint32_t current_;
   intptr_t next_pos_;
   intptr_t captures_started_;
   // The capture count is only valid after we have scanned for captures.
   intptr_t capture_count_;
   bool has_more_;
   RegExpFlags top_level_flags_;
   bool simple_;
   bool contains_anchor_;
   bool is_scanned_for_captures_;
   bool has_named_captures_;
 };

 }  // namespace dart

 #endif  // RUNTIME_VM_REGEXP_PARSER_H_
	// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
	// for details. All rights reserved. Use of this source code is governed by a
	// BSD-style license that can be found in the LICENSE file.

	#ifndef RUNTIME_VM_REGEXP_PARSER_H_
	#define RUNTIME_VM_REGEXP_PARSER_H_

	#include "vm/allocation.h"
	#include "vm/growable_array.h"
	#include "vm/regexp_ast.h"

	namespace dart {

	// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
	class RegExpBuilder : public ZoneAllocated {
	public:
	explicit RegExpBuilder(RegExpFlags flags);

	void AddCharacter(uint16_t character);
	void AddUnicodeCharacter(uint32_t character);
	void AddEscapedUnicodeCharacter(uint32_t character);
	// "Adds" an empty expression. Does nothing except consume a
	// following quantifier
	void AddEmpty();
	void AddCharacterClass(RegExpCharacterClass* cc);
	void AddCharacterClassForDesugaring(uint32_t c);
	void AddAtom(RegExpTree* tree);
	void AddTerm(RegExpTree* tree);
	void AddAssertion(RegExpTree* tree);
	void NewAlternative(); // '\|'
	// Attempt to add a quantifier to the last atom added. The return value
	// denotes whether the attempt succeeded, since some atoms like lookbehind
	// cannot be quantified.
	bool AddQuantifierToAtom(intptr_t min,
	intptr_t max,
	RegExpQuantifier::QuantifierType type);
	RegExpTree* ToRegExp();
	RegExpFlags flags() const { return flags_; }
	bool ignore_case() const { return flags_.IgnoreCase(); }
	bool is_multi_line() const { return flags_.IsMultiLine(); }
	bool is_dot_all() const { return flags_.IsDotAll(); }

	private:
	static constexpr uint16_t kNoPendingSurrogate = 0;
	void AddLeadSurrogate(uint16_t lead_surrogate);
	void AddTrailSurrogate(uint16_t trail_surrogate);
	void FlushPendingSurrogate();
	void FlushCharacters();
	void FlushText();
	void FlushTerms();
	bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
	bool NeedsDesugaringForIgnoreCase(uint32_t c);

	Zone* zone() const { return zone_; }
	bool is_unicode() const { return flags_.IsUnicode(); }

	Zone* zone_;
	bool pending_empty_;
	RegExpFlags flags_;
	ZoneGrowableArray<uint16_t>* characters_;
	uint16_t pending_surrogate_;
	GrowableArray<RegExpTree*> terms_;
	GrowableArray<RegExpTree*> text_;
	GrowableArray<RegExpTree*> alternatives_;
	#ifdef DEBUG
	enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
	#define LAST(x) last_added_ = x;
	#else
	#define LAST(x)
	#endif
	};

	using RegExpCaptureName = ZoneGrowableArray<uint16_t>;

	class RegExpParser : public ValueObject {
	public:
	RegExpParser(const String& in, String* error, RegExpFlags regexp_flags);

	static void ParseRegExp(const String& input,
	RegExpFlags regexp_flags,
	RegExpCompileData* result);

	RegExpTree* ParsePattern();
	RegExpTree* ParseDisjunction();
	RegExpTree* ParseGroup();

	// Parses a {...,...} quantifier and stores the range in the given
	// out parameters.
	bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out);

	// Parses and returns a single escaped character. The character
	// must not be 'b' or 'B' since they are usually handle specially.
	uint32_t ParseClassCharacterEscape();

	// Checks whether the following is a length-digit hexadecimal number,
	// and sets the value if it is.
	bool ParseHexEscape(intptr_t length, uint32_t* value);
	bool ParseUnicodeEscape(uint32_t* value);
	bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value);

	// Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or
	// the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the
	// result in the given out parameters. If the shorthand is used,
	// nothing will be added to name_2.
	bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
	ZoneGrowableArray<char>* name_2);
	// Adds the specified unicode property to the provided character range.
	bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to,
	bool negate,
	ZoneGrowableArray<char>* name_1,
	ZoneGrowableArray<char>* name_2);
	// Returns a regexp node that corresponds to one of these unicode
	// property sequences: "Any", "ASCII", "Assigned".
	RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1);
	RegExpTree* ParseCharacterClass(const RegExpBuilder* builder);

	uint32_t ParseOctalLiteral();

	// Tries to parse the input as a back reference. If successful it
	// stores the result in the output parameter and returns true. If
	// it fails it will push back the characters read so the same characters
	// can be reparsed.
	bool ParseBackReferenceIndex(intptr_t* index_out);

	// Attempts to parse a possible escape within a character class.
	bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
	bool add_unicode_case_equivalents,
	uint32_t* char_out);
	void ReportError(const char* message);
	void Advance();
	void Advance(intptr_t dist);
	void Reset(intptr_t pos);

	// Reports whether the pattern might be used as a literal search string.
	// Only use if the result of the parse is a single atom node.
	bool simple();
	bool contains_anchor() { return contains_anchor_; }
	void set_contains_anchor() { contains_anchor_ = true; }
	intptr_t captures_started() { return captures_started_; }
	intptr_t position() { return next_pos_ - 1; }
	bool is_unicode() const { return top_level_flags_.IsUnicode(); }

	static bool IsSyntaxCharacterOrSlash(uint32_t c);

	static constexpr intptr_t kMaxCaptures = 1 << 16;
	static constexpr uint32_t kEndMarker = (1 << 21);

	private:
	enum SubexpressionType {
	INITIAL,
	CAPTURE, // All positive values represent captures.
	POSITIVE_LOOKAROUND,
	NEGATIVE_LOOKAROUND,
	GROUPING
	};

	class RegExpParserState : public ZoneAllocated {
	public:
	RegExpParserState(RegExpParserState* previous_state,
	SubexpressionType group_type,
	RegExpLookaround::Type lookaround_type,
	intptr_t disjunction_capture_index,
	const RegExpCaptureName* capture_name,
	RegExpFlags flags,
	Zone* zone)
	: previous_state_(previous_state),
	builder_(new(zone) RegExpBuilder(flags)),
	group_type_(group_type),
	lookaround_type_(lookaround_type),
	disjunction_capture_index_(disjunction_capture_index),
	capture_name_(capture_name) {}
	// Parser state of containing expression, if any.
	RegExpParserState* previous_state() { return previous_state_; }
	bool IsSubexpression() { return previous_state_ != nullptr; }
	// RegExpBuilder building this regexp's AST.
	RegExpBuilder* builder() { return builder_; }
	// Type of regexp being parsed (parenthesized group or entire regexp).
	SubexpressionType group_type() { return group_type_; }
	// Lookahead or lookbehind.
	RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
	// Index in captures array of first capture in this sub-expression, if any.
	// Also the capture index of this sub-expression itself, if group_type
	// is CAPTURE.
	intptr_t capture_index() { return disjunction_capture_index_; }
	const RegExpCaptureName* capture_name() const { return capture_name_; }

	bool IsNamedCapture() const { return capture_name_ != nullptr; }

	// Check whether the parser is inside a capture group with the given index.
	bool IsInsideCaptureGroup(intptr_t index);
	// Check whether the parser is inside a capture group with the given name.
	bool IsInsideCaptureGroup(const RegExpCaptureName* name);

	private:
	// Linked list implementation of stack of states.
	RegExpParserState* previous_state_;
	// Builder for the stored disjunction.
	RegExpBuilder* builder_;
	// Stored disjunction type (capture, look-ahead or grouping), if any.
	SubexpressionType group_type_;
	// Stored read direction.
	const RegExpLookaround::Type lookaround_type_;
	// Stored disjunction's capture index (if any).
	intptr_t disjunction_capture_index_;
	// Stored capture name (if any).
	const RegExpCaptureName* const capture_name_;
	};

	// Return the 1-indexed RegExpCapture object, allocate if necessary.
	RegExpCapture* GetCapture(intptr_t index);

	// Creates a new named capture at the specified index. Must be called exactly
	// once for each named capture. Fails if a capture with the same name is
	// encountered.
	void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index);

	// Parses the name of a capture group (?<name>pattern). The name must adhere
	// to IdentifierName in the ECMAScript standard.
	const RegExpCaptureName* ParseCaptureGroupName();

	bool ParseNamedBackReference(RegExpBuilder* builder,
	RegExpParserState* state);
	RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
	intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name);

	// After the initial parsing pass, patch corresponding RegExpCapture objects
	// into all RegExpBackReferences. This is done after initial parsing in order
	// to avoid complicating cases in which references come before the capture.
	void PatchNamedBackReferences();

	ArrayPtr CreateCaptureNameMap();

	// Returns true iff the pattern contains named captures. May call
	// ScanForCaptures to look ahead at the remaining pattern.
	bool HasNamedCaptures();

	Zone* zone() { return zone_; }

	uint32_t current() { return current_; }
	bool has_more() { return has_more_; }
	bool has_next() { return next_pos_ < in().Length(); }
	uint32_t Next();
	uint32_t ReadNext(bool update_position);
	const String& in() { return in_; }
	void ScanForCaptures();

	Zone* zone_;
	ZoneGrowableArray<RegExpCapture> captures_;
	ZoneGrowableArray<RegExpCapture> named_captures_;
	ZoneGrowableArray<RegExpBackReference> named_back_references_;
	const String& in_;
	uint32_t current_;
	intptr_t next_pos_;
	intptr_t captures_started_;
	// The capture count is only valid after we have scanned for captures.
	intptr_t capture_count_;
	bool has_more_;
	RegExpFlags top_level_flags_;
	bool simple_;
	bool contains_anchor_;
	bool is_scanned_for_captures_;
	bool has_named_captures_;
	};

	} // namespace dart

	#endif // RUNTIME_VM_REGEXP_PARSER_H_