| // Copyright 2016 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "vm/regexp/regexp-parser.h" |
| |
| #include "unicode/uchar.h" |
| #include "unicode/uniset.h" |
| #include "unicode/usetiter.h" |
| |
| #include "platform/unicode.h" |
| |
| #include "vm/longjump.h" |
| #include "vm/object_store.h" |
| #include "vm/regexp/char-predicates-inl.h" |
| #include "vm/regexp/regexp-compiler.h" |
| #include "vm/regexp/unibrow.h" |
| #include "vm/regexp/zone-containers.h" |
| #include "vm/symbols.h" |
| |
| namespace dart { |
| |
| namespace { |
| |
| // Whether we're currently inside the ClassEscape production |
| // (tc39.es/ecma262/#prod-annexB-CharacterEscape). |
| enum class InClassEscapeState { |
| kInClass, |
| kNotInClass, |
| }; |
| |
| // The production used to derive ClassSetOperand. |
| enum class ClassSetOperandType { |
| kClassSetCharacter, |
| kClassStringDisjunction, |
| kNestedClass, |
| kCharacterClassEscape, // \ CharacterClassEscape is a special nested class, |
| // as we can fold it directly into another range. |
| kClassSetRange |
| }; |
| |
| class RegExpTextBuilder { |
| public: |
| using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>; |
| |
| RegExpTextBuilder(Zone* zone, |
| SmallRegExpTreeVector* terms_storage, |
| RegExpFlags flags) |
| : zone_(zone), flags_(flags), terms_(terms_storage), text_(zone) {} |
| void AddCharacter(uint16_t character); |
| void AddUnicodeCharacter(uint32_t character); |
| void AddEscapedUnicodeCharacter(uint32_t character); |
| void AddAtom(RegExpTree* atom); |
| void AddTerm(RegExpTree* term); |
| void AddClassRanges(RegExpClassRanges* cc); |
| void FlushPendingSurrogate(); |
| void FlushText(); |
| RegExpTree* PopLastAtom(); |
| RegExpTree* ToRegExp(); |
| |
| private: |
| static const uint16_t kNoPendingSurrogate = 0; |
| |
| void AddLeadSurrogate(uint16_t lead_surrogate); |
| void AddTrailSurrogate(uint16_t trail_surrogate); |
| void FlushCharacters(); |
| bool NeedsDesugaringForUnicode(RegExpClassRanges* cc); |
| bool NeedsDesugaringForIgnoreCase(uint32_t c); |
| void AddClassRangesForDesugaring(uint32_t c); |
| bool ignore_case() const { return IsIgnoreCase(flags_); } |
| bool IsUnicodeMode() const { |
| // Either /v or /u enable UnicodeMode |
| // https://tc39.es/ecma262/#sec-parsepattern |
| return IsUnicode(flags_) || IsUnicodeSets(flags_); |
| } |
| Zone* zone() const { return zone_; } |
| |
| Zone* const zone_; |
| const RegExpFlags flags_; |
| ZoneList<uint16_t>* characters_ = nullptr; |
| uint16_t pending_surrogate_ = kNoPendingSurrogate; |
| SmallRegExpTreeVector* terms_; |
| SmallRegExpTreeVector text_; |
| }; |
| |
| void RegExpTextBuilder::AddLeadSurrogate(uint16_t lead_surrogate) { |
| ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); |
| FlushPendingSurrogate(); |
| // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
| pending_surrogate_ = lead_surrogate; |
| } |
| |
| void RegExpTextBuilder::AddTrailSurrogate(uint16_t trail_surrogate) { |
| ASSERT(Utf16::IsTrailSurrogate(trail_surrogate)); |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| uint16_t lead_surrogate = pending_surrogate_; |
| pending_surrogate_ = kNoPendingSurrogate; |
| ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); |
| uint32_t combined = |
| Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
| if (NeedsDesugaringForIgnoreCase(combined)) { |
| AddClassRangesForDesugaring(combined); |
| } else { |
| ZoneList<uint16_t> surrogate_pair(2, zone()); |
| surrogate_pair.Add(lead_surrogate, zone()); |
| surrogate_pair.Add(trail_surrogate, zone()); |
| RegExpAtom* atom = |
| zone()->New<RegExpAtom>(surrogate_pair.ToConstVector()); |
| AddAtom(atom); |
| } |
| } else { |
| pending_surrogate_ = trail_surrogate; |
| FlushPendingSurrogate(); |
| } |
| } |
| |
| void RegExpTextBuilder::FlushPendingSurrogate() { |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| ASSERT(IsUnicodeMode()); |
| uint32_t c = pending_surrogate_; |
| pending_surrogate_ = kNoPendingSurrogate; |
| AddClassRangesForDesugaring(c); |
| } |
| } |
| |
| void RegExpTextBuilder::FlushCharacters() { |
| FlushPendingSurrogate(); |
| if (characters_ != nullptr) { |
| RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector()); |
| characters_ = nullptr; |
| text_.emplace_back(atom); |
| } |
| } |
| |
| void RegExpTextBuilder::FlushText() { |
| FlushCharacters(); |
| size_t num_text = text_.size(); |
| if (num_text == 0) { |
| return; |
| } else if (num_text == 1) { |
| terms_->emplace_back(text_.back()); |
| } else { |
| RegExpText* text = zone()->New<RegExpText>(zone()); |
| for (size_t i = 0; i < num_text; i++) { |
| text_[i]->AppendToText(text, zone()); |
| } |
| terms_->emplace_back(text); |
| } |
| text_.clear(); |
| } |
| |
| void RegExpTextBuilder::AddCharacter(uint16_t c) { |
| FlushPendingSurrogate(); |
| if (characters_ == nullptr) { |
| characters_ = zone()->New<ZoneList<uint16_t>>(4, zone()); |
| } |
| characters_->Add(c, zone()); |
| } |
| |
| void RegExpTextBuilder::AddUnicodeCharacter(uint32_t c) { |
| if (c > static_cast<uint32_t>(Utf16::kMaxNonSurrogateCharCode)) { |
| ASSERT(IsUnicodeMode()); |
| AddLeadSurrogate(Utf16::LeadSurrogate(c)); |
| AddTrailSurrogate(Utf16::TrailSurrogate(c)); |
| } else if (IsUnicodeMode() && Utf16::IsLeadSurrogate(c)) { |
| AddLeadSurrogate(c); |
| } else if (IsUnicodeMode() && Utf16::IsTrailSurrogate(c)) { |
| AddTrailSurrogate(c); |
| } else { |
| AddCharacter(static_cast<uint16_t>(c)); |
| } |
| } |
| |
| void RegExpTextBuilder::AddEscapedUnicodeCharacter(uint32_t character) { |
| // A lead or trail surrogate parsed via escape sequence will not |
| // pair up with any preceding lead or following trail surrogate. |
| FlushPendingSurrogate(); |
| AddUnicodeCharacter(character); |
| FlushPendingSurrogate(); |
| } |
| |
| void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) { |
| if (NeedsDesugaringForUnicode(cr)) { |
| // With /u or /v, character class needs to be desugared, so it |
| // must be a standalone term instead of being part of a RegExpText. |
| AddTerm(cr); |
| } else { |
| AddAtom(cr); |
| } |
| } |
| |
| void RegExpTextBuilder::AddClassRangesForDesugaring(uint32_t c) { |
| AddTerm(zone()->New<RegExpClassRanges>( |
| zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)))); |
| } |
| |
| void RegExpTextBuilder::AddAtom(RegExpTree* atom) { |
| ASSERT(atom->IsTextElement()); |
| FlushCharacters(); |
| text_.emplace_back(atom); |
| } |
| |
| void RegExpTextBuilder::AddTerm(RegExpTree* term) { |
| ASSERT(term->IsTextElement()); |
| FlushText(); |
| terms_->emplace_back(term); |
| } |
| |
| bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) { |
| if (!IsUnicodeMode()) return false; |
| // TODO(yangguo): we could be smarter than this. Case-insensitivity does not |
| // necessarily mean that we need to desugar. It's probably nicer to have a |
| // separate pass to figure out unicode desugarings. |
| if (ignore_case()) return true; |
| ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| CharacterRange::Canonicalize(ranges); |
| |
| if (cc->is_negated()) { |
| ZoneList<CharacterRange>* negated_ranges = |
| zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone()); |
| CharacterRange::Negate(ranges, negated_ranges, zone()); |
| ranges = negated_ranges; |
| } |
| |
| for (int i = ranges->length() - 1; i >= 0; i--) { |
| uint32_t from = ranges->at(i).from(); |
| uint32_t to = ranges->at(i).to(); |
| // Check for non-BMP characters. |
| if (to >= kNonBmpStart) return true; |
| // Check for lone surrogates. |
| if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| } |
| return false; |
| } |
| |
| // We only use this for characters made of surrogate pairs. All other |
| // characters outside of character classes are made case independent in the |
| // code generation. |
| bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) { |
| #ifdef V8_INTL_SUPPORT |
| if (IsUnicodeMode() && ignore_case()) { |
| icu::UnicodeSet set(c, c); |
| set.closeOver(USET_CASE_INSENSITIVE); |
| set.removeAllStrings(); |
| return set.size() > 1; |
| } |
| // In the case where ICU is not included, we act as if the unicode flag is |
| // not set, and do not desugar. |
| #endif // V8_INTL_SUPPORT |
| return false; |
| } |
| |
| RegExpTree* RegExpTextBuilder::PopLastAtom() { |
| FlushPendingSurrogate(); |
| RegExpTree* atom; |
| if (characters_ != nullptr) { |
| base::Vector<const uint16_t> char_vector = characters_->ToConstVector(); |
| int num_chars = char_vector.length(); |
| if (num_chars > 1) { |
| base::Vector<const uint16_t> prefix = |
| char_vector.SubVector(0, num_chars - 1); |
| text_.emplace_back(zone()->New<RegExpAtom>(prefix)); |
| char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
| } |
| characters_ = nullptr; |
| atom = zone()->New<RegExpAtom>(char_vector); |
| return atom; |
| } else if (!text_.empty()) { |
| atom = text_.back(); |
| text_.pop_back(); |
| return atom; |
| } |
| return nullptr; |
| } |
| |
| RegExpTree* RegExpTextBuilder::ToRegExp() { |
| FlushText(); |
| size_t number_of_terms = terms_->size(); |
| if (number_of_terms == 0) return zone()->New<RegExpEmpty>(); |
| if (number_of_terms == 1) return terms_->back(); |
| return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( |
| base::VectorOf(terms_->begin(), terms_->size()), zone())); |
| } |
| |
| // Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
| class RegExpBuilder { |
| public: |
| RegExpBuilder(Zone* zone, RegExpFlags flags) |
| : zone_(zone), |
| flags_(flags), |
| terms_(zone), |
| alternatives_(zone), |
| text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {} |
| void AddCharacter(uint16_t character); |
| void AddUnicodeCharacter(uint32_t character); |
| void AddEscapedUnicodeCharacter(uint32_t character); |
| // "Adds" an empty expression. Does nothing except consume a |
| // following quantifier |
| void AddEmpty(); |
| void AddClassRanges(RegExpClassRanges* cc); |
| void AddAtom(RegExpTree* tree); |
| void AddTerm(RegExpTree* tree); |
| void AddAssertion(RegExpTree* tree); |
| void NewAlternative(); // '|' |
| bool AddQuantifierToAtom(int min, |
| int max, |
| int index, |
| RegExpQuantifier::QuantifierType type); |
| void FlushText(); |
| RegExpTree* ToRegExp(); |
| RegExpFlags flags() const { return flags_; } |
| |
| bool ignore_case() const { return IsIgnoreCase(flags_); } |
| bool multiline() const { return IsMultiline(flags_); } |
| bool dotall() const { return IsDotAll(flags_); } |
| |
| private: |
| void FlushTerms(); |
| bool IsUnicodeMode() const { |
| // Either /v or /u enable UnicodeMode |
| // https://tc39.es/ecma262/#sec-parsepattern |
| return IsUnicode(flags_) || IsUnicodeSets(flags_); |
| } |
| Zone* zone() const { return zone_; } |
| RegExpTextBuilder& text_builder() { return text_builder_; } |
| |
| Zone* const zone_; |
| bool pending_empty_ = false; |
| const RegExpFlags flags_; |
| |
| using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>; |
| SmallRegExpTreeVector terms_; |
| SmallRegExpTreeVector alternatives_; |
| RegExpTextBuilder text_builder_; |
| }; |
| |
| enum SubexpressionType { |
| INITIAL, |
| CAPTURE, // All positive values represent captures. |
| POSITIVE_LOOKAROUND, |
| NEGATIVE_LOOKAROUND, |
| GROUPING |
| }; |
| |
| class RegExpParserState : public ZoneObject { |
| public: |
| // Push a state on the stack. |
| RegExpParserState(RegExpParserState* previous_state, |
| SubexpressionType group_type, |
| RegExpLookaround::Type lookaround_type, |
| int disjunction_capture_index, |
| const ZoneVector<uint16_t>* capture_name, |
| RegExpFlags flags, |
| Zone* zone) |
| : previous_state_(previous_state), |
| builder_(zone, flags), |
| group_type_(group_type), |
| lookaround_type_(lookaround_type), |
| disjunction_capture_index_(disjunction_capture_index), |
| capture_name_(capture_name), |
| non_participating_capture_group_intervals_(zone) { |
| if (previous_state != nullptr) { |
| non_participating_capture_group_intervals_.insert( |
| non_participating_capture_group_intervals_.begin(), |
| previous_state->non_participating_capture_group_intervals_); |
| } |
| } |
| using IntervalVector = SmallZoneVector<Interval, 1>; |
| |
| // Parser state of containing expression, if any. |
| RegExpParserState* previous_state() const { return previous_state_; } |
| bool IsSubexpression() { return previous_state_ != nullptr; } |
| // RegExpBuilder building this regexp's AST. |
| RegExpBuilder* builder() { return &builder_; } |
| // Type of regexp being parsed (parenthesized group or entire regexp). |
| SubexpressionType group_type() const { return group_type_; } |
| // Lookahead or Lookbehind. |
| RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } |
| // Index in captures array of first capture in this sub-expression, if any. |
| // Also the capture index of this sub-expression itself, if group_type |
| // is CAPTURE. |
| int capture_index() const { return disjunction_capture_index_; } |
| // The name of the current sub-expression, if group_type is CAPTURE. Only |
| // used for named captures. |
| const ZoneVector<uint16_t>* capture_name() const { return capture_name_; } |
| const IntervalVector& non_participating_capture_group_intervals() const { |
| return non_participating_capture_group_intervals_; |
| } |
| |
| bool IsNamedCapture() const { return capture_name_ != nullptr; } |
| |
| // Check whether the parser is inside a capture group with the given index. |
| bool IsInsideCaptureGroup(int index) const { |
| for (const RegExpParserState* s = this; s != nullptr; |
| s = s->previous_state()) { |
| if (s->group_type() != CAPTURE) continue; |
| // Return true if we found the matching capture index. |
| if (index == s->capture_index()) return true; |
| // Abort if index is larger than what has been parsed up till this state. |
| if (index > s->capture_index()) return false; |
| } |
| return false; |
| } |
| |
| // Check whether the parser is inside a capture group with the given name. |
| bool IsInsideCaptureGroup(const ZoneVector<uint16_t>* name) const { |
| DCHECK_NOT_NULL(name); |
| for (const RegExpParserState* s = this; s != nullptr; |
| s = s->previous_state()) { |
| if (s->capture_name() == nullptr) continue; |
| if (*s->capture_name() == *name) return true; |
| } |
| return false; |
| } |
| |
| void NewAlternative(int captures_started) { |
| // Nothing to do if there were no new captures started before the |
| // alternative. |
| if (capture_index() == captures_started) return; |
| |
| // +1 to create a closed interval (capture_index() is exclusive). |
| int from = capture_index() + 1; |
| int to = captures_started; |
| DCHECK_LE(from, to); |
| // Extend the last interval if we increase its range by exactly 1. |
| if (!non_participating_capture_group_intervals().empty() && |
| non_participating_capture_group_intervals().back().to() + 1 == to) { |
| Interval& interval = non_participating_capture_group_intervals_.back(); |
| ASSERT(!interval.is_empty()); |
| DCHECK_GE(from, interval.from()); |
| interval = interval.Union({from, to}); |
| } else { |
| non_participating_capture_group_intervals_.push_back({from, to}); |
| } |
| } |
| |
| private: |
| // Linked list implementation of stack of states. |
| RegExpParserState* const previous_state_; |
| // Builder for the stored disjunction. |
| RegExpBuilder builder_; |
| // Stored disjunction type (capture, look-ahead or grouping), if any. |
| const SubexpressionType group_type_; |
| // Stored read direction. |
| const RegExpLookaround::Type lookaround_type_; |
| // Stored disjunction's capture index (if any). |
| const int disjunction_capture_index_; |
| // Stored capture name (if any). |
| const ZoneVector<uint16_t>* const capture_name_; |
| // List of Intervals of (named) capture indices [from, to] that are not |
| // participating in the current state (i.e. they cannot match). |
| // Capture indices are not participating if they were created in a different |
| // alternative. |
| IntervalVector non_participating_capture_group_intervals_; |
| }; |
| |
| template <class CharT> |
| class RegExpParserImpl final { |
| private: |
| RegExpParserImpl(const CharT* input, |
| int input_length, |
| RegExpFlags flags, |
| Zone* zone); |
| |
| bool Parse(RegExpCompileData* result); |
| |
| RegExpTree* ParsePattern(); |
| RegExpTree* ParseDisjunction(); |
| // RegExpTree* ParseGroup(); |
| |
| // Parses a {...,...} quantifier and stores the range in the given |
| // out parameters. |
| bool ParseIntervalQuantifier(int* min_out, int* max_out); |
| |
| // Checks whether the following is a length-digit hexadecimal number, |
| // and sets the value if it is. |
| bool ParseHexEscape(int length, uint32_t* value); |
| bool ParseUnicodeEscape(uint32_t* value); |
| bool ParseUnlimitedLengthHexNumber(int max_value, uint32_t* value); |
| |
| bool ParsePropertyClassName(ZoneVector<char>* name_1, |
| ZoneVector<char>* name_2); |
| bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range, |
| CharacterClassStrings* add_to_strings, |
| bool negate, |
| const ZoneVector<char>& name_1, |
| const ZoneVector<char>& name_2); |
| |
| RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges, |
| bool add_unicode_case_equivalents); |
| // Parse inside a class. Either add escaped class to the range, or return |
| // false and pass parsed single character through |char_out|. |
| void ParseClassEscape(ZoneList<CharacterRange>* ranges, |
| Zone* zone, |
| bool add_unicode_case_equivalents, |
| uint32_t* char_out, |
| bool* is_class_escape); |
| // Returns true iff parsing was successful. |
| bool TryParseCharacterClassEscape(uint32_t next, |
| InClassEscapeState in_class_escape_state, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| Zone* zone, |
| bool add_unicode_case_equivalents); |
| RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings); |
| RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, |
| ClassSetOperandType* type_out); |
| RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, |
| ClassSetOperandType* type_out, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| uint32_t* character); |
| uint32_t ParseClassSetCharacter(); |
| // Parses and returns a single escaped character. |
| uint32_t ParseCharacterEscape(InClassEscapeState in_class_escape_state, |
| bool* is_escaped_unicode_character); |
| |
| void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges, |
| CharacterRange new_range); |
| |
| RegExpTree* ParseClassUnion(const RegExpBuilder* builder, |
| bool is_negated, |
| RegExpTree* first_operand, |
| ClassSetOperandType first_operand_type, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| uint32_t first_character); |
| RegExpTree* ParseClassIntersection(const RegExpBuilder* builder, |
| bool is_negated, |
| RegExpTree* first_operand, |
| ClassSetOperandType first_operand_type); |
| RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder, |
| bool is_negated, |
| RegExpTree* first_operand, |
| ClassSetOperandType first_operand_type); |
| RegExpTree* ParseCharacterClass(const RegExpBuilder* state); |
| |
| uint32_t ParseOctalLiteral(); |
| |
| // Tries to parse the input as a back reference. If successful it |
| // stores the result in the output parameter and returns true. If |
| // it fails it will push back the characters read so the same characters |
| // can be reparsed. |
| bool ParseBackReferenceIndex(int* index_out); |
| |
| RegExpTree* ReportError(RegExpError error); |
| void Advance(); |
| void Advance(int dist); |
| void RewindByOneCodepoint(); // Rewinds to before the previous Advance(). |
| void Reset(int pos); |
| |
| // Reports whether the pattern might be used as a literal search string. |
| // Only use if the result of the parse is a single atom node. |
| bool simple() const { return simple_; } |
| bool contains_anchor() const { return contains_anchor_; } |
| void set_contains_anchor() { contains_anchor_ = true; } |
| int captures_started() const { return captures_started_; } |
| int position() const { |
| const bool current_is_surrogate = |
| current() != kEndMarker && current() > Utf16::kMaxNonSurrogateCharCode; |
| const int rewind_bytes = current_is_surrogate ? 2 : 1; |
| return next_pos_ - rewind_bytes; |
| } |
| bool failed() const { return failed_; } |
| RegExpFlags flags() const { return flags_; } |
| bool IsUnicodeMode() const { |
| // Either /v or /u enable UnicodeMode |
| // https://tc39.es/ecma262/#sec-parsepattern |
| return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_; |
| } |
| bool unicode_sets() const { return IsUnicodeSets(flags()); } |
| bool ignore_case() const { return IsIgnoreCase(flags()); } |
| |
| static bool IsSyntaxCharacterOrSlash(uint32_t c); |
| static bool IsClassSetSyntaxCharacter(uint32_t c); |
| static bool IsClassSetReservedPunctuator(uint32_t c); |
| bool IsClassSetReservedDoublePunctuator(uint32_t c); |
| |
| static const uint32_t kEndMarker = (1 << 21); |
| |
| private: |
| // Return the 1-indexed RegExpCapture object, allocate if necessary. |
| RegExpCapture* GetCapture(int index); |
| |
| // Creates a new named capture at the specified index. Must be called exactly |
| // once for each named capture. Fails if a capture with the same name is |
| // encountered. |
| bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index); |
| |
| // Parses the name of a capture group (?<name>pattern). The name must adhere |
| // to IdentifierName in the ECMAScript standard. |
| const ZoneVector<uint16_t>* ParseCaptureGroupName(); |
| |
| bool ParseNamedBackReference(RegExpBuilder* builder, |
| RegExpParserState* state); |
| RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); |
| |
| // After the initial parsing pass, patch corresponding RegExpCapture objects |
| // into all RegExpBackReferences. This is done after initial parsing in order |
| // to avoid complicating cases in which references comes before the capture. |
| void PatchNamedBackReferences(); |
| |
| ZoneVector<RegExpCapture*>* GetNamedCaptures(); |
| |
| // Returns true iff the pattern contains named captures. May call |
| // ScanForCaptures to look ahead at the remaining pattern. |
| bool HasNamedCaptures(InClassEscapeState in_class_escape_state); |
| |
| Zone* zone() const { return zone_; } |
| |
| uint32_t current() const { return current_; } |
| bool has_more() const { return has_more_; } |
| bool has_next() const { return next_pos_ < input_length(); } |
| uint32_t Next(); |
| template <bool update_position> |
| uint32_t ReadNext(); |
| CharT InputAt(int index) const { |
| ASSERT(0 <= index && index < input_length()); |
| return input_[index]; |
| } |
| int input_length() const { return input_length_; } |
| void ScanForCaptures(InClassEscapeState in_class_escape_state); |
| |
| struct RegExpCaptureNameLess { |
| bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { |
| DCHECK_NOT_NULL(lhs); |
| DCHECK_NOT_NULL(rhs); |
| return *lhs->name() < *rhs->name(); |
| } |
| }; |
| |
| class ForceUnicodeScope final { |
| public: |
| explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser) |
| : parser_(parser) { |
| ASSERT(!parser_->force_unicode_); |
| parser_->force_unicode_ = true; |
| } |
| ~ForceUnicodeScope() { |
| ASSERT(parser_->force_unicode_); |
| parser_->force_unicode_ = false; |
| } |
| |
| private: |
| RegExpParserImpl<CharT>* const parser_; |
| }; |
| |
| Zone* const zone_; |
| RegExpError error_ = RegExpError::kNone; |
| int error_pos_ = 0; |
| ZoneList<RegExpCapture*>* captures_; |
| // Maps capture names to a list of capture indices with this name. |
| ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>* |
| named_captures_; |
| ZoneList<RegExpBackReference*>* named_back_references_; |
| const CharT* const input_; |
| const int input_length_; |
| uint32_t current_; |
| RegExpFlags flags_; |
| bool force_unicode_ = false; // Force parser to act as if unicode were set. |
| int next_pos_; |
| int captures_started_; |
| int capture_count_; // Only valid after we have scanned for captures. |
| int quantifier_count_; |
| int lookaround_count_; // Only valid after we have scanned for lookbehinds. |
| bool has_more_; |
| bool simple_; |
| bool contains_anchor_; |
| bool is_scanned_for_captures_; |
| bool has_named_captures_; // Only valid after we have scanned for captures. |
| bool failed_; |
| |
| friend class ::dart::RegExpParser; |
| }; |
| |
| template <class CharT> |
| RegExpParserImpl<CharT>::RegExpParserImpl(const CharT* input, |
| int input_length, |
| RegExpFlags flags, |
| Zone* zone) |
| : zone_(zone), |
| captures_(nullptr), |
| named_captures_(nullptr), |
| named_back_references_(nullptr), |
| input_(input), |
| input_length_(input_length), |
| current_(kEndMarker), |
| flags_(flags), |
| next_pos_(0), |
| captures_started_(0), |
| capture_count_(0), |
| quantifier_count_(0), |
| lookaround_count_(0), |
| has_more_(true), |
| simple_(false), |
| contains_anchor_(false), |
| is_scanned_for_captures_(false), |
| has_named_captures_(false), |
| failed_(false) { |
| Advance(); |
| } |
| |
| template <> |
| template <bool update_position> |
| uint32_t RegExpParserImpl<uint8_t>::ReadNext() { |
| int position = next_pos_; |
| uint16_t c0 = InputAt(position); |
| position++; |
| ASSERT(!Utf16::IsLeadSurrogate(c0)); |
| if (update_position) next_pos_ = position; |
| return c0; |
| } |
| |
| template <> |
| template <bool update_position> |
| uint32_t RegExpParserImpl<uint16_t>::ReadNext() { |
| int position = next_pos_; |
| uint16_t c0 = InputAt(position); |
| uint32_t result = c0; |
| position++; |
| // Read the whole surrogate pair in case of unicode mode, if possible. |
| if (IsUnicodeMode() && position < input_length() && |
| Utf16::IsLeadSurrogate(c0)) { |
| uint16_t c1 = InputAt(position); |
| if (Utf16::IsTrailSurrogate(c1)) { |
| result = Utf16::CombineSurrogatePair(c0, c1); |
| position++; |
| } |
| } |
| if (update_position) next_pos_ = position; |
| return result; |
| } |
| |
| template <class CharT> |
| uint32_t RegExpParserImpl<CharT>::Next() { |
| if (has_next()) { |
| return ReadNext<false>(); |
| } else { |
| return kEndMarker; |
| } |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::Advance() { |
| if (has_next()) { |
| if (!OSThread::Current()->HasStackHeadroom()) { |
| if (FLAG_correctness_fuzzer_suppressions) { |
| FATAL("Aborting on stack overflow"); |
| } |
| ReportError(RegExpError::kStackOverflow); |
| } else { |
| current_ = ReadNext<true>(); |
| } |
| } else { |
| current_ = kEndMarker; |
| // Advance so that position() points to 1-after-the-last-character. This is |
| // important so that Reset() to this position works correctly. |
| next_pos_ = input_length() + 1; |
| has_more_ = false; |
| } |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::RewindByOneCodepoint() { |
| if (!has_more()) return; |
| // Rewinds by one code point, i.e.: two code units if `current` is outside |
| // the basic multilingual plane (= composed of a lead and trail surrogate), |
| // or one code unit otherwise. |
| const int rewind_by = current() > Utf16::kMaxNonSurrogateCharCode ? -2 : -1; |
| Advance(rewind_by); // Undo the last Advance. |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::Reset(int pos) { |
| next_pos_ = pos; |
| has_more_ = (pos < input_length()); |
| Advance(); |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::Advance(int dist) { |
| next_pos_ += dist - 1; |
| Advance(); |
| } |
| |
| // static |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(uint32_t c) { |
| switch (c) { |
| case '^': |
| case '$': |
| case '\\': |
| case '.': |
| case '*': |
| case '+': |
| case '?': |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| case '|': |
| case '/': |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| // static |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(uint32_t c) { |
| switch (c) { |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| case '/': |
| case '-': |
| case '\\': |
| case '|': |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| // static |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(uint32_t c) { |
| switch (c) { |
| case '&': |
| case '-': |
| case '!': |
| case '#': |
| case '%': |
| case ',': |
| case ':': |
| case ';': |
| case '<': |
| case '=': |
| case '>': |
| case '@': |
| case '`': |
| case '~': |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(uint32_t c) { |
| #define DOUBLE_PUNCTUATOR_CASE(Char) \ |
| case Char: \ |
| return Next() == Char |
| |
| switch (c) { |
| DOUBLE_PUNCTUATOR_CASE('&'); |
| DOUBLE_PUNCTUATOR_CASE('!'); |
| DOUBLE_PUNCTUATOR_CASE('#'); |
| DOUBLE_PUNCTUATOR_CASE('$'); |
| DOUBLE_PUNCTUATOR_CASE('%'); |
| DOUBLE_PUNCTUATOR_CASE('*'); |
| DOUBLE_PUNCTUATOR_CASE('+'); |
| DOUBLE_PUNCTUATOR_CASE(','); |
| DOUBLE_PUNCTUATOR_CASE('.'); |
| DOUBLE_PUNCTUATOR_CASE(':'); |
| DOUBLE_PUNCTUATOR_CASE(';'); |
| DOUBLE_PUNCTUATOR_CASE('<'); |
| DOUBLE_PUNCTUATOR_CASE('='); |
| DOUBLE_PUNCTUATOR_CASE('>'); |
| DOUBLE_PUNCTUATOR_CASE('?'); |
| DOUBLE_PUNCTUATOR_CASE('@'); |
| DOUBLE_PUNCTUATOR_CASE('^'); |
| DOUBLE_PUNCTUATOR_CASE('`'); |
| DOUBLE_PUNCTUATOR_CASE('~'); |
| default: |
| break; |
| } |
| #undef DOUBLE_PUNCTUATOR_CASE |
| |
| return false; |
| } |
| |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) { |
| if (failed_) return nullptr; // Do not overwrite any existing error. |
| failed_ = true; |
| error_ = error; |
| error_pos_ = position(); |
| // Zip to the end to make sure no more input is read. |
| current_ = kEndMarker; |
| next_pos_ = input_length(); |
| has_more_ = false; |
| return nullptr; |
| } |
| |
| #define CHECK_FAILED /**/); \ |
| if (failed_) return nullptr; \ |
| ((void)0 |
| |
| // Pattern :: |
| // Disjunction |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParsePattern() { |
| RegExpTree* result = ParseDisjunction(CHECK_FAILED); |
| PatchNamedBackReferences(CHECK_FAILED); |
| ASSERT(!has_more()); |
| // If the result of parsing is a literal string atom, and it has the |
| // same length as the input, then the atom is identical to the input. |
| if (result->IsAtom() && result->AsAtom()->length() == input_length()) { |
| simple_ = true; |
| } |
| return result; |
| } |
| |
| // Disjunction :: |
| // Alternative |
| // Alternative | Disjunction |
| // Alternative :: |
| // [empty] |
| // Term Alternative |
| // Term :: |
| // Assertion |
| // Atom |
| // Atom Quantifier |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { |
| // Used to store current state while parsing subexpressions. |
| RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, |
| 0, nullptr, flags(), zone()); |
| RegExpParserState* state = &initial_state; |
| // Cache the builder in a local variable for quick access. |
| RegExpBuilder* builder = initial_state.builder(); |
| while (true) { |
| switch (current()) { |
| case kEndMarker: |
| if (failed()) return nullptr; // E.g. the initial Advance failed. |
| if (state->IsSubexpression()) { |
| // Inside a parenthesized group when hitting end of input. |
| return ReportError(RegExpError::kUnterminatedGroup); |
| } |
| DCHECK_EQ(INITIAL, state->group_type()); |
| // Parsing completed successfully. |
| return builder->ToRegExp(); |
| case ')': { |
| if (!state->IsSubexpression()) { |
| return ReportError(RegExpError::kUnmatchedParen); |
| } |
| DCHECK_NE(INITIAL, state->group_type()); |
| |
| Advance(); |
| // End disjunction parsing and convert builder content to new single |
| // regexp atom. |
| RegExpTree* body = builder->ToRegExp(); |
| |
| int end_capture_index = captures_started(); |
| |
| int capture_index = state->capture_index(); |
| SubexpressionType group_type = state->group_type(); |
| |
| // Build result of subexpression. |
| if (group_type == CAPTURE) { |
| if (state->IsNamedCapture()) { |
| CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED); |
| } |
| RegExpCapture* capture = GetCapture(capture_index); |
| capture->set_body(body); |
| body = capture; |
| } else if (group_type == GROUPING) { |
| body = zone()->template New<RegExpGroup>(body, builder->flags()); |
| } else { |
| ASSERT(group_type == POSITIVE_LOOKAROUND || |
| group_type == NEGATIVE_LOOKAROUND); |
| bool is_positive = (group_type == POSITIVE_LOOKAROUND); |
| body = zone()->template New<RegExpLookaround>( |
| body, is_positive, end_capture_index - capture_index, |
| capture_index, state->lookaround_type(), lookaround_count_); |
| lookaround_count_++; |
| } |
| |
| // Restore previous state. |
| state = state->previous_state(); |
| builder = state->builder(); |
| |
| builder->AddAtom(body); |
| // For compatibility with JSC and ES3, we allow quantifiers after |
| // lookaheads, and break in all cases. |
| break; |
| } |
| case '|': { |
| Advance(); |
| state->NewAlternative(captures_started()); |
| builder->NewAlternative(); |
| continue; |
| } |
| case '*': |
| case '+': |
| case '?': |
| return ReportError(RegExpError::kNothingToRepeat); |
| case '^': { |
| Advance(); |
| builder->AddAssertion(zone()->template New<RegExpAssertion>( |
| builder->multiline() ? RegExpAssertion::Type::START_OF_LINE |
| : RegExpAssertion::Type::START_OF_INPUT)); |
| set_contains_anchor(); |
| continue; |
| } |
| case '$': { |
| Advance(); |
| RegExpAssertion::Type assertion_type = |
| builder->multiline() ? RegExpAssertion::Type::END_OF_LINE |
| : RegExpAssertion::Type::END_OF_INPUT; |
| builder->AddAssertion( |
| zone()->template New<RegExpAssertion>(assertion_type)); |
| continue; |
| } |
| case '.': { |
| Advance(); |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| |
| if (builder->dotall()) { |
| // Everything. |
| CharacterRange::AddClassEscape(StandardCharacterSet::kEverything, |
| ranges, false, zone()); |
| } else { |
| // Everything except \x0A, \x0D, \u2028 and \u2029. |
| CharacterRange::AddClassEscape( |
| StandardCharacterSet::kNotLineTerminator, ranges, false, zone()); |
| } |
| |
| RegExpClassRanges* cc = |
| zone()->template New<RegExpClassRanges>(zone(), ranges); |
| builder->AddClassRanges(cc); |
| break; |
| } |
| case '(': { |
| state = ParseOpenParenthesis(state CHECK_FAILED); |
| builder = state->builder(); |
| flags_ = builder->flags(); |
| continue; |
| } |
| case '[': { |
| RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); |
| if (cc->IsClassRanges()) { |
| builder->AddClassRanges(cc->AsClassRanges()); |
| } else { |
| ASSERT(cc->IsClassSetExpression()); |
| builder->AddTerm(cc); |
| } |
| break; |
| } |
| // Atom :: |
| // \ AtomEscape |
| case '\\': |
| switch (Next()) { |
| case kEndMarker: |
| return ReportError(RegExpError::kEscapeAtEndOfPattern); |
| // AtomEscape :: |
| // [+UnicodeMode] DecimalEscape |
| // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber |
| // of DecimalEscape is ≤ NcapturingParens |
| // CharacterEscape (some cases of this mixed in too) |
| // |
| // TODO(jgruber): It may make sense to disentangle all the different |
| // cases and make the structure mirror the spec, e.g. for AtomEscape: |
| // |
| // if (TryParseDecimalEscape(...)) return; |
| // if (TryParseCharacterClassEscape(...)) return; |
| // if (TryParseCharacterEscape(...)) return; |
| // if (TryParseGroupName(...)) return; |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| int index = 0; |
| const bool is_backref = |
| ParseBackReferenceIndex(&index CHECK_FAILED); |
| if (is_backref) { |
| if (state->IsInsideCaptureGroup(index)) { |
| // The back reference is inside the capture group it refers to. |
| // Nothing can possibly have been captured yet, so we use empty |
| // instead. This ensures that, when checking a back reference, |
| // the capture registers of the referenced capture are either |
| // both set or both cleared. |
| builder->AddEmpty(); |
| } else { |
| RegExpCapture* capture = GetCapture(index); |
| RegExpTree* atom = |
| zone()->template New<RegExpBackReference>(capture, zone()); |
| builder->AddAtom(atom); |
| } |
| break; |
| } |
| // With /u and /v, no identity escapes except for syntax characters |
| // are allowed. Otherwise, all identity escapes are allowed. |
| if (IsUnicodeMode()) { |
| return ReportError(RegExpError::kInvalidEscape); |
| } |
| uint32_t first_digit = Next(); |
| if (first_digit == '8' || first_digit == '9') { |
| builder->AddCharacter(first_digit); |
| Advance(2); |
| break; |
| } |
| [[fallthrough]]; |
| } |
| case '0': { |
| Advance(); |
| if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') { |
| // Decimal escape with leading 0 are not parsed as octal. |
| return ReportError(RegExpError::kInvalidDecimalEscape); |
| } |
| uint32_t octal = ParseOctalLiteral(); |
| builder->AddCharacter(octal); |
| break; |
| } |
| case 'b': |
| Advance(2); |
| builder->AddAssertion(zone()->template New<RegExpAssertion>( |
| RegExpAssertion::Type::BOUNDARY)); |
| continue; |
| case 'B': |
| Advance(2); |
| builder->AddAssertion(zone()->template New<RegExpAssertion>( |
| RegExpAssertion::Type::NON_BOUNDARY)); |
| continue; |
| // AtomEscape :: |
| // CharacterClassEscape |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': { |
| uint32_t next = Next(); |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| bool add_unicode_case_equivalents = |
| IsUnicodeMode() && ignore_case(); |
| bool parsed_character_class_escape = TryParseCharacterClassEscape( |
| next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(), |
| add_unicode_case_equivalents CHECK_FAILED); |
| |
| if (parsed_character_class_escape) { |
| RegExpClassRanges* cc = |
| zone()->template New<RegExpClassRanges>(zone(), ranges); |
| builder->AddClassRanges(cc); |
| } else { |
| CHECK(!IsUnicodeMode()); |
| Advance(2); |
| builder->AddCharacter(next); // IdentityEscape. |
| } |
| break; |
| } |
| case 'p': |
| case 'P': { |
| uint32_t next = Next(); |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| CharacterClassStrings* strings = nullptr; |
| if (unicode_sets()) { |
| strings = zone()->template New<CharacterClassStrings>(zone()); |
| } |
| bool add_unicode_case_equivalents = ignore_case(); |
| bool parsed_character_class_escape = TryParseCharacterClassEscape( |
| next, InClassEscapeState::kNotInClass, ranges, strings, zone(), |
| add_unicode_case_equivalents CHECK_FAILED); |
| |
| if (parsed_character_class_escape) { |
| if (unicode_sets()) { |
| RegExpClassSetOperand* op = |
| zone()->template New<RegExpClassSetOperand>(ranges, |
| strings); |
| builder->AddTerm(op); |
| } else { |
| RegExpClassRanges* cc = |
| zone()->template New<RegExpClassRanges>(zone(), ranges); |
| builder->AddClassRanges(cc); |
| } |
| } else { |
| CHECK(!IsUnicodeMode()); |
| Advance(2); |
| builder->AddCharacter(next); // IdentityEscape. |
| } |
| break; |
| } |
| // AtomEscape :: |
| // k GroupName |
| case 'k': { |
| // Either an identity escape or a named back-reference. The two |
| // interpretations are mutually exclusive: '\k' is interpreted as |
| // an identity escape for non-Unicode patterns without named |
| // capture groups, and as the beginning of a named back-reference |
| // in all other cases. |
| const bool has_named_captures = |
| HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED); |
| if (IsUnicodeMode() || has_named_captures) { |
| Advance(2); |
| ParseNamedBackReference(builder, state CHECK_FAILED); |
| break; |
| } |
| } |
| [[fallthrough]]; |
| // AtomEscape :: |
| // CharacterEscape |
| default: { |
| bool is_escaped_unicode_character = false; |
| uint32_t c = ParseCharacterEscape( |
| InClassEscapeState::kNotInClass, |
| &is_escaped_unicode_character CHECK_FAILED); |
| if (is_escaped_unicode_character) { |
| builder->AddEscapedUnicodeCharacter(c); |
| } else { |
| builder->AddCharacter(c); |
| } |
| break; |
| } |
| } |
| break; |
| case '{': { |
| int dummy; |
| bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); |
| if (parsed) return ReportError(RegExpError::kNothingToRepeat); |
| [[fallthrough]]; |
| } |
| case '}': |
| case ']': |
| if (IsUnicodeMode()) { |
| return ReportError(RegExpError::kLoneQuantifierBrackets); |
| } |
| [[fallthrough]]; |
| default: |
| builder->AddUnicodeCharacter(current()); |
| Advance(); |
| break; |
| } // end switch(current()) |
| |
| int min; |
| int max; |
| switch (current()) { |
| // QuantifierPrefix :: |
| // * |
| // + |
| // ? |
| // { |
| case '*': |
| min = 0; |
| max = RegExpTree::kInfinity; |
| Advance(); |
| break; |
| case '+': |
| min = 1; |
| max = RegExpTree::kInfinity; |
| Advance(); |
| break; |
| case '?': |
| min = 0; |
| max = 1; |
| Advance(); |
| break; |
| case '{': |
| if (ParseIntervalQuantifier(&min, &max)) { |
| if (max < min) { |
| return ReportError(RegExpError::kRangeOutOfOrder); |
| } |
| break; |
| } else if (IsUnicodeMode()) { |
| // Incomplete quantifiers are not allowed. |
| return ReportError(RegExpError::kIncompleteQuantifier); |
| } |
| continue; |
| default: |
| continue; |
| } |
| RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
| if (current() == '?') { |
| quantifier_type = RegExpQuantifier::NON_GREEDY; |
| Advance(); |
| } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
| // v8_flags.regexp_possessive_quantifier is a debug-only flag. |
| quantifier_type = RegExpQuantifier::POSSESSIVE; |
| Advance(); |
| } |
| if (!builder->AddQuantifierToAtom(min, max, quantifier_count_, |
| quantifier_type)) { |
| return ReportError(RegExpError::kInvalidQuantifier); |
| } |
| ++quantifier_count_; |
| } |
| } |
| |
| template <class CharT> |
| RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis( |
| RegExpParserState* state) { |
| RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
| bool is_named_capture = false; |
| const ZoneVector<uint16_t>* capture_name = nullptr; |
| SubexpressionType subexpr_type = CAPTURE; |
| RegExpFlags flags = state->builder()->flags(); |
| bool parsing_modifiers = false; |
| bool modifiers_polarity = true; |
| RegExpFlags modifiers; |
| Advance(); |
| if (current() == '?') { |
| do { |
| uint32_t next = Next(); |
| switch (next) { |
| case '-': |
| if (!FLAG_js_regexp_modifiers) { |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| Advance(); |
| parsing_modifiers = true; |
| if (modifiers_polarity == false) { |
| ReportError(RegExpError::kMultipleFlagDashes); |
| return nullptr; |
| } |
| modifiers_polarity = false; |
| break; |
| case 'm': |
| case 'i': |
| case 's': { |
| if (!FLAG_js_regexp_modifiers) { |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| Advance(); |
| parsing_modifiers = true; |
| RegExpFlag flag = TryRegExpFlagFromChar(next).value(); |
| if ((modifiers & flag) != 0) { |
| ReportError(RegExpError::kRepeatedFlag); |
| return nullptr; |
| } |
| modifiers |= flag; |
| flags.set(flag, modifiers_polarity); |
| break; |
| } |
| case ':': |
| Advance(2); |
| parsing_modifiers = false; |
| subexpr_type = GROUPING; |
| break; |
| case '=': |
| Advance(2); |
| if (parsing_modifiers) { |
| ASSERT(FLAG_js_regexp_modifiers); |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| break; |
| case '!': |
| Advance(2); |
| if (parsing_modifiers) { |
| ASSERT(FLAG_js_regexp_modifiers); |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| break; |
| case '<': |
| Advance(); |
| if (parsing_modifiers) { |
| ASSERT(FLAG_js_regexp_modifiers); |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| if (Next() == '=') { |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKBEHIND; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| break; |
| } else if (Next() == '!') { |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKBEHIND; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| break; |
| } |
| is_named_capture = true; |
| has_named_captures_ = true; |
| Advance(); |
| break; |
| default: |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| } while (parsing_modifiers); |
| } |
| if (modifiers_polarity == false) { |
| // We encountered a dash. |
| if (modifiers == 0) { |
| ReportError(RegExpError::kInvalidFlagGroup); |
| return nullptr; |
| } |
| } |
| if (subexpr_type == CAPTURE) { |
| if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) { |
| ReportError(RegExpError::kTooManyCaptures); |
| return nullptr; |
| } |
| captures_started_++; |
| |
| if (is_named_capture) { |
| capture_name = ParseCaptureGroupName(CHECK_FAILED); |
| } |
| } |
| // Store current state and begin new disjunction parsing. |
| return zone()->template New<RegExpParserState>( |
| state, subexpr_type, lookaround_type, captures_started_, capture_name, |
| flags, zone()); |
| } |
| |
| // In order to know whether an escape is a backreference or not we have to scan |
| // the entire regexp and find the number of capturing parentheses. However we |
| // don't want to scan the regexp twice unless it is necessary. This mini-parser |
| // is called when needed. It can see the difference between capturing and |
| // noncapturing parentheses and can skip character classes and backslash-escaped |
| // characters. |
| // |
| // Important: The scanner has to be in a consistent state when calling |
| // ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while |
| // parsing a nested class. |
| template <class CharT> |
| void RegExpParserImpl<CharT>::ScanForCaptures( |
| InClassEscapeState in_class_escape_state) { |
| ASSERT(!is_scanned_for_captures_); |
| const int saved_position = position(); |
| // Start with captures started previous to current position |
| int capture_count = captures_started(); |
| // When we start inside a character class, skip everything inside the class. |
| if (in_class_escape_state == InClassEscapeState::kInClass) { |
| // \k is always invalid within a class in unicode mode, thus we should never |
| // call ScanForCaptures within a class. |
| ASSERT(!IsUnicodeMode()); |
| int c; |
| while ((c = current()) != kEndMarker) { |
| Advance(); |
| if (c == '\\') { |
| Advance(); |
| } else { |
| if (c == ']') break; |
| } |
| } |
| } |
| // Add count of captures after this position. |
| int n; |
| while ((n = current()) != kEndMarker) { |
| Advance(); |
| switch (n) { |
| case '\\': |
| Advance(); |
| break; |
| case '[': { |
| int class_nest_level = 0; |
| int c; |
| while ((c = current()) != kEndMarker) { |
| Advance(); |
| if (c == '\\') { |
| Advance(); |
| } else if (c == '[') { |
| // With /v, '[' inside a class is treated as a nested class. |
| // Without /v, '[' is a normal character. |
| if (unicode_sets()) class_nest_level++; |
| } else if (c == ']') { |
| if (class_nest_level == 0) break; |
| class_nest_level--; |
| } |
| } |
| break; |
| } |
| case '(': |
| if (current() == '?') { |
| // At this point we could be in |
| // * a non-capturing group '(:', |
| // * a lookbehind assertion '(?<=' '(?<!' |
| // * or a named capture '(?<'. |
| // |
| // Of these, only named captures are capturing groups. |
| |
| Advance(); |
| if (current() != '<') break; |
| |
| Advance(); |
| if (current() == '=' || current() == '!') break; |
| |
| // Found a possible named capture. It could turn out to be a syntax |
| // error (e.g. an unterminated or invalid name), but that distinction |
| // does not matter for our purposes. |
| has_named_captures_ = true; |
| } |
| capture_count++; |
| break; |
| } |
| } |
| capture_count_ = capture_count; |
| is_scanned_for_captures_ = true; |
| Reset(saved_position); |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) { |
| DCHECK_EQ('\\', current()); |
| ASSERT('1' <= Next() && Next() <= '9'); |
| // Try to parse a decimal literal that is no greater than the total number |
| // of left capturing parentheses in the input. |
| int start = position(); |
| int value = Next() - '0'; |
| Advance(2); |
| while (true) { |
| uint32_t c = current(); |
| if (IsDecimalDigit(c)) { |
| value = 10 * value + (c - '0'); |
| if (value > RegExpMacroAssembler::kMaxCaptures) { |
| Reset(start); |
| return false; |
| } |
| Advance(); |
| } else { |
| break; |
| } |
| } |
| if (value > captures_started()) { |
| if (!is_scanned_for_captures_) { |
| ScanForCaptures(InClassEscapeState::kNotInClass); |
| } |
| if (value > capture_count_) { |
| Reset(start); |
| return false; |
| } |
| } |
| *index_out = value; |
| return true; |
| } |
| |
| namespace { |
| |
| void push_code_unit(ZoneVector<uint16_t>* v, uint32_t code_unit) { |
| if (code_unit <= Utf16::kMaxNonSurrogateCharCode) { |
| v->push_back(code_unit); |
| } else { |
| v->push_back(Utf16::LeadSurrogate(code_unit)); |
| v->push_back(Utf16::TrailSurrogate(code_unit)); |
| } |
| } |
| |
| } // namespace |
| |
| template <class CharT> |
| const ZoneVector<uint16_t>* RegExpParserImpl<CharT>::ParseCaptureGroupName() { |
| // Due to special Advance requirements (see the next comment), rewind by one |
| // such that names starting with a surrogate pair are parsed correctly for |
| // patterns where the unicode flag is unset. |
| // |
| // Note that we use this odd pattern of rewinding the last advance in order |
| // to adhere to the common parser behavior of expecting `current` to point at |
| // the first candidate character for a function (e.g. when entering ParseFoo, |
| // `current` should point at the first character of Foo). |
| RewindByOneCodepoint(); |
| |
| ZoneVector<uint16_t>* name = |
| zone()->template New<ZoneVector<uint16_t>>(zone()); |
| |
| { |
| // Advance behavior inside this function is tricky since |
| // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U) |
| // and thus allows surrogate pairs and \u{}-style escapes even in |
| // non-unicode patterns. Therefore Advance within the capture group name |
| // has to force-enable unicode, and outside the name revert to default |
| // behavior. |
| ForceUnicodeScope force_unicode(this); |
| |
| bool at_start = true; |
| while (true) { |
| Advance(); |
| uint32_t c = current(); |
| |
| // Convert unicode escapes. |
| if (c == '\\' && Next() == 'u') { |
| Advance(2); |
| if (!ParseUnicodeEscape(&c)) { |
| ReportError(RegExpError::kInvalidUnicodeEscape); |
| return nullptr; |
| } |
| RewindByOneCodepoint(); |
| } |
| |
| // The backslash char is misclassified as both ID_Start and ID_Continue. |
| if (c == '\\') { |
| ReportError(RegExpError::kInvalidCaptureGroupName); |
| return nullptr; |
| } |
| |
| if (at_start) { |
| if (!IsIdentifierStart(c)) { |
| ReportError(RegExpError::kInvalidCaptureGroupName); |
| return nullptr; |
| } |
| push_code_unit(name, c); |
| at_start = false; |
| } else { |
| if (c == '>') { |
| break; |
| } else if (IsIdentifierPart(c)) { |
| push_code_unit(name, c); |
| } else { |
| ReportError(RegExpError::kInvalidCaptureGroupName); |
| return nullptr; |
| } |
| } |
| } |
| } |
| |
| // This final advance goes back into the state of pointing at the next |
| // relevant char, which the rest of the parser expects. See also the previous |
| // comments in this function. |
| Advance(); |
| return name; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex( |
| const RegExpParserState* state, |
| int index) { |
| const ZoneVector<uint16_t>* name = state->capture_name(); |
| const auto& non_participating_capture_group_intervals = |
| state->non_participating_capture_group_intervals(); |
| ASSERT(0 < index && index <= captures_started_); |
| DCHECK_NOT_NULL(name); |
| |
| RegExpCapture* capture = GetCapture(index); |
| DCHECK_NULL(capture->name()); |
| |
| capture->set_name(name); |
| |
| if (named_captures_ == nullptr) { |
| named_captures_ = zone_->template New< |
| ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone()); |
| } else { |
| // Check for duplicates and bail if we find any. |
| const auto& named_capture_it = named_captures_->find(capture); |
| if (named_capture_it != named_captures_->end()) { |
| if (FLAG_js_regexp_duplicate_named_groups) { |
| ZoneList<int>* named_capture_indices = named_capture_it->second; |
| DCHECK_NOT_NULL(named_capture_indices); |
| ASSERT(!named_capture_indices->is_empty()); |
| for (int named_index : *named_capture_indices) { |
| bool is_duplicate = true; |
| for (Interval interval : non_participating_capture_group_intervals) { |
| ASSERT(!interval.is_empty()); |
| // We can stop as soon as we are inside one non-participating |
| // interval. There can't be a non-participating and participating |
| // interval, as intervals are never decreasing. |
| if (interval.Contains(named_index)) { |
| is_duplicate = false; |
| break; |
| } |
| // Intervals are ordered strictly increasing, so we can stop early |
| // when the current interval is past the current index. |
| if (named_index <= interval.from()) { |
| break; |
| } |
| } |
| if (is_duplicate) { |
| ReportError(RegExpError::kDuplicateCaptureGroupName); |
| return false; |
| } |
| } |
| } else { |
| ReportError(RegExpError::kDuplicateCaptureGroupName); |
| return false; |
| } |
| } |
| } |
| if (FLAG_js_regexp_duplicate_named_groups) { |
| // Check for nested named captures. This is necessary to find duplicate |
| // named captures within the same disjunct. |
| RegExpParserState* parent_state = state->previous_state(); |
| if (parent_state && parent_state->IsInsideCaptureGroup(name)) { |
| ReportError(RegExpError::kDuplicateCaptureGroupName); |
| return false; |
| } |
| } |
| |
| auto entry = named_captures_->try_emplace( |
| capture, zone()->template New<ZoneList<int>>(1, zone())); |
| entry.first->second->Add(index, zone()); |
| return true; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseNamedBackReference( |
| RegExpBuilder* builder, |
| RegExpParserState* state) { |
| // The parser is assumed to be on the '<' in \k<name>. |
| if (current() != '<') { |
| ReportError(RegExpError::kInvalidNamedReference); |
| return false; |
| } |
| |
| Advance(); |
| const ZoneVector<uint16_t>* name = ParseCaptureGroupName(); |
| if (name == nullptr) { |
| return false; |
| } |
| |
| if (state->IsInsideCaptureGroup(name)) { |
| builder->AddEmpty(); |
| } else { |
| RegExpBackReference* atom = |
| zone()->template New<RegExpBackReference>(zone()); |
| atom->set_name(name); |
| |
| builder->AddAtom(atom); |
| |
| if (named_back_references_ == nullptr) { |
| named_back_references_ = |
| zone()->template New<ZoneList<RegExpBackReference*>>(1, zone()); |
| } |
| named_back_references_->Add(atom, zone()); |
| } |
| |
| return true; |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::PatchNamedBackReferences() { |
| if (named_back_references_ == nullptr) return; |
| |
| if (named_captures_ == nullptr) { |
| ReportError(RegExpError::kInvalidNamedCaptureReference); |
| return; |
| } |
| |
| // Look up and patch the actual capture for each named back reference. |
| |
| for (int i = 0; i < named_back_references_->length(); i++) { |
| RegExpBackReference* ref = named_back_references_->at(i); |
| |
| // Capture used to search the named_captures_ by name, index of the |
| // capture is never used. |
| static const int kInvalidIndex = 0; |
| RegExpCapture* search_capture = |
| zone()->template New<RegExpCapture>(kInvalidIndex); |
| DCHECK_NULL(search_capture->name()); |
| search_capture->set_name(ref->name()); |
| |
| const auto& capture_it = named_captures_->find(search_capture); |
| if (capture_it == named_captures_->end()) { |
| ReportError(RegExpError::kInvalidNamedCaptureReference); |
| return; |
| } |
| |
| DCHECK_IMPLIES(!FLAG_js_regexp_duplicate_named_groups, |
| capture_it->second->length() == 1); |
| for (int index : *capture_it->second) { |
| ref->add_capture(GetCapture(index), zone()); |
| } |
| } |
| } |
| |
| template <class CharT> |
| RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) { |
| // The index for the capture groups are one-based. Its index in the list is |
| // zero-based. |
| const int known_captures = |
| is_scanned_for_captures_ ? capture_count_ : captures_started_; |
| SBXCHECK(index >= 1 && index <= known_captures); |
| if (captures_ == nullptr) { |
| captures_ = |
| zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone()); |
| } |
| while (captures_->length() < known_captures) { |
| captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1), |
| zone()); |
| } |
| return captures_->at(index - 1); |
| } |
| |
| template <class CharT> |
| ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() { |
| if (named_captures_ == nullptr) { |
| return nullptr; |
| } |
| ASSERT(!named_captures_->empty()); |
| |
| ZoneVector<RegExpCapture*>* flattened_named_captures = |
| zone()->template New<ZoneVector<RegExpCapture*>>(zone()); |
| for (auto capture : *named_captures_) { |
| DCHECK_IMPLIES(!FLAG_js_regexp_duplicate_named_groups, |
| capture.second->length() == 1); |
| for (int index : *capture.second) { |
| flattened_named_captures->push_back(GetCapture(index)); |
| } |
| } |
| return flattened_named_captures; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::HasNamedCaptures( |
| InClassEscapeState in_class_escape_state) { |
| if (has_named_captures_ || is_scanned_for_captures_) { |
| return has_named_captures_; |
| } |
| |
| ScanForCaptures(in_class_escape_state); |
| ASSERT(is_scanned_for_captures_); |
| return has_named_captures_; |
| } |
| |
| // QuantifierPrefix :: |
| // { DecimalDigits } |
| // { DecimalDigits , } |
| // { DecimalDigits , DecimalDigits } |
| // |
| // Returns true if parsing succeeds, and set the min_out and max_out |
| // values. Values are truncated to RegExpTree::kInfinity if they overflow. |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out, |
| int* max_out) { |
| DCHECK_EQ(current(), '{'); |
| int start = position(); |
| Advance(); |
| int min = 0; |
| if (!IsDecimalDigit(current())) { |
| Reset(start); |
| return false; |
| } |
| while (IsDecimalDigit(current())) { |
| int next = current() - '0'; |
| if (min > (RegExpTree::kInfinity - next) / 10) { |
| // Overflow. Skip past remaining decimal digits and return -1. |
| do { |
| Advance(); |
| } while (IsDecimalDigit(current())); |
| min = RegExpTree::kInfinity; |
| break; |
| } |
| min = 10 * min + next; |
| Advance(); |
| } |
| int max = 0; |
| if (current() == '}') { |
| max = min; |
| Advance(); |
| } else if (current() == ',') { |
| Advance(); |
| if (current() == '}') { |
| max = RegExpTree::kInfinity; |
| Advance(); |
| } else { |
| while (IsDecimalDigit(current())) { |
| int next = current() - '0'; |
| if (max > (RegExpTree::kInfinity - next) / 10) { |
| do { |
| Advance(); |
| } while (IsDecimalDigit(current())); |
| max = RegExpTree::kInfinity; |
| break; |
| } |
| max = 10 * max + next; |
| Advance(); |
| } |
| if (current() != '}') { |
| Reset(start); |
| return false; |
| } |
| Advance(); |
| } |
| } else { |
| Reset(start); |
| return false; |
| } |
| *min_out = min; |
| *max_out = max; |
| return true; |
| } |
| |
| template <class CharT> |
| uint32_t RegExpParserImpl<CharT>::ParseOctalLiteral() { |
| ASSERT(('0' <= current() && current() <= '7') || !has_more()); |
| // For compatibility with some other browsers (not all), we parse |
| // up to three octal digits with a value below 256. |
| // ES#prod-annexB-LegacyOctalEscapeSequence |
| uint32_t value = current() - '0'; |
| Advance(); |
| if ('0' <= current() && current() <= '7') { |
| value = value * 8 + current() - '0'; |
| Advance(); |
| if (value < 32 && '0' <= current() && current() <= '7') { |
| value = value * 8 + current() - '0'; |
| Advance(); |
| } |
| } |
| return value; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseHexEscape(int length, uint32_t* value) { |
| int start = position(); |
| uint32_t val = 0; |
| for (int i = 0; i < length; ++i) { |
| uint32_t c = current(); |
| int d = base::HexValue(c); |
| if (d < 0) { |
| Reset(start); |
| return false; |
| } |
| val = val * 16 + d; |
| Advance(); |
| } |
| *value = val; |
| return true; |
| } |
| |
| // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseUnicodeEscape(uint32_t* value) { |
| // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| // allowed). In the latter case, the number of hex digits between { } is |
| // arbitrary. \ and u have already been read. |
| if (current() == '{' && IsUnicodeMode()) { |
| int start = position(); |
| Advance(); |
| if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { |
| if (current() == '}') { |
| Advance(); |
| return true; |
| } |
| } |
| Reset(start); |
| return false; |
| } |
| // \u but no {, or \u{...} escapes not allowed. |
| bool result = ParseHexEscape(4, value); |
| if (result && IsUnicodeMode() && Utf16::IsLeadSurrogate(*value) && |
| current() == '\\') { |
| // Attempt to read trail surrogate. |
| int start = position(); |
| if (Next() == 'u') { |
| Advance(2); |
| uint32_t trail; |
| if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) { |
| *value = Utf16::CombineSurrogatePair(static_cast<uint16_t>(*value), |
| static_cast<uint16_t>(trail)); |
| return true; |
| } |
| } |
| Reset(start); |
| } |
| return result; |
| } |
| |
| #ifdef V8_INTL_SUPPORT |
| |
| namespace { |
| |
| bool IsExactPropertyAlias(const char* property_name, UProperty property) { |
| const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); |
| if (short_name != nullptr && strcmp(property_name, short_name) == 0) |
| return true; |
| for (int i = 0;; i++) { |
| const char* long_name = u_getPropertyName( |
| property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| if (long_name == nullptr) break; |
| if (strcmp(property_name, long_name) == 0) return true; |
| } |
| return false; |
| } |
| |
| bool IsExactPropertyValueAlias(const char* property_value_name, |
| UProperty property, |
| int32_t property_value) { |
| const char* short_name = |
| u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); |
| if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { |
| return true; |
| } |
| for (int i = 0;; i++) { |
| const char* long_name = u_getPropertyValueName( |
| property, property_value, |
| static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| if (long_name == nullptr) break; |
| if (strcmp(property_value_name, long_name) == 0) return true; |
| } |
| return false; |
| } |
| |
| void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set, |
| CharacterClassStrings* strings, |
| RegExpFlags flags, |
| Zone* zone) { |
| ASSERT(set.hasStrings()); |
| ASSERT(IsUnicodeSets(flags)); |
| DCHECK_NOT_NULL(strings); |
| |
| RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone); |
| RegExpTextBuilder string_builder(zone, &string_storage, flags); |
| const bool needs_case_folding = IsIgnoreCase(flags); |
| icu::UnicodeSetIterator iter(set); |
| iter.skipToStrings(); |
| while (iter.next()) { |
| const icu::UnicodeString& s = iter.getString(); |
| const char16_t* p = s.getBuffer(); |
| int32_t length = s.length(); |
| ZoneList<uint32_t>* string = |
| zone->template New<ZoneList<uint32_t>>(length, zone); |
| for (int32_t i = 0; i < length;) { |
| UChar32 c; |
| U16_NEXT(p, i, length, c); |
| string_builder.AddUnicodeCharacter(c); |
| if (needs_case_folding) { |
| c = u_foldCase(c, U_FOLD_CASE_DEFAULT); |
| } |
| string->Add(c, zone); |
| } |
| strings->emplace(string->ToVector(), string_builder.ToRegExp()); |
| string_storage.clear(); |
| } |
| } |
| |
| bool LookupPropertyValueName(UProperty property, |
| const char* property_value_name, |
| bool negate, |
| ZoneList<CharacterRange>* result_ranges, |
| CharacterClassStrings* result_strings, |
| RegExpFlags flags, |
| Zone* zone) { |
| UProperty property_for_lookup = property; |
| if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { |
| // For the property Script_Extensions, we have to do the property value |
| // name lookup as if the property is Script. |
| property_for_lookup = UCHAR_SCRIPT; |
| } |
| int32_t property_value = |
| u_getPropertyValueEnum(property_for_lookup, property_value_name); |
| if (property_value == UCHAR_INVALID_CODE) return false; |
| |
| // We require the property name to match exactly to one of the property value |
| // aliases. However, u_getPropertyValueEnum uses loose matching. |
| if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, |
| property_value)) { |
| return false; |
| } |
| |
| UErrorCode ec = U_ZERO_ERROR; |
| icu::UnicodeSet set; |
| set.applyIntPropertyValue(property, property_value, ec); |
| bool success = ec == U_ZERO_ERROR && !set.isEmpty(); |
| |
| if (success) { |
| if (set.hasStrings()) { |
| ExtractStringsFromUnicodeSet(set, result_strings, flags, zone); |
| } |
| const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags); |
| if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE); |
| set.removeAllStrings(); |
| if (negate) set.complement(); |
| for (int i = 0; i < set.getRangeCount(); i++) { |
| result_ranges->Add( |
| CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), |
| zone); |
| } |
| } |
| return success; |
| } |
| |
| template <size_t N> |
| inline bool NameEquals(const char* name, const char (&literal)[N]) { |
| return strncmp(name, literal, N + 1) == 0; |
| } |
| |
| bool LookupSpecialPropertyValueName(const char* name, |
| ZoneList<CharacterRange>* result, |
| bool negate, |
| RegExpFlags flags, |
| Zone* zone) { |
| if (NameEquals(name, "Any")) { |
| if (negate) { |
| // Leave the list of character ranges empty, since the negation of 'Any' |
| // is the empty set. |
| } else { |
| result->Add(CharacterRange::Everything(), zone); |
| } |
| } else if (NameEquals(name, "ASCII")) { |
| result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) |
| : CharacterRange::Range(0x0, 0x7F), |
| zone); |
| } else if (NameEquals(name, "Assigned")) { |
| return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", |
| !negate, result, nullptr, flags, zone); |
| } else { |
| return false; |
| } |
| return true; |
| } |
| |
| // Explicitly allowlist supported binary properties. The spec forbids supporting |
| // properties outside of this set to ensure interoperability. |
| bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) { |
| switch (property) { |
| case UCHAR_ALPHABETIC: |
| // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. |
| // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. |
| case UCHAR_ASCII_HEX_DIGIT: |
| // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. |
| case UCHAR_BIDI_CONTROL: |
| case UCHAR_BIDI_MIRRORED: |
| case UCHAR_CASE_IGNORABLE: |
| case UCHAR_CASED: |
| case UCHAR_CHANGES_WHEN_CASEFOLDED: |
| case UCHAR_CHANGES_WHEN_CASEMAPPED: |
| case UCHAR_CHANGES_WHEN_LOWERCASED: |
| case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: |
| case UCHAR_CHANGES_WHEN_TITLECASED: |
| case UCHAR_CHANGES_WHEN_UPPERCASED: |
| case UCHAR_DASH: |
| case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: |
| case UCHAR_DEPRECATED: |
| case UCHAR_DIACRITIC: |
| case UCHAR_EMOJI: |
| case UCHAR_EMOJI_COMPONENT: |
| case UCHAR_EMOJI_MODIFIER_BASE: |
| case UCHAR_EMOJI_MODIFIER: |
| case UCHAR_EMOJI_PRESENTATION: |
| case UCHAR_EXTENDED_PICTOGRAPHIC: |
| case UCHAR_EXTENDER: |
| case UCHAR_GRAPHEME_BASE: |
| case UCHAR_GRAPHEME_EXTEND: |
| case UCHAR_HEX_DIGIT: |
| case UCHAR_ID_CONTINUE: |
| case UCHAR_ID_START: |
| case UCHAR_IDEOGRAPHIC: |
| case UCHAR_IDS_BINARY_OPERATOR: |
| case UCHAR_IDS_TRINARY_OPERATOR: |
| case UCHAR_JOIN_CONTROL: |
| case UCHAR_LOGICAL_ORDER_EXCEPTION: |
| case UCHAR_LOWERCASE: |
| case UCHAR_MATH: |
| case UCHAR_NONCHARACTER_CODE_POINT: |
| case UCHAR_PATTERN_SYNTAX: |
| case UCHAR_PATTERN_WHITE_SPACE: |
| case UCHAR_QUOTATION_MARK: |
| case UCHAR_RADICAL: |
| case UCHAR_REGIONAL_INDICATOR: |
| case UCHAR_S_TERM: |
| case UCHAR_SOFT_DOTTED: |
| case UCHAR_TERMINAL_PUNCTUATION: |
| case UCHAR_UNIFIED_IDEOGRAPH: |
| case UCHAR_UPPERCASE: |
| case UCHAR_VARIATION_SELECTOR: |
| case UCHAR_WHITE_SPACE: |
| case UCHAR_XID_CONTINUE: |
| case UCHAR_XID_START: |
| return true; |
| case UCHAR_BASIC_EMOJI: |
| case UCHAR_EMOJI_KEYCAP_SEQUENCE: |
| case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: |
| case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: |
| case UCHAR_RGI_EMOJI_TAG_SEQUENCE: |
| case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: |
| case UCHAR_RGI_EMOJI: |
| return unicode_sets; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| bool IsBinaryPropertyOfStrings(UProperty property) { |
| switch (property) { |
| case UCHAR_BASIC_EMOJI: |
| case UCHAR_EMOJI_KEYCAP_SEQUENCE: |
| case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: |
| case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: |
| case UCHAR_RGI_EMOJI_TAG_SEQUENCE: |
| case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: |
| case UCHAR_RGI_EMOJI: |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| bool IsUnicodePropertyValueCharacter(char c) { |
| // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ |
| // |
| // Note that using this to validate each parsed char is quite conservative. |
| // A possible alternative solution would be to only ensure the parsed |
| // property name/value candidate string does not contain '\0' characters and |
| // let ICU lookups trigger the final failure. |
| if ('a' <= c && c <= 'z') return true; |
| if ('A' <= c && c <= 'Z') return true; |
| if ('0' <= c && c <= '9') return true; |
| return (c == '_'); |
| } |
| |
| } // namespace |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, |
| ZoneVector<char>* name_2) { |
| ASSERT(name_1->empty()); |
| ASSERT(name_2->empty()); |
| // Parse the property class as follows: |
| // - In \p{name}, 'name' is interpreted |
| // - either as a general category property value name. |
| // - or as a binary property name. |
| // - In \p{name=value}, 'name' is interpreted as an enumerated property name, |
| // and 'value' is interpreted as one of the available property value names. |
| // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. |
| // - Loose matching is not applied. |
| if (current() == '{') { |
| // Parse \p{[PropertyName=]PropertyNameValue} |
| for (Advance(); current() != '}' && current() != '='; Advance()) { |
| if (!IsUnicodePropertyValueCharacter(current())) return false; |
| if (!has_next()) return false; |
| name_1->push_back(static_cast<char>(current())); |
| } |
| if (current() == '=') { |
| for (Advance(); current() != '}'; Advance()) { |
| if (!IsUnicodePropertyValueCharacter(current())) return false; |
| if (!has_next()) return false; |
| name_2->push_back(static_cast<char>(current())); |
| } |
| name_2->push_back(0); // null-terminate string. |
| } |
| } else { |
| return false; |
| } |
| Advance(); |
| name_1->push_back(0); // null-terminate string. |
| |
| ASSERT(name_1->size() - 1 == std::strlen(name_1->data())); |
| ASSERT(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); |
| return true; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::AddPropertyClassRange( |
| ZoneList<CharacterRange>* add_to_ranges, |
| CharacterClassStrings* add_to_strings, |
| bool negate, |
| const ZoneVector<char>& name_1, |
| const ZoneVector<char>& name_2) { |
| if (name_2.empty()) { |
| // First attempt to interpret as general category property value name. |
| const char* name = name_1.data(); |
| if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, |
| add_to_ranges, add_to_strings, flags(), |
| zone())) { |
| return true; |
| } |
| // Interpret "Any", "ASCII", and "Assigned". |
| if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(), |
| zone())) { |
| return true; |
| } |
| // Then attempt to interpret as binary property name with value name 'Y'. |
| UProperty property = u_getPropertyEnum(name); |
| if (!IsSupportedBinaryProperty(property, unicode_sets())) return false; |
| if (!IsExactPropertyAlias(name, property)) return false; |
| // Negation of properties with strings is not allowed. |
| // See |
| // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings |
| if (negate && IsBinaryPropertyOfStrings(property)) return false; |
| if (unicode_sets()) { |
| // In /v mode we can't simple lookup the "false" binary property values, |
| // as the spec requires us to perform case folding before calculating the |
| // complement. |
| // See https://tc39.es/ecma262/#sec-compiletocharset |
| // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue |
| return LookupPropertyValueName(property, "Y", negate, add_to_ranges, |
| add_to_strings, flags(), zone()); |
| } else { |
| return LookupPropertyValueName(property, negate ? "N" : "Y", false, |
| add_to_ranges, add_to_strings, flags(), |
| zone()); |
| } |
| } else { |
| // Both property name and value name are specified. Attempt to interpret |
| // the property name as enumerated property. |
| const char* property_name = name_1.data(); |
| const char* value_name = name_2.data(); |
| UProperty property = u_getPropertyEnum(property_name); |
| if (!IsExactPropertyAlias(property_name, property)) return false; |
| if (property == UCHAR_GENERAL_CATEGORY) { |
| // We want to allow aggregate value names such as "Letter". |
| property = UCHAR_GENERAL_CATEGORY_MASK; |
| } else if (property != UCHAR_SCRIPT && |
| property != UCHAR_SCRIPT_EXTENSIONS) { |
| return false; |
| } |
| return LookupPropertyValueName(property, value_name, negate, add_to_ranges, |
| add_to_strings, flags(), zone()); |
| } |
| } |
| |
| #else // V8_INTL_SUPPORT |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, |
| ZoneVector<char>* name_2) { |
| return false; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::AddPropertyClassRange( |
| ZoneList<CharacterRange>* add_to_ranges, |
| CharacterClassStrings* add_to_strings, |
| bool negate, |
| const ZoneVector<char>& name_1, |
| const ZoneVector<char>& name_2) { |
| return false; |
| } |
| |
| #endif // V8_INTL_SUPPORT |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value, |
| uint32_t* value) { |
| uint32_t x = 0; |
| int d = base::HexValue(current()); |
| if (d < 0) { |
| return false; |
| } |
| while (d >= 0) { |
| x = x * 16 + d; |
| if (x > static_cast<uint32_t>(max_value)) { |
| return false; |
| } |
| Advance(); |
| d = base::HexValue(current()); |
| } |
| *value = x; |
| return true; |
| } |
| |
| // https://tc39.es/ecma262/#prod-CharacterEscape |
| template <class CharT> |
| uint32_t RegExpParserImpl<CharT>::ParseCharacterEscape( |
| InClassEscapeState in_class_escape_state, |
| bool* is_escaped_unicode_character) { |
| DCHECK_EQ('\\', current()); |
| ASSERT(has_next()); |
| |
| Advance(); |
| |
| const uint32_t c = current(); |
| switch (c) { |
| // CharacterEscape :: |
| // ControlEscape :: one of |
| // f n r t v |
| case 'f': |
| Advance(); |
| return '\f'; |
| case 'n': |
| Advance(); |
| return '\n'; |
| case 'r': |
| Advance(); |
| return '\r'; |
| case 't': |
| Advance(); |
| return '\t'; |
| case 'v': |
| Advance(); |
| return '\v'; |
| // CharacterEscape :: |
| // c ControlLetter |
| case 'c': { |
| uint32_t controlLetter = Next(); |
| uint32_t letter = controlLetter & ~('A' ^ 'a'); |
| if (letter >= 'A' && letter <= 'Z') { |
| Advance(2); |
| // Control letters mapped to ASCII control characters in the range |
| // 0x00-0x1F. |
| return controlLetter & 0x1F; |
| } |
| if (IsUnicodeMode()) { |
| // With /u and /v, invalid escapes are not treated as identity escapes. |
| ReportError(RegExpError::kInvalidUnicodeEscape); |
| return 0; |
| } |
| if (in_class_escape_state == InClassEscapeState::kInClass) { |
| // Inside a character class, we also accept digits and underscore as |
| // control characters, unless with /u or /v. See Annex B: |
| // ES#prod-annexB-ClassControlLetter |
| if ((controlLetter >= '0' && controlLetter <= '9') || |
| controlLetter == '_') { |
| Advance(2); |
| return controlLetter & 0x1F; |
| } |
| } |
| // We match JSC in reading the backslash as a literal |
| // character instead of as starting an escape. |
| return '\\'; |
| } |
| // CharacterEscape :: |
| // 0 [lookahead ∉ DecimalDigit] |
| // [~UnicodeMode] LegacyOctalEscapeSequence |
| case '0': |
| // \0 is interpreted as NUL if not followed by another digit. |
| if (Next() < '0' || Next() > '9') { |
| Advance(); |
| return 0; |
| } |
| [[fallthrough]]; |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| // For compatibility, we interpret a decimal escape that isn't |
| // a back reference (and therefore either \0 or not valid according |
| // to the specification) as a 1..3 digit octal character code. |
| // ES#prod-annexB-LegacyOctalEscapeSequence |
| if (IsUnicodeMode()) { |
| // With /u or /v, decimal escape is not interpreted as octal character |
| // code. |
| ReportError(RegExpError::kInvalidDecimalEscape); |
| return 0; |
| } |
| return ParseOctalLiteral(); |
| // CharacterEscape :: |
| // HexEscapeSequence |
| case 'x': { |
| Advance(); |
| uint32_t value; |
| if (ParseHexEscape(2, &value)) return value; |
| if (IsUnicodeMode()) { |
| // With /u or /v, invalid escapes are not treated as identity escapes. |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| // If \x is not followed by a two-digit hexadecimal, treat it |
| // as an identity escape. |
| return 'x'; |
| } |
| // CharacterEscape :: |
| // RegExpUnicodeEscapeSequence [?UnicodeMode] |
| case 'u': { |
| Advance(); |
| uint32_t value; |
| if (ParseUnicodeEscape(&value)) { |
| *is_escaped_unicode_character = true; |
| return value; |
| } |
| if (IsUnicodeMode()) { |
| // With /u or /v, invalid escapes are not treated as identity escapes. |
| ReportError(RegExpError::kInvalidUnicodeEscape); |
| return 0; |
| } |
| // If \u is not followed by a two-digit hexadecimal, treat it |
| // as an identity escape. |
| return 'u'; |
| } |
| default: |
| break; |
| } |
| |
| // CharacterEscape :: |
| // IdentityEscape[?UnicodeMode, ?N] |
| // |
| // * With /u, no identity escapes except for syntax characters are |
| // allowed. |
| // * With /v, no identity escapes except for syntax characters and |
| // ClassSetReservedPunctuators (if within a class) are allowed. |
| // * Without /u or /v: |
| // * '\c' is not an IdentityEscape. |
| // * '\k' is not an IdentityEscape when named captures exist. |
| // * Otherwise, all identity escapes are allowed. |
| if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) { |
| if (IsClassSetReservedPunctuator(c)) { |
| Advance(); |
| return c; |
| } |
| } |
| if (IsUnicodeMode()) { |
| if (!IsSyntaxCharacterOrSlash(c)) { |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| Advance(); |
| return c; |
| } |
| ASSERT(!IsUnicodeMode()); |
| if (c == 'c') { |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| Advance(); |
| // Note: It's important to Advance before the HasNamedCaptures call s.t. we |
| // don't start scanning in the middle of an escape. |
| if (c == 'k' && HasNamedCaptures(in_class_escape_state)) { |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| return c; |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassRanges |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges( |
| ZoneList<CharacterRange>* ranges, |
| bool add_unicode_case_equivalents) { |
| uint32_t char_1, char_2; |
| bool is_class_1, is_class_2; |
| while (has_more() && current() != ']') { |
| ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, |
| &is_class_1 CHECK_FAILED); |
| // ClassAtom |
| if (current() == '-') { |
| Advance(); |
| if (!has_more()) { |
| // If we reach the end we break out of the loop and let the |
| // following code report an error. |
| break; |
| } else if (current() == ']') { |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| break; |
| } |
| ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, |
| &is_class_2 CHECK_FAILED); |
| if (is_class_1 || is_class_2) { |
| // Either end is an escaped character class. Treat the '-' verbatim. |
| if (IsUnicodeMode()) { |
| // ES2015 21.2.2.15.1 step 1. |
| return ReportError(RegExpError::kInvalidCharacterClass); |
| } |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); |
| continue; |
| } |
| // ES2015 21.2.2.15.1 step 6. |
| if (char_1 > char_2) { |
| return ReportError(RegExpError::kOutOfOrderCharacterClass); |
| } |
| ranges->Add(CharacterRange::Range(char_1, char_2), zone()); |
| } else { |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| } |
| } |
| return nullptr; |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassEscape |
| template <class CharT> |
| void RegExpParserImpl<CharT>::ParseClassEscape( |
| ZoneList<CharacterRange>* ranges, |
| Zone* zone, |
| bool add_unicode_case_equivalents, |
| uint32_t* char_out, |
| bool* is_class_escape) { |
| *is_class_escape = false; |
| |
| if (current() != '\\') { |
| // Not a ClassEscape. |
| *char_out = current(); |
| Advance(); |
| return; |
| } |
| |
| const uint32_t next = Next(); |
| switch (next) { |
| case 'b': |
| *char_out = '\b'; |
| Advance(2); |
| return; |
| case '-': |
| if (IsUnicodeMode()) { |
| *char_out = next; |
| Advance(2); |
| return; |
| } |
| break; |
| case kEndMarker: |
| ReportError(RegExpError::kEscapeAtEndOfPattern); |
| return; |
| default: |
| break; |
| } |
| |
| static constexpr InClassEscapeState kInClassEscape = |
| InClassEscapeState::kInClass; |
| *is_class_escape = |
| TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone, |
| add_unicode_case_equivalents); |
| if (*is_class_escape) return; |
| |
| bool dummy = false; // Unused. |
| *char_out = ParseCharacterEscape(kInClassEscape, &dummy); |
| } |
| |
| // https://tc39.es/ecma262/#prod-CharacterClassEscape |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape( |
| uint32_t next, |
| InClassEscapeState in_class_escape_state, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| Zone* zone, |
| bool add_unicode_case_equivalents) { |
| DCHECK_EQ(current(), '\\'); |
| DCHECK_EQ(Next(), next); |
| |
| switch (next) { |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': |
| CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next), |
| ranges, add_unicode_case_equivalents, |
| zone); |
| Advance(2); |
| return true; |
| case 'p': |
| case 'P': { |
| if (!IsUnicodeMode()) return false; |
| bool negate = next == 'P'; |
| Advance(2); |
| ZoneVector<char> name_1(zone); |
| ZoneVector<char> name_2(zone); |
| if (!ParsePropertyClassName(&name_1, &name_2) || |
| !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) { |
| ReportError(in_class_escape_state == InClassEscapeState::kInClass |
| ? RegExpError::kInvalidClassPropertyName |
| : RegExpError::kInvalidPropertyName); |
| } |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| namespace { |
| |
| // Add |string| to |ranges| if length of |string| == 1, otherwise add |string| |
| // to |strings|. |
| void AddClassString(ZoneList<uint32_t>* normalized_string, |
| RegExpTree* regexp_string, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| Zone* zone) { |
| if (normalized_string->length() == 1) { |
| ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone); |
| } else { |
| strings->emplace(normalized_string->ToVector(), regexp_string); |
| } |
| } |
| |
| } // namespace |
| |
| // https://tc39.es/ecma262/#prod-ClassStringDisjunction |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction( |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings) { |
| ASSERT(unicode_sets()); |
| DCHECK_EQ(current(), '\\'); |
| DCHECK_EQ(Next(), 'q'); |
| Advance(2); |
| if (current() != '{') { |
| // Identity escape of 'q' is not allowed in unicode mode. |
| return ReportError(RegExpError::kInvalidEscape); |
| } |
| Advance(); |
| |
| ZoneList<uint32_t>* string = |
| zone()->template New<ZoneList<uint32_t>>(4, zone()); |
| RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone()); |
| RegExpTextBuilder string_builder(zone(), &string_storage, flags()); |
| |
| while (has_more() && current() != '}') { |
| if (current() == '|') { |
| AddClassString(string, string_builder.ToRegExp(), ranges, strings, |
| zone()); |
| string = zone()->template New<ZoneList<uint32_t>>(4, zone()); |
| string_storage.clear(); |
| Advance(); |
| } else { |
| uint32_t c = ParseClassSetCharacter(CHECK_FAILED); |
| if (ignore_case()) { |
| #ifdef V8_INTL_SUPPORT |
| c = u_foldCase(c, U_FOLD_CASE_DEFAULT); |
| #else |
| c = AsciiAlphaToLower(c); |
| #endif |
| } |
| string->Add(c, zone()); |
| string_builder.AddUnicodeCharacter(c); |
| } |
| } |
| |
| AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone()); |
| CharacterRange::Canonicalize(ranges); |
| |
| // We don't need to handle missing closing '}' here. |
| // If the character class is correctly closed, ParseClassSetCharacter will |
| // report an error. |
| Advance(); |
| return nullptr; |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassSetOperand |
| // Tree returned based on type_out: |
| // * kNestedClass: RegExpClassSetExpression |
| // * For all other types: RegExpClassSetOperand |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand( |
| const RegExpBuilder* builder, |
| ClassSetOperandType* type_out) { |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(1, zone()); |
| CharacterClassStrings* strings = |
| zone()->template New<CharacterClassStrings>(zone()); |
| uint32_t character; |
| RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings, |
| &character CHECK_FAILED); |
| DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass, |
| tree == nullptr); |
| DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter, |
| ranges->is_empty()); |
| DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter, |
| strings->empty()); |
| DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, |
| ranges->is_empty()); |
| DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, |
| strings->empty()); |
| DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, |
| tree->IsClassSetExpression()); |
| // ClassSetRange is only used within ClassSetUnion(). |
| DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange); |
| // There are no restrictions for kCharacterClassEscape. |
| // CharacterClassEscape includes \p{}, which can contain ranges, strings or |
| // both and \P{}, which could contain nothing (i.e. \P{Any}). |
| if (tree == nullptr) { |
| if (*type_out == ClassSetOperandType::kClassSetCharacter) { |
| AddMaybeSimpleCaseFoldedRange(ranges, |
| CharacterRange::Singleton(character)); |
| } |
| tree = zone()->template New<RegExpClassSetOperand>(ranges, strings); |
| } |
| return tree; |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassSetOperand |
| // Based on |type_out| either a tree is returned or |
| // |ranges|/|strings|/|character| modified. If a tree is returned, |
| // ranges/strings are not modified. If |type_out| is kNestedClass, a tree of |
| // type RegExpClassSetExpression is returned. If | type_out| is |
| // kClassSetCharacter, |character| is set and nullptr returned. For all other |
| // types, |ranges|/|strings|/|character| is modified and nullptr is returned. |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand( |
| const RegExpBuilder* builder, |
| ClassSetOperandType* type_out, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| uint32_t* character) { |
| ASSERT(unicode_sets()); |
| uint32_t c = current(); |
| if (c == '\\') { |
| const uint32_t next = Next(); |
| if (next == 'q') { |
| *type_out = ClassSetOperandType::kClassStringDisjunction; |
| ParseClassStringDisjunction(ranges, strings CHECK_FAILED); |
| return nullptr; |
| } |
| static constexpr InClassEscapeState kInClassEscape = |
| InClassEscapeState::kInClass; |
| const bool add_unicode_case_equivalents = ignore_case(); |
| if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings, |
| zone(), add_unicode_case_equivalents)) { |
| *type_out = ClassSetOperandType::kCharacterClassEscape; |
| return nullptr; |
| } |
| } |
| |
| if (c == '[') { |
| *type_out = ClassSetOperandType::kNestedClass; |
| return ParseCharacterClass(builder); |
| } |
| |
| *type_out = ClassSetOperandType::kClassSetCharacter; |
| c = ParseClassSetCharacter(CHECK_FAILED); |
| *character = c; |
| return nullptr; |
| } |
| |
| template <class CharT> |
| uint32_t RegExpParserImpl<CharT>::ParseClassSetCharacter() { |
| ASSERT(unicode_sets()); |
| const uint32_t c = current(); |
| if (c == '\\') { |
| const uint32_t next = Next(); |
| switch (next) { |
| case 'b': |
| Advance(2); |
| return '\b'; |
| case kEndMarker: |
| ReportError(RegExpError::kEscapeAtEndOfPattern); |
| return 0; |
| } |
| static constexpr InClassEscapeState kInClassEscape = |
| InClassEscapeState::kInClass; |
| |
| bool dummy = false; // Unused. |
| return ParseCharacterEscape(kInClassEscape, &dummy); |
| } |
| if (IsClassSetSyntaxCharacter(c)) { |
| ReportError(RegExpError::kInvalidCharacterInClass); |
| return 0; |
| } |
| if (IsClassSetReservedDoublePunctuator(c)) { |
| ReportError(RegExpError::kInvalidClassSetOperation); |
| return 0; |
| } |
| Advance(); |
| return c; |
| } |
| |
| namespace { |
| |
| bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) { |
| switch (type) { |
| case ClassSetOperandType::kClassSetCharacter: |
| case ClassSetOperandType::kClassSetRange: |
| return false; |
| case ClassSetOperandType::kCharacterClassEscape: |
| case ClassSetOperandType::kClassStringDisjunction: |
| return operand->AsClassSetOperand()->has_strings(); |
| case ClassSetOperandType::kNestedClass: |
| if (operand->IsClassRanges()) return false; |
| return operand->AsClassSetExpression()->may_contain_strings(); |
| } |
| UNREACHABLE(); |
| return false; |
| } |
| |
| } // namespace |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange( |
| ZoneList<CharacterRange>* ranges, |
| CharacterRange new_range) { |
| ASSERT(unicode_sets()); |
| if (ignore_case()) { |
| ZoneList<CharacterRange>* new_ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| new_ranges->Add(new_range, zone()); |
| CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone()); |
| ranges->AddAll(*new_ranges, zone()); |
| } else { |
| ranges->Add(new_range, zone()); |
| } |
| CharacterRange::Canonicalize(ranges); |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassUnion |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion( |
| const RegExpBuilder* builder, |
| bool is_negated, |
| RegExpTree* first_operand, |
| ClassSetOperandType first_operand_type, |
| ZoneList<CharacterRange>* ranges, |
| CharacterClassStrings* strings, |
| uint32_t character) { |
| ASSERT(unicode_sets()); |
| ZoneList<RegExpTree*>* operands = |
| zone()->template New<ZoneList<RegExpTree*>>(2, zone()); |
| bool may_contain_strings = false; |
| // Add the lhs to operands if necessary. |
| // Either the lhs values were added to |ranges|/|strings| (in which case |
| // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed |
| // as |first_operand| (in which case |ranges| and |strings| are empty). |
| if (first_operand != nullptr) { |
| may_contain_strings = MayContainStrings(first_operand_type, first_operand); |
| operands->Add(first_operand, zone()); |
| } |
| ClassSetOperandType last_type = first_operand_type; |
| while (has_more() && current() != ']') { |
| if (current() == '-') { |
| // Mix of ClassSetRange and ClassSubtraction is not allowed. |
| if (Next() == '-') { |
| return ReportError(RegExpError::kInvalidClassSetOperation); |
| } |
| Advance(); |
| if (!has_more()) { |
| // If we reach the end we break out of the loop and let the |
| // following code report an error. |
| break; |
| } |
| // If the lhs and rhs around '-' are both ClassSetCharacters, they |
| // represent a character range. |
| // In case one of them is not a ClassSetCharacter, it is a syntax error, |
| // as '-' can not be used unescaped within a class with /v. |
| // See |
| // https://tc39.es/ecma262/#prod-ClassSetRange |
| if (last_type != ClassSetOperandType::kClassSetCharacter) { |
| return ReportError(RegExpError::kInvalidCharacterClass); |
| } |
| uint32_t from = character; |
| ParseClassSetOperand(builder, &last_type, ranges, strings, |
| &character CHECK_FAILED); |
| if (last_type != ClassSetOperandType::kClassSetCharacter) { |
| return ReportError(RegExpError::kInvalidCharacterClass); |
| } |
| if (from > character) { |
| return ReportError(RegExpError::kOutOfOrderCharacterClass); |
| } |
| AddMaybeSimpleCaseFoldedRange(ranges, |
| CharacterRange::Range(from, character)); |
| last_type = ClassSetOperandType::kClassSetRange; |
| } else { |
| DCHECK_NE(current(), '-'); |
| if (last_type == ClassSetOperandType::kClassSetCharacter) { |
| AddMaybeSimpleCaseFoldedRange(ranges, |
| CharacterRange::Singleton(character)); |
| } |
| RegExpTree* operand = ParseClassSetOperand( |
| builder, &last_type, ranges, strings, &character CHECK_FAILED); |
| if (operand != nullptr) { |
| may_contain_strings |= MayContainStrings(last_type, operand); |
| // Add the range we started building as operand and reset the current |
| // range. |
| if (!ranges->is_empty() || !strings->empty()) { |
| may_contain_strings |= !strings->empty(); |
| operands->Add( |
| zone()->template New<RegExpClassSetOperand>(ranges, strings), |
| zone()); |
| ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| strings = zone()->template New<CharacterClassStrings>(zone()); |
| } |
| operands->Add(operand, zone()); |
| } |
| } |
| } |
| |
| if (!has_more()) { |
| return ReportError(RegExpError::kUnterminatedCharacterClass); |
| } |
| |
| if (last_type == ClassSetOperandType::kClassSetCharacter) { |
| AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character)); |
| } |
| |
| // Add the range we started building as operand. |
| if (!ranges->is_empty() || !strings->empty()) { |
| may_contain_strings |= !strings->empty(); |
| operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings), |
| zone()); |
| } |
| |
| DCHECK_EQ(current(), ']'); |
| Advance(); |
| |
| if (is_negated && may_contain_strings) { |
| return ReportError(RegExpError::kNegatedCharacterClassWithStrings); |
| } |
| |
| if (operands->is_empty()) { |
| // Return empty expression if no operands were added (e.g. [\P{Any}] |
| // produces an empty range). |
| ASSERT(ranges->is_empty()); |
| ASSERT(strings->empty()); |
| return RegExpClassSetExpression::Empty(zone(), is_negated); |
| } |
| |
| return zone()->template New<RegExpClassSetExpression>( |
| RegExpClassSetExpression::OperationType::kUnion, is_negated, |
| may_contain_strings, operands); |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassIntersection |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection( |
| const RegExpBuilder* builder, |
| bool is_negated, |
| RegExpTree* first_operand, |
| ClassSetOperandType first_operand_type) { |
| ASSERT(unicode_sets()); |
| ASSERT(current() == '&' && Next() == '&'); |
| bool may_contain_strings = |
| MayContainStrings(first_operand_type, first_operand); |
| ZoneList<RegExpTree*>* operands = |
| zone()->template New<ZoneList<RegExpTree*>>(2, zone()); |
| operands->Add(first_operand, zone()); |
| while (has_more() && current() != ']') { |
| if (current() != '&' || Next() != '&') { |
| return ReportError(RegExpError::kInvalidClassSetOperation); |
| } |
| Advance(2); |
| // [lookahead ≠&] |
| if (current() == '&') { |
| return ReportError(RegExpError::kInvalidCharacterInClass); |
| } |
| |
| ClassSetOperandType operand_type; |
| RegExpTree* operand = |
| ParseClassSetOperand(builder, &operand_type CHECK_FAILED); |
| may_contain_strings &= MayContainStrings(operand_type, operand); |
| operands->Add(operand, zone()); |
| } |
| if (!has_more()) { |
| return ReportError(RegExpError::kUnterminatedCharacterClass); |
| } |
| if (is_negated && may_contain_strings) { |
| return ReportError(RegExpError::kNegatedCharacterClassWithStrings); |
| } |
| DCHECK_EQ(current(), ']'); |
| Advance(); |
| return zone()->template New<RegExpClassSetExpression>( |
| RegExpClassSetExpression::OperationType::kIntersection, is_negated, |
| may_contain_strings, operands); |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassSubtraction |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction( |
| const RegExpBuilder* builder, |
| bool is_negated, |
| RegExpTree* first_operand, |
| ClassSetOperandType first_operand_type) { |
| ASSERT(unicode_sets()); |
| ASSERT(current() == '-' && Next() == '-'); |
| const bool may_contain_strings = |
| MayContainStrings(first_operand_type, first_operand); |
| if (is_negated && may_contain_strings) { |
| return ReportError(RegExpError::kNegatedCharacterClassWithStrings); |
| } |
| ZoneList<RegExpTree*>* operands = |
| zone()->template New<ZoneList<RegExpTree*>>(2, zone()); |
| operands->Add(first_operand, zone()); |
| while (has_more() && current() != ']') { |
| if (current() != '-' || Next() != '-') { |
| return ReportError(RegExpError::kInvalidClassSetOperation); |
| } |
| Advance(2); |
| ClassSetOperandType dummy; // unused |
| RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED); |
| operands->Add(operand, zone()); |
| } |
| if (!has_more()) { |
| return ReportError(RegExpError::kUnterminatedCharacterClass); |
| } |
| DCHECK_EQ(current(), ']'); |
| Advance(); |
| return zone()->template New<RegExpClassSetExpression>( |
| RegExpClassSetExpression::OperationType::kSubtraction, is_negated, |
| may_contain_strings, operands); |
| } |
| |
| // https://tc39.es/ecma262/#prod-CharacterClass |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass( |
| const RegExpBuilder* builder) { |
| DCHECK_EQ(current(), '['); |
| Advance(); |
| bool is_negated = false; |
| if (current() == '^') { |
| is_negated = true; |
| Advance(); |
| } |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| if (current() == ']') { |
| Advance(); |
| if (unicode_sets()) { |
| return RegExpClassSetExpression::Empty(zone(), is_negated); |
| } else { |
| RegExpClassRanges::ClassRangesFlags class_ranges_flags; |
| if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED; |
| return zone()->template New<RegExpClassRanges>(zone(), ranges, |
| class_ranges_flags); |
| } |
| } |
| |
| if (!unicode_sets()) { |
| bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case(); |
| ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED); |
| if (!has_more()) { |
| return ReportError(RegExpError::kUnterminatedCharacterClass); |
| } |
| DCHECK_EQ(current(), ']'); |
| Advance(); |
| RegExpClassRanges::ClassRangesFlags character_class_flags; |
| if (is_negated) character_class_flags = RegExpClassRanges::NEGATED; |
| if (!ignore_case()) { |
| character_class_flags |= RegExpClassRanges::NO_CASE_FOLDING_NEEDED; |
| } |
| if (sizeof(CharT) == 1) { |
| // No surrogate pairs. |
| character_class_flags |= RegExpClassRanges::IS_CERTAINLY_ONE_CODE_POINT; |
| } |
| return zone()->template New<RegExpClassRanges>(zone(), ranges, |
| character_class_flags); |
| } else { |
| ClassSetOperandType operand_type; |
| CharacterClassStrings* strings = |
| zone()->template New<CharacterClassStrings>(zone()); |
| uint32_t character; |
| RegExpTree* operand = ParseClassSetOperand( |
| builder, &operand_type, ranges, strings, &character CHECK_FAILED); |
| switch (current()) { |
| case '-': |
| if (Next() == '-') { |
| if (operand == nullptr) { |
| if (operand_type == ClassSetOperandType::kClassSetCharacter) { |
| AddMaybeSimpleCaseFoldedRange( |
| ranges, CharacterRange::Singleton(character)); |
| } |
| operand = |
| zone()->template New<RegExpClassSetOperand>(ranges, strings); |
| } |
| return ParseClassSubtraction(builder, is_negated, operand, |
| operand_type); |
| } |
| // ClassSetRange is handled in ParseClassUnion(). |
| break; |
| case '&': |
| if (Next() == '&') { |
| if (operand == nullptr) { |
| if (operand_type == ClassSetOperandType::kClassSetCharacter) { |
| AddMaybeSimpleCaseFoldedRange( |
| ranges, CharacterRange::Singleton(character)); |
| } |
| operand = |
| zone()->template New<RegExpClassSetOperand>(ranges, strings); |
| } |
| return ParseClassIntersection(builder, is_negated, operand, |
| operand_type); |
| } |
| } |
| return ParseClassUnion(builder, is_negated, operand, operand_type, ranges, |
| strings, character); |
| } |
| } |
| |
| #undef CHECK_FAILED |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) { |
| DCHECK_NOT_NULL(result); |
| RegExpTree* tree = ParsePattern(); |
| |
| if (failed()) { |
| DCHECK_NULL(tree); |
| DCHECK_NE(error_, RegExpError::kNone); |
| result->error = error_; |
| result->error_pos = error_pos_; |
| return false; |
| } |
| |
| DCHECK_NOT_NULL(tree); |
| DCHECK_EQ(error_, RegExpError::kNone); |
| #ifdef V8_ENABLE_REGEXP_DIAGNOSTICS |
| if (UNLIKELY(FLAG_trace_regexp_parser)) { |
| StdoutStream os; |
| RegExpAstNodePrinter printer(os, nullptr, zone()); |
| printer.Print(tree); |
| os << "\n"; |
| } |
| #endif |
| |
| result->tree = tree; |
| const int capture_count = captures_started(); |
| result->simple = tree->IsAtom() && simple() && capture_count == 0; |
| result->contains_anchor = contains_anchor(); |
| result->capture_count = capture_count; |
| result->named_captures = GetNamedCaptures(); |
| return true; |
| } |
| |
| void RegExpBuilder::FlushText() { |
| text_builder().FlushText(); |
| } |
| |
| void RegExpBuilder::AddCharacter(uint16_t c) { |
| pending_empty_ = false; |
| text_builder().AddCharacter(c); |
| } |
| |
| void RegExpBuilder::AddUnicodeCharacter(uint32_t c) { |
| pending_empty_ = false; |
| text_builder().AddUnicodeCharacter(c); |
| } |
| |
| void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) { |
| pending_empty_ = false; |
| text_builder().AddEscapedUnicodeCharacter(character); |
| } |
| |
| void RegExpBuilder::AddEmpty() { |
| text_builder().FlushPendingSurrogate(); |
| pending_empty_ = true; |
| } |
| |
| void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) { |
| pending_empty_ = false; |
| text_builder().AddClassRanges(cc); |
| } |
| |
| void RegExpBuilder::AddAtom(RegExpTree* term) { |
| if (term->IsEmpty()) { |
| AddEmpty(); |
| return; |
| } |
| pending_empty_ = false; |
| if (term->IsTextElement()) { |
| text_builder().AddAtom(term); |
| } else { |
| FlushText(); |
| terms_.emplace_back(term); |
| } |
| } |
| |
| void RegExpBuilder::AddTerm(RegExpTree* term) { |
| ASSERT(!term->IsEmpty()); |
| pending_empty_ = false; |
| if (term->IsTextElement()) { |
| text_builder().AddTerm(term); |
| } else { |
| FlushText(); |
| terms_.emplace_back(term); |
| } |
| } |
| |
| void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| FlushText(); |
| pending_empty_ = false; |
| terms_.emplace_back(assert); |
| } |
| |
| void RegExpBuilder::NewAlternative() { |
| FlushTerms(); |
| } |
| |
| void RegExpBuilder::FlushTerms() { |
| FlushText(); |
| size_t num_terms = terms_.size(); |
| RegExpTree* alternative; |
| if (num_terms == 0) { |
| alternative = zone()->New<RegExpEmpty>(); |
| } else if (num_terms == 1) { |
| alternative = terms_.back(); |
| } else { |
| alternative = |
| zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( |
| base::VectorOf(terms_.begin(), terms_.size()), zone())); |
| } |
| alternatives_.emplace_back(alternative); |
| terms_.clear(); |
| } |
| |
| RegExpTree* RegExpBuilder::ToRegExp() { |
| FlushTerms(); |
| size_t num_alternatives = alternatives_.size(); |
| if (num_alternatives == 0) return zone()->New<RegExpEmpty>(); |
| if (num_alternatives == 1) return alternatives_.back(); |
| return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>( |
| base::VectorOf(alternatives_.begin(), alternatives_.size()), zone())); |
| } |
| |
| bool RegExpBuilder::AddQuantifierToAtom( |
| int min, |
| int max, |
| int index, |
| RegExpQuantifier::QuantifierType quantifier_type) { |
| if (pending_empty_) { |
| pending_empty_ = false; |
| return true; |
| } |
| RegExpTree* atom = text_builder().PopLastAtom(); |
| if (atom != nullptr) { |
| FlushText(); |
| } else if (!terms_.empty()) { |
| atom = terms_.back(); |
| terms_.pop_back(); |
| if (atom->IsLookaround()) { |
| // With /u or /v, lookarounds are not quantifiable. |
| if (IsUnicodeMode()) return false; |
| // Lookbehinds are not quantifiable. |
| if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { |
| return false; |
| } |
| } |
| if (atom->max_match() == 0) { |
| // Guaranteed to only match an empty string. |
| if (min == 0) { |
| return true; |
| } |
| terms_.emplace_back(atom); |
| return true; |
| } |
| } else { |
| // Only call immediately after adding an atom or character! |
| UNREACHABLE(); |
| } |
| terms_.emplace_back( |
| zone()->New<RegExpQuantifier>(min, max, quantifier_type, index, atom)); |
| return true; |
| } |
| |
| template class RegExpParserImpl<uint8_t>; |
| template class RegExpParserImpl<uint16_t>; |
| |
| } // namespace |
| |
| // static |
| bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, |
| Zone* zone, |
| const String& input, |
| RegExpFlags flags, |
| RegExpCompileData* result) { |
| NoSafepointScope no_safepoint; |
| if (input.IsOneByteString()) { |
| RegExpParserImpl<uint8_t> p(OneByteString::DataStart(input), input.Length(), |
| flags, zone); |
| return p.Parse(result); |
| } else { |
| RegExpParserImpl<uint16_t> p(TwoByteString::DataStart(input), |
| input.Length(), flags, zone); |
| return p.Parse(result); |
| } |
| } |
| |
| // static |
| template <class CharT> |
| bool RegExpParser::VerifyRegExpSyntax(Zone* zone, |
| uintptr_t stack_limit, |
| const CharT* input, |
| int input_length, |
| RegExpFlags flags, |
| RegExpCompileData* result) { |
| RegExpParserImpl<CharT> p(input, input_length, flags, zone); |
| return p.Parse(result); |
| } |
| |
| template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(Zone*, |
| uintptr_t, |
| const uint8_t*, |
| int, |
| RegExpFlags, |
| RegExpCompileData*); |
| template bool RegExpParser::VerifyRegExpSyntax<uint16_t>(Zone*, |
| uintptr_t, |
| const uint16_t*, |
| int, |
| RegExpFlags, |
| RegExpCompileData*); |
| |
| } // namespace dart |