runtime/vm/regexp_parser.cc - sdk.git - Git at Google

 // Copyright (c) 2014, the Dart project authors.  Please see the AUTHORS file
 // for details. All rights reserved. Use of this source code is governed by a
 // BSD-style license that can be found in the LICENSE file.

 #include "vm/regexp_parser.h"

 #include "unicode/uchar.h"
 #include "unicode/uniset.h"

 #include "platform/unicode.h"

 #include "vm/longjump.h"
 #include "vm/object_store.h"

 namespace dart {

 #define Z zone()

 // Enables possessive quantifier syntax for testing.
 static const bool FLAG_regexp_possessive_quantifier = false;

 RegExpBuilder::RegExpBuilder(RegExpFlags flags)
     : zone_(Thread::Current()->zone()),
       pending_empty_(false),
       flags_(flags),
       characters_(NULL),
       pending_surrogate_(kNoPendingSurrogate),
       terms_(),
       text_(),
       alternatives_()
 #ifdef DEBUG
       ,
       last_added_(ADD_NONE)
 #endif
 {
 }

 void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) {
   ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
   FlushPendingSurrogate();
   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
   pending_surrogate_ = lead_surrogate;
 }

 void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) {
   ASSERT(Utf16::IsTrailSurrogate(trail_surrogate));
   if (pending_surrogate_ != kNoPendingSurrogate) {
     uint16_t lead_surrogate = pending_surrogate_;
     pending_surrogate_ = kNoPendingSurrogate;
     ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
     uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate);
     if (NeedsDesugaringForIgnoreCase(combined)) {
       AddCharacterClassForDesugaring(combined);
     } else {
       auto surrogate_pair = new (Z) ZoneGrowableArray<uint16_t>(2);
       surrogate_pair->Add(lead_surrogate);
       surrogate_pair->Add(trail_surrogate);
       RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_);
       AddAtom(atom);
     }
   } else {
     pending_surrogate_ = trail_surrogate;
     FlushPendingSurrogate();
   }
 }

 void RegExpBuilder::FlushPendingSurrogate() {
   if (pending_surrogate_ != kNoPendingSurrogate) {
     ASSERT(is_unicode());
     uint32_t c = pending_surrogate_;
     pending_surrogate_ = kNoPendingSurrogate;
     AddCharacterClassForDesugaring(c);
   }
 }

 void RegExpBuilder::FlushCharacters() {
   FlushPendingSurrogate();
   pending_empty_ = false;
   if (characters_ != NULL) {
     RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_);
     characters_ = NULL;
     text_.Add(atom);
     LAST(ADD_ATOM);
   }
 }

 void RegExpBuilder::FlushText() {
   FlushCharacters();
   intptr_t num_text = text_.length();
   if (num_text == 0) {
     return;
   } else if (num_text == 1) {
     terms_.Add(text_.Last());
   } else {
     RegExpText* text = new (Z) RegExpText();
     for (intptr_t i = 0; i < num_text; i++)
       text_[i]->AppendToText(text);
     terms_.Add(text);
   }
   text_.Clear();
 }

 void RegExpBuilder::AddCharacter(uint16_t c) {
   FlushPendingSurrogate();
   pending_empty_ = false;
   if (NeedsDesugaringForIgnoreCase(c)) {
     AddCharacterClassForDesugaring(c);
   } else {
     if (characters_ == NULL) {
       characters_ = new (Z) ZoneGrowableArray<uint16_t>(4);
     }
     characters_->Add(c);
     LAST(ADD_CHAR);
   }
 }

 void RegExpBuilder::AddUnicodeCharacter(uint32_t c) {
   if (c > static_cast<uint32_t>(Utf16::kMaxCodeUnit)) {
     ASSERT(is_unicode());
     uint16_t surrogates[2];
     Utf16::Encode(c, surrogates);
     AddLeadSurrogate(surrogates[0]);
     AddTrailSurrogate(surrogates[1]);
   } else if (is_unicode() && Utf16::IsLeadSurrogate(c)) {
     AddLeadSurrogate(c);
   } else if (is_unicode() && Utf16::IsTrailSurrogate(c)) {
     AddTrailSurrogate(c);
   } else {
     AddCharacter(static_cast<uint16_t>(c));
   }
 }

 void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) {
   // A lead or trail surrogate parsed via escape sequence will not
   // pair up with any preceding lead or following trail surrogate.
   FlushPendingSurrogate();
   AddUnicodeCharacter(character);
   FlushPendingSurrogate();
 }

 void RegExpBuilder::AddEmpty() {
   pending_empty_ = true;
 }

 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
   if (NeedsDesugaringForUnicode(cc)) {
     // With /u, character class needs to be desugared, so it
     // must be a standalone term instead of being part of a RegExpText.
     AddTerm(cc);
   } else {
     AddAtom(cc);
   }
 }

 void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) {
   auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c));
   AddTerm(new (Z) RegExpCharacterClass(ranges, flags_));
 }

 void RegExpBuilder::AddAtom(RegExpTree* term) {
   if (term->IsEmpty()) {
     AddEmpty();
     return;
   }
   if (term->IsTextElement()) {
     FlushCharacters();
     text_.Add(term);
   } else {
     FlushText();
     terms_.Add(term);
   }
   LAST(ADD_ATOM);
 }

 void RegExpBuilder::AddTerm(RegExpTree* term) {
   FlushText();
   terms_.Add(term);
   LAST(ADD_ATOM);
 }

 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
   FlushText();
   terms_.Add(assert);
   LAST(ADD_ASSERT);
 }

 void RegExpBuilder::NewAlternative() {
   FlushTerms();
 }

 void RegExpBuilder::FlushTerms() {
   FlushText();
   intptr_t num_terms = terms_.length();
   RegExpTree* alternative;
   if (num_terms == 0) {
     alternative = RegExpEmpty::GetInstance();
   } else if (num_terms == 1) {
     alternative = terms_.Last();
   } else {
     ZoneGrowableArray<RegExpTree*>* terms =
         new (Z) ZoneGrowableArray<RegExpTree*>();
     for (intptr_t i = 0; i < terms_.length(); i++) {
       terms->Add(terms_[i]);
     }
     alternative = new (Z) RegExpAlternative(terms);
   }
   alternatives_.Add(alternative);
   terms_.Clear();
   LAST(ADD_NONE);
 }

 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
   if (!is_unicode()) return false;
   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
   // necessarily mean that we need to desugar. It's probably nicer to have a
   // separate pass to figure out unicode desugarings.
   if (ignore_case()) return true;
   ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
   CharacterRange::Canonicalize(ranges);
   for (int i = ranges->length() - 1; i >= 0; i--) {
     uint32_t from = ranges->At(i).from();
     uint32_t to = ranges->At(i).to();
     // Check for non-BMP characters.
     if (to >= Utf16::kMaxCodeUnit) return true;
     // Check for lone surrogates.
     if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) {
       return true;
     }
   }
   return false;
 }

 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) {
   if (is_unicode() && ignore_case()) {
     icu::UnicodeSet set(c, c);
     set.closeOver(USET_CASE_INSENSITIVE);
     set.removeAllStrings();
     return set.size() > 1;
   }
   return false;
 }

 RegExpTree* RegExpBuilder::ToRegExp() {
   FlushTerms();
   intptr_t num_alternatives = alternatives_.length();
   if (num_alternatives == 0) {
     return RegExpEmpty::GetInstance();
   }
   if (num_alternatives == 1) {
     return alternatives_.Last();
   }
   ZoneGrowableArray<RegExpTree*>* alternatives =
       new (Z) ZoneGrowableArray<RegExpTree*>();
   for (intptr_t i = 0; i < alternatives_.length(); i++) {
     alternatives->Add(alternatives_[i]);
   }
   return new (Z) RegExpDisjunction(alternatives);
 }

 bool RegExpBuilder::AddQuantifierToAtom(
     intptr_t min,
     intptr_t max,
     RegExpQuantifier::QuantifierType quantifier_type) {
   if (pending_empty_) {
     pending_empty_ = false;
     return true;
   }
   RegExpTree* atom;
   if (characters_ != NULL) {
     DEBUG_ASSERT(last_added_ == ADD_CHAR);
     // Last atom was character.

     ZoneGrowableArray<uint16_t>* char_vector =
         new (Z) ZoneGrowableArray<uint16_t>();
     char_vector->AddArray(*characters_);
     intptr_t num_chars = char_vector->length();
     if (num_chars > 1) {
       ZoneGrowableArray<uint16_t>* prefix =
           new (Z) ZoneGrowableArray<uint16_t>();
       for (intptr_t i = 0; i < num_chars - 1; i++) {
         prefix->Add(char_vector->At(i));
       }
       text_.Add(new (Z) RegExpAtom(prefix, flags_));
       ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>();
       tail->Add(char_vector->At(num_chars - 1));
       char_vector = tail;
     }
     characters_ = NULL;
     atom = new (Z) RegExpAtom(char_vector, flags_);
     FlushText();
   } else if (text_.length() > 0) {
     DEBUG_ASSERT(last_added_ == ADD_ATOM);
     atom = text_.RemoveLast();
     FlushText();
   } else if (terms_.length() > 0) {
     DEBUG_ASSERT(last_added_ == ADD_ATOM);
     atom = terms_.RemoveLast();
     if (auto lookaround = atom->AsLookaround()) {
       // With /u, lookarounds are not quantifiable.
       if (is_unicode()) return false;
       // Lookbehinds are not quantifiable.
       if (lookaround->type() == RegExpLookaround::LOOKBEHIND) {
         return false;
       }
     }
     if (atom->max_match() == 0) {
       // Guaranteed to only match an empty string.
       LAST(ADD_TERM);
       if (min == 0) {
         return true;
       }
       terms_.Add(atom);
       return true;
     }
   } else {
     // Only call immediately after adding an atom or character!
     UNREACHABLE();
   }
   terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));
   LAST(ADD_TERM);
   return true;
 }

 // ----------------------------------------------------------------------------
 // Implementation of Parser

 RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags)
     : zone_(Thread::Current()->zone()),
       captures_(nullptr),
       named_captures_(nullptr),
       named_back_references_(nullptr),
       in_(in),
       current_(kEndMarker),
       next_pos_(0),
       captures_started_(0),
       capture_count_(0),
       has_more_(true),
       top_level_flags_(flags),
       simple_(false),
       contains_anchor_(false),
       is_scanned_for_captures_(false),
       has_named_captures_(false) {
   Advance();
 }

 inline uint32_t RegExpParser::ReadNext(bool update_position) {
   intptr_t position = next_pos_;
   const uint16_t c0 = in().CharAt(position);
   uint32_t c = c0;
   position++;
   if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) {
     const uint16_t c1 = in().CharAt(position);
     if (Utf16::IsTrailSurrogate(c1)) {
       c = Utf16::Decode(c0, c1);
       position++;
     }
   }
   if (update_position) next_pos_ = position;
   return c;
 }

 uint32_t RegExpParser::Next() {
   if (has_next()) {
     return ReadNext(false);
   } else {
     return kEndMarker;
   }
 }

 void RegExpParser::Advance() {
   if (has_next()) {
     current_ = ReadNext(true);
   } else {
     current_ = kEndMarker;
     // Advance so that position() points to 1 after the last character. This is
     // important so that Reset() to this position works correctly.
     next_pos_ = in().Length() + 1;
     has_more_ = false;
   }
 }

 void RegExpParser::Reset(intptr_t pos) {
   next_pos_ = pos;
   has_more_ = (pos < in().Length());
   Advance();
 }

 void RegExpParser::Advance(intptr_t dist) {
   next_pos_ += dist - 1;
   Advance();
 }

 bool RegExpParser::simple() {
   return simple_;
 }

 bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) {
   switch (c) {
     case '^':
     case '$':
     case '\\':
     case '.':
     case '*':
     case '+':
     case '?':
     case '(':
     case ')':
     case '[':
     case ']':
     case '{':
     case '}':
     case '|':
     case '/':
       return true;
     default:
       break;
   }
   return false;
 }

 void RegExpParser::ReportError(const char* message) {
   // Zip to the end to make sure the no more input is read.
   current_ = kEndMarker;
   next_pos_ = in().Length();

   // Throw a FormatException on parsing failures.
   const String& msg = String::Handle(
       String::Concat(String::Handle(String::New(message)), in()));
   const Array& args = Array::Handle(Array::New(1));
   args.SetAt(0, msg);
   Exceptions::ThrowByType(Exceptions::kFormat, args);
   UNREACHABLE();
 }

 // Pattern ::
 //   Disjunction
 RegExpTree* RegExpParser::ParsePattern() {
   RegExpTree* result = ParseDisjunction();
   PatchNamedBackReferences();
   ASSERT(!has_more());
   // If the result of parsing is a literal string atom, and it has the
   // same length as the input, then the atom is identical to the input.
   if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
     simple_ = true;
   }
   return result;
 }

 // Used for error messages where we would have fallen back on treating an
 // escape as the identity escape, but we are in Unicode mode.
 static const char* kUnicodeIdentity =
     "Invalid identity escape in Unicode pattern";

 // Disjunction ::
 //   Alternative
 //   Alternative | Disjunction
 // Alternative ::
 //   [empty]
 //   Term Alternative
 // Term ::
 //   Assertion
 //   Atom
 //   Atom Quantifier
 RegExpTree* RegExpParser::ParseDisjunction() {
   // Used to store current state while parsing subexpressions.
   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
                                   0, nullptr, top_level_flags_, Z);
   RegExpParserState* stored_state = &initial_state;
   // Cache the builder in a local variable for quick access.
   RegExpBuilder* builder = initial_state.builder();
   while (true) {
     switch (current()) {
       case kEndMarker:
         if (stored_state->IsSubexpression()) {
           // Inside a parenthesized group when hitting end of input.
           ReportError("Unterminated group");
           UNREACHABLE();
         }
         ASSERT(INITIAL == stored_state->group_type());
         // Parsing completed successfully.
         return builder->ToRegExp();
       case ')': {
         if (!stored_state->IsSubexpression()) {
           ReportError("Unmatched ')'");
           UNREACHABLE();
         }
         ASSERT(INITIAL != stored_state->group_type());

         Advance();
         // End disjunction parsing and convert builder content to new single
         // regexp atom.
         RegExpTree* body = builder->ToRegExp();

         intptr_t end_capture_index = captures_started();

         intptr_t capture_index = stored_state->capture_index();
         SubexpressionType group_type = stored_state->group_type();

         // Build result of subexpression.
         if (group_type == CAPTURE) {
           if (stored_state->IsNamedCapture()) {
             CreateNamedCaptureAtIndex(stored_state->capture_name(),
                                       capture_index);
           }
           RegExpCapture* capture = GetCapture(capture_index);
           capture->set_body(body);
           body = capture;
         } else if (group_type != GROUPING) {
           ASSERT(group_type == POSITIVE_LOOKAROUND ||
                  group_type == NEGATIVE_LOOKAROUND);
           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
           body = new (Z) RegExpLookaround(
               body, is_positive, end_capture_index - capture_index,
               capture_index, stored_state->lookaround_type());
         }

         // Restore previous state.
         stored_state = stored_state->previous_state();
         builder = stored_state->builder();

         builder->AddAtom(body);
         // For compatibility with JSC and ES3, we allow quantifiers after
         // lookaheads, and break in all cases.
         break;
       }
       case '|': {
         Advance();
         builder->NewAlternative();
         continue;
       }
       case '*':
       case '+':
       case '?':
         ReportError("Nothing to repeat");
         UNREACHABLE();
       case '^': {
         Advance();
         if (builder->is_multi_line()) {
           builder->AddAssertion(new (Z) RegExpAssertion(
               RegExpAssertion::START_OF_LINE, builder->flags()));
         } else {
           builder->AddAssertion(new (Z) RegExpAssertion(
               RegExpAssertion::START_OF_INPUT, builder->flags()));
           set_contains_anchor();
         }
         continue;
       }
       case '$': {
         Advance();
         RegExpAssertion::AssertionType assertion_type =
             builder->is_multi_line() ? RegExpAssertion::END_OF_LINE
                                      : RegExpAssertion::END_OF_INPUT;
         builder->AddAssertion(
             new (Z) RegExpAssertion(assertion_type, builder->flags()));
         continue;
       }
       case '.': {
         Advance();
         auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
         if (builder->is_dot_all()) {
           // Everything.
           CharacterRange::AddClassEscape(
               '*', ranges,
               /*add_unicode_case_equivalents=*/false);
         } else {
           // everything except \x0a, \x0d, \u2028 and \u2029
           CharacterRange::AddClassEscape(
               '.', ranges,
               /*add_unicode_case_equivalents=*/false);
         }
         RegExpCharacterClass* cc =
             new (Z) RegExpCharacterClass(ranges, builder->flags());
         builder->AddCharacterClass(cc);
         break;
       }
       case '(': {
         stored_state = ParseOpenParenthesis(stored_state);
         builder = stored_state->builder();
         continue;
       }
       case '[': {
         RegExpTree* atom = ParseCharacterClass(builder);
         builder->AddCharacterClass(atom->AsCharacterClass());
         break;
       }
       // Atom ::
       //   \ AtomEscape
       case '\\':
         switch (Next()) {
           case kEndMarker:
             ReportError("\\ at end of pattern");
             UNREACHABLE();
           case 'b':
             Advance(2);
             builder->AddAssertion(new (Z) RegExpAssertion(
                 RegExpAssertion::BOUNDARY, builder->flags()));
             continue;
           case 'B':
             Advance(2);
             builder->AddAssertion(new (Z) RegExpAssertion(
                 RegExpAssertion::NON_BOUNDARY, builder->flags()));
             continue;
           // AtomEscape ::
           //   CharacterClassEscape
           //
           // CharacterClassEscape :: one of
           //   d D s S w W
           case 'd':
           case 'D':
           case 's':
           case 'S':
           case 'w':
           case 'W': {
             uint32_t c = Next();
             Advance(2);
             auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
             CharacterRange::AddClassEscape(
                 c, ranges, is_unicode() && builder->ignore_case());
             RegExpCharacterClass* cc =
                 new (Z) RegExpCharacterClass(ranges, builder->flags());
             builder->AddCharacterClass(cc);
             break;
           }
           case 'p':
           case 'P': {
             uint32_t p = Next();
             Advance(2);

             if (is_unicode()) {
               auto name_1 = new (Z) ZoneGrowableArray<char>();
               auto name_2 = new (Z) ZoneGrowableArray<char>();
               auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
               if (ParsePropertyClassName(name_1, name_2)) {
                 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
                   RegExpCharacterClass* cc =
                       new (Z) RegExpCharacterClass(ranges, builder->flags());
                   builder->AddCharacterClass(cc);
                   break;
                 }
               }
               ReportError("Invalid property name");
               UNREACHABLE();
             } else {
               builder->AddCharacter(p);
             }
             break;
           }
           case '1':
           case '2':
           case '3':
           case '4':
           case '5':
           case '6':
           case '7':
           case '8':
           case '9': {
             intptr_t index = 0;
             if (ParseBackReferenceIndex(&index)) {
               if (stored_state->IsInsideCaptureGroup(index)) {
                 // The back reference is inside the capture group it refers to.
                 // Nothing can possibly have been captured yet, so we use empty
                 // instead. This ensures that, when checking a back reference,
                 // the capture registers of the referenced capture are either
                 // both set or both cleared.
                 builder->AddEmpty();
               } else {
                 RegExpCapture* capture = GetCapture(index);
                 RegExpTree* atom =
                     new (Z) RegExpBackReference(capture, builder->flags());
                 builder->AddAtom(atom);
               }
               break;
             }
             // With /u, no identity escapes except for syntax characters are
             // allowed. Otherwise, all identity escapes are allowed.
             if (is_unicode()) {
               ReportError(kUnicodeIdentity);
               UNREACHABLE();
             }
             uint32_t first_digit = Next();
             if (first_digit == '8' || first_digit == '9') {
               builder->AddCharacter(first_digit);
               Advance(2);
               break;
             }
           }
             FALL_THROUGH;
           case '0': {
             Advance();
             if (is_unicode() && Next() >= '0' && Next() <= '9') {
               // With /u, decimal escape with leading 0 are not parsed as octal.
               ReportError("Invalid decimal escape");
               UNREACHABLE();
             }
             uint32_t octal = ParseOctalLiteral();
             builder->AddCharacter(octal);
             break;
           }
           // ControlEscape :: one of
           //   f n r t v
           case 'f':
             Advance(2);
             builder->AddCharacter('\f');
             break;
           case 'n':
             Advance(2);
             builder->AddCharacter('\n');
             break;
           case 'r':
             Advance(2);
             builder->AddCharacter('\r');
             break;
           case 't':
             Advance(2);
             builder->AddCharacter('\t');
             break;
           case 'v':
             Advance(2);
             builder->AddCharacter('\v');
             break;
           case 'c': {
             Advance();
             uint32_t controlLetter = Next();
             // Special case if it is an ASCII letter.
             // Convert lower case letters to uppercase.
             uint32_t letter = controlLetter & ~('a' ^ 'A');
             if (letter < 'A' || 'Z' < letter) {
               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
               // This is outside the specification. We match JSC in
               // reading the backslash as a literal character instead
               // of as starting an escape.
               if (is_unicode()) {
                 // With /u, invalid escapes are not treated as identity escapes.
                 ReportError(kUnicodeIdentity);
                 UNREACHABLE();
               }
               builder->AddCharacter('\\');
             } else {
               Advance(2);
               builder->AddCharacter(controlLetter & 0x1f);
             }
             break;
           }
           case 'x': {
             Advance(2);
             uint32_t value;
             if (ParseHexEscape(2, &value)) {
               builder->AddCharacter(value);
             } else if (!is_unicode()) {
               builder->AddCharacter('x');
             } else {
               // With /u, invalid escapes are not treated as identity escapes.
               ReportError(kUnicodeIdentity);
               UNREACHABLE();
             }
             break;
           }
           case 'u': {
             Advance(2);
             uint32_t value;
             if (ParseUnicodeEscape(&value)) {
               builder->AddEscapedUnicodeCharacter(value);
             } else if (!is_unicode()) {
               builder->AddCharacter('u');
             } else {
               // With /u, invalid escapes are not treated as identity escapes.
               ReportError(kUnicodeIdentity);
               UNREACHABLE();
             }
             break;
           }
           case 'k':
             // Either an identity escape or a named back-reference.  The two
             // interpretations are mutually exclusive: '\k' is interpreted as
             // an identity escape for non-Unicode patterns without named
             // capture groups, and as the beginning of a named back-reference
             // in all other cases.
             if (is_unicode() || HasNamedCaptures()) {
               Advance(2);
               ParseNamedBackReference(builder, stored_state);
               break;
             }
             FALL_THROUGH;
           default:
             Advance();
             // With the unicode flag, no identity escapes except for syntax
             // characters are allowed. Otherwise, all identity escapes are
             // allowed.
             if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) {
               builder->AddCharacter(current());
               Advance();
             } else {
               ReportError(kUnicodeIdentity);
               UNREACHABLE();
             }
             break;
         }
         break;
       case '{': {
         intptr_t dummy;
         if (ParseIntervalQuantifier(&dummy, &dummy)) {
           ReportError("Nothing to repeat");
           UNREACHABLE();
         }
       }
         FALL_THROUGH;
       case '}':
       case ']':
         if (is_unicode()) {
           ReportError("Lone quantifier brackets");
           UNREACHABLE();
         }
         FALL_THROUGH;
       default:
         builder->AddUnicodeCharacter(current());
         Advance();
         break;
     }  // end switch(current())

     intptr_t min;
     intptr_t max;
     switch (current()) {
       // QuantifierPrefix ::
       //   *
       //   +
       //   ?
       //   {
       case '*':
         min = 0;
         max = RegExpTree::kInfinity;
         Advance();
         break;
       case '+':
         min = 1;
         max = RegExpTree::kInfinity;
         Advance();
         break;
       case '?':
         min = 0;
         max = 1;
         Advance();
         break;
       case '{':
         if (ParseIntervalQuantifier(&min, &max)) {
           if (max < min) {
             ReportError("numbers out of order in {} quantifier.");
             UNREACHABLE();
           }
           break;
         } else {
           continue;
         }
       default:
         continue;
     }
     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
     if (current() == '?') {
       quantifier_type = RegExpQuantifier::NON_GREEDY;
       Advance();
     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
       // FLAG_regexp_possessive_quantifier is a debug-only flag.
       quantifier_type = RegExpQuantifier::POSSESSIVE;
       Advance();
     }
     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
       ReportError("invalid quantifier.");
       UNREACHABLE();
     }
   }
 }

 #ifdef DEBUG
 // Currently only used in an ASSERT.
 static bool IsSpecialClassEscape(uint32_t c) {
   switch (c) {
     case 'd':
     case 'D':
     case 's':
     case 'S':
     case 'w':
     case 'W':
       return true;
     default:
       return false;
   }
 }
 #endif

 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
     RegExpParserState* state) {
   RegExpLookaround::Type lookaround_type = state->lookaround_type();
   bool is_named_capture = false;
   const RegExpCaptureName* capture_name = nullptr;
   SubexpressionType subexpr_type = CAPTURE;
   Advance();
   if (current() == '?') {
     switch (Next()) {
       case ':':
         Advance(2);
         subexpr_type = GROUPING;
         break;
       case '=':
         Advance(2);
         lookaround_type = RegExpLookaround::LOOKAHEAD;
         subexpr_type = POSITIVE_LOOKAROUND;
         break;
       case '!':
         Advance(2);
         lookaround_type = RegExpLookaround::LOOKAHEAD;
         subexpr_type = NEGATIVE_LOOKAROUND;
         break;
       case '<':
         Advance();
         if (Next() == '=') {
           Advance(2);
           lookaround_type = RegExpLookaround::LOOKBEHIND;
           subexpr_type = POSITIVE_LOOKAROUND;
           break;
         } else if (Next() == '!') {
           Advance(2);
           lookaround_type = RegExpLookaround::LOOKBEHIND;
           subexpr_type = NEGATIVE_LOOKAROUND;
           break;
         }
         is_named_capture = true;
         has_named_captures_ = true;
         Advance();
         break;
       default:
         ReportError("Invalid group");
         UNREACHABLE();
     }
   }

   if (subexpr_type == CAPTURE) {
     if (captures_started_ >= kMaxCaptures) {
       ReportError("Too many captures");
       UNREACHABLE();
     }
     captures_started_++;

     if (is_named_capture) {
       capture_name = ParseCaptureGroupName();
     }
   }
   // Store current state and begin new disjunction parsing.
   return new (Z)
       RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
                         capture_name, state->builder()->flags(), Z);
 }

 // In order to know whether an escape is a backreference or not we have to scan
 // the entire regexp and find the number of capturing parentheses.  However we
 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
 // is called when needed.  It can see the difference between capturing and
 // noncapturing parentheses and can skip character classes and backslash-escaped
 // characters.
 void RegExpParser::ScanForCaptures() {
   ASSERT(!is_scanned_for_captures_);
   const intptr_t saved_position = position();
   // Start with captures started previous to current position
   intptr_t capture_count = captures_started();
   // Add count of captures after this position.
   uintptr_t n;
   while ((n = current()) != kEndMarker) {
     Advance();
     switch (n) {
       case '\\':
         Advance();
         break;
       case '[': {
         uintptr_t c;
         while ((c = current()) != kEndMarker) {
           Advance();
           if (c == '\\') {
             Advance();
           } else {
             if (c == ']') break;
           }
         }
         break;
       }
       case '(':
         // At this point we could be in
         // * a non-capturing group '(:',
         // * a lookbehind assertion '(?<=' '(?<!'
         // * or a named capture '(?<'.
         //
         // Of these, only named captures are capturing groups.
         if (current() == '?') {
           Advance();
           if (current() != '<') break;

           Advance();
           if (current() == '=' || current() == '!') break;

           // Found a possible named capture. It could turn out to be a syntax
           // error (e.g. an unterminated or invalid name), but that distinction
           // does not matter for our purposes.
           has_named_captures_ = true;
         }
         capture_count++;
         break;
     }
   }
   capture_count_ = capture_count;
   is_scanned_for_captures_ = true;
   Reset(saved_position);
 }

 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
   ASSERT('\\' == current());
   ASSERT('1' <= Next() && Next() <= '9');
   // Try to parse a decimal literal that is no greater than the total number
   // of left capturing parentheses in the input.
   intptr_t start = position();
   intptr_t value = Next() - '0';
   Advance(2);
   while (true) {
     uint32_t c = current();
     if (Utils::IsDecimalDigit(c)) {
       value = 10 * value + (c - '0');
       if (value > kMaxCaptures) {
         Reset(start);
         return false;
       }
       Advance();
     } else {
       break;
     }
   }
   if (value > captures_started()) {
     if (!is_scanned_for_captures_) ScanForCaptures();
     if (value > capture_count_) {
       Reset(start);
       return false;
     }
   }
   *index_out = value;
   return true;
 }

 namespace {

 static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) {
   return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$';
 }

 // ES#sec-names-and-keywords Names and Keywords
 // UnicodeIDStart, '$', '_' and '\'
 static bool IsIdentifierStartSlow(uint32_t c) {
   // cannot use u_isIDStart because it does not work for
   // Other_ID_Start characters.
   return u_hasBinaryProperty(c, UCHAR_ID_START) ||
          (c < 0x60 && (c == '$' || c == '\\' || c == '_'));
 }

 // ES#sec-names-and-keywords Names and Keywords
 // UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ
 static bool IsIdentifierPartSlow(uint32_t c) {
   const uint32_t kZeroWidthNonJoiner = 0x200C;
   const uint32_t kZeroWidthJoiner = 0x200D;
   // Can't use u_isIDPart because it does not work for
   // Other_ID_Continue characters.
   return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) ||
          (c < 0x60 && (c == '$' || c == '\\' || c == '_')) ||
          c == kZeroWidthNonJoiner || c == kZeroWidthJoiner;
 }

 static inline bool IsIdentifierStart(uint32_t c) {
   if (c > 127) return IsIdentifierStartSlow(c);
   return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c);
 }

 static inline bool IsIdentifierPart(uint32_t c) {
   if (c > 127) return IsIdentifierPartSlow(c);
   return IsAsciiIdentifierPart(c);
 }

 static bool IsSameName(const RegExpCaptureName* name1,
                        const RegExpCaptureName* name2) {
   if (name1->length() != name2->length()) return false;
   for (intptr_t i = 0; i < name1->length(); i++) {
     if (name1->At(i) != name2->At(i)) return false;
   }
   return true;
 }

 }  // end namespace

 static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) {
   if (code_unit <= Utf16::kMaxCodeUnit) {
     v->Add(code_unit);
   } else {
     uint16_t units[2];
     Utf16::Encode(code_unit, units);
     v->Add(units[0]);
     v->Add(units[1]);
   }
 }

 const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() {
   auto name = new (Z) RegExpCaptureName();

   bool at_start = true;
   while (true) {
     uint32_t c = current();
     Advance();

     // Convert unicode escapes.
     if (c == '\\' && current() == 'u') {
       Advance();
       if (!ParseUnicodeEscape(&c)) {
         ReportError("Invalid Unicode escape sequence");
         UNREACHABLE();
       }
     }

     // The backslash char is misclassified as both ID_Start and ID_Continue.
     if (c == '\\') {
       ReportError("Invalid capture group name");
       UNREACHABLE();
     }

     if (at_start) {
       if (!IsIdentifierStart(c)) {
         ReportError("Invalid capture group name");
         UNREACHABLE();
       }
       PushCodeUnit(name, c);
       at_start = false;
     } else {
       if (c == '>') {
         break;
       } else if (IsIdentifierPart(c)) {
         PushCodeUnit(name, c);
       } else {
         ReportError("Invalid capture group name");
         UNREACHABLE();
       }
     }
   }

   return name;
 }

 intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) {
   for (const auto& capture : *named_captures_) {
     if (IsSameName(name, capture->name())) return capture->index();
   }
   return -1;
 }

 void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name,
                                              intptr_t index) {
   ASSERT(0 < index && index <= captures_started_);
   ASSERT(name != nullptr);

   if (named_captures_ == nullptr) {
     named_captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(1);
   } else {
     // Check for duplicates and bail if we find any. Currently O(n^2).
     if (GetNamedCaptureIndex(name) >= 0) {
       ReportError("Duplicate capture group name");
       UNREACHABLE();
     }
   }

   RegExpCapture* capture = GetCapture(index);
   ASSERT(capture->name() == nullptr);

   capture->set_name(name);
   named_captures_->Add(capture);
 }

 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
                                            RegExpParserState* state) {
   // The parser is assumed to be on the '<' in \k<name>.
   if (current() != '<') {
     ReportError("Invalid named reference");
     UNREACHABLE();
   }

   Advance();
   const RegExpCaptureName* name = ParseCaptureGroupName();
   if (name == nullptr) {
     return false;
   }

   if (state->IsInsideCaptureGroup(name)) {
     builder->AddEmpty();
   } else {
     RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags());
     atom->set_name(name);

     builder->AddAtom(atom);

     if (named_back_references_ == nullptr) {
       named_back_references_ =
           new (Z) ZoneGrowableArray<RegExpBackReference*>(1);
     }
     named_back_references_->Add(atom);
   }

   return true;
 }

 void RegExpParser::PatchNamedBackReferences() {
   if (named_back_references_ == nullptr) return;

   if (named_captures_ == nullptr) {
     ReportError("Invalid named capture referenced");
     return;
   }

   // Look up and patch the actual capture for each named back reference.
   // Currently O(n^2), optimize if necessary.
   for (intptr_t i = 0; i < named_back_references_->length(); i++) {
     RegExpBackReference* ref = named_back_references_->At(i);
     intptr_t index = GetNamedCaptureIndex(ref->name());

     if (index < 0) {
       ReportError("Invalid named capture referenced");
       UNREACHABLE();
     }
     ref->set_capture(GetCapture(index));
   }
 }

 RegExpCapture* RegExpParser::GetCapture(intptr_t index) {
   // The index for the capture groups are one-based. Its index in the list is
   // zero-based.
   const intptr_t know_captures =
       is_scanned_for_captures_ ? capture_count_ : captures_started_;
   ASSERT(index <= know_captures);
   if (captures_ == nullptr) {
     captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures);
   }
   while (captures_->length() < know_captures) {
     captures_->Add(new (Z) RegExpCapture(captures_->length() + 1));
   }
   return captures_->At(index - 1);
 }

 ArrayPtr RegExpParser::CreateCaptureNameMap() {
   if (named_captures_ == nullptr || named_captures_->is_empty()) {
     return Array::null();
   }

   const intptr_t len = named_captures_->length() * 2;

   const Array& array = Array::Handle(Array::New(len));

   auto& name = String::Handle();
   auto& smi = Smi::Handle();
   for (intptr_t i = 0; i < named_captures_->length(); i++) {
     RegExpCapture* capture = named_captures_->At(i);
     name =
         String::FromUTF16(capture->name()->data(), capture->name()->length());
     smi = Smi::New(capture->index());
     array.SetAt(i * 2, name);
     array.SetAt(i * 2 + 1, smi);
   }

   return array.ptr();
 }

 bool RegExpParser::HasNamedCaptures() {
   if (has_named_captures_ || is_scanned_for_captures_) {
     return has_named_captures_;
   }

   ScanForCaptures();
   ASSERT(is_scanned_for_captures_);
   return has_named_captures_;
 }

 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) {
   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
     if (s->group_type() != CAPTURE) continue;
     // Return true if we found the matching capture index.
     if (index == s->capture_index()) return true;
     // Abort if index is larger than what has been parsed up till this state.
     if (index > s->capture_index()) return false;
   }
   return false;
 }

 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
     const RegExpCaptureName* name) {
   ASSERT(name != nullptr);
   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
     if (s->capture_name() == nullptr) continue;
     if (IsSameName(s->capture_name(), name)) return true;
   }
   return false;
 }

 // QuantifierPrefix ::
 //   { DecimalDigits }
 //   { DecimalDigits , }
 //   { DecimalDigits , DecimalDigits }
 //
 // Returns true if parsing succeeds, and set the min_out and max_out
 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
                                            intptr_t* max_out) {
   ASSERT(current() == '{');
   intptr_t start = position();
   Advance();
   intptr_t min = 0;
   if (!Utils::IsDecimalDigit(current())) {
     Reset(start);
     return false;
   }
   while (Utils::IsDecimalDigit(current())) {
     intptr_t next = current() - '0';
     if (min > (RegExpTree::kInfinity - next) / 10) {
       // Overflow. Skip past remaining decimal digits and return -1.
       do {
         Advance();
       } while (Utils::IsDecimalDigit(current()));
       min = RegExpTree::kInfinity;
       break;
     }
     min = 10 * min + next;
     Advance();
   }
   intptr_t max = 0;
   if (current() == '}') {
     max = min;
     Advance();
   } else if (current() == ',') {
     Advance();
     if (current() == '}') {
       max = RegExpTree::kInfinity;
       Advance();
     } else {
       while (Utils::IsDecimalDigit(current())) {
         intptr_t next = current() - '0';
         if (max > (RegExpTree::kInfinity - next) / 10) {
           do {
             Advance();
           } while (Utils::IsDecimalDigit(current()));
           max = RegExpTree::kInfinity;
           break;
         }
         max = 10 * max + next;
         Advance();
       }
       if (current() != '}') {
         Reset(start);
         return false;
       }
       Advance();
     }
   } else {
     Reset(start);
     return false;
   }
   *min_out = min;
   *max_out = max;
   return true;
 }

 uint32_t RegExpParser::ParseOctalLiteral() {
   ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
   // For compatibility with some other browsers (not all), we parse
   // up to three octal digits with a value below 256.
   uint32_t value = current() - '0';
   Advance();
   if ('0' <= current() && current() <= '7') {
     value = value * 8 + current() - '0';
     Advance();
     if (value < 32 && '0' <= current() && current() <= '7') {
       value = value * 8 + current() - '0';
       Advance();
     }
   }
   return value;
 }

 // Returns the value (0 .. 15) of a hexadecimal character c.
 // If c is not a legal hexadecimal character, returns a value < 0.
 static inline intptr_t HexValue(uint32_t c) {
   c -= '0';
   if (static_cast<unsigned>(c) <= 9) return c;
   c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
   if (static_cast<unsigned>(c) <= 5) return c + 10;
   return -1;
 }

 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {
   intptr_t start = position();
   uint32_t val = 0;
   bool done = false;
   for (intptr_t i = 0; !done; i++) {
     uint32_t c = current();
     intptr_t d = HexValue(c);
     if (d < 0) {
       Reset(start);
       return false;
     }
     val = val * 16 + d;
     Advance();
     if (i == length - 1) {
       done = true;
     }
   }
   *value = val;
   return true;
 }

 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
 bool RegExpParser::ParseUnicodeEscape(uint32_t* value) {
   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
   // allowed). In the latter case, the number of hex digits between { } is
   // arbitrary. \ and u have already been read.
   if (current() == '{' && is_unicode()) {
     int start = position();
     Advance();
     if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) {
       if (current() == '}') {
         Advance();
         return true;
       }
     }
     Reset(start);
     return false;
   }
   // \u but no {, or \u{...} escapes not allowed.
   bool result = ParseHexEscape(4, value);
   if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) &&
       current() == '\\') {
     // Attempt to read trail surrogate.
     int start = position();
     if (Next() == 'u') {
       Advance(2);
       uint32_t trail;
       if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) {
         *value = Utf16::Decode(static_cast<uint16_t>(*value),
                                static_cast<uint16_t>(trail));
         return true;
       }
     }
     Reset(start);
   }
   return result;
 }

 namespace {

 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
   if (short_name != nullptr && strcmp(property_name, short_name) == 0) {
     return true;
   }
   for (int i = 0;; i++) {
     const char* long_name = u_getPropertyName(
         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
     if (long_name == nullptr) break;
     if (strcmp(property_name, long_name) == 0) return true;
   }
   return false;
 }

 bool IsExactPropertyValueAlias(const char* property_value_name,
                                UProperty property,
                                int32_t property_value) {
   const char* short_name =
       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
     return true;
   }
   for (int i = 0;; i++) {
     const char* long_name = u_getPropertyValueName(
         property, property_value,
         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
     if (long_name == nullptr) break;
     if (strcmp(property_value_name, long_name) == 0) return true;
   }
   return false;
 }

 bool LookupPropertyValueName(UProperty property,
                              const char* property_value_name,
                              bool negate,
                              ZoneGrowableArray<CharacterRange>* result) {
   UProperty property_for_lookup = property;
   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
     // For the property Script_Extensions, we have to do the property value
     // name lookup as if the property is Script.
     property_for_lookup = UCHAR_SCRIPT;
   }
   int32_t property_value =
       u_getPropertyValueEnum(property_for_lookup, property_value_name);
   if (property_value == UCHAR_INVALID_CODE) return false;

   // We require the property name to match exactly to one of the property value
   // aliases. However, u_getPropertyValueEnum uses loose matching.
   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
                                  property_value)) {
     return false;
   }

   UErrorCode ec = U_ZERO_ERROR;
   icu::UnicodeSet set;
   set.applyIntPropertyValue(property, property_value, ec);
   bool success = ec == U_ZERO_ERROR && (set.isEmpty() == 0);

   if (success) {
     set.removeAllStrings();
     if (negate) set.complement();
     for (int i = 0; i < set.getRangeCount(); i++) {
       result->Add(
           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
     }
   }
   return success;
 }

 template <size_t N>
 inline bool NameEquals(const char* name, const char (&literal)[N]) {
   return strncmp(name, literal, N + 1) == 0;
 }

 bool LookupSpecialPropertyValueName(const char* name,
                                     ZoneGrowableArray<CharacterRange>* result,
                                     bool negate) {
   if (NameEquals(name, "Any")) {
     if (negate) {
       // Leave the list of character ranges empty, since the negation of 'Any'
       // is the empty set.
     } else {
       result->Add(CharacterRange::Everything());
     }
   } else if (NameEquals(name, "ASCII")) {
     result->Add(negate ? CharacterRange::Range(0x80, Utf::kMaxCodePoint)
                        : CharacterRange::Range(0x0, 0x7F));
   } else if (NameEquals(name, "Assigned")) {
     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
                                    !negate, result);
   } else {
     return false;
   }
   return true;
 }

 // Explicitly list supported binary properties. The spec forbids supporting
 // properties outside of this set to ensure interoperability.
 bool IsSupportedBinaryProperty(UProperty property) {
   switch (property) {
     case UCHAR_ALPHABETIC:
     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
     case UCHAR_ASCII_HEX_DIGIT:
     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
     case UCHAR_BIDI_CONTROL:
     case UCHAR_BIDI_MIRRORED:
     case UCHAR_CASE_IGNORABLE:
     case UCHAR_CASED:
     case UCHAR_CHANGES_WHEN_CASEFOLDED:
     case UCHAR_CHANGES_WHEN_CASEMAPPED:
     case UCHAR_CHANGES_WHEN_LOWERCASED:
     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
     case UCHAR_CHANGES_WHEN_TITLECASED:
     case UCHAR_CHANGES_WHEN_UPPERCASED:
     case UCHAR_DASH:
     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
     case UCHAR_DEPRECATED:
     case UCHAR_DIACRITIC:
     case UCHAR_EMOJI:
     case UCHAR_EMOJI_COMPONENT:
     case UCHAR_EMOJI_MODIFIER_BASE:
     case UCHAR_EMOJI_MODIFIER:
     case UCHAR_EMOJI_PRESENTATION:
     case UCHAR_EXTENDED_PICTOGRAPHIC:
     case UCHAR_EXTENDER:
     case UCHAR_GRAPHEME_BASE:
     case UCHAR_GRAPHEME_EXTEND:
     case UCHAR_HEX_DIGIT:
     case UCHAR_ID_CONTINUE:
     case UCHAR_ID_START:
     case UCHAR_IDEOGRAPHIC:
     case UCHAR_IDS_BINARY_OPERATOR:
     case UCHAR_IDS_TRINARY_OPERATOR:
     case UCHAR_JOIN_CONTROL:
     case UCHAR_LOGICAL_ORDER_EXCEPTION:
     case UCHAR_LOWERCASE:
     case UCHAR_MATH:
     case UCHAR_NONCHARACTER_CODE_POINT:
     case UCHAR_PATTERN_SYNTAX:
     case UCHAR_PATTERN_WHITE_SPACE:
     case UCHAR_QUOTATION_MARK:
     case UCHAR_RADICAL:
     case UCHAR_REGIONAL_INDICATOR:
     case UCHAR_S_TERM:
     case UCHAR_SOFT_DOTTED:
     case UCHAR_TERMINAL_PUNCTUATION:
     case UCHAR_UNIFIED_IDEOGRAPH:
     case UCHAR_UPPERCASE:
     case UCHAR_VARIATION_SELECTOR:
     case UCHAR_WHITE_SPACE:
     case UCHAR_XID_CONTINUE:
     case UCHAR_XID_START:
       return true;
     default:
       break;
   }
   return false;
 }

 bool IsUnicodePropertyValueCharacter(char c) {
   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
   //
   // Note that using this to validate each parsed char is quite conservative.
   // A possible alternative solution would be to only ensure the parsed
   // property name/value candidate string does not contain '\0' characters and
   // let ICU lookups trigger the final failure.
   if (Utils::IsAlphaNumeric(c)) return true;
   return (c == '_');
 }

 }  // anonymous namespace

 bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
                                           ZoneGrowableArray<char>* name_2) {
   ASSERT(name_1->is_empty());
   ASSERT(name_2->is_empty());
   // Parse the property class as follows:
   // - In \p{name}, 'name' is interpreted
   //   - either as a general category property value name.
   //   - or as a binary property name.
   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
   //   and 'value' is interpreted as one of the available property value names.
   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
   // - Loose matching is not applied.
   if (current() == '{') {
     // Parse \p{[PropertyName=]PropertyNameValue}
     for (Advance(); current() != '}' && current() != '='; Advance()) {
       if (!IsUnicodePropertyValueCharacter(current())) return false;
       if (!has_next()) return false;
       name_1->Add(static_cast<char>(current()));
     }
     if (current() == '=') {
       for (Advance(); current() != '}'; Advance()) {
         if (!IsUnicodePropertyValueCharacter(current())) return false;
         if (!has_next()) return false;
         name_2->Add(static_cast<char>(current()));
       }
       name_2->Add(0);  // null-terminate string.
     }
   } else {
     return false;
   }
   Advance();
   name_1->Add(0);  // null-terminate string.

   ASSERT(static_cast<size_t>(name_1->length() - 1) == strlen(name_1->data()));
   ASSERT(name_2->is_empty() ||
          static_cast<size_t>(name_2->length() - 1) == strlen(name_2->data()));
   return true;
 }

 bool RegExpParser::AddPropertyClassRange(
     ZoneGrowableArray<CharacterRange>* add_to,
     bool negate,
     ZoneGrowableArray<char>* name_1,
     ZoneGrowableArray<char>* name_2) {
   ASSERT(name_1->At(name_1->length() - 1) == '\0');
   ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0');
   if (name_2->is_empty()) {
     // First attempt to interpret as general category property value name.
     const char* name = name_1->data();
     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
                                 add_to)) {
       return true;
     }
     // Interpret "Any", "ASCII", and "Assigned".
     if (LookupSpecialPropertyValueName(name, add_to, negate)) {
       return true;
     }
     // Then attempt to interpret as binary property name with value name 'Y'.
     UProperty property = u_getPropertyEnum(name);
     if (!IsSupportedBinaryProperty(property)) return false;
     if (!IsExactPropertyAlias(name, property)) return false;
     return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to);
   } else {
     // Both property name and value name are specified. Attempt to interpret
     // the property name as enumerated property.
     const char* property_name = name_1->data();
     const char* value_name = name_2->data();
     UProperty property = u_getPropertyEnum(property_name);
     if (!IsExactPropertyAlias(property_name, property)) return false;
     if (property == UCHAR_GENERAL_CATEGORY) {
       // We want to allow aggregate value names such as "Letter".
       property = UCHAR_GENERAL_CATEGORY_MASK;
     } else if (property != UCHAR_SCRIPT &&
                property != UCHAR_SCRIPT_EXTENSIONS) {
       return false;
     }
     return LookupPropertyValueName(property, value_name, negate, add_to);
   }
 }

 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value,
                                                  uint32_t* value) {
   uint32_t x = 0;
   int d = HexValue(current());
   if (d < 0) {
     return false;
   }
   while (d >= 0) {
     x = x * 16 + d;
     if (x > max_value) {
       return false;
     }
     Advance();
     d = HexValue(current());
   }
   *value = x;
   return true;
 }

 uint32_t RegExpParser::ParseClassCharacterEscape() {
   ASSERT(current() == '\\');
   DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
   Advance();
   switch (current()) {
     case 'b':
       Advance();
       return '\b';
     // ControlEscape :: one of
     //   f n r t v
     case 'f':
       Advance();
       return '\f';
     case 'n':
       Advance();
       return '\n';
     case 'r':
       Advance();
       return '\r';
     case 't':
       Advance();
       return '\t';
     case 'v':
       Advance();
       return '\v';
     case 'c': {
       uint32_t controlLetter = Next();
       uint32_t letter = controlLetter & ~('A' ^ 'a');
       // For compatibility with JSC, inside a character class
       // we also accept digits and underscore as control characters.
       if (letter >= 'A' && letter <= 'Z') {
         Advance(2);
         // Control letters mapped to ASCII control characters in the range
         // 0x00-0x1f.
         return controlLetter & 0x1f;
       }
       if (is_unicode()) {
         // With /u, \c# or \c_ are invalid.
         ReportError("Invalid class escape");
         UNREACHABLE();
       }
       if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') {
         Advance(2);
         return controlLetter & 0x1f;
       }
       // We match JSC in reading the backslash as a literal
       // character instead of as starting an escape.
       return '\\';
     }
     case '0':
       // With /u, \0 is interpreted as NUL if not followed by another digit.
       if (is_unicode() && !(Next() >= '0' && Next() <= '9')) {
         Advance();
         return 0;
       }
       FALL_THROUGH;
     case '1':
     case '2':
     case '3':
     case '4':
     case '5':
     case '6':
     case '7':
       // For compatibility, we interpret a decimal escape that isn't
       // a back reference (and therefore either \0 or not valid according
       // to the specification) as a 1..3 digit octal character code.
       if (is_unicode()) {
         // With \u, decimal escape is not interpreted as octal character code.
         ReportError("Invalid class escape");
         UNREACHABLE();
       }
       return ParseOctalLiteral();
     case 'x': {
       Advance();
       uint32_t value;
       if (ParseHexEscape(2, &value)) {
         return value;
       }
       if (is_unicode()) {
         // With \u, invalid escapes are not treated as identity escapes.
         ReportError("Invalid escape");
         UNREACHABLE();
       }
       // If \x is not followed by a two-digit hexadecimal, treat it
       // as an identity escape.
       return 'x';
     }
     case 'u': {
       Advance();
       uint32_t value;
       if (ParseUnicodeEscape(&value)) {
         return value;
       }
       if (is_unicode()) {
         // With \u, invalid escapes are not treated as identity escapes.
         ReportError(kUnicodeIdentity);
         UNREACHABLE();
       }
       // If \u is not followed by a four-digit hexadecimal, treat it
       // as an identity escape.
       return 'u';
     }
     default: {
       // Extended identity escape. We accept any character that hasn't
       // been matched by a more specific case, not just the subset required
       // by the ECMAScript specification.
       uint32_t result = current();
       if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
         Advance();
         return result;
       }
       ReportError(kUnicodeIdentity);
       UNREACHABLE();
     }
   }
   return 0;
 }

 bool RegExpParser::ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
                                     bool add_unicode_case_equivalents,
                                     uint32_t* char_out) {
   uint32_t first = current();
   if (first == '\\') {
     switch (Next()) {
       case 'w':
       case 'W':
       case 'd':
       case 'D':
       case 's':
       case 'S': {
         CharacterRange::AddClassEscape(static_cast<uint16_t>(Next()), ranges,
                                        add_unicode_case_equivalents);
         Advance(2);
         return true;
       }
       case 'p':
       case 'P': {
         if (!is_unicode()) break;
         bool negate = Next() == 'P';
         Advance(2);
         auto name_1 = new (Z) ZoneGrowableArray<char>();
         auto name_2 = new (Z) ZoneGrowableArray<char>();
         if (!ParsePropertyClassName(name_1, name_2) ||
             !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
           ReportError("Invalid property name in character class");
           UNREACHABLE();
         }
         return true;
       }
       case kEndMarker:
         ReportError("\\ at end of pattern");
         UNREACHABLE();
       default:
         break;
     }
     *char_out = ParseClassCharacterEscape();
     return false;
   }
   Advance();
   *char_out = first;
   return false;
 }

 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
   static const char* kUnterminated = "Unterminated character class";
   static const char* kRangeInvalid = "Invalid character class";
   static const char* kRangeOutOfOrder = "Range out of order in character class";

   ASSERT(current() == '[');
   Advance();
   bool is_negated = false;
   if (current() == '^') {
     is_negated = true;
     Advance();
   }
   ZoneGrowableArray<CharacterRange>* ranges =
       new (Z) ZoneGrowableArray<CharacterRange>(2);
   bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case();
   while (has_more() && current() != ']') {
     uint32_t char_1 = 0;
     bool is_class_1 =
         ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1);
     if (current() == '-') {
       Advance();
       if (current() == kEndMarker) {
         // If we reach the end we break out of the loop and let the
         // following code report an error.
         break;
       } else if (current() == ']') {
         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
         ranges->Add(CharacterRange::Singleton('-'));
         break;
       }
       uint32_t char_2 = 0;
       bool is_class_2 =
           ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2);
       if (is_class_1 || is_class_2) {
         // Either end is an escaped character class. Treat the '-' verbatim.
         if (is_unicode()) {
           // ES2015 21.2.2.15.1 step 1.
           ReportError(kRangeInvalid);
           UNREACHABLE();
         }
         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
         ranges->Add(CharacterRange::Singleton('-'));
         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2));
         continue;
       }
       if (char_1 > char_2) {
         ReportError(kRangeOutOfOrder);
         UNREACHABLE();
       }
       ranges->Add(CharacterRange::Range(char_1, char_2));
     } else {
       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
     }
   }
   if (!has_more()) {
     ReportError(kUnterminated);
     UNREACHABLE();
   }
   Advance();
   RegExpCharacterClass::CharacterClassFlags character_class_flags =
       RegExpCharacterClass::DefaultFlags();
   if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED;
   return new (Z)
       RegExpCharacterClass(ranges, builder->flags(), character_class_flags);
 }

 // ----------------------------------------------------------------------------
 // The Parser interface.

 void RegExpParser::ParseRegExp(const String& input,
                                RegExpFlags flags,
                                RegExpCompileData* result) {
   ASSERT(result != NULL);
   RegExpParser parser(input, &result->error, flags);
   // Throws an exception if 'input' is not valid.
   RegExpTree* tree = parser.ParsePattern();
   ASSERT(tree != NULL);
   ASSERT(result->error.IsNull());
   result->tree = tree;
   intptr_t capture_count = parser.captures_started();
   result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
   result->contains_anchor = parser.contains_anchor();
   result->capture_name_map = parser.CreateCaptureNameMap();
   result->capture_count = capture_count;
 }

 }  // namespace dart