// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
part of "dart:core";
/// A regular expression pattern.
///
/// Regular expressions (abbreviated as regex or regexp)
/// consist of a sequence of characters that specify
/// a match-checking algorithm for text _inputs_.
/// Applying a regexp to an input text results either in the regexp matching,
/// or accepting, the text, or the text being rejected.
/// When the regexp matches the text, it further provides some information
/// about *how* it matched the text.
///
/// Dart regular expressions have the same syntax and semantics as
/// JavaScript regular expressions.
/// To learn more about JavaScript regular expressions, see
/// <https://ecma-international.org/ecma-262/9.0/#sec-regexp-regular-expression-objects>.
///
/// Dart provides the basic regexp matching algorithm as [matchAsPrefix],
/// which checks if the regexp matches a part of the input starting
/// at a specific position.
/// If the regexp matches, Dart returns the details of the match as a
/// [RegExpMatch].
///
/// You can build all the other methods of [RegExp] from that basic match
/// check.
///
/// The most common use of a regexp is to *search* for a match in the input.
/// The [firstMatch] method provides this functionality.
/// This method searches a string for the first position where the regexp
/// matches.
/// Again, if a match is found, Dart returns its details as a [RegExpMatch].
///
/// The following example finds the first match of a regular expression in
/// a string.
/// ```dart
/// RegExp exp = RegExp(r'(\w+)');
/// String str = 'Parse my string';
/// RegExpMatch? match = exp.firstMatch(str);
/// print(match![0]); // "Parse"
/// ```
/// Use [allMatches] to look for all matches of a regular expression in
/// a string.
///
/// The following example finds all matches of a regular expression in
/// a string.
/// ```dart
/// RegExp exp = RegExp(r'(\w+)');
/// String str = 'Parse my string';
/// Iterable<RegExpMatch> matches = exp.allMatches(str);
/// for (final m in matches) {
///   print(m[0]);
/// }
/// ```
/// The output of the example is:
/// ```
/// Parse
/// my
/// string
/// ```
///
/// The preceding examples use a _raw string_,
/// a specific string type that prefixes the string literal with `r`.
/// Use a raw string to treat each character, including `\` and `$`,
/// in a string as a literal character. Each character then gets passed
/// to the [RegExp] parser.
/// You should use a raw string as the argument to the
/// [RegExp] constructor.
///
/// **Performance Notice**: Regular expressions do not resolve issues
/// magically.
/// Anyone can write a regexp that performs inefficiently when
/// applied to some string inputs.
/// Often, such a regexp will perform well enough on small or common
/// inputs, but have pathological performance on large and uncommon inputs.
/// This inconsistent behavior makes performance issues harder to detect
/// in testing.
///
/// A regexp might not find text any faster than using `String` operations
/// to inspect a string.
/// The strength of regexp comes from the ability to specify
/// *somewhat* complicated patterns in very few characters.
/// These regexps provide reasonable efficiency in most common cases.
/// This conciseness comes at a cost of readability.
/// Due to their syntactic complexity, regexes cannot be considered
/// self-documenting.
///
/// Dart regexps implement the ECMAScript RegExp specification.
/// This specification provides regexp behavior that is both common and
/// well-known.
/// When compiling Dart for the web, the compiled code can use the browser’s
/// regexp implementation.
///
/// The specification defines ECMAScript regexp behavior using *backtracking*.
/// When a regexp can choose between different ways to match,
/// it tries each way in the order given in the pattern.
/// For example: `RegExp(r"(foo|bar)baz")` wants to check for `foo` or `bar`,
/// so it checks for `foo` first.
/// If continuing along that path doesn't match the input,
/// the regexp implementation *backtracks*.
/// The implementation resets to the original state from before
/// checking for `foo`,
/// forgetting all the work it has done after that,
/// and then tries the next choice; `bar` in this example.
///
/// The specification defines these choices
/// and the order in which they must be attempted.
/// If a regexp could match an input in more than one way,
/// the order of the choices decides which match the regexp returns.
/// Commonly used regexps order their matching choices to ensure
/// a specific result.
/// The ECMAScript regexp specification limits how Dart
/// can implement regular expressions.
/// It must be a backtracking implementation which checks choices in
/// a specific order.
/// Dart cannot choose a different regexp implementation,
/// because then regexp matching would behave differently.
///
/// The backtracking approach works, but at a cost.
/// For some regexps and some inputs,
/// finding a *correct* match can take a *lot* of tries.
/// It can take even more tries to reject an input
/// that the regexp *almost* matches.
///
/// A well-known dangerous regexp pattern comes from
/// nesting quantifiers like `*`:
///
/// ```dart
/// var re = RegExp(r"^(a*|b)*c");
/// print(re.hasMatch("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
/// ```
///
/// The regexp pattern doesn't match the input string of only `a`s
/// as the input doesn’t contain the required `c`.
/// There exists an _exponential_ number of different ways for `(a*|b)*`
/// to match all the `a`s.
/// The backtracking regexp implementation tries *all* of them
/// before deciding that none of those can lead to a complete match.
/// Each extra `a` added to the input doubles the time the regexp
/// takes to return `false`.
/// (When backtracking has this exponential potential, it is called
/// [“catastrophic backtracking”](https://www.google.com/search?q=regexp+catastrophic+backtracking)).
///
/// Sequential quantifiers provide another dangerous pattern,
/// but they provide “only” polynomial complexity.
///
/// ```dart
/// // Like `\w*-\d`, but check for `b` and `c` in that order.
/// var re = RegExp(r"^\w*(b)?\w*(c)?\w*-\d");
/// print(re.hasMatch("a" * 512));
/// ```
///
/// Again the input doesn’t match, but `RegExp` must try *n*<sup>3</sup> ways
/// to match the *n* `a`s _before_ deciding that.
/// Doubling the input’s length increases the time to return `false`
/// _eightfold_.
/// This exponent increases with the number of sequential quantifiers.
///
/// Both of these patterns look trivial when reduced to such simple regexps.
/// However, these "trivial" patterns often arise as parts of
/// more complicated regular expressions,
/// where your ability to find the problem gets more difficult.
///
/// In general, if a regexp has potential for *super-linear complexity*,
/// you can craft an input that takes an inordinate amount of time to search.
/// These patterns can then be used for [denial of service attacks](https://en.wikipedia.org/wiki/ReDoS)
/// if you apply vulnerable regexp patterns to user-provided inputs.
///
/// No guaranteed solution exists for this problem.
/// Be careful to not use regexps with super-linear behavior
/// where the program may match that regexp against inputs
/// with no guaranteed match.
///
/// Rules of thumb to avoid regexps with super-linear execution time include:
///
/// * Whenever the regexp has a choice, try to make sure
/// that the choice can be made based on the next character
/// (or very limited look-ahead).
/// This limits the need to perform a lot of computation along both choices.
/// * When using quantifiers, ensure that the same string cannot match
/// both one and more-than-one iteration of the quantifier's
/// regular expression.
/// (For `(a*|b)*`, the string `"aa"` can match both
/// `(a*|b){1}` and `(a*|b){2}`.)
/// * Most uses of Dart regular expressions *search* for a match,
/// for example using [firstMatch].
/// If you do not *anchor* the pattern
/// to the start of a line or input using `^`,
/// this search acts as if the regexp began with an implicit `[^]*`.
/// Starting your actual regular expression with `.*`
/// then results in potential quadratic behavior for the search.
/// Use anchors or [matchAsPrefix] where appropriate,
/// or avoid starting the regexp with a quantified pattern.
/// * *For experts only:* Neither Dart nor ECMAScript have general
/// [“atomic grouping”](https://github.com/tc39/proposal-regexp-atomic-operators).
/// Other regular expression dialects use this to limit backtracking.
/// If an atomic capture group succeeds once,
/// the regexp cannot backtrack into the same match later.
/// As lookarounds also serve as atomic groups,
/// something similar can be achieved using a *lookahead*:
/// `var re = RegExp(r"^(?=((a*|b)*))\1d");`
/// The preceding example does the same inefficient matching of `(a*|b)*`.
/// Once the regexp has matched as far as possible,
/// it completes the positive lookahead.
/// Then it skips what the lookahead matched using a back-reference.
/// After that, it can no longer backtrack
/// and try other combinations of `a`s.
///
/// Try to reduce how many ways the regexp can match the same string.
/// That reduces the number of possible backtracks performed
/// when the regexp does not find a match.
/// Several guides to [improving the performance of regular expressions](https://www.google.com/search?q=performance+of+regular+expressions)
/// exist on the internet. Use these as inspirations, too.
abstract interface class RegExp implements Pattern {
/// Constructs a regular expression.
///
/// Throws a [FormatException] if [source] does not follow valid regular
/// expression syntax.
///
/// If your code enables `multiLine`, then `^` and `$` will match
/// the beginning and end of a _line_, as well as matching beginning and
/// end of the input, respectively.
///
/// If your code disables `caseSensitive`,
/// then Dart ignores the case of letters when matching.
/// For example, with `caseSensitive` disabled, the regexp pattern `a`
/// matches both `a` and `A`.
///
/// If your code enables `unicode`, then Dart treats the pattern as a
/// Unicode pattern per the ECMAScript standard.
///
/// If your code enables `dotAll`, then the `.` pattern will match _all_
/// characters, including line terminators.
///
/// Example:
///
/// ```dart
/// final wordPattern = RegExp(r'(\w+)');
/// final digitPattern = RegExp(r'(\d+)');
/// ```
///
/// These examples use a _raw string_ as the argument.
/// You should prefer to use a raw string as argument to the [RegExp]
/// constructor, because it makes it easy to write
/// the `\` and `$` characters as regexp reserved characters.
///
/// The same examples written using non-raw strings would be:
/// ```dart
/// final wordPattern = RegExp('(\\w+)'); // Should be raw string.
/// final digitPattern = RegExp('(\\d+)'); // Should be raw string.
/// ```
/// Use a non-raw string only when you need to use
/// string interpolation. For example:
/// ```dart
/// Pattern keyValuePattern(String keyIdentifier) =>
///     RegExp('$keyIdentifier=(\\w+)');
/// ```
/// When including a string verbatim into the regexp pattern like this,
/// be careful that the string does not contain regular expression
/// reserved characters.
/// If that risk exists, use the [escape] function to convert those
/// characters to safe versions of the reserved characters
/// and match only the string itself:
/// ```dart
/// Pattern keyValuePattern(String anyStringKey) =>
///     RegExp('${RegExp.escape(anyStringKey)}=(\\w+)');
/// ```
external factory RegExp(
String source, {
bool multiLine = false,
bool caseSensitive = true,
bool unicode = false,
bool dotAll = false,
});
/// Creates regular expression syntax that matches the input [text].
///
/// If [text] contains regular expression reserved characters,
/// the resulting regular expression matches those characters literally.
/// If [text] contains no regular expression reserved characters,
/// Dart returns [text] unmodified.
///
/// The reserved characters in regular expressions are:
/// `(`, `)`, `[`, `]`, `{`, `}`, `*`, `+`, `?`, `.`, `^`, `$`, `|` and `\`.
///
/// Use this method to create a pattern to be included in a
/// larger regular expression. Since a [String] is itself a [Pattern]
/// which matches itself, converting the string to a regular expression
/// isn't needed to search for that exact string.
/// ```dart
/// print(RegExp.escape('dash@example.com')); // dash@example\.com
/// print(RegExp.escape('a+b')); // a\+b
/// print(RegExp.escape('a*b')); // a\*b
/// print(RegExp.escape('{a-b}')); // \{a-b\}
/// print(RegExp.escape('a?')); // a\?
/// ```
external static String escape(String text);
/// Finds the first match of the regular expression in the string [input].
///
/// Returns `null` if there is no match.
/// ```dart
/// final string = '[00:13.37] This is a chat message.';
/// final regExp = RegExp(r'c\w*');
/// final match = regExp.firstMatch(string)!;
/// print(match[0]); // chat
/// ```
RegExpMatch? firstMatch(String input);
/// Finds all matches of this regular expression in the string [input].
///
/// As specified by [Pattern.allMatches], matching starts at
/// index [start] of [input], which defaults to 0.
///
/// Each match is a [RegExpMatch], so named capture groups can be
/// read from the results.
/// ```dart
/// final exp = RegExp(r'(\w+)');
/// const str = 'Parse my string';
/// for (final m in exp.allMatches(str)) {
///   print(m[0]);
/// }
/// // Parse
/// // my
/// // string
/// ```
Iterable<RegExpMatch> allMatches(String input, [int start = 0]);
/// Checks whether this regular expression has a match in the [input].
///
/// ```dart
/// var string = 'Dash is a bird';
/// var regExp = RegExp(r'(humming)?bird');
/// var match = regExp.hasMatch(string); // true
///
/// regExp = RegExp(r'dog');
/// match = regExp.hasMatch(string); // false
/// ```
bool hasMatch(String input);
/// Finds the string of the first match of this regular expression
/// in [input].
///
/// Searches for a match for this regular expression in [input],
/// just like [firstMatch],
/// but returns only the matched substring if a match is found,
/// not a [RegExpMatch].
/// Returns `null` if there is no match.
///
/// ```dart
/// var string = 'Dash is a bird';
/// var regExp = RegExp(r'(humming)?bird');
/// var match = regExp.stringMatch(string); // 'bird'
///
/// regExp = RegExp(r'dog');
/// match = regExp.stringMatch(string); // null, no match.
/// ```
String? stringMatch(String input);
/// The regular expression pattern source of this `RegExp`.
///
/// ```dart
/// final regExp = RegExp(r'\p{L}');
/// print(regExp.pattern); // \p{L}
/// ```
String get pattern;
/// Whether this regular expression matches multiple lines.
///
/// If the regexp does match multiple lines, the "^" and "$" characters
/// match the beginning and end of lines. If not, the characters match the
/// beginning and end of the input.
bool get isMultiLine;
/// Whether this regular expression is case sensitive.
///
/// If the regular expression is not case sensitive, it will match an input
/// letter with a pattern letter even if the two letters are different case
/// versions of the same letter.
/// ```dart
/// final text = 'Parse my string';
/// var regExp = RegExp(r'STRING', caseSensitive: false);
/// print(regExp.isCaseSensitive); // false
/// print(regExp.hasMatch(text)); // true, matches.
///
/// regExp = RegExp(r'STRING', caseSensitive: true);
/// print(regExp.isCaseSensitive); // true
/// print(regExp.hasMatch(text)); // false, no match.
/// ```
bool get isCaseSensitive;
/// Whether this regular expression uses Unicode mode.
///
/// In Unicode mode, Dart treats UTF-16 surrogate pairs in the original
/// string as a single code point and will not match each code unit in the
/// pair separately. Otherwise,
/// Dart treats the target string as a sequence of individual code
/// units and does not treat surrogates as special.
///
/// In Unicode mode, Dart restricts the syntax of the RegExp pattern,
/// for example disallowing some unescaped uses
/// of restricted regexp characters,
/// and disallowing unnecessary `\`-escapes ("identity escapes"),
/// which have both historically been allowed in non-Unicode mode.
/// Dart also allows some pattern features, like Unicode property escapes,
/// only in this mode.
/// ```dart
/// var regExp = RegExp(r'^\p{L}$', unicode: true);
/// print(regExp.hasMatch('a')); // true
/// print(regExp.hasMatch('b')); // true
/// print(regExp.hasMatch('?')); // false
/// print(regExp.hasMatch(r'p{L}')); // false
///
/// // U+1F600 (😀), one code point, two code units.
/// var smiley = '\ud83d\ude00';
///
/// regExp = RegExp(r'^.$', unicode: true); // Matches one code point.
/// print(regExp.hasMatch(smiley)); // true
/// regExp = RegExp(r'^..$', unicode: true); // Matches two code points.
/// print(regExp.hasMatch(smiley)); // false
///
/// regExp = RegExp(r'^\p{L}$', unicode: false);
/// print(regExp.hasMatch('a')); // false
/// print(regExp.hasMatch('b')); // false
/// print(regExp.hasMatch('?')); // false
/// print(regExp.hasMatch(r'p{L}')); // true
///
/// regExp = RegExp(r'^.$', unicode: false); // Matches one code unit.
/// print(regExp.hasMatch(smiley)); // false
/// regExp = RegExp(r'^..$', unicode: false); // Matches two code units.
/// print(regExp.hasMatch(smiley)); // true
/// ```
bool get isUnicode;
/// Whether "." in this regular expression matches line terminators.
///
/// When false, the "." character matches a single character, unless that
/// character terminates a line. When true, then the "." character will
/// match any single character including line terminators.
///
/// This feature is distinct from [isMultiLine]. They affect the behavior
/// of different pattern characters, so they can be used together or
/// separately.
bool get isDotAll;
}
/// A regular expression match.
///
/// Regular expression matches are [Match]es. They also include the ability
/// to retrieve the names for any named capture groups and to retrieve
/// matches for named capture groups by name instead of by their index.
///
/// Example:
/// ```dart
/// const pattern =
/// r'^\[(?<Time>\s*((?<hour>\d+)):((?<minute>\d+))\.((?<second>\d+)))\]'
/// r'\s(?<Message>\s*(.*)$)';
///
/// final regExp = RegExp(
/// pattern,
/// multiLine: true,
/// );
///
/// const multilineText = '[00:13.37] This is a first message.\n'
/// '[01:15.57] This is a second message.\n';
///
/// RegExpMatch regExpMatch = regExp.firstMatch(multilineText)!;
/// print(regExpMatch.groupNames.join('-')); // hour-minute-second-Time-Message.
/// final time = regExpMatch.namedGroup('Time'); // 00:13.37
/// final hour = regExpMatch.namedGroup('hour'); // 00
/// final minute = regExpMatch.namedGroup('minute'); // 13
/// final second = regExpMatch.namedGroup('second'); // 37
/// final message =
///     regExpMatch.namedGroup('Message'); // This is a first message.
/// final date = regExpMatch.namedGroup('Date'); // Undefined `Date`, throws.
///
/// Iterable<RegExpMatch> matches = regExp.allMatches(multilineText);
/// for (final m in matches) {
///   print(m.namedGroup('Time'));
///   print(m.namedGroup('Message'));
///   // 00:13.37
///   // This is a first message.
///   // 01:15.57
///   // This is a second message.
/// }
/// ```
abstract interface class RegExpMatch implements Match {
/// The string captured by the named capture group [name].
///
/// Returns the substring of the input that the
/// capture group labeled [name] matched,
/// or `null` if that capture group was not part of the match.
///
/// The [name] must be the name of a named capture group in the regular
/// expression [pattern] which created this match.
/// That is, the name must be in [groupNames].
/// Throws if [name] is not the name of such a capture group.
String? namedGroup(String name);
/// The names of the named capture groups of [pattern].
Iterable<String> get groupNames;
/// The regular expression which created this match.
///
/// Declared here with the type [RegExp], so the regular expression's
/// own members are available without a cast.
RegExp get pattern;
}