blob: 7cb08941720a052c87ca421f3069532803ef2a1d [file] [log] [blame]
// Copyright (c) 2019, the Dart project authors. All rights reserved.
// Copyright 2014 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ES6 extends the \uxxxx escape and also allows \u{xxxxx}.
import 'package:expect/expect.dart';
import 'v8_regexp_utils.dart';
/// Asserts that [r] matches both "foo" and "boo" and does not match "moo".
///
/// Shared by the escape tests below, whose patterns are all written to
/// accept an `f` or a `b` in front of "oo".
void testRegExpHelper(RegExp r) {
  for (final accepted in const ["foo", "boo"]) {
    assertTrue(r.hasMatch(accepted));
  }
  assertFalse(r.hasMatch("moo"));
}
/// Exercises `\uXXXX` and `\u{X...}` escapes used as ordinary atoms.
///
/// Each pattern is an alternation of the escapes for `f` (U+0066) and `b`
/// (U+0062) followed by "oo", and is run through [testRegExpHelper].
void TestUnicodeEscapes() {
  // Compiles [source] (optionally with the unicode flag) and runs the shared
  // foo/boo/moo checks against it.
  void check(String source, {bool unicode = false}) {
    testRegExpHelper(RegExp(source, unicode: unicode));
  }

  // Raw strings: the backslash escapes are handed to the regexp unchanged.
  check(r"(\u0066|\u0062)oo");
  check(r"(\u0066|\u0062)oo", unicode: true);
  check(r"(\u{0066}|\u{0062})oo", unicode: true);
  check(r"(\u{66}|\u{000062})oo", unicode: true);

  // Non-raw strings: the backslash has to be doubled, otherwise the Dart
  // string literal would consume the unicode escape itself.
  check("(\\u0066|\\u0062)oo");
  check("(\\u0066|\\u0062)oo", unicode: true);
  check("(\\u{0066}|\\u{0062})oo", unicode: true);
  check("(\\u{66}|\\u{000062})oo", unicode: true);

  // Escapes resolved by the string literal should work as well; here the
  // regexp source already contains the characters `f` and `b`.
  check("(\u0066|\u0062)oo");
  check("(\u0066|\u0062)oo", unicode: true);
  check("(\u{0066}|\u{0062})oo", unicode: true);
  check("(\u{66}|\u{000062})oo", unicode: true);
}
/// Exercises `\uXXXX` and `\u{X...}` escapes used as the end points of a
/// character class range.
///
/// Each pattern is the range from U+0062 (`b`) to U+0066 (`f`) followed by
/// "oo", and is run through [testRegExpHelper].
void TestUnicodeEscapesInCharacterClasses() {
  // Compiles [source] (optionally with the unicode flag) and runs the shared
  // foo/boo/moo checks against it.
  void checkClass(String source, {bool unicode = false}) {
    testRegExpHelper(RegExp(source, unicode: unicode));
  }

  // Raw strings: the backslash escapes are handed to the regexp unchanged.
  checkClass(r"[\u0062-\u0066]oo");
  checkClass(r"[\u0062-\u0066]oo", unicode: true);
  checkClass(r"[\u{0062}-\u{0066}]oo", unicode: true);
  checkClass(r"[\u{62}-\u{000066}]oo", unicode: true);

  // Non-raw strings: the backslash has to be doubled, otherwise the Dart
  // string literal would consume the unicode escape itself.
  checkClass("[\\u0062-\\u0066]oo");
  checkClass("[\\u0062-\\u0066]oo", unicode: true);
  checkClass("[\\u{0062}-\\u{0066}]oo", unicode: true);
  checkClass("[\\u{62}-\\u{000066}]oo", unicode: true);

  // Escapes resolved by the string literal should work as well; here the
  // regexp source already contains the characters `b` and `f`.
  checkClass("[\u0062-\u0066]oo");
  checkClass("[\u0062-\u0066]oo", unicode: true);
  checkClass("[\u{0062}-\u{0066}]oo", unicode: true);
  checkClass("[\u{62}-\u{000066}]oo", unicode: true);
}
/// Checks how `\u{...}` is read when the unicode flag is absent.
///
/// In that mode `\u` followed by an illegal escape is parsed as a plain `u`,
/// and a following `{x}` is the repetition count.
void TestBraceEscapesWithoutUnicodeFlag() {
  // Walks [outcomes] in insertion order and asserts that [re] matches each
  // key exactly when the associated value is true.
  void expectOutcomes(RegExp re, Map<String, bool> outcomes) {
    for (final entry in outcomes.entries) {
      final matched = re.hasMatch(entry.key);
      if (entry.value) {
        assertTrue(matched);
      } else {
        assertFalse(matched);
      }
    }
  }

  // `\u{2}` acts as `u{2}`: exactly two u's between "f" and "bar".
  const exactlyTwoUs = {
    "fbar": false,
    "fubar": false,
    "fuubar": true,
    "fuuubar": false,
  };
  for (final source in [r"f\u{2}bar", "f\\u{2}bar"]) {
    expectOutcomes(RegExp(source), exactlyTwoUs);
  }

  // `\u{1,2}` acts as `u{1,2}`: one or two u's between "f" and "bar".
  const oneOrTwoUs = {
    "fbar": false,
    "fubar": true,
    "fuubar": true,
    "fuuubar": false,
  };
  for (final source in [r"f\u{1,2}bar", "f\\u{1,2}bar"]) {
    expectOutcomes(RegExp(source), oneOrTwoUs);
  }

  // Inside a character class the same text contributes the individual
  // characters `u`, `{`, `2` and `}` as class members.
  const classMembers = {
    "u": true,
    "{": true,
    "2": true,
    "}": true,
    "q": false,
    "(": false,
    ")": false,
  };
  for (final source in [r"[\u{2}]", "[\\u{2}]"]) {
    expectOutcomes(RegExp(source), classMembers);
  }
}
/// Checks the treatment of invalid escapes with and without the unicode flag.
///
/// Without the flag, invalid unicode escapes and other invalid escapes are
/// identity escapes; with the flag, constructing the regexp must throw.
void TestInvalidEscapes() {
  // As atoms: `\u\x\z\8\9` matches the literal text "uxz89".
  for (final source in [
    r"first\u\x\z\8\9second",
    "first\\u\\x\\z\\8\\9second",
  ]) {
    assertTrue(RegExp(source).hasMatch("firstuxz89second"));
  }

  // As character class members: exactly the escaped characters are accepted.
  void expectIdentityClass(RegExp re) {
    for (final member in const ["u", "x", "z", "8", "9"]) {
      assertTrue(re.hasMatch(member));
    }
    for (final outsider in const ["q", "7"]) {
      assertFalse(re.hasMatch(outsider));
    }
  }

  expectIdentityClass(RegExp(r"[\u\x\z\8\9]"));
  expectIdentityClass(RegExp("[\\u\\x\\z\\8\\9]"));

  // However, with the u flag, these are treated as invalid escapes. Both the
  // raw and the doubled-backslash spellings are checked, in that order.
  const rejectedRaw = [
    r"\u",
    r"\u12",
    r"\ufoo",
    r"\x",
    r"\xfoo",
    r"\z",
    r"\8",
    r"\9",
  ];
  const rejectedEscaped = [
    "\\u",
    "\\u12",
    "\\ufoo",
    "\\x",
    "\\xfoo",
    "\\z",
    "\\8",
    "\\9",
  ];
  for (final spellings in [rejectedRaw, rejectedEscaped]) {
    for (final source in spellings) {
      assertThrows(() => RegExp(source, unicode: true));
    }
  }
}
/// Checks the upper bound on the hex value allowed inside `\u{...}`.
///
/// With the unicode flag, `10ffff` is accepted and `110000` makes the
/// constructor throw.
void TestTooBigHexEscape() {
  // The largest accepted value compiles without throwing.
  for (final source in [r"\u{10ffff}", "\\u{10ffff}"]) {
    RegExp(source, unicode: true);
  }

  // One past the maximum is rejected when the u flag is set.
  for (final source in [r"\u{110000}", "\\u{110000}"]) {
    assertThrows(() => RegExp(source, unicode: true));
  }

  // Without the u flag, they're of course fine ({x} is the count).
  for (final source in [r"\u{110000}", "\\u{110000}"]) {
    RegExp(source);
  }
}
/// Checks that an escaped syntax character (`\[`) behaves the same with or
/// without the unicode flag.
void TestSyntaxEscapes() {
  // Non-unicode first, then unicode; within each, the raw spelling precedes
  // the doubled-backslash one.
  for (final unicode in const [false, true]) {
    for (final source in [r"foo\[bar", "foo\\[bar"]) {
      final re = RegExp(source, unicode: unicode);
      assertTrue(re.hasMatch("foo[bar"));
      assertFalse(re.hasMatch("foo]bar"));
    }
  }
}
/// Checks that two consecutive `\uXXXX` surrogate escapes in a unicode regexp
/// match the single code point they encode.
void TestUnicodeSurrogates() {
  // U+10E6D corresponds to the surrogate pair [U+D803, U+DE6D].
  const subject = "foo\u{10e6d}bar";
  for (final source in [r"foo\ud803\ude6dbar", "foo\\ud803\\ude6dbar"]) {
    assertTrue(RegExp(source, unicode: true).hasMatch(subject));
  }
}
/// Runs the grouped escape tests, then the non-BMP and surrogate checks.
void main() {
  TestUnicodeEscapes();
  TestUnicodeEscapesInCharacterClasses();
  TestBraceEscapesWithoutUnicodeFlag();
  TestInvalidEscapes();
  TestTooBigHexEscape();
  TestSyntaxEscapes();
  TestUnicodeSurrogates();

  // Shorthand for a regexp compiled with the unicode flag set.
  RegExp unicodeRe(String source) => RegExp(source, unicode: true);

  // Non-BMP patterns.
  // Single character atom. U+12345 is the surrogate pair [U+D808, U+DF45];
  // the lone trail surrogate on its own must not match.
  assertTrue(unicodeRe("\u{12345}").hasMatch("\u{12345}"));
  assertTrue(unicodeRe(r"\u{12345}").hasMatch("\u{12345}"));
  assertTrue(unicodeRe(r"\u{12345}").hasMatch("\ud808\udf45"));
  assertTrue(unicodeRe(r"\u{12345}").hasMatch("\ud808\udf45"));
  assertFalse(unicodeRe(r"\u{12345}").hasMatch("\udf45"));
  assertFalse(unicodeRe(r"\u{12345}").hasMatch("\udf45"));

  // Multi-character atom.
  const twoAtoms = r"\u{12345}\u{23456}";
  for (final subject in ["a\u{12345}\u{23456}b", "b\u{12345}\u{23456}c"]) {
    assertTrue(unicodeRe(twoAtoms).hasMatch(subject));
  }
  for (final subject in ["a\udf45\u{23456}b", "b\udf45\u{23456}c"]) {
    assertFalse(unicodeRe(twoAtoms).hasMatch(subject));
  }

  // Disjunction (the second code point sits in a non-capturing group).
  const grouped = r"\u{12345}(?:\u{23456})";
  for (final subject in ["a\u{12345}\u{23456}b", "b\u{12345}\u{23456}c"]) {
    assertTrue(unicodeRe(grouped).hasMatch(subject));
  }
  for (final subject in ["a\udf45\u{23456}b", "b\udf45\u{23456}c"]) {
    assertFalse(unicodeRe(grouped).hasMatch(subject));
  }

  // Alternative.
  const either = r"\u{12345}|\u{23456}";
  assertTrue(unicodeRe(either).hasMatch("a\u{12345}b"));
  assertTrue(unicodeRe(either).hasMatch("b\u{23456}c"));
  assertFalse(unicodeRe(either).hasMatch("a\udf45\ud84db"));
  assertFalse(unicodeRe(either).hasMatch("b\udf45\ud808c"));

  // Capture. The back reference must repeat the code point that was
  // captured, not merely any alternative of the group.
  final backReferenceSources = [
    "(\u{12345}|\u{23456}).\\1",
    r"(\u{12345}|\u{23456}).\1",
  ];
  for (final source in backReferenceSources) {
    assertTrue(unicodeRe(source).hasMatch("\u{12345}b\u{12345}"));
  }
  for (final source in backReferenceSources) {
    assertFalse(unicodeRe(source).hasMatch("\u{12345}b\u{23456}"));
  }

  // Quantifier. With the u flag `{3}` repeats the whole code point; without
  // it, only the trailing surrogate code unit is repeated.
  const threeCodePoints = "\u{12345}\u{12345}\u{12345}";
  const codePointThenTwoTrails = "\u{12345}\udf45\udf45";
  assertTrue(unicodeRe("\u{12345}{3}").hasMatch(threeCodePoints));
  assertTrue(unicodeRe(r"\u{12345}{3}").hasMatch(threeCodePoints));
  assertTrue(RegExp("\u{12345}{3}").hasMatch(codePointThenTwoTrails));
  assertFalse(unicodeRe(r"\ud808\udf45{3}").hasMatch(codePointThenTwoTrails));
  assertTrue(unicodeRe(r"\ud808\udf45{3}").hasMatch(threeCodePoints));
  assertFalse(unicodeRe("\u{12345}{3}").hasMatch(codePointThenTwoTrails));
  assertFalse(unicodeRe(r"\u{12345}{3}").hasMatch(codePointThenTwoTrails));

  // Literal surrogates.
  for (final source in ["\ud800\udc00+", "\\ud800\\udc00+"]) {
    shouldBe(unicodeRe(source).firstMatch("\u{10000}\u{10000}"),
        ["\u{10000}\u{10000}"]);
  }
  for (final source in [
    "[\\ud800\\udc03-\\ud900\\udc01\]+",
    "[\ud800\udc03-\u{50001}\]+",
  ]) {
    shouldBe(unicodeRe(source).firstMatch("\u{10003}\u{50001}"),
        ["\u{10003}\u{50001}"]);
  }

  // Unicode escape sequences to represent a non-BMP character cannot have
  // mixed notation, and must follow the rules for RegExpUnicodeEscapeSequence.
  assertThrows(() => unicodeRe("[\\ud800\udc03-\ud900\\udc01\]+"));
  for (final source in ["\\ud800\udc00+", "\ud800\\udc00+"]) {
    assertNull(unicodeRe(source).firstMatch("\u{10000}\u{10000}"));
  }
  for (final source in [
    "[\\ud800\udc00]",
    "[\\{ud800}\udc00]",
    "[\ud800\\udc00]",
    "[\ud800\\{udc00}]",
  ]) {
    assertNull(unicodeRe(source).firstMatch("\u{10000}"));
  }
  for (final source in [
    r"\u{d800}\u{dc00}+",
    r"\ud800\u{dc00}+",
    r"\u{d800}\udc00+",
  ]) {
    assertNull(unicodeRe(source).firstMatch("\ud800\udc00\udc00"));
  }
}