blob: 034616d266d6e1bc4955690b9e61f85de2eb5158 [file] [edit]
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "vm/regexp/regexp.h"
#include <algorithm>
#include <bitset>
#include <memory>
#include <utility>
#include "vm/regexp/regexp-bytecode-generator.h"
#include "vm/regexp/regexp-bytecodes.h"
#include "vm/regexp/regexp-compiler.h"
#include "vm/regexp/regexp-interpreter.h"
#include "vm/regexp/regexp-macro-assembler.h"
#include "vm/regexp/regexp-parser.h"
#include "vm/symbols.h"
namespace dart {
using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
class RegExpImpl final : public AllStatic {
public:
// Returns a string representation of a regular expression.
// Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
// This function calls the garbage collector if necessary.
static const StringPtr ToString(const Object& value);
// Prepares a JSRegExp object with Irregexp-specific data.
static void IrregexpInitialize(Isolate* isolate,
const RegExp& re,
const String& pattern,
RegExpFlags flags,
int capture_count,
uint32_t backtrack_limit,
uint32_t bit_field);
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is thrown as indicated by a negative return value.
static int IrregexpPrepare(Isolate* isolate,
const RegExp& regexp_data,
const String& subject,
bool is_sticky);
// Execute a regular expression on the subject, starting from index.
// If matching succeeds, return the number of matches. This can be larger
// than one in the case of global regular expressions.
// The captures and subcaptures are stored into the registers vector.
// If matching fails, returns RE_FAILURE.
// If execution fails, sets an exception and returns RE_EXCEPTION.
static int IrregexpExecRaw(Isolate* isolate,
const RegExp& regexp_data,
const String& subject,
int index,
int32_t* output,
int output_size);
// Execute an Irregexp bytecode pattern. Returns the number of matches, or an
// empty handle in case of an exception.
V8_WARN_UNUSED_RESULT static std::optional<int> IrregexpExec(
Isolate* isolate,
const RegExp& regexp_data,
const String& subject,
int index,
int32_t* result_offsets_vector,
uint32_t result_offsets_vector_length);
static bool CompileIrregexpFromSource(
Thread* thread,
const RegExp& re_data,
const String& sample_subject,
bool is_one_byte,
bool sticky,
RegExpCompilationTarget compilation_target);
static bool CompileIrregexpFromBytecode(Isolate* isolate,
const RegExp& re_data,
const String& sample_subject,
bool is_one_byte);
static inline bool EnsureCompiledIrregexp(Thread* thread,
const RegExp& re_data,
const String& sample_subject,
bool is_one_byte,
bool sticky);
// Returns true on success, false on failure.
static bool Compile(Isolate* isolate,
Zone* zone,
RegExpCompileData* input,
RegExpFlags flags,
const String& pattern,
const String& sample_subject,
const RegExp& re_data,
bool is_one_byte);
};
// static
bool RegExpStatics::CanGenerateBytecode() {
return true;
}
// static
bool RegExpStatics::VerifyFlags(RegExpFlags flags) {
if (IsUnicode(flags) && IsUnicodeSets(flags)) return false;
return true;
}
// static
template <class CharT>
bool RegExpStatics::VerifySyntax(Zone* zone,
uintptr_t stack_limit,
const CharT* input,
int input_length,
RegExpFlags flags,
RegExpError* regexp_error_out) {
RegExpCompileData data;
bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax(
zone, stack_limit, input, input_length, flags, &data);
*regexp_error_out = data.error;
return pattern_is_valid;
}
template bool RegExpStatics::VerifySyntax<uint8_t>(
Zone*,
uintptr_t,
const uint8_t*,
int,
RegExpFlags,
RegExpError* regexp_error_out);
template bool RegExpStatics::VerifySyntax<uint16_t>(
Zone*,
uintptr_t,
const uint16_t*,
int,
RegExpFlags,
RegExpError* regexp_error_out);
ObjectPtr RegExpStatics::ThrowRegExpException(Isolate* isolate,
RegExpFlags flags,
const String& pattern,
RegExpError error) {
Array& args = Array::Handle();
String& str = String::Handle();
str ^= String::New(RegExpErrorString(error));
args ^= Array::New(2);
args.SetAt(0, str);
args.SetAt(1, pattern);
// TODO(regexp) args.SetAt(2, position) sometimes available but not used by V8
Exceptions::ThrowByType(Exceptions::kFormat, args);
}
void RegExpStatics::ThrowRegExpException(Isolate* isolate,
const RegExp& re_data,
RegExpError error_text) {
USE(ThrowRegExpException(isolate, re_data.flags(),
String::Handle(re_data.pattern()), error_text));
}
bool RegExpStatics::IsUnmodifiedRegExp(Isolate* isolate, const RegExp& regexp) {
// Can't monkey patch in Dart.
return true;
}
// Irregexp implementation.
// Ensures that the regexp object contains a compiled version of the
// source for either one-byte or two-byte subject strings.
// If the compiled version doesn't already exist, it is compiled
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
bool RegExpImpl::EnsureCompiledIrregexp(Thread* thread,
const RegExp& re_data,
const String& sample_subject,
bool is_one_byte,
bool sticky) {
if (re_data.bytecode(is_one_byte, sticky) != TypedData::null()) return true;
return CompileIrregexpFromSource(thread, re_data, sample_subject, is_one_byte,
sticky, RegExpCompilationTarget::kBytecode);
}
namespace {
struct RegExpCaptureIndexLess {
bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
DCHECK_NOT_NULL(lhs);
DCHECK_NOT_NULL(rhs);
return lhs->index() < rhs->index();
}
};
} // namespace
// static
ArrayPtr RegExpStatics::CreateCaptureNameMap(
Isolate* isolate,
ZoneVector<RegExpCapture*>* named_captures) {
if (named_captures == nullptr) return Array::null();
ASSERT(!named_captures->empty());
// Named captures are sorted by name (because the set is used to ensure
// name uniqueness). But the capture name map must to be sorted by index.
std::sort(named_captures->begin(), named_captures->end(),
RegExpCaptureIndexLess{});
int len = static_cast<int>(named_captures->size()) * 2;
const Array& array = Array::Handle(Array::New(len));
int i = 0;
for (const RegExpCapture* capture : *named_captures) {
const String& name = String::Handle(
String::FromUTF16(capture->name()->data(), capture->name()->size()));
array.SetAt(i * 2, name);
array.SetAt(i * 2 + 1, Smi::Handle(Smi::New(capture->index())));
i++;
}
DCHECK_EQ(i * 2, len);
return array.ptr();
}
bool RegExpImpl::CompileIrregexpFromSource(
Thread* thread,
const RegExp& re_data,
const String& sample_subject,
bool is_one_byte,
bool sticky,
RegExpCompilationTarget compilation_target) {
// Since we can't abort gracefully during compilation, check for sufficient
// stack space (including the additional gap as used for Turbofan
// compilation) here in advance.
if (!OSThread::Current()->HasStackHeadroom()) {
RegExpStatics::ThrowRegExpException(thread->isolate(), re_data,
RegExpError::kAnalysisStackOverflow);
return false;
}
// Compile the RegExp.
// Zone zone(isolate->allocator(), ZONE_NAME);
// PostponeInterruptsScope postpone(isolate);
// ASSERT(RegExpCodeIsValidForPreCompilation(isolate, re_data, is_one_byte));
RegExpFlags flags = re_data.flags();
if (sticky) {
flags |= RegExpFlag::kSticky;
}
Zone* zone = thread->zone();
const String& pattern = String::Handle(zone, re_data.pattern());
RegExpCompileData compile_data;
if (!RegExpParser::ParseRegExpFromHeapString(thread->isolate(), zone, pattern,
flags, &compile_data)) {
// Throw an exception if we fail to parse the pattern.
// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
USE(RegExpStatics::ThrowRegExpException(thread->isolate(), flags, pattern,
compile_data.error));
return false;
}
compile_data.compilation_target = compilation_target;
const bool compilation_succeeded =
Compile(thread->isolate(), zone, &compile_data, flags, pattern,
sample_subject, re_data, is_one_byte);
if (!compilation_succeeded) {
ASSERT(compile_data.error != RegExpError::kNone);
RegExpStatics::ThrowRegExpException(thread->isolate(), re_data,
compile_data.error);
return false;
}
// Set num_bracket_expression after setting capture_name_map.
// RegExp_getGroupNameMap will read num_bracket_expression first and assume
// capture_name_map is available if the count is not -1.
const Array& capture_name_map =
Array::Handle(zone, RegExpStatics::CreateCaptureNameMap(
thread->isolate(), compile_data.named_captures));
re_data.set_capture_name_map(capture_name_map);
re_data.set_num_bracket_expressions<std::memory_order_release>(
compile_data.capture_count);
// Set bytecode after setting num_registers. RegExpStatics::Interpret will
// read bytecode first and assume num_register is available if bytecode is not
// null.
re_data.set_num_registers(is_one_byte, compile_data.register_count);
DCHECK_EQ(compile_data.compilation_target,
RegExpCompilationTarget::kBytecode);
re_data.set_bytecode(is_one_byte, sticky,
TypedData::Cast(*compile_data.code));
return true;
}
namespace {
void SetBacktrackAndExperimentalFallback(RegExpMacroAssembler* macro_assembler,
const RegExp& re_data) {
uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
macro_assembler->set_backtrack_limit(backtrack_limit);
macro_assembler->set_can_fallback(false);
}
} // namespace
namespace {
// Returns true if we've either generated too much irregex code within this
// isolate, or the pattern string is too long.
bool TooMuchRegExpCode(Isolate* isolate, const String& pattern) {
// Limit the space regexps take up on the heap. In order to limit this we
// would like to keep track of the amount of regexp code on the heap. This
// is not tracked, however. As a conservative approximation we track the
// total regexp code compiled including code that has subsequently been freed
// and the total executable memory at any point.
// static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
// static constexpr size_t kRegExpCompiledLimit = 1 * MB;
// Heap* heap = isolate->heap();
if (pattern.Length() > RegExpStatics::kRegExpTooLargeToOptimize) return true;
// TODO(regexp): Not relevant for Dart if we're only ever doing bytecode?
// return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
// heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
return false;
}
} // namespace
bool RegExpImpl::Compile(Isolate* isolate,
Zone* zone,
RegExpCompileData* data,
RegExpFlags flags,
const String& pattern,
const String& sample_subject,
const RegExp& re_data,
bool is_one_byte) {
if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
RegExpMacroAssembler::kMaxRegisterCount) {
data->error = RegExpError::kTooLarge;
return false;
}
RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
is_one_byte);
#ifdef V8_ENABLE_REGEXP_DIAGNOSTICS
const bool needs_graph_printer = v8_flags.print_regexp_graph ||
v8_flags.trace_regexp_graph_building ||
v8_flags.trace_regexp_compiler;
const bool needs_ast_printer = v8_flags.trace_regexp_graph_building;
std::unique_ptr<RegExpDiagnostics> diagnostics;
if (UNLIKELY(needs_ast_printer || needs_graph_printer)) {
diagnostics = std::make_unique<RegExpDiagnostics>(std::cout, zone);
}
if (UNLIKELY(needs_ast_printer)) {
diagnostics->set_tree_labeller(
std::make_unique<RegExpGraphLabeller<RegExpTree>>());
diagnostics->set_ast_printer(std::make_unique<RegExpAstNodePrinter>(
diagnostics->os(), diagnostics->tree_labeller(), diagnostics->zone()));
}
if (UNLIKELY(needs_graph_printer)) {
diagnostics->set_graph_labeller(
std::make_unique<RegExpGraphLabeller<RegExpNode>>());
diagnostics->set_graph_printer(std::make_unique<RegExpGraphPrinter>(
std::make_unique<RegExpGraphNodePrinter>(diagnostics->os(),
diagnostics->graph_labeller(),
diagnostics->zone())));
}
if (UNLIKELY(needs_ast_printer || needs_graph_printer)) {
compiler.set_diagnostics(std::move(diagnostics));
}
#endif
if (compiler.optimize()) {
compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
}
data->node = compiler.PreprocessRegExp(data, is_one_byte);
if (data->error != RegExpError::kNone) {
return false;
}
data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node);
if (data->error != RegExpError::kNone) {
return false;
}
#ifdef V8_ENABLE_REGEXP_DIAGNOSTICS
if (UNLIKELY(v8_flags.print_regexp_graph))
compiler.diagnostics()->graph_printer()->PrintGraph(data->node);
if (v8_flags.trace_regexp_graph) DotPrinter::DotPrint("Start", data->node);
#endif
std::unique_ptr<RegExpMacroAssembler> macro_assembler;
if (data->compilation_target == RegExpCompilationTarget::kNative) {
UNREACHABLE();
} else {
DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
// Interpreted regexp implementation.
macro_assembler.reset(
new RegExpBytecodeGenerator(isolate, zone,
is_one_byte ? RegExpMacroAssembler::LATIN1
: RegExpMacroAssembler::UC16));
#ifdef V8_ENABLE_REGEXP_DIAGNOSTICS
if (UNLIKELY(v8_flags.trace_regexp_assembler)) {
std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler =
std::make_unique<RegExpMacroAssemblerTracer>(
std::move(macro_assembler));
macro_assembler = std::move(tracer_macro_assembler);
}
#endif
}
macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
SetBacktrackAndExperimentalFallback(macro_assembler.get(), re_data);
// Inserted here, instead of in Assembler, because it depends on information
// in the AST that isn't replicated in the Node structure.
bool is_end_anchored = data->tree->IsAnchoredAtEnd();
bool is_start_anchored = data->tree->IsAnchoredAtStart();
int max_length = data->tree->max_match();
static const int kMaxBacksearchLimit = 1024;
if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
max_length < kMaxBacksearchLimit) {
macro_assembler->SetCurrentPositionFromEnd(max_length);
}
if (IsGlobal(flags)) {
RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
if (data->tree->min_match() > 0) {
mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
} else if (IsEitherUnicode(flags)) {
mode = RegExpMacroAssembler::GLOBAL_UNICODE;
}
macro_assembler->set_global_mode(mode);
}
RegExpCompiler::CompilationResult result = compiler.Assemble(
isolate, macro_assembler.get(), data->node, data->capture_count, pattern);
// Code / bytecode printing.
{
#ifdef ENABLE_DISASSEMBLER
if (UNLIKELY(v8_flags.print_regexp_code &&
data->compilation_target == RegExpCompilationTarget::kNative &&
result.Succeeded())) {
CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
OFStream os(trace_scope.file());
auto code = CheckedCast<Code>(result.code);
std::unique_ptr<char[]> pattern_cstring = pattern->ToCString();
code->Disassemble(pattern_cstring.get(), os, isolate);
}
if (UNLIKELY(v8_flags.print_regexp_bytecode &&
data->compilation_target ==
RegExpCompilationTarget::kBytecode &&
result.Succeeded())) {
auto bytecode = CheckedCast<TrustedByteArray>(result.code);
std::unique_ptr<char[]> pattern_cstring = pattern->ToCString();
RegExpBytecodeDisassemble(bytecode->begin(), bytecode->length(),
pattern_cstring.get());
}
#endif
}
if (result.error != RegExpError::kNone) {
if (FLAG_correctness_fuzzer_suppressions &&
result.error == RegExpError::kStackOverflow) {
FATAL("Aborting on stack overflow");
}
data->error = result.error;
}
data->code = result.code;
data->register_count = result.num_registers;
return result.Succeeded();
}
std::ostream& operator<<(std::ostream& os, RegExpFlags flags) {
#define V(Lower, Camel, LowerCamel, Char, Bit) \
if (flags & RegExpFlag::k##Camel) os << Char;
REGEXP_FLAG_LIST(V)
#undef V
return os;
}
ObjectPtr RegExpStatics::Interpret(Thread* thread,
const RegExp& regexp,
const String& subject,
int start_index,
bool sticky) {
bool is_one_byte = subject.IsOneByteString();
if (regexp.bytecode(is_one_byte, sticky) == TypedData::null()) {
if (!RegExpImpl::CompileIrregexpFromSource(
thread, regexp, subject, is_one_byte, sticky,
RegExpCompilationTarget::kBytecode)) {
// RegExp was verified at construction.
UNREACHABLE();
}
}
int register_count = regexp.num_registers(is_one_byte);
ASSERT(register_count >= 2);
int32_t* registers = thread->zone()->Alloc<int32_t>(register_count);
for (intptr_t i = 0; i < register_count; i++) {
registers[i] = -1;
}
int r = IrregexpInterpreter::MatchForCallFromRuntime(
thread, regexp, subject, registers, register_count, start_index, sticky);
if (r == IrregexpInterpreter::SUCCESS) {
const TypedData& result = TypedData::Handle(
thread->zone(),
TypedData::New(kTypedDataInt32ArrayCid, register_count));
{
#ifdef DEBUG
// These indices will be used with substring operations that don't check
// bounds, so sanity check them here.
for (intptr_t i = 0; i < register_count; i++) {
int32_t val = registers[i];
ASSERT(val == -1 || (val >= 0 && val <= subject.Length()));
}
#endif
NoSafepointScope no_safepoint(thread);
memcpy(result.DataAddr(0), registers,
register_count * sizeof(int32_t)); // NOLINT
}
return result.ptr();
} else if (r == IrregexpInterpreter::FAILURE) {
return Instance::null();
} else if (r == IrregexpInterpreter::EXCEPTION) {
const Error& error = Error::Handle(thread->StealStickyError());
Exceptions::PropagateError(error);
UNREACHABLE();
} else if (r == IrregexpInterpreter::RETRY) {
UNREACHABLE(); // No tier up in Dart.
} else if (r == IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL) {
UNREACHABLE(); // No alt implementation for Dart.
} else {
UNREACHABLE();
}
return Instance::null();
}
} // namespace dart