diff options
Diffstat (limited to 'js/src/new-regexp/regexp-macro-assembler.cc')
-rw-r--r-- | js/src/new-regexp/regexp-macro-assembler.cc | 344 |
1 files changed, 344 insertions, 0 deletions
diff --git a/js/src/new-regexp/regexp-macro-assembler.cc b/js/src/new-regexp/regexp-macro-assembler.cc new file mode 100644 index 0000000000..52c1cb1ba3 --- /dev/null +++ b/js/src/new-regexp/regexp-macro-assembler.cc @@ -0,0 +1,344 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "new-regexp/regexp-macro-assembler.h" + +#include "new-regexp/regexp-stack.h" + +#ifdef V8_INTL_SUPPORT +#include "unicode/uchar.h" +#include "unicode/unistr.h" +#endif // V8_INTL_SUPPORT + +namespace v8 { +namespace internal { + +RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) + : slow_safe_compiler_(false), + global_mode_(NOT_GLOBAL), + isolate_(isolate), + zone_(zone) {} + +RegExpMacroAssembler::~RegExpMacroAssembler() = default; + +int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate) { + // This function is not allowed to cause a garbage collection. + // A GC might move the calling generated code and invalidate the + // return address on the stack. + DCHECK_EQ(0, byte_length % 2); + +#ifdef V8_INTL_SUPPORT + int32_t length = (int32_t)(byte_length >> 1); + icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1), + length); + return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2), + length, U_FOLD_CASE_DEFAULT) == 0; +#else + uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1); + uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2); + size_t length = byte_length >> 1; + DCHECK_NOT_NULL(isolate); + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = + isolate->regexp_macro_assembler_canonicalize(); + for (size_t i = 0; i < length; i++) { + unibrow::uchar c1 = substring1[i]; + unibrow::uchar c2 = substring2[i]; + if (c1 != c2) { + unibrow::uchar s1[1] = {c1}; + canonicalize->get(c1, '\0', s1); + if (s1[0] != c2) { + unibrow::uchar s2[1] = {c2}; + canonicalize->get(c2, '\0', s2); + if (s1[0] != s2[0]) { + return 0; + } + } + } + } + return 1; +#endif // V8_INTL_SUPPORT +} + + +void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, + Label* on_failure) { + Label ok; + // Check that current character is not a trail surrogate. + LoadCurrentCharacter(cp_offset, &ok); + CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok); + // Check that previous character is not a lead surrogate. + LoadCurrentCharacter(cp_offset - 1, &ok); + CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure); + Bind(&ok); +} + +void RegExpMacroAssembler::CheckPosition(int cp_offset, + Label* on_outside_input) { + LoadCurrentCharacter(cp_offset, on_outside_input, true); +} + +void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset, + Label* on_end_of_input, + bool check_bounds, + int characters, + int eats_at_least) { + // By default, eats_at_least = characters. + if (eats_at_least == kUseCharactersValue) { + eats_at_least = characters; + } + + LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters, + eats_at_least); +} + +bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type, + Label* on_no_match) { + return false; +} + +NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, + Zone* zone) + : RegExpMacroAssembler(isolate, zone) {} + +NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default; + +bool NativeRegExpMacroAssembler::CanReadUnaligned() { + return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); +} + +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + +// This method may only be called after an interrupt. +int NativeRegExpMacroAssembler::CheckStackGuardState( + Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, + Address* return_address, Code re_code, Address* subject, + const byte** input_start, const byte** input_end) { + DisallowHeapAllocation no_gc; + Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0); + DCHECK_LE(re_code.raw_instruction_start(), old_pc); + DCHECK_LE(old_pc, re_code.raw_instruction_end()); + + StackLimitCheck check(isolate); + bool js_has_overflowed = check.JsHasOverflowed(); + + if (call_origin == RegExp::CallOrigin::kFromJs) { + // Direct calls from JavaScript can be interrupted in two ways: + // 1. A real stack overflow, in which case we let the caller throw the + // exception. + // 2. The stack guard was used to interrupt execution for another purpose, + // forcing the call through the runtime system. + + // Bug(v8:9540) Investigate why this method is called from JS although no + // stackoverflow or interrupt is pending on ARM64. We return 0 in this case + // to continue execution normally. + if (js_has_overflowed) { + return EXCEPTION; + } else if (check.InterruptRequested()) { + return RETRY; + } else { + return 0; + } + } + DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime); + + // Prepare for possible GC. + HandleScope handles(isolate); + Handle<Code> code_handle(re_code, isolate); + Handle<String> subject_handle(String::cast(Object(*subject)), isolate); + bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle); + int return_value = 0; + + if (js_has_overflowed) { + AllowHeapAllocation yes_gc; + isolate->StackOverflow(); + return_value = EXCEPTION; + } else if (check.InterruptRequested()) { + AllowHeapAllocation yes_gc; + Object result = isolate->stack_guard()->HandleInterrupts(); + if (result.IsException(isolate)) return_value = EXCEPTION; + } + + if (*code_handle != re_code) { // Return address no longer valid + // Overwrite the return address on the stack. + intptr_t delta = code_handle->address() - re_code.address(); + Address new_pc = old_pc + delta; + // TODO(v8:10026): avoid replacing a signed pointer. + PointerAuthentication::ReplacePC(return_address, new_pc, 0); + } + + // If we continue, we need to update the subject string addresses. + if (return_value == 0) { + // String encoding might have changed. + if (String::IsOneByteRepresentationUnderneath(*subject_handle) != + is_one_byte) { + // If we changed between an LATIN1 and an UC16 string, the specialized + // code cannot be used, and we need to restart regexp matching from + // scratch (including, potentially, compiling a new version of the code). + return_value = RETRY; + } else { + *subject = subject_handle->ptr(); + intptr_t byte_length = *input_end - *input_start; + *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc); + *input_end = *input_start + byte_length; + } + } + return return_value; +} + +// Returns a {Result} sentinel, or the number of successful matches. +int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp, + Handle<String> subject, + int* offsets_vector, + int offsets_vector_length, + int previous_index, Isolate* isolate) { + DCHECK(subject->IsFlat()); + DCHECK_LE(0, previous_index); + DCHECK_LE(previous_index, subject->length()); + + // No allocations before calling the regexp, but we can't use + // DisallowHeapAllocation, since regexps might be preempted, and another + // thread might do allocation anyway. + + String subject_ptr = *subject; + // Character offsets into string. + int start_offset = previous_index; + int char_length = subject_ptr.length() - start_offset; + int slice_offset = 0; + + // The string has been flattened, so if it is a cons string it contains the + // full string in the first part. + if (StringShape(subject_ptr).IsCons()) { + DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length()); + subject_ptr = ConsString::cast(subject_ptr).first(); + } else if (StringShape(subject_ptr).IsSliced()) { + SlicedString slice = SlicedString::cast(subject_ptr); + subject_ptr = slice.parent(); + slice_offset = slice.offset(); + } + if (StringShape(subject_ptr).IsThin()) { + subject_ptr = ThinString::cast(subject_ptr).actual(); + } + // Ensure that an underlying string has the same representation. + bool is_one_byte = subject_ptr.IsOneByteRepresentation(); + DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString()); + // String is now either Sequential or External + int char_size_shift = is_one_byte ? 0 : 1; + + DisallowHeapAllocation no_gc; + const byte* input_start = + subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc); + int byte_length = char_length << char_size_shift; + const byte* input_end = input_start + byte_length; + return Execute(*subject, start_offset, input_start, input_end, offsets_vector, + offsets_vector_length, isolate, *regexp); +} + +// Returns a {Result} sentinel, or the number of successful matches. +// TODO(pthier): The JSRegExp object is passed to native irregexp code to match +// the signature of the interpreter. We should get rid of JS objects passed to +// internal methods. +int NativeRegExpMacroAssembler::Execute( + String input, // This needs to be the unpacked (sliced, cons) string. + int start_offset, const byte* input_start, const byte* input_end, + int* output, int output_size, Isolate* isolate, JSRegExp regexp) { + // Ensure that the minimum stack has been allocated. + RegExpStackScope stack_scope(isolate); + Address stack_base = stack_scope.stack()->stack_base(); + + bool is_one_byte = String::IsOneByteRepresentationUnderneath(input); + Code code = Code::cast(regexp.Code(is_one_byte)); + RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime; + + using RegexpMatcherSig = int( + Address input_string, int start_offset, // NOLINT(readability/casting) + const byte* input_start, const byte* input_end, int* output, + int output_size, Address stack_base, int call_origin, Isolate* isolate, + Address regexp); + + auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code); + int result = + fn.Call(input.ptr(), start_offset, input_start, input_end, output, + output_size, stack_base, call_origin, isolate, regexp.ptr()); + DCHECK(result >= RETRY); + + if (result == EXCEPTION && !isolate->has_pending_exception()) { + // We detected a stack overflow (on the backtrack stack) in RegExp code, + // but haven't created the exception yet. Additionally, we allow heap + // allocation because even though it invalidates {input_start} and + // {input_end}, we are about to return anyway. + AllowHeapAllocation allow_allocation; + isolate->StackOverflow(); + } + return result; +} + +#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + +// clang-format off +const byte NativeRegExpMacroAssembler::word_character_map[] = { + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7' + 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9' + + 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G' + 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O' + 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W' + 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_' + + 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g' + 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o' + 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w' + 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' + // Latin-1 range + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, +}; +// clang-format on + +Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer, + Address* stack_base, + Isolate* isolate) { + RegExpStack* regexp_stack = isolate->regexp_stack(); + size_t size = regexp_stack->stack_capacity(); + Address old_stack_base = regexp_stack->stack_base(); + DCHECK(old_stack_base == *stack_base); + DCHECK(stack_pointer <= old_stack_base); + DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size); + Address new_stack_base = regexp_stack->EnsureCapacity(size * 2); + if (new_stack_base == kNullAddress) { + return kNullAddress; + } + *stack_base = new_stack_base; + intptr_t stack_content_size = old_stack_base - stack_pointer; + return new_stack_base - stack_content_size; +} + +} // namespace internal +} // namespace v8 |