From 22e29728789d598287d22bc290af666940937dd4 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Wed, 5 Oct 2022 01:01:44 -0500 Subject: [JS:RegEx] Remove the unfinished and lost cause "updated" regex engine until such time it isn't a lost cause. We are retaining the minor changes made elsewhere. --- js/moz.configure | 13 - js/src/moz.build | 3 - js/src/new-regexp/RegExpTypes.h | 51 - js/src/new-regexp/VERSION | 2 - js/src/new-regexp/gen-regexp-special-case.cc | 165 - js/src/new-regexp/import-irregexp.py | 143 - js/src/new-regexp/moz.build | 42 - js/src/new-regexp/property-sequences.cc | 1246 ------- js/src/new-regexp/property-sequences.h | 27 - js/src/new-regexp/regexp-ast.cc | 342 -- js/src/new-regexp/regexp-ast.h | 615 ---- js/src/new-regexp/regexp-bytecode-generator-inl.h | 55 - js/src/new-regexp/regexp-bytecode-generator.cc | 395 -- js/src/new-regexp/regexp-bytecode-generator.h | 119 - js/src/new-regexp/regexp-bytecode-peephole.cc | 1028 ------ js/src/new-regexp/regexp-bytecode-peephole.h | 30 - js/src/new-regexp/regexp-bytecodes.cc | 45 - js/src/new-regexp/regexp-bytecodes.h | 251 -- js/src/new-regexp/regexp-compiler-tonode.cc | 1589 -------- js/src/new-regexp/regexp-compiler.cc | 3831 -------------------- js/src/new-regexp/regexp-compiler.h | 621 ---- js/src/new-regexp/regexp-dotprinter.cc | 252 -- js/src/new-regexp/regexp-dotprinter.h | 23 - js/src/new-regexp/regexp-error.cc | 22 - js/src/new-regexp/regexp-error.h | 57 - js/src/new-regexp/regexp-interpreter.cc | 1039 ------ js/src/new-regexp/regexp-interpreter.h | 61 - js/src/new-regexp/regexp-macro-assembler-arch.h | 291 -- js/src/new-regexp/regexp-macro-assembler-tracer.cc | 418 --- js/src/new-regexp/regexp-macro-assembler-tracer.h | 80 - js/src/new-regexp/regexp-macro-assembler.cc | 344 -- js/src/new-regexp/regexp-macro-assembler.h | 280 -- js/src/new-regexp/regexp-native-macro-assembler.cc | 1213 ------- js/src/new-regexp/regexp-nodes.h | 750 ---- js/src/new-regexp/regexp-parser.cc | 2109 ----------- js/src/new-regexp/regexp-parser.h | 363 -- js/src/new-regexp/regexp-shim.cc | 212 -- js/src/new-regexp/regexp-shim.h | 1181 ------ js/src/new-regexp/regexp-stack.cc | 97 - js/src/new-regexp/regexp-stack.h | 141 - js/src/new-regexp/regexp.h | 195 - js/src/new-regexp/special-case.cc | 88 - js/src/new-regexp/special-case.h | 117 - js/src/new-regexp/util/flags.h | 93 - js/src/new-regexp/util/unicode.cc | 1865 ---------- js/src/new-regexp/util/vector.h | 204 -- js/src/new-regexp/util/zone.h | 375 -- 47 files changed, 22483 deletions(-) delete mode 100644 js/src/new-regexp/RegExpTypes.h delete mode 100644 js/src/new-regexp/VERSION delete mode 100644 js/src/new-regexp/gen-regexp-special-case.cc delete mode 100644 js/src/new-regexp/import-irregexp.py delete mode 100644 js/src/new-regexp/moz.build delete mode 100644 js/src/new-regexp/property-sequences.cc delete mode 100644 js/src/new-regexp/property-sequences.h delete mode 100644 js/src/new-regexp/regexp-ast.cc delete mode 100644 js/src/new-regexp/regexp-ast.h delete mode 100644 js/src/new-regexp/regexp-bytecode-generator-inl.h delete mode 100644 js/src/new-regexp/regexp-bytecode-generator.cc delete mode 100644 js/src/new-regexp/regexp-bytecode-generator.h delete mode 100644 js/src/new-regexp/regexp-bytecode-peephole.cc delete mode 100644 js/src/new-regexp/regexp-bytecode-peephole.h delete mode 100644 js/src/new-regexp/regexp-bytecodes.cc delete mode 100644 js/src/new-regexp/regexp-bytecodes.h delete mode 100644 js/src/new-regexp/regexp-compiler-tonode.cc delete mode 100644 js/src/new-regexp/regexp-compiler.cc delete mode 100644 js/src/new-regexp/regexp-compiler.h delete mode 100644 js/src/new-regexp/regexp-dotprinter.cc delete mode 100644 js/src/new-regexp/regexp-dotprinter.h delete mode 100644 js/src/new-regexp/regexp-error.cc delete mode 100644 js/src/new-regexp/regexp-error.h delete mode 100644 js/src/new-regexp/regexp-interpreter.cc delete mode 100644 js/src/new-regexp/regexp-interpreter.h delete mode 100644 js/src/new-regexp/regexp-macro-assembler-arch.h delete mode 100644 js/src/new-regexp/regexp-macro-assembler-tracer.cc delete mode 100644 js/src/new-regexp/regexp-macro-assembler-tracer.h delete mode 100644 js/src/new-regexp/regexp-macro-assembler.cc delete mode 100644 js/src/new-regexp/regexp-macro-assembler.h delete mode 100644 js/src/new-regexp/regexp-native-macro-assembler.cc delete mode 100644 js/src/new-regexp/regexp-nodes.h delete mode 100644 js/src/new-regexp/regexp-parser.cc delete mode 100644 js/src/new-regexp/regexp-parser.h delete mode 100644 js/src/new-regexp/regexp-shim.cc delete mode 100644 js/src/new-regexp/regexp-shim.h delete mode 100644 js/src/new-regexp/regexp-stack.cc delete mode 100644 js/src/new-regexp/regexp-stack.h delete mode 100644 js/src/new-regexp/regexp.h delete mode 100644 js/src/new-regexp/special-case.cc delete mode 100644 js/src/new-regexp/special-case.h delete mode 100644 js/src/new-regexp/util/flags.h delete mode 100644 js/src/new-regexp/util/unicode.cc delete mode 100644 js/src/new-regexp/util/vector.h delete mode 100644 js/src/new-regexp/util/zone.h diff --git a/js/moz.configure b/js/moz.configure index 3bbaf01a3..7687731f9 100644 --- a/js/moz.configure +++ b/js/moz.configure @@ -246,16 +246,3 @@ with only_when('--enable-compile-environment'): set_config('LIBFUZZER', enable_libfuzzer) set_define('LIBFUZZER', enable_libfuzzer) - -# Initial support for new regexp engine -# ================================================== - -js_option('--enable-new-regexp', default=False, help='Enable new regexp engine') - -@depends('--enable-new-regexp') -def enable_new_regexp(value): - if value: - return True - -set_config('JS_NEW_REGEXP', enable_new_regexp) -set_define('JS_NEW_REGEXP', enable_new_regexp) diff --git a/js/src/moz.build b/js/src/moz.build index 7d653d828..a3a0f8791 100644 --- a/js/src/moz.build +++ b/js/src/moz.build @@ -122,9 +122,6 @@ if CONFIG['JS_HAS_CTYPES']: if CONFIG['JS_BUNDLED_EDITLINE']: DIRS += ['editline'] -if CONFIG['JS_NEW_REGEXP']: - DIRS += ['new-regexp'] - if not CONFIG['JS_DISABLE_SHELL']: DIRS += ['shell'] diff --git a/js/src/new-regexp/RegExpTypes.h b/js/src/new-regexp/RegExpTypes.h deleted file mode 100644 index e260b5bb6..000000000 --- a/js/src/new-regexp/RegExpTypes.h +++ /dev/null @@ -1,51 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim: set ts=8 sts=2 et sw=2 tw=80: - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// This file forward-defines Irregexp classes that need to be visible -// to the rest of Spidermonkey and re-exports them into js::irregexp. - -#ifndef regexp_RegExpTypes_h -#define regexp_RegExpTypes_h - -namespace js { -class MatchPairs; -} - -namespace v8 { -namespace internal { - -struct InputOutputData { - const void* inputStart; - const void* inputEnd; - - // Index into inputStart (in chars) at which to begin matching. - size_t startIndex; - - js::MatchPairs* matches; - - template - InputOutputData(const CharT* inputStart, const CharT* inputEnd, - size_t startIndex, js::MatchPairs* matches) - : inputStart(inputStart), - inputEnd(inputEnd), - startIndex(startIndex), - matches(matches) - {} -}; - -} // namespace internal -} // namespace v8 - - -namespace js { -namespace irregexp { - -using InputOutputData = v8::internal::InputOutputData; - -} // namespace irregexp -} // namespace js - -#endif // regexp_RegExpTypes_h diff --git a/js/src/new-regexp/VERSION b/js/src/new-regexp/VERSION deleted file mode 100644 index c7d35a2bb..000000000 --- a/js/src/new-regexp/VERSION +++ /dev/null @@ -1,2 +0,0 @@ -Imported using import-irregexp.py from: -https://github.com/v8/v8/tree/560f2d8bb3f3a72d78e1a7d7654235d53fdcc83c/src/regexp diff --git a/js/src/new-regexp/gen-regexp-special-case.cc b/js/src/new-regexp/gen-regexp-special-case.cc deleted file mode 100644 index 5a82c5d27..000000000 --- a/js/src/new-regexp/gen-regexp-special-case.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include -#include -#include -#include - -#include "new-regexp/special-case.h" - -namespace v8 { -namespace internal { - -static const uc32 kSurrogateStart = 0xd800; -static const uc32 kSurrogateEnd = 0xdfff; -static const uc32 kNonBmpStart = 0x10000; - -// The following code generates "src/regexp/special-case.cc". -void PrintSet(std::ofstream& out, const char* name, - const icu::UnicodeSet& set) { - out << "icu::UnicodeSet Build" << name << "() {\n" - << " icu::UnicodeSet set;\n"; - for (int32_t i = 0; i < set.getRangeCount(); i++) { - if (set.getRangeStart(i) == set.getRangeEnd(i)) { - out << " set.add(0x" << set.getRangeStart(i) << ");\n"; - } else { - out << " set.add(0x" << set.getRangeStart(i) << ", 0x" - << set.getRangeEnd(i) << ");\n"; - } - } - out << " set.freeze();\n" - << " return set;\n" - << "}\n\n"; - - out << "struct " << name << "Data {\n" - << " " << name << "Data() : set(Build" << name << "()) {}\n" - << " const icu::UnicodeSet set;\n" - << "};\n\n"; - - out << "//static\n" - << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" - << " static base::LazyInstance<" << name << "Data>::type set =\n" - << " LAZY_INSTANCE_INITIALIZER;\n" - << " return set.Pointer()->set;\n" - << "}\n\n"; -} - -void PrintSpecial(std::ofstream& out) { - icu::UnicodeSet current; - icu::UnicodeSet special_add; - icu::UnicodeSet ignore; - UErrorCode status = U_ZERO_ERROR; - icu::UnicodeSet upper("[\\p{Lu}]", status); - CHECK(U_SUCCESS(status)); - - // Iterate through all chars in BMP except surrogates. - for (UChar32 i = 0; i < kNonBmpStart; i++) { - if (i >= kSurrogateStart && i <= kSurrogateEnd) { - continue; // Ignore surrogate range - } - current.set(i, i); - current.closeOver(USET_CASE_INSENSITIVE); - - // Check to see if all characters in the case-folding equivalence - // class as defined by UnicodeSet::closeOver all map to the same - // canonical value. - UChar32 canonical = RegExpCaseFolding::Canonicalize(i); - bool class_has_matching_canonical_char = false; - bool class_has_non_matching_canonical_char = false; - for (int32_t j = 0; j < current.getRangeCount(); j++) { - for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); - c++) { - if (c == i) { - continue; - } - UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); - if (canonical == other_canonical) { - class_has_matching_canonical_char = true; - } else { - class_has_non_matching_canonical_char = true; - } - } - } - // If any other character in i's equivalence class has a - // different canonical value, then i needs special handling. If - // no other character shares a canonical value with i, we can - // ignore i when adding alternatives for case-independent - // comparison. If at least one other character shares a - // canonical value, then i needs special handling. - if (class_has_non_matching_canonical_char) { - if (class_has_matching_canonical_char) { - special_add.add(i); - } else { - ignore.add(i); - } - } - } - - // Verify that no Unicode equivalence class contains two non-trivial - // JS equivalence classes. Every character in SpecialAddSet has the - // same canonical value as every other non-IgnoreSet character in - // its Unicode equivalence class. Therefore, if we call closeOver on - // a set containing no IgnoreSet characters, the only characters - // that must be removed from the result are in IgnoreSet. This fact - // is used in CharacterRange::AddCaseEquivalents. - for (int32_t i = 0; i < special_add.getRangeCount(); i++) { - for (UChar32 c = special_add.getRangeStart(i); - c <= special_add.getRangeEnd(i); c++) { - UChar32 canonical = RegExpCaseFolding::Canonicalize(c); - current.set(c, c); - current.closeOver(USET_CASE_INSENSITIVE); - current.removeAll(ignore); - for (int32_t j = 0; j < current.getRangeCount(); j++) { - for (UChar32 c2 = current.getRangeStart(j); - c2 <= current.getRangeEnd(j); c2++) { - CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); - } - } - } - } - - PrintSet(out, "IgnoreSet", ignore); - PrintSet(out, "SpecialAddSet", special_add); -} - -void WriteHeader(const char* header_filename) { - std::ofstream out(header_filename); - out << std::hex << std::setfill('0') << std::setw(4); - out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" - << "// Use of this source code is governed by a BSD-style license that\n" - << "// can be found in the LICENSE file.\n\n" - << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" - << "// The following functions are used to build UnicodeSets\n" - << "// for special cases where the case-folding algorithm used by\n" - << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" - << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" - << "// Semantics: Canonicalize) step 3.\n\n" - << "#ifdef V8_INTL_SUPPORT\n" - << "#include \"src/base/lazy-instance.h\"\n\n" - << "#include \"src/regexp/special-case.h\"\n\n" - << "#include \"unicode/uniset.h\"\n" - << "namespace v8 {\n" - << "namespace internal {\n\n"; - - PrintSpecial(out); - - out << "\n" - << "} // namespace internal\n" - << "} // namespace v8\n" - << "#endif // V8_INTL_SUPPORT\n"; -} - -} // namespace internal -} // namespace v8 - -int main(int argc, const char** argv) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " \n"; - std::exit(1); - } - v8::internal::WriteHeader(argv[1]); - - return 0; -} diff --git a/js/src/new-regexp/import-irregexp.py b/js/src/new-regexp/import-irregexp.py deleted file mode 100644 index 870387232..000000000 --- a/js/src/new-regexp/import-irregexp.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python3 - -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. - -# This script handles all the mechanical steps of importing irregexp from v8: -# -# 1. Acquire the source: either from github, or optionally from a local copy of v8. -# 2. Copy the contents of v8/src/regexp into js/src/regexp -# - Exclude files that we have chosen not to import. -# 3. While doing so, update #includes: -# - Change "src/regexp/*" to "regexp/*". -# - Remove other v8-specific headers completely. -# 4. Add '#include "regexp/regexp-shim.h" in the necessary places. -# 5. Update the VERSION file to include the correct git hash. -# -# Usage: -# cd path/to/js/src/regexp -# ./import-irregexp.py --path path/to/v8/src/regexp -# -# Alternatively, without the --path argument, import-irregexp.py will -# clone v8 from github into a temporary directory. -# -# After running this script, changes to the shim code may be necessary -# to account for changes in upstream irregexp. - -import os -import re -import subprocess -import sys -from pathlib import Path - - -def get_hash(path): - # Get the hash for the current git revision - cwd = os.getcwd() - os.chdir(path) - command = ['git', 'rev-parse', 'HEAD'] - result = subprocess.check_output(command, encoding='utf-8') - os.chdir(cwd) - return result.rstrip() - - -def copy_and_update_includes(src_path, dst_path): - # List of header files that need to include the shim header - need_shim = ['property-sequences.h', - 'regexp-ast.h', - 'regexp-bytecode-peephole.h', - 'regexp-bytecodes.h', - 'regexp-dotprinter.h', - 'regexp.h', - 'regexp-macro-assembler.h', - 'regexp-stack.h', - 'special-case.h'] - - src = open(str(src_path), 'r') - dst = open(str(dst_path), 'w') - - # 1. Rewrite includes of V8 regexp headers: - regexp_include = re.compile('#include "src/regexp') - regexp_include_new = '#include "regexp' - - # 2. Remove includes of other V8 headers - other_include = re.compile('#include "src/') - - # 3. If needed, add '#include "regexp/regexp-shim.h"'. - # Note: We get a little fancy to ensure that header files are - # in alphabetic order. `need_to_add_shim` is true if we still - # have to add the shim header in this file. `adding_shim_now` - # is true if we have found a '#include "src/*' and we are just - # waiting to find something alphabetically smaller (or an empty - # line) so that we can insert the shim header in the right place. - need_to_add_shim = src_path.name in need_shim - adding_shim_now = False - - for line in src: - if adding_shim_now: - if (line == '\n' or line > '#include "src/regexp/regexp-shim.h"'): - dst.write('#include "regexp/regexp-shim.h"\n') - need_to_add_shim = False - adding_shim_now = False - - if regexp_include.search(line): - dst.write(re.sub(regexp_include, regexp_include_new, line)) - elif other_include.search(line): - if need_to_add_shim: - adding_shim_now = True - else: - dst.write(line) - - -def import_from(srcdir, dstdir): - excluded = ['OWNERS', - 'regexp.cc', - 'regexp-utils.cc', - 'regexp-utils.h', - 'regexp-macro-assembler-arch.h'] - - for file in srcdir.iterdir(): - if file.is_dir(): - continue - if str(file.name) in excluded: - continue - copy_and_update_includes(file, dstdir / file.name) - - # Update VERSION file - hash = get_hash(srcdir) - version_file = open(str(dstdir / 'VERSION'), 'w') - version_file.write('Imported using import-irregexp.py from:\n') - version_file.write('https://github.com/v8/v8/tree/%s/src/regexp\n' % hash) - - -if __name__ == '__main__': - import argparse - import tempfile - - # This script should be run from js/src/regexp to work correctly. - current_path = Path(os.getcwd()) - expected_path = 'js/src/regexp' - if not current_path.match(expected_path): - raise RuntimeError('%s must be run from %s' % (sys.argv[0], - expected_path)) - - parser = argparse.ArgumentParser(description='Import irregexp from v8') - parser.add_argument('-p', '--path', help='path to v8/src/regexp') - args = parser.parse_args() - - if args.path: - src_path = Path(args.path) - - if not (src_path / 'regexp.h').exists(): - print('Usage:\n import-irregexp.py --path ') - sys.exit(1) - import_from(src_path, current_path) - sys.exit(0) - - with tempfile.TemporaryDirectory() as tempdir: - v8_git = 'https://github.com/v8/v8.git' - clone = 'git clone --depth 1 %s %s' % (v8_git, tempdir) - os.system(clone) - src_path = Path(tempdir) / 'src/regexp' - import_from(src_path, current_path) diff --git a/js/src/new-regexp/moz.build b/js/src/new-regexp/moz.build deleted file mode 100644 index 2a8fab2ef..000000000 --- a/js/src/new-regexp/moz.build +++ /dev/null @@ -1,42 +0,0 @@ -# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. - -include('../js-config.mozbuild') -include('../js-cxxflags.mozbuild') - -FINAL_LIBRARY = "js" - -# Includes should be relative to parent path -LOCAL_INCLUDES += ["!..", ".."] - -SOURCES += [ - 'regexp-ast.cc', - 'regexp-bytecode-generator.cc', - 'regexp-bytecode-peephole.cc', - 'regexp-bytecodes.cc', - 'regexp-compiler-tonode.cc', - 'regexp-compiler.cc', - 'regexp-dotprinter.cc', - 'regexp-interpreter.cc', - 'regexp-macro-assembler-tracer.cc', - 'regexp-macro-assembler.cc', - 'regexp-native-macro-assembler.cc', - 'regexp-parser.cc', - 'regexp-shim.cc', - 'regexp-stack.cc', - 'util/unicode.cc' -] - -if CONFIG['ENABLE_INTL_API']: - CXXFLAGS += ['-DV8_INTL_SUPPORT'] - SOURCES += [ - 'property-sequences.cc', - 'special-case.cc' - ] - -if CONFIG['_MSC_VER']: - # This is intended as a temporary workaround to unblock compilation - # on VS2015 in warnings as errors mode. - CXXFLAGS += ['-wd4275'] \ No newline at end of file diff --git a/js/src/new-regexp/property-sequences.cc b/js/src/new-regexp/property-sequences.cc deleted file mode 100644 index ca1a7f2c3..000000000 --- a/js/src/new-regexp/property-sequences.cc +++ /dev/null @@ -1,1246 +0,0 @@ -// Copyright 2018 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifdef V8_INTL_SUPPORT - -#include "new-regexp/property-sequences.h" - -namespace v8 { -namespace internal { - -/* -Generated from following Node.js source: - -package.json - -``` -{ - "private": true, - "dependencies": { - "unicode-12.0.0": "^0.7.9" - } -} -``` - -generate-unicode-sequence-property-data.js - -``` -const toHex = (symbol) => { - return '0x' + symbol.codePointAt(0).toString(16) - .toUpperCase().padStart(6, '0'); -}; - -const generateData = (property) => { - const sequences = - require(`unicode-12.0.0/Sequence_Property/${ property }/index.js`); - const id = property.replace(/_/g, '') + 's'; - const buffer = []; - for (const sequence of sequences) { - const symbols = [...sequence]; - const codePoints = symbols.map(symbol => toHex(symbol)); - buffer.push(' ' + codePoints.join(', ') + ', 0,'); - } - const output = - `const uc32 UnicodePropertySequences::k${ id }[] = {\n` + - `${ buffer.join('\n') }\n 0 // null-terminating the list\n};\n`; - return output; -}; - -const properties = [ - 'Emoji_Flag_Sequence', - 'Emoji_Tag_Sequence', - 'Emoji_ZWJ_Sequence', -]; - -for (const property of properties) { - console.log(generateData(property)); -} -``` -*/ - -// clang-format off -const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = { - 0x01F1E6, 0x01F1E8, 0, - 0x01F1FF, 0x01F1FC, 0, - 0x01F1E6, 0x01F1EA, 0, - 0x01F1E6, 0x01F1EB, 0, - 0x01F1E6, 0x01F1EC, 0, - 0x01F1E6, 0x01F1EE, 0, - 0x01F1E6, 0x01F1F1, 0, - 0x01F1E6, 0x01F1F2, 0, - 0x01F1E6, 0x01F1F4, 0, - 0x01F1E6, 0x01F1F6, 0, - 0x01F1E6, 0x01F1F7, 0, - 0x01F1E6, 0x01F1F8, 0, - 0x01F1E6, 0x01F1F9, 0, - 0x01F1E6, 0x01F1FA, 0, - 0x01F1E6, 0x01F1FC, 0, - 0x01F1E6, 0x01F1FD, 0, - 0x01F1E6, 0x01F1FF, 0, - 0x01F1E7, 0x01F1E6, 0, - 0x01F1E7, 0x01F1E7, 0, - 0x01F1E7, 0x01F1E9, 0, - 0x01F1E7, 0x01F1EA, 0, - 0x01F1E7, 0x01F1EB, 0, - 0x01F1E7, 0x01F1EC, 0, - 0x01F1E7, 0x01F1ED, 0, - 0x01F1E7, 0x01F1EE, 0, - 0x01F1E7, 0x01F1EF, 0, - 0x01F1E7, 0x01F1F1, 0, - 0x01F1E7, 0x01F1F2, 0, - 0x01F1E7, 0x01F1F3, 0, - 0x01F1E7, 0x01F1F4, 0, - 0x01F1E7, 0x01F1F6, 0, - 0x01F1E7, 0x01F1F7, 0, - 0x01F1E7, 0x01F1F8, 0, - 0x01F1E7, 0x01F1F9, 0, - 0x01F1E7, 0x01F1FB, 0, - 0x01F1E7, 0x01F1FC, 0, - 0x01F1E7, 0x01F1FE, 0, - 0x01F1E7, 0x01F1FF, 0, - 0x01F1E8, 0x01F1E6, 0, - 0x01F1E8, 0x01F1E8, 0, - 0x01F1E8, 0x01F1E9, 0, - 0x01F1E8, 0x01F1EB, 0, - 0x01F1E8, 0x01F1EC, 0, - 0x01F1E8, 0x01F1ED, 0, - 0x01F1E8, 0x01F1EE, 0, - 0x01F1E8, 0x01F1F0, 0, - 0x01F1E8, 0x01F1F1, 0, - 0x01F1E8, 0x01F1F2, 0, - 0x01F1E8, 0x01F1F3, 0, - 0x01F1E8, 0x01F1F4, 0, - 0x01F1E8, 0x01F1F5, 0, - 0x01F1E8, 0x01F1F7, 0, - 0x01F1E8, 0x01F1FA, 0, - 0x01F1E8, 0x01F1FB, 0, - 0x01F1E8, 0x01F1FC, 0, - 0x01F1E8, 0x01F1FD, 0, - 0x01F1E8, 0x01F1FE, 0, - 0x01F1E8, 0x01F1FF, 0, - 0x01F1E9, 0x01F1EA, 0, - 0x01F1E9, 0x01F1EC, 0, - 0x01F1E9, 0x01F1EF, 0, - 0x01F1E9, 0x01F1F0, 0, - 0x01F1E9, 0x01F1F2, 0, - 0x01F1E9, 0x01F1F4, 0, - 0x01F1E9, 0x01F1FF, 0, - 0x01F1EA, 0x01F1E6, 0, - 0x01F1EA, 0x01F1E8, 0, - 0x01F1EA, 0x01F1EA, 0, - 0x01F1EA, 0x01F1EC, 0, - 0x01F1EA, 0x01F1ED, 0, - 0x01F1EA, 0x01F1F7, 0, - 0x01F1EA, 0x01F1F8, 0, - 0x01F1EA, 0x01F1F9, 0, - 0x01F1EA, 0x01F1FA, 0, - 0x01F1EB, 0x01F1EE, 0, - 0x01F1EB, 0x01F1EF, 0, - 0x01F1EB, 0x01F1F0, 0, - 0x01F1EB, 0x01F1F2, 0, - 0x01F1EB, 0x01F1F4, 0, - 0x01F1EB, 0x01F1F7, 0, - 0x01F1EC, 0x01F1E6, 0, - 0x01F1EC, 0x01F1E7, 0, - 0x01F1EC, 0x01F1E9, 0, - 0x01F1EC, 0x01F1EA, 0, - 0x01F1EC, 0x01F1EB, 0, - 0x01F1EC, 0x01F1EC, 0, - 0x01F1EC, 0x01F1ED, 0, - 0x01F1EC, 0x01F1EE, 0, - 0x01F1EC, 0x01F1F1, 0, - 0x01F1EC, 0x01F1F2, 0, - 0x01F1EC, 0x01F1F3, 0, - 0x01F1EC, 0x01F1F5, 0, - 0x01F1EC, 0x01F1F6, 0, - 0x01F1EC, 0x01F1F7, 0, - 0x01F1EC, 0x01F1F8, 0, - 0x01F1EC, 0x01F1F9, 0, - 0x01F1EC, 0x01F1FA, 0, - 0x01F1EC, 0x01F1FC, 0, - 0x01F1EC, 0x01F1FE, 0, - 0x01F1ED, 0x01F1F0, 0, - 0x01F1ED, 0x01F1F2, 0, - 0x01F1ED, 0x01F1F3, 0, - 0x01F1ED, 0x01F1F7, 0, - 0x01F1ED, 0x01F1F9, 0, - 0x01F1ED, 0x01F1FA, 0, - 0x01F1EE, 0x01F1E8, 0, - 0x01F1EE, 0x01F1E9, 0, - 0x01F1EE, 0x01F1EA, 0, - 0x01F1EE, 0x01F1F1, 0, - 0x01F1EE, 0x01F1F2, 0, - 0x01F1EE, 0x01F1F3, 0, - 0x01F1EE, 0x01F1F4, 0, - 0x01F1EE, 0x01F1F6, 0, - 0x01F1EE, 0x01F1F7, 0, - 0x01F1EE, 0x01F1F8, 0, - 0x01F1EE, 0x01F1F9, 0, - 0x01F1EF, 0x01F1EA, 0, - 0x01F1EF, 0x01F1F2, 0, - 0x01F1EF, 0x01F1F4, 0, - 0x01F1EF, 0x01F1F5, 0, - 0x01F1F0, 0x01F1EA, 0, - 0x01F1F0, 0x01F1EC, 0, - 0x01F1F0, 0x01F1ED, 0, - 0x01F1F0, 0x01F1EE, 0, - 0x01F1F0, 0x01F1F2, 0, - 0x01F1F0, 0x01F1F3, 0, - 0x01F1F0, 0x01F1F5, 0, - 0x01F1F0, 0x01F1F7, 0, - 0x01F1F0, 0x01F1FC, 0, - 0x01F1E6, 0x01F1E9, 0, - 0x01F1F0, 0x01F1FF, 0, - 0x01F1F1, 0x01F1E6, 0, - 0x01F1F1, 0x01F1E7, 0, - 0x01F1F1, 0x01F1E8, 0, - 0x01F1F1, 0x01F1EE, 0, - 0x01F1F1, 0x01F1F0, 0, - 0x01F1F1, 0x01F1F7, 0, - 0x01F1F1, 0x01F1F8, 0, - 0x01F1F1, 0x01F1F9, 0, - 0x01F1F1, 0x01F1FA, 0, - 0x01F1F1, 0x01F1FB, 0, - 0x01F1F1, 0x01F1FE, 0, - 0x01F1F2, 0x01F1E6, 0, - 0x01F1F2, 0x01F1E8, 0, - 0x01F1F2, 0x01F1E9, 0, - 0x01F1F2, 0x01F1EA, 0, - 0x01F1F2, 0x01F1EB, 0, - 0x01F1F2, 0x01F1EC, 0, - 0x01F1F2, 0x01F1ED, 0, - 0x01F1F2, 0x01F1F0, 0, - 0x01F1F2, 0x01F1F1, 0, - 0x01F1F2, 0x01F1F2, 0, - 0x01F1F2, 0x01F1F3, 0, - 0x01F1F2, 0x01F1F4, 0, - 0x01F1F2, 0x01F1F5, 0, - 0x01F1F2, 0x01F1F6, 0, - 0x01F1F2, 0x01F1F7, 0, - 0x01F1F2, 0x01F1F8, 0, - 0x01F1F2, 0x01F1F9, 0, - 0x01F1F2, 0x01F1FA, 0, - 0x01F1F2, 0x01F1FB, 0, - 0x01F1F2, 0x01F1FC, 0, - 0x01F1F2, 0x01F1FD, 0, - 0x01F1F2, 0x01F1FE, 0, - 0x01F1F2, 0x01F1FF, 0, - 0x01F1F3, 0x01F1E6, 0, - 0x01F1F3, 0x01F1E8, 0, - 0x01F1F3, 0x01F1EA, 0, - 0x01F1F3, 0x01F1EB, 0, - 0x01F1F3, 0x01F1EC, 0, - 0x01F1F3, 0x01F1EE, 0, - 0x01F1F3, 0x01F1F1, 0, - 0x01F1F3, 0x01F1F4, 0, - 0x01F1F3, 0x01F1F5, 0, - 0x01F1F3, 0x01F1F7, 0, - 0x01F1F3, 0x01F1FA, 0, - 0x01F1F3, 0x01F1FF, 0, - 0x01F1F4, 0x01F1F2, 0, - 0x01F1F5, 0x01F1E6, 0, - 0x01F1F5, 0x01F1EA, 0, - 0x01F1F5, 0x01F1EB, 0, - 0x01F1F5, 0x01F1EC, 0, - 0x01F1F5, 0x01F1ED, 0, - 0x01F1F5, 0x01F1F0, 0, - 0x01F1F5, 0x01F1F1, 0, - 0x01F1F5, 0x01F1F2, 0, - 0x01F1F5, 0x01F1F3, 0, - 0x01F1F5, 0x01F1F7, 0, - 0x01F1F5, 0x01F1F8, 0, - 0x01F1F5, 0x01F1F9, 0, - 0x01F1F5, 0x01F1FC, 0, - 0x01F1F5, 0x01F1FE, 0, - 0x01F1F6, 0x01F1E6, 0, - 0x01F1F7, 0x01F1EA, 0, - 0x01F1F7, 0x01F1F4, 0, - 0x01F1F7, 0x01F1F8, 0, - 0x01F1F7, 0x01F1FA, 0, - 0x01F1F7, 0x01F1FC, 0, - 0x01F1F8, 0x01F1E6, 0, - 0x01F1F8, 0x01F1E7, 0, - 0x01F1F8, 0x01F1E8, 0, - 0x01F1F8, 0x01F1E9, 0, - 0x01F1F8, 0x01F1EA, 0, - 0x01F1F8, 0x01F1EC, 0, - 0x01F1F8, 0x01F1ED, 0, - 0x01F1F8, 0x01F1EE, 0, - 0x01F1F8, 0x01F1EF, 0, - 0x01F1F8, 0x01F1F0, 0, - 0x01F1F8, 0x01F1F1, 0, - 0x01F1F8, 0x01F1F2, 0, - 0x01F1F8, 0x01F1F3, 0, - 0x01F1F8, 0x01F1F4, 0, - 0x01F1F8, 0x01F1F7, 0, - 0x01F1F8, 0x01F1F8, 0, - 0x01F1F8, 0x01F1F9, 0, - 0x01F1F8, 0x01F1FB, 0, - 0x01F1F8, 0x01F1FD, 0, - 0x01F1F8, 0x01F1FE, 0, - 0x01F1F8, 0x01F1FF, 0, - 0x01F1F9, 0x01F1E6, 0, - 0x01F1F9, 0x01F1E8, 0, - 0x01F1F9, 0x01F1E9, 0, - 0x01F1F9, 0x01F1EB, 0, - 0x01F1F9, 0x01F1EC, 0, - 0x01F1F9, 0x01F1ED, 0, - 0x01F1F9, 0x01F1EF, 0, - 0x01F1F9, 0x01F1F0, 0, - 0x01F1F9, 0x01F1F1, 0, - 0x01F1F9, 0x01F1F2, 0, - 0x01F1F9, 0x01F1F3, 0, - 0x01F1F9, 0x01F1F4, 0, - 0x01F1F9, 0x01F1F7, 0, - 0x01F1F9, 0x01F1F9, 0, - 0x01F1F9, 0x01F1FB, 0, - 0x01F1F9, 0x01F1FC, 0, - 0x01F1F9, 0x01F1FF, 0, - 0x01F1FA, 0x01F1E6, 0, - 0x01F1FA, 0x01F1EC, 0, - 0x01F1FA, 0x01F1F2, 0, - 0x01F1FA, 0x01F1F3, 0, - 0x01F1FA, 0x01F1F8, 0, - 0x01F1FA, 0x01F1FE, 0, - 0x01F1FA, 0x01F1FF, 0, - 0x01F1FB, 0x01F1E6, 0, - 0x01F1FB, 0x01F1E8, 0, - 0x01F1FB, 0x01F1EA, 0, - 0x01F1FB, 0x01F1EC, 0, - 0x01F1FB, 0x01F1EE, 0, - 0x01F1FB, 0x01F1F3, 0, - 0x01F1FB, 0x01F1FA, 0, - 0x01F1FC, 0x01F1EB, 0, - 0x01F1FC, 0x01F1F8, 0, - 0x01F1FD, 0x01F1F0, 0, - 0x01F1FE, 0x01F1EA, 0, - 0x01F1FE, 0x01F1F9, 0, - 0x01F1FF, 0x01F1E6, 0, - 0x01F1FF, 0x01F1F2, 0, - 0x01F1F0, 0x01F1FE, 0, - 0 // null-terminating the list -}; - -const uc32 UnicodePropertySequences::kEmojiTagSequences[] = { - 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0065, 0x0E006E, 0x0E0067, 0x0E007F, 0, - 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0073, 0x0E0063, 0x0E0074, 0x0E007F, 0, - 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0077, 0x0E006C, 0x0E0073, 0x0E007F, 0, - 0 // null-terminating the list -}; - -const uc32 UnicodePropertySequences::kEmojiZWJSequences[] = { - 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0, - 0x01F441, 0x00FE0F, 0x00200D, 0x01F5E8, 0x00FE0F, 0, - 0x01F468, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F467, 0, - 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, - 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0, - 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, - 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0, - 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, - 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, - 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0, - 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F469, 0, - 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D, - 0x01F468, 0, - 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D, - 0x01F469, 0, - 0x01F469, 0x00200D, 0x01F466, 0, - 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, - 0x01F469, 0x00200D, 0x01F467, 0, - 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, - 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, - 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0, - 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, - 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0, - 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, - 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FD, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FD, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FE, 0, - 0x01F9D1, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0, - 0x01F9D1, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, - 0x01F9D1, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, - 0x01F9D1, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, - 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, - 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, - 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0, - 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, - 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, - 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0, - 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FE, 0, - 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, - 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, - 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0, - 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FE, 0, - 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FF, 0, - 0x01F468, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F468, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F468, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F468, 0x00200D, 0x01F33E, 0, - 0x01F468, 0x00200D, 0x01F373, 0, - 0x01F468, 0x00200D, 0x01F393, 0, - 0x01F468, 0x00200D, 0x01F3A4, 0, - 0x01F468, 0x00200D, 0x01F3A8, 0, - 0x01F468, 0x00200D, 0x01F3EB, 0, - 0x01F468, 0x00200D, 0x01F3ED, 0, - 0x01F468, 0x00200D, 0x01F4BB, 0, - 0x01F468, 0x00200D, 0x01F4BC, 0, - 0x01F468, 0x00200D, 0x01F527, 0, - 0x01F468, 0x00200D, 0x01F52C, 0, - 0x01F468, 0x00200D, 0x01F680, 0, - 0x01F468, 0x00200D, 0x01F692, 0, - 0x01F468, 0x00200D, 0x01F9AF, 0, - 0x01F468, 0x00200D, 0x01F9BC, 0, - 0x01F468, 0x00200D, 0x01F9BD, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F33E, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F373, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F393, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F3A4, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F3A8, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F3EB, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F3ED, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F4BB, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F4BC, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F527, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F52C, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F680, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F692, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9AF, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9BC, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9BD, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F33E, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F373, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F393, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F3A4, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F3A8, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F3EB, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F3ED, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F4BB, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F4BC, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F527, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F52C, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F680, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F692, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9AF, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9BC, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9BD, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F33E, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F373, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F393, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F3A4, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F3A8, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F3EB, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F3ED, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F4BB, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F4BC, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F527, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F52C, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F680, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F692, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9AF, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9BC, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9BD, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F33E, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F373, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F393, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F3A4, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F3A8, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F3EB, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F3ED, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F4BB, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F4BC, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F527, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F52C, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F680, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F692, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9AF, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9BC, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9BD, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F33E, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F373, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F393, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F3A4, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F3A8, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F3EB, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F3ED, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F4BB, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F4BC, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F527, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F52C, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F680, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F692, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9AF, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9BC, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9BD, 0, - 0x01F469, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F469, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F469, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F469, 0x00200D, 0x01F33E, 0, - 0x01F469, 0x00200D, 0x01F373, 0, - 0x01F469, 0x00200D, 0x01F393, 0, - 0x01F469, 0x00200D, 0x01F3A4, 0, - 0x01F469, 0x00200D, 0x01F3A8, 0, - 0x01F469, 0x00200D, 0x01F3EB, 0, - 0x01F469, 0x00200D, 0x01F3ED, 0, - 0x01F469, 0x00200D, 0x01F4BB, 0, - 0x01F469, 0x00200D, 0x01F4BC, 0, - 0x01F469, 0x00200D, 0x01F527, 0, - 0x01F469, 0x00200D, 0x01F52C, 0, - 0x01F469, 0x00200D, 0x01F680, 0, - 0x01F469, 0x00200D, 0x01F692, 0, - 0x01F469, 0x00200D, 0x01F9AF, 0, - 0x01F469, 0x00200D, 0x01F9BC, 0, - 0x01F469, 0x00200D, 0x01F9BD, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F33E, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F373, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F393, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F3A4, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F3A8, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F3EB, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F3ED, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F4BB, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F4BC, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F527, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F52C, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F680, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F692, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9AF, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9BC, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9BD, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F33E, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F373, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F393, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F3A4, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F3A8, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F3EB, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F3ED, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F4BB, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F4BC, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F527, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F52C, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F680, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F692, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9AF, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9BC, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9BD, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F33E, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F373, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F393, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F3A4, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F3A8, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F3EB, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F3ED, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F4BB, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F4BC, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F527, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F52C, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F680, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F692, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9AF, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9BC, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9BD, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F33E, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F373, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F393, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F3A4, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F3A8, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F3EB, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F3ED, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F4BB, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F4BC, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F527, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F52C, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F680, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F692, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9AF, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9BC, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9BD, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x002695, 0x00FE0F, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x002696, 0x00FE0F, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x002708, 0x00FE0F, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F33E, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F373, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F393, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F3A4, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F3A8, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F3EB, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F3ED, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F4BB, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F4BC, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F527, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F52C, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F680, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F692, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9AF, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9BC, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9BD, 0, - 0x0026F9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x0026F9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x0026F9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x0026F9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x0026F9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x0026F9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x0026F9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x0026F9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x0026F9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x0026F9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x0026F9, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x0026F9, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C3, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C3, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C3, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C4, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C4, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3C4, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CA, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CA, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CA, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CB, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CB, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CB, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CC, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F3CC, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F3CC, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46E, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46E, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F46F, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F46F, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F471, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F471, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F471, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F471, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F471, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F471, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F471, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F471, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F471, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F471, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F471, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F471, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F473, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F473, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F473, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F473, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F473, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F473, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F473, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F473, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F473, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F473, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F473, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F473, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F477, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F477, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F477, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F477, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F477, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F477, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F477, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F477, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F477, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F477, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F477, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F477, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F481, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F481, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F481, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F481, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F481, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F481, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F481, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F481, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F481, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F481, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F481, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F481, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F482, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F482, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D, - 0x01F468, 0, - 0x01F482, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F482, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F482, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F482, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F482, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F482, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F482, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F482, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F482, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F486, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F486, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F486, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F486, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F486, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F486, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F486, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F486, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F486, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F486, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F486, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F486, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F487, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F487, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F487, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F487, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F487, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F487, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F487, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F487, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F487, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F487, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F487, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F487, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F575, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F575, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F575, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F575, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F575, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F575, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F575, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F575, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F575, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F575, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F575, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F575, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F645, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F645, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F645, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F645, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F645, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F645, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F645, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F645, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F645, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F645, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F645, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F645, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F646, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F646, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F646, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F646, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F646, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F646, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F646, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F646, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F646, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F646, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F646, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F646, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F647, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F647, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F647, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F647, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F647, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F647, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F647, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F647, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F647, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F647, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F647, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F647, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64B, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64B, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64B, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64B, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64B, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64B, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64B, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64B, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64B, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64B, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64B, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64B, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64D, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64D, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64D, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64D, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64D, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64D, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64D, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64D, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64D, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64D, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64D, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64D, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64E, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64E, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F64E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F64E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6A3, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6A3, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6A3, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B4, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B4, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B4, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B5, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B5, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B5, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B6, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B6, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F6B6, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F926, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F926, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F926, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F926, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F926, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F926, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F926, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F926, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F926, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F926, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F926, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F926, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F937, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F937, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F937, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F937, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F937, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F937, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F937, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F937, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F937, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F937, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F937, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F937, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F938, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F938, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F938, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F938, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F938, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F938, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F938, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F938, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F938, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F938, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F938, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F938, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F939, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F939, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F939, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F939, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F939, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F939, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F939, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F939, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F939, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F939, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F939, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F939, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93C, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93C, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93D, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93D, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93D, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93D, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93D, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93D, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93D, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93D, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93D, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93D, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93D, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93D, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93E, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93E, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F93E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F93E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B8, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B8, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B8, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B9, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B9, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9B9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CD, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CE, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9CF, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D6, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D6, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D6, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D7, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D7, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D7, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D8, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D8, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D8, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D9, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D9, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9D9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DA, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DA, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DA, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DB, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DC, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DD, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DE, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DE, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F9DF, 0x00200D, 0x002640, 0x00FE0F, 0, - 0x01F9DF, 0x00200D, 0x002642, 0x00FE0F, 0, - 0x01F468, 0x00200D, 0x01F9B0, 0, - 0x01F468, 0x00200D, 0x01F9B1, 0, - 0x01F468, 0x00200D, 0x01F9B2, 0, - 0x01F468, 0x00200D, 0x01F9B3, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B0, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B1, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B2, 0, - 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B3, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B0, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B1, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B2, 0, - 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B3, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B0, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B1, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B2, 0, - 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B3, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B0, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B1, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B2, 0, - 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B3, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B0, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B1, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B2, 0, - 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B3, 0, - 0x01F469, 0x00200D, 0x01F9B0, 0, - 0x01F469, 0x00200D, 0x01F9B1, 0, - 0x01F469, 0x00200D, 0x01F9B2, 0, - 0x01F469, 0x00200D, 0x01F9B3, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B0, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B1, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B2, 0, - 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B3, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B0, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B1, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B2, 0, - 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B3, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B0, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B1, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B2, 0, - 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B3, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B0, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B1, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B2, 0, - 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B3, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B0, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B1, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B2, 0, - 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B3, 0, - 0x01F3F3, 0x00FE0F, 0x00200D, 0x01F308, 0, - 0x01F3F4, 0x00200D, 0x002620, 0x00FE0F, 0, - 0x01F415, 0x00200D, 0x01F9BA, 0, - 0x01F482, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, - 0 // null-terminating the list -}; -// clang-format on - -} // namespace internal -} // namespace v8 - -#endif // V8_INTL_SUPPORT diff --git a/js/src/new-regexp/property-sequences.h b/js/src/new-regexp/property-sequences.h deleted file mode 100644 index f079da7ac..000000000 --- a/js/src/new-regexp/property-sequences.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2018 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_PROPERTY_SEQUENCES_H_ -#define V8_REGEXP_PROPERTY_SEQUENCES_H_ - -#ifdef V8_INTL_SUPPORT - -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -class UnicodePropertySequences : public AllStatic { - public: - static const uc32 kEmojiFlagSequences[]; - static const uc32 kEmojiTagSequences[]; - static const uc32 kEmojiZWJSequences[]; -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_INTL_SUPPORT - -#endif // V8_REGEXP_PROPERTY_SEQUENCES_H_ diff --git a/js/src/new-regexp/regexp-ast.cc b/js/src/new-regexp/regexp-ast.cc deleted file mode 100644 index 8de26720f..000000000 --- a/js/src/new-regexp/regexp-ast.cc +++ /dev/null @@ -1,342 +0,0 @@ -// Copyright 2016 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-ast.h" - -namespace v8 { -namespace internal { - -#define MAKE_ACCEPT(Name) \ - void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \ - return visitor->Visit##Name(this, data); \ - } -FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT) -#undef MAKE_ACCEPT - -#define MAKE_TYPE_CASE(Name) \ - RegExp##Name* RegExpTree::As##Name() { return nullptr; } \ - bool RegExpTree::Is##Name() { return false; } -FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) -#undef MAKE_TYPE_CASE - -#define MAKE_TYPE_CASE(Name) \ - RegExp##Name* RegExp##Name::As##Name() { return this; } \ - bool RegExp##Name::Is##Name() { return true; } -FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) -#undef MAKE_TYPE_CASE - - -static Interval ListCaptureRegisters(ZoneList* children) { - Interval result = Interval::Empty(); - for (int i = 0; i < children->length(); i++) - result = result.Union(children->at(i)->CaptureRegisters()); - return result; -} - - -Interval RegExpAlternative::CaptureRegisters() { - return ListCaptureRegisters(nodes()); -} - - -Interval RegExpDisjunction::CaptureRegisters() { - return ListCaptureRegisters(alternatives()); -} - - -Interval RegExpLookaround::CaptureRegisters() { - return body()->CaptureRegisters(); -} - - -Interval RegExpCapture::CaptureRegisters() { - Interval self(StartRegister(index()), EndRegister(index())); - return self.Union(body()->CaptureRegisters()); -} - - -Interval RegExpQuantifier::CaptureRegisters() { - return body()->CaptureRegisters(); -} - - -bool RegExpAssertion::IsAnchoredAtStart() { - return assertion_type() == RegExpAssertion::START_OF_INPUT; -} - - -bool RegExpAssertion::IsAnchoredAtEnd() { - return assertion_type() == RegExpAssertion::END_OF_INPUT; -} - - -bool RegExpAlternative::IsAnchoredAtStart() { - ZoneList* nodes = this->nodes(); - for (int i = 0; i < nodes->length(); i++) { - RegExpTree* node = nodes->at(i); - if (node->IsAnchoredAtStart()) { - return true; - } - if (node->max_match() > 0) { - return false; - } - } - return false; -} - - -bool RegExpAlternative::IsAnchoredAtEnd() { - ZoneList* nodes = this->nodes(); - for (int i = nodes->length() - 1; i >= 0; i--) { - RegExpTree* node = nodes->at(i); - if (node->IsAnchoredAtEnd()) { - return true; - } - if (node->max_match() > 0) { - return false; - } - } - return false; -} - - -bool RegExpDisjunction::IsAnchoredAtStart() { - ZoneList* alternatives = this->alternatives(); - for (int i = 0; i < alternatives->length(); i++) { - if (!alternatives->at(i)->IsAnchoredAtStart()) return false; - } - return true; -} - - -bool RegExpDisjunction::IsAnchoredAtEnd() { - ZoneList* alternatives = this->alternatives(); - for (int i = 0; i < alternatives->length(); i++) { - if (!alternatives->at(i)->IsAnchoredAtEnd()) return false; - } - return true; -} - - -bool RegExpLookaround::IsAnchoredAtStart() { - return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart(); -} - - -bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); } - - -bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); } - - -// Convert regular expression trees to a simple sexp representation. -// This representation should be different from the input grammar -// in as many cases as possible, to make it more difficult for incorrect -// parses to look as correct ones which is likely if the input and -// output formats are alike. -class RegExpUnparser final : public RegExpVisitor { - public: - RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {} - void VisitCharacterRange(CharacterRange that); -#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override; - FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) -#undef MAKE_CASE - private: - std::ostream& os_; - Zone* zone_; -}; - - -void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) { - os_ << "(|"; - for (int i = 0; i < that->alternatives()->length(); i++) { - os_ << " "; - that->alternatives()->at(i)->Accept(this, data); - } - os_ << ")"; - return nullptr; -} - - -void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) { - os_ << "(:"; - for (int i = 0; i < that->nodes()->length(); i++) { - os_ << " "; - that->nodes()->at(i)->Accept(this, data); - } - os_ << ")"; - return nullptr; -} - - -void RegExpUnparser::VisitCharacterRange(CharacterRange that) { - os_ << AsUC32(that.from()); - if (!that.IsSingleton()) { - os_ << "-" << AsUC32(that.to()); - } -} - - -void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that, - void* data) { - if (that->is_negated()) os_ << "^"; - os_ << "["; - for (int i = 0; i < that->ranges(zone_)->length(); i++) { - if (i > 0) os_ << " "; - VisitCharacterRange(that->ranges(zone_)->at(i)); - } - os_ << "]"; - return nullptr; -} - - -void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) { - switch (that->assertion_type()) { - case RegExpAssertion::START_OF_INPUT: - os_ << "@^i"; - break; - case RegExpAssertion::END_OF_INPUT: - os_ << "@$i"; - break; - case RegExpAssertion::START_OF_LINE: - os_ << "@^l"; - break; - case RegExpAssertion::END_OF_LINE: - os_ << "@$l"; - break; - case RegExpAssertion::BOUNDARY: - os_ << "@b"; - break; - case RegExpAssertion::NON_BOUNDARY: - os_ << "@B"; - break; - } - return nullptr; -} - - -void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) { - os_ << "'"; - Vector chardata = that->data(); - for (int i = 0; i < chardata.length(); i++) { - os_ << AsUC16(chardata[i]); - } - os_ << "'"; - return nullptr; -} - - -void* RegExpUnparser::VisitText(RegExpText* that, void* data) { - if (that->elements()->length() == 1) { - that->elements()->at(0).tree()->Accept(this, data); - } else { - os_ << "(!"; - for (int i = 0; i < that->elements()->length(); i++) { - os_ << " "; - that->elements()->at(i).tree()->Accept(this, data); - } - os_ << ")"; - } - return nullptr; -} - - -void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) { - os_ << "(# " << that->min() << " "; - if (that->max() == RegExpTree::kInfinity) { - os_ << "- "; - } else { - os_ << that->max() << " "; - } - os_ << (that->is_greedy() ? "g " : that->is_possessive() ? "p " : "n "); - that->body()->Accept(this, data); - os_ << ")"; - return nullptr; -} - - -void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) { - os_ << "(^ "; - that->body()->Accept(this, data); - os_ << ")"; - return nullptr; -} - -void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) { - os_ << "(?: "; - that->body()->Accept(this, data); - os_ << ")"; - return nullptr; -} - -void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) { - os_ << "("; - os_ << (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-"); - os_ << (that->is_positive() ? " + " : " - "); - that->body()->Accept(this, data); - os_ << ")"; - return nullptr; -} - - -void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, - void* data) { - os_ << "(<- " << that->index() << ")"; - return nullptr; -} - - -void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) { - os_ << '%'; - return nullptr; -} - - -std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { // NOLINT - RegExpUnparser unparser(os, zone); - Accept(&unparser, nullptr); - return os; -} - - -RegExpDisjunction::RegExpDisjunction(ZoneList* alternatives) - : alternatives_(alternatives) { - DCHECK_LT(1, alternatives->length()); - RegExpTree* first_alternative = alternatives->at(0); - min_match_ = first_alternative->min_match(); - max_match_ = first_alternative->max_match(); - for (int i = 1; i < alternatives->length(); i++) { - RegExpTree* alternative = alternatives->at(i); - min_match_ = Min(min_match_, alternative->min_match()); - max_match_ = Max(max_match_, alternative->max_match()); - } -} - - -static int IncreaseBy(int previous, int increase) { - if (RegExpTree::kInfinity - previous < increase) { - return RegExpTree::kInfinity; - } else { - return previous + increase; - } -} - - -RegExpAlternative::RegExpAlternative(ZoneList* nodes) - : nodes_(nodes) { - DCHECK_LT(1, nodes->length()); - min_match_ = 0; - max_match_ = 0; - for (int i = 0; i < nodes->length(); i++) { - RegExpTree* node = nodes->at(i); - int node_min_match = node->min_match(); - min_match_ = IncreaseBy(min_match_, node_min_match); - int node_max_match = node->max_match(); - max_match_ = IncreaseBy(max_match_, node_max_match); - } -} - - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-ast.h b/js/src/new-regexp/regexp-ast.h deleted file mode 100644 index 32bbcf0bf..000000000 --- a/js/src/new-regexp/regexp-ast.h +++ /dev/null @@ -1,615 +0,0 @@ -// Copyright 2016 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_AST_H_ -#define V8_REGEXP_REGEXP_AST_H_ - -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ - VISIT(Disjunction) \ - VISIT(Alternative) \ - VISIT(Assertion) \ - VISIT(CharacterClass) \ - VISIT(Atom) \ - VISIT(Quantifier) \ - VISIT(Capture) \ - VISIT(Group) \ - VISIT(Lookaround) \ - VISIT(BackReference) \ - VISIT(Empty) \ - VISIT(Text) - -#define FORWARD_DECLARE(Name) class RegExp##Name; -FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) -#undef FORWARD_DECLARE - -class RegExpCompiler; -class RegExpNode; -class RegExpTree; - -class RegExpVisitor { - public: - virtual ~RegExpVisitor() = default; -#define MAKE_CASE(Name) \ - virtual void* Visit##Name(RegExp##Name*, void* data) = 0; - FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) -#undef MAKE_CASE -}; - - -// A simple closed interval. -class Interval { - public: - Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). - Interval(int from, int to) : from_(from), to_(to) {} - Interval Union(Interval that) { - if (that.from_ == kNone) - return *this; - else if (from_ == kNone) - return that; - else - return Interval(Min(from_, that.from_), Max(to_, that.to_)); - } - - bool Contains(int value) { return (from_ <= value) && (value <= to_); } - bool is_empty() { return from_ == kNone; } - int from() const { return from_; } - int to() const { return to_; } - int size() const { return to_ - from_ + 1; } - - static Interval Empty() { return Interval(); } - - static constexpr int kNone = -1; - - private: - int from_; - int to_; -}; - - -// Represents code units in the range from from_ to to_, both ends are -// inclusive. -class CharacterRange { - public: - CharacterRange() : from_(0), to_(0) {} - // For compatibility with the CHECK_OK macro - CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT - V8_EXPORT_PRIVATE static void AddClassEscape(char type, - ZoneList* ranges, - Zone* zone); - // Add class escapes. Add case equivalent closure for \w and \W if necessary. - V8_EXPORT_PRIVATE static void AddClassEscape( - char type, ZoneList* ranges, - bool add_unicode_case_equivalents, Zone* zone); - static Vector GetWordBounds(); - static inline CharacterRange Singleton(uc32 value) { - return CharacterRange(value, value); - } - static inline CharacterRange Range(uc32 from, uc32 to) { - DCHECK(0 <= from && to <= String::kMaxCodePoint); - DCHECK(static_cast(from) <= static_cast(to)); - return CharacterRange(from, to); - } - static inline CharacterRange Everything() { - return CharacterRange(0, String::kMaxCodePoint); - } - static inline ZoneList* List(Zone* zone, - CharacterRange range) { - ZoneList* list = - new (zone) ZoneList(1, zone); - list->Add(range, zone); - return list; - } - bool Contains(uc32 i) { return from_ <= i && i <= to_; } - uc32 from() const { return from_; } - void set_from(uc32 value) { from_ = value; } - uc32 to() const { return to_; } - void set_to(uc32 value) { to_ = value; } - bool is_valid() { return from_ <= to_; } - bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; } - bool IsSingleton() { return (from_ == to_); } - V8_EXPORT_PRIVATE static void AddCaseEquivalents( - Isolate* isolate, Zone* zone, ZoneList* ranges, - bool is_one_byte); - // Whether a range list is in canonical form: Ranges ordered by from value, - // and ranges non-overlapping and non-adjacent. - V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList* ranges); - // Convert range list to canonical form. The characters covered by the ranges - // will still be the same, but no character is in more than one range, and - // adjacent ranges are merged. The resulting list may be shorter than the - // original, but cannot be longer. - static void Canonicalize(ZoneList* ranges); - // Negate the contents of a character range in canonical form. - static void Negate(ZoneList* src, - ZoneList* dst, Zone* zone); - static const int kStartMarker = (1 << 24); - static const int kPayloadMask = (1 << 24) - 1; - - private: - CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {} - - uc32 from_; - uc32 to_; -}; - -class CharacterSet final { - public: - explicit CharacterSet(uc16 standard_set_type) - : ranges_(nullptr), standard_set_type_(standard_set_type) {} - explicit CharacterSet(ZoneList* ranges) - : ranges_(ranges), standard_set_type_(0) {} - ZoneList* ranges(Zone* zone); - uc16 standard_set_type() const { return standard_set_type_; } - void set_standard_set_type(uc16 special_set_type) { - standard_set_type_ = special_set_type; - } - bool is_standard() { return standard_set_type_ != 0; } - V8_EXPORT_PRIVATE void Canonicalize(); - - private: - ZoneList* ranges_; - // If non-zero, the value represents a standard set (e.g., all whitespace - // characters) without having to expand the ranges. - uc16 standard_set_type_; -}; - -class TextElement final { - public: - enum TextType { ATOM, CHAR_CLASS }; - - static TextElement Atom(RegExpAtom* atom); - static TextElement CharClass(RegExpCharacterClass* char_class); - - int cp_offset() const { return cp_offset_; } - void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } - int length() const; - - TextType text_type() const { return text_type_; } - - RegExpTree* tree() const { return tree_; } - - RegExpAtom* atom() const { - DCHECK(text_type() == ATOM); - return reinterpret_cast(tree()); - } - - RegExpCharacterClass* char_class() const { - DCHECK(text_type() == CHAR_CLASS); - return reinterpret_cast(tree()); - } - - private: - TextElement(TextType text_type, RegExpTree* tree) - : cp_offset_(-1), text_type_(text_type), tree_(tree) {} - - int cp_offset_; - TextType text_type_; - RegExpTree* tree_; -}; - - -class RegExpTree : public ZoneObject { - public: - static const int kInfinity = kMaxInt; - virtual ~RegExpTree() = default; - virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; - virtual RegExpNode* ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) = 0; - virtual bool IsTextElement() { return false; } - virtual bool IsAnchoredAtStart() { return false; } - virtual bool IsAnchoredAtEnd() { return false; } - virtual int min_match() = 0; - virtual int max_match() = 0; - // Returns the interval of registers used for captures within this - // expression. - virtual Interval CaptureRegisters() { return Interval::Empty(); } - virtual void AppendToText(RegExpText* text, Zone* zone); - V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, - Zone* zone); // NOLINT -#define MAKE_ASTYPE(Name) \ - virtual RegExp##Name* As##Name(); \ - virtual bool Is##Name(); - FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) -#undef MAKE_ASTYPE -}; - - -class RegExpDisjunction final : public RegExpTree { - public: - explicit RegExpDisjunction(ZoneList* alternatives); - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpDisjunction* AsDisjunction() override; - Interval CaptureRegisters() override; - bool IsDisjunction() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - ZoneList* alternatives() { return alternatives_; } - - private: - bool SortConsecutiveAtoms(RegExpCompiler* compiler); - void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); - void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); - ZoneList* alternatives_; - int min_match_; - int max_match_; -}; - - -class RegExpAlternative final : public RegExpTree { - public: - explicit RegExpAlternative(ZoneList* nodes); - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpAlternative* AsAlternative() override; - Interval CaptureRegisters() override; - bool IsAlternative() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - ZoneList* nodes() { return nodes_; } - - private: - ZoneList* nodes_; - int min_match_; - int max_match_; -}; - - -class RegExpAssertion final : public RegExpTree { - public: - enum AssertionType { - START_OF_LINE = 0, - START_OF_INPUT = 1, - END_OF_LINE = 2, - END_OF_INPUT = 3, - BOUNDARY = 4, - NON_BOUNDARY = 5, - LAST_TYPE = NON_BOUNDARY, - }; - RegExpAssertion(AssertionType type, JSRegExp::Flags flags) - : assertion_type_(type), flags_(flags) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpAssertion* AsAssertion() override; - bool IsAssertion() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - int min_match() override { return 0; } - int max_match() override { return 0; } - AssertionType assertion_type() const { return assertion_type_; } - JSRegExp::Flags flags() const { return flags_; } - - private: - const AssertionType assertion_type_; - const JSRegExp::Flags flags_; -}; - - -class RegExpCharacterClass final : public RegExpTree { - public: - // NEGATED: The character class is negated and should match everything but - // the specified ranges. - // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split - // surrogate and should not be unicode-desugared (crbug.com/641091). - enum Flag { - NEGATED = 1 << 0, - CONTAINS_SPLIT_SURROGATE = 1 << 1, - }; - using CharacterClassFlags = base::Flags; - - RegExpCharacterClass( - Zone* zone, ZoneList* ranges, JSRegExp::Flags flags, - CharacterClassFlags character_class_flags = CharacterClassFlags()) - : set_(ranges), - flags_(flags), - character_class_flags_(character_class_flags) { - // Convert the empty set of ranges to the negated Everything() range. - if (ranges->is_empty()) { - ranges->Add(CharacterRange::Everything(), zone); - character_class_flags_ ^= NEGATED; - } - } - RegExpCharacterClass(uc16 type, JSRegExp::Flags flags) - : set_(type), - flags_(flags), - character_class_flags_(CharacterClassFlags()) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpCharacterClass* AsCharacterClass() override; - bool IsCharacterClass() override; - bool IsTextElement() override { return true; } - int min_match() override { return 1; } - // The character class may match two code units for unicode regexps. - // TODO(yangguo): we should split this class for usage in TextElement, and - // make max_match() dependent on the character class content. - int max_match() override { return 2; } - void AppendToText(RegExpText* text, Zone* zone) override; - CharacterSet character_set() { return set_; } - // TODO(lrn): Remove need for complex version if is_standard that - // recognizes a mangled standard set and just do { return set_.is_special(); } - bool is_standard(Zone* zone); - // Returns a value representing the standard character set if is_standard() - // returns true. - // Currently used values are: - // s : unicode whitespace - // S : unicode non-whitespace - // w : ASCII word character (digit, letter, underscore) - // W : non-ASCII word character - // d : ASCII digit - // D : non-ASCII digit - // . : non-newline - // * : All characters, for advancing unanchored regexp - uc16 standard_type() const { return set_.standard_set_type(); } - ZoneList* ranges(Zone* zone) { return set_.ranges(zone); } - bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } - JSRegExp::Flags flags() const { return flags_; } - bool contains_split_surrogate() const { - return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; - } - - private: - CharacterSet set_; - const JSRegExp::Flags flags_; - CharacterClassFlags character_class_flags_; -}; - - -class RegExpAtom final : public RegExpTree { - public: - explicit RegExpAtom(Vector data, JSRegExp::Flags flags) - : data_(data), flags_(flags) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpAtom* AsAtom() override; - bool IsAtom() override; - bool IsTextElement() override { return true; } - int min_match() override { return data_.length(); } - int max_match() override { return data_.length(); } - void AppendToText(RegExpText* text, Zone* zone) override; - Vector data() { return data_; } - int length() { return data_.length(); } - JSRegExp::Flags flags() const { return flags_; } - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - - private: - Vector data_; - const JSRegExp::Flags flags_; -}; - - -class RegExpText final : public RegExpTree { - public: - explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpText* AsText() override; - bool IsText() override; - bool IsTextElement() override { return true; } - int min_match() override { return length_; } - int max_match() override { return length_; } - void AppendToText(RegExpText* text, Zone* zone) override; - void AddElement(TextElement elm, Zone* zone) { - elements_.Add(elm, zone); - length_ += elm.length(); - } - ZoneList* elements() { return &elements_; } - - private: - ZoneList elements_; - int length_; -}; - - -class RegExpQuantifier final : public RegExpTree { - public: - enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; - RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) - : body_(body), - min_(min), - max_(max), - quantifier_type_(type) { - if (min > 0 && body->min_match() > kInfinity / min) { - min_match_ = kInfinity; - } else { - min_match_ = min * body->min_match(); - } - if (max > 0 && body->max_match() > kInfinity / max) { - max_match_ = kInfinity; - } else { - max_match_ = max * body->max_match(); - } - } - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, - RegExpCompiler* compiler, RegExpNode* on_success, - bool not_at_start = false); - RegExpQuantifier* AsQuantifier() override; - Interval CaptureRegisters() override; - bool IsQuantifier() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - int min() { return min_; } - int max() { return max_; } - bool is_possessive() { return quantifier_type_ == POSSESSIVE; } - bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } - bool is_greedy() { return quantifier_type_ == GREEDY; } - RegExpTree* body() { return body_; } - - private: - RegExpTree* body_; - int min_; - int max_; - int min_match_; - int max_match_; - QuantifierType quantifier_type_; -}; - - -class RegExpCapture final : public RegExpTree { - public: - explicit RegExpCapture(int index) - : body_(nullptr), - index_(index), - min_match_(0), - max_match_(0), - name_(nullptr) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - static RegExpNode* ToNode(RegExpTree* body, int index, - RegExpCompiler* compiler, RegExpNode* on_success); - RegExpCapture* AsCapture() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - Interval CaptureRegisters() override; - bool IsCapture() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - RegExpTree* body() { return body_; } - void set_body(RegExpTree* body) { - body_ = body; - min_match_ = body->min_match(); - max_match_ = body->max_match(); - } - int index() const { return index_; } - const ZoneVector* name() const { return name_; } - void set_name(const ZoneVector* name) { name_ = name; } - static int StartRegister(int index) { return index * 2; } - static int EndRegister(int index) { return index * 2 + 1; } - - private: - RegExpTree* body_; - int index_; - int min_match_; - int max_match_; - const ZoneVector* name_; -}; - -class RegExpGroup final : public RegExpTree { - public: - explicit RegExpGroup(RegExpTree* body) - : body_(body), - min_match_(body->min_match()), - max_match_(body->max_match()) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) override { - return body_->ToNode(compiler, on_success); - } - RegExpGroup* AsGroup() override; - bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } - bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } - bool IsGroup() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - Interval CaptureRegisters() override { return body_->CaptureRegisters(); } - RegExpTree* body() { return body_; } - - private: - RegExpTree* body_; - int min_match_; - int max_match_; -}; - -class RegExpLookaround final : public RegExpTree { - public: - enum Type { LOOKAHEAD, LOOKBEHIND }; - - RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, - int capture_from, Type type) - : body_(body), - is_positive_(is_positive), - capture_count_(capture_count), - capture_from_(capture_from), - type_(type) {} - - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpLookaround* AsLookaround() override; - Interval CaptureRegisters() override; - bool IsLookaround() override; - bool IsAnchoredAtStart() override; - int min_match() override { return 0; } - int max_match() override { return 0; } - RegExpTree* body() { return body_; } - bool is_positive() { return is_positive_; } - int capture_count() { return capture_count_; } - int capture_from() { return capture_from_; } - Type type() { return type_; } - - class Builder { - public: - Builder(bool is_positive, RegExpNode* on_success, - int stack_pointer_register, int position_register, - int capture_register_count = 0, int capture_register_start = 0); - RegExpNode* on_match_success() { return on_match_success_; } - RegExpNode* ForMatch(RegExpNode* match); - - private: - bool is_positive_; - RegExpNode* on_match_success_; - RegExpNode* on_success_; - int stack_pointer_register_; - int position_register_; - }; - - private: - RegExpTree* body_; - bool is_positive_; - int capture_count_; - int capture_from_; - Type type_; -}; - - -class RegExpBackReference final : public RegExpTree { - public: - explicit RegExpBackReference(JSRegExp::Flags flags) - : capture_(nullptr), name_(nullptr), flags_(flags) {} - RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags) - : capture_(capture), name_(nullptr), flags_(flags) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpBackReference* AsBackReference() override; - bool IsBackReference() override; - int min_match() override { return 0; } - // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite - // recursion, we give up. Ignorance is bliss. - int max_match() override { return kInfinity; } - int index() { return capture_->index(); } - RegExpCapture* capture() { return capture_; } - void set_capture(RegExpCapture* capture) { capture_ = capture; } - const ZoneVector* name() const { return name_; } - void set_name(const ZoneVector* name) { name_ = name; } - - private: - RegExpCapture* capture_; - const ZoneVector* name_; - const JSRegExp::Flags flags_; -}; - - -class RegExpEmpty final : public RegExpTree { - public: - RegExpEmpty() = default; - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpEmpty* AsEmpty() override; - bool IsEmpty() override; - int min_match() override { return 0; } - int max_match() override { return 0; } -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_AST_H_ diff --git a/js/src/new-regexp/regexp-bytecode-generator-inl.h b/js/src/new-regexp/regexp-bytecode-generator-inl.h deleted file mode 100644 index a2d1ac1cb..000000000 --- a/js/src/new-regexp/regexp-bytecode-generator-inl.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2008-2009 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_ -#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_ - -#include "new-regexp/regexp-bytecode-generator.h" - -#include "new-regexp/regexp-bytecodes.h" - -namespace v8 { -namespace internal { - -void RegExpBytecodeGenerator::Emit(uint32_t byte, uint32_t twenty_four_bits) { - uint32_t word = ((twenty_four_bits << BYTECODE_SHIFT) | byte); - DCHECK(pc_ <= buffer_.length()); - if (pc_ + 3 >= buffer_.length()) { - Expand(); - } - *reinterpret_cast(buffer_.begin() + pc_) = word; - pc_ += 4; -} - -void RegExpBytecodeGenerator::Emit16(uint32_t word) { - DCHECK(pc_ <= buffer_.length()); - if (pc_ + 1 >= buffer_.length()) { - Expand(); - } - *reinterpret_cast(buffer_.begin() + pc_) = word; - pc_ += 2; -} - -void RegExpBytecodeGenerator::Emit8(uint32_t word) { - DCHECK(pc_ <= buffer_.length()); - if (pc_ == buffer_.length()) { - Expand(); - } - *reinterpret_cast(buffer_.begin() + pc_) = word; - pc_ += 1; -} - -void RegExpBytecodeGenerator::Emit32(uint32_t word) { - DCHECK(pc_ <= buffer_.length()); - if (pc_ + 3 >= buffer_.length()) { - Expand(); - } - *reinterpret_cast(buffer_.begin() + pc_) = word; - pc_ += 4; -} - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_ diff --git a/js/src/new-regexp/regexp-bytecode-generator.cc b/js/src/new-regexp/regexp-bytecode-generator.cc deleted file mode 100644 index 2670322d3..000000000 --- a/js/src/new-regexp/regexp-bytecode-generator.cc +++ /dev/null @@ -1,395 +0,0 @@ -// Copyright 2008-2009 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-bytecode-generator.h" - -#include "new-regexp/regexp-bytecode-generator-inl.h" -#include "new-regexp/regexp-bytecode-peephole.h" -#include "new-regexp/regexp-bytecodes.h" -#include "new-regexp/regexp-macro-assembler.h" - -namespace v8 { -namespace internal { - -RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone) - : RegExpMacroAssembler(isolate, zone), - buffer_(Vector::New(1024)), - pc_(0), - advance_current_end_(kInvalidPC), - jump_edges_(zone), - isolate_(isolate) {} - -RegExpBytecodeGenerator::~RegExpBytecodeGenerator() { - if (backtrack_.is_linked()) backtrack_.Unuse(); - buffer_.Dispose(); -} - -RegExpBytecodeGenerator::IrregexpImplementation -RegExpBytecodeGenerator::Implementation() { - return kBytecodeImplementation; -} - -void RegExpBytecodeGenerator::Bind(Label* l) { - advance_current_end_ = kInvalidPC; - DCHECK(!l->is_bound()); - if (l->is_linked()) { - int pos = l->pos(); - while (pos != 0) { - int fixup = pos; - pos = *reinterpret_cast(buffer_.begin() + fixup); - *reinterpret_cast(buffer_.begin() + fixup) = pc_; - jump_edges_.emplace(fixup, pc_); - } - } - l->bind_to(pc_); -} - -void RegExpBytecodeGenerator::EmitOrLink(Label* l) { - if (l == nullptr) l = &backtrack_; - int pos = 0; - if (l->is_bound()) { - pos = l->pos(); - jump_edges_.emplace(pc_, pos); - } else { - if (l->is_linked()) { - pos = l->pos(); - } - l->link_to(pc_); - } - Emit32(pos); -} - -void RegExpBytecodeGenerator::PopRegister(int register_index) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_POP_REGISTER, register_index); -} - -void RegExpBytecodeGenerator::PushRegister(int register_index, - StackCheckFlag check_stack_limit) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_PUSH_REGISTER, register_index); -} - -void RegExpBytecodeGenerator::WriteCurrentPositionToRegister(int register_index, - int cp_offset) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_SET_REGISTER_TO_CP, register_index); - Emit32(cp_offset); // Current position offset. -} - -void RegExpBytecodeGenerator::ClearRegisters(int reg_from, int reg_to) { - DCHECK(reg_from <= reg_to); - for (int reg = reg_from; reg <= reg_to; reg++) { - SetRegister(reg, -1); - } -} - -void RegExpBytecodeGenerator::ReadCurrentPositionFromRegister( - int register_index) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_SET_CP_TO_REGISTER, register_index); -} - -void RegExpBytecodeGenerator::WriteStackPointerToRegister(int register_index) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_SET_REGISTER_TO_SP, register_index); -} - -void RegExpBytecodeGenerator::ReadStackPointerFromRegister(int register_index) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_SET_SP_TO_REGISTER, register_index); -} - -void RegExpBytecodeGenerator::SetCurrentPositionFromEnd(int by) { - DCHECK(is_uint24(by)); - Emit(BC_SET_CURRENT_POSITION_FROM_END, by); -} - -void RegExpBytecodeGenerator::SetRegister(int register_index, int to) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_SET_REGISTER, register_index); - Emit32(to); -} - -void RegExpBytecodeGenerator::AdvanceRegister(int register_index, int by) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_ADVANCE_REGISTER, register_index); - Emit32(by); -} - -void RegExpBytecodeGenerator::PopCurrentPosition() { Emit(BC_POP_CP, 0); } - -void RegExpBytecodeGenerator::PushCurrentPosition() { Emit(BC_PUSH_CP, 0); } - -void RegExpBytecodeGenerator::Backtrack() { Emit(BC_POP_BT, 0); } - -void RegExpBytecodeGenerator::GoTo(Label* l) { - if (advance_current_end_ == pc_) { - // Combine advance current and goto. - pc_ = advance_current_start_; - Emit(BC_ADVANCE_CP_AND_GOTO, advance_current_offset_); - EmitOrLink(l); - advance_current_end_ = kInvalidPC; - } else { - // Regular goto. - Emit(BC_GOTO, 0); - EmitOrLink(l); - } -} - -void RegExpBytecodeGenerator::PushBacktrack(Label* l) { - Emit(BC_PUSH_BT, 0); - EmitOrLink(l); -} - -bool RegExpBytecodeGenerator::Succeed() { - Emit(BC_SUCCEED, 0); - return false; // Restart matching for global regexp not supported. -} - -void RegExpBytecodeGenerator::Fail() { Emit(BC_FAIL, 0); } - -void RegExpBytecodeGenerator::AdvanceCurrentPosition(int by) { - DCHECK_LE(kMinCPOffset, by); - DCHECK_GE(kMaxCPOffset, by); - advance_current_start_ = pc_; - advance_current_offset_ = by; - Emit(BC_ADVANCE_CP, by); - advance_current_end_ = pc_; -} - -void RegExpBytecodeGenerator::CheckGreedyLoop( - Label* on_tos_equals_current_position) { - Emit(BC_CHECK_GREEDY, 0); - EmitOrLink(on_tos_equals_current_position); -} - -void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset, - Label* on_failure, - bool check_bounds, - int characters, - int eats_at_least) { - DCHECK_GE(eats_at_least, characters); - if (eats_at_least > characters && check_bounds) { - DCHECK(is_uint24(cp_offset + eats_at_least)); - Emit(BC_CHECK_CURRENT_POSITION, cp_offset + eats_at_least); - EmitOrLink(on_failure); - check_bounds = false; // Load below doesn't need to check. - } - - DCHECK_LE(kMinCPOffset, cp_offset); - DCHECK_GE(kMaxCPOffset, cp_offset); - int bytecode; - if (check_bounds) { - if (characters == 4) { - bytecode = BC_LOAD_4_CURRENT_CHARS; - } else if (characters == 2) { - bytecode = BC_LOAD_2_CURRENT_CHARS; - } else { - DCHECK_EQ(1, characters); - bytecode = BC_LOAD_CURRENT_CHAR; - } - } else { - if (characters == 4) { - bytecode = BC_LOAD_4_CURRENT_CHARS_UNCHECKED; - } else if (characters == 2) { - bytecode = BC_LOAD_2_CURRENT_CHARS_UNCHECKED; - } else { - DCHECK_EQ(1, characters); - bytecode = BC_LOAD_CURRENT_CHAR_UNCHECKED; - } - } - Emit(bytecode, cp_offset); - if (check_bounds) EmitOrLink(on_failure); -} - -void RegExpBytecodeGenerator::CheckCharacterLT(uc16 limit, Label* on_less) { - Emit(BC_CHECK_LT, limit); - EmitOrLink(on_less); -} - -void RegExpBytecodeGenerator::CheckCharacterGT(uc16 limit, Label* on_greater) { - Emit(BC_CHECK_GT, limit); - EmitOrLink(on_greater); -} - -void RegExpBytecodeGenerator::CheckCharacter(uint32_t c, Label* on_equal) { - if (c > MAX_FIRST_ARG) { - Emit(BC_CHECK_4_CHARS, 0); - Emit32(c); - } else { - Emit(BC_CHECK_CHAR, c); - } - EmitOrLink(on_equal); -} - -void RegExpBytecodeGenerator::CheckAtStart(int cp_offset, Label* on_at_start) { - Emit(BC_CHECK_AT_START, cp_offset); - EmitOrLink(on_at_start); -} - -void RegExpBytecodeGenerator::CheckNotAtStart(int cp_offset, - Label* on_not_at_start) { - Emit(BC_CHECK_NOT_AT_START, cp_offset); - EmitOrLink(on_not_at_start); -} - -void RegExpBytecodeGenerator::CheckNotCharacter(uint32_t c, - Label* on_not_equal) { - if (c > MAX_FIRST_ARG) { - Emit(BC_CHECK_NOT_4_CHARS, 0); - Emit32(c); - } else { - Emit(BC_CHECK_NOT_CHAR, c); - } - EmitOrLink(on_not_equal); -} - -void RegExpBytecodeGenerator::CheckCharacterAfterAnd(uint32_t c, uint32_t mask, - Label* on_equal) { - if (c > MAX_FIRST_ARG) { - Emit(BC_AND_CHECK_4_CHARS, 0); - Emit32(c); - } else { - Emit(BC_AND_CHECK_CHAR, c); - } - Emit32(mask); - EmitOrLink(on_equal); -} - -void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c, - uint32_t mask, - Label* on_not_equal) { - if (c > MAX_FIRST_ARG) { - Emit(BC_AND_CHECK_NOT_4_CHARS, 0); - Emit32(c); - } else { - Emit(BC_AND_CHECK_NOT_CHAR, c); - } - Emit32(mask); - EmitOrLink(on_not_equal); -} - -void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd( - uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) { - Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c); - Emit16(minus); - Emit16(mask); - EmitOrLink(on_not_equal); -} - -void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to, - Label* on_in_range) { - Emit(BC_CHECK_CHAR_IN_RANGE, 0); - Emit16(from); - Emit16(to); - EmitOrLink(on_in_range); -} - -void RegExpBytecodeGenerator::CheckCharacterNotInRange(uc16 from, uc16 to, - Label* on_not_in_range) { - Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0); - Emit16(from); - Emit16(to); - EmitOrLink(on_not_in_range); -} - -void RegExpBytecodeGenerator::CheckBitInTable(Handle table, - Label* on_bit_set) { - Emit(BC_CHECK_BIT_IN_TABLE, 0); - EmitOrLink(on_bit_set); - for (int i = 0; i < kTableSize; i += kBitsPerByte) { - int byte = 0; - for (int j = 0; j < kBitsPerByte; j++) { - if (table->get(i + j) != 0) byte |= 1 << j; - } - Emit8(byte); - } -} - -void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg, - bool read_backward, - Label* on_not_equal) { - DCHECK_LE(0, start_reg); - DCHECK_GE(kMaxRegister, start_reg); - Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF, - start_reg); - EmitOrLink(on_not_equal); -} - -void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_not_equal) { - DCHECK_LE(0, start_reg); - DCHECK_GE(kMaxRegister, start_reg); - Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD - : BC_CHECK_NOT_BACK_REF_NO_CASE, - start_reg); - EmitOrLink(on_not_equal); -} - -void RegExpBytecodeGenerator::IfRegisterLT(int register_index, int comparand, - Label* on_less_than) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_CHECK_REGISTER_LT, register_index); - Emit32(comparand); - EmitOrLink(on_less_than); -} - -void RegExpBytecodeGenerator::IfRegisterGE(int register_index, int comparand, - Label* on_greater_or_equal) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_CHECK_REGISTER_GE, register_index); - Emit32(comparand); - EmitOrLink(on_greater_or_equal); -} - -void RegExpBytecodeGenerator::IfRegisterEqPos(int register_index, - Label* on_eq) { - DCHECK_LE(0, register_index); - DCHECK_GE(kMaxRegister, register_index); - Emit(BC_CHECK_REGISTER_EQ_POS, register_index); - EmitOrLink(on_eq); -} - -Handle RegExpBytecodeGenerator::GetCode(Handle source) { - Bind(&backtrack_); - Emit(BC_POP_BT, 0); - - Handle array; - if (FLAG_regexp_peephole_optimization) { - array = RegExpBytecodePeepholeOptimization::OptimizeBytecode( - isolate_, zone(), source, buffer_.begin(), length(), jump_edges_); - } else { - array = isolate_->factory()->NewByteArray(length()); - Copy(array->GetDataStartAddress()); - } - - return array; -} - -int RegExpBytecodeGenerator::length() { return pc_; } - -void RegExpBytecodeGenerator::Copy(byte* a) { - MemCopy(a, buffer_.begin(), length()); -} - -void RegExpBytecodeGenerator::Expand() { - Vector old_buffer = buffer_; - buffer_ = Vector::New(old_buffer.length() * 2); - MemCopy(buffer_.begin(), old_buffer.begin(), old_buffer.length()); - old_buffer.Dispose(); -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-bytecode-generator.h b/js/src/new-regexp/regexp-bytecode-generator.h deleted file mode 100644 index 274fd3953..000000000 --- a/js/src/new-regexp/regexp-bytecode-generator.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2012 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_ -#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_ - -#include "new-regexp/regexp-macro-assembler.h" - -namespace v8 { -namespace internal { - -// An assembler/generator for the Irregexp byte code. -class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { - public: - // Create an assembler. Instructions and relocation information are emitted - // into a buffer, with the instructions starting from the beginning and the - // relocation information starting from the end of the buffer. See CodeDesc - // for a detailed comment on the layout (globals.h). - // - // The assembler allocates and grows its own buffer, and buffer_size - // determines the initial buffer size. The buffer is owned by the assembler - // and deallocated upon destruction of the assembler. - RegExpBytecodeGenerator(Isolate* isolate, Zone* zone); - virtual ~RegExpBytecodeGenerator(); - // The byte-code interpreter checks on each push anyway. - virtual int stack_limit_slack() { return 1; } - virtual bool CanReadUnaligned() { return false; } - virtual void Bind(Label* label); - virtual void AdvanceCurrentPosition(int by); // Signed cp change. - virtual void PopCurrentPosition(); - virtual void PushCurrentPosition(); - virtual void Backtrack(); - virtual void GoTo(Label* label); - virtual void PushBacktrack(Label* label); - virtual bool Succeed(); - virtual void Fail(); - virtual void PopRegister(int register_index); - virtual void PushRegister(int register_index, - StackCheckFlag check_stack_limit); - virtual void AdvanceRegister(int reg, int by); // r[reg] += by. - virtual void SetCurrentPositionFromEnd(int by); - virtual void SetRegister(int register_index, int to); - virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); - virtual void ClearRegisters(int reg_from, int reg_to); - virtual void ReadCurrentPositionFromRegister(int reg); - virtual void WriteStackPointerToRegister(int reg); - virtual void ReadStackPointerFromRegister(int reg); - virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, - bool check_bounds, int characters, - int eats_at_least); - virtual void CheckCharacter(unsigned c, Label* on_equal); - virtual void CheckCharacterAfterAnd(unsigned c, unsigned mask, - Label* on_equal); - virtual void CheckCharacterGT(uc16 limit, Label* on_greater); - virtual void CheckCharacterLT(uc16 limit, Label* on_less); - virtual void CheckGreedyLoop(Label* on_tos_equals_current_position); - virtual void CheckAtStart(int cp_offset, Label* on_at_start); - virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start); - virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); - virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, - Label* on_not_equal); - virtual void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask, - Label* on_not_equal); - virtual void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range); - virtual void CheckCharacterNotInRange(uc16 from, uc16 to, - Label* on_not_in_range); - virtual void CheckBitInTable(Handle table, Label* on_bit_set); - virtual void CheckNotBackReference(int start_reg, bool read_backward, - Label* on_no_match); - virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, - Label* on_no_match); - virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt); - virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge); - virtual void IfRegisterEqPos(int register_index, Label* if_eq); - - virtual IrregexpImplementation Implementation(); - virtual Handle GetCode(Handle source); - - private: - void Expand(); - // Code and bitmap emission. - inline void EmitOrLink(Label* label); - inline void Emit32(uint32_t x); - inline void Emit16(uint32_t x); - inline void Emit8(uint32_t x); - inline void Emit(uint32_t bc, uint32_t arg); - // Bytecode buffer. - int length(); - void Copy(byte* a); - - // The buffer into which code and relocation info are generated. - Vector buffer_; - // The program counter. - int pc_; - Label backtrack_; - - int advance_current_start_; - int advance_current_offset_; - int advance_current_end_; - - // Stores jump edges emitted for the bytecode (used by - // RegExpBytecodePeepholeOptimization). - // Key: jump source (offset in buffer_ where jump destination is stored). - // Value: jump destination (offset in buffer_ to jump to). - ZoneUnorderedMap jump_edges_; - - Isolate* isolate_; - - static const int kInvalidPC = -1; - - DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodeGenerator); -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_ diff --git a/js/src/new-regexp/regexp-bytecode-peephole.cc b/js/src/new-regexp/regexp-bytecode-peephole.cc deleted file mode 100644 index f105a5094..000000000 --- a/js/src/new-regexp/regexp-bytecode-peephole.cc +++ /dev/null @@ -1,1028 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-bytecode-peephole.h" - -#include "new-regexp/regexp-bytecodes.h" - -namespace v8 { -namespace internal { - -namespace { - -struct BytecodeArgument { - int offset; - int length; - - BytecodeArgument(int offset, int length) : offset(offset), length(length) {} -}; - -struct BytecodeArgumentMapping : BytecodeArgument { - int new_length; - - BytecodeArgumentMapping(int offset, int length, int new_length) - : BytecodeArgument(offset, length), new_length(new_length) {} -}; - -struct BytecodeArgumentCheck : BytecodeArgument { - enum CheckType { kCheckAddress = 0, kCheckValue }; - CheckType type; - int check_offset; - int check_length; - - BytecodeArgumentCheck(int offset, int length, int check_offset) - : BytecodeArgument(offset, length), - type(kCheckAddress), - check_offset(check_offset) {} - BytecodeArgumentCheck(int offset, int length, int check_offset, - int check_length) - : BytecodeArgument(offset, length), - type(kCheckValue), - check_offset(check_offset), - check_length(check_length) {} -}; - -// Trie-Node for storing bytecode sequences we want to optimize. -class BytecodeSequenceNode { - public: - // Dummy bytecode used when we need to store/return a bytecode but it's not a - // valid bytecode in the current context. - static constexpr int kDummyBytecode = -1; - - BytecodeSequenceNode(int bytecode, Zone* zone); - // Adds a new node as child of the current node if it isn't a child already. - BytecodeSequenceNode& FollowedBy(int bytecode); - // Marks the end of a sequence and sets optimized bytecode to replace all - // bytecodes of the sequence with. - BytecodeSequenceNode& ReplaceWith(int bytecode); - // Maps arguments of bytecodes in the sequence to the optimized bytecode. - // Order of invocation determines order of arguments in the optimized - // bytecode. - // Invoking this method is only allowed on nodes that mark the end of a valid - // sequence (i.e. after ReplaceWith()). - // bytecode_index_in_sequence: Zero-based index of the referred bytecode - // within the sequence (e.g. the bytecode passed to CreateSequence() has - // index 0). - // argument_offset: Zero-based offset to the argument within the bytecode - // (e.g. the first argument that's not packed with the bytecode has offset 4). - // argument_byte_length: Length of the argument. - // new_argument_byte_length: Length of the argument in the new bytecode - // (= argument_byte_length if omitted). - BytecodeSequenceNode& MapArgument(int bytecode_index_in_sequence, - int argument_offset, - int argument_byte_length, - int new_argument_byte_length = 0); - // Adds a check to the sequence node making it only a valid sequence when the - // argument of the current bytecode at the specified offset matches the offset - // to check against. - // argument_offset: Zero-based offset to the argument within the bytecode - // (e.g. the first argument that's not packed with the bytecode has offset 4). - // argument_byte_length: Length of the argument. - // check_byte_offset: Zero-based offset relative to the beginning of the - // sequence that needs to match the value given by argument_offset. (e.g. - // check_byte_offset 0 matches the address of the first bytecode in the - // sequence). - BytecodeSequenceNode& IfArgumentEqualsOffset(int argument_offset, - int argument_byte_length, - int check_byte_offset); - // Adds a check to the sequence node making it only a valid sequence when the - // argument of the current bytecode at the specified offset matches the - // argument of another bytecode in the sequence. - // This is similar to IfArgumentEqualsOffset, except that this method matches - // the values of both arguments. - BytecodeSequenceNode& IfArgumentEqualsValueAtOffset( - int argument_offset, int argument_byte_length, - int other_bytecode_index_in_sequence, int other_argument_offset, - int other_argument_byte_length); - // Marks an argument as unused. - // All arguments that are not mapped explicitly have to be marked as unused. - // bytecode_index_in_sequence: Zero-based index of the referred bytecode - // within the sequence (e.g. the bytecode passed to CreateSequence() has - // index 0). - // argument_offset: Zero-based offset to the argument within the bytecode - // (e.g. the first argument that's not packed with the bytecode has offset 4). - // argument_byte_length: Length of the argument. - BytecodeSequenceNode& IgnoreArgument(int bytecode_index_in_sequence, - int argument_offset, - int argument_byte_length); - // Checks if the current node is valid for the sequence. I.e. all conditions - // set by IfArgumentEqualsOffset and IfArgumentEquals are fulfilled by this - // node for the actual bytecode sequence. - bool CheckArguments(const byte* bytecode, int pc); - // Returns whether this node marks the end of a valid sequence (i.e. can be - // replaced with an optimized bytecode). - bool IsSequence() const; - // Returns the length of the sequence in bytes. - int SequenceLength() const; - // Returns the optimized bytecode for the node or kDummyBytecode if it is not - // the end of a valid sequence. - int OptimizedBytecode() const; - // Returns the child of the current node matching the given bytecode or - // nullptr if no such child is found. - BytecodeSequenceNode* Find(int bytecode) const; - // Returns number of arguments mapped to the current node. - // Invoking this method is only allowed on nodes that mark the end of a valid - // sequence (i.e. if IsSequence()) - size_t ArgumentSize() const; - // Returns the argument-mapping of the argument at index. - // Invoking this method is only allowed on nodes that mark the end of a valid - // sequence (i.e. if IsSequence()) - BytecodeArgumentMapping ArgumentMapping(size_t index) const; - // Returns an iterator to begin of ignored arguments. - // Invoking this method is only allowed on nodes that mark the end of a valid - // sequence (i.e. if IsSequence()) - ZoneLinkedList::iterator ArgumentIgnoredBegin() const; - // Returns an iterator to end of ignored arguments. - // Invoking this method is only allowed on nodes that mark the end of a valid - // sequence (i.e. if IsSequence()) - ZoneLinkedList::iterator ArgumentIgnoredEnd() const; - // Returns whether the current node has ignored argument or not. - bool HasIgnoredArguments() const; - - private: - // Returns a node in the sequence specified by its index within the sequence. - BytecodeSequenceNode& GetNodeByIndexInSequence(int index_in_sequence); - Zone* zone() const; - - int bytecode_; - int bytecode_replacement_; - int index_in_sequence_; - int start_offset_; - BytecodeSequenceNode* parent_; - ZoneUnorderedMap children_; - ZoneVector* argument_mapping_; - ZoneLinkedList* argument_check_; - ZoneLinkedList* argument_ignored_; - - Zone* zone_; -}; - -class RegExpBytecodePeephole { - public: - RegExpBytecodePeephole(Zone* zone, size_t buffer_size, - const ZoneUnorderedMap& jump_edges); - - // Parses bytecode and fills the internal buffer with the potentially - // optimized bytecode. Returns true when optimizations were performed, false - // otherwise. - bool OptimizeBytecode(const byte* bytecode, int length); - // Copies the internal bytecode buffer to another buffer. The caller is - // responsible for allocating/freeing the memory. - void CopyOptimizedBytecode(byte* to_address) const; - int Length() const; - - private: - // Sets up all sequences that are going to be used. - void DefineStandardSequences(); - // Starts a new bytecode sequence. - BytecodeSequenceNode& CreateSequence(int bytecode); - // Checks for optimization candidates at pc and emits optimized bytecode to - // the internal buffer. Returns the length of replaced bytecodes in bytes. - int TryOptimizeSequence(const byte* bytecode, int start_pc); - // Emits optimized bytecode to the internal buffer. start_pc points to the - // start of the sequence in bytecode and last_node is the last - // BytecodeSequenceNode of the matching sequence found. - void EmitOptimization(int start_pc, const byte* bytecode, - const BytecodeSequenceNode& last_node); - // Adds a relative jump source fixup at pos. - // Jump source fixups are used to find offsets in the new bytecode that - // contain jump sources. - void AddJumpSourceFixup(int fixup, int pos); - // Adds a relative jump destination fixup at pos. - // Jump destination fixups are used to find offsets in the new bytecode that - // can be jumped to. - void AddJumpDestinationFixup(int fixup, int pos); - // Sets an absolute jump destination fixup at pos. - void SetJumpDestinationFixup(int fixup, int pos); - // Prepare internal structures used to fixup jumps. - void PrepareJumpStructures(const ZoneUnorderedMap& jump_edges); - // Updates all jump targets in the new bytecode. - void FixJumps(); - // Update a single jump. - void FixJump(int jump_source, int jump_destination); - void AddSentinelFixups(int pos); - template - void EmitValue(T value); - template - void OverwriteValue(int offset, T value); - void CopyRangeToOutput(const byte* orig_bytecode, int start, int length); - void SetRange(byte value, int count); - void EmitArgument(int start_pc, const byte* bytecode, - BytecodeArgumentMapping arg); - int pc() const; - Zone* zone() const; - - ZoneVector optimized_bytecode_buffer_; - BytecodeSequenceNode* sequences_; - // Jumps used in old bytecode. - // Key: Jump source (offset where destination is stored in old bytecode) - // Value: Destination - ZoneMap jump_edges_; - // Jumps used in new bytecode. - // Key: Jump source (offset where destination is stored in new bytecode) - // Value: Destination - ZoneMap jump_edges_mapped_; - // Number of times a jump destination is used within the bytecode. - // Key: Jump destination (offset in old bytecode). - // Value: Number of times jump destination is used. - ZoneMap jump_usage_counts_; - // Maps offsets in old bytecode to fixups of sources (delta to new bytecode). - // Key: Offset in old bytecode from where the fixup is valid. - // Value: Delta to map jump source from old bytecode to new bytecode in bytes. - ZoneMap jump_source_fixups_; - // Maps offsets in old bytecode to fixups of destinations (delta to new - // bytecode). - // Key: Offset in old bytecode from where the fixup is valid. - // Value: Delta to map jump destinations from old bytecode to new bytecode in - // bytes. - ZoneMap jump_destination_fixups_; - - Zone* zone_; - - DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodePeephole); -}; - -template -T GetValue(const byte* buffer, int pos) { - DCHECK(IsAligned(reinterpret_cast
(buffer + pos), alignof(T))); - return *reinterpret_cast(buffer + pos); -} - -int32_t GetArgumentValue(const byte* bytecode, int offset, int length) { - switch (length) { - case 1: - return GetValue(bytecode, offset); - break; - case 2: - return GetValue(bytecode, offset); - break; - case 4: - return GetValue(bytecode, offset); - break; - default: - UNREACHABLE(); - } -} - -BytecodeSequenceNode::BytecodeSequenceNode(int bytecode, Zone* zone) - : bytecode_(bytecode), - bytecode_replacement_(kDummyBytecode), - index_in_sequence_(0), - start_offset_(0), - parent_(nullptr), - children_(ZoneUnorderedMap(zone)), - argument_mapping_(new (zone->New(sizeof(*argument_mapping_))) - ZoneVector(zone)), - argument_check_(new (zone->New(sizeof(*argument_check_))) - ZoneLinkedList(zone)), - argument_ignored_(new (zone->New(sizeof(*argument_ignored_))) - ZoneLinkedList(zone)), - zone_(zone) {} - -BytecodeSequenceNode& BytecodeSequenceNode::FollowedBy(int bytecode) { - DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount); - - if (children_.find(bytecode) == children_.end()) { - BytecodeSequenceNode* new_node = - new (zone()->New(sizeof(BytecodeSequenceNode))) - BytecodeSequenceNode(bytecode, zone()); - // If node is not the first in the sequence, set offsets and parent. - if (bytecode_ != kDummyBytecode) { - new_node->start_offset_ = start_offset_ + RegExpBytecodeLength(bytecode_); - new_node->index_in_sequence_ = index_in_sequence_ + 1; - new_node->parent_ = this; - } - children_[bytecode] = new_node; - } - - return *children_[bytecode]; -} - -BytecodeSequenceNode& BytecodeSequenceNode::ReplaceWith(int bytecode) { - DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount); - - bytecode_replacement_ = bytecode; - - return *this; -} - -BytecodeSequenceNode& BytecodeSequenceNode::MapArgument( - int bytecode_index_in_sequence, int argument_offset, - int argument_byte_length, int new_argument_byte_length) { - DCHECK(IsSequence()); - DCHECK_LE(bytecode_index_in_sequence, index_in_sequence_); - - BytecodeSequenceNode& ref_node = - GetNodeByIndexInSequence(bytecode_index_in_sequence); - DCHECK_LT(argument_offset, RegExpBytecodeLength(ref_node.bytecode_)); - - int absolute_offset = ref_node.start_offset_ + argument_offset; - if (new_argument_byte_length == 0) { - new_argument_byte_length = argument_byte_length; - } - - argument_mapping_->push_back(BytecodeArgumentMapping{ - absolute_offset, argument_byte_length, new_argument_byte_length}); - - return *this; -} - -BytecodeSequenceNode& BytecodeSequenceNode::IfArgumentEqualsOffset( - int argument_offset, int argument_byte_length, int check_byte_offset) { - DCHECK_LT(argument_offset, RegExpBytecodeLength(bytecode_)); - DCHECK(argument_byte_length == 1 || argument_byte_length == 2 || - argument_byte_length == 4); - - int absolute_offset = start_offset_ + argument_offset; - - argument_check_->push_back(BytecodeArgumentCheck{ - absolute_offset, argument_byte_length, check_byte_offset}); - - return *this; -} - -BytecodeSequenceNode& BytecodeSequenceNode::IfArgumentEqualsValueAtOffset( - int argument_offset, int argument_byte_length, - int other_bytecode_index_in_sequence, int other_argument_offset, - int other_argument_byte_length) { - DCHECK_LT(argument_offset, RegExpBytecodeLength(bytecode_)); - DCHECK_LE(other_bytecode_index_in_sequence, index_in_sequence_); - DCHECK_EQ(argument_byte_length, other_argument_byte_length); - - BytecodeSequenceNode& ref_node = - GetNodeByIndexInSequence(other_bytecode_index_in_sequence); - DCHECK_LT(other_argument_offset, RegExpBytecodeLength(ref_node.bytecode_)); - - int absolute_offset = start_offset_ + argument_offset; - int other_absolute_offset = ref_node.start_offset_ + other_argument_offset; - - argument_check_->push_back( - BytecodeArgumentCheck{absolute_offset, argument_byte_length, - other_absolute_offset, other_argument_byte_length}); - - return *this; -} - -BytecodeSequenceNode& BytecodeSequenceNode::IgnoreArgument( - int bytecode_index_in_sequence, int argument_offset, - int argument_byte_length) { - DCHECK(IsSequence()); - DCHECK_LE(bytecode_index_in_sequence, index_in_sequence_); - - BytecodeSequenceNode& ref_node = - GetNodeByIndexInSequence(bytecode_index_in_sequence); - DCHECK_LT(argument_offset, RegExpBytecodeLength(ref_node.bytecode_)); - - int absolute_offset = ref_node.start_offset_ + argument_offset; - - argument_ignored_->push_back( - BytecodeArgument{absolute_offset, argument_byte_length}); - - return *this; -} - -bool BytecodeSequenceNode::CheckArguments(const byte* bytecode, int pc) { - bool is_valid = true; - for (auto check_iter = argument_check_->begin(); - check_iter != argument_check_->end() && is_valid; check_iter++) { - auto value = - GetArgumentValue(bytecode, pc + check_iter->offset, check_iter->length); - if (check_iter->type == BytecodeArgumentCheck::kCheckAddress) { - is_valid &= value == pc + check_iter->check_offset; - } else if (check_iter->type == BytecodeArgumentCheck::kCheckValue) { - auto other_value = GetArgumentValue( - bytecode, pc + check_iter->check_offset, check_iter->check_length); - is_valid &= value == other_value; - } else { - UNREACHABLE(); - } - } - return is_valid; -} - -bool BytecodeSequenceNode::IsSequence() const { - return bytecode_replacement_ != kDummyBytecode; -} - -int BytecodeSequenceNode::SequenceLength() const { - return start_offset_ + RegExpBytecodeLength(bytecode_); -} - -int BytecodeSequenceNode::OptimizedBytecode() const { - return bytecode_replacement_; -} - -BytecodeSequenceNode* BytecodeSequenceNode::Find(int bytecode) const { - auto found = children_.find(bytecode); - if (found == children_.end()) return nullptr; - return found->second; -} - -size_t BytecodeSequenceNode::ArgumentSize() const { - DCHECK(IsSequence()); - return argument_mapping_->size(); -} - -BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping( - size_t index) const { - DCHECK(IsSequence()); - DCHECK(argument_mapping_ != nullptr); - DCHECK_LT(index, argument_mapping_->size()); - - return argument_mapping_->at(index); -} - -ZoneLinkedList::iterator -BytecodeSequenceNode::ArgumentIgnoredBegin() const { - DCHECK(IsSequence()); - DCHECK(argument_ignored_ != nullptr); - return argument_ignored_->begin(); -} - -ZoneLinkedList::iterator -BytecodeSequenceNode::ArgumentIgnoredEnd() const { - DCHECK(IsSequence()); - DCHECK(argument_ignored_ != nullptr); - return argument_ignored_->end(); -} - -bool BytecodeSequenceNode::HasIgnoredArguments() const { - return argument_ignored_ != nullptr; -} - -BytecodeSequenceNode& BytecodeSequenceNode::GetNodeByIndexInSequence( - int index_in_sequence) { - DCHECK_LE(index_in_sequence, index_in_sequence_); - - if (index_in_sequence < index_in_sequence_) { - DCHECK(parent_ != nullptr); - return parent_->GetNodeByIndexInSequence(index_in_sequence); - } else { - return *this; - } -} - -Zone* BytecodeSequenceNode::zone() const { return zone_; } - -RegExpBytecodePeephole::RegExpBytecodePeephole( - Zone* zone, size_t buffer_size, - const ZoneUnorderedMap& jump_edges) - : optimized_bytecode_buffer_(zone), - sequences_(new (zone->New(sizeof(*sequences_))) BytecodeSequenceNode( - BytecodeSequenceNode::kDummyBytecode, zone)), - jump_edges_(zone), - jump_edges_mapped_(zone), - jump_usage_counts_(zone), - jump_source_fixups_(zone), - jump_destination_fixups_(zone), - zone_(zone) { - optimized_bytecode_buffer_.reserve(buffer_size); - PrepareJumpStructures(jump_edges); - DefineStandardSequences(); - // Sentinel fixups at beginning of bytecode (position -1) so we don't have to - // check for end of iterator inside the fixup loop. - // In general fixups are deltas of original offsets of jump - // sources/destinations (in the old bytecode) to find them in the new - // bytecode. All jump targets are fixed after the new bytecode is fully - // emitted in the internal buffer. - AddSentinelFixups(-1); - // Sentinel fixups at end of (old) bytecode so we don't have to check for - // end of iterator inside the fixup loop. - DCHECK_LE(buffer_size, std::numeric_limits::max()); - AddSentinelFixups(static_cast(buffer_size)); -} - -void RegExpBytecodePeephole::DefineStandardSequences() { - // Commonly used sequences can be found by creating regexp bytecode traces - // (--trace-regexp-bytecodes) and using v8/tools/regexp-sequences.py. - CreateSequence(BC_LOAD_CURRENT_CHAR) - .FollowedBy(BC_CHECK_BIT_IN_TABLE) - .FollowedBy(BC_ADVANCE_CP_AND_GOTO) - // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the - // first bytecode in this sequence. - .IfArgumentEqualsOffset(4, 4, 0) - .ReplaceWith(BC_SKIP_UNTIL_BIT_IN_TABLE) - .MapArgument(0, 1, 3) // load offset - .MapArgument(2, 1, 3, 4) // advance by - .MapArgument(1, 8, 16) // bit table - .MapArgument(1, 4, 4) // goto when match - .MapArgument(0, 4, 4) // goto on failure - .IgnoreArgument(2, 4, 4); // loop jump - - CreateSequence(BC_CHECK_CURRENT_POSITION) - .FollowedBy(BC_LOAD_CURRENT_CHAR_UNCHECKED) - .FollowedBy(BC_CHECK_CHAR) - .FollowedBy(BC_ADVANCE_CP_AND_GOTO) - // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the - // first bytecode in this sequence. - .IfArgumentEqualsOffset(4, 4, 0) - .ReplaceWith(BC_SKIP_UNTIL_CHAR_POS_CHECKED) - .MapArgument(1, 1, 3) // load offset - .MapArgument(3, 1, 3, 2) // advance_by - .MapArgument(2, 1, 3, 2) // c - .MapArgument(0, 1, 3, 4) // eats at least - .MapArgument(2, 4, 4) // goto when match - .MapArgument(0, 4, 4) // goto on failure - .IgnoreArgument(3, 4, 4); // loop jump - - CreateSequence(BC_CHECK_CURRENT_POSITION) - .FollowedBy(BC_LOAD_CURRENT_CHAR_UNCHECKED) - .FollowedBy(BC_AND_CHECK_CHAR) - .FollowedBy(BC_ADVANCE_CP_AND_GOTO) - // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the - // first bytecode in this sequence. - .IfArgumentEqualsOffset(4, 4, 0) - .ReplaceWith(BC_SKIP_UNTIL_CHAR_AND) - .MapArgument(1, 1, 3) // load offset - .MapArgument(3, 1, 3, 2) // advance_by - .MapArgument(2, 1, 3, 2) // c - .MapArgument(2, 4, 4) // mask - .MapArgument(0, 1, 3, 4) // eats at least - .MapArgument(2, 8, 4) // goto when match - .MapArgument(0, 4, 4) // goto on failure - .IgnoreArgument(3, 4, 4); // loop jump - - // TODO(pthier): It might make sense for short sequences like this one to only - // optimize them if the resulting optimization is not longer than the current - // one. This could be the case if there are jumps inside the sequence and we - // have to replicate parts of the sequence. A method to mark such sequences - // might be useful. - CreateSequence(BC_LOAD_CURRENT_CHAR) - .FollowedBy(BC_CHECK_CHAR) - .FollowedBy(BC_ADVANCE_CP_AND_GOTO) - // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the - // first bytecode in this sequence. - .IfArgumentEqualsOffset(4, 4, 0) - .ReplaceWith(BC_SKIP_UNTIL_CHAR) - .MapArgument(0, 1, 3) // load offset - .MapArgument(2, 1, 3, 2) // advance by - .MapArgument(1, 1, 3, 2) // character - .MapArgument(1, 4, 4) // goto when match - .MapArgument(0, 4, 4) // goto on failure - .IgnoreArgument(2, 4, 4); // loop jump - - CreateSequence(BC_LOAD_CURRENT_CHAR) - .FollowedBy(BC_CHECK_CHAR) - .FollowedBy(BC_CHECK_CHAR) - // Sequence is only valid if the jump targets of both CHECK_CHAR bytecodes - // are equal. - .IfArgumentEqualsValueAtOffset(4, 4, 1, 4, 4) - .FollowedBy(BC_ADVANCE_CP_AND_GOTO) - // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the - // first bytecode in this sequence. - .IfArgumentEqualsOffset(4, 4, 0) - .ReplaceWith(BC_SKIP_UNTIL_CHAR_OR_CHAR) - .MapArgument(0, 1, 3) // load offset - .MapArgument(3, 1, 3, 4) // advance by - .MapArgument(1, 1, 3, 2) // character 1 - .MapArgument(2, 1, 3, 2) // character 2 - .MapArgument(1, 4, 4) // goto when match - .MapArgument(0, 4, 4) // goto on failure - .IgnoreArgument(2, 4, 4) // goto when match 2 - .IgnoreArgument(3, 4, 4); // loop jump - - CreateSequence(BC_LOAD_CURRENT_CHAR) - .FollowedBy(BC_CHECK_GT) - // Sequence is only valid if the jump target of CHECK_GT is the first - // bytecode AFTER the whole sequence. - .IfArgumentEqualsOffset(4, 4, 56) - .FollowedBy(BC_CHECK_BIT_IN_TABLE) - // Sequence is only valid if the jump target of CHECK_BIT_IN_TABLE is - // the ADVANCE_CP_AND_GOTO bytecode at the end of the sequence. - .IfArgumentEqualsOffset(4, 4, 48) - .FollowedBy(BC_GOTO) - // Sequence is only valid if the jump target of GOTO is the same as the - // jump target of CHECK_GT (i.e. both jump to the first bytecode AFTER the - // whole sequence. - .IfArgumentEqualsValueAtOffset(4, 4, 1, 4, 4) - .FollowedBy(BC_ADVANCE_CP_AND_GOTO) - // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the - // first bytecode in this sequence. - .IfArgumentEqualsOffset(4, 4, 0) - .ReplaceWith(BC_SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) - .MapArgument(0, 1, 3) // load offset - .MapArgument(4, 1, 3, 2) // advance by - .MapArgument(1, 1, 3, 2) // character - .MapArgument(2, 8, 16) // bit table - .MapArgument(1, 4, 4) // goto when match - .MapArgument(0, 4, 4) // goto on failure - .IgnoreArgument(2, 4, 4) // indirect loop jump - .IgnoreArgument(3, 4, 4) // jump out of loop - .IgnoreArgument(4, 4, 4); // loop jump -} - -bool RegExpBytecodePeephole::OptimizeBytecode(const byte* bytecode, - int length) { - int old_pc = 0; - bool did_optimize = false; - - while (old_pc < length) { - int replaced_len = TryOptimizeSequence(bytecode, old_pc); - if (replaced_len > 0) { - old_pc += replaced_len; - did_optimize = true; - } else { - int bc = bytecode[old_pc]; - int bc_len = RegExpBytecodeLength(bc); - CopyRangeToOutput(bytecode, old_pc, bc_len); - old_pc += bc_len; - } - } - - if (did_optimize) { - FixJumps(); - } - - return did_optimize; -} - -void RegExpBytecodePeephole::CopyOptimizedBytecode(byte* to_address) const { - MemCopy(to_address, &(*optimized_bytecode_buffer_.begin()), Length()); -} - -int RegExpBytecodePeephole::Length() const { return pc(); } - -BytecodeSequenceNode& RegExpBytecodePeephole::CreateSequence(int bytecode) { - DCHECK(sequences_ != nullptr); - DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount); - - return sequences_->FollowedBy(bytecode); -} - -int RegExpBytecodePeephole::TryOptimizeSequence(const byte* bytecode, - int start_pc) { - BytecodeSequenceNode* seq_node = sequences_; - BytecodeSequenceNode* valid_seq_end = nullptr; - - int current_pc = start_pc; - - // Check for the longest valid sequence matching any of the pre-defined - // sequences in the Trie data structure. - while ((seq_node = seq_node->Find(bytecode[current_pc]))) { - if (!seq_node->CheckArguments(bytecode, start_pc)) { - break; - } - if (seq_node->IsSequence()) { - valid_seq_end = seq_node; - } - current_pc += RegExpBytecodeLength(bytecode[current_pc]); - } - - if (valid_seq_end) { - EmitOptimization(start_pc, bytecode, *valid_seq_end); - return valid_seq_end->SequenceLength(); - } - - return 0; -} - -void RegExpBytecodePeephole::EmitOptimization( - int start_pc, const byte* bytecode, const BytecodeSequenceNode& last_node) { -#ifdef DEBUG - int optimized_start_pc = pc(); -#endif - // Jump sources that are mapped or marked as unused will be deleted at the end - // of this method. We don't delete them immediately as we might need the - // information when we have to preserve bytecodes at the end. - // TODO(pthier): Replace with a stack-allocated data structure. - ZoneLinkedList delete_jumps = ZoneLinkedList(zone()); - - uint32_t bc = last_node.OptimizedBytecode(); - EmitValue(bc); - - for (size_t arg = 0; arg < last_node.ArgumentSize(); arg++) { - BytecodeArgumentMapping arg_map = last_node.ArgumentMapping(arg); - int arg_pos = start_pc + arg_map.offset; - // If we map any jump source we mark the old source for deletion and insert - // a new jump. - auto jump_edge_iter = jump_edges_.find(arg_pos); - if (jump_edge_iter != jump_edges_.end()) { - int jump_source = jump_edge_iter->first; - int jump_destination = jump_edge_iter->second; - // Add new jump edge add current position. - jump_edges_mapped_.emplace(Length(), jump_destination); - // Mark old jump edge for deletion. - delete_jumps.push_back(jump_source); - // Decrement usage count of jump destination. - auto jump_count_iter = jump_usage_counts_.find(jump_destination); - DCHECK(jump_count_iter != jump_usage_counts_.end()); - int& usage_count = jump_count_iter->second; - --usage_count; - } - // TODO(pthier): DCHECK that mapped arguments are never sources of jumps - // to destinations inside the sequence. - EmitArgument(start_pc, bytecode, arg_map); - } - DCHECK_EQ(pc(), optimized_start_pc + - RegExpBytecodeLength(last_node.OptimizedBytecode())); - - // Remove jumps from arguments we ignore. - if (last_node.HasIgnoredArguments()) { - for (auto ignored_arg = last_node.ArgumentIgnoredBegin(); - ignored_arg != last_node.ArgumentIgnoredEnd(); ignored_arg++) { - auto jump_edge_iter = jump_edges_.find(start_pc + ignored_arg->offset); - if (jump_edge_iter != jump_edges_.end()) { - int jump_source = jump_edge_iter->first; - int jump_destination = jump_edge_iter->second; - // Mark old jump edge for deletion. - delete_jumps.push_back(jump_source); - // Decrement usage count of jump destination. - auto jump_count_iter = jump_usage_counts_.find(jump_destination); - DCHECK(jump_count_iter != jump_usage_counts_.end()); - int& usage_count = jump_count_iter->second; - --usage_count; - } - } - } - - int fixup_length = RegExpBytecodeLength(bc) - last_node.SequenceLength(); - - // Check if there are any jumps inside the old sequence. - // If so we have to keep the bytecodes that are jumped to around. - auto jump_destination_candidate = jump_usage_counts_.upper_bound(start_pc); - int jump_candidate_destination = jump_destination_candidate->first; - int jump_candidate_count = jump_destination_candidate->second; - // Jump destinations only jumped to from inside the sequence will be ignored. - while (jump_destination_candidate != jump_usage_counts_.end() && - jump_candidate_count == 0) { - ++jump_destination_candidate; - jump_candidate_destination = jump_destination_candidate->first; - jump_candidate_count = jump_destination_candidate->second; - } - - int preserve_from = start_pc + last_node.SequenceLength(); - if (jump_destination_candidate != jump_usage_counts_.end() && - jump_candidate_destination < start_pc + last_node.SequenceLength()) { - preserve_from = jump_candidate_destination; - // Check if any jump in the sequence we are preserving has a jump - // destination inside the optimized sequence before the current position we - // want to preserve. If so we have to preserve all bytecodes starting at - // this jump destination. - for (auto jump_iter = jump_edges_.lower_bound(preserve_from); - jump_iter != jump_edges_.end() && - jump_iter->first /* jump source */ < - start_pc + last_node.SequenceLength(); - ++jump_iter) { - int jump_destination = jump_iter->second; - if (jump_destination > start_pc && jump_destination < preserve_from) { - preserve_from = jump_destination; - } - } - - // We preserve everything to the end of the sequence. This is conservative - // since it would be enough to preserve all bytecudes up to an unconditional - // jump. - int preserve_length = start_pc + last_node.SequenceLength() - preserve_from; - fixup_length += preserve_length; - // Jumps after the start of the preserved sequence need fixup. - AddJumpSourceFixup(fixup_length, - start_pc + last_node.SequenceLength() - preserve_length); - // All jump targets after the start of the optimized sequence need to be - // fixed relative to the length of the optimized sequence including - // bytecodes we preserved. - AddJumpDestinationFixup(fixup_length, start_pc + 1); - // Jumps to the sequence we preserved need absolute fixup as they could - // occur before or after the sequence. - SetJumpDestinationFixup(pc() - preserve_from, preserve_from); - CopyRangeToOutput(bytecode, preserve_from, preserve_length); - } else { - AddJumpDestinationFixup(fixup_length, start_pc + 1); - // Jumps after the end of the old sequence need fixup. - AddJumpSourceFixup(fixup_length, start_pc + last_node.SequenceLength()); - } - - // Delete jumps we definitely don't need anymore - for (int del : delete_jumps) { - if (del < preserve_from) { - jump_edges_.erase(del); - } - } -} - -void RegExpBytecodePeephole::AddJumpSourceFixup(int fixup, int pos) { - auto previous_fixup = jump_source_fixups_.lower_bound(pos); - DCHECK(previous_fixup != jump_source_fixups_.end()); - DCHECK(previous_fixup != jump_source_fixups_.begin()); - - int previous_fixup_value = (--previous_fixup)->second; - jump_source_fixups_[pos] = previous_fixup_value + fixup; -} - -void RegExpBytecodePeephole::AddJumpDestinationFixup(int fixup, int pos) { - auto previous_fixup = jump_destination_fixups_.lower_bound(pos); - DCHECK(previous_fixup != jump_destination_fixups_.end()); - DCHECK(previous_fixup != jump_destination_fixups_.begin()); - - int previous_fixup_value = (--previous_fixup)->second; - jump_destination_fixups_[pos] = previous_fixup_value + fixup; -} - -void RegExpBytecodePeephole::SetJumpDestinationFixup(int fixup, int pos) { - auto previous_fixup = jump_destination_fixups_.lower_bound(pos); - DCHECK(previous_fixup != jump_destination_fixups_.end()); - DCHECK(previous_fixup != jump_destination_fixups_.begin()); - - int previous_fixup_value = (--previous_fixup)->second; - jump_destination_fixups_.emplace(pos, fixup); - jump_destination_fixups_.emplace(pos + 1, previous_fixup_value); -} - -void RegExpBytecodePeephole::PrepareJumpStructures( - const ZoneUnorderedMap& jump_edges) { - for (auto jump_edge : jump_edges) { - int jump_source = jump_edge.first; - int jump_destination = jump_edge.second; - - jump_edges_.emplace(jump_source, jump_destination); - jump_usage_counts_[jump_destination]++; - } -} - -void RegExpBytecodePeephole::FixJumps() { - int position_fixup = 0; - // Next position where fixup changes. - auto next_source_fixup = jump_source_fixups_.lower_bound(0); - int next_source_fixup_offset = next_source_fixup->first; - int next_source_fixup_value = next_source_fixup->second; - - for (auto jump_edge : jump_edges_) { - int jump_source = jump_edge.first; - int jump_destination = jump_edge.second; - while (jump_source >= next_source_fixup_offset) { - position_fixup = next_source_fixup_value; - ++next_source_fixup; - next_source_fixup_offset = next_source_fixup->first; - next_source_fixup_value = next_source_fixup->second; - } - jump_source += position_fixup; - - FixJump(jump_source, jump_destination); - } - - // Mapped jump edges don't need source fixups, as the position already is an - // offset in the new bytecode. - for (auto jump_edge : jump_edges_mapped_) { - int jump_source = jump_edge.first; - int jump_destination = jump_edge.second; - - FixJump(jump_source, jump_destination); - } -} - -void RegExpBytecodePeephole::FixJump(int jump_source, int jump_destination) { - int fixed_jump_destination = - jump_destination + - (--jump_destination_fixups_.upper_bound(jump_destination))->second; - DCHECK_LT(fixed_jump_destination, Length()); -#ifdef DEBUG - // TODO(pthier): This check could be better if we track the bytecodes - // actually used and check if we jump to one of them. - byte jump_bc = optimized_bytecode_buffer_[fixed_jump_destination]; - DCHECK_GT(jump_bc, 0); - DCHECK_LT(jump_bc, kRegExpBytecodeCount); -#endif - - if (jump_destination != fixed_jump_destination) { - OverwriteValue(jump_source, fixed_jump_destination); - } -} - -void RegExpBytecodePeephole::AddSentinelFixups(int pos) { - jump_source_fixups_.emplace(pos, 0); - jump_destination_fixups_.emplace(pos, 0); -} - -template -void RegExpBytecodePeephole::EmitValue(T value) { - DCHECK(optimized_bytecode_buffer_.begin() + pc() == - optimized_bytecode_buffer_.end()); - byte* value_byte_iter = reinterpret_cast(&value); - optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(), - value_byte_iter, - value_byte_iter + sizeof(T)); -} - -template -void RegExpBytecodePeephole::OverwriteValue(int offset, T value) { - byte* value_byte_iter = reinterpret_cast(&value); - byte* value_byte_iter_end = value_byte_iter + sizeof(T); - while (value_byte_iter < value_byte_iter_end) { - optimized_bytecode_buffer_[offset++] = *value_byte_iter++; - } -} - -void RegExpBytecodePeephole::CopyRangeToOutput(const byte* orig_bytecode, - int start, int length) { - DCHECK(optimized_bytecode_buffer_.begin() + pc() == - optimized_bytecode_buffer_.end()); - optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(), - orig_bytecode + start, - orig_bytecode + start + length); -} - -void RegExpBytecodePeephole::SetRange(byte value, int count) { - DCHECK(optimized_bytecode_buffer_.begin() + pc() == - optimized_bytecode_buffer_.end()); - optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(), count, - value); -} - -void RegExpBytecodePeephole::EmitArgument(int start_pc, const byte* bytecode, - BytecodeArgumentMapping arg) { - int arg_pos = start_pc + arg.offset; - switch (arg.length) { - case 1: - DCHECK_EQ(arg.new_length, arg.length); - EmitValue(GetValue(bytecode, arg_pos)); - break; - case 2: - DCHECK_EQ(arg.new_length, arg.length); - EmitValue(GetValue(bytecode, arg_pos)); - break; - case 3: { - // Length 3 only occurs in 'packed' arguments where the lowermost byte is - // the current bytecode, and the remaining 3 bytes are the packed value. - // - // We load 4 bytes from position - 1 and shift out the bytecode. -#ifdef V8_TARGET_BIG_ENDIAN - UNIMPLEMENTED(); - int32_t val = 0; -#else - int32_t val = GetValue(bytecode, arg_pos - 1) >> kBitsPerByte; -#endif // V8_TARGET_BIG_ENDIAN - - switch (arg.new_length) { - case 2: - EmitValue(val); - break; - case 3: { - // Pack with previously emitted value. - auto prev_val = - GetValue(&(*optimized_bytecode_buffer_.begin()), - Length() - sizeof(uint32_t)); -#ifdef V8_TARGET_BIG_ENDIAN - UNIMPLEMENTED(); - USE(prev_val); -#else - DCHECK_EQ(prev_val & 0xFFFFFF00, 0); - OverwriteValue( - pc() - sizeof(uint32_t), - (static_cast(val) << 8) | (prev_val & 0xFF)); -#endif // V8_TARGET_BIG_ENDIAN - break; - } - case 4: - EmitValue(val); - break; - } - break; - } - case 4: - DCHECK_EQ(arg.new_length, arg.length); - EmitValue(GetValue(bytecode, arg_pos)); - break; - case 8: - DCHECK_EQ(arg.new_length, arg.length); - EmitValue(GetValue(bytecode, arg_pos)); - break; - default: - CopyRangeToOutput(bytecode, arg_pos, Min(arg.length, arg.new_length)); - if (arg.length < arg.new_length) { - SetRange(0x00, arg.new_length - arg.length); - } - break; - } -} - -int RegExpBytecodePeephole::pc() const { - DCHECK_LE(optimized_bytecode_buffer_.size(), std::numeric_limits::max()); - return static_cast(optimized_bytecode_buffer_.size()); -} - -Zone* RegExpBytecodePeephole::zone() const { return zone_; } - -} // namespace - -// static -Handle RegExpBytecodePeepholeOptimization::OptimizeBytecode( - Isolate* isolate, Zone* zone, Handle source, const byte* bytecode, - int length, const ZoneUnorderedMap& jump_edges) { - RegExpBytecodePeephole peephole(zone, length, jump_edges); - bool did_optimize = peephole.OptimizeBytecode(bytecode, length); - Handle array = isolate->factory()->NewByteArray(peephole.Length()); - peephole.CopyOptimizedBytecode(array->GetDataStartAddress()); - - if (did_optimize && FLAG_trace_regexp_peephole_optimization) { - PrintF("Original Bytecode:\n"); - RegExpBytecodeDisassemble(bytecode, length, source->ToCString().get()); - PrintF("Optimized Bytecode:\n"); - RegExpBytecodeDisassemble(array->GetDataStartAddress(), peephole.Length(), - source->ToCString().get()); - } - - return array; -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-bytecode-peephole.h b/js/src/new-regexp/regexp-bytecode-peephole.h deleted file mode 100644 index 781f0c914..000000000 --- a/js/src/new-regexp/regexp-bytecode-peephole.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_ -#define V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_ - -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -class ByteArray; - -// Peephole optimization for regexp interpreter bytecode. -// Pre-defined bytecode sequences occuring in the bytecode generated by the -// RegExpBytecodeGenerator can be optimized into a single bytecode. -class RegExpBytecodePeepholeOptimization : public AllStatic { - public: - // Performs peephole optimization on the given bytecode and returns the - // optimized bytecode. - static Handle OptimizeBytecode( - Isolate* isolate, Zone* zone, Handle source, const byte* bytecode, - int length, const ZoneUnorderedMap& jump_edges); -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_ diff --git a/js/src/new-regexp/regexp-bytecodes.cc b/js/src/new-regexp/regexp-bytecodes.cc deleted file mode 100644 index 679a7c06a..000000000 --- a/js/src/new-regexp/regexp-bytecodes.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-bytecodes.h" - -#include - - -namespace v8 { -namespace internal { - -void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc) { - PrintF("%s", RegExpBytecodeName(*pc)); - - // Args and the bytecode as hex. - for (int i = 0; i < RegExpBytecodeLength(*pc); i++) { - PrintF(", %02x", pc[i]); - } - PrintF(" "); - - // Args as ascii. - for (int i = 1; i < RegExpBytecodeLength(*pc); i++) { - unsigned char b = pc[i]; - PrintF("%c", std::isprint(b) ? b : '.'); - } - PrintF("\n"); -} - -void RegExpBytecodeDisassemble(const byte* code_base, int length, - const char* pattern) { - PrintF("[generated bytecode for regexp pattern: '%s']\n", pattern); - - ptrdiff_t offset = 0; - - while (offset < length) { - const byte* const pc = code_base + offset; - PrintF("%p %4" V8PRIxPTRDIFF " ", pc, offset); - RegExpBytecodeDisassembleSingle(code_base, pc); - offset += RegExpBytecodeLength(*pc); - } -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-bytecodes.h b/js/src/new-regexp/regexp-bytecodes.h deleted file mode 100644 index e5ab7cf66..000000000 --- a/js/src/new-regexp/regexp-bytecodes.h +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2011 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_BYTECODES_H_ -#define V8_REGEXP_REGEXP_BYTECODES_H_ - -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -// Maximum number of bytecodes that will be used (next power of 2 of actually -// defined bytecodes). -// All slots between the last actually defined bytecode and maximum id will be -// filled with BREAKs, indicating an invalid operation. This way using -// BYTECODE_MASK guarantees no OOB access to the dispatch table. -constexpr int kRegExpPaddedBytecodeCount = 1 << 6; -constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1; -// The first argument is packed in with the byte code in one word, but so it -// has 24 bits, but it can be positive and negative so only use 23 bits for -// positive values. -const unsigned int MAX_FIRST_ARG = 0x7fffffu; -const int BYTECODE_SHIFT = 8; -STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); - -// TODO(pthier): Argument offsets of bytecodes should be easily accessible by -// name or at least by position. -#define BYTECODE_ITERATOR(V) \ - V(BREAK, 0, 4) /* bc8 */ \ - V(PUSH_CP, 1, 4) /* bc8 pad24 */ \ - V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \ - V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \ - V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \ - V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \ - V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \ - V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \ - V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \ - V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \ - V(POP_CP, 10, 4) /* bc8 pad24 */ \ - V(POP_BT, 11, 4) /* bc8 pad24 */ \ - V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \ - V(FAIL, 13, 4) /* bc8 pad24 */ \ - V(SUCCEED, 14, 4) /* bc8 pad24 */ \ - V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \ - /* Jump to another bytecode given its offset. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x10 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: 0x00 (unused) Padding */ \ - /* 0x20 - 0x3F: Address of bytecode to jump to */ \ - V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \ - /* Check if offset is in range and load character at given offset. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x11 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: Offset from current position */ \ - /* 0x20 - 0x3F: Address of bytecode when load is out of range */ \ - V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \ - /* Load character at given offset without range checks. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x12 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: Offset from current position */ \ - V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \ - V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \ - V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \ - V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \ - V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \ - V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \ - /* Check if current character is equal to a given character */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x19 (fixed) Bytecode */ \ - /* 0x08 - 0x0F: 0x00 (unused) Padding */ \ - /* 0x10 - 0x1F: Character to check */ \ - /* 0x20 - 0x3F: Address of bytecode when matched */ \ - V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \ - V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \ - V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \ - V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \ - /* Checks if the current character combined with mask (bitwise and) */ \ - /* matches a character (e.g. used when two characters in a disjunction */ \ - /* differ by only a single bit */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x1c (fixed) Bytecode */ \ - /* 0x08 - 0x0F: 0x00 (unused) Padding */ \ - /* 0x10 - 0x1F: Character to match against (after mask aplied) */ \ - /* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \ - /* 0x40 - 0x5F: Address of bytecode when matched */ \ - V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ - V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \ - V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ - V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \ - V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \ - V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \ - /* Checks if the current character matches any of the characters encoded */ \ - /* in a bit table. Similar to/inspired by boyer moore string search */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x22 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: 0x00 (unused) Padding */ \ - /* 0x20 - 0x3F: Address of bytecode when bit is set */ \ - /* 0x40 - 0xBF: Bit table */ \ - V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \ - V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ - V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ - V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \ - V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \ - V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ - V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \ - V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \ - V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \ - V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \ - /* Checks if the current position matches top of backtrack stack */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x31 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: 0x00 (unused) Padding */ \ - /* 0x20 - 0x3F: Address of bytecode when current matches tos */ \ - V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \ - /* Advance character pointer by given offset and jump to another bytecode.*/ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x32 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: Number of characters to advance */ \ - /* 0x20 - 0x3F: Address of bytecode to jump to */ \ - V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \ - V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ \ - /* Checks if current position + given offset is in range. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07: 0x34 (fixed) Bytecode */ \ - /* 0x08 - 0x1F: Offset from current position */ \ - /* 0x20 - 0x3F: Address of bytecode when position is out of range */ \ - V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32 */ \ - /* Combination of: */ \ - /* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO */ \ - /* Emitted by RegExpBytecodePeepholeOptimization. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07 0x35 (fixed) Bytecode */ \ - /* 0x08 - 0x1F Load character offset from current position */ \ - /* 0x20 - 0x3F Number of characters to advance */ \ - /* 0x40 - 0xBF Bit Table */ \ - /* 0xC0 - 0xDF Address of bytecode when character is matched */ \ - /* 0xE0 - 0xFF Address of bytecode when no match */ \ - V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32) \ - /* Combination of: */ \ - /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR */ \ - /* and ADVANCE_CP_AND_GOTO */ \ - /* Emitted by RegExpBytecodePeepholeOptimization. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07 0x36 (fixed) Bytecode */ \ - /* 0x08 - 0x1F Load character offset from current position */ \ - /* 0x20 - 0x2F Number of characters to advance */ \ - /* 0x30 - 0x3F Character to match against (after mask applied) */ \ - /* 0x40 - 0x5F: Bitmask bitwise and combined with current character */ \ - /* 0x60 - 0x7F Minimum number of characters this pattern consumes */ \ - /* 0x80 - 0x9F Address of bytecode when character is matched */ \ - /* 0xA0 - 0xBF Address of bytecode when no match */ \ - V(SKIP_UNTIL_CHAR_AND, 54, 24) \ - /* Combination of: */ \ - /* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \ - /* Emitted by RegExpBytecodePeepholeOptimization. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07 0x37 (fixed) Bytecode */ \ - /* 0x08 - 0x1F Load character offset from current position */ \ - /* 0x20 - 0x2F Number of characters to advance */ \ - /* 0x30 - 0x3F Character to match */ \ - /* 0x40 - 0x5F Address of bytecode when character is matched */ \ - /* 0x60 - 0x7F Address of bytecode when no match */ \ - V(SKIP_UNTIL_CHAR, 55, 16) \ - /* Combination of: */ \ - /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR */ \ - /* and ADVANCE_CP_AND_GOTO */ \ - /* Emitted by RegExpBytecodePeepholeOptimization. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07 0x38 (fixed) Bytecode */ \ - /* 0x08 - 0x1F Load character offset from current position */ \ - /* 0x20 - 0x2F Number of characters to advance */ \ - /* 0x30 - 0x3F Character to match */ \ - /* 0x40 - 0x5F Minimum number of characters this pattern consumes */ \ - /* 0x60 - 0x7F Address of bytecode when character is matched */ \ - /* 0x80 - 0x9F Address of bytecode when no match */ \ - V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20) \ - /* Combination of: */ \ - /* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \ - /* Emitted by RegExpBytecodePeepholeOptimization. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07 0x39 (fixed) Bytecode */ \ - /* 0x08 - 0x1F Load character offset from current position */ \ - /* 0x20 - 0x3F Number of characters to advance */ \ - /* 0x40 - 0x4F Character to match */ \ - /* 0x50 - 0x5F Other Character to match */ \ - /* 0x60 - 0x7F Address of bytecode when either character is matched */ \ - /* 0x80 - 0x9F Address of bytecode when no match */ \ - V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20) \ - /* Combination of: */ \ - /* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and */ \ - /* and ADVANCE_CP_AND_GOTO */ \ - /* Emitted by RegExpBytecodePeepholeOptimization. */ \ - /* Bit Layout: */ \ - /* 0x00 - 0x07 0x3A (fixed) Bytecode */ \ - /* 0x08 - 0x1F Load character offset from current position */ \ - /* 0x20 - 0x2F Number of characters to advance */ \ - /* 0x30 - 0x3F Character to check if it is less than current char */ \ - /* 0x40 - 0xBF Bit Table */ \ - /* 0xC0 - 0xDF Address of bytecode when character is matched */ \ - /* 0xE0 - 0xFF Address of bytecode when no match */ \ - V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32) - -#define COUNT(...) +1 -static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT); -#undef COUNT - -// Just making sure we assigned values above properly. They should be -// contiguous, strictly increasing, and start at 0. -// TODO(jgruber): Do not explicitly assign values, instead generate them -// implicitly from the list order. -STATIC_ASSERT(kRegExpBytecodeCount == 59); - -#define DECLARE_BYTECODES(name, code, length) \ - static constexpr int BC_##name = code; -BYTECODE_ITERATOR(DECLARE_BYTECODES) -#undef DECLARE_BYTECODES - -static constexpr int kRegExpBytecodeLengths[] = { -#define DECLARE_BYTECODE_LENGTH(name, code, length) length, - BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH) -#undef DECLARE_BYTECODE_LENGTH -}; - -inline constexpr int RegExpBytecodeLength(int bytecode) { - return kRegExpBytecodeLengths[bytecode]; -} - -static const char* const kRegExpBytecodeNames[] = { -#define DECLARE_BYTECODE_NAME(name, ...) #name, - BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME) -#undef DECLARE_BYTECODE_NAME -}; - -inline const char* RegExpBytecodeName(int bytecode) { - return kRegExpBytecodeNames[bytecode]; -} - -void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc); -void RegExpBytecodeDisassemble(const byte* code_base, int length, - const char* pattern); - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_BYTECODES_H_ diff --git a/js/src/new-regexp/regexp-compiler-tonode.cc b/js/src/new-regexp/regexp-compiler-tonode.cc deleted file mode 100644 index 7de167eef..000000000 --- a/js/src/new-regexp/regexp-compiler-tonode.cc +++ /dev/null @@ -1,1589 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-compiler.h" - -#include "new-regexp/regexp.h" -#ifdef V8_INTL_SUPPORT -#include "new-regexp/special-case.h" -#endif // V8_INTL_SUPPORT - -#ifdef V8_INTL_SUPPORT -#include "unicode/locid.h" -#include "unicode/uniset.h" -#include "unicode/utypes.h" -#endif // V8_INTL_SUPPORT - -namespace v8 { -namespace internal { - -using namespace regexp_compiler_constants; // NOLINT(build/namespaces) - -// ------------------------------------------------------------------- -// Tree to graph conversion - -RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - ZoneList* elms = - new (compiler->zone()) ZoneList(1, compiler->zone()); - elms->Add(TextElement::Atom(this), compiler->zone()); - return new (compiler->zone()) - TextNode(elms, compiler->read_backward(), on_success); -} - -RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - return new (compiler->zone()) - TextNode(elements(), compiler->read_backward(), on_success); -} - -static bool CompareInverseRanges(ZoneList* ranges, - const int* special_class, int length) { - length--; // Remove final marker. - DCHECK_EQ(kRangeEndMarker, special_class[length]); - DCHECK_NE(0, ranges->length()); - DCHECK_NE(0, length); - DCHECK_NE(0, special_class[0]); - if (ranges->length() != (length >> 1) + 1) { - return false; - } - CharacterRange range = ranges->at(0); - if (range.from() != 0) { - return false; - } - for (int i = 0; i < length; i += 2) { - if (special_class[i] != (range.to() + 1)) { - return false; - } - range = ranges->at((i >> 1) + 1); - if (special_class[i + 1] != range.from()) { - return false; - } - } - if (range.to() != String::kMaxCodePoint) { - return false; - } - return true; -} - -static bool CompareRanges(ZoneList* ranges, - const int* special_class, int length) { - length--; // Remove final marker. - DCHECK_EQ(kRangeEndMarker, special_class[length]); - if (ranges->length() * 2 != length) { - return false; - } - for (int i = 0; i < length; i += 2) { - CharacterRange range = ranges->at(i >> 1); - if (range.from() != special_class[i] || - range.to() != special_class[i + 1] - 1) { - return false; - } - } - return true; -} - -bool RegExpCharacterClass::is_standard(Zone* zone) { - // TODO(lrn): Remove need for this function, by not throwing away information - // along the way. - if (is_negated()) { - return false; - } - if (set_.is_standard()) { - return true; - } - if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { - set_.set_standard_set_type('s'); - return true; - } - if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { - set_.set_standard_set_type('S'); - return true; - } - if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges, - kLineTerminatorRangeCount)) { - set_.set_standard_set_type('.'); - return true; - } - if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges, - kLineTerminatorRangeCount)) { - set_.set_standard_set_type('n'); - return true; - } - if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { - set_.set_standard_set_type('w'); - return true; - } - if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { - set_.set_standard_set_type('W'); - return true; - } - return false; -} - -UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList* base) { - // The unicode range splitter categorizes given character ranges into: - // - Code points from the BMP representable by one code unit. - // - Code points outside the BMP that need to be split into surrogate pairs. - // - Lone lead surrogates. - // - Lone trail surrogates. - // Lone surrogates are valid code points, even though no actual characters. - // They require special matching to make sure we do not split surrogate pairs. - - for (int i = 0; i < base->length(); i++) AddRange(base->at(i)); -} - -void UnicodeRangeSplitter::AddRange(CharacterRange range) { - static constexpr uc32 kBmp1Start = 0; - static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1; - static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1; - static constexpr uc32 kBmp2End = kNonBmpStart - 1; - - // Ends are all inclusive. - STATIC_ASSERT(kBmp1Start == 0); - STATIC_ASSERT(kBmp1Start < kBmp1End); - STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart); - STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd); - STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); - STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd); - STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start); - STATIC_ASSERT(kBmp2Start < kBmp2End); - STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart); - STATIC_ASSERT(kNonBmpStart < kNonBmpEnd); - - static constexpr uc32 kStarts[] = { - kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, - kBmp2Start, kNonBmpStart, - }; - - static constexpr uc32 kEnds[] = { - kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, - }; - - CharacterRangeVector* const kTargets[] = { - &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_, - }; - - static constexpr int kCount = arraysize(kStarts); - STATIC_ASSERT(kCount == arraysize(kEnds)); - STATIC_ASSERT(kCount == arraysize(kTargets)); - - for (int i = 0; i < kCount; i++) { - if (kStarts[i] > range.to()) break; - const uc32 from = std::max(kStarts[i], range.from()); - const uc32 to = std::min(kEnds[i], range.to()); - if (from > to) continue; - kTargets[i]->emplace_back(CharacterRange::Range(from, to)); - } -} - -namespace { - -// Translates between new and old V8-isms (SmallVector, ZoneList). -ZoneList* ToCanonicalZoneList( - const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { - if (v->empty()) return nullptr; - - ZoneList* result = - new (zone) ZoneList(static_cast(v->size()), zone); - for (size_t i = 0; i < v->size(); i++) { - result->Add(v->at(i), zone); - } - - CharacterRange::Canonicalize(result); - return result; -} - -void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, - RegExpNode* on_success, UnicodeRangeSplitter* splitter) { - ZoneList* bmp = - ToCanonicalZoneList(splitter->bmp(), compiler->zone()); - if (bmp == nullptr) return; - JSRegExp::Flags default_flags = JSRegExp::Flags(); - result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( - compiler->zone(), bmp, compiler->read_backward(), on_success, - default_flags))); -} - -void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - ZoneList* non_bmp = - ToCanonicalZoneList(splitter->non_bmp(), compiler->zone()); - if (non_bmp == nullptr) return; - DCHECK(!compiler->one_byte()); - Zone* zone = compiler->zone(); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - CharacterRange::Canonicalize(non_bmp); - for (int i = 0; i < non_bmp->length(); i++) { - // Match surrogate pair. - // E.g. [\u10005-\u11005] becomes - // \ud800[\udc05-\udfff]| - // [\ud801-\ud803][\udc00-\udfff]| - // \ud804[\udc00-\udc05] - uc32 from = non_bmp->at(i).from(); - uc32 to = non_bmp->at(i).to(); - uc16 from_l = unibrow::Utf16::LeadSurrogate(from); - uc16 from_t = unibrow::Utf16::TrailSurrogate(from); - uc16 to_l = unibrow::Utf16::LeadSurrogate(to); - uc16 to_t = unibrow::Utf16::TrailSurrogate(to); - if (from_l == to_l) { - // The lead surrogate is the same. - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Singleton(from_l), - CharacterRange::Range(from_t, to_t), compiler->read_backward(), - on_success, default_flags))); - } else { - if (from_t != kTrailSurrogateStart) { - // Add [from_l][from_t-\udfff] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Singleton(from_l), - CharacterRange::Range(from_t, kTrailSurrogateEnd), - compiler->read_backward(), on_success, default_flags))); - from_l++; - } - if (to_t != kTrailSurrogateEnd) { - // Add [to_l][\udc00-to_t] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Singleton(to_l), - CharacterRange::Range(kTrailSurrogateStart, to_t), - compiler->read_backward(), on_success, default_flags))); - to_l--; - } - if (from_l <= to_l) { - // Add [from_l-to_l][\udc00-\udfff] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Range(from_l, to_l), - CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), - compiler->read_backward(), on_success, default_flags))); - } - } - } -} - -RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( - RegExpCompiler* compiler, ZoneList* lookbehind, - ZoneList* match, RegExpNode* on_success, bool read_backward, - JSRegExp::Flags flags) { - Zone* zone = compiler->zone(); - RegExpNode* match_node = TextNode::CreateForCharacterRanges( - zone, match, read_backward, on_success, flags); - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - RegExpLookaround::Builder lookaround(false, match_node, stack_register, - position_register); - RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookbehind, !read_backward, lookaround.on_match_success(), flags); - return lookaround.ForMatch(negative_match); -} - -RegExpNode* MatchAndNegativeLookaroundInReadDirection( - RegExpCompiler* compiler, ZoneList* match, - ZoneList* lookahead, RegExpNode* on_success, - bool read_backward, JSRegExp::Flags flags) { - Zone* zone = compiler->zone(); - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - RegExpLookaround::Builder lookaround(false, on_success, stack_register, - position_register); - RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookahead, read_backward, lookaround.on_match_success(), flags); - return TextNode::CreateForCharacterRanges( - zone, match, read_backward, lookaround.ForMatch(negative_match), flags); -} - -void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - JSRegExp::Flags default_flags = JSRegExp::Flags(); - ZoneList* lead_surrogates = - ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); - if (lead_surrogates == nullptr) return; - Zone* zone = compiler->zone(); - // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). - ZoneList* trail_surrogates = CharacterRange::List( - zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); - - RegExpNode* match; - if (compiler->read_backward()) { - // Reading backward. Assert that reading forward, there is no trail - // surrogate, and then backward match the lead surrogate. - match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, trail_surrogates, lead_surrogates, on_success, true, - default_flags); - } else { - // Reading forward. Forward match the lead surrogate and assert that - // no trail surrogate follows. - match = MatchAndNegativeLookaroundInReadDirection( - compiler, lead_surrogates, trail_surrogates, on_success, false, - default_flags); - } - result->AddAlternative(GuardedAlternative(match)); -} - -void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - JSRegExp::Flags default_flags = JSRegExp::Flags(); - ZoneList* trail_surrogates = - ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); - if (trail_surrogates == nullptr) return; - Zone* zone = compiler->zone(); - // E.g. \udc01 becomes (?* lead_surrogates = CharacterRange::List( - zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); - - RegExpNode* match; - if (compiler->read_backward()) { - // Reading backward. Backward match the trail surrogate and assert that no - // lead surrogate precedes it. - match = MatchAndNegativeLookaroundInReadDirection( - compiler, trail_surrogates, lead_surrogates, on_success, true, - default_flags); - } else { - // Reading forward. Assert that reading backward, there is no lead - // surrogate, and then forward match the trail surrogate. - match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, lead_surrogates, trail_surrogates, on_success, false, - default_flags); - } - result->AddAlternative(GuardedAlternative(match)); -} - -RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, - RegExpNode* on_success) { - // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. - DCHECK(!compiler->read_backward()); - Zone* zone = compiler->zone(); - // Advance any character. If the character happens to be a lead surrogate and - // we advanced into the middle of a surrogate pair, it will work out, as - // nothing will match from there. We will have to advance again, consuming - // the associated trail surrogate. - ZoneList* range = CharacterRange::List( - zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - return TextNode::CreateForCharacterRanges(zone, range, false, on_success, - default_flags); -} - -void AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone) { -#ifdef V8_INTL_SUPPORT - DCHECK(CharacterRange::IsCanonical(ranges)); - - // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. - // See also https://crbug.com/v8/6727. - // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range, - // which we use frequently internally. But large ranges can also easily be - // created by the user. We might want to have a more general caching mechanism - // for such ranges. - if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return; - - // Use ICU to compute the case fold closure over the ranges. - icu::UnicodeSet set; - for (int i = 0; i < ranges->length(); i++) { - set.add(ranges->at(i).from(), ranges->at(i).to()); - } - ranges->Clear(); - set.closeOver(USET_CASE_INSENSITIVE); - // Full case mapping map single characters to multiple characters. - // Those are represented as strings in the set. Remove them so that - // we end up with only simple and common case mappings. - set.removeAllStrings(); - for (int i = 0; i < set.getRangeCount(); i++) { - ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), - zone); - } - // No errors and everything we collected have been ranges. - CharacterRange::Canonicalize(ranges); -#endif // V8_INTL_SUPPORT -} - -} // namespace - -RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - set_.Canonicalize(); - Zone* zone = compiler->zone(); - ZoneList* ranges = this->ranges(zone); - if (NeedsUnicodeCaseEquivalents(flags_)) { - AddUnicodeCaseEquivalents(ranges, zone); - } - if (IsUnicode(flags_) && !compiler->one_byte() && - !contains_split_surrogate()) { - if (is_negated()) { - ZoneList* negated = - new (zone) ZoneList(2, zone); - CharacterRange::Negate(ranges, negated, zone); - ranges = negated; - } - if (ranges->length() == 0) { - JSRegExp::Flags default_flags; - RegExpCharacterClass* fail = - new (zone) RegExpCharacterClass(zone, ranges, default_flags); - return new (zone) TextNode(fail, compiler->read_backward(), on_success); - } - if (standard_type() == '*') { - return UnanchoredAdvance(compiler, on_success); - } else { - ChoiceNode* result = new (zone) ChoiceNode(2, zone); - UnicodeRangeSplitter splitter(ranges); - AddBmpCharacters(compiler, result, on_success, &splitter); - AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); - AddLoneLeadSurrogates(compiler, result, on_success, &splitter); - AddLoneTrailSurrogates(compiler, result, on_success, &splitter); - return result; - } - } else { - return new (zone) TextNode(this, compiler->read_backward(), on_success); - } -} - -int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { - RegExpAtom* atom1 = (*a)->AsAtom(); - RegExpAtom* atom2 = (*b)->AsAtom(); - uc16 character1 = atom1->data().at(0); - uc16 character2 = atom2->data().at(0); - if (character1 < character2) return -1; - if (character1 > character2) return 1; - return 0; -} - -#ifdef V8_INTL_SUPPORT - -// Case Insensitve comparesion -int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) { - RegExpAtom* atom1 = (*a)->AsAtom(); - RegExpAtom* atom2 = (*b)->AsAtom(); - icu::UnicodeString character1(atom1->data().at(0)); - return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT); -} - -#else - -static unibrow::uchar Canonical( - unibrow::Mapping* canonicalize, - unibrow::uchar c) { - unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; - int length = canonicalize->get(c, '\0', chars); - DCHECK_LE(length, 1); - unibrow::uchar canonical = c; - if (length == 1) canonical = chars[0]; - return canonical; -} - -int CompareFirstCharCaseIndependent( - unibrow::Mapping* canonicalize, - RegExpTree* const* a, RegExpTree* const* b) { - RegExpAtom* atom1 = (*a)->AsAtom(); - RegExpAtom* atom2 = (*b)->AsAtom(); - unibrow::uchar character1 = atom1->data().at(0); - unibrow::uchar character2 = atom2->data().at(0); - if (character1 == character2) return 0; - if (character1 >= 'a' || character2 >= 'a') { - character1 = Canonical(canonicalize, character1); - character2 = Canonical(canonicalize, character2); - } - return static_cast(character1) - static_cast(character2); -} -#endif // V8_INTL_SUPPORT - -// We can stable sort runs of atoms, since the order does not matter if they -// start with different characters. -// Returns true if any consecutive atoms were found. -bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { - ZoneList* alternatives = this->alternatives(); - int length = alternatives->length(); - bool found_consecutive_atoms = false; - for (int i = 0; i < length; i++) { - while (i < length) { - RegExpTree* alternative = alternatives->at(i); - if (alternative->IsAtom()) break; - i++; - } - // i is length or it is the index of an atom. - if (i == length) break; - int first_atom = i; - JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags(); - i++; - while (i < length) { - RegExpTree* alternative = alternatives->at(i); - if (!alternative->IsAtom()) break; - if (alternative->AsAtom()->flags() != flags) break; - i++; - } - // Sort atoms to get ones with common prefixes together. - // This step is more tricky if we are in a case-independent regexp, - // because it would change /is|I/ to /I|is/, and order matters when - // the regexp parts don't match only disjoint starting points. To fix - // this we have a version of CompareFirstChar that uses case- - // independent character classes for comparison. - DCHECK_LT(first_atom, alternatives->length()); - DCHECK_LE(i, alternatives->length()); - DCHECK_LE(first_atom, i); - if (IgnoreCase(flags)) { -#ifdef V8_INTL_SUPPORT - alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom, - i - first_atom); -#else - unibrow::Mapping* canonicalize = - compiler->isolate()->regexp_macro_assembler_canonicalize(); - auto compare_closure = [canonicalize](RegExpTree* const* a, - RegExpTree* const* b) { - return CompareFirstCharCaseIndependent(canonicalize, a, b); - }; - alternatives->StableSort(compare_closure, first_atom, i - first_atom); -#endif // V8_INTL_SUPPORT - } else { - alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); - } - if (i - first_atom > 1) found_consecutive_atoms = true; - } - return found_consecutive_atoms; -} - -// Optimizes ab|ac|az to a(?:b|c|d). -void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { - Zone* zone = compiler->zone(); - ZoneList* alternatives = this->alternatives(); - int length = alternatives->length(); - - int write_posn = 0; - int i = 0; - while (i < length) { - RegExpTree* alternative = alternatives->at(i); - if (!alternative->IsAtom()) { - alternatives->at(write_posn++) = alternatives->at(i); - i++; - continue; - } - RegExpAtom* const atom = alternative->AsAtom(); - JSRegExp::Flags flags = atom->flags(); -#ifdef V8_INTL_SUPPORT - icu::UnicodeString common_prefix(atom->data().at(0)); -#else - unibrow::uchar common_prefix = atom->data().at(0); -#endif // V8_INTL_SUPPORT - int first_with_prefix = i; - int prefix_length = atom->length(); - i++; - while (i < length) { - alternative = alternatives->at(i); - if (!alternative->IsAtom()) break; - RegExpAtom* const atom = alternative->AsAtom(); - if (atom->flags() != flags) break; -#ifdef V8_INTL_SUPPORT - icu::UnicodeString new_prefix(atom->data().at(0)); - if (new_prefix != common_prefix) { - if (!IgnoreCase(flags)) break; - if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0) - break; - } -#else - unibrow::uchar new_prefix = atom->data().at(0); - if (new_prefix != common_prefix) { - if (!IgnoreCase(flags)) break; - unibrow::Mapping* canonicalize = - compiler->isolate()->regexp_macro_assembler_canonicalize(); - new_prefix = Canonical(canonicalize, new_prefix); - common_prefix = Canonical(canonicalize, common_prefix); - if (new_prefix != common_prefix) break; - } -#endif // V8_INTL_SUPPORT - prefix_length = Min(prefix_length, atom->length()); - i++; - } - if (i > first_with_prefix + 2) { - // Found worthwhile run of alternatives with common prefix of at least one - // character. The sorting function above did not sort on more than one - // character for reasons of correctness, but there may still be a longer - // common prefix if the terms were similar or presorted in the input. - // Find out how long the common prefix is. - int run_length = i - first_with_prefix; - RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom(); - for (int j = 1; j < run_length && prefix_length > 1; j++) { - RegExpAtom* old_atom = - alternatives->at(j + first_with_prefix)->AsAtom(); - for (int k = 1; k < prefix_length; k++) { - if (atom->data().at(k) != old_atom->data().at(k)) { - prefix_length = k; - break; - } - } - } - RegExpAtom* prefix = new (zone) - RegExpAtom(atom->data().SubVector(0, prefix_length), flags); - ZoneList* pair = new (zone) ZoneList(2, zone); - pair->Add(prefix, zone); - ZoneList* suffixes = - new (zone) ZoneList(run_length, zone); - for (int j = 0; j < run_length; j++) { - RegExpAtom* old_atom = - alternatives->at(j + first_with_prefix)->AsAtom(); - int len = old_atom->length(); - if (len == prefix_length) { - suffixes->Add(new (zone) RegExpEmpty(), zone); - } else { - RegExpTree* suffix = new (zone) RegExpAtom( - old_atom->data().SubVector(prefix_length, old_atom->length()), - flags); - suffixes->Add(suffix, zone); - } - } - pair->Add(new (zone) RegExpDisjunction(suffixes), zone); - alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); - } else { - // Just copy any non-worthwhile alternatives. - for (int j = first_with_prefix; j < i; j++) { - alternatives->at(write_posn++) = alternatives->at(j); - } - } - } - alternatives->Rewind(write_posn); // Trim end of array. -} - -// Optimizes b|c|z to [bcz]. -void RegExpDisjunction::FixSingleCharacterDisjunctions( - RegExpCompiler* compiler) { - Zone* zone = compiler->zone(); - ZoneList* alternatives = this->alternatives(); - int length = alternatives->length(); - - int write_posn = 0; - int i = 0; - while (i < length) { - RegExpTree* alternative = alternatives->at(i); - if (!alternative->IsAtom()) { - alternatives->at(write_posn++) = alternatives->at(i); - i++; - continue; - } - RegExpAtom* const atom = alternative->AsAtom(); - if (atom->length() != 1) { - alternatives->at(write_posn++) = alternatives->at(i); - i++; - continue; - } - JSRegExp::Flags flags = atom->flags(); - DCHECK_IMPLIES(IsUnicode(flags), - !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); - bool contains_trail_surrogate = - unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); - int first_in_run = i; - i++; - // Find a run of single-character atom alternatives that have identical - // flags (case independence and unicode-ness). - while (i < length) { - alternative = alternatives->at(i); - if (!alternative->IsAtom()) break; - RegExpAtom* const atom = alternative->AsAtom(); - if (atom->length() != 1) break; - if (atom->flags() != flags) break; - DCHECK_IMPLIES(IsUnicode(flags), - !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); - contains_trail_surrogate |= - unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); - i++; - } - if (i > first_in_run + 1) { - // Found non-trivial run of single-character alternatives. - int run_length = i - first_in_run; - ZoneList* ranges = - new (zone) ZoneList(2, zone); - for (int j = 0; j < run_length; j++) { - RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); - DCHECK_EQ(old_atom->length(), 1); - ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); - } - RegExpCharacterClass::CharacterClassFlags character_class_flags; - if (IsUnicode(flags) && contains_trail_surrogate) { - character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; - } - alternatives->at(write_posn++) = new (zone) - RegExpCharacterClass(zone, ranges, flags, character_class_flags); - } else { - // Just copy any trivial alternatives. - for (int j = first_in_run; j < i; j++) { - alternatives->at(write_posn++) = alternatives->at(j); - } - } - } - alternatives->Rewind(write_posn); // Trim end of array. -} - -RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - ZoneList* alternatives = this->alternatives(); - - if (alternatives->length() > 2) { - bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); - if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); - FixSingleCharacterDisjunctions(compiler); - if (alternatives->length() == 1) { - return alternatives->at(0)->ToNode(compiler, on_success); - } - } - - int length = alternatives->length(); - - ChoiceNode* result = - new (compiler->zone()) ChoiceNode(length, compiler->zone()); - for (int i = 0; i < length; i++) { - GuardedAlternative alternative( - alternatives->at(i)->ToNode(compiler, on_success)); - result->AddAlternative(alternative); - } - return result; -} - -RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - return ToNode(min(), max(), is_greedy(), body(), compiler, on_success); -} - -namespace { -// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and -// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) -RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, - RegExpNode* on_success, - RegExpAssertion::AssertionType type, - JSRegExp::Flags flags) { - DCHECK(NeedsUnicodeCaseEquivalents(flags)); - Zone* zone = compiler->zone(); - ZoneList* word_range = - new (zone) ZoneList(2, zone); - CharacterRange::AddClassEscape('w', word_range, true, zone); - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - ChoiceNode* result = new (zone) ChoiceNode(2, zone); - // Add two choices. The (non-)boundary could start with a word or - // a non-word-character. - for (int i = 0; i < 2; i++) { - bool lookbehind_for_word = i == 0; - bool lookahead_for_word = - (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; - // Look to the left. - RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, - stack_register, position_register); - RegExpNode* backward = TextNode::CreateForCharacterRanges( - zone, word_range, true, lookbehind.on_match_success(), flags); - // Look to the right. - RegExpLookaround::Builder lookahead(lookahead_for_word, - lookbehind.ForMatch(backward), - stack_register, position_register); - RegExpNode* forward = TextNode::CreateForCharacterRanges( - zone, word_range, false, lookahead.on_match_success(), flags); - result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); - } - return result; -} -} // anonymous namespace - -RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - NodeInfo info; - Zone* zone = compiler->zone(); - - switch (assertion_type()) { - case START_OF_LINE: - return AssertionNode::AfterNewline(on_success); - case START_OF_INPUT: - return AssertionNode::AtStart(on_success); - case BOUNDARY: - return NeedsUnicodeCaseEquivalents(flags_) - ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, - flags_) - : AssertionNode::AtBoundary(on_success); - case NON_BOUNDARY: - return NeedsUnicodeCaseEquivalents(flags_) - ? BoundaryAssertionAsLookaround(compiler, on_success, - NON_BOUNDARY, flags_) - : AssertionNode::AtNonBoundary(on_success); - case END_OF_INPUT: - return AssertionNode::AtEnd(on_success); - case END_OF_LINE: { - // Compile $ in multiline regexps as an alternation with a positive - // lookahead in one side and an end-of-input on the other side. - // We need two registers for the lookahead. - int stack_pointer_register = compiler->AllocateRegister(); - int position_register = compiler->AllocateRegister(); - // The ChoiceNode to distinguish between a newline and end-of-input. - ChoiceNode* result = new (zone) ChoiceNode(2, zone); - // Create a newline atom. - ZoneList* newline_ranges = - new (zone) ZoneList(3, zone); - CharacterRange::AddClassEscape('n', newline_ranges, false, zone); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - RegExpCharacterClass* newline_atom = - new (zone) RegExpCharacterClass('n', default_flags); - TextNode* newline_matcher = - new (zone) TextNode(newline_atom, false, - ActionNode::PositiveSubmatchSuccess( - stack_pointer_register, position_register, - 0, // No captures inside. - -1, // Ignored if no captures. - on_success)); - // Create an end-of-input matcher. - RegExpNode* end_of_line = ActionNode::BeginSubmatch( - stack_pointer_register, position_register, newline_matcher); - // Add the two alternatives to the ChoiceNode. - GuardedAlternative eol_alternative(end_of_line); - result->AddAlternative(eol_alternative); - GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); - result->AddAlternative(end_alternative); - return result; - } - default: - UNREACHABLE(); - } - return on_success; -} - -RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - return new (compiler->zone()) - BackReferenceNode(RegExpCapture::StartRegister(index()), - RegExpCapture::EndRegister(index()), flags_, - compiler->read_backward(), on_success); -} - -RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - return on_success; -} - -RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, - int stack_pointer_register, - int position_register, - int capture_register_count, - int capture_register_start) - : is_positive_(is_positive), - on_success_(on_success), - stack_pointer_register_(stack_pointer_register), - position_register_(position_register) { - if (is_positive_) { - on_match_success_ = ActionNode::PositiveSubmatchSuccess( - stack_pointer_register, position_register, capture_register_count, - capture_register_start, on_success_); - } else { - Zone* zone = on_success_->zone(); - on_match_success_ = new (zone) NegativeSubmatchSuccess( - stack_pointer_register, position_register, capture_register_count, - capture_register_start, zone); - } -} - -RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { - if (is_positive_) { - return ActionNode::BeginSubmatch(stack_pointer_register_, - position_register_, match); - } else { - Zone* zone = on_success_->zone(); - // We use a ChoiceNode to represent the negative lookaround. The first - // alternative is the negative match. On success, the end node backtracks. - // On failure, the second alternative is tried and leads to success. - // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the - // first exit when calculating quick checks. - ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( - GuardedAlternative(match), GuardedAlternative(on_success_), zone); - return ActionNode::BeginSubmatch(stack_pointer_register_, - position_register_, choice_node); - } -} - -RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - int stack_pointer_register = compiler->AllocateRegister(); - int position_register = compiler->AllocateRegister(); - - const int registers_per_capture = 2; - const int register_of_first_capture = 2; - int register_count = capture_count_ * registers_per_capture; - int register_start = - register_of_first_capture + capture_from_ * registers_per_capture; - - RegExpNode* result; - bool was_reading_backward = compiler->read_backward(); - compiler->set_read_backward(type() == LOOKBEHIND); - Builder builder(is_positive(), on_success, stack_pointer_register, - position_register, register_count, register_start); - RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); - result = builder.ForMatch(match); - compiler->set_read_backward(was_reading_backward); - return result; -} - -RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - return ToNode(body(), index(), compiler, on_success); -} - -RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index, - RegExpCompiler* compiler, - RegExpNode* on_success) { - DCHECK_NOT_NULL(body); - int start_reg = RegExpCapture::StartRegister(index); - int end_reg = RegExpCapture::EndRegister(index); - if (compiler->read_backward()) std::swap(start_reg, end_reg); - RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); - RegExpNode* body_node = body->ToNode(compiler, store_end); - return ActionNode::StorePosition(start_reg, true, body_node); -} - -namespace { - -class AssertionSequenceRewriter final { - public: - // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass - // instead of sprinkling rewrites into the AST->Node conversion process. - static void MaybeRewrite(ZoneList* terms, Zone* zone) { - AssertionSequenceRewriter rewriter(terms, zone); - - static constexpr int kNoIndex = -1; - int from = kNoIndex; - - for (int i = 0; i < terms->length(); i++) { - RegExpTree* t = terms->at(i); - if (from == kNoIndex && t->IsAssertion()) { - from = i; // Start a sequence. - } else if (from != kNoIndex && !t->IsAssertion()) { - // Terminate and process the sequence. - if (i - from > 1) rewriter.Rewrite(from, i); - from = kNoIndex; - } - } - - if (from != kNoIndex && terms->length() - from > 1) { - rewriter.Rewrite(from, terms->length()); - } - } - - // All assertions are zero width. A consecutive sequence of assertions is - // order-independent. There's two ways we can optimize here: - // 1. fold all identical assertions. - // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire - // sequence fails. - void Rewrite(int from, int to) { - DCHECK_GT(to, from + 1); - - // Bitfield of all seen assertions. - uint32_t seen_assertions = 0; - STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte); - - // Flags must match for folding. - JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags(); - bool saw_mismatched_flags = false; - - for (int i = from; i < to; i++) { - RegExpAssertion* t = terms_->at(i)->AsAssertion(); - if (t->flags() != flags) saw_mismatched_flags = true; - const uint32_t bit = 1 << t->assertion_type(); - - if ((seen_assertions & bit) && !saw_mismatched_flags) { - // Fold duplicates. - terms_->Set(i, new (zone_) RegExpEmpty()); - } - - seen_assertions |= bit; - } - - // Collapse failures. - const uint32_t always_fails_mask = - 1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY; - if ((seen_assertions & always_fails_mask) == always_fails_mask) { - ReplaceSequenceWithFailure(from, to); - } - } - - void ReplaceSequenceWithFailure(int from, int to) { - // Replace the entire sequence with a single node that always fails. - // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the - // negated '*' (everything) range serves the purpose. - ZoneList* ranges = - new (zone_) ZoneList(0, zone_); - RegExpCharacterClass* cc = - new (zone_) RegExpCharacterClass(zone_, ranges, JSRegExp::Flags()); - terms_->Set(from, cc); - - // Zero out the rest. - RegExpEmpty* empty = new (zone_) RegExpEmpty(); - for (int i = from + 1; i < to; i++) terms_->Set(i, empty); - } - - private: - AssertionSequenceRewriter(ZoneList* terms, Zone* zone) - : zone_(zone), terms_(terms) {} - - Zone* zone_; - ZoneList* terms_; -}; - -} // namespace - -RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) { - ZoneList* children = nodes(); - - AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); - - RegExpNode* current = on_success; - if (compiler->read_backward()) { - for (int i = 0; i < children->length(); i++) { - current = children->at(i)->ToNode(compiler, current); - } - } else { - for (int i = children->length() - 1; i >= 0; i--) { - current = children->at(i)->ToNode(compiler, current); - } - } - return current; -} - -static void AddClass(const int* elmv, int elmc, - ZoneList* ranges, Zone* zone) { - elmc--; - DCHECK_EQ(kRangeEndMarker, elmv[elmc]); - for (int i = 0; i < elmc; i += 2) { - DCHECK(elmv[i] < elmv[i + 1]); - ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); - } -} - -static void AddClassNegated(const int* elmv, int elmc, - ZoneList* ranges, Zone* zone) { - elmc--; - DCHECK_EQ(kRangeEndMarker, elmv[elmc]); - DCHECK_NE(0x0000, elmv[0]); - DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]); - uc16 last = 0x0000; - for (int i = 0; i < elmc; i += 2) { - DCHECK(last <= elmv[i] - 1); - DCHECK(elmv[i] < elmv[i + 1]); - ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); - last = elmv[i + 1]; - } - ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); -} - -void CharacterRange::AddClassEscape(char type, ZoneList* ranges, - bool add_unicode_case_equivalents, - Zone* zone) { - if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { - // See #sec-runtime-semantics-wordcharacters-abstract-operation - // In case of unicode and ignore_case, we need to create the closure over - // case equivalent characters before negating. - ZoneList* new_ranges = - new (zone) ZoneList(2, zone); - AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); - AddUnicodeCaseEquivalents(new_ranges, zone); - if (type == 'W') { - ZoneList* negated = - new (zone) ZoneList(2, zone); - CharacterRange::Negate(new_ranges, negated, zone); - new_ranges = negated; - } - ranges->AddAll(*new_ranges, zone); - return; - } - AddClassEscape(type, ranges, zone); -} - -void CharacterRange::AddClassEscape(char type, ZoneList* ranges, - Zone* zone) { - switch (type) { - case 's': - AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); - break; - case 'S': - AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); - break; - case 'w': - AddClass(kWordRanges, kWordRangeCount, ranges, zone); - break; - case 'W': - AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); - break; - case 'd': - AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); - break; - case 'D': - AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); - break; - case '.': - AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, - zone); - break; - // This is not a character range as defined by the spec but a - // convenient shorthand for a character class that matches any - // character. - case '*': - ranges->Add(CharacterRange::Everything(), zone); - break; - // This is the set of characters matched by the $ and ^ symbols - // in multiline mode. - case 'n': - AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); - break; - default: - UNREACHABLE(); - } -} - -Vector CharacterRange::GetWordBounds() { - return Vector(kWordRanges, kWordRangeCount - 1); -} - -// static -void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, - ZoneList* ranges, - bool is_one_byte) { - CharacterRange::Canonicalize(ranges); - int range_count = ranges->length(); -#ifdef V8_INTL_SUPPORT - icu::UnicodeSet others; - for (int i = 0; i < range_count; i++) { - CharacterRange range = ranges->at(i); - uc32 from = range.from(); - if (from > String::kMaxUtf16CodeUnit) continue; - uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit); - // Nothing to be done for surrogates. - if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; - if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { - if (from > String::kMaxOneByteCharCode) continue; - if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode; - } - others.add(from, to); - } - - // Compute the set of additional characters that should be added, - // using UnicodeSet::closeOver. ECMA 262 defines slightly different - // case-folding rules than Unicode, so some characters that are - // added by closeOver do not match anything other than themselves in - // JS. For example, 'Å¿' (U+017F LATIN SMALL LETTER LONG S) is the - // same case-insensitive character as 's' or 'S' according to - // Unicode, but does not match any other character in JS. To handle - // this case, we add such characters to the IgnoreSet and filter - // them out. We filter twice: once before calling closeOver (to - // prevent 'Å¿' from adding 's'), and once after calling closeOver - // (to prevent 's' from adding 'Å¿'). See regexp/special-case.h for - // more information. - icu::UnicodeSet already_added(others); - others.removeAll(RegExpCaseFolding::IgnoreSet()); - others.closeOver(USET_CASE_INSENSITIVE); - others.removeAll(RegExpCaseFolding::IgnoreSet()); - others.removeAll(already_added); - - // Add others to the ranges - for (int32_t i = 0; i < others.getRangeCount(); i++) { - UChar32 from = others.getRangeStart(i); - UChar32 to = others.getRangeEnd(i); - if (from == to) { - ranges->Add(CharacterRange::Singleton(from), zone); - } else { - ranges->Add(CharacterRange::Range(from, to), zone); - } - } -#else - for (int i = 0; i < range_count; i++) { - CharacterRange range = ranges->at(i); - uc32 bottom = range.from(); - if (bottom > String::kMaxUtf16CodeUnit) continue; - uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); - // Nothing to be done for surrogates. - if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; - if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { - if (bottom > String::kMaxOneByteCharCode) continue; - if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; - } - unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - if (top == bottom) { - // If this is a singleton we just expand the one character. - int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); - for (int i = 0; i < length; i++) { - uc32 chr = chars[i]; - if (chr != bottom) { - ranges->Add(CharacterRange::Singleton(chars[i]), zone); - } - } - } else { - // If this is a range we expand the characters block by block, expanding - // contiguous subranges (blocks) one at a time. The approach is as - // follows. For a given start character we look up the remainder of the - // block that contains it (represented by the end point), for instance we - // find 'z' if the character is 'c'. A block is characterized by the - // property that all characters uncanonicalize in the same way, except - // that each entry in the result is incremented by the distance from the - // first element. So a-z is a block because 'a' uncanonicalizes to ['a', - // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once - // we've found the end point we look up its uncanonicalization and - // produce a range for each element. For instance for [c-f] we look up - // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if - // it is not already contained in the input, so [c-f] will be skipped but - // [C-F] will be added. If this range is not completely contained in a - // block we do this for all the blocks covered by the range (handling - // characters that is not in a block as a "singleton block"). - unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - int pos = bottom; - while (pos <= top) { - int length = - isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); - uc32 block_end; - if (length == 0) { - block_end = pos; - } else { - DCHECK_EQ(1, length); - block_end = equivalents[0]; - } - int end = (block_end > top) ? top : block_end; - length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', - equivalents); - for (int i = 0; i < length; i++) { - uc32 c = equivalents[i]; - uc32 range_from = c - (block_end - pos); - uc32 range_to = c - (block_end - end); - if (!(bottom <= range_from && range_to <= top)) { - ranges->Add(CharacterRange::Range(range_from, range_to), zone); - } - } - pos = end + 1; - } - } - } -#endif // V8_INTL_SUPPORT -} - -bool CharacterRange::IsCanonical(ZoneList* ranges) { - DCHECK_NOT_NULL(ranges); - int n = ranges->length(); - if (n <= 1) return true; - int max = ranges->at(0).to(); - for (int i = 1; i < n; i++) { - CharacterRange next_range = ranges->at(i); - if (next_range.from() <= max + 1) return false; - max = next_range.to(); - } - return true; -} - -ZoneList* CharacterSet::ranges(Zone* zone) { - if (ranges_ == nullptr) { - ranges_ = new (zone) ZoneList(2, zone); - CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); - } - return ranges_; -} - -// Move a number of elements in a zonelist to another position -// in the same list. Handles overlapping source and target areas. -static void MoveRanges(ZoneList* list, int from, int to, - int count) { - // Ranges are potentially overlapping. - if (from < to) { - for (int i = count - 1; i >= 0; i--) { - list->at(to + i) = list->at(from + i); - } - } else { - for (int i = 0; i < count; i++) { - list->at(to + i) = list->at(from + i); - } - } -} - -static int InsertRangeInCanonicalList(ZoneList* list, int count, - CharacterRange insert) { - // Inserts a range into list[0..count[, which must be sorted - // by from value and non-overlapping and non-adjacent, using at most - // list[0..count] for the result. Returns the number of resulting - // canonicalized ranges. Inserting a range may collapse existing ranges into - // fewer ranges, so the return value can be anything in the range 1..count+1. - uc32 from = insert.from(); - uc32 to = insert.to(); - int start_pos = 0; - int end_pos = count; - for (int i = count - 1; i >= 0; i--) { - CharacterRange current = list->at(i); - if (current.from() > to + 1) { - end_pos = i; - } else if (current.to() + 1 < from) { - start_pos = i + 1; - break; - } - } - - // Inserted range overlaps, or is adjacent to, ranges at positions - // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are - // not affected by the insertion. - // If start_pos == end_pos, the range must be inserted before start_pos. - // if start_pos < end_pos, the entire range from start_pos to end_pos - // must be merged with the insert range. - - if (start_pos == end_pos) { - // Insert between existing ranges at position start_pos. - if (start_pos < count) { - MoveRanges(list, start_pos, start_pos + 1, count - start_pos); - } - list->at(start_pos) = insert; - return count + 1; - } - if (start_pos + 1 == end_pos) { - // Replace single existing range at position start_pos. - CharacterRange to_replace = list->at(start_pos); - int new_from = Min(to_replace.from(), from); - int new_to = Max(to_replace.to(), to); - list->at(start_pos) = CharacterRange::Range(new_from, new_to); - return count; - } - // Replace a number of existing ranges from start_pos to end_pos - 1. - // Move the remaining ranges down. - - int new_from = Min(list->at(start_pos).from(), from); - int new_to = Max(list->at(end_pos - 1).to(), to); - if (end_pos < count) { - MoveRanges(list, end_pos, start_pos + 1, count - end_pos); - } - list->at(start_pos) = CharacterRange::Range(new_from, new_to); - return count - (end_pos - start_pos) + 1; -} - -void CharacterSet::Canonicalize() { - // Special/default classes are always considered canonical. The result - // of calling ranges() will be sorted. - if (ranges_ == nullptr) return; - CharacterRange::Canonicalize(ranges_); -} - -void CharacterRange::Canonicalize(ZoneList* character_ranges) { - if (character_ranges->length() <= 1) return; - // Check whether ranges are already canonical (increasing, non-overlapping, - // non-adjacent). - int n = character_ranges->length(); - int max = character_ranges->at(0).to(); - int i = 1; - while (i < n) { - CharacterRange current = character_ranges->at(i); - if (current.from() <= max + 1) { - break; - } - max = current.to(); - i++; - } - // Canonical until the i'th range. If that's all of them, we are done. - if (i == n) return; - - // The ranges at index i and forward are not canonicalized. Make them so by - // doing the equivalent of insertion sort (inserting each into the previous - // list, in order). - // Notice that inserting a range can reduce the number of ranges in the - // result due to combining of adjacent and overlapping ranges. - int read = i; // Range to insert. - int num_canonical = i; // Length of canonicalized part of list. - do { - num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical, - character_ranges->at(read)); - read++; - } while (read < n); - character_ranges->Rewind(num_canonical); - - DCHECK(CharacterRange::IsCanonical(character_ranges)); -} - -void CharacterRange::Negate(ZoneList* ranges, - ZoneList* negated_ranges, - Zone* zone) { - DCHECK(CharacterRange::IsCanonical(ranges)); - DCHECK_EQ(0, negated_ranges->length()); - int range_count = ranges->length(); - uc32 from = 0; - int i = 0; - if (range_count > 0 && ranges->at(0).from() == 0) { - from = ranges->at(0).to() + 1; - i = 1; - } - while (i < range_count) { - CharacterRange range = ranges->at(i); - negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); - from = range.to() + 1; - i++; - } - if (from < String::kMaxCodePoint) { - negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), - zone); - } -} - -// Scoped object to keep track of how much we unroll quantifier loops in the -// regexp graph generator. -class RegExpExpansionLimiter { - public: - static const int kMaxExpansionFactor = 6; - RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) - : compiler_(compiler), - saved_expansion_factor_(compiler->current_expansion_factor()), - ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { - DCHECK_LT(0, factor); - if (ok_to_expand_) { - if (factor > kMaxExpansionFactor) { - // Avoid integer overflow of the current expansion factor. - ok_to_expand_ = false; - compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); - } else { - int new_factor = saved_expansion_factor_ * factor; - ok_to_expand_ = (new_factor <= kMaxExpansionFactor); - compiler->set_current_expansion_factor(new_factor); - } - } - } - - ~RegExpExpansionLimiter() { - compiler_->set_current_expansion_factor(saved_expansion_factor_); - } - - bool ok_to_expand() { return ok_to_expand_; } - - private: - RegExpCompiler* compiler_; - int saved_expansion_factor_; - bool ok_to_expand_; - - DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); -}; - -RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, - RegExpTree* body, RegExpCompiler* compiler, - RegExpNode* on_success, - bool not_at_start) { - // x{f, t} becomes this: - // - // (r++)<-. - // | ` - // | (x) - // v ^ - // (r=0)-->(?)---/ [if r < t] - // | - // [if r >= f] \----> ... - // - - // 15.10.2.5 RepeatMatcher algorithm. - // The parser has already eliminated the case where max is 0. In the case - // where max_match is zero the parser has removed the quantifier if min was - // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. - - // If we know that we cannot match zero length then things are a little - // simpler since we don't need to make the special zero length match check - // from step 2.1. If the min and max are small we can unroll a little in - // this case. - static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} - static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} - if (max == 0) return on_success; // This can happen due to recursion. - bool body_can_be_empty = (body->min_match() == 0); - int body_start_reg = RegExpCompiler::kNoRegister; - Interval capture_registers = body->CaptureRegisters(); - bool needs_capture_clearing = !capture_registers.is_empty(); - Zone* zone = compiler->zone(); - - if (body_can_be_empty) { - body_start_reg = compiler->AllocateRegister(); - } else if (compiler->optimize() && !needs_capture_clearing) { - // Only unroll if there are no captures and the body can't be - // empty. - { - RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0)); - if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { - int new_max = (max == kInfinity) ? max : max - min; - // Recurse once to get the loop or optional matches after the fixed - // ones. - RegExpNode* answer = - ToNode(0, new_max, is_greedy, body, compiler, on_success, true); - // Unroll the forced matches from 0 to min. This can cause chains of - // TextNodes (which the parser does not generate). These should be - // combined if it turns out they hinder good code generation. - for (int i = 0; i < min; i++) { - answer = body->ToNode(compiler, answer); - } - return answer; - } - } - if (max <= kMaxUnrolledMaxMatches && min == 0) { - DCHECK_LT(0, max); // Due to the 'if' above. - RegExpExpansionLimiter limiter(compiler, max); - if (limiter.ok_to_expand()) { - // Unroll the optional matches up to max. - RegExpNode* answer = on_success; - for (int i = 0; i < max; i++) { - ChoiceNode* alternation = new (zone) ChoiceNode(2, zone); - if (is_greedy) { - alternation->AddAlternative( - GuardedAlternative(body->ToNode(compiler, answer))); - alternation->AddAlternative(GuardedAlternative(on_success)); - } else { - alternation->AddAlternative(GuardedAlternative(on_success)); - alternation->AddAlternative( - GuardedAlternative(body->ToNode(compiler, answer))); - } - answer = alternation; - if (not_at_start && !compiler->read_backward()) { - alternation->set_not_at_start(); - } - } - return answer; - } - } - } - bool has_min = min > 0; - bool has_max = max < RegExpTree::kInfinity; - bool needs_counter = has_min || has_max; - int reg_ctr = needs_counter ? compiler->AllocateRegister() - : RegExpCompiler::kNoRegister; - LoopChoiceNode* center = new (zone) LoopChoiceNode( - body->min_match() == 0, compiler->read_backward(), min, zone); - if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); - RegExpNode* loop_return = - needs_counter ? static_cast( - ActionNode::IncrementRegister(reg_ctr, center)) - : static_cast(center); - if (body_can_be_empty) { - // If the body can be empty we need to check if it was and then - // backtrack. - loop_return = - ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return); - } - RegExpNode* body_node = body->ToNode(compiler, loop_return); - if (body_can_be_empty) { - // If the body can be empty we need to store the start position - // so we can bail out if it was empty. - body_node = ActionNode::StorePosition(body_start_reg, false, body_node); - } - if (needs_capture_clearing) { - // Before entering the body of this loop we need to clear captures. - body_node = ActionNode::ClearCaptures(capture_registers, body_node); - } - GuardedAlternative body_alt(body_node); - if (has_max) { - Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max); - body_alt.AddGuard(body_guard, zone); - } - GuardedAlternative rest_alt(on_success); - if (has_min) { - Guard* rest_guard = new (compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); - rest_alt.AddGuard(rest_guard, zone); - } - if (is_greedy) { - center->AddLoopAlternative(body_alt); - center->AddContinueAlternative(rest_alt); - } else { - center->AddContinueAlternative(rest_alt); - center->AddLoopAlternative(body_alt); - } - if (needs_counter) { - return ActionNode::SetRegisterForLoop(reg_ctr, 0, center); - } else { - return center; - } -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-compiler.cc b/js/src/new-regexp/regexp-compiler.cc deleted file mode 100644 index 98771354c..000000000 --- a/js/src/new-regexp/regexp-compiler.cc +++ /dev/null @@ -1,3831 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-compiler.h" - -#include "new-regexp/regexp-macro-assembler-arch.h" -#ifdef V8_INTL_SUPPORT -#include "new-regexp/special-case.h" -#endif // V8_INTL_SUPPORT - -#ifdef V8_INTL_SUPPORT -#include "unicode/locid.h" -#include "unicode/uniset.h" -#include "unicode/utypes.h" -#endif // V8_INTL_SUPPORT - -namespace v8 { -namespace internal { - -using namespace regexp_compiler_constants; // NOLINT(build/namespaces) - -// ------------------------------------------------------------------- -// Implementation of the Irregexp regular expression engine. -// -// The Irregexp regular expression engine is intended to be a complete -// implementation of ECMAScript regular expressions. It generates either -// bytecodes or native code. - -// The Irregexp regexp engine is structured in three steps. -// 1) The parser generates an abstract syntax tree. See ast.cc. -// 2) From the AST a node network is created. The nodes are all -// subclasses of RegExpNode. The nodes represent states when -// executing a regular expression. Several optimizations are -// performed on the node network. -// 3) From the nodes we generate either byte codes or native code -// that can actually execute the regular expression (perform -// the search). The code generation step is described in more -// detail below. - -// Code generation. -// -// The nodes are divided into four main categories. -// * Choice nodes -// These represent places where the regular expression can -// match in more than one way. For example on entry to an -// alternation (foo|bar) or a repetition (*, +, ? or {}). -// * Action nodes -// These represent places where some action should be -// performed. Examples include recording the current position -// in the input string to a register (in order to implement -// captures) or other actions on register for example in order -// to implement the counters needed for {} repetitions. -// * Matching nodes -// These attempt to match some element part of the input string. -// Examples of elements include character classes, plain strings -// or back references. -// * End nodes -// These are used to implement the actions required on finding -// a successful match or failing to find a match. -// -// The code generated (whether as byte codes or native code) maintains -// some state as it runs. This consists of the following elements: -// -// * The capture registers. Used for string captures. -// * Other registers. Used for counters etc. -// * The current position. -// * The stack of backtracking information. Used when a matching node -// fails to find a match and needs to try an alternative. -// -// Conceptual regular expression execution model: -// -// There is a simple conceptual model of regular expression execution -// which will be presented first. The actual code generated is a more -// efficient simulation of the simple conceptual model: -// -// * Choice nodes are implemented as follows: -// For each choice except the last { -// push current position -// push backtrack code location -// -// backtrack code location: -// pop current position -// } -// -// -// * Actions nodes are generated as follows -// -// -// push backtrack code location -// -// backtrack code location: -// -// -// -// * Matching nodes are generated as follows: -// if input string matches at current position -// update current position -// -// else -// -// -// Thus it can be seen that the current position is saved and restored -// by the choice nodes, whereas the registers are saved and restored by -// by the action nodes that manipulate them. -// -// The other interesting aspect of this model is that nodes are generated -// at the point where they are needed by a recursive call to Emit(). If -// the node has already been code generated then the Emit() call will -// generate a jump to the previously generated code instead. In order to -// limit recursion it is possible for the Emit() function to put the node -// on a work list for later generation and instead generate a jump. The -// destination of the jump is resolved later when the code is generated. -// -// Actual regular expression code generation. -// -// Code generation is actually more complicated than the above. In order -// to improve the efficiency of the generated code some optimizations are -// performed -// -// * Choice nodes have 1-character lookahead. -// A choice node looks at the following character and eliminates some of -// the choices immediately based on that character. This is not yet -// implemented. -// * Simple greedy loops store reduced backtracking information. -// A quantifier like /.*foo/m will greedily match the whole input. It will -// then need to backtrack to a point where it can match "foo". The naive -// implementation of this would push each character position onto the -// backtracking stack, then pop them off one by one. This would use space -// proportional to the length of the input string. However since the "." -// can only match in one way and always has a constant length (in this case -// of 1) it suffices to store the current position on the top of the stack -// once. Matching now becomes merely incrementing the current position and -// backtracking becomes decrementing the current position and checking the -// result against the stored current position. This is faster and saves -// space. -// * The current state is virtualized. -// This is used to defer expensive operations until it is clear that they -// are needed and to generate code for a node more than once, allowing -// specialized an efficient versions of the code to be created. This is -// explained in the section below. -// -// Execution state virtualization. -// -// Instead of emitting code, nodes that manipulate the state can record their -// manipulation in an object called the Trace. The Trace object can record a -// current position offset, an optional backtrack code location on the top of -// the virtualized backtrack stack and some register changes. When a node is -// to be emitted it can flush the Trace or update it. Flushing the Trace -// will emit code to bring the actual state into line with the virtual state. -// Avoiding flushing the state can postpone some work (e.g. updates of capture -// registers). Postponing work can save time when executing the regular -// expression since it may be found that the work never has to be done as a -// failure to match can occur. In addition it is much faster to jump to a -// known backtrack code location than it is to pop an unknown backtrack -// location from the stack and jump there. -// -// The virtual state found in the Trace affects code generation. For example -// the virtual state contains the difference between the actual current -// position and the virtual current position, and matching code needs to use -// this offset to attempt a match in the correct location of the input -// string. Therefore code generated for a non-trivial trace is specialized -// to that trace. The code generator therefore has the ability to generate -// code for each node several times. In order to limit the size of the -// generated code there is an arbitrary limit on how many specialized sets of -// code may be generated for a given node. If the limit is reached, the -// trace is flushed and a generic version of the code for a node is emitted. -// This is subsequently used for that node. The code emitted for non-generic -// trace is not recorded in the node and so it cannot currently be reused in -// the event that code generation is requested for an identical trace. - -void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); } - -void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { - text->AddElement(TextElement::Atom(this), zone); -} - -void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { - text->AddElement(TextElement::CharClass(this), zone); -} - -void RegExpText::AppendToText(RegExpText* text, Zone* zone) { - for (int i = 0; i < elements()->length(); i++) - text->AddElement(elements()->at(i), zone); -} - -TextElement TextElement::Atom(RegExpAtom* atom) { - return TextElement(ATOM, atom); -} - -TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { - return TextElement(CHAR_CLASS, char_class); -} - -int TextElement::length() const { - switch (text_type()) { - case ATOM: - return atom()->length(); - - case CHAR_CLASS: - return 1; - } - UNREACHABLE(); -} - -class RecursionCheck { - public: - explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { - compiler->IncrementRecursionDepth(); - } - ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } - - private: - RegExpCompiler* compiler_; -}; - -// Attempts to compile the regexp using an Irregexp code generator. Returns -// a fixed array or a null handle depending on whether it succeeded. -RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool one_byte) - : next_register_(2 * (capture_count + 1)), - unicode_lookaround_stack_register_(kNoRegister), - unicode_lookaround_position_register_(kNoRegister), - work_list_(nullptr), - recursion_depth_(0), - one_byte_(one_byte), - reg_exp_too_big_(false), - limiting_recursion_(false), - optimize_(FLAG_regexp_optimization), - read_backward_(false), - current_expansion_factor_(1), - frequency_collator_(), - isolate_(isolate), - zone_(zone) { - accept_ = new (zone) EndNode(EndNode::ACCEPT, zone); - DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1); -} - -RegExpCompiler::CompilationResult RegExpCompiler::Assemble( - Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, - int capture_count, Handle pattern) { - macro_assembler_ = macro_assembler; - - ZoneVector work_list(zone()); - work_list_ = &work_list; - Label fail; - macro_assembler_->PushBacktrack(&fail); - Trace new_trace; - start->Emit(this, &new_trace); - macro_assembler_->BindJumpTarget(&fail); - macro_assembler_->Fail(); - while (!work_list.empty()) { - RegExpNode* node = work_list.back(); - work_list.pop_back(); - node->set_on_work_list(false); - if (!node->label()->is_bound()) node->Emit(this, &new_trace); - } - if (reg_exp_too_big_) { - macro_assembler_->AbortedCodeGeneration(); - return CompilationResult::RegExpTooBig(); - } - - Handle code = macro_assembler_->GetCode(pattern); - isolate->IncreaseTotalRegexpCodeGenerated(code); - work_list_ = nullptr; - - return {*code, next_register_}; -} - -bool Trace::DeferredAction::Mentions(int that) { - if (action_type() == ActionNode::CLEAR_CAPTURES) { - Interval range = static_cast(this)->range(); - return range.Contains(that); - } else { - return reg() == that; - } -} - -bool Trace::mentions_reg(int reg) { - for (DeferredAction* action = actions_; action != nullptr; - action = action->next()) { - if (action->Mentions(reg)) return true; - } - return false; -} - -bool Trace::GetStoredPosition(int reg, int* cp_offset) { - DCHECK_EQ(0, *cp_offset); - for (DeferredAction* action = actions_; action != nullptr; - action = action->next()) { - if (action->Mentions(reg)) { - if (action->action_type() == ActionNode::STORE_POSITION) { - *cp_offset = static_cast(action)->cp_offset(); - return true; - } else { - return false; - } - } - } - return false; -} - -// A (dynamically-sized) set of unsigned integers that behaves especially well -// on small integers (< kFirstLimit). May do zone-allocation. -class DynamicBitSet : public ZoneObject { - public: - V8_EXPORT_PRIVATE bool Get(unsigned value) const { - if (value < kFirstLimit) { - return (first_ & (1 << value)) != 0; - } else if (remaining_ == nullptr) { - return false; - } else { - return remaining_->Contains(value); - } - } - - // Destructively set a value in this set. - void Set(unsigned value, Zone* zone) { - if (value < kFirstLimit) { - first_ |= (1 << value); - } else { - if (remaining_ == nullptr) - remaining_ = new (zone) ZoneList(1, zone); - if (remaining_->is_empty() || !remaining_->Contains(value)) - remaining_->Add(value, zone); - } - } - - private: - static constexpr unsigned kFirstLimit = 32; - - uint32_t first_ = 0; - ZoneList* remaining_ = nullptr; -}; - -int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers, - Zone* zone) { - int max_register = RegExpCompiler::kNoRegister; - for (DeferredAction* action = actions_; action != nullptr; - action = action->next()) { - if (action->action_type() == ActionNode::CLEAR_CAPTURES) { - Interval range = static_cast(action)->range(); - for (int i = range.from(); i <= range.to(); i++) - affected_registers->Set(i, zone); - if (range.to() > max_register) max_register = range.to(); - } else { - affected_registers->Set(action->reg(), zone); - if (action->reg() > max_register) max_register = action->reg(); - } - } - return max_register; -} - -void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, - int max_register, - const DynamicBitSet& registers_to_pop, - const DynamicBitSet& registers_to_clear) { - for (int reg = max_register; reg >= 0; reg--) { - if (registers_to_pop.Get(reg)) { - assembler->PopRegister(reg); - } else if (registers_to_clear.Get(reg)) { - int clear_to = reg; - while (reg > 0 && registers_to_clear.Get(reg - 1)) { - reg--; - } - assembler->ClearRegisters(reg, clear_to); - } - } -} - -void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, - int max_register, - const DynamicBitSet& affected_registers, - DynamicBitSet* registers_to_pop, - DynamicBitSet* registers_to_clear, - Zone* zone) { - // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. - const int push_limit = (assembler->stack_limit_slack() + 1) / 2; - - // Count pushes performed to force a stack limit check occasionally. - int pushes = 0; - - for (int reg = 0; reg <= max_register; reg++) { - if (!affected_registers.Get(reg)) { - continue; - } - - // The chronologically first deferred action in the trace - // is used to infer the action needed to restore a register - // to its previous state (or not, if it's safe to ignore it). - enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; - DeferredActionUndoType undo_action = IGNORE; - - int value = 0; - bool absolute = false; - bool clear = false; - static const int kNoStore = kMinInt; - int store_position = kNoStore; - // This is a little tricky because we are scanning the actions in reverse - // historical order (newest first). - for (DeferredAction* action = actions_; action != nullptr; - action = action->next()) { - if (action->Mentions(reg)) { - switch (action->action_type()) { - case ActionNode::SET_REGISTER_FOR_LOOP: { - Trace::DeferredSetRegisterForLoop* psr = - static_cast(action); - if (!absolute) { - value += psr->value(); - absolute = true; - } - // SET_REGISTER_FOR_LOOP is only used for newly introduced loop - // counters. They can have a significant previous value if they - // occur in a loop. TODO(lrn): Propagate this information, so - // we can set undo_action to IGNORE if we know there is no value to - // restore. - undo_action = RESTORE; - DCHECK_EQ(store_position, kNoStore); - DCHECK(!clear); - break; - } - case ActionNode::INCREMENT_REGISTER: - if (!absolute) { - value++; - } - DCHECK_EQ(store_position, kNoStore); - DCHECK(!clear); - undo_action = RESTORE; - break; - case ActionNode::STORE_POSITION: { - Trace::DeferredCapture* pc = - static_cast(action); - if (!clear && store_position == kNoStore) { - store_position = pc->cp_offset(); - } - - // For captures we know that stores and clears alternate. - // Other register, are never cleared, and if the occur - // inside a loop, they might be assigned more than once. - if (reg <= 1) { - // Registers zero and one, aka "capture zero", is - // always set correctly if we succeed. There is no - // need to undo a setting on backtrack, because we - // will set it again or fail. - undo_action = IGNORE; - } else { - undo_action = pc->is_capture() ? CLEAR : RESTORE; - } - DCHECK(!absolute); - DCHECK_EQ(value, 0); - break; - } - case ActionNode::CLEAR_CAPTURES: { - // Since we're scanning in reverse order, if we've already - // set the position we have to ignore historically earlier - // clearing operations. - if (store_position == kNoStore) { - clear = true; - } - undo_action = RESTORE; - DCHECK(!absolute); - DCHECK_EQ(value, 0); - break; - } - default: - UNREACHABLE(); - break; - } - } - } - // Prepare for the undo-action (e.g., push if it's going to be popped). - if (undo_action == RESTORE) { - pushes++; - RegExpMacroAssembler::StackCheckFlag stack_check = - RegExpMacroAssembler::kNoStackLimitCheck; - if (pushes == push_limit) { - stack_check = RegExpMacroAssembler::kCheckStackLimit; - pushes = 0; - } - - assembler->PushRegister(reg, stack_check); - registers_to_pop->Set(reg, zone); - } else if (undo_action == CLEAR) { - registers_to_clear->Set(reg, zone); - } - // Perform the chronologically last action (or accumulated increment) - // for the register. - if (store_position != kNoStore) { - assembler->WriteCurrentPositionToRegister(reg, store_position); - } else if (clear) { - assembler->ClearRegisters(reg, reg); - } else if (absolute) { - assembler->SetRegister(reg, value); - } else if (value != 0) { - assembler->AdvanceRegister(reg, value); - } - } -} - -// This is called as we come into a loop choice node and some other tricky -// nodes. It normalizes the state of the code generator to ensure we can -// generate generic code. -void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - - DCHECK(!is_trivial()); - - if (actions_ == nullptr && backtrack() == nullptr) { - // Here we just have some deferred cp advances to fix and we are back to - // a normal situation. We may also have to forget some information gained - // through a quick check that was already performed. - if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); - // Create a new trivial state and generate the node with that. - Trace new_state; - successor->Emit(compiler, &new_state); - return; - } - - // Generate deferred actions here along with code to undo them again. - DynamicBitSet affected_registers; - - if (backtrack() != nullptr) { - // Here we have a concrete backtrack location. These are set up by choice - // nodes and so they indicate that we have a deferred save of the current - // position which we may need to emit here. - assembler->PushCurrentPosition(); - } - - int max_register = - FindAffectedRegisters(&affected_registers, compiler->zone()); - DynamicBitSet registers_to_pop; - DynamicBitSet registers_to_clear; - PerformDeferredActions(assembler, max_register, affected_registers, - ®isters_to_pop, ®isters_to_clear, - compiler->zone()); - if (cp_offset_ != 0) { - assembler->AdvanceCurrentPosition(cp_offset_); - } - - // Create a new trivial state and generate the node with that. - Label undo; - assembler->PushBacktrack(&undo); - if (successor->KeepRecursing(compiler)) { - Trace new_state; - successor->Emit(compiler, &new_state); - } else { - compiler->AddWork(successor); - assembler->GoTo(successor->label()); - } - - // On backtrack we need to restore state. - assembler->BindJumpTarget(&undo); - RestoreAffectedRegisters(assembler, max_register, registers_to_pop, - registers_to_clear); - if (backtrack() == nullptr) { - assembler->Backtrack(); - } else { - assembler->PopCurrentPosition(); - assembler->GoTo(backtrack()); - } -} - -void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - - // Omit flushing the trace. We discard the entire stack frame anyway. - - if (!label()->is_bound()) { - // We are completely independent of the trace, since we ignore it, - // so this code can be used as the generic version. - assembler->Bind(label()); - } - - // Throw away everything on the backtrack stack since the start - // of the negative submatch and restore the character position. - assembler->ReadCurrentPositionFromRegister(current_position_register_); - assembler->ReadStackPointerFromRegister(stack_pointer_register_); - if (clear_capture_count_ > 0) { - // Clear any captures that might have been performed during the success - // of the body of the negative look-ahead. - int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; - assembler->ClearRegisters(clear_capture_start_, clear_capture_end); - } - // Now that we have unwound the stack we find at the top of the stack the - // backtrack that the BeginSubmatch node got. - assembler->Backtrack(); -} - -void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { - if (!trace->is_trivial()) { - trace->Flush(compiler, this); - return; - } - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - if (!label()->is_bound()) { - assembler->Bind(label()); - } - switch (action_) { - case ACCEPT: - assembler->Succeed(); - return; - case BACKTRACK: - assembler->GoTo(trace->backtrack()); - return; - case NEGATIVE_SUBMATCH_SUCCESS: - // This case is handled in a different virtual method. - UNREACHABLE(); - } - UNIMPLEMENTED(); -} - -void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { - if (guards_ == nullptr) guards_ = new (zone) ZoneList(1, zone); - guards_->Add(guard, zone); -} - -ActionNode* ActionNode::SetRegisterForLoop(int reg, int val, - RegExpNode* on_success) { - ActionNode* result = - new (on_success->zone()) ActionNode(SET_REGISTER_FOR_LOOP, on_success); - result->data_.u_store_register.reg = reg; - result->data_.u_store_register.value = val; - return result; -} - -ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { - ActionNode* result = - new (on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); - result->data_.u_increment_register.reg = reg; - return result; -} - -ActionNode* ActionNode::StorePosition(int reg, bool is_capture, - RegExpNode* on_success) { - ActionNode* result = - new (on_success->zone()) ActionNode(STORE_POSITION, on_success); - result->data_.u_position_register.reg = reg; - result->data_.u_position_register.is_capture = is_capture; - return result; -} - -ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) { - ActionNode* result = - new (on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); - result->data_.u_clear_captures.range_from = range.from(); - result->data_.u_clear_captures.range_to = range.to(); - return result; -} - -ActionNode* ActionNode::BeginSubmatch(int stack_reg, int position_reg, - RegExpNode* on_success) { - ActionNode* result = - new (on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); - result->data_.u_submatch.stack_pointer_register = stack_reg; - result->data_.u_submatch.current_position_register = position_reg; - return result; -} - -ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg, - int clear_register_count, - int clear_register_from, - RegExpNode* on_success) { - ActionNode* result = new (on_success->zone()) - ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); - result->data_.u_submatch.stack_pointer_register = stack_reg; - result->data_.u_submatch.current_position_register = position_reg; - result->data_.u_submatch.clear_register_count = clear_register_count; - result->data_.u_submatch.clear_register_from = clear_register_from; - return result; -} - -ActionNode* ActionNode::EmptyMatchCheck(int start_register, - int repetition_register, - int repetition_limit, - RegExpNode* on_success) { - ActionNode* result = - new (on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); - result->data_.u_empty_match_check.start_register = start_register; - result->data_.u_empty_match_check.repetition_register = repetition_register; - result->data_.u_empty_match_check.repetition_limit = repetition_limit; - return result; -} - -#define DEFINE_ACCEPT(Type) \ - void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); } -FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) -#undef DEFINE_ACCEPT - -// ------------------------------------------------------------------- -// Emit code. - -void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, - Guard* guard, Trace* trace) { - switch (guard->op()) { - case Guard::LT: - DCHECK(!trace->mentions_reg(guard->reg())); - macro_assembler->IfRegisterGE(guard->reg(), guard->value(), - trace->backtrack()); - break; - case Guard::GEQ: - DCHECK(!trace->mentions_reg(guard->reg())); - macro_assembler->IfRegisterLT(guard->reg(), guard->value(), - trace->backtrack()); - break; - } -} - -// Returns the number of characters in the equivalence class, omitting those -// that cannot occur in the source string because it is Latin1. -static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, - bool one_byte_subject, - unibrow::uchar* letters, - int letter_length) { -#ifdef V8_INTL_SUPPORT - if (RegExpCaseFolding::IgnoreSet().contains(character)) { - letters[0] = character; - return 1; - } - bool in_special_add_set = - RegExpCaseFolding::SpecialAddSet().contains(character); - - icu::UnicodeSet set; - set.add(character); - set = set.closeOver(USET_CASE_INSENSITIVE); - - UChar32 canon = 0; - if (in_special_add_set) { - canon = RegExpCaseFolding::Canonicalize(character); - } - - int32_t range_count = set.getRangeCount(); - int items = 0; - for (int32_t i = 0; i < range_count; i++) { - UChar32 start = set.getRangeStart(i); - UChar32 end = set.getRangeEnd(i); - CHECK(end - start + items <= letter_length); - for (UChar32 cu = start; cu <= end; cu++) { - if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; - if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { - continue; - } - letters[items++] = (unibrow::uchar)(cu); - } - } - return items; -#else - int length = - isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); - // Unibrow returns 0 or 1 for characters where case independence is - // trivial. - if (length == 0) { - letters[0] = character; - length = 1; - } - - if (one_byte_subject) { - int new_length = 0; - for (int i = 0; i < length; i++) { - if (letters[i] <= String::kMaxOneByteCharCode) { - letters[new_length++] = letters[i]; - } - } - length = new_length; - } - - return length; -#endif // V8_INTL_SUPPORT -} - -static inline bool EmitSimpleCharacter(Isolate* isolate, - RegExpCompiler* compiler, uc16 c, - Label* on_failure, int cp_offset, - bool check, bool preloaded) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - bool bound_checked = false; - if (!preloaded) { - assembler->LoadCurrentCharacter(cp_offset, on_failure, check); - bound_checked = true; - } - assembler->CheckNotCharacter(c, on_failure); - return bound_checked; -} - -// Only emits non-letters (things that don't have case). Only used for case -// independent matches. -static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded) { - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - bool one_byte = compiler->one_byte(); - unibrow::uchar chars[4]; - int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); - if (length < 1) { - // This can't match. Must be an one-byte subject and a non-one-byte - // character. We do not need to do anything since the one-byte pass - // already handled this. - return false; // Bounds not checked. - } - bool checked = false; - // We handle the length > 1 case in a later pass. - if (length == 1) { - if (one_byte && c > String::kMaxOneByteCharCodeU) { - // Can't match - see above. - return false; // Bounds not checked. - } - if (!preloaded) { - macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); - checked = check; - } - macro_assembler->CheckNotCharacter(c, on_failure); - } - return checked; -} - -static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, - bool one_byte, uc16 c1, uc16 c2, - Label* on_failure) { - uc16 char_mask; - if (one_byte) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } - uc16 exor = c1 ^ c2; - // Check whether exor has only one bit set. - if (((exor - 1) & exor) == 0) { - // If c1 and c2 differ only by one bit. - // Ecma262UnCanonicalize always gives the highest number last. - DCHECK(c2 > c1); - uc16 mask = char_mask ^ exor; - macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); - return true; - } - DCHECK(c2 > c1); - uc16 diff = c2 - c1; - if (((diff - 1) & diff) == 0 && c1 >= diff) { - // If the characters differ by 2^n but don't differ by one bit then - // subtract the difference from the found character, then do the or - // trick. We avoid the theoretical case where negative numbers are - // involved in order to simplify code generation. - uc16 mask = char_mask ^ diff; - macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask, - on_failure); - return true; - } - return false; -} - -// Only emits letters (things that have case). Only used for case independent -// matches. -static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded) { - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - bool one_byte = compiler->one_byte(); - unibrow::uchar chars[4]; - int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); - if (length <= 1) return false; - // We may not need to check against the end of the input string - // if this character lies before a character that matched. - if (!preloaded) { - macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); - } - Label ok; - switch (length) { - case 2: { - if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], - chars[1], on_failure)) { - } else { - macro_assembler->CheckCharacter(chars[0], &ok); - macro_assembler->CheckNotCharacter(chars[1], on_failure); - macro_assembler->Bind(&ok); - } - break; - } - case 4: - macro_assembler->CheckCharacter(chars[3], &ok); - V8_FALLTHROUGH; - case 3: - macro_assembler->CheckCharacter(chars[0], &ok); - macro_assembler->CheckCharacter(chars[1], &ok); - macro_assembler->CheckNotCharacter(chars[2], on_failure); - macro_assembler->Bind(&ok); - break; - default: - UNREACHABLE(); - } - return true; -} - -static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, - Label* fall_through, Label* above_or_equal, - Label* below) { - if (below != fall_through) { - masm->CheckCharacterLT(border, below); - if (above_or_equal != fall_through) masm->GoTo(above_or_equal); - } else { - masm->CheckCharacterGT(border - 1, above_or_equal); - } -} - -static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, - int last, Label* fall_through, - Label* in_range, Label* out_of_range) { - if (in_range == fall_through) { - if (first == last) { - masm->CheckNotCharacter(first, out_of_range); - } else { - masm->CheckCharacterNotInRange(first, last, out_of_range); - } - } else { - if (first == last) { - masm->CheckCharacter(first, in_range); - } else { - masm->CheckCharacterInRange(first, last, in_range); - } - if (out_of_range != fall_through) masm->GoTo(out_of_range); - } -} - -// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. -// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. -static void EmitUseLookupTable(RegExpMacroAssembler* masm, - ZoneList* ranges, int start_index, - int end_index, int min_char, Label* fall_through, - Label* even_label, Label* odd_label) { - static const int kSize = RegExpMacroAssembler::kTableSize; - static const int kMask = RegExpMacroAssembler::kTableMask; - - int base = (min_char & ~kMask); - USE(base); - - // Assert that everything is on one kTableSize page. - for (int i = start_index; i <= end_index; i++) { - DCHECK_EQ(ranges->at(i) & ~kMask, base); - } - DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); - - char templ[kSize]; - Label* on_bit_set; - Label* on_bit_clear; - int bit; - if (even_label == fall_through) { - on_bit_set = odd_label; - on_bit_clear = even_label; - bit = 1; - } else { - on_bit_set = even_label; - on_bit_clear = odd_label; - bit = 0; - } - for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { - templ[i] = bit; - } - int j = 0; - bit ^= 1; - for (int i = start_index; i < end_index; i++) { - for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { - templ[j] = bit; - } - bit ^= 1; - } - for (int i = j; i < kSize; i++) { - templ[i] = bit; - } - Factory* factory = masm->isolate()->factory(); - // TODO(erikcorry): Cache these. - Handle ba = factory->NewByteArray(kSize, AllocationType::kOld); - for (int i = 0; i < kSize; i++) { - ba->set(i, templ[i]); - } - masm->CheckBitInTable(ba, on_bit_set); - if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); -} - -static void CutOutRange(RegExpMacroAssembler* masm, ZoneList* ranges, - int start_index, int end_index, int cut_index, - Label* even_label, Label* odd_label) { - bool odd = (((cut_index - start_index) & 1) == 1); - Label* in_range_label = odd ? odd_label : even_label; - Label dummy; - EmitDoubleBoundaryTest(masm, ranges->at(cut_index), - ranges->at(cut_index + 1) - 1, &dummy, in_range_label, - &dummy); - DCHECK(!dummy.is_linked()); - // Cut out the single range by rewriting the array. This creates a new - // range that is a merger of the two ranges on either side of the one we - // are cutting out. The oddity of the labels is preserved. - for (int j = cut_index; j > start_index; j--) { - ranges->at(j) = ranges->at(j - 1); - } - for (int j = cut_index + 1; j < end_index; j++) { - ranges->at(j) = ranges->at(j + 1); - } -} - -// Unicode case. Split the search space into kSize spaces that are handled -// with recursion. -static void SplitSearchSpace(ZoneList* ranges, int start_index, - int end_index, int* new_start_index, - int* new_end_index, int* border) { - static const int kSize = RegExpMacroAssembler::kTableSize; - static const int kMask = RegExpMacroAssembler::kTableMask; - - int first = ranges->at(start_index); - int last = ranges->at(end_index) - 1; - - *new_start_index = start_index; - *border = (ranges->at(start_index) & ~kMask) + kSize; - while (*new_start_index < end_index) { - if (ranges->at(*new_start_index) > *border) break; - (*new_start_index)++; - } - // new_start_index is the index of the first edge that is beyond the - // current kSize space. - - // For very large search spaces we do a binary chop search of the non-Latin1 - // space instead of just going to the end of the current kSize space. The - // heuristics are complicated a little by the fact that any 128-character - // encoding space can be quickly tested with a table lookup, so we don't - // wish to do binary chop search at a smaller granularity than that. A - // 128-character space can take up a lot of space in the ranges array if, - // for example, we only want to match every second character (eg. the lower - // case characters on some Unicode pages). - int binary_chop_index = (end_index + start_index) / 2; - // The first test ensures that we get to the code that handles the Latin1 - // range with a single not-taken branch, speeding up this important - // character range (even non-Latin1 charset-based text has spaces and - // punctuation). - if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. - end_index - start_index > (*new_start_index - start_index) * 2 && - last - first > kSize * 2 && binary_chop_index > *new_start_index && - ranges->at(binary_chop_index) >= first + 2 * kSize) { - int scan_forward_for_section_border = binary_chop_index; - int new_border = (ranges->at(binary_chop_index) | kMask) + 1; - - while (scan_forward_for_section_border < end_index) { - if (ranges->at(scan_forward_for_section_border) > new_border) { - *new_start_index = scan_forward_for_section_border; - *border = new_border; - break; - } - scan_forward_for_section_border++; - } - } - - DCHECK(*new_start_index > start_index); - *new_end_index = *new_start_index - 1; - if (ranges->at(*new_end_index) == *border) { - (*new_end_index)--; - } - if (*border >= ranges->at(end_index)) { - *border = ranges->at(end_index); - *new_start_index = end_index; // Won't be used. - *new_end_index = end_index - 1; - } -} - -// Gets a series of segment boundaries representing a character class. If the -// character is in the range between an even and an odd boundary (counting from -// start_index) then go to even_label, otherwise go to odd_label. We already -// know that the character is in the range of min_char to max_char inclusive. -// Either label can be nullptr indicating backtracking. Either label can also -// be equal to the fall_through label. -static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList* ranges, - int start_index, int end_index, uc32 min_char, - uc32 max_char, Label* fall_through, - Label* even_label, Label* odd_label) { - DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); - DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); - - int first = ranges->at(start_index); - int last = ranges->at(end_index) - 1; - - DCHECK_LT(min_char, first); - - // Just need to test if the character is before or on-or-after - // a particular character. - if (start_index == end_index) { - EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); - return; - } - - // Another almost trivial case: There is one interval in the middle that is - // different from the end intervals. - if (start_index + 1 == end_index) { - EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label, - odd_label); - return; - } - - // It's not worth using table lookup if there are very few intervals in the - // character class. - if (end_index - start_index <= 6) { - // It is faster to test for individual characters, so we look for those - // first, then try arbitrary ranges in the second round. - static int kNoCutIndex = -1; - int cut = kNoCutIndex; - for (int i = start_index; i < end_index; i++) { - if (ranges->at(i) == ranges->at(i + 1) - 1) { - cut = i; - break; - } - } - if (cut == kNoCutIndex) cut = start_index; - CutOutRange(masm, ranges, start_index, end_index, cut, even_label, - odd_label); - DCHECK_GE(end_index - start_index, 2); - GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char, - max_char, fall_through, even_label, odd_label); - return; - } - - // If there are a lot of intervals in the regexp, then we will use tables to - // determine whether the character is inside or outside the character class. - static const int kBits = RegExpMacroAssembler::kTableSizeBits; - - if ((max_char >> kBits) == (min_char >> kBits)) { - EmitUseLookupTable(masm, ranges, start_index, end_index, min_char, - fall_through, even_label, odd_label); - return; - } - - if ((min_char >> kBits) != (first >> kBits)) { - masm->CheckCharacterLT(first, odd_label); - GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char, - fall_through, odd_label, even_label); - return; - } - - int new_start_index = 0; - int new_end_index = 0; - int border = 0; - - SplitSearchSpace(ranges, start_index, end_index, &new_start_index, - &new_end_index, &border); - - Label handle_rest; - Label* above = &handle_rest; - if (border == last + 1) { - // We didn't find any section that started after the limit, so everything - // above the border is one of the terminal labels. - above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; - DCHECK(new_end_index == end_index - 1); - } - - DCHECK_LE(start_index, new_end_index); - DCHECK_LE(new_start_index, end_index); - DCHECK_LT(start_index, new_start_index); - DCHECK_LT(new_end_index, end_index); - DCHECK(new_end_index + 1 == new_start_index || - (new_end_index + 2 == new_start_index && - border == ranges->at(new_end_index + 1))); - DCHECK_LT(min_char, border - 1); - DCHECK_LT(border, max_char); - DCHECK_LT(ranges->at(new_end_index), border); - DCHECK(border < ranges->at(new_start_index) || - (border == ranges->at(new_start_index) && - new_start_index == end_index && new_end_index == end_index - 1 && - border == last + 1)); - DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); - - masm->CheckCharacterGT(border - 1, above); - Label dummy; - GenerateBranches(masm, ranges, start_index, new_end_index, min_char, - border - 1, &dummy, even_label, odd_label); - if (handle_rest.is_linked()) { - masm->Bind(&handle_rest); - bool flip = (new_start_index & 1) != (start_index & 1); - GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char, - &dummy, flip ? odd_label : even_label, - flip ? even_label : odd_label); - } -} - -static void EmitCharClass(RegExpMacroAssembler* macro_assembler, - RegExpCharacterClass* cc, bool one_byte, - Label* on_failure, int cp_offset, bool check_offset, - bool preloaded, Zone* zone) { - ZoneList* ranges = cc->ranges(zone); - CharacterRange::Canonicalize(ranges); - - int max_char; - if (one_byte) { - max_char = String::kMaxOneByteCharCode; - } else { - max_char = String::kMaxUtf16CodeUnit; - } - - int range_count = ranges->length(); - - int last_valid_range = range_count - 1; - while (last_valid_range >= 0) { - CharacterRange& range = ranges->at(last_valid_range); - if (range.from() <= max_char) { - break; - } - last_valid_range--; - } - - if (last_valid_range < 0) { - if (!cc->is_negated()) { - macro_assembler->GoTo(on_failure); - } - if (check_offset) { - macro_assembler->CheckPosition(cp_offset, on_failure); - } - return; - } - - if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) { - if (cc->is_negated()) { - macro_assembler->GoTo(on_failure); - } else { - // This is a common case hit by non-anchored expressions. - if (check_offset) { - macro_assembler->CheckPosition(cp_offset, on_failure); - } - } - return; - } - - if (!preloaded) { - macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); - } - - if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass( - cc->standard_type(), on_failure)) { - return; - } - - // A new list with ascending entries. Each entry is a code unit - // where there is a boundary between code units that are part of - // the class and code units that are not. Normally we insert an - // entry at zero which goes to the failure label, but if there - // was already one there we fall through for success on that entry. - // Subsequent entries have alternating meaning (success/failure). - ZoneList* range_boundaries = - new (zone) ZoneList(last_valid_range, zone); - - bool zeroth_entry_is_failure = !cc->is_negated(); - - for (int i = 0; i <= last_valid_range; i++) { - CharacterRange& range = ranges->at(i); - if (range.from() == 0) { - DCHECK_EQ(i, 0); - zeroth_entry_is_failure = !zeroth_entry_is_failure; - } else { - range_boundaries->Add(range.from(), zone); - } - range_boundaries->Add(range.to() + 1, zone); - } - int end_index = range_boundaries->length() - 1; - if (range_boundaries->at(end_index) > max_char) { - end_index--; - } - - Label fall_through; - GenerateBranches(macro_assembler, range_boundaries, - 0, // start_index. - end_index, - 0, // min_char. - max_char, &fall_through, - zeroth_entry_is_failure ? &fall_through : on_failure, - zeroth_entry_is_failure ? on_failure : &fall_through); - macro_assembler->Bind(&fall_through); -} - -RegExpNode::~RegExpNode() = default; - -RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, - Trace* trace) { - // If we are generating a greedy loop then don't stop and don't reuse code. - if (trace->stop_node() != nullptr) { - return CONTINUE; - } - - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - if (trace->is_trivial()) { - if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { - // If a generic version is already scheduled to be generated or we have - // recursed too deeply then just generate a jump to that code. - macro_assembler->GoTo(&label_); - // This will queue it up for generation of a generic version if it hasn't - // already been queued. - compiler->AddWork(this); - return DONE; - } - // Generate generic version of the node and bind the label for later use. - macro_assembler->Bind(&label_); - return CONTINUE; - } - - // We are being asked to make a non-generic version. Keep track of how many - // non-generic versions we generate so as not to overdo it. - trace_count_++; - if (KeepRecursing(compiler) && compiler->optimize() && - trace_count_ < kMaxCopiesCodeGenerated) { - return CONTINUE; - } - - // If we get here code has been generated for this node too many times or - // recursion is too deep. Time to switch to a generic version. The code for - // generic versions above can handle deep recursion properly. - bool was_limiting = compiler->limiting_recursion(); - compiler->set_limiting_recursion(true); - trace->Flush(compiler, this); - compiler->set_limiting_recursion(was_limiting); - return DONE; -} - -bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { - return !compiler->limiting_recursion() && - compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; -} - -void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) { - if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) { - // Anything may follow a positive submatch success, thus we need to accept - // all characters from this position onwards. - bm->SetRest(offset); - } else { - on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); - } - SaveBMInfo(bm, not_at_start, offset); -} - -void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int filled_in, - bool not_at_start) { - if (action_type_ == SET_REGISTER_FOR_LOOP) { - on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler, - filled_in, not_at_start); - } else { - on_success()->GetQuickCheckDetails(details, compiler, filled_in, - not_at_start); - } -} - -void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) { - // Match the behaviour of EatsAtLeast on this node. - if (assertion_type() == AT_START && not_at_start) return; - on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); - SaveBMInfo(bm, not_at_start, offset); -} - -void NegativeLookaroundChoiceNode::GetQuickCheckDetails( - QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, - bool not_at_start) { - RegExpNode* node = continue_node(); - return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); -} - -// Takes the left-most 1-bit and smears it out, setting all bits to its right. -static inline uint32_t SmearBitsRight(uint32_t v) { - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v; -} - -bool QuickCheckDetails::Rationalize(bool asc) { - bool found_useful_op = false; - uint32_t char_mask; - if (asc) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } - mask_ = 0; - value_ = 0; - int char_shift = 0; - for (int i = 0; i < characters_; i++) { - Position* pos = &positions_[i]; - if ((pos->mask & String::kMaxOneByteCharCode) != 0) { - found_useful_op = true; - } - mask_ |= (pos->mask & char_mask) << char_shift; - value_ |= (pos->value & char_mask) << char_shift; - char_shift += asc ? 8 : 16; - } - return found_useful_op; -} - -int RegExpNode::EatsAtLeast(bool not_at_start) { - return not_at_start ? eats_at_least_.eats_at_least_from_not_start - : eats_at_least_.eats_at_least_from_possibly_start; -} - -EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() { - // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it - // implies that the following node must be a LoopChoiceNode. If we need to - // set registers to constant values for other reasons, we could introduce a - // new action type SET_REGISTER that doesn't imply anything about its - // successor. - UNREACHABLE(); -} - -void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start) { - // See comment in RegExpNode::EatsAtLeastFromLoopEntry. - UNREACHABLE(); -} - -EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() { - DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. - - if (read_backward()) { - // Can't do anything special for a backward loop, so return the basic values - // that we got during analysis. - return *eats_at_least_info(); - } - - // Figure out how much the loop body itself eats, not including anything in - // the continuation case. In general, the nodes in the loop body should report - // that they eat at least the number eaten by the continuation node, since any - // successful match in the loop body must also include the continuation node. - // However, in some cases involving positive lookaround, the loop body under- - // reports its appetite, so use saturated math here to avoid negative numbers. - uint8_t loop_body_from_not_start = base::saturated_cast( - loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true)); - uint8_t loop_body_from_possibly_start = base::saturated_cast( - loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true)); - - // Limit the number of loop iterations to avoid overflow in subsequent steps. - int loop_iterations = base::saturated_cast(min_loop_iterations()); - - EatsAtLeastInfo result; - result.eats_at_least_from_not_start = - base::saturated_cast(loop_iterations * loop_body_from_not_start + - continue_node_->EatsAtLeast(true)); - if (loop_iterations > 0 && loop_body_from_possibly_start > 0) { - // First loop iteration eats at least one, so all subsequent iterations - // and the after-loop chunk are guaranteed to not be at the start. - result.eats_at_least_from_possibly_start = base::saturated_cast( - loop_body_from_possibly_start + - (loop_iterations - 1) * loop_body_from_not_start + - continue_node_->EatsAtLeast(true)); - } else { - // Loop body might eat nothing, so only continue node contributes. - result.eats_at_least_from_possibly_start = - continue_node_->EatsAtLeast(false); - } - return result; -} - -bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, - Trace* bounds_check_trace, Trace* trace, - bool preload_has_checked_bounds, - Label* on_possible_success, - QuickCheckDetails* details, - bool fall_through_on_failure, - ChoiceNode* predecessor) { - DCHECK_NOT_NULL(predecessor); - if (details->characters() == 0) return false; - GetQuickCheckDetails(details, compiler, 0, - trace->at_start() == Trace::FALSE_VALUE); - if (details->cannot_match()) return false; - if (!details->Rationalize(compiler->one_byte())) return false; - DCHECK(details->characters() == 1 || - compiler->macro_assembler()->CanReadUnaligned()); - uint32_t mask = details->mask(); - uint32_t value = details->value(); - - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - - if (trace->characters_preloaded() != details->characters()) { - DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); - // The bounds check is performed using the minimum number of characters - // any choice would eat, so if the bounds check fails, then none of the - // choices can succeed, so we can just immediately backtrack, rather - // than go to the next choice. The number of characters preloaded may be - // less than the number used for the bounds check. - int eats_at_least = predecessor->EatsAtLeast( - bounds_check_trace->at_start() == Trace::FALSE_VALUE); - DCHECK_GE(eats_at_least, details->characters()); - assembler->LoadCurrentCharacter( - trace->cp_offset(), bounds_check_trace->backtrack(), - !preload_has_checked_bounds, details->characters(), eats_at_least); - } - - bool need_mask = true; - - if (details->characters() == 1) { - // If number of characters preloaded is 1 then we used a byte or 16 bit - // load so the value is already masked down. - uint32_t char_mask; - if (compiler->one_byte()) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } - if ((mask & char_mask) == char_mask) need_mask = false; - mask &= char_mask; - } else { - // For 2-character preloads in one-byte mode or 1-character preloads in - // two-byte mode we also use a 16 bit load with zero extend. - static const uint32_t kTwoByteMask = 0xFFFF; - static const uint32_t kFourByteMask = 0xFFFFFFFF; - if (details->characters() == 2 && compiler->one_byte()) { - if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; - } else if (details->characters() == 1 && !compiler->one_byte()) { - if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; - } else { - if (mask == kFourByteMask) need_mask = false; - } - } - - if (fall_through_on_failure) { - if (need_mask) { - assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); - } else { - assembler->CheckCharacter(value, on_possible_success); - } - } else { - if (need_mask) { - assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); - } else { - assembler->CheckNotCharacter(value, trace->backtrack()); - } - } - return true; -} - -// Here is the meat of GetQuickCheckDetails (see also the comment on the -// super-class in the .h file). -// -// We iterate along the text object, building up for each character a -// mask and value that can be used to test for a quick failure to match. -// The masks and values for the positions will be combined into a single -// machine word for the current character width in order to be used in -// generating a quick check. -void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start) { - // Do not collect any quick check details if the text node reads backward, - // since it reads in the opposite direction than we use for quick checks. - if (read_backward()) return; - Isolate* isolate = compiler->macro_assembler()->isolate(); - DCHECK(characters_filled_in < details->characters()); - int characters = details->characters(); - int char_mask; - if (compiler->one_byte()) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } - for (int k = 0; k < elements()->length(); k++) { - TextElement elm = elements()->at(k); - if (elm.text_type() == TextElement::ATOM) { - Vector quarks = elm.atom()->data(); - for (int i = 0; i < characters && i < quarks.length(); i++) { - QuickCheckDetails::Position* pos = - details->positions(characters_filled_in); - uc16 c = quarks[i]; - if (elm.atom()->ignore_case()) { - unibrow::uchar chars[4]; - int length = GetCaseIndependentLetters( - isolate, c, compiler->one_byte(), chars, 4); - if (length == 0) { - // This can happen because all case variants are non-Latin1, but we - // know the input is Latin1. - details->set_cannot_match(); - pos->determines_perfectly = false; - return; - } - if (length == 1) { - // This letter has no case equivalents, so it's nice and simple - // and the mask-compare will determine definitely whether we have - // a match at this character position. - pos->mask = char_mask; - pos->value = chars[0]; - pos->determines_perfectly = true; - } else { - uint32_t common_bits = char_mask; - uint32_t bits = chars[0]; - for (int j = 1; j < length; j++) { - uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); - common_bits ^= differing_bits; - bits &= common_bits; - } - // If length is 2 and common bits has only one zero in it then - // our mask and compare instruction will determine definitely - // whether we have a match at this character position. Otherwise - // it can only be an approximate check. - uint32_t one_zero = (common_bits | ~char_mask); - if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { - pos->determines_perfectly = true; - } - pos->mask = common_bits; - pos->value = bits; - } - } else { - // Don't ignore case. Nice simple case where the mask-compare will - // determine definitely whether we have a match at this character - // position. - if (c > char_mask) { - details->set_cannot_match(); - pos->determines_perfectly = false; - return; - } - pos->mask = char_mask; - pos->value = c; - pos->determines_perfectly = true; - } - characters_filled_in++; - DCHECK(characters_filled_in <= details->characters()); - if (characters_filled_in == details->characters()) { - return; - } - } - } else { - QuickCheckDetails::Position* pos = - details->positions(characters_filled_in); - RegExpCharacterClass* tree = elm.char_class(); - ZoneList* ranges = tree->ranges(zone()); - DCHECK(!ranges->is_empty()); - if (tree->is_negated()) { - // A quick check uses multi-character mask and compare. There is no - // useful way to incorporate a negative char class into this scheme - // so we just conservatively create a mask and value that will always - // succeed. - pos->mask = 0; - pos->value = 0; - } else { - int first_range = 0; - while (ranges->at(first_range).from() > char_mask) { - first_range++; - if (first_range == ranges->length()) { - details->set_cannot_match(); - pos->determines_perfectly = false; - return; - } - } - CharacterRange range = ranges->at(first_range); - uc16 from = range.from(); - uc16 to = range.to(); - if (to > char_mask) { - to = char_mask; - } - uint32_t differing_bits = (from ^ to); - // A mask and compare is only perfect if the differing bits form a - // number like 00011111 with one single block of trailing 1s. - if ((differing_bits & (differing_bits + 1)) == 0 && - from + differing_bits == to) { - pos->determines_perfectly = true; - } - uint32_t common_bits = ~SmearBitsRight(differing_bits); - uint32_t bits = (from & common_bits); - for (int i = first_range + 1; i < ranges->length(); i++) { - CharacterRange range = ranges->at(i); - uc16 from = range.from(); - uc16 to = range.to(); - if (from > char_mask) continue; - if (to > char_mask) to = char_mask; - // Here we are combining more ranges into the mask and compare - // value. With each new range the mask becomes more sparse and - // so the chances of a false positive rise. A character class - // with multiple ranges is assumed never to be equivalent to a - // mask and compare operation. - pos->determines_perfectly = false; - uint32_t new_common_bits = (from ^ to); - new_common_bits = ~SmearBitsRight(new_common_bits); - common_bits &= new_common_bits; - bits &= new_common_bits; - uint32_t differing_bits = (from & common_bits) ^ bits; - common_bits ^= differing_bits; - bits &= common_bits; - } - pos->mask = common_bits; - pos->value = bits; - } - characters_filled_in++; - DCHECK(characters_filled_in <= details->characters()); - if (characters_filled_in == details->characters()) { - return; - } - } - } - DCHECK(characters_filled_in != details->characters()); - if (!details->cannot_match()) { - on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in, - true); - } -} - -void QuickCheckDetails::Clear() { - for (int i = 0; i < characters_; i++) { - positions_[i].mask = 0; - positions_[i].value = 0; - positions_[i].determines_perfectly = false; - } - characters_ = 0; -} - -void QuickCheckDetails::Advance(int by, bool one_byte) { - if (by >= characters_ || by < 0) { - DCHECK_IMPLIES(by < 0, characters_ == 0); - Clear(); - return; - } - DCHECK_LE(characters_ - by, 4); - DCHECK_LE(characters_, 4); - for (int i = 0; i < characters_ - by; i++) { - positions_[i] = positions_[by + i]; - } - for (int i = characters_ - by; i < characters_; i++) { - positions_[i].mask = 0; - positions_[i].value = 0; - positions_[i].determines_perfectly = false; - } - characters_ -= by; - // We could change mask_ and value_ here but we would never advance unless - // they had already been used in a check and they won't be used again because - // it would gain us nothing. So there's no point. -} - -void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { - DCHECK(characters_ == other->characters_); - if (other->cannot_match_) { - return; - } - if (cannot_match_) { - *this = *other; - return; - } - for (int i = from_index; i < characters_; i++) { - QuickCheckDetails::Position* pos = positions(i); - QuickCheckDetails::Position* other_pos = other->positions(i); - if (pos->mask != other_pos->mask || pos->value != other_pos->value || - !other_pos->determines_perfectly) { - // Our mask-compare operation will be approximate unless we have the - // exact same operation on both sides of the alternation. - pos->determines_perfectly = false; - } - pos->mask &= other_pos->mask; - pos->value &= pos->mask; - other_pos->value &= pos->mask; - uc16 differing_bits = (pos->value ^ other_pos->value); - pos->mask &= ~differing_bits; - pos->value &= pos->mask; - } -} - -class VisitMarker { - public: - explicit VisitMarker(NodeInfo* info) : info_(info) { - DCHECK(!info->visited); - info->visited = true; - } - ~VisitMarker() { info_->visited = false; } - - private: - NodeInfo* info_; -}; - -// Temporarily sets traversed_loop_initialization_node_. -class LoopInitializationMarker { - public: - explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) { - DCHECK(!node_->traversed_loop_initialization_node_); - node_->traversed_loop_initialization_node_ = true; - } - ~LoopInitializationMarker() { - DCHECK(node_->traversed_loop_initialization_node_); - node_->traversed_loop_initialization_node_ = false; - } - - private: - LoopChoiceNode* node_; - DISALLOW_COPY_AND_ASSIGN(LoopInitializationMarker); -}; - -// Temporarily decrements min_loop_iterations_. -class IterationDecrementer { - public: - explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) { - DCHECK_GT(node_->min_loop_iterations_, 0); - --node_->min_loop_iterations_; - } - ~IterationDecrementer() { ++node_->min_loop_iterations_; } - - private: - LoopChoiceNode* node_; - DISALLOW_COPY_AND_ASSIGN(IterationDecrementer); -}; - -RegExpNode* SeqRegExpNode::FilterOneByte(int depth) { - if (info()->replacement_calculated) return replacement(); - if (depth < 0) return this; - DCHECK(!info()->visited); - VisitMarker marker(info()); - return FilterSuccessor(depth - 1); -} - -RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { - RegExpNode* next = on_success_->FilterOneByte(depth - 1); - if (next == nullptr) return set_replacement(nullptr); - on_success_ = next; - return set_replacement(this); -} - -// We need to check for the following characters: 0x39C 0x3BC 0x178. -bool RangeContainsLatin1Equivalents(CharacterRange range) { - // TODO(dcarney): this could be a lot more efficient. - return range.Contains(0x039C) || range.Contains(0x03BC) || - range.Contains(0x0178); -} - -static bool RangesContainLatin1Equivalents(ZoneList* ranges) { - for (int i = 0; i < ranges->length(); i++) { - // TODO(dcarney): this could be a lot more efficient. - if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; - } - return false; -} - -RegExpNode* TextNode::FilterOneByte(int depth) { - if (info()->replacement_calculated) return replacement(); - if (depth < 0) return this; - DCHECK(!info()->visited); - VisitMarker marker(info()); - int element_count = elements()->length(); - for (int i = 0; i < element_count; i++) { - TextElement elm = elements()->at(i); - if (elm.text_type() == TextElement::ATOM) { - Vector quarks = elm.atom()->data(); - for (int j = 0; j < quarks.length(); j++) { - uc16 c = quarks[j]; - if (elm.atom()->ignore_case()) { - c = unibrow::Latin1::TryConvertToLatin1(c); - } - if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); - // Replace quark in case we converted to Latin-1. - uc16* writable_quarks = const_cast(quarks.begin()); - writable_quarks[j] = c; - } - } else { - DCHECK(elm.text_type() == TextElement::CHAR_CLASS); - RegExpCharacterClass* cc = elm.char_class(); - ZoneList* ranges = cc->ranges(zone()); - CharacterRange::Canonicalize(ranges); - // Now they are in order so we only need to look at the first. - int range_count = ranges->length(); - if (cc->is_negated()) { - if (range_count != 0 && ranges->at(0).from() == 0 && - ranges->at(0).to() >= String::kMaxOneByteCharCode) { - // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) - continue; - return set_replacement(nullptr); - } - } else { - if (range_count == 0 || - ranges->at(0).from() > String::kMaxOneByteCharCode) { - // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) - continue; - return set_replacement(nullptr); - } - } - } - } - return FilterSuccessor(depth - 1); -} - -RegExpNode* LoopChoiceNode::FilterOneByte(int depth) { - if (info()->replacement_calculated) return replacement(); - if (depth < 0) return this; - if (info()->visited) return this; - { - VisitMarker marker(info()); - - RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); - // If we can't continue after the loop then there is no sense in doing the - // loop. - if (continue_replacement == nullptr) return set_replacement(nullptr); - } - - return ChoiceNode::FilterOneByte(depth - 1); -} - -RegExpNode* ChoiceNode::FilterOneByte(int depth) { - if (info()->replacement_calculated) return replacement(); - if (depth < 0) return this; - if (info()->visited) return this; - VisitMarker marker(info()); - int choice_count = alternatives_->length(); - - for (int i = 0; i < choice_count; i++) { - GuardedAlternative alternative = alternatives_->at(i); - if (alternative.guards() != nullptr && - alternative.guards()->length() != 0) { - set_replacement(this); - return this; - } - } - - int surviving = 0; - RegExpNode* survivor = nullptr; - for (int i = 0; i < choice_count; i++) { - GuardedAlternative alternative = alternatives_->at(i); - RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); - DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. - if (replacement != nullptr) { - alternatives_->at(i).set_node(replacement); - surviving++; - survivor = replacement; - } - } - if (surviving < 2) return set_replacement(survivor); - - set_replacement(this); - if (surviving == choice_count) { - return this; - } - // Only some of the nodes survived the filtering. We need to rebuild the - // alternatives list. - ZoneList* new_alternatives = - new (zone()) ZoneList(surviving, zone()); - for (int i = 0; i < choice_count; i++) { - RegExpNode* replacement = - alternatives_->at(i).node()->FilterOneByte(depth - 1); - if (replacement != nullptr) { - alternatives_->at(i).set_node(replacement); - new_alternatives->Add(alternatives_->at(i), zone()); - } - } - alternatives_ = new_alternatives; - return this; -} - -RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { - if (info()->replacement_calculated) return replacement(); - if (depth < 0) return this; - if (info()->visited) return this; - VisitMarker marker(info()); - // Alternative 0 is the negative lookahead, alternative 1 is what comes - // afterwards. - RegExpNode* node = continue_node(); - RegExpNode* replacement = node->FilterOneByte(depth - 1); - if (replacement == nullptr) return set_replacement(nullptr); - alternatives_->at(kContinueIndex).set_node(replacement); - - RegExpNode* neg_node = lookaround_node(); - RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); - // If the negative lookahead is always going to fail then - // we don't need to check it. - if (neg_replacement == nullptr) return set_replacement(replacement); - alternatives_->at(kLookaroundIndex).set_node(neg_replacement); - return set_replacement(this); -} - -void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start) { - if (body_can_be_zero_length_ || info()->visited) return; - not_at_start = not_at_start || this->not_at_start(); - DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. - if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 && - loop_node_->EatsAtLeast(not_at_start) > - continue_node_->EatsAtLeast(true)) { - // Loop body is guaranteed to execute at least once, and consume characters - // when it does, meaning the only possible quick checks from this point - // begin with the loop body. We may recursively visit this LoopChoiceNode, - // but we temporarily decrease its minimum iteration counter so we know when - // to check the continue case. - IterationDecrementer next_iteration(this); - loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in, - not_at_start); - } else { - // Might not consume anything in the loop body, so treat it like a normal - // ChoiceNode (and don't recursively visit this node again). - VisitMarker marker(info()); - ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in, - not_at_start); - } -} - -void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry( - QuickCheckDetails* details, RegExpCompiler* compiler, - int characters_filled_in, bool not_at_start) { - if (traversed_loop_initialization_node_) { - // We already entered this loop once, exited via its continuation node, and - // followed an outer loop's back-edge to before the loop entry point. We - // could try to reset the minimum iteration count to its starting value at - // this point, but that seems like more trouble than it's worth. It's safe - // to keep going with the current (possibly reduced) minimum iteration - // count. - GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start); - } else { - // We are entering a loop via its counter initialization action, meaning we - // are guaranteed to run the loop body at least some minimum number of times - // before running the continuation node. Set a flag so that this node knows - // (now and any times we visit it again recursively) that it was entered - // from the top. - LoopInitializationMarker marker(this); - GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start); - } -} - -void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) { - if (body_can_be_zero_length_ || budget <= 0) { - bm->SetRest(offset); - SaveBMInfo(bm, not_at_start, offset); - return; - } - ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); - SaveBMInfo(bm, not_at_start, offset); -} - -void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start) { - not_at_start = (not_at_start || not_at_start_); - int choice_count = alternatives_->length(); - DCHECK_LT(0, choice_count); - alternatives_->at(0).node()->GetQuickCheckDetails( - details, compiler, characters_filled_in, not_at_start); - for (int i = 1; i < choice_count; i++) { - QuickCheckDetails new_details(details->characters()); - RegExpNode* node = alternatives_->at(i).node(); - node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in, - not_at_start); - // Here we merge the quick match details of the two branches. - details->Merge(&new_details, characters_filled_in); - } -} - -namespace { - -// Check for [0-9A-Z_a-z]. -void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word, - Label* non_word, bool fall_through_on_word) { - if (assembler->CheckSpecialCharacterClass( - fall_through_on_word ? 'w' : 'W', - fall_through_on_word ? non_word : word)) { - // Optimized implementation available. - return; - } - assembler->CheckCharacterGT('z', non_word); - assembler->CheckCharacterLT('0', non_word); - assembler->CheckCharacterGT('a' - 1, word); - assembler->CheckCharacterLT('9' + 1, word); - assembler->CheckCharacterLT('A', non_word); - assembler->CheckCharacterLT('Z' + 1, word); - if (fall_through_on_word) { - assembler->CheckNotCharacter('_', non_word); - } else { - assembler->CheckCharacter('_', word); - } -} - -// Emit the code to check for a ^ in multiline mode (1-character lookbehind -// that matches newline or the start of input). -void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - - // We will load the previous character into the current character register. - Trace new_trace(*trace); - new_trace.InvalidateCurrentCharacter(); - - // A positive (> 0) cp_offset means we've already successfully matched a - // non-empty-width part of the pattern, and thus cannot be at or before the - // start of the subject string. We can thus skip both at-start and - // bounds-checks when loading the one-character lookbehind. - const bool may_be_at_or_before_subject_string_start = - new_trace.cp_offset() <= 0; - - Label ok; - if (may_be_at_or_before_subject_string_start) { - // The start of input counts as a newline in this context, so skip to ok if - // we are at the start. - assembler->CheckAtStart(new_trace.cp_offset(), &ok); - } - - // If we've already checked that we are not at the start of input, it's okay - // to load the previous character without bounds checks. - const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; - assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, - new_trace.backtrack(), can_skip_bounds_check); - if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) { - // Newline means \n, \r, 0x2028 or 0x2029. - if (!compiler->one_byte()) { - assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok); - } - assembler->CheckCharacter('\n', &ok); - assembler->CheckNotCharacter('\r', new_trace.backtrack()); - } - assembler->Bind(&ok); - on_success->Emit(compiler, &new_trace); -} - -} // namespace - -// Emit the code to handle \b and \B (word-boundary or non-word-boundary). -void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - Isolate* isolate = assembler->isolate(); - Trace::TriBool next_is_word_character = Trace::UNKNOWN; - bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); - BoyerMooreLookahead* lookahead = bm_info(not_at_start); - if (lookahead == nullptr) { - int eats_at_least = - Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start)); - if (eats_at_least >= 1) { - BoyerMooreLookahead* bm = - new (zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); - FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); - if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE; - if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; - } - } else { - if (lookahead->at(0)->is_non_word()) - next_is_word_character = Trace::FALSE_VALUE; - if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; - } - bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); - if (next_is_word_character == Trace::UNKNOWN) { - Label before_non_word; - Label before_word; - if (trace->characters_preloaded() != 1) { - assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); - } - // Fall through on non-word. - EmitWordCheck(assembler, &before_word, &before_non_word, false); - // Next character is not a word character. - assembler->Bind(&before_non_word); - Label ok; - BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); - assembler->GoTo(&ok); - - assembler->Bind(&before_word); - BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); - assembler->Bind(&ok); - } else if (next_is_word_character == Trace::TRUE_VALUE) { - BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); - } else { - DCHECK(next_is_word_character == Trace::FALSE_VALUE); - BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); - } -} - -void AssertionNode::BacktrackIfPrevious( - RegExpCompiler* compiler, Trace* trace, - AssertionNode::IfPrevious backtrack_if_previous) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - Trace new_trace(*trace); - new_trace.InvalidateCurrentCharacter(); - - Label fall_through; - Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack() - : &fall_through; - Label* word = backtrack_if_previous == kIsNonWord ? &fall_through - : new_trace.backtrack(); - - // A positive (> 0) cp_offset means we've already successfully matched a - // non-empty-width part of the pattern, and thus cannot be at or before the - // start of the subject string. We can thus skip both at-start and - // bounds-checks when loading the one-character lookbehind. - const bool may_be_at_or_before_subject_string_start = - new_trace.cp_offset() <= 0; - - if (may_be_at_or_before_subject_string_start) { - // The start of input counts as a non-word character, so the question is - // decided if we are at the start. - assembler->CheckAtStart(new_trace.cp_offset(), non_word); - } - - // If we've already checked that we are not at the start of input, it's okay - // to load the previous character without bounds checks. - const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; - assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word, - can_skip_bounds_check); - EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); - - assembler->Bind(&fall_through); - on_success()->Emit(compiler, &new_trace); -} - -void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, - int filled_in, bool not_at_start) { - if (assertion_type_ == AT_START && not_at_start) { - details->set_cannot_match(); - return; - } - return on_success()->GetQuickCheckDetails(details, compiler, filled_in, - not_at_start); -} - -void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - switch (assertion_type_) { - case AT_END: { - Label ok; - assembler->CheckPosition(trace->cp_offset(), &ok); - assembler->GoTo(trace->backtrack()); - assembler->Bind(&ok); - break; - } - case AT_START: { - if (trace->at_start() == Trace::FALSE_VALUE) { - assembler->GoTo(trace->backtrack()); - return; - } - if (trace->at_start() == Trace::UNKNOWN) { - assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); - Trace at_start_trace = *trace; - at_start_trace.set_at_start(Trace::TRUE_VALUE); - on_success()->Emit(compiler, &at_start_trace); - return; - } - } break; - case AFTER_NEWLINE: - EmitHat(compiler, on_success(), trace); - return; - case AT_BOUNDARY: - case AT_NON_BOUNDARY: { - EmitBoundaryCheck(compiler, trace); - return; - } - } - on_success()->Emit(compiler, trace); -} - -static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { - if (quick_check == nullptr) return false; - if (offset >= quick_check->characters()) return false; - return quick_check->positions(offset)->determines_perfectly; -} - -static void UpdateBoundsCheck(int index, int* checked_up_to) { - if (index > *checked_up_to) { - *checked_up_to = index; - } -} - -// We call this repeatedly to generate code for each pass over the text node. -// The passes are in increasing order of difficulty because we hope one -// of the first passes will fail in which case we are saved the work of the -// later passes. for example for the case independent regexp /%[asdfghjkl]a/ -// we will check the '%' in the first pass, the case independent 'a' in the -// second pass and the character class in the last pass. -// -// The passes are done from right to left, so for example to test for /bar/ -// we will first test for an 'r' with offset 2, then an 'a' with offset 1 -// and then a 'b' with offset 0. This means we can avoid the end-of-input -// bounds check most of the time. In the example we only need to check for -// end-of-input when loading the putative 'r'. -// -// A slight complication involves the fact that the first character may already -// be fetched into a register by the previous node. In this case we want to -// do the test for that character first. We do this in separate passes. The -// 'preloaded' argument indicates that we are doing such a 'pass'. If such a -// pass has been performed then subsequent passes will have true in -// first_element_checked to indicate that that character does not need to be -// checked again. -// -// In addition to all this we are passed a Trace, which can -// contain an AlternativeGeneration object. In this AlternativeGeneration -// object we can see details of any quick check that was already passed in -// order to get to the code we are now generating. The quick check can involve -// loading characters, which means we do not need to recheck the bounds -// up to the limit the quick check already checked. In addition the quick -// check can have involved a mask and compare operation which may simplify -// or obviate the need for further checks at some character positions. -void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, - bool preloaded, Trace* trace, - bool first_element_checked, int* checked_up_to) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - Isolate* isolate = assembler->isolate(); - bool one_byte = compiler->one_byte(); - Label* backtrack = trace->backtrack(); - QuickCheckDetails* quick_check = trace->quick_check_performed(); - int element_count = elements()->length(); - int backward_offset = read_backward() ? -Length() : 0; - for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { - TextElement elm = elements()->at(i); - int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; - if (elm.text_type() == TextElement::ATOM) { - if (SkipPass(pass, elm.atom()->ignore_case())) continue; - Vector quarks = elm.atom()->data(); - for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { - if (first_element_checked && i == 0 && j == 0) continue; - if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; - uc16 quark = quarks[j]; - if (elm.atom()->ignore_case()) { - // Everywhere else we assume that a non-Latin-1 character cannot match - // a Latin-1 character. Avoid the cases where this is assumption is - // invalid by using the Latin1 equivalent instead. - quark = unibrow::Latin1::TryConvertToLatin1(quark); - } - bool needs_bounds_check = - *checked_up_to < cp_offset + j || read_backward(); - bool bounds_checked = false; - switch (pass) { - case NON_LATIN1_MATCH: - DCHECK(one_byte); - if (quark > String::kMaxOneByteCharCode) { - assembler->GoTo(backtrack); - return; - } - break; - case NON_LETTER_CHARACTER_MATCH: - bounds_checked = - EmitAtomNonLetter(isolate, compiler, quark, backtrack, - cp_offset + j, needs_bounds_check, preloaded); - break; - case SIMPLE_CHARACTER_MATCH: - bounds_checked = EmitSimpleCharacter(isolate, compiler, quark, - backtrack, cp_offset + j, - needs_bounds_check, preloaded); - break; - case CASE_CHARACTER_MATCH: - bounds_checked = - EmitAtomLetter(isolate, compiler, quark, backtrack, - cp_offset + j, needs_bounds_check, preloaded); - break; - default: - break; - } - if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); - } - } else { - DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); - if (pass == CHARACTER_CLASS_MATCH) { - if (first_element_checked && i == 0) continue; - if (DeterminedAlready(quick_check, elm.cp_offset())) continue; - RegExpCharacterClass* cc = elm.char_class(); - bool bounds_check = *checked_up_to < cp_offset || read_backward(); - EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, - bounds_check, preloaded, zone()); - UpdateBoundsCheck(cp_offset, checked_up_to); - } - } - } -} - -int TextNode::Length() { - TextElement elm = elements()->last(); - DCHECK_LE(0, elm.cp_offset()); - return elm.cp_offset() + elm.length(); -} - -bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { - if (ignore_case) { - return pass == SIMPLE_CHARACTER_MATCH; - } else { - return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; - } -} - -TextNode* TextNode::CreateForCharacterRanges(Zone* zone, - ZoneList* ranges, - bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags) { - DCHECK_NOT_NULL(ranges); - ZoneList* elms = new (zone) ZoneList(1, zone); - elms->Add(TextElement::CharClass( - new (zone) RegExpCharacterClass(zone, ranges, flags)), - zone); - return new (zone) TextNode(elms, read_backward, on_success); -} - -TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, - CharacterRange trail, - bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags) { - ZoneList* lead_ranges = CharacterRange::List(zone, lead); - ZoneList* trail_ranges = CharacterRange::List(zone, trail); - ZoneList* elms = new (zone) ZoneList(2, zone); - elms->Add(TextElement::CharClass( - new (zone) RegExpCharacterClass(zone, lead_ranges, flags)), - zone); - elms->Add(TextElement::CharClass( - new (zone) RegExpCharacterClass(zone, trail_ranges, flags)), - zone); - return new (zone) TextNode(elms, read_backward, on_success); -} - -// This generates the code to match a text node. A text node can contain -// straight character sequences (possibly to be matched in a case-independent -// way) and character classes. For efficiency we do not do this in a single -// pass from left to right. Instead we pass over the text node several times, -// emitting code for some character positions every time. See the comment on -// TextEmitPass for details. -void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { - LimitResult limit_result = LimitVersions(compiler, trace); - if (limit_result == DONE) return; - DCHECK(limit_result == CONTINUE); - - if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { - compiler->SetRegExpTooBig(); - return; - } - - if (compiler->one_byte()) { - int dummy = 0; - TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); - } - - bool first_elt_done = false; - int bound_checked_to = trace->cp_offset() - 1; - bound_checked_to += trace->bound_checked_up_to(); - - // If a character is preloaded into the current character register then - // check that now. - if (trace->characters_preloaded() == 1) { - for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { - TextEmitPass(compiler, static_cast(pass), true, trace, - false, &bound_checked_to); - } - first_elt_done = true; - } - - for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { - TextEmitPass(compiler, static_cast(pass), false, trace, - first_elt_done, &bound_checked_to); - } - - Trace successor_trace(*trace); - // If we advance backward, we may end up at the start. - successor_trace.AdvanceCurrentPositionInTrace( - read_backward() ? -Length() : Length(), compiler); - successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN - : Trace::FALSE_VALUE); - RecursionCheck rc(compiler); - on_success()->Emit(compiler, &successor_trace); -} - -void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; } - -void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { - // We don't have an instruction for shifting the current character register - // down or for using a shifted value for anything so lets just forget that - // we preloaded any characters into it. - characters_preloaded_ = 0; - // Adjust the offsets of the quick check performed information. This - // information is used to find out what we already determined about the - // characters by means of mask and compare. - quick_check_performed_.Advance(by, compiler->one_byte()); - cp_offset_ += by; - if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { - compiler->SetRegExpTooBig(); - cp_offset_ = 0; - } - bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); -} - -void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { - int element_count = elements()->length(); - for (int i = 0; i < element_count; i++) { - TextElement elm = elements()->at(i); - if (elm.text_type() == TextElement::CHAR_CLASS) { - RegExpCharacterClass* cc = elm.char_class(); -#ifdef V8_INTL_SUPPORT - bool case_equivalents_already_added = - NeedsUnicodeCaseEquivalents(cc->flags()); -#else - bool case_equivalents_already_added = false; -#endif - if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) { - // None of the standard character classes is different in the case - // independent case and it slows us down if we don't know that. - if (cc->is_standard(zone())) continue; - ZoneList* ranges = cc->ranges(zone()); - CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, - is_one_byte); - } - } - } -} - -int TextNode::GreedyLoopTextLength() { return Length(); } - -RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( - RegExpCompiler* compiler) { - if (read_backward()) return nullptr; - if (elements()->length() != 1) return nullptr; - TextElement elm = elements()->at(0); - if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr; - RegExpCharacterClass* node = elm.char_class(); - ZoneList* ranges = node->ranges(zone()); - CharacterRange::Canonicalize(ranges); - if (node->is_negated()) { - return ranges->length() == 0 ? on_success() : nullptr; - } - if (ranges->length() != 1) return nullptr; - uint32_t max_char; - if (compiler->one_byte()) { - max_char = String::kMaxOneByteCharCode; - } else { - max_char = String::kMaxUtf16CodeUnit; - } - return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; -} - -// Finds the fixed match length of a sequence of nodes that goes from -// this alternative and back to this choice node. If there are variable -// length nodes or other complications in the way then return a sentinel -// value indicating that a greedy loop cannot be constructed. -int ChoiceNode::GreedyLoopTextLengthForAlternative( - GuardedAlternative* alternative) { - int length = 0; - RegExpNode* node = alternative->node(); - // Later we will generate code for all these text nodes using recursion - // so we have to limit the max number. - int recursion_depth = 0; - while (node != this) { - if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { - return kNodeIsTooComplexForGreedyLoops; - } - int node_length = node->GreedyLoopTextLength(); - if (node_length == kNodeIsTooComplexForGreedyLoops) { - return kNodeIsTooComplexForGreedyLoops; - } - length += node_length; - SeqRegExpNode* seq_node = static_cast(node); - node = seq_node->on_success(); - } - return read_backward() ? -length : length; -} - -void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { - DCHECK_NULL(loop_node_); - AddAlternative(alt); - loop_node_ = alt.node(); -} - -void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { - DCHECK_NULL(continue_node_); - AddAlternative(alt); - continue_node_ = alt.node(); -} - -void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - if (trace->stop_node() == this) { - // Back edge of greedy optimized loop node graph. - int text_length = - GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); - DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length); - // Update the counter-based backtracking info on the stack. This is an - // optimization for greedy loops (see below). - DCHECK(trace->cp_offset() == text_length); - macro_assembler->AdvanceCurrentPosition(text_length); - macro_assembler->GoTo(trace->loop_label()); - return; - } - DCHECK_NULL(trace->stop_node()); - if (!trace->is_trivial()) { - trace->Flush(compiler, this); - return; - } - ChoiceNode::Emit(compiler, trace); -} - -int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, - int eats_at_least) { - int preload_characters = Min(4, eats_at_least); - DCHECK_LE(preload_characters, 4); - if (compiler->macro_assembler()->CanReadUnaligned()) { - bool one_byte = compiler->one_byte(); - if (one_byte) { - // We can't preload 3 characters because there is no machine instruction - // to do that. We can't just load 4 because we could be reading - // beyond the end of the string, which could cause a memory fault. - if (preload_characters == 3) preload_characters = 2; - } else { - if (preload_characters > 2) preload_characters = 2; - } - } else { - if (preload_characters > 1) preload_characters = 1; - } - return preload_characters; -} - -// This class is used when generating the alternatives in a choice node. It -// records the way the alternative is being code generated. -class AlternativeGeneration : public Malloced { - public: - AlternativeGeneration() - : possible_success(), - expects_preload(false), - after(), - quick_check_details() {} - Label possible_success; - bool expects_preload; - Label after; - QuickCheckDetails quick_check_details; -}; - -// Creates a list of AlternativeGenerations. If the list has a reasonable -// size then it is on the stack, otherwise the excess is on the heap. -class AlternativeGenerationList { - public: - AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) { - for (int i = 0; i < count && i < kAFew; i++) { - alt_gens_.Add(a_few_alt_gens_ + i, zone); - } - for (int i = kAFew; i < count; i++) { - alt_gens_.Add(new AlternativeGeneration(), zone); - } - } - ~AlternativeGenerationList() { - for (int i = kAFew; i < alt_gens_.length(); i++) { - delete alt_gens_[i]; - alt_gens_[i] = nullptr; - } - } - - AlternativeGeneration* at(int i) { return alt_gens_[i]; } - - private: - static const int kAFew = 10; - ZoneList alt_gens_; - AlternativeGeneration a_few_alt_gens_[kAFew]; -}; - -void BoyerMoorePositionInfo::Set(int character) { - SetInterval(Interval(character, character)); -} - -namespace { - -ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges, - int ranges_length, Interval new_range) { - DCHECK_EQ(1, ranges_length & 1); - DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]); - if (containment == kLatticeUnknown) return containment; - bool inside = false; - int last = 0; - for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { - // Consider the range from last to ranges[i]. - // We haven't got to the new range yet. - if (ranges[i] <= new_range.from()) continue; - // New range is wholly inside last-ranges[i]. Note that new_range.to() is - // inclusive, but the values in ranges are not. - if (last <= new_range.from() && new_range.to() < ranges[i]) { - return Combine(containment, inside ? kLatticeIn : kLatticeOut); - } - return kLatticeUnknown; - } - return containment; -} - -int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) { - STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == - 2 * kInt64Size * kBitsPerByte); - - // Slight fiddling is needed here, since the bitset is of length 128 while - // CountTrailingZeros requires an integral type and std::bitset can only - // convert to unsigned long long. So we handle the most- and least-significant - // bits separately. - - { - static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0}); - BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask; - STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong()))); - uint64_t lsb = masked_bitset.to_ullong(); - if (lsb != 0) return base::bits::CountTrailingZeros(lsb); - } - - { - BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64; - uint64_t msb = masked_bitset.to_ullong(); - if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb); - } - - return -1; -} - -} // namespace - -void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { - w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); - - if (interval.size() >= kMapSize) { - map_count_ = kMapSize; - map_.set(); - return; - } - - for (int i = interval.from(); i <= interval.to(); i++) { - int mod_character = (i & kMask); - if (!map_[mod_character]) { - map_count_++; - map_.set(mod_character); - } - if (map_count_ == kMapSize) return; - } -} - -void BoyerMoorePositionInfo::SetAll() { - w_ = kLatticeUnknown; - if (map_count_ != kMapSize) { - map_count_ = kMapSize; - map_.set(); - } -} - -BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler, - Zone* zone) - : length_(length), compiler_(compiler) { - if (compiler->one_byte()) { - max_char_ = String::kMaxOneByteCharCode; - } else { - max_char_ = String::kMaxUtf16CodeUnit; - } - bitmaps_ = new (zone) ZoneList(length, zone); - for (int i = 0; i < length; i++) { - bitmaps_->Add(new (zone) BoyerMoorePositionInfo(), zone); - } -} - -// Find the longest range of lookahead that has the fewest number of different -// characters that can occur at a given position. Since we are optimizing two -// different parameters at once this is a tradeoff. -bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { - int biggest_points = 0; - // If more than 32 characters out of 128 can occur it is unlikely that we can - // be lucky enough to step forwards much of the time. - const int kMaxMax = 32; - for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax; - max_number_of_chars *= 2) { - biggest_points = - FindBestInterval(max_number_of_chars, biggest_points, from, to); - } - if (biggest_points == 0) return false; - return true; -} - -// Find the highest-points range between 0 and length_ where the character -// information is not too vague. 'Too vague' means that there are more than -// max_number_of_chars that can occur at this position. Calculates the number -// of points as the product of width-of-the-range and -// probability-of-finding-one-of-the-characters, where the probability is -// calculated using the frequency distribution of the sample subject string. -int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars, - int old_biggest_points, int* from, - int* to) { - int biggest_points = old_biggest_points; - static const int kSize = RegExpMacroAssembler::kTableSize; - for (int i = 0; i < length_;) { - while (i < length_ && Count(i) > max_number_of_chars) i++; - if (i == length_) break; - int remembered_from = i; - - BoyerMoorePositionInfo::Bitset union_bitset; - for (; i < length_ && Count(i) <= max_number_of_chars; i++) { - union_bitset |= bitmaps_->at(i)->raw_bitset(); - } - - int frequency = 0; - - // Iterate only over set bits. - int j; - while ((j = BitsetFirstSetBit(union_bitset)) != -1) { - DCHECK(union_bitset[j]); // Sanity check. - // Add 1 to the frequency to give a small per-character boost for - // the cases where our sampling is not good enough and many - // characters have a frequency of zero. This means the frequency - // can theoretically be up to 2*kSize though we treat it mostly as - // a fraction of kSize. - frequency += compiler_->frequency_collator()->Frequency(j) + 1; - union_bitset.reset(j); - } - - // We use the probability of skipping times the distance we are skipping to - // judge the effectiveness of this. Actually we have a cut-off: By - // dividing by 2 we switch off the skipping if the probability of skipping - // is less than 50%. This is because the multibyte mask-and-compare - // skipping in quickcheck is more likely to do well on this case. - bool in_quickcheck_range = - ((i - remembered_from < 4) || - (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); - // Called 'probability' but it is only a rough estimate and can actually - // be outside the 0-kSize range. - int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; - int points = (i - remembered_from) * probability; - if (points > biggest_points) { - *from = remembered_from; - *to = i - 1; - biggest_points = points; - } - } - return biggest_points; -} - -// Take all the characters that will not prevent a successful match if they -// occur in the subject string in the range between min_lookahead and -// max_lookahead (inclusive) measured from the current position. If the -// character at max_lookahead offset is not one of these characters, then we -// can safely skip forwards by the number of characters in the range. -int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead, - Handle boolean_skip_table) { - const int kSkipArrayEntry = 0; - const int kDontSkipArrayEntry = 1; - - std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry, - boolean_skip_table->length()); - - for (int i = max_lookahead; i >= min_lookahead; i--) { - BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset(); - - // Iterate only over set bits. - int j; - while ((j = BitsetFirstSetBit(bitset)) != -1) { - DCHECK(bitset[j]); // Sanity check. - boolean_skip_table->set(j, kDontSkipArrayEntry); - bitset.reset(j); - } - } - - const int skip = max_lookahead + 1 - min_lookahead; - return skip; -} - -// See comment above on the implementation of GetSkipTable. -void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { - const int kSize = RegExpMacroAssembler::kTableSize; - - int min_lookahead = 0; - int max_lookahead = 0; - - if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; - - // Check if we only have a single non-empty position info, and that info - // contains precisely one character. - bool found_single_character = false; - int single_character = 0; - for (int i = max_lookahead; i >= min_lookahead; i--) { - BoyerMoorePositionInfo* map = bitmaps_->at(i); - if (map->map_count() == 0) continue; - - if (found_single_character || map->map_count() > 1) { - found_single_character = false; - break; - } - - DCHECK(!found_single_character); - DCHECK_EQ(map->map_count(), 1); - - found_single_character = true; - single_character = BitsetFirstSetBit(map->raw_bitset()); - - DCHECK_NE(single_character, -1); - } - - int lookahead_width = max_lookahead + 1 - min_lookahead; - - if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { - // The mask-compare can probably handle this better. - return; - } - - if (found_single_character) { - Label cont, again; - masm->Bind(&again); - masm->LoadCurrentCharacter(max_lookahead, &cont, true); - if (max_char_ > kSize) { - masm->CheckCharacterAfterAnd(single_character, - RegExpMacroAssembler::kTableMask, &cont); - } else { - masm->CheckCharacter(single_character, &cont); - } - masm->AdvanceCurrentPosition(lookahead_width); - masm->GoTo(&again); - masm->Bind(&cont); - return; - } - - Factory* factory = masm->isolate()->factory(); - Handle boolean_skip_table = - factory->NewByteArray(kSize, AllocationType::kOld); - int skip_distance = - GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table); - DCHECK_NE(0, skip_distance); - - Label cont, again; - masm->Bind(&again); - masm->LoadCurrentCharacter(max_lookahead, &cont, true); - masm->CheckBitInTable(boolean_skip_table, &cont); - masm->AdvanceCurrentPosition(skip_distance); - masm->GoTo(&again); - masm->Bind(&cont); -} - -/* Code generation for choice nodes. - * - * We generate quick checks that do a mask and compare to eliminate a - * choice. If the quick check succeeds then it jumps to the continuation to - * do slow checks and check subsequent nodes. If it fails (the common case) - * it falls through to the next choice. - * - * Here is the desired flow graph. Nodes directly below each other imply - * fallthrough. Alternatives 1 and 2 have quick checks. Alternative - * 3 doesn't have a quick check so we have to call the slow check. - * Nodes are marked Qn for quick checks and Sn for slow checks. The entire - * regexp continuation is generated directly after the Sn node, up to the - * next GoTo if we decide to reuse some already generated code. Some - * nodes expect preload_characters to be preloaded into the current - * character register. R nodes do this preloading. Vertices are marked - * F for failures and S for success (possible success in the case of quick - * nodes). L, V, < and > are used as arrow heads. - * - * ----------> R - * | - * V - * Q1 -----> S1 - * | S / - * F| / - * | F/ - * | / - * | R - * | / - * V L - * Q2 -----> S2 - * | S / - * F| / - * | F/ - * | / - * | R - * | / - * V L - * S3 - * | - * F| - * | - * R - * | - * backtrack V - * <----------Q4 - * \ F | - * \ |S - * \ F V - * \-----S4 - * - * For greedy loops we push the current position, then generate the code that - * eats the input specially in EmitGreedyLoop. The other choice (the - * continuation) is generated by the normal code in EmitChoices, and steps back - * in the input to the starting position when it fails to match. The loop code - * looks like this (U is the unwind code that steps back in the greedy loop). - * - * _____ - * / \ - * V | - * ----------> S1 | - * /| | - * / |S | - * F/ \_____/ - * / - * |<----- - * | \ - * V |S - * Q2 ---> U----->backtrack - * | F / - * S| / - * V F / - * S2--/ - */ - -GreedyLoopState::GreedyLoopState(bool not_at_start) { - counter_backtrack_trace_.set_backtrack(&label_); - if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); -} - -void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { -#ifdef DEBUG - int choice_count = alternatives_->length(); - for (int i = 0; i < choice_count - 1; i++) { - GuardedAlternative alternative = alternatives_->at(i); - ZoneList* guards = alternative.guards(); - int guard_count = (guards == nullptr) ? 0 : guards->length(); - for (int j = 0; j < guard_count; j++) { - DCHECK(!trace->mentions_reg(guards->at(j)->reg())); - } - } -#endif -} - -void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, - PreloadState* state) { - if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { - // Save some time by looking at most one machine word ahead. - state->eats_at_least_ = - EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE); - } - state->preload_characters_ = - CalculatePreloadCharacters(compiler, state->eats_at_least_); - - state->preload_is_current_ = - (current_trace->characters_preloaded() == state->preload_characters_); - state->preload_has_checked_bounds_ = state->preload_is_current_; -} - -void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { - int choice_count = alternatives_->length(); - - if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) { - alternatives_->at(0).node()->Emit(compiler, trace); - return; - } - - AssertGuardsMentionRegisters(trace); - - LimitResult limit_result = LimitVersions(compiler, trace); - if (limit_result == DONE) return; - DCHECK(limit_result == CONTINUE); - - // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for - // other choice nodes we only flush if we are out of code size budget. - if (trace->flush_budget() == 0 && trace->actions() != nullptr) { - trace->Flush(compiler, this); - return; - } - - RecursionCheck rc(compiler); - - PreloadState preload; - preload.init(); - GreedyLoopState greedy_loop_state(not_at_start()); - - int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); - AlternativeGenerationList alt_gens(choice_count, zone()); - - if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { - trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload, - &greedy_loop_state, text_length); - } else { - // TODO(erikcorry): Delete this. We don't need this label, but it makes us - // match the traces produced pre-cleanup. - Label second_choice; - compiler->macro_assembler()->Bind(&second_choice); - - preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); - - EmitChoices(compiler, &alt_gens, 0, trace, &preload); - } - - // At this point we need to generate slow checks for the alternatives where - // the quick check was inlined. We can recognize these because the associated - // label was bound. - int new_flush_budget = trace->flush_budget() / choice_count; - for (int i = 0; i < choice_count; i++) { - AlternativeGeneration* alt_gen = alt_gens.at(i); - Trace new_trace(*trace); - // If there are actions to be flushed we have to limit how many times - // they are flushed. Take the budget of the parent trace and distribute - // it fairly amongst the children. - if (new_trace.actions() != nullptr) { - new_trace.set_flush_budget(new_flush_budget); - } - bool next_expects_preload = - i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; - EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i), - alt_gen, preload.preload_characters_, - next_expects_preload); - } -} - -Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, - AlternativeGenerationList* alt_gens, - PreloadState* preload, - GreedyLoopState* greedy_loop_state, - int text_length) { - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - // Here we have special handling for greedy loops containing only text nodes - // and other simple nodes. These are handled by pushing the current - // position on the stack and then incrementing the current position each - // time around the switch. On backtrack we decrement the current position - // and check it against the pushed value. This avoids pushing backtrack - // information for each iteration of the loop, which could take up a lot of - // space. - DCHECK(trace->stop_node() == nullptr); - macro_assembler->PushCurrentPosition(); - Label greedy_match_failed; - Trace greedy_match_trace; - if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); - greedy_match_trace.set_backtrack(&greedy_match_failed); - Label loop_label; - macro_assembler->Bind(&loop_label); - greedy_match_trace.set_stop_node(this); - greedy_match_trace.set_loop_label(&loop_label); - alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); - macro_assembler->Bind(&greedy_match_failed); - - Label second_choice; // For use in greedy matches. - macro_assembler->Bind(&second_choice); - - Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); - - EmitChoices(compiler, alt_gens, 1, new_trace, preload); - - macro_assembler->Bind(greedy_loop_state->label()); - // If we have unwound to the bottom then backtrack. - macro_assembler->CheckGreedyLoop(trace->backtrack()); - // Otherwise try the second priority at an earlier position. - macro_assembler->AdvanceCurrentPosition(-text_length); - macro_assembler->GoTo(&second_choice); - return new_trace; -} - -int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, - Trace* trace) { - int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; - if (alternatives_->length() != 2) return eats_at_least; - - GuardedAlternative alt1 = alternatives_->at(1); - if (alt1.guards() != nullptr && alt1.guards()->length() != 0) { - return eats_at_least; - } - RegExpNode* eats_anything_node = alt1.node(); - if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { - return eats_at_least; - } - - // Really we should be creating a new trace when we execute this function, - // but there is no need, because the code it generates cannot backtrack, and - // we always arrive here with a trivial trace (since it's the entry to a - // loop. That also implies that there are no preloaded characters, which is - // good, because it means we won't be violating any assumptions by - // overwriting those characters with new load instructions. - DCHECK(trace->is_trivial()); - - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - Isolate* isolate = macro_assembler->isolate(); - // At this point we know that we are at a non-greedy loop that will eat - // any character one at a time. Any non-anchored regexp has such a - // loop prepended to it in order to find where it starts. We look for - // a pattern of the form ...abc... where we can look 6 characters ahead - // and step forwards 3 if the character is not one of abc. Abc need - // not be atoms, they can be any reasonably limited character class or - // small alternation. - BoyerMooreLookahead* bm = bm_info(false); - if (bm == nullptr) { - eats_at_least = Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false)); - if (eats_at_least >= 1) { - bm = new (zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); - GuardedAlternative alt0 = alternatives_->at(0); - alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); - } - } - if (bm != nullptr) { - bm->EmitSkipInstructions(macro_assembler); - } - return eats_at_least; -} - -void ChoiceNode::EmitChoices(RegExpCompiler* compiler, - AlternativeGenerationList* alt_gens, - int first_choice, Trace* trace, - PreloadState* preload) { - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - SetUpPreLoad(compiler, trace, preload); - - // For now we just call all choices one after the other. The idea ultimately - // is to use the Dispatch table to try only the relevant ones. - int choice_count = alternatives_->length(); - - int new_flush_budget = trace->flush_budget() / choice_count; - - for (int i = first_choice; i < choice_count; i++) { - bool is_last = i == choice_count - 1; - bool fall_through_on_failure = !is_last; - GuardedAlternative alternative = alternatives_->at(i); - AlternativeGeneration* alt_gen = alt_gens->at(i); - alt_gen->quick_check_details.set_characters(preload->preload_characters_); - ZoneList* guards = alternative.guards(); - int guard_count = (guards == nullptr) ? 0 : guards->length(); - Trace new_trace(*trace); - new_trace.set_characters_preloaded( - preload->preload_is_current_ ? preload->preload_characters_ : 0); - if (preload->preload_has_checked_bounds_) { - new_trace.set_bound_checked_up_to(preload->preload_characters_); - } - new_trace.quick_check_performed()->Clear(); - if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); - if (!is_last) { - new_trace.set_backtrack(&alt_gen->after); - } - alt_gen->expects_preload = preload->preload_is_current_; - bool generate_full_check_inline = false; - if (compiler->optimize() && - try_to_emit_quick_check_for_alternative(i == 0) && - alternative.node()->EmitQuickCheck( - compiler, trace, &new_trace, preload->preload_has_checked_bounds_, - &alt_gen->possible_success, &alt_gen->quick_check_details, - fall_through_on_failure, this)) { - // Quick check was generated for this choice. - preload->preload_is_current_ = true; - preload->preload_has_checked_bounds_ = true; - // If we generated the quick check to fall through on possible success, - // we now need to generate the full check inline. - if (!fall_through_on_failure) { - macro_assembler->Bind(&alt_gen->possible_success); - new_trace.set_quick_check_performed(&alt_gen->quick_check_details); - new_trace.set_characters_preloaded(preload->preload_characters_); - new_trace.set_bound_checked_up_to(preload->preload_characters_); - generate_full_check_inline = true; - } - } else if (alt_gen->quick_check_details.cannot_match()) { - if (!fall_through_on_failure) { - macro_assembler->GoTo(trace->backtrack()); - } - continue; - } else { - // No quick check was generated. Put the full code here. - // If this is not the first choice then there could be slow checks from - // previous cases that go here when they fail. There's no reason to - // insist that they preload characters since the slow check we are about - // to generate probably can't use it. - if (i != first_choice) { - alt_gen->expects_preload = false; - new_trace.InvalidateCurrentCharacter(); - } - generate_full_check_inline = true; - } - if (generate_full_check_inline) { - if (new_trace.actions() != nullptr) { - new_trace.set_flush_budget(new_flush_budget); - } - for (int j = 0; j < guard_count; j++) { - GenerateGuard(macro_assembler, guards->at(j), &new_trace); - } - alternative.node()->Emit(compiler, &new_trace); - preload->preload_is_current_ = false; - } - macro_assembler->Bind(&alt_gen->after); - } -} - -void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, - Trace* trace, - GuardedAlternative alternative, - AlternativeGeneration* alt_gen, - int preload_characters, - bool next_expects_preload) { - if (!alt_gen->possible_success.is_linked()) return; - - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); - macro_assembler->Bind(&alt_gen->possible_success); - Trace out_of_line_trace(*trace); - out_of_line_trace.set_characters_preloaded(preload_characters); - out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); - if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); - ZoneList* guards = alternative.guards(); - int guard_count = (guards == nullptr) ? 0 : guards->length(); - if (next_expects_preload) { - Label reload_current_char; - out_of_line_trace.set_backtrack(&reload_current_char); - for (int j = 0; j < guard_count; j++) { - GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); - } - alternative.node()->Emit(compiler, &out_of_line_trace); - macro_assembler->Bind(&reload_current_char); - // Reload the current character, since the next quick check expects that. - // We don't need to check bounds here because we only get into this - // code through a quick check which already did the checked load. - macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false, - preload_characters); - macro_assembler->GoTo(&(alt_gen->after)); - } else { - out_of_line_trace.set_backtrack(&(alt_gen->after)); - for (int j = 0; j < guard_count; j++) { - GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); - } - alternative.node()->Emit(compiler, &out_of_line_trace); - } -} - -void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - LimitResult limit_result = LimitVersions(compiler, trace); - if (limit_result == DONE) return; - DCHECK(limit_result == CONTINUE); - - RecursionCheck rc(compiler); - - switch (action_type_) { - case STORE_POSITION: { - Trace::DeferredCapture new_capture(data_.u_position_register.reg, - data_.u_position_register.is_capture, - trace); - Trace new_trace = *trace; - new_trace.add_action(&new_capture); - on_success()->Emit(compiler, &new_trace); - break; - } - case INCREMENT_REGISTER: { - Trace::DeferredIncrementRegister new_increment( - data_.u_increment_register.reg); - Trace new_trace = *trace; - new_trace.add_action(&new_increment); - on_success()->Emit(compiler, &new_trace); - break; - } - case SET_REGISTER_FOR_LOOP: { - Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg, - data_.u_store_register.value); - Trace new_trace = *trace; - new_trace.add_action(&new_set); - on_success()->Emit(compiler, &new_trace); - break; - } - case CLEAR_CAPTURES: { - Trace::DeferredClearCaptures new_capture(Interval( - data_.u_clear_captures.range_from, data_.u_clear_captures.range_to)); - Trace new_trace = *trace; - new_trace.add_action(&new_capture); - on_success()->Emit(compiler, &new_trace); - break; - } - case BEGIN_SUBMATCH: - if (!trace->is_trivial()) { - trace->Flush(compiler, this); - } else { - assembler->WriteCurrentPositionToRegister( - data_.u_submatch.current_position_register, 0); - assembler->WriteStackPointerToRegister( - data_.u_submatch.stack_pointer_register); - on_success()->Emit(compiler, trace); - } - break; - case EMPTY_MATCH_CHECK: { - int start_pos_reg = data_.u_empty_match_check.start_register; - int stored_pos = 0; - int rep_reg = data_.u_empty_match_check.repetition_register; - bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); - bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); - if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { - // If we know we haven't advanced and there is no minimum we - // can just backtrack immediately. - assembler->GoTo(trace->backtrack()); - } else if (know_dist && stored_pos < trace->cp_offset()) { - // If we know we've advanced we can generate the continuation - // immediately. - on_success()->Emit(compiler, trace); - } else if (!trace->is_trivial()) { - trace->Flush(compiler, this); - } else { - Label skip_empty_check; - // If we have a minimum number of repetitions we check the current - // number first and skip the empty check if it's not enough. - if (has_minimum) { - int limit = data_.u_empty_match_check.repetition_limit; - assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); - } - // If the match is empty we bail out, otherwise we fall through - // to the on-success continuation. - assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, - trace->backtrack()); - assembler->Bind(&skip_empty_check); - on_success()->Emit(compiler, trace); - } - break; - } - case POSITIVE_SUBMATCH_SUCCESS: { - if (!trace->is_trivial()) { - trace->Flush(compiler, this); - return; - } - assembler->ReadCurrentPositionFromRegister( - data_.u_submatch.current_position_register); - assembler->ReadStackPointerFromRegister( - data_.u_submatch.stack_pointer_register); - int clear_register_count = data_.u_submatch.clear_register_count; - if (clear_register_count == 0) { - on_success()->Emit(compiler, trace); - return; - } - int clear_registers_from = data_.u_submatch.clear_register_from; - Label clear_registers_backtrack; - Trace new_trace = *trace; - new_trace.set_backtrack(&clear_registers_backtrack); - on_success()->Emit(compiler, &new_trace); - - assembler->Bind(&clear_registers_backtrack); - int clear_registers_to = clear_registers_from + clear_register_count - 1; - assembler->ClearRegisters(clear_registers_from, clear_registers_to); - - DCHECK(trace->backtrack() == nullptr); - assembler->Backtrack(); - return; - } - default: - UNREACHABLE(); - } -} - -void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { - RegExpMacroAssembler* assembler = compiler->macro_assembler(); - if (!trace->is_trivial()) { - trace->Flush(compiler, this); - return; - } - - LimitResult limit_result = LimitVersions(compiler, trace); - if (limit_result == DONE) return; - DCHECK(limit_result == CONTINUE); - - RecursionCheck rc(compiler); - - DCHECK_EQ(start_reg_ + 1, end_reg_); - if (IgnoreCase(flags_)) { - assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), - trace->backtrack()); - } else { - assembler->CheckNotBackReference(start_reg_, read_backward(), - trace->backtrack()); - } - // We are going to advance backward, so we may end up at the start. - if (read_backward()) trace->set_at_start(Trace::UNKNOWN); - - // Check that the back reference does not end inside a surrogate pair. - if (IsUnicode(flags_) && !compiler->one_byte()) { - assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); - } - on_success()->Emit(compiler, trace); -} - -void TextNode::CalculateOffsets() { - int element_count = elements()->length(); - // Set up the offsets of the elements relative to the start. This is a fixed - // quantity since a TextNode can only contain fixed-width things. - int cp_offset = 0; - for (int i = 0; i < element_count; i++) { - TextElement& elm = elements()->at(i); - elm.set_cp_offset(cp_offset); - cp_offset += elm.length(); - } -} - -namespace { - -// Assertion propagation moves information about assertions such as -// \b to the affected nodes. For instance, in /.\b./ information must -// be propagated to the first '.' that whatever follows needs to know -// if it matched a word or a non-word, and to the second '.' that it -// has to check if it succeeds a word or non-word. In this case the -// result will be something like: -// -// +-------+ +------------+ -// | . | | . | -// +-------+ ---> +------------+ -// | word? | | check word | -// +-------+ +------------+ -class AssertionPropagator : public AllStatic { - public: - static void VisitText(TextNode* that) {} - - static void VisitAction(ActionNode* that) { - // If the next node is interested in what it follows then this node - // has to be interested too so it can pass the information on. - that->info()->AddFromFollowing(that->on_success()->info()); - } - - static void VisitChoice(ChoiceNode* that, int i) { - // Anything the following nodes need to know has to be known by - // this node also, so it can pass it on. - that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info()); - } - - static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { - that->info()->AddFromFollowing(that->continue_node()->info()); - } - - static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) { - that->info()->AddFromFollowing(that->loop_node()->info()); - } - - static void VisitNegativeLookaroundChoiceLookaroundNode( - NegativeLookaroundChoiceNode* that) { - VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex); - } - - static void VisitNegativeLookaroundChoiceContinueNode( - NegativeLookaroundChoiceNode* that) { - VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex); - } - - static void VisitBackReference(BackReferenceNode* that) {} - - static void VisitAssertion(AssertionNode* that) {} -}; - -// Propagates information about the minimum size of successful matches from -// successor nodes to their predecessors. Note that all eats_at_least values -// are initialized to zero before analysis. -class EatsAtLeastPropagator : public AllStatic { - public: - static void VisitText(TextNode* that) { - // The eats_at_least value is not used if reading backward. - if (!that->read_backward()) { - // We are not at the start after this node, and thus we can use the - // successor's eats_at_least_from_not_start value. - uint8_t eats_at_least = base::saturated_cast( - that->Length() + that->on_success() - ->eats_at_least_info() - ->eats_at_least_from_not_start); - that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least)); - } - } - - static void VisitAction(ActionNode* that) { - // POSITIVE_SUBMATCH_SUCCESS rewinds input, so we must not consider - // successor nodes for eats_at_least. SET_REGISTER_FOR_LOOP indicates a loop - // entry point, which means the loop body will run at least the minimum - // number of times before the continuation case can run. Otherwise the - // current node eats at least as much as its successor. - switch (that->action_type()) { - case ActionNode::POSITIVE_SUBMATCH_SUCCESS: - break; // Was already initialized to zero. - case ActionNode::SET_REGISTER_FOR_LOOP: - that->set_eats_at_least_info( - that->on_success()->EatsAtLeastFromLoopEntry()); - break; - default: - that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); - break; - } - } - - static void VisitChoice(ChoiceNode* that, int i) { - // The minimum possible match from a choice node is the minimum of its - // successors. - EatsAtLeastInfo eats_at_least = - i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info(); - eats_at_least.SetMin( - *that->alternatives()->at(i).node()->eats_at_least_info()); - that->set_eats_at_least_info(eats_at_least); - } - - static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { - that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info()); - } - - static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {} - - static void VisitNegativeLookaroundChoiceLookaroundNode( - NegativeLookaroundChoiceNode* that) {} - - static void VisitNegativeLookaroundChoiceContinueNode( - NegativeLookaroundChoiceNode* that) { - that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info()); - } - - static void VisitBackReference(BackReferenceNode* that) { - if (!that->read_backward()) { - that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); - } - } - - static void VisitAssertion(AssertionNode* that) { - EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info(); - if (that->assertion_type() == AssertionNode::AT_START) { - // If we know we are not at the start and we are asked "how many - // characters will you match if you succeed?" then we can answer anything - // since false implies false. So let's just set the max answer - // (UINT8_MAX) since that won't prevent us from preloading a lot of - // characters for the other branches in the node graph. - eats_at_least.eats_at_least_from_not_start = UINT8_MAX; - } - that->set_eats_at_least_info(eats_at_least); - } -}; - -} // namespace - -// ------------------------------------------------------------------- -// Analysis - -// Iterates the node graph and provides the opportunity for propagators to set -// values that depend on successor nodes. -template -class Analysis : public NodeVisitor { - public: - Analysis(Isolate* isolate, bool is_one_byte) - : isolate_(isolate), - is_one_byte_(is_one_byte), - error_(RegExpError::kNone) {} - - void EnsureAnalyzed(RegExpNode* that) { - StackLimitCheck check(isolate()); - if (check.HasOverflowed()) { - if (FLAG_correctness_fuzzer_suppressions) { - FATAL("Analysis: Aborting on stack overflow"); - } - fail(RegExpError::kAnalysisStackOverflow); - return; - } - if (that->info()->been_analyzed || that->info()->being_analyzed) return; - that->info()->being_analyzed = true; - that->Accept(this); - that->info()->being_analyzed = false; - that->info()->been_analyzed = true; - } - - bool has_failed() { return error_ != RegExpError::kNone; } - RegExpError error() { - DCHECK(error_ != RegExpError::kNone); - return error_; - } - void fail(RegExpError error) { error_ = error; } - - Isolate* isolate() const { return isolate_; } - - void VisitEnd(EndNode* that) override { - // nothing to do - } - -// Used to call the given static function on each propagator / variadic template -// argument. -#define STATIC_FOR_EACH(expr) \ - do { \ - int dummy[] = {((expr), 0)...}; \ - USE(dummy); \ - } while (false) - - void VisitText(TextNode* that) override { - that->MakeCaseIndependent(isolate(), is_one_byte_); - EnsureAnalyzed(that->on_success()); - if (has_failed()) return; - that->CalculateOffsets(); - STATIC_FOR_EACH(Propagators::VisitText(that)); - } - - void VisitAction(ActionNode* that) override { - EnsureAnalyzed(that->on_success()); - if (has_failed()) return; - STATIC_FOR_EACH(Propagators::VisitAction(that)); - } - - void VisitChoice(ChoiceNode* that) override { - for (int i = 0; i < that->alternatives()->length(); i++) { - EnsureAnalyzed(that->alternatives()->at(i).node()); - if (has_failed()) return; - STATIC_FOR_EACH(Propagators::VisitChoice(that, i)); - } - } - - void VisitLoopChoice(LoopChoiceNode* that) override { - DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue. - - // First propagate all information from the continuation node. - EnsureAnalyzed(that->continue_node()); - if (has_failed()) return; - STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that)); - - // Check the loop last since it may need the value of this node - // to get a correct result. - EnsureAnalyzed(that->loop_node()); - if (has_failed()) return; - STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that)); - } - - void VisitNegativeLookaroundChoice( - NegativeLookaroundChoiceNode* that) override { - DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue. - - EnsureAnalyzed(that->lookaround_node()); - if (has_failed()) return; - STATIC_FOR_EACH( - Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that)); - - EnsureAnalyzed(that->continue_node()); - if (has_failed()) return; - STATIC_FOR_EACH( - Propagators::VisitNegativeLookaroundChoiceContinueNode(that)); - } - - void VisitBackReference(BackReferenceNode* that) override { - EnsureAnalyzed(that->on_success()); - if (has_failed()) return; - STATIC_FOR_EACH(Propagators::VisitBackReference(that)); - } - - void VisitAssertion(AssertionNode* that) override { - EnsureAnalyzed(that->on_success()); - if (has_failed()) return; - STATIC_FOR_EACH(Propagators::VisitAssertion(that)); - } - -#undef STATIC_FOR_EACH - - private: - Isolate* isolate_; - bool is_one_byte_; - RegExpError error_; - - DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); -}; - -RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, - RegExpNode* node) { - Analysis analysis(isolate, - is_one_byte); - DCHECK_EQ(node->info()->been_analyzed, false); - analysis.EnsureAnalyzed(node); - DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); - return analysis.has_failed() ? analysis.error() : RegExpError::kNone; -} - -void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, - bool not_at_start) { - // Working out the set of characters that a backreference can match is too - // hard, so we just say that any character can match. - bm->SetRest(offset); - SaveBMInfo(bm, not_at_start, offset); -} - -STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == - RegExpMacroAssembler::kTableSize); - -void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) { - ZoneList* alts = alternatives(); - budget = (budget - 1) / alts->length(); - for (int i = 0; i < alts->length(); i++) { - GuardedAlternative& alt = alts->at(i); - if (alt.guards() != nullptr && alt.guards()->length() != 0) { - bm->SetRest(offset); // Give up trying to fill in info. - SaveBMInfo(bm, not_at_start, offset); - return; - } - alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); - } - SaveBMInfo(bm, not_at_start, offset); -} - -void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) { - if (initial_offset >= bm->length()) return; - int offset = initial_offset; - int max_char = bm->max_char(); - for (int i = 0; i < elements()->length(); i++) { - if (offset >= bm->length()) { - if (initial_offset == 0) set_bm_info(not_at_start, bm); - return; - } - TextElement text = elements()->at(i); - if (text.text_type() == TextElement::ATOM) { - RegExpAtom* atom = text.atom(); - for (int j = 0; j < atom->length(); j++, offset++) { - if (offset >= bm->length()) { - if (initial_offset == 0) set_bm_info(not_at_start, bm); - return; - } - uc16 character = atom->data()[j]; - if (IgnoreCase(atom->flags())) { - unibrow::uchar chars[4]; - int length = GetCaseIndependentLetters( - isolate, character, bm->max_char() == String::kMaxOneByteCharCode, - chars, 4); - for (int j = 0; j < length; j++) { - bm->Set(offset, chars[j]); - } - } else { - if (character <= max_char) bm->Set(offset, character); - } - } - } else { - DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); - RegExpCharacterClass* char_class = text.char_class(); - ZoneList* ranges = char_class->ranges(zone()); - if (char_class->is_negated()) { - bm->SetAll(offset); - } else { - for (int k = 0; k < ranges->length(); k++) { - CharacterRange& range = ranges->at(k); - if (range.from() > max_char) continue; - int to = Min(max_char, static_cast(range.to())); - bm->SetInterval(offset, Interval(range.from(), to)); - } - } - offset++; - } - } - if (offset >= bm->length()) { - if (initial_offset == 0) set_bm_info(not_at_start, bm); - return; - } - on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, - true); // Not at start after a text node. - if (initial_offset == 0) set_bm_info(not_at_start, bm); -} - -// static -RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( - RegExpCompiler* compiler, RegExpNode* on_success, JSRegExp::Flags flags) { - DCHECK(!compiler->read_backward()); - Zone* zone = compiler->zone(); - ZoneList* lead_surrogates = CharacterRange::List( - zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); - ZoneList* trail_surrogates = CharacterRange::List( - zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); - - ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); - - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - RegExpNode* step_back = TextNode::CreateForCharacterRanges( - zone, lead_surrogates, true, on_success, flags); - RegExpLookaround::Builder builder(true, step_back, stack_register, - position_register); - RegExpNode* match_trail = TextNode::CreateForCharacterRanges( - zone, trail_surrogates, false, builder.on_match_success(), flags); - - optional_step_back->AddAlternative( - GuardedAlternative(builder.ForMatch(match_trail))); - optional_step_back->AddAlternative(GuardedAlternative(on_success)); - - return optional_step_back; -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-compiler.h b/js/src/new-regexp/regexp-compiler.h deleted file mode 100644 index 186d5e838..000000000 --- a/js/src/new-regexp/regexp-compiler.h +++ /dev/null @@ -1,621 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_COMPILER_H_ -#define V8_REGEXP_REGEXP_COMPILER_H_ - -#include - -#include "new-regexp/regexp-nodes.h" - -namespace v8 { -namespace internal { - -class DynamicBitSet; -class Isolate; - -namespace regexp_compiler_constants { - -// The '2' variant is has inclusive from and exclusive to. -// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, -// which include WhiteSpace (7.2) or LineTerminator (7.3) values. -constexpr uc32 kRangeEndMarker = 0x110000; -constexpr int kSpaceRanges[] = { - '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, - 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, - 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; -constexpr int kSpaceRangeCount = arraysize(kSpaceRanges); - -constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', - '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; -constexpr int kWordRangeCount = arraysize(kWordRanges); -constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; -constexpr int kDigitRangeCount = arraysize(kDigitRanges); -constexpr int kSurrogateRanges[] = {kLeadSurrogateStart, - kLeadSurrogateStart + 1, kRangeEndMarker}; -constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); -constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, - 0x2028, 0x202A, kRangeEndMarker}; -constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); - -// More makes code generation slower, less makes V8 benchmark score lower. -constexpr int kMaxLookaheadForBoyerMoore = 8; -// In a 3-character pattern you can maximally step forwards 3 characters -// at a time, which is not always enough to pay for the extra logic. -constexpr int kPatternTooShortForBoyerMoore = 2; - -} // namespace regexp_compiler_constants - -inline bool IgnoreCase(JSRegExp::Flags flags) { - return (flags & JSRegExp::kIgnoreCase) != 0; -} - -inline bool IsUnicode(JSRegExp::Flags flags) { - return (flags & JSRegExp::kUnicode) != 0; -} - -inline bool IsSticky(JSRegExp::Flags flags) { - return (flags & JSRegExp::kSticky) != 0; -} - -inline bool IsGlobal(JSRegExp::Flags flags) { - return (flags & JSRegExp::kGlobal) != 0; -} - -inline bool DotAll(JSRegExp::Flags flags) { - return (flags & JSRegExp::kDotAll) != 0; -} - -inline bool Multiline(JSRegExp::Flags flags) { - return (flags & JSRegExp::kMultiline) != 0; -} - -inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { - // Both unicode and ignore_case flags are set. We need to use ICU to find - // the closure over case equivalents. - return IsUnicode(flags) && IgnoreCase(flags); -} - -// Details of a quick mask-compare check that can look ahead in the -// input stream. -class QuickCheckDetails { - public: - QuickCheckDetails() - : characters_(0), mask_(0), value_(0), cannot_match_(false) {} - explicit QuickCheckDetails(int characters) - : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} - bool Rationalize(bool one_byte); - // Merge in the information from another branch of an alternation. - void Merge(QuickCheckDetails* other, int from_index); - // Advance the current position by some amount. - void Advance(int by, bool one_byte); - void Clear(); - bool cannot_match() { return cannot_match_; } - void set_cannot_match() { cannot_match_ = true; } - struct Position { - Position() : mask(0), value(0), determines_perfectly(false) {} - uc16 mask; - uc16 value; - bool determines_perfectly; - }; - int characters() { return characters_; } - void set_characters(int characters) { characters_ = characters; } - Position* positions(int index) { - DCHECK_LE(0, index); - DCHECK_GT(characters_, index); - return positions_ + index; - } - uint32_t mask() { return mask_; } - uint32_t value() { return value_; } - - private: - // How many characters do we have quick check information from. This is - // the same for all branches of a choice node. - int characters_; - Position positions_[4]; - // These values are the condensate of the above array after Rationalize(). - uint32_t mask_; - uint32_t value_; - // If set to true, there is no way this quick check can match at all. - // E.g., if it requires to be at the start of the input, and isn't. - bool cannot_match_; -}; - -// Improve the speed that we scan for an initial point where a non-anchored -// regexp can match by using a Boyer-Moore-like table. This is done by -// identifying non-greedy non-capturing loops in the nodes that eat any -// character one at a time. For example in the middle of the regexp -// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly -// inserted at the start of any non-anchored regexp. -// -// When we have found such a loop we look ahead in the nodes to find the set of -// characters that can come at given distances. For example for the regexp -// /.?foo/ we know that there are at least 3 characters ahead of us, and the -// sets of characters that can occur are [any, [f, o], [o]]. We find a range in -// the lookahead info where the set of characters is reasonably constrained. In -// our example this is from index 1 to 2 (0 is not constrained). We can now -// look 3 characters ahead and if we don't find one of [f, o] (the union of -// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). -// -// For Unicode input strings we do the same, but modulo 128. -// -// We also look at the first string fed to the regexp and use that to get a hint -// of the character frequencies in the inputs. This affects the assessment of -// whether the set of characters is 'reasonably constrained'. -// -// We also have another lookahead mechanism (called quick check in the code), -// which uses a wide load of multiple characters followed by a mask and compare -// to determine whether a match is possible at this point. -enum ContainedInLattice { - kNotYet = 0, - kLatticeIn = 1, - kLatticeOut = 2, - kLatticeUnknown = 3 // Can also mean both in and out. -}; - -inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { - return static_cast(a | b); -} - -class BoyerMoorePositionInfo : public ZoneObject { - public: - bool at(int i) const { return map_[i]; } - - static constexpr int kMapSize = 128; - static constexpr int kMask = kMapSize - 1; - - int map_count() const { return map_count_; } - - void Set(int character); - void SetInterval(const Interval& interval); - void SetAll(); - - bool is_non_word() { return w_ == kLatticeOut; } - bool is_word() { return w_ == kLatticeIn; } - - using Bitset = std::bitset; - Bitset raw_bitset() const { return map_; } - - private: - Bitset map_; - int map_count_ = 0; // Number of set bits in the map. - ContainedInLattice w_ = kNotYet; // The \w character class. -}; - -class BoyerMooreLookahead : public ZoneObject { - public: - BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); - - int length() { return length_; } - int max_char() { return max_char_; } - RegExpCompiler* compiler() { return compiler_; } - - int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } - - BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } - - void Set(int map_number, int character) { - if (character > max_char_) return; - BoyerMoorePositionInfo* info = bitmaps_->at(map_number); - info->Set(character); - } - - void SetInterval(int map_number, const Interval& interval) { - if (interval.from() > max_char_) return; - BoyerMoorePositionInfo* info = bitmaps_->at(map_number); - if (interval.to() > max_char_) { - info->SetInterval(Interval(interval.from(), max_char_)); - } else { - info->SetInterval(interval); - } - } - - void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } - - void SetRest(int from_map) { - for (int i = from_map; i < length_; i++) SetAll(i); - } - void EmitSkipInstructions(RegExpMacroAssembler* masm); - - private: - // This is the value obtained by EatsAtLeast. If we do not have at least this - // many characters left in the sample string then the match is bound to fail. - // Therefore it is OK to read a character this far ahead of the current match - // point. - int length_; - RegExpCompiler* compiler_; - // 0xff for Latin1, 0xffff for UTF-16. - int max_char_; - ZoneList* bitmaps_; - - int GetSkipTable(int min_lookahead, int max_lookahead, - Handle boolean_skip_table); - bool FindWorthwhileInterval(int* from, int* to); - int FindBestInterval(int max_number_of_chars, int old_biggest_points, - int* from, int* to); -}; - -// There are many ways to generate code for a node. This class encapsulates -// the current way we should be generating. In other words it encapsulates -// the current state of the code generator. The effect of this is that we -// generate code for paths that the matcher can take through the regular -// expression. A given node in the regexp can be code-generated several times -// as it can be part of several traces. For example for the regexp: -// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part -// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code -// to match foo is generated only once (the traces have a common prefix). The -// code to store the capture is deferred and generated (twice) after the places -// where baz has been matched. -class Trace { - public: - // A value for a property that is either known to be true, know to be false, - // or not known. - enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; - - class DeferredAction { - public: - DeferredAction(ActionNode::ActionType action_type, int reg) - : action_type_(action_type), reg_(reg), next_(nullptr) {} - DeferredAction* next() { return next_; } - bool Mentions(int reg); - int reg() { return reg_; } - ActionNode::ActionType action_type() { return action_type_; } - - private: - ActionNode::ActionType action_type_; - int reg_; - DeferredAction* next_; - friend class Trace; - }; - - class DeferredCapture : public DeferredAction { - public: - DeferredCapture(int reg, bool is_capture, Trace* trace) - : DeferredAction(ActionNode::STORE_POSITION, reg), - cp_offset_(trace->cp_offset()), - is_capture_(is_capture) {} - int cp_offset() { return cp_offset_; } - bool is_capture() { return is_capture_; } - - private: - int cp_offset_; - bool is_capture_; - void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } - }; - - class DeferredSetRegisterForLoop : public DeferredAction { - public: - DeferredSetRegisterForLoop(int reg, int value) - : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg), - value_(value) {} - int value() { return value_; } - - private: - int value_; - }; - - class DeferredClearCaptures : public DeferredAction { - public: - explicit DeferredClearCaptures(Interval range) - : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {} - Interval range() { return range_; } - - private: - Interval range_; - }; - - class DeferredIncrementRegister : public DeferredAction { - public: - explicit DeferredIncrementRegister(int reg) - : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {} - }; - - Trace() - : cp_offset_(0), - actions_(nullptr), - backtrack_(nullptr), - stop_node_(nullptr), - loop_label_(nullptr), - characters_preloaded_(0), - bound_checked_up_to_(0), - flush_budget_(100), - at_start_(UNKNOWN) {} - - // End the trace. This involves flushing the deferred actions in the trace - // and pushing a backtrack location onto the backtrack stack. Once this is - // done we can start a new trace or go to one that has already been - // generated. - void Flush(RegExpCompiler* compiler, RegExpNode* successor); - int cp_offset() { return cp_offset_; } - DeferredAction* actions() { return actions_; } - // A trivial trace is one that has no deferred actions or other state that - // affects the assumptions used when generating code. There is no recorded - // backtrack location in a trivial trace, so with a trivial trace we will - // generate code that, on a failure to match, gets the backtrack location - // from the backtrack stack rather than using a direct jump instruction. We - // always start code generation with a trivial trace and non-trivial traces - // are created as we emit code for nodes or add to the list of deferred - // actions in the trace. The location of the code generated for a node using - // a trivial trace is recorded in a label in the node so that gotos can be - // generated to that code. - bool is_trivial() { - return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && - characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && - quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; - } - TriBool at_start() { return at_start_; } - void set_at_start(TriBool at_start) { at_start_ = at_start; } - Label* backtrack() { return backtrack_; } - Label* loop_label() { return loop_label_; } - RegExpNode* stop_node() { return stop_node_; } - int characters_preloaded() { return characters_preloaded_; } - int bound_checked_up_to() { return bound_checked_up_to_; } - int flush_budget() { return flush_budget_; } - QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } - bool mentions_reg(int reg); - // Returns true if a deferred position store exists to the specified - // register and stores the offset in the out-parameter. Otherwise - // returns false. - bool GetStoredPosition(int reg, int* cp_offset); - // These set methods and AdvanceCurrentPositionInTrace should be used only on - // new traces - the intention is that traces are immutable after creation. - void add_action(DeferredAction* new_action) { - DCHECK(new_action->next_ == nullptr); - new_action->next_ = actions_; - actions_ = new_action; - } - void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } - void set_stop_node(RegExpNode* node) { stop_node_ = node; } - void set_loop_label(Label* label) { loop_label_ = label; } - void set_characters_preloaded(int count) { characters_preloaded_ = count; } - void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } - void set_flush_budget(int to) { flush_budget_ = to; } - void set_quick_check_performed(QuickCheckDetails* d) { - quick_check_performed_ = *d; - } - void InvalidateCurrentCharacter(); - void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); - - private: - int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); - void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, - const DynamicBitSet& affected_registers, - DynamicBitSet* registers_to_pop, - DynamicBitSet* registers_to_clear, Zone* zone); - void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, - const DynamicBitSet& registers_to_pop, - const DynamicBitSet& registers_to_clear); - int cp_offset_; - DeferredAction* actions_; - Label* backtrack_; - RegExpNode* stop_node_; - Label* loop_label_; - int characters_preloaded_; - int bound_checked_up_to_; - QuickCheckDetails quick_check_performed_; - int flush_budget_; - TriBool at_start_; -}; - -class GreedyLoopState { - public: - explicit GreedyLoopState(bool not_at_start); - - Label* label() { return &label_; } - Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } - - private: - Label label_; - Trace counter_backtrack_trace_; -}; - -struct PreloadState { - static const int kEatsAtLeastNotYetInitialized = -1; - bool preload_is_current_; - bool preload_has_checked_bounds_; - int preload_characters_; - int eats_at_least_; - void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } -}; - -// Analysis performs assertion propagation and computes eats_at_least_ values. -// See the comments on AssertionPropagator and EatsAtLeastPropagator for more -// details. -RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); - -class FrequencyCollator { - public: - FrequencyCollator() : total_samples_(0) { - for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { - frequencies_[i] = CharacterFrequency(i); - } - } - - void CountCharacter(int character) { - int index = (character & RegExpMacroAssembler::kTableMask); - frequencies_[index].Increment(); - total_samples_++; - } - - // Does not measure in percent, but rather per-128 (the table size from the - // regexp macro assembler). - int Frequency(int in_character) { - DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); - if (total_samples_ < 1) return 1; // Division by zero. - int freq_in_per128 = - (frequencies_[in_character].counter() * 128) / total_samples_; - return freq_in_per128; - } - - private: - class CharacterFrequency { - public: - CharacterFrequency() : counter_(0), character_(-1) {} - explicit CharacterFrequency(int character) - : counter_(0), character_(character) {} - - void Increment() { counter_++; } - int counter() { return counter_; } - int character() { return character_; } - - private: - int counter_; - int character_; - }; - - private: - CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; - int total_samples_; -}; - -class RegExpCompiler { - public: - RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool is_one_byte); - - int AllocateRegister() { - if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { - reg_exp_too_big_ = true; - return next_register_; - } - return next_register_++; - } - - // Lookarounds to match lone surrogates for unicode character class matches - // are never nested. We can therefore reuse registers. - int UnicodeLookaroundStackRegister() { - if (unicode_lookaround_stack_register_ == kNoRegister) { - unicode_lookaround_stack_register_ = AllocateRegister(); - } - return unicode_lookaround_stack_register_; - } - - int UnicodeLookaroundPositionRegister() { - if (unicode_lookaround_position_register_ == kNoRegister) { - unicode_lookaround_position_register_ = AllocateRegister(); - } - return unicode_lookaround_position_register_; - } - - struct CompilationResult final { - explicit CompilationResult(RegExpError err) : error(err) {} - CompilationResult(Object code, int registers) - : code(code), num_registers(registers) {} - - static CompilationResult RegExpTooBig() { - return CompilationResult(RegExpError::kTooLarge); - } - - bool Succeeded() const { return error == RegExpError::kNone; } - - const RegExpError error = RegExpError::kNone; - Object code; - int num_registers = 0; - }; - - CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, - RegExpNode* start, int capture_count, - Handle pattern); - - // If the regexp matching starts within a surrogate pair, step back to the - // lead surrogate and start matching from there. - static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, - RegExpNode* on_success, - JSRegExp::Flags flags); - - inline void AddWork(RegExpNode* node) { - if (!node->on_work_list() && !node->label()->is_bound()) { - node->set_on_work_list(true); - work_list_->push_back(node); - } - } - - static const int kImplementationOffset = 0; - static const int kNumberOfRegistersOffset = 0; - static const int kCodeOffset = 1; - - RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } - EndNode* accept() { return accept_; } - - static const int kMaxRecursion = 100; - inline int recursion_depth() { return recursion_depth_; } - inline void IncrementRecursionDepth() { recursion_depth_++; } - inline void DecrementRecursionDepth() { recursion_depth_--; } - - void SetRegExpTooBig() { reg_exp_too_big_ = true; } - - inline bool one_byte() { return one_byte_; } - inline bool optimize() { return optimize_; } - inline void set_optimize(bool value) { optimize_ = value; } - inline bool limiting_recursion() { return limiting_recursion_; } - inline void set_limiting_recursion(bool value) { - limiting_recursion_ = value; - } - bool read_backward() { return read_backward_; } - void set_read_backward(bool value) { read_backward_ = value; } - FrequencyCollator* frequency_collator() { return &frequency_collator_; } - - int current_expansion_factor() { return current_expansion_factor_; } - void set_current_expansion_factor(int value) { - current_expansion_factor_ = value; - } - - Isolate* isolate() const { return isolate_; } - Zone* zone() const { return zone_; } - - static const int kNoRegister = -1; - - private: - EndNode* accept_; - int next_register_; - int unicode_lookaround_stack_register_; - int unicode_lookaround_position_register_; - ZoneVector* work_list_; - int recursion_depth_; - RegExpMacroAssembler* macro_assembler_; - bool one_byte_; - bool reg_exp_too_big_; - bool limiting_recursion_; - bool optimize_; - bool read_backward_; - int current_expansion_factor_; - FrequencyCollator frequency_collator_; - Isolate* isolate_; - Zone* zone_; -}; - -// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. -class UnicodeRangeSplitter { - public: - V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList* base); - - static constexpr int kInitialSize = 8; - using CharacterRangeVector = base::SmallVector; - - const CharacterRangeVector* bmp() const { return &bmp_; } - const CharacterRangeVector* lead_surrogates() const { - return &lead_surrogates_; - } - const CharacterRangeVector* trail_surrogates() const { - return &trail_surrogates_; - } - const CharacterRangeVector* non_bmp() const { return &non_bmp_; } - - private: - void AddRange(CharacterRange range); - - CharacterRangeVector bmp_; - CharacterRangeVector lead_surrogates_; - CharacterRangeVector trail_surrogates_; - CharacterRangeVector non_bmp_; -}; - -// We need to check for the following characters: 0x39C 0x3BC 0x178. -// TODO(jgruber): Move to CharacterRange. -bool RangeContainsLatin1Equivalents(CharacterRange range); - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_COMPILER_H_ diff --git a/js/src/new-regexp/regexp-dotprinter.cc b/js/src/new-regexp/regexp-dotprinter.cc deleted file mode 100644 index 2bf393c32..000000000 --- a/js/src/new-regexp/regexp-dotprinter.cc +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-dotprinter.h" - -#include "new-regexp/regexp-compiler.h" - -namespace v8 { -namespace internal { - -// ------------------------------------------------------------------- -// Dot/dotty output - -#ifdef DEBUG - -class DotPrinterImpl : public NodeVisitor { - public: - explicit DotPrinterImpl(std::ostream& os) : os_(os) {} - void PrintNode(const char* label, RegExpNode* node); - void Visit(RegExpNode* node); - void PrintAttributes(RegExpNode* from); - void PrintOnFailure(RegExpNode* from, RegExpNode* to); -#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that); - FOR_EACH_NODE_TYPE(DECLARE_VISIT) -#undef DECLARE_VISIT - private: - std::ostream& os_; -}; - -void DotPrinterImpl::PrintNode(const char* label, RegExpNode* node) { - os_ << "digraph G {\n graph [label=\""; - for (int i = 0; label[i]; i++) { - switch (label[i]) { - case '\\': - os_ << "\\\\"; - break; - case '"': - os_ << "\""; - break; - default: - os_ << label[i]; - break; - } - } - os_ << "\"];\n"; - Visit(node); - os_ << "}" << std::endl; -} - -void DotPrinterImpl::Visit(RegExpNode* node) { - if (node->info()->visited) return; - node->info()->visited = true; - node->Accept(this); -} - -void DotPrinterImpl::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { - os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; - Visit(on_failure); -} - -class AttributePrinter { - public: - explicit AttributePrinter(std::ostream& os) // NOLINT - : os_(os), first_(true) {} - void PrintSeparator() { - if (first_) { - first_ = false; - } else { - os_ << "|"; - } - } - void PrintBit(const char* name, bool value) { - if (!value) return; - PrintSeparator(); - os_ << "{" << name << "}"; - } - void PrintPositive(const char* name, int value) { - if (value < 0) return; - PrintSeparator(); - os_ << "{" << name << "|" << value << "}"; - } - - private: - std::ostream& os_; - bool first_; -}; - -void DotPrinterImpl::PrintAttributes(RegExpNode* that) { - os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " - << "margin=0.1, fontsize=10, label=\"{"; - AttributePrinter printer(os_); - NodeInfo* info = that->info(); - printer.PrintBit("NI", info->follows_newline_interest); - printer.PrintBit("WI", info->follows_word_interest); - printer.PrintBit("SI", info->follows_start_interest); - Label* label = that->label(); - if (label->is_bound()) printer.PrintPositive("@", label->pos()); - os_ << "}\"];\n" - << " a" << that << " -> n" << that - << " [style=dashed, color=grey, arrowhead=none];\n"; -} - -void DotPrinterImpl::VisitChoice(ChoiceNode* that) { - os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; - for (int i = 0; i < that->alternatives()->length(); i++) { - GuardedAlternative alt = that->alternatives()->at(i); - os_ << " n" << that << " -> n" << alt.node(); - } - for (int i = 0; i < that->alternatives()->length(); i++) { - GuardedAlternative alt = that->alternatives()->at(i); - alt.node()->Accept(this); - } -} - -void DotPrinterImpl::VisitLoopChoice(LoopChoiceNode* that) { - VisitChoice(that); -} - -void DotPrinterImpl::VisitNegativeLookaroundChoice( - NegativeLookaroundChoiceNode* that) { - VisitChoice(that); -} - -void DotPrinterImpl::VisitText(TextNode* that) { - Zone* zone = that->zone(); - os_ << " n" << that << " [label=\""; - for (int i = 0; i < that->elements()->length(); i++) { - if (i > 0) os_ << " "; - TextElement elm = that->elements()->at(i); - switch (elm.text_type()) { - case TextElement::ATOM: { - Vector data = elm.atom()->data(); - for (int i = 0; i < data.length(); i++) { - os_ << static_cast(data[i]); - } - break; - } - case TextElement::CHAR_CLASS: { - RegExpCharacterClass* node = elm.char_class(); - os_ << "["; - if (node->is_negated()) os_ << "^"; - for (int j = 0; j < node->ranges(zone)->length(); j++) { - CharacterRange range = node->ranges(zone)->at(j); - os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); - } - os_ << "]"; - break; - } - default: - UNREACHABLE(); - } - } - os_ << "\", shape=box, peripheries=2];\n"; - PrintAttributes(that); - os_ << " n" << that << " -> n" << that->on_success() << ";\n"; - Visit(that->on_success()); -} - -void DotPrinterImpl::VisitBackReference(BackReferenceNode* that) { - os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" - << that->end_register() << "\", shape=doubleoctagon];\n"; - PrintAttributes(that); - os_ << " n" << that << " -> n" << that->on_success() << ";\n"; - Visit(that->on_success()); -} - -void DotPrinterImpl::VisitEnd(EndNode* that) { - os_ << " n" << that << " [style=bold, shape=point];\n"; - PrintAttributes(that); -} - -void DotPrinterImpl::VisitAssertion(AssertionNode* that) { - os_ << " n" << that << " ["; - switch (that->assertion_type()) { - case AssertionNode::AT_END: - os_ << "label=\"$\", shape=septagon"; - break; - case AssertionNode::AT_START: - os_ << "label=\"^\", shape=septagon"; - break; - case AssertionNode::AT_BOUNDARY: - os_ << "label=\"\\b\", shape=septagon"; - break; - case AssertionNode::AT_NON_BOUNDARY: - os_ << "label=\"\\B\", shape=septagon"; - break; - case AssertionNode::AFTER_NEWLINE: - os_ << "label=\"(?<=\\n)\", shape=septagon"; - break; - } - os_ << "];\n"; - PrintAttributes(that); - RegExpNode* successor = that->on_success(); - os_ << " n" << that << " -> n" << successor << ";\n"; - Visit(successor); -} - -void DotPrinterImpl::VisitAction(ActionNode* that) { - os_ << " n" << that << " ["; - switch (that->action_type_) { - case ActionNode::SET_REGISTER_FOR_LOOP: - os_ << "label=\"$" << that->data_.u_store_register.reg - << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; - break; - case ActionNode::INCREMENT_REGISTER: - os_ << "label=\"$" << that->data_.u_increment_register.reg - << "++\", shape=octagon"; - break; - case ActionNode::STORE_POSITION: - os_ << "label=\"$" << that->data_.u_position_register.reg - << ":=$pos\", shape=octagon"; - break; - case ActionNode::BEGIN_SUBMATCH: - os_ << "label=\"$" << that->data_.u_submatch.current_position_register - << ":=$pos,begin\", shape=septagon"; - break; - case ActionNode::POSITIVE_SUBMATCH_SUCCESS: - os_ << "label=\"escape\", shape=septagon"; - break; - case ActionNode::EMPTY_MATCH_CHECK: - os_ << "label=\"$" << that->data_.u_empty_match_check.start_register - << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register - << "<" << that->data_.u_empty_match_check.repetition_limit - << "?\", shape=septagon"; - break; - case ActionNode::CLEAR_CAPTURES: { - os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from - << " to $" << that->data_.u_clear_captures.range_to - << "\", shape=septagon"; - break; - } - } - os_ << "];\n"; - PrintAttributes(that); - RegExpNode* successor = that->on_success(); - os_ << " n" << that << " -> n" << successor << ";\n"; - Visit(successor); -} - -#endif // DEBUG - -void DotPrinter::DotPrint(const char* label, RegExpNode* node) { -#ifdef DEBUG - StdoutStream os; - DotPrinterImpl printer(os); - printer.PrintNode(label, node); -#endif // DEBUG -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-dotprinter.h b/js/src/new-regexp/regexp-dotprinter.h deleted file mode 100644 index 0bd03e77f..000000000 --- a/js/src/new-regexp/regexp-dotprinter.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_DOTPRINTER_H_ -#define V8_REGEXP_REGEXP_DOTPRINTER_H_ - -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -class RegExpNode; - -class DotPrinter final : public AllStatic { - public: - static void DotPrint(const char* label, RegExpNode* node); -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_DOTPRINTER_H_ diff --git a/js/src/new-regexp/regexp-error.cc b/js/src/new-regexp/regexp-error.cc deleted file mode 100644 index 9db98d4b8..000000000 --- a/js/src/new-regexp/regexp-error.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-error.h" - -namespace v8 { -namespace internal { - -const char* kRegExpErrorStrings[] = { -#define TEMPLATE(NAME, STRING) STRING, - REGEXP_ERROR_MESSAGES(TEMPLATE) -#undef TEMPLATE -}; - -const char* RegExpErrorString(RegExpError error) { - DCHECK_LT(error, RegExpError::NumErrors); - return kRegExpErrorStrings[static_cast(error)]; -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-error.h b/js/src/new-regexp/regexp-error.h deleted file mode 100644 index 4b495f07d..000000000 --- a/js/src/new-regexp/regexp-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_ERROR_H_ -#define V8_REGEXP_REGEXP_ERROR_H_ - -#include "regexp-shim.h" - -namespace v8 { -namespace internal { - -#define REGEXP_ERROR_MESSAGES(T) \ - T(None, "") \ - T(StackOverflow, "Maximum call stack size exceeded") \ - T(AnalysisStackOverflow, "Stack overflow") \ - T(TooLarge, "Regular expression too large") \ - T(UnterminatedGroup, "Unterminated group") \ - T(UnmatchedParen, "Unmatched ')'") \ - T(EscapeAtEndOfPattern, "\\ at end of pattern") \ - T(InvalidPropertyName, "Invalid property name") \ - T(InvalidEscape, "Invalid escape") \ - T(InvalidDecimalEscape, "Invalid decimal escape") \ - T(InvalidUnicodeEscape, "Invalid Unicode escape") \ - T(NothingToRepeat, "Nothing to repeat") \ - T(LoneQuantifierBrackets, "Lone quantifier brackets") \ - T(RangeOutOfOrder, "numbers out of order in {} quantifier") \ - T(IncompleteQuantifier, "Incomplete quantifier") \ - T(InvalidQuantifier, "Invalid quantifier") \ - T(InvalidGroup, "Invalid group") \ - T(MultipleFlagDashes, "Multiple dashes in flag group") \ - T(RepeatedFlag, "Repeated flag in flag group") \ - T(InvalidFlagGroup, "Invalid flag group") \ - T(TooManyCaptures, "Too many captures") \ - T(InvalidCaptureGroupName, "Invalid capture group name") \ - T(DuplicateCaptureGroupName, "Duplicate capture group name") \ - T(InvalidNamedReference, "Invalid named reference") \ - T(InvalidNamedCaptureReference, "Invalid named capture referenced") \ - T(InvalidClassEscape, "Invalid class escape") \ - T(InvalidClassPropertyName, "Invalid property name in character class") \ - T(InvalidCharacterClass, "Invalid character class") \ - T(UnterminatedCharacterClass, "Unterminated character class") \ - T(OutOfOrderCharacterClass, "Range out of order in character class") - -enum class RegExpError : uint32_t { -#define TEMPLATE(NAME, STRING) k##NAME, - REGEXP_ERROR_MESSAGES(TEMPLATE) -#undef TEMPLATE - NumErrors -}; - -V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error); - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_ERROR_H_ diff --git a/js/src/new-regexp/regexp-interpreter.cc b/js/src/new-regexp/regexp-interpreter.cc deleted file mode 100644 index 7a492fca2..000000000 --- a/js/src/new-regexp/regexp-interpreter.cc +++ /dev/null @@ -1,1039 +0,0 @@ -// Copyright 2011 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// A simple interpreter for the Irregexp byte code. - -#include "new-regexp/regexp-interpreter.h" - -#include "new-regexp/regexp-bytecodes.h" -#include "new-regexp/regexp-macro-assembler.h" -#include "new-regexp/regexp-stack.h" // For kMaximumStackSize. -#include "new-regexp/regexp.h" - -#ifdef V8_INTL_SUPPORT -#include "unicode/uchar.h" -#endif // V8_INTL_SUPPORT - -// Use token threaded dispatch iff the compiler supports computed gotos and the -// build argument v8_enable_regexp_interpreter_threaded_dispatch was set. -#if V8_HAS_COMPUTED_GOTO && \ - defined(V8_ENABLE_REGEXP_INTERPRETER_THREADED_DISPATCH) -#define V8_USE_COMPUTED_GOTO 1 -#endif // V8_HAS_COMPUTED_GOTO - - -namespace v8 { -namespace internal { - -namespace { - -bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector subject) { - Address offset_a = - reinterpret_cast
(const_cast(&subject.at(from))); - Address offset_b = - reinterpret_cast
(const_cast(&subject.at(current))); - size_t length = len * kUC16Size; - return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b, - length, isolate) == 1; -} - -bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector subject) { - // For Latin1 characters the unicode flag makes no difference. - for (int i = 0; i < len; i++) { - unsigned int old_char = subject[from++]; - unsigned int new_char = subject[current++]; - if (old_char == new_char) continue; - // Convert both characters to lower case. - old_char |= 0x20; - new_char |= 0x20; - if (old_char != new_char) return false; - // Not letters in the ASCII range and Latin-1 range. - if (!(old_char - 'a' <= 'z' - 'a') && - !(old_char - 224 <= 254 - 224 && old_char != 247)) { - return false; - } - } - return true; -} - -#ifdef DEBUG -void MaybeTraceInterpreter(const byte* code_base, const byte* pc, - int stack_depth, int current_position, - uint32_t current_char, int bytecode_length, - const char* bytecode_name) { - if (FLAG_trace_regexp_bytecodes) { - const bool printable = std::isprint(current_char); - const char* format = - printable - ? "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = " - : "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = "; - PrintF(format, pc - code_base, stack_depth, current_position, current_char, - printable ? current_char : '.'); - - RegExpBytecodeDisassembleSingle(code_base, pc); - } -} -#endif // DEBUG - -int32_t Load32Aligned(const byte* pc) { - DCHECK_EQ(0, reinterpret_cast(pc) & 3); - return *reinterpret_cast(pc); -} - -// TODO(jgruber): Rename to Load16AlignedUnsigned. -uint32_t Load16Aligned(const byte* pc) { - DCHECK_EQ(0, reinterpret_cast(pc) & 1); - return *reinterpret_cast(pc); -} - -int32_t Load16AlignedSigned(const byte* pc) { - DCHECK_EQ(0, reinterpret_cast(pc) & 1); - return *reinterpret_cast(pc); -} - -// A simple abstraction over the backtracking stack used by the interpreter. -// -// Despite the name 'backtracking' stack, it's actually used as a generic stack -// that stores both program counters (= offsets into the bytecode) and generic -// integer values. -class BacktrackStack { - public: - BacktrackStack() = default; - - V8_WARN_UNUSED_RESULT bool push(int v) { - data_.emplace_back(v); - return (static_cast(data_.size()) <= kMaxSize); - } - int peek() const { - DCHECK(!data_.empty()); - return data_.back(); - } - int pop() { - int v = peek(); - data_.pop_back(); - return v; - } - - // The 'sp' is the index of the first empty element in the stack. - int sp() const { return static_cast(data_.size()); } - void set_sp(int new_sp) { - DCHECK_LE(new_sp, sp()); - data_.resize_no_init(new_sp); - } - - private: - // Semi-arbitrary. Should be large enough for common cases to remain in the - // static stack-allocated backing store, but small enough not to waste space. - static constexpr int kStaticCapacity = 64; - - using ValueT = int; - base::SmallVector data_; - - static constexpr int kMaxSize = - RegExpStack::kMaximumStackSize / sizeof(ValueT); - - DISALLOW_COPY_AND_ASSIGN(BacktrackStack); -}; - -IrregexpInterpreter::Result ThrowStackOverflow(Isolate* isolate, - RegExp::CallOrigin call_origin) { - CHECK(call_origin == RegExp::CallOrigin::kFromRuntime); - // We abort interpreter execution after the stack overflow is thrown, and thus - // allow allocation here despite the outer DisallowHeapAllocationScope. - AllowHeapAllocation yes_gc; - isolate->StackOverflow(); - return IrregexpInterpreter::EXCEPTION; -} - -// Only throws if called from the runtime, otherwise just returns the EXCEPTION -// status code. -IrregexpInterpreter::Result MaybeThrowStackOverflow( - Isolate* isolate, RegExp::CallOrigin call_origin) { - if (call_origin == RegExp::CallOrigin::kFromRuntime) { - return ThrowStackOverflow(isolate, call_origin); - } else { - return IrregexpInterpreter::EXCEPTION; - } -} - -template -void UpdateCodeAndSubjectReferences( - Isolate* isolate, Handle code_array, - Handle subject_string, ByteArray* code_array_out, - const byte** code_base_out, const byte** pc_out, String* subject_string_out, - Vector* subject_string_vector_out) { - DisallowHeapAllocation no_gc; - - if (*code_base_out != code_array->GetDataStartAddress()) { - *code_array_out = *code_array; - const intptr_t pc_offset = *pc_out - *code_base_out; - DCHECK_GT(pc_offset, 0); - *code_base_out = code_array->GetDataStartAddress(); - *pc_out = *code_base_out + pc_offset; - } - - DCHECK(subject_string->IsFlat()); - *subject_string_out = *subject_string; - *subject_string_vector_out = subject_string->GetCharVector(no_gc); -} - -// Runs all pending interrupts and updates unhandlified object references if -// necessary. -template -IrregexpInterpreter::Result HandleInterrupts( - Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out, - String* subject_string_out, const byte** code_base_out, - Vector* subject_string_vector_out, const byte** pc_out) { - DisallowHeapAllocation no_gc; - - StackLimitCheck check(isolate); - bool js_has_overflowed = check.JsHasOverflowed(); - - if (call_origin == RegExp::CallOrigin::kFromJs) { - // Direct calls from JavaScript can be interrupted in two ways: - // 1. A real stack overflow, in which case we let the caller throw the - // exception. - // 2. The stack guard was used to interrupt execution for another purpose, - // forcing the call through the runtime system. - if (js_has_overflowed) { - return IrregexpInterpreter::EXCEPTION; - } else if (check.InterruptRequested()) { - return IrregexpInterpreter::RETRY; - } - } else { - DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime); - // Prepare for possible GC. - HandleScope handles(isolate); - Handle code_handle(*code_array_out, isolate); - Handle subject_handle(*subject_string_out, isolate); - - if (js_has_overflowed) { - return ThrowStackOverflow(isolate, call_origin); - } else if (check.InterruptRequested()) { - const bool was_one_byte = - String::IsOneByteRepresentationUnderneath(*subject_string_out); - Object result; - { - AllowHeapAllocation yes_gc; - result = isolate->stack_guard()->HandleInterrupts(); - } - if (result.IsException(isolate)) { - return IrregexpInterpreter::EXCEPTION; - } - - // If we changed between a LATIN1 and a UC16 string, we need to restart - // regexp matching with the appropriate template instantiation of - // RawMatch. - if (String::IsOneByteRepresentationUnderneath(*subject_handle) != - was_one_byte) { - return IrregexpInterpreter::RETRY; - } - - UpdateCodeAndSubjectReferences( - isolate, code_handle, subject_handle, code_array_out, code_base_out, - pc_out, subject_string_out, subject_string_vector_out); - } - } - - return IrregexpInterpreter::SUCCESS; -} - -bool CheckBitInTable(const uint32_t current_char, const byte* const table) { - int mask = RegExpMacroAssembler::kTableMask; - int b = table[(current_char & mask) >> kBitsPerByteLog2]; - int bit = (current_char & (kBitsPerByte - 1)); - return (b & (1 << bit)) != 0; -} - -// If computed gotos are supported by the compiler, we can get addresses to -// labels directly in C/C++. Every bytecode handler has its own label and we -// store the addresses in a dispatch table indexed by bytecode. To execute the -// next handler we simply jump (goto) directly to its address. -#if V8_USE_COMPUTED_GOTO -#define BC_LABEL(name) BC_##name: -#define DECODE() \ - do { \ - next_insn = Load32Aligned(next_pc); \ - next_handler_addr = dispatch_table[next_insn & BYTECODE_MASK]; \ - } while (false) -#define DISPATCH() \ - pc = next_pc; \ - insn = next_insn; \ - goto* next_handler_addr -// Without computed goto support, we fall back to a simple switch-based -// dispatch (A large switch statement inside a loop with a case for every -// bytecode). -#else // V8_USE_COMPUTED_GOTO -#define BC_LABEL(name) case BC_##name: -#define DECODE() next_insn = Load32Aligned(next_pc) -#define DISPATCH() \ - pc = next_pc; \ - insn = next_insn; \ - goto switch_dispatch_continuation -#endif // V8_USE_COMPUTED_GOTO - -// ADVANCE/SET_PC_FROM_OFFSET are separated from DISPATCH, because ideally some -// instructions can be executed between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH. -// We want those two macros as far apart as possible, because the goto in -// DISPATCH is dependent on a memory load in ADVANCE/SET_PC_FROM_OFFSET. If we -// don't hit the cache and have to fetch the next handler address from physical -// memory, instructions between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH can -// potentially be executed unconditionally, reducing memory stall. -#define ADVANCE(name) \ - next_pc = pc + RegExpBytecodeLength(BC_##name); \ - DECODE() -#define SET_PC_FROM_OFFSET(offset) \ - next_pc = code_base + offset; \ - DECODE() - -#ifdef DEBUG -#define BYTECODE(name) \ - BC_LABEL(name) \ - MaybeTraceInterpreter(code_base, pc, backtrack_stack.sp(), current, \ - current_char, RegExpBytecodeLength(BC_##name), #name); -#else -#define BYTECODE(name) BC_LABEL(name) -#endif // DEBUG - -template -IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, - String subject_string, - Vector subject, int* registers, - int current, uint32_t current_char, - RegExp::CallOrigin call_origin, - const uint32_t backtrack_limit) { - DisallowHeapAllocation no_gc; - -#if V8_USE_COMPUTED_GOTO - -// We have to make sure that no OOB access to the dispatch table is possible and -// all values are valid label addresses. -// Otherwise jumps to arbitrary addresses could potentially happen. -// This is ensured as follows: -// Every index to the dispatch table gets masked using BYTECODE_MASK in -// DECODE(). This way we can only get values between 0 (only the least -// significant byte of an integer is used) and kRegExpPaddedBytecodeCount - 1 -// (BYTECODE_MASK is defined to be exactly this value). -// All entries from kRegExpBytecodeCount to kRegExpPaddedBytecodeCount have to -// be filled with BREAKs (invalid operation). - -// Fill dispatch table from last defined bytecode up to the next power of two -// with BREAK (invalid operation). -// TODO(pthier): Find a way to fill up automatically (at compile time) -// 59 real bytecodes -> 5 fillers -#define BYTECODE_FILLER_ITERATOR(V) \ - V(BREAK) /* 1 */ \ - V(BREAK) /* 2 */ \ - V(BREAK) /* 3 */ \ - V(BREAK) /* 4 */ \ - V(BREAK) /* 5 */ - -#define COUNT(...) +1 - static constexpr int kRegExpBytecodeFillerCount = - BYTECODE_FILLER_ITERATOR(COUNT); -#undef COUNT - - // Make sure kRegExpPaddedBytecodeCount is actually the closest possible power - // of two. - DCHECK_EQ(kRegExpPaddedBytecodeCount, - base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount)); - - // Make sure every bytecode we get by using BYTECODE_MASK is well defined. - STATIC_ASSERT(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount); - STATIC_ASSERT(kRegExpBytecodeCount + kRegExpBytecodeFillerCount == - kRegExpPaddedBytecodeCount); - -#define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name, - static const void* const dispatch_table[kRegExpPaddedBytecodeCount] = { - BYTECODE_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY) - BYTECODE_FILLER_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY)}; -#undef DECLARE_DISPATCH_TABLE_ENTRY -#undef BYTECODE_FILLER_ITERATOR - -#endif // V8_USE_COMPUTED_GOTO - - const byte* pc = code_array.GetDataStartAddress(); - const byte* code_base = pc; - - BacktrackStack backtrack_stack; - - uint32_t backtrack_count = 0; - -#ifdef DEBUG - if (FLAG_trace_regexp_bytecodes) { - PrintF("\n\nStart bytecode interpreter\n\n"); - } -#endif - - while (true) { - const byte* next_pc = pc; - int32_t insn; - int32_t next_insn; -#if V8_USE_COMPUTED_GOTO - const void* next_handler_addr; - DECODE(); - DISPATCH(); -#else - insn = Load32Aligned(pc); - switch (insn & BYTECODE_MASK) { -#endif // V8_USE_COMPUTED_GOTO - BYTECODE(BREAK) { UNREACHABLE(); } - BYTECODE(PUSH_CP) { - ADVANCE(PUSH_CP); - if (!backtrack_stack.push(current)) { - return MaybeThrowStackOverflow(isolate, call_origin); - } - DISPATCH(); - } - BYTECODE(PUSH_BT) { - ADVANCE(PUSH_BT); - if (!backtrack_stack.push(Load32Aligned(pc + 4))) { - return MaybeThrowStackOverflow(isolate, call_origin); - } - DISPATCH(); - } - BYTECODE(PUSH_REGISTER) { - ADVANCE(PUSH_REGISTER); - if (!backtrack_stack.push(registers[insn >> BYTECODE_SHIFT])) { - return MaybeThrowStackOverflow(isolate, call_origin); - } - DISPATCH(); - } - BYTECODE(SET_REGISTER) { - ADVANCE(SET_REGISTER); - registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4); - DISPATCH(); - } - BYTECODE(ADVANCE_REGISTER) { - ADVANCE(ADVANCE_REGISTER); - registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4); - DISPATCH(); - } - BYTECODE(SET_REGISTER_TO_CP) { - ADVANCE(SET_REGISTER_TO_CP); - registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4); - DISPATCH(); - } - BYTECODE(SET_CP_TO_REGISTER) { - ADVANCE(SET_CP_TO_REGISTER); - current = registers[insn >> BYTECODE_SHIFT]; - DISPATCH(); - } - BYTECODE(SET_REGISTER_TO_SP) { - ADVANCE(SET_REGISTER_TO_SP); - registers[insn >> BYTECODE_SHIFT] = backtrack_stack.sp(); - DISPATCH(); - } - BYTECODE(SET_SP_TO_REGISTER) { - ADVANCE(SET_SP_TO_REGISTER); - backtrack_stack.set_sp(registers[insn >> BYTECODE_SHIFT]); - DISPATCH(); - } - BYTECODE(POP_CP) { - ADVANCE(POP_CP); - current = backtrack_stack.pop(); - DISPATCH(); - } - BYTECODE(POP_BT) { - STATIC_ASSERT(JSRegExp::kNoBacktrackLimit == 0); - if (++backtrack_count == backtrack_limit) { - // Exceeded limits are treated as a failed match. - return IrregexpInterpreter::FAILURE; - } - - IrregexpInterpreter::Result return_code = - HandleInterrupts(isolate, call_origin, &code_array, &subject_string, - &code_base, &subject, &pc); - if (return_code != IrregexpInterpreter::SUCCESS) return return_code; - - SET_PC_FROM_OFFSET(backtrack_stack.pop()); - DISPATCH(); - } - BYTECODE(POP_REGISTER) { - ADVANCE(POP_REGISTER); - registers[insn >> BYTECODE_SHIFT] = backtrack_stack.pop(); - DISPATCH(); - } - BYTECODE(FAIL) { - isolate->counters()->regexp_backtracks()->AddSample( - static_cast(backtrack_count)); - return IrregexpInterpreter::FAILURE; - } - BYTECODE(SUCCEED) { - isolate->counters()->regexp_backtracks()->AddSample( - static_cast(backtrack_count)); - return IrregexpInterpreter::SUCCESS; - } - BYTECODE(ADVANCE_CP) { - ADVANCE(ADVANCE_CP); - current += insn >> BYTECODE_SHIFT; - DISPATCH(); - } - BYTECODE(GOTO) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - BYTECODE(ADVANCE_CP_AND_GOTO) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - current += insn >> BYTECODE_SHIFT; - DISPATCH(); - } - BYTECODE(CHECK_GREEDY) { - if (current == backtrack_stack.peek()) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - backtrack_stack.pop(); - } else { - ADVANCE(CHECK_GREEDY); - } - DISPATCH(); - } - BYTECODE(LOAD_CURRENT_CHAR) { - int pos = current + (insn >> BYTECODE_SHIFT); - if (pos >= subject.length() || pos < 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(LOAD_CURRENT_CHAR); - current_char = subject[pos]; - } - DISPATCH(); - } - BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { - ADVANCE(LOAD_CURRENT_CHAR_UNCHECKED); - int pos = current + (insn >> BYTECODE_SHIFT); - current_char = subject[pos]; - DISPATCH(); - } - BYTECODE(LOAD_2_CURRENT_CHARS) { - int pos = current + (insn >> BYTECODE_SHIFT); - if (pos + 2 > subject.length() || pos < 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(LOAD_2_CURRENT_CHARS); - Char next = subject[pos + 1]; - current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); - } - DISPATCH(); - } - BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { - ADVANCE(LOAD_2_CURRENT_CHARS_UNCHECKED); - int pos = current + (insn >> BYTECODE_SHIFT); - Char next = subject[pos + 1]; - current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); - DISPATCH(); - } - BYTECODE(LOAD_4_CURRENT_CHARS) { - DCHECK_EQ(1, sizeof(Char)); - int pos = current + (insn >> BYTECODE_SHIFT); - if (pos + 4 > subject.length() || pos < 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(LOAD_4_CURRENT_CHARS); - Char next1 = subject[pos + 1]; - Char next2 = subject[pos + 2]; - Char next3 = subject[pos + 3]; - current_char = - (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24)); - } - DISPATCH(); - } - BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) { - ADVANCE(LOAD_4_CURRENT_CHARS_UNCHECKED); - DCHECK_EQ(1, sizeof(Char)); - int pos = current + (insn >> BYTECODE_SHIFT); - Char next1 = subject[pos + 1]; - Char next2 = subject[pos + 2]; - Char next3 = subject[pos + 3]; - current_char = - (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24)); - DISPATCH(); - } - BYTECODE(CHECK_4_CHARS) { - uint32_t c = Load32Aligned(pc + 4); - if (c == current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(CHECK_4_CHARS); - } - DISPATCH(); - } - BYTECODE(CHECK_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); - if (c == current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_CHAR); - } - DISPATCH(); - } - BYTECODE(CHECK_NOT_4_CHARS) { - uint32_t c = Load32Aligned(pc + 4); - if (c != current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(CHECK_NOT_4_CHARS); - } - DISPATCH(); - } - BYTECODE(CHECK_NOT_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); - if (c != current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_NOT_CHAR); - } - DISPATCH(); - } - BYTECODE(AND_CHECK_4_CHARS) { - uint32_t c = Load32Aligned(pc + 4); - if (c == (current_char & Load32Aligned(pc + 8))) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); - } else { - ADVANCE(AND_CHECK_4_CHARS); - } - DISPATCH(); - } - BYTECODE(AND_CHECK_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); - if (c == (current_char & Load32Aligned(pc + 4))) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(AND_CHECK_CHAR); - } - DISPATCH(); - } - BYTECODE(AND_CHECK_NOT_4_CHARS) { - uint32_t c = Load32Aligned(pc + 4); - if (c != (current_char & Load32Aligned(pc + 8))) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); - } else { - ADVANCE(AND_CHECK_NOT_4_CHARS); - } - DISPATCH(); - } - BYTECODE(AND_CHECK_NOT_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); - if (c != (current_char & Load32Aligned(pc + 4))) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(AND_CHECK_NOT_CHAR); - } - DISPATCH(); - } - BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); - uint32_t minus = Load16Aligned(pc + 4); - uint32_t mask = Load16Aligned(pc + 6); - if (c != ((current_char - minus) & mask)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(MINUS_AND_CHECK_NOT_CHAR); - } - DISPATCH(); - } - BYTECODE(CHECK_CHAR_IN_RANGE) { - uint32_t from = Load16Aligned(pc + 4); - uint32_t to = Load16Aligned(pc + 6); - if (from <= current_char && current_char <= to) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(CHECK_CHAR_IN_RANGE); - } - DISPATCH(); - } - BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { - uint32_t from = Load16Aligned(pc + 4); - uint32_t to = Load16Aligned(pc + 6); - if (from > current_char || current_char > to) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(CHECK_CHAR_NOT_IN_RANGE); - } - DISPATCH(); - } - BYTECODE(CHECK_BIT_IN_TABLE) { - if (CheckBitInTable(current_char, pc + 8)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_BIT_IN_TABLE); - } - DISPATCH(); - } - BYTECODE(CHECK_LT) { - uint32_t limit = (insn >> BYTECODE_SHIFT); - if (current_char < limit) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_LT); - } - DISPATCH(); - } - BYTECODE(CHECK_GT) { - uint32_t limit = (insn >> BYTECODE_SHIFT); - if (current_char > limit) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_GT); - } - DISPATCH(); - } - BYTECODE(CHECK_REGISTER_LT) { - if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(CHECK_REGISTER_LT); - } - DISPATCH(); - } - BYTECODE(CHECK_REGISTER_GE) { - if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } else { - ADVANCE(CHECK_REGISTER_GE); - } - DISPATCH(); - } - BYTECODE(CHECK_REGISTER_EQ_POS) { - if (registers[insn >> BYTECODE_SHIFT] == current) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_REGISTER_EQ_POS); - } - DISPATCH(); - } - BYTECODE(CHECK_NOT_REGS_EQUAL) { - if (registers[insn >> BYTECODE_SHIFT] == - registers[Load32Aligned(pc + 4)]) { - ADVANCE(CHECK_NOT_REGS_EQUAL); - } else { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - } - DISPATCH(); - } - BYTECODE(CHECK_NOT_BACK_REF) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current + len > subject.length() || - CompareChars(&subject[from], &subject[current], len) != 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current += len; - } - ADVANCE(CHECK_NOT_BACK_REF); - DISPATCH(); - } - BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current - len < 0 || - CompareChars(&subject[from], &subject[current - len], len) != 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current -= len; - } - ADVANCE(CHECK_NOT_BACK_REF_BACKWARD); - DISPATCH(); - } - BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { - UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. - } - BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current + len > subject.length() || - !BackRefMatchesNoCase(isolate, from, current, len, subject)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current += len; - } - ADVANCE(CHECK_NOT_BACK_REF_NO_CASE); - DISPATCH(); - } - BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) { - UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. - } - BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current - len < 0 || - !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current -= len; - } - ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD); - DISPATCH(); - } - BYTECODE(CHECK_AT_START) { - if (current + (insn >> BYTECODE_SHIFT) == 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_AT_START); - } - DISPATCH(); - } - BYTECODE(CHECK_NOT_AT_START) { - if (current + (insn >> BYTECODE_SHIFT) == 0) { - ADVANCE(CHECK_NOT_AT_START); - } else { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } - DISPATCH(); - } - BYTECODE(SET_CURRENT_POSITION_FROM_END) { - ADVANCE(SET_CURRENT_POSITION_FROM_END); - int by = static_cast(insn) >> BYTECODE_SHIFT; - if (subject.length() - current > by) { - current = subject.length() - by; - current_char = subject[current - 1]; - } - DISPATCH(); - } - BYTECODE(CHECK_CURRENT_POSITION) { - int pos = current + (insn >> BYTECODE_SHIFT); - if (pos > subject.length() || pos < 0) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - } else { - ADVANCE(CHECK_CURRENT_POSITION); - } - DISPATCH(); - } - BYTECODE(SKIP_UNTIL_CHAR) { - int load_offset = (insn >> BYTECODE_SHIFT); - int32_t advance = Load16AlignedSigned(pc + 4); - uint32_t c = Load16Aligned(pc + 6); - while (static_cast(current + load_offset) < - static_cast(subject.length())) { - current_char = subject[current + load_offset]; - if (c == current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); - DISPATCH(); - } - current += advance; - } - SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); - DISPATCH(); - } - BYTECODE(SKIP_UNTIL_CHAR_AND) { - int load_offset = (insn >> BYTECODE_SHIFT); - int32_t advance = Load16AlignedSigned(pc + 4); - uint16_t c = Load16Aligned(pc + 6); - uint32_t mask = Load32Aligned(pc + 8); - int32_t maximum_offset = Load32Aligned(pc + 12); - while (static_cast(current + maximum_offset) <= - static_cast(subject.length())) { - current_char = subject[current + load_offset]; - if (c == (current_char & mask)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); - DISPATCH(); - } - current += advance; - } - SET_PC_FROM_OFFSET(Load32Aligned(pc + 20)); - DISPATCH(); - } - BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) { - int load_offset = (insn >> BYTECODE_SHIFT); - int32_t advance = Load16AlignedSigned(pc + 4); - uint16_t c = Load16Aligned(pc + 6); - int32_t maximum_offset = Load32Aligned(pc + 8); - while (static_cast(current + maximum_offset) <= - static_cast(subject.length())) { - current_char = subject[current + load_offset]; - if (c == current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); - DISPATCH(); - } - current += advance; - } - SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); - DISPATCH(); - } - BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) { - int load_offset = (insn >> BYTECODE_SHIFT); - int32_t advance = Load16AlignedSigned(pc + 4); - const byte* table = pc + 8; - while (static_cast(current + load_offset) < - static_cast(subject.length())) { - current_char = subject[current + load_offset]; - if (CheckBitInTable(current_char, table)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); - DISPATCH(); - } - current += advance; - } - SET_PC_FROM_OFFSET(Load32Aligned(pc + 28)); - DISPATCH(); - } - BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) { - int load_offset = (insn >> BYTECODE_SHIFT); - int32_t advance = Load16AlignedSigned(pc + 4); - uint16_t limit = Load16Aligned(pc + 6); - const byte* table = pc + 8; - while (static_cast(current + load_offset) < - static_cast(subject.length())) { - current_char = subject[current + load_offset]; - if (current_char > limit) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); - DISPATCH(); - } - if (!CheckBitInTable(current_char, table)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); - DISPATCH(); - } - current += advance; - } - SET_PC_FROM_OFFSET(Load32Aligned(pc + 28)); - DISPATCH(); - } - BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) { - int load_offset = (insn >> BYTECODE_SHIFT); - int32_t advance = Load32Aligned(pc + 4); - uint16_t c = Load16Aligned(pc + 8); - uint16_t c2 = Load16Aligned(pc + 10); - while (static_cast(current + load_offset) < - static_cast(subject.length())) { - current_char = subject[current + load_offset]; - // The two if-statements below are split up intentionally, as combining - // them seems to result in register allocation behaving quite - // differently and slowing down the resulting code. - if (c == current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); - DISPATCH(); - } - if (c2 == current_char) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); - DISPATCH(); - } - current += advance; - } - SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); - DISPATCH(); - } -#if V8_USE_COMPUTED_GOTO -// Lint gets confused a lot if we just use !V8_USE_COMPUTED_GOTO or ifndef -// V8_USE_COMPUTED_GOTO here. -#else - default: - UNREACHABLE(); - } - // Label we jump to in DISPATCH(). There must be no instructions between the - // end of the switch, this label and the end of the loop. - switch_dispatch_continuation : {} -#endif // V8_USE_COMPUTED_GOTO - } -} - -#undef BYTECODE -#undef DISPATCH -#undef DECODE -#undef SET_PC_FROM_OFFSET -#undef ADVANCE -#undef BC_LABEL -#undef V8_USE_COMPUTED_GOTO - -} // namespace - -// static -IrregexpInterpreter::Result IrregexpInterpreter::Match( - Isolate* isolate, JSRegExp regexp, String subject_string, int* registers, - int registers_length, int start_position, RegExp::CallOrigin call_origin) { - if (FLAG_regexp_tier_up) { - regexp.TierUpTick(); - } - - bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string); - ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte)); - - return MatchInternal(isolate, code_array, subject_string, registers, - registers_length, start_position, call_origin, - regexp.BacktrackLimit()); -} - -IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( - Isolate* isolate, ByteArray code_array, String subject_string, - int* registers, int registers_length, int start_position, - RegExp::CallOrigin call_origin, uint32_t backtrack_limit) { - DCHECK(subject_string.IsFlat()); - - // Note: Heap allocation *is* allowed in two situations if calling from - // Runtime: - // 1. When creating & throwing a stack overflow exception. The interpreter - // aborts afterwards, and thus possible-moved objects are never used. - // 2. When handling interrupts. We manually relocate unhandlified references - // after interrupts have run. - DisallowHeapAllocation no_gc; - - // Reset registers to -1 (=undefined). - // This is necessary because registers are only written when a - // capture group matched. - // Resetting them ensures that previous matches are cleared. - memset(registers, -1, sizeof(registers[0]) * registers_length); - - uc16 previous_char = '\n'; - String::FlatContent subject_content = subject_string.GetFlatContent(no_gc); - if (subject_content.IsOneByte()) { - Vector subject_vector = subject_content.ToOneByteVector(); - if (start_position != 0) previous_char = subject_vector[start_position - 1]; - return RawMatch(isolate, code_array, subject_string, subject_vector, - registers, start_position, previous_char, call_origin, - backtrack_limit); - } else { - DCHECK(subject_content.IsTwoByte()); - Vector subject_vector = subject_content.ToUC16Vector(); - if (start_position != 0) previous_char = subject_vector[start_position - 1]; - return RawMatch(isolate, code_array, subject_string, subject_vector, - registers, start_position, previous_char, call_origin, - backtrack_limit); - } -} - -#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER - -// This method is called through an external reference from RegExpExecInternal -// builtin. -IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( - Address subject, int32_t start_position, Address, Address, int* registers, - int32_t registers_length, Address, RegExp::CallOrigin call_origin, - Isolate* isolate, Address regexp) { - DCHECK_NOT_NULL(isolate); - DCHECK_NOT_NULL(registers); - DCHECK(call_origin == RegExp::CallOrigin::kFromJs); - - DisallowHeapAllocation no_gc; - DisallowJavascriptExecution no_js(isolate); - - String subject_string = String::cast(Object(subject)); - JSRegExp regexp_obj = JSRegExp::cast(Object(regexp)); - - if (regexp_obj.MarkedForTierUp()) { - // Returning RETRY will re-enter through runtime, where actual recompilation - // for tier-up takes place. - return IrregexpInterpreter::RETRY; - } - - return Match(isolate, regexp_obj, subject_string, registers, registers_length, - start_position, call_origin); -} - -#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER - -IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime( - Isolate* isolate, Handle regexp, Handle subject_string, - int* registers, int registers_length, int start_position) { - return Match(isolate, *regexp, *subject_string, registers, registers_length, - start_position, RegExp::CallOrigin::kFromRuntime); -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-interpreter.h b/js/src/new-regexp/regexp-interpreter.h deleted file mode 100644 index b4c0da2b7..000000000 --- a/js/src/new-regexp/regexp-interpreter.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2011 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// A simple interpreter for the Irregexp byte code. - -#ifndef V8_REGEXP_REGEXP_INTERPRETER_H_ -#define V8_REGEXP_REGEXP_INTERPRETER_H_ - -#include "new-regexp/regexp.h" - -namespace v8 { -namespace internal { - -class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic { - public: - enum Result { - FAILURE = RegExp::kInternalRegExpFailure, - SUCCESS = RegExp::kInternalRegExpSuccess, - EXCEPTION = RegExp::kInternalRegExpException, - RETRY = RegExp::kInternalRegExpRetry, - }; - - // In case a StackOverflow occurs, a StackOverflowException is created and - // EXCEPTION is returned. - static Result MatchForCallFromRuntime(Isolate* isolate, - Handle regexp, - Handle subject_string, - int* registers, int registers_length, - int start_position); - - // In case a StackOverflow occurs, EXCEPTION is returned. The caller is - // responsible for creating the exception. - // RETRY is returned if a retry through the runtime is needed (e.g. when - // interrupts have been scheduled or the regexp is marked for tier-up). - // Arguments input_start, input_end and backtrack_stack are - // unused. They are only passed to match the signature of the native irregex - // code. - static Result MatchForCallFromJs(Address subject, int32_t start_position, - Address input_start, Address input_end, - int* registers, int32_t registers_length, - Address backtrack_stack, - RegExp::CallOrigin call_origin, - Isolate* isolate, Address regexp); - - static Result MatchInternal(Isolate* isolate, ByteArray code_array, - String subject_string, int* registers, - int registers_length, int start_position, - RegExp::CallOrigin call_origin, - uint32_t backtrack_limit); - - private: - static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string, - int* registers, int registers_length, int start_position, - RegExp::CallOrigin call_origin); -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_INTERPRETER_H_ diff --git a/js/src/new-regexp/regexp-macro-assembler-arch.h b/js/src/new-regexp/regexp-macro-assembler-arch.h deleted file mode 100644 index 8aeb8c433..000000000 --- a/js/src/new-regexp/regexp-macro-assembler-arch.h +++ /dev/null @@ -1,291 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim: set ts=8 sts=2 et sw=2 tw=80: - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// This file implements the NativeRegExpMacroAssembler interface for -// SpiderMonkey. It provides the same interface as each of V8's -// architecture-specific implementations. - -#ifndef RegexpMacroAssemblerArch_h -#define RegexpMacroAssemblerArch_h - -#include "jit/MacroAssembler.h" -#include "new-regexp/regexp-macro-assembler.h" - -namespace v8 { -namespace internal { - -struct FrameData { - // Character position at the start of the input, stored as a - // negative offset from the end of the string (input_end_pointer_). - size_t inputStart; - - // The backtrack_stack_pointer_ register points to the top of the stack. - // This points to the bottom of the backtrack stack. - void* backtrackStackBase; - - // Copy of the input MatchPairs. - int32_t* matches; // pointer to capture array - int32_t numMatches; // size of capture array -}; - -class SMRegExpMacroAssembler final : public NativeRegExpMacroAssembler { - public: - SMRegExpMacroAssembler(JSContext* cx, Isolate* isolate, - js::jit::StackMacroAssembler& masm, Zone* zone, - Mode mode, uint32_t num_capture_registers); - virtual ~SMRegExpMacroAssembler() {} // Nothing to do here - - virtual int stack_limit_slack(); - virtual IrregexpImplementation Implementation(); - - virtual bool Succeed(); - virtual void Fail(); - - virtual void AdvanceCurrentPosition(int by); - virtual void PopCurrentPosition(); - virtual void PushCurrentPosition(); - virtual void SetCurrentPositionFromEnd(int by); - - virtual void Backtrack(); - virtual void Bind(Label* label); - virtual void GoTo(Label* label); - virtual void PushBacktrack(Label* label); - - virtual void CheckCharacter(uint32_t c, Label* on_equal); - virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); - virtual void CheckCharacterGT(uc16 limit, Label* on_greater); - virtual void CheckCharacterLT(uc16 limit, Label* on_less); - virtual void CheckCharacterAfterAnd(uint32_t c, uint32_t mask, - Label* on_equal); - virtual void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask, - Label* on_not_equal); - virtual void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask, - Label* on_not_equal); - virtual void CheckGreedyLoop(Label* on_tos_equals_current_position); - virtual void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range); - virtual void CheckCharacterNotInRange(uc16 from, uc16 to, - Label* on_not_in_range); - virtual void CheckAtStart(int cp_offset, Label* on_at_start); - virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start); - virtual void CheckPosition(int cp_offset, Label* on_outside_input); - virtual void CheckBitInTable(Handle table, Label* on_bit_set); - virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); - virtual void CheckNotBackReference(int start_reg, bool read_backward, - Label* on_no_match); - virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, - Label* on_no_match); - - virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, - bool check_bounds, int characters, - int eats_at_least); - - virtual void AdvanceRegister(int reg, int by); - virtual void IfRegisterGE(int reg, int comparand, Label* if_ge); - virtual void IfRegisterLT(int reg, int comparand, Label* if_lt); - virtual void IfRegisterEqPos(int reg, Label* if_eq); - virtual void PopRegister(int register_index); - virtual void PushRegister(int register_index, - StackCheckFlag check_stack_limit); - virtual void ReadCurrentPositionFromRegister(int reg); - virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); - virtual void ReadStackPointerFromRegister(int reg); - virtual void WriteStackPointerToRegister(int reg); - virtual void SetRegister(int register_index, int to); - virtual void ClearRegisters(int reg_from, int reg_to); - - virtual Handle GetCode(Handle source); - - private: - size_t frameSize_ = 0; - - void createStackFrame(); - void initFrameAndRegs(); - void successHandler(); - void exitHandler(); - void backtrackHandler(); - void stackOverflowHandler(); - - // Push a register on the backtrack stack. - void Push(js::jit::Register value); - - // Pop a value from the backtrack stack. - void Pop(js::jit::Register target); - - void CheckAtStartImpl(int cp_offset, Label* on_cond, - js::jit::Assembler::Condition cond); - void CheckCharacterImpl(js::jit::Imm32 c, Label* on_cond, - js::jit::Assembler::Condition cond); - void CheckCharacterAfterAndImpl(uint32_t c, uint32_t and_with, Label* on_cond, - bool negate); - void CheckCharacterInRangeImpl(uc16 from, uc16 to, Label* on_cond, - js::jit::Assembler::Condition cond); - void CheckNotBackReferenceImpl(int start_reg, bool read_backward, - Label* on_no_match, bool ignore_case); - - void LoadCurrentCharacterUnchecked(int cp_offset, int characters); - - void JumpOrBacktrack(Label* to); - - // MacroAssembler methods that take a Label can be called with a - // null label, which means that we should backtrack if we would jump - // to that label. This is a helper to avoid writing out the same - // logic a dozen times. - inline js::jit::Label* LabelOrBacktrack(Label* to) { - return to ? to->inner() : &backtrack_label_; - } - - void CheckBacktrackStackLimit(); - - static bool GrowBacktrackStack(RegExpStack* regexp_stack); - - static uint32_t CaseInsensitiveCompareStrings(const char16_t* substring1, - const char16_t* substring2, - size_t byteLength); - static uint32_t CaseInsensitiveCompareUCStrings(const char16_t* substring1, - const char16_t* substring2, - size_t byteLength); - - inline int char_size() { return static_cast(mode_); } - inline js::jit::Scale factor() { - return mode_ == UC16 ? js::jit::TimesTwo : js::jit::TimesOne; - } - - js::jit::Address inputStart() { - return js::jit::Address(masm_.getStackPointer(), - offsetof(FrameData, inputStart)); - } - js::jit::Address backtrackStackBase() { - return js::jit::Address(masm_.getStackPointer(), - offsetof(FrameData, backtrackStackBase)); - } - js::jit::Address matches() { - return js::jit::Address(masm_.getStackPointer(), - offsetof(FrameData, matches)); - } - js::jit::Address numMatches() { - return js::jit::Address(masm_.getStackPointer(), - offsetof(FrameData, numMatches)); - } - - // The stack-pointer-relative location of a regexp register. - js::jit::Address register_location(int register_index) { - return js::jit::Address(masm_.getStackPointer(), - register_offset(register_index)); - } - - int32_t register_offset(int register_index) { - MOZ_ASSERT(register_index >= 0 && register_index <= kMaxRegister); - if (num_registers_ <= register_index) { - num_registers_ = register_index + 1; - } - static_assert(alignof(uintptr_t) <= alignof(FrameData),"Regexp: Alignment of uintptr_t and FrameData mismatch"); - return sizeof(FrameData) + register_index * sizeof(uintptr_t*); - } - - JSContext* cx_; - js::jit::StackMacroAssembler& masm_; - - /* - * This assembler uses the following registers: - * - * - current_character_: - * Contains the character (or characters) currently being examined. - * Must be loaded using LoadCurrentCharacter before using any of the - * dispatch methods. After a matching pass for a global regexp, - * temporarily stores the index of capture start. - * - current_position_: - * Current position in input *as negative byte offset from end of string*. - * - input_end_pointer_: - * Points to byte after last character in the input. current_position_ is - * relative to this. - * - backtrack_stack_pointer_: - * Points to tip of the (heap-allocated) backtrack stack. The stack grows - * downward (like the native stack). - * - temp0_, temp1_, temp2_: - * Scratch registers. - * - * The native stack pointer is used to access arguments (InputOutputData), - * local variables (FrameData), and irregexp's internal virtual registers - * (see register_location). - */ - - js::jit::Register current_character_; - js::jit::Register current_position_; - js::jit::Register input_end_pointer_; - js::jit::Register backtrack_stack_pointer_; - js::jit::Register temp0_, temp1_, temp2_; - - js::jit::Label entry_label_; - js::jit::Label start_label_; - js::jit::Label backtrack_label_; - js::jit::Label success_label_; - js::jit::Label exit_label_; - js::jit::Label stack_overflow_label_; - js::jit::Label exit_with_exception_label_; - - // When we generate the code to push a backtrack label's address - // onto the backtrack stack, we don't know its final address. We - // have to patch it after linking. This is slightly delicate, as the - // Label itself (which is allocated on the stack) may not exist by - // the time we link. The approach is as follows: - // - // 1. When we push a label on the backtrack stack (PushBacktrack), - // we bind the label's patchOffset_ field to the offset within - // the code that should be overwritten. This works because each - // label is only pushed by a single instruction. - // - // 2. When we bind a label (Bind), we check to see if it has a - // bound patchOffset_. If it does, we create a LabelPatch mapping - // its patch offset to the offset of the label itself. - // - // 3. While linking the code, we walk the list of label patches - // and patch the code accordingly. - class LabelPatch { - public: - LabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset) - : patchOffset_(patchOffset), labelOffset_(labelOffset) {} - - js::jit::CodeOffset patchOffset_; - size_t labelOffset_ = 0; - }; - - js::Vector labelPatches_; - void AddLabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - if (!labelPatches_.emplaceBack(patchOffset, labelOffset)) { - oomUnsafe.crash("Irregexp label patch"); - } - } - - Mode mode_; - int num_registers_; - int num_capture_registers_; - js::jit::LiveGeneralRegisterSet savedRegisters_; - - public: - using TableVector = - js::Vector, 4, js::SystemAllocPolicy>; - TableVector& tables() { return tables_; } - - private: - TableVector tables_; - void AddTable(PseudoHandle table) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - if (!tables_.append(std::move(table))) { - oomUnsafe.crash("Irregexp table append"); - } - } -}; - -} // namespace internal -} // namespace v8 - -#endif // RegexpMacroAssemblerArch_h diff --git a/js/src/new-regexp/regexp-macro-assembler-tracer.cc b/js/src/new-regexp/regexp-macro-assembler-tracer.cc deleted file mode 100644 index 8eb587c3c..000000000 --- a/js/src/new-regexp/regexp-macro-assembler-tracer.cc +++ /dev/null @@ -1,418 +0,0 @@ -// Copyright 2012 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-macro-assembler-tracer.h" - - -namespace v8 { -namespace internal { - -RegExpMacroAssemblerTracer::RegExpMacroAssemblerTracer( - Isolate* isolate, RegExpMacroAssembler* assembler) - : RegExpMacroAssembler(isolate, assembler->zone()), assembler_(assembler) { - IrregexpImplementation type = assembler->Implementation(); - DCHECK_LT(type, 9); - const char* impl_names[] = {"IA32", "ARM", "ARM64", "MIPS", "S390", - "PPC", "X64", "X87", "Bytecode"}; - PrintF("RegExpMacroAssembler%s();\n", impl_names[type]); -} - -RegExpMacroAssemblerTracer::~RegExpMacroAssemblerTracer() = default; - -void RegExpMacroAssemblerTracer::AbortedCodeGeneration() { - PrintF(" AbortedCodeGeneration\n"); - assembler_->AbortedCodeGeneration(); -} - - -// This is used for printing out debugging information. It makes an integer -// that is closely related to the address of an object. -static int LabelToInt(Label* label) { - return static_cast(reinterpret_cast(label)); -} - - -void RegExpMacroAssemblerTracer::Bind(Label* label) { - PrintF("label[%08x]: (Bind)\n", LabelToInt(label)); - assembler_->Bind(label); -} - - -void RegExpMacroAssemblerTracer::AdvanceCurrentPosition(int by) { - PrintF(" AdvanceCurrentPosition(by=%d);\n", by); - assembler_->AdvanceCurrentPosition(by); -} - - -void RegExpMacroAssemblerTracer::CheckGreedyLoop(Label* label) { - PrintF(" CheckGreedyLoop(label[%08x]);\n\n", LabelToInt(label)); - assembler_->CheckGreedyLoop(label); -} - - -void RegExpMacroAssemblerTracer::PopCurrentPosition() { - PrintF(" PopCurrentPosition();\n"); - assembler_->PopCurrentPosition(); -} - - -void RegExpMacroAssemblerTracer::PushCurrentPosition() { - PrintF(" PushCurrentPosition();\n"); - assembler_->PushCurrentPosition(); -} - - -void RegExpMacroAssemblerTracer::Backtrack() { - PrintF(" Backtrack();\n"); - assembler_->Backtrack(); -} - - -void RegExpMacroAssemblerTracer::GoTo(Label* label) { - PrintF(" GoTo(label[%08x]);\n\n", LabelToInt(label)); - assembler_->GoTo(label); -} - - -void RegExpMacroAssemblerTracer::PushBacktrack(Label* label) { - PrintF(" PushBacktrack(label[%08x]);\n", LabelToInt(label)); - assembler_->PushBacktrack(label); -} - - -bool RegExpMacroAssemblerTracer::Succeed() { - bool restart = assembler_->Succeed(); - PrintF(" Succeed();%s\n", restart ? " [restart for global match]" : ""); - return restart; -} - - -void RegExpMacroAssemblerTracer::Fail() { - PrintF(" Fail();"); - assembler_->Fail(); -} - - -void RegExpMacroAssemblerTracer::PopRegister(int register_index) { - PrintF(" PopRegister(register=%d);\n", register_index); - assembler_->PopRegister(register_index); -} - - -void RegExpMacroAssemblerTracer::PushRegister( - int register_index, - StackCheckFlag check_stack_limit) { - PrintF(" PushRegister(register=%d, %s);\n", - register_index, - check_stack_limit ? "check stack limit" : ""); - assembler_->PushRegister(register_index, check_stack_limit); -} - - -void RegExpMacroAssemblerTracer::AdvanceRegister(int reg, int by) { - PrintF(" AdvanceRegister(register=%d, by=%d);\n", reg, by); - assembler_->AdvanceRegister(reg, by); -} - - -void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) { - PrintF(" SetCurrentPositionFromEnd(by=%d);\n", by); - assembler_->SetCurrentPositionFromEnd(by); -} - - -void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) { - PrintF(" SetRegister(register=%d, to=%d);\n", register_index, to); - assembler_->SetRegister(register_index, to); -} - - -void RegExpMacroAssemblerTracer::WriteCurrentPositionToRegister(int reg, - int cp_offset) { - PrintF(" WriteCurrentPositionToRegister(register=%d,cp_offset=%d);\n", - reg, - cp_offset); - assembler_->WriteCurrentPositionToRegister(reg, cp_offset); -} - - -void RegExpMacroAssemblerTracer::ClearRegisters(int reg_from, int reg_to) { - PrintF(" ClearRegister(from=%d, to=%d);\n", reg_from, reg_to); - assembler_->ClearRegisters(reg_from, reg_to); -} - - -void RegExpMacroAssemblerTracer::ReadCurrentPositionFromRegister(int reg) { - PrintF(" ReadCurrentPositionFromRegister(register=%d);\n", reg); - assembler_->ReadCurrentPositionFromRegister(reg); -} - - -void RegExpMacroAssemblerTracer::WriteStackPointerToRegister(int reg) { - PrintF(" WriteStackPointerToRegister(register=%d);\n", reg); - assembler_->WriteStackPointerToRegister(reg); -} - - -void RegExpMacroAssemblerTracer::ReadStackPointerFromRegister(int reg) { - PrintF(" ReadStackPointerFromRegister(register=%d);\n", reg); - assembler_->ReadStackPointerFromRegister(reg); -} - -void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl( - int cp_offset, Label* on_end_of_input, bool check_bounds, int characters, - int eats_at_least) { - const char* check_msg = check_bounds ? "" : " (unchecked)"; - PrintF( - " LoadCurrentCharacter(cp_offset=%d, label[%08x]%s (%d chars) (eats at " - "least %d));\n", - cp_offset, LabelToInt(on_end_of_input), check_msg, characters, - eats_at_least); - assembler_->LoadCurrentCharacter(cp_offset, on_end_of_input, check_bounds, - characters, eats_at_least); -} - -class PrintablePrinter { - public: - explicit PrintablePrinter(uc16 character) : character_(character) { } - - const char* operator*() { - if (character_ >= ' ' && character_ <= '~') { - buffer_[0] = '('; - buffer_[1] = static_cast(character_); - buffer_[2] = ')'; - buffer_[3] = '\0'; - } else { - buffer_[0] = '\0'; - } - return &buffer_[0]; - } - - private: - uc16 character_; - char buffer_[4]; -}; - - -void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) { - PrintablePrinter printable(limit); - PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n", - limit, - *printable, - LabelToInt(on_less)); - assembler_->CheckCharacterLT(limit, on_less); -} - - -void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit, - Label* on_greater) { - PrintablePrinter printable(limit); - PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n", - limit, - *printable, - LabelToInt(on_greater)); - assembler_->CheckCharacterGT(limit, on_greater); -} - - -void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) { - PrintablePrinter printable(c); - PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n", - c, - *printable, - LabelToInt(on_equal)); - assembler_->CheckCharacter(c, on_equal); -} - -void RegExpMacroAssemblerTracer::CheckAtStart(int cp_offset, - Label* on_at_start) { - PrintF(" CheckAtStart(cp_offset=%d, label[%08x]);\n", cp_offset, - LabelToInt(on_at_start)); - assembler_->CheckAtStart(cp_offset, on_at_start); -} - -void RegExpMacroAssemblerTracer::CheckNotAtStart(int cp_offset, - Label* on_not_at_start) { - PrintF(" CheckNotAtStart(cp_offset=%d, label[%08x]);\n", cp_offset, - LabelToInt(on_not_at_start)); - assembler_->CheckNotAtStart(cp_offset, on_not_at_start); -} - - -void RegExpMacroAssemblerTracer::CheckNotCharacter(unsigned c, - Label* on_not_equal) { - PrintablePrinter printable(c); - PrintF(" CheckNotCharacter(c=0x%04x%s, label[%08x]);\n", - c, - *printable, - LabelToInt(on_not_equal)); - assembler_->CheckNotCharacter(c, on_not_equal); -} - - -void RegExpMacroAssemblerTracer::CheckCharacterAfterAnd( - unsigned c, - unsigned mask, - Label* on_equal) { - PrintablePrinter printable(c); - PrintF(" CheckCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n", - c, - *printable, - mask, - LabelToInt(on_equal)); - assembler_->CheckCharacterAfterAnd(c, mask, on_equal); -} - - -void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd( - unsigned c, - unsigned mask, - Label* on_not_equal) { - PrintablePrinter printable(c); - PrintF(" CheckNotCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n", - c, - *printable, - mask, - LabelToInt(on_not_equal)); - assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal); -} - - -void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd( - uc16 c, - uc16 minus, - uc16 mask, - Label* on_not_equal) { - PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=%04x, mask=0x%04x, " - "label[%08x]);\n", - c, - minus, - mask, - LabelToInt(on_not_equal)); - assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal); -} - - -void RegExpMacroAssemblerTracer::CheckCharacterInRange( - uc16 from, - uc16 to, - Label* on_not_in_range) { - PrintablePrinter printable_from(from); - PrintablePrinter printable_to(to); - PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n", - from, - *printable_from, - to, - *printable_to, - LabelToInt(on_not_in_range)); - assembler_->CheckCharacterInRange(from, to, on_not_in_range); -} - - -void RegExpMacroAssemblerTracer::CheckCharacterNotInRange( - uc16 from, - uc16 to, - Label* on_in_range) { - PrintablePrinter printable_from(from); - PrintablePrinter printable_to(to); - PrintF( - " CheckCharacterNotInRange(from=0x%04x%s," " to=%04x%s, label[%08x]);\n", - from, - *printable_from, - to, - *printable_to, - LabelToInt(on_in_range)); - assembler_->CheckCharacterNotInRange(from, to, on_in_range); -} - - -void RegExpMacroAssemblerTracer::CheckBitInTable( - Handle table, Label* on_bit_set) { - PrintF(" CheckBitInTable(label[%08x] ", LabelToInt(on_bit_set)); - for (int i = 0; i < kTableSize; i++) { - PrintF("%c", table->get(i) != 0 ? 'X' : '.'); - if (i % 32 == 31 && i != kTableMask) { - PrintF("\n "); - } - } - PrintF(");\n"); - assembler_->CheckBitInTable(table, on_bit_set); -} - - -void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg, - bool read_backward, - Label* on_no_match) { - PrintF(" CheckNotBackReference(register=%d, %s, label[%08x]);\n", start_reg, - read_backward ? "backward" : "forward", LabelToInt(on_no_match)); - assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match); -} - -void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { - PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n", - start_reg, read_backward ? "backward" : "forward", - LabelToInt(on_no_match)); - assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, - on_no_match); -} - -void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset, - Label* on_outside_input) { - PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset, - LabelToInt(on_outside_input)); - assembler_->CheckPosition(cp_offset, on_outside_input); -} - - -bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass( - uc16 type, - Label* on_no_match) { - bool supported = assembler_->CheckSpecialCharacterClass(type, - on_no_match); - PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n", - type, - LabelToInt(on_no_match), - supported ? "true" : "false"); - return supported; -} - - -void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index, - int comparand, Label* if_lt) { - PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n", - register_index, comparand, LabelToInt(if_lt)); - assembler_->IfRegisterLT(register_index, comparand, if_lt); -} - - -void RegExpMacroAssemblerTracer::IfRegisterEqPos(int register_index, - Label* if_eq) { - PrintF(" IfRegisterEqPos(register=%d, label[%08x]);\n", - register_index, LabelToInt(if_eq)); - assembler_->IfRegisterEqPos(register_index, if_eq); -} - - -void RegExpMacroAssemblerTracer::IfRegisterGE(int register_index, - int comparand, Label* if_ge) { - PrintF(" IfRegisterGE(register=%d, number=%d, label[%08x]);\n", - register_index, comparand, LabelToInt(if_ge)); - assembler_->IfRegisterGE(register_index, comparand, if_ge); -} - - -RegExpMacroAssembler::IrregexpImplementation - RegExpMacroAssemblerTracer::Implementation() { - return assembler_->Implementation(); -} - - -Handle RegExpMacroAssemblerTracer::GetCode(Handle source) { - PrintF(" GetCode(%s);\n", source->ToCString().get()); - return assembler_->GetCode(source); -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-macro-assembler-tracer.h b/js/src/new-regexp/regexp-macro-assembler-tracer.h deleted file mode 100644 index 0596a18ba..000000000 --- a/js/src/new-regexp/regexp-macro-assembler-tracer.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2008 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_ -#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_ - -#include "new-regexp/regexp-macro-assembler.h" - -namespace v8 { -namespace internal { - -// Decorator on a RegExpMacroAssembler that write all calls. -class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { - public: - RegExpMacroAssemblerTracer(Isolate* isolate, RegExpMacroAssembler* assembler); - ~RegExpMacroAssemblerTracer() override; - void AbortedCodeGeneration() override; - int stack_limit_slack() override { return assembler_->stack_limit_slack(); } - bool CanReadUnaligned() override { return assembler_->CanReadUnaligned(); } - void AdvanceCurrentPosition(int by) override; // Signed cp change. - void AdvanceRegister(int reg, int by) override; // r[reg] += by. - void Backtrack() override; - void Bind(Label* label) override; - void CheckCharacter(unsigned c, Label* on_equal) override; - void CheckCharacterAfterAnd(unsigned c, unsigned and_with, - Label* on_equal) override; - void CheckCharacterGT(uc16 limit, Label* on_greater) override; - void CheckCharacterLT(uc16 limit, Label* on_less) override; - void CheckGreedyLoop(Label* on_tos_equals_current_position) override; - void CheckAtStart(int cp_offset, Label* on_at_start) override; - void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override; - void CheckNotBackReference(int start_reg, bool read_backward, - Label* on_no_match) override; - void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, - Label* on_no_match) override; - void CheckNotCharacter(unsigned c, Label* on_not_equal) override; - void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, - Label* on_not_equal) override; - void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with, - Label* on_not_equal) override; - void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override; - void CheckCharacterNotInRange(uc16 from, uc16 to, - Label* on_not_in_range) override; - void CheckBitInTable(Handle table, Label* on_bit_set) override; - void CheckPosition(int cp_offset, Label* on_outside_input) override; - bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) override; - void Fail() override; - Handle GetCode(Handle source) override; - void GoTo(Label* label) override; - void IfRegisterGE(int reg, int comparand, Label* if_ge) override; - void IfRegisterLT(int reg, int comparand, Label* if_lt) override; - void IfRegisterEqPos(int reg, Label* if_eq) override; - IrregexpImplementation Implementation() override; - void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, - bool check_bounds, int characters, - int eats_at_least) override; - void PopCurrentPosition() override; - void PopRegister(int register_index) override; - void PushBacktrack(Label* label) override; - void PushCurrentPosition() override; - void PushRegister(int register_index, - StackCheckFlag check_stack_limit) override; - void ReadCurrentPositionFromRegister(int reg) override; - void ReadStackPointerFromRegister(int reg) override; - void SetCurrentPositionFromEnd(int by) override; - void SetRegister(int register_index, int to) override; - bool Succeed() override; - void WriteCurrentPositionToRegister(int reg, int cp_offset) override; - void ClearRegisters(int reg_from, int reg_to) override; - void WriteStackPointerToRegister(int reg) override; - - private: - RegExpMacroAssembler* assembler_; -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_ diff --git a/js/src/new-regexp/regexp-macro-assembler.cc b/js/src/new-regexp/regexp-macro-assembler.cc deleted file mode 100644 index 52c1cb1ba..000000000 --- a/js/src/new-regexp/regexp-macro-assembler.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2012 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-macro-assembler.h" - -#include "new-regexp/regexp-stack.h" - -#ifdef V8_INTL_SUPPORT -#include "unicode/uchar.h" -#include "unicode/unistr.h" -#endif // V8_INTL_SUPPORT - -namespace v8 { -namespace internal { - -RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) - : slow_safe_compiler_(false), - global_mode_(NOT_GLOBAL), - isolate_(isolate), - zone_(zone) {} - -RegExpMacroAssembler::~RegExpMacroAssembler() = default; - -int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, - Address byte_offset2, - size_t byte_length, - Isolate* isolate) { - // This function is not allowed to cause a garbage collection. - // A GC might move the calling generated code and invalidate the - // return address on the stack. - DCHECK_EQ(0, byte_length % 2); - -#ifdef V8_INTL_SUPPORT - int32_t length = (int32_t)(byte_length >> 1); - icu::UnicodeString uni_str_1(reinterpret_cast(byte_offset1), - length); - return uni_str_1.caseCompare(reinterpret_cast(byte_offset2), - length, U_FOLD_CASE_DEFAULT) == 0; -#else - uc16* substring1 = reinterpret_cast(byte_offset1); - uc16* substring2 = reinterpret_cast(byte_offset2); - size_t length = byte_length >> 1; - DCHECK_NOT_NULL(isolate); - unibrow::Mapping* canonicalize = - isolate->regexp_macro_assembler_canonicalize(); - for (size_t i = 0; i < length; i++) { - unibrow::uchar c1 = substring1[i]; - unibrow::uchar c2 = substring2[i]; - if (c1 != c2) { - unibrow::uchar s1[1] = {c1}; - canonicalize->get(c1, '\0', s1); - if (s1[0] != c2) { - unibrow::uchar s2[1] = {c2}; - canonicalize->get(c2, '\0', s2); - if (s1[0] != s2[0]) { - return 0; - } - } - } - } - return 1; -#endif // V8_INTL_SUPPORT -} - - -void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, - Label* on_failure) { - Label ok; - // Check that current character is not a trail surrogate. - LoadCurrentCharacter(cp_offset, &ok); - CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok); - // Check that previous character is not a lead surrogate. - LoadCurrentCharacter(cp_offset - 1, &ok); - CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure); - Bind(&ok); -} - -void RegExpMacroAssembler::CheckPosition(int cp_offset, - Label* on_outside_input) { - LoadCurrentCharacter(cp_offset, on_outside_input, true); -} - -void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset, - Label* on_end_of_input, - bool check_bounds, - int characters, - int eats_at_least) { - // By default, eats_at_least = characters. - if (eats_at_least == kUseCharactersValue) { - eats_at_least = characters; - } - - LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters, - eats_at_least); -} - -bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type, - Label* on_no_match) { - return false; -} - -NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, - Zone* zone) - : RegExpMacroAssembler(isolate, zone) {} - -NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default; - -bool NativeRegExpMacroAssembler::CanReadUnaligned() { - return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); -} - -#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER - -// This method may only be called after an interrupt. -int NativeRegExpMacroAssembler::CheckStackGuardState( - Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, - Address* return_address, Code re_code, Address* subject, - const byte** input_start, const byte** input_end) { - DisallowHeapAllocation no_gc; - Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0); - DCHECK_LE(re_code.raw_instruction_start(), old_pc); - DCHECK_LE(old_pc, re_code.raw_instruction_end()); - - StackLimitCheck check(isolate); - bool js_has_overflowed = check.JsHasOverflowed(); - - if (call_origin == RegExp::CallOrigin::kFromJs) { - // Direct calls from JavaScript can be interrupted in two ways: - // 1. A real stack overflow, in which case we let the caller throw the - // exception. - // 2. The stack guard was used to interrupt execution for another purpose, - // forcing the call through the runtime system. - - // Bug(v8:9540) Investigate why this method is called from JS although no - // stackoverflow or interrupt is pending on ARM64. We return 0 in this case - // to continue execution normally. - if (js_has_overflowed) { - return EXCEPTION; - } else if (check.InterruptRequested()) { - return RETRY; - } else { - return 0; - } - } - DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime); - - // Prepare for possible GC. - HandleScope handles(isolate); - Handle code_handle(re_code, isolate); - Handle subject_handle(String::cast(Object(*subject)), isolate); - bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle); - int return_value = 0; - - if (js_has_overflowed) { - AllowHeapAllocation yes_gc; - isolate->StackOverflow(); - return_value = EXCEPTION; - } else if (check.InterruptRequested()) { - AllowHeapAllocation yes_gc; - Object result = isolate->stack_guard()->HandleInterrupts(); - if (result.IsException(isolate)) return_value = EXCEPTION; - } - - if (*code_handle != re_code) { // Return address no longer valid - // Overwrite the return address on the stack. - intptr_t delta = code_handle->address() - re_code.address(); - Address new_pc = old_pc + delta; - // TODO(v8:10026): avoid replacing a signed pointer. - PointerAuthentication::ReplacePC(return_address, new_pc, 0); - } - - // If we continue, we need to update the subject string addresses. - if (return_value == 0) { - // String encoding might have changed. - if (String::IsOneByteRepresentationUnderneath(*subject_handle) != - is_one_byte) { - // If we changed between an LATIN1 and an UC16 string, the specialized - // code cannot be used, and we need to restart regexp matching from - // scratch (including, potentially, compiling a new version of the code). - return_value = RETRY; - } else { - *subject = subject_handle->ptr(); - intptr_t byte_length = *input_end - *input_start; - *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc); - *input_end = *input_start + byte_length; - } - } - return return_value; -} - -// Returns a {Result} sentinel, or the number of successful matches. -int NativeRegExpMacroAssembler::Match(Handle regexp, - Handle subject, - int* offsets_vector, - int offsets_vector_length, - int previous_index, Isolate* isolate) { - DCHECK(subject->IsFlat()); - DCHECK_LE(0, previous_index); - DCHECK_LE(previous_index, subject->length()); - - // No allocations before calling the regexp, but we can't use - // DisallowHeapAllocation, since regexps might be preempted, and another - // thread might do allocation anyway. - - String subject_ptr = *subject; - // Character offsets into string. - int start_offset = previous_index; - int char_length = subject_ptr.length() - start_offset; - int slice_offset = 0; - - // The string has been flattened, so if it is a cons string it contains the - // full string in the first part. - if (StringShape(subject_ptr).IsCons()) { - DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length()); - subject_ptr = ConsString::cast(subject_ptr).first(); - } else if (StringShape(subject_ptr).IsSliced()) { - SlicedString slice = SlicedString::cast(subject_ptr); - subject_ptr = slice.parent(); - slice_offset = slice.offset(); - } - if (StringShape(subject_ptr).IsThin()) { - subject_ptr = ThinString::cast(subject_ptr).actual(); - } - // Ensure that an underlying string has the same representation. - bool is_one_byte = subject_ptr.IsOneByteRepresentation(); - DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString()); - // String is now either Sequential or External - int char_size_shift = is_one_byte ? 0 : 1; - - DisallowHeapAllocation no_gc; - const byte* input_start = - subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc); - int byte_length = char_length << char_size_shift; - const byte* input_end = input_start + byte_length; - return Execute(*subject, start_offset, input_start, input_end, offsets_vector, - offsets_vector_length, isolate, *regexp); -} - -// Returns a {Result} sentinel, or the number of successful matches. -// TODO(pthier): The JSRegExp object is passed to native irregexp code to match -// the signature of the interpreter. We should get rid of JS objects passed to -// internal methods. -int NativeRegExpMacroAssembler::Execute( - String input, // This needs to be the unpacked (sliced, cons) string. - int start_offset, const byte* input_start, const byte* input_end, - int* output, int output_size, Isolate* isolate, JSRegExp regexp) { - // Ensure that the minimum stack has been allocated. - RegExpStackScope stack_scope(isolate); - Address stack_base = stack_scope.stack()->stack_base(); - - bool is_one_byte = String::IsOneByteRepresentationUnderneath(input); - Code code = Code::cast(regexp.Code(is_one_byte)); - RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime; - - using RegexpMatcherSig = int( - Address input_string, int start_offset, // NOLINT(readability/casting) - const byte* input_start, const byte* input_end, int* output, - int output_size, Address stack_base, int call_origin, Isolate* isolate, - Address regexp); - - auto fn = GeneratedCode::FromCode(code); - int result = - fn.Call(input.ptr(), start_offset, input_start, input_end, output, - output_size, stack_base, call_origin, isolate, regexp.ptr()); - DCHECK(result >= RETRY); - - if (result == EXCEPTION && !isolate->has_pending_exception()) { - // We detected a stack overflow (on the backtrack stack) in RegExp code, - // but haven't created the exception yet. Additionally, we allow heap - // allocation because even though it invalidates {input_start} and - // {input_end}, we are about to return anyway. - AllowHeapAllocation allow_allocation; - isolate->StackOverflow(); - } - return result; -} - -#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER - -// clang-format off -const byte NativeRegExpMacroAssembler::word_character_map[] = { - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7' - 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9' - - 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G' - 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O' - 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W' - 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_' - - 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g' - 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o' - 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w' - 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' - // Latin-1 range - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, - 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, -}; -// clang-format on - -Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer, - Address* stack_base, - Isolate* isolate) { - RegExpStack* regexp_stack = isolate->regexp_stack(); - size_t size = regexp_stack->stack_capacity(); - Address old_stack_base = regexp_stack->stack_base(); - DCHECK(old_stack_base == *stack_base); - DCHECK(stack_pointer <= old_stack_base); - DCHECK(static_cast(old_stack_base - stack_pointer) <= size); - Address new_stack_base = regexp_stack->EnsureCapacity(size * 2); - if (new_stack_base == kNullAddress) { - return kNullAddress; - } - *stack_base = new_stack_base; - intptr_t stack_content_size = old_stack_base - stack_pointer; - return new_stack_base - stack_content_size; -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-macro-assembler.h b/js/src/new-regexp/regexp-macro-assembler.h deleted file mode 100644 index 60d712dfc..000000000 --- a/js/src/new-regexp/regexp-macro-assembler.h +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright 2012 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ -#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ - -#include "new-regexp/regexp-ast.h" -#include "new-regexp/regexp-shim.h" -#include "new-regexp/regexp.h" - -namespace v8 { -namespace internal { - -static const uc32 kLeadSurrogateStart = 0xd800; -static const uc32 kLeadSurrogateEnd = 0xdbff; -static const uc32 kTrailSurrogateStart = 0xdc00; -static const uc32 kTrailSurrogateEnd = 0xdfff; -static const uc32 kNonBmpStart = 0x10000; -static const uc32 kNonBmpEnd = 0x10ffff; - -struct DisjunctDecisionRow { - RegExpCharacterClass cc; - Label* on_match; -}; - - -class RegExpMacroAssembler { - public: - // The implementation must be able to handle at least: - static const int kMaxRegister = (1 << 16) - 1; - static const int kMaxCPOffset = (1 << 15) - 1; - static const int kMinCPOffset = -(1 << 15); - - static const int kTableSizeBits = 7; - static const int kTableSize = 1 << kTableSizeBits; - static const int kTableMask = kTableSize - 1; - - static constexpr int kUseCharactersValue = -1; - - enum IrregexpImplementation { - kIA32Implementation, - kARMImplementation, - kARM64Implementation, - kMIPSImplementation, - kS390Implementation, - kPPCImplementation, - kX64Implementation, - kX87Implementation, - kBytecodeImplementation - }; - - enum StackCheckFlag { - kNoStackLimitCheck = false, - kCheckStackLimit = true - }; - - RegExpMacroAssembler(Isolate* isolate, Zone* zone); - virtual ~RegExpMacroAssembler(); - // This function is called when code generation is aborted, so that - // the assembler could clean up internal data structures. - virtual void AbortedCodeGeneration() {} - // The maximal number of pushes between stack checks. Users must supply - // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck) - // at least once for every stack_limit() pushes that are executed. - virtual int stack_limit_slack() = 0; - virtual bool CanReadUnaligned() = 0; - virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change. - virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by. - // Continues execution from the position pushed on the top of the backtrack - // stack by an earlier PushBacktrack(Label*). - virtual void Backtrack() = 0; - virtual void Bind(Label* label) = 0; - // Dispatch after looking the current character up in a 2-bits-per-entry - // map. The destinations vector has up to 4 labels. - virtual void CheckCharacter(unsigned c, Label* on_equal) = 0; - // Bitwise and the current character with the given constant and then - // check for a match with c. - virtual void CheckCharacterAfterAnd(unsigned c, - unsigned and_with, - Label* on_equal) = 0; - virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0; - virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0; - virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0; - virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0; - virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0; - virtual void CheckNotBackReference(int start_reg, bool read_backward, - Label* on_no_match) = 0; - virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, - Label* on_no_match) = 0; - // Check the current character for a match with a literal character. If we - // fail to match then goto the on_failure label. End of input always - // matches. If the label is nullptr then we should pop a backtrack address - // off the stack and go to that. - virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0; - virtual void CheckNotCharacterAfterAnd(unsigned c, - unsigned and_with, - Label* on_not_equal) = 0; - // Subtract a constant from the current character, then and with the given - // constant and then check for a match with c. - virtual void CheckNotCharacterAfterMinusAnd(uc16 c, - uc16 minus, - uc16 and_with, - Label* on_not_equal) = 0; - virtual void CheckCharacterInRange(uc16 from, - uc16 to, // Both inclusive. - Label* on_in_range) = 0; - virtual void CheckCharacterNotInRange(uc16 from, - uc16 to, // Both inclusive. - Label* on_not_in_range) = 0; - - // The current character (modulus the kTableSize) is looked up in the byte - // array, and if the found byte is non-zero, we jump to the on_bit_set label. - virtual void CheckBitInTable(Handle table, Label* on_bit_set) = 0; - - // Checks whether the given offset from the current position is before - // the end of the string. May overwrite the current character. - virtual void CheckPosition(int cp_offset, Label* on_outside_input); - // Check whether a standard/default character class matches the current - // character. Returns false if the type of special character class does - // not have custom support. - // May clobber the current loaded character. - virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); - - // Control-flow integrity: - // Define a jump target and bind a label. - virtual void BindJumpTarget(Label* label) { Bind(label); } - - virtual void Fail() = 0; - virtual Handle GetCode(Handle source) = 0; - virtual void GoTo(Label* label) = 0; - // Check whether a register is >= a given constant and go to a label if it - // is. Backtracks instead if the label is nullptr. - virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0; - // Check whether a register is < a given constant and go to a label if it is. - // Backtracks instead if the label is nullptr. - virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0; - // Check whether a register is == to the current position and go to a - // label if it is. - virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0; - virtual IrregexpImplementation Implementation() = 0; - V8_EXPORT_PRIVATE void LoadCurrentCharacter( - int cp_offset, Label* on_end_of_input, bool check_bounds = true, - int characters = 1, int eats_at_least = kUseCharactersValue); - virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, - bool check_bounds, int characters, - int eats_at_least) = 0; - virtual void PopCurrentPosition() = 0; - virtual void PopRegister(int register_index) = 0; - // Pushes the label on the backtrack stack, so that a following Backtrack - // will go to this label. Always checks the backtrack stack limit. - virtual void PushBacktrack(Label* label) = 0; - virtual void PushCurrentPosition() = 0; - virtual void PushRegister(int register_index, - StackCheckFlag check_stack_limit) = 0; - virtual void ReadCurrentPositionFromRegister(int reg) = 0; - virtual void ReadStackPointerFromRegister(int reg) = 0; - virtual void SetCurrentPositionFromEnd(int by) = 0; - virtual void SetRegister(int register_index, int to) = 0; - // Return whether the matching (with a global regexp) will be restarted. - virtual bool Succeed() = 0; - virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; - virtual void ClearRegisters(int reg_from, int reg_to) = 0; - virtual void WriteStackPointerToRegister(int reg) = 0; - - // Compares two-byte strings case insensitively. - // Called from generated RegExp code. - static int CaseInsensitiveCompareUC16(Address byte_offset1, - Address byte_offset2, - size_t byte_length, Isolate* isolate); - - // Check that we are not in the middle of a surrogate pair. - void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); - - // Controls the generation of large inlined constants in the code. - void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } - bool slow_safe() { return slow_safe_compiler_; } - - void set_backtrack_limit(uint32_t backtrack_limit) { - backtrack_limit_ = backtrack_limit; - } - - enum GlobalMode { - NOT_GLOBAL, - GLOBAL_NO_ZERO_LENGTH_CHECK, - GLOBAL, - GLOBAL_UNICODE - }; - // Set whether the regular expression has the global flag. Exiting due to - // a failure in a global regexp may still mean success overall. - inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } - inline bool global() { return global_mode_ != NOT_GLOBAL; } - inline bool global_with_zero_length_check() { - return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; - } - inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; } - - Isolate* isolate() const { return isolate_; } - Zone* zone() const { return zone_; } - - protected: - bool has_backtrack_limit() const { - return backtrack_limit_ != JSRegExp::kNoBacktrackLimit; - } - uint32_t backtrack_limit() const { return backtrack_limit_; } - - private: - bool slow_safe_compiler_; - uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit; - GlobalMode global_mode_; - Isolate* isolate_; - Zone* zone_; -}; - -class NativeRegExpMacroAssembler: public RegExpMacroAssembler { - public: - // Type of input string to generate code for. - enum Mode { LATIN1 = 1, UC16 = 2 }; - - // Result of calling generated native RegExp code. - // RETRY: Something significant changed during execution, and the matching - // should be retried from scratch. - // EXCEPTION: Something failed during execution. If no exception has been - // thrown, it's an internal out-of-memory, and the caller should - // throw the exception. - // FAILURE: Matching failed. - // SUCCESS: Matching succeeded, and the output array has been filled with - // capture positions. - enum Result { - FAILURE = RegExp::kInternalRegExpFailure, - SUCCESS = RegExp::kInternalRegExpSuccess, - EXCEPTION = RegExp::kInternalRegExpException, - RETRY = RegExp::kInternalRegExpRetry, - }; - - NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone); - ~NativeRegExpMacroAssembler() override; - bool CanReadUnaligned() override; - - // Returns a {Result} sentinel, or the number of successful matches. - static int Match(Handle regexp, Handle subject, - int* offsets_vector, int offsets_vector_length, - int previous_index, Isolate* isolate); - - // Called from RegExp if the backtrack stack limit is hit. - // Tries to expand the stack. Returns the new stack-pointer if - // successful, and updates the stack_top address, or returns 0 if unable - // to grow the stack. - // This function must not trigger a garbage collection. - static Address GrowStack(Address stack_pointer, Address* stack_top, - Isolate* isolate); - - static int CheckStackGuardState(Isolate* isolate, int start_index, - RegExp::CallOrigin call_origin, - Address* return_address, Code re_code, - Address* subject, const byte** input_start, - const byte** input_end); - - // Byte map of one byte characters with a 0xff if the character is a word - // character (digit, letter or underscore) and 0x00 otherwise. - // Used by generated RegExp code. - static const byte word_character_map[256]; - - static Address word_character_map_address() { - return reinterpret_cast
(&word_character_map[0]); - } - - // Returns a {Result} sentinel, or the number of successful matches. - V8_EXPORT_PRIVATE static int Execute(String input, int start_offset, - const byte* input_start, - const byte* input_end, int* output, - int output_size, Isolate* isolate, - JSRegExp regexp); -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ diff --git a/js/src/new-regexp/regexp-native-macro-assembler.cc b/js/src/new-regexp/regexp-native-macro-assembler.cc deleted file mode 100644 index 01453a937..000000000 --- a/js/src/new-regexp/regexp-native-macro-assembler.cc +++ /dev/null @@ -1,1213 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim: set ts=8 sts=2 et sw=2 tw=80: - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "jit/Linker.h" -#include "gc/Zone.h" -#include "new-regexp/regexp-macro-assembler-arch.h" -#include "new-regexp/regexp-stack.h" -#include "vm/MatchPairs.h" - -#include "jit/MacroAssembler-inl.h" - -using namespace js; -using namespace js::irregexp; -using namespace js::jit; - -namespace v8 { -namespace internal { - -using js::MatchPairs; -using js::jit::AbsoluteAddress; -using js::jit::Address; -using js::jit::AllocatableGeneralRegisterSet; -using js::jit::Assembler; -using js::jit::BaseIndex; -using js::jit::CodeLocationLabel; -using js::jit::GeneralRegisterBackwardIterator; -using js::jit::GeneralRegisterForwardIterator; -using js::jit::GeneralRegisterSet; -using js::jit::Imm32; -using js::jit::ImmPtr; -using js::jit::ImmWord; -using js::jit::JitCode; -using js::jit::Linker; -using js::jit::LiveGeneralRegisterSet; -using js::jit::Register; -using js::jit::Registers; -using js::jit::StackMacroAssembler; - -SMRegExpMacroAssembler::SMRegExpMacroAssembler(JSContext* cx, Isolate* isolate, - StackMacroAssembler& masm, - Zone* zone, Mode mode, - uint32_t num_capture_registers) - : NativeRegExpMacroAssembler(isolate, zone), - cx_(cx), - masm_(masm), - mode_(mode), - num_registers_(num_capture_registers), - num_capture_registers_(num_capture_registers) { - // Each capture has a start and an end register - MOZ_ASSERT(num_capture_registers_ % 2 == 0); - - AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All()); - - temp0_ = regs.takeAny(); - temp1_ = regs.takeAny(); - temp2_ = regs.takeAny(); - input_end_pointer_ = regs.takeAny(); - current_character_ = regs.takeAny(); - current_position_ = regs.takeAny(); - backtrack_stack_pointer_ = regs.takeAny(); - savedRegisters_ = js::jit::SavedNonVolatileRegisters(regs); - - masm_.jump(&entry_label_); // We'll generate the entry code later - masm_.bind(&start_label_); // and continue from here. -} - -int SMRegExpMacroAssembler::stack_limit_slack() { - return RegExpStack::kStackLimitSlack; -} - -void SMRegExpMacroAssembler::AdvanceCurrentPosition(int by) { - if (by != 0) { - masm_.addPtr(Imm32(by * char_size()), current_position_); - } -} - -void SMRegExpMacroAssembler::AdvanceRegister(int reg, int by) { - MOZ_ASSERT(reg >= 0 && reg < num_registers_); - if (by != 0) { - masm_.addPtr(Imm32(by), register_location(reg)); - } -} - -void SMRegExpMacroAssembler::Backtrack() { - // Pop code location from backtrack stack and jump to location. - Pop(temp0_); - masm_.jump(temp0_); -} - -void SMRegExpMacroAssembler::Bind(Label* label) { - masm_.bind(label->inner()); - if (label->patchOffset_.bound()) { - AddLabelPatch(label->patchOffset_, label->pos()); - } -} - -// Check if current_position + cp_offset is the input start -void SMRegExpMacroAssembler::CheckAtStartImpl(int cp_offset, Label* on_cond, - Assembler::Condition cond) { - Address addr(current_position_, cp_offset * char_size()); - masm_.computeEffectiveAddress(addr, temp0_); - - masm_.branchPtr(cond, inputStart(), temp0_, - LabelOrBacktrack(on_cond)); -} - -void SMRegExpMacroAssembler::CheckAtStart(int cp_offset, Label* on_at_start) { - CheckAtStartImpl(cp_offset, on_at_start, Assembler::Equal); -} - -void SMRegExpMacroAssembler::CheckNotAtStart(int cp_offset, - Label* on_not_at_start) { - CheckAtStartImpl(cp_offset, on_not_at_start, Assembler::NotEqual); -} - -void SMRegExpMacroAssembler::CheckCharacterImpl(Imm32 c, Label* on_cond, - Assembler::Condition cond) { - masm_.branch32(cond, current_character_, c, LabelOrBacktrack(on_cond)); -} - -void SMRegExpMacroAssembler::CheckCharacter(uint32_t c, Label* on_equal) { - CheckCharacterImpl(Imm32(c), on_equal, Assembler::Equal); -} - -void SMRegExpMacroAssembler::CheckNotCharacter(uint32_t c, - Label* on_not_equal) { - CheckCharacterImpl(Imm32(c), on_not_equal, Assembler::NotEqual); -} - -void SMRegExpMacroAssembler::CheckCharacterGT(uc16 c, Label* on_greater) { - CheckCharacterImpl(Imm32(c), on_greater, Assembler::GreaterThan); -} - -void SMRegExpMacroAssembler::CheckCharacterLT(uc16 c, Label* on_less) { - CheckCharacterImpl(Imm32(c), on_less, Assembler::LessThan); -} - -// Bitwise-and the current character with mask and then check for a -// match with c. -void SMRegExpMacroAssembler::CheckCharacterAfterAndImpl(uint32_t c, - uint32_t mask, - Label* on_cond, - bool is_not) { - if (c == 0) { - Assembler::Condition cond = is_not ? Assembler::NonZero : Assembler::Zero; - masm_.branchTest32(cond, current_character_, Imm32(mask), - LabelOrBacktrack(on_cond)); - } else { - Assembler::Condition cond = is_not ? Assembler::NotEqual : Assembler::Equal; - masm_.move32(Imm32(mask), temp0_); - masm_.and32(current_character_, temp0_); - masm_.branch32(cond, temp0_, Imm32(c), LabelOrBacktrack(on_cond)); - } -} - -void SMRegExpMacroAssembler::CheckCharacterAfterAnd(uint32_t c, - uint32_t mask, - Label* on_equal) { - CheckCharacterAfterAndImpl(c, mask, on_equal, /*is_not =*/false); -} - -void SMRegExpMacroAssembler::CheckNotCharacterAfterAnd(uint32_t c, - uint32_t mask, - Label* on_not_equal) { - CheckCharacterAfterAndImpl(c, mask, on_not_equal, /*is_not =*/true); -} - - -// Subtract minus from the current character, then bitwise-and the -// result with mask, then check for a match with c. -void SMRegExpMacroAssembler::CheckNotCharacterAfterMinusAnd( - uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) { - masm_.computeEffectiveAddress(Address(current_character_, -minus), temp0_); - if (c == 0) { - masm_.branchTest32(Assembler::NonZero, temp0_, Imm32(mask), - LabelOrBacktrack(on_not_equal)); - } else { - masm_.and32(Imm32(mask), temp0_); - masm_.branch32(Assembler::NotEqual, temp0_, Imm32(c), - LabelOrBacktrack(on_not_equal)); - } -} - -// If the current position matches the position stored on top of the backtrack -// stack, pops the backtrack stack and branches to the given label. -void SMRegExpMacroAssembler::CheckGreedyLoop(Label* on_equal) { - js::jit::Label fallthrough; - masm_.branchPtr(Assembler::NotEqual, Address(backtrack_stack_pointer_, 0), - current_position_, &fallthrough); - masm_.addPtr(Imm32(sizeof(void*)), backtrack_stack_pointer_); // Pop. - JumpOrBacktrack(on_equal); - masm_.bind(&fallthrough); -} - -void SMRegExpMacroAssembler::CheckCharacterInRangeImpl( - uc16 from, uc16 to, Label* on_cond, Assembler::Condition cond) { - // x is in [from,to] if unsigned(x - from) <= to - from - masm_.computeEffectiveAddress(Address(current_character_, -from), temp0_); - masm_.branch32(cond, temp0_, Imm32(to - from), LabelOrBacktrack(on_cond)); -} - -void SMRegExpMacroAssembler::CheckCharacterInRange(uc16 from, uc16 to, - Label* on_in_range) { - CheckCharacterInRangeImpl(from, to, on_in_range, Assembler::BelowOrEqual); -} - -void SMRegExpMacroAssembler::CheckCharacterNotInRange(uc16 from, uc16 to, - Label* on_not_in_range) { - CheckCharacterInRangeImpl(from, to, on_not_in_range, Assembler::Above); -} - -void SMRegExpMacroAssembler::CheckBitInTable(Handle table, - Label* on_bit_set) { - // Claim ownership of the ByteArray from the current HandleScope. - // ByteArrays are allocated on the C++ heap and are (eventually) - // owned by the RegExpShared. - PseudoHandle rawTable = table->takeOwnership(isolate()); - - masm_.movePtr(ImmPtr(rawTable->data()), temp0_); - - masm_.move32(Imm32(kTableMask), temp1_); - masm_.and32(current_character_, temp1_); - - masm_.load8ZeroExtend(BaseIndex(temp0_, temp1_, js::jit::TimesOne), temp0_); - masm_.branchTest32(Assembler::NonZero, temp0_, temp0_, - LabelOrBacktrack(on_bit_set)); - - // Transfer ownership of |rawTable| to the |tables_| vector. - AddTable(std::move(rawTable)); -} - -void SMRegExpMacroAssembler::CheckNotBackReferenceImpl(int start_reg, - bool read_backward, - Label* on_no_match, - bool ignore_case) { - js::jit::Label fallthrough; - - // Captures are stored as a sequential pair of registers. - // Find the length of the back-referenced capture and load the - // capture's start index into current_character_. - masm_.loadPtr(register_location(start_reg), // index of start - current_character_); - masm_.loadPtr(register_location(start_reg + 1), temp0_); // index of end - masm_.subPtr(current_character_, temp0_); // length of capture - - // Capture registers are either both set or both cleared. - // If the capture length is zero, then the capture is either empty or cleared. - // Fall through in both cases. - masm_.branchPtr(Assembler::Equal, temp0_, ImmWord(0), &fallthrough); - - // Check that there are sufficient characters left in the input. - if (read_backward) { - // If start + len > current, there isn't enough room for a - // lookbehind backreference. - masm_.loadPtr(inputStart(), temp1_); - masm_.addPtr(temp0_, temp1_); - masm_.branchPtr(Assembler::GreaterThan, temp1_, current_position_, - LabelOrBacktrack(on_no_match)); - } else { - // current_position_ is the negative offset from the end. - // If current + len > 0, there isn't enough room for a backreference. - masm_.movePtr(current_position_, temp1_); - masm_.addPtr(temp0_, temp1_); - masm_.branchPtr(Assembler::GreaterThan, temp1_, ImmWord(0), - LabelOrBacktrack(on_no_match)); - } - - if (mode_ == UC16 && ignore_case) { - // We call a helper function for case-insensitive non-latin1 strings. - - // Save volatile regs. temp1_ and temp2_ don't need to be saved. - LiveGeneralRegisterSet volatileRegs(GeneralRegisterSet::Volatile()); - volatileRegs.takeUnchecked(temp1_); - volatileRegs.takeUnchecked(temp2_); - masm_.PushRegsInMask(volatileRegs); - - // Parameters are - // Address captured - Address of captured substring's start. - // Address current - Address of current character position. - // size_t byte_length - length of capture (in bytes) - - // Compute |captured| - masm_.addPtr(input_end_pointer_, current_character_); - - // Compute |current| - masm_.addPtr(input_end_pointer_, current_position_); - if (read_backward) { - // Offset by length when matching backwards. - masm_.subPtr(temp0_, current_position_); - } - - masm_.setupUnalignedABICall(temp1_); - masm_.passABIArg(current_character_); - masm_.passABIArg(current_position_); - masm_.passABIArg(temp0_); - - bool unicode = true; // TODO: Fix V8 bug - if (unicode) { - uint32_t (*fun)(const char16_t*, const char16_t*, size_t) = - CaseInsensitiveCompareUCStrings; - masm_.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun)); - } else { - uint32_t (*fun)(const char16_t*, const char16_t*, size_t) = - CaseInsensitiveCompareStrings; - masm_.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun)); - } - masm_.storeCallInt32Result(temp1_); - masm_.PopRegsInMask(volatileRegs); - masm_.branchTest32(Assembler::Zero, temp1_, temp1_, - LabelOrBacktrack(on_no_match)); - - // On success, advance position by length of capture - if (read_backward) { - masm_.subPtr(temp0_, current_position_); - } else { - masm_.addPtr(temp0_, current_position_); - } - - masm_.bind(&fallthrough); - return; - } - - // We will be modifying current_position_. Save it in case the match fails. - masm_.push(current_position_); - - // Compute start of capture string - masm_.addPtr(input_end_pointer_, current_character_); - - // Compute start of match string - masm_.addPtr(input_end_pointer_, current_position_); - if (read_backward) { - // Offset by length when matching backwards. - masm_.subPtr(temp0_, current_position_); - } - - // Compute end of match string - masm_.addPtr(current_position_, temp0_); - - js::jit::Label success; - js::jit::Label fail; - js::jit::Label loop; - masm_.bind(&loop); - - // Load next character from each string. - if (mode_ == LATIN1) { - masm_.load8ZeroExtend(Address(current_character_, 0), temp1_); - masm_.load8ZeroExtend(Address(current_position_, 0), temp2_); - } else { - masm_.load16ZeroExtend(Address(current_character_, 0), temp1_); - masm_.load16ZeroExtend(Address(current_position_, 0), temp2_); - } - - if (ignore_case) { - MOZ_ASSERT(mode_ == LATIN1); - // Try exact match. - js::jit::Label loop_increment; - masm_.branch32(Assembler::Equal, temp1_, temp2_, &loop_increment); - - // Mismatch. Try case-insensitive match. - // Force the match character to lower case (by setting bit 0x20) - // then check to see if it is a letter. - js::jit::Label convert_capture; - masm_.or32(Imm32(0x20), temp1_); - - // Check if it is in [a,z]. - masm_.computeEffectiveAddress(Address(temp1_, -'a'), temp2_); - masm_.branch32(Assembler::BelowOrEqual, temp2_, Imm32('z' - 'a'), - &convert_capture); - // Check for values in range [224,254]. - // Exclude 247 (U+00F7 DIVISION SIGN). - masm_.sub32(Imm32(224 - 'a'), temp2_); - masm_.branch32(Assembler::Above, temp2_, Imm32(254 - 224), &fail); - masm_.branch32(Assembler::Equal, temp2_, Imm32(247 - 224), &fail); - - // Match character is lower case. Convert capture character - // to lower case and compare. - masm_.bind(&convert_capture); - masm_.load8ZeroExtend(Address(current_character_, 0), temp2_); - masm_.or32(Imm32(0x20), temp2_); - masm_.branch32(Assembler::NotEqual, temp1_, temp2_, &fail); - - masm_.bind(&loop_increment); - } else { - // Fail if characters do not match. - masm_.branch32(Assembler::NotEqual, temp1_, temp2_, &fail); - } - - // Increment pointers into match and capture strings. - masm_.addPtr(Imm32(char_size()), current_character_); - masm_.addPtr(Imm32(char_size()), current_position_); - - // Loop if we have not reached the end of the match string. - masm_.branchPtr(Assembler::Below, current_position_, temp0_, &loop); - masm_.jump(&success); - - // If we fail, restore current_position_ and branch. - masm_.bind(&fail); - masm_.pop(current_position_); - JumpOrBacktrack(on_no_match); - - masm_.bind(&success); - - // current_position_ is a pointer. Convert it back to an offset. - masm_.subPtr(input_end_pointer_, current_position_); - if (read_backward) { - // Subtract match length if we matched backward - masm_.addPtr(register_location(start_reg), current_position_); - masm_.subPtr(register_location(start_reg + 1), current_position_); - } - - // Drop saved value of current_position_ - masm_.addToStackPtr(Imm32(sizeof(uintptr_t))); - - masm_.bind(&fallthrough); -} - -// Branch if a back-reference does not match a previous capture. -void SMRegExpMacroAssembler::CheckNotBackReference(int start_reg, - bool read_backward, - Label* on_no_match) { - CheckNotBackReferenceImpl(start_reg, read_backward, on_no_match, - /*ignore_case = */ false); -} - -void SMRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { - CheckNotBackReferenceImpl(start_reg, read_backward, on_no_match, - /*ignore_case = */ true); -} - -// Checks whether the given offset from the current position is -// inside the input string. -void SMRegExpMacroAssembler::CheckPosition(int cp_offset, - Label* on_outside_input) { - // Note: current_position_ is a (negative) byte offset relative to - // the end of the input string. - if (cp_offset >= 0) { - // end + current + offset >= end - // <=> current + offset >= 0 - // <=> current >= -offset - masm_.branchPtr(Assembler::GreaterThanOrEqual, current_position_, - ImmWord(-cp_offset * char_size()), - LabelOrBacktrack(on_outside_input)); - } else { - // Compute offset position - masm_.computeEffectiveAddress( - Address(current_position_, cp_offset * char_size()), temp0_); - - // Compare to start of input. - masm_.branchPtr(Assembler::GreaterThanOrEqual, inputStart(), temp0_, - LabelOrBacktrack(on_outside_input)); - } -} - -// This function attempts to generate special case code for character classes. -// Returns true if a special case is generated. -// Otherwise returns false and generates no code. -bool SMRegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type, - Label* on_no_match) { - js::jit::Label* no_match = LabelOrBacktrack(on_no_match); - - // Note: throughout this function, range checks (c in [min, max]) - // are implemented by an unsigned (c - min) <= (max - min) check. - switch (type) { - case 's': { - // Match space-characters - if (mode_ != LATIN1) { - return false; - } - js::jit::Label success; - // One byte space characters are ' ', '\t'..'\r', and '\u00a0' (NBSP). - - // Check ' ' - masm_.branch32(Assembler::Equal, current_character_, Imm32(' '), - &success); - - // Check '\t'..'\r' - masm_.computeEffectiveAddress(Address(current_character_, -'\t'), - temp0_); - masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32('\r' - '\t'), - &success); - - // Check \u00a0. - masm_.branch32(Assembler::NotEqual, temp0_, Imm32(0x00a0 - '\t'), - no_match); - - masm_.bind(&success); - return true; - } - case 'S': - // The emitted code for generic character classes is good enough. - return false; - case 'd': - // Match latin1 digits ('0'-'9') - masm_.computeEffectiveAddress(Address(current_character_, -'0'), temp0_); - masm_.branch32(Assembler::Above, temp0_, Imm32('9' - '0'), no_match); - return true; - case 'D': - // Match anything except latin1 digits ('0'-'9') - masm_.computeEffectiveAddress(Address(current_character_, -'0'), temp0_); - masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32('9' - '0'), - no_match); - return true; - case '.': - // Match non-newlines. This excludes '\n' (0x0a), '\r' (0x0d), - // U+2028 LINE SEPARATOR, and U+2029 PARAGRAPH SEPARATOR. - // See https://tc39.es/ecma262/#prod-LineTerminator - - // To test for 0x0a and 0x0d efficiently, we XOR the input with 1. - // This converts 0x0a to 0x0b, and 0x0d to 0x0c, allowing us to - // test for the contiguous range 0x0b..0x0c. - masm_.move32(current_character_, temp0_); - masm_.xor32(Imm32(0x01), temp0_); - masm_.sub32(Imm32(0x0b), temp0_); - masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32(0x0c - 0x0b), - no_match); - - if (mode_ == UC16) { - // Compare original value to 0x2028 and 0x2029, using the already - // computed (current_char ^ 0x01 - 0x0b). I.e., check for - // 0x201d (0x2028 - 0x0b) or 0x201e. - masm_.sub32(Imm32(0x2028 - 0x0b), temp0_); - masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32(0x2029 - 0x2028), - no_match); - } - return true; - case 'w': - // \w matches the set of 63 characters defined in Runtime Semantics: - // WordCharacters. We use a static lookup table, which is defined in - // regexp-macro-assembler.cc. - // Note: if both Unicode and IgnoreCase are true, \w matches a - // larger set of characters. That case is handled elsewhere. - if (mode_ != LATIN1) { - masm_.branch32(Assembler::Above, current_character_, Imm32('z'), - no_match); - } - static_assert(arraysize(word_character_map) > unibrow::Latin1::kMaxChar, - "regex: arraysize(word_character_map) > unibrow::Latin1::kMaxChar"); - masm_.movePtr(ImmPtr(word_character_map), temp0_); - masm_.load8ZeroExtend( - BaseIndex(temp0_, current_character_, js::jit::TimesOne), temp0_); - masm_.branchTest32(Assembler::Zero, temp0_, temp0_, no_match); - return true; - case 'W': { - // See 'w' above. - js::jit::Label done; - if (mode_ != LATIN1) { - masm_.branch32(Assembler::Above, current_character_, Imm32('z'), &done); - } - static_assert(arraysize(word_character_map) > unibrow::Latin1::kMaxChar, - "regex: arraysize(word_character_map) > unibrow::Latin1::kMaxChar"); - masm_.movePtr(ImmPtr(word_character_map), temp0_); - masm_.load8ZeroExtend( - BaseIndex(temp0_, current_character_, js::jit::TimesOne), temp0_); - masm_.branchTest32(Assembler::NonZero, temp0_, temp0_, no_match); - if (mode_ != LATIN1) { - masm_.bind(&done); - } - return true; - } - //////////////////////////////////////////////////////////////////////// - // Non-standard classes (with no syntactic shorthand) used internally // - //////////////////////////////////////////////////////////////////////// - case '*': - // Match any character - return true; - case 'n': - // Match newlines. The opposite of '.'. See '.' above. - masm_.move32(current_character_, temp0_); - masm_.xor32(Imm32(0x01), temp0_); - masm_.sub32(Imm32(0x0b), temp0_); - if (mode_ == LATIN1) { - masm_.branch32(Assembler::Above, temp0_, Imm32(0x0c - 0x0b), no_match); - } else { - MOZ_ASSERT(mode_ == UC16); - js::jit::Label done; - masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32(0x0c - 0x0b), - &done); - - // Compare original value to 0x2028 and 0x2029, using the already - // computed (current_char ^ 0x01 - 0x0b). I.e., check for - // 0x201d (0x2028 - 0x0b) or 0x201e. - masm_.sub32(Imm32(0x2028 - 0x0b), temp0_); - masm_.branch32(Assembler::Above, temp0_, Imm32(0x2029 - 0x2028), - no_match); - masm_.bind(&done); - } - return true; - - // No custom implementation - default: - return false; - } -} - -void SMRegExpMacroAssembler::Fail() { - masm_.movePtr(ImmWord(js::RegExpRunStatus_Success_NotFound), temp0_); - masm_.jump(&exit_label_); -} - -void SMRegExpMacroAssembler::GoTo(Label* to) { - masm_.jump(LabelOrBacktrack(to)); -} - -void SMRegExpMacroAssembler::IfRegisterGE(int reg, int comparand, - Label* if_ge) { - masm_.branchPtr(Assembler::GreaterThanOrEqual, register_location(reg), - ImmWord(comparand), LabelOrBacktrack(if_ge)); -} - -void SMRegExpMacroAssembler::IfRegisterLT(int reg, int comparand, - Label* if_lt) { - masm_.branchPtr(Assembler::LessThan, register_location(reg), - ImmWord(comparand), LabelOrBacktrack(if_lt)); -} - -void SMRegExpMacroAssembler::IfRegisterEqPos(int reg, Label* if_eq) { - masm_.branchPtr(Assembler::Equal, register_location(reg), current_position_, - LabelOrBacktrack(if_eq)); -} - -// This is a word-for-word identical copy of the V8 code, which is -// duplicated in at least nine different places in V8 (one per -// supported architecture) with no differences outside of comments and -// formatting. It should be hoisted into the superclass. Once that is -// done upstream, this version can be deleted. -void SMRegExpMacroAssembler::LoadCurrentCharacterImpl(int cp_offset, - Label* on_end_of_input, - bool check_bounds, - int characters, - int eats_at_least) { - // It's possible to preload a small number of characters when each success - // path requires a large number of characters, but not the reverse. - MOZ_ASSERT(eats_at_least >= characters); - MOZ_ASSERT(cp_offset < (1 << 30)); // Be sane! (And ensure negation works) - - if (check_bounds) { - if (cp_offset >= 0) { - CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input); - } else { - CheckPosition(cp_offset, on_end_of_input); - } - } - LoadCurrentCharacterUnchecked(cp_offset, characters); -} - -// Load the character (or characters) at the specified offset from the -// current position. Zero-extend to 32 bits. -void SMRegExpMacroAssembler::LoadCurrentCharacterUnchecked(int cp_offset, - int characters) { - BaseIndex address(input_end_pointer_, current_position_, js::jit::TimesOne, - cp_offset * char_size()); - if (mode_ == LATIN1) { - if (characters == 4) { - masm_.load32(address, current_character_); - } else if (characters == 2) { - masm_.load16ZeroExtend(address, current_character_); - } else { - MOZ_ASSERT(characters == 1); - masm_.load8ZeroExtend(address, current_character_); - } - } else { - MOZ_ASSERT(mode_ == UC16); - if (characters == 2) { - masm_.load32(address, current_character_); - } else { - MOZ_ASSERT(characters == 1); - masm_.load16ZeroExtend(address, current_character_); - } - } -} - -void SMRegExpMacroAssembler::PopCurrentPosition() { Pop(current_position_); } - -void SMRegExpMacroAssembler::PopRegister(int register_index) { - Pop(temp0_); - masm_.storePtr(temp0_, register_location(register_index)); -} - -void SMRegExpMacroAssembler::PushBacktrack(Label* label) { - MOZ_ASSERT(!label->is_bound()); - MOZ_ASSERT(!label->patchOffset_.bound()); - label->patchOffset_ = masm_.movWithPatch(ImmPtr(nullptr), temp0_); - MOZ_ASSERT(label->patchOffset_.bound()); - - Push(temp0_); - - CheckBacktrackStackLimit(); -} - -void SMRegExpMacroAssembler::PushCurrentPosition() { Push(current_position_); } - -void SMRegExpMacroAssembler::PushRegister(int register_index, - StackCheckFlag check_stack_limit) { - masm_.loadPtr(register_location(register_index), temp0_); - Push(temp0_); - if (check_stack_limit) { - CheckBacktrackStackLimit(); - } -} - -void SMRegExpMacroAssembler::ReadCurrentPositionFromRegister(int reg) { - masm_.loadPtr(register_location(reg), current_position_); -} - -void SMRegExpMacroAssembler::WriteCurrentPositionToRegister(int reg, - int cp_offset) { - if (cp_offset == 0) { - masm_.storePtr(current_position_, register_location(reg)); - } else { - Address addr(current_position_, cp_offset * char_size()); - masm_.computeEffectiveAddress(addr, temp0_); - masm_.storePtr(temp0_, register_location(reg)); - } -} - -// Note: The backtrack stack pointer is stored in a register as an -// offset from the stack top, not as a bare pointer, so that it is not -// corrupted if the backtrack stack grows (and therefore moves). -void SMRegExpMacroAssembler::ReadStackPointerFromRegister(int reg) { - masm_.loadPtr(register_location(reg), backtrack_stack_pointer_); - masm_.addPtr(backtrackStackBase(), backtrack_stack_pointer_); -} -void SMRegExpMacroAssembler::WriteStackPointerToRegister(int reg) { - masm_.movePtr(backtrack_stack_pointer_, temp0_); - masm_.subPtr(backtrackStackBase(), temp0_); - masm_.storePtr(temp0_, register_location(reg)); -} - -// When matching a regexp that is anchored at the end, this operation -// is used to try skipping the beginning of long strings. If the -// maximum length of a match is less than the length of the string, we -// can skip the initial len - max_len bytes. -void SMRegExpMacroAssembler::SetCurrentPositionFromEnd(int by) { - js::jit::Label after_position; - masm_.branchPtr(Assembler::GreaterThanOrEqual, current_position_, - ImmWord(-by * char_size()), &after_position); - masm_.movePtr(ImmWord(-by * char_size()), current_position_); - - // On RegExp code entry (where this operation is used), the character before - // the current position is expected to be already loaded. - // We have advanced the position, so it's safe to read backwards. - LoadCurrentCharacterUnchecked(-1, 1); - masm_.bind(&after_position); -} - -void SMRegExpMacroAssembler::SetRegister(int register_index, int to) { - MOZ_ASSERT(register_index >= num_capture_registers_); - masm_.storePtr(ImmWord(to), register_location(register_index)); -} - -// Returns true if a regexp match can be restarted (aka the regexp is global). -// The return value is not used anywhere, but we implement it to be safe. -bool SMRegExpMacroAssembler::Succeed() { - masm_.jump(&success_label_); - return global(); -} - -// Capture registers are initialized to input[-1] -void SMRegExpMacroAssembler::ClearRegisters(int reg_from, int reg_to) { - MOZ_ASSERT(reg_from <= reg_to); - masm_.loadPtr(inputStart(), temp0_); - masm_.subPtr(Imm32(char_size()), temp0_); - for (int reg = reg_from; reg <= reg_to; reg++) { - masm_.storePtr(temp0_, register_location(reg)); - } -} - -void SMRegExpMacroAssembler::Push(Register source) { - MOZ_ASSERT(source != backtrack_stack_pointer_); - - masm_.subPtr(Imm32(sizeof(void*)), backtrack_stack_pointer_); - masm_.storePtr(source, Address(backtrack_stack_pointer_, 0)); -} - -void SMRegExpMacroAssembler::Pop(Register target) { - MOZ_ASSERT(target != backtrack_stack_pointer_); - - masm_.loadPtr(Address(backtrack_stack_pointer_, 0), target); - masm_.addPtr(Imm32(sizeof(void*)), backtrack_stack_pointer_); -} - -void SMRegExpMacroAssembler::JumpOrBacktrack(Label* to) { - if (to) { - masm_.jump(to->inner()); - } else { - Backtrack(); - } -} - -// Generate a quick inline test for backtrack stack overflow. -// If the test fails, call an OOL handler to try growing the stack. -void SMRegExpMacroAssembler::CheckBacktrackStackLimit() { - js::jit::Label no_stack_overflow; - masm_.branchPtr( - Assembler::BelowOrEqual, - AbsoluteAddress(isolate()->regexp_stack()->limit_address_address()), - backtrack_stack_pointer_, &no_stack_overflow); - - masm_.call(&stack_overflow_label_); - - // Exit with an exception if the call failed - masm_.branchTest32(Assembler::Zero, temp0_, temp0_, - &exit_with_exception_label_); - - masm_.bind(&no_stack_overflow); -} - -// This is used to sneak an OOM through the V8 layer. -static Handle DummyCode() { - return Handle::fromHandleValue(JS::UndefinedHandleValue); -} - -// Finalize code. This is called last, so that we know how many -// registers we need. -Handle SMRegExpMacroAssembler::GetCode(Handle source) { - if (!cx_->compartment()->ensureJitCompartmentExists(cx_)) { - return DummyCode(); - } - - masm_.bind(&entry_label_); - - createStackFrame(); - initFrameAndRegs(); - - masm_.jump(&start_label_); - - successHandler(); - exitHandler(); - backtrackHandler(); - stackOverflowHandler(); - - Linker linker(masm_); - JitCode* code = linker.newCode(cx_, REGEXP_CODE); - if (!code) { - ReportOutOfMemory(cx_); - return DummyCode(); - } - - for (LabelPatch& lp : labelPatches_) { - Assembler::PatchDataWithValueCheck(CodeLocationLabel(code, lp.patchOffset_), - ImmPtr(code->raw() + lp.labelOffset_), - ImmPtr(nullptr)); - } - - return Handle(JS::PrivateGCThingValue(code), isolate()); -} - -/* - * The stack will have the following structure: - * sp-> - FrameData - * - inputStart - * - backtrack stack base - * - matches - * - numMatches - * - Registers - * - Capture positions - * - Scratch registers - * --- frame alignment --- - * - Saved register area - * - Return address - */ -void SMRegExpMacroAssembler::createStackFrame() { -#ifdef JS_CODEGEN_ARM64 - // ARM64 communicates stack address via SP, but uses a pseudo-sp (PSP) for - // addressing. The register we use for PSP may however also be used by - // calling code, and it is nonvolatile, so save it. Do this as a special - // case first because the generic save/restore code needs the PSP to be - // initialized already. - MOZ_ASSERT(js::jit::PseudoStackPointer64.Is(masm_.GetStackPointer64())); - masm_.Str(js::jit::PseudoStackPointer64, - vixl::MemOperand(js::jit::sp, -16, vixl::PreIndex)); - - // Initialize the PSP from the SP. - masm_.initPseudoStackPtr(); -#endif - - // Push non-volatile registers which might be modified by jitcode. - size_t pushedNonVolatileRegisters = 0; - for (GeneralRegisterForwardIterator iter(savedRegisters_); iter.more(); - ++iter) { - masm_.Push(*iter); - pushedNonVolatileRegisters++; - } - - // The pointer to InputOutputData is passed as the first argument. - // On x86 we have to load it off the stack into temp0_. - // On other platforms it is already in a register. -#ifdef JS_CODEGEN_X86 - Address ioDataAddr(masm_.getStackPointer(), - (pushedNonVolatileRegisters + 1) * sizeof(void*)); - masm_.loadPtr(ioDataAddr, temp0_); -#else - if (js::jit::IntArgReg0 != temp0_) { - masm_.movePtr(js::jit::IntArgReg0, temp0_); - } -#endif - - // Start a new stack frame. - size_t frameBytes = sizeof(FrameData) + num_registers_ * sizeof(void*); - frameSize_ = js::jit::StackDecrementForCall(js::jit::ABIStackAlignment, - masm_.framePushed(), frameBytes); - masm_.reserveStack(frameSize_); - masm_.checkStackAlignment(); - - // Check if we have space on the stack. Use the *NoInterrupt stack limit to - // avoid failing repeatedly when the regex code is called from Ion JIT code. - // (See bug 1208819) - js::jit::Label stack_ok; - AbsoluteAddress limit_addr(cx_->addressOfJitStackLimitNoInterrupt()); - masm_.branchStackPtrRhs(Assembler::Below, limit_addr, &stack_ok); - - // There is not enough space on the stack. Exit with an exception. - masm_.movePtr(ImmWord(js::RegExpRunStatus_Error), temp0_); - masm_.jump(&exit_label_); - - masm_.bind(&stack_ok); -} - -void SMRegExpMacroAssembler::initFrameAndRegs() { - // At this point, an uninitialized stack frame has been created, - // and the address of the InputOutputData is in temp0_. - Register ioDataReg = temp0_; - - Register matchesReg = temp1_; - masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, matches)), - matchesReg); - - // Initialize output registers - masm_.loadPtr(Address(matchesReg, MatchPairs::offsetOfPairs()), temp2_); - masm_.storePtr(temp2_, matches()); - masm_.load32(Address(matchesReg, MatchPairs::offsetOfPairCount()), temp2_); - masm_.store32(temp2_, numMatches()); - -#ifdef DEBUG - // Bounds-check numMatches. - js::jit::Label enoughRegisters; - masm_.branchPtr(Assembler::GreaterThanOrEqual, temp2_, - ImmWord(num_capture_registers_ / 2), &enoughRegisters); - masm_.assumeUnreachable("Not enough output pairs for RegExp"); - masm_.bind(&enoughRegisters); -#endif - - // Load input start pointer. - masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, inputStart)), - current_position_); - - // Load input end pointer - masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, inputEnd)), - input_end_pointer_); - - // Set up input position to be negative offset from string end. - masm_.subPtr(input_end_pointer_, current_position_); - - // Store inputStart - masm_.storePtr(current_position_, inputStart()); - - // Load start index - Register startIndexReg = temp1_; - masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, startIndex)), - startIndexReg); - masm_.computeEffectiveAddress( - BaseIndex(current_position_, startIndexReg, factor()), current_position_); - - // Initialize current_character_. - // Load newline if index is at start, or previous character otherwise. - js::jit::Label start_regexp; - js::jit::Label load_previous_character; - masm_.branchPtr(Assembler::NotEqual, startIndexReg, ImmWord(0), - &load_previous_character); - masm_.movePtr(ImmWord('\n'), current_character_); - masm_.jump(&start_regexp); - - masm_.bind(&load_previous_character); - LoadCurrentCharacterUnchecked(-1, 1); - masm_.bind(&start_regexp); - - // Initialize captured registers with inputStart - 1 - MOZ_ASSERT(num_capture_registers_ > 0); - Register inputStartMinusOneReg = temp2_; - masm_.loadPtr(inputStart(), inputStartMinusOneReg); - masm_.subPtr(Imm32(char_size()), inputStartMinusOneReg); - if (num_capture_registers_ > 8) { - masm_.movePtr(ImmWord(register_offset(0)), temp1_); - js::jit::Label init_loop; - masm_.bind(&init_loop); - masm_.storePtr(inputStartMinusOneReg, BaseIndex(masm_.getStackPointer(), - temp1_, js::jit::TimesOne)); - masm_.addPtr(ImmWord(sizeof(void*)), temp1_); - masm_.branchPtr(Assembler::LessThan, temp1_, - ImmWord(register_offset(num_capture_registers_)), - &init_loop); - } else { - // Unroll the loop - for (int i = 0; i < num_capture_registers_; i++) { - masm_.storePtr(inputStartMinusOneReg, register_location(i)); - } - } - - // Initialize backtrack stack pointer - masm_.loadPtr(AbsoluteAddress(isolate()->top_of_regexp_stack()), - backtrack_stack_pointer_); - masm_.storePtr(backtrack_stack_pointer_, backtrackStackBase()); -} - -void SMRegExpMacroAssembler::successHandler() { - MOZ_ASSERT(success_label_.used()); - masm_.bind(&success_label_); - - // Copy captures to the MatchPairs pointed to by the InputOutputData. - // Captures are stored as positions, which are negative byte offsets - // from the end of the string. We must convert them to actual - // indices. - // - // Index: [ 0 ][ 1 ][ 2 ][ 3 ][ 4 ][ 5 ][END] - // Pos (1-byte): [-6 ][-5 ][-4 ][-3 ][-2 ][-1 ][ 0 ] // IS = -6 - // Pos (2-byte): [-12][-10][-8 ][-6 ][-4 ][-2 ][ 0 ] // IS = -12 - // - // To convert a position to an index, we subtract InputStart, and - // divide the result by char_size. - Register matchesReg = temp1_; - masm_.loadPtr(matches(), matchesReg); - - Register inputStartReg = temp2_; - masm_.loadPtr(inputStart(), inputStartReg); - - for (int i = 0; i < num_capture_registers_; i++) { - masm_.loadPtr(register_location(i), temp0_); - masm_.subPtr(inputStartReg, temp0_); - if (mode_ == UC16) { - masm_.rshiftPtrArithmetic(Imm32(1), temp0_); - } - masm_.store32(temp0_, Address(matchesReg, i * sizeof(int32_t))); - } - - masm_.movePtr(ImmWord(js::RegExpRunStatus_Success), temp0_); - // This falls through to the exit handler. -} - -void SMRegExpMacroAssembler::exitHandler() { - masm_.bind(&exit_label_); - - if (temp0_ != js::jit::ReturnReg) { - masm_.movePtr(temp0_, js::jit::ReturnReg); - } - - masm_.freeStack(frameSize_); - - // Restore registers which were saved on entry - for (GeneralRegisterBackwardIterator iter(savedRegisters_); iter.more(); - ++iter) { - masm_.Pop(*iter); - } - -#ifdef JS_CODEGEN_ARM64 - // Now restore the value that was in the PSP register on entry, and return. - - // Obtain the correct SP from the PSP. - masm_.Mov(js::jit::sp, js::jit::PseudoStackPointer64); - - // Restore the saved value of the PSP register, this value is whatever the - // caller had saved in it, not any actual SP value, and it must not be - // overwritten subsequently. - masm_.Ldr(js::jit::PseudoStackPointer64, - vixl::MemOperand(js::jit::sp, 16, vixl::PostIndex)); - - // Perform a plain Ret(), as abiret() will move SP <- PSP and that is wrong. - masm_.Ret(vixl::lr); -#else - masm_.abiret(); -#endif - - if (exit_with_exception_label_.used()) { - masm_.bind(&exit_with_exception_label_); - - // Exit with an error result to signal thrown exception - masm_.movePtr(ImmWord(js::RegExpRunStatus_Error), temp0_); - masm_.jump(&exit_label_); - } -} - -void SMRegExpMacroAssembler::backtrackHandler() { - if (!backtrack_label_.used()) { - return; - } - masm_.bind(&backtrack_label_); - Backtrack(); -} - -void SMRegExpMacroAssembler::stackOverflowHandler() { - if (!stack_overflow_label_.used()) { - return; - } - - // Called if the backtrack-stack limit has been hit. - // NOTE: depending on architecture, the call may have - // changed the stack pointer. We adjust for that below. - masm_.bind(&stack_overflow_label_); - - // Load argument - masm_.movePtr(ImmPtr(isolate()->regexp_stack()), temp1_); - - // Save registers before calling C function - LiveGeneralRegisterSet volatileRegs(GeneralRegisterSet::Volatile()); - -#ifdef JS_USE_LINK_REGISTER - masm.pushReturnAddress(); -#endif - - // Adjust for the return address on the stack. - size_t frameOffset = sizeof(void*); - - volatileRegs.takeUnchecked(temp0_); - volatileRegs.takeUnchecked(temp1_); - masm_.PushRegsInMask(volatileRegs); - - masm_.setupUnalignedABICall(temp0_); - masm_.passABIArg(temp1_); - masm_.callWithABI(JS_FUNC_TO_DATA_PTR(void*, GrowBacktrackStack)); - masm_.storeCallBoolResult(temp0_); - - masm_.PopRegsInMask(volatileRegs); - - // If GrowBacktrackStack returned false, we have failed to grow the - // stack, and must exit with a stack-overflow exception. Do this in - // the caller so that the stack is adjusted by our return instruction. - js::jit::Label overflow_return; - masm_.branchTest32(Assembler::Zero, temp0_, temp0_, &overflow_return); - - // Otherwise, store the new backtrack stack base and recompute the new - // top of the stack. - Address bsbAddress(masm_.getStackPointer(), - offsetof(FrameData, backtrackStackBase) + frameOffset); - masm_.subPtr(bsbAddress, backtrack_stack_pointer_); - - masm_.loadPtr(AbsoluteAddress(isolate()->top_of_regexp_stack()), temp1_); - masm_.storePtr(temp1_, bsbAddress); - masm_.addPtr(temp1_, backtrack_stack_pointer_); - - // Resume execution in calling code. - masm_.bind(&overflow_return); - masm_.ret(); -} - -// This is only used by tracing code. -// The return value doesn't matter. -RegExpMacroAssembler::IrregexpImplementation -SMRegExpMacroAssembler::Implementation() { - return kBytecodeImplementation; -} - -/*static */ -uint32_t SMRegExpMacroAssembler::CaseInsensitiveCompareStrings( - const char16_t* substring1, const char16_t* substring2, size_t byteLength) { - JS::AutoCheckCannotGC nogc; - - MOZ_ASSERT(byteLength % sizeof(char16_t) == 0); - size_t length = byteLength / sizeof(char16_t); - - for (size_t i = 0; i < length; i++) { - char16_t c1 = substring1[i]; - char16_t c2 = substring2[i]; - if (c1 != c2) { - c1 = js::unicode::ToUpperCase(c1); - c2 = js::unicode::ToUpperCase(c2); - if (c1 != c2) { - return 0; - } - } - } - - return 1; -} - -/*static */ -uint32_t SMRegExpMacroAssembler::CaseInsensitiveCompareUCStrings( - const char16_t* substring1, const char16_t* substring2, size_t byteLength) { - JS::AutoCheckCannotGC nogc; - - MOZ_ASSERT(byteLength % sizeof(char16_t) == 0); - size_t length = byteLength / sizeof(char16_t); - - for (size_t i = 0; i < length; i++) { - char16_t c1 = substring1[i]; - char16_t c2 = substring2[i]; - if (c1 != c2) { - c1 = js::unicode::FoldCase(c1); - c2 = js::unicode::FoldCase(c2); - if (c1 != c2) { - return 0; - } - } - } - - return 1; -} - -/* static */ -bool SMRegExpMacroAssembler::GrowBacktrackStack(RegExpStack* regexp_stack) { - JS::AutoCheckCannotGC nogc; - size_t size = regexp_stack->stack_capacity(); - return !!regexp_stack->EnsureCapacity(size * 2); -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-nodes.h b/js/src/new-regexp/regexp-nodes.h deleted file mode 100644 index 099687c25..000000000 --- a/js/src/new-regexp/regexp-nodes.h +++ /dev/null @@ -1,750 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_NODES_H_ -#define V8_REGEXP_REGEXP_NODES_H_ - -#include "new-regexp/regexp-macro-assembler.h" - -namespace v8 { -namespace internal { - -class AlternativeGenerationList; -class BoyerMooreLookahead; -class GreedyLoopState; -class Label; -class NodeVisitor; -class QuickCheckDetails; -class RegExpCompiler; -class Trace; -struct PreloadState; -class ChoiceNode; - -#define FOR_EACH_NODE_TYPE(VISIT) \ - VISIT(End) \ - VISIT(Action) \ - VISIT(Choice) \ - VISIT(LoopChoice) \ - VISIT(NegativeLookaroundChoice) \ - VISIT(BackReference) \ - VISIT(Assertion) \ - VISIT(Text) - -struct NodeInfo final { - NodeInfo() - : being_analyzed(false), - been_analyzed(false), - follows_word_interest(false), - follows_newline_interest(false), - follows_start_interest(false), - at_end(false), - visited(false), - replacement_calculated(false) {} - - // Returns true if the interests and assumptions of this node - // matches the given one. - bool Matches(NodeInfo* that) { - return (at_end == that->at_end) && - (follows_word_interest == that->follows_word_interest) && - (follows_newline_interest == that->follows_newline_interest) && - (follows_start_interest == that->follows_start_interest); - } - - // Updates the interests of this node given the interests of the - // node preceding it. - void AddFromPreceding(NodeInfo* that) { - at_end |= that->at_end; - follows_word_interest |= that->follows_word_interest; - follows_newline_interest |= that->follows_newline_interest; - follows_start_interest |= that->follows_start_interest; - } - - bool HasLookbehind() { - return follows_word_interest || follows_newline_interest || - follows_start_interest; - } - - // Sets the interests of this node to include the interests of the - // following node. - void AddFromFollowing(NodeInfo* that) { - follows_word_interest |= that->follows_word_interest; - follows_newline_interest |= that->follows_newline_interest; - follows_start_interest |= that->follows_start_interest; - } - - void ResetCompilationState() { - being_analyzed = false; - been_analyzed = false; - } - - bool being_analyzed : 1; - bool been_analyzed : 1; - - // These bits are set of this node has to know what the preceding - // character was. - bool follows_word_interest : 1; - bool follows_newline_interest : 1; - bool follows_start_interest : 1; - - bool at_end : 1; - bool visited : 1; - bool replacement_calculated : 1; -}; - -struct EatsAtLeastInfo final { - EatsAtLeastInfo() : EatsAtLeastInfo(0) {} - explicit EatsAtLeastInfo(uint8_t eats) - : eats_at_least_from_possibly_start(eats), - eats_at_least_from_not_start(eats) {} - void SetMin(const EatsAtLeastInfo& other) { - if (other.eats_at_least_from_possibly_start < - eats_at_least_from_possibly_start) { - eats_at_least_from_possibly_start = - other.eats_at_least_from_possibly_start; - } - if (other.eats_at_least_from_not_start < eats_at_least_from_not_start) { - eats_at_least_from_not_start = other.eats_at_least_from_not_start; - } - } - - // Any successful match starting from the current node will consume at least - // this many characters. This does not necessarily mean that there is a - // possible match with exactly this many characters, but we generally try to - // get this number as high as possible to allow for early exit on failure. - uint8_t eats_at_least_from_possibly_start; - - // Like eats_at_least_from_possibly_start, but with the additional assumption - // that start-of-string assertions (^) can't match. This value is greater than - // or equal to eats_at_least_from_possibly_start. - uint8_t eats_at_least_from_not_start; -}; - -class RegExpNode : public ZoneObject { - public: - explicit RegExpNode(Zone* zone) - : replacement_(nullptr), - on_work_list_(false), - trace_count_(0), - zone_(zone) { - bm_info_[0] = bm_info_[1] = nullptr; - } - virtual ~RegExpNode(); - virtual void Accept(NodeVisitor* visitor) = 0; - // Generates a goto to this node or actually generates the code at this point. - virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; - // How many characters must this node consume at a minimum in order to - // succeed. The not_at_start argument is used to indicate that we know we are - // not at the start of the input. In this case anchored branches will always - // fail and can be ignored when determining how many characters are consumed - // on success. If this node has not been analyzed yet, EatsAtLeast returns 0. - int EatsAtLeast(bool not_at_start); - // Returns how many characters this node must consume in order to succeed, - // given that this is a LoopChoiceNode whose counter register is in a - // newly-initialized state at the current position in the generated code. For - // example, consider /a{6,8}/. Absent any extra information, the - // LoopChoiceNode for the repetition must report that it consumes at least - // zero characters, because it may have already looped several times. However, - // with a newly-initialized counter, it can report that it consumes at least - // six characters. - virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry(); - // Emits some quick code that checks whether the preloaded characters match. - // Falls through on certain failure, jumps to the label on possible success. - // If the node cannot make a quick check it does nothing and returns false. - bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace, - Trace* trace, bool preload_has_checked_bounds, - Label* on_possible_success, - QuickCheckDetails* details_return, - bool fall_through_on_failure, ChoiceNode* predecessor); - // For a given number of characters this returns a mask and a value. The - // next n characters are anded with the mask and compared with the value. - // A comparison failure indicates the node cannot match the next n characters. - // A comparison success indicates the node may match. - virtual void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start) = 0; - // Fills in quick check details for this node, given that this is a - // LoopChoiceNode whose counter register is in a newly-initialized state at - // the current position in the generated code. For example, consider /a{6,8}/. - // Absent any extra information, the LoopChoiceNode for the repetition cannot - // generate any useful quick check because a match might be the (empty) - // continuation node. However, with a newly-initialized counter, it can - // generate a quick check for several 'a' characters at once. - virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start); - static const int kNodeIsTooComplexForGreedyLoops = kMinInt; - virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } - // Only returns the successor for a text node of length 1 that matches any - // character and that has no guards on it. - virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( - RegExpCompiler* compiler) { - return nullptr; - } - - // Collects information on the possible code units (mod 128) that can match if - // we look forward. This is used for a Boyer-Moore-like string searching - // implementation. TODO(erikcorry): This should share more code with - // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit - // the number of nodes we are willing to look at in order to create this data. - static const int kRecursionBudget = 200; - bool KeepRecursing(RegExpCompiler* compiler); - virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) { - UNREACHABLE(); - } - - // If we know that the input is one-byte then there are some nodes that can - // never match. This method returns a node that can be substituted for - // itself, or nullptr if the node can never match. - virtual RegExpNode* FilterOneByte(int depth) { return this; } - // Helper for FilterOneByte. - RegExpNode* replacement() { - DCHECK(info()->replacement_calculated); - return replacement_; - } - RegExpNode* set_replacement(RegExpNode* replacement) { - info()->replacement_calculated = true; - replacement_ = replacement; - return replacement; // For convenience. - } - - // We want to avoid recalculating the lookahead info, so we store it on the - // node. Only info that is for this node is stored. We can tell that the - // info is for this node when offset == 0, so the information is calculated - // relative to this node. - void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { - if (offset == 0) set_bm_info(not_at_start, bm); - } - - Label* label() { return &label_; } - // If non-generic code is generated for a node (i.e. the node is not at the - // start of the trace) then it cannot be reused. This variable sets a limit - // on how often we allow that to happen before we insist on starting a new - // trace and generating generic code for a node that can be reused by flushing - // the deferred actions in the current trace and generating a goto. - static const int kMaxCopiesCodeGenerated = 10; - - bool on_work_list() { return on_work_list_; } - void set_on_work_list(bool value) { on_work_list_ = value; } - - NodeInfo* info() { return &info_; } - const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; } - void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) { - eats_at_least_ = eats_at_least; - } - - BoyerMooreLookahead* bm_info(bool not_at_start) { - return bm_info_[not_at_start ? 1 : 0]; - } - - Zone* zone() const { return zone_; } - - protected: - enum LimitResult { DONE, CONTINUE }; - RegExpNode* replacement_; - - LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); - - void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { - bm_info_[not_at_start ? 1 : 0] = bm; - } - - private: - static const int kFirstCharBudget = 10; - Label label_; - bool on_work_list_; - NodeInfo info_; - - // Saved values for EatsAtLeast results, to avoid recomputation. Filled in - // during analysis (valid if info_.been_analyzed is true). - EatsAtLeastInfo eats_at_least_; - - // This variable keeps track of how many times code has been generated for - // this node (in different traces). We don't keep track of where the - // generated code is located unless the code is generated at the start of - // a trace, in which case it is generic and can be reused by flushing the - // deferred operations in the current trace and generating a goto. - int trace_count_; - BoyerMooreLookahead* bm_info_[2]; - - Zone* zone_; -}; - -class SeqRegExpNode : public RegExpNode { - public: - explicit SeqRegExpNode(RegExpNode* on_success) - : RegExpNode(on_success->zone()), on_success_(on_success) {} - RegExpNode* on_success() { return on_success_; } - void set_on_success(RegExpNode* node) { on_success_ = node; } - RegExpNode* FilterOneByte(int depth) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override { - on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); - if (offset == 0) set_bm_info(not_at_start, bm); - } - - protected: - RegExpNode* FilterSuccessor(int depth); - - private: - RegExpNode* on_success_; -}; - -class ActionNode : public SeqRegExpNode { - public: - enum ActionType { - SET_REGISTER_FOR_LOOP, - INCREMENT_REGISTER, - STORE_POSITION, - BEGIN_SUBMATCH, - POSITIVE_SUBMATCH_SUCCESS, - EMPTY_MATCH_CHECK, - CLEAR_CAPTURES - }; - static ActionNode* SetRegisterForLoop(int reg, int val, - RegExpNode* on_success); - static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); - static ActionNode* StorePosition(int reg, bool is_capture, - RegExpNode* on_success); - static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); - static ActionNode* BeginSubmatch(int stack_pointer_reg, int position_reg, - RegExpNode* on_success); - static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, - int restore_reg, - int clear_capture_count, - int clear_capture_from, - RegExpNode* on_success); - static ActionNode* EmptyMatchCheck(int start_register, - int repetition_register, - int repetition_limit, - RegExpNode* on_success); - void Accept(NodeVisitor* visitor) override; - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int filled_in, - bool not_at_start) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override; - ActionType action_type() { return action_type_; } - // TODO(erikcorry): We should allow some action nodes in greedy loops. - int GreedyLoopTextLength() override { - return kNodeIsTooComplexForGreedyLoops; - } - - private: - union { - struct { - int reg; - int value; - } u_store_register; - struct { - int reg; - } u_increment_register; - struct { - int reg; - bool is_capture; - } u_position_register; - struct { - int stack_pointer_register; - int current_position_register; - int clear_register_count; - int clear_register_from; - } u_submatch; - struct { - int start_register; - int repetition_register; - int repetition_limit; - } u_empty_match_check; - struct { - int range_from; - int range_to; - } u_clear_captures; - } data_; - ActionNode(ActionType action_type, RegExpNode* on_success) - : SeqRegExpNode(on_success), action_type_(action_type) {} - ActionType action_type_; - friend class DotPrinterImpl; -}; - -class TextNode : public SeqRegExpNode { - public: - TextNode(ZoneList* elms, bool read_backward, - RegExpNode* on_success) - : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {} - TextNode(RegExpCharacterClass* that, bool read_backward, - RegExpNode* on_success) - : SeqRegExpNode(on_success), - elms_(new (zone()) ZoneList(1, zone())), - read_backward_(read_backward) { - elms_->Add(TextElement::CharClass(that), zone()); - } - // Create TextNode for a single character class for the given ranges. - static TextNode* CreateForCharacterRanges(Zone* zone, - ZoneList* ranges, - bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags); - // Create TextNode for a surrogate pair with a range given for the - // lead and the trail surrogate each. - static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead, - CharacterRange trail, - bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags); - void Accept(NodeVisitor* visitor) override; - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int characters_filled_in, - bool not_at_start) override; - ZoneList* elements() { return elms_; } - bool read_backward() { return read_backward_; } - void MakeCaseIndependent(Isolate* isolate, bool is_one_byte); - int GreedyLoopTextLength() override; - RegExpNode* GetSuccessorOfOmnivorousTextNode( - RegExpCompiler* compiler) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override; - void CalculateOffsets(); - RegExpNode* FilterOneByte(int depth) override; - int Length(); - - private: - enum TextEmitPassType { - NON_LATIN1_MATCH, // Check for characters that can't match. - SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. - NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. - CASE_CHARACTER_MATCH, // Case-independent single character check. - CHARACTER_CLASS_MATCH // Character class. - }; - static bool SkipPass(TextEmitPassType pass, bool ignore_case); - static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; - static const int kLastPass = CHARACTER_CLASS_MATCH; - void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, - bool preloaded, Trace* trace, bool first_element_checked, - int* checked_up_to); - ZoneList* elms_; - bool read_backward_; -}; - -class AssertionNode : public SeqRegExpNode { - public: - enum AssertionType { - AT_END, - AT_START, - AT_BOUNDARY, - AT_NON_BOUNDARY, - AFTER_NEWLINE - }; - static AssertionNode* AtEnd(RegExpNode* on_success) { - return new (on_success->zone()) AssertionNode(AT_END, on_success); - } - static AssertionNode* AtStart(RegExpNode* on_success) { - return new (on_success->zone()) AssertionNode(AT_START, on_success); - } - static AssertionNode* AtBoundary(RegExpNode* on_success) { - return new (on_success->zone()) AssertionNode(AT_BOUNDARY, on_success); - } - static AssertionNode* AtNonBoundary(RegExpNode* on_success) { - return new (on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success); - } - static AssertionNode* AfterNewline(RegExpNode* on_success) { - return new (on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success); - } - void Accept(NodeVisitor* visitor) override; - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int filled_in, - bool not_at_start) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override; - AssertionType assertion_type() { return assertion_type_; } - - private: - void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); - enum IfPrevious { kIsNonWord, kIsWord }; - void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace, - IfPrevious backtrack_if_previous); - AssertionNode(AssertionType t, RegExpNode* on_success) - : SeqRegExpNode(on_success), assertion_type_(t) {} - AssertionType assertion_type_; -}; - -class BackReferenceNode : public SeqRegExpNode { - public: - BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags, - bool read_backward, RegExpNode* on_success) - : SeqRegExpNode(on_success), - start_reg_(start_reg), - end_reg_(end_reg), - flags_(flags), - read_backward_(read_backward) {} - void Accept(NodeVisitor* visitor) override; - int start_register() { return start_reg_; } - int end_register() { return end_reg_; } - bool read_backward() { return read_backward_; } - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int characters_filled_in, - bool not_at_start) override { - return; - } - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override; - - private: - int start_reg_; - int end_reg_; - JSRegExp::Flags flags_; - bool read_backward_; -}; - -class EndNode : public RegExpNode { - public: - enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; - EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {} - void Accept(NodeVisitor* visitor) override; - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int characters_filled_in, - bool not_at_start) override { - // Returning 0 from EatsAtLeast should ensure we never get here. - UNREACHABLE(); - } - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override { - // Returning 0 from EatsAtLeast should ensure we never get here. - UNREACHABLE(); - } - - private: - Action action_; -}; - -class NegativeSubmatchSuccess : public EndNode { - public: - NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, - int clear_capture_count, int clear_capture_start, - Zone* zone) - : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), - stack_pointer_register_(stack_pointer_reg), - current_position_register_(position_reg), - clear_capture_count_(clear_capture_count), - clear_capture_start_(clear_capture_start) {} - void Emit(RegExpCompiler* compiler, Trace* trace) override; - - private: - int stack_pointer_register_; - int current_position_register_; - int clear_capture_count_; - int clear_capture_start_; -}; - -class Guard : public ZoneObject { - public: - enum Relation { LT, GEQ }; - Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {} - int reg() { return reg_; } - Relation op() { return op_; } - int value() { return value_; } - - private: - int reg_; - Relation op_; - int value_; -}; - -class GuardedAlternative { - public: - explicit GuardedAlternative(RegExpNode* node) - : node_(node), guards_(nullptr) {} - void AddGuard(Guard* guard, Zone* zone); - RegExpNode* node() { return node_; } - void set_node(RegExpNode* node) { node_ = node; } - ZoneList* guards() { return guards_; } - - private: - RegExpNode* node_; - ZoneList* guards_; -}; - -class AlternativeGeneration; - -class ChoiceNode : public RegExpNode { - public: - explicit ChoiceNode(int expected_size, Zone* zone) - : RegExpNode(zone), - alternatives_(new (zone) - ZoneList(expected_size, zone)), - not_at_start_(false), - being_calculated_(false) {} - void Accept(NodeVisitor* visitor) override; - void AddAlternative(GuardedAlternative node) { - alternatives()->Add(node, zone()); - } - ZoneList* alternatives() { return alternatives_; } - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int characters_filled_in, - bool not_at_start) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override; - - bool being_calculated() { return being_calculated_; } - bool not_at_start() { return not_at_start_; } - void set_not_at_start() { not_at_start_ = true; } - void set_being_calculated(bool b) { being_calculated_ = b; } - virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { - return true; - } - RegExpNode* FilterOneByte(int depth) override; - virtual bool read_backward() { return false; } - - protected: - int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); - ZoneList* alternatives_; - - private: - template - friend class Analysis; - - void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, - Trace* trace); - int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); - void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, - GuardedAlternative alternative, - AlternativeGeneration* alt_gen, - int preload_characters, - bool next_expects_preload); - void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, - PreloadState* preloads); - void AssertGuardsMentionRegisters(Trace* trace); - int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace); - Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, - AlternativeGenerationList* alt_gens, - PreloadState* preloads, - GreedyLoopState* greedy_loop_state, int text_length); - void EmitChoices(RegExpCompiler* compiler, - AlternativeGenerationList* alt_gens, int first_choice, - Trace* trace, PreloadState* preloads); - - // If true, this node is never checked at the start of the input. - // Allows a new trace to start with at_start() set to false. - bool not_at_start_; - bool being_calculated_; -}; - -class NegativeLookaroundChoiceNode : public ChoiceNode { - public: - explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail, - GuardedAlternative then_do_this, - Zone* zone) - : ChoiceNode(2, zone) { - AddAlternative(this_must_fail); - AddAlternative(then_do_this); - } - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int characters_filled_in, - bool not_at_start) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override { - continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm, - not_at_start); - if (offset == 0) set_bm_info(not_at_start, bm); - } - static constexpr int kLookaroundIndex = 0; - static constexpr int kContinueIndex = 1; - RegExpNode* lookaround_node() { - return alternatives()->at(kLookaroundIndex).node(); - } - RegExpNode* continue_node() { - return alternatives()->at(kContinueIndex).node(); - } - // For a negative lookahead we don't emit the quick check for the - // alternative that is expected to fail. This is because quick check code - // starts by loading enough characters for the alternative that takes fewest - // characters, but on a negative lookahead the negative branch did not take - // part in that calculation (EatsAtLeast) so the assumptions don't hold. - bool try_to_emit_quick_check_for_alternative(bool is_first) override { - return !is_first; - } - void Accept(NodeVisitor* visitor) override; - RegExpNode* FilterOneByte(int depth) override; -}; - -class LoopChoiceNode : public ChoiceNode { - public: - LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, - int min_loop_iterations, Zone* zone) - : ChoiceNode(2, zone), - loop_node_(nullptr), - continue_node_(nullptr), - body_can_be_zero_length_(body_can_be_zero_length), - read_backward_(read_backward), - traversed_loop_initialization_node_(false), - min_loop_iterations_(min_loop_iterations) {} - void AddLoopAlternative(GuardedAlternative alt); - void AddContinueAlternative(GuardedAlternative alt); - void Emit(RegExpCompiler* compiler, Trace* trace) override; - void GetQuickCheckDetails(QuickCheckDetails* details, - RegExpCompiler* compiler, int characters_filled_in, - bool not_at_start) override; - void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, - RegExpCompiler* compiler, - int characters_filled_in, - bool not_at_start) override; - void FillInBMInfo(Isolate* isolate, int offset, int budget, - BoyerMooreLookahead* bm, bool not_at_start) override; - EatsAtLeastInfo EatsAtLeastFromLoopEntry() override; - RegExpNode* loop_node() { return loop_node_; } - RegExpNode* continue_node() { return continue_node_; } - bool body_can_be_zero_length() { return body_can_be_zero_length_; } - int min_loop_iterations() const { return min_loop_iterations_; } - bool read_backward() override { return read_backward_; } - void Accept(NodeVisitor* visitor) override; - RegExpNode* FilterOneByte(int depth) override; - - private: - // AddAlternative is made private for loop nodes because alternatives - // should not be added freely, we need to keep track of which node - // goes back to the node itself. - void AddAlternative(GuardedAlternative node) { - ChoiceNode::AddAlternative(node); - } - - RegExpNode* loop_node_; - RegExpNode* continue_node_; - bool body_can_be_zero_length_; - bool read_backward_; - - // Temporary marker set only while generating quick check details. Represents - // whether GetQuickCheckDetails traversed the initialization node for this - // loop's counter. If so, we may be able to generate stricter quick checks - // because we know the loop node must match at least min_loop_iterations_ - // times before the continuation node can match. - bool traversed_loop_initialization_node_; - - // The minimum number of times the loop_node_ must match before the - // continue_node_ might be considered. This value can be temporarily decreased - // while generating quick check details, to represent the remaining iterations - // after the completed portion of the quick check details. - int min_loop_iterations_; - - friend class IterationDecrementer; - friend class LoopInitializationMarker; -}; - -class NodeVisitor { - public: - virtual ~NodeVisitor() = default; -#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0; - FOR_EACH_NODE_TYPE(DECLARE_VISIT) -#undef DECLARE_VISIT -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_NODES_H_ diff --git a/js/src/new-regexp/regexp-parser.cc b/js/src/new-regexp/regexp-parser.cc deleted file mode 100644 index a26e35438..000000000 --- a/js/src/new-regexp/regexp-parser.cc +++ /dev/null @@ -1,2109 +0,0 @@ -// Copyright 2016 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-parser.h" - -#include - -#include "new-regexp/property-sequences.h" -#include "new-regexp/regexp-macro-assembler.h" -#include "new-regexp/regexp.h" - -#ifdef V8_INTL_SUPPORT -#include "unicode/uniset.h" -#endif // V8_INTL_SUPPORT - -namespace v8 { -namespace internal { - -RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, - Isolate* isolate, Zone* zone) - : isolate_(isolate), - zone_(zone), - captures_(nullptr), - named_captures_(nullptr), - named_back_references_(nullptr), - in_(in), - current_(kEndMarker), - top_level_flags_(flags), - next_pos_(0), - captures_started_(0), - capture_count_(0), - has_more_(true), - simple_(false), - contains_anchor_(false), - is_scanned_for_captures_(false), - has_named_captures_(false), - failed_(false) { - Advance(); -} - -template -inline uc32 RegExpParser::ReadNext() { - int position = next_pos_; - uc32 c0 = in()->Get(position); - position++; - // Read the whole surrogate pair in case of unicode flag, if possible. - if (unicode() && position < in()->length() && - unibrow::Utf16::IsLeadSurrogate(static_cast(c0))) { - uc16 c1 = in()->Get(position); - if (unibrow::Utf16::IsTrailSurrogate(c1)) { - c0 = unibrow::Utf16::CombineSurrogatePair(static_cast(c0), c1); - position++; - } - } - if (update_position) next_pos_ = position; - return c0; -} - - -uc32 RegExpParser::Next() { - if (has_next()) { - return ReadNext(); - } else { - return kEndMarker; - } -} - -void RegExpParser::Advance() { - if (has_next()) { - StackLimitCheck check(isolate()); - if (check.HasOverflowed()) { - if (FLAG_correctness_fuzzer_suppressions) { - FATAL("Aborting on stack overflow"); - } - ReportError(RegExpError::kStackOverflow); - } else if (zone()->excess_allocation()) { - if (FLAG_correctness_fuzzer_suppressions) { - FATAL("Aborting on excess zone allocation"); - } - ReportError(RegExpError::kTooLarge); - } else { - current_ = ReadNext(); - } - } else { - current_ = kEndMarker; - // Advance so that position() points to 1-after-the-last-character. This is - // important so that Reset() to this position works correctly. - next_pos_ = in()->length() + 1; - has_more_ = false; - } -} - - -void RegExpParser::Reset(int pos) { - next_pos_ = pos; - has_more_ = (pos < in()->length()); - Advance(); -} - -void RegExpParser::Advance(int dist) { - next_pos_ += dist - 1; - Advance(); -} - - -bool RegExpParser::simple() { return simple_; } - -bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { - switch (c) { - case '^': - case '$': - case '\\': - case '.': - case '*': - case '+': - case '?': - case '(': - case ')': - case '[': - case ']': - case '{': - case '}': - case '|': - case '/': - return true; - default: - break; - } - return false; -} - -RegExpTree* RegExpParser::ReportError(RegExpError error) { - if (failed_) return nullptr; // Do not overwrite any existing error. - failed_ = true; - error_ = error; - error_pos_ = position(); - // Zip to the end to make sure no more input is read. - current_ = kEndMarker; - next_pos_ = in()->length(); - return nullptr; -} - -#define CHECK_FAILED /**/); \ - if (failed_) return nullptr; \ - ((void)0 - -// Pattern :: -// Disjunction -RegExpTree* RegExpParser::ParsePattern() { - RegExpTree* result = ParseDisjunction(CHECK_FAILED); - PatchNamedBackReferences(CHECK_FAILED); - DCHECK(!has_more()); - // If the result of parsing is a literal string atom, and it has the - // same length as the input, then the atom is identical to the input. - if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { - simple_ = true; - } - return result; -} - - -// Disjunction :: -// Alternative -// Alternative | Disjunction -// Alternative :: -// [empty] -// Term Alternative -// Term :: -// Assertion -// Atom -// Atom Quantifier -RegExpTree* RegExpParser::ParseDisjunction() { - // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, - 0, nullptr, top_level_flags_, zone()); - RegExpParserState* state = &initial_state; - // Cache the builder in a local variable for quick access. - RegExpBuilder* builder = initial_state.builder(); - while (true) { - switch (current()) { - case kEndMarker: - if (state->IsSubexpression()) { - // Inside a parenthesized group when hitting end of input. - return ReportError(RegExpError::kUnterminatedGroup); - } - DCHECK_EQ(INITIAL, state->group_type()); - // Parsing completed successfully. - return builder->ToRegExp(); - case ')': { - if (!state->IsSubexpression()) { - return ReportError(RegExpError::kUnmatchedParen); - } - DCHECK_NE(INITIAL, state->group_type()); - - Advance(); - // End disjunction parsing and convert builder content to new single - // regexp atom. - RegExpTree* body = builder->ToRegExp(); - - int end_capture_index = captures_started(); - - int capture_index = state->capture_index(); - SubexpressionType group_type = state->group_type(); - - // Build result of subexpression. - if (group_type == CAPTURE) { - if (state->IsNamedCapture()) { - CreateNamedCaptureAtIndex(state->capture_name(), - capture_index CHECK_FAILED); - } - RegExpCapture* capture = GetCapture(capture_index); - capture->set_body(body); - body = capture; - } else if (group_type == GROUPING) { - body = new (zone()) RegExpGroup(body); - } else { - DCHECK(group_type == POSITIVE_LOOKAROUND || - group_type == NEGATIVE_LOOKAROUND); - bool is_positive = (group_type == POSITIVE_LOOKAROUND); - body = new (zone()) RegExpLookaround( - body, is_positive, end_capture_index - capture_index, - capture_index, state->lookaround_type()); - } - - // Restore previous state. - state = state->previous_state(); - builder = state->builder(); - - builder->AddAtom(body); - // For compatibility with JSC and ES3, we allow quantifiers after - // lookaheads, and break in all cases. - break; - } - case '|': { - Advance(); - builder->NewAlternative(); - continue; - } - case '*': - case '+': - case '?': - return ReportError(RegExpError::kNothingToRepeat); - case '^': { - Advance(); - if (builder->multiline()) { - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::START_OF_LINE, builder->flags())); - } else { - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::START_OF_INPUT, builder->flags())); - set_contains_anchor(); - } - continue; - } - case '$': { - Advance(); - RegExpAssertion::AssertionType assertion_type = - builder->multiline() ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; - builder->AddAssertion( - new (zone()) RegExpAssertion(assertion_type, builder->flags())); - continue; - } - case '.': { - Advance(); - ZoneList* ranges = - new (zone()) ZoneList(2, zone()); - - if (builder->dotall()) { - // Everything. - CharacterRange::AddClassEscape('*', ranges, false, zone()); - } else { - // Everything except \x0A, \x0D, \u2028 and \u2029 - CharacterRange::AddClassEscape('.', ranges, false, zone()); - } - - RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - case '(': { - state = ParseOpenParenthesis(state CHECK_FAILED); - builder = state->builder(); - continue; - } - case '[': { - RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); - builder->AddCharacterClass(cc->AsCharacterClass()); - break; - } - // Atom :: - // \ AtomEscape - case '\\': - switch (Next()) { - case kEndMarker: - return ReportError(RegExpError::kEscapeAtEndOfPattern); - case 'b': - Advance(2); - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::BOUNDARY, builder->flags())); - continue; - case 'B': - Advance(2); - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::NON_BOUNDARY, builder->flags())); - continue; - // AtomEscape :: - // CharacterClassEscape - // - // CharacterClassEscape :: one of - // d D s S w W - case 'd': - case 'D': - case 's': - case 'S': - case 'w': - case 'W': { - uc32 c = Next(); - Advance(2); - ZoneList* ranges = - new (zone()) ZoneList(2, zone()); - CharacterRange::AddClassEscape( - c, ranges, unicode() && builder->ignore_case(), zone()); - RegExpCharacterClass* cc = new (zone()) - RegExpCharacterClass(zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - case 'p': - case 'P': { - uc32 p = Next(); - Advance(2); - if (unicode()) { - ZoneList* ranges = - new (zone()) ZoneList(2, zone()); - ZoneVector name_1(zone()); - ZoneVector name_2(zone()); - if (ParsePropertyClassName(&name_1, &name_2)) { - if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { - RegExpCharacterClass* cc = new (zone()) - RegExpCharacterClass(zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - if (p == 'p' && name_2.empty()) { - RegExpTree* sequence = GetPropertySequence(name_1); - if (sequence != nullptr) { - builder->AddAtom(sequence); - break; - } - } - } - return ReportError(RegExpError::kInvalidPropertyName); - } else { - builder->AddCharacter(p); - } - break; - } - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - int index = 0; - bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); - if (is_backref) { - if (state->IsInsideCaptureGroup(index)) { - // The back reference is inside the capture group it refers to. - // Nothing can possibly have been captured yet, so we use empty - // instead. This ensures that, when checking a back reference, - // the capture registers of the referenced capture are either - // both set or both cleared. - builder->AddEmpty(); - } else { - RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = - new (zone()) RegExpBackReference(capture, builder->flags()); - builder->AddAtom(atom); - } - break; - } - // With /u, no identity escapes except for syntax characters - // are allowed. Otherwise, all identity escapes are allowed. - if (unicode()) { - return ReportError(RegExpError::kInvalidEscape); - } - uc32 first_digit = Next(); - if (first_digit == '8' || first_digit == '9') { - builder->AddCharacter(first_digit); - Advance(2); - break; - } - V8_FALLTHROUGH; - } - case '0': { - Advance(); - if (unicode() && Next() >= '0' && Next() <= '9') { - // With /u, decimal escape with leading 0 are not parsed as octal. - return ReportError(RegExpError::kInvalidDecimalEscape); - } - uc32 octal = ParseOctalLiteral(); - builder->AddCharacter(octal); - break; - } - // ControlEscape :: one of - // f n r t v - case 'f': - Advance(2); - builder->AddCharacter('\f'); - break; - case 'n': - Advance(2); - builder->AddCharacter('\n'); - break; - case 'r': - Advance(2); - builder->AddCharacter('\r'); - break; - case 't': - Advance(2); - builder->AddCharacter('\t'); - break; - case 'v': - Advance(2); - builder->AddCharacter('\v'); - break; - case 'c': { - Advance(); - uc32 controlLetter = Next(); - // Special case if it is an ASCII letter. - // Convert lower case letters to uppercase. - uc32 letter = controlLetter & ~('a' ^ 'A'); - if (letter < 'A' || 'Z' < letter) { - // controlLetter is not in range 'A'-'Z' or 'a'-'z'. - // Read the backslash as a literal character instead of as - // starting an escape. - // ES#prod-annexB-ExtendedPatternCharacter - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidUnicodeEscape); - } - builder->AddCharacter('\\'); - } else { - Advance(2); - builder->AddCharacter(controlLetter & 0x1F); - } - break; - } - case 'x': { - Advance(2); - uc32 value; - if (ParseHexEscape(2, &value)) { - builder->AddCharacter(value); - } else if (!unicode()) { - builder->AddCharacter('x'); - } else { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidEscape); - } - break; - } - case 'u': { - Advance(2); - uc32 value; - if (ParseUnicodeEscape(&value)) { - builder->AddEscapedUnicodeCharacter(value); - } else if (!unicode()) { - builder->AddCharacter('u'); - } else { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidUnicodeEscape); - } - break; - } - case 'k': - // Either an identity escape or a named back-reference. The two - // interpretations are mutually exclusive: '\k' is interpreted as - // an identity escape for non-Unicode patterns without named - // capture groups, and as the beginning of a named back-reference - // in all other cases. - if (unicode() || HasNamedCaptures()) { - Advance(2); - ParseNamedBackReference(builder, state CHECK_FAILED); - break; - } - V8_FALLTHROUGH; - default: - Advance(); - // With /u, no identity escapes except for syntax characters - // are allowed. Otherwise, all identity escapes are allowed. - if (!unicode() || IsSyntaxCharacterOrSlash(current())) { - builder->AddCharacter(current()); - Advance(); - } else { - return ReportError(RegExpError::kInvalidEscape); - } - break; - } - break; - case '{': { - int dummy; - bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); - if (parsed) return ReportError(RegExpError::kNothingToRepeat); - V8_FALLTHROUGH; - } - case '}': - case ']': - if (unicode()) { - return ReportError(RegExpError::kLoneQuantifierBrackets); - } - V8_FALLTHROUGH; - default: - builder->AddUnicodeCharacter(current()); - Advance(); - break; - } // end switch(current()) - - int min; - int max; - switch (current()) { - // QuantifierPrefix :: - // * - // + - // ? - // { - case '*': - min = 0; - max = RegExpTree::kInfinity; - Advance(); - break; - case '+': - min = 1; - max = RegExpTree::kInfinity; - Advance(); - break; - case '?': - min = 0; - max = 1; - Advance(); - break; - case '{': - if (ParseIntervalQuantifier(&min, &max)) { - if (max < min) { - return ReportError(RegExpError::kRangeOutOfOrder); - } - break; - } else if (unicode()) { - // With /u, incomplete quantifiers are not allowed. - return ReportError(RegExpError::kIncompleteQuantifier); - } - continue; - default: - continue; - } - RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; - if (current() == '?') { - quantifier_type = RegExpQuantifier::NON_GREEDY; - Advance(); - } else if (FLAG_regexp_possessive_quantifier && current() == '+') { - // FLAG_regexp_possessive_quantifier is a debug-only flag. - quantifier_type = RegExpQuantifier::POSSESSIVE; - Advance(); - } - if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { - return ReportError(RegExpError::kInvalidQuantifier); - } - } -} - -RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( - RegExpParserState* state) { - RegExpLookaround::Type lookaround_type = state->lookaround_type(); - bool is_named_capture = false; - JSRegExp::Flags switch_on = JSRegExp::kNone; - JSRegExp::Flags switch_off = JSRegExp::kNone; - const ZoneVector* capture_name = nullptr; - SubexpressionType subexpr_type = CAPTURE; - Advance(); - if (current() == '?') { - switch (Next()) { - case ':': - Advance(2); - subexpr_type = GROUPING; - break; - case '=': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = POSITIVE_LOOKAROUND; - break; - case '!': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - case '-': - case 'i': - case 's': - case 'm': { - if (!FLAG_regexp_mode_modifiers) { - ReportError(RegExpError::kInvalidGroup); - return nullptr; - } - Advance(); - bool flags_sense = true; // Switching on flags. - while (subexpr_type != GROUPING) { - switch (current()) { - case '-': - if (!flags_sense) { - ReportError(RegExpError::kMultipleFlagDashes); - return nullptr; - } - flags_sense = false; - Advance(); - continue; - case 's': - case 'i': - case 'm': { - JSRegExp::Flags bit = JSRegExp::kUnicode; - if (current() == 'i') bit = JSRegExp::kIgnoreCase; - if (current() == 'm') bit = JSRegExp::kMultiline; - if (current() == 's') bit = JSRegExp::kDotAll; - if (((switch_on | switch_off) & bit) != 0) { - ReportError(RegExpError::kRepeatedFlag); - return nullptr; - } - if (flags_sense) { - switch_on |= bit; - } else { - switch_off |= bit; - } - Advance(); - continue; - } - case ')': { - Advance(); - state->builder() - ->FlushText(); // Flush pending text using old flags. - // These (?i)-style flag switches don't put us in a subexpression - // at all, they just modify the flags in the rest of the current - // subexpression. - JSRegExp::Flags flags = - (state->builder()->flags() | switch_on) & ~switch_off; - state->builder()->set_flags(flags); - return state; - } - case ':': - Advance(); - subexpr_type = GROUPING; // Will break us out of the outer loop. - continue; - default: - ReportError(RegExpError::kInvalidFlagGroup); - return nullptr; - } - } - break; - } - case '<': - Advance(); - if (Next() == '=') { - Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; - subexpr_type = POSITIVE_LOOKAROUND; - break; - } else if (Next() == '!') { - Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - } - is_named_capture = true; - has_named_captures_ = true; - Advance(); - break; - default: - ReportError(RegExpError::kInvalidGroup); - return nullptr; - } - } - if (subexpr_type == CAPTURE) { - if (captures_started_ >= JSRegExp::kMaxCaptures) { - ReportError(RegExpError::kTooManyCaptures); - return nullptr; - } - captures_started_++; - - if (is_named_capture) { - capture_name = ParseCaptureGroupName(CHECK_FAILED); - } - } - JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off; - // Store current state and begin new disjunction parsing. - return new (zone()) - RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, - capture_name, flags, zone()); -} - -#ifdef DEBUG -// Currently only used in an DCHECK. -static bool IsSpecialClassEscape(uc32 c) { - switch (c) { - case 'd': - case 'D': - case 's': - case 'S': - case 'w': - case 'W': - return true; - default: - return false; - } -} -#endif - - -// In order to know whether an escape is a backreference or not we have to scan -// the entire regexp and find the number of capturing parentheses. However we -// don't want to scan the regexp twice unless it is necessary. This mini-parser -// is called when needed. It can see the difference between capturing and -// noncapturing parentheses and can skip character classes and backslash-escaped -// characters. -void RegExpParser::ScanForCaptures() { - DCHECK(!is_scanned_for_captures_); - const int saved_position = position(); - // Start with captures started previous to current position - int capture_count = captures_started(); - // Add count of captures after this position. - int n; - while ((n = current()) != kEndMarker) { - Advance(); - switch (n) { - case '\\': - Advance(); - break; - case '[': { - int c; - while ((c = current()) != kEndMarker) { - Advance(); - if (c == '\\') { - Advance(); - } else { - if (c == ']') break; - } - } - break; - } - case '(': - if (current() == '?') { - // At this point we could be in - // * a non-capturing group '(:', - // * a lookbehind assertion '(?<=' '(? JSRegExp::kMaxCaptures) { - Reset(start); - return false; - } - Advance(); - } else { - break; - } - } - if (value > captures_started()) { - if (!is_scanned_for_captures_) ScanForCaptures(); - if (value > capture_count_) { - Reset(start); - return false; - } - } - *index_out = value; - return true; -} - -static void push_code_unit(ZoneVector* v, uint32_t code_unit) { - if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { - v->push_back(code_unit); - } else { - v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); - v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); - } -} - -const ZoneVector* RegExpParser::ParseCaptureGroupName() { - ZoneVector* name = - new (zone()->New(sizeof(ZoneVector))) ZoneVector(zone()); - - bool at_start = true; - while (true) { - uc32 c = current(); - Advance(); - - // Convert unicode escapes. - if (c == '\\' && current() == 'u') { - Advance(); - if (!ParseUnicodeEscape(&c)) { - ReportError(RegExpError::kInvalidUnicodeEscape); - return nullptr; - } - } - - // The backslash char is misclassified as both ID_Start and ID_Continue. - if (c == '\\') { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - - if (at_start) { - if (!IsIdentifierStart(c)) { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - push_code_unit(name, c); - at_start = false; - } else { - if (c == '>') { - break; - } else if (IsIdentifierPart(c)) { - push_code_unit(name, c); - } else { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - } - } - - return name; -} - -bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector* name, - int index) { - DCHECK(0 < index && index <= captures_started_); - DCHECK_NOT_NULL(name); - - RegExpCapture* capture = GetCapture(index); - DCHECK_NULL(capture->name()); - - capture->set_name(name); - - if (named_captures_ == nullptr) { - named_captures_ = new (zone_->New(sizeof(*named_captures_))) - ZoneSet(zone()); - } else { - // Check for duplicates and bail if we find any. - - const auto& named_capture_it = named_captures_->find(capture); - if (named_capture_it != named_captures_->end()) { - ReportError(RegExpError::kDuplicateCaptureGroupName); - return false; - } - } - - named_captures_->emplace(capture); - - return true; -} - -bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, - RegExpParserState* state) { - // The parser is assumed to be on the '<' in \k. - if (current() != '<') { - ReportError(RegExpError::kInvalidNamedReference); - return false; - } - - Advance(); - const ZoneVector* name = ParseCaptureGroupName(); - if (name == nullptr) { - return false; - } - - if (state->IsInsideCaptureGroup(name)) { - builder->AddEmpty(); - } else { - RegExpBackReference* atom = - new (zone()) RegExpBackReference(builder->flags()); - atom->set_name(name); - - builder->AddAtom(atom); - - if (named_back_references_ == nullptr) { - named_back_references_ = - new (zone()) ZoneList(1, zone()); - } - named_back_references_->Add(atom, zone()); - } - - return true; -} - -void RegExpParser::PatchNamedBackReferences() { - if (named_back_references_ == nullptr) return; - - if (named_captures_ == nullptr) { - ReportError(RegExpError::kInvalidNamedCaptureReference); - return; - } - - // Look up and patch the actual capture for each named back reference. - - for (int i = 0; i < named_back_references_->length(); i++) { - RegExpBackReference* ref = named_back_references_->at(i); - - // Capture used to search the named_captures_ by name, index of the - // capture is never used. - static const int kInvalidIndex = 0; - RegExpCapture* search_capture = new (zone()) RegExpCapture(kInvalidIndex); - DCHECK_NULL(search_capture->name()); - search_capture->set_name(ref->name()); - - int index = -1; - const auto& capture_it = named_captures_->find(search_capture); - if (capture_it != named_captures_->end()) { - index = (*capture_it)->index(); - } else { - ReportError(RegExpError::kInvalidNamedCaptureReference); - return; - } - - ref->set_capture(GetCapture(index)); - } -} - -RegExpCapture* RegExpParser::GetCapture(int index) { - // The index for the capture groups are one-based. Its index in the list is - // zero-based. - int know_captures = - is_scanned_for_captures_ ? capture_count_ : captures_started_; - DCHECK(index <= know_captures); - if (captures_ == nullptr) { - captures_ = new (zone()) ZoneList(know_captures, zone()); - } - while (captures_->length() < know_captures) { - captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); - } - return captures_->at(index - 1); -} - -namespace { - -struct RegExpCaptureIndexLess { - bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { - DCHECK_NOT_NULL(lhs); - DCHECK_NOT_NULL(rhs); - return lhs->index() < rhs->index(); - } -}; - -} // namespace - -Handle RegExpParser::CreateCaptureNameMap() { - if (named_captures_ == nullptr || named_captures_->empty()) { - return Handle(); - } - - // Named captures are sorted by name (because the set is used to ensure - // name uniqueness). But the capture name map must to be sorted by index. - - ZoneVector sorted_named_captures( - named_captures_->begin(), named_captures_->end(), zone()); - std::sort(sorted_named_captures.begin(), sorted_named_captures.end(), - RegExpCaptureIndexLess{}); - DCHECK_EQ(sorted_named_captures.size(), named_captures_->size()); - - Factory* factory = isolate()->factory(); - - int len = static_cast(sorted_named_captures.size()) * 2; - Handle array = factory->NewFixedArray(len); - - int i = 0; - for (const auto& capture : sorted_named_captures) { - Vector capture_name(capture->name()->data(), - capture->name()->size()); - // CSA code in ConstructNewResultFromMatchInfo requires these strings to be - // internalized so they can be used as property names in the 'exec' results. - Handle name = factory->InternalizeString(capture_name); - array->set(i * 2, *name); - array->set(i * 2 + 1, Smi::FromInt(capture->index())); - - i++; - } - DCHECK_EQ(i * 2, len); - - return array; -} - -bool RegExpParser::HasNamedCaptures() { - if (has_named_captures_ || is_scanned_for_captures_) { - return has_named_captures_; - } - - ScanForCaptures(); - DCHECK(is_scanned_for_captures_); - return has_named_captures_; -} - -bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { - for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { - if (s->group_type() != CAPTURE) continue; - // Return true if we found the matching capture index. - if (index == s->capture_index()) return true; - // Abort if index is larger than what has been parsed up till this state. - if (index > s->capture_index()) return false; - } - return false; -} - -bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( - const ZoneVector* name) { - DCHECK_NOT_NULL(name); - for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { - if (s->capture_name() == nullptr) continue; - if (*s->capture_name() == *name) return true; - } - return false; -} - -// QuantifierPrefix :: -// { DecimalDigits } -// { DecimalDigits , } -// { DecimalDigits , DecimalDigits } -// -// Returns true if parsing succeeds, and set the min_out and max_out -// values. Values are truncated to RegExpTree::kInfinity if they overflow. -bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { - DCHECK_EQ(current(), '{'); - int start = position(); - Advance(); - int min = 0; - if (!IsDecimalDigit(current())) { - Reset(start); - return false; - } - while (IsDecimalDigit(current())) { - int next = current() - '0'; - if (min > (RegExpTree::kInfinity - next) / 10) { - // Overflow. Skip past remaining decimal digits and return -1. - do { - Advance(); - } while (IsDecimalDigit(current())); - min = RegExpTree::kInfinity; - break; - } - min = 10 * min + next; - Advance(); - } - int max = 0; - if (current() == '}') { - max = min; - Advance(); - } else if (current() == ',') { - Advance(); - if (current() == '}') { - max = RegExpTree::kInfinity; - Advance(); - } else { - while (IsDecimalDigit(current())) { - int next = current() - '0'; - if (max > (RegExpTree::kInfinity - next) / 10) { - do { - Advance(); - } while (IsDecimalDigit(current())); - max = RegExpTree::kInfinity; - break; - } - max = 10 * max + next; - Advance(); - } - if (current() != '}') { - Reset(start); - return false; - } - Advance(); - } - } else { - Reset(start); - return false; - } - *min_out = min; - *max_out = max; - return true; -} - - -uc32 RegExpParser::ParseOctalLiteral() { - DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); - // For compatibility with some other browsers (not all), we parse - // up to three octal digits with a value below 256. - // ES#prod-annexB-LegacyOctalEscapeSequence - uc32 value = current() - '0'; - Advance(); - if ('0' <= current() && current() <= '7') { - value = value * 8 + current() - '0'; - Advance(); - if (value < 32 && '0' <= current() && current() <= '7') { - value = value * 8 + current() - '0'; - Advance(); - } - } - return value; -} - - -bool RegExpParser::ParseHexEscape(int length, uc32* value) { - int start = position(); - uc32 val = 0; - for (int i = 0; i < length; ++i) { - uc32 c = current(); - int d = HexValue(c); - if (d < 0) { - Reset(start); - return false; - } - val = val * 16 + d; - Advance(); - } - *value = val; - return true; -} - -// This parses RegExpUnicodeEscapeSequence as described in ECMA262. -bool RegExpParser::ParseUnicodeEscape(uc32* value) { - // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are - // allowed). In the latter case, the number of hex digits between { } is - // arbitrary. \ and u have already been read. - if (current() == '{' && unicode()) { - int start = position(); - Advance(); - if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { - if (current() == '}') { - Advance(); - return true; - } - } - Reset(start); - return false; - } - // \u but no {, or \u{...} escapes not allowed. - bool result = ParseHexEscape(4, value); - if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && - current() == '\\') { - // Attempt to read trail surrogate. - int start = position(); - if (Next() == 'u') { - Advance(2); - uc32 trail; - if (ParseHexEscape(4, &trail) && - unibrow::Utf16::IsTrailSurrogate(trail)) { - *value = unibrow::Utf16::CombineSurrogatePair(static_cast(*value), - static_cast(trail)); - return true; - } - } - Reset(start); - } - return result; -} - -#ifdef V8_INTL_SUPPORT - -namespace { - -bool IsExactPropertyAlias(const char* property_name, UProperty property) { - const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); - if (short_name != nullptr && strcmp(property_name, short_name) == 0) - return true; - for (int i = 0;; i++) { - const char* long_name = u_getPropertyName( - property, static_cast(U_LONG_PROPERTY_NAME + i)); - if (long_name == nullptr) break; - if (strcmp(property_name, long_name) == 0) return true; - } - return false; -} - -bool IsExactPropertyValueAlias(const char* property_value_name, - UProperty property, int32_t property_value) { - const char* short_name = - u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); - if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { - return true; - } - for (int i = 0;; i++) { - const char* long_name = u_getPropertyValueName( - property, property_value, - static_cast(U_LONG_PROPERTY_NAME + i)); - if (long_name == nullptr) break; - if (strcmp(property_value_name, long_name) == 0) return true; - } - return false; -} - -bool LookupPropertyValueName(UProperty property, - const char* property_value_name, bool negate, - ZoneList* result, Zone* zone) { - UProperty property_for_lookup = property; - if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { - // For the property Script_Extensions, we have to do the property value - // name lookup as if the property is Script. - property_for_lookup = UCHAR_SCRIPT; - } - int32_t property_value = - u_getPropertyValueEnum(property_for_lookup, property_value_name); - if (property_value == UCHAR_INVALID_CODE) return false; - - // We require the property name to match exactly to one of the property value - // aliases. However, u_getPropertyValueEnum uses loose matching. - if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, - property_value)) { - return false; - } - - UErrorCode ec = U_ZERO_ERROR; - icu::UnicodeSet set; - set.applyIntPropertyValue(property, property_value, ec); - bool success = ec == U_ZERO_ERROR && !set.isEmpty(); - - if (success) { - set.removeAllStrings(); - if (negate) set.complement(); - for (int i = 0; i < set.getRangeCount(); i++) { - result->Add( - CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), - zone); - } - } - return success; -} - -template -inline bool NameEquals(const char* name, const char (&literal)[N]) { - return strncmp(name, literal, N + 1) == 0; -} - -bool LookupSpecialPropertyValueName(const char* name, - ZoneList* result, - bool negate, Zone* zone) { - if (NameEquals(name, "Any")) { - if (negate) { - // Leave the list of character ranges empty, since the negation of 'Any' - // is the empty set. - } else { - result->Add(CharacterRange::Everything(), zone); - } - } else if (NameEquals(name, "ASCII")) { - result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) - : CharacterRange::Range(0x0, 0x7F), - zone); - } else if (NameEquals(name, "Assigned")) { - return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", - !negate, result, zone); - } else { - return false; - } - return true; -} - -// Explicitly whitelist supported binary properties. The spec forbids supporting -// properties outside of this set to ensure interoperability. -bool IsSupportedBinaryProperty(UProperty property) { - switch (property) { - case UCHAR_ALPHABETIC: - // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. - // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. - case UCHAR_ASCII_HEX_DIGIT: - // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. - case UCHAR_BIDI_CONTROL: - case UCHAR_BIDI_MIRRORED: - case UCHAR_CASE_IGNORABLE: - case UCHAR_CASED: - case UCHAR_CHANGES_WHEN_CASEFOLDED: - case UCHAR_CHANGES_WHEN_CASEMAPPED: - case UCHAR_CHANGES_WHEN_LOWERCASED: - case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: - case UCHAR_CHANGES_WHEN_TITLECASED: - case UCHAR_CHANGES_WHEN_UPPERCASED: - case UCHAR_DASH: - case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: - case UCHAR_DEPRECATED: - case UCHAR_DIACRITIC: - case UCHAR_EMOJI: - case UCHAR_EMOJI_COMPONENT: - case UCHAR_EMOJI_MODIFIER_BASE: - case UCHAR_EMOJI_MODIFIER: - case UCHAR_EMOJI_PRESENTATION: - case UCHAR_EXTENDED_PICTOGRAPHIC: - case UCHAR_EXTENDER: - case UCHAR_GRAPHEME_BASE: - case UCHAR_GRAPHEME_EXTEND: - case UCHAR_HEX_DIGIT: - case UCHAR_ID_CONTINUE: - case UCHAR_ID_START: - case UCHAR_IDEOGRAPHIC: - case UCHAR_IDS_BINARY_OPERATOR: - case UCHAR_IDS_TRINARY_OPERATOR: - case UCHAR_JOIN_CONTROL: - case UCHAR_LOGICAL_ORDER_EXCEPTION: - case UCHAR_LOWERCASE: - case UCHAR_MATH: - case UCHAR_NONCHARACTER_CODE_POINT: - case UCHAR_PATTERN_SYNTAX: - case UCHAR_PATTERN_WHITE_SPACE: - case UCHAR_QUOTATION_MARK: - case UCHAR_RADICAL: - case UCHAR_REGIONAL_INDICATOR: - case UCHAR_S_TERM: - case UCHAR_SOFT_DOTTED: - case UCHAR_TERMINAL_PUNCTUATION: - case UCHAR_UNIFIED_IDEOGRAPH: - case UCHAR_UPPERCASE: - case UCHAR_VARIATION_SELECTOR: - case UCHAR_WHITE_SPACE: - case UCHAR_XID_CONTINUE: - case UCHAR_XID_START: - return true; - default: - break; - } - return false; -} - -bool IsUnicodePropertyValueCharacter(char c) { - // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ - // - // Note that using this to validate each parsed char is quite conservative. - // A possible alternative solution would be to only ensure the parsed - // property name/value candidate string does not contain '\0' characters and - // let ICU lookups trigger the final failure. - if ('a' <= c && c <= 'z') return true; - if ('A' <= c && c <= 'Z') return true; - if ('0' <= c && c <= '9') return true; - return (c == '_'); -} - -} // anonymous namespace - -bool RegExpParser::ParsePropertyClassName(ZoneVector* name_1, - ZoneVector* name_2) { - DCHECK(name_1->empty()); - DCHECK(name_2->empty()); - // Parse the property class as follows: - // - In \p{name}, 'name' is interpreted - // - either as a general category property value name. - // - or as a binary property name. - // - In \p{name=value}, 'name' is interpreted as an enumerated property name, - // and 'value' is interpreted as one of the available property value names. - // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. - // - Loose matching is not applied. - if (current() == '{') { - // Parse \p{[PropertyName=]PropertyNameValue} - for (Advance(); current() != '}' && current() != '='; Advance()) { - if (!IsUnicodePropertyValueCharacter(current())) return false; - if (!has_next()) return false; - name_1->push_back(static_cast(current())); - } - if (current() == '=') { - for (Advance(); current() != '}'; Advance()) { - if (!IsUnicodePropertyValueCharacter(current())) return false; - if (!has_next()) return false; - name_2->push_back(static_cast(current())); - } - name_2->push_back(0); // null-terminate string. - } - } else { - return false; - } - Advance(); - name_1->push_back(0); // null-terminate string. - - DCHECK(name_1->size() - 1 == std::strlen(name_1->data())); - DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); - return true; -} - -bool RegExpParser::AddPropertyClassRange(ZoneList* add_to, - bool negate, - const ZoneVector& name_1, - const ZoneVector& name_2) { - if (name_2.empty()) { - // First attempt to interpret as general category property value name. - const char* name = name_1.data(); - if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, - add_to, zone())) { - return true; - } - // Interpret "Any", "ASCII", and "Assigned". - if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) { - return true; - } - // Then attempt to interpret as binary property name with value name 'Y'. - UProperty property = u_getPropertyEnum(name); - if (!IsSupportedBinaryProperty(property)) return false; - if (!IsExactPropertyAlias(name, property)) return false; - return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to, - zone()); - } else { - // Both property name and value name are specified. Attempt to interpret - // the property name as enumerated property. - const char* property_name = name_1.data(); - const char* value_name = name_2.data(); - UProperty property = u_getPropertyEnum(property_name); - if (!IsExactPropertyAlias(property_name, property)) return false; - if (property == UCHAR_GENERAL_CATEGORY) { - // We want to allow aggregate value names such as "Letter". - property = UCHAR_GENERAL_CATEGORY_MASK; - } else if (property != UCHAR_SCRIPT && - property != UCHAR_SCRIPT_EXTENSIONS) { - return false; - } - return LookupPropertyValueName(property, value_name, negate, add_to, - zone()); - } -} - -RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector& name_1) { - if (!FLAG_harmony_regexp_sequence) return nullptr; - const char* name = name_1.data(); - const uc32* sequence_list = nullptr; - JSRegExp::Flags flags = JSRegExp::kUnicode; - if (NameEquals(name, "Emoji_Flag_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiFlagSequences; - } else if (NameEquals(name, "Emoji_Tag_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiTagSequences; - } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiZWJSequences; - } - if (sequence_list != nullptr) { - // TODO(yangguo): this creates huge regexp code. Alternative to this is - // to create a new operator that checks for these sequences at runtime. - RegExpBuilder builder(zone(), flags); - while (true) { // Iterate through list of sequences. - while (*sequence_list != 0) { // Iterate through sequence. - builder.AddUnicodeCharacter(*sequence_list); - sequence_list++; - } - sequence_list++; - if (*sequence_list == 0) break; - builder.NewAlternative(); - } - return builder.ToRegExp(); - } - - if (NameEquals(name, "Emoji_Keycap_Sequence")) { - // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence - // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3} - RegExpBuilder builder(zone(), flags); - ZoneList* prefix_ranges = - new (zone()) ZoneList(2, zone()); - prefix_ranges->Add(CharacterRange::Range('0', '9'), zone()); - prefix_ranges->Add(CharacterRange::Singleton('#'), zone()); - prefix_ranges->Add(CharacterRange::Singleton('*'), zone()); - builder.AddCharacterClass( - new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags)); - builder.AddCharacter(0xFE0F); - builder.AddCharacter(0x20E3); - return builder.ToRegExp(); - } else if (NameEquals(name, "Emoji_Modifier_Sequence")) { - // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence - // emoji_modifier_sequence := emoji_modifier_base emoji_modifier - RegExpBuilder builder(zone(), flags); - ZoneList* modifier_base_ranges = - new (zone()) ZoneList(2, zone()); - LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false, - modifier_base_ranges, zone()); - builder.AddCharacterClass( - new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags)); - ZoneList* modifier_ranges = - new (zone()) ZoneList(2, zone()); - LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges, - zone()); - builder.AddCharacterClass( - new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags)); - return builder.ToRegExp(); - } - - return nullptr; -} - -#else // V8_INTL_SUPPORT - -bool RegExpParser::ParsePropertyClassName(ZoneVector* name_1, - ZoneVector* name_2) { - return false; -} - -bool RegExpParser::AddPropertyClassRange(ZoneList* add_to, - bool negate, - const ZoneVector& name_1, - const ZoneVector& name_2) { - return false; -} - -RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector& name) { - return nullptr; -} - -#endif // V8_INTL_SUPPORT - -bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { - uc32 x = 0; - int d = HexValue(current()); - if (d < 0) { - return false; - } - while (d >= 0) { - x = x * 16 + d; - if (x > max_value) { - return false; - } - Advance(); - d = HexValue(current()); - } - *value = x; - return true; -} - - -uc32 RegExpParser::ParseClassCharacterEscape() { - DCHECK_EQ('\\', current()); - DCHECK(has_next() && !IsSpecialClassEscape(Next())); - Advance(); - switch (current()) { - case 'b': - Advance(); - return '\b'; - // ControlEscape :: one of - // f n r t v - case 'f': - Advance(); - return '\f'; - case 'n': - Advance(); - return '\n'; - case 'r': - Advance(); - return '\r'; - case 't': - Advance(); - return '\t'; - case 'v': - Advance(); - return '\v'; - case 'c': { - uc32 controlLetter = Next(); - uc32 letter = controlLetter & ~('A' ^ 'a'); - // Inside a character class, we also accept digits and underscore as - // control characters, unless with /u. See Annex B: - // ES#prod-annexB-ClassControlLetter - if (letter >= 'A' && letter <= 'Z') { - Advance(2); - // Control letters mapped to ASCII control characters in the range - // 0x00-0x1F. - return controlLetter & 0x1F; - } - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidClassEscape); - return 0; - } - if ((controlLetter >= '0' && controlLetter <= '9') || - controlLetter == '_') { - Advance(2); - return controlLetter & 0x1F; - } - // We match JSC in reading the backslash as a literal - // character instead of as starting an escape. - // TODO(v8:6201): Not yet covered by the spec. - return '\\'; - } - case '0': - // With /u, \0 is interpreted as NUL if not followed by another digit. - if (unicode() && !(Next() >= '0' && Next() <= '9')) { - Advance(); - return 0; - } - V8_FALLTHROUGH; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - // For compatibility, we interpret a decimal escape that isn't - // a back reference (and therefore either \0 or not valid according - // to the specification) as a 1..3 digit octal character code. - // ES#prod-annexB-LegacyOctalEscapeSequence - if (unicode()) { - // With /u, decimal escape is not interpreted as octal character code. - ReportError(RegExpError::kInvalidClassEscape); - return 0; - } - return ParseOctalLiteral(); - case 'x': { - Advance(); - uc32 value; - if (ParseHexEscape(2, &value)) return value; - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidEscape); - return 0; - } - // If \x is not followed by a two-digit hexadecimal, treat it - // as an identity escape. - return 'x'; - } - case 'u': { - Advance(); - uc32 value; - if (ParseUnicodeEscape(&value)) return value; - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidUnicodeEscape); - return 0; - } - // If \u is not followed by a two-digit hexadecimal, treat it - // as an identity escape. - return 'u'; - } - default: { - uc32 result = current(); - // With /u, no identity escapes except for syntax characters and '-' are - // allowed. Otherwise, all identity escapes are allowed. - if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { - Advance(); - return result; - } - ReportError(RegExpError::kInvalidEscape); - return 0; - } - } - UNREACHABLE(); -} - -void RegExpParser::ParseClassEscape(ZoneList* ranges, - Zone* zone, - bool add_unicode_case_equivalents, - uc32* char_out, bool* is_class_escape) { - uc32 current_char = current(); - if (current_char == '\\') { - switch (Next()) { - case 'w': - case 'W': - case 'd': - case 'D': - case 's': - case 'S': { - CharacterRange::AddClassEscape(static_cast(Next()), ranges, - add_unicode_case_equivalents, zone); - Advance(2); - *is_class_escape = true; - return; - } - case kEndMarker: - ReportError(RegExpError::kEscapeAtEndOfPattern); - return; - case 'p': - case 'P': - if (unicode()) { - bool negate = Next() == 'P'; - Advance(2); - ZoneVector name_1(zone); - ZoneVector name_2(zone); - if (!ParsePropertyClassName(&name_1, &name_2) || - !AddPropertyClassRange(ranges, negate, name_1, name_2)) { - ReportError(RegExpError::kInvalidClassPropertyName); - } - *is_class_escape = true; - return; - } - break; - default: - break; - } - *char_out = ParseClassCharacterEscape(); - *is_class_escape = false; - } else { - Advance(); - *char_out = current_char; - *is_class_escape = false; - } -} - -RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { - DCHECK_EQ(current(), '['); - Advance(); - bool is_negated = false; - if (current() == '^') { - is_negated = true; - Advance(); - } - ZoneList* ranges = - new (zone()) ZoneList(2, zone()); - bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); - while (has_more() && current() != ']') { - uc32 char_1, char_2; - bool is_class_1, is_class_2; - ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, - &is_class_1 CHECK_FAILED); - if (current() == '-') { - Advance(); - if (current() == kEndMarker) { - // If we reach the end we break out of the loop and let the - // following code report an error. - break; - } else if (current() == ']') { - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); - ranges->Add(CharacterRange::Singleton('-'), zone()); - break; - } - ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, - &is_class_2 CHECK_FAILED); - if (is_class_1 || is_class_2) { - // Either end is an escaped character class. Treat the '-' verbatim. - if (unicode()) { - // ES2015 21.2.2.15.1 step 1. - return ReportError(RegExpError::kInvalidCharacterClass); - } - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); - ranges->Add(CharacterRange::Singleton('-'), zone()); - if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); - continue; - } - // ES2015 21.2.2.15.1 step 6. - if (char_1 > char_2) { - return ReportError(RegExpError::kOutOfOrderCharacterClass); - } - ranges->Add(CharacterRange::Range(char_1, char_2), zone()); - } else { - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); - } - } - if (!has_more()) { - return ReportError(RegExpError::kUnterminatedCharacterClass); - } - Advance(); - RegExpCharacterClass::CharacterClassFlags character_class_flags; - if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; - return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(), - character_class_flags); -} - - -#undef CHECK_FAILED - - -bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, - FlatStringReader* input, JSRegExp::Flags flags, - RegExpCompileData* result) { - DCHECK(result != nullptr); - RegExpParser parser(input, flags, isolate, zone); - RegExpTree* tree = parser.ParsePattern(); - if (parser.failed()) { - DCHECK(tree == nullptr); - DCHECK(parser.error_ != RegExpError::kNone); - result->error = parser.error_; - result->error_pos = parser.error_pos_; - } else { - DCHECK(tree != nullptr); - DCHECK(parser.error_ == RegExpError::kNone); - if (FLAG_trace_regexp_parser) { - StdoutStream os; - tree->Print(os, zone); - os << "\n"; - } - result->tree = tree; - int capture_count = parser.captures_started(); - result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; - result->contains_anchor = parser.contains_anchor(); - result->capture_name_map = parser.CreateCaptureNameMap(); - result->capture_count = capture_count; - } - return !parser.failed(); -} - -RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) - : zone_(zone), - pending_empty_(false), - flags_(flags), - characters_(nullptr), - pending_surrogate_(kNoPendingSurrogate), - terms_(), - alternatives_() -#ifdef DEBUG - , - last_added_(ADD_NONE) -#endif -{ -} - - -void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { - DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); - FlushPendingSurrogate(); - // Hold onto the lead surrogate, waiting for a trail surrogate to follow. - pending_surrogate_ = lead_surrogate; -} - - -void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { - DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); - if (pending_surrogate_ != kNoPendingSurrogate) { - uc16 lead_surrogate = pending_surrogate_; - pending_surrogate_ = kNoPendingSurrogate; - DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); - uc32 combined = - unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); - if (NeedsDesugaringForIgnoreCase(combined)) { - AddCharacterClassForDesugaring(combined); - } else { - ZoneList surrogate_pair(2, zone()); - surrogate_pair.Add(lead_surrogate, zone()); - surrogate_pair.Add(trail_surrogate, zone()); - RegExpAtom* atom = - new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_); - AddAtom(atom); - } - } else { - pending_surrogate_ = trail_surrogate; - FlushPendingSurrogate(); - } -} - - -void RegExpBuilder::FlushPendingSurrogate() { - if (pending_surrogate_ != kNoPendingSurrogate) { - DCHECK(unicode()); - uc32 c = pending_surrogate_; - pending_surrogate_ = kNoPendingSurrogate; - AddCharacterClassForDesugaring(c); - } -} - - -void RegExpBuilder::FlushCharacters() { - FlushPendingSurrogate(); - pending_empty_ = false; - if (characters_ != nullptr) { - RegExpTree* atom = - new (zone()) RegExpAtom(characters_->ToConstVector(), flags_); - characters_ = nullptr; - text_.Add(atom, zone()); - LAST(ADD_ATOM); - } -} - - -void RegExpBuilder::FlushText() { - FlushCharacters(); - int num_text = text_.length(); - if (num_text == 0) { - return; - } else if (num_text == 1) { - terms_.Add(text_.last(), zone()); - } else { - RegExpText* text = new (zone()) RegExpText(zone()); - for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); - terms_.Add(text, zone()); - } - text_.Clear(); -} - - -void RegExpBuilder::AddCharacter(uc16 c) { - FlushPendingSurrogate(); - pending_empty_ = false; - if (NeedsDesugaringForIgnoreCase(c)) { - AddCharacterClassForDesugaring(c); - } else { - if (characters_ == nullptr) { - characters_ = new (zone()) ZoneList(4, zone()); - } - characters_->Add(c, zone()); - LAST(ADD_CHAR); - } -} - - -void RegExpBuilder::AddUnicodeCharacter(uc32 c) { - if (c > static_cast(unibrow::Utf16::kMaxNonSurrogateCharCode)) { - DCHECK(unicode()); - AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); - AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); - } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { - AddLeadSurrogate(c); - } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { - AddTrailSurrogate(c); - } else { - AddCharacter(static_cast(c)); - } -} - -void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { - // A lead or trail surrogate parsed via escape sequence will not - // pair up with any preceding lead or following trail surrogate. - FlushPendingSurrogate(); - AddUnicodeCharacter(character); - FlushPendingSurrogate(); -} - -void RegExpBuilder::AddEmpty() { pending_empty_ = true; } - - -void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { - if (NeedsDesugaringForUnicode(cc)) { - // With /u, character class needs to be desugared, so it - // must be a standalone term instead of being part of a RegExpText. - AddTerm(cc); - } else { - AddAtom(cc); - } -} - -void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { - AddTerm(new (zone()) RegExpCharacterClass( - zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)), - flags_)); -} - - -void RegExpBuilder::AddAtom(RegExpTree* term) { - if (term->IsEmpty()) { - AddEmpty(); - return; - } - if (term->IsTextElement()) { - FlushCharacters(); - text_.Add(term, zone()); - } else { - FlushText(); - terms_.Add(term, zone()); - } - LAST(ADD_ATOM); -} - - -void RegExpBuilder::AddTerm(RegExpTree* term) { - FlushText(); - terms_.Add(term, zone()); - LAST(ADD_ATOM); -} - - -void RegExpBuilder::AddAssertion(RegExpTree* assert) { - FlushText(); - terms_.Add(assert, zone()); - LAST(ADD_ASSERT); -} - - -void RegExpBuilder::NewAlternative() { FlushTerms(); } - - -void RegExpBuilder::FlushTerms() { - FlushText(); - int num_terms = terms_.length(); - RegExpTree* alternative; - if (num_terms == 0) { - alternative = new (zone()) RegExpEmpty(); - } else if (num_terms == 1) { - alternative = terms_.last(); - } else { - alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); - } - alternatives_.Add(alternative, zone()); - terms_.Clear(); - LAST(ADD_NONE); -} - - -bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { - if (!unicode()) return false; - // TODO(yangguo): we could be smarter than this. Case-insensitivity does not - // necessarily mean that we need to desugar. It's probably nicer to have a - // separate pass to figure out unicode desugarings. - if (ignore_case()) return true; - ZoneList* ranges = cc->ranges(zone()); - CharacterRange::Canonicalize(ranges); - for (int i = ranges->length() - 1; i >= 0; i--) { - uc32 from = ranges->at(i).from(); - uc32 to = ranges->at(i).to(); - // Check for non-BMP characters. - if (to >= kNonBmpStart) return true; - // Check for lone surrogates. - if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; - } - return false; -} - - -bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { -#ifdef V8_INTL_SUPPORT - if (unicode() && ignore_case()) { - icu::UnicodeSet set(c, c); - set.closeOver(USET_CASE_INSENSITIVE); - set.removeAllStrings(); - return set.size() > 1; - } - // In the case where ICU is not included, we act as if the unicode flag is - // not set, and do not desugar. -#endif // V8_INTL_SUPPORT - return false; -} - - -RegExpTree* RegExpBuilder::ToRegExp() { - FlushTerms(); - int num_alternatives = alternatives_.length(); - if (num_alternatives == 0) return new (zone()) RegExpEmpty(); - if (num_alternatives == 1) return alternatives_.last(); - return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); -} - -bool RegExpBuilder::AddQuantifierToAtom( - int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { - FlushPendingSurrogate(); - if (pending_empty_) { - pending_empty_ = false; - return true; - } - RegExpTree* atom; - if (characters_ != nullptr) { - DCHECK(last_added_ == ADD_CHAR); - // Last atom was character. - Vector char_vector = characters_->ToConstVector(); - int num_chars = char_vector.length(); - if (num_chars > 1) { - Vector prefix = char_vector.SubVector(0, num_chars - 1); - text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone()); - char_vector = char_vector.SubVector(num_chars - 1, num_chars); - } - characters_ = nullptr; - atom = new (zone()) RegExpAtom(char_vector, flags_); - FlushText(); - } else if (text_.length() > 0) { - DCHECK(last_added_ == ADD_ATOM); - atom = text_.RemoveLast(); - FlushText(); - } else if (terms_.length() > 0) { - DCHECK(last_added_ == ADD_ATOM); - atom = terms_.RemoveLast(); - if (atom->IsLookaround()) { - // With /u, lookarounds are not quantifiable. - if (unicode()) return false; - // Lookbehinds are not quantifiable. - if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { - return false; - } - } - if (atom->max_match() == 0) { - // Guaranteed to only match an empty string. - LAST(ADD_TERM); - if (min == 0) { - return true; - } - terms_.Add(atom, zone()); - return true; - } - } else { - // Only call immediately after adding an atom or character! - UNREACHABLE(); - } - terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), - zone()); - LAST(ADD_TERM); - return true; -} - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-parser.h b/js/src/new-regexp/regexp-parser.h deleted file mode 100644 index 1b2a9fe18..000000000 --- a/js/src/new-regexp/regexp-parser.h +++ /dev/null @@ -1,363 +0,0 @@ -// Copyright 2016 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_PARSER_H_ -#define V8_REGEXP_REGEXP_PARSER_H_ - -#include "new-regexp/regexp-ast.h" -#include "new-regexp/regexp-error.h" - -namespace v8 { -namespace internal { - -struct RegExpCompileData; - -// A BufferedZoneList is an automatically growing list, just like (and backed -// by) a ZoneList, that is optimized for the case of adding and removing -// a single element. The last element added is stored outside the backing list, -// and if no more than one element is ever added, the ZoneList isn't even -// allocated. -// Elements must not be nullptr pointers. -template -class BufferedZoneList { - public: - BufferedZoneList() : list_(nullptr), last_(nullptr) {} - - // Adds element at end of list. This element is buffered and can - // be read using last() or removed using RemoveLast until a new Add or until - // RemoveLast or GetList has been called. - void Add(T* value, Zone* zone) { - if (last_ != nullptr) { - if (list_ == nullptr) { - list_ = new (zone) ZoneList(initial_size, zone); - } - list_->Add(last_, zone); - } - last_ = value; - } - - T* last() { - DCHECK(last_ != nullptr); - return last_; - } - - T* RemoveLast() { - DCHECK(last_ != nullptr); - T* result = last_; - if ((list_ != nullptr) && (list_->length() > 0)) - last_ = list_->RemoveLast(); - else - last_ = nullptr; - return result; - } - - T* Get(int i) { - DCHECK((0 <= i) && (i < length())); - if (list_ == nullptr) { - DCHECK_EQ(0, i); - return last_; - } else { - if (i == list_->length()) { - DCHECK(last_ != nullptr); - return last_; - } else { - return list_->at(i); - } - } - } - - void Clear() { - list_ = nullptr; - last_ = nullptr; - } - - int length() { - int length = (list_ == nullptr) ? 0 : list_->length(); - return length + ((last_ == nullptr) ? 0 : 1); - } - - ZoneList* GetList(Zone* zone) { - if (list_ == nullptr) { - list_ = new (zone) ZoneList(initial_size, zone); - } - if (last_ != nullptr) { - list_->Add(last_, zone); - last_ = nullptr; - } - return list_; - } - - private: - ZoneList* list_; - T* last_; -}; - - -// Accumulates RegExp atoms and assertions into lists of terms and alternatives. -class RegExpBuilder : public ZoneObject { - public: - RegExpBuilder(Zone* zone, JSRegExp::Flags flags); - void AddCharacter(uc16 character); - void AddUnicodeCharacter(uc32 character); - void AddEscapedUnicodeCharacter(uc32 character); - // "Adds" an empty expression. Does nothing except consume a - // following quantifier - void AddEmpty(); - void AddCharacterClass(RegExpCharacterClass* cc); - void AddCharacterClassForDesugaring(uc32 c); - void AddAtom(RegExpTree* tree); - void AddTerm(RegExpTree* tree); - void AddAssertion(RegExpTree* tree); - void NewAlternative(); // '|' - bool AddQuantifierToAtom(int min, int max, - RegExpQuantifier::QuantifierType type); - void FlushText(); - RegExpTree* ToRegExp(); - JSRegExp::Flags flags() const { return flags_; } - void set_flags(JSRegExp::Flags flags) { flags_ = flags; } - - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } - bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } - - private: - static const uc16 kNoPendingSurrogate = 0; - void AddLeadSurrogate(uc16 lead_surrogate); - void AddTrailSurrogate(uc16 trail_surrogate); - void FlushPendingSurrogate(); - void FlushCharacters(); - void FlushTerms(); - bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); - bool NeedsDesugaringForIgnoreCase(uc32 c); - Zone* zone() const { return zone_; } - bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } - - Zone* zone_; - bool pending_empty_; - JSRegExp::Flags flags_; - ZoneList* characters_; - uc16 pending_surrogate_; - BufferedZoneList terms_; - BufferedZoneList text_; - BufferedZoneList alternatives_; -#ifdef DEBUG - enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; -#define LAST(x) last_added_ = x; -#else -#define LAST(x) -#endif -}; - -class V8_EXPORT_PRIVATE RegExpParser { - public: - RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate, - Zone* zone); - - static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, - JSRegExp::Flags flags, RegExpCompileData* result); - - RegExpTree* ParsePattern(); - RegExpTree* ParseDisjunction(); - RegExpTree* ParseGroup(); - - // Parses a {...,...} quantifier and stores the range in the given - // out parameters. - bool ParseIntervalQuantifier(int* min_out, int* max_out); - - // Parses and returns a single escaped character. The character - // must not be 'b' or 'B' since they are usually handle specially. - uc32 ParseClassCharacterEscape(); - - // Checks whether the following is a length-digit hexadecimal number, - // and sets the value if it is. - bool ParseHexEscape(int length, uc32* value); - bool ParseUnicodeEscape(uc32* value); - bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); - - bool ParsePropertyClassName(ZoneVector* name_1, - ZoneVector* name_2); - bool AddPropertyClassRange(ZoneList* add_to, bool negate, - const ZoneVector& name_1, - const ZoneVector& name_2); - - RegExpTree* GetPropertySequence(const ZoneVector& name_1); - RegExpTree* ParseCharacterClass(const RegExpBuilder* state); - - uc32 ParseOctalLiteral(); - - // Tries to parse the input as a back reference. If successful it - // stores the result in the output parameter and returns true. If - // it fails it will push back the characters read so the same characters - // can be reparsed. - bool ParseBackReferenceIndex(int* index_out); - - // Parse inside a class. Either add escaped class to the range, or return - // false and pass parsed single character through |char_out|. - void ParseClassEscape(ZoneList* ranges, Zone* zone, - bool add_unicode_case_equivalents, uc32* char_out, - bool* is_class_escape); - - char ParseClassEscape(); - - RegExpTree* ReportError(RegExpError error); - void Advance(); - void Advance(int dist); - void Reset(int pos); - - // Reports whether the pattern might be used as a literal search string. - // Only use if the result of the parse is a single atom node. - bool simple(); - bool contains_anchor() { return contains_anchor_; } - void set_contains_anchor() { contains_anchor_ = true; } - int captures_started() { return captures_started_; } - int position() { return next_pos_ - 1; } - bool failed() { return failed_; } - // The Unicode flag can't be changed using in-regexp syntax, so it's OK to - // just read the initial flag value here. - bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } - - static bool IsSyntaxCharacterOrSlash(uc32 c); - - static const uc32 kEndMarker = (1 << 21); - - private: - enum SubexpressionType { - INITIAL, - CAPTURE, // All positive values represent captures. - POSITIVE_LOOKAROUND, - NEGATIVE_LOOKAROUND, - GROUPING - }; - - class RegExpParserState : public ZoneObject { - public: - // Push a state on the stack. - RegExpParserState(RegExpParserState* previous_state, - SubexpressionType group_type, - RegExpLookaround::Type lookaround_type, - int disjunction_capture_index, - const ZoneVector* capture_name, - JSRegExp::Flags flags, Zone* zone) - : previous_state_(previous_state), - builder_(new (zone) RegExpBuilder(zone, flags)), - group_type_(group_type), - lookaround_type_(lookaround_type), - disjunction_capture_index_(disjunction_capture_index), - capture_name_(capture_name) {} - // Parser state of containing expression, if any. - RegExpParserState* previous_state() const { return previous_state_; } - bool IsSubexpression() { return previous_state_ != nullptr; } - // RegExpBuilder building this regexp's AST. - RegExpBuilder* builder() const { return builder_; } - // Type of regexp being parsed (parenthesized group or entire regexp). - SubexpressionType group_type() const { return group_type_; } - // Lookahead or Lookbehind. - RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } - // Index in captures array of first capture in this sub-expression, if any. - // Also the capture index of this sub-expression itself, if group_type - // is CAPTURE. - int capture_index() const { return disjunction_capture_index_; } - // The name of the current sub-expression, if group_type is CAPTURE. Only - // used for named captures. - const ZoneVector* capture_name() const { return capture_name_; } - - bool IsNamedCapture() const { return capture_name_ != nullptr; } - - // Check whether the parser is inside a capture group with the given index. - bool IsInsideCaptureGroup(int index); - // Check whether the parser is inside a capture group with the given name. - bool IsInsideCaptureGroup(const ZoneVector* name); - - private: - // Linked list implementation of stack of states. - RegExpParserState* const previous_state_; - // Builder for the stored disjunction. - RegExpBuilder* const builder_; - // Stored disjunction type (capture, look-ahead or grouping), if any. - const SubexpressionType group_type_; - // Stored read direction. - const RegExpLookaround::Type lookaround_type_; - // Stored disjunction's capture index (if any). - const int disjunction_capture_index_; - // Stored capture name (if any). - const ZoneVector* const capture_name_; - }; - - // Return the 1-indexed RegExpCapture object, allocate if necessary. - RegExpCapture* GetCapture(int index); - - // Creates a new named capture at the specified index. Must be called exactly - // once for each named capture. Fails if a capture with the same name is - // encountered. - bool CreateNamedCaptureAtIndex(const ZoneVector* name, int index); - - // Parses the name of a capture group (?pattern). The name must adhere - // to IdentifierName in the ECMAScript standard. - const ZoneVector* ParseCaptureGroupName(); - - bool ParseNamedBackReference(RegExpBuilder* builder, - RegExpParserState* state); - RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); - - // After the initial parsing pass, patch corresponding RegExpCapture objects - // into all RegExpBackReferences. This is done after initial parsing in order - // to avoid complicating cases in which references comes before the capture. - void PatchNamedBackReferences(); - - Handle CreateCaptureNameMap(); - - // Returns true iff the pattern contains named captures. May call - // ScanForCaptures to look ahead at the remaining pattern. - bool HasNamedCaptures(); - - Isolate* isolate() { return isolate_; } - Zone* zone() const { return zone_; } - - uc32 current() { return current_; } - bool has_more() { return has_more_; } - bool has_next() { return next_pos_ < in()->length(); } - uc32 Next(); - template - uc32 ReadNext(); - FlatStringReader* in() { return in_; } - void ScanForCaptures(); - - struct RegExpCaptureNameLess { - bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { - DCHECK_NOT_NULL(lhs); - DCHECK_NOT_NULL(rhs); - ZoneVector lhname = *lhs->name(); - ZoneVector rhname = *rhs->name(); - return lhname < rhname; - } - }; - - Isolate* isolate_; - Zone* zone_; - RegExpError error_ = RegExpError::kNone; - int error_pos_ = 0; - ZoneList* captures_; - ZoneSet* named_captures_; - ZoneList* named_back_references_; - FlatStringReader* in_; - uc32 current_; - // These are the flags specified outside the regexp syntax ie after the - // terminating '/' or in the second argument to the constructor. The current - // flags are stored on the RegExpBuilder. - JSRegExp::Flags top_level_flags_; - int next_pos_; - int captures_started_; - int capture_count_; // Only valid after we have scanned for captures. - bool has_more_; - bool simple_; - bool contains_anchor_; - bool is_scanned_for_captures_; - bool has_named_captures_; // Only valid after we have scanned for captures. - bool failed_; -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_PARSER_H_ diff --git a/js/src/new-regexp/regexp-shim.cc b/js/src/new-regexp/regexp-shim.cc deleted file mode 100644 index 51a9e2d83..000000000 --- a/js/src/new-regexp/regexp-shim.cc +++ /dev/null @@ -1,212 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim: set ts=8 sts=2 et sw=2 tw=80: - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include - -#include "new-regexp/regexp-shim.h" -#include "new-regexp/regexp-stack.h" - -#include "mozilla/Sprintf.h" // for SprintfLiteral - -namespace v8 { -namespace internal { - -void PrintF(const char* format, ...) { - va_list arguments; - va_start(arguments, format); - vprintf(format, arguments); - va_end(arguments); -} - -void PrintF(FILE* out, const char* format, ...) { - va_list arguments; - va_start(arguments, format); - vfprintf(out, format, arguments); - va_end(arguments); -} - -StdoutStream::operator std::ostream&() const { return std::cerr; } - -template -std::ostream& StdoutStream::operator<<(T t) { return std::cerr << t; } - -template std::ostream& StdoutStream::operator<<(char const* c); - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/ostreams.cc#L120-L169 -// (This is a hand-simplified version.) -// Writes the given character to the output escaping everything outside -// of printable ASCII range. -std::ostream& operator<<(std::ostream& os, const AsUC16& c) { - uc16 v = c.value; - bool isPrint = 0x20 < v && v <= 0x7e; - char buf[10]; - const char* format = isPrint ? "%c" : (v <= 0xFF) ? "\\x%02x" : "\\u%04x"; - SprintfLiteral(buf, format, v); - return os << buf; -} -std::ostream& operator<<(std::ostream& os, const AsUC32& c) { - int32_t v = c.value; - if (v <= String::kMaxUtf16CodeUnit) { - return os << AsUC16(v); - } - char buf[13]; - SprintfLiteral(buf, "\\u{%06x}", v); - return os << buf; -} - -HandleScope::HandleScope(Isolate* isolate) - : isolate_(isolate) { - isolate->openHandleScope(*this); -} - -HandleScope::~HandleScope() { - isolate_->closeHandleScope(level_, non_gc_level_); -} - -template -Handle::Handle(T object, Isolate* isolate) - : location_(isolate->getHandleLocation(JS::Value(object))) {} - -template Handle::Handle(ByteArray b, Isolate* isolate); -template Handle::Handle(JS::Value v, Isolate* isolate); -template Handle::Handle(JSRegExp re, Isolate* isolate); -template Handle::Handle(String s, Isolate* isolate); - -template -Handle::Handle(JS::Value value, Isolate* isolate) - : location_(isolate->getHandleLocation(value)) { - T::cast(Object(value)); // Assert that value has the correct type. -} - -JS::Value* Isolate::getHandleLocation(JS::Value value) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - if (!handleArena_.Append(value)) { - oomUnsafe.crash("Irregexp handle allocation"); - } - return &handleArena_.GetLast(); -} - -void* Isolate::allocatePseudoHandle(size_t bytes) { - PseudoHandle ptr; - ptr.reset(js_malloc(bytes)); - if (!ptr) { - return nullptr; - } - if (!uniquePtrArena_.Append(std::move(ptr))) { - return nullptr; - } - return uniquePtrArena_.GetLast().get(); -} - -template -PseudoHandle Isolate::takeOwnership(void* ptr) { - for (auto iter = uniquePtrArena_.IterFromLast(); !iter.Done(); iter.Prev()) { - auto& entry = iter.Get(); - if (entry.get() == ptr) { - PseudoHandle result; - result.reset(static_cast(entry.release())); - return result; - } - } - MOZ_CRASH("Tried to take ownership of pseudohandle that is not in the arena"); -} - -PseudoHandle ByteArray::takeOwnership(Isolate* isolate) { - PseudoHandle result = - isolate->takeOwnership(value_.toPrivate()); - value_ = JS::PrivateValue(nullptr); - return result; -} - -void Isolate::trace(JSTracer* trc) { - for (auto iter = handleArena_.Iter(); !iter.Done(); iter.Next()) { - auto& elem = iter.Get(); - JS::GCPolicy::trace(trc, &elem, "Isolate handle arena"); - } -} - -/*static*/ Handle String::Flatten(Isolate* isolate, - Handle string) { - if (string->IsFlat()) { - return string; - } - js::AutoEnterOOMUnsafeRegion oomUnsafe; - JSLinearString* linear = string->str()->ensureLinear(isolate->cx()); - if (!linear) { - oomUnsafe.crash("Irregexp String::Flatten"); - } - return Handle(JS::StringValue(linear), isolate); -} - -// This is only used for trace messages printing the source of a -// regular expression. To keep things simple, we just return an -// empty string and don't print anything. -std::unique_ptr String::ToCString() { - return std::unique_ptr(); -} - -byte* Isolate::top_of_regexp_stack() const { - return reinterpret_cast(regexpStack_->memory_top_address_address()); -} - -Handle Isolate::NewByteArray(int length, AllocationType alloc) { - MOZ_RELEASE_ASSERT(length >= 0); - - js::AutoEnterOOMUnsafeRegion oomUnsafe; - - size_t alloc_size = sizeof(uint32_t) + length; - ByteArrayData* data = - static_cast(allocatePseudoHandle(alloc_size)); - if (!data) { - oomUnsafe.crash("Irregexp NewByteArray"); - } - data->length = length; - - return Handle(JS::PrivateValue(data), this); -} - -Handle Isolate::NewFixedArray(int length) { - MOZ_RELEASE_ASSERT(length >= 0); - MOZ_CRASH("TODO"); -} - -template -Handle Isolate::InternalizeString(const Vector& str) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - JSAtom* atom = js::AtomizeChars(cx(), str.begin(), str.length()); - if (!atom) { - oomUnsafe.crash("Irregexp InternalizeString"); - } - return Handle(JS::StringValue(atom), this); -} - -template Handle -Isolate::InternalizeString(const Vector& str); -template Handle -Isolate::InternalizeString(const Vector& str); - -// TODO: Map flags to jitoptions -bool FLAG_correctness_fuzzer_suppressions = false; -bool FLAG_enable_regexp_unaligned_accesses = false; -bool FLAG_harmony_regexp_sequence = false; -bool FLAG_regexp_interpret_all = false; -bool FLAG_regexp_mode_modifiers = false; -bool FLAG_regexp_optimization = true; -bool FLAG_regexp_peephole_optimization = true; -bool FLAG_regexp_possessive_quantifier = false; -bool FLAG_regexp_tier_up = false; -bool FLAG_trace_regexp_assembler = false; -bool FLAG_trace_regexp_bytecodes = false; -bool FLAG_trace_regexp_parser = false; -bool FLAG_trace_regexp_peephole_optimization = false; - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-shim.h b/js/src/new-regexp/regexp-shim.h deleted file mode 100644 index c49c25ff1..000000000 --- a/js/src/new-regexp/regexp-shim.h +++ /dev/null @@ -1,1181 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim: set ts=8 sts=2 et sw=2 tw=80: - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef RegexpShim_h -#define RegexpShim_h - -#include "mozilla/Assertions.h" -#include "mozilla/Attributes.h" -#include "mozilla/MathAlgorithms.h" -#include "mozilla/Maybe.h" -#include "mozilla/SegmentedVector.h" -#include "mozilla/Types.h" - -#include -#include -#include // needed for gcc 10 - -#include "jit/Label.h" -#include "jit/shared/Assembler-shared.h" -#include "js/Value.h" -#include "new-regexp/RegExpTypes.h" -#include "new-regexp/util/flags.h" -#include "new-regexp/util/vector.h" -#include "new-regexp/util/zone.h" -#include "vm/NativeObject.h" - -// Forward declaration of classes -namespace v8 { -namespace internal { - -class Heap; -class Isolate; -class RegExpMatchInfo; -class RegExpStack; - -} // namespace internal -} // namespace v8 - -#define V8_WARN_UNUSED_RESULT MOZ_MUST_USE -#define V8_EXPORT_PRIVATE MOZ_EXPORT -#define V8_FALLTHROUGH MOZ_FALLTHROUGH - -#define FATAL(x) MOZ_CRASH(x) -#define UNREACHABLE() MOZ_CRASH("unreachable code") -#define UNIMPLEMENTED() MOZ_CRASH("unimplemented code") -#define STATIC_ASSERT(exp) static_assert(exp, #exp) - -#define DCHECK MOZ_ASSERT -#define DCHECK_EQ(lhs, rhs) MOZ_ASSERT((lhs) == (rhs)) -#define DCHECK_NE(lhs, rhs) MOZ_ASSERT((lhs) != (rhs)) -#define DCHECK_GT(lhs, rhs) MOZ_ASSERT((lhs) > (rhs)) -#define DCHECK_GE(lhs, rhs) MOZ_ASSERT((lhs) >= (rhs)) -#define DCHECK_LT(lhs, rhs) MOZ_ASSERT((lhs) < (rhs)) -#define DCHECK_LE(lhs, rhs) MOZ_ASSERT((lhs) <= (rhs)) -#define DCHECK_NULL(val) MOZ_ASSERT((val) == nullptr) -#define DCHECK_NOT_NULL(val) MOZ_ASSERT((val) != nullptr) -#define DCHECK_IMPLIES(lhs, rhs) MOZ_ASSERT_IF(lhs, rhs) -#define CHECK MOZ_RELEASE_ASSERT -#define CHECK_LE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) <= (rhs)) - -template -static constexpr inline T Min(T t1, T t2) { - return t1 < t2 ? t1 : t2; -} - -template -static constexpr inline T Max(T t1, T t2) { - return t1 > t2 ? t1 : t2; -} -#define MemCopy memcpy - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L310-L319 -// ptrdiff_t is 't' according to the standard, but MSVC uses 'I'. -#ifdef _MSC_VER -# define V8PRIxPTRDIFF "Ix" -# define V8PRIdPTRDIFF "Id" -# define V8PRIuPTRDIFF "Iu" -#else -# define V8PRIxPTRDIFF "tx" -# define V8PRIdPTRDIFF "td" -# define V8PRIuPTRDIFF "tu" -#endif - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L27-L38 -// The arraysize(arr) macro returns the # of elements in an array arr. -// The expression is a compile-time constant, and therefore can be -// used in defining new arrays, for example. If you use arraysize on -// a pointer by mistake, you will get a compile-time error. -#define arraysize(array) (sizeof(ArraySizeHelper(array))) - -// This template function declaration is used in defining arraysize. -// Note that the function doesn't need an implementation, as we only -// use its type. -template -char (&ArraySizeHelper(T (&array)[N]))[N]; - -// Explicitly declare the assignment operator as deleted. -#define DISALLOW_ASSIGN(TypeName) TypeName& operator=(const TypeName&) = delete - -// Explicitly declare the copy constructor and assignment operator as deleted. -// This also deletes the implicit move constructor and implicit move assignment -// operator, but still allows to manually define them. -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ - DISALLOW_ASSIGN(TypeName) - -// Explicitly declare all implicit constructors as deleted, namely the -// default constructor, copy constructor and operator= functions. -// This is especially useful for classes containing only static methods. -#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ - TypeName() = delete; \ - DISALLOW_COPY_AND_ASSIGN(TypeName) - -namespace v8 { - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L364-L367 -template -constexpr inline bool IsAligned(T value, U alignment) { - return (value & (alignment - 1)) == 0; -} - -using byte = uint8_t; -using Address = uintptr_t; -static const Address kNullAddress = 0; - -// Latin1/UTF-16 constants -// Code-point values in Unicode 4.0 are 21 bits wide. -// Code units in UTF-16 are 16 bits wide. -using uc16 = char16_t; -using uc32 = int32_t; - -namespace base { - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L247-L258 -// The USE(x, ...) template is used to silence C++ compiler warnings -// issued for (yet) unused variables (typically parameters). -// The arguments are guaranteed to be evaluated from left to right. -struct Use { - template - Use(T&&) {} // NOLINT(runtime/explicit) -}; -#define USE(...) \ - do { \ - ::v8::base::Use unused_tmp_array_for_use_macro[]{__VA_ARGS__}; \ - (void)unused_tmp_array_for_use_macro; \ - } while (false) - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/safe_conversions.h#L35-L39 -// saturated_cast<> is analogous to static_cast<> for numeric types, except -// that the specified numeric conversion will saturate rather than overflow or -// underflow. -template -inline Dst saturated_cast(Src value); - -// This is the only specialization that is needed for regexp code. -// Instead of pulling in dozens of lines of template goo -// to derive it, I used the implementation from uint8_clamped in -// ArrayBufferObject.h. -template <> -inline uint8_t saturated_cast(int x) { - return (x >= 0) ? ((x < 255) ? uint8_t(x) : 255) : 0; -} - -#define LAZY_INSTANCE_INITIALIZER { mozilla::Nothing() } - -template -struct LazyInstanceImpl { - mozilla::Maybe value_; - T* Pointer() { - if (value_.isNothing()) { - value_.emplace(); - } - return value_.ptr(); - } -}; - -template -class LazyInstance { -public: - using type = LazyInstanceImpl; -}; - - -namespace bits { - -inline uint64_t CountTrailingZeros(uint64_t value) { - return mozilla::CountTrailingZeroes64(value); -} - -inline size_t RoundUpToPowerOfTwo32(size_t value) { - return mozilla::RoundUpPow2(value); -} - -} // namespace bits -} // namespace base - -namespace unibrow { - -using uchar = unsigned int; - -// Origin: -// https://github.com/v8/v8/blob/1f1e4cdb04c75eab77adbecd5f5514ddc3eb56cf/src/strings/unicode.h#L133-L150 -class Latin1 { - public: - static const uc16 kMaxChar = 0xff; - - // Convert the character to Latin-1 case equivalent if possible. - static inline uc16 TryConvertToLatin1(uc16 c) { - // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN". - // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN". - if (c == 0x039C || c == 0x03BC) { - return 0xB5; - } - // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER - // Y WITH DIAERESIS". - if (c == 0x0178) { - return 0xFF; - } - return c; - } -}; - -// Origin: -// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L99-L131 -class Utf16 { - public: - static inline bool IsLeadSurrogate(int code) { - return js::unicode::IsLeadSurrogate(code); - } - static inline bool IsTrailSurrogate(int code) { - return js::unicode::IsTrailSurrogate(code); - } - static inline uc16 LeadSurrogate(uint32_t char_code) { - return js::unicode::LeadSurrogate(char_code); - } - static inline uc16 TrailSurrogate(uint32_t char_code) { - return js::unicode::TrailSurrogate(char_code); - } - static inline uint32_t CombineSurrogatePair(char16_t lead, char16_t trail) { - return js::unicode::UTF16Decode(lead, trail); - } - static const uchar kMaxNonSurrogateCharCode = 0xffff; -}; - -#ifndef V8_INTL_SUPPORT - -// A cache used in case conversion. It caches the value for characters -// that either have no mapping or map to a single character independent -// of context. Characters that map to more than one character or that -// map differently depending on context are always looked up. -// Origin: -// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L64-L88 -template -class Mapping { - public: - inline Mapping() = default; - inline int get(uchar c, uchar n, uchar* result) { - CacheEntry entry = entries_[c & kMask]; - if (entry.code_point_ == c) { - if (entry.offset_ == 0) { - return 0; - } else { - result[0] = c + entry.offset_; - return 1; - } - } else { - return CalculateValue(c, n, result); - } - } - - private: - int CalculateValue(uchar c, uchar n, uchar* result) { - bool allow_caching = true; - int length = T::Convert(c, n, result, &allow_caching); - if (allow_caching) { - if (length == 1) { - entries_[c & kMask] = CacheEntry(c, result[0] - c); - return 1; - } else { - entries_[c & kMask] = CacheEntry(c, 0); - return 0; - } - } else { - return length; - } - } - - struct CacheEntry { - inline CacheEntry() : code_point_(kNoChar), offset_(0) {} - inline CacheEntry(uchar code_point, signed offset) - : code_point_(code_point), offset_(offset) {} - uchar code_point_; - signed offset_; - static const int kNoChar = (1 << 21) - 1; - }; - static const int kSize = size; - static const int kMask = kSize - 1; - CacheEntry entries_[kSize]; -}; - -// Origin: -// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L241-L252 -struct Ecma262Canonicalize { - static const int kMaxWidth = 1; - static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); -}; -struct Ecma262UnCanonicalize { - static const int kMaxWidth = 4; - static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); -}; -struct CanonicalizationRange { - static const int kMaxWidth = 1; - static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); -}; - -#endif // !V8_INTL_SUPPORT - -struct Letter { - static bool Is(uchar c); -}; - -} // namespace unibrow - -namespace internal { - -#define PRINTF_FORMAT(x, y) MOZ_FORMAT_PRINTF(x, y) -void PRINTF_FORMAT(1, 2) PrintF(const char* format, ...); -void PRINTF_FORMAT(2, 3) PrintF(FILE* out, const char* format, ...); - -// Superclass for classes only using static method functions. -// The subclass of AllStatic cannot be instantiated at all. -class AllStatic { -#ifdef DEBUG - public: - AllStatic() = delete; -#endif -}; - -// Superclass for classes managed with new and delete. -// In irregexp, this is only AlternativeGeneration (in regexp-compiler.cc) -// Compare: -// https://github.com/v8/v8/blob/7b3332844212d78ee87a9426f3a6f7f781a8fbfa/src/utils/allocation.cc#L88-L96 -class Malloced { - public: - static void* operator new(size_t size) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - void* result = js_malloc(size); - if (!result) { - oomUnsafe.crash("Irregexp Malloced shim"); - } - return result; - } - static void operator delete(void* p) { js_free(p); } -}; - -constexpr int32_t KB = 1024; -constexpr int32_t MB = 1024 * 1024; - -#define kMaxInt JSVAL_INT_MAX -#define kMinInt JSVAL_INT_MIN -constexpr int kSystemPointerSize = sizeof(void*); - -// The largest integer n such that n and n + 1 are both exactly -// representable as a Number value. ES6 section 20.1.2.6 -constexpr double kMaxSafeInteger = 9007199254740991.0; // 2^53-1 - -constexpr int kBitsPerByte = 8; -constexpr int kBitsPerByteLog2 = 3; -constexpr int kUInt32Size = sizeof(uint32_t); -constexpr int kInt64Size = sizeof(int64_t); -constexpr int kUC16Size = sizeof(uc16); - -inline constexpr bool IsDecimalDigit(uc32 c) { return c >= '0' && c <= '9'; } -inline bool is_uint24(int val) { return (val & 0x00ffffff) == val; } - -inline bool IsIdentifierStart(uc32 c) { - return js::unicode::IsIdentifierStart(uint32_t(c)); -} -inline bool IsIdentifierPart(uc32 c) { - return js::unicode::IsIdentifierPart(uint32_t(c)); -} - -// Wrappers to disambiguate char16_t and uc16. -struct AsUC16 { - explicit AsUC16(char16_t v) : value(v) {} - char16_t value; -}; - -struct AsUC32 { - explicit AsUC32(int32_t v) : value(v) {} - int32_t value; -}; - -std::ostream& operator<<(std::ostream& os, const AsUC16& c); -std::ostream& operator<<(std::ostream& os, const AsUC32& c); - -// This class is used for the output of trace-regexp-parser. V8 has -// an elaborate implementation to ensure that the output gets to the -// right place, even on Android. We just need something that will -// print output (ideally to stderr, to match the rest of our tracing -// code). This is an empty wrapper that will convert itself to -// std::cerr when used. -class StdoutStream { -public: - operator std::ostream&() const; - template std::ostream& operator<<(T t); -}; - -// Reuse existing Maybe implementation -using mozilla::Maybe; - -template -Maybe Just(const T& value) { - return mozilla::Some(value); -} - -template -mozilla::Nothing Nothing() { - return mozilla::Nothing(); -} - - -template -using PseudoHandle = mozilla::UniquePtr; - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/utils.h#L600-L642 -// Compare 8bit/16bit chars to 8bit/16bit chars. -// Used indirectly by regexp-interpreter.cc -template -inline int CompareCharsUnsigned(const lchar* lhs, const rchar* rhs, - size_t chars) { - const lchar* limit = lhs + chars; - if (sizeof(*lhs) == sizeof(char) && sizeof(*rhs) == sizeof(char)) { - // memcmp compares byte-by-byte, yielding wrong results for two-byte - // strings on little-endian systems. - return memcmp(lhs, rhs, chars); - } - while (lhs < limit) { - int r = static_cast(*lhs) - static_cast(*rhs); - if (r != 0) return r; - ++lhs; - ++rhs; - } - return 0; -} -template -inline int CompareChars(const lchar* lhs, const rchar* rhs, size_t chars) { - DCHECK_LE(sizeof(lchar), 2); - DCHECK_LE(sizeof(rchar), 2); - if (sizeof(lchar) == 1) { - if (sizeof(rchar) == 1) { - return CompareCharsUnsigned(reinterpret_cast(lhs), - reinterpret_cast(rhs), chars); - } else { - return CompareCharsUnsigned(reinterpret_cast(lhs), - reinterpret_cast(rhs), - chars); - } - } else { - if (sizeof(rchar) == 1) { - return CompareCharsUnsigned(reinterpret_cast(lhs), - reinterpret_cast(rhs), chars); - } else { - return CompareCharsUnsigned(reinterpret_cast(lhs), - reinterpret_cast(rhs), - chars); - } - } -} - -// Origin: -// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/utils.h#L40-L48 -// Returns the value (0 .. 15) of a hexadecimal character c. -// If c is not a legal hexadecimal character, returns a value < 0. -// Used in regexp-parser.cc -inline int HexValue(uc32 c) { - c -= '0'; - if (static_cast(c) <= 9) return c; - c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. - if (static_cast(c) <= 5) return c + 10; - return -1; -} - -// V8::Object ~= JS::Value -class Object { - public: - // The default object constructor in V8 stores a nullptr, - // which has its low bit clear and is interpreted as Smi(0). - constexpr Object() : value_(JS::Int32Value(0)) {} - - // Conversions to/from SpiderMonkey types - constexpr Object(JS::Value value) : value_(value) {} - operator JS::Value() const { return value_; } - - // Used in regexp-macro-assembler.cc and regexp-interpreter.cc to - // check the return value of isolate->stack_guard()->HandleInterrupts() - // In V8, this will be either an exception object or undefined. - // In SM, we store the exception in the context, so we can use our normal - // idiom: return false iff we are throwing an exception. - inline bool IsException(Isolate*) const { return !value_.toBoolean(); } - - protected: - JS::Value value_; -}; - -class Smi : public Object { - public: - static Smi FromInt(int32_t value) { - Smi smi; - smi.value_ = JS::Int32Value(value); - return smi; - } - static inline int32_t ToInt(const Object object) { - return JS::Value(object).toInt32(); - } -}; - -// V8::HeapObject ~= JSObject -class HeapObject : public Object { - public: - inline static HeapObject cast(Object object) { - HeapObject h; - h.value_ = JS::Value(object); - return h; - } -}; - -// A fixed-size array with Objects (aka Values) as element types -// Only used for named captures. Allocated during parsing, so -// can't be a GC thing. -// TODO: implement. -class FixedArray : public HeapObject { - public: - inline void set(uint32_t index, Object value) {} - inline static FixedArray cast(Object object) { MOZ_CRASH("TODO"); } -}; - -class ByteArrayData { -public: - uint32_t length; - uint8_t* data(); -}; - -/* - * Conceptually, ByteArrayData is a variable-size structure. To - * implement this in a C++-approved way, we allocate a struct - * containing the 32-bit length field, followed by additional memory - * for the data. To access the data, we get a pointer to the next byte - * after the length field and cast it to the correct type. - */ -inline uint8_t* ByteArrayData::data() { - static_assert(alignof(uint8_t) <= alignof(ByteArrayData), - "The trailing data must be aligned to start immediately " - "after the header with no padding."); - ByteArrayData* immediatelyAfter = this + 1; - return reinterpret_cast(immediatelyAfter); -} - -// A fixed-size array of bytes. -class ByteArray : public HeapObject { - ByteArrayData* inner() const { - return static_cast(value_.toPrivate()); - } -public: - PseudoHandle takeOwnership(Isolate* isolate); - byte get(uint32_t index) { - MOZ_ASSERT(index < length()); - return inner()->data()[index]; - } - void set(uint32_t index, byte val) { - MOZ_ASSERT(index < length()); - inner()->data()[index] = val; - } - uint32_t length() const { return inner()->length; } - byte* GetDataStartAddress() { return inner()->data(); } - - static ByteArray cast(Object object) { - ByteArray b; - b.value_ = JS::Value(object); - return b; - } -}; - -// Like Handles in SM, V8 handles are references to marked pointers. -// Unlike SM, where Rooted pointers are created individually on the -// stack, the target of a V8 handle lives in an arena on the isolate -// (~= JSContext). Whenever a Handle is created, a new "root" is -// created at the end of the arena. -// -// HandleScopes are used to manage the lifetimes of these handles. A -// HandleScope lives on the stack and stores the size of the arena at -// the time of its creation. When the function returns and the -// HandleScope is destroyed, the arena is truncated to its previous -// size, clearing all roots that were created since the creation of -// the HandleScope. -// -// In some cases, objects that are GC-allocated in V8 are not in SM. -// In particular, irregexp allocates ByteArrays during code generation -// to store lookup tables. This does not play nicely with the SM -// macroassembler's requirement that no GC allocations take place -// while it is on the stack. To work around this, this shim layer also -// provides the ability to create pseudo-handles, which are not -// managed by the GC but provide the same API to irregexp. The "root" -// of a pseudohandle is a unique pointer living in a second arena. If -// the allocated object should outlive the HandleScope, it must be -// manually moved out of the arena using takeOwnership. - -class MOZ_STACK_CLASS HandleScope { -public: - HandleScope(Isolate* isolate); - ~HandleScope(); - - private: - size_t level_; - size_t non_gc_level_; - Isolate* isolate_; - - friend class Isolate; -}; - -// Origin: -// https://github.com/v8/v8/blob/5792f3587116503fc047d2f68c951c72dced08a5/src/handles/handles.h#L88-L171 -template -class MOZ_NONHEAP_CLASS Handle { - public: - Handle() : location_(nullptr) {} - Handle(T object, Isolate* isolate); - Handle(JS::Value value, Isolate* isolate); - - // Constructor for handling automatic up casting. - template ::value>::type> - inline Handle(Handle handle) : location_(handle.location_) {} - - template - inline static const Handle cast(Handle that) { - return Handle(that.location_); - } - - inline bool is_null() const { return location_ == nullptr; } - - inline T operator*() const { - return T::cast(Object(*location_)); - }; - - // {ObjectRef} is returned by {Handle::operator->}. It should never be stored - // anywhere or used in any other code; no one should ever have to spell out - // {ObjectRef} in code. Its only purpose is to be dereferenced immediately by - // "operator-> chaining". Returning the address of the field is valid because - // this object's lifetime only ends at the end of the full statement. - // Origin: - // https://github.com/v8/v8/blob/03aaa4b3bf4cb01eee1f223b252e6869b04ab08c/src/handles/handles.h#L91-L105 - class ObjectRef { - public: - T* operator->() { return &object_; } - - private: - friend class Handle; - explicit ObjectRef(T object) : object_(object) {} - - T object_; - }; - inline ObjectRef operator->() const { return ObjectRef{**this}; } - - static Handle fromHandleValue(JS::HandleValue handle) { - return Handle(handle.address()); - } - - private: - Handle(const JS::Value* location) : location_(location) {} - - template - friend class Handle; - template - friend class MaybeHandle; - - const JS::Value* location_; -}; - -// A Handle can be converted into a MaybeHandle. Converting a MaybeHandle -// into a Handle requires checking that it does not point to nullptr. This -// ensures nullptr checks before use. -// -// Also note that Handles do not provide default equality comparison or hashing -// operators on purpose. Such operators would be misleading, because intended -// semantics is ambiguous between Handle location and object identity. -// Origin: -// https://github.com/v8/v8/blob/5792f3587116503fc047d2f68c951c72dced08a5/src/handles/maybe-handles.h#L15-L78 -template -class MOZ_NONHEAP_CLASS MaybeHandle final { - public: - MaybeHandle() : location_(nullptr) {} - - // Constructor for handling automatic up casting from Handle. - // Ex. Handle can be passed when MaybeHandle is expected. - template ::value>::type> - MaybeHandle(Handle handle) : location_(handle.location_) {} - - inline Handle ToHandleChecked() const { - MOZ_RELEASE_ASSERT(location_); - return Handle(location_); - } - - // Convert to a Handle with a type that can be upcasted to. - template - inline bool ToHandle(Handle* out) const { - if (location_) { - *out = Handle(location_); - return true; - } else { - *out = Handle(); - return false; - } - } - -private: - JS::Value* location_; -}; - -// From v8/src/handles/handles-inl.h - -template -inline Handle handle(T object, Isolate* isolate) { - return Handle(object, isolate); -} - -// RAII Guard classes - -class DisallowHeapAllocation { - public: - DisallowHeapAllocation() {} - operator const JS::AutoCheckCannotGC&() const { return no_gc_; } - - private: - const JS::AutoCheckCannotGC no_gc_; -}; - -// This is used inside DisallowHeapAllocation regions to enable -// allocation just before throwing an exception, to allocate the -// exception object. Specifically, it only ever guards: -// - isolate->stack_guard()->HandleInterrupts() -// - isolate->StackOverflow() -// Those cases don't allocate in SpiderMonkey, so this can be a no-op. -class AllowHeapAllocation { - public: - // Empty constructor to avoid unused_variable warnings - AllowHeapAllocation() {} -}; - -// Origin: -// https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L83-L474 -class String : public HeapObject { - private: - JSString* str() const { return value_.toString(); } - - public: - String() : HeapObject() {} - String(JSString* str) { value_ = JS::StringValue(str); } - - operator JSString*() const { return str(); } - - // Max char codes. - static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; - static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar; - static const int kMaxUtf16CodeUnit = 0xffff; - static const uc32 kMaxCodePoint = 0x10ffff; - - MOZ_ALWAYS_INLINE int length() const { return str()->length(); } - bool IsFlat() { return str()->isLinear(); }; - - // Origin: - // https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L95-L152 - class FlatContent { - public: - FlatContent(JSLinearString* string, const DisallowHeapAllocation& no_gc) - : string_(string), no_gc_(no_gc) {} - inline bool IsOneByte() const { return string_->hasLatin1Chars(); } - inline bool IsTwoByte() const { return !string_->hasLatin1Chars(); } - - Vector ToOneByteVector() const { - MOZ_ASSERT(IsOneByte()); - return Vector(string_->latin1Chars(no_gc_), - string_->length()); - } - Vector ToUC16Vector() const { - MOZ_ASSERT(IsTwoByte()); - return Vector(string_->twoByteChars(no_gc_), - string_->length()); - } - private: - const JSLinearString* string_; - const JS::AutoCheckCannotGC& no_gc_; - }; - FlatContent GetFlatContent(const DisallowHeapAllocation& no_gc) { - MOZ_ASSERT(IsFlat()); - return FlatContent(&str()->asLinear(), no_gc); - } - - static Handle Flatten(Isolate* isolate, Handle string); - - inline static String cast(Object object) { - String s; - s.value_ = JS::StringValue(JS::Value(object).toString()); - return s; - } - - inline static bool IsOneByteRepresentationUnderneath(String string) { - return string.str()->hasLatin1Chars(); - } - inline bool IsOneByteRepresentation() const { - return str()->hasLatin1Chars(); - } - - std::unique_ptr ToCString(); - - template - Vector GetCharVector(const DisallowHeapAllocation& no_gc); -}; - -template <> -inline Vector String::GetCharVector( - const DisallowHeapAllocation& no_gc) { - String::FlatContent flat = GetFlatContent(no_gc); - MOZ_ASSERT(flat.IsOneByte()); - return flat.ToOneByteVector(); -} - -template <> -inline Vector String::GetCharVector( - const DisallowHeapAllocation& no_gc) { - String::FlatContent flat = GetFlatContent(no_gc); - MOZ_ASSERT(flat.IsTwoByte()); - return flat.ToUC16Vector(); -} - -// A flat string reader provides random access to the contents of a -// string independent of the character width of the string. The handle -// must be valid as long as the reader is being used. -// Origin: -// https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L807-L825 -class MOZ_STACK_CLASS FlatStringReader { - public: - FlatStringReader(JSLinearString* string) - : length_(string->length()), - is_latin1_(string->hasLatin1Chars()) { - - if (is_latin1_) { - latin1_chars_ = string->latin1Chars(nogc_); - } else { - two_byte_chars_ = string->twoByteChars(nogc_); - } - } - FlatStringReader(const char16_t* chars, size_t length) - : two_byte_chars_(chars), - length_(length), - is_latin1_(false) {} - - int length() { return length_; } - - inline char16_t Get(size_t index) { - MOZ_ASSERT(index < length_); - if (is_latin1_) { - return latin1_chars_[index]; - } else { - return two_byte_chars_[index]; - } - } - - private: - union { - const JS::Latin1Char *latin1_chars_; - const char16_t* two_byte_chars_; - }; - size_t length_; - bool is_latin1_; - JS::AutoCheckCannotGC nogc_; -}; - -class JSRegExp : public HeapObject { - public: - // ****************************************************** - // Methods that are called from inside the implementation - // ****************************************************** - void TierUpTick() { /*inner()->tierUpTick();*/ } - bool MarkedForTierUp() const { - return false; /*inner()->markedForTierUp();*/ - } - - // TODO: hook these up - Object Code(bool is_latin1) const { return Object(JS::UndefinedValue()); } - Object Bytecode(bool is_latin1) const { return Object(JS::UndefinedValue()); } - - uint32_t BacktrackLimit() const { - return 0; /*inner()->backtrackLimit();*/ - } - - static JSRegExp cast(Object object) { - JSRegExp regexp; - MOZ_ASSERT(JS::Value(object).toGCThing()->is()); - regexp.value_ = JS::PrivateGCThingValue(JS::Value(object).toGCThing()); - return regexp; - } - - // ****************************** - // Static constants - // ****************************** - - // Meaning of Type: - // NOT_COMPILED: Initial value. No data has been stored in the JSRegExp yet. - // ATOM: A simple string to match against using an indexOf operation. - // IRREGEXP: Compiled with Irregexp. - enum Type { NOT_COMPILED, ATOM, IRREGEXP }; - - // Maximum number of captures allowed. - static constexpr int kMaxCaptures = 1 << 16; - - // ************************************************** - // JSRegExp::Flags - // ************************************************** - - struct FlagShiftBit { - static constexpr int kGlobal = 0; - static constexpr int kIgnoreCase = 1; - static constexpr int kMultiline = 2; - static constexpr int kSticky = 3; - static constexpr int kUnicode = 4; - static constexpr int kDotAll = 5; - static constexpr int kInvalid = 6; - }; - enum Flag : uint8_t { - kNone = 0, - kGlobal = 1 << FlagShiftBit::kGlobal, - kIgnoreCase = 1 << FlagShiftBit::kIgnoreCase, - kMultiline = 1 << FlagShiftBit::kMultiline, - kSticky = 1 << FlagShiftBit::kSticky, - kUnicode = 1 << FlagShiftBit::kUnicode, - kDotAll = 1 << FlagShiftBit::kDotAll, - kInvalid = 1 << FlagShiftBit::kInvalid, // Not included in FlagCount. - }; - using Flags = base::Flags; - static constexpr int kFlagCount = 6; - - static constexpr int kNoBacktrackLimit = 0; - -private: - js::RegExpShared* inner() { - return reinterpret_cast(value_.toGCThing()); - } -}; - -class Histogram { - public: - inline void AddSample(int sample) {} -}; - -class Counters { - public: - Histogram* regexp_backtracks() { return ®exp_backtracks_; } - - private: - Histogram regexp_backtracks_; -}; - -#define PROFILE(isolate, call) \ - do { \ - } while (false); - -enum class AllocationType : uint8_t { - kYoung, // Allocate in the nursery - kOld, // Allocate in the tenured heap -}; - -using StackGuard = Isolate; -using Factory = Isolate; - -class Isolate { - public: - //********** Isolate code **********// - RegExpStack* regexp_stack() const { return regexpStack_; } - byte* top_of_regexp_stack() const; - - // This is called from inside no-GC code. Instead of suppressing GC - // to allocate the error, we return false from Execute and call - // ReportOverRecursed in the caller. - void StackOverflow() {} - -#ifndef V8_INTL_SUPPORT - unibrow::Mapping* jsregexp_uncanonicalize() { - return &jsregexp_uncanonicalize_; - } - unibrow::Mapping* - regexp_macro_assembler_canonicalize() { - return ®exp_macro_assembler_canonicalize_; - } - unibrow::Mapping* jsregexp_canonrange() { - return &jsregexp_canonrange_; - } - -private: - unibrow::Mapping jsregexp_uncanonicalize_; - unibrow::Mapping - regexp_macro_assembler_canonicalize_; - unibrow::Mapping jsregexp_canonrange_; -#endif // !V8_INTL_SUPPORT - -public: - // An empty stub for telemetry we don't support - void IncreaseTotalRegexpCodeGenerated(Handle code) {} - - Counters* counters() { return &counters_; } - - //********** Factory code **********// - inline Factory* factory() { return this; } - - Handle NewByteArray( - int length, AllocationType allocation = AllocationType::kYoung); - - // Allocates a fixed array initialized with undefined values. - Handle NewFixedArray(int length); - - template - Handle InternalizeString(const Vector& str); - - //********** Stack guard code **********// - inline StackGuard* stack_guard() { return this; } - Object HandleInterrupts() { - return Object(JS::BooleanValue(cx()->handleInterrupt(cx()))); - } - - JSContext* cx() const { return cx_; } - - void trace(JSTracer* trc); - - //********** Handle code **********// - - JS::Value* getHandleLocation(JS::Value value); - - private: - - mozilla::SegmentedVector handleArena_; - mozilla::SegmentedVector> uniquePtrArena_; - - void* allocatePseudoHandle(size_t bytes); - -public: - template - PseudoHandle takeOwnership(void* ptr); - -private: - void openHandleScope(HandleScope& scope) { - scope.level_ = handleArena_.Length(); - scope.non_gc_level_ = uniquePtrArena_.Length(); - } - void closeHandleScope(size_t prevLevel, size_t prevUniqueLevel) { - size_t currLevel = handleArena_.Length(); - handleArena_.PopLastN(currLevel - prevLevel); - - size_t currUniqueLevel = uniquePtrArena_.Length(); - uniquePtrArena_.PopLastN(currUniqueLevel - prevUniqueLevel); - } - friend class HandleScope; - - JSContext* cx_; - RegExpStack* regexpStack_; - Counters counters_; -}; - -// Origin: -// https://github.com/v8/v8/blob/50dcf2af54ce27801a71c47c1be1d2c5e36b0dd6/src/execution/isolate.h#L1909-L1931 -class StackLimitCheck { - public: - StackLimitCheck(Isolate* isolate) : cx_(isolate->cx()) {} - - // Use this to check for stack-overflows in C++ code. - bool HasOverflowed() { - JS_CHECK_RECURSION_DONT_REPORT(cx_, return true); - return false; - } - - // Use this to check for interrupt request in C++ code. - bool InterruptRequested() { - JSRuntime* rt = cx_->runtime(); - return rt->hasPendingInterrupt(); - } - - // Use this to check for stack-overflow when entering runtime from JS code. - bool JsHasOverflowed() { - JS_CHECK_RECURSION_CONSERVATIVE_DONT_REPORT(cx_, return true); - return false; - } - - private: - JSContext* cx_; -}; - -class Code : public HeapObject { - public: - uint8_t* raw_instruction_start() { return inner()->raw(); } - - static Code cast(Object object) { - Code c; - MOZ_ASSERT(JS::Value(object).toGCThing()->is()); - c.value_ = JS::PrivateGCThingValue(JS::Value(object).toGCThing()); - return c; - } - js::jit::JitCode* inner() { - return value_.toGCThing()->as(); - } -}; - -enum class MessageTemplate { kStackOverflow }; - -class MessageFormatter { - public: - static const char* TemplateString(MessageTemplate index) { - switch (index) { - case MessageTemplate::kStackOverflow: - return "too much recursion"; - } - } -}; - -// Origin: https://github.com/v8/v8/blob/master/src/codegen/label.h -class Label { - public: - Label() : inner_(js::jit::Label()) {} - - js::jit::Label* inner() { return &inner_; } - - void Unuse() { inner_.reset(); } - - bool is_linked() { return inner_.used(); } - bool is_bound() { return inner_.bound(); } - bool is_unused() { return !inner_.used() && !inner_.bound(); } - - int pos() { return inner_.offset(); } - void link_to(int pos) { inner_.use(pos); } - void bind_to(int pos) { inner_.bind(pos); } - - private: - js::jit::Label inner_; - js::jit::CodeOffset patchOffset_; - - friend class SMRegExpMacroAssembler; -}; - -// TODO: Map flags to jitoptions -extern bool FLAG_correctness_fuzzer_suppressions; -extern bool FLAG_enable_regexp_unaligned_accesses; -extern bool FLAG_harmony_regexp_sequence; -extern bool FLAG_regexp_interpret_all; -extern bool FLAG_regexp_mode_modifiers; -extern bool FLAG_regexp_optimization; -extern bool FLAG_regexp_peephole_optimization; -extern bool FLAG_regexp_possessive_quantifier; -extern bool FLAG_regexp_tier_up; -extern bool FLAG_trace_regexp_assembler; -extern bool FLAG_trace_regexp_bytecodes; -extern bool FLAG_trace_regexp_parser; -extern bool FLAG_trace_regexp_peephole_optimization; - -#define COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER - -} // namespace internal -} // namespace v8 - -#endif // RegexpShim_h diff --git a/js/src/new-regexp/regexp-stack.cc b/js/src/new-regexp/regexp-stack.cc deleted file mode 100644 index c8944541c..000000000 --- a/js/src/new-regexp/regexp-stack.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2009 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-stack.h" - - -namespace v8 { -namespace internal { - -RegExpStackScope::RegExpStackScope(Isolate* isolate) - : regexp_stack_(isolate->regexp_stack()) { - // Initialize, if not already initialized. - regexp_stack_->EnsureCapacity(0); -} - - -RegExpStackScope::~RegExpStackScope() { - // Reset the buffer if it has grown. - regexp_stack_->Reset(); -} - -RegExpStack::RegExpStack() : thread_local_(this), isolate_(nullptr) {} - -RegExpStack::~RegExpStack() { thread_local_.FreeAndInvalidate(); } - -char* RegExpStack::ArchiveStack(char* to) { - if (!thread_local_.owns_memory_) { - // Force dynamic stacks prior to archiving. Any growth will do. A dynamic - // stack is needed because stack archival & restoration rely on `memory_` - // pointing at a fixed-location backing store, whereas the static stack is - // tied to a RegExpStack instance. - EnsureCapacity(thread_local_.memory_size_ + 1); - DCHECK(thread_local_.owns_memory_); - } - - size_t size = sizeof(thread_local_); - MemCopy(reinterpret_cast(to), &thread_local_, size); - thread_local_ = ThreadLocal(this); - return to + size; -} - - -char* RegExpStack::RestoreStack(char* from) { - size_t size = sizeof(thread_local_); - MemCopy(&thread_local_, reinterpret_cast(from), size); - return from + size; -} - -void RegExpStack::Reset() { thread_local_.ResetToStaticStack(this); } - -void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) { - if (owns_memory_) DeleteArray(memory_); - - memory_ = regexp_stack->static_stack_; - memory_top_ = regexp_stack->static_stack_ + kStaticStackSize; - memory_size_ = kStaticStackSize; - limit_ = reinterpret_cast
(regexp_stack->static_stack_) + - kStackLimitSlack * kSystemPointerSize; - owns_memory_ = false; -} - -void RegExpStack::ThreadLocal::FreeAndInvalidate() { - if (owns_memory_) DeleteArray(memory_); - - // This stack may not be used after being freed. Just reset to invalid values - // to ensure we don't accidentally use old memory areas. - memory_ = nullptr; - memory_top_ = nullptr; - memory_size_ = 0; - limit_ = kMemoryTop; -} - -Address RegExpStack::EnsureCapacity(size_t size) { - if (size > kMaximumStackSize) return kNullAddress; - if (size < kMinimumDynamicStackSize) size = kMinimumDynamicStackSize; - if (thread_local_.memory_size_ < size) { - byte* new_memory = NewArray(size); - if (thread_local_.memory_size_ > 0) { - // Copy original memory into top of new memory. - MemCopy(new_memory + size - thread_local_.memory_size_, - thread_local_.memory_, thread_local_.memory_size_); - if (thread_local_.owns_memory_) DeleteArray(thread_local_.memory_); - } - thread_local_.memory_ = new_memory; - thread_local_.memory_top_ = new_memory + size; - thread_local_.memory_size_ = size; - thread_local_.limit_ = reinterpret_cast
(new_memory) + - kStackLimitSlack * kSystemPointerSize; - thread_local_.owns_memory_ = true; - } - return reinterpret_cast
(thread_local_.memory_top_); -} - - -} // namespace internal -} // namespace v8 diff --git a/js/src/new-regexp/regexp-stack.h b/js/src/new-regexp/regexp-stack.h deleted file mode 100644 index e32d0ed1f..000000000 --- a/js/src/new-regexp/regexp-stack.h +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2009 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_STACK_H_ -#define V8_REGEXP_REGEXP_STACK_H_ - -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -class RegExpStack; - -// Maintains a per-v8thread stack area that can be used by irregexp -// implementation for its backtracking stack. -// Since there is only one stack area, the Irregexp implementation is not -// re-entrant. I.e., no regular expressions may be executed in the same thread -// during a preempted Irregexp execution. -class RegExpStackScope { - public: - // Create and delete an instance to control the life-time of a growing stack. - - // Initializes the stack memory area if necessary. - explicit RegExpStackScope(Isolate* isolate); - ~RegExpStackScope(); // Releases the stack if it has grown. - - RegExpStack* stack() const { return regexp_stack_; } - - private: - RegExpStack* regexp_stack_; - - DISALLOW_COPY_AND_ASSIGN(RegExpStackScope); -}; - - -class RegExpStack { - public: - RegExpStack(); - ~RegExpStack(); - - // Number of allocated locations on the stack below the limit. - // No sequence of pushes must be longer that this without doing a stack-limit - // check. - static constexpr int kStackLimitSlack = 32; - - // Gives the top of the memory used as stack. - Address stack_base() { - DCHECK_NE(0, thread_local_.memory_size_); - DCHECK_EQ(thread_local_.memory_top_, - thread_local_.memory_ + thread_local_.memory_size_); - return reinterpret_cast
(thread_local_.memory_top_); - } - - // The total size of the memory allocated for the stack. - size_t stack_capacity() { return thread_local_.memory_size_; } - - // If the stack pointer gets below the limit, we should react and - // either grow the stack or report an out-of-stack exception. - // There is only a limited number of locations below the stack limit, - // so users of the stack should check the stack limit during any - // sequence of pushes longer that this. - Address* limit_address_address() { return &(thread_local_.limit_); } - - // Ensures that there is a memory area with at least the specified size. - // If passing zero, the default/minimum size buffer is allocated. - Address EnsureCapacity(size_t size); - - // Thread local archiving. - static constexpr int ArchiveSpacePerThread() { - return static_cast(sizeof(ThreadLocal)); - } - char* ArchiveStack(char* to); - char* RestoreStack(char* from); - void FreeThreadResources() { thread_local_.ResetToStaticStack(this); } - - // Maximal size of allocated stack area. - static constexpr size_t kMaximumStackSize = 64 * MB; - - private: - // Artificial limit used when the thread-local state has been destroyed. - static const Address kMemoryTop = - static_cast
(static_cast(-1)); - - // Minimal size of dynamically-allocated stack area. - static constexpr size_t kMinimumDynamicStackSize = 1 * KB; - - // In addition to dynamically-allocated, variable-sized stacks, we also have - // a statically allocated and sized area that is used whenever no dynamic - // stack is allocated. This guarantees that a stack is always available and - // we can skip availability-checks later on. - // It's double the slack size to ensure that we have a bit of breathing room - // before NativeRegExpMacroAssembler::GrowStack must be called. - static constexpr size_t kStaticStackSize = - 2 * kStackLimitSlack * kSystemPointerSize; - byte static_stack_[kStaticStackSize] = {0}; - - STATIC_ASSERT(kStaticStackSize <= kMaximumStackSize); - - // Structure holding the allocated memory, size and limit. - struct ThreadLocal { - explicit ThreadLocal(RegExpStack* regexp_stack) { - ResetToStaticStack(regexp_stack); - } - - // If memory_size_ > 0 then memory_ and memory_top_ must be non-nullptr - // and memory_top_ = memory_ + memory_size_ - byte* memory_ = nullptr; - byte* memory_top_ = nullptr; - size_t memory_size_ = 0; - Address limit_ = kNullAddress; - bool owns_memory_ = false; // Whether memory_ is owned and must be freed. - - void ResetToStaticStack(RegExpStack* regexp_stack); - void FreeAndInvalidate(); - }; - - // Address of top of memory used as stack. - Address memory_top_address_address() { - return reinterpret_cast
(&thread_local_.memory_top_); - } - - // Resets the buffer if it has grown beyond the default/minimum size. - // After this, the buffer is either the default size, or it is empty, so - // you have to call EnsureCapacity before using it again. - void Reset(); - - ThreadLocal thread_local_; - Isolate* isolate_; - - friend class ExternalReference; - friend class Isolate; - friend class RegExpStackScope; - - DISALLOW_COPY_AND_ASSIGN(RegExpStack); -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_STACK_H_ diff --git a/js/src/new-regexp/regexp.h b/js/src/new-regexp/regexp.h deleted file mode 100644 index f1e403bf0..000000000 --- a/js/src/new-regexp/regexp.h +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright 2012 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_H_ -#define V8_REGEXP_REGEXP_H_ - -#include "new-regexp/regexp-error.h" -#include "new-regexp/regexp-shim.h" - -namespace v8 { -namespace internal { - -class RegExpNode; -class RegExpTree; - -enum class RegExpCompilationTarget : int { kBytecode, kNative }; - -// TODO(jgruber): Do not expose in regexp.h. -// TODO(jgruber): Consider splitting between ParseData and CompileData. -struct RegExpCompileData { - // The parsed AST as produced by the RegExpParser. - RegExpTree* tree = nullptr; - - // The compiled Node graph as produced by RegExpTree::ToNode methods. - RegExpNode* node = nullptr; - - // Either the generated code as produced by the compiler or a trampoline - // to the interpreter. - Object code; - - // True, iff the pattern is a 'simple' atom with zero captures. In other - // words, the pattern consists of a string with no metacharacters and special - // regexp features, and can be implemented as a standard string search. - bool simple = true; - - // True, iff the pattern is anchored at the start of the string with '^'. - bool contains_anchor = false; - - // Only use if the pattern contains named captures. If so, this contains a - // mapping of capture names to capture indices. - Handle capture_name_map; - - // The error message. Only used if an error occurred during parsing or - // compilation. - RegExpError error = RegExpError::kNone; - - // The position at which the error was detected. Only used if an - // error occurred. - int error_pos = 0; - - // The number of capture groups, without the global capture \0. - int capture_count = 0; - - // The number of registers used by the generated code. - int register_count = 0; - - // The compilation target (bytecode or native code). - RegExpCompilationTarget compilation_target; -}; - -class RegExp final : public AllStatic { - public: - // Whether the irregexp engine generates interpreter bytecode. - static bool CanGenerateBytecode() { - return FLAG_regexp_interpret_all || FLAG_regexp_tier_up; - } - - // Parses the RegExp pattern and prepares the JSRegExp object with - // generic data and choice of implementation - as well as what - // the implementation wants to store in the data field. - // Returns false if compilation fails. - V8_WARN_UNUSED_RESULT static MaybeHandle Compile( - Isolate* isolate, Handle re, Handle pattern, - JSRegExp::Flags flags, uint32_t backtrack_limit); - - enum CallOrigin : int { - kFromRuntime = 0, - kFromJs = 1, - }; - - // See ECMA-262 section 15.10.6.2. - // This function calls the garbage collector if necessary. - V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle Exec( - Isolate* isolate, Handle regexp, Handle subject, - int index, Handle last_match_info); - - // Integral return values used throughout regexp code layers. - static constexpr int kInternalRegExpFailure = 0; - static constexpr int kInternalRegExpSuccess = 1; - static constexpr int kInternalRegExpException = -1; - static constexpr int kInternalRegExpRetry = -2; - - enum IrregexpResult : int32_t { - RE_FAILURE = kInternalRegExpFailure, - RE_SUCCESS = kInternalRegExpSuccess, - RE_EXCEPTION = kInternalRegExpException, - }; - - // Prepare a RegExp for being executed one or more times (using - // IrregexpExecOnce) on the subject. - // This ensures that the regexp is compiled for the subject, and that - // the subject is flat. - // Returns the number of integer spaces required by IrregexpExecOnce - // as its "registers" argument. If the regexp cannot be compiled, - // an exception is set as pending, and this function returns negative. - static int IrregexpPrepare(Isolate* isolate, Handle regexp, - Handle subject); - - // Set last match info. If match is nullptr, then setting captures is - // omitted. - static Handle SetLastMatchInfo( - Isolate* isolate, Handle last_match_info, - Handle subject, int capture_count, int32_t* match); - - V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone, - RegExpCompileData* input, - JSRegExp::Flags flags, - Handle pattern, - Handle sample_subject, - bool is_one_byte); - - V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label, - RegExpNode* node); - - static const int kRegExpTooLargeToOptimize = 20 * KB; -}; - -// Uses a special global mode of irregexp-generated code to perform a global -// search and return multiple results at once. As such, this is essentially an -// iterator over multiple results (retrieved batch-wise in advance). -class RegExpGlobalCache final { - public: - RegExpGlobalCache(Handle regexp, Handle subject, - Isolate* isolate); - - ~RegExpGlobalCache(); - - // Fetch the next entry in the cache for global regexp match results. - // This does not set the last match info. Upon failure, nullptr is - // returned. The cause can be checked with Result(). The previous result is - // still in available in memory when a failure happens. - int32_t* FetchNext(); - - int32_t* LastSuccessfulMatch(); - - bool HasException() { return num_matches_ < 0; } - - private: - int AdvanceZeroLength(int last_index); - - int num_matches_; - int max_matches_; - int current_match_index_; - int registers_per_match_; - // Pointer to the last set of captures. - int32_t* register_array_; - int register_array_size_; - Handle regexp_; - Handle subject_; - Isolate* isolate_; -}; - -// Caches results for specific regexp queries on the isolate. At the time of -// writing, this is used during global calls to RegExp.prototype.exec and -// @@split. -class RegExpResultsCache final : public AllStatic { - public: - enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS }; - - // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi. - // On success, the returned result is guaranteed to be a COW-array. - static Object Lookup(Heap* heap, String key_string, Object key_pattern, - FixedArray* last_match_out, ResultsCacheType type); - // Attempt to add value_array to the cache specified by type. On success, - // value_array is turned into a COW-array. - static void Enter(Isolate* isolate, Handle key_string, - Handle key_pattern, Handle value_array, - Handle last_match_cache, ResultsCacheType type); - static void Clear(FixedArray cache); - - static constexpr int kRegExpResultsCacheSize = 0x100; - - private: - static constexpr int kStringOffset = 0; - static constexpr int kPatternOffset = 1; - static constexpr int kArrayOffset = 2; - static constexpr int kLastMatchOffset = 3; - static constexpr int kArrayEntriesPerCacheEntry = 4; -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_H_ diff --git a/js/src/new-regexp/special-case.cc b/js/src/new-regexp/special-case.cc deleted file mode 100644 index d767b94c2..000000000 --- a/js/src/new-regexp/special-case.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that -// can be found in the LICENSE file. - -// Automatically generated by regexp/gen-regexp-special-case.cc - -// The following functions are used to build UnicodeSets -// for special cases where the case-folding algorithm used by -// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match -// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime -// Semantics: Canonicalize) step 3. - -#ifdef V8_INTL_SUPPORT -#include "new-regexp/special-case.h" - -#include "unicode/uniset.h" -namespace v8 { -namespace internal { - -icu::UnicodeSet BuildIgnoreSet() { - icu::UnicodeSet set; - set.add(0xdf); - set.add(0x17f); - set.add(0x390); - set.add(0x3b0); - set.add(0x3f4); - set.add(0x1e9e); - set.add(0x1f80, 0x1faf); - set.add(0x1fb3); - set.add(0x1fbc); - set.add(0x1fc3); - set.add(0x1fcc); - set.add(0x1fd3); - set.add(0x1fe3); - set.add(0x1ff3); - set.add(0x1ffc); - set.add(0x2126); - set.add(0x212a, 0x212b); - set.add(0xfb05, 0xfb06); - set.freeze(); - return set; -} - -struct IgnoreSetData { - IgnoreSetData() : set(BuildIgnoreSet()) {} - const icu::UnicodeSet set; -}; - -//static -const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() { - static base::LazyInstance::type set = - LAZY_INSTANCE_INITIALIZER; - return set.Pointer()->set; -} - -icu::UnicodeSet BuildSpecialAddSet() { - icu::UnicodeSet set; - set.add(0x4b); - set.add(0x53); - set.add(0x6b); - set.add(0x73); - set.add(0xc5); - set.add(0xe5); - set.add(0x398); - set.add(0x3a9); - set.add(0x3b8); - set.add(0x3c9); - set.add(0x3d1); - set.freeze(); - return set; -} - -struct SpecialAddSetData { - SpecialAddSetData() : set(BuildSpecialAddSet()) {} - const icu::UnicodeSet set; -}; - -//static -const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() { - static base::LazyInstance::type set = - LAZY_INSTANCE_INITIALIZER; - return set.Pointer()->set; -} - - -} // namespace internal -} // namespace v8 -#endif // V8_INTL_SUPPORT diff --git a/js/src/new-regexp/special-case.h b/js/src/new-regexp/special-case.h deleted file mode 100644 index 31dfd7858..000000000 --- a/js/src/new-regexp/special-case.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_SPECIAL_CASE_H_ -#define V8_REGEXP_SPECIAL_CASE_H_ - -#ifdef V8_INTL_SUPPORT -#include "new-regexp/regexp-shim.h" - -#include "unicode/uchar.h" -#include "unicode/uniset.h" -#include "unicode/unistr.h" - -namespace v8 { -namespace internal { - -// Sets of Unicode characters that need special handling under "i" mode - -// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262 -// defines slightly different case-folding rules than Unicode. An -// input character should match a pattern character if the result of -// the Canonicalize algorithm is the same for both characters. -// -// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as -// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character -// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See -// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for -// the precise definition. -// -// While compiling such regular expressions, we need to compute the -// set of characters that should match a given input character. (See -// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.) -// For almost all characters, this can be efficiently computed using -// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent -// the remaining special cases. -// -// For a character c, the rules are as follows: -// -// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling -// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet -// containing c will produce the set of characters that should -// match /c/i (or /[c]/i), and only those characters. -// -// 2. If c is in IgnoreSet, then the only character it should match is -// itself. However, closeOver will add additional incorrect -// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ' -// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is -// "SS". Step 3.e therefore requires that 'ß' canonicalizes to -// itself, and should not match 'ẞ'. In these cases, we can skip -// the closeOver entirely, because it will never add an equivalent -// character. -// -// 3. If c is in SpecialAddSet, then it should match at least one -// character other than itself. However, closeOver will add at -// least one additional incorrect match. For example, consider the -// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase -// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN -// SIGN should not match either of the other two characters. As a -// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in -// IgnoreSet). To find the correct matches for characters in -// SpecialAddSet, we closeOver the original character, but filter -// out the results that do not have the same canonical value. -// -// The contents of these sets are calculated at build time by -// src/regexp/gen-regexp-special-case.cc, which generates -// gen/src/regexp/special-case.cc. This is done by iterating over the -// result of closeOver for each BMP character, and finding sets for -// which at least one character has a different canonical value than -// another character. Characters that match no other characters in -// their equivalence class are added to IgnoreSet. Characters that -// match at least one other character are added to SpecialAddSet. - -class RegExpCaseFolding final : public AllStatic { - public: - static const icu::UnicodeSet& IgnoreSet(); - static const icu::UnicodeSet& SpecialAddSet(); - - // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: - // Canonicalize) step 3, which is used to determine whether - // characters match when ignoreCase is true and unicode is false. - static UChar32 Canonicalize(UChar32 ch) { - // a. Assert: ch is a UTF-16 code unit. - CHECK_LE(ch, 0xffff); - - // b. Let s be the String value consisting of the single code unit ch. - icu::UnicodeString s(ch); - - // c. Let u be the same result produced as if by performing the algorithm - // for String.prototype.toUpperCase using s as the this value. - // d. Assert: Type(u) is String. - icu::UnicodeString& u = s.toUpper(); - - // e. If u does not consist of a single code unit, return ch. - if (u.length() != 1) { - return ch; - } - - // f. Let cu be u's single code unit element. - UChar32 cu = u.char32At(0); - - // g. If the value of ch >= 128 and the value of cu < 128, return ch. - if (ch >= 128 && cu < 128) { - return ch; - } - - // h. Return cu. - return cu; - } -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_INTL_SUPPORT - -#endif // V8_REGEXP_SPECIAL_CASE_H_ diff --git a/js/src/new-regexp/util/flags.h b/js/src/new-regexp/util/flags.h deleted file mode 100644 index 1fa421fc0..000000000 --- a/js/src/new-regexp/util/flags.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2014 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_UTIL_FLAGS_H_ -#define V8_UTIL_FLAGS_H_ - -// Origin: https://github.com/v8/v8/blob/1bafcc6b999b23ea1d394f5d267a08183e3c4e19/src/base/flags.h#L15-L90 - -namespace v8 { -namespace base { - -// The Flags class provides a type-safe way of storing OR-combinations of enum -// values. The Flags class is a template class, where T is an enum type, -// and S is the underlying storage type (usually int). -// -// The traditional C++ approach for storing OR-combinations of enum values is to -// use an int or unsigned int variable. The inconvenience with this approach is -// that there's no type checking at all; any enum value can be OR'd with any -// other enum value and passed on to a function that takes an int or unsigned -// int. -template -class Flags final { - public: - using flag_type = T; - using mask_type = S; - - constexpr Flags() : mask_(0) {} - constexpr Flags(flag_type flag) - : mask_(static_cast(flag)) {} - constexpr explicit Flags(mask_type mask) : mask_(static_cast(mask)) {} - - constexpr bool operator==(flag_type flag) const { - return mask_ == static_cast(flag); - } - constexpr bool operator!=(flag_type flag) const { - return mask_ != static_cast(flag); - } - - Flags& operator&=(const Flags& flags) { - mask_ &= flags.mask_; - return *this; - } - Flags& operator|=(const Flags& flags) { - mask_ |= flags.mask_; - return *this; - } - Flags& operator^=(const Flags& flags) { - mask_ ^= flags.mask_; - return *this; - } - - constexpr Flags operator&(const Flags& flags) const { - return Flags(mask_ & flags.mask_); - } - constexpr Flags operator|(const Flags& flags) const { - return Flags(mask_ | flags.mask_); - } - constexpr Flags operator^(const Flags& flags) const { - return Flags(mask_ ^ flags.mask_); - } - - Flags& operator&=(flag_type flag) { return operator&=(Flags(flag)); } - Flags& operator|=(flag_type flag) { return operator|=(Flags(flag)); } - Flags& operator^=(flag_type flag) { return operator^=(Flags(flag)); } - - constexpr Flags operator&(flag_type flag) const { - return operator&(Flags(flag)); - } - constexpr Flags operator|(flag_type flag) const { - return operator|(Flags(flag)); - } - constexpr Flags operator^(flag_type flag) const { - return operator^(Flags(flag)); - } - - constexpr Flags operator~() const { return Flags(~mask_); } - - constexpr operator mask_type() const { return mask_; } - constexpr bool operator!() const { return !mask_; } - - Flags without(flag_type flag) { return *this & (~Flags(flag)); } - - friend size_t hash_value(const Flags& flags) { return flags.mask_; } - - private: - mask_type mask_; -}; - -} // namespace base -} // namespace v8 - -#endif // V8_UTIL_FLAG_H_ diff --git a/js/src/new-regexp/util/unicode.cc b/js/src/new-regexp/util/unicode.cc deleted file mode 100644 index ba9ea607c..000000000 --- a/js/src/new-regexp/util/unicode.cc +++ /dev/null @@ -1,1865 +0,0 @@ -// Copyright 2012 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -// -// This file is a subset of: -// https://github.com/v8/v8/blob/master/src/strings/unicode.cc - -#include "new-regexp/regexp-shim.h" - -#ifdef V8_INTL_SUPPORT -#include "unicode/uchar.h" -#endif - -namespace v8 { -namespace unibrow { - -#ifndef V8_INTL_SUPPORT -static const int kStartBit = (1 << 30); -static const int kChunkBits = (1 << 13); -#endif // !V8_INTL_SUPPORT - -static const uchar kSentinel = static_cast(-1); - -/** - * \file - * Implementations of functions for working with Unicode. - */ - -using int16_t = signed short; // NOLINT -using uint16_t = unsigned short; // NOLINT -using int32_t = int; // NOLINT - -#ifndef V8_INTL_SUPPORT -// All access to the character table should go through this function. -template -static inline uchar TableGet(const int32_t* table, int index) { - return table[D * index]; -} - -static inline uchar GetEntry(int32_t entry) { return entry & (kStartBit - 1); } - -static inline bool IsStart(int32_t entry) { return (entry & kStartBit) != 0; } - -/** - * Look up a character in the Unicode table using a mix of binary and - * interpolation search. For a uniformly distributed array - * interpolation search beats binary search by a wide margin. However, - * in this case interpolation search degenerates because of some very - * high values in the lower end of the table so this function uses a - * combination. The average number of steps to look up the information - * about a character is around 10, slightly higher if there is no - * information available about the character. - */ -static bool LookupPredicate(const int32_t* table, uint16_t size, uchar chr) { - static const int kEntryDist = 1; - uint16_t value = chr & (kChunkBits - 1); - unsigned int low = 0; - unsigned int high = size - 1; - while (high != low) { - unsigned int mid = low + ((high - low) >> 1); - uchar current_value = GetEntry(TableGet(table, mid)); - // If we've found an entry less than or equal to this one, and the - // next one is not also less than this one, we've arrived. - if ((current_value <= value) && - (mid + 1 == size || - GetEntry(TableGet(table, mid + 1)) > value)) { - low = mid; - break; - } else if (current_value < value) { - low = mid + 1; - } else if (current_value > value) { - // If we've just checked the bottom-most value and it's not - // the one we're looking for, we're done. - if (mid == 0) break; - high = mid - 1; - } - } - int32_t field = TableGet(table, low); - uchar entry = GetEntry(field); - bool is_start = IsStart(field); - return (entry == value) || (entry < value && is_start); -} -#endif // !V8_INTL_SUPPORT - -template -struct MultiCharacterSpecialCase { - static const uchar kEndOfEncoding = kSentinel; - uchar chars[kW]; -}; - -#ifndef V8_INTL_SUPPORT -// Look up the mapping for the given character in the specified table, -// which is of the specified length and uses the specified special case -// mapping for multi-char mappings. The next parameter is the character -// following the one to map. The result will be written in to the result -// buffer and the number of characters written will be returned. Finally, -// if the allow_caching_ptr is non-null then false will be stored in -// it if the result contains multiple characters or depends on the -// context. -// If ranges are linear, a match between a start and end point is -// offset by the distance between the match and the start. Otherwise -// the result is the same as for the start point on the entire range. -template -static int LookupMapping(const int32_t* table, uint16_t size, - const MultiCharacterSpecialCase* multi_chars, - uchar chr, uchar next, uchar* result, - bool* allow_caching_ptr) { - static const int kEntryDist = 2; - uint16_t key = chr & (kChunkBits - 1); - uint16_t chunk_start = chr - key; - unsigned int low = 0; - unsigned int high = size - 1; - while (high != low) { - unsigned int mid = low + ((high - low) >> 1); - uchar current_value = GetEntry(TableGet(table, mid)); - // If we've found an entry less than or equal to this one, and the next one - // is not also less than this one, we've arrived. - if ((current_value <= key) && - (mid + 1 == size || - GetEntry(TableGet(table, mid + 1)) > key)) { - low = mid; - break; - } else if (current_value < key) { - low = mid + 1; - } else if (current_value > key) { - // If we've just checked the bottom-most value and it's not - // the one we're looking for, we're done. - if (mid == 0) break; - high = mid - 1; - } - } - int32_t field = TableGet(table, low); - uchar entry = GetEntry(field); - bool is_start = IsStart(field); - bool found = (entry == key) || (entry < key && is_start); - if (found) { - int32_t value = table[2 * low + 1]; - if (value == 0) { - // 0 means not present - return 0; - } else if ((value & 3) == 0) { - // Low bits 0 means a constant offset from the given character. - if (ranges_are_linear) { - result[0] = chr + (value >> 2); - } else { - result[0] = entry + chunk_start + (value >> 2); - } - return 1; - } else if ((value & 3) == 1) { - // Low bits 1 means a special case mapping - if (allow_caching_ptr) *allow_caching_ptr = false; - const MultiCharacterSpecialCase& mapping = multi_chars[value >> 2]; - int length = 0; - for (length = 0; length < kW; length++) { - uchar mapped = mapping.chars[length]; - if (mapped == MultiCharacterSpecialCase::kEndOfEncoding) break; - if (ranges_are_linear) { - result[length] = mapped + (key - entry); - } else { - result[length] = mapped; - } - } - return length; - } else { - // Low bits 2 means a really really special case - if (allow_caching_ptr) *allow_caching_ptr = false; - // The cases of this switch are defined in unicode.py in the - // really_special_cases mapping. - switch (value >> 2) { - case 1: - // Really special case 1: upper case sigma. This letter - // converts to two different lower case sigmas depending on - // whether or not it occurs at the end of a word. - if (next != 0 && Letter::Is(next)) { - result[0] = 0x03C3; - } else { - result[0] = 0x03C2; - } - return 1; - default: - return 0; - } - return -1; - } - } else { - return 0; - } -} -#endif // !V8_INTL_SUPPORT - -// Letter: point.category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'] -#ifdef V8_INTL_SUPPORT -bool Letter::Is(uchar c) { return static_cast(u_isalpha(c)); } -#else -static const uint16_t kLetterTable0Size = 431; -static const int32_t kLetterTable0[431] = { - 1073741889, 90, 1073741921, 122, - 170, 181, 186, 1073742016, // NOLINT - 214, 1073742040, 246, 1073742072, - 705, 1073742534, 721, 1073742560, // NOLINT - 740, 748, 750, 1073742704, - 884, 1073742710, 887, 1073742714, // NOLINT - 893, 895, 902, 1073742728, - 906, 908, 1073742734, 929, // NOLINT - 1073742755, 1013, 1073742839, 1153, - 1073742986, 1327, 1073743153, 1366, // NOLINT - 1369, 1073743201, 1415, 1073743312, - 1514, 1073743344, 1522, 1073743392, // NOLINT - 1610, 1073743470, 1647, 1073743473, - 1747, 1749, 1073743589, 1766, // NOLINT - 1073743598, 1775, 1073743610, 1788, - 1791, 1808, 1073743634, 1839, // NOLINT - 1073743693, 1957, 1969, 1073743818, - 2026, 1073743860, 2037, 2042, // NOLINT - 1073743872, 2069, 2074, 2084, - 2088, 1073743936, 2136, 1073744032, // NOLINT - 2226, 1073744132, 2361, 2365, - 2384, 1073744216, 2401, 1073744241, // NOLINT - 2432, 1073744261, 2444, 1073744271, - 2448, 1073744275, 2472, 1073744298, // NOLINT - 2480, 2482, 1073744310, 2489, - 2493, 2510, 1073744348, 2525, // NOLINT - 1073744351, 2529, 1073744368, 2545, - 1073744389, 2570, 1073744399, 2576, // NOLINT - 1073744403, 2600, 1073744426, 2608, - 1073744434, 2611, 1073744437, 2614, // NOLINT - 1073744440, 2617, 1073744473, 2652, - 2654, 1073744498, 2676, 1073744517, // NOLINT - 2701, 1073744527, 2705, 1073744531, - 2728, 1073744554, 2736, 1073744562, // NOLINT - 2739, 1073744565, 2745, 2749, - 2768, 1073744608, 2785, 1073744645, // NOLINT - 2828, 1073744655, 2832, 1073744659, - 2856, 1073744682, 2864, 1073744690, // NOLINT - 2867, 1073744693, 2873, 2877, - 1073744732, 2909, 1073744735, 2913, // NOLINT - 2929, 2947, 1073744773, 2954, - 1073744782, 2960, 1073744786, 2965, // NOLINT - 1073744793, 2970, 2972, 1073744798, - 2975, 1073744803, 2980, 1073744808, // NOLINT - 2986, 1073744814, 3001, 3024, - 1073744901, 3084, 1073744910, 3088, // NOLINT - 1073744914, 3112, 1073744938, 3129, - 3133, 1073744984, 3161, 1073744992, // NOLINT - 3169, 1073745029, 3212, 1073745038, - 3216, 1073745042, 3240, 1073745066, // NOLINT - 3251, 1073745077, 3257, 3261, - 3294, 1073745120, 3297, 1073745137, // NOLINT - 3314, 1073745157, 3340, 1073745166, - 3344, 1073745170, 3386, 3389, // NOLINT - 3406, 1073745248, 3425, 1073745274, - 3455, 1073745285, 3478, 1073745306, // NOLINT - 3505, 1073745331, 3515, 3517, - 1073745344, 3526, 1073745409, 3632, // NOLINT - 1073745458, 3635, 1073745472, 3654, - 1073745537, 3714, 3716, 1073745543, // NOLINT - 3720, 3722, 3725, 1073745556, - 3735, 1073745561, 3743, 1073745569, // NOLINT - 3747, 3749, 3751, 1073745578, - 3755, 1073745581, 3760, 1073745586, // NOLINT - 3763, 3773, 1073745600, 3780, - 3782, 1073745628, 3807, 3840, // NOLINT - 1073745728, 3911, 1073745737, 3948, - 1073745800, 3980, 1073745920, 4138, // NOLINT - 4159, 1073746000, 4181, 1073746010, - 4189, 4193, 1073746021, 4198, // NOLINT - 1073746030, 4208, 1073746037, 4225, - 4238, 1073746080, 4293, 4295, // NOLINT - 4301, 1073746128, 4346, 1073746172, - 4680, 1073746506, 4685, 1073746512, // NOLINT - 4694, 4696, 1073746522, 4701, - 1073746528, 4744, 1073746570, 4749, // NOLINT - 1073746576, 4784, 1073746610, 4789, - 1073746616, 4798, 4800, 1073746626, // NOLINT - 4805, 1073746632, 4822, 1073746648, - 4880, 1073746706, 4885, 1073746712, // NOLINT - 4954, 1073746816, 5007, 1073746848, - 5108, 1073746945, 5740, 1073747567, // NOLINT - 5759, 1073747585, 5786, 1073747616, - 5866, 1073747694, 5880, 1073747712, // NOLINT - 5900, 1073747726, 5905, 1073747744, - 5937, 1073747776, 5969, 1073747808, // NOLINT - 5996, 1073747822, 6000, 1073747840, - 6067, 6103, 6108, 1073748000, // NOLINT - 6263, 1073748096, 6312, 6314, - 1073748144, 6389, 1073748224, 6430, // NOLINT - 1073748304, 6509, 1073748336, 6516, - 1073748352, 6571, 1073748417, 6599, // NOLINT - 1073748480, 6678, 1073748512, 6740, - 6823, 1073748741, 6963, 1073748805, // NOLINT - 6987, 1073748867, 7072, 1073748910, - 7087, 1073748922, 7141, 1073748992, // NOLINT - 7203, 1073749069, 7247, 1073749082, - 7293, 1073749225, 7404, 1073749230, // NOLINT - 7409, 1073749237, 7414, 1073749248, - 7615, 1073749504, 7957, 1073749784, // NOLINT - 7965, 1073749792, 8005, 1073749832, - 8013, 1073749840, 8023, 8025, // NOLINT - 8027, 8029, 1073749855, 8061, - 1073749888, 8116, 1073749942, 8124, // NOLINT - 8126, 1073749954, 8132, 1073749958, - 8140, 1073749968, 8147, 1073749974, // NOLINT - 8155, 1073749984, 8172, 1073750002, - 8180, 1073750006, 8188}; // NOLINT -static const uint16_t kLetterTable1Size = 87; -static const int32_t kLetterTable1[87] = { - 113, 127, 1073741968, 156, - 258, 263, 1073742090, 275, // NOLINT - 277, 1073742105, 285, 292, - 294, 296, 1073742122, 301, // NOLINT - 1073742127, 313, 1073742140, 319, - 1073742149, 329, 334, 1073742176, // NOLINT - 392, 1073744896, 3118, 1073744944, - 3166, 1073744992, 3300, 1073745131, // NOLINT - 3310, 1073745138, 3315, 1073745152, - 3365, 3367, 3373, 1073745200, // NOLINT - 3431, 3439, 1073745280, 3478, - 1073745312, 3494, 1073745320, 3502, // NOLINT - 1073745328, 3510, 1073745336, 3518, - 1073745344, 3526, 1073745352, 3534, // NOLINT - 1073745360, 3542, 1073745368, 3550, - 3631, 1073745925, 4103, 1073745953, // NOLINT - 4137, 1073745969, 4149, 1073745976, - 4156, 1073745985, 4246, 1073746077, // NOLINT - 4255, 1073746081, 4346, 1073746172, - 4351, 1073746181, 4397, 1073746225, // NOLINT - 4494, 1073746336, 4538, 1073746416, - 4607, 1073746944, 8191}; // NOLINT -static const uint16_t kLetterTable2Size = 4; -static const int32_t kLetterTable2[4] = {1073741824, 3509, 1073745408, - 8191}; // NOLINT -static const uint16_t kLetterTable3Size = 2; -static const int32_t kLetterTable3[2] = {1073741824, 8191}; // NOLINT -static const uint16_t kLetterTable4Size = 2; -static const int32_t kLetterTable4[2] = {1073741824, 8140}; // NOLINT -static const uint16_t kLetterTable5Size = 100; -static const int32_t kLetterTable5[100] = { - 1073741824, 1164, 1073743056, 1277, - 1073743104, 1548, 1073743376, 1567, // NOLINT - 1073743402, 1579, 1073743424, 1646, - 1073743487, 1693, 1073743520, 1775, // NOLINT - 1073743639, 1823, 1073743650, 1928, - 1073743755, 1934, 1073743760, 1965, // NOLINT - 1073743792, 1969, 1073743863, 2049, - 1073743875, 2053, 1073743879, 2058, // NOLINT - 1073743884, 2082, 1073743936, 2163, - 1073744002, 2227, 1073744114, 2295, // NOLINT - 2299, 1073744138, 2341, 1073744176, - 2374, 1073744224, 2428, 1073744260, // NOLINT - 2482, 2511, 1073744352, 2532, - 1073744358, 2543, 1073744378, 2558, // NOLINT - 1073744384, 2600, 1073744448, 2626, - 1073744452, 2635, 1073744480, 2678, // NOLINT - 2682, 1073744510, 2735, 2737, - 1073744565, 2742, 1073744569, 2749, // NOLINT - 2752, 2754, 1073744603, 2781, - 1073744608, 2794, 1073744626, 2804, // NOLINT - 1073744641, 2822, 1073744649, 2830, - 1073744657, 2838, 1073744672, 2854, // NOLINT - 1073744680, 2862, 1073744688, 2906, - 1073744732, 2911, 1073744740, 2917, // NOLINT - 1073744832, 3042, 1073744896, 8191}; // NOLINT -static const uint16_t kLetterTable6Size = 6; -static const int32_t kLetterTable6[6] = {1073741824, 6051, 1073747888, 6086, - 1073747915, 6139}; // NOLINT -static const uint16_t kLetterTable7Size = 48; -static const int32_t kLetterTable7[48] = { - 1073748224, 6765, 1073748592, 6873, - 1073748736, 6918, 1073748755, 6935, // NOLINT - 6941, 1073748767, 6952, 1073748778, - 6966, 1073748792, 6972, 6974, // NOLINT - 1073748800, 6977, 1073748803, 6980, - 1073748806, 7089, 1073748947, 7485, // NOLINT - 1073749328, 7567, 1073749394, 7623, - 1073749488, 7675, 1073749616, 7796, // NOLINT - 1073749622, 7932, 1073749793, 7994, - 1073749825, 8026, 1073749862, 8126, // NOLINT - 1073749954, 8135, 1073749962, 8143, - 1073749970, 8151, 1073749978, 8156}; // NOLINT -bool Letter::Is(uchar c) { - int chunk_index = c >> 13; - switch (chunk_index) { - case 0: - return LookupPredicate(kLetterTable0, kLetterTable0Size, c); - case 1: - return LookupPredicate(kLetterTable1, kLetterTable1Size, c); - case 2: - return LookupPredicate(kLetterTable2, kLetterTable2Size, c); - case 3: - return LookupPredicate(kLetterTable3, kLetterTable3Size, c); - case 4: - return LookupPredicate(kLetterTable4, kLetterTable4Size, c); - case 5: - return LookupPredicate(kLetterTable5, kLetterTable5Size, c); - case 6: - return LookupPredicate(kLetterTable6, kLetterTable6Size, c); - case 7: - return LookupPredicate(kLetterTable7, kLetterTable7Size, c); - default: - return false; - } -} -#endif - -#ifndef V8_INTL_SUPPORT - -static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings0[1] = - { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262CanonicalizeTable0Size = 498; // NOLINT -static const int32_t kEcma262CanonicalizeTable0[996] = { - 1073741921, -128, 122, -128, 181, 2972, - 1073742048, -128, 246, -128, 1073742072, -128, - 254, -128, 255, 484, // NOLINT - 257, -4, 259, -4, 261, -4, - 263, -4, 265, -4, 267, -4, - 269, -4, 271, -4, // NOLINT - 273, -4, 275, -4, 277, -4, - 279, -4, 281, -4, 283, -4, - 285, -4, 287, -4, // NOLINT - 289, -4, 291, -4, 293, -4, - 295, -4, 297, -4, 299, -4, - 301, -4, 303, -4, // NOLINT - 307, -4, 309, -4, 311, -4, - 314, -4, 316, -4, 318, -4, - 320, -4, 322, -4, // NOLINT - 324, -4, 326, -4, 328, -4, - 331, -4, 333, -4, 335, -4, - 337, -4, 339, -4, // NOLINT - 341, -4, 343, -4, 345, -4, - 347, -4, 349, -4, 351, -4, - 353, -4, 355, -4, // NOLINT - 357, -4, 359, -4, 361, -4, - 363, -4, 365, -4, 367, -4, - 369, -4, 371, -4, // NOLINT - 373, -4, 375, -4, 378, -4, - 380, -4, 382, -4, 384, 780, - 387, -4, 389, -4, // NOLINT - 392, -4, 396, -4, 402, -4, - 405, 388, 409, -4, 410, 652, - 414, 520, 417, -4, // NOLINT - 419, -4, 421, -4, 424, -4, - 429, -4, 432, -4, 436, -4, - 438, -4, 441, -4, // NOLINT - 445, -4, 447, 224, 453, -4, - 454, -8, 456, -4, 457, -8, - 459, -4, 460, -8, // NOLINT - 462, -4, 464, -4, 466, -4, - 468, -4, 470, -4, 472, -4, - 474, -4, 476, -4, // NOLINT - 477, -316, 479, -4, 481, -4, - 483, -4, 485, -4, 487, -4, - 489, -4, 491, -4, // NOLINT - 493, -4, 495, -4, 498, -4, - 499, -8, 501, -4, 505, -4, - 507, -4, 509, -4, // NOLINT - 511, -4, 513, -4, 515, -4, - 517, -4, 519, -4, 521, -4, - 523, -4, 525, -4, // NOLINT - 527, -4, 529, -4, 531, -4, - 533, -4, 535, -4, 537, -4, - 539, -4, 541, -4, // NOLINT - 543, -4, 547, -4, 549, -4, - 551, -4, 553, -4, 555, -4, - 557, -4, 559, -4, // NOLINT - 561, -4, 563, -4, 572, -4, - 1073742399, 43260, 576, 43260, 578, -4, - 583, -4, 585, -4, // NOLINT - 587, -4, 589, -4, 591, -4, - 592, 43132, 593, 43120, 594, 43128, - 595, -840, 596, -824, // NOLINT - 1073742422, -820, 599, -820, 601, -808, - 603, -812, 604, 169276, 608, -820, - 609, 169260, 611, -828, // NOLINT - 613, 169120, 614, 169232, 616, -836, - 617, -844, 619, 42972, 620, 169220, - 623, -844, 625, 42996, // NOLINT - 626, -852, 629, -856, 637, 42908, - 640, -872, 643, -872, 647, 169128, - 648, -872, 649, -276, // NOLINT - 1073742474, -868, 651, -868, 652, -284, - 658, -876, 670, 169032, 837, 336, - 881, -4, 883, -4, // NOLINT - 887, -4, 1073742715, 520, 893, 520, - 940, -152, 1073742765, -148, 943, -148, - 1073742769, -128, 961, -128, // NOLINT - 962, -124, 1073742787, -128, 971, -128, - 972, -256, 1073742797, -252, 974, -252, - 976, -248, 977, -228, // NOLINT - 981, -188, 982, -216, 983, -32, - 985, -4, 987, -4, 989, -4, - 991, -4, 993, -4, // NOLINT - 995, -4, 997, -4, 999, -4, - 1001, -4, 1003, -4, 1005, -4, - 1007, -4, 1008, -344, // NOLINT - 1009, -320, 1010, 28, 1011, -464, - 1013, -384, 1016, -4, 1019, -4, - 1073742896, -128, 1103, -128, // NOLINT - 1073742928, -320, 1119, -320, 1121, -4, - 1123, -4, 1125, -4, 1127, -4, - 1129, -4, 1131, -4, // NOLINT - 1133, -4, 1135, -4, 1137, -4, - 1139, -4, 1141, -4, 1143, -4, - 1145, -4, 1147, -4, // NOLINT - 1149, -4, 1151, -4, 1153, -4, - 1163, -4, 1165, -4, 1167, -4, - 1169, -4, 1171, -4, // NOLINT - 1173, -4, 1175, -4, 1177, -4, - 1179, -4, 1181, -4, 1183, -4, - 1185, -4, 1187, -4, // NOLINT - 1189, -4, 1191, -4, 1193, -4, - 1195, -4, 1197, -4, 1199, -4, - 1201, -4, 1203, -4, // NOLINT - 1205, -4, 1207, -4, 1209, -4, - 1211, -4, 1213, -4, 1215, -4, - 1218, -4, 1220, -4, // NOLINT - 1222, -4, 1224, -4, 1226, -4, - 1228, -4, 1230, -4, 1231, -60, - 1233, -4, 1235, -4, // NOLINT - 1237, -4, 1239, -4, 1241, -4, - 1243, -4, 1245, -4, 1247, -4, - 1249, -4, 1251, -4, // NOLINT - 1253, -4, 1255, -4, 1257, -4, - 1259, -4, 1261, -4, 1263, -4, - 1265, -4, 1267, -4, // NOLINT - 1269, -4, 1271, -4, 1273, -4, - 1275, -4, 1277, -4, 1279, -4, - 1281, -4, 1283, -4, // NOLINT - 1285, -4, 1287, -4, 1289, -4, - 1291, -4, 1293, -4, 1295, -4, - 1297, -4, 1299, -4, // NOLINT - 1301, -4, 1303, -4, 1305, -4, - 1307, -4, 1309, -4, 1311, -4, - 1313, -4, 1315, -4, // NOLINT - 1317, -4, 1319, -4, 1321, -4, - 1323, -4, 1325, -4, 1327, -4, - 1073743201, -192, 1414, -192, // NOLINT - 7545, 141328, 7549, 15256, 7681, -4, - 7683, -4, 7685, -4, 7687, -4, - 7689, -4, 7691, -4, // NOLINT - 7693, -4, 7695, -4, 7697, -4, - 7699, -4, 7701, -4, 7703, -4, - 7705, -4, 7707, -4, // NOLINT - 7709, -4, 7711, -4, 7713, -4, - 7715, -4, 7717, -4, 7719, -4, - 7721, -4, 7723, -4, // NOLINT - 7725, -4, 7727, -4, 7729, -4, - 7731, -4, 7733, -4, 7735, -4, - 7737, -4, 7739, -4, // NOLINT - 7741, -4, 7743, -4, 7745, -4, - 7747, -4, 7749, -4, 7751, -4, - 7753, -4, 7755, -4, // NOLINT - 7757, -4, 7759, -4, 7761, -4, - 7763, -4, 7765, -4, 7767, -4, - 7769, -4, 7771, -4, // NOLINT - 7773, -4, 7775, -4, 7777, -4, - 7779, -4, 7781, -4, 7783, -4, - 7785, -4, 7787, -4, // NOLINT - 7789, -4, 7791, -4, 7793, -4, - 7795, -4, 7797, -4, 7799, -4, - 7801, -4, 7803, -4, // NOLINT - 7805, -4, 7807, -4, 7809, -4, - 7811, -4, 7813, -4, 7815, -4, - 7817, -4, 7819, -4, // NOLINT - 7821, -4, 7823, -4, 7825, -4, - 7827, -4, 7829, -4, 7835, -236, - 7841, -4, 7843, -4, // NOLINT - 7845, -4, 7847, -4, 7849, -4, - 7851, -4, 7853, -4, 7855, -4, - 7857, -4, 7859, -4, // NOLINT - 7861, -4, 7863, -4, 7865, -4, - 7867, -4, 7869, -4, 7871, -4, - 7873, -4, 7875, -4, // NOLINT - 7877, -4, 7879, -4, 7881, -4, - 7883, -4, 7885, -4, 7887, -4, - 7889, -4, 7891, -4, // NOLINT - 7893, -4, 7895, -4, 7897, -4, - 7899, -4, 7901, -4, 7903, -4, - 7905, -4, 7907, -4, // NOLINT - 7909, -4, 7911, -4, 7913, -4, - 7915, -4, 7917, -4, 7919, -4, - 7921, -4, 7923, -4, // NOLINT - 7925, -4, 7927, -4, 7929, -4, - 7931, -4, 7933, -4, 7935, -4, - 1073749760, 32, 7943, 32, // NOLINT - 1073749776, 32, 7957, 32, 1073749792, 32, - 7975, 32, 1073749808, 32, 7991, 32, - 1073749824, 32, 8005, 32, // NOLINT - 8017, 32, 8019, 32, 8021, 32, - 8023, 32, 1073749856, 32, 8039, 32, - 1073749872, 296, 8049, 296, // NOLINT - 1073749874, 344, 8053, 344, 1073749878, 400, - 8055, 400, 1073749880, 512, 8057, 512, - 1073749882, 448, 8059, 448, // NOLINT - 1073749884, 504, 8061, 504, 1073749936, 32, - 8113, 32, 8126, -28820, 1073749968, 32, - 8145, 32, 1073749984, 32, // NOLINT - 8161, 32, 8165, 28}; // NOLINT -static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings1[1] = - { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262CanonicalizeTable1Size = 73; // NOLINT -static const int32_t kEcma262CanonicalizeTable1[146] = { - 334, -112, 1073742192, -64, 383, -64, - 388, -4, 1073743056, -104, 1257, -104, - 1073744944, -192, 3166, -192, // NOLINT - 3169, -4, 3173, -43180, 3174, -43168, - 3176, -4, 3178, -4, 3180, -4, - 3187, -4, 3190, -4, // NOLINT - 3201, -4, 3203, -4, 3205, -4, - 3207, -4, 3209, -4, 3211, -4, - 3213, -4, 3215, -4, // NOLINT - 3217, -4, 3219, -4, 3221, -4, - 3223, -4, 3225, -4, 3227, -4, - 3229, -4, 3231, -4, // NOLINT - 3233, -4, 3235, -4, 3237, -4, - 3239, -4, 3241, -4, 3243, -4, - 3245, -4, 3247, -4, // NOLINT - 3249, -4, 3251, -4, 3253, -4, - 3255, -4, 3257, -4, 3259, -4, - 3261, -4, 3263, -4, // NOLINT - 3265, -4, 3267, -4, 3269, -4, - 3271, -4, 3273, -4, 3275, -4, - 3277, -4, 3279, -4, // NOLINT - 3281, -4, 3283, -4, 3285, -4, - 3287, -4, 3289, -4, 3291, -4, - 3293, -4, 3295, -4, // NOLINT - 3297, -4, 3299, -4, 3308, -4, - 3310, -4, 3315, -4, 1073745152, -29056, - 3365, -29056, 3367, -29056, // NOLINT - 3373, -29056}; // NOLINT -static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings5[1] = - { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262CanonicalizeTable5Size = 95; // NOLINT -static const int32_t kEcma262CanonicalizeTable5[190] = { - 1601, -4, 1603, -4, 1605, -4, 1607, -4, - 1609, -4, 1611, -4, 1613, -4, 1615, -4, // NOLINT - 1617, -4, 1619, -4, 1621, -4, 1623, -4, - 1625, -4, 1627, -4, 1629, -4, 1631, -4, // NOLINT - 1633, -4, 1635, -4, 1637, -4, 1639, -4, - 1641, -4, 1643, -4, 1645, -4, 1665, -4, // NOLINT - 1667, -4, 1669, -4, 1671, -4, 1673, -4, - 1675, -4, 1677, -4, 1679, -4, 1681, -4, // NOLINT - 1683, -4, 1685, -4, 1687, -4, 1689, -4, - 1691, -4, 1827, -4, 1829, -4, 1831, -4, // NOLINT - 1833, -4, 1835, -4, 1837, -4, 1839, -4, - 1843, -4, 1845, -4, 1847, -4, 1849, -4, // NOLINT - 1851, -4, 1853, -4, 1855, -4, 1857, -4, - 1859, -4, 1861, -4, 1863, -4, 1865, -4, // NOLINT - 1867, -4, 1869, -4, 1871, -4, 1873, -4, - 1875, -4, 1877, -4, 1879, -4, 1881, -4, // NOLINT - 1883, -4, 1885, -4, 1887, -4, 1889, -4, - 1891, -4, 1893, -4, 1895, -4, 1897, -4, // NOLINT - 1899, -4, 1901, -4, 1903, -4, 1914, -4, - 1916, -4, 1919, -4, 1921, -4, 1923, -4, // NOLINT - 1925, -4, 1927, -4, 1932, -4, 1937, -4, - 1939, -4, 1943, -4, 1945, -4, 1947, -4, // NOLINT - 1949, -4, 1951, -4, 1953, -4, 1955, -4, - 1957, -4, 1959, -4, 1961, -4}; // NOLINT -static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings7[1] = - { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262CanonicalizeTable7Size = 2; // NOLINT -static const int32_t kEcma262CanonicalizeTable7[4] = {1073749825, -128, 8026, - -128}; // NOLINT -int Ecma262Canonicalize::Convert(uchar c, uchar n, uchar* result, - bool* allow_caching_ptr) { - int chunk_index = c >> 13; - switch (chunk_index) { - case 0: - return LookupMapping( - kEcma262CanonicalizeTable0, kEcma262CanonicalizeTable0Size, - kEcma262CanonicalizeMultiStrings0, c, n, result, allow_caching_ptr); - case 1: - return LookupMapping( - kEcma262CanonicalizeTable1, kEcma262CanonicalizeTable1Size, - kEcma262CanonicalizeMultiStrings1, c, n, result, allow_caching_ptr); - case 5: - return LookupMapping( - kEcma262CanonicalizeTable5, kEcma262CanonicalizeTable5Size, - kEcma262CanonicalizeMultiStrings5, c, n, result, allow_caching_ptr); - case 7: - return LookupMapping( - kEcma262CanonicalizeTable7, kEcma262CanonicalizeTable7Size, - kEcma262CanonicalizeMultiStrings7, c, n, result, allow_caching_ptr); - default: - return 0; - } -} - -static const MultiCharacterSpecialCase<4> - kEcma262UnCanonicalizeMultiStrings0[507] = { // NOLINT - {{65, 97, kSentinel}}, - {{90, 122, kSentinel}}, - {{181, 924, 956, kSentinel}}, - {{192, 224, kSentinel}}, // NOLINT - {{214, 246, kSentinel}}, - {{216, 248, kSentinel}}, - {{222, 254, kSentinel}}, - {{255, 376, kSentinel}}, // NOLINT - {{256, 257, kSentinel}}, - {{258, 259, kSentinel}}, - {{260, 261, kSentinel}}, - {{262, 263, kSentinel}}, // NOLINT - {{264, 265, kSentinel}}, - {{266, 267, kSentinel}}, - {{268, 269, kSentinel}}, - {{270, 271, kSentinel}}, // NOLINT - {{272, 273, kSentinel}}, - {{274, 275, kSentinel}}, - {{276, 277, kSentinel}}, - {{278, 279, kSentinel}}, // NOLINT - {{280, 281, kSentinel}}, - {{282, 283, kSentinel}}, - {{284, 285, kSentinel}}, - {{286, 287, kSentinel}}, // NOLINT - {{288, 289, kSentinel}}, - {{290, 291, kSentinel}}, - {{292, 293, kSentinel}}, - {{294, 295, kSentinel}}, // NOLINT - {{296, 297, kSentinel}}, - {{298, 299, kSentinel}}, - {{300, 301, kSentinel}}, - {{302, 303, kSentinel}}, // NOLINT - {{306, 307, kSentinel}}, - {{308, 309, kSentinel}}, - {{310, 311, kSentinel}}, - {{313, 314, kSentinel}}, // NOLINT - {{315, 316, kSentinel}}, - {{317, 318, kSentinel}}, - {{319, 320, kSentinel}}, - {{321, 322, kSentinel}}, // NOLINT - {{323, 324, kSentinel}}, - {{325, 326, kSentinel}}, - {{327, 328, kSentinel}}, - {{330, 331, kSentinel}}, // NOLINT - {{332, 333, kSentinel}}, - {{334, 335, kSentinel}}, - {{336, 337, kSentinel}}, - {{338, 339, kSentinel}}, // NOLINT - {{340, 341, kSentinel}}, - {{342, 343, kSentinel}}, - {{344, 345, kSentinel}}, - {{346, 347, kSentinel}}, // NOLINT - {{348, 349, kSentinel}}, - {{350, 351, kSentinel}}, - {{352, 353, kSentinel}}, - {{354, 355, kSentinel}}, // NOLINT - {{356, 357, kSentinel}}, - {{358, 359, kSentinel}}, - {{360, 361, kSentinel}}, - {{362, 363, kSentinel}}, // NOLINT - {{364, 365, kSentinel}}, - {{366, 367, kSentinel}}, - {{368, 369, kSentinel}}, - {{370, 371, kSentinel}}, // NOLINT - {{372, 373, kSentinel}}, - {{374, 375, kSentinel}}, - {{377, 378, kSentinel}}, - {{379, 380, kSentinel}}, // NOLINT - {{381, 382, kSentinel}}, - {{384, 579, kSentinel}}, - {{385, 595, kSentinel}}, - {{386, 387, kSentinel}}, // NOLINT - {{388, 389, kSentinel}}, - {{390, 596, kSentinel}}, - {{391, 392, kSentinel}}, - {{393, 598, kSentinel}}, // NOLINT - {{394, 599, kSentinel}}, - {{395, 396, kSentinel}}, - {{398, 477, kSentinel}}, - {{399, 601, kSentinel}}, // NOLINT - {{400, 603, kSentinel}}, - {{401, 402, kSentinel}}, - {{403, 608, kSentinel}}, - {{404, 611, kSentinel}}, // NOLINT - {{405, 502, kSentinel}}, - {{406, 617, kSentinel}}, - {{407, 616, kSentinel}}, - {{408, 409, kSentinel}}, // NOLINT - {{410, 573, kSentinel}}, - {{412, 623, kSentinel}}, - {{413, 626, kSentinel}}, - {{414, 544, kSentinel}}, // NOLINT - {{415, 629, kSentinel}}, - {{416, 417, kSentinel}}, - {{418, 419, kSentinel}}, - {{420, 421, kSentinel}}, // NOLINT - {{422, 640, kSentinel}}, - {{423, 424, kSentinel}}, - {{425, 643, kSentinel}}, - {{428, 429, kSentinel}}, // NOLINT - {{430, 648, kSentinel}}, - {{431, 432, kSentinel}}, - {{433, 650, kSentinel}}, - {{434, 651, kSentinel}}, // NOLINT - {{435, 436, kSentinel}}, - {{437, 438, kSentinel}}, - {{439, 658, kSentinel}}, - {{440, 441, kSentinel}}, // NOLINT - {{444, 445, kSentinel}}, - {{447, 503, kSentinel}}, - {{452, 453, 454, kSentinel}}, - {{455, 456, 457, kSentinel}}, // NOLINT - {{458, 459, 460, kSentinel}}, - {{461, 462, kSentinel}}, - {{463, 464, kSentinel}}, - {{465, 466, kSentinel}}, // NOLINT - {{467, 468, kSentinel}}, - {{469, 470, kSentinel}}, - {{471, 472, kSentinel}}, - {{473, 474, kSentinel}}, // NOLINT - {{475, 476, kSentinel}}, - {{478, 479, kSentinel}}, - {{480, 481, kSentinel}}, - {{482, 483, kSentinel}}, // NOLINT - {{484, 485, kSentinel}}, - {{486, 487, kSentinel}}, - {{488, 489, kSentinel}}, - {{490, 491, kSentinel}}, // NOLINT - {{492, 493, kSentinel}}, - {{494, 495, kSentinel}}, - {{497, 498, 499, kSentinel}}, - {{500, 501, kSentinel}}, // NOLINT - {{504, 505, kSentinel}}, - {{506, 507, kSentinel}}, - {{508, 509, kSentinel}}, - {{510, 511, kSentinel}}, // NOLINT - {{512, 513, kSentinel}}, - {{514, 515, kSentinel}}, - {{516, 517, kSentinel}}, - {{518, 519, kSentinel}}, // NOLINT - {{520, 521, kSentinel}}, - {{522, 523, kSentinel}}, - {{524, 525, kSentinel}}, - {{526, 527, kSentinel}}, // NOLINT - {{528, 529, kSentinel}}, - {{530, 531, kSentinel}}, - {{532, 533, kSentinel}}, - {{534, 535, kSentinel}}, // NOLINT - {{536, 537, kSentinel}}, - {{538, 539, kSentinel}}, - {{540, 541, kSentinel}}, - {{542, 543, kSentinel}}, // NOLINT - {{546, 547, kSentinel}}, - {{548, 549, kSentinel}}, - {{550, 551, kSentinel}}, - {{552, 553, kSentinel}}, // NOLINT - {{554, 555, kSentinel}}, - {{556, 557, kSentinel}}, - {{558, 559, kSentinel}}, - {{560, 561, kSentinel}}, // NOLINT - {{562, 563, kSentinel}}, - {{570, 11365, kSentinel}}, - {{571, 572, kSentinel}}, - {{574, 11366, kSentinel}}, // NOLINT - {{575, 11390, kSentinel}}, - {{576, 11391, kSentinel}}, - {{577, 578, kSentinel}}, - {{580, 649, kSentinel}}, // NOLINT - {{581, 652, kSentinel}}, - {{582, 583, kSentinel}}, - {{584, 585, kSentinel}}, - {{586, 587, kSentinel}}, // NOLINT - {{588, 589, kSentinel}}, - {{590, 591, kSentinel}}, - {{592, 11375, kSentinel}}, - {{593, 11373, kSentinel}}, // NOLINT - {{594, 11376, kSentinel}}, - {{604, 42923, kSentinel}}, - {{609, 42924, kSentinel}}, - {{613, 42893, kSentinel}}, // NOLINT - {{614, 42922, kSentinel}}, - {{619, 11362, kSentinel}}, - {{620, 42925, kSentinel}}, - {{625, 11374, kSentinel}}, // NOLINT - {{637, 11364, kSentinel}}, - {{647, 42929, kSentinel}}, - {{670, 42928, kSentinel}}, - {{837, 921, 953, 8126}}, // NOLINT - {{880, 881, kSentinel}}, - {{882, 883, kSentinel}}, - {{886, 887, kSentinel}}, - {{891, 1021, kSentinel}}, // NOLINT - {{893, 1023, kSentinel}}, - {{895, 1011, kSentinel}}, - {{902, 940, kSentinel}}, - {{904, 941, kSentinel}}, // NOLINT - {{906, 943, kSentinel}}, - {{908, 972, kSentinel}}, - {{910, 973, kSentinel}}, - {{911, 974, kSentinel}}, // NOLINT - {{913, 945, kSentinel}}, - {{914, 946, 976, kSentinel}}, - {{915, 947, kSentinel}}, - {{916, 948, kSentinel}}, // NOLINT - {{917, 949, 1013, kSentinel}}, - {{918, 950, kSentinel}}, - {{919, 951, kSentinel}}, - {{920, 952, 977, kSentinel}}, // NOLINT - {{922, 954, 1008, kSentinel}}, - {{923, 955, kSentinel}}, - {{925, 957, kSentinel}}, - {{927, 959, kSentinel}}, // NOLINT - {{928, 960, 982, kSentinel}}, - {{929, 961, 1009, kSentinel}}, - {{931, 962, 963, kSentinel}}, - {{932, 964, kSentinel}}, // NOLINT - {{933, 965, kSentinel}}, - {{934, 966, 981, kSentinel}}, - {{935, 967, kSentinel}}, - {{939, 971, kSentinel}}, // NOLINT - {{975, 983, kSentinel}}, - {{984, 985, kSentinel}}, - {{986, 987, kSentinel}}, - {{988, 989, kSentinel}}, // NOLINT - {{990, 991, kSentinel}}, - {{992, 993, kSentinel}}, - {{994, 995, kSentinel}}, - {{996, 997, kSentinel}}, // NOLINT - {{998, 999, kSentinel}}, - {{1000, 1001, kSentinel}}, - {{1002, 1003, kSentinel}}, - {{1004, 1005, kSentinel}}, // NOLINT - {{1006, 1007, kSentinel}}, - {{1010, 1017, kSentinel}}, - {{1015, 1016, kSentinel}}, - {{1018, 1019, kSentinel}}, // NOLINT - {{1024, 1104, kSentinel}}, - {{1039, 1119, kSentinel}}, - {{1040, 1072, kSentinel}}, - {{1071, 1103, kSentinel}}, // NOLINT - {{1120, 1121, kSentinel}}, - {{1122, 1123, kSentinel}}, - {{1124, 1125, kSentinel}}, - {{1126, 1127, kSentinel}}, // NOLINT - {{1128, 1129, kSentinel}}, - {{1130, 1131, kSentinel}}, - {{1132, 1133, kSentinel}}, - {{1134, 1135, kSentinel}}, // NOLINT - {{1136, 1137, kSentinel}}, - {{1138, 1139, kSentinel}}, - {{1140, 1141, kSentinel}}, - {{1142, 1143, kSentinel}}, // NOLINT - {{1144, 1145, kSentinel}}, - {{1146, 1147, kSentinel}}, - {{1148, 1149, kSentinel}}, - {{1150, 1151, kSentinel}}, // NOLINT - {{1152, 1153, kSentinel}}, - {{1162, 1163, kSentinel}}, - {{1164, 1165, kSentinel}}, - {{1166, 1167, kSentinel}}, // NOLINT - {{1168, 1169, kSentinel}}, - {{1170, 1171, kSentinel}}, - {{1172, 1173, kSentinel}}, - {{1174, 1175, kSentinel}}, // NOLINT - {{1176, 1177, kSentinel}}, - {{1178, 1179, kSentinel}}, - {{1180, 1181, kSentinel}}, - {{1182, 1183, kSentinel}}, // NOLINT - {{1184, 1185, kSentinel}}, - {{1186, 1187, kSentinel}}, - {{1188, 1189, kSentinel}}, - {{1190, 1191, kSentinel}}, // NOLINT - {{1192, 1193, kSentinel}}, - {{1194, 1195, kSentinel}}, - {{1196, 1197, kSentinel}}, - {{1198, 1199, kSentinel}}, // NOLINT - {{1200, 1201, kSentinel}}, - {{1202, 1203, kSentinel}}, - {{1204, 1205, kSentinel}}, - {{1206, 1207, kSentinel}}, // NOLINT - {{1208, 1209, kSentinel}}, - {{1210, 1211, kSentinel}}, - {{1212, 1213, kSentinel}}, - {{1214, 1215, kSentinel}}, // NOLINT - {{1216, 1231, kSentinel}}, - {{1217, 1218, kSentinel}}, - {{1219, 1220, kSentinel}}, - {{1221, 1222, kSentinel}}, // NOLINT - {{1223, 1224, kSentinel}}, - {{1225, 1226, kSentinel}}, - {{1227, 1228, kSentinel}}, - {{1229, 1230, kSentinel}}, // NOLINT - {{1232, 1233, kSentinel}}, - {{1234, 1235, kSentinel}}, - {{1236, 1237, kSentinel}}, - {{1238, 1239, kSentinel}}, // NOLINT - {{1240, 1241, kSentinel}}, - {{1242, 1243, kSentinel}}, - {{1244, 1245, kSentinel}}, - {{1246, 1247, kSentinel}}, // NOLINT - {{1248, 1249, kSentinel}}, - {{1250, 1251, kSentinel}}, - {{1252, 1253, kSentinel}}, - {{1254, 1255, kSentinel}}, // NOLINT - {{1256, 1257, kSentinel}}, - {{1258, 1259, kSentinel}}, - {{1260, 1261, kSentinel}}, - {{1262, 1263, kSentinel}}, // NOLINT - {{1264, 1265, kSentinel}}, - {{1266, 1267, kSentinel}}, - {{1268, 1269, kSentinel}}, - {{1270, 1271, kSentinel}}, // NOLINT - {{1272, 1273, kSentinel}}, - {{1274, 1275, kSentinel}}, - {{1276, 1277, kSentinel}}, - {{1278, 1279, kSentinel}}, // NOLINT - {{1280, 1281, kSentinel}}, - {{1282, 1283, kSentinel}}, - {{1284, 1285, kSentinel}}, - {{1286, 1287, kSentinel}}, // NOLINT - {{1288, 1289, kSentinel}}, - {{1290, 1291, kSentinel}}, - {{1292, 1293, kSentinel}}, - {{1294, 1295, kSentinel}}, // NOLINT - {{1296, 1297, kSentinel}}, - {{1298, 1299, kSentinel}}, - {{1300, 1301, kSentinel}}, - {{1302, 1303, kSentinel}}, // NOLINT - {{1304, 1305, kSentinel}}, - {{1306, 1307, kSentinel}}, - {{1308, 1309, kSentinel}}, - {{1310, 1311, kSentinel}}, // NOLINT - {{1312, 1313, kSentinel}}, - {{1314, 1315, kSentinel}}, - {{1316, 1317, kSentinel}}, - {{1318, 1319, kSentinel}}, // NOLINT - {{1320, 1321, kSentinel}}, - {{1322, 1323, kSentinel}}, - {{1324, 1325, kSentinel}}, - {{1326, 1327, kSentinel}}, // NOLINT - {{1329, 1377, kSentinel}}, - {{1366, 1414, kSentinel}}, - {{4256, 11520, kSentinel}}, - {{4293, 11557, kSentinel}}, // NOLINT - {{4295, 11559, kSentinel}}, - {{4301, 11565, kSentinel}}, - {{7545, 42877, kSentinel}}, - {{7549, 11363, kSentinel}}, // NOLINT - {{7680, 7681, kSentinel}}, - {{7682, 7683, kSentinel}}, - {{7684, 7685, kSentinel}}, - {{7686, 7687, kSentinel}}, // NOLINT - {{7688, 7689, kSentinel}}, - {{7690, 7691, kSentinel}}, - {{7692, 7693, kSentinel}}, - {{7694, 7695, kSentinel}}, // NOLINT - {{7696, 7697, kSentinel}}, - {{7698, 7699, kSentinel}}, - {{7700, 7701, kSentinel}}, - {{7702, 7703, kSentinel}}, // NOLINT - {{7704, 7705, kSentinel}}, - {{7706, 7707, kSentinel}}, - {{7708, 7709, kSentinel}}, - {{7710, 7711, kSentinel}}, // NOLINT - {{7712, 7713, kSentinel}}, - {{7714, 7715, kSentinel}}, - {{7716, 7717, kSentinel}}, - {{7718, 7719, kSentinel}}, // NOLINT - {{7720, 7721, kSentinel}}, - {{7722, 7723, kSentinel}}, - {{7724, 7725, kSentinel}}, - {{7726, 7727, kSentinel}}, // NOLINT - {{7728, 7729, kSentinel}}, - {{7730, 7731, kSentinel}}, - {{7732, 7733, kSentinel}}, - {{7734, 7735, kSentinel}}, // NOLINT - {{7736, 7737, kSentinel}}, - {{7738, 7739, kSentinel}}, - {{7740, 7741, kSentinel}}, - {{7742, 7743, kSentinel}}, // NOLINT - {{7744, 7745, kSentinel}}, - {{7746, 7747, kSentinel}}, - {{7748, 7749, kSentinel}}, - {{7750, 7751, kSentinel}}, // NOLINT - {{7752, 7753, kSentinel}}, - {{7754, 7755, kSentinel}}, - {{7756, 7757, kSentinel}}, - {{7758, 7759, kSentinel}}, // NOLINT - {{7760, 7761, kSentinel}}, - {{7762, 7763, kSentinel}}, - {{7764, 7765, kSentinel}}, - {{7766, 7767, kSentinel}}, // NOLINT - {{7768, 7769, kSentinel}}, - {{7770, 7771, kSentinel}}, - {{7772, 7773, kSentinel}}, - {{7774, 7775, kSentinel}}, // NOLINT - {{7776, 7777, 7835, kSentinel}}, - {{7778, 7779, kSentinel}}, - {{7780, 7781, kSentinel}}, - {{7782, 7783, kSentinel}}, // NOLINT - {{7784, 7785, kSentinel}}, - {{7786, 7787, kSentinel}}, - {{7788, 7789, kSentinel}}, - {{7790, 7791, kSentinel}}, // NOLINT - {{7792, 7793, kSentinel}}, - {{7794, 7795, kSentinel}}, - {{7796, 7797, kSentinel}}, - {{7798, 7799, kSentinel}}, // NOLINT - {{7800, 7801, kSentinel}}, - {{7802, 7803, kSentinel}}, - {{7804, 7805, kSentinel}}, - {{7806, 7807, kSentinel}}, // NOLINT - {{7808, 7809, kSentinel}}, - {{7810, 7811, kSentinel}}, - {{7812, 7813, kSentinel}}, - {{7814, 7815, kSentinel}}, // NOLINT - {{7816, 7817, kSentinel}}, - {{7818, 7819, kSentinel}}, - {{7820, 7821, kSentinel}}, - {{7822, 7823, kSentinel}}, // NOLINT - {{7824, 7825, kSentinel}}, - {{7826, 7827, kSentinel}}, - {{7828, 7829, kSentinel}}, - {{7840, 7841, kSentinel}}, // NOLINT - {{7842, 7843, kSentinel}}, - {{7844, 7845, kSentinel}}, - {{7846, 7847, kSentinel}}, - {{7848, 7849, kSentinel}}, // NOLINT - {{7850, 7851, kSentinel}}, - {{7852, 7853, kSentinel}}, - {{7854, 7855, kSentinel}}, - {{7856, 7857, kSentinel}}, // NOLINT - {{7858, 7859, kSentinel}}, - {{7860, 7861, kSentinel}}, - {{7862, 7863, kSentinel}}, - {{7864, 7865, kSentinel}}, // NOLINT - {{7866, 7867, kSentinel}}, - {{7868, 7869, kSentinel}}, - {{7870, 7871, kSentinel}}, - {{7872, 7873, kSentinel}}, // NOLINT - {{7874, 7875, kSentinel}}, - {{7876, 7877, kSentinel}}, - {{7878, 7879, kSentinel}}, - {{7880, 7881, kSentinel}}, // NOLINT - {{7882, 7883, kSentinel}}, - {{7884, 7885, kSentinel}}, - {{7886, 7887, kSentinel}}, - {{7888, 7889, kSentinel}}, // NOLINT - {{7890, 7891, kSentinel}}, - {{7892, 7893, kSentinel}}, - {{7894, 7895, kSentinel}}, - {{7896, 7897, kSentinel}}, // NOLINT - {{7898, 7899, kSentinel}}, - {{7900, 7901, kSentinel}}, - {{7902, 7903, kSentinel}}, - {{7904, 7905, kSentinel}}, // NOLINT - {{7906, 7907, kSentinel}}, - {{7908, 7909, kSentinel}}, - {{7910, 7911, kSentinel}}, - {{7912, 7913, kSentinel}}, // NOLINT - {{7914, 7915, kSentinel}}, - {{7916, 7917, kSentinel}}, - {{7918, 7919, kSentinel}}, - {{7920, 7921, kSentinel}}, // NOLINT - {{7922, 7923, kSentinel}}, - {{7924, 7925, kSentinel}}, - {{7926, 7927, kSentinel}}, - {{7928, 7929, kSentinel}}, // NOLINT - {{7930, 7931, kSentinel}}, - {{7932, 7933, kSentinel}}, - {{7934, 7935, kSentinel}}, - {{7936, 7944, kSentinel}}, // NOLINT - {{7943, 7951, kSentinel}}, - {{7952, 7960, kSentinel}}, - {{7957, 7965, kSentinel}}, - {{7968, 7976, kSentinel}}, // NOLINT - {{7975, 7983, kSentinel}}, - {{7984, 7992, kSentinel}}, - {{7991, 7999, kSentinel}}, - {{8000, 8008, kSentinel}}, // NOLINT - {{8005, 8013, kSentinel}}, - {{8017, 8025, kSentinel}}, - {{8019, 8027, kSentinel}}, - {{8021, 8029, kSentinel}}, // NOLINT - {{8023, 8031, kSentinel}}, - {{8032, 8040, kSentinel}}, - {{8039, 8047, kSentinel}}, - {{8048, 8122, kSentinel}}, // NOLINT - {{8049, 8123, kSentinel}}, - {{8050, 8136, kSentinel}}, - {{8053, 8139, kSentinel}}, - {{8054, 8154, kSentinel}}, // NOLINT - {{8055, 8155, kSentinel}}, - {{8056, 8184, kSentinel}}, - {{8057, 8185, kSentinel}}, - {{8058, 8170, kSentinel}}, // NOLINT - {{8059, 8171, kSentinel}}, - {{8060, 8186, kSentinel}}, - {{8061, 8187, kSentinel}}, - {{8112, 8120, kSentinel}}, // NOLINT - {{8113, 8121, kSentinel}}, - {{8144, 8152, kSentinel}}, - {{8145, 8153, kSentinel}}, - {{8160, 8168, kSentinel}}, // NOLINT - {{8161, 8169, kSentinel}}, - {{8165, 8172, kSentinel}}, - {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262UnCanonicalizeTable0Size = 1005; // NOLINT -static const int32_t kEcma262UnCanonicalizeTable0[2010] = { - 1073741889, 1, 90, 5, 1073741921, 1, - 122, 5, 181, 9, 1073742016, 13, - 214, 17, 1073742040, 21, // NOLINT - 222, 25, 1073742048, 13, 246, 17, - 1073742072, 21, 254, 25, 255, 29, - 256, 33, 257, 33, // NOLINT - 258, 37, 259, 37, 260, 41, - 261, 41, 262, 45, 263, 45, - 264, 49, 265, 49, // NOLINT - 266, 53, 267, 53, 268, 57, - 269, 57, 270, 61, 271, 61, - 272, 65, 273, 65, // NOLINT - 274, 69, 275, 69, 276, 73, - 277, 73, 278, 77, 279, 77, - 280, 81, 281, 81, // NOLINT - 282, 85, 283, 85, 284, 89, - 285, 89, 286, 93, 287, 93, - 288, 97, 289, 97, // NOLINT - 290, 101, 291, 101, 292, 105, - 293, 105, 294, 109, 295, 109, - 296, 113, 297, 113, // NOLINT - 298, 117, 299, 117, 300, 121, - 301, 121, 302, 125, 303, 125, - 306, 129, 307, 129, // NOLINT - 308, 133, 309, 133, 310, 137, - 311, 137, 313, 141, 314, 141, - 315, 145, 316, 145, // NOLINT - 317, 149, 318, 149, 319, 153, - 320, 153, 321, 157, 322, 157, - 323, 161, 324, 161, // NOLINT - 325, 165, 326, 165, 327, 169, - 328, 169, 330, 173, 331, 173, - 332, 177, 333, 177, // NOLINT - 334, 181, 335, 181, 336, 185, - 337, 185, 338, 189, 339, 189, - 340, 193, 341, 193, // NOLINT - 342, 197, 343, 197, 344, 201, - 345, 201, 346, 205, 347, 205, - 348, 209, 349, 209, // NOLINT - 350, 213, 351, 213, 352, 217, - 353, 217, 354, 221, 355, 221, - 356, 225, 357, 225, // NOLINT - 358, 229, 359, 229, 360, 233, - 361, 233, 362, 237, 363, 237, - 364, 241, 365, 241, // NOLINT - 366, 245, 367, 245, 368, 249, - 369, 249, 370, 253, 371, 253, - 372, 257, 373, 257, // NOLINT - 374, 261, 375, 261, 376, 29, - 377, 265, 378, 265, 379, 269, - 380, 269, 381, 273, // NOLINT - 382, 273, 384, 277, 385, 281, - 386, 285, 387, 285, 388, 289, - 389, 289, 390, 293, // NOLINT - 391, 297, 392, 297, 1073742217, 301, - 394, 305, 395, 309, 396, 309, - 398, 313, 399, 317, // NOLINT - 400, 321, 401, 325, 402, 325, - 403, 329, 404, 333, 405, 337, - 406, 341, 407, 345, // NOLINT - 408, 349, 409, 349, 410, 353, - 412, 357, 413, 361, 414, 365, - 415, 369, 416, 373, // NOLINT - 417, 373, 418, 377, 419, 377, - 420, 381, 421, 381, 422, 385, - 423, 389, 424, 389, // NOLINT - 425, 393, 428, 397, 429, 397, - 430, 401, 431, 405, 432, 405, - 1073742257, 409, 434, 413, // NOLINT - 435, 417, 436, 417, 437, 421, - 438, 421, 439, 425, 440, 429, - 441, 429, 444, 433, // NOLINT - 445, 433, 447, 437, 452, 441, - 453, 441, 454, 441, 455, 445, - 456, 445, 457, 445, // NOLINT - 458, 449, 459, 449, 460, 449, - 461, 453, 462, 453, 463, 457, - 464, 457, 465, 461, // NOLINT - 466, 461, 467, 465, 468, 465, - 469, 469, 470, 469, 471, 473, - 472, 473, 473, 477, // NOLINT - 474, 477, 475, 481, 476, 481, - 477, 313, 478, 485, 479, 485, - 480, 489, 481, 489, // NOLINT - 482, 493, 483, 493, 484, 497, - 485, 497, 486, 501, 487, 501, - 488, 505, 489, 505, // NOLINT - 490, 509, 491, 509, 492, 513, - 493, 513, 494, 517, 495, 517, - 497, 521, 498, 521, // NOLINT - 499, 521, 500, 525, 501, 525, - 502, 337, 503, 437, 504, 529, - 505, 529, 506, 533, // NOLINT - 507, 533, 508, 537, 509, 537, - 510, 541, 511, 541, 512, 545, - 513, 545, 514, 549, // NOLINT - 515, 549, 516, 553, 517, 553, - 518, 557, 519, 557, 520, 561, - 521, 561, 522, 565, // NOLINT - 523, 565, 524, 569, 525, 569, - 526, 573, 527, 573, 528, 577, - 529, 577, 530, 581, // NOLINT - 531, 581, 532, 585, 533, 585, - 534, 589, 535, 589, 536, 593, - 537, 593, 538, 597, // NOLINT - 539, 597, 540, 601, 541, 601, - 542, 605, 543, 605, 544, 365, - 546, 609, 547, 609, // NOLINT - 548, 613, 549, 613, 550, 617, - 551, 617, 552, 621, 553, 621, - 554, 625, 555, 625, // NOLINT - 556, 629, 557, 629, 558, 633, - 559, 633, 560, 637, 561, 637, - 562, 641, 563, 641, // NOLINT - 570, 645, 571, 649, 572, 649, - 573, 353, 574, 653, 1073742399, 657, - 576, 661, 577, 665, // NOLINT - 578, 665, 579, 277, 580, 669, - 581, 673, 582, 677, 583, 677, - 584, 681, 585, 681, // NOLINT - 586, 685, 587, 685, 588, 689, - 589, 689, 590, 693, 591, 693, - 592, 697, 593, 701, // NOLINT - 594, 705, 595, 281, 596, 293, - 1073742422, 301, 599, 305, 601, 317, - 603, 321, 604, 709, // NOLINT - 608, 329, 609, 713, 611, 333, - 613, 717, 614, 721, 616, 345, - 617, 341, 619, 725, // NOLINT - 620, 729, 623, 357, 625, 733, - 626, 361, 629, 369, 637, 737, - 640, 385, 643, 393, // NOLINT - 647, 741, 648, 401, 649, 669, - 1073742474, 409, 651, 413, 652, 673, - 658, 425, 670, 745, // NOLINT - 837, 749, 880, 753, 881, 753, - 882, 757, 883, 757, 886, 761, - 887, 761, 1073742715, 765, // NOLINT - 893, 769, 895, 773, 902, 777, - 1073742728, 781, 906, 785, 908, 789, - 1073742734, 793, 911, 797, // NOLINT - 913, 801, 914, 805, 1073742739, 809, - 916, 813, 917, 817, 1073742742, 821, - 919, 825, 920, 829, // NOLINT - 921, 749, 922, 833, 923, 837, - 924, 9, 1073742749, 841, 927, 845, - 928, 849, 929, 853, // NOLINT - 931, 857, 1073742756, 861, 933, 865, - 934, 869, 1073742759, 873, 939, 877, - 940, 777, 1073742765, 781, // NOLINT - 943, 785, 945, 801, 946, 805, - 1073742771, 809, 948, 813, 949, 817, - 1073742774, 821, 951, 825, // NOLINT - 952, 829, 953, 749, 954, 833, - 955, 837, 956, 9, 1073742781, 841, - 959, 845, 960, 849, // NOLINT - 961, 853, 962, 857, 963, 857, - 1073742788, 861, 965, 865, 966, 869, - 1073742791, 873, 971, 877, // NOLINT - 972, 789, 1073742797, 793, 974, 797, - 975, 881, 976, 805, 977, 829, - 981, 869, 982, 849, // NOLINT - 983, 881, 984, 885, 985, 885, - 986, 889, 987, 889, 988, 893, - 989, 893, 990, 897, // NOLINT - 991, 897, 992, 901, 993, 901, - 994, 905, 995, 905, 996, 909, - 997, 909, 998, 913, // NOLINT - 999, 913, 1000, 917, 1001, 917, - 1002, 921, 1003, 921, 1004, 925, - 1005, 925, 1006, 929, // NOLINT - 1007, 929, 1008, 833, 1009, 853, - 1010, 933, 1011, 773, 1013, 817, - 1015, 937, 1016, 937, // NOLINT - 1017, 933, 1018, 941, 1019, 941, - 1073742845, 765, 1023, 769, 1073742848, 945, - 1039, 949, 1073742864, 953, // NOLINT - 1071, 957, 1073742896, 953, 1103, 957, - 1073742928, 945, 1119, 949, 1120, 961, - 1121, 961, 1122, 965, // NOLINT - 1123, 965, 1124, 969, 1125, 969, - 1126, 973, 1127, 973, 1128, 977, - 1129, 977, 1130, 981, // NOLINT - 1131, 981, 1132, 985, 1133, 985, - 1134, 989, 1135, 989, 1136, 993, - 1137, 993, 1138, 997, // NOLINT - 1139, 997, 1140, 1001, 1141, 1001, - 1142, 1005, 1143, 1005, 1144, 1009, - 1145, 1009, 1146, 1013, // NOLINT - 1147, 1013, 1148, 1017, 1149, 1017, - 1150, 1021, 1151, 1021, 1152, 1025, - 1153, 1025, 1162, 1029, // NOLINT - 1163, 1029, 1164, 1033, 1165, 1033, - 1166, 1037, 1167, 1037, 1168, 1041, - 1169, 1041, 1170, 1045, // NOLINT - 1171, 1045, 1172, 1049, 1173, 1049, - 1174, 1053, 1175, 1053, 1176, 1057, - 1177, 1057, 1178, 1061, // NOLINT - 1179, 1061, 1180, 1065, 1181, 1065, - 1182, 1069, 1183, 1069, 1184, 1073, - 1185, 1073, 1186, 1077, // NOLINT - 1187, 1077, 1188, 1081, 1189, 1081, - 1190, 1085, 1191, 1085, 1192, 1089, - 1193, 1089, 1194, 1093, // NOLINT - 1195, 1093, 1196, 1097, 1197, 1097, - 1198, 1101, 1199, 1101, 1200, 1105, - 1201, 1105, 1202, 1109, // NOLINT - 1203, 1109, 1204, 1113, 1205, 1113, - 1206, 1117, 1207, 1117, 1208, 1121, - 1209, 1121, 1210, 1125, // NOLINT - 1211, 1125, 1212, 1129, 1213, 1129, - 1214, 1133, 1215, 1133, 1216, 1137, - 1217, 1141, 1218, 1141, // NOLINT - 1219, 1145, 1220, 1145, 1221, 1149, - 1222, 1149, 1223, 1153, 1224, 1153, - 1225, 1157, 1226, 1157, // NOLINT - 1227, 1161, 1228, 1161, 1229, 1165, - 1230, 1165, 1231, 1137, 1232, 1169, - 1233, 1169, 1234, 1173, // NOLINT - 1235, 1173, 1236, 1177, 1237, 1177, - 1238, 1181, 1239, 1181, 1240, 1185, - 1241, 1185, 1242, 1189, // NOLINT - 1243, 1189, 1244, 1193, 1245, 1193, - 1246, 1197, 1247, 1197, 1248, 1201, - 1249, 1201, 1250, 1205, // NOLINT - 1251, 1205, 1252, 1209, 1253, 1209, - 1254, 1213, 1255, 1213, 1256, 1217, - 1257, 1217, 1258, 1221, // NOLINT - 1259, 1221, 1260, 1225, 1261, 1225, - 1262, 1229, 1263, 1229, 1264, 1233, - 1265, 1233, 1266, 1237, // NOLINT - 1267, 1237, 1268, 1241, 1269, 1241, - 1270, 1245, 1271, 1245, 1272, 1249, - 1273, 1249, 1274, 1253, // NOLINT - 1275, 1253, 1276, 1257, 1277, 1257, - 1278, 1261, 1279, 1261, 1280, 1265, - 1281, 1265, 1282, 1269, // NOLINT - 1283, 1269, 1284, 1273, 1285, 1273, - 1286, 1277, 1287, 1277, 1288, 1281, - 1289, 1281, 1290, 1285, // NOLINT - 1291, 1285, 1292, 1289, 1293, 1289, - 1294, 1293, 1295, 1293, 1296, 1297, - 1297, 1297, 1298, 1301, // NOLINT - 1299, 1301, 1300, 1305, 1301, 1305, - 1302, 1309, 1303, 1309, 1304, 1313, - 1305, 1313, 1306, 1317, // NOLINT - 1307, 1317, 1308, 1321, 1309, 1321, - 1310, 1325, 1311, 1325, 1312, 1329, - 1313, 1329, 1314, 1333, // NOLINT - 1315, 1333, 1316, 1337, 1317, 1337, - 1318, 1341, 1319, 1341, 1320, 1345, - 1321, 1345, 1322, 1349, // NOLINT - 1323, 1349, 1324, 1353, 1325, 1353, - 1326, 1357, 1327, 1357, 1073743153, 1361, - 1366, 1365, 1073743201, 1361, // NOLINT - 1414, 1365, 1073746080, 1369, 4293, 1373, - 4295, 1377, 4301, 1381, 7545, 1385, - 7549, 1389, 7680, 1393, // NOLINT - 7681, 1393, 7682, 1397, 7683, 1397, - 7684, 1401, 7685, 1401, 7686, 1405, - 7687, 1405, 7688, 1409, // NOLINT - 7689, 1409, 7690, 1413, 7691, 1413, - 7692, 1417, 7693, 1417, 7694, 1421, - 7695, 1421, 7696, 1425, // NOLINT - 7697, 1425, 7698, 1429, 7699, 1429, - 7700, 1433, 7701, 1433, 7702, 1437, - 7703, 1437, 7704, 1441, // NOLINT - 7705, 1441, 7706, 1445, 7707, 1445, - 7708, 1449, 7709, 1449, 7710, 1453, - 7711, 1453, 7712, 1457, // NOLINT - 7713, 1457, 7714, 1461, 7715, 1461, - 7716, 1465, 7717, 1465, 7718, 1469, - 7719, 1469, 7720, 1473, // NOLINT - 7721, 1473, 7722, 1477, 7723, 1477, - 7724, 1481, 7725, 1481, 7726, 1485, - 7727, 1485, 7728, 1489, // NOLINT - 7729, 1489, 7730, 1493, 7731, 1493, - 7732, 1497, 7733, 1497, 7734, 1501, - 7735, 1501, 7736, 1505, // NOLINT - 7737, 1505, 7738, 1509, 7739, 1509, - 7740, 1513, 7741, 1513, 7742, 1517, - 7743, 1517, 7744, 1521, // NOLINT - 7745, 1521, 7746, 1525, 7747, 1525, - 7748, 1529, 7749, 1529, 7750, 1533, - 7751, 1533, 7752, 1537, // NOLINT - 7753, 1537, 7754, 1541, 7755, 1541, - 7756, 1545, 7757, 1545, 7758, 1549, - 7759, 1549, 7760, 1553, // NOLINT - 7761, 1553, 7762, 1557, 7763, 1557, - 7764, 1561, 7765, 1561, 7766, 1565, - 7767, 1565, 7768, 1569, // NOLINT - 7769, 1569, 7770, 1573, 7771, 1573, - 7772, 1577, 7773, 1577, 7774, 1581, - 7775, 1581, 7776, 1585, // NOLINT - 7777, 1585, 7778, 1589, 7779, 1589, - 7780, 1593, 7781, 1593, 7782, 1597, - 7783, 1597, 7784, 1601, // NOLINT - 7785, 1601, 7786, 1605, 7787, 1605, - 7788, 1609, 7789, 1609, 7790, 1613, - 7791, 1613, 7792, 1617, // NOLINT - 7793, 1617, 7794, 1621, 7795, 1621, - 7796, 1625, 7797, 1625, 7798, 1629, - 7799, 1629, 7800, 1633, // NOLINT - 7801, 1633, 7802, 1637, 7803, 1637, - 7804, 1641, 7805, 1641, 7806, 1645, - 7807, 1645, 7808, 1649, // NOLINT - 7809, 1649, 7810, 1653, 7811, 1653, - 7812, 1657, 7813, 1657, 7814, 1661, - 7815, 1661, 7816, 1665, // NOLINT - 7817, 1665, 7818, 1669, 7819, 1669, - 7820, 1673, 7821, 1673, 7822, 1677, - 7823, 1677, 7824, 1681, // NOLINT - 7825, 1681, 7826, 1685, 7827, 1685, - 7828, 1689, 7829, 1689, 7835, 1585, - 7840, 1693, 7841, 1693, // NOLINT - 7842, 1697, 7843, 1697, 7844, 1701, - 7845, 1701, 7846, 1705, 7847, 1705, - 7848, 1709, 7849, 1709, // NOLINT - 7850, 1713, 7851, 1713, 7852, 1717, - 7853, 1717, 7854, 1721, 7855, 1721, - 7856, 1725, 7857, 1725, // NOLINT - 7858, 1729, 7859, 1729, 7860, 1733, - 7861, 1733, 7862, 1737, 7863, 1737, - 7864, 1741, 7865, 1741, // NOLINT - 7866, 1745, 7867, 1745, 7868, 1749, - 7869, 1749, 7870, 1753, 7871, 1753, - 7872, 1757, 7873, 1757, // NOLINT - 7874, 1761, 7875, 1761, 7876, 1765, - 7877, 1765, 7878, 1769, 7879, 1769, - 7880, 1773, 7881, 1773, // NOLINT - 7882, 1777, 7883, 1777, 7884, 1781, - 7885, 1781, 7886, 1785, 7887, 1785, - 7888, 1789, 7889, 1789, // NOLINT - 7890, 1793, 7891, 1793, 7892, 1797, - 7893, 1797, 7894, 1801, 7895, 1801, - 7896, 1805, 7897, 1805, // NOLINT - 7898, 1809, 7899, 1809, 7900, 1813, - 7901, 1813, 7902, 1817, 7903, 1817, - 7904, 1821, 7905, 1821, // NOLINT - 7906, 1825, 7907, 1825, 7908, 1829, - 7909, 1829, 7910, 1833, 7911, 1833, - 7912, 1837, 7913, 1837, // NOLINT - 7914, 1841, 7915, 1841, 7916, 1845, - 7917, 1845, 7918, 1849, 7919, 1849, - 7920, 1853, 7921, 1853, // NOLINT - 7922, 1857, 7923, 1857, 7924, 1861, - 7925, 1861, 7926, 1865, 7927, 1865, - 7928, 1869, 7929, 1869, // NOLINT - 7930, 1873, 7931, 1873, 7932, 1877, - 7933, 1877, 7934, 1881, 7935, 1881, - 1073749760, 1885, 7943, 1889, // NOLINT - 1073749768, 1885, 7951, 1889, 1073749776, 1893, - 7957, 1897, 1073749784, 1893, 7965, 1897, - 1073749792, 1901, 7975, 1905, // NOLINT - 1073749800, 1901, 7983, 1905, 1073749808, 1909, - 7991, 1913, 1073749816, 1909, 7999, 1913, - 1073749824, 1917, 8005, 1921, // NOLINT - 1073749832, 1917, 8013, 1921, 8017, 1925, - 8019, 1929, 8021, 1933, 8023, 1937, - 8025, 1925, 8027, 1929, // NOLINT - 8029, 1933, 8031, 1937, 1073749856, 1941, - 8039, 1945, 1073749864, 1941, 8047, 1945, - 1073749872, 1949, 8049, 1953, // NOLINT - 1073749874, 1957, 8053, 1961, 1073749878, 1965, - 8055, 1969, 1073749880, 1973, 8057, 1977, - 1073749882, 1981, 8059, 1985, // NOLINT - 1073749884, 1989, 8061, 1993, 1073749936, 1997, - 8113, 2001, 1073749944, 1997, 8121, 2001, - 1073749946, 1949, 8123, 1953, // NOLINT - 8126, 749, 1073749960, 1957, 8139, 1961, - 1073749968, 2005, 8145, 2009, 1073749976, 2005, - 8153, 2009, 1073749978, 1965, // NOLINT - 8155, 1969, 1073749984, 2013, 8161, 2017, - 8165, 2021, 1073749992, 2013, 8169, 2017, - 1073749994, 1981, 8171, 1985, // NOLINT - 8172, 2021, 1073750008, 1973, 8185, 1977, - 1073750010, 1989, 8187, 1993}; // NOLINT -static const MultiCharacterSpecialCase<2> - kEcma262UnCanonicalizeMultiStrings1[83] = { // NOLINT - {{8498, 8526}}, {{8544, 8560}}, {{8559, 8575}}, - {{8579, 8580}}, // NOLINT - {{9398, 9424}}, {{9423, 9449}}, {{11264, 11312}}, - {{11310, 11358}}, // NOLINT - {{11360, 11361}}, {{619, 11362}}, {{7549, 11363}}, - {{637, 11364}}, // NOLINT - {{570, 11365}}, {{574, 11366}}, {{11367, 11368}}, - {{11369, 11370}}, // NOLINT - {{11371, 11372}}, {{593, 11373}}, {{625, 11374}}, - {{592, 11375}}, // NOLINT - {{594, 11376}}, {{11378, 11379}}, {{11381, 11382}}, - {{575, 11390}}, // NOLINT - {{576, 11391}}, {{11392, 11393}}, {{11394, 11395}}, - {{11396, 11397}}, // NOLINT - {{11398, 11399}}, {{11400, 11401}}, {{11402, 11403}}, - {{11404, 11405}}, // NOLINT - {{11406, 11407}}, {{11408, 11409}}, {{11410, 11411}}, - {{11412, 11413}}, // NOLINT - {{11414, 11415}}, {{11416, 11417}}, {{11418, 11419}}, - {{11420, 11421}}, // NOLINT - {{11422, 11423}}, {{11424, 11425}}, {{11426, 11427}}, - {{11428, 11429}}, // NOLINT - {{11430, 11431}}, {{11432, 11433}}, {{11434, 11435}}, - {{11436, 11437}}, // NOLINT - {{11438, 11439}}, {{11440, 11441}}, {{11442, 11443}}, - {{11444, 11445}}, // NOLINT - {{11446, 11447}}, {{11448, 11449}}, {{11450, 11451}}, - {{11452, 11453}}, // NOLINT - {{11454, 11455}}, {{11456, 11457}}, {{11458, 11459}}, - {{11460, 11461}}, // NOLINT - {{11462, 11463}}, {{11464, 11465}}, {{11466, 11467}}, - {{11468, 11469}}, // NOLINT - {{11470, 11471}}, {{11472, 11473}}, {{11474, 11475}}, - {{11476, 11477}}, // NOLINT - {{11478, 11479}}, {{11480, 11481}}, {{11482, 11483}}, - {{11484, 11485}}, // NOLINT - {{11486, 11487}}, {{11488, 11489}}, {{11490, 11491}}, - {{11499, 11500}}, // NOLINT - {{11501, 11502}}, {{11506, 11507}}, {{4256, 11520}}, - {{4293, 11557}}, // NOLINT - {{4295, 11559}}, {{4301, 11565}}, {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262UnCanonicalizeTable1Size = 149; // NOLINT -static const int32_t kEcma262UnCanonicalizeTable1[298] = { - 306, 1, 334, 1, 1073742176, 5, 367, 9, - 1073742192, 5, 383, 9, 387, 13, 388, 13, // NOLINT - 1073743030, 17, 1231, 21, 1073743056, 17, 1257, 21, - 1073744896, 25, 3118, 29, 1073744944, 25, 3166, 29, // NOLINT - 3168, 33, 3169, 33, 3170, 37, 3171, 41, - 3172, 45, 3173, 49, 3174, 53, 3175, 57, // NOLINT - 3176, 57, 3177, 61, 3178, 61, 3179, 65, - 3180, 65, 3181, 69, 3182, 73, 3183, 77, // NOLINT - 3184, 81, 3186, 85, 3187, 85, 3189, 89, - 3190, 89, 1073745022, 93, 3199, 97, 3200, 101, // NOLINT - 3201, 101, 3202, 105, 3203, 105, 3204, 109, - 3205, 109, 3206, 113, 3207, 113, 3208, 117, // NOLINT - 3209, 117, 3210, 121, 3211, 121, 3212, 125, - 3213, 125, 3214, 129, 3215, 129, 3216, 133, // NOLINT - 3217, 133, 3218, 137, 3219, 137, 3220, 141, - 3221, 141, 3222, 145, 3223, 145, 3224, 149, // NOLINT - 3225, 149, 3226, 153, 3227, 153, 3228, 157, - 3229, 157, 3230, 161, 3231, 161, 3232, 165, // NOLINT - 3233, 165, 3234, 169, 3235, 169, 3236, 173, - 3237, 173, 3238, 177, 3239, 177, 3240, 181, // NOLINT - 3241, 181, 3242, 185, 3243, 185, 3244, 189, - 3245, 189, 3246, 193, 3247, 193, 3248, 197, // NOLINT - 3249, 197, 3250, 201, 3251, 201, 3252, 205, - 3253, 205, 3254, 209, 3255, 209, 3256, 213, // NOLINT - 3257, 213, 3258, 217, 3259, 217, 3260, 221, - 3261, 221, 3262, 225, 3263, 225, 3264, 229, // NOLINT - 3265, 229, 3266, 233, 3267, 233, 3268, 237, - 3269, 237, 3270, 241, 3271, 241, 3272, 245, // NOLINT - 3273, 245, 3274, 249, 3275, 249, 3276, 253, - 3277, 253, 3278, 257, 3279, 257, 3280, 261, // NOLINT - 3281, 261, 3282, 265, 3283, 265, 3284, 269, - 3285, 269, 3286, 273, 3287, 273, 3288, 277, // NOLINT - 3289, 277, 3290, 281, 3291, 281, 3292, 285, - 3293, 285, 3294, 289, 3295, 289, 3296, 293, // NOLINT - 3297, 293, 3298, 297, 3299, 297, 3307, 301, - 3308, 301, 3309, 305, 3310, 305, 3314, 309, // NOLINT - 3315, 309, 1073745152, 313, 3365, 317, 3367, 321, - 3373, 325}; // NOLINT -static const MultiCharacterSpecialCase<2> - kEcma262UnCanonicalizeMultiStrings5[104] = { // NOLINT - {{42560, 42561}}, {{42562, 42563}}, - {{42564, 42565}}, {{42566, 42567}}, // NOLINT - {{42568, 42569}}, {{42570, 42571}}, - {{42572, 42573}}, {{42574, 42575}}, // NOLINT - {{42576, 42577}}, {{42578, 42579}}, - {{42580, 42581}}, {{42582, 42583}}, // NOLINT - {{42584, 42585}}, {{42586, 42587}}, - {{42588, 42589}}, {{42590, 42591}}, // NOLINT - {{42592, 42593}}, {{42594, 42595}}, - {{42596, 42597}}, {{42598, 42599}}, // NOLINT - {{42600, 42601}}, {{42602, 42603}}, - {{42604, 42605}}, {{42624, 42625}}, // NOLINT - {{42626, 42627}}, {{42628, 42629}}, - {{42630, 42631}}, {{42632, 42633}}, // NOLINT - {{42634, 42635}}, {{42636, 42637}}, - {{42638, 42639}}, {{42640, 42641}}, // NOLINT - {{42642, 42643}}, {{42644, 42645}}, - {{42646, 42647}}, {{42648, 42649}}, // NOLINT - {{42650, 42651}}, {{42786, 42787}}, - {{42788, 42789}}, {{42790, 42791}}, // NOLINT - {{42792, 42793}}, {{42794, 42795}}, - {{42796, 42797}}, {{42798, 42799}}, // NOLINT - {{42802, 42803}}, {{42804, 42805}}, - {{42806, 42807}}, {{42808, 42809}}, // NOLINT - {{42810, 42811}}, {{42812, 42813}}, - {{42814, 42815}}, {{42816, 42817}}, // NOLINT - {{42818, 42819}}, {{42820, 42821}}, - {{42822, 42823}}, {{42824, 42825}}, // NOLINT - {{42826, 42827}}, {{42828, 42829}}, - {{42830, 42831}}, {{42832, 42833}}, // NOLINT - {{42834, 42835}}, {{42836, 42837}}, - {{42838, 42839}}, {{42840, 42841}}, // NOLINT - {{42842, 42843}}, {{42844, 42845}}, - {{42846, 42847}}, {{42848, 42849}}, // NOLINT - {{42850, 42851}}, {{42852, 42853}}, - {{42854, 42855}}, {{42856, 42857}}, // NOLINT - {{42858, 42859}}, {{42860, 42861}}, - {{42862, 42863}}, {{42873, 42874}}, // NOLINT - {{42875, 42876}}, {{7545, 42877}}, - {{42878, 42879}}, {{42880, 42881}}, // NOLINT - {{42882, 42883}}, {{42884, 42885}}, - {{42886, 42887}}, {{42891, 42892}}, // NOLINT - {{613, 42893}}, {{42896, 42897}}, - {{42898, 42899}}, {{42902, 42903}}, // NOLINT - {{42904, 42905}}, {{42906, 42907}}, - {{42908, 42909}}, {{42910, 42911}}, // NOLINT - {{42912, 42913}}, {{42914, 42915}}, - {{42916, 42917}}, {{42918, 42919}}, // NOLINT - {{42920, 42921}}, {{614, 42922}}, - {{604, 42923}}, {{609, 42924}}, // NOLINT - {{620, 42925}}, {{670, 42928}}, - {{647, 42929}}, {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262UnCanonicalizeTable5Size = 198; // NOLINT -static const int32_t - kEcma262UnCanonicalizeTable5[396] = - {1600, 1, 1601, 1, 1602, 5, 1603, 5, - 1604, 9, 1605, 9, 1606, 13, 1607, 13, // NOLINT - 1608, 17, 1609, 17, 1610, 21, 1611, 21, - 1612, 25, 1613, 25, 1614, 29, 1615, 29, // NOLINT - 1616, 33, 1617, 33, 1618, 37, 1619, 37, - 1620, 41, 1621, 41, 1622, 45, 1623, 45, // NOLINT - 1624, 49, 1625, 49, 1626, 53, 1627, 53, - 1628, 57, 1629, 57, 1630, 61, 1631, 61, // NOLINT - 1632, 65, 1633, 65, 1634, 69, 1635, 69, - 1636, 73, 1637, 73, 1638, 77, 1639, 77, // NOLINT - 1640, 81, 1641, 81, 1642, 85, 1643, 85, - 1644, 89, 1645, 89, 1664, 93, 1665, 93, // NOLINT - 1666, 97, 1667, 97, 1668, 101, 1669, 101, - 1670, 105, 1671, 105, 1672, 109, 1673, 109, // NOLINT - 1674, 113, 1675, 113, 1676, 117, 1677, 117, - 1678, 121, 1679, 121, 1680, 125, 1681, 125, // NOLINT - 1682, 129, 1683, 129, 1684, 133, 1685, 133, - 1686, 137, 1687, 137, 1688, 141, 1689, 141, // NOLINT - 1690, 145, 1691, 145, 1826, 149, 1827, 149, - 1828, 153, 1829, 153, 1830, 157, 1831, 157, // NOLINT - 1832, 161, 1833, 161, 1834, 165, 1835, 165, - 1836, 169, 1837, 169, 1838, 173, 1839, 173, // NOLINT - 1842, 177, 1843, 177, 1844, 181, 1845, 181, - 1846, 185, 1847, 185, 1848, 189, 1849, 189, // NOLINT - 1850, 193, 1851, 193, 1852, 197, 1853, 197, - 1854, 201, 1855, 201, 1856, 205, 1857, 205, // NOLINT - 1858, 209, 1859, 209, 1860, 213, 1861, 213, - 1862, 217, 1863, 217, 1864, 221, 1865, 221, // NOLINT - 1866, 225, 1867, 225, 1868, 229, 1869, 229, - 1870, 233, 1871, 233, 1872, 237, 1873, 237, // NOLINT - 1874, 241, 1875, 241, 1876, 245, 1877, 245, - 1878, 249, 1879, 249, 1880, 253, 1881, 253, // NOLINT - 1882, 257, 1883, 257, 1884, 261, 1885, 261, - 1886, 265, 1887, 265, 1888, 269, 1889, 269, // NOLINT - 1890, 273, 1891, 273, 1892, 277, 1893, 277, - 1894, 281, 1895, 281, 1896, 285, 1897, 285, // NOLINT - 1898, 289, 1899, 289, 1900, 293, 1901, 293, - 1902, 297, 1903, 297, 1913, 301, 1914, 301, // NOLINT - 1915, 305, 1916, 305, 1917, 309, 1918, 313, - 1919, 313, 1920, 317, 1921, 317, 1922, 321, // NOLINT - 1923, 321, 1924, 325, 1925, 325, 1926, 329, - 1927, 329, 1931, 333, 1932, 333, 1933, 337, // NOLINT - 1936, 341, 1937, 341, 1938, 345, 1939, 345, - 1942, 349, 1943, 349, 1944, 353, 1945, 353, // NOLINT - 1946, 357, 1947, 357, 1948, 361, 1949, 361, - 1950, 365, 1951, 365, 1952, 369, 1953, 369, // NOLINT - 1954, 373, 1955, 373, 1956, 377, 1957, 377, - 1958, 381, 1959, 381, 1960, 385, 1961, 385, // NOLINT - 1962, 389, 1963, 393, 1964, 397, 1965, 401, - 1968, 405, 1969, 409}; // NOLINT -static const MultiCharacterSpecialCase<2> - kEcma262UnCanonicalizeMultiStrings7[3] = { // NOLINT - {{65313, 65345}}, - {{65338, 65370}}, - {{kSentinel}}}; // NOLINT -static const uint16_t kEcma262UnCanonicalizeTable7Size = 4; // NOLINT -static const int32_t kEcma262UnCanonicalizeTable7[8] = { - 1073749793, 1, 7994, 5, 1073749825, 1, 8026, 5}; // NOLINT -int Ecma262UnCanonicalize::Convert(uchar c, uchar n, uchar* result, - bool* allow_caching_ptr) { - int chunk_index = c >> 13; - switch (chunk_index) { - case 0: - return LookupMapping( - kEcma262UnCanonicalizeTable0, kEcma262UnCanonicalizeTable0Size, - kEcma262UnCanonicalizeMultiStrings0, c, n, result, allow_caching_ptr); - case 1: - return LookupMapping( - kEcma262UnCanonicalizeTable1, kEcma262UnCanonicalizeTable1Size, - kEcma262UnCanonicalizeMultiStrings1, c, n, result, allow_caching_ptr); - case 5: - return LookupMapping( - kEcma262UnCanonicalizeTable5, kEcma262UnCanonicalizeTable5Size, - kEcma262UnCanonicalizeMultiStrings5, c, n, result, allow_caching_ptr); - case 7: - return LookupMapping( - kEcma262UnCanonicalizeTable7, kEcma262UnCanonicalizeTable7Size, - kEcma262UnCanonicalizeMultiStrings7, c, n, result, allow_caching_ptr); - default: - return 0; - } -} - -static const MultiCharacterSpecialCase<1> - kCanonicalizationRangeMultiStrings0[1] = { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kCanonicalizationRangeTable0Size = 70; // NOLINT -static const int32_t kCanonicalizationRangeTable0[140] = { - 1073741889, 100, 90, 0, 1073741921, 100, 122, 0, - 1073742016, 88, 214, 0, 1073742040, 24, 222, 0, // NOLINT - 1073742048, 88, 246, 0, 1073742072, 24, 254, 0, - 1073742715, 8, 893, 0, 1073742728, 8, 906, 0, // NOLINT - 1073742749, 8, 927, 0, 1073742759, 16, 939, 0, - 1073742765, 8, 943, 0, 1073742781, 8, 959, 0, // NOLINT - 1073742791, 16, 971, 0, 1073742845, 8, 1023, 0, - 1073742848, 60, 1039, 0, 1073742864, 124, 1071, 0, // NOLINT - 1073742896, 124, 1103, 0, 1073742928, 60, 1119, 0, - 1073743153, 148, 1366, 0, 1073743201, 148, 1414, 0, // NOLINT - 1073746080, 148, 4293, 0, 1073749760, 28, 7943, 0, - 1073749768, 28, 7951, 0, 1073749776, 20, 7957, 0, // NOLINT - 1073749784, 20, 7965, 0, 1073749792, 28, 7975, 0, - 1073749800, 28, 7983, 0, 1073749808, 28, 7991, 0, // NOLINT - 1073749816, 28, 7999, 0, 1073749824, 20, 8005, 0, - 1073749832, 20, 8013, 0, 1073749856, 28, 8039, 0, // NOLINT - 1073749864, 28, 8047, 0, 1073749874, 12, 8053, 0, - 1073749960, 12, 8139, 0}; // NOLINT -static const MultiCharacterSpecialCase<1> - kCanonicalizationRangeMultiStrings1[1] = { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kCanonicalizationRangeTable1Size = 14; // NOLINT -static const int32_t kCanonicalizationRangeTable1[28] = { - 1073742176, 60, 367, 0, 1073742192, 60, 383, 0, - 1073743030, 100, 1231, 0, 1073743056, 100, 1257, 0, // NOLINT - 1073744896, 184, 3118, 0, 1073744944, 184, 3166, 0, - 1073745152, 148, 3365, 0}; // NOLINT -static const MultiCharacterSpecialCase<1> - kCanonicalizationRangeMultiStrings7[1] = { // NOLINT - {{kSentinel}}}; // NOLINT -static const uint16_t kCanonicalizationRangeTable7Size = 4; // NOLINT -static const int32_t kCanonicalizationRangeTable7[8] = { - 1073749793, 100, 7994, 0, 1073749825, 100, 8026, 0}; // NOLINT -int CanonicalizationRange::Convert(uchar c, uchar n, uchar* result, - bool* allow_caching_ptr) { - int chunk_index = c >> 13; - switch (chunk_index) { - case 0: - return LookupMapping( - kCanonicalizationRangeTable0, kCanonicalizationRangeTable0Size, - kCanonicalizationRangeMultiStrings0, c, n, result, allow_caching_ptr); - case 1: - return LookupMapping( - kCanonicalizationRangeTable1, kCanonicalizationRangeTable1Size, - kCanonicalizationRangeMultiStrings1, c, n, result, allow_caching_ptr); - case 7: - return LookupMapping( - kCanonicalizationRangeTable7, kCanonicalizationRangeTable7Size, - kCanonicalizationRangeMultiStrings7, c, n, result, allow_caching_ptr); - default: - return 0; - } -} - -#endif // !V8_INTL_SUPPORT - -} // namespace unibrow -} // namespace v8 diff --git a/js/src/new-regexp/util/vector.h b/js/src/new-regexp/util/vector.h deleted file mode 100644 index 435318ce7..000000000 --- a/js/src/new-regexp/util/vector.h +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2014 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_UTIL_VECTOR_H_ -#define V8_UTIL_VECTOR_H_ - -#include -#include -#include -#include - -#include "js/Utility.h" - -namespace v8 { -namespace internal { - -////////////////////////////////////////////////// - -// Adapted from: https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/allocation.h#L36-L58 - -template -T* NewArray(size_t size) { - static_assert(std::is_pod::value, ""); - js::AutoEnterOOMUnsafeRegion oomUnsafe; - T* result = static_cast(js_malloc(size * sizeof(T))); - if (!result) { - oomUnsafe.crash("Irregexp NewArray"); - } - return result; -} - -template -void DeleteArray(T* array) { - js_free(array); -} - -////////////////////////////////////////////////// - -// A non-resizable vector containing a pointer and a length. -// The Vector may or may not own the pointer, depending on context. -// Origin: -// https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/vector.h#L20-L134 - -template -class Vector { - public: - Vector() : start_(nullptr), length_(0) {} - - Vector(T* data, size_t length) : start_(data), length_(length) { - MOZ_ASSERT_IF(length != 0, data != nullptr); - } - - static Vector New(size_t length) { - return Vector(NewArray(length), length); - } - - // Returns a vector using the same backing storage as this one, - // spanning from and including 'from', to but not including 'to'. - Vector SubVector(size_t from, size_t to) const { - MOZ_ASSERT(from < to); - MOZ_ASSERT(to < length_); - return Vector(begin() + from, to - from); - } - - // Returns the length of the vector. Only use this if you really need an - // integer return value. Use {size()} otherwise. - int length() const { - MOZ_ASSERT(length_ <= std::numeric_limits::max()); - return static_cast(length_); - } - - // Returns the length of the vector as a size_t. - constexpr size_t size() const { return length_; } - - // Returns whether or not the vector is empty. - constexpr bool empty() const { return length_ == 0; } - - // Access individual vector elements - checks bounds in debug mode. - T& operator[](size_t index) const { - MOZ_ASSERT(index < length_); - return start_[index]; - } - - const T& at(size_t index) const { return operator[](index); } - - T& first() { return start_[0]; } - - T& last() { - MOZ_ASSERT(length_ > 0); - return start_[length_ - 1]; - } - - // Returns a pointer to the start of the data in the vector. - constexpr T* begin() const { return start_; } - - // Returns a pointer past the end of the data in the vector. - constexpr T* end() const { return start_ + length_; } - - // Returns a clone of this vector with a new backing store. - Vector Clone() const { - T* result = NewArray(length_); - for (size_t i = 0; i < length_; i++) result[i] = start_[i]; - return Vector(result, length_); - } - - void Truncate(size_t length) { - MOZ_ASSERT(length <= length_); - length_ = length; - } - - // Releases the array underlying this vector. Once disposed the - // vector is empty. - void Dispose() { - DeleteArray(start_); - start_ = nullptr; - length_ = 0; - } - - Vector operator+(size_t offset) { - MOZ_ASSERT(offset <= length_); - return Vector(start_ + offset, length_ - offset); - } - - Vector operator+=(size_t offset) { - MOZ_ASSERT(offset <= length_); - start_ += offset; - length_ -= offset; - return *this; - } - - // Implicit conversion from Vector to Vector. - inline operator Vector() const { - return Vector::cast(*this); - } - - template - static constexpr Vector cast(Vector input) { - return Vector(reinterpret_cast(input.begin()), - input.length() * sizeof(S) / sizeof(T)); - } - - bool operator==(const Vector other) const { - if (length_ != other.length_) return false; - if (start_ == other.start_) return true; - for (size_t i = 0; i < length_; ++i) { - if (start_[i] != other.start_[i]) { - return false; - } - } - return true; - } - - private: - T* start_; - size_t length_; -}; - -// The resulting vector does not contain a null-termination byte. If you want -// the null byte, use ArrayVector("foo"). -inline Vector CStrVector(const char* data) { - return Vector(data, strlen(data)); -} - -} // namespace internal - -namespace base { - -// SmallVector uses inline storage first, and reallocates when full. -// It is basically equivalent to js::Vector, and is implemented -// as a thin wrapper. -// V8's implementation: https://github.com/v8/v8/blob/master/src/base/small-vector.h -template -class SmallVector { -public: - inline bool empty() const { return inner_.empty(); } - inline const T& back() const { return inner_.back(); } - inline void pop_back() { inner_.popBack(); }; - template - inline void emplace_back(Args&&... args) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - if (!inner_.emplaceBack(args...)) { - oomUnsafe.crash("Irregexp SmallVector emplace_back"); - } - }; - inline size_t size() const { return inner_.length(); } - inline const T& at(size_t index) const { return inner_[index]; } - - void resize_no_init(size_t new_size) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - if (!inner_.resizeUninitialized(new_size)) { - oomUnsafe.crash("Irregexp SmallVector resize"); - } - } -private: - js::Vector inner_; -}; - - -} // namespace base - -} // namespace v8 - -#endif // V8_UTIL_VECTOR_H_ diff --git a/js/src/new-regexp/util/zone.h b/js/src/new-regexp/util/zone.h deleted file mode 100644 index 7183f77b7..000000000 --- a/js/src/new-regexp/util/zone.h +++ /dev/null @@ -1,375 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_UTIL_ZONE_H_ -#define V8_UTIL_ZONE_H_ - -#include -#include -#include -#include -#include - -#include "ds/LifoAlloc.h" -#include "ds/Sort.h" -#include "new-regexp/util/vector.h" - -namespace v8 { -namespace internal { - -// V8::Zone ~= LifoAlloc -class Zone { - public: - Zone(size_t defaultChunkSize) : lifoAlloc_(defaultChunkSize) { - lifoAlloc_.setAsInfallibleByDefault(); - } - - void* New(size_t size) { - js::LifoAlloc::AutoFallibleScope fallible(&lifoAlloc_); - js::AutoEnterOOMUnsafeRegion oomUnsafe; - void* result = lifoAlloc_.alloc(size); - if (!result) { - oomUnsafe.crash("Irregexp Zone::new"); - } - return result; - } - - void DeleteAll() { lifoAlloc_.freeAll(); } - - // Returns true if the total memory allocated exceeds a threshold. - static const size_t kExcessLimit = 256 * 1024 * 1024; - bool excess_allocation() const { - return lifoAlloc_.computedSizeOfExcludingThis() > kExcessLimit; - } -private: - js::LifoAlloc lifoAlloc_; -}; - -// Superclass for classes allocated in a Zone. -// Origin: https://github.com/v8/v8/blob/7b3332844212d78ee87a9426f3a6f7f781a8fbfa/src/zone/zone.h#L138-L155 -class ZoneObject { - public: - // Allocate a new ZoneObject of 'size' bytes in the Zone. - void* operator new(size_t size, Zone* zone) { return zone->New(size); } - - // Ideally, the delete operator should be private instead of - // public, but unfortunately the compiler sometimes synthesizes - // (unused) destructors for classes derived from ZoneObject, which - // require the operator to be visible. MSVC requires the delete - // operator to be public. - - // ZoneObjects should never be deleted individually; use - // Zone::DeleteAll() to delete all zone objects in one go. - void operator delete(void*, size_t) { MOZ_CRASH("unreachable"); } - void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); } -}; - -// ZoneLists are growable lists with constant-time access to the -// elements. The list itself and all its elements are allocated in the -// Zone. ZoneLists cannot be deleted individually; you can delete all -// objects in the Zone by calling Zone::DeleteAll(). -// Used throughout irregexp. -// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone.h#L173-L318 -// Inlines: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-list-inl.h#L17-L155 -template -class ZoneList final { - public: - // Construct a new ZoneList with the given capacity; the length is - // always zero. The capacity must be non-negative. - ZoneList(int capacity, Zone* zone) { Initialize(capacity, zone); } - // Construct a new ZoneList from a std::initializer_list - ZoneList(std::initializer_list list, Zone* zone) { - Initialize(static_cast(list.size()), zone); - for (auto& i : list) Add(i, zone); - } - // Construct a new ZoneList by copying the elements of the given ZoneList. - ZoneList(const ZoneList& other, Zone* zone) { - Initialize(other.length(), zone); - AddAll(other, zone); - } - - void* operator new(size_t size, Zone* zone) { return zone->New(size); } - - // Returns a reference to the element at index i. This reference is not safe - // to use after operations that can change the list's backing store - // (e.g. Add). - inline T& operator[](int i) const { - MOZ_ASSERT(0 < i); - MOZ_ASSERT(static_cast(i) < static_cast(length_)); - return data_[i]; - } - inline T& at(int i) const { return operator[](i); } - inline T& last() const { return at(length_ - 1); } - inline T& first() const { return at(0); } - - using iterator = T*; - inline iterator begin() const { return &data_[0]; } - inline iterator end() const { return &data_[length_]; } - - inline bool is_empty() const { return length_ == 0; } - inline int length() const { return length_; } - inline int capacity() const { return capacity_; } - - Vector ToVector() const { return Vector(data_, length_); } - Vector ToVector(int start, int length) const { - return Vector(data_ + start, std::min(length_ - start, length)); - } - - Vector ToConstVector() const { - return Vector(data_, length_); - } - - inline void Initialize(int capacity, Zone* zone) { - MOZ_ASSERT(capacity >= 0); - data_ = (capacity > 0) ? NewData(capacity, zone) : nullptr; - capacity_ = capacity; - length_ = 0; - } - - // Adds a copy of the given 'element' to the end of the list, - // expanding the list if necessary. - void Add(const T& element, Zone* zone) { - if (length_ < capacity_) { - data_[length_++] = element; - } else { - ZoneList::ResizeAdd(element, zone); - } - } - // Add all the elements from the argument list to this list. - void AddAll(const ZoneList& other, Zone* zone) { - AddAll(other.ToVector(), zone); - } - // Add all the elements from the vector to this list. - void AddAll(const Vector& other, Zone* zone) { - int result_length = length_ + other.length(); - if (capacity_ < result_length) { - Resize(result_length, zone); - } - if (std::is_fundamental()) { - memcpy(data_ + length_, other.begin(), sizeof(*data_) * other.length()); - } else { - for (int i = 0; i < other.length(); i++) { - data_[length_ + i] = other.at(i); - } - } - length_ = result_length; - } - - // Overwrites the element at the specific index. - void Set(int index, const T& element) { - MOZ_ASSERT(index >= 0 && index <= length_); - data_[index] = element; - } - - // Removes the i'th element without deleting it even if T is a - // pointer type; moves all elements above i "down". Returns the - // removed element. This function's complexity is linear in the - // size of the list. - T Remove(int i) { - T element = at(i); - length_--; - while (i < length_) { - data_[i] = data_[i + 1]; - i++; - } - return element; - } - - // Removes the last element without deleting it even if T is a - // pointer type. Returns the removed element. - inline T RemoveLast() { return Remove(length_ - 1); } - - // Clears the list by freeing the storage memory. If you want to keep the - // memory, use Rewind(0) instead. Be aware, that even if T is a - // pointer type, clearing the list doesn't delete the entries. - inline void Clear() { - data_ = nullptr; - capacity_ = 0; - length_ = 0; - } - - // Drops all but the first 'pos' elements from the list. - inline void Rewind(int pos) { - MOZ_ASSERT(0 <= pos && pos <= length_); - length_ = pos; - } - - inline bool Contains(const T& elm) const { - for (int i = 0; i < length_; i++) { - if (data_[i] == elm) return true; - } - return false; - } - - template - void StableSort(CompareFunction cmp, size_t start, size_t length) { - js::AutoEnterOOMUnsafeRegion oomUnsafe; - T* scratch = static_cast(js_malloc(length * sizeof(T))); - if (!scratch) { - oomUnsafe.crash("Irregexp stable sort scratch space"); - } - auto comparator = [cmp](const T& a, const T& b, bool* lessOrEqual) { - *lessOrEqual = cmp(&a, &b) <= 0; - return true; - }; - MOZ_ALWAYS_TRUE(js::MergeSort(begin() + start, length, scratch, - comparator)); - js_free(scratch); - } - - void operator delete(void* pointer) { MOZ_CRASH("unreachable"); } - void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); } - - private: - T* data_; - int capacity_; - int length_; - - inline T* NewData(int n, Zone* zone) { - return static_cast(zone->New(n * sizeof(T))); - } - - // Increase the capacity of a full list, and add an element. - // List must be full already. - void ResizeAdd(const T& element, Zone* zone) { - MOZ_ASSERT(length_ >= capacity_); - // Grow the list capacity by 100%, but make sure to let it grow - // even when the capacity is zero (possible initial case). - int new_capacity = 1 + 2 * capacity_; - // Since the element reference could be an element of the list, copy - // it out of the old backing storage before resizing. - T temp = element; - Resize(new_capacity, zone); - data_[length_++] = temp; - } - - // Resize the list. - void Resize(int new_capacity, Zone* zone) { - MOZ_ASSERT(length_ <= new_capacity); - T* new_data = NewData(new_capacity, zone); - if (length_ > 0) { - memcpy(new_data, data_, length_ * sizeof(T)); - } - data_ = new_data; - capacity_ = new_capacity; - } - - ZoneList& operator=(const ZoneList&) = delete; - ZoneList() = delete; - ZoneList(const ZoneList&) = delete; -}; - -// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-allocator.h#L14-L77 -template -class ZoneAllocator { -public: - using pointer = T*; - using const_pointer = const T*; - using reference = T&; - using const_reference = const T&; - using value_type = T; - using size_type = size_t; - using difference_type = ptrdiff_t; - template - struct rebind { - using other = ZoneAllocator; - }; - - explicit ZoneAllocator(Zone* zone) : zone_(zone) {} - template - ZoneAllocator(const ZoneAllocator& other) - : ZoneAllocator(other.zone_) {} - template - friend class ZoneAllocator; - - T* allocate(size_t n) { return static_cast(zone_->New(n * sizeof(T))); } - void deallocate(T* p, size_t) {} // noop for zones - - bool operator==(ZoneAllocator const& other) const { - return zone_ == other.zone_; - } - bool operator!=(ZoneAllocator const& other) const { - return zone_ != other.zone_; - } - -private: - Zone* zone_; -}; - -// Zone wrappers for std containers: -// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-containers.h#L25-L169 - -// A wrapper subclass for std::vector to make it easy to construct one -// that uses a zone allocator. -// Used throughout irregexp -template -class ZoneVector : public std::vector> { -public: - ZoneVector(Zone* zone) - : std::vector>(ZoneAllocator(zone)) {} - - // Constructs a new vector and fills it with the contents of the range - // [first, last). - template - ZoneVector(Iter first, Iter last, Zone* zone) - : std::vector>(first, last, ZoneAllocator(zone)) {} -}; - -// A wrapper subclass for std::list to make it easy to construct one -// that uses a zone allocator. -// Used in regexp-bytecode-peephole.cc -template -class ZoneLinkedList : public std::list> { - public: - // Constructs an empty list. - explicit ZoneLinkedList(Zone* zone) - : std::list>(ZoneAllocator(zone)) {} -}; - -// A wrapper subclass for std::set to make it easy to construct one that uses -// a zone allocator. -// Used in regexp-parser.cc -template > -class ZoneSet : public std::set> { - public: - // Constructs an empty set. - explicit ZoneSet(Zone* zone) - : std::set>(Compare(), - ZoneAllocator(zone)) {} -}; - -// A wrapper subclass for std::map to make it easy to construct one that uses -// a zone allocator. -// Used in regexp-bytecode-peephole.cc -template > -class ZoneMap - : public std::map>> { - public: - // Constructs an empty map. - explicit ZoneMap(Zone* zone) - : std::map>>( - Compare(), ZoneAllocator>(zone)) {} -}; - -// A wrapper subclass for std::unordered_map to make it easy to construct one -// that uses a zone allocator. -// Used in regexp-bytecode-peephole.cc -template , - typename KeyEqual = std::equal_to> -class ZoneUnorderedMap - : public std::unordered_map>> { - public: - // Constructs an empty map. - explicit ZoneUnorderedMap(Zone* zone, size_t bucket_count = 100) - : std::unordered_map>>( - bucket_count, Hash(), KeyEqual(), - ZoneAllocator>(zone)) {} -}; - -} // namespace internal -} // namespace v8 - -#endif // V8_UTIL_FLAG_H_ -- cgit v1.2.3