Issue #1677 - Part 5: "Simplify" regexp re-import process (and re-import from later revision)

I am going on record to say Mozilla are utter fucking assholes for pulling this as part of their progression.
author: Matt A. Tobin <email@mattatobin.com> 2020-11-09 20:37:05 -0500
committer: Matt A. Tobin <email@mattatobin.com> 2020-11-09 20:37:05 -0500
commit: 51468e998c8e7191ddecacec3944c806b29dd590 (patch)
tree: c713f075c54781868ec119ea5c5f3c9369af3576
parent: 77746f1d900a35eceb23bd760983e95de7b4a547 (diff)
download: uxp-51468e998c8e7191ddecacec3944c806b29dd590.tar.gz
26 files changed, 706 insertions, 512 deletions
diff --git a/js/src/regexp/VERSION b/js/src/regexp/VERSION
index 3a0935deac..c7d35a2bb8 100644
--- a/js/src/regexp/VERSION
+++ b/js/src/regexp/VERSION
@@ -1,3 +1,2 @@
-This code was most recently imported from the following version of V8:
-
-https://github.com/v8/v8/tree/2599d3cc208a3a4873be517285220abd8416c3d7/src/regexp
+Imported using import-irregexp.py from:
+https://github.com/v8/v8/tree/560f2d8bb3f3a72d78e1a7d7654235d53fdcc83c/src/regexp
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc
index 337743f536..b4a8c3da48 100644
--- a/js/src/regexp/gen-regexp-special-case.cc
+++ b/js/src/regexp/gen-regexp-special-case.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
+// Copyright 2020 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -7,18 +7,19 @@
 #include <iostream>
 #include <sstream>
 
-#include "unicode/uchar.h"
-#include "unicode/uniset.h"
+#include "regexp/special-case.h"
 
 namespace v8 {
 namespace internal {
 
-// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
-// functions into "src/regexp/special-case.cc".
-// See more details in http://shorturl.at/adfO5
-void PrintSet(std::ofstream& out, const char* func_name,
+static const uc32 kSurrogateStart = 0xd800;
+static const uc32 kSurrogateEnd = 0xdfff;
+static const uc32 kNonBmpStart = 0x10000;
+
+// The following code generates "src/regexp/special-case.cc".
+void PrintSet(std::ofstream& out, const char* name,
               const icu::UnicodeSet& set) {
-  out << "icu::UnicodeSet " << func_name << "() {\n"
+  out << "icu::UnicodeSet Build" << name << "() {\n"
       << "  icu::UnicodeSet set;\n";
   for (int32_t i = 0; i < set.getRangeCount(); i++) {
     if (set.getRangeStart(i) == set.getRangeEnd(i)) {
@@ -30,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
   }
   out << "  set.freeze();\n"
       << "  return set;\n"
-      << "}\n";
+      << "}\n\n";
+
+  out << "struct " << name << "Data {\n"
+      << "  " << name << "Data() : set(Build" << name << "()) {}\n"
+      << "  const icu::UnicodeSet set;\n"
+      << "};\n\n";
+
+  out << "//static\n"
+      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
+      << "  static base::LazyInstance<" << name << "Data>::type set =\n"
+      << "      LAZY_INSTANCE_INITIALIZER;\n"
+      << "  return set.Pointer()->set;\n"
+      << "}\n\n";
 }
 
 void PrintSpecial(std::ofstream& out) {
   icu::UnicodeSet current;
-  icu::UnicodeSet processed(0xd800, 0xdbff);  // Ignore surrogate range.
   icu::UnicodeSet special_add;
   icu::UnicodeSet ignore;
   UErrorCode status = U_ZERO_ERROR;
   icu::UnicodeSet upper("[\\p{Lu}]", status);
   CHECK(U_SUCCESS(status));
-  // Iterate through all chars in BMP except ASCII and Surrogate.
-  for (UChar32 i = 0x80; i < 0x010000; i++) {
-    // Ignore those characters which is already processed.
-    if (!processed.contains(i)) {
-      current.set(i, i);
-      current.closeOver(USET_CASE_INSENSITIVE);
 
-      // Remember we already processed current.
-      processed.addAll(current);
-
-      // All uppercase characters in current.
-      icu::UnicodeSet keep_upper(current);
-      keep_upper.retainAll(upper);
-
-      // Check if we have more than one uppercase character in current.
-      // If there are more than one uppercase character, then it is a special
-      // set which need to be added into either "Special Add" set or "Ignore"
-      // set.
-      int32_t number_of_upper = 0;
-      for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
-        number_of_upper +=
-            keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
+  // Iterate through all chars in BMP except surrogates.
+  for (UChar32 i = 0; i < kNonBmpStart; i++) {
+    if (i >= kSurrogateStart && i <= kSurrogateEnd) {
+      continue;  // Ignore surrogate range
+    }
+    current.set(i, i);
+    current.closeOver(USET_CASE_INSENSITIVE);
+
+    // Check to see if all characters in the case-folding equivalence
+    // class as defined by UnicodeSet::closeOver all map to the same
+    // canonical value.
+    UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
+    bool class_has_matching_canonical_char = false;
+    bool class_has_non_matching_canonical_char = false;
+    for (int32_t j = 0; j < current.getRangeCount(); j++) {
+      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
+           c++) {
+        if (c == i) {
+          continue;
+        }
+        UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
+        if (canonical == other_canonical) {
+          class_has_matching_canonical_char = true;
+        } else {
+          class_has_non_matching_canonical_char = true;
+        }
+      }
+    }
+    // If any other character in i's equivalence class has a
+    // different canonical value, then i needs special handling.  If
+    // no other character shares a canonical value with i, i goes in
+    // the ignore set: it can be skipped when adding alternatives for
+    // case-independent comparison.  If at least one other character
+    // shares a canonical value, i goes in the special-add set.
+    if (class_has_non_matching_canonical_char) {
+      if (class_has_matching_canonical_char) {
+        special_add.add(i);
+      } else {
+        ignore.add(i);
       }
-      if (number_of_upper > 1) {
-        // Add all non uppercase characters (could be Ll or Mn) to special add
-        // set.
-        current.removeAll(upper);
-        special_add.addAll(current);
-
-        // Add the uppercase characters of non uppercase character to
-        // special add set.
-        CHECK_GT(current.getRangeCount(), 0);
-        UChar32 main_upper = u_toupper(current.getRangeStart(0));
-        special_add.add(main_upper);
-
-        // Add all uppercase except the main upper to ignore set.
-        keep_upper.remove(main_upper);
-        ignore.addAll(keep_upper);
+    }
+  }
+
+  // Verify that no Unicode equivalence class contains two non-trivial
+  // JS equivalence classes. Every character in SpecialAddSet has the
+  // same canonical value as every other non-IgnoreSet character in
+  // its Unicode equivalence class. Therefore, if we call closeOver on
+  // a set containing no IgnoreSet characters, the only characters
+  // that must be removed from the result are in IgnoreSet. This fact
+  // is used in CharacterRange::AddCaseEquivalents.
+  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
+    for (UChar32 c = special_add.getRangeStart(i);
+         c <= special_add.getRangeEnd(i); c++) {
+      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
+      current.set(c, c);
+      current.closeOver(USET_CASE_INSENSITIVE);
+      current.removeAll(ignore);
+      for (int32_t j = 0; j < current.getRangeCount(); j++) {
+        for (UChar32 c2 = current.getRangeStart(j);
+             c2 <= current.getRangeEnd(j); c2++) {
+          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
+        }
       }
     }
   }
 
-  // Remove any ASCII
-  special_add.remove(0x0000, 0x007f);
-  PrintSet(out, "BuildIgnoreSet", ignore);
-  PrintSet(out, "BuildSpecialAddSet", special_add);
+  PrintSet(out, "IgnoreSet", ignore);
+  PrintSet(out, "SpecialAddSet", special_add);
 }
 
 void WriteHeader(const char* header_filename) {
   std::ofstream out(header_filename);
   out << std::hex << std::setfill('0') << std::setw(4);
-
-  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
-      << "// The following functions are used to build icu::UnicodeSet\n"
-      << "// for specical cases different between Unicode and ECMA262.\n"
+  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
+      << "// Use of this source code is governed by a BSD-style license that\n"
+      << "// can be found in the LICENSE file.\n\n"
+      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
+      << "// The following functions are used to build UnicodeSets\n"
+      << "// for special cases where the case-folding algorithm used by\n"
+      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
+      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
+      << "// Semantics: Canonicalize) step 3.\n\n"
       << "#ifdef V8_INTL_SUPPORT\n"
+      << "#include \"src/base/lazy-instance.h\"\n\n"
       << "#include \"src/regexp/special-case.h\"\n\n"
       << "#include \"unicode/uniset.h\"\n"
       << "namespace v8 {\n"
diff --git a/js/src/regexp/import-irregexp.py b/js/src/regexp/import-irregexp.py
new file mode 100644
index 0000000000..870387232c
--- /dev/null
+++ b/js/src/regexp/import-irregexp.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This script handles all the mechanical steps of importing irregexp from v8:
+#
+# 1. Acquire the source: either from github, or optionally from a local copy of v8.
+# 2. Copy the contents of v8/src/regexp into js/src/regexp
+#    - Exclude files that we have chosen not to import.
+# 3. While doing so, update #includes:
+#    - Change "src/regexp/*" to "regexp/*".
+#    - Remove other v8-specific headers completely.
+# 4. Add '#include "regexp/regexp-shim.h"' in the necessary places.
+# 5. Update the VERSION file to include the correct git hash.
+#
+# Usage:
+#  cd path/to/js/src/regexp
+#  ./import-irregexp.py --path path/to/v8/src/regexp
+#
+# Alternatively, without the --path argument, import-irregexp.py will
+# clone v8 from github into a temporary directory.
+#
+# After running this script, changes to the shim code may be necessary
+# to account for changes in upstream irregexp.
+
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+def get_hash(path):
+    # Return the hash of the current git revision (HEAD) of the checkout
+    # containing `path`. git is run with cwd= instead of os.chdir() so a
+    # failing command cannot leave this process in the wrong directory.
+    command = ['git', 'rev-parse', 'HEAD']
+    result = subprocess.check_output(command, cwd=str(path),
+                                     encoding='utf-8')
+    return result.rstrip()
+
+
+def copy_and_update_includes(src_path, dst_path):
+    # List of header files that need to include the shim header
+    need_shim = ['property-sequences.h',
+                 'regexp-ast.h',
+                 'regexp-bytecode-peephole.h',
+                 'regexp-bytecodes.h',
+                 'regexp-dotprinter.h',
+                 'regexp.h',
+                 'regexp-macro-assembler.h',
+                 'regexp-stack.h',
+                 'special-case.h']
+
+    # 1. Rewrite includes of V8 regexp headers:
+    regexp_include = re.compile('#include "src/regexp')
+    regexp_include_new = '#include "regexp'
+
+    # 2. Remove includes of other V8 headers
+    other_include = re.compile('#include "src/')
+
+    # 3. If needed, add '#include "regexp/regexp-shim.h"'.
+    #    Note: We get a little fancy to ensure that header files are
+    #    in alphabetic order. `need_to_add_shim` is true if we still
+    #    have to add the shim header in this file. `adding_shim_now`
+    #    is true if we have found a '#include "src/*' and we are just
+    #    waiting to find something alphabetically smaller (or an empty
+    #    line) so that we can insert the shim header in the right place.
+    need_to_add_shim = src_path.name in need_shim
+    adding_shim_now = False
+    shim_sort_key = '#include "src/regexp/regexp-shim.h"'
+
+    # `with` closes both files even if the copy raises part-way through.
+    with open(str(src_path), 'r') as src, open(str(dst_path), 'w') as dst:
+        for line in src:
+            if adding_shim_now:
+                if line == '\n' or line > shim_sort_key:
+                    dst.write('#include "regexp/regexp-shim.h"\n')
+                    need_to_add_shim = False
+                    adding_shim_now = False
+
+            if regexp_include.search(line):
+                dst.write(re.sub(regexp_include, regexp_include_new, line))
+            elif other_include.search(line):
+                if need_to_add_shim:
+                    adding_shim_now = True
+            else:
+                dst.write(line)
+
+
+def import_from(srcdir, dstdir):
+    # Copy every non-directory entry of srcdir (except the excluded ones)
+    # into dstdir, rewriting includes, then record the imported revision.
+    excluded = ['OWNERS',
+                'regexp.cc',
+                'regexp-utils.cc',
+                'regexp-utils.h',
+                'regexp-macro-assembler-arch.h']
+
+    for file in srcdir.iterdir():
+        if file.is_dir() or str(file.name) in excluded:
+            continue
+        copy_and_update_includes(file, dstdir / file.name)
+
+    # Update VERSION file; `with` closes (and flushes) it when done.
+    url = 'https://github.com/v8/v8/tree/%s/src/regexp\n' % get_hash(srcdir)
+    with open(str(dstdir / 'VERSION'), 'w') as version_file:
+        version_file.write('Imported using import-irregexp.py from:\n')
+        version_file.write(url)
+
+
+if __name__ == '__main__':
+    import argparse
+    import tempfile
+
+    # This script should be run from js/src/regexp to work correctly.
+    current_path = Path(os.getcwd())
+    expected_path = 'js/src/regexp'
+    if not current_path.match(expected_path):
+        raise RuntimeError('%s must be run from %s' % (sys.argv[0],
+                                                       expected_path))
+
+    parser = argparse.ArgumentParser(description='Import irregexp from v8')
+    parser.add_argument('-p', '--path', help='path to v8/src/regexp')
+    args = parser.parse_args()
+
+    if args.path:
+        src_path = Path(args.path)
+
+        if not (src_path / 'regexp.h').exists():
+            print('Usage:\n  import-irregexp.py --path <path/to/v8/src/regexp>')
+            sys.exit(1)
+        import_from(src_path, current_path)
+        sys.exit(0)
+
+    with tempfile.TemporaryDirectory() as tempdir:
+        v8_git = 'https://github.com/v8/v8.git'
+        clone = ['git', 'clone', '--depth', '1', v8_git, tempdir]
+        subprocess.check_call(clone)  # no shell; raises if the clone fails
+        src_path = Path(tempdir) / 'src/regexp'
+        import_from(src_path, current_path)
diff --git a/js/src/regexp/regexp-ast.h b/js/src/regexp/regexp-ast.h
index fe6913e1d4..311929d0b9 100644
--- a/js/src/regexp/regexp-ast.h
+++ b/js/src/regexp/regexp-ast.h
@@ -458,7 +458,11 @@ class RegExpQuantifier final : public RegExpTree {
 class RegExpCapture final : public RegExpTree {
  public:
   explicit RegExpCapture(int index)
-      : body_(nullptr), index_(index), name_(nullptr) {}
+      : body_(nullptr),
+        index_(index),
+        min_match_(0),
+        max_match_(0),
+        name_(nullptr) {}
   void* Accept(RegExpVisitor* visitor, void* data) override;
   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
   static RegExpNode* ToNode(RegExpTree* body, int index,
@@ -468,10 +472,14 @@ class RegExpCapture final : public RegExpTree {
   bool IsAnchoredAtEnd() override;
   Interval CaptureRegisters() override;
   bool IsCapture() override;
-  int min_match() override { return body_->min_match(); }
-  int max_match() override { return body_->max_match(); }
+  int min_match() override { return min_match_; }
+  int max_match() override { return max_match_; }
   RegExpTree* body() { return body_; }
-  void set_body(RegExpTree* body) { body_ = body; }
+  void set_body(RegExpTree* body) {
+    body_ = body;
+    min_match_ = body->min_match();
+    max_match_ = body->max_match();
+  }
   int index() const { return index_; }
   const ZoneVector<uc16>* name() const { return name_; }
   void set_name(const ZoneVector<uc16>* name) { name_ = name; }
@@ -481,12 +489,17 @@ class RegExpCapture final : public RegExpTree {
  private:
   RegExpTree* body_;
   int index_;
+  int min_match_;
+  int max_match_;
   const ZoneVector<uc16>* name_;
 };
 
 class RegExpGroup final : public RegExpTree {
  public:
-  explicit RegExpGroup(RegExpTree* body) : body_(body) {}
+  explicit RegExpGroup(RegExpTree* body)
+      : body_(body),
+        min_match_(body->min_match()),
+        max_match_(body->max_match()) {}
   void* Accept(RegExpVisitor* visitor, void* data) override;
   RegExpNode* ToNode(RegExpCompiler* compiler,
                      RegExpNode* on_success) override {
@@ -496,13 +509,15 @@ class RegExpGroup final : public RegExpTree {
   bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
   bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
   bool IsGroup() override;
-  int min_match() override { return body_->min_match(); }
-  int max_match() override { return body_->max_match(); }
+  int min_match() override { return min_match_; }
+  int max_match() override { return max_match_; }
   Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
   RegExpTree* body() { return body_; }
 
  private:
   RegExpTree* body_;
+  int min_match_;
+  int max_match_;
 };
 
 class RegExpLookaround final : public RegExpTree {
diff --git a/js/src/regexp/regexp-bytecode-generator.cc b/js/src/regexp/regexp-bytecode-generator.cc
index 239b27605f..db151de851 100644
--- a/js/src/regexp/regexp-bytecode-generator.cc
+++ b/js/src/regexp/regexp-bytecode-generator.cc
@@ -327,13 +327,11 @@ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg,
 }
 
 void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
+    int start_reg, bool read_backward, Label* on_not_equal) {
   DCHECK_LE(0, start_reg);
   DCHECK_GE(kMaxRegister, start_reg);
-  Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
-                                : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
-                     : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
-                                : BC_CHECK_NOT_BACK_REF_NO_CASE),
+  Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
+                     : BC_CHECK_NOT_BACK_REF_NO_CASE,
        start_reg);
   EmitOrLink(on_not_equal);
 }
diff --git a/js/src/regexp/regexp-bytecode-generator.h b/js/src/regexp/regexp-bytecode-generator.h
index 15fbda8ecb..f5502464d4 100644
--- a/js/src/regexp/regexp-bytecode-generator.h
+++ b/js/src/regexp/regexp-bytecode-generator.h
@@ -69,7 +69,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward, bool unicode,
+                                               bool read_backward,
                                                Label* on_no_match);
   virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
   virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
diff --git a/js/src/regexp/regexp-bytecode-peephole.cc b/js/src/regexp/regexp-bytecode-peephole.cc
index 2bc1b5aa26..4266b4a807 100644
--- a/js/src/regexp/regexp-bytecode-peephole.cc
+++ b/js/src/regexp/regexp-bytecode-peephole.cc
@@ -428,7 +428,6 @@ BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping(
     size_t index) const {
   DCHECK(IsSequence());
   DCHECK(argument_mapping_ != nullptr);
-  DCHECK_GE(index, 0);
   DCHECK_LT(index, argument_mapping_->size());
 
   return argument_mapping_->at(index);
diff --git a/js/src/regexp/regexp-bytecodes.h b/js/src/regexp/regexp-bytecodes.h
index 24d6925db9..1cfef1b2d4 100644
--- a/js/src/regexp/regexp-bytecodes.h
+++ b/js/src/regexp/regexp-bytecodes.h
@@ -100,12 +100,12 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
   V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128           */       \
   V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32                       */          \
   V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32                       */          \
-  V(CHECK_NOT_BACK_REF, 37, 8)         /* bc8 reg_idx24 addr32 */              \
-  V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */              \
-  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8)                                 \
+  V(CHECK_NOT_BACK_REF, 37, 8)                  /* bc8 reg_idx24 addr32 */     \
+  V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8)          /* bc8 reg_idx24 addr32 */     \
+  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8)  /* UNUSED */                   \
   V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8)         /* bc8 reg_idx24 addr32 */     \
   V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */     \
-  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8)                        \
+  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */           \
   V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */          \
   V(CHECK_REGISTER_LT, 44, 12)    /* bc8 reg_idx24 value32 addr32 */           \
   V(CHECK_REGISTER_GE, 45, 12)    /* bc8 reg_idx24 value32 addr32 */           \
diff --git a/js/src/regexp/regexp-compiler-tonode.cc b/js/src/regexp/regexp-compiler-tonode.cc
index fc734ac7c1..257030589d 100644
--- a/js/src/regexp/regexp-compiler-tonode.cc
+++ b/js/src/regexp/regexp-compiler-tonode.cc
@@ -1137,39 +1137,6 @@ Vector<const int> CharacterRange::GetWordBounds() {
   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
 }
 
-#ifdef V8_INTL_SUPPORT
-struct IgnoreSet {
-  IgnoreSet() : set(BuildIgnoreSet()) {}
-  const icu::UnicodeSet set;
-};
-
-struct SpecialAddSet {
-  SpecialAddSet() : set(BuildSpecialAddSet()) {}
-  const icu::UnicodeSet set;
-};
-
-icu::UnicodeSet BuildAsciiAToZSet() {
-  icu::UnicodeSet set('a', 'z');
-  set.add('A', 'Z');
-  set.freeze();
-  return set;
-}
-
-struct AsciiAToZSet {
-  AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
-  const icu::UnicodeSet set;
-};
-
-static base::LazyInstance<IgnoreSet>::type ignore_set =
-    LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<SpecialAddSet>::type special_add_set =
-    LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
-    LAZY_INSTANCE_INITIALIZER;
-#endif  // V8_INTL_SUPPORT
-
 // static
 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
                                         ZoneList<CharacterRange>* ranges,
@@ -1192,75 +1159,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
     others.add(from, to);
   }
 
-  // Set of characters already added to ranges that do not need to be added
-  // again.
+  // Compute the set of additional characters that should be added,
+  // using UnicodeSet::closeOver. ECMA 262 defines slightly different
+  // case-folding rules than Unicode, so some characters that are
+  // added by closeOver do not match anything other than themselves in
+  // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
+  // same case-insensitive character as 's' or 'S' according to
+  // Unicode, but does not match any other character in JS. To handle
+  // this case, we add such characters to the IgnoreSet and filter
+  // them out. We filter twice: once before calling closeOver (to
+  // prevent 'ſ' from adding 's'), and once after calling closeOver
+  // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
+  // more information.
   icu::UnicodeSet already_added(others);
-
-  // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
-  icu::UnicodeSet in_ascii_a_to_z(others);
-  in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
-
-  // Remove all chars in [a-zA-Z] from others.
-  others.removeAll(in_ascii_a_to_z);
-
-  // Set of characters in ranges that are overlapping with special add set.
-  icu::UnicodeSet in_special_add(others);
-  in_special_add.retainAll(special_add_set.Pointer()->set);
-
-  others.removeAll(in_special_add);
-
-  // Ignore all chars in ignore set.
-  others.removeAll(ignore_set.Pointer()->set);
-
-  // For most of the chars in ranges that is still in others, find the case
-  // equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
+  others.removeAll(RegExpCaseFolding::IgnoreSet());
   others.closeOver(USET_CASE_INSENSITIVE);
-
-  // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
-  // but ECMA262 "i" mode won't consider that, remove them from others.
-  // Ex: U+017F add 'S' and 's' to others.
-  others.removeAll(ascii_a_to_z_set.Pointer()->set);
-
-  // Special handling for in_ascii_a_to_z.
-  for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
-    UChar32 start = in_ascii_a_to_z.getRangeStart(i);
-    UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
-    // Check if it is uppercase A-Z by checking bit 6.
-    if (start & 0x0020) {
-      // Add the lowercases
-      others.add(start & 0x005F, end & 0x005F);
-    } else {
-      // Add the uppercases
-      others.add(start | 0x0020, end | 0x0020);
-    }
-  }
-
-  // Special handling for chars in "Special Add" set.
-  for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
-    UChar32 end = in_special_add.getRangeEnd(i);
-    for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
-      // Add the uppercase of this character if itself is not an uppercase
-      // character.
-      // Note: The if condiction cannot be u_islower(ch) because ch could be
-      // neither uppercase nor lowercase but Mn.
-      if (!u_isupper(ch)) {
-        others.add(u_toupper(ch));
-      }
-      icu::UnicodeSet candidates(ch, ch);
-      candidates.closeOver(USET_CASE_INSENSITIVE);
-      for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
-        UChar32 end2 = candidates.getRangeEnd(j);
-        for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
-          // Add character that is not uppercase to others.
-          if (!u_isupper(ch2)) {
-            others.add(ch2);
-          }
-        }
-      }
-    }
-  }
-
-  // Remove all characters which already in the ranges.
+  others.removeAll(RegExpCaseFolding::IgnoreSet());
   others.removeAll(already_added);
 
   // Add others to the ranges
diff --git a/js/src/regexp/regexp-compiler.cc b/js/src/regexp/regexp-compiler.cc
index 9a2aa30dcc..c0070061f8 100644
--- a/js/src/regexp/regexp-compiler.cc
+++ b/js/src/regexp/regexp-compiler.cc
@@ -5,7 +5,9 @@
 #include "regexp/regexp-compiler.h"
 
 #include "regexp/regexp-macro-assembler-arch.h"
-#include "regexp/regexp-macro-assembler-tracer.h"
+#ifdef V8_INTL_SUPPORT
+#include "regexp/special-case.h"
+#endif  // V8_INTL_SUPPORT
 
 #ifdef V8_INTL_SUPPORT
 #include "unicode/locid.h"
@@ -237,20 +239,15 @@ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
     Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
     int capture_count, Handle<String> pattern) {
-#ifdef DEBUG
-  if (FLAG_trace_regexp_assembler)
-    macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
-  else
-#endif
-    macro_assembler_ = macro_assembler;
+  macro_assembler_ = macro_assembler;
 
-  std::vector<RegExpNode*> work_list;
+  ZoneVector<RegExpNode*> work_list(zone());
   work_list_ = &work_list;
   Label fail;
   macro_assembler_->PushBacktrack(&fail);
   Trace new_trace;
   start->Emit(this, &new_trace);
-  macro_assembler_->Bind(&fail);
+  macro_assembler_->BindJumpTarget(&fail);
   macro_assembler_->Fail();
   while (!work_list.empty()) {
     RegExpNode* node = work_list.back();
@@ -264,14 +261,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
   }
 
   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
-  isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
+  isolate->IncreaseTotalRegexpCodeGenerated(code);
   work_list_ = nullptr;
 
-#ifdef DEBUG
-  if (FLAG_trace_regexp_assembler) {
-    delete macro_assembler_;
-  }
-#endif
   return {*code, next_register_};
 }
 
@@ -557,7 +549,7 @@ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
   }
 
   // On backtrack we need to restore state.
-  assembler->Bind(&undo);
+  assembler->BindJumpTarget(&undo);
   RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
                            registers_to_clear);
   if (backtrack() == nullptr) {
@@ -720,32 +712,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
                                      unibrow::uchar* letters,
                                      int letter_length) {
 #ifdef V8_INTL_SUPPORT
-  // Special case for U+017F which has upper case in ASCII range.
-  if (character == 0x017f) {
+  if (RegExpCaseFolding::IgnoreSet().contains(character)) {
     letters[0] = character;
     return 1;
   }
+  bool in_special_add_set =
+      RegExpCaseFolding::SpecialAddSet().contains(character);
+
   icu::UnicodeSet set;
   set.add(character);
   set = set.closeOver(USET_CASE_INSENSITIVE);
+
+  UChar32 canon = 0;
+  if (in_special_add_set) {
+    canon = RegExpCaseFolding::Canonicalize(character);
+  }
+
   int32_t range_count = set.getRangeCount();
   int items = 0;
   for (int32_t i = 0; i < range_count; i++) {
     UChar32 start = set.getRangeStart(i);
     UChar32 end = set.getRangeEnd(i);
     CHECK(end - start + items <= letter_length);
-    // Only add to the output if character is not in ASCII range
-    // or the case equivalent character is in ASCII range.
-    // #sec-runtime-semantics-canonicalize-ch
-    // 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128,
-    //     return ch.
-    if (!((start >= 128) && (character < 128))) {
-      // No range have start and end span across code point 128.
-      DCHECK((start >= 128) == (end >= 128));
-      for (UChar32 cu = start; cu <= end; cu++) {
-        if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
-        letters[items++] = (unibrow::uchar)(cu);
+    for (UChar32 cu = start; cu <= end; cu++) {
+      if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
+      if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
+        continue;
       }
+      letters[items++] = (unibrow::uchar)(cu);
     }
   }
   return items;
@@ -852,10 +846,6 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
   return false;
 }
 
-using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler,
-                                   uc16 c, Label* on_failure, int cp_offset,
-                                   bool check, bool preloaded);
-
 // Only emits letters (things that have case).  Only used for case independent
 // matches.
 static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
@@ -1843,13 +1833,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
     if (elm.text_type() == TextElement::ATOM) {
       Vector<const uc16> quarks = elm.atom()->data();
       for (int j = 0; j < quarks.length(); j++) {
-        uint16_t c = quarks[j];
+        uc16 c = quarks[j];
         if (elm.atom()->ignore_case()) {
           c = unibrow::Latin1::TryConvertToLatin1(c);
         }
         if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
         // Replace quark in case we converted to Latin-1.
-        uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.begin());
+        uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
         writable_quarks[j] = c;
       }
     } else {
@@ -2304,7 +2294,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
         if (first_element_checked && i == 0 && j == 0) continue;
         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
-        EmitCharacterFunction* emit_function = nullptr;
         uc16 quark = quarks[j];
         if (elm.atom()->ignore_case()) {
           // Everywhere else we assume that a non-Latin-1 character cannot match
@@ -2312,6 +2301,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
           // invalid by using the Latin1 equivalent instead.
           quark = unibrow::Latin1::TryConvertToLatin1(quark);
         }
+        bool needs_bounds_check =
+            *checked_up_to < cp_offset + j || read_backward();
+        bool bounds_checked = false;
         switch (pass) {
           case NON_LATIN1_MATCH:
             DCHECK(one_byte);
@@ -2321,24 +2313,24 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
             }
             break;
           case NON_LETTER_CHARACTER_MATCH:
-            emit_function = &EmitAtomNonLetter;
+            bounds_checked =
+                EmitAtomNonLetter(isolate, compiler, quark, backtrack,
+                                  cp_offset + j, needs_bounds_check, preloaded);
             break;
           case SIMPLE_CHARACTER_MATCH:
-            emit_function = &EmitSimpleCharacter;
+            bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
+                                                 backtrack, cp_offset + j,
+                                                 needs_bounds_check, preloaded);
             break;
           case CASE_CHARACTER_MATCH:
-            emit_function = &EmitAtomLetter;
+            bounds_checked =
+                EmitAtomLetter(isolate, compiler, quark, backtrack,
+                               cp_offset + j, needs_bounds_check, preloaded);
             break;
           default:
             break;
         }
-        if (emit_function != nullptr) {
-          bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
-          bool bound_checked =
-              emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
-                            bounds_check, preloaded);
-          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
-        }
+        if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
       }
     } else {
       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
@@ -3424,8 +3416,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
 
   DCHECK_EQ(start_reg_ + 1, end_reg_);
   if (IgnoreCase(flags_)) {
-    assembler->CheckNotBackReferenceIgnoreCase(
-        start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
+    assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
+                                               trace->backtrack());
   } else {
     assembler->CheckNotBackReference(start_reg_, read_backward(),
                                      trace->backtrack());
@@ -3597,12 +3589,17 @@ template <typename... Propagators>
 class Analysis : public NodeVisitor {
  public:
   Analysis(Isolate* isolate, bool is_one_byte)
-      : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
+      : isolate_(isolate),
+        is_one_byte_(is_one_byte),
+        error_(RegExpError::kNone) {}
 
   void EnsureAnalyzed(RegExpNode* that) {
     StackLimitCheck check(isolate());
     if (check.HasOverflowed()) {
-      fail("Stack overflow");
+      if (FLAG_correctness_fuzzer_suppressions) {
+        FATAL("Analysis: Aborting on stack overflow");
+      }
+      fail(RegExpError::kAnalysisStackOverflow);
       return;
     }
     if (that->info()->been_analyzed || that->info()->being_analyzed) return;
@@ -3612,12 +3609,12 @@ class Analysis : public NodeVisitor {
     that->info()->been_analyzed = true;
   }
 
-  bool has_failed() { return error_message_ != nullptr; }
-  const char* error_message() {
-    DCHECK(error_message_ != nullptr);
-    return error_message_;
+  bool has_failed() { return error_ != RegExpError::kNone; }
+  RegExpError error() {
+    DCHECK(error_ != RegExpError::kNone);
+    return error_;
   }
-  void fail(const char* error_message) { error_message_ = error_message; }
+  void fail(RegExpError error) { error_ = error; }
 
   Isolate* isolate() const { return isolate_; }
 
@@ -3702,19 +3699,19 @@ class Analysis : public NodeVisitor {
  private:
   Isolate* isolate_;
   bool is_one_byte_;
-  const char* error_message_;
+  RegExpError error_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
 };
 
-const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
+RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
                           RegExpNode* node) {
   Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
                                                                 is_one_byte);
   DCHECK_EQ(node->info()->been_analyzed, false);
   analysis.EnsureAnalyzed(node);
-  DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr);
-  return analysis.has_failed() ? analysis.error_message() : nullptr;
+  DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
+  return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
 }
 
 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
diff --git a/js/src/regexp/regexp-compiler.h b/js/src/regexp/regexp-compiler.h
index 192b3284d6..1954f1a4c4 100644
--- a/js/src/regexp/regexp-compiler.h
+++ b/js/src/regexp/regexp-compiler.h
@@ -422,10 +422,7 @@ struct PreloadState {
 // Analysis performs assertion propagation and computes eats_at_least_ values.
 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
 // details.
-//
-// This method returns nullptr on success or a null-terminated failure message
-// on failure.
-const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
+RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
 
 class FrequencyCollator {
  public:
@@ -502,18 +499,17 @@ class RegExpCompiler {
   }
 
   struct CompilationResult final {
-    explicit CompilationResult(const char* error_message)
-        : error_message(error_message) {}
+    explicit CompilationResult(RegExpError err) : error(err) {}
     CompilationResult(Object code, int registers)
         : code(code), num_registers(registers) {}
 
     static CompilationResult RegExpTooBig() {
-      return CompilationResult("RegExp too big");
+      return CompilationResult(RegExpError::kTooLarge);
     }
 
-    bool Succeeded() const { return error_message == nullptr; }
+    bool Succeeded() const { return error == RegExpError::kNone; }
 
-    const char* const error_message = nullptr;
+    const RegExpError error = RegExpError::kNone;
     Object code;
     int num_registers = 0;
   };
@@ -575,7 +571,7 @@ class RegExpCompiler {
   int next_register_;
   int unicode_lookaround_stack_register_;
   int unicode_lookaround_position_register_;
-  std::vector<RegExpNode*>* work_list_;
+  ZoneVector<RegExpNode*>* work_list_;
   int recursion_depth_;
   RegExpMacroAssembler* macro_assembler_;
   bool one_byte_;
diff --git a/js/src/regexp/regexp-error.cc b/js/src/regexp/regexp-error.cc
new file mode 100644
index 0000000000..3906f9d9ff
--- /dev/null
+++ b/js/src/regexp/regexp-error.cc
@@ -0,0 +1,22 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "regexp/regexp-error.h"
+
+namespace v8 {
+namespace internal {
+
+const char* kRegExpErrorStrings[] = {
+#define TEMPLATE(NAME, STRING) STRING,
+    REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+};
+
+const char* RegExpErrorString(RegExpError error) {
+  DCHECK_LT(error, RegExpError::NumErrors);
+  return kRegExpErrorStrings[static_cast<int>(error)];
+}
+
+}  // namespace internal
+}  // namespace v8
diff --git a/js/src/regexp/regexp-error.h b/js/src/regexp/regexp-error.h
new file mode 100644
index 0000000000..ef9d037dd3
--- /dev/null
+++ b/js/src/regexp/regexp-error.h
@@ -0,0 +1,56 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_ERROR_H_
+#define V8_REGEXP_REGEXP_ERROR_H_
+
+
+namespace v8 {
+namespace internal {
+
+#define REGEXP_ERROR_MESSAGES(T)                                          \
+  T(None, "")                                                             \
+  T(StackOverflow, "Maximum call stack size exceeded")                    \
+  T(AnalysisStackOverflow, "Stack overflow")                              \
+  T(TooLarge, "Regular expression too large")                             \
+  T(UnterminatedGroup, "Unterminated group")                              \
+  T(UnmatchedParen, "Unmatched ')'")                                      \
+  T(EscapeAtEndOfPattern, "\\ at end of pattern")                         \
+  T(InvalidPropertyName, "Invalid property name")                         \
+  T(InvalidEscape, "Invalid escape")                                      \
+  T(InvalidDecimalEscape, "Invalid decimal escape")                       \
+  T(InvalidUnicodeEscape, "Invalid Unicode escape")                       \
+  T(NothingToRepeat, "Nothing to repeat")                                 \
+  T(LoneQuantifierBrackets, "Lone quantifier brackets")                   \
+  T(RangeOutOfOrder, "numbers out of order in {} quantifier")             \
+  T(IncompleteQuantifier, "Incomplete quantifier")                        \
+  T(InvalidQuantifier, "Invalid quantifier")                              \
+  T(InvalidGroup, "Invalid group")                                        \
+  T(MultipleFlagDashes, "Multiple dashes in flag group")                  \
+  T(RepeatedFlag, "Repeated flag in flag group")                          \
+  T(InvalidFlagGroup, "Invalid flag group")                               \
+  T(TooManyCaptures, "Too many captures")                                 \
+  T(InvalidCaptureGroupName, "Invalid capture group name")                \
+  T(DuplicateCaptureGroupName, "Duplicate capture group name")            \
+  T(InvalidNamedReference, "Invalid named reference")                     \
+  T(InvalidNamedCaptureReference, "Invalid named capture referenced")     \
+  T(InvalidClassEscape, "Invalid class escape")                           \
+  T(InvalidClassPropertyName, "Invalid property name in character class") \
+  T(InvalidCharacterClass, "Invalid character class")                     \
+  T(UnterminatedCharacterClass, "Unterminated character class")           \
+  T(OutOfOrderCharacterClass, "Range out of order in character class")
+
+enum class RegExpError : uint32_t {
+#define TEMPLATE(NAME, STRING) k##NAME,
+  REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+      NumErrors
+};
+
+V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_REGEXP_REGEXP_ERROR_H_
diff --git a/js/src/regexp/regexp-interpreter.cc b/js/src/regexp/regexp-interpreter.cc
index 6632cd7296..7735d68855 100644
--- a/js/src/regexp/regexp-interpreter.cc
+++ b/js/src/regexp/regexp-interpreter.cc
@@ -28,18 +28,18 @@ namespace internal {
 namespace {
 
 bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
-                          Vector<const uc16> subject, bool unicode) {
+                          Vector<const uc16> subject) {
   Address offset_a =
       reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
   Address offset_b =
       reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
   size_t length = len * kUC16Size;
-  return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
-             offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
+  return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b,
+                                                          length, isolate) == 1;
 }
 
 bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
-                          Vector<const uint8_t> subject, bool unicode) {
+                          Vector<const uint8_t> subject) {
   // For Latin1 characters the unicode flag makes no difference.
   for (int i = 0; i < len; i++) {
     unsigned int old_char = subject[from++];
@@ -82,11 +82,17 @@ int32_t Load32Aligned(const byte* pc) {
   return *reinterpret_cast<const int32_t*>(pc);
 }
 
-int32_t Load16Aligned(const byte* pc) {
+// TODO(jgruber): Rename to Load16AlignedUnsigned.
+uint32_t Load16Aligned(const byte* pc) {
   DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
   return *reinterpret_cast<const uint16_t*>(pc);
 }
 
+int32_t Load16AlignedSigned(const byte* pc) {
+  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
+  return *reinterpret_cast<const int16_t*>(pc);
+}
+
 // A simple abstraction over the backtracking stack used by the interpreter.
 //
 // Despite the name 'backtracking' stack, it's actually used as a generic stack
@@ -734,26 +740,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
       DISPATCH();
     }
     BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) {
-      int from = registers[insn >> BYTECODE_SHIFT];
-      int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
-      if (from >= 0 && len > 0) {
-        if (current + len > subject.length() ||
-            !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) {
-          SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
-          DISPATCH();
-        }
-        current += len;
-      }
-      ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE);
-      DISPATCH();
+      UNREACHABLE();  // TODO(jgruber): Remove this unused bytecode.
     }
     BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
       int from = registers[insn >> BYTECODE_SHIFT];
       int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
       if (from >= 0 && len > 0) {
         if (current + len > subject.length() ||
-            !BackRefMatchesNoCase(isolate, from, current, len, subject,
-                                  false)) {
+            !BackRefMatchesNoCase(isolate, from, current, len, subject)) {
           SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
           DISPATCH();
         }
@@ -763,27 +757,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
       DISPATCH();
     }
     BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) {
-      int from = registers[insn >> BYTECODE_SHIFT];
-      int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
-      if (from >= 0 && len > 0) {
-        if (current - len < 0 ||
-            !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
-                                  true)) {
-          SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
-          DISPATCH();
-        }
-        current -= len;
-      }
-      ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD);
-      DISPATCH();
+      UNREACHABLE();  // TODO(jgruber): Remove this unused bytecode.
     }
     BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
       int from = registers[insn >> BYTECODE_SHIFT];
       int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
       if (from >= 0 && len > 0) {
         if (current - len < 0 ||
-            !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
-                                  false)) {
+            !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) {
           SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
           DISPATCH();
         }
@@ -828,7 +809,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
     }
     BYTECODE(SKIP_UNTIL_CHAR) {
       int load_offset = (insn >> BYTECODE_SHIFT);
-      uint32_t advance = Load16Aligned(pc + 4);
+      int32_t advance = Load16AlignedSigned(pc + 4);
       uint32_t c = Load16Aligned(pc + 6);
       while (static_cast<uintptr_t>(current + load_offset) <
              static_cast<uintptr_t>(subject.length())) {
@@ -844,7 +825,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
     }
     BYTECODE(SKIP_UNTIL_CHAR_AND) {
       int load_offset = (insn >> BYTECODE_SHIFT);
-      uint16_t advance = Load16Aligned(pc + 4);
+      int32_t advance = Load16AlignedSigned(pc + 4);
       uint16_t c = Load16Aligned(pc + 6);
       uint32_t mask = Load32Aligned(pc + 8);
       int32_t maximum_offset = Load32Aligned(pc + 12);
@@ -862,7 +843,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
     }
     BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) {
       int load_offset = (insn >> BYTECODE_SHIFT);
-      uint16_t advance = Load16Aligned(pc + 4);
+      int32_t advance = Load16AlignedSigned(pc + 4);
       uint16_t c = Load16Aligned(pc + 6);
       int32_t maximum_offset = Load32Aligned(pc + 8);
       while (static_cast<uintptr_t>(current + maximum_offset) <=
@@ -879,7 +860,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
     }
     BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) {
       int load_offset = (insn >> BYTECODE_SHIFT);
-      uint32_t advance = Load16Aligned(pc + 4);
+      int32_t advance = Load16AlignedSigned(pc + 4);
       const byte* table = pc + 8;
       while (static_cast<uintptr_t>(current + load_offset) <
              static_cast<uintptr_t>(subject.length())) {
@@ -895,7 +876,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
     }
     BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) {
       int load_offset = (insn >> BYTECODE_SHIFT);
-      uint16_t advance = Load16Aligned(pc + 4);
+      int32_t advance = Load16AlignedSigned(pc + 4);
       uint16_t limit = Load16Aligned(pc + 6);
       const byte* table = pc + 8;
       while (static_cast<uintptr_t>(current + load_offset) <
@@ -916,7 +897,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
     }
     BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) {
       int load_offset = (insn >> BYTECODE_SHIFT);
-      uint32_t advance = Load32Aligned(pc + 4);
+      int32_t advance = Load32Aligned(pc + 4);
       uint16_t c = Load16Aligned(pc + 8);
       uint16_t c2 = Load16Aligned(pc + 10);
       while (static_cast<uintptr_t>(current + load_offset) <
@@ -1016,6 +997,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
   }
 }
 
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
 // This method is called through an external reference from RegExpExecInternal
 // builtin.
 IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
@@ -1042,6 +1025,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
                start_position, call_origin);
 }
 
+#endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
 IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime(
     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
     int* registers, int registers_length, int start_position) {
diff --git a/js/src/regexp/regexp-macro-assembler-tracer.cc b/js/src/regexp/regexp-macro-assembler-tracer.cc
index 331c57d1ae..b71a0f48e9 100644
--- a/js/src/regexp/regexp-macro-assembler-tracer.cc
+++ b/js/src/regexp/regexp-macro-assembler-tracer.cc
@@ -349,17 +349,15 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
   assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match);
 }
 
-
 void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
-  PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
+    int start_reg, bool read_backward, Label* on_no_match) {
+  PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
          start_reg, read_backward ? "backward" : "forward",
-         unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
-  assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
+         LabelToInt(on_no_match));
+  assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
                                               on_no_match);
 }
 
-
 void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
                                                Label* on_outside_input) {
   PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset,
diff --git a/js/src/regexp/regexp-macro-assembler-tracer.h b/js/src/regexp/regexp-macro-assembler-tracer.h
index 938f84796d..5332e59b89 100644
--- a/js/src/regexp/regexp-macro-assembler-tracer.h
+++ b/js/src/regexp/regexp-macro-assembler-tracer.h
@@ -33,7 +33,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
   void CheckNotBackReference(int start_reg, bool read_backward,
                              Label* on_no_match) override;
   void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
-                                       bool unicode,
                                        Label* on_no_match) override;
   void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
   void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
diff --git a/js/src/regexp/regexp-macro-assembler.cc b/js/src/regexp/regexp-macro-assembler.cc
index 4a8dcd3ce8..7f8de25437 100644
--- a/js/src/regexp/regexp-macro-assembler.cc
+++ b/js/src/regexp/regexp-macro-assembler.cc
@@ -110,34 +110,7 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() {
   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
 }
 
-const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
-    String subject, int start_index, const DisallowHeapAllocation& no_gc) {
-  if (subject.IsConsString()) {
-    subject = ConsString::cast(subject).first();
-  } else if (subject.IsSlicedString()) {
-    start_index += SlicedString::cast(subject).offset();
-    subject = SlicedString::cast(subject).parent();
-  }
-  if (subject.IsThinString()) {
-    subject = ThinString::cast(subject).actual();
-  }
-  DCHECK_LE(0, start_index);
-  DCHECK_LE(start_index, subject.length());
-  if (subject.IsSeqOneByteString()) {
-    return reinterpret_cast<const byte*>(
-        SeqOneByteString::cast(subject).GetChars(no_gc) + start_index);
-  } else if (subject.IsSeqTwoByteString()) {
-    return reinterpret_cast<const byte*>(
-        SeqTwoByteString::cast(subject).GetChars(no_gc) + start_index);
-  } else if (subject.IsExternalOneByteString()) {
-    return reinterpret_cast<const byte*>(
-        ExternalOneByteString::cast(subject).GetChars() + start_index);
-  } else {
-    DCHECK(subject.IsExternalTwoByteString());
-    return reinterpret_cast<const byte*>(
-        ExternalTwoByteString::cast(subject).GetChars() + start_index);
-  }
-}
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
 
 // This method may only be called after an interrupt.
 int NativeRegExpMacroAssembler::CheckStackGuardState(
@@ -145,9 +118,10 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
     Address* return_address, Code re_code, Address* subject,
     const byte** input_start, const byte** input_end) {
   DisallowHeapAllocation no_gc;
+  Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
+  DCHECK_LE(re_code.raw_instruction_start(), old_pc);
+  DCHECK_LE(old_pc, re_code.raw_instruction_end());
 
-  DCHECK(re_code.raw_instruction_start() <= *return_address);
-  DCHECK(*return_address <= re_code.raw_instruction_end());
   StackLimitCheck check(isolate);
   bool js_has_overflowed = check.JsHasOverflowed();
 
@@ -189,9 +163,11 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
   }
 
   if (*code_handle != re_code) {  // Return address no longer valid
-    intptr_t delta = code_handle->address() - re_code.address();
     // Overwrite the return address on the stack.
-    *return_address += delta;
+    intptr_t delta = code_handle->address() - re_code.address();
+    Address new_pc = old_pc + delta;
+    // TODO(v8:10026): avoid replacing a signed pointer.
+    PointerAuthentication::ReplacePC(return_address, new_pc, 0);
   }
 
   // If we continue, we need to update the subject string addresses.
@@ -206,8 +182,7 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
     } else {
       *subject = subject_handle->ptr();
       intptr_t byte_length = *input_end - *input_start;
-      *input_start =
-          StringCharacterPosition(*subject_handle, start_index, no_gc);
+      *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
       *input_end = *input_start + byte_length;
     }
   }
@@ -255,7 +230,7 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
 
   DisallowHeapAllocation no_gc;
   const byte* input_start =
-      StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
+      subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
   int byte_length = char_length << char_size_shift;
   const byte* input_end = input_start + byte_length;
   return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
@@ -301,6 +276,8 @@ int NativeRegExpMacroAssembler::Execute(
   return result;
 }
 
+#endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
 // clang-format off
 const byte NativeRegExpMacroAssembler::word_character_map[] = {
     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
diff --git a/js/src/regexp/regexp-macro-assembler.h b/js/src/regexp/regexp-macro-assembler.h
index dd059a43d0..ef3961a70a 100644
--- a/js/src/regexp/regexp-macro-assembler.h
+++ b/js/src/regexp/regexp-macro-assembler.h
@@ -87,7 +87,7 @@ class RegExpMacroAssembler {
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match) = 0;
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward, bool unicode,
+                                               bool read_backward,
                                                Label* on_no_match) = 0;
   // Check the current character for a match with a literal character.  If we
   // fail to match then goto the on_failure label.  End of input always
@@ -122,6 +122,11 @@ class RegExpMacroAssembler {
   // not have custom support.
   // May clobber the current loaded character.
   virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
+
+  // Control-flow integrity:
+  // Define a jump target and bind a label.
+  virtual void BindJumpTarget(Label* label) { Bind(label); }
+
   virtual void Fail() = 0;
   virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
   virtual void GoTo(Label* label) = 0;
@@ -246,9 +251,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
   static Address GrowStack(Address stack_pointer, Address* stack_top,
                            Isolate* isolate);
 
-  static const byte* StringCharacterPosition(
-      String subject, int start_index, const DisallowHeapAllocation& no_gc);
-
   static int CheckStackGuardState(Isolate* isolate, int start_index,
                                   RegExp::CallOrigin call_origin,
                                   Address* return_address, Code re_code,
diff --git a/js/src/regexp/regexp-parser.cc b/js/src/regexp/regexp-parser.cc
index 377b942477..e2bbb6ed03 100644
--- a/js/src/regexp/regexp-parser.cc
+++ b/js/src/regexp/regexp-parser.cc
@@ -17,11 +17,10 @@
 namespace v8 {
 namespace internal {
 
-RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
-                           JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
+RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
+                           Isolate* isolate, Zone* zone)
     : isolate_(isolate),
       zone_(zone),
-      error_(error),
       captures_(nullptr),
       named_captures_(nullptr),
       named_back_references_(nullptr),
@@ -74,13 +73,12 @@ void RegExpParser::Advance() {
       if (FLAG_correctness_fuzzer_suppressions) {
         FATAL("Aborting on stack overflow");
       }
-      ReportError(CStrVector(
-          MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
+      ReportError(RegExpError::kStackOverflow);
     } else if (zone()->excess_allocation()) {
       if (FLAG_correctness_fuzzer_suppressions) {
         FATAL("Aborting on excess zone allocation");
       }
-      ReportError(CStrVector("Regular expression too large"));
+      ReportError(RegExpError::kTooLarge);
     } else {
       current_ = ReadNext<true>();
     }
@@ -132,15 +130,12 @@ bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
   return false;
 }
 
-
-RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
+RegExpTree* RegExpParser::ReportError(RegExpError error) {
   if (failed_) return nullptr;  // Do not overwrite any existing error.
   failed_ = true;
-  *error_ = isolate()
-                ->factory()
-                ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
-                .ToHandleChecked();
-  // Zip to the end to make sure the no more input is read.
+  error_ = error;
+  error_pos_ = position();
+  // Zip to the end to make sure no more input is read.
   current_ = kEndMarker;
   next_pos_ = in()->length();
   return nullptr;
@@ -187,14 +182,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case kEndMarker:
         if (state->IsSubexpression()) {
           // Inside a parenthesized group when hitting end of input.
-          return ReportError(CStrVector("Unterminated group"));
+          return ReportError(RegExpError::kUnterminatedGroup);
         }
         DCHECK_EQ(INITIAL, state->group_type());
         // Parsing completed successfully.
         return builder->ToRegExp();
       case ')': {
         if (!state->IsSubexpression()) {
-          return ReportError(CStrVector("Unmatched ')'"));
+          return ReportError(RegExpError::kUnmatchedParen);
         }
         DCHECK_NE(INITIAL, state->group_type());
 
@@ -245,7 +240,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '*':
       case '+':
       case '?':
-        return ReportError(CStrVector("Nothing to repeat"));
+        return ReportError(RegExpError::kNothingToRepeat);
       case '^': {
         Advance();
         if (builder->multiline()) {
@@ -300,7 +295,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '\\':
         switch (Next()) {
           case kEndMarker:
-            return ReportError(CStrVector("\\ at end of pattern"));
+            return ReportError(RegExpError::kEscapeAtEndOfPattern);
           case 'b':
             Advance(2);
             builder->AddAssertion(new (zone()) RegExpAssertion(
@@ -340,7 +335,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
             if (unicode()) {
               ZoneList<CharacterRange>* ranges =
                   new (zone()) ZoneList<CharacterRange>(2, zone());
-              std::vector<char> name_1, name_2;
+              ZoneVector<char> name_1(zone());
+              ZoneVector<char> name_2(zone());
               if (ParsePropertyClassName(&name_1, &name_2)) {
                 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
                   RegExpCharacterClass* cc = new (zone())
@@ -356,7 +352,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
                   }
                 }
               }
-              return ReportError(CStrVector("Invalid property name"));
+              return ReportError(RegExpError::kInvalidPropertyName);
             } else {
               builder->AddCharacter(p);
             }
@@ -392,7 +388,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
             // With /u, no identity escapes except for syntax characters
             // are allowed. Otherwise, all identity escapes are allowed.
             if (unicode()) {
-              return ReportError(CStrVector("Invalid escape"));
+              return ReportError(RegExpError::kInvalidEscape);
             }
             uc32 first_digit = Next();
             if (first_digit == '8' || first_digit == '9') {
@@ -406,7 +402,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
             Advance();
             if (unicode() && Next() >= '0' && Next() <= '9') {
               // With /u, decimal escape with leading 0 are not parsed as octal.
-              return ReportError(CStrVector("Invalid decimal escape"));
+              return ReportError(RegExpError::kInvalidDecimalEscape);
             }
             uc32 octal = ParseOctalLiteral();
             builder->AddCharacter(octal);
@@ -447,7 +443,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               // ES#prod-annexB-ExtendedPatternCharacter
               if (unicode()) {
                 // With /u, invalid escapes are not treated as identity escapes.
-                return ReportError(CStrVector("Invalid unicode escape"));
+                return ReportError(RegExpError::kInvalidUnicodeEscape);
               }
               builder->AddCharacter('\\');
             } else {
@@ -465,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               builder->AddCharacter('x');
             } else {
               // With /u, invalid escapes are not treated as identity escapes.
-              return ReportError(CStrVector("Invalid escape"));
+              return ReportError(RegExpError::kInvalidEscape);
             }
             break;
           }
@@ -478,7 +474,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               builder->AddCharacter('u');
             } else {
               // With /u, invalid escapes are not treated as identity escapes.
-              return ReportError(CStrVector("Invalid Unicode escape"));
+              return ReportError(RegExpError::kInvalidUnicodeEscape);
             }
             break;
           }
@@ -502,7 +498,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               builder->AddCharacter(current());
               Advance();
             } else {
-              return ReportError(CStrVector("Invalid escape"));
+              return ReportError(RegExpError::kInvalidEscape);
             }
             break;
         }
@@ -510,13 +506,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '{': {
         int dummy;
         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
-        if (parsed) return ReportError(CStrVector("Nothing to repeat"));
+        if (parsed) return ReportError(RegExpError::kNothingToRepeat);
         V8_FALLTHROUGH;
       }
       case '}':
       case ']':
         if (unicode()) {
-          return ReportError(CStrVector("Lone quantifier brackets"));
+          return ReportError(RegExpError::kLoneQuantifierBrackets);
         }
         V8_FALLTHROUGH;
       default:
@@ -551,13 +547,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '{':
         if (ParseIntervalQuantifier(&min, &max)) {
           if (max < min) {
-            return ReportError(
-                CStrVector("numbers out of order in {} quantifier"));
+            return ReportError(RegExpError::kRangeOutOfOrder);
           }
           break;
         } else if (unicode()) {
           // With /u, incomplete quantifiers are not allowed.
-          return ReportError(CStrVector("Incomplete quantifier"));
+          return ReportError(RegExpError::kIncompleteQuantifier);
         }
         continue;
       default:
@@ -573,7 +568,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       Advance();
     }
     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
-      return ReportError(CStrVector("Invalid quantifier"));
+      return ReportError(RegExpError::kInvalidQuantifier);
     }
   }
 }
@@ -608,7 +603,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
       case 's':
       case 'm': {
         if (!FLAG_regexp_mode_modifiers) {
-          ReportError(CStrVector("Invalid group"));
+          ReportError(RegExpError::kInvalidGroup);
           return nullptr;
         }
         Advance();
@@ -617,7 +612,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
           switch (current()) {
             case '-':
               if (!flags_sense) {
-                ReportError(CStrVector("Multiple dashes in flag group"));
+                ReportError(RegExpError::kMultipleFlagDashes);
                 return nullptr;
               }
               flags_sense = false;
@@ -631,7 +626,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
               if (current() == 'm') bit = JSRegExp::kMultiline;
               if (current() == 's') bit = JSRegExp::kDotAll;
               if (((switch_on | switch_off) & bit) != 0) {
-                ReportError(CStrVector("Repeated flag in flag group"));
+                ReportError(RegExpError::kRepeatedFlag);
                 return nullptr;
               }
               if (flags_sense) {
@@ -659,7 +654,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
               subexpr_type = GROUPING;  // Will break us out of the outer loop.
               continue;
             default:
-              ReportError(CStrVector("Invalid flag group"));
+              ReportError(RegExpError::kInvalidFlagGroup);
               return nullptr;
           }
         }
@@ -683,13 +678,13 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
         Advance();
         break;
       default:
-        ReportError(CStrVector("Invalid group"));
+        ReportError(RegExpError::kInvalidGroup);
         return nullptr;
     }
   }
   if (subexpr_type == CAPTURE) {
     if (captures_started_ >= JSRegExp::kMaxCaptures) {
-      ReportError(CStrVector("Too many captures"));
+      ReportError(RegExpError::kTooManyCaptures);
       return nullptr;
     }
     captures_started_++;
@@ -838,20 +833,20 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
     if (c == '\\' && current() == 'u') {
       Advance();
       if (!ParseUnicodeEscape(&c)) {
-        ReportError(CStrVector("Invalid Unicode escape sequence"));
+        ReportError(RegExpError::kInvalidUnicodeEscape);
         return nullptr;
       }
     }
 
     // The backslash char is misclassified as both ID_Start and ID_Continue.
     if (c == '\\') {
-      ReportError(CStrVector("Invalid capture group name"));
+      ReportError(RegExpError::kInvalidCaptureGroupName);
       return nullptr;
     }
 
     if (at_start) {
       if (!IsIdentifierStart(c)) {
-        ReportError(CStrVector("Invalid capture group name"));
+        ReportError(RegExpError::kInvalidCaptureGroupName);
         return nullptr;
       }
       push_code_unit(name, c);
@@ -862,7 +857,7 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
       } else if (IsIdentifierPart(c)) {
         push_code_unit(name, c);
       } else {
-        ReportError(CStrVector("Invalid capture group name"));
+        ReportError(RegExpError::kInvalidCaptureGroupName);
         return nullptr;
       }
     }
@@ -889,7 +884,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
 
     const auto& named_capture_it = named_captures_->find(capture);
     if (named_capture_it != named_captures_->end()) {
-      ReportError(CStrVector("Duplicate capture group name"));
+      ReportError(RegExpError::kDuplicateCaptureGroupName);
       return false;
     }
   }
@@ -903,7 +898,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
                                            RegExpParserState* state) {
   // The parser is assumed to be on the '<' in \k<name>.
   if (current() != '<') {
-    ReportError(CStrVector("Invalid named reference"));
+    ReportError(RegExpError::kInvalidNamedReference);
     return false;
   }
 
@@ -936,7 +931,7 @@ void RegExpParser::PatchNamedBackReferences() {
   if (named_back_references_ == nullptr) return;
 
   if (named_captures_ == nullptr) {
-    ReportError(CStrVector("Invalid named capture referenced"));
+    ReportError(RegExpError::kInvalidNamedCaptureReference);
     return;
   }
 
@@ -957,7 +952,7 @@ void RegExpParser::PatchNamedBackReferences() {
     if (capture_it != named_captures_->end()) {
       index = (*capture_it)->index();
     } else {
-      ReportError(CStrVector("Invalid named capture referenced"));
+      ReportError(RegExpError::kInvalidNamedCaptureReference);
       return;
     }
 
@@ -1378,8 +1373,8 @@ bool IsUnicodePropertyValueCharacter(char c) {
 
 }  // anonymous namespace
 
-bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
-                                          std::vector<char>* name_2) {
+bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
+                                          ZoneVector<char>* name_2) {
   DCHECK(name_1->empty());
   DCHECK(name_2->empty());
   // Parse the property class as follows:
@@ -1418,8 +1413,8 @@ bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
 
 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
                                          bool negate,
-                                         const std::vector<char>& name_1,
-                                         const std::vector<char>& name_2) {
+                                         const ZoneVector<char>& name_1,
+                                         const ZoneVector<char>& name_2) {
   if (name_2.empty()) {
     // First attempt to interpret as general category property value name.
     const char* name = name_1.data();
@@ -1456,7 +1451,7 @@ bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
   }
 }
 
-RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
+RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
   if (!FLAG_harmony_regexp_sequence) return nullptr;
   const char* name = name_1.data();
   const uc32* sequence_list = nullptr;
@@ -1522,19 +1517,19 @@ RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
 
 #else  // V8_INTL_SUPPORT
 
-bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
-                                          std::vector<char>* name_2) {
+bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
+                                          ZoneVector<char>* name_2) {
   return false;
 }
 
 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
                                          bool negate,
-                                         const std::vector<char>& name_1,
-                                         const std::vector<char>& name_2) {
+                                         const ZoneVector<char>& name_1,
+                                         const ZoneVector<char>& name_2) {
   return false;
 }
 
-RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
+RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
   return nullptr;
 }
 
@@ -1598,7 +1593,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
       }
       if (unicode()) {
         // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(CStrVector("Invalid class escape"));
+        ReportError(RegExpError::kInvalidClassEscape);
         return 0;
       }
       if ((controlLetter >= '0' && controlLetter <= '9') ||
@@ -1631,7 +1626,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
       // ES#prod-annexB-LegacyOctalEscapeSequence
       if (unicode()) {
         // With /u, decimal escape is not interpreted as octal character code.
-        ReportError(CStrVector("Invalid class escape"));
+        ReportError(RegExpError::kInvalidClassEscape);
         return 0;
       }
       return ParseOctalLiteral();
@@ -1641,7 +1636,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
       if (ParseHexEscape(2, &value)) return value;
       if (unicode()) {
         // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(CStrVector("Invalid escape"));
+        ReportError(RegExpError::kInvalidEscape);
         return 0;
       }
       // If \x is not followed by a two-digit hexadecimal, treat it
@@ -1654,7 +1649,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
       if (ParseUnicodeEscape(&value)) return value;
       if (unicode()) {
         // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(CStrVector("Invalid unicode escape"));
+        ReportError(RegExpError::kInvalidUnicodeEscape);
         return 0;
       }
       // If \u is not followed by a two-digit hexadecimal, treat it
@@ -1669,11 +1664,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
         Advance();
         return result;
       }
-      ReportError(CStrVector("Invalid escape"));
+      ReportError(RegExpError::kInvalidEscape);
       return 0;
     }
   }
-  return 0;
+  UNREACHABLE();
 }
 
 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
@@ -1696,17 +1691,18 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
         return;
       }
       case kEndMarker:
-        ReportError(CStrVector("\\ at end of pattern"));
+        ReportError(RegExpError::kEscapeAtEndOfPattern);
         return;
       case 'p':
       case 'P':
         if (unicode()) {
           bool negate = Next() == 'P';
           Advance(2);
-          std::vector<char> name_1, name_2;
+          ZoneVector<char> name_1(zone);
+          ZoneVector<char> name_2(zone);
           if (!ParsePropertyClassName(&name_1, &name_2) ||
               !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
-            ReportError(CStrVector("Invalid property name in character class"));
+            ReportError(RegExpError::kInvalidClassPropertyName);
           }
           *is_class_escape = true;
           return;
@@ -1725,10 +1721,6 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
 }
 
 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
-  static const char* kUnterminated = "Unterminated character class";
-  static const char* kRangeInvalid = "Invalid character class";
-  static const char* kRangeOutOfOrder = "Range out of order in character class";
-
   DCHECK_EQ(current(), '[');
   Advance();
   bool is_negated = false;
@@ -1761,7 +1753,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
         // Either end is an escaped character class. Treat the '-' verbatim.
         if (unicode()) {
           // ES2015 21.2.2.15.1 step 1.
-          return ReportError(CStrVector(kRangeInvalid));
+          return ReportError(RegExpError::kInvalidCharacterClass);
         }
         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
         ranges->Add(CharacterRange::Singleton('-'), zone());
@@ -1770,7 +1762,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
       }
       // ES2015 21.2.2.15.1 step 6.
       if (char_1 > char_2) {
-        return ReportError(CStrVector(kRangeOutOfOrder));
+        return ReportError(RegExpError::kOutOfOrderCharacterClass);
       }
       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
     } else {
@@ -1778,7 +1770,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
     }
   }
   if (!has_more()) {
-    return ReportError(CStrVector(kUnterminated));
+    return ReportError(RegExpError::kUnterminatedCharacterClass);
   }
   Advance();
   RegExpCharacterClass::CharacterClassFlags character_class_flags;
@@ -1795,14 +1787,16 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
                                FlatStringReader* input, JSRegExp::Flags flags,
                                RegExpCompileData* result) {
   DCHECK(result != nullptr);
-  RegExpParser parser(input, &result->error, flags, isolate, zone);
+  RegExpParser parser(input, flags, isolate, zone);
   RegExpTree* tree = parser.ParsePattern();
   if (parser.failed()) {
     DCHECK(tree == nullptr);
-    DCHECK(!result->error.is_null());
+    DCHECK(parser.error_ != RegExpError::kNone);
+    result->error = parser.error_;
+    result->error_pos = parser.error_pos_;
   } else {
     DCHECK(tree != nullptr);
-    DCHECK(result->error.is_null());
+    DCHECK(parser.error_ == RegExpError::kNone);
     if (FLAG_trace_regexp_parser) {
       StdoutStream os;
       tree->Print(os, zone);
diff --git a/js/src/regexp/regexp-parser.h b/js/src/regexp/regexp-parser.h
index 91677d6c35..131d12161f 100644
--- a/js/src/regexp/regexp-parser.h
+++ b/js/src/regexp/regexp-parser.h
@@ -6,6 +6,7 @@
 #define V8_REGEXP_REGEXP_PARSER_H_
 
 #include "regexp/regexp-ast.h"
+#include "regexp/regexp-error.h"
 
 namespace v8 {
 namespace internal {
@@ -150,8 +151,8 @@ class RegExpBuilder : public ZoneObject {
 
 class V8_EXPORT_PRIVATE RegExpParser {
  public:
-  RegExpParser(FlatStringReader* in, Handle<String>* error,
-               JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
+  RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
+               Zone* zone);
 
   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
                           JSRegExp::Flags flags, RegExpCompileData* result);
@@ -174,13 +175,13 @@ class V8_EXPORT_PRIVATE RegExpParser {
   bool ParseUnicodeEscape(uc32* value);
   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
 
-  bool ParsePropertyClassName(std::vector<char>* name_1,
-                              std::vector<char>* name_2);
+  bool ParsePropertyClassName(ZoneVector<char>* name_1,
+                              ZoneVector<char>* name_2);
   bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
-                             const std::vector<char>& name_1,
-                             const std::vector<char>& name_2);
+                             const ZoneVector<char>& name_1,
+                             const ZoneVector<char>& name_2);
 
-  RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
+  RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
   RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
 
   uc32 ParseOctalLiteral();
@@ -199,7 +200,7 @@ class V8_EXPORT_PRIVATE RegExpParser {
 
   char ParseClassEscape();
 
-  RegExpTree* ReportError(Vector<const char> message);
+  RegExpTree* ReportError(RegExpError error);
   void Advance();
   void Advance(int dist);
   void Reset(int pos);
@@ -332,7 +333,8 @@ class V8_EXPORT_PRIVATE RegExpParser {
 
   Isolate* isolate_;
   Zone* zone_;
-  Handle<String>* error_;
+  RegExpError error_ = RegExpError::kNone;
+  int error_pos_ = 0;
   ZoneList<RegExpCapture*>* captures_;
   ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
   ZoneList<RegExpBackReference*>* named_back_references_;
diff --git a/js/src/regexp/regexp-shim.h b/js/src/regexp/regexp-shim.h
index 38b0357272..462e396f40 100644
--- a/js/src/regexp/regexp-shim.h
+++ b/js/src/regexp/regexp-shim.h
@@ -60,6 +60,7 @@ class RegExpStack;
 #define DCHECK_NOT_NULL(val) MOZ_ASSERT((val) != nullptr)
 #define DCHECK_IMPLIES(lhs, rhs) MOZ_ASSERT_IF(lhs, rhs)
 #define CHECK MOZ_RELEASE_ASSERT
+#define CHECK_LE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) <= (rhs))
 
 template <class T>
 static constexpr inline T Min(T t1, T t2) {
@@ -1009,7 +1010,7 @@ private:
 
 public:
   // An empty stub for telemetry we don't support
-  void IncreaseTotalRegexpCodeGenerated(int size) {}
+  void IncreaseTotalRegexpCodeGenerated(Handle<HeapObject> code) {}
 
   Counters* counters() { return &counters_; }
 
@@ -1155,6 +1156,7 @@ extern bool FLAG_trace_regexp_parser;
 extern bool FLAG_trace_regexp_peephole_optimization;
 
 #define V8_USE_COMPUTED_GOTO 1
+#define COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
 
 }  // namespace internal
 }  // namespace v8
diff --git a/js/src/regexp/regexp-stack.h b/js/src/regexp/regexp-stack.h
index 812195ad12..0b452c0059 100644
--- a/js/src/regexp/regexp-stack.h
+++ b/js/src/regexp/regexp-stack.h
@@ -36,6 +36,9 @@ class RegExpStackScope {
 
 class RegExpStack {
  public:
+  RegExpStack();
+  ~RegExpStack();
+
   // Number of allocated locations on the stack below the limit.
   // No sequence of pushes must be longer that this without doing a stack-limit
   // check.
@@ -75,9 +78,6 @@ class RegExpStack {
   static constexpr size_t kMaximumStackSize = 64 * MB;
 
  private:
-  RegExpStack();
-  ~RegExpStack();
-
   // Artificial limit used when the thread-local state has been destroyed.
   static const Address kMemoryTop =
       static_cast<Address>(static_cast<uintptr_t>(-1));
diff --git a/js/src/regexp/regexp.h b/js/src/regexp/regexp.h
index cce58da384..a36662b78a 100644
--- a/js/src/regexp/regexp.h
+++ b/js/src/regexp/regexp.h
@@ -5,6 +5,7 @@
 #ifndef V8_REGEXP_REGEXP_H_
 #define V8_REGEXP_REGEXP_H_
 
+#include "regexp/regexp-error.h"
 #include "regexp/regexp-shim.h"
 
 namespace v8 {
@@ -42,7 +43,11 @@ struct RegExpCompileData {
 
   // The error message. Only used if an error occurred during parsing or
   // compilation.
-  Handle<String> error;
+  RegExpError error = RegExpError::kNone;
+
+  // The position at which the error was detected. Only used if an
+  // error occurred.
+  int error_pos = 0;
 
   // The number of capture groups, without the global capture \0.
   int capture_count = 0;
diff --git a/js/src/regexp/special-case.cc b/js/src/regexp/special-case.cc
index d60b987645..6b12d28d7d 100644
--- a/js/src/regexp/special-case.cc
+++ b/js/src/regexp/special-case.cc
@@ -1,10 +1,15 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that
+// can be found in the LICENSE file.
 
 // Automatically generated by regexp/gen-regexp-special-case.cc
-// The following functions are used to build icu::UnicodeSet
-// for specical cases different between Unicode and ECMA262.
+
+// The following functions are used to build UnicodeSets
+// for special cases where the case-folding algorithm used by
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match
+// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime
+// Semantics: Canonicalize) step 3.
+
 #ifdef V8_INTL_SUPPORT
 #include "regexp/special-case.h"
 
@@ -14,14 +19,46 @@ namespace internal {
 
 icu::UnicodeSet BuildIgnoreSet() {
   icu::UnicodeSet set;
+  set.add(0xdf);
+  set.add(0x17f);
+  set.add(0x390);
+  set.add(0x3b0);
   set.add(0x3f4);
+  set.add(0x1e9e);
+  set.add(0x1f80, 0x1faf);
+  set.add(0x1fb3);
+  set.add(0x1fbc);
+  set.add(0x1fc3);
+  set.add(0x1fcc);
+  set.add(0x1fd3);
+  set.add(0x1fe3);
+  set.add(0x1ff3);
+  set.add(0x1ffc);
   set.add(0x2126);
   set.add(0x212a, 0x212b);
+  set.add(0xfb05, 0xfb06);
   set.freeze();
   return set;
 }
+
+struct IgnoreSetData {
+  IgnoreSetData() : set(BuildIgnoreSet()) {}
+  const icu::UnicodeSet set;
+};
+
+//static
+const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() {
+  static base::LazyInstance<IgnoreSetData>::type set =
+      LAZY_INSTANCE_INITIALIZER;
+  return set.Pointer()->set;
+}
+
 icu::UnicodeSet BuildSpecialAddSet() {
   icu::UnicodeSet set;
+  set.add(0x4b);
+  set.add(0x53);
+  set.add(0x6b);
+  set.add(0x73);
   set.add(0xc5);
   set.add(0xe5);
   set.add(0x398);
@@ -33,6 +70,19 @@ icu::UnicodeSet BuildSpecialAddSet() {
   return set;
 }
 
+struct SpecialAddSetData {
+  SpecialAddSetData() : set(BuildSpecialAddSet()) {}
+  const icu::UnicodeSet set;
+};
+
+//static
+const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() {
+  static base::LazyInstance<SpecialAddSetData>::type set =
+      LAZY_INSTANCE_INITIALIZER;
+  return set.Pointer()->set;
+}
+
+
 }  // namespace internal
 }  // namespace v8
 #endif  // V8_INTL_SUPPORT
diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h
index 1ccec5d31a..3aca983028 100644
--- a/js/src/regexp/special-case.h
+++ b/js/src/regexp/special-case.h
@@ -6,70 +6,108 @@
 #define V8_REGEXP_SPECIAL_CASE_H_
 
 #ifdef V8_INTL_SUPPORT
-#include "unicode/uversion.h"
-namespace U_ICU_NAMESPACE {
-class UnicodeSet;
-}  //  namespace U_ICU_NAMESPACE
+#include "regexp/regexp-shim.h"
+
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
 
 namespace v8 {
 namespace internal {
 
-// Functions to build special sets of Unicode characters that need special
-// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
+// Sets of Unicode characters that need special handling under "i" mode
+
+// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
+// defines slightly different case-folding rules than Unicode. An
+// input character should match a pattern character if the result of
+// the Canonicalize algorithm is the same for both characters.
 //
-// For the characters in the "ignore set", the process should not treat other
-// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
-// equivlant under the ECMA262 RegExp "i" mode because these characters are
-// uppercase themselves that no other characters in the set uppercase to.
+// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
+// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
+// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
+// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
+// the precise definition.
 //
-// For the characters in the "special add set", the proecess should add only
-// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
-// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode
-// and also that ONE uppercase character that other non uppercase character
-// uppercase into to the set. Other uppercase characters in the result of
-// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
-// RegExp "i" mode consider two characters as "case equivlant" if both
-// characters uppercase to the same character.
+// While compiling such regular expressions, we need to compute the
+// set of characters that should match a given input character. (See
+// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
+// For almost all characters, this can be efficiently computed using
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
+// the remaining special cases.
 //
-// For example, consider the following case equivalent set defined by Unicode
-// standard. Notice there are more than one uppercase characters in this set:
-//  U+212B Å Angstrom Sign - an uppercase character.
-//  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
-//  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
-//    uppercase to U+00C5.
-// In this case equivlant set is a special set and need special handling while
-// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
-// different than Unicode Standard:
-//  * U+212B should be included into the "ignore" set because there are no other
-//    characters, under the ECMA262 "i" mode, are considered as "case equivlant"
-//    to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
-//    uppercase to U+212B.
-//  * U+00C5 and U+00E5 will both be included into the "special add" set. While
-//    calculate the "equivlant set" under ECMA262 "i" mode, the process will
-//    add U+00E5, because it is not an uppercase character in the set. The
-//    process will also add U+00C5, because it is the uppercase character which
-//    other non uppercase character, U+00C5, uppercase into.
+// For a character c, the rules are as follows:
 //
-// For characters not included in "ignore set" and "special add set", the
-// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is
-// much faster.
+// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
+//    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
+//    containing c will produce the set of characters that should
+//    match /c/i (or /[c]/i), and only those characters.
 //
-// Under Unicode 12.0, there are only 7 characters in the "special add set" and
-// 4 characters in "ignore set" so even the special add process is slower, it is
-// limited to a small set of cases only.
+// 2. If c is in IgnoreSet, then the only character it should match is
+//    itself. However, closeOver will add additional incorrect
+//    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
+//    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
+//    "SS".  Step 3.e therefore requires that 'ß' canonicalizes to
+//    itself, and should not match 'ẞ'. In these cases, we can skip
+//    the closeOver entirely, because it will never add an equivalent
+//    character.
 //
-// The implementation of these two function will be generated by calling ICU
-// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
-// the code in src/regexp/gen-regexp-special-case.cc.
+// 3. If c is in SpecialAddSet, then it should match at least one
+//    character other than itself. However, closeOver will add at
+//    least one additional incorrect match. For example, consider the
+//    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
+//    K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
+//    SIGN should not match either of the other two characters. As a
+//    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
+//    IgnoreSet). To find the correct matches for characters in
+//    SpecialAddSet, we closeOver the original character, but filter
+//    out the results that do not have the same canonical value.
 //
-// These two function will be used with LazyInstance<> template to generate
-// global sharable set to reduce memory usage and speed up performance.
+// The contents of these sets are calculated at build time by
+// src/regexp/gen-regexp-special-case.cc, which generates
+// gen/src/regexp/special-case.cc. This is done by iterating over the
+// result of closeOver for each BMP character, and finding sets for
+// which at least one character has a different canonical value than
+// another character. Characters that match no other characters in
+// their equivalence class are added to IgnoreSet. Characters that
+// match at least one other character are added to SpecialAddSet.
+
+class RegExpCaseFolding final : public AllStatic {
+ public:
+  static const icu::UnicodeSet& IgnoreSet();
+  static const icu::UnicodeSet& SpecialAddSet();
+
+  // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
+  // Canonicalize) step 3, which is used to determine whether
+  // characters match when ignoreCase is true and unicode is false.
+  static UChar32 Canonicalize(UChar32 ch) {
+    // a. Assert: ch is a UTF-16 code unit.
+    CHECK_LE(ch, 0xffff);
+
+    // b. Let s be the String value consisting of the single code unit ch.
+    icu::UnicodeString s(ch);
+
+    // c. Let u be the same result produced as if by performing the algorithm
+    // for String.prototype.toUpperCase using s as the this value.
+    // d. Assert: Type(u) is String.
+    icu::UnicodeString& u = s.toUpper();
+
+    // e. If u does not consist of a single code unit, return ch.
+    if (u.length() != 1) {
+      return ch;
+    }
+
+    // f. Let cu be u's single code unit element.
+    UChar32 cu = u.char32At(0);
 
-// Function to build and return the Ignore set.
-icu::UnicodeSet BuildIgnoreSet();
+    // g. If the value of ch >= 128 and the value of cu < 128, return ch.
+    if (ch >= 128 && cu < 128) {
+      return ch;
+    }
 
-// Function to build and return the Special Add set.
-icu::UnicodeSet BuildSpecialAddSet();
+    // h. Return cu.
+    return cu;
+  }
+};
 
 }  // namespace internal
 }  // namespace v8
diff --git a/js/src/regexp/update-headers.py b/js/src/regexp/update-headers.py
deleted file mode 100644
index 0cff9d6aee..0000000000
--- a/js/src/regexp/update-headers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#
-# This script modifies V8 regexp source files to make them suitable for
-# inclusion in SpiderMonkey. Specifically, it:
-#
-# 1. Rewrites all #includes of V8 regexp headers to point to their location in
-#    the SM tree: src/regexp/* --> regexp/*
-# 2. Removes all #includes of other V8 src/* headers. The required definitions
-#    will be provided by regexp-shim.h.
-#
-# Usage:
-#    cd js/src/regexp
-#    find . -name "*.h" -o -name "*.cc" | xargs ./update_headers.py
-#
-
-import fileinput
-import re
-import sys
-
-# 1. Rewrite includes of V8 regexp headers
-regexp_include = re.compile('#include "src/regexp')
-regexp_include_new = '#include "regexp'
-
-# 2. Remove includes of other V8 headers
-other_include = re.compile('#include "src/')
-
-for line in fileinput.input(inplace=1):
-    if regexp_include.search(line):
-        sys.stdout.write(re.sub(regexp_include, regexp_include_new, line))
-    elif other_include.search(line):
-        pass
-    else:
-        sys.stdout.write(line)
author	Matt A. Tobin <email@mattatobin.com>	2020-11-09 20:37:05 -0500
committer	Matt A. Tobin <email@mattatobin.com>	2020-11-09 20:37:05 -0500
commit	51468e998c8e7191ddecacec3944c806b29dd590 (patch)
tree	c713f075c54781868ec119ea5c5f3c9369af3576
parent	77746f1d900a35eceb23bd760983e95de7b4a547 (diff)
download	uxp-51468e998c8e7191ddecacec3944c806b29dd590.tar.gz