summaryrefslogtreecommitdiff
path: root/js/src/new-regexp/gen-regexp-special-case.cc
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/new-regexp/gen-regexp-special-case.cc')
-rw-r--r--js/src/new-regexp/gen-regexp-special-case.cc165
1 files changed, 0 insertions, 165 deletions
diff --git a/js/src/new-regexp/gen-regexp-special-case.cc b/js/src/new-regexp/gen-regexp-special-case.cc
deleted file mode 100644
index 5a82c5d27..000000000
--- a/js/src/new-regexp/gen-regexp-special-case.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright 2020 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-
-#include "new-regexp/special-case.h"
-
-namespace v8 {
-namespace internal {
-
-static const uc32 kSurrogateStart = 0xd800;
-static const uc32 kSurrogateEnd = 0xdfff;
-static const uc32 kNonBmpStart = 0x10000;
-
-// The following code generates "src/regexp/special-case.cc".
-void PrintSet(std::ofstream& out, const char* name,
- const icu::UnicodeSet& set) {
- out << "icu::UnicodeSet Build" << name << "() {\n"
- << " icu::UnicodeSet set;\n";
- for (int32_t i = 0; i < set.getRangeCount(); i++) {
- if (set.getRangeStart(i) == set.getRangeEnd(i)) {
- out << " set.add(0x" << set.getRangeStart(i) << ");\n";
- } else {
- out << " set.add(0x" << set.getRangeStart(i) << ", 0x"
- << set.getRangeEnd(i) << ");\n";
- }
- }
- out << " set.freeze();\n"
- << " return set;\n"
- << "}\n\n";
-
- out << "struct " << name << "Data {\n"
- << " " << name << "Data() : set(Build" << name << "()) {}\n"
- << " const icu::UnicodeSet set;\n"
- << "};\n\n";
-
- out << "//static\n"
- << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
- << " static base::LazyInstance<" << name << "Data>::type set =\n"
- << " LAZY_INSTANCE_INITIALIZER;\n"
- << " return set.Pointer()->set;\n"
- << "}\n\n";
-}
-
-void PrintSpecial(std::ofstream& out) {
- icu::UnicodeSet current;
- icu::UnicodeSet special_add;
- icu::UnicodeSet ignore;
- UErrorCode status = U_ZERO_ERROR;
- icu::UnicodeSet upper("[\\p{Lu}]", status);
- CHECK(U_SUCCESS(status));
-
- // Iterate through all chars in BMP except surrogates.
- for (UChar32 i = 0; i < kNonBmpStart; i++) {
- if (i >= kSurrogateStart && i <= kSurrogateEnd) {
- continue; // Ignore surrogate range
- }
- current.set(i, i);
- current.closeOver(USET_CASE_INSENSITIVE);
-
- // Check to see if all characters in the case-folding equivalence
- // class as defined by UnicodeSet::closeOver all map to the same
- // canonical value.
- UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
- bool class_has_matching_canonical_char = false;
- bool class_has_non_matching_canonical_char = false;
- for (int32_t j = 0; j < current.getRangeCount(); j++) {
- for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
- c++) {
- if (c == i) {
- continue;
- }
- UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
- if (canonical == other_canonical) {
- class_has_matching_canonical_char = true;
- } else {
- class_has_non_matching_canonical_char = true;
- }
- }
- }
- // If any other character in i's equivalence class has a
- // different canonical value, then i needs special handling. If
- // no other character shares a canonical value with i, we can
- // ignore i when adding alternatives for case-independent
- // comparison. If at least one other character shares a
- // canonical value, then i needs special handling.
- if (class_has_non_matching_canonical_char) {
- if (class_has_matching_canonical_char) {
- special_add.add(i);
- } else {
- ignore.add(i);
- }
- }
- }
-
- // Verify that no Unicode equivalence class contains two non-trivial
- // JS equivalence classes. Every character in SpecialAddSet has the
- // same canonical value as every other non-IgnoreSet character in
- // its Unicode equivalence class. Therefore, if we call closeOver on
- // a set containing no IgnoreSet characters, the only characters
- // that must be removed from the result are in IgnoreSet. This fact
- // is used in CharacterRange::AddCaseEquivalents.
- for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
- for (UChar32 c = special_add.getRangeStart(i);
- c <= special_add.getRangeEnd(i); c++) {
- UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
- current.set(c, c);
- current.closeOver(USET_CASE_INSENSITIVE);
- current.removeAll(ignore);
- for (int32_t j = 0; j < current.getRangeCount(); j++) {
- for (UChar32 c2 = current.getRangeStart(j);
- c2 <= current.getRangeEnd(j); c2++) {
- CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
- }
- }
- }
- }
-
- PrintSet(out, "IgnoreSet", ignore);
- PrintSet(out, "SpecialAddSet", special_add);
-}
-
-void WriteHeader(const char* header_filename) {
- std::ofstream out(header_filename);
- out << std::hex << std::setfill('0') << std::setw(4);
- out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
- << "// Use of this source code is governed by a BSD-style license that\n"
- << "// can be found in the LICENSE file.\n\n"
- << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
- << "// The following functions are used to build UnicodeSets\n"
- << "// for special cases where the case-folding algorithm used by\n"
- << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
- << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
- << "// Semantics: Canonicalize) step 3.\n\n"
- << "#ifdef V8_INTL_SUPPORT\n"
- << "#include \"src/base/lazy-instance.h\"\n\n"
- << "#include \"src/regexp/special-case.h\"\n\n"
- << "#include \"unicode/uniset.h\"\n"
- << "namespace v8 {\n"
- << "namespace internal {\n\n";
-
- PrintSpecial(out);
-
- out << "\n"
- << "} // namespace internal\n"
- << "} // namespace v8\n"
- << "#endif // V8_INTL_SUPPORT\n";
-}
-
-} // namespace internal
-} // namespace v8
-
-int main(int argc, const char** argv) {
- if (argc != 2) {
- std::cerr << "Usage: " << argv[0] << " <output filename>\n";
- std::exit(1);
- }
- v8::internal::WriteHeader(argv[1]);
-
- return 0;
-}