From 74124f150b7167b69f0f4ae6657489c5db556ad3 Mon Sep 17 00:00:00 2001
From: Moonchild
Date: Fri, 2 Sep 2022 19:29:43 +0000
Subject: Issue #1999 - Switch to the unorm2 API for String.normalize()

Resolves #1999
---
Reviewer notes (this section is ignored when the patch is applied):
  * js::str_normalize() replaces UNormalizationMode/unorm_normalize() with a
    local NormalizationForm enum and the shared UNormalizer2 instances.
  * Two early returns hand back the input string itself: Latin-1 input with
    form NFC, and input that unorm2_spanQuickCheckYes() reports as normalized
    over its full length.
  * Otherwise the already-normalized prefix is copied into the output buffer
    and only the remainder goes through unorm2_normalizeSecondAndAppend().
  * ICU failures now report JSMSG_INTERNAL_INTL_ERROR; the removed code
    returned false without reporting an error.

 config/check_spidermonkey_style.py |   2 +-
 js/src/jsstr.cpp                   | 115 ++++++++++++++++++++++++++++---------
 2 files changed, 89 insertions(+), 28 deletions(-)

diff --git a/config/check_spidermonkey_style.py b/config/check_spidermonkey_style.py
index eb272a81c6..cb9e2418f2 100644
--- a/config/check_spidermonkey_style.py
+++ b/config/check_spidermonkey_style.py
@@ -89,7 +89,7 @@ included_inclnames_to_ignore = set([
     'unicode/udisplaycontext.h',# ICU
     'unicode/uenum.h',          # ICU
     'unicode/uniset.h',         # ICU
-    'unicode/unorm.h',          # ICU
+    'unicode/unorm2.h',         # ICU
     'unicode/unum.h',           # ICU
     'unicode/unumsys.h',        # ICU
     'unicode/upluralrules.h',   # ICU
diff --git a/js/src/jsstr.cpp b/js/src/jsstr.cpp
index 4167d78741..b9e10b61b9 100644
--- a/js/src/jsstr.cpp
+++ b/js/src/jsstr.cpp
@@ -35,7 +35,7 @@
 #include "jit/InlinableNatives.h"
 #include "js/Conversions.h"
 #include "js/UniquePtr.h"
-#include "unicode/unorm.h"
+#include "unicode/unorm2.h"
 #include "vm/GlobalObject.h"
 #include "vm/Interpreter.h"
 #include "vm/Opcodes.h"
@@ -900,79 +900,140 @@ js::str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp)
     return ToUpperCaseHelper(cx, args);
 }
 
-/* ES6 20140210 draft 21.1.3.12. */
+/* ES2017 21.1.3.12. */
 bool
 js::str_normalize(JSContext* cx, unsigned argc, Value* vp)
 {
     CallArgs args = CallArgsFromVp(argc, vp);
 
-    // Steps 1-3.
+    // Steps 1-2.
     RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
     if (!str)
         return false;
 
-    // Step 4.
-    UNormalizationMode form;
+    enum NormalizationForm {
+        NFC, NFD, NFKC, NFKD
+    };
+
+    NormalizationForm form;
     if (!args.hasDefined(0)) {
-        form = UNORM_NFC;
+        // Step 3.
+        form = NFC;
     } else {
-        // Steps 5-6.
+        // Step 4.
         RootedLinearString formStr(cx, ArgToRootedString(cx, args, 0));
         if (!formStr)
             return false;
 
-        // Step 7.
+        // Step 5.
         if (EqualStrings(formStr, cx->names().NFC)) {
-            form = UNORM_NFC;
+            form = NFC;
         } else if (EqualStrings(formStr, cx->names().NFD)) {
-            form = UNORM_NFD;
+            form = NFD;
         } else if (EqualStrings(formStr, cx->names().NFKC)) {
-            form = UNORM_NFKC;
+            form = NFKC;
         } else if (EqualStrings(formStr, cx->names().NFKD)) {
-            form = UNORM_NFKD;
+            form = NFKD;
         } else {
             JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INVALID_NORMALIZE_FORM);
             return false;
         }
     }
 
-    // Step 8.
+    JSLinearString* linear = str->ensureLinear(cx);
+    if (!linear)
+        return false;
+
+    // Latin1 strings are already in Normalization Form C.
+    if (form == NFC && linear->hasLatin1Chars()) {
+        // Step 7.
+        args.rval().setString(str);
+        return true;
+    }
+
+    // Step 6.
     AutoStableStringChars stableChars(cx);
-    if (!str->ensureFlat(cx) || !stableChars.initTwoByte(cx, str))
+    if (!stableChars.initTwoByte(cx, linear))
+        return false;
+
+    mozilla::Range<const char16_t> srcChars = stableChars.twoByteRange();
+
+    // The unorm2_getXXXInstance() methods return a shared instance which must
+    // not be deleted.
+    UErrorCode status = U_ZERO_ERROR;
+    const UNormalizer2* normalizer;
+    if (form == NFC) {
+        normalizer = unorm2_getNFCInstance(&status);
+    } else if (form == NFD) {
+        normalizer = unorm2_getNFDInstance(&status);
+    } else if (form == NFKC) {
+        normalizer = unorm2_getNFKCInstance(&status);
+    } else {
+        MOZ_ASSERT(form == NFKD);
+        normalizer = unorm2_getNFKDInstance(&status);
+    }
+    if (U_FAILURE(status)) {
+        JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
         return false;
+    }
+
+    int32_t spanLength = unorm2_spanQuickCheckYes(normalizer,
+                                                  Char16ToUChar(srcChars.begin().get()),
+                                                  srcChars.length(), &status);
+    if (U_FAILURE(status)) {
+        JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+        return false;
+    }
+    MOZ_ASSERT(0 <= spanLength && size_t(spanLength) <= srcChars.length());
+
+    // Return if the input string is already normalized.
+    if (size_t(spanLength) == srcChars.length()) {
+        // Step 7.
+        args.rval().setString(str);
+        return true;
+    }
 
     static const size_t INLINE_CAPACITY = 32;
 
-    const UChar* srcChars = Char16ToUChar(stableChars.twoByteRange().begin().get());
-    int32_t srcLen = AssertedCast<int32_t>(str->length());
     Vector<char16_t, INLINE_CAPACITY> chars(cx);
-    if (!chars.resize(INLINE_CAPACITY))
+    if (!chars.resize(Max(INLINE_CAPACITY, srcChars.length())))
         return false;
 
-    UErrorCode status = U_ZERO_ERROR;
-    int32_t size = unorm_normalize(srcChars, srcLen, form, 0,
-                                   Char16ToUChar(chars.begin()), INLINE_CAPACITY,
-                                   &status);
+    // Copy the already normalized prefix.
+    if (spanLength > 0)
+        PodCopy(chars.begin(), srcChars.begin().get(), size_t(spanLength));
+
+    mozilla::RangedPtr<const char16_t> remainingStart = srcChars.begin() + spanLength;
+    size_t remainingLength = srcChars.length() - size_t(spanLength);
+
+    int32_t size = unorm2_normalizeSecondAndAppend(normalizer, Char16ToUChar(chars.begin()),
+                                                   spanLength, chars.length(),
+                                                   Char16ToUChar(remainingStart.get()),
+                                                   remainingLength, &status);
     if (status == U_BUFFER_OVERFLOW_ERROR) {
+        MOZ_ASSERT(size >= 0);
         if (!chars.resize(size))
             return false;
         status = U_ZERO_ERROR;
 #ifdef DEBUG
         int32_t finalSize =
 #endif
-        unorm_normalize(srcChars, srcLen, form, 0,
-                        Char16ToUChar(chars.begin()), size,
-                        &status);
-        MOZ_ASSERT(size == finalSize || U_FAILURE(status), "unorm_normalize behaved inconsistently");
+        unorm2_normalizeSecondAndAppend(normalizer, Char16ToUChar(chars.begin()), spanLength,
+                                        chars.length(), Char16ToUChar(remainingStart.get()),
+                                        remainingLength, &status);
+        MOZ_ASSERT_IF(!U_FAILURE(status), size == finalSize);
     }
-    if (U_FAILURE(status))
+    if (U_FAILURE(status)) {
+        JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
         return false;
+    }
 
+    MOZ_ASSERT(size >= 0);
     JSString* ns = NewStringCopyN(cx, chars.begin(), size);
     if (!ns)
         return false;
 
-    // Step 9.
+    // Step 7.
     args.rval().setString(ns);
     return true;
 }
-- 
cgit v1.2.3