diff options
author | Martok <martok@martoks-place.de> | 2023-06-29 23:03:12 +0200 |
---|---|---|
committer | Martok <martok@martoks-place.de> | 2023-06-29 23:03:12 +0200 |
commit | 9eb285a9fb89cfd64ca9c9cba77746af4547f0a4 (patch) | |
tree | 18e977c43d94bc8c0bdafad1a3f915e30498bae8 | |
parent | bbe3b8e12f1bbd274baeadf26ea7d124b344ef2b (diff) | |
download | uxp-9eb285a9fb89cfd64ca9c9cba77746af4547f0a4.tar.gz |
Issue #2259 - Implement caseFirst option in Intl.Collator
Based-on: m-c 866473
-rw-r--r-- | js/src/builtin/intl/Collator.cpp | 22 | ||||
-rw-r--r-- | js/src/builtin/intl/Collator.h | 9 | ||||
-rw-r--r-- | js/src/builtin/intl/Collator.js | 36 | ||||
-rw-r--r-- | js/src/builtin/intl/SharedIntlData.cpp | 134 | ||||
-rw-r--r-- | js/src/builtin/intl/SharedIntlData.h | 83 | ||||
-rw-r--r-- | js/src/vm/SelfHosting.cpp | 1 |
6 files changed, 261 insertions, 24 deletions
diff --git a/js/src/builtin/intl/Collator.cpp b/js/src/builtin/intl/Collator.cpp index d54bcaa971..ab031dde52 100644 --- a/js/src/builtin/intl/Collator.cpp +++ b/js/src/builtin/intl/Collator.cpp @@ -373,8 +373,10 @@ NewUCollator(JSContext* cx, Handle<CollatorObject*> collator) uCaseFirst = UCOL_UPPER_FIRST;
else if (StringsAreEqual(caseFirst, "lower"))
uCaseFirst = UCOL_LOWER_FIRST;
- else
+ else {
MOZ_ASSERT(StringsAreEqual(caseFirst, "false"));
+ uCaseFirst = UCOL_OFF;
+ }
}
UErrorCode status = U_ZERO_ERROR;
@@ -463,3 +465,21 @@ js::intl_CompareStrings(JSContext* cx, unsigned argc, Value* vp) RootedString str2(cx, args[2].toString());
return intl_CompareStrings(cx, coll, str1, str2, args.rval());
}
+
+bool
+js::intl_isUpperCaseFirst(JSContext* cx, unsigned argc, Value* vp)
+{
+ CallArgs args = CallArgsFromVp(argc, vp);
+ MOZ_ASSERT(args.length() == 1);
+ MOZ_ASSERT(args[0].isString());
+
+ SharedIntlData& sharedIntlData = cx->sharedIntlData;
+
+ RootedString locale(cx, args[0].toString());
+ bool isUpperFirst;
+ if (!sharedIntlData.isUpperCaseFirst(cx, locale, &isUpperFirst))
+ return false;
+
+ args.rval().setBoolean(isUpperFirst);
+ return true;
+}
diff --git a/js/src/builtin/intl/Collator.h b/js/src/builtin/intl/Collator.h index 4c6889b35b..9e77ee6f46 100644 --- a/js/src/builtin/intl/Collator.h +++ b/js/src/builtin/intl/Collator.h @@ -87,6 +87,15 @@ intl_availableCollations(JSContext* cx, unsigned argc, Value* vp); extern MOZ_MUST_USE bool
intl_CompareStrings(JSContext* cx, unsigned argc, Value* vp);
+/**
+ * Returns true if the given locale sorts upper-case before lower-case
+ * characters.
+ *
+ * Usage: result = intl_isUpperCaseFirst(locale)
+ */
+extern MOZ_MUST_USE bool
+intl_isUpperCaseFirst(JSContext* cx, unsigned argc, Value* vp);
+
} // namespace js
diff --git a/js/src/builtin/intl/Collator.js b/js/src/builtin/intl/Collator.js index eb96f6cc54..eba09d3c83 100644 --- a/js/src/builtin/intl/Collator.js +++ b/js/src/builtin/intl/Collator.js @@ -258,16 +258,41 @@ var collatorInternalProperties = { addSpecialMissingLanguageTags(locales);
return (this._availableLocales = locales);
},
- relevantExtensionKeys: ["co", "kn"]
+ relevantExtensionKeys: ["co", "kn", "kf"]
};
+/**
+ * Returns the default caseFirst values for the given locale and usage. The
+ * first element in the returned array denotes the default value per ES2017
+ * Intl, 9.1 Internal slots of Service Constructors.
+ */
+function collatorCaseFirst(locale, usage) {
+ assert(typeof locale === "string", "locale should be string");
+ assert(usage === "sort" || usage === "search", "invalid usage option");
+
+ if (usage === "sort") {
+ // If |locale| is the default locale (e.g. da-DK), but only supported
+ // through a fallback (da), we need to get the actual locale before we
+ // can call intl_isUpperCaseFirst. Also see BestAvailableLocaleHelper.
+ var availableLocales = callFunction(collatorInternalProperties.availableLocales,
+ collatorInternalProperties);
+ var actualLocale = BestAvailableLocaleIgnoringDefault(availableLocales, locale);
+
+ if (intl_isUpperCaseFirst(actualLocale))
+ return ["upper", "false", "lower"];
+ }
+
+ // Default caseFirst values for all other locales and for "search" usage.
+ return ["false", "lower", "upper"];
+}
+
+
function collatorSortLocaleData(locale) {
- var collations = intl_availableCollations(locale);
- callFunction(std_Array_unshift, collations, null);
return {
- co: collations,
- kn: ["false", "true"]
+ co: intl_availableCollations(locale),
+ kn: ["false", "true"],
+ kf: collatorCaseFirst(locale, "sort"),
};
}
@@ -276,6 +301,7 @@ function collatorSearchLocaleData(locale) { return {
co: [null],
kn: ["false", "true"],
+ kf: collatorCaseFirst(locale, "search"),
// In theory the default sensitivity is locale dependent;
// in reality the CLDR/ICU default strength is always tertiary.
sensitivity: "variant"
diff --git a/js/src/builtin/intl/SharedIntlData.cpp b/js/src/builtin/intl/SharedIntlData.cpp index def8ceaf07..03d6a4d3e8 100644 --- a/js/src/builtin/intl/SharedIntlData.cpp +++ b/js/src/builtin/intl/SharedIntlData.cpp @@ -63,15 +63,12 @@ EqualCharsIgnoreCaseASCII(const Char1* s1, const Char2* s2, size_t len) }
js::intl::SharedIntlData::TimeZoneHasher::Lookup::Lookup(JSFlatString* timeZone)
- : isLatin1(timeZone->hasLatin1Chars()), length(timeZone->length())
+ : js::intl::SharedIntlData::LinearStringLookup(timeZone)
{
- if (isLatin1) {
- latin1Chars = timeZone->latin1Chars(nogc);
+ if (isLatin1)
hash = HashStringIgnoreCaseASCII(latin1Chars, length);
- } else {
- twoByteChars = timeZone->twoByteChars(nogc);
+ else
hash = HashStringIgnoreCaseASCII(twoByteChars, length);
- }
}
bool
@@ -110,7 +107,7 @@ js::intl::SharedIntlData::ensureTimeZones(JSContext* cx) if (timeZoneDataInitialized)
return true;
- // If initTimeZones() was called previously, but didn't complete due to
+ // If ensureTimeZones() was called previously, but didn't complete due to
// OOM, clear all sets/maps and start from scratch.
if (availableTimeZones.initialized())
availableTimeZones.finish();
@@ -272,12 +269,131 @@ js::intl::SharedIntlData::tryCanonicalizeTimeZoneConsistentWithIANA(JSContext* c return true;
}
+js::intl::SharedIntlData::LocaleHasher::Lookup::Lookup(JSLinearString* locale)
+ : js::intl::SharedIntlData::LinearStringLookup(locale)
+{
+ if (isLatin1)
+ hash = mozilla::HashString(latin1Chars, length);
+ else
+ hash = mozilla::HashString(twoByteChars, length);
+}
+
+bool
+js::intl::SharedIntlData::LocaleHasher::match(Locale key, const Lookup& lookup)
+{
+ if (key->length() != lookup.length)
+ return false;
+
+ if (key->hasLatin1Chars()) {
+ const Latin1Char* keyChars = key->latin1Chars(lookup.nogc);
+ if (lookup.isLatin1)
+ return EqualChars(keyChars, lookup.latin1Chars, lookup.length);
+ return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
+ }
+
+ const char16_t* keyChars = key->twoByteChars(lookup.nogc);
+ if (lookup.isLatin1)
+ return EqualChars(lookup.latin1Chars, keyChars, lookup.length);
+ return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
+}
+
+bool
+js::intl::SharedIntlData::ensureUpperCaseFirstLocales(JSContext* cx)
+{
+ if (upperCaseFirstInitialized)
+ return true;
+
+ // If ensureUpperCaseFirstLocales() was called previously, but didn't
+ // complete due to OOM, clear all data and start from scratch.
+ if (upperCaseFirstLocales.initialized())
+ upperCaseFirstLocales.finish();
+ if (!upperCaseFirstLocales.init()) {
+ ReportOutOfMemory(cx);
+ return false;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ UEnumeration* available = ucol_openAvailableLocales(&status);
+ if (U_FAILURE(status)) {
+ JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+ return false;
+ }
+ ScopedICUObject<UEnumeration, uenum_close> toClose(available);
+
+ RootedAtom locale(cx);
+ while (true) {
+ int32_t size;
+ const char* rawLocale = uenum_next(available, &size, &status);
+ if (U_FAILURE(status)) {
+ JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+ return false;
+ }
+
+ if (rawLocale == nullptr)
+ break;
+
+ UCollator* collator = ucol_open(rawLocale, &status);
+ if (U_FAILURE(status)) {
+ JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+ return false;
+ }
+ ScopedICUObject<UCollator, ucol_close> toCloseCollator(collator);
+
+ UColAttributeValue caseFirst = ucol_getAttribute(collator, UCOL_CASE_FIRST, &status);
+ if (U_FAILURE(status)) {
+ JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+ return false;
+ }
+
+ if (caseFirst != UCOL_UPPER_FIRST)
+ continue;
+
+ MOZ_ASSERT(size >= 0);
+ locale = Atomize(cx, rawLocale, size_t(size));
+ if (!locale)
+ return false;
+
+ LocaleHasher::Lookup lookup(locale);
+ LocaleSet::AddPtr p = upperCaseFirstLocales.lookupForAdd(lookup);
+
+ // ICU shouldn't report any duplicate locales, but if it does, just
+ // ignore the duplicated locale.
+ if (!p && !upperCaseFirstLocales.add(p, locale)) {
+ ReportOutOfMemory(cx);
+ return false;
+ }
+ }
+
+ MOZ_ASSERT(!upperCaseFirstInitialized,
+ "ensureUpperCaseFirstLocales is neither reentrant nor thread-safe");
+ upperCaseFirstInitialized = true;
+
+ return true;
+}
+
+bool
+js::intl::SharedIntlData::isUpperCaseFirst(JSContext* cx, HandleString locale, bool* isUpperFirst)
+{
+ if (!ensureUpperCaseFirstLocales(cx))
+ return false;
+
+ RootedLinearString localeLinear(cx, locale->ensureLinear(cx));
+ if (!localeLinear)
+ return false;
+
+ LocaleHasher::Lookup lookup(localeLinear);
+ *isUpperFirst = upperCaseFirstLocales.has(lookup);
+
+ return true;
+}
+
void
js::intl::SharedIntlData::destroyInstance()
{
availableTimeZones.finish();
ianaZonesTreatedAsLinksByICU.finish();
ianaLinksCanonicalizedDifferentlyByICU.finish();
+ upperCaseFirstLocales.finish();
}
void
@@ -288,6 +404,7 @@ js::intl::SharedIntlData::trace(JSTracer* trc) availableTimeZones.trace(trc);
ianaZonesTreatedAsLinksByICU.trace(trc);
ianaLinksCanonicalizedDifferentlyByICU.trace(trc);
+ upperCaseFirstLocales.trace(trc);
}
}
@@ -296,5 +413,6 @@ js::intl::SharedIntlData::sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf {
return availableTimeZones.sizeOfExcludingThis(mallocSizeOf) +
ianaZonesTreatedAsLinksByICU.sizeOfExcludingThis(mallocSizeOf) +
- ianaLinksCanonicalizedDifferentlyByICU.sizeOfExcludingThis(mallocSizeOf);
+ ianaLinksCanonicalizedDifferentlyByICU.sizeOfExcludingThis(mallocSizeOf) +
+ upperCaseFirstLocales.sizeOfExcludingThis(mallocSizeOf);
}
diff --git a/js/src/builtin/intl/SharedIntlData.h b/js/src/builtin/intl/SharedIntlData.h index 959d1d6a74..e171c8dfea 100644 --- a/js/src/builtin/intl/SharedIntlData.h +++ b/js/src/builtin/intl/SharedIntlData.h @@ -30,6 +30,28 @@ namespace intl { */
class SharedIntlData
{
+ struct LinearStringLookup
+ {
+ union {
+ const JS::Latin1Char* latin1Chars;
+ const char16_t* twoByteChars;
+ };
+ bool isLatin1;
+ size_t length;
+ JS::AutoCheckCannotGC nogc;
+ HashNumber hash = 0;
+
+ explicit LinearStringLookup(JSLinearString* string)
+ : isLatin1(string->hasLatin1Chars()), length(string->length())
+ {
+ if (isLatin1)
+ latin1Chars = string->latin1Chars(nogc);
+ else
+ twoByteChars = string->twoByteChars(nogc);
+ }
+ };
+
+ private:
/**
* Information tracking the set of the supported time zone names, derived
* from the IANA time zone database <https://www.iana.org/time-zones>.
@@ -59,17 +81,8 @@ class SharedIntlData struct TimeZoneHasher
{
- struct Lookup
+ struct Lookup : LinearStringLookup
{
- union {
- const JS::Latin1Char* latin1Chars;
- const char16_t* twoByteChars;
- };
- bool isLatin1;
- size_t length;
- JS::AutoCheckCannotGC nogc;
- HashNumber hash;
-
explicit Lookup(JSFlatString* timeZone);
};
@@ -148,7 +161,57 @@ class SharedIntlData */
bool tryCanonicalizeTimeZoneConsistentWithIANA(JSContext* cx, JS::HandleString timeZone,
JS::MutableHandleString result);
+ private:
+ /**
+ * The case first parameter (BCP47 key "kf") allows switching the order of
+ * upper- and lower-case characters. ICU doesn't directly provide an API
+ * to query the default case first value of a given locale, but instead
+ * requires instantiating a collator object and then querying its case first
+ * attribute (UCOL_CASE_FIRST).
+ * To avoid instantiating an additional collator object whenever we need
+ * to retrieve the default case first value of a specific locale, we
+ * compute the default case first value for every supported locale only
+ * once and then keep a list of all locales which don't use the default
+ * case first setting.
+ * There is almost no difference between lower-case first and when case
+ * first is disabled (UCOL_LOWER_FIRST vs. UCOL_OFF), so we only need to
+ * track locales which use upper-case first as their default setting.
+ */
+
+ using Locale = JSAtom*;
+
+ struct LocaleHasher
+ {
+ struct Lookup : LinearStringLookup
+ {
+ explicit Lookup(JSLinearString* locale);
+ };
+ static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; }
+ static bool match(Locale key, const Lookup& lookup);
+ };
+
+ using LocaleSet = js::GCHashSet<Locale,
+ LocaleHasher,
+ js::SystemAllocPolicy>;
+
+ LocaleSet upperCaseFirstLocales;
+
+ bool upperCaseFirstInitialized = false;
+
+ /**
+ * Precomputes the available locales which use upper-case first sorting.
+ */
+ bool ensureUpperCaseFirstLocales(JSContext* cx);
+
+ public:
+ /**
+ * Sets |isUpperFirst| to true if |locale| sorts upper-case characters before
+ * lower-case characters, and to false otherwise. Returns false on failure.
+ */
+ bool isUpperCaseFirst(JSContext* cx, JS::HandleString locale, bool* isUpperFirst);
+
+ public:
void destroyInstance();
void trace(JSTracer* trc);
diff --git a/js/src/vm/SelfHosting.cpp b/js/src/vm/SelfHosting.cpp index e4695332cf..bc66d6aa1e 100644 --- a/js/src/vm/SelfHosting.cpp +++ b/js/src/vm/SelfHosting.cpp @@ -2475,6 +2475,7 @@ static const JSFunctionSpec intrinsic_functions[] = { JS_FN("intl_FormatNumber", intl_FormatNumber, 2,0), JS_FN("intl_GetCalendarInfo", intl_GetCalendarInfo, 1,0), JS_FN("intl_ComputeDisplayNames", intl_ComputeDisplayNames, 3,0), + JS_FN("intl_isUpperCaseFirst", intl_isUpperCaseFirst, 1,0), JS_FN("intl_IsValidTimeZoneName", intl_IsValidTimeZoneName, 1,0), JS_FN("intl_NumberFormat", intl_NumberFormat, 2,0), JS_FN("intl_NumberFormat_availableLocales", intl_NumberFormat_availableLocales, 0,0), |