diff options
author | Martok <martok@martoks-place.de> | 2023-06-29 23:09:26 +0200 |
---|---|---|
committer | Martok <martok@martoks-place.de> | 2023-06-30 00:01:35 +0200 |
commit | af47a256b5cf2b81e4c3bf8f36682f8b9f31be42 (patch) | |
tree | af1b472d545dcd80afa9de5e468912f39cf8ee12 /js/src | |
parent | e96f965422528636e13adc3473679248941540e7 (diff) | |
download | uxp-af47a256b5cf2b81e4c3bf8f36682f8b9f31be42.tar.gz |
Issue #1819 - Further align Intl.Locale to spec
- Reference updates (UTS 35)
- variant subtag and transform extension canonicalisation
Diffstat (limited to 'js/src')
-rw-r--r-- | js/src/builtin/intl/Collator.cpp | 49 | ||||
-rw-r--r-- | js/src/builtin/intl/DateTimeFormat.cpp | 56 | ||||
-rw-r--r-- | js/src/builtin/intl/DateTimeFormat.js | 33 | ||||
-rw-r--r-- | js/src/builtin/intl/IntlObject.cpp | 4 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTag.cpp | 210 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTag.h | 96 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTagGenerated.cpp | 195 | ||||
-rw-r--r-- | js/src/builtin/intl/Locale.cpp | 258 | ||||
-rw-r--r-- | js/src/builtin/intl/Locale.h | 3 | ||||
-rw-r--r-- | js/src/builtin/intl/NumberFormat.cpp | 42 | ||||
-rw-r--r-- | js/src/builtin/intl/NumberFormat.js | 12 | ||||
-rw-r--r-- | js/src/builtin/intl/make_intl_data.py | 319 | ||||
-rw-r--r-- | js/src/vm/SelfHosting.cpp | 1 |
13 files changed, 958 insertions, 320 deletions
diff --git a/js/src/builtin/intl/Collator.cpp b/js/src/builtin/intl/Collator.cpp index 5f142d7e6d..450c654620 100644 --- a/js/src/builtin/intl/Collator.cpp +++ b/js/src/builtin/intl/Collator.cpp @@ -8,12 +8,14 @@ #include "builtin/intl/Collator.h"
#include "mozilla/Assertions.h"
+#include "mozilla/Span.h"
#include "jsapi.h"
#include "jscntxt.h"
#include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/LanguageTag.h"
#include "builtin/intl/ScopedICUObject.h"
#include "builtin/intl/SharedIntlData.h"
#include "js/TypeDecls.h"
@@ -283,32 +285,33 @@ NewUCollator(JSContext* cx, Handle<CollatorObject*> collator) return nullptr;
if (StringsAreEqual(usage, "search")) {
// ICU expects search as a Unicode locale extension on locale.
- // Unicode locale extensions must occur before private use extensions.
- const char* oldLocale = locale.ptr();
- const char* p;
- size_t index;
- size_t localeLen = strlen(oldLocale);
- if ((p = strstr(oldLocale, "-x-")))
- index = p - oldLocale;
- else
- index = localeLen;
-
- const char* insert;
- if ((p = strstr(oldLocale, "-u-")) && static_cast<size_t>(p - oldLocale) < index) {
- index = p - oldLocale + 2;
- insert = "-co-search";
- } else {
- insert = "-u-co-search";
+ intl::LanguageTag tag(cx);
+ if (!intl::LanguageTagParser::parse(
+ cx, mozilla::MakeCStringSpan(locale.ptr()), tag)) {
+ return nullptr;
+ }
+
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
+
+ if (!keywords.emplaceBack("co", cx->names().search)) {
+ return nullptr;
}
- size_t insertLen = strlen(insert);
- char* newLocale = cx->pod_malloc<char>(localeLen + insertLen + 1);
- if (!newLocale)
+
+ // |ApplyUnicodeExtensionToTag| applies the new keywords to the front of
+ // the Unicode extension subtag. We're then relying on ICU to follow RFC
+ // 6067, which states that any trailing keywords using the same key
+ // should be ignored.
+ if (!intl::ApplyUnicodeExtensionToTag(cx, tag, keywords)) {
return nullptr;
- memcpy(newLocale, oldLocale, index);
- memcpy(newLocale + index, insert, insertLen);
- memcpy(newLocale + index + insertLen, oldLocale + index, localeLen - index + 1); // '\0'
+ }
+
locale.clear();
- locale.initBytes(newLocale);
+ locale.encodeLatin1(cx, tag.toString(cx));
+ if (!locale) {
+ return nullptr;
+ }
+ } else {
+ MOZ_ASSERT(StringsAreEqual(usage, "sort"));
}
// We don't need to look at the collation property - it can only be set
diff --git a/js/src/builtin/intl/DateTimeFormat.cpp b/js/src/builtin/intl/DateTimeFormat.cpp index 78e863eedf..0dd724bf2e 100644 --- a/js/src/builtin/intl/DateTimeFormat.cpp +++ b/js/src/builtin/intl/DateTimeFormat.cpp @@ -15,6 +15,7 @@ #include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/LanguageTag.h"
#include "builtin/intl/ScopedICUObject.h"
#include "builtin/intl/SharedIntlData.h"
#include "builtin/intl/TimeZoneDataGenerated.h"
@@ -582,14 +583,57 @@ NewUDateFormat(JSContext* cx, Handle<DateTimeFormatObject*> dateTimeFormat) if (!GetProperty(cx, internals, internals, cx->names().locale, &value))
return nullptr;
- JSAutoByteString locale(cx, value.toString());
- if (!locale)
- return nullptr;
- // We don't need to look at calendar and numberingSystem - they can only be
- // set via the Unicode locale extension and are therefore already set on
+ // ICU expects calendar and numberingSystem as Unicode locale extensions on
// locale.
+ intl::LanguageTag tag(cx);
+ {
+ JSLinearString* locale = value.toString()->ensureLinear(cx);
+ if (!locale)
+ return nullptr;
+
+ if (!intl::LanguageTagParser::parse(cx, locale, tag))
+ return nullptr;
+ }
+
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
+
+ if (!GetProperty(cx, internals, internals, cx->names().calendar, &value))
+ return nullptr;
+
+ {
+ JSLinearString* calendar = value.toString()->ensureLinear(cx);
+ if (!calendar)
+ return nullptr;
+
+ if (!keywords.emplaceBack("ca", calendar))
+ return nullptr;
+ }
+
+ if (!GetProperty(cx, internals, internals, cx->names().numberingSystem, &value))
+ return nullptr;
+
+ {
+ JSLinearString* numberingSystem = value.toString()->ensureLinear(cx);
+ if (!numberingSystem)
+ return nullptr;
+
+ if (!keywords.emplaceBack("nu", numberingSystem))
+ return nullptr;
+ }
+
+ // |ApplyUnicodeExtensionToTag| applies the new keywords to the front of
+ // the Unicode extension subtag. We're then relying on ICU to follow RFC
+ // 6067, which states that any trailing keywords using the same key
+ // should be ignored.
+ if (!intl::ApplyUnicodeExtensionToTag(cx, tag, keywords))
+ return nullptr;
+
+ UniqueChars locale = tag.toStringZ(cx);
+ if (!locale)
+ return nullptr;
+
if (!GetProperty(cx, internals, internals, cx->names().timeZone, &value))
return nullptr;
@@ -614,7 +658,7 @@ NewUDateFormat(JSContext* cx, Handle<DateTimeFormatObject*> dateTimeFormat) UErrorCode status = U_ZERO_ERROR;
UDateFormat* df =
- udat_open(UDAT_PATTERN, UDAT_PATTERN, IcuLocale(locale.ptr()), uTimeZone, uTimeZoneLength,
+ udat_open(UDAT_PATTERN, UDAT_PATTERN, IcuLocale(locale.get()), uTimeZone, uTimeZoneLength,
uPattern, uPatternLength, &status);
if (U_FAILURE(status)) {
intl::ReportInternalError(cx);
diff --git a/js/src/builtin/intl/DateTimeFormat.js b/js/src/builtin/intl/DateTimeFormat.js index 77e10fa5f7..9d1adc8687 100644 --- a/js/src/builtin/intl/DateTimeFormat.js +++ b/js/src/builtin/intl/DateTimeFormat.js @@ -20,9 +20,11 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) { // {
// localeMatcher: "lookup" / "best fit",
//
- // hour12: true / false, // optional
+ // ca: string matching a Unicode extension type, // optional
+ //
+ // nu: string matching a Unicode extension type, // optional
//
- // hourCycle: "h11" / "h12" / "h23" / "h24", // optional
+ // hc: "h11" / "h12" / "h23" / "h24", // optional
// }
//
// timeZone: IANA time zone name,
@@ -31,6 +33,8 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) { // {
// // all the properties/values listed in Table 3
// // (weekday, era, year, month, day, &c.)
+ //
+ // hour12: true / false, // optional
// }
//
// formatMatcher: "basic" / "best fit",
@@ -343,6 +347,12 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m // localeOpt: // *first* opt computed in InitializeDateTimeFormat
// {
// localeMatcher: "lookup" / "best fit",
+ //
+ // ca: string matching a Unicode extension type, // optional
+ //
+ // nu: string matching a Unicode extension type, // optional
+ //
+ // hc: "h11" / "h12" / "h23" / "h24", // optional
// }
//
// timeZone: IANA time zone name,
@@ -353,7 +363,6 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m // // (weekday, era, year, month, day, &c.)
//
// hour12: true / false, // optional
- // hourCycle: "h11" / "h12" / "h23" / "h24", // optional
// }
//
// formatMatcher: "basic" / "best fit",
@@ -382,6 +391,24 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m "best fit");
localeOpt.localeMatcher = localeMatcher;
+ var calendar = GetOption(options, "calendar", "string", undefined, undefined);
+
+ if (calendar !== undefined) {
+ calendar = intl_ValidateAndCanonicalizeUnicodeExtensionType(calendar, "calendar", "ca");
+ }
+
+ localeOpt.ca = calendar;
+
+ var numberingSystem = GetOption(options, "numberingSystem", "string", undefined, undefined);
+
+ if (numberingSystem !== undefined) {
+ numberingSystem = intl_ValidateAndCanonicalizeUnicodeExtensionType(numberingSystem,
+ "numberingSystem",
+ "nu");
+ }
+
+ localeOpt.nu = numberingSystem;
+
// Step 6.
var hr12 = GetOption(options, "hour12", "boolean", undefined, undefined);
diff --git a/js/src/builtin/intl/IntlObject.cpp b/js/src/builtin/intl/IntlObject.cpp index e0dd36dac4..2f42e1df76 100644 --- a/js/src/builtin/intl/IntlObject.cpp +++ b/js/src/builtin/intl/IntlObject.cpp @@ -548,7 +548,7 @@ js::intl_BestAvailableLocale(JSContext* cx, unsigned argc, Value* vp) MOZ_ASSERT(!tag.unicodeExtension(), "locale must contain no Unicode extensions"); - if (!tag.canonicalize(cx, intl::LanguageTag::UnicodeExtensionCanonicalForm::No)) { + if (!tag.canonicalize(cx)) { return false; } @@ -608,7 +608,7 @@ js::intl_supportedLocaleOrFallback(JSContext* cx, unsigned argc, Value* vp) return false; } } else { - if (!tag.canonicalize(cx, intl::LanguageTag::UnicodeExtensionCanonicalForm::No)) { + if (!tag.canonicalize(cx)) { return false; } diff --git a/js/src/builtin/intl/LanguageTag.cpp b/js/src/builtin/intl/LanguageTag.cpp index 583033f629..501885dd9d 100644 --- a/js/src/builtin/intl/LanguageTag.cpp +++ b/js/src/builtin/intl/LanguageTag.cpp @@ -27,7 +27,9 @@ #include "builtin/intl/CommonFunctions.h" #include "ds/Sort.h" +#include "gc/Tracer.h" #include "js/Result.h" +#include "js/TracingAPI.h" #include "js/Utility.h" #include "js/Vector.h" #include "unicode/uloc.h" @@ -259,10 +261,11 @@ static bool SortAlphabetically(JSContext* cx, return true; } -bool LanguageTag::canonicalizeBaseName(JSContext* cx) { - // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by - // normalizing the case and ordering all subtags. The canonical syntax form - // itself is specified in UTS 35, 3.2.1. +bool LanguageTag::canonicalizeBaseName(JSContext* cx, + DuplicateVariants duplicateVariants) { + // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to + // canonicalize the syntax by normalizing the case and ordering all subtags. + // The canonical syntax form is specified in UTS 35, 3.2.1. // Language codes need to be in lower case. 
"JA" -> "ja" language_.toLowerCase(); @@ -299,25 +302,42 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) { return false; } - // Reject the Locale identifier if a duplicate variant was found, e.g. - // "en-variant-Variant". - const UniqueChars* duplicate = std::adjacent_find( - variants().begin(), variants().end(), [](const auto& a, const auto& b) { - return strcmp(a.get(), b.get()) == 0; - }); - if (duplicate != variants().end()) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG, - duplicate->get()); - return false; + if (duplicateVariants == DuplicateVariants::Reject) { + // Reject the Locale identifier if a duplicate variant was found, e.g. + // "en-variant-Variant". + const UniqueChars* duplicate = + std::adjacent_find(variants().begin(), variants().end(), + [](const auto& a, const auto& b) { + return strcmp(a.get(), b.get()) == 0; + }); + if (duplicate != variants().end()) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_DUPLICATE_VARIANT_SUBTAG, + duplicate->get()); + return false; + } } } // 2. Any extensions are in alphabetical order by their singleton. - // - A subsequent call to canonicalizeExtensions() will perform this. + // 3. All attributes are sorted in alphabetical order. + // 4. All keywords and tfields are sorted by alphabetical order of their keys, + // within their respective extensions. + // 5. Any type or tfield value "true" is removed. + // - A subsequent call to canonicalizeExtensions() will perform these steps. + + // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier + // into its canonical form per UTS 3.2.1. + + // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their + // canonical forms. + // - A subsequent call to canonicalizeExtensions() will perform this step. - // The next two steps in 3.3.1 replace deprecated language and region - // subtags with their preferred mappings. + // 2. 
Replace aliases in the unicode_language_id and tlang (if any). + // - tlang is handled in canonicalizeExtensions(). + + // Replace deprecated language, region, and variant subtags with their + // preferred mappings. if (!updateGrandfatheredMappings(cx)) { return false; @@ -337,19 +357,34 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) { } } - // No variant subtag replacements are currently present. + // Replace deprecated variant subtags with their preferred values. + if (!performVariantMappings(cx)) { + return false; + } + // No extension replacements are currently present. // Private use sequences are left as is. - // The two final steps in 3.3.1, handling irregular grandfathered and - // private-use only language tags, don't apply, because these two forms - // can't occur in Unicode BCP 47 locale identifiers. + // 3. Replace aliases in special key values. + // - A subsequent call to canonicalizeExtensions() will perform this step. return true; } -bool LanguageTag::canonicalizeExtensions( - JSContext* cx, UnicodeExtensionCanonicalForm canonicalForm) { +#ifdef DEBUG +template <typename CharT> +static bool IsAsciiLowercaseAlphanumericOrDash( + mozilla::Span<const CharT> span) { + const CharT* ptr = span.data(); + size_t length = span.size(); + return std::all_of(ptr, ptr + length, [](auto c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c) || + c == '-'; + }); +} +#endif + +bool LanguageTag::canonicalizeExtensions(JSContext* cx) { // The canonical case for all extension subtags is lowercase. 
for (UniqueChars& extension : extensions_) { char* extensionChars = extension.get(); @@ -368,7 +403,7 @@ bool LanguageTag::canonicalizeExtensions( for (UniqueChars& extension : extensions_) { if (extension[0] == 'u') { - if (!canonicalizeUnicodeExtension(cx, extension, canonicalForm)) { + if (!canonicalizeUnicodeExtension(cx, extension)) { return false; } } else if (extension[0] == 't') { @@ -376,6 +411,9 @@ bool LanguageTag::canonicalizeExtensions( return false; } } + + MOZ_ASSERT(IsAsciiLowercaseAlphanumericOrDash( + mozilla::MakeCStringSpan(extension.get()))); } // The canonical case for privateuse subtags is lowercase. @@ -406,8 +444,7 @@ bool LanguageTag::canonicalizeExtensions( * see Section 3.6.4 U Extension Data Files). */ bool LanguageTag::canonicalizeUnicodeExtension( - JSContext* cx, JS::UniqueChars& unicodeExtension, - UnicodeExtensionCanonicalForm canonicalForm) { + JSContext* cx, JS::UniqueChars& unicodeExtension) { const char* const extension = unicodeExtension.get(); MOZ_ASSERT(extension[0] == 'u'); MOZ_ASSERT(extension[1] == '-'); @@ -504,7 +541,7 @@ bool LanguageTag::canonicalizeUnicodeExtension( const auto& attribute = attributes[i]; // Skip duplicate attributes. - if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) { + if (i > 0) { const auto& lastAttribute = attributes[i - 1]; if (attribute.length() == lastAttribute.length() && std::char_traits<char>::compare(attribute.begin(extension), @@ -570,7 +607,7 @@ bool LanguageTag::canonicalizeUnicodeExtension( const auto& keyword = keywords[i]; // Skip duplicate keywords. 
- if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) { + if (i > 0) { const auto& lastKeyword = keywords[i - 1]; if (std::char_traits<char>::compare(keyword.begin(extension), lastKeyword.begin(extension), @@ -594,17 +631,10 @@ bool LanguageTag::canonicalizeUnicodeExtension( StringSpan type(keyword.begin(extension) + UnicodeKeyWithSepLength, keyword.length() - UnicodeKeyWithSepLength); - if (canonicalForm == UnicodeExtensionCanonicalForm::Yes) { - // Search if there's a replacement for the current Unicode keyword. - if (const char* replacement = replaceUnicodeExtensionType(key, type)) { - if (!appendReplacement(keyword, - mozilla::MakeCStringSpan(replacement))) { - return false; - } - } else { - if (!appendKeyword(keyword, type)) { - return false; - } + // Search if there's a replacement for the current Unicode keyword. + if (const char* replacement = replaceUnicodeExtensionType(key, type)) { + if (!appendReplacement(keyword, mozilla::MakeCStringSpan(replacement))) { + return false; } } else { if (!appendKeyword(keyword, type)) { @@ -761,26 +791,35 @@ bool LanguageTag::canonicalizeTransformExtension( // Append the language subtag if present. // - // [1] is a bit unclear whether or not the `tlang` subtag also needs to be - // canonicalized (and case-adjusted). For now simply append it as is. - // (|parseTransformExtension| doesn't alter case from the lowercased form we - // have previously taken pains to ensure is present in the extension, so no - // special effort is required to ensure lowercasing.) If we switch to [2], the - // `tlang` subtag also needs to be canonicalized according to the same rules - // as `unicode_language_id` subtags are canonicalized. Also see [3]. 
- // - // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier - // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - // [3] https://github.com/tc39/ecma402/issues/330 + // Replace aliases in tlang per + // <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>. if (tag.language().present()) { if (!sb.append('-')) { return false; } + + // ECMA-402 is unclear whether or not duplicate variants are allowed in + // transform extensions. Tentatively allow duplicates until + // https://github.com/tc39/ecma402/issues/330 has been addressed. + if (!tag.canonicalizeBaseName(cx, DuplicateVariants::Accept)) { + return false; + } + + // The canonical case for Transform extensions is lowercase per + // <https://unicode.org/reports/tr35/#BCP47_T_Extension>. Convert the two + // subtags which don't use lowercase for their canonical syntax. + tag.script_.toLowerCase(); + tag.region_.toLowerCase(); + if (!LanguageTagToString(cx, tag, sb)) { return false; } } + static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1; + + using StringSpan = mozilla::Span<const char>; + // Append all fields. // // UTS 35, 3.2.1 specifies: @@ -793,8 +832,23 @@ bool LanguageTag::canonicalizeTransformExtension( if (!sb.append('-')) { return false; } - if (!sb.append(field.begin(extension), field.length())) { - return false; + + StringSpan key(field.begin(extension), TransformKeyLength); + StringSpan value(field.begin(extension) + TransformKeyWithSepLength, + field.length() - TransformKeyWithSepLength); + + // Search if there's a replacement for the current transform keyword. 
+ if (const char* replacement = replaceTransformExtensionType(key, value)) { + if (!sb.append(field.begin(extension), TransformKeyWithSepLength)) { + return false; + } + if (!sb.append(replacement, strlen(replacement))) { + return false; + } + } else { + if (!sb.append(field.begin(extension), field.length())) { + return false; + } } } @@ -824,6 +878,18 @@ JSString* LanguageTag::toString(JSContext* cx) const { return sb.finishString(); } +UniqueChars LanguageTag::toStringZ(JSContext* cx) const { + Vector<char, 16> sb(cx); + if (!LanguageTagToString(cx, *this, sb)) { + return nullptr; + } + if (!sb.append('\0')) { + return nullptr; + } + + return UniqueChars(sb.extractOrCopyRawBuffer()); +} + // Zero-terminated ICU Locale ID. using LocaleId = js::Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>; @@ -1158,12 +1224,25 @@ JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx, LanguageTag& tag) { JS::AutoCheckCannotGC nogc; LocaleChars localeChars = StringChars(locale, nogc); + return tryParse(cx, localeChars, locale->length(), tag); +} + +JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx, + mozilla::Span<const char> locale, + LanguageTag& tag) { + LocaleChars localeChars = StringChars(locale.data()); + return tryParse(cx, localeChars, locale.size(), tag); +} +JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx, + LocaleChars& localeChars, + size_t localeLength, + LanguageTag& tag) { // unicode_locale_id = unicode_language_id // extensions* // pu_extensions? 
; - LanguageTagParser ts(localeChars, locale->length()); + LanguageTagParser ts(localeChars, localeLength); Token tok = ts.nextToken(); bool ok; @@ -1301,6 +1380,20 @@ bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale, return false; } +bool LanguageTagParser::parse(JSContext* cx, mozilla::Span<const char> locale, + LanguageTag& tag) { + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag)); + if (ok) { + return true; + } + if (UniqueChars localeChars = DuplicateString(cx, locale.data())) { + JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr, + JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); + } + return false; +} + bool LanguageTagParser::parseBaseName(JSContext* cx, mozilla::Span<const char> locale, LanguageTag& tag) { @@ -1314,8 +1407,7 @@ bool LanguageTagParser::parseBaseName(JSContext* cx, if (ok) { return true; } - if (UniqueChars localeChars = DuplicateString(cx, locale.data(), - locale.size())) { + if (UniqueChars localeChars = DuplicateString(cx, locale.data())) { JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr, JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); } @@ -1477,6 +1569,8 @@ bool LanguageTagParser::canParseUnicodeExtension( bool LanguageTagParser::canParseUnicodeExtensionType( JSLinearString* unicodeType) { + MOZ_ASSERT(unicodeType->length() > 0, "caller must exclude empty strings"); + JS::AutoCheckCannotGC nogc; LocaleChars unicodeTypeChars = StringChars(unicodeType, nogc); @@ -1627,5 +1721,9 @@ JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx, return result; } +void js::intl::UnicodeExtensionKeyword::trace(JSTracer* trc) { + TraceRoot(trc, &type_, "UnicodeExtensionKeyword::type"); +} + } // namespace intl } // namespace js diff --git a/js/src/builtin/intl/LanguageTag.h b/js/src/builtin/intl/LanguageTag.h index 384ff4bb7a..5fcce26480 100644 --- a/js/src/builtin/intl/LanguageTag.h +++ b/js/src/builtin/intl/LanguageTag.h @@ -31,6 +31,7 @@ struct JSContext; class JSLinearString; class 
JSString; +class JSTracer; namespace js { @@ -204,14 +205,8 @@ class MOZ_STACK_CLASS LanguageTag final { friend class LanguageTagParser; - public: - // Flag to request canonicalized Unicode extensions. - enum class UnicodeExtensionCanonicalForm : bool { No, Yes }; - - private: - bool canonicalizeUnicodeExtension( - JSContext* cx, JS::UniqueChars& unicodeExtension, - UnicodeExtensionCanonicalForm canonicalForm); + bool canonicalizeUnicodeExtension(JSContext* cx, + JS::UniqueChars& unicodeExtension); bool canonicalizeTransformExtension(JSContext* cx, JS::UniqueChars& transformExtension); @@ -226,9 +221,22 @@ class MOZ_STACK_CLASS LanguageTag final { void performComplexLanguageMappings(); void performComplexRegionMappings(); + MOZ_MUST_USE bool performVariantMappings(JSContext* cx); MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx); + static const char* replaceTransformExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type); + + public: + /** + * Given a Unicode key and type, return the null-terminated preferred + * replacement for that type if there is one, or null if there is none, e.g. + * in effect + * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"| + * and + * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|. + */ static const char* replaceUnicodeExtensionType( mozilla::Span<const char> key, mozilla::Span<const char> type); @@ -337,17 +345,24 @@ class MOZ_STACK_CLASS LanguageTag final { privateuse_ = std::move(privateuse); } + private: + enum class DuplicateVariants { Reject, Accept }; + + bool canonicalizeBaseName(JSContext* cx, DuplicateVariants duplicateVariants); + + public: /** * Canonicalize the base-name subtags, that means the language, script, * region, and variant subtags. */ - bool canonicalizeBaseName(JSContext* cx); + bool canonicalizeBaseName(JSContext* cx) { + return canonicalizeBaseName(cx, DuplicateVariants::Reject); + } /** * Canonicalize all extension subtags. 
*/ - bool canonicalizeExtensions(JSContext* cx, - UnicodeExtensionCanonicalForm canonicalForm); + bool canonicalizeExtensions(JSContext* cx); /** * Canonicalizes the given structurally valid Unicode BCP 47 locale @@ -366,22 +381,10 @@ class MOZ_STACK_CLASS LanguageTag final { * * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private * - * UTS 35 specifies two different canonicalization algorithms. There's one to - * canonicalize BCP 47 language tags and other one to canonicalize Unicode - * locale identifiers. The latter one wasn't present when ECMA-402 was changed - * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags, - * so ECMA-402 currently only uses the former to canonicalize Unicode BCP 47 - * locale identifiers. - * * Spec: ECMAScript Internationalization API Specification, 6.2.3. - * Spec: - * https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion */ - bool canonicalize(JSContext* cx, - UnicodeExtensionCanonicalForm canonicalForm) { - return canonicalizeBaseName(cx) && - canonicalizeExtensions(cx, canonicalForm); + bool canonicalize(JSContext* cx) { + return canonicalizeBaseName(cx) && canonicalizeExtensions(cx); } /** @@ -390,6 +393,12 @@ class MOZ_STACK_CLASS LanguageTag final { JSString* toString(JSContext* cx) const; /** + * Return the string representation of this language tag as a null-terminated + * C-string. + */ + JS::UniqueChars toStringZ(JSContext* cx) const; + + /** * Add likely-subtags to the language tag. * * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags> @@ -664,17 +673,32 @@ class MOZ_STACK_CLASS LanguageTagParser final { JSContext* cx, mozilla::Span<const char> extension, AttributesVector& attributes, KeywordsVector& keywords); + static JS::Result<bool> tryParse(JSContext* cx, LocaleChars& localeChars, + size_t localeLength, LanguageTag& tag); + public: // Parse the input string as a language tag. 
Reports an error to the context // if the input can't be parsed completely. static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag); + // Parse the input string as a language tag. Reports an error to the context + // if the input can't be parsed completely. + static bool parse(JSContext* cx, mozilla::Span<const char> locale, + LanguageTag& tag); + // Parse the input string as a language tag. Returns Ok(true) if the input // could be completely parsed, Ok(false) if the input couldn't be parsed, // or Err() in case of internal error. static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale, LanguageTag& tag); + // Parse the input string as a language tag. Returns Ok(true) if the input + // could be completely parsed, Ok(false) if the input couldn't be parsed, + // or Err() in case of internal error. + static JS::Result<bool> tryParse(JSContext* cx, + mozilla::Span<const char> locale, + LanguageTag& tag); + // Parse the input string as the base-name parts (language, script, region, // variants) of a language tag. Ignores any trailing characters. 
static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale, @@ -718,6 +742,28 @@ MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str, JS::Result<JSString*> ParseStandaloneISO639LanguageTag( JSContext* cx, JS::Handle<JSLinearString*> str); +class UnicodeExtensionKeyword final { + char key_[LanguageTagLimits::UnicodeKeyLength]; + JSLinearString* type_; + + public: + using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1]; + using UnicodeKeySpan = + mozilla::Span<const char, LanguageTagLimits::UnicodeKeyLength>; + + UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type) + : key_{key[0], key[1]}, type_(type) {} + + UnicodeKeySpan key() const { return {key_, sizeof(key_)}; } + JSLinearString* type() const { return type_; } + + void trace(JSTracer* trc); +}; + +extern MOZ_MUST_USE bool ApplyUnicodeExtensionToTag( + JSContext* cx, LanguageTag& tag, + JS::HandleVector<UnicodeExtensionKeyword> keywords); + } // namespace intl } // namespace js diff --git a/js/src/builtin/intl/LanguageTagGenerated.cpp b/js/src/builtin/intl/LanguageTagGenerated.cpp index 6255861141..bd99140ace 100644 --- a/js/src/builtin/intl/LanguageTagGenerated.cpp +++ b/js/src/builtin/intl/LanguageTagGenerated.cpp @@ -10,6 +10,7 @@ #include <cstdint> #include <cstring> #include <iterator> +#include <string> #include <type_traits> #include "jscntxt.h" @@ -53,6 +54,14 @@ static inline const char* SearchReplacement( } #ifdef DEBUG +static bool IsAsciiLowercaseAlphanumeric(char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); +} + +static bool IsAsciiLowercaseAlphanumericOrDash(char c) { + return IsAsciiLowercaseAlphanumeric(c) || c == '-'; +} + static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { // Tell the analysis the |std::all_of| function can't GC. 
JS::AutoSuppressGCAnalysis nogc; @@ -69,14 +78,26 @@ static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { } static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { - auto isAsciiLowercaseAlphaOrDigit = [](char c) { - return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); - }; - // Tell the analysis the |std::all_of| function can't GC. JS::AutoSuppressGCAnalysis nogc; - return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit); + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); +} + +static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); } #endif @@ -566,6 +587,80 @@ void js::intl::LanguageTag::performComplexRegionMappings() { } } +static const char* ToCharPointer(const char* str) { + return str; +} + +static const char* ToCharPointer(const js::UniqueChars& str) { + return str.get(); +} + +template <typename T, typename U = T> +static bool IsLessThan(const T& a, const U& b) { + return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; +} + +// Mappings from variant subtags to preferred values. +// Derived from CLDR Supplemental Data, version 35.1. +// https://unicode.org/Public/cldr/35.1/core.zip +bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { + // The variant subtags need to be sorted for binary search. 
+ MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), + IsLessThan<decltype(variants_)::ElementType>)); + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, + IsLessThan<decltype(variants_)::ElementType, + decltype(variant)>); + + // Don't insert the replacement when already present. + if (p != variants_.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. + auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } + return !!variants_.insert(p, std::move(preferred)); + }; + + for (size_t i = 0; i < variants_.length(); ) { + auto& variant = variants_[i]; + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get()))); + + if (strcmp(variant.get(), "aaland") == 0) { + variants_.erase(variants_.begin() + i); + setRegion("AX"); + } + else if (strcmp(variant.get(), "arevela") == 0) { + variants_.erase(variants_.begin() + i); + setLanguage("hy"); + } + else if (strcmp(variant.get(), "arevmda") == 0) { + variants_.erase(variants_.begin() + i); + setLanguage("hyw"); + } + else if (strcmp(variant.get(), "heploc") == 0) { + variants_.erase(variants_.begin() + i); + if (!insertVariantSortedIfNotPresent("alalc97")) { + return false; + } + } + else if (strcmp(variant.get(), "polytoni") == 0) { + variants_.erase(variants_.begin() + i); + if (!insertVariantSortedIfNotPresent("polyton")) { + return false; + } + } + else { + i++; + } + } + return true; +} + // Canonicalize grandfathered locale identifiers. // Derived from CLDR Supplemental Data, version 35.1. 
// https://unicode.org/Public/cldr/35.1/core.zip @@ -656,16 +751,16 @@ bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { } template <size_t Length> -static inline bool IsUnicodeKey(mozilla::Span<const char> key, - const char (&str)[Length]) { +static inline bool IsUnicodeKey( + mozilla::Span<const char> key, const char (&str)[Length]) { static_assert(Length == UnicodeKeyLength + 1, "Unicode extension key is two characters long"); return memcmp(key.data(), str, Length - 1) == 0; } template <size_t Length> -static inline bool IsUnicodeType(mozilla::Span<const char> type, - const char (&str)[Length]) { +static inline bool IsUnicodeType( + mozilla::Span<const char> type, const char (&str)[Length]) { static_assert(Length > UnicodeKeyLength + 1, "Unicode extension type contains more than two characters"); return type.size() == (Length - 1) && @@ -673,13 +768,7 @@ static inline bool IsUnicodeType(mozilla::Span<const char> type, } static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) { -#ifdef DEBUG - auto isNull = [](char c) { - return c == '\0'; - }; -#endif - - MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull), + MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\0'), "unexpected null-character in string"); using UnsignedChar = unsigned char; @@ -695,12 +784,12 @@ static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) { // Return zero if both strings are equal or a negative number if |b| is a // prefix of |a|. 
return -int32_t(UnsignedChar(a[b.size()])); -}; +} template <size_t Length> -static inline const char* SearchReplacement(const char* (&types)[Length], - const char* (&aliases)[Length], - mozilla::Span<const char> type) { +static inline const char* SearchUnicodeReplacement( + const char* (&types)[Length], const char* (&aliases)[Length], + mozilla::Span<const char> type) { auto p = std::lower_bound(std::begin(types), std::end(types), type, [](const auto& a, const auto& b) { @@ -717,26 +806,15 @@ static inline const char* SearchReplacement(const char* (&types)[Length], * values. * * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension */ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( mozilla::Span<const char> key, mozilla::Span<const char> type) { -#ifdef DEBUG - static auto isAsciiLowercaseAlphanumeric = [](char c) { - return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); - }; - - static auto isAsciiLowercaseAlphanumericOrDash = [](char c) { - return isAsciiLowercaseAlphanumeric(c) || c == '-'; - }; -#endif - MOZ_ASSERT(key.size() == UnicodeKeyLength); - MOZ_ASSERT(std::all_of(key.begin(), key.end(), - isAsciiLowercaseAlphanumeric)); + MOZ_ASSERT(IsCanonicallyCasedUnicodeKey(key)); MOZ_ASSERT(type.size() > UnicodeKeyLength); - MOZ_ASSERT(std::all_of(type.begin(), type.end(), - isAsciiLowercaseAlphanumericOrDash)); + MOZ_ASSERT(IsCanonicallyCasedUnicodeType(type)); if (IsUnicodeKey(key, "ca")) { if (IsUnicodeType(type, "ethiopic-amete-alem")) { @@ -804,7 +882,7 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( "pl26", "pl24", "pl28", "pl30", "pl32", "tttob", "ttmrc", "tttob", "twkhh", "twtnn", "twnwt", "twtxg", }; - return SearchReplacement(types, aliases, type); + return SearchUnicodeReplacement(types, aliases, type); } else if (IsUnicodeKey(key, "tz")) { static const char* types[28] = { @@ -821,7 +899,52 @@ const char* 
js::intl::LanguageTag::replaceUnicodeExtensionType( "usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel", "trist", "utc", "usden", "utc", }; - return SearchReplacement(types, aliases, type); + return SearchUnicodeReplacement(types, aliases, type); + } + return nullptr; +} + +template <size_t Length> +static inline bool IsTransformKey( + mozilla::Span<const char> key, const char (&str)[Length]) { + static_assert(Length == TransformKeyLength + 1, + "Transform extension key is two characters long"); + return memcmp(key.data(), str, Length - 1) == 0; +} + +template <size_t Length> +static inline bool IsTransformType( + mozilla::Span<const char> type, const char (&str)[Length]) { + static_assert(Length > TransformKeyLength + 1, + "Transform extension type contains more than two characters"); + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; +} + +/** + * Mapping from deprecated BCP 47 Transform extension types to their preferred + * values. + * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension + */ +const char* js::intl::LanguageTag::replaceTransformExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type) { + MOZ_ASSERT(key.size() == TransformKeyLength); + MOZ_ASSERT(IsCanonicallyCasedTransformKey(key)); + + MOZ_ASSERT(type.size() > TransformKeyLength); + MOZ_ASSERT(IsCanonicallyCasedTransformType(type)); + + if (IsTransformKey(key, "d0")) { + if (IsTransformType(type, "name")) { + return "charname"; + } + } + else if (IsTransformKey(key, "m0")) { + if (IsTransformType(type, "names")) { + return "prprname"; + } } return nullptr; } diff --git a/js/src/builtin/intl/Locale.cpp b/js/src/builtin/intl/Locale.cpp index 5d55fad2a1..ee70c0b06f 100644 --- a/js/src/builtin/intl/Locale.cpp +++ b/js/src/builtin/intl/Locale.cpp @@ -362,17 +362,12 @@ static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag, /** * 
ApplyUnicodeExtensionToTag( tag, options, relevantExtensionKeys ) */ -static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag, - HandleLinearString calendar, - HandleLinearString collation, - HandleLinearString hourCycle, - HandleLinearString caseFirst, - HandleLinearString numeric, - HandleLinearString numberingSystem) { +bool js::intl::ApplyUnicodeExtensionToTag( + JSContext* cx, LanguageTag& tag, + JS::HandleVector<intl::UnicodeExtensionKeyword> keywords) { // If no Unicode extensions were present in the options object, we can skip // everything below and directly return. - if (!calendar && !collation && !caseFirst && !hourCycle && !numeric && - !numberingSystem) { + if (keywords.length() == 0) { return true; } @@ -402,53 +397,32 @@ static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag, } } - using UnicodeKeyWithSeparator = const char(&)[UnicodeKeyLength + 3]; - - auto appendKeyword = [&newExtension](UnicodeKeyWithSeparator key, - JSLinearString* value) { - if (!newExtension.append(key, UnicodeKeyLength + 2)) { - return false; - } - - JS::AutoCheckCannotGC nogc; - return value->hasLatin1Chars() - ? newExtension.append(value->latin1Chars(nogc), value->length()) - : newExtension.append(value->twoByteChars(nogc), - value->length()); - }; - // Append the new keywords before any existing keywords. That way any previous // keyword with the same key is detected as a duplicate when canonicalizing // the Unicode extension subtag and gets discarded. 
- if (calendar) { - if (!appendKeyword("-ca-", calendar)) { - return false; - } - } - if (collation) { - if (!appendKeyword("-co-", collation)) { - return false; - } - } - if (hourCycle) { - if (!appendKeyword("-hc-", hourCycle)) { + for (const auto& keyword : keywords) { + UnicodeExtensionKeyword::UnicodeKeySpan key = keyword.key(); + if (!newExtension.append('-')) { return false; } - } - if (caseFirst) { - if (!appendKeyword("-kf-", caseFirst)) { + if (!newExtension.append(key.data(), key.size())) { return false; } - } - if (numeric) { - if (!appendKeyword("-kn-", numeric)) { + if (!newExtension.append('-')) { return false; } - } - if (numberingSystem) { - if (!appendKeyword("-nu-", numberingSystem)) { - return false; + + JS::AutoCheckCannotGC nogc; + JSLinearString* type = keyword.type(); + if (type->hasLatin1Chars()) { + if (!newExtension.append(type->latin1Chars(nogc), type->length())) { + return false; + } + } else { + if (!newExtension.append(type->twoByteChars(nogc), type->length())) { + return false; + } } } @@ -560,15 +534,16 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { return false; } - // Step 13 (not applicable). + // Step 13. + JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx); - // Steps 14, 16. + // Step 14. RootedLinearString calendar(cx); if (!GetStringOption(cx, options, cx->names().calendar, &calendar)) { return false; } - // Step 15. + // Steps 15-16. if (calendar) { if (!IsValidUnicodeExtensionValue(calendar)) { if (UniqueChars str = StringToNewUTF8CharsZ(cx, *calendar)) { @@ -578,15 +553,19 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { } return false; } + + if (!keywords.emplaceBack("ca", calendar)) { + return false; + } } - // Steps 17, 19. + // Step 17. RootedLinearString collation(cx); if (!GetStringOption(cx, options, cx->names().collation, &collation)) { return false; } - // Step 18. + // Steps 18-19. 
if (collation) { if (!IsValidUnicodeExtensionValue(collation)) { if (UniqueChars str = StringToNewUTF8CharsZ(cx, *collation)) { @@ -596,14 +575,19 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { } return false; } + + if (!keywords.emplaceBack("co", collation)) { + return false; + } } - // Steps 20-21. + // Step 20 (without validation). RootedLinearString hourCycle(cx); if (!GetStringOption(cx, options, cx->names().hourCycle, &hourCycle)) { return false; } + // Steps 20-21. if (hourCycle) { if (!StringEqualsAscii(hourCycle, "h11") && !StringEqualsAscii(hourCycle, "h12") && @@ -616,14 +600,19 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { } return false; } + + if (!keywords.emplaceBack("hc", hourCycle)) { + return false; + } } - // Steps 22-23. + // Step 22 (without validation). RootedLinearString caseFirst(cx); if (!GetStringOption(cx, options, cx->names().caseFirst, &caseFirst)) { return false; } + // Steps 22-23. if (caseFirst) { if (!StringEqualsAscii(caseFirst, "upper") && !StringEqualsAscii(caseFirst, "lower") && @@ -635,22 +624,33 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { } return false; } + + if (!keywords.emplaceBack("kf", caseFirst)) { + return false; + } } - // Steps 24-26. + // Steps 24-25. RootedLinearString numeric(cx); if (!GetBooleanOption(cx, options, cx->names().numeric, &numeric)) { return false; } - // Steps 27, 29. + // Step 26. + if (numeric) { + if (!keywords.emplaceBack("kn", numeric)) { + return false; + } + } + + // Step 27. RootedLinearString numberingSystem(cx); if (!GetStringOption(cx, options, cx->names().numberingSystem, &numberingSystem)) { return false; } - // Step 28. + // Steps 28-29. 
if (numberingSystem) { if (!IsValidUnicodeExtensionValue(numberingSystem)) { if (UniqueChars str = StringToNewUTF8CharsZ(cx, *numberingSystem)) { @@ -660,19 +660,21 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { } return false; } + + if (!keywords.emplaceBack("nu", numberingSystem)) { + return false; + } } // Step 30. - if (!ApplyUnicodeExtensionToTag(cx, tag, calendar, collation, hourCycle, - caseFirst, numeric, numberingSystem)) { + if (!ApplyUnicodeExtensionToTag(cx, tag, keywords)) { return false; } } // ApplyOptionsToTag, steps 9 and 13. - // ApplyUnicodeExtensionToTag, step 8. - if (!tag.canonicalizeExtensions( - cx, LanguageTag::UnicodeExtensionCanonicalForm::Yes)) { + // ApplyUnicodeExtensionToTag, step 9. + if (!tag.canonicalizeExtensions(cx)) { return false; } @@ -954,10 +956,7 @@ static bool Locale_toString(JSContext* cx, unsigned argc, Value* vp) { static bool Locale_baseName(JSContext* cx, const CallArgs& args) { MOZ_ASSERT(IsLocale(args.thisv())); - // FIXME: spec bug - invalid assertion in step 4. - // FIXME: spec bug - subtag production names not updated. - - // Steps 3, 5. + // Steps 3-4. auto* locale = &args.thisv().toObject().as<LocaleObject>(); args.rval().setString(locale->baseName()); return true; @@ -986,6 +985,22 @@ static bool Locale_calendar(JSContext* cx, unsigned argc, Value* vp) { return CallNonGenericMethod<IsLocale, Locale_calendar>(cx, args); } +// get Intl.Locale.prototype.caseFirst +static bool Locale_caseFirst(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + return GetUnicodeExtension(cx, locale, "kf", args.rval()); +} + +// get Intl.Locale.prototype.caseFirst +static bool Locale_caseFirst(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. 
+ CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_caseFirst>(cx, args); +} + // get Intl.Locale.prototype.collation static bool Locale_collation(JSContext* cx, const CallArgs& args) { MOZ_ASSERT(IsLocale(args.thisv())); @@ -1018,22 +1033,6 @@ static bool Locale_hourCycle(JSContext* cx, unsigned argc, Value* vp) { return CallNonGenericMethod<IsLocale, Locale_hourCycle>(cx, args); } -// get Intl.Locale.prototype.caseFirst -static bool Locale_caseFirst(JSContext* cx, const CallArgs& args) { - MOZ_ASSERT(IsLocale(args.thisv())); - - // Step 3. - auto* locale = &args.thisv().toObject().as<LocaleObject>(); - return GetUnicodeExtension(cx, locale, "kf", args.rval()); -} - -// get Intl.Locale.prototype.caseFirst -static bool Locale_caseFirst(JSContext* cx, unsigned argc, Value* vp) { - // Steps 1-2. - CallArgs args = CallArgsFromVp(argc, vp); - return CallNonGenericMethod<IsLocale, Locale_caseFirst>(cx, args); -} - // get Intl.Locale.prototype.numeric static bool Locale_numeric(JSContext* cx, const CallArgs& args) { MOZ_ASSERT(IsLocale(args.thisv())); @@ -1045,8 +1044,13 @@ static bool Locale_numeric(JSContext* cx, const CallArgs& args) { return false; } - // FIXME: spec bug - comparison should be against the empty string, too. + // Compare against the empty string per Intl.Locale, step 36.a. The Unicode + // extension is already canonicalized, so we don't need to compare against + // "true" at this point. MOZ_ASSERT(value.isUndefined() || value.isString()); + MOZ_ASSERT_IF(value.isString(), + !StringEqualsAscii(&value.toString()->asLinear(), "true")); + args.rval().setBoolean(value.isString() && value.toString()->empty()); return true; } @@ -1093,7 +1097,6 @@ static bool Locale_language(JSContext* cx, const CallArgs& args) { size_t length = language.length; // Step 5. - // FIXME: spec bug - not all production names updated. 
JSString* str = NewDependentString(cx, baseName, index, length); if (!str) { return false; @@ -1126,7 +1129,6 @@ static bool Locale_script(JSContext* cx, const CallArgs& args) { auto script = BaseNameParts(baseName).script; // Step 5. - // FIXME: spec bug - not all production names updated. if (!script) { args.rval().setUndefined(); return true; @@ -1208,9 +1210,9 @@ static const JSFunctionSpec locale_methods[] = { static const JSPropertySpec locale_properties[] = { JS_PSG("baseName", Locale_baseName, 0), JS_PSG("calendar", Locale_calendar, 0), + JS_PSG("caseFirst", Locale_caseFirst, 0), JS_PSG("collation", Locale_collation, 0), JS_PSG("hourCycle", Locale_hourCycle, 0), - JS_PSG("caseFirst", Locale_caseFirst, 0), JS_PSG("numeric", Locale_numeric, 0), JS_PSG("numberingSystem", Locale_numberingSystem, 0), JS_PSG("language", Locale_language, 0), @@ -1301,7 +1303,7 @@ bool js::intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, unsigned argc, return false; } - if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) { + if (!tag.canonicalize(cx)) { return false; } @@ -1334,7 +1336,7 @@ bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx, return true; } - if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) { + if (!tag.canonicalize(cx)) { return false; } @@ -1345,3 +1347,85 @@ bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx, args.rval().setString(resultStr); return true; } + +bool js::intl_ValidateAndCanonicalizeUnicodeExtensionType(JSContext* cx, + unsigned argc, + Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + MOZ_ASSERT(args.length() == 3); + + HandleValue typeArg = args[0]; + MOZ_ASSERT(typeArg.isString(), "type must be a string"); + + HandleValue optionArg = args[1]; + MOZ_ASSERT(optionArg.isString(), "option name must be a string"); + + HandleValue keyArg = args[2]; + MOZ_ASSERT(keyArg.isString(), "key must be a string"); + + RootedLinearString unicodeType(cx, 
typeArg.toString()->ensureLinear(cx)); + if (!unicodeType) { + return false; + } + + if (!IsValidUnicodeExtensionValue(unicodeType)) { + JSAutoByteString optionStr(cx, optionArg.toString()); + if (!optionStr) { + return false; + } + + JSAutoByteString unicodeTypeQuot(cx, QuoteString(cx, unicodeType, '"')); + if (!unicodeTypeQuot) { + return false; + } + + JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, optionStr.ptr(), + unicodeTypeQuot.ptr()); + return false; + } + + char unicodeKey[UnicodeKeyLength]; + { + JSLinearString* str = keyArg.toString()->ensureLinear(cx); + if (!str) { + return false; + } + MOZ_ASSERT(str->length() == UnicodeKeyLength); + + for (size_t i = 0; i < UnicodeKeyLength; i++) { + char16_t ch = str->latin1OrTwoByteChar(i); + MOZ_ASSERT(mozilla::IsAscii(ch)); + unicodeKey[i] = char(ch); + } + } + + JSAutoByteString unicodeTypeChars(cx, unicodeType); + if (!unicodeTypeChars) { + return false; + } + + size_t unicodeTypeLength = unicodeType->length(); + MOZ_ASSERT(strlen(unicodeTypeChars.ptr()) == unicodeTypeLength); + + // Convert into canonical case before searching for replacements. + intl::AsciiToLowerCase(unicodeTypeChars.ptr(), unicodeTypeLength, + unicodeTypeChars.ptr()); + + auto key = mozilla::MakeSpan(unicodeKey, UnicodeKeyLength); + auto type = mozilla::MakeSpan(unicodeTypeChars.ptr(), unicodeTypeLength); + + // Search if there's a replacement for the current Unicode keyword. 
+ JSString* result; + if (const char* replacement = LanguageTag::replaceUnicodeExtensionType(key, type)) { + result = NewStringCopyZ<CanGC>(cx, replacement); + } else { + result = StringToLowerCase(cx, unicodeType); + } + if (!result) { + return false; + } + + args.rval().setString(result); + return true; +} diff --git a/js/src/builtin/intl/Locale.h b/js/src/builtin/intl/Locale.h index 31b3caca5c..74ff4b5a71 100644 --- a/js/src/builtin/intl/Locale.h +++ b/js/src/builtin/intl/Locale.h @@ -56,6 +56,9 @@ extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, extern MOZ_MUST_USE bool intl_TryValidateAndCanonicalizeLanguageTag( JSContext* cx, unsigned argc, Value* vp); +extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeUnicodeExtensionType( + JSContext* cx, unsigned argc, Value* vp); + } // namespace js #endif /* builtin_intl_Locale_h */ diff --git a/js/src/builtin/intl/NumberFormat.cpp b/js/src/builtin/intl/NumberFormat.cpp index df40e751c8..9ee3b02109 100644 --- a/js/src/builtin/intl/NumberFormat.cpp +++ b/js/src/builtin/intl/NumberFormat.cpp @@ -18,6 +18,7 @@ #include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/LanguageTag.h"
#include "builtin/intl/ScopedICUObject.h"
#include "ds/Sort.h"
#include "js/RootingAPI.h"
@@ -246,7 +247,41 @@ NewUNumberFormat(JSContext* cx, Handle<NumberFormatObject*> numberFormat) if (!GetProperty(cx, internals, internals, cx->names().locale, &value))
return nullptr;
- JSAutoByteString locale(cx, value.toString());
+
+ // ICU expects numberingSystem as a Unicode locale extension on the locale.
+
+ intl::LanguageTag tag(cx);
+ {
+ JSLinearString* locale = value.toString()->ensureLinear(cx);
+ if (!locale)
+ return nullptr;
+
+ if (!intl::LanguageTagParser::parse(cx, locale, tag))
+ return nullptr;
+ }
+
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
+
+ if (!GetProperty(cx, internals, internals, cx->names().numberingSystem, &value))
+ return nullptr;
+
+ {
+ JSLinearString* numberingSystem = value.toString()->ensureLinear(cx);
+ if (!numberingSystem)
+ return nullptr;
+
+ if (!keywords.emplaceBack("nu", numberingSystem))
+ return nullptr;
+ }
+
+ // |ApplyUnicodeExtensionToTag| applies the new keywords to the front of
+ // the Unicode extension subtag. We're then relying on ICU to follow RFC
+ // 6067, which states that any trailing keywords using the same key
+ // should be ignored.
+ if (!intl::ApplyUnicodeExtensionToTag(cx, tag, keywords))
+ return nullptr;
+
+ UniqueChars locale = tag.toStringZ(cx);
if (!locale)
return nullptr;
@@ -264,9 +299,6 @@ NewUNumberFormat(JSContext* cx, Handle<NumberFormatObject*> numberFormat) RootedString currency(cx);
AutoStableStringChars stableChars(cx);
- // We don't need to look at numberingSystem - it can only be set via
- // the Unicode locale extension and is therefore already set on locale.
-
if (!GetProperty(cx, internals, internals, cx->names().style, &value))
return nullptr;
JSAutoByteString style(cx, value.toString());
@@ -339,7 +371,7 @@ NewUNumberFormat(JSContext* cx, Handle<NumberFormatObject*> numberFormat) uUseGrouping = value.toBoolean();
UErrorCode status = U_ZERO_ERROR;
- UNumberFormat* nf = unum_open(uStyle, nullptr, 0, IcuLocale(locale.ptr()), nullptr, &status);
+ UNumberFormat* nf = unum_open(uStyle, nullptr, 0, IcuLocale(locale.get()), nullptr, &status);
if (U_FAILURE(status)) {
intl::ReportInternalError(cx);
return nullptr;
diff --git a/js/src/builtin/intl/NumberFormat.js b/js/src/builtin/intl/NumberFormat.js index 973abd026a..238a59405b 100644 --- a/js/src/builtin/intl/NumberFormat.js +++ b/js/src/builtin/intl/NumberFormat.js @@ -211,6 +211,8 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { // opt: // opt object computed in InitializeNumberFormat
// {
// localeMatcher: "lookup" / "best fit",
+ //
+ // nu: string matching a Unicode extension type, // optional
// }
//
// minimumIntegerDigits: integer ∈ [1, 21],
@@ -253,6 +255,16 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { // Steps 5-6.
var matcher = GetOption(options, "localeMatcher", "string", ["lookup", "best fit"], "best fit");
opt.localeMatcher = matcher;
+
+ var numberingSystem = GetOption(options, "numberingSystem", "string", undefined, undefined);
+
+ if (numberingSystem !== undefined) {
+ numberingSystem = intl_ValidateAndCanonicalizeUnicodeExtensionType(numberingSystem,
+ "numberingSystem",
+ "nu");
+ }
+
+ opt.nu = numberingSystem;
// Compute formatting options.
// Step 12.
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index 0370d422d9..59ff14d76c 100644 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -331,6 +331,96 @@ void js::intl::LanguageTag::performComplexRegionMappings() { """.strip("\n")) +def writeVariantTagMappings(println, variant_mappings, description, source, + url): + """ Writes a function definition that maps variant subtags. """ + println(u""" +static const char* ToCharPointer(const char* str) { + return str; +} + +static const char* ToCharPointer(const js::UniqueChars& str) { + return str.get(); +} + +template <typename T, typename U = T> +static bool IsLessThan(const T& a, const U& b) { + return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; +} +""") + writeMappingHeader(println, description, source, url) + println(u""" +bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), + IsLessThan<decltype(variants_)::ElementType>)); + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, + IsLessThan<decltype(variants_)::ElementType, + decltype(variant)>); + + // Don't insert the replacement when already present. + if (p != variants_.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. 
+ auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } + return !!variants_.insert(p, std::move(preferred)); + }; + + for (size_t i = 0; i < variants_.length(); ) { + auto& variant = variants_[i]; + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get()))); +""".lstrip()) + + first_variant = True + + for (deprecated_variant, (type, replacement)) in ( + sorted(variant_mappings.items(), key=itemgetter(0)) + ): + if_kind = u"if" if first_variant else u"else if" + first_variant = False + + println(u""" + {} (strcmp(variant.get(), "{}") == 0) {{ + variants_.erase(variants_.begin() + i); +""".format(if_kind, deprecated_variant).strip("\n")) + + if type == "language": + println(u""" + setLanguage("{}"); +""".format(replacement).strip("\n")) + elif type == "region": + println(u""" + setRegion("{}"); +""".format(replacement).strip("\n")) + else: + assert type == "variant" + println(u""" + if (!insertVariantSortedIfNotPresent("{}")) {{ + return false; + }} +""".format(replacement).strip("\n")) + + println(u""" + } +""".strip("\n")) + + println(u""" + else { + i++; + } + } + return true; +} +""".strip("\n")) + + def writeGrandfatheredMappingsFunction(println, grandfathered_mappings, description, source, url): """ Writes a function definition that maps grandfathered language tags. """ @@ -498,6 +588,7 @@ def readSupplementalData(core_file): - complexLanguageMappings: mappings from language subtags with complex rules - regionMappings: mappings from region subtags to preferred subtags - complexRegionMappings: mappings from region subtags with complex rules + - variantMappings: mappings from variant subtags to preferred subtags - likelySubtags: likely subtags used for generating test data only Returns these mappings as dictionaries. 
""" @@ -541,6 +632,14 @@ def readSupplementalData(core_file): $ """, re.IGNORECASE | re.VERBOSE) + re_unicode_variant_subtag = re.compile( + r""" + ^ + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3})) + $ + """, re.IGNORECASE | re.VERBOSE) + # The fixed list of BCP 47 grandfathered language tags. grandfathered_tags = ( "art-lojban", @@ -589,6 +688,11 @@ def readSupplementalData(core_file): # replacement, e.g. "SU" -> ("RU", ["AM",complex_region_mappings[type] = replacements "AZ", "BY", ...]). complex_region_mappings = {} + # Dictionary of aliased variant subtags to a tuple of preferred replacement + # type and replacement, e.g. "arevela" -> ("language", "hy") or + # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97"). + variant_mappings = {} + # Dictionary of grandfathered mappings to preferred values. grandfathered_mappings = {} @@ -624,6 +728,8 @@ def readSupplementalData(core_file): if re_unicode_language_subtag.match(type) is None: continue + assert type.islower() + if re_unicode_language_subtag.match(replacement) is not None: # Canonical case for language subtags is lower-case. language_mappings[type] = replacement.lower() @@ -647,6 +753,8 @@ def readSupplementalData(core_file): if re_unicode_region_subtag.match(type) is None: continue + assert type.isupper() or type.isdigit() + if re_unicode_region_subtag.match(replacement) is not None: # Canonical case for region subtags is upper-case. region_mappings[type] = replacement.upper() @@ -658,6 +766,33 @@ def readSupplementalData(core_file): ), "{} invalid region subtags".format(replacement) complex_region_mappings[type] = replacements + for variant_alias in tree.iterfind(".//variantAlias"): + type = variant_alias.get("type") + replacement = variant_alias.get("replacement") + + assert re_unicode_variant_subtag.match(type) is not None, ( + "{} invalid variant subtag".format(type)) + + # Normalize the case, because some variants are in upper case. 
+ type = type.lower() + + # The replacement can be a language, a region, or a variant subtag. + # Language and region subtags are case normalized, variant subtags can + # be in any case. + + if re_unicode_language_subtag.match(replacement) is not None and replacement.islower(): + variant_mappings[type] = ("language", replacement) + + elif re_unicode_region_subtag.match(replacement) is not None: + assert replacement.isupper() or replacement.isdigit(), ( + "{} invalid variant subtag replacement".format(replacement)) + variant_mappings[type] = ("region", replacement) + + else: + assert re_unicode_variant_subtag.match(replacement) is not None, ( + "{} invalid variant subtag replacement".format(replacement)) + variant_mappings[type] = ("variant", replacement.lower()) + tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) likely_subtags = {} @@ -724,6 +859,7 @@ def readSupplementalData(core_file): "complexLanguageMappings": complex_language_mappings, "regionMappings": region_mappings, "complexRegionMappings": complex_region_mappings_final, + "variantMappings": variant_mappings, "likelySubtags": likely_subtags, } @@ -740,14 +876,20 @@ def readUnicodeExtensions(core_file): # Mapping from Unicode extension types to dict of deprecated to # preferred values. - mapping = {} + mapping = { + # Unicode BCP 47 U Extension + "u": {}, + + # Unicode BCP 47 T Extension + "t": {}, + } def readBCP47File(file): tree = ET.parse(file) for keyword in tree.iterfind(".//keyword/key"): - # Skip over keywords whose extension is not "u". 
- if keyword.get("extension", "u") != "u": - continue + extension = keyword.get("extension", "u") + assert extension == "u" or extension == "t", ( + "unknown extension type: {}".format(extension)) extension_name = keyword.get("name") @@ -806,7 +948,7 @@ def readUnicodeExtensions(core_file): if preferred is not None: assert typeRE.match(preferred), preferred - mapping.setdefault(extension_name, {})[name] = preferred + mapping[extension].setdefault(extension_name, {})[name] = preferred if alias is not None: for alias_name in alias.lower().split(" "): @@ -816,7 +958,7 @@ def readUnicodeExtensions(core_file): # See comment above when 'alias' and 'preferred' are both present. if (preferred is not None and - name in mapping[extension_name]): + name in mapping[extension][extension_name]): continue # Skip over entries where 'name' and 'alias' are equal. @@ -828,7 +970,7 @@ def readUnicodeExtensions(core_file): if name == alias_name: continue - mapping.setdefault(extension_name, {})[alias_name] = name + mapping[extension].setdefault(extension_name, {})[alias_name] = name def readSupplementalMetadata(file): # Find subdivision and region replacements. @@ -857,8 +999,8 @@ def readUnicodeExtensions(core_file): continue # 'subdivisionAlias' applies to 'rg' and 'sd' keys. - mapping.setdefault("rg", {})[type] = replacement - mapping.setdefault("sd", {})[type] = replacement + mapping["u"].setdefault("rg", {})[type] = replacement + mapping["u"].setdefault("sd", {})[type] = replacement for name in core_file.namelist(): if bcpFileRE.match(name): @@ -866,7 +1008,10 @@ def readUnicodeExtensions(core_file): readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml")) - return mapping + return { + "unicodeMappings": mapping["u"], + "transformMappings": mapping["t"], + } def writeCLDRLanguageTagData(println, data, url): """ Writes the language tag data to the Intl data file. 
""" @@ -884,6 +1029,7 @@ def writeCLDRLanguageTagData(println, data, url): #include <cstdint> #include <cstring> #include <iterator> +#include <string> #include <type_traits> #include "jscntxt.h" @@ -927,6 +1073,14 @@ static inline const char* SearchReplacement( } #ifdef DEBUG +static bool IsAsciiLowercaseAlphanumeric(char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); +} + +static bool IsAsciiLowercaseAlphanumericOrDash(char c) { + return IsAsciiLowercaseAlphanumeric(c) || c == '-'; +} + static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { // Tell the analysis the |std::all_of| function can't GC. JS::AutoSuppressGCAnalysis nogc; @@ -943,14 +1097,26 @@ static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { } static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { - auto isAsciiLowercaseAlphaOrDigit = [](char c) { - return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); - }; - // Tell the analysis the |std::all_of| function can't GC. 
JS::AutoSuppressGCAnalysis nogc; - return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit); + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); +} + +static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); } #endif """.rstrip()) @@ -961,7 +1127,9 @@ static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { complex_language_mappings = data["complexLanguageMappings"] region_mappings = data["regionMappings"] complex_region_mappings = data["complexRegionMappings"] + variant_mappings = data["variantMappings"] unicode_mappings = data["unicodeMappings"] + transform_mappings = data["transformMappings"] # unicode_language_subtag = alpha{2,3} | alpha{5,8} ; language_maxlength = 8 @@ -999,11 +1167,15 @@ static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { writeComplexRegionTagMappings(println, complex_region_mappings, "Region subtags with complex mappings.", source, url) + writeVariantTagMappings(println, variant_mappings, + "Mappings from variant subtags to preferred values.", source, url) + writeGrandfatheredMappingsFunction(println, grandfathered_mappings, "Canonicalize grandfathered locale identifiers.", source, url) - writeUnicodeExtensionsMappings(println, unicode_mappings) + writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode") + writeUnicodeExtensionsMappings(println, transform_mappings, 
"Transform") def writeCLDRLanguageTagLikelySubtagsTest(println, data, url): @@ -1157,7 +1329,7 @@ def updateCLDRLangTags(args): def readFiles(cldr_file): with ZipFile(cldr_file) as zip_file: data.update(readSupplementalData(zip_file)) - data["unicodeMappings"] = readUnicodeExtensions(zip_file) + data.update(readUnicodeExtensions(zip_file)) print("Processing CLDR data...") if filename is not None: @@ -1181,8 +1353,7 @@ def updateCLDRLangTags(args): with io.open(test_file, mode="w", encoding="utf-8", newline="") as f: println = partial(print, file=f) - println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||" - u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))") + println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl'))") writeCLDRLanguageTagLikelySubtagsTest(println, data, url) @@ -1898,91 +2069,84 @@ def updateTzdata(topsrcdir, args): else: updateFrom(tzDir) -def writeUnicodeExtensionsMappings(println, mapping): +def writeUnicodeExtensionsMappings(println, mapping, extension): println(u""" template <size_t Length> -static inline bool IsUnicodeKey(mozilla::Span<const char> key, - const char (&str)[Length]) { - static_assert(Length == UnicodeKeyLength + 1, - "Unicode extension key is two characters long"); +static inline bool Is{0}Key( + mozilla::Span<const char> key, const char (&str)[Length]) {{ + static_assert(Length == {0}KeyLength + 1, + "{0} extension key is two characters long"); return memcmp(key.data(), str, Length - 1) == 0; -} +}} template <size_t Length> -static inline bool IsUnicodeType(mozilla::Span<const char> type, - const char (&str)[Length]) { - static_assert(Length > UnicodeKeyLength + 1, - "Unicode extension type contains more than two characters"); +static inline bool Is{0}Type( + mozilla::Span<const char> type, const char (&str)[Length]) {{ + static_assert(Length > {0}KeyLength + 1, + "{0} extension type contains more than two characters"); return type.size() == (Length - 1) && memcmp(type.data(), str, Length - 1) == 0; -} 
+}} +""".format(extension).rstrip("\n")) -static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) { -#ifdef DEBUG - auto isNull = [](char c) { - return c == '\\0'; - }; -#endif + linear_search_max_length = 4 + + needs_binary_search = any(len(replacements.items()) > linear_search_max_length + for replacements in mapping.values()) - MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull), + if needs_binary_search: + println(u""" +static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{ + MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'), "unexpected null-character in string"); using UnsignedChar = unsigned char; - for (size_t i = 0; i < b.size(); i++) { + for (size_t i = 0; i < b.size(); i++) {{ // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if // we've reached the end of |a|, the below if-statement will always be true. // That ensures we don't read past the end of |a|. - if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) { + if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{ return r; - } - } + }} + }} // Return zero if both strings are equal or a negative number if |b| is a // prefix of |a|. 
return -int32_t(UnsignedChar(a[b.size()])); -}; +}} template <size_t Length> -static inline const char* SearchReplacement(const char* (&types)[Length], - const char* (&aliases)[Length], - mozilla::Span<const char> type) { +static inline const char* Search{0}Replacement( + const char* (&types)[Length], const char* (&aliases)[Length], + mozilla::Span<const char> type) {{ auto p = std::lower_bound(std::begin(types), std::end(types), type, - [](const auto& a, const auto& b) { - return CompareUnicodeType(a, b) < 0; - }); - if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) { + [](const auto& a, const auto& b) {{ + return Compare{0}Type(a, b) < 0; + }}); + if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{ return aliases[std::distance(std::begin(types), p)]; - } + }} return nullptr; -} +}} +""".format(extension).rstrip("\n")) + println(u""" /** - * Mapping from deprecated BCP 47 Unicode extension types to their preferred + * Mapping from deprecated BCP 47 {0} extension types to their preferred * values. 
* * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension */ -const char* js::intl::LanguageTag::replaceUnicodeExtensionType( - mozilla::Span<const char> key, mozilla::Span<const char> type) { -#ifdef DEBUG - static auto isAsciiLowercaseAlphanumeric = [](char c) { - return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); - }; +const char* js::intl::LanguageTag::replace{0}ExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type) {{ + MOZ_ASSERT(key.size() == {0}KeyLength); + MOZ_ASSERT(IsCanonicallyCased{0}Key(key)); - static auto isAsciiLowercaseAlphanumericOrDash = [](char c) { - return isAsciiLowercaseAlphanumeric(c) || c == '-'; - }; -#endif - - MOZ_ASSERT(key.size() == UnicodeKeyLength); - MOZ_ASSERT(std::all_of(key.begin(), key.end(), - isAsciiLowercaseAlphanumeric)); - - MOZ_ASSERT(type.size() > UnicodeKeyLength); - MOZ_ASSERT(std::all_of(type.begin(), type.end(), - isAsciiLowercaseAlphanumericOrDash)); -""") + MOZ_ASSERT(type.size() > {0}KeyLength); + MOZ_ASSERT(IsCanonicallyCased{0}Type(type)); +""".format(extension)) def to_hash_key(replacements): return str(sorted(replacements.items())) @@ -2014,7 +2178,8 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( if key in key_aliases[hash_key]: continue - cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key]) + cond = (u"Is{}Key(key, \"{}\")".format(extension, k) + for k in [key] + key_aliases[hash_key]) if_kind = u"if" if first_key else u"else if" cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) @@ -2024,7 +2189,7 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( replacements = sorted(replacements.items(), key=itemgetter(0)) - if len(replacements) > 4: + if len(replacements) > linear_search_max_length: types = [t for (t, _) in replacements] preferred = [r for (_, r) in replacements] max_len = max(len(k) for k in 
types + preferred) @@ -2032,14 +2197,14 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( write_array(types, "types", max_len) write_array(preferred, "aliases", max_len) println(u""" - return SearchReplacement(types, aliases, type); -""".strip("\n")) + return Search{}Replacement(types, aliases, type); +""".format(extension).strip("\n")) else: for (type, replacement) in replacements: println(u""" - if (IsUnicodeType(type, "{}")) {{ + if (Is{}Type(type, "{}")) {{ return "{}"; - }}""".format(type, replacement).strip("\n")) + }}""".format(extension, type, replacement).strip("\n")) println(u""" }""".lstrip("\n")) diff --git a/js/src/vm/SelfHosting.cpp b/js/src/vm/SelfHosting.cpp index ef007a69db..6446cbb4be 100644 --- a/js/src/vm/SelfHosting.cpp +++ b/js/src/vm/SelfHosting.cpp @@ -2487,6 +2487,7 @@ static const JSFunctionSpec intrinsic_functions[] = { JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0), JS_FN("intl_ValidateAndCanonicalizeLanguageTag", intl_ValidateAndCanonicalizeLanguageTag, 2, 0), JS_FN("intl_TryValidateAndCanonicalizeLanguageTag", intl_TryValidateAndCanonicalizeLanguageTag, 1, 0), + JS_FN("intl_ValidateAndCanonicalizeUnicodeExtensionType", intl_ValidateAndCanonicalizeUnicodeExtensionType, 3, 0), JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0), JS_INLINABLE_FN("IsCollator", |