summary | refs | log | tree | commit | diff
path: root/js/src
diff options
context:
space:
mode:
author: Martok <martok@martoks-place.de> 2023-06-29 23:09:26 +0200
committer: Martok <martok@martoks-place.de> 2023-06-30 00:01:35 +0200
commit: af47a256b5cf2b81e4c3bf8f36682f8b9f31be42 (patch)
tree: af1b472d545dcd80afa9de5e468912f39cf8ee12 /js/src
parent: e96f965422528636e13adc3473679248941540e7 (diff)
download: uxp-af47a256b5cf2b81e4c3bf8f36682f8b9f31be42.tar.gz
Issue #1819 - Further align Intl.Locale to spec
- Reference updates (UTS 35) - variant subtag and transform extension canonicalisation
Diffstat (limited to 'js/src')
-rw-r--r-- js/src/builtin/intl/Collator.cpp 49
-rw-r--r-- js/src/builtin/intl/DateTimeFormat.cpp 56
-rw-r--r-- js/src/builtin/intl/DateTimeFormat.js 33
-rw-r--r-- js/src/builtin/intl/IntlObject.cpp 4
-rw-r--r-- js/src/builtin/intl/LanguageTag.cpp 210
-rw-r--r-- js/src/builtin/intl/LanguageTag.h 96
-rw-r--r-- js/src/builtin/intl/LanguageTagGenerated.cpp 195
-rw-r--r-- js/src/builtin/intl/Locale.cpp 258
-rw-r--r-- js/src/builtin/intl/Locale.h 3
-rw-r--r-- js/src/builtin/intl/NumberFormat.cpp 42
-rw-r--r-- js/src/builtin/intl/NumberFormat.js 12
-rw-r--r-- js/src/builtin/intl/make_intl_data.py 319
-rw-r--r-- js/src/vm/SelfHosting.cpp 1
13 files changed, 958 insertions, 320 deletions
diff --git a/js/src/builtin/intl/Collator.cpp b/js/src/builtin/intl/Collator.cpp
index 5f142d7e6d..450c654620 100644
--- a/js/src/builtin/intl/Collator.cpp
+++ b/js/src/builtin/intl/Collator.cpp
@@ -8,12 +8,14 @@
#include "builtin/intl/Collator.h"
#include "mozilla/Assertions.h"
+#include "mozilla/Span.h"
#include "jsapi.h"
#include "jscntxt.h"
#include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/LanguageTag.h"
#include "builtin/intl/ScopedICUObject.h"
#include "builtin/intl/SharedIntlData.h"
#include "js/TypeDecls.h"
@@ -283,32 +285,33 @@ NewUCollator(JSContext* cx, Handle<CollatorObject*> collator)
return nullptr;
if (StringsAreEqual(usage, "search")) {
// ICU expects search as a Unicode locale extension on locale.
- // Unicode locale extensions must occur before private use extensions.
- const char* oldLocale = locale.ptr();
- const char* p;
- size_t index;
- size_t localeLen = strlen(oldLocale);
- if ((p = strstr(oldLocale, "-x-")))
- index = p - oldLocale;
- else
- index = localeLen;
-
- const char* insert;
- if ((p = strstr(oldLocale, "-u-")) && static_cast<size_t>(p - oldLocale) < index) {
- index = p - oldLocale + 2;
- insert = "-co-search";
- } else {
- insert = "-u-co-search";
+ intl::LanguageTag tag(cx);
+ if (!intl::LanguageTagParser::parse(
+ cx, mozilla::MakeCStringSpan(locale.ptr()), tag)) {
+ return nullptr;
+ }
+
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
+
+ if (!keywords.emplaceBack("co", cx->names().search)) {
+ return nullptr;
}
- size_t insertLen = strlen(insert);
- char* newLocale = cx->pod_malloc<char>(localeLen + insertLen + 1);
- if (!newLocale)
+
+ // |ApplyUnicodeExtensionToTag| applies the new keywords to the front of
+ // the Unicode extension subtag. We're then relying on ICU to follow RFC
+ // 6067, which states that any trailing keywords using the same key
+ // should be ignored.
+ if (!intl::ApplyUnicodeExtensionToTag(cx, tag, keywords)) {
return nullptr;
- memcpy(newLocale, oldLocale, index);
- memcpy(newLocale + index, insert, insertLen);
- memcpy(newLocale + index + insertLen, oldLocale + index, localeLen - index + 1); // '\0'
+ }
+
locale.clear();
- locale.initBytes(newLocale);
+ locale.encodeLatin1(cx, tag.toString(cx));
+ if (!locale) {
+ return nullptr;
+ }
+ } else {
+ MOZ_ASSERT(StringsAreEqual(usage, "sort"));
}
// We don't need to look at the collation property - it can only be set
diff --git a/js/src/builtin/intl/DateTimeFormat.cpp b/js/src/builtin/intl/DateTimeFormat.cpp
index 78e863eedf..0dd724bf2e 100644
--- a/js/src/builtin/intl/DateTimeFormat.cpp
+++ b/js/src/builtin/intl/DateTimeFormat.cpp
@@ -15,6 +15,7 @@
#include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/LanguageTag.h"
#include "builtin/intl/ScopedICUObject.h"
#include "builtin/intl/SharedIntlData.h"
#include "builtin/intl/TimeZoneDataGenerated.h"
@@ -582,14 +583,57 @@ NewUDateFormat(JSContext* cx, Handle<DateTimeFormatObject*> dateTimeFormat)
if (!GetProperty(cx, internals, internals, cx->names().locale, &value))
return nullptr;
- JSAutoByteString locale(cx, value.toString());
- if (!locale)
- return nullptr;
- // We don't need to look at calendar and numberingSystem - they can only be
- // set via the Unicode locale extension and are therefore already set on
+ // ICU expects calendar and numberingSystem as Unicode locale extensions on
// locale.
+ intl::LanguageTag tag(cx);
+ {
+ JSLinearString* locale = value.toString()->ensureLinear(cx);
+ if (!locale)
+ return nullptr;
+
+ if (!intl::LanguageTagParser::parse(cx, locale, tag))
+ return nullptr;
+ }
+
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
+
+ if (!GetProperty(cx, internals, internals, cx->names().calendar, &value))
+ return nullptr;
+
+ {
+ JSLinearString* calendar = value.toString()->ensureLinear(cx);
+ if (!calendar)
+ return nullptr;
+
+ if (!keywords.emplaceBack("ca", calendar))
+ return nullptr;
+ }
+
+ if (!GetProperty(cx, internals, internals, cx->names().numberingSystem, &value))
+ return nullptr;
+
+ {
+ JSLinearString* numberingSystem = value.toString()->ensureLinear(cx);
+ if (!numberingSystem)
+ return nullptr;
+
+ if (!keywords.emplaceBack("nu", numberingSystem))
+ return nullptr;
+ }
+
+ // |ApplyUnicodeExtensionToTag| applies the new keywords to the front of
+ // the Unicode extension subtag. We're then relying on ICU to follow RFC
+ // 6067, which states that any trailing keywords using the same key
+ // should be ignored.
+ if (!intl::ApplyUnicodeExtensionToTag(cx, tag, keywords))
+ return nullptr;
+
+ UniqueChars locale = tag.toStringZ(cx);
+ if (!locale)
+ return nullptr;
+
if (!GetProperty(cx, internals, internals, cx->names().timeZone, &value))
return nullptr;
@@ -614,7 +658,7 @@ NewUDateFormat(JSContext* cx, Handle<DateTimeFormatObject*> dateTimeFormat)
UErrorCode status = U_ZERO_ERROR;
UDateFormat* df =
- udat_open(UDAT_PATTERN, UDAT_PATTERN, IcuLocale(locale.ptr()), uTimeZone, uTimeZoneLength,
+ udat_open(UDAT_PATTERN, UDAT_PATTERN, IcuLocale(locale.get()), uTimeZone, uTimeZoneLength,
uPattern, uPatternLength, &status);
if (U_FAILURE(status)) {
intl::ReportInternalError(cx);
diff --git a/js/src/builtin/intl/DateTimeFormat.js b/js/src/builtin/intl/DateTimeFormat.js
index 77e10fa5f7..9d1adc8687 100644
--- a/js/src/builtin/intl/DateTimeFormat.js
+++ b/js/src/builtin/intl/DateTimeFormat.js
@@ -20,9 +20,11 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) {
// {
// localeMatcher: "lookup" / "best fit",
//
- // hour12: true / false, // optional
+ // ca: string matching a Unicode extension type, // optional
+ //
+ // nu: string matching a Unicode extension type, // optional
//
- // hourCycle: "h11" / "h12" / "h23" / "h24", // optional
+ // hc: "h11" / "h12" / "h23" / "h24", // optional
// }
//
// timeZone: IANA time zone name,
@@ -31,6 +33,8 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) {
// {
// // all the properties/values listed in Table 3
// // (weekday, era, year, month, day, &c.)
+ //
+ // hour12: true / false, // optional
// }
//
// formatMatcher: "basic" / "best fit",
@@ -343,6 +347,12 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m
// localeOpt: // *first* opt computed in InitializeDateTimeFormat
// {
// localeMatcher: "lookup" / "best fit",
+ //
+ // ca: string matching a Unicode extension type, // optional
+ //
+ // nu: string matching a Unicode extension type, // optional
+ //
+ // hc: "h11" / "h12" / "h23" / "h24", // optional
// }
//
// timeZone: IANA time zone name,
@@ -353,7 +363,6 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m
// // (weekday, era, year, month, day, &c.)
//
// hour12: true / false, // optional
- // hourCycle: "h11" / "h12" / "h23" / "h24", // optional
// }
//
// formatMatcher: "basic" / "best fit",
@@ -382,6 +391,24 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m
"best fit");
localeOpt.localeMatcher = localeMatcher;
+ var calendar = GetOption(options, "calendar", "string", undefined, undefined);
+
+ if (calendar !== undefined) {
+ calendar = intl_ValidateAndCanonicalizeUnicodeExtensionType(calendar, "calendar", "ca");
+ }
+
+ localeOpt.ca = calendar;
+
+ var numberingSystem = GetOption(options, "numberingSystem", "string", undefined, undefined);
+
+ if (numberingSystem !== undefined) {
+ numberingSystem = intl_ValidateAndCanonicalizeUnicodeExtensionType(numberingSystem,
+ "numberingSystem",
+ "nu");
+ }
+
+ localeOpt.nu = numberingSystem;
+
// Step 6.
var hr12 = GetOption(options, "hour12", "boolean", undefined, undefined);
diff --git a/js/src/builtin/intl/IntlObject.cpp b/js/src/builtin/intl/IntlObject.cpp
index e0dd36dac4..2f42e1df76 100644
--- a/js/src/builtin/intl/IntlObject.cpp
+++ b/js/src/builtin/intl/IntlObject.cpp
@@ -548,7 +548,7 @@ js::intl_BestAvailableLocale(JSContext* cx, unsigned argc, Value* vp)
MOZ_ASSERT(!tag.unicodeExtension(),
"locale must contain no Unicode extensions");
- if (!tag.canonicalize(cx, intl::LanguageTag::UnicodeExtensionCanonicalForm::No)) {
+ if (!tag.canonicalize(cx)) {
return false;
}
@@ -608,7 +608,7 @@ js::intl_supportedLocaleOrFallback(JSContext* cx, unsigned argc, Value* vp)
return false;
}
} else {
- if (!tag.canonicalize(cx, intl::LanguageTag::UnicodeExtensionCanonicalForm::No)) {
+ if (!tag.canonicalize(cx)) {
return false;
}
diff --git a/js/src/builtin/intl/LanguageTag.cpp b/js/src/builtin/intl/LanguageTag.cpp
index 583033f629..501885dd9d 100644
--- a/js/src/builtin/intl/LanguageTag.cpp
+++ b/js/src/builtin/intl/LanguageTag.cpp
@@ -27,7 +27,9 @@
#include "builtin/intl/CommonFunctions.h"
#include "ds/Sort.h"
+#include "gc/Tracer.h"
#include "js/Result.h"
+#include "js/TracingAPI.h"
#include "js/Utility.h"
#include "js/Vector.h"
#include "unicode/uloc.h"
@@ -259,10 +261,11 @@ static bool SortAlphabetically(JSContext* cx,
return true;
}
-bool LanguageTag::canonicalizeBaseName(JSContext* cx) {
- // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by
- // normalizing the case and ordering all subtags. The canonical syntax form
- // itself is specified in UTS 35, 3.2.1.
+bool LanguageTag::canonicalizeBaseName(JSContext* cx,
+ DuplicateVariants duplicateVariants) {
+ // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to
+ // canonicalize the syntax by normalizing the case and ordering all subtags.
+ // The canonical syntax form is specified in UTS 35, 3.2.1.
// Language codes need to be in lower case. "JA" -> "ja"
language_.toLowerCase();
@@ -299,25 +302,42 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) {
return false;
}
- // Reject the Locale identifier if a duplicate variant was found, e.g.
- // "en-variant-Variant".
- const UniqueChars* duplicate = std::adjacent_find(
- variants().begin(), variants().end(), [](const auto& a, const auto& b) {
- return strcmp(a.get(), b.get()) == 0;
- });
- if (duplicate != variants().end()) {
- JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
- JSMSG_DUPLICATE_VARIANT_SUBTAG,
- duplicate->get());
- return false;
+ if (duplicateVariants == DuplicateVariants::Reject) {
+ // Reject the Locale identifier if a duplicate variant was found, e.g.
+ // "en-variant-Variant".
+ const UniqueChars* duplicate =
+ std::adjacent_find(variants().begin(), variants().end(),
+ [](const auto& a, const auto& b) {
+ return strcmp(a.get(), b.get()) == 0;
+ });
+ if (duplicate != variants().end()) {
+ JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+ JSMSG_DUPLICATE_VARIANT_SUBTAG,
+ duplicate->get());
+ return false;
+ }
}
}
// 2. Any extensions are in alphabetical order by their singleton.
- // - A subsequent call to canonicalizeExtensions() will perform this.
+ // 3. All attributes are sorted in alphabetical order.
+ // 4. All keywords and tfields are sorted by alphabetical order of their keys,
+ // within their respective extensions.
+ // 5. Any type or tfield value "true" is removed.
+ // - A subsequent call to canonicalizeExtensions() will perform these steps.
+
+ // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier
+ // into its canonical form per UTS 35, 3.2.1.
+
+ // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their
+ // canonical forms.
+ // - A subsequent call to canonicalizeExtensions() will perform this step.
- // The next two steps in 3.3.1 replace deprecated language and region
- // subtags with their preferred mappings.
+ // 2. Replace aliases in the unicode_language_id and tlang (if any).
+ // - tlang is handled in canonicalizeExtensions().
+
+ // Replace deprecated language, region, and variant subtags with their
+ // preferred mappings.
if (!updateGrandfatheredMappings(cx)) {
return false;
@@ -337,19 +357,34 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) {
}
}
- // No variant subtag replacements are currently present.
+ // Replace deprecated variant subtags with their preferred values.
+ if (!performVariantMappings(cx)) {
+ return false;
+ }
+
// No extension replacements are currently present.
// Private use sequences are left as is.
- // The two final steps in 3.3.1, handling irregular grandfathered and
- // private-use only language tags, don't apply, because these two forms
- // can't occur in Unicode BCP 47 locale identifiers.
+ // 3. Replace aliases in special key values.
+ // - A subsequent call to canonicalizeExtensions() will perform this step.
return true;
}
-bool LanguageTag::canonicalizeExtensions(
- JSContext* cx, UnicodeExtensionCanonicalForm canonicalForm) {
+#ifdef DEBUG
+template <typename CharT>
+static bool IsAsciiLowercaseAlphanumericOrDash(
+ mozilla::Span<const CharT> span) {
+ const CharT* ptr = span.data();
+ size_t length = span.size();
+ return std::all_of(ptr, ptr + length, [](auto c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c) ||
+ c == '-';
+ });
+}
+#endif
+
+bool LanguageTag::canonicalizeExtensions(JSContext* cx) {
// The canonical case for all extension subtags is lowercase.
for (UniqueChars& extension : extensions_) {
char* extensionChars = extension.get();
@@ -368,7 +403,7 @@ bool LanguageTag::canonicalizeExtensions(
for (UniqueChars& extension : extensions_) {
if (extension[0] == 'u') {
- if (!canonicalizeUnicodeExtension(cx, extension, canonicalForm)) {
+ if (!canonicalizeUnicodeExtension(cx, extension)) {
return false;
}
} else if (extension[0] == 't') {
@@ -376,6 +411,9 @@ bool LanguageTag::canonicalizeExtensions(
return false;
}
}
+
+ MOZ_ASSERT(IsAsciiLowercaseAlphanumericOrDash(
+ mozilla::MakeCStringSpan(extension.get())));
}
// The canonical case for privateuse subtags is lowercase.
@@ -406,8 +444,7 @@ bool LanguageTag::canonicalizeExtensions(
* see Section 3.6.4 U Extension Data Files).
*/
bool LanguageTag::canonicalizeUnicodeExtension(
- JSContext* cx, JS::UniqueChars& unicodeExtension,
- UnicodeExtensionCanonicalForm canonicalForm) {
+ JSContext* cx, JS::UniqueChars& unicodeExtension) {
const char* const extension = unicodeExtension.get();
MOZ_ASSERT(extension[0] == 'u');
MOZ_ASSERT(extension[1] == '-');
@@ -504,7 +541,7 @@ bool LanguageTag::canonicalizeUnicodeExtension(
const auto& attribute = attributes[i];
// Skip duplicate attributes.
- if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) {
+ if (i > 0) {
const auto& lastAttribute = attributes[i - 1];
if (attribute.length() == lastAttribute.length() &&
std::char_traits<char>::compare(attribute.begin(extension),
@@ -570,7 +607,7 @@ bool LanguageTag::canonicalizeUnicodeExtension(
const auto& keyword = keywords[i];
// Skip duplicate keywords.
- if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) {
+ if (i > 0) {
const auto& lastKeyword = keywords[i - 1];
if (std::char_traits<char>::compare(keyword.begin(extension),
lastKeyword.begin(extension),
@@ -594,17 +631,10 @@ bool LanguageTag::canonicalizeUnicodeExtension(
StringSpan type(keyword.begin(extension) + UnicodeKeyWithSepLength,
keyword.length() - UnicodeKeyWithSepLength);
- if (canonicalForm == UnicodeExtensionCanonicalForm::Yes) {
- // Search if there's a replacement for the current Unicode keyword.
- if (const char* replacement = replaceUnicodeExtensionType(key, type)) {
- if (!appendReplacement(keyword,
- mozilla::MakeCStringSpan(replacement))) {
- return false;
- }
- } else {
- if (!appendKeyword(keyword, type)) {
- return false;
- }
+ // Search if there's a replacement for the current Unicode keyword.
+ if (const char* replacement = replaceUnicodeExtensionType(key, type)) {
+ if (!appendReplacement(keyword, mozilla::MakeCStringSpan(replacement))) {
+ return false;
}
} else {
if (!appendKeyword(keyword, type)) {
@@ -761,26 +791,35 @@ bool LanguageTag::canonicalizeTransformExtension(
// Append the language subtag if present.
//
- // [1] is a bit unclear whether or not the `tlang` subtag also needs to be
- // canonicalized (and case-adjusted). For now simply append it as is.
- // (|parseTransformExtension| doesn't alter case from the lowercased form we
- // have previously taken pains to ensure is present in the extension, so no
- // special effort is required to ensure lowercasing.) If we switch to [2], the
- // `tlang` subtag also needs to be canonicalized according to the same rules
- // as `unicode_language_id` subtags are canonicalized. Also see [3].
- //
- // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier
- // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
- // [3] https://github.com/tc39/ecma402/issues/330
+ // Replace aliases in tlang per
+ // <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>.
if (tag.language().present()) {
if (!sb.append('-')) {
return false;
}
+
+ // ECMA-402 is unclear whether or not duplicate variants are allowed in
+ // transform extensions. Tentatively allow duplicates until
+ // https://github.com/tc39/ecma402/issues/330 has been addressed.
+ if (!tag.canonicalizeBaseName(cx, DuplicateVariants::Accept)) {
+ return false;
+ }
+
+ // The canonical case for Transform extensions is lowercase per
+ // <https://unicode.org/reports/tr35/#BCP47_T_Extension>. Convert the two
+ // subtags which don't use lowercase for their canonical syntax.
+ tag.script_.toLowerCase();
+ tag.region_.toLowerCase();
+
if (!LanguageTagToString(cx, tag, sb)) {
return false;
}
}
+ static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1;
+
+ using StringSpan = mozilla::Span<const char>;
+
// Append all fields.
//
// UTS 35, 3.2.1 specifies:
@@ -793,8 +832,23 @@ bool LanguageTag::canonicalizeTransformExtension(
if (!sb.append('-')) {
return false;
}
- if (!sb.append(field.begin(extension), field.length())) {
- return false;
+
+ StringSpan key(field.begin(extension), TransformKeyLength);
+ StringSpan value(field.begin(extension) + TransformKeyWithSepLength,
+ field.length() - TransformKeyWithSepLength);
+
+ // Search if there's a replacement for the current transform keyword.
+ if (const char* replacement = replaceTransformExtensionType(key, value)) {
+ if (!sb.append(field.begin(extension), TransformKeyWithSepLength)) {
+ return false;
+ }
+ if (!sb.append(replacement, strlen(replacement))) {
+ return false;
+ }
+ } else {
+ if (!sb.append(field.begin(extension), field.length())) {
+ return false;
+ }
}
}
@@ -824,6 +878,18 @@ JSString* LanguageTag::toString(JSContext* cx) const {
return sb.finishString();
}
+UniqueChars LanguageTag::toStringZ(JSContext* cx) const {
+ Vector<char, 16> sb(cx);
+ if (!LanguageTagToString(cx, *this, sb)) {
+ return nullptr;
+ }
+ if (!sb.append('\0')) {
+ return nullptr;
+ }
+
+ return UniqueChars(sb.extractOrCopyRawBuffer());
+}
+
// Zero-terminated ICU Locale ID.
using LocaleId =
js::Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>;
@@ -1158,12 +1224,25 @@ JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
LanguageTag& tag) {
JS::AutoCheckCannotGC nogc;
LocaleChars localeChars = StringChars(locale, nogc);
+ return tryParse(cx, localeChars, locale->length(), tag);
+}
+
+JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
+ mozilla::Span<const char> locale,
+ LanguageTag& tag) {
+ LocaleChars localeChars = StringChars(locale.data());
+ return tryParse(cx, localeChars, locale.size(), tag);
+}
+JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
+ LocaleChars& localeChars,
+ size_t localeLength,
+ LanguageTag& tag) {
// unicode_locale_id = unicode_language_id
// extensions*
// pu_extensions? ;
- LanguageTagParser ts(localeChars, locale->length());
+ LanguageTagParser ts(localeChars, localeLength);
Token tok = ts.nextToken();
bool ok;
@@ -1301,6 +1380,20 @@ bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale,
return false;
}
+bool LanguageTagParser::parse(JSContext* cx, mozilla::Span<const char> locale,
+ LanguageTag& tag) {
+ bool ok;
+ JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag));
+ if (ok) {
+ return true;
+ }
+ if (UniqueChars localeChars = DuplicateString(cx, locale.data())) {
+ JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr,
+ JSMSG_INVALID_LANGUAGE_TAG, localeChars.get());
+ }
+ return false;
+}
+
bool LanguageTagParser::parseBaseName(JSContext* cx,
mozilla::Span<const char> locale,
LanguageTag& tag) {
@@ -1314,8 +1407,7 @@ bool LanguageTagParser::parseBaseName(JSContext* cx,
if (ok) {
return true;
}
- if (UniqueChars localeChars = DuplicateString(cx, locale.data(),
- locale.size())) {
+ if (UniqueChars localeChars = DuplicateString(cx, locale.data())) {
JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr,
JSMSG_INVALID_LANGUAGE_TAG, localeChars.get());
}
@@ -1477,6 +1569,8 @@ bool LanguageTagParser::canParseUnicodeExtension(
bool LanguageTagParser::canParseUnicodeExtensionType(
JSLinearString* unicodeType) {
+ MOZ_ASSERT(unicodeType->length() > 0, "caller must exclude empty strings");
+
JS::AutoCheckCannotGC nogc;
LocaleChars unicodeTypeChars = StringChars(unicodeType, nogc);
@@ -1627,5 +1721,9 @@ JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx,
return result;
}
+void js::intl::UnicodeExtensionKeyword::trace(JSTracer* trc) {
+ TraceRoot(trc, &type_, "UnicodeExtensionKeyword::type");
+}
+
} // namespace intl
} // namespace js
diff --git a/js/src/builtin/intl/LanguageTag.h b/js/src/builtin/intl/LanguageTag.h
index 384ff4bb7a..5fcce26480 100644
--- a/js/src/builtin/intl/LanguageTag.h
+++ b/js/src/builtin/intl/LanguageTag.h
@@ -31,6 +31,7 @@
struct JSContext;
class JSLinearString;
class JSString;
+class JSTracer;
namespace js {
@@ -204,14 +205,8 @@ class MOZ_STACK_CLASS LanguageTag final {
friend class LanguageTagParser;
- public:
- // Flag to request canonicalized Unicode extensions.
- enum class UnicodeExtensionCanonicalForm : bool { No, Yes };
-
- private:
- bool canonicalizeUnicodeExtension(
- JSContext* cx, JS::UniqueChars& unicodeExtension,
- UnicodeExtensionCanonicalForm canonicalForm);
+ bool canonicalizeUnicodeExtension(JSContext* cx,
+ JS::UniqueChars& unicodeExtension);
bool canonicalizeTransformExtension(JSContext* cx,
JS::UniqueChars& transformExtension);
@@ -226,9 +221,22 @@ class MOZ_STACK_CLASS LanguageTag final {
void performComplexLanguageMappings();
void performComplexRegionMappings();
+ MOZ_MUST_USE bool performVariantMappings(JSContext* cx);
MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx);
+ static const char* replaceTransformExtensionType(
+ mozilla::Span<const char> key, mozilla::Span<const char> type);
+
+ public:
+ /**
+ * Given a Unicode key and type, return the null-terminated preferred
+ * replacement for that type if there is one, or null if there is none, e.g.
+ * in effect
+ * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
+ * and
+ * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
+ */
static const char* replaceUnicodeExtensionType(
mozilla::Span<const char> key, mozilla::Span<const char> type);
@@ -337,17 +345,24 @@ class MOZ_STACK_CLASS LanguageTag final {
privateuse_ = std::move(privateuse);
}
+ private:
+ enum class DuplicateVariants { Reject, Accept };
+
+ bool canonicalizeBaseName(JSContext* cx, DuplicateVariants duplicateVariants);
+
+ public:
/**
* Canonicalize the base-name subtags, that means the language, script,
* region, and variant subtags.
*/
- bool canonicalizeBaseName(JSContext* cx);
+ bool canonicalizeBaseName(JSContext* cx) {
+ return canonicalizeBaseName(cx, DuplicateVariants::Reject);
+ }
/**
* Canonicalize all extension subtags.
*/
- bool canonicalizeExtensions(JSContext* cx,
- UnicodeExtensionCanonicalForm canonicalForm);
+ bool canonicalizeExtensions(JSContext* cx);
/**
* Canonicalizes the given structurally valid Unicode BCP 47 locale
@@ -366,22 +381,10 @@ class MOZ_STACK_CLASS LanguageTag final {
*
* becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
*
- * UTS 35 specifies two different canonicalization algorithms. There's one to
- * canonicalize BCP 47 language tags and other one to canonicalize Unicode
- * locale identifiers. The latter one wasn't present when ECMA-402 was changed
- * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags,
- * so ECMA-402 currently only uses the former to canonicalize Unicode BCP 47
- * locale identifiers.
- *
* Spec: ECMAScript Internationalization API Specification, 6.2.3.
- * Spec:
- * https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
- * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion
*/
- bool canonicalize(JSContext* cx,
- UnicodeExtensionCanonicalForm canonicalForm) {
- return canonicalizeBaseName(cx) &&
- canonicalizeExtensions(cx, canonicalForm);
+ bool canonicalize(JSContext* cx) {
+ return canonicalizeBaseName(cx) && canonicalizeExtensions(cx);
}
/**
@@ -390,6 +393,12 @@ class MOZ_STACK_CLASS LanguageTag final {
JSString* toString(JSContext* cx) const;
/**
+ * Return the string representation of this language tag as a null-terminated
+ * C-string.
+ */
+ JS::UniqueChars toStringZ(JSContext* cx) const;
+
+ /**
* Add likely-subtags to the language tag.
*
* Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
@@ -664,17 +673,32 @@ class MOZ_STACK_CLASS LanguageTagParser final {
JSContext* cx, mozilla::Span<const char> extension,
AttributesVector& attributes, KeywordsVector& keywords);
+ static JS::Result<bool> tryParse(JSContext* cx, LocaleChars& localeChars,
+ size_t localeLength, LanguageTag& tag);
+
public:
// Parse the input string as a language tag. Reports an error to the context
// if the input can't be parsed completely.
static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
+ // Parse the input string as a language tag. Reports an error to the context
+ // if the input can't be parsed completely.
+ static bool parse(JSContext* cx, mozilla::Span<const char> locale,
+ LanguageTag& tag);
+
// Parse the input string as a language tag. Returns Ok(true) if the input
// could be completely parsed, Ok(false) if the input couldn't be parsed,
// or Err() in case of internal error.
static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
LanguageTag& tag);
+ // Parse the input string as a language tag. Returns Ok(true) if the input
+ // could be completely parsed, Ok(false) if the input couldn't be parsed,
+ // or Err() in case of internal error.
+ static JS::Result<bool> tryParse(JSContext* cx,
+ mozilla::Span<const char> locale,
+ LanguageTag& tag);
+
// Parse the input string as the base-name parts (language, script, region,
// variants) of a language tag. Ignores any trailing characters.
static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale,
@@ -718,6 +742,28 @@ MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str,
JS::Result<JSString*> ParseStandaloneISO639LanguageTag(
JSContext* cx, JS::Handle<JSLinearString*> str);
+class UnicodeExtensionKeyword final {
+ char key_[LanguageTagLimits::UnicodeKeyLength];
+ JSLinearString* type_;
+
+ public:
+ using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1];
+ using UnicodeKeySpan =
+ mozilla::Span<const char, LanguageTagLimits::UnicodeKeyLength>;
+
+ UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type)
+ : key_{key[0], key[1]}, type_(type) {}
+
+ UnicodeKeySpan key() const { return {key_, sizeof(key_)}; }
+ JSLinearString* type() const { return type_; }
+
+ void trace(JSTracer* trc);
+};
+
+extern MOZ_MUST_USE bool ApplyUnicodeExtensionToTag(
+ JSContext* cx, LanguageTag& tag,
+ JS::HandleVector<UnicodeExtensionKeyword> keywords);
+
} // namespace intl
} // namespace js
diff --git a/js/src/builtin/intl/LanguageTagGenerated.cpp b/js/src/builtin/intl/LanguageTagGenerated.cpp
index 6255861141..bd99140ace 100644
--- a/js/src/builtin/intl/LanguageTagGenerated.cpp
+++ b/js/src/builtin/intl/LanguageTagGenerated.cpp
@@ -10,6 +10,7 @@
#include <cstdint>
#include <cstring>
#include <iterator>
+#include <string>
#include <type_traits>
#include "jscntxt.h"
@@ -53,6 +54,14 @@ static inline const char* SearchReplacement(
}
#ifdef DEBUG
+static bool IsAsciiLowercaseAlphanumeric(char c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
+}
+
+static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
+ return IsAsciiLowercaseAlphanumeric(c) || c == '-';
+}
+
static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
// Tell the analysis the |std::all_of| function can't GC.
JS::AutoSuppressGCAnalysis nogc;
@@ -69,14 +78,26 @@ static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
}
static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
- auto isAsciiLowercaseAlphaOrDigit = [](char c) {
- return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
- };
-
// Tell the analysis the |std::all_of| function can't GC.
JS::AutoSuppressGCAnalysis nogc;
- return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit);
+ return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
+}
+
+static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
}
#endif
@@ -566,6 +587,80 @@ void js::intl::LanguageTag::performComplexRegionMappings() {
}
}
+static const char* ToCharPointer(const char* str) {
+ return str;
+}
+
+static const char* ToCharPointer(const js::UniqueChars& str) {
+ return str.get();
+}
+
+template <typename T, typename U = T>
+static bool IsLessThan(const T& a, const U& b) {
+ return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
+}
+
+// Mappings from variant subtags to preferred values.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) {
+ // The variant subtags need to be sorted for binary search.
+ MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
+ IsLessThan<decltype(variants_)::ElementType>));
+
+ auto insertVariantSortedIfNotPresent = [&](const char* variant) {
+ auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
+ IsLessThan<decltype(variants_)::ElementType,
+ decltype(variant)>);
+
+ // Don't insert the replacement when already present.
+ if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
+ return true;
+ }
+
+ // Insert the preferred variant in sort order.
+ auto preferred = DuplicateString(cx, variant);
+ if (!preferred) {
+ return false;
+ }
+ return !!variants_.insert(p, std::move(preferred));
+ };
+
+ for (size_t i = 0; i < variants_.length(); ) {
+ auto& variant = variants_[i];
+ MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get())));
+
+ if (strcmp(variant.get(), "aaland") == 0) {
+ variants_.erase(variants_.begin() + i);
+ setRegion("AX");
+ }
+ else if (strcmp(variant.get(), "arevela") == 0) {
+ variants_.erase(variants_.begin() + i);
+ setLanguage("hy");
+ }
+ else if (strcmp(variant.get(), "arevmda") == 0) {
+ variants_.erase(variants_.begin() + i);
+ setLanguage("hyw");
+ }
+ else if (strcmp(variant.get(), "heploc") == 0) {
+ variants_.erase(variants_.begin() + i);
+ if (!insertVariantSortedIfNotPresent("alalc97")) {
+ return false;
+ }
+ }
+ else if (strcmp(variant.get(), "polytoni") == 0) {
+ variants_.erase(variants_.begin() + i);
+ if (!insertVariantSortedIfNotPresent("polyton")) {
+ return false;
+ }
+ }
+ else {
+ i++;
+ }
+ }
+ return true;
+}
+
// Canonicalize grandfathered locale identifiers.
// Derived from CLDR Supplemental Data, version 35.1.
// https://unicode.org/Public/cldr/35.1/core.zip
@@ -656,16 +751,16 @@ bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
}
template <size_t Length>
-static inline bool IsUnicodeKey(mozilla::Span<const char> key,
- const char (&str)[Length]) {
+static inline bool IsUnicodeKey(
+ mozilla::Span<const char> key, const char (&str)[Length]) {
static_assert(Length == UnicodeKeyLength + 1,
"Unicode extension key is two characters long");
return memcmp(key.data(), str, Length - 1) == 0;
}
template <size_t Length>
-static inline bool IsUnicodeType(mozilla::Span<const char> type,
- const char (&str)[Length]) {
+static inline bool IsUnicodeType(
+ mozilla::Span<const char> type, const char (&str)[Length]) {
static_assert(Length > UnicodeKeyLength + 1,
"Unicode extension type contains more than two characters");
return type.size() == (Length - 1) &&
@@ -673,13 +768,7 @@ static inline bool IsUnicodeType(mozilla::Span<const char> type,
}
static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) {
-#ifdef DEBUG
- auto isNull = [](char c) {
- return c == '\0';
- };
-#endif
-
- MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull),
+ MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\0'),
"unexpected null-character in string");
using UnsignedChar = unsigned char;
@@ -695,12 +784,12 @@ static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) {
// Return zero if both strings are equal or a negative number if |b| is a
// prefix of |a|.
return -int32_t(UnsignedChar(a[b.size()]));
-};
+}
template <size_t Length>
-static inline const char* SearchReplacement(const char* (&types)[Length],
- const char* (&aliases)[Length],
- mozilla::Span<const char> type) {
+static inline const char* SearchUnicodeReplacement(
+ const char* (&types)[Length], const char* (&aliases)[Length],
+ mozilla::Span<const char> type) {
auto p = std::lower_bound(std::begin(types), std::end(types), type,
[](const auto& a, const auto& b) {
@@ -717,26 +806,15 @@ static inline const char* SearchReplacement(const char* (&types)[Length],
* values.
*
* Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ * Spec: https://www.unicode.org/reports/tr35/#t_Extension
*/
const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
mozilla::Span<const char> key, mozilla::Span<const char> type) {
-#ifdef DEBUG
- static auto isAsciiLowercaseAlphanumeric = [](char c) {
- return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
- };
-
- static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
- return isAsciiLowercaseAlphanumeric(c) || c == '-';
- };
-#endif
-
MOZ_ASSERT(key.size() == UnicodeKeyLength);
- MOZ_ASSERT(std::all_of(key.begin(), key.end(),
- isAsciiLowercaseAlphanumeric));
+ MOZ_ASSERT(IsCanonicallyCasedUnicodeKey(key));
MOZ_ASSERT(type.size() > UnicodeKeyLength);
- MOZ_ASSERT(std::all_of(type.begin(), type.end(),
- isAsciiLowercaseAlphanumericOrDash));
+ MOZ_ASSERT(IsCanonicallyCasedUnicodeType(type));
if (IsUnicodeKey(key, "ca")) {
if (IsUnicodeType(type, "ethiopic-amete-alem")) {
@@ -804,7 +882,7 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
"pl26", "pl24", "pl28", "pl30", "pl32", "tttob", "ttmrc", "tttob",
"twkhh", "twtnn", "twnwt", "twtxg",
};
- return SearchReplacement(types, aliases, type);
+ return SearchUnicodeReplacement(types, aliases, type);
}
else if (IsUnicodeKey(key, "tz")) {
static const char* types[28] = {
@@ -821,7 +899,52 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
"usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel",
"trist", "utc", "usden", "utc",
};
- return SearchReplacement(types, aliases, type);
+ return SearchUnicodeReplacement(types, aliases, type);
+ }
+ return nullptr;
+}
+
+template <size_t Length>
+static inline bool IsTransformKey(
+ mozilla::Span<const char> key, const char (&str)[Length]) {
+ static_assert(Length == TransformKeyLength + 1,
+ "Transform extension key is two characters long");
+ return memcmp(key.data(), str, Length - 1) == 0;
+}
+
+template <size_t Length>
+static inline bool IsTransformType(
+ mozilla::Span<const char> type, const char (&str)[Length]) {
+ static_assert(Length > TransformKeyLength + 1,
+ "Transform extension type contains more than two characters");
+ return type.size() == (Length - 1) &&
+ memcmp(type.data(), str, Length - 1) == 0;
+}
+
+/**
+ * Mapping from deprecated BCP 47 Transform extension types to their preferred
+ * values.
+ *
+ * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ * Spec: https://www.unicode.org/reports/tr35/#t_Extension
+ */
+const char* js::intl::LanguageTag::replaceTransformExtensionType(
+ mozilla::Span<const char> key, mozilla::Span<const char> type) {
+ MOZ_ASSERT(key.size() == TransformKeyLength);
+ MOZ_ASSERT(IsCanonicallyCasedTransformKey(key));
+
+ MOZ_ASSERT(type.size() > TransformKeyLength);
+ MOZ_ASSERT(IsCanonicallyCasedTransformType(type));
+
+ if (IsTransformKey(key, "d0")) {
+ if (IsTransformType(type, "name")) {
+ return "charname";
+ }
+ }
+ else if (IsTransformKey(key, "m0")) {
+ if (IsTransformType(type, "names")) {
+ return "prprname";
+ }
}
return nullptr;
}
diff --git a/js/src/builtin/intl/Locale.cpp b/js/src/builtin/intl/Locale.cpp
index 5d55fad2a1..ee70c0b06f 100644
--- a/js/src/builtin/intl/Locale.cpp
+++ b/js/src/builtin/intl/Locale.cpp
@@ -362,17 +362,12 @@ static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag,
/**
* ApplyUnicodeExtensionToTag( tag, options, relevantExtensionKeys )
*/
-static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag,
- HandleLinearString calendar,
- HandleLinearString collation,
- HandleLinearString hourCycle,
- HandleLinearString caseFirst,
- HandleLinearString numeric,
- HandleLinearString numberingSystem) {
+bool js::intl::ApplyUnicodeExtensionToTag(
+ JSContext* cx, LanguageTag& tag,
+ JS::HandleVector<intl::UnicodeExtensionKeyword> keywords) {
// If no Unicode extensions were present in the options object, we can skip
// everything below and directly return.
- if (!calendar && !collation && !caseFirst && !hourCycle && !numeric &&
- !numberingSystem) {
+ if (keywords.length() == 0) {
return true;
}
@@ -402,53 +397,32 @@ static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag,
}
}
- using UnicodeKeyWithSeparator = const char(&)[UnicodeKeyLength + 3];
-
- auto appendKeyword = [&newExtension](UnicodeKeyWithSeparator key,
- JSLinearString* value) {
- if (!newExtension.append(key, UnicodeKeyLength + 2)) {
- return false;
- }
-
- JS::AutoCheckCannotGC nogc;
- return value->hasLatin1Chars()
- ? newExtension.append(value->latin1Chars(nogc), value->length())
- : newExtension.append(value->twoByteChars(nogc),
- value->length());
- };
-
// Append the new keywords before any existing keywords. That way any previous
// keyword with the same key is detected as a duplicate when canonicalizing
// the Unicode extension subtag and gets discarded.
- if (calendar) {
- if (!appendKeyword("-ca-", calendar)) {
- return false;
- }
- }
- if (collation) {
- if (!appendKeyword("-co-", collation)) {
- return false;
- }
- }
- if (hourCycle) {
- if (!appendKeyword("-hc-", hourCycle)) {
+ for (const auto& keyword : keywords) {
+ UnicodeExtensionKeyword::UnicodeKeySpan key = keyword.key();
+ if (!newExtension.append('-')) {
return false;
}
- }
- if (caseFirst) {
- if (!appendKeyword("-kf-", caseFirst)) {
+ if (!newExtension.append(key.data(), key.size())) {
return false;
}
- }
- if (numeric) {
- if (!appendKeyword("-kn-", numeric)) {
+ if (!newExtension.append('-')) {
return false;
}
- }
- if (numberingSystem) {
- if (!appendKeyword("-nu-", numberingSystem)) {
- return false;
+
+ JS::AutoCheckCannotGC nogc;
+ JSLinearString* type = keyword.type();
+ if (type->hasLatin1Chars()) {
+ if (!newExtension.append(type->latin1Chars(nogc), type->length())) {
+ return false;
+ }
+ } else {
+ if (!newExtension.append(type->twoByteChars(nogc), type->length())) {
+ return false;
+ }
}
}
@@ -560,15 +534,16 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
return false;
}
- // Step 13 (not applicable).
+ // Step 13.
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
- // Steps 14, 16.
+ // Step 14.
RootedLinearString calendar(cx);
if (!GetStringOption(cx, options, cx->names().calendar, &calendar)) {
return false;
}
- // Step 15.
+ // Steps 15-16.
if (calendar) {
if (!IsValidUnicodeExtensionValue(calendar)) {
if (UniqueChars str = StringToNewUTF8CharsZ(cx, *calendar)) {
@@ -578,15 +553,19 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
}
return false;
}
+
+ if (!keywords.emplaceBack("ca", calendar)) {
+ return false;
+ }
}
- // Steps 17, 19.
+ // Step 17.
RootedLinearString collation(cx);
if (!GetStringOption(cx, options, cx->names().collation, &collation)) {
return false;
}
- // Step 18.
+ // Steps 18-19.
if (collation) {
if (!IsValidUnicodeExtensionValue(collation)) {
if (UniqueChars str = StringToNewUTF8CharsZ(cx, *collation)) {
@@ -596,14 +575,19 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
}
return false;
}
+
+ if (!keywords.emplaceBack("co", collation)) {
+ return false;
+ }
}
- // Steps 20-21.
+ // Step 20 (without validation).
RootedLinearString hourCycle(cx);
if (!GetStringOption(cx, options, cx->names().hourCycle, &hourCycle)) {
return false;
}
+ // Steps 20-21.
if (hourCycle) {
if (!StringEqualsAscii(hourCycle, "h11") &&
!StringEqualsAscii(hourCycle, "h12") &&
@@ -616,14 +600,19 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
}
return false;
}
+
+ if (!keywords.emplaceBack("hc", hourCycle)) {
+ return false;
+ }
}
- // Steps 22-23.
+ // Step 22 (without validation).
RootedLinearString caseFirst(cx);
if (!GetStringOption(cx, options, cx->names().caseFirst, &caseFirst)) {
return false;
}
+ // Steps 22-23.
if (caseFirst) {
if (!StringEqualsAscii(caseFirst, "upper") &&
!StringEqualsAscii(caseFirst, "lower") &&
@@ -635,22 +624,33 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
}
return false;
}
+
+ if (!keywords.emplaceBack("kf", caseFirst)) {
+ return false;
+ }
}
- // Steps 24-26.
+ // Steps 24-25.
RootedLinearString numeric(cx);
if (!GetBooleanOption(cx, options, cx->names().numeric, &numeric)) {
return false;
}
- // Steps 27, 29.
+ // Step 26.
+ if (numeric) {
+ if (!keywords.emplaceBack("kn", numeric)) {
+ return false;
+ }
+ }
+
+ // Step 27.
RootedLinearString numberingSystem(cx);
if (!GetStringOption(cx, options, cx->names().numberingSystem,
&numberingSystem)) {
return false;
}
- // Step 28.
+ // Steps 28-29.
if (numberingSystem) {
if (!IsValidUnicodeExtensionValue(numberingSystem)) {
if (UniqueChars str = StringToNewUTF8CharsZ(cx, *numberingSystem)) {
@@ -660,19 +660,21 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
}
return false;
}
+
+ if (!keywords.emplaceBack("nu", numberingSystem)) {
+ return false;
+ }
}
// Step 30.
- if (!ApplyUnicodeExtensionToTag(cx, tag, calendar, collation, hourCycle,
- caseFirst, numeric, numberingSystem)) {
+ if (!ApplyUnicodeExtensionToTag(cx, tag, keywords)) {
return false;
}
}
// ApplyOptionsToTag, steps 9 and 13.
- // ApplyUnicodeExtensionToTag, step 8.
- if (!tag.canonicalizeExtensions(
- cx, LanguageTag::UnicodeExtensionCanonicalForm::Yes)) {
+ // ApplyUnicodeExtensionToTag, step 9.
+ if (!tag.canonicalizeExtensions(cx)) {
return false;
}
@@ -954,10 +956,7 @@ static bool Locale_toString(JSContext* cx, unsigned argc, Value* vp) {
static bool Locale_baseName(JSContext* cx, const CallArgs& args) {
MOZ_ASSERT(IsLocale(args.thisv()));
- // FIXME: spec bug - invalid assertion in step 4.
- // FIXME: spec bug - subtag production names not updated.
-
- // Steps 3, 5.
+ // Steps 3-4.
auto* locale = &args.thisv().toObject().as<LocaleObject>();
args.rval().setString(locale->baseName());
return true;
@@ -986,6 +985,22 @@ static bool Locale_calendar(JSContext* cx, unsigned argc, Value* vp) {
return CallNonGenericMethod<IsLocale, Locale_calendar>(cx, args);
}
+// get Intl.Locale.prototype.caseFirst
+static bool Locale_caseFirst(JSContext* cx, const CallArgs& args) {
+ MOZ_ASSERT(IsLocale(args.thisv()));
+
+ // Step 3.
+ auto* locale = &args.thisv().toObject().as<LocaleObject>();
+ return GetUnicodeExtension(cx, locale, "kf", args.rval());
+}
+
+// get Intl.Locale.prototype.caseFirst
+static bool Locale_caseFirst(JSContext* cx, unsigned argc, Value* vp) {
+ // Steps 1-2.
+ CallArgs args = CallArgsFromVp(argc, vp);
+ return CallNonGenericMethod<IsLocale, Locale_caseFirst>(cx, args);
+}
+
// get Intl.Locale.prototype.collation
static bool Locale_collation(JSContext* cx, const CallArgs& args) {
MOZ_ASSERT(IsLocale(args.thisv()));
@@ -1018,22 +1033,6 @@ static bool Locale_hourCycle(JSContext* cx, unsigned argc, Value* vp) {
return CallNonGenericMethod<IsLocale, Locale_hourCycle>(cx, args);
}
-// get Intl.Locale.prototype.caseFirst
-static bool Locale_caseFirst(JSContext* cx, const CallArgs& args) {
- MOZ_ASSERT(IsLocale(args.thisv()));
-
- // Step 3.
- auto* locale = &args.thisv().toObject().as<LocaleObject>();
- return GetUnicodeExtension(cx, locale, "kf", args.rval());
-}
-
-// get Intl.Locale.prototype.caseFirst
-static bool Locale_caseFirst(JSContext* cx, unsigned argc, Value* vp) {
- // Steps 1-2.
- CallArgs args = CallArgsFromVp(argc, vp);
- return CallNonGenericMethod<IsLocale, Locale_caseFirst>(cx, args);
-}
-
// get Intl.Locale.prototype.numeric
static bool Locale_numeric(JSContext* cx, const CallArgs& args) {
MOZ_ASSERT(IsLocale(args.thisv()));
@@ -1045,8 +1044,13 @@ static bool Locale_numeric(JSContext* cx, const CallArgs& args) {
return false;
}
- // FIXME: spec bug - comparison should be against the empty string, too.
+ // Compare against the empty string per Intl.Locale, step 36.a. The Unicode
+ // extension is already canonicalized, so we don't need to compare against
+ // "true" at this point.
MOZ_ASSERT(value.isUndefined() || value.isString());
+ MOZ_ASSERT_IF(value.isString(),
+ !StringEqualsAscii(&value.toString()->asLinear(), "true"));
+
args.rval().setBoolean(value.isString() && value.toString()->empty());
return true;
}
@@ -1093,7 +1097,6 @@ static bool Locale_language(JSContext* cx, const CallArgs& args) {
size_t length = language.length;
// Step 5.
- // FIXME: spec bug - not all production names updated.
JSString* str = NewDependentString(cx, baseName, index, length);
if (!str) {
return false;
@@ -1126,7 +1129,6 @@ static bool Locale_script(JSContext* cx, const CallArgs& args) {
auto script = BaseNameParts(baseName).script;
// Step 5.
- // FIXME: spec bug - not all production names updated.
if (!script) {
args.rval().setUndefined();
return true;
@@ -1208,9 +1210,9 @@ static const JSFunctionSpec locale_methods[] = {
static const JSPropertySpec locale_properties[] = {
JS_PSG("baseName", Locale_baseName, 0),
JS_PSG("calendar", Locale_calendar, 0),
+ JS_PSG("caseFirst", Locale_caseFirst, 0),
JS_PSG("collation", Locale_collation, 0),
JS_PSG("hourCycle", Locale_hourCycle, 0),
- JS_PSG("caseFirst", Locale_caseFirst, 0),
JS_PSG("numeric", Locale_numeric, 0),
JS_PSG("numberingSystem", Locale_numberingSystem, 0),
JS_PSG("language", Locale_language, 0),
@@ -1301,7 +1303,7 @@ bool js::intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, unsigned argc,
return false;
}
- if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) {
+ if (!tag.canonicalize(cx)) {
return false;
}
@@ -1334,7 +1336,7 @@ bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx,
return true;
}
- if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) {
+ if (!tag.canonicalize(cx)) {
return false;
}
@@ -1345,3 +1347,85 @@ bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx,
args.rval().setString(resultStr);
return true;
}
+
+bool js::intl_ValidateAndCanonicalizeUnicodeExtensionType(JSContext* cx,
+ unsigned argc,
+ Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+ MOZ_ASSERT(args.length() == 3);
+
+ HandleValue typeArg = args[0];
+ MOZ_ASSERT(typeArg.isString(), "type must be a string");
+
+ HandleValue optionArg = args[1];
+ MOZ_ASSERT(optionArg.isString(), "option name must be a string");
+
+ HandleValue keyArg = args[2];
+ MOZ_ASSERT(keyArg.isString(), "key must be a string");
+
+ RootedLinearString unicodeType(cx, typeArg.toString()->ensureLinear(cx));
+ if (!unicodeType) {
+ return false;
+ }
+
+ if (!IsValidUnicodeExtensionValue(unicodeType)) {
+ JSAutoByteString optionStr(cx, optionArg.toString());
+ if (!optionStr) {
+ return false;
+ }
+
+ JSAutoByteString unicodeTypeQuot(cx, QuoteString(cx, unicodeType, '"'));
+ if (!unicodeTypeQuot) {
+ return false;
+ }
+
+ JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr,
+ JSMSG_INVALID_OPTION_VALUE, optionStr.ptr(),
+ unicodeTypeQuot.ptr());
+ return false;
+ }
+
+ char unicodeKey[UnicodeKeyLength];
+ {
+ JSLinearString* str = keyArg.toString()->ensureLinear(cx);
+ if (!str) {
+ return false;
+ }
+ MOZ_ASSERT(str->length() == UnicodeKeyLength);
+
+ for (size_t i = 0; i < UnicodeKeyLength; i++) {
+ char16_t ch = str->latin1OrTwoByteChar(i);
+ MOZ_ASSERT(mozilla::IsAscii(ch));
+ unicodeKey[i] = char(ch);
+ }
+ }
+
+ JSAutoByteString unicodeTypeChars(cx, unicodeType);
+ if (!unicodeTypeChars) {
+ return false;
+ }
+
+ size_t unicodeTypeLength = unicodeType->length();
+ MOZ_ASSERT(strlen(unicodeTypeChars.ptr()) == unicodeTypeLength);
+
+ // Convert into canonical case before searching for replacements.
+ intl::AsciiToLowerCase(unicodeTypeChars.ptr(), unicodeTypeLength,
+ unicodeTypeChars.ptr());
+
+ auto key = mozilla::MakeSpan(unicodeKey, UnicodeKeyLength);
+ auto type = mozilla::MakeSpan(unicodeTypeChars.ptr(), unicodeTypeLength);
+
+ // Search if there's a replacement for the current Unicode keyword.
+ JSString* result;
+ if (const char* replacement = LanguageTag::replaceUnicodeExtensionType(key, type)) {
+ result = NewStringCopyZ<CanGC>(cx, replacement);
+ } else {
+ result = StringToLowerCase(cx, unicodeType);
+ }
+ if (!result) {
+ return false;
+ }
+
+ args.rval().setString(result);
+ return true;
+}
diff --git a/js/src/builtin/intl/Locale.h b/js/src/builtin/intl/Locale.h
index 31b3caca5c..74ff4b5a71 100644
--- a/js/src/builtin/intl/Locale.h
+++ b/js/src/builtin/intl/Locale.h
@@ -56,6 +56,9 @@ extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx,
extern MOZ_MUST_USE bool intl_TryValidateAndCanonicalizeLanguageTag(
JSContext* cx, unsigned argc, Value* vp);
+extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeUnicodeExtensionType(
+ JSContext* cx, unsigned argc, Value* vp);
+
} // namespace js
#endif /* builtin_intl_Locale_h */
diff --git a/js/src/builtin/intl/NumberFormat.cpp b/js/src/builtin/intl/NumberFormat.cpp
index df40e751c8..9ee3b02109 100644
--- a/js/src/builtin/intl/NumberFormat.cpp
+++ b/js/src/builtin/intl/NumberFormat.cpp
@@ -18,6 +18,7 @@
#include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/LanguageTag.h"
#include "builtin/intl/ScopedICUObject.h"
#include "ds/Sort.h"
#include "js/RootingAPI.h"
@@ -246,7 +247,41 @@ NewUNumberFormat(JSContext* cx, Handle<NumberFormatObject*> numberFormat)
if (!GetProperty(cx, internals, internals, cx->names().locale, &value))
return nullptr;
- JSAutoByteString locale(cx, value.toString());
+
+ // ICU expects numberingSystem as a Unicode locale extension on locale.
+
+ intl::LanguageTag tag(cx);
+ {
+ JSLinearString* locale = value.toString()->ensureLinear(cx);
+ if (!locale)
+ return nullptr;
+
+ if (!intl::LanguageTagParser::parse(cx, locale, tag))
+ return nullptr;
+ }
+
+ JS::RootedVector<intl::UnicodeExtensionKeyword> keywords(cx);
+
+ if (!GetProperty(cx, internals, internals, cx->names().numberingSystem, &value))
+ return nullptr;
+
+ {
+ JSLinearString* numberingSystem = value.toString()->ensureLinear(cx);
+ if (!numberingSystem)
+ return nullptr;
+
+ if (!keywords.emplaceBack("nu", numberingSystem))
+ return nullptr;
+ }
+
+ // |ApplyUnicodeExtensionToTag| applies the new keywords to the front of
+ // the Unicode extension subtag. We're then relying on ICU to follow RFC
+ // 6067, which states that any trailing keywords using the same key
+ // should be ignored.
+ if (!intl::ApplyUnicodeExtensionToTag(cx, tag, keywords))
+ return nullptr;
+
+ UniqueChars locale = tag.toStringZ(cx);
if (!locale)
return nullptr;
@@ -264,9 +299,6 @@ NewUNumberFormat(JSContext* cx, Handle<NumberFormatObject*> numberFormat)
RootedString currency(cx);
AutoStableStringChars stableChars(cx);
- // We don't need to look at numberingSystem - it can only be set via
- // the Unicode locale extension and is therefore already set on locale.
-
if (!GetProperty(cx, internals, internals, cx->names().style, &value))
return nullptr;
JSAutoByteString style(cx, value.toString());
@@ -339,7 +371,7 @@ NewUNumberFormat(JSContext* cx, Handle<NumberFormatObject*> numberFormat)
uUseGrouping = value.toBoolean();
UErrorCode status = U_ZERO_ERROR;
- UNumberFormat* nf = unum_open(uStyle, nullptr, 0, IcuLocale(locale.ptr()), nullptr, &status);
+ UNumberFormat* nf = unum_open(uStyle, nullptr, 0, IcuLocale(locale.get()), nullptr, &status);
if (U_FAILURE(status)) {
intl::ReportInternalError(cx);
return nullptr;
diff --git a/js/src/builtin/intl/NumberFormat.js b/js/src/builtin/intl/NumberFormat.js
index 973abd026a..238a59405b 100644
--- a/js/src/builtin/intl/NumberFormat.js
+++ b/js/src/builtin/intl/NumberFormat.js
@@ -211,6 +211,8 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) {
// opt: // opt object computed in InitializeNumberFormat
// {
// localeMatcher: "lookup" / "best fit",
+ //
+ // nu: string matching a Unicode extension type, // optional
// }
//
// minimumIntegerDigits: integer ∈ [1, 21],
@@ -253,6 +255,16 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) {
// Steps 5-6.
var matcher = GetOption(options, "localeMatcher", "string", ["lookup", "best fit"], "best fit");
opt.localeMatcher = matcher;
+
+ var numberingSystem = GetOption(options, "numberingSystem", "string", undefined, undefined);
+
+ if (numberingSystem !== undefined) {
+ numberingSystem = intl_ValidateAndCanonicalizeUnicodeExtensionType(numberingSystem,
+ "numberingSystem",
+ "nu");
+ }
+
+ opt.nu = numberingSystem;
// Compute formatting options.
// Step 12.
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py
index 0370d422d9..59ff14d76c 100644
--- a/js/src/builtin/intl/make_intl_data.py
+++ b/js/src/builtin/intl/make_intl_data.py
@@ -331,6 +331,96 @@ void js::intl::LanguageTag::performComplexRegionMappings() {
""".strip("\n"))
+def writeVariantTagMappings(println, variant_mappings, description, source,
+ url):
+ """ Writes a function definition that maps variant subtags. """
+ println(u"""
+static const char* ToCharPointer(const char* str) {
+ return str;
+}
+
+static const char* ToCharPointer(const js::UniqueChars& str) {
+ return str.get();
+}
+
+template <typename T, typename U = T>
+static bool IsLessThan(const T& a, const U& b) {
+ return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
+}
+""")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) {
+ // The variant subtags need to be sorted for binary search.
+ MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
+ IsLessThan<decltype(variants_)::ElementType>));
+
+ auto insertVariantSortedIfNotPresent = [&](const char* variant) {
+ auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
+ IsLessThan<decltype(variants_)::ElementType,
+ decltype(variant)>);
+
+ // Don't insert the replacement when already present.
+ if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
+ return true;
+ }
+
+ // Insert the preferred variant in sort order.
+ auto preferred = DuplicateString(cx, variant);
+ if (!preferred) {
+ return false;
+ }
+ return !!variants_.insert(p, std::move(preferred));
+ };
+
+ for (size_t i = 0; i < variants_.length(); ) {
+ auto& variant = variants_[i];
+ MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get())));
+""".lstrip())
+
+ first_variant = True
+
+ for (deprecated_variant, (type, replacement)) in (
+ sorted(variant_mappings.items(), key=itemgetter(0))
+ ):
+ if_kind = u"if" if first_variant else u"else if"
+ first_variant = False
+
+ println(u"""
+ {} (strcmp(variant.get(), "{}") == 0) {{
+ variants_.erase(variants_.begin() + i);
+""".format(if_kind, deprecated_variant).strip("\n"))
+
+ if type == "language":
+ println(u"""
+ setLanguage("{}");
+""".format(replacement).strip("\n"))
+ elif type == "region":
+ println(u"""
+ setRegion("{}");
+""".format(replacement).strip("\n"))
+ else:
+ assert type == "variant"
+ println(u"""
+ if (!insertVariantSortedIfNotPresent("{}")) {{
+ return false;
+ }}
+""".format(replacement).strip("\n"))
+
+ println(u"""
+ }
+""".strip("\n"))
+
+ println(u"""
+ else {
+ i++;
+ }
+ }
+ return true;
+}
+""".strip("\n"))
+
+
def writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
description, source, url):
""" Writes a function definition that maps grandfathered language tags. """
@@ -498,6 +588,7 @@ def readSupplementalData(core_file):
- complexLanguageMappings: mappings from language subtags with complex rules
- regionMappings: mappings from region subtags to preferred subtags
- complexRegionMappings: mappings from region subtags with complex rules
+ - variantMappings: mappings from variant subtags to preferred subtags
- likelySubtags: likely subtags used for generating test data only
Returns these mappings as dictionaries.
"""
@@ -541,6 +632,14 @@ def readSupplementalData(core_file):
$
""", re.IGNORECASE | re.VERBOSE)
+ re_unicode_variant_subtag = re.compile(
+ r"""
+ ^
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3}))
+ $
+ """, re.IGNORECASE | re.VERBOSE)
+
# The fixed list of BCP 47 grandfathered language tags.
grandfathered_tags = (
"art-lojban",
@@ -589,6 +688,11 @@ def readSupplementalData(core_file):
# replacement, e.g. "SU" -> ("RU", ["AM",complex_region_mappings[type] = replacements "AZ", "BY", ...]).
complex_region_mappings = {}
+ # Dictionary of aliased variant subtags to a tuple of preferred replacement
+ # type and replacement, e.g. "arevela" -> ("language", "hy") or
+ # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
+ variant_mappings = {}
+
# Dictionary of grandfathered mappings to preferred values.
grandfathered_mappings = {}
@@ -624,6 +728,8 @@ def readSupplementalData(core_file):
if re_unicode_language_subtag.match(type) is None:
continue
+ assert type.islower()
+
if re_unicode_language_subtag.match(replacement) is not None:
# Canonical case for language subtags is lower-case.
language_mappings[type] = replacement.lower()
@@ -647,6 +753,8 @@ def readSupplementalData(core_file):
if re_unicode_region_subtag.match(type) is None:
continue
+ assert type.isupper() or type.isdigit()
+
if re_unicode_region_subtag.match(replacement) is not None:
# Canonical case for region subtags is upper-case.
region_mappings[type] = replacement.upper()
@@ -658,6 +766,33 @@ def readSupplementalData(core_file):
), "{} invalid region subtags".format(replacement)
complex_region_mappings[type] = replacements
+ for variant_alias in tree.iterfind(".//variantAlias"):
+ type = variant_alias.get("type")
+ replacement = variant_alias.get("replacement")
+
+ assert re_unicode_variant_subtag.match(type) is not None, (
+ "{} invalid variant subtag".format(type))
+
+ # Normalize the case, because some variants are in upper case.
+ type = type.lower()
+
+ # The replacement can be a language, a region, or a variant subtag.
+ # Language and region subtags are case normalized, variant subtags can
+ # be in any case.
+
+ if re_unicode_language_subtag.match(replacement) is not None and replacement.islower():
+ variant_mappings[type] = ("language", replacement)
+
+ elif re_unicode_region_subtag.match(replacement) is not None:
+ assert replacement.isupper() or replacement.isdigit(), (
+ "{} invalid variant subtag replacement".format(replacement))
+ variant_mappings[type] = ("region", replacement)
+
+ else:
+ assert re_unicode_variant_subtag.match(replacement) is not None, (
+ "{} invalid variant subtag replacement".format(replacement))
+ variant_mappings[type] = ("variant", replacement.lower())
+
tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
likely_subtags = {}
@@ -724,6 +859,7 @@ def readSupplementalData(core_file):
"complexLanguageMappings": complex_language_mappings,
"regionMappings": region_mappings,
"complexRegionMappings": complex_region_mappings_final,
+ "variantMappings": variant_mappings,
"likelySubtags": likely_subtags,
}
@@ -740,14 +876,20 @@ def readUnicodeExtensions(core_file):
# Mapping from Unicode extension types to dict of deprecated to
# preferred values.
- mapping = {}
+ mapping = {
+ # Unicode BCP 47 U Extension
+ "u": {},
+
+ # Unicode BCP 47 T Extension
+ "t": {},
+ }
def readBCP47File(file):
tree = ET.parse(file)
for keyword in tree.iterfind(".//keyword/key"):
- # Skip over keywords whose extension is not "u".
- if keyword.get("extension", "u") != "u":
- continue
+ extension = keyword.get("extension", "u")
+ assert extension == "u" or extension == "t", (
+ "unknown extension type: {}".format(extension))
extension_name = keyword.get("name")
@@ -806,7 +948,7 @@ def readUnicodeExtensions(core_file):
if preferred is not None:
assert typeRE.match(preferred), preferred
- mapping.setdefault(extension_name, {})[name] = preferred
+ mapping[extension].setdefault(extension_name, {})[name] = preferred
if alias is not None:
for alias_name in alias.lower().split(" "):
@@ -816,7 +958,7 @@ def readUnicodeExtensions(core_file):
# See comment above when 'alias' and 'preferred' are both present.
if (preferred is not None and
- name in mapping[extension_name]):
+ name in mapping[extension][extension_name]):
continue
# Skip over entries where 'name' and 'alias' are equal.
@@ -828,7 +970,7 @@ def readUnicodeExtensions(core_file):
if name == alias_name:
continue
- mapping.setdefault(extension_name, {})[alias_name] = name
+ mapping[extension].setdefault(extension_name, {})[alias_name] = name
def readSupplementalMetadata(file):
# Find subdivision and region replacements.
@@ -857,8 +999,8 @@ def readUnicodeExtensions(core_file):
continue
# 'subdivisionAlias' applies to 'rg' and 'sd' keys.
- mapping.setdefault("rg", {})[type] = replacement
- mapping.setdefault("sd", {})[type] = replacement
+ mapping["u"].setdefault("rg", {})[type] = replacement
+ mapping["u"].setdefault("sd", {})[type] = replacement
for name in core_file.namelist():
if bcpFileRE.match(name):
@@ -866,7 +1008,10 @@ def readUnicodeExtensions(core_file):
readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml"))
- return mapping
+ return {
+ "unicodeMappings": mapping["u"],
+ "transformMappings": mapping["t"],
+ }
def writeCLDRLanguageTagData(println, data, url):
""" Writes the language tag data to the Intl data file. """
@@ -884,6 +1029,7 @@ def writeCLDRLanguageTagData(println, data, url):
#include <cstdint>
#include <cstring>
#include <iterator>
+#include <string>
#include <type_traits>
#include "jscntxt.h"
@@ -927,6 +1073,14 @@ static inline const char* SearchReplacement(
}
#ifdef DEBUG
+static bool IsAsciiLowercaseAlphanumeric(char c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
+}
+
+static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
+ return IsAsciiLowercaseAlphanumeric(c) || c == '-';
+}
+
static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
// Tell the analysis the |std::all_of| function can't GC.
JS::AutoSuppressGCAnalysis nogc;
@@ -943,14 +1097,26 @@ static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
}
static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
- auto isAsciiLowercaseAlphaOrDigit = [](char c) {
- return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
- };
-
// Tell the analysis the |std::all_of| function can't GC.
JS::AutoSuppressGCAnalysis nogc;
- return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit);
+ return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
+}
+
+static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
}
#endif
""".rstrip())
@@ -961,7 +1127,9 @@ static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
complex_language_mappings = data["complexLanguageMappings"]
region_mappings = data["regionMappings"]
complex_region_mappings = data["complexRegionMappings"]
+ variant_mappings = data["variantMappings"]
unicode_mappings = data["unicodeMappings"]
+ transform_mappings = data["transformMappings"]
# unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
language_maxlength = 8
@@ -999,11 +1167,15 @@ static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
writeComplexRegionTagMappings(println, complex_region_mappings,
"Region subtags with complex mappings.", source, url)
+ writeVariantTagMappings(println, variant_mappings,
+ "Mappings from variant subtags to preferred values.", source, url)
+
writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
"Canonicalize grandfathered locale identifiers.", source,
url)
- writeUnicodeExtensionsMappings(println, unicode_mappings)
+ writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
+ writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
@@ -1157,7 +1329,7 @@ def updateCLDRLangTags(args):
def readFiles(cldr_file):
with ZipFile(cldr_file) as zip_file:
data.update(readSupplementalData(zip_file))
- data["unicodeMappings"] = readUnicodeExtensions(zip_file)
+ data.update(readUnicodeExtensions(zip_file))
print("Processing CLDR data...")
if filename is not None:
@@ -1181,8 +1353,7 @@ def updateCLDRLangTags(args):
with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
println = partial(print, file=f)
- println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||"
- u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))")
+ println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl'))")
writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
@@ -1898,91 +2069,84 @@ def updateTzdata(topsrcdir, args):
else:
updateFrom(tzDir)
-def writeUnicodeExtensionsMappings(println, mapping):
+def writeUnicodeExtensionsMappings(println, mapping, extension):
println(u"""
template <size_t Length>
-static inline bool IsUnicodeKey(mozilla::Span<const char> key,
- const char (&str)[Length]) {
- static_assert(Length == UnicodeKeyLength + 1,
- "Unicode extension key is two characters long");
+static inline bool Is{0}Key(
+ mozilla::Span<const char> key, const char (&str)[Length]) {{
+ static_assert(Length == {0}KeyLength + 1,
+ "{0} extension key is two characters long");
return memcmp(key.data(), str, Length - 1) == 0;
-}
+}}
template <size_t Length>
-static inline bool IsUnicodeType(mozilla::Span<const char> type,
- const char (&str)[Length]) {
- static_assert(Length > UnicodeKeyLength + 1,
- "Unicode extension type contains more than two characters");
+static inline bool Is{0}Type(
+ mozilla::Span<const char> type, const char (&str)[Length]) {{
+ static_assert(Length > {0}KeyLength + 1,
+ "{0} extension type contains more than two characters");
return type.size() == (Length - 1) &&
memcmp(type.data(), str, Length - 1) == 0;
-}
+}}
+""".format(extension).rstrip("\n"))
-static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) {
-#ifdef DEBUG
- auto isNull = [](char c) {
- return c == '\\0';
- };
-#endif
+ linear_search_max_length = 4
+
+ needs_binary_search = any(len(replacements.items()) > linear_search_max_length
+ for replacements in mapping.values())
- MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull),
+ if needs_binary_search:
+ println(u"""
+static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
+ MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
"unexpected null-character in string");
using UnsignedChar = unsigned char;
- for (size_t i = 0; i < b.size(); i++) {
+ for (size_t i = 0; i < b.size(); i++) {{
// |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
// we've reached the end of |a|, the below if-statement will always be true.
// That ensures we don't read past the end of |a|.
- if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
+ if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
return r;
- }
- }
+ }}
+ }}
// Return zero if both strings are equal or a negative number if |b| is a
// prefix of |a|.
return -int32_t(UnsignedChar(a[b.size()]));
-};
+}}
template <size_t Length>
-static inline const char* SearchReplacement(const char* (&types)[Length],
- const char* (&aliases)[Length],
- mozilla::Span<const char> type) {
+static inline const char* Search{0}Replacement(
+ const char* (&types)[Length], const char* (&aliases)[Length],
+ mozilla::Span<const char> type) {{
auto p = std::lower_bound(std::begin(types), std::end(types), type,
- [](const auto& a, const auto& b) {
- return CompareUnicodeType(a, b) < 0;
- });
- if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) {
+ [](const auto& a, const auto& b) {{
+ return Compare{0}Type(a, b) < 0;
+ }});
+ if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
return aliases[std::distance(std::begin(types), p)];
- }
+ }}
return nullptr;
-}
+}}
+""".format(extension).rstrip("\n"))
+ println(u"""
/**
- * Mapping from deprecated BCP 47 Unicode extension types to their preferred
+ * Mapping from deprecated BCP 47 {0} extension types to their preferred
* values.
*
* Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ * Spec: https://www.unicode.org/reports/tr35/#t_Extension
*/
-const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
- mozilla::Span<const char> key, mozilla::Span<const char> type) {
-#ifdef DEBUG
- static auto isAsciiLowercaseAlphanumeric = [](char c) {
- return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
- };
+const char* js::intl::LanguageTag::replace{0}ExtensionType(
+ mozilla::Span<const char> key, mozilla::Span<const char> type) {{
+ MOZ_ASSERT(key.size() == {0}KeyLength);
+ MOZ_ASSERT(IsCanonicallyCased{0}Key(key));
- static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
- return isAsciiLowercaseAlphanumeric(c) || c == '-';
- };
-#endif
-
- MOZ_ASSERT(key.size() == UnicodeKeyLength);
- MOZ_ASSERT(std::all_of(key.begin(), key.end(),
- isAsciiLowercaseAlphanumeric));
-
- MOZ_ASSERT(type.size() > UnicodeKeyLength);
- MOZ_ASSERT(std::all_of(type.begin(), type.end(),
- isAsciiLowercaseAlphanumericOrDash));
-""")
+ MOZ_ASSERT(type.size() > {0}KeyLength);
+ MOZ_ASSERT(IsCanonicallyCased{0}Type(type));
+""".format(extension))
def to_hash_key(replacements):
return str(sorted(replacements.items()))
@@ -2014,7 +2178,8 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
if key in key_aliases[hash_key]:
continue
- cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key])
+ cond = (u"Is{}Key(key, \"{}\")".format(extension, k)
+ for k in [key] + key_aliases[hash_key])
if_kind = u"if" if first_key else u"else if"
cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
@@ -2024,7 +2189,7 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
replacements = sorted(replacements.items(), key=itemgetter(0))
- if len(replacements) > 4:
+ if len(replacements) > linear_search_max_length:
types = [t for (t, _) in replacements]
preferred = [r for (_, r) in replacements]
max_len = max(len(k) for k in types + preferred)
@@ -2032,14 +2197,14 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
write_array(types, "types", max_len)
write_array(preferred, "aliases", max_len)
println(u"""
- return SearchReplacement(types, aliases, type);
-""".strip("\n"))
+ return Search{}Replacement(types, aliases, type);
+""".format(extension).strip("\n"))
else:
for (type, replacement) in replacements:
println(u"""
- if (IsUnicodeType(type, "{}")) {{
+ if (Is{}Type(type, "{}")) {{
return "{}";
- }}""".format(type, replacement).strip("\n"))
+ }}""".format(extension, type, replacement).strip("\n"))
println(u"""
}""".lstrip("\n"))
diff --git a/js/src/vm/SelfHosting.cpp b/js/src/vm/SelfHosting.cpp
index ef007a69db..6446cbb4be 100644
--- a/js/src/vm/SelfHosting.cpp
+++ b/js/src/vm/SelfHosting.cpp
@@ -2487,6 +2487,7 @@ static const JSFunctionSpec intrinsic_functions[] = {
JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0),
JS_FN("intl_ValidateAndCanonicalizeLanguageTag", intl_ValidateAndCanonicalizeLanguageTag, 2, 0),
JS_FN("intl_TryValidateAndCanonicalizeLanguageTag", intl_TryValidateAndCanonicalizeLanguageTag, 1, 0),
+ JS_FN("intl_ValidateAndCanonicalizeUnicodeExtensionType", intl_ValidateAndCanonicalizeUnicodeExtensionType, 3, 0),
JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0),
JS_INLINABLE_FN("IsCollator",