summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martok <martok@martoks-place.de> 2023-06-29 23:07:51 +0200
committer Martok <martok@martoks-place.de> 2023-06-30 00:01:35 +0200
commit 1a9d6d6372fb1fc585e21af53ccfafd6f89eda73 (patch)
tree f5780e47a59bbbf9408147ecbda630897bfac96d
parent 2f940bdc9dcbfe83e17ed26c5d1af7fe874c24ac (diff)
download uxp-1a9d6d6372fb1fc585e21af53ccfafd6f89eda73.tar.gz
Issue #1819 - Implement Intl.Locale proposal
This is according to spec for the mozilla71 cycle; a follow-up will further adjust to spec.
- Add Intl.Locale as native C++
- Port Unicode BCP 47 locale identifier parser to C++
- Port language tag parser to C++
- Adjust make_intl_data to generate the data
Based-on: m-c 1433303, 1570370
-rw-r--r-- js/public/Class.h 2
-rw-r--r-- js/src/builtin/String.js 4
-rw-r--r-- js/src/builtin/intl/CommonFunctions.h 4
-rw-r--r-- js/src/builtin/intl/CommonFunctions.js 1102
-rw-r--r-- js/src/builtin/intl/IntlObject.cpp 6
-rw-r--r-- js/src/builtin/intl/LangTagMappingsGenerated.js 1246
-rw-r--r-- js/src/builtin/intl/LanguageTag.cpp 1677
-rw-r--r-- js/src/builtin/intl/LanguageTag.h 722
-rw-r--r-- js/src/builtin/intl/LanguageTagGenerated.cpp 790
-rw-r--r-- js/src/builtin/intl/Locale.cpp 1372
-rw-r--r-- js/src/builtin/intl/Locale.h 61
-rw-r--r-- js/src/builtin/intl/make_intl_data.py 1118
-rw-r--r-- js/src/moz.build 4
-rw-r--r-- js/src/vm/CommonPropertyNames.h 8
-rw-r--r-- js/src/vm/GlobalObject.h 6
-rw-r--r-- js/src/vm/SelfHosting.cpp 3
-rw-r--r-- js/src/vm/String.h 14
-rw-r--r-- js/src/vm/StringBuffer.cpp 6
18 files changed, 5545 insertions, 2600 deletions
diff --git a/js/public/Class.h b/js/public/Class.h
index 1c785646e3..f1d7739718 100644
--- a/js/public/Class.h
+++ b/js/public/Class.h
@@ -913,7 +913,7 @@ struct JSClass {
// application.
#define JSCLASS_GLOBAL_APPLICATION_SLOTS 5
#define JSCLASS_GLOBAL_SLOT_COUNT \
- (JSCLASS_GLOBAL_APPLICATION_SLOTS + JSProto_LIMIT * 2 + 49)
+ (JSCLASS_GLOBAL_APPLICATION_SLOTS + JSProto_LIMIT * 2 + 50)
#define JSCLASS_GLOBAL_FLAGS_WITH_SLOTS(n) \
(JSCLASS_IS_GLOBAL | JSCLASS_HAS_RESERVED_SLOTS(JSCLASS_GLOBAL_SLOT_COUNT + (n)))
#define JSCLASS_GLOBAL_FLAGS \
diff --git a/js/src/builtin/String.js b/js/src/builtin/String.js
index 0fab35966a..e1c32482ae 100644
--- a/js/src/builtin/String.js
+++ b/js/src/builtin/String.js
@@ -752,7 +752,7 @@ function String_toLocaleLowerCase() {
requestedLocale = undefined;
} else if (typeof locales === "string") {
// Steps 3, 5.
- requestedLocale = ValidateAndCanonicalizeLanguageTag(locales);
+ requestedLocale = intl_ValidateAndCanonicalizeLanguageTag(locales, false);
} else {
// Step 3.
var requestedLocales = CanonicalizeLocaleList(locales);
@@ -793,7 +793,7 @@ function String_toLocaleUpperCase() {
requestedLocale = undefined;
} else if (typeof locales === "string") {
// Steps 3, 5.
- requestedLocale = ValidateAndCanonicalizeLanguageTag(locales);
+ requestedLocale = intl_ValidateAndCanonicalizeLanguageTag(locales, false);
} else {
// Step 3.
var requestedLocales = CanonicalizeLocaleList(locales);
diff --git a/js/src/builtin/intl/CommonFunctions.h b/js/src/builtin/intl/CommonFunctions.h
index 256db49b18..12b4da4a72 100644
--- a/js/src/builtin/intl/CommonFunctions.h
+++ b/js/src/builtin/intl/CommonFunctions.h
@@ -89,9 +89,9 @@ static_assert(mozilla::IsSame<UChar, char16_t>::value,
// buffer's entire inline capacity before growing it and heap-allocating.
static const size_t INITIAL_CHAR_BUFFER_SIZE = 32;
-template <typename ICUStringFunction, size_t InlineCapacity>
+template <typename ICUStringFunction, typename CharT, size_t InlineCapacity>
static int32_t
-CallICU(JSContext* cx, Vector<char16_t, InlineCapacity>& chars, const ICUStringFunction& strFn)
+CallICU(JSContext* cx, Vector<CharT, InlineCapacity>& chars, const ICUStringFunction& strFn)
{
MOZ_ASSERT(chars.length() == 0);
MOZ_ALWAYS_TRUE(chars.resize(InlineCapacity));
diff --git a/js/src/builtin/intl/CommonFunctions.js b/js/src/builtin/intl/CommonFunctions.js
index 36b2bec9b2..9fad595979 100644
--- a/js/src/builtin/intl/CommonFunctions.js
+++ b/js/src/builtin/intl/CommonFunctions.js
@@ -13,6 +13,19 @@ function hasOwn(propName, object) {
return callFunction(std_Object_hasOwnProperty, object, propName);
}
+#ifdef DEBUG
+#define assertIsValidAndCanonicalLanguageTag(locale, desc) \
+ do { \
+ let canonical = intl_TryValidateAndCanonicalizeLanguageTag(locale); \
+ assert(canonical !== null, \
+ `${desc} is a structurally valid language tag`); \
+ assert(canonical === locale, \
+ `${desc} is a canonicalized language tag`); \
+ } while (false)
+#else
+#define assertIsValidAndCanonicalLanguageTag(locale, desc) ; // Elided assertion.
+#endif
+
/**
* Returns the start index of a "Unicode locale extension sequence", which the
* specification defines as: "any substring of a language tag that starts with
@@ -46,8 +59,6 @@ function startOfUnicodeExtensions(locale) {
*/
function endOfUnicodeExtensions(locale, start) {
assert(typeof locale === "string", "locale is a string");
- assert(IsStructurallyValidLanguageTag(locale), "locale is a language tag");
- assert(CanonicalizeLanguageTag(locale) === locale, "locale is a canonicalized language tag");
assert(0 <= start && start < locale.length, "start is an index into locale");
assert(Substring(locale, start, 3) === "-u-", "start points to Unicode extension sequence");
@@ -95,10 +106,9 @@ function removeUnicodeExtensions(locale) {
var right = Substring(locale, end, locale.length - end);
var combined = left + right;
- assert(IsStructurallyValidLanguageTag(combined),
- "recombination produced an invalid language tag");
+ assertIsValidAndCanonicalLanguageTag(combined, "the recombined locale");
assert(startOfUnicodeExtensions(combined) < 0,
- "recombination failed to remove all Unicode locale extension sequences");
+ "recombination failed to remove all Unicode locale extension sequences");
return combined;
}
@@ -114,1000 +124,6 @@ function getUnicodeExtensions(locale) {
return Substring(locale, start, end - start);
}
-// The three possible token type bits. Expressed as #defines to avoid
-// extra named lookups in the interpreter/jits.
-#define NONE 0b00
-#define ALPHA 0b01
-#define DIGIT 0b10
-
-// Constants for code units used below.
-#define HYPHEN 0x2D
-#define DIGIT_ZERO 0x30
-#define DIGIT_NINE 0x39
-#define UPPER_A 0x41
-#define UPPER_Z 0x5A
-#define LOWER_A 0x61
-#define LOWER_T 0x74
-#define LOWER_U 0x75
-#define LOWER_X 0x78
-#define LOWER_Z 0x7A
-
-// The requirement to use callFunction() for method calls makes the parser
-// harder to read. Use macros for the rescue.
-
-// Reads the next token.
-#define NEXT_TOKEN_OR_RETURN_NULL(ts) \
- if (!callFunction(ts.nextToken, ts)) \
- return null;
-
-#define NEXT_TOKEN_OR_ASSERT(ts) \
- if (!callFunction(ts.nextToken, ts)) \
- assert(false, "unexpected invalid subtag");
-
-// Assigns the current subtag part transformed to lower-case to the target.
-#define SUBTAG_VAR_OR_RETURN_NULL(ts, target) \
- { \
- target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \
- NEXT_TOKEN_OR_RETURN_NULL(ts); \
- }
-
-// Assigns the current subtag part transformed to lower-case to the target.
-#define SUBTAG_VAR_OR_ASSERT(ts, target) \
- { \
- target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \
- NEXT_TOKEN_OR_ASSERT(ts) \
- }
-
-/**
- * Tokenizer for Unicode BCP 47 locale identifiers.
- */
-function BCP47TokenStream(locale) {
- this.locale = locale;
-
- // Locale identifiers are compared and processed case-insensitively, so
- // technically it's not necessary to adjust case. But for easier processing,
- // and because the canonical form for most subtags is lower case, we start
- // with lower case for all.
- //
- // Note that the tokenizer function keeps using the original input string
- // to properly detect non-ASCII characters. The lower-case string can't be
- // used to detect those characters, because some non-ASCII characters
- // lower-case map into ASCII characters, e.g. U+212A (KELVIN SIGN) lower-
- // case maps to U+006B (LATIN SMALL LETTER K).
- this.localeLowercase = callFunction(std_String_toLowerCase, locale);
-
- // Current parse index in |locale|.
- this.index = 0;
-
- // The current token type, its start index, and its length.
- this.token = NONE;
- this.tokenStart = 0;
- this.tokenLength = 0;
-
- assert(std_String_fromCharCode(HYPHEN) === "-" &&
- std_String_fromCharCode(DIGIT_ZERO) === "0" &&
- std_String_fromCharCode(DIGIT_NINE) === "9" &&
- std_String_fromCharCode(UPPER_A) === "A" &&
- std_String_fromCharCode(UPPER_Z) === "Z" &&
- std_String_fromCharCode(LOWER_A) === "a" &&
- std_String_fromCharCode(LOWER_T) === "t" &&
- std_String_fromCharCode(LOWER_U) === "u" &&
- std_String_fromCharCode(LOWER_X) === "x" &&
- std_String_fromCharCode(LOWER_Z) === "z",
- "code unit constants should match the expected characters");
-}
-
-MakeConstructible(BCP47TokenStream, {
- __proto__: null,
-
- // Reads the next token, returns |false| if an illegal character was found,
- // otherwise returns |true|.
- //
- // eslint-disable-next-line object-shorthand
- nextToken: function() {
- var type = NONE;
- var {index, locale} = this;
- for (var i = index; i < locale.length; i++) {
- // UTS 35, section 3.1.
- // alpha = [A-Z a-z] ;
- // digit = [0-9] ;
- var c = callFunction(std_String_charCodeAt, locale, i);
- if ((UPPER_A <= c && c <= UPPER_Z) || (LOWER_A <= c && c <= LOWER_Z))
- type |= ALPHA;
- else if (DIGIT_ZERO <= c && c <= DIGIT_NINE)
- type |= DIGIT;
- else if (c === HYPHEN && i > index && i + 1 < locale.length)
- break;
- else
- return false;
- }
-
- this.token = type;
- this.tokenStart = index;
- this.tokenLength = i - index;
- this.index = i + 1;
- return true;
- },
-
- // Returns true if the character at the requested index within the current
- // token is a digit.
- //
- // eslint-disable-next-line object-shorthand
- isDigitAt: function(index) {
- assert(0 <= index && index < this.tokenLength,
- "must be an index into the current token");
- var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart + index);
- assert(!(c <= DIGIT_NINE) || c >= DIGIT_ZERO,
- "token-start-code-unit <= '9' implies token-start-code-unit is in '0'..'9' " +
- "and because all digits are sorted before any letters");
- return c <= DIGIT_NINE;
- },
-
- // Returns the code unit of the first character at the current token
- // position. Always returns the lower-case form of an alphabetical
- // character.
- //
- // eslint-disable-next-line object-shorthand
- singletonKey: function() {
- assert(this.tokenLength === 1, "token is not a singleton");
- var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart);
- assert((DIGIT_ZERO <= c && c <= DIGIT_NINE) || (LOWER_A <= c && c <= LOWER_Z),
- "unexpected code unit");
- return c;
- },
-
- // eslint-disable-next-line object-shorthand
- singletonValue: function() {
- var singletonStart = this.tokenStart;
- var min = callFunction(this.singletonKey, this) === LOWER_X ? 1 : 2;
-
- NEXT_TOKEN_OR_RETURN_NULL(this);
-
- // At least one non-singleton subtag must be present.
- if (!(min <= this.tokenLength && this.tokenLength <= 8))
- return null;
- do {
- NEXT_TOKEN_OR_RETURN_NULL(this);
- } while (min <= this.tokenLength && this.tokenLength <= 8);
-
- return callFunction(this.singletonValueAt, this, singletonStart);
- },
-
- // eslint-disable-next-line object-shorthand
- singletonValueAt: function(start) {
- // Singletons must be followed by a non-singleton subtag, "en-a-b" is not allowed.
- var length = this.tokenStart - 1 - start;
- if (length <= 2)
- return null;
- return Substring(this.localeLowercase, start, length);
- }
-});
-
-/* eslint-disable complexity */
-/**
- * Parser for Unicode BCP 47 locale identifiers.
- *
- * Returns null if |locale| can't be parsed as a `unicode_locale_id`. If the
- * input is a grandfathered language tag, it is directly canonicalized to its
- * modern form. The returned object has the following structure:
- *
- * {
- * language: `unicode_language_subtag`,
- * script: `unicode_script_subtag` / undefined,
- * region: `unicode_region_subtag` / undefined,
- * variants: array of `unicode_variant_subtag`,
- * extensions: array of `extensions`,
- * privateuse: `pu_extensions` / undefined,
- * }
- *
- * All locale identifier subtags are returned in their normalized case:
- *
- * var langtag = parseLanguageTag("en-latn-us");
- * assertEq("en", langtag.language);
- * assertEq("Latn", langtag.script);
- * assertEq("US", langtag.region);
- *
- * Spec: https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers
- */
-function parseLanguageTag(locale) {
- assert(typeof locale === "string", "locale is a string");
-
- // unicode_locale_id = unicode_language_id
- // extensions*
- // pu_extensions? ;
- var ts = new BCP47TokenStream(locale);
- NEXT_TOKEN_OR_RETURN_NULL(ts);
-
- var language, script, region, privateuse;
- var variants = [];
- var extensions = [];
-
- // unicode_language_id = unicode_language_subtag
- // (sep unicode_script_subtag)?
- // (sep unicode_region_subtag)?
- // (sep unicode_variant_subtag)* ;
- //
- // sep = "-"
- //
- // Note: Unicode CLDR locale identifier backward compatibility extensions
- // removed from `unicode_language_id`.
-
- // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
- if (ts.token !== ALPHA || ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) {
- // Four character language subtags are not allowed in Unicode BCP 47
- // locale identifiers. Also see the comparison to Unicode CLDR locale
- // identifiers in <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
- return null;
- }
- assert((2 <= ts.tokenLength && ts.tokenLength <= 3) ||
- (5 <= ts.tokenLength && ts.tokenLength <= 8),
- "language subtags have 2-3 or 5-8 letters");
-
- SUBTAG_VAR_OR_RETURN_NULL(ts, language);
-
- // unicode_script_subtag = alpha{4} ;
- if (ts.tokenLength === 4 && ts.token === ALPHA) {
- SUBTAG_VAR_OR_RETURN_NULL(ts, script);
-
- // The first character of a script code needs to be capitalized.
- // "hans" -> "Hans"
- script = callFunction(std_String_toUpperCase, script[0]) +
- Substring(script, 1, script.length - 1);
- }
-
- // unicode_region_subtag = (alpha{2} | digit{3}) ;
- if ((ts.tokenLength === 2 && ts.token === ALPHA) ||
- (ts.tokenLength === 3 && ts.token === DIGIT))
- {
- SUBTAG_VAR_OR_RETURN_NULL(ts, region);
-
- // Region codes need to be in upper-case. "bu" -> "BU"
- region = callFunction(std_String_toUpperCase, region);
- }
-
- // unicode_variant_subtag = (alphanum{5,8}
- // | digit alphanum{3}) ;
- //
- // alphanum = [0-9 A-Z a-z] ;
- while ((5 <= ts.tokenLength && ts.tokenLength <= 8) ||
- (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0)))
- {
- // Locale identifiers are case insensitive (UTS 35, section 3.2).
- // All seen variants are compared ignoring case differences by
- // using the lower-case form. This allows to properly detect and
- // reject variant repetitions with differing case, e.g.
- // "en-variant-Variant".
- var variant;
- SUBTAG_VAR_OR_RETURN_NULL(ts, variant);
-
- // Reject the Locale identifier if a duplicate variant was found.
- //
- // This linear-time verification step means the whole variant
- // subtag checking is potentially quadratic, but we're okay doing
- // that because language tags are unlikely to be deliberately
- // pathological.
- if (callFunction(ArrayIndexOf, variants, variant) !== -1)
- return null;
- _DefineDataProperty(variants, variants.length, variant);
- }
-
- // extensions = unicode_locale_extensions
- // | transformed_extensions
- // | other_extensions ;
- //
- // unicode_locale_extensions = sep [uU]
- // ((sep keyword)+
- // |(sep attribute)+ (sep keyword)*) ;
- //
- // transformed_extensions = sep [tT]
- // ((sep tlang (sep tfield)*)
- // |(sep tfield)+) ;
- //
- // other_extensions = [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
- //
- // keyword = key (sep type)? ;
- //
- // key = alphanum alpha ;
- //
- // type = alphanum{3,8} (sep alphanum{3,8})* ;
- //
- // attribute = alphanum{3,8} ;
- //
- // tlang = unicode_language_subtag
- // (sep unicode_script_subtag)?
- // (sep unicode_region_subtag)?
- // (sep unicode_variant_subtag)* ;
- //
- // tfield = tkey tvalue;
- //
- // tkey = alpha digit ;
- //
- // tvalue = (sep alphanum{3,8})+ ;
- var seenSingletons = [];
- while (ts.tokenLength === 1) {
- var singleton = callFunction(ts.singletonKey, ts);
- if (singleton === LOWER_X)
- break;
-
- // Locale identifiers are case insensitive (UTS 35, section 3.2).
- // Ensure |singletonKey()| does not return the code unit of an
- // upper-case character, so we can properly detect and reject
- // singletons with different case, e.g. "en-u-foo-U-foo".
- assert(!(UPPER_A <= singleton && singleton <= UPPER_Z),
- "unexpected upper-case code unit");
-
- // Reject the input if a duplicate singleton was found.
- //
- // Similar to the variant validation step this check is O(n**2),
- // but given that there are only 35 possible singletons the
- // quadratic runtime is negligible.
- if (callFunction(ArrayIndexOf, seenSingletons, singleton) !== -1)
- return null;
- _DefineDataProperty(seenSingletons, seenSingletons.length, singleton);
-
- var extension;
- if (singleton === LOWER_U) {
- var extensionStart = ts.tokenStart;
- NEXT_TOKEN_OR_RETURN_NULL(ts);
-
- while (2 <= ts.tokenLength && ts.tokenLength <= 8) {
- // `key` doesn't allow a digit as its second character.
- if (ts.tokenLength === 2 && callFunction(ts.isDigitAt, ts, 1))
- return null;
- NEXT_TOKEN_OR_RETURN_NULL(ts);
- }
- extension = callFunction(ts.singletonValueAt, ts, extensionStart);
- } else if (singleton === LOWER_T) {
- var extensionStart = ts.tokenStart;
- NEXT_TOKEN_OR_RETURN_NULL(ts);
-
- // `tfield` starts with `tkey`, which in turn is `alpha digit`, so
- // an alpha-only token must be a `tlang`.
- if (ts.token === ALPHA) {
- // `unicode_language_subtag`
- if (ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8)
- return null;
- NEXT_TOKEN_OR_RETURN_NULL(ts);
-
- // `unicode_script_subtag` (optional)
- if (ts.tokenLength === 4 && ts.token === ALPHA) {
- NEXT_TOKEN_OR_RETURN_NULL(ts);
- }
-
- // `unicode_region_subtag` (optional)
- if ((ts.tokenLength === 2 && ts.token === ALPHA) ||
- (ts.tokenLength === 3 && ts.token === DIGIT))
- {
- NEXT_TOKEN_OR_RETURN_NULL(ts);
- }
-
- // `unicode_variant_subtag` (optional)
- while ((5 <= ts.tokenLength && ts.tokenLength <= 8) ||
- (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0)))
- {
- NEXT_TOKEN_OR_RETURN_NULL(ts);
- }
- }
-
- // Trailing `tfield` subtags.
- while (ts.tokenLength === 2) {
- // `tkey` is `alpha digit`.
- if (callFunction(ts.isDigitAt, ts, 0) ||
- !callFunction(ts.isDigitAt, ts, 1))
- {
- return null;
- }
- NEXT_TOKEN_OR_RETURN_NULL(ts);
-
- // `tfield` requires at least one `tvalue`.
- if (!(3 <= ts.tokenLength && ts.tokenLength <= 8))
- return null;
- do {
- NEXT_TOKEN_OR_RETURN_NULL(ts);
- } while (3 <= ts.tokenLength && ts.tokenLength <= 8);
- }
- extension = callFunction(ts.singletonValueAt, ts, extensionStart);
- } else {
- extension = callFunction(ts.singletonValue, ts);
- }
- if (!extension)
- return null;
-
- _DefineDataProperty(extensions, extensions.length, extension);
- }
-
- // Trailing pu_extensions component of the unicode_locale_id production.
- //
- // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
- if (ts.tokenLength === 1 && callFunction(ts.singletonKey, ts) === LOWER_X) {
- privateuse = callFunction(ts.singletonValue, ts);
- if (!privateuse)
- return null;
- }
-
- // Reject the input if it couldn't be parsed completely.
- if (ts.token !== NONE)
- return null;
-
- var tagObj = {
- language,
- script,
- region,
- variants,
- extensions,
- privateuse,
- };
-
- // Handle grandfathered tags right away, so we don't need to have extra
- // paths for grandfathered tags later on.
- //
- // grandfathered = "art-lojban" ; non-redundant tags registered
- // / "cel-gaulish" ; during the RFC 3066 era
- // / "zh-guoyu" ; these tags match the 'langtag'
- // / "zh-hakka" ; production, but their subtags
- // / "zh-xiang" ; are not extended language
- // ; or variant subtags: their meaning
- // ; is defined by their registration
- // ; and all of these are deprecated
- // ; in favor of a more modern
- // ; subtag or sequence of subtags
- if (hasOwn(ts.localeLowercase, grandfatheredMappings))
- updateGrandfatheredMappings(tagObj);
-
- // Return if the complete input was successfully parsed.
- return tagObj;
-}
-
-/**
- * Return the locale and fields components of the given valid Transform
- * extension subtag.
- */
-function TransformExtensionComponents(extension) {
- assert(typeof extension === "string", "extension is a String value");
- assert(callFunction(std_String_startsWith, extension, "t-"),
- "extension starts with 't-'");
-
- var ts = new BCP47TokenStream(Substring(extension, 2, extension.length - 2));
- NEXT_TOKEN_OR_ASSERT(ts);
-
- // `tfield` starts with `tkey`, which in turn is `alpha digit`, so
- // an alpha-only token must be a `tlang`.
- var localeObj;
- if (ts.token === ALPHA) {
- // `unicode_language_subtag`
- assert((2 <= ts.tokenLength && ts.tokenLength <= 3) ||
- (5 <= ts.tokenLength && ts.tokenLength <= 8),
- "language subtags have 2-3 or 5-8 letters");
-
- var language;
- SUBTAG_VAR_OR_ASSERT(ts, language);
-
- // unicode_script_subtag = alpha{4} ;
- var script;
- if (ts.tokenLength === 4 && ts.token === ALPHA) {
- SUBTAG_VAR_OR_ASSERT(ts, script);
-
- // The first character of a script code needs to be capitalized.
- // "hans" -> "Hans"
- script = callFunction(std_String_toUpperCase, script[0]) +
- Substring(script, 1, script.length - 1);
- }
-
- // unicode_region_subtag = (alpha{2} | digit{3}) ;
- var region;
- if ((ts.tokenLength === 2 && ts.token === ALPHA) ||
- (ts.tokenLength === 3 && ts.token === DIGIT))
- {
- SUBTAG_VAR_OR_ASSERT(ts, region);
-
- // Region codes need to be in upper-case. "bu" -> "BU"
- region = callFunction(std_String_toUpperCase, region);
- }
-
- // unicode_variant_subtag = (alphanum{5,8}
- // | digit alphanum{3}) ;
- //
- // alphanum = [0-9 A-Z a-z] ;
- var variants = [];
- while ((5 <= ts.tokenLength && ts.tokenLength <= 8) ||
- (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0)))
- {
- var variant;
- SUBTAG_VAR_OR_ASSERT(ts, variant);
-
- _DefineDataProperty(variants, variants.length, variant);
- }
-
- localeObj = {
- language,
- script,
- region,
- variants,
- extensions: [],
- privateuse: undefined,
- };
- }
-
- // Trailing `tfield` subtags. (Any other trailing subtags are an error,
- // because we're guaranteed to only see a valid tranform extension here.)
- var fields = [];
- while (ts.tokenLength === 2) {
- // `tkey` is `alpha digit`.
- assert(!callFunction(ts.isDigitAt, ts, 0) && callFunction(ts.isDigitAt, ts, 1),
- "unexpected invalid tkey subtag");
-
- var key;
- SUBTAG_VAR_OR_ASSERT(ts, key);
-
- // `tfield` requires at least one `tvalue`.
- assert(3 <= ts.tokenLength && ts.tokenLength <= 8,
- "unexpected invalid tvalue subtag");
-
- var value;
- SUBTAG_VAR_OR_ASSERT(ts, value);
-
- while (3 <= ts.tokenLength && ts.tokenLength <= 8) {
- var part;
- SUBTAG_VAR_OR_ASSERT(ts, part);
- value += "-" + part;
- }
-
- _DefineDataProperty(fields, fields.length, {key, value});
- }
-
- assert(ts.token === NONE,
- "unexpected trailing characters in promised-to-be-valid transform extension");
-
- return {locale: localeObj, fields};
-}
-/* eslint-enable complexity */
-
-#undef NONE
-#undef ALPHA
-#undef DIGIT
-
-#undef HYPHEN
-#undef DIGIT_ZERO
-#undef DIGIT_NINE
-#undef UPPER_A
-#undef UPPER_Z
-#undef LOWER_A
-#undef LOWER_T
-#undef LOWER_U
-#undef LOWER_X
-#undef LOWER_Z
-
-#undef SUBTAG_VAR_OR_ASSERT
-#undef SUBTAG_VAR_OR_RETURN_NULL
-#undef NEXT_TOKEN_OR_ASSERT
-#undef NEXT_TOKEN_OR_RETURN_NULL
-
-/**
- * Verifies that the given string is a well-formed BCP 47 language tag
- * with no duplicate variant or singleton subtags.
- *
- * Spec: ECMAScript Internationalization API Specification, 6.2.2.
- */
-function IsStructurallyValidLanguageTag(locale) {
- return parseLanguageTag(locale) !== null;
-}
-
-/**
- * Canonicalizes the given structurally valid Unicode BCP 47 locale identifier,
- * including regularized case of subtags. For example, the language tag
- * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where
- *
- * Zh ; 2*3ALPHA
- * -haNS ; ["-" script]
- * -bu ; ["-" region]
- * -variant2 ; *("-" variant)
- * -Variant1
- * -u-ca-chinese ; *("-" extension)
- * -t-Zh-laTN
- * -x-PRIVATE ; ["-" privateuse]
- *
- * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
- *
- * UTS 35 specifies two different canonicalization algorithms. There's one to
- * canonicalize BCP 47 language tags and other one to canonicalize Unicode
- * locale identifiers. The latter one wasn't present when ECMA-402 was changed
- * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags, so
- * ECMA-402 currently only uses the former to canonicalize Unicode BCP 47 locale
- * identifiers.
- *
- * Spec: ECMAScript Internationalization API Specification, 6.2.3.
- * Spec: https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
- * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion
- */
-function CanonicalizeLanguageTagObject(localeObj) {
- assert(IsObject(localeObj), "CanonicalizeLanguageTagObject");
-
- // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by
- // normalizing the case and ordering all subtags. The canonical syntax form
- // itself is specified in UTS 35, 3.2.1.
-
- // The parser already normalized the case for all subtags.
-
-#ifdef DEBUG
- function IsLowerCase(s) {
- return s === callFunction(std_String_toLowerCase, s);
- }
- function IsUpperCase(s) {
- return s === callFunction(std_String_toUpperCase, s);
- }
- function IsTitleCase(s) {
- assert(s.length > 0, "unexpected empy string");
- var r = callFunction(std_String_toUpperCase, s[0]) +
- callFunction(std_String_toLowerCase, Substring(s, 1, s.length - 1));
- return s === r;
- }
-#endif
-
- // 1. Any script subtag is in title case.
- assert(localeObj.script === undefined || IsTitleCase(localeObj.script),
- "If present, script subtag is in title case");
-
- // 2. Any region subtag is in uppercase.
- assert(localeObj.region === undefined || IsUpperCase(localeObj.region),
- "If present, region subtag is in upper case");
-
- // 3. All other subtags are in lowercase.
- assert(IsLowerCase(localeObj.language),
- "language subtag is in lower case");
- assert(callFunction(ArrayEvery, localeObj.variants, IsLowerCase),
- "variant subtags are in lower case");
- assert(callFunction(ArrayEvery, localeObj.extensions, IsLowerCase),
- "extension subtags are in lower case");
- assert(localeObj.privateuse === undefined || IsLowerCase(localeObj.privateuse),
- "If present, privateuse subtag is in lower case");
-
-
- // The second step in UTS 35, 3.2.1, is to order all subtags.
-
- // 1. Any variants are in alphabetical order.
- var variants = localeObj.variants;
- if (variants.length > 0) {
- callFunction(ArraySort, variants);
- }
-
- // 2. Any extensions are in alphabetical order by their singleton.
- var extensions = localeObj.extensions;
- if (extensions.length > 0) {
- // Extension sequences are sorted by their singleton characters.
- // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese"
- callFunction(ArraySort, extensions);
-
- // The last three bullet points in UTS 35, 3.2.1 apply only to Unicode and Transform
- // extensions.
- //
- // 3. All attributes are sorted in alphabetical order.
- //
- // 4. All keywords and tfields are sorted by alphabetical order of their
- // keys, within their respective extensions.
- //
- // 5. Any type or tfield value "true" is removed.
-
- for (var i = 0; i < extensions.length; i++) {
- var ext = extensions[i];
- assert(IsLowerCase(ext),
- "extension subtags must be in lower-case");
- assert(ext[1] === "-",
- "extension subtags start with a singleton");
-
- // Canonicalize Unicode locale extension subtag if present.
- if (ext[0] === "u") {
- var {attributes, keywords} = UnicodeExtensionComponents(ext);
- extensions[i] = CanonicalizeUnicodeExtension(attributes, keywords);
- }
-
- // Canonicalize Unicode BCP 47 T extension if present.
- if (ext[0] === "t") {
- var {locale, fields} = TransformExtensionComponents(ext);
- extensions[i] = CanonicalizeTransformExtension(locale, fields);
- }
- }
- }
-
- // The next two steps in 3.3.1 replace deprecated language and region
- // subtags with their preferred mappings.
- updateLocaleIdMappings(localeObj);
-
- // The two final steps in 3.3.1, handling irregular grandfathered and
- // private-use only language tags, don't apply, because these two forms
- // can't occur in Unicode BCP 47 locale identifiers.
-}
-
-/**
- * Intl.Locale proposal
- *
- * UnicodeExtensionComponents( extension )
- *
- * Returns the components of |extension| where |extension| is a "Unicode locale
- * extension sequence" (ECMA-402, 6.2.1) without the starting separator
- * character.
- */
-function UnicodeExtensionComponents(extension) {
- assert(typeof extension === "string", "extension is a String value");
-
- // Step 1.
- var attributes = [];
-
- // Step 2.
- var keywords = [];
-
- // Step 3.
- var isKeyword = false;
-
- // Step 4.
- var size = extension.length;
-
- // Step 5.
- // |extension| starts with "u-" instead of "-u-" in our implementation, so
- // we need to initialize |k| with 2 instead of 3.
- assert(callFunction(std_String_startsWith, extension, "u-"),
- "extension starts with 'u-'");
- var k = 2;
-
- // Step 6.
- var key, value;
- while (k < size) {
- // Step 6.a.
- var e = callFunction(std_String_indexOf, extension, "-", k);
-
- // Step 6.b.
- var len = (e < 0 ? size : e) - k;
-
- // Step 6.c.
- var subtag = Substring(extension, k, len);
-
- // Steps 6.d-e.
- if (!isKeyword) {
- // Step 6.d.
- // NB: Duplicates are handled elsewhere in our implementation.
- if (len !== 2)
- _DefineDataProperty(attributes, attributes.length, subtag);
- } else {
- // Steps 6.e.i-ii.
- if (len === 2) {
- // Step 6.e.i.1.
- // NB: Duplicates are handled elsewhere in our implementation.
- _DefineDataProperty(keywords, keywords.length, {key, value});
- } else {
- // Step 6.e.ii.1.
- if (value !== "")
- value += "-";
-
- // Step 6.e.ii.2.
- value += subtag;
- }
- }
-
- // Step 6.f.
- if (len === 2) {
- // Step 6.f.i.
- isKeyword = true;
-
- // Step 6.f.ii.
- key = subtag;
-
- // Step 6.f.iii.
- value = "";
- }
-
- // Step 6.g.
- k += len + 1;
- }
-
- // Step 7.
- if (isKeyword) {
- // Step 7.a.
- // NB: Duplicates are handled elsewhere in our implementation.
- _DefineDataProperty(keywords, keywords.length, {key, value});
- }
-
- // Step 8.
- return {attributes, keywords};
-}
-
-/**
- * CanonicalizeUnicodeExtension( attributes, keywords )
- *
- * Canonical syntax per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>:
- *
- * - All attributes and keywords are in lowercase.
- * - Note: The parser already converted keywords to lowercase.
- * - All attributes are sorted in alphabetical order.
- * - All keywords are sorted by alphabetical order of their keys.
- * - Any type value "true" is removed.
- *
- * Canonical form:
- * - All keys and types use the canonical form (from the name attribute;
- * see Section 3.6.4 U Extension Data Files).
- */
-function CanonicalizeUnicodeExtension(attributes, keywords) {
- assert(attributes.length > 0 || keywords.length > 0,
- "unexpected empty Unicode locale extension components");
-
- // All attributes are sorted in alphabetical order.
- if (attributes.length > 1)
- callFunction(ArraySort, attributes);
-
- // All keywords are sorted by alphabetical order of keys.
- if (keywords.length > 1) {
- function UnicodeKeySort(left, right) {
- var leftKey = left.key;
- var rightKey = right.key;
- assert(leftKey.length === 2, "left key is a Unicode key");
- assert(rightKey.length === 2, "right key is a Unicode key");
-
- // Compare both strings using charCodeAt(), because relational
- // string comparison always calls into the VM, whereas charCodeAt
- // can be inlined by Ion.
- var diff = callFunction(std_String_charCodeAt, leftKey, 0) -
- callFunction(std_String_charCodeAt, rightKey, 0);
- if (diff === 0) {
- diff = callFunction(std_String_charCodeAt, leftKey, 1) -
- callFunction(std_String_charCodeAt, rightKey, 1);
- }
- return diff;
- }
-
- callFunction(ArraySort, keywords, UnicodeKeySort);
- }
-
- var extension = "u";
-
- // Append all attributes.
- for (var i = 0; i < attributes.length; i++) {
- extension += "-" + attributes[i];
- }
-
- // Append all keywords.
- for (var i = 0; i < keywords.length; i++) {
- var {key, value} = keywords[i];
- extension += "-" + key;
-
- // Type value "true" is removed.
- if (value !== "" && value !== "true")
- extension += "-" + value;
- }
-
- return extension;
-}
-
-/**
- * CanonicalizeTransformExtension
- *
- * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>:
- *
- * - These subtags are all in lowercase (that is the canonical casing for these
- * subtags), [...].
- *
- * And per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>:
- *
- * - All keywords and tfields are sorted by alphabetical order of their keys,
- * within their respective extensions.
- */
-function CanonicalizeTransformExtension(localeObj, fields) {
- assert(localeObj !== undefined || fields.length > 0,
- "unexpected empty Transform locale extension components");
-
- if (fields.length > 0) {
- function TransformKeySort(left, right) {
- var leftKey = left.key;
- var rightKey = right.key;
- assert(leftKey.length === 2, "left key is a Transform key");
- assert(rightKey.length === 2, "right key is a Transform key");
-
- // Compare both strings using charCodeAt(), because relational
- // string comparison always calls into the VM, whereas charCodeAt
- // can be inlined by Ion.
- var diff = callFunction(std_String_charCodeAt, leftKey, 0) -
- callFunction(std_String_charCodeAt, rightKey, 0);
- if (diff === 0) {
- diff = callFunction(std_String_charCodeAt, leftKey, 1) -
- callFunction(std_String_charCodeAt, rightKey, 1);
- }
- return diff;
- }
-
- callFunction(ArraySort, fields, TransformKeySort);
- }
-
- var extension = "t";
-
- // Append the language subtag if present.
- if (localeObj !== undefined) {
- // [1] is a bit unclear whether or not the `tlang` subtag also needs
- // to be canonicalized (and case-adjusted). For now simply append it as
- // is and change it to all lower-case. If we switch to [2], the `tlang`
- // subtag also needs to be canonicalized according to the same rules as
- // `unicode_language_id` subtags are canonicalized. Also see [3].
- //
- // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier
- // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
- // [3] https://github.com/tc39/ecma402/issues/330
- var localeStr = StringFromLanguageTagObject(localeObj);
- extension += "-" + callFunction(std_String_toLowerCase, localeStr);
- }
-
- // Append all fields.
- for (var i = 0; i < fields.length; i++) {
- // UTS 35, 3.2.1 specifies:
- // - Any type or tfield value "true" is removed.
- //
- // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so
- // ignore this apparently invalid part of the UTS 35 specification and
- // simply append all `tfield` subtags.
- var {key, value} = fields[i];
- extension += "-" + key + "-" + value;
- }
-
- return extension;
-}
-
-/**
- * Canonicalizes the given structurally valid BCP 47 language tag, including
- * regularized case of subtags. For example, the language tag
- * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where
- *
- * Zh ; 2*3ALPHA
- * -haNS ; ["-" script]
- * -bu ; ["-" region]
- * -variant2 ; *("-" variant)
- * -Variant1
- * -u-ca-chinese ; *("-" extension)
- * -t-Zh-laTN
- * -x-PRIVATE ; ["-" privateuse]
- *
- * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
- *
- * Spec: ECMAScript Internationalization API Specification, 6.2.3.
- */
-function CanonicalizeLanguageTag(locale) {
- var localeObj = parseLanguageTag(locale);
- assert(localeObj !== null, "CanonicalizeLanguageTag");
-
- CanonicalizeLanguageTagObject(localeObj);
-
- return StringFromLanguageTagObject(localeObj);
-}
-
-/**
- * Returns the string representation of the given language tag object.
- */
-function StringFromLanguageTagObject(localeObj) {
- assert(IsObject(localeObj), "StringFromLanguageTagObject");
-
- var {
- language,
- script,
- region,
- variants,
- extensions,
- privateuse,
- } = localeObj;
-
- var canonical = language;
-
- if (script !== undefined)
- canonical += "-" + script;
-
- if (region !== undefined)
- canonical += "-" + region;
-
- if (variants.length > 0)
- canonical += "-" + callFunction(std_Array_join, variants, "-");
-
- if (extensions.length > 0)
- canonical += "-" + callFunction(std_Array_join, extensions, "-");
-
- if (privateuse !== undefined)
- canonical += "-" + privateuse;
-
- return canonical;
-}
-
/**
* Returns true if the input contains only ASCII alphabetical characters.
*/
@@ -1122,50 +138,6 @@ function IsASCIIAlphaString(s) {
return true;
}
-
-/**
- * Validates and canonicalizes the given language tag.
- */
-function ValidateAndCanonicalizeLanguageTag(locale) {
- assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag");
-
- // Handle the common case (a standalone language) first.
- // Only the following Unicode BCP 47 locale identifier subset is accepted:
- // unicode_locale_id = unicode_language_id
- // unicode_language_id = unicode_language_subtag
- // unicode_language_subtag = alpha{2,3}
- if (locale.length === 2 || locale.length === 3) {
- if (!IsASCIIAlphaString(locale))
- ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);
- assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag");
-
- // The language subtag is canonicalized to lower case.
- locale = callFunction(std_String_toLowerCase, locale);
-
- // updateLocaleIdMappings may modify tags containing only |language|
- // subtags, if the language is in |complexLanguageMappings|, so we need
- // to handle that case first.
- if (!hasOwn(locale, complexLanguageMappings)) {
- // Replace deprecated subtags with their preferred values.
- locale = hasOwn(locale, languageMappings)
- ? languageMappings[locale]
- : locale;
- assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization");
-
- return locale;
- }
- }
-
- var localeObj = parseLanguageTag(locale);
- if (localeObj === null)
- ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);
-
- CanonicalizeLanguageTagObject(localeObj);
-
- return StringFromLanguageTagObject(localeObj);
-}
-
-
// The last-ditch locale is used if none of the available locales satisfies a
// request. "en-GB" is used based on the assumptions that English is the most
// common second language, that both en-GB and en-US are normally available in
@@ -1215,14 +187,10 @@ function DefaultLocaleIgnoringAvailableLocales() {
// If we didn't get a cache hit, compute the candidate default locale and
// cache it. Fall back on the last-ditch locale when necessary.
- var candidate = parseLanguageTag(runtimeDefaultLocale);
+ var candidate = intl_TryValidateAndCanonicalizeLanguageTag(runtimeDefaultLocale);
if (candidate === null) {
candidate = lastDitchLocale();
} else {
- CanonicalizeLanguageTagObject(candidate);
-
- candidate = StringFromLanguageTagObject(candidate);
-
// The default locale must be in [[availableLocales]], and that list
// must not contain any locales with Unicode extension sequences, so
// remove any present in the candidate.
@@ -1236,10 +204,7 @@ function DefaultLocaleIgnoringAvailableLocales() {
localeCandidateCache.candidateDefaultLocale = candidate;
localeCandidateCache.runtimeDefaultLocale = runtimeDefaultLocale;
- assert(IsStructurallyValidLanguageTag(candidate),
- "the candidate must be structurally valid");
- assert(startOfUnicodeExtensions(candidate) < 0,
- "the candidate must not contain a Unicode extension sequence");
+ assertIsValidAndCanonicalLanguageTag(candidate, "the candidate");
return candidate;
}
@@ -1275,10 +240,7 @@ function DefaultLocale() {
locale = lastDitchLocale();
}
- assert(IsStructurallyValidLanguageTag(locale),
- "the computed default locale must be structurally valid");
- assert(locale === CanonicalizeLanguageTag(locale),
- "the computed default locale must be canonical");
+ assertIsValidAndCanonicalLanguageTag(locale, "the computed default locale");
assert(startOfUnicodeExtensions(locale) < 0,
"the computed default locale must not contain a Unicode extension sequence");
@@ -1325,8 +287,12 @@ function CanonicalizeLocaleList(locales) {
return [];
// Step 3 (and the remaining steps).
- if (typeof locales === "string")
- return [ValidateAndCanonicalizeLanguageTag(locales)];
+ var tag = intl_ValidateAndCanonicalizeLanguageTag(locales, false);
+ if (tag !== null) {
+ assert(typeof tag === "string",
+ "intl_ValidateAndCanonicalizeLanguageTag returns a string value");
+ return [tag];
+ }
// Step 2.
var seen = [];
@@ -1351,11 +317,10 @@ function CanonicalizeLocaleList(locales) {
if (!(typeof kValue === "string" || IsObject(kValue)))
ThrowTypeError(JSMSG_INVALID_LOCALES_ELEMENT);
- // Step 7.c.iii.
- var tag = ToString(kValue);
-
- // Step 7.c.iv.
- tag = ValidateAndCanonicalizeLanguageTag(tag);
+ // Steps 7.c.iii-iv.
+ var tag = intl_ValidateAndCanonicalizeLanguageTag(kValue, true);
+ assert(typeof tag === "string",
+ "ValidateAndCanonicalizeLanguageTag returns a string value");
// Step 7.c.v.
if (callFunction(ArrayIndexOf, seen, tag) === -1)
@@ -1372,8 +337,7 @@ function CanonicalizeLocaleList(locales) {
function BestAvailableLocaleHelper(availableLocales, locale, considerDefaultLocale) {
- assert(IsStructurallyValidLanguageTag(locale), "invalid BestAvailableLocale locale structure");
- assert(locale === CanonicalizeLanguageTag(locale), "non-canonical BestAvailableLocale locale");
+ assertIsValidAndCanonicalLanguageTag(locale, "BestAvailableLocale locale");
assert(startOfUnicodeExtensions(locale) < 0, "locale must contain no Unicode extensions");
// In the spec, [[availableLocales]] is formally a list of all available
@@ -1703,13 +667,9 @@ function ResolveLocale(availableLocales, requestedLocales, options, relevantExte
foundLocale = preExtension + supportedExtension + postExtension;
}
- // Step 9.d.
- assert(IsStructurallyValidLanguageTag(foundLocale), "invalid locale after concatenation");
-
- // Step 9.e (Not required in this implementation, because we don't
+ // Step 9.d-e (Step 9.e is not required in this implementation, because we don't
// canonicalize Unicode extension subtags).
- assert(foundLocale === CanonicalizeLanguageTag(foundLocale), "same locale with extension");
-
+ assertIsValidAndCanonicalLanguageTag(foundLocale, "same locale with extension");
}
// Step 10.
diff --git a/js/src/builtin/intl/IntlObject.cpp b/js/src/builtin/intl/IntlObject.cpp
index c415079ae3..1c1a8d2477 100644
--- a/js/src/builtin/intl/IntlObject.cpp
+++ b/js/src/builtin/intl/IntlObject.cpp
@@ -19,6 +19,7 @@
#include "builtin/intl/CommonFunctions.h"
#include "builtin/intl/DateTimeFormat.h"
#include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/Locale.h"
#include "builtin/intl/NumberFormat.h"
#include "builtin/intl/PluralRules.h"
#include "builtin/intl/RelativeTimeFormat.h"
@@ -459,6 +460,10 @@ GlobalObject::initIntlObject(JSContext* cx, Handle<GlobalObject*> global)
dateTimeFormatProto = CreateDateTimeFormatPrototype(cx, intl, global, &dateTimeFormat, DateTimeFormatOptions::Standard);
if (!dateTimeFormatProto)
return false;
+ RootedObject localeProto(cx);
+ localeProto = CreateLocalePrototype(cx, intl, global);
+ if (!localeProto)
+ return false;
RootedObject numberFormatProto(cx), numberFormat(cx);
numberFormatProto = CreateNumberFormatPrototype(cx, intl, global, &numberFormat);
if (!numberFormatProto)
@@ -492,6 +497,7 @@ GlobalObject::initIntlObject(JSContext* cx, Handle<GlobalObject*> global)
global->setReservedSlot(COLLATOR_PROTO, ObjectValue(*collatorProto));
global->setReservedSlot(DATE_TIME_FORMAT, ObjectValue(*dateTimeFormat));
global->setReservedSlot(DATE_TIME_FORMAT_PROTO, ObjectValue(*dateTimeFormatProto));
+ global->setReservedSlot(LOCALE_PROTO, ObjectValue(*localeProto));
global->setReservedSlot(NUMBER_FORMAT, ObjectValue(*numberFormat));
global->setReservedSlot(NUMBER_FORMAT_PROTO, ObjectValue(*numberFormatProto));
global->setReservedSlot(PLURAL_RULES_PROTO, ObjectValue(*pluralRulesProto));
diff --git a/js/src/builtin/intl/LangTagMappingsGenerated.js b/js/src/builtin/intl/LangTagMappingsGenerated.js
deleted file mode 100644
index 83a8ff8f60..0000000000
--- a/js/src/builtin/intl/LangTagMappingsGenerated.js
+++ /dev/null
@@ -1,1246 +0,0 @@
-// Generated by make_intl_data.py. DO NOT EDIT.
-
-// Mappings from grandfathered tags to preferred values.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-var grandfatheredMappings = {
- "art-lojban": "jbo",
- "cel-gaulish": "xtg-x-cel-gaulish",
- "zh-guoyu": "zh",
- "zh-hakka": "hak",
- "zh-xiang": "hsn",
-};
-
-// Mappings from language subtags to preferred values.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-var languageMappings = {
- "aam": "aas",
- "aar": "aa",
- "abk": "ab",
- "adp": "dz",
- "afr": "af",
- "aju": "jrb",
- "aka": "ak",
- "alb": "sq",
- "als": "sq",
- "amh": "am",
- "ara": "ar",
- "arb": "ar",
- "arg": "an",
- "arm": "hy",
- "asd": "snz",
- "asm": "as",
- "aue": "ktz",
- "ava": "av",
- "ave": "ae",
- "aym": "ay",
- "ayr": "ay",
- "ayx": "nun",
- "aze": "az",
- "azj": "az",
- "bak": "ba",
- "bam": "bm",
- "baq": "eu",
- "bcc": "bal",
- "bcl": "bik",
- "bel": "be",
- "ben": "bn",
- "bgm": "bcg",
- "bh": "bho",
- "bih": "bho",
- "bis": "bi",
- "bjd": "drl",
- "bod": "bo",
- "bos": "bs",
- "bre": "br",
- "bul": "bg",
- "bur": "my",
- "bxk": "luy",
- "bxr": "bua",
- "cat": "ca",
- "ccq": "rki",
- "ces": "cs",
- "cha": "ch",
- "che": "ce",
- "chi": "zh",
- "chu": "cu",
- "chv": "cv",
- "cjr": "mom",
- "cka": "cmr",
- "cld": "syr",
- "cmk": "xch",
- "cmn": "zh",
- "cor": "kw",
- "cos": "co",
- "coy": "pij",
- "cqu": "quh",
- "cre": "cr",
- "cwd": "cr",
- "cym": "cy",
- "cze": "cs",
- "dan": "da",
- "deu": "de",
- "dgo": "doi",
- "dhd": "mwr",
- "dik": "din",
- "diq": "zza",
- "dit": "dif",
- "div": "dv",
- "drh": "mn",
- "dut": "nl",
- "dzo": "dz",
- "ekk": "et",
- "ell": "el",
- "emk": "man",
- "eng": "en",
- "epo": "eo",
- "esk": "ik",
- "est": "et",
- "eus": "eu",
- "ewe": "ee",
- "fao": "fo",
- "fas": "fa",
- "fat": "ak",
- "fij": "fj",
- "fin": "fi",
- "fra": "fr",
- "fre": "fr",
- "fry": "fy",
- "fuc": "ff",
- "ful": "ff",
- "gav": "dev",
- "gaz": "om",
- "gbo": "grb",
- "geo": "ka",
- "ger": "de",
- "gfx": "vaj",
- "ggn": "gvr",
- "gla": "gd",
- "gle": "ga",
- "glg": "gl",
- "glv": "gv",
- "gno": "gon",
- "gre": "el",
- "grn": "gn",
- "gti": "nyc",
- "gug": "gn",
- "guj": "gu",
- "guv": "duz",
- "gya": "gba",
- "hat": "ht",
- "hau": "ha",
- "hdn": "hai",
- "hea": "hmn",
- "heb": "he",
- "her": "hz",
- "him": "srx",
- "hin": "hi",
- "hmo": "ho",
- "hrr": "jal",
- "hrv": "hr",
- "hun": "hu",
- "hye": "hy",
- "ibi": "opa",
- "ibo": "ig",
- "ice": "is",
- "ido": "io",
- "iii": "ii",
- "ike": "iu",
- "iku": "iu",
- "ile": "ie",
- "ilw": "gal",
- "in": "id",
- "ina": "ia",
- "ind": "id",
- "ipk": "ik",
- "isl": "is",
- "ita": "it",
- "iw": "he",
- "jav": "jv",
- "jeg": "oyb",
- "ji": "yi",
- "jpn": "ja",
- "jw": "jv",
- "kal": "kl",
- "kan": "kn",
- "kas": "ks",
- "kat": "ka",
- "kau": "kr",
- "kaz": "kk",
- "kgc": "tdf",
- "kgh": "kml",
- "khk": "mn",
- "khm": "km",
- "kik": "ki",
- "kin": "rw",
- "kir": "ky",
- "kmr": "ku",
- "knc": "kr",
- "kng": "kg",
- "knn": "kok",
- "koj": "kwv",
- "kom": "kv",
- "kon": "kg",
- "kor": "ko",
- "kpv": "kv",
- "krm": "bmf",
- "ktr": "dtp",
- "kua": "kj",
- "kur": "ku",
- "kvs": "gdj",
- "kwq": "yam",
- "kxe": "tvd",
- "kzj": "dtp",
- "kzt": "dtp",
- "lao": "lo",
- "lat": "la",
- "lav": "lv",
- "lbk": "bnc",
- "lii": "raq",
- "lim": "li",
- "lin": "ln",
- "lit": "lt",
- "llo": "ngt",
- "lmm": "rmx",
- "ltz": "lb",
- "lub": "lu",
- "lug": "lg",
- "lvs": "lv",
- "mac": "mk",
- "mah": "mh",
- "mal": "ml",
- "mao": "mi",
- "mar": "mr",
- "may": "ms",
- "meg": "cir",
- "mhr": "chm",
- "mkd": "mk",
- "mlg": "mg",
- "mlt": "mt",
- "mnk": "man",
- "mo": "ro",
- "mol": "ro",
- "mon": "mn",
- "mri": "mi",
- "msa": "ms",
- "mst": "mry",
- "mup": "raj",
- "mwj": "vaj",
- "mya": "my",
- "myd": "aog",
- "myt": "mry",
- "nad": "xny",
- "nau": "na",
- "nav": "nv",
- "nbl": "nr",
- "ncp": "kdz",
- "nde": "nd",
- "ndo": "ng",
- "nep": "ne",
- "nld": "nl",
- "nno": "nn",
- "nns": "nbr",
- "nnx": "ngv",
- "no": "nb",
- "nob": "nb",
- "nor": "nb",
- "npi": "ne",
- "nts": "pij",
- "nya": "ny",
- "oci": "oc",
- "ojg": "oj",
- "oji": "oj",
- "ori": "or",
- "orm": "om",
- "ory": "or",
- "oss": "os",
- "oun": "vaj",
- "pan": "pa",
- "pbu": "ps",
- "pcr": "adx",
- "per": "fa",
- "pes": "fa",
- "pli": "pi",
- "plt": "mg",
- "pmc": "huw",
- "pmu": "phr",
- "pnb": "lah",
- "pol": "pl",
- "por": "pt",
- "ppa": "bfy",
- "ppr": "lcq",
- "pry": "prt",
- "pus": "ps",
- "puz": "pub",
- "que": "qu",
- "quz": "qu",
- "rmy": "rom",
- "roh": "rm",
- "ron": "ro",
- "rum": "ro",
- "run": "rn",
- "rus": "ru",
- "sag": "sg",
- "san": "sa",
- "sca": "hle",
- "scc": "sr",
- "scr": "hr",
- "sin": "si",
- "skk": "oyb",
- "slk": "sk",
- "slo": "sk",
- "slv": "sl",
- "sme": "se",
- "smo": "sm",
- "sna": "sn",
- "snd": "sd",
- "som": "so",
- "sot": "st",
- "spa": "es",
- "spy": "kln",
- "sqi": "sq",
- "src": "sc",
- "srd": "sc",
- "srp": "sr",
- "ssw": "ss",
- "sun": "su",
- "swa": "sw",
- "swe": "sv",
- "swh": "sw",
- "tah": "ty",
- "tam": "ta",
- "tat": "tt",
- "tdu": "dtp",
- "tel": "te",
- "tgk": "tg",
- "tgl": "fil",
- "tha": "th",
- "thc": "tpo",
- "thx": "oyb",
- "tib": "bo",
- "tie": "ras",
- "tir": "ti",
- "tkk": "twm",
- "tl": "fil",
- "tlw": "weo",
- "tmp": "tyj",
- "tne": "kak",
- "ton": "to",
- "tsf": "taj",
- "tsn": "tn",
- "tso": "ts",
- "ttq": "tmh",
- "tuk": "tk",
- "tur": "tr",
- "tw": "ak",
- "twi": "ak",
- "uig": "ug",
- "ukr": "uk",
- "umu": "del",
- "uok": "ema",
- "urd": "ur",
- "uzb": "uz",
- "uzn": "uz",
- "ven": "ve",
- "vie": "vi",
- "vol": "vo",
- "wel": "cy",
- "wln": "wa",
- "wol": "wo",
- "xba": "cax",
- "xho": "xh",
- "xia": "acn",
- "xkh": "waw",
- "xpe": "kpe",
- "xsj": "suj",
- "xsl": "den",
- "ybd": "rki",
- "ydd": "yi",
- "yid": "yi",
- "yma": "lrr",
- "ymt": "mtm",
- "yor": "yo",
- "yos": "zom",
- "yuu": "yug",
- "zai": "zap",
- "zha": "za",
- "zho": "zh",
- "zsm": "ms",
- "zul": "zu",
- "zyb": "za",
-};
-
-// Language subtags with complex mappings.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-var complexLanguageMappings = {
- "cnr": true,
- "drw": true,
- "hbs": true,
- "prs": true,
- "sh": true,
- "swc": true,
- "tnf": true,
-};
-
-// Mappings from region subtags to preferred values.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-var regionMappings = {
- "004": "AF",
- "008": "AL",
- "010": "AQ",
- "012": "DZ",
- "016": "AS",
- "020": "AD",
- "024": "AO",
- "028": "AG",
- "031": "AZ",
- "032": "AR",
- "036": "AU",
- "040": "AT",
- "044": "BS",
- "048": "BH",
- "050": "BD",
- "051": "AM",
- "052": "BB",
- "056": "BE",
- "060": "BM",
- "062": "034",
- "064": "BT",
- "068": "BO",
- "070": "BA",
- "072": "BW",
- "074": "BV",
- "076": "BR",
- "084": "BZ",
- "086": "IO",
- "090": "SB",
- "092": "VG",
- "096": "BN",
- "100": "BG",
- "104": "MM",
- "108": "BI",
- "112": "BY",
- "116": "KH",
- "120": "CM",
- "124": "CA",
- "132": "CV",
- "136": "KY",
- "140": "CF",
- "144": "LK",
- "148": "TD",
- "152": "CL",
- "156": "CN",
- "158": "TW",
- "162": "CX",
- "166": "CC",
- "170": "CO",
- "174": "KM",
- "175": "YT",
- "178": "CG",
- "180": "CD",
- "184": "CK",
- "188": "CR",
- "191": "HR",
- "192": "CU",
- "196": "CY",
- "203": "CZ",
- "204": "BJ",
- "208": "DK",
- "212": "DM",
- "214": "DO",
- "218": "EC",
- "222": "SV",
- "226": "GQ",
- "230": "ET",
- "231": "ET",
- "232": "ER",
- "233": "EE",
- "234": "FO",
- "238": "FK",
- "239": "GS",
- "242": "FJ",
- "246": "FI",
- "248": "AX",
- "249": "FR",
- "250": "FR",
- "254": "GF",
- "258": "PF",
- "260": "TF",
- "262": "DJ",
- "266": "GA",
- "268": "GE",
- "270": "GM",
- "275": "PS",
- "276": "DE",
- "278": "DE",
- "280": "DE",
- "288": "GH",
- "292": "GI",
- "296": "KI",
- "300": "GR",
- "304": "GL",
- "308": "GD",
- "312": "GP",
- "316": "GU",
- "320": "GT",
- "324": "GN",
- "328": "GY",
- "332": "HT",
- "334": "HM",
- "336": "VA",
- "340": "HN",
- "344": "HK",
- "348": "HU",
- "352": "IS",
- "356": "IN",
- "360": "ID",
- "364": "IR",
- "368": "IQ",
- "372": "IE",
- "376": "IL",
- "380": "IT",
- "384": "CI",
- "388": "JM",
- "392": "JP",
- "398": "KZ",
- "400": "JO",
- "404": "KE",
- "408": "KP",
- "410": "KR",
- "414": "KW",
- "417": "KG",
- "418": "LA",
- "422": "LB",
- "426": "LS",
- "428": "LV",
- "430": "LR",
- "434": "LY",
- "438": "LI",
- "440": "LT",
- "442": "LU",
- "446": "MO",
- "450": "MG",
- "454": "MW",
- "458": "MY",
- "462": "MV",
- "466": "ML",
- "470": "MT",
- "474": "MQ",
- "478": "MR",
- "480": "MU",
- "484": "MX",
- "492": "MC",
- "496": "MN",
- "498": "MD",
- "499": "ME",
- "500": "MS",
- "504": "MA",
- "508": "MZ",
- "512": "OM",
- "516": "NA",
- "520": "NR",
- "524": "NP",
- "528": "NL",
- "531": "CW",
- "533": "AW",
- "534": "SX",
- "535": "BQ",
- "540": "NC",
- "548": "VU",
- "554": "NZ",
- "558": "NI",
- "562": "NE",
- "566": "NG",
- "570": "NU",
- "574": "NF",
- "578": "NO",
- "580": "MP",
- "581": "UM",
- "583": "FM",
- "584": "MH",
- "585": "PW",
- "586": "PK",
- "591": "PA",
- "598": "PG",
- "600": "PY",
- "604": "PE",
- "608": "PH",
- "612": "PN",
- "616": "PL",
- "620": "PT",
- "624": "GW",
- "626": "TL",
- "630": "PR",
- "634": "QA",
- "638": "RE",
- "642": "RO",
- "643": "RU",
- "646": "RW",
- "652": "BL",
- "654": "SH",
- "659": "KN",
- "660": "AI",
- "662": "LC",
- "663": "MF",
- "666": "PM",
- "670": "VC",
- "674": "SM",
- "678": "ST",
- "682": "SA",
- "686": "SN",
- "688": "RS",
- "690": "SC",
- "694": "SL",
- "702": "SG",
- "703": "SK",
- "704": "VN",
- "705": "SI",
- "706": "SO",
- "710": "ZA",
- "716": "ZW",
- "720": "YE",
- "724": "ES",
- "728": "SS",
- "729": "SD",
- "732": "EH",
- "736": "SD",
- "740": "SR",
- "744": "SJ",
- "748": "SZ",
- "752": "SE",
- "756": "CH",
- "760": "SY",
- "762": "TJ",
- "764": "TH",
- "768": "TG",
- "772": "TK",
- "776": "TO",
- "780": "TT",
- "784": "AE",
- "788": "TN",
- "792": "TR",
- "795": "TM",
- "796": "TC",
- "798": "TV",
- "800": "UG",
- "804": "UA",
- "807": "MK",
- "818": "EG",
- "826": "GB",
- "830": "JE",
- "831": "GG",
- "832": "JE",
- "833": "IM",
- "834": "TZ",
- "840": "US",
- "850": "VI",
- "854": "BF",
- "858": "UY",
- "860": "UZ",
- "862": "VE",
- "876": "WF",
- "882": "WS",
- "886": "YE",
- "887": "YE",
- "891": "RS",
- "894": "ZM",
- "958": "AA",
- "959": "QM",
- "960": "QN",
- "962": "QP",
- "963": "QQ",
- "964": "QR",
- "965": "QS",
- "966": "QT",
- "967": "EU",
- "968": "QV",
- "969": "QW",
- "970": "QX",
- "971": "QY",
- "972": "QZ",
- "973": "XA",
- "974": "XB",
- "975": "XC",
- "976": "XD",
- "977": "XE",
- "978": "XF",
- "979": "XG",
- "980": "XH",
- "981": "XI",
- "982": "XJ",
- "983": "XK",
- "984": "XL",
- "985": "XM",
- "986": "XN",
- "987": "XO",
- "988": "XP",
- "989": "XQ",
- "990": "XR",
- "991": "XS",
- "992": "XT",
- "993": "XU",
- "994": "XV",
- "995": "XW",
- "996": "XX",
- "997": "XY",
- "998": "XZ",
- "999": "ZZ",
- "BU": "MM",
- "CS": "RS",
- "CT": "KI",
- "DD": "DE",
- "DY": "BJ",
- "FQ": "AQ",
- "FX": "FR",
- "HV": "BF",
- "JT": "UM",
- "MI": "UM",
- "NH": "VU",
- "NQ": "AQ",
- "PU": "UM",
- "PZ": "PA",
- "QU": "EU",
- "RH": "ZW",
- "TP": "TL",
- "UK": "GB",
- "VD": "VN",
- "WK": "UM",
- "YD": "YE",
- "YU": "RS",
- "ZR": "CD",
-};
-
-// Region subtags with complex mappings.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-var complexRegionMappings = {
- "172": true,
- "200": true,
- "530": true,
- "532": true,
- "536": true,
- "582": true,
- "810": true,
- "890": true,
- "AN": true,
- "NT": true,
- "PC": true,
- "SU": true,
-};
-
-// Canonicalize Unicode BCP 47 locale identifiers.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-/* eslint-disable complexity */
-function updateLocaleIdMappings(tag) {
- assert(IsObject(tag), "tag is an object");
-
- // Replace deprecated language tags with their preferred values.
- var language = tag.language;
- if (hasOwn(language, languageMappings)) {
- tag.language = languageMappings[language];
- } else if (hasOwn(language, complexLanguageMappings)) {
- switch (language) {
- case "cnr":
- tag.language = "sr";
- if (tag.region === undefined)
- tag.region = "ME";
- break;
- case "drw":
- case "prs":
- case "tnf":
- tag.language = "fa";
- if (tag.region === undefined)
- tag.region = "AF";
- break;
- case "hbs":
- case "sh":
- tag.language = "sr";
- if (tag.script === undefined)
- tag.script = "Latn";
- break;
- case "swc":
- tag.language = "sw";
- if (tag.region === undefined)
- tag.region = "CD";
- break;
- default:
- assert(false, "language not handled: " + language);
- }
- }
-
- // No script replacements are currently present.
-
- // Replace deprecated subtags with their preferred values.
- var region = tag.region;
- if (region !== undefined) {
- if (hasOwn(region, regionMappings)) {
- tag.region = regionMappings[region];
- } else if (hasOwn(region, complexRegionMappings)) {
- switch (region) {
- case "172":
- if (tag.language === "ab") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "az") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "be") {
- tag.region = "BY";
- break;
- }
- if (tag.language === "crh") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "gag") {
- tag.region = "MD";
- break;
- }
- if (tag.language === "got") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "hy") {
- tag.region = "AM";
- break;
- }
- if (tag.language === "ji") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "ka") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "kaa") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "kk") {
- tag.region = "KZ";
- break;
- }
- if (tag.language === "ku" && tag.script === "Yezi") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "ky") {
- tag.region = "KG";
- break;
- }
- if (tag.language === "os") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "rue") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "sog") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "tg") {
- tag.region = "TJ";
- break;
- }
- if (tag.language === "tk") {
- tag.region = "TM";
- break;
- }
- if (tag.language === "tkr") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "tly") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "ttt") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "ug" && tag.script === "Cyrl") {
- tag.region = "KZ";
- break;
- }
- if (tag.language === "uk") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "und" && tag.script === "Geor") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "und" && tag.script === "Armn") {
- tag.region = "AM";
- break;
- }
- if (tag.language === "und" && tag.script === "Sogo") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "und" && tag.script === "Goth") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "und" && tag.script === "Chrs") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "und" && tag.script === "Sogd") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "und" && tag.script === "Yezi") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "uz") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "xco") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "xmf") {
- tag.region = "GE";
- break;
- }
- tag.region = "RU";
- break;
- case "200":
- if (tag.language === "sk") {
- tag.region = "SK";
- break;
- }
- tag.region = "CZ";
- break;
- case "530":
- case "532":
- case "AN":
- if (tag.language === "vic") {
- tag.region = "SX";
- break;
- }
- tag.region = "CW";
- break;
- case "536":
- case "NT":
- if (tag.language === "akk") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "ckb") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "ku" && tag.script === "Arab") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "mis") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "syr") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "und" && tag.script === "Syrc") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "und" && tag.script === "Hatr") {
- tag.region = "IQ";
- break;
- }
- if (tag.language === "und" && tag.script === "Xsux") {
- tag.region = "IQ";
- break;
- }
- tag.region = "SA";
- break;
- case "582":
- case "PC":
- if (tag.language === "mh") {
- tag.region = "MH";
- break;
- }
- if (tag.language === "pau") {
- tag.region = "PW";
- break;
- }
- tag.region = "FM";
- break;
- case "810":
- case "SU":
- if (tag.language === "ab") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "az") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "be") {
- tag.region = "BY";
- break;
- }
- if (tag.language === "crh") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "et") {
- tag.region = "EE";
- break;
- }
- if (tag.language === "gag") {
- tag.region = "MD";
- break;
- }
- if (tag.language === "got") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "hy") {
- tag.region = "AM";
- break;
- }
- if (tag.language === "ji") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "ka") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "kaa") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "kk") {
- tag.region = "KZ";
- break;
- }
- if (tag.language === "ku" && tag.script === "Yezi") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "ky") {
- tag.region = "KG";
- break;
- }
- if (tag.language === "lt") {
- tag.region = "LT";
- break;
- }
- if (tag.language === "ltg") {
- tag.region = "LV";
- break;
- }
- if (tag.language === "lv") {
- tag.region = "LV";
- break;
- }
- if (tag.language === "os") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "rue") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "sgs") {
- tag.region = "LT";
- break;
- }
- if (tag.language === "sog") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "tg") {
- tag.region = "TJ";
- break;
- }
- if (tag.language === "tk") {
- tag.region = "TM";
- break;
- }
- if (tag.language === "tkr") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "tly") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "ttt") {
- tag.region = "AZ";
- break;
- }
- if (tag.language === "ug" && tag.script === "Cyrl") {
- tag.region = "KZ";
- break;
- }
- if (tag.language === "uk") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "und" && tag.script === "Geor") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "und" && tag.script === "Armn") {
- tag.region = "AM";
- break;
- }
- if (tag.language === "und" && tag.script === "Sogo") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "und" && tag.script === "Goth") {
- tag.region = "UA";
- break;
- }
- if (tag.language === "und" && tag.script === "Chrs") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "und" && tag.script === "Sogd") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "und" && tag.script === "Yezi") {
- tag.region = "GE";
- break;
- }
- if (tag.language === "uz") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "vro") {
- tag.region = "EE";
- break;
- }
- if (tag.language === "xco") {
- tag.region = "UZ";
- break;
- }
- if (tag.language === "xmf") {
- tag.region = "GE";
- break;
- }
- tag.region = "RU";
- break;
- case "890":
- if (tag.language === "bs") {
- tag.region = "BA";
- break;
- }
- if (tag.language === "hr") {
- tag.region = "HR";
- break;
- }
- if (tag.language === "mk") {
- tag.region = "MK";
- break;
- }
- if (tag.language === "sl") {
- tag.region = "SI";
- break;
- }
- tag.region = "RS";
- break;
- default:
- assert(false, "region not handled: " + region);
- }
- }
-
- // No variant replacements are currently present.
- // No extension replacements are currently present.
- // Private use sequences are left as is.
-
- }
-}
-/* eslint-enable complexity */
-
-// Canonicalize grandfathered locale identifiers.
-// Derived from CLDR Supplemental Data, version 36.1.
-// https://github.com/unicode-org/cldr.git
-function updateGrandfatheredMappings(tag) {
- assert(IsObject(tag), "tag is an object");
-
- // We're mapping regular grandfathered tags to non-grandfathered form here.
- // Other tags remain unchanged.
- //
- // regular = "art-lojban"
- // / "cel-gaulish"
- // / "no-bok"
- // / "no-nyn"
- // / "zh-guoyu"
- // / "zh-hakka"
- // / "zh-min"
- // / "zh-min-nan"
- // / "zh-xiang"
- //
- // Therefore we can quickly exclude most tags by checking every
- // |unicode_locale_id| subcomponent for characteristics not shared by any of
- // the regular grandfathered (RG) tags:
- //
- // * Real-world |unicode_language_subtag|s are all two or three letters,
- // so don't waste time running a useless |language.length > 3| fast-path.
- // * No RG tag has a "script"-looking component.
- // * No RG tag has a "region"-looking component.
- // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
- // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
- // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
- // that |unicode_locale_id| doesn't support.)
- // * No RG tag contains |extensions| or |pu_extensions|.
- if (tag.script !== undefined ||
- tag.region !== undefined ||
- tag.variants.length !== 1 ||
- tag.extensions.length !== 0 ||
- tag.privateuse !== undefined)
- {
- return;
- }
-
- // art-lojban -> jbo
- if (tag.language === "art" && tag.variants[0] === "lojban") {
- tag.language = "jbo";
- tag.variants.length = 0;
- }
-
- // cel-gaulish -> xtg-x-cel-gaulish
- else if (tag.language === "cel" && tag.variants[0] === "gaulish") {
- tag.language = "xtg";
- tag.variants.length = 0;
- tag.privateuse = "x-cel-gaulish";
- }
-
- // zh-guoyu -> zh
- else if (tag.language === "zh" && tag.variants[0] === "guoyu") {
- tag.language = "zh";
- tag.variants.length = 0;
- }
-
- // zh-hakka -> hak
- else if (tag.language === "zh" && tag.variants[0] === "hakka") {
- tag.language = "hak";
- tag.variants.length = 0;
- }
-
- // zh-xiang -> hsn
- else if (tag.language === "zh" && tag.variants[0] === "xiang") {
- tag.language = "hsn";
- tag.variants.length = 0;
- }
-}
diff --git a/js/src/builtin/intl/LanguageTag.cpp b/js/src/builtin/intl/LanguageTag.cpp
new file mode 100644
index 0000000000..1f5c1fa110
--- /dev/null
+++ b/js/src/builtin/intl/LanguageTag.cpp
@@ -0,0 +1,1677 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "builtin/intl/LanguageTag.h"
+
+#include "mozilla/Assertions.h"
+#include "mozilla/MathAlgorithms.h"
+#include "mozilla/Range.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/Variant.h"
+
+#include <algorithm>
+#include <iterator>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <string.h>
+#include <type_traits>
+#include <utility>
+
+#include "jsapi.h"
+#include "jsfriendapi.h"
+#include "jscntxt.h"
+
+#include "builtin/intl/CommonFunctions.h"
+#include "ds/Sort.h"
+#include "js/Result.h"
+#include "js/Utility.h"
+#include "js/Vector.h"
+#include "unicode/uloc.h"
+#include "unicode/utypes.h"
+#include "vm/String.h"
+#include "vm/StringBuffer.h"
+
+namespace js {
+namespace intl {
+
+using namespace js::intl::LanguageTagLimits;
+
+using ConstCharRange = mozilla::Range<const char>;
+
+#ifdef DEBUG
+template <typename CharT>
+bool IsStructurallyValidLanguageTag(
+    const mozilla::Range<const CharT>& language) {
+  // |std::all_of| is opaque to the GC hazard analysis; declare it GC-free.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // unicode_language_subtag = alpha{2,3} | alpha{5,8};
+  const size_t length = language.length();
+  const CharT* first = language.begin().get();
+
+  // First check the permitted lengths, then the permitted characters.
+  const bool shortForm = length >= 2 && length <= 3;
+  const bool longForm = length >= 5 && length <= 8;
+  if (!shortForm && !longForm) {
+    return false;
+  }
+  return std::all_of(first, first + length,
+                     mozilla::IsAsciiLowercaseAlpha<CharT>);
+}
+
+template bool IsStructurallyValidLanguageTag(
+ const mozilla::Range<const Latin1Char>& language);
+template bool IsStructurallyValidLanguageTag(
+ const mozilla::Range<const char16_t>& language);
+
+template <typename CharT>
+bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script) {
+  // |std::all_of| is opaque to the GC hazard analysis; declare it GC-free.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // unicode_script_subtag = alpha{4} ;
+  // The accepted casing is one uppercase letter followed by lowercase ones.
+  const size_t length = script.length();
+  const CharT* first = script.begin().get();
+  if (length != 4) {
+    return false;
+  }
+  if (!mozilla::IsAsciiUppercaseAlpha<CharT>(first[0])) {
+    return false;
+  }
+  return std::all_of(first + 1, first + length,
+                     mozilla::IsAsciiLowercaseAlpha<CharT>);
+}
+
+template bool IsStructurallyValidScriptTag(
+ const mozilla::Range<const Latin1Char>& script);
+template bool IsStructurallyValidScriptTag(
+ const mozilla::Range<const char16_t>& script);
+
+template <typename CharT>
+bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region) {
+  // |std::all_of| is opaque to the GC hazard analysis; declare it GC-free.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // unicode_region_subtag = (alpha{2} | digit{3}) ;
+  const size_t length = region.length();
+  const CharT* first = region.begin().get();
+  const CharT* last = first + length;
+
+  switch (length) {
+    case 2:
+      // Two-letter regions must be entirely uppercase.
+      return std::all_of(first, last, mozilla::IsAsciiUppercaseAlpha<CharT>);
+    case 3:
+      // Three-character regions must be entirely numeric.
+      return std::all_of(first, last, mozilla::IsAsciiDigit<CharT>);
+    default:
+      return false;
+  }
+}
+
+template bool IsStructurallyValidRegionTag(
+ const mozilla::Range<const Latin1Char>& region);
+template bool IsStructurallyValidRegionTag(
+ const mozilla::Range<const char16_t>& region);
+
+bool IsStructurallyValidVariantTag(const ConstCharRange& variant) {
+  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
+  const size_t length = variant.length();
+  const char* chars = variant.begin().get();
+
+  // Either five to eight characters, or exactly four with a leading digit.
+  const bool longForm = length >= 5 && length <= 8;
+  const bool shortForm = length == 4 && mozilla::IsAsciiDigit(chars[0]);
+  if (!longForm && !shortForm) {
+    return false;
+  }
+
+  // Every character must be a lowercase ASCII letter or a digit.
+  for (size_t i = 0; i < length; i++) {
+    const char c = chars[i];
+    if (!mozilla::IsAsciiLowercaseAlpha(c) && !mozilla::IsAsciiDigit(c)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool IsStructurallyValidUnicodeExtensionTag(const ConstCharRange& extension) {
+  // The extension must be accepted by the Unicode extension parser and, in
+  // addition, consist only of lowercase ASCII letters, digits, and '-'.
+  const size_t length = extension.length();
+  const char* chars = extension.begin().get();
+
+  if (!LanguageTagParser::canParseUnicodeExtension(extension)) {
+    return false;
+  }
+
+  for (size_t i = 0; i < length; i++) {
+    const char c = chars[i];
+    const bool allowed = mozilla::IsAsciiLowercaseAlpha(c) ||
+                         mozilla::IsAsciiDigit(c) || c == '-';
+    if (!allowed) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool IsStructurallyValidExtensionTag(const ConstCharRange& extension) {
+  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+  // NB: Unicode ('u') and Transform ('t') extensions are deliberately
+  // accepted as well, because this function only backs assertions.
+  auto isLowercaseAlnum = [](char c) {
+    return mozilla::IsAsciiDigit(c) || mozilla::IsAsciiLowercaseAlpha(c);
+  };
+
+  const char* cursor = extension.begin().get();
+  const char* const limit = extension.end().get();
+
+  // The shortest candidate is a singleton, a separator and one character.
+  if (extension.length() <= 2) {
+    return false;
+  }
+
+  // The singleton is any lowercase alphanumeric except the private-use 'x'.
+  const char singleton = cursor[0];
+  if (singleton == 'x' || !isLowercaseAlnum(singleton)) {
+    return false;
+  }
+  if (cursor[1] != '-') {
+    return false;
+  }
+  cursor += 2;
+
+  // Walk the '-'-separated subtags; each must be 2-8 alphanumerics.
+  for (;;) {
+    const void* found = memchr(cursor, '-', limit - cursor);
+    const char* subtagEnd = found ? static_cast<const char*>(found) : limit;
+    const size_t subtagLength = subtagEnd - cursor;
+
+    if (subtagLength < 2 || subtagLength > 8) {
+      return false;
+    }
+    if (!std::all_of(cursor, subtagEnd, isLowercaseAlnum)) {
+      return false;
+    }
+    if (!found) {
+      // That was the final subtag.
+      return true;
+    }
+    cursor = subtagEnd + 1;
+  }
+}
+
+bool IsStructurallyValidPrivateUseTag(const ConstCharRange& privateUse) {
+  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+  auto isLowercaseAlnum = [](char c) {
+    return mozilla::IsAsciiDigit(c) || mozilla::IsAsciiLowercaseAlpha(c);
+  };
+
+  const char* cursor = privateUse.begin().get();
+  const char* const limit = privateUse.end().get();
+
+  // Require the "x-" prefix followed by at least one more character.
+  if (privateUse.length() <= 2) {
+    return false;
+  }
+  if (cursor[0] != 'x' || cursor[1] != '-') {
+    return false;
+  }
+  cursor += 2;
+
+  // Walk the '-'-separated subtags; each must be 1-8 alphanumerics.
+  for (;;) {
+    const void* found = memchr(cursor, '-', limit - cursor);
+    const char* subtagEnd = found ? static_cast<const char*>(found) : limit;
+    const size_t subtagLength = subtagEnd - cursor;
+
+    if (subtagLength == 0 || subtagLength > 8) {
+      return false;
+    }
+    if (!std::all_of(cursor, subtagEnd, isLowercaseAlnum)) {
+      return false;
+    }
+    if (!found) {
+      // That was the final subtag.
+      return true;
+    }
+    cursor = subtagEnd + 1;
+  }
+}
+#endif
+
+bool LanguageTag::setUnicodeExtension(UniqueChars extension) {
+  MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(
+      {extension.get(), strlen(extension.get())}));
+
+  // If a Unicode ("u") extension is already stored, overwrite it in place.
+  for (UniqueChars& existing : extensions_) {
+    if (existing[0] == 'u') {
+      existing = std::move(extension);
+      return true;
+    }
+  }
+
+  // Otherwise add it as a new extension; this fails when append() fails.
+  return extensions_.append(std::move(extension));
+}
+
+template <size_t InitialCapacity>
+static bool SortAlphabetically(JSContext* cx,
+                               Vector<UniqueChars, InitialCapacity>& subtags) {
+  const size_t count = subtags.length();
+
+  switch (count) {
+    case 0:
+    case 1:
+      // Nothing to reorder.
+      return true;
+    case 2:
+      // Two entries need at most a single swap.
+      if (strcmp(subtags[0].get(), subtags[1].get()) > 0) {
+        subtags[0].swap(subtags[1]);
+      }
+      return true;
+    default:
+      break;
+  }
+
+  // Sort raw pointers: the first |count| slots hold the strings, the second
+  // |count| slots are the scratch area handed to |MergeSort|.
+  Vector<char*, 8> buffer(cx);
+  if (!buffer.resizeUninitialized(count * 2)) {
+    return false;
+  }
+  char** strings = buffer.begin();
+  for (size_t i = 0; i < count; i++) {
+    strings[i] = subtags[i].release();
+  }
+
+  auto compare = [](const char* a, const char* b, bool* lessOrEqualp) {
+    *lessOrEqualp = strcmp(a, b) <= 0;
+    return true;
+  };
+  MOZ_ALWAYS_TRUE(MergeSort(strings, count, strings + count, compare));
+
+  // Hand ownership of the now sorted strings back to |subtags|.
+  for (size_t i = 0; i < count; i++) {
+    subtags[i] = UniqueChars(strings[i]);
+  }
+  return true;
+}
+
+bool LanguageTag::canonicalizeBaseName(JSContext* cx) {
+  // UTS 35, 3.3.1 starts by canonicalizing the syntax, i.e. normalizing the
+  // case and ordering all subtags (canonical syntax: UTS 35, 3.2.1).
+
+  // Case normalization can be skipped, because the |LanguageTag| fields are
+  // already stored in normalized case. Verify that in debug builds.
+  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
+  MOZ_ASSERT(script().length() == 0 ||
+             IsStructurallyValidScriptTag(script().range()));
+  MOZ_ASSERT(region().length() == 0 ||
+             IsStructurallyValidRegionTag(region().range()));
+#ifdef DEBUG
+  for (const auto& variant : variants()) {
+    const char* chars = variant.get();
+    MOZ_ASSERT(IsStructurallyValidVariantTag({chars, strlen(chars)}));
+  }
+  for (const auto& extension : extensions()) {
+    const char* chars = extension.get();
+    MOZ_ASSERT(IsStructurallyValidExtensionTag({chars, strlen(chars)}));
+  }
+#endif
+  MOZ_ASSERT(!privateuse() || IsStructurallyValidPrivateUseTag(
+                                  {privateuse(), strlen(privateuse())}));
+
+  // Ordering of subtags (second step of UTS 35, 3.2.1):
+  //
+  // 1. Variants are sorted alphabetically here.
+  // 2. Extensions are sorted by singleton in canonicalizeExtensions(), which
+  //    callers are expected to invoke separately.
+  if (!SortAlphabetically(cx, variants_)) {
+    return false;
+  }
+
+  // The following steps of 3.3.1 replace deprecated subtags with their
+  // preferred mappings, starting with grandfathered tags.
+  if (!updateGrandfatheredMappings(cx)) {
+    return false;
+  }
+
+  // Language: try the simple mapping first; only when it doesn't apply,
+  // check for and perform a complex mapping.
+  if (!languageMapping(language_)) {
+    if (complexLanguageMapping(language_)) {
+      performComplexLanguageMappings();
+    }
+  }
+
+  // Script: no replacements are currently present.
+
+  // Region (if any): same two-stage lookup as for the language.
+  if (region().length() > 0 && !regionMapping(region_) &&
+      complexRegionMapping(region_)) {
+    performComplexRegionMappings();
+  }
+
+  // Variants and extensions have no replacements at present, and private use
+  // sequences are kept verbatim.
+
+  // The last two steps of 3.3.1 (irregular grandfathered tags and
+  // private-use-only tags) are not applicable: neither form can occur in a
+  // Unicode BCP 47 locale identifier.
+
+  return true;
+}
+
+bool LanguageTag::canonicalizeExtensions(
+    JSContext* cx, UnicodeExtensionCanonicalForm canonicalForm) {
+  // Extensions are ordered alphabetically by their singleton, e.g.
+  // "u-ca-chinese-t-zh-latn" becomes "t-zh-latn-u-ca-chinese".
+  if (!SortAlphabetically(cx, extensions_)) {
+    return false;
+  }
+
+  // Only Unicode ('u') and transform ('t') extensions get further
+  // canonicalization; all other extensions are left untouched.
+  for (UniqueChars& extension : extensions_) {
+    switch (extension[0]) {
+      case 'u':
+        if (!canonicalizeUnicodeExtension(cx, extension, canonicalForm)) {
+          return false;
+        }
+        break;
+      case 't':
+        if (!canonicalizeTransformExtension(cx, extension)) {
+          return false;
+        }
+        break;
+      default:
+        break;
+    }
+  }
+  return true;
+}
+
+/**
+ * CanonicalizeUnicodeExtension( attributes, keywords )
+ *
+ * Canonical syntax per
+ * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>:
+ *
+ * - All attributes and keywords are in lowercase.
+ *   - Note: The parser already converted keywords to lowercase.
+ * - All attributes are sorted in alphabetical order.
+ * - All keywords are sorted by alphabetical order of their keys.
+ * - Any type value "true" is removed.
+ *
+ * Canonical form:
+ * - All keys and types use the canonical form (from the name attribute;
+ *   see Section 3.6.4 U Extension Data Files).
+ *
+ * Implementation outline: the extension is re-parsed into attribute and
+ * keyword ranges, both lists are merge-sorted, and the canonical string is
+ * rebuilt in a scratch buffer. |unicodeExtension| is only replaced when the
+ * rebuilt string differs from the input.
+ *
+ * |unicodeExtension| is the zero-terminated "u-..." extension and is updated
+ * in place. When |canonicalForm| is |UnicodeExtensionCanonicalForm::Yes|,
+ * duplicate attributes and keywords are additionally dropped and keyword
+ * types are replaced through |replaceUnicodeExtensionType|; otherwise only
+ * sorting and elision of the type "true" take place.
+ *
+ * Returns false when parsing reports an error or when growing or appending
+ * to one of the buffers fails.
+ */
+bool LanguageTag::canonicalizeUnicodeExtension(
+    JSContext* cx, JS::UniqueChars& unicodeExtension,
+    UnicodeExtensionCanonicalForm canonicalForm) {
+  const char* const extension = unicodeExtension.get();
+  MOZ_ASSERT(extension[0] == 'u');
+  MOZ_ASSERT(extension[1] == '-');
+  MOZ_ASSERT(IsStructurallyValidExtensionTag({extension, strlen(extension)}));
+
+  size_t length = strlen(extension);
+
+  LanguageTagParser::AttributesVector attributes(cx);
+  LanguageTagParser::KeywordsVector keywords(cx);
+
+  using Attribute = LanguageTagParser::AttributesVector::ElementType;
+  using Keyword = LanguageTagParser::KeywordsVector::ElementType;
+
+  bool ok;
+  JS_TRY_VAR_OR_RETURN_FALSE(
+      cx, ok,
+      LanguageTagParser::parseUnicodeExtension(
+          cx, ConstCharRange(extension, length), attributes, keywords));
+  MOZ_ASSERT(ok, "unexpected invalid Unicode extension subtag");
+
+  auto attributesLessOrEqual = [extension](const Attribute& a,
+                                           const Attribute& b) {
+    const char* astr = a.begin(extension);
+    const char* bstr = b.begin(extension);
+    size_t alen = a.length();
+    size_t blen = b.length();
+
+    if (int r =
+            std::char_traits<char>::compare(astr, bstr, std::min(alen, blen))) {
+      return r < 0;
+    }
+    return alen <= blen;  // Common prefix: the shorter attribute sorts first.
+  };
+
+  // All attributes are sorted in alphabetical order.
+  size_t attributesLength = attributes.length();
+  if (attributesLength > 1) {
+    if (!attributes.growByUninitialized(attributesLength)) {  // MergeSort scratch space.
+      return false;
+    }
+
+    MOZ_ALWAYS_TRUE(
+        MergeSort(attributes.begin(), attributesLength,
+                  attributes.begin() + attributesLength,
+                  [&](const auto& a, const auto& b, bool* lessOrEqualp) {
+                    *lessOrEqualp = attributesLessOrEqual(a, b);
+                    return true;
+                  }));
+
+    attributes.shrinkBy(attributesLength);  // Drop the scratch space again.
+  }
+
+  auto keywordsLessOrEqual = [extension](const Keyword& a, const Keyword& b) {
+    const char* astr = a.begin(extension);
+    const char* bstr = b.begin(extension);
+    MOZ_ASSERT(a.length() >= UnicodeKeyLength);
+    MOZ_ASSERT(b.length() >= UnicodeKeyLength);
+
+    return std::char_traits<char>::compare(astr, bstr, UnicodeKeyLength) <= 0;  // Only the key is compared.
+  };
+
+  // All keywords are sorted by alphabetical order of keys.
+  size_t keywordsLength = keywords.length();
+  if (keywordsLength > 1) {
+    if (!keywords.growByUninitialized(keywordsLength)) {  // MergeSort scratch space.
+      return false;
+    }
+
+    // Using merge sort, being a stable sort algorithm, guarantees that two
+    // keywords using the same key are never reordered. That means for example
+    // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to
+    // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs
+    // before "nu-latn".
+    // This is required so that deduplication below preserves the first keyword
+    // for a given key and discards the rest.
+    MOZ_ALWAYS_TRUE(MergeSort(
+        keywords.begin(), keywordsLength, keywords.begin() + keywordsLength,
+        [&](const auto& a, const auto& b, bool* lessOrEqualp) {
+          *lessOrEqualp = keywordsLessOrEqual(a, b);
+          return true;
+        }));
+
+    keywords.shrinkBy(keywordsLength);  // Drop the scratch space again.
+  }
+
+  Vector<char, 32> sb(cx);  // Receives the rebuilt extension string.
+  if (!sb.append('u')) {
+    return false;
+  }
+
+  // Append all Unicode extension attributes.
+  for (size_t i = 0; i < attributes.length(); i++) {
+    const auto& attribute = attributes[i];
+
+    // Skip duplicate attributes.
+    if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) {
+      const auto& lastAttribute = attributes[i - 1];  // Sorted, so duplicates are adjacent.
+      if (attribute.length() == lastAttribute.length() &&
+          std::char_traits<char>::compare(attribute.begin(extension),
+                                          lastAttribute.begin(extension),
+                                          attribute.length()) == 0) {
+        continue;
+      }
+      MOZ_ASSERT(!attributesLessOrEqual(attribute, lastAttribute));
+    }
+
+    if (!sb.append('-')) {
+      return false;
+    }
+    if (!sb.append(attribute.begin(extension), attribute.length())) {
+      return false;
+    }
+  }
+
+  static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1;  // Key plus the '-' separator.
+
+  static auto isTrue = [](const ConstCharRange& type) {
+    constexpr char True[] = "true";
+    const size_t TrueLength = strlen(True);
+    return type.length() == TrueLength &&
+           std::char_traits<char>::compare(type.begin().get(), True,
+                                           TrueLength) == 0;
+  };
+
+  auto appendKey = [&sb, extension](const Keyword& keyword) {
+    MOZ_ASSERT(keyword.length() == UnicodeKeyLength);
+    return sb.append(keyword.begin(extension), UnicodeKeyLength);
+  };
+
+  auto appendKeyword = [&sb, extension](const Keyword& keyword,
+                                        const ConstCharRange& type) {
+    MOZ_ASSERT(keyword.length() > UnicodeKeyLength);
+
+    // Elide the Unicode extension type "true".
+    if (isTrue(type)) {
+      return sb.append(keyword.begin(extension), UnicodeKeyLength);
+    }
+    // Otherwise append the complete Unicode extension keyword.
+    return sb.append(keyword.begin(extension), keyword.length());
+  };
+
+  auto appendReplacement = [&sb, extension](const Keyword& keyword,
+                                            const ConstCharRange& replacement) {
+    MOZ_ASSERT(keyword.length() > UnicodeKeyLength);
+
+    // Elide the type "true" if present in the replacement.
+    if (isTrue(replacement)) {
+      return sb.append(keyword.begin(extension), UnicodeKeyLength);
+    }
+    // Otherwise append the Unicode key (including the separator) and the
+    // replaced type.
+    return sb.append(keyword.begin(extension), UnicodeKeyWithSepLength) &&
+           sb.append(replacement.begin().get(), replacement.length());
+  };
+
+  // Append all Unicode extension keywords.
+  for (size_t i = 0; i < keywords.length(); i++) {
+    const auto& keyword = keywords[i];
+
+    // Skip duplicate keywords.
+    if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) {
+      const auto& lastKeyword = keywords[i - 1];  // Sorted by key, so equal keys are adjacent.
+      if (std::char_traits<char>::compare(keyword.begin(extension),
+                                          lastKeyword.begin(extension),
+                                          UnicodeKeyLength) == 0) {
+        continue;
+      }
+      MOZ_ASSERT(!keywordsLessOrEqual(keyword, lastKeyword));
+    }
+
+    if (!sb.append('-')) {
+      return false;
+    }
+
+    if (keyword.length() == UnicodeKeyLength) {
+      // Keyword without type value.
+      if (!appendKey(keyword)) {
+        return false;
+      }
+    } else {
+      ConstCharRange key(keyword.begin(extension), UnicodeKeyLength);
+      ConstCharRange type(keyword.begin(extension) + UnicodeKeyWithSepLength,
+                          keyword.length() - UnicodeKeyWithSepLength);
+
+      if (canonicalForm == UnicodeExtensionCanonicalForm::Yes) {
+        // Search if there's a replacement for the current Unicode keyword.
+        if (const char* replacement = replaceUnicodeExtensionType(key, type)) {
+          if (!appendReplacement(
+                  keyword, ConstCharRange(replacement, strlen(replacement)))) {
+            return false;
+          }
+        } else {
+          if (!appendKeyword(keyword, type)) {
+            return false;
+          }
+        }
+      } else {
+        if (!appendKeyword(keyword, type)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  // We can keep the previous extension when canonicalization didn't modify it.
+  if (sb.length() != length ||
+      std::char_traits<char>::compare(sb.begin(), extension, length) != 0) {
+    // Null-terminate the new string and replace the previous extension.
+    if (!sb.append('\0')) {
+      return false;
+    }
+    UniqueChars canonical(sb.extractOrCopyRawBuffer());
+    if (!canonical) {
+      return false;
+    }
+    unicodeExtension = std::move(canonical);
+  }
+
+  return true;
+}
+
+template <class Buffer>
+static bool LanguageTagToString(JSContext* cx, const LanguageTag& tag,
+                                Buffer& sb) {
+  // Appends a non-empty run of characters to the output buffer.
+  auto appendChars = [&sb](const char* chars, size_t length) {
+    MOZ_ASSERT(length > 0);
+    return sb.append(chars, length);
+  };
+
+  // Appends '-' followed by a language/script/region style subtag.
+  auto appendSeparated = [&sb, &appendChars](const auto& subtag) {
+    if (!sb.append('-')) {
+      return false;
+    }
+    auto chars = subtag.range();
+    return appendChars(chars.begin().get(), chars.length());
+  };
+
+  // Appends '-' followed by a zero-terminated subtag.
+  auto appendSeparatedZ = [&sb, &appendChars](const char* subtag) {
+    if (!sb.append('-')) {
+      return false;
+    }
+    return appendChars(subtag, strlen(subtag));
+  };
+
+  // The language subtag always comes first, without a leading separator.
+  auto language = tag.language().range();
+  if (!appendChars(language.begin().get(), language.length())) {
+    return false;
+  }
+
+  // Script and region are optional.
+  if (tag.script().length() > 0 && !appendSeparated(tag.script())) {
+    return false;
+  }
+  if (tag.region().length() > 0 && !appendSeparated(tag.region())) {
+    return false;
+  }
+
+  // Zero or more variants, followed by zero or more extensions.
+  for (const auto& variant : tag.variants()) {
+    if (!appendSeparatedZ(variant.get())) {
+      return false;
+    }
+  }
+  for (const auto& extension : tag.extensions()) {
+    if (!appendSeparatedZ(extension.get())) {
+      return false;
+    }
+  }
+
+  // The optional private-use subtag goes last.
+  if (const char* privateuse = tag.privateuse()) {
+    if (!appendSeparatedZ(privateuse)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * CanonicalizeTransformExtension
+ *
+ * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>:
+ *
+ * - These subtags are all in lowercase (that is the canonical casing for these
+ *   subtags), [...].
+ *
+ * And per
+ * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>:
+ *
+ * - All keywords and tfields are sorted by alphabetical order of their keys,
+ *   within their respective extensions.
+ *
+ * Implementation outline: the extension is re-parsed into an optional `tlang`
+ * language tag and a list of `tfield` ranges, the fields are merge-sorted by
+ * key, and the string is rebuilt in a scratch buffer. |transformExtension|,
+ * the zero-terminated "t-..." extension, is only replaced when the rebuilt
+ * string differs from the input.
+ *
+ * Returns false when parsing reports an error or when growing or appending
+ * to one of the buffers fails.
+ */
+bool LanguageTag::canonicalizeTransformExtension(
+    JSContext* cx, JS::UniqueChars& transformExtension) {
+  const char* const extension = transformExtension.get();
+  MOZ_ASSERT(extension[0] == 't');
+  MOZ_ASSERT(extension[1] == '-');
+  MOZ_ASSERT(IsStructurallyValidExtensionTag({extension, strlen(extension)}));
+
+  size_t length = strlen(extension);
+
+  LanguageTag tag(cx);  // Receives the parsed `tlang`, if any.
+  LanguageTagParser::TFieldVector fields(cx);
+
+  using TField = LanguageTagParser::TFieldVector::ElementType;
+
+  bool ok;
+  JS_TRY_VAR_OR_RETURN_FALSE(
+      cx, ok,
+      LanguageTagParser::parseTransformExtension(
+          cx, ConstCharRange(extension, length), tag, fields));
+  MOZ_ASSERT(ok, "unexpected invalid transform extension subtag");
+
+  auto tfieldLessOrEqual = [extension](const TField& a, const TField& b) {
+    MOZ_ASSERT(a.length() > TransformKeyLength);
+    MOZ_ASSERT(b.length() > TransformKeyLength);
+    const char* astr = a.begin(extension);
+    const char* bstr = b.begin(extension);
+    return std::char_traits<char>::compare(astr, bstr, TransformKeyLength) <= 0;  // Only the key is compared.
+  };
+
+  // All tfields are sorted by alphabetical order of their keys.
+  size_t fieldsLength = fields.length();
+  if (fieldsLength > 1) {
+    if (!fields.growByUninitialized(fieldsLength)) {  // MergeSort scratch space.
+      return false;
+    }
+
+    MOZ_ALWAYS_TRUE(
+        MergeSort(fields.begin(), fieldsLength, fields.begin() + fieldsLength,
+                  [&](const auto& a, const auto& b, bool* lessOrEqualp) {
+                    *lessOrEqualp = tfieldLessOrEqual(a, b);
+                    return true;
+                  }));
+
+    fields.shrinkBy(fieldsLength);  // Drop the scratch space again.
+  }
+
+  Vector<char, 32> sb(cx);  // Receives the rebuilt extension string.
+  if (!sb.append('t')) {
+    return false;
+  }
+
+  // Append the language subtag if present.
+  //
+  // [1] is a bit unclear whether or not the `tlang` subtag also needs to be
+  // canonicalized (and case-adjusted). For now simply append it as is.
+  // (|parseTransformExtension| doesn't alter case from the lowercased form we
+  // have previously taken pains to ensure is present in the extension, so no
+  // special effort is required to ensure lowercasing.) If we switch to [2], the
+  // `tlang` subtag also needs to be canonicalized according to the same rules
+  // as `unicode_language_id` subtags are canonicalized. Also see [3].
+  //
+  // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier
+  // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
+  // [3] https://github.com/tc39/ecma402/issues/330
+  if (tag.language().length() > 0) {
+    if (!sb.append('-')) {
+      return false;
+    }
+    if (!LanguageTagToString(cx, tag, sb)) {
+      return false;
+    }
+  }
+
+  // Append all fields.
+  //
+  // UTS 35, 3.2.1 specifies:
+  // - Any type or tfield value "true" is removed.
+  //
+  // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore
+  // this apparently invalid part of the UTS 35 specification and simply
+  // append all `tfield` subtags.
+  for (const auto& field : fields) {
+    if (!sb.append('-')) {
+      return false;
+    }
+    if (!sb.append(field.begin(extension), field.length())) {
+      return false;
+    }
+  }
+
+  // We can keep the previous extension when canonicalization didn't modify it.
+  if (sb.length() != length ||
+      std::char_traits<char>::compare(sb.begin(), extension, length) != 0) {
+    // Null-terminate the new string and replace the previous extension.
+    if (!sb.append('\0')) {
+      return false;
+    }
+    UniqueChars canonical(sb.extractOrCopyRawBuffer());
+    if (!canonical) {
+      return false;
+    }
+    transformExtension = std::move(canonical);
+  }
+
+  return true;
+}
+
+bool LanguageTag::appendTo(JSContext* cx, StringBuffer& sb) const {
+  // Serialize this tag into |sb| via the shared |LanguageTagToString| helper.
+  const LanguageTag& self = *this;
+  return LanguageTagToString(cx, self, sb);
+}
+
+// Zero-terminated ICU Locale ID. Inline storage fits language, script, and region plus two separators and the terminator.
+using LocaleId =
+    js::Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>;
+
+enum class LikelySubtags : bool { Add, Remove };  // Add: maximize the tag; Remove: minimize it.
+
+// Returns true iff |tag| is already in the requested form: maximized for
+// |LikelySubtags::Add|, minimized for |LikelySubtags::Remove|.
+static bool HasLikelySubtags(LikelySubtags likelySubtags,
+                             const LanguageTag& tag) {
+  // The placeholder language "und" is neither maximized nor minimized.
+  if (tag.language().equalTo("und")) {
+    return false;
+  }
+
+  const bool hasScript = tag.script().length() > 0;
+  const bool hasRegion = tag.region().length() > 0;
+
+  if (likelySubtags == LikelySubtags::Add) {
+    // Maximized: script and region are both present and neither one is a
+    // placeholder ("Zzzz" resp. "ZZ").
+    return hasScript && !tag.script().equalTo("Zzzz") && hasRegion &&
+           !tag.region().equalTo("ZZ");
+  }
+
+  // Minimized: nothing but the (non-placeholder) language subtag.
+  return !hasScript && !hasRegion;
+}
+
+// Builds the zero-terminated ICU locale ID "language[_Script][_REGION]" for
+// |tag| in the initially empty |locale|. Returns false if appending fails.
+static bool CreateLocaleForLikelySubtags(const LanguageTag& tag,
+                                         LocaleId& locale) {
+  MOZ_ASSERT(locale.length() == 0);
+
+  // Appends the characters of a non-empty subtag.
+  auto appendChars = [&locale](const auto& subtag) {
+    auto chars = subtag.range();
+    MOZ_ASSERT(chars.length() > 0);
+    return locale.append(chars.begin().get(), chars.length());
+  };
+
+  // Appends '_' plus the subtag; absent (empty) subtags are skipped.
+  auto appendOptional = [&locale, &appendChars](const auto& subtag) {
+    if (subtag.length() == 0) {
+      return true;
+    }
+    return locale.append('_') && appendChars(subtag);
+  };
+
+  // Language, then optional script and region, then the terminator ICU
+  // expects.
+  return appendChars(tag.language()) && appendOptional(tag.script()) &&
+         appendOptional(tag.region()) && locale.append('\0');
+}
+
+// Assign the language, script, and region subtags from an ICU locale ID.
+//
+// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to
+// retrieve these subtags, but unfortunately these functions are rather slow, so
+// we use our own implementation.
+//
+// |localeId| must be zero-terminated. It is modified in place: ICU's '_'
+// separators are replaced with '-' and a missing language is restored as
+// "und". Returns false if growing |localeId| or parsing the base name fails.
+static bool AssignFromLocaleId(JSContext* cx, LocaleId& localeId,
+                               LanguageTag& tag) {
+  MOZ_ASSERT(localeId.back() == '\0',
+             "Locale ID should be zero-terminated for ICU");
+
+  // Replace the ICU locale ID separator.
+  std::replace(localeId.begin(), localeId.end(), '_', '-');
+
+  // ICU replaces "und" with the empty string, which means "und" becomes "" and
+  // "und-Latn" becomes "-Latn". Handle this case separately.
+  if (localeId[0] == '\0' || localeId[0] == '-') {
+    static constexpr char und[] = "und";
+    const size_t undLength = strlen(und);
+
+    // Insert "und" in front of the locale ID. Only the characters present
+    // before growing (including the terminator) are shifted; using the
+    // post-growth length here would write past the end of the vector.
+    const size_t oldLength = localeId.length();
+    if (!localeId.growBy(undLength)) {
+      return false;
+    }
+    memmove(localeId.begin() + undLength, localeId.begin(), oldLength);
+    memmove(localeId.begin(), und, undLength);
+  }
+
+  // Exclude the trailing zero terminator from the range handed to the parser.
+  ConstCharRange localeRange(localeId.begin(), localeId.length() - 1);
+
+  // Retrieve the language, script, and region subtags from the locale ID, but
+  // ignore any other subtags.
+  LanguageTag localeTag(cx);
+  if (!LanguageTagParser::parseBaseName(cx, localeRange, localeTag)) {
+    return false;
+  }
+
+  tag.setLanguage(localeTag.language());
+  tag.setScript(localeTag.script());
+  tag.setRegion(localeTag.region());
+
+  return true;
+}
+
+// Runs the ICU function |likelySubtagsFn| on the zero-terminated |localeId|
+// and stores its zero-terminated output in the initially empty |result|.
+template <decltype(uloc_addLikelySubtags) likelySubtagsFn>
+static bool CallLikelySubtags(JSContext* cx, const LocaleId& localeId,
+                              LocaleId& result) {
+  // ICU requires zero-terminated input.
+  MOZ_ASSERT(localeId.back() == '\0');
+  MOZ_ASSERT(result.length() == 0);
+
+  auto invokeICU = [&localeId](char* chars, int32_t size, UErrorCode* status) {
+    return likelySubtagsFn(localeId.begin(), chars, size, status);
+  };
+
+  // A negative return value from |CallICU| is treated as failure.
+  const int32_t written = intl::CallICU(cx, result, invokeICU);
+  if (written < 0) {
+    return false;
+  }
+
+  MOZ_ASSERT(
+      size_t(written) <= LocaleId::InlineLength,
+      "Unexpected extra subtags were added by ICU. If this assertion ever "
+      "fails, simply remove it and move on like nothing ever happended.");
+
+  // Trim the vector to the string ICU produced, then add the terminator
+  // needed for later ICU calls.
+  result.shrinkTo(written);
+  return result.append('\0');
+}
+
+// The canonical way to compute the Unicode BCP 47 locale identifier with likely
+// subtags is as follows:
+//
+// 1. Call uloc_forLanguageTag() to transform the locale identifier into an ICU
+//    locale ID.
+// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID.
+// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into
+//    a Unicode BCP 47 locale identifier.
+//
+// uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow. Since
+// the input is known, by construction, to contain only valid language, script,
+// and region subtags, both conversions are done by hand instead, see
+// CreateLocaleForLikelySubtags() and AssignFromLocaleId(). (Where "slow" means
+// about 50% of the execution time of |Intl.Locale.prototype.maximize|.)
+static bool LikelySubtags(JSContext* cx, LikelySubtags likelySubtags,
+                          LanguageTag& tag) {
+  // Nothing to do when the tag already is maximized/minimized.
+  if (HasLikelySubtags(likelySubtags, tag)) {
+    return true;
+  }
+
+  // ICU locale ID corresponding to the input tag.
+  LocaleId input(cx);
+  if (!CreateLocaleForLikelySubtags(tag, input)) {
+    return false;
+  }
+
+  // UTS #35 requires that a locale ID is maximized before its likely subtags
+  // are removed, so uloc_addLikelySubtags() is needed in both modes.
+  // See <https://ssl.icu-project.org/trac/ticket/10220> and
+  // <https://ssl.icu-project.org/trac/ticket/12345>.
+  //
+  // When minimizing, maximization can be skipped for tags which already are
+  // maximized. (When maximizing, the early return above has established that
+  // likely subtags are missing.)
+  const bool needsMaximize = likelySubtags == LikelySubtags::Add ||
+                             !HasLikelySubtags(LikelySubtags::Add, tag);
+
+  LocaleId output(cx);
+  if (needsMaximize &&
+      !CallLikelySubtags<uloc_addLikelySubtags>(cx, input, output)) {
+    return false;
+  }
+
+  // With the locale successfully maximized, it can now be minimized.
+  if (likelySubtags == LikelySubtags::Remove) {
+    if (needsMaximize) {
+      // Feed the maximized locale ID back in as the new input and start
+      // over with an empty output vector.
+      input = std::move(output);
+      output = LocaleId(cx);
+    }
+
+    if (!CallLikelySubtags<uloc_minimizeSubtags>(cx, input, output)) {
+      return false;
+    }
+  }
+
+  // Copy language, script, and region from ICU's result into |tag|.
+  if (!AssignFromLocaleId(cx, output, tag)) {
+    return false;
+  }
+
+  // ICU may have returned a non-canonical locale, so update the mappings.
+  return tag.canonicalizeBaseName(cx);
+}
+
+bool LanguageTag::addLikelySubtags(JSContext* cx) {
+  // Maximize this tag in place.
+  LanguageTag& self = *this;
+  return LikelySubtags(cx, LikelySubtags::Add, self);
+}
+
+bool LanguageTag::removeLikelySubtags(JSContext* cx) {
+  // Minimize this tag in place.
+  LanguageTag& self = *this;
+  return LikelySubtags(cx, LikelySubtags::Remove, self);
+}
+
+LanguageTagParser::Token LanguageTagParser::nextToken() {
+  MOZ_ASSERT(index_ <= length_ + 1, "called after 'None' token was read");
+
+  const size_t start = index_;
+  size_t end = start;
+  TokenKind kind = TokenKind::None;
+
+  // Consume characters up to the next separator or the end of the input.
+  while (end < length_) {
+    char16_t c = charAtUnchecked(end);
+
+    if (c == '-') {
+      // A separator is only valid between two subtags: it may neither start
+      // a token nor be the last character of the input.
+      if (end == start || end + 1 >= length_) {
+        return {TokenKind::Error, 0, 0};
+      }
+      break;
+    }
+
+    // UTS 35, section 3.1.
+    // alpha = [A-Z a-z] ;
+    // digit = [0-9] ;
+    if (mozilla::IsAsciiAlpha(c)) {
+      kind |= TokenKind::Alpha;
+    } else if (mozilla::IsAsciiDigit(c)) {
+      kind |= TokenKind::Digit;
+    } else {
+      return {TokenKind::Error, 0, 0};
+    }
+    end++;
+  }
+
+  // Continue after the separator (or one past the end) on the next call.
+  const size_t tokenLength = end - start;
+  Token token{kind, start, tokenLength};
+  index_ = end + 1;
+  return token;
+}
+
+UniqueChars LanguageTagParser::chars(JSContext* cx, size_t index,
+                                     size_t length) const {
+  // Reserve one extra element for the null terminator.
+  auto result = cx->make_pod_array<char>(length + 1);
+  if (!result) {
+    return result;
+  }
+
+  // Copy |length| characters starting at |index| from whichever character
+  // representation the parser currently holds.
+  char* out = result.get();
+  if (locale_.is<const JS::Latin1Char*>()) {
+    const JS::Latin1Char* source = locale_.as<const JS::Latin1Char*>() + index;
+    std::copy_n(source, length, out);
+  } else {
+    const char16_t* source = locale_.as<const char16_t*>() + index;
+    std::copy_n(source, length, out);
+  }
+  out[length] = '\0';
+
+  return result;
+}
+
+UniqueChars LanguageTagParser::extension(JSContext* cx, const Token& start,
+                                         const Token& end) const {
+  MOZ_ASSERT(start.index() < end.index());
+
+  // The extension spans from |start| up to, but not including, the separator
+  // that precedes |end|.
+  const size_t begin = start.index();
+  const size_t length = end.index() - 1 - begin;
+
+  UniqueChars result = chars(cx, begin, length);
+  if (!result) {
+    return result;
+  }
+
+  // Lower-case the copy in place.
+  AsciiToLowerCase(result.get(), length, result.get());
+  return result;
+}
+
+// Parse the `unicode_language_id` production.
+//
+// unicode_language_id = unicode_language_subtag
+// (sep unicode_script_subtag)?
+// (sep unicode_region_subtag)?
+// (sep unicode_variant_subtag)* ;
+//
+// sep = "-"
+//
+// Note: Unicode CLDR locale identifier backward compatibility extensions
+// removed from `unicode_language_id`.
+//
+// |tok| is the current token from |ts|.
+//
+// The trailing |parseType| argument corresponds to one of two modes.
+//
+// In the |BaseNameParsing::Normal| mode, our input is in unknown case and is
+// potentially invalid. |tag| will be filled with canonically-cased output, and
+// duplicate variants will lead to an error.
+//
+// In the |BaseNameParsing::WithinTransformExtension| mode, our input is the
+// `tlang` in a lowercased `transform_extensions`. |tag| subtags will be
+// directly copied from the input (i.e. in lowercase). Variant subtags in the
+// `tlang` subtag may contain duplicates.
+//
+// Do not use this function directly: use |parseBaseName| or
+// |parseTlangFromTransformExtension| instead.
+JS::Result<bool> LanguageTagParser::internalParseBaseName(
+ JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok,
+ BaseNameParsing parseType) {
+#ifdef DEBUG  // The two lambdas below are only used inside MOZ_ASSERT checks.
+ auto isAsciiLowerCase = [](const auto& range) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ const char* ptr = range.begin().get();
+ size_t length = range.length();
+ return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<char>);
+ };
+ auto isAsciiDigit = [](const auto& range) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ const char* ptr = range.begin().get();
+ size_t length = range.length();
+ return std::all_of(ptr, ptr + length, mozilla::IsAsciiDigit<char>);
+ };
+#endif
+
+ if (ts.isLanguage(tok)) {
+ ts.copyChars(tok, tag.language_);
+
+ // Language codes need to be in lower case. "JA" -> "ja"
+ if (parseType == BaseNameParsing::Normal) {
+ tag.language_.toLowerCase();
+ } else {
+ MOZ_ASSERT(isAsciiLowerCase(tag.language_.range()));
+ }
+
+ tok = ts.nextToken();
+ } else {
+ MOZ_ASSERT(parseType == BaseNameParsing::Normal);
+
+ // The language subtag is mandatory. Returning |false| here is a parse failure, not an error result.
+ return false;
+ }
+
+ if (ts.isScript(tok)) {  // Optional script subtag.
+ ts.copyChars(tok, tag.script_);
+
+ // The first character of a script code needs to be capitalized.
+ // "hans" -> "Hans"
+ if (parseType == BaseNameParsing::Normal) {
+ tag.script_.toTitleCase();
+ } else {
+ MOZ_ASSERT(isAsciiLowerCase(tag.script_.range()));
+ }
+
+ tok = ts.nextToken();
+ }
+
+ if (ts.isRegion(tok)) {  // Optional region subtag.
+ ts.copyChars(tok, tag.region_);
+
+ // Region codes need to be in upper case. "bu" -> "BU"
+ if (parseType == BaseNameParsing::Normal) {
+ tag.region_.toUpperCase();
+ } else {
+ MOZ_ASSERT_IF(tok.length() == 2, isAsciiLowerCase(tag.region_.range()));
+ MOZ_ASSERT_IF(tok.length() == 3, isAsciiDigit(tag.region_.range()));
+ }
+
+ tok = ts.nextToken();
+ }
+
+ auto& variants = tag.variants_;
+ MOZ_ASSERT(variants.length() == 0);  // |tag| must not already hold variants.
+ while (ts.isVariant(tok)) {
+ auto variant = ts.chars(cx, tok);  // NUL-terminated copy of the token.
+ if (!variant) {
+ return cx->alreadyReportedOOM();
+ }
+
+ if (parseType == BaseNameParsing::Normal) {
+ // Locale identifiers are case insensitive (UTS 35, section 3.2).
+ // All seen variants are compared ignoring case differences by using the
+ // lower case form. This allows to properly detect and reject variant
+ // repetitions with differing case, e.g. "en-variant-Variant".
+ AsciiToLowerCase(variant.get(), tok.length(), variant.get());
+
+ // Reject the Locale identifier if a duplicate variant was found.
+ //
+ // This linear-time verification step means the whole variant subtag
+ // checking is potentially quadratic. Language tags are unlikely to be
+ // deliberately pathological, so this is okay at least for now.
+ for (const auto& seenVariant : variants) {
+ if (strcmp(variant.get(), seenVariant.get()) == 0) {
+ return false;
+ }
+ }
+ } else {
+ // When parsing variants in a `tlang` subtag, duplicates are allowed.
+ }
+
+ if (!variants.append(std::move(variant))) {
+ return cx->alreadyReportedOOM();
+ }
+
+ tok = ts.nextToken();
+ }
+
+ return true;  // |tok| now holds the first token after the base name.
+}
+
+static mozilla::Variant<const Latin1Char*, const char16_t*> StringChars(
+    const char* locale) {
+  // View the narrow string as Latin-1 characters and wrap the pointer in the
+  // variant used by the parser.
+  const auto* latin1 = reinterpret_cast<const JS::Latin1Char*>(locale);
+  return mozilla::AsVariant(latin1);
+}
+
+static mozilla::Variant<const Latin1Char*, const char16_t*> StringChars(
+    JSLinearString* linear, JS::AutoCheckCannotGC& nogc) {
+  // Wrap whichever character pointer |linear| stores in the variant used by
+  // the parser.
+  if (!linear->hasLatin1Chars()) {
+    return mozilla::AsVariant(linear->twoByteChars(nogc));
+  }
+  return mozilla::AsVariant(linear->latin1Chars(nogc));
+}
+
+JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
+ JSLinearString* locale,
+ LanguageTag& tag) {
+ JS::AutoCheckCannotGC nogc;
+ LocaleChars localeChars = StringChars(locale, nogc);
+
+ // unicode_locale_id = unicode_language_id
+ // extensions*
+ // pu_extensions? ;
+
+ LanguageTagParser ts(localeChars, locale->length());
+ Token tok = ts.nextToken();
+
+ bool ok;
+ MOZ_TRY_VAR(ok, parseBaseName(cx, ts, tag, tok));
+ if (!ok) {
+ return false;  // Parse failure; |parse| turns this into an invalid-language-tag error.
+ }
+
+ // extensions = unicode_locale_extensions
+ // | transformed_extensions
+ // | other_extensions ;
+
+ // Bit set of seen singletons; a repeated singleton is rejected below.
+ uint64_t seenSingletons = 0;
+
+ auto& extensions = tag.extensions_;
+ while (ts.isExtensionStart(tok)) {
+ char singleton = ts.singletonKey(tok);
+
+ // Reject the input if a duplicate singleton was found.
+ uint64_t hash = 1ULL << (mozilla::AsciiAlphanumericToNumber(singleton) + 1);
+ if (seenSingletons & hash) {
+ return false;
+ }
+ seenSingletons |= hash;
+
+ Token start = tok;  // Remember the singleton token; the extension string starts here.
+ tok = ts.nextToken();
+
+ // We'll check for missing non-singleton subtags after this block by
+ // comparing |startValue| with the then-current position.
+ size_t startValue = tok.index();
+
+ if (singleton == 'u') {
+ while (ts.isUnicodeExtensionPart(tok)) {
+ tok = ts.nextToken();
+ }
+ } else if (singleton == 't') {
+ // transformed_extensions = sep [tT]
+ // ((sep tlang (sep tfield)*)
+ // | (sep tfield)+) ;
+
+ // tlang = unicode_language_subtag
+ // (sep unicode_script_subtag)?
+ // (sep unicode_region_subtag)?
+ // (sep unicode_variant_subtag)* ;
+ if (ts.isLanguage(tok)) {  // Only validates the `tlang` structure; nothing is stored here.
+ tok = ts.nextToken();
+
+ if (ts.isScript(tok)) {
+ tok = ts.nextToken();
+ }
+
+ if (ts.isRegion(tok)) {
+ tok = ts.nextToken();
+ }
+
+ while (ts.isVariant(tok)) {
+ tok = ts.nextToken();
+ }
+ }
+
+ // tfield = tkey tvalue;
+ while (ts.isTransformExtensionKey(tok)) {
+ tok = ts.nextToken();
+
+ size_t startTValue = tok.index();
+ while (ts.isTransformExtensionPart(tok)) {
+ tok = ts.nextToken();
+ }
+
+ // `tfield` requires at least one `tvalue`.
+ if (tok.index() <= startTValue) {
+ return false;
+ }
+ }
+ } else {
+ while (ts.isOtherExtensionPart(tok)) {
+ tok = ts.nextToken();
+ }
+ }
+
+ // Singletons must be followed by a non-singleton subtag, "en-a-b" is not
+ // allowed.
+ if (tok.index() <= startValue) {
+ return false;
+ }
+
+ UniqueChars extension = ts.extension(cx, start, tok);  // Lower-cased copy of the whole extension.
+ if (!extension) {
+ return cx->alreadyReportedOOM();
+ }
+ if (!extensions.append(std::move(extension))) {
+ return cx->alreadyReportedOOM();
+ }
+ }
+
+ // Trailing `pu_extension` component of the `unicode_locale_id` production.
+ if (ts.isPrivateUseStart(tok)) {
+ Token start = tok;
+ tok = ts.nextToken();
+
+ size_t startValue = tok.index();
+ while (ts.isPrivateUsePart(tok)) {
+ tok = ts.nextToken();
+ }
+
+ // There must be at least one subtag after the "-x-".
+ if (tok.index() <= startValue) {
+ return false;
+ }
+
+ UniqueChars privateUse = ts.extension(cx, start, tok);
+ if (!privateUse) {
+ return cx->alreadyReportedOOM();
+ }
+ tag.privateuse_ = std::move(privateUse);
+ }
+
+ // Return true if the complete input was successfully parsed. Leftover or malformed tokens yield false.
+ return tok.isNone();
+}
+
+bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale,
+                              LanguageTag& tag) {
+  bool parsed;
+  JS_TRY_VAR_OR_RETURN_FALSE(cx, parsed, tryParse(cx, locale, tag));
+  if (!parsed) {
+    // The input is not a valid locale identifier: report it, provided the
+    // UTF-8 copy needed for the message could be created.
+    UniqueChars utf8 = StringToNewUTF8CharsZ(cx, *locale);
+    if (utf8) {
+      JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr,
+                               JSMSG_INVALID_LANGUAGE_TAG, utf8.get());
+    }
+    return false;
+  }
+  return true;
+}
+
+bool LanguageTagParser::parseBaseName(JSContext* cx, ConstCharRange locale,
+                                      LanguageTag& tag) {
+  const char* input = locale.begin().get();
+  const size_t inputLength = locale.length();
+
+  LocaleChars localeChars = StringChars(input);
+  LanguageTagParser ts(localeChars, inputLength);
+  Token tok = ts.nextToken();
+
+  // Only the base-name part is parsed; trailing characters are ignored.
+  bool parsed;
+  JS_TRY_VAR_OR_RETURN_FALSE(cx, parsed, parseBaseName(cx, ts, tag, tok));
+  if (parsed) {
+    return true;
+  }
+
+  // Parsing failed: report the offending input, or an out-of-memory error if
+  // the copy needed for the message can't be allocated.
+  UniqueChars copy = DuplicateString(input, inputLength);
+  if (!copy) {
+    JS_ReportOutOfMemory(cx);
+    return false;
+  }
+  JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr,
+                           JSMSG_INVALID_LANGUAGE_TAG, copy.get());
+  return false;
+}
+
+// Parse |extension|, which must be a valid `transformed_extensions` subtag, and
+// fill |tag| and |fields| from the `tlang` and `tfield` components.
+JS::Result<bool> LanguageTagParser::parseTransformExtension(
+ JSContext* cx, ConstCharRange extension, LanguageTag& tag,
+ TFieldVector& fields) {
+ LocaleChars extensionChars = StringChars(extension.begin().get());
+ LanguageTagParser ts(extensionChars, extension.length());
+ Token tok = ts.nextToken();
+
+ if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 't') {  // Must start with the "t" singleton.
+ return false;
+ }
+
+ tok = ts.nextToken();
+
+ if (tok.isNone()) {  // A bare "t" without any content is rejected.
+ return false;
+ }
+
+ if (ts.isLanguage(tok)) {
+ // We're parsing a possible `tlang` in a known-valid transform extension, so
+ // use the special-purpose function that takes advantage of this to compute
+ // lowercased |tag| contents in an optimal manner.
+ MOZ_TRY(parseTlangInTransformExtension(cx, ts, tag, tok));
+
+ // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end
+ // of the transform extension.
+ MOZ_ASSERT(ts.isTransformExtensionKey(tok) || tok.isNone());
+ } else {
+ // If there's no `tlang` subtag, at least one `tfield` must be present.
+ MOZ_ASSERT(ts.isTransformExtensionKey(tok));
+ }
+
+ // Trailing `tfield` subtags. (Any other trailing subtags are an error,
+ // because we're guaranteed to only see a valid transform extension here.)
+ while (ts.isTransformExtensionKey(tok)) {
+ size_t begin = tok.index();  // Start of the `tkey`.
+ tok = ts.nextToken();
+
+ size_t startTValue = tok.index();
+ while (ts.isTransformExtensionPart(tok)) {
+ tok = ts.nextToken();
+ }
+
+ // `tfield` requires at least one `tvalue`.
+ if (tok.index() <= startTValue) {
+ return false;
+ }
+
+ size_t length = tok.index() - 1 - begin;  // -1 drops the separator in front of |tok|.
+ if (!fields.emplaceBack(begin, length)) {  // Recorded as (offset, length) into |extension|.
+ return cx->alreadyReportedOOM();
+ }
+ }
+
+ // Return true if the complete input was successfully parsed.
+ return tok.isNone();
+}
+
+// Parse |extension|, which must be a valid `unicode_locale_extensions` subtag,
+// and fill |attributes| and |keywords| from the `attribute` and `keyword`
+// components.
+JS::Result<bool> LanguageTagParser::parseUnicodeExtension(
+ JSContext* cx, ConstCharRange extension, AttributesVector& attributes,
+ KeywordsVector& keywords) {
+ LocaleChars extensionChars = StringChars(extension.begin().get());
+ LanguageTagParser ts(extensionChars, extension.length());
+ Token tok = ts.nextToken();
+
+ // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
+ // (sep attribute)+ (sep keyword)*) ;
+
+ if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') {  // Must start with the "u" singleton.
+ return false;
+ }
+
+ tok = ts.nextToken();
+
+ if (tok.isNone()) {  // A bare "u" without any content is rejected.
+ return false;
+ }
+
+ while (ts.isUnicodeExtensionAttribute(tok)) {
+ if (!attributes.emplaceBack(tok.index(), tok.length())) {  // Recorded as (offset, length) into |extension|.
+ return cx->alreadyReportedOOM();
+ }
+
+ tok = ts.nextToken();
+ }
+
+ // keyword = key (sep type)? ;
+ while (ts.isUnicodeExtensionKey(tok)) {
+ size_t begin = tok.index();  // Start of the `key`.
+ tok = ts.nextToken();
+
+ while (ts.isUnicodeExtensionType(tok)) {
+ tok = ts.nextToken();
+ }
+
+ if (tok.isError()) {  // Malformed input after the key or type subtags.
+ return false;
+ }
+
+ size_t length = tok.index() - 1 - begin;  // -1 drops the separator in front of |tok|.
+ if (!keywords.emplaceBack(begin, length)) {
+ return cx->alreadyReportedOOM();
+ }
+ }
+
+ // Return true if the complete input was successfully parsed.
+ return tok.isNone();
+}
+
+bool LanguageTagParser::canParseUnicodeExtension(ConstCharRange extension) {
+  LocaleChars input = StringChars(extension.begin().get());
+  LanguageTagParser parser(input, extension.length());
+
+  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
+  //                                       (sep attribute)+ (sep keyword)*) ;
+
+  // The extension has to open with the "u" singleton...
+  Token current = parser.nextToken();
+  bool startsWithU =
+      parser.isExtensionStart(current) && parser.singletonKey(current) == 'u';
+  if (!startsWithU) {
+    return false;
+  }
+
+  // ...and must be followed by at least one more subtag.
+  current = parser.nextToken();
+  if (current.isNone()) {
+    return false;
+  }
+
+  // Skip over all leading attributes.
+  for (; parser.isUnicodeExtensionAttribute(current);
+       current = parser.nextToken()) {
+  }
+
+  // keyword = key (sep type)? ;
+  while (parser.isUnicodeExtensionKey(current)) {
+    // Step past the key, then past every type subtag that follows it.
+    do {
+      current = parser.nextToken();
+    } while (parser.isUnicodeExtensionType(current));
+
+    if (current.isError()) {
+      return false;
+    }
+  }
+
+  // Succeed only if the whole input was consumed.
+  return current.isNone();
+}
+
+bool LanguageTagParser::canParseUnicodeExtensionType(
+    JSLinearString* unicodeType) {
+  JS::AutoCheckCannotGC nogc;
+  LocaleChars input = StringChars(unicodeType, nogc);
+  LanguageTagParser parser(input, unicodeType->length());
+
+  // Consume every subtag that qualifies as a Unicode extension type.
+  Token current = parser.nextToken();
+  for (; parser.isUnicodeExtensionType(current);
+       current = parser.nextToken()) {
+  }
+
+  // Succeed only if the whole input was consumed.
+  return current.isNone();
+}
+
+bool ParseStandaloneLanguagTag(HandleLinearString str, LanguageSubtag& result) {
+  // A language subtag is 2-3 or 5-8 ASCII letters.
+  auto looksLikeLanguage = [](const auto* chars, size_t len) {
+    // Tell the analysis the |std::all_of| function can't GC.
+    JS::AutoSuppressGCAnalysis nogc;
+
+    using CharT = std::remove_pointer_t<decltype(chars)>;
+    if (len < 2 || len == 4 || len > 8) {
+      return false;
+    }
+    return std::all_of(chars, chars + len, mozilla::IsAsciiAlpha<CharT>);
+  };
+
+  JS::AutoCheckCannotGC nogc;
+  const size_t length = str->length();
+  if (str->hasLatin1Chars()) {
+    if (!looksLikeLanguage(str->latin1Chars(nogc), length)) {
+      return false;
+    }
+    result.set(str->latin1Range(nogc));
+  } else {
+    if (!looksLikeLanguage(str->twoByteChars(nogc), length)) {
+      return false;
+    }
+    result.set(str->twoByteRange(nogc));
+  }
+
+  // Store the subtag in lower case.
+  result.toLowerCase();
+  return true;
+}
+
+bool ParseStandaloneScriptTag(HandleLinearString str, ScriptSubtag& result) {
+  // A script subtag is exactly |ScriptLength| ASCII letters.
+  auto looksLikeScript = [](const auto* chars, size_t len) {
+    // Tell the analysis the |std::all_of| function can't GC.
+    JS::AutoSuppressGCAnalysis nogc;
+
+    using CharT = std::remove_pointer_t<decltype(chars)>;
+    if (len != ScriptLength) {
+      return false;
+    }
+    return std::all_of(chars, chars + len, mozilla::IsAsciiAlpha<CharT>);
+  };
+
+  JS::AutoCheckCannotGC nogc;
+  const size_t length = str->length();
+  if (str->hasLatin1Chars()) {
+    if (!looksLikeScript(str->latin1Chars(nogc), length)) {
+      return false;
+    }
+    result.set(str->latin1Range(nogc));
+  } else {
+    if (!looksLikeScript(str->twoByteChars(nogc), length)) {
+      return false;
+    }
+    result.set(str->twoByteRange(nogc));
+  }
+
+  // Store the subtag in title case.
+  result.toTitleCase();
+  return true;
+}
+
+bool ParseStandaloneRegionTag(HandleLinearString str, RegionSubtag& result) {
+  // A region subtag is either |AlphaRegionLength| ASCII letters or
+  // |DigitRegionLength| ASCII digits.
+  auto looksLikeRegion = [](const auto* chars, size_t len) {
+    // Tell the analysis the |std::all_of| function can't GC.
+    JS::AutoSuppressGCAnalysis nogc;
+
+    using CharT = std::remove_pointer_t<decltype(chars)>;
+    if (len == AlphaRegionLength) {
+      return std::all_of(chars, chars + len, mozilla::IsAsciiAlpha<CharT>);
+    }
+    if (len == DigitRegionLength) {
+      return std::all_of(chars, chars + len, mozilla::IsAsciiDigit<CharT>);
+    }
+    return false;
+  };
+
+  JS::AutoCheckCannotGC nogc;
+  const size_t length = str->length();
+  if (str->hasLatin1Chars()) {
+    if (!looksLikeRegion(str->latin1Chars(nogc), length)) {
+      return false;
+    }
+    result.set(str->latin1Range(nogc));
+  } else {
+    if (!looksLikeRegion(str->twoByteChars(nogc), length)) {
+      return false;
+    }
+    result.set(str->twoByteRange(nogc));
+  }
+
+  // Store the subtag in upper case.
+  result.toUpperCase();
+  return true;
+}
+
+template <typename CharT>
+static bool IsAsciiLowercaseAlpha(const mozilla::Range<const CharT>& range) {
+  // Tell the analysis the |std::all_of| function can't GC.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // True when every character in |range| is an ASCII lower-case letter.
+  const CharT* first = range.begin().get();
+  const CharT* last = first + range.length();
+  return std::all_of(first, last, mozilla::IsAsciiLowercaseAlpha<CharT>);
+}
+
+static bool IsAsciiLowercaseAlpha(JSLinearString* str) {
+  // Dispatch on the string's character representation.
+  JS::AutoCheckCannotGC nogc;
+  if (str->hasLatin1Chars()) {
+    return IsAsciiLowercaseAlpha(str->latin1Range(nogc));
+  }
+  return IsAsciiLowercaseAlpha(str->twoByteRange(nogc));
+}
+
+template <typename CharT>
+static bool IsAsciiAlpha(const mozilla::Range<const CharT>& range) {
+  // Tell the analysis the |std::all_of| function can't GC.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // True when every character in |range| is an ASCII letter.
+  const CharT* first = range.begin().get();
+  const CharT* last = first + range.length();
+  return std::all_of(first, last, mozilla::IsAsciiAlpha<CharT>);
+}
+
+static bool IsAsciiAlpha(JSLinearString* str) {
+  // Dispatch on the string's character representation.
+  JS::AutoCheckCannotGC nogc;
+  if (str->hasLatin1Chars()) {
+    return IsAsciiAlpha(str->latin1Range(nogc));
+  }
+  return IsAsciiAlpha(str->twoByteRange(nogc));
+}
+
+JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx,
+                                                       HandleLinearString str) {
+  // ISO-639 language codes are two or three characters long.
+  const size_t length = str->length();
+  if (length < 2 || length > 3) {
+    return nullptr;
+  }
+
+  // An input that is already lower case can be returned directly further
+  // down. Anything else must at least consist of ASCII letters only.
+  const bool alreadyLowerCase = IsAsciiLowercaseAlpha(str);
+  if (!alreadyLowerCase && !IsAsciiAlpha(str)) {
+    return nullptr;
+  }
+
+  LanguageSubtag subtag;
+  {
+    JS::AutoCheckCannotGC nogc;
+    if (str->hasLatin1Chars()) {
+      subtag.set(str->latin1Range(nogc));
+    } else {
+      subtag.set(str->twoByteRange(nogc));
+    }
+  }
+
+  // The language subtag is canonicalized to lower case.
+  if (!alreadyLowerCase) {
+    subtag.toLowerCase();
+  }
+
+  // Reject the input if the canonical tag contains more than just a single
+  // language subtag.
+  if (LanguageTag::complexLanguageMapping(subtag)) {
+    return nullptr;
+  }
+
+  // Take care to replace deprecated subtags with their preferred values. The
+  // mapping is applied unconditionally; a fresh string is needed whenever it
+  // reports a change or the input had to be lower-cased.
+  bool needsNewString = LanguageTag::languageMapping(subtag);
+  if (!alreadyLowerCase) {
+    needsNewString = true;
+  }
+
+  JSString* result;
+  if (needsNewString) {
+    auto chars = subtag.range();
+    result = NewStringCopyN<CanGC>(cx, chars.begin().get(), chars.length());
+  } else {
+    result = str;
+  }
+  if (!result) {
+    return cx->alreadyReportedOOM();
+  }
+  return result;
+}
+
+} // namespace intl
+} // namespace js
diff --git a/js/src/builtin/intl/LanguageTag.h b/js/src/builtin/intl/LanguageTag.h
new file mode 100644
index 0000000000..5f190757b8
--- /dev/null
+++ b/js/src/builtin/intl/LanguageTag.h
@@ -0,0 +1,722 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Structured representation of Unicode locale IDs used with Intl functions. */
+
+#ifndef builtin_intl_LanguageTag_h
+#define builtin_intl_LanguageTag_h
+
+#include "mozilla/Assertions.h"
+#include "mozilla/Range.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/TypedEnumBits.h"
+#include "mozilla/Variant.h"
+
+#include <algorithm>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <utility>
+
+#include "jsalloc.h"
+#include "js/Result.h"
+
+#include "js/GCAPI.h"
+#include "js/Utility.h"
+#include "js/Vector.h"
+
+struct JSContext;
+class JSLinearString;
+class JSString;
+
+namespace js {
+
+class StringBuffer;
+
+namespace intl {
+
+#ifdef DEBUG  // The validators below are declared for debug builds only; this header calls them solely from MOZ_ASSERT.
+
+/**
+ * Return true if |language| is a valid, case-normalized language subtag.
+ */
+template <typename CharT>
+bool IsStructurallyValidLanguageTag(
+ const mozilla::Range<const CharT>& language);
+
+/**
+ * Return true if |script| is a valid, case-normalized script subtag.
+ */
+template <typename CharT>
+bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script);
+
+/**
+ * Return true if |region| is a valid, case-normalized region subtag.
+ */
+template <typename CharT>
+bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region);
+
+/**
+ * Return true if |variant| is a valid, case-normalized variant subtag.
+ */
+bool IsStructurallyValidVariantTag(const mozilla::Range<const char>& variant);
+
+/**
+ * Return true if |extension| is a valid, case-normalized Unicode extension
+ * subtag.
+ */
+bool IsStructurallyValidUnicodeExtensionTag(
+ const mozilla::Range<const char>& extension);
+
+/**
+ * Return true if |privateUse| is a valid, case-normalized private-use subtag.
+ */
+bool IsStructurallyValidPrivateUseTag(
+ const mozilla::Range<const char>& privateUse);
+
+#endif  // DEBUG
+
+// Convert a single ASCII character to lower case; characters other than
+// upper-case ASCII letters are returned unchanged.
+template <typename CharT>
+char AsciiToLowerCase(CharT c) {
+  MOZ_ASSERT(mozilla::IsAscii(c));
+  if (mozilla::IsAsciiUppercaseAlpha(c)) {
+    // Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z'.
+    return c | 0x20;
+  }
+  return c;
+}
+
+// Convert a single ASCII character to upper case; characters other than
+// lower-case ASCII letters are returned unchanged.
+template <typename CharT>
+char AsciiToUpperCase(CharT c) {
+  MOZ_ASSERT(mozilla::IsAscii(c));
+  if (mozilla::IsAsciiLowercaseAlpha(c)) {
+    // Clearing bit 0x20 maps 'a'-'z' onto 'A'-'Z'.
+    return c & ~0x20;
+  }
+  return c;
+}
+
+// Write the lower-cased form of |length| characters from |chars| to |dest|.
+// |dest| may be the same buffer as |chars|.
+template <typename CharT>
+void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
+  // Tell the analysis the |std::transform| function can't GC.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // Select the single-character overload explicitly.
+  char (*toLower)(CharT) = AsciiToLowerCase;
+  CharT* end = chars + length;
+  std::transform(chars, end, dest, toLower);
+}
+
+// Write the upper-cased form of |length| characters from |chars| to |dest|.
+// |dest| may be the same buffer as |chars|.
+template <typename CharT>
+void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
+  // Tell the analysis the |std::transform| function can't GC.
+  JS::AutoSuppressGCAnalysis nogc;
+
+  // Select the single-character overload explicitly.
+  char (*toUpper)(CharT) = AsciiToUpperCase;
+  CharT* end = chars + length;
+  std::transform(chars, end, dest, toUpper);
+}
+
+// Write the title-cased form of |chars| to |dest|: the first character in
+// upper case, all remaining characters in lower case.
+template <typename CharT>
+void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
+  if (length == 0) {
+    // Nothing to convert.
+    return;
+  }
+  AsciiToUpperCase(chars, 1, dest);
+  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
+}
+
+// Constants for language subtag lengths.
+namespace LanguageTagLimits {
+
+// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+static constexpr size_t LanguageLength = 8;  // Longest form allowed by the production above.
+
+// unicode_script_subtag = alpha{4} ;
+static constexpr size_t ScriptLength = 4;
+
+// unicode_region_subtag = (alpha{2} | digit{3}) ;
+static constexpr size_t RegionLength = 3;  // Longer of the two region forms.
+static constexpr size_t AlphaRegionLength = 2;
+static constexpr size_t DigitRegionLength = 3;
+
+// key = alphanum alpha ;
+static constexpr size_t UnicodeKeyLength = 2;
+
+// tkey = alpha digit ;
+static constexpr size_t TransformKeyLength = 2;
+
+} // namespace LanguageTagLimits
+
+// Fixed-capacity subtag whose characters are stored inline in LanguageTag.
+// |Length| is the maximum number of characters the subtag can hold.
+template <size_t Length>
+class LanguageTagSubtag final {
+  // Number of characters of |chars_| currently in use.
+  uint8_t length_ = 0;
+
+  // Character storage; not NUL-terminated.
+  char chars_[Length];
+
+ public:
+  LanguageTagSubtag() = default;
+
+  // Subtags are neither copyable nor assignable.
+  LanguageTagSubtag(const LanguageTagSubtag&) = delete;
+  LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;
+
+  // Number of characters currently stored.
+  size_t length() const { return length_; }
+
+  // View of the stored characters.
+  mozilla::Range<const char> range() const {
+    return mozilla::Range<const char>(chars_, length_);
+  }
+
+  // Replace the contents with the characters of |str|, which must fit into
+  // the inline storage.
+  template <typename CharT>
+  void set(const mozilla::Range<const CharT>& str) {
+    const size_t count = str.length();
+    MOZ_ASSERT(count <= Length);
+    std::copy_n(str.begin().get(), count, chars_);
+    length_ = count;
+  }
+
+  // In-place ASCII case conversions of the stored characters.
+  void toLowerCase() { AsciiToLowerCase(chars_, length(), chars_); }
+
+  void toUpperCase() { AsciiToUpperCase(chars_, length(), chars_); }
+
+  void toTitleCase() { AsciiToTitleCase(chars_, length(), chars_); }
+
+  // Compare the stored characters against a string literal.
+  template <size_t N>
+  bool equalTo(const char (&str)[N]) const {
+    static_assert(N - 1 <= Length,
+                  "subtag literals must not exceed the maximum subtag length");
+
+    // |N| counts the literal's terminating NUL, which isn't stored here.
+    constexpr size_t literalLength = N - 1;
+    if (length_ != literalLength) {
+      return false;
+    }
+    return memcmp(chars_, str, literalLength) == 0;
+  }
+};
+
+using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
+using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
+using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
+
+/**
+ * Object representing a language tag.
+ *
+ * All subtags are already in canonicalized case.
+ */
+class MOZ_STACK_CLASS LanguageTag final {
+ LanguageSubtag language_ = {};  // Base-name subtags are stored inline.
+ ScriptSubtag script_ = {};
+ RegionSubtag region_ = {};
+
+ using VariantsVector = Vector<JS::UniqueChars, 2>;
+ using ExtensionsVector = Vector<JS::UniqueChars, 2>;
+
+ VariantsVector variants_;  // One owned string per variant subtag.
+ ExtensionsVector extensions_;  // One owned string per extension, starting at its singleton.
+ JS::UniqueChars privateuse_ = nullptr;  // Null when no private-use subtag is present.
+
+ friend class LanguageTagParser;  // The parser fills the private fields directly.
+
+ public:
+ // Flag to request canonicalized Unicode extensions.
+ enum class UnicodeExtensionCanonicalForm : bool { No, Yes };
+
+ private:
+ bool canonicalizeUnicodeExtension(
+ JSContext* cx, JS::UniqueChars& unicodeExtension,
+ UnicodeExtensionCanonicalForm canonicalForm);
+
+ bool canonicalizeTransformExtension(JSContext* cx,
+ JS::UniqueChars& transformExtension);
+
+ public:
+ static bool languageMapping(LanguageSubtag& language);  // NOTE(review): takes a mutable reference; presumably returns true when |language| was replaced -- confirm in the generated mappings.
+ static bool complexLanguageMapping(const LanguageSubtag& language);
+
+ private:
+ static bool regionMapping(RegionSubtag& region);
+ static bool complexRegionMapping(const RegionSubtag& region);
+
+ void performComplexLanguageMappings();
+ void performComplexRegionMappings();
+
+ MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx);
+
+ static const char* replaceUnicodeExtensionType(
+ const mozilla::Range<const char>& key,
+ const mozilla::Range<const char>& type);
+
+ public:
+ explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}
+
+ LanguageTag(const LanguageTag&) = delete;
+ LanguageTag& operator=(const LanguageTag&) = delete;
+
+ const LanguageSubtag& language() const { return language_; }
+ const ScriptSubtag& script() const { return script_; }
+ const RegionSubtag& region() const { return region_; }
+ const auto& variants() const { return variants_; }
+ const auto& extensions() const { return extensions_; }
+ const char* privateuse() const { return privateuse_.get(); }
+
+ /**
+ * Set the language subtag. The input must be a valid, case-normalized
+ * language subtag.
+ */
+ template <size_t N>
+ void setLanguage(const char (&language)[N]) {
+ mozilla::Range<const char> range(language, N - 1);  // N - 1 excludes the literal's terminating NUL.
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(range));
+ language_.set(range);
+ }
+
+ /**
+ * Set the language subtag. The input must be a valid, case-normalized
+ * language subtag.
+ */
+ void setLanguage(const LanguageSubtag& language) {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
+ language_.set(language.range());
+ }
+
+ /**
+ * Set the script subtag. The input must be a valid, case-normalized
+ * script subtag or the empty string.
+ */
+ template <size_t N>
+ void setScript(const char (&script)[N]) {
+ mozilla::Range<const char> range(script, N - 1);
+ MOZ_ASSERT(IsStructurallyValidScriptTag(range));
+ script_.set(range);
+ }
+
+ /**
+ * Set the script subtag. The input must be a valid, case-normalized
+ * script subtag or the empty string.
+ */
+ void setScript(const ScriptSubtag& script) {
+ MOZ_ASSERT(script.length() == 0 ||
+ IsStructurallyValidScriptTag(script.range()));
+ script_.set(script.range());
+ }
+
+ /**
+ * Set the region subtag. The input must be a valid, case-normalized
+ * region subtag or the empty string.
+ */
+ template <size_t N>
+ void setRegion(const char (&region)[N]) {
+ mozilla::Range<const char> range(region, N - 1);
+ MOZ_ASSERT(IsStructurallyValidRegionTag(range));
+ region_.set(range);
+ }
+
+ /**
+ * Set the region subtag. The input must be a valid, case-normalized
+ * region subtag or the empty string.
+ */
+ void setRegion(const RegionSubtag& region) {
+ MOZ_ASSERT(region.length() == 0 ||
+ IsStructurallyValidRegionTag(region.range()));
+ region_.set(region.range());
+ }
+
+ /**
+ * Removes all variant subtags.
+ */
+ void clearVariants() { variants_.clearAndFree(); }
+
+ /**
+ * Set the Unicode extension subtag. The input must be a valid,
+ * case-normalized Unicode extension subtag.
+ */
+ bool setUnicodeExtension(JS::UniqueChars extension);
+
+ /**
+ * Set the private-use subtag. The input must be a valid, case-normalized
+ * private-use subtag or the empty string.
+ */
+ void setPrivateuse(JS::UniqueChars privateuse) {
+ MOZ_ASSERT(!privateuse ||
+ IsStructurallyValidPrivateUseTag(
+ {privateuse.get(), strlen(privateuse.get())}));
+ privateuse_ = std::move(privateuse);
+ }
+
+ /**
+ * Canonicalize the base-name subtags, that means the language, script,
+ * region, and variant subtags.
+ */
+ bool canonicalizeBaseName(JSContext* cx);
+
+ /**
+ * Canonicalize all extension subtags.
+ */
+ bool canonicalizeExtensions(JSContext* cx,
+ UnicodeExtensionCanonicalForm canonicalForm);
+
+ /**
+ * Canonicalizes the given structurally valid Unicode BCP 47 locale
+ * identifier, including regularized case of subtags. For example, the
+ * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
+ * where
+ *
+ * Zh ; 2*3ALPHA
+ * -haNS ; ["-" script]
+ * -bu ; ["-" region]
+ * -variant2 ; *("-" variant)
+ * -Variant1
+ * -u-ca-chinese ; *("-" extension)
+ * -t-Zh-laTN
+ * -x-PRIVATE ; ["-" privateuse]
+ *
+ * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
+ *
+ * UTS 35 specifies two different canonicalization algorithms. There's one to
+ * canonicalize BCP 47 language tags and another one to canonicalize Unicode
+ * locale identifiers. The latter one wasn't present when ECMA-402 was changed
+ * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags,
+ * so ECMA-402 currently only uses the former to canonicalize Unicode BCP 47
+ * locale identifiers.
+ *
+ * Spec: ECMAScript Internationalization API Specification, 6.2.3.
+ * Spec:
+ * https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
+ * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion
+ */
+ bool canonicalize(JSContext* cx,
+ UnicodeExtensionCanonicalForm canonicalForm) {
+ return canonicalizeBaseName(cx) &&
+ canonicalizeExtensions(cx, canonicalForm);  // Extensions are only processed if the base name succeeded.
+ }
+
+ /**
+ * Append the string representation of this language tag to the given
+ * string buffer.
+ */
+ bool appendTo(JSContext* cx, StringBuffer& sb) const;
+
+ /**
+ * Add likely-subtags to the language tag.
+ *
+ * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
+ */
+ bool addLikelySubtags(JSContext* cx);
+
+ /**
+ * Remove likely-subtags from the language tag.
+ *
+ * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
+ */
+ bool removeLikelySubtags(JSContext* cx);
+};
+
+/**
+ * Tokenizer and parser for Unicode BCP 47 locale identifiers.
+ *
+ * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
+ */
+class MOZ_STACK_CLASS LanguageTagParser final {
+ public:
+  // Public only so that |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS| can see it.
+  enum class TokenKind : uint8_t {
+    None = 0b000,
+    Alpha = 0b001,
+    Digit = 0b010,
+    AlphaDigit = 0b011,
+    Error = 0b100
+  };
+
+ private:
+  // A classified piece of the input: its kind, start index and length.
+  class Token final {
+    size_t index_;
+    size_t length_;
+    TokenKind kind_;
+
+   public:
+    Token(TokenKind kind, size_t index, size_t length)
+        : index_(index), length_(length), kind_(kind) {}
+
+    TokenKind kind() const { return kind_; }
+    size_t index() const { return index_; }
+    size_t length() const { return length_; }
+
+    bool isError() const { return kind() == TokenKind::Error; }
+    bool isNone() const { return kind() == TokenKind::None; }
+    bool isAlpha() const { return kind() == TokenKind::Alpha; }
+    bool isDigit() const { return kind() == TokenKind::Digit; }
+    bool isAlphaDigit() const { return kind() == TokenKind::AlphaDigit; }
+  };
+
+  using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;
+
+  const LocaleChars& locale_;
+  size_t length_;
+  size_t index_ = 0;
+
+  LanguageTagParser(const LocaleChars& locale, size_t length)
+      : locale_(locale), length_(length) {}
+
+  // Read the code unit at |index| from whichever character representation the
+  // input uses. No ASCII check is performed here.
+  char16_t charAtUnchecked(size_t index) const {
+    if (locale_.is<const char16_t*>()) {
+      return locale_.as<const char16_t*>()[index];
+    }
+    return locale_.as<const JS::Latin1Char*>()[index];
+  }
+
+  // Like |charAtUnchecked|, but asserts that the code unit is ASCII before
+  // narrowing it to |char|.
+  char charAt(size_t index) const {
+    const char16_t unit = charAtUnchecked(index);
+    MOZ_ASSERT(mozilla::IsAscii(unit));
+    return unit;
+  }
+
+  // Store the characters covered by |tok| in |subtag|.
+  template <size_t N>
+  void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
+    const size_t start = tok.index();
+    const size_t count = tok.length();
+    if (locale_.is<const char16_t*>()) {
+      using CharT = const char16_t;
+      subtag.set(mozilla::Range<CharT>(locale_.as<CharT*>() + start, count));
+    } else {
+      using CharT = const JS::Latin1Char;
+      subtag.set(mozilla::Range<CharT>(locale_.as<CharT*>() + start, count));
+    }
+  }
+
+  // Return a newly allocated copy of the |length| characters beginning at
+  // |index|.
+  JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;
+
+  // Return a newly allocated copy of the characters covered by |tok|.
+  JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
+    return chars(cx, tok.index(), tok.length());
+  }
+
+  Token nextToken();
+
+  JS::UniqueChars extension(JSContext* cx, const Token& start,
+                            const Token& end) const;
+
+  // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+  //
+  // A language subtag of exactly four characters is rejected: Unicode BCP 47
+  // locale identifiers don't allow it. See also the comparison to Unicode
+  // CLDR locale identifiers in
+  // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
+  bool isLanguage(const Token& tok) const {
+    if (!tok.isAlpha()) {
+      return false;
+    }
+    const size_t len = tok.length();
+    return (2 <= len && len <= 3) || (5 <= len && len <= 8);
+  }
+
+  // unicode_script_subtag = alpha{4} ;
+  bool isScript(const Token& tok) const {
+    return tok.length() == 4 && tok.isAlpha();
+  }
+
+  // unicode_region_subtag = (alpha{2} | digit{3}) ;
+  bool isRegion(const Token& tok) const {
+    if (tok.isAlpha()) {
+      return tok.length() == 2;
+    }
+    return tok.isDigit() && tok.length() == 3;
+  }
+
+  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
+  bool isVariant(const Token& tok) const {
+    const size_t len = tok.length();
+    if (5 <= len && len <= 8) {
+      return true;
+    }
+    // Four character variants must start with a digit.
+    return len == 4 && mozilla::IsAsciiDigit(charAt(tok.index()));
+  }
+
+  // Return the single character of the singleton token |tok|. Upper case
+  // ASCII letters are returned in their lower case form.
+  char singletonKey(const Token& tok) const {
+    MOZ_ASSERT(tok.length() == 1);
+    const char key = charAt(tok.index());
+    if (mozilla::IsAsciiUppercaseAlpha(key)) {
+      return key | 0x20;
+    }
+    return key;
+  }
+
+  // extensions = unicode_locale_extensions |
+  //              transformed_extensions |
+  //              other_extensions ;
+  //
+  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
+  //                                       (sep attribute)+ (sep keyword)*) ;
+  //
+  // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
+  //                                    (sep tfield)+) ;
+  //
+  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+  bool isExtensionStart(const Token& tok) const {
+    if (tok.length() != 1) {
+      return false;
+    }
+    return singletonKey(tok) != 'x';
+  }
+
+  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+  bool isOtherExtensionPart(const Token& tok) const {
+    const size_t len = tok.length();
+    return len >= 2 && len <= 8;
+  }
+
+  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
+  //                                       (sep attribute)+ (sep keyword)*) ;
+  // keyword = key (sep type)? ;
+  bool isUnicodeExtensionPart(const Token& tok) const {
+    if (isUnicodeExtensionKey(tok)) {
+      return true;
+    }
+    return isUnicodeExtensionType(tok) || isUnicodeExtensionAttribute(tok);
+  }
+
+  // attribute = alphanum{3,8} ;
+  bool isUnicodeExtensionAttribute(const Token& tok) const {
+    const size_t len = tok.length();
+    return len >= 3 && len <= 8;
+  }
+
+  // key = alphanum alpha ;
+  bool isUnicodeExtensionKey(const Token& tok) const {
+    if (tok.length() != 2) {
+      return false;
+    }
+    return mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
+  }
+
+  // type = alphanum{3,8} (sep alphanum{3,8})* ;
+  bool isUnicodeExtensionType(const Token& tok) const {
+    const size_t len = tok.length();
+    return len >= 3 && len <= 8;
+  }
+
+  // tkey = alpha digit ;
+  bool isTransformExtensionKey(const Token& tok) const {
+    if (tok.length() != 2) {
+      return false;
+    }
+    const size_t first = tok.index();
+    return mozilla::IsAsciiAlpha(charAt(first)) &&
+           mozilla::IsAsciiDigit(charAt(first + 1));
+  }
+
+  // tvalue = (sep alphanum{3,8})+ ;
+  bool isTransformExtensionPart(const Token& tok) const {
+    const size_t len = tok.length();
+    return len >= 3 && len <= 8;
+  }
+
+  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+  bool isPrivateUseStart(const Token& tok) const {
+    if (tok.length() != 1) {
+      return false;
+    }
+    return singletonKey(tok) == 'x';
+  }
+
+  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+  bool isPrivateUsePart(const Token& tok) const {
+    const size_t len = tok.length();
+    return len >= 1 && len <= 8;
+  }
+
+  enum class BaseNameParsing : bool { Normal, WithinTransformExtension };
+
+  // Shared implementation behind |parseBaseName| and
+  // |parseTlangInTransformExtension|. Call one of those two instead of using
+  // this function directly!
+  static JS::Result<bool> internalParseBaseName(JSContext* cx,
+                                                LanguageTagParser& ts,
+                                                LanguageTag& tag, Token& tok,
+                                                BaseNameParsing parseType);
+
+  // Parse the `unicode_language_id` production, i.e. the
+  // language/script/region/variants portion of a language tag, into |tag|.
+  // |tag| is filled with canonical-cased components (lowercase language,
+  // titlecase script, uppercase region, lowercased and alphabetized and
+  // deduplicated variants). |tok| must be the current token.
+  static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
+                                        LanguageTag& tag, Token& tok) {
+    return internalParseBaseName(cx, ts, tag, tok, BaseNameParsing::Normal);
+  }
+
+  // Parse the `tlang` production within a parsed 't' transform extension.
+  // The precise requirements for "previously parsed" are:
+  //
+  //   * the input begins from current token |tok| with a valid `tlang`
+  //   * the `tlang` is wholly lowercase (*not* canonical case)
+  //   * variant subtags in the `tlang` may contain duplicates and be
+  //     unordered
+  //
+  // Return an error on internal failure. Otherwise, return a success value. If
+  // there was no `tlang`, then |tag.language().missing()|. But if there was a
+  // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
+  // parse input: fully lowercase, variants in alphabetical order without
+  // duplicates.
+  static JS::Result<JS::Ok> parseTlangInTransformExtension(
+      JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
+    MOZ_ASSERT(ts.isLanguage(tok));
+    // The boolean result carries no information for callers of this function
+    // (it is asserted to be true), so it is replaced with a plain |Ok|.
+    return internalParseBaseName(cx, ts, tag, tok,
+                                 BaseNameParsing::WithinTransformExtension)
+        .map([](bool wasParsed) {
+          MOZ_ASSERT(wasParsed);
+          return JS::Ok();
+        });
+  }
+
+  friend class LanguageTag;
+
+  // A begin offset plus a length. |begin(ptr)| resolves the offset against
+  // the base pointer |ptr|.
+  class Range final {
+    size_t begin_;
+    size_t length_;
+
+   public:
+    Range(size_t begin, size_t length) : begin_(begin), length_(length) {}
+
+    template <typename T>
+    T* begin(T* ptr) const {
+      return ptr + begin_;
+    }
+
+    size_t length() const { return length_; }
+  };
+
+  using TFieldVector = js::Vector<Range, 8>;
+  using AttributesVector = js::Vector<Range, 8>;
+  using KeywordsVector = js::Vector<Range, 8>;
+
+  // Parse |extension|, which must be a validated, fully lowercase
+  // `transformed_extensions` subtag. |tag| and |fields| are filled from the
+  // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
+  // with |extension|.
+  static JS::Result<bool> parseTransformExtension(
+      JSContext* cx, mozilla::Range<const char> extension, LanguageTag& tag,
+      TFieldVector& fields);
+
+  // Parse |extension|, which must be a validated, fully lowercase
+  // `unicode_locale_extensions` subtag. |attributes| and |keywords| are
+  // filled from the `attribute` and `keyword` components.
+  static JS::Result<bool> parseUnicodeExtension(
+      JSContext* cx, mozilla::Range<const char> extension,
+      AttributesVector& attributes, KeywordsVector& keywords);
+
+ public:
+  // Parse the input string as a language tag. Reports an error to the context
+  // if the input can't be parsed completely.
+  static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
+
+  // Parse the input string as a language tag. Returns Ok(true) if the input
+  // could be completely parsed, Ok(false) if the input couldn't be parsed,
+  // or Err() in case of internal error.
+  static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
+                                   LanguageTag& tag);
+
+  // Parse the input string as the base-name parts (language, script, region,
+  // variants) of a language tag. Ignores any trailing characters.
+  static bool parseBaseName(JSContext* cx, mozilla::Range<const char> locale,
+                            LanguageTag& tag);
+
+  // Return true iff |extension| can be parsed as a Unicode extension subtag.
+  static bool canParseUnicodeExtension(mozilla::Range<const char> extension);
+
+  // Return true iff |unicodeType| can be parsed as a Unicode extension type.
+  static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
+};
+
+MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
+
+/**
+ * Parse a string as a standalone |language| tag. If |str| is a standalone
+ * language tag, store it in case-normalized form in |result| and return true.
+ * Otherwise return false.
+ */
+MOZ_MUST_USE bool ParseStandaloneLanguagTag(JS::Handle<JSLinearString*> str,
+ LanguageSubtag& result);
+
+/**
+ * Parse a string as a standalone |script| tag. If |str| is a standalone script
+ * tag, store it in case-normalized form in |result| and return true. Otherwise
+ * return false.
+ */
+MOZ_MUST_USE bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str,
+ ScriptSubtag& result);
+
+/**
+ * Parse a string as a standalone |region| tag. If |str| is a standalone region
+ * tag, store it in case-normalized form in |result| and return true. Otherwise
+ * return false.
+ */
+MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str,
+ RegionSubtag& result);
+
+/**
+ * Parse a string as an ISO-639 language code. Return |nullptr| in the result if
+ * the input could not be parsed or the canonical form of the resulting language
+ * tag contains more than a single language subtag.
+ */
+JS::Result<JSString*> ParseStandaloneISO639LanguageTag(
+ JSContext* cx, JS::Handle<JSLinearString*> str);
+
+} // namespace intl
+
+} // namespace js
+
+#endif /* builtin_intl_LanguageTag_h */
diff --git a/js/src/builtin/intl/LanguageTagGenerated.cpp b/js/src/builtin/intl/LanguageTagGenerated.cpp
new file mode 100644
index 0000000000..8952286976
--- /dev/null
+++ b/js/src/builtin/intl/LanguageTagGenerated.cpp
@@ -0,0 +1,790 @@
+// Generated by make_intl_data.py. DO NOT EDIT.
+// Version: CLDR-35.1
+// URL: https://unicode.org/Public/cldr/35.1/core.zip
+
+#include "mozilla/Assertions.h"
+#include "mozilla/Range.h"
+#include "mozilla/TextUtils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+
+#include "jscntxt.h"
+#include "jsstr.h"
+
+#include "builtin/intl/LanguageTag.h"
+
+using namespace js::intl::LanguageTagLimits;
+using ConstCharRange = mozilla::Range<const char>;
+
+// Return true iff |subtag| is listed in |subtags|. The lookup is a binary
+// search over the first |TagLength - 1| characters of each entry, so
+// |subtags| has to be sorted accordingly.
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline bool HasReplacement(
+    const char (&subtags)[Length][TagLength],
+    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+  MOZ_ASSERT(subtag.length() == TagLength - 1,
+             "subtag must have the same length as the list of subtags");
+
+  const char* needle = subtag.range().begin().get();
+  auto lessThan = [](const char* lhs, const char* rhs) {
+    return memcmp(lhs, rhs, TagLength - 1) < 0;
+  };
+
+  auto last = std::end(subtags);
+  auto pos = std::lower_bound(std::begin(subtags), last, needle, lessThan);
+
+  // |pos| is the first entry not ordered before |needle|; it is a match when
+  // |needle| isn't ordered before it either.
+  return pos != last && !lessThan(needle, *pos);
+}
+
+// Binary-search |subtag| in |subtags| and return the entry of |aliases| at
+// the same index, or nullptr when |subtag| isn't listed. |subtags| has to be
+// sorted by the first |TagLength - 1| characters of each entry.
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline const char* SearchReplacement(
+    const char (&subtags)[Length][TagLength],
+    const char* (&aliases)[Length],
+    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+  MOZ_ASSERT(subtag.length() == TagLength - 1,
+             "subtag must have the same length as the list of subtags");
+
+  const char* needle = subtag.range().begin().get();
+  auto lessThan = [](const char* lhs, const char* rhs) {
+    return memcmp(lhs, rhs, TagLength - 1) < 0;
+  };
+
+  auto pos = std::lower_bound(std::begin(subtags), std::end(subtags), needle,
+                              lessThan);
+  if (pos == std::end(subtags) || memcmp(*pos, needle, TagLength - 1) != 0) {
+    return nullptr;
+  }
+  return aliases[std::distance(std::begin(subtags), pos)];
+}
+
+// Map a language subtag to its preferred value. When a mapping exists,
+// |language| is overwritten with the preferred value and true is returned;
+// otherwise |language| is left untouched and false is returned.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) {
+  MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
+
+  // Preferred value for |language|; stays null when no mapping applies.
+  const char* preferred = nullptr;
+
+  // In both branches |languages| is binary-searched and |aliases| holds the
+  // preferred value at the same index.
+  if (language.length() == 2) {
+    static const char languages[9][3] = {
+      "bh", "in", "iw", "ji", "jw", "mo", "no", "tl", "tw",
+    };
+    static const char* aliases[9] = {
+      "bho", "id", "he", "yi", "jv", "ro", "nb", "fil", "ak",
+    };
+
+    preferred = SearchReplacement(languages, aliases, language);
+  } else if (language.length() == 3) {
+    static const char languages[340][4] = {
+      "aam", "aar", "abk", "adp", "afr", "aju", "aka", "alb", "als", "amh",
+      "ara", "arb", "arg", "arm", "asm", "aue", "ava", "ave", "aym", "ayr",
+      "ayx", "aze", "azj", "bak", "bam", "baq", "bcc", "bcl", "bel", "ben",
+      "bgm", "bih", "bis", "bjd", "bod", "bos", "bre", "bul", "bur", "bxk",
+      "bxr", "cat", "ccq", "ces", "cha", "che", "chi", "chu", "chv", "cjr",
+      "cka", "cld", "cmk", "cmn", "cor", "cos", "coy", "cqu", "cre", "cwd",
+      "cym", "cze", "dan", "deu", "dgo", "dhd", "dik", "diq", "div", "drh",
+      "dut", "dzo", "ekk", "ell", "emk", "eng", "epo", "esk", "est", "eus",
+      "ewe", "fao", "fas", "fat", "fij", "fin", "fra", "fre", "fry", "fuc",
+      "ful", "gav", "gaz", "gbo", "geo", "ger", "gfx", "ggn", "gla", "gle",
+      "glg", "glv", "gno", "gre", "grn", "gti", "gug", "guj", "guv", "gya",
+      "hat", "hau", "hdn", "hea", "heb", "her", "him", "hin", "hmo", "hrr",
+      "hrv", "hun", "hye", "ibi", "ibo", "ice", "ido", "iii", "ike", "iku",
+      "ile", "ilw", "ina", "ind", "ipk", "isl", "ita", "jav", "jeg", "jpn",
+      "kal", "kan", "kas", "kat", "kau", "kaz", "kgc", "kgh", "khk", "khm",
+      "kik", "kin", "kir", "kmr", "knc", "kng", "knn", "koj", "kom", "kon",
+      "kor", "kpv", "krm", "ktr", "kua", "kur", "kvs", "kwq", "kxe", "kzj",
+      "kzt", "lao", "lat", "lav", "lbk", "lii", "lim", "lin", "lit", "lmm",
+      "ltz", "lub", "lug", "lvs", "mac", "mah", "mal", "mao", "mar", "may",
+      "meg", "mhr", "mkd", "mlg", "mlt", "mnk", "mol", "mon", "mri", "msa",
+      "mst", "mup", "mwj", "mya", "myt", "nad", "nau", "nav", "nbl", "ncp",
+      "nde", "ndo", "nep", "nld", "nno", "nnx", "nob", "nor", "npi", "nts",
+      "nya", "oci", "ojg", "oji", "ori", "orm", "ory", "oss", "oun", "pan",
+      "pbu", "pcr", "per", "pes", "pli", "plt", "pmc", "pmu", "pnb", "pol",
+      "por", "ppa", "ppr", "pry", "pus", "puz", "que", "quz", "rmy", "roh",
+      "ron", "rum", "run", "rus", "sag", "san", "sca", "scc", "scr", "sin",
+      "skk", "slk", "slo", "slv", "sme", "smo", "sna", "snd", "som", "sot",
+      "spa", "spy", "sqi", "src", "srd", "srp", "ssw", "sun", "swa", "swe",
+      "swh", "tah", "tam", "tat", "tdu", "tel", "tgk", "tgl", "tha", "thc",
+      "thx", "tib", "tie", "tir", "tkk", "tlw", "tmp", "tne", "ton", "tsf",
+      "tsn", "tso", "ttq", "tuk", "tur", "twi", "uig", "ukr", "umu", "uok",
+      "urd", "uzb", "uzn", "ven", "vie", "vol", "wel", "wln", "wol", "xba",
+      "xho", "xia", "xkh", "xpe", "xsj", "xsl", "ybd", "ydd", "yid", "yma",
+      "ymt", "yor", "yos", "yuu", "zai", "zha", "zho", "zsm", "zul", "zyb",
+    };
+    static const char* aliases[340] = {
+      "aas", "aa", "ab", "dz", "af", "jrb", "ak", "sq", "sq", "am",
+      "ar", "ar", "an", "hy", "as", "ktz", "av", "ae", "ay", "ay",
+      "nun", "az", "az", "ba", "bm", "eu", "bal", "bik", "be", "bn",
+      "bcg", "bho", "bi", "drl", "bo", "bs", "br", "bg", "my", "luy",
+      "bua", "ca", "rki", "cs", "ch", "ce", "zh", "cu", "cv", "mom",
+      "cmr", "syr", "xch", "zh", "kw", "co", "pij", "quh", "cr", "cr",
+      "cy", "cs", "da", "de", "doi", "mwr", "din", "zza", "dv", "mn",
+      "nl", "dz", "et", "el", "man", "en", "eo", "ik", "et", "eu",
+      "ee", "fo", "fa", "ak", "fj", "fi", "fr", "fr", "fy", "ff",
+      "ff", "dev", "om", "grb", "ka", "de", "vaj", "gvr", "gd", "ga",
+      "gl", "gv", "gon", "el", "gn", "nyc", "gn", "gu", "duz", "gba",
+      "ht", "ha", "hai", "hmn", "he", "hz", "srx", "hi", "ho", "jal",
+      "hr", "hu", "hy", "opa", "ig", "is", "io", "ii", "iu", "iu",
+      "ie", "gal", "ia", "id", "ik", "is", "it", "jv", "oyb", "ja",
+      "kl", "kn", "ks", "ka", "kr", "kk", "tdf", "kml", "mn", "km",
+      "ki", "rw", "ky", "ku", "kr", "kg", "kok", "kwv", "kv", "kg",
+      "ko", "kv", "bmf", "dtp", "kj", "ku", "gdj", "yam", "tvd", "dtp",
+      "dtp", "lo", "la", "lv", "bnc", "raq", "li", "ln", "lt", "rmx",
+      "lb", "lu", "lg", "lv", "mk", "mh", "ml", "mi", "mr", "ms",
+      "cir", "chm", "mk", "mg", "mt", "man", "ro", "mn", "mi", "ms",
+      "mry", "raj", "vaj", "my", "mry", "xny", "na", "nv", "nr", "kdz",
+      "nd", "ng", "ne", "nl", "nn", "ngv", "nb", "nb", "ne", "pij",
+      "ny", "oc", "oj", "oj", "or", "om", "or", "os", "vaj", "pa",
+      "ps", "adx", "fa", "fa", "pi", "mg", "huw", "phr", "lah", "pl",
+      "pt", "bfy", "lcq", "prt", "ps", "pub", "qu", "qu", "rom", "rm",
+      "ro", "ro", "rn", "ru", "sg", "sa", "hle", "sr", "hr", "si",
+      "oyb", "sk", "sk", "sl", "se", "sm", "sn", "sd", "so", "st",
+      "es", "kln", "sq", "sc", "sc", "sr", "ss", "su", "sw", "sv",
+      "sw", "ty", "ta", "tt", "dtp", "te", "tg", "fil", "th", "tpo",
+      "oyb", "bo", "ras", "ti", "twm", "weo", "tyj", "kak", "to", "taj",
+      "tn", "ts", "tmh", "tk", "tr", "ak", "ug", "uk", "del", "ema",
+      "ur", "uz", "uz", "ve", "vi", "vo", "cy", "wa", "wo", "cax",
+      "xh", "acn", "waw", "kpe", "suj", "den", "rki", "yi", "yi", "lrr",
+      "mtm", "yo", "zom", "yug", "zap", "za", "zh", "ms", "zu", "za",
+    };
+
+    preferred = SearchReplacement(languages, aliases, language);
+  }
+
+  if (!preferred) {
+    return false;
+  }
+  language.set(ConstCharRange(preferred, strlen(preferred)));
+  return true;
+}
+
+// Return true iff |language| is one of the language subtags with a complex
+// mapping.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& language) {
+  MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
+
+  switch (language.length()) {
+    case 2:
+      return language.equalTo("sh");
+
+    case 3: {
+      static const char languages[6][4] = {
+        "cnr", "drw", "hbs", "prs", "swc", "tnf",
+      };
+
+      return HasReplacement(languages, language);
+    }
+
+    default:
+      return false;
+  }
+}
+
+// Map a region subtag to its preferred value. When a mapping exists,
+// |region| is overwritten with the preferred value and true is returned;
+// otherwise |region| is left untouched and false is returned.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) {
+  MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));
+
+  // Preferred value for |region|; null when no mapping applies.
+  const char* preferred;
+
+  // In both branches |regions| is binary-searched and |aliases| holds the
+  // preferred value at the same index.
+  if (region.length() == 2) {
+    static const char regions[23][3] = {
+      "BU", "CS", "CT", "DD", "DY", "FQ", "FX", "HV", "JT", "MI",
+      "NH", "NQ", "PU", "PZ", "QU", "RH", "TP", "UK", "VD", "WK",
+      "YD", "YU", "ZR",
+    };
+    static const char* aliases[23] = {
+      "MM", "RS", "KI", "DE", "BJ", "AQ", "FR", "BF", "UM", "UM",
+      "VU", "AQ", "UM", "PA", "EU", "ZW", "TL", "GB", "VN", "UM",
+      "YE", "RS", "CD",
+    };
+
+    preferred = SearchReplacement(regions, aliases, region);
+  } else {
+    static const char regions[300][4] = {
+      "004", "008", "010", "012", "016", "020", "024", "028", "031", "032",
+      "036", "040", "044", "048", "050", "051", "052", "056", "060", "062",
+      "064", "068", "070", "072", "074", "076", "084", "086", "090", "092",
+      "096", "100", "104", "108", "112", "116", "120", "124", "132", "136",
+      "140", "144", "148", "152", "156", "158", "162", "166", "170", "174",
+      "175", "178", "180", "184", "188", "191", "192", "196", "203", "204",
+      "208", "212", "214", "218", "222", "226", "230", "231", "232", "233",
+      "234", "238", "239", "242", "246", "248", "249", "250", "254", "258",
+      "260", "262", "266", "268", "270", "275", "276", "278", "280", "288",
+      "292", "296", "300", "304", "308", "312", "316", "320", "324", "328",
+      "332", "334", "336", "340", "344", "348", "352", "356", "360", "364",
+      "368", "372", "376", "380", "384", "388", "392", "398", "400", "404",
+      "408", "410", "414", "417", "418", "422", "426", "428", "430", "434",
+      "438", "440", "442", "446", "450", "454", "458", "462", "466", "470",
+      "474", "478", "480", "484", "492", "496", "498", "499", "500", "504",
+      "508", "512", "516", "520", "524", "528", "531", "533", "534", "535",
+      "540", "548", "554", "558", "562", "566", "570", "574", "578", "580",
+      "581", "583", "584", "585", "586", "591", "598", "600", "604", "608",
+      "612", "616", "620", "624", "626", "630", "634", "638", "642", "643",
+      "646", "652", "654", "659", "660", "662", "663", "666", "670", "674",
+      "678", "682", "686", "688", "690", "694", "702", "703", "704", "705",
+      "706", "710", "716", "720", "724", "728", "729", "732", "736", "740",
+      "744", "748", "752", "756", "760", "762", "764", "768", "772", "776",
+      "780", "784", "788", "792", "795", "796", "798", "800", "804", "807",
+      "818", "826", "830", "831", "832", "833", "834", "840", "850", "854",
+      "858", "860", "862", "876", "882", "886", "887", "891", "894", "958",
+      "959", "960", "962", "963", "964", "965", "966", "967", "968", "969",
+      "970", "971", "972", "973", "974", "975", "976", "977", "978", "979",
+      "980", "981", "982", "983", "984", "985", "986", "987", "988", "989",
+      "990", "991", "992", "993", "994", "995", "996", "997", "998", "999",
+    };
+    static const char* aliases[300] = {
+      "AF", "AL", "AQ", "DZ", "AS", "AD", "AO", "AG", "AZ", "AR",
+      "AU", "AT", "BS", "BH", "BD", "AM", "BB", "BE", "BM", "034",
+      "BT", "BO", "BA", "BW", "BV", "BR", "BZ", "IO", "SB", "VG",
+      "BN", "BG", "MM", "BI", "BY", "KH", "CM", "CA", "CV", "KY",
+      "CF", "LK", "TD", "CL", "CN", "TW", "CX", "CC", "CO", "KM",
+      "YT", "CG", "CD", "CK", "CR", "HR", "CU", "CY", "CZ", "BJ",
+      "DK", "DM", "DO", "EC", "SV", "GQ", "ET", "ET", "ER", "EE",
+      "FO", "FK", "GS", "FJ", "FI", "AX", "FR", "FR", "GF", "PF",
+      "TF", "DJ", "GA", "GE", "GM", "PS", "DE", "DE", "DE", "GH",
+      "GI", "KI", "GR", "GL", "GD", "GP", "GU", "GT", "GN", "GY",
+      "HT", "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR",
+      "IQ", "IE", "IL", "IT", "CI", "JM", "JP", "KZ", "JO", "KE",
+      "KP", "KR", "KW", "KG", "LA", "LB", "LS", "LV", "LR", "LY",
+      "LI", "LT", "LU", "MO", "MG", "MW", "MY", "MV", "ML", "MT",
+      "MQ", "MR", "MU", "MX", "MC", "MN", "MD", "ME", "MS", "MA",
+      "MZ", "OM", "NA", "NR", "NP", "NL", "CW", "AW", "SX", "BQ",
+      "NC", "VU", "NZ", "NI", "NE", "NG", "NU", "NF", "NO", "MP",
+      "UM", "FM", "MH", "PW", "PK", "PA", "PG", "PY", "PE", "PH",
+      "PN", "PL", "PT", "GW", "TL", "PR", "QA", "RE", "RO", "RU",
+      "RW", "BL", "SH", "KN", "AI", "LC", "MF", "PM", "VC", "SM",
+      "ST", "SA", "SN", "RS", "SC", "SL", "SG", "SK", "VN", "SI",
+      "SO", "ZA", "ZW", "YE", "ES", "SS", "SD", "EH", "SD", "SR",
+      "SJ", "SZ", "SE", "CH", "SY", "TJ", "TH", "TG", "TK", "TO",
+      "TT", "AE", "TN", "TR", "TM", "TC", "TV", "UG", "UA", "MK",
+      "EG", "GB", "JE", "GG", "JE", "IM", "TZ", "US", "VI", "BF",
+      "UY", "UZ", "VE", "WF", "WS", "YE", "YE", "RS", "ZM", "AA",
+      "QM", "QN", "QP", "QQ", "QR", "QS", "QT", "EU", "QV", "QW",
+      "QX", "QY", "QZ", "XA", "XB", "XC", "XD", "XE", "XF", "XG",
+      "XH", "XI", "XJ", "XK", "XL", "XM", "XN", "XO", "XP", "XQ",
+      "XR", "XS", "XT", "XU", "XV", "XW", "XX", "XY", "XZ", "ZZ",
+    };
+
+    preferred = SearchReplacement(regions, aliases, region);
+  }
+
+  if (!preferred) {
+    return false;
+  }
+  region.set(ConstCharRange(preferred, strlen(preferred)));
+  return true;
+}
+
+// Return true iff |region| is one of the region subtags with a complex
+// mapping.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) {
+  MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));
+
+  if (region.length() != 2) {
+    static const char regions[8][4] = {
+      "172", "200", "530", "532", "536", "582", "810", "890",
+    };
+
+    return HasReplacement(regions, region);
+  }
+
+  return region.equalTo("AN") ||
+         region.equalTo("NT") ||
+         region.equalTo("PC") ||
+         region.equalTo("SU");
+}
+
+// Apply the complex language mappings: besides replacing the language
+// subtag, a default region or script is filled in when that subtag is empty.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+void js::intl::LanguageTag::performComplexLanguageMappings() {
+  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
+
+  if (language().equalTo("cnr")) {
+    setLanguage("sr");
+    if (region().length() == 0) {
+      setRegion("ME");
+    }
+    return;
+  }
+
+  if (language().equalTo("drw") ||
+      language().equalTo("prs") ||
+      language().equalTo("tnf")) {
+    setLanguage("fa");
+    if (region().length() == 0) {
+      setRegion("AF");
+    }
+    return;
+  }
+
+  if (language().equalTo("hbs") ||
+      language().equalTo("sh")) {
+    setLanguage("sr");
+    if (script().length() == 0) {
+      setScript("Latn");
+    }
+    return;
+  }
+
+  if (language().equalTo("swc")) {
+    setLanguage("sw");
+    if (region().length() == 0) {
+      setRegion("CD");
+    }
+  }
+}
+
+// Apply the complex region mappings: the replacement region depends on the
+// language (and sometimes script) subtag. Within each region group the
+// tests run in order and the first match wins.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+void js::intl::LanguageTag::performComplexRegionMappings() {
+  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
+  MOZ_ASSERT(IsStructurallyValidRegionTag(region().range()));
+
+  if (region().equalTo("172")) {
+    if (language().equalTo("hy") ||
+        (language().equalTo("und") && script().equalTo("Armn"))) {
+      setRegion("AM");
+      return;
+    }
+    if (language().equalTo("az") ||
+        language().equalTo("tkr") ||
+        language().equalTo("tly") ||
+        language().equalTo("ttt")) {
+      setRegion("AZ");
+      return;
+    }
+    if (language().equalTo("be")) {
+      setRegion("BY");
+      return;
+    }
+    if (language().equalTo("ab") ||
+        language().equalTo("ka") ||
+        language().equalTo("os") ||
+        (language().equalTo("und") && script().equalTo("Geor")) ||
+        language().equalTo("xmf")) {
+      setRegion("GE");
+      return;
+    }
+    if (language().equalTo("ky")) {
+      setRegion("KG");
+      return;
+    }
+    if (language().equalTo("kk") ||
+        (language().equalTo("ug") && script().equalTo("Cyrl"))) {
+      setRegion("KZ");
+      return;
+    }
+    if (language().equalTo("gag")) {
+      setRegion("MD");
+      return;
+    }
+    if (language().equalTo("tg")) {
+      setRegion("TJ");
+      return;
+    }
+    if (language().equalTo("tk")) {
+      setRegion("TM");
+      return;
+    }
+    if (language().equalTo("crh") ||
+        language().equalTo("got") ||
+        language().equalTo("ji") ||
+        language().equalTo("rue") ||
+        language().equalTo("uk") ||
+        (language().equalTo("und") && script().equalTo("Goth"))) {
+      setRegion("UA");
+      return;
+    }
+    if (language().equalTo("kaa") ||
+        language().equalTo("sog") ||
+        (language().equalTo("und") && script().equalTo("Sogd")) ||
+        (language().equalTo("und") && script().equalTo("Sogo")) ||
+        language().equalTo("uz")) {
+      setRegion("UZ");
+      return;
+    }
+    setRegion("RU");
+    return;
+  }
+
+  if (region().equalTo("200")) {
+    if (language().equalTo("sk")) {
+      setRegion("SK");
+      return;
+    }
+    setRegion("CZ");
+    return;
+  }
+
+  if (region().equalTo("530") ||
+      region().equalTo("532") ||
+      region().equalTo("AN")) {
+    if (language().equalTo("vic")) {
+      setRegion("SX");
+      return;
+    }
+    setRegion("CW");
+    return;
+  }
+
+  if (region().equalTo("536") ||
+      region().equalTo("NT")) {
+    if (language().equalTo("akk") ||
+        language().equalTo("ckb") ||
+        (language().equalTo("ku") && script().equalTo("Arab")) ||
+        language().equalTo("mis") ||
+        language().equalTo("syr") ||
+        (language().equalTo("und") && script().equalTo("Syrc")) ||
+        (language().equalTo("und") && script().equalTo("Xsux")) ||
+        (language().equalTo("und") && script().equalTo("Hatr"))) {
+      setRegion("IQ");
+      return;
+    }
+    setRegion("SA");
+    return;
+  }
+
+  if (region().equalTo("582") ||
+      region().equalTo("PC")) {
+    if (language().equalTo("mh")) {
+      setRegion("MH");
+      return;
+    }
+    if (language().equalTo("pau")) {
+      setRegion("PW");
+      return;
+    }
+    setRegion("FM");
+    return;
+  }
+
+  if (region().equalTo("810") ||
+      region().equalTo("SU")) {
+    if (language().equalTo("hy") ||
+        (language().equalTo("und") && script().equalTo("Armn"))) {
+      setRegion("AM");
+      return;
+    }
+    if (language().equalTo("az") ||
+        language().equalTo("tkr") ||
+        language().equalTo("tly") ||
+        language().equalTo("ttt")) {
+      setRegion("AZ");
+      return;
+    }
+    if (language().equalTo("be")) {
+      setRegion("BY");
+      return;
+    }
+    if (language().equalTo("et") ||
+        language().equalTo("vro")) {
+      setRegion("EE");
+      return;
+    }
+    if (language().equalTo("ab") ||
+        language().equalTo("ka") ||
+        language().equalTo("os") ||
+        (language().equalTo("und") && script().equalTo("Geor")) ||
+        language().equalTo("xmf")) {
+      setRegion("GE");
+      return;
+    }
+    if (language().equalTo("ky")) {
+      setRegion("KG");
+      return;
+    }
+    if (language().equalTo("kk") ||
+        (language().equalTo("ug") && script().equalTo("Cyrl"))) {
+      setRegion("KZ");
+      return;
+    }
+    if (language().equalTo("lt") ||
+        language().equalTo("sgs")) {
+      setRegion("LT");
+      return;
+    }
+    if (language().equalTo("ltg") ||
+        language().equalTo("lv")) {
+      setRegion("LV");
+      return;
+    }
+    if (language().equalTo("gag")) {
+      setRegion("MD");
+      return;
+    }
+    if (language().equalTo("tg")) {
+      setRegion("TJ");
+      return;
+    }
+    if (language().equalTo("tk")) {
+      setRegion("TM");
+      return;
+    }
+    if (language().equalTo("crh") ||
+        language().equalTo("got") ||
+        language().equalTo("ji") ||
+        language().equalTo("rue") ||
+        language().equalTo("uk") ||
+        (language().equalTo("und") && script().equalTo("Goth"))) {
+      setRegion("UA");
+      return;
+    }
+    if (language().equalTo("kaa") ||
+        language().equalTo("sog") ||
+        (language().equalTo("und") && script().equalTo("Sogd")) ||
+        (language().equalTo("und") && script().equalTo("Sogo")) ||
+        language().equalTo("uz")) {
+      setRegion("UZ");
+      return;
+    }
+    setRegion("RU");
+    return;
+  }
+
+  if (region().equalTo("890")) {
+    if (language().equalTo("bs")) {
+      setRegion("BA");
+      return;
+    }
+    if (language().equalTo("hr")) {
+      setRegion("HR");
+      return;
+    }
+    if (language().equalTo("mk")) {
+      setRegion("MK");
+      return;
+    }
+    if (language().equalTo("sl")) {
+      setRegion("SI");
+      return;
+    }
+    setRegion("RS");
+  }
+}
+
+// Canonicalize grandfathered locale identifiers. Returns false only when
+// duplicating the private-use string fails; every other path returns true.
+// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip
+bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
+  // Only regular grandfathered tags are rewritten to their non-grandfathered
+  // form. All other tags are left alone.
+  //
+  //   regular = "art-lojban"
+  //           / "cel-gaulish"
+  //           / "no-bok"
+  //           / "no-nyn"
+  //           / "zh-guoyu"
+  //           / "zh-hakka"
+  //           / "zh-min"
+  //           / "zh-min-nan"
+  //           / "zh-xiang"
+  //
+  // Most tags can be rejected quickly by testing each |unicode_locale_id|
+  // subcomponent for a property that no regular grandfathered (RG) tag has:
+  //
+  //   * Real-world |unicode_language_subtag|s are all two or three letters,
+  //     so don't waste time running a useless |language.length > 3| fast-path.
+  //   * No RG tag has a "script"-looking component.
+  //   * No RG tag has a "region"-looking component.
+  //   * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
+  //     zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
+  //     no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
+  //     that |unicode_locale_id| doesn't support.)
+  //   * No RG tag contains |extensions| or |pu_extensions|.
+  const bool mayBeGrandfathered = script().length() == 0 &&
+                                  region().length() == 0 &&
+                                  variants().length() == 1 &&
+                                  extensions().length() == 0 &&
+                                  !privateuse();
+  if (!mayBeGrandfathered) {
+    return true;
+  }
+
+  // Compare the one and only variant subtag against |expected|.
+  auto hasVariant = [this](const char* expected) {
+    return strcmp(variants()[0].get(), expected) == 0;
+  };
+
+  // art-lojban -> jbo
+  if (language().equalTo("art") && hasVariant("lojban")) {
+    setLanguage("jbo");
+    clearVariants();
+    return true;
+  }
+
+  // cel-gaulish -> xtg-x-cel-gaulish
+  if (language().equalTo("cel") && hasVariant("gaulish")) {
+    setLanguage("xtg");
+    clearVariants();
+
+    auto privateUseChars = DuplicateString(cx, "x-cel-gaulish");
+    if (!privateUseChars) {
+      return false;
+    }
+    setPrivateuse(std::move(privateUseChars));
+    return true;
+  }
+
+  if (language().equalTo("zh")) {
+    // zh-guoyu -> zh
+    if (hasVariant("guoyu")) {
+      setLanguage("zh");
+      clearVariants();
+      return true;
+    }
+
+    // zh-hakka -> hak
+    if (hasVariant("hakka")) {
+      setLanguage("hak");
+      clearVariants();
+      return true;
+    }
+
+    // zh-xiang -> hsn
+    if (hasVariant("xiang")) {
+      setLanguage("hsn");
+      clearVariants();
+      return true;
+    }
+  }
+
+  return true;
+}
+
+// Return true when the Unicode extension |key| equals the two-character
+// string literal |str|.
+template <size_t Length>
+static inline bool IsUnicodeKey(const ConstCharRange& key,
+                                const char (&str)[Length]) {
+  static_assert(Length == UnicodeKeyLength + 1,
+                "Unicode extension key is two characters long");
+
+  // |Length| counts the literal's terminating null, which isn't compared.
+  constexpr size_t KeyLength = Length - 1;
+  const char* keyChars = key.begin().get();
+  return memcmp(keyChars, str, KeyLength) == 0;
+}
+
+// Return true when the Unicode extension |type| equals the string literal
+// |str|, which must be longer than a Unicode extension key.
+template <size_t Length>
+static inline bool IsUnicodeType(const ConstCharRange& type,
+                                 const char (&str)[Length]) {
+  static_assert(Length > UnicodeKeyLength + 1,
+                "Unicode extension type contains more than two characters");
+
+  // |Length| counts the literal's terminating null, which isn't compared.
+  constexpr size_t TypeLength = Length - 1;
+  if (type.length() != TypeLength) {
+    return false;
+  }
+  return memcmp(type.begin().get(), str, TypeLength) == 0;
+}
+
+// Three-way comparison of the null-terminated string |a| with the character
+// range |b|, ordering by unsigned character value.
+//
+// Returns a negative number when |a| sorts before |b|, zero when both are
+// equal, and a positive number when |a| sorts after |b|. |b| must not contain
+// a null character.
+static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) {
+#ifdef DEBUG
+  auto isNull = [](char c) {
+    return c == '\0';
+  };
+#endif
+
+  MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull),
+             "unexpected null-character in string");
+
+  using UnsignedChar = unsigned char;
+  for (size_t i = 0; i < b.length(); i++) {
+    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
+    // we've reached the end of |a|, the below if-statement will always be true.
+    // That ensures we don't read past the end of |a|.
+    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
+      return r;
+    }
+  }
+
+  // Return zero if both strings are equal or a positive number if |b| is a
+  // proper prefix of |a|: the longer string |a| then sorts after |b|, which is
+  // the ordering a binary search over a sorted table relies on.
+  return int32_t(UnsignedChar(a[b.length()]));
+}
+
+// Binary-search |type| in |types| and return the entry of |aliases| at the
+// same index, or nullptr when |type| isn't listed.
+template <size_t Length>
+static inline const char* SearchReplacement(const char* (&types)[Length],
+                                            const char* (&aliases)[Length],
+                                            const ConstCharRange& type) {
+  const auto sortsBefore = [](const char* candidate,
+                              const ConstCharRange& wanted) {
+    return CompareUnicodeType(candidate, wanted) < 0;
+  };
+
+  const char** match =
+      std::lower_bound(std::begin(types), std::end(types), type, sortsBefore);
+  if (match == std::end(types) || CompareUnicodeType(*match, type) != 0) {
+    return nullptr;
+  }
+  return aliases[match - std::begin(types)];
+}
+
+/**
+ * Mapping from deprecated BCP 47 Unicode extension types to their preferred
+ * values.
+ *
+ * |key| is the two-character extension key and |type| the (lowercase) type
+ * belonging to it. Returns the preferred type as a static string, or nullptr
+ * when |type| has no replacement listed for |key|.
+ *
+ * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ */
+const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
+ const ConstCharRange& key, const ConstCharRange& type) {
+#ifdef DEBUG
+ static auto isAsciiLowercaseAlphanumeric = [](char c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
+ };
+
+ static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
+ return isAsciiLowercaseAlphanumeric(c) || c == '-';
+ };
+#endif
+
+ // The key must be exactly key-sized and consist of lowercase alphanumerics.
+ MOZ_ASSERT(key.length() == UnicodeKeyLength);
+ MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(),
+ isAsciiLowercaseAlphanumeric));
+
+ // The type must be longer than a key; it may contain '-' separators.
+ MOZ_ASSERT(type.length() > UnicodeKeyLength);
+ MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(),
+ isAsciiLowercaseAlphanumericOrDash));
+
+ if (IsUnicodeKey(key, "ca")) {
+ if (IsUnicodeType(type, "ethiopic-amete-alem")) {
+ return "ethioaa";
+ }
+ if (IsUnicodeType(type, "islamicc")) {
+ return "islamic-civil";
+ }
+ }
+ else if (IsUnicodeKey(key, "kb") ||
+ IsUnicodeKey(key, "kc") ||
+ IsUnicodeKey(key, "kh") ||
+ IsUnicodeKey(key, "kk") ||
+ IsUnicodeKey(key, "kn")) {
+ if (IsUnicodeType(type, "yes")) {
+ return "true";
+ }
+ }
+ else if (IsUnicodeKey(key, "ks")) {
+ if (IsUnicodeType(type, "primary")) {
+ return "level1";
+ }
+ if (IsUnicodeType(type, "tertiary")) {
+ return "level3";
+ }
+ }
+ else if (IsUnicodeKey(key, "ms")) {
+ if (IsUnicodeType(type, "imperial")) {
+ return "uksystem";
+ }
+ }
+ else if (IsUnicodeKey(key, "rg") ||
+ IsUnicodeKey(key, "sd")) {
+ // |types| is binary-searched by SearchReplacement, so it has to stay
+ // sorted; |aliases| holds the replacement for the entry at the same index.
+ static const char* types[116] = {
+ "cn11", "cn12", "cn13", "cn14", "cn15", "cn21", "cn22", "cn23",
+ "cn31", "cn32", "cn33", "cn34", "cn35", "cn36", "cn37", "cn41",
+ "cn42", "cn43", "cn44", "cn45", "cn46", "cn50", "cn51", "cn52",
+ "cn53", "cn54", "cn61", "cn62", "cn63", "cn64", "cn65", "cz10a",
+ "cz10b", "cz10c", "cz10d", "cz10e", "cz10f", "cz611", "cz612", "cz613",
+ "cz614", "cz615", "cz621", "cz622", "cz623", "cz624", "cz626", "cz627",
+ "czjc", "czjm", "czka", "czkr", "czli", "czmo", "czol", "czpa",
+ "czpl", "czpr", "czst", "czus", "czvy", "czzl", "fra", "frb",
+ "frc", "frd", "fre", "frf", "frg", "frh", "fri", "frj",
+ "frk", "frl", "frm", "frn", "fro", "frp", "frq", "frr",
+ "frs", "frt", "fru", "frv", "laxn", "lud", "lug", "lul",
+ "mrnkc", "nzn", "nzs", "omba", "omsh", "plds", "plkp", "pllb",
+ "plld", "pllu", "plma", "plmz", "plop", "plpd", "plpk", "plpm",
+ "plsk", "plsl", "plwn", "plwp", "plzp", "tteto", "ttrcm", "ttwto",
+ "twkhq", "twtnq", "twtpq", "twtxq",
+ };
+ static const char* aliases[116] = {
+ "cnbj", "cntj", "cnhe", "cnsx", "cnmn", "cnln", "cnjl", "cnhl",
+ "cnsh", "cnjs", "cnzj", "cnah", "cnfj", "cnjx", "cnsd", "cnha",
+ "cnhb", "cnhn", "cngd", "cngx", "cnhi", "cncq", "cnsc", "cngz",
+ "cnyn", "cnxz", "cnsn", "cngs", "cnqh", "cnnx", "cnxj", "cz110",
+ "cz111", "cz112", "cz113", "cz114", "cz115", "cz663", "cz632", "cz633",
+ "cz634", "cz635", "cz641", "cz642", "cz643", "cz644", "cz646", "cz647",
+ "cz31", "cz64", "cz41", "cz52", "cz51", "cz80", "cz71", "cz53",
+ "cz32", "cz10", "cz20", "cz42", "cz63", "cz72", "frges", "frnaq",
+ "frara", "frbfc", "frbre", "frcvl", "frges", "frcor", "frbfc", "fridf",
+ "frocc", "frnaq", "frges", "frocc", "frhdf", "frnor", "frnor", "frpdl",
+ "frhdf", "frnaq", "frpac", "frara", "laxs", "lucl", "luec", "luca",
+ "mr13", "nzauk", "nzcan", "ombj", "omsj", "pl02", "pl04", "pl08",
+ "pl10", "pl06", "pl12", "pl14", "pl16", "pl20", "pl18", "pl22",
+ "pl26", "pl24", "pl28", "pl30", "pl32", "tttob", "ttmrc", "tttob",
+ "twkhh", "twtnn", "twnwt", "twtxg",
+ };
+ return SearchReplacement(types, aliases, type);
+ }
+ else if (IsUnicodeKey(key, "tz")) {
+ // Same layout as above: sorted |types| with index-aligned |aliases|.
+ static const char* types[28] = {
+ "aqams", "cnckg", "cnhrb", "cnkhg", "cuba", "egypt",
+ "eire", "est", "gmt0", "hongkong", "hst", "iceland",
+ "iran", "israel", "jamaica", "japan", "libya", "mst",
+ "navajo", "poland", "portugal", "prc", "roc", "rok",
+ "turkey", "uct", "usnavajo", "zulu",
+ };
+ static const char* aliases[28] = {
+ "nzakl", "cnsha", "cnsha", "cnurc", "cuhav", "egcai",
+ "iedub", "utcw05", "gmt", "hkhkg", "utcw10", "isrey",
+ "irthr", "jeruslm", "jmkin", "jptyo", "lytip", "utcw07",
+ "usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel",
+ "trist", "utc", "usden", "utc",
+ };
+ return SearchReplacement(types, aliases, type);
+ }
+ // No replacement is known for this key/type combination.
+ return nullptr;
+}
diff --git a/js/src/builtin/intl/Locale.cpp b/js/src/builtin/intl/Locale.cpp
new file mode 100644
index 0000000000..2bd93732c2
--- /dev/null
+++ b/js/src/builtin/intl/Locale.cpp
@@ -0,0 +1,1372 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Intl.Locale implementation. */
+
+#include "builtin/intl/Locale.h"
+
+#include "mozilla/ArrayUtils.h"
+#include "mozilla/Assertions.h"
+#include "mozilla/Casting.h"
+#include "mozilla/Maybe.h"
+#include "mozilla/Range.h"
+#include "mozilla/TextUtils.h"
+
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <string.h>
+#include <utility>
+
+#include "jsapi.h"
+#include "jsfriendapi.h"
+#include "jscntxt.h"
+#include "jsobjinlines.h"
+#include "jswrapper.h"
+
+#include "builtin/intl/CommonFunctions.h"
+#include "builtin/intl/LanguageTag.h"
+#include "gc/Rooting.h"
+#include "js/Conversions.h"
+#include "js/TypeDecls.h"
+#include "vm/GlobalObject.h"
+#include "vm/String.h"
+#include "vm/StringBuffer.h"
+
+#include "vm/NativeObject-inl.h"
+
+using namespace js;
+using namespace js::intl::LanguageTagLimits;
+
+using intl::LanguageTag;
+using intl::LanguageTagParser;
+
+// Class of Intl.Locale instances: uses |js_Object_str| as its name and
+// reserves LocaleObject::SLOT_COUNT slots per object.
+const Class LocaleObject::class_ = {
+ js_Object_str,
+ JSCLASS_HAS_RESERVED_SLOTS(LocaleObject::SLOT_COUNT),
+};
+
+// Return true when |v| is an object whose class is LocaleObject.
+static inline bool IsLocale(HandleValue v) {
+  if (!v.isObject()) {
+    return false;
+  }
+  return v.toObject().is<LocaleObject>();
+}
+
+// Compute how many characters the base-name subtags (language, script, region
+// and variants) of |tag| occupy, including the separators between them.
+static size_t BaseNameLength(const LanguageTag& tag) {
+  // Every subtag after the language is preceded by a single '-' separator.
+  constexpr size_t SeparatorLength = 1;
+
+  size_t total = tag.language().length();
+
+  if (size_t scriptLength = tag.script().length()) {
+    total += SeparatorLength + scriptLength;
+  }
+
+  if (size_t regionLength = tag.region().length()) {
+    total += SeparatorLength + regionLength;
+  }
+
+  for (const auto& variant : tag.variants()) {
+    total += SeparatorLength + strlen(variant.get());
+  }
+
+  return total;
+}
+
+// Position of a substring, expressed as start index plus length.
+struct IndexAndLength {
+  size_t index;
+  size_t length;
+
+  IndexAndLength(size_t index, size_t length)
+      : index(index), length(length) {}
+
+  // View the described substring of the character buffer starting at |ptr|.
+  template <typename T>
+  mozilla::Range<const T> rangeOf(const T* ptr) const {
+    const T* start = ptr + index;
+    return mozilla::Range<const T>(start, length);
+  }
+};
+
+// Locate the Unicode ("u") extension among the extension subtags of |tag|.
+// The returned index is relative to the first extension subtag; Nothing is
+// returned when the tag has no Unicode extension.
+static mozilla::Maybe<IndexAndLength> UnicodeExtensionPosition(
+    const LanguageTag& tag) {
+  size_t offset = 0;
+  for (const auto& extension : tag.extensions()) {
+    const size_t length = strlen(extension.get());
+    if (extension[0] != 'u') {
+      // Step over this extension and the separator that follows it.
+      offset += length + 1;
+      continue;
+    }
+    return mozilla::Some(IndexAndLength{offset, length});
+  }
+  return mozilla::Nothing();
+}
+
+// Create a new Locale object for |tag|. When |prototype| is null, the Locale
+// prototype of the current global is used instead. Returns nullptr on failure.
+static LocaleObject* CreateLocaleObject(JSContext* cx, HandleObject prototype,
+ const LanguageTag& tag) {
+ RootedObject proto(cx, prototype);
+ if (!proto) {
+ proto = GlobalObject::getOrCreateLocalePrototype(cx, cx->global());
+ if (!proto) {
+ return nullptr;
+ }
+ }
+
+ // Serialize the complete language tag into a string.
+ StringBuffer sb(cx);
+ if (!tag.appendTo(cx, sb)) {
+ return nullptr;
+ }
+
+ RootedString tagStr(cx, sb.finishString());
+ if (!tagStr) {
+ return nullptr;
+ }
+
+ size_t baseNameLength = BaseNameLength(tag);
+
+ // The base-name is the leading |baseNameLength| characters of the tag.
+ RootedString baseName(cx, NewDependentString(cx, tagStr, 0, baseNameLength));
+ if (!baseName) {
+ return nullptr;
+ }
+
+ // The Unicode extension, if present, is stored as a substring of the tag;
+ // otherwise the slot holds undefined.
+ RootedValue unicodeExtension(cx, UndefinedValue());
+ if (auto result = UnicodeExtensionPosition(tag)) {
+ // |result->index| is relative to the first extension subtag, so skip the
+ // base-name and the separator following it.
+ JSString* str = NewDependentString(
+ cx, tagStr, baseNameLength + 1 + result->index, result->length);
+ if (!str) {
+ return nullptr;
+ }
+
+ unicodeExtension.setString(str);
+ }
+
+ auto* locale = NewObjectWithGivenProto<LocaleObject>(cx, proto);
+ if (!locale) {
+ return nullptr;
+ }
+
+ locale->setFixedSlot(LocaleObject::LANGUAGE_TAG_SLOT, StringValue(tagStr));
+ locale->setFixedSlot(LocaleObject::BASENAME_SLOT, StringValue(baseName));
+ locale->setFixedSlot(LocaleObject::UNICODE_EXTENSION_SLOT, unicodeExtension);
+
+ return locale;
+}
+
+// Return true when |linear| is non-empty and parses as a Unicode extension
+// type.
+static inline bool IsValidUnicodeExtensionValue(JSLinearString* linear) {
+  if (linear->length() == 0) {
+    return false;
+  }
+  return LanguageTagParser::canParseUnicodeExtensionType(linear);
+}
+
+/**
+ * Iterate through (sep keyword) in a valid, lowercased Unicode extension.
+ *
+ * The iterator walks the character range [unicodeExtensionBegin,
+ * unicodeExtensionEnd) and stops at every '-' that is followed by a key-sized
+ * subtag, i.e. at the start of each keyword.
+ */
+template <typename CharT>
+class SepKeywordIterator {
+ // Current position; becomes nullptr once all keywords were consumed.
+ const CharT* iter_;
+ // One past the last character of the Unicode extension.
+ const CharT* const end_;
+
+ public:
+ SepKeywordIterator(const CharT* unicodeExtensionBegin,
+ const CharT* unicodeExtensionEnd)
+ : iter_(unicodeExtensionBegin), end_(unicodeExtensionEnd) {}
+
+ /**
+ * Return (sep keyword) in the Unicode locale extension from begin to end.
+ * The first call after all (sep keyword) are consumed returns |nullptr|; no
+ * further calls are allowed.
+ */
+ const CharT* next() {
+ MOZ_ASSERT(iter_ != nullptr,
+ "can't call next() once it's returned nullptr");
+
+ constexpr size_t SepKeyLength = 1 + UnicodeKeyLength; // "-co"/"-nu"/etc.
+
+ MOZ_ASSERT(iter_ + SepKeyLength <= end_,
+ "overall Unicode locale extension or non-leading subtags must "
+ "be at least key-sized");
+
+ // Either this is the first call (positioned on the leading "u-") or the
+ // iterator rests on the separator returned by the previous call.
+ MOZ_ASSERT((iter_[0] == 'u' && iter_[1] == '-') || iter_[0] == '-');
+
+ while (true) {
+ // Skip past '-' so |std::char_traits::find| makes progress. Skipping
+ // 'u' is harmless -- skip or not, |find| returns the first '-'.
+ iter_++;
+
+ // Find the next separator.
+ iter_ = std::char_traits<CharT>::find(
+ iter_, mozilla::PointerRangeSize(iter_, end_), CharT('-'));
+ if (!iter_) {
+ return nullptr;
+ }
+
+ MOZ_ASSERT(iter_ + SepKeyLength <= end_,
+ "non-leading subtags in a Unicode locale extension are all "
+ "at least as long as a key");
+
+ // A subtag that is exactly key-sized starts a keyword; longer subtags
+ // are attributes or type parts and are skipped.
+ if (iter_ + SepKeyLength == end_ || // key is terminal subtag
+ iter_[SepKeyLength] == '-') { // key is followed by more subtags
+ break;
+ }
+ }
+
+ MOZ_ASSERT(iter_[0] == '-');
+ MOZ_ASSERT(mozilla::IsAsciiLowercaseAlpha(iter_[1]) ||
+ mozilla::IsAsciiDigit(iter_[1]));
+ MOZ_ASSERT(mozilla::IsAsciiLowercaseAlpha(iter_[2]));
+ MOZ_ASSERT_IF(iter_ + SepKeyLength < end_, iter_[SepKeyLength] == '-');
+ return iter_;
+ }
+};
+
+/**
+ * 9.2.10 GetOption ( options, property, type, values, fallback )
+ *
+ * String flavour of GetOption: when |options| has a property |name| whose
+ * value isn't undefined, |string| receives |ToString(value)| as a linear
+ * string. Otherwise |string| is set to nullptr. Returns false on failure.
+ */
+static bool GetStringOption(JSContext* cx, HandleObject options,
+                            HandlePropertyName name,
+                            MutableHandle<JSLinearString*> string) {
+  // Step 1.
+  RootedValue value(cx);
+  if (!GetProperty(cx, options, options, name, &value)) {
+    return false;
+  }
+
+  // Step 3 for an absent option: report "no value" through a null string.
+  if (value.isUndefined()) {
+    string.set(nullptr);
+    return true;
+  }
+
+  // Steps 2.a-b, 2.d (not applicable).
+
+  // Steps 2.c, 2.e.
+  JSString* converted = ToString(cx, value);
+  if (!converted) {
+    return false;
+  }
+
+  JSLinearString* linear = converted->ensureLinear(cx);
+  if (!linear) {
+    return false;
+  }
+
+  // Step 3.
+  string.set(linear);
+  return true;
+}
+
+/**
+ * 9.2.10 GetOption ( options, property, type, values, fallback )
+ *
+ * If the requested property is present and not-undefined, set the result string
+ * to |ToString(ToBoolean(value))|. Otherwise set the result string to nullptr.
+ *
+ * Returns false only when reading the property from |options| fails.
+ */
+static bool GetBooleanOption(JSContext* cx, HandleObject options,
+ HandlePropertyName name,
+ MutableHandle<JSLinearString*> string) {
+ // Step 1.
+ RootedValue option(cx);
+ if (!GetProperty(cx, options, options, name, &option)) {
+ return false;
+ }
+
+ // Step 2.
+ JSLinearString* linear = nullptr;
+ if (!option.isUndefined()) {
+ // Steps 2.a, 2.c-d (not applicable).
+
+ // Steps 2.b, 2.e.
+ // NOTE(review): the assertion presumes BooleanToString never fails and
+ // yields a string that linearizes infallibly -- confirm.
+ JSString* str = BooleanToString(cx, ToBoolean(option));
+ MOZ_ALWAYS_TRUE(linear = str->ensureLinear(cx));
+ }
+
+ // Step 3.
+ string.set(linear);
+ return true;
+}
+
+/**
+ * ApplyOptionsToTag ( tag, options )
+ *
+ * Read the "language", "script" and "region" options from |options| and, for
+ * each one that is present, replace the corresponding subtag of |tag|. When at
+ * least one subtag was replaced, the base-name of |tag| is canonicalized
+ * again.
+ *
+ * Returns false when reading an option fails, when an option isn't a valid
+ * subtag of its kind, or when canonicalizing the base-name fails.
+ */
+static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag,
+                              HandleObject options) {
+  // Steps 1-2 (Already performed in caller).
+
+  RootedLinearString option(cx);
+
+  // Step 3.
+  if (!GetStringOption(cx, options, cx->names().language, &option)) {
+    return false;
+  }
+
+  // Step 4.
+  intl::LanguageSubtag language;
+  if (option && !intl::ParseStandaloneLanguagTag(option, language)) {
+    if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) {
+      JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+                               JSMSG_INVALID_OPTION_VALUE, "language",
+                               str.get());
+    }
+    return false;
+  }
+
+  // Step 5.
+  if (!GetStringOption(cx, options, cx->names().script, &option)) {
+    return false;
+  }
+
+  // Step 6.
+  intl::ScriptSubtag script;
+  if (option && !intl::ParseStandaloneScriptTag(option, script)) {
+    if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) {
+      JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+                               JSMSG_INVALID_OPTION_VALUE, "script", str.get());
+    }
+    return false;
+  }
+
+  // Step 7.
+  if (!GetStringOption(cx, options, cx->names().region, &option)) {
+    return false;
+  }
+
+  // Step 8.
+  intl::RegionSubtag region;
+  if (option && !intl::ParseStandaloneRegionTag(option, region)) {
+    if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) {
+      JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+                               JSMSG_INVALID_OPTION_VALUE, "region", str.get());
+    }
+    return false;
+  }
+
+  // Step 9 (Already performed in caller).
+
+  // Skip steps 10-13 when no subtags were modified.
+  if (language.length() > 0 || script.length() > 0 || region.length() > 0) {
+    // Step 10.
+    if (language.length() > 0) {
+      tag.setLanguage(language);
+    }
+
+    // Step 11.
+    if (script.length() > 0) {
+      tag.setScript(script);
+    }
+
+    // Step 12.
+    if (region.length() > 0) {
+      tag.setRegion(region);
+    }
+
+    // Step 13.
+    // Optimized to only canonicalize the base-name subtags. All other
+    // canonicalization steps will happen later.
+    //
+    // A failure here must be propagated: reporting success would let the
+    // caller continue with a tag that wasn't canonicalized.
+    if (!tag.canonicalizeBaseName(cx)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * ApplyUnicodeExtensionToTag( tag, options, relevantExtensionKeys )
+ *
+ * Build a new Unicode extension subtag for |tag| from the given option values
+ * (each may be null when the option wasn't supplied) and install it with
+ * |tag.setUnicodeExtension|. Attributes and keywords of an already present
+ * Unicode extension are carried over. Returns false on failure.
+ */
+static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag,
+ HandleLinearString calendar,
+ HandleLinearString collation,
+ HandleLinearString hourCycle,
+ HandleLinearString caseFirst,
+ HandleLinearString numeric,
+ HandleLinearString numberingSystem) {
+ // If no Unicode extensions were present in the options object, we can skip
+ // everything below and directly return.
+ if (!calendar && !collation && !caseFirst && !hourCycle && !numeric &&
+ !numberingSystem) {
+ return true;
+ }
+
+ // The new extension always starts with the singleton 'u'.
+ Vector<char, 32> newExtension(cx);
+ if (!newExtension.append('u')) {
+ return false;
+ }
+
+ // Check if there's an existing Unicode extension subtag. (The extension
+ // subtags aren't necessarily sorted, so we can't use binary search here.)
+ const UniqueChars* existingUnicodeExtension =
+ std::find_if(tag.extensions().begin(), tag.extensions().end(),
+ [](const auto& extension) { return extension[0] == 'u'; });
+
+ // Both stay null when the tag has no Unicode extension; the keywords
+ // pointer also stays null when the existing extension has no keywords.
+ const char* unicodeExtensionEnd = nullptr;
+ const char* unicodeExtensionKeywords = nullptr;
+ if (existingUnicodeExtension != tag.extensions().end()) {
+ const char* unicodeExtension = existingUnicodeExtension->get();
+ unicodeExtensionEnd = unicodeExtension + strlen(unicodeExtension);
+
+ SepKeywordIterator<char> iter(unicodeExtension, unicodeExtensionEnd);
+
+ // Find the start of the first keyword.
+ unicodeExtensionKeywords = iter.next();
+
+ // Copy any attributes present before the first keyword.
+ const char* attributesEnd = unicodeExtensionKeywords
+ ? unicodeExtensionKeywords
+ : unicodeExtensionEnd;
+ // Start after the leading 'u', which was already appended above.
+ if (!newExtension.append(unicodeExtension + 1, attributesEnd)) {
+ return false;
+ }
+ }
+
+ // A string literal of the form "-xx-": separator, key, separator and the
+ // terminating null character.
+ using UnicodeKeyWithSeparator = const char(&)[UnicodeKeyLength + 3];
+
+ auto appendKeyword = [&newExtension](UnicodeKeyWithSeparator key,
+ JSLinearString* value) {
+ // Append the key with both separators, but without the null terminator.
+ if (!newExtension.append(key, UnicodeKeyLength + 2)) {
+ return false;
+ }
+
+ JS::AutoCheckCannotGC nogc;
+ return value->hasLatin1Chars()
+ ? newExtension.append(value->latin1Chars(nogc), value->length())
+ : newExtension.append(value->twoByteChars(nogc),
+ value->length());
+ };
+
+ // Append the new keywords before any existing keywords. That way any previous
+ // keyword with the same key is detected as a duplicate when canonicalizing
+ // the Unicode extension subtag and gets discarded.
+
+ size_t startNewKeywords = newExtension.length();
+
+ if (calendar) {
+ if (!appendKeyword("-ca-", calendar)) {
+ return false;
+ }
+ }
+ if (collation) {
+ if (!appendKeyword("-co-", collation)) {
+ return false;
+ }
+ }
+ if (hourCycle) {
+ if (!appendKeyword("-hc-", hourCycle)) {
+ return false;
+ }
+ }
+ if (caseFirst) {
+ if (!appendKeyword("-kf-", caseFirst)) {
+ return false;
+ }
+ }
+ if (numeric) {
+ if (!appendKeyword("-kn-", numeric)) {
+ return false;
+ }
+ }
+ if (numberingSystem) {
+ if (!appendKeyword("-nu-", numberingSystem)) {
+ return false;
+ }
+ }
+
+ // Normalize the case of the new keywords.
+ std::transform(newExtension.begin() + startNewKeywords, newExtension.end(),
+ newExtension.begin() + startNewKeywords, [](char c) {
+ return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c;
+ });
+
+ // Append the remaining keywords from the previous Unicode extension subtag.
+ if (unicodeExtensionKeywords) {
+ if (!newExtension.append(unicodeExtensionKeywords, unicodeExtensionEnd)) {
+ return false;
+ }
+ }
+
+ // Null-terminate the new Unicode extension string.
+ if (!newExtension.append('\0')) {
+ return false;
+ }
+
+ // Insert the new Unicode extension string into the language tag.
+ UniqueChars newExtensionChars(newExtension.extractOrCopyRawBuffer());
+ if (!newExtensionChars) {
+ return false;
+ }
+ return tag.setUnicodeExtension(std::move(newExtensionChars));
+}
+
+// Return the language tag string of |obj| when it is a Locale object, either
+// directly or behind a wrapper. Returns a null string when |obj| isn't a
+// Locale object, and an error result when unwrapping or re-wrapping fails.
+static JS::Result<JSString*> LanguageTagFromMaybeWrappedLocale(JSContext* cx,
+ JSObject* obj) {
+ // Fast path: |obj| is an unwrapped Locale object.
+ if (obj->is<LocaleObject>()) {
+ return obj->as<LocaleObject>().languageTag();
+ }
+
+ JSObject* unwrapped = CheckedUnwrap(obj);
+ if (!unwrapped) {
+ // NOTE(review): the access-denied report is commented out, so this path
+ // presumably returns an error without a pending exception -- confirm.
+ /* ReportAccessDenied(cx); */
+ return cx->alreadyReportedError();
+ }
+
+ if (!unwrapped->is<LocaleObject>()) {
+ return nullptr;
+ }
+
+ // The string was taken from the unwrapped object, so wrap it for the
+ // current compartment before handing it out.
+ RootedString tagStr(cx, unwrapped->as<LocaleObject>().languageTag());
+ if (!cx->compartment()->wrap(cx, &tagStr)) {
+ return cx->alreadyReportedError();
+ }
+ return tagStr.get();
+}
+
+/**
+ * Intl.Locale( tag[, options] )
+ *
+ * Native constructor: parses |tag| (a string or a Locale object), applies the
+ * subtag and Unicode extension options when an options argument is given,
+ * canonicalizes the result and returns a new Locale object in |args.rval()|.
+ */
+static bool Locale(JSContext* cx, unsigned argc, Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+
+ // Step 1.
+ if (!ThrowIfNotConstructing(cx, args, "Intl.Locale")) {
+ return false;
+ }
+
+ // Steps 2-6 (Inlined 9.1.14, OrdinaryCreateFromConstructor).
+ RootedObject proto(cx);
+ if (!GetPrototypeFromCallableConstructor(cx, args, &proto)) {
+ return false;
+ }
+
+ // Steps 7-9.
+ HandleValue tagValue = args.get(0);
+ JSString* tagStr;
+ if (tagValue.isObject()) {
+ // A (possibly wrapped) Locale object contributes its stored language tag;
+ // any other object is converted with ToString.
+ JS_TRY_VAR_OR_RETURN_FALSE(
+ cx, tagStr,
+ LanguageTagFromMaybeWrappedLocale(cx, &tagValue.toObject()));
+ if (!tagStr) {
+ tagStr = ToString(cx, tagValue);
+ if (!tagStr) {
+ return false;
+ }
+ }
+ } else if (tagValue.isString()) {
+ tagStr = tagValue.toString();
+ } else {
+ // Neither a string nor an object: reject the argument.
+ JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+ JSMSG_INVALID_LOCALES_ELEMENT);
+ return false;
+ }
+
+ RootedLinearString tagLinearStr(cx, tagStr->ensureLinear(cx));
+ if (!tagLinearStr) {
+ return false;
+ }
+
+ // ApplyOptionsToTag, steps 2 and 9.
+ LanguageTag tag(cx);
+ if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) {
+ return false;
+ }
+
+ if (!tag.canonicalizeBaseName(cx)) {
+ return false;
+ }
+
+ // Steps 10-11.
+ if (args.hasDefined(1)) {
+ RootedObject options(cx, ToObject(cx, args[1]));
+ if (!options) {
+ return false;
+ }
+
+ // Step 12.
+ if (!ApplyOptionsToTag(cx, tag, options)) {
+ return false;
+ }
+
+ // Step 13 (not applicable).
+
+ // Steps 14, 16.
+ RootedLinearString calendar(cx);
+ if (!GetStringOption(cx, options, cx->names().calendar, &calendar)) {
+ return false;
+ }
+
+ // Step 15.
+ if (calendar) {
+ if (!IsValidUnicodeExtensionValue(calendar)) {
+ if (UniqueChars str = StringToNewUTF8CharsZ(cx, *calendar)) {
+ JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+ JSMSG_INVALID_OPTION_VALUE, "calendar",
+ str.get());
+ }
+ return false;
+ }
+ }
+
+ // Steps 17, 19.
+ RootedLinearString collation(cx);
+ if (!GetStringOption(cx, options, cx->names().collation, &collation)) {
+ return false;
+ }
+
+ // Step 18.
+ if (collation) {
+ if (!IsValidUnicodeExtensionValue(collation)) {
+ if (UniqueChars str = StringToNewUTF8CharsZ(cx, *collation)) {
+ JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+ JSMSG_INVALID_OPTION_VALUE, "collation",
+ str.get());
+ }
+ return false;
+ }
+ }
+
+ // Steps 20-21.
+ RootedLinearString hourCycle(cx);
+ if (!GetStringOption(cx, options, cx->names().hourCycle, &hourCycle)) {
+ return false;
+ }
+
+ // Only the four listed hour cycle values are accepted.
+ if (hourCycle) {
+ if (!StringEqualsAscii(hourCycle, "h11") &&
+ !StringEqualsAscii(hourCycle, "h12") &&
+ !StringEqualsAscii(hourCycle, "h23") &&
+ !StringEqualsAscii(hourCycle, "h24")) {
+ if (UniqueChars str = StringToNewUTF8CharsZ(cx, *hourCycle)) {
+ JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+ JSMSG_INVALID_OPTION_VALUE, "hourCycle",
+ str.get());
+ }
+ return false;
+ }
+ }
+
+ // Steps 22-23.
+ RootedLinearString caseFirst(cx);
+ if (!GetStringOption(cx, options, cx->names().caseFirst, &caseFirst)) {
+ return false;
+ }
+
+ // Only "upper", "lower" and "false" are accepted.
+ if (caseFirst) {
+ if (!StringEqualsAscii(caseFirst, "upper") &&
+ !StringEqualsAscii(caseFirst, "lower") &&
+ !StringEqualsAscii(caseFirst, "false")) {
+ if (UniqueChars str = StringToNewUTF8CharsZ(cx, *caseFirst)) {
+ JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+ JSMSG_INVALID_OPTION_VALUE, "caseFirst",
+ str.get());
+ }
+ return false;
+ }
+ }
+
+ // Steps 24-26.
+ RootedLinearString numeric(cx);
+ if (!GetBooleanOption(cx, options, cx->names().numeric, &numeric)) {
+ return false;
+ }
+
+ // Steps 27, 29.
+ RootedLinearString numberingSystem(cx);
+ if (!GetStringOption(cx, options, cx->names().numberingSystem,
+ &numberingSystem)) {
+ return false;
+ }
+
+ // Step 28.
+ if (numberingSystem) {
+ if (!IsValidUnicodeExtensionValue(numberingSystem)) {
+ if (UniqueChars str = StringToNewUTF8CharsZ(cx, *numberingSystem)) {
+ JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr,
+ JSMSG_INVALID_OPTION_VALUE,
+ "numberingSystem", str.get());
+ }
+ return false;
+ }
+ }
+
+ // Step 30.
+ if (!ApplyUnicodeExtensionToTag(cx, tag, calendar, collation, hourCycle,
+ caseFirst, numeric, numberingSystem)) {
+ return false;
+ }
+ }
+
+ // ApplyOptionsToTag, steps 9 and 13.
+ // ApplyUnicodeExtensionToTag, step 8.
+ if (!tag.canonicalizeExtensions(
+ cx, LanguageTag::UnicodeExtensionCanonicalForm::Yes)) {
+ return false;
+ }
+
+ // Steps 6, 31-37.
+ JSObject* obj = CreateLocaleObject(cx, proto, tag);
+ if (!obj) {
+ return false;
+ }
+
+ // Step 38.
+ args.rval().setObject(*obj);
+ return true;
+}
+
+using UnicodeKey = const char (&)[UnicodeKeyLength + 1];
+
+// Search the Unicode locale extension |extension| for the keyword whose key
+// is |key| and return the [index, length] of that keyword's `type`. When the
+// keyword has no type, |index| is where the type would have been and |length|
+// is zero. Nothing is returned when no keyword uses |key|.
+template <typename CharT>
+static mozilla::Maybe<IndexAndLength> FindUnicodeExtensionType(
+    const CharT* extension, size_t length, UnicodeKey key) {
+  MOZ_ASSERT(extension[0] == 'u');
+  MOZ_ASSERT(extension[1] == '-');
+
+  const CharT* const extensionEnd = extension + length;
+
+  SepKeywordIterator<CharT> keywords(extension, extensionEnd);
+
+  // Walk the keywords until one with a matching key turns up.
+  const CharT* keyStart;
+  do {
+    const CharT* sepKeyword = keywords.next();
+    if (!sepKeyword) {
+      return mozilla::Nothing();
+    }
+
+    // The iterator stops on the separator; the key starts right after it.
+    MOZ_ASSERT(sepKeyword[0] == '-');
+    keyStart = sepKeyword + 1;
+  } while (!std::equal(keyStart, keyStart + UnicodeKeyLength, key));
+
+  // The type, if any, follows the key.
+  const CharT* typeStart = keyStart + UnicodeKeyLength;
+
+  // The type ends where the next keyword starts, or at the end of the Unicode
+  // extension when this was the last keyword.
+  const CharT* typeEnd = keywords.next();
+  if (!typeEnd) {
+    typeEnd = extensionEnd;
+  }
+
+  // A non-empty type is preceded by a separator which isn't part of it.
+  if (typeStart != typeEnd) {
+    MOZ_ASSERT(typeStart[0] == '-');
+    typeStart++;
+  }
+
+  const size_t typeIndex = size_t(typeStart - extension);
+  const size_t typeLength = size_t(typeEnd - typeStart);
+  return mozilla::Some(IndexAndLength{typeIndex, typeLength});
+}
+
+// String overload: dispatch on the character encoding of |unicodeExtension|.
+static inline auto FindUnicodeExtensionType(JSLinearString* unicodeExtension,
+                                            UnicodeKey key) {
+  JS::AutoCheckCannotGC nogc;
+  if (unicodeExtension->hasLatin1Chars()) {
+    return FindUnicodeExtensionType(unicodeExtension->latin1Chars(nogc),
+                                    unicodeExtension->length(), key);
+  }
+  return FindUnicodeExtensionType(unicodeExtension->twoByteChars(nogc),
+                                  unicodeExtension->length(), key);
+}
+
+// Store in |value| the type of the Unicode extension keyword |key| of
+// |locale|, or undefined when the locale has no Unicode extension or no
+// keyword with that key. Returns false on failure.
+static bool GetUnicodeExtension(JSContext* cx, LocaleObject* locale,
+                                UnicodeKey key, MutableHandleValue value) {
+  // Without a Unicode extension subtag there is nothing to look up.
+  const Value& extensionValue = locale->unicodeExtension();
+  if (extensionValue.isUndefined()) {
+    value.setUndefined();
+    return true;
+  }
+
+  JSLinearString* extension = extensionValue.toString()->ensureLinear(cx);
+  if (!extension) {
+    return false;
+  }
+
+  // Locate the type belonging to the requested key.
+  auto typePosition = FindUnicodeExtensionType(extension, key);
+  if (typePosition.isNothing()) {
+    // The key doesn't occur in the extension.
+    value.setUndefined();
+    return true;
+  }
+
+  // Hand out the type as a substring of the extension.
+  JSString* type = NewDependentString(cx, extension, typePosition->index,
+                                      typePosition->length);
+  if (!type) {
+    return false;
+  }
+
+  value.setString(type);
+  return true;
+}
+
+// Positions of the language, script and region subtags within a base-name
+// string. Script and region are Nothing when the base-name lacks them.
+struct BaseNamePartsResult {
+ IndexAndLength language;
+ mozilla::Maybe<IndexAndLength> script;
+ mozilla::Maybe<IndexAndLength> region;
+};
+
+// Split the base-name |baseName| of |length| characters into its language,
+// script and region subtags. The result holds the index and length of the
+// language subtag and, when present, of the script and region subtags.
+template <typename CharT>
+static BaseNamePartsResult BaseNameParts(const CharT* baseName, size_t length) {
+ size_t languageLength;
+ // An index of zero means "not present": index zero is always the language.
+ size_t scriptIndex = 0;
+ size_t regionIndex = 0;
+ size_t regionLength = 0;
+
+ // Search the first separator to find the end of the language subtag.
+ if (const CharT* sep = std::char_traits<CharT>::find(baseName, length, '-')) {
+ languageLength = sep - baseName;
+
+ // Add +1 to skip over the separator character.
+ size_t nextSubtag = languageLength + 1;
+
+ // Script subtags are always four characters long, but take care for a four
+ // character long variant subtag. These start with a digit.
+ if ((nextSubtag + ScriptLength == length ||
+ (nextSubtag + ScriptLength < length &&
+ baseName[nextSubtag + ScriptLength] == '-')) &&
+ mozilla::IsAsciiAlpha(baseName[nextSubtag])) {
+ scriptIndex = nextSubtag;
+ nextSubtag = scriptIndex + ScriptLength + 1;
+ }
+
+ // Region subtags can be either two or three characters long.
+ if (nextSubtag < length) {
+ for (size_t rlen : {AlphaRegionLength, DigitRegionLength}) {
+ MOZ_ASSERT(nextSubtag + rlen <= length);
+ // The subtag is a region when it ends after exactly |rlen| characters.
+ if (nextSubtag + rlen == length || baseName[nextSubtag + rlen] == '-') {
+ regionIndex = nextSubtag;
+ regionLength = rlen;
+ break;
+ }
+ }
+ }
+ } else {
+ // No separator found, the base-name consists of just a language subtag.
+ languageLength = length;
+ }
+
+ IndexAndLength language{0, languageLength};
+ MOZ_ASSERT(intl::IsStructurallyValidLanguageTag(language.rangeOf(baseName)));
+
+ mozilla::Maybe<IndexAndLength> script{};
+ if (scriptIndex) {
+ script.emplace(scriptIndex, ScriptLength);
+ MOZ_ASSERT(intl::IsStructurallyValidScriptTag(script->rangeOf(baseName)));
+ }
+
+ mozilla::Maybe<IndexAndLength> region{};
+ if (regionIndex) {
+ region.emplace(regionIndex, regionLength);
+ MOZ_ASSERT(intl::IsStructurallyValidRegionTag(region->rangeOf(baseName)));
+ }
+
+ return {language, script, region};
+}
+
+// String overload: dispatch on the character encoding of |baseName|.
+static inline auto BaseNameParts(JSLinearString* baseName) {
+  JS::AutoCheckCannotGC nogc;
+  if (baseName->hasLatin1Chars()) {
+    return BaseNameParts(baseName->latin1Chars(nogc), baseName->length());
+  }
+  return BaseNameParts(baseName->twoByteChars(nogc), baseName->length());
+}
+
+// Intl.Locale.prototype.maximize ()
+//
+// Implementation that runs once |this| is known to be a Locale object.
+static bool Locale_maximize(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3: parse the stored language tag and add the likely subtags.
+  LocaleObject& thisLocale = args.thisv().toObject().as<LocaleObject>();
+  RootedLinearString languageTag(cx,
+                                 thisLocale.languageTag()->ensureLinear(cx));
+  if (!languageTag) {
+    return false;
+  }
+
+  LanguageTag tag(cx);
+  if (!LanguageTagParser::parse(cx, languageTag, tag)) {
+    return false;
+  }
+  if (!tag.addLikelySubtags(cx)) {
+    return false;
+  }
+
+  // Step 4: wrap the maximized tag in a fresh Locale object.
+  LocaleObject* maximized = CreateLocaleObject(cx, nullptr, tag);
+  if (!maximized) {
+    return false;
+  }
+
+  args.rval().setObject(*maximized);
+  return true;
+}
+
+// Intl.Locale.prototype.maximize ()
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_maximize(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_maximize>(cx, callArgs);
+}
+
+// Intl.Locale.prototype.minimize ()
+//
+// Implementation half: |this| has been checked with IsLocale. Re-parses the
+// object's stored language tag, applies removeLikelySubtags() to it and
+// returns a new Locale object created from the resulting tag.
+static bool Locale_minimize(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject& self = args.thisv().toObject().as<LocaleObject>();
+  RootedLinearString linearTag(cx, self.languageTag()->ensureLinear(cx));
+  if (!linearTag) {
+    return false;
+  }
+
+  LanguageTag parsed(cx);
+  if (!LanguageTagParser::parse(cx, linearTag, parsed) ||
+      !parsed.removeLikelySubtags(cx)) {
+    return false;
+  }
+
+  // Step 4.
+  auto* minimized = CreateLocaleObject(cx, nullptr, parsed);
+  if (!minimized) {
+    return false;
+  }
+
+  args.rval().setObject(*minimized);
+  return true;
+}
+
+// Intl.Locale.prototype.minimize ()
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_minimize(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_minimize>(cx, callArgs);
+}
+
+// Intl.Locale.prototype.toString ()
+//
+// Implementation half: returns the string held in the object's language-tag
+// slot as is.
+static bool Locale_toString(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  const LocaleObject& self = args.thisv().toObject().as<LocaleObject>();
+  args.rval().setString(self.languageTag());
+  return true;
+}
+
+// Intl.Locale.prototype.toString ()
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_toString(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_toString>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.baseName
+//
+// Getter body: returns the string held in the object's base-name slot as is.
+static bool Locale_baseName(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // FIXME: spec bug - invalid assertion in step 4.
+  // FIXME: spec bug - subtag production names not updated.
+
+  // Steps 3, 5.
+  const LocaleObject& self = args.thisv().toObject().as<LocaleObject>();
+  args.rval().setString(self.baseName());
+  return true;
+}
+
+// get Intl.Locale.prototype.baseName
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_baseName(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_baseName>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.calendar
+//
+// Getter body: delegates to GetUnicodeExtension with the "ca" key and lets it
+// fill in the return value.
+static bool Locale_calendar(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject* self = &args.thisv().toObject().as<LocaleObject>();
+  return GetUnicodeExtension(cx, self, "ca", args.rval());
+}
+
+// get Intl.Locale.prototype.calendar
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_calendar(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_calendar>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.collation
+//
+// Getter body: delegates to GetUnicodeExtension with the "co" key and lets it
+// fill in the return value.
+static bool Locale_collation(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject* self = &args.thisv().toObject().as<LocaleObject>();
+  return GetUnicodeExtension(cx, self, "co", args.rval());
+}
+
+// get Intl.Locale.prototype.collation
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_collation(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_collation>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.hourCycle
+//
+// Getter body: delegates to GetUnicodeExtension with the "hc" key and lets it
+// fill in the return value.
+static bool Locale_hourCycle(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject* self = &args.thisv().toObject().as<LocaleObject>();
+  return GetUnicodeExtension(cx, self, "hc", args.rval());
+}
+
+// get Intl.Locale.prototype.hourCycle
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_hourCycle(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_hourCycle>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.caseFirst
+//
+// Getter body: delegates to GetUnicodeExtension with the "kf" key and lets it
+// fill in the return value.
+static bool Locale_caseFirst(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject* self = &args.thisv().toObject().as<LocaleObject>();
+  return GetUnicodeExtension(cx, self, "kf", args.rval());
+}
+
+// get Intl.Locale.prototype.caseFirst
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_caseFirst(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_caseFirst>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.numeric
+//
+// Getter body: fetches the "kn" Unicode extension value and reports true
+// exactly when that value is a string and the string is empty; an undefined
+// value or a non-empty string yields false.
+static bool Locale_numeric(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject* self = &args.thisv().toObject().as<LocaleObject>();
+  RootedValue kn(cx);
+  if (!GetUnicodeExtension(cx, self, "kn", &kn)) {
+    return false;
+  }
+
+  // FIXME: spec bug - comparison should be against the empty string, too.
+  MOZ_ASSERT(kn.isUndefined() || kn.isString());
+  const bool isNumeric = kn.isString() && kn.toString()->empty();
+  args.rval().setBoolean(isNumeric);
+  return true;
+}
+
+// get Intl.Locale.prototype.numeric
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_numeric(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_numeric>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.numberingSystem
+//
+// Getter body: delegates to GetUnicodeExtension with the "nu" key and lets it
+// fill in the return value. Unlike the sibling getter implementations this
+// one carries an Intl_ prefix; the native entry point refers to it by that
+// name.
+static bool Intl_Locale_numberingSystem(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject* self = &args.thisv().toObject().as<LocaleObject>();
+  return GetUnicodeExtension(cx, self, "nu", args.rval());
+}
+
+// get Intl.Locale.prototype.numberingSystem
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs Intl_Locale_numberingSystem.
+static bool Locale_numberingSystem(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Intl_Locale_numberingSystem>(cx,
+                                                                     callArgs);
+}
+
+// get Intl.Locale.prototype.language
+//
+// Getter body: locates the language subtag inside the object's base name via
+// BaseNameParts and returns it as a dependent string of that base name.
+static bool Locale_language(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject& self = args.thisv().toObject().as<LocaleObject>();
+  JSLinearString* linearBase = self.baseName()->ensureLinear(cx);
+  if (!linearBase) {
+    return false;
+  }
+
+  // Step 4 (Unnecessary assertion).
+
+  // Copy the range out as plain integers before allocating below.
+  const auto languagePart = BaseNameParts(linearBase).language;
+  const size_t start = languagePart.index;
+  const size_t count = languagePart.length;
+
+  // Step 5.
+  // FIXME: spec bug - not all production names updated.
+  JSString* languageStr = NewDependentString(cx, linearBase, start, count);
+  if (!languageStr) {
+    return false;
+  }
+
+  args.rval().setString(languageStr);
+  return true;
+}
+
+// get Intl.Locale.prototype.language
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_language(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_language>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.script
+//
+// Getter body: returns undefined when BaseNameParts finds no script subtag in
+// the object's base name, otherwise a dependent string covering that subtag.
+static bool Locale_script(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject& self = args.thisv().toObject().as<LocaleObject>();
+  JSLinearString* linearBase = self.baseName()->ensureLinear(cx);
+  if (!linearBase) {
+    return false;
+  }
+
+  // Step 4 (Unnecessary assertion).
+
+  const auto scriptPart = BaseNameParts(linearBase).script;
+
+  // Step 5.
+  // FIXME: spec bug - not all production names updated.
+  if (scriptPart.isNothing()) {
+    args.rval().setUndefined();
+    return true;
+  }
+
+  // Copy the range out as plain integers before allocating below.
+  const size_t start = scriptPart->index;
+  const size_t count = scriptPart->length;
+
+  // Step 6.
+  JSString* scriptStr = NewDependentString(cx, linearBase, start, count);
+  if (!scriptStr) {
+    return false;
+  }
+
+  args.rval().setString(scriptStr);
+  return true;
+}
+
+// get Intl.Locale.prototype.script
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_script(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_script>(cx, callArgs);
+}
+
+// get Intl.Locale.prototype.region
+//
+// Getter body: returns undefined when BaseNameParts finds no region subtag in
+// the object's base name, otherwise a dependent string covering that subtag.
+static bool Locale_region(JSContext* cx, const CallArgs& args) {
+  MOZ_ASSERT(IsLocale(args.thisv()));
+
+  // Step 3.
+  LocaleObject& self = args.thisv().toObject().as<LocaleObject>();
+  JSLinearString* linearBase = self.baseName()->ensureLinear(cx);
+  if (!linearBase) {
+    return false;
+  }
+
+  // Step 4 (Unnecessary assertion).
+
+  const auto regionPart = BaseNameParts(linearBase).region;
+
+  // Step 5.
+  if (regionPart.isNothing()) {
+    args.rval().setUndefined();
+    return true;
+  }
+
+  // Copy the range out as plain integers before allocating below.
+  const size_t start = regionPart->index;
+  const size_t count = regionPart->length;
+
+  // Step 6.
+  JSString* regionStr = NewDependentString(cx, linearBase, start, count);
+  if (!regionStr) {
+    return false;
+  }
+
+  args.rval().setString(regionStr);
+  return true;
+}
+
+// get Intl.Locale.prototype.region
+//
+// Native entry point: checks |this| with IsLocale through
+// CallNonGenericMethod and then runs the CallArgs overload.
+static bool Locale_region(JSContext* cx, unsigned argc, Value* vp) {
+  // Steps 1-2.
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  return CallNonGenericMethod<IsLocale, Locale_region>(cx, callArgs);
+}
+
+// Native behind the prototype's toSource entry. It inspects neither |this|
+// nor its arguments and always returns the |Locale| entry of cx->names().
+static bool Locale_toSource(JSContext* cx, unsigned argc, Value* vp) {
+  CallArgs callArgs = CallArgsFromVp(argc, vp);
+  callArgs.rval().setString(cx->names().Locale);
+  return true;
+}
+
+// Function table for the Intl.Locale prototype. CreateLocalePrototype passes
+// it to DefinePropertiesAndFunctions; each entry names one of the native
+// entry points defined in this file. The table ends with JS_FS_END.
+static const JSFunctionSpec locale_methods[] = {
+ JS_FN("maximize", Locale_maximize, 0, 0),
+ JS_FN("minimize", Locale_minimize, 0, 0),
+ JS_FN(js_toString_str, Locale_toString, 0, 0),
+ JS_FN(js_toSource_str, Locale_toSource, 0, 0), JS_FS_END};
+
+// Property table for the Intl.Locale prototype. CreateLocalePrototype passes
+// it to DefinePropertiesAndFunctions. Every JS_PSG entry pairs a property
+// name with the getter native of the same name defined in this file; the
+// final entry supplies the "Intl.Locale" toStringTag string. The table ends
+// with JS_PS_END.
+static const JSPropertySpec locale_properties[] = {
+ JS_PSG("baseName", Locale_baseName, 0),
+ JS_PSG("calendar", Locale_calendar, 0),
+ JS_PSG("collation", Locale_collation, 0),
+ JS_PSG("hourCycle", Locale_hourCycle, 0),
+ JS_PSG("caseFirst", Locale_caseFirst, 0),
+ JS_PSG("numeric", Locale_numeric, 0),
+ JS_PSG("numberingSystem", Locale_numberingSystem, 0),
+ JS_PSG("language", Locale_language, 0),
+ JS_PSG("script", Locale_script, 0),
+ JS_PSG("region", Locale_region, 0),
+ JS_STRING_SYM_PS(toStringTag, "Intl.Locale", JSPROP_READONLY),
+ JS_PS_END};
+
+// Builds the Intl.Locale constructor and its prototype object.
+//
+// Creates the constructor function, creates a blank plain-object prototype in
+// |global|, links the two, installs locale_properties and locale_methods on
+// the prototype and finally defines the constructor as the |Locale| property
+// of |Intl|. Returns the prototype, or nullptr as soon as any step fails.
+JSObject* js::CreateLocalePrototype(JSContext* cx, HandleObject Intl,
+                                    Handle<GlobalObject*> global) {
+  RootedFunction localeCtor(
+      cx, GlobalObject::createConstructor(cx, &Locale, cx->names().Locale, 1));
+  if (!localeCtor) {
+    return nullptr;
+  }
+
+  RootedObject localeProto(
+      cx, GlobalObject::createBlankPrototype<PlainObject>(cx, global));
+  if (!localeProto) {
+    return nullptr;
+  }
+
+  if (!LinkConstructorAndPrototype(cx, localeCtor, localeProto) ||
+      !DefinePropertiesAndFunctions(cx, localeProto, locale_properties,
+                                    locale_methods)) {
+    return nullptr;
+  }
+
+  // Expose the constructor as Intl.Locale.
+  RootedValue ctorVal(cx, ObjectValue(*localeCtor));
+  if (!DefineProperty(cx, Intl, cx->names().Locale, ctorVal, nullptr, nullptr,
+                      0)) {
+    return nullptr;
+  }
+
+  return localeProto;
+}
+
+// JSNative-style entry point taking exactly two arguments: the tag value and
+// a boolean that says whether non-string values are converted with ToString.
+//
+// Outcomes, in the order they are tried:
+//  - an object for which LanguageTagFromMaybeWrappedLocale yields a string
+//    returns that string;
+//  - a non-string value with the boolean false returns null;
+//  - a string accepted by ParseStandaloneISO639LanguageTag returns that
+//    helper's result;
+//  - anything else is parsed as a full language tag, canonicalized and
+//    serialized into a new string.
+// Returns false as soon as any step fails.
+bool js::intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, unsigned argc,
+                                                 Value* vp) {
+  CallArgs args = CallArgsFromVp(argc, vp);
+  MOZ_ASSERT(args.length() == 2);
+
+  HandleValue input = args[0];
+  const bool coerceToString = args[1].toBoolean();
+
+  if (input.isObject()) {
+    JSString* localeTag;
+    JS_TRY_VAR_OR_RETURN_FALSE(
+        cx, localeTag,
+        LanguageTagFromMaybeWrappedLocale(cx, &input.toObject()));
+    if (localeTag) {
+      args.rval().setString(localeTag);
+      return true;
+    }
+  }
+
+  if (!(coerceToString || input.isString())) {
+    args.rval().setNull();
+    return true;
+  }
+
+  JSString* inputStr = ToString(cx, input);
+  if (!inputStr) {
+    return false;
+  }
+
+  RootedLinearString linearInput(cx, inputStr->ensureLinear(cx));
+  if (!linearInput) {
+    return false;
+  }
+
+  // Fast path for the common case of a standalone language.
+  // Only the following Unicode BCP 47 locale identifier subset is accepted:
+  //   unicode_locale_id = unicode_language_id
+  //   unicode_language_id = unicode_language_subtag
+  //   unicode_language_subtag = alpha{2,3}
+  JSString* standalone;
+  JS_TRY_VAR_OR_RETURN_FALSE(
+      cx, standalone, intl::ParseStandaloneISO639LanguageTag(cx, linearInput));
+  if (standalone) {
+    args.rval().setString(standalone);
+    return true;
+  }
+
+  // General path: full parse, canonicalize, serialize.
+  LanguageTag parsed(cx);
+  if (!LanguageTagParser::parse(cx, linearInput, parsed)) {
+    return false;
+  }
+
+  if (!parsed.canonicalize(cx,
+                           LanguageTag::UnicodeExtensionCanonicalForm::No)) {
+    return false;
+  }
+
+  StringBuffer out(cx);
+  if (!parsed.appendTo(cx, out)) {
+    return false;
+  }
+
+  JSString* canonical = out.finishString();
+  if (!canonical) {
+    return false;
+  }
+
+  args.rval().setString(canonical);
+  return true;
+}
+
+// JSNative-style entry point taking exactly one argument, which is read as a
+// string.
+//
+// Attempts to parse that string with LanguageTagParser::tryParse. When the
+// parser reports the input as not parseable the result is null and the call
+// still succeeds; otherwise the tag is canonicalized and serialized into a
+// new string. Returns false as soon as any step fails.
+bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx,
+                                                    unsigned argc, Value* vp) {
+  CallArgs args = CallArgsFromVp(argc, vp);
+  MOZ_ASSERT(args.length() == 1);
+
+  RootedLinearString linearInput(cx, args[0].toString()->ensureLinear(cx));
+  if (!linearInput) {
+    return false;
+  }
+
+  LanguageTag parsed(cx);
+  bool parsedOk;
+  JS_TRY_VAR_OR_RETURN_FALSE(
+      cx, parsedOk, LanguageTagParser::tryParse(cx, linearInput, parsed));
+
+  // The caller handles invalid inputs.
+  if (!parsedOk) {
+    args.rval().setNull();
+    return true;
+  }
+
+  if (!parsed.canonicalize(cx,
+                           LanguageTag::UnicodeExtensionCanonicalForm::No)) {
+    return false;
+  }
+
+  StringBuffer out(cx);
+  if (!parsed.appendTo(cx, out)) {
+    return false;
+  }
+
+  JSString* canonical = out.finishString();
+  if (!canonical) {
+    return false;
+  }
+
+  args.rval().setString(canonical);
+  return true;
+}
diff --git a/js/src/builtin/intl/Locale.h b/js/src/builtin/intl/Locale.h
new file mode 100644
index 0000000000..31b3caca5c
--- /dev/null
+++ b/js/src/builtin/intl/Locale.h
@@ -0,0 +1,61 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef builtin_intl_Locale_h
+#define builtin_intl_Locale_h
+
+#include <stdint.h>
+
+#include "builtin/SelfHostingDefines.h"
+#include "js/Class.h"
+#include "vm/NativeObject.h"
+
+namespace js {
+
+class GlobalObject;
+
+// Native object type with three fixed slots. The accessors below read the
+// language tag and base name slots as strings and expose the Unicode
+// extension slot as a raw Value.
+// NOTE(review): the definition of class_ and the code that fills the slots
+// are not part of this header - confirm slot contents there.
+class LocaleObject : public NativeObject {
+ public:
+ static const Class class_;
+
+ // Fixed slot indices; SLOT_COUNT is the number of slots listed here.
+ static constexpr uint32_t LANGUAGE_TAG_SLOT = 0;
+ static constexpr uint32_t BASENAME_SLOT = 1;
+ static constexpr uint32_t UNICODE_EXTENSION_SLOT = 2;
+ static constexpr uint32_t SLOT_COUNT = 3;
+
+ /**
+ * Returns the complete language tag, including any extensions and privateuse
+ * subtags.
+ */
+ JSString* languageTag() const {
+ return getFixedSlot(LANGUAGE_TAG_SLOT).toString();
+ }
+
+ /**
+ * Returns the basename subtags, i.e. excluding any extensions and privateuse
+ * subtags.
+ */
+ JSString* baseName() const { return getFixedSlot(BASENAME_SLOT).toString(); }
+
+ // Returns the value stored in UNICODE_EXTENSION_SLOT without converting it.
+ // NOTE(review): which value kinds can appear here is not visible from this
+ // header - confirm against the code that writes the slot.
+ const Value& unicodeExtension() const {
+ return getFixedSlot(UNICODE_EXTENSION_SLOT);
+ }
+};
+
+// Creates the Locale constructor and its prototype, defines the constructor
+// as the |Locale| property of |Intl|, and returns the prototype object.
+// Returns nullptr on failure.
+extern JSObject* CreateLocalePrototype(JSContext* cx,
+ JS::Handle<JSObject*> Intl,
+ JS::Handle<GlobalObject*> global);
+
+// Native taking two arguments: a tag value and a boolean that controls
+// whether non-string values are converted with ToString. Returns the
+// canonicalized language tag string, or null for a non-string value when the
+// boolean is false. Fails when the tag cannot be parsed.
+extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx,
+ unsigned argc,
+ Value* vp);
+
+// Native taking one string argument. Returns the canonicalized language tag
+// string, or null when the string does not parse as a language tag; invalid
+// input is left to the caller to handle.
+extern MOZ_MUST_USE bool intl_TryValidateAndCanonicalizeLanguageTag(
+ JSContext* cx, unsigned argc, Value* vp);
+
+} // namespace js
+
+#endif /* builtin_intl_Locale_h */
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py
index f2a6b32082..670a46357b 100644
--- a/js/src/builtin/intl/make_intl_data.py
+++ b/js/src/builtin/intl/make_intl_data.py
@@ -6,14 +6,15 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
""" Usage:
- make_intl_data.py langtags [ldmlSupplemental.dtd supplementalMetadata.xml likelySubtags.xml]
+ make_intl_data.py langtags [cldr_core.zip]
make_intl_data.py tzdata
+ make_intl_data.py unicode-ext
Target "langtags":
- This script extracts information about mappings between deprecated and
- current Unicode BCP 47 locale identifiers from CLDR and converts it to
- JavaScript object definitions in LangTagMappingsGenerated.js. The
- definitions are used in Intl.js.
+ This script extracts information about 1) mappings between deprecated and
+ current Unicode BCP 47 locale identifiers, and 2) deprecated and current
+ BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
+ code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp.
Target "tzdata":
@@ -27,17 +28,23 @@ import os
import re
import io
import codecs
-import shutil
-import subprocess
import sys
import tarfile
import tempfile
import urllib2
-from contextlib import closing, contextmanager
+from contextlib import closing
from functools import partial
-from itertools import chain, ifilter, ifilterfalse, imap, tee
+from itertools import chain, ifilter, ifilterfalse, imap, izip_longest, groupby, tee
from operator import attrgetter, itemgetter
-from urlparse import urlsplit, urlunsplit
+from urlparse import urlsplit
+from zipfile import ZipFile
+
+# From https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+ """Collect data into fixed-length chunks or blocks.
+
+ Returns an iterator of n-tuples drawn in order from |iterable|. When the
+ input length is not a multiple of |n|, the last tuple is padded with
+ |fillvalue|.
+ """
+ # Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+ # The list holds the same iterator object n times, so izip_longest pulls n
+ # consecutive items for every tuple it produces.
+ args = [iter(iterable)] * n
+ return izip_longest(*args, fillvalue=fillvalue)
def writeMappingHeader(println, description, source, url):
if type(description) is not list:
@@ -57,40 +64,134 @@ def writeMappingsVar(println, mapping, name, description, source, url):
println(u"")
writeMappingHeader(println, description, source, url)
println(u"var {0} = {{".format(name))
- for key in sorted(mapping):
- if not isinstance(mapping[key], dict):
- value = mapping[key]
- if isinstance(value, bool):
- value = "true" if value else "false"
- else:
- value = '"{0}"'.format(value)
- else:
- preferred = mapping[key]["preferred"]
- prefix = mapping[key]["prefix"]
- if key != preferred:
- raise Exception("Expected '{0}' matches preferred locale '{1}'".format(key, preferred))
- value = '"{0}"'.format(prefix)
- println(u' "{0}": {1},'.format(key, value))
+ for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
+ println(u' "{0}": "{1}",'.format(key, value))
println(u"};")
-def writeUpdateLocaleIdMappingsFunction(println,
- complex_language_mappings,
- complex_region_mappings,
- description, source, url):
- """ Writes a function definition that performs language tag mapping. """
+def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, mappings,
+ tag_maxlength, description, source, url):
+ """ Emit code to perform a binary search on language tag subtags.
+
+ Uses the contents of |mapping|, which can either be a dictionary or set,
+ to emit a mapping function to find subtag replacements.
+ """
println(u"")
writeMappingHeader(println, description, source, url)
- println(u"""\
-/* eslint-disable complexity */
-function updateLocaleIdMappings(tag) {
- assert(IsObject(tag), "tag is an object");
+ println(u"""
+bool js::intl::LanguageTag::{0}({1} {2}) {{
+ MOZ_ASSERT({3}({2}.range()));
+""".format(fn_name, type_name, name, validate_fn).strip())
+
+ def write_array(subtags, name, length, fixed):
+ if fixed:
+ println(u" static const char {}[{}][{}] = {{".format(name, len(subtags),
+ length + 1))
+ else:
+ println(u" static const char* {}[{}] = {{".format(name, len(subtags)))
+
+ # Emit ten entries per line so the generated array stays within the 80 column limit.
+ for entries in grouper(subtags, 10):
+ entries = (u"\"{}\"".format(tag).rjust(length + 2)
+ for tag in entries if tag is not None)
+ println(u" {},".format(u", ".join(entries)))
+
+ println(u" };")
+
+ trailing_return = True
+
+ # Sort the subtags by length. That enables using an optimized comparator
+ # for the binary search, which only performs a single |memcmp| for multiple
+ # of two subtag lengths.
+ mappings_keys = mappings.keys() if type(mappings) == dict else mappings
+ for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
+ # Omit the length check if the current length is the maximum length.
+ if length != tag_maxlength:
+ println(u"""
+ if ({}.length() == {}) {{
+""".format(name, length).rstrip("\n"))
+ else:
+ trailing_return = False
+ println(u"""
+ {
+""".rstrip("\n"))
+
+ # The subtags need to be sorted for binary search to work.
+ subtags = sorted(subtags)
+
+ def equals(subtag):
+ return u"""{}.equalTo("{}")""".format(name, subtag)
+
+ # Don't emit a binary search for short lists.
+ if len(subtags) == 1:
+ if type(mappings) == dict:
+ println(u"""
+ if ({}) {{
+ {}.set("{}");
+ return true;
+ }}
+ return false;
+""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n"))
+ else:
+ println(u"""
+ return {};
+""".format(equals(subtags[0])).strip("\n"))
+ elif len(subtags) <= 4:
+ if type(mappings) == dict:
+ for subtag in subtags:
+ println(u"""
+ if ({}) {{
+ {}.set("{}");
+ return true;
+ }}
+""".format(equals(subtag), name, mappings[subtag]).strip("\n"))
+
+ println(u"""
+ return false;
+""".strip("\n"))
+ else:
+ cond = (equals(subtag) for subtag in subtags)
+ cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond)
+ println(u"""
+ return {};
+""".format(cond).strip("\n"))
+ else:
+ write_array(subtags, name + "s", length, True)
+
+ if type(mappings) == dict:
+ write_array([mappings[k] for k in subtags], u"aliases", length, False)
+
+ println(u"""
+ if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
+ {0}.set(ConstCharRange(replacement, strlen(replacement)));
+ return true;
+ }}
+ return false;
+""".format(name).rstrip())
+ else:
+ println(u"""
+ return HasReplacement({0}s, {0});
+""".format(name).rstrip())
+
+ println(u"""
+ }
+""".strip("\n"))
+
+ if trailing_return:
+ println(u"""
+ return false;""")
- // Replace deprecated language tags with their preferred values.
- var language = tag.language;
- if (hasOwn(language, languageMappings)) {
- tag.language = languageMappings[language];
- } else if (hasOwn(language, complexLanguageMappings)) {
- switch (language) {""")
+ println(u"""
+}""".lstrip("\n"))
+
+
+def writeComplexLanguageTagMappings(println, complex_language_mappings,
+ description, source, url):
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+void js::intl::LanguageTag::performComplexLanguageMappings() {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
+""".lstrip())
# Merge duplicate language entries.
language_aliases = {}
@@ -103,6 +204,7 @@ function updateLocaleIdMappings(tag) {
else:
language_aliases[key].append(deprecated_language)
+ first_language = True
for (deprecated_language, (language, script, region)) in (
sorted(complex_language_mappings.items(), key=itemgetter(0))
):
@@ -110,43 +212,46 @@ function updateLocaleIdMappings(tag) {
if deprecated_language in language_aliases[key]:
continue
- for lang in [deprecated_language] + language_aliases[key]:
- println(u"""
- case "{}":
- """.format(lang).rstrip().strip("\n"))
+ if_kind = u"if" if first_language else u"else if"
+ first_language = False
+
+ cond = (u"language().equalTo(\"{}\")".format(lang)
+ for lang in [deprecated_language] + language_aliases[key])
+ cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
+
+ println(u"""
+ {} ({}) {{""".format(if_kind, cond).strip("\n"))
println(u"""
- tag.language = "{}";
- """.format(language).rstrip().strip("\n"))
+ setLanguage("{}");""".format(language).strip("\n"))
+
if script is not None:
println(u"""
- if (tag.script === undefined)
- tag.script = "{}";
- """.format(script).rstrip().strip("\n"))
+ if (script().length() == 0) {{
+ setScript("{}");
+ }}""".format(script).strip("\n"))
if region is not None:
println(u"""
- if (tag.region === undefined)
- tag.region = "{}";
- """.format(region).rstrip().strip("\n"))
+ if (region().length() == 0) {{
+ setRegion("{}");
+ }}""".format(region).strip("\n"))
println(u"""
- break;
- """.rstrip().strip("\n"))
+ }""".strip("\n"))
println(u"""
- default:
- assert(false, "language not handled: " + language);
- }
- }
+}
+""".strip("\n"))
- // No script replacements are currently present.
- // Replace deprecated subtags with their preferred values.
- var region = tag.region;
- if (region !== undefined) {
- if (hasOwn(region, regionMappings)) {
- tag.region = regionMappings[region];
- } else if (hasOwn(region, complexRegionMappings)) {
- switch (region) {""".lstrip("\n"))
+def writeComplexRegionTagMappings(println, complex_region_mappings,
+ description, source, url):
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+void js::intl::LanguageTag::performComplexRegionMappings() {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
+ MOZ_ASSERT(IsStructurallyValidRegionTag(region().range()));
+""".lstrip())
# |non_default_replacements| is a list and hence not hashable. Convert it
# to a string to get a proper hashable value.
@@ -164,6 +269,7 @@ function updateLocaleIdMappings(tag) {
else:
region_aliases[key].append(deprecated_region)
+ first_region = True
for (deprecated_region, (default, non_default_replacements)) in (
sorted(complex_region_mappings.items(), key=itemgetter(0))
):
@@ -171,91 +277,100 @@ function updateLocaleIdMappings(tag) {
if deprecated_region in region_aliases[key]:
continue
- for region in [deprecated_region] + region_aliases[key]:
- println(u"""
- case "{}":
- """.format(region).rstrip().strip("\n"))
+ if_kind = u"if" if first_region else u"else if"
+ first_region = False
- for (language, script, region) in sorted(non_default_replacements, key=itemgetter(0)):
- if script is None:
- println(u"""
- if (tag.language === "{}") {{
- """.format(language).rstrip().strip("\n"))
- else:
- println(u"""
- if (tag.language === "{}" && tag.script === "{}") {{
- """.format(language, script).rstrip().strip("\n"))
- println(u"""
- tag.region = "{}";
- break;
- }}
- """.format(region).rstrip().strip("\n"))
+ cond = (u"region().equalTo(\"{}\")".format(region)
+ for region in [deprecated_region] + region_aliases[key])
+ cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
println(u"""
- tag.region = "{}";
- break;
- """.format(default).rstrip().strip("\n"))
+ {} ({}) {{""".format(if_kind, cond).strip("\n"))
- println(u"""
- default:
- assert(false, "region not handled: " + region);
- }
- }
+ replacement_regions = sorted({region for (_, _, region) in non_default_replacements})
- // No variant replacements are currently present.
- // No extension replacements are currently present.
- // Private use sequences are left as is.
+ first_case = True
+ for replacement_region in replacement_regions:
+ replacement_language_script = sorted(((language, script)
+ for (language, script, region) in (
+ non_default_replacements
+ )
+ if region == replacement_region),
+ key=itemgetter(0))
- }
+ if_kind = u"if" if first_case else u"else if"
+ first_case = False
+
+ def compare_tags(language, script):
+ if script is None:
+ return u"language().equalTo(\"{}\")".format(language)
+ return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format(
+ language, script)
+
+ cond = (compare_tags(language, script)
+ for (language, script) in replacement_language_script)
+ cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond)
+
+ println(u"""
+ {} ({}) {{
+ setRegion("{}");
+ }}""".format(if_kind, cond, replacement_region).rstrip().strip("\n"))
+
+ println(u"""
+ else {{
+ setRegion("{}");
+ }}
+ }}""".format(default).rstrip().strip("\n"))
+
+ println(u"""
}
-/* eslint-enable complexity */
""".strip("\n"))
-def writeGrandfatheredMappingsFunction(println,
- grandfathered_mappings,
+def writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
description, source, url):
""" Writes a function definition that maps grandfathered language tags. """
println(u"")
writeMappingHeader(println, description, source, url)
println(u"""\
-function updateGrandfatheredMappings(tag) {
- assert(IsObject(tag), "tag is an object");
-
- // We're mapping regular grandfathered tags to non-grandfathered form here.
- // Other tags remain unchanged.
- //
- // regular = "art-lojban"
- // / "cel-gaulish"
- // / "no-bok"
- // / "no-nyn"
- // / "zh-guoyu"
- // / "zh-hakka"
- // / "zh-min"
- // / "zh-min-nan"
- // / "zh-xiang"
- //
- // Therefore we can quickly exclude most tags by checking every
- // |unicode_locale_id| subcomponent for characteristics not shared by any of
- // the regular grandfathered (RG) tags:
- //
- // * Real-world |unicode_language_subtag|s are all two or three letters,
- // so don't waste time running a useless |language.length > 3| fast-path.
- // * No RG tag has a "script"-looking component.
- // * No RG tag has a "region"-looking component.
- // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
- // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
- // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
- // that |unicode_locale_id| doesn't support.)
- // * No RG tag contains |extensions| or |pu_extensions|.
- if (tag.script !== undefined ||
- tag.region !== undefined ||
- tag.variants.length !== 1 ||
- tag.extensions.length !== 0 ||
- tag.privateuse !== undefined)
- {
- return;
- }""")
+bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
+ // We're mapping regular grandfathered tags to non-grandfathered form here.
+ // Other tags remain unchanged.
+ //
+ // regular = "art-lojban"
+ // / "cel-gaulish"
+ // / "no-bok"
+ // / "no-nyn"
+ // / "zh-guoyu"
+ // / "zh-hakka"
+ // / "zh-min"
+ // / "zh-min-nan"
+ // / "zh-xiang"
+ //
+ // Therefore we can quickly exclude most tags by checking every
+ // |unicode_locale_id| subcomponent for characteristics not shared by any of
+ // the regular grandfathered (RG) tags:
+ //
+ // * Real-world |unicode_language_subtag|s are all two or three letters,
+ // so don't waste time running a useless |language.length > 3| fast-path.
+ // * No RG tag has a "script"-looking component.
+ // * No RG tag has a "region"-looking component.
+ // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
+ // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
+ // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
+ // that |unicode_locale_id| doesn't support.)
+ // * No RG tag contains |extensions| or |pu_extensions|.
+ if (script().length() != 0 ||
+ region().length() != 0 ||
+ variants().length() != 1 ||
+ extensions().length() != 0 ||
+ privateuse()) {
+ return true;
+ }
+
+ auto variantEqualTo = [this](const char* variant) {
+ return strcmp(variants()[0].get(), variant) == 0;
+ };""")
# From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
#
@@ -316,60 +431,57 @@ function updateGrandfatheredMappings(tag) {
modern_privateuse = modern_match.group("privateuse")
println(u"""
- // {} -> {}
+ // {} -> {}
""".format(tag, modern).rstrip())
println(u"""
- {}if (tag.language === "{}" && tag.variants[0] === "{}") {{
- """.format("" if is_first else "else ", tag_language, tag_variant).rstrip().strip("\n"))
+ {}if (language().equalTo("{}") && variantEqualTo("{}")) {{
+ """.format("" if is_first else "else ",
+ tag_language,
+ tag_variant).rstrip().strip("\n"))
is_first = False
println(u"""
- tag.language = "{}";
+ setLanguage("{}");
""".format(modern_language).rstrip().strip("\n"))
if modern_script is not None:
println(u"""
- tag.script = "{}";
- """.format(modern_script).rstrip().strip("\n"))
+ setScript("{}");
+ """.format(modern_script).rstrip().strip("\n"))
if modern_region is not None:
println(u"""
- tag.region = "{}";
- """.format(modern_region).rstrip().strip("\n"))
+ setRegion("{}");
+ """.format(modern_region).rstrip().strip("\n"))
- if modern_variants is not None:
- println(u"""
- tag.variants = {};
- """.format(sorted(modern_variants[1:].split("-"))).rstrip().strip("\n"))
- else:
- println(u"""
- tag.variants.length = 0;
+ assert modern_variants is None, (
+ "all regular grandfathered tags' modern forms do not contain variant subtags")
+
+ println(u"""
+ clearVariants();
""".rstrip().strip("\n"))
if modern_privateuse is not None:
println(u"""
- tag.privateuse = "{}";
- """.format(modern_privateuse).rstrip().strip("\n"))
+ auto privateuse = DuplicateString(cx, "{}");
+ if (!privateuse) {{
+ return false;
+ }}
+ setPrivateuse(std::move(privateuse));
+ """.format(modern_privateuse).rstrip().rstrip("\n"))
println(u"""
- }""".rstrip().strip("\n"))
+ return true;
+ }""".rstrip().strip("\n"))
println(u"""
-}""".lstrip("\n"))
-
-
-@contextmanager
-def TemporaryDirectory():
- tmpDir = tempfile.mkdtemp()
- try:
- yield tmpDir
- finally:
- shutil.rmtree(tmpDir)
+ return true;
+}""")
-def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, likely_subtags_file):
+def readSupplementalData(core_file):
""" Reads CLDR Supplemental Data and extracts information for Intl.js.
Information extracted:
@@ -379,19 +491,11 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like
- complexLanguageMappings: mappings from language subtags with complex rules
- regionMappings: mappings from region subtags to preferred subtags
- complexRegionMappings: mappings from region subtags with complex rules
- Returns these five mappings as dictionaries.
+ - likelySubtags: likely subtags used for generating test data only
+ Returns these mappings as dictionaries.
"""
import xml.etree.ElementTree as ET
- # <!ATTLIST version cldrVersion CDATA #FIXED "36" >
- re_cldr_version = re.compile(
- r"""<!ATTLIST version cldrVersion CDATA #FIXED "(?P<version>[\d|\.]+)" >""")
-
- with io.open(supplemental_dtd_file, mode="r", encoding="utf-8") as f:
- version_match = re_cldr_version.search(f.read())
- assert version_match is not None, "CLDR version string not found"
- cldr_version = version_match.group("version")
-
# From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
re_unicode_language_id = re.compile(
r"""
@@ -497,7 +601,7 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like
script.title() if script else None,
region.upper() if region else None)
- tree = ET.parse(supplemental_metadata_file)
+ tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
for language_alias in tree.iterfind(".//languageAlias"):
type = bcp47_id(language_alias.get("type"))
@@ -547,7 +651,7 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like
), "{} invalid region subtags".format(replacement)
complex_region_mappings[type] = replacements
- tree = ET.parse(likely_subtags_file)
+ tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
likely_subtags = {}
@@ -608,133 +712,441 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like
else:
region_mappings[deprecated_region] = default
- return {"version": cldr_version,
- "grandfatheredMappings": grandfathered_mappings,
+ return {"grandfatheredMappings": grandfathered_mappings,
"languageMappings": language_mappings,
"complexLanguageMappings": complex_language_mappings,
"regionMappings": region_mappings,
"complexRegionMappings": complex_region_mappings_final,
+ "likelySubtags": likely_subtags,
}
+def readUnicodeExtensions(core_file):
+ import xml.etree.ElementTree as ET
+
+ # Match all xml-files in the BCP 47 directory.
+ bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
+
+ # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
+ #
+ # type = alphanum{3,8} (sep alphanum{3,8})* ;
+ typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
+
+ # Mapping from Unicode extension types to dict of deprecated to
+ # preferred values.
+ mapping = {}
+
+ def readBCP47File(file):
+ tree = ET.parse(file)
+ for keyword in tree.iterfind(".//keyword/key"):
+ # Skip over keywords whose extension is not "u".
+ if keyword.get("extension", "u") != "u":
+ continue
+
+ extension_name = keyword.get("name")
+
+ for type in keyword.iterfind("type"):
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The key or type name used by Unicode locale extension with 'u' extension
+ # syntax or the 't' extensions syntax. When alias below is absent, this name
+ # can be also used with the old style "@key=type" syntax.
+ name = type.get("name")
+
+ # Ignore the special name:
+ # - <https://unicode.org/reports/tr35/#CODEPOINTS>
+ # - <https://unicode.org/reports/tr35/#REORDER_CODE>
+ # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
+ # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
+ # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
+ if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE",
+ "PRIVATE_USE"):
+ continue
+
+ # All other names should match the 'type' production.
+ assert typeRE.match(name) is not None, (
+ "{} matches the 'type' production".format(name))
+
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The preferred value of the deprecated key, type or attribute element.
+ # When a key, type or attribute element is deprecated, this attribute is
+ # used for specifying a new canonical form if available.
+ preferred = type.get("preferred")
+
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The BCP 47 form is the canonical form, and recommended. Other aliases are
+ # included only for backwards compatibility.
+ alias = type.get("alias")
+
+ # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
+ #
+ # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
+ # canonical forms. See Section 3.6.4 U Extension Data Files and Section
+ # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
+ # value, while the canonical is in the name attribute value.
+
+ # 'preferred' contains the new preferred name, 'alias' the compatibility
+ # name, but then there's this entry where 'preferred' and 'alias' are the
+ # same. So which one to choose? Assume 'preferred' is the actual canonical
+ # name.
+ #
+ # <type name="islamicc"
+ # description="Civil (algorithmic) Arabic calendar"
+ # deprecated="true"
+ # preferred="islamic-civil"
+ # alias="islamic-civil"/>
+
+ if preferred is not None:
+ assert typeRE.match(preferred), preferred
+ mapping.setdefault(extension_name, {})[name] = preferred
+
+ if alias is not None:
+ for alias_name in alias.lower().split(" "):
+ # Ignore alias entries which don't match the 'type' production.
+ if typeRE.match(alias_name) is None:
+ continue
+
+ # See comment above when 'alias' and 'preferred' are both present.
+ if (preferred is not None and
+ name in mapping[extension_name]):
+ continue
+
+ # Skip over entries where 'name' and 'alias' are equal.
+ #
+ # <type name="pst8pdt"
+ # description="POSIX style time zone for US Pacific Time"
+ # alias="PST8PDT"
+ # since="1.8"/>
+ if name == alias_name:
+ continue
+
+ mapping.setdefault(extension_name, {})[alias_name] = name
+
+ def readSupplementalMetadata(file):
+ # Find subdivision and region replacements.
+ #
+ # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
+ #
+ # Replace aliases in special key values:
+ # - If there is an 'sd' or 'rg' key, replace any subdivision alias
+ # in its value in the same way, using subdivisionAlias data.
+ tree = ET.parse(file)
+ for alias in tree.iterfind(".//subdivisionAlias"):
+ type = alias.get("type")
+ assert typeRE.match(type) is not None, (
+ "{} matches the 'type' production".format(type))
+
+ # Take the first replacement when multiple ones are present.
+ replacement = alias.get("replacement").split(" ")[0].lower()
+
+ # Skip over invalid replacements.
+ #
+ # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
+ #
+ # It's not entirely clear to me if CLDR actually wants to use
+ # "axzzzz" as the replacement for this case.
+ if typeRE.match(replacement) is None:
+ continue
+
+ # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
+ mapping.setdefault("rg", {})[type] = replacement
+ mapping.setdefault("sd", {})[type] = replacement
+
+ for name in core_file.namelist():
+ if bcpFileRE.match(name):
+ readBCP47File(core_file.open(name))
+
+ readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml"))
+
+ return mapping
+
def writeCLDRLanguageTagData(println, data, url):
""" Writes the language tag data to the Intl data file. """
+ println(generatedFileWarning)
+ println(u"// Version: CLDR-{}".format(data["version"]))
+ println(u"// URL: {}".format(url))
+
+ println(u"""
+#include "mozilla/Assertions.h"
+#include "mozilla/Range.h"
+#include "mozilla/TextUtils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+
+#include "jscntxt.h"
+#include "jsstr.h"
+
+#include "builtin/intl/LanguageTag.h"
+
+using namespace js::intl::LanguageTagLimits;
+using ConstCharRange = mozilla::Range<const char>;
+
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline bool HasReplacement(
+ const char (&subtags)[Length][TagLength],
+ const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+ MOZ_ASSERT(subtag.length() == TagLength - 1,
+ "subtag must have the same length as the list of subtags");
+
+ const char* ptr = subtag.range().begin().get();
+ return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
+ [](const char* a, const char* b) {
+ return memcmp(a, b, TagLength - 1) < 0;
+ });
+}
+
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline const char* SearchReplacement(
+ const char (&subtags)[Length][TagLength],
+ const char* (&aliases)[Length],
+ const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+ MOZ_ASSERT(subtag.length() == TagLength - 1,
+ "subtag must have the same length as the list of subtags");
+
+ const char* ptr = subtag.range().begin().get();
+ auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
+ [](const char* a, const char* b) {
+ return memcmp(a, b, TagLength - 1) < 0;
+ });
+ if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
+ return aliases[std::distance(std::begin(subtags), p)];
+ }
+ return nullptr;
+}
+""".rstrip())
+
source = u"CLDR Supplemental Data, version {}".format(data["version"])
grandfathered_mappings = data["grandfatheredMappings"]
language_mappings = data["languageMappings"]
complex_language_mappings = data["complexLanguageMappings"]
region_mappings = data["regionMappings"]
complex_region_mappings = data["complexRegionMappings"]
+ unicode_mappings = data["unicodeMappings"]
+
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+ language_maxlength = 8
+
+ # unicode_region_subtag = (alpha{2} | digit{3}) ;
+ region_maxlength = 3
+
+ writeMappingsBinarySearch(println, "languageMapping",
+ "LanguageSubtag&", "language",
+ "IsStructurallyValidLanguageTag",
+ language_mappings, language_maxlength,
+ "Mappings from language subtags to preferred values.", source, url)
+ writeMappingsBinarySearch(println, "complexLanguageMapping",
+ "const LanguageSubtag&", "language",
+ "IsStructurallyValidLanguageTag",
+ complex_language_mappings.keys(), language_maxlength,
+ "Language subtags with complex mappings.", source, url)
+ writeMappingsBinarySearch(println, "regionMapping",
+ "RegionSubtag&", "region",
+ "IsStructurallyValidRegionTag",
+ region_mappings, region_maxlength,
+ "Mappings from region subtags to preferred values.", source, url)
+ writeMappingsBinarySearch(println, "complexRegionMapping",
+ "const RegionSubtag&", "region",
+ "IsStructurallyValidRegionTag",
+ complex_region_mappings.keys(), region_maxlength,
+ "Region subtags with complex mappings.", source, url)
+
+ writeComplexLanguageTagMappings(println, complex_language_mappings,
+ "Language subtags with complex mappings.", source, url)
+ writeComplexRegionTagMappings(println, complex_region_mappings,
+ "Region subtags with complex mappings.", source, url)
- writeMappingsVar(println, grandfathered_mappings, "grandfatheredMappings",
- "Mappings from grandfathered tags to preferred values.", source, url)
- writeMappingsVar(println, language_mappings, "languageMappings",
- "Mappings from language subtags to preferred values.", source, url)
- writeMappingsVar(println, {key: True for key in complex_language_mappings},
- "complexLanguageMappings",
- "Language subtags with complex mappings.", source, url)
- writeMappingsVar(println, region_mappings, "regionMappings",
- "Mappings from region subtags to preferred values.", source, url)
- writeMappingsVar(println, {key: True for key in complex_region_mappings},
- "complexRegionMappings",
- "Region subtags with complex mappings.", source, url)
-
- writeUpdateLocaleIdMappingsFunction(println, complex_language_mappings,
- complex_region_mappings,
- "Canonicalize Unicode BCP 47 locale identifiers.",
- source, url)
writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
- "Canonicalize grandfathered locale identifiers.",
- source, url)
+ "Canonicalize grandfathered locale identifiers.", source,
+ url)
+
+ writeUnicodeExtensionsMappings(println, unicode_mappings)
+
+
+def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
+ """ Writes the likely-subtags test file. """
+
+ println(generatedFileWarning)
+
+ source = u"CLDR Supplemental Data, version {}".format(data["version"])
+ language_mappings = data["languageMappings"]
+ complex_language_mappings = data["complexLanguageMappings"]
+ region_mappings = data["regionMappings"]
+ complex_region_mappings = data["complexRegionMappings"]
+ likely_subtags = data["likelySubtags"]
+
+ def bcp47(tag):
+ (language, script, region) = tag
+ return "{}{}{}".format(language,
+ "-" + script if script else "",
+ "-" + region if region else "")
+
+ def canonical(tag):
+ (language, script, region) = tag
+
+ # Map deprecated language subtags.
+ if language in language_mappings:
+ language = language_mappings[language]
+ elif language in complex_language_mappings:
+ (language2, script2, region2) = complex_language_mappings[language]
+ (language, script, region) = (language2,
+ script if script else script2,
+ region if region else region2)
+
+ # Map deprecated region subtags.
+ if region in region_mappings:
+ region = region_mappings[region]
+ else:
+ # Assume no complex region mappings are needed for now.
+ assert region not in complex_region_mappings,\
+ "unexpected region with complex mappings: {}".format(region)
+
+ return (language, script, region)
+
+ # https://unicode.org/reports/tr35/#Likely_Subtags
+
+ def addLikelySubtags(tag):
+ # Step 1: Canonicalize.
+ (language, script, region) = canonical(tag)
+ if script == "Zzzz":
+ script = None
+ if region == "ZZ":
+ region = None
+
+ # Step 2: Lookup.
+ searches = ((language, script, region),
+ (language, None, region),
+ (language, script, None),
+ (language, None, None),
+ ("und", script, None))
+ search = next(search for search in searches if search in likely_subtags)
+
+ (language_s, script_s, region_s) = search
+ (language_m, script_m, region_m) = likely_subtags[search]
+
+ # Step 3: Return.
+ return (language if language != language_s else language_m,
+ script if script != script_s else script_m,
+ region if region != region_s else region_m)
+
+ # https://unicode.org/reports/tr35/#Likely_Subtags
+ def removeLikelySubtags(tag):
+ # Step 1: Add likely subtags.
+ max = addLikelySubtags(tag)
+
+ # Step 2: Remove variants (doesn't apply here).
+
+ # Step 3: Find a match.
+ (language, script, region) = max
+ for trial in ((language, None, None), (language, None, region), (language, script, None)):
+ if addLikelySubtags(trial) == max:
+ return trial
+
+ # Step 4: Return maximized if no match found.
+ return max
+
+ def likely_canonical(from_tag, to_tag):
+ # Canonicalize the input tag.
+ from_tag = canonical(from_tag)
+
+ # Update the expected result if necessary.
+ if from_tag in likely_subtags:
+ to_tag = likely_subtags[from_tag]
+
+ # Canonicalize the expected output.
+ to_canonical = canonical(to_tag)
+
+ # Sanity check: This should match the result of |addLikelySubtags|.
+ assert to_canonical == addLikelySubtags(from_tag)
+
+ return to_canonical
+
+ # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
+ likely_subtags_canonical = {k: likely_canonical(k, v) for (k, v) in likely_subtags.items()}
+
+ # Add test data for |Intl.Locale.prototype.maximize()|.
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
+ "maxLikelySubtags", "Extracted from likelySubtags.xml.", source, url)
+
+ # Use the maximized tags as the input for the remove likely-subtags test.
+ minimized = {tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()}
+
+ # Add test data for |Intl.Locale.prototype.minimize()|.
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
+ "minLikelySubtags", "Extracted from likelySubtags.xml.", source, url)
+
+ println(u"""
+for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
+ assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
+}""")
+
+ println(u"""
+for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
+ assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
+}""")
+
+ println(u"""
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);""")
def updateCLDRLangTags(args):
- """ Update the LangTagMappingsCLDRGenerated.js file. """
+ """ Update the LanguageTagGenerated.cpp file. """
+ version = args.version
url = args.url
- branch = args.branch
- revision = args.revision
out = args.out
- files = args.files
+ filename = args.file
+
+ url = url.replace("<VERSION>", version)
print("Arguments:")
+ print("\tCLDR version: %s" % version)
print("\tDownload url: %s" % url)
- print("\tBranch: %s" % branch)
- print("\tRevision: %s" % revision)
- print("\tLocal supplemental data and likely subtags: %s" % files)
+ if filename is not None:
+ print("\tLocal CLDR core.zip file: %s" % filename)
print("\tOutput file: %s" % out)
print("")
- if files:
- if len(files) != 3:
- raise Exception("Expected three files, but got: {}".format(files))
+ data = {
+ "version": version,
+ }
- print(("Always make sure you have the newest ldmlSupplemental.dtd, "
- "supplementalMetadata.xml, and likelySubtags.xml!"))
+ def readFiles(cldr_file):
+ with ZipFile(cldr_file) as zip_file:
+ data.update(readSupplementalData(zip_file))
+ data["unicodeMappings"] = readUnicodeExtensions(zip_file)
- supplemental_dtd_file = files[0]
- supplemental_metadata_file = files[1]
- likely_subtags_file = files[2]
+ print("Processing CLDR data...")
+ if filename is not None:
+ print("Always make sure you have the newest CLDR core.zip!")
+ with open(filename, "rb") as cldr_file:
+ readFiles(cldr_file)
else:
- print("Downloading CLDR supplemental data...")
-
- supplemental_dtd_filename = "ldmlSupplemental.dtd"
- supplemental_dtd_path = "common/dtd/{}".format(supplemental_dtd_filename)
- supplemental_dtd_file = os.path.join(os.getcwd(), supplemental_dtd_filename)
-
- supplemental_metadata_filename = "supplementalMetadata.xml"
- supplemental_metadata_path = "common/supplemental/{}".format(
- supplemental_metadata_filename)
- supplemental_metadata_file = os.path.join(os.getcwd(), supplemental_metadata_filename)
-
- likely_subtags_filename = "likelySubtags.xml"
- likely_subtags_path = "common/supplemental/{}".format(likely_subtags_filename)
- likely_subtags_file = os.path.join(os.getcwd(), likely_subtags_filename)
-
- # Try to download the raw file directly from GitHub if possible.
- split = urlsplit(url)
- if split.netloc == "github.com" and split.path.endswith(".git") and revision == "HEAD":
- def download(path, file):
- urlpath = "{}/raw/{}/{}".format(urlsplit(url).path[:-4], branch, path)
- raw_url = urlunsplit((split.scheme, split.netloc, urlpath, split.query,
- split.fragment))
-
- with closing(urllib2.urlopen(raw_url)) as reader:
- text = reader.read().decode("utf-8")
- with io.open(file, "w", encoding="utf-8") as saved_file:
- saved_file.write(text)
-
- download(supplemental_dtd_path, supplemental_dtd_file)
- download(supplemental_metadata_path, supplemental_metadata_file)
- download(likely_subtags_path, likely_subtags_file)
- else:
- # Download the requested branch in a temporary directory.
- with TemporaryDirectory() as inDir:
- if revision == "HEAD":
- subprocess.check_call(["git", "clone", "--depth=1",
- "--branch=%s" % branch, url, inDir])
- else:
- subprocess.check_call(["git", "clone", "--single-branch",
- "--branch=%s" % branch, url, inDir])
- subprocess.check_call(["git", "-C", inDir, "reset", "--hard", revision])
-
- shutil.copyfile(os.path.join(inDir, supplemental_dtd_path),
- supplemental_dtd_file)
- shutil.copyfile(os.path.join(inDir, supplemental_metadata_path),
- supplemental_metadata_file)
- shutil.copyfile(os.path.join(inDir, likely_subtags_path), likely_subtags_file)
-
- print("Processing CLDR supplemental data...")
- data = readSupplementalData(supplemental_dtd_file,
- supplemental_metadata_file,
- likely_subtags_file)
+ print("Downloading CLDR core.zip...")
+ with closing(urllib2.urlopen(url)) as cldr_file:
+ cldr_data = io.BytesIO(cldr_file.read())
+ readFiles(cldr_data)
print("Writing Intl data...")
with io.open(out, mode="w", encoding="utf-8", newline="") as f:
println = partial(print, file=f)
-
- println(u"// Generated by make_intl_data.py. DO NOT EDIT.")
writeCLDRLanguageTagData(println, data, url)
+ print("Writing Intl test data...")
+ test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ "../../tests/non262/Intl/Locale/likely-subtags-generated.js")
+ with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||"
+ u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))")
+ writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
+
def flines(filepath, encoding="utf-8"):
""" Open filepath and iterate over its content. """
@@ -1448,6 +1860,158 @@ def updateTzdata(topsrcdir, args):
else:
updateFrom(tzDir)
+def writeUnicodeExtensionsMappings(println, mapping):
+ println(u"""
+template <size_t Length>
+static inline bool IsUnicodeKey(const ConstCharRange& key,
+ const char (&str)[Length]) {
+ static_assert(Length == UnicodeKeyLength + 1,
+ "Unicode extension key is two characters long");
+ return memcmp(key.begin().get(), str, Length - 1) == 0;
+}
+
+template <size_t Length>
+static inline bool IsUnicodeType(const ConstCharRange& type,
+ const char (&str)[Length]) {
+ static_assert(Length > UnicodeKeyLength + 1,
+ "Unicode extension type contains more than two characters");
+ return type.length() == (Length - 1) &&
+ memcmp(type.begin().get(), str, Length - 1) == 0;
+}
+
+static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) {
+#ifdef DEBUG
+ auto isNull = [](char c) {
+ return c == '\\0';
+ };
+#endif
+
+ MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull),
+ "unexpected null-character in string");
+
+ using UnsignedChar = unsigned char;
+ for (size_t i = 0; i < b.length(); i++) {
+ // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
+ // we've reached the end of |a|, the below if-statement will always be true.
+ // That ensures we don't read past the end of |a|.
+ if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
+ return r;
+ }
+ }
+
+ // Return zero if both strings are equal or a negative number if |b| is a
+ // prefix of |a|.
+ return -int32_t(UnsignedChar(a[b.length()]));
+};
+
+template <size_t Length>
+static inline const char* SearchReplacement(const char* (&types)[Length],
+ const char* (&aliases)[Length],
+ const ConstCharRange& type) {
+
+ auto p = std::lower_bound(std::begin(types), std::end(types), type,
+ [](const auto& a, const auto& b) {
+ return CompareUnicodeType(a, b) < 0;
+ });
+ if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) {
+ return aliases[std::distance(std::begin(types), p)];
+ }
+ return nullptr;
+}
+
+/**
+ * Mapping from deprecated BCP 47 Unicode extension types to their preferred
+ * values.
+ *
+ * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ */
+const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
+ const ConstCharRange& key, const ConstCharRange& type) {
+#ifdef DEBUG
+ static auto isAsciiLowercaseAlphanumeric = [](char c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
+ };
+
+ static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
+ return isAsciiLowercaseAlphanumeric(c) || c == '-';
+ };
+#endif
+
+ MOZ_ASSERT(key.length() == UnicodeKeyLength);
+ MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(),
+ isAsciiLowercaseAlphanumeric));
+
+ MOZ_ASSERT(type.length() > UnicodeKeyLength);
+ MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(),
+ isAsciiLowercaseAlphanumericOrDash));
+""")
+
+ def to_hash_key(replacements):
+ return str(sorted(replacements.items()))
+
+ def write_array(subtags, name, length):
+ max_entries = (80 - len(" ")) // (length + len('"", '))
+
+ println(u" static const char* {}[{}] = {{".format(name, len(subtags)))
+
+ for entries in grouper(subtags, max_entries):
+ entries = (u"\"{}\"".format(tag).rjust(length + 2)
+ for tag in entries if tag is not None)
+ println(u" {},".format(u", ".join(entries)))
+
+ println(u" };")
+
+ # Merge duplicate keys.
+ key_aliases = {}
+ for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
+ hash_key = to_hash_key(replacements)
+ if hash_key not in key_aliases:
+ key_aliases[hash_key] = []
+ else:
+ key_aliases[hash_key].append(key)
+
+ first_key = True
+ for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
+ hash_key = to_hash_key(replacements)
+ if key in key_aliases[hash_key]:
+ continue
+
+ cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key])
+
+ if_kind = u"if" if first_key else u"else if"
+ cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
+ println(u"""
+ {} ({}) {{""".format(if_kind, cond).strip("\n"))
+ first_key = False
+
+ replacements = sorted(replacements.items(), key=itemgetter(0))
+
+ if len(replacements) > 4:
+ types = [t for (t, _) in replacements]
+ preferred = [r for (_, r) in replacements]
+ max_len = max(len(k) for k in types + preferred)
+
+ write_array(types, "types", max_len)
+ write_array(preferred, "aliases", max_len)
+ println(u"""
+ return SearchReplacement(types, aliases, type);
+""".strip("\n"))
+ else:
+ for (type, replacement) in replacements:
+ println(u"""
+ if (IsUnicodeType(type, "{}")) {{
+ return "{}";
+ }}""".format(type, replacement).strip("\n"))
+
+ println(u"""
+ }""".lstrip("\n"))
+
+ println(u"""
+ return nullptr;
+}
+""".strip("\n"))
+
+
if __name__ == "__main__":
import argparse
@@ -1468,21 +2032,21 @@ if __name__ == "__main__":
parser_cldr_tags = subparsers.add_parser("langtags",
help="Update CLDR language tags data")
+ parser_cldr_tags.add_argument("--version",
+ metavar="VERSION",
+ required=True,
+ help="CLDR version number")
parser_cldr_tags.add_argument("--url",
metavar="URL",
- default="https://github.com/unicode-org/cldr.git",
- help="URL to git repository (default: %(default)s)")
- parser_cldr_tags.add_argument("--branch", default="latest",
- help="Git branch (default: %(default)s)")
- parser_cldr_tags.add_argument("--revision", default="HEAD",
- help="Git revision (default: %(default)s)")
+ default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
+ type=EnsureHttps,
+ help="Download url CLDR data (default: %(default)s)")
parser_cldr_tags.add_argument("--out",
- default="LangTagMappingsGenerated.js",
+ default="LanguageTagGenerated.cpp",
help="Output file (default: %(default)s)")
- parser_cldr_tags.add_argument("files",
- nargs="*",
- help="Local ldmlSupplemental.dtd, supplementalMetadata.xml, "
- "and likelySubtags.xml files, if omitted uses <URL>")
+ parser_cldr_tags.add_argument("file",
+ nargs="?",
+ help="Local cldr-core.zip file, if omitted uses <URL>")
parser_cldr_tags.set_defaults(func=updateCLDRLangTags)
parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
diff --git a/js/src/moz.build b/js/src/moz.build
index 32102bde39..cecb7ae32d 100644
--- a/js/src/moz.build
+++ b/js/src/moz.build
@@ -118,6 +118,9 @@ main_deunified_sources = [
'builtin/intl/CommonFunctions.cpp',
'builtin/intl/DateTimeFormat.cpp',
'builtin/intl/IntlObject.cpp',
+ 'builtin/intl/LanguageTag.cpp',
+ 'builtin/intl/LanguageTagGenerated.cpp',
+ 'builtin/intl/Locale.cpp',
'builtin/intl/NumberFormat.cpp',
'builtin/intl/PluralRules.cpp',
'builtin/intl/RelativeTimeFormat.cpp',
@@ -709,7 +712,6 @@ selfhosted.inputs = [
'builtin/intl/CommonFunctions.js',
'builtin/intl/DateTimeFormat.js',
'builtin/intl/IntlObject.js',
- 'builtin/intl/LangTagMappingsGenerated.js',
'builtin/intl/NumberFormat.js',
'builtin/intl/PluralRules.js',
'builtin/intl/RelativeTimeFormat.js',
diff --git a/js/src/vm/CommonPropertyNames.h b/js/src/vm/CommonPropertyNames.h
index 1d398190ae..d5e7a2d058 100644
--- a/js/src/vm/CommonPropertyNames.h
+++ b/js/src/vm/CommonPropertyNames.h
@@ -51,6 +51,7 @@
macro(byteOffset, byteOffset, "byteOffset") \
macro(bytes, bytes, "bytes") \
macro(BYTES_PER_ELEMENT, BYTES_PER_ELEMENT, "BYTES_PER_ELEMENT") \
+ macro(calendar, calendar, "calendar") \
macro(call, call, "call") \
macro(callContentFunction, callContentFunction, "callContentFunction") \
macro(callee, callee, "callee") \
@@ -61,6 +62,7 @@
macro(catch, catch_, "catch") \
macro(class, class_, "class") \
macro(close, close, "close") \
+ macro(collation, collation, "collation") \
macro(Collator, Collator, "Collator") \
macro(CollatorCompareGet, CollatorCompareGet, "Intl_Collator_compare_get") \
macro(collections, collections, "collections") \
@@ -177,6 +179,7 @@
macro(hasOwn, hasOwn, "hasOwn") \
macro(hasOwnProperty, hasOwnProperty, "hasOwnProperty") \
macro(hour, hour, "hour") \
+ macro(hourCycle, hourCycle, "hourCycle") \
macro(if, if_, "if") \
macro(ignoreCase, ignoreCase, "ignoreCase") \
macro(ignorePunctuation, ignorePunctuation, "ignorePunctuation") \
@@ -190,6 +193,7 @@
macro(Infinity, Infinity, "Infinity") \
macro(InitializeCollator, InitializeCollator, "InitializeCollator") \
macro(InitializeDateTimeFormat, InitializeDateTimeFormat, "InitializeDateTimeFormat") \
+ macro(InitializeLocale, InitializeLocale, "InitializeLocale") \
macro(InitializeNumberFormat, InitializeNumberFormat, "InitializeNumberFormat") \
macro(InitializePluralRules, InitializePluralRules, "InitializePluralRules") \
macro(InitializeRelativeTimeFormat, InitializeRelativeTimeFormat, "InitializeRelativeTimeFormat") \
@@ -218,6 +222,7 @@
macro(js, js, "js") \
macro(keys, keys, "keys") \
macro(label, label, "label") \
+ macro(language, language, "language") \
macro(lastIndex, lastIndex, "lastIndex") \
macro(LegacyGeneratorCloseInternal, LegacyGeneratorCloseInternal, "LegacyGeneratorCloseInternal") \
macro(length, length, "length") \
@@ -226,6 +231,7 @@
macro(lineNumber, lineNumber, "lineNumber") \
macro(literal, literal, "literal") \
macro(loc, loc, "loc") \
+ macro(Locale, Locale, "Locale") \
macro(locale, locale, "locale") \
macro(lookupGetter, lookupGetter, "__lookupGetter__") \
macro(lookupSetter, lookupSetter, "__lookupSetter__") \
@@ -263,6 +269,7 @@
macro(noStack, noStack, "noStack") \
macro(notes, notes, "notes") \
macro(NumberFormat, NumberFormat, "NumberFormat") \
+ macro(numberingSystem, numberingSystem, "numberingSystem") \
macro(NumberFormatFormatGet, NumberFormatFormatGet, "Intl_NumberFormat_format_get") \
macro(numeric, numeric, "numeric") \
macro(objectArguments, objectArguments, "[object Arguments]") \
@@ -306,6 +313,7 @@
macro(reason, reason, "reason") \
macro(RegExpFlagsGetter, RegExpFlagsGetter, "RegExpFlagsGetter") \
macro(RegExpStringIterator, RegExpStringIterator, "RegExp String Iterator") \
+ macro(region, region, "region") \
macro(Reify, Reify, "Reify") \
macro(reject, reject, "reject") \
macro(rejected, rejected, "rejected") \
diff --git a/js/src/vm/GlobalObject.h b/js/src/vm/GlobalObject.h
index bf9255e85e..1e10fe5da3 100644
--- a/js/src/vm/GlobalObject.h
+++ b/js/src/vm/GlobalObject.h
@@ -113,6 +113,7 @@ class GlobalObject : public NativeObject
DATE_TIME_FORMAT_PROTO,
PLURAL_RULES_PROTO,
RELATIVE_TIME_FORMAT_PROTO,
+ LOCALE_PROTO,
MODULE_PROTO,
IMPORT_ENTRY_PROTO,
EXPORT_ENTRY_PROTO,
@@ -501,6 +502,11 @@ class GlobalObject : public NativeObject
return getOrCreateObject(cx, global, COLLATOR_PROTO, initIntlObject);
}
+ static JSObject*
+ getOrCreateLocalePrototype(JSContext* cx, Handle<GlobalObject*> global) {
+ return getOrCreateObject(cx, global, LOCALE_PROTO, initIntlObject);
+ }
+
static JSFunction*
getOrCreateNumberFormatConstructor(JSContext* cx, Handle<GlobalObject*> global) {
JSObject* obj = getOrCreateObject(cx, global, NUMBER_FORMAT, initIntlObject);
diff --git a/js/src/vm/SelfHosting.cpp b/js/src/vm/SelfHosting.cpp
index fff1baf630..cce912759d 100644
--- a/js/src/vm/SelfHosting.cpp
+++ b/js/src/vm/SelfHosting.cpp
@@ -25,6 +25,7 @@
#include "builtin/intl/Collator.h"
#include "builtin/intl/DateTimeFormat.h"
#include "builtin/intl/IntlObject.h"
+#include "builtin/intl/Locale.h"
#include "builtin/intl/NumberFormat.h"
#include "builtin/intl/PluralRules.h"
#include "builtin/intl/RelativeTimeFormat.h"
@@ -2486,6 +2487,8 @@ static const JSFunctionSpec intrinsic_functions[] = {
JS_FN("intl_SelectPluralRule", intl_SelectPluralRule, 2,0),
JS_FN("intl_toLocaleLowerCase", intl_toLocaleLowerCase, 2,0),
JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0),
+ JS_FN("intl_ValidateAndCanonicalizeLanguageTag", intl_ValidateAndCanonicalizeLanguageTag, 2, 0),
+ JS_FN("intl_TryValidateAndCanonicalizeLanguageTag", intl_TryValidateAndCanonicalizeLanguageTag, 1, 0),
JS_FN("intl_RelativeTimeFormat_availableLocales", intl_RelativeTimeFormat_availableLocales, 0,0),
JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0),
diff --git a/js/src/vm/String.h b/js/src/vm/String.h
index 4c43439cd9..5eaf9e0c2e 100644
--- a/js/src/vm/String.h
+++ b/js/src/vm/String.h
@@ -1131,6 +1131,20 @@ class StaticStrings
static bool isStatic(JSAtom* atom);
/* Return null if no static atom exists for the given (chars, length). */
+ MOZ_ALWAYS_INLINE JSAtom* lookup(const char* chars, size_t length) {
+ // Collapse calls for |const char*| into |const Latin1Char*| to avoid
+ // excess instantiations.
+ return lookup(reinterpret_cast<const Latin1Char*>(chars), length);
+ }
+
+ template <typename CharT,
+ typename = typename std::enable_if<!std::is_const<CharT>::value>::type>
+ MOZ_ALWAYS_INLINE JSAtom* lookup(CharT* chars, size_t length) {
+ // Collapse the remaining |CharT*| to |const CharT*| to avoid excess
+ // instantiations.
+ return lookup(const_cast<const CharT*>(chars), length);
+ }
+
template <typename CharT>
JSAtom* lookup(const CharT* chars, size_t length) {
switch (length) {
diff --git a/js/src/vm/StringBuffer.cpp b/js/src/vm/StringBuffer.cpp
index ec8592f951..e4f0e4f4d6 100644
--- a/js/src/vm/StringBuffer.cpp
+++ b/js/src/vm/StringBuffer.cpp
@@ -111,11 +111,17 @@ StringBuffer::finishString()
JS_STATIC_ASSERT(JSFatInlineString::MAX_LENGTH_LATIN1 < Latin1CharBuffer::InlineLength);
if (isLatin1()) {
+ if (JSAtom* staticStr = cx->staticStrings().lookup(latin1Chars().begin(), len))
+ return staticStr;
+
if (JSInlineString::lengthFits<Latin1Char>(len)) {
mozilla::Range<const Latin1Char> range(latin1Chars().begin(), len);
return NewInlineString<CanGC>(cx, range);
}
} else {
+ if (JSAtom* staticStr = cx->staticStrings().lookup(twoByteChars().begin(), len))
+ return staticStr;
+
if (JSInlineString::lengthFits<char16_t>(len)) {
mozilla::Range<const char16_t> range(twoByteChars().begin(), len);
return NewInlineString<CanGC>(cx, range);