diff options
author | Martok <martok@martoks-place.de> | 2023-06-29 23:07:51 +0200 |
---|---|---|
committer | Martok <martok@martoks-place.de> | 2023-06-30 00:01:35 +0200 |
commit | 1a9d6d6372fb1fc585e21af53ccfafd6f89eda73 (patch) | |
tree | f5780e47a59bbbf9408147ecbda630897bfac96d | |
parent | 2f940bdc9dcbfe83e17ed26c5d1af7fe874c24ac (diff) | |
download | uxp-1a9d6d6372fb1fc585e21af53ccfafd6f89eda73.tar.gz |
Issue #1819 - Implement Intl.Locale proposal
This follows the spec as of the mozilla71 cycle; a follow-up will adjust it further to the spec.
- Add Intl.Locale as native C++
- Port Unicode BCP 47 locale identifier parser to C++
- Port language tag parser to C++
- Adjust make_intl_data to generate the data
Based-on: m-c 1433303, 1570370
-rw-r--r-- | js/public/Class.h | 2 | ||||
-rw-r--r-- | js/src/builtin/String.js | 4 | ||||
-rw-r--r-- | js/src/builtin/intl/CommonFunctions.h | 4 | ||||
-rw-r--r-- | js/src/builtin/intl/CommonFunctions.js | 1102 | ||||
-rw-r--r-- | js/src/builtin/intl/IntlObject.cpp | 6 | ||||
-rw-r--r-- | js/src/builtin/intl/LangTagMappingsGenerated.js | 1246 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTag.cpp | 1677 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTag.h | 722 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTagGenerated.cpp | 790 | ||||
-rw-r--r-- | js/src/builtin/intl/Locale.cpp | 1372 | ||||
-rw-r--r-- | js/src/builtin/intl/Locale.h | 61 | ||||
-rw-r--r-- | js/src/builtin/intl/make_intl_data.py | 1118 | ||||
-rw-r--r-- | js/src/moz.build | 4 | ||||
-rw-r--r-- | js/src/vm/CommonPropertyNames.h | 8 | ||||
-rw-r--r-- | js/src/vm/GlobalObject.h | 6 | ||||
-rw-r--r-- | js/src/vm/SelfHosting.cpp | 3 | ||||
-rw-r--r-- | js/src/vm/String.h | 14 | ||||
-rw-r--r-- | js/src/vm/StringBuffer.cpp | 6 |
18 files changed, 5545 insertions, 2600 deletions
diff --git a/js/public/Class.h b/js/public/Class.h index 1c785646e3..f1d7739718 100644 --- a/js/public/Class.h +++ b/js/public/Class.h @@ -913,7 +913,7 @@ struct JSClass { // application. #define JSCLASS_GLOBAL_APPLICATION_SLOTS 5 #define JSCLASS_GLOBAL_SLOT_COUNT \ - (JSCLASS_GLOBAL_APPLICATION_SLOTS + JSProto_LIMIT * 2 + 49) + (JSCLASS_GLOBAL_APPLICATION_SLOTS + JSProto_LIMIT * 2 + 50) #define JSCLASS_GLOBAL_FLAGS_WITH_SLOTS(n) \ (JSCLASS_IS_GLOBAL | JSCLASS_HAS_RESERVED_SLOTS(JSCLASS_GLOBAL_SLOT_COUNT + (n))) #define JSCLASS_GLOBAL_FLAGS \ diff --git a/js/src/builtin/String.js b/js/src/builtin/String.js index 0fab35966a..e1c32482ae 100644 --- a/js/src/builtin/String.js +++ b/js/src/builtin/String.js @@ -752,7 +752,7 @@ function String_toLocaleLowerCase() { requestedLocale = undefined; } else if (typeof locales === "string") { // Steps 3, 5. - requestedLocale = ValidateAndCanonicalizeLanguageTag(locales); + requestedLocale = intl_ValidateAndCanonicalizeLanguageTag(locales, false); } else { // Step 3. var requestedLocales = CanonicalizeLocaleList(locales); @@ -793,7 +793,7 @@ function String_toLocaleUpperCase() { requestedLocale = undefined; } else if (typeof locales === "string") { // Steps 3, 5. - requestedLocale = ValidateAndCanonicalizeLanguageTag(locales); + requestedLocale = intl_ValidateAndCanonicalizeLanguageTag(locales, false); } else { // Step 3. var requestedLocales = CanonicalizeLocaleList(locales); diff --git a/js/src/builtin/intl/CommonFunctions.h b/js/src/builtin/intl/CommonFunctions.h index 256db49b18..12b4da4a72 100644 --- a/js/src/builtin/intl/CommonFunctions.h +++ b/js/src/builtin/intl/CommonFunctions.h @@ -89,9 +89,9 @@ static_assert(mozilla::IsSame<UChar, char16_t>::value, // buffer's entire inline capacity before growing it and heap-allocating.
static const size_t INITIAL_CHAR_BUFFER_SIZE = 32;
-template <typename ICUStringFunction, size_t InlineCapacity>
+template <typename ICUStringFunction, typename CharT, size_t InlineCapacity>
static int32_t
-CallICU(JSContext* cx, Vector<char16_t, InlineCapacity>& chars, const ICUStringFunction& strFn)
+CallICU(JSContext* cx, Vector<CharT, InlineCapacity>& chars, const ICUStringFunction& strFn)
{
MOZ_ASSERT(chars.length() == 0);
MOZ_ALWAYS_TRUE(chars.resize(InlineCapacity));
diff --git a/js/src/builtin/intl/CommonFunctions.js b/js/src/builtin/intl/CommonFunctions.js index 36b2bec9b2..9fad595979 100644 --- a/js/src/builtin/intl/CommonFunctions.js +++ b/js/src/builtin/intl/CommonFunctions.js @@ -13,6 +13,19 @@ function hasOwn(propName, object) { return callFunction(std_Object_hasOwnProperty, object, propName); } +#ifdef DEBUG +#define assertIsValidAndCanonicalLanguageTag(locale, desc) \ + do { \ + let canonical = intl_TryValidateAndCanonicalizeLanguageTag(locale); \ + assert(canonical !== null, \ + `${desc} is a structurally valid language tag`); \ + assert(canonical === locale, \ + `${desc} is a canonicalized language tag`); \ + } while (false) +#else +#define assertIsValidAndCanonicalLanguageTag(locale, desc) ; // Elided assertion. +#endif + /** * Returns the start index of a "Unicode locale extension sequence", which the * specification defines as: "any substring of a language tag that starts with @@ -46,8 +59,6 @@ function startOfUnicodeExtensions(locale) { */ function endOfUnicodeExtensions(locale, start) { assert(typeof locale === "string", "locale is a string"); - assert(IsStructurallyValidLanguageTag(locale), "locale is a language tag"); - assert(CanonicalizeLanguageTag(locale) === locale, "locale is a canonicalized language tag"); assert(0 <= start && start < locale.length, "start is an index into locale"); assert(Substring(locale, start, 3) === "-u-", "start points to Unicode extension sequence"); @@ -95,10 +106,9 @@ function removeUnicodeExtensions(locale) { var right = Substring(locale, end, locale.length - end); var combined = left + right; - assert(IsStructurallyValidLanguageTag(combined), - "recombination produced an invalid language tag"); + assertIsValidAndCanonicalLanguageTag(combined, "the recombined locale"); assert(startOfUnicodeExtensions(combined) < 0, - "recombination failed to remove all Unicode locale extension sequences"); + "recombination failed to remove all Unicode locale extension sequences"); return 
combined; } @@ -114,1000 +124,6 @@ function getUnicodeExtensions(locale) { return Substring(locale, start, end - start); } -// The three possible token type bits. Expressed as #defines to avoid -// extra named lookups in the interpreter/jits. -#define NONE 0b00 -#define ALPHA 0b01 -#define DIGIT 0b10 - -// Constants for code units used below. -#define HYPHEN 0x2D -#define DIGIT_ZERO 0x30 -#define DIGIT_NINE 0x39 -#define UPPER_A 0x41 -#define UPPER_Z 0x5A -#define LOWER_A 0x61 -#define LOWER_T 0x74 -#define LOWER_U 0x75 -#define LOWER_X 0x78 -#define LOWER_Z 0x7A - -// The requirement to use callFunction() for method calls makes the parser -// harder to read. Use macros for the rescue. - -// Reads the next token. -#define NEXT_TOKEN_OR_RETURN_NULL(ts) \ - if (!callFunction(ts.nextToken, ts)) \ - return null; - -#define NEXT_TOKEN_OR_ASSERT(ts) \ - if (!callFunction(ts.nextToken, ts)) \ - assert(false, "unexpected invalid subtag"); - -// Assigns the current subtag part transformed to lower-case to the target. -#define SUBTAG_VAR_OR_RETURN_NULL(ts, target) \ - { \ - target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \ - NEXT_TOKEN_OR_RETURN_NULL(ts); \ - } - -// Assigns the current subtag part transformed to lower-case to the target. -#define SUBTAG_VAR_OR_ASSERT(ts, target) \ - { \ - target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \ - NEXT_TOKEN_OR_ASSERT(ts) \ - } - -/** - * Tokenizer for Unicode BCP 47 locale identifiers. - */ -function BCP47TokenStream(locale) { - this.locale = locale; - - // Locale identifiers are compared and processed case-insensitively, so - // technically it's not necessary to adjust case. But for easier processing, - // and because the canonical form for most subtags is lower case, we start - // with lower case for all. - // - // Note that the tokenizer function keeps using the original input string - // to properly detect non-ASCII characters. 
The lower-case string can't be - // used to detect those characters, because some non-ASCII characters - // lower-case map into ASCII characters, e.g. U+212A (KELVIN SIGN) lower- - // case maps to U+006B (LATIN SMALL LETTER K). - this.localeLowercase = callFunction(std_String_toLowerCase, locale); - - // Current parse index in |locale|. - this.index = 0; - - // The current token type, its start index, and its length. - this.token = NONE; - this.tokenStart = 0; - this.tokenLength = 0; - - assert(std_String_fromCharCode(HYPHEN) === "-" && - std_String_fromCharCode(DIGIT_ZERO) === "0" && - std_String_fromCharCode(DIGIT_NINE) === "9" && - std_String_fromCharCode(UPPER_A) === "A" && - std_String_fromCharCode(UPPER_Z) === "Z" && - std_String_fromCharCode(LOWER_A) === "a" && - std_String_fromCharCode(LOWER_T) === "t" && - std_String_fromCharCode(LOWER_U) === "u" && - std_String_fromCharCode(LOWER_X) === "x" && - std_String_fromCharCode(LOWER_Z) === "z", - "code unit constants should match the expected characters"); -} - -MakeConstructible(BCP47TokenStream, { - __proto__: null, - - // Reads the next token, returns |false| if an illegal character was found, - // otherwise returns |true|. - // - // eslint-disable-next-line object-shorthand - nextToken: function() { - var type = NONE; - var {index, locale} = this; - for (var i = index; i < locale.length; i++) { - // UTS 35, section 3.1. - // alpha = [A-Z a-z] ; - // digit = [0-9] ; - var c = callFunction(std_String_charCodeAt, locale, i); - if ((UPPER_A <= c && c <= UPPER_Z) || (LOWER_A <= c && c <= LOWER_Z)) - type |= ALPHA; - else if (DIGIT_ZERO <= c && c <= DIGIT_NINE) - type |= DIGIT; - else if (c === HYPHEN && i > index && i + 1 < locale.length) - break; - else - return false; - } - - this.token = type; - this.tokenStart = index; - this.tokenLength = i - index; - this.index = i + 1; - return true; - }, - - // Returns true if the character at the requested index within the current - // token is a digit. 
- // - // eslint-disable-next-line object-shorthand - isDigitAt: function(index) { - assert(0 <= index && index < this.tokenLength, - "must be an index into the current token"); - var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart + index); - assert(!(c <= DIGIT_NINE) || c >= DIGIT_ZERO, - "token-start-code-unit <= '9' implies token-start-code-unit is in '0'..'9' " + - "and because all digits are sorted before any letters"); - return c <= DIGIT_NINE; - }, - - // Returns the code unit of the first character at the current token - // position. Always returns the lower-case form of an alphabetical - // character. - // - // eslint-disable-next-line object-shorthand - singletonKey: function() { - assert(this.tokenLength === 1, "token is not a singleton"); - var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart); - assert((DIGIT_ZERO <= c && c <= DIGIT_NINE) || (LOWER_A <= c && c <= LOWER_Z), - "unexpected code unit"); - return c; - }, - - // eslint-disable-next-line object-shorthand - singletonValue: function() { - var singletonStart = this.tokenStart; - var min = callFunction(this.singletonKey, this) === LOWER_X ? 1 : 2; - - NEXT_TOKEN_OR_RETURN_NULL(this); - - // At least one non-singleton subtag must be present. - if (!(min <= this.tokenLength && this.tokenLength <= 8)) - return null; - do { - NEXT_TOKEN_OR_RETURN_NULL(this); - } while (min <= this.tokenLength && this.tokenLength <= 8); - - return callFunction(this.singletonValueAt, this, singletonStart); - }, - - // eslint-disable-next-line object-shorthand - singletonValueAt: function(start) { - // Singletons must be followed by a non-singleton subtag, "en-a-b" is not allowed. - var length = this.tokenStart - 1 - start; - if (length <= 2) - return null; - return Substring(this.localeLowercase, start, length); - } -}); - -/* eslint-disable complexity */ -/** - * Parser for Unicode BCP 47 locale identifiers. 
- * - * Returns null if |locale| can't be parsed as a `unicode_locale_id`. If the - * input is a grandfathered language tag, it is directly canonicalized to its - * modern form. The returned object has the following structure: - * - * { - * language: `unicode_language_subtag`, - * script: `unicode_script_subtag` / undefined, - * region: `unicode_region_subtag` / undefined, - * variants: array of `unicode_variant_subtag`, - * extensions: array of `extensions`, - * privateuse: `pu_extensions` / undefined, - * } - * - * All locale identifier subtags are returned in their normalized case: - * - * var langtag = parseLanguageTag("en-latn-us"); - * assertEq("en", langtag.language); - * assertEq("Latn", langtag.script); - * assertEq("US", langtag.region); - * - * Spec: https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers - */ -function parseLanguageTag(locale) { - assert(typeof locale === "string", "locale is a string"); - - // unicode_locale_id = unicode_language_id - // extensions* - // pu_extensions? ; - var ts = new BCP47TokenStream(locale); - NEXT_TOKEN_OR_RETURN_NULL(ts); - - var language, script, region, privateuse; - var variants = []; - var extensions = []; - - // unicode_language_id = unicode_language_subtag - // (sep unicode_script_subtag)? - // (sep unicode_region_subtag)? - // (sep unicode_variant_subtag)* ; - // - // sep = "-" - // - // Note: Unicode CLDR locale identifier backward compatibility extensions - // removed from `unicode_language_id`. - - // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; - if (ts.token !== ALPHA || ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) { - // Four character language subtags are not allowed in Unicode BCP 47 - // locale identifiers. Also see the comparison to Unicode CLDR locale - // identifiers in <https://unicode.org/reports/tr35/#BCP_47_Conformance>. 
- return null; - } - assert((2 <= ts.tokenLength && ts.tokenLength <= 3) || - (5 <= ts.tokenLength && ts.tokenLength <= 8), - "language subtags have 2-3 or 5-8 letters"); - - SUBTAG_VAR_OR_RETURN_NULL(ts, language); - - // unicode_script_subtag = alpha{4} ; - if (ts.tokenLength === 4 && ts.token === ALPHA) { - SUBTAG_VAR_OR_RETURN_NULL(ts, script); - - // The first character of a script code needs to be capitalized. - // "hans" -> "Hans" - script = callFunction(std_String_toUpperCase, script[0]) + - Substring(script, 1, script.length - 1); - } - - // unicode_region_subtag = (alpha{2} | digit{3}) ; - if ((ts.tokenLength === 2 && ts.token === ALPHA) || - (ts.tokenLength === 3 && ts.token === DIGIT)) - { - SUBTAG_VAR_OR_RETURN_NULL(ts, region); - - // Region codes need to be in upper-case. "bu" -> "BU" - region = callFunction(std_String_toUpperCase, region); - } - - // unicode_variant_subtag = (alphanum{5,8} - // | digit alphanum{3}) ; - // - // alphanum = [0-9 A-Z a-z] ; - while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || - (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) - { - // Locale identifiers are case insensitive (UTS 35, section 3.2). - // All seen variants are compared ignoring case differences by - // using the lower-case form. This allows to properly detect and - // reject variant repetitions with differing case, e.g. - // "en-variant-Variant". - var variant; - SUBTAG_VAR_OR_RETURN_NULL(ts, variant); - - // Reject the Locale identifier if a duplicate variant was found. - // - // This linear-time verification step means the whole variant - // subtag checking is potentially quadratic, but we're okay doing - // that because language tags are unlikely to be deliberately - // pathological. 
- if (callFunction(ArrayIndexOf, variants, variant) !== -1) - return null; - _DefineDataProperty(variants, variants.length, variant); - } - - // extensions = unicode_locale_extensions - // | transformed_extensions - // | other_extensions ; - // - // unicode_locale_extensions = sep [uU] - // ((sep keyword)+ - // |(sep attribute)+ (sep keyword)*) ; - // - // transformed_extensions = sep [tT] - // ((sep tlang (sep tfield)*) - // |(sep tfield)+) ; - // - // other_extensions = [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; - // - // keyword = key (sep type)? ; - // - // key = alphanum alpha ; - // - // type = alphanum{3,8} (sep alphanum{3,8})* ; - // - // attribute = alphanum{3,8} ; - // - // tlang = unicode_language_subtag - // (sep unicode_script_subtag)? - // (sep unicode_region_subtag)? - // (sep unicode_variant_subtag)* ; - // - // tfield = tkey tvalue; - // - // tkey = alpha digit ; - // - // tvalue = (sep alphanum{3,8})+ ; - var seenSingletons = []; - while (ts.tokenLength === 1) { - var singleton = callFunction(ts.singletonKey, ts); - if (singleton === LOWER_X) - break; - - // Locale identifiers are case insensitive (UTS 35, section 3.2). - // Ensure |singletonKey()| does not return the code unit of an - // upper-case character, so we can properly detect and reject - // singletons with different case, e.g. "en-u-foo-U-foo". - assert(!(UPPER_A <= singleton && singleton <= UPPER_Z), - "unexpected upper-case code unit"); - - // Reject the input if a duplicate singleton was found. - // - // Similar to the variant validation step this check is O(n**2), - // but given that there are only 35 possible singletons the - // quadratic runtime is negligible. 
- if (callFunction(ArrayIndexOf, seenSingletons, singleton) !== -1) - return null; - _DefineDataProperty(seenSingletons, seenSingletons.length, singleton); - - var extension; - if (singleton === LOWER_U) { - var extensionStart = ts.tokenStart; - NEXT_TOKEN_OR_RETURN_NULL(ts); - - while (2 <= ts.tokenLength && ts.tokenLength <= 8) { - // `key` doesn't allow a digit as its second character. - if (ts.tokenLength === 2 && callFunction(ts.isDigitAt, ts, 1)) - return null; - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - extension = callFunction(ts.singletonValueAt, ts, extensionStart); - } else if (singleton === LOWER_T) { - var extensionStart = ts.tokenStart; - NEXT_TOKEN_OR_RETURN_NULL(ts); - - // `tfield` starts with `tkey`, which in turn is `alpha digit`, so - // an alpha-only token must be a `tlang`. - if (ts.token === ALPHA) { - // `unicode_language_subtag` - if (ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) - return null; - NEXT_TOKEN_OR_RETURN_NULL(ts); - - // `unicode_script_subtag` (optional) - if (ts.tokenLength === 4 && ts.token === ALPHA) { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - - // `unicode_region_subtag` (optional) - if ((ts.tokenLength === 2 && ts.token === ALPHA) || - (ts.tokenLength === 3 && ts.token === DIGIT)) - { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - - // `unicode_variant_subtag` (optional) - while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || - (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) - { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - } - - // Trailing `tfield` subtags. - while (ts.tokenLength === 2) { - // `tkey` is `alpha digit`. - if (callFunction(ts.isDigitAt, ts, 0) || - !callFunction(ts.isDigitAt, ts, 1)) - { - return null; - } - NEXT_TOKEN_OR_RETURN_NULL(ts); - - // `tfield` requires at least one `tvalue`. 
- if (!(3 <= ts.tokenLength && ts.tokenLength <= 8)) - return null; - do { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } while (3 <= ts.tokenLength && ts.tokenLength <= 8); - } - extension = callFunction(ts.singletonValueAt, ts, extensionStart); - } else { - extension = callFunction(ts.singletonValue, ts); - } - if (!extension) - return null; - - _DefineDataProperty(extensions, extensions.length, extension); - } - - // Trailing pu_extensions component of the unicode_locale_id production. - // - // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; - if (ts.tokenLength === 1 && callFunction(ts.singletonKey, ts) === LOWER_X) { - privateuse = callFunction(ts.singletonValue, ts); - if (!privateuse) - return null; - } - - // Reject the input if it couldn't be parsed completely. - if (ts.token !== NONE) - return null; - - var tagObj = { - language, - script, - region, - variants, - extensions, - privateuse, - }; - - // Handle grandfathered tags right away, so we don't need to have extra - // paths for grandfathered tags later on. - // - // grandfathered = "art-lojban" ; non-redundant tags registered - // / "cel-gaulish" ; during the RFC 3066 era - // / "zh-guoyu" ; these tags match the 'langtag' - // / "zh-hakka" ; production, but their subtags - // / "zh-xiang" ; are not extended language - // ; or variant subtags: their meaning - // ; is defined by their registration - // ; and all of these are deprecated - // ; in favor of a more modern - // ; subtag or sequence of subtags - if (hasOwn(ts.localeLowercase, grandfatheredMappings)) - updateGrandfatheredMappings(tagObj); - - // Return if the complete input was successfully parsed. - return tagObj; -} - -/** - * Return the locale and fields components of the given valid Transform - * extension subtag. 
- */ -function TransformExtensionComponents(extension) { - assert(typeof extension === "string", "extension is a String value"); - assert(callFunction(std_String_startsWith, extension, "t-"), - "extension starts with 't-'"); - - var ts = new BCP47TokenStream(Substring(extension, 2, extension.length - 2)); - NEXT_TOKEN_OR_ASSERT(ts); - - // `tfield` starts with `tkey`, which in turn is `alpha digit`, so - // an alpha-only token must be a `tlang`. - var localeObj; - if (ts.token === ALPHA) { - // `unicode_language_subtag` - assert((2 <= ts.tokenLength && ts.tokenLength <= 3) || - (5 <= ts.tokenLength && ts.tokenLength <= 8), - "language subtags have 2-3 or 5-8 letters"); - - var language; - SUBTAG_VAR_OR_ASSERT(ts, language); - - // unicode_script_subtag = alpha{4} ; - var script; - if (ts.tokenLength === 4 && ts.token === ALPHA) { - SUBTAG_VAR_OR_ASSERT(ts, script); - - // The first character of a script code needs to be capitalized. - // "hans" -> "Hans" - script = callFunction(std_String_toUpperCase, script[0]) + - Substring(script, 1, script.length - 1); - } - - // unicode_region_subtag = (alpha{2} | digit{3}) ; - var region; - if ((ts.tokenLength === 2 && ts.token === ALPHA) || - (ts.tokenLength === 3 && ts.token === DIGIT)) - { - SUBTAG_VAR_OR_ASSERT(ts, region); - - // Region codes need to be in upper-case. "bu" -> "BU" - region = callFunction(std_String_toUpperCase, region); - } - - // unicode_variant_subtag = (alphanum{5,8} - // | digit alphanum{3}) ; - // - // alphanum = [0-9 A-Z a-z] ; - var variants = []; - while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || - (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) - { - var variant; - SUBTAG_VAR_OR_ASSERT(ts, variant); - - _DefineDataProperty(variants, variants.length, variant); - } - - localeObj = { - language, - script, - region, - variants, - extensions: [], - privateuse: undefined, - }; - } - - // Trailing `tfield` subtags. 
(Any other trailing subtags are an error, - // because we're guaranteed to only see a valid tranform extension here.) - var fields = []; - while (ts.tokenLength === 2) { - // `tkey` is `alpha digit`. - assert(!callFunction(ts.isDigitAt, ts, 0) && callFunction(ts.isDigitAt, ts, 1), - "unexpected invalid tkey subtag"); - - var key; - SUBTAG_VAR_OR_ASSERT(ts, key); - - // `tfield` requires at least one `tvalue`. - assert(3 <= ts.tokenLength && ts.tokenLength <= 8, - "unexpected invalid tvalue subtag"); - - var value; - SUBTAG_VAR_OR_ASSERT(ts, value); - - while (3 <= ts.tokenLength && ts.tokenLength <= 8) { - var part; - SUBTAG_VAR_OR_ASSERT(ts, part); - value += "-" + part; - } - - _DefineDataProperty(fields, fields.length, {key, value}); - } - - assert(ts.token === NONE, - "unexpected trailing characters in promised-to-be-valid transform extension"); - - return {locale: localeObj, fields}; -} -/* eslint-enable complexity */ - -#undef NONE -#undef ALPHA -#undef DIGIT - -#undef HYPHEN -#undef DIGIT_ZERO -#undef DIGIT_NINE -#undef UPPER_A -#undef UPPER_Z -#undef LOWER_A -#undef LOWER_T -#undef LOWER_U -#undef LOWER_X -#undef LOWER_Z - -#undef SUBTAG_VAR_OR_ASSERT -#undef SUBTAG_VAR_OR_RETURN_NULL -#undef NEXT_TOKEN_OR_ASSERT -#undef NEXT_TOKEN_OR_RETURN_NULL - -/** - * Verifies that the given string is a well-formed BCP 47 language tag - * with no duplicate variant or singleton subtags. - * - * Spec: ECMAScript Internationalization API Specification, 6.2.2. - */ -function IsStructurallyValidLanguageTag(locale) { - return parseLanguageTag(locale) !== null; -} - -/** - * Canonicalizes the given structurally valid Unicode BCP 47 locale identifier, - * including regularized case of subtags. 
For example, the language tag - * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where - * - * Zh ; 2*3ALPHA - * -haNS ; ["-" script] - * -bu ; ["-" region] - * -variant2 ; *("-" variant) - * -Variant1 - * -u-ca-chinese ; *("-" extension) - * -t-Zh-laTN - * -x-PRIVATE ; ["-" privateuse] - * - * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private - * - * UTS 35 specifies two different canonicalization algorithms. There's one to - * canonicalize BCP 47 language tags and other one to canonicalize Unicode - * locale identifiers. The latter one wasn't present when ECMA-402 was changed - * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags, so - * ECMA-402 currently only uses the former to canonicalize Unicode BCP 47 locale - * identifiers. - * - * Spec: ECMAScript Internationalization API Specification, 6.2.3. - * Spec: https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion - */ -function CanonicalizeLanguageTagObject(localeObj) { - assert(IsObject(localeObj), "CanonicalizeLanguageTagObject"); - - // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by - // normalizing the case and ordering all subtags. The canonical syntax form - // itself is specified in UTS 35, 3.2.1. - - // The parser already normalized the case for all subtags. - -#ifdef DEBUG - function IsLowerCase(s) { - return s === callFunction(std_String_toLowerCase, s); - } - function IsUpperCase(s) { - return s === callFunction(std_String_toUpperCase, s); - } - function IsTitleCase(s) { - assert(s.length > 0, "unexpected empy string"); - var r = callFunction(std_String_toUpperCase, s[0]) + - callFunction(std_String_toLowerCase, Substring(s, 1, s.length - 1)); - return s === r; - } -#endif - - // 1. Any script subtag is in title case. 
- assert(localeObj.script === undefined || IsTitleCase(localeObj.script), - "If present, script subtag is in title case"); - - // 2. Any region subtag is in uppercase. - assert(localeObj.region === undefined || IsUpperCase(localeObj.region), - "If present, region subtag is in upper case"); - - // 3. All other subtags are in lowercase. - assert(IsLowerCase(localeObj.language), - "language subtag is in lower case"); - assert(callFunction(ArrayEvery, localeObj.variants, IsLowerCase), - "variant subtags are in lower case"); - assert(callFunction(ArrayEvery, localeObj.extensions, IsLowerCase), - "extension subtags are in lower case"); - assert(localeObj.privateuse === undefined || IsLowerCase(localeObj.privateuse), - "If present, privateuse subtag is in lower case"); - - - // The second step in UTS 35, 3.2.1, is to order all subtags. - - // 1. Any variants are in alphabetical order. - var variants = localeObj.variants; - if (variants.length > 0) { - callFunction(ArraySort, variants); - } - - // 2. Any extensions are in alphabetical order by their singleton. - var extensions = localeObj.extensions; - if (extensions.length > 0) { - // Extension sequences are sorted by their singleton characters. - // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" - callFunction(ArraySort, extensions); - - // The last three bullet points in UTS 35, 3.2.1 apply only to Unicode and Transform - // extensions. - // - // 3. All attributes are sorted in alphabetical order. - // - // 4. All keywords and tfields are sorted by alphabetical order of their - // keys, within their respective extensions. - // - // 5. Any type or tfield value "true" is removed. - - for (var i = 0; i < extensions.length; i++) { - var ext = extensions[i]; - assert(IsLowerCase(ext), - "extension subtags must be in lower-case"); - assert(ext[1] === "-", - "extension subtags start with a singleton"); - - // Canonicalize Unicode locale extension subtag if present. 
- if (ext[0] === "u") { - var {attributes, keywords} = UnicodeExtensionComponents(ext); - extensions[i] = CanonicalizeUnicodeExtension(attributes, keywords); - } - - // Canonicalize Unicode BCP 47 T extension if present. - if (ext[0] === "t") { - var {locale, fields} = TransformExtensionComponents(ext); - extensions[i] = CanonicalizeTransformExtension(locale, fields); - } - } - } - - // The next two steps in 3.3.1 replace deprecated language and region - // subtags with their preferred mappings. - updateLocaleIdMappings(localeObj); - - // The two final steps in 3.3.1, handling irregular grandfathered and - // private-use only language tags, don't apply, because these two forms - // can't occur in Unicode BCP 47 locale identifiers. -} - -/** - * Intl.Locale proposal - * - * UnicodeExtensionComponents( extension ) - * - * Returns the components of |extension| where |extension| is a "Unicode locale - * extension sequence" (ECMA-402, 6.2.1) without the starting separator - * character. - */ -function UnicodeExtensionComponents(extension) { - assert(typeof extension === "string", "extension is a String value"); - - // Step 1. - var attributes = []; - - // Step 2. - var keywords = []; - - // Step 3. - var isKeyword = false; - - // Step 4. - var size = extension.length; - - // Step 5. - // |extension| starts with "u-" instead of "-u-" in our implementation, so - // we need to initialize |k| with 2 instead of 3. - assert(callFunction(std_String_startsWith, extension, "u-"), - "extension starts with 'u-'"); - var k = 2; - - // Step 6. - var key, value; - while (k < size) { - // Step 6.a. - var e = callFunction(std_String_indexOf, extension, "-", k); - - // Step 6.b. - var len = (e < 0 ? size : e) - k; - - // Step 6.c. - var subtag = Substring(extension, k, len); - - // Steps 6.d-e. - if (!isKeyword) { - // Step 6.d. - // NB: Duplicates are handled elsewhere in our implementation. 
- if (len !== 2) - _DefineDataProperty(attributes, attributes.length, subtag); - } else { - // Steps 6.e.i-ii. - if (len === 2) { - // Step 6.e.i.1. - // NB: Duplicates are handled elsewhere in our implementation. - _DefineDataProperty(keywords, keywords.length, {key, value}); - } else { - // Step 6.e.ii.1. - if (value !== "") - value += "-"; - - // Step 6.e.ii.2. - value += subtag; - } - } - - // Step 6.f. - if (len === 2) { - // Step 6.f.i. - isKeyword = true; - - // Step 6.f.ii. - key = subtag; - - // Step 6.f.iii. - value = ""; - } - - // Step 6.g. - k += len + 1; - } - - // Step 7. - if (isKeyword) { - // Step 7.a. - // NB: Duplicates are handled elsewhere in our implementation. - _DefineDataProperty(keywords, keywords.length, {key, value}); - } - - // Step 8. - return {attributes, keywords}; -} - -/** - * CanonicalizeUnicodeExtension( attributes, keywords ) - * - * Canonical syntax per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: - * - * - All attributes and keywords are in lowercase. - * - Note: The parser already converted keywords to lowercase. - * - All attributes are sorted in alphabetical order. - * - All keywords are sorted by alphabetical order of their keys. - * - Any type value "true" is removed. - * - * Canonical form: - * - All keys and types use the canonical form (from the name attribute; - * see Section 3.6.4 U Extension Data Files). - */ -function CanonicalizeUnicodeExtension(attributes, keywords) { - assert(attributes.length > 0 || keywords.length > 0, - "unexpected empty Unicode locale extension components"); - - // All attributes are sorted in alphabetical order. - if (attributes.length > 1) - callFunction(ArraySort, attributes); - - // All keywords are sorted by alphabetical order of keys. 
- if (keywords.length > 1) { - function UnicodeKeySort(left, right) { - var leftKey = left.key; - var rightKey = right.key; - assert(leftKey.length === 2, "left key is a Unicode key"); - assert(rightKey.length === 2, "right key is a Unicode key"); - - // Compare both strings using charCodeAt(), because relational - // string comparison always calls into the VM, whereas charCodeAt - // can be inlined by Ion. - var diff = callFunction(std_String_charCodeAt, leftKey, 0) - - callFunction(std_String_charCodeAt, rightKey, 0); - if (diff === 0) { - diff = callFunction(std_String_charCodeAt, leftKey, 1) - - callFunction(std_String_charCodeAt, rightKey, 1); - } - return diff; - } - - callFunction(ArraySort, keywords, UnicodeKeySort); - } - - var extension = "u"; - - // Append all attributes. - for (var i = 0; i < attributes.length; i++) { - extension += "-" + attributes[i]; - } - - // Append all keywords. - for (var i = 0; i < keywords.length; i++) { - var {key, value} = keywords[i]; - extension += "-" + key; - - // Type value "true" is removed. - if (value !== "" && value !== "true") - extension += "-" + value; - } - - return extension; -} - -/** - * CanonicalizeTransformExtension - * - * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: - * - * - These subtags are all in lowercase (that is the canonical casing for these - * subtags), [...]. - * - * And per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: - * - * - All keywords and tfields are sorted by alphabetical order of their keys, - * within their respective extensions. 
- */ -function CanonicalizeTransformExtension(localeObj, fields) { - assert(localeObj !== undefined || fields.length > 0, - "unexpected empty Transform locale extension components"); - - if (fields.length > 0) { - function TransformKeySort(left, right) { - var leftKey = left.key; - var rightKey = right.key; - assert(leftKey.length === 2, "left key is a Transform key"); - assert(rightKey.length === 2, "right key is a Transform key"); - - // Compare both strings using charCodeAt(), because relational - // string comparison always calls into the VM, whereas charCodeAt - // can be inlined by Ion. - var diff = callFunction(std_String_charCodeAt, leftKey, 0) - - callFunction(std_String_charCodeAt, rightKey, 0); - if (diff === 0) { - diff = callFunction(std_String_charCodeAt, leftKey, 1) - - callFunction(std_String_charCodeAt, rightKey, 1); - } - return diff; - } - - callFunction(ArraySort, fields, TransformKeySort); - } - - var extension = "t"; - - // Append the language subtag if present. - if (localeObj !== undefined) { - // [1] is a bit unclear whether or not the `tlang` subtag also needs - // to be canonicalized (and case-adjusted). For now simply append it as - // is and change it to all lower-case. If we switch to [2], the `tlang` - // subtag also needs to be canonicalized according to the same rules as - // `unicode_language_id` subtags are canonicalized. Also see [3]. - // - // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier - // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - // [3] https://github.com/tc39/ecma402/issues/330 - var localeStr = StringFromLanguageTagObject(localeObj); - extension += "-" + callFunction(std_String_toLowerCase, localeStr); - } - - // Append all fields. - for (var i = 0; i < fields.length; i++) { - // UTS 35, 3.2.1 specifies: - // - Any type or tfield value "true" is removed. 
- // - // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so - // ignore this apparently invalid part of the UTS 35 specification and - // simply append all `tfield` subtags. - var {key, value} = fields[i]; - extension += "-" + key + "-" + value; - } - - return extension; -} - -/** - * Canonicalizes the given structurally valid BCP 47 language tag, including - * regularized case of subtags. For example, the language tag - * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where - * - * Zh ; 2*3ALPHA - * -haNS ; ["-" script] - * -bu ; ["-" region] - * -variant2 ; *("-" variant) - * -Variant1 - * -u-ca-chinese ; *("-" extension) - * -t-Zh-laTN - * -x-PRIVATE ; ["-" privateuse] - * - * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private - * - * Spec: ECMAScript Internationalization API Specification, 6.2.3. - */ -function CanonicalizeLanguageTag(locale) { - var localeObj = parseLanguageTag(locale); - assert(localeObj !== null, "CanonicalizeLanguageTag"); - - CanonicalizeLanguageTagObject(localeObj); - - return StringFromLanguageTagObject(localeObj); -} - -/** - * Returns the string representation of the given language tag object. - */ -function StringFromLanguageTagObject(localeObj) { - assert(IsObject(localeObj), "StringFromLanguageTagObject"); - - var { - language, - script, - region, - variants, - extensions, - privateuse, - } = localeObj; - - var canonical = language; - - if (script !== undefined) - canonical += "-" + script; - - if (region !== undefined) - canonical += "-" + region; - - if (variants.length > 0) - canonical += "-" + callFunction(std_Array_join, variants, "-"); - - if (extensions.length > 0) - canonical += "-" + callFunction(std_Array_join, extensions, "-"); - - if (privateuse !== undefined) - canonical += "-" + privateuse; - - return canonical; -} - /** * Returns true if the input contains only ASCII alphabetical characters. 
*/ @@ -1122,50 +138,6 @@ function IsASCIIAlphaString(s) { return true; } - -/** - * Validates and canonicalizes the given language tag. - */ -function ValidateAndCanonicalizeLanguageTag(locale) { - assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag"); - - // Handle the common case (a standalone language) first. - // Only the following Unicode BCP 47 locale identifier subset is accepted: - // unicode_locale_id = unicode_language_id - // unicode_language_id = unicode_language_subtag - // unicode_language_subtag = alpha{2,3} - if (locale.length === 2 || locale.length === 3) { - if (!IsASCIIAlphaString(locale)) - ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale); - assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag"); - - // The language subtag is canonicalized to lower case. - locale = callFunction(std_String_toLowerCase, locale); - - // updateLocaleIdMappings may modify tags containing only |language| - // subtags, if the language is in |complexLanguageMappings|, so we need - // to handle that case first. - if (!hasOwn(locale, complexLanguageMappings)) { - // Replace deprecated subtags with their preferred values. - locale = hasOwn(locale, languageMappings) - ? languageMappings[locale] - : locale; - assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization"); - - return locale; - } - } - - var localeObj = parseLanguageTag(locale); - if (localeObj === null) - ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale); - - CanonicalizeLanguageTagObject(localeObj); - - return StringFromLanguageTagObject(localeObj); -} - - // The last-ditch locale is used if none of the available locales satisfies a // request. 
"en-GB" is used based on the assumptions that English is the most // common second language, that both en-GB and en-US are normally available in @@ -1215,14 +187,10 @@ function DefaultLocaleIgnoringAvailableLocales() { // If we didn't get a cache hit, compute the candidate default locale and // cache it. Fall back on the last-ditch locale when necessary. - var candidate = parseLanguageTag(runtimeDefaultLocale); + var candidate = intl_TryValidateAndCanonicalizeLanguageTag(runtimeDefaultLocale); if (candidate === null) { candidate = lastDitchLocale(); } else { - CanonicalizeLanguageTagObject(candidate); - - candidate = StringFromLanguageTagObject(candidate); - // The default locale must be in [[availableLocales]], and that list // must not contain any locales with Unicode extension sequences, so // remove any present in the candidate. @@ -1236,10 +204,7 @@ function DefaultLocaleIgnoringAvailableLocales() { localeCandidateCache.candidateDefaultLocale = candidate; localeCandidateCache.runtimeDefaultLocale = runtimeDefaultLocale; - assert(IsStructurallyValidLanguageTag(candidate), - "the candidate must be structurally valid"); - assert(startOfUnicodeExtensions(candidate) < 0, - "the candidate must not contain a Unicode extension sequence"); + assertIsValidAndCanonicalLanguageTag(candidate, "the candidate"); return candidate; } @@ -1275,10 +240,7 @@ function DefaultLocale() { locale = lastDitchLocale(); } - assert(IsStructurallyValidLanguageTag(locale), - "the computed default locale must be structurally valid"); - assert(locale === CanonicalizeLanguageTag(locale), - "the computed default locale must be canonical"); + assertIsValidAndCanonicalLanguageTag(locale, "the computed default locale"); assert(startOfUnicodeExtensions(locale) < 0, "the computed default locale must not contain a Unicode extension sequence"); @@ -1325,8 +287,12 @@ function CanonicalizeLocaleList(locales) { return []; // Step 3 (and the remaining steps). 
- if (typeof locales === "string") - return [ValidateAndCanonicalizeLanguageTag(locales)]; + var tag = intl_ValidateAndCanonicalizeLanguageTag(locales, false); + if (tag !== null) { + assert(typeof tag === "string", + "intl_ValidateAndCanonicalizeLanguageTag returns a string value"); + return [tag]; + } // Step 2. var seen = []; @@ -1351,11 +317,10 @@ function CanonicalizeLocaleList(locales) { if (!(typeof kValue === "string" || IsObject(kValue))) ThrowTypeError(JSMSG_INVALID_LOCALES_ELEMENT); - // Step 7.c.iii. - var tag = ToString(kValue); - - // Step 7.c.iv. - tag = ValidateAndCanonicalizeLanguageTag(tag); + // Steps 7.c.iii-iv. + var tag = intl_ValidateAndCanonicalizeLanguageTag(kValue, true); + assert(typeof tag === "string", + "ValidateAndCanonicalizeLanguageTag returns a string value"); // Step 7.c.v. if (callFunction(ArrayIndexOf, seen, tag) === -1) @@ -1372,8 +337,7 @@ function CanonicalizeLocaleList(locales) { function BestAvailableLocaleHelper(availableLocales, locale, considerDefaultLocale) { - assert(IsStructurallyValidLanguageTag(locale), "invalid BestAvailableLocale locale structure"); - assert(locale === CanonicalizeLanguageTag(locale), "non-canonical BestAvailableLocale locale"); + assertIsValidAndCanonicalLanguageTag(locale, "BestAvailableLocale locale"); assert(startOfUnicodeExtensions(locale) < 0, "locale must contain no Unicode extensions"); // In the spec, [[availableLocales]] is formally a list of all available @@ -1703,13 +667,9 @@ function ResolveLocale(availableLocales, requestedLocales, options, relevantExte foundLocale = preExtension + supportedExtension + postExtension; } - // Step 9.d. - assert(IsStructurallyValidLanguageTag(foundLocale), "invalid locale after concatenation"); - - // Step 9.e (Not required in this implementation, because we don't + // Step 9.d-e (Step 9.e is not required in this implementation, because we don't // canonicalize Unicode extension subtags). 
- assert(foundLocale === CanonicalizeLanguageTag(foundLocale), "same locale with extension"); - + assertIsValidAndCanonicalLanguageTag(foundLocale, "same locale with extension"); } // Step 10. diff --git a/js/src/builtin/intl/IntlObject.cpp b/js/src/builtin/intl/IntlObject.cpp index c415079ae3..1c1a8d2477 100644 --- a/js/src/builtin/intl/IntlObject.cpp +++ b/js/src/builtin/intl/IntlObject.cpp @@ -19,6 +19,7 @@ #include "builtin/intl/CommonFunctions.h" #include "builtin/intl/DateTimeFormat.h" #include "builtin/intl/ICUHeader.h" +#include "builtin/intl/Locale.h" #include "builtin/intl/NumberFormat.h" #include "builtin/intl/PluralRules.h" #include "builtin/intl/RelativeTimeFormat.h" @@ -459,6 +460,10 @@ GlobalObject::initIntlObject(JSContext* cx, Handle<GlobalObject*> global) dateTimeFormatProto = CreateDateTimeFormatPrototype(cx, intl, global, &dateTimeFormat, DateTimeFormatOptions::Standard); if (!dateTimeFormatProto) return false; + RootedObject localeProto(cx); + localeProto = CreateLocalePrototype(cx, intl, global); + if (!localeProto) + return false; RootedObject numberFormatProto(cx), numberFormat(cx); numberFormatProto = CreateNumberFormatPrototype(cx, intl, global, &numberFormat); if (!numberFormatProto) @@ -492,6 +497,7 @@ GlobalObject::initIntlObject(JSContext* cx, Handle<GlobalObject*> global) global->setReservedSlot(COLLATOR_PROTO, ObjectValue(*collatorProto)); global->setReservedSlot(DATE_TIME_FORMAT, ObjectValue(*dateTimeFormat)); global->setReservedSlot(DATE_TIME_FORMAT_PROTO, ObjectValue(*dateTimeFormatProto)); + global->setReservedSlot(LOCALE_PROTO, ObjectValue(*localeProto)); global->setReservedSlot(NUMBER_FORMAT, ObjectValue(*numberFormat)); global->setReservedSlot(NUMBER_FORMAT_PROTO, ObjectValue(*numberFormatProto)); global->setReservedSlot(PLURAL_RULES_PROTO, ObjectValue(*pluralRulesProto)); diff --git a/js/src/builtin/intl/LangTagMappingsGenerated.js b/js/src/builtin/intl/LangTagMappingsGenerated.js deleted file mode 100644 index 
83a8ff8f60..0000000000 --- a/js/src/builtin/intl/LangTagMappingsGenerated.js +++ /dev/null @@ -1,1246 +0,0 @@ -// Generated by make_intl_data.py. DO NOT EDIT. - -// Mappings from grandfathered tags to preferred values. -// Derived from CLDR Supplemental Data, version 36.1. -// https://github.com/unicode-org/cldr.git -var grandfatheredMappings = { - "art-lojban": "jbo", - "cel-gaulish": "xtg-x-cel-gaulish", - "zh-guoyu": "zh", - "zh-hakka": "hak", - "zh-xiang": "hsn", -}; - -// Mappings from language subtags to preferred values. -// Derived from CLDR Supplemental Data, version 36.1. -// https://github.com/unicode-org/cldr.git -var languageMappings = { - "aam": "aas", - "aar": "aa", - "abk": "ab", - "adp": "dz", - "afr": "af", - "aju": "jrb", - "aka": "ak", - "alb": "sq", - "als": "sq", - "amh": "am", - "ara": "ar", - "arb": "ar", - "arg": "an", - "arm": "hy", - "asd": "snz", - "asm": "as", - "aue": "ktz", - "ava": "av", - "ave": "ae", - "aym": "ay", - "ayr": "ay", - "ayx": "nun", - "aze": "az", - "azj": "az", - "bak": "ba", - "bam": "bm", - "baq": "eu", - "bcc": "bal", - "bcl": "bik", - "bel": "be", - "ben": "bn", - "bgm": "bcg", - "bh": "bho", - "bih": "bho", - "bis": "bi", - "bjd": "drl", - "bod": "bo", - "bos": "bs", - "bre": "br", - "bul": "bg", - "bur": "my", - "bxk": "luy", - "bxr": "bua", - "cat": "ca", - "ccq": "rki", - "ces": "cs", - "cha": "ch", - "che": "ce", - "chi": "zh", - "chu": "cu", - "chv": "cv", - "cjr": "mom", - "cka": "cmr", - "cld": "syr", - "cmk": "xch", - "cmn": "zh", - "cor": "kw", - "cos": "co", - "coy": "pij", - "cqu": "quh", - "cre": "cr", - "cwd": "cr", - "cym": "cy", - "cze": "cs", - "dan": "da", - "deu": "de", - "dgo": "doi", - "dhd": "mwr", - "dik": "din", - "diq": "zza", - "dit": "dif", - "div": "dv", - "drh": "mn", - "dut": "nl", - "dzo": "dz", - "ekk": "et", - "ell": "el", - "emk": "man", - "eng": "en", - "epo": "eo", - "esk": "ik", - "est": "et", - "eus": "eu", - "ewe": "ee", - "fao": "fo", - "fas": "fa", - "fat": "ak", - "fij": 
"fj", - "fin": "fi", - "fra": "fr", - "fre": "fr", - "fry": "fy", - "fuc": "ff", - "ful": "ff", - "gav": "dev", - "gaz": "om", - "gbo": "grb", - "geo": "ka", - "ger": "de", - "gfx": "vaj", - "ggn": "gvr", - "gla": "gd", - "gle": "ga", - "glg": "gl", - "glv": "gv", - "gno": "gon", - "gre": "el", - "grn": "gn", - "gti": "nyc", - "gug": "gn", - "guj": "gu", - "guv": "duz", - "gya": "gba", - "hat": "ht", - "hau": "ha", - "hdn": "hai", - "hea": "hmn", - "heb": "he", - "her": "hz", - "him": "srx", - "hin": "hi", - "hmo": "ho", - "hrr": "jal", - "hrv": "hr", - "hun": "hu", - "hye": "hy", - "ibi": "opa", - "ibo": "ig", - "ice": "is", - "ido": "io", - "iii": "ii", - "ike": "iu", - "iku": "iu", - "ile": "ie", - "ilw": "gal", - "in": "id", - "ina": "ia", - "ind": "id", - "ipk": "ik", - "isl": "is", - "ita": "it", - "iw": "he", - "jav": "jv", - "jeg": "oyb", - "ji": "yi", - "jpn": "ja", - "jw": "jv", - "kal": "kl", - "kan": "kn", - "kas": "ks", - "kat": "ka", - "kau": "kr", - "kaz": "kk", - "kgc": "tdf", - "kgh": "kml", - "khk": "mn", - "khm": "km", - "kik": "ki", - "kin": "rw", - "kir": "ky", - "kmr": "ku", - "knc": "kr", - "kng": "kg", - "knn": "kok", - "koj": "kwv", - "kom": "kv", - "kon": "kg", - "kor": "ko", - "kpv": "kv", - "krm": "bmf", - "ktr": "dtp", - "kua": "kj", - "kur": "ku", - "kvs": "gdj", - "kwq": "yam", - "kxe": "tvd", - "kzj": "dtp", - "kzt": "dtp", - "lao": "lo", - "lat": "la", - "lav": "lv", - "lbk": "bnc", - "lii": "raq", - "lim": "li", - "lin": "ln", - "lit": "lt", - "llo": "ngt", - "lmm": "rmx", - "ltz": "lb", - "lub": "lu", - "lug": "lg", - "lvs": "lv", - "mac": "mk", - "mah": "mh", - "mal": "ml", - "mao": "mi", - "mar": "mr", - "may": "ms", - "meg": "cir", - "mhr": "chm", - "mkd": "mk", - "mlg": "mg", - "mlt": "mt", - "mnk": "man", - "mo": "ro", - "mol": "ro", - "mon": "mn", - "mri": "mi", - "msa": "ms", - "mst": "mry", - "mup": "raj", - "mwj": "vaj", - "mya": "my", - "myd": "aog", - "myt": "mry", - "nad": "xny", - "nau": "na", - "nav": "nv", - "nbl": 
"nr", - "ncp": "kdz", - "nde": "nd", - "ndo": "ng", - "nep": "ne", - "nld": "nl", - "nno": "nn", - "nns": "nbr", - "nnx": "ngv", - "no": "nb", - "nob": "nb", - "nor": "nb", - "npi": "ne", - "nts": "pij", - "nya": "ny", - "oci": "oc", - "ojg": "oj", - "oji": "oj", - "ori": "or", - "orm": "om", - "ory": "or", - "oss": "os", - "oun": "vaj", - "pan": "pa", - "pbu": "ps", - "pcr": "adx", - "per": "fa", - "pes": "fa", - "pli": "pi", - "plt": "mg", - "pmc": "huw", - "pmu": "phr", - "pnb": "lah", - "pol": "pl", - "por": "pt", - "ppa": "bfy", - "ppr": "lcq", - "pry": "prt", - "pus": "ps", - "puz": "pub", - "que": "qu", - "quz": "qu", - "rmy": "rom", - "roh": "rm", - "ron": "ro", - "rum": "ro", - "run": "rn", - "rus": "ru", - "sag": "sg", - "san": "sa", - "sca": "hle", - "scc": "sr", - "scr": "hr", - "sin": "si", - "skk": "oyb", - "slk": "sk", - "slo": "sk", - "slv": "sl", - "sme": "se", - "smo": "sm", - "sna": "sn", - "snd": "sd", - "som": "so", - "sot": "st", - "spa": "es", - "spy": "kln", - "sqi": "sq", - "src": "sc", - "srd": "sc", - "srp": "sr", - "ssw": "ss", - "sun": "su", - "swa": "sw", - "swe": "sv", - "swh": "sw", - "tah": "ty", - "tam": "ta", - "tat": "tt", - "tdu": "dtp", - "tel": "te", - "tgk": "tg", - "tgl": "fil", - "tha": "th", - "thc": "tpo", - "thx": "oyb", - "tib": "bo", - "tie": "ras", - "tir": "ti", - "tkk": "twm", - "tl": "fil", - "tlw": "weo", - "tmp": "tyj", - "tne": "kak", - "ton": "to", - "tsf": "taj", - "tsn": "tn", - "tso": "ts", - "ttq": "tmh", - "tuk": "tk", - "tur": "tr", - "tw": "ak", - "twi": "ak", - "uig": "ug", - "ukr": "uk", - "umu": "del", - "uok": "ema", - "urd": "ur", - "uzb": "uz", - "uzn": "uz", - "ven": "ve", - "vie": "vi", - "vol": "vo", - "wel": "cy", - "wln": "wa", - "wol": "wo", - "xba": "cax", - "xho": "xh", - "xia": "acn", - "xkh": "waw", - "xpe": "kpe", - "xsj": "suj", - "xsl": "den", - "ybd": "rki", - "ydd": "yi", - "yid": "yi", - "yma": "lrr", - "ymt": "mtm", - "yor": "yo", - "yos": "zom", - "yuu": "yug", - "zai": "zap", - 
"zha": "za", - "zho": "zh", - "zsm": "ms", - "zul": "zu", - "zyb": "za", -}; - -// Language subtags with complex mappings. -// Derived from CLDR Supplemental Data, version 36.1. -// https://github.com/unicode-org/cldr.git -var complexLanguageMappings = { - "cnr": true, - "drw": true, - "hbs": true, - "prs": true, - "sh": true, - "swc": true, - "tnf": true, -}; - -// Mappings from region subtags to preferred values. -// Derived from CLDR Supplemental Data, version 36.1. -// https://github.com/unicode-org/cldr.git -var regionMappings = { - "004": "AF", - "008": "AL", - "010": "AQ", - "012": "DZ", - "016": "AS", - "020": "AD", - "024": "AO", - "028": "AG", - "031": "AZ", - "032": "AR", - "036": "AU", - "040": "AT", - "044": "BS", - "048": "BH", - "050": "BD", - "051": "AM", - "052": "BB", - "056": "BE", - "060": "BM", - "062": "034", - "064": "BT", - "068": "BO", - "070": "BA", - "072": "BW", - "074": "BV", - "076": "BR", - "084": "BZ", - "086": "IO", - "090": "SB", - "092": "VG", - "096": "BN", - "100": "BG", - "104": "MM", - "108": "BI", - "112": "BY", - "116": "KH", - "120": "CM", - "124": "CA", - "132": "CV", - "136": "KY", - "140": "CF", - "144": "LK", - "148": "TD", - "152": "CL", - "156": "CN", - "158": "TW", - "162": "CX", - "166": "CC", - "170": "CO", - "174": "KM", - "175": "YT", - "178": "CG", - "180": "CD", - "184": "CK", - "188": "CR", - "191": "HR", - "192": "CU", - "196": "CY", - "203": "CZ", - "204": "BJ", - "208": "DK", - "212": "DM", - "214": "DO", - "218": "EC", - "222": "SV", - "226": "GQ", - "230": "ET", - "231": "ET", - "232": "ER", - "233": "EE", - "234": "FO", - "238": "FK", - "239": "GS", - "242": "FJ", - "246": "FI", - "248": "AX", - "249": "FR", - "250": "FR", - "254": "GF", - "258": "PF", - "260": "TF", - "262": "DJ", - "266": "GA", - "268": "GE", - "270": "GM", - "275": "PS", - "276": "DE", - "278": "DE", - "280": "DE", - "288": "GH", - "292": "GI", - "296": "KI", - "300": "GR", - "304": "GL", - "308": "GD", - "312": "GP", - "316": "GU", - 
"320": "GT", - "324": "GN", - "328": "GY", - "332": "HT", - "334": "HM", - "336": "VA", - "340": "HN", - "344": "HK", - "348": "HU", - "352": "IS", - "356": "IN", - "360": "ID", - "364": "IR", - "368": "IQ", - "372": "IE", - "376": "IL", - "380": "IT", - "384": "CI", - "388": "JM", - "392": "JP", - "398": "KZ", - "400": "JO", - "404": "KE", - "408": "KP", - "410": "KR", - "414": "KW", - "417": "KG", - "418": "LA", - "422": "LB", - "426": "LS", - "428": "LV", - "430": "LR", - "434": "LY", - "438": "LI", - "440": "LT", - "442": "LU", - "446": "MO", - "450": "MG", - "454": "MW", - "458": "MY", - "462": "MV", - "466": "ML", - "470": "MT", - "474": "MQ", - "478": "MR", - "480": "MU", - "484": "MX", - "492": "MC", - "496": "MN", - "498": "MD", - "499": "ME", - "500": "MS", - "504": "MA", - "508": "MZ", - "512": "OM", - "516": "NA", - "520": "NR", - "524": "NP", - "528": "NL", - "531": "CW", - "533": "AW", - "534": "SX", - "535": "BQ", - "540": "NC", - "548": "VU", - "554": "NZ", - "558": "NI", - "562": "NE", - "566": "NG", - "570": "NU", - "574": "NF", - "578": "NO", - "580": "MP", - "581": "UM", - "583": "FM", - "584": "MH", - "585": "PW", - "586": "PK", - "591": "PA", - "598": "PG", - "600": "PY", - "604": "PE", - "608": "PH", - "612": "PN", - "616": "PL", - "620": "PT", - "624": "GW", - "626": "TL", - "630": "PR", - "634": "QA", - "638": "RE", - "642": "RO", - "643": "RU", - "646": "RW", - "652": "BL", - "654": "SH", - "659": "KN", - "660": "AI", - "662": "LC", - "663": "MF", - "666": "PM", - "670": "VC", - "674": "SM", - "678": "ST", - "682": "SA", - "686": "SN", - "688": "RS", - "690": "SC", - "694": "SL", - "702": "SG", - "703": "SK", - "704": "VN", - "705": "SI", - "706": "SO", - "710": "ZA", - "716": "ZW", - "720": "YE", - "724": "ES", - "728": "SS", - "729": "SD", - "732": "EH", - "736": "SD", - "740": "SR", - "744": "SJ", - "748": "SZ", - "752": "SE", - "756": "CH", - "760": "SY", - "762": "TJ", - "764": "TH", - "768": "TG", - "772": "TK", - "776": "TO", - 
"780": "TT", - "784": "AE", - "788": "TN", - "792": "TR", - "795": "TM", - "796": "TC", - "798": "TV", - "800": "UG", - "804": "UA", - "807": "MK", - "818": "EG", - "826": "GB", - "830": "JE", - "831": "GG", - "832": "JE", - "833": "IM", - "834": "TZ", - "840": "US", - "850": "VI", - "854": "BF", - "858": "UY", - "860": "UZ", - "862": "VE", - "876": "WF", - "882": "WS", - "886": "YE", - "887": "YE", - "891": "RS", - "894": "ZM", - "958": "AA", - "959": "QM", - "960": "QN", - "962": "QP", - "963": "QQ", - "964": "QR", - "965": "QS", - "966": "QT", - "967": "EU", - "968": "QV", - "969": "QW", - "970": "QX", - "971": "QY", - "972": "QZ", - "973": "XA", - "974": "XB", - "975": "XC", - "976": "XD", - "977": "XE", - "978": "XF", - "979": "XG", - "980": "XH", - "981": "XI", - "982": "XJ", - "983": "XK", - "984": "XL", - "985": "XM", - "986": "XN", - "987": "XO", - "988": "XP", - "989": "XQ", - "990": "XR", - "991": "XS", - "992": "XT", - "993": "XU", - "994": "XV", - "995": "XW", - "996": "XX", - "997": "XY", - "998": "XZ", - "999": "ZZ", - "BU": "MM", - "CS": "RS", - "CT": "KI", - "DD": "DE", - "DY": "BJ", - "FQ": "AQ", - "FX": "FR", - "HV": "BF", - "JT": "UM", - "MI": "UM", - "NH": "VU", - "NQ": "AQ", - "PU": "UM", - "PZ": "PA", - "QU": "EU", - "RH": "ZW", - "TP": "TL", - "UK": "GB", - "VD": "VN", - "WK": "UM", - "YD": "YE", - "YU": "RS", - "ZR": "CD", -}; - -// Region subtags with complex mappings. -// Derived from CLDR Supplemental Data, version 36.1. -// https://github.com/unicode-org/cldr.git -var complexRegionMappings = { - "172": true, - "200": true, - "530": true, - "532": true, - "536": true, - "582": true, - "810": true, - "890": true, - "AN": true, - "NT": true, - "PC": true, - "SU": true, -}; - -// Canonicalize Unicode BCP 47 locale identifiers. -// Derived from CLDR Supplemental Data, version 36.1. 
-// https://github.com/unicode-org/cldr.git -/* eslint-disable complexity */ -function updateLocaleIdMappings(tag) { - assert(IsObject(tag), "tag is an object"); - - // Replace deprecated language tags with their preferred values. - var language = tag.language; - if (hasOwn(language, languageMappings)) { - tag.language = languageMappings[language]; - } else if (hasOwn(language, complexLanguageMappings)) { - switch (language) { - case "cnr": - tag.language = "sr"; - if (tag.region === undefined) - tag.region = "ME"; - break; - case "drw": - case "prs": - case "tnf": - tag.language = "fa"; - if (tag.region === undefined) - tag.region = "AF"; - break; - case "hbs": - case "sh": - tag.language = "sr"; - if (tag.script === undefined) - tag.script = "Latn"; - break; - case "swc": - tag.language = "sw"; - if (tag.region === undefined) - tag.region = "CD"; - break; - default: - assert(false, "language not handled: " + language); - } - } - - // No script replacements are currently present. - - // Replace deprecated subtags with their preferred values. 
- var region = tag.region; - if (region !== undefined) { - if (hasOwn(region, regionMappings)) { - tag.region = regionMappings[region]; - } else if (hasOwn(region, complexRegionMappings)) { - switch (region) { - case "172": - if (tag.language === "ab") { - tag.region = "GE"; - break; - } - if (tag.language === "az") { - tag.region = "AZ"; - break; - } - if (tag.language === "be") { - tag.region = "BY"; - break; - } - if (tag.language === "crh") { - tag.region = "UA"; - break; - } - if (tag.language === "gag") { - tag.region = "MD"; - break; - } - if (tag.language === "got") { - tag.region = "UA"; - break; - } - if (tag.language === "hy") { - tag.region = "AM"; - break; - } - if (tag.language === "ji") { - tag.region = "UA"; - break; - } - if (tag.language === "ka") { - tag.region = "GE"; - break; - } - if (tag.language === "kaa") { - tag.region = "UZ"; - break; - } - if (tag.language === "kk") { - tag.region = "KZ"; - break; - } - if (tag.language === "ku" && tag.script === "Yezi") { - tag.region = "GE"; - break; - } - if (tag.language === "ky") { - tag.region = "KG"; - break; - } - if (tag.language === "os") { - tag.region = "GE"; - break; - } - if (tag.language === "rue") { - tag.region = "UA"; - break; - } - if (tag.language === "sog") { - tag.region = "UZ"; - break; - } - if (tag.language === "tg") { - tag.region = "TJ"; - break; - } - if (tag.language === "tk") { - tag.region = "TM"; - break; - } - if (tag.language === "tkr") { - tag.region = "AZ"; - break; - } - if (tag.language === "tly") { - tag.region = "AZ"; - break; - } - if (tag.language === "ttt") { - tag.region = "AZ"; - break; - } - if (tag.language === "ug" && tag.script === "Cyrl") { - tag.region = "KZ"; - break; - } - if (tag.language === "uk") { - tag.region = "UA"; - break; - } - if (tag.language === "und" && tag.script === "Geor") { - tag.region = "GE"; - break; - } - if (tag.language === "und" && tag.script === "Armn") { - tag.region = "AM"; - break; - } - if (tag.language === "und" && 
tag.script === "Sogo") { - tag.region = "UZ"; - break; - } - if (tag.language === "und" && tag.script === "Goth") { - tag.region = "UA"; - break; - } - if (tag.language === "und" && tag.script === "Chrs") { - tag.region = "UZ"; - break; - } - if (tag.language === "und" && tag.script === "Sogd") { - tag.region = "UZ"; - break; - } - if (tag.language === "und" && tag.script === "Yezi") { - tag.region = "GE"; - break; - } - if (tag.language === "uz") { - tag.region = "UZ"; - break; - } - if (tag.language === "xco") { - tag.region = "UZ"; - break; - } - if (tag.language === "xmf") { - tag.region = "GE"; - break; - } - tag.region = "RU"; - break; - case "200": - if (tag.language === "sk") { - tag.region = "SK"; - break; - } - tag.region = "CZ"; - break; - case "530": - case "532": - case "AN": - if (tag.language === "vic") { - tag.region = "SX"; - break; - } - tag.region = "CW"; - break; - case "536": - case "NT": - if (tag.language === "akk") { - tag.region = "IQ"; - break; - } - if (tag.language === "ckb") { - tag.region = "IQ"; - break; - } - if (tag.language === "ku" && tag.script === "Arab") { - tag.region = "IQ"; - break; - } - if (tag.language === "mis") { - tag.region = "IQ"; - break; - } - if (tag.language === "syr") { - tag.region = "IQ"; - break; - } - if (tag.language === "und" && tag.script === "Syrc") { - tag.region = "IQ"; - break; - } - if (tag.language === "und" && tag.script === "Hatr") { - tag.region = "IQ"; - break; - } - if (tag.language === "und" && tag.script === "Xsux") { - tag.region = "IQ"; - break; - } - tag.region = "SA"; - break; - case "582": - case "PC": - if (tag.language === "mh") { - tag.region = "MH"; - break; - } - if (tag.language === "pau") { - tag.region = "PW"; - break; - } - tag.region = "FM"; - break; - case "810": - case "SU": - if (tag.language === "ab") { - tag.region = "GE"; - break; - } - if (tag.language === "az") { - tag.region = "AZ"; - break; - } - if (tag.language === "be") { - tag.region = "BY"; - break; - } - if 
(tag.language === "crh") { - tag.region = "UA"; - break; - } - if (tag.language === "et") { - tag.region = "EE"; - break; - } - if (tag.language === "gag") { - tag.region = "MD"; - break; - } - if (tag.language === "got") { - tag.region = "UA"; - break; - } - if (tag.language === "hy") { - tag.region = "AM"; - break; - } - if (tag.language === "ji") { - tag.region = "UA"; - break; - } - if (tag.language === "ka") { - tag.region = "GE"; - break; - } - if (tag.language === "kaa") { - tag.region = "UZ"; - break; - } - if (tag.language === "kk") { - tag.region = "KZ"; - break; - } - if (tag.language === "ku" && tag.script === "Yezi") { - tag.region = "GE"; - break; - } - if (tag.language === "ky") { - tag.region = "KG"; - break; - } - if (tag.language === "lt") { - tag.region = "LT"; - break; - } - if (tag.language === "ltg") { - tag.region = "LV"; - break; - } - if (tag.language === "lv") { - tag.region = "LV"; - break; - } - if (tag.language === "os") { - tag.region = "GE"; - break; - } - if (tag.language === "rue") { - tag.region = "UA"; - break; - } - if (tag.language === "sgs") { - tag.region = "LT"; - break; - } - if (tag.language === "sog") { - tag.region = "UZ"; - break; - } - if (tag.language === "tg") { - tag.region = "TJ"; - break; - } - if (tag.language === "tk") { - tag.region = "TM"; - break; - } - if (tag.language === "tkr") { - tag.region = "AZ"; - break; - } - if (tag.language === "tly") { - tag.region = "AZ"; - break; - } - if (tag.language === "ttt") { - tag.region = "AZ"; - break; - } - if (tag.language === "ug" && tag.script === "Cyrl") { - tag.region = "KZ"; - break; - } - if (tag.language === "uk") { - tag.region = "UA"; - break; - } - if (tag.language === "und" && tag.script === "Geor") { - tag.region = "GE"; - break; - } - if (tag.language === "und" && tag.script === "Armn") { - tag.region = "AM"; - break; - } - if (tag.language === "und" && tag.script === "Sogo") { - tag.region = "UZ"; - break; - } - if (tag.language === "und" && tag.script 
=== "Goth") { - tag.region = "UA"; - break; - } - if (tag.language === "und" && tag.script === "Chrs") { - tag.region = "UZ"; - break; - } - if (tag.language === "und" && tag.script === "Sogd") { - tag.region = "UZ"; - break; - } - if (tag.language === "und" && tag.script === "Yezi") { - tag.region = "GE"; - break; - } - if (tag.language === "uz") { - tag.region = "UZ"; - break; - } - if (tag.language === "vro") { - tag.region = "EE"; - break; - } - if (tag.language === "xco") { - tag.region = "UZ"; - break; - } - if (tag.language === "xmf") { - tag.region = "GE"; - break; - } - tag.region = "RU"; - break; - case "890": - if (tag.language === "bs") { - tag.region = "BA"; - break; - } - if (tag.language === "hr") { - tag.region = "HR"; - break; - } - if (tag.language === "mk") { - tag.region = "MK"; - break; - } - if (tag.language === "sl") { - tag.region = "SI"; - break; - } - tag.region = "RS"; - break; - default: - assert(false, "region not handled: " + region); - } - } - - // No variant replacements are currently present. - // No extension replacements are currently present. - // Private use sequences are left as is. - - } -} -/* eslint-enable complexity */ - -// Canonicalize grandfathered locale identifiers. -// Derived from CLDR Supplemental Data, version 36.1. -// https://github.com/unicode-org/cldr.git -function updateGrandfatheredMappings(tag) { - assert(IsObject(tag), "tag is an object"); - - // We're mapping regular grandfathered tags to non-grandfathered form here. - // Other tags remain unchanged. 
- // - // regular = "art-lojban" - // / "cel-gaulish" - // / "no-bok" - // / "no-nyn" - // / "zh-guoyu" - // / "zh-hakka" - // / "zh-min" - // / "zh-min-nan" - // / "zh-xiang" - // - // Therefore we can quickly exclude most tags by checking every - // |unicode_locale_id| subcomponent for characteristics not shared by any of - // the regular grandfathered (RG) tags: - // - // * Real-world |unicode_language_subtag|s are all two or three letters, - // so don't waste time running a useless |language.length > 3| fast-path. - // * No RG tag has a "script"-looking component. - // * No RG tag has a "region"-looking component. - // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, - // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, - // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag - // that |unicode_locale_id| doesn't support.) - // * No RG tag contains |extensions| or |pu_extensions|. - if (tag.script !== undefined || - tag.region !== undefined || - tag.variants.length !== 1 || - tag.extensions.length !== 0 || - tag.privateuse !== undefined) - { - return; - } - - // art-lojban -> jbo - if (tag.language === "art" && tag.variants[0] === "lojban") { - tag.language = "jbo"; - tag.variants.length = 0; - } - - // cel-gaulish -> xtg-x-cel-gaulish - else if (tag.language === "cel" && tag.variants[0] === "gaulish") { - tag.language = "xtg"; - tag.variants.length = 0; - tag.privateuse = "x-cel-gaulish"; - } - - // zh-guoyu -> zh - else if (tag.language === "zh" && tag.variants[0] === "guoyu") { - tag.language = "zh"; - tag.variants.length = 0; - } - - // zh-hakka -> hak - else if (tag.language === "zh" && tag.variants[0] === "hakka") { - tag.language = "hak"; - tag.variants.length = 0; - } - - // zh-xiang -> hsn - else if (tag.language === "zh" && tag.variants[0] === "xiang") { - tag.language = "hsn"; - tag.variants.length = 0; - } -} diff --git a/js/src/builtin/intl/LanguageTag.cpp b/js/src/builtin/intl/LanguageTag.cpp new 
file mode 100644 index 0000000000..1f5c1fa110 --- /dev/null +++ b/js/src/builtin/intl/LanguageTag.cpp @@ -0,0 +1,1677 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "builtin/intl/LanguageTag.h" + +#include "mozilla/Assertions.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/Range.h" +#include "mozilla/TextUtils.h" +#include "mozilla/Variant.h" + +#include <algorithm> +#include <iterator> +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <string.h> +#include <type_traits> +#include <utility> + +#include "jsapi.h" +#include "jsfriendapi.h" +#include "jscntxt.h" + +#include "builtin/intl/CommonFunctions.h" +#include "ds/Sort.h" +#include "js/Result.h" +#include "js/Utility.h" +#include "js/Vector.h" +#include "unicode/uloc.h" +#include "unicode/utypes.h" +#include "vm/String.h" +#include "vm/StringBuffer.h" + +namespace js { +namespace intl { + +using namespace js::intl::LanguageTagLimits; + +using ConstCharRange = mozilla::Range<const char>; + +#ifdef DEBUG +template <typename CharT> +bool IsStructurallyValidLanguageTag( + const mozilla::Range<const CharT>& language) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + // unicode_language_subtag = alpha{2,3} | alpha{5,8}; + size_t length = language.length(); + const CharT* str = language.begin().get(); + return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && + std::all_of(str, str + length, mozilla::IsAsciiLowercaseAlpha<CharT>); +} + +template bool IsStructurallyValidLanguageTag( + const mozilla::Range<const Latin1Char>& language); +template bool IsStructurallyValidLanguageTag( + const mozilla::Range<const char16_t>& language); + +template <typename CharT> +bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + // unicode_script_subtag = alpha{4} ; + size_t length = script.length(); + const CharT* str = script.begin().get(); + return length == 4 && mozilla::IsAsciiUppercaseAlpha<CharT>(str[0]) && + std::all_of(str + 1, str + length, + mozilla::IsAsciiLowercaseAlpha<CharT>); +} + +template bool IsStructurallyValidScriptTag( + const mozilla::Range<const Latin1Char>& script); +template bool IsStructurallyValidScriptTag( + const mozilla::Range<const char16_t>& script); + +template <typename CharT> +bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + size_t length = region.length(); + const CharT* str = region.begin().get(); + return (length == 2 && std::all_of(str, str + length, + mozilla::IsAsciiUppercaseAlpha<CharT>)) || + (length == 3 && + std::all_of(str, str + length, mozilla::IsAsciiDigit<CharT>)); +} + +template bool IsStructurallyValidRegionTag( + const mozilla::Range<const Latin1Char>& region); +template bool IsStructurallyValidRegionTag( + const mozilla::Range<const char16_t>& region); + +bool IsStructurallyValidVariantTag(const ConstCharRange& variant) { + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; + auto isAsciiLowercaseAlphanumeric = [](char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); + }; + size_t length = variant.length(); + const char* str = variant.begin().get(); + return ((5 <= length && length <= 8) || + (length == 4 && mozilla::IsAsciiDigit(str[0]))) && + std::all_of(str, str + length, isAsciiLowercaseAlphanumeric); +} + +bool IsStructurallyValidUnicodeExtensionTag(const ConstCharRange& extension) { + auto isAsciiLowercaseAlphanumericOrDash = [](char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c) || + c == '-'; + }; + + size_t length = extension.length(); + const char* str = extension.begin().get(); + return LanguageTagParser::canParseUnicodeExtension(extension) && + std::all_of(str, str + length, isAsciiLowercaseAlphanumericOrDash); +} + +static bool IsStructurallyValidExtensionTag(const ConstCharRange& extension) { + // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + // NB: Allow any extension, including Unicode and Transform here, because + // this function is only used for an assertion. 
+ auto isAsciiDigitOrLowercaseAlpha = [](char c) { + return mozilla::IsAsciiDigit(c) || mozilla::IsAsciiLowercaseAlpha(c); + }; + + size_t length = extension.length(); + const char* str = extension.begin().get(); + if (length <= 2) { + return false; + } + if (!isAsciiDigitOrLowercaseAlpha(str[0]) || str[0] == 'x') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = reinterpret_cast<const char*>( + memchr(str, '-', extension.end().get() - str)); + size_t len = (sep ? sep : extension.end().get()) - str; + if (len < 2 || len > 8 || + !std::all_of(str, str + len, isAsciiDigitOrLowercaseAlpha)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} + +bool IsStructurallyValidPrivateUseTag(const ConstCharRange& privateUse) { + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + auto isAsciiDigitOrLowercaseAlpha = [](char c) { + return mozilla::IsAsciiDigit(c) || mozilla::IsAsciiLowercaseAlpha(c); + }; + + size_t length = privateUse.length(); + const char* str = privateUse.begin().get(); + if (length <= 2 || *str++ != 'x' || *str++ != '-') { + return false; + } + while (true) { + const char* sep = reinterpret_cast<const char*>( + memchr(str, '-', privateUse.end().get() - str)); + size_t len = (sep ? sep : privateUse.end().get()) - str; + if (len == 0 || len > 8 || + !std::all_of(str, str + len, isAsciiDigitOrLowercaseAlpha)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} +#endif + +bool LanguageTag::setUnicodeExtension(UniqueChars extension) { + MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag( + {extension.get(), strlen(extension.get())})); + + // Replace the existing Unicode extension subtag or append a new one. 
+ auto p = std::find_if(extensions().begin(), extensions().end(), + [](const auto& ext) { return ext[0] == 'u'; }); + if (p != extensions().end()) { + size_t index = std::distance(extensions().begin(), p); + extensions_[index] = std::move(extension); + return true; + } + return extensions_.append(std::move(extension)); +} + +template <size_t InitialCapacity> +static bool SortAlphabetically(JSContext* cx, + Vector<UniqueChars, InitialCapacity>& subtags) { + size_t length = subtags.length(); + + // Zero or one element lists are already sorted. + if (length < 2) { + return true; + } + + // Handle two element lists inline. + if (length == 2) { + if (strcmp(subtags[0].get(), subtags[1].get()) > 0) { + subtags[0].swap(subtags[1]); + } + return true; + } + + Vector<char*, 8> scratch(cx); + if (!scratch.resizeUninitialized(length * 2)) { + return false; + } + for (size_t i = 0; i < length; i++) { + scratch[i] = subtags[i].release(); + } + + MOZ_ALWAYS_TRUE( + MergeSort(scratch.begin(), length, scratch.begin() + length, + [](const char* a, const char* b, bool* lessOrEqualp) { + *lessOrEqualp = strcmp(a, b) <= 0; + return true; + })); + + for (size_t i = 0; i < length; i++) { + subtags[i] = UniqueChars(scratch[i]); + } + return true; +} + +bool LanguageTag::canonicalizeBaseName(JSContext* cx) { + // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by + // normalizing the case and ordering all subtags. The canonical syntax form + // itself is specified in UTS 35, 3.2.1. + + // The |LanguageTag| fields are already in normalized case, so we can skip + // this step. 
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); + MOZ_ASSERT(script().length() == 0 || + IsStructurallyValidScriptTag(script().range())); + MOZ_ASSERT(region().length() == 0 || + IsStructurallyValidRegionTag(region().range())); +#ifdef DEBUG + auto validVariant = [](const auto& variant) { + const char* str = variant.get(); + return IsStructurallyValidVariantTag({str, strlen(str)}); + }; + MOZ_ASSERT(std::all_of(variants().begin(), variants().end(), validVariant)); + + auto validExtension = [](const auto& extension) { + const char* str = extension.get(); + return IsStructurallyValidExtensionTag({str, strlen(str)}); + }; + MOZ_ASSERT( + std::all_of(extensions().begin(), extensions().end(), validExtension)); +#endif + MOZ_ASSERT(!privateuse() || IsStructurallyValidPrivateUseTag( + {privateuse(), strlen(privateuse())})); + + // The second step in UTS 35, 3.2.1, is to order all subtags. + + // 1. Any variants are in alphabetical order. + if (!SortAlphabetically(cx, variants_)) { + return false; + } + + // 2. Any extensions are in alphabetical order by their singleton. + // - A subsequent call to canonicalizeExtensions() will perform this. + + // The next two steps in 3.3.1 replace deprecated language and region + // subtags with their preferred mappings. + + if (!updateGrandfatheredMappings(cx)) { + return false; + } + + // Replace deprecated language subtags with their preferred values. + if (!languageMapping(language_) && complexLanguageMapping(language_)) { + performComplexLanguageMappings(); + } + + // No script replacements are currently present. + + // Replace deprecated region subtags with their preferred values. + if (region().length() > 0) { + if (!regionMapping(region_) && complexRegionMapping(region_)) { + performComplexRegionMappings(); + } + } + + // No variant subtag replacements are currently present. + // No extension replacements are currently present. + // Private use sequences are left as is. 
+ + // The two final steps in 3.3.1, handling irregular grandfathered and + // private-use only language tags, don't apply, because these two forms + // can't occur in Unicode BCP 47 locale identifiers. + + return true; +} + +bool LanguageTag::canonicalizeExtensions( + JSContext* cx, UnicodeExtensionCanonicalForm canonicalForm) { + // Any extensions are in alphabetical order by their singleton. + // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" + if (!SortAlphabetically(cx, extensions_)) { + return false; + } + + for (UniqueChars& extension : extensions_) { + if (extension[0] == 'u') { + if (!canonicalizeUnicodeExtension(cx, extension, canonicalForm)) { + return false; + } + } else if (extension[0] == 't') { + if (!canonicalizeTransformExtension(cx, extension)) { + return false; + } + } + } + return true; +} + +/** + * CanonicalizeUnicodeExtension( attributes, keywords ) + * + * Canonical syntax per + * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All attributes and keywords are in lowercase. + * - Note: The parser already converted keywords to lowercase. + * - All attributes are sorted in alphabetical order. + * - All keywords are sorted by alphabetical order of their keys. + * - Any type value "true" is removed. + * + * Canonical form: + * - All keys and types use the canonical form (from the name attribute; + * see Section 3.6.4 U Extension Data Files). 
+ */ +bool LanguageTag::canonicalizeUnicodeExtension( + JSContext* cx, JS::UniqueChars& unicodeExtension, + UnicodeExtensionCanonicalForm canonicalForm) { + const char* const extension = unicodeExtension.get(); + MOZ_ASSERT(extension[0] == 'u'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT(IsStructurallyValidExtensionTag({extension, strlen(extension)})); + + size_t length = strlen(extension); + + LanguageTagParser::AttributesVector attributes(cx); + LanguageTagParser::KeywordsVector keywords(cx); + + using Attribute = LanguageTagParser::AttributesVector::ElementType; + using Keyword = LanguageTagParser::KeywordsVector::ElementType; + + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, ok, + LanguageTagParser::parseUnicodeExtension( + cx, ConstCharRange(extension, length), attributes, keywords)); + MOZ_ASSERT(ok, "unexpected invalid Unicode extension subtag"); + + auto attributesLessOrEqual = [extension](const Attribute& a, + const Attribute& b) { + const char* astr = a.begin(extension); + const char* bstr = b.begin(extension); + size_t alen = a.length(); + size_t blen = b.length(); + + if (int r = + std::char_traits<char>::compare(astr, bstr, std::min(alen, blen))) { + return r < 0; + } + return alen <= blen; + }; + + // All attributes are sorted in alphabetical order. 
+ size_t attributesLength = attributes.length(); + if (attributesLength > 1) { + if (!attributes.growByUninitialized(attributesLength)) { + return false; + } + + MOZ_ALWAYS_TRUE( + MergeSort(attributes.begin(), attributesLength, + attributes.begin() + attributesLength, + [&](const auto& a, const auto& b, bool* lessOrEqualp) { + *lessOrEqualp = attributesLessOrEqual(a, b); + return true; + })); + + attributes.shrinkBy(attributesLength); + } + + auto keywordsLessOrEqual = [extension](const Keyword& a, const Keyword& b) { + const char* astr = a.begin(extension); + const char* bstr = b.begin(extension); + MOZ_ASSERT(a.length() >= UnicodeKeyLength); + MOZ_ASSERT(b.length() >= UnicodeKeyLength); + + return std::char_traits<char>::compare(astr, bstr, UnicodeKeyLength) <= 0; + }; + + // All keywords are sorted by alphabetical order of keys. + size_t keywordsLength = keywords.length(); + if (keywordsLength > 1) { + if (!keywords.growByUninitialized(keywordsLength)) { + return false; + } + + // Using merge sort, being a stable sort algorithm, guarantees that two + // keywords using the same key are never reordered. That means for example + // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to + // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs + // before "nu-latn". + // This is required so that deduplication below preserves the first keyword + // for a given key and discards the rest. + MOZ_ALWAYS_TRUE(MergeSort( + keywords.begin(), keywordsLength, keywords.begin() + keywordsLength, + [&](const auto& a, const auto& b, bool* lessOrEqualp) { + *lessOrEqualp = keywordsLessOrEqual(a, b); + return true; + })); + + keywords.shrinkBy(keywordsLength); + } + + Vector<char, 32> sb(cx); + if (!sb.append('u')) { + return false; + } + + // Append all Unicode extension attributes. + for (size_t i = 0; i < attributes.length(); i++) { + const auto& attribute = attributes[i]; + + // Skip duplicate attributes. 
+ if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) { + const auto& lastAttribute = attributes[i - 1]; + if (attribute.length() == lastAttribute.length() && + std::char_traits<char>::compare(attribute.begin(extension), + lastAttribute.begin(extension), + attribute.length()) == 0) { + continue; + } + MOZ_ASSERT(!attributesLessOrEqual(attribute, lastAttribute)); + } + + if (!sb.append('-')) { + return false; + } + if (!sb.append(attribute.begin(extension), attribute.length())) { + return false; + } + } + + static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; + + static auto isTrue = [](const ConstCharRange& type) { + constexpr char True[] = "true"; + const size_t TrueLength = strlen(True); + return type.length() == TrueLength && + std::char_traits<char>::compare(type.begin().get(), True, + TrueLength) == 0; + }; + + auto appendKey = [&sb, extension](const Keyword& keyword) { + MOZ_ASSERT(keyword.length() == UnicodeKeyLength); + return sb.append(keyword.begin(extension), UnicodeKeyLength); + }; + + auto appendKeyword = [&sb, extension](const Keyword& keyword, + const ConstCharRange& type) { + MOZ_ASSERT(keyword.length() > UnicodeKeyLength); + + // Elide the Unicode extension type "true". + if (isTrue(type)) { + return sb.append(keyword.begin(extension), UnicodeKeyLength); + } + // Otherwise append the complete Unicode extension keyword. + return sb.append(keyword.begin(extension), keyword.length()); + }; + + auto appendReplacement = [&sb, extension](const Keyword& keyword, + const ConstCharRange& replacement) { + MOZ_ASSERT(keyword.length() > UnicodeKeyLength); + + // Elide the type "true" if present in the replacement. + if (isTrue(replacement)) { + return sb.append(keyword.begin(extension), UnicodeKeyLength); + } + // Otherwise append the Unicode key (including the separator) and the + // replaced type. 
+ return sb.append(keyword.begin(extension), UnicodeKeyWithSepLength) && + sb.append(replacement.begin().get(), replacement.length()); + }; + + // Append all Unicode extension keywords. + for (size_t i = 0; i < keywords.length(); i++) { + const auto& keyword = keywords[i]; + + // Skip duplicate keywords. + if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) { + const auto& lastKeyword = keywords[i - 1]; + if (std::char_traits<char>::compare(keyword.begin(extension), + lastKeyword.begin(extension), + UnicodeKeyLength) == 0) { + continue; + } + MOZ_ASSERT(!keywordsLessOrEqual(keyword, lastKeyword)); + } + + if (!sb.append('-')) { + return false; + } + + if (keyword.length() == UnicodeKeyLength) { + // Keyword without type value. + if (!appendKey(keyword)) { + return false; + } + } else { + ConstCharRange key(keyword.begin(extension), UnicodeKeyLength); + ConstCharRange type(keyword.begin(extension) + UnicodeKeyWithSepLength, + keyword.length() - UnicodeKeyWithSepLength); + + if (canonicalForm == UnicodeExtensionCanonicalForm::Yes) { + // Search if there's a replacement for the current Unicode keyword. + if (const char* replacement = replaceUnicodeExtensionType(key, type)) { + if (!appendReplacement( + keyword, ConstCharRange(replacement, strlen(replacement)))) { + return false; + } + } else { + if (!appendKeyword(keyword, type)) { + return false; + } + } + } else { + if (!appendKeyword(keyword, type)) { + return false; + } + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (sb.length() != length || + std::char_traits<char>::compare(sb.begin(), extension, length) != 0) { + // Null-terminate the new string and replace the previous extension. 
+ if (!sb.append('\0')) { + return false; + } + UniqueChars canonical(sb.extractOrCopyRawBuffer()); + if (!canonical) { + return false; + } + unicodeExtension = std::move(canonical); + } + + return true; +} + +template <class Buffer> +static bool LanguageTagToString(JSContext* cx, const LanguageTag& tag, + Buffer& sb) { + auto appendSubtag = [&sb](const auto& subtag) { + auto range = subtag.range(); + MOZ_ASSERT(range.length() > 0); + return sb.append(range.begin().get(), range.length()); + }; + + auto appendSubtagZ = [&sb](const char* subtag) { + MOZ_ASSERT(strlen(subtag) > 0); + return sb.append(subtag, strlen(subtag)); + }; + + auto appendSubtagsZ = [&sb, &appendSubtagZ](const auto& subtags) { + for (const auto& subtag : subtags) { + if (!sb.append('-') || !appendSubtagZ(subtag.get())) { + return false; + } + } + return true; + }; + + // Append the language subtag. + if (!appendSubtag(tag.language())) { + return false; + } + + // Append the script subtag if present. + if (tag.script().length() > 0) { + if (!sb.append('-') || !appendSubtag(tag.script())) { + return false; + } + } + + // Append the region subtag if present. + if (tag.region().length() > 0) { + if (!sb.append('-') || !appendSubtag(tag.region())) { + return false; + } + } + + // Append the variant subtags if present. + if (!appendSubtagsZ(tag.variants())) { + return false; + } + + // Append the extensions subtags if present. + if (!appendSubtagsZ(tag.extensions())) { + return false; + } + + // Append the private-use subtag if present. + if (tag.privateuse()) { + if (!sb.append('-') || !appendSubtagZ(tag.privateuse())) { + return false; + } + } + + return true; +} + +/** + * CanonicalizeTransformExtension + * + * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: + * + * - These subtags are all in lowercase (that is the canonical casing for these + * subtags), [...]. 
+ * + * And per + * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All keywords and tfields are sorted by alphabetical order of their keys, + * within their respective extensions. + */ +bool LanguageTag::canonicalizeTransformExtension( + JSContext* cx, JS::UniqueChars& transformExtension) { + const char* const extension = transformExtension.get(); + MOZ_ASSERT(extension[0] == 't'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT(IsStructurallyValidExtensionTag({extension, strlen(extension)})); + + size_t length = strlen(extension); + + LanguageTag tag(cx); + LanguageTagParser::TFieldVector fields(cx); + + using TField = LanguageTagParser::TFieldVector::ElementType; + + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, ok, + LanguageTagParser::parseTransformExtension( + cx, ConstCharRange(extension, length), tag, fields)); + MOZ_ASSERT(ok, "unexpected invalid transform extension subtag"); + + auto tfieldLessOrEqual = [extension](const TField& a, const TField& b) { + MOZ_ASSERT(a.length() > TransformKeyLength); + MOZ_ASSERT(b.length() > TransformKeyLength); + const char* astr = a.begin(extension); + const char* bstr = b.begin(extension); + return std::char_traits<char>::compare(astr, bstr, TransformKeyLength) <= 0; + }; + + // All tfields are sorted by alphabetical order of their keys. + size_t fieldsLength = fields.length(); + if (fieldsLength > 1) { + if (!fields.growByUninitialized(fieldsLength)) { + return false; + } + + MOZ_ALWAYS_TRUE( + MergeSort(fields.begin(), fieldsLength, fields.begin() + fieldsLength, + [&](const auto& a, const auto& b, bool* lessOrEqualp) { + *lessOrEqualp = tfieldLessOrEqual(a, b); + return true; + })); + + fields.shrinkBy(fieldsLength); + } + + Vector<char, 32> sb(cx); + if (!sb.append('t')) { + return false; + } + + // Append the language subtag if present. + // + // [1] is a bit unclear whether or not the `tlang` subtag also needs to be + // canonicalized (and case-adjusted). 
For now simply append it as is. + // (|parseTransformExtension| doesn't alter case from the lowercased form we + // have previously taken pains to ensure is present in the extension, so no + // special effort is required to ensure lowercasing.) If we switch to [2], the + // `tlang` subtag also needs to be canonicalized according to the same rules + // as `unicode_language_id` subtags are canonicalized. Also see [3]. + // + // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier + // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers + // [3] https://github.com/tc39/ecma402/issues/330 + if (tag.language().length() > 0) { + if (!sb.append('-')) { + return false; + } + if (!LanguageTagToString(cx, tag, sb)) { + return false; + } + } + + // Append all fields. + // + // UTS 35, 3.2.1 specifies: + // - Any type or tfield value "true" is removed. + // + // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore + // this apparently invalid part of the UTS 35 specification and simply + // append all `tfield` subtags. + for (const auto& field : fields) { + if (!sb.append('-')) { + return false; + } + if (!sb.append(field.begin(extension), field.length())) { + return false; + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (sb.length() != length || + std::char_traits<char>::compare(sb.begin(), extension, length) != 0) { + // Null-terminate the new string and replace the previous extension. + if (!sb.append('\0')) { + return false; + } + UniqueChars canonical(sb.extractOrCopyRawBuffer()); + if (!canonical) { + return false; + } + transformExtension = std::move(canonical); + } + + return true; +} + +bool LanguageTag::appendTo(JSContext* cx, StringBuffer& sb) const { + return LanguageTagToString(cx, *this, sb); +} + +// Zero-terminated ICU Locale ID. 
+using LocaleId = + js::Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>; + +enum class LikelySubtags : bool { Add, Remove }; + +// Return true iff the language tag is already maximized resp. minimized. +static bool HasLikelySubtags(LikelySubtags likelySubtags, + const LanguageTag& tag) { + // The language tag is already maximized if the language, script, and region + // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are + // used. + if (likelySubtags == LikelySubtags::Add) { + return !tag.language().equalTo("und") && + (tag.script().length() > 0 && !tag.script().equalTo("Zzzz")) && + (tag.region().length() > 0 && !tag.region().equalTo("ZZ")); + } + + // The language tag is already minimized if it only contains a language + // subtag whose value is not the placeholder value "und". + return !tag.language().equalTo("und") && tag.script().length() == 0 && + tag.region().length() == 0; +} + +// Create an ICU locale ID from the given language tag. +static bool CreateLocaleForLikelySubtags(const LanguageTag& tag, + LocaleId& locale) { + MOZ_ASSERT(locale.length() == 0); + + auto appendSubtag = [&locale](const auto& subtag) { + auto range = subtag.range(); + MOZ_ASSERT(range.length() > 0); + return locale.append(range.begin().get(), range.length()); + }; + + // Append the language subtag. + if (!appendSubtag(tag.language())) { + return false; + } + + // Append the script subtag if present. + if (tag.script().length() > 0) { + if (!locale.append('_') || !appendSubtag(tag.script())) { + return false; + } + } + + // Append the region subtag if present. + if (tag.region().length() > 0) { + if (!locale.append('_') || !appendSubtag(tag.region())) { + return false; + } + } + + // Zero-terminated for use with ICU. + return locale.append('\0'); +} + +// Assign the language, script, and region subtags from an ICU locale ID. 
+// +// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to +// retrieve these subtags, but unfortunately these functions are rather slow, so +// we use our own implementation. +static bool AssignFromLocaleId(JSContext* cx, LocaleId& localeId, + LanguageTag& tag) { + MOZ_ASSERT(localeId.back() == '\0', + "Locale ID should be zero-terminated for ICU"); + + // Replace the ICU locale ID separator. + std::replace(localeId.begin(), localeId.end(), '_', '-'); + + // ICU replaces "und" with the empty string, which means "und" becomes "" and + // "und-Latn" becomes "-Latn". Handle this case separately. + if (localeId[0] == '\0' || localeId[0] == '-') { + static constexpr char und[] = "und"; + size_t length = strlen(und); + + // Insert "und" in front of the locale ID. + if (!localeId.growBy(length)) { + return false; + } + memmove(localeId.begin() + length, localeId.begin(), localeId.length()); + memmove(localeId.begin(), und, length); + } + + ConstCharRange localeRange(localeId.begin(), localeId.length() - 1); + + // Retrieve the language, script, and region subtags from the locale ID, but + // ignore any other subtags. + LanguageTag localeTag(cx); + if (!LanguageTagParser::parseBaseName(cx, localeRange, localeTag)) { + return false; + } + + tag.setLanguage(localeTag.language()); + tag.setScript(localeTag.script()); + tag.setRegion(localeTag.region()); + + return true; +} + +template <decltype(uloc_addLikelySubtags) likelySubtagsFn> +static bool CallLikelySubtags(JSContext* cx, const LocaleId& localeId, + LocaleId& result) { + // Locale ID must be zero-terminated before passing it to ICU. 
+ MOZ_ASSERT(localeId.back() == '\0'); + MOZ_ASSERT(result.length() == 0); + + int32_t length = intl::CallICU( + cx, + result, + [&localeId](char* chars, int32_t size, UErrorCode* status) { + return likelySubtagsFn(localeId.begin(), chars, size, status); + }); + if (length < 0) { + return false; + } + + MOZ_ASSERT( + size_t(length) <= LocaleId::InlineLength, + "Unexpected extra subtags were added by ICU. If this assertion ever " + "fails, simply remove it and move on like nothing ever happended."); + + // Resize the vector to the actual string length. + result.shrinkTo(length); + + // Zero-terminated for use with ICU. + return result.append('\0'); +} + +// The canonical way to compute the Unicode BCP 47 locale identifier with likely +// subtags is as follows: +// +// 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU +// locale ID. +// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. +// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into +// a Unicode BCP 47 locale identifier. +// +// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow +// and we know, by construction, that the input Unicode BCP 47 locale identifier +// only contains valid language, script, and region subtags, we can avoid both +// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and +// AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of +// |Intl.Locale.prototype.maximize|.) +static bool LikelySubtags(JSContext* cx, LikelySubtags likelySubtags, + LanguageTag& tag) { + // Return early if the input is already maximized/minimized. + if (HasLikelySubtags(likelySubtags, tag)) { + return true; + } + + // Create the locale ID for the input argument. 
+ LocaleId locale(cx); + if (!CreateLocaleForLikelySubtags(tag, locale)) { + return false; + } + + // UTS #35 requires that locale ID is maximized before its likely subtags are + // removed, so we need to call uloc_addLikelySubtags() for both cases. + // See <https://ssl.icu-project.org/trac/ticket/10220> and + // <https://ssl.icu-project.org/trac/ticket/12345>. + + LocaleId localeLikelySubtags(cx); + + // Add likely subtags to the locale ID. When minimizing we can skip adding the + // likely subtags for already maximized tags. (When maximizing we've already + // verified above that the tag is missing likely subtags.) + bool addLikelySubtags = likelySubtags == LikelySubtags::Add || + !HasLikelySubtags(LikelySubtags::Add, tag); + + if (addLikelySubtags) { + if (!CallLikelySubtags<uloc_addLikelySubtags>(cx, locale, + localeLikelySubtags)) { + return false; + } + } + + // Now that we've succesfully maximized the locale, we can minimize it. + if (likelySubtags == LikelySubtags::Remove) { + if (addLikelySubtags) { + // Copy the maximized subtags back into |locale|. + locale = std::move(localeLikelySubtags); + localeLikelySubtags = LocaleId(cx); + } + + // Remove likely subtags from the locale ID. + if (!CallLikelySubtags<uloc_minimizeSubtags>(cx, locale, + localeLikelySubtags)) { + return false; + } + } + + // Assign the language, script, and region subtags from the locale ID. + if (!AssignFromLocaleId(cx, localeLikelySubtags, tag)) { + return false; + } + + // Update mappings in case ICU returned a non-canonical locale. 
+ return tag.canonicalizeBaseName(cx); +} + +bool LanguageTag::addLikelySubtags(JSContext* cx) { + return LikelySubtags(cx, LikelySubtags::Add, *this); +} + +bool LanguageTag::removeLikelySubtags(JSContext* cx) { + return LikelySubtags(cx, LikelySubtags::Remove, *this); +} + +LanguageTagParser::Token LanguageTagParser::nextToken() { + MOZ_ASSERT(index_ <= length_ + 1, "called after 'None' token was read"); + + TokenKind kind = TokenKind::None; + size_t tokenLength = 0; + for (size_t i = index_; i < length_; i++) { + // UTS 35, section 3.1. + // alpha = [A-Z a-z] ; + // digit = [0-9] ; + char16_t c = charAtUnchecked(i); + if (mozilla::IsAsciiAlpha(c)) { + kind |= TokenKind::Alpha; + } else if (mozilla::IsAsciiDigit(c)) { + kind |= TokenKind::Digit; + } else if (c == '-' && i > index_ && i + 1 < length_) { + break; + } else { + return {TokenKind::Error, 0, 0}; + } + tokenLength += 1; + } + + Token token{kind, index_, tokenLength}; + index_ += tokenLength + 1; + return token; +} + +UniqueChars LanguageTagParser::chars(JSContext* cx, size_t index, + size_t length) const { + // Add +1 to null-terminate the string. + auto chars = cx->make_pod_array<char>(length + 1); + if (chars) { + char* dest = chars.get(); + if (locale_.is<const JS::Latin1Char*>()) { + std::copy_n(locale_.as<const JS::Latin1Char*>() + index, length, dest); + } else { + std::copy_n(locale_.as<const char16_t*>() + index, length, dest); + } + dest[length] = '\0'; + } + return chars; +} + +UniqueChars LanguageTagParser::extension(JSContext* cx, const Token& start, + const Token& end) const { + MOZ_ASSERT(start.index() < end.index()); + + size_t length = end.index() - 1 - start.index(); + UniqueChars extension = chars(cx, start.index(), length); + if (extension) { + AsciiToLowerCase(extension.get(), length, extension.get()); + } + return extension; +} + +// Parse the `unicode_language_id` production. +// +// unicode_language_id = unicode_language_subtag +// (sep unicode_script_subtag)? 
+// (sep unicode_region_subtag)? +// (sep unicode_variant_subtag)* ; +// +// sep = "-" +// +// Note: Unicode CLDR locale identifier backward compatibility extensions +// removed from `unicode_language_id`. +// +// |tok| is the current token from |ts|. +// +// The trailing |parseType| argument corresponds to one of two modes. +// +// In the |BaseNameParsing::Normal| mode, our input is in unknown case and is +// potentially invalid. |tag| will be filled with canonically-cased output, and +// duplicate variants will lead to an error. +// +// In the |BaseNameParsing::WithinTransformExtension| mode, our input is the +// `tlang` in a lowercased `transform_extensions`. |tag| subtags will be +// directly copied from the input (i.e. in lowercase). Variant subtags in the +// `tlang` subtag may contain duplicates. +// +// Do not use this function directly: use |parseBaseName| or +// |parseTlangFromTransformExtension| instead. +JS::Result<bool> LanguageTagParser::internalParseBaseName( + JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok, + BaseNameParsing parseType) { +#ifdef DEBUG + auto isAsciiLowerCase = [](const auto& range) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + const char* ptr = range.begin().get(); + size_t length = range.length(); + return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<char>); + }; + auto isAsciiDigit = [](const auto& range) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + const char* ptr = range.begin().get(); + size_t length = range.length(); + return std::all_of(ptr, ptr + length, mozilla::IsAsciiDigit<char>); + }; +#endif + + if (ts.isLanguage(tok)) { + ts.copyChars(tok, tag.language_); + + // Language codes need to be in lower case. 
"JA" -> "ja" + if (parseType == BaseNameParsing::Normal) { + tag.language_.toLowerCase(); + } else { + MOZ_ASSERT(isAsciiLowerCase(tag.language_.range())); + } + + tok = ts.nextToken(); + } else { + MOZ_ASSERT(parseType == BaseNameParsing::Normal); + + // The language subtag is mandatory. + return false; + } + + if (ts.isScript(tok)) { + ts.copyChars(tok, tag.script_); + + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + if (parseType == BaseNameParsing::Normal) { + tag.script_.toTitleCase(); + } else { + MOZ_ASSERT(isAsciiLowerCase(tag.script_.range())); + } + + tok = ts.nextToken(); + } + + if (ts.isRegion(tok)) { + ts.copyChars(tok, tag.region_); + + // Region codes need to be in upper case. "bu" -> "BU" + if (parseType == BaseNameParsing::Normal) { + tag.region_.toUpperCase(); + } else { + MOZ_ASSERT_IF(tok.length() == 2, isAsciiLowerCase(tag.region_.range())); + MOZ_ASSERT_IF(tok.length() == 3, isAsciiDigit(tag.region_.range())); + } + + tok = ts.nextToken(); + } + + auto& variants = tag.variants_; + MOZ_ASSERT(variants.length() == 0); + while (ts.isVariant(tok)) { + auto variant = ts.chars(cx, tok); + if (!variant) { + return cx->alreadyReportedOOM(); + } + + if (parseType == BaseNameParsing::Normal) { + // Locale identifiers are case insensitive (UTS 35, section 3.2). + // All seen variants are compared ignoring case differences by using the + // lower case form. This allows to properly detect and reject variant + // repetitions with differing case, e.g. "en-variant-Variant". + AsciiToLowerCase(variant.get(), tok.length(), variant.get()); + + // Reject the Locale identifier if a duplicate variant was found. + // + // This linear-time verification step means the whole variant subtag + // checking is potentially quadratic. Language tags are unlikely to be + // deliberately pathological, so this is okay at least for now. 
+ for (const auto& seenVariant : variants) { + if (strcmp(variant.get(), seenVariant.get()) == 0) { + return false; + } + } + } else { + // When parsing variants in a `tlang` subtag, duplicates are allowed. + } + + if (!variants.append(std::move(variant))) { + return cx->alreadyReportedOOM(); + } + + tok = ts.nextToken(); + } + + return true; +} + +static mozilla::Variant<const Latin1Char*, const char16_t*> StringChars( + const char* locale) { + return mozilla::AsVariant(reinterpret_cast<const JS::Latin1Char*>(locale)); +} + +static mozilla::Variant<const Latin1Char*, const char16_t*> StringChars( + JSLinearString* linear, JS::AutoCheckCannotGC& nogc) { + if (linear->hasLatin1Chars()) { + return mozilla::AsVariant(linear->latin1Chars(nogc)); + } + return mozilla::AsVariant(linear->twoByteChars(nogc)); +} + +JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx, + JSLinearString* locale, + LanguageTag& tag) { + JS::AutoCheckCannotGC nogc; + LocaleChars localeChars = StringChars(locale, nogc); + + // unicode_locale_id = unicode_language_id + // extensions* + // pu_extensions? ; + + LanguageTagParser ts(localeChars, locale->length()); + Token tok = ts.nextToken(); + + bool ok; + MOZ_TRY_VAR(ok, parseBaseName(cx, ts, tag, tok)); + if (!ok) { + return false; + } + + // extensions = unicode_locale_extensions + // | transformed_extensions + // | other_extensions ; + + // Bit set of seen singletons. + uint64_t seenSingletons = 0; + + auto& extensions = tag.extensions_; + while (ts.isExtensionStart(tok)) { + char singleton = ts.singletonKey(tok); + + // Reject the input if a duplicate singleton was found. + uint64_t hash = 1ULL << (mozilla::AsciiAlphanumericToNumber(singleton) + 1); + if (seenSingletons & hash) { + return false; + } + seenSingletons |= hash; + + Token start = tok; + tok = ts.nextToken(); + + // We'll check for missing non-singleton subtags after this block by + // comparing |startValue| with the then-current position. 
+ size_t startValue = tok.index(); + + if (singleton == 'u') { + while (ts.isUnicodeExtensionPart(tok)) { + tok = ts.nextToken(); + } + } else if (singleton == 't') { + // transformed_extensions = sep [tT] + // ((sep tlang (sep tfield)*) + // | (sep tfield)+) ; + + // tlang = unicode_language_subtag + // (sep unicode_script_subtag)? + // (sep unicode_region_subtag)? + // (sep unicode_variant_subtag)* ; + if (ts.isLanguage(tok)) { + tok = ts.nextToken(); + + if (ts.isScript(tok)) { + tok = ts.nextToken(); + } + + if (ts.isRegion(tok)) { + tok = ts.nextToken(); + } + + while (ts.isVariant(tok)) { + tok = ts.nextToken(); + } + } + + // tfield = tkey tvalue; + while (ts.isTransformExtensionKey(tok)) { + tok = ts.nextToken(); + + size_t startTValue = tok.index(); + while (ts.isTransformExtensionPart(tok)) { + tok = ts.nextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.index() <= startTValue) { + return false; + } + } + } else { + while (ts.isOtherExtensionPart(tok)) { + tok = ts.nextToken(); + } + } + + // Singletons must be followed by a non-singleton subtag, "en-a-b" is not + // allowed. + if (tok.index() <= startValue) { + return false; + } + + UniqueChars extension = ts.extension(cx, start, tok); + if (!extension) { + return cx->alreadyReportedOOM(); + } + if (!extensions.append(std::move(extension))) { + return cx->alreadyReportedOOM(); + } + } + + // Trailing `pu_extension` component of the `unicode_locale_id` production. + if (ts.isPrivateUseStart(tok)) { + Token start = tok; + tok = ts.nextToken(); + + size_t startValue = tok.index(); + while (ts.isPrivateUsePart(tok)) { + tok = ts.nextToken(); + } + + // There must be at least one subtag after the "-x-". + if (tok.index() <= startValue) { + return false; + } + + UniqueChars privateUse = ts.extension(cx, start, tok); + if (!privateUse) { + return cx->alreadyReportedOOM(); + } + tag.privateuse_ = std::move(privateUse); + } + + // Return true if the complete input was successfully parsed. 
+ return tok.isNone(); +} + +bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale, + LanguageTag& tag) { + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag)); + if (ok) { + return true; + } + if (UniqueChars localeChars = StringToNewUTF8CharsZ(cx, *locale)) { + JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr, + JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); + } + return false; +} + +bool LanguageTagParser::parseBaseName(JSContext* cx, ConstCharRange locale, + LanguageTag& tag) { + LocaleChars localeChars = StringChars(locale.begin().get()); + LanguageTagParser ts(localeChars, locale.length()); + Token tok = ts.nextToken(); + + // Parse only the base-name part and ignore any trailing characters. + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, parseBaseName(cx, ts, tag, tok)); + if (ok) { + return true; + } + if (UniqueChars localeChars = + DuplicateString(locale.begin().get(), locale.length())) { + JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr, + JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); + } else { + JS_ReportOutOfMemory(cx); + } + return false; +} + +// Parse |extension|, which must be a valid `transformed_extensions` subtag, and +// fill |tag| and |fields| from the `tlang` and `tfield` components. +JS::Result<bool> LanguageTagParser::parseTransformExtension( + JSContext* cx, ConstCharRange extension, LanguageTag& tag, + TFieldVector& fields) { + LocaleChars extensionChars = StringChars(extension.begin().get()); + LanguageTagParser ts(extensionChars, extension.length()); + Token tok = ts.nextToken(); + + if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 't') { + return false; + } + + tok = ts.nextToken(); + + if (tok.isNone()) { + return false; + } + + if (ts.isLanguage(tok)) { + // We're parsing a possible `tlang` in a known-valid transform extension, so + // use the special-purpose function that takes advantage of this to compute + // lowercased |tag| contents in an optimal manner. 
+ MOZ_TRY(parseTlangInTransformExtension(cx, ts, tag, tok)); + + // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end + // of the transform extension. + MOZ_ASSERT(ts.isTransformExtensionKey(tok) || tok.isNone()); + } else { + // If there's no `tlang` subtag, at least one `tfield` must be present. + MOZ_ASSERT(ts.isTransformExtensionKey(tok)); + } + + // Trailing `tfield` subtags. (Any other trailing subtags are an error, + // because we're guaranteed to only see a valid transform extension here.) + while (ts.isTransformExtensionKey(tok)) { + size_t begin = tok.index(); + tok = ts.nextToken(); + + size_t startTValue = tok.index(); + while (ts.isTransformExtensionPart(tok)) { + tok = ts.nextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.index() <= startTValue) { + return false; + } + + size_t length = tok.index() - 1 - begin; + if (!fields.emplaceBack(begin, length)) { + return cx->alreadyReportedOOM(); + } + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +// Parse |extension|, which must be a valid `unicode_locale_extensions` subtag, +// and fill |attributes| and |keywords| from the `attribute` and `keyword` +// components. 
+JS::Result<bool> LanguageTagParser::parseUnicodeExtension( + JSContext* cx, ConstCharRange extension, AttributesVector& attributes, + KeywordsVector& keywords) { + LocaleChars extensionChars = StringChars(extension.begin().get()); + LanguageTagParser ts(extensionChars, extension.length()); + Token tok = ts.nextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') { + return false; + } + + tok = ts.nextToken(); + + if (tok.isNone()) { + return false; + } + + while (ts.isUnicodeExtensionAttribute(tok)) { + if (!attributes.emplaceBack(tok.index(), tok.length())) { + return cx->alreadyReportedOOM(); + } + + tok = ts.nextToken(); + } + + // keyword = key (sep type)? ; + while (ts.isUnicodeExtensionKey(tok)) { + size_t begin = tok.index(); + tok = ts.nextToken(); + + while (ts.isUnicodeExtensionType(tok)) { + tok = ts.nextToken(); + } + + if (tok.isError()) { + return false; + } + + size_t length = tok.index() - 1 - begin; + if (!keywords.emplaceBack(begin, length)) { + return cx->alreadyReportedOOM(); + } + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +bool LanguageTagParser::canParseUnicodeExtension(ConstCharRange extension) { + LocaleChars extensionChars = StringChars(extension.begin().get()); + LanguageTagParser ts(extensionChars, extension.length()); + Token tok = ts.nextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') { + return false; + } + + tok = ts.nextToken(); + + if (tok.isNone()) { + return false; + } + + while (ts.isUnicodeExtensionAttribute(tok)) { + tok = ts.nextToken(); + } + + // keyword = key (sep type)? 
; + while (ts.isUnicodeExtensionKey(tok)) { + tok = ts.nextToken(); + + while (ts.isUnicodeExtensionType(tok)) { + tok = ts.nextToken(); + } + + if (tok.isError()) { + return false; + } + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +bool LanguageTagParser::canParseUnicodeExtensionType( + JSLinearString* unicodeType) { + JS::AutoCheckCannotGC nogc; + LocaleChars unicodeTypeChars = StringChars(unicodeType, nogc); + + LanguageTagParser ts(unicodeTypeChars, unicodeType->length()); + Token tok = ts.nextToken(); + + while (ts.isUnicodeExtensionType(tok)) { + tok = ts.nextToken(); + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +bool ParseStandaloneLanguagTag(HandleLinearString str, LanguageSubtag& result) { + auto isLanguage = [](const auto* language, size_t length) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + using T = std::remove_pointer_t<decltype(language)>; + return length >= 2 && length != 4 && length <= 8 && + std::all_of(language, language + length, mozilla::IsAsciiAlpha<T>); + }; + + JS::AutoCheckCannotGC nogc; + if (str->hasLatin1Chars()) { + if (!isLanguage(str->latin1Chars(nogc), str->length())) { + return false; + } + result.set(str->latin1Range(nogc)); + } else { + if (!isLanguage(str->twoByteChars(nogc), str->length())) { + return false; + } + result.set(str->twoByteRange(nogc)); + } + result.toLowerCase(); + return true; +} + +bool ParseStandaloneScriptTag(HandleLinearString str, ScriptSubtag& result) { + auto isScript = [](const auto* script, size_t length) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + using T = std::remove_pointer_t<decltype(script)>; + return length == ScriptLength && + std::all_of(script, script + ScriptLength, mozilla::IsAsciiAlpha<T>); + }; + + JS::AutoCheckCannotGC nogc; + if (str->hasLatin1Chars()) { + if (!isScript(str->latin1Chars(nogc), str->length())) { + return false; + } + result.set(str->latin1Range(nogc)); + } else { + if (!isScript(str->twoByteChars(nogc), str->length())) { + return false; + } + result.set(str->twoByteRange(nogc)); + } + result.toTitleCase(); + return true; +} + +bool ParseStandaloneRegionTag(HandleLinearString str, RegionSubtag& result) { + auto isRegion = [](const auto* region, size_t length) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + using T = std::remove_pointer_t<decltype(region)>; + return (length == AlphaRegionLength && + std::all_of(region, region + AlphaRegionLength, + mozilla::IsAsciiAlpha<T>)) || + (length == DigitRegionLength && + std::all_of(region, region + DigitRegionLength, + mozilla::IsAsciiDigit<T>)); + }; + + JS::AutoCheckCannotGC nogc; + if (str->hasLatin1Chars()) { + if (!isRegion(str->latin1Chars(nogc), str->length())) { + return false; + } + result.set(str->latin1Range(nogc)); + } else { + if (!isRegion(str->twoByteChars(nogc), str->length())) { + return false; + } + result.set(str->twoByteRange(nogc)); + } + result.toUpperCase(); + return true; +} + +template <typename CharT> +static bool IsAsciiLowercaseAlpha(const mozilla::Range<const CharT>& range) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + const CharT* ptr = range.begin().get(); + size_t length = range.length(); + return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<CharT>); +} + +static bool IsAsciiLowercaseAlpha(JSLinearString* str) { + JS::AutoCheckCannotGC nogc; + return str->hasLatin1Chars() ? 
IsAsciiLowercaseAlpha(str->latin1Range(nogc)) + : IsAsciiLowercaseAlpha(str->twoByteRange(nogc)); +} + +template <typename CharT> +static bool IsAsciiAlpha(const mozilla::Range<const CharT>& range) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + const CharT* ptr = range.begin().get(); + size_t length = range.length(); + return std::all_of(ptr, ptr + length, mozilla::IsAsciiAlpha<CharT>); +} + +static bool IsAsciiAlpha(JSLinearString* str) { + JS::AutoCheckCannotGC nogc; + return str->hasLatin1Chars() ? IsAsciiAlpha(str->latin1Range(nogc)) + : IsAsciiAlpha(str->twoByteRange(nogc)); +} + +JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx, + HandleLinearString str) { + // ISO-639 language codes contain either two or three characters. + size_t length = str->length(); + if (length != 2 && length != 3) { + return nullptr; + } + + // We can directly return the input below if it's in the correct case. + bool isLowerCase = IsAsciiLowercaseAlpha(str); + if (!isLowerCase) { + // Must be an ASCII alpha string. + if (!IsAsciiAlpha(str)) { + return nullptr; + } + } + + LanguageSubtag languageTag; + if (str->hasLatin1Chars()) { + JS::AutoCheckCannotGC nogc; + languageTag.set(str->latin1Range(nogc)); + } else { + JS::AutoCheckCannotGC nogc; + languageTag.set(str->twoByteRange(nogc)); + } + + if (!isLowerCase) { + // The language subtag is canonicalized to lower case. + languageTag.toLowerCase(); + } + + // Reject the input if the canonical tag contains more than just a single + // language subtag. + if (LanguageTag::complexLanguageMapping(languageTag)) { + return nullptr; + } + + // Take care to replace deprecated subtags with their preferred values. 
+ JSString* result; + if (LanguageTag::languageMapping(languageTag) || !isLowerCase) { + auto range = languageTag.range(); + result = NewStringCopyN<CanGC>(cx, range.begin().get(), range.length()); + } else { + result = str; + } + if (!result) { + return cx->alreadyReportedOOM(); + } + return result; +} + +} // namespace intl +} // namespace js diff --git a/js/src/builtin/intl/LanguageTag.h b/js/src/builtin/intl/LanguageTag.h new file mode 100644 index 0000000000..5f190757b8 --- /dev/null +++ b/js/src/builtin/intl/LanguageTag.h @@ -0,0 +1,722 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Structured representation of Unicode locale IDs used with Intl functions. */ + +#ifndef builtin_intl_LanguageTag_h +#define builtin_intl_LanguageTag_h + +#include "mozilla/Assertions.h" +#include "mozilla/Range.h" +#include "mozilla/TextUtils.h" +#include "mozilla/TypedEnumBits.h" +#include "mozilla/Variant.h" + +#include <algorithm> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <utility> + +#include "jsalloc.h" +#include "js/Result.h" + +#include "js/GCAPI.h" +#include "js/Utility.h" +#include "js/Vector.h" + +struct JSContext; +class JSLinearString; +class JSString; + +namespace js { + +class StringBuffer; + +namespace intl { + +#ifdef DEBUG + +/** + * Return true if |language| is a valid, case-normalized language subtag. + */ +template <typename CharT> +bool IsStructurallyValidLanguageTag( + const mozilla::Range<const CharT>& language); + +/** + * Return true if |script| is a valid, case-normalized script subtag. 
+ */ +template <typename CharT> +bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script); + +/** + * Return true if |region| is a valid, case-normalized region subtag. + */ +template <typename CharT> +bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region); + +/** + * Return true if |variant| is a valid, case-normalized variant subtag. + */ +bool IsStructurallyValidVariantTag(const mozilla::Range<const char>& variant); + +/** + * Return true if |extension| is a valid, case-normalized Unicode extension + * subtag. + */ +bool IsStructurallyValidUnicodeExtensionTag( + const mozilla::Range<const char>& extension); + +/** + * Return true if |privateUse| is a valid, case-normalized private-use subtag. + */ +bool IsStructurallyValidPrivateUseTag( + const mozilla::Range<const char>& privateUse); + +#endif + +template <typename CharT> +char AsciiToLowerCase(CharT c) { + MOZ_ASSERT(mozilla::IsAscii(c)); + return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c; +} + +template <typename CharT> +char AsciiToUpperCase(CharT c) { + MOZ_ASSERT(mozilla::IsAscii(c)); + return mozilla::IsAsciiLowercaseAlpha(c) ? (c & ~0x20) : c; +} + +template <typename CharT> +void AsciiToLowerCase(CharT* chars, size_t length, char* dest) { + // Tell the analysis the |std::transform| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + char (&fn)(CharT) = AsciiToLowerCase; + std::transform(chars, chars + length, dest, fn); +} + +template <typename CharT> +void AsciiToUpperCase(CharT* chars, size_t length, char* dest) { + // Tell the analysis the |std::transform| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + char (&fn)(CharT) = AsciiToUpperCase; + std::transform(chars, chars + length, dest, fn); +} + +template <typename CharT> +void AsciiToTitleCase(CharT* chars, size_t length, char* dest) { + if (length > 0) { + AsciiToUpperCase(chars, 1, dest); + AsciiToLowerCase(chars + 1, length - 1, dest + 1); + } +} + +// Constants for language subtag lengths. +namespace LanguageTagLimits { + +// unicode_language_subtag = alpha{2,3} | alpha{5,8} ; +static constexpr size_t LanguageLength = 8; + +// unicode_script_subtag = alpha{4} ; +static constexpr size_t ScriptLength = 4; + +// unicode_region_subtag = (alpha{2} | digit{3}) ; +static constexpr size_t RegionLength = 3; +static constexpr size_t AlphaRegionLength = 2; +static constexpr size_t DigitRegionLength = 3; + +// key = alphanum alpha ; +static constexpr size_t UnicodeKeyLength = 2; + +// tkey = alpha digit ; +static constexpr size_t TransformKeyLength = 2; + +} // namespace LanguageTagLimits + +// Fixed size language subtag which is stored inline in LanguageTag. 
+template <size_t Length> +class LanguageTagSubtag final { + uint8_t length_ = 0; + char chars_[Length]; + + public: + LanguageTagSubtag() = default; + + LanguageTagSubtag(const LanguageTagSubtag&) = delete; + LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete; + + size_t length() const { return length_; } + + mozilla::Range<const char> range() const { return {chars_, length_}; } + + template <typename CharT> + void set(const mozilla::Range<const CharT>& str) { + MOZ_ASSERT(str.length() <= Length); + std::copy_n(str.begin().get(), str.length(), chars_); + length_ = str.length(); + } + + void toLowerCase() { AsciiToLowerCase(chars_, length(), chars_); } + + void toUpperCase() { AsciiToUpperCase(chars_, length(), chars_); } + + void toTitleCase() { AsciiToTitleCase(chars_, length(), chars_); } + + template <size_t N> + bool equalTo(const char (&str)[N]) const { + static_assert(N - 1 <= Length, + "subtag literals must not exceed the maximum subtag length"); + + return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0; + } +}; + +using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>; +using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>; +using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>; + +/** + * Object representing a language tag. + * + * All subtags are already in canonicalized case. + */ +class MOZ_STACK_CLASS LanguageTag final { + LanguageSubtag language_ = {}; + ScriptSubtag script_ = {}; + RegionSubtag region_ = {}; + + using VariantsVector = Vector<JS::UniqueChars, 2>; + using ExtensionsVector = Vector<JS::UniqueChars, 2>; + + VariantsVector variants_; + ExtensionsVector extensions_; + JS::UniqueChars privateuse_ = nullptr; + + friend class LanguageTagParser; + + public: + // Flag to request canonicalized Unicode extensions. 
+ enum class UnicodeExtensionCanonicalForm : bool { No, Yes }; + + private: + bool canonicalizeUnicodeExtension( + JSContext* cx, JS::UniqueChars& unicodeExtension, + UnicodeExtensionCanonicalForm canonicalForm); + + bool canonicalizeTransformExtension(JSContext* cx, + JS::UniqueChars& transformExtension); + + public: + static bool languageMapping(LanguageSubtag& language); + static bool complexLanguageMapping(const LanguageSubtag& language); + + private: + static bool regionMapping(RegionSubtag& region); + static bool complexRegionMapping(const RegionSubtag& region); + + void performComplexLanguageMappings(); + void performComplexRegionMappings(); + + MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx); + + static const char* replaceUnicodeExtensionType( + const mozilla::Range<const char>& key, + const mozilla::Range<const char>& type); + + public: + explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {} + + LanguageTag(const LanguageTag&) = delete; + LanguageTag& operator=(const LanguageTag&) = delete; + + const LanguageSubtag& language() const { return language_; } + const ScriptSubtag& script() const { return script_; } + const RegionSubtag& region() const { return region_; } + const auto& variants() const { return variants_; } + const auto& extensions() const { return extensions_; } + const char* privateuse() const { return privateuse_.get(); } + + /** + * Set the language subtag. The input must be a valid, case-normalized + * language subtag. + */ + template <size_t N> + void setLanguage(const char (&language)[N]) { + mozilla::Range<const char> range(language, N - 1); + MOZ_ASSERT(IsStructurallyValidLanguageTag(range)); + language_.set(range); + } + + /** + * Set the language subtag. The input must be a valid, case-normalized + * language subtag. 
+ */ + void setLanguage(const LanguageSubtag& language) { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range())); + language_.set(language.range()); + } + + /** + * Set the script subtag. The input must be a valid, case-normalized + * script subtag or the empty string. + */ + template <size_t N> + void setScript(const char (&script)[N]) { + mozilla::Range<const char> range(script, N - 1); + MOZ_ASSERT(IsStructurallyValidScriptTag(range)); + script_.set(range); + } + + /** + * Set the script subtag. The input must be a valid, case-normalized + * script subtag or the empty string. + */ + void setScript(const ScriptSubtag& script) { + MOZ_ASSERT(script.length() == 0 || + IsStructurallyValidScriptTag(script.range())); + script_.set(script.range()); + } + + /** + * Set the region subtag. The input must be a valid, case-normalized + * region subtag or the empty string. + */ + template <size_t N> + void setRegion(const char (&region)[N]) { + mozilla::Range<const char> range(region, N - 1); + MOZ_ASSERT(IsStructurallyValidRegionTag(range)); + region_.set(range); + } + + /** + * Set the region subtag. The input must be a valid, case-normalized + * region subtag or the empty string. + */ + void setRegion(const RegionSubtag& region) { + MOZ_ASSERT(region.length() == 0 || + IsStructurallyValidRegionTag(region.range())); + region_.set(region.range()); + } + + /** + * Removes all variant subtags. + */ + void clearVariants() { variants_.clearAndFree(); } + + /** + * Set the Unicode extension subtag. The input must be a valid, + * case-normalized Unicode extension subtag. + */ + bool setUnicodeExtension(JS::UniqueChars extension); + + /** + * Set the private-use subtag. The input must be a valid, case-normalized + * private-use subtag or the empty string. 
+ */ + void setPrivateuse(JS::UniqueChars privateuse) { + MOZ_ASSERT(!privateuse || + IsStructurallyValidPrivateUseTag( + {privateuse.get(), strlen(privateuse.get())})); + privateuse_ = std::move(privateuse); + } + + /** + * Canonicalize the base-name subtags, that means the language, script, + * region, and variant subtags. + */ + bool canonicalizeBaseName(JSContext* cx); + + /** + * Canonicalize all extension subtags. + */ + bool canonicalizeExtensions(JSContext* cx, + UnicodeExtensionCanonicalForm canonicalForm); + + /** + * Canonicalizes the given structurally valid Unicode BCP 47 locale + * identifier, including regularized case of subtags. For example, the + * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, + * where + * + * Zh ; 2*3ALPHA + * -haNS ; ["-" script] + * -bu ; ["-" region] + * -variant2 ; *("-" variant) + * -Variant1 + * -u-ca-chinese ; *("-" extension) + * -t-Zh-laTN + * -x-PRIVATE ; ["-" privateuse] + * + * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private + * + * UTS 35 specifies two different canonicalization algorithms. There's one to + * canonicalize BCP 47 language tags and other one to canonicalize Unicode + * locale identifiers. The latter one wasn't present when ECMA-402 was changed + * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags, + * so ECMA-402 currently only uses the former to canonicalize Unicode BCP 47 + * locale identifiers. + * + * Spec: ECMAScript Internationalization API Specification, 6.2.3. + * Spec: + * https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers + * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion + */ + bool canonicalize(JSContext* cx, + UnicodeExtensionCanonicalForm canonicalForm) { + return canonicalizeBaseName(cx) && + canonicalizeExtensions(cx, canonicalForm); + } + + /** + * Append the string representation of this language tag to the given + * string buffer. 
   */
  bool appendTo(JSContext* cx, StringBuffer& sb) const;

  /**
   * Add likely-subtags to the language tag.
   *
   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
   */
  bool addLikelySubtags(JSContext* cx);

  /**
   * Remove likely-subtags from the language tag.
   *
   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
   */
  bool removeLikelySubtags(JSContext* cx);
};

/**
 * Parser for Unicode BCP 47 locale identifiers.
 *
 * The parser reads its input either as Latin-1 or as two-byte characters (see
 * |LocaleChars|) and classifies subtags with the |is*| predicates below, each
 * of which mirrors one production of the UTS 35 grammar.
 *
 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
 */
class MOZ_STACK_CLASS LanguageTagParser final {
 public:
  // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
  //
  // The values are bit flags: |AlphaDigit| is the union of |Alpha| and
  // |Digit|.
  enum class TokenKind : uint8_t {
    None = 0b000,
    Alpha = 0b001,
    Digit = 0b010,
    AlphaDigit = 0b011,
    Error = 0b100
  };

 private:
  // A token is described by its kind plus the position and length of its
  // characters within the parser input.
  class Token final {
    size_t index_;
    size_t length_;
    TokenKind kind_;

   public:
    Token(TokenKind kind, size_t index, size_t length)
        : index_(index), length_(length), kind_(kind) {}

    TokenKind kind() const { return kind_; }
    size_t index() const { return index_; }
    size_t length() const { return length_; }

    bool isError() const { return kind_ == TokenKind::Error; }
    bool isNone() const { return kind_ == TokenKind::None; }
    bool isAlpha() const { return kind_ == TokenKind::Alpha; }
    bool isDigit() const { return kind_ == TokenKind::Digit; }
    bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
  };

  // The input characters, stored as either Latin-1 or two-byte characters.
  using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;

  // NOTE: |locale_| is held by reference, so the referenced variant has to
  // outlive the parser.
  const LocaleChars& locale_;
  size_t length_;
  size_t index_ = 0;

  LanguageTagParser(const LocaleChars& locale, size_t length)
      : locale_(locale), length_(length) {}

  // Read the character at |index| from whichever representation is active.
  // No ASCII check is performed.
  char16_t charAtUnchecked(size_t index) const {
    if (locale_.is<const JS::Latin1Char*>()) {
      return locale_.as<const JS::Latin1Char*>()[index];
    }
    return locale_.as<const char16_t*>()[index];
  }

  // Read the character at |index|, which must be ASCII (asserted).
  char charAt(size_t index) const {
    char16_t c = charAtUnchecked(index);
    MOZ_ASSERT(mozilla::IsAscii(c));
    return c;
  }

  // Copy the token characters into |subtag|.
  template <size_t N>
  void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
    size_t index = tok.index();
    size_t length = tok.length();
    if (locale_.is<const JS::Latin1Char*>()) {
      using T = const JS::Latin1Char;
      subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length));
    } else {
      using T = const char16_t;
      subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length));
    }
  }

  // Create a string copy of |length| characters starting at |index|.
  JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;

  // Create a string copy of the token characters.
  JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
    return chars(cx, tok.index(), tok.length());
  }

  Token nextToken();

  JS::UniqueChars extension(JSContext* cx, const Token& start,
                            const Token& end) const;

  // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
  //
  // Four character language subtags are not allowed in Unicode BCP 47 locale
  // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
  // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
  bool isLanguage(const Token& tok) const {
    return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
                             (5 <= tok.length() && tok.length() <= 8));
  }

  // unicode_script_subtag = alpha{4} ;
  bool isScript(const Token& tok) const {
    return tok.isAlpha() && tok.length() == 4;
  }

  // unicode_region_subtag = (alpha{2} | digit{3}) ;
  bool isRegion(const Token& tok) const {
    return (tok.isAlpha() && tok.length() == 2) ||
           (tok.isDigit() && tok.length() == 3);
  }

  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
  //
  // Only the length and, for four-character tokens, the leading digit are
  // checked here; the token kind is not inspected.
  bool isVariant(const Token& tok) const {
    return (5 <= tok.length() && tok.length() <= 8) ||
           (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
  }

  // Returns the code unit of the first character at the given singleton token.
  // Always returns the lower case form of an alphabetical character.
  char singletonKey(const Token& tok) const {
    MOZ_ASSERT(tok.length() == 1);
    char c = charAt(tok.index());
    // Setting bit 0x20 turns an ASCII uppercase letter into its lowercase form.
    return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c;
  }

  // extensions = unicode_locale_extensions |
  //              transformed_extensions |
  //              other_extensions ;
  //
  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;
  //
  // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
  //                                    (sep tfield)+) ;
  //
  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  bool isExtensionStart(const Token& tok) const {
    return tok.length() == 1 && singletonKey(tok) != 'x';
  }

  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  bool isOtherExtensionPart(const Token& tok) const {
    return 2 <= tok.length() && tok.length() <= 8;
  }

  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;
  // keyword = key (sep type)? ;
  bool isUnicodeExtensionPart(const Token& tok) const {
    return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
           isUnicodeExtensionAttribute(tok);
  }

  // attribute = alphanum{3,8} ;
  bool isUnicodeExtensionAttribute(const Token& tok) const {
    return 3 <= tok.length() && tok.length() <= 8;
  }

  // key = alphanum alpha ;
  //
  // Only the second character is tested explicitly.
  bool isUnicodeExtensionKey(const Token& tok) const {
    return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
  }

  // type = alphanum{3,8} (sep alphanum{3,8})* ;
  //
  // Tests a single part of a type; the length check is the same as the one
  // for attributes.
  bool isUnicodeExtensionType(const Token& tok) const {
    return 3 <= tok.length() && tok.length() <= 8;
  }

  // tkey = alpha digit ;
  bool isTransformExtensionKey(const Token& tok) const {
    return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
           mozilla::IsAsciiDigit(charAt(tok.index() + 1));
  }

  // tvalue = (sep alphanum{3,8})+ ;
  bool isTransformExtensionPart(const Token& tok) const {
    return 3 <= tok.length() && tok.length() <= 8;
  }

  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
  bool isPrivateUseStart(const Token& tok) const {
    return tok.length() == 1 && singletonKey(tok) == 'x';
  }

  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
  bool isPrivateUsePart(const Token& tok) const {
    return 1 <= tok.length() && tok.length() <= 8;
  }

  // Selects between the two users of |internalParseBaseName|.
  enum class BaseNameParsing : bool { Normal, WithinTransformExtension };

  // Helper function for use in |parseBaseName| and
  // |parseTlangInTransformExtension|.  Do not use this directly!
  static JS::Result<bool> internalParseBaseName(JSContext* cx,
                                                LanguageTagParser& ts,
                                                LanguageTag& tag, Token& tok,
                                                BaseNameParsing parseType);

  // Parse the `unicode_language_id` production, i.e. the
  // language/script/region/variants portion of a language tag, into |tag|,
  // which will be filled with canonical-cased components (lowercase language,
  // titlecase script, uppercase region, lowercased and alphabetized and
  // deduplicated variants).  |tok| must be the current token.
  static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
                                        LanguageTag& tag, Token& tok) {
    return internalParseBaseName(cx, ts, tag, tok, BaseNameParsing::Normal);
  }

  // Parse the `tlang` production within a parsed 't' transform extension.
  // The precise requirements for "previously parsed" are:
  //
  //   * the input begins from current token |tok| with a valid `tlang`
  //   * the `tlang` is wholly lowercase (*not* canonical case)
  //   * variant subtags in the `tlang` may contain duplicates and be
  //     unordered
  //
  // Return an error on internal failure. Otherwise, return a success value. If
  // there was no `tlang`, then |tag.language().missing()|. But if there was a
  // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
  // parse input: fully lowercase, variants in alphabetical order without
  // duplicates.
  static JS::Result<JS::Ok> parseTlangInTransformExtension(
      JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
    MOZ_ASSERT(ts.isLanguage(tok));
    // The boolean "parsed" result is asserted to be true and then dropped, so
    // callers only see success or an internal error.
    return internalParseBaseName(cx, ts, tag, tok,
                                 BaseNameParsing::WithinTransformExtension)
        .map([](bool parsed) {
          MOZ_ASSERT(parsed);
          return JS::Ok();
        });
  }

  friend class LanguageTag;

  // A (begin offset, length) pair; |begin(ptr)| resolves the offset against a
  // caller-supplied base pointer.
  class Range final {
    size_t begin_;
    size_t length_;

   public:
    Range(size_t begin, size_t length) : begin_(begin), length_(length) {}

    template <typename T>
    T* begin(T* ptr) const {
      return ptr + begin_;
    }

    size_t length() const { return length_; }
  };

  using TFieldVector = js::Vector<Range, 8>;
  using AttributesVector = js::Vector<Range, 8>;
  using KeywordsVector = js::Vector<Range, 8>;

  // Parse |extension|, which must be a validated, fully lowercase
  // `transformed_extensions` subtag, and fill |tag| and |fields| from the
  // `tlang` and `tfield` components.  Data in |tag| is lowercase, consistent
  // with |extension|.
  static JS::Result<bool> parseTransformExtension(
      JSContext* cx, mozilla::Range<const char> extension, LanguageTag& tag,
      TFieldVector& fields);

  // Parse |extension|, which must be a validated, fully lowercase
  // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
  // from the `attribute` and `keyword` components.
  static JS::Result<bool> parseUnicodeExtension(
      JSContext* cx, mozilla::Range<const char> extension,
      AttributesVector& attributes, KeywordsVector& keywords);

 public:
  // Parse the input string as a language tag. Reports an error to the context
  // if the input can't be parsed completely.
  static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);

  // Parse the input string as a language tag. Returns Ok(true) if the input
  // could be completely parsed, Ok(false) if the input couldn't be parsed,
  // or Err() in case of internal error.
  static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
                                   LanguageTag& tag);

  // Parse the input string as the base-name parts (language, script, region,
  // variants) of a language tag. Ignores any trailing characters.
  static bool parseBaseName(JSContext* cx, mozilla::Range<const char> locale,
                            LanguageTag& tag);

  // Return true iff |extension| can be parsed as a Unicode extension subtag.
  static bool canParseUnicodeExtension(mozilla::Range<const char> extension);

  // Return true iff |unicodeType| can be parsed as a Unicode extension type.
  static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
};

MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)

/**
 * Parse a string as a standalone |language| tag. If |str| is a standalone
 * language tag, store it in case-normalized form in |result| and return true.
 * Otherwise return false.
+ */ +MOZ_MUST_USE bool ParseStandaloneLanguagTag(JS::Handle<JSLinearString*> str, + LanguageSubtag& result); + +/** + * Parse a string as a standalone |script| tag. If |str| is a standalone script + * tag, store it in case-normalized form in |result| and return true. Otherwise + * return false. + */ +MOZ_MUST_USE bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str, + ScriptSubtag& result); + +/** + * Parse a string as a standalone |region| tag. If |str| is a standalone region + * tag, store it in case-normalized form in |result| and return true. Otherwise + * return false. + */ +MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str, + RegionSubtag& result); + +/** + * Parse a string as an ISO-639 language code. Return |nullptr| in the result if + * the input could not be parsed or the canonical form of the resulting language + * tag contains more than a single language subtag. + */ +JS::Result<JSString*> ParseStandaloneISO639LanguageTag( + JSContext* cx, JS::Handle<JSLinearString*> str); + +} // namespace intl + +} // namespace js + +#endif /* builtin_intl_LanguageTag_h */ diff --git a/js/src/builtin/intl/LanguageTagGenerated.cpp b/js/src/builtin/intl/LanguageTagGenerated.cpp new file mode 100644 index 0000000000..8952286976 --- /dev/null +++ b/js/src/builtin/intl/LanguageTagGenerated.cpp @@ -0,0 +1,790 @@ +// Generated by make_intl_data.py. DO NOT EDIT. 
+// Version: CLDR-35.1 +// URL: https://unicode.org/Public/cldr/35.1/core.zip + +#include "mozilla/Assertions.h" +#include "mozilla/Range.h" +#include "mozilla/TextUtils.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <type_traits> + +#include "jscntxt.h" +#include "jsstr.h" + +#include "builtin/intl/LanguageTag.h" + +using namespace js::intl::LanguageTagLimits; +using ConstCharRange = mozilla::Range<const char>; + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline bool HasReplacement( + const char (&subtags)[Length][TagLength], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.range().begin().get(); + return std::binary_search(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); +} + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline const char* SearchReplacement( + const char (&subtags)[Length][TagLength], + const char* (&aliases)[Length], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.range().begin().get(); + auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); + if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { + return aliases[std::distance(std::begin(subtags), p)]; + } + return nullptr; +} + +// Mappings from language subtags to preferred values. +// Derived from CLDR Supplemental Data, version 35.1. 
// https://unicode.org/Public/cldr/35.1/core.zip
//
// Replaces |language| in place with its preferred value and returns true if a
// mapping exists; otherwise leaves |language| untouched and returns false.
// Only two- and three-letter subtags have entries.
bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));

  if (language.length() == 2) {
    // |languages| and |aliases| are parallel arrays: |aliases[i]| is the
    // replacement for |languages[i]|. |languages| is kept sorted because
    // |SearchReplacement| uses a binary search.
    static const char languages[9][3] = {
      "bh", "in", "iw", "ji", "jw", "mo", "no", "tl", "tw",
    };
    static const char* aliases[9] = {
      "bho", "id", "he", "yi", "jv", "ro", "nb", "fil", "ak",
    };

    if (const char* replacement = SearchReplacement(languages, aliases, language)) {
      language.set(ConstCharRange(replacement, strlen(replacement)));
      return true;
    }
    return false;
  }

  if (language.length() == 3) {
    // Same layout as above, for three-letter subtags.
    static const char languages[340][4] = {
      "aam", "aar", "abk", "adp", "afr", "aju", "aka", "alb", "als", "amh",
      "ara", "arb", "arg", "arm", "asm", "aue", "ava", "ave", "aym", "ayr",
      "ayx", "aze", "azj", "bak", "bam", "baq", "bcc", "bcl", "bel", "ben",
      "bgm", "bih", "bis", "bjd", "bod", "bos", "bre", "bul", "bur", "bxk",
      "bxr", "cat", "ccq", "ces", "cha", "che", "chi", "chu", "chv", "cjr",
      "cka", "cld", "cmk", "cmn", "cor", "cos", "coy", "cqu", "cre", "cwd",
      "cym", "cze", "dan", "deu", "dgo", "dhd", "dik", "diq", "div", "drh",
      "dut", "dzo", "ekk", "ell", "emk", "eng", "epo", "esk", "est", "eus",
      "ewe", "fao", "fas", "fat", "fij", "fin", "fra", "fre", "fry", "fuc",
      "ful", "gav", "gaz", "gbo", "geo", "ger", "gfx", "ggn", "gla", "gle",
      "glg", "glv", "gno", "gre", "grn", "gti", "gug", "guj", "guv", "gya",
      "hat", "hau", "hdn", "hea", "heb", "her", "him", "hin", "hmo", "hrr",
      "hrv", "hun", "hye", "ibi", "ibo", "ice", "ido", "iii", "ike", "iku",
      "ile", "ilw", "ina", "ind", "ipk", "isl", "ita", "jav", "jeg", "jpn",
      "kal", "kan", "kas", "kat", "kau", "kaz", "kgc", "kgh", "khk", "khm",
      "kik", "kin", "kir", "kmr", "knc", "kng", "knn", "koj", "kom", "kon",
      "kor", "kpv", "krm", "ktr", "kua", "kur", "kvs", "kwq", "kxe", "kzj",
      "kzt", "lao", "lat", "lav", "lbk", "lii", "lim", "lin", "lit", "lmm",
      "ltz", "lub", "lug", "lvs", "mac", "mah", "mal", "mao", "mar", "may",
      "meg", "mhr", "mkd", "mlg", "mlt", "mnk", "mol", "mon", "mri", "msa",
      "mst", "mup", "mwj", "mya", "myt", "nad", "nau", "nav", "nbl", "ncp",
      "nde", "ndo", "nep", "nld", "nno", "nnx", "nob", "nor", "npi", "nts",
      "nya", "oci", "ojg", "oji", "ori", "orm", "ory", "oss", "oun", "pan",
      "pbu", "pcr", "per", "pes", "pli", "plt", "pmc", "pmu", "pnb", "pol",
      "por", "ppa", "ppr", "pry", "pus", "puz", "que", "quz", "rmy", "roh",
      "ron", "rum", "run", "rus", "sag", "san", "sca", "scc", "scr", "sin",
      "skk", "slk", "slo", "slv", "sme", "smo", "sna", "snd", "som", "sot",
      "spa", "spy", "sqi", "src", "srd", "srp", "ssw", "sun", "swa", "swe",
      "swh", "tah", "tam", "tat", "tdu", "tel", "tgk", "tgl", "tha", "thc",
      "thx", "tib", "tie", "tir", "tkk", "tlw", "tmp", "tne", "ton", "tsf",
      "tsn", "tso", "ttq", "tuk", "tur", "twi", "uig", "ukr", "umu", "uok",
      "urd", "uzb", "uzn", "ven", "vie", "vol", "wel", "wln", "wol", "xba",
      "xho", "xia", "xkh", "xpe", "xsj", "xsl", "ybd", "ydd", "yid", "yma",
      "ymt", "yor", "yos", "yuu", "zai", "zha", "zho", "zsm", "zul", "zyb",
    };
    static const char* aliases[340] = {
      "aas",  "aa",  "ab",  "dz",  "af", "jrb",  "ak",  "sq",  "sq",  "am",
       "ar",  "ar",  "an",  "hy",  "as", "ktz",  "av",  "ae",  "ay",  "ay",
      "nun",  "az",  "az",  "ba",  "bm",  "eu", "bal", "bik",  "be",  "bn",
      "bcg", "bho",  "bi", "drl",  "bo",  "bs",  "br",  "bg",  "my", "luy",
      "bua",  "ca", "rki",  "cs",  "ch",  "ce",  "zh",  "cu",  "cv", "mom",
      "cmr", "syr", "xch",  "zh",  "kw",  "co", "pij", "quh",  "cr",  "cr",
       "cy",  "cs",  "da",  "de", "doi", "mwr", "din", "zza",  "dv",  "mn",
       "nl",  "dz",  "et",  "el", "man",  "en",  "eo",  "ik",  "et",  "eu",
       "ee",  "fo",  "fa",  "ak",  "fj",  "fi",  "fr",  "fr",  "fy",  "ff",
       "ff", "dev",  "om", "grb",  "ka",  "de", "vaj", "gvr",  "gd",  "ga",
       "gl",  "gv", "gon",  "el",  "gn", "nyc",  "gn",  "gu", "duz", "gba",
       "ht",  "ha", "hai", "hmn",  "he",  "hz", "srx",  "hi",  "ho", "jal",
       "hr",  "hu",  "hy", "opa",  "ig",  "is",  "io",  "ii",  "iu",  "iu",
       "ie", "gal",  "ia",  "id",  "ik",  "is",  "it",  "jv", "oyb",  "ja",
       "kl",  "kn",  "ks",  "ka",  "kr",  "kk", "tdf", "kml",  "mn",  "km",
       "ki",  "rw",  "ky",  "ku",  "kr",  "kg", "kok", "kwv",  "kv",  "kg",
       "ko",  "kv", "bmf", "dtp",  "kj",  "ku", "gdj", "yam", "tvd", "dtp",
      "dtp",  "lo",  "la",  "lv", "bnc", "raq",  "li",  "ln",  "lt", "rmx",
       "lb",  "lu",  "lg",  "lv",  "mk",  "mh",  "ml",  "mi",  "mr",  "ms",
      "cir", "chm",  "mk",  "mg",  "mt", "man",  "ro",  "mn",  "mi",  "ms",
      "mry", "raj", "vaj",  "my", "mry", "xny",  "na",  "nv",  "nr", "kdz",
       "nd",  "ng",  "ne",  "nl",  "nn", "ngv",  "nb",  "nb",  "ne", "pij",
       "ny",  "oc",  "oj",  "oj",  "or",  "om",  "or",  "os", "vaj",  "pa",
       "ps", "adx",  "fa",  "fa",  "pi",  "mg", "huw", "phr", "lah",  "pl",
       "pt", "bfy", "lcq", "prt",  "ps", "pub",  "qu",  "qu", "rom",  "rm",
       "ro",  "ro",  "rn",  "ru",  "sg",  "sa", "hle",  "sr",  "hr",  "si",
      "oyb",  "sk",  "sk",  "sl",  "se",  "sm",  "sn",  "sd",  "so",  "st",
       "es", "kln",  "sq",  "sc",  "sc",  "sr",  "ss",  "su",  "sw",  "sv",
       "sw",  "ty",  "ta",  "tt", "dtp",  "te",  "tg", "fil",  "th", "tpo",
      "oyb",  "bo", "ras",  "ti", "twm", "weo", "tyj", "kak",  "to", "taj",
       "tn",  "ts", "tmh",  "tk",  "tr",  "ak",  "ug",  "uk", "del", "ema",
       "ur",  "uz",  "uz",  "ve",  "vi",  "vo",  "cy",  "wa",  "wo", "cax",
       "xh", "acn", "waw", "kpe", "suj", "den", "rki",  "yi",  "yi", "lrr",
      "mtm",  "yo", "zom", "yug", "zap",  "za",  "zh",  "ms",  "zu",  "za",
    };

    if (const char* replacement = SearchReplacement(languages, aliases, language)) {
      language.set(ConstCharRange(replacement, strlen(replacement)));
      return true;
    }
    return false;
  }

  // Subtags of any other length have no entries in the tables above.
  return false;
}

// Language subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 35.1.
// https://unicode.org/Public/cldr/35.1/core.zip
//
// Returns true if |language| is one of the listed subtags ("sh" or one of the
// three-letter codes in the table below).
bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& language) {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));

  if (language.length() == 2) {
    return language.equalTo("sh");
  }

  if (language.length() == 3) {
    // Sorted, because |HasReplacement| performs a binary search.
    static const char languages[6][4] = {
      "cnr", "drw", "hbs", "prs", "swc", "tnf",
    };

    return HasReplacement(languages, language);
  }

  return false;
}

// Mappings from region subtags to preferred values.
// Derived from CLDR Supplemental Data, version 35.1.
// https://unicode.org/Public/cldr/35.1/core.zip
//
// Replaces |region| in place with its preferred value and returns true if a
// mapping exists; otherwise leaves |region| untouched and returns false.
bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) {
  MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));

  if (region.length() == 2) {
    // |regions| and |aliases| are parallel arrays: |aliases[i]| is the
    // replacement for |regions[i]|. |regions| is kept sorted because
    // |SearchReplacement| uses a binary search.
    static const char regions[23][3] = {
      "BU", "CS", "CT", "DD", "DY", "FQ", "FX", "HV", "JT", "MI",
      "NH", "NQ", "PU", "PZ", "QU", "RH", "TP", "UK", "VD", "WK",
      "YD", "YU", "ZR",
    };
    static const char* aliases[23] = {
      "MM", "RS", "KI", "DE", "BJ", "AQ", "FR", "BF", "UM", "UM",
      "VU", "AQ", "UM", "PA", "EU", "ZW", "TL", "GB", "VN", "UM",
      "YE", "RS", "CD",
    };

    if (const char* replacement = SearchReplacement(regions, aliases, region)) {
      region.set(ConstCharRange(replacement, strlen(replacement)));
      return true;
    }
    return false;
  }

  // Every region that is not two characters long is looked up here without a
  // separate length test; |SearchReplacement| asserts the three-character
  // length in debug builds.
  {
    static const char regions[300][4] = {
      "004", "008", "010", "012", "016", "020", "024", "028", "031", "032",
      "036", "040", "044", "048", "050", "051", "052", "056", "060", "062",
      "064", "068", "070", "072", "074", "076", "084", "086", "090", "092",
      "096", "100", "104", "108", "112", "116", "120", "124", "132", "136",
      "140", "144", "148", "152", "156", "158", "162", "166", "170", "174",
      "175", "178", "180", "184", "188", "191", "192", "196", "203", "204",
      "208", "212", "214", "218", "222", "226", "230", "231", "232", "233",
      "234", "238", "239", "242", "246", "248", "249", "250", "254", "258",
      "260", "262", "266", "268", "270", "275", "276", "278", "280", "288",
      "292", "296", "300", "304", "308", "312", "316", "320", "324", "328",
      "332", "334", "336", "340", "344", "348", "352", "356", "360", "364",
      "368", "372", "376", "380", "384", "388", "392", "398", "400", "404",
      "408", "410", "414", "417", "418", "422", "426", "428", "430", "434",
      "438", "440", "442", "446", "450", "454", "458", "462", "466", "470",
      "474", "478", "480", "484", "492", "496", "498", "499", "500", "504",
      "508", "512", "516", "520", "524", "528", "531", "533", "534", "535",
      "540", "548", "554", "558", "562", "566", "570", "574", "578", "580",
      "581", "583", "584", "585", "586", "591", "598", "600", "604", "608",
      "612", "616", "620", "624", "626", "630", "634", "638", "642", "643",
      "646", "652", "654", "659", "660", "662", "663", "666", "670", "674",
      "678", "682", "686", "688", "690", "694", "702", "703", "704", "705",
      "706", "710", "716", "720", "724", "728", "729", "732", "736", "740",
      "744", "748", "752", "756", "760", "762", "764", "768", "772", "776",
      "780", "784", "788", "792", "795", "796", "798", "800", "804", "807",
      "818", "826", "830", "831", "832", "833", "834", "840", "850", "854",
      "858", "860", "862", "876", "882", "886", "887", "891", "894", "958",
      "959", "960", "962", "963", "964", "965", "966", "967", "968", "969",
      "970", "971", "972", "973", "974", "975", "976", "977", "978", "979",
      "980", "981", "982", "983", "984", "985", "986", "987", "988", "989",
      "990", "991", "992", "993", "994", "995", "996", "997", "998", "999",
    };
    static const char* aliases[300] = {
       "AF",  "AL",  "AQ",  "DZ",  "AS",  "AD",  "AO",  "AG",  "AZ",  "AR",
       "AU",  "AT",  "BS",  "BH",  "BD",  "AM",  "BB",  "BE",  "BM", "034",
       "BT",  "BO",  "BA",  "BW",  "BV",  "BR",  "BZ",  "IO",  "SB",  "VG",
       "BN",  "BG",  "MM",  "BI",  "BY",  "KH",  "CM",  "CA",  "CV",  "KY",
       "CF",  "LK",  "TD",  "CL",  "CN",  "TW",  "CX",  "CC",  "CO",  "KM",
       "YT",  "CG",  "CD",  "CK",  "CR",  "HR",  "CU",  "CY",  "CZ",  "BJ",
       "DK",  "DM",  "DO",  "EC",  "SV",  "GQ",  "ET",  "ET",  "ER",  "EE",
       "FO",  "FK",  "GS",  "FJ",  "FI",  "AX",  "FR",  "FR",  "GF",  "PF",
       "TF",  "DJ",  "GA",  "GE",  "GM",  "PS",  "DE",  "DE",  "DE",  "GH",
       "GI",  "KI",  "GR",  "GL",  "GD",  "GP",  "GU",  "GT",  "GN",  "GY",
       "HT",  "HM",  "VA",  "HN",  "HK",  "HU",  "IS",  "IN",  "ID",  "IR",
       "IQ",  "IE",  "IL",  "IT",  "CI",  "JM",  "JP",  "KZ",  "JO",  "KE",
       "KP",  "KR",  "KW",  "KG",  "LA",  "LB",  "LS",  "LV",  "LR",  "LY",
       "LI",  "LT",  "LU",  "MO",  "MG",  "MW",  "MY",  "MV",  "ML",  "MT",
       "MQ",  "MR",  "MU",  "MX",  "MC",  "MN",  "MD",  "ME",  "MS",  "MA",
       "MZ",  "OM",  "NA",  "NR",  "NP",  "NL",  "CW",  "AW",  "SX",  "BQ",
       "NC",  "VU",  "NZ",  "NI",  "NE",  "NG",  "NU",  "NF",  "NO",  "MP",
       "UM",  "FM",  "MH",  "PW",  "PK",  "PA",  "PG",  "PY",  "PE",  "PH",
       "PN",  "PL",  "PT",  "GW",  "TL",  "PR",  "QA",  "RE",  "RO",  "RU",
       "RW",  "BL",  "SH",  "KN",  "AI",  "LC",  "MF",  "PM",  "VC",  "SM",
       "ST",  "SA",  "SN",  "RS",  "SC",  "SL",  "SG",  "SK",  "VN",  "SI",
       "SO",  "ZA",  "ZW",  "YE",  "ES",  "SS",  "SD",  "EH",  "SD",  "SR",
       "SJ",  "SZ",  "SE",  "CH",  "SY",  "TJ",  "TH",  "TG",  "TK",  "TO",
       "TT",  "AE",  "TN",  "TR",  "TM",  "TC",  "TV",  "UG",  "UA",  "MK",
       "EG",  "GB",  "JE",  "GG",  "JE",  "IM",  "TZ",  "US",  "VI",  "BF",
       "UY",  "UZ",  "VE",  "WF",  "WS",  "YE",  "YE",  "RS",  "ZM",  "AA",
       "QM",  "QN",  "QP",  "QQ",  "QR",  "QS",  "QT",  "EU",  "QV",  "QW",
       "QX",  "QY",  "QZ",  "XA",  "XB",  "XC",  "XD",  "XE",  "XF",  "XG",
       "XH",  "XI",  "XJ",  "XK",  "XL",  "XM",  "XN",  "XO",  "XP",  "XQ",
       "XR",  "XS",  "XT",  "XU",  "XV",  "XW",  "XX",  "XY",  "XZ",  "ZZ",
    };

    if (const char* replacement = SearchReplacement(regions, aliases, region)) {
      region.set(ConstCharRange(replacement, strlen(replacement)));
      return true;
    }
    return false;
  }
}

// Region subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 35.1.
+// https://unicode.org/Public/cldr/35.1/core.zip +bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) { + MOZ_ASSERT(IsStructurallyValidRegionTag(region.range())); + + if (region.length() == 2) { + return region.equalTo("AN") || + region.equalTo("NT") || + region.equalTo("PC") || + region.equalTo("SU"); + } + + { + static const char regions[8][4] = { + "172", "200", "530", "532", "536", "582", "810", "890", + }; + + return HasReplacement(regions, region); + } +} + +// Language subtags with complex mappings. +// Derived from CLDR Supplemental Data, version 35.1. +// https://unicode.org/Public/cldr/35.1/core.zip +void js::intl::LanguageTag::performComplexLanguageMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); + + if (language().equalTo("cnr")) { + setLanguage("sr"); + if (region().length() == 0) { + setRegion("ME"); + } + } + else if (language().equalTo("drw") || + language().equalTo("prs") || + language().equalTo("tnf")) { + setLanguage("fa"); + if (region().length() == 0) { + setRegion("AF"); + } + } + else if (language().equalTo("hbs") || + language().equalTo("sh")) { + setLanguage("sr"); + if (script().length() == 0) { + setScript("Latn"); + } + } + else if (language().equalTo("swc")) { + setLanguage("sw"); + if (region().length() == 0) { + setRegion("CD"); + } + } +} + +// Region subtags with complex mappings. +// Derived from CLDR Supplemental Data, version 35.1. 
+// https://unicode.org/Public/cldr/35.1/core.zip +void js::intl::LanguageTag::performComplexRegionMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region().range())); + + if (region().equalTo("172")) { + if (language().equalTo("hy") || + (language().equalTo("und") && script().equalTo("Armn"))) { + setRegion("AM"); + } + else if (language().equalTo("az") || + language().equalTo("tkr") || + language().equalTo("tly") || + language().equalTo("ttt")) { + setRegion("AZ"); + } + else if (language().equalTo("be")) { + setRegion("BY"); + } + else if (language().equalTo("ab") || + language().equalTo("ka") || + language().equalTo("os") || + (language().equalTo("und") && script().equalTo("Geor")) || + language().equalTo("xmf")) { + setRegion("GE"); + } + else if (language().equalTo("ky")) { + setRegion("KG"); + } + else if (language().equalTo("kk") || + (language().equalTo("ug") && script().equalTo("Cyrl"))) { + setRegion("KZ"); + } + else if (language().equalTo("gag")) { + setRegion("MD"); + } + else if (language().equalTo("tg")) { + setRegion("TJ"); + } + else if (language().equalTo("tk")) { + setRegion("TM"); + } + else if (language().equalTo("crh") || + language().equalTo("got") || + language().equalTo("ji") || + language().equalTo("rue") || + language().equalTo("uk") || + (language().equalTo("und") && script().equalTo("Goth"))) { + setRegion("UA"); + } + else if (language().equalTo("kaa") || + language().equalTo("sog") || + (language().equalTo("und") && script().equalTo("Sogd")) || + (language().equalTo("und") && script().equalTo("Sogo")) || + language().equalTo("uz")) { + setRegion("UZ"); + } + else { + setRegion("RU"); + } + } + else if (region().equalTo("200")) { + if (language().equalTo("sk")) { + setRegion("SK"); + } + else { + setRegion("CZ"); + } + } + else if (region().equalTo("530") || + region().equalTo("532") || + region().equalTo("AN")) { + if (language().equalTo("vic")) { + 
setRegion("SX"); + } + else { + setRegion("CW"); + } + } + else if (region().equalTo("536") || + region().equalTo("NT")) { + if (language().equalTo("akk") || + language().equalTo("ckb") || + (language().equalTo("ku") && script().equalTo("Arab")) || + language().equalTo("mis") || + language().equalTo("syr") || + (language().equalTo("und") && script().equalTo("Syrc")) || + (language().equalTo("und") && script().equalTo("Xsux")) || + (language().equalTo("und") && script().equalTo("Hatr"))) { + setRegion("IQ"); + } + else { + setRegion("SA"); + } + } + else if (region().equalTo("582") || + region().equalTo("PC")) { + if (language().equalTo("mh")) { + setRegion("MH"); + } + else if (language().equalTo("pau")) { + setRegion("PW"); + } + else { + setRegion("FM"); + } + } + else if (region().equalTo("810") || + region().equalTo("SU")) { + if (language().equalTo("hy") || + (language().equalTo("und") && script().equalTo("Armn"))) { + setRegion("AM"); + } + else if (language().equalTo("az") || + language().equalTo("tkr") || + language().equalTo("tly") || + language().equalTo("ttt")) { + setRegion("AZ"); + } + else if (language().equalTo("be")) { + setRegion("BY"); + } + else if (language().equalTo("et") || + language().equalTo("vro")) { + setRegion("EE"); + } + else if (language().equalTo("ab") || + language().equalTo("ka") || + language().equalTo("os") || + (language().equalTo("und") && script().equalTo("Geor")) || + language().equalTo("xmf")) { + setRegion("GE"); + } + else if (language().equalTo("ky")) { + setRegion("KG"); + } + else if (language().equalTo("kk") || + (language().equalTo("ug") && script().equalTo("Cyrl"))) { + setRegion("KZ"); + } + else if (language().equalTo("lt") || + language().equalTo("sgs")) { + setRegion("LT"); + } + else if (language().equalTo("ltg") || + language().equalTo("lv")) { + setRegion("LV"); + } + else if (language().equalTo("gag")) { + setRegion("MD"); + } + else if (language().equalTo("tg")) { + setRegion("TJ"); + } + else if 
(language().equalTo("tk")) { + setRegion("TM"); + } + else if (language().equalTo("crh") || + language().equalTo("got") || + language().equalTo("ji") || + language().equalTo("rue") || + language().equalTo("uk") || + (language().equalTo("und") && script().equalTo("Goth"))) { + setRegion("UA"); + } + else if (language().equalTo("kaa") || + language().equalTo("sog") || + (language().equalTo("und") && script().equalTo("Sogd")) || + (language().equalTo("und") && script().equalTo("Sogo")) || + language().equalTo("uz")) { + setRegion("UZ"); + } + else { + setRegion("RU"); + } + } + else if (region().equalTo("890")) { + if (language().equalTo("bs")) { + setRegion("BA"); + } + else if (language().equalTo("hr")) { + setRegion("HR"); + } + else if (language().equalTo("mk")) { + setRegion("MK"); + } + else if (language().equalTo("sl")) { + setRegion("SI"); + } + else { + setRegion("RS"); + } + } +} + +// Canonicalize grandfathered locale identifiers. +// Derived from CLDR Supplemental Data, version 35.1. +// https://unicode.org/Public/cldr/35.1/core.zip +bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { + // We're mapping regular grandfathered tags to non-grandfathered form here. + // Other tags remain unchanged. + // + // regular = "art-lojban" + // / "cel-gaulish" + // / "no-bok" + // / "no-nyn" + // / "zh-guoyu" + // / "zh-hakka" + // / "zh-min" + // / "zh-min-nan" + // / "zh-xiang" + // + // Therefore we can quickly exclude most tags by checking every + // |unicode_locale_id| subcomponent for characteristics not shared by any of + // the regular grandfathered (RG) tags: + // + // * Real-world |unicode_language_subtag|s are all two or three letters, + // so don't waste time running a useless |language.length > 3| fast-path. + // * No RG tag has a "script"-looking component. + // * No RG tag has a "region"-looking component. 
+ // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, + // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, + // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag + // that |unicode_locale_id| doesn't support.) + // * No RG tag contains |extensions| or |pu_extensions|. + if (script().length() != 0 || + region().length() != 0 || + variants().length() != 1 || + extensions().length() != 0 || + privateuse()) { + return true; + } + + auto variantEqualTo = [this](const char* variant) { + return strcmp(variants()[0].get(), variant) == 0; + }; + + // art-lojban -> jbo + if (language().equalTo("art") && variantEqualTo("lojban")) { + setLanguage("jbo"); + clearVariants(); + return true; + } + + // cel-gaulish -> xtg-x-cel-gaulish + else if (language().equalTo("cel") && variantEqualTo("gaulish")) { + setLanguage("xtg"); + clearVariants(); + + auto privateuse = DuplicateString(cx, "x-cel-gaulish"); + if (!privateuse) { + return false; + } + setPrivateuse(std::move(privateuse)); + return true; + } + + // zh-guoyu -> zh + else if (language().equalTo("zh") && variantEqualTo("guoyu")) { + setLanguage("zh"); + clearVariants(); + return true; + } + + // zh-hakka -> hak + else if (language().equalTo("zh") && variantEqualTo("hakka")) { + setLanguage("hak"); + clearVariants(); + return true; + } + + // zh-xiang -> hsn + else if (language().equalTo("zh") && variantEqualTo("xiang")) { + setLanguage("hsn"); + clearVariants(); + return true; + } + + return true; +} + +template <size_t Length> +static inline bool IsUnicodeKey(const ConstCharRange& key, + const char (&str)[Length]) { + static_assert(Length == UnicodeKeyLength + 1, + "Unicode extension key is two characters long"); + return memcmp(key.begin().get(), str, Length - 1) == 0; +} + +template <size_t Length> +static inline bool IsUnicodeType(const ConstCharRange& type, + const char (&str)[Length]) { + static_assert(Length > UnicodeKeyLength + 1, + "Unicode extension type contains 
more than two characters"); + return type.length() == (Length - 1) && + memcmp(type.begin().get(), str, Length - 1) == 0; +} + +static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) { +#ifdef DEBUG + auto isNull = [](char c) { + return c == '\0'; + }; +#endif + + MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull), + "unexpected null-character in string"); + + using UnsignedChar = unsigned char; + for (size_t i = 0; i < b.length(); i++) { + // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if + // we've reached the end of |a|, the below if-statement will always be true. + // That ensures we don't read past the end of |a|. + if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) { + return r; + } + } + + // Return zero if both strings are equal or a negative number if |b| is a + // prefix of |a|. + return -int32_t(UnsignedChar(a[b.length()])); +}; + +template <size_t Length> +static inline const char* SearchReplacement(const char* (&types)[Length], + const char* (&aliases)[Length], + const ConstCharRange& type) { + + auto p = std::lower_bound(std::begin(types), std::end(types), type, + [](const auto& a, const auto& b) { + return CompareUnicodeType(a, b) < 0; + }); + if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) { + return aliases[std::distance(std::begin(types), p)]; + } + return nullptr; +} + +/** + * Mapping from deprecated BCP 47 Unicode extension types to their preferred + * values. 
+ * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + */ +const char* js::intl::LanguageTag::replaceUnicodeExtensionType( + const ConstCharRange& key, const ConstCharRange& type) { +#ifdef DEBUG + static auto isAsciiLowercaseAlphanumeric = [](char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); + }; + + static auto isAsciiLowercaseAlphanumericOrDash = [](char c) { + return isAsciiLowercaseAlphanumeric(c) || c == '-'; + }; +#endif + + MOZ_ASSERT(key.length() == UnicodeKeyLength); + MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(), + isAsciiLowercaseAlphanumeric)); + + MOZ_ASSERT(type.length() > UnicodeKeyLength); + MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(), + isAsciiLowercaseAlphanumericOrDash)); + + if (IsUnicodeKey(key, "ca")) { + if (IsUnicodeType(type, "ethiopic-amete-alem")) { + return "ethioaa"; + } + if (IsUnicodeType(type, "islamicc")) { + return "islamic-civil"; + } + } + else if (IsUnicodeKey(key, "kb") || + IsUnicodeKey(key, "kc") || + IsUnicodeKey(key, "kh") || + IsUnicodeKey(key, "kk") || + IsUnicodeKey(key, "kn")) { + if (IsUnicodeType(type, "yes")) { + return "true"; + } + } + else if (IsUnicodeKey(key, "ks")) { + if (IsUnicodeType(type, "primary")) { + return "level1"; + } + if (IsUnicodeType(type, "tertiary")) { + return "level3"; + } + } + else if (IsUnicodeKey(key, "ms")) { + if (IsUnicodeType(type, "imperial")) { + return "uksystem"; + } + } + else if (IsUnicodeKey(key, "rg") || + IsUnicodeKey(key, "sd")) { + static const char* types[116] = { + "cn11", "cn12", "cn13", "cn14", "cn15", "cn21", "cn22", "cn23", + "cn31", "cn32", "cn33", "cn34", "cn35", "cn36", "cn37", "cn41", + "cn42", "cn43", "cn44", "cn45", "cn46", "cn50", "cn51", "cn52", + "cn53", "cn54", "cn61", "cn62", "cn63", "cn64", "cn65", "cz10a", + "cz10b", "cz10c", "cz10d", "cz10e", "cz10f", "cz611", "cz612", "cz613", + "cz614", "cz615", "cz621", "cz622", "cz623", "cz624", "cz626", "cz627", + 
"czjc", "czjm", "czka", "czkr", "czli", "czmo", "czol", "czpa", + "czpl", "czpr", "czst", "czus", "czvy", "czzl", "fra", "frb", + "frc", "frd", "fre", "frf", "frg", "frh", "fri", "frj", + "frk", "frl", "frm", "frn", "fro", "frp", "frq", "frr", + "frs", "frt", "fru", "frv", "laxn", "lud", "lug", "lul", + "mrnkc", "nzn", "nzs", "omba", "omsh", "plds", "plkp", "pllb", + "plld", "pllu", "plma", "plmz", "plop", "plpd", "plpk", "plpm", + "plsk", "plsl", "plwn", "plwp", "plzp", "tteto", "ttrcm", "ttwto", + "twkhq", "twtnq", "twtpq", "twtxq", + }; + static const char* aliases[116] = { + "cnbj", "cntj", "cnhe", "cnsx", "cnmn", "cnln", "cnjl", "cnhl", + "cnsh", "cnjs", "cnzj", "cnah", "cnfj", "cnjx", "cnsd", "cnha", + "cnhb", "cnhn", "cngd", "cngx", "cnhi", "cncq", "cnsc", "cngz", + "cnyn", "cnxz", "cnsn", "cngs", "cnqh", "cnnx", "cnxj", "cz110", + "cz111", "cz112", "cz113", "cz114", "cz115", "cz663", "cz632", "cz633", + "cz634", "cz635", "cz641", "cz642", "cz643", "cz644", "cz646", "cz647", + "cz31", "cz64", "cz41", "cz52", "cz51", "cz80", "cz71", "cz53", + "cz32", "cz10", "cz20", "cz42", "cz63", "cz72", "frges", "frnaq", + "frara", "frbfc", "frbre", "frcvl", "frges", "frcor", "frbfc", "fridf", + "frocc", "frnaq", "frges", "frocc", "frhdf", "frnor", "frnor", "frpdl", + "frhdf", "frnaq", "frpac", "frara", "laxs", "lucl", "luec", "luca", + "mr13", "nzauk", "nzcan", "ombj", "omsj", "pl02", "pl04", "pl08", + "pl10", "pl06", "pl12", "pl14", "pl16", "pl20", "pl18", "pl22", + "pl26", "pl24", "pl28", "pl30", "pl32", "tttob", "ttmrc", "tttob", + "twkhh", "twtnn", "twnwt", "twtxg", + }; + return SearchReplacement(types, aliases, type); + } + else if (IsUnicodeKey(key, "tz")) { + static const char* types[28] = { + "aqams", "cnckg", "cnhrb", "cnkhg", "cuba", "egypt", + "eire", "est", "gmt0", "hongkong", "hst", "iceland", + "iran", "israel", "jamaica", "japan", "libya", "mst", + "navajo", "poland", "portugal", "prc", "roc", "rok", + "turkey", "uct", "usnavajo", "zulu", + }; + static 
const char* aliases[28] = { + "nzakl", "cnsha", "cnsha", "cnurc", "cuhav", "egcai", + "iedub", "utcw05", "gmt", "hkhkg", "utcw10", "isrey", + "irthr", "jeruslm", "jmkin", "jptyo", "lytip", "utcw07", + "usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel", + "trist", "utc", "usden", "utc", + }; + return SearchReplacement(types, aliases, type); + } + return nullptr; +} diff --git a/js/src/builtin/intl/Locale.cpp b/js/src/builtin/intl/Locale.cpp new file mode 100644 index 0000000000..2bd93732c2 --- /dev/null +++ b/js/src/builtin/intl/Locale.cpp @@ -0,0 +1,1372 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Intl.Locale implementation. */ + +#include "builtin/intl/Locale.h" + +#include "mozilla/ArrayUtils.h" +#include "mozilla/Assertions.h" +#include "mozilla/Casting.h" +#include "mozilla/Maybe.h" +#include "mozilla/Range.h" +#include "mozilla/TextUtils.h" + +#include <algorithm> +#include <iterator> +#include <string> +#include <string.h> +#include <utility> + +#include "jsapi.h" +#include "jsfriendapi.h" +#include "jscntxt.h" +#include "jsobjinlines.h" +#include "jswrapper.h" + +#include "builtin/intl/CommonFunctions.h" +#include "builtin/intl/LanguageTag.h" +#include "gc/Rooting.h" +#include "js/Conversions.h" +#include "js/TypeDecls.h" +#include "vm/GlobalObject.h" +#include "vm/String.h" +#include "vm/StringBuffer.h" + +#include "vm/NativeObject-inl.h" + +using namespace js; +using namespace js::intl::LanguageTagLimits; + +using intl::LanguageTag; +using intl::LanguageTagParser; + +const Class LocaleObject::class_ = { + js_Object_str, + JSCLASS_HAS_RESERVED_SLOTS(LocaleObject::SLOT_COUNT), +}; + +static inline bool IsLocale(HandleValue v) { + return v.isObject() && 
v.toObject().is<LocaleObject>(); +} + +// Return the length of the base-name subtags. +static size_t BaseNameLength(const LanguageTag& tag) { + size_t baseNameLength = tag.language().length(); + if (tag.script().length() > 0) { + baseNameLength += 1 + tag.script().length(); + } + if (tag.region().length() > 0) { + baseNameLength += 1 + tag.region().length(); + } + for (const auto& variant : tag.variants()) { + baseNameLength += 1 + strlen(variant.get()); + } + return baseNameLength; +} + +struct IndexAndLength { + size_t index; + size_t length; + + IndexAndLength(size_t index, size_t length) : index(index), length(length){}; + + template <typename T> + mozilla::Range<const T> rangeOf(const T* ptr) const { + return {ptr + index, length}; + } +}; + +// Compute the Unicode extension's index and length in the extension subtag. +static mozilla::Maybe<IndexAndLength> UnicodeExtensionPosition( + const LanguageTag& tag) { + size_t index = 0; + for (const auto& extension : tag.extensions()) { + size_t extensionLength = strlen(extension.get()); + if (extension[0] == 'u') { + return mozilla::Some(IndexAndLength{index, extensionLength}); + } + + // Add +1 to skip over the preceding separator. 
+ index += 1 + extensionLength; + } + return mozilla::Nothing(); +} + +static LocaleObject* CreateLocaleObject(JSContext* cx, HandleObject prototype, + const LanguageTag& tag) { + RootedObject proto(cx, prototype); + if (!proto) { + proto = GlobalObject::getOrCreateLocalePrototype(cx, cx->global()); + if (!proto) { + return nullptr; + } + } + + StringBuffer sb(cx); + if (!tag.appendTo(cx, sb)) { + return nullptr; + } + + RootedString tagStr(cx, sb.finishString()); + if (!tagStr) { + return nullptr; + } + + size_t baseNameLength = BaseNameLength(tag); + + RootedString baseName(cx, NewDependentString(cx, tagStr, 0, baseNameLength)); + if (!baseName) { + return nullptr; + } + + RootedValue unicodeExtension(cx, UndefinedValue()); + if (auto result = UnicodeExtensionPosition(tag)) { + JSString* str = NewDependentString( + cx, tagStr, baseNameLength + 1 + result->index, result->length); + if (!str) { + return nullptr; + } + + unicodeExtension.setString(str); + } + + auto* locale = NewObjectWithGivenProto<LocaleObject>(cx, proto); + if (!locale) { + return nullptr; + } + + locale->setFixedSlot(LocaleObject::LANGUAGE_TAG_SLOT, StringValue(tagStr)); + locale->setFixedSlot(LocaleObject::BASENAME_SLOT, StringValue(baseName)); + locale->setFixedSlot(LocaleObject::UNICODE_EXTENSION_SLOT, unicodeExtension); + + return locale; +} + +static inline bool IsValidUnicodeExtensionValue(JSLinearString* linear) { + return linear->length() > 0 && + LanguageTagParser::canParseUnicodeExtensionType(linear); +} + +/** Iterate through (sep keyword) in a valid, lowercased Unicode extension. */ +template <typename CharT> +class SepKeywordIterator { + const CharT* iter_; + const CharT* const end_; + + public: + SepKeywordIterator(const CharT* unicodeExtensionBegin, + const CharT* unicodeExtensionEnd) + : iter_(unicodeExtensionBegin), end_(unicodeExtensionEnd) {} + + /** + * Return (sep keyword) in the Unicode locale extension from begin to end. 
+ * The first call after all (sep keyword) are consumed returns |nullptr|; no + * further calls are allowed. + */ + const CharT* next() { + MOZ_ASSERT(iter_ != nullptr, + "can't call next() once it's returned nullptr"); + + constexpr size_t SepKeyLength = 1 + UnicodeKeyLength; // "-co"/"-nu"/etc. + + MOZ_ASSERT(iter_ + SepKeyLength <= end_, + "overall Unicode locale extension or non-leading subtags must " + "be at least key-sized"); + + MOZ_ASSERT((iter_[0] == 'u' && iter_[1] == '-') || iter_[0] == '-'); + + while (true) { + // Skip past '-' so |std::char_traits::find| makes progress. Skipping + // 'u' is harmless -- skip or not, |find| returns the first '-'. + iter_++; + + // Find the next separator. + iter_ = std::char_traits<CharT>::find( + iter_, mozilla::PointerRangeSize(iter_, end_), CharT('-')); + if (!iter_) { + return nullptr; + } + + MOZ_ASSERT(iter_ + SepKeyLength <= end_, + "non-leading subtags in a Unicode locale extension are all " + "at least as long as a key"); + + if (iter_ + SepKeyLength == end_ || // key is terminal subtag + iter_[SepKeyLength] == '-') { // key is followed by more subtags + break; + } + } + + MOZ_ASSERT(iter_[0] == '-'); + MOZ_ASSERT(mozilla::IsAsciiLowercaseAlpha(iter_[1]) || + mozilla::IsAsciiDigit(iter_[1])); + MOZ_ASSERT(mozilla::IsAsciiLowercaseAlpha(iter_[2])); + MOZ_ASSERT_IF(iter_ + SepKeyLength < end_, iter_[SepKeyLength] == '-'); + return iter_; + } +}; + +/** + * 9.2.10 GetOption ( options, property, type, values, fallback ) + * + * If the requested property is present and not-undefined, set the result string + * to |ToString(value)|. Otherwise set the result string to nullptr. + */ +static bool GetStringOption(JSContext* cx, HandleObject options, + HandlePropertyName name, + MutableHandle<JSLinearString*> string) { + // Step 1. + RootedValue option(cx); + if (!GetProperty(cx, options, options, name, &option)) { + return false; + } + + // Step 2. 
+ JSLinearString* linear = nullptr; + if (!option.isUndefined()) { + // Steps 2.a-b, 2.d (not applicable). + + // Steps 2.c, 2.e. + JSString* str = ToString(cx, option); + if (!str) { + return false; + } + linear = str->ensureLinear(cx); + if (!linear) { + return false; + } + } + + // Step 3. + string.set(linear); + return true; +} + +/** + * 9.2.10 GetOption ( options, property, type, values, fallback ) + * + * If the requested property is present and not-undefined, set the result string + * to |ToString(ToBoolean(value))|. Otherwise set the result string to nullptr. + */ +static bool GetBooleanOption(JSContext* cx, HandleObject options, + HandlePropertyName name, + MutableHandle<JSLinearString*> string) { + // Step 1. + RootedValue option(cx); + if (!GetProperty(cx, options, options, name, &option)) { + return false; + } + + // Step 2. + JSLinearString* linear = nullptr; + if (!option.isUndefined()) { + // Steps 2.a, 2.c-d (not applicable). + + // Steps 2.c, 2.e. + JSString* str = BooleanToString(cx, ToBoolean(option)); + MOZ_ALWAYS_TRUE(linear = str->ensureLinear(cx)); + } + + // Step 3. + string.set(linear); + return true; +} + +/** + * ApplyOptionsToTag ( tag, options ) + */ +static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag, + HandleObject options) { + // Steps 1-2 (Already performed in caller). + + RootedLinearString option(cx); + + // Step 3. + if (!GetStringOption(cx, options, cx->names().language, &option)) { + return false; + } + + // Step 4. + intl::LanguageSubtag language; + if (option && !intl::ParseStandaloneLanguagTag(option, language)) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "language", + str.get()); + } + return false; + } + + // Step 5. + if (!GetStringOption(cx, options, cx->names().script, &option)) { + return false; + } + + // Step 6. 
+ intl::ScriptSubtag script; + if (option && !intl::ParseStandaloneScriptTag(option, script)) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "script", str.get()); + } + return false; + } + + // Step 7. + if (!GetStringOption(cx, options, cx->names().region, &option)) { + return false; + } + + // Step 8. + intl::RegionSubtag region; + if (option && !intl::ParseStandaloneRegionTag(option, region)) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "region", str.get()); + } + return false; + } + + // Step 9 (Already performed in caller). + + // Skip steps 10-13 when no subtags were modified. + if (language.length() > 0 || script.length() > 0 || region.length() > 0) { + // Step 10. + if (language.length() > 0) { + tag.setLanguage(language); + } + + // Step 11. + if (script.length() > 0) { + tag.setScript(script); + } + + // Step 12. + if (region.length() > 0) { + tag.setRegion(region); + } + + // Step 13. + // Optimized to only canonicalize the base-name subtags. All other + // canonicalization steps will happen later. + if (!tag.canonicalizeBaseName(cx)) { + return true; + } + } + + return true; +} + +/** + * ApplyUnicodeExtensionToTag( tag, options, relevantExtensionKeys ) + */ +static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag, + HandleLinearString calendar, + HandleLinearString collation, + HandleLinearString hourCycle, + HandleLinearString caseFirst, + HandleLinearString numeric, + HandleLinearString numberingSystem) { + // If no Unicode extensions were present in the options object, we can skip + // everything below and directly return. 
+ if (!calendar && !collation && !caseFirst && !hourCycle && !numeric && + !numberingSystem) { + return true; + } + + Vector<char, 32> newExtension(cx); + if (!newExtension.append('u')) { + return false; + } + + // Check if there's an existing Unicode extension subtag. (The extension + // subtags aren't necessarily sorted, so we can't use binary search here.) + const UniqueChars* existingUnicodeExtension = + std::find_if(tag.extensions().begin(), tag.extensions().end(), + [](const auto& extension) { return extension[0] == 'u'; }); + + const char* unicodeExtensionEnd = nullptr; + const char* unicodeExtensionKeywords = nullptr; + if (existingUnicodeExtension != tag.extensions().end()) { + const char* unicodeExtension = existingUnicodeExtension->get(); + unicodeExtensionEnd = unicodeExtension + strlen(unicodeExtension); + + SepKeywordIterator<char> iter(unicodeExtension, unicodeExtensionEnd); + + // Find the start of the first keyword. + unicodeExtensionKeywords = iter.next(); + + // Copy any attributes present before the first keyword. + const char* attributesEnd = unicodeExtensionKeywords + ? unicodeExtensionKeywords + : unicodeExtensionEnd; + if (!newExtension.append(unicodeExtension + 1, attributesEnd)) { + return false; + } + } + + using UnicodeKeyWithSeparator = const char(&)[UnicodeKeyLength + 3]; + + auto appendKeyword = [&newExtension](UnicodeKeyWithSeparator key, + JSLinearString* value) { + if (!newExtension.append(key, UnicodeKeyLength + 2)) { + return false; + } + + JS::AutoCheckCannotGC nogc; + return value->hasLatin1Chars() + ? newExtension.append(value->latin1Chars(nogc), value->length()) + : newExtension.append(value->twoByteChars(nogc), + value->length()); + }; + + // Append the new keywords before any existing keywords. That way any previous + // keyword with the same key is detected as a duplicate when canonicalizing + // the Unicode extension subtag and gets discarded. 
+ + size_t startNewKeywords = newExtension.length(); + + if (calendar) { + if (!appendKeyword("-ca-", calendar)) { + return false; + } + } + if (collation) { + if (!appendKeyword("-co-", collation)) { + return false; + } + } + if (hourCycle) { + if (!appendKeyword("-hc-", hourCycle)) { + return false; + } + } + if (caseFirst) { + if (!appendKeyword("-kf-", caseFirst)) { + return false; + } + } + if (numeric) { + if (!appendKeyword("-kn-", numeric)) { + return false; + } + } + if (numberingSystem) { + if (!appendKeyword("-nu-", numberingSystem)) { + return false; + } + } + + // Normalize the case of the new keywords. + std::transform(newExtension.begin() + startNewKeywords, newExtension.end(), + newExtension.begin() + startNewKeywords, [](char c) { + return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c; + }); + + // Append the remaining keywords from the previous Unicode extension subtag. + if (unicodeExtensionKeywords) { + if (!newExtension.append(unicodeExtensionKeywords, unicodeExtensionEnd)) { + return false; + } + } + + // Null-terminate the new Unicode extension string. + if (!newExtension.append('\0')) { + return false; + } + + // Insert the new Unicode extension string into the language tag. 
+ UniqueChars newExtensionChars(newExtension.extractOrCopyRawBuffer()); + if (!newExtensionChars) { + return false; + } + return tag.setUnicodeExtension(std::move(newExtensionChars)); +} + +static JS::Result<JSString*> LanguageTagFromMaybeWrappedLocale(JSContext* cx, + JSObject* obj) { + if (obj->is<LocaleObject>()) { + return obj->as<LocaleObject>().languageTag(); + } + + JSObject* unwrapped = CheckedUnwrap(obj); + if (!unwrapped) { + /* ReportAccessDenied(cx); */ + return cx->alreadyReportedError(); + } + + if (!unwrapped->is<LocaleObject>()) { + return nullptr; + } + + RootedString tagStr(cx, unwrapped->as<LocaleObject>().languageTag()); + if (!cx->compartment()->wrap(cx, &tagStr)) { + return cx->alreadyReportedError(); + } + return tagStr.get(); +} + +/** + * Intl.Locale( tag[, options] ) + */ +static bool Locale(JSContext* cx, unsigned argc, Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + + // Step 1. + if (!ThrowIfNotConstructing(cx, args, "Intl.Locale")) { + return false; + } + + // Steps 2-6 (Inlined 9.1.14, OrdinaryCreateFromConstructor). + RootedObject proto(cx); + if (!GetPrototypeFromCallableConstructor(cx, args, &proto)) { + return false; + } + + // Steps 7-9. + HandleValue tagValue = args.get(0); + JSString* tagStr; + if (tagValue.isObject()) { + JS_TRY_VAR_OR_RETURN_FALSE( + cx, tagStr, + LanguageTagFromMaybeWrappedLocale(cx, &tagValue.toObject())); + if (!tagStr) { + tagStr = ToString(cx, tagValue); + if (!tagStr) { + return false; + } + } + } else if (tagValue.isString()) { + tagStr = tagValue.toString(); + } else { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_INVALID_LOCALES_ELEMENT); + return false; + } + + RootedLinearString tagLinearStr(cx, tagStr->ensureLinear(cx)); + if (!tagLinearStr) { + return false; + } + + // ApplyOptionsToTag, steps 2 and 9. 
+ LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) { + return false; + } + + if (!tag.canonicalizeBaseName(cx)) { + return false; + } + + // Steps 10-11. + if (args.hasDefined(1)) { + RootedObject options(cx, ToObject(cx, args[1])); + if (!options) { + return false; + } + + // Step 12. + if (!ApplyOptionsToTag(cx, tag, options)) { + return false; + } + + // Step 13 (not applicable). + + // Steps 14, 16. + RootedLinearString calendar(cx); + if (!GetStringOption(cx, options, cx->names().calendar, &calendar)) { + return false; + } + + // Step 15. + if (calendar) { + if (!IsValidUnicodeExtensionValue(calendar)) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *calendar)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "calendar", + str.get()); + } + return false; + } + } + + // Steps 17, 19. + RootedLinearString collation(cx); + if (!GetStringOption(cx, options, cx->names().collation, &collation)) { + return false; + } + + // Step 18. + if (collation) { + if (!IsValidUnicodeExtensionValue(collation)) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *collation)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "collation", + str.get()); + } + return false; + } + } + + // Steps 20-21. + RootedLinearString hourCycle(cx); + if (!GetStringOption(cx, options, cx->names().hourCycle, &hourCycle)) { + return false; + } + + if (hourCycle) { + if (!StringEqualsAscii(hourCycle, "h11") && + !StringEqualsAscii(hourCycle, "h12") && + !StringEqualsAscii(hourCycle, "h23") && + !StringEqualsAscii(hourCycle, "h24")) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *hourCycle)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "hourCycle", + str.get()); + } + return false; + } + } + + // Steps 22-23. 
+ RootedLinearString caseFirst(cx); + if (!GetStringOption(cx, options, cx->names().caseFirst, &caseFirst)) { + return false; + } + + if (caseFirst) { + if (!StringEqualsAscii(caseFirst, "upper") && + !StringEqualsAscii(caseFirst, "lower") && + !StringEqualsAscii(caseFirst, "false")) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *caseFirst)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, "caseFirst", + str.get()); + } + return false; + } + } + + // Steps 24-26. + RootedLinearString numeric(cx); + if (!GetBooleanOption(cx, options, cx->names().numeric, &numeric)) { + return false; + } + + // Steps 27, 29. + RootedLinearString numberingSystem(cx); + if (!GetStringOption(cx, options, cx->names().numberingSystem, + &numberingSystem)) { + return false; + } + + // Step 28. + if (numberingSystem) { + if (!IsValidUnicodeExtensionValue(numberingSystem)) { + if (UniqueChars str = StringToNewUTF8CharsZ(cx, *numberingSystem)) { + JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, + JSMSG_INVALID_OPTION_VALUE, + "numberingSystem", str.get()); + } + return false; + } + } + + // Step 30. + if (!ApplyUnicodeExtensionToTag(cx, tag, calendar, collation, hourCycle, + caseFirst, numeric, numberingSystem)) { + return false; + } + } + + // ApplyOptionsToTag, steps 9 and 13. + // ApplyUnicodeExtensionToTag, step 8. + if (!tag.canonicalizeExtensions( + cx, LanguageTag::UnicodeExtensionCanonicalForm::Yes)) { + return false; + } + + // Steps 6, 31-37. + JSObject* obj = CreateLocaleObject(cx, proto, tag); + if (!obj) { + return false; + } + + // Step 38. + args.rval().setObject(*obj); + return true; +} + +using UnicodeKey = const char (&)[UnicodeKeyLength + 1]; + +// Returns the tuple [index, length] of the `type` in the `keyword` in Unicode +// locale extension |extension| that has |key| as its `key`. If `keyword` lacks +// a type, the returned |index| will be where `type` would have been, and +// |length| will be set to zero. 
+template <typename CharT> +static mozilla::Maybe<IndexAndLength> FindUnicodeExtensionType( + const CharT* extension, size_t length, UnicodeKey key) { + MOZ_ASSERT(extension[0] == 'u'); + MOZ_ASSERT(extension[1] == '-'); + + const CharT* end = extension + length; + + SepKeywordIterator<CharT> iter(extension, end); + + // Search all keywords until a match was found. + const CharT* beginKey; + while (true) { + beginKey = iter.next(); + if (!beginKey) { + return mozilla::Nothing(); + } + + // Add +1 to skip over the separator preceding the keyword. + MOZ_ASSERT(beginKey[0] == '-'); + beginKey++; + + // Exit the loop on the first match. + if (std::equal(beginKey, beginKey + UnicodeKeyLength, key)) { + break; + } + } + + // Skip over the key. + const CharT* beginType = beginKey + UnicodeKeyLength; + + // Find the start of the next keyword. + const CharT* endType = iter.next(); + + // No further keyword present, the current keyword ends the Unicode extension. + if (!endType) { + endType = end; + } + + // If the keyword has a type, skip over the separator preceding the type. + if (beginType != endType) { + MOZ_ASSERT(beginType[0] == '-'); + beginType++; + } + return mozilla::Some(IndexAndLength{size_t(beginType - extension), + size_t(endType - beginType)}); +} + +static inline auto FindUnicodeExtensionType(JSLinearString* unicodeExtension, + UnicodeKey key) { + JS::AutoCheckCannotGC nogc; + return unicodeExtension->hasLatin1Chars() + ? FindUnicodeExtensionType(unicodeExtension->latin1Chars(nogc), + unicodeExtension->length(), key) + : FindUnicodeExtensionType(unicodeExtension->twoByteChars(nogc), + unicodeExtension->length(), key); +} + +// Return the sequence of types for the Unicode extension keyword specified by +// key or undefined when the keyword isn't present. +static bool GetUnicodeExtension(JSContext* cx, LocaleObject* locale, + UnicodeKey key, MutableHandleValue value) { + // Return undefined when no Unicode extension subtag is present. 
+ const Value& unicodeExtensionValue = locale->unicodeExtension(); + if (unicodeExtensionValue.isUndefined()) { + value.setUndefined(); + return true; + } + + JSLinearString* unicodeExtension = + unicodeExtensionValue.toString()->ensureLinear(cx); + if (!unicodeExtension) { + return false; + } + + // Find the type of the requested key in the Unicode extension subtag. + auto result = FindUnicodeExtensionType(unicodeExtension, key); + + // Return undefined if the requested key isn't present in the extension. + if (!result) { + value.setUndefined(); + return true; + } + + size_t index = result->index; + size_t length = result->length; + + // Otherwise return the type value of the found keyword. + JSString* str = NewDependentString(cx, unicodeExtension, index, length); + if (!str) { + return false; + } + value.setString(str); + return true; +} + +// Positions (index and length) of the subtags inside a base-name string. +// |script| and |region| are Nothing when BaseNameParts found no such subtag. +struct BaseNamePartsResult { + IndexAndLength language; + mozilla::Maybe<IndexAndLength> script; + mozilla::Maybe<IndexAndLength> region; +}; + +// Locates the language, script and region subtags in the |length| characters +// starting at |baseName| and returns their positions as a BaseNamePartsResult. +// Variant subtags are not reported. +template <typename CharT> +static BaseNamePartsResult BaseNameParts(const CharT* baseName, size_t length) { + size_t languageLength; + size_t scriptIndex = 0; + size_t regionIndex = 0; + size_t regionLength = 0; + + // Search the first separator to find the end of the language subtag. + if (const CharT* sep = std::char_traits<CharT>::find(baseName, length, '-')) { + languageLength = sep - baseName; + + // Add +1 to skip over the separator character. + size_t nextSubtag = languageLength + 1; + + // Script subtags are always four characters long, but take care for a four + // character long variant subtag. These start with a digit.
+ if ((nextSubtag + ScriptLength == length || + (nextSubtag + ScriptLength < length && + baseName[nextSubtag + ScriptLength] == '-')) && + mozilla::IsAsciiAlpha(baseName[nextSubtag])) { + scriptIndex = nextSubtag; + nextSubtag = scriptIndex + ScriptLength + 1; + } + + // Region subtags can be either two or three characters long. + if (nextSubtag < length) { + for (size_t rlen : {AlphaRegionLength, DigitRegionLength}) { + MOZ_ASSERT(nextSubtag + rlen <= length); + if (nextSubtag + rlen == length || baseName[nextSubtag + rlen] == '-') { + regionIndex = nextSubtag; + regionLength = rlen; + break; + } + } + } + } else { + // No separator found, the base-name consists of just a language subtag. + languageLength = length; + } + + IndexAndLength language{0, languageLength}; + MOZ_ASSERT(intl::IsStructurallyValidLanguageTag(language.rangeOf(baseName))); + + mozilla::Maybe<IndexAndLength> script{}; + if (scriptIndex) { + script.emplace(scriptIndex, ScriptLength); + MOZ_ASSERT(intl::IsStructurallyValidScriptTag(script->rangeOf(baseName))); + } + + mozilla::Maybe<IndexAndLength> region{}; + if (regionIndex) { + region.emplace(regionIndex, regionLength); + MOZ_ASSERT(intl::IsStructurallyValidRegionTag(region->rangeOf(baseName))); + } + + return {language, script, region}; +} + +static inline auto BaseNameParts(JSLinearString* baseName) { + JS::AutoCheckCannotGC nogc; + return baseName->hasLatin1Chars() + ? BaseNameParts(baseName->latin1Chars(nogc), baseName->length()) + : BaseNameParts(baseName->twoByteChars(nogc), baseName->length()); +} + +// Intl.Locale.prototype.maximize () +static bool Locale_maximize(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. 
+ auto* locale = &args.thisv().toObject().as<LocaleObject>(); + RootedLinearString tagStr(cx, locale->languageTag()->ensureLinear(cx)); + if (!tagStr) { + return false; + } + + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagStr, tag)) { + return false; + } + + if (!tag.addLikelySubtags(cx)) { + return false; + } + + // Step 4. + auto* result = CreateLocaleObject(cx, nullptr, tag); + if (!result) { + return false; + } + args.rval().setObject(*result); + return true; +} + +// Intl.Locale.prototype.maximize () +static bool Locale_maximize(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_maximize>(cx, args); +} + +// Intl.Locale.prototype.minimize () +static bool Locale_minimize(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + RootedLinearString tagStr(cx, locale->languageTag()->ensureLinear(cx)); + if (!tagStr) { + return false; + } + + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagStr, tag)) { + return false; + } + + if (!tag.removeLikelySubtags(cx)) { + return false; + } + + // Step 4. + auto* result = CreateLocaleObject(cx, nullptr, tag); + if (!result) { + return false; + } + args.rval().setObject(*result); + return true; +} + +// Intl.Locale.prototype.minimize () +static bool Locale_minimize(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_minimize>(cx, args); +} + +// Intl.Locale.prototype.toString () +static bool Locale_toString(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. 
+ auto* locale = &args.thisv().toObject().as<LocaleObject>(); + args.rval().setString(locale->languageTag()); + return true; +} + +// Intl.Locale.prototype.toString () +static bool Locale_toString(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_toString>(cx, args); +} + +// get Intl.Locale.prototype.baseName +static bool Locale_baseName(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // FIXME: spec bug - invalid assertion in step 4. + // FIXME: spec bug - subtag production names not updated. + + // Steps 3, 5. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + args.rval().setString(locale->baseName()); + return true; +} + +// get Intl.Locale.prototype.baseName +static bool Locale_baseName(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_baseName>(cx, args); +} + +// get Intl.Locale.prototype.calendar +static bool Locale_calendar(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + return GetUnicodeExtension(cx, locale, "ca", args.rval()); +} + +// get Intl.Locale.prototype.calendar +static bool Locale_calendar(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_calendar>(cx, args); +} + +// get Intl.Locale.prototype.collation +static bool Locale_collation(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + return GetUnicodeExtension(cx, locale, "co", args.rval()); +} + +// get Intl.Locale.prototype.collation +static bool Locale_collation(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. 
+ CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_collation>(cx, args); +} + +// get Intl.Locale.prototype.hourCycle +static bool Locale_hourCycle(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + return GetUnicodeExtension(cx, locale, "hc", args.rval()); +} + +// get Intl.Locale.prototype.hourCycle +static bool Locale_hourCycle(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_hourCycle>(cx, args); +} + +// get Intl.Locale.prototype.caseFirst +static bool Locale_caseFirst(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + return GetUnicodeExtension(cx, locale, "kf", args.rval()); +} + +// get Intl.Locale.prototype.caseFirst +static bool Locale_caseFirst(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_caseFirst>(cx, args); +} + +// get Intl.Locale.prototype.numeric +static bool Locale_numeric(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + RootedValue value(cx); + if (!GetUnicodeExtension(cx, locale, "kn", &value)) { + return false; + } + + // FIXME: spec bug - comparison should be against the empty string, too. + MOZ_ASSERT(value.isUndefined() || value.isString()); + args.rval().setBoolean(value.isString() && value.toString()->empty()); + return true; +} + +// get Intl.Locale.prototype.numeric +static bool Locale_numeric(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. 
+ CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_numeric>(cx, args); +} + +// get Intl.Locale.prototype.numberingSystem +static bool Intl_Locale_numberingSystem(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + return GetUnicodeExtension(cx, locale, "nu", args.rval()); +} + +// get Intl.Locale.prototype.numberingSystem +static bool Locale_numberingSystem(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Intl_Locale_numberingSystem>(cx, args); +} + +// get Intl.Locale.prototype.language +static bool Locale_language(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + JSLinearString* baseName = locale->baseName()->ensureLinear(cx); + if (!baseName) { + return false; + } + + // Step 4 (Unnecessary assertion). + + auto language = BaseNameParts(baseName).language; + + size_t index = language.index; + size_t length = language.length; + + // Step 5. + // FIXME: spec bug - not all production names updated. + JSString* str = NewDependentString(cx, baseName, index, length); + if (!str) { + return false; + } + + args.rval().setString(str); + return true; +} + +// get Intl.Locale.prototype.language +static bool Locale_language(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_language>(cx, args); +} + +// get Intl.Locale.prototype.script +static bool Locale_script(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. 
+ auto* locale = &args.thisv().toObject().as<LocaleObject>(); + JSLinearString* baseName = locale->baseName()->ensureLinear(cx); + if (!baseName) { + return false; + } + + // Step 4 (Unnecessary assertion). + + auto script = BaseNameParts(baseName).script; + + // Step 5. + // FIXME: spec bug - not all production names updated. + if (!script) { + args.rval().setUndefined(); + return true; + } + + size_t index = script->index; + size_t length = script->length; + + // Step 6. + JSString* str = NewDependentString(cx, baseName, index, length); + if (!str) { + return false; + } + + args.rval().setString(str); + return true; +} + +// get Intl.Locale.prototype.script +static bool Locale_script(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. + CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_script>(cx, args); +} + +// get Intl.Locale.prototype.region +static bool Locale_region(JSContext* cx, const CallArgs& args) { + MOZ_ASSERT(IsLocale(args.thisv())); + + // Step 3. + auto* locale = &args.thisv().toObject().as<LocaleObject>(); + JSLinearString* baseName = locale->baseName()->ensureLinear(cx); + if (!baseName) { + return false; + } + + // Step 4 (Unnecessary assertion). + + auto region = BaseNameParts(baseName).region; + + // Step 5. + if (!region) { + args.rval().setUndefined(); + return true; + } + + size_t index = region->index; + size_t length = region->length; + + // Step 6. + JSString* str = NewDependentString(cx, baseName, index, length); + if (!str) { + return false; + } + + args.rval().setString(str); + return true; +} + +// get Intl.Locale.prototype.region +static bool Locale_region(JSContext* cx, unsigned argc, Value* vp) { + // Steps 1-2. 
+ CallArgs args = CallArgsFromVp(argc, vp); + return CallNonGenericMethod<IsLocale, Locale_region>(cx, args); +} + +static bool Locale_toSource(JSContext* cx, unsigned argc, Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + args.rval().setString(cx->names().Locale); + return true; +} + +static const JSFunctionSpec locale_methods[] = { + JS_FN("maximize", Locale_maximize, 0, 0), + JS_FN("minimize", Locale_minimize, 0, 0), + JS_FN(js_toString_str, Locale_toString, 0, 0), + JS_FN(js_toSource_str, Locale_toSource, 0, 0), JS_FS_END}; + +static const JSPropertySpec locale_properties[] = { + JS_PSG("baseName", Locale_baseName, 0), + JS_PSG("calendar", Locale_calendar, 0), + JS_PSG("collation", Locale_collation, 0), + JS_PSG("hourCycle", Locale_hourCycle, 0), + JS_PSG("caseFirst", Locale_caseFirst, 0), + JS_PSG("numeric", Locale_numeric, 0), + JS_PSG("numberingSystem", Locale_numberingSystem, 0), + JS_PSG("language", Locale_language, 0), + JS_PSG("script", Locale_script, 0), + JS_PSG("region", Locale_region, 0), + JS_STRING_SYM_PS(toStringTag, "Intl.Locale", JSPROP_READONLY), + JS_PS_END}; + +JSObject* js::CreateLocalePrototype(JSContext* cx, HandleObject Intl, + Handle<GlobalObject*> global) { + RootedFunction ctor(cx, + GlobalObject::createConstructor(cx, &Locale, cx->names().Locale, 1)); + if (!ctor) { + return nullptr; + } + + RootedObject proto( + cx, GlobalObject::createBlankPrototype<PlainObject>(cx, global)); + if (!proto) { + return nullptr; + } + + if (!LinkConstructorAndPrototype(cx, ctor, proto)) { + return nullptr; + } + + if (!DefinePropertiesAndFunctions(cx, proto, locale_properties, locale_methods)) { + return nullptr; + } + + RootedValue ctorValue(cx, ObjectValue(*ctor)); + if (!DefineProperty(cx, Intl, cx->names().Locale, ctorValue, nullptr, nullptr, 0)) { + return nullptr; + } + + return proto; +} + +bool js::intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, unsigned argc, + Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + 
MOZ_ASSERT(args.length() == 2); + + HandleValue tagValue = args[0]; + bool applyToString = args[1].toBoolean(); + + if (tagValue.isObject()) { + JSString* tagStr; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, tagStr, + LanguageTagFromMaybeWrappedLocale(cx, &tagValue.toObject())); + if (tagStr) { + args.rval().setString(tagStr); + return true; + } + } + + if (!applyToString && !tagValue.isString()) { + args.rval().setNull(); + return true; + } + + JSString* tagStr = ToString(cx, tagValue); + if (!tagStr) { + return false; + } + + RootedLinearString tagLinearStr(cx, tagStr->ensureLinear(cx)); + if (!tagLinearStr) { + return false; + } + + // Handle the common case (a standalone language) first. + // Only the following Unicode BCP 47 locale identifier subset is accepted: + // unicode_locale_id = unicode_language_id + // unicode_language_id = unicode_language_subtag + // unicode_language_subtag = alpha{2,3} + JSString* language; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, language, intl::ParseStandaloneISO639LanguageTag(cx, tagLinearStr)); + if (language) { + args.rval().setString(language); + return true; + } + + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) { + return false; + } + + if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) { + return false; + } + + StringBuffer sb(cx); + if (!tag.appendTo(cx, sb)) { + return false; + } + + JSString* resultStr = sb.finishString(); + if (!resultStr) { + return false; + } + args.rval().setString(resultStr); + return true; +} + +bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx, + unsigned argc, Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + MOZ_ASSERT(args.length() == 1); + + RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx)); + if (!linear) { + return false; + } + + LanguageTag tag(cx); + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, + LanguageTagParser::tryParse(cx, linear, tag)); + + // The caller handles invalid inputs. 
+ if (!ok) { + args.rval().setNull(); + return true; + } + + if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) { + return false; + } + + StringBuffer sb(cx); + if (!tag.appendTo(cx, sb)) { + return false; + } + + JSString* resultStr = sb.finishString(); + if (!resultStr) { + return false; + } + args.rval().setString(resultStr); + return true; +} diff --git a/js/src/builtin/intl/Locale.h b/js/src/builtin/intl/Locale.h new file mode 100644 index 0000000000..31b3caca5c --- /dev/null +++ b/js/src/builtin/intl/Locale.h @@ -0,0 +1,61 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef builtin_intl_Locale_h +#define builtin_intl_Locale_h + +#include <stdint.h> + +#include "builtin/SelfHostingDefines.h" +#include "js/Class.h" +#include "vm/NativeObject.h" + +namespace js { + +class GlobalObject; + +class LocaleObject : public NativeObject { + public: + static const Class class_; + + static constexpr uint32_t LANGUAGE_TAG_SLOT = 0; + static constexpr uint32_t BASENAME_SLOT = 1; + static constexpr uint32_t UNICODE_EXTENSION_SLOT = 2; + static constexpr uint32_t SLOT_COUNT = 3; + + /** + * Returns the complete language tag, including any extensions and privateuse + * subtags. + */ + JSString* languageTag() const { + return getFixedSlot(LANGUAGE_TAG_SLOT).toString(); + } + + /** + * Returns the basename subtags, i.e. excluding any extensions and privateuse + * subtags. 
+ */ + JSString* baseName() const { return getFixedSlot(BASENAME_SLOT).toString(); } + + const Value& unicodeExtension() const { + return getFixedSlot(UNICODE_EXTENSION_SLOT); + } +}; + +extern JSObject* CreateLocalePrototype(JSContext* cx, + JS::Handle<JSObject*> Intl, + JS::Handle<GlobalObject*> global); + +extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, + unsigned argc, + Value* vp); + +extern MOZ_MUST_USE bool intl_TryValidateAndCanonicalizeLanguageTag( + JSContext* cx, unsigned argc, Value* vp); + +} // namespace js + +#endif /* builtin_intl_Locale_h */ diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index f2a6b32082..670a46357b 100644 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -6,14 +6,15 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ Usage: - make_intl_data.py langtags [ldmlSupplemental.dtd supplementalMetadata.xml likelySubtags.xml] + make_intl_data.py langtags [cldr_core.zip] make_intl_data.py tzdata + make_intl_data.py unicode-ext Target "langtags": - This script extracts information about mappings between deprecated and - current Unicode BCP 47 locale identifiers from CLDR and converts it to - JavaScript object definitions in LangTagMappingsGenerated.js. The - definitions are used in Intl.js. + This script extracts information about 1) mappings between deprecated and + current Unicode BCP 47 locale identifiers, and 2) deprecated and current + BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping + code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp. 
Target "tzdata": @@ -27,17 +28,23 @@ import os import re import io import codecs -import shutil -import subprocess import sys import tarfile import tempfile import urllib2 -from contextlib import closing, contextmanager +from contextlib import closing from functools import partial -from itertools import chain, ifilter, ifilterfalse, imap, tee +from itertools import chain, ifilter, ifilterfalse, imap, izip_longest, groupby, tee from operator import attrgetter, itemgetter -from urlparse import urlsplit, urlunsplit +from urlparse import urlsplit +from zipfile import ZipFile + +# From https://docs.python.org/3/library/itertools.html +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" + args = [iter(iterable)] * n + return izip_longest(*args, fillvalue=fillvalue) def writeMappingHeader(println, description, source, url): if type(description) is not list: @@ -57,40 +64,134 @@ def writeMappingsVar(println, mapping, name, description, source, url): println(u"") writeMappingHeader(println, description, source, url) println(u"var {0} = {{".format(name)) - for key in sorted(mapping): - if not isinstance(mapping[key], dict): - value = mapping[key] - if isinstance(value, bool): - value = "true" if value else "false" - else: - value = '"{0}"'.format(value) - else: - preferred = mapping[key]["preferred"] - prefix = mapping[key]["prefix"] - if key != preferred: - raise Exception("Expected '{0}' matches preferred locale '{1}'".format(key, preferred)) - value = '"{0}"'.format(prefix) - println(u' "{0}": {1},'.format(key, value)) + for (key, value) in sorted(mapping.items(), key=itemgetter(0)): + println(u' "{0}": "{1}",'.format(key, value)) println(u"};") -def writeUpdateLocaleIdMappingsFunction(println, - complex_language_mappings, - complex_region_mappings, - description, source, url): - """ Writes a function definition that performs language tag mapping. 
""" +def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, mappings, + tag_maxlength, description, source, url): + """ Emit code to perform a binary search on language tag subtags. + + Uses the contents of |mapping|, which can either be a dictionary or set, + to emit a mapping function to find subtag replacements. + """ println(u"") writeMappingHeader(println, description, source, url) - println(u"""\ -/* eslint-disable complexity */ -function updateLocaleIdMappings(tag) { - assert(IsObject(tag), "tag is an object"); + println(u""" +bool js::intl::LanguageTag::{0}({1} {2}) {{ + MOZ_ASSERT({3}({2}.range())); +""".format(fn_name, type_name, name, validate_fn).strip()) + + def write_array(subtags, name, length, fixed): + if fixed: + println(u" static const char {}[{}][{}] = {{".format(name, len(subtags), + length + 1)) + else: + println(u" static const char* {}[{}] = {{".format(name, len(subtags))) + + # Group in pairs of ten to not exceed the 80 line column limit. + for entries in grouper(subtags, 10): + entries = (u"\"{}\"".format(tag).rjust(length + 2) + for tag in entries if tag is not None) + println(u" {},".format(u", ".join(entries))) + + println(u" };") + + trailing_return = True + + # Sort the subtags by length. That enables using an optimized comparator + # for the binary search, which only performs a single |memcmp| for multiple + # of two subtag lengths. + mappings_keys = mappings.keys() if type(mappings) == dict else mappings + for (length, subtags) in groupby(sorted(mappings_keys, key=len), len): + # Omit the length check if the current length is the maximum length. + if length != tag_maxlength: + println(u""" + if ({}.length() == {}) {{ +""".format(name, length).rstrip("\n")) + else: + trailing_return = False + println(u""" + { +""".rstrip("\n")) + + # The subtags need to be sorted for binary search to work. 
+ subtags = sorted(subtags) + + def equals(subtag): + return u"""{}.equalTo("{}")""".format(name, subtag) + + # Don't emit a binary search for short lists. + if len(subtags) == 1: + if type(mappings) == dict: + println(u""" + if ({}) {{ + {}.set("{}"); + return true; + }} + return false; +""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n")) + else: + println(u""" + return {}; +""".format(equals(subtags[0])).strip("\n")) + elif len(subtags) <= 4: + if type(mappings) == dict: + for subtag in subtags: + println(u""" + if ({}) {{ + {}.set("{}"); + return true; + }} +""".format(equals(subtag), name, mappings[subtag]).strip("\n")) + + println(u""" + return false; +""".strip("\n")) + else: + cond = (equals(subtag) for subtag in subtags) + cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond) + println(u""" + return {}; +""".format(cond).strip("\n")) + else: + write_array(subtags, name + "s", length, True) + + if type(mappings) == dict: + write_array([mappings[k] for k in subtags], u"aliases", length, False) + + println(u""" + if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ + {0}.set(ConstCharRange(replacement, strlen(replacement))); + return true; + }} + return false; +""".format(name).rstrip()) + else: + println(u""" + return HasReplacement({0}s, {0}); +""".format(name).rstrip()) + + println(u""" + } +""".strip("\n")) + + if trailing_return: + println(u""" + return false;""") - // Replace deprecated language tags with their preferred values. 
- var language = tag.language; - if (hasOwn(language, languageMappings)) { - tag.language = languageMappings[language]; - } else if (hasOwn(language, complexLanguageMappings)) { - switch (language) {""") + println(u""" +}""".lstrip("\n")) + + +def writeComplexLanguageTagMappings(println, complex_language_mappings, + description, source, url): + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +void js::intl::LanguageTag::performComplexLanguageMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); +""".lstrip()) # Merge duplicate language entries. language_aliases = {} @@ -103,6 +204,7 @@ function updateLocaleIdMappings(tag) { else: language_aliases[key].append(deprecated_language) + first_language = True for (deprecated_language, (language, script, region)) in ( sorted(complex_language_mappings.items(), key=itemgetter(0)) ): @@ -110,43 +212,46 @@ function updateLocaleIdMappings(tag) { if deprecated_language in language_aliases[key]: continue - for lang in [deprecated_language] + language_aliases[key]: - println(u""" - case "{}": - """.format(lang).rstrip().strip("\n")) + if_kind = u"if" if first_language else u"else if" + first_language = False + + cond = (u"language().equalTo(\"{}\")".format(lang) + for lang in [deprecated_language] + language_aliases[key]) + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) println(u""" - tag.language = "{}"; - """.format(language).rstrip().strip("\n")) + setLanguage("{}");""".format(language).strip("\n")) + if script is not None: println(u""" - if (tag.script === undefined) - tag.script = "{}"; - """.format(script).rstrip().strip("\n")) + if (script().length() == 0) {{ + setScript("{}"); + }}""".format(script).strip("\n")) if region is not None: println(u""" - if (tag.region === undefined) - tag.region = "{}"; - """.format(region).rstrip().strip("\n")) + if (region().length() == 0) {{ + 
setRegion("{}"); + }}""".format(region).strip("\n")) println(u""" - break; - """.rstrip().strip("\n")) + }""".strip("\n")) println(u""" - default: - assert(false, "language not handled: " + language); - } - } +} +""".strip("\n")) - // No script replacements are currently present. - // Replace deprecated subtags with their preferred values. - var region = tag.region; - if (region !== undefined) { - if (hasOwn(region, regionMappings)) { - tag.region = regionMappings[region]; - } else if (hasOwn(region, complexRegionMappings)) { - switch (region) {""".lstrip("\n")) +def writeComplexRegionTagMappings(println, complex_region_mappings, + description, source, url): + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +void js::intl::LanguageTag::performComplexRegionMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region().range())); +""".lstrip()) # |non_default_replacements| is a list and hence not hashable. Convert it # to a string to get a proper hashable value. 
@@ -164,6 +269,7 @@ function updateLocaleIdMappings(tag) { else: region_aliases[key].append(deprecated_region) + first_region = True for (deprecated_region, (default, non_default_replacements)) in ( sorted(complex_region_mappings.items(), key=itemgetter(0)) ): @@ -171,91 +277,100 @@ function updateLocaleIdMappings(tag) { if deprecated_region in region_aliases[key]: continue - for region in [deprecated_region] + region_aliases[key]: - println(u""" - case "{}": - """.format(region).rstrip().strip("\n")) + if_kind = u"if" if first_region else u"else if" + first_region = False - for (language, script, region) in sorted(non_default_replacements, key=itemgetter(0)): - if script is None: - println(u""" - if (tag.language === "{}") {{ - """.format(language).rstrip().strip("\n")) - else: - println(u""" - if (tag.language === "{}" && tag.script === "{}") {{ - """.format(language, script).rstrip().strip("\n")) - println(u""" - tag.region = "{}"; - break; - }} - """.format(region).rstrip().strip("\n")) + cond = (u"region().equalTo(\"{}\")".format(region) + for region in [deprecated_region] + region_aliases[key]) + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) println(u""" - tag.region = "{}"; - break; - """.format(default).rstrip().strip("\n")) + {} ({}) {{""".format(if_kind, cond).strip("\n")) - println(u""" - default: - assert(false, "region not handled: " + region); - } - } + replacement_regions = sorted({region for (_, _, region) in non_default_replacements}) - // No variant replacements are currently present. - // No extension replacements are currently present. - // Private use sequences are left as is. 
+ first_case = True + for replacement_region in replacement_regions: + replacement_language_script = sorted(((language, script) + for (language, script, region) in ( + non_default_replacements + ) + if region == replacement_region), + key=itemgetter(0)) - } + if_kind = u"if" if first_case else u"else if" + first_case = False + + def compare_tags(language, script): + if script is None: + return u"language().equalTo(\"{}\")".format(language) + return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format( + language, script) + + cond = (compare_tags(language, script) + for (language, script) in replacement_language_script) + cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{ + setRegion("{}"); + }}""".format(if_kind, cond, replacement_region).rstrip().strip("\n")) + + println(u""" + else {{ + setRegion("{}"); + }} + }}""".format(default).rstrip().strip("\n")) + + println(u""" } -/* eslint-enable complexity */ """.strip("\n")) -def writeGrandfatheredMappingsFunction(println, - grandfathered_mappings, +def writeGrandfatheredMappingsFunction(println, grandfathered_mappings, description, source, url): """ Writes a function definition that maps grandfathered language tags. """ println(u"") writeMappingHeader(println, description, source, url) println(u"""\ -function updateGrandfatheredMappings(tag) { - assert(IsObject(tag), "tag is an object"); - - // We're mapping regular grandfathered tags to non-grandfathered form here. - // Other tags remain unchanged. 
- // - // regular = "art-lojban" - // / "cel-gaulish" - // / "no-bok" - // / "no-nyn" - // / "zh-guoyu" - // / "zh-hakka" - // / "zh-min" - // / "zh-min-nan" - // / "zh-xiang" - // - // Therefore we can quickly exclude most tags by checking every - // |unicode_locale_id| subcomponent for characteristics not shared by any of - // the regular grandfathered (RG) tags: - // - // * Real-world |unicode_language_subtag|s are all two or three letters, - // so don't waste time running a useless |language.length > 3| fast-path. - // * No RG tag has a "script"-looking component. - // * No RG tag has a "region"-looking component. - // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, - // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, - // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag - // that |unicode_locale_id| doesn't support.) - // * No RG tag contains |extensions| or |pu_extensions|. - if (tag.script !== undefined || - tag.region !== undefined || - tag.variants.length !== 1 || - tag.extensions.length !== 0 || - tag.privateuse !== undefined) - { - return; - }""") +bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { + // We're mapping regular grandfathered tags to non-grandfathered form here. + // Other tags remain unchanged. + // + // regular = "art-lojban" + // / "cel-gaulish" + // / "no-bok" + // / "no-nyn" + // / "zh-guoyu" + // / "zh-hakka" + // / "zh-min" + // / "zh-min-nan" + // / "zh-xiang" + // + // Therefore we can quickly exclude most tags by checking every + // |unicode_locale_id| subcomponent for characteristics not shared by any of + // the regular grandfathered (RG) tags: + // + // * Real-world |unicode_language_subtag|s are all two or three letters, + // so don't waste time running a useless |language.length > 3| fast-path. + // * No RG tag has a "script"-looking component. + // * No RG tag has a "region"-looking component. 
+ // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, + // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, + // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag + // that |unicode_locale_id| doesn't support.) + // * No RG tag contains |extensions| or |pu_extensions|. + if (script().length() != 0 || + region().length() != 0 || + variants().length() != 1 || + extensions().length() != 0 || + privateuse()) { + return true; + } + + auto variantEqualTo = [this](const char* variant) { + return strcmp(variants()[0].get(), variant) == 0; + };""") # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. # @@ -316,60 +431,57 @@ function updateGrandfatheredMappings(tag) { modern_privateuse = modern_match.group("privateuse") println(u""" - // {} -> {} + // {} -> {} """.format(tag, modern).rstrip()) println(u""" - {}if (tag.language === "{}" && tag.variants[0] === "{}") {{ - """.format("" if is_first else "else ", tag_language, tag_variant).rstrip().strip("\n")) + {}if (language().equalTo("{}") && variantEqualTo("{}")) {{ + """.format("" if is_first else "else ", + tag_language, + tag_variant).rstrip().strip("\n")) is_first = False println(u""" - tag.language = "{}"; + setLanguage("{}"); """.format(modern_language).rstrip().strip("\n")) if modern_script is not None: println(u""" - tag.script = "{}"; - """.format(modern_script).rstrip().strip("\n")) + setScript("{}"); + """.format(modern_script).rstrip().strip("\n")) if modern_region is not None: println(u""" - tag.region = "{}"; - """.format(modern_region).rstrip().strip("\n")) + setRegion("{}"); + """.format(modern_region).rstrip().strip("\n")) - if modern_variants is not None: - println(u""" - tag.variants = {}; - """.format(sorted(modern_variants[1:].split("-"))).rstrip().strip("\n")) - else: - println(u""" - tag.variants.length = 0; + assert modern_variants is None, ( + "all regular grandfathered tags' modern forms do not contain variant subtags") + + 
println(u""" + clearVariants(); """.rstrip().strip("\n")) if modern_privateuse is not None: println(u""" - tag.privateuse = "{}"; - """.format(modern_privateuse).rstrip().strip("\n")) + auto privateuse = DuplicateString(cx, "{}"); + if (!privateuse) {{ + return false; + }} + setPrivateuse(std::move(privateuse)); + """.format(modern_privateuse).rstrip().rstrip("\n")) println(u""" - }""".rstrip().strip("\n")) + return true; + }""".rstrip().strip("\n")) println(u""" -}""".lstrip("\n")) - - -@contextmanager -def TemporaryDirectory(): - tmpDir = tempfile.mkdtemp() - try: - yield tmpDir - finally: - shutil.rmtree(tmpDir) + return true; +}""") -def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, likely_subtags_file): +def readSupplementalData(core_file): """ Reads CLDR Supplemental Data and extracts information for Intl.js. Information extracted: @@ -379,19 +491,11 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like - complexLanguageMappings: mappings from language subtags with complex rules - regionMappings: mappings from region subtags to preferred subtags - complexRegionMappings: mappings from region subtags with complex rules - Returns these five mappings as dictionaries. + - likelySubtags: likely subtags used for generating test data only + Returns these mappings as dictionaries. """ import xml.etree.ElementTree as ET - # <!ATTLIST version cldrVersion CDATA #FIXED "36" > - re_cldr_version = re.compile( - r"""<!ATTLIST version cldrVersion CDATA #FIXED "(?P<version>[\d|\.]+)" >""") - - with io.open(supplemental_dtd_file, mode="r", encoding="utf-8") as f: - version_match = re_cldr_version.search(f.read()) - assert version_match is not None, "CLDR version string not found" - cldr_version = version_match.group("version") - # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. 
re_unicode_language_id = re.compile( r""" @@ -497,7 +601,7 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like script.title() if script else None, region.upper() if region else None) - tree = ET.parse(supplemental_metadata_file) + tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml")) for language_alias in tree.iterfind(".//languageAlias"): type = bcp47_id(language_alias.get("type")) @@ -547,7 +651,7 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like ), "{} invalid region subtags".format(replacement) complex_region_mappings[type] = replacements - tree = ET.parse(likely_subtags_file) + tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) likely_subtags = {} @@ -608,133 +712,441 @@ def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, like else: region_mappings[deprecated_region] = default - return {"version": cldr_version, - "grandfatheredMappings": grandfathered_mappings, + return {"grandfatheredMappings": grandfathered_mappings, "languageMappings": language_mappings, "complexLanguageMappings": complex_language_mappings, "regionMappings": region_mappings, "complexRegionMappings": complex_region_mappings_final, + "likelySubtags": likely_subtags, } +def readUnicodeExtensions(core_file): + import xml.etree.ElementTree as ET + + # Match all xml-files in the BCP 47 directory. + bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$") + + # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier + # + # type = alphanum{3,8} (sep alphanum{3,8})* ; + typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$") + + # Mapping from Unicode extension types to dict of deprecated to + # preferred values. + mapping = {} + + def readBCP47File(file): + tree = ET.parse(file) + for keyword in tree.iterfind(".//keyword/key"): + # Skip over keywords whose extension is not "u". 
+ if keyword.get("extension", "u") != "u": + continue + + extension_name = keyword.get("name") + + for type in keyword.iterfind("type"): + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The key or type name used by Unicode locale extension with 'u' extension + # syntax or the 't' extensions syntax. When alias below is absent, this name + # can be also used with the old style "@key=type" syntax. + name = type.get("name") + + # Ignore the special name: + # - <https://unicode.org/reports/tr35/#CODEPOINTS> + # - <https://unicode.org/reports/tr35/#REORDER_CODE> + # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE> + # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE> + # - <https://unicode.org/reports/tr35/#PRIVATE_USE> + if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE", + "PRIVATE_USE"): + continue + + # All other names should match the 'type' production. + assert typeRE.match(name) is not None, ( + "{} matches the 'type' production".format(name)) + + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The preferred value of the deprecated key, type or attribute element. + # When a key, type or attribute element is deprecated, this attribute is + # used for specifying a new canonical form if available. + preferred = type.get("preferred") + + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The BCP 47 form is the canonical form, and recommended. Other aliases are + # included only for backwards compatibility. + alias = type.get("alias") + + # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> + # + # Use the bcp47 data to replace keys, types, tfields, and tvalues by their + # canonical forms. See Section 3.6.4 U Extension Data Files) and Section + # 3.7.1 T Extension Data Files. The aliases are in the alias attribute + # value, while the canonical is in the name attribute value. 
+ + # 'preferred' contains the new preferred name, 'alias' the compatibility + # name, but then there's this entry where 'preferred' and 'alias' are the + # same. So which one to choose? Assume 'preferred' is the actual canonical + # name. + # + # <type name="islamicc" + # description="Civil (algorithmic) Arabic calendar" + # deprecated="true" + # preferred="islamic-civil" + # alias="islamic-civil"/> + + if preferred is not None: + assert typeRE.match(preferred), preferred + mapping.setdefault(extension_name, {})[name] = preferred + + if alias is not None: + for alias_name in alias.lower().split(" "): + # Ignore alias entries which don't match the 'type' production. + if typeRE.match(alias_name) is None: + continue + + # See comment above when 'alias' and 'preferred' are both present. + if (preferred is not None and + name in mapping[extension_name]): + continue + + # Skip over entries where 'name' and 'alias' are equal. + # + # <type name="pst8pdt" + # description="POSIX style time zone for US Pacific Time" + # alias="PST8PDT" + # since="1.8"/> + if name == alias_name: + continue + + mapping.setdefault(extension_name, {})[alias_name] = name + + def readSupplementalMetadata(file): + # Find subdivision and region replacements. + # + # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> + # + # Replace aliases in special key values: + # - If there is an 'sd' or 'rg' key, replace any subdivision alias + # in its value in the same way, using subdivisionAlias data. + tree = ET.parse(file) + for alias in tree.iterfind(".//subdivisionAlias"): + type = alias.get("type") + assert typeRE.match(type) is not None, ( + "{} matches the 'type' production".format(type)) + + # Take the first replacement when multiple ones are present. + replacement = alias.get("replacement").split(" ")[0].lower() + + # Skip over invalid replacements. 
+ # + # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/> + # + # It's not entirely clear to me if CLDR actually wants to use + # "axzzzz" as the replacement for this case. + if typeRE.match(replacement) is None: + continue + + # 'subdivisionAlias' applies to 'rg' and 'sd' keys. + mapping.setdefault("rg", {})[type] = replacement + mapping.setdefault("sd", {})[type] = replacement + + for name in core_file.namelist(): + if bcpFileRE.match(name): + readBCP47File(core_file.open(name)) + + readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml")) + + return mapping + def writeCLDRLanguageTagData(println, data, url): """ Writes the language tag data to the Intl data file. """ + println(generatedFileWarning) + println(u"// Version: CLDR-{}".format(data["version"])) + println(u"// URL: {}".format(url)) + + println(u""" +#include "mozilla/Assertions.h" +#include "mozilla/Range.h" +#include "mozilla/TextUtils.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <type_traits> + +#include "jscntxt.h" +#include "jsstr.h" + +#include "builtin/intl/LanguageTag.h" + +using namespace js::intl::LanguageTagLimits; +using ConstCharRange = mozilla::Range<const char>; + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline bool HasReplacement( + const char (&subtags)[Length][TagLength], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.range().begin().get(); + return std::binary_search(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); +} + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline const char* SearchReplacement( + const char (&subtags)[Length][TagLength], + const char* (&aliases)[Length], + const 
js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.range().begin().get(); + auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); + if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { + return aliases[std::distance(std::begin(subtags), p)]; + } + return nullptr; +} +""".rstrip()) + source = u"CLDR Supplemental Data, version {}".format(data["version"]) grandfathered_mappings = data["grandfatheredMappings"] language_mappings = data["languageMappings"] complex_language_mappings = data["complexLanguageMappings"] region_mappings = data["regionMappings"] complex_region_mappings = data["complexRegionMappings"] + unicode_mappings = data["unicodeMappings"] + + # unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + language_maxlength = 8 + + # unicode_region_subtag = (alpha{2} | digit{3}) ; + region_maxlength = 3 + + writeMappingsBinarySearch(println, "languageMapping", + "LanguageSubtag&", "language", + "IsStructurallyValidLanguageTag", + language_mappings, language_maxlength, + "Mappings from language subtags to preferred values.", source, url) + writeMappingsBinarySearch(println, "complexLanguageMapping", + "const LanguageSubtag&", "language", + "IsStructurallyValidLanguageTag", + complex_language_mappings.keys(), language_maxlength, + "Language subtags with complex mappings.", source, url) + writeMappingsBinarySearch(println, "regionMapping", + "RegionSubtag&", "region", + "IsStructurallyValidRegionTag", + region_mappings, region_maxlength, + "Mappings from region subtags to preferred values.", source, url) + writeMappingsBinarySearch(println, "complexRegionMapping", + "const RegionSubtag&", "region", + "IsStructurallyValidRegionTag", + complex_region_mappings.keys(), region_maxlength, + "Region subtags with complex 
mappings.", source, url) + + writeComplexLanguageTagMappings(println, complex_language_mappings, + "Language subtags with complex mappings.", source, url) + writeComplexRegionTagMappings(println, complex_region_mappings, + "Region subtags with complex mappings.", source, url) - writeMappingsVar(println, grandfathered_mappings, "grandfatheredMappings", - "Mappings from grandfathered tags to preferred values.", source, url) - writeMappingsVar(println, language_mappings, "languageMappings", - "Mappings from language subtags to preferred values.", source, url) - writeMappingsVar(println, {key: True for key in complex_language_mappings}, - "complexLanguageMappings", - "Language subtags with complex mappings.", source, url) - writeMappingsVar(println, region_mappings, "regionMappings", - "Mappings from region subtags to preferred values.", source, url) - writeMappingsVar(println, {key: True for key in complex_region_mappings}, - "complexRegionMappings", - "Region subtags with complex mappings.", source, url) - - writeUpdateLocaleIdMappingsFunction(println, complex_language_mappings, - complex_region_mappings, - "Canonicalize Unicode BCP 47 locale identifiers.", - source, url) writeGrandfatheredMappingsFunction(println, grandfathered_mappings, - "Canonicalize grandfathered locale identifiers.", - source, url) + "Canonicalize grandfathered locale identifiers.", source, + url) + + writeUnicodeExtensionsMappings(println, unicode_mappings) + + +def writeCLDRLanguageTagLikelySubtagsTest(println, data, url): + """ Writes the likely-subtags test file. 
""" + + println(generatedFileWarning) + + source = u"CLDR Supplemental Data, version {}".format(data["version"]) + language_mappings = data["languageMappings"] + complex_language_mappings = data["complexLanguageMappings"] + region_mappings = data["regionMappings"] + complex_region_mappings = data["complexRegionMappings"] + likely_subtags = data["likelySubtags"] + + def bcp47(tag): + (language, script, region) = tag + return "{}{}{}".format(language, + "-" + script if script else "", + "-" + region if region else "") + + def canonical(tag): + (language, script, region) = tag + + # Map deprecated language subtags. + if language in language_mappings: + language = language_mappings[language] + elif language in complex_language_mappings: + (language2, script2, region2) = complex_language_mappings[language] + (language, script, region) = (language2, + script if script else script2, + region if region else region2) + + # Map deprecated region subtags. + if region in region_mappings: + region = region_mappings[region] + else: + # Assume no complex region mappings are needed for now. + assert region not in complex_region_mappings,\ + "unexpected region with complex mappings: {}".format(region) + + return (language, script, region) + + # https://unicode.org/reports/tr35/#Likely_Subtags + + def addLikelySubtags(tag): + # Step 1: Canonicalize. + (language, script, region) = canonical(tag) + if script == "Zzzz": + script = None + if region == "ZZ": + region = None + + # Step 2: Lookup. + searches = ((language, script, region), + (language, None, region), + (language, script, None), + (language, None, None), + ("und", script, None)) + search = next(search for search in searches if search in likely_subtags) + + (language_s, script_s, region_s) = search + (language_m, script_m, region_m) = likely_subtags[search] + + # Step 3: Return. 
+ return (language if language != language_s else language_m, + script if script != script_s else script_m, + region if region != region_s else region_m) + + # https://unicode.org/reports/tr35/#Likely_Subtags + def removeLikelySubtags(tag): + # Step 1: Add likely subtags. + max = addLikelySubtags(tag) + + # Step 2: Remove variants (doesn't apply here). + + # Step 3: Find a match. + (language, script, region) = max + for trial in ((language, None, None), (language, None, region), (language, script, None)): + if addLikelySubtags(trial) == max: + return trial + + # Step 4: Return maximized if no match found. + return max + + def likely_canonical(from_tag, to_tag): + # Canonicalize the input tag. + from_tag = canonical(from_tag) + + # Update the expected result if necessary. + if from_tag in likely_subtags: + to_tag = likely_subtags[from_tag] + + # Canonicalize the expected output. + to_canonical = canonical(to_tag) + + # Sanity check: This should match the result of |addLikelySubtags|. + assert to_canonical == addLikelySubtags(from_tag) + + return to_canonical + + # |likely_subtags| contains non-canonicalized tags, so canonicalize it first. + likely_subtags_canonical = {k: likely_canonical(k, v) for (k, v) in likely_subtags.items()} + + # Add test data for |Intl.Locale.prototype.maximize()|. + writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()}, + "maxLikelySubtags", "Extracted from likelySubtags.xml.", source, url) + + # Use the maximalized tags as the input for the remove likely-subtags test. + minimized = {tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()} + + # Add test data for |Intl.Locale.prototype.minimize()|. 
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in minimized.items()}, + "minLikelySubtags", "Extracted from likelySubtags.xml.", source, url) + + println(u""" +for (let [tag, maximal] of Object.entries(maxLikelySubtags)) { + assertEq(new Intl.Locale(tag).maximize().toString(), maximal); +}""") + + println(u""" +for (let [tag, minimal] of Object.entries(minLikelySubtags)) { + assertEq(new Intl.Locale(tag).minimize().toString(), minimal); +}""") + + println(u""" +if (typeof reportCompare === "function") + reportCompare(0, 0);""") def updateCLDRLangTags(args): - """ Update the LangTagMappingsCLDRGenerated.js file. """ + """ Update the LanguageTagGenerated.cpp file. """ + version = args.version url = args.url - branch = args.branch - revision = args.revision out = args.out - files = args.files + filename = args.file + + url = url.replace("<VERSION>", version) print("Arguments:") + print("\tCLDR version: %s" % version) print("\tDownload url: %s" % url) - print("\tBranch: %s" % branch) - print("\tRevision: %s" % revision) - print("\tLocal supplemental data and likely subtags: %s" % files) + if filename is not None: + print("\tLocal CLDR core.zip file: %s" % filename) print("\tOutput file: %s" % out) print("") - if files: - if len(files) != 3: - raise Exception("Expected three files, but got: {}".format(files)) + data = { + "version": version, + } - print(("Always make sure you have the newest ldmlSupplemental.dtd, " - "supplementalMetadata.xml, and likelySubtags.xml!")) + def readFiles(cldr_file): + with ZipFile(cldr_file) as zip_file: + data.update(readSupplementalData(zip_file)) + data["unicodeMappings"] = readUnicodeExtensions(zip_file) - supplemental_dtd_file = files[0] - supplemental_metadata_file = files[1] - likely_subtags_file = files[2] + print("Processing CLDR data...") + if filename is not None: + print("Always make sure you have the newest CLDR core.zip!") + with open(filename, "rb") as cldr_file: + readFiles(cldr_file) else: - print("Downloading 
CLDR supplemental data...") - - supplemental_dtd_filename = "ldmlSupplemental.dtd" - supplemental_dtd_path = "common/dtd/{}".format(supplemental_dtd_filename) - supplemental_dtd_file = os.path.join(os.getcwd(), supplemental_dtd_filename) - - supplemental_metadata_filename = "supplementalMetadata.xml" - supplemental_metadata_path = "common/supplemental/{}".format( - supplemental_metadata_filename) - supplemental_metadata_file = os.path.join(os.getcwd(), supplemental_metadata_filename) - - likely_subtags_filename = "likelySubtags.xml" - likely_subtags_path = "common/supplemental/{}".format(likely_subtags_filename) - likely_subtags_file = os.path.join(os.getcwd(), likely_subtags_filename) - - # Try to download the raw file directly from GitHub if possible. - split = urlsplit(url) - if split.netloc == "github.com" and split.path.endswith(".git") and revision == "HEAD": - def download(path, file): - urlpath = "{}/raw/{}/{}".format(urlsplit(url).path[:-4], branch, path) - raw_url = urlunsplit((split.scheme, split.netloc, urlpath, split.query, - split.fragment)) - - with closing(urllib2.urlopen(raw_url)) as reader: - text = reader.read().decode("utf-8") - with io.open(file, "w", encoding="utf-8") as saved_file: - saved_file.write(text) - - download(supplemental_dtd_path, supplemental_dtd_file) - download(supplemental_metadata_path, supplemental_metadata_file) - download(likely_subtags_path, likely_subtags_file) - else: - # Download the requested branch in a temporary directory. 
- with TemporaryDirectory() as inDir: - if revision == "HEAD": - subprocess.check_call(["git", "clone", "--depth=1", - "--branch=%s" % branch, url, inDir]) - else: - subprocess.check_call(["git", "clone", "--single-branch", - "--branch=%s" % branch, url, inDir]) - subprocess.check_call(["git", "-C", inDir, "reset", "--hard", revision]) - - shutil.copyfile(os.path.join(inDir, supplemental_dtd_path), - supplemental_dtd_file) - shutil.copyfile(os.path.join(inDir, supplemental_metadata_path), - supplemental_metadata_file) - shutil.copyfile(os.path.join(inDir, likely_subtags_path), likely_subtags_file) - - print("Processing CLDR supplemental data...") - data = readSupplementalData(supplemental_dtd_file, - supplemental_metadata_file, - likely_subtags_file) + print("Downloading CLDR core.zip...") + with closing(urllib2.urlopen(url)) as cldr_file: + cldr_data = io.BytesIO(cldr_file.read()) + readFiles(cldr_data) print("Writing Intl data...") with io.open(out, mode="w", encoding="utf-8", newline="") as f: println = partial(print, file=f) - - println(u"// Generated by make_intl_data.py. DO NOT EDIT.") writeCLDRLanguageTagData(println, data, url) + print("Writing Intl test data...") + test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "../../tests/non262/Intl/Locale/likely-subtags-generated.js") + with io.open(test_file, mode="w", encoding="utf-8", newline="") as f: + println = partial(print, file=f) + + println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||" + u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))") + writeCLDRLanguageTagLikelySubtagsTest(println, data, url) + def flines(filepath, encoding="utf-8"): """ Open filepath and iterate over its content. 
""" @@ -1448,6 +1860,158 @@ def updateTzdata(topsrcdir, args): else: updateFrom(tzDir) +def writeUnicodeExtensionsMappings(println, mapping): + println(u""" +template <size_t Length> +static inline bool IsUnicodeKey(const ConstCharRange& key, + const char (&str)[Length]) { + static_assert(Length == UnicodeKeyLength + 1, + "Unicode extension key is two characters long"); + return memcmp(key.begin().get(), str, Length - 1) == 0; +} + +template <size_t Length> +static inline bool IsUnicodeType(const ConstCharRange& type, + const char (&str)[Length]) { + static_assert(Length > UnicodeKeyLength + 1, + "Unicode extension type contains more than two characters"); + return type.length() == (Length - 1) && + memcmp(type.begin().get(), str, Length - 1) == 0; +} + +static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) { +#ifdef DEBUG + auto isNull = [](char c) { + return c == '\\0'; + }; +#endif + + MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull), + "unexpected null-character in string"); + + using UnsignedChar = unsigned char; + for (size_t i = 0; i < b.length(); i++) { + // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if + // we've reached the end of |a|, the below if-statement will always be true. + // That ensures we don't read past the end of |a|. + if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) { + return r; + } + } + + // Return zero if both strings are equal or a negative number if |b| is a + // prefix of |a|. 
+ return -int32_t(UnsignedChar(a[b.length()])); +}; + +template <size_t Length> +static inline const char* SearchReplacement(const char* (&types)[Length], + const char* (&aliases)[Length], + const ConstCharRange& type) { + + auto p = std::lower_bound(std::begin(types), std::end(types), type, + [](const auto& a, const auto& b) { + return CompareUnicodeType(a, b) < 0; + }); + if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) { + return aliases[std::distance(std::begin(types), p)]; + } + return nullptr; +} + +/** + * Mapping from deprecated BCP 47 Unicode extension types to their preferred + * values. + * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + */ +const char* js::intl::LanguageTag::replaceUnicodeExtensionType( + const ConstCharRange& key, const ConstCharRange& type) { +#ifdef DEBUG + static auto isAsciiLowercaseAlphanumeric = [](char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); + }; + + static auto isAsciiLowercaseAlphanumericOrDash = [](char c) { + return isAsciiLowercaseAlphanumeric(c) || c == '-'; + }; +#endif + + MOZ_ASSERT(key.length() == UnicodeKeyLength); + MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(), + isAsciiLowercaseAlphanumeric)); + + MOZ_ASSERT(type.length() > UnicodeKeyLength); + MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(), + isAsciiLowercaseAlphanumericOrDash)); +""") + + def to_hash_key(replacements): + return str(sorted(replacements.items())) + + def write_array(subtags, name, length): + max_entries = (80 - len(" ")) // (length + len('"", ')) + + println(u" static const char* {}[{}] = {{".format(name, len(subtags))) + + for entries in grouper(subtags, max_entries): + entries = (u"\"{}\"".format(tag).rjust(length + 2) + for tag in entries if tag is not None) + println(u" {},".format(u", ".join(entries))) + + println(u" };") + + # Merge duplicate keys. 
+ key_aliases = {} + for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): + hash_key = to_hash_key(replacements) + if hash_key not in key_aliases: + key_aliases[hash_key] = [] + else: + key_aliases[hash_key].append(key) + + first_key = True + for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): + hash_key = to_hash_key(replacements) + if key in key_aliases[hash_key]: + continue + + cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key]) + + if_kind = u"if" if first_key else u"else if" + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + first_key = False + + replacements = sorted(replacements.items(), key=itemgetter(0)) + + if len(replacements) > 4: + types = [t for (t, _) in replacements] + preferred = [r for (_, r) in replacements] + max_len = max(len(k) for k in types + preferred) + + write_array(types, "types", max_len) + write_array(preferred, "aliases", max_len) + println(u""" + return SearchReplacement(types, aliases, type); +""".strip("\n")) + else: + for (type, replacement) in replacements: + println(u""" + if (IsUnicodeType(type, "{}")) {{ + return "{}"; + }}""".format(type, replacement).strip("\n")) + + println(u""" + }""".lstrip("\n")) + + println(u""" + return nullptr; +} +""".strip("\n")) + + if __name__ == "__main__": import argparse @@ -1468,21 +2032,21 @@ if __name__ == "__main__": parser_cldr_tags = subparsers.add_parser("langtags", help="Update CLDR language tags data") + parser_cldr_tags.add_argument("--version", + metavar="VERSION", + required=True, + help="CLDR version number") parser_cldr_tags.add_argument("--url", metavar="URL", - default="https://github.com/unicode-org/cldr.git", - help="URL to git repository (default: %(default)s)") - parser_cldr_tags.add_argument("--branch", default="latest", - help="Git branch (default: %(default)s)") - parser_cldr_tags.add_argument("--revision", default="HEAD", 
- help="Git revision (default: %(default)s)") + default="https://unicode.org/Public/cldr/<VERSION>/core.zip", + type=EnsureHttps, + help="Download url CLDR data (default: %(default)s)") parser_cldr_tags.add_argument("--out", - default="LangTagMappingsGenerated.js", + default="LanguageTagGenerated.cpp", help="Output file (default: %(default)s)") - parser_cldr_tags.add_argument("files", - nargs="*", - help="Local ldmlSupplemental.dtd, supplementalMetadata.xml, " - "and likelySubtags.xml files, if omitted uses <URL>") + parser_cldr_tags.add_argument("file", + nargs="?", + help="Local cldr-core.zip file, if omitted uses <URL>") parser_cldr_tags.set_defaults(func=updateCLDRLangTags) parser_tz = subparsers.add_parser("tzdata", help="Update tzdata") diff --git a/js/src/moz.build b/js/src/moz.build index 32102bde39..cecb7ae32d 100644 --- a/js/src/moz.build +++ b/js/src/moz.build @@ -118,6 +118,9 @@ main_deunified_sources = [ 'builtin/intl/CommonFunctions.cpp', 'builtin/intl/DateTimeFormat.cpp', 'builtin/intl/IntlObject.cpp', + 'builtin/intl/LanguageTag.cpp', + 'builtin/intl/LanguageTagGenerated.cpp', + 'builtin/intl/Locale.cpp', 'builtin/intl/NumberFormat.cpp', 'builtin/intl/PluralRules.cpp', 'builtin/intl/RelativeTimeFormat.cpp', @@ -709,7 +712,6 @@ selfhosted.inputs = [ 'builtin/intl/CommonFunctions.js', 'builtin/intl/DateTimeFormat.js', 'builtin/intl/IntlObject.js', - 'builtin/intl/LangTagMappingsGenerated.js', 'builtin/intl/NumberFormat.js', 'builtin/intl/PluralRules.js', 'builtin/intl/RelativeTimeFormat.js', diff --git a/js/src/vm/CommonPropertyNames.h b/js/src/vm/CommonPropertyNames.h index 1d398190ae..d5e7a2d058 100644 --- a/js/src/vm/CommonPropertyNames.h +++ b/js/src/vm/CommonPropertyNames.h @@ -51,6 +51,7 @@ macro(byteOffset, byteOffset, "byteOffset") \ macro(bytes, bytes, "bytes") \ macro(BYTES_PER_ELEMENT, BYTES_PER_ELEMENT, "BYTES_PER_ELEMENT") \ + macro(calendar, calendar, "calendar") \ macro(call, call, "call") \ macro(callContentFunction, 
callContentFunction, "callContentFunction") \ macro(callee, callee, "callee") \ @@ -61,6 +62,7 @@ macro(catch, catch_, "catch") \ macro(class, class_, "class") \ macro(close, close, "close") \ + macro(collation, collation, "collation") \ macro(Collator, Collator, "Collator") \ macro(CollatorCompareGet, CollatorCompareGet, "Intl_Collator_compare_get") \ macro(collections, collections, "collections") \ @@ -177,6 +179,7 @@ macro(hasOwn, hasOwn, "hasOwn") \ macro(hasOwnProperty, hasOwnProperty, "hasOwnProperty") \ macro(hour, hour, "hour") \ + macro(hourCycle, hourCycle, "hourCycle") \ macro(if, if_, "if") \ macro(ignoreCase, ignoreCase, "ignoreCase") \ macro(ignorePunctuation, ignorePunctuation, "ignorePunctuation") \ @@ -190,6 +193,7 @@ macro(Infinity, Infinity, "Infinity") \ macro(InitializeCollator, InitializeCollator, "InitializeCollator") \ macro(InitializeDateTimeFormat, InitializeDateTimeFormat, "InitializeDateTimeFormat") \ + macro(InitializeLocale, InitializeLocale, "InitializeLocale") \ macro(InitializeNumberFormat, InitializeNumberFormat, "InitializeNumberFormat") \ macro(InitializePluralRules, InitializePluralRules, "InitializePluralRules") \ macro(InitializeRelativeTimeFormat, InitializeRelativeTimeFormat, "InitializeRelativeTimeFormat") \ @@ -218,6 +222,7 @@ macro(js, js, "js") \ macro(keys, keys, "keys") \ macro(label, label, "label") \ + macro(language, language, "language") \ macro(lastIndex, lastIndex, "lastIndex") \ macro(LegacyGeneratorCloseInternal, LegacyGeneratorCloseInternal, "LegacyGeneratorCloseInternal") \ macro(length, length, "length") \ @@ -226,6 +231,7 @@ macro(lineNumber, lineNumber, "lineNumber") \ macro(literal, literal, "literal") \ macro(loc, loc, "loc") \ + macro(Locale, Locale, "Locale") \ macro(locale, locale, "locale") \ macro(lookupGetter, lookupGetter, "__lookupGetter__") \ macro(lookupSetter, lookupSetter, "__lookupSetter__") \ @@ -263,6 +269,7 @@ macro(noStack, noStack, "noStack") \ macro(notes, notes, "notes") \ 
macro(NumberFormat, NumberFormat, "NumberFormat") \ + macro(numberingSystem, numberingSystem, "numberingSystem") \ macro(NumberFormatFormatGet, NumberFormatFormatGet, "Intl_NumberFormat_format_get") \ macro(numeric, numeric, "numeric") \ macro(objectArguments, objectArguments, "[object Arguments]") \ @@ -306,6 +313,7 @@ macro(reason, reason, "reason") \ macro(RegExpFlagsGetter, RegExpFlagsGetter, "RegExpFlagsGetter") \ macro(RegExpStringIterator, RegExpStringIterator, "RegExp String Iterator") \ + macro(region, region, "region") \ macro(Reify, Reify, "Reify") \ macro(reject, reject, "reject") \ macro(rejected, rejected, "rejected") \ diff --git a/js/src/vm/GlobalObject.h b/js/src/vm/GlobalObject.h index bf9255e85e..1e10fe5da3 100644 --- a/js/src/vm/GlobalObject.h +++ b/js/src/vm/GlobalObject.h @@ -113,6 +113,7 @@ class GlobalObject : public NativeObject DATE_TIME_FORMAT_PROTO, PLURAL_RULES_PROTO, RELATIVE_TIME_FORMAT_PROTO, + LOCALE_PROTO, MODULE_PROTO, IMPORT_ENTRY_PROTO, EXPORT_ENTRY_PROTO, @@ -501,6 +502,11 @@ class GlobalObject : public NativeObject return getOrCreateObject(cx, global, COLLATOR_PROTO, initIntlObject); } + static JSObject* + getOrCreateLocalePrototype(JSContext* cx, Handle<GlobalObject*> global) { + return getOrCreateObject(cx, global, LOCALE_PROTO, initIntlObject); + } + static JSFunction* getOrCreateNumberFormatConstructor(JSContext* cx, Handle<GlobalObject*> global) { JSObject* obj = getOrCreateObject(cx, global, NUMBER_FORMAT, initIntlObject); diff --git a/js/src/vm/SelfHosting.cpp b/js/src/vm/SelfHosting.cpp index fff1baf630..cce912759d 100644 --- a/js/src/vm/SelfHosting.cpp +++ b/js/src/vm/SelfHosting.cpp @@ -25,6 +25,7 @@ #include "builtin/intl/Collator.h" #include "builtin/intl/DateTimeFormat.h" #include "builtin/intl/IntlObject.h" +#include "builtin/intl/Locale.h" #include "builtin/intl/NumberFormat.h" #include "builtin/intl/PluralRules.h" #include "builtin/intl/RelativeTimeFormat.h" @@ -2486,6 +2487,8 @@ static const JSFunctionSpec 
intrinsic_functions[] = { JS_FN("intl_SelectPluralRule", intl_SelectPluralRule, 2,0), JS_FN("intl_toLocaleLowerCase", intl_toLocaleLowerCase, 2,0), JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0), + JS_FN("intl_ValidateAndCanonicalizeLanguageTag", intl_ValidateAndCanonicalizeLanguageTag, 2, 0), + JS_FN("intl_TryValidateAndCanonicalizeLanguageTag", intl_TryValidateAndCanonicalizeLanguageTag, 1, 0), JS_FN("intl_RelativeTimeFormat_availableLocales", intl_RelativeTimeFormat_availableLocales, 0,0), JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0), diff --git a/js/src/vm/String.h b/js/src/vm/String.h index 4c43439cd9..5eaf9e0c2e 100644 --- a/js/src/vm/String.h +++ b/js/src/vm/String.h @@ -1131,6 +1131,20 @@ class StaticStrings static bool isStatic(JSAtom* atom); /* Return null if no static atom exists for the given (chars, length). */ + MOZ_ALWAYS_INLINE JSAtom* lookup(const char* chars, size_t length) { + // Collapse calls for |const char*| into |const Latin1Char char*| to avoid + // excess instantiations. + return lookup(reinterpret_cast<const Latin1Char*>(chars), length); + } + + template <typename CharT, + typename = typename std::enable_if<!std::is_const<CharT>::value>::type> + MOZ_ALWAYS_INLINE JSAtom* lookup(CharT* chars, size_t length) { + // Collapse the remaining |CharT*| to |const CharT*| to avoid excess + // instantiations. 
+ return lookup(const_cast<const CharT*>(chars), length); + } + template <typename CharT> JSAtom* lookup(const CharT* chars, size_t length) { switch (length) { diff --git a/js/src/vm/StringBuffer.cpp b/js/src/vm/StringBuffer.cpp index ec8592f951..e4f0e4f4d6 100644 --- a/js/src/vm/StringBuffer.cpp +++ b/js/src/vm/StringBuffer.cpp @@ -111,11 +111,17 @@ StringBuffer::finishString() JS_STATIC_ASSERT(JSFatInlineString::MAX_LENGTH_LATIN1 < Latin1CharBuffer::InlineLength); if (isLatin1()) { + if (JSAtom* staticStr = cx->staticStrings().lookup(latin1Chars().begin(), len)) + return staticStr; + if (JSInlineString::lengthFits<Latin1Char>(len)) { mozilla::Range<const Latin1Char> range(latin1Chars().begin(), len); return NewInlineString<CanGC>(cx, range); } } else { + if (JSAtom* staticStr = cx->staticStrings().lookup(twoByteChars().begin(), len)) + return staticStr; + if (JSInlineString::lengthFits<char16_t>(len)) { mozilla::Range<const char16_t> range(twoByteChars().begin(), len); return NewInlineString<CanGC>(cx, range); |