diff options
author | Martok <martok@martoks-place.de> | 2023-06-29 23:07:20 +0200 |
---|---|---|
committer | Martok <martok@martoks-place.de> | 2023-06-30 00:01:34 +0200 |
commit | 2f940bdc9dcbfe83e17ed26c5d1af7fe874c24ac (patch) | |
tree | 2519366eb8057e265339261ab651a8cb5653a703 | |
parent | 6808e659ad137ac63466aad93e406efbf091c077 (diff) | |
download | uxp-2f940bdc9dcbfe83e17ed26c5d1af7fe874c24ac.tar.gz |
Issue #2259 - process Unicode langtags and locale identifiers according to BCP 47
Major spec change: text references are to BCP47 (not the implementing RFCs) and
the single source of truth is now Unicode CLDR.
- Switch from IANA to CLDR for make_unicode
- Update grandfathered tag handling directly in the parser
- Don't support extlang, irregular, privateuse or 4-letter subtags
- Adjust comments to refer to Unicode BCP 47 locale identifiers, remove RFC 5646
- Canonicalize/order langtags correctly
- Tokenize BCP47 in reusable class
Based-on: m-c 1407674(partial), 1451082, 1530320, 1522070, 1531091
-rw-r--r-- | js/src/builtin/RegExp.cpp | 76 | ||||
-rw-r--r-- | js/src/builtin/RegExp.h | 20 | ||||
-rw-r--r-- | js/src/builtin/Utilities.js | 6 | ||||
-rw-r--r-- | js/src/builtin/intl/Collator.js | 151 | ||||
-rw-r--r-- | js/src/builtin/intl/CommonFunctions.js | 1703 | ||||
-rw-r--r-- | js/src/builtin/intl/DateTimeFormat.js | 74 | ||||
-rw-r--r-- | js/src/builtin/intl/LangTagMappingsGenerated.js | 1466 | ||||
-rw-r--r-- | js/src/builtin/intl/NumberFormat.js | 165 | ||||
-rw-r--r-- | js/src/builtin/intl/PluralRules.cpp | 2 | ||||
-rw-r--r-- | js/src/builtin/intl/PluralRules.js | 106 | ||||
-rw-r--r-- | js/src/builtin/intl/make_intl_data.py | 881 | ||||
-rw-r--r-- | js/src/vm/SelfHosting.cpp | 2 |
12 files changed, 3331 insertions, 1321 deletions
diff --git a/js/src/builtin/RegExp.cpp b/js/src/builtin/RegExp.cpp index 46a2862909..f3d34762f6 100644 --- a/js/src/builtin/RegExp.cpp +++ b/js/src/builtin/RegExp.cpp @@ -974,8 +974,7 @@ IsTrailSurrogateWithLeadSurrogate(JSContext* cx, HandleLinearString input, int32 */ static RegExpRunStatus ExecuteRegExp(JSContext* cx, HandleObject regexp, HandleString string, - int32_t lastIndex, - MatchPairs* matches, size_t* endIndex, RegExpStaticsUpdate staticsUpdate) + int32_t lastIndex, MatchPairs* matches, size_t* endIndex) { /* * WARNING: Despite the presence of spec step comment numbers, this @@ -990,14 +989,9 @@ ExecuteRegExp(JSContext* cx, HandleObject regexp, HandleString string, if (!RegExpObject::getShared(cx, reobj, &re)) return RegExpRunStatus_Error; - RegExpStatics* res; - if (staticsUpdate == UpdateRegExpStatics) { - res = GlobalObject::getRegExpStatics(cx, cx->global()); - if (!res) - return RegExpRunStatus_Error; - } else { - res = nullptr; - } + RegExpStatics* res = GlobalObject::getRegExpStatics(cx, cx->global()); + if (!res) + return RegExpRunStatus_Error; RootedLinearString input(cx, string->ensureLinear(cx)); if (!input) @@ -1051,15 +1045,14 @@ ExecuteRegExp(JSContext* cx, HandleObject regexp, HandleString string, * steps 3, 9-25, except 12.a.i, 12.c.i.1, 15. */ static bool -RegExpMatcherImpl(JSContext* cx, HandleObject regexp, HandleString string, - int32_t lastIndex, RegExpStaticsUpdate staticsUpdate, MutableHandleValue rval) +RegExpMatcherImpl(JSContext* cx, HandleObject regexp, HandleString string, int32_t lastIndex, + MutableHandleValue rval) { /* Execute regular expression and gather matches. */ ScopedMatchPairs matches(&cx->tempLifoAlloc()); /* Steps 3, 9-14, except 12.a.i, 12.c.i.1. 
*/ - RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, lastIndex, - &matches, nullptr, staticsUpdate); + RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, lastIndex, &matches, nullptr); if (status == RegExpRunStatus_Error) return false; @@ -1099,8 +1092,7 @@ js::RegExpMatcher(JSContext* cx, unsigned argc, Value* vp) return false; /* Steps 3, 9-25, except 12.a.i, 12.c.i.1, 15. */ - return RegExpMatcherImpl(cx, regexp, string, lastIndex, - UpdateRegExpStatics, args.rval()); + return RegExpMatcherImpl(cx, regexp, string, lastIndex, args.rval()); } /* @@ -1123,8 +1115,7 @@ js::RegExpMatcherRaw(JSContext* cx, HandleObject regexp, HandleString input, return false; return CreateRegExpMatchResult(cx, *shared, input, *maybeMatches, output); } - return RegExpMatcherImpl(cx, regexp, input, lastIndex, - UpdateRegExpStatics, output); + return RegExpMatcherImpl(cx, regexp, input, lastIndex, output); } /* @@ -1135,14 +1126,13 @@ js::RegExpMatcherRaw(JSContext* cx, HandleObject regexp, HandleString input, */ static bool RegExpSearcherImpl(JSContext* cx, HandleObject regexp, HandleString string, - int32_t lastIndex, RegExpStaticsUpdate staticsUpdate, int32_t* result) + int32_t lastIndex, int32_t* result) { /* Execute regular expression and gather matches. */ ScopedMatchPairs matches(&cx->tempLifoAlloc()); /* Steps 3, 9-14, except 12.a.i, 12.c.i.1. */ - RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, lastIndex, - &matches, nullptr, staticsUpdate); + RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, lastIndex, &matches, nullptr); if (status == RegExpRunStatus_Error) return false; @@ -1180,7 +1170,7 @@ js::RegExpSearcher(JSContext* cx, unsigned argc, Value* vp) /* Steps 3, 9-25, except 12.a.i, 12.c.i.1, 15. 
*/ int32_t result = 0; - if (!RegExpSearcherImpl(cx, regexp, string, lastIndex, UpdateRegExpStatics, &result)) + if (!RegExpSearcherImpl(cx, regexp, string, lastIndex, &result)) return false; args.rval().setInt32(result); @@ -1203,23 +1193,7 @@ js::RegExpSearcherRaw(JSContext* cx, HandleObject regexp, HandleString input, *result = CreateRegExpSearchResult(cx, *maybeMatches); return true; } - return RegExpSearcherImpl(cx, regexp, input, lastIndex, - UpdateRegExpStatics, result); -} - -bool -js::regexp_exec_no_statics(JSContext* cx, unsigned argc, Value* vp) -{ - CallArgs args = CallArgsFromVp(argc, vp); - MOZ_ASSERT(args.length() == 2); - MOZ_ASSERT(IsRegExpObject(args[0])); - MOZ_ASSERT(args[1].isString()); - - RootedObject regexp(cx, &args[0].toObject()); - RootedString string(cx, args[1].toString()); - - return RegExpMatcherImpl(cx, regexp, string, 0, - DontUpdateRegExpStatics, args.rval()); + return RegExpSearcherImpl(cx, regexp, input, lastIndex, result); } /* @@ -1245,8 +1219,7 @@ js::RegExpTester(JSContext* cx, unsigned argc, Value* vp) /* Steps 3, 9-14, except 12.a.i, 12.c.i.1. 
*/ size_t endIndex = 0; - RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, lastIndex, - nullptr, &endIndex, UpdateRegExpStatics); + RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, lastIndex, nullptr, &endIndex); if (status == RegExpRunStatus_Error) return false; @@ -1271,8 +1244,7 @@ js::RegExpTesterRaw(JSContext* cx, HandleObject regexp, HandleString input, MOZ_ASSERT(lastIndex >= 0); size_t endIndexTmp = 0; - RegExpRunStatus status = ExecuteRegExp(cx, regexp, input, lastIndex, - nullptr, &endIndexTmp, UpdateRegExpStatics); + RegExpRunStatus status = ExecuteRegExp(cx, regexp, input, lastIndex, nullptr, &endIndexTmp); if (status == RegExpRunStatus_Success) { MOZ_ASSERT(endIndexTmp <= INT32_MAX); @@ -1287,24 +1259,6 @@ js::RegExpTesterRaw(JSContext* cx, HandleObject regexp, HandleString input, return false; } -bool -js::regexp_test_no_statics(JSContext* cx, unsigned argc, Value* vp) -{ - CallArgs args = CallArgsFromVp(argc, vp); - MOZ_ASSERT(args.length() == 2); - MOZ_ASSERT(IsRegExpObject(args[0])); - MOZ_ASSERT(args[1].isString()); - - RootedObject regexp(cx, &args[0].toObject()); - RootedString string(cx, args[1].toString()); - - size_t ignored = 0; - RegExpRunStatus status = ExecuteRegExp(cx, regexp, string, 0, - nullptr, &ignored, DontUpdateRegExpStatics); - args.rval().setBoolean(status == RegExpRunStatus_Success); - return status != RegExpRunStatus_Error; -} - static void GetParen(JSLinearString* matched, const JS::Value& capture, JSSubString* out) { diff --git a/js/src/builtin/RegExp.h b/js/src/builtin/RegExp.h index f66c9b1b81..c0a7d59f77 100644 --- a/js/src/builtin/RegExp.h +++ b/js/src/builtin/RegExp.h @@ -18,10 +18,6 @@ namespace js { JSObject* InitRegExpClass(JSContext* cx, HandleObject obj); -// Whether RegExp statics should be updated with the input and results of a -// regular expression execution. 
-enum RegExpStaticsUpdate { UpdateRegExpStatics, DontUpdateRegExpStatics }; - /* * Legacy behavior of ExecuteRegExp(), which is baked into the JSAPI. * @@ -72,22 +68,6 @@ intrinsic_GetStringDataProperty(JSContext* cx, unsigned argc, Value* vp); */ /* - * Behaves like regexp.exec(string), but doesn't set RegExp statics. - * - * Usage: match = regexp_exec_no_statics(regexp, string) - */ -extern MOZ_MUST_USE bool -regexp_exec_no_statics(JSContext* cx, unsigned argc, Value* vp); - -/* - * Behaves like regexp.test(string), but doesn't set RegExp statics. - * - * Usage: does_match = regexp_test_no_statics(regexp, string) - */ -extern MOZ_MUST_USE bool -regexp_test_no_statics(JSContext* cx, unsigned argc, Value* vp); - -/* * Behaves like RegExp(pattern, flags). * |pattern| should be a RegExp object, |flags| should be a raw integer value. * Must be called without |new|. diff --git a/js/src/builtin/Utilities.js b/js/src/builtin/Utilities.js index 09c15957c6..51c5a574fd 100644 --- a/js/src/builtin/Utilities.js +++ b/js/src/builtin/Utilities.js @@ -80,12 +80,6 @@ MakeConstructible(Record, {}); /********** Abstract operations defined in ECMAScript Language Specification **********/ -/* Spec: ECMAScript Language Specification, 5.1 edition, 8.12.6 and 11.8.7 */ -function HasProperty(o, p) { - return p in o; -} - - /* Spec: ECMAScript Language Specification, 5.1 edition, 9.2 and 11.4.9 */ function ToBoolean(v) { return !!v; diff --git a/js/src/builtin/intl/Collator.js b/js/src/builtin/intl/Collator.js index ee6ea9a9b8..dffadab7c5 100644 --- a/js/src/builtin/intl/Collator.js +++ b/js/src/builtin/intl/Collator.js @@ -6,18 +6,6 @@ /**
- * Mapping from Unicode extension keys for collation to options properties,
- * their types and permissible values.
- *
- * Spec: ECMAScript Internationalization API Specification, 10.1.1.
- */
-var collatorKeyMappings = {
- kn: {property: "numeric", type: "boolean"},
- kf: {property: "caseFirst", type: "string", values: ["upper", "lower", "false"]}
-};
-
-
-/**
* Compute an internal properties object from |lazyCollatorData|.
*/
function resolveCollatorInternals(lazyCollatorData)
@@ -26,60 +14,49 @@ function resolveCollatorInternals(lazyCollatorData) var internalProps = std_Object_create(null);
- // Step 7.
- internalProps.usage = lazyCollatorData.usage;
-
- // Step 8.
var Collator = collatorInternalProperties;
- // Step 9.
+ // Step 5.
+ internalProps.usage = lazyCollatorData.usage;
+
+ // Steps 6-7.
var collatorIsSorting = lazyCollatorData.usage === "sort";
var localeData = collatorIsSorting
? Collator.sortLocaleData
: Collator.searchLocaleData;
// Compute effective locale.
- // Step 14.
+ // Step 16.
var relevantExtensionKeys = Collator.relevantExtensionKeys;
- // Step 15.
+ // Step 17.
var r = ResolveLocale(callFunction(Collator.availableLocales, Collator),
lazyCollatorData.requestedLocales,
lazyCollatorData.opt,
relevantExtensionKeys,
localeData);
- // Step 16.
+ // Step 18.
internalProps.locale = r.locale;
- // Steps 17-19.
- var key, property, value, mapping;
- var i = 0, len = relevantExtensionKeys.length;
- while (i < len) {
- // Step 19.a.
- key = relevantExtensionKeys[i];
- if (key === "co") {
- // Step 19.b.
- property = "collation";
- value = r.co === null ? "default" : r.co;
- } else {
- // Step 19.c.
- mapping = collatorKeyMappings[key];
- property = mapping.property;
- value = r[key];
- if (mapping.type === "boolean")
- value = value === "true";
- }
+ // Step 19.
+ var collation = r.co;
+
+ // Step 20.
+ if (collation === null)
+ collation = "default";
- // Step 19.d.
- internalProps[property] = value;
+ // Step 21.
+ internalProps.collation = collation;
- // Step 19.e.
- i++;
- }
+ // Step 22.
+ internalProps.numeric = r.kn === "true";
+
+ // Step 23.
+ internalProps.caseFirst = r.kf;
// Compute remaining collation options.
- // Steps 21-22.
+ // Step 25.
var s = lazyCollatorData.rawSensitivity;
if (s === undefined) {
// In theory the default sensitivity for the "search" collator is
@@ -88,14 +65,13 @@ function resolveCollatorInternals(lazyCollatorData) // both collation modes.
s = "variant";
}
+
+ // Step 26.
internalProps.sensitivity = s;
- // Step 24.
+ // Step 28.
internalProps.ignorePunctuation = lazyCollatorData.ignorePunctuation;
- // Step 25.
- internalProps.boundFormat = undefined;
-
// The caller is responsible for associating |internalProps| with the right
// object using |setInternalProperties|.
return internalProps;
@@ -139,9 +115,6 @@ function InitializeCollator(collator, locales, options) { assert(IsObject(collator), "InitializeCollator called with non-object");
assert(IsCollator(collator), "InitializeCollator called with non-Collator");
- // Steps 1-2 (These steps are no longer required and should be removed
- // from the spec; https://github.com/tc39/ecma402/issues/115).;
-
// Lazy Collator data has the following structure:
//
// {
@@ -162,11 +135,11 @@ function InitializeCollator(collator, locales, options) { // subset of them.
var lazyCollatorData = std_Object_create(null);
- // Step 3.
+ // Step 1.
var requestedLocales = CanonicalizeLocaleList(locales);
lazyCollatorData.requestedLocales = requestedLocales;
- // Steps 4-5.
+ // Steps 2-3.
//
// If we ever need more speed here at startup, we should try to detect the
// case where |options === undefined| and Object.prototype hasn't been
@@ -179,38 +152,39 @@ function InitializeCollator(collator, locales, options) { options = ToObject(options);
// Compute options that impact interpretation of locale.
- // Step 6.
+ // Step 4.
var u = GetOption(options, "usage", "string", ["sort", "search"], "sort");
lazyCollatorData.usage = u;
- // Step 10.
+ // Step 8.
var opt = new Record();
lazyCollatorData.opt = opt;
- // Steps 11-12.
+ // Steps 9-10.
var matcher = GetOption(options, "localeMatcher", "string", ["lookup", "best fit"], "best fit");
opt.localeMatcher = matcher;
- // Step 13, unrolled.
+ // Steps 11-13.
var numericValue = GetOption(options, "numeric", "boolean", undefined, undefined);
if (numericValue !== undefined)
numericValue = numericValue ? 'true' : 'false';
opt.kn = numericValue;
+ // Steps 14-15.
var caseFirstValue = GetOption(options, "caseFirst", "string", ["upper", "lower", "false"], undefined);
opt.kf = caseFirstValue;
// Compute remaining collation options.
- // Step 20.
+ // Step 24.
var s = GetOption(options, "sensitivity", "string",
["base", "accent", "case", "variant"], undefined);
lazyCollatorData.rawSensitivity = s;
- // Step 23.
+ // Step 27.
var ip = GetOption(options, "ignorePunctuation", "boolean", undefined, false);
lazyCollatorData.ignorePunctuation = ip;
- // Step 26.
+ // Step 29.
//
// We've done everything that must be done now: mark the lazy data as fully
// computed and install it.
@@ -228,9 +202,14 @@ function InitializeCollator(collator, locales, options) { function Intl_Collator_supportedLocalesOf(locales /*, options*/) {
var options = arguments.length > 1 ? arguments[1] : undefined;
+ // Step 1.
var availableLocales = callFunction(collatorInternalProperties.availableLocales,
collatorInternalProperties);
+
+ // Step 2.
var requestedLocales = CanonicalizeLocaleList(locales);
+
+ // Step 3.
return SupportedLocales(availableLocales, requestedLocales, options);
}
@@ -353,9 +332,9 @@ function collatorSearchLocaleData() { /**
- * Function to be bound and returned by Intl.Collator.prototype.format.
+ * Function to be bound and returned by Intl.Collator.prototype.compare.
*
- * Spec: ECMAScript Internationalization API Specification, 12.3.2.
+ * Spec: ECMAScript Internationalization API Specification, 10.3.3.1.
*/
function collatorCompareToBind(x, y) {
// Steps 1.a.i-ii implemented by ECMAScript declaration binding instantiation,
@@ -375,26 +354,28 @@ function collatorCompareToBind(x, y) { * than 0 if x > y according to the sort order for the locale and collation
* options of this Collator object.
*
- * Spec: ECMAScript Internationalization API Specification, 10.3.2.
+ * Spec: ECMAScript Internationalization API Specification, 10.3.3.
*/
function Intl_Collator_compare_get() {
- // Check "this Collator object" per introduction of section 10.3.
- if (!IsObject(this) || !IsCollator(this))
+ // Step 1.
+ var collator = this;
+
+ // Steps 2-3.
+ if (!IsObject(collator) || !IsCollator(collator))
ThrowTypeError(JSMSG_INTL_OBJECT_NOT_INITED, "Collator", "compare", "Collator");
- var internals = getCollatorInternals(this);
+ var internals = getCollatorInternals(collator);
- // Step 1.
+ // Step 4.
if (internals.boundCompare === undefined) {
- // Step 1.a.
- var F = collatorCompareToBind;
+ // Steps 4.a-b.
+ var F = callFunction(FunctionBind, collatorCompareToBind, collator);
- // Steps 1.b-d.
- var bc = callFunction(FunctionBind, F, this);
- internals.boundCompare = bc;
+ // Step 4.c.
+ internals.boundCompare = F;
}
- // Step 2.
+ // Step 5.
return internals.boundCompare;
}
_SetCanonicalName(Intl_Collator_compare_get, "get compare");
@@ -403,28 +384,30 @@ _SetCanonicalName(Intl_Collator_compare_get, "get compare"); /**
* Returns the resolved options for a Collator object.
*
- * Spec: ECMAScript Internationalization API Specification, 10.3.3 and 10.4.
+ * Spec: ECMAScript Internationalization API Specification, 10.3.4.
*/
function Intl_Collator_resolvedOptions() {
- // Check "this Collator object" per introduction of section 10.3.
- if (!IsObject(this) || !IsCollator(this))
+ // Step 1.
+ var collator = this;
+
+ // Steps 2-3.
+ if (!IsObject(collator) || !IsCollator(collator))
ThrowTypeError(JSMSG_INTL_OBJECT_NOT_INITED, "Collator", "resolvedOptions", "Collator");
- var internals = getCollatorInternals(this);
+ var internals = getCollatorInternals(collator);
+ // Steps 4-5.
var result = {
locale: internals.locale,
usage: internals.usage,
sensitivity: internals.sensitivity,
- ignorePunctuation: internals.ignorePunctuation
+ ignorePunctuation: internals.ignorePunctuation,
+ collation: internals.collation,
+ numeric: internals.numeric,
+ caseFirst: internals.caseFirst,
};
- var relevantExtensionKeys = collatorInternalProperties.relevantExtensionKeys;
- for (var i = 0; i < relevantExtensionKeys.length; i++) {
- var key = relevantExtensionKeys[i];
- var property = (key === "co") ? "collation" : collatorKeyMappings[key].property;
- _DefineDataProperty(result, property, internals[property]);
- }
+ // Step 6.
return result;
}
diff --git a/js/src/builtin/intl/CommonFunctions.js b/js/src/builtin/intl/CommonFunctions.js index cf5a615721..36b2bec9b2 100644 --- a/js/src/builtin/intl/CommonFunctions.js +++ b/js/src/builtin/intl/CommonFunctions.js @@ -14,35 +14,70 @@ function hasOwn(propName, object) { } /** - * Holder object for encapsulating regexp instances. - * - * Regular expression instances should be created after the initialization of - * self-hosted global. - */ -var internalIntlRegExps = std_Object_create(null); -internalIntlRegExps.unicodeLocaleExtensionSequenceRE = null; -internalIntlRegExps.languageTagRE = null; -internalIntlRegExps.duplicateVariantRE = null; -internalIntlRegExps.duplicateSingletonRE = null; -internalIntlRegExps.isWellFormedCurrencyCodeRE = null; -internalIntlRegExps.currencyDigitsRE = null; - -/** - * Regular expression matching a "Unicode locale extension sequence", which the + * Returns the start index of a "Unicode locale extension sequence", which the * specification defines as: "any substring of a language tag that starts with * a separator '-' and the singleton 'u' and includes the maximum sequence of * following non-singleton subtags and their preceding '-' separators." * * Alternatively, this may be defined as: the components of a language tag that - * match the extension production in RFC 5646, where the singleton component is - * "u". + * match the `unicode_locale_extensions` production in UTS 35. * * Spec: ECMAScript Internationalization API Specification, 6.2.1. */ -function getUnicodeLocaleExtensionSequenceRE() { - return internalIntlRegExps.unicodeLocaleExtensionSequenceRE || - (internalIntlRegExps.unicodeLocaleExtensionSequenceRE = - RegExpCreate("-u(?:-[a-z0-9]{2,8})+")); +function startOfUnicodeExtensions(locale) { + assert(typeof locale === "string", "locale is a string"); + + // Search for "-u-" marking the start of a Unicode extension sequence. 
+ var start = callFunction(std_String_indexOf, locale, "-u-"); + if (start < 0) + return -1; + + // And search for "-x-" marking the start of any privateuse component to + // handle the case when "-u-" was only found within a privateuse subtag. + var privateExt = callFunction(std_String_indexOf, locale, "-x-"); + if (privateExt >= 0 && privateExt < start) + return -1; + + return start; +} + +/** + * Returns the end index of a Unicode locale extension sequence. + */ +function endOfUnicodeExtensions(locale, start) { + assert(typeof locale === "string", "locale is a string"); + assert(IsStructurallyValidLanguageTag(locale), "locale is a language tag"); + assert(CanonicalizeLanguageTag(locale) === locale, "locale is a canonicalized language tag"); + assert(0 <= start && start < locale.length, "start is an index into locale"); + assert(Substring(locale, start, 3) === "-u-", "start points to Unicode extension sequence"); + + #define HYPHEN 0x2D + assert(std_String_fromCharCode(HYPHEN) === "-", + "code unit constant should match the expected character"); + + // Search for the start of the next singleton or privateuse subtag. + // + // Begin searching after the smallest possible Unicode locale extension + // sequence, namely |"-u-" 2alphanum|. End searching once the remaining + // characters can't fit the smallest possible singleton or privateuse + // subtag, namely |"-x-" alphanum|. Note the reduced end-limit means + // indexing inside the loop is always in-range. + for (var i = start + 5, end = locale.length - 4; i <= end; i++) { + if (callFunction(std_String_charCodeAt, locale, i) !== HYPHEN) + continue; + if (callFunction(std_String_charCodeAt, locale, i + 2) === HYPHEN) + return i; + + // Skip over (i + 1) and (i + 2) because we've just verified they + // aren't "-", so the next possible delimiter can only be at (i + 3). 
+ i += 2; + } + + #undef HYPHEN + + // If no singleton or privateuse subtag was found, the Unicode extension + // sequence extends until the end of the string. + return locale.length; } @@ -50,226 +85,602 @@ function getUnicodeLocaleExtensionSequenceRE() { * Removes Unicode locale extension sequences from the given language tag. */ function removeUnicodeExtensions(locale) { - // A wholly-privateuse locale has no extension sequences. - if (callFunction(std_String_startsWith, locale, "x-")) + var start = startOfUnicodeExtensions(locale); + if (start < 0) return locale; - // Otherwise, split on "-x-" marking the start of any privateuse component. - // Replace Unicode locale extension sequences in the left half, and return - // the concatenation. - var pos = callFunction(std_String_indexOf, locale, "-x-"); - if (pos < 0) - pos = locale.length; - - var left = callFunction(String_substring, locale, 0, pos); - var right = callFunction(String_substring, locale, pos); - - var extensions; - var unicodeLocaleExtensionSequenceRE = getUnicodeLocaleExtensionSequenceRE(); - while ((extensions = regexp_exec_no_statics(unicodeLocaleExtensionSequenceRE, left)) !== null) { - left = StringReplaceString(left, extensions[0], ""); - unicodeLocaleExtensionSequenceRE.lastIndex = 0; - } + var end = endOfUnicodeExtensions(locale, start); + var left = Substring(locale, 0, start); + var right = Substring(locale, end, locale.length - end); var combined = left + right; - assert(IsStructurallyValidLanguageTag(combined), "recombination produced an invalid language tag"); - assert(function() { - var uindex = callFunction(std_String_indexOf, combined, "-u-"); - if (uindex < 0) - return true; - var xindex = callFunction(std_String_indexOf, combined, "-x-"); - return xindex > 0 && xindex < uindex; - }(), "recombination failed to remove all Unicode locale extension sequences"); + + assert(IsStructurallyValidLanguageTag(combined), + "recombination produced an invalid language tag"); + 
assert(startOfUnicodeExtensions(combined) < 0, + "recombination failed to remove all Unicode locale extension sequences"); return combined; } - /** - * Regular expression defining BCP 47 language tags. - * - * Spec: RFC 5646 section 2.1. + * Returns Unicode locale extension sequences from the given language tag. */ -function getLanguageTagRE() { - if (internalIntlRegExps.languageTagRE) - return internalIntlRegExps.languageTagRE; - - // RFC 5234 section B.1 - // ALPHA = %x41-5A / %x61-7A ; A-Z / a-z - var ALPHA = "[a-zA-Z]"; - // DIGIT = %x30-39 - // ; 0-9 - var DIGIT = "[0-9]"; - - // RFC 5646 section 2.1 - // alphanum = (ALPHA / DIGIT) ; letters and numbers - var alphanum = "(?:" + ALPHA + "|" + DIGIT + ")"; - // regular = "art-lojban" ; these tags match the 'langtag' - // / "cel-gaulish" ; production, but their subtags - // / "no-bok" ; are not extended language - // / "no-nyn" ; or variant subtags: their meaning - // / "zh-guoyu" ; is defined by their registration - // / "zh-hakka" ; and all of these are deprecated - // / "zh-min" ; in favor of a more modern - // / "zh-min-nan" ; subtag or sequence of subtags - // / "zh-xiang" - var regular = "(?:art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)"; - // irregular = "en-GB-oed" ; irregular tags do not match - // / "i-ami" ; the 'langtag' production and - // / "i-bnn" ; would not otherwise be - // / "i-default" ; considered 'well-formed' - // / "i-enochian" ; These tags are all valid, - // / "i-hak" ; but most are deprecated - // / "i-klingon" ; in favor of more modern - // / "i-lux" ; subtags or subtag - // / "i-mingo" ; combination - // / "i-navajo" - // / "i-pwn" - // / "i-tao" - // / "i-tay" - // / "i-tsu" - // / "sgn-BE-FR" - // / "sgn-BE-NL" - // / "sgn-CH-DE" - var irregular = "(?:en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)"; - // grandfathered = irregular ; non-redundant tags 
registered - // / regular ; during the RFC 3066 era - var grandfathered = "(?:" + irregular + "|" + regular + ")"; - // privateuse = "x" 1*("-" (1*8alphanum)) - var privateuse = "(?:x(?:-[a-z0-9]{1,8})+)"; - // singleton = DIGIT ; 0 - 9 - // / %x41-57 ; A - W - // / %x59-5A ; Y - Z - // / %x61-77 ; a - w - // / %x79-7A ; y - z - var singleton = "(?:" + DIGIT + "|[A-WY-Za-wy-z])"; - // extension = singleton 1*("-" (2*8alphanum)) - var extension = "(?:" + singleton + "(?:-" + alphanum + "{2,8})+)"; - // variant = 5*8alphanum ; registered variants - // / (DIGIT 3alphanum) - var variant = "(?:" + alphanum + "{5,8}|(?:" + DIGIT + alphanum + "{3}))"; - // region = 2ALPHA ; ISO 3166-1 code - // / 3DIGIT ; UN M.49 code - var region = "(?:" + ALPHA + "{2}|" + DIGIT + "{3})"; - // script = 4ALPHA ; ISO 15924 code - var script = "(?:" + ALPHA + "{4})"; - // extlang = 3ALPHA ; selected ISO 639 codes - // *2("-" 3ALPHA) ; permanently reserved - var extlang = "(?:" + ALPHA + "{3}(?:-" + ALPHA + "{3}){0,2})"; - // language = 2*3ALPHA ; shortest ISO 639 code - // ["-" extlang] ; sometimes followed by - // ; extended language subtags - // / 4ALPHA ; or reserved for future use - // / 5*8ALPHA ; or registered language subtag - var language = "(?:" + ALPHA + "{2,3}(?:-" + extlang + ")?|" + ALPHA + "{4}|" + ALPHA + "{5,8})"; - // langtag = language - // ["-" script] - // ["-" region] - // *("-" variant) - // *("-" extension) - // ["-" privateuse] - var langtag = language + "(?:-" + script + ")?(?:-" + region + ")?(?:-" + - variant + ")*(?:-" + extension + ")*(?:-" + privateuse + ")?"; - // Language-Tag = langtag ; normal language tags - // / privateuse ; private use tag - // / grandfathered ; grandfathered tags - var languageTag = "^(?:" + langtag + "|" + privateuse + "|" + grandfathered + ")$"; - - // Language tags are case insensitive (RFC 5646 section 2.1.1). 
- return (internalIntlRegExps.languageTagRE = RegExpCreate(languageTag, "i")); +function getUnicodeExtensions(locale) { + var start = startOfUnicodeExtensions(locale); + assert(start >= 0, "start of Unicode extension sequence not found"); + var end = endOfUnicodeExtensions(locale, start); + + return Substring(locale, start, end - start); } +// The three possible token type bits. Expressed as #defines to avoid +// extra named lookups in the interpreter/jits. +#define NONE 0b00 +#define ALPHA 0b01 +#define DIGIT 0b10 + +// Constants for code units used below. +#define HYPHEN 0x2D +#define DIGIT_ZERO 0x30 +#define DIGIT_NINE 0x39 +#define UPPER_A 0x41 +#define UPPER_Z 0x5A +#define LOWER_A 0x61 +#define LOWER_T 0x74 +#define LOWER_U 0x75 +#define LOWER_X 0x78 +#define LOWER_Z 0x7A + +// The requirement to use callFunction() for method calls makes the parser +// harder to read. Use macros for the rescue. + +// Reads the next token. +#define NEXT_TOKEN_OR_RETURN_NULL(ts) \ + if (!callFunction(ts.nextToken, ts)) \ + return null; + +#define NEXT_TOKEN_OR_ASSERT(ts) \ + if (!callFunction(ts.nextToken, ts)) \ + assert(false, "unexpected invalid subtag"); -function getDuplicateVariantRE() { - if (internalIntlRegExps.duplicateVariantRE) - return internalIntlRegExps.duplicateVariantRE; - - // RFC 5234 section B.1 - // ALPHA = %x41-5A / %x61-7A ; A-Z / a-z - var ALPHA = "[a-zA-Z]"; - // DIGIT = %x30-39 - // ; 0-9 - var DIGIT = "[0-9]"; - - // RFC 5646 section 2.1 - // alphanum = (ALPHA / DIGIT) ; letters and numbers - var alphanum = "(?:" + ALPHA + "|" + DIGIT + ")"; - // variant = 5*8alphanum ; registered variants - // / (DIGIT 3alphanum) - var variant = "(?:" + alphanum + "{5,8}|(?:" + DIGIT + alphanum + "{3}))"; - - // Match a langtag that contains a duplicate variant. 
- var duplicateVariant = - // Match everything in a langtag prior to any variants, and maybe some - // of the variants as well (which makes this pattern inefficient but - // not wrong, for our purposes); - "(?:" + alphanum + "{2,8}-)+" + - // a variant, parenthesised so that we can refer back to it later; - "(" + variant + ")-" + - // zero or more subtags at least two characters long (thus stopping - // before extension and privateuse components); - "(?:" + alphanum + "{2,8}-)*" + - // and the same variant again - "\\1" + - // ...but not followed by any characters that would turn it into a - // different subtag. - "(?!" + alphanum + ")"; - - // Language tags are case insensitive (RFC 5646 section 2.1.1). Using - // character classes covering both upper- and lower-case characters nearly - // addresses this -- but for the possibility of variant repetition with - // differing case, e.g. "en-variant-Variant". Use a case-insensitive - // regular expression to address this. (Note that there's no worry about - // case transformation accepting invalid characters here: users have - // already verified the string is alphanumeric Latin plus "-".) - return (internalIntlRegExps.duplicateVariantRE = RegExpCreate(duplicateVariant, "i")); +// Assigns the current subtag part transformed to lower-case to the target. +#define SUBTAG_VAR_OR_RETURN_NULL(ts, target) \ + { \ + target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \ + NEXT_TOKEN_OR_RETURN_NULL(ts); \ + } + +// Assigns the current subtag part transformed to lower-case to the target. +#define SUBTAG_VAR_OR_ASSERT(ts, target) \ + { \ + target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \ + NEXT_TOKEN_OR_ASSERT(ts) \ + } + +/** + * Tokenizer for Unicode BCP 47 locale identifiers. + */ +function BCP47TokenStream(locale) { + this.locale = locale; + + // Locale identifiers are compared and processed case-insensitively, so + // technically it's not necessary to adjust case. 
But for easier processing, + // and because the canonical form for most subtags is lower case, we start + // with lower case for all. + // + // Note that the tokenizer function keeps using the original input string + // to properly detect non-ASCII characters. The lower-case string can't be + // used to detect those characters, because some non-ASCII characters + // lower-case map into ASCII characters, e.g. U+212A (KELVIN SIGN) lower- + // case maps to U+006B (LATIN SMALL LETTER K). + this.localeLowercase = callFunction(std_String_toLowerCase, locale); + + // Current parse index in |locale|. + this.index = 0; + + // The current token type, its start index, and its length. + this.token = NONE; + this.tokenStart = 0; + this.tokenLength = 0; + + assert(std_String_fromCharCode(HYPHEN) === "-" && + std_String_fromCharCode(DIGIT_ZERO) === "0" && + std_String_fromCharCode(DIGIT_NINE) === "9" && + std_String_fromCharCode(UPPER_A) === "A" && + std_String_fromCharCode(UPPER_Z) === "Z" && + std_String_fromCharCode(LOWER_A) === "a" && + std_String_fromCharCode(LOWER_T) === "t" && + std_String_fromCharCode(LOWER_U) === "u" && + std_String_fromCharCode(LOWER_X) === "x" && + std_String_fromCharCode(LOWER_Z) === "z", + "code unit constants should match the expected characters"); } +MakeConstructible(BCP47TokenStream, { + __proto__: null, + + // Reads the next token, returns |false| if an illegal character was found, + // otherwise returns |true|. + // + // eslint-disable-next-line object-shorthand + nextToken: function() { + var type = NONE; + var {index, locale} = this; + for (var i = index; i < locale.length; i++) { + // UTS 35, section 3.1. 
+ // alpha = [A-Z a-z] ; + // digit = [0-9] ; + var c = callFunction(std_String_charCodeAt, locale, i); + if ((UPPER_A <= c && c <= UPPER_Z) || (LOWER_A <= c && c <= LOWER_Z)) + type |= ALPHA; + else if (DIGIT_ZERO <= c && c <= DIGIT_NINE) + type |= DIGIT; + else if (c === HYPHEN && i > index && i + 1 < locale.length) + break; + else + return false; + } + + this.token = type; + this.tokenStart = index; + this.tokenLength = i - index; + this.index = i + 1; + return true; + }, + + // Returns true if the character at the requested index within the current + // token is a digit. + // + // eslint-disable-next-line object-shorthand + isDigitAt: function(index) { + assert(0 <= index && index < this.tokenLength, + "must be an index into the current token"); + var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart + index); + assert(!(c <= DIGIT_NINE) || c >= DIGIT_ZERO, + "token-start-code-unit <= '9' implies token-start-code-unit is in '0'..'9' " + + "and because all digits are sorted before any letters"); + return c <= DIGIT_NINE; + }, + + // Returns the code unit of the first character at the current token + // position. Always returns the lower-case form of an alphabetical + // character. + // + // eslint-disable-next-line object-shorthand + singletonKey: function() { + assert(this.tokenLength === 1, "token is not a singleton"); + var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart); + assert((DIGIT_ZERO <= c && c <= DIGIT_NINE) || (LOWER_A <= c && c <= LOWER_Z), + "unexpected code unit"); + return c; + }, + + // eslint-disable-next-line object-shorthand + singletonValue: function() { + var singletonStart = this.tokenStart; + var min = callFunction(this.singletonKey, this) === LOWER_X ? 1 : 2; + + NEXT_TOKEN_OR_RETURN_NULL(this); + + // At least one non-singleton subtag must be present. 
+ if (!(min <= this.tokenLength && this.tokenLength <= 8)) + return null; + do { + NEXT_TOKEN_OR_RETURN_NULL(this); + } while (min <= this.tokenLength && this.tokenLength <= 8); + + return callFunction(this.singletonValueAt, this, singletonStart); + }, + + // eslint-disable-next-line object-shorthand + singletonValueAt: function(start) { + // Singletons must be followed by a non-singleton subtag, "en-a-b" is not allowed. + var length = this.tokenStart - 1 - start; + if (length <= 2) + return null; + return Substring(this.localeLowercase, start, length); + } +}); + +/* eslint-disable complexity */ +/** + * Parser for Unicode BCP 47 locale identifiers. + * + * Returns null if |locale| can't be parsed as a `unicode_locale_id`. If the + * input is a grandfathered language tag, it is directly canonicalized to its + * modern form. The returned object has the following structure: + * + * { + * language: `unicode_language_subtag`, + * script: `unicode_script_subtag` / undefined, + * region: `unicode_region_subtag` / undefined, + * variants: array of `unicode_variant_subtag`, + * extensions: array of `extensions`, + * privateuse: `pu_extensions` / undefined, + * } + * + * All locale identifier subtags are returned in their normalized case: + * + * var langtag = parseLanguageTag("en-latn-us"); + * assertEq("en", langtag.language); + * assertEq("Latn", langtag.script); + * assertEq("US", langtag.region); + * + * Spec: https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers + */ +function parseLanguageTag(locale) { + assert(typeof locale === "string", "locale is a string"); + + // unicode_locale_id = unicode_language_id + // extensions* + // pu_extensions? ; + var ts = new BCP47TokenStream(locale); + NEXT_TOKEN_OR_RETURN_NULL(ts); + + var language, script, region, privateuse; + var variants = []; + var extensions = []; + + // unicode_language_id = unicode_language_subtag + // (sep unicode_script_subtag)? + // (sep unicode_region_subtag)? 
+ // (sep unicode_variant_subtag)* ; + // + // sep = "-" + // + // Note: Unicode CLDR locale identifier backward compatibility extensions + // removed from `unicode_language_id`. + + // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + if (ts.token !== ALPHA || ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) { + // Four character language subtags are not allowed in Unicode BCP 47 + // locale identifiers. Also see the comparison to Unicode CLDR locale + // identifiers in <https://unicode.org/reports/tr35/#BCP_47_Conformance>. + return null; + } + assert((2 <= ts.tokenLength && ts.tokenLength <= 3) || + (5 <= ts.tokenLength && ts.tokenLength <= 8), + "language subtags have 2-3 or 5-8 letters"); + + SUBTAG_VAR_OR_RETURN_NULL(ts, language); + + // unicode_script_subtag = alpha{4} ; + if (ts.tokenLength === 4 && ts.token === ALPHA) { + SUBTAG_VAR_OR_RETURN_NULL(ts, script); -function getDuplicateSingletonRE() { - if (internalIntlRegExps.duplicateSingletonRE) - return internalIntlRegExps.duplicateSingletonRE; - - // RFC 5234 section B.1 - // ALPHA = %x41-5A / %x61-7A ; A-Z / a-z - var ALPHA = "[a-zA-Z]"; - // DIGIT = %x30-39 - // ; 0-9 - var DIGIT = "[0-9]"; - - // RFC 5646 section 2.1 - // alphanum = (ALPHA / DIGIT) ; letters and numbers - var alphanum = "(?:" + ALPHA + "|" + DIGIT + ")"; - // singleton = DIGIT ; 0 - 9 - // / %x41-57 ; A - W - // / %x59-5A ; Y - Z - // / %x61-77 ; a - w - // / %x79-7A ; y - z - var singleton = "(?:" + DIGIT + "|[A-WY-Za-wy-z])"; - - // Match a langtag that contains a duplicate singleton. - var duplicateSingleton = - // Match a singleton subtag, parenthesised so that we can refer back to - // it later; - "-(" + singleton + ")-" + - // then zero or more subtags; - "(?:" + alphanum + "+-)*" + - // and the same singleton again - "\\1" + - // ...but not followed by any characters that would turn it into a - // different subtag. - "(?!" 
+ alphanum + ")"; - - // Language tags are case insensitive (RFC 5646 section 2.1.1). Using - // character classes covering both upper- and lower-case characters nearly - // addresses this -- but for the possibility of singleton repetition with - // differing case, e.g. "en-u-foo-U-foo". Use a case-insensitive regular - // expression to address this. (Note that there's no worry about case - // transformation accepting invalid characters here: users have already - // verified the string is alphanumeric Latin plus "-".) - return (internalIntlRegExps.duplicateSingletonRE = RegExpCreate(duplicateSingleton, "i")); + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + script = callFunction(std_String_toUpperCase, script[0]) + + Substring(script, 1, script.length - 1); + } + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + if ((ts.tokenLength === 2 && ts.token === ALPHA) || + (ts.tokenLength === 3 && ts.token === DIGIT)) + { + SUBTAG_VAR_OR_RETURN_NULL(ts, region); + + // Region codes need to be in upper-case. "bu" -> "BU" + region = callFunction(std_String_toUpperCase, region); + } + + // unicode_variant_subtag = (alphanum{5,8} + // | digit alphanum{3}) ; + // + // alphanum = [0-9 A-Z a-z] ; + while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || + (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) + { + // Locale identifiers are case insensitive (UTS 35, section 3.2). + // All seen variants are compared ignoring case differences by + // using the lower-case form. This allows to properly detect and + // reject variant repetitions with differing case, e.g. + // "en-variant-Variant". + var variant; + SUBTAG_VAR_OR_RETURN_NULL(ts, variant); + + // Reject the Locale identifier if a duplicate variant was found. 
+ // + // This linear-time verification step means the whole variant + // subtag checking is potentially quadratic, but we're okay doing + // that because language tags are unlikely to be deliberately + // pathological. + if (callFunction(ArrayIndexOf, variants, variant) !== -1) + return null; + _DefineDataProperty(variants, variants.length, variant); + } + + // extensions = unicode_locale_extensions + // | transformed_extensions + // | other_extensions ; + // + // unicode_locale_extensions = sep [uU] + // ((sep keyword)+ + // |(sep attribute)+ (sep keyword)*) ; + // + // transformed_extensions = sep [tT] + // ((sep tlang (sep tfield)*) + // |(sep tfield)+) ; + // + // other_extensions = [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + // + // keyword = key (sep type)? ; + // + // key = alphanum alpha ; + // + // type = alphanum{3,8} (sep alphanum{3,8})* ; + // + // attribute = alphanum{3,8} ; + // + // tlang = unicode_language_subtag + // (sep unicode_script_subtag)? + // (sep unicode_region_subtag)? + // (sep unicode_variant_subtag)* ; + // + // tfield = tkey tvalue; + // + // tkey = alpha digit ; + // + // tvalue = (sep alphanum{3,8})+ ; + var seenSingletons = []; + while (ts.tokenLength === 1) { + var singleton = callFunction(ts.singletonKey, ts); + if (singleton === LOWER_X) + break; + + // Locale identifiers are case insensitive (UTS 35, section 3.2). + // Ensure |singletonKey()| does not return the code unit of an + // upper-case character, so we can properly detect and reject + // singletons with different case, e.g. "en-u-foo-U-foo". + assert(!(UPPER_A <= singleton && singleton <= UPPER_Z), + "unexpected upper-case code unit"); + + // Reject the input if a duplicate singleton was found. + // + // Similar to the variant validation step this check is O(n**2), + // but given that there are only 35 possible singletons the + // quadratic runtime is negligible. 
+ if (callFunction(ArrayIndexOf, seenSingletons, singleton) !== -1) + return null; + _DefineDataProperty(seenSingletons, seenSingletons.length, singleton); + + var extension; + if (singleton === LOWER_U) { + var extensionStart = ts.tokenStart; + NEXT_TOKEN_OR_RETURN_NULL(ts); + + while (2 <= ts.tokenLength && ts.tokenLength <= 8) { + // `key` doesn't allow a digit as its second character. + if (ts.tokenLength === 2 && callFunction(ts.isDigitAt, ts, 1)) + return null; + NEXT_TOKEN_OR_RETURN_NULL(ts); + } + extension = callFunction(ts.singletonValueAt, ts, extensionStart); + } else if (singleton === LOWER_T) { + var extensionStart = ts.tokenStart; + NEXT_TOKEN_OR_RETURN_NULL(ts); + + // `tfield` starts with `tkey`, which in turn is `alpha digit`, so + // an alpha-only token must be a `tlang`. + if (ts.token === ALPHA) { + // `unicode_language_subtag` + if (ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) + return null; + NEXT_TOKEN_OR_RETURN_NULL(ts); + + // `unicode_script_subtag` (optional) + if (ts.tokenLength === 4 && ts.token === ALPHA) { + NEXT_TOKEN_OR_RETURN_NULL(ts); + } + + // `unicode_region_subtag` (optional) + if ((ts.tokenLength === 2 && ts.token === ALPHA) || + (ts.tokenLength === 3 && ts.token === DIGIT)) + { + NEXT_TOKEN_OR_RETURN_NULL(ts); + } + + // `unicode_variant_subtag` (optional) + while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || + (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) + { + NEXT_TOKEN_OR_RETURN_NULL(ts); + } + } + + // Trailing `tfield` subtags. + while (ts.tokenLength === 2) { + // `tkey` is `alpha digit`. + if (callFunction(ts.isDigitAt, ts, 0) || + !callFunction(ts.isDigitAt, ts, 1)) + { + return null; + } + NEXT_TOKEN_OR_RETURN_NULL(ts); + + // `tfield` requires at least one `tvalue`. 
+ if (!(3 <= ts.tokenLength && ts.tokenLength <= 8)) + return null; + do { + NEXT_TOKEN_OR_RETURN_NULL(ts); + } while (3 <= ts.tokenLength && ts.tokenLength <= 8); + } + extension = callFunction(ts.singletonValueAt, ts, extensionStart); + } else { + extension = callFunction(ts.singletonValue, ts); + } + if (!extension) + return null; + + _DefineDataProperty(extensions, extensions.length, extension); + } + + // Trailing pu_extensions component of the unicode_locale_id production. + // + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + if (ts.tokenLength === 1 && callFunction(ts.singletonKey, ts) === LOWER_X) { + privateuse = callFunction(ts.singletonValue, ts); + if (!privateuse) + return null; + } + + // Reject the input if it couldn't be parsed completely. + if (ts.token !== NONE) + return null; + + var tagObj = { + language, + script, + region, + variants, + extensions, + privateuse, + }; + + // Handle grandfathered tags right away, so we don't need to have extra + // paths for grandfathered tags later on. + // + // grandfathered = "art-lojban" ; non-redundant tags registered + // / "cel-gaulish" ; during the RFC 3066 era + // / "zh-guoyu" ; these tags match the 'langtag' + // / "zh-hakka" ; production, but their subtags + // / "zh-xiang" ; are not extended language + // ; or variant subtags: their meaning + // ; is defined by their registration + // ; and all of these are deprecated + // ; in favor of a more modern + // ; subtag or sequence of subtags + if (hasOwn(ts.localeLowercase, grandfatheredMappings)) + updateGrandfatheredMappings(tagObj); + + // Return if the complete input was successfully parsed. + return tagObj; } +/** + * Return the locale and fields components of the given valid Transform + * extension subtag. 
+ */ +function TransformExtensionComponents(extension) { + assert(typeof extension === "string", "extension is a String value"); + assert(callFunction(std_String_startsWith, extension, "t-"), + "extension starts with 't-'"); + + var ts = new BCP47TokenStream(Substring(extension, 2, extension.length - 2)); + NEXT_TOKEN_OR_ASSERT(ts); + + // `tfield` starts with `tkey`, which in turn is `alpha digit`, so + // an alpha-only token must be a `tlang`. + var localeObj; + if (ts.token === ALPHA) { + // `unicode_language_subtag` + assert((2 <= ts.tokenLength && ts.tokenLength <= 3) || + (5 <= ts.tokenLength && ts.tokenLength <= 8), + "language subtags have 2-3 or 5-8 letters"); + + var language; + SUBTAG_VAR_OR_ASSERT(ts, language); + + // unicode_script_subtag = alpha{4} ; + var script; + if (ts.tokenLength === 4 && ts.token === ALPHA) { + SUBTAG_VAR_OR_ASSERT(ts, script); + + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + script = callFunction(std_String_toUpperCase, script[0]) + + Substring(script, 1, script.length - 1); + } + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + var region; + if ((ts.tokenLength === 2 && ts.token === ALPHA) || + (ts.tokenLength === 3 && ts.token === DIGIT)) + { + SUBTAG_VAR_OR_ASSERT(ts, region); + + // Region codes need to be in upper-case. "bu" -> "BU" + region = callFunction(std_String_toUpperCase, region); + } + + // unicode_variant_subtag = (alphanum{5,8} + // | digit alphanum{3}) ; + // + // alphanum = [0-9 A-Z a-z] ; + var variants = []; + while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || + (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) + { + var variant; + SUBTAG_VAR_OR_ASSERT(ts, variant); + + _DefineDataProperty(variants, variants.length, variant); + } + + localeObj = { + language, + script, + region, + variants, + extensions: [], + privateuse: undefined, + }; + } + + // Trailing `tfield` subtags. 
(Any other trailing subtags are an error, + // because we're guaranteed to only see a valid transform extension here.) + var fields = []; + while (ts.tokenLength === 2) { + // `tkey` is `alpha digit`. + assert(!callFunction(ts.isDigitAt, ts, 0) && callFunction(ts.isDigitAt, ts, 1), + "unexpected invalid tkey subtag"); + + var key; + SUBTAG_VAR_OR_ASSERT(ts, key); + + // `tfield` requires at least one `tvalue`. + assert(3 <= ts.tokenLength && ts.tokenLength <= 8, + "unexpected invalid tvalue subtag"); + + var value; + SUBTAG_VAR_OR_ASSERT(ts, value); + + while (3 <= ts.tokenLength && ts.tokenLength <= 8) { + var part; + SUBTAG_VAR_OR_ASSERT(ts, part); + value += "-" + part; + } + + _DefineDataProperty(fields, fields.length, {key, value}); + } + + assert(ts.token === NONE, + "unexpected trailing characters in promised-to-be-valid transform extension"); + + return {locale: localeObj, fields}; +} +/* eslint-enable complexity */ + +#undef NONE +#undef ALPHA +#undef DIGIT + +#undef HYPHEN +#undef DIGIT_ZERO +#undef DIGIT_NINE +#undef UPPER_A +#undef UPPER_Z +#undef LOWER_A +#undef LOWER_T +#undef LOWER_U +#undef LOWER_X +#undef LOWER_Z + +#undef SUBTAG_VAR_OR_ASSERT +#undef SUBTAG_VAR_OR_RETURN_NULL +#undef NEXT_TOKEN_OR_ASSERT +#undef NEXT_TOKEN_OR_RETURN_NULL /** * Verifies that the given string is a well-formed BCP 47 language tag @@ -278,53 +689,369 @@ function getDuplicateSingletonRE() { * Spec: ECMAScript Internationalization API Specification, 6.2.2. */ function IsStructurallyValidLanguageTag(locale) { - assert(typeof locale === "string", "IsStructurallyValidLanguageTag"); - var languageTagRE = getLanguageTagRE(); - if (!regexp_test_no_statics(languageTagRE, locale)) - return false; - - // Before checking for duplicate variant or singleton subtags with - // regular expressions, we have to get private use subtag sequences - // out of the picture. 
- if (callFunction(std_String_startsWith, locale, "x-")) - return true; - var pos = callFunction(std_String_indexOf, locale, "-x-"); - if (pos !== -1) - locale = callFunction(String_substring, locale, 0, pos); - - // Check for duplicate variant or singleton subtags. - var duplicateVariantRE = getDuplicateVariantRE(); - var duplicateSingletonRE = getDuplicateSingletonRE(); - return !regexp_test_no_statics(duplicateVariantRE, locale) && - !regexp_test_no_statics(duplicateSingletonRE, locale); + return parseLanguageTag(locale) !== null; } /** - * Joins the array elements in the given range with the supplied separator. + * Canonicalizes the given structurally valid Unicode BCP 47 locale identifier, + * including regularized case of subtags. For example, the language tag + * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where + * + * Zh ; 2*3ALPHA + * -haNS ; ["-" script] + * -bu ; ["-" region] + * -variant2 ; *("-" variant) + * -Variant1 + * -u-ca-chinese ; *("-" extension) + * -t-Zh-laTN + * -x-PRIVATE ; ["-" privateuse] + * + * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private + * + * UTS 35 specifies two different canonicalization algorithms. There's one to + * canonicalize BCP 47 language tags and another one to canonicalize Unicode + * locale identifiers. The latter one wasn't present when ECMA-402 was changed + * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags, so + * ECMA-402 currently only uses the former to canonicalize Unicode BCP 47 locale + * identifiers. + * + * Spec: ECMAScript Internationalization API Specification, 6.2.3. 
+ * Spec: https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers + * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion */ -function ArrayJoinRange(array, separator, from, to = array.length) { - assert(typeof separator === "string", "|separator| is a string value"); - assert(typeof from === "number", "|from| is a number value"); - assert(typeof to === "number", "|to| is a number value"); - assert(0 <= from && from <= to && to <= array.length, "|from| and |to| form a valid range"); +function CanonicalizeLanguageTagObject(localeObj) { + assert(IsObject(localeObj), "CanonicalizeLanguageTagObject"); - if (from === to) - return ""; + // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by + // normalizing the case and ordering all subtags. The canonical syntax form + // itself is specified in UTS 35, 3.2.1. + + // The parser already normalized the case for all subtags. - var result = array[from]; - for (var i = from + 1; i < to; i++) { - result += separator + array[i]; +#ifdef DEBUG + function IsLowerCase(s) { + return s === callFunction(std_String_toLowerCase, s); } - return result; + function IsUpperCase(s) { + return s === callFunction(std_String_toUpperCase, s); + } + function IsTitleCase(s) { + assert(s.length > 0, "unexpected empy string"); + var r = callFunction(std_String_toUpperCase, s[0]) + + callFunction(std_String_toLowerCase, Substring(s, 1, s.length - 1)); + return s === r; + } +#endif + + // 1. Any script subtag is in title case. + assert(localeObj.script === undefined || IsTitleCase(localeObj.script), + "If present, script subtag is in title case"); + + // 2. Any region subtag is in uppercase. + assert(localeObj.region === undefined || IsUpperCase(localeObj.region), + "If present, region subtag is in upper case"); + + // 3. All other subtags are in lowercase. 
+ assert(IsLowerCase(localeObj.language), + "language subtag is in lower case"); + assert(callFunction(ArrayEvery, localeObj.variants, IsLowerCase), + "variant subtags are in lower case"); + assert(callFunction(ArrayEvery, localeObj.extensions, IsLowerCase), + "extension subtags are in lower case"); + assert(localeObj.privateuse === undefined || IsLowerCase(localeObj.privateuse), + "If present, privateuse subtag is in lower case"); + + + // The second step in UTS 35, 3.2.1, is to order all subtags. + + // 1. Any variants are in alphabetical order. + var variants = localeObj.variants; + if (variants.length > 0) { + callFunction(ArraySort, variants); + } + + // 2. Any extensions are in alphabetical order by their singleton. + var extensions = localeObj.extensions; + if (extensions.length > 0) { + // Extension sequences are sorted by their singleton characters. + // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" + callFunction(ArraySort, extensions); + + // The last three bullet points in UTS 35, 3.2.1 apply only to Unicode and Transform + // extensions. + // + // 3. All attributes are sorted in alphabetical order. + // + // 4. All keywords and tfields are sorted by alphabetical order of their + // keys, within their respective extensions. + // + // 5. Any type or tfield value "true" is removed. + + for (var i = 0; i < extensions.length; i++) { + var ext = extensions[i]; + assert(IsLowerCase(ext), + "extension subtags must be in lower-case"); + assert(ext[1] === "-", + "extension subtags start with a singleton"); + + // Canonicalize Unicode locale extension subtag if present. + if (ext[0] === "u") { + var {attributes, keywords} = UnicodeExtensionComponents(ext); + extensions[i] = CanonicalizeUnicodeExtension(attributes, keywords); + } + + // Canonicalize Unicode BCP 47 T extension if present. 
+ if (ext[0] === "t") { + var {locale, fields} = TransformExtensionComponents(ext); + extensions[i] = CanonicalizeTransformExtension(locale, fields); + } + } + } + + // The next two steps in 3.3.1 replace deprecated language and region + // subtags with their preferred mappings. + updateLocaleIdMappings(localeObj); + + // The two final steps in 3.3.1, handling irregular grandfathered and + // private-use only language tags, don't apply, because these two forms + // can't occur in Unicode BCP 47 locale identifiers. +} + +/** + * Intl.Locale proposal + * + * UnicodeExtensionComponents( extension ) + * + * Returns the components of |extension| where |extension| is a "Unicode locale + * extension sequence" (ECMA-402, 6.2.1) without the starting separator + * character. + */ +function UnicodeExtensionComponents(extension) { + assert(typeof extension === "string", "extension is a String value"); + + // Step 1. + var attributes = []; + + // Step 2. + var keywords = []; + + // Step 3. + var isKeyword = false; + + // Step 4. + var size = extension.length; + + // Step 5. + // |extension| starts with "u-" instead of "-u-" in our implementation, so + // we need to initialize |k| with 2 instead of 3. + assert(callFunction(std_String_startsWith, extension, "u-"), + "extension starts with 'u-'"); + var k = 2; + + // Step 6. + var key, value; + while (k < size) { + // Step 6.a. + var e = callFunction(std_String_indexOf, extension, "-", k); + + // Step 6.b. + var len = (e < 0 ? size : e) - k; + + // Step 6.c. + var subtag = Substring(extension, k, len); + + // Steps 6.d-e. + if (!isKeyword) { + // Step 6.d. + // NB: Duplicates are handled elsewhere in our implementation. + if (len !== 2) + _DefineDataProperty(attributes, attributes.length, subtag); + } else { + // Steps 6.e.i-ii. + if (len === 2) { + // Step 6.e.i.1. + // NB: Duplicates are handled elsewhere in our implementation. + _DefineDataProperty(keywords, keywords.length, {key, value}); + } else { + // Step 6.e.ii.1. 
+ if (value !== "") + value += "-"; + + // Step 6.e.ii.2. + value += subtag; + } + } + + // Step 6.f. + if (len === 2) { + // Step 6.f.i. + isKeyword = true; + + // Step 6.f.ii. + key = subtag; + + // Step 6.f.iii. + value = ""; + } + + // Step 6.g. + k += len + 1; + } + + // Step 7. + if (isKeyword) { + // Step 7.a. + // NB: Duplicates are handled elsewhere in our implementation. + _DefineDataProperty(keywords, keywords.length, {key, value}); + } + + // Step 8. + return {attributes, keywords}; +} + +/** + * CanonicalizeUnicodeExtension( attributes, keywords ) + * + * Canonical syntax per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All attributes and keywords are in lowercase. + * - Note: The parser already converted keywords to lowercase. + * - All attributes are sorted in alphabetical order. + * - All keywords are sorted by alphabetical order of their keys. + * - Any type value "true" is removed. + * + * Canonical form: + * - All keys and types use the canonical form (from the name attribute; + * see Section 3.6.4 U Extension Data Files). + */ +function CanonicalizeUnicodeExtension(attributes, keywords) { + assert(attributes.length > 0 || keywords.length > 0, + "unexpected empty Unicode locale extension components"); + + // All attributes are sorted in alphabetical order. + if (attributes.length > 1) + callFunction(ArraySort, attributes); + + // All keywords are sorted by alphabetical order of keys. + if (keywords.length > 1) { + function UnicodeKeySort(left, right) { + var leftKey = left.key; + var rightKey = right.key; + assert(leftKey.length === 2, "left key is a Unicode key"); + assert(rightKey.length === 2, "right key is a Unicode key"); + + // Compare both strings using charCodeAt(), because relational + // string comparison always calls into the VM, whereas charCodeAt + // can be inlined by Ion. 
+ var diff = callFunction(std_String_charCodeAt, leftKey, 0) - + callFunction(std_String_charCodeAt, rightKey, 0); + if (diff === 0) { + diff = callFunction(std_String_charCodeAt, leftKey, 1) - + callFunction(std_String_charCodeAt, rightKey, 1); + } + return diff; + } + + callFunction(ArraySort, keywords, UnicodeKeySort); + } + + var extension = "u"; + + // Append all attributes. + for (var i = 0; i < attributes.length; i++) { + extension += "-" + attributes[i]; + } + + // Append all keywords. + for (var i = 0; i < keywords.length; i++) { + var {key, value} = keywords[i]; + extension += "-" + key; + + // Type value "true" is removed. + if (value !== "" && value !== "true") + extension += "-" + value; + } + + return extension; +} + +/** + * CanonicalizeTransformExtension + * + * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: + * + * - These subtags are all in lowercase (that is the canonical casing for these + * subtags), [...]. + * + * And per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All keywords and tfields are sorted by alphabetical order of their keys, + * within their respective extensions. + */ +function CanonicalizeTransformExtension(localeObj, fields) { + assert(localeObj !== undefined || fields.length > 0, + "unexpected empty Transform locale extension components"); + + if (fields.length > 0) { + function TransformKeySort(left, right) { + var leftKey = left.key; + var rightKey = right.key; + assert(leftKey.length === 2, "left key is a Transform key"); + assert(rightKey.length === 2, "right key is a Transform key"); + + // Compare both strings using charCodeAt(), because relational + // string comparison always calls into the VM, whereas charCodeAt + // can be inlined by Ion. 
+ var diff = callFunction(std_String_charCodeAt, leftKey, 0) - + callFunction(std_String_charCodeAt, rightKey, 0); + if (diff === 0) { + diff = callFunction(std_String_charCodeAt, leftKey, 1) - + callFunction(std_String_charCodeAt, rightKey, 1); + } + return diff; + } + + callFunction(ArraySort, fields, TransformKeySort); + } + + var extension = "t"; + + // Append the language subtag if present. + if (localeObj !== undefined) { + // [1] is a bit unclear whether or not the `tlang` subtag also needs + // to be canonicalized (and case-adjusted). For now simply append it as + // is and change it to all lower-case. If we switch to [2], the `tlang` + // subtag also needs to be canonicalized according to the same rules as + // `unicode_language_id` subtags are canonicalized. Also see [3]. + // + // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier + // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers + // [3] https://github.com/tc39/ecma402/issues/330 + var localeStr = StringFromLanguageTagObject(localeObj); + extension += "-" + callFunction(std_String_toLowerCase, localeStr); + } + + // Append all fields. + for (var i = 0; i < fields.length; i++) { + // UTS 35, 3.2.1 specifies: + // - Any type or tfield value "true" is removed. + // + // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so + // ignore this apparently invalid part of the UTS 35 specification and + // simply append all `tfield` subtags. + var {key, value} = fields[i]; + extension += "-" + key + "-" + value; + } + + return extension; } /** * Canonicalizes the given structurally valid BCP 47 language tag, including * regularized case of subtags. 
For example, the language tag - * Zh-NAN-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where + * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where * * Zh ; 2*3ALPHA - * -NAN ; ["-" extlang] * -haNS ; ["-" script] * -bu ; ["-" region] * -variant2 ; *("-" variant) @@ -333,120 +1060,54 @@ function ArrayJoinRange(array, separator, from, to = array.length) { * -t-Zh-laTN * -x-PRIVATE ; ["-" privateuse] * - * becomes nan-Hans-mm-variant2-variant1-t-zh-latn-u-ca-chinese-x-private + * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private * * Spec: ECMAScript Internationalization API Specification, 6.2.3. - * Spec: RFC 5646, section 4.5. */ function CanonicalizeLanguageTag(locale) { - assert(IsStructurallyValidLanguageTag(locale), "CanonicalizeLanguageTag"); + var localeObj = parseLanguageTag(locale); + assert(localeObj !== null, "CanonicalizeLanguageTag"); - // The input - // "Zh-NAN-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE" - // will be used throughout this method to illustrate how it works. + CanonicalizeLanguageTagObject(localeObj); - // Language tags are compared and processed case-insensitively, so - // technically it's not necessary to adjust case. But for easier processing, - // and because the canonical form for most subtags is lower case, we start - // with lower case for all. - // "Zh-NAN-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE" -> - // "zh-nan-hans-bu-variant2-variant1-u-ca-chinese-t-zh-latn-x-private" - locale = callFunction(std_String_toLowerCase, locale); - - // Handle mappings for complete tags. - if (hasOwn(locale, langTagMappings)) - return langTagMappings[locale]; - - var subtags = StringSplitString(ToString(locale), "-"); - var i = 0; - - // Handle the standard part: All subtags before the first singleton or "x". 
- // "zh-nan-hans-bu-variant2-variant1" - while (i < subtags.length) { - var subtag = subtags[i]; - - // If we reach the start of an extension sequence or private use part, - // we're done with this loop. We have to check for i > 0 because for - // irregular language tags, such as i-klingon, the single-character - // subtag "i" is not the start of an extension sequence. - // In the example, we break at "u". - if (subtag.length === 1 && (i > 0 || subtag === "x")) - break; + return StringFromLanguageTagObject(localeObj); +} - if (i !== 0) { - if (subtag.length === 4) { - // 4-character subtags that are not in initial position are - // script codes; their first character needs to be capitalized. - // "hans" -> "Hans" - subtag = callFunction(std_String_toUpperCase, subtag[0]) + - callFunction(String_substring, subtag, 1); - } else if (subtag.length === 2) { - // 2-character subtags that are not in initial position are - // region codes; they need to be upper case. "bu" -> "BU" - subtag = callFunction(std_String_toUpperCase, subtag); - } - } - if (hasOwn(subtag, langSubtagMappings)) { - // Replace deprecated subtags with their preferred values. - // "BU" -> "MM" - // This has to come after we capitalize region codes because - // otherwise some language and region codes could be confused. - // For example, "in" is an obsolete language code for Indonesian, - // but "IN" is the country code for India. - // Note that the script generating langSubtagMappings makes sure - // that no regular subtag mapping will replace an extlang code. - subtag = langSubtagMappings[subtag]; - } else if (hasOwn(subtag, extlangMappings)) { - // Replace deprecated extlang subtags with their preferred values, - // and remove the preceding subtag if it's a redundant prefix. - // "zh-nan" -> "nan" - // Note that the script generating extlangMappings makes sure that - // no extlang mapping will replace a normal language code. 
- subtag = extlangMappings[subtag].preferred; - if (i === 1 && extlangMappings[subtag].prefix === subtags[0]) { - callFunction(std_Array_shift, subtags); - i--; - } - } - subtags[i] = subtag; - i++; - } - var normal = ArrayJoinRange(subtags, "-", 0, i); - - // Extension sequences are sorted by their singleton characters. - // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" - var extensions = new List(); - while (i < subtags.length && subtags[i] !== "x") { - var extensionStart = i; - i++; - while (i < subtags.length && subtags[i].length > 1) - i++; - var extension = ArrayJoinRange(subtags, "-", extensionStart, i); - callFunction(std_Array_push, extensions, extension); - } - callFunction(std_Array_sort, extensions); +/** + * Returns the string representation of the given language tag object. + */ +function StringFromLanguageTagObject(localeObj) { + assert(IsObject(localeObj), "StringFromLanguageTagObject"); + + var { + language, + script, + region, + variants, + extensions, + privateuse, + } = localeObj; - // Private use sequences are left as is. "x-private" - var privateUse = ""; - if (i < subtags.length) - privateUse = ArrayJoinRange(subtags, "-", i); + var canonical = language; + + if (script !== undefined) + canonical += "-" + script; + + if (region !== undefined) + canonical += "-" + region; + + if (variants.length > 0) + canonical += "-" + callFunction(std_Array_join, variants, "-"); - // Put everything back together. - var canonical = normal; if (extensions.length > 0) canonical += "-" + callFunction(std_Array_join, extensions, "-"); - if (privateUse.length > 0) { - // Be careful of a Language-Tag that is entirely privateuse. - if (canonical.length > 0) - canonical += "-" + privateUse; - else - canonical = privateUse; - } + + if (privateuse !== undefined) + canonical += "-" + privateuse; return canonical; } - /** * Returns true if the input contains only ASCII alphabetical characters. 
*/ @@ -469,13 +1130,11 @@ function ValidateAndCanonicalizeLanguageTag(locale) { assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag"); // Handle the common case (a standalone language) first. - // Only the following BCP47 subset is accepted: - // Language-Tag = langtag - // langtag = language - // language = 2*3ALPHA ; shortest ISO 639 code - // For three character long strings we need to make sure it's not a - // private use only language tag, for example "x-x". - if (locale.length === 2 || (locale.length === 3 && locale[1] !== "-")) { + // Only the following Unicode BCP 47 locale identifier subset is accepted: + // unicode_locale_id = unicode_language_id + // unicode_language_id = unicode_language_subtag + // unicode_language_subtag = alpha{2,3} + if (locale.length === 2 || locale.length === 3) { if (!IsASCIIAlphaString(locale)) ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale); assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag"); @@ -483,42 +1142,27 @@ function ValidateAndCanonicalizeLanguageTag(locale) { // The language subtag is canonicalized to lower case. locale = callFunction(std_String_toLowerCase, locale); - // langTagMappings doesn't contain any 2*3ALPHA keys, so we don't need - // to check for possible replacements in this map. - assert(!callFunction(std_Object_hasOwnProperty, langTagMappings, locale), - "langTagMappings contains no 2*3ALPHA mappings"); - - // Replace deprecated subtags with their preferred values. - locale = callFunction(std_Object_hasOwnProperty, langSubtagMappings, locale) - ? langSubtagMappings[locale] - : locale; - assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization"); + // updateLocaleIdMappings may modify tags containing only |language| + // subtags, if the language is in |complexLanguageMappings|, so we need + // to handle that case first. 
+ if (!hasOwn(locale, complexLanguageMappings)) { + // Replace deprecated subtags with their preferred values. + locale = hasOwn(locale, languageMappings) + ? languageMappings[locale] + : locale; + assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization"); - return locale; + return locale; + } } - if (!IsStructurallyValidLanguageTag(locale)) + var localeObj = parseLanguageTag(locale); + if (localeObj === null) ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale); - return CanonicalizeLanguageTag(locale); -} - - -function localeContainsNoUnicodeExtensions(locale) { - // No "-u-", no possible Unicode extension. - if (callFunction(std_String_indexOf, locale, "-u-") === -1) - return true; - - // "-u-" within privateuse also isn't one. - if (callFunction(std_String_indexOf, locale, "-u-") > callFunction(std_String_indexOf, locale, "-x-")) - return true; - - // An entirely-privateuse tag doesn't contain extensions. - if (callFunction(std_String_startsWith, locale, "x-")) - return true; + CanonicalizeLanguageTagObject(localeObj); - // Otherwise, we have a Unicode extension sequence. - return false; + return StringFromLanguageTagObject(localeObj); } @@ -571,11 +1215,13 @@ function DefaultLocaleIgnoringAvailableLocales() { // If we didn't get a cache hit, compute the candidate default locale and // cache it. Fall back on the last-ditch locale when necessary. 
- var candidate; - if (!IsStructurallyValidLanguageTag(runtimeDefaultLocale)) { + var candidate = parseLanguageTag(runtimeDefaultLocale); + if (candidate === null) { candidate = lastDitchLocale(); } else { - candidate = CanonicalizeLanguageTag(runtimeDefaultLocale); + CanonicalizeLanguageTagObject(candidate); + + candidate = StringFromLanguageTagObject(candidate); // The default locale must be in [[availableLocales]], and that list // must not contain any locales with Unicode extension sequences, so @@ -592,7 +1238,7 @@ function DefaultLocaleIgnoringAvailableLocales() { assert(IsStructurallyValidLanguageTag(candidate), "the candidate must be structurally valid"); - assert(localeContainsNoUnicodeExtensions(candidate), + assert(startOfUnicodeExtensions(candidate) < 0, "the candidate must not contain a Unicode extension sequence"); return candidate; @@ -633,7 +1279,7 @@ function DefaultLocale() { "the computed default locale must be structurally valid"); assert(locale === CanonicalizeLanguageTag(locale), "the computed default locale must be canonical"); - assert(localeContainsNoUnicodeExtensions(locale), + assert(startOfUnicodeExtensions(locale) < 0, "the computed default locale must not contain a Unicode extension sequence"); localeCache.defaultLocale = locale; @@ -674,30 +1320,53 @@ function addSpecialMissingLanguageTags(availableLocales) { * Spec: ECMAScript Internationalization API Specification, 9.2.1. */ function CanonicalizeLocaleList(locales) { + // Step 1. if (locales === undefined) - return new List(); - var seen = new List(); + return []; + + // Step 3 (and the remaining steps). if (typeof locales === "string") - locales = [locales]; + return [ValidateAndCanonicalizeLanguageTag(locales)]; + + // Step 2. + var seen = []; + + // Step 4. var O = ToObject(locales); + + // Step 5. var len = ToLength(O.length); + + // Step 6. var k = 0; + + // Step 7. while (k < len) { - // Don't call ToString(k) - SpiderMonkey is faster with integers. 
- var kPresent = HasProperty(O, k); - if (kPresent) { + // Steps 7.a-c. + if (k in O) { + // Step 7.c.i. var kValue = O[k]; + + // Step 7.c.ii. if (!(typeof kValue === "string" || IsObject(kValue))) ThrowTypeError(JSMSG_INVALID_LOCALES_ELEMENT); + + // Step 7.c.iii. var tag = ToString(kValue); - if (!IsStructurallyValidLanguageTag(tag)) - ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, tag); - tag = CanonicalizeLanguageTag(tag); + + // Step 7.c.iv. + tag = ValidateAndCanonicalizeLanguageTag(tag); + + // Step 7.c.v. if (callFunction(ArrayIndexOf, seen, tag) === -1) - callFunction(std_Array_push, seen, tag); + _DefineDataProperty(seen, seen.length, tag); } + + // Step 7.d. k++; } + + // Step 8. return seen; } @@ -705,7 +1374,7 @@ function CanonicalizeLocaleList(locales) { function BestAvailableLocaleHelper(availableLocales, locale, considerDefaultLocale) { assert(IsStructurallyValidLanguageTag(locale), "invalid BestAvailableLocale locale structure"); assert(locale === CanonicalizeLanguageTag(locale), "non-canonical BestAvailableLocale locale"); - assert(localeContainsNoUnicodeExtensions(locale), "locale must contain no Unicode extensions"); + assert(startOfUnicodeExtensions(locale) < 0, "locale must contain no Unicode extensions"); // In the spec, [[availableLocales]] is formally a list of all available // locales. But in our implementation, it's an *incomplete* list, not @@ -780,28 +1449,37 @@ function BestAvailableLocaleIgnoringDefault(availableLocales, locale) { * Spec: RFC 4647, section 3.4. */ function LookupMatcher(availableLocales, requestedLocales) { - var i = 0; - var len = requestedLocales.length; - var availableLocale; - var locale, noExtensionsLocale; - while (i < len && availableLocale === undefined) { - locale = requestedLocales[i]; - noExtensionsLocale = removeUnicodeExtensions(locale); - availableLocale = BestAvailableLocale(availableLocales, noExtensionsLocale); - i++; - } - + // Step 1. 
var result = new Record(); - if (availableLocale !== undefined) { - result.locale = availableLocale; - if (locale !== noExtensionsLocale) { - var unicodeLocaleExtensionSequenceRE = getUnicodeLocaleExtensionSequenceRE(); - var extensionMatch = regexp_exec_no_statics(unicodeLocaleExtensionSequenceRE, locale); - result.extension = extensionMatch[0]; + + // Step 2. + for (var i = 0; i < requestedLocales.length; i++) { + var locale = requestedLocales[i]; + + // Step 2.a. + var noExtensionsLocale = removeUnicodeExtensions(locale); + + // Step 2.b. + var availableLocale = BestAvailableLocale(availableLocales, noExtensionsLocale); + + // Step 2.c. + if (availableLocale !== undefined) { + // Step 2.c.i. + result.locale = availableLocale; + + // Step 2.c.ii. + if (locale !== noExtensionsLocale) + result.extension = getUnicodeExtensions(locale); + + // Step 2.c.iii. + return result; } - } else { - result.locale = DefaultLocale(); } + + // Steps 3-4. + result.locale = DefaultLocale(); + + // Step 5. return result; } @@ -823,73 +1501,73 @@ function BestFitMatcher(availableLocales, requestedLocales) { /** * Returns the Unicode extension value subtags for the requested key subtag. * - * NOTE: PR to add UnicodeExtensionValue to ECMA-402 isn't yet written. + * Spec: ECMAScript Internationalization API Specification, 9.2.5. 
*/ function UnicodeExtensionValue(extension, key) { assert(typeof extension === "string", "extension is a string value"); - assert(function() { - var unicodeLocaleExtensionSequenceRE = getUnicodeLocaleExtensionSequenceRE(); - var extensionMatch = regexp_exec_no_statics(unicodeLocaleExtensionSequenceRE, extension); - return extensionMatch !== null && extensionMatch[0] === extension; - }(), "extension is a Unicode extension subtag"); + assert(callFunction(std_String_startsWith, extension, "-u-") && + getUnicodeExtensions("und" + extension) === extension, + "extension is a Unicode extension subtag"); assert(typeof key === "string", "key is a string value"); - assert(key.length === 2, "key is a Unicode extension key subtag"); // Step 1. - var size = extension.length; + assert(key.length === 2, "key is a Unicode extension key subtag"); // Step 2. - var searchValue = "-" + key + "-"; + var size = extension.length; // Step 3. - var pos = callFunction(std_String_indexOf, extension, searchValue); + var searchValue = "-" + key + "-"; // Step 4. + var pos = callFunction(std_String_indexOf, extension, searchValue); + + // Step 5. if (pos !== -1) { - // Step 4.a. + // Step 5.a. var start = pos + 4; - // Step 4.b. + // Step 5.b. var end = start; - // Step 4.c. + // Step 5.c. var k = start; - // Steps 4.d-e. + // Steps 5.d-e. while (true) { - // Step 4.e.i. + // Step 5.e.i. var e = callFunction(std_String_indexOf, extension, "-", k); - // Step 4.e.ii. + // Step 5.e.ii. var len = e === -1 ? size - k : e - k; - // Step 4.e.iii. + // Step 5.e.iii. if (len === 2) break; - // Step 4.e.iv. + // Step 5.e.iv. if (e === -1) { end = size; break; } - // Step 4.e.v. + // Step 5.e.v. end = e; k = e + 1; } - // Step 4.f. + // Step 5.f. return callFunction(String_substring, extension, start, end); } - // Step 5. + // Step 6. searchValue = "-" + key; - // Steps 6-7. + // Steps 7-8. if (callFunction(std_String_endsWith, extension, searchValue)) return ""; - // Step 8 (implicit). 
+ // Step 9 (implicit). } /** @@ -899,11 +1577,9 @@ function UnicodeExtensionValue(extension, key) { * caller's relevant extensions and locale data as well as client-provided * options into consideration. * - * Spec: ECMAScript Internationalization API Specification, 9.2.5. + * Spec: ECMAScript Internationalization API Specification, 9.2.6. */ function ResolveLocale(availableLocales, requestedLocales, options, relevantExtensionKeys, localeData) { - /*jshint laxbreak: true */ - // Steps 1-3. var matcher = options.localeMatcher; var r = (matcher === "lookup") @@ -912,79 +1588,82 @@ function ResolveLocale(availableLocales, requestedLocales, options, relevantExte // Step 4. var foundLocale = r.locale; - - // Step 5 (Not applicable in this implementation). var extension = r.extension; - // Steps 6-7. + // Step 5. var result = new Record(); + + // Step 6. result.dataLocale = foundLocale; - // Step 8. + // Step 7. var supportedExtension = "-u"; // In this implementation, localeData is a function, not an object. var localeDataProvider = localeData(); - // Steps 9-12. + // Step 8. for (var i = 0; i < relevantExtensionKeys.length; i++) { - // Steps 12.a-c. var key = relevantExtensionKeys[i]; - // Steps 12.b-d (The locale data is only computed when needed). + // Steps 8.a-h (The locale data is only computed when needed). var keyLocaleData = undefined; var value = undefined; // Locale tag may override. - // Step 12.e. + // Step 8.g. var supportedExtensionAddition = ""; - // Step 12.f. + // Step 8.h. if (extension !== undefined) { - // NB: The step annotations don't yet match the ES2017 Intl draft, - // 94045d234762ad107a3d09bb6f7381a65f1a2f9b, because the PR to add - // the new UnicodeExtensionValue abstract operation still needs to - // be written. - - // Step 12.f.i. + // Step 8.h.i. var requestedValue = UnicodeExtensionValue(extension, key); - // Step 12.f.ii. + // Step 8.h.ii. if (requestedValue !== undefined) { - // Steps 12.b-c. + // Steps 8.a-d. 
keyLocaleData = callFunction(localeDataProvider[key], null, foundLocale); - // Step 12.f.ii.1. + // Step 8.h.ii.1. if (requestedValue !== "") { - // Step 12.f.ii.1.a. + // Step 8.h.ii.1.a. if (callFunction(ArrayIndexOf, keyLocaleData, requestedValue) !== -1) { value = requestedValue; supportedExtensionAddition = "-" + key + "-" + value; } } else { - // Step 12.f.ii.2. + // Step 8.h.ii.2. // According to the LDML spec, if there's no type value, // and true is an allowed value, it's used. - if (callFunction(ArrayIndexOf, keyLocaleData, "true") !== -1) + if (callFunction(ArrayIndexOf, keyLocaleData, "true") !== -1) { value = "true"; + supportedExtensionAddition = "-" + key; + } } } } // Options override all. - // Step 12.g.i. + // Step 8.i.i. var optionsValue = options[key]; - // Step 12.g, 12.gg.ii. + // Step 8.i.ii. + assert(typeof optionsValue === "string" || + optionsValue === undefined || + optionsValue === null, + "unexpected type for options value"); + + // Steps 8.i, 8.i.iii.1. if (optionsValue !== undefined && optionsValue !== value) { - // Steps 12.b-c. + // Steps 8.a-d. if (keyLocaleData === undefined) keyLocaleData = callFunction(localeDataProvider[key], null, foundLocale); + // Step 8.i.iii. if (callFunction(ArrayIndexOf, keyLocaleData, optionsValue) !== -1) { value = optionsValue; supportedExtensionAddition = ""; @@ -993,27 +1672,29 @@ function ResolveLocale(availableLocales, requestedLocales, options, relevantExte // Locale data provides default value. if (value === undefined) { - // Steps 12.b-d. + // Steps 8.a-f. value = keyLocaleData === undefined ? callFunction(localeDataProvider.default[key], null, foundLocale) : keyLocaleData[0]; } - // Steps 12.h-j. + // Step 8.j. assert(typeof value === "string" || value === null, "unexpected locale data value"); result[key] = value; + + // Step 8.k. supportedExtension += supportedExtensionAddition; } - // Step 13. + // Step 9. 
if (supportedExtension.length > 2) { assert(!callFunction(std_String_startsWith, foundLocale, "x-"), "unexpected privateuse-only locale returned from ICU"); - // Step 13.a. + // Step 9.a. var privateIndex = callFunction(std_String_indexOf, foundLocale, "-x-"); - // Steps 13.b-c. + // Steps 9.b-c. if (privateIndex === -1) { foundLocale += supportedExtension; } else { @@ -1022,19 +1703,19 @@ function ResolveLocale(availableLocales, requestedLocales, options, relevantExte foundLocale = preExtension + supportedExtension + postExtension; } - // Step 13.d. + // Step 9.d. assert(IsStructurallyValidLanguageTag(foundLocale), "invalid locale after concatenation"); - // Step 13.e (Not required in this implementation, because we don't + // Step 9.e (Not required in this implementation, because we don't // canonicalize Unicode extension subtags). assert(foundLocale === CanonicalizeLanguageTag(foundLocale), "same locale with extension"); } - // Step 14. + // Step 10. result.locale = foundLocale; - // Step 15. + // Step 11. return result; } @@ -1044,31 +1725,29 @@ function ResolveLocale(availableLocales, requestedLocales, options, relevantExte * matching (possibly fallback) locale. Locales appear in the same order in the * returned list as in the input list. * - * Spec: ECMAScript Internationalization API Specification, 9.2.6. + * Spec: ECMAScript Internationalization API Specification, 9.2.7. */ function LookupSupportedLocales(availableLocales, requestedLocales) { - // Steps 1-2. - var len = requestedLocales.length; - var subset = new List(); + // Step 1. + var subset = []; - // Steps 3-4. - var k = 0; - while (k < len) { - // Steps 4.a-b. - var locale = requestedLocales[k]; + // Step 2. + for (var i = 0; i < requestedLocales.length; i++) { + var locale = requestedLocales[i]; + + // Step 2.a. var noExtensionsLocale = removeUnicodeExtensions(locale); - // Step 4.c-d. + // Step 2.b. 
var availableLocale = BestAvailableLocale(availableLocales, noExtensionsLocale); - if (availableLocale !== undefined) - callFunction(std_Array_push, subset, locale); - // Step 4.e. - k++; + // Step 2.c. + if (availableLocale !== undefined) + _DefineDataProperty(subset, subset.length, locale); } - // Steps 5-6. - return callFunction(std_Array_slice, subset, 0); + // Step 3. + return subset; } @@ -1077,7 +1756,7 @@ function LookupSupportedLocales(availableLocales, requestedLocales) { * matching (possibly fallback) locale. Locales appear in the same order in the * returned list as in the input list. * - * Spec: ECMAScript Internationalization API Specification, 9.2.7. + * Spec: ECMAScript Internationalization API Specification, 9.2.8. */ function BestFitSupportedLocales(availableLocales, requestedLocales) { // don't have anything better @@ -1090,19 +1769,17 @@ function BestFitSupportedLocales(availableLocales, requestedLocales) { * matching (possibly fallback) locale. Locales appear in the same order in the * returned list as in the input list. * - * Spec: ECMAScript Internationalization API Specification, 9.2.8. + * Spec: ECMAScript Internationalization API Specification, 9.2.9. */ function SupportedLocales(availableLocales, requestedLocales, options) { - /*jshint laxbreak: true */ - // Step 1. var matcher; if (options !== undefined) { - // Steps 1.a-b. + // Step 1.a. options = ToObject(options); - matcher = options.localeMatcher; - // Step 1.c. + // Step 1.b + matcher = options.localeMatcher; if (matcher !== undefined) { matcher = ToString(matcher); if (matcher !== "lookup" && matcher !== "best fit") @@ -1110,12 +1787,12 @@ function SupportedLocales(availableLocales, requestedLocales, options) { } } - // Steps 2-3. + // Steps 2-5. var subset = (matcher === undefined || matcher === "best fit") ? BestFitSupportedLocales(availableLocales, requestedLocales) : LookupSupportedLocales(availableLocales, requestedLocales); - // Step 4. + // Steps 6-7. 
for (var i = 0; i < subset.length; i++) { _DefineDataProperty(subset, i, subset[i], ATTR_ENUMERABLE | ATTR_NONCONFIGURABLE | ATTR_NONWRITABLE); @@ -1123,7 +1800,7 @@ function SupportedLocales(availableLocales, requestedLocales, options) { _DefineDataProperty(subset, "length", subset.length, ATTR_NONENUMERABLE | ATTR_NONCONFIGURABLE | ATTR_NONWRITABLE); - // Step 5. + // Step 8. return subset; } @@ -1133,7 +1810,7 @@ function SupportedLocales(availableLocales, requestedLocales, options) { * the required type, checks whether it is one of a list of allowed values, * and fills in a fallback value if necessary. * - * Spec: ECMAScript Internationalization API Specification, 9.2.9. + * Spec: ECMAScript Internationalization API Specification, 9.2.10. */ function GetOption(options, property, type, values, fallback) { // Step 1. diff --git a/js/src/builtin/intl/DateTimeFormat.js b/js/src/builtin/intl/DateTimeFormat.js index 4de3c084f2..a4feb50aa6 100644 --- a/js/src/builtin/intl/DateTimeFormat.js +++ b/js/src/builtin/intl/DateTimeFormat.js @@ -53,9 +53,10 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) { // never a subset of them.
var internalProps = std_Object_create(null);
+
+ var DateTimeFormat = dateTimeFormatInternalProperties;
// Compute effective locale.
- var DateTimeFormat = dateTimeFormatInternalProperties;
// Step 10.
var localeData = DateTimeFormat.localeData;
@@ -73,7 +74,7 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) { internalProps.numberingSystem = r.nu;
// Compute formatting options.
- // Step 16.
+ // Step 14.
var dataLocale = r.dataLocale;
// Steps 20.
@@ -119,8 +120,6 @@ function resolveDateTimeFormatInternals(lazyDateTimeFormatData) { // Step 31.
internalProps.pattern = pattern;
- internalProps.boundFormat = undefined;
-
// The caller is responsible for associating |internalProps| with the right
// object using |setInternalProperties|.
return internalProps;
@@ -297,23 +296,25 @@ function DefaultTimeZone() { /**
- * UnwrapDateTimeFormat(dtf)
+ * 12.1.10 UnwrapDateTimeFormat( dtf )
*/
function UnwrapDateTimeFormat(dtf, methodName) {
- // Step 1.
+ // Step 1 (not applicable in our implementation).
+
+ // Step 2.
if ((!IsObject(dtf) || !IsDateTimeFormat(dtf)) &&
dtf instanceof GetDateTimeFormatConstructor())
{
dtf = dtf[intlFallbackSymbol()];
}
- // Step 2.
+ // Step 3.
if (!IsObject(dtf) || !IsDateTimeFormat(dtf)) {
ThrowTypeError(JSMSG_INTL_OBJECT_NOT_INITED, "DateTimeFormat", methodName,
"DateTimeFormat");
}
- // Step 3.
+ // Step 4.
return dtf;
}
@@ -334,9 +335,6 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m assert(IsDateTimeFormat(dateTimeFormat),
"InitializeDateTimeFormat called with non-DateTimeFormat");
- // Steps 1-2 (These steps are no longer required and should be removed
- // from the spec; https://github.com/tc39/ecma402/issues/115).
-
// Lazy DateTimeFormat data has the following structure:
//
// {
@@ -471,6 +469,8 @@ function InitializeDateTimeFormat(dateTimeFormat, thisValue, locales, options, m initializeIntlObject(dateTimeFormat, "DateTimeFormat", lazyDateTimeFormatData);
// 12.2.1, steps 4-5.
+ // TODO: spec issue - The current spec doesn't have the IsObject check,
+ // which means |Intl.DateTimeFormat.call(null)| is supposed to throw here.
if (dateTimeFormat !== thisValue && thisValue instanceof GetDateTimeFormatConstructor()) {
if (!IsObject(thisValue))
ThrowTypeError(JSMSG_NOT_NONNULL_OBJECT, typeof thisValue);
@@ -687,17 +687,19 @@ function ToDateTimeOptions(options, required, defaults) { assert(typeof required === "string", "ToDateTimeOptions");
assert(typeof defaults === "string", "ToDateTimeOptions");
- // Steps 1-3.
+ // Steps 1-2.
if (options === undefined)
options = null;
else
options = ToObject(options);
options = std_Object_create(options);
- // Step 4.
+ // Step 3.
var needDefaults = true;
- // Step 5.
+ // Step 4.
+ // TODO: spec issue - The spec requires retrieving all options, so using
+ // the ||-operator with its lazy evaluation semantics is incorrect.
if ((required === "date" || required === "any") &&
(options.weekday !== undefined || options.year !== undefined ||
options.month !== undefined || options.day !== undefined))
@@ -705,7 +707,9 @@ function ToDateTimeOptions(options, required, defaults) { needDefaults = false;
}
- // Step 6.
+ // Step 5.
+ // TODO: spec issue - The spec requires retrieving all options, so using
+ // the ||-operator with its lazy evaluation semantics is incorrect.
if ((required === "time" || required === "any") &&
(options.hour !== undefined || options.minute !== undefined ||
options.second !== undefined))
@@ -713,7 +717,7 @@ function ToDateTimeOptions(options, required, defaults) { needDefaults = false;
}
- // Step 7.
+ // Step 6.
if (needDefaults && (defaults === "date" || defaults === "all")) {
// The specification says to call [[DefineOwnProperty]] with false for
// the Throw parameter, while Object.defineProperty uses true. For the
@@ -724,7 +728,7 @@ function ToDateTimeOptions(options, required, defaults) { _DefineDataProperty(options, "day", "numeric");
}
- // Step 8.
+ // Step 7.
if (needDefaults && (defaults === "time" || defaults === "all")) {
- // See comment for step 7.
+ // See comment for step 6.
_DefineDataProperty(options, "hour", "numeric");
@@ -732,7 +736,7 @@ function ToDateTimeOptions(options, required, defaults) { _DefineDataProperty(options, "second", "numeric");
}
- // Step 9.
+ // Step 8.
return options;
}
@@ -842,14 +846,19 @@ function BestFitFormatMatcher(options, formats) { * matching (possibly fallback) locale. Locales appear in the same order in the
* returned list as in the input list.
*
- * Spec: ECMAScript Internationalization API Specification, 12.2.2.
+ * Spec: ECMAScript Internationalization API Specification, 12.3.2.
*/
function Intl_DateTimeFormat_supportedLocalesOf(locales /*, options*/) {
var options = arguments.length > 1 ? arguments[1] : undefined;
+ // Step 1.
var availableLocales = callFunction(dateTimeFormatInternalProperties.availableLocales,
dateTimeFormatInternalProperties);
+
+ // Step 2.
var requestedLocales = CanonicalizeLocaleList(locales);
+
+ // Step 3.
return SupportedLocales(availableLocales, requestedLocales, options);
}
@@ -857,7 +866,7 @@ function Intl_DateTimeFormat_supportedLocalesOf(locales /*, options*/) { /**
* DateTimeFormat internal properties.
*
- * Spec: ECMAScript Internationalization API Specification, 9.1 and 12.2.3.
+ * Spec: ECMAScript Internationalization API Specification, 9.1 and 12.3.3.
*/
var dateTimeFormatInternalProperties = {
localeData: dateTimeFormatLocaleData,
@@ -897,7 +906,7 @@ function dateTimeFormatLocaleData() { /**
* Function to be bound and returned by Intl.DateTimeFormat.prototype.format.
*
- * Spec: ECMAScript Internationalization API Specification, 12.3.2.
+ * Spec: ECMAScript Internationalization API Specification, 12.1.5.
*/
function dateTimeFormatFormatToBind() {
// Steps 1.a.i-ii
@@ -913,7 +922,7 @@ function dateTimeFormatFormatToBind() { * representing the result of calling ToNumber(date) according to the
* effective locale and the formatting options of this DateTimeFormat.
*
- * Spec: ECMAScript Internationalization API Specification, 12.3.2.
+ * Spec: ECMAScript Internationalization API Specification, 12.4.3.
*/
function Intl_DateTimeFormat_format_get() {
// Steps 1-3.
@@ -923,12 +932,11 @@ function Intl_DateTimeFormat_format_get() { // Step 4.
if (internals.boundFormat === undefined) {
- // Step 4.a.
- var F = dateTimeFormatFormatToBind;
+ // Steps 4.a-b.
+ var F = callFunction(FunctionBind, dateTimeFormatFormatToBind, dtf);
- // Steps 4.b-d.
- var bf = callFunction(FunctionBind, F, dtf);
- internals.boundFormat = bf;
+ // Step 4.c.
+ internals.boundFormat = F;
}
// Step 5.
@@ -937,6 +945,11 @@ function Intl_DateTimeFormat_format_get() { _SetCanonicalName(Intl_DateTimeFormat_format_get, "get format");
+/**
+ * Intl.DateTimeFormat.prototype.formatToParts ( date )
+ *
+ * Spec: ECMAScript Internationalization API Specification, 12.4.4.
+ */
function Intl_DateTimeFormat_formatToParts() {
// Steps 1-3.
var dtf = UnwrapDateTimeFormat(this, "formatToParts");
@@ -956,14 +969,15 @@ function Intl_DateTimeFormat_formatToParts() { /**
* Returns the resolved options for a DateTimeFormat object.
*
- * Spec: ECMAScript Internationalization API Specification, 12.3.3 and 12.4.
+ * Spec: ECMAScript Internationalization API Specification, 12.4.5.
*/
function Intl_DateTimeFormat_resolvedOptions() {
- // Invoke |UnwrapDateTimeFormat| per introduction of section 12.3.
+ // Steps 1-3.
var dtf = UnwrapDateTimeFormat(this, "resolvedOptions");
var internals = getDateTimeFormatInternals(dtf);
+ // Steps 4-5.
var result = {
locale: internals.locale,
calendar: internals.calendar,
@@ -981,6 +995,8 @@ function Intl_DateTimeFormat_resolvedOptions() { }
resolveICUPattern(internals.pattern, result);
+
+ // Step 6.
return result;
}
diff --git a/js/src/builtin/intl/LangTagMappingsGenerated.js b/js/src/builtin/intl/LangTagMappingsGenerated.js index 269cf9f93a..83a8ff8f60 100644 --- a/js/src/builtin/intl/LangTagMappingsGenerated.js +++ b/js/src/builtin/intl/LangTagMappingsGenerated.js @@ -1,382 +1,1246 @@ // Generated by make_intl_data.py. DO NOT EDIT. -// Mappings from complete tags to preferred values. -// Derived from IANA Language Subtag Registry, file date 2016-10-12. -// http://www.iana.org/assignments/language-subtag-registry -var langTagMappings = { +// Mappings from grandfathered tags to preferred values. +// Derived from CLDR Supplemental Data, version 36.1. +// https://github.com/unicode-org/cldr.git +var grandfatheredMappings = { "art-lojban": "jbo", - "cel-gaulish": "cel-gaulish", - "en-gb-oed": "en-GB-oxendict", - "i-ami": "ami", - "i-bnn": "bnn", - "i-default": "i-default", - "i-enochian": "i-enochian", - "i-hak": "hak", - "i-klingon": "tlh", - "i-lux": "lb", - "i-mingo": "i-mingo", - "i-navajo": "nv", - "i-pwn": "pwn", - "i-tao": "tao", - "i-tay": "tay", - "i-tsu": "tsu", - "ja-latn-hepburn-heploc": "ja-Latn-alalc97", - "no-bok": "nb", - "no-nyn": "nn", - "sgn-be-fr": "sfb", - "sgn-be-nl": "vgt", - "sgn-br": "bzs", - "sgn-ch-de": "sgg", - "sgn-co": "csn", - "sgn-de": "gsg", - "sgn-dk": "dsl", - "sgn-es": "ssp", - "sgn-fr": "fsl", - "sgn-gb": "bfi", - "sgn-gr": "gss", - "sgn-ie": "isg", - "sgn-it": "ise", - "sgn-jp": "jsl", - "sgn-mx": "mfs", - "sgn-ni": "ncs", - "sgn-nl": "dse", - "sgn-no": "nsl", - "sgn-pt": "psr", - "sgn-se": "swl", - "sgn-us": "ase", - "sgn-za": "sfs", - "zh-cmn": "cmn", - "zh-cmn-hans": "cmn-Hans", - "zh-cmn-hant": "cmn-Hant", - "zh-gan": "gan", - "zh-guoyu": "cmn", + "cel-gaulish": "xtg-x-cel-gaulish", + "zh-guoyu": "zh", "zh-hakka": "hak", - "zh-min": "zh-min", - "zh-min-nan": "nan", - "zh-wuu": "wuu", "zh-xiang": "hsn", - "zh-yue": "yue", }; -// Mappings from non-extlang subtags to preferred values. 
-// Derived from IANA Language Subtag Registry, file date 2016-10-12. -// http://www.iana.org/assignments/language-subtag-registry -var langSubtagMappings = { - "BU": "MM", - "DD": "DE", - "FX": "FR", - "TP": "TL", - "YD": "YE", - "ZR": "CD", +// Mappings from language subtags to preferred values. +// Derived from CLDR Supplemental Data, version 36.1. +// https://github.com/unicode-org/cldr.git +var languageMappings = { "aam": "aas", + "aar": "aa", + "abk": "ab", "adp": "dz", + "afr": "af", + "aju": "jrb", + "aka": "ak", + "alb": "sq", + "als": "sq", + "amh": "am", + "ara": "ar", + "arb": "ar", + "arg": "an", + "arm": "hy", + "asd": "snz", + "asm": "as", "aue": "ktz", + "ava": "av", + "ave": "ae", + "aym": "ay", + "ayr": "ay", "ayx": "nun", + "aze": "az", + "azj": "az", + "bak": "ba", + "bam": "bm", + "baq": "eu", + "bcc": "bal", + "bcl": "bik", + "bel": "be", + "ben": "bn", "bgm": "bcg", + "bh": "bho", + "bih": "bho", + "bis": "bi", "bjd": "drl", + "bod": "bo", + "bos": "bs", + "bre": "br", + "bul": "bg", + "bur": "my", + "bxk": "luy", + "bxr": "bua", + "cat": "ca", "ccq": "rki", + "ces": "cs", + "cha": "ch", + "che": "ce", + "chi": "zh", + "chu": "cu", + "chv": "cv", "cjr": "mom", "cka": "cmr", + "cld": "syr", "cmk": "xch", + "cmn": "zh", + "cor": "kw", + "cos": "co", "coy": "pij", "cqu": "quh", - "drh": "khk", - "drw": "prs", + "cre": "cr", + "cwd": "cr", + "cym": "cy", + "cze": "cs", + "dan": "da", + "deu": "de", + "dgo": "doi", + "dhd": "mwr", + "dik": "din", + "diq": "zza", + "dit": "dif", + "div": "dv", + "drh": "mn", + "dut": "nl", + "dzo": "dz", + "ekk": "et", + "ell": "el", + "emk": "man", + "eng": "en", + "epo": "eo", + "esk": "ik", + "est": "et", + "eus": "eu", + "ewe": "ee", + "fao": "fo", + "fas": "fa", + "fat": "ak", + "fij": "fj", + "fin": "fi", + "fra": "fr", + "fre": "fr", + "fry": "fy", + "fuc": "ff", + "ful": "ff", "gav": "dev", + "gaz": "om", + "gbo": "grb", + "geo": "ka", + "ger": "de", "gfx": "vaj", "ggn": "gvr", + "gla": "gd", + "gle": "ga", 
+ "glg": "gl", + "glv": "gv", + "gno": "gon", + "gre": "el", + "grn": "gn", "gti": "nyc", + "gug": "gn", + "guj": "gu", "guv": "duz", + "gya": "gba", + "hat": "ht", + "hau": "ha", + "hdn": "hai", + "hea": "hmn", + "heb": "he", + "her": "hz", + "him": "srx", + "hin": "hi", + "hmo": "ho", "hrr": "jal", + "hrv": "hr", + "hun": "hu", + "hye": "hy", "ibi": "opa", + "ibo": "ig", + "ice": "is", + "ido": "io", + "iii": "ii", + "ike": "iu", + "iku": "iu", + "ile": "ie", "ilw": "gal", "in": "id", + "ina": "ia", + "ind": "id", + "ipk": "ik", + "isl": "is", + "ita": "it", "iw": "he", + "jav": "jv", + "jeg": "oyb", "ji": "yi", + "jpn": "ja", "jw": "jv", + "kal": "kl", + "kan": "kn", + "kas": "ks", + "kat": "ka", + "kau": "kr", + "kaz": "kk", "kgc": "tdf", "kgh": "kml", + "khk": "mn", + "khm": "km", + "kik": "ki", + "kin": "rw", + "kir": "ky", + "kmr": "ku", + "knc": "kr", + "kng": "kg", + "knn": "kok", "koj": "kwv", + "kom": "kv", + "kon": "kg", + "kor": "ko", + "kpv": "kv", + "krm": "bmf", "ktr": "dtp", + "kua": "kj", + "kur": "ku", "kvs": "gdj", "kwq": "yam", "kxe": "tvd", "kzj": "dtp", "kzt": "dtp", + "lao": "lo", + "lat": "la", + "lav": "lv", + "lbk": "bnc", "lii": "raq", + "lim": "li", + "lin": "ln", + "lit": "lt", + "llo": "ngt", "lmm": "rmx", + "ltz": "lb", + "lub": "lu", + "lug": "lg", + "lvs": "lv", + "mac": "mk", + "mah": "mh", + "mal": "ml", + "mao": "mi", + "mar": "mr", + "may": "ms", "meg": "cir", + "mhr": "chm", + "mkd": "mk", + "mlg": "mg", + "mlt": "mt", + "mnk": "man", "mo": "ro", + "mol": "ro", + "mon": "mn", + "mri": "mi", + "msa": "ms", "mst": "mry", + "mup": "raj", "mwj": "vaj", + "mya": "my", + "myd": "aog", "myt": "mry", "nad": "xny", + "nau": "na", + "nav": "nv", + "nbl": "nr", + "ncp": "kdz", + "nde": "nd", + "ndo": "ng", + "nep": "ne", + "nld": "nl", + "nno": "nn", + "nns": "nbr", "nnx": "ngv", + "no": "nb", + "nob": "nb", + "nor": "nb", + "npi": "ne", "nts": "pij", + "nya": "ny", + "oci": "oc", + "ojg": "oj", + "oji": "oj", + "ori": "or", + "orm": 
"om", + "ory": "or", + "oss": "os", "oun": "vaj", + "pan": "pa", + "pbu": "ps", "pcr": "adx", + "per": "fa", + "pes": "fa", + "pli": "pi", + "plt": "mg", "pmc": "huw", "pmu": "phr", + "pnb": "lah", + "pol": "pl", + "por": "pt", "ppa": "bfy", "ppr": "lcq", "pry": "prt", + "pus": "ps", "puz": "pub", + "que": "qu", + "quz": "qu", + "rmy": "rom", + "roh": "rm", + "ron": "ro", + "rum": "ro", + "run": "rn", + "rus": "ru", + "sag": "sg", + "san": "sa", "sca": "hle", + "scc": "sr", + "scr": "hr", + "sin": "si", + "skk": "oyb", + "slk": "sk", + "slo": "sk", + "slv": "sl", + "sme": "se", + "smo": "sm", + "sna": "sn", + "snd": "sd", + "som": "so", + "sot": "st", + "spa": "es", + "spy": "kln", + "sqi": "sq", + "src": "sc", + "srd": "sc", + "srp": "sr", + "ssw": "ss", + "sun": "su", + "swa": "sw", + "swe": "sv", + "swh": "sw", + "tah": "ty", + "tam": "ta", + "tat": "tt", "tdu": "dtp", + "tel": "te", + "tgk": "tg", + "tgl": "fil", + "tha": "th", "thc": "tpo", "thx": "oyb", + "tib": "bo", "tie": "ras", + "tir": "ti", "tkk": "twm", + "tl": "fil", "tlw": "weo", "tmp": "tyj", "tne": "kak", - "tnf": "prs", + "ton": "to", "tsf": "taj", + "tsn": "tn", + "tso": "ts", + "ttq": "tmh", + "tuk": "tk", + "tur": "tr", + "tw": "ak", + "twi": "ak", + "uig": "ug", + "ukr": "uk", + "umu": "del", "uok": "ema", + "urd": "ur", + "uzb": "uz", + "uzn": "uz", + "ven": "ve", + "vie": "vi", + "vol": "vo", + "wel": "cy", + "wln": "wa", + "wol": "wo", "xba": "cax", + "xho": "xh", "xia": "acn", "xkh": "waw", + "xpe": "kpe", "xsj": "suj", + "xsl": "den", "ybd": "rki", + "ydd": "yi", + "yid": "yi", "yma": "lrr", "ymt": "mtm", + "yor": "yo", "yos": "zom", "yuu": "yug", + "zai": "zap", + "zha": "za", + "zho": "zh", + "zsm": "ms", + "zul": "zu", + "zyb": "za", }; -// Mappings from extlang subtags to preferred values. -// Derived from IANA Language Subtag Registry, file date 2016-10-12. 
-// http://www.iana.org/assignments/language-subtag-registry -var extlangMappings = { - "aao": {preferred: "aao", prefix: "ar"}, - "abh": {preferred: "abh", prefix: "ar"}, - "abv": {preferred: "abv", prefix: "ar"}, - "acm": {preferred: "acm", prefix: "ar"}, - "acq": {preferred: "acq", prefix: "ar"}, - "acw": {preferred: "acw", prefix: "ar"}, - "acx": {preferred: "acx", prefix: "ar"}, - "acy": {preferred: "acy", prefix: "ar"}, - "adf": {preferred: "adf", prefix: "ar"}, - "ads": {preferred: "ads", prefix: "sgn"}, - "aeb": {preferred: "aeb", prefix: "ar"}, - "aec": {preferred: "aec", prefix: "ar"}, - "aed": {preferred: "aed", prefix: "sgn"}, - "aen": {preferred: "aen", prefix: "sgn"}, - "afb": {preferred: "afb", prefix: "ar"}, - "afg": {preferred: "afg", prefix: "sgn"}, - "ajp": {preferred: "ajp", prefix: "ar"}, - "apc": {preferred: "apc", prefix: "ar"}, - "apd": {preferred: "apd", prefix: "ar"}, - "arb": {preferred: "arb", prefix: "ar"}, - "arq": {preferred: "arq", prefix: "ar"}, - "ars": {preferred: "ars", prefix: "ar"}, - "ary": {preferred: "ary", prefix: "ar"}, - "arz": {preferred: "arz", prefix: "ar"}, - "ase": {preferred: "ase", prefix: "sgn"}, - "asf": {preferred: "asf", prefix: "sgn"}, - "asp": {preferred: "asp", prefix: "sgn"}, - "asq": {preferred: "asq", prefix: "sgn"}, - "asw": {preferred: "asw", prefix: "sgn"}, - "auz": {preferred: "auz", prefix: "ar"}, - "avl": {preferred: "avl", prefix: "ar"}, - "ayh": {preferred: "ayh", prefix: "ar"}, - "ayl": {preferred: "ayl", prefix: "ar"}, - "ayn": {preferred: "ayn", prefix: "ar"}, - "ayp": {preferred: "ayp", prefix: "ar"}, - "bbz": {preferred: "bbz", prefix: "ar"}, - "bfi": {preferred: "bfi", prefix: "sgn"}, - "bfk": {preferred: "bfk", prefix: "sgn"}, - "bjn": {preferred: "bjn", prefix: "ms"}, - "bog": {preferred: "bog", prefix: "sgn"}, - "bqn": {preferred: "bqn", prefix: "sgn"}, - "bqy": {preferred: "bqy", prefix: "sgn"}, - "btj": {preferred: "btj", prefix: "ms"}, - "bve": {preferred: "bve", prefix: "ms"}, - 
"bvl": {preferred: "bvl", prefix: "sgn"}, - "bvu": {preferred: "bvu", prefix: "ms"}, - "bzs": {preferred: "bzs", prefix: "sgn"}, - "cdo": {preferred: "cdo", prefix: "zh"}, - "cds": {preferred: "cds", prefix: "sgn"}, - "cjy": {preferred: "cjy", prefix: "zh"}, - "cmn": {preferred: "cmn", prefix: "zh"}, - "coa": {preferred: "coa", prefix: "ms"}, - "cpx": {preferred: "cpx", prefix: "zh"}, - "csc": {preferred: "csc", prefix: "sgn"}, - "csd": {preferred: "csd", prefix: "sgn"}, - "cse": {preferred: "cse", prefix: "sgn"}, - "csf": {preferred: "csf", prefix: "sgn"}, - "csg": {preferred: "csg", prefix: "sgn"}, - "csl": {preferred: "csl", prefix: "sgn"}, - "csn": {preferred: "csn", prefix: "sgn"}, - "csq": {preferred: "csq", prefix: "sgn"}, - "csr": {preferred: "csr", prefix: "sgn"}, - "czh": {preferred: "czh", prefix: "zh"}, - "czo": {preferred: "czo", prefix: "zh"}, - "doq": {preferred: "doq", prefix: "sgn"}, - "dse": {preferred: "dse", prefix: "sgn"}, - "dsl": {preferred: "dsl", prefix: "sgn"}, - "dup": {preferred: "dup", prefix: "ms"}, - "ecs": {preferred: "ecs", prefix: "sgn"}, - "esl": {preferred: "esl", prefix: "sgn"}, - "esn": {preferred: "esn", prefix: "sgn"}, - "eso": {preferred: "eso", prefix: "sgn"}, - "eth": {preferred: "eth", prefix: "sgn"}, - "fcs": {preferred: "fcs", prefix: "sgn"}, - "fse": {preferred: "fse", prefix: "sgn"}, - "fsl": {preferred: "fsl", prefix: "sgn"}, - "fss": {preferred: "fss", prefix: "sgn"}, - "gan": {preferred: "gan", prefix: "zh"}, - "gds": {preferred: "gds", prefix: "sgn"}, - "gom": {preferred: "gom", prefix: "kok"}, - "gse": {preferred: "gse", prefix: "sgn"}, - "gsg": {preferred: "gsg", prefix: "sgn"}, - "gsm": {preferred: "gsm", prefix: "sgn"}, - "gss": {preferred: "gss", prefix: "sgn"}, - "gus": {preferred: "gus", prefix: "sgn"}, - "hab": {preferred: "hab", prefix: "sgn"}, - "haf": {preferred: "haf", prefix: "sgn"}, - "hak": {preferred: "hak", prefix: "zh"}, - "hds": {preferred: "hds", prefix: "sgn"}, - "hji": {preferred: "hji", 
prefix: "ms"}, - "hks": {preferred: "hks", prefix: "sgn"}, - "hos": {preferred: "hos", prefix: "sgn"}, - "hps": {preferred: "hps", prefix: "sgn"}, - "hsh": {preferred: "hsh", prefix: "sgn"}, - "hsl": {preferred: "hsl", prefix: "sgn"}, - "hsn": {preferred: "hsn", prefix: "zh"}, - "icl": {preferred: "icl", prefix: "sgn"}, - "iks": {preferred: "iks", prefix: "sgn"}, - "ils": {preferred: "ils", prefix: "sgn"}, - "inl": {preferred: "inl", prefix: "sgn"}, - "ins": {preferred: "ins", prefix: "sgn"}, - "ise": {preferred: "ise", prefix: "sgn"}, - "isg": {preferred: "isg", prefix: "sgn"}, - "isr": {preferred: "isr", prefix: "sgn"}, - "jak": {preferred: "jak", prefix: "ms"}, - "jax": {preferred: "jax", prefix: "ms"}, - "jcs": {preferred: "jcs", prefix: "sgn"}, - "jhs": {preferred: "jhs", prefix: "sgn"}, - "jls": {preferred: "jls", prefix: "sgn"}, - "jos": {preferred: "jos", prefix: "sgn"}, - "jsl": {preferred: "jsl", prefix: "sgn"}, - "jus": {preferred: "jus", prefix: "sgn"}, - "kgi": {preferred: "kgi", prefix: "sgn"}, - "knn": {preferred: "knn", prefix: "kok"}, - "kvb": {preferred: "kvb", prefix: "ms"}, - "kvk": {preferred: "kvk", prefix: "sgn"}, - "kvr": {preferred: "kvr", prefix: "ms"}, - "kxd": {preferred: "kxd", prefix: "ms"}, - "lbs": {preferred: "lbs", prefix: "sgn"}, - "lce": {preferred: "lce", prefix: "ms"}, - "lcf": {preferred: "lcf", prefix: "ms"}, - "liw": {preferred: "liw", prefix: "ms"}, - "lls": {preferred: "lls", prefix: "sgn"}, - "lsg": {preferred: "lsg", prefix: "sgn"}, - "lsl": {preferred: "lsl", prefix: "sgn"}, - "lso": {preferred: "lso", prefix: "sgn"}, - "lsp": {preferred: "lsp", prefix: "sgn"}, - "lst": {preferred: "lst", prefix: "sgn"}, - "lsy": {preferred: "lsy", prefix: "sgn"}, - "ltg": {preferred: "ltg", prefix: "lv"}, - "lvs": {preferred: "lvs", prefix: "lv"}, - "lzh": {preferred: "lzh", prefix: "zh"}, - "max": {preferred: "max", prefix: "ms"}, - "mdl": {preferred: "mdl", prefix: "sgn"}, - "meo": {preferred: "meo", prefix: "ms"}, - "mfa": 
{preferred: "mfa", prefix: "ms"}, - "mfb": {preferred: "mfb", prefix: "ms"}, - "mfs": {preferred: "mfs", prefix: "sgn"}, - "min": {preferred: "min", prefix: "ms"}, - "mnp": {preferred: "mnp", prefix: "zh"}, - "mqg": {preferred: "mqg", prefix: "ms"}, - "mre": {preferred: "mre", prefix: "sgn"}, - "msd": {preferred: "msd", prefix: "sgn"}, - "msi": {preferred: "msi", prefix: "ms"}, - "msr": {preferred: "msr", prefix: "sgn"}, - "mui": {preferred: "mui", prefix: "ms"}, - "mzc": {preferred: "mzc", prefix: "sgn"}, - "mzg": {preferred: "mzg", prefix: "sgn"}, - "mzy": {preferred: "mzy", prefix: "sgn"}, - "nan": {preferred: "nan", prefix: "zh"}, - "nbs": {preferred: "nbs", prefix: "sgn"}, - "ncs": {preferred: "ncs", prefix: "sgn"}, - "nsi": {preferred: "nsi", prefix: "sgn"}, - "nsl": {preferred: "nsl", prefix: "sgn"}, - "nsp": {preferred: "nsp", prefix: "sgn"}, - "nsr": {preferred: "nsr", prefix: "sgn"}, - "nzs": {preferred: "nzs", prefix: "sgn"}, - "okl": {preferred: "okl", prefix: "sgn"}, - "orn": {preferred: "orn", prefix: "ms"}, - "ors": {preferred: "ors", prefix: "ms"}, - "pel": {preferred: "pel", prefix: "ms"}, - "pga": {preferred: "pga", prefix: "ar"}, - "pgz": {preferred: "pgz", prefix: "sgn"}, - "pks": {preferred: "pks", prefix: "sgn"}, - "prl": {preferred: "prl", prefix: "sgn"}, - "prz": {preferred: "prz", prefix: "sgn"}, - "psc": {preferred: "psc", prefix: "sgn"}, - "psd": {preferred: "psd", prefix: "sgn"}, - "pse": {preferred: "pse", prefix: "ms"}, - "psg": {preferred: "psg", prefix: "sgn"}, - "psl": {preferred: "psl", prefix: "sgn"}, - "pso": {preferred: "pso", prefix: "sgn"}, - "psp": {preferred: "psp", prefix: "sgn"}, - "psr": {preferred: "psr", prefix: "sgn"}, - "pys": {preferred: "pys", prefix: "sgn"}, - "rms": {preferred: "rms", prefix: "sgn"}, - "rsi": {preferred: "rsi", prefix: "sgn"}, - "rsl": {preferred: "rsl", prefix: "sgn"}, - "rsm": {preferred: "rsm", prefix: "sgn"}, - "sdl": {preferred: "sdl", prefix: "sgn"}, - "sfb": {preferred: "sfb", prefix: 
"sgn"}, - "sfs": {preferred: "sfs", prefix: "sgn"}, - "sgg": {preferred: "sgg", prefix: "sgn"}, - "sgx": {preferred: "sgx", prefix: "sgn"}, - "shu": {preferred: "shu", prefix: "ar"}, - "slf": {preferred: "slf", prefix: "sgn"}, - "sls": {preferred: "sls", prefix: "sgn"}, - "sqk": {preferred: "sqk", prefix: "sgn"}, - "sqs": {preferred: "sqs", prefix: "sgn"}, - "ssh": {preferred: "ssh", prefix: "ar"}, - "ssp": {preferred: "ssp", prefix: "sgn"}, - "ssr": {preferred: "ssr", prefix: "sgn"}, - "svk": {preferred: "svk", prefix: "sgn"}, - "swc": {preferred: "swc", prefix: "sw"}, - "swh": {preferred: "swh", prefix: "sw"}, - "swl": {preferred: "swl", prefix: "sgn"}, - "syy": {preferred: "syy", prefix: "sgn"}, - "tmw": {preferred: "tmw", prefix: "ms"}, - "tse": {preferred: "tse", prefix: "sgn"}, - "tsm": {preferred: "tsm", prefix: "sgn"}, - "tsq": {preferred: "tsq", prefix: "sgn"}, - "tss": {preferred: "tss", prefix: "sgn"}, - "tsy": {preferred: "tsy", prefix: "sgn"}, - "tza": {preferred: "tza", prefix: "sgn"}, - "ugn": {preferred: "ugn", prefix: "sgn"}, - "ugy": {preferred: "ugy", prefix: "sgn"}, - "ukl": {preferred: "ukl", prefix: "sgn"}, - "uks": {preferred: "uks", prefix: "sgn"}, - "urk": {preferred: "urk", prefix: "ms"}, - "uzn": {preferred: "uzn", prefix: "uz"}, - "uzs": {preferred: "uzs", prefix: "uz"}, - "vgt": {preferred: "vgt", prefix: "sgn"}, - "vkk": {preferred: "vkk", prefix: "ms"}, - "vkt": {preferred: "vkt", prefix: "ms"}, - "vsi": {preferred: "vsi", prefix: "sgn"}, - "vsl": {preferred: "vsl", prefix: "sgn"}, - "vsv": {preferred: "vsv", prefix: "sgn"}, - "wuu": {preferred: "wuu", prefix: "zh"}, - "xki": {preferred: "xki", prefix: "sgn"}, - "xml": {preferred: "xml", prefix: "sgn"}, - "xmm": {preferred: "xmm", prefix: "ms"}, - "xms": {preferred: "xms", prefix: "sgn"}, - "ygs": {preferred: "ygs", prefix: "sgn"}, - "yhs": {preferred: "yhs", prefix: "sgn"}, - "ysl": {preferred: "ysl", prefix: "sgn"}, - "yue": {preferred: "yue", prefix: "zh"}, - "zib": {preferred: 
"zib", prefix: "sgn"}, - "zlm": {preferred: "zlm", prefix: "ms"}, - "zmi": {preferred: "zmi", prefix: "ms"}, - "zsl": {preferred: "zsl", prefix: "sgn"}, - "zsm": {preferred: "zsm", prefix: "ms"}, +// Language subtags with complex mappings. +// Derived from CLDR Supplemental Data, version 36.1. +// https://github.com/unicode-org/cldr.git +var complexLanguageMappings = { + "cnr": true, + "drw": true, + "hbs": true, + "prs": true, + "sh": true, + "swc": true, + "tnf": true, }; + +// Mappings from region subtags to preferred values. +// Derived from CLDR Supplemental Data, version 36.1. +// https://github.com/unicode-org/cldr.git +var regionMappings = { + "004": "AF", + "008": "AL", + "010": "AQ", + "012": "DZ", + "016": "AS", + "020": "AD", + "024": "AO", + "028": "AG", + "031": "AZ", + "032": "AR", + "036": "AU", + "040": "AT", + "044": "BS", + "048": "BH", + "050": "BD", + "051": "AM", + "052": "BB", + "056": "BE", + "060": "BM", + "062": "034", + "064": "BT", + "068": "BO", + "070": "BA", + "072": "BW", + "074": "BV", + "076": "BR", + "084": "BZ", + "086": "IO", + "090": "SB", + "092": "VG", + "096": "BN", + "100": "BG", + "104": "MM", + "108": "BI", + "112": "BY", + "116": "KH", + "120": "CM", + "124": "CA", + "132": "CV", + "136": "KY", + "140": "CF", + "144": "LK", + "148": "TD", + "152": "CL", + "156": "CN", + "158": "TW", + "162": "CX", + "166": "CC", + "170": "CO", + "174": "KM", + "175": "YT", + "178": "CG", + "180": "CD", + "184": "CK", + "188": "CR", + "191": "HR", + "192": "CU", + "196": "CY", + "203": "CZ", + "204": "BJ", + "208": "DK", + "212": "DM", + "214": "DO", + "218": "EC", + "222": "SV", + "226": "GQ", + "230": "ET", + "231": "ET", + "232": "ER", + "233": "EE", + "234": "FO", + "238": "FK", + "239": "GS", + "242": "FJ", + "246": "FI", + "248": "AX", + "249": "FR", + "250": "FR", + "254": "GF", + "258": "PF", + "260": "TF", + "262": "DJ", + "266": "GA", + "268": "GE", + "270": "GM", + "275": "PS", + "276": "DE", + "278": "DE", + "280": "DE", + 
"288": "GH", + "292": "GI", + "296": "KI", + "300": "GR", + "304": "GL", + "308": "GD", + "312": "GP", + "316": "GU", + "320": "GT", + "324": "GN", + "328": "GY", + "332": "HT", + "334": "HM", + "336": "VA", + "340": "HN", + "344": "HK", + "348": "HU", + "352": "IS", + "356": "IN", + "360": "ID", + "364": "IR", + "368": "IQ", + "372": "IE", + "376": "IL", + "380": "IT", + "384": "CI", + "388": "JM", + "392": "JP", + "398": "KZ", + "400": "JO", + "404": "KE", + "408": "KP", + "410": "KR", + "414": "KW", + "417": "KG", + "418": "LA", + "422": "LB", + "426": "LS", + "428": "LV", + "430": "LR", + "434": "LY", + "438": "LI", + "440": "LT", + "442": "LU", + "446": "MO", + "450": "MG", + "454": "MW", + "458": "MY", + "462": "MV", + "466": "ML", + "470": "MT", + "474": "MQ", + "478": "MR", + "480": "MU", + "484": "MX", + "492": "MC", + "496": "MN", + "498": "MD", + "499": "ME", + "500": "MS", + "504": "MA", + "508": "MZ", + "512": "OM", + "516": "NA", + "520": "NR", + "524": "NP", + "528": "NL", + "531": "CW", + "533": "AW", + "534": "SX", + "535": "BQ", + "540": "NC", + "548": "VU", + "554": "NZ", + "558": "NI", + "562": "NE", + "566": "NG", + "570": "NU", + "574": "NF", + "578": "NO", + "580": "MP", + "581": "UM", + "583": "FM", + "584": "MH", + "585": "PW", + "586": "PK", + "591": "PA", + "598": "PG", + "600": "PY", + "604": "PE", + "608": "PH", + "612": "PN", + "616": "PL", + "620": "PT", + "624": "GW", + "626": "TL", + "630": "PR", + "634": "QA", + "638": "RE", + "642": "RO", + "643": "RU", + "646": "RW", + "652": "BL", + "654": "SH", + "659": "KN", + "660": "AI", + "662": "LC", + "663": "MF", + "666": "PM", + "670": "VC", + "674": "SM", + "678": "ST", + "682": "SA", + "686": "SN", + "688": "RS", + "690": "SC", + "694": "SL", + "702": "SG", + "703": "SK", + "704": "VN", + "705": "SI", + "706": "SO", + "710": "ZA", + "716": "ZW", + "720": "YE", + "724": "ES", + "728": "SS", + "729": "SD", + "732": "EH", + "736": "SD", + "740": "SR", + "744": "SJ", + "748": "SZ", + 
"752": "SE", + "756": "CH", + "760": "SY", + "762": "TJ", + "764": "TH", + "768": "TG", + "772": "TK", + "776": "TO", + "780": "TT", + "784": "AE", + "788": "TN", + "792": "TR", + "795": "TM", + "796": "TC", + "798": "TV", + "800": "UG", + "804": "UA", + "807": "MK", + "818": "EG", + "826": "GB", + "830": "JE", + "831": "GG", + "832": "JE", + "833": "IM", + "834": "TZ", + "840": "US", + "850": "VI", + "854": "BF", + "858": "UY", + "860": "UZ", + "862": "VE", + "876": "WF", + "882": "WS", + "886": "YE", + "887": "YE", + "891": "RS", + "894": "ZM", + "958": "AA", + "959": "QM", + "960": "QN", + "962": "QP", + "963": "QQ", + "964": "QR", + "965": "QS", + "966": "QT", + "967": "EU", + "968": "QV", + "969": "QW", + "970": "QX", + "971": "QY", + "972": "QZ", + "973": "XA", + "974": "XB", + "975": "XC", + "976": "XD", + "977": "XE", + "978": "XF", + "979": "XG", + "980": "XH", + "981": "XI", + "982": "XJ", + "983": "XK", + "984": "XL", + "985": "XM", + "986": "XN", + "987": "XO", + "988": "XP", + "989": "XQ", + "990": "XR", + "991": "XS", + "992": "XT", + "993": "XU", + "994": "XV", + "995": "XW", + "996": "XX", + "997": "XY", + "998": "XZ", + "999": "ZZ", + "BU": "MM", + "CS": "RS", + "CT": "KI", + "DD": "DE", + "DY": "BJ", + "FQ": "AQ", + "FX": "FR", + "HV": "BF", + "JT": "UM", + "MI": "UM", + "NH": "VU", + "NQ": "AQ", + "PU": "UM", + "PZ": "PA", + "QU": "EU", + "RH": "ZW", + "TP": "TL", + "UK": "GB", + "VD": "VN", + "WK": "UM", + "YD": "YE", + "YU": "RS", + "ZR": "CD", +}; + +// Region subtags with complex mappings. +// Derived from CLDR Supplemental Data, version 36.1. +// https://github.com/unicode-org/cldr.git +var complexRegionMappings = { + "172": true, + "200": true, + "530": true, + "532": true, + "536": true, + "582": true, + "810": true, + "890": true, + "AN": true, + "NT": true, + "PC": true, + "SU": true, +}; + +// Canonicalize Unicode BCP 47 locale identifiers. +// Derived from CLDR Supplemental Data, version 36.1. 
+// https://github.com/unicode-org/cldr.git +/* eslint-disable complexity */ +function updateLocaleIdMappings(tag) { + assert(IsObject(tag), "tag is an object"); + + // Replace deprecated language tags with their preferred values. + var language = tag.language; + if (hasOwn(language, languageMappings)) { + tag.language = languageMappings[language]; + } else if (hasOwn(language, complexLanguageMappings)) { + switch (language) { + case "cnr": + tag.language = "sr"; + if (tag.region === undefined) + tag.region = "ME"; + break; + case "drw": + case "prs": + case "tnf": + tag.language = "fa"; + if (tag.region === undefined) + tag.region = "AF"; + break; + case "hbs": + case "sh": + tag.language = "sr"; + if (tag.script === undefined) + tag.script = "Latn"; + break; + case "swc": + tag.language = "sw"; + if (tag.region === undefined) + tag.region = "CD"; + break; + default: + assert(false, "language not handled: " + language); + } + } + + // No script replacements are currently present. + + // Replace deprecated subtags with their preferred values. 
+ var region = tag.region; + if (region !== undefined) { + if (hasOwn(region, regionMappings)) { + tag.region = regionMappings[region]; + } else if (hasOwn(region, complexRegionMappings)) { + switch (region) { + case "172": + if (tag.language === "ab") { + tag.region = "GE"; + break; + } + if (tag.language === "az") { + tag.region = "AZ"; + break; + } + if (tag.language === "be") { + tag.region = "BY"; + break; + } + if (tag.language === "crh") { + tag.region = "UA"; + break; + } + if (tag.language === "gag") { + tag.region = "MD"; + break; + } + if (tag.language === "got") { + tag.region = "UA"; + break; + } + if (tag.language === "hy") { + tag.region = "AM"; + break; + } + if (tag.language === "ji") { + tag.region = "UA"; + break; + } + if (tag.language === "ka") { + tag.region = "GE"; + break; + } + if (tag.language === "kaa") { + tag.region = "UZ"; + break; + } + if (tag.language === "kk") { + tag.region = "KZ"; + break; + } + if (tag.language === "ku" && tag.script === "Yezi") { + tag.region = "GE"; + break; + } + if (tag.language === "ky") { + tag.region = "KG"; + break; + } + if (tag.language === "os") { + tag.region = "GE"; + break; + } + if (tag.language === "rue") { + tag.region = "UA"; + break; + } + if (tag.language === "sog") { + tag.region = "UZ"; + break; + } + if (tag.language === "tg") { + tag.region = "TJ"; + break; + } + if (tag.language === "tk") { + tag.region = "TM"; + break; + } + if (tag.language === "tkr") { + tag.region = "AZ"; + break; + } + if (tag.language === "tly") { + tag.region = "AZ"; + break; + } + if (tag.language === "ttt") { + tag.region = "AZ"; + break; + } + if (tag.language === "ug" && tag.script === "Cyrl") { + tag.region = "KZ"; + break; + } + if (tag.language === "uk") { + tag.region = "UA"; + break; + } + if (tag.language === "und" && tag.script === "Geor") { + tag.region = "GE"; + break; + } + if (tag.language === "und" && tag.script === "Armn") { + tag.region = "AM"; + break; + } + if (tag.language === "und" && 
tag.script === "Sogo") { + tag.region = "UZ"; + break; + } + if (tag.language === "und" && tag.script === "Goth") { + tag.region = "UA"; + break; + } + if (tag.language === "und" && tag.script === "Chrs") { + tag.region = "UZ"; + break; + } + if (tag.language === "und" && tag.script === "Sogd") { + tag.region = "UZ"; + break; + } + if (tag.language === "und" && tag.script === "Yezi") { + tag.region = "GE"; + break; + } + if (tag.language === "uz") { + tag.region = "UZ"; + break; + } + if (tag.language === "xco") { + tag.region = "UZ"; + break; + } + if (tag.language === "xmf") { + tag.region = "GE"; + break; + } + tag.region = "RU"; + break; + case "200": + if (tag.language === "sk") { + tag.region = "SK"; + break; + } + tag.region = "CZ"; + break; + case "530": + case "532": + case "AN": + if (tag.language === "vic") { + tag.region = "SX"; + break; + } + tag.region = "CW"; + break; + case "536": + case "NT": + if (tag.language === "akk") { + tag.region = "IQ"; + break; + } + if (tag.language === "ckb") { + tag.region = "IQ"; + break; + } + if (tag.language === "ku" && tag.script === "Arab") { + tag.region = "IQ"; + break; + } + if (tag.language === "mis") { + tag.region = "IQ"; + break; + } + if (tag.language === "syr") { + tag.region = "IQ"; + break; + } + if (tag.language === "und" && tag.script === "Syrc") { + tag.region = "IQ"; + break; + } + if (tag.language === "und" && tag.script === "Hatr") { + tag.region = "IQ"; + break; + } + if (tag.language === "und" && tag.script === "Xsux") { + tag.region = "IQ"; + break; + } + tag.region = "SA"; + break; + case "582": + case "PC": + if (tag.language === "mh") { + tag.region = "MH"; + break; + } + if (tag.language === "pau") { + tag.region = "PW"; + break; + } + tag.region = "FM"; + break; + case "810": + case "SU": + if (tag.language === "ab") { + tag.region = "GE"; + break; + } + if (tag.language === "az") { + tag.region = "AZ"; + break; + } + if (tag.language === "be") { + tag.region = "BY"; + break; + } + if 
(tag.language === "crh") { + tag.region = "UA"; + break; + } + if (tag.language === "et") { + tag.region = "EE"; + break; + } + if (tag.language === "gag") { + tag.region = "MD"; + break; + } + if (tag.language === "got") { + tag.region = "UA"; + break; + } + if (tag.language === "hy") { + tag.region = "AM"; + break; + } + if (tag.language === "ji") { + tag.region = "UA"; + break; + } + if (tag.language === "ka") { + tag.region = "GE"; + break; + } + if (tag.language === "kaa") { + tag.region = "UZ"; + break; + } + if (tag.language === "kk") { + tag.region = "KZ"; + break; + } + if (tag.language === "ku" && tag.script === "Yezi") { + tag.region = "GE"; + break; + } + if (tag.language === "ky") { + tag.region = "KG"; + break; + } + if (tag.language === "lt") { + tag.region = "LT"; + break; + } + if (tag.language === "ltg") { + tag.region = "LV"; + break; + } + if (tag.language === "lv") { + tag.region = "LV"; + break; + } + if (tag.language === "os") { + tag.region = "GE"; + break; + } + if (tag.language === "rue") { + tag.region = "UA"; + break; + } + if (tag.language === "sgs") { + tag.region = "LT"; + break; + } + if (tag.language === "sog") { + tag.region = "UZ"; + break; + } + if (tag.language === "tg") { + tag.region = "TJ"; + break; + } + if (tag.language === "tk") { + tag.region = "TM"; + break; + } + if (tag.language === "tkr") { + tag.region = "AZ"; + break; + } + if (tag.language === "tly") { + tag.region = "AZ"; + break; + } + if (tag.language === "ttt") { + tag.region = "AZ"; + break; + } + if (tag.language === "ug" && tag.script === "Cyrl") { + tag.region = "KZ"; + break; + } + if (tag.language === "uk") { + tag.region = "UA"; + break; + } + if (tag.language === "und" && tag.script === "Geor") { + tag.region = "GE"; + break; + } + if (tag.language === "und" && tag.script === "Armn") { + tag.region = "AM"; + break; + } + if (tag.language === "und" && tag.script === "Sogo") { + tag.region = "UZ"; + break; + } + if (tag.language === "und" && tag.script 
=== "Goth") { + tag.region = "UA"; + break; + } + if (tag.language === "und" && tag.script === "Chrs") { + tag.region = "UZ"; + break; + } + if (tag.language === "und" && tag.script === "Sogd") { + tag.region = "UZ"; + break; + } + if (tag.language === "und" && tag.script === "Yezi") { + tag.region = "GE"; + break; + } + if (tag.language === "uz") { + tag.region = "UZ"; + break; + } + if (tag.language === "vro") { + tag.region = "EE"; + break; + } + if (tag.language === "xco") { + tag.region = "UZ"; + break; + } + if (tag.language === "xmf") { + tag.region = "GE"; + break; + } + tag.region = "RU"; + break; + case "890": + if (tag.language === "bs") { + tag.region = "BA"; + break; + } + if (tag.language === "hr") { + tag.region = "HR"; + break; + } + if (tag.language === "mk") { + tag.region = "MK"; + break; + } + if (tag.language === "sl") { + tag.region = "SI"; + break; + } + tag.region = "RS"; + break; + default: + assert(false, "region not handled: " + region); + } + } + + // No variant replacements are currently present. + // No extension replacements are currently present. + // Private use sequences are left as is. + + } +} +/* eslint-enable complexity */ + +// Canonicalize grandfathered locale identifiers. +// Derived from CLDR Supplemental Data, version 36.1. +// https://github.com/unicode-org/cldr.git +function updateGrandfatheredMappings(tag) { + assert(IsObject(tag), "tag is an object"); + + // We're mapping regular grandfathered tags to non-grandfathered form here. + // Other tags remain unchanged. 
+ // + // regular = "art-lojban" + // / "cel-gaulish" + // / "no-bok" + // / "no-nyn" + // / "zh-guoyu" + // / "zh-hakka" + // / "zh-min" + // / "zh-min-nan" + // / "zh-xiang" + // + // Therefore we can quickly exclude most tags by checking every + // |unicode_locale_id| subcomponent for characteristics not shared by any of + // the regular grandfathered (RG) tags: + // + // * Real-world |unicode_language_subtag|s are all two or three letters, + // so don't waste time running a useless |language.length > 3| fast-path. + // * No RG tag has a "script"-looking component. + // * No RG tag has a "region"-looking component. + // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, + // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, + // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag + // that |unicode_locale_id| doesn't support.) + // * No RG tag contains |extensions| or |pu_extensions|. + if (tag.script !== undefined || + tag.region !== undefined || + tag.variants.length !== 1 || + tag.extensions.length !== 0 || + tag.privateuse !== undefined) + { + return; + } + + // art-lojban -> jbo + if (tag.language === "art" && tag.variants[0] === "lojban") { + tag.language = "jbo"; + tag.variants.length = 0; + } + + // cel-gaulish -> xtg-x-cel-gaulish + else if (tag.language === "cel" && tag.variants[0] === "gaulish") { + tag.language = "xtg"; + tag.variants.length = 0; + tag.privateuse = "x-cel-gaulish"; + } + + // zh-guoyu -> zh + else if (tag.language === "zh" && tag.variants[0] === "guoyu") { + tag.language = "zh"; + tag.variants.length = 0; + } + + // zh-hakka -> hak + else if (tag.language === "zh" && tag.variants[0] === "hakka") { + tag.language = "hak"; + tag.variants.length = 0; + } + + // zh-xiang -> hsn + else if (tag.language === "zh" && tag.variants[0] === "xiang") { + tag.language = "hsn"; + tag.variants.length = 0; + } +} diff --git a/js/src/builtin/intl/NumberFormat.js b/js/src/builtin/intl/NumberFormat.js index 
bba78d7a0d..64158c1103 100644 --- a/js/src/builtin/intl/NumberFormat.js +++ b/js/src/builtin/intl/NumberFormat.js @@ -8,7 +8,7 @@ /**
* NumberFormat internal properties.
*
- * Spec: ECMAScript Internationalization API Specification, 9.1 and 11.2.3.
+ * Spec: ECMAScript Internationalization API Specification, 9.1 and 11.3.3.
*/
var numberFormatInternalProperties = {
localeData: numberFormatLocaleData,
@@ -35,44 +35,38 @@ function resolveNumberFormatInternals(lazyNumberFormatData) { var internalProps = std_Object_create(null);
- // Step 3.
- var requestedLocales = lazyNumberFormatData.requestedLocales;
-
- // Compute options that impact interpretation of locale.
- // Step 6.
- var opt = lazyNumberFormatData.opt;
-
var NumberFormat = numberFormatInternalProperties;
- // Step 9.
+ // Compute effective locale.
+
+ // Step 7.
var localeData = NumberFormat.localeData;
- // Step 10.
+ // Step 8.
var r = ResolveLocale(callFunction(NumberFormat.availableLocales, NumberFormat),
lazyNumberFormatData.requestedLocales,
lazyNumberFormatData.opt,
NumberFormat.relevantExtensionKeys,
localeData);
- // Steps 11-12. (Step 13 is not relevant to our implementation.)
+ // Steps 9-10. (Step 11 is not relevant to our implementation.)
internalProps.locale = r.locale;
internalProps.numberingSystem = r.nu;
// Compute formatting options.
- // Step 15.
+ // Step 13.
var s = lazyNumberFormatData.style;
internalProps.style = s;
- // Steps 19, 21.
+ // Steps 17, 19.
if (s === "currency") {
internalProps.currency = lazyNumberFormatData.currency;
internalProps.currencyDisplay = lazyNumberFormatData.currencyDisplay;
}
+ // Step 22.
internalProps.minimumIntegerDigits = lazyNumberFormatData.minimumIntegerDigits;
-
internalProps.minimumFractionDigits = lazyNumberFormatData.minimumFractionDigits;
-
internalProps.maximumFractionDigits = lazyNumberFormatData.maximumFractionDigits;
if ("minimumSignificantDigits" in lazyNumberFormatData) {
@@ -83,12 +77,9 @@ function resolveNumberFormatInternals(lazyNumberFormatData) { internalProps.maximumSignificantDigits = lazyNumberFormatData.maximumSignificantDigits;
}
- // Step 27.
+ // Step 24.
internalProps.useGrouping = lazyNumberFormatData.useGrouping;
- // Step 34.
- internalProps.boundFormat = undefined;
-
// The caller is responsible for associating |internalProps| with the right
// object using |setInternalProperties|.
return internalProps;
@@ -118,19 +109,21 @@ function getNumberFormatInternals(obj) { /**
- * UnwrapNumberFormat(nf)
+ * 11.1.11 UnwrapNumberFormat( nf )
*/
function UnwrapNumberFormat(nf, methodName) {
- // Step 1.
+ // Step 1 (not applicable in our implementation).
+
+ // Step 2.
if ((!IsObject(nf) || !IsNumberFormat(nf)) && nf instanceof GetNumberFormatConstructor()) {
nf = nf[intlFallbackSymbol()];
}
- // Step 2.
+ // Step 3.
if (!IsObject(nf) || !IsNumberFormat(nf))
ThrowTypeError(JSMSG_INTL_OBJECT_NOT_INITED, "NumberFormat", methodName, "NumberFormat");
- // Step 3.
+ // Step 4.
return nf;
}
@@ -141,18 +134,18 @@ function UnwrapNumberFormat(nf, methodName) { * Spec: ECMAScript Internationalization API Specification, 11.1.1.
*/
function SetNumberFormatDigitOptions(lazyData, options, mnfdDefault) {
- // We skip Step 1 because we set the properties on a lazyData object.
+ // We skip step 1 because we set the properties on a lazyData object.
- // Step 2-3.
+ // Steps 2-4.
assert(IsObject(options), "SetNumberFormatDigitOptions");
assert(typeof mnfdDefault === "number", "SetNumberFormatDigitOptions");
- // Steps 4-6.
+ // Steps 5-8.
const mnid = GetNumberOption(options, "minimumIntegerDigits", 1, 21, 1);
const mnfd = GetNumberOption(options, "minimumFractionDigits", 0, 20, mnfdDefault);
const mxfd = GetNumberOption(options, "maximumFractionDigits", mnfd, 20);
- // Steps 7-8.
+ // Steps 9-10.
let mnsd = options.minimumSignificantDigits;
let mxsd = options.maximumSignificantDigits;
@@ -196,17 +189,9 @@ function toASCIIUpperCase(s) { *
* Spec: ECMAScript Internationalization API Specification, 6.3.1.
*/
-function getIsWellFormedCurrencyCodeRE() {
- return internalIntlRegExps.isWellFormedCurrencyCodeRE ||
- (internalIntlRegExps.isWellFormedCurrencyCodeRE = RegExpCreate("[^A-Z]"));
-}
-
function IsWellFormedCurrencyCode(currency) {
- var c = ToString(currency);
- var normalized = toASCIIUpperCase(c);
- if (normalized.length !== 3)
- return false;
- return !regexp_test_no_statics(getIsWellFormedCurrencyCodeRE(), normalized);
+ assert(typeof currency === "string", "currency is a string value");
+ return currency.length === 3 && IsASCIIAlphaString(currency);
}
/**
@@ -218,15 +203,12 @@ function IsWellFormedCurrencyCode(currency) { * This later work occurs in |resolveNumberFormatInternals|; steps not noted
* here occur there.
*
- * Spec: ECMAScript Internationalization API Specification, 11.1.1.
+ * Spec: ECMAScript Internationalization API Specification, 11.1.2.
*/
function InitializeNumberFormat(numberFormat, thisValue, locales, options) {
assert(IsObject(numberFormat), "InitializeNumberFormat called with non-object");
assert(IsNumberFormat(numberFormat), "InitializeNumberFormat called with non-NumberFormat");
- // Steps 1-2 (These steps are no longer required and should be removed
- // from the spec; https://github.com/tc39/ecma402/issues/115).
-
// Lazy NumberFormat data has the following structure:
//
// {
@@ -258,11 +240,11 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { // subset of them.
var lazyNumberFormatData = std_Object_create(null);
- // Step 3.
+ // Step 1.
var requestedLocales = CanonicalizeLocaleList(locales);
lazyNumberFormatData.requestedLocales = requestedLocales;
- // Steps 4-5.
+ // Steps 2-3.
//
// If we ever need more speed here at startup, we should try to detect the
// case where |options === undefined| and Object.prototype hasn't been
@@ -275,20 +257,20 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { options = ToObject(options);
// Compute options that impact interpretation of locale.
- // Step 6.
+ // Step 4.
var opt = new Record();
lazyNumberFormatData.opt = opt;
- // Steps 7-8.
+ // Steps 5-6.
var matcher = GetOption(options, "localeMatcher", "string", ["lookup", "best fit"], "best fit");
opt.localeMatcher = matcher;
// Compute formatting options.
- // Step 14.
+ // Step 12.
var s = GetOption(options, "style", "string", ["decimal", "percent", "currency"], "decimal");
lazyNumberFormatData.style = s;
- // Steps 16-19.
+ // Steps 14-17.
var c = GetOption(options, "currency", "string", undefined, undefined);
if (c !== undefined && !IsWellFormedCurrencyCode(c))
ThrowRangeError(JSMSG_INVALID_CURRENCY_CODE, c);
@@ -303,12 +285,12 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { cDigits = CurrencyDigits(c);
}
- // Step 20.
+ // Step 18.
var cd = GetOption(options, "currencyDisplay", "string", ["code", "symbol", "name"], "symbol");
if (s === "currency")
lazyNumberFormatData.currencyDisplay = cd;
- // Steps 22-24.
+ // Steps 20-22.
SetNumberFormatDigitOptions(lazyNumberFormatData, options, s === "currency" ? cDigits: 0);
// Step 25.
@@ -322,16 +304,19 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { std_Math_max(lazyNumberFormatData.minimumFractionDigits, mxfdDefault);
}
- // Step 26.
+ // Step 23.
var g = GetOption(options, "useGrouping", "boolean", undefined, true);
lazyNumberFormatData.useGrouping = g;
- // Steps 35-36.
+ // Step 31.
//
// We've done everything that must be done now: mark the lazy data as fully
// computed and install it.
initializeIntlObject(numberFormat, "NumberFormat", lazyNumberFormatData);
+ // 11.2.1, steps 4-5.
+ // TODO: spec issue - The current spec doesn't have the IsObject check,
+ // which means |Intl.NumberFormat.call(null)| is supposed to throw here.
if (numberFormat !== thisValue && thisValue instanceof GetNumberFormatConstructor()) {
if (!IsObject(thisValue))
ThrowTypeError(JSMSG_NOT_NONNULL_OBJECT, typeof thisValue);
@@ -342,6 +327,7 @@ function InitializeNumberFormat(numberFormat, thisValue, locales, options) { return thisValue;
}
+ // 11.2.1, step 6.
return numberFormat;
}
@@ -386,15 +372,12 @@ var currencyDigits = { /**
* Returns the number of decimal digits to be used for the given currency.
*
- * Spec: ECMAScript Internationalization API Specification, 11.1.1.
+ * Spec: ECMAScript Internationalization API Specification, 11.1.3.
*/
-function getCurrencyDigitsRE() {
- return internalIntlRegExps.currencyDigitsRE ||
- (internalIntlRegExps.currencyDigitsRE = RegExpCreate("^[A-Z]{3}$"));
-}
function CurrencyDigits(currency) {
- assert(typeof currency === "string", "CurrencyDigits");
- assert(regexp_test_no_statics(getCurrencyDigitsRE(), currency), "CurrencyDigits");
+ assert(typeof currency === "string", "currency is a string value");
+ assert(IsWellFormedCurrencyCode(currency), "currency is well-formed");
+ assert(currency == toASCIIUpperCase(currency), "currency is all upper-case");
if (hasOwn(currency, currencyDigits))
return currencyDigits[currency];
@@ -407,14 +390,19 @@ function CurrencyDigits(currency) { * matching (possibly fallback) locale. Locales appear in the same order in the
* returned list as in the input list.
*
- * Spec: ECMAScript Internationalization API Specification, 11.2.2.
+ * Spec: ECMAScript Internationalization API Specification, 11.3.2.
*/
function Intl_NumberFormat_supportedLocalesOf(locales /*, options*/) {
var options = arguments.length > 1 ? arguments[1] : undefined;
+ // Step 1.
var availableLocales = callFunction(numberFormatInternalProperties.availableLocales,
numberFormatInternalProperties);
+
+ // Step 2.
var requestedLocales = CanonicalizeLocaleList(locales);
+
+ // Step 3.
return SupportedLocales(availableLocales, requestedLocales, options);
}
@@ -427,8 +415,8 @@ function getNumberingSystems(locale) { // Algorithmic numbering systems are typically tied to one locale, so for
// lack of information we don't offer them. To increase chances that
// other software will process output correctly, we further restrict to
- // those decimal numbering systems explicitly listed in table 2 of
- // the ECMAScript Internationalization API Specification, 11.3.2, which
+ // those decimal numbering systems explicitly listed in table 3 of
+ // the ECMAScript Internationalization API Specification, 11.1.6, which
// in turn are those with full specifications in version 21 of Unicode
// Technical Standard #35 using digits that were defined in Unicode 5.0,
// the Unicode version supported in Windows Vista.
@@ -459,7 +447,7 @@ function numberFormatLocaleData() { /**
* Function to be bound and returned by Intl.NumberFormat.prototype.format.
*
- * Spec: ECMAScript Internationalization API Specification, 11.3.2.
+ * Spec: ECMAScript Internationalization API Specification, 11.1.4.
*/
function numberFormatFormatToBind(value) {
// Steps 1.a.i implemented by ECMAScript declaration binding instantiation,
@@ -476,7 +464,7 @@ function numberFormatFormatToBind(value) { * representing the result of calling ToNumber(value) according to the
* effective locale and the formatting options of this NumberFormat.
*
- * Spec: ECMAScript Internationalization API Specification, 11.3.2.
+ * Spec: ECMAScript Internationalization API Specification, 11.4.3.
*/
function Intl_NumberFormat_format_get() {
// Steps 1-3.
@@ -486,12 +474,11 @@ function Intl_NumberFormat_format_get() { // Step 4.
if (internals.boundFormat === undefined) {
- // Step 4.a.
- var F = numberFormatFormatToBind;
+ // Steps 4.a-b.
+ var F = callFunction(FunctionBind, numberFormatFormatToBind, nf);
- // Steps 4.b-d.
- var bf = callFunction(FunctionBind, F, nf);
- internals.boundFormat = bf;
+ // Step 4.c.
+ internals.boundFormat = F;
}
// Step 5.
@@ -499,6 +486,9 @@ function Intl_NumberFormat_format_get() { }
_SetCanonicalName(Intl_NumberFormat_format_get, "get format");
+/**
+ * 11.4.4 Intl.NumberFormat.prototype.formatToParts ( value )
+ */
function Intl_NumberFormat_formatToParts(value) {
// Steps 1-3.
var nf = UnwrapNumberFormat(this, "formatToParts");
@@ -516,14 +506,15 @@ function Intl_NumberFormat_formatToParts(value) { /**
* Returns the resolved options for a NumberFormat object.
*
- * Spec: ECMAScript Internationalization API Specification, 11.3.3 and 11.4.
+ * Spec: ECMAScript Internationalization API Specification, 11.4.5.
*/
function Intl_NumberFormat_resolvedOptions() {
- // Invoke |UnwrapNumberFormat| per introduction of section 11.3.
+ // Steps 1-3.
var nf = UnwrapNumberFormat(this, "resolvedOptions");
var internals = getNumberFormatInternals(nf);
+ // Steps 4-5.
var result = {
locale: internals.locale,
numberingSystem: internals.numberingSystem,
@@ -533,17 +524,31 @@ function Intl_NumberFormat_resolvedOptions() { maximumFractionDigits: internals.maximumFractionDigits,
useGrouping: internals.useGrouping
};
- var optionalProperties = [
- "currency",
- "currencyDisplay",
- "minimumSignificantDigits",
- "maximumSignificantDigits"
- ];
- for (var i = 0; i < optionalProperties.length; i++) {
- var p = optionalProperties[i];
- if (hasOwn(p, internals))
- _DefineDataProperty(result, p, internals[p]);
+
+ // currency and currencyDisplay are only present for currency formatters.
+ assert(hasOwn("currency", internals) === (internals.style === "currency"),
+ "currency is present iff style is 'currency'");
+ assert(hasOwn("currencyDisplay", internals) === (internals.style === "currency"),
+ "currencyDisplay is present iff style is 'currency'");
+
+ if (hasOwn("currency", internals)) {
+ _DefineDataProperty(result, "currency", internals.currency);
+ _DefineDataProperty(result, "currencyDisplay", internals.currencyDisplay);
+ }
+
+ // Min/Max significant digits are either both present or not at all.
+ assert(hasOwn("minimumSignificantDigits", internals) ===
+ hasOwn("maximumSignificantDigits", internals),
+ "minimumSignificantDigits is present iff maximumSignificantDigits is present");
+
+ if (hasOwn("minimumSignificantDigits", internals)) {
+ _DefineDataProperty(result, "minimumSignificantDigits",
+ internals.minimumSignificantDigits);
+ _DefineDataProperty(result, "maximumSignificantDigits",
+ internals.maximumSignificantDigits);
}
+
+ // Step 6.
return result;
}
diff --git a/js/src/builtin/intl/PluralRules.cpp b/js/src/builtin/intl/PluralRules.cpp index 78bd9e5d74..63d399f818 100644 --- a/js/src/builtin/intl/PluralRules.cpp +++ b/js/src/builtin/intl/PluralRules.cpp @@ -79,7 +79,7 @@ static const JSFunctionSpec pluralRules_methods[] = { /**
* PluralRules constructor.
- * Spec: ECMAScript 402 API, PluralRules, 1.1
+ * Spec: ECMAScript 402 API, PluralRules, 13.2.1
*/
static bool
PluralRules(JSContext* cx, const CallArgs& args, bool construct)
diff --git a/js/src/builtin/intl/PluralRules.js b/js/src/builtin/intl/PluralRules.js index 1e138a8830..d687296245 100644 --- a/js/src/builtin/intl/PluralRules.js +++ b/js/src/builtin/intl/PluralRules.js @@ -7,7 +7,7 @@ /**
* PluralRules internal properties.
*
- * Spec: ECMAScript 402 API, PluralRules, 1.3.3.
+ * Spec: ECMAScript 402 API, PluralRules, 13.3.3.
*/
var pluralRulesInternalProperties = {
localeData: pluralRulesLocaleData,
@@ -44,20 +44,25 @@ function resolvePluralRulesInternals(lazyPluralRulesData) { var PluralRules = pluralRulesInternalProperties;
- // Step 13.
+ // Compute effective locale.
+
+ // Step 10.
+ var localeData = PluralRules.localeData;
+
+ // Step 11.
const r = ResolveLocale(callFunction(PluralRules.availableLocales, PluralRules),
- lazyPluralRulesData.requestedLocales,
- lazyPluralRulesData.opt,
- PluralRules.relevantExtensionKeys, PluralRules.localeData);
+ lazyPluralRulesData.requestedLocales,
+ lazyPluralRulesData.opt,
+ PluralRules.relevantExtensionKeys,
+ localeData);
- // Step 14.
+ // Step 12.
internalProps.locale = r.locale;
- internalProps.type = lazyPluralRulesData.type;
- internalProps.pluralCategories = intl_GetPluralCategories(
- internalProps.locale,
- internalProps.type);
+ // Step 8.
+ internalProps.type = lazyPluralRulesData.type;
+ // Step 9.
internalProps.minimumIntegerDigits = lazyPluralRulesData.minimumIntegerDigits;
internalProps.minimumFractionDigits = lazyPluralRulesData.minimumFractionDigits;
internalProps.maximumFractionDigits = lazyPluralRulesData.maximumFractionDigits;
@@ -68,6 +73,9 @@ function resolvePluralRulesInternals(lazyPluralRulesData) { internalProps.maximumSignificantDigits = lazyPluralRulesData.maximumSignificantDigits;
}
+ // Step 13 (lazily computed on first access).
+ internalProps.pluralCategories = null;
+
return internalProps;
}
@@ -99,15 +107,12 @@ function getPluralRulesInternals(obj) { * This later work occurs in |resolvePluralRulesInternals|; steps not noted
* here occur there.
*
- * Spec: ECMAScript 402 API, PluralRules, 1.1.1.
+ * Spec: ECMAScript 402 API, PluralRules, 13.1.1.
*/
function InitializePluralRules(pluralRules, locales, options) {
assert(IsObject(pluralRules), "InitializePluralRules called with non-object");
assert(IsPluralRules(pluralRules), "InitializePluralRules called with non-PluralRules");
- // Steps 1-2 (These steps are no longer required and should be removed
- // from the spec; https://github.com/tc39/ecma402/issues/115).
-
// Lazy PluralRules data has the following structure:
//
// {
@@ -133,30 +138,29 @@ function InitializePluralRules(pluralRules, locales, options) { // subset of them.
const lazyPluralRulesData = std_Object_create(null);
- // Step 3.
+ // Step 1.
let requestedLocales = CanonicalizeLocaleList(locales);
lazyPluralRulesData.requestedLocales = requestedLocales;
- // Steps 4-5.
+ // Steps 2-3.
if (options === undefined)
options = {};
else
options = ToObject(options);
- // Step 6.
- const type = GetOption(options, "type", "string", ["cardinal", "ordinal"], "cardinal");
- lazyPluralRulesData.type = type;
-
- // Step 8.
+ // Step 4.
let opt = new Record();
lazyPluralRulesData.opt = opt;
- // Steps 9-10.
+ // Steps 5-6.
let matcher = GetOption(options, "localeMatcher", "string", ["lookup", "best fit"], "best fit");
opt.localeMatcher = matcher;
+ // Step 7.
+ const type = GetOption(options, "type", "string", ["cardinal", "ordinal"], "cardinal");
+ lazyPluralRulesData.type = type;
- // Step 11.
+ // Step 9.
SetNumberFormatDigitOptions(lazyPluralRulesData, options, 0);
// Step 12.
@@ -165,6 +169,10 @@ function InitializePluralRules(pluralRules, locales, options) { std_Math_max(lazyPluralRulesData.minimumFractionDigits, 3);
}
+ // Step 15.
+ //
+ // We've done everything that must be done now: mark the lazy data as fully
+ // computed and install it.
initializeIntlObject(pluralRules, "PluralRules", lazyPluralRulesData)
}
@@ -173,7 +181,7 @@ function InitializePluralRules(pluralRules, locales, options) { * matching (possibly fallback) locale. Locales appear in the same order in the
* returned list as in the input list.
*
- * Spec: ECMAScript 402 API, PluralRules, 1.3.2.
+ * Spec: ECMAScript 402 API, PluralRules, 13.3.2.
*/
function Intl_PluralRules_supportedLocalesOf(locales /*, options*/) {
var options = arguments.length > 1 ? arguments[1] : undefined;
@@ -193,20 +201,20 @@ function Intl_PluralRules_supportedLocalesOf(locales /*, options*/) { * the number passed as value according to the
* effective locale and the formatting options of this PluralRules.
*
- * Spec: ECMAScript 402 API, PluralRules, 1.4.3.
+ * Spec: ECMAScript 402 API, PluralRules, 13.4.3.
*/
function Intl_PluralRules_select(value) {
// Step 1.
let pluralRules = this;
- // Step 2.
+ // Steps 2-3.
if (!IsObject(pluralRules) || !IsPluralRules(pluralRules))
ThrowTypeError(JSMSG_INTL_OBJECT_NOT_INITED, "PluralRules", "select", "PluralRules");
// Ensure the PluralRules internals are resolved.
getPluralRulesInternals(pluralRules);
- // Steps 3-4.
+ // Step 4.
let n = ToNumber(value);
// Step 5.
@@ -216,17 +224,34 @@ function Intl_PluralRules_select(value) { /**
* Returns the resolved options for a PluralRules object.
*
- * Spec: ECMAScript 402 API, PluralRules, 1.4.4.
+ * Spec: ECMAScript 402 API, PluralRules, 13.4.4.
*/
function Intl_PluralRules_resolvedOptions() {
- // Check "this PluralRules object" per introduction of section 1.4.
- if (!IsObject(this) || !IsPluralRules(this)) {
+ // Step 1.
+ var pluralRules = this;
+
+ // Steps 2-3.
+ if (!IsObject(pluralRules) || !IsPluralRules(pluralRules)) {
ThrowTypeError(JSMSG_INTL_OBJECT_NOT_INITED, "PluralRules", "resolvedOptions",
"PluralRules");
}
- var internals = getPluralRulesInternals(this);
+ var internals = getPluralRulesInternals(pluralRules);
+
+ var internalsPluralCategories = internals.pluralCategories;
+ if (internalsPluralCategories === null) {
+ internalsPluralCategories = intl_GetPluralCategories(internals.locale, internals.type);
+ internals.pluralCategories = internalsPluralCategories;
+ }
+
+ // TODO: The current spec actually requires returning the internal array
+ // object and not a copy of it.
+ // <https://github.com/tc39/proposal-intl-plural-rules/issues/28#issuecomment-341557030>
+ var pluralCategories = [];
+ for (var i = 0; i < internalsPluralCategories.length; i++)
+ _DefineDataProperty(pluralCategories, i, internalsPluralCategories[i]);
+ // Steps 4-5.
var result = {
locale: internals.locale,
type: internals.type,
@@ -236,16 +261,19 @@ function Intl_PluralRules_resolvedOptions() { maximumFractionDigits: internals.maximumFractionDigits,
};
- var optionalProperties = [
- "minimumSignificantDigits",
- "maximumSignificantDigits"
- ];
+ // Min/Max significant digits are either both present or not at all.
+ assert(hasOwn("minimumSignificantDigits", internals) ===
+ hasOwn("maximumSignificantDigits", internals),
+ "minimumSignificantDigits is present iff maximumSignificantDigits is present");
- for (var i = 0; i < optionalProperties.length; i++) {
- var p = optionalProperties[i];
- if (hasOwn(p, internals))
- _DefineDataProperty(result, p, internals[p]);
+ if (hasOwn("minimumSignificantDigits", internals)) {
+ _DefineDataProperty(result, "minimumSignificantDigits",
+ internals.minimumSignificantDigits);
+ _DefineDataProperty(result, "maximumSignificantDigits",
+ internals.maximumSignificantDigits);
}
+
+ // Step 6.
return result;
}
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index 02bf350814..f2a6b32082 100644 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -6,19 +6,14 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ Usage: - make_intl_data.py langtags [language-subtag-registry.txt] + make_intl_data.py langtags [ldmlSupplemental.dtd supplementalMetadata.xml likelySubtags.xml] make_intl_data.py tzdata Target "langtags": This script extracts information about mappings between deprecated and - current BCP 47 language tags from the IANA Language Subtag Registry and - converts it to JavaScript object definitions in - LangTagMappingsGenerated.js. The definitions are used in Intl.js. - - The IANA Language Subtag Registry is imported from - https://www.iana.org/assignments/language-subtag-registry - and uses the syntax specified in - https://tools.ietf.org/html/rfc5646#section-3 + current Unicode BCP 47 locale identifiers from CLDR and converts it to + JavaScript object definitions in LangTagMappingsGenerated.js. The + definitions are used in Intl.js. Target "tzdata": @@ -32,202 +27,714 @@ import os import re import io import codecs +import shutil +import subprocess import sys import tarfile import tempfile import urllib2 -import urlparse -from contextlib import closing +from contextlib import closing, contextmanager from functools import partial from itertools import chain, ifilter, ifilterfalse, imap, tee from operator import attrgetter, itemgetter +from urlparse import urlsplit, urlunsplit + +def writeMappingHeader(println, description, source, url): + if type(description) is not list: + description = [description] + for desc in description: + println(u"// {0}".format(desc)) + println(u"// Derived from {0}.".format(source)) + println(u"// {0}".format(url)) + +def writeMappingsVar(println, mapping, name, description, source, url): + """ Writes a variable definition with a mapping table. 
+ + Writes the contents of dictionary |mapping| through the |println| + function with the given variable name and a comment with description, + source, and URL. + """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u"var {0} = {{".format(name)) + for key in sorted(mapping): + if not isinstance(mapping[key], dict): + value = mapping[key] + if isinstance(value, bool): + value = "true" if value else "false" + else: + value = '"{0}"'.format(value) + else: + preferred = mapping[key]["preferred"] + prefix = mapping[key]["prefix"] + if key != preferred: + raise Exception("Expected '{0}' matches preferred locale '{1}'".format(key, preferred)) + value = '"{0}"'.format(prefix) + println(u' "{0}": {1},'.format(key, value)) + println(u"};") + +def writeUpdateLocaleIdMappingsFunction(println, + complex_language_mappings, + complex_region_mappings, + description, source, url): + """ Writes a function definition that performs language tag mapping. """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u"""\ +/* eslint-disable complexity */ +function updateLocaleIdMappings(tag) { + assert(IsObject(tag), "tag is an object"); + + // Replace deprecated language tags with their preferred values. + var language = tag.language; + if (hasOwn(language, languageMappings)) { + tag.language = languageMappings[language]; + } else if (hasOwn(language, complexLanguageMappings)) { + switch (language) {""") + + # Merge duplicate language entries. + language_aliases = {} + for (deprecated_language, (language, script, region)) in ( + sorted(complex_language_mappings.items(), key=itemgetter(0)) + ): + key = (language, script, region) + if key not in language_aliases: + language_aliases[key] = [] + else: + language_aliases[key].append(deprecated_language) -def readRegistryRecord(registry): - """ Yields the records of the IANA Language Subtag Registry as dictionaries. 
""" - record = {} - for line in registry: - line = line.strip() - if line == "": + for (deprecated_language, (language, script, region)) in ( + sorted(complex_language_mappings.items(), key=itemgetter(0)) + ): + key = (language, script, region) + if deprecated_language in language_aliases[key]: continue - if line == "%%": - yield record - record = {} + + for lang in [deprecated_language] + language_aliases[key]: + println(u""" + case "{}": + """.format(lang).rstrip().strip("\n")) + + println(u""" + tag.language = "{}"; + """.format(language).rstrip().strip("\n")) + if script is not None: + println(u""" + if (tag.script === undefined) + tag.script = "{}"; + """.format(script).rstrip().strip("\n")) + if region is not None: + println(u""" + if (tag.region === undefined) + tag.region = "{}"; + """.format(region).rstrip().strip("\n")) + println(u""" + break; + """.rstrip().strip("\n")) + + println(u""" + default: + assert(false, "language not handled: " + language); + } + } + + // No script replacements are currently present. + + // Replace deprecated subtags with their preferred values. + var region = tag.region; + if (region !== undefined) { + if (hasOwn(region, regionMappings)) { + tag.region = regionMappings[region]; + } else if (hasOwn(region, complexRegionMappings)) { + switch (region) {""".lstrip("\n")) + + # |non_default_replacements| is a list and hence not hashable. Convert it + # to a string to get a proper hashable value. + def hash_key(default, non_default_replacements): + return (default, str(sorted(str(v) for v in non_default_replacements))) + + # Merge duplicate region entries. 
+ region_aliases = {} + for (deprecated_region, (default, non_default_replacements)) in ( + sorted(complex_region_mappings.items(), key=itemgetter(0)) + ): + key = hash_key(default, non_default_replacements) + if key not in region_aliases: + region_aliases[key] = [] else: - if ":" in line: - key, value = line.split(":", 1) - key, value = key.strip(), value.strip() - record[key] = value + region_aliases[key].append(deprecated_region) + + for (deprecated_region, (default, non_default_replacements)) in ( + sorted(complex_region_mappings.items(), key=itemgetter(0)) + ): + key = hash_key(default, non_default_replacements) + if deprecated_region in region_aliases[key]: + continue + + for region in [deprecated_region] + region_aliases[key]: + println(u""" + case "{}": + """.format(region).rstrip().strip("\n")) + + for (language, script, region) in sorted(non_default_replacements, key=itemgetter(0)): + if script is None: + println(u""" + if (tag.language === "{}") {{ + """.format(language).rstrip().strip("\n")) else: - # continuation line - record[key] += " " + line - if record: - yield record - return + println(u""" + if (tag.language === "{}" && tag.script === "{}") {{ + """.format(language, script).rstrip().strip("\n")) + println(u""" + tag.region = "{}"; + break; + }} + """.format(region).rstrip().strip("\n")) + + println(u""" + tag.region = "{}"; + break; + """.format(default).rstrip().strip("\n")) + + println(u""" + default: + assert(false, "region not handled: " + region); + } + } + + // No variant replacements are currently present. + // No extension replacements are currently present. + // Private use sequences are left as is. + + } +} +/* eslint-enable complexity */ +""".strip("\n")) + + +def writeGrandfatheredMappingsFunction(println, + grandfathered_mappings, + description, source, url): + """ Writes a function definition that maps grandfathered language tags. 
""" + println(u"") + writeMappingHeader(println, description, source, url) + println(u"""\ +function updateGrandfatheredMappings(tag) { + assert(IsObject(tag), "tag is an object"); + + // We're mapping regular grandfathered tags to non-grandfathered form here. + // Other tags remain unchanged. + // + // regular = "art-lojban" + // / "cel-gaulish" + // / "no-bok" + // / "no-nyn" + // / "zh-guoyu" + // / "zh-hakka" + // / "zh-min" + // / "zh-min-nan" + // / "zh-xiang" + // + // Therefore we can quickly exclude most tags by checking every + // |unicode_locale_id| subcomponent for characteristics not shared by any of + // the regular grandfathered (RG) tags: + // + // * Real-world |unicode_language_subtag|s are all two or three letters, + // so don't waste time running a useless |language.length > 3| fast-path. + // * No RG tag has a "script"-looking component. + // * No RG tag has a "region"-looking component. + // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, + // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, + // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag + // that |unicode_locale_id| doesn't support.) + // * No RG tag contains |extensions| or |pu_extensions|. + if (tag.script !== undefined || + tag.region !== undefined || + tag.variants.length !== 1 || + tag.extensions.length !== 0 || + tag.privateuse !== undefined) + { + return; + }""") + + # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. + # + # Doesn't allow any 'extensions' subtags. + re_unicode_locale_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P<language>[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P<script>[a-z]{4}))? + + # (sep unicode_region_subtag)? + # unicode_region_subtag = (alpha{2} | digit{3}) + (?:-(?P<region>([a-z]{2}|[0-9]{3})))? 
+ + # (sep unicode_variant_subtag)* + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? + + # pu_extensions? + # pu_extensions = sep [xX] (sep alphanum{1,8})+ + (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))? + $ + """, re.IGNORECASE | re.VERBOSE) + + is_first = True + + for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)): + tag_match = re_unicode_locale_id.match(tag) + assert tag_match is not None + + tag_language = tag_match.group("language") + assert tag_match.group("script") is None, ( + "{} does not contain a script subtag".format(tag)) + assert tag_match.group("region") is None, ( + "{} does not contain a region subtag".format(tag)) + tag_variants = tag_match.group("variants") + assert tag_variants is not None, ( + "{} contains a variant subtag".format(tag)) + assert tag_match.group("privateuse") is None, ( + "{} does not contain a privateuse subtag".format(tag)) + + tag_variant = tag_variants[1:] + assert "-" not in tag_variant, ( + "{} contains only a single variant".format(tag)) + + modern_match = re_unicode_locale_id.match(modern) + assert modern_match is not None + + modern_language = modern_match.group("language") + modern_script = modern_match.group("script") + modern_region = modern_match.group("region") + modern_variants = modern_match.group("variants") + modern_privateuse = modern_match.group("privateuse") + + println(u""" + // {} -> {} +""".format(tag, modern).rstrip()) + + println(u""" + {}if (tag.language === "{}" && tag.variants[0] === "{}") {{ + """.format("" if is_first else "else ", tag_language, tag_variant).rstrip().strip("\n")) + + is_first = False + + println(u""" + tag.language = "{}"; + """.format(modern_language).rstrip().strip("\n")) + + if modern_script is not None: + println(u""" + tag.script = "{}"; + """.format(modern_script).rstrip().strip("\n")) + + if modern_region is not None: + println(u""" + tag.region = "{}"; + 
""".format(modern_region).rstrip().strip("\n")) + + if modern_variants is not None: + println(u""" + tag.variants = {}; + """.format(sorted(modern_variants[1:].split("-"))).rstrip().strip("\n")) + else: + println(u""" + tag.variants.length = 0; + """.rstrip().strip("\n")) + + if modern_privateuse is not None: + println(u""" + tag.privateuse = "{}"; + """.format(modern_privateuse).rstrip().strip("\n")) + + println(u""" + }""".rstrip().strip("\n")) + println(u""" +}""".lstrip("\n")) -def readRegistry(registry): - """ Reads IANA Language Subtag Registry and extracts information for Intl.js. + +@contextmanager +def TemporaryDirectory(): + tmpDir = tempfile.mkdtemp() + try: + yield tmpDir + finally: + shutil.rmtree(tmpDir) + + +def readSupplementalData(supplemental_dtd_file, supplemental_metadata_file, likely_subtags_file): + """ Reads CLDR Supplemental Data and extracts information for Intl.js. Information extracted: - - langTagMappings: mappings from complete language tags to preferred + - grandfatheredMappings: mappings from grandfathered tags to preferred complete language tags - - langSubtagMappings: mappings from subtags to preferred subtags - - extlangMappings: mappings from extlang subtags to preferred subtags, - with prefix to be removed - Returns these three mappings as dictionaries, along with the registry's - file date. - - We also check that mappings for language subtags don't affect extlang - subtags and vice versa, so that CanonicalizeLanguageTag doesn't have - to separate them for processing. Region codes are separated by case, - and script codes by length, so they're unproblematic. + - languageMappings: mappings from language subtags to preferred subtags + - complexLanguageMappings: mappings from language subtags with complex rules + - regionMappings: mappings from region subtags to preferred subtags + - complexRegionMappings: mappings from region subtags with complex rules + Returns these five mappings as dictionaries. 
""" - langTagMappings = {} - langSubtagMappings = {} - extlangMappings = {} - languageSubtags = set() - extlangSubtags = set() - - for record in readRegistryRecord(registry): - if "File-Date" in record: - fileDate = record["File-Date"] - continue + import xml.etree.ElementTree as ET + + # <!ATTLIST version cldrVersion CDATA #FIXED "36" > + re_cldr_version = re.compile( + r"""<!ATTLIST version cldrVersion CDATA #FIXED "(?P<version>[\d|\.]+)" >""") + + with io.open(supplemental_dtd_file, mode="r", encoding="utf-8") as f: + version_match = re_cldr_version.search(f.read()) + assert version_match is not None, "CLDR version string not found" + cldr_version = version_match.group("version") + + # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. + re_unicode_language_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P<language>[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P<script>[a-z]{4}))? + + # (sep unicode_region_subtag)? + # unicode_region_subtag = (alpha{2} | digit{3}) + (?:-(?P<region>([a-z]{2}|[0-9]{3})))? + + # (sep unicode_variant_subtag)* + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_language_subtag = re.compile( + r""" + ^ + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + ([a-z]{2,3}|[a-z]{5,8}) + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_region_subtag = re.compile( + r""" + ^ + # unicode_region_subtag = (alpha{2} | digit{3}) + ([a-z]{2}|[0-9]{3}) + $ + """, re.IGNORECASE | re.VERBOSE) + + # The fixed list of BCP 47 grandfathered language tags. 
+ grandfathered_tags = ( + "art-lojban", + "cel-gaulish", + "en-GB-oed", + "i-ami", + "i-bnn", + "i-default", + "i-enochian", + "i-hak", + "i-klingon", + "i-lux", + "i-mingo", + "i-navajo", + "i-pwn", + "i-tao", + "i-tay", + "i-tsu", + "no-bok", + "no-nyn", + "sgn-BE-FR", + "sgn-BE-NL", + "sgn-CH-DE", + "zh-guoyu", + "zh-hakka", + "zh-min", + "zh-min-nan", + "zh-xiang", + ) - if record["Type"] == "grandfathered": - # Grandfathered tags don't use standard syntax, so - # CanonicalizeLanguageTag expects the mapping table to provide - # the final form for all. - # For langTagMappings, keys must be in lower case; values in - # the case used in the registry. - tag = record["Tag"] - if "Preferred-Value" in record: - langTagMappings[tag.lower()] = record["Preferred-Value"] - else: - langTagMappings[tag.lower()] = tag - elif record["Type"] == "redundant": - # For langTagMappings, keys must be in lower case; values in - # the case used in the registry. - if "Preferred-Value" in record: - langTagMappings[record["Tag"].lower()] = record["Preferred-Value"] - elif record["Type"] in ("language", "script", "region", "variant"): - # For langSubtagMappings, keys and values must be in the case used - # in the registry. - subtag = record["Subtag"] - if record["Type"] == "language": - languageSubtags.add(subtag) - if "Preferred-Value" in record: - if subtag == "heploc": - # The entry for heploc is unique in its complexity; handle - # it as special case below. - continue - if "Prefix" in record: - # This might indicate another heploc-like complex case. - raise Exception("Please evaluate: subtag mapping with prefix value.") - langSubtagMappings[subtag] = record["Preferred-Value"] - elif record["Type"] == "extlang": - # For extlangMappings, keys must be in the case used in the - # registry; values are records with the preferred value and the - # prefix to be removed. 
- subtag = record["Subtag"] - extlangSubtags.add(subtag) - if "Preferred-Value" in record: - preferred = record["Preferred-Value"] - prefix = record["Prefix"] - extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix} - else: - # No other types are allowed by - # https://tools.ietf.org/html/rfc5646#section-3.1.3 - assert False, "Unrecognized Type: {0}".format(record["Type"]) + # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers. + unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags + if re_unicode_language_id.match(tag)} - # Check that mappings for language subtags and extlang subtags don't affect - # each other. - for lang in languageSubtags: - if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang: - raise Exception("Conflict: lang with extlang mapping: " + lang) - for extlang in extlangSubtags: - if extlang in langSubtagMappings: - raise Exception("Conflict: extlang with lang mapping: " + extlang) + # Dictionary of simple language subtag mappings, e.g. "in" -> "id". + language_mappings = {} - # Special case for heploc. - langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97" + # Dictionary of complex language subtag mappings, modifying more than one + # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME"). + complex_language_mappings = {} - # ValidateAndCanonicalizeLanguageTag in Intl.js expects langTagMappings - # contains no 2*3ALPHA. - assert all(len(lang) > 3 for lang in langTagMappings.iterkeys()) + # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE". + region_mappings = {} - return {"fileDate": fileDate, - "langTagMappings": langTagMappings, - "langSubtagMappings": langSubtagMappings, - "extlangMappings": extlangMappings} + # Dictionary of complex region subtag mappings, containing more than one + # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]). 
+ complex_region_mappings = {} + # Dictionary of grandfathered mappings to preferred values. + grandfathered_mappings = {} -def writeMappingsVar(intlData, dict, name, description, fileDate, url): - """ Writes a variable definition with a mapping table to file intlData. + # CLDR uses "_" as the separator for some elements. Replace it with "-". + def bcp47_id(cldr_id): + return cldr_id.replace("_", "-") - Writes the contents of dictionary dict to file intlData with the given - variable name and a comment with description, fileDate, and URL. - """ - intlData.write("\n") - intlData.write("// {0}.\n".format(description)) - intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate)) - intlData.write("// {0}\n".format(url)) - intlData.write("var {0} = {{\n".format(name)) - keys = sorted(dict) - for key in keys: - if isinstance(dict[key], basestring): - value = '"{0}"'.format(dict[key]) + # CLDR uses the canonical case for most entries, but there are some + # exceptions, like: + # <languageAlias type="drw" replacement="fa_af" reason="deprecated"/> + # Therefore canonicalize all tags to be on the safe side. + def bcp47_canonical(language, script, region): + # Canonical case for language subtags is lower case. + # Canonical case for script subtags is title case. + # Canonical case for region subtags is upper case. + return (language.lower() if language else None, + script.title() if script else None, + region.upper() if region else None) + + tree = ET.parse(supplemental_metadata_file) + + for language_alias in tree.iterfind(".//languageAlias"): + type = bcp47_id(language_alias.get("type")) + replacement = bcp47_id(language_alias.get("replacement")) + + # Handle grandfathered mappings first. + if type in unicode_bcp47_grandfathered_tags: + grandfathered_mappings[type] = replacement + continue + + # We're only interested in language subtag matches, so ignore any + # entries which have additional subtags. 
+ if re_unicode_language_subtag.match(type) is None: + continue + + if re_unicode_language_subtag.match(replacement) is not None: + # Canonical case for language subtags is lower-case. + language_mappings[type] = replacement.lower() + else: + replacement_match = re_unicode_language_id.match(replacement) + assert replacement_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(replacement)) + assert replacement_match.group("variants") is None, ( + "{}: unexpected variant subtags in {}".format(type, replacement)) + + complex_language_mappings[type] = bcp47_canonical(replacement_match.group("language"), + replacement_match.group("script"), + replacement_match.group("region")) + + for territory_alias in tree.iterfind(".//territoryAlias"): + type = territory_alias.get("type") + replacement = territory_alias.get("replacement") + + # We're only interested in region subtag matches, so ignore any entries + # which contain legacy formats, e.g. three letter region codes. + if re_unicode_region_subtag.match(type) is None: + continue + + if re_unicode_region_subtag.match(replacement) is not None: + # Canonical case for region subtags is upper-case. + region_mappings[type] = replacement.upper() else: - preferred = dict[key]["preferred"] - prefix = dict[key]["prefix"] - value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix) - intlData.write(' "{0}": {1},\n'.format(key, value)) - intlData.write("};\n") + # Canonical case for region subtags is upper-case. 
+ replacements = [r.upper() for r in replacement.split(" ")] + assert all( + re_unicode_region_subtag.match(loc) is not None for loc in replacements + ), "{} invalid region subtags".format(replacement) + complex_region_mappings[type] = replacements + + tree = ET.parse(likely_subtags_file) + + likely_subtags = {} + + for likely_subtag in tree.iterfind(".//likelySubtag"): + from_tag = bcp47_id(likely_subtag.get("from")) + from_match = re_unicode_language_id.match(from_tag) + assert from_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(from_tag)) + assert from_match.group("variants") is None, ( + "unexpected variant subtags in {}".format(from_tag)) + + to_tag = bcp47_id(likely_subtag.get("to")) + to_match = re_unicode_language_id.match(to_tag) + assert to_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(to_tag)) + assert to_match.group("variants") is None, ( + "unexpected variant subtags in {}".format(to_tag)) + + from_canonical = bcp47_canonical(from_match.group("language"), + from_match.group("script"), + from_match.group("region")) + + to_canonical = bcp47_canonical(to_match.group("language"), + to_match.group("script"), + to_match.group("region")) + + likely_subtags[from_canonical] = to_canonical + + complex_region_mappings_final = {} + + for (deprecated_region, replacements) in complex_region_mappings.items(): + # Find all likely subtag entries which don't already contain a region + # subtag and whose target region is in the list of replacement regions. + region_likely_subtags = [(from_language, from_script, to_region) + for ((from_language, from_script, from_region), + (_, _, to_region)) in likely_subtags.items() + if from_region is None and to_region in replacements] + + # The first replacement entry is the default region. + default = replacements[0] + + # Find all likely subtag entries whose region matches the default region. 
+ default_replacements = {(language, script) + for (language, script, region) in region_likely_subtags + if region == default} + + # And finally find those entries which don't use the default region. + # These are the entries we're actually interested in, because those need + # to be handled specially when selecting the correct preferred region. + non_default_replacements = [(language, script, region) + for (language, script, region) in region_likely_subtags + if (language, script) not in default_replacements] + + # If there are no non-default replacements, we can handle the region as + # part of the simple region mapping. + if non_default_replacements: + complex_region_mappings_final[deprecated_region] = (default, non_default_replacements) + else: + region_mappings[deprecated_region] = default + return {"version": cldr_version, + "grandfatheredMappings": grandfathered_mappings, + "languageMappings": language_mappings, + "complexLanguageMappings": complex_language_mappings, + "regionMappings": region_mappings, + "complexRegionMappings": complex_region_mappings_final, + } -def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings): +def writeCLDRLanguageTagData(println, data, url): """ Writes the language tag data to the Intl data file. """ - writeMappingsVar(intlData, langTagMappings, "langTagMappings", - "Mappings from complete tags to preferred values", fileDate, url) - writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings", - "Mappings from non-extlang subtags to preferred values", fileDate, url) - writeMappingsVar(intlData, extlangMappings, "extlangMappings", - "Mappings from extlang subtags to preferred values", fileDate, url) - -def updateLangTags(args): - """ Update the LangTagMappingsGenerated.js file. 
""" + + source = u"CLDR Supplemental Data, version {}".format(data["version"]) + grandfathered_mappings = data["grandfatheredMappings"] + language_mappings = data["languageMappings"] + complex_language_mappings = data["complexLanguageMappings"] + region_mappings = data["regionMappings"] + complex_region_mappings = data["complexRegionMappings"] + + writeMappingsVar(println, grandfathered_mappings, "grandfatheredMappings", + "Mappings from grandfathered tags to preferred values.", source, url) + writeMappingsVar(println, language_mappings, "languageMappings", + "Mappings from language subtags to preferred values.", source, url) + writeMappingsVar(println, {key: True for key in complex_language_mappings}, + "complexLanguageMappings", + "Language subtags with complex mappings.", source, url) + writeMappingsVar(println, region_mappings, "regionMappings", + "Mappings from region subtags to preferred values.", source, url) + writeMappingsVar(println, {key: True for key in complex_region_mappings}, + "complexRegionMappings", + "Region subtags with complex mappings.", source, url) + + writeUpdateLocaleIdMappingsFunction(println, complex_language_mappings, + complex_region_mappings, + "Canonicalize Unicode BCP 47 locale identifiers.", + source, url) + writeGrandfatheredMappingsFunction(println, grandfathered_mappings, + "Canonicalize grandfathered locale identifiers.", + source, url) + + +def updateCLDRLangTags(args): + """ Update the LangTagMappingsCLDRGenerated.js file. 
""" url = args.url + branch = args.branch + revision = args.revision out = args.out - filename = args.file + files = args.files print("Arguments:") print("\tDownload url: %s" % url) - print("\tLocal registry: %s" % filename) + print("\tBranch: %s" % branch) + print("\tRevision: %s" % revision) + print("\tLocal supplemental data and likely subtags: %s" % files) print("\tOutput file: %s" % out) print("") - if filename is not None: - print("Always make sure you have the newest language-subtag-registry.txt!") - registry = codecs.open(filename, "r", encoding="utf-8") + if files: + if len(files) != 3: + raise Exception("Expected three files, but got: {}".format(files)) + + print(("Always make sure you have the newest ldmlSupplemental.dtd, " + "supplementalMetadata.xml, and likelySubtags.xml!")) + + supplemental_dtd_file = files[0] + supplemental_metadata_file = files[1] + likely_subtags_file = files[2] else: - print("Downloading IANA Language Subtag Registry...") - with closing(urllib2.urlopen(url)) as reader: - text = reader.read().decode("utf-8") - registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8") - registry.write(text) - registry.seek(0) - - print("Processing IANA Language Subtag Registry...") - with closing(registry) as reg: - data = readRegistry(reg) - fileDate = data["fileDate"] - langTagMappings = data["langTagMappings"] - langSubtagMappings = data["langSubtagMappings"] - extlangMappings = data["extlangMappings"] + print("Downloading CLDR supplemental data...") + + supplemental_dtd_filename = "ldmlSupplemental.dtd" + supplemental_dtd_path = "common/dtd/{}".format(supplemental_dtd_filename) + supplemental_dtd_file = os.path.join(os.getcwd(), supplemental_dtd_filename) + + supplemental_metadata_filename = "supplementalMetadata.xml" + supplemental_metadata_path = "common/supplemental/{}".format( + supplemental_metadata_filename) + supplemental_metadata_file = os.path.join(os.getcwd(), supplemental_metadata_filename) + + 
likely_subtags_filename = "likelySubtags.xml" + likely_subtags_path = "common/supplemental/{}".format(likely_subtags_filename) + likely_subtags_file = os.path.join(os.getcwd(), likely_subtags_filename) + + # Try to download the raw file directly from GitHub if possible. + split = urlsplit(url) + if split.netloc == "github.com" and split.path.endswith(".git") and revision == "HEAD": + def download(path, file): + urlpath = "{}/raw/{}/{}".format(urlsplit(url).path[:-4], branch, path) + raw_url = urlunsplit((split.scheme, split.netloc, urlpath, split.query, + split.fragment)) + + with closing(urllib2.urlopen(raw_url)) as reader: + text = reader.read().decode("utf-8") + with io.open(file, "w", encoding="utf-8") as saved_file: + saved_file.write(text) + + download(supplemental_dtd_path, supplemental_dtd_file) + download(supplemental_metadata_path, supplemental_metadata_file) + download(likely_subtags_path, likely_subtags_file) + else: + # Download the requested branch in a temporary directory. + with TemporaryDirectory() as inDir: + if revision == "HEAD": + subprocess.check_call(["git", "clone", "--depth=1", + "--branch=%s" % branch, url, inDir]) + else: + subprocess.check_call(["git", "clone", "--single-branch", + "--branch=%s" % branch, url, inDir]) + subprocess.check_call(["git", "-C", inDir, "reset", "--hard", revision]) + + shutil.copyfile(os.path.join(inDir, supplemental_dtd_path), + supplemental_dtd_file) + shutil.copyfile(os.path.join(inDir, supplemental_metadata_path), + supplemental_metadata_file) + shutil.copyfile(os.path.join(inDir, likely_subtags_path), likely_subtags_file) + + print("Processing CLDR supplemental data...") + data = readSupplementalData(supplemental_dtd_file, + supplemental_metadata_file, + likely_subtags_file) print("Writing Intl data...") - with codecs.open(out, "w", encoding="utf-8") as intlData: - intlData.write("// Generated by make_intl_data.py. 
DO NOT EDIT.\n") - writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings) + with io.open(out, mode="w", encoding="utf-8", newline="") as f: + println = partial(print, file=f) + + println(u"// Generated by make_intl_data.py. DO NOT EDIT.") + writeCLDRLanguageTagData(println, data, url) + def flines(filepath, encoding="utf-8"): """ Open filepath and iterate over its content. """ @@ -707,11 +1214,11 @@ def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignor println(u"// Format:") println(u'// "LinkName", "Target" // ICU-Target [time zone file]') - println(u"struct LinkAndTarget"); - println(u"{"); - println(u" const char* const link;"); - println(u" const char* const target;"); - println(u"};"); + println(u"struct LinkAndTarget") + println(u"{") + println(u" const char* const link;") + println(u" const char* const target;") + println(u"};") println(u"") println(u"const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {") for (zone, target, icuTarget) in incorrectLinks: @@ -932,7 +1439,7 @@ def updateTzdata(topsrcdir, args): if tzDir is None: print("Downloading tzdata file...") with closing(urllib2.urlopen(url)) as tzfile: - fname = urlparse.urlsplit(tzfile.geturl()).path.split("/")[-1] + fname = urlsplit(tzfile.geturl()).path.split("/")[-1] with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile: print("File stored in %s" % tztmpfile.name) tztmpfile.write(tzfile.read()) @@ -959,20 +1466,24 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Update intl data.") subparsers = parser.add_subparsers(help="Select update mode") - parser_tags = subparsers.add_parser("langtags", - help="Update language-subtag-registry") - parser_tags.add_argument("--url", - metavar="URL", - default="https://www.iana.org/assignments/language-subtag-registry", - type=EnsureHttps, - help="Download url for language-subtag-registry.txt (default: %(default)s)") - 
parser_tags.add_argument("--out", - default="LangTagMappingsGenerated.js", - help="Output file (default: %(default)s)") - parser_tags.add_argument("file", - nargs="?", - help="Local language-subtag-registry.txt file, if omitted uses <URL>") - parser_tags.set_defaults(func=updateLangTags) + parser_cldr_tags = subparsers.add_parser("langtags", + help="Update CLDR language tags data") + parser_cldr_tags.add_argument("--url", + metavar="URL", + default="https://github.com/unicode-org/cldr.git", + help="URL to git repository (default: %(default)s)") + parser_cldr_tags.add_argument("--branch", default="latest", + help="Git branch (default: %(default)s)") + parser_cldr_tags.add_argument("--revision", default="HEAD", + help="Git revision (default: %(default)s)") + parser_cldr_tags.add_argument("--out", + default="LangTagMappingsGenerated.js", + help="Output file (default: %(default)s)") + parser_cldr_tags.add_argument("files", + nargs="*", + help="Local ldmlSupplemental.dtd, supplementalMetadata.xml, " + "and likelySubtags.xml files, if omitted uses <URL>") + parser_cldr_tags.set_defaults(func=updateCLDRLangTags) parser_tz = subparsers.add_parser("tzdata", help="Update tzdata") parser_tz.add_argument("--tz", diff --git a/js/src/vm/SelfHosting.cpp b/js/src/vm/SelfHosting.cpp index ebb95c83e9..fff1baf630 100644 --- a/js/src/vm/SelfHosting.cpp +++ b/js/src/vm/SelfHosting.cpp @@ -2545,8 +2545,6 @@ static const JSFunctionSpec intrinsic_functions[] = { JS_FN("StringSplitStringLimit", intrinsic_StringSplitStringLimit, 3, 0), // See builtin/RegExp.h for descriptions of the regexp_* functions. - JS_FN("regexp_exec_no_statics", regexp_exec_no_statics, 2,0), - JS_FN("regexp_test_no_statics", regexp_test_no_statics, 2,0), JS_FN("regexp_construct_raw_flags", regexp_construct_raw_flags, 2,0), JS_FN("regexp_clone", regexp_clone, 1,0), |