diff options
author | Martok <martok@martoks-place.de> | 2023-06-18 15:05:33 +0200 |
---|---|---|
committer | Martok <martok@martoks-place.de> | 2023-06-30 00:01:35 +0200 |
commit | e96f965422528636e13adc3473679248941540e7 (patch) | |
tree | a6bd9d0f9a34add576553833f527d76224b157ad /js | |
parent | 7c3aa6a8b63d7d1ba2a5ae96ea065379634f3de1 (diff) | |
download | uxp-e96f965422528636e13adc3473679248941540e7.tar.gz |
Issue #2259 - Performance improvements for LanguageTag parsing
- parsing: don't normalise things that don't need to be normalised anymore:
extension, private-use, variant, language, script, and region subtags
- Add missing() and present() methods to LanguageSubtag
- Change mozilla::Range to mozilla::Span for slightly better code
Based-on: m-c 1592588
Diffstat (limited to 'js')
-rw-r--r-- | js/src/builtin/intl/IntlObject.cpp | 5 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTag.cpp | 544 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTag.h | 201 | ||||
-rw-r--r-- | js/src/builtin/intl/LanguageTagGenerated.cpp | 109 | ||||
-rw-r--r-- | js/src/builtin/intl/Locale.cpp | 44 | ||||
-rw-r--r-- | js/src/builtin/intl/SharedIntlData.cpp | 4 | ||||
-rw-r--r-- | js/src/builtin/intl/make_intl_data.py | 100 | ||||
-rw-r--r-- | js/src/js.msg | 1 |
8 files changed, 505 insertions, 503 deletions
diff --git a/js/src/builtin/intl/IntlObject.cpp b/js/src/builtin/intl/IntlObject.cpp index 9caa2709a8..e0dd36dac4 100644 --- a/js/src/builtin/intl/IntlObject.cpp +++ b/js/src/builtin/intl/IntlObject.cpp @@ -545,10 +545,7 @@ js::intl_BestAvailableLocale(JSContext* cx, unsigned argc, Value* vp) JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, intl::LanguageTagParser::tryParse(cx, locale, tag)); MOZ_ASSERT(ok, "locale is a structurally valid language tag"); - auto isUnicodeExtension = [](const auto& extension) { - return extension[0] == 'u'; - }; - MOZ_ASSERT(std::none_of(tag.extensions().begin(), tag.extensions().end(), isUnicodeExtension), + MOZ_ASSERT(!tag.unicodeExtension(), "locale must contain no Unicode extensions"); if (!tag.canonicalize(cx, intl::LanguageTag::UnicodeExtensionCanonicalForm::No)) { diff --git a/js/src/builtin/intl/LanguageTag.cpp b/js/src/builtin/intl/LanguageTag.cpp index e675e19686..583033f629 100644 --- a/js/src/builtin/intl/LanguageTag.cpp +++ b/js/src/builtin/intl/LanguageTag.cpp @@ -8,7 +8,7 @@ #include "mozilla/Assertions.h" #include "mozilla/MathAlgorithms.h" -#include "mozilla/Range.h" +#include "mozilla/Span.h" #include "mozilla/TextUtils.h" #include "mozilla/Variant.h" @@ -40,102 +40,93 @@ namespace intl { using namespace js::intl::LanguageTagLimits; -using ConstCharRange = mozilla::Range<const char>; - -#ifdef DEBUG template <typename CharT> -bool IsStructurallyValidLanguageTag( - const mozilla::Range<const CharT>& language) { +bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language) { // Tell the analysis the |std::all_of| function can't GC. 
JS::AutoSuppressGCAnalysis nogc; // unicode_language_subtag = alpha{2,3} | alpha{5,8}; - size_t length = language.length(); - const CharT* str = language.begin().get(); + size_t length = language.size(); + const CharT* str = language.data(); return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && - std::all_of(str, str + length, mozilla::IsAsciiLowercaseAlpha<CharT>); + std::all_of(str, str + length, mozilla::IsAsciiAlpha<CharT>); } template bool IsStructurallyValidLanguageTag( - const mozilla::Range<const Latin1Char>& language); + mozilla::Span<const char> language); +template bool IsStructurallyValidLanguageTag( + mozilla::Span<const Latin1Char> language); template bool IsStructurallyValidLanguageTag( - const mozilla::Range<const char16_t>& language); + mozilla::Span<const char16_t> language); template <typename CharT> -bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script) { +bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script) { // Tell the analysis the |std::all_of| function can't GC. 
JS::AutoSuppressGCAnalysis nogc; // unicode_script_subtag = alpha{4} ; - size_t length = script.length(); - const CharT* str = script.begin().get(); - return length == 4 && mozilla::IsAsciiUppercaseAlpha<CharT>(str[0]) && - std::all_of(str + 1, str + length, - mozilla::IsAsciiLowercaseAlpha<CharT>); + size_t length = script.size(); + const CharT* str = script.data(); + return length == 4 && + std::all_of(str, str + length, mozilla::IsAsciiAlpha<CharT>); } template bool IsStructurallyValidScriptTag( - const mozilla::Range<const Latin1Char>& script); + mozilla::Span<const char> script); template bool IsStructurallyValidScriptTag( - const mozilla::Range<const char16_t>& script); + mozilla::Span<const Latin1Char> script); +template bool IsStructurallyValidScriptTag( + mozilla::Span<const char16_t> script); template <typename CharT> -bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region) { +bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region) { // Tell the analysis the |std::all_of| function can't GC. 
JS::AutoSuppressGCAnalysis nogc; // unicode_region_subtag = (alpha{2} | digit{3}) ; - size_t length = region.length(); - const CharT* str = region.begin().get(); - return (length == 2 && std::all_of(str, str + length, - mozilla::IsAsciiUppercaseAlpha<CharT>)) || + size_t length = region.size(); + const CharT* str = region.data(); + return (length == 2 && + std::all_of(str, str + length, mozilla::IsAsciiAlpha<CharT>)) || (length == 3 && std::all_of(str, str + length, mozilla::IsAsciiDigit<CharT>)); } template bool IsStructurallyValidRegionTag( - const mozilla::Range<const Latin1Char>& region); + mozilla::Span<const char> region); +template bool IsStructurallyValidRegionTag( + mozilla::Span<const Latin1Char> region); template bool IsStructurallyValidRegionTag( - const mozilla::Range<const char16_t>& region); + mozilla::Span<const char16_t> region); -bool IsStructurallyValidVariantTag(const ConstCharRange& variant) { +#ifdef DEBUG +bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant) { // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; - auto isAsciiLowercaseAlphanumeric = [](char c) { - return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); - }; - size_t length = variant.length(); - const char* str = variant.begin().get(); + size_t length = variant.size(); + const char* str = variant.data(); return ((5 <= length && length <= 8) || (length == 4 && mozilla::IsAsciiDigit(str[0]))) && - std::all_of(str, str + length, isAsciiLowercaseAlphanumeric); + std::all_of(str, str + length, mozilla::IsAsciiAlphanumeric<char>); } -bool IsStructurallyValidUnicodeExtensionTag(const ConstCharRange& extension) { - auto isAsciiLowercaseAlphanumericOrDash = [](char c) { - return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c) || - c == '-'; - }; - - size_t length = extension.length(); - const char* str = extension.begin().get(); - return LanguageTagParser::canParseUnicodeExtension(extension) && - std::all_of(str, str + length, 
isAsciiLowercaseAlphanumericOrDash); +bool IsStructurallyValidUnicodeExtensionTag( + mozilla::Span<const char> extension) { + return LanguageTagParser::canParseUnicodeExtension(extension); } -static bool IsStructurallyValidExtensionTag(const ConstCharRange& extension) { +static bool IsStructurallyValidExtensionTag( + mozilla::Span<const char> extension) { // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; // NB: Allow any extension, including Unicode and Transform here, because // this function is only used for an assertion. - auto isAsciiDigitOrLowercaseAlpha = [](char c) { - return mozilla::IsAsciiDigit(c) || mozilla::IsAsciiLowercaseAlpha(c); - }; - size_t length = extension.length(); - const char* str = extension.begin().get(); + size_t length = extension.size(); + const char* str = extension.data(); + const char* const end = extension.data() + length; if (length <= 2) { return false; } - if (!isAsciiDigitOrLowercaseAlpha(str[0]) || str[0] == 'x') { + if (!mozilla::IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') { return false; } str++; @@ -143,11 +134,11 @@ static bool IsStructurallyValidExtensionTag(const ConstCharRange& extension) { return false; } while (true) { - const char* sep = reinterpret_cast<const char*>( - memchr(str, '-', extension.end().get() - str)); - size_t len = (sep ? sep : extension.end().get()) - str; + const char* sep = + reinterpret_cast<const char*>(memchr(str, '-', end - str)); + size_t len = (sep ? 
sep : end) - str; if (len < 2 || len > 8 || - !std::all_of(str, str + len, isAsciiDigitOrLowercaseAlpha)) { + !std::all_of(str, str + len, mozilla::IsAsciiAlphanumeric<char>)) { return false; } if (!sep) { @@ -157,23 +148,28 @@ static bool IsStructurallyValidExtensionTag(const ConstCharRange& extension) { } } -bool IsStructurallyValidPrivateUseTag(const ConstCharRange& privateUse) { +bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse) { // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; - auto isAsciiDigitOrLowercaseAlpha = [](char c) { - return mozilla::IsAsciiDigit(c) || mozilla::IsAsciiLowercaseAlpha(c); - }; - size_t length = privateUse.length(); - const char* str = privateUse.begin().get(); - if (length <= 2 || *str++ != 'x' || *str++ != '-') { + size_t length = privateUse.size(); + const char* str = privateUse.data(); + const char* const end = privateUse.data() + length; + if (length <= 2) { + return false; + } + if (str[0] != 'x' && str[0] != 'X') { + return false; + } + str++; + if (*str++ != '-') { return false; } while (true) { - const char* sep = reinterpret_cast<const char*>( - memchr(str, '-', privateUse.end().get() - str)); - size_t len = (sep ? sep : privateUse.end().get()) - str; + const char* sep = + reinterpret_cast<const char*>(memchr(str, '-', end - str)); + size_t len = (sep ? sep : end) - str; if (len == 0 || len > 8 || - !std::all_of(str, str + len, isAsciiDigitOrLowercaseAlpha)) { + !std::all_of(str, str + len, mozilla::IsAsciiAlphanumeric<char>)) { return false; } if (!sep) { @@ -184,15 +180,33 @@ bool IsStructurallyValidPrivateUseTag(const ConstCharRange& privateUse) { } #endif +ptrdiff_t LanguageTag::unicodeExtensionIndex() const { + // The extension subtags aren't necessarily sorted, so we can't use binary + // search here. 
+ auto p = std::find_if( + extensions().begin(), extensions().end(), + [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; }); + if (p != extensions().end()) { + return std::distance(extensions().begin(), p); + } + return -1; +} + +const char* LanguageTag::unicodeExtension() const { + ptrdiff_t index = unicodeExtensionIndex(); + if (index >= 0) { + return extensions()[index].get(); + } + return nullptr; +} + bool LanguageTag::setUnicodeExtension(UniqueChars extension) { MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag( - {extension.get(), strlen(extension.get())})); + mozilla::MakeCStringSpan(extension.get()))); // Replace the existing Unicode extension subtag or append a new one. - auto p = std::find_if(extensions().begin(), extensions().end(), - [](const auto& ext) { return ext[0] == 'u'; }); - if (p != extensions().end()) { - size_t index = std::distance(extensions().begin(), p); + ptrdiff_t index = unicodeExtensionIndex(); + if (index >= 0) { extensions_[index] = std::move(extension); return true; } @@ -200,10 +214,8 @@ bool LanguageTag::setUnicodeExtension(UniqueChars extension) { } void LanguageTag::clearUnicodeExtension() { - auto p = std::find_if(extensions().begin(), extensions().end(), - [](const auto& ext) { return ext[0] == 'u'; }); - if (p != extensions().end()) { - size_t index = std::distance(extensions().begin(), p); + ptrdiff_t index = unicodeExtensionIndex(); + if (index >= 0) { extensions_.erase(extensions_.begin() + index); } } @@ -252,35 +264,53 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) { // normalizing the case and ordering all subtags. The canonical syntax form // itself is specified in UTS 35, 3.2.1. - // The |LanguageTag| fields are already in normalized case, so we can skip - // this step. 
- MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); - MOZ_ASSERT(script().length() == 0 || - IsStructurallyValidScriptTag(script().range())); - MOZ_ASSERT(region().length() == 0 || - IsStructurallyValidRegionTag(region().range())); -#ifdef DEBUG - auto validVariant = [](const auto& variant) { - const char* str = variant.get(); - return IsStructurallyValidVariantTag({str, strlen(str)}); - }; - MOZ_ASSERT(std::all_of(variants().begin(), variants().end(), validVariant)); + // Language codes need to be in lower case. "JA" -> "ja" + language_.toLowerCase(); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); - auto validExtension = [](const auto& extension) { - const char* str = extension.get(); - return IsStructurallyValidExtensionTag({str, strlen(str)}); - }; - MOZ_ASSERT( - std::all_of(extensions().begin(), extensions().end(), validExtension)); -#endif - MOZ_ASSERT(!privateuse() || IsStructurallyValidPrivateUseTag( - {privateuse(), strlen(privateuse())})); + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + script_.toTitleCase(); + MOZ_ASSERT(script().missing() || + IsStructurallyValidScriptTag(script().span())); + + // Region codes need to be in upper case. "bu" -> "BU" + region_.toUpperCase(); + MOZ_ASSERT(region().missing() || + IsStructurallyValidRegionTag(region().span())); + + // The canonical case for variant subtags is lowercase. + for (UniqueChars& variant : variants_) { + char* variantChars = variant.get(); + size_t variantLength = strlen(variantChars); + AsciiToLowerCase(variantChars, variantLength, variantChars); + + MOZ_ASSERT(IsStructurallyValidVariantTag({variantChars, variantLength})); + } + + // Extensions and privateuse subtags are case normalized in the + // |canonicalizeExtensions| method. // The second step in UTS 35, 3.2.1, is to order all subtags. - // 1. Any variants are in alphabetical order. 
- if (!SortAlphabetically(cx, variants_)) { - return false; + if (variants_.length() > 1) { + // 1. Any variants are in alphabetical order. + if (!SortAlphabetically(cx, variants_)) { + return false; + } + + // Reject the Locale identifier if a duplicate variant was found, e.g. + // "en-variant-Variant". + const UniqueChars* duplicate = std::adjacent_find( + variants().begin(), variants().end(), [](const auto& a, const auto& b) { + return strcmp(a.get(), b.get()) == 0; + }); + if (duplicate != variants().end()) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_DUPLICATE_VARIANT_SUBTAG, + duplicate->get()); + return false; + } } // 2. Any extensions are in alphabetical order by their singleton. @@ -301,7 +331,7 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) { // No script replacements are currently present. // Replace deprecated region subtags with their preferred values. - if (region().length() > 0) { + if (region().present()) { if (!regionMapping(region_) && complexRegionMapping(region_)) { performComplexRegionMappings(); } @@ -320,6 +350,16 @@ bool LanguageTag::canonicalizeBaseName(JSContext* cx) { bool LanguageTag::canonicalizeExtensions( JSContext* cx, UnicodeExtensionCanonicalForm canonicalForm) { + // The canonical case for all extension subtags is lowercase. + for (UniqueChars& extension : extensions_) { + char* extensionChars = extension.get(); + size_t extensionLength = strlen(extensionChars); + AsciiToLowerCase(extensionChars, extensionLength, extensionChars); + + MOZ_ASSERT( + IsStructurallyValidExtensionTag({extensionChars, extensionLength})); + } + // Any extensions are in alphabetical order by their singleton. // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" if (!SortAlphabetically(cx, extensions_)) { @@ -337,6 +377,15 @@ bool LanguageTag::canonicalizeExtensions( } } } + + // The canonical case for privateuse subtags is lowercase. 
+ if (char* privateuse = privateuse_.get()) { + size_t privateuseLength = strlen(privateuse); + AsciiToLowerCase(privateuse, privateuseLength, privateuse); + + MOZ_ASSERT( + IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); + } return true; } @@ -362,7 +411,8 @@ bool LanguageTag::canonicalizeUnicodeExtension( const char* const extension = unicodeExtension.get(); MOZ_ASSERT(extension[0] == 'u'); MOZ_ASSERT(extension[1] == '-'); - MOZ_ASSERT(IsStructurallyValidExtensionTag({extension, strlen(extension)})); + MOZ_ASSERT( + IsStructurallyValidExtensionTag(mozilla::MakeCStringSpan(extension))); size_t length = strlen(extension); @@ -376,7 +426,7 @@ bool LanguageTag::canonicalizeUnicodeExtension( JS_TRY_VAR_OR_RETURN_FALSE( cx, ok, LanguageTagParser::parseUnicodeExtension( - cx, ConstCharRange(extension, length), attributes, keywords)); + cx, mozilla::MakeSpan(extension, length), attributes, keywords)); MOZ_ASSERT(ok, "unexpected invalid Unicode extension subtag"); auto attributesLessOrEqual = [extension](const Attribute& a, @@ -475,12 +525,13 @@ bool LanguageTag::canonicalizeUnicodeExtension( static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; - static auto isTrue = [](const ConstCharRange& type) { + using StringSpan = mozilla::Span<const char>; + + static auto isTrue = [](StringSpan type) { constexpr char True[] = "true"; const size_t TrueLength = strlen(True); - return type.length() == TrueLength && - std::char_traits<char>::compare(type.begin().get(), True, - TrueLength) == 0; + return type.size() == TrueLength && + std::char_traits<char>::compare(type.data(), True, TrueLength) == 0; }; auto appendKey = [&sb, extension](const Keyword& keyword) { @@ -489,7 +540,7 @@ bool LanguageTag::canonicalizeUnicodeExtension( }; auto appendKeyword = [&sb, extension](const Keyword& keyword, - const ConstCharRange& type) { + StringSpan type) { MOZ_ASSERT(keyword.length() > UnicodeKeyLength); // Elide the Unicode extension type "true". 
@@ -501,7 +552,7 @@ bool LanguageTag::canonicalizeUnicodeExtension( }; auto appendReplacement = [&sb, extension](const Keyword& keyword, - const ConstCharRange& replacement) { + StringSpan replacement) { MOZ_ASSERT(keyword.length() > UnicodeKeyLength); // Elide the type "true" if present in the replacement. @@ -511,7 +562,7 @@ bool LanguageTag::canonicalizeUnicodeExtension( // Otherwise append the Unicode key (including the separator) and the // replaced type. return sb.append(keyword.begin(extension), UnicodeKeyWithSepLength) && - sb.append(replacement.begin().get(), replacement.length()); + sb.append(replacement.data(), replacement.size()); }; // Append all Unicode extension keywords. @@ -539,15 +590,15 @@ bool LanguageTag::canonicalizeUnicodeExtension( return false; } } else { - ConstCharRange key(keyword.begin(extension), UnicodeKeyLength); - ConstCharRange type(keyword.begin(extension) + UnicodeKeyWithSepLength, - keyword.length() - UnicodeKeyWithSepLength); + StringSpan key(keyword.begin(extension), UnicodeKeyLength); + StringSpan type(keyword.begin(extension) + UnicodeKeyWithSepLength, + keyword.length() - UnicodeKeyWithSepLength); if (canonicalForm == UnicodeExtensionCanonicalForm::Yes) { // Search if there's a replacement for the current Unicode keyword. 
if (const char* replacement = replaceUnicodeExtensionType(key, type)) { - if (!appendReplacement( - keyword, ConstCharRange(replacement, strlen(replacement)))) { + if (!appendReplacement(keyword, + mozilla::MakeCStringSpan(replacement))) { return false; } } else { @@ -584,9 +635,9 @@ template <class Buffer> static bool LanguageTagToString(JSContext* cx, const LanguageTag& tag, Buffer& sb) { auto appendSubtag = [&sb](const auto& subtag) { - auto range = subtag.range(); - MOZ_ASSERT(range.length() > 0); - return sb.append(range.begin().get(), range.length()); + auto span = subtag.span(); + MOZ_ASSERT(span.size() > 0); + return sb.append(span.data(), span.size()); }; auto appendSubtagZ = [&sb](const char* subtag) { @@ -609,14 +660,14 @@ static bool LanguageTagToString(JSContext* cx, const LanguageTag& tag, } // Append the script subtag if present. - if (tag.script().length() > 0) { + if (tag.script().present()) { if (!sb.append('-') || !appendSubtag(tag.script())) { return false; } } // Append the region subtag if present. 
- if (tag.region().length() > 0) { + if (tag.region().present()) { if (!sb.append('-') || !appendSubtag(tag.region())) { return false; } @@ -661,7 +712,8 @@ bool LanguageTag::canonicalizeTransformExtension( const char* const extension = transformExtension.get(); MOZ_ASSERT(extension[0] == 't'); MOZ_ASSERT(extension[1] == '-'); - MOZ_ASSERT(IsStructurallyValidExtensionTag({extension, strlen(extension)})); + MOZ_ASSERT( + IsStructurallyValidExtensionTag(mozilla::MakeCStringSpan(extension))); size_t length = strlen(extension); @@ -674,7 +726,7 @@ bool LanguageTag::canonicalizeTransformExtension( JS_TRY_VAR_OR_RETURN_FALSE( cx, ok, LanguageTagParser::parseTransformExtension( - cx, ConstCharRange(extension, length), tag, fields)); + cx, mozilla::MakeSpan(extension, length), tag, fields)); MOZ_ASSERT(ok, "unexpected invalid transform extension subtag"); auto tfieldLessOrEqual = [extension](const TField& a, const TField& b) { @@ -720,7 +772,7 @@ bool LanguageTag::canonicalizeTransformExtension( // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers // [3] https://github.com/tc39/ecma402/issues/330 - if (tag.language().length() > 0) { + if (tag.language().present()) { if (!sb.append('-')) { return false; } @@ -786,14 +838,14 @@ static bool HasLikelySubtags(LikelySubtags likelySubtags, // used. if (likelySubtags == LikelySubtags::Add) { return !tag.language().equalTo("und") && - (tag.script().length() > 0 && !tag.script().equalTo("Zzzz")) && - (tag.region().length() > 0 && !tag.region().equalTo("ZZ")); + (tag.script().present() && !tag.script().equalTo("Zzzz")) && + (tag.region().present() && !tag.region().equalTo("ZZ")); } // The language tag is already minimized if it only contains a language // subtag whose value is not the placeholder value "und". 
- return !tag.language().equalTo("und") && tag.script().length() == 0 && - tag.region().length() == 0; + return !tag.language().equalTo("und") && tag.script().missing() && + tag.region().missing(); } // Create an ICU locale ID from the given language tag. @@ -802,9 +854,9 @@ static bool CreateLocaleForLikelySubtags(const LanguageTag& tag, MOZ_ASSERT(locale.length() == 0); auto appendSubtag = [&locale](const auto& subtag) { - auto range = subtag.range(); - MOZ_ASSERT(range.length() > 0); - return locale.append(range.begin().get(), range.length()); + auto span = subtag.span(); + MOZ_ASSERT(span.size() > 0); + return locale.append(span.data(), span.size()); }; // Append the language subtag. @@ -813,14 +865,14 @@ static bool CreateLocaleForLikelySubtags(const LanguageTag& tag, } // Append the script subtag if present. - if (tag.script().length() > 0) { + if (tag.script().present()) { if (!locale.append('_') || !appendSubtag(tag.script())) { return false; } } // Append the region subtag if present. - if (tag.region().length() > 0) { + if (tag.region().present()) { if (!locale.append('_') || !appendSubtag(tag.region())) { return false; } @@ -857,12 +909,12 @@ static bool AssignFromLocaleId(JSContext* cx, LocaleId& localeId, memmove(localeId.begin(), und, length); } - ConstCharRange localeRange(localeId.begin(), localeId.length() - 1); + mozilla::Span<const char> localeSpan(localeId.begin(), localeId.length() - 1); // Retrieve the language, script, and region subtags from the locale ID, but // ignore any other subtags. 
LanguageTag localeTag(cx); - if (!LanguageTagParser::parseBaseName(cx, localeRange, localeTag)) { + if (!LanguageTagParser::parseBaseName(cx, localeSpan, localeTag)) { return false; } @@ -1025,18 +1077,6 @@ UniqueChars LanguageTagParser::chars(JSContext* cx, size_t index, return chars; } -UniqueChars LanguageTagParser::extension(JSContext* cx, const Token& start, - const Token& end) const { - MOZ_ASSERT(start.index() < end.index()); - - size_t length = end.index() - 1 - start.index(); - UniqueChars extension = chars(cx, start.index(), length); - if (extension) { - AsciiToLowerCase(extension.get(), length, extension.get()); - } - return extension; -} - // Parse the `unicode_language_id` production. // // unicode_language_id = unicode_language_subtag @@ -1051,55 +1091,22 @@ UniqueChars LanguageTagParser::extension(JSContext* cx, const Token& start, // // |tok| is the current token from |ts|. // -// The trailing |parseType| argument corresponds to one of two modes. -// -// In the |BaseNameParsing::Normal| mode, our input is in unknown case and is -// potentially invalid. |tag| will be filled with canonically-cased output, and -// duplicate variants will lead to an error. -// -// In the |BaseNameParsing::WithinTransformExtension| mode, our input is the -// `tlang` in a lowercased `transform_extensions`. |tag| subtags will be -// directly copied from the input (i.e. in lowercase). Variant subtags in the -// `tlang` subtag may contain duplicates. +// All subtags will be added unaltered to |tag|, without canonicalizing their +// case or, in the case of variant subtags, detecting and rejecting duplicate +// variants. Users must subsequently |canonicalizeBaseName| to perform these +// actions. // // Do not use this function directly: use |parseBaseName| or // |parseTlangFromTransformExtension| instead. 
-JS::Result<bool> LanguageTagParser::internalParseBaseName( - JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok, - BaseNameParsing parseType) { -#ifdef DEBUG - auto isAsciiLowerCase = [](const auto& range) { - // Tell the analysis the |std::all_of| function can't GC. - JS::AutoSuppressGCAnalysis nogc; - - const char* ptr = range.begin().get(); - size_t length = range.length(); - return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<char>); - }; - auto isAsciiDigit = [](const auto& range) { - // Tell the analysis the |std::all_of| function can't GC. - JS::AutoSuppressGCAnalysis nogc; - - const char* ptr = range.begin().get(); - size_t length = range.length(); - return std::all_of(ptr, ptr + length, mozilla::IsAsciiDigit<char>); - }; -#endif - +JS::Result<bool> LanguageTagParser::internalParseBaseName(JSContext* cx, + LanguageTagParser& ts, + LanguageTag& tag, + Token& tok) { if (ts.isLanguage(tok)) { ts.copyChars(tok, tag.language_); - // Language codes need to be in lower case. "JA" -> "ja" - if (parseType == BaseNameParsing::Normal) { - tag.language_.toLowerCase(); - } else { - MOZ_ASSERT(isAsciiLowerCase(tag.language_.range())); - } - tok = ts.nextToken(); } else { - MOZ_ASSERT(parseType == BaseNameParsing::Normal); - // The language subtag is mandatory. return false; } @@ -1107,28 +1114,12 @@ JS::Result<bool> LanguageTagParser::internalParseBaseName( if (ts.isScript(tok)) { ts.copyChars(tok, tag.script_); - // The first character of a script code needs to be capitalized. - // "hans" -> "Hans" - if (parseType == BaseNameParsing::Normal) { - tag.script_.toTitleCase(); - } else { - MOZ_ASSERT(isAsciiLowerCase(tag.script_.range())); - } - tok = ts.nextToken(); } if (ts.isRegion(tok)) { ts.copyChars(tok, tag.region_); - // Region codes need to be in upper case. 
"bu" -> "BU" - if (parseType == BaseNameParsing::Normal) { - tag.region_.toUpperCase(); - } else { - MOZ_ASSERT_IF(tok.length() == 2, isAsciiLowerCase(tag.region_.range())); - MOZ_ASSERT_IF(tok.length() == 3, isAsciiDigit(tag.region_.range())); - } - tok = ts.nextToken(); } @@ -1139,28 +1130,6 @@ JS::Result<bool> LanguageTagParser::internalParseBaseName( if (!variant) { return cx->alreadyReportedOOM(); } - - if (parseType == BaseNameParsing::Normal) { - // Locale identifiers are case insensitive (UTS 35, section 3.2). - // All seen variants are compared ignoring case differences by using the - // lower case form. This allows to properly detect and reject variant - // repetitions with differing case, e.g. "en-variant-Variant". - AsciiToLowerCase(variant.get(), tok.length(), variant.get()); - - // Reject the Locale identifier if a duplicate variant was found. - // - // This linear-time verification step means the whole variant subtag - // checking is potentially quadratic. Language tags are unlikely to be - // deliberately pathological, so this is okay at least for now. - for (const auto& seenVariant : variants) { - if (strcmp(variant.get(), seenVariant.get()) == 0) { - return false; - } - } - } else { - // When parsing variants in a `tlang` subtag, duplicates are allowed. - } - if (!variants.append(std::move(variant))) { return cx->alreadyReportedOOM(); } @@ -1332,10 +1301,11 @@ bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale, return false; } -bool LanguageTagParser::parseBaseName(JSContext* cx, ConstCharRange locale, +bool LanguageTagParser::parseBaseName(JSContext* cx, + mozilla::Span<const char> locale, LanguageTag& tag) { - LocaleChars localeChars = StringChars(locale.begin().get()); - LanguageTagParser ts(localeChars, locale.length()); + LocaleChars localeChars = StringChars(locale.data()); + LanguageTagParser ts(localeChars, locale.size()); Token tok = ts.nextToken(); // Parse only the base-name part and ignore any trailing characters. 
@@ -1344,12 +1314,10 @@ bool LanguageTagParser::parseBaseName(JSContext* cx, ConstCharRange locale, if (ok) { return true; } - if (UniqueChars localeChars = - DuplicateString(locale.begin().get(), locale.length())) { + if (UniqueChars localeChars = DuplicateString(cx, locale.data(), + locale.size())) { JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr, JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); - } else { - JS_ReportOutOfMemory(cx); } return false; } @@ -1357,10 +1325,10 @@ bool LanguageTagParser::parseBaseName(JSContext* cx, ConstCharRange locale, // Parse |extension|, which must be a valid `transformed_extensions` subtag, and // fill |tag| and |fields| from the `tlang` and `tfield` components. JS::Result<bool> LanguageTagParser::parseTransformExtension( - JSContext* cx, ConstCharRange extension, LanguageTag& tag, + JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag, TFieldVector& fields) { - LocaleChars extensionChars = StringChars(extension.begin().get()); - LanguageTagParser ts(extensionChars, extension.length()); + LocaleChars extensionChars = StringChars(extension.data()); + LanguageTagParser ts(extensionChars, extension.size()); Token tok = ts.nextToken(); if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 't') { @@ -1417,10 +1385,10 @@ JS::Result<bool> LanguageTagParser::parseTransformExtension( // and fill |attributes| and |keywords| from the `attribute` and `keyword` // components. 
JS::Result<bool> LanguageTagParser::parseUnicodeExtension( - JSContext* cx, ConstCharRange extension, AttributesVector& attributes, - KeywordsVector& keywords) { - LocaleChars extensionChars = StringChars(extension.begin().get()); - LanguageTagParser ts(extensionChars, extension.length()); + JSContext* cx, mozilla::Span<const char> extension, + AttributesVector& attributes, KeywordsVector& keywords) { + LocaleChars extensionChars = StringChars(extension.data()); + LanguageTagParser ts(extensionChars, extension.size()); Token tok = ts.nextToken(); // unicode_locale_extensions = sep [uU] ((sep keyword)+ | @@ -1467,9 +1435,10 @@ JS::Result<bool> LanguageTagParser::parseUnicodeExtension( return tok.isNone(); } -bool LanguageTagParser::canParseUnicodeExtension(ConstCharRange extension) { - LocaleChars extensionChars = StringChars(extension.begin().get()); - LanguageTagParser ts(extensionChars, extension.length()); +bool LanguageTagParser::canParseUnicodeExtension( + mozilla::Span<const char> extension) { + LocaleChars extensionChars = StringChars(extension.data()); + LanguageTagParser ts(extensionChars, extension.size()); Token tok = ts.nextToken(); // unicode_locale_extensions = sep [uU] ((sep keyword)+ | @@ -1522,118 +1491,89 @@ bool LanguageTagParser::canParseUnicodeExtensionType( return tok.isNone(); } -bool ParseStandaloneLanguagTag(HandleLinearString str, LanguageSubtag& result) { - auto isLanguage = [](const auto* language, size_t length) { - // Tell the analysis the |std::all_of| function can't GC. 
- JS::AutoSuppressGCAnalysis nogc; - - using T = std::remove_pointer_t<decltype(language)>; - return length >= 2 && length != 4 && length <= 8 && - std::all_of(language, language + length, mozilla::IsAsciiAlpha<T>); - }; - +bool ParseStandaloneLanguageTag(HandleLinearString str, + LanguageSubtag& result) { JS::AutoCheckCannotGC nogc; if (str->hasLatin1Chars()) { - if (!isLanguage(str->latin1Chars(nogc), str->length())) { + if (!IsStructurallyValidLanguageTag<Latin1Char>(str->latin1Range(nogc))) { return false; } - result.set(str->latin1Range(nogc)); + result.set<Latin1Char>(str->latin1Range(nogc)); } else { - if (!isLanguage(str->twoByteChars(nogc), str->length())) { + if (!IsStructurallyValidLanguageTag<char16_t>(str->twoByteRange(nogc))) { return false; } - result.set(str->twoByteRange(nogc)); + result.set<char16_t>(str->twoByteRange(nogc)); } - result.toLowerCase(); return true; } bool ParseStandaloneScriptTag(HandleLinearString str, ScriptSubtag& result) { - auto isScript = [](const auto* script, size_t length) { - // Tell the analysis the |std::all_of| function can't GC. 
- JS::AutoSuppressGCAnalysis nogc; - - using T = std::remove_pointer_t<decltype(script)>; - return length == ScriptLength && - std::all_of(script, script + ScriptLength, mozilla::IsAsciiAlpha<T>); - }; - JS::AutoCheckCannotGC nogc; if (str->hasLatin1Chars()) { - if (!isScript(str->latin1Chars(nogc), str->length())) { + if (!IsStructurallyValidScriptTag<Latin1Char>(str->latin1Range(nogc))) { return false; } - result.set(str->latin1Range(nogc)); + result.set<Latin1Char>(str->latin1Range(nogc)); } else { - if (!isScript(str->twoByteChars(nogc), str->length())) { + if (!IsStructurallyValidScriptTag<char16_t>(str->twoByteRange(nogc))) { return false; } - result.set(str->twoByteRange(nogc)); + result.set<char16_t>(str->twoByteRange(nogc)); } - result.toTitleCase(); return true; } bool ParseStandaloneRegionTag(HandleLinearString str, RegionSubtag& result) { - auto isRegion = [](const auto* region, size_t length) { - // Tell the analysis the |std::all_of| function can't GC. - JS::AutoSuppressGCAnalysis nogc; - - using T = std::remove_pointer_t<decltype(region)>; - return (length == AlphaRegionLength && - std::all_of(region, region + AlphaRegionLength, - mozilla::IsAsciiAlpha<T>)) || - (length == DigitRegionLength && - std::all_of(region, region + DigitRegionLength, - mozilla::IsAsciiDigit<T>)); - }; - JS::AutoCheckCannotGC nogc; if (str->hasLatin1Chars()) { - if (!isRegion(str->latin1Chars(nogc), str->length())) { + if (!IsStructurallyValidRegionTag<Latin1Char>(str->latin1Range(nogc))) { return false; } - result.set(str->latin1Range(nogc)); + result.set<Latin1Char>(str->latin1Range(nogc)); } else { - if (!isRegion(str->twoByteChars(nogc), str->length())) { + if (!IsStructurallyValidRegionTag<char16_t>(str->twoByteRange(nogc))) { return false; } - result.set(str->twoByteRange(nogc)); + result.set<char16_t>(str->twoByteRange(nogc)); } - result.toUpperCase(); return true; } template <typename CharT> -static bool IsAsciiLowercaseAlpha(const mozilla::Range<const CharT>& range) 
{ +static bool IsAsciiLowercaseAlpha(mozilla::Span<const CharT> span) { // Tell the analysis the |std::all_of| function can't GC. JS::AutoSuppressGCAnalysis nogc; - const CharT* ptr = range.begin().get(); - size_t length = range.length(); + const CharT* ptr = span.data(); + size_t length = span.size(); return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<CharT>); } static bool IsAsciiLowercaseAlpha(JSLinearString* str) { JS::AutoCheckCannotGC nogc; - return str->hasLatin1Chars() ? IsAsciiLowercaseAlpha(str->latin1Range(nogc)) - : IsAsciiLowercaseAlpha(str->twoByteRange(nogc)); + if (str->hasLatin1Chars()) { + return IsAsciiLowercaseAlpha<Latin1Char>(str->latin1Range(nogc)); + } + return IsAsciiLowercaseAlpha<char16_t>(str->twoByteRange(nogc)); } template <typename CharT> -static bool IsAsciiAlpha(const mozilla::Range<const CharT>& range) { +static bool IsAsciiAlpha(mozilla::Span<const CharT> span) { // Tell the analysis the |std::all_of| function can't GC. JS::AutoSuppressGCAnalysis nogc; - const CharT* ptr = range.begin().get(); - size_t length = range.length(); + const CharT* ptr = span.data(); + size_t length = span.size(); return std::all_of(ptr, ptr + length, mozilla::IsAsciiAlpha<CharT>); } static bool IsAsciiAlpha(JSLinearString* str) { JS::AutoCheckCannotGC nogc; - return str->hasLatin1Chars() ? 
IsAsciiAlpha(str->latin1Range(nogc)) - : IsAsciiAlpha(str->twoByteRange(nogc)); + if (str->hasLatin1Chars()) { + return IsAsciiAlpha<Latin1Char>(str->latin1Range(nogc)); + } + return IsAsciiAlpha<char16_t>(str->twoByteRange(nogc)); } JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx, @@ -1656,10 +1596,10 @@ JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx, LanguageSubtag languageTag; if (str->hasLatin1Chars()) { JS::AutoCheckCannotGC nogc; - languageTag.set(str->latin1Range(nogc)); + languageTag.set<Latin1Char>(str->latin1Range(nogc)); } else { JS::AutoCheckCannotGC nogc; - languageTag.set(str->twoByteRange(nogc)); + languageTag.set<char16_t>(str->twoByteRange(nogc)); } if (!isLowerCase) { @@ -1676,8 +1616,8 @@ JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx, // Take care to replace deprecated subtags with their preferred values. JSString* result; if (LanguageTag::languageMapping(languageTag) || !isLowerCase) { - auto range = languageTag.range(); - result = NewStringCopyN<CanGC>(cx, range.begin().get(), range.length()); + auto span = languageTag.span(); + result = NewStringCopyN<CanGC>(cx, span.data(), span.size()); } else { result = str; } diff --git a/js/src/builtin/intl/LanguageTag.h b/js/src/builtin/intl/LanguageTag.h index 657458cecb..384ff4bb7a 100644 --- a/js/src/builtin/intl/LanguageTag.h +++ b/js/src/builtin/intl/LanguageTag.h @@ -10,7 +10,7 @@ #define builtin_intl_LanguageTag_h #include "mozilla/Assertions.h" -#include "mozilla/Range.h" +#include "mozilla/Span.h" #include "mozilla/TextUtils.h" #include "mozilla/TypedEnumBits.h" #include "mozilla/Variant.h" @@ -36,57 +36,53 @@ namespace js { namespace intl { -#ifdef DEBUG - /** - * Return true if |language| is a valid, case-normalized language subtag. + * Return true if |language| is a valid language subtag. 
*/ template <typename CharT> -bool IsStructurallyValidLanguageTag( - const mozilla::Range<const CharT>& language); +bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language); /** - * Return true if |script| is a valid, case-normalized script subtag. + * Return true if |script| is a valid script subtag. */ template <typename CharT> -bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script); +bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script); /** - * Return true if |region| is a valid, case-normalized region subtag. + * Return true if |region| is a valid region subtag. */ template <typename CharT> -bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region); +bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region); +#ifdef DEBUG /** - * Return true if |variant| is a valid, case-normalized variant subtag. + * Return true if |variant| is a valid variant subtag. */ -bool IsStructurallyValidVariantTag(const mozilla::Range<const char>& variant); +bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant); /** - * Return true if |extension| is a valid, case-normalized Unicode extension - * subtag. + * Return true if |extension| is a valid Unicode extension subtag. */ bool IsStructurallyValidUnicodeExtensionTag( - const mozilla::Range<const char>& extension); + mozilla::Span<const char> extension); /** - * Return true if |privateUse| is a valid, case-normalized private-use subtag. + * Return true if |privateUse| is a valid private-use subtag. */ -bool IsStructurallyValidPrivateUseTag( - const mozilla::Range<const char>& privateUse); +bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse); #endif template <typename CharT> char AsciiToLowerCase(CharT c) { MOZ_ASSERT(mozilla::IsAscii(c)); - return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c; + return mozilla::IsAsciiUppercaseAlpha(c) ? 
(c + 0x20) : c; } template <typename CharT> char AsciiToUpperCase(CharT c) { MOZ_ASSERT(mozilla::IsAscii(c)); - return mozilla::IsAsciiLowercaseAlpha(c) ? (c & ~0x20) : c; + return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c; } template <typename CharT> @@ -141,7 +137,7 @@ static constexpr size_t TransformKeyLength = 2; template <size_t Length> class LanguageTagSubtag final { uint8_t length_ = 0; - char chars_[Length]; + char chars_[Length] = {}; // zero initialize public: LanguageTagSubtag() = default; @@ -150,21 +146,31 @@ class LanguageTagSubtag final { LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete; size_t length() const { return length_; } + bool missing() const { return length_ == 0; } + bool present() const { return length_ > 0; } - mozilla::Range<const char> range() const { return {chars_, length_}; } + mozilla::Span<const char> span() const { return {chars_, length_}; } template <typename CharT> - void set(const mozilla::Range<const CharT>& str) { - MOZ_ASSERT(str.length() <= Length); - std::copy_n(str.begin().get(), str.length(), chars_); - length_ = str.length(); + void set(mozilla::Span<const CharT> str) { + MOZ_ASSERT(str.size() <= Length); + std::copy_n(str.data(), str.size(), chars_); + length_ = str.size(); } - void toLowerCase() { AsciiToLowerCase(chars_, length(), chars_); } + // The toXYZCase() methods are using |Length| instead of |length()|, because + // current compilers (tested GCC and Clang) can't infer the maximum string + // length - even when using hints like |std::min| - and instead are emitting + // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD + // code. (Emitting SIMD code doesn't make sense here, because the SIMD code + // only kicks in for long strings.) A fixed length will additionally ensure + // the compiler unrolls the loop in the case conversion code. 
- void toUpperCase() { AsciiToUpperCase(chars_, length(), chars_); } + void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); } - void toTitleCase() { AsciiToTitleCase(chars_, length(), chars_); } + void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); } + + void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); } template <size_t N> bool equalTo(const char (&str)[N]) const { @@ -224,8 +230,7 @@ class MOZ_STACK_CLASS LanguageTag final { MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx); static const char* replaceUnicodeExtensionType( - const mozilla::Range<const char>& key, - const mozilla::Range<const char>& type); + mozilla::Span<const char> key, mozilla::Span<const char> type); public: explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {} @@ -241,65 +246,68 @@ class MOZ_STACK_CLASS LanguageTag final { const char* privateuse() const { return privateuse_.get(); } /** - * Set the language subtag. The input must be a valid, case-normalized - * language subtag. + * Return the Unicode extension subtag or nullptr if not present. + */ + const char* unicodeExtension() const; + + private: + ptrdiff_t unicodeExtensionIndex() const; + + public: + /** + * Set the language subtag. The input must be a valid language subtag. */ template <size_t N> void setLanguage(const char (&language)[N]) { - mozilla::Range<const char> range(language, N - 1); - MOZ_ASSERT(IsStructurallyValidLanguageTag(range)); - language_.set(range); + mozilla::Span<const char> span(language, N - 1); + MOZ_ASSERT(IsStructurallyValidLanguageTag(span)); + language_.set(span); } /** - * Set the language subtag. The input must be a valid, case-normalized - * language subtag. + * Set the language subtag. The input must be a valid language subtag. 
*/ void setLanguage(const LanguageSubtag& language) { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range())); - language_.set(language.range()); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); + language_.set(language.span()); } /** - * Set the script subtag. The input must be a valid, case-normalized - * script subtag or the empty string. + * Set the script subtag. The input must be a valid script subtag. */ template <size_t N> void setScript(const char (&script)[N]) { - mozilla::Range<const char> range(script, N - 1); - MOZ_ASSERT(IsStructurallyValidScriptTag(range)); - script_.set(range); + mozilla::Span<const char> span(script, N - 1); + MOZ_ASSERT(IsStructurallyValidScriptTag(span)); + script_.set(span); } /** - * Set the script subtag. The input must be a valid, case-normalized - * script subtag or the empty string. + * Set the script subtag. The input must be a valid script subtag or the empty + * string. */ void setScript(const ScriptSubtag& script) { - MOZ_ASSERT(script.length() == 0 || - IsStructurallyValidScriptTag(script.range())); - script_.set(script.range()); + MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span())); + script_.set(script.span()); } /** - * Set the region subtag. The input must be a valid, case-normalized - * region subtag or the empty string. + * Set the region subtag. The input must be a valid region subtag. */ template <size_t N> void setRegion(const char (®ion)[N]) { - mozilla::Range<const char> range(region, N - 1); - MOZ_ASSERT(IsStructurallyValidRegionTag(range)); - region_.set(range); + mozilla::Span<const char> span(region, N - 1); + MOZ_ASSERT(IsStructurallyValidRegionTag(span)); + region_.set(span); } /** - * Set the region subtag. The input must be a valid, case-normalized - * region subtag or the empty string. + * Set the region subtag. The input must be a valid region subtag or the empty + * empty string. 
*/ void setRegion(const RegionSubtag& region) { - MOZ_ASSERT(region.length() == 0 || - IsStructurallyValidRegionTag(region.range())); - region_.set(region.range()); + MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span())); + region_.set(region.span()); } /** @@ -308,8 +316,8 @@ class MOZ_STACK_CLASS LanguageTag final { void clearVariants() { variants_.clearAndFree(); } /** - * Set the Unicode extension subtag. The input must be a valid, - * case-normalized Unicode extension subtag. + * Set the Unicode extension subtag. The input must be a valid Unicode + * extension subtag. */ bool setUnicodeExtension(JS::UniqueChars extension); @@ -319,8 +327,8 @@ class MOZ_STACK_CLASS LanguageTag final { void clearUnicodeExtension(); /** - * Set the private-use subtag. The input must be a valid, case-normalized - * private-use subtag or the empty string. + * Set the private-use subtag. The input must be a valid private-use subtag + * or nullptr. */ void setPrivateuse(JS::UniqueChars privateuse) { MOZ_ASSERT(!privateuse || @@ -462,10 +470,10 @@ class MOZ_STACK_CLASS LanguageTagParser final { size_t length = tok.length(); if (locale_.is<const JS::Latin1Char*>()) { using T = const JS::Latin1Char; - subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length)); + subtag.set(mozilla::MakeSpan(locale_.as<T*>() + index, length)); } else { using T = const char16_t; - subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length)); + subtag.set(mozilla::MakeSpan(locale_.as<T*>() + index, length)); } } @@ -477,10 +485,15 @@ class MOZ_STACK_CLASS LanguageTagParser final { return chars(cx, tok.index(), tok.length()); } - Token nextToken(); - JS::UniqueChars extension(JSContext* cx, const Token& start, - const Token& end) const; + const Token& end) const { + MOZ_ASSERT(start.index() < end.index()); + + size_t length = end.index() - 1 - start.index(); + return chars(cx, start.index(), length); + } + + Token nextToken(); // unicode_language_subtag = alpha{2,3} | 
alpha{5,8} ; // @@ -513,8 +526,7 @@ class MOZ_STACK_CLASS LanguageTagParser final { // Always returns the lower case form of an alphabetical character. char singletonKey(const Token& tok) const { MOZ_ASSERT(tok.length() == 1); - char c = charAt(tok.index()); - return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c; + return AsciiToLowerCase(charAt(tok.index())); } // extensions = unicode_locale_extensions | @@ -581,23 +593,18 @@ class MOZ_STACK_CLASS LanguageTagParser final { return 1 <= tok.length() && tok.length() <= 8; } - enum class BaseNameParsing : bool { Normal, WithinTransformExtension }; - // Helper function for use in |parseBaseName| and // |parseTlangInTransformExtension|. Do not use this directly! static JS::Result<bool> internalParseBaseName(JSContext* cx, LanguageTagParser& ts, - LanguageTag& tag, Token& tok, - BaseNameParsing parseType); + LanguageTag& tag, Token& tok); // Parse the `unicode_language_id` production, i.e. the - // language/script/region/variants portion of a language tag, into |tag|, - // which will be filled with canonical-cased components (lowercase language, - // titlecase script, uppercase region, lowercased and alphabetized and - // deduplicated variants). |tok| must be the current token. + // language/script/region/variants portion of a language tag, into |tag|. + // |tok| must be the current token. static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) { - return internalParseBaseName(cx, ts, tag, tok, BaseNameParsing::Normal); + return internalParseBaseName(cx, ts, tag, tok); } // Parse the `tlang` production within a parsed 't' transform extension. @@ -611,17 +618,14 @@ class MOZ_STACK_CLASS LanguageTagParser final { // Return an error on internal failure. Otherwise, return a success value. If // there was no `tlang`, then |tag.language().missing()|. 
But if there was a // `tlang`, then |tag| is filled with subtags exactly as they appeared in the - // parse input: fully lowercase, variants in alphabetical order without - // duplicates. + // parse input. static JS::Result<JS::Ok> parseTlangInTransformExtension( JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) { MOZ_ASSERT(ts.isLanguage(tok)); - return internalParseBaseName(cx, ts, tag, tok, - BaseNameParsing::WithinTransformExtension) - .map([](bool parsed) { - MOZ_ASSERT(parsed); - return JS::Ok(); - }); + return internalParseBaseName(cx, ts, tag, tok).map([](bool parsed) { + MOZ_ASSERT(parsed); + return JS::Ok(); + }); } friend class LanguageTag; @@ -650,14 +654,14 @@ class MOZ_STACK_CLASS LanguageTagParser final { // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent // with |extension|. static JS::Result<bool> parseTransformExtension( - JSContext* cx, mozilla::Range<const char> extension, LanguageTag& tag, + JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag, TFieldVector& fields); // Parse |extension|, which must be a validated, fully lowercase // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords| // from the `attribute` and `keyword` components. static JS::Result<bool> parseUnicodeExtension( - JSContext* cx, mozilla::Range<const char> extension, + JSContext* cx, mozilla::Span<const char> extension, AttributesVector& attributes, KeywordsVector& keywords); public: @@ -673,11 +677,11 @@ class MOZ_STACK_CLASS LanguageTagParser final { // Parse the input string as the base-name parts (language, script, region, // variants) of a language tag. Ignores any trailing characters. - static bool parseBaseName(JSContext* cx, mozilla::Range<const char> locale, + static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale, LanguageTag& tag); // Return true iff |extension| can be parsed as a Unicode extension subtag. 
- static bool canParseUnicodeExtension(mozilla::Range<const char> extension); + static bool canParseUnicodeExtension(mozilla::Span<const char> extension); // Return true iff |unicodeType| can be parsed as a Unicode extension type. static bool canParseUnicodeExtensionType(JSLinearString* unicodeType); @@ -687,24 +691,21 @@ MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind) /** * Parse a string as a standalone |language| tag. If |str| is a standalone - * language tag, store it in case-normalized form in |result| and return true. - * Otherwise return false. + * language tag, store it in |result| and return true. Otherwise return false. */ -MOZ_MUST_USE bool ParseStandaloneLanguagTag(JS::Handle<JSLinearString*> str, - LanguageSubtag& result); +MOZ_MUST_USE bool ParseStandaloneLanguageTag(JS::Handle<JSLinearString*> str, + LanguageSubtag& result); /** * Parse a string as a standalone |script| tag. If |str| is a standalone script - * tag, store it in case-normalized form in |result| and return true. Otherwise - * return false. + * tag, store it in |result| and return true. Otherwise return false. */ MOZ_MUST_USE bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str, ScriptSubtag& result); /** * Parse a string as a standalone |region| tag. If |str| is a standalone region - * tag, store it in case-normalized form in |result| and return true. Otherwise - * return false. + * tag, store it in |result| and return true. Otherwise return false. 
*/ MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str, RegionSubtag& result); diff --git a/js/src/builtin/intl/LanguageTagGenerated.cpp b/js/src/builtin/intl/LanguageTagGenerated.cpp index 8952286976..6255861141 100644 --- a/js/src/builtin/intl/LanguageTagGenerated.cpp +++ b/js/src/builtin/intl/LanguageTagGenerated.cpp @@ -3,7 +3,7 @@ // URL: https://unicode.org/Public/cldr/35.1/core.zip #include "mozilla/Assertions.h" -#include "mozilla/Range.h" +#include "mozilla/Span.h" #include "mozilla/TextUtils.h" #include <algorithm> @@ -18,7 +18,6 @@ #include "builtin/intl/LanguageTag.h" using namespace js::intl::LanguageTagLimits; -using ConstCharRange = mozilla::Range<const char>; template <size_t Length, size_t TagLength, size_t SubtagLength> static inline bool HasReplacement( @@ -27,7 +26,7 @@ static inline bool HasReplacement( MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); - const char* ptr = subtag.range().begin().get(); + const char* ptr = subtag.span().data(); return std::binary_search(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { return memcmp(a, b, TagLength - 1) < 0; @@ -42,7 +41,7 @@ static inline const char* SearchReplacement( MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); - const char* ptr = subtag.range().begin().get(); + const char* ptr = subtag.span().data(); auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { return memcmp(a, b, TagLength - 1) < 0; @@ -53,11 +52,40 @@ static inline const char* SearchReplacement( return nullptr; } +#ifdef DEBUG +static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>); +} + +static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) || + std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>); +} + +static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { + auto isAsciiLowercaseAlphaOrDigit = [](char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); + }; + + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit); +} +#endif + // Mappings from language subtags to preferred values. // Derived from CLDR Supplemental Data, version 35.1. // https://unicode.org/Public/cldr/35.1/core.zip bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range())); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.span())); if (language.length() == 2) { static const char languages[9][3] = { @@ -68,7 +96,7 @@ bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) { }; if (const char* replacement = SearchReplacement(languages, aliases, language)) { - language.set(ConstCharRange(replacement, strlen(replacement))); + language.set(mozilla::MakeCStringSpan(replacement)); return true; } return false; @@ -149,7 +177,7 @@ bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) { }; if (const char* replacement = SearchReplacement(languages, aliases, language)) { - language.set(ConstCharRange(replacement, strlen(replacement))); + language.set(mozilla::MakeCStringSpan(replacement)); return true; } return false; @@ 
-162,7 +190,8 @@ bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) { // Derived from CLDR Supplemental Data, version 35.1. // https://unicode.org/Public/cldr/35.1/core.zip bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& language) { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range())); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.span())); if (language.length() == 2) { return language.equalTo("sh"); @@ -183,7 +212,8 @@ bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& languag // Derived from CLDR Supplemental Data, version 35.1. // https://unicode.org/Public/cldr/35.1/core.zip bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) { - MOZ_ASSERT(IsStructurallyValidRegionTag(region.range())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.span())); if (region.length() == 2) { static const char regions[23][3] = { @@ -198,7 +228,7 @@ bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) { }; if (const char* replacement = SearchReplacement(regions, aliases, region)) { - region.set(ConstCharRange(replacement, strlen(replacement))); + region.set(mozilla::MakeCStringSpan(replacement)); return true; } return false; @@ -271,7 +301,7 @@ bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) { }; if (const char* replacement = SearchReplacement(regions, aliases, region)) { - region.set(ConstCharRange(replacement, strlen(replacement))); + region.set(mozilla::MakeCStringSpan(replacement)); return true; } return false; @@ -282,7 +312,8 @@ bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) { // Derived from CLDR Supplemental Data, version 35.1. 
// https://unicode.org/Public/cldr/35.1/core.zip bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) { - MOZ_ASSERT(IsStructurallyValidRegionTag(region.range())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.span())); if (region.length() == 2) { return region.equalTo("AN") || @@ -304,11 +335,12 @@ bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) { // Derived from CLDR Supplemental Data, version 35.1. // https://unicode.org/Public/cldr/35.1/core.zip void js::intl::LanguageTag::performComplexLanguageMappings() { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); if (language().equalTo("cnr")) { setLanguage("sr"); - if (region().length() == 0) { + if (region().missing()) { setRegion("ME"); } } @@ -316,20 +348,20 @@ void js::intl::LanguageTag::performComplexLanguageMappings() { language().equalTo("prs") || language().equalTo("tnf")) { setLanguage("fa"); - if (region().length() == 0) { + if (region().missing()) { setRegion("AF"); } } else if (language().equalTo("hbs") || language().equalTo("sh")) { setLanguage("sr"); - if (script().length() == 0) { + if (script().missing()) { setScript("Latn"); } } else if (language().equalTo("swc")) { setLanguage("sw"); - if (region().length() == 0) { + if (region().missing()) { setRegion("CD"); } } @@ -339,8 +371,10 @@ void js::intl::LanguageTag::performComplexLanguageMappings() { // Derived from CLDR Supplemental Data, version 35.1. 
// https://unicode.org/Public/cldr/35.1/core.zip void js::intl::LanguageTag::performComplexRegionMappings() { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); - MOZ_ASSERT(IsStructurallyValidRegionTag(region().range())); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span())); if (region().equalTo("172")) { if (language().equalTo("hy") || @@ -562,14 +596,17 @@ bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag // that |unicode_locale_id| doesn't support.) // * No RG tag contains |extensions| or |pu_extensions|. - if (script().length() != 0 || - region().length() != 0 || + if (script().present() || + region().present() || variants().length() != 1 || extensions().length() != 0 || privateuse()) { return true; } + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variants()[0].get()))); + auto variantEqualTo = [this](const char* variant) { return strcmp(variants()[0].get(), variant) == 0; }; @@ -619,34 +656,34 @@ bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { } template <size_t Length> -static inline bool IsUnicodeKey(const ConstCharRange& key, +static inline bool IsUnicodeKey(mozilla::Span<const char> key, const char (&str)[Length]) { static_assert(Length == UnicodeKeyLength + 1, "Unicode extension key is two characters long"); - return memcmp(key.begin().get(), str, Length - 1) == 0; + return memcmp(key.data(), str, Length - 1) == 0; } template <size_t Length> -static inline bool IsUnicodeType(const ConstCharRange& type, +static inline bool IsUnicodeType(mozilla::Span<const char> type, const char (&str)[Length]) { static_assert(Length > UnicodeKeyLength + 1, "Unicode 
extension type contains more than two characters"); - return type.length() == (Length - 1) && - memcmp(type.begin().get(), str, Length - 1) == 0; + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; } -static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) { +static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) { #ifdef DEBUG auto isNull = [](char c) { return c == '\0'; }; #endif - MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull), + MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull), "unexpected null-character in string"); using UnsignedChar = unsigned char; - for (size_t i = 0; i < b.length(); i++) { + for (size_t i = 0; i < b.size(); i++) { // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if // we've reached the end of |a|, the below if-statement will always be true. // That ensures we don't read past the end of |a|. @@ -657,13 +694,13 @@ static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) { // Return zero if both strings are equal or a negative number if |b| is a // prefix of |a|. 
- return -int32_t(UnsignedChar(a[b.length()])); + return -int32_t(UnsignedChar(a[b.size()])); }; template <size_t Length> static inline const char* SearchReplacement(const char* (&types)[Length], const char* (&aliases)[Length], - const ConstCharRange& type) { + mozilla::Span<const char> type) { auto p = std::lower_bound(std::begin(types), std::end(types), type, [](const auto& a, const auto& b) { @@ -682,7 +719,7 @@ static inline const char* SearchReplacement(const char* (&types)[Length], * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files */ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( - const ConstCharRange& key, const ConstCharRange& type) { + mozilla::Span<const char> key, mozilla::Span<const char> type) { #ifdef DEBUG static auto isAsciiLowercaseAlphanumeric = [](char c) { return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); @@ -693,12 +730,12 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( }; #endif - MOZ_ASSERT(key.length() == UnicodeKeyLength); - MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(), + MOZ_ASSERT(key.size() == UnicodeKeyLength); + MOZ_ASSERT(std::all_of(key.begin(), key.end(), isAsciiLowercaseAlphanumeric)); - MOZ_ASSERT(type.length() > UnicodeKeyLength); - MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(), + MOZ_ASSERT(type.size() > UnicodeKeyLength); + MOZ_ASSERT(std::all_of(type.begin(), type.end(), isAsciiLowercaseAlphanumericOrDash)); if (IsUnicodeKey(key, "ca")) { diff --git a/js/src/builtin/intl/Locale.cpp b/js/src/builtin/intl/Locale.cpp index 7e26add298..5d55fad2a1 100644 --- a/js/src/builtin/intl/Locale.cpp +++ b/js/src/builtin/intl/Locale.cpp @@ -12,7 +12,7 @@ #include "mozilla/Assertions.h" #include "mozilla/Casting.h" #include "mozilla/Maybe.h" -#include "mozilla/Range.h" +#include "mozilla/Span.h" #include "mozilla/TextUtils.h" #include <algorithm> @@ -56,10 +56,10 @@ static inline bool IsLocale(HandleValue v) { // Return the 
length of the base-name subtags. static size_t BaseNameLength(const LanguageTag& tag) { size_t baseNameLength = tag.language().length(); - if (tag.script().length() > 0) { + if (tag.script().present()) { baseNameLength += 1 + tag.script().length(); } - if (tag.region().length() > 0) { + if (tag.region().present()) { baseNameLength += 1 + tag.region().length(); } for (const auto& variant : tag.variants()) { @@ -75,7 +75,7 @@ struct IndexAndLength { IndexAndLength(size_t index, size_t length) : index(index), length(length){}; template <typename T> - mozilla::Range<const T> rangeOf(const T* ptr) const { + mozilla::Span<const T> spanOf(const T* ptr) const { return {ptr + index, length}; } }; @@ -85,6 +85,9 @@ static mozilla::Maybe<IndexAndLength> UnicodeExtensionPosition( const LanguageTag& tag) { size_t index = 0; for (const auto& extension : tag.extensions()) { + MOZ_ASSERT(!mozilla::IsAsciiUppercaseAlpha(extension[0]), + "extensions are case normalized to lowercase"); + size_t extensionLength = strlen(extension.get()); if (extension[0] == 'u') { return mozilla::Some(IndexAndLength{index, extensionLength}); @@ -287,7 +290,7 @@ static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag, // Step 4. intl::LanguageSubtag language; - if (option && !intl::ParseStandaloneLanguagTag(option, language)) { + if (option && !intl::ParseStandaloneLanguageTag(option, language)) { if (UniqueChars str = StringToNewUTF8CharsZ(cx, *option)) { JS_ReportErrorNumberUTF8(cx, js::GetErrorMessage, nullptr, JSMSG_INVALID_OPTION_VALUE, "language", @@ -329,19 +332,19 @@ static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag, // Step 9 (Already performed in caller). // Skip steps 10-13 when no subtags were modified. - if (language.length() > 0 || script.length() > 0 || region.length() > 0) { + if (language.present() || script.present() || region.present()) { // Step 10. - if (language.length() > 0) { + if (language.present()) { tag.setLanguage(language); } // Step 11. 
- if (script.length() > 0) { + if (script.present()) { tag.setScript(script); } // Step 12. - if (region.length() > 0) { + if (region.present()) { tag.setRegion(region); } @@ -378,16 +381,11 @@ static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag, return false; } - // Check if there's an existing Unicode extension subtag. (The extension - // subtags aren't necessarily sorted, so we can't use binary search here.) - const UniqueChars* existingUnicodeExtension = - std::find_if(tag.extensions().begin(), tag.extensions().end(), - [](const auto& extension) { return extension[0] == 'u'; }); + // Check if there's an existing Unicode extension subtag. const char* unicodeExtensionEnd = nullptr; const char* unicodeExtensionKeywords = nullptr; - if (existingUnicodeExtension != tag.extensions().end()) { - const char* unicodeExtension = existingUnicodeExtension->get(); + if (const char* unicodeExtension = tag.unicodeExtension()) { unicodeExtensionEnd = unicodeExtension + strlen(unicodeExtension); SepKeywordIterator<char> iter(unicodeExtension, unicodeExtensionEnd); @@ -423,8 +421,6 @@ static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag, // keyword with the same key is detected as a duplicate when canonicalizing // the Unicode extension subtag and gets discarded. - size_t startNewKeywords = newExtension.length(); - if (calendar) { if (!appendKeyword("-ca-", calendar)) { return false; @@ -456,12 +452,6 @@ static bool ApplyUnicodeExtensionToTag(JSContext* cx, LanguageTag& tag, } } - // Normalize the case of the new keywords. - std::transform(newExtension.begin() + startNewKeywords, newExtension.end(), - newExtension.begin() + startNewKeywords, [](char c) { - return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c; - }); - // Append the remaining keywords from the previous Unicode extension subtag. 
if (unicodeExtensionKeywords) { if (!newExtension.append(unicodeExtensionKeywords, unicodeExtensionEnd)) { @@ -847,18 +837,18 @@ static BaseNamePartsResult BaseNameParts(const CharT* baseName, size_t length) { } IndexAndLength language{0, languageLength}; - MOZ_ASSERT(intl::IsStructurallyValidLanguageTag(language.rangeOf(baseName))); + MOZ_ASSERT(intl::IsStructurallyValidLanguageTag(language.spanOf(baseName))); mozilla::Maybe<IndexAndLength> script{}; if (scriptIndex) { script.emplace(scriptIndex, ScriptLength); - MOZ_ASSERT(intl::IsStructurallyValidScriptTag(script->rangeOf(baseName))); + MOZ_ASSERT(intl::IsStructurallyValidScriptTag(script->spanOf(baseName))); } mozilla::Maybe<IndexAndLength> region{}; if (regionIndex) { region.emplace(regionIndex, regionLength); - MOZ_ASSERT(intl::IsStructurallyValidRegionTag(region->rangeOf(baseName))); + MOZ_ASSERT(intl::IsStructurallyValidRegionTag(region->spanOf(baseName))); } return {language, script, region}; diff --git a/js/src/builtin/intl/SharedIntlData.cpp b/js/src/builtin/intl/SharedIntlData.cpp index 01db1d38cb..6aeea0ad93 100644 --- a/js/src/builtin/intl/SharedIntlData.cpp +++ b/js/src/builtin/intl/SharedIntlData.cpp @@ -31,9 +31,7 @@ template<typename Char> static constexpr Char
ToUpperASCII(Char c)
{
- return ('a' <= c && c <= 'z')
- ? (c & ~0x20)
- : c;
+ return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c;
}
static_assert(ToUpperASCII('a') == 'A', "verifying 'a' uppercases correctly");
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index 670a46357b..0370d422d9 100644 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -68,8 +68,8 @@ def writeMappingsVar(println, mapping, name, description, source, url): println(u' "{0}": "{1}",'.format(key, value)) println(u"};") -def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, mappings, - tag_maxlength, description, source, url): +def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, validate_case_fn, + mappings, tag_maxlength, description, source, url): """ Emit code to perform a binary search on language tag subtags. Uses the contents of |mapping|, which can either be a dictionary or set, @@ -79,8 +79,9 @@ def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, ma writeMappingHeader(println, description, source, url) println(u""" bool js::intl::LanguageTag::{0}({1} {2}) {{ - MOZ_ASSERT({3}({2}.range())); -""".format(fn_name, type_name, name, validate_fn).strip()) + MOZ_ASSERT({3}({2}.span())); + MOZ_ASSERT({4}({2}.span())); +""".format(fn_name, type_name, name, validate_fn, validate_case_fn).strip()) def write_array(subtags, name, length, fixed): if fixed: @@ -162,7 +163,7 @@ bool js::intl::LanguageTag::{0}({1} {2}) {{ println(u""" if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ - {0}.set(ConstCharRange(replacement, strlen(replacement))); + {0}.set(mozilla::MakeCStringSpan(replacement)); return true; }} return false; @@ -190,7 +191,8 @@ def writeComplexLanguageTagMappings(println, complex_language_mappings, writeMappingHeader(println, description, source, url) println(u""" void js::intl::LanguageTag::performComplexLanguageMappings() { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); 
""".lstrip()) # Merge duplicate language entries. @@ -227,12 +229,12 @@ void js::intl::LanguageTag::performComplexLanguageMappings() { if script is not None: println(u""" - if (script().length() == 0) {{ + if (script().missing()) {{ setScript("{}"); }}""".format(script).strip("\n")) if region is not None: println(u""" - if (region().length() == 0) {{ + if (region().missing()) {{ setRegion("{}"); }}""".format(region).strip("\n")) println(u""" @@ -249,8 +251,10 @@ def writeComplexRegionTagMappings(println, complex_region_mappings, writeMappingHeader(println, description, source, url) println(u""" void js::intl::LanguageTag::performComplexRegionMappings() { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range())); - MOZ_ASSERT(IsStructurallyValidRegionTag(region().range())); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span())); """.lstrip()) # |non_default_replacements| is a list and hence not hashable. Convert it @@ -360,14 +364,17 @@ bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag // that |unicode_locale_id| doesn't support.) // * No RG tag contains |extensions| or |pu_extensions|. 
- if (script().length() != 0 || - region().length() != 0 || + if (script().present() || + region().present() || variants().length() != 1 || extensions().length() != 0 || privateuse()) { return true; } + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variants()[0].get()))); + auto variantEqualTo = [this](const char* variant) { return strcmp(variants()[0].get(), variant) == 0; };""") @@ -870,7 +877,7 @@ def writeCLDRLanguageTagData(println, data, url): println(u""" #include "mozilla/Assertions.h" -#include "mozilla/Range.h" +#include "mozilla/Span.h" #include "mozilla/TextUtils.h" #include <algorithm> @@ -885,7 +892,6 @@ def writeCLDRLanguageTagData(println, data, url): #include "builtin/intl/LanguageTag.h" using namespace js::intl::LanguageTagLimits; -using ConstCharRange = mozilla::Range<const char>; template <size_t Length, size_t TagLength, size_t SubtagLength> static inline bool HasReplacement( @@ -894,7 +900,7 @@ static inline bool HasReplacement( MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); - const char* ptr = subtag.range().begin().get(); + const char* ptr = subtag.span().data(); return std::binary_search(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { return memcmp(a, b, TagLength - 1) < 0; @@ -909,7 +915,7 @@ static inline const char* SearchReplacement( MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); - const char* ptr = subtag.range().begin().get(); + const char* ptr = subtag.span().data(); auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { return memcmp(a, b, TagLength - 1) < 0; @@ -919,6 +925,34 @@ static inline const char* SearchReplacement( } return nullptr; } + +#ifdef DEBUG +static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { + // Tell the 
analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>); +} + +static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) || + std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>); +} + +static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { + auto isAsciiLowercaseAlphaOrDigit = [](char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); + }; + + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit); +} +#endif """.rstrip()) source = u"CLDR Supplemental Data, version {}".format(data["version"]) @@ -938,21 +972,25 @@ static inline const char* SearchReplacement( writeMappingsBinarySearch(println, "languageMapping", "LanguageSubtag&", "language", "IsStructurallyValidLanguageTag", + "IsCanonicallyCasedLanguageTag", language_mappings, language_maxlength, "Mappings from language subtags to preferred values.", source, url) writeMappingsBinarySearch(println, "complexLanguageMapping", "const LanguageSubtag&", "language", "IsStructurallyValidLanguageTag", + "IsCanonicallyCasedLanguageTag", complex_language_mappings.keys(), language_maxlength, "Language subtags with complex mappings.", source, url) writeMappingsBinarySearch(println, "regionMapping", "RegionSubtag&", "region", "IsStructurallyValidRegionTag", + "IsCanonicallyCasedRegionTag", region_mappings, region_maxlength, "Mappings from region subtags to preferred values.", source, url) writeMappingsBinarySearch(println, "complexRegionMapping", "const RegionSubtag&", "region", "IsStructurallyValidRegionTag", + "IsCanonicallyCasedRegionTag", 
complex_region_mappings.keys(), region_maxlength, "Region subtags with complex mappings.", source, url) @@ -1863,34 +1901,34 @@ def updateTzdata(topsrcdir, args): def writeUnicodeExtensionsMappings(println, mapping): println(u""" template <size_t Length> -static inline bool IsUnicodeKey(const ConstCharRange& key, +static inline bool IsUnicodeKey(mozilla::Span<const char> key, const char (&str)[Length]) { static_assert(Length == UnicodeKeyLength + 1, "Unicode extension key is two characters long"); - return memcmp(key.begin().get(), str, Length - 1) == 0; + return memcmp(key.data(), str, Length - 1) == 0; } template <size_t Length> -static inline bool IsUnicodeType(const ConstCharRange& type, +static inline bool IsUnicodeType(mozilla::Span<const char> type, const char (&str)[Length]) { static_assert(Length > UnicodeKeyLength + 1, "Unicode extension type contains more than two characters"); - return type.length() == (Length - 1) && - memcmp(type.begin().get(), str, Length - 1) == 0; + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; } -static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) { +static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) { #ifdef DEBUG auto isNull = [](char c) { return c == '\\0'; }; #endif - MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull), + MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull), "unexpected null-character in string"); using UnsignedChar = unsigned char; - for (size_t i = 0; i < b.length(); i++) { + for (size_t i = 0; i < b.size(); i++) { // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if // we've reached the end of |a|, the below if-statement will always be true. // That ensures we don't read past the end of |a|. @@ -1901,13 +1939,13 @@ static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) { // Return zero if both strings are equal or a negative number if |b| is a // prefix of |a|. 
- return -int32_t(UnsignedChar(a[b.length()])); + return -int32_t(UnsignedChar(a[b.size()])); }; template <size_t Length> static inline const char* SearchReplacement(const char* (&types)[Length], const char* (&aliases)[Length], - const ConstCharRange& type) { + mozilla::Span<const char> type) { auto p = std::lower_bound(std::begin(types), std::end(types), type, [](const auto& a, const auto& b) { @@ -1926,7 +1964,7 @@ static inline const char* SearchReplacement(const char* (&types)[Length], * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files */ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( - const ConstCharRange& key, const ConstCharRange& type) { + mozilla::Span<const char> key, mozilla::Span<const char> type) { #ifdef DEBUG static auto isAsciiLowercaseAlphanumeric = [](char c) { return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); @@ -1937,12 +1975,12 @@ const char* js::intl::LanguageTag::replaceUnicodeExtensionType( }; #endif - MOZ_ASSERT(key.length() == UnicodeKeyLength); - MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(), + MOZ_ASSERT(key.size() == UnicodeKeyLength); + MOZ_ASSERT(std::all_of(key.begin(), key.end(), isAsciiLowercaseAlphanumeric)); - MOZ_ASSERT(type.length() > UnicodeKeyLength); - MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(), + MOZ_ASSERT(type.size() > UnicodeKeyLength); + MOZ_ASSERT(std::all_of(type.begin(), type.end(), isAsciiLowercaseAlphanumericOrDash)); """) diff --git a/js/src/js.msg b/js/src/js.msg index 1b77cf6a31..a2a1e3f3d2 100644 --- a/js/src/js.msg +++ b/js/src/js.msg @@ -485,6 +485,7 @@ MSG_DEF(JSMSG_TRACELOGGER_ENABLE_FAIL, 1, JSEXN_ERR, "enabling tracelogger faile // Intl MSG_DEF(JSMSG_DATE_NOT_FINITE, 2, JSEXN_RANGEERR, "date value is not finite in {0}.{1}()") +MSG_DEF(JSMSG_DUPLICATE_VARIANT_SUBTAG, 1, JSEXN_RANGEERR, "duplicate variant subtag: {0}") MSG_DEF(JSMSG_INTERNAL_INTL_ERROR, 0, JSEXN_ERR, "internal error while computing Intl data") 
MSG_DEF(JSMSG_INTL_OBJECT_NOT_INITED, 3, JSEXN_TYPEERR, "Intl.{0}.prototype.{1} called on value that's not an object initialized as a {2}") MSG_DEF(JSMSG_INVALID_CURRENCY_CODE, 1, JSEXN_RANGEERR, "invalid currency code in NumberFormat(): {0}") |