From 26adb1ff4193fa5b53a8c5e01b4ea0be3677eff8 Mon Sep 17 00:00:00 2001 From: Job Bautista Date: Wed, 22 Jun 2022 16:38:14 +0800 Subject: Issue #326 - Part 1a: Update character property table generator script for Unicode 9, and adjust APIs to fit the new identifier-type property model Backported from Mozilla bug 1281448. --- gfx/thebes/gfxFont.cpp | 9 +- intl/unicharutil/tools/genUnicodePropertyData.pl | 164 +++++++++++++++++------ intl/unicharutil/util/nsUnicodeProperties.cpp | 2 +- intl/unicharutil/util/nsUnicodeProperties.h | 31 +++-- netwerk/dns/nsIDNService.cpp | 66 ++++----- 5 files changed, 174 insertions(+), 98 deletions(-) diff --git a/gfx/thebes/gfxFont.cpp b/gfx/thebes/gfxFont.cpp index f79c5cbd72..8ac64bc1b4 100644 --- a/gfx/thebes/gfxFont.cpp +++ b/gfx/thebes/gfxFont.cpp @@ -725,10 +725,9 @@ gfxShapedText::SetGlyphs(uint32_t aIndex, CompressedGlyph aGlyph, #define ZWNJ 0x200C #define ZWJ 0x200D static inline bool -IsDefaultIgnorable(uint32_t aChar) +IsIgnorable(uint32_t aChar) { - return GetIdentifierModification(aChar) == XIDMOD_DEFAULT_IGNORABLE || - aChar == ZWNJ || aChar == ZWJ; + return (IsDefaultIgnorable(aChar)) || aChar == ZWNJ || aChar == ZWJ; } void @@ -744,7 +743,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont) DetailedGlyph *details = AllocateDetailedGlyphs(aIndex, 1); details->mGlyphID = aChar; - if (IsDefaultIgnorable(aChar)) { + if (IsIgnorable(aChar)) { // Setting advance width to zero will prevent drawing the hexbox details->mAdvance = 0; } else { @@ -762,7 +761,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont) bool gfxShapedText::FilterIfIgnorable(uint32_t aIndex, uint32_t aCh) { - if (IsDefaultIgnorable(aCh)) { + if (IsIgnorable(aCh)) { // There are a few default-ignorables of Letter category (currently, // just the Hangul filler characters) that we'd better not discard // if they're followed by additional characters in the same cluster. 
diff --git a/intl/unicharutil/tools/genUnicodePropertyData.pl b/intl/unicharutil/tools/genUnicodePropertyData.pl index 8b247e83c6..bd86076eac 100755 --- a/intl/unicharutil/tools/genUnicodePropertyData.pl +++ b/intl/unicharutil/tools/genUnicodePropertyData.pl @@ -23,6 +23,7 @@ # - HangulSyllableType.txt # - LineBreak.txt # - EastAsianWidth.txt +# - DerivedCoreProperties.txt # - ReadMe.txt (to record version/date of the UCD) # - Unihan_Variants.txt (from Unihan.zip) # though this may change if we find a need for additional properties. @@ -30,12 +31,13 @@ # The Unicode data files listed above should be together in one directory. # # We also require the file -# http://www.unicode.org/Public/security/latest/xidmodifications.txt +# http://www.unicode.org/Public/security/latest/IdentifierStatus.txt +# http://www.unicode.org/Public/security/latest/IdentifierType.txt # This file should be in a sub-directory "security" immediately below the # directory containing the other Unicode data files. # -# We also require the latest data file for UTR50, currently revision-13: -# http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt +# We also require the latest data file for UTR50, currently revision-15: +# http://www.unicode.org/Public/vertical/revision-15/VerticalOrientation-15.txt # This file should be in a sub-directory "vertical" immediately below the # directory containing the other Unicode data files. # @@ -140,20 +142,35 @@ sub readIcuHeader die "didn't find ICU script codes\n" if $sc == -1; -my %xidmodCode = ( -'Recommended' => 0, -'Inclusion' => 1, -'Uncommon_Use' => 2, -'Technical' => 3, -'Obsolete' => 4, -'Aspirational' => 5, -'Limited_Use' => 6, -'Exclusion' => 7, -'Not_XID' => 8, -'Not_NFKC' => 9, -'Default_Ignorable' => 10, -'Deprecated' => 11, -'not-chars' => 12 +# We don't currently store these values; %idType is used only to check that +# properties listed in the IdentifierType.txt file are recognized. 
We record +# only the %mappedIdType values that are used by nsIDNService::isLabelSafe. +# In practice, it would be sufficient for us to read only the last value in +# IdentifierType.txt, but we check that all values are known so that we'll get +# a warning if future updates introduce new ones, and can consider whether +# they need to be taken into account. +my %idType = ( + "Not_Character" => 0, + "Recommended" => 1, + "Inclusion" => 2, + "Uncommon_Use" => 3, + "Technical" => 4, + "Obsolete" => 5, + "Aspirational" => 6, + "Limited_Use" => 7, + "Exclusion" => 8, + "Not_XID" => 9, + "Not_NFKC" => 10, + "Default_Ignorable" => 11, + "Deprecated" => 12 +); + +# These match the IdentifierType enum in nsUnicodeProperties.h. +my %mappedIdType = ( + "Restricted" => 0, + "Allowed" => 1, + "Aspirational" => 2 # for Aspirational characters that are not excluded + # by another attribute. ); my %bidicategoryCode = ( @@ -229,7 +246,10 @@ my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum "CP" => 36, "CJ" => 37, "HL" => 38, - "RI" => 39 + "RI" => 39, + "EB" => 40, + "EM" => 41, + "ZWJ" => 42 ); my %eastAsianWidthCode = ( @@ -249,7 +269,7 @@ my @mirror; my @pairedBracketType; my @hangul; my @casemap; -my @xidmod; +my @idtype; my @numericvalue; my @hanVariant; my @bidicategory; @@ -258,13 +278,14 @@ my @fullWidthInverse; my @verticalOrientation; my @lineBreak; my @eastAsianWidthFWH; +my @defaultIgnorable; for (my $i = 0; $i < 0x110000; ++$i) { $script[$i] = $scriptCode{"UNKNOWN"}; $category[$i] = $catCode{"UNASSIGNED"}; $combining[$i] = 0; $pairedBracketType[$i] = 0; $casemap[$i] = 0; - $xidmod[$i] = $xidmodCode{"not-chars"}; + $idtype[$i] = $mappedIdType{'Restricted'}; $numericvalue[$i] = -1; $hanVariant[$i] = 0; $bidicategory[$i] = $bidicategoryCode{"L"}; @@ -273,6 +294,7 @@ for (my $i = 0; $i < 0x110000; ++$i) { $verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R' $lineBreak[$i] = $lineBreakCode{"XX"}; $eastAsianWidthFWH[$i] = 0; + 
$defaultIgnorable[$i] = 0; } # blocks where the default for bidi category is not L @@ -557,25 +579,72 @@ while () { } close FH; -# read xidmodifications.txt -open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n"; +# read DerivedCoreProperties.txt (for Default-Ignorables) +open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n"; push @versionInfo, ""; + while () { - chomp; - unless (/\xef\xbb\xbf/) { + chomp; push @versionInfo, $_; - } - last if /Generated:/; + last if /Date:/; +} +while () { + s/#.*//; + if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) { + my $start = hex "0x$1"; + my $end = (defined $2) ? hex "0x$2" : $start; + for (my $i = $start; $i <= $end; ++$i) { + $defaultIgnorable[$i] = 1; + } + } } +close FH; + +# read IdentifierStatus.txt +open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n"; +push @versionInfo, ""; while () { - if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) { - my $xidmod = $3; - warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod}; - $xidmod = $xidmodCode{$xidmod}; + chomp; + s/\xef\xbb\xbf//; + push @versionInfo, $_; + last if /Date:/; + +} + +while () { + if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+Allowed/) { my $start = hex "0x$1"; my $end = (defined $2) ? 
hex "0x$2" : $start; for (my $i = $start; $i <= $end; ++$i) { - $xidmod[$i] = $xidmod; + $idtype[$i] = $mappedIdType{'Allowed'}; + } + + } +} +close FH; + +# read IdentifierType.txt, to find Aspirational characters +open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n"; +push @versionInfo, ""; +while () { + chomp; + s/\xef\xbb\xbf//; + push @versionInfo, $_; + last if /Date:/; +} +while () { + if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^#]+)/) { + my $idtype = $3; + foreach (split(/ /, $idtype)) { + warn "unknown Identifier Type $_" unless exists $idType{$_}; + } + my $start = hex "0x$1"; + my $end = (defined $2) ? hex "0x$2" : $start; + if ($idtype =~ /Aspirational/ and (not $idtype =~ /Exclusion|Not_XID|Not_NFKC/)) { + + for (my $i = $start; $i <= $end; ++$i) { + $idtype[$i] = $mappedIdType{'Aspirational'}; + } } } } @@ -617,8 +686,8 @@ while () { } close FH; -# read VerticalOrientation-13.txt -open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n"; +# read VerticalOrientation-15.txt +open FH, "< $UNICODE/vertical/VerticalOrientation-15.txt" or die "can't open UTR50 data file VerticalOrientation-15.txt\n"; push @versionInfo, ""; while () { chomp; @@ -738,14 +807,15 @@ sub sprintCharProps2_short { my $usv = shift; return sprintf("{%d,%d},", - $verticalOrientation[$usv], $xidmod[$usv]); + $verticalOrientation[$usv], $idtype[$usv]); } $type = q| struct nsCharProps2 { - // Currently only 6 bits are defined here, so 2 more could be added without - // affecting the storage requirements for this struct. + // Currently only 4 bits are defined here, so 4 more could be added without + // affecting the storage requirements for this struct. Or we could pack two + // records per byte, at the cost of a slightly more complex accessor. 
unsigned char mVertOrient:2; - unsigned char mXidmod:4; + unsigned char mIdType:2; }; |; &genTables("#if ENABLE_INTL_API", "#endif", @@ -754,23 +824,31 @@ struct nsCharProps2 { sub sprintCharProps2_full { my $usv = shift; - return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d},", + return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d,%d},", $script[$usv], $pairedBracketType[$usv], $eastAsianWidthFWH[$usv], $category[$usv], - $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv], - $verticalOrientation[$usv], $lineBreak[$usv]); + $idtype[$usv], $defaultIgnorable[$usv], $bidicategory[$usv], + $verticalOrientation[$usv], $lineBreak[$usv], + $numericvalue[$usv]); } $type = q| +// This struct currently requires 5 bytes. We try to ensure that whole-byte +// fields will not straddle byte boundaries, to optimize access to them. struct nsCharProps2 { unsigned char mScriptCode:8; + // -- byte boundary -- unsigned char mPairedBracketType:2; unsigned char mEastAsianWidthFWH:1; unsigned char mCategory:5; + // -- byte boundary -- + unsigned char mIdType:2; + unsigned char mDefaultIgnorable:1; unsigned char mBidiCategory:5; - unsigned char mXidmod:4; - signed char mNumericValue:5; + // -- byte boundary -- unsigned char mVertOrient:2; - unsigned char mLineBreak; // only 6 bits actually needed + unsigned char mLineBreak:6; + // -- byte boundary -- + signed char mNumericValue; // only 5 bits are actually needed here }; |; &genTables("#if !ENABLE_INTL_API", "#endif", diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp index 71f684f0e9..99a87bc958 100644 --- a/intl/unicharutil/util/nsUnicodeProperties.cpp +++ b/intl/unicharutil/util/nsUnicodeProperties.cpp @@ -31,7 +31,7 @@ GetCharProps2(uint32_t aCh) using namespace mozilla::unicode; static const nsCharProps2 undefined = { VERTICAL_ORIENTATION_R, - XIDMOD_NOT_CHARS + 0 // IdentifierType }; return undefined; } diff --git a/intl/unicharutil/util/nsUnicodeProperties.h 
b/intl/unicharutil/util/nsUnicodeProperties.h index ee1d77252c..2ff69d19a5 100644 --- a/intl/unicharutil/util/nsUnicodeProperties.h +++ b/intl/unicharutil/util/nsUnicodeProperties.h @@ -38,20 +38,13 @@ enum PairedBracketType { PAIRED_BRACKET_TYPE_CLOSE = 2 }; -enum XidmodType { - XIDMOD_RECOMMENDED, - XIDMOD_INCLUSION, - XIDMOD_UNCOMMON_USE, - XIDMOD_TECHNICAL, - XIDMOD_OBSOLETE, - XIDMOD_ASPIRATIONAL, - XIDMOD_LIMITED_USE, - XIDMOD_EXCLUSION, - XIDMOD_NOT_XID, - XIDMOD_NOT_NFKC, - XIDMOD_DEFAULT_IGNORABLE, - XIDMOD_DEPRECATED, - XIDMOD_NOT_CHARS +/* Flags for Unicode security IdentifierType.txt attributes. Only a subset + of these are currently checked by Gecko, so we only define flags for the + ones we need. */ +enum IdentifierType { + IDTYPE_RESTRICTED = 0, + IDTYPE_ALLOWED = 1, + IDTYPE_ASPIRATIONAL = 2, }; enum EmojiPresentation { @@ -179,6 +172,12 @@ IsEastAsianWidthFWH(uint32_t aCh) return false; } +inline bool +IsDefaultIgnorable(uint32_t aCh) +{ + return u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT); +} + inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) { @@ -201,8 +200,8 @@ inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) { return VerticalOrientation(GetCharProps2(aCh).mVertOrient); } -inline XidmodType GetIdentifierModification(uint32_t aCh) { - return XidmodType(GetCharProps2(aCh).mXidmod); +inline IdentifierType GetIdentifierType(uint32_t aCh) { + return IdentifierType(GetCharProps2(aCh).mIdType); } uint32_t GetFullWidth(uint32_t aCh); diff --git a/netwerk/dns/nsIDNService.cpp b/netwerk/dns/nsIDNService.cpp index 9210e20b5f..70e255ed15 100644 --- a/netwerk/dns/nsIDNService.cpp +++ b/netwerk/dns/nsIDNService.cpp @@ -314,42 +314,42 @@ nsresult nsIDNService::ACEtoUTF8(const nsACString & input, nsACString & _retval, return NS_OK; } - -/** - * Returns |true| if |aString| contains only ASCII characters according - * to our CRT. 
- * - * @param aString an 8-bit wide string to scan - */ -inline bool IsAsciiString(mozilla::Span aString) { - for (char c : aString) { - if (!nsCRT::IsAscii(c)) { - return false; - } - } - return true; -} + +/** + * Returns |true| if |aString| contains only ASCII characters according + * to our CRT. + * + * @param aString an 8-bit wide string to scan + */ +inline bool IsAsciiString(mozilla::Span aString) { + for (char c : aString) { + if (!nsCRT::IsAscii(c)) { + return false; + } + } + return true; +} NS_IMETHODIMP nsIDNService::IsACE(const nsACString & input, bool *_retval) { // look for the ACE prefix in the input string. it may occur // at the beginning of any segment in the domain name. for // example: "www.xn--ENCODED.com" - if (!IsAsciiString(input)) { - *_retval = false; - return NS_OK; - } - auto stringContains = [](const nsACString& haystack, - const nsACString& needle) { - return std::search(haystack.BeginReading(), haystack.EndReading(), - needle.BeginReading(), - needle.EndReading()) != haystack.EndReading(); - }; - - *_retval = StringBeginsWith(input, NS_LITERAL_CSTRING("xn--")) || - (!input.IsEmpty() && input[0] != '.' && - stringContains(input, NS_LITERAL_CSTRING(".xn--"))); - return NS_OK; + if (!IsAsciiString(input)) { + *_retval = false; + return NS_OK; + } + auto stringContains = [](const nsACString& haystack, + const nsACString& needle) { + return std::search(haystack.BeginReading(), haystack.EndReading(), + needle.BeginReading(), + needle.EndReading()) != haystack.EndReading(); + }; + + *_retval = StringBeginsWith(input, NS_LITERAL_CSTRING("xn--")) || + (!input.IsEmpty() && input[0] != '.' 
&& + stringContains(input, NS_LITERAL_CSTRING(".xn--"))); + return NS_OK; } NS_IMETHODIMP nsIDNService::Normalize(const nsACString & input, @@ -730,11 +730,11 @@ bool nsIDNService::isLabelSafe(const nsAString &label) // Check for restricted characters; aspirational scripts are NOT permitted, // in anticipation of the category being merged into Limited-Use scripts // in the upcoming (Unicode 10.0-based) revision of UAX #31. - XidmodType xm = GetIdentifierModification(ch); - if (xm != XIDMOD_RECOMMENDED && - xm != XIDMOD_INCLUSION) { + IdentifierType idType = GetIdentifierType(ch); + if (idType == IDTYPE_RESTRICTED || idType == IDTYPE_ASPIRATIONAL) { return false; } + MOZ_ASSERT(idType == IDTYPE_ALLOWED); // Check for mixed script Script script = GetScriptCode(ch); -- cgit v1.2.3