diff options
Diffstat (limited to 'intl/unicharutil/tools/genUnicodePropertyData.pl')
-rwxr-xr-x | intl/unicharutil/tools/genUnicodePropertyData.pl | 164 |
1 files changed, 121 insertions, 43 deletions
diff --git a/intl/unicharutil/tools/genUnicodePropertyData.pl b/intl/unicharutil/tools/genUnicodePropertyData.pl index 8b247e83c6..bd86076eac 100755 --- a/intl/unicharutil/tools/genUnicodePropertyData.pl +++ b/intl/unicharutil/tools/genUnicodePropertyData.pl @@ -23,6 +23,7 @@ # - HangulSyllableType.txt # - LineBreak.txt # - EastAsianWidth.txt +# - DerivedCoreProperties.txt # - ReadMe.txt (to record version/date of the UCD) # - Unihan_Variants.txt (from Unihan.zip) # though this may change if we find a need for additional properties. @@ -30,12 +31,13 @@ # The Unicode data files listed above should be together in one directory. # # We also require the file -# http://www.unicode.org/Public/security/latest/xidmodifications.txt +# http://www.unicode.org/Public/security/latest/IdentifierStatus.txt +# http://www.unicode.org/Public/security/latest/IdentifierType.txt # This file should be in a sub-directory "security" immediately below the # directory containing the other Unicode data files. # -# We also require the latest data file for UTR50, currently revision-13: -# http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt +# We also require the latest data file for UTR50, currently revision-15: +# http://www.unicode.org/Public/vertical/revision-15/VerticalOrientation-15.txt # This file should be in a sub-directory "vertical" immediately below the # directory containing the other Unicode data files. # @@ -140,20 +142,35 @@ sub readIcuHeader die "didn't find ICU script codes\n" if $sc == -1; -my %xidmodCode = ( -'Recommended' => 0, -'Inclusion' => 1, -'Uncommon_Use' => 2, -'Technical' => 3, -'Obsolete' => 4, -'Aspirational' => 5, -'Limited_Use' => 6, -'Exclusion' => 7, -'Not_XID' => 8, -'Not_NFKC' => 9, -'Default_Ignorable' => 10, -'Deprecated' => 11, -'not-chars' => 12 +# We don't currently store these values; %idType is used only to check that +# properties listed in the IdentifierType.txt file are recognized. We record +# only the %mappedIdType values that are used by nsIDNService::isLabelSafe. +# In practice, it would be sufficient for us to read only the last value in +# IdentifierType.txt, but we check that all values are known so that we'll get +# a warning if future updates introduce new ones, and can consider whether +# they need to be taken into account. +my %idType = ( + "Not_Character" => 0, + "Recommended" => 1, + "Inclusion" => 2, + "Uncommon_Use" => 3, + "Technical" => 4, + "Obsolete" => 5, + "Aspirational" => 6, + "Limited_Use" => 7, + "Exclusion" => 8, + "Not_XID" => 9, + "Not_NFKC" => 10, + "Default_Ignorable" => 11, + "Deprecated" => 12 +); + +# These match the IdentifierType enum in nsUnicodeProperties.h. +my %mappedIdType = ( + "Restricted" => 0, + "Allowed" => 1, + "Aspirational" => 2 # for Aspirational characters that are not excluded + # by another attribute. ); my %bidicategoryCode = ( @@ -229,7 +246,10 @@ my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum "CP" => 36, "CJ" => 37, "HL" => 38, - "RI" => 39 + "RI" => 39, + "EB" => 40, + "EM" => 41, + "ZWJ" => 42 ); my %eastAsianWidthCode = ( @@ -249,7 +269,7 @@ my @mirror; my @pairedBracketType; my @hangul; my @casemap; -my @xidmod; +my @idtype; my @numericvalue; my @hanVariant; my @bidicategory; @@ -258,13 +278,14 @@ my @fullWidthInverse; my @verticalOrientation; my @lineBreak; my @eastAsianWidthFWH; +my @defaultIgnorable; for (my $i = 0; $i < 0x110000; ++$i) { $script[$i] = $scriptCode{"UNKNOWN"}; $category[$i] = $catCode{"UNASSIGNED"}; $combining[$i] = 0; $pairedBracketType[$i] = 0; $casemap[$i] = 0; - $xidmod[$i] = $xidmodCode{"not-chars"}; + $idtype[$i] = $mappedIdType{'Restricted'}; $numericvalue[$i] = -1; $hanVariant[$i] = 0; $bidicategory[$i] = $bidicategoryCode{"L"}; @@ -273,6 +294,7 @@ for (my $i = 0; $i < 0x110000; ++$i) { $verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R' $lineBreak[$i] = $lineBreakCode{"XX"}; $eastAsianWidthFWH[$i] = 0; + $defaultIgnorable[$i] = 0; } # blocks where the default for bidi category is not L @@ -557,25 +579,72 @@ while (<FH>) { } close FH; -# read xidmodifications.txt -open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n"; +# read DerivedCoreProperties.txt (for Default-Ignorables) +open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n"; push @versionInfo, ""; + while (<FH>) { - chomp; - unless (/\xef\xbb\xbf/) { + chomp; push @versionInfo, $_; - } - last if /Generated:/; + last if /Date:/; +} +while (<FH>) { + s/#.*//; + if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) { + my $start = hex "0x$1"; + my $end = (defined $2) ? hex "0x$2" : $start; + for (my $i = $start; $i <= $end; ++$i) { + $defaultIgnorable[$i] = 1; + } + } } +close FH; + +# read IdentifierStatus.txt +open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n"; +push @versionInfo, ""; while (<FH>) { - if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) { - my $xidmod = $3; - warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod}; - $xidmod = $xidmodCode{$xidmod}; + chomp; + s/\xef\xbb\xbf//; + push @versionInfo, $_; + last if /Date:/; + +} + +while (<FH>) { + if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+Allowed/) { my $start = hex "0x$1"; my $end = (defined $2) ? hex "0x$2" : $start; for (my $i = $start; $i <= $end; ++$i) { - $xidmod[$i] = $xidmod; + $idtype[$i] = $mappedIdType{'Allowed'}; + } + + } +} +close FH; + +# read IdentifierType.txt, to find Aspirational characters +open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n"; +push @versionInfo, ""; +while (<FH>) { + chomp; + s/\xef\xbb\xbf//; + push @versionInfo, $_; + last if /Date:/; +} +while (<FH>) { + if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^#]+)/) { + my $idtype = $3; + foreach (split(/ /, $idtype)) { + warn "unknown Identifier Type $_" unless exists $idType{$_}; + } + my $start = hex "0x$1"; + my $end = (defined $2) ? hex "0x$2" : $start; + if ($idtype =~ /Aspirational/ and (not $idtype =~ /Exclusion|Not_XID|Not_NFKC/)) { + + for (my $i = $start; $i <= $end; ++$i) { + $idtype[$i] = $mappedIdType{'Aspirational'}; + } } } } @@ -617,8 +686,8 @@ while (<FH>) { } close FH; -# read VerticalOrientation-13.txt -open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n"; +# read VerticalOrientation-15.txt +open FH, "< $UNICODE/vertical/VerticalOrientation-15.txt" or die "can't open UTR50 data file VerticalOrientation-15.txt\n"; push @versionInfo, ""; while (<FH>) { chomp; @@ -738,14 +807,15 @@ sub sprintCharProps2_short { my $usv = shift; return sprintf("{%d,%d},", - $verticalOrientation[$usv], $xidmod[$usv]); + $verticalOrientation[$usv], $idtype[$usv]); } $type = q| struct nsCharProps2 { - // Currently only 6 bits are defined here, so 2 more could be added without - // affecting the storage requirements for this struct. + // Currently only 4 bits are defined here, so 4 more could be added without + // affecting the storage requirements for this struct. Or we could pack two + // records per byte, at the cost of a slightly more complex accessor. unsigned char mVertOrient:2; - unsigned char mXidmod:4; + unsigned char mIdType:2; }; |; &genTables("#if ENABLE_INTL_API", "#endif", @@ -754,23 +824,31 @@ struct nsCharProps2 { sub sprintCharProps2_full { my $usv = shift; - return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d},", + return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d,%d},", $script[$usv], $pairedBracketType[$usv], $eastAsianWidthFWH[$usv], $category[$usv], - $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv], - $verticalOrientation[$usv], $lineBreak[$usv]); + $idtype[$usv], $defaultIgnorable[$usv], $bidicategory[$usv], + $verticalOrientation[$usv], $lineBreak[$usv], + $numericvalue[$usv]); } $type = q| +// This struct currently requires 5 bytes. We try to ensure that whole-byte +// fields will not straddle byte boundaries, to optimize access to them. struct nsCharProps2 { unsigned char mScriptCode:8; + // -- byte boundary -- unsigned char mPairedBracketType:2; unsigned char mEastAsianWidthFWH:1; unsigned char mCategory:5; + // -- byte boundary -- + unsigned char mIdType:2; + unsigned char mDefaultIgnorable:1; unsigned char mBidiCategory:5; - unsigned char mXidmod:4; - signed char mNumericValue:5; + // -- byte boundary -- unsigned char mVertOrient:2; - unsigned char mLineBreak; // only 6 bits actually needed + unsigned char mLineBreak:6; + // -- byte boundary -- + signed char mNumericValue; // only 5 bits are actually needed here }; |; &genTables("#if !ENABLE_INTL_API", "#endif", |