summary | refs | log | tree | commit | diff
path: root/intl
diff options
context:
space:
mode:
author: Job Bautista <jobbautista9@protonmail.com> 2022-06-23 15:44:03 +0800
committer: Job Bautista <jobbautista9@protonmail.com> 2022-06-23 15:44:03 +0800
commit: fcc171ddb76d8fc70074854a7b05ed720002e8cc (patch)
tree: baafe947143375e03df361781e142ddc40d8e466 /intl
parent: bbb68510ee0ededf18ee7a4680d3a9bbe57e872d (diff)
download: uxp-fcc171ddb76d8fc70074854a7b05ed720002e8cc.tar.gz
Issue #326 - Part 5: Simplify the genUnicodePropertyData.pl tool to generate only the data we need to supplement ICU properties
Backported from Mozilla bug 1402271. Some removals were already done by Part 1b.
Diffstat (limited to 'intl')
-rwxr-xr-x intl/unicharutil/tools/genUnicodePropertyData.pl 434
1 files changed, 14 insertions, 420 deletions
diff --git a/intl/unicharutil/tools/genUnicodePropertyData.pl b/intl/unicharutil/tools/genUnicodePropertyData.pl
index e17b6cb76a..8c7437f82d 100755
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@@ -9,6 +9,10 @@
# read from the Unicode Character Database and compiled into multi-level arrays
# for efficient lookup.
#
+# Note that for most properties, we now rely on ICU; this tool and the tables
+# it generates are used only for a couple of properties not readily exposed
+# via ICU APIs.
+#
# To regenerate the tables in nsUnicodePropertyData.cpp:
#
# (1) Download the current Unicode data files from
@@ -17,13 +21,6 @@
#
# NB: not all the files are actually needed; currently, we require
# - UnicodeData.txt
-# - Scripts.txt
-# - BidiMirroring.txt
-# - BidiBrackets.txt
-# - HangulSyllableType.txt
-# - LineBreak.txt
-# - EastAsianWidth.txt
-# - DerivedCoreProperties.txt
# - ReadMe.txt (to record version/date of the UCD)
# - Unihan_Variants.txt (from Unihan.zip)
# though this may change if we find a need for additional properties.
@@ -44,7 +41,6 @@
# (2) Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
-# /path/to/harfbuzz/src \
# /path/to/icu/common/unicode \
# /path/to/UCD-directory
#
@@ -58,17 +54,15 @@
use strict;
use List::Util qw(first);
-if ($#ARGV != 2) {
+if ($#ARGV != 1) {
print <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \\
-# /path/to/harfbuzz/src \\
# /path/to/icu/common/unicode \\
# /path/to/UCD-directory
#
-# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
-# icu/common/unicode is the directory containing ICU 'common' public headers,
+# where icu/common/unicode is the directory containing ICU 'common' headers,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/, with additional resources as
@@ -84,35 +78,11 @@ __EOT
exit 0;
}
-my $HARFBUZZ = $ARGV[0];
-my $ICU = $ARGV[1];
-my $UNICODE = $ARGV[2];
-
-# load HB_Category constants
-
-my $cc = -1;
-my %catCode;
+my $ICU = $ARGV[0];
+my $UNICODE = $ARGV[1];
-sub readHarfBuzzHeader
-{
- my $file = shift;
- open FH, "< $HARFBUZZ/$file" or die "can't open harfbuzz header $HARFBUZZ/$file\n";
- while (<FH>) {
- if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
- $cc++;
- $catCode{$1} = $cc;
- }
- }
- close FH;
-}
-
-&readHarfBuzzHeader("hb-unicode.h");
-
-die "didn't find HarfBuzz category codes\n" if $cc == -1;
-
-my %scriptCode;
-my @scriptCodeToTag;
my @scriptCodeToName;
+my @idtype;
my $sc = -1;
@@ -129,8 +99,6 @@ sub readIcuHeader
s/SIGN_WRITING/SIGNWRITING/;
if (m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
$sc = $2;
- $scriptCode{$1} = $sc;
- $scriptCodeToTag[$sc] = $3;
$scriptCodeToName[$sc] = $1;
}
}
@@ -170,32 +138,6 @@ my %mappedIdType = (
"Allowed" => 1
);
-my %bidicategoryCode = (
- "L" => 0, # Left-to-Right
- "R" => 1, # Right-to-Left
- "EN" => 2, # European Number
- "ES" => 3, # European Number Separator
- "ET" => 4, # European Number Terminator
- "AN" => 5, # Arabic Number
- "CS" => 6, # Common Number Separator
- "B" => 7, # Paragraph Separator
- "S" => 8, # Segment Separator
- "WS" => 9, # Whitespace
- "ON" => 10, # Other Neutrals
- "LRE" => 11, # Left-to-Right Embedding
- "LRO" => 12, # Left-to-Right Override
- "AL" => 13, # Right-to-Left Arabic
- "RLE" => 14, # Right-to-Left Embedding
- "RLO" => 15, # Right-to-Left Override
- "PDF" => 16, # Pop Directional Format
- "NSM" => 17, # Non-Spacing Mark
- "BN" => 18, # Boundary Neutral
- "FSI" => 19, # First Strong Isolate
- "LRI" => 20, # Left-to-Right Isolate
- "RLI" => 21, # Right-to-left Isolate
- "PDI" => 22 # Pop Direcitonal Isolate
-);
-
my %verticalOrientationCode = (
'U' => 0, # U - Upright, the same orientation as in the code charts
'R' => 1, # R - Rotated 90 degrees clockwise compared to the code charts
@@ -203,141 +145,18 @@ my %verticalOrientationCode = (
'Tr' => 3 # Tr - Transformed typographically, with fallback to Rotated
);
-my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum
- "XX" => 0,
- "AI" => 1,
- "AL" => 2,
- "B2" => 3,
- "BA" => 4,
- "BB" => 5,
- "BK" => 6,
- "CB" => 7,
- "CL" => 8,
- "CM" => 9,
- "CR" => 10,
- "EX" => 11,
- "GL" => 12,
- "HY" => 13,
- "ID" => 14,
- "IN" => 15,
- "IS" => 16,
- "LF" => 17,
- "NS" => 18,
- "NU" => 19,
- "OP" => 20,
- "PO" => 21,
- "PR" => 22,
- "QU" => 23,
- "SA" => 24,
- "SG" => 25,
- "SP" => 26,
- "SY" => 27,
- "ZW" => 28,
- "NL" => 29,
- "WJ" => 30,
- "H2" => 31,
- "H3" => 32,
- "JL" => 33,
- "JT" => 34,
- "JV" => 35,
- "CP" => 36,
- "CJ" => 37,
- "HL" => 38,
- "RI" => 39,
- "EB" => 40,
- "EM" => 41,
- "ZWJ" => 42
-);
-
-my %eastAsianWidthCode = (
- "N" => 0,
- "A" => 1,
- "H" => 2,
- "W" => 3,
- "F" => 4,
- "Na" => 5
-);
-
# initialize default properties
-my @script;
-my @category;
-my @combining;
-my @mirror;
-my @pairedBracketType;
-my @hangul;
-my @casemap;
-my @idtype;
-my @numericvalue;
my @hanVariant;
-my @bidicategory;
my @fullWidth;
my @fullWidthInverse;
my @verticalOrientation;
-my @lineBreak;
-my @eastAsianWidthFWH;
-my @defaultIgnorable;
for (my $i = 0; $i < 0x110000; ++$i) {
- $script[$i] = $scriptCode{"UNKNOWN"};
- $category[$i] = $catCode{"UNASSIGNED"};
- $combining[$i] = 0;
- $pairedBracketType[$i] = 0;
- $casemap[$i] = 0;
- $idtype[$i] = $mappedIdType{'Restricted'};
- $numericvalue[$i] = -1;
$hanVariant[$i] = 0;
- $bidicategory[$i] = $bidicategoryCode{"L"};
$fullWidth[$i] = 0;
$fullWidthInverse[$i] = 0;
$verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
- $lineBreak[$i] = $lineBreakCode{"XX"};
- $eastAsianWidthFWH[$i] = 0;
- $defaultIgnorable[$i] = 0;
}
-# blocks where the default for bidi category is not L
-for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
- $bidicategory[$i] = $bidicategoryCode{"AL"};
-}
-for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
- $bidicategory[$i] = $bidicategoryCode{"R"};
-}
-for my $i (0x20A0..0x20CF) {
- $bidicategory[$i] = $bidicategoryCode{"ET"};
-}
-
-my %ucd2hb = (
-'Cc' => 'CONTROL',
-'Cf' => 'FORMAT',
-'Cn' => 'UNASSIGNED',
-'Co' => 'PRIVATE_USE',
-'Cs' => 'SURROGATE',
-'Ll' => 'LOWERCASE_LETTER',
-'Lm' => 'MODIFIER_LETTER',
-'Lo' => 'OTHER_LETTER',
-'Lt' => 'TITLECASE_LETTER',
-'Lu' => 'UPPERCASE_LETTER',
-'Mc' => 'SPACING_MARK',
-'Me' => 'ENCLOSING_MARK',
-'Mn' => 'NON_SPACING_MARK',
-'Nd' => 'DECIMAL_NUMBER',
-'Nl' => 'LETTER_NUMBER',
-'No' => 'OTHER_NUMBER',
-'Pc' => 'CONNECT_PUNCTUATION',
-'Pd' => 'DASH_PUNCTUATION',
-'Pe' => 'CLOSE_PUNCTUATION',
-'Pf' => 'FINAL_PUNCTUATION',
-'Pi' => 'INITIAL_PUNCTUATION',
-'Po' => 'OTHER_PUNCTUATION',
-'Ps' => 'OPEN_PUNCTUATION',
-'Sc' => 'CURRENCY_SYMBOL',
-'Sk' => 'MODIFIER_SYMBOL',
-'Sm' => 'MATH_SYMBOL',
-'So' => 'OTHER_SYMBOL',
-'Zl' => 'LINE_SEPARATOR',
-'Zp' => 'PARAGRAPH_SEPARATOR',
-'Zs' => 'SPACE_SEPARATOR'
-);
-
# read ReadMe.txt
my @versionInfo;
open FH, "< $UNICODE/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
@@ -347,12 +166,6 @@ while (<FH>) {
}
close FH;
-my $kTitleToUpper = 0x80000000;
-my $kUpperToLower = 0x40000000;
-my $kLowerToTitle = 0x20000000;
-my $kLowerToUpper = 0x10000000;
-my $kCaseMapCharMask = 0x001fffff;
-
# read UnicodeData.txt
open FH, "< $UNICODE/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
while (<FH>) {
@@ -365,12 +178,6 @@ while (<FH>) {
if ($fields[1] =~ /Last/) {
my $last = hex "0x$fields[0]";
do {
- $category[$first] = $catCode{$ucd2hb{$fields[2]}};
- $combining[$first] = $fields[3];
- $bidicategory[$first] = $bidicategoryCode{$fields[4]};
- unless (length($fields[7]) == 0) {
- $numericvalue[$first] = $fields[7];
- }
if ($fields[1] =~ /CJK/) {
@hanVariant[$first] = 3;
}
@@ -381,33 +188,6 @@ while (<FH>) {
}
} else {
my $usv = hex "0x$fields[0]";
- $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
- $combining[$usv] = $fields[3];
- my $upper = hex $fields[12];
- my $lower = hex $fields[13];
- my $title = hex $fields[14];
- # we only store one mapping for each character,
- # but also record what kind of mapping it is
- if ($upper && $lower) {
- $casemap[$usv] |= $kTitleToUpper;
- $casemap[$usv] |= ($usv ^ $upper);
- }
- elsif ($lower) {
- $casemap[$usv] |= $kUpperToLower;
- $casemap[$usv] |= ($usv ^ $lower);
- }
- elsif ($title && ($title != $upper)) {
- $casemap[$usv] |= $kLowerToTitle;
- $casemap[$usv] |= ($usv ^ $title);
- }
- elsif ($upper) {
- $casemap[$usv] |= $kLowerToUpper;
- $casemap[$usv] |= ($usv ^ $upper);
- }
- $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
- unless (length($fields[7]) == 0) {
- $numericvalue[$usv] = $fields[7];
- }
if ($fields[1] =~ /CJK/) {
@hanVariant[$usv] = 3;
}
@@ -427,180 +207,6 @@ while (<FH>) {
}
close FH;
-# read Scripts.txt
-open FH, "< $UNICODE/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
- my $script = uc($3);
- unless (exists $scriptCode{$script}) {
- warn "unknown ICU script $script";
- $scriptCode{$script} = $scriptCode{"UNKNOWN"};
- }
- $script = $scriptCode{$script};
- my $script = $scriptCode{$script};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $script[$i] = $script;
- }
- }
-}
-close FH;
-
-# read BidiMirroring.txt
-my @offsets = ();
-push @offsets, 0;
-
-open FH, "< $UNICODE/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
- my $mirrorOffset = hex("0x$2") - hex("0x$1");
- my $offsetIndex = first { $offsets[$_] eq $mirrorOffset } 0..$#offsets;
- if ($offsetIndex == undef) {
- die "too many offset codes\n" if scalar @offsets == 31;
- push @offsets, $mirrorOffset;
- $offsetIndex = $#offsets;
- }
- $mirror[hex "0x$1"] = $offsetIndex;
- }
-}
-close FH;
-
-# read BidiBrackets.txt
-my %pairedBracketTypeCode = (
- 'N' => 0,
- 'O' => 1,
- 'C' => 2
-);
-open FH, "< $UNICODE/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6});\s*(.)/) {
- my $mirroredChar = $offsets[$mirror[hex "0x$1"]] + hex "0x$1";
- die "bidi bracket does not match mirrored char\n" unless $mirroredChar == hex "0x$2";
- my $pbt = uc($3);
- warn "unknown Bidi Bracket type" unless exists $pairedBracketTypeCode{$pbt};
- $pairedBracketType[hex "0x$1"] = $pairedBracketTypeCode{$pbt};
- }
-}
-close FH;
-
-# read HangulSyllableType.txt
-my %hangulType = (
- 'L' => 0x01,
- 'V' => 0x02,
- 'T' => 0x04,
- 'LV' => 0x03,
- 'LVT' => 0x07
-);
-open FH, "< $UNICODE/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $hangul = uc($3);
- warn "unknown Hangul syllable type" unless exists $hangulType{$hangul};
- $hangul = $hangulType{$hangul};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $hangul[$i] = $hangul;
- }
- }
-}
-close FH;
-
-# read LineBreak.txt
-open FH, "< $UNICODE/LineBreak.txt" or die "can't open UCD file LineBreak.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $lb = uc($3);
- warn "unknown LineBreak class" unless exists $lineBreakCode{$lb};
- $lb = $lineBreakCode{$lb};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $lineBreak[$i] = $lb;
- }
- }
-}
-close FH;
-
-# read EastAsianWidth.txt
-open FH, "< $UNICODE/EastAsianWidth.txt" or die "can't open UCD file EastAsianWidth.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- my $eaw = $3;
- warn "unknown EastAsianWidth class" unless exists $eastAsianWidthCode{$eaw};
- my $isFWH = ($eaw =~ m/^[FWH]$/) ? 1 : 0;
- for (my $i = $start; $i <= $end; ++$i) {
- $eastAsianWidthFWH[$i] = $isFWH;
- }
- }
-}
-close FH;
-
-# read DerivedCoreProperties.txt (for Default-Ignorables)
-open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n";
-push @versionInfo, "";
-
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) {
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $defaultIgnorable[$i] = 1;
- }
- }
-}
-close FH;
-
# read IdentifierStatus.txt
open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n";
push @versionInfo, "";
@@ -759,8 +365,7 @@ struct nsCharProps2 {
unsigned char mIdType:2;
};
|;
-&genTables("", "",
- "CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2, 16, 1, 1);
+&genTables("CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2, 16, 1, 1);
print HEADER "#pragma pack()\n\n";
@@ -776,42 +381,32 @@ sub sprintHanVariants
return sprintf("0x%02x,", $val);
}
## Han Variant data currently unused but may be needed in future, see bug 857481
-## &genTables("", "", "HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
+## &genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
sub sprintFullWidth
{
my $usv = shift;
return sprintf("0x%04x,", $fullWidth[$usv]);
}
-&genTables("", "", "FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
+&genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
sub sprintFullWidthInverse
{
my $usv = shift;
return sprintf("0x%04x,", $fullWidthInverse[$usv]);
}
-&genTables("", "", "FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
+&genTables("FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
print STDERR "Total data = $totalData\n";
-printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
-printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
-printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
-printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
-printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
-
sub genTables
{
- my ($guardBegin, $guardEnd,
- $prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
+ my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
if ($typedef ne '') {
- print HEADER "$guardBegin\n";
print HEADER "$typedef\n";
- print HEADER "$guardEnd\n\n";
}
- print DATA_TABLES "\n$guardBegin\n";
print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
print DATA_TABLES "#define k${prefix}CharBits $charBits\n";
@@ -880,7 +475,6 @@ sub genTables
print DATA_TABLES $i < $#char ? "},\n" : "}\n";
}
print DATA_TABLES "};\n";
- print DATA_TABLES "$guardEnd\n";
my $dataSize = $pmCount * $indexLen * $pmBits/8 +
$chCount * $pageLen * $bytesPerEntry +