From 26adb1ff4193fa5b53a8c5e01b4ea0be3677eff8 Mon Sep 17 00:00:00 2001
From: Job Bautista <jobbautista9@protonmail.com>
Date: Wed, 22 Jun 2022 16:38:14 +0800
Subject: Issue #326 - Part 1a: Update character property table generator
 script for Unicode 9, and adjust APIs to fit the new identifier-type property
 model

Backported from Mozilla bug 1281448.
---
 gfx/thebes/gfxFont.cpp                           |   9 +-
 intl/unicharutil/tools/genUnicodePropertyData.pl | 164 +++++++++++++++++------
 intl/unicharutil/util/nsUnicodeProperties.cpp    |   2 +-
 intl/unicharutil/util/nsUnicodeProperties.h      |  31 +++--
 netwerk/dns/nsIDNService.cpp                     |  66 ++++-----
 5 files changed, 174 insertions(+), 98 deletions(-)

diff --git a/gfx/thebes/gfxFont.cpp b/gfx/thebes/gfxFont.cpp
index f79c5cbd72..8ac64bc1b4 100644
--- a/gfx/thebes/gfxFont.cpp
+++ b/gfx/thebes/gfxFont.cpp
@@ -725,10 +725,9 @@ gfxShapedText::SetGlyphs(uint32_t aIndex, CompressedGlyph aGlyph,
 #define ZWNJ 0x200C
 #define ZWJ  0x200D
 static inline bool
-IsDefaultIgnorable(uint32_t aChar)
+IsIgnorable(uint32_t aChar)
 {
-    return GetIdentifierModification(aChar) == XIDMOD_DEFAULT_IGNORABLE ||
-           aChar == ZWNJ || aChar == ZWJ;
+    return IsDefaultIgnorable(aChar) || aChar == ZWNJ || aChar == ZWJ;
 }
 
 void
@@ -744,7 +743,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont)
     DetailedGlyph *details = AllocateDetailedGlyphs(aIndex, 1);
 
     details->mGlyphID = aChar;
-    if (IsDefaultIgnorable(aChar)) {
+    if (IsIgnorable(aChar)) {
         // Setting advance width to zero will prevent drawing the hexbox
         details->mAdvance = 0;
     } else {
@@ -762,7 +761,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont)
 bool
 gfxShapedText::FilterIfIgnorable(uint32_t aIndex, uint32_t aCh)
 {
-    if (IsDefaultIgnorable(aCh)) {
+    if (IsIgnorable(aCh)) {
         // There are a few default-ignorables of Letter category (currently,
         // just the Hangul filler characters) that we'd better not discard
         // if they're followed by additional characters in the same cluster.
diff --git a/intl/unicharutil/tools/genUnicodePropertyData.pl b/intl/unicharutil/tools/genUnicodePropertyData.pl
index 8b247e83c6..bd86076eac 100755
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@@ -23,6 +23,7 @@
 #       - HangulSyllableType.txt
 #       - LineBreak.txt
 #       - EastAsianWidth.txt
+#       - DerivedCoreProperties.txt
 #       - ReadMe.txt (to record version/date of the UCD)
 #       - Unihan_Variants.txt (from Unihan.zip)
 #     though this may change if we find a need for additional properties.
@@ -30,12 +31,13 @@
 #     The Unicode data files listed above should be together in one directory.
 #
 #     We also require the file
-#        http://www.unicode.org/Public/security/latest/xidmodifications.txt
+#        http://www.unicode.org/Public/security/latest/IdentifierStatus.txt
+#        http://www.unicode.org/Public/security/latest/IdentifierType.txt
 #     This file should be in a sub-directory "security" immediately below the
 #        directory containing the other Unicode data files.
 #
-#     We also require the latest data file for UTR50, currently revision-13:
-#        http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt
+#     We also require the latest data file for UTR50, currently revision-15:
+#        http://www.unicode.org/Public/vertical/revision-15/VerticalOrientation-15.txt
 #     This file should be in a sub-directory "vertical" immediately below the
 #        directory containing the other Unicode data files.
 #
@@ -140,20 +142,35 @@ sub readIcuHeader
 
 die "didn't find ICU script codes\n" if $sc == -1;
 
-my %xidmodCode = (
-'Recommended'       => 0,
-'Inclusion'         => 1,
-'Uncommon_Use'      => 2,
-'Technical'         => 3,
-'Obsolete'          => 4,
-'Aspirational'      => 5,
-'Limited_Use'       => 6,
-'Exclusion'         => 7,
-'Not_XID'           => 8,
-'Not_NFKC'          => 9,
-'Default_Ignorable' => 10,
-'Deprecated'        => 11,
-'not-chars'         => 12
+# We don't currently store these values; %idType is used only to check that
+# properties listed in the IdentifierType.txt file are recognized. We record
+# only the %mappedIdType values that are used by nsIDNService::isLabelSafe.
+# In practice, it would be sufficient for us to read only the last value in
+# IdentifierType.txt, but we check that all values are known so that we'll get
+# a warning if future updates introduce new ones, and can consider whether
+# they need to be taken into account.
+my %idType = (
+  "Not_Character"     => 0,
+  "Recommended"       => 1,
+  "Inclusion"         => 2,
+  "Uncommon_Use"      => 3,
+  "Technical"         => 4,
+  "Obsolete"          => 5,
+  "Aspirational"      => 6,
+  "Limited_Use"       => 7,
+  "Exclusion"         => 8,
+  "Not_XID"           => 9,
+  "Not_NFKC"          => 10,
+  "Default_Ignorable" => 11,
+  "Deprecated"        => 12
+);
+
+# These match the IdentifierType enum in nsUnicodeProperties.h.
+my %mappedIdType = (
+  "Restricted"   => 0,
+  "Allowed"      => 1,
+  "Aspirational" => 2 # for Aspirational characters that are not excluded
+                      # by another attribute.
 );
 
 my %bidicategoryCode = (
@@ -229,7 +246,10 @@ my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum
   "CP" => 36,
   "CJ" => 37,
   "HL" => 38,
-  "RI" => 39
+  "RI" => 39,
+  "EB" => 40,
+  "EM" => 41,
+  "ZWJ" => 42
 );
 
 my %eastAsianWidthCode = (
@@ -249,7 +269,7 @@ my @mirror;
 my @pairedBracketType;
 my @hangul;
 my @casemap;
-my @xidmod;
+my @idtype;
 my @numericvalue;
 my @hanVariant;
 my @bidicategory;
@@ -258,13 +278,14 @@ my @fullWidthInverse;
 my @verticalOrientation;
 my @lineBreak;
 my @eastAsianWidthFWH;
+my @defaultIgnorable;
 for (my $i = 0; $i < 0x110000; ++$i) {
     $script[$i] = $scriptCode{"UNKNOWN"};
     $category[$i] = $catCode{"UNASSIGNED"};
     $combining[$i] = 0;
     $pairedBracketType[$i] = 0;
     $casemap[$i] = 0;
-    $xidmod[$i] = $xidmodCode{"not-chars"};
+    $idtype[$i] = $mappedIdType{'Restricted'};
     $numericvalue[$i] = -1;
     $hanVariant[$i] = 0;
     $bidicategory[$i] = $bidicategoryCode{"L"};
@@ -273,6 +294,7 @@ for (my $i = 0; $i < 0x110000; ++$i) {
     $verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
     $lineBreak[$i] = $lineBreakCode{"XX"};
     $eastAsianWidthFWH[$i] = 0;
+    $defaultIgnorable[$i] = 0;
 }
 
 # blocks where the default for bidi category is not L
@@ -557,25 +579,72 @@ while (<FH>) {
 }
 close FH;
 
-# read xidmodifications.txt
-open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
+# read DerivedCoreProperties.txt (for Default-Ignorables)
+open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n";
 push @versionInfo, "";
+
 while (<FH>) {
-  chomp;
-  unless (/\xef\xbb\xbf/) {
+    chomp;
     push @versionInfo, $_;
-  }
-  last if /Generated:/;
+    last if /Date:/;
+}
+while (<FH>) {
+    s/#.*//;
+    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) {
+        my $start = hex "0x$1";
+        my $end = (defined $2) ? hex "0x$2" : $start;
+        for (my $i = $start; $i <= $end; ++$i) {
+            $defaultIgnorable[$i] = 1;
+        }
+    }
 }
+close FH;
+
+# read IdentifierStatus.txt
+open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n";
+push @versionInfo, "";
 while (<FH>) {
-  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
-    my $xidmod = $3;
-    warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod};
-    $xidmod = $xidmodCode{$xidmod};
+  chomp;
+  s/\xef\xbb\xbf//;
+  push @versionInfo, $_;
+  last if /Date:/;
+
+}
+
+while (<FH>) {
+  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+Allowed/) {
     my $start = hex "0x$1";
     my $end = (defined $2) ? hex "0x$2" : $start;
     for (my $i = $start; $i <= $end; ++$i) {
-      $xidmod[$i] = $xidmod;
+      $idtype[$i] = $mappedIdType{'Allowed'};
+    }
+
+  }
+}
+close FH;
+
+# read IdentifierType.txt, to find Aspirational characters
+open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n";
+push @versionInfo, "";
+while (<FH>) {
+  chomp;
+  s/\xef\xbb\xbf//;
+  push @versionInfo, $_;
+  last if /Date:/;
+}
+while (<FH>) {
+  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^#]+)/) {
+    my $idtype = $3;
+    # split(' ') breaks on any whitespace run and drops leading/trailing blanks (incl. newline).
+    foreach (split(' ', $idtype)) {
+      warn "unknown Identifier Type $_" unless exists $idType{$_};
+    }
+    my $start = hex "0x$1";
+    my $end = (defined $2) ? hex "0x$2" : $start;
+    if ($idtype =~ /Aspirational/ and (not $idtype =~ /Exclusion|Not_XID|Not_NFKC/)) {
+      for (my $i = $start; $i <= $end; ++$i) {
+        $idtype[$i] = $mappedIdType{'Aspirational'};
+      }
     }
   }
 }
@@ -617,8 +686,8 @@ while (<FH>) {
 }
 close FH;
 
-# read VerticalOrientation-13.txt
-open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
+# read VerticalOrientation-15.txt
+open FH, "< $UNICODE/vertical/VerticalOrientation-15.txt" or die "can't open UTR50 data file VerticalOrientation-15.txt\n";
 push @versionInfo, "";
 while (<FH>) {
     chomp;
@@ -738,14 +807,15 @@ sub sprintCharProps2_short
 {
   my $usv = shift;
   return sprintf("{%d,%d},",
-                 $verticalOrientation[$usv], $xidmod[$usv]);
+                 $verticalOrientation[$usv], $idtype[$usv]);
 }
 $type = q|
 struct nsCharProps2 {
-  // Currently only 6 bits are defined here, so 2 more could be added without
-  // affecting the storage requirements for this struct.
+  // Currently only 4 bits are defined here, so 4 more could be added without
+  // affecting the storage requirements for this struct. Or we could pack two
+  // records per byte, at the cost of a slightly more complex accessor.
   unsigned char mVertOrient:2;
-  unsigned char mXidmod:4;
+  unsigned char mIdType:2;
 };
 |;
 &genTables("#if ENABLE_INTL_API", "#endif",
@@ -754,23 +824,31 @@ struct nsCharProps2 {
 sub sprintCharProps2_full
 {
   my $usv = shift;
-  return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d},",
+  return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d,%d},",
                  $script[$usv], $pairedBracketType[$usv],
                  $eastAsianWidthFWH[$usv], $category[$usv],
-                 $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv],
-                 $verticalOrientation[$usv], $lineBreak[$usv]);
+                 $idtype[$usv], $defaultIgnorable[$usv], $bidicategory[$usv],
+                 $verticalOrientation[$usv], $lineBreak[$usv],
+                 $numericvalue[$usv]);
 }
 $type = q|
+// This struct currently requires 5 bytes. We try to ensure that whole-byte
+// fields will not straddle byte boundaries, to optimize access to them.
 struct nsCharProps2 {
   unsigned char mScriptCode:8;
+  // -- byte boundary --
   unsigned char mPairedBracketType:2;
   unsigned char mEastAsianWidthFWH:1;
   unsigned char mCategory:5;
+  // -- byte boundary --
+  unsigned char mIdType:2;
+  unsigned char mDefaultIgnorable:1;
   unsigned char mBidiCategory:5;
-  unsigned char mXidmod:4;
-  signed char   mNumericValue:5;
+  // -- byte boundary --
   unsigned char mVertOrient:2;
-  unsigned char mLineBreak; // only 6 bits actually needed
+  unsigned char mLineBreak:6;
+  // -- byte boundary --
+  signed char   mNumericValue; // only 5 bits are actually needed here
 };
 |;
 &genTables("#if !ENABLE_INTL_API", "#endif",
diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp
index 71f684f0e9..99a87bc958 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -31,7 +31,7 @@ GetCharProps2(uint32_t aCh)
     using namespace mozilla::unicode;
     static const nsCharProps2 undefined = {
         VERTICAL_ORIENTATION_R,
-        XIDMOD_NOT_CHARS
+        IDTYPE_RESTRICTED
     };
     return undefined;
 }
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
index ee1d77252c..2ff69d19a5 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -38,20 +38,13 @@ enum PairedBracketType {
   PAIRED_BRACKET_TYPE_CLOSE = 2
 };
 
-enum XidmodType {
-  XIDMOD_RECOMMENDED,
-  XIDMOD_INCLUSION,
-  XIDMOD_UNCOMMON_USE,
-  XIDMOD_TECHNICAL,
-  XIDMOD_OBSOLETE,
-  XIDMOD_ASPIRATIONAL,
-  XIDMOD_LIMITED_USE,
-  XIDMOD_EXCLUSION,
-  XIDMOD_NOT_XID,
-  XIDMOD_NOT_NFKC,
-  XIDMOD_DEFAULT_IGNORABLE,
-  XIDMOD_DEPRECATED,
-  XIDMOD_NOT_CHARS
+/* Flags for Unicode security IdentifierType.txt attributes. Only a subset
+   of these are currently checked by Gecko, so we only define flags for the
+   ones we need. */
+enum IdentifierType {
+  IDTYPE_RESTRICTED = 0,
+  IDTYPE_ALLOWED = 1,
+  IDTYPE_ASPIRATIONAL = 2,
 };
 
 enum EmojiPresentation {
@@ -179,6 +172,12 @@ IsEastAsianWidthFWH(uint32_t aCh)
   return false;
 }
 
+inline bool
+IsDefaultIgnorable(uint32_t aCh)
+{
+  return u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
+}
+
 inline EmojiPresentation
 GetEmojiPresentation(uint32_t aCh)
 {
@@ -201,8 +200,8 @@ inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
   return VerticalOrientation(GetCharProps2(aCh).mVertOrient);
 }
 
-inline XidmodType GetIdentifierModification(uint32_t aCh) {
-  return XidmodType(GetCharProps2(aCh).mXidmod);
+inline IdentifierType GetIdentifierType(uint32_t aCh) {
+  return IdentifierType(GetCharProps2(aCh).mIdType);
 }
 
 uint32_t GetFullWidth(uint32_t aCh);
diff --git a/netwerk/dns/nsIDNService.cpp b/netwerk/dns/nsIDNService.cpp
index 9210e20b5f..70e255ed15 100644
--- a/netwerk/dns/nsIDNService.cpp
+++ b/netwerk/dns/nsIDNService.cpp
@@ -314,42 +314,42 @@ nsresult nsIDNService::ACEtoUTF8(const nsACString & input, nsACString & _retval,
 
   return NS_OK;
 }
-
-/**
- * Returns |true| if |aString| contains only ASCII characters according
- * to our CRT.
- *
- * @param aString an 8-bit wide string to scan
- */
-inline bool IsAsciiString(mozilla::Span<const char> aString) {
-  for (char c : aString) {
-    if (!nsCRT::IsAscii(c)) {
-      return false;
-    }
-  }
-  return true;
-}
+
+/**
+ * Returns |true| if |aString| contains only ASCII characters according
+ * to our CRT.
+ *
+ * @param aString an 8-bit wide string to scan
+ */
+inline bool IsAsciiString(mozilla::Span<const char> aString) {
+  for (char c : aString) {
+    if (!nsCRT::IsAscii(c)) {
+      return false;
+    }
+  }
+  return true;
+}
 
 NS_IMETHODIMP nsIDNService::IsACE(const nsACString & input, bool *_retval)
 {
   // look for the ACE prefix in the input string.  it may occur
   // at the beginning of any segment in the domain name.  for
   // example: "www.xn--ENCODED.com"
-  if (!IsAsciiString(input)) {
-    *_retval = false;
-    return NS_OK;
-  }
-  auto stringContains = [](const nsACString& haystack,
-                           const nsACString& needle) {
-    return std::search(haystack.BeginReading(), haystack.EndReading(),
-                       needle.BeginReading(),
-                       needle.EndReading()) != haystack.EndReading();
-  };
-
-  *_retval = StringBeginsWith(input, NS_LITERAL_CSTRING("xn--")) ||
-             (!input.IsEmpty() && input[0] != '.' &&
-              stringContains(input, NS_LITERAL_CSTRING(".xn--")));
-  return NS_OK;
+  if (!IsAsciiString(input)) {
+    *_retval = false;
+    return NS_OK;
+  }
+  auto stringContains = [](const nsACString& haystack,
+                           const nsACString& needle) {
+    return std::search(haystack.BeginReading(), haystack.EndReading(),
+                       needle.BeginReading(),
+                       needle.EndReading()) != haystack.EndReading();
+  };
+
+  *_retval = StringBeginsWith(input, NS_LITERAL_CSTRING("xn--")) ||
+             (!input.IsEmpty() && input[0] != '.' &&
+              stringContains(input, NS_LITERAL_CSTRING(".xn--")));
+  return NS_OK;
 }
 
 NS_IMETHODIMP nsIDNService::Normalize(const nsACString & input,
@@ -730,11 +730,11 @@ bool nsIDNService::isLabelSafe(const nsAString &label)
     // Check for restricted characters; aspirational scripts are NOT permitted,
     // in anticipation of the category being merged into Limited-Use scripts
     // in the upcoming (Unicode 10.0-based) revision of UAX #31.
-    XidmodType xm = GetIdentifierModification(ch);
-    if (xm != XIDMOD_RECOMMENDED &&
-        xm != XIDMOD_INCLUSION) {
+    IdentifierType idType = GetIdentifierType(ch);
+    if (idType == IDTYPE_RESTRICTED || idType == IDTYPE_ASPIRATIONAL) {
       return false;
     }
+    MOZ_ASSERT(idType == IDTYPE_ALLOWED);
 
     // Check for mixed script
     Script script = GetScriptCode(ch);
-- 
cgit v1.2.3