diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /js/src/vm/Unicode.h | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | uxp-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz |
Add m-esr52 at 52.6.0
Diffstat (limited to 'js/src/vm/Unicode.h')
-rw-r--r-- | js/src/vm/Unicode.h | 498 |
1 files changed, 498 insertions, 0 deletions
diff --git a/js/src/vm/Unicode.h b/js/src/vm/Unicode.h new file mode 100644 index 0000000000..8b538d06de --- /dev/null +++ b/js/src/vm/Unicode.h @@ -0,0 +1,498 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef vm_Unicode_h +#define vm_Unicode_h + +#include "jspubtd.h" +#include "vm/UnicodeNonBMP.h" + +extern const bool js_isidstart[]; +extern const bool js_isident[]; +extern const bool js_isspace[]; + +namespace js { +namespace unicode { + +/* + * This namespace contains all the knowledge required to handle Unicode + * characters in JavaScript. + * + * SPACE + * Every character that is either in the ECMAScript class WhiteSpace + * (ES2016, § 11.2) or in LineTerminator (ES2016, § 11.3). + * + * WhiteSpace + * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF + * and every other Unicode character with the General Category "Zs". + * See <http://www.unicode.org/reports/tr44/#UnicodeData.txt> for more + * information about General Categories and the UnicodeData.txt file. + * + * LineTerminator + * \u000A, \u000D, \u2028, \u2029 + * + * UNICODE_ID_START + * These are all characters with the Unicode property «ID_Start». + * + * UNICODE_ID_CONTINUE_ONLY + * These are all characters with the Unicode property «ID_Continue» minus all + * characters with the Unicode property «ID_Start». + * And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6) + * + * UNICODE_ID_CONTINUE + * These are all characters with the Unicode property «ID_Continue». + * And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6) + * + * Attention: UNICODE_ID_START is _not_ IdentifierStart, but you could build + * a matcher for the real IdentifierPart like this: + * + * if char in ['$', '_']: + * return True + * if GetFlag(char) & UNICODE_ID_CONTINUE: + * return True + * + */ + +namespace CharFlag { + const uint8_t SPACE = 1 << 0; + const uint8_t UNICODE_ID_START = 1 << 1; + const uint8_t UNICODE_ID_CONTINUE_ONLY = 1 << 2; + const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY; +} + +const char16_t BYTE_ORDER_MARK2 = 0xFFFE; +const char16_t NO_BREAK_SPACE = 0x00A0; + +const char16_t LeadSurrogateMin = 0xD800; +const char16_t LeadSurrogateMax = 0xDBFF; +const char16_t TrailSurrogateMin = 0xDC00; +const char16_t TrailSurrogateMax = 0xDFFF; + +const uint32_t UTF16Max = 0xFFFF; +const uint32_t NonBMPMin = 0x10000; +const uint32_t NonBMPMax = 0x10FFFF; + +class CharacterInfo { + /* + * upperCase and lowerCase normally store the delta between two + * letters. For example the lower case alpha (a) has the char code + * 97, and the upper case alpha (A) has 65. So for "a" we would + * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, + * because this char is already in lower case. + * Well, not -32 exactly, but (2**16 - 32) to induce + * unsigned overflow with identical mathematical behavior. + * For upper case alpha, we would store 0 in upperCase and 32 in + * lowerCase (65 + 32 = 97). + * + * We use deltas to reuse information for multiple characters. For + * example the whole lower case latin alphabet fits into one entry, + * because it's always a UnicodeLetter and upperCase contains + * -32. + */ + public: + uint16_t upperCase; + uint16_t lowerCase; + uint8_t flags; + + inline bool isSpace() const { + return flags & CharFlag::SPACE; + } + + inline bool isUnicodeIDStart() const { + return flags & CharFlag::UNICODE_ID_START; + } + + inline bool isUnicodeIDContinue() const { + // Also matches <ZWNJ> and <ZWJ>! + return flags & CharFlag::UNICODE_ID_CONTINUE; + } +}; + +extern const uint8_t index1[]; +extern const uint8_t index2[]; +extern const CharacterInfo js_charinfo[]; + +inline const CharacterInfo& +CharInfo(char16_t code) +{ + const size_t shift = 6; + size_t index = index1[code >> shift]; + index = index2[(index << shift) + (code & ((1 << shift) - 1))]; + + return js_charinfo[index]; +} + +inline bool +IsIdentifierStart(char16_t ch) +{ + /* + * ES2016 11.6 IdentifierStart + * $ (dollar sign) + * _ (underscore) + * or any character with the Unicode property «ID_Start». + * + * We use a lookup table for small and thus common characters for speed. + */ + + if (ch < 128) + return js_isidstart[ch]; + + return CharInfo(ch).isUnicodeIDStart(); +} + +inline bool +IsIdentifierStart(uint32_t codePoint) +{ + // TODO: Supplemental code points not yet supported (bug 1197230). + return codePoint <= UTF16Max && IsIdentifierStart(char16_t(codePoint)); +} + +inline bool +IsIdentifierPart(char16_t ch) +{ + /* + * ES2016 11.6 IdentifierPart + * $ (dollar sign) + * _ (underscore) + * <ZWNJ> + * <ZWJ> + * or any character with the Unicode property «ID_Continue». + * + * We use a lookup table for small and thus common characters for speed. + */ + + if (ch < 128) + return js_isident[ch]; + + return CharInfo(ch).isUnicodeIDContinue(); +} + +inline bool +IsIdentifierPart(uint32_t codePoint) +{ + // TODO: Supplemental code points not yet supported (bug 1197230). + return codePoint <= UTF16Max && IsIdentifierPart(char16_t(codePoint)); +} + +inline bool +IsUnicodeIDStart(char16_t ch) +{ + return CharInfo(ch).isUnicodeIDStart(); +} + +inline bool +IsSpace(char16_t ch) +{ + /* + * IsSpace checks if some character is included in the merged set + * of WhiteSpace and LineTerminator, specified by ES2016 11.2 and 11.3. + * We combined them, because in practice nearly every + * calling function wants this, except some code in the tokenizer. + * + * We use a lookup table for ASCII-7 characters, because they are + * very common and must be handled quickly in the tokenizer. + * NO-BREAK SPACE is supposed to be the most common character not in + * this range, so we inline this case, too. + */ + + if (ch < 128) + return js_isspace[ch]; + + if (ch == NO_BREAK_SPACE) + return true; + + return CharInfo(ch).isSpace(); +} + +inline bool +IsSpaceOrBOM2(char16_t ch) +{ + if (ch < 128) + return js_isspace[ch]; + + /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */ + if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2) + return true; + + return CharInfo(ch).isSpace(); +} + +inline char16_t +ToUpperCase(char16_t ch) +{ + if (ch < 128) { + if (ch >= 'a' && ch <= 'z') + return ch - ('a' - 'A'); + return ch; + } + + const CharacterInfo& info = CharInfo(ch); + + return uint16_t(ch) + info.upperCase; +} + +inline char16_t +ToLowerCase(char16_t ch) +{ + if (ch < 128) { + if (ch >= 'A' && ch <= 'Z') + return ch + ('a' - 'A'); + return ch; + } + + const CharacterInfo& info = CharInfo(ch); + + return uint16_t(ch) + info.lowerCase; +} + +// Returns true iff ToUpperCase(ch) != ch. +inline bool +CanUpperCase(char16_t ch) +{ + if (ch < 128) + return ch >= 'a' && ch <= 'z'; + return CharInfo(ch).upperCase != 0; +} + +// Returns true iff ToLowerCase(ch) != ch. +inline bool +CanLowerCase(char16_t ch) +{ + if (ch < 128) + return ch >= 'A' && ch <= 'Z'; + return CharInfo(ch).lowerCase != 0; +} + +#define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \ + return true; + +inline bool +CanUpperCaseNonBMP(char16_t lead, char16_t trail) +{ + FOR_EACH_NON_BMP_UPPERCASE(CHECK_RANGE) + return false; +} + +inline bool +CanLowerCaseNonBMP(char16_t lead, char16_t trail) +{ + FOR_EACH_NON_BMP_LOWERCASE(CHECK_RANGE) + return false; +} + +#undef CHECK_RANGE + +inline char16_t +ToUpperCaseNonBMPTrail(char16_t lead, char16_t trail) +{ +#define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \ + return trail + DIFF; + FOR_EACH_NON_BMP_UPPERCASE(CALC_TRAIL) +#undef CALL_TRAIL + + return trail; +} + +inline char16_t +ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail) +{ +#define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \ + return trail + DIFF; + FOR_EACH_NON_BMP_LOWERCASE(CALC_TRAIL) +#undef CALL_TRAIL + + return trail; +} + +/* + * For a codepoint C, CodepointsWithSameUpperCaseInfo stores three offsets + * from C to up to three codepoints with same uppercase (no codepoint in + * UnicodeData.txt has more than three such codepoints). + * + * To illustrate, consider the codepoint U+0399 GREEK CAPITAL LETTER IOTA, the + * uppercased form of these three codepoints: + * + * U+03B9 GREEK SMALL LETTER IOTA + * U+1FBE GREEK PROSGEGRAMMENI + * U+0345 COMBINING GREEK YPOGEGRAMMENI + * + * For the CodepointsWithSameUpperCaseInfo corresponding to this codepoint, + * delta{1,2,3} are 16-bit modular deltas from 0x0399 to each respective + * codepoint: + * uint16_t(0x03B9 - 0x0399), + * uint16_t(0x1FBE - 0x0399), + * uint16_t(0x0345 - 0x0399) + * in an unimportant order. + * + * If there are fewer than three other codepoints, some fields are zero. + * Consider the codepoint U+03B9 above, the other two codepoints U+1FBE and + * U+0345 have same uppercase (U+0399 is not). For the + * CodepointsWithSameUpperCaseInfo corresponding to this codepoint, + * delta{1,2,3} are: + * uint16_t(0x1FBE - 0x03B9), + * uint16_t(0x0345 - 0x03B9), + * uint16_t(0) + * in an unimportant order. + * + * Because multiple codepoints map to a single CodepointsWithSameUpperCaseInfo, + * a CodepointsWithSameUpperCaseInfo and its delta{1,2,3} have no meaning + * standing alone: they have meaning only with respect to a codepoint mapping + * to that CodepointsWithSameUpperCaseInfo. + */ +class CodepointsWithSameUpperCaseInfo +{ + public: + uint16_t delta1; + uint16_t delta2; + uint16_t delta3; +}; + +extern const uint8_t codepoints_with_same_upper_index1[]; +extern const uint8_t codepoints_with_same_upper_index2[]; +extern const CodepointsWithSameUpperCaseInfo js_codepoints_with_same_upper_info[]; + +class CodepointsWithSameUpperCase +{ + const CodepointsWithSameUpperCaseInfo& info_; + const char16_t code_; + + static const CodepointsWithSameUpperCaseInfo& computeInfo(char16_t code) { + const size_t shift = 6; + size_t index = codepoints_with_same_upper_index1[code >> shift]; + index = codepoints_with_same_upper_index2[(index << shift) + (code & ((1 << shift) - 1))]; + return js_codepoints_with_same_upper_info[index]; + } + + public: + explicit CodepointsWithSameUpperCase(char16_t code) + : info_(computeInfo(code)), + code_(code) + {} + + char16_t other1() const { return uint16_t(code_) + info_.delta1; } + char16_t other2() const { return uint16_t(code_) + info_.delta2; } + char16_t other3() const { return uint16_t(code_) + info_.delta3; } +}; + +class FoldingInfo { + public: + uint16_t folding; + uint16_t reverse1; + uint16_t reverse2; + uint16_t reverse3; +}; + +extern const uint8_t folding_index1[]; +extern const uint8_t folding_index2[]; +extern const FoldingInfo js_foldinfo[]; + +inline const FoldingInfo& +CaseFoldInfo(char16_t code) +{ + const size_t shift = 6; + size_t index = folding_index1[code >> shift]; + index = folding_index2[(index << shift) + (code & ((1 << shift) - 1))]; + return js_foldinfo[index]; +} + +inline char16_t +FoldCase(char16_t ch) +{ + const FoldingInfo& info = CaseFoldInfo(ch); + return uint16_t(ch) + info.folding; +} + +inline char16_t +ReverseFoldCase1(char16_t ch) +{ + const FoldingInfo& info = CaseFoldInfo(ch); + return uint16_t(ch) + info.reverse1; +} + +inline char16_t +ReverseFoldCase2(char16_t ch) +{ + const FoldingInfo& info = CaseFoldInfo(ch); + return uint16_t(ch) + info.reverse2; +} + +inline char16_t +ReverseFoldCase3(char16_t ch) +{ + const FoldingInfo& info = CaseFoldInfo(ch); + return uint16_t(ch) + info.reverse3; +} + +inline bool +IsSupplementary(uint32_t codePoint) +{ + return codePoint >= NonBMPMin && codePoint <= NonBMPMax; +} + +inline bool +IsLeadSurrogate(uint32_t codePoint) +{ + return codePoint >= LeadSurrogateMin && codePoint <= LeadSurrogateMax; +} + +inline bool +IsTrailSurrogate(uint32_t codePoint) +{ + return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax; +} + +inline char16_t +LeadSurrogate(uint32_t codePoint) +{ + MOZ_ASSERT(IsSupplementary(codePoint)); + + return char16_t((codePoint >> 10) + (LeadSurrogateMin - (NonBMPMin >> 10))); +} + +inline char16_t +TrailSurrogate(uint32_t codePoint) +{ + MOZ_ASSERT(IsSupplementary(codePoint)); + + return char16_t((codePoint & 0x3FF) | TrailSurrogateMin); +} + +inline void +UTF16Encode(uint32_t codePoint, char16_t* lead, char16_t* trail) +{ + MOZ_ASSERT(IsSupplementary(codePoint)); + + *lead = LeadSurrogate(codePoint); + *trail = TrailSurrogate(codePoint); +} + +static inline void +UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index) +{ + if (!IsSupplementary(codePoint)) { + elements[(*index)++] = char16_t(codePoint); + } else { + elements[(*index)++] = LeadSurrogate(codePoint); + elements[(*index)++] = TrailSurrogate(codePoint); + } +} + +inline uint32_t +UTF16Decode(char16_t lead, char16_t trail) +{ + MOZ_ASSERT(IsLeadSurrogate(lead)); + MOZ_ASSERT(IsTrailSurrogate(trail)); + + return (lead << 10) + trail + (NonBMPMin - (LeadSurrogateMin << 10) - TrailSurrogateMin); +} + +} /* namespace unicode */ +} /* namespace js */ + +#endif /* vm_Unicode_h */ |