diff options
Diffstat (limited to 'js/src/vm/make_unicode.py')
-rwxr-xr-x | js/src/vm/make_unicode.py | 324 |
1 files changed, 315 insertions, 9 deletions
diff --git a/js/src/vm/make_unicode.py b/js/src/vm/make_unicode.py index 5565d7d14..83f0d004b 100755 --- a/js/src/vm/make_unicode.py +++ b/js/src/vm/make_unicode.py @@ -133,6 +133,17 @@ def read_derived_core_properties(derived_core_properties): for char in range(int(start, 16), int(end, 16) + 1): yield (char, char_property) +def int_ranges(ints): + """ Yields consecutive ranges (inclusive) from integer values. """ + from itertools import tee, izip_longest + + (a, b) = tee(sorted(ints)) + start = next(b) + for (curr, succ) in izip_longest(a, b): + if curr + 1 != succ: + yield (start, curr) + start = succ + def utf16_encode(code): NonBMPMin = 0x10000 LeadSurrogateMin = 0xD800 @@ -144,37 +155,65 @@ def utf16_encode(code): return lead, trail def make_non_bmp_convert_macro(out_file, name, convert_map): + # Find continuous range in convert_map. convert_list = [] entry = None for code in sorted(convert_map.keys()): + lead, trail = utf16_encode(code) converted = convert_map[code] diff = converted - code - if entry and code == entry['code'] + entry['length'] and diff == entry['diff']: + if (entry and code == entry['code'] + entry['length'] and + diff == entry['diff'] and lead == entry['lead']): + entry['length'] += 1 continue - entry = { 'code': code, 'diff': diff, 'length': 1 } + entry = { + 'code': code, + 'diff': diff, + 'length': 1, + 'lead': lead, + 'trail': trail, + } convert_list.append(entry) + # Generate macro call for each range. lines = [] for entry in convert_list: from_code = entry['code'] to_code = entry['code'] + entry['length'] - 1 diff = entry['diff'] - from_lead, from_trail = utf16_encode(from_code) - to_lead, to_trail = utf16_encode(to_code) - - assert from_lead == to_lead + lead = entry['lead'] + from_trail = entry['trail'] + to_trail = entry['trail'] + entry['length'] - 1 lines.append(' macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'.format( - from_code, to_code, from_lead, from_trail, to_trail, diff)) + from_code, to_code, lead, from_trail, to_trail, diff)) out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name)) out_file.write(' \\\n'.join(lines)) out_file.write('\n') +def for_each_non_bmp_group(group_set): + # Find continuous range in group_set. + group_list = [] + entry = None + for code in sorted(group_set.keys()): + if entry and code == entry['code'] + entry['length']: + entry['length'] += 1 + continue + + entry = { + 'code': code, + 'length': 1 + } + group_list.append(entry) + + for entry in group_list: + yield (entry['code'], entry['code'] + entry['length'] - 1) + def process_derived_core_properties(derived_core_properties): id_start = set() id_continue = set() @@ -203,6 +242,9 @@ def process_unicode_data(unicode_data, derived_core_properties): non_bmp_lower_map = {} non_bmp_upper_map = {} + non_bmp_id_start_set = {} + non_bmp_id_cont_set = {} + non_bmp_space_set = {} (id_start, id_continue) = process_derived_core_properties(derived_core_properties) @@ -235,6 +277,13 @@ def process_unicode_data(unicode_data, derived_core_properties): non_bmp_lower_map[code] = lower if code != upper: non_bmp_upper_map[code] = upper + if category == 'Zs': + non_bmp_space_set[code] = 1 + test_space_table.append(code) + if code in id_start: + non_bmp_id_start_set[code] = 1 + if code in id_continue: + non_bmp_id_cont_set[code] = 1 continue # we combine whitespace and lineterminators because in pratice we don't need them separated @@ -304,6 +353,8 @@ def process_unicode_data(unicode_data, derived_core_properties): table, index, same_upper_table, same_upper_index, non_bmp_lower_map, non_bmp_upper_map, + non_bmp_space_set, + non_bmp_id_start_set, non_bmp_id_cont_set, test_table, test_space_table, ) @@ -401,6 +452,16 @@ def make_non_bmp_file(version, #ifndef vm_UnicodeNonBMP_h #define vm_UnicodeNonBMP_h +// |macro| receives the following arguments +// macro(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) +// FROM: code point where the range starts +// TO: code point where the range ends +// LEAD: common lead surrogate of FROM and TO +// TRAIL_FROM: trail surrogate of FROM +// TRAIL_FROM: trail surrogate of TO +// DIFF: the difference between the code point in the range and +// converted code point + """) make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map) @@ -514,7 +575,9 @@ if (typeof reportCompare === "function") def make_unicode_file(version, table, index, same_upper_table, same_upper_index, - folding_table, folding_index): + folding_table, folding_index, + non_bmp_space_set, + non_bmp_id_start_set, non_bmp_id_cont_set): index1, index2, shift = splitbins(index) # Don't forget to update CharInfo in Unicode.h if you need to change this @@ -671,6 +734,43 @@ def make_unicode_file(version, dump(folding_index2, 'folding_index2', data_file) data_file.write('\n') + # If the following assert fails, it means space character is added to + # non-BMP area. In that case the following code should be uncommented + # and the corresponding code should be added to frontend. + assert len(non_bmp_space_set.keys()) == 0 + + data_file.write("""\ +bool +js::unicode::IsIdentifierStartNonBMP(uint32_t codePoint) +{ +""") + + for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_start_set): + data_file.write("""\ + if (codePoint >= 0x{:x} && codePoint <= 0x{:x}) + return true; +""".format(from_code, to_code)) + + data_file.write("""\ + return false; +} + +bool +js::unicode::IsIdentifierPartNonBMP(uint32_t codePoint) +{ +""") + + for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_cont_set): + data_file.write("""\ + if (codePoint >= 0x{:x} && codePoint <= 0x{:x}) + return true; +""".format(from_code, to_code)) + + data_file.write("""\ + return false; +} +""") + def getsize(data): """ return smallest possible integer size for the given array """ maxdata = max(data) @@ -740,6 +840,204 @@ def splitbins(t): assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] return best +def make_irregexp_tables(version, + table, index, + folding_table, folding_index, + test_table): + import string + from functools import partial + from itertools import chain, ifilter, imap + + MAX_ASCII = 0x7F + MAX_LATIN1 = 0xFF + LEAD_SURROGATE_MIN = 0xD800 + TRAIL_SURROGATE_MAX = 0xDFFF + + def hex2(n): + assert 0 <= n and n < 16**2 + return '0x{:02X}'.format(n) + + def hex4(n): + assert 0 <= n and n < 16**4 + return '0x{:04X}'.format(n) + + def uhex4(n): + assert 0 <= n and n < 16**4 + return 'U+{:04X}'.format(n) + + def case_info(code): + assert 0 <= code and code <= MAX_BMP + (upper, lower, flags) = table[index[code]] + return ((code + upper) & 0xffff, (code + lower) & 0xffff, flags) + + def is_space(code): + (_, _, flags) = case_info(code) + return bool(flags & FLAG_SPACE) + + def to_upper(code): + (upper, _, _) = case_info(code) + return upper + + def casefold(code): + assert 0 <= code and code <= MAX_BMP + (folding, _, _, _) = folding_table[folding_index[code]] + return (code + folding) & 0xffff + + def casefolds_to_ascii(code): + return casefold(code) <= MAX_ASCII + + def casefolds_to_latin1(code): + return casefold(code) <= MAX_LATIN1 + + def casemaps_to_nonlatin1(code): + upper = to_upper(code) + return upper > MAX_LATIN1 + + def char_name(code): + assert 0 <= code and code <= MAX_BMP + if code not in test_table: + return '<Unused>' + if code == LEAD_SURROGATE_MIN: + return '<Lead Surrogate Min>' + if code == TRAIL_SURROGATE_MAX: + return '<Trail Surrogate Max>' + (_, _, name, alias) = test_table[code] + return name if not name.startswith('<') else alias + + def write_character_range(println, name, characters): + char_ranges = list(int_ranges(characters)) + println('') + println('const int js::irregexp::k{}Ranges[] = {{'.format(name)) + for (start, end) in char_ranges: + s_name = char_name(start) + e_name = char_name(end) + println(' {}, {} + 1, // {}'.format(hex4(start), hex4(end), + '{}..{}'.format(s_name, e_name) + if start != end else s_name)) + println(' {} + 1'.format(hex4(MAX_BMP))) + println('};') + println('const int js::irregexp::k{}RangeCount = {};'.format(name, + len(char_ranges) * 2 + 1)) + + def write_character_test(println, test, consequent, default): + # Latin1 characters which, when case-mapped through + # String.prototype.toUpperCase(), canonicalize to a non-Latin1 character. + # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize + casemapped_to_nonlatin1 = ifilter(casemaps_to_nonlatin1, xrange(0, MAX_LATIN1 + 1)) + + def casemap_closure(ch): + upper = to_upper(ch) + return (ch, [c for c in xrange(MAX_LATIN1 + 1, MAX_BMP + 1) if upper == to_upper(c)]) + + # Mapping from Latin1 characters to the list of case map equivalent + # non-Latin1 characters. + casemap_for_latin1 = dict(chain(imap(casemap_closure, casemapped_to_nonlatin1))) + + # Non-latin1 characters which, when Unicode case-folded, canonicalize to + # a Latin1 character. + # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize + casefolded_to_latin1 = ifilter(casefolds_to_latin1, xrange(MAX_LATIN1 + 1, MAX_BMP + 1)) + + println(' if (unicode) {') + for ch in casefolded_to_latin1: + casefolded = casefold(ch) + # Skip if also handled below for case mapping. + if casefolded in casemap_for_latin1 and ch in casemap_for_latin1[casefolded]: + continue + println(' // "{}" case folds to "{}".'.format(char_name(ch), + char_name(casefolded))) + println(' if ({})'.format(test(ch))) + println(' return {};'.format(consequent(casefolded))) + println(' }') + println('') + for (ch, casemapped_chars) in casemap_for_latin1.iteritems(): + for casemapped in casemapped_chars: + println(' // "{}" case maps to "{}".'.format(char_name(casemapped), + char_name(ch))) + println(' if ({})'.format(' || '.join(imap(test, casemapped_chars)))) + println(' return {};'.format(consequent(ch))) + println(' return {};'.format(default)) + + with io.open('../irregexp/RegExpCharacters-inl.h', 'wb') as chars_file: + write = partial(print, file=chars_file, sep='', end='') + println = partial(write, end='\n') + + write(warning_message) + write(unicode_version_message.format(version)) + + println('#ifndef V8_JSREGEXPCHARACTERS_INL_H_') + println('#define V8_JSREGEXPCHARACTERS_INL_H_') + println('') + println('namespace js {') + println('') + println('namespace irregexp {') + println('') + + println('static inline bool') + println('RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)') + println('{') + write_character_test(println, lambda ch: 'range.Contains({})'.format(hex4(ch)), + lambda _: 'true', 'false') + println('}') + + println('') + println('} } // namespace js::irregexp') + println('') + println('#endif // V8_JSREGEXPCHARACTERS_INL_H_') + + with io.open('../irregexp/RegExpCharacters.cpp', 'wb') as chars_file: + write = partial(print, file=chars_file, sep='', end='') + println = partial(write, end='\n') + character_range = partial(write_character_range, println) + + # Characters in \s, 21.2.2.12 CharacterClassEscape. + space_chars = filter(is_space, xrange(0, MAX_BMP + 1)) + + # Characters in \d, 21.2.2.12 CharacterClassEscape. + digit_chars = map(ord, string.digits) + assert all(ch <= MAX_ASCII for ch in digit_chars) + + # Characters in \w, 21.2.2.12 CharacterClassEscape. + word_chars = map(ord, string.digits + string.ascii_letters + '_') + assert all(ch <= MAX_ASCII for ch in word_chars) + + # Characters which case-fold to characters in \w. + ignorecase_word_chars = (word_chars + + filter(casefolds_to_ascii, xrange(MAX_ASCII + 1, MAX_BMP + 1))) + + # Surrogate characters. + surrogate_chars = range(LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX + 1) + + write(warning_message) + write(unicode_version_message.format(version)) + println('#include "irregexp/RegExpCharacters.h"') + println('') + println('#include "mozilla/Assertions.h"') + println('') + + println('char16_t') + println('js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)') + println('{') + println(' MOZ_ASSERT(c > {}, "Character mustn\'t be Latin1");'.format(hex2(MAX_LATIN1))) + write_character_test(println, lambda ch: 'c == {}'.format(hex4(ch)), hex2, '0') + println('}') + + character_range('Space', space_chars) + character_range('SpaceAndSurrogate', space_chars + surrogate_chars) + + character_range('Word', word_chars) + character_range('IgnoreCaseWord', ignorecase_word_chars) + character_range('WordAndSurrogate', word_chars + surrogate_chars) + character_range('NegatedIgnoreCaseWordAndSurrogate', + set(xrange(0, MAX_BMP + 1)) - set(ignorecase_word_chars + surrogate_chars)) + + character_range('Digit', digit_chars) + character_range('DigitAndSurrogate', digit_chars + surrogate_chars) + + character_range('Surrogate', surrogate_chars) + + character_range('LineTerminator', line_terminator) + def update_unicode(args): import urllib2 @@ -791,6 +1089,8 @@ def update_unicode(args): table, index, same_upper_table, same_upper_index, non_bmp_lower_map, non_bmp_upper_map, + non_bmp_space_set, + non_bmp_id_start_set, non_bmp_id_cont_set, test_table, test_space_table ) = process_unicode_data(unicode_data, derived_core_properties) ( @@ -803,10 +1103,16 @@ def update_unicode(args): make_unicode_file(unicode_version, table, index, same_upper_table, same_upper_index, - folding_table, folding_index) + folding_table, folding_index, + non_bmp_space_set, + non_bmp_id_start_set, non_bmp_id_cont_set) make_non_bmp_file(unicode_version, non_bmp_lower_map, non_bmp_upper_map, non_bmp_folding_map, non_bmp_rev_folding_map) + make_irregexp_tables(unicode_version, + table, index, + folding_table, folding_index, + test_table) make_bmp_mapping_test(unicode_version, test_table) make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map) |