summaryrefslogtreecommitdiff
path: root/gfx/thebes/nsUnicodeRange.cpp
blob: af53f21fa6100364da2c8383bcdd0f04e3e3f10c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicodeRange.h"

/**********************************************************************
 * Unicode subranges as defined in unicode 3.0
 * x-western  -> latin
 *  0000 - 036f
 *  1e00 - 1eff
 *  2000 - 206f  (general punctuation)
 *  20a0 - 20cf  (currency symbols)
 *  2100 - 214f  (letterlike symbols)
 *  2150 - 218f  (Number Forms)
 * el         -> greek
 *  0370 - 03ff
 *  1f00 - 1fff
 * x-cyrillic -> cyrillic
 *  0400 - 04ff
 * he         -> hebrew
 *  0590 - 05ff
 * ar         -> arabic
 *  0600 - 06ff
 *  fb50 - fdff (arabic presentation forms)
 *  fe70 - feff (arabic presentation forms b)
 * th - thai
 *  0e00 - 0e7f
 * ko        -> korean
 *  ac00 - d7af  (hangul Syllables)
 *  1100 - 11ff    (jamo)
 *  3130 - 318f (hangul compatibility jamo)
 * ja
 *  3040 - 309f (hiragana)
 *  30a0 - 30ff (katakana)
 * zh-CN
 * zh-TW
 *
 * CJK
 *  3100 - 312f (bopomofo)
 *  31a0 - 31bf (bopomofo extended)
 *  3000 - 303f (CJK Symbols and Punctuation) 
 *  2e80 - 2eff (CJK radicals supplement)
 *  2f00 - 2fdf (Kangxi Radicals)
 *  2ff0 - 2fff (Ideographic Description Characters)
 *  3190 - 319f (kanbun)
 *  3200 - 32ff (Enclosed CJK letters and Months)
 *  3300 - 33ff (CJK compatibility)
 *  3400 - 4dbf (CJK Unified Ideographs Extension A)
 *  4e00 - 9faf (CJK Unified Ideographs)
 *  f900 - fa5f (CJK Compatibility Ideographs)
 *  fe30 - fe4f (CJK compatibility Forms)
 *  ff00 - ffef (halfwidth and fullwidth forms)
 *
 * Armenian
 *  0530 - 058f 
 * Sriac 
 *  0700 - 074f
 * Thaana
 *  0780 - 07bf
 * Devanagari
 *  0900 - 097f
 * Bengali
 *  0980 - 09ff
 * Gurmukhi
 *  0a00 - 0a7f
 * Gujarati
 *  0a80 - 0aff
 * Oriya
 *  0b00 - 0b7f
 * Tamil
 *  0b80 - 0bff
 * Telugu
 *  0c00 - 0c7f
 * Kannada
 *  0c80 - 0cff
 * Malayalam
 *  0d00 - 0d7f
 * Sinhala
 *  0d80 - 0def
 * Lao
 *  0e80 - 0eff
 * Tibetan
 *  0f00 - 0fbf
 * Myanmar
 *  1000 - 109f
 * Georgian
 *  10a0 - 10ff
 * Ethiopic
 *  1200 - 137f
 * Cherokee
 *  13a0 - 13ff
 * Canadian Aboriginal Syllabics
 *  1400 - 167f
 * Ogham
 *  1680 - 169f
 * Runic 
 *  16a0 - 16ff
 * Khmer
 *  1780 - 17ff
 * Mongolian
 *  1800 - 18af
 * Misc - superscripts and subscripts
 *  2070 - 209f
 * Misc - Combining Diacritical Marks for Symbols
 *  20d0 - 20ff
 * Misc - Arrows
 *  2190 - 21ff
 * Misc - Mathematical Operators
 *  2200 - 22ff
 * Misc - Miscellaneous Technical
 *  2300 - 23ff
 * Misc - Control picture
 *  2400 - 243f
 * Misc - Optical character recognition
 *  2440 - 2450
 * Misc - Enclose Alphanumerics
 *  2460 - 24ff
 * Misc - Box Drawing 
 *  2500 - 257f
 * Misc - Block Elements
 *  2580 - 259f
 * Misc - Geometric Shapes
 *  25a0 - 25ff
 * Misc - Miscellaneous Symbols
 *  2600 - 267f
 * Misc - Dingbats
 *  2700 - 27bf
 * Misc - Braille Patterns
 *  2800 - 28ff
 * Yi Syllables
 *  a000 - a48f
 * Yi radicals
 *  a490 - a4cf
 * Alphabetic Presentation Forms
 *  fb00 - fb4f
 * Misc - Combining half Marks
 *  fe20 - fe2f
 * Misc - small form variants
 *  fe50 - fe6f
 * Misc - Specials
 *  fff0 - ffff
 *********************************************************************/



#define NUM_OF_SUBTABLES      10
#define SUBTABLE_SIZE         16

static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = 
{ 
  { // table for X---
    kRangeTableBase+1,  //u0xxx
    kRangeTableBase+2,  //u1xxx
    kRangeTableBase+3,  //u2xxx
    kRangeSetCJK,       //u3xxx
    kRangeSetCJK,       //u4xxx
    kRangeSetCJK,       //u5xxx
    kRangeSetCJK,       //u6xxx
    kRangeSetCJK,       //u7xxx
    kRangeSetCJK,       //u8xxx
    kRangeSetCJK,       //u9xxx
    kRangeTableBase+4,  //uaxxx
    kRangeKorean,       //ubxxx
    kRangeKorean,       //ucxxx
    kRangeTableBase+5,  //udxxx
    kRangePrivate,      //uexxx
    kRangeTableBase+6   //ufxxx
  },
  { //table for 0X--
    kRangeSetLatin,          //u00xx
    kRangeSetLatin,          //u01xx
    kRangeSetLatin,          //u02xx
    kRangeGreek,             //u03xx     XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
    kRangeCyrillic,          //u04xx
    kRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
    kRangeArabic,            //u06xx
    kRangeTertiaryTable,     //u07xx
    kRangeUnassigned,        //u08xx
    kRangeTertiaryTable,     //u09xx
    kRangeTertiaryTable,     //u0axx
    kRangeTertiaryTable,     //u0bxx
    kRangeTertiaryTable,     //u0cxx
    kRangeTertiaryTable,     //u0dxx
    kRangeTertiaryTable,     //u0exx
    kRangeTibetan            //u0fxx
  },
  { //table for 1x--
    kRangeTertiaryTable,     //u10xx
    kRangeKorean,            //u11xx
    kRangeEthiopic,          //u12xx
    kRangeTertiaryTable,     //u13xx
    kRangeCanadian,          //u14xx
    kRangeCanadian,          //u15xx
    kRangeTertiaryTable,     //u16xx
    kRangeKhmer,             //u17xx
    kRangeMongolian,         //u18xx
    kRangeUnassigned,        //u19xx
    kRangeUnassigned,        //u1axx
    kRangeUnassigned,        //u1bxx
    kRangeUnassigned,        //u1cxx
    kRangeUnassigned,        //u1dxx
    kRangeSetLatin,          //u1exx
    kRangeGreek              //u1fxx
  },
  { //table for 2x--
    kRangeSetLatin,          //u20xx
    kRangeSetLatin,          //u21xx
    kRangeMathOperators,     //u22xx
    kRangeMiscTechnical,     //u23xx
    kRangeControlOpticalEnclose, //u24xx
    kRangeBoxBlockGeometrics, //u25xx
    kRangeMiscSymbols,       //u26xx
    kRangeDingbats,          //u27xx
    kRangeBraillePattern,    //u28xx
    kRangeUnassigned,        //u29xx
    kRangeUnassigned,        //u2axx
    kRangeUnassigned,        //u2bxx
    kRangeUnassigned,        //u2cxx
    kRangeUnassigned,        //u2dxx
    kRangeSetCJK,            //u2exx
    kRangeSetCJK             //u2fxx
  },
  {  //table for ax--
    kRangeYi,                //ua0xx
    kRangeYi,                //ua1xx
    kRangeYi,                //ua2xx
    kRangeYi,                //ua3xx
    kRangeYi,                //ua4xx
    kRangeUnassigned,        //ua5xx
    kRangeUnassigned,        //ua6xx
    kRangeUnassigned,        //ua7xx
    kRangeUnassigned,        //ua8xx
    kRangeUnassigned,        //ua9xx
    kRangeUnassigned,        //uaaxx
    kRangeUnassigned,        //uabxx
    kRangeKorean,            //uacxx
    kRangeKorean,            //uadxx
    kRangeKorean,            //uaexx
    kRangeKorean             //uafxx
  },
  {  //table for dx--
    kRangeKorean,            //ud0xx
    kRangeKorean,            //ud1xx
    kRangeKorean,            //ud2xx
    kRangeKorean,            //ud3xx
    kRangeKorean,            //ud4xx
    kRangeKorean,            //ud5xx
    kRangeKorean,            //ud6xx
    kRangeKorean,            //ud7xx
    kRangeSurrogate,         //ud8xx
    kRangeSurrogate,         //ud9xx
    kRangeSurrogate,         //udaxx
    kRangeSurrogate,         //udbxx
    kRangeSurrogate,         //udcxx
    kRangeSurrogate,         //uddxx
    kRangeSurrogate,         //udexx
    kRangeSurrogate          //udfxx
  },
  { // table for fx--
    kRangePrivate,           //uf0xx 
    kRangePrivate,           //uf1xx 
    kRangePrivate,           //uf2xx 
    kRangePrivate,           //uf3xx 
    kRangePrivate,           //uf4xx 
    kRangePrivate,           //uf5xx 
    kRangePrivate,           //uf6xx 
    kRangePrivate,           //uf7xx 
    kRangePrivate,           //uf8xx 
    kRangeSetCJK,            //uf9xx 
    kRangeSetCJK,            //ufaxx 
    kRangeArabic,            //ufbxx, includes alphabic presentation form
    kRangeArabic,            //ufcxx
    kRangeArabic,            //ufdxx
    kRangeTableBase+8,       //ufexx
    kRangeTableBase+9        //uffxx, halfwidth and fullwidth forms, includes Specials
  },
  { //table for 0x0500 - 0x05ff
    kRangeCyrillic,          //u050x
    kRangeCyrillic,          //u051x
    kRangeCyrillic,          //u052x
    kRangeArmenian,          //u053x
    kRangeArmenian,          //u054x
    kRangeArmenian,          //u055x
    kRangeArmenian,          //u056x
    kRangeArmenian,          //u057x
    kRangeArmenian,          //u058x
    kRangeHebrew,            //u059x
    kRangeHebrew,            //u05ax
    kRangeHebrew,            //u05bx
    kRangeHebrew,            //u05cx
    kRangeHebrew,            //u05dx
    kRangeHebrew,            //u05ex
    kRangeHebrew             //u05fx
  },
  { //table for 0xfe00 - 0xfeff
    kRangeSetCJK,            //ufe0x
    kRangeSetCJK,            //ufe1x
    kRangeSetCJK,            //ufe2x
    kRangeSetCJK,            //ufe3x
    kRangeSetCJK,            //ufe4x
    kRangeSetCJK,            //ufe5x
    kRangeSetCJK,            //ufe6x
    kRangeArabic,            //ufe7x
    kRangeArabic,            //ufe8x
    kRangeArabic,            //ufe9x
    kRangeArabic,            //ufeax
    kRangeArabic,            //ufebx
    kRangeArabic,            //ufecx
    kRangeArabic,            //ufedx
    kRangeArabic,            //ufeex
    kRangeArabic             //ufefx
  },
  { //table for 0xff00 - 0xffff
    kRangeSetCJK,            //uff0x, fullwidth latin
    kRangeSetCJK,            //uff1x, fullwidth latin
    kRangeSetCJK,            //uff2x, fullwidth latin
    kRangeSetCJK,            //uff3x, fullwidth latin
    kRangeSetCJK,            //uff4x, fullwidth latin
    kRangeSetCJK,            //uff5x, fullwidth latin
    kRangeSetCJK,            //uff6x, halfwidth katakana
    kRangeSetCJK,            //uff7x, halfwidth katakana
    kRangeSetCJK,            //uff8x, halfwidth katakana
    kRangeSetCJK,            //uff9x, halfwidth katakana
    kRangeSetCJK,            //uffax, halfwidth hangul jamo
    kRangeSetCJK,            //uffbx, halfwidth hangul jamo
    kRangeSetCJK,            //uffcx, halfwidth hangul jamo
    kRangeSetCJK,            //uffdx, halfwidth hangul jamo
    kRangeSetCJK,            //uffex, fullwidth symbols
    kRangeSpecials,          //ufffx, Specials
  },
};

// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) 
// code points  so that the number of entries in the tertiary range
// table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal 
// syllabaries take multiple chunks and Ogham and Runic share  a single chunk.
#define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)

static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
{ //table for 0x0700 - 0x1600 
    kRangeSyriac,            //u070x
    kRangeThaana,            //u078x
    kRangeUnassigned,        //u080x  place holder(resolved in the 2ndary tab.)
    kRangeUnassigned,        //u088x  place holder(resolved in the 2ndary tab.)
    kRangeDevanagari,        //u090x
    kRangeBengali,           //u098x
    kRangeGurmukhi,          //u0a0x
    kRangeGujarati,          //u0a8x
    kRangeOriya,             //u0b0x
    kRangeTamil,             //u0b8x
    kRangeTelugu,            //u0c0x
    kRangeKannada,           //u0c8x
    kRangeMalayalam,         //u0d0x
    kRangeSinhala,           //u0d8x
    kRangeThai,              //u0e0x  
    kRangeLao,               //u0e8x
    kRangeTibetan,           //u0f0x  place holder(resolved in the 2ndary tab.)
    kRangeTibetan,           //u0f8x  place holder(resolved in the 2ndary tab.)
    kRangeMyanmar,           //u100x
    kRangeGeorgian,          //u108x
    kRangeKorean,            //u110x  place holder(resolved in the 2ndary tab.)
    kRangeKorean,            //u118x  place holder(resolved in the 2ndary tab.)
    kRangeEthiopic,          //u120x  place holder(resolved in the 2ndary tab.)
    kRangeEthiopic,          //u128x  place holder(resolved in the 2ndary tab.)
    kRangeEthiopic,          //u130x  
    kRangeCherokee,          //u138x
    kRangeCanadian,          //u140x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,          //u148x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,          //u150x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,          //u158x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,          //u160x  
    kRangeOghamRunic         //u168x  this contains two scripts, Ogham & Runic
};

// A two level index is almost enough for locating a range, with the 
// exception of u03xx and u05xx. Since we don't really care about range for
// combining diacritical marks in our font application, they are 
// not discriminated further. But future adoption of this module for other use 
// should be aware of this limitation. The implementation can be extended if 
// there is such a need.
// For Indic, Southeast Asian scripts and some other scripts between
// U+0700 and U+16FF, it's extended to the third level.
uint32_t FindCharUnicodeRange(uint32_t ch)
{
  uint32_t range;
  
  // aggregate ranges for non-BMP codepoints
  if (ch > 0xFFFF) {
    uint32_t p = (ch >> 16);
    if (p == 1) {
        return kRangeSMP;
    } else if (p == 2) {
        return kRangeSetCJK;
    }
    return kRangeHigherPlanes;
  }

  // lookup explicit range for BMP codepoints
  // first general range
  range = gUnicodeSubrangeTable[0][ch >> 12];
  
  // if general range is good enough, return that
  if (range < kRangeTableBase)
    // we try to get a specific range 
    return range;

  // otherwise, use subrange tables
  range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
  if (range < kRangeTableBase)
    return range;
  if (range < kRangeTertiaryTable)
    return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];

  // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
  return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
}