diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/getonescriptspan.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/getonescriptspan.cc | 1086 |
1 files changed, 1086 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/getonescriptspan.cc b/application/basilisk/components/translation/cld2/internal/getonescriptspan.cc new file mode 100644 index 0000000000..6bdd4871b8 --- /dev/null +++ b/application/basilisk/components/translation/cld2/internal/getonescriptspan.cc @@ -0,0 +1,1086 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + + +#include "getonescriptspan.h" +#include <string.h> + +#include "fixunicodevalue.h" +#include "lang_script.h" +#include "port.h" +#include "utf8statetable.h" + +#include "utf8prop_lettermarkscriptnum.h" +#include "utf8repl_lettermarklower.h" +#include "utf8scannot_lettermarkspecial.h" + + +namespace CLD2 { + +// Alphabetical order for binary search, from +// generated_entities.cc +extern const int kNameToEntitySize; +extern const CharIntPair kNameToEntity[]; + +static const int kMaxUpToWordBoundary = 50; // span < this make longer, + // else make shorter +static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes + // to round to word boundary, + // direction above + +static const char kSpecialSymbol[256] = { // true for < > & + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, +}; + + + +#define LT 0 // < +#define GT 1 // > +#define EX 2 // ! +#define HY 3 // - +#define QU 4 // " +#define AP 5 // ' +#define SL 6 // / +#define S_ 7 +#define C_ 8 +#define R_ 9 +#define I_ 10 +#define P_ 11 +#define T_ 12 +#define Y_ 13 +#define L_ 14 +#define E_ 15 +#define CR 16 // <cr> or <lf> +#define NL 17 // non-letter: ASCII whitespace, digit, punctuation +#define PL 18 // possible letter, incl. & +#define xx 19 // <unused> + +// Map byte to one of ~20 interesting categories for cheap tag parsing +static const uint8 kCharToSub[256] = { + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, + + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, + + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, +}; + +#undef LT +#undef GT +#undef EX +#undef HY +#undef QU +#undef AP +#undef SL +#undef S_ +#undef C_ +#undef R_ +#undef I_ +#undef P_ +#undef T_ +#undef Y_ +#undef L_ +#undef E_ +#undef CR +#undef NL +#undef PL +#undef xx + + +#define OK 0 +#define X_ 1 + + +static const int kMaxExitStateLettersMarksOnly = 1; +static const int kMaxExitStateAllText = 2; + + +// State machine to do cheap parse of non-letter strings incl. tags +// advances <tag> +// | | +// advances <tag> ... </tag> for <script> <style> +// | | +// advances <!-- ... <tag> ... --> +// | | +// advances <tag +// || (0) +// advances <tag <tag2> +// || (0) +// +// We start in state [0] at a non-letter and make at least one transition +// When scanning for just letters, arriving back at state [0] or [1] exits +// the state machine. +// When scanning for any non-tag text, arriving at state [2] also exits +static const uint8 kTagParseTbl_0[] = { +// < > ! - " ' / S C R I P T Y L E CR NL PL xx + 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state + X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state + 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state] + X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < + X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! + X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- + 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* + 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- + 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* + 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" + 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' + X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' + +// < > ! - " ' / S C R I P T Y L E CR NL PL xx + X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S + X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC + X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP + X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT + 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* + 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< + 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF + 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S + 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC + 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR + 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI + 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP + 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT + +// < > ! - " ' / S C R I P T Y L E CR NL PL xx + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL + X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE + 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* + 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< + 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF + 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL + 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE +}; + +#undef OK +#undef X_ + +enum +{ + UTFmax = 4, // maximum bytes per rune + Runesync = 0x80, // cannot represent part of a UTF sequence (<) + Runeself = 0x80, // rune and UTF sequences are the same (<) + Runeerror = 0xFFFD, // decoding error in UTF + Runemax = 0x10FFFF, // maximum rune value +}; + +// Debugging. Not thread safe. +static char gDisplayPiece[32]; +const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4}; +char* DisplayPiece(const char* next_byte_, int byte_length_) { + // Copy up to 8 UTF-8 chars to buffer + int k = 0; // byte count + int n = 0; // character count + for (int i = 0; i < byte_length_; ++i) { + char c = next_byte_[i]; + if ((c & 0xc0) != 0x80) { + // Beginning of a UTF-8 character + int charlen = gCharlen[static_cast<uint8>(c) >> 4]; + if (i + charlen > byte_length_) {break;} // Not enough room for full char + if (k >= (32 - 7)) {break;} // Not necessarily enough room + if (n >= 8) {break;} // Enough characters already + ++n; + } + if (c == '<') { + memcpy(&gDisplayPiece[k], "<", 4); k += 4; + } else if (c == '>') { + memcpy(&gDisplayPiece[k], ">", 4); k += 4; + } else if (c == '&') { + memcpy(&gDisplayPiece[k], "&", 5); k += 5; + } else if (c == '\'') { + memcpy(&gDisplayPiece[k], "'", 6); k += 6; + } else if (c == '"') { + memcpy(&gDisplayPiece[k], """, 6); k += 6; + } else { + gDisplayPiece[k++] = c; + } + } + gDisplayPiece[k++] = '\0'; + return gDisplayPiece; +} + + + +// runetochar copies (encodes) one rune, pointed to by r, to at most +// UTFmax bytes starting at s and returns the number of bytes generated. +int runetochar(char *str, const char32 *rune) { + // Convert to unsigned for range check. + unsigned long c; + + // 1 char 00-7F + c = *rune; + if(c <= 0x7F) { + str[0] = c; + return 1; + } + + // 2 char 0080-07FF + if(c <= 0x07FF) { + str[0] = 0xC0 | (c >> 1*6); + str[1] = 0x80 | (c & 0x3F); + return 2; + } + + // Range check + if (c > Runemax) { + c = Runeerror; + } + + // 3 char 0800-FFFF + if (c <= 0xFFFF) { + str[0] = 0xE0 | (c >> 2*6); + str[1] = 0x80 | ((c >> 1*6) & 0x3F); + str[2] = 0x80 | (c & 0x3F); + return 3; + } + + // 4 char 10000-1FFFFF + str[0] = 0xF0 | (c >> 3*6); + str[1] = 0x80 | ((c >> 2*6) & 0x3F); + str[2] = 0x80 | ((c >> 1*6) & 0x3F); + str[3] = 0x80 | (c & 0x3F); + return 4; +} + + + +// Useful for converting an entity to an ascii value. +// RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ; +int LookupEntity(const char* entity_name, int entity_len) { + // Make a C string + if (entity_len >= 16) {return -1;} // All real entities are shorter + char temp[16]; + memcpy(temp, entity_name, entity_len); + temp[entity_len] = '\0'; + int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity); + if (match >= 0) {return kNameToEntity[match].i;} + return -1; +} + +bool ascii_isdigit(char c) { + return ('0' <= c) && (c <= '9'); +} +bool ascii_isxdigit(char c) { + if (('0' <= c) && (c <= '9')) {return true;} + if (('a' <= c) && (c <= 'f')) {return true;} + if (('A' <= c) && (c <= 'F')) {return true;} + return false; +} +bool ascii_isalnum(char c) { + if (('0' <= c) && (c <= '9')) {return true;} + if (('a' <= c) && (c <= 'z')) {return true;} + if (('A' <= c) && (c <= 'Z')) {return true;} + return false; +} +int hex_digit_to_int(char c) { + if (('0' <= c) && (c <= '9')) {return c - '0';} + if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;} + if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;} + return 0; +} + +static int32 strto32_base10(const char* nptr, const char* limit, + const char **endptr) { + *endptr = nptr; + while (nptr < limit && *nptr == '0') { + ++nptr; + } + if (nptr == limit || !ascii_isdigit(*nptr)) + return -1; + const char* end_digits_run = nptr; + while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) { + ++end_digits_run; + } + *endptr = end_digits_run; + const int num_digits = end_digits_run - nptr; + // kint32max == 2147483647. + if (num_digits < 9 || + (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) { + int value = 0; + for (; nptr < end_digits_run; ++nptr) { + value *= 10; + value += *nptr - '0'; + } + // Overflow past the last valid unicode codepoint + // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). + return FixUnicodeValue(value); + } else { + // Overflow: can't fit in an int32; + // returns the replacement character 0xFFFD. + return 0xFFFD; + } +} + +static int32 strto32_base16(const char* nptr, const char* limit, + const char **endptr) { + *endptr = nptr; + while (nptr < limit && *nptr == '0') { + ++nptr; + } + if (nptr == limit || !ascii_isxdigit(*nptr)) { + return -1; + } + const char* end_xdigits_run = nptr; + while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) { + ++end_xdigits_run; + } + *endptr = end_xdigits_run; + const int num_xdigits = end_xdigits_run - nptr; + // kint32max == 0x7FFFFFFF. + if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) { + int value = 0; + for (; nptr < end_xdigits_run; ++nptr) { + value <<= 4; + value += hex_digit_to_int(*nptr); + } + // Overflow past the last valid unicode codepoint + // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). + return FixUnicodeValue(value); + } else { + // Overflow: can't fit in an int32; + // returns the replacement character 0xFFFD. + return 0xFFFD; + } +} + +// Unescape the current character pointed to by src. SETS the number +// of chars read for the conversion (in UTF8). If src isn't a valid entity, +// just consume the & and RETURN -1. If src doesn't point to & -- which it +// should -- set src_consumed to 0 and RETURN -1. +int ReadEntity(const char* src, int srcn, int* src_consumed) { + const char* const srcend = src + srcn; + + if (srcn == 0 || *src != '&') { // input should start with an ampersand + *src_consumed = 0; + return -1; + } + *src_consumed = 1; // we'll get the & at least + + // The standards are a bit unclear on when an entity ends. Certainly a ";" + // ends one, but spaces probably do too. We follow the lead of both IE and + // Netscape, which as far as we can tell end numeric entities (1st case below) + // at any non-digit, and end character entities (2nd case) at any non-alnum. + const char* entstart, *entend; // where the entity starts and ends + entstart = src + 1; // read past the & + int entval; // UCS2 value of the entity + if ( *entstart == '#' ) { // -- 1st case: numeric entity + if ( entstart + 2 >= srcend ) { + return -1; // no way a legitimate number could fit + } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric + entval = strto32_base16(entstart + 2, srcend, &entend); + } else { // decimal numeric entity + entval = strto32_base10(entstart+1, srcend, &entend); + } + if (entval == -1 || entend > srcend) { + return -1; // not entirely correct, but close enough + } + } else { // -- 2nd case: character entity + for (entend = entstart; + entend < srcend && ascii_isalnum(*entend); + ++entend ) { + // entity consists of alphanumeric chars + } + entval = LookupEntity(entstart, entend - entstart); + if (entval < 0) { + return -1; // not a legal entity name + } + // Now we do a strange-seeming IE6-compatibility check: if entval is + // >= 256, it *must* be followed by a semicolon or it's not considered + // an entity. The problem is lots of the newfangled entity names, like + // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en". + // When these links are written in HTML, it would be really bad if the + // "&lang" were treated as an entity, which is what the spec says + // *should* happen (even when the HTML is inside an "A HREF" tag!) + // IE ignores the spec for these new, high-value entities, so we do too. + if ( entval >= 256 && !(entend < srcend && *entend == ';') ) { + return -1; // make non-;-terminated entity illegal + } + } + + // Finally, figure out how much src was consumed + if ( entend < srcend && *entend == ';' ) { + entend++; // standard says ; terminator is special + } + *src_consumed = entend - src; + return entval; +} + + +// Src points to '&' +// Writes entity value to dst. Returns take(src), put(dst) byte counts +void EntityToBuffer(const char* src, int len, char* dst, + int* tlen, int* plen) { + char32 entval = ReadEntity(src, len, tlen); + + // ReadEntity does this already: entval = FixUnicodeValue(entval); + + // Convert UTF-32 to UTF-8 + if (entval > 0) { + *plen = runetochar(dst, &entval); + } else { + // Illegal entity; ignore the '&' + *tlen = 1; + *plen = 0; + } +} + +// Returns true if character is < > or &, none of which are letters +bool inline IsSpecial(char c) { + if ((c & 0xe0) == 0x20) { + return kSpecialSymbol[static_cast<uint8>(c)]; + } + return false; +} + +// Quick Skip to next letter or < > & or to end of string (eos) +// Always return is_letter for eos +int ScanToLetterOrSpecial(const char* src, int len) { + int bytes_consumed; + StringPiece str(src, len); + UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed); + return bytes_consumed; +} + + + + +// src points to non-letter, such as tag-opening '<' +// Return length from here to next possible letter +// On another < before >, return 1 +// advances <tag> +// | | +// advances <tag> ... </tag> for <script> <style> +// | | +// advances <!-- ... <tag> ... --> +// | | +// advances <tag +// | | end of string +// advances <tag <tag2> +// || +int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { + const uint8* src = reinterpret_cast<const uint8*>(isrc); + const uint8* srclimit = src + len; + const uint8* tagParseTbl = kTagParseTbl_0; + int e = 0; + while (src < srclimit) { + e = tagParseTbl[kCharToSub[*src++]]; + if (e <= max_exit_state) { + // We overshot by one byte + --src; + break; + } + tagParseTbl = &kTagParseTbl_0[e * 20]; + } + + if (src >= srclimit) { + // We fell off the end of the text. + // It looks like the most common case for this is a truncated file, not + // mismatched angle brackets. So we pretend that the last char was '>' + return len; + } + + // OK to be in state 0 or state 2 at exit + if ((e != 0) && (e != 2)) { + // Error, '<' followed by '<' + // We want to back up to first <, then advance by one byte past it + int offset = src - reinterpret_cast<const uint8*>(isrc); + + // Backscan to first '<' and return enough length to just get past it + --offset; // back up over the second '<', which caused us to stop + while ((0 < offset) && (isrc[offset] != '<')) { + // Find the first '<', which is unmatched + --offset; + } + // skip to just beyond first '<' + return offset + 1; + } + + return src - reinterpret_cast<const uint8*>(isrc); +} + + +ScriptScanner::ScriptScanner(const char* buffer, + int buffer_length, + bool is_plain_text) + : start_byte_(buffer), + next_byte_(buffer), + next_byte_limit_(buffer + buffer_length), + byte_length_(buffer_length), + is_plain_text_(is_plain_text), + letters_marks_only_(true), + one_script_only_(true), + exit_state_(kMaxExitStateLettersMarksOnly) { + script_buffer_ = new char[kMaxScriptBuffer]; + script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; + map2original_.Clear(); // map from script_buffer_ to buffer + map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ +} + +// Extended version to allow spans of any non-tag text and spans of mixed script +ScriptScanner::ScriptScanner(const char* buffer, + int buffer_length, + bool is_plain_text, + bool any_text, + bool any_script) + : start_byte_(buffer), + next_byte_(buffer), + next_byte_limit_(buffer + buffer_length), + byte_length_(buffer_length), + is_plain_text_(is_plain_text), + letters_marks_only_(!any_text), + one_script_only_(!any_script), + exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) { + script_buffer_ = new char[kMaxScriptBuffer]; + script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; + map2original_.Clear(); // map from script_buffer_ to buffer + map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ +} + + +ScriptScanner::~ScriptScanner() { + delete[] script_buffer_; + delete[] script_buffer_lower_; +} + + + + +// Get to the first real non-tag letter or entity that is a letter +// Sets script of that letter +// Return len if no more letters +int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { + int sc = UNKNOWN_ULSCRIPT; + int skip = 0; + int tlen, plen; + + // Do run of non-letters (tag | &NL | NL)* + tlen = 0; + while (skip < len) { + // Do fast scan to next interesting byte + // int oldskip = skip; + skip += ScanToLetterOrSpecial(src + skip, len - skip); + + // Check for no more letters/specials + if (skip >= len) { + // All done + *script = sc; + return len; + } + + // We are at a letter, nonletter, tag, or entity + if (IsSpecial(src[skip]) && !is_plain_text_) { + if (src[skip] == '<') { + // Begining of tag; skip to end and go around again + tlen = ScanToPossibleLetter(src + skip, len - skip, + exit_state_); + sc = 0; + } else if (src[skip] == '>') { + // Unexpected end of tag; skip it and go around again + tlen = 1; // Over the > + sc = 0; + } else if (src[skip] == '&') { + // Expand entity, no advance + char temp[4]; + EntityToBuffer(src + skip, len - skip, + temp, &tlen, &plen); + sc = GetUTF8LetterScriptNum(temp); + } + } else { + // Update 1..4 bytes + tlen = UTF8OneCharLen(src + skip); + sc = GetUTF8LetterScriptNum(src + skip); + } + if (sc != 0) {break;} // Letter found + skip += tlen; // Else advance + } + + *script = sc; + return skip; +} + + +// These are for ASCII-only tag names +// Compare one letter uplow to c, ignoring case of uplowp +inline bool EqCase(char uplow, char c) { + return (uplow | 0x20) == c; +} + +// These are for ASCII-only tag names +// Return true for space / < > etc. all less than 0x40 +inline bool NeqLetter(char c) { + return c < 0x40; +} + +// These are for ASCII-only tag names +// Return true for space \n false for \r +inline bool WS(char c) { + return (c == ' ') || (c == '\n'); +} + +// Canonical CR or LF +static const char LF = '\n'; + + +// The naive loop scans from next_byte_ to script_buffer_ until full. +// But this can leave an awkward hard-to-identify short fragment at the +// end of the input. We would prefer to make the next-to-last fragment +// shorter and the last fragment longer. + +// Copy next run of non-tag characters to buffer [NUL terminated] +// This just replaces tags with space or \n and removes entities. +// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences +// including \r or \n are replaced by \n. All other tags and skipped text +// are replaced with ASCII space. +// +// Buffer ALWAYS has leading space and trailing space space space NUL +bool ScriptScanner::GetOneTextSpan(LangSpan* span) { + span->text = script_buffer_; + span->text_bytes = 0; + span->offset = next_byte_ - start_byte_; + span->ulscript = UNKNOWN_ULSCRIPT; + span->lang = UNKNOWN_LANGUAGE; + span->truncated = false; + + int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; + if ((kMaxScriptBytes <= byte_length_) && + (byte_length_ < (2 * kMaxScriptBytes))) { + // Try to split the last two fragments in half + put_soft_limit = byte_length_ / 2; + } + + script_buffer_[0] = ' '; // Always a space at front of output + script_buffer_[1] = '\0'; + int take = 0; + int put = 1; // Start after the initial space + int tlen, plen; + + if (byte_length_ <= 0) { + return false; // No more text to be found + } + + // Go over alternating spans of text and tags, + // copying letters to buffer with single spaces for each run of non-letters + bool last_byte_was_space = false; + while (take < byte_length_) { + char c = next_byte_[take]; + if (c == '\r') {c = LF;} // Canonical CR or LF + if (c == '\n') {c = LF;} // Canonical CR or LF + + if (IsSpecial(c) && !is_plain_text_) { + if (c == '<') { + // Replace tag with space + c = ' '; // for almost-full test below + // or if <p> <br> <tr>, replace with \n + if (take < (byte_length_ - 3)) { + if (EqCase(next_byte_[take + 1], 'p') && + NeqLetter(next_byte_[take + 2])) { + c = LF; + } + if (EqCase(next_byte_[take + 1], 'b') && + EqCase(next_byte_[take + 2], 'r') && + NeqLetter(next_byte_[take + 3])) { + c = LF; + } + if (EqCase(next_byte_[take + 1], 't') && + EqCase(next_byte_[take + 2], 'r') && + NeqLetter(next_byte_[take + 3])) { + c = LF; + } + } + // Begining of tag; skip to end and go around again + tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, + exit_state_); + // Copy one byte, compressing spaces + if (!last_byte_was_space || !WS(c)) { + script_buffer_[put++] = c; // Advance dest + last_byte_was_space = WS(c); + } + } else if (c == '>') { + // Unexpected end of tag; copy it and go around again + tlen = 1; // Over the > + script_buffer_[put++] = c; // Advance dest + } else if (c == '&') { + // Expand entity, no advance + EntityToBuffer(next_byte_ + take, byte_length_ - take, + script_buffer_ + put, &tlen, &plen); + put += plen; // Advance dest + } + take += tlen; // Advance source + } else { + // Copy one byte, compressing spaces + if (!last_byte_was_space || !WS(c)) { + script_buffer_[put++] = c; // Advance dest + last_byte_was_space = WS(c); + } + ++take; // Advance source + } + + if (WS(c) && + (put >= put_soft_limit)) { + // Buffer is almost full + span->truncated = true; + break; + } + if (put >= kMaxScriptBytes) { + // Buffer is completely full + span->truncated = true; + break; + } + } + + // Almost done. Back up to a character boundary if needed + while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) { + // Back up over continuation byte + --take; + --put; + } + + // Update input position + next_byte_ += take; + byte_length_ -= take; + + // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 + // kMaxScriptBytes | | put + script_buffer_[put + 0] = ' '; + script_buffer_[put + 1] = ' '; + script_buffer_[put + 2] = ' '; + script_buffer_[put + 3] = '\0'; + + span->text_bytes = put; // Does not include the last four chars above + return true; +} + + +// Copy next run of same-script non-tag letters to buffer [NUL terminated] +// Buffer ALWAYS has leading space and trailing space space space NUL +bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { + if (!letters_marks_only_) { + // Return non-tag text, including punctuation and digits + return GetOneTextSpan(span); + } + + span->text = script_buffer_; + span->text_bytes = 0; + span->offset = next_byte_ - start_byte_; + span->ulscript = UNKNOWN_ULSCRIPT; + span->lang = UNKNOWN_LANGUAGE; + span->truncated = false; + + // struct timeval script_start, script_mid, script_end; + + int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; + if ((kMaxScriptBytes <= byte_length_) && + (byte_length_ < (2 * kMaxScriptBytes))) { + // Try to split the last two fragments in half + put_soft_limit = byte_length_ / 2; + } + + + int spanscript; // The script of this span + int sc = UNKNOWN_ULSCRIPT; // The script of next character + int tlen = 0; + int plen = 0; + + script_buffer_[0] = ' '; // Always a space at front of output + script_buffer_[1] = '\0'; + int take = 0; + int put = 1; // Start after the initial space + + // Build offsets from span->text back to start_byte_ + span->offset + // This mapping reflects deletion of non-letters, expansion of + // entities, etc. + map2original_.Clear(); + map2original_.Delete(span->offset); // So that MapBack(0) gives offset + + // Get to the first real non-tag letter or entity that is a letter + int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); + next_byte_ += skip; + byte_length_ -= skip; + + if (skip != 1) { + map2original_.Delete(skip); + map2original_.Insert(1); + } else { + map2original_.Copy(1); + } + if (byte_length_ <= 0) { + map2original_.Reset(); + return false; // No more letters to be found + } + + // There is at least one letter, so we know the script for this span + span->ulscript = (ULScript)spanscript; + + + // Go over alternating spans of same-script letters and non-letters, + // copying letters to buffer with single spaces for each run of non-letters + while (take < byte_length_) { + // Copy run of letters in same script (&LS | LS)* + int letter_count = 0; // Keep track of word length + bool need_break = false; + + while (take < byte_length_) { + // We are at a letter, nonletter, tag, or entity + if (IsSpecial(next_byte_[take]) && !is_plain_text_) { + if (next_byte_[take] == '<') { + // Begining of tag + sc = 0; + break; + } else if (next_byte_[take] == '>') { + // Unexpected end of tag + sc = 0; + break; + } else if (next_byte_[take] == '&') { + // Copy entity, no advance + EntityToBuffer(next_byte_ + take, byte_length_ - take, + script_buffer_ + put, &tlen, &plen); + sc = GetUTF8LetterScriptNum(script_buffer_ + put); + } + } else { + // Real letter, safely copy up to 4 bytes, increment by 1..4 + // Will update by 1..4 bytes at Advance, below + tlen = plen = UTF8OneCharLen(next_byte_ + take); + if (take < (byte_length_ - 3)) { + // X86 fast case, does unaligned load/store + UNALIGNED_STORE32(script_buffer_ + put, + UNALIGNED_LOAD32(next_byte_ + take)); + + } else { + // Slow case, happens 1-3 times per input document + memcpy(script_buffer_ + put, next_byte_ + take, plen); + } + sc = GetUTF8LetterScriptNum(next_byte_ + take); + } + + // Allow continue across a single letter in a different script: + // A B D = three scripts, c = common script, i = inherited script, + // - = don't care, ( = take position before the += below + // AAA(A- continue + // + // AAA(BA continue + // AAA(BB break + // AAA(Bc continue (breaks after B) + // AAA(BD break + // AAA(Bi break + // + // AAA(c- break + // + // AAA(i- continue + // + + if ((sc != spanscript) && (sc != ULScript_Inherited)) { + // Might need to break this script span + if (sc == ULScript_Common) { + need_break = true; + } else { + // Look at next following character, ignoring entity as Common + int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen); + if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { + // We found a non-trivial change of script + if (one_script_only_) { + need_break = true; + } + } + } + } + if (need_break) {break;} // Non-letter or letter in wrong script + + take += tlen; // Advance + put += plen; // Advance + + // Update the offset map to reflect take/put lengths + if (tlen == plen) { + map2original_.Copy(tlen); + } else if (tlen < plen) { + map2original_.Copy(tlen); + map2original_.Insert(plen - tlen); + } else { // plen < tlen + map2original_.Copy(plen); + map2original_.Delete(tlen - plen); + } + + ++letter_count; + if (put >= kMaxScriptBytes) { + // Buffer is full + span->truncated = true; + break; + } + } // End while letters + + // Do run of non-letters (tag | &NL | NL)* + while (take < byte_length_) { + // Do fast scan to next interesting byte + tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); + take += tlen; + map2original_.Delete(tlen); + if (take >= byte_length_) {break;} // Might have scanned to end + + // We are at a letter, nonletter, tag, or entity + if (IsSpecial(next_byte_[take]) && !is_plain_text_) { + if (next_byte_[take] == '<') { + // Begining of tag; skip to end and go around again + tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, + exit_state_); + sc = 0; + } else if (next_byte_[take] == '>') { + // Unexpected end of tag; skip it and go around again + tlen = 1; // Over the > + sc = 0; + } else if (next_byte_[take] == '&') { + // Expand entity, no advance + EntityToBuffer(next_byte_ + take, byte_length_ - take, + script_buffer_ + put, &tlen, &plen); + sc = GetUTF8LetterScriptNum(script_buffer_ + put); + } + } else { + // Update 1..4 + tlen = UTF8OneCharLen(next_byte_ + take); + sc = GetUTF8LetterScriptNum(next_byte_ + take); + } + if (sc != 0) {break;} // Letter found + take += tlen; // Else advance + map2original_.Delete(tlen); + } // End while not-letters + + script_buffer_[put++] = ' '; + map2original_.Insert(1); + + // Letter in wrong script ? + if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;} + if (put >= put_soft_limit) { + // Buffer is almost full + span->truncated = true; + break; + } + } + + // Almost done. Back up to a character boundary if needed + while ((0 < take) && (take < byte_length_) && + ((next_byte_[take] & 0xc0) == 0x80)) { + // Back up over continuation byte + --take; + --put; + } + + // Update input position + next_byte_ += take; + byte_length_ -= take; + + // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 + // kMaxScriptBytes | | put + script_buffer_[put + 0] = ' '; + script_buffer_[put + 1] = ' '; + script_buffer_[put + 2] = ' '; + script_buffer_[put + 3] = '\0'; + map2original_.Insert(4); + map2original_.Reset(); + + span->text_bytes = put; // Does not include the last four chars above + return true; +} + +// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase +// List changes with each version of Unicode, so just always lowercase +// Unicode 6.2.0: +// ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN +void ScriptScanner::LowerScriptSpan(LangSpan* span) { + // If needed, lowercase all the text. If we do it sooner, might miss + // lowercasing an entity such as Á + // We only need to do this for Latn and Cyrl scripts + map2uplow_.Clear(); + // Full Unicode lowercase of the entire buffer, including + // four pad bytes off the end. + // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad + // bytes and put the 0x00 in explicitly. + // Build an offset map from script_buffer_lower_ back to script_buffer_ + int consumed, filled, changed; + StringPiece istr(span->text, span->text_bytes + 3); + StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer); + + UTF8GenericReplace(&utf8repl_lettermarklower_obj, + istr, ostr, is_plain_text_, + &consumed, &filled, &changed, &map2uplow_); + script_buffer_lower_[filled] = '\0'; + span->text = script_buffer_lower_; + span->text_bytes = filled - 3; + map2uplow_.Reset(); +} + +// Copy next run of same-script non-tag letters to buffer [NUL terminated] +// Force Latin, Cyrillic, Greek scripts to be lowercase +// Buffer ALWAYS has leading space and trailing space space space NUL +bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { + bool ok = GetOneScriptSpan(span); + LowerScriptSpan(span); + return ok; +} + + +// Maps byte offset in most recent GetOneScriptSpan/Lower +// span->text [0..text_bytes] into an additional byte offset from +// span->offset, to get back to corresponding text in the original +// input buffer. +// text_offset must be the first byte +// of a UTF-8 character, or just beyond the last character. Normally this +// routine is called with the first byte of an interesting range and +// again with the first byte of the following range. +int ScriptScanner::MapBack(int text_offset) { + return map2original_.MapBack(map2uplow_.MapBack(text_offset)); +} + + +// Gets lscript number for letters; always returns +// 0 (common script) for non-letters +int GetUTF8LetterScriptNum(const char* src) { + int srclen = UTF8OneCharLen(src); + const uint8* usrc = reinterpret_cast<const uint8*>(src); + return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj, + &usrc, &srclen); +} + +} // namespace CLD2 + + |