diff options
author | Matt A. Tobin <email@mattatobin.com> | 2021-11-29 13:42:30 -0500 |
---|---|---|
committer | Matt A. Tobin <email@mattatobin.com> | 2021-11-29 13:42:30 -0500 |
commit | 2985bb59256adf0fc2bdc371e55ee303dfaf910e (patch) | |
tree | 1ca3674a54e762ce8bc4778c3390a3281edc8005 /libs | |
parent | 9c59ce27775b068c8f907d6af171ed9ca3abc6b9 (diff) | |
download | aura-central-2985bb59256adf0fc2bdc371e55ee303dfaf910e.tar.gz |
Issue %3003 - Move hunspell to libs
Diffstat (limited to 'libs')
34 files changed, 17849 insertions, 0 deletions
diff --git a/libs/hunspell/README.mozilla b/libs/hunspell/README.mozilla new file mode 100644 index 000000000..79a9f54d1 --- /dev/null +++ b/libs/hunspell/README.mozilla @@ -0,0 +1,2 @@ +Hunspell Version: 1.4.1 +Additional Patches: See patches directory. diff --git a/libs/hunspell/license.hunspell b/libs/hunspell/license.hunspell new file mode 100644 index 000000000..dc2ce9c1e --- /dev/null +++ b/libs/hunspell/license.hunspell @@ -0,0 +1,61 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Laszlo Nemeth (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): + * David Einstein + * Davide Prina + * Giuseppe Modugno + * Gianluca Turconi + * Simon Brouwer + * Noll Janos + * Biro Arpad + * Goldman Eleonora + * Sarlos Tamas + * Bencsath Boldizsar + * Halacsy Peter + * Dvornik Laszlo + * Gefferth Andras + * Nagy Viktor + * Varga Daniel + * Chris Halls + * Rene Engelhard + * Bram Moolenaar + * Dafydd Jones + * Harri Pitkanen + * Andras Timar + * Tor Lillqvist + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#ifndef MOZILLA_CLIENT +# include "config.h" +#endif diff --git a/libs/hunspell/license.myspell b/libs/hunspell/license.myspell new file mode 100644 index 000000000..2da533075 --- /dev/null +++ b/libs/hunspell/license.myspell @@ -0,0 +1,61 @@ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * + * NOTE: A special thanks and credit goes to Geoff Kuenning + * the creator of ispell. MySpell's affix algorithms were + * based on those of ispell which should be noted is + * copyright Geoff Kuenning et.al. and now available + * under a BSD style license. For more information on ispell + * and affix compression in general, please see: + * http://www.cs.ucla.edu/ficus-members/geoff/ispell.html + * (the home page for ispell) + * + * An almost complete rewrite of MySpell for use by + * the Mozilla project has been developed by David Einstein + * (Deinst@world.std.com). 
David and I are now + * working on parallel development tracks to help + * our respective projects (Mozilla and OpenOffice.org + * and we will maintain full affix file and dictionary + * file compatibility and work on merging our versions + * of MySpell back into a single tree. David has been + * a significant help in improving MySpell. + * + * Special thanks also go to La'szlo' Ne'meth + * <nemethl@gyorsposta.hu> who is the author of the + * Hungarian dictionary and who developed and contributed + * the code to support compound words in MySpell + * and fixed numerous problems with the encoding + * case conversion tables. + * + */ diff --git a/libs/hunspell/moz.build b/libs/hunspell/moz.build new file mode 100644 index 000000000..4f983858e --- /dev/null +++ b/libs/hunspell/moz.build @@ -0,0 +1,33 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +include('/ipc/chromium/chromium-config.mozbuild') + +SOURCES += [ + 'src/affentry.cxx', + 'src/affixmgr.cxx', + 'src/csutil.cxx', + 'src/filemgr.cxx', + 'src/hashmgr.cxx', + 'src/hunspell.cxx', + 'src/hunzip.cxx', + 'src/phonet.cxx', + 'src/replist.cxx', + 'src/suggestmgr.cxx', +] + +LOCAL_INCLUDES += ['/extensions/spellcheck/hunspell/glue'] + +# This variable is referenced in configure.in. Make sure to change that file +# too if you need to change this variable. +DEFINES['HUNSPELL_STATIC'] = True + +if CONFIG['CLANG_CXX'] or CONFIG['CLANG_CL']: + CXXFLAGS += ['-Wno-implicit-fallthrough'] + +# We allow warnings for third-party code that can be updated from upstream. +ALLOW_COMPILER_WARNINGS = True + +FINAL_LIBRARY = 'xul'
\ No newline at end of file diff --git a/libs/hunspell/patches/1322666 b/libs/hunspell/patches/1322666 new file mode 100644 index 000000000..16db1fbe6 --- /dev/null +++ b/libs/hunspell/patches/1322666 @@ -0,0 +1,24 @@ +Bug 1322666 - Change MAXWORDLEN to 100 + +diff --git a/extensions/spellcheck/hunspell/src/hunspell.cxx b/extensions/spellcheck/hunspell/src/hunspell.cxx +--- a/extensions/spellcheck/hunspell/src/hunspell.cxx ++++ b/extensions/spellcheck/hunspell/src/hunspell.cxx +@@ -80,17 +80,17 @@ + #ifndef MOZILLA_CLIENT + #include "config.h" + #endif + #include "csutil.hxx" + + #include <limits> + #include <string> + +-#define MAXWORDLEN 176 ++#define MAXWORDLEN 100 + #define MAXWORDUTF8LEN (MAXWORDLEN * 3) + + Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) { + encoding = NULL; + csconv = NULL; + utf8 = 0; + complexprefixes = 0; + affixpath = mystrdup(affpath); diff --git a/libs/hunspell/src/README b/libs/hunspell/src/README new file mode 100644 index 000000000..b97a112fd --- /dev/null +++ b/libs/hunspell/src/README @@ -0,0 +1,21 @@ +Hunspell spell checker and morphological analyser library + +Documentation, tests, examples: http://hunspell.github.io/ + +Author of Hunspell: +László Németh (nemethl (at) gyorsposta.hu) + +Hunspell based on OpenOffice.org's Myspell. MySpell's author: +Kevin Hendricks (kevin.hendricks (at) sympatico.ca) + +License: GPL 2.0/LGPL 2.1/MPL 1.1 tri-license + +The contents of this library may be used under the terms of +the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL", +see http://gnu.org/copyleft/lesser.html) or the Mozilla Public License +Version 1.1 or later (the "MPL", see http://mozilla.org/MPL/MPL-1.1.html). + +Software distributed under these licenses is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the licences +for the specific language governing rights and limitations under the licenses. diff --git a/libs/hunspell/src/affentry.cxx b/libs/hunspell/src/affentry.cxx new file mode 100644 index 000000000..bd2827436 --- /dev/null +++ b/libs/hunspell/src/affentry.cxx @@ -0,0 +1,1068 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. 
HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include "affentry.hxx" +#include "csutil.hxx" + +PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) + // register affix manager + : pmyMgr(pmgr), + next(NULL), + nexteq(NULL), + nextne(NULL), + flgnxt(NULL) { + // set up its initial values + aflag = dp->aflag; // flag + strip = dp->strip; // string to strip + appnd = dp->appnd; // string to append + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag + // then copy over all of the conditions + if (opts & aeLONGCOND) { + memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else + memcpy(c.conds, dp->c.conds, MAXCONDLEN); + morphcode = dp->morphcode; + contclass = dp->contclass; + contclasslen = dp->contclasslen; +} + +PfxEntry::~PfxEntry() { + aflag = 0; + pmyMgr = NULL; + if (opts & aeLONGCOND) + free(c.l.conds2); + if (morphcode && !(opts & aeALIASM)) + free(morphcode); + if (contclass && !(opts & aeALIASF)) + free(contclass); +} + +// add prefix to this word assuming conditions hold +char* PfxEntry::add(const char* word, size_t len) { + if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word) && + (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) { + /* we have a match so add prefix */ + std::string tword(appnd); + tword.append(word + strip.size()); + return 
mystrdup(tword.c_str()); + } + return NULL; +} + +inline char* PfxEntry::nextchar(char* p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.conds + MAXCONDLEN_1) + return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) + return NULL; + return *p ? p : NULL; + } + return NULL; +} + +inline int PfxEntry::test_condition(const char* st) { + const char* pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) + return 1; + char* p = c.conds; + while (1) { + switch (*p) { + case '\0': + return 1; + case '[': { + neg = false; + ingroup = false; + p = nextchar(p); + pos = st; + break; + } + case '^': { + p = nextchar(p); + neg = true; + break; + } + case ']': { + if ((neg && ingroup) || (!neg && !ingroup)) + return 0; + pos = NULL; + p = nextchar(p); + // skip the next character + if (!ingroup && *st) + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++) + ; + if (*st == '\0' && p) + return 0; // word <= condition + break; + } + case '.': + if (!pos) { // dots are not metacharacters in groups: [.] 
+ p = nextchar(p); + // skip the next character + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++) + ; + if (*st == '\0' && p) + return 0; // word <= condition + break; + } + /* FALLTHROUGH */ + default: { + if (*st == *p) { + st++; + p = nextchar(p); + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte + while (p && (*p & 0xc0) == 0x80) { // character + if (*p != *st) { + if (!pos) + return 0; + st = pos; + break; + } + p = nextchar(p); + st++; + } + if (pos && st != pos) { + ingroup = true; + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { + } + } + } else if (pos) { + ingroup = true; + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { + } + } + } else if (pos) { // group + p = nextchar(p); + } else + return 0; + } + } + if (!p) + return 1; + } +} + +// check if this prefix entry matches +struct hentry* PfxEntry::checkword(const char* word, + int len, + char in_compound, + const FLAG needflag) { + struct hentry* he; // hash entry of root word or NULL + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + + if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + std::string tmpword(strip); + tmpword.append(word + appnd.size()); + + // now make sure all of the conditions on characters + // are met. 
Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword.c_str())) { + tmpl += strip.size(); + if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) { + do { + if (TESTAFF(he->astr, aflag, he->alen) && + // forbid single prefixes with needaffix flag + !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && + // needflag + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || + (contclass && TESTAFF(contclass, needflag, contclasslen)))) + return he; + he = he->next_homonym; // check homonyms + } while (he); + } + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + // if ((opts & aeXPRODUCT) && in_compound) { + if ((opts & aeXPRODUCT)) { + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this, + NULL, 0, NULL, FLAG_NULL, needflag, + in_compound); + if (he) + return he; + } + } + } + return NULL; +} + +// check if this prefix entry matches +struct hentry* PfxEntry::check_twosfx(const char* word, + int len, + char in_compound, + const FLAG needflag) { + struct hentry* he; // hash entry of root word or NULL + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + strip.size() >= numconds)) { + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + std::string tmpword(strip); + tmpword.append(word + appnd.size()); + + // now make sure all of the conditions on characters + // are met. 
Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword.c_str())) { + tmpl += strip.size(); + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // cross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this, + needflag); + if (he) + return he; + } + } + } + return NULL; +} + +// check if this prefix entry matches +char* PfxEntry::check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + strip.size() >= numconds)) { + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + std::string tmpword(strip); + tmpword.append(word + appnd.size()); + + // now make sure all of the conditions on characters + // are met. 
Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword.c_str())) { + tmpl += strip.size(); + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + return pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl, + aeXPRODUCT, + this, needflag); + } + } + } + return NULL; +} + +// check if this prefix entry matches +char* PfxEntry::check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + struct hentry* he; // hash entry of root word or NULL + char* st; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + strip.size() >= numconds)) { + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + std::string tmpword(strip); + tmpword.append(word + appnd.size()); + + // now make sure all of the conditions on characters + // are met. 
Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword.c_str())) { + std::string result; + + tmpl += strip.size(); + if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) { + do { + if (TESTAFF(he->astr, aflag, he->alen) && + // forbid single prefixes with needaffix flag + !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && + // needflag + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || + (contclass && TESTAFF(contclass, needflag, contclasslen)))) { + if (morphcode) { + result.append(" "); + result.append(morphcode); + } else + result.append(getKey()); + if (!HENTRY_FIND(he, MORPH_STEM)) { + result.append(" "); + result.append(MORPH_STEM); + result.append(HENTRY_WORD(he)); + } + // store the pointer of the hash entry + if (HENTRY_DATA(he)) { + result.append(" "); + result.append(HENTRY_DATA2(he)); + } else { + // return with debug information + char* flag = pmyMgr->encode_flag(getFlag()); + result.append(" "); + result.append(MORPH_FLAG); + result.append(flag); + free(flag); + } + result.append("\n"); + } + he = he->next_homonym; + } while (he); + } + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this, + FLAG_NULL, needflag); + if (st) { + result.append(st); + free(st); + } + } + + if (!result.empty()) + return mystrdup(result.c_str()); + } + } + + return NULL; +} + +SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp) + : pmyMgr(pmgr) // register affix manager + , + next(NULL), + nexteq(NULL), + nextne(NULL), + flgnxt(NULL), + l_morph(NULL), + r_morph(NULL), + eq_morph(NULL) { + // set up its initial values + aflag = dp->aflag; // char flag + strip = dp->strip; // 
string to strip + appnd = dp->appnd; // string to append + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag + + // then copy over all of the conditions + if (opts & aeLONGCOND) { + memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else + memcpy(c.conds, dp->c.conds, MAXCONDLEN); + rappnd = appnd; + reverseword(rappnd); + morphcode = dp->morphcode; + contclass = dp->contclass; + contclasslen = dp->contclasslen; +} + +SfxEntry::~SfxEntry() { + aflag = 0; + pmyMgr = NULL; + if (opts & aeLONGCOND) + free(c.l.conds2); + if (morphcode && !(opts & aeALIASM)) + free(morphcode); + if (contclass && !(opts & aeALIASF)) + free(contclass); +} + +// add suffix to this word assuming conditions hold +char* SfxEntry::add(const char* word, size_t len) { + /* make sure all conditions match */ + if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word + len, word) && + (!strip.size() || + (strcmp(word + len - strip.size(), strip.c_str()) == 0))) { + std::string tword(word); + /* we have a match so add suffix */ + tword.replace(len - strip.size(), std::string::npos, appnd); + return mystrdup(tword.c_str()); + } + return NULL; +} + +inline char* SfxEntry::nextchar(char* p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.l.conds1 + MAXCONDLEN_1) + return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) + return NULL; + return *p ? 
p : NULL; + } + return NULL; +} + +inline int SfxEntry::test_condition(const char* st, const char* beg) { + const char* pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) + return 1; + char* p = c.conds; + st--; + int i = 1; + while (1) { + switch (*p) { + case '\0': + return 1; + case '[': + p = nextchar(p); + pos = st; + break; + case '^': + p = nextchar(p); + neg = true; + break; + case ']': + if (!neg && !ingroup) + return 0; + i++; + // skip the next character + if (!ingroup) { + for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--) + ; + st--; + } + pos = NULL; + neg = false; + ingroup = false; + p = nextchar(p); + if (st < beg && p) + return 0; // word <= condition + break; + case '.': + if (!pos) { + // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; + st--) + ; + if (st < beg) { // word <= condition + if (p) + return 0; + else + return 1; + } + if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character + st--; + if (st < beg) { // word <= condition + if (p) + return 0; + else + return 1; + } + } + break; + } + /* FALLTHROUGH */ + default: { + if (*st == *p) { + p = nextchar(p); + if ((opts & aeUTF8) && (*st & 0x80)) { + st--; + while (p && (st >= beg)) { + if (*p != *st) { + if (!pos) + return 0; + st = pos; + break; + } + // first byte of the UTF-8 multibyte character + if ((*p & 0xc0) != 0x80) + break; + p = nextchar(p); + st--; + } + if (pos && st != pos) { + if (neg) + return 0; + else if (i == numconds) + return 1; + ingroup = true; + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { + } + st--; + } + if (p && *p != ']') + p = nextchar(p); + } else if (pos) { + if (neg) + return 0; + else if (i == numconds) + return 1; + ingroup = true; + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { + } + // if (p && 
*p != ']') p = nextchar(p); + st--; + } + if (!pos) { + i++; + st--; + } + if (st < beg && p && *p != ']') + return 0; // word <= condition + } else if (pos) { // group + p = nextchar(p); + } else + return 0; + } + } + if (!p) + return 1; + } +} + +// see if this suffix is present in the word +struct hentry* SfxEntry::checkword(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + char** wlst, + int maxSug, + int* ns, + const FLAG cclass, + const FLAG needflag, + const FLAG badflag) { + struct hentry* he; // hash entry pointer + PfxEntry* ep = ppfx; + + // if this suffix is being cross checked with a prefix + // but it does not support cross products skip it + + if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + // the second condition is not enough for UTF-8 strings + // it checked in test_condition() + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + strip.size() >= numconds)) { + // generate new root word by removing suffix and adding + // back any characters that would have been stripped or + // or null terminating the shorter string + + std::string tmpstring(word, tmpl); + if (strip.size()) { + tmpstring.append(strip); + } + + const char* tmpword = tmpstring.c_str(); + const char* endword = tmpword + tmpstring.size(); + + // now make sure all of the conditions on characters + // are met. 
Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(endword, tmpword)) { +#ifdef SZOSZABLYA_POSSIBLE_ROOTS + fprintf(stdout, "%s %s %c\n", word, tmpword, aflag); +#endif + if ((he = pmyMgr->lookup(tmpword)) != NULL) { + do { + // check conditional suffix (enabled by prefix) + if ((TESTAFF(he->astr, aflag, he->alen) || + (ep && ep->getCont() && + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || + (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) || + // enabled by prefix + ((contclass) && + (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) && + // handle cont. class + ((!cclass) || + ((contclass) && TESTAFF(contclass, cclass, contclasslen))) && + // check only in compound homonyms (bad flags) + (!badflag || !TESTAFF(he->astr, badflag, he->alen)) && + // handle required flag + ((!needflag) || + (TESTAFF(he->astr, needflag, he->alen) || + ((contclass) && TESTAFF(contclass, needflag, contclasslen))))) + return he; + he = he->next_homonym; // check homonyms + } while (he); + + // obsolete stemming code (used only by the + // experimental SuffixMgr:suggest_pos_stems) + // store resulting root in wlst + } else if (wlst && (*ns < maxSug)) { + int cwrd = 1; + for (int k = 0; k < *ns; k++) + if (strcmp(tmpword, wlst[k]) == 0) { + cwrd = 0; + break; + } + if (cwrd) { + wlst[*ns] = mystrdup(tmpword); + if (wlst[*ns] == NULL) { + for (int j = 0; j < *ns; j++) + free(wlst[j]); + *ns = -1; + return NULL; + } + (*ns)++; + } + } + } + } + return NULL; +} + +// see if two-level suffix is present in the word: +// removes this suffix (appnd) from the end of word, adds back the strip +// string, tests the conditions and, if they hold, passes the resulting root +// to AffixMgr::suffix_check together with this entry's flag (aflag). +// word, len - the word to test and its length +// optflags - option bits; aeXPRODUCT requests a cross check with a prefix +// ppfx - the prefix being cross checked, or NULL +// needflag - passed through to suffix_check +// returns the hentry found by suffix_check, or NULL +struct hentry* SfxEntry::check_twosfx(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + const FLAG needflag) { + struct hentry* he; // hash entry pointer + PfxEntry* ep = ppfx; + + // if this suffix is being cross checked with a prefix + // but it does
not support cross products skip it + + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + strip.size() >= numconds)) { + // generate new root word by removing suffix and adding + // back any characters that would have been stripped + // or null terminating the shorter string + + std::string tmpword(word); + tmpword.resize(tmpl); + tmpword.append(strip); + tmpl += strip.size(); + + const char* beg = tmpword.c_str(); + const char* end = beg + tmpl; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then recall suffix_check + + if (test_condition(end, beg)) { + if (ppfx) { + // handle conditional suffix (prefix flag found in contclass): + // suffix_check is then called with no prefix and no option flags + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL, + (FLAG)aflag, needflag); + else + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, NULL, 0, + NULL, (FLAG)aflag, needflag); + } else { + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL, + (FLAG)aflag, needflag); + } + if (he) + return he; + } + } + return NULL; +} + +// see if two-level suffix is present in the word (morphological variant): +// builds the candidate root the same way as check_twosfx, but calls +// AffixMgr::suffix_check_morph and collects its output in a local buffer of +// MAXLNLEN chars. Returns a mystrdup() copy of that buffer, or NULL when +// nothing was collected. +char* SfxEntry::check_twosfx_morph(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + const FLAG needflag) { + PfxEntry* ep = ppfx; + char* st; + + char result[MAXLNLEN]; + + *result = '\0'; + + // if this suffix is being cross checked with a prefix + // but it does not support cross
products skip it + + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + int tmpl = len - appnd.size(); // length of tmpword + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + strip.size() >= numconds)) { + // generate new root word by removing suffix and adding + // back any characters that would have been stripped + // or null terminating the shorter string + + std::string tmpword(word); + tmpword.resize(tmpl); + tmpword.append(strip); + tmpl += strip.size(); + + const char* beg = tmpword.c_str(); + const char* end = beg + tmpl; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then recall suffix_check_morph + + if (test_condition(end, beg)) { + if (ppfx) { + // handle conditional suffix (prefix flag found in contclass): + // the prefix morph string, if any, is put in front of the result + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { + st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, + needflag); + if (st) { + if (ppfx->getMorph()) { + mystrcat(result, ppfx->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } + mystrcat(result, st, MAXLNLEN); + free(st); + mychomp(result); + } + } else { + st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag, + needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + mychomp(result); + } + } + } else { + st = + pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + mychomp(result); + } + } + if (*result) + return mystrdup(result); + } + } + return NULL; +} + +// Starting after he, walks the next_homonym chain and returns the first entry +// that passes the flag tests in the loop (aflag or the prefix's cont. class, +// cross product, cclass, needflag); returns NULL when the chain ends. +// he is dereferenced immediately, so it must not be NULL. +// get next homonym with same
affix +struct hentry* SfxEntry::get_next_homonym(struct hentry* he, + int optflags, + PfxEntry* ppfx, + const FLAG cclass, + const FLAG needflag) { + PfxEntry* ep = ppfx; + FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; + + while (he->next_homonym) { + he = he->next_homonym; + if ((TESTAFF(he->astr, aflag, he->alen) || + (ep && ep->getCont() && + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) || + // handle conditional suffix (prefix flag listed in this suffix's contclass) + ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) && + // handle cont. class + ((!cclass) || + ((contclass) && TESTAFF(contclass, cclass, contclasslen))) && + // handle required flag + ((!needflag) || + (TESTAFF(he->astr, needflag, he->alen) || + ((contclass) && TESTAFF(contclass, needflag, contclasslen))))) + return he; + } + return NULL; +} + +#if 0 + +Appendix: Understanding Affix Code + + +An affix is either a prefix or a suffix attached to root words to make +other words. + +Basically a Prefix or a Suffix is a set of AffEntry objects +which store information about the prefix or suffix along +with supporting routines to check if a word has a particular +prefix or suffix or a combination. + +The structure affentry is defined as follows: + +struct affentry +{ + unsigned short aflag; // ID used to represent the affix + std::string strip; // string to strip before adding affix + std::string appnd; // the affix string to add + char numconds; // the number of conditions that must be met + char opts; // flag: aeXPRODUCT- combine both prefix and suffix + char conds[SETSIZE]; // array which encodes the conditions to be met +}; + + +Here is a suffix borrowed from the en_US.aff file. This file +is whitespace delimited.
+ +SFX D Y 4 +SFX D 0 d e +SFX D y ied [^aeiou]y +SFX D 0 ed [^ey] +SFX D 0 ed [aeiou]y + +This information can be interpreted as follows: + +The first line has 4 fields + +Field +----- +1 SFX - indicates this is a suffix +2 D - is the name of the character flag which represents this suffix +3 Y - indicates it can be combined with prefixes (cross product) +4 4 - indicates that a sequence of 4 affentry structures is needed to + properly store the affix information + +The remaining lines describe the unique information for the 4 SfxEntry +objects that make up this affix. Each line can be interpreted +as follows: (note fields 1 and 2 serve as a check against line 1 info) + +Field +----- +1 SFX - indicates this is a suffix +2 D - is the name of the character flag for this affix +3 y - the string of chars to strip off before adding affix + (a 0 here indicates the NULL string) +4 ied - the string of affix characters to add +5 [^aeiou]y - the conditions which must be met before the affix + can be applied + +Field 5 is interesting. Since this is a suffix, field 5 tells us that +there are 2 conditions that must be met. The first condition is that +the next to the last character in the word must *NOT* be any of the +following "a", "e", "i", "o" or "u". The second condition is that +the last character of the word must be "y". + +So how can we encode this information concisely and be able to +test for both conditions in a fast manner? The answer is found +by studying the wonderful ispell code of Geoff Kuenning, et al. +(now available under a normal BSD license). + +If we set up a conds array of 256 bytes indexed (0 to 255) and access it +using a character (cast to an unsigned char) of a string, we have 8 bits +of information we can store about that character. Specifically we +could use each bit to say if that character is allowed in any of the +last (or first for prefixes) 8 characters of the word.
+ +Basically, each character at one end of the word (up to the number +of conditions) is used to index into the conds array and the resulting +value found there says whether that character is valid for a +specific character position in the word. + +For prefixes, it does this by setting bit 0 if that char is valid +in the first position, bit 1 if valid in the second position, and so on. + +If a bit is not set, then that char is not valid for that position in the +word. + +If working with suffixes, bit 0 is used for the character closest +to the front, bit 1 for the next character towards the end, ..., +with bit numconds-1 representing the last char at the end of the string. + +Note: since entries in the conds[] are 8 bits, only 8 conditions +(read: only 8 character positions) can be examined at one +end of a word (the beginning for prefixes and the end for suffixes). + +So to make this clearer, let's encode the conds array values for the +first two affentries for the suffix D described earlier. + + + For the first affentry: + numconds = 1 (only examine the last character) + + conds['e'] = (1 << 0) (the word must end in an E) + all others are 0 + + For the second affentry: + numconds = 2 (only examine the last two characters) + + conds[X] = conds[X] | (1 << 0) (aeiou are not allowed) + where X is all characters *but* a, e, i, o, or u + + + conds['y'] = (1 << 1) (the last char must be a y) + all other bits for all other entries in the conds array are zero + +#endif diff --git a/libs/hunspell/src/affentry.hxx b/libs/hunspell/src/affentry.hxx new file mode 100644 index 000000000..6311d83ff --- /dev/null +++ b/libs/hunspell/src/affentry.hxx @@ -0,0 +1,232 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _AFFIX_HXX_ +#define _AFFIX_HXX_ + +#include "hunvisapi.h" + +#include "atypes.hxx" +#include "baseaffix.hxx" +#include "affixmgr.hxx" + +/* A Prefix Entry: an AffEntry plus an AffixMgr pointer (pmyMgr) and links to other PfxEntry objects (next, nexteq, nextne, flgnxt) */ + +class LIBHUNSPELL_DLL_EXPORTED PfxEntry : protected AffEntry { + private: /* copy constructor and assignment are declared private */ + PfxEntry(const PfxEntry&); + PfxEntry& operator=(const PfxEntry&); + + private: + AffixMgr* pmyMgr; + + PfxEntry* next; + PfxEntry* nexteq; + PfxEntry* nextne; + PfxEntry* flgnxt; + + public: + PfxEntry(AffixMgr* pmgr, affentry* dp); + ~PfxEntry(); + + inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); } /* true when the aeXPRODUCT bit is set in opts */ + struct hentry* checkword(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + + struct hentry* check_twosfx(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + + char* check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + + char* check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + + inline FLAG getFlag() { return aflag; } + inline const char* getKey() { return appnd.c_str(); } /* the prefix string itself */ + char* add(const char* word, size_t len); + + inline short getKeyLen() { return appnd.size(); } /* size of appnd, converted to short */ + + inline const char* getMorph() { return morphcode; } + + inline const unsigned short* getCont() { return contclass; } + inline short getContLen() { return contclasslen; } + + inline PfxEntry* getNext() { return next; } + inline PfxEntry* getNextNE() { return nextne; } + inline PfxEntry* getNextEQ() { return nexteq; } + inline PfxEntry* getFlgNxt() { return flgnxt; } + + inline void setNext(PfxEntry* ptr) { next = ptr; } + inline void setNextNE(PfxEntry* ptr) { nextne = ptr; } + inline void setNextEQ(PfxEntry* ptr) { nexteq = ptr; } + inline void setFlgNxt(PfxEntry* ptr) { flgnxt = ptr; } + + inline char* nextchar(char* p); + inline int test_condition(const char* st); +}; + +/* A Suffix Entry: an AffEntry plus an AffixMgr pointer (pmyMgr), the string rappnd returned by getKey(), list links (next, nexteq, nextne, flgnxt) and the l_morph / r_morph / eq_morph links */ + +class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry { +
private: /* copy constructor and assignment are declared private */ + SfxEntry(const SfxEntry&); + SfxEntry& operator=(const SfxEntry&); + + private: + AffixMgr* pmyMgr; + std::string rappnd; + + SfxEntry* next; + SfxEntry* nexteq; + SfxEntry* nextne; + SfxEntry* flgnxt; + + SfxEntry* l_morph; + SfxEntry* r_morph; + SfxEntry* eq_morph; + + public: + SfxEntry(AffixMgr* pmgr, affentry* dp); + ~SfxEntry(); + + inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); } /* true when the aeXPRODUCT bit is set in opts */ + struct hentry* checkword(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + char** wlst, + int maxSug, + int* ns, + const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, + const FLAG badflag = FLAG_NULL); + + struct hentry* check_twosfx(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + const FLAG needflag = FLAG_NULL); + + char* check_twosfx_morph(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + const FLAG needflag = FLAG_NULL); + struct hentry* get_next_homonym(struct hentry* he); + struct hentry* get_next_homonym(struct hentry* word, /* this parameter is named he in the definition */ + int optflags, + PfxEntry* ppfx, + const FLAG cclass, + const FLAG needflag); + + inline FLAG getFlag() { return aflag; } + inline const char* getKey() { return rappnd.c_str(); } /* returns rappnd, not appnd (see getAffix) */ + char* add(const char* word, size_t len); + + inline const char* getMorph() { return morphcode; } + + inline const unsigned short* getCont() { return contclass; } + inline short getContLen() { return contclasslen; } + inline const char* getAffix() { return appnd.c_str(); } + + inline short getKeyLen() { return appnd.size(); } /* size of appnd, although getKey() returns rappnd */ + + inline SfxEntry* getNext() { return next; } + inline SfxEntry* getNextNE() { return nextne; } + inline SfxEntry* getNextEQ() { return nexteq; } + + inline SfxEntry* getLM() { return l_morph; } + inline SfxEntry* getRM() { return r_morph; } + inline SfxEntry* getEQM() { return eq_morph; } + inline SfxEntry* getFlgNxt() { return flgnxt; } + + inline void setNext(SfxEntry* ptr) { next = ptr; } + inline void setNextNE(SfxEntry* ptr) { nextne = ptr; } + inline void
setNextEQ(SfxEntry* ptr) { nexteq = ptr; } + inline void setFlgNxt(SfxEntry* ptr) { flgnxt = ptr; } + + inline char* nextchar(char* p); + inline int test_condition(const char* st, const char* begin); /* the check_twosfx* methods pass the end and the start of the candidate root */ +}; + +#endif diff --git a/libs/hunspell/src/affixmgr.cxx b/libs/hunspell/src/affixmgr.cxx new file mode 100644 index 000000000..d6bb67798 --- /dev/null +++ b/libs/hunspell/src/affixmgr.cxx @@ -0,0 +1,5117 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above.
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. 
HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include <algorithm> +#include <limits> +#include <string> +#include <vector> + +#include "affixmgr.hxx" +#include "affentry.hxx" +#include "langnum.hxx" + +#include "csutil.hxx" + +AffixMgr::AffixMgr(const char* affpath, + HashMgr** ptr, + int* md, + const char* key) { + // register hash manager and load affix data from aff file: + // ptr is kept as alldic (ptr[0] becomes pHMgr) and md as maxdic; every + // option is reset to its default before parse_file(affpath, key) runs + pHMgr = ptr[0]; + alldic = ptr; + maxdic = md; + keystring = NULL; + trystring = NULL; + encoding = NULL; + csconv = NULL; + utf8 = 0; + complexprefixes = 0; + maptable = NULL; + nummap = 0; + breaktable = NULL; + numbreak = -1; + reptable = NULL; + numrep = 0; + iconvtable = NULL; + oconvtable = NULL; + checkcpdtable = NULL; + // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) + simplifiedcpd = 0; + numcheckcpd = 0; + defcpdtable = NULL; + numdefcpd = 0; + phone = NULL; + compoundflag = FLAG_NULL; // permits word in compound forms + compoundbegin = FLAG_NULL; // may be first word in compound forms + compoundmiddle = FLAG_NULL; // may be middle word in compound forms + compoundend = FLAG_NULL; // may be last word in compound forms + compoundroot = FLAG_NULL; // compound word signing flag + compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word + compoundforbidflag = FLAG_NULL; // compound forbidden flag for suffixed word + compoundmoresuffixes = 0; // allow more suffixes within compound words +
checkcompounddup = 0; // forbid double words in compounds + checkcompoundrep = 0; // forbid bad compounds (may be non compound word with + // a REP substitution) + checkcompoundcase = + 0; // forbid upper and lowercase combinations at word bounds + checkcompoundtriple = 0; // forbid compounds with triple letters + simplifiedtriple = 0; // allow simplified triple letters in compounds + // (Schiff+fahrt -> Schiffahrt) + forbiddenword = FORBIDDENWORD; // forbidden word signing flag + nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag + nongramsuggest = FLAG_NULL; + lang = NULL; // language + langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) + needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes + cpdwordmax = -1; // default: unlimited wordcount in compound words + cpdmin = -1; // undefined + cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words + cpdvowels = NULL; // vowels (for calculating of Hungarian compounding limit, + // O(n) search! XXX) + cpdvowels_utf16 = + NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search) + cpdvowels_utf16_len = 0; // vowels + pfxappnd = NULL; // previous prefix for counting syllables of the prefix BUG + sfxappnd = NULL; // previous suffix for counting syllables of the suffix BUG + sfxextra = 0; // modifier for syllable count of sfxappnd BUG + cpdsyllablenum = NULL; // syllable count incrementing flag + checknum = 0; // checking numbers, and word with numbers + wordchars = NULL; // letters + spec. word characters + ignorechars = NULL; // ignored characters (filled in from the IGNORE line) + version = NULL; // affix and dictionary file version string + havecontclass = 0; // flags of possible continuing classes (double affix) + // LEMMA_PRESENT: do not put the root into the morphological output; the lemma + // is present in the morphological description in the dictionary file. It's + // often combined with PSEUDOROOT.
+ lemma_present = FLAG_NULL; + circumfix = FLAG_NULL; + onlyincompound = FLAG_NULL; + maxngramsugs = -1; // undefined + maxdiff = -1; // undefined + onlymaxdiff = 0; + maxcpdsugs = -1; // undefined + nosplitsugs = 0; + sugswithdots = 0; + keepcase = 0; + forceucase = 0; + warn = 0; + forbidwarn = 0; + checksharps = 0; + substandard = FLAG_NULL; + fullstrip = 0; + + sfx = NULL; + pfx = NULL; + + for (int i = 0; i < SETSIZE; i++) { + pStart[i] = NULL; + sStart[i] = NULL; + pFlag[i] = NULL; + sFlag[i] = NULL; + } + + for (int j = 0; j < CONTSIZE; j++) { + contclasses[j] = 0; + } + + // load the affix file; a failure is only reported as a warning and + // construction continues + if (parse_file(affpath, key)) { + HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath); + } + + // cpdmin still -1 (no COMPOUNDMIN line was parsed): use MINCPDLEN + if (cpdmin == -1) + cpdmin = MINCPDLEN; +} + +// Destructor: deletes every PfxEntry/SfxEntry reachable from pStart[]/sStart[] +// through getNext(), frees the strings and tables owned by the manager and +// resets the related counters and pointers. +AffixMgr::~AffixMgr() { + // pass through linked prefix entries and clean up + for (int i = 0; i < SETSIZE; i++) { + pFlag[i] = NULL; + PfxEntry* ptr = pStart[i]; + PfxEntry* nptr = NULL; + while (ptr) { + nptr = ptr->getNext(); + delete (ptr); + ptr = nptr; + nptr = NULL; + } + } + + // pass through linked suffix entries and clean up + for (int j = 0; j < SETSIZE; j++) { + sFlag[j] = NULL; + SfxEntry* ptr = sStart[j]; + SfxEntry* nptr = NULL; + while (ptr) { + nptr = ptr->getNext(); + delete (ptr); + ptr = nptr; + nptr = NULL; + } + sStart[j] = NULL; + } + + if (keystring) + free(keystring); + keystring = NULL; + if (trystring) + free(trystring); + trystring = NULL; + if (encoding) + free(encoding); + encoding = NULL; + if (maptable) { + for (int j = 0; j < nummap; j++) { + for (int k = 0; k < maptable[j].len; k++) { + if (maptable[j].set[k]) + free(maptable[j].set[k]); + } + free(maptable[j].set); + maptable[j].set = NULL; + maptable[j].len = 0; + } + free(maptable); + maptable = NULL; + } + nummap = 0; + // numbreak starts at -1 in the constructor, so this loop is skipped then + if (breaktable) { + for (int j = 0; j < numbreak; j++) { + if (breaktable[j]) + free(breaktable[j]); + breaktable[j] = NULL; + } + free(breaktable); + breaktable = NULL; + } + numbreak = 0; + if (reptable) { + for (int j = 0; j <
numrep; j++) { + free(reptable[j].pattern); + free(reptable[j].pattern2); + } + free(reptable); + reptable = NULL; + } + if (iconvtable) + delete iconvtable; + if (oconvtable) + delete oconvtable; + if (phone && phone->rules) { /* NOTE(review): phone itself is freed only when phone->rules is non-NULL */ + for (int j = 0; j < phone->num + 1; j++) { + free(phone->rules[j * 2]); + free(phone->rules[j * 2 + 1]); + } + free(phone->rules); + free(phone); + phone = NULL; + } + + if (defcpdtable) { + for (int j = 0; j < numdefcpd; j++) { + free(defcpdtable[j].def); + defcpdtable[j].def = NULL; + } + free(defcpdtable); + defcpdtable = NULL; + } + numrep = 0; + if (checkcpdtable) { + for (int j = 0; j < numcheckcpd; j++) { + free(checkcpdtable[j].pattern); + free(checkcpdtable[j].pattern2); + free(checkcpdtable[j].pattern3); + checkcpdtable[j].pattern = NULL; + checkcpdtable[j].pattern2 = NULL; + checkcpdtable[j].pattern3 = NULL; + } + free(checkcpdtable); + checkcpdtable = NULL; + } + numcheckcpd = 0; + FREE_FLAG(compoundflag); + FREE_FLAG(compoundbegin); + FREE_FLAG(compoundmiddle); + FREE_FLAG(compoundend); + FREE_FLAG(compoundpermitflag); + FREE_FLAG(compoundforbidflag); + FREE_FLAG(compoundroot); + FREE_FLAG(forbiddenword); + FREE_FLAG(nosuggest); + FREE_FLAG(nongramsuggest); + FREE_FLAG(needaffix); + FREE_FLAG(lemma_present); + FREE_FLAG(circumfix); + FREE_FLAG(onlyincompound); + + cpdwordmax = 0; + pHMgr = NULL; + cpdmin = 0; + cpdmaxsyllable = 0; + if (cpdvowels) + free(cpdvowels); + if (cpdvowels_utf16) + free(cpdvowels_utf16); + if (cpdsyllablenum) + free(cpdsyllablenum); + free_utf_tbl(); + if (lang) + free(lang); + if (wordchars) + free(wordchars); + if (ignorechars) + free(ignorechars); + if (version) + free(version); + checknum = 0; /* csconv is deleted below only when MOZILLA_CLIENT is defined */ +#ifdef MOZILLA_CLIENT + delete[] csconv; +#endif +} + +void AffixMgr::finishFileMgr(FileMgr* afflst) { /* deletes the affix file reader, then converts the affix trees to sorted lists; parse_file also calls this before returning an error */ + delete afflst; + + // convert affix trees to sorted list + process_pfx_tree_to_list(); + process_sfx_tree_to_list(); +} + +// read in aff file and build up prefix and suffix entry objects +int
AffixMgr::parse_file(const char* affpath, const char* key) { + char* line; // io buffers + char ft; // affix type + + // checking flag duplication + char dupflags[CONTSIZE]; + char dupflags_ini = 1; + + // first line indicator for removing byte order mark + int firstline = 1; + + // open the affix file + FileMgr* afflst = new FileMgr(affpath, key); + if (!afflst) { + HUNSPELL_WARNING( + stderr, "error: could not open affix description file %s\n", affpath); + return 1; + } + + // step one is to parse the affix file building up the internal + // affix data structures + + // read in each line ignoring any that do not + // start with a known line type indicator + while ((line = afflst->getline()) != NULL) { + mychomp(line); + + /* remove byte order mark */ + if (firstline) { + firstline = 0; + // Affix file begins with byte order mark: possible incompatibility with + // old Hunspell versions + if (strncmp(line, "\xEF\xBB\xBF", 3) == 0) { + memmove(line, line + 3, strlen(line + 3) + 1); + } + } + + /* parse in the keyboard string */ + if (strncmp(line, "KEY", 3) == 0) { + if (parse_string(line, &keystring, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the try string */ + if (strncmp(line, "TRY", 3) == 0) { + if (parse_string(line, &trystring, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the name of the character set used by the .dict and .aff */ + if (strncmp(line, "SET", 3) == 0) { + if (parse_string(line, &encoding, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + if (strcmp(encoding, "UTF-8") == 0) { + utf8 = 1; +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + if (initialize_utf_tbl()) { + finishFileMgr(afflst); + return 1; + } +#endif +#endif + } + } + + /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left + * writing system */ + if (strncmp(line, "COMPLEXPREFIXES", 15) == 0) + complexprefixes = 1; + + /* parse in the flag used by the controlled 
compound words */ + if (strncmp(line, "COMPOUNDFLAG", 12) == 0) { + if (parse_flag(line, &compoundflag, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by compound words */ + if (strncmp(line, "COMPOUNDBEGIN", 13) == 0) { + if (complexprefixes) { + if (parse_flag(line, &compoundend, afflst)) { + finishFileMgr(afflst); + return 1; + } + } else { + if (parse_flag(line, &compoundbegin, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + } + + /* parse in the flag used by compound words */ + if (strncmp(line, "COMPOUNDMIDDLE", 14) == 0) { + if (parse_flag(line, &compoundmiddle, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + /* parse in the flag used by compound words */ + if (strncmp(line, "COMPOUNDEND", 11) == 0) { + if (complexprefixes) { + if (parse_flag(line, &compoundbegin, afflst)) { + finishFileMgr(afflst); + return 1; + } + } else { + if (parse_flag(line, &compoundend, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + } + + /* parse in the data used by compound_check() method */ + if (strncmp(line, "COMPOUNDWORDMAX", 15) == 0) { + if (parse_num(line, &cpdwordmax, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag sign compounds in dictionary */ + if (strncmp(line, "COMPOUNDROOT", 12) == 0) { + if (parse_flag(line, &compoundroot, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by compound_check() method */ + if (strncmp(line, "COMPOUNDPERMITFLAG", 18) == 0) { + if (parse_flag(line, &compoundpermitflag, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by compound_check() method */ + if (strncmp(line, "COMPOUNDFORBIDFLAG", 18) == 0) { + if (parse_flag(line, &compoundforbidflag, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "COMPOUNDMORESUFFIXES", 20) == 0) { + compoundmoresuffixes = 1; + } + + if (strncmp(line, "CHECKCOMPOUNDDUP", 16) == 0) { + checkcompounddup = 1; + 
} + + if (strncmp(line, "CHECKCOMPOUNDREP", 16) == 0) { + checkcompoundrep = 1; + } + + if (strncmp(line, "CHECKCOMPOUNDTRIPLE", 19) == 0) { + checkcompoundtriple = 1; + } + + if (strncmp(line, "SIMPLIFIEDTRIPLE", 16) == 0) { + simplifiedtriple = 1; + } + + if (strncmp(line, "CHECKCOMPOUNDCASE", 17) == 0) { + checkcompoundcase = 1; + } + + if (strncmp(line, "NOSUGGEST", 9) == 0) { + if (parse_flag(line, &nosuggest, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "NONGRAMSUGGEST", 14) == 0) { + if (parse_flag(line, &nongramsuggest, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by forbidden words */ + if (strncmp(line, "FORBIDDENWORD", 13) == 0) { + if (parse_flag(line, &forbiddenword, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by forbidden words */ + if (strncmp(line, "LEMMA_PRESENT", 13) == 0) { + if (parse_flag(line, &lemma_present, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by circumfixes */ + if (strncmp(line, "CIRCUMFIX", 9) == 0) { + if (parse_flag(line, &circumfix, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by fogemorphemes */ + if (strncmp(line, "ONLYINCOMPOUND", 14) == 0) { + if (parse_flag(line, &onlyincompound, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by `needaffixs' */ + if (strncmp(line, "PSEUDOROOT", 10) == 0) { + if (parse_flag(line, &needaffix, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by `needaffixs' */ + if (strncmp(line, "NEEDAFFIX", 9) == 0) { + if (parse_flag(line, &needaffix, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the minimal length for words in compounds */ + if (strncmp(line, "COMPOUNDMIN", 11) == 0) { + if (parse_num(line, &cpdmin, afflst)) { + finishFileMgr(afflst); + return 1; + } + if (cpdmin < 1) + cpdmin = 1; + } + + /* 
parse in the max. words and syllables in compounds */ + if (strncmp(line, "COMPOUNDSYLLABLE", 16) == 0) { + if (parse_cpdsyllable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by compound_check() method */ + if (strncmp(line, "SYLLABLENUM", 11) == 0) { + if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by the controlled compound words */ + if (strncmp(line, "CHECKNUM", 8) == 0) { + checknum = 1; + } + + /* parse in the extra word characters */ + if (strncmp(line, "WORDCHARS", 9) == 0) { + if (!parse_array(line, &wordchars, wordchars_utf16, + utf8, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the ignored characters (for example, Arabic optional diacretics + * charachters */ + if (strncmp(line, "IGNORE", 6) == 0) { + if (!parse_array(line, &ignorechars, ignorechars_utf16, + utf8, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the typical fault correcting table */ + if (strncmp(line, "REP", 3) == 0) { + if (parse_reptable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the input conversion table */ + if (strncmp(line, "ICONV", 5) == 0) { + if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the input conversion table */ + if (strncmp(line, "OCONV", 5) == 0) { + if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the phonetic translation table */ + if (strncmp(line, "PHONE", 5) == 0) { + if (parse_phonetable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the checkcompoundpattern table */ + if (strncmp(line, "CHECKCOMPOUNDPATTERN", 20) == 0) { + if (parse_checkcpdtable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the defcompound 
table */ + if (strncmp(line, "COMPOUNDRULE", 12) == 0) { + if (parse_defcpdtable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the related character map table */ + if (strncmp(line, "MAP", 3) == 0) { + if (parse_maptable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the word breakpoints table */ + if (strncmp(line, "BREAK", 5) == 0) { + if (parse_breaktable(line, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the language for language specific codes */ + if (strncmp(line, "LANG", 4) == 0) { + if (parse_string(line, &lang, afflst->getlinenum())) { + finishFileMgr(afflst); + return 1; + } + langnum = get_lang_num(lang); + } + + if (strncmp(line, "VERSION", 7) == 0) { + for (line = line + 7; *line == ' ' || *line == '\t'; line++) + ; + version = mystrdup(line); + } + + if (strncmp(line, "MAXNGRAMSUGS", 12) == 0) { + if (parse_num(line, &maxngramsugs, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "ONLYMAXDIFF", 11) == 0) + onlymaxdiff = 1; + + if (strncmp(line, "MAXDIFF", 7) == 0) { + if (parse_num(line, &maxdiff, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "MAXCPDSUGS", 10) == 0) { + if (parse_num(line, &maxcpdsugs, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "NOSPLITSUGS", 11) == 0) { + nosplitsugs = 1; + } + + if (strncmp(line, "FULLSTRIP", 9) == 0) { + fullstrip = 1; + } + + if (strncmp(line, "SUGSWITHDOTS", 12) == 0) { + sugswithdots = 1; + } + + /* parse in the flag used by forbidden words */ + if (strncmp(line, "KEEPCASE", 8) == 0) { + if (parse_flag(line, &keepcase, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by `forceucase' */ + if (strncmp(line, "FORCEUCASE", 10) == 0) { + if (parse_flag(line, &forceucase, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + /* parse in the flag used by `warn' */ + if (strncmp(line, "WARN", 
4) == 0) { + if (parse_flag(line, &warn, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "FORBIDWARN", 10) == 0) { + forbidwarn = 1; + } + + /* parse in the flag used by the affix generator */ + if (strncmp(line, "SUBSTANDARD", 11) == 0) { + if (parse_flag(line, &substandard, afflst)) { + finishFileMgr(afflst); + return 1; + } + } + + if (strncmp(line, "CHECKSHARPS", 11) == 0) { + checksharps = 1; + } + + /* parse this affix: P - prefix, S - suffix */ + ft = ' '; + if (strncmp(line, "PFX", 3) == 0) + ft = complexprefixes ? 'S' : 'P'; + if (strncmp(line, "SFX", 3) == 0) + ft = complexprefixes ? 'P' : 'S'; + if (ft != ' ') { + if (dupflags_ini) { + memset(dupflags, 0, sizeof(dupflags)); + dupflags_ini = 0; + } + if (parse_affix(line, ft, afflst, dupflags)) { + finishFileMgr(afflst); + return 1; + } + } + } + + finishFileMgr(afflst); + // affix trees are sorted now + + // now we can speed up performance greatly taking advantage of the + // relationship between the affixes and the idea of "subsets". + + // View each prefix as a potential leading subset of another and view + // each suffix (reversed) as a potential trailing subset of another. + + // To illustrate this relationship if we know the prefix "ab" is found in the + // word to examine, only prefixes that "ab" is a leading subset of need be + // examined. + // Furthermore is "ab" is not present then none of the prefixes that "ab" is + // is a subset need be examined. + // The same argument goes for suffix string that are reversed. + + // Then to top this off why not examine the first char of the word to quickly + // limit the set of prefixes to examine (i.e. the prefixes to examine must + // be leading supersets of the first character of the word (if they exist) + + // To take advantage of this "subset" relationship, we need to add two links + // from entry. 
One to take next if the current prefix is found (call it + // nexteq) + // and one to take next if the current prefix is not found (call it nextne). + + // Since we have built ordered lists, all that remains is to properly + // initialize + // the nextne and nexteq pointers that relate them + + process_pfx_order(); + process_sfx_order(); + + /* get encoding for CHECKCOMPOUNDCASE */ + if (!utf8) { + char* enc = get_encoding(); + csconv = get_current_cs(enc); + free(enc); + enc = NULL; + + std::string expw; + if (wordchars) { + expw.assign(wordchars); + free(wordchars); + } + + for (int i = 0; i <= 255; i++) { + if ((csconv[i].cupper != csconv[i].clower) && + (expw.find((char)i) == std::string::npos)) { + expw.push_back((char)i); + } + } + + wordchars = mystrdup(expw.c_str()); + } + + // default BREAK definition + if (numbreak == -1) { + breaktable = (char**)malloc(sizeof(char*) * 3); + if (!breaktable) + return 1; + breaktable[0] = mystrdup("-"); + breaktable[1] = mystrdup("^-"); + breaktable[2] = mystrdup("-$"); + if (breaktable[0] && breaktable[1] && breaktable[2]) + numbreak = 3; + } + return 0; +} + +// we want to be able to quickly access prefix information +// both by prefix flag, and sorted by prefix string itself +// so we need to set up two indexes + +int AffixMgr::build_pfxtree(PfxEntry* pfxptr) { + PfxEntry* ptr; + PfxEntry* pptr; + PfxEntry* ep = pfxptr; + + // get the right starting points + const char* key = ep->getKey(); + const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF); + + // first index by flag which must exist + ptr = pFlag[flg]; + ep->setFlgNxt(ptr); + pFlag[flg] = ep; + + // handle the special case of null affix string + if (strlen(key) == 0) { + // always inset them at head of list at element 0 + ptr = pStart[0]; + ep->setNext(ptr); + pStart[0] = ep; + return 0; + } + + // now handle the normal case + ep->setNextEQ(NULL); + ep->setNextNE(NULL); + + unsigned char sp = *((const unsigned char*)key); + ptr = pStart[sp]; + + // 
handle the first insert + if (!ptr) { + pStart[sp] = ep; + return 0; + } + + // otherwise use binary tree insertion so that a sorted + // list can easily be generated later + pptr = NULL; + for (;;) { + pptr = ptr; + if (strcmp(ep->getKey(), ptr->getKey()) <= 0) { + ptr = ptr->getNextEQ(); + if (!ptr) { + pptr->setNextEQ(ep); + break; + } + } else { + ptr = ptr->getNextNE(); + if (!ptr) { + pptr->setNextNE(ep); + break; + } + } + } + return 0; +} + +// we want to be able to quickly access suffix information +// both by suffix flag, and sorted by the reverse of the +// suffix string itself; so we need to set up two indexes +int AffixMgr::build_sfxtree(SfxEntry* sfxptr) { + SfxEntry* ptr; + SfxEntry* pptr; + SfxEntry* ep = sfxptr; + + /* get the right starting point */ + const char* key = ep->getKey(); + const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF); + + // first index by flag which must exist + ptr = sFlag[flg]; + ep->setFlgNxt(ptr); + sFlag[flg] = ep; + + // next index by affix string + + // handle the special case of null affix string + if (strlen(key) == 0) { + // always inset them at head of list at element 0 + ptr = sStart[0]; + ep->setNext(ptr); + sStart[0] = ep; + return 0; + } + + // now handle the normal case + ep->setNextEQ(NULL); + ep->setNextNE(NULL); + + unsigned char sp = *((const unsigned char*)key); + ptr = sStart[sp]; + + // handle the first insert + if (!ptr) { + sStart[sp] = ep; + return 0; + } + + // otherwise use binary tree insertion so that a sorted + // list can easily be generated later + pptr = NULL; + for (;;) { + pptr = ptr; + if (strcmp(ep->getKey(), ptr->getKey()) <= 0) { + ptr = ptr->getNextEQ(); + if (!ptr) { + pptr->setNextEQ(ep); + break; + } + } else { + ptr = ptr->getNextNE(); + if (!ptr) { + pptr->setNextNE(ep); + break; + } + } + } + return 0; +} + +// convert from binary tree to sorted list +int AffixMgr::process_pfx_tree_to_list() { + for (int i = 1; i < SETSIZE; i++) { + pStart[i] = 
process_pfx_in_order(pStart[i], NULL); + } + return 0; +} + +PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) { + if (ptr) { + nptr = process_pfx_in_order(ptr->getNextNE(), nptr); + ptr->setNext(nptr); + nptr = process_pfx_in_order(ptr->getNextEQ(), ptr); + } + return nptr; +} + +// convert from binary tree to sorted list +int AffixMgr::process_sfx_tree_to_list() { + for (int i = 1; i < SETSIZE; i++) { + sStart[i] = process_sfx_in_order(sStart[i], NULL); + } + return 0; +} + +SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) { + if (ptr) { + nptr = process_sfx_in_order(ptr->getNextNE(), nptr); + ptr->setNext(nptr); + nptr = process_sfx_in_order(ptr->getNextEQ(), ptr); + } + return nptr; +} + +// reinitialize the PfxEntry links NextEQ and NextNE to speed searching +// using the idea of leading subsets this time +int AffixMgr::process_pfx_order() { + PfxEntry* ptr; + + // loop through each prefix list starting point + for (int i = 1; i < SETSIZE; i++) { + ptr = pStart[i]; + + // look through the remainder of the list + // and find next entry with affix that + // the current one is not a subset of + // mark that as destination for NextNE + // use next in list that you are a subset + // of as NextEQ + + for (; ptr != NULL; ptr = ptr->getNext()) { + PfxEntry* nptr = ptr->getNext(); + for (; nptr != NULL; nptr = nptr->getNext()) { + if (!isSubset(ptr->getKey(), nptr->getKey())) + break; + } + ptr->setNextNE(nptr); + ptr->setNextEQ(NULL); + if ((ptr->getNext()) && + isSubset(ptr->getKey(), (ptr->getNext())->getKey())) + ptr->setNextEQ(ptr->getNext()); + } + + // now clean up by adding smart search termination strings: + // if you are already a superset of the previous prefix + // but not a subset of the next, search can end here + // so set NextNE properly + + ptr = pStart[i]; + for (; ptr != NULL; ptr = ptr->getNext()) { + PfxEntry* nptr = ptr->getNext(); + PfxEntry* mptr = NULL; + for (; nptr != NULL; nptr = nptr->getNext()) 
{ + if (!isSubset(ptr->getKey(), nptr->getKey())) + break; + mptr = nptr; + } + if (mptr) + mptr->setNextNE(NULL); + } + } + return 0; +} + +// initialize the SfxEntry links NextEQ and NextNE to speed searching +// using the idea of leading subsets this time +int AffixMgr::process_sfx_order() { + SfxEntry* ptr; + + // loop through each prefix list starting point + for (int i = 1; i < SETSIZE; i++) { + ptr = sStart[i]; + + // look through the remainder of the list + // and find next entry with affix that + // the current one is not a subset of + // mark that as destination for NextNE + // use next in list that you are a subset + // of as NextEQ + + for (; ptr != NULL; ptr = ptr->getNext()) { + SfxEntry* nptr = ptr->getNext(); + for (; nptr != NULL; nptr = nptr->getNext()) { + if (!isSubset(ptr->getKey(), nptr->getKey())) + break; + } + ptr->setNextNE(nptr); + ptr->setNextEQ(NULL); + if ((ptr->getNext()) && + isSubset(ptr->getKey(), (ptr->getNext())->getKey())) + ptr->setNextEQ(ptr->getNext()); + } + + // now clean up by adding smart search termination strings: + // if you are already a superset of the previous suffix + // but not a subset of the next, search can end here + // so set NextNE properly + + ptr = sStart[i]; + for (; ptr != NULL; ptr = ptr->getNext()) { + SfxEntry* nptr = ptr->getNext(); + SfxEntry* mptr = NULL; + for (; nptr != NULL; nptr = nptr->getNext()) { + if (!isSubset(ptr->getKey(), nptr->getKey())) + break; + mptr = nptr; + } + if (mptr) + mptr->setNextNE(NULL); + } + } + return 0; +} + +// add flags to the result for dictionary debugging +void AffixMgr::debugflag(char* result, unsigned short flag) { + char* st = encode_flag(flag); + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_FLAG, MAXLNLEN); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } +} + +// add flags to the result for dictionary debugging +std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) { + char* st = encode_flag(flag); + 
result.append(" "); + result.append(MORPH_FLAG); + if (st) { + result.append(st); + free(st); + } + return result; +} + +// calculate the character length of the condition +int AffixMgr::condlen(const char* st) { + int l = 0; + bool group = false; + for (; *st; st++) { + if (*st == '[') { + group = true; + l++; + } else if (*st == ']') + group = false; + else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) + l++; + } + return l; +} + +int AffixMgr::encodeit(affentry& entry, const char* cs) { + if (strcmp(cs, ".") != 0) { + entry.numconds = (char)condlen(cs); + // coverity[buffer_size_warning] - deliberate use of lack of end of conds + // padded by strncpy as long condition flag + strncpy(entry.c.conds, cs, MAXCONDLEN); + if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { + entry.opts += aeLONGCOND; + entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); + if (!entry.c.l.conds2) + return 1; + } + } else { + entry.numconds = 0; + entry.c.conds[0] = '\0'; + } + return 0; +} + +// return 1 if s1 is a leading subset of s2 (dots are for infixes) +inline int AffixMgr::isSubset(const char* s1, const char* s2) { + while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { + s1++; + s2++; + } + return (*s1 == '\0'); +} + +// check word for prefixes +struct hentry* AffixMgr::prefix_check(const char* word, + int len, + char in_compound, + const FLAG needflag) { + struct hentry* rv = NULL; + + pfx = NULL; + pfxappnd = NULL; + sfxappnd = NULL; + sfxextra = 0; + + // first handle the special case of 0 length prefixes + PfxEntry* pe = pStart[0]; + while (pe) { + if ( + // fogemorpheme + ((in_compound != IN_CPD_NOT) || + !(pe->getCont() && + (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) && + // permit prefixes in compounds + ((in_compound != IN_CPD_END) || + (pe->getCont() && + (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) { + // check prefix + rv = pe->checkword(word, len, in_compound, needflag); + if (rv) { + pfx = pe; // BUG: 
pfx not stateless + return rv; + } + } + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char*)word); + PfxEntry* pptr = pStart[sp]; + + while (pptr) { + if (isSubset(pptr->getKey(), word)) { + if ( + // fogemorpheme + ((in_compound != IN_CPD_NOT) || + !(pptr->getCont() && + (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) && + // permit prefixes in compounds + ((in_compound != IN_CPD_END) || + (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag, + pptr->getContLen()))))) { + // check prefix + rv = pptr->checkword(word, len, in_compound, needflag); + if (rv) { + pfx = pptr; // BUG: pfx not stateless + return rv; + } + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } + + return NULL; +} + +// check word for prefixes +struct hentry* AffixMgr::prefix_check_twosfx(const char* word, + int len, + char in_compound, + const FLAG needflag) { + struct hentry* rv = NULL; + + pfx = NULL; + sfxappnd = NULL; + sfxextra = 0; + + // first handle the special case of 0 length prefixes + PfxEntry* pe = pStart[0]; + + while (pe) { + rv = pe->check_twosfx(word, len, in_compound, needflag); + if (rv) + return rv; + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char*)word); + PfxEntry* pptr = pStart[sp]; + + while (pptr) { + if (isSubset(pptr->getKey(), word)) { + rv = pptr->check_twosfx(word, len, in_compound, needflag); + if (rv) { + pfx = pptr; + return rv; + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } + + return NULL; +} + +// check word for prefixes +char* AffixMgr::prefix_check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + + char result[MAXLNLEN]; + result[0] = '\0'; + + pfx = NULL; + sfxappnd = NULL; + sfxextra = 0; + + // first handle the special case of 0 length prefixes + PfxEntry* pe = pStart[0]; + while (pe) { + char* st = pe->check_morph(word, len, 
in_compound, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + // if (rv) return rv; + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char*)word); + PfxEntry* pptr = pStart[sp]; + + while (pptr) { + if (isSubset(pptr->getKey(), word)) { + char* st = pptr->check_morph(word, len, in_compound, needflag); + if (st) { + // fogemorpheme + if ((in_compound != IN_CPD_NOT) || + !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, + pptr->getContLen()))))) { + mystrcat(result, st, MAXLNLEN); + pfx = pptr; + } + free(st); + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } + + if (*result) + return mystrdup(result); + return NULL; +} + +// check word for prefixes +char* AffixMgr::prefix_check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + char result[MAXLNLEN]; + result[0] = '\0'; + + pfx = NULL; + sfxappnd = NULL; + sfxextra = 0; + + // first handle the special case of 0 length prefixes + PfxEntry* pe = pStart[0]; + while (pe) { + char* st = pe->check_twosfx_morph(word, len, in_compound, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char*)word); + PfxEntry* pptr = pStart[sp]; + + while (pptr) { + if (isSubset(pptr->getKey(), word)) { + char* st = pptr->check_twosfx_morph(word, len, in_compound, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + pfx = pptr; + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } + + if (*result) + return mystrdup(result); + return NULL; +} + +// Is word a non compound with a REP substitution (see checkcompoundrep)? 
+int AffixMgr::cpdrep_check(const char* word, int wl) { + + if ((wl < 2) || !numrep) + return 0; + + for (int i = 0; i < numrep; i++) { + const char* r = word; + int lenp = strlen(reptable[i].pattern); + // search every occurence of the pattern in the word + while ((r = strstr(r, reptable[i].pattern)) != NULL) { + std::string candidate(word); + candidate.replace(r - word, lenp, reptable[i].pattern2); + if (candidate_check(candidate.c_str(), candidate.size())) + return 1; + r++; // search for the next letter + } + } + return 0; +} + +// forbid compoundings when there are special patterns at word bound +int AffixMgr::cpdpat_check(const char* word, + int pos, + hentry* r1, + hentry* r2, + const char /*affixed*/) { + int len; + for (int i = 0; i < numcheckcpd; i++) { + if (isSubset(checkcpdtable[i].pattern2, word + pos) && + (!r1 || !checkcpdtable[i].cond || + (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && + (!r2 || !checkcpdtable[i].cond2 || + (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && + // zero length pattern => only TESTAFF + // zero pattern (0/flag) => unmodified stem (zero affixes allowed) + (!*(checkcpdtable[i].pattern) || + ((*(checkcpdtable[i].pattern) == '0' && r1->blen <= pos && + strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) || + (*(checkcpdtable[i].pattern) != '0' && + ((len = strlen(checkcpdtable[i].pattern)) != 0) && + strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) { + return 1; + } + } + return 0; +} + +// forbid compounding with neighbouring upper and lower case characters at word +// bounds +int AffixMgr::cpdcase_check(const char* word, int pos) { + if (utf8) { + const char* p; + for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--) + ; + std::string pair(p); + std::vector<w_char> pair_u; + u8_u16(pair_u, pair); + unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0; + unsigned short b = !pair_u.empty() ? 
((pair_u[0].h << 8) + pair_u[0].l) : 0; + if (((unicodetoupper(a, langnum) == a) || + (unicodetoupper(b, langnum) == b)) && + (a != '-') && (b != '-')) + return 1; + } else { + unsigned char a = *(word + pos - 1); + unsigned char b = *(word + pos); + if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) + return 1; + } + return 0; +} + +struct metachar_data { + signed short btpp; // metacharacter (*, ?) position for backtracking + signed short btwp; // word position for metacharacters + int btnum; // number of matched characters in metacharacter +}; + +// check compound patterns +int AffixMgr::defcpd_check(hentry*** words, + short wnum, + hentry* rv, + hentry** def, + char all) { + int w = 0; + + if (!*words) { + w = 1; + *words = def; + } + + if (!*words) { + return 0; + } + + std::vector<metachar_data> btinfo(1); + + short bt = 0; + int i, j; + + (*words)[wnum] = rv; + + // has the last word COMPOUNDRULE flag? + if (rv->alen == 0) { + (*words)[wnum] = NULL; + if (w) + *words = NULL; + return 0; + } + int ok = 0; + for (i = 0; i < numdefcpd; i++) { + for (j = 0; j < defcpdtable[i].len; j++) { + if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && + TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) { + ok = 1; + break; + } + } + } + if (ok == 0) { + (*words)[wnum] = NULL; + if (w) + *words = NULL; + return 0; + } + + for (i = 0; i < numdefcpd; i++) { + signed short pp = 0; // pattern position + signed short wp = 0; // "words" position + int ok2; + ok = 1; + ok2 = 1; + do { + while ((pp < defcpdtable[i].len) && (wp <= wnum)) { + if (((pp + 1) < defcpdtable[i].len) && + ((defcpdtable[i].def[pp + 1] == '*') || + (defcpdtable[i].def[pp + 1] == '?'))) { + int wend = (defcpdtable[i].def[pp + 1] == '?') ? 
wp : wnum; + ok2 = 1; + pp += 2; + btinfo[bt].btpp = pp; + btinfo[bt].btwp = wp; + while (wp <= wend) { + if (!(*words)[wp]->alen || + !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp - 2], + (*words)[wp]->alen)) { + ok2 = 0; + break; + } + wp++; + } + if (wp <= wnum) + ok2 = 0; + btinfo[bt].btnum = wp - btinfo[bt].btwp; + if (btinfo[bt].btnum > 0) { + ++bt; + btinfo.resize(bt+1); + } + if (ok2) + break; + } else { + ok2 = 1; + if (!(*words)[wp] || !(*words)[wp]->alen || + !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], + (*words)[wp]->alen)) { + ok = 0; + break; + } + pp++; + wp++; + if ((defcpdtable[i].len == pp) && !(wp > wnum)) + ok = 0; + } + } + if (ok && ok2) { + int r = pp; + while ((defcpdtable[i].len > r) && ((r + 1) < defcpdtable[i].len) && + ((defcpdtable[i].def[r + 1] == '*') || + (defcpdtable[i].def[r + 1] == '?'))) + r += 2; + if (defcpdtable[i].len <= r) + return 1; + } + // backtrack + if (bt) + do { + ok = 1; + btinfo[bt - 1].btnum--; + pp = btinfo[bt - 1].btpp; + wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum; + } while ((btinfo[bt - 1].btnum < 0) && --bt); + } while (bt); + + if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) + return 1; + + // check zero ending + while (ok && ok2 && (defcpdtable[i].len > pp) && + ((pp + 1) < defcpdtable[i].len) && + ((defcpdtable[i].def[pp + 1] == '*') || + (defcpdtable[i].def[pp + 1] == '?'))) + pp += 2; + if (ok && ok2 && (defcpdtable[i].len <= pp)) + return 1; + } + (*words)[wnum] = NULL; + if (w) + *words = NULL; + return 0; +} + +inline int AffixMgr::candidate_check(const char* word, int len) { + struct hentry* rv = NULL; + + rv = lookup(word); + if (rv) + return 1; + + // rv = prefix_check(word,len,1); + // if (rv) return 1; + + rv = affix_check(word, len); + if (rv) + return 1; + return 0; +} + +// calculate number of syllable for compound-checking +short AffixMgr::get_syllable(const std::string& word) { + if (cpdmaxsyllable == 0) + return 0; + + short num = 0; + + if (!utf8) { 
+ for (size_t i = 0; i < word.size(); ++i) { + if (strchr(cpdvowels, word[i])) + num++; + } + } else if (cpdvowels_utf16) { + std::vector<w_char> w; + int i = u8_u16(w, word); + for (; i > 0; i--) { + if (std::binary_search(cpdvowels_utf16, + cpdvowels_utf16 + cpdvowels_utf16_len, + w[i - 1])) { + ++num; + } + } + } + return num; +} + +void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) { + if (utf8) { + int i; + for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) { + for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++) + ; + } + for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) { + for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--) + ; + } + } else { + *cmin = cpdmin; + *cmax = len - cpdmin + 1; + } +} + +// check if compound word is correctly spelled +// hu_mov_rule = spec. Hungarian rule (XXX) +struct hentry* AffixMgr::compound_check(const char* word, + int len, + short wordnum, + short numsyllable, + short maxwordnum, + short wnum, + hentry** words = NULL, + hentry** rwords = NULL, + char hu_mov_rule = 0, + char is_sug = 0, + int* info = NULL) { + int i; + short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; + struct hentry* rv = NULL; + struct hentry* rv_first; + std::string st; + char ch = '\0'; + int cmin; + int cmax; + int striple = 0; + int scpd = 0; + int soldi = 0; + int oldcmin = 0; + int oldcmax = 0; + int oldlen = 0; + int checkedstriple = 0; + int onlycpdrule; + char affixed = 0; + hentry** oldwords = words; + + int checked_prefix; + + setcminmax(&cmin, &cmax, word, len); + + st.assign(word); + + for (i = cmin; i < cmax; i++) { + // go to end of the UTF-8 character + if (utf8) { + for (; (st[i] & 0xc0) == 0x80; i++) + ; + if (i >= cmax) + return NULL; + } + + words = oldwords; + onlycpdrule = (words) ? 
1 : 0; + + do { // onlycpdrule loop + + oldnumsyllable = numsyllable; + oldwordnum = wordnum; + checked_prefix = 0; + + do { // simplified checkcompoundpattern loop + + if (scpd > 0) { + for (; scpd <= numcheckcpd && + (!checkcpdtable[scpd - 1].pattern3 || + strncmp(word + i, checkcpdtable[scpd - 1].pattern3, + strlen(checkcpdtable[scpd - 1].pattern3)) != 0); + scpd++) + ; + + if (scpd > numcheckcpd) + break; // break simplified checkcompoundpattern loop + st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern); + soldi = i; + i += strlen(checkcpdtable[scpd - 1].pattern); + st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2); + st.replace(i + strlen(checkcpdtable[scpd - 1].pattern2), std::string::npos, + word + soldi + strlen(checkcpdtable[scpd - 1].pattern3)); + + oldlen = len; + len += strlen(checkcpdtable[scpd - 1].pattern) + + strlen(checkcpdtable[scpd - 1].pattern2) - + strlen(checkcpdtable[scpd - 1].pattern3); + oldcmin = cmin; + oldcmax = cmax; + setcminmax(&cmin, &cmax, st.c_str(), len); + + cmax = len - cpdmin + 1; + } + + ch = st[i]; + st[i] = '\0'; + + sfx = NULL; + pfx = NULL; + + // FIRST WORD + + affixed = 1; + rv = lookup(st.c_str()); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && !hu_mov_rule && + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + !((compoundflag && !words && !onlycpdrule && + TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundbegin && !wordnum && !onlycpdrule && + TESTAFF(rv->astr, compoundbegin, rv->alen)) || + (compoundmiddle && wordnum && !words && !onlycpdrule && + TESTAFF(rv->astr, compoundmiddle, rv->alen)) || + (numdefcpd && onlycpdrule && + ((!words && !wordnum && + defcpd_check(&words, wnum, rv, rwords, 0)) || + (words && + defcpd_check(&words, wnum, rv, rwords, 0))))) || + (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL && + !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) { + rv = rv->next_homonym; + } + + if (rv) + affixed 
= 0; + + if (!rv) { + if (onlycpdrule) + break; + if (compoundflag && + !(rv = prefix_check(st.c_str(), i, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, + compoundflag))) { + if (((rv = suffix_check( + st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundflag, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (compoundmoresuffixes && + (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) && + !hu_mov_rule && sfx->getCont() && + ((compoundforbidflag && + TESTAFF(sfx->getCont(), compoundforbidflag, + sfx->getContLen())) || + (compoundend && + TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { + rv = NULL; + } + } + + if (rv || + (((wordnum == 0) && compoundbegin && + ((rv = suffix_check( + st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (compoundmoresuffixes && + (rv = suffix_check_twosfx( + st.c_str(), i, 0, NULL, + compoundbegin))) || // twofold suffixes + compound + (rv = prefix_check(st.c_str(), i, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, + compoundbegin)))) || + ((wordnum > 0) && compoundmiddle && + ((rv = suffix_check( + st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (compoundmoresuffixes && + (rv = suffix_check_twosfx( + st.c_str(), i, 0, NULL, + compoundmiddle))) || // twofold suffixes + compound + (rv = prefix_check(st.c_str(), i, + hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN, + compoundmiddle)))))) + checked_prefix = 1; + // else check forbiddenwords and needaffix + } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, needaffix, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + (is_sug && nosuggest && + TESTAFF(rv->astr, nosuggest, rv->alen)))) { + st[i] = ch; + // continue; + break; + } + + // check non_compound flag in suffix and prefix + if ((rv) && !hu_mov_rule && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundforbidflag, + sfx->getContLen())))) { + rv = NULL; + } + + // check compoundend flag in suffix and prefix + if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { + rv = NULL; + } + + // check compoundmiddle flag in suffix and prefix + if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle && + !hu_mov_rule && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && + (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { + return NULL; + } + + // increment word number, if the second root has a compoundroot flag + if ((rv) && compoundroot && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // first word is acceptable in compound words? 
+ if (((rv) && + (checked_prefix || (words && words[wnum]) || + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + ((oldwordnum == 0) && compoundbegin && + TESTAFF(rv->astr, compoundbegin, rv->alen)) || + ((oldwordnum > 0) && compoundmiddle && + TESTAFF(rv->astr, compoundmiddle, rv->alen)) // || + // (numdefcpd && ) + + // LANG_hu section: spec. Hungarian rule + || ((langnum == LANG_hu) && hu_mov_rule && + (TESTAFF( + rv->astr, 'F', + rv->alen) || // XXX hardwired Hungarian dictionary codes + TESTAFF(rv->astr, 'G', rv->alen) || + TESTAFF(rv->astr, 'H', rv->alen))) + // END of LANG_hu section + ) && + ( + // test CHECKCOMPOUNDPATTERN conditions + scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) && + !((checkcompoundtriple && scpd == 0 && + !words && // test triple letters + (word[i - 1] == word[i]) && + (((i > 1) && (word[i - 1] == word[i - 2])) || + ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' + )) || + (checkcompoundcase && scpd == 0 && !words && + cpdcase_check(word, i)))) + // LANG_hu section: spec. Hungarian rule + || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && + (rv = affix_check(st.c_str(), i)) && + (sfx && sfx->getCont() && + ( // XXX hardwired Hungarian dic. codes + TESTAFF(sfx->getCont(), (unsigned short)'x', + sfx->getContLen()) || + TESTAFF( + sfx->getCont(), (unsigned short)'%', + sfx->getContLen()))))) { // first word is ok condition + + // LANG_hu section: spec. 
Hungarian rule + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(st.substr(i)); + // + 1 word, if syllable number of the prefix > 1 (hungarian + // convention) + if (pfx && (get_syllable(pfx->getKey()) > 1)) + wordnum++; + } + // END of LANG_hu section + + // NEXT WORD(S) + rv_first = rv; + st[i] = ch; + + do { // striple loop + + // check simplifiedtriple + if (simplifiedtriple) { + if (striple) { + checkedstriple = 1; + i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" + } else if (i > 2 && *(word + i - 1) == *(word + i - 2)) + striple = 1; + } + + rv = lookup(st.c_str() + i); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + !((compoundflag && !words && + TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && !words && + TESTAFF(rv->astr, compoundend, rv->alen)) || + (numdefcpd && words && + defcpd_check(&words, wnum + 1, rv, NULL, 1))) || + (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL && + !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, + rv->alen)))) { + rv = rv->next_homonym; + } + + // check FORCEUCASE + if (rv && forceucase && (rv) && + (TESTAFF(rv->astr, forceucase, rv->alen)) && + !(info && *info & SPELL_ORIGCAP)) + rv = NULL; + + if (rv && words && words[wnum + 1]) + return rv_first; + + oldnumsyllable2 = numsyllable; + oldwordnum2 = wordnum; + + // LANG_hu section: spec. 
Hungarian rule, XXX hardwired dictionary + // code + if ((rv) && (langnum == LANG_hu) && + (TESTAFF(rv->astr, 'I', rv->alen)) && + !(TESTAFF(rv->astr, 'J', rv->alen))) { + numsyllable--; + } + // END of LANG_hu section + + // increment word number, if the second root has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && + (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + (is_sug && nosuggest && + TESTAFF(rv->astr, nosuggest, rv->alen)))) + return NULL; + + // second word is acceptable, as a root? + // hungarian conventions: compounding is acceptable, + // when compound forms consist of 2 words, or if more, + // then the syllable number of root words must be 6, or lesser. + + if ((rv) && + ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && + (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || + ((cpdmaxsyllable != 0) && + (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->clen)) <= + cpdmaxsyllable))) && + ( + // test CHECKCOMPOUNDPATTERN + !numcheckcpd || scpd != 0 || + !cpdpat_check(word, i, rv_first, rv, 0)) && + ((!checkcompounddup || (rv != rv_first))) + // test CHECKCOMPOUNDPATTERN conditions + && + (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) { + // forbid compound word, if it is a non compound word with typical + // fault + if (checkcompoundrep && cpdrep_check(word, len)) + return NULL; + return rv_first; + } + + numsyllable = oldnumsyllable2; + wordnum = oldwordnum2; + + // perhaps second word has prefix or/and suffix + sfx = NULL; + sfxflag = FLAG_NULL; + rv = (compoundflag && !onlycpdrule) + ? 
affix_check((word + i), strlen(word + i), compoundflag, + IN_CPD_END) + : NULL; + if (!rv && compoundend && !onlycpdrule) { + sfx = NULL; + pfx = NULL; + rv = affix_check((word + i), strlen(word + i), compoundend, + IN_CPD_END); + } + + if (!rv && numdefcpd && words) { + rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END); + if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) + return rv_first; + rv = NULL; + } + + // test CHECKCOMPOUNDPATTERN conditions (allowed forms) + if (rv && + !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) + rv = NULL; + + // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) + if (rv && numcheckcpd && scpd == 0 && + cpdpat_check(word, i, rv_first, rv, affixed)) + rv = NULL; + + // check non_compound flag in suffix and prefix + if ((rv) && ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundforbidflag, + pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundforbidflag, + sfx->getContLen())))) { + rv = NULL; + } + + // check FORCEUCASE + if (rv && forceucase && (rv) && + (TESTAFF(rv->astr, forceucase, rv->alen)) && + !(info && *info & SPELL_ORIGCAP)) + rv = NULL; + + // check forbiddenwords + if ((rv) && (rv->astr) && + (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + (is_sug && nosuggest && + TESTAFF(rv->astr, nosuggest, rv->alen)))) + return NULL; + + // pfxappnd = prefix of word+i, or NULL + // calculate syllable number of prefix. + // hungarian convention: when syllable number of prefix is more, + // than 1, the prefix+word counts as two words. + + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(word + i); + + // - affix syllable num. 
+ // XXX only second suffix (inflections, not derivations) + if (sfxappnd) { + std::string tmp(sfxappnd); + reverseword(tmp); + numsyllable -= get_syllable(tmp) + sfxextra; + } + + // + 1 word, if syllable number of the prefix > 1 (hungarian + // convention) + if (pfx && (get_syllable(pfx->getKey()) > 1)) + wordnum++; + + // increment syllable num, if last word has a SYLLABLENUM flag + // and the suffix is beginning `s' + + if (cpdsyllablenum) { + switch (sfxflag) { + case 'c': { + numsyllable += 2; + break; + } + case 'J': { + numsyllable += 1; + break; + } + case 'I': { + if (rv && TESTAFF(rv->astr, 'J', rv->alen)) + numsyllable += 1; + break; + } + } + } + } + + // increment word number, if the second word has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // second word is acceptable, as a word with prefix or/and suffix? + // hungarian conventions: compounding is acceptable, + // when compound forms consist 2 word, otherwise + // the syllable number of root words is 6, or lesser. 
+ if ((rv) && + (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || + ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && + ((!checkcompounddup || (rv != rv_first)))) { + // forbid compound word, if it is a non compound word with typical + // fault + if (checkcompoundrep && cpdrep_check(word, len)) + return NULL; + return rv_first; + } + + numsyllable = oldnumsyllable2; + wordnum = oldwordnum2; + + // perhaps second word is a compound word (recursive call) + if (wordnum < maxwordnum) { + rv = compound_check(st.c_str() + i, strlen(st.c_str() + i), wordnum + 1, + numsyllable, maxwordnum, wnum + 1, words, rwords, 0, + is_sug, info); + + if (rv && numcheckcpd && + ((scpd == 0 && + cpdpat_check(word, i, rv_first, rv, affixed)) || + (scpd != 0 && + !cpdpat_check(word, i, rv_first, rv, affixed)))) + rv = NULL; + } else { + rv = NULL; + } + if (rv) { + // forbid compound word, if it is a non compound word with typical + // fault + if (checkcompoundrep || forbiddenword) { + struct hentry* rv2 = NULL; + + if (checkcompoundrep && cpdrep_check(word, len)) + return NULL; + + // check first part + if (strncmp(rv->word, word + i, rv->blen) == 0) { + char r = st[i + rv->blen]; + st[i + rv->blen] = '\0'; + + if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) { + st[ + i + rv->blen] = r; + continue; + } + + if (forbiddenword) { + rv2 = lookup(word); + if (!rv2) + rv2 = affix_check(word, len); + if (rv2 && rv2->astr && + TESTAFF(rv2->astr, forbiddenword, rv2->alen) && + (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) { + return NULL; + } + } + st[i + rv->blen] = r; + } + } + return rv_first; + } + } while (striple && !checkedstriple); // end of striple loop + + if (checkedstriple) { + i++; + checkedstriple = 0; + striple = 0; + } + + } // first word is ok condition + + if (soldi != 0) { + i = soldi; + soldi = 0; + len = oldlen; + cmin = oldcmin; + cmax = oldcmax; + } + scpd++; + + } while (!onlycpdrule && simplifiedcpd && + scpd <= numcheckcpd); // end 
of simplifiedcpd loop + + scpd = 0; + wordnum = oldwordnum; + numsyllable = oldnumsyllable; + + if (soldi != 0) { + i = soldi; + st.assign(word); // XXX add more optim. + soldi = 0; + } else + st[i] = ch; + + } while (numdefcpd && oldwordnum == 0 && + onlycpdrule++ < 1); // end of onlycpd loop + } + + return NULL; +} + +// check if compound word is correctly spelled +// hu_mov_rule = spec. Hungarian rule (XXX) +int AffixMgr::compound_check_morph(const char* word, + int len, + short wordnum, + short numsyllable, + short maxwordnum, + short wnum, + hentry** words, + hentry** rwords, + char hu_mov_rule = 0, + char** result = NULL, + char* partresult = NULL) { + int i; + short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; + int ok = 0; + + struct hentry* rv = NULL; + struct hentry* rv_first; + std::string st; + char ch; + + int checked_prefix; + char presult[MAXLNLEN]; + + int cmin; + int cmax; + + int onlycpdrule; + char affixed = 0; + hentry** oldwords = words; + + setcminmax(&cmin, &cmax, word, len); + + st.assign(word); + + for (i = cmin; i < cmax; i++) { + // go to end of the UTF-8 character + if (utf8) { + for (; (st[i] & 0xc0) == 0x80; i++) + ; + if (i >= cmax) + return 0; + } + + words = oldwords; + onlycpdrule = (words) ? 
1 : 0; + + do { // onlycpdrule loop + + oldnumsyllable = numsyllable; + oldwordnum = wordnum; + checked_prefix = 0; + + ch = st[i]; + st[i] = '\0'; + sfx = NULL; + + // FIRST WORD + + affixed = 1; + + *presult = '\0'; + if (partresult) + mystrcat(presult, partresult, MAXLNLEN); + + rv = lookup(st.c_str()); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && !hu_mov_rule && + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + !((compoundflag && !words && !onlycpdrule && + TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundbegin && !wordnum && !onlycpdrule && + TESTAFF(rv->astr, compoundbegin, rv->alen)) || + (compoundmiddle && wordnum && !words && !onlycpdrule && + TESTAFF(rv->astr, compoundmiddle, rv->alen)) || + (numdefcpd && onlycpdrule && + ((!words && !wordnum && + defcpd_check(&words, wnum, rv, rwords, 0)) || + (words && + defcpd_check(&words, wnum, rv, rwords, 0))))))) { + rv = rv->next_homonym; + } + + if (rv) + affixed = 0; + + if (rv) { + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st.c_str()); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, + st.c_str()); + } + // store the pointer of the hash entry + // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, + // MORPH_HENTRY, rv); + if (HENTRY_DATA(rv)) { + sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, + HENTRY_DATA2(rv)); + } + } + + if (!rv) { + if (onlycpdrule && strlen(*result) > MAXLNLEN / 10) + break; + if (compoundflag && + !(rv = + prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, + compoundflag))) { + if (((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, + compoundflag, + hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN)) || + (compoundmoresuffixes && + (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) && + !hu_mov_rule && sfx->getCont() && + ((compoundforbidflag && + TESTAFF(sfx->getCont(), compoundforbidflag, + sfx->getContLen())) || + (compoundend && + TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { + rv = NULL; + } + } + + if (rv || + (((wordnum == 0) && compoundbegin && + ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, + compoundbegin, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (compoundmoresuffixes && + (rv = suffix_check_twosfx( + st.c_str(), i, 0, NULL, + compoundbegin))) || // twofold suffix+compound + (rv = prefix_check(st.c_str(), i, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, + compoundbegin)))) || + ((wordnum > 0) && compoundmiddle && + ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, + compoundmiddle, + hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (compoundmoresuffixes && + (rv = suffix_check_twosfx( + st.c_str(), i, 0, NULL, + compoundmiddle))) || // twofold suffix+compound + (rv = prefix_check(st.c_str(), i, + hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN, + compoundmiddle)))))) { + // char * p = prefix_check_morph(st, i, 0, compound); + char* p = NULL; + if (compoundflag) + p = affix_check_morph(st.c_str(), i, compoundflag); + if (!p || (*p == '\0')) { + if (p) + free(p); + p = NULL; + if ((wordnum == 0) && compoundbegin) { + p = affix_check_morph(st.c_str(), i, compoundbegin); + } else if ((wordnum > 0) && compoundmiddle) { + p = affix_check_morph(st.c_str(), i, compoundmiddle); + } + } + if (p && (*p != '\0')) { + sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, MORPH_PART, + st.c_str(), line_uniq_app(&p, MSEP_REC)); + } + if (p) + free(p); + checked_prefix = 1; + } + // else check forbiddenwords + } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + TESTAFF(rv->astr, needaffix, rv->alen))) { + st[i] = ch; + continue; + } + + // check non_compound flag in suffix and prefix + if ((rv) && !hu_mov_rule && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) { + continue; + } + + // check compoundend flag in suffix and prefix + if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { + continue; + } + + // check compoundmiddle flag in suffix and prefix + if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle && + !hu_mov_rule && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) + 
continue; + + // increment word number, if the second root has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // first word is acceptable in compound words? + if (((rv) && + (checked_prefix || (words && words[wnum]) || + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + ((oldwordnum == 0) && compoundbegin && + TESTAFF(rv->astr, compoundbegin, rv->alen)) || + ((oldwordnum > 0) && compoundmiddle && + TESTAFF(rv->astr, compoundmiddle, rv->alen)) + // LANG_hu section: spec. Hungarian rule + || ((langnum == LANG_hu) && // hu_mov_rule + hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) || + TESTAFF(rv->astr, 'G', rv->alen) || + TESTAFF(rv->astr, 'H', rv->alen))) + // END of LANG_hu section + ) && + !((checkcompoundtriple && !words && // test triple letters + (word[i - 1] == word[i]) && + (((i > 1) && (word[i - 1] == word[i - 2])) || + ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' + )) || + ( + // test CHECKCOMPOUNDPATTERN + numcheckcpd && !words && + cpdpat_check(word, i, rv, NULL, affixed)) || + (checkcompoundcase && !words && cpdcase_check(word, i)))) + // LANG_hu section: spec. Hungarian rule + || + ((!rv) && (langnum == LANG_hu) && hu_mov_rule && + (rv = affix_check(st.c_str(), i)) && + (sfx && sfx->getCont() && + (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) || + TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen())))) + // END of LANG_hu section + ) { + // LANG_hu section: spec. 
Hungarian rule + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(st.substr(i)); + + // + 1 word, if syllable number of the prefix > 1 (hungarian + // convention) + if (pfx && (get_syllable(pfx->getKey()) > 1)) + wordnum++; + } + // END of LANG_hu section + + // NEXT WORD(S) + rv_first = rv; + rv = lookup((word + i)); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + !((compoundflag && !words && + TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && !words && + TESTAFF(rv->astr, compoundend, rv->alen)) || + (numdefcpd && words && + defcpd_check(&words, wnum + 1, rv, NULL, 1))))) { + rv = rv->next_homonym; + } + + if (rv && words && words[wnum + 1]) { + mystrcat(*result, presult, MAXLNLEN); + mystrcat(*result, " ", MAXLNLEN); + mystrcat(*result, MORPH_PART, MAXLNLEN); + mystrcat(*result, word + i, MAXLNLEN); + if (complexprefixes && HENTRY_DATA(rv)) + mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(*result, " ", MAXLNLEN); + mystrcat(*result, MORPH_STEM, MAXLNLEN); + mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); + } + // store the pointer of the hash entry + // sprintf(*result + strlen(*result), " %s%p", + // MORPH_HENTRY, rv); + if (!complexprefixes && HENTRY_DATA(rv)) { + mystrcat(*result, " ", MAXLNLEN); + mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + } + mystrcat(*result, "\n", MAXLNLEN); + return 0; + } + + oldnumsyllable2 = numsyllable; + oldwordnum2 = wordnum; + + // LANG_hu section: spec. 
Hungarian rule + if ((rv) && (langnum == LANG_hu) && + (TESTAFF(rv->astr, 'I', rv->alen)) && + !(TESTAFF(rv->astr, 'J', rv->alen))) { + numsyllable--; + } + // END of LANG_hu section + // increment word number, if the second root has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && + (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) { + st[i] = ch; + continue; + } + + // second word is acceptable, as a root? + // hungarian conventions: compounding is acceptable, + // when compound forms consist of 2 words, or if more, + // then the syllable number of root words must be 6, or lesser. + if ((rv) && + ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && + (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || + ((cpdmaxsyllable != 0) && + (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <= + cpdmaxsyllable))) && + ((!checkcompounddup || (rv != rv_first)))) { + // bad compound word + mystrcat(*result, presult, MAXLNLEN); + mystrcat(*result, " ", MAXLNLEN); + mystrcat(*result, MORPH_PART, MAXLNLEN); + mystrcat(*result, word + i, MAXLNLEN); + + if (HENTRY_DATA(rv)) { + if (complexprefixes) + mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(*result, " ", MAXLNLEN); + mystrcat(*result, MORPH_STEM, MAXLNLEN); + mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); + } + // store the pointer of the hash entry + // sprintf(*result + strlen(*result), " + // %s%p", MORPH_HENTRY, rv); + if (!complexprefixes) { + mystrcat(*result, " ", MAXLNLEN); + mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + } + } + mystrcat(*result, "\n", MAXLNLEN); + ok = 1; + } + + numsyllable = oldnumsyllable2; + wordnum = oldwordnum2; + + // perhaps second word has prefix or/and suffix + sfx = NULL; + 
sfxflag = FLAG_NULL; + + if (compoundflag && !onlycpdrule) + rv = affix_check((word + i), strlen(word + i), compoundflag); + else + rv = NULL; + + if (!rv && compoundend && !onlycpdrule) { + sfx = NULL; + pfx = NULL; + rv = affix_check((word + i), strlen(word + i), compoundend); + } + + if (!rv && numdefcpd && words) { + rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END); + if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { + char* m = NULL; + if (compoundflag) + m = affix_check_morph((word + i), strlen(word + i), compoundflag); + if ((!m || *m == '\0') && compoundend) { + if (m) + free(m); + m = affix_check_morph((word + i), strlen(word + i), compoundend); + } + mystrcat(*result, presult, MAXLNLEN); + if (m || (*m != '\0')) { + char m2[MAXLNLEN]; + sprintf(m2, "%c%s%s%s", MSEP_FLD, MORPH_PART, word + i, + line_uniq_app(&m, MSEP_REC)); + mystrcat(*result, m2, MAXLNLEN); + } + if (m) + free(m); + mystrcat(*result, "\n", MAXLNLEN); + ok = 1; + } + } + + // check non_compound flag in suffix and prefix + if ((rv) && + ((pfx && pfx->getCont() && + TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || + (sfx && sfx->getCont() && + TESTAFF(sfx->getCont(), compoundforbidflag, + sfx->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && + (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) && + (!TESTAFF(rv->astr, needaffix, rv->alen))) { + st[i] = ch; + continue; + } + + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(word + i); + + // - affix syllable num. 
+ // XXX only second suffix (inflections, not derivations) + if (sfxappnd) { + std::string tmp(sfxappnd); + reverseword(tmp); + numsyllable -= get_syllable(tmp) + sfxextra; + } + + // + 1 word, if syllable number of the prefix > 1 (hungarian + // convention) + if (pfx && (get_syllable(pfx->getKey()) > 1)) + wordnum++; + + // increment syllable num, if last word has a SYLLABLENUM flag + // and the suffix is beginning `s' + + if (cpdsyllablenum) { + switch (sfxflag) { + case 'c': { + numsyllable += 2; + break; + } + case 'J': { + numsyllable += 1; + break; + } + case 'I': { + if (rv && TESTAFF(rv->astr, 'J', rv->alen)) + numsyllable += 1; + break; + } + } + } + } + + // increment word number, if the second word has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + // second word is acceptable, as a word with prefix or/and suffix? + // hungarian conventions: compounding is acceptable, + // when compound forms consist 2 word, otherwise + // the syllable number of root words is 6, or lesser. 
// Check `word` (length `len`) against the suffix entries and return the
// dictionary entry produced by the first suffix whose checkword() succeeds,
// or NULL when none does.
//
//   sfxopts, ppfx, wlst, maxSug, ns, needflag
//               are forwarded unchanged to SfxEntry::checkword().
//   ppfx        optional prefix entry; its continuation flags take part in
//               the circumfix / needaffix tests below.
//   cclass      when non-zero, zero-length suffixes without continuation
//               flags are skipped and the needaffix test is bypassed.
//   in_compound compound position (compared with IN_CPD_BEGIN / IN_CPD_END);
//               when zero, `onlyincompound` is passed to checkword() as its
//               last argument, otherwise 0 is passed.
//
// Side effects: on success the members sfx, and in the general case also
// sfxflag, sfxappnd and sfxextra, are overwritten (see the BUG notes).

struct hentry* AffixMgr::suffix_check(const char* word,
                                      int len,
                                      int sfxopts,
                                      PfxEntry* ppfx,
                                      char** wlst,
                                      int maxSug,
                                      int* ns,
                                      const FLAG cclass,
                                      const FLAG needflag,
                                      char in_compound) {
  struct hentry* rv = NULL;
  PfxEntry* ep = ppfx;

  // first handle the special case of 0 length suffixes
  // (entries chained from sStart[0] through getNext())
  SfxEntry* se = sStart[0];

  while (se) {
    if (!cclass || se->getCont()) {
      // suffixes are not allowed in beginning of compounds
      if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
           // except when signed with compoundpermitflag flag
           (se->getCont() && compoundpermitflag &&
            TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
          (!circumfix ||
           // no circumfix flag in prefix and suffix
           ((!ppfx || !(ep->getCont()) ||
             !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
            (!se->getCont() ||
             !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
           // circumfix flag in prefix AND suffix
           ((ppfx && (ep->getCont()) &&
             TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
            (se->getCont() &&
             (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
          // fogemorpheme
          (in_compound ||
           !(se->getCont() &&
             (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
          // needaffix on prefix or first suffix
          (cclass ||
           !(se->getCont() &&
             TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
           (ppfx &&
            !((ep->getCont()) &&
              TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
        rv = se->checkword(word, len, sfxopts, ppfx, wlst, maxSug, ns,
                           (FLAG)cclass, needflag,
                           (in_compound ? 0 : onlyincompound));
        if (rv) {
          // only `sfx` is recorded here; sfxflag / sfxappnd are left untouched
          sfx = se;  // BUG: sfx not stateless
          return rv;
        }
      }
    }
    se = se->getNext();
  }

  // now handle the general case
  if (len == 0)
    return NULL;  // FULLSTRIP
  // candidate entries are indexed by the last byte of the word
  unsigned char sp = *((const unsigned char*)(word + len - 1));
  SfxEntry* sptr = sStart[sp];

  while (sptr) {
    // key matches the end of the word: test it, then follow the "equal"
    // link; otherwise follow the "not equal" link
    if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
      // suffixes are not allowed in beginning of compounds
      if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
           // except when signed with compoundpermitflag flag
           (sptr->getCont() && compoundpermitflag &&
            TESTAFF(sptr->getCont(), compoundpermitflag,
                    sptr->getContLen()))) &&
          (!circumfix ||
           // no circumfix flag in prefix and suffix
           ((!ppfx || !(ep->getCont()) ||
             !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
            (!sptr->getCont() ||
             !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
           // circumfix flag in prefix AND suffix
           ((ppfx && (ep->getCont()) &&
             TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
            (sptr->getCont() &&
             (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
          // fogemorpheme
          (in_compound ||
           !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
                                          sptr->getContLen()))))) &&
          // needaffix on prefix or first suffix
          (cclass ||
           !(sptr->getCont() &&
             TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
           (ppfx &&
            !((ep->getCont()) &&
              TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
        // at the end of a compound an onlyincompound suffix is accepted only
        // together with a prefix
        if (in_compound != IN_CPD_END || ppfx ||
            !(sptr->getCont() &&
              TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
          rv = sptr->checkword(word, len, sfxopts, ppfx, wlst, maxSug, ns,
                               cclass, needflag,
                               (in_compound ? 0 : onlyincompound));
          if (rv) {
            sfx = sptr;                 // BUG: sfx not stateless
            sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
            if (!sptr->getCont())
              sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
            // LANG_hu section: spec. Hungarian rule
            // (key starts with 'i' and is not followed by 'y' or 't')
            else if (langnum == LANG_hu && sptr->getKeyLen() &&
                     sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
                     sptr->getKey()[1] != 't') {
              sfxextra = 1;
            }
            // END of LANG_hu section
            return rv;
          }
        }
      sptr = sptr->getNextEQ();
    } else {
      sptr = sptr->getNextNE();
    }
  }

  return NULL;
}
// Collect the morphological descriptions of every suffixed analysis of
// `word` (length `len`).
//
// For each suffix entry that passes the flag predicate and whose checkword()
// succeeds, one newline-terminated record is appended to a MAXLNLEN buffer
// per homonym (walked with get_next_homonym()): the prefix description (or
// its debug flag), the entry's stem and data fields, then the suffix
// description (or its debug flag).
//
// Returns a mystrdup() copy of the buffer, or NULL when nothing was
// collected.
// NOTE(review): when len == 0 the function returns NULL even if the
// zero-length pass already appended records to `result` -- confirm that
// this is intended.
char* AffixMgr::suffix_check_morph(const char* word,
                                   int len,
                                   int sfxopts,
                                   PfxEntry* ppfx,
                                   const FLAG cclass,
                                   const FLAG needflag,
                                   char in_compound) {
  char result[MAXLNLEN];

  struct hentry* rv = NULL;

  result[0] = '\0';

  PfxEntry* ep = ppfx;

  // first handle the special case of 0 length suffixes
  SfxEntry* se = sStart[0];
  while (se) {
    if (!cclass || se->getCont()) {
      // suffixes are not allowed in beginning of compounds
      if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
            // except when signed with compoundpermitflag flag
            (se->getCont() && compoundpermitflag &&
             TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
           (!circumfix ||
            // no circumfix flag in prefix and suffix
            ((!ppfx || !(ep->getCont()) ||
              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
             (!se->getCont() ||
              !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
            // circumfix flag in prefix AND suffix
            ((ppfx && (ep->getCont()) &&
              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
             (se->getCont() &&
              (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
           // fogemorpheme
           (in_compound ||
            !((se->getCont() &&
               (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
           // needaffix on prefix or first suffix
           (cclass ||
            !(se->getCont() &&
              TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
            (ppfx &&
             !((ep->getCont()) &&
               TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
        rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass,
                           needflag);
      // rv is NULL here unless the checkword() call above ran and succeeded;
      // the loop always leaves it NULL again
      while (rv) {
        if (ppfx) {
          if (ppfx->getMorph()) {
            mystrcat(result, ppfx->getMorph(), MAXLNLEN);
            mystrcat(result, " ", MAXLNLEN);
          } else
            debugflag(result, ppfx->getFlag());
        }
        // with complexprefixes the entry data goes before the stem field,
        // otherwise after it
        if (complexprefixes && HENTRY_DATA(rv))
          mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
          mystrcat(result, " ", MAXLNLEN);
          mystrcat(result, MORPH_STEM, MAXLNLEN);
          mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
        }
        // store the pointer of the hash entry
        //            sprintf(result + strlen(result), " %s%p", MORPH_HENTRY,
        //            rv);

        if (!complexprefixes && HENTRY_DATA(rv)) {
          mystrcat(result, " ", MAXLNLEN);
          mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
        }
        if (se->getMorph()) {
          mystrcat(result, " ", MAXLNLEN);
          mystrcat(result, se->getMorph(), MAXLNLEN);
        } else
          debugflag(result, se->getFlag());
        mystrcat(result, "\n", MAXLNLEN);
        rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
      }
    }
    se = se->getNext();
  }

  // now handle the general case
  if (len == 0)
    return NULL;  // FULLSTRIP
  // candidate entries are indexed by the last byte of the word
  unsigned char sp = *((const unsigned char*)(word + len - 1));
  SfxEntry* sptr = sStart[sp];

  while (sptr) {
    if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
      // suffixes are not allowed in beginning of compounds
      if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
            // except when signed with compoundpermitflag flag
            (sptr->getCont() && compoundpermitflag &&
             TESTAFF(sptr->getCont(), compoundpermitflag,
                     sptr->getContLen()))) &&
           (!circumfix ||
            // no circumfix flag in prefix and suffix
            ((!ppfx || !(ep->getCont()) ||
              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
             (!sptr->getCont() ||
              !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
            // circumfix flag in prefix AND suffix
            ((ppfx && (ep->getCont()) &&
              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
             (sptr->getCont() &&
              (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
           // fogemorpheme
           (in_compound ||
            !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
                                           sptr->getContLen()))))) &&
           // needaffix on first suffix
           // NOTE(review): unlike the zero-length pass above, this test has
           // no "prefix without needaffix" alternative -- confirm.
           (cclass ||
            !(sptr->getCont() &&
              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
        rv = sptr->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass,
                             needflag);
      while (rv) {
        if (ppfx) {
          if (ppfx->getMorph()) {
            mystrcat(result, ppfx->getMorph(), MAXLNLEN);
            mystrcat(result, " ", MAXLNLEN);
          } else
            debugflag(result, ppfx->getFlag());
        }
        if (complexprefixes && HENTRY_DATA(rv))
          mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
          mystrcat(result, " ", MAXLNLEN);
          mystrcat(result, MORPH_STEM, MAXLNLEN);
          mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
        }
        // store the pointer of the hash entry
        //                    sprintf(result + strlen(result), " %s%p",
        //                    MORPH_HENTRY, rv);

        if (!complexprefixes && HENTRY_DATA(rv)) {
          mystrcat(result, " ", MAXLNLEN);
          mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
        }

        if (sptr->getMorph()) {
          mystrcat(result, " ", MAXLNLEN);
          mystrcat(result, sptr->getMorph(), MAXLNLEN);
        } else
          debugflag(result, sptr->getFlag());
        mystrcat(result, "\n", MAXLNLEN);
        rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
      }
      sptr = sptr->getNextEQ();
    } else {
      sptr = sptr->getNextNE();
    }
  }

  if (*result)
    return mystrdup(result);
  return NULL;
}
+char* AffixMgr::morphgen(const char* ts, + int wl, + const unsigned short* ap, + unsigned short al, + const char* morph, + const char* targetmorph, + int level) { + // handle suffixes + if (!morph) + return NULL; + + // check substandard flag + if (TESTAFF(ap, substandard, al)) + return NULL; + + if (morphcmp(morph, targetmorph) == 0) + return mystrdup(ts); + + size_t stemmorphcatpos; + std::string mymorph; + + // use input suffix fields, if exist + if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { + mymorph.assign(morph); + mymorph.append(" "); + stemmorphcatpos = mymorph.size(); + } else { + stemmorphcatpos = std::string::npos; + } + + for (int i = 0; i < al; i++) { + const unsigned char c = (unsigned char)(ap[i] & 0x00FF); + SfxEntry* sptr = sFlag[c]; + while (sptr) { + if (sptr->getFlag() == ap[i] && sptr->getMorph() && + ((sptr->getContLen() == 0) || + // don't generate forms with substandard affixes + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { + const char* stemmorph; + if (stemmorphcatpos != std::string::npos) { + mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph()); + stemmorph = mymorph.c_str(); + } else { + stemmorph = sptr->getMorph(); + } + + int cmp = morphcmp(stemmorph, targetmorph); + + if (cmp == 0) { + char* newword = sptr->add(ts, wl); + if (newword) { + hentry* check = pHMgr->lookup(newword); // XXX extra dic + if (!check || !check->astr || + !(TESTAFF(check->astr, forbiddenword, check->alen) || + TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) { + return newword; + } + free(newword); + } + } + + // recursive call for secondary suffixes + if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && + // (get_sfxcount(stemmorph) < targetcount) && + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { + char* newword = sptr->add(ts, wl); + if (newword) { + char* newword2 = + morphgen(newword, strlen(newword), sptr->getCont(), + sptr->getContLen(), stemmorph, targetmorph, 1); + 
+ if (newword2) { + free(newword); + return newword2; + } + free(newword); + newword = NULL; + } + } + } + sptr = sptr->getFlgNxt(); + } + } + return NULL; +} + +int AffixMgr::expand_rootword(struct guessword* wlst, + int maxn, + const char* ts, + int wl, + const unsigned short* ap, + unsigned short al, + const char* bad, + int badl, + const char* phon) { + int nh = 0; + // first add root word to list + if ((nh < maxn) && + !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || + (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { + wlst[nh].word = mystrdup(ts); + if (!wlst[nh].word) + return 0; + wlst[nh].allow = (1 == 0); + wlst[nh].orig = NULL; + nh++; + // add special phonetic version + if (phon && (nh < maxn)) { + wlst[nh].word = mystrdup(phon); + if (!wlst[nh].word) + return nh - 1; + wlst[nh].allow = (1 == 0); + wlst[nh].orig = mystrdup(ts); + if (!wlst[nh].orig) + return nh - 1; + nh++; + } + } + + // handle suffixes + for (int i = 0; i < al; i++) { + const unsigned char c = (unsigned char)(ap[i] & 0x00FF); + SfxEntry* sptr = sFlag[c]; + while (sptr) { + if ((sptr->getFlag() == ap[i]) && + (!sptr->getKeyLen() || + ((badl > sptr->getKeyLen()) && + (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && + // check needaffix flag + !(sptr->getCont() && + ((needaffix && + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || + (circumfix && + TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || + (onlyincompound && + TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) { + char* newword = sptr->add(ts, wl); + if (newword) { + if (nh < maxn) { + wlst[nh].word = newword; + wlst[nh].allow = sptr->allowCross(); + wlst[nh].orig = NULL; + nh++; + // add special phonetic version + if (phon && (nh < maxn)) { + std::string prefix(phon); + std::string key(sptr->getKey()); + reverseword(key); + prefix.append(key); + wlst[nh].word = mystrdup(prefix.c_str()); + if (!wlst[nh].word) + return nh - 1; + wlst[nh].allow = (1 == 0); + 
wlst[nh].orig = mystrdup(newword); + if (!wlst[nh].orig) + return nh - 1; + nh++; + } + } else { + free(newword); + } + } + } + sptr = sptr->getFlgNxt(); + } + } + + int n = nh; + + // handle cross products of prefixes and suffixes + for (int j = 1; j < n; j++) + if (wlst[j].allow) { + for (int k = 0; k < al; k++) { + const unsigned char c = (unsigned char)(ap[k] & 0x00FF); + PfxEntry* cptr = pFlag[c]; + while (cptr) { + if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && + (!cptr->getKeyLen() || + ((badl > cptr->getKeyLen()) && + (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { + int l1 = strlen(wlst[j].word); + char* newword = cptr->add(wlst[j].word, l1); + if (newword) { + if (nh < maxn) { + wlst[nh].word = newword; + wlst[nh].allow = cptr->allowCross(); + wlst[nh].orig = NULL; + nh++; + } else { + free(newword); + } + } + } + cptr = cptr->getFlgNxt(); + } + } + } + + // now handle pure prefixes + for (int m = 0; m < al; m++) { + const unsigned char c = (unsigned char)(ap[m] & 0x00FF); + PfxEntry* ptr = pFlag[c]; + while (ptr) { + if ((ptr->getFlag() == ap[m]) && + (!ptr->getKeyLen() || + ((badl > ptr->getKeyLen()) && + (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && + // check needaffix flag + !(ptr->getCont() && + ((needaffix && + TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || + (circumfix && + TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || + (onlyincompound && + TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) { + char* newword = ptr->add(ts, wl); + if (newword) { + if (nh < maxn) { + wlst[nh].word = newword; + wlst[nh].allow = ptr->allowCross(); + wlst[nh].orig = NULL; + nh++; + } else { + free(newword); + } + } + } + ptr = ptr->getFlgNxt(); + } + } + + return nh; +} + +// return length of replacing table +int AffixMgr::get_numrep() const { + return numrep; +} + +// return replacing table +struct replentry* AffixMgr::get_reptable() const { + if (!reptable) + return NULL; + return reptable; +} + +// 
return iconv table +RepList* AffixMgr::get_iconvtable() const { + if (!iconvtable) + return NULL; + return iconvtable; +} + +// return oconv table +RepList* AffixMgr::get_oconvtable() const { + if (!oconvtable) + return NULL; + return oconvtable; +} + +// return replacing table +struct phonetable* AffixMgr::get_phonetable() const { + if (!phone) + return NULL; + return phone; +} + +// return length of character map table +int AffixMgr::get_nummap() const { + return nummap; +} + +// return character map table +struct mapentry* AffixMgr::get_maptable() const { + if (!maptable) + return NULL; + return maptable; +} + +// return length of word break table +int AffixMgr::get_numbreak() const { + return numbreak; +} + +// return character map table +char** AffixMgr::get_breaktable() const { + if (!breaktable) + return NULL; + return breaktable; +} + +// return text encoding of dictionary +char* AffixMgr::get_encoding() { + if (!encoding) + encoding = mystrdup(SPELL_ENCODING); + return mystrdup(encoding); +} + +// return text encoding of dictionary +int AffixMgr::get_langnum() const { + return langnum; +} + +// return double prefix option +int AffixMgr::get_complexprefixes() const { + return complexprefixes; +} + +// return FULLSTRIP option +int AffixMgr::get_fullstrip() const { + return fullstrip; +} + +FLAG AffixMgr::get_keepcase() const { + return keepcase; +} + +FLAG AffixMgr::get_forceucase() const { + return forceucase; +} + +FLAG AffixMgr::get_warn() const { + return warn; +} + +int AffixMgr::get_forbidwarn() const { + return forbidwarn; +} + +int AffixMgr::get_checksharps() const { + return checksharps; +} + +char* AffixMgr::encode_flag(unsigned short aflag) const { + return pHMgr->encode_flag(aflag); +} + +// return the preferred ignore string for suggestions +char* AffixMgr::get_ignore() const { + if (!ignorechars) + return NULL; + return ignorechars; +} + +// return the preferred ignore string for suggestions +const std::vector<w_char>& 
AffixMgr::get_ignore_utf16() const { + return ignorechars_utf16; +} + +// return the keyboard string for suggestions +char* AffixMgr::get_key_string() { + if (!keystring) + keystring = mystrdup(SPELL_KEYSTRING); + return mystrdup(keystring); +} + +// return the preferred try string for suggestions +char* AffixMgr::get_try_string() const { + if (!trystring) + return NULL; + return mystrdup(trystring); +} + +// return the preferred try string for suggestions +const char* AffixMgr::get_wordchars() const { + return wordchars; +} + +const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const { + return wordchars_utf16; +} + +// is there compounding? +int AffixMgr::get_compound() const { + return compoundflag || compoundbegin || numdefcpd; +} + +// return the compound words control flag +FLAG AffixMgr::get_compoundflag() const { + return compoundflag; +} + +// return the forbidden words control flag +FLAG AffixMgr::get_forbiddenword() const { + return forbiddenword; +} + +// return the forbidden words control flag +FLAG AffixMgr::get_nosuggest() const { + return nosuggest; +} + +// return the forbidden words control flag +FLAG AffixMgr::get_nongramsuggest() const { + return nongramsuggest; +} + +// return the forbidden words flag modify flag +FLAG AffixMgr::get_needaffix() const { + return needaffix; +} + +// return the onlyincompound flag +FLAG AffixMgr::get_onlyincompound() const { + return onlyincompound; +} + +// return the compound word signal flag +FLAG AffixMgr::get_compoundroot() const { + return compoundroot; +} + +// return the compound begin signal flag +FLAG AffixMgr::get_compoundbegin() const { + return compoundbegin; +} + +// return the value of checknum +int AffixMgr::get_checknum() const { + return checknum; +} + +// return the value of prefix +const char* AffixMgr::get_prefix() const { + if (pfx) + return pfx->getKey(); + return NULL; +} + +// return the value of suffix +const char* AffixMgr::get_suffix() const { + return sfxappnd; +} + +// return 
the value of suffix +const char* AffixMgr::get_version() const { + return version; +} + +// return lemma_present flag +FLAG AffixMgr::get_lemma_present() const { + return lemma_present; +} + +// utility method to look up root words in hash table +struct hentry* AffixMgr::lookup(const char* word) { + int i; + struct hentry* he = NULL; + for (i = 0; i < *maxdic && !he; i++) { + he = (alldic[i])->lookup(word); + } + return he; +} + +// return the value of suffix +int AffixMgr::have_contclass() const { + return havecontclass; +} + +// return utf8 +int AffixMgr::get_utf8() const { + return utf8; +} + +int AffixMgr::get_maxngramsugs(void) const { + return maxngramsugs; +} + +int AffixMgr::get_maxcpdsugs(void) const { + return maxcpdsugs; +} + +int AffixMgr::get_maxdiff(void) const { + return maxdiff; +} + +int AffixMgr::get_onlymaxdiff(void) const { + return onlymaxdiff; +} + +// return nosplitsugs +int AffixMgr::get_nosplitsugs(void) const { + return nosplitsugs; +} + +// return sugswithdots +int AffixMgr::get_sugswithdots(void) const { + return sugswithdots; +} + +/* parse flag */ +int AffixMgr::parse_flag(char* line, unsigned short* out, FileMgr* af) { + char* s = NULL; + if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { + HUNSPELL_WARNING( + stderr, + "error: line %d: multiple definitions of an affix file parameter\n", + af->getlinenum()); + return 1; + } + if (parse_string(line, &s, af->getlinenum())) + return 1; + *out = pHMgr->decode_flag(s); + free(s); + return 0; +} + +/* parse num */ +int AffixMgr::parse_num(char* line, int* out, FileMgr* af) { + char* s = NULL; + if (*out != -1) { + HUNSPELL_WARNING( + stderr, + "error: line %d: multiple definitions of an affix file parameter\n", + af->getlinenum()); + return 1; + } + if (parse_string(line, &s, af->getlinenum())) + return 1; + *out = atoi(s); + free(s); + return 0; +} + +/* parse in the max syllablecount of compound words and */ +int AffixMgr::parse_cpdsyllable(char* line, FileMgr* af) { + char* tp = line; + 
char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + cpdmaxsyllable = atoi(piece); + np++; + break; + } + case 2: { + if (!utf8) { + cpdvowels = mystrdup(piece); + } else { + std::vector<w_char> w; + u8_u16(w, piece); + if (!w.empty()) { + std::sort(w.begin(), w.end()); + cpdvowels_utf16 = (w_char*)malloc(w.size() * sizeof(w_char)); + if (!cpdvowels_utf16) + return 1; + memcpy(cpdvowels_utf16, &w[0], w.size()); + } + cpdvowels_utf16_len = w.size(); + } + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np < 2) { + HUNSPELL_WARNING(stderr, + "error: line %d: missing compoundsyllable information\n", + af->getlinenum()); + return 1; + } + if (np == 2) + cpdvowels = mystrdup("aeiouAEIOU"); + return 0; +} + +/* parse in the typical fault correcting table */ +int AffixMgr::parse_reptable(char* line, FileMgr* af) { + if (numrep != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrep = atoi(piece); + if (numrep < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return 1; + } + reptable = (replentry*)malloc(numrep * sizeof(struct replentry)); + if (!reptable) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the numrep lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < numrep; j++) { + if ((nl = af->getline()) == NULL) + return 1; + mychomp(nl); + tp = nl; + i = 0; + reptable[j].pattern = NULL; + 
reptable[j].pattern2 = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "REP", 3) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numrep = 0; + return 1; + } + break; + } + case 1: { + if (*piece == '^') + reptable[j].start = true; + else + reptable[j].start = false; + reptable[j].pattern = + mystrrep(mystrdup(piece + int(reptable[j].start)), "_", " "); + int lr = strlen(reptable[j].pattern) - 1; + if (reptable[j].pattern[lr] == '$') { + reptable[j].end = true; + reptable[j].pattern[lr] = '\0'; + } else + reptable[j].end = false; + break; + } + case 2: { + reptable[j].pattern2 = mystrrep(mystrdup(piece), "_", " "); + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numrep = 0; + return 1; + } + } + return 0; +} + +/* parse in the typical fault correcting table */ +int AffixMgr::parse_convtable(char* line, + FileMgr* af, + RepList** rl, + const char* keyword) { + if (*rl) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + int numrl = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrl = atoi(piece); + if (numrl < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return 1; + } + *rl = new RepList(numrl); + if (!*rl) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the num lines to read in the remainder of the table */ + char* nl; + 
for (int j = 0; j < numrl; j++) { + if (!(nl = af->getline())) + return 1; + mychomp(nl); + tp = nl; + i = 0; + char* pattern = NULL; + char* pattern2 = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, keyword, strlen(keyword)) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + delete *rl; + *rl = NULL; + return 1; + } + break; + } + case 1: { + pattern = mystrrep(mystrdup(piece), "_", " "); + break; + } + case 2: { + pattern2 = mystrrep(mystrdup(piece), "_", " "); + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!pattern || !pattern2) { + if (pattern) + free(pattern); + if (pattern2) + free(pattern2); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return 1; + } + (*rl)->add(pattern, pattern2); + } + return 0; +} + +/* parse in the typical fault correcting table */ +int AffixMgr::parse_phonetable(char* line, FileMgr* af) { + if (phone) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + phone = (phonetable*)malloc(sizeof(struct phonetable)); + if (!phone) + return 1; + phone->num = atoi(piece); + phone->rules = NULL; + phone->utf8 = (char)utf8; + if (phone->num < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return 1; + } + phone->rules = (char**)malloc(2 * (phone->num + 1) * sizeof(char*)); + if (!phone->rules) { + free(phone); + phone = NULL; + return 1; + } + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now 
parse the phone->num lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < phone->num; j++) { + if (!(nl = af->getline())) + return 1; + mychomp(nl); + tp = nl; + i = 0; + phone->rules[j * 2] = NULL; + phone->rules[j * 2 + 1] = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "PHONE", 5) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + phone->num = 0; + return 1; + } + break; + } + case 1: { + phone->rules[j * 2] = mystrrep(mystrdup(piece), "_", ""); + break; + } + case 2: { + phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece), "_", ""); + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + phone->num = 0; + return 1; + } + } + phone->rules[phone->num * 2] = mystrdup(""); + phone->rules[phone->num * 2 + 1] = mystrdup(""); + init_phonet_hash(*phone); + return 0; +} + +/* parse in the checkcompoundpattern table */ +int AffixMgr::parse_checkcpdtable(char* line, FileMgr* af) { + if (numcheckcpd != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numcheckcpd = atoi(piece); + if (numcheckcpd < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return 1; + } + checkcpdtable = + (patentry*)malloc(numcheckcpd * sizeof(struct patentry)); + if (!checkcpdtable) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); 
+ return 1; + } + + /* now parse the numcheckcpd lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < numcheckcpd; j++) { + if (!(nl = af->getline())) + return 1; + mychomp(nl); + tp = nl; + i = 0; + checkcpdtable[j].pattern = NULL; + checkcpdtable[j].pattern2 = NULL; + checkcpdtable[j].pattern3 = NULL; + checkcpdtable[j].cond = FLAG_NULL; + checkcpdtable[j].cond2 = FLAG_NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "CHECKCOMPOUNDPATTERN", 20) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numcheckcpd = 0; + return 1; + } + break; + } + case 1: { + checkcpdtable[j].pattern = mystrdup(piece); + char* p = strchr(checkcpdtable[j].pattern, '/'); + if (p) { + *p = '\0'; + checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); + } + break; + } + case 2: { + checkcpdtable[j].pattern2 = mystrdup(piece); + char* p = strchr(checkcpdtable[j].pattern2, '/'); + if (p) { + *p = '\0'; + checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); + } + break; + } + case 3: { + checkcpdtable[j].pattern3 = mystrdup(piece); + simplifiedcpd = 1; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numcheckcpd = 0; + return 1; + } + } + return 0; +} + +/* parse in the compound rule table */ +int AffixMgr::parse_defcpdtable(char* line, FileMgr* af) { + if (numdefcpd != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numdefcpd = atoi(piece); + if (numdefcpd < 1) { + HUNSPELL_WARNING(stderr, "error: line 
%d: bad entry number\n", + af->getlinenum()); + return 1; + } + defcpdtable = (flagentry*)malloc(numdefcpd * sizeof(flagentry)); + if (!defcpdtable) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the numdefcpd lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < numdefcpd; j++) { + if (!(nl = af->getline())) + return 1; + mychomp(nl); + tp = nl; + i = 0; + defcpdtable[j].def = NULL; + defcpdtable[j].len = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numdefcpd = 0; + return 1; + } + break; + } + case 1: { // handle parenthesized flags + if (strchr(piece, '(')) { + defcpdtable[j].def = (FLAG*)malloc(strlen(piece) * sizeof(FLAG)); + defcpdtable[j].len = 0; + int end = 0; + FLAG* conv; + while (!end) { + char* par = piece + 1; + while (*par != '(' && *par != ')' && *par != '\0') + par++; + if (*par == '\0') + end = 1; + else + *par = '\0'; + if (*piece == '(') + piece++; + if (*piece == '*' || *piece == '?') { + defcpdtable[j].def[defcpdtable[j].len++] = (FLAG)*piece; + } else if (*piece != '\0') { + int l = pHMgr->decode_flags(&conv, piece, af); + for (int k = 0; k < l; k++) + defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; + free(conv); + } + piece = par + 1; + } + } else { + defcpdtable[j].len = + pHMgr->decode_flags(&(defcpdtable[j].def), piece, af); + } + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!defcpdtable[j].len) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numdefcpd = 0; + return 1; + } + } + return 0; +} + +/* parse in the character map table */ +int 
AffixMgr::parse_maptable(char* line, FileMgr* af) { + if (nummap != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + nummap = atoi(piece); + if (nummap < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return 1; + } + maptable = (mapentry*)malloc(nummap * sizeof(struct mapentry)); + if (!maptable) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the nummap lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < nummap; j++) { + if (!(nl = af->getline())) + return 1; + mychomp(nl); + tp = nl; + i = 0; + maptable[j].set = NULL; + maptable[j].len = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "MAP", 3) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + nummap = 0; + return 1; + } + break; + } + case 1: { + int setn = 0; + maptable[j].len = strlen(piece); + maptable[j].set = (char**)malloc(maptable[j].len * sizeof(char*)); + if (!maptable[j].set) + return 1; + for (int k = 0; k < maptable[j].len; k++) { + int chl = 1; + int chb = k; + if (piece[k] == '(') { + char* parpos = strchr(piece + k, ')'); + if (parpos != NULL) { + chb = k + 1; + chl = (int)(parpos - piece) - k - 1; + k = k + chl + 1; + } + } else { + if (utf8 && (piece[k] & 0xc0) == 0xc0) { + for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++) + ; + chl = k - chb; + k--; + } + } + maptable[j].set[setn] = (char*)malloc(chl + 1); + if (!maptable[j].set[setn]) + return 1; + 
strncpy(maptable[j].set[setn], piece + chb, chl); + maptable[j].set[setn][chl] = '\0'; + setn++; + } + maptable[j].len = setn; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!maptable[j].set || !maptable[j].len) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + nummap = 0; + return 1; + } + } + return 0; +} + +/* parse in the word breakpoint table */ +int AffixMgr::parse_breaktable(char* line, FileMgr* af) { + if (numbreak > -1) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numbreak = atoi(piece); + if (numbreak < 0) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return 1; + } + if (numbreak == 0) + return 0; + breaktable = (char**)malloc(numbreak * sizeof(char*)); + if (!breaktable) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the numbreak lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < numbreak; j++) { + if (!(nl = af->getline())) + return 1; + mychomp(nl); + tp = nl; + i = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "BREAK", 5) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numbreak = 0; + return 1; + } + break; + } + case 1: { + breaktable[j] = mystrdup(piece); + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!breaktable) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + 
numbreak = 0; + return 1; + } + } + return 0; +} + +void AffixMgr::reverse_condition(std::string& piece) { + if (piece.empty()) + return; + + int neg = 0; + for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) { + switch (*k) { + case '[': { + if (neg) + *(k - 1) = '['; + else + *k = ']'; + break; + } + case ']': { + *k = '['; + if (neg) + *(k - 1) = '^'; + neg = 0; + break; + } + case '^': { + if (*(k - 1) == ']') + neg = 1; + else + *(k - 1) = *k; + break; + } + default: { + if (neg) + *(k - 1) = *k; + } + } + } +} + +int AffixMgr::parse_affix(char* line, + const char at, + FileMgr* af, + char* dupflags) { + int numents = 0; // number of affentry structures to parse + + unsigned short aflag = 0; // affix char identifier + + char ff = 0; + std::vector<affentry> affentries; + + char* tp = line; + char* nl = line; + char* piece; + int i = 0; + +// checking lines with bad syntax +#ifdef DEBUG + int basefieldnum = 0; +#endif + + // split affix header line into pieces + + int np = 0; + + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + // piece 1 - is type of affix + case 0: { + np++; + break; + } + + // piece 2 - is affix char + case 1: { + np++; + aflag = pHMgr->decode_flag(piece); + if (((at == 'S') && (dupflags[aflag] & dupSFX)) || + ((at == 'P') && (dupflags[aflag] & dupPFX))) { + HUNSPELL_WARNING( + stderr, + "error: line %d: multiple definitions of an affix flag\n", + af->getlinenum()); + // return 1; XXX permissive mode for bad dictionaries + } + dupflags[aflag] += (char)((at == 'S') ? 
dupSFX : dupPFX); + break; + } + // piece 3 - is cross product indicator + case 2: { + np++; + if (*piece == 'Y') + ff = aeXPRODUCT; + break; + } + + // piece 4 - is number of affentries + case 3: { + np++; + numents = atoi(piece); + if ((numents <= 0) || ((std::numeric_limits<size_t>::max() / + sizeof(struct affentry)) < static_cast<size_t>(numents))) { + char* err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + free(err); + } + return 1; + } + affentries.resize(numents); + affentries[0].opts = ff; + if (utf8) + affentries[0].opts += aeUTF8; + if (pHMgr->is_aliasf()) + affentries[0].opts += aeALIASF; + if (pHMgr->is_aliasm()) + affentries[0].opts += aeALIASM; + affentries[0].aflag = aflag; + } + + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + // check to make sure we parsed enough pieces + if (np != 4) { + char* err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + free(err); + } + return 1; + } + + // now parse numents affentries for this affix + std::vector<affentry>::iterator start = affentries.begin(); + std::vector<affentry>::iterator end = affentries.end(); + for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { + if ((nl = af->getline()) == NULL) + return 1; + mychomp(nl); + tp = nl; + i = 0; + np = 0; + + // split line into pieces + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + // piece 1 - is type + case 0: { + np++; + if (entry != start) + entry->opts = start->opts & + (char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); + break; + } + + // piece 2 - is affix char + case 1: { + np++; + if (pHMgr->decode_flag(piece) != aflag) { + char* err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, + "error: line %d: affix %s is corrupt\n", + af->getlinenum(), err); + free(err); + } + return 1; + } + + if (entry 
!= start) + entry->aflag = start->aflag; + break; + } + + // piece 3 - is string to strip or 0 for null + case 2: { + np++; + entry->strip = piece; + if (complexprefixes) { + if (utf8) + reverseword_utf(entry->strip); + else + reverseword(entry->strip); + } + if (entry->strip.compare("0") == 0) { + entry->strip.clear(); + } + break; + } + + // piece 4 - is affix string or 0 for null + case 3: { + char* dash; + entry->morphcode = NULL; + entry->contclass = NULL; + entry->contclasslen = 0; + np++; + dash = strchr(piece, '/'); + if (dash) { + *dash = '\0'; + + entry->appnd = piece; + + if (ignorechars) { + if (utf8) { + remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); + } else { + remove_ignored_chars(entry->appnd, ignorechars); + } + } + + if (complexprefixes) { + if (utf8) + reverseword_utf(entry->appnd); + else + reverseword(entry->appnd); + } + + if (pHMgr->is_aliasf()) { + int index = atoi(dash + 1); + entry->contclasslen = (unsigned short)pHMgr->get_aliasf( + index, &(entry->contclass), af); + if (!entry->contclasslen) + HUNSPELL_WARNING(stderr, + "error: bad affix flag alias: \"%s\"\n", + dash + 1); + } else { + entry->contclasslen = (unsigned short)pHMgr->decode_flags( + &(entry->contclass), dash + 1, af); + std::sort(entry->contclass, entry->contclass + entry->contclasslen); + } + *dash = '/'; + + havecontclass = 1; + for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { + contclasses[(entry->contclass)[_i]] = 1; + } + } else { + entry->appnd = piece; + + if (ignorechars) { + if (utf8) { + remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); + } else { + remove_ignored_chars(entry->appnd, ignorechars); + } + } + + if (complexprefixes) { + if (utf8) + reverseword_utf(entry->appnd); + else + reverseword(entry->appnd); + } + } + + if (entry->appnd.compare("0") == 0) { + entry->appnd.clear(); + } + break; + } + + // piece 5 - is the conditions descriptions + case 4: { + std::string chunk(piece); + np++; + if (complexprefixes) { + if (utf8) 
+ reverseword_utf(chunk); + else + reverseword(chunk); + reverse_condition(chunk); + } + if (!entry->strip.empty() && chunk != "." && + redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(), + af->getlinenum())) + chunk = "."; + if (at == 'S') { + reverseword(chunk); + reverse_condition(chunk); + } + if (encodeit(*entry, chunk.c_str())) + return 1; + break; + } + + case 5: { + std::string chunk(piece); + np++; + if (pHMgr->is_aliasm()) { + int index = atoi(chunk.c_str()); + entry->morphcode = pHMgr->get_aliasm(index); + } else { + if (complexprefixes) { // XXX - fix me for morph. gen. + if (utf8) + reverseword_utf(chunk); + else + reverseword(chunk); + } + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + chunk.push_back(' '); + chunk.append(tp); + } + entry->morphcode = mystrdup(chunk.c_str()); + if (!entry->morphcode) + return 1; + } + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + // check to make sure we parsed enough pieces + if (np < 4) { + char* err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", + af->getlinenum(), err); + free(err); + } + return 1; + } + +#ifdef DEBUG + // detect unnecessary fields, excepting comments + if (basefieldnum) { + int fieldnum = + !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6); + if (fieldnum != basefieldnum) + HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", + af->getlinenum()); + } else { + basefieldnum = + !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 
5 : 6); + } +#endif + } + + // now create SfxEntry or PfxEntry objects and use links to + // build an ordered (sorted by affix string) list + for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { + if (at == 'P') { + PfxEntry* pfxptr = new PfxEntry(this, &(*entry)); + build_pfxtree(pfxptr); + } else { + SfxEntry* sfxptr = new SfxEntry(this, &(*entry)); + build_sfxtree(sfxptr); + } + } + return 0; +} + +int AffixMgr::redundant_condition(char ft, + const char* strip, + int stripl, + const char* cond, + int linenum) { + int condl = strlen(cond); + int i; + int j; + int neg; + int in; + if (ft == 'P') { // prefix + if (strncmp(strip, cond, condl) == 0) + return 1; + if (utf8) { + } else { + for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { + if (cond[j] != '[') { + if (cond[j] != strip[i]) { + HUNSPELL_WARNING(stderr, + "warning: line %d: incompatible stripping " + "characters and condition\n", + linenum); + return 0; + } + } else { + neg = (cond[j + 1] == '^') ? 
1 : 0; + in = 0; + do { + j++; + if (strip[i] == cond[j]) + in = 1; + } while ((j < (condl - 1)) && (cond[j] != ']')); + if (j == (condl - 1) && (cond[j] != ']')) { + HUNSPELL_WARNING(stderr, + "error: line %d: missing ] in condition:\n%s\n", + linenum, cond); + return 0; + } + if ((!neg && !in) || (neg && in)) { + HUNSPELL_WARNING(stderr, + "warning: line %d: incompatible stripping " + "characters and condition\n", + linenum); + return 0; + } + } + } + if (j >= condl) + return 1; + } + } else { // suffix + if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) + return 1; + if (utf8) { + } else { + for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { + if (cond[j] != ']') { + if (cond[j] != strip[i]) { + HUNSPELL_WARNING(stderr, + "warning: line %d: incompatible stripping " + "characters and condition\n", + linenum); + return 0; + } + } else { + in = 0; + do { + j--; + if (strip[i] == cond[j]) + in = 1; + } while ((j > 0) && (cond[j] != '[')); + if ((j == 0) && (cond[j] != '[')) { + HUNSPELL_WARNING(stderr, + "error: line: %d: missing ] in condition:\n%s\n", + linenum, cond); + return 0; + } + neg = (cond[j + 1] == '^') ? 
1 : 0; + if ((!neg && !in) || (neg && in)) { + HUNSPELL_WARNING(stderr, + "warning: line %d: incompatible stripping " + "characters and condition\n", + linenum); + return 0; + } + } + } + if (j < 0) + return 1; + } + } + return 0; +} + +int AffixMgr::get_suffix_words(short unsigned* suff, + int len, + const char* root_word, + char** slst) { + int suff_words_cnt = 0; + short unsigned* start_ptr = suff; + for (int j = 0; j < SETSIZE; j++) { + SfxEntry* ptr = sStart[j]; + while (ptr) { + suff = start_ptr; + for (int i = 0; i < len; i++) { + if ((*suff) == ptr->getFlag()) { + std::string nw(root_word); + nw.append(ptr->getAffix()); + hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, NULL, 0, + NULL, 0, 0, 0); + if (ht) { + slst[suff_words_cnt++] = mystrdup(nw.c_str()); + } + } + suff++; + } + ptr = ptr->getNext(); + } + } + return suff_words_cnt; +} diff --git a/libs/hunspell/src/affixmgr.hxx b/libs/hunspell/src/affixmgr.hxx new file mode 100644 index 000000000..d70e85338 --- /dev/null +++ b/libs/hunspell/src/affixmgr.hxx @@ -0,0 +1,390 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AFFIXMGR_HXX_ +#define _AFFIXMGR_HXX_ + +#include "hunvisapi.h" + +#include <stdio.h> + +#include <string> + +#include "atypes.hxx" +#include "baseaffix.hxx" +#include "hashmgr.hxx" +#include "phonet.hxx" +#include "replist.hxx" + +// check flag duplication +#define dupSFX (1 << 0) +#define dupPFX (1 << 1) + +class PfxEntry; +class SfxEntry; + +class LIBHUNSPELL_DLL_EXPORTED AffixMgr { + PfxEntry* pStart[SETSIZE]; + SfxEntry* sStart[SETSIZE]; + PfxEntry* pFlag[SETSIZE]; + SfxEntry* sFlag[SETSIZE]; + HashMgr* pHMgr; + HashMgr** alldic; + int* maxdic; + char* keystring; + char* trystring; + char* encoding; + struct cs_info* csconv; + int utf8; + int complexprefixes; + FLAG compoundflag; + FLAG compoundbegin; + FLAG compoundmiddle; + FLAG compoundend; + FLAG compoundroot; + FLAG compoundforbidflag; + FLAG compoundpermitflag; + int compoundmoresuffixes; + int checkcompounddup; + int checkcompoundrep; + int checkcompoundcase; + int checkcompoundtriple; + int simplifiedtriple; + FLAG 
forbiddenword; + FLAG nosuggest; + FLAG nongramsuggest; + FLAG needaffix; + int cpdmin; + int numrep; + replentry* reptable; + RepList* iconvtable; + RepList* oconvtable; + int nummap; + mapentry* maptable; + int numbreak; + char** breaktable; + int numcheckcpd; + patentry* checkcpdtable; + int simplifiedcpd; + int numdefcpd; + flagentry* defcpdtable; + phonetable* phone; + int maxngramsugs; + int maxcpdsugs; + int maxdiff; + int onlymaxdiff; + int nosplitsugs; + int sugswithdots; + int cpdwordmax; + int cpdmaxsyllable; + char* cpdvowels; + w_char* cpdvowels_utf16; + int cpdvowels_utf16_len; + char* cpdsyllablenum; + const char* pfxappnd; // BUG: not stateless + const char* sfxappnd; // BUG: not stateless + int sfxextra; // BUG: not stateless + FLAG sfxflag; // BUG: not stateless + char* derived; // BUG: not stateless + SfxEntry* sfx; // BUG: not stateless + PfxEntry* pfx; // BUG: not stateless + int checknum; + char* wordchars; + std::vector<w_char> wordchars_utf16; + char* ignorechars; + std::vector<w_char> ignorechars_utf16; + char* version; + char* lang; + int langnum; + FLAG lemma_present; + FLAG circumfix; + FLAG onlyincompound; + FLAG keepcase; + FLAG forceucase; + FLAG warn; + int forbidwarn; + FLAG substandard; + int checksharps; + int fullstrip; + + int havecontclass; // boolean variable + char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold + // affix) + + public: + AffixMgr(const char* affpath, HashMgr** ptr, int* md, const char* key = NULL); + ~AffixMgr(); + struct hentry* affix_check(const char* word, + int len, + const unsigned short needflag = (unsigned short)0, + char in_compound = IN_CPD_NOT); + struct hentry* prefix_check(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + inline int isSubset(const char* s1, const char* s2); + struct hentry* prefix_check_twosfx(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + inline int isRevSubset(const char* s1, 
const char* end_of_s2, int len); + struct hentry* suffix_check(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + char** wlst, + int maxSug, + int* ns, + const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); + struct hentry* suffix_check_twosfx(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + const FLAG needflag = FLAG_NULL); + + char* affix_check_morph(const char* word, + int len, + const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); + char* prefix_check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + char* suffix_check_morph(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); + + char* prefix_check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + char* suffix_check_twosfx_morph(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + const FLAG needflag = FLAG_NULL); + + char* morphgen(const char* ts, + int wl, + const unsigned short* ap, + unsigned short al, + const char* morph, + const char* targetmorph, + int level); + + int expand_rootword(struct guessword* wlst, + int maxn, + const char* ts, + int wl, + const unsigned short* ap, + unsigned short al, + const char* bad, + int, + const char*); + + short get_syllable(const std::string& word); + int cpdrep_check(const char* word, int len); + int cpdpat_check(const char* word, + int len, + hentry* r1, + hentry* r2, + const char affixed); + int defcpd_check(hentry*** words, + short wnum, + hentry* rv, + hentry** rwords, + char all); + int cpdcase_check(const char* word, int len); + inline int candidate_check(const char* word, int len); + void setcminmax(int* cmin, int* cmax, const char* word, int len); + struct hentry* compound_check(const char* word, + int len, + short wordnum, + short numsyllable, + short 
maxwordnum, + short wnum, + hentry** words, + hentry** rwords, + char hu_mov_rule, + char is_sug, + int* info); + + int compound_check_morph(const char* word, + int len, + short wordnum, + short numsyllable, + short maxwordnum, + short wnum, + hentry** words, + hentry** rwords, + char hu_mov_rule, + char** result, + char* partresult); + + int get_suffix_words(short unsigned* suff, + int len, + const char* root_word, + char** slst); + + struct hentry* lookup(const char* word); + int get_numrep() const; + struct replentry* get_reptable() const; + RepList* get_iconvtable() const; + RepList* get_oconvtable() const; + struct phonetable* get_phonetable() const; + int get_nummap() const; + struct mapentry* get_maptable() const; + int get_numbreak() const; + char** get_breaktable() const; + char* get_encoding(); + int get_langnum() const; + char* get_key_string(); + char* get_try_string() const; + const char* get_wordchars() const; + const std::vector<w_char>& get_wordchars_utf16() const; + char* get_ignore() const; + const std::vector<w_char>& get_ignore_utf16() const; + int get_compound() const; + FLAG get_compoundflag() const; + FLAG get_compoundbegin() const; + FLAG get_forbiddenword() const; + FLAG get_nosuggest() const; + FLAG get_nongramsuggest() const; + FLAG get_needaffix() const; + FLAG get_onlyincompound() const; + FLAG get_compoundroot() const; + FLAG get_lemma_present() const; + int get_checknum() const; + const char* get_prefix() const; + const char* get_suffix() const; + const char* get_derived() const; + const char* get_version() const; + int have_contclass() const; + int get_utf8() const; + int get_complexprefixes() const; + char* get_suffixed(char) const; + int get_maxngramsugs() const; + int get_maxcpdsugs() const; + int get_maxdiff() const; + int get_onlymaxdiff() const; + int get_nosplitsugs() const; + int get_sugswithdots(void) const; + FLAG get_keepcase(void) const; + FLAG get_forceucase(void) const; + FLAG get_warn(void) const; + int 
get_forbidwarn(void) const; + int get_checksharps(void) const; + char* encode_flag(unsigned short aflag) const; + int get_fullstrip() const; + + private: + int parse_file(const char* affpath, const char* key); + int parse_flag(char* line, unsigned short* out, FileMgr* af); + int parse_num(char* line, int* out, FileMgr* af); + int parse_cpdsyllable(char* line, FileMgr* af); + int parse_reptable(char* line, FileMgr* af); + int parse_convtable(char* line, + FileMgr* af, + RepList** rl, + const char* keyword); + int parse_phonetable(char* line, FileMgr* af); + int parse_maptable(char* line, FileMgr* af); + int parse_breaktable(char* line, FileMgr* af); + int parse_checkcpdtable(char* line, FileMgr* af); + int parse_defcpdtable(char* line, FileMgr* af); + int parse_affix(char* line, const char at, FileMgr* af, char* dupflags); + + void reverse_condition(std::string&); + void debugflag(char* result, unsigned short flag); + std::string& debugflag(std::string& result, unsigned short flag); + int condlen(const char*); + int encodeit(affentry& entry, const char* cs); + int build_pfxtree(PfxEntry* pfxptr); + int build_sfxtree(SfxEntry* sfxptr); + int process_pfx_order(); + int process_sfx_order(); + PfxEntry* process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr); + SfxEntry* process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr); + int process_pfx_tree_to_list(); + int process_sfx_tree_to_list(); + int redundant_condition(char, const char* strip, int stripl, const char* cond, int); + void finishFileMgr(FileMgr* afflst); +}; + +#endif diff --git a/libs/hunspell/src/atypes.hxx b/libs/hunspell/src/atypes.hxx new file mode 100644 index 000000000..60826af20 --- /dev/null +++ b/libs/hunspell/src/atypes.hxx @@ -0,0 +1,145 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#ifndef _ATYPES_HXX_ +#define _ATYPES_HXX_ + +#ifndef HUNSPELL_WARNING +#include <stdio.h> +#ifdef HUNSPELL_WARNING_ON +#define HUNSPELL_WARNING fprintf +#else +// empty inline function to switch off warnings (instead of the C99 standard +// variadic macros) +static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {} +#endif +#endif + +// HUNSTEM def. +#define HUNSTEM + +#include "hashmgr.hxx" +#include "w_char.hxx" +#include <algorithm> +#include <string> + +#define SETSIZE 256 +#define CONTSIZE 65536 + +// affentry options +#define aeXPRODUCT (1 << 0) +#define aeUTF8 (1 << 1) +#define aeALIASF (1 << 2) +#define aeALIASM (1 << 3) +#define aeLONGCOND (1 << 4) + +// compound options +#define IN_CPD_NOT 0 +#define IN_CPD_BEGIN 1 +#define IN_CPD_END 2 +#define IN_CPD_OTHER 3 + +// info options +#define SPELL_COMPOUND (1 << 0) +#define SPELL_FORBIDDEN (1 << 1) +#define SPELL_ALLCAP (1 << 2) +#define SPELL_NOCAP (1 << 3) +#define SPELL_INITCAP (1 << 4) +#define SPELL_ORIGCAP (1 << 5) +#define SPELL_WARN (1 << 6) + +#define MAXLNLEN 8192 + +#define MINCPDLEN 3 +#define MAXCOMPOUND 10 +#define MAXCONDLEN 20 +#define MAXCONDLEN_1 (MAXCONDLEN - sizeof(char*)) + +#define MAXACC 1000 + +#define FLAG unsigned short +#define FLAG_NULL 0x00 +#define FREE_FLAG(a) a = 0 + +#define TESTAFF(a, b, c) (std::binary_search(a, a + c, b)) + +struct affentry { + std::string strip; + std::string appnd; + char numconds; + char opts; + unsigned short aflag; + unsigned short* contclass; + short contclasslen; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char* conds2; + } l; + } c; + char* morphcode; +}; + +struct guessword { + char* word; + bool allow; + char* orig; +}; + +struct mapentry { + char** set; + int len; +}; + +struct flagentry { + FLAG* def; + int len; +}; + +struct patentry { + char* pattern; + char* pattern2; + char* pattern3; + FLAG cond; + FLAG cond2; +}; + +#endif diff --git 
a/libs/hunspell/src/baseaffix.hxx b/libs/hunspell/src/baseaffix.hxx new file mode 100644 index 000000000..59256e92f --- /dev/null +++ b/libs/hunspell/src/baseaffix.hxx @@ -0,0 +1,77 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#ifndef _BASEAFF_HXX_
#define _BASEAFF_HXX_

#include "hunvisapi.h"
#include <string>

// Base class holding the data members of an affix entry.
// It can only be constructed from a derived class (protected constructor)
// and cannot be copied: the copy constructor and copy assignment operator
// are declared private and are not defined in this header.
// NOTE(review): MAXCONDLEN and MAXCONDLEN_1 are not defined by the includes
// above -- presumably atypes.hxx has to be included first; confirm.
class LIBHUNSPELL_DLL_EXPORTED AffEntry {
 private:
  // Declared but not defined here: copying is disabled.
  AffEntry(const AffEntry&);
  AffEntry& operator=(const AffEntry&);

 protected:
  // Zero-initialises the scalar and pointer members. The two strings are
  // default-constructed (empty); the condition union `c` is left
  // uninitialised.
  AffEntry()
      : numconds(0),
        opts(0),
        aflag(0),
        morphcode(0),
        contclass(NULL),
        contclasslen(0) {}
  std::string appnd;
  std::string strip;
  unsigned char numconds;
  char opts;
  unsigned short aflag;
  // Condition storage: MAXCONDLEN inline bytes, or MAXCONDLEN_1 inline bytes
  // followed by a pointer (conds2) sharing the same memory.
  union {
    char conds[MAXCONDLEN];
    struct {
      char conds1[MAXCONDLEN_1];
      char* conds2;
    } l;
  } c;
  char* morphcode;
  unsigned short* contclass;
  short contclasslen;
};

#endif
diff --git a/libs/hunspell/src/csutil.cxx b/libs/hunspell/src/csutil.cxx
new file mode 100644
index 000000000..1948e4a3b
--- /dev/null
+++ b/libs/hunspell/src/csutil.cxx
@@ -0,0 +1,2850 @@
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
+ * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <algorithm> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include "csutil.hxx" +#include "atypes.hxx" +#include "langnum.hxx" + +// Unicode character encoding information +struct unicode_info { + unsigned short c; + unsigned short cupper; + unsigned short clower; +}; + +#ifdef _WIN32 +#include <windows.h> +#include <wchar.h> +#endif + +#ifdef OPENOFFICEORG +#include <unicode/uchar.h> +#else +#ifndef MOZILLA_CLIENT +#include "utf_info.cxx" +#define UTF_LST_LEN (sizeof(utf_lst) / (sizeof(unicode_info))) +#endif +#endif + +#ifdef MOZILLA_CLIENT +#include "nsCOMPtr.h" +#include "nsIUnicodeEncoder.h" +#include "nsIUnicodeDecoder.h" +#include "nsUnicharUtils.h" +#include "mozilla/dom/EncodingUtils.h" + +using mozilla::dom::EncodingUtils; +#endif + +struct unicode_info2 { + char cletter; + unsigned short cupper; + unsigned short clower; +}; + +static struct unicode_info2* utf_tbl = NULL; +static int utf_tbl_count = + 0; // utf_tbl can be used by multiple Hunspell instances + +FILE* myfopen(const char* path, const char* mode) { +#ifdef _WIN32 +#define WIN32_LONG_PATH_PREFIX "\\\\?\\" + if (strncmp(path, WIN32_LONG_PATH_PREFIX, 4) == 0) { + int len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0); + wchar_t* buff = (wchar_t*)malloc(len * sizeof(wchar_t)); + wchar_t* buff2 = (wchar_t*)malloc(len * sizeof(wchar_t)); + FILE* f = NULL; + if (buff && buff2) { + MultiByteToWideChar(CP_UTF8, 0, path, -1, buff, len); + if (_wfullpath(buff2, buff, len) != NULL) { + f = _wfopen(buff2, (strcmp(mode, "r") == 0) ? L"r" : L"rb"); + } + free(buff); + free(buff2); + } + return f; + } +#endif + return fopen(path, mode); +} + +std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) { + dest.clear(); + std::vector<w_char>::const_iterator u2 = src.begin(); + std::vector<w_char>::const_iterator u2_max = src.end(); + while (u2 < u2_max) { + signed char u8; + if (u2->h) { // > 0xFF + // XXX 4-byte haven't implemented yet. 
+ if (u2->h >= 0x08) { // >= 0x800 (3-byte UTF-8 character) + u8 = 0xe0 + (u2->h >> 4); + dest.push_back(u8); + u8 = 0x80 + ((u2->h & 0xf) << 2) + (u2->l >> 6); + dest.push_back(u8); + u8 = 0x80 + (u2->l & 0x3f); + dest.push_back(u8); + } else { // < 0x800 (2-byte UTF-8 character) + u8 = 0xc0 + (u2->h << 2) + (u2->l >> 6); + dest.push_back(u8); + u8 = 0x80 + (u2->l & 0x3f); + dest.push_back(u8); + } + } else { // <= 0xFF + if (u2->l & 0x80) { // >0x80 (2-byte UTF-8 character) + u8 = 0xc0 + (u2->l >> 6); + dest.push_back(u8); + u8 = 0x80 + (u2->l & 0x3f); + dest.push_back(u8); + } else { // < 0x80 (1-byte UTF-8 character) + u8 = u2->l; + dest.push_back(u8); + } + } + ++u2; + } + return dest; +} + +int u8_u16(std::vector<w_char>& dest, const std::string& src) { + dest.clear(); + std::string::const_iterator u8 = src.begin(); + std::string::const_iterator u8_max = src.end(); + + while (u8 < u8_max) { + w_char u2; + switch ((*u8) & 0xf0) { + case 0x00: + case 0x10: + case 0x20: + case 0x30: + case 0x40: + case 0x50: + case 0x60: + case 0x70: { + u2.h = 0; + u2.l = *u8; + break; + } + case 0x80: + case 0x90: + case 0xa0: + case 0xb0: { + HUNSPELL_WARNING(stderr, + "UTF-8 encoding error. Unexpected continuation bytes " + "in %ld. character position\n%s\n", + static_cast<long>(std::distance(src.begin(), u8)), + src.c_str()); + u2.h = 0xff; + u2.l = 0xfd; + break; + } + case 0xc0: + case 0xd0: { // 2-byte UTF-8 codes + if ((*(u8 + 1) & 0xc0) == 0x80) { + u2.h = (*u8 & 0x1f) >> 2; + u2.l = (*u8 << 6) + (*(u8 + 1) & 0x3f); + ++u8; + } else { + HUNSPELL_WARNING(stderr, + "UTF-8 encoding error. Missing continuation byte in " + "%ld. 
character position:\n%s\n", + static_cast<long>(std::distance(src.begin(), u8)), + src.c_str()); + u2.h = 0xff; + u2.l = 0xfd; + } + break; + } + case 0xe0: { // 3-byte UTF-8 codes + if ((*(u8 + 1) & 0xc0) == 0x80) { + u2.h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2); + ++u8; + if ((*(u8 + 1) & 0xc0) == 0x80) { + u2.l = (static_cast<unsigned char>(*u8) << 6) + (*(u8 + 1) & 0x3f); + ++u8; + } else { + HUNSPELL_WARNING(stderr, + "UTF-8 encoding error. Missing continuation byte " + "in %ld. character position:\n%s\n", + static_cast<long>(std::distance(src.begin(), u8)), + src.c_str()); + u2.h = 0xff; + u2.l = 0xfd; + } + } else { + HUNSPELL_WARNING(stderr, + "UTF-8 encoding error. Missing continuation byte in " + "%ld. character position:\n%s\n", + static_cast<long>(std::distance(src.begin(), u8)), + src.c_str()); + u2.h = 0xff; + u2.l = 0xfd; + } + break; + } + case 0xf0: { // 4 or more byte UTF-8 codes + HUNSPELL_WARNING(stderr, + "This UTF-8 encoding can't convert to UTF-16:\n%s\n", + src.c_str()); + u2.h = 0xff; + u2.l = 0xfd; + dest.push_back(u2); + return -1; + } + } + dest.push_back(u2); + ++u8; + } + + return dest.size(); +} + +// strip strings into token based on single char delimiter +// acts like strsep() but only uses a delim char and not +// a delim string +// default delimiter: white space characters + +char* mystrsep(char** stringp, const char delim) { + char* mp = *stringp; + if (*mp != '\0') { + char* dp; + if (delim) { + dp = strchr(mp, delim); + } else { + // don't use isspace() here, the string can be in some random charset + // that's way different than the locale's + for (dp = mp; (*dp && *dp != ' ' && *dp != '\t'); dp++) + ; + if (!*dp) + dp = NULL; + } + if (dp) { + *stringp = dp + 1; + *dp = '\0'; + } else { + *stringp = mp + strlen(mp); + } + return mp; + } + return NULL; +} + +// replaces strdup with ansi version +char* mystrdup(const char* s) { + char* d = NULL; + if (s) { + size_t sl = strlen(s) + 1; + d = (char*)malloc(sl); + if (d) 
{ + memcpy(d, s, sl); + } else { + HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); + } + } + return d; +} + +// strcat for limited length destination string +char* mystrcat(char* dest, const char* st, int max) { + int len; + int len2; + if (dest == NULL || st == NULL) + return dest; + len = strlen(dest); + len2 = strlen(st); + if (len + len2 + 1 > max) + return dest; + strcpy(dest + len, st); + return dest; +} + +// remove cross-platform text line end characters +void mychomp(char* s) { + size_t k = strlen(s); + if ((k > 0) && ((*(s + k - 1) == '\r') || (*(s + k - 1) == '\n'))) + *(s + k - 1) = '\0'; + if ((k > 1) && (*(s + k - 2) == '\r')) + *(s + k - 2) = '\0'; +} + +// break text to lines +// return number of lines +int line_tok(const char* text, char*** lines, char breakchar) { + int linenum = 0; + if (!text) { + return linenum; + } + char* dup = mystrdup(text); + char* p = strchr(dup, breakchar); + while (p) { + linenum++; + *p = '\0'; + p++; + p = strchr(p, breakchar); + } + linenum++; + *lines = (char**)malloc(linenum * sizeof(char*)); + if (!(*lines)) { + free(dup); + return 0; + } + + p = dup; + int l = 0; + for (int i = 0; i < linenum; i++) { + if (*p != '\0') { + (*lines)[l] = mystrdup(p); + if (!(*lines)[l]) { + for (i = 0; i < l; i++) + free((*lines)[i]); + free(dup); + return 0; + } + l++; + } + p += strlen(p) + 1; + } + free(dup); + if (!l) { + free(*lines); + *lines = NULL; + } + return l; +} + +// uniq line in place +char* line_uniq(char* text, char breakchar) { + char** lines; + int linenum = line_tok(text, &lines, breakchar); + int i; + strcpy(text, lines[0]); + for (i = 1; i < linenum; i++) { + int dup = 0; + for (int j = 0; j < i; j++) { + if (strcmp(lines[i], lines[j]) == 0) { + dup = 1; + break; + } + } + if (!dup) { + if ((i > 1) || (*(lines[0]) != '\0')) { + sprintf(text + strlen(text), "%c", breakchar); + } + strcat(text, lines[i]); + } + } + for (i = 0; i < linenum; i++) { + free(lines[i]); + } + free(lines); + return text; +} + +// 
uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) " +char* line_uniq_app(char** text, char breakchar) { + if (!strchr(*text, breakchar)) { + return *text; + } + + char** lines; + int i; + int linenum = line_tok(*text, &lines, breakchar); + int dup = 0; + for (i = 0; i < linenum; i++) { + for (int j = 0; j < (i - 1); j++) { + if (strcmp(lines[i], lines[j]) == 0) { + *(lines[i]) = '\0'; + dup++; + break; + } + } + } + if ((linenum - dup) == 1) { + strcpy(*text, lines[0]); + freelist(&lines, linenum); + return *text; + } + char* newtext = (char*)malloc(strlen(*text) + 2 * linenum + 3 + 1); + if (newtext) { + free(*text); + *text = newtext; + } else { + freelist(&lines, linenum); + return *text; + } + strcpy(*text, " ( "); + for (i = 0; i < linenum; i++) + if (*(lines[i])) { + sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); + } + (*text)[strlen(*text) - 2] = ')'; // " ) " + freelist(&lines, linenum); + return *text; +} + +// append s to ends of every lines in text +std::string& strlinecat(std::string& str, const std::string& apd) { + size_t pos = 0; + while ((pos = str.find('\n', pos)) != std::string::npos) { + str.insert(pos, apd); + pos += apd.length() + 1; + } + str.append(apd); + return str; +} + +// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields +// in the first line of the inputs +// return 0, if inputs equal +// return 1, if inputs may equal with a secondary suffix +// otherwise return -1 +int morphcmp(const char* s, const char* t) { + int se = 0; + int te = 0; + const char* sl; + const char* tl; + const char* olds; + const char* oldt; + if (!s || !t) + return 1; + olds = s; + sl = strchr(s, '\n'); + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) + s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + tl = strchr(t, '\n'); + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) + t = strstr(oldt, 
MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + while (s && t && (!sl || sl > s) && (!tl || tl > t)) { + s += MORPH_TAG_LEN; + t += MORPH_TAG_LEN; + se = 0; + te = 0; + while ((*s == *t) && !se && !te) { + s++; + t++; + switch (*s) { + case ' ': + case '\n': + case '\t': + case '\0': + se = 1; + } + switch (*t) { + case ' ': + case '\n': + case '\t': + case '\0': + te = 1; + } + } + if (!se || !te) { + // not terminal suffix difference + if (olds) + return -1; + return 1; + } + olds = s; + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) + s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) + t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + } + if (!s && !t && se && te) + return 0; + return 1; +} + +int get_sfxcount(const char* morph) { + if (!morph || !*morph) + return 0; + int n = 0; + const char* old = morph; + morph = strstr(morph, MORPH_DERI_SFX); + if (!morph) + morph = strstr(old, MORPH_INFL_SFX); + if (!morph) + morph = strstr(old, MORPH_TERM_SFX); + while (morph) { + n++; + old = morph; + morph = strstr(morph + 1, MORPH_DERI_SFX); + if (!morph) + morph = strstr(old + 1, MORPH_INFL_SFX); + if (!morph) + morph = strstr(old + 1, MORPH_TERM_SFX); + } + return n; +} + +int fieldlen(const char* r) { + int n = 0; + while (r && *r != ' ' && *r != '\t' && *r != '\0' && *r != '\n') { + r++; + n++; + } + return n; +} + +bool copy_field(std::string& dest, + const std::string& morph, + const std::string& var) { + if (morph.empty()) + return false; + size_t pos = morph.find(var); + if (pos == std::string::npos) + return false; + dest.clear(); + std::string beg(morph.substr(pos + MORPH_TAG_LEN, std::string::npos)); + + for (size_t i = 0; i < beg.size(); ++i) { + const char c(beg[i]); + if (c == ' ' 
|| c == '\t' || c == '\n') + break; + dest.push_back(c); + } + + return true; +} + +std::string& mystrrep(std::string& str, + const std::string& search, + const std::string& replace) { + size_t pos = 0; + while ((pos = str.find(search, pos)) != std::string::npos) { + str.replace(pos, search.length(), replace); + pos += replace.length(); + } + return str; +} + +char* mystrrep(char* word, const char* pat, const char* rep) { + char* pos = strstr(word, pat); + if (pos) { + int replen = strlen(rep); + int patlen = strlen(pat); + while (pos) { + if (replen < patlen) { + char* end = word + strlen(word); + char* next = pos + replen; + char* prev = pos + strlen(pat); + for (; prev < end;* next = *prev, prev++, next++) + ; + *next = '\0'; + } else if (replen > patlen) { + char* end = pos + patlen; + char* next = word + strlen(word) + replen - patlen; + char* prev = next - replen + patlen; + for (; prev >= end;* next = *prev, prev--, next--) + ; + } + strncpy(pos, rep, replen); + pos = strstr(word, pat); + } + } + return word; +} + +// reverse word +size_t reverseword(std::string& word) { + std::reverse(word.begin(), word.end()); + return word.size(); +} + +// reverse word +size_t reverseword_utf(std::string& word) { + std::vector<w_char> w; + u8_u16(w, word); + std::reverse(w.begin(), w.end()); + u16_u8(word, w); + return w.size(); +} + +int uniqlist(char** list, int n) { + int i; + if (n < 2) + return n; + for (i = 0; i < n; i++) { + for (int j = 0; j < i; j++) { + if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { + free(list[i]); + list[i] = NULL; + break; + } + } + } + int m = 1; + for (i = 1; i < n; i++) + if (list[i]) { + list[m] = list[i]; + m++; + } + return m; +} + +void freelist(char*** list, int n) { + if (list && *list) { + for (int i = 0; i < n; i++) + free((*list)[i]); + free(*list); + *list = NULL; + } +} + +namespace { +unsigned char cupper(const struct cs_info* csconv, int nIndex) { + if (nIndex < 0 || nIndex > 255) + return nIndex; + return 
csconv[nIndex].cupper; +} + +unsigned char clower(const struct cs_info* csconv, int nIndex) { + if (nIndex < 0 || nIndex > 255) + return nIndex; + return csconv[nIndex].clower; +} + +unsigned char ccase(const struct cs_info* csconv, int nIndex) { + if (nIndex < 0 || nIndex > 255) + return nIndex; + return csconv[nIndex].ccase; +} +} + +w_char upper_utf(w_char u, int langnum) { + unsigned short idx = (u.h << 8) + u.l; + if (idx != unicodetoupper(idx, langnum)) { + u.h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); + u.l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + } + return u; +} + +w_char lower_utf(w_char u, int langnum) { + unsigned short idx = (u.h << 8) + u.l; + if (idx != unicodetolower(idx, langnum)) { + u.h = (unsigned char)(unicodetolower(idx, langnum) >> 8); + u.l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + } + return u; +} + +// convert std::string to all caps +std::string& mkallcap(std::string& s, const struct cs_info* csconv) { + for (std::string::iterator aI = s.begin(), aEnd = s.end(); aI != aEnd; ++aI) { + *aI = cupper(csconv, static_cast<unsigned char>(*aI)); + } + return s; +} + +// convert std::string to all little +std::string& mkallsmall(std::string& s, const struct cs_info* csconv) { + for (std::string::iterator aI = s.begin(), aEnd = s.end(); aI != aEnd; ++aI) { + *aI = clower(csconv, static_cast<unsigned char>(*aI)); + } + return s; +} + +std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u, + int langnum) { + for (size_t i = 0; i < u.size(); ++i) { + unsigned short idx = (u[i].h << 8) + u[i].l; + if (idx != unicodetolower(idx, langnum)) { + u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); + u[i].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + } + } + return u; +} + +std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) { + for (size_t i = 0; i < u.size(); i++) { + unsigned short idx = (u[i].h << 8) + u[i].l; + if (idx != unicodetoupper(idx, langnum)) { + 
u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); + u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + } + } + return u; +} + +std::string& mkinitcap(std::string& s, const struct cs_info* csconv) { + if (!s.empty()) { + s[0] = cupper(csconv, static_cast<unsigned char>(s[0])); + } + return s; +} + +std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) { + if (!u.empty()) { + unsigned short idx = (u[0].h << 8) + u[0].l; + if (idx != unicodetoupper(idx, langnum)) { + u[0].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); + u[0].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + } + } + return u; +} + +std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) { + if (!s.empty()) { + s[0] = clower(csconv, static_cast<unsigned char>(s[0])); + } + return s; +} + +std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) { + if (!u.empty()) { + unsigned short idx = (u[0].h << 8) + u[0].l; + if (idx != unicodetolower(idx, langnum)) { + u[0].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); + u[0].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + } + } + return u; +} + +// conversion function for protected memory +void store_pointer(char* dest, char* source) { + memcpy(dest, &source, sizeof(char*)); +} + +// conversion function for protected memory +char* get_stored_pointer(const char* s) { + char* p; + memcpy(&p, s, sizeof(char*)); + return p; +} + +#ifndef MOZILLA_CLIENT + +// these are simple character mappings for the +// encodings supported +// supplying isupper, tolower, and toupper + +static struct cs_info iso1_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 
0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 
0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + 
{0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso2_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 
0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 
0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x01, 0xb1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x01, 0xb3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x01, 0xb5, 0xa5}, {0x01, 0xb6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x01, 0xb9, 0xa9}, {0x01, 0xba, 0xaa}, + {0x01, 0xbb, 0xab}, {0x01, 0xbc, 0xac}, {0x00, 0xad, 0xad}, + {0x01, 0xbe, 0xae}, {0x01, 0xbf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xa1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xa3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xa5}, {0x00, 0xb6, 0xa6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xa9}, + {0x00, 0xba, 0xaa}, {0x00, 0xbb, 0xab}, {0x00, 0xbc, 0xac}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xae}, {0x00, 0xbf, 0xaf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, 
{0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso3_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 
0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x01, 0xb1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x01, 0xb6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x01, 0x69, 0xa9}, {0x01, 0xba, 0xaa}, + {0x01, 0xbb, 0xab}, {0x01, 0xbc, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x01, 0xbf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 
0xb1, 0xa1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xa6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0x49}, + {0x00, 0xba, 0xaa}, {0x00, 0xbb, 0xab}, {0x00, 0xbc, 0xac}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xaf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x00, 0xc3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x00, 0xd0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xe3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xf0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso4_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, 
{0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 
0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x01, 0xb1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x01, 0xb3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x01, 0xb5, 0xa5}, {0x01, 0xb6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x01, 0xb9, 0xa9}, {0x01, 0xba, 0xaa}, + {0x01, 0xbb, 0xab}, {0x01, 0xbc, 0xac}, {0x00, 0xad, 0xad}, + {0x01, 0xbe, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xa1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xa3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xa5}, {0x00, 0xb6, 0xa6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xa9}, + {0x00, 0xba, 0xaa}, {0x00, 0xbb, 0xab}, {0x00, 0xbc, 0xac}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xae}, {0x00, 0xbf, 0xbf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 
0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso5_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + 
{0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 
0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x01, 0xf1, 0xa1}, + {0x01, 0xf2, 0xa2}, {0x01, 0xf3, 0xa3}, {0x01, 0xf4, 0xa4}, + {0x01, 0xf5, 0xa5}, {0x01, 0xf6, 0xa6}, {0x01, 0xf7, 0xa7}, + {0x01, 0xf8, 0xa8}, {0x01, 0xf9, 0xa9}, {0x01, 0xfa, 0xaa}, + {0x01, 0xfb, 0xab}, {0x01, 0xfc, 0xac}, {0x00, 0xad, 0xad}, + {0x01, 0xfe, 0xae}, {0x01, 0xff, 0xaf}, {0x01, 0xd0, 0xb0}, + {0x01, 0xd1, 0xb1}, {0x01, 0xd2, 0xb2}, {0x01, 0xd3, 0xb3}, + {0x01, 0xd4, 0xb4}, {0x01, 0xd5, 0xb5}, {0x01, 0xd6, 0xb6}, + {0x01, 0xd7, 0xb7}, {0x01, 0xd8, 0xb8}, {0x01, 0xd9, 0xb9}, + {0x01, 0xda, 0xba}, {0x01, 0xdb, 0xbb}, {0x01, 0xdc, 0xbc}, + {0x01, 0xdd, 0xbd}, {0x01, 0xde, 0xbe}, {0x01, 0xdf, 0xbf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x00, 0xd0, 0xb0}, {0x00, 0xd1, 0xb1}, + {0x00, 0xd2, 0xb2}, {0x00, 0xd3, 0xb3}, {0x00, 0xd4, 0xb4}, + {0x00, 0xd5, 0xb5}, {0x00, 0xd6, 0xb6}, {0x00, 0xd7, 0xb7}, + {0x00, 0xd8, 0xb8}, {0x00, 0xd9, 0xb9}, {0x00, 0xda, 0xba}, + {0x00, 0xdb, 0xbb}, {0x00, 0xdc, 0xbc}, {0x00, 0xdd, 0xbd}, + {0x00, 0xde, 0xbe}, {0x00, 0xdf, 0xbf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xf0}, {0x00, 
0xf1, 0xa1}, {0x00, 0xf2, 0xa2}, + {0x00, 0xf3, 0xa3}, {0x00, 0xf4, 0xa4}, {0x00, 0xf5, 0xa5}, + {0x00, 0xf6, 0xa6}, {0x00, 0xf7, 0xa7}, {0x00, 0xf8, 0xa8}, + {0x00, 0xf9, 0xa9}, {0x00, 0xfa, 0xaa}, {0x00, 0xfb, 0xab}, + {0x00, 0xfc, 0xac}, {0x00, 0xfd, 0xfd}, {0x00, 0xfe, 0xae}, + {0x00, 0xff, 0xaf}}; + +static struct cs_info iso6_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, 
{0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 
0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x00, 0xc0, 0xc0}, {0x00, 0xc1, 0xc1}, {0x00, 0xc2, 0xc2}, + {0x00, 0xc3, 0xc3}, {0x00, 0xc4, 0xc4}, {0x00, 0xc5, 0xc5}, + {0x00, 0xc6, 0xc6}, {0x00, 0xc7, 0xc7}, {0x00, 0xc8, 0xc8}, + {0x00, 0xc9, 0xc9}, {0x00, 0xca, 0xca}, {0x00, 0xcb, 0xcb}, + {0x00, 0xcc, 0xcc}, {0x00, 0xcd, 0xcd}, {0x00, 0xce, 0xce}, + {0x00, 0xcf, 0xcf}, {0x00, 0xd0, 0xd0}, {0x00, 0xd1, 0xd1}, + {0x00, 0xd2, 0xd2}, {0x00, 0xd3, 0xd3}, {0x00, 0xd4, 0xd4}, + {0x00, 0xd5, 0xd5}, {0x00, 0xd6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x00, 0xd8, 0xd8}, {0x00, 0xd9, 0xd9}, {0x00, 0xda, 0xda}, + {0x00, 0xdb, 0xdb}, {0x00, 0xdc, 0xdc}, {0x00, 0xdd, 0xdd}, + {0x00, 0xde, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xe0}, + {0x00, 0xe1, 0xe1}, {0x00, 0xe2, 0xe2}, {0x00, 0xe3, 0xe3}, + {0x00, 0xe4, 0xe4}, {0x00, 0xe5, 0xe5}, {0x00, 0xe6, 0xe6}, + {0x00, 0xe7, 0xe7}, {0x00, 0xe8, 0xe8}, {0x00, 0xe9, 0xe9}, + {0x00, 0xea, 0xea}, {0x00, 0xeb, 0xeb}, {0x00, 0xec, 0xec}, + {0x00, 0xed, 0xed}, {0x00, 0xee, 0xee}, {0x00, 0xef, 0xef}, + {0x00, 0xf0, 0xf0}, {0x00, 0xf1, 0xf1}, {0x00, 0xf2, 0xf2}, + {0x00, 0xf3, 0xf3}, {0x00, 0xf4, 0xf4}, {0x00, 0xf5, 0xf5}, + {0x00, 0xf6, 0xf6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xf8}, + {0x00, 0xf9, 0xf9}, {0x00, 0xfa, 0xfa}, {0x00, 0xfb, 0xfb}, + {0x00, 0xfc, 0xfc}, {0x00, 0xfd, 0xfd}, {0x00, 0xfe, 0xfe}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso7_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + 
{0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 
0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x01, 0xdc, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x01, 0xdd, 0xb8}, {0x01, 0xde, 0xb9}, + {0x01, 0xdf, 0xba}, {0x00, 0xbb, 0xbb}, {0x01, 0xfc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x01, 0xfd, 0xbe}, {0x01, 0xfe, 0xbf}, + {0x00, 0xc0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 
0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x00, 0xd2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x01, 0xf7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x00, 0xdc, 0xb6}, {0x00, 0xdd, 0xb8}, + {0x00, 0xde, 0xb9}, {0x00, 0xdf, 0xba}, {0x00, 0xe0, 0xe0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd3}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xd7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xbc}, {0x00, 0xfd, 0xbe}, {0x00, 0xfe, 0xbf}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso8_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, 
{0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 
0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x00, 0xc0, 0xc0}, {0x00, 0xc1, 0xc1}, {0x00, 0xc2, 0xc2}, + {0x00, 0xc3, 0xc3}, {0x00, 0xc4, 0xc4}, {0x00, 0xc5, 0xc5}, + {0x00, 0xc6, 0xc6}, {0x00, 0xc7, 0xc7}, {0x00, 0xc8, 0xc8}, + {0x00, 0xc9, 0xc9}, {0x00, 0xca, 0xca}, {0x00, 0xcb, 0xcb}, + {0x00, 0xcc, 0xcc}, {0x00, 0xcd, 0xcd}, {0x00, 0xce, 0xce}, + {0x00, 0xcf, 0xcf}, {0x00, 0xd0, 0xd0}, {0x00, 0xd1, 0xd1}, + {0x00, 0xd2, 0xd2}, {0x00, 0xd3, 0xd3}, {0x00, 0xd4, 0xd4}, + {0x00, 0xd5, 0xd5}, {0x00, 0xd6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x00, 0xd8, 0xd8}, {0x00, 0xd9, 0xd9}, {0x00, 0xda, 0xda}, + {0x00, 0xdb, 0xdb}, {0x00, 0xdc, 0xdc}, {0x00, 0xdd, 0xdd}, + {0x00, 0xde, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xe0}, + {0x00, 0xe1, 0xe1}, {0x00, 0xe2, 0xe2}, {0x00, 0xe3, 0xe3}, + {0x00, 0xe4, 0xe4}, {0x00, 0xe5, 0xe5}, {0x00, 0xe6, 0xe6}, + {0x00, 0xe7, 0xe7}, {0x00, 0xe8, 0xe8}, {0x00, 0xe9, 0xe9}, + {0x00, 0xea, 0xea}, {0x00, 0xeb, 0xeb}, {0x00, 0xec, 0xec}, + {0x00, 0xed, 0xed}, {0x00, 0xee, 0xee}, {0x00, 0xef, 0xef}, + {0x00, 
0xf0, 0xf0}, {0x00, 0xf1, 0xf1}, {0x00, 0xf2, 0xf2}, + {0x00, 0xf3, 0xf3}, {0x00, 0xf4, 0xf4}, {0x00, 0xf5, 0xf5}, + {0x00, 0xf6, 0xf6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xf8}, + {0x00, 0xf9, 0xf9}, {0x00, 0xfa, 0xfa}, {0x00, 0xfb, 0xfb}, + {0x00, 0xfc, 0xfc}, {0x00, 0xfd, 0xfd}, {0x00, 0xfe, 0xfe}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso9_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0xfd, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, 
{0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0xdd}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 
0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0x69, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0x49}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso10_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 
0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + 
{0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x00, 0xc0, 0xc0}, {0x00, 0xc1, 0xc1}, {0x00, 0xc2, 0xc2}, + {0x00, 0xc3, 0xc3}, {0x00, 0xc4, 0xc4}, {0x00, 0xc5, 0xc5}, + {0x00, 0xc6, 0xc6}, {0x00, 0xc7, 0xc7}, {0x00, 0xc8, 0xc8}, + {0x00, 0xc9, 0xc9}, {0x00, 0xca, 0xca}, {0x00, 0xcb, 0xcb}, + {0x00, 0xcc, 0xcc}, {0x00, 0xcd, 0xcd}, {0x00, 0xce, 0xce}, + {0x00, 0xcf, 
0xcf}, {0x00, 0xd0, 0xd0}, {0x00, 0xd1, 0xd1}, + {0x00, 0xd2, 0xd2}, {0x00, 0xd3, 0xd3}, {0x00, 0xd4, 0xd4}, + {0x00, 0xd5, 0xd5}, {0x00, 0xd6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x00, 0xd8, 0xd8}, {0x00, 0xd9, 0xd9}, {0x00, 0xda, 0xda}, + {0x00, 0xdb, 0xdb}, {0x00, 0xdc, 0xdc}, {0x00, 0xdd, 0xdd}, + {0x00, 0xde, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xe0}, + {0x00, 0xe1, 0xe1}, {0x00, 0xe2, 0xe2}, {0x00, 0xe3, 0xe3}, + {0x00, 0xe4, 0xe4}, {0x00, 0xe5, 0xe5}, {0x00, 0xe6, 0xe6}, + {0x00, 0xe7, 0xe7}, {0x00, 0xe8, 0xe8}, {0x00, 0xe9, 0xe9}, + {0x00, 0xea, 0xea}, {0x00, 0xeb, 0xeb}, {0x00, 0xec, 0xec}, + {0x00, 0xed, 0xed}, {0x00, 0xee, 0xee}, {0x00, 0xef, 0xef}, + {0x00, 0xf0, 0xf0}, {0x00, 0xf1, 0xf1}, {0x00, 0xf2, 0xf2}, + {0x00, 0xf3, 0xf3}, {0x00, 0xf4, 0xf4}, {0x00, 0xf5, 0xf5}, + {0x00, 0xf6, 0xf6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xf8}, + {0x00, 0xf9, 0xf9}, {0x00, 0xfa, 0xfa}, {0x00, 0xfb, 0xfb}, + {0x00, 0xfc, 0xfc}, {0x00, 0xfd, 0xfd}, {0x00, 0xfe, 0xfe}, + {0x00, 0xff, 0xff}}; + +static struct cs_info koi8r_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 
0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, 
{0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xb3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x01, 0xa3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x00, 0xc0, 0xe0}, {0x00, 0xc1, 0xe1}, {0x00, 0xc2, 0xe2}, + {0x00, 0xc3, 0xe3}, {0x00, 0xc4, 0xe4}, {0x00, 0xc5, 0xe5}, + {0x00, 0xc6, 0xe6}, {0x00, 0xc7, 0xe7}, {0x00, 0xc8, 0xe8}, + {0x00, 0xc9, 0xe9}, {0x00, 0xca, 0xea}, {0x00, 0xcb, 0xeb}, + {0x00, 0xcc, 0xec}, {0x00, 0xcd, 0xed}, {0x00, 0xce, 0xee}, + {0x00, 0xcf, 0xef}, {0x00, 0xd0, 0xf0}, {0x00, 0xd1, 0xf1}, + {0x00, 0xd2, 0xf2}, {0x00, 0xd3, 0xf3}, {0x00, 0xd4, 0xf4}, + {0x00, 0xd5, 0xf5}, {0x00, 0xd6, 0xf6}, {0x00, 0xd7, 0xf7}, + {0x00, 0xd8, 0xf8}, {0x00, 0xd9, 0xf9}, {0x00, 0xda, 0xfa}, + {0x00, 0xdb, 0xfb}, {0x00, 0xdc, 0xfc}, {0x00, 0xdd, 0xfd}, + {0x00, 0xde, 0xfe}, {0x00, 0xdf, 0xff}, {0x01, 0xc0, 0xe0}, + {0x01, 0xc1, 0xe1}, {0x01, 0xc2, 0xe2}, {0x01, 0xc3, 0xe3}, + {0x01, 0xc4, 0xe4}, {0x01, 0xc5, 0xe5}, {0x01, 0xc6, 0xe6}, + {0x01, 0xc7, 0xe7}, {0x01, 0xc8, 0xe8}, {0x01, 0xc9, 0xe9}, + {0x01, 0xca, 0xea}, {0x01, 0xcb, 0xeb}, {0x01, 0xcc, 0xec}, + {0x01, 0xcd, 0xed}, {0x01, 0xce, 0xee}, {0x01, 0xcf, 
0xef}, + {0x01, 0xd0, 0xf0}, {0x01, 0xd1, 0xf1}, {0x01, 0xd2, 0xf2}, + {0x01, 0xd3, 0xf3}, {0x01, 0xd4, 0xf4}, {0x01, 0xd5, 0xf5}, + {0x01, 0xd6, 0xf6}, {0x01, 0xd7, 0xf7}, {0x01, 0xd8, 0xf8}, + {0x01, 0xd9, 0xf9}, {0x01, 0xda, 0xfa}, {0x01, 0xdb, 0xfb}, + {0x01, 0xdc, 0xfc}, {0x01, 0xdd, 0xfd}, {0x01, 0xde, 0xfe}, + {0x01, 0xdf, 0xff}}; + +static struct cs_info koi8u_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + 
{0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xb3}, {0x00, 0xa4, 0xb4}, /* ie */ + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xb6}, /* i */ + {0x00, 0xa7, 0xb7}, /* ii */ + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 
0xad, 0xbd}, /* g'' */ + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x01, 0xa3, 0xb3}, + {0x00, 0xb4, 0xb4}, /* IE */ + {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, /* I */ + {0x00, 0xb7, 0xb7}, /* II */ + {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, {0x00, 0xba, 0xba}, + {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, {0x00, 0xbd, 0xbd}, + {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, {0x00, 0xc0, 0xe0}, + {0x00, 0xc1, 0xe1}, {0x00, 0xc2, 0xe2}, {0x00, 0xc3, 0xe3}, + {0x00, 0xc4, 0xe4}, {0x00, 0xc5, 0xe5}, {0x00, 0xc6, 0xe6}, + {0x00, 0xc7, 0xe7}, {0x00, 0xc8, 0xe8}, {0x00, 0xc9, 0xe9}, + {0x00, 0xca, 0xea}, {0x00, 0xcb, 0xeb}, {0x00, 0xcc, 0xec}, + {0x00, 0xcd, 0xed}, {0x00, 0xce, 0xee}, {0x00, 0xcf, 0xef}, + {0x00, 0xd0, 0xf0}, {0x00, 0xd1, 0xf1}, {0x00, 0xd2, 0xf2}, + {0x00, 0xd3, 0xf3}, {0x00, 0xd4, 0xf4}, {0x00, 0xd5, 0xf5}, + {0x00, 0xd6, 0xf6}, {0x00, 0xd7, 0xf7}, {0x00, 0xd8, 0xf8}, + {0x00, 0xd9, 0xf9}, {0x00, 0xda, 0xfa}, {0x00, 0xdb, 0xfb}, + {0x00, 0xdc, 0xfc}, {0x00, 0xdd, 0xfd}, {0x00, 0xde, 0xfe}, + {0x00, 0xdf, 0xff}, {0x01, 0xc0, 0xe0}, {0x01, 0xc1, 0xe1}, + {0x01, 0xc2, 0xe2}, {0x01, 0xc3, 0xe3}, {0x01, 0xc4, 0xe4}, + {0x01, 0xc5, 0xe5}, {0x01, 0xc6, 0xe6}, {0x01, 0xc7, 0xe7}, + {0x01, 0xc8, 0xe8}, {0x01, 0xc9, 0xe9}, {0x01, 0xca, 0xea}, + {0x01, 0xcb, 0xeb}, {0x01, 0xcc, 0xec}, {0x01, 0xcd, 0xed}, + {0x01, 0xce, 0xee}, {0x01, 0xcf, 0xef}, {0x01, 0xd0, 0xf0}, + {0x01, 0xd1, 0xf1}, {0x01, 0xd2, 0xf2}, {0x01, 0xd3, 0xf3}, + {0x01, 0xd4, 0xf4}, {0x01, 0xd5, 0xf5}, {0x01, 0xd6, 0xf6}, + {0x01, 0xd7, 0xf7}, {0x01, 0xd8, 0xf8}, {0x01, 0xd9, 0xf9}, + {0x01, 0xda, 0xfa}, {0x01, 0xdb, 0xfb}, {0x01, 0xdc, 0xfc}, + {0x01, 0xdd, 0xfd}, {0x01, 0xde, 0xfe}, {0x01, 0xdf, 0xff}}; + +static struct cs_info cp1251_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, 
{0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 
0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x01, 0x90, 0x80}, + {0x01, 0x83, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x81}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x01, 0x9a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x01, 0x9c, 0x8c}, + {0x01, 0x9d, 0x8d}, {0x01, 0x9e, 0x8e}, {0x01, 0x9f, 0x8f}, + {0x00, 0x90, 0x80}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x8a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x8c}, {0x00, 0x9d, 0x8d}, {0x00, 0x9e, 0x8e}, + {0x00, 0x9f, 0x8f}, {0x00, 0xa0, 0xa0}, {0x01, 0xa2, 0xa1}, + {0x00, 0xa2, 0xa1}, {0x01, 0xbc, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x01, 0xb4, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x01, 0xb8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x01, 0xba, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x01, 0xbf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x01, 0xb3, 0xb2}, {0x00, 0xb3, 0xb2}, + {0x00, 0xb4, 0xa5}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xa8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xaa}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xa3}, + {0x01, 0xbe, 0xbd}, {0x00, 0xbe, 0xbd}, {0x00, 0xbf, 0xaf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 
0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x01, 0xf7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x01, 0xff, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xd7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xdf}}; + +static struct cs_info iso13_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0A, 0x0A}, {0x00, 0x0B, 0x0B}, + {0x00, 0x0C, 0x0C}, {0x00, 0x0D, 0x0D}, {0x00, 0x0E, 0x0E}, + {0x00, 0x0F, 0x0F}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1A, 0x1A}, + {0x00, 0x1B, 0x1B}, {0x00, 0x1C, 0x1C}, {0x00, 0x1D, 0x1D}, + {0x00, 0x1E, 0x1E}, {0x00, 0x1F, 0x1F}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, 
+ {0x00, 0x2A, 0x2A}, {0x00, 0x2B, 0x2B}, {0x00, 0x2C, 0x2C}, + {0x00, 0x2D, 0x2D}, {0x00, 0x2E, 0x2E}, {0x00, 0x2F, 0x2F}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3A, 0x3A}, {0x00, 0x3B, 0x3B}, + {0x00, 0x3C, 0x3C}, {0x00, 0x3D, 0x3D}, {0x00, 0x3E, 0x3E}, + {0x00, 0x3F, 0x3F}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6A, 0x4A}, + {0x01, 0x6B, 0x4B}, {0x01, 0x6C, 0x4C}, {0x01, 0x6D, 0x4D}, + {0x01, 0x6E, 0x4E}, {0x01, 0x6F, 0x4F}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7A, 0x5A}, {0x00, 0x5B, 0x5B}, {0x00, 0x5C, 0x5C}, + {0x00, 0x5D, 0x5D}, {0x00, 0x5E, 0x5E}, {0x00, 0x5F, 0x5F}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6A, 0x4A}, {0x00, 0x6B, 0x4B}, + {0x00, 0x6C, 0x4C}, {0x00, 0x6D, 0x4D}, {0x00, 0x6E, 0x4E}, + {0x00, 0x6F, 0x4F}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7A, 0x5A}, + {0x00, 0x7B, 0x7B}, {0x00, 0x7C, 0x7C}, {0x00, 0x7D, 0x7D}, + {0x00, 0x7E, 0x7E}, {0x00, 0x7F, 0x7F}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8A, 
0x8A}, {0x00, 0x8B, 0x8B}, {0x00, 0x8C, 0x8C}, + {0x00, 0x8D, 0x8D}, {0x00, 0x8E, 0x8E}, {0x00, 0x8F, 0x8F}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9A, 0x9A}, {0x00, 0x9B, 0x9B}, + {0x00, 0x9C, 0x9C}, {0x00, 0x9D, 0x9D}, {0x00, 0x9E, 0x9E}, + {0x00, 0x9F, 0x9F}, {0x00, 0xA0, 0xA0}, {0x00, 0xA1, 0xA1}, + {0x00, 0xA2, 0xA2}, {0x00, 0xA3, 0xA3}, {0x00, 0xA4, 0xA4}, + {0x00, 0xA5, 0xA5}, {0x00, 0xA6, 0xA6}, {0x00, 0xA7, 0xA7}, + {0x01, 0xB8, 0xA8}, {0x00, 0xA9, 0xA9}, {0x01, 0xBA, 0xAA}, + {0x00, 0xAB, 0xAB}, {0x00, 0xAC, 0xAC}, {0x00, 0xAD, 0xAD}, + {0x00, 0xAE, 0xAE}, {0x01, 0xBF, 0xAF}, {0x00, 0xB0, 0xB0}, + {0x00, 0xB1, 0xB1}, {0x00, 0xB2, 0xB2}, {0x00, 0xB3, 0xB3}, + {0x00, 0xB4, 0xB4}, {0x00, 0xB5, 0xB5}, {0x00, 0xB6, 0xB6}, + {0x00, 0xB7, 0xB7}, {0x00, 0xB8, 0xA8}, {0x00, 0xB9, 0xB9}, + {0x00, 0xBA, 0xAA}, {0x00, 0xBB, 0xBB}, {0x00, 0xBC, 0xBC}, + {0x00, 0xBD, 0xBD}, {0x00, 0xBE, 0xBE}, {0x00, 0xBF, 0xAF}, + {0x01, 0xE0, 0xC0}, {0x01, 0xE1, 0xC1}, {0x01, 0xE2, 0xC2}, + {0x01, 0xE3, 0xC3}, {0x01, 0xE4, 0xC4}, {0x01, 0xE5, 0xC5}, + {0x01, 0xE6, 0xC6}, {0x01, 0xE7, 0xC7}, {0x01, 0xE8, 0xC8}, + {0x01, 0xE9, 0xC9}, {0x01, 0xEA, 0xCA}, {0x01, 0xEB, 0xCB}, + {0x01, 0xEC, 0xCC}, {0x01, 0xED, 0xCD}, {0x01, 0xEE, 0xCE}, + {0x01, 0xEF, 0xCF}, {0x01, 0xF0, 0xD0}, {0x01, 0xF1, 0xD1}, + {0x01, 0xF2, 0xD2}, {0x01, 0xF3, 0xD3}, {0x01, 0xF4, 0xD4}, + {0x01, 0xF5, 0xD5}, {0x01, 0xF6, 0xD6}, {0x00, 0xD7, 0xD7}, + {0x01, 0xF8, 0xD8}, {0x01, 0xF9, 0xD9}, {0x01, 0xFA, 0xDA}, + {0x01, 0xFB, 0xDB}, {0x01, 0xFC, 0xDC}, {0x01, 0xFD, 0xDD}, + {0x01, 0xFE, 0xDE}, {0x00, 0xDF, 0xDF}, {0x00, 0xE0, 0xC0}, + {0x00, 0xE1, 0xC1}, {0x00, 0xE2, 0xC2}, {0x00, 0xE3, 0xC3}, + {0x00, 0xE4, 0xC4}, {0x00, 0xE5, 0xC5}, {0x00, 0xE6, 0xC6}, + {0x00, 0xE7, 0xC7}, {0x00, 0xE8, 0xC8}, {0x00, 0xE9, 0xC9}, + {0x00, 0xEA, 0xCA}, {0x00, 
0xEB, 0xCB}, {0x00, 0xEC, 0xCC}, + {0x00, 0xED, 0xCD}, {0x00, 0xEE, 0xCE}, {0x00, 0xEF, 0xCF}, + {0x00, 0xF0, 0xD0}, {0x00, 0xF1, 0xD1}, {0x00, 0xF2, 0xD2}, + {0x00, 0xF3, 0xD3}, {0x00, 0xF4, 0xD4}, {0x00, 0xF5, 0xD5}, + {0x00, 0xF6, 0xD6}, {0x00, 0xF7, 0xF7}, {0x00, 0xF8, 0xD8}, + {0x00, 0xF9, 0xD9}, {0x00, 0xFA, 0xDA}, {0x00, 0xFB, 0xDB}, + {0x00, 0xFC, 0xDC}, {0x00, 0xFD, 0xDD}, {0x00, 0xFE, 0xDE}, + {0x00, 0xFF, 0xFF}}; + +static struct cs_info iso14_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, 
{0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x01, 0xa2, 0xa1}, + {0x00, 0xa2, 0xa1}, {0x00, 0xa3, 0xa3}, {0x01, 0xa5, 0xa4}, + {0x00, 0xa5, 0xa4}, {0x01, 0xa6, 0xab}, {0x00, 0xa7, 0xa7}, + {0x01, 0xb8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x01, 0xba, 
0xaa}, + {0x00, 0xab, 0xa6}, {0x01, 0xbc, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x01, 0xff, 0xaf}, {0x01, 0xb1, 0xb0}, + {0x00, 0xb1, 0xb0}, {0x01, 0xb3, 0xb2}, {0x00, 0xb3, 0xb2}, + {0x01, 0xb5, 0xb4}, {0x00, 0xb5, 0xb4}, {0x00, 0xb6, 0xb6}, + {0x01, 0xb9, 0xb7}, {0x00, 0xb8, 0xa8}, {0x00, 0xb9, 0xb6}, + {0x00, 0xba, 0xaa}, {0x01, 0xbf, 0xbb}, {0x00, 0xbc, 0xac}, + {0x01, 0xbe, 0xbd}, {0x00, 0xbe, 0xbd}, {0x00, 0xbf, 0xbb}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x01, 0xf7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xd7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xff}}; + +static struct cs_info iso15_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + 
{0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 
0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x01, 0xa8, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa6}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x01, 0xb8, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb4}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x01, 0xbd, 0xbc}, + {0x00, 0xbd, 0xbc}, {0x01, 0xff, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x01, 0xe0, 0xc0}, {0x01, 0xe1, 0xc1}, {0x01, 0xe2, 0xc2}, + {0x01, 0xe3, 0xc3}, {0x01, 0xe4, 0xc4}, {0x01, 0xe5, 0xc5}, + {0x01, 0xe6, 0xc6}, {0x01, 0xe7, 0xc7}, {0x01, 0xe8, 0xc8}, + {0x01, 0xe9, 0xc9}, {0x01, 
0xea, 0xca}, {0x01, 0xeb, 0xcb}, + {0x01, 0xec, 0xcc}, {0x01, 0xed, 0xcd}, {0x01, 0xee, 0xce}, + {0x01, 0xef, 0xcf}, {0x01, 0xf0, 0xd0}, {0x01, 0xf1, 0xd1}, + {0x01, 0xf2, 0xd2}, {0x01, 0xf3, 0xd3}, {0x01, 0xf4, 0xd4}, + {0x01, 0xf5, 0xd5}, {0x01, 0xf6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x01, 0xf8, 0xd8}, {0x01, 0xf9, 0xd9}, {0x01, 0xfa, 0xda}, + {0x01, 0xfb, 0xdb}, {0x01, 0xfc, 0xdc}, {0x01, 0xfd, 0xdd}, + {0x01, 0xfe, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xc0}, + {0x00, 0xe1, 0xc1}, {0x00, 0xe2, 0xc2}, {0x00, 0xe3, 0xc3}, + {0x00, 0xe4, 0xc4}, {0x00, 0xe5, 0xc5}, {0x00, 0xe6, 0xc6}, + {0x00, 0xe7, 0xc7}, {0x00, 0xe8, 0xc8}, {0x00, 0xe9, 0xc9}, + {0x00, 0xea, 0xca}, {0x00, 0xeb, 0xcb}, {0x00, 0xec, 0xcc}, + {0x00, 0xed, 0xcd}, {0x00, 0xee, 0xce}, {0x00, 0xef, 0xcf}, + {0x00, 0xf0, 0xd0}, {0x00, 0xf1, 0xd1}, {0x00, 0xf2, 0xd2}, + {0x00, 0xf3, 0xd3}, {0x00, 0xf4, 0xd4}, {0x00, 0xf5, 0xd5}, + {0x00, 0xf6, 0xd6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xd8}, + {0x00, 0xf9, 0xd9}, {0x00, 0xfa, 0xda}, {0x00, 0xfb, 0xdb}, + {0x00, 0xfc, 0xdc}, {0x00, 0xfd, 0xdd}, {0x00, 0xfe, 0xde}, + {0x00, 0xff, 0xbe}}; + +static struct cs_info iscii_devanagari_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 
0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + {0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, 
{0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x00, 0xc0, 0xc0}, {0x00, 0xc1, 0xc1}, {0x00, 0xc2, 0xc2}, + {0x00, 0xc3, 0xc3}, {0x00, 0xc4, 0xc4}, {0x00, 0xc5, 0xc5}, + {0x00, 0xc6, 0xc6}, {0x00, 0xc7, 0xc7}, {0x00, 0xc8, 0xc8}, + {0x00, 0xc9, 0xc9}, {0x00, 0xca, 0xca}, {0x00, 0xcb, 0xcb}, + {0x00, 0xcc, 0xcc}, {0x00, 0xcd, 0xcd}, {0x00, 0xce, 0xce}, + {0x00, 0xcf, 0xcf}, {0x00, 0xd0, 0xd0}, {0x00, 0xd1, 0xd1}, + {0x00, 0xd2, 0xd2}, {0x00, 0xd3, 0xd3}, {0x00, 0xd4, 0xd4}, + {0x00, 0xd5, 0xd5}, {0x00, 0xd6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x00, 0xd8, 0xd8}, {0x00, 0xd9, 0xd9}, {0x00, 0xda, 0xda}, + {0x00, 0xdb, 0xdb}, {0x00, 0xdc, 0xdc}, {0x00, 0xdd, 0xdd}, + {0x00, 0xde, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xe0}, + {0x00, 0xe1, 0xe1}, {0x00, 0xe2, 0xe2}, {0x00, 0xe3, 0xe3}, + {0x00, 0xe4, 0xe4}, {0x00, 0xe5, 0xe5}, {0x00, 0xe6, 0xe6}, + {0x00, 0xe7, 0xe7}, {0x00, 0xe8, 0xe8}, {0x00, 0xe9, 
0xe9}, + {0x00, 0xea, 0xea}, {0x00, 0xeb, 0xeb}, {0x00, 0xec, 0xec}, + {0x00, 0xed, 0xed}, {0x00, 0xee, 0xee}, {0x00, 0xef, 0xef}, + {0x00, 0xf0, 0xf0}, {0x00, 0xf1, 0xf1}, {0x00, 0xf2, 0xf2}, + {0x00, 0xf3, 0xf3}, {0x00, 0xf4, 0xf4}, {0x00, 0xf5, 0xf5}, + {0x00, 0xf6, 0xf6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xf8}, + {0x00, 0xf9, 0xf9}, {0x00, 0xfa, 0xfa}, {0x00, 0xfb, 0xfb}, + {0x00, 0xfc, 0xfc}, {0x00, 0xfd, 0xfd}, {0x00, 0xfe, 0xfe}, + {0x00, 0xff, 0xff}}; + +static struct cs_info tis620_tbl[] = { + {0x00, 0x00, 0x00}, {0x00, 0x01, 0x01}, {0x00, 0x02, 0x02}, + {0x00, 0x03, 0x03}, {0x00, 0x04, 0x04}, {0x00, 0x05, 0x05}, + {0x00, 0x06, 0x06}, {0x00, 0x07, 0x07}, {0x00, 0x08, 0x08}, + {0x00, 0x09, 0x09}, {0x00, 0x0a, 0x0a}, {0x00, 0x0b, 0x0b}, + {0x00, 0x0c, 0x0c}, {0x00, 0x0d, 0x0d}, {0x00, 0x0e, 0x0e}, + {0x00, 0x0f, 0x0f}, {0x00, 0x10, 0x10}, {0x00, 0x11, 0x11}, + {0x00, 0x12, 0x12}, {0x00, 0x13, 0x13}, {0x00, 0x14, 0x14}, + {0x00, 0x15, 0x15}, {0x00, 0x16, 0x16}, {0x00, 0x17, 0x17}, + {0x00, 0x18, 0x18}, {0x00, 0x19, 0x19}, {0x00, 0x1a, 0x1a}, + {0x00, 0x1b, 0x1b}, {0x00, 0x1c, 0x1c}, {0x00, 0x1d, 0x1d}, + {0x00, 0x1e, 0x1e}, {0x00, 0x1f, 0x1f}, {0x00, 0x20, 0x20}, + {0x00, 0x21, 0x21}, {0x00, 0x22, 0x22}, {0x00, 0x23, 0x23}, + {0x00, 0x24, 0x24}, {0x00, 0x25, 0x25}, {0x00, 0x26, 0x26}, + {0x00, 0x27, 0x27}, {0x00, 0x28, 0x28}, {0x00, 0x29, 0x29}, + {0x00, 0x2a, 0x2a}, {0x00, 0x2b, 0x2b}, {0x00, 0x2c, 0x2c}, + {0x00, 0x2d, 0x2d}, {0x00, 0x2e, 0x2e}, {0x00, 0x2f, 0x2f}, + {0x00, 0x30, 0x30}, {0x00, 0x31, 0x31}, {0x00, 0x32, 0x32}, + {0x00, 0x33, 0x33}, {0x00, 0x34, 0x34}, {0x00, 0x35, 0x35}, + {0x00, 0x36, 0x36}, {0x00, 0x37, 0x37}, {0x00, 0x38, 0x38}, + {0x00, 0x39, 0x39}, {0x00, 0x3a, 0x3a}, {0x00, 0x3b, 0x3b}, + {0x00, 0x3c, 0x3c}, {0x00, 0x3d, 0x3d}, {0x00, 0x3e, 0x3e}, + {0x00, 0x3f, 0x3f}, {0x00, 0x40, 0x40}, {0x01, 0x61, 0x41}, + {0x01, 0x62, 0x42}, {0x01, 0x63, 0x43}, {0x01, 0x64, 0x44}, + {0x01, 0x65, 0x45}, {0x01, 0x66, 0x46}, {0x01, 0x67, 0x47}, + 
{0x01, 0x68, 0x48}, {0x01, 0x69, 0x49}, {0x01, 0x6a, 0x4a}, + {0x01, 0x6b, 0x4b}, {0x01, 0x6c, 0x4c}, {0x01, 0x6d, 0x4d}, + {0x01, 0x6e, 0x4e}, {0x01, 0x6f, 0x4f}, {0x01, 0x70, 0x50}, + {0x01, 0x71, 0x51}, {0x01, 0x72, 0x52}, {0x01, 0x73, 0x53}, + {0x01, 0x74, 0x54}, {0x01, 0x75, 0x55}, {0x01, 0x76, 0x56}, + {0x01, 0x77, 0x57}, {0x01, 0x78, 0x58}, {0x01, 0x79, 0x59}, + {0x01, 0x7a, 0x5a}, {0x00, 0x5b, 0x5b}, {0x00, 0x5c, 0x5c}, + {0x00, 0x5d, 0x5d}, {0x00, 0x5e, 0x5e}, {0x00, 0x5f, 0x5f}, + {0x00, 0x60, 0x60}, {0x00, 0x61, 0x41}, {0x00, 0x62, 0x42}, + {0x00, 0x63, 0x43}, {0x00, 0x64, 0x44}, {0x00, 0x65, 0x45}, + {0x00, 0x66, 0x46}, {0x00, 0x67, 0x47}, {0x00, 0x68, 0x48}, + {0x00, 0x69, 0x49}, {0x00, 0x6a, 0x4a}, {0x00, 0x6b, 0x4b}, + {0x00, 0x6c, 0x4c}, {0x00, 0x6d, 0x4d}, {0x00, 0x6e, 0x4e}, + {0x00, 0x6f, 0x4f}, {0x00, 0x70, 0x50}, {0x00, 0x71, 0x51}, + {0x00, 0x72, 0x52}, {0x00, 0x73, 0x53}, {0x00, 0x74, 0x54}, + {0x00, 0x75, 0x55}, {0x00, 0x76, 0x56}, {0x00, 0x77, 0x57}, + {0x00, 0x78, 0x58}, {0x00, 0x79, 0x59}, {0x00, 0x7a, 0x5a}, + {0x00, 0x7b, 0x7b}, {0x00, 0x7c, 0x7c}, {0x00, 0x7d, 0x7d}, + {0x00, 0x7e, 0x7e}, {0x00, 0x7f, 0x7f}, {0x00, 0x80, 0x80}, + {0x00, 0x81, 0x81}, {0x00, 0x82, 0x82}, {0x00, 0x83, 0x83}, + {0x00, 0x84, 0x84}, {0x00, 0x85, 0x85}, {0x00, 0x86, 0x86}, + {0x00, 0x87, 0x87}, {0x00, 0x88, 0x88}, {0x00, 0x89, 0x89}, + {0x00, 0x8a, 0x8a}, {0x00, 0x8b, 0x8b}, {0x00, 0x8c, 0x8c}, + {0x00, 0x8d, 0x8d}, {0x00, 0x8e, 0x8e}, {0x00, 0x8f, 0x8f}, + {0x00, 0x90, 0x90}, {0x00, 0x91, 0x91}, {0x00, 0x92, 0x92}, + {0x00, 0x93, 0x93}, {0x00, 0x94, 0x94}, {0x00, 0x95, 0x95}, + {0x00, 0x96, 0x96}, {0x00, 0x97, 0x97}, {0x00, 0x98, 0x98}, + {0x00, 0x99, 0x99}, {0x00, 0x9a, 0x9a}, {0x00, 0x9b, 0x9b}, + {0x00, 0x9c, 0x9c}, {0x00, 0x9d, 0x9d}, {0x00, 0x9e, 0x9e}, + {0x00, 0x9f, 0x9f}, {0x00, 0xa0, 0xa0}, {0x00, 0xa1, 0xa1}, + {0x00, 0xa2, 0xa2}, {0x00, 0xa3, 0xa3}, {0x00, 0xa4, 0xa4}, + {0x00, 0xa5, 0xa5}, {0x00, 0xa6, 0xa6}, {0x00, 0xa7, 0xa7}, + {0x00, 0xa8, 
0xa8}, {0x00, 0xa9, 0xa9}, {0x00, 0xaa, 0xaa}, + {0x00, 0xab, 0xab}, {0x00, 0xac, 0xac}, {0x00, 0xad, 0xad}, + {0x00, 0xae, 0xae}, {0x00, 0xaf, 0xaf}, {0x00, 0xb0, 0xb0}, + {0x00, 0xb1, 0xb1}, {0x00, 0xb2, 0xb2}, {0x00, 0xb3, 0xb3}, + {0x00, 0xb4, 0xb4}, {0x00, 0xb5, 0xb5}, {0x00, 0xb6, 0xb6}, + {0x00, 0xb7, 0xb7}, {0x00, 0xb8, 0xb8}, {0x00, 0xb9, 0xb9}, + {0x00, 0xba, 0xba}, {0x00, 0xbb, 0xbb}, {0x00, 0xbc, 0xbc}, + {0x00, 0xbd, 0xbd}, {0x00, 0xbe, 0xbe}, {0x00, 0xbf, 0xbf}, + {0x00, 0xc0, 0xc0}, {0x00, 0xc1, 0xc1}, {0x00, 0xc2, 0xc2}, + {0x00, 0xc3, 0xc3}, {0x00, 0xc4, 0xc4}, {0x00, 0xc5, 0xc5}, + {0x00, 0xc6, 0xc6}, {0x00, 0xc7, 0xc7}, {0x00, 0xc8, 0xc8}, + {0x00, 0xc9, 0xc9}, {0x00, 0xca, 0xca}, {0x00, 0xcb, 0xcb}, + {0x00, 0xcc, 0xcc}, {0x00, 0xcd, 0xcd}, {0x00, 0xce, 0xce}, + {0x00, 0xcf, 0xcf}, {0x00, 0xd0, 0xd0}, {0x00, 0xd1, 0xd1}, + {0x00, 0xd2, 0xd2}, {0x00, 0xd3, 0xd3}, {0x00, 0xd4, 0xd4}, + {0x00, 0xd5, 0xd5}, {0x00, 0xd6, 0xd6}, {0x00, 0xd7, 0xd7}, + {0x00, 0xd8, 0xd8}, {0x00, 0xd9, 0xd9}, {0x00, 0xda, 0xda}, + {0x00, 0xdb, 0xdb}, {0x00, 0xdc, 0xdc}, {0x00, 0xdd, 0xdd}, + {0x00, 0xde, 0xde}, {0x00, 0xdf, 0xdf}, {0x00, 0xe0, 0xe0}, + {0x00, 0xe1, 0xe1}, {0x00, 0xe2, 0xe2}, {0x00, 0xe3, 0xe3}, + {0x00, 0xe4, 0xe4}, {0x00, 0xe5, 0xe5}, {0x00, 0xe6, 0xe6}, + {0x00, 0xe7, 0xe7}, {0x00, 0xe8, 0xe8}, {0x00, 0xe9, 0xe9}, + {0x00, 0xea, 0xea}, {0x00, 0xeb, 0xeb}, {0x00, 0xec, 0xec}, + {0x00, 0xed, 0xed}, {0x00, 0xee, 0xee}, {0x00, 0xef, 0xef}, + {0x00, 0xf0, 0xf0}, {0x00, 0xf1, 0xf1}, {0x00, 0xf2, 0xf2}, + {0x00, 0xf3, 0xf3}, {0x00, 0xf4, 0xf4}, {0x00, 0xf5, 0xf5}, + {0x00, 0xf6, 0xf6}, {0x00, 0xf7, 0xf7}, {0x00, 0xf8, 0xf8}, + {0x00, 0xf9, 0xf9}, {0x00, 0xfa, 0xfa}, {0x00, 0xfb, 0xfb}, + {0x00, 0xfc, 0xfc}, {0x00, 0xfd, 0xfd}, {0x00, 0xfe, 0xfe}, + {0x00, 0xff, 0xff}}; + +struct enc_entry { + const char* enc_name; + struct cs_info* cs_table; +}; + +static struct enc_entry encds[] = { + {"iso88591", iso1_tbl}, // ISO-8859-1 + {"iso88592", iso2_tbl}, // 
ISO-8859-2 + {"iso88593", iso3_tbl}, // ISO-8859-3 + {"iso88594", iso4_tbl}, // ISO-8859-4 + {"iso88595", iso5_tbl}, // ISO-8859-5 + {"iso88596", iso6_tbl}, // ISO-8859-6 + {"iso88597", iso7_tbl}, // ISO-8859-7 + {"iso88598", iso8_tbl}, // ISO-8859-8 + {"iso88599", iso9_tbl}, // ISO-8859-9 + {"iso885910", iso10_tbl}, // ISO-8859-10 + {"tis620", tis620_tbl}, // TIS-620/ISO-8859-11 + {"tis6202533", tis620_tbl}, // TIS-620/ISO-8859-11 + {"iso885911", tis620_tbl}, // TIS-620/ISO-8859-11 + {"iso885913", iso13_tbl}, // ISO-8859-13 + {"iso885914", iso14_tbl}, // ISO-8859-14 + {"iso885915", iso15_tbl}, // ISO-8859-15 + {"koi8r", koi8r_tbl}, // KOI8-R + {"koi8u", koi8u_tbl}, // KOI8-U + {"cp1251", cp1251_tbl}, // CP-1251 + {"microsoftcp1251", cp1251_tbl}, // microsoft-cp1251 + {"xisciias", iscii_devanagari_tbl}, // x-iscii-as + {"isciidevanagari", iscii_devanagari_tbl} // ISCII-DEVANAGARI +}; + +/* map to lower case and remove non alphanumeric chars */ +static void toAsciiLowerAndRemoveNonAlphanumeric(const char* pName, + char* pBuf) { + while (*pName) { + /* A-Z */ + if ((*pName >= 0x41) && (*pName <= 0x5A)) { + *pBuf = (*pName) + 0x20; /* toAsciiLower */ + pBuf++; + } + /* a-z, 0-9 */ + else if (((*pName >= 0x61) && (*pName <= 0x7A)) || + ((*pName >= 0x30) && (*pName <= 0x39))) { + *pBuf = *pName; + pBuf++; + } + + pName++; + } + + *pBuf = '\0'; +} + +struct cs_info* get_current_cs(const char* es) { + char* normalized_encoding = new char[strlen(es) + 1]; + toAsciiLowerAndRemoveNonAlphanumeric(es, normalized_encoding); + + struct cs_info* ccs = NULL; + int n = sizeof(encds) / sizeof(encds[0]); + for (int i = 0; i < n; i++) { + if (strcmp(normalized_encoding, encds[i].enc_name) == 0) { + ccs = encds[i].cs_table; + break; + } + } + + delete[] normalized_encoding; + + if (!ccs) { + HUNSPELL_WARNING(stderr, + "error: unknown encoding %s: using %s as fallback\n", es, + encds[0].enc_name); + ccs = encds[0].cs_table; + } + + return ccs; +} +#else +// XXX This function was 
rewritten for mozilla. Instead of storing the +// conversion tables static in this file, create them when needed +// with help the mozilla backend. +struct cs_info* get_current_cs(const char* es) { + struct cs_info* ccs = new cs_info[256]; + // Initialze the array with dummy data so that we wouldn't need + // to return null in case of failures. + for (int i = 0; i <= 0xff; ++i) { + ccs[i].ccase = false; + ccs[i].clower = i; + ccs[i].cupper = i; + } + + nsCOMPtr<nsIUnicodeEncoder> encoder; + nsCOMPtr<nsIUnicodeDecoder> decoder; + + nsresult rv; + + nsAutoCString label(es); + nsAutoCString encoding; + if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) { + return ccs; + } + encoder = EncodingUtils::EncoderForEncoding(encoding); + decoder = EncodingUtils::DecoderForEncoding(encoding); + encoder->SetOutputErrorBehavior(encoder->kOnError_Signal, nullptr, '?'); + decoder->SetInputErrorBehavior(decoder->kOnError_Signal); + + for (unsigned int i = 0; i <= 0xff; ++i) { + bool success = false; + // We want to find the upper/lowercase equivalents of each byte + // in this 1-byte character encoding. Call our encoding/decoding + // APIs separately for each byte since they may reject some of the + // bytes, and we want to handle errors separately for each byte. + char lower, upper; + do { + if (i == 0) + break; + const char source = char(i); + char16_t uni, uniCased; + int32_t charLength = 1, uniLength = 1; + + rv = decoder->Convert(&source, &charLength, &uni, &uniLength); + // Explicitly check NS_OK because we don't want to allow + // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT. + if (rv != NS_OK || charLength != 1 || uniLength != 1) + break; + uniCased = ToLowerCase(uni); + rv = encoder->Convert(&uniCased, &uniLength, &lower, &charLength); + // Explicitly check NS_OK because we don't want to allow + // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT. 
+ if (rv != NS_OK || charLength != 1 || uniLength != 1) + break; + + uniCased = ToUpperCase(uni); + rv = encoder->Convert(&uniCased, &uniLength, &upper, &charLength); + // Explicitly check NS_OK because we don't want to allow + // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT. + if (rv != NS_OK || charLength != 1 || uniLength != 1) + break; + + success = true; + } while (0); + + if (success) { + ccs[i].cupper = upper; + ccs[i].clower = lower; + } else { + ccs[i].cupper = i; + ccs[i].clower = i; + } + + if (ccs[i].clower != (unsigned char)i) + ccs[i].ccase = true; + else + ccs[i].ccase = false; + } + + return ccs; +} +#endif + +// primitive isalpha() replacement for tokenization +char* get_casechars(const char* enc) { + struct cs_info* csconv = get_current_cs(enc); + char expw[MAXLNLEN]; + char* p = expw; + for (int i = 0; i <= 255; i++) { + if (cupper(csconv, i) != clower(csconv, i)) { + *p = static_cast<char>(i); + p++; + } + } + *p = '\0'; +#ifdef MOZILLA_CLIENT + delete[] csconv; +#endif + return mystrdup(expw); +} + +// language to encoding default map + +struct lang_map { + const char* lang; + int num; +}; + +static struct lang_map lang2enc[] = + {{"ar", LANG_ar}, {"az", LANG_az}, + {"az_AZ", LANG_az}, // for back-compatibility + {"bg", LANG_bg}, {"ca", LANG_ca}, + {"cs", LANG_cs}, {"da", LANG_da}, + {"de", LANG_de}, {"el", LANG_el}, + {"en", LANG_en}, {"es", LANG_es}, + {"eu", LANG_eu}, {"gl", LANG_gl}, + {"fr", LANG_fr}, {"hr", LANG_hr}, + {"hu", LANG_hu}, {"hu_HU", LANG_hu}, // for back-compatibility + {"it", LANG_it}, {"la", LANG_la}, + {"lv", LANG_lv}, {"nl", LANG_nl}, + {"pl", LANG_pl}, {"pt", LANG_pt}, + {"sv", LANG_sv}, {"tr", LANG_tr}, + {"tr_TR", LANG_tr}, // for back-compatibility + {"ru", LANG_ru}, {"uk", LANG_uk}}; + +int get_lang_num(const char* lang) { + int n = sizeof(lang2enc) / sizeof(lang2enc[0]); + for (int i = 0; i < n; i++) { + if (strcmp(lang, lang2enc[i].lang) == 0) { + return lang2enc[i].num; + } + } + return LANG_xx; +} + +#ifndef 
OPENOFFICEORG +#ifndef MOZILLA_CLIENT +int initialize_utf_tbl() { + utf_tbl_count++; + if (utf_tbl) + return 0; + utf_tbl = (unicode_info2*)malloc(CONTSIZE * sizeof(unicode_info2)); + if (utf_tbl) { + size_t j; + for (j = 0; j < CONTSIZE; j++) { + utf_tbl[j].cletter = 0; + utf_tbl[j].clower = (unsigned short)j; + utf_tbl[j].cupper = (unsigned short)j; + } + for (j = 0; j < UTF_LST_LEN; j++) { + utf_tbl[utf_lst[j].c].cletter = 1; + utf_tbl[utf_lst[j].c].clower = utf_lst[j].clower; + utf_tbl[utf_lst[j].c].cupper = utf_lst[j].cupper; + } + } else + return 1; + return 0; +} +#endif +#endif + +void free_utf_tbl() { + if (utf_tbl_count > 0) + utf_tbl_count--; + if (utf_tbl && (utf_tbl_count == 0)) { + free(utf_tbl); + utf_tbl = NULL; + } +} + +unsigned short unicodetoupper(unsigned short c, int langnum) { + // In Azeri and Turkish, I and i dictinct letters: + // There are a dotless lower case i pair of upper `I', + // and an upper I with dot pair of lower `i'. + if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr))) + return 0x0130; +#ifdef OPENOFFICEORG + return static_cast<unsigned short>(u_toupper(c)); +#else +#ifdef MOZILLA_CLIENT + return ToUpperCase((char16_t)c); +#else + return (utf_tbl) ? utf_tbl[c].cupper : c; +#endif +#endif +} + +unsigned short unicodetolower(unsigned short c, int langnum) { + // In Azeri and Turkish, I and i dictinct letters: + // There are a dotless lower case i pair of upper `I', + // and an upper I with dot pair of lower `i'. + if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr))) + return 0x0131; +#ifdef OPENOFFICEORG + return static_cast<unsigned short>(u_tolower(c)); +#else +#ifdef MOZILLA_CLIENT + return ToLowerCase((char16_t)c); +#else + return (utf_tbl) ? utf_tbl[c].clower : c; +#endif +#endif +} + +int unicodeisalpha(unsigned short c) { +#ifdef OPENOFFICEORG + return u_isalpha(c); +#else + return (utf_tbl) ? 
utf_tbl[c].cletter : 0; +#endif +} + +/* get type of capitalization */ +int get_captype(const std::string& word, cs_info* csconv) { + // now determine the capitalization type of the first nl letters + size_t ncap = 0; + size_t nneutral = 0; + size_t firstcap = 0; + if (csconv == NULL) + return NOCAP; + for (std::string::const_iterator q = word.begin(); q != word.end(); ++q) { + unsigned char nIndex = static_cast<unsigned char>(*q); + if (ccase(csconv, nIndex)) + ncap++; + if (cupper(csconv, nIndex) == clower(csconv, nIndex)) + nneutral++; + } + if (ncap) { + unsigned char nIndex = static_cast<unsigned char>(word[0]); + firstcap = csconv[nIndex].ccase; + } + + // now finally set the captype + if (ncap == 0) { + return NOCAP; + } else if ((ncap == 1) && firstcap) { + return INITCAP; + } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) { + return ALLCAP; + } else if ((ncap > 1) && firstcap) { + return HUHINITCAP; + } + return HUHCAP; +} + +int get_captype_utf8(const std::vector<w_char>& word, int langnum) { + // now determine the capitalization type of the first nl letters + size_t ncap = 0; + size_t nneutral = 0; + size_t firstcap = 0; + for (size_t i = 0; i < word.size(); ++i) { + unsigned short idx = (word[i].h << 8) + word[i].l; + if (idx != unicodetolower(idx, langnum)) + ncap++; + if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) + nneutral++; + } + if (ncap) { + unsigned short idx = (word[0].h << 8) + word[0].l; + firstcap = (idx != unicodetolower(idx, langnum)); + } + + // now finally set the captype + if (ncap == 0) { + return NOCAP; + } else if ((ncap == 1) && firstcap) { + return INITCAP; + } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) { + return ALLCAP; + } else if ((ncap > 1) && firstcap) { + return HUHINITCAP; + } + return HUHCAP; +} + +// strip all ignored characters in the string +size_t remove_ignored_chars_utf(std::string& word, + const std::vector<w_char>& ignored_chars) { + 
std::vector<w_char> w; + std::vector<w_char> w2; + u8_u16(w, word); + + for (size_t i = 0; i < w.size(); ++i) { + if (!std::binary_search(ignored_chars.begin(), + ignored_chars.end(), + w[i])) { + w2.push_back(w[i]); + } + } + + u16_u8(word, w2); + return w2.size(); +} + +namespace { +class is_any_of { + public: + is_any_of(const std::string& in) : chars(in) {} + + bool operator()(char c) { return chars.find(c) != std::string::npos; } + + private: + std::string chars; +}; +} + +// strip all ignored characters in the string +size_t remove_ignored_chars(std::string& word, + const std::string& ignored_chars) { + word.erase( + std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars)), + word.end()); + return word.size(); +} + +int parse_string(char* line, char** out, int ln) { + char* tp = line; + char* piece; + int i = 0; + int np = 0; + if (*out) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions\n", ln); + return 1; + } + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + *out = mystrdup(piece); + if (!*out) + return 1; + np++; + break; + } + default: + break; + } + i++; + } + // free(piece); + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", ln); + return 1; + } + return 0; +} + +bool parse_array(char* line, + char** out, + std::vector<w_char>& out_utf16, + int utf8, + int ln) { + if (parse_string(line, out, ln)) + return false; + if (utf8) { + u8_u16(out_utf16, *out); + std::sort(out_utf16.begin(), out_utf16.end()); + } + return true; +} diff --git a/libs/hunspell/src/csutil.hxx b/libs/hunspell/src/csutil.hxx new file mode 100644 index 000000000..ce7091df5 --- /dev/null +++ b/libs/hunspell/src/csutil.hxx @@ -0,0 +1,325 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); 
you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. 
Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef __CSUTILHXX__ +#define __CSUTILHXX__ + +#include "hunvisapi.h" + +// First some base level utility routines + +#include <string> +#include <vector> +#include <string.h> +#include "w_char.hxx" +#include "htypes.hxx" + +#ifdef MOZILLA_CLIENT +#include "nscore.h" // for mozalloc headers +#endif + +// casing +#define NOCAP 0 +#define INITCAP 1 +#define ALLCAP 2 +#define HUHCAP 3 +#define HUHINITCAP 4 + +// default encoding and keystring +#define SPELL_ENCODING "ISO8859-1" +#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm" + +// default morphological fields +#define MORPH_STEM "st:" +#define MORPH_ALLOMORPH "al:" +#define MORPH_POS "po:" +#define MORPH_DERI_PFX "dp:" +#define MORPH_INFL_PFX "ip:" +#define MORPH_TERM_PFX "tp:" +#define MORPH_DERI_SFX "ds:" +#define MORPH_INFL_SFX "is:" +#define MORPH_TERM_SFX "ts:" +#define MORPH_SURF_PFX "sp:" +#define MORPH_FREQ "fr:" +#define MORPH_PHON "ph:" +#define MORPH_HYPH "hy:" +#define MORPH_PART "pa:" +#define MORPH_FLAG "fl:" +#define MORPH_HENTRY "_H:" +#define MORPH_TAG_LEN strlen(MORPH_STEM) + +#define MSEP_FLD ' ' +#define MSEP_REC '\n' +#define MSEP_ALT '\v' + +// default flags +#define DEFAULTFLAGS 65510 +#define FORBIDDENWORD 65510 +#define ONLYUPCASEFLAG 65511 + +// fopen or optional _wfopen to fix long pathname problem of WIN32 +LIBHUNSPELL_DLL_EXPORTED FILE* myfopen(const char* path, const char* mode); + +// convert UTF-16 characters to UTF-8 +LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest, + const std::vector<w_char>& src); + +// convert UTF-8 characters to UTF-16 +LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest, + const std::string& src); + +// remove end of line char(s) +LIBHUNSPELL_DLL_EXPORTED void mychomp(char* s); + +// duplicate string +LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s); + +// strcat for limited length destination string +LIBHUNSPELL_DLL_EXPORTED char* mystrcat(char* dest, const char* st, int max); + +// parse into tokens with char 
delimiter +LIBHUNSPELL_DLL_EXPORTED char* mystrsep(char** sptr, const char delim); + +// replace pat by rep in word and return word +LIBHUNSPELL_DLL_EXPORTED char* mystrrep(char* word, + const char* pat, + const char* rep); +LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str, + const std::string& search, + const std::string& replace); + +// append s to ends of every lines in text +LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str, + const std::string& apd); + +// tokenize into lines with new line +LIBHUNSPELL_DLL_EXPORTED int line_tok(const char* text, + char*** lines, + char breakchar); + +// tokenize into lines with new line and uniq in place +LIBHUNSPELL_DLL_EXPORTED char* line_uniq(char* text, char breakchar); +LIBHUNSPELL_DLL_EXPORTED char* line_uniq_app(char** text, char breakchar); + +// reverse word +LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word); + +// reverse word +LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&); + +// remove duplicates +LIBHUNSPELL_DLL_EXPORTED int uniqlist(char** list, int n); + +// free character array list +LIBHUNSPELL_DLL_EXPORTED void freelist(char*** list, int n); + +// character encoding information +struct cs_info { + unsigned char ccase; + unsigned char clower; + unsigned char cupper; +}; + +LIBHUNSPELL_DLL_EXPORTED int initialize_utf_tbl(); +LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl(); +LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c, + int langnum); +LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum); +LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum); +LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c, + int langnum); +LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c); + +LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const char* es); + +// get language identifiers of language codes +LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const char* lang); + +// get characters of the 
given 8bit encoding with lower- and uppercase forms +LIBHUNSPELL_DLL_EXPORTED char* get_casechars(const char* enc); + +// convert std::string to all caps +LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s, + const struct cs_info* csconv); + +// convert null terminated string to all little +LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s, + const struct cs_info* csconv); + +// convert first letter of string to little +LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s, + const struct cs_info* csconv); + +// convert first letter of string to capital +LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s, + const struct cs_info* csconv); + +// convert first letter of UTF-8 string to capital +LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& +mkinitcap_utf(std::vector<w_char>& u, int langnum); + +// convert UTF-8 string to little +LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& +mkallsmall_utf(std::vector<w_char>& u, int langnum); + +// convert first letter of UTF-8 string to little +LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& +mkinitsmall_utf(std::vector<w_char>& u, int langnum); + +// convert UTF-8 string to capital +LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& +mkallcap_utf(std::vector<w_char>& u, int langnum); + +// get type of capitalization +LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*); + +// get type of capitalization (UTF-8) +LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum); + +// strip all ignored characters in the string +LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf( + std::string& word, + const std::vector<w_char>& ignored_chars); + +// strip all ignored characters in the string +LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars( + std::string& word, + const std::string& ignored_chars); + +LIBHUNSPELL_DLL_EXPORTED int parse_string(char* line, char** out, int ln); + +LIBHUNSPELL_DLL_EXPORTED bool parse_array(char* line, + char** 
out, + std::vector<w_char>& out_utf16, + int utf8, + int ln); + +LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r); + +LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest, + const std::string& morph, + const std::string& var); + +LIBHUNSPELL_DLL_EXPORTED int morphcmp(const char* s, const char* t); + +LIBHUNSPELL_DLL_EXPORTED int get_sfxcount(const char* morph); + +// conversion function for protected memory +LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source); + +// conversion function for protected memory +LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s); + +// hash entry macros +LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) { + char* ret; + if (!h->var) + ret = NULL; + else if (h->var & H_OPT_ALIASM) + ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); + else + ret = HENTRY_WORD(h) + h->blen + 1; + return ret; +} + +LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA( + const struct hentry* h) { + const char* ret; + if (!h->var) + ret = NULL; + else if (h->var & H_OPT_ALIASM) + ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); + else + ret = HENTRY_WORD(h) + h->blen + 1; + return ret; +} + +// NULL-free version for warning-free OOo build +LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2( + const struct hentry* h) { + const char* ret; + if (!h->var) + ret = ""; + else if (h->var & H_OPT_ALIASM) + ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); + else + ret = HENTRY_WORD(h) + h->blen + 1; + return ret; +} + +LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h, + const char* p) { + return (HENTRY_DATA(h) ? 
strstr(HENTRY_DATA(h), p) : NULL); +} + +#endif diff --git a/libs/hunspell/src/filemgr.cxx b/libs/hunspell/src/filemgr.cxx new file mode 100644 index 000000000..2218bc79e --- /dev/null +++ b/libs/hunspell/src/filemgr.cxx @@ -0,0 +1,120 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. 
HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "filemgr.hxx" +#include "csutil.hxx" + +int FileMgr::fail(const char* err, const char* par) { + fprintf(stderr, err, par); + return -1; +} + +FileMgr::FileMgr(const char* file, const char* key) : hin(NULL), linenum(0) { + in[0] = '\0'; + + fin = myfopen(file, "r"); + if (!fin) { + // check hzipped file + std::string st(file); + st.append(HZIP_EXTENSION); + hin = new Hunzip(st.c_str(), key); + } + if (!fin && !hin) + fail(MSG_OPEN, file); +} + +FileMgr::~FileMgr() { + if (fin) + fclose(fin); + if (hin) + delete hin; +} + +char* FileMgr::getline() { + const char* l; + linenum++; + if (fin) + return fgets(in, BUFSIZE - 1, fin); + if (hin && ((l = hin->getline()) != NULL)) + return strcpy(in, l); + linenum--; + return NULL; +} + +int FileMgr::getlinenum() { + return linenum; +} diff --git a/libs/hunspell/src/filemgr.hxx b/libs/hunspell/src/filemgr.hxx new file mode 100644 index 000000000..8b69931dd --- /dev/null +++ b/libs/hunspell/src/filemgr.hxx @@ -0,0 +1,101 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* file manager class - read lines of files [filename] OR [filename.hz] */ +#ifndef _FILEMGR_HXX_ +#define _FILEMGR_HXX_ + +#include "hunvisapi.h" + +#include "hunzip.hxx" +#include <stdio.h> + +class LIBHUNSPELL_DLL_EXPORTED FileMgr { + private: + FileMgr(const FileMgr&); + FileMgr& operator=(const FileMgr&); + + protected: + FILE* fin; + Hunzip* hin; + char in[BUFSIZE + 50]; // input buffer + int fail(const char* err, const char* par); + int linenum; + + public: + FileMgr(const char* filename, const char* key = NULL); + ~FileMgr(); + char* getline(); + int getlinenum(); +}; +#endif diff --git a/libs/hunspell/src/hashmgr.cxx b/libs/hunspell/src/hashmgr.cxx new file mode 100644 index 000000000..c3cd95420 --- /dev/null +++ b/libs/hunspell/src/hashmgr.cxx @@ -0,0 +1,1147 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> +#include <limits> +#include <sstream> + +#include "hashmgr.hxx" +#include "csutil.hxx" +#include "atypes.hxx" + +// build a hash table from a munched word list + +HashMgr::HashMgr(const char* tpath, const char* apath, const char* key) + : tablesize(0), + tableptr(NULL), + flag_mode(FLAG_CHAR), + complexprefixes(0), + utf8(0), + forbiddenword(FORBIDDENWORD) // forbidden word signing flag + , + numaliasf(0), + aliasf(NULL), + aliasflen(0), + numaliasm(0), + aliasm(NULL) { + langnum = 0; + lang = NULL; + enc = NULL; + csconv = 0; + ignorechars = NULL; + load_config(apath, key); + int ec = load_tables(tpath, key); + if (ec) { + /* error condition - what should we do here */ + HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec); + if (tableptr) { + free(tableptr); + tableptr = NULL; + } + tablesize = 0; + } +} + +HashMgr::~HashMgr() { + if (tableptr) { + // now pass through hash table freeing up everything + 
// go through column by column of the table + for (int i = 0; i < tablesize; i++) { + struct hentry* pt = tableptr[i]; + struct hentry* nt = NULL; + while (pt) { + nt = pt->next; + if (pt->astr && + (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) + free(pt->astr); + free(pt); + pt = nt; + } + } + free(tableptr); + } + tablesize = 0; + + if (aliasf) { + for (int j = 0; j < (numaliasf); j++) + free(aliasf[j]); + free(aliasf); + aliasf = NULL; + if (aliasflen) { + free(aliasflen); + aliasflen = NULL; + } + } + if (aliasm) { + for (int j = 0; j < (numaliasm); j++) + free(aliasm[j]); + free(aliasm); + aliasm = NULL; + } + +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + if (utf8) + free_utf_tbl(); +#endif +#endif + + if (enc) + free(enc); + if (lang) + free(lang); + + if (ignorechars) + free(ignorechars); + +#ifdef MOZILLA_CLIENT + delete[] csconv; +#endif +} + +// lookup a root word in the hashtable + +struct hentry* HashMgr::lookup(const char* word) const { + struct hentry* dp; + if (tableptr) { + dp = tableptr[hash(word)]; + if (!dp) + return NULL; + for (; dp != NULL; dp = dp->next) { + if (strcmp(word, dp->word) == 0) + return dp; + } + } + return NULL; +} + +// add a word to the hash table (private) +int HashMgr::add_word(const char* word, + int wbl, + int wcl, + unsigned short* aff, + int al, + const char* desc, + bool onlyupcase) { + + std::string *word_copy = NULL; + std::string *desc_copy = NULL; + if (ignorechars || complexprefixes) { + word_copy = new std::string(word, wbl); + + if (ignorechars != NULL) { + if (utf8) { + wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16); + } else { + remove_ignored_chars(*word_copy, ignorechars); + } + } + + if (complexprefixes) { + if (utf8) + wcl = reverseword_utf(*word_copy); + else + reverseword(*word_copy); + + if (desc && !aliasm) { + desc_copy = new std::string(desc); + + if (complexprefixes) { + if (utf8) + reverseword_utf(*desc_copy); + else + reverseword(*desc_copy); + } + desc = 
desc_copy->c_str(); + } + } + + wbl = word_copy->size(); + word = word_copy->c_str(); + } + + bool upcasehomonym = false; + int descl = desc ? (aliasm ? sizeof(char*) : strlen(desc) + 1) : 0; + // variable-length hash record with word and optional fields + struct hentry* hp = + (struct hentry*)malloc(sizeof(struct hentry) + wbl + descl); + if (!hp) { + delete desc_copy; + delete word_copy; + return 1; + } + + char* hpw = hp->word; + strcpy(hpw, word); + + int i = hash(hpw); + + hp->blen = (unsigned char)wbl; + hp->clen = (unsigned char)wcl; + hp->alen = (short)al; + hp->astr = aff; + hp->next = NULL; + hp->next_homonym = NULL; + + // store the description string or its pointer + if (desc) { + hp->var = H_OPT; + if (aliasm) { + hp->var += H_OPT_ALIASM; + store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); + } else { + strcpy(hpw + wbl + 1, desc); + } + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) + hp->var += H_OPT_PHON; + } else + hp->var = 0; + + struct hentry* dp = tableptr[i]; + if (!dp) { + tableptr[i] = hp; + delete desc_copy; + delete word_copy; + return 0; + } + while (dp->next != NULL) { + if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + free(hp); + delete desc_copy; + delete word_copy; + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } + dp = dp->next; + } + if (strcmp(hp->word, dp->word) == 0) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + free(hp); + delete desc_copy; + delete word_copy; + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } + if (!upcasehomonym) { + dp->next = hp; + } else { + // remove hidden onlyupcase 
homonym + if (hp->astr) + free(hp->astr); + free(hp); + } + + delete desc_copy; + delete word_copy; + return 0; +} + +int HashMgr::add_hidden_capitalized_word(const std::string& word, + int wcl, + unsigned short* flags, + int flagslen, + char* dp, + int captype) { + if (flags == NULL) + flagslen = 0; + + // add inner capitalized forms to handle the following allcap forms: + // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG + // Allcaps with suffixes: CIA's -> CIA'S + if (((captype == HUHCAP) || (captype == HUHINITCAP) || + ((captype == ALLCAP) && (flagslen != 0))) && + !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) { + unsigned short* flags2 = + (unsigned short*)malloc(sizeof(unsigned short) * (flagslen + 1)); + if (!flags2) + return 1; + if (flagslen) + memcpy(flags2, flags, flagslen * sizeof(unsigned short)); + flags2[flagslen] = ONLYUPCASEFLAG; + if (utf8) { + std::string st; + std::vector<w_char> w; + u8_u16(w, word); + mkallsmall_utf(w, langnum); + mkinitcap_utf(w, langnum); + u16_u8(st, w); + return add_word(st.c_str(), st.size(), wcl, flags2, flagslen + 1, dp, true); + } else { + std::string new_word(word); + mkallsmall(new_word, csconv); + mkinitcap(new_word, csconv); + int ret = add_word(new_word.c_str(), new_word.size(), wcl, flags2, flagslen + 1, dp, true); + return ret; + } + } + return 0; +} + +// detect captype and modify word length for UTF-8 encoding +int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { + int len; + if (utf8) { + std::vector<w_char> dest_utf; + len = u8_u16(dest_utf, word); + *captype = get_captype_utf8(dest_utf, langnum); + } else { + len = word.size(); + *captype = get_captype(word, csconv); + } + return len; +} + +// remove word (personal dictionary function for standalone applications) +int HashMgr::remove(const char* word) { + struct hentry* dp = lookup(word); + while (dp) { + if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { + unsigned short* flags = + (unsigned 
short*)malloc(sizeof(unsigned short) * (dp->alen + 1)); + if (!flags) + return 1; + for (int i = 0; i < dp->alen; i++) + flags[i] = dp->astr[i]; + flags[dp->alen] = forbiddenword; + dp->astr = flags; + dp->alen++; + std::sort(flags, flags + dp->alen); + } + dp = dp->next_homonym; + } + return 0; +} + +/* remove forbidden flag to add a personal word to the hash */ +int HashMgr::remove_forbidden_flag(const std::string& word) { + struct hentry* dp = lookup(word.c_str()); + if (!dp) + return 1; + while (dp) { + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { + if (dp->alen == 1) + dp->alen = 0; // XXX forbidden words of personal dic. + else { + unsigned short* flags2 = + (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1)); + if (!flags2) + return 1; + int i, j = 0; + for (i = 0; i < dp->alen; i++) { + if (dp->astr[i] != forbiddenword) + flags2[j++] = dp->astr[i]; + } + dp->alen--; + dp->astr = flags2; // XXX allowed forbidden words + } + } + dp = dp->next_homonym; + } + return 0; +} + +// add a custom dic. 
word to the hash table (public) +int HashMgr::add(const std::string& word) { + unsigned short* flags = NULL; + int al = 0; + if (remove_forbidden_flag(word)) { + int captype; + int wbl = word.size(); + int wcl = get_clen_and_captype(word, &captype); + add_word(word.c_str(), wbl, wcl, flags, al, NULL, false); + return add_hidden_capitalized_word(word, wcl, flags, al, NULL, + captype); + } + return 0; +} + +int HashMgr::add_with_affix(const char* word, const char* example) { + // detect captype and modify word length for UTF-8 encoding + struct hentry* dp = lookup(example); + remove_forbidden_flag(word); + if (dp && dp->astr) { + int captype; + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, &captype); + if (aliasf) { + add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); + } else { + unsigned short* flags = + (unsigned short*)malloc(dp->alen * sizeof(unsigned short)); + if (flags) { + memcpy((void*)flags, (void*)dp->astr, + dp->alen * sizeof(unsigned short)); + add_word(word, wbl, wcl, flags, dp->alen, NULL, false); + } else + return 1; + } + return add_hidden_capitalized_word(word, wcl, dp->astr, + dp->alen, NULL, captype); + } + return 1; +} + +// walk the hash table entry by entry - null at end +// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); +struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const { + if (hp && hp->next != NULL) + return hp->next; + for (col++; col < tablesize; col++) { + if (tableptr[col]) + return tableptr[col]; + } + // null at end and reset to start + col = -1; + return NULL; +} + +// load a munched word list and build a hash table on the fly +int HashMgr::load_tables(const char* tpath, const char* key) { + int al; + char* ap; + char* dp; + char* dp2; + unsigned short* flags; + char* ts; + + // open dictionary file + FileMgr* dict = new FileMgr(tpath, key); + if (dict == NULL) + return 1; + + // first read the first line of file to get hash table size */ + if ((ts = dict->getline()) == 
NULL) { + HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath); + delete dict; + return 2; + } + mychomp(ts); + + /* remove byte order mark */ + if (strncmp(ts, "\xEF\xBB\xBF", 3) == 0) { + memmove(ts, ts + 3, strlen(ts + 3) + 1); + // warning: dic file begins with byte order mark: possible incompatibility + // with old Hunspell versions + } + + tablesize = atoi(ts); + + int nExtra = 5 + USERWORD; + + if (tablesize <= 0 || + (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) / + int(sizeof(struct hentry*)))) { + HUNSPELL_WARNING( + stderr, "error: line 1: missing or bad word count in the dic file\n"); + delete dict; + return 4; + } + tablesize += nExtra; + if ((tablesize % 2) == 0) + tablesize++; + + // allocate the hash table + tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*)); + if (!tableptr) { + delete dict; + return 3; + } + + // loop through all words on much list and add to hash + // table and create word and affix strings + + while ((ts = dict->getline()) != NULL) { + mychomp(ts); + // split each line into word and morphological description + dp = ts; + while ((dp = strchr(dp, ':')) != NULL) { + if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { + for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--) + ; + if (dp < ts) { // missing word + dp = NULL; + } else { + *(dp + 1) = '\0'; + dp = dp + 2; + } + break; + } + dp++; + } + + // tabulator is the old morphological field separator + dp2 = strchr(ts, '\t'); + if (dp2 && (!dp || dp2 < dp)) { + *dp2 = '\0'; + dp = dp2 + 1; + } + + // split each line into word and affix char strings + // "\/" signs slash in words (not affix separator) + // "/" at beginning of the line is word character (not affix separator) + ap = strchr(ts, '/'); + while (ap) { + if (ap == ts) { + ap++; + continue; + } else if (*(ap - 1) != '\\') + break; + // replace "\/" with "/" + for (char *sp = ap - 1; *sp; *sp = *(sp + 1), sp++) + ; + ap = strchr(ap, '/'); + } + + if (ap) { + *ap 
= '\0'; + if (aliasf) { + int index = atoi(ap + 1); + al = get_aliasf(index, &flags, dict); + if (!al) { + HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", + dict->getlinenum()); + *ap = '\0'; + } + } else { + al = decode_flags(&flags, ap + 1, dict); + if (al == -1) { + HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); + delete dict; + return 6; + } + std::sort(flags, flags + al); + } + } else { + al = 0; + ap = NULL; + flags = NULL; + } + + int captype; + int wbl = strlen(ts); + int wcl = get_clen_and_captype(ts, &captype); + // add the word and its index plus its capitalized form optionally + if (add_word(ts, wbl, wcl, flags, al, dp, false) || + add_hidden_capitalized_word(ts, wcl, flags, al, dp, captype)) { + delete dict; + return 5; + } + } + + delete dict; + return 0; +} + +// the hash function is a simple load and rotate +// algorithm borrowed +int HashMgr::hash(const char* word) const { + unsigned long hv = 0; + for (int i = 0; i < 4 && *word != 0; i++) + hv = (hv << 8) | (*word++); + while (*word != 0) { + ROTATE(hv, ROTATE_LEN); + hv ^= (*word++); + } + return (unsigned long)hv % tablesize; +} + +int HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) { + int len; + if (*flags == '\0') { + *result = NULL; + return 0; + } + switch (flag_mode) { + case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) + len = strlen(flags); + if (len % 2 == 1) + HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", + af->getlinenum()); + len /= 2; + *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + for (int i = 0; i < len; i++) { + (*result)[i] = (((unsigned short)flags[i * 2]) << 8) + + (unsigned short)flags[i * 2 + 1]; + } + break; + } + case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 + // 23 233) + int i; + len = 1; + char* src = flags; + unsigned short* dest; + char* p; + for (p = flags; *p; p++) { + if (*p == ',') + len++; + } + *result = 
(unsigned short*)malloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + dest = *result; + for (p = flags; *p; p++) { + if (*p == ',') { + i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING( + stderr, "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + *dest = (unsigned short)i; + if (*dest == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + src = p + 1; + dest++; + } + } + i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING(stderr, + "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + *dest = (unsigned short)i; + if (*dest == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + break; + } + case FLAG_UNI: { // UTF-8 characters + std::vector<w_char> w; + u8_u16(w, flags); + len = w.size(); + *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + memcpy(*result, &w[0], len * sizeof(short)); + break; + } + default: { // Ispell's one-character flags (erfg -> e r f g) + unsigned short* dest; + len = strlen(flags); + *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + dest = *result; + for (unsigned char* p = (unsigned char*)flags; *p; p++) { + *dest = (unsigned short)*p; + dest++; + } + } + } + return len; +} + +unsigned short HashMgr::decode_flag(const char* f) { + unsigned short s = 0; + int i; + switch (flag_mode) { + case FLAG_LONG: + s = ((unsigned short)f[0] << 8) + (unsigned short)f[1]; + break; + case FLAG_NUM: + i = atoi(f); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", + i, DEFAULTFLAGS - 1); + s = (unsigned short)i; + break; + case FLAG_UNI: { + std::vector<w_char> w; + u8_u16(w, f); + if (!w.empty()) + memcpy(&s, &w[0], 1 * sizeof(short)); + break; + } + default: + s = (unsigned short)*((unsigned char*)f); + } + if (s == 
0) + HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); + return s; +} + +char* HashMgr::encode_flag(unsigned short f) { + if (f == 0) + return mystrdup("(NULL)"); + std::string ch; + if (flag_mode == FLAG_LONG) { + ch.push_back((unsigned char)(f >> 8)); + ch.push_back((unsigned char)(f - ((f >> 8) << 8))); + } else if (flag_mode == FLAG_NUM) { + std::ostringstream stream; + stream << f; + ch = stream.str(); + } else if (flag_mode == FLAG_UNI) { + const w_char* w_c = (const w_char*)&f; + std::vector<w_char> w(w_c, w_c + 1); + u16_u8(ch, w); + } else { + ch.push_back((unsigned char)(f)); + } + return mystrdup(ch.c_str()); +} + +// read in aff file and set flag mode +int HashMgr::load_config(const char* affpath, const char* key) { + char* line; // io buffers + int firstline = 1; + + // open the affix file + FileMgr* afflst = new FileMgr(affpath, key); + if (!afflst) { + HUNSPELL_WARNING( + stderr, "Error - could not open affix description file %s\n", affpath); + return 1; + } + + // read in each line ignoring any that do not + // start with a known line type indicator + + while ((line = afflst->getline()) != NULL) { + mychomp(line); + + /* remove byte order mark */ + if (firstline) { + firstline = 0; + if (strncmp(line, "\xEF\xBB\xBF", 3) == 0) + memmove(line, line + 3, strlen(line + 3) + 1); + } + + /* parse in the try string */ + if ((strncmp(line, "FLAG", 4) == 0) && isspace(line[4])) { + if (flag_mode != FLAG_CHAR) { + HUNSPELL_WARNING(stderr, + "error: line %d: multiple definitions of the FLAG " + "affix file parameter\n", + afflst->getlinenum()); + } + if (strstr(line, "long")) + flag_mode = FLAG_LONG; + if (strstr(line, "num")) + flag_mode = FLAG_NUM; + if (strstr(line, "UTF-8")) + flag_mode = FLAG_UNI; + if (flag_mode == FLAG_CHAR) { + HUNSPELL_WARNING( + stderr, + "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", + afflst->getlinenum()); + } + } + if (strncmp(line, "FORBIDDENWORD", 13) == 0) { + char* st = NULL; + if 
(parse_string(line, &st, afflst->getlinenum())) { + delete afflst; + return 1; + } + forbiddenword = decode_flag(st); + free(st); + } + if (strncmp(line, "SET", 3) == 0) { + if (parse_string(line, &enc, afflst->getlinenum())) { + delete afflst; + return 1; + } + if (strcmp(enc, "UTF-8") == 0) { + utf8 = 1; +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + initialize_utf_tbl(); +#endif +#endif + } else + csconv = get_current_cs(enc); + } + if (strncmp(line, "LANG", 4) == 0) { + if (parse_string(line, &lang, afflst->getlinenum())) { + delete afflst; + return 1; + } + langnum = get_lang_num(lang); + } + + /* parse in the ignored characters (for example, Arabic optional diacritics + * characters */ + if (strncmp(line, "IGNORE", 6) == 0) { + if (!parse_array(line, &ignorechars, ignorechars_utf16, + utf8, afflst->getlinenum())) { + delete afflst; + return 1; + } + } + + if ((strncmp(line, "AF", 2) == 0) && isspace(line[2])) { + if (parse_aliasf(line, afflst)) { + delete afflst; + return 1; + } + } + + if ((strncmp(line, "AM", 2) == 0) && isspace(line[2])) { + if (parse_aliasm(line, afflst)) { + delete afflst; + return 1; + } + } + + if (strncmp(line, "COMPLEXPREFIXES", 15) == 0) + complexprefixes = 1; + if (((strncmp(line, "SFX", 3) == 0) || (strncmp(line, "PFX", 3) == 0)) && + isspace(line[3])) + break; + } + if (csconv == NULL) + csconv = get_current_cs(SPELL_ENCODING); + delete afflst; + return 0; +} + +/* parse in the ALIAS table */ +int HashMgr::parse_aliasf(char* line, FileMgr* af) { + if (numaliasf != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numaliasf = atoi(piece); + if (numaliasf < 1) { + numaliasf = 0; + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: bad entry 
number\n", + af->getlinenum()); + return 1; + } + aliasf = + (unsigned short**)malloc(numaliasf * sizeof(unsigned short*)); + aliasflen = + (unsigned short*)malloc(numaliasf * sizeof(unsigned short)); + if (!aliasf || !aliasflen) { + numaliasf = 0; + if (aliasf) + free(aliasf); + if (aliasflen) + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + return 1; + } + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + numaliasf = 0; + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the numaliasf lines to read in the remainder of the table */ + char* nl; + for (int j = 0; j < numaliasf; j++) { + if ((nl = af->getline()) == NULL) + return 1; + mychomp(nl); + tp = nl; + i = 0; + aliasf[j] = NULL; + aliasflen[j] = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "AF", 2) != 0) { + numaliasf = 0; + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return 1; + } + break; + } + case 1: { + aliasflen[j] = + (unsigned short)decode_flags(&(aliasf[j]), piece, af); + std::sort(aliasf[j], aliasf[j] + aliasflen[j]); + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!aliasf[j]) { + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + numaliasf = 0; + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return 1; + } + } + return 0; +} + +int HashMgr::is_aliasf() { + return (aliasf != NULL); +} + +int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) { + if ((index > 0) && (index <= numaliasf)) { + *fvec = aliasf[index - 1]; + return aliasflen[index - 1]; + } + HUNSPELL_WARNING(stderr, "error: line %d: bad flag 
alias index: %d\n", + af->getlinenum(), index); + *fvec = NULL; + return 0; +} + +/* parse morph alias definitions */ +int HashMgr::parse_aliasm(char* line, FileMgr* af) { + if (numaliasm != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return 1; + } + char* tp = line; + char* piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numaliasm = atoi(piece); + if (numaliasm < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return 1; + } + aliasm = (char**)malloc(numaliasm * sizeof(char*)); + if (!aliasm) { + numaliasm = 0; + return 1; + } + np++; + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + numaliasm = 0; + free(aliasm); + aliasm = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return 1; + } + + /* now parse the numaliasm lines to read in the remainder of the table */ + char* nl = line; + for (int j = 0; j < numaliasm; j++) { + if ((nl = af->getline()) == NULL) + return 1; + mychomp(nl); + tp = nl; + i = 0; + aliasm[j] = NULL; + piece = mystrsep(&tp, ' '); + while (piece) { + if (*piece != '\0') { + switch (i) { + case 0: { + if (strncmp(piece, "AM", 2) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numaliasm = 0; + free(aliasm); + aliasm = NULL; + return 1; + } + break; + } + case 1: { + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } + std::string chunk(piece); + if (complexprefixes) { + if (utf8) + reverseword_utf(chunk); + else + reverseword(chunk); + } + aliasm[j] = mystrdup(chunk.c_str()); + break; + } + default: + break; + } + i++; + } + piece = mystrsep(&tp, ' '); + } + if (!aliasm[j]) { + numaliasm = 0; + free(aliasm); + aliasm = NULL; + HUNSPELL_WARNING(stderr, 
"error: line %d: table is corrupt\n", + af->getlinenum()); + return 1; + } + } + return 0; +} + +int HashMgr::is_aliasm() { + return (aliasm != NULL); +} + +char* HashMgr::get_aliasm(int index) { + if ((index > 0) && (index <= numaliasm)) + return aliasm[index - 1]; + HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); + return NULL; +} diff --git a/libs/hunspell/src/hashmgr.hxx b/libs/hunspell/src/hashmgr.hxx new file mode 100644 index 000000000..95b06b13f --- /dev/null +++ b/libs/hunspell/src/hashmgr.hxx @@ -0,0 +1,149 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _HASHMGR_HXX_ +#define _HASHMGR_HXX_ + +#include "hunvisapi.h" + +#include <stdio.h> +#include <string> +#include <vector> + +#include "htypes.hxx" +#include "filemgr.hxx" +#include "w_char.hxx" + +enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; + +class LIBHUNSPELL_DLL_EXPORTED HashMgr { + int tablesize; + struct hentry** tableptr; + flag flag_mode; + int complexprefixes; + int utf8; + unsigned short forbiddenword; + int langnum; + char* enc; + char* lang; + struct cs_info* csconv; + char* ignorechars; + std::vector<w_char> ignorechars_utf16; + int numaliasf; // flag vector `compression' with aliases + unsigned short** aliasf; + unsigned short* aliasflen; + int numaliasm; // morphological desciption `compression' with aliases + char** aliasm; + + public: + HashMgr(const char* tpath, const char* apath, const char* key = NULL); + ~HashMgr(); + + struct hentry* lookup(const char*) const; + int hash(const char*) const; + struct hentry* walk_hashtable(int& col, struct hentry* 
hp) const; + + int add(const std::string& word); + int add_with_affix(const char* word, const char* pattern); + int remove(const char* word); + int decode_flags(unsigned short** result, char* flags, FileMgr* af); + unsigned short decode_flag(const char* flag); + char* encode_flag(unsigned short flag); + int is_aliasf(); + int get_aliasf(int index, unsigned short** fvec, FileMgr* af); + int is_aliasm(); + char* get_aliasm(int index); + + private: + int get_clen_and_captype(const std::string& word, int* captype); + int load_tables(const char* tpath, const char* key); + int add_word(const char* word, + int wbl, + int wcl, + unsigned short* ap, + int al, + const char* desc, + bool onlyupcase); + int load_config(const char* affpath, const char* key); + int parse_aliasf(char* line, FileMgr* af); + int add_hidden_capitalized_word(const std::string& word, + int wcl, + unsigned short* flags, + int al, + char* dp, + int captype); + int parse_aliasm(char* line, FileMgr* af); + int remove_forbidden_flag(const std::string& word); +}; + +#endif diff --git a/libs/hunspell/src/htypes.hxx b/libs/hunspell/src/htypes.hxx new file mode 100644 index 000000000..d24439441 --- /dev/null +++ b/libs/hunspell/src/htypes.hxx @@ -0,0 +1,71 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). 
+ * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#ifndef _HTYPES_HXX_ +#define _HTYPES_HXX_ + +#define ROTATE_LEN 5 + +#define ROTATE(v, q) \ + (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q)) - 1)); + +// hentry options +#define H_OPT (1 << 0) +#define H_OPT_ALIASM (1 << 1) +#define H_OPT_PHON (1 << 2) + +// see also csutil.hxx +#define HENTRY_WORD(h) &(h->word[0]) + +// approx. number of user defined words +#define USERWORD 1000 + +struct hentry { + unsigned char blen; // word length in bytes + unsigned char clen; // word length in characters (different for UTF-8 enc.) 
+ short alen; // length of affix flag vector + unsigned short* astr; // affix flag vector + struct hentry* next; // next word with same hash code + struct hentry* next_homonym; // next homonym word (with same hash code) + char var; // variable fields (only for special pronounciation yet) + char word[1]; // variable-length word (8-bit or UTF-8 encoding) +}; + +#endif diff --git a/libs/hunspell/src/hunspell.cxx b/libs/hunspell/src/hunspell.cxx new file mode 100644 index 000000000..7ff1e2bcf --- /dev/null +++ b/libs/hunspell/src/hunspell.cxx @@ -0,0 +1,1895 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "hunspell.hxx" +#include "hunspell.h" +#ifndef MOZILLA_CLIENT +#include "config.h" +#endif +#include "csutil.hxx" + +#include <limits> +#include <string> + +#define MAXWORDLEN 100 +#define MAXWORDUTF8LEN (MAXWORDLEN * 3) + +Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) { + encoding = NULL; + csconv = NULL; + utf8 = 0; + complexprefixes = 0; + affixpath = mystrdup(affpath); + maxdic = 0; + + /* first set up the hash manager */ + pHMgr[0] = new HashMgr(dpath, affpath, key); + if (pHMgr[0]) + maxdic = 1; + + /* next set up the affix manager */ + /* it needs access to the hash manager lookup methods */ + pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); + + /* get the preferred try string and the dictionary */ + /* encoding from the Affix Manager for that dictionary */ + char* try_string = pAMgr->get_try_string(); + encoding = pAMgr->get_encoding(); + langnum = pAMgr->get_langnum(); + utf8 = 
pAMgr->get_utf8(); + if (!utf8) + csconv = get_current_cs(encoding); + complexprefixes = pAMgr->get_complexprefixes(); + wordbreak = pAMgr->get_breaktable(); + + /* and finally set up the suggestion manager */ + pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); + if (try_string) + free(try_string); +} + +Hunspell::~Hunspell() { + delete pSMgr; + delete pAMgr; + for (int i = 0; i < maxdic; i++) + delete pHMgr[i]; + maxdic = 0; + pSMgr = NULL; + pAMgr = NULL; +#ifdef MOZILLA_CLIENT + delete[] csconv; +#endif + csconv = NULL; + if (encoding) + free(encoding); + encoding = NULL; + if (affixpath) + free(affixpath); + affixpath = NULL; +} + +// load extra dictionaries +int Hunspell::add_dic(const char* dpath, const char* key) { + if (maxdic == MAXDIC || !affixpath) + return 1; + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); + if (pHMgr[maxdic]) + maxdic++; + else + return 1; + return 0; +} + +// make a copy of src at destination while removing all leading +// blanks and removing any trailing periods after recording +// their presence with the abbreviation flag +// also since already going through character by character, +// set the capitalization type +// return the length of the "cleaned" (and UTF-8 encoded) word + +size_t Hunspell::cleanword2(std::string& dest, + std::vector<w_char>& dest_utf, + const char* src, + int* nc, + int* pcaptype, + size_t* pabbrev) { + dest.clear(); + dest_utf.clear(); + + const char* q = src; + + // first skip over any leading blanks + while ((*q != '\0') && (*q == ' ')) + q++; + + // now strip off any trailing periods (recording their presence) + *pabbrev = 0; + int nl = strlen(q); + while ((nl > 0) && (*(q + nl - 1) == '.')) { + nl--; + (*pabbrev)++; + } + + // if no characters are left it can't be capitalized + if (nl <= 0) { + *pcaptype = NOCAP; + return 0; + } + + dest.append(q, nl); + nl = dest.size(); + if (utf8) { + *nc = u8_u16(dest_utf, dest); + *pcaptype = get_captype_utf8(dest_utf, langnum); + } else { + *pcaptype = 
get_captype(dest, csconv); + *nc = nl; + } + return nl; +} + +void Hunspell::cleanword(std::string& dest, + const char* src, + int* pcaptype, + int* pabbrev) { + dest.clear(); + const unsigned char* q = (const unsigned char*)src; + int firstcap = 0; + + // first skip over any leading blanks + while ((*q != '\0') && (*q == ' ')) + q++; + + // now strip off any trailing periods (recording their presence) + *pabbrev = 0; + int nl = strlen((const char*)q); + while ((nl > 0) && (*(q + nl - 1) == '.')) { + nl--; + (*pabbrev)++; + } + + // if no characters are left it can't be capitalized + if (nl <= 0) { + *pcaptype = NOCAP; + return; + } + + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int nc = 0; + + if (!utf8) { + while (nl > 0) { + nc++; + if (csconv[(*q)].ccase) + ncap++; + if (csconv[(*q)].cupper == csconv[(*q)].clower) + nneutral++; + dest.push_back(*q++); + nl--; + } + // remember to terminate the destination string + firstcap = csconv[static_cast<unsigned char>(dest[0])].ccase; + } else { + std::vector<w_char> t; + u8_u16(t, src); + for (size_t i = 0; i < t.size(); ++i) { + unsigned short idx = (t[i].h << 8) + t[i].l; + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) + ncap++; + if (unicodetoupper(idx, langnum) == low) + nneutral++; + } + u16_u8(dest, t); + if (ncap) { + unsigned short idx = (t[0].h << 8) + t[0].l; + firstcap = (idx != unicodetolower(idx, langnum)); + } + } + + // now finally set the captype + if (ncap == 0) { + *pcaptype = NOCAP; + } else if ((ncap == 1) && firstcap) { + *pcaptype = INITCAP; + } else if ((ncap == nc) || ((ncap + nneutral) == nc)) { + *pcaptype = ALLCAP; + } else if ((ncap > 1) && firstcap) { + *pcaptype = HUHINITCAP; + } else { + *pcaptype = HUHCAP; + } +} + +void Hunspell::mkallcap(std::string& u8) { + if (utf8) { + std::vector<w_char> u16; + u8_u16(u16, u8); + ::mkallcap_utf(u16, langnum); + u16_u8(u8, u16); + } else { + ::mkallcap(u8, csconv); + } 
+} + +int Hunspell::mkallsmall2(std::string& u8, std::vector<w_char>& u16) { + if (utf8) { + ::mkallsmall_utf(u16, langnum); + u16_u8(u8, u16); + } else { + ::mkallsmall(u8, csconv); + } + return u8.size(); +} + +// convert UTF-8 sharp S codes to latin 1 +std::string Hunspell::sharps_u8_l1(const std::string& source) { + std::string dest(source); + mystrrep(dest, "\xC3\x9F", "\xDF"); + return dest; +} + +// recursive search for right ss - sharp s permutations +hentry* Hunspell::spellsharps(std::string& base, + size_t n_pos, + int n, + int repnum, + int* info, + char** root) { + size_t pos = base.find("ss", n_pos); + if (pos != std::string::npos && (n < MAXSHARPS)) { + base[pos] = '\xC3'; + base[pos + 1] = '\x9F'; + hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, info, root); + if (h) + return h; + base[pos] = 's'; + base[pos + 1] = 's'; + h = spellsharps(base, pos + 2, n + 1, repnum, info, root); + if (h) + return h; + } else if (repnum > 0) { + if (utf8) + return checkword(base.c_str(), info, root); + std::string tmp(sharps_u8_l1(base)); + return checkword(tmp.c_str(), info, root); + } + return NULL; +} + +int Hunspell::is_keepcase(const hentry* rv) { + return pAMgr && rv->astr && pAMgr->get_keepcase() && + TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); +} + +/* insert a word to the beginning of the suggestion array and return ns */ +int Hunspell::insert_sug(char*** slst, const char* word, int ns) { + if (!*slst) + return ns; + char* dup = mystrdup(word); + if (!dup) + return ns; + if (ns == MAXSUGGESTION) { + ns--; + free((*slst)[ns]); + } + for (int k = ns; k > 0; k--) + (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = dup; + return ns + 1; +} + +int Hunspell::spell(const char* word, int* info, char** root) { + struct hentry* rv = NULL; + + int info2 = 0; + if (!info) + info = &info2; + else + *info = 0; + + // Hunspell supports XML input of the simplified API (see manual) + if (strcmp(word, SPELL_XML) == 0) + return 1; + int nc = strlen(word); + if 
(utf8) { + if (nc >= MAXWORDUTF8LEN) + return 0; + } else { + if (nc >= MAXWORDLEN) + return 0; + } + int captype = NOCAP; + size_t abbv = 0; + size_t wl = 0; + + std::string scw; + std::vector<w_char> sunicw; + + // input conversion + RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + { + std::string wspace; + + int convstatus = rl ? rl->conv(word, wspace) : 0; + if (convstatus < 0) + return 0; + else if (convstatus > 0) + wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv); + else + wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv); + } + +#ifdef MOZILLA_CLIENT + // accept the abbreviated words without dots + // workaround for the incomplete tokenization of Mozilla + abbv = 1; +#endif + + if (wl == 0 || maxdic == 0) + return 1; + if (root) + *root = NULL; + + // allow numbers with dots, dashes and commas (but forbid double separators: + // "..", "--" etc.) + enum { NBEGIN, NNUM, NSEP }; + int nstate = NBEGIN; + size_t i; + + for (i = 0; (i < wl); i++) { + if ((scw[i] <= '9') && (scw[i] >= '0')) { + nstate = NNUM; + } else if ((scw[i] == ',') || (scw[i] == '.') || (scw[i] == '-')) { + if ((nstate == NSEP) || (i == 0)) + break; + nstate = NSEP; + } else + break; + } + if ((i == wl) && (nstate == NNUM)) + return 1; + + switch (captype) { + case HUHCAP: + /* FALLTHROUGH */ + case HUHINITCAP: + *info += SPELL_ORIGCAP; + /* FALLTHROUGH */ + case NOCAP: + rv = checkword(scw.c_str(), info, root); + if ((abbv) && !(rv)) { + std::string u8buffer(scw); + u8buffer.push_back('.'); + rv = checkword(u8buffer.c_str(), info, root); + } + break; + case ALLCAP: { + *info += SPELL_ORIGCAP; + rv = checkword(scw.c_str(), info, root); + if (rv) + break; + if (abbv) { + std::string u8buffer(scw); + u8buffer.push_back('.'); + rv = checkword(u8buffer.c_str(), info, root); + if (rv) + break; + } + // Spec. prefix handling for Catalan, French, Italian: + // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). + size_t apos = pAMgr ? 
scw.find('\'') : std::string::npos; + if (apos != std::string::npos) { + mkallsmall2(scw, sunicw); + //conversion may result in string with different len to pre-mkallsmall2 + //so re-scan + if (apos != std::string::npos && apos < scw.size() - 1) { + std::string part1 = scw.substr(0, apos+1); + std::string part2 = scw.substr(apos+1); + if (utf8) { + std::vector<w_char> part1u, part2u; + u8_u16(part1u, part1); + u8_u16(part2u, part2); + mkinitcap2(part2, part2u); + scw = part1 + part2; + sunicw = part1u; + sunicw.insert(sunicw.end(), part2u.begin(), part2u.end()); + rv = checkword(scw.c_str(), info, root); + if (rv) + break; + } else { + mkinitcap2(part2, sunicw); + scw = part1 + part2; + rv = checkword(scw.c_str(), info, root); + if (rv) + break; + } + mkinitcap2(scw, sunicw); + rv = checkword(scw.c_str(), info, root); + if (rv) + break; + } + } + if (pAMgr && pAMgr->get_checksharps() && scw.find("SS") != std::string::npos) { + + mkallsmall2(scw, sunicw); + std::string u8buffer(scw); + rv = spellsharps(u8buffer, 0, 0, 0, info, root); + if (!rv) { + mkinitcap2(scw, sunicw); + rv = spellsharps(scw, 0, 0, 0, info, root); + } + if ((abbv) && !(rv)) { + u8buffer.push_back('.'); + rv = spellsharps(u8buffer, 0, 0, 0, info, root); + if (!rv) { + u8buffer = std::string(scw); + u8buffer.push_back('.'); + rv = spellsharps(u8buffer, 0, 0, 0, info, root); + } + } + if (rv) + break; + } + } + case INITCAP: { + + *info += SPELL_ORIGCAP; + mkallsmall2(scw, sunicw); + std::string u8buffer(scw); + mkinitcap2(scw, sunicw); + if (captype == INITCAP) + *info += SPELL_INITCAP; + rv = checkword(scw.c_str(), info, root); + if (captype == INITCAP) + *info -= SPELL_INITCAP; + // forbid bad capitalization + // (for example, ijs -> Ijs instead of IJs in Dutch) + // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) + if (*info & SPELL_FORBIDDEN) { + rv = NULL; + break; + } + if (rv && is_keepcase(rv) && (captype == ALLCAP)) + rv = NULL; + if (rv) + break; + + rv = 
checkword(u8buffer.c_str(), info, root); + if (abbv && !rv) { + u8buffer.push_back('.'); + rv = checkword(u8buffer.c_str(), info, root); + if (!rv) { + u8buffer = scw; + u8buffer.push_back('.'); + if (captype == INITCAP) + *info += SPELL_INITCAP; + rv = checkword(u8buffer.c_str(), info, root); + if (captype == INITCAP) + *info -= SPELL_INITCAP; + if (rv && is_keepcase(rv) && (captype == ALLCAP)) + rv = NULL; + break; + } + } + if (rv && is_keepcase(rv) && + ((captype == ALLCAP) || + // if CHECKSHARPS: KEEPCASE words with \xDF are allowed + // in INITCAP form, too. + !(pAMgr->get_checksharps() && + ((utf8 && u8buffer.find("\xC3\x9F") != std::string::npos) || + (!utf8 && u8buffer.find('\xDF') != std::string::npos))))) + rv = NULL; + break; + } + } + + if (rv) { + if (pAMgr && pAMgr->get_warn() && rv->astr && + TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { + *info += SPELL_WARN; + if (pAMgr->get_forbidwarn()) + return 0; + return HUNSPELL_OK_WARN; + } + return HUNSPELL_OK; + } + + // recursive breaking at break points + if (wordbreak) { + + int nbr = 0; + wl = scw.size(); + int numbreak = pAMgr ? 
// Look up the word w and return the dictionary entry that accepts it, or
// NULL when no entry does.
//
// The lookup runs in this order:
//   1. characters from the affix manager's ignore set are removed,
//   2. the word is reversed when complexprefixes is set,
//   3. each of the maxdic hash managers is searched for the word itself,
//   4. if that fails, affix stripping is tried,
//   5. if that fails too and compounding is enabled, compound analysis.
//
//   w     NUL-terminated word to check
//   info  optional flag accumulator: SPELL_FORBIDDEN and SPELL_COMPOUND are
//         added to *info; a SPELL_INITCAP bit already set in *info makes
//         entries carrying ONLYUPCASEFLAG be rejected
//   root  optional; when the word is accepted through affix or compound
//         analysis, *root receives a mystrdup() copy of the entry's word.
//         It is not written for a direct hash-table hit.
//         NOTE(review): presumably the caller has to free *root - confirm.
struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
  struct hentry* he = NULL;
  bool usebuffer = false;  // true once `word` points into w2 instead of w
  int len, i;
  std::string w2;
  const char* word;

  char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL;
  if (ignoredchars != NULL) {
    w2.assign(w);
    if (utf8) {
      const std::vector<w_char>& ignoredchars_utf16 =
          pAMgr->get_ignore_utf16();
      remove_ignored_chars_utf(w2, ignoredchars_utf16);
    } else {
      remove_ignored_chars(w2, ignoredchars);
    }
    word = w2.c_str();
    usebuffer = true;
  } else
    word = w;

  len = strlen(word);

  if (!len)
    return NULL;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    if (!usebuffer) {
      w2.assign(word);
      usebuffer = true;
    }
    if (utf8)
      reverseword_utf(w2);
    else
      reverseword(w2);
  }

  // w2 may have been modified since `word` was taken from it, so fetch the
  // pointer again
  if (usebuffer) {
    word = w2.c_str();
  }

  // look word in hash table
  for (i = 0; (i < maxdic) && !he; i++) {
    he = (pHMgr[i])->lookup(word);

    // check forbidden and onlyincompound words
    if ((he) && (he->astr) && (pAMgr) &&
        TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
      if (info)
        *info += SPELL_FORBIDDEN;
      // LANG_hu section: set dash information for suggestions
      if (langnum == LANG_hu) {
        if (pAMgr->get_compoundflag() &&
            TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
          if (info)
            *info += SPELL_COMPOUND;
        }
      }
      // a forbidden entry ends the whole lookup; later dictionaries are not
      // consulted
      return NULL;
    }

    // he = next not needaffix, onlyincompound homonym or onlyupcase word
    while (he && (he->astr) && pAMgr &&
           ((pAMgr->get_needaffix() &&
             TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
            (pAMgr->get_onlyincompound() &&
             TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
            (info && (*info & SPELL_INITCAP) &&
             TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))))
      he = he->next_homonym;
  }

  // check with affixes
  if (!he && pAMgr) {
    // try stripping off affixes
    he = pAMgr->affix_check(word, len, 0);

    // check compound restriction and onlyupcase
    if (he && he->astr &&
        ((pAMgr->get_onlyincompound() &&
          TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
         (info && (*info & SPELL_INITCAP) &&
          TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
      he = NULL;
    }

    if (he) {
      if ((he->astr) && (pAMgr) &&
          TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
        if (info)
          *info += SPELL_FORBIDDEN;
        return NULL;
      }
      if (root) {
        // hand back the entry's word, reversed back to reading order when
        // the lookup itself worked on the reversed form
        std::string word_root(he->word);
        if (complexprefixes) {
          if (utf8)
            reverseword_utf(word_root);
          else
            reverseword(word_root);
        }
        *root = mystrdup(word_root.c_str());
      }
      // try check compound word
    } else if (pAMgr->get_compound()) {
      struct hentry* rwords[100];  // buffer for COMPOUND pattern checking
      he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
      // LANG_hu section: `moving rule' with last dash
      if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
        // retry on a copy without the trailing dash
        char* dup = mystrdup(word);
        if (!dup)
          return NULL;
        dup[len - 1] = '\0';
        he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0,
                                   info);
        free(dup);
      }
      // end of LANG specific region
      if (he) {
        if (root) {
          std::string word_root(he->word);
          if (complexprefixes) {
            if (utf8)
              reverseword_utf(word_root);
            else
              reverseword(word_root);
          }
          *root = mystrdup(word_root.c_str());
        }
        if (info)
          *info += SPELL_COMPOUND;
      }
    }
  }

  return he;
}
rl->conv(word, wspace) : 0; + if (convstatus < 0) + return 0; + else if (convstatus > 0) + wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv); + else + wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv); + + if (wl == 0) + return 0; + } + + int ns = 0; + int capwords = 0; + + // check capitalized form for FORCEUCASE + if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { + int info = SPELL_ORIGCAP; + if (checkword(scw.c_str(), &info, NULL)) { + std::string form(scw); + mkinitcap(form); + + char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*)); + if (wlst == NULL) + return -1; + *slst = wlst; + wlst[0] = mystrdup(form.c_str()); + for (int i = 1; i < MAXSUGGESTION; ++i) { + wlst[i] = NULL; + } + + return 1; + } + } + + switch (captype) { + case NOCAP: { + ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug); + break; + } + + case INITCAP: { + capwords = 1; + ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug); + if (ns == -1) + break; + std::string wspace(scw); + mkallsmall2(wspace, sunicw); + ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + break; + } + case HUHINITCAP: + capwords = 1; + case HUHCAP: { + ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug); + if (ns != -1) { + // something.The -> something. 
The + size_t dot_pos = scw.find('.'); + if (dot_pos != std::string::npos) { + std::string postdot = scw.substr(dot_pos + 1); + int captype_; + if (utf8) { + std::vector<w_char> postdotu; + u8_u16(postdotu, postdot); + captype_ = get_captype_utf8(postdotu, langnum); + } else { + captype_ = get_captype(postdot, csconv); + } + if (captype_ == INITCAP) { + std::string str(scw); + str.insert(dot_pos + 1, 1, ' '); + ns = insert_sug(slst, str.c_str(), ns); + } + } + + std::string wspace; + + if (captype == HUHINITCAP) { + // TheOpenOffice.org -> The OpenOffice.org + wspace = scw; + mkinitsmall2(wspace, sunicw); + ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + } + wspace = scw; + mkallsmall2(wspace, sunicw); + if (spell(wspace.c_str())) + ns = insert_sug(slst, wspace.c_str(), ns); + int prevns = ns; + ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + if (captype == HUHINITCAP) { + mkinitcap2(wspace, sunicw); + if (spell(wspace.c_str())) + ns = insert_sug(slst, wspace.c_str(), ns); + ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + } + // aNew -> "a New" (instead of "a new") + for (int j = prevns; j < ns; j++) { + char* space = strchr((*slst)[j], ' '); + if (space) { + size_t slen = strlen(space + 1); + // different case after space (need capitalisation) + if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) { + std::string first((*slst)[j], space + 1); + std::string second(space + 1); + std::vector<w_char> w; + if (utf8) + u8_u16(w, second); + mkinitcap2(second, w); + // set as first suggestion + char* r = (*slst)[j]; + for (int k = j; k > 0; k--) + (*slst)[k] = (*slst)[k - 1]; + free(r); + (*slst)[0] = mystrdup((first + second).c_str()); + } + } + } + } + break; + } + + case ALLCAP: { + std::string wspace(scw); + mkallsmall2(wspace, sunicw); + ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + if (ns == -1) + break; + if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str())) + ns = insert_sug(slst, 
wspace.c_str(), ns); + mkinitcap2(wspace, sunicw); + ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + for (int j = 0; j < ns; j++) { + std::string form((*slst)[j]); + mkallcap(form); + + if (pAMgr && pAMgr->get_checksharps()) { + if (utf8) { + mystrrep(form, "\xC3\x9F", "SS"); + } else { + mystrrep(form, "\xDF", "SS"); + } + } + + free((*slst)[j]); + (*slst)[j] = mystrdup(form.c_str()); + + } + break; + } + } + + // LANG_hu section: replace '-' with ' ' in Hungarian + if (langnum == LANG_hu) { + for (int j = 0; j < ns; j++) { + char* pos = strchr((*slst)[j], '-'); + if (pos) { + int info; + *pos = '\0'; + std::string w((*slst)[j]); + w.append(pos + 1); + (void)spell(w.c_str(), &info, NULL); + if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { + *pos = ' '; + } else + *pos = '-'; + } + } + } + // END OF LANG_hu section + + // try ngram approach since found nothing or only compound words + if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && + (*slst)) { + switch (captype) { + case NOCAP: { + ns = pSMgr->ngsuggest(*slst, scw.c_str(), ns, pHMgr, maxdic); + break; + } + case HUHINITCAP: + capwords = 1; + case HUHCAP: { + std::string wspace(scw); + mkallsmall2(wspace, sunicw); + ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic); + break; + } + case INITCAP: { + capwords = 1; + std::string wspace(scw); + mkallsmall2(wspace, sunicw); + ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic); + break; + } + case ALLCAP: { + std::string wspace(scw); + mkallsmall2(wspace, sunicw); + int oldns = ns; + ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic); + for (int j = oldns; j < ns; j++) { + std::string form((*slst)[j]); + mkallcap(form); + free((*slst)[j]); + (*slst)[j] = mystrdup(form.c_str()); + } + break; + } + } + } + + // try dash suggestion (Afo-American -> Afro-American) + size_t dash_pos = scw.find('-'); + if (dash_pos != std::string::npos) { + int nodashsug = 1; + for (int j = 0; j < ns 
&& nodashsug == 1; j++) { + if (strchr((*slst)[j], '-')) + nodashsug = 0; + } + + size_t prev_pos = 0; + bool last = false; + + while (nodashsug && !last) { + if (dash_pos == scw.size()) + last = 1; + std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); + if (!spell(chunk.c_str())) { + char** nlst = NULL; + int nn = suggest(&nlst, chunk.c_str()); + for (int j = nn - 1; j >= 0; j--) { + std::string wspace = scw.substr(0, prev_pos); + wspace.append(nlst[j]); + if (!last) { + wspace.append("-"); + wspace.append(scw.substr(dash_pos + 1)); + } + ns = insert_sug(slst, wspace.c_str(), ns); + free(nlst[j]); + } + if (nlst != NULL) + free(nlst); + nodashsug = 0; + } + if (!last) { + prev_pos = dash_pos + 1; + dash_pos = scw.find('-', prev_pos); + } + if (dash_pos == std::string::npos) + dash_pos = scw.size(); + } + } + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (int j = 0; j < ns; j++) { + std::string root((*slst)[j]); + free((*slst)[j]); + if (utf8) + reverseword_utf(root); + else + reverseword(root); + (*slst)[j] = mystrdup(root.c_str()); + } + } + + // capitalize + if (capwords) + for (int j = 0; j < ns; j++) { + std::string form((*slst)[j]); + free((*slst)[j]); + mkinitcap(form); + (*slst)[j] = mystrdup(form.c_str()); + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (int j = 0; j < ns; j++) { + (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); + strcat((*slst)[j], word + strlen(word) - abbv); + } + } + + // remove bad capitalized and forbidden forms + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { + switch (captype) { + case INITCAP: + case ALLCAP: { + int l = 0; + for (int j = 0; j < ns; j++) { + if (!strchr((*slst)[j], ' ') && !spell((*slst)[j])) { + std::string s; + std::vector<w_char> w; + if (utf8) { + u8_u16(w, (*slst)[j]); + } else { + s = (*slst)[j]; + } + mkallsmall2(s, w); + free((*slst)[j]); + if (spell(s.c_str())) { + 
// Build a list of stems from n morphological descriptions.
//
//   slst  receives the resulting list; set to NULL first, so it stays NULL
//         when n is 0
//   desc  array of n morphological description strings; the strings are
//         only read here (the tokens that get modified are the copies
//         produced by line_tok)
//   n     number of entries in desc
//
// Returns 0 when n is 0, otherwise the value returned by uniqlist() for the
// generated list.
// NOTE(review): presumably that is the number of distinct stems - confirm
// against uniqlist().
int Hunspell::stem(char*** slst, char** desc, int n) {

  // all stems found so far, each one preceded by MSEP_REC
  std::string result2;
  *slst = NULL;
  if (n == 0)
    return 0;
  for (int i = 0; i < n; i++) {

    // fields of the leading compound parts of this description; prepended
    // to every stem generated from it
    std::string result;

    // add compound word parts (except the last one)
    char* s = (char*)desc[i];
    char* part = strstr(s, MORPH_PART);
    if (part) {
      char* nextpart = strstr(part + 1, MORPH_PART);
      while (nextpart) {
        std::string field;
        copy_field(field, part, MORPH_PART);
        result.append(field);
        part = nextpart;
        nextpart = strstr(part + 1, MORPH_PART);
      }
      // continue with the last part only
      s = part;
    }

    char** pl;
    std::string tok(s);
    size_t alt = 0;
    // turn the '|' of every " | " separator into MSEP_ALT so that line_tok
    // can split the alternatives on that character
    while ((alt = tok.find(" | ", alt)) != std::string::npos) {
      tok[alt + 1] = MSEP_ALT;
    }
    int pln = line_tok(tok.c_str(), &pl, MSEP_ALT);
    for (int k = 0; k < pln; k++) {
      // add derivational suffixes
      if (strstr(pl[k], MORPH_DERI_SFX)) {
        // remove inflectional suffixes
        char* is = strstr(pl[k], MORPH_INFL_SFX);
        if (is)
          *is = '\0';
        // generate from the truncated description; each generated line
        // becomes one stem
        char* sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
        if (sg) {
          char** gen;
          int genl = line_tok(sg, &gen, MSEP_REC);
          free(sg);
          for (int j = 0; j < genl; j++) {
            result2.push_back(MSEP_REC);
            result2.append(result);
            result2.append(gen[j]);
          }
          freelist(&gen, genl);
        }
      } else {
        // no derivational suffix: the stem is the optional surface prefix
        // field followed by the stem field
        result2.push_back(MSEP_REC);
        result2.append(result);
        if (strstr(pl[k], MORPH_SURF_PFX)) {
          std::string field;
          copy_field(field, pl[k], MORPH_SURF_PFX);
          result2.append(field);
        }
        std::string field;
        copy_field(field, pl[k], MORPH_STEM);
        result2.append(field);
      }
    }
    freelist(&pl, pln);
  }
  int sln = line_tok(result2.c_str(), slst, MSEP_REC);
  return uniqlist(*slst, sln);
}
(pHMgr[0])->remove(word); + return 0; +} + +const char* Hunspell::get_version() { + return pAMgr->get_version(); +} + +struct cs_info* Hunspell::get_csconv() { + return csconv; +} + +void Hunspell::cat_result(std::string& result, char* st) { + if (st) { + if (!result.empty()) + result.append("\n"); + result.append(st); + free(st); + } +} + +int Hunspell::analyze(char*** slst, const char* word) { + *slst = NULL; + if (!pSMgr || maxdic == 0) + return 0; + int nc = strlen(word); + if (utf8) { + if (nc >= MAXWORDUTF8LEN) + return 0; + } else { + if (nc >= MAXWORDLEN) + return 0; + } + int captype = NOCAP; + size_t abbv = 0; + size_t wl = 0; + + std::string scw; + std::vector<w_char> sunicw; + + // input conversion + RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + { + std::string wspace; + + int convstatus = rl ? rl->conv(word, wspace) : 0; + if (convstatus < 0) + return 0; + else if (convstatus > 0) + wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv); + else + wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv); + } + + if (wl == 0) { + if (abbv) { + scw.clear(); + for (wl = 0; wl < abbv; wl++) + scw.push_back('.'); + abbv = 0; + } else + return 0; + } + + std::string result; + + size_t n = 0; + size_t n2 = 0; + size_t n3 = 0; + + // test numbers + // LANG_hu section: set dash information for suggestions + if (langnum == LANG_hu) { + while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) || + (((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) { + n++; + if ((scw[n] == '.') || (scw[n] == ',')) { + if (((n2 == 0) && (n > 3)) || + ((n2 > 0) && ((scw[n - 1] == '.') || (scw[n - 1] == ',')))) + break; + n2++; + n3 = n; + } + } + + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) + return 0; + if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) && + checkword(scw.c_str() + n, NULL, NULL))) { + result.append(scw); + result.resize(n - 1); + if (n == wl) + cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1)); + else { + char 
sign = scw[n]; + scw[n] = '\0'; + cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1)); + result.push_back('+'); // XXX SPEC. MORPHCODE + scw[n] = sign; + cat_result(result, pSMgr->suggest_morph(scw.c_str() + n)); + } + return line_tok(result.c_str(), slst, MSEP_REC); + } + } + // END OF LANG_hu section + + switch (captype) { + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + cat_result(result, pSMgr->suggest_morph(scw.c_str())); + if (abbv) { + std::string u8buffer(scw); + u8buffer.push_back('.'); + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + } + break; + } + case INITCAP: { + wl = mkallsmall2(scw, sunicw); + std::string u8buffer(scw); + mkinitcap2(scw, sunicw); + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(scw.c_str())); + if (abbv) { + u8buffer.push_back('.'); + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + + u8buffer = scw; + u8buffer.push_back('.'); + + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + } + break; + } + case ALLCAP: { + cat_result(result, pSMgr->suggest_morph(scw.c_str())); + if (abbv) { + std::string u8buffer(scw); + u8buffer.push_back('.'); + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + } + mkallsmall2(scw, sunicw); + std::string u8buffer(scw); + mkinitcap2(scw, sunicw); + + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(scw.c_str())); + if (abbv) { + u8buffer.push_back('.'); + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + + u8buffer = scw; + u8buffer.push_back('.'); + + cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + } + break; + } + } + + if (!result.empty()) { + // word reversing wrapper for complex prefixes + if (complexprefixes) { + if (utf8) + reverseword_utf(result); + else + reverseword(result); + } + return line_tok(result.c_str(), slst, MSEP_REC); + } + + // compound word with dash (HU) I18n + // LANG_hu section: set 
dash information for suggestions + + size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos; + int nresult = 0; + if (dash_pos != std::string::npos) { + std::string part1 = scw.substr(0, dash_pos); + std::string part2 = scw.substr(dash_pos+1); + + // examine 2 sides of the dash + if (part2.empty()) { // base word ending with dash + if (spell(part1.c_str())) { + char* p = pSMgr->suggest_morph(part1.c_str()); + if (p) { + int ret = line_tok(p, slst, MSEP_REC); + free(p); + return ret; + } + } + } else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat. + if (spell(part1.c_str()) && (spell("-e"))) { + char* st = pSMgr->suggest_morph(part1.c_str()); + if (st) { + result.append(st); + free(st); + } + result.push_back('+'); // XXX spec. separator in MORPHCODE + st = pSMgr->suggest_morph("-e"); + if (st) { + result.append(st); + free(st); + } + return line_tok(result.c_str(), slst, MSEP_REC); + } + } else { + // first word ending with dash: word- XXX ??? + part1.push_back(' '); + nresult = spell(part1.c_str()); + part1.erase(part1.size() - 1); + if (nresult && spell(part2.c_str()) && + ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) { + char* st = pSMgr->suggest_morph(part1.c_str()); + if (st) { + result.append(st); + free(st); + result.push_back('+'); // XXX spec. 
separator in MORPHCODE + } + st = pSMgr->suggest_morph(part2.c_str()); + if (st) { + result.append(st); + free(st); + } + return line_tok(result.c_str(), slst, MSEP_REC); + } + } + // affixed number in correct word + if (nresult && (dash_pos > 0) && + (((scw[dash_pos - 1] <= '9') && (scw[dash_pos - 1] >= '0')) || + (scw[dash_pos - 1] == '.'))) { + n = 1; + if (scw[dash_pos - n] == '.') + n++; + // search first not a number character to left from dash + while ((dash_pos >= n) && ((scw[dash_pos - n] == '0') || (n < 3)) && + (n < 6)) { + n++; + } + if (dash_pos < n) + n--; + // numbers: valami1000000-hoz + // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, + // 56-hoz, 6-hoz + for (; n >= 1; n--) { + if (scw[dash_pos - n] < '0' || scw[dash_pos - n] > '9') { + continue; + } + std::string chunk = scw.substr(dash_pos - n); + if (checkword(chunk.c_str(), NULL, NULL)) { + result.append(chunk); + char* st = pSMgr->suggest_morph(chunk.c_str()); + if (st) { + result.append(st); + free(st); + } + return line_tok(result.c_str(), slst, MSEP_REC); + } + } + } + } + return 0; +} + +int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { + *slst = NULL; + if (!pSMgr || !pln) + return 0; + char** pl2; + int pl2n = analyze(&pl2, word); + int captype = NOCAP; + int abbv = 0; + std::string cw; + cleanword(cw, word, &captype, &abbv); + std::string result; + + for (int i = 0; i < pln; i++) { + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); + } + freelist(&pl2, pl2n); + + if (!result.empty()) { + // allcap + if (captype == ALLCAP) + mkallcap(result); + + // line split + int linenum = line_tok(result.c_str(), slst, MSEP_REC); + + // capitalize + if (captype == INITCAP || captype == HUHINITCAP) { + for (int j = 0; j < linenum; j++) { + std::string form((*slst)[j]); + free((*slst)[j]); + mkinitcap(form); + (*slst)[j] = mystrdup(form.c_str()); + } + } + + // temporary filtering of prefix related errors (eg. 
+ // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") + + int r = 0; + for (int j = 0; j < linenum; j++) { + if (!spell((*slst)[j])) { + free((*slst)[j]); + (*slst)[j] = NULL; + } else { + if (r < j) + (*slst)[r] = (*slst)[j]; + r++; + } + } + if (r > 0) + return r; + free(*slst); + *slst = NULL; + } + return 0; +} + +int Hunspell::generate(char*** slst, const char* word, const char* pattern) { + char** pl; + int pln = analyze(&pl, pattern); + int n = generate(slst, word, pl, pln); + freelist(&pl, pln); + return uniqlist(*slst, n); +} + +// minimal XML parser functions +std::string Hunspell::get_xml_par(const char* par) { + std::string dest; + if (!par) + return dest; + char end = *par; + if (end == '>') + end = '<'; + else if (end != '\'' && end != '"') + return 0; // bad XML + for (par++; *par != '\0' && *par != end; ++par) { + dest.push_back(*par); + } + mystrrep(dest, "<", "<"); + mystrrep(dest, "&", "&"); + return dest; +} + +int Hunspell::get_langnum() const { + return langnum; +} + +int Hunspell::input_conv(const char* word, char* dest, size_t destsize) { + RepList* rl = (pAMgr) ? 
pAMgr->get_iconvtable() : NULL; + return (rl && rl->conv(word, dest, destsize) > 0); +} + +// return the beginning of the element (attr == NULL) or the attribute +const char* Hunspell::get_xml_pos(const char* s, const char* attr) { + const char* end = strchr(s, '>'); + const char* p = s; + if (attr == NULL) + return end; + do { + p = strstr(p, attr); + if (!p || p >= end) + return 0; + } while (*(p - 1) != ' ' && *(p - 1) != '\n'); + return p + strlen(attr); +} + +int Hunspell::check_xml_par(const char* q, + const char* attr, + const char* value) { + std::string cw = get_xml_par(get_xml_pos(q, attr)); + if (cw == value) + return 1; + return 0; +} + +int Hunspell::get_xml_list(char*** slst, const char* list, const char* tag) { + if (!list) + return 0; + int n = 0; + const char* p; + for (p = list; ((p = strstr(p, tag)) != NULL); p++) + n++; + if (n == 0) + return 0; + *slst = (char**)malloc(sizeof(char*) * n); + if (!*slst) + return 0; + for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) { + std::string cw = get_xml_par(p + strlen(tag) - 1); + if (cw.empty()) { + break; + } + (*slst)[n] = mystrdup(cw.c_str()); + } + return n; +} + +int Hunspell::spellml(char*** slst, const char* word) { + const char* q = strstr(word, "<query"); + if (!q) + return 0; // bad XML input + const char* q2 = strchr(q, '>'); + if (!q2) + return 0; // bad XML input + q2 = strstr(q2, "<word"); + if (!q2) + return 0; // bad XML input + if (check_xml_par(q, "type=", "analyze")) { + int n = 0; + std::string cw = get_xml_par(strchr(q2, '>')); + if (!cw.empty()) + n = analyze(slst, cw.c_str()); + if (n == 0) + return 0; + // convert the result to <code><a>ana1</a><a>ana2</a></code> format + std::string r; + r.append("<code>"); + for (int i = 0; i < n; i++) { + r.append("<a>"); + + std::string entry((*slst)[i]); + free((*slst)[i]); + mystrrep(entry, "\t", " "); + mystrrep(entry, "&", "&"); + mystrrep(entry, "<", "<"); + r.append(entry); + + r.append("</a>"); + } + r.append("</code>"); 
+ (*slst)[0] = mystrdup(r.c_str()); + return 1; + } else if (check_xml_par(q, "type=", "stem")) { + std::string cw = get_xml_par(strchr(q2, '>')); + if (!cw.empty()) + return stem(slst, cw.c_str()); + } else if (check_xml_par(q, "type=", "generate")) { + std::string cw = get_xml_par(strchr(q2, '>')); + if (cw.empty()) + return 0; + const char* q3 = strstr(q2 + 1, "<word"); + if (q3) { + std::string cw2 = get_xml_par(strchr(q3, '>')); + if (!cw2.empty()) { + return generate(slst, cw.c_str(), cw2.c_str()); + } + } else { + if ((q2 = strstr(q2 + 1, "<code")) != NULL) { + char** slst2; + int n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"); + if (n != 0) { + int n2 = generate(slst, cw.c_str(), slst2, n); + freelist(&slst2, n); + return uniqlist(*slst, n2); + } + freelist(&slst2, n); + } + } + } + return 0; +} + +Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { + return (Hunhandle*)(new Hunspell(affpath, dpath)); +} + +Hunhandle* Hunspell_create_key(const char* affpath, + const char* dpath, + const char* key) { + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); +} + +void Hunspell_destroy(Hunhandle* pHunspell) { + delete (Hunspell*)(pHunspell); +} + +int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) { + return ((Hunspell*)pHunspell)->add_dic(dpath); +} + +int Hunspell_spell(Hunhandle* pHunspell, const char* word) { + return ((Hunspell*)pHunspell)->spell(word); +} + +char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) { + return ((Hunspell*)pHunspell)->get_dic_encoding(); +} + +int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) { + return ((Hunspell*)pHunspell)->suggest(slst, word); +} + +int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) { + return ((Hunspell*)pHunspell)->analyze(slst, word); +} + +int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) { + return ((Hunspell*)pHunspell)->stem(slst, word); +} + +int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, 
char** desc, int n) { + return ((Hunspell*)pHunspell)->stem(slst, desc, n); +} + +int Hunspell_generate(Hunhandle* pHunspell, + char*** slst, + const char* word, + const char* word2) { + return ((Hunspell*)pHunspell)->generate(slst, word, word2); +} + +int Hunspell_generate2(Hunhandle* pHunspell, + char*** slst, + const char* word, + char** desc, + int n) { + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); +} + +/* functions for run-time modification of the dictionary */ + +/* add word to the run-time dictionary */ + +int Hunspell_add(Hunhandle* pHunspell, const char* word) { + return ((Hunspell*)pHunspell)->add(word); +} + +/* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +int Hunspell_add_with_affix(Hunhandle* pHunspell, + const char* word, + const char* example) { + return ((Hunspell*)pHunspell)->add_with_affix(word, example); +} + +/* remove word from the run-time dictionary */ + +int Hunspell_remove(Hunhandle* pHunspell, const char* word) { + return ((Hunspell*)pHunspell)->remove(word); +} + +void Hunspell_free_list(Hunhandle*, char*** slst, int n) { + freelist(slst, n); +} + +int Hunspell::suffix_suggest(char*** slst, const char* root_word) { + struct hentry* he = NULL; + int len; + std::string w2; + const char* word; + char* ignoredchars = pAMgr->get_ignore(); + if (ignoredchars != NULL) { + w2.assign(root_word); + if (utf8) { + const std::vector<w_char>& ignoredchars_utf16 = + pAMgr->get_ignore_utf16(); + remove_ignored_chars_utf(w2, ignoredchars_utf16); + } else { + remove_ignored_chars(w2, ignoredchars); + } + word = w2.c_str(); + } else + word = root_word; + + len = strlen(word); + + if (!len) + return 0; + + char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*)); + if (wlst == NULL) + return -1; + *slst = wlst; + for (int i = 0; i < MAXSUGGESTION; i++) { + wlst[i] = NULL; + } + + for (int i = 0; (i < maxdic) && !he; 
i++) { + he = (pHMgr[i])->lookup(word); + } + if (he) { + return pAMgr->get_suffix_words(he->astr, he->alen, root_word, *slst); + } + return 0; +} diff --git a/libs/hunspell/src/hunspell.h b/libs/hunspell/src/hunspell.h new file mode 100644 index 000000000..726bbe207 --- /dev/null +++ b/libs/hunspell/src/hunspell.h @@ -0,0 +1,162 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#ifndef _MYSPELLMGR_H_ +#define _MYSPELLMGR_H_ + +#include "hunvisapi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct Hunhandle Hunhandle; + +LIBHUNSPELL_DLL_EXPORTED Hunhandle* Hunspell_create(const char* affpath, + const char* dpath); + +LIBHUNSPELL_DLL_EXPORTED Hunhandle* Hunspell_create_key(const char* affpath, + const char* dpath, + const char* key); + +LIBHUNSPELL_DLL_EXPORTED void Hunspell_destroy(Hunhandle* pHunspell); + +/* load extra dictionaries (only dic files) + * output: 0 = additional dictionary slots available, 1 = slots are now full*/ +LIBHUNSPELL_DLL_EXPORTED int Hunspell_add_dic(Hunhandle* pHunspell, + const char* dpath); + +/* spell(word) - spellcheck word + * output: 0 = bad word, not 0 = good word + */ +LIBHUNSPELL_DLL_EXPORTED int Hunspell_spell(Hunhandle* pHunspell, const char*); + +LIBHUNSPELL_DLL_EXPORTED char* Hunspell_get_dic_encoding(Hunhandle* pHunspell); + +/* suggest(suggestions, word) - search suggestions + * input: pointer to an array of strings pointer and the (bad) word + * array of strings pointer (here *slst) may not be initialized + * output: number of suggestions in string array, and suggestions in + * a newly allocated array of strings (*slts will be NULL when number + * of suggestion equals 0.) 
+ */ +LIBHUNSPELL_DLL_EXPORTED int Hunspell_suggest(Hunhandle* pHunspell, + char*** slst, + const char* word); + +/* morphological functions */ + +/* analyze(result, word) - morphological analysis of the word */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_analyze(Hunhandle* pHunspell, + char*** slst, + const char* word); + +/* stem(result, word) - stemmer function */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_stem(Hunhandle* pHunspell, + char*** slst, + const char* word); + +/* stem(result, analysis, n) - get stems from a morph. analysis + * example: + * char ** result, result2; + * int n1 = Hunspell_analyze(result, "words"); + * int n2 = Hunspell_stem2(result2, result, n1); + */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_stem2(Hunhandle* pHunspell, + char*** slst, + char** desc, + int n); + +/* generate(result, word, word2) - morphological generation by example(s) */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_generate(Hunhandle* pHunspell, + char*** slst, + const char* word, + const char* word2); + +/* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = Hunspell_generate2(result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_generate2(Hunhandle* pHunspell, + char*** slst, + const char* word, + char** desc, + int n); + +/* functions for run-time modification of the dictionary */ + +/* add word to the run-time dictionary */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_add(Hunhandle* pHunspell, + const char* word); + +/* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. 
+ */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_add_with_affix(Hunhandle* pHunspell, + const char* word, + const char* example); + +/* remove word from the run-time dictionary */ + +LIBHUNSPELL_DLL_EXPORTED int Hunspell_remove(Hunhandle* pHunspell, + const char* word); + +/* free suggestion lists */ + +LIBHUNSPELL_DLL_EXPORTED void Hunspell_free_list(Hunhandle* pHunspell, + char*** slst, + int n); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libs/hunspell/src/hunspell.hxx b/libs/hunspell/src/hunspell.hxx new file mode 100644 index 000000000..401475309 --- /dev/null +++ b/libs/hunspell/src/hunspell.hxx @@ -0,0 +1,258 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "hunvisapi.h" + +#include "hashmgr.hxx" +#include "affixmgr.hxx" +#include "suggestmgr.hxx" +#include "langnum.hxx" +#include <vector> + +#define SPELL_XML "<?xml?>" + +#define MAXDIC 20 +#define MAXSUGGESTION 15 +#define MAXSHARPS 5 + +#define HUNSPELL_OK (1 << 0) +#define HUNSPELL_OK_WARN (1 << 1) + +#ifndef _MYSPELLMGR_HXX_ +#define _MYSPELLMGR_HXX_ + +class LIBHUNSPELL_DLL_EXPORTED Hunspell { + private: + Hunspell(const Hunspell&); + Hunspell& operator=(const Hunspell&); + + private: + AffixMgr* pAMgr; + HashMgr* pHMgr[MAXDIC]; + int maxdic; + SuggestMgr* pSMgr; + char* affixpath; + char* encoding; + struct cs_info* csconv; + int langnum; + int utf8; + int complexprefixes; + char** wordbreak; + + public: + /* Hunspell(aff, dic) - constructor of Hunspell class + * input: path of affix file and dictionary file + * + * In WIN32 environment, use UTF-8 encoded paths started with the long path + * prefix \\\\?\\ to handle system-independent character encoding and very + * long path 
names (without the long path prefix Hunspell will use fopen() + * with system-dependent character encoding instead of _wfopen()). + */ + + Hunspell(const char* affpath, const char* dpath, const char* key = NULL); + ~Hunspell(); + + /* load extra dictionaries (only dic files) */ + int add_dic(const char* dpath, const char* key = NULL); + + /* spell(word) - spellcheck word + * output: 0 = bad word, not 0 = good word + * + * plus output: + * info: information bit array, fields: + * SPELL_COMPOUND = a compound word + * SPELL_FORBIDDEN = an explicit forbidden word + * root: root (stem), when input is a word with affix(es) + */ + + int spell(const char* word, int* info = NULL, char** root = NULL); + + /* suggest(suggestions, word) - search suggestions + * input: pointer to an array of strings pointer and the (bad) word + * array of strings pointer (here *slst) may not be initialized + * output: number of suggestions in string array, and suggestions in + * a newly allocated array of strings (*slts will be NULL when number + * of suggestion equals 0.) + */ + + int suggest(char*** slst, const char* word); + + /* Suggest words from suffix rules + * suffix_suggest(suggestions, root_word) + * input: pointer to an array of strings pointer and the word + * array of strings pointer (here *slst) may not be initialized + * output: number of suggestions in string array, and suggestions in + * a newly allocated array of strings (*slts will be NULL when number + * of suggestion equals 0.) + */ + int suffix_suggest(char*** slst, const char* root_word); + + /* deallocate suggestion lists */ + + void free_list(char*** slst, int n); + + char* get_dic_encoding(); + + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + + int analyze(char*** slst, const char* word); + + /* stem(result, word) - stemmer function */ + + int stem(char*** slst, const char* word); + + /* stem(result, analysis, n) - get stems from a morph. 
analysis + * example: + * char ** result, result2; + * int n1 = analyze(&result, "words"); + * int n2 = stem(&result2, result, n1); + */ + + int stem(char*** slst, char** morph, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + + int generate(char*** slst, const char* word, const char* word2); + + /* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = generate(&result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + + int generate(char*** slst, const char* word, char** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + + int add(const char* word); + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. 
+ */ + + int add_with_affix(const char* word, const char* example); + + /* remove word from the run-time dictionary */ + + int remove(const char* word); + + /* other */ + + /* get extra word characters definied in affix file for tokenization */ + const char* get_wordchars(); + const std::vector<w_char>& get_wordchars_utf16(); + + struct cs_info* get_csconv(); + const char* get_version(); + + int get_langnum() const; + + /* need for putdic */ + int input_conv(const char* word, char* dest, size_t destsize); + + private: + void cleanword(std::string& dest, const char*, int* pcaptype, int* pabbrev); + size_t cleanword2(std::string& dest, + std::vector<w_char>& dest_u, + const char*, + int* w_len, + int* pcaptype, + size_t* pabbrev); + void mkinitcap(std::string& u8); + int mkinitcap2(std::string& u8, std::vector<w_char>& u16); + int mkinitsmall2(std::string& u8, std::vector<w_char>& u16); + void mkallcap(std::string& u8); + int mkallsmall2(std::string& u8, std::vector<w_char>& u16); + struct hentry* checkword(const char*, int* info, char** root); + std::string sharps_u8_l1(const std::string& source); + hentry* + spellsharps(std::string& base, size_t start_pos, int, int, int* info, char** root); + int is_keepcase(const hentry* rv); + int insert_sug(char*** slst, const char* word, int ns); + void cat_result(std::string& result, char* st); + char* stem_description(const char* desc); + int spellml(char*** slst, const char* word); + std::string get_xml_par(const char* par); + const char* get_xml_pos(const char* s, const char* attr); + int get_xml_list(char*** slst, const char* list, const char* tag); + int check_xml_par(const char* q, const char* attr, const char* value); +}; + +#endif diff --git a/libs/hunspell/src/hunvisapi.h b/libs/hunspell/src/hunvisapi.h new file mode 100644 index 000000000..503c20f66 --- /dev/null +++ b/libs/hunspell/src/hunvisapi.h @@ -0,0 +1,18 @@ +#ifndef _HUNSPELL_VISIBILITY_H_ +#define _HUNSPELL_VISIBILITY_H_ + +#if defined(HUNSPELL_STATIC) +# 
define LIBHUNSPELL_DLL_EXPORTED +#elif defined(_MSC_VER) +# if defined(BUILDING_LIBHUNSPELL) +# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport) +# else +# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllimport) +# endif +#elif defined(BUILDING_LIBHUNSPELL) && 1 +# define LIBHUNSPELL_DLL_EXPORTED __attribute__((__visibility__("default"))) +#else +# define LIBHUNSPELL_DLL_EXPORTED +#endif + +#endif diff --git a/libs/hunspell/src/hunzip.cxx b/libs/hunspell/src/hunzip.cxx new file mode 100644 index 000000000..b2788a105 --- /dev/null +++ b/libs/hunspell/src/hunzip.cxx @@ -0,0 +1,263 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "hunzip.hxx" +#include "csutil.hxx" + +#define CODELEN 65536 +#define BASEBITREC 5000 + +#define UNCOMPRESSED '\002' +#define MAGIC "hz0" +#define MAGIC_ENCRYPT "hz1" +#define MAGICLEN (sizeof(MAGIC) - 1) + +int Hunzip::fail(const char* err, const char* par) { + fprintf(stderr, err, par); + return -1; +} + +Hunzip::Hunzip(const char* file, const char* key) + : fin(NULL), bufsiz(0), lastbit(0), inc(0), inbits(0), outc(0), dec(NULL) { + in[0] = out[0] = line[0] = '\0'; + filename = mystrdup(file); + if (getcode(key) == -1) + bufsiz = -1; + else + bufsiz = getbuf(); +} + +int Hunzip::getcode(const char* key) { + unsigned char c[2]; + int i, j, n, p; + int allocatedbit = BASEBITREC; + const char* enc = key; + + if (!filename) + return -1; + + fin = myfopen(filename, "rb"); + if (!fin) + return -1; + + // read magic number + if ((fread(in, 1, 3, fin) < MAGICLEN) || + !(strncmp(MAGIC, in, MAGICLEN) == 0 || + strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) { + return fail(MSG_FORMAT, filename); + } + + // check encryption + if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) { + unsigned char cs; + if (!key) + return fail(MSG_KEY, filename); + if (fread(&c, 1, 1, fin) < 1) + return fail(MSG_FORMAT, filename); + for (cs = 0; *enc; enc++) + cs ^= *enc; + if (cs != c[0]) + return fail(MSG_KEY, filename); + enc = key; + } else + key = NULL; + + // read record count + if (fread(&c, 1, 2, fin) < 2) + return fail(MSG_FORMAT, filename); + + if (key) { + c[0] ^= *enc; + if (*(++enc) == '\0') + enc = key; + c[1] ^= *enc; + } + + n = ((int)c[0] << 8) + c[1]; + dec = (struct bit*)malloc(BASEBITREC * sizeof(struct bit)); + if (!dec) + return fail(MSG_MEMORY, filename); + dec[0].v[0] = 0; + dec[0].v[1] = 0; + + // read codes + for (i = 0; i < n; i++) { + unsigned char l; + if (fread(c, 1, 2, fin) < 2) + return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') + enc 
= key; + c[0] ^= *enc; + if (*(++enc) == '\0') + enc = key; + c[1] ^= *enc; + } + if (fread(&l, 1, 1, fin) < 1) + return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') + enc = key; + l ^= *enc; + } + if (fread(in, 1, l / 8 + 1, fin) < (size_t)l / 8 + 1) + return fail(MSG_FORMAT, filename); + if (key) + for (j = 0; j <= l / 8; j++) { + if (*(++enc) == '\0') + enc = key; + in[j] ^= *enc; + } + p = 0; + for (j = 0; j < l; j++) { + int b = (in[j / 8] & (1 << (7 - (j % 8)))) ? 1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + lastbit++; + if (lastbit == allocatedbit) { + allocatedbit += BASEBITREC; + dec = (struct bit*)realloc(dec, allocatedbit * sizeof(struct bit)); + } + dec[lastbit].v[0] = 0; + dec[lastbit].v[1] = 0; + dec[oldp].v[b] = lastbit; + p = lastbit; + } + } + dec[p].c[0] = c[0]; + dec[p].c[1] = c[1]; + } + return 0; +} + +Hunzip::~Hunzip() { + if (dec) + free(dec); + if (fin) + fclose(fin); + if (filename) + free(filename); +} + +int Hunzip::getbuf() { + int p = 0; + int o = 0; + do { + if (inc == 0) + inbits = fread(in, 1, BUFSIZE, fin) * 8; + for (; inc < inbits; inc++) { + int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 
1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + if (oldp == lastbit) { + fclose(fin); + fin = NULL; + // add last odd byte + if (dec[lastbit].c[0]) + out[o++] = dec[lastbit].c[1]; + return o; + } + out[o++] = dec[oldp].c[0]; + out[o++] = dec[oldp].c[1]; + if (o == BUFSIZE) + return o; + p = dec[p].v[b]; + } + } + inc = 0; + } while (inbits == BUFSIZE * 8); + return fail(MSG_FORMAT, filename); +} + +const char* Hunzip::getline() { + char linebuf[BUFSIZE]; + int l = 0, eol = 0, left = 0, right = 0; + if (bufsiz == -1) + return NULL; + while (l < bufsiz && !eol) { + linebuf[l++] = out[outc]; + switch (out[outc]) { + case '\t': + break; + case 31: { // escape + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + linebuf[l - 1] = out[outc]; + break; + } + case ' ': + break; + default: + if (((unsigned char)out[outc]) < 47) { + if (out[outc] > 32) { + right = out[outc] - 31; + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + } + if (out[outc] == 30) + left = 9; + else + left = out[outc]; + linebuf[l - 1] = '\n'; + eol = 1; + } + } + if (++outc == bufsiz) { + outc = 0; + bufsiz = fin ? getbuf() : -1; + } + } + if (right) + strcpy(linebuf + l - 1, line + strlen(line) - right - 1); + else + linebuf[l] = '\0'; + strcpy(line + left, linebuf); + return line; +} diff --git a/libs/hunspell/src/hunzip.hxx b/libs/hunspell/src/hunzip.hxx new file mode 100644 index 000000000..5082adddb --- /dev/null +++ b/libs/hunspell/src/hunzip.hxx @@ -0,0 +1,87 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ +
+/* hunzip: file decompression for sorted dictionaries with optional encryption,
+ * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
+
+#ifndef _HUNZIP_HXX_
+#define _HUNZIP_HXX_
+
+#include "hunvisapi.h"
+
+#include <stdio.h>
+
+// size of the raw input buffer and of the decoded output buffer, in bytes
+#define BUFSIZE 65536
+#define HZIP_EXTENSION ".hz"
+
+// printf-style diagnostics; %s receives the file name
+#define MSG_OPEN "error: %s: cannot open\n"
+#define MSG_FORMAT "error: %s: not in hzip format\n"
+#define MSG_MEMORY "error: %s: missing memory\n"
+#define MSG_KEY "error: %s: missing or bad password\n"
+
+// One node of the decoding table: `v[b]` is the index of the next node for
+// input bit b, `c` holds the two output bytes emitted for the node.
+struct bit {
+  unsigned char c[2];
+  int v[2];
+};
+
+// Line-oriented reader for hzip-compressed dictionary files.
+class LIBHUNSPELL_DLL_EXPORTED Hunzip {
+ private:
+  // declared but kept private: instances are not meant to be copied
+  Hunzip(const Hunzip&);
+  Hunzip& operator=(const Hunzip&);
+
+ protected:
+  char* filename;
+  FILE* fin;  // set to NULL by getbuf() once the last block has been decoded
+  // bufsiz: number of valid bytes in `out`, -1 when no more data is available
+  // outc:   read position of getline() inside `out`
+  int bufsiz, lastbit, inc, inbits, outc;
+  struct bit* dec;          // code table
+  char in[BUFSIZE];         // input buffer
+  char out[BUFSIZE + 1];    // Huffman-decoded buffer
+  char line[BUFSIZE + 50];  // decoded line
+  int getcode(const char* key);
+  // decodes the next block into `out` and returns the number of bytes produced
+  int getbuf();
+  int fail(const char* err, const char* par);
+
+ public:
+  Hunzip(const char* filename, const char* key = NULL);
+  ~Hunzip();
+  // returns the next decoded line (stored in `line`), or NULL at end of data
+  const char* getline();
+};
+
+#endif
diff --git a/libs/hunspell/src/langnum.hxx b/libs/hunspell/src/langnum.hxx new file mode 100644 index 000000000..af5c86e4f --- /dev/null +++ b/libs/hunspell/src/langnum.hxx @@ -0,0 +1,78 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell.
+ * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ +
+#ifndef _LANGNUM_HXX_
+#define _LANGNUM_HXX_
+
+/*
+ language numbers for language specific codes
+ see http://l10n.openoffice.org/languages.html
+*/
+
+// Numeric identifiers for the languages that have language-specific handling.
+// Entries marked "custom number" do not come from the list referenced above.
+// Note: 01, 02, 03 and 07 are octal literals; their values equal the decimal
+// ones, but do not add 08 or 09 in the same style.
+enum {
+  LANG_ar = 96,
+  LANG_az = 100,  // custom number
+  LANG_bg = 41,
+  LANG_ca = 37,
+  LANG_cs = 42,
+  LANG_da = 45,
+  LANG_de = 49,
+  LANG_el = 30,
+  LANG_en = 01,
+  LANG_es = 34,
+  LANG_eu = 10,
+  LANG_fr = 02,
+  LANG_gl = 38,
+  LANG_hr = 78,
+  LANG_hu = 36,
+  LANG_it = 39,
+  LANG_la = 99,   // custom number
+  LANG_lv = 101,  // custom number
+  LANG_nl = 31,
+  LANG_pl = 48,
+  LANG_pt = 03,
+  LANG_ru = 07,
+  LANG_sv = 50,
+  LANG_tr = 90,
+  LANG_uk = 80,
+  LANG_xx = 999
+};
+
+#endif
diff --git a/libs/hunspell/src/phonet.cxx b/libs/hunspell/src/phonet.cxx new file mode 100644 index 000000000..17350e74a --- /dev/null +++ b/libs/hunspell/src/phonet.cxx @@ -0,0 +1,274 @@ +/* phonetic.c - generic replacement algorithms for phonetic transformation + Copyright (C) 2000 Bjoern Jacke + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation; + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; If not, see + <http://www.gnu.org/licenses/>.
+ + Changelog: + + 2000-01-05 Bjoern Jacke <bjoern at j3e.de> + Initial Release insprired by the article about phonetic + transformations out of c't 25/1999 + + 2007-07-26 Bjoern Jacke <bjoern at j3e.de> + Released under MPL/GPL/LGPL tri-license for Hunspell + + 2007-08-23 Laszlo Nemeth <nemeth at OOo> + Porting from Aspell to Hunspell using C-like structs +*/ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include "csutil.hxx" +#include "phonet.hxx" + +void init_phonet_hash(phonetable& parms) { + int i, k; + + for (i = 0; i < HASHSIZE; i++) { + parms.hash[i] = -1; + } + + for (i = 0; parms.rules[i][0] != '\0'; i += 2) { + /** set hash value **/ + k = (unsigned char)parms.rules[i][0]; + + if (parms.hash[k] < 0) { + parms.hash[k] = i; + } + } +} + +// like strcpy but safe if the strings overlap +// but only if dest < src +static inline void strmove(char* dest, char* src) { + while (*src) + *dest++ = *src++; + *dest = '\0'; +} + +static int myisalpha(char ch) { + if ((unsigned char)ch < 128) + return isalpha(ch); + return 1; +} + +/* Do phonetic transformation. 
*/ +/* phonetic transcription algorithm */ +/* see: http://aspell.net/man-html/Phonetic-Code.html */ +/* convert string to uppercase before this call */ +std::string phonet(const std::string& inword, phonetable& parms) { + + int i, k = 0, p, z; + int k0, n0, p0 = -333, z0; + char c; + const char* s; + typedef unsigned char uchar; + + size_t len = inword.size(); + if (len > MAXPHONETUTF8LEN) + return std::string(); + char word[MAXPHONETUTF8LEN + 1]; + strncpy(word, inword.c_str(), MAXPHONETUTF8LEN); + word[MAXPHONETUTF8LEN] = '\0'; + + std::string target; + /** check word **/ + i = z = 0; + while ((c = word[i]) != '\0') { + int n = parms.hash[(uchar)c]; + z0 = 0; + + if (n >= 0) { + /** check all rules for the same letter **/ + while (parms.rules[n][0] == c) { + /** check whole string **/ + k = 1; /** number of found letters **/ + p = 5; /** default priority **/ + s = parms.rules[n]; + s++; /** important for (see below) "*(s-1)" **/ + + while (*s != '\0' && word[i + k] == *s && !isdigit((unsigned char)*s) && + strchr("(-<^$", *s) == NULL) { + k++; + s++; + } + if (*s == '(') { + /** check letters in "(..)" **/ + if (myisalpha(word[i + k]) // ...could be implied? 
+ && strchr(s + 1, word[i + k]) != NULL) { + k++; + while (*s != ')') + s++; + s++; + } + } + p0 = (int)*s; + k0 = k; + while (*s == '-' && k > 1) { + k--; + s++; + } + if (*s == '<') + s++; + if (isdigit((unsigned char)*s)) { + /** determine priority **/ + p = *s - '0'; + s++; + } + if (*s == '^' && *(s + 1) == '^') + s++; + + if (*s == '\0' || (*s == '^' && (i == 0 || !myisalpha(word[i - 1])) && + (*(s + 1) != '$' || (!myisalpha(word[i + k0])))) || + (*s == '$' && i > 0 && myisalpha(word[i - 1]) && + (!myisalpha(word[i + k0])))) { + /** search for followup rules, if: **/ + /** parms.followup and k > 1 and NO '-' in searchstring **/ + char c0 = word[i + k - 1]; + n0 = parms.hash[(uchar)c0]; + + // if (parms.followup && k > 1 && n0 >= 0 + if (k > 1 && n0 >= 0 && p0 != (int)'-' && word[i + k] != '\0') { + /** test follow-up rule for "word[i+k]" **/ + while (parms.rules[n0][0] == c0) { + /** check whole string **/ + k0 = k; + p0 = 5; + s = parms.rules[n0]; + s++; + while (*s != '\0' && word[i + k0] == *s && + !isdigit((unsigned char)*s) && + strchr("(-<^$", *s) == NULL) { + k0++; + s++; + } + if (*s == '(') { + /** check letters **/ + if (myisalpha(word[i + k0]) && + strchr(s + 1, word[i + k0]) != NULL) { + k0++; + while (*s != ')' && *s != '\0') + s++; + if (*s == ')') + s++; + } + } + while (*s == '-') { + /** "k0" gets NOT reduced **/ + /** because "if (k0 == k)" **/ + s++; + } + if (*s == '<') + s++; + if (isdigit((unsigned char)*s)) { + p0 = *s - '0'; + s++; + } + + if (*s == '\0' + /** *s == '^' cuts **/ + || (*s == '$' && !myisalpha(word[i + k0]))) { + if (k0 == k) { + /** this is just a piece of the string **/ + n0 += 2; + continue; + } + + if (p0 < p) { + /** priority too low **/ + n0 += 2; + continue; + } + /** rule fits; stop search **/ + break; + } + n0 += 2; + } /** End of "while (parms.rules[n0][0] == c0)" **/ + + if (p0 >= p && parms.rules[n0][0] == c0) { + n += 2; + continue; + } + } /** end of follow-up stuff **/ + + /** replace string **/ + s = 
parms.rules[n + 1]; + p0 = (parms.rules[n][0] != '\0' && + strchr(parms.rules[n] + 1, '<') != NULL) + ? 1 + : 0; + if (p0 == 1 && z == 0) { + /** rule with '<' is used **/ + if (!target.empty() && *s != '\0' && + (target[target.size()-1] == c || target[target.size()-1] == *s)) { + target.erase(target.size() - 1); + } + z0 = 1; + z = 1; + k0 = 0; + while (*s != '\0' && word[i + k0] != '\0') { + word[i + k0] = *s; + k0++; + s++; + } + if (k > k0) + strmove(&word[0] + i + k0, &word[0] + i + k); + + /** new "actual letter" **/ + c = word[i]; + } else { /** no '<' rule used **/ + i += k - 1; + z = 0; + while (*s != '\0' && *(s + 1) != '\0' && target.size() < len) { + if (target.empty() || target[target.size()-1] != *s) { + target.push_back(*s); + } + s++; + } + /** new "actual letter" **/ + c = *s; + if (parms.rules[n][0] != '\0' && + strstr(parms.rules[n] + 1, "^^") != NULL) { + if (c != '\0') { + target.push_back(c); + } + strmove(&word[0], &word[0] + i + 1); + i = 0; + z0 = 1; + } + } + break; + } /** end of follow-up stuff **/ + n += 2; + } /** end of while (parms.rules[n][0] == c) **/ + } /** end of if (n >= 0) **/ + if (z0 == 0) { + if (k && !p0 && target.size() < len && c != '\0' && + (1 || target.empty() || target[target.size()-1] != c)) { + /** condense only double letters **/ + target.push_back(c); + /// printf("\n setting \n"); + } + + i++; + z = 0; + k = 0; + } + } /** end of while ((c = word[i]) != '\0') **/ + + return target; +} /** end of function "phonet" **/ diff --git a/libs/hunspell/src/phonet.hxx b/libs/hunspell/src/phonet.hxx new file mode 100644 index 000000000..eb9fd0c62 --- /dev/null +++ b/libs/hunspell/src/phonet.hxx @@ -0,0 +1,52 @@ +/* phonetic.c - generic replacement aglogithms for phonetic transformation + Copyright (C) 2000 Bjoern Jacke + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation; + + This 
library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; If not, see + <http://www.gnu.org/licenses/>. + + Changelog: + + 2000-01-05 Bjoern Jacke <bjoern at j3e.de> + Initial Release inspired by the article about phonetic + transformations out of c't 25/1999 + + 2007-07-26 Bjoern Jacke <bjoern at j3e.de> + Released under MPL/GPL/LGPL tri-license for Hunspell + + 2007-08-23 Laszlo Nemeth <nemeth at OOo> + Porting from Aspell to Hunspell using C-like structs +*/ +
+#ifndef __PHONETHXX__
+#define __PHONETHXX__
+
+// number of buckets in phonetable::hash (one per possible first byte)
+#define HASHSIZE 256
+#define MAXPHONETLEN 256
+// phonet() returns an empty string for inputs longer than this many bytes
+#define MAXPHONETUTF8LEN (MAXPHONETLEN * 4)
+
+#include "hunvisapi.h"
+
+// NOTE(review): this header uses std::string and cs_info without including
+// their headers; it relies on the including file having pulled them in first.
+
+// Rule table for the phonetic transcription.
+struct phonetable {
+  char utf8;
+  cs_info* lang;
+  int num;
+  // pairs of strings: rules[2*i] is a search pattern, rules[2*i + 1] its
+  // replacement; the list ends with an empty pattern
+  char** rules;
+  // index into `rules` of the first pattern starting with a given byte,
+  // -1 if there is none; filled by init_phonet_hash()
+  int hash[HASHSIZE];
+};
+
+// (re)build parms.hash from parms.rules
+LIBHUNSPELL_DLL_EXPORTED void init_phonet_hash(phonetable& parms);
+
+// return the phonetic code of inword (expected to be upper case already)
+LIBHUNSPELL_DLL_EXPORTED std::string phonet(const std::string& inword,
+                                            phonetable& phone);
+
+#endif
diff --git a/libs/hunspell/src/replist.cxx b/libs/hunspell/src/replist.cxx new file mode 100644 index 000000000..b3e6b37d2 --- /dev/null +++ b/libs/hunspell/src/replist.cxx @@ -0,0 +1,193 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License.
+ * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <limits> + +#include "replist.hxx" +#include "csutil.hxx" + +RepList::RepList(int n) { + dat = (replentry**)malloc(sizeof(replentry*) * n); + if (dat == 0) + size = 0; + else + size = n; + pos = 0; +} + +RepList::~RepList() { + for (int i = 0; i < pos; i++) { + free(dat[i]->pattern); + free(dat[i]->pattern2); + free(dat[i]); + } + free(dat); +} + +int RepList::get_pos() { + return pos; +} + +replentry* RepList::item(int n) { + return dat[n]; +} + +int RepList::near(const char* word) { + int p1 = 0; + int p2 = pos; + while ((p2 - p1) > 1) { + int m = (p1 + p2) / 2; + int c = strcmp(word, dat[m]->pattern); + if (c <= 0) { + if (c < 0) + p2 = m; + else + p1 = p2 = m; + } else + p1 = m; + } + return p1; +} + +int RepList::match(const char* word, int n) { + if (strncmp(word, dat[n]->pattern, strlen(dat[n]->pattern)) == 0) + return strlen(dat[n]->pattern); + return 0; +} + +int RepList::add(char* pat1, char* pat2) { + if (pos >= size || pat1 == NULL || pat2 == NULL) + return 1; + replentry* r = (replentry*)malloc(sizeof(replentry)); + if (r == NULL) + return 1; + r->pattern = mystrrep(pat1, "_", " "); + r->pattern2 = mystrrep(pat2, "_", " "); + r->start = false; + r->end = false; + dat[pos++] = r; + for (int i = pos - 1; i > 0; i--) { + r = dat[i]; + if (strcmp(r->pattern, dat[i - 1]->pattern) < 0) { + dat[i] = dat[i - 1]; + dat[i - 1] = r; + } else + break; + } + return 0; +} + +int RepList::conv(const char* word, char* dest, size_t destsize) { + size_t stl = 0; + int change = 0; + for (size_t i = 0; i < strlen(word); i++) { + int n = near(word + i); + int l = match(word + i, n); + if (l) { + size_t replen = strlen(dat[n]->pattern2); + if (stl + replen >= destsize) + return -1; + strcpy(dest + stl, dat[n]->pattern2); + stl += replen; + i += l - 1; + change = 1; + } else { + if (stl + 1 >= destsize) + return -1; + dest[stl++] = word[i]; + } + } + dest[stl] = '\0'; + return change; +} + +bool 
RepList::conv(const char* word, std::string& dest) { + dest.clear(); + + bool change = false; + for (size_t i = 0; i < strlen(word); i++) { + int n = near(word + i); + int l = match(word + i, n); + if (l) { + dest.append(dat[n]->pattern2); + i += l - 1; + change = true; + } else { + dest.push_back(word[i]); + } + } + return change; +} diff --git a/libs/hunspell/src/replist.hxx b/libs/hunspell/src/replist.hxx new file mode 100644 index 000000000..0c5153625 --- /dev/null +++ b/libs/hunspell/src/replist.hxx @@ -0,0 +1,107 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +
+/* string replacement list class */
+#ifndef _REPLIST_HXX_
+#define _REPLIST_HXX_
+
+#include "hunvisapi.h"
+
+#include "w_char.hxx"
+
+#include <string>
+#include <vector>
+
+// Fixed-capacity list of (pattern, replacement) pairs kept sorted by pattern.
+class LIBHUNSPELL_DLL_EXPORTED RepList {
+ private:
+  // declared but kept private: instances are not meant to be copied
+  RepList(const RepList&);
+  RepList& operator=(const RepList&);
+
+ protected:
+  replentry** dat;  // entries, sorted by pattern; owned by the list
+  int size;         // capacity of dat
+  int pos;          // number of entries in use
+
+ public:
+  // n: maximum number of entries
+  RepList(int n);
+  ~RepList();
+
+  // number of stored entries
+  int get_pos();
+  // store a pair; returns 0 on success and 1 on failure
+  int add(char* pat1, char* pat2);
+  // entry at index n (no range check)
+  replentry* item(int n);
+  // index of the candidate entry for the text starting at word
+  int near(const char* word);
+  // length of pattern n if word starts with it, otherwise 0
+  int match(const char* word, int n);
+  // convert word into dest; returns 1 if changed, 0 if not, -1 if dest is
+  // too small
+  int conv(const char* word, char* dest, size_t destsize);
+  // convert word into dest; returns true if something was replaced
+  bool conv(const char* word, std::string& dest);
+};
+#endif
diff --git a/libs/hunspell/src/suggestmgr.cxx b/libs/hunspell/src/suggestmgr.cxx new file mode 100644 index 000000000..17becd758 --- /dev/null +++ b/libs/hunspell/src/suggestmgr.cxx @@ -0,0 +1,2192 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public
License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. 
Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include "suggestmgr.hxx" +#include "htypes.hxx" +#include "csutil.hxx" + +const w_char W_VLINE = {'\0', '|'}; + +SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) { + // register affix manager and check in string of chars to + // try when building candidate suggestions + pAMgr = aptr; + + csconv = NULL; + + ckeyl = 0; + ckey = NULL; + ckey_utf = NULL; + + ctryl = 0; + ctry = NULL; + ctry_utf = NULL; + + utf8 = 0; + langnum = 0; + complexprefixes = 0; + + maxSug = maxn; + nosplitsugs = 0; + maxngramsugs = MAXNGRAMSUGS; + maxcpdsugs = MAXCOMPOUNDSUGS; + + if (pAMgr) { + langnum = pAMgr->get_langnum(); + ckey = pAMgr->get_key_string(); + nosplitsugs = pAMgr->get_nosplitsugs(); + if (pAMgr->get_maxngramsugs() >= 0) + maxngramsugs = pAMgr->get_maxngramsugs(); + utf8 = pAMgr->get_utf8(); + if (pAMgr->get_maxcpdsugs() >= 0) + maxcpdsugs = pAMgr->get_maxcpdsugs(); + if (!utf8) { + char* enc = pAMgr->get_encoding(); + csconv = get_current_cs(enc); + free(enc); + } + complexprefixes = pAMgr->get_complexprefixes(); + } + + if (ckey) { + if (utf8) { + std::vector<w_char> t; + ckeyl = u8_u16(t, ckey); + ckey_utf = (w_char*)malloc(ckeyl * sizeof(w_char)); + if (ckey_utf) + memcpy(ckey_utf, &t[0], ckeyl * sizeof(w_char)); + else + ckeyl = 0; + } else { + ckeyl = strlen(ckey); + } + } + + if (tryme) { + ctry = mystrdup(tryme); + if (ctry) + ctryl = strlen(ctry); + if (ctry && utf8) { + std::vector<w_char> t; + ctryl = u8_u16(t, tryme); + ctry_utf = (w_char*)malloc(ctryl * sizeof(w_char)); + if (ctry_utf) + memcpy(ctry_utf, &t[0], ctryl * sizeof(w_char)); + else + ctryl = 0; + } + } +} + +SuggestMgr::~SuggestMgr() { + pAMgr = NULL; + if (ckey) + free(ckey); + ckey = NULL; + if (ckey_utf) + free(ckey_utf); + ckey_utf = NULL; + ckeyl = 0; + if (ctry) + free(ctry); + ctry = NULL; + if (ctry_utf) + free(ctry_utf); + ctry_utf = NULL; + ctryl = 0; + maxSug = 0; +#ifdef 
MOZILLA_CLIENT
+  delete[] csconv;
+#endif
+}
+
+// Append candidate to wlst (which holds ns entries) unless it is already
+// listed or checkword() rejects it.
+// Returns the new number of entries, maxSug when the list was already full,
+// or -1 when the copy cannot be allocated. In the failure case the entries
+// stored so far are released and their slots reset to NULL, so that the
+// clean-up loop in suggest() does not free them a second time.
+int SuggestMgr::testsug(char** wlst,
+                        const char* candidate,
+                        int wl,
+                        int ns,
+                        int cpdsuggest,
+                        int* timer,
+                        clock_t* timelimit) {
+  int cwrd = 1;
+  if (ns == maxSug)
+    return maxSug;
+  for (int k = 0; k < ns; k++) {
+    if (strcmp(candidate, wlst[k]) == 0) {
+      cwrd = 0;
+      break;
+    }
+  }
+  if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
+    wlst[ns] = mystrdup(candidate);
+    if (wlst[ns] == NULL) {
+      for (int j = 0; j < ns; j++) {
+        free(wlst[j]);
+        wlst[j] = NULL;
+      }
+      return -1;
+    }
+    ns++;
+  }
+  return ns;
+}
+
+// generate suggestions for a misspelled word
+// pass in address of array of char * pointers
+// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
+//
+// slst: in/out list; if *slst is NULL a list with maxSug slots is allocated
+// w:    the misspelled word
+// nsug: number of suggestions already present in *slst
+// returns the new number of suggestions, or -1 if memory ran out (the list
+// is freed and *slst set to NULL in that case)
+
+int SuggestMgr::suggest(char*** slst,
+                        const char* w,
+                        int nsug,
+                        int* onlycompoundsug) {
+  int nocompoundtwowords = 0;
+  char** wlst;
+  std::vector<w_char> word_utf;
+  int wl = 0;
+  int nsugorig = nsug;
+  std::string w2;
+  const char* word = w;
+  int oldSug = 0;
+
+  // word reversing wrapper for complex prefixes
+  if (complexprefixes) {
+    w2.assign(w);
+    if (utf8)
+      reverseword_utf(w2);
+    else
+      reverseword(w2);
+    word = w2.c_str();
+  }
+
+  if (*slst) {
+    wlst = *slst;
+  } else {
+    wlst = (char**)malloc(maxSug * sizeof(char*));
+    if (wlst == NULL)
+      return -1;
+    for (int i = 0; i < maxSug; i++) {
+      wlst[i] = NULL;
+    }
+  }
+
+  if (utf8) {
+    wl = u8_u16(word_utf, word);
+    if (wl == -1) {
+      *slst = wlst;
+      return nsug;
+    }
+  }
+
+  // first pass: plain words; second pass (only if the first one found
+  // nothing new): compound words
+  for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0);
+       cpdsuggest++) {
+    // limit compound suggestion
+    if (cpdsuggest > 0)
+      oldSug = nsug;
+
+    // suggestions for an uppercase word (html -> HTML)
+    if ((nsug < maxSug) && (nsug > -1)) {
+      nsug = (utf8) ? capchars_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : capchars(wlst, word, nsug, cpdsuggest);
+    }
+
+    // perhaps we made a typical spelling mistake
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = replchars(wlst, word, nsug, cpdsuggest);
+    }
+
+    // perhaps we chose the wrong char from a related set
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = mapchars(wlst, word, nsug, cpdsuggest);
+    }
+
+    // only suggest compound words when no other suggestion
+    if ((cpdsuggest == 0) && (nsug > nsugorig))
+      nocompoundtwowords = 1;
+
+    // did we swap the order of chars by mistake
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? swapchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : swapchar(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we swap the order of non adjacent chars by mistake
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? longswapchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : longswapchar(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we just hit the wrong key in place of a good char (case and keyboard)
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? badcharkey_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : badcharkey(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we add a char that should not be there
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? extrachar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : extrachar(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we forget a char
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? forgotchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : forgotchar(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we move a char
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? movechar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : movechar(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we just hit the wrong key in place of a good char
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? badchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : badchar(wlst, word, nsug, cpdsuggest);
+    }
+
+    // did we double two characters
+    if ((nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = (utf8) ? doubletwochars_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
+                    : doubletwochars(wlst, word, nsug, cpdsuggest);
+    }
+
+    // perhaps we forgot to hit space and two words ran together
+    if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) &&
+        (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
+      nsug = twowords(wlst, word, nsug, cpdsuggest);
+    }
+
+  }  // repeating ``for'' statement compounding support
+
+  if (nsug < 0) {
+    // we ran out of memory - we should free up as much as possible
+    for (int i = 0; i < maxSug; i++)
+      if (wlst[i] != NULL)
+        free(wlst[i]);
+    free(wlst);
+    wlst = NULL;
+  }
+
+  if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug)
+    *onlycompoundsug = 1;
+
+  *slst = wlst;
+  return nsug;
+}
+
+// suggestions for an uppercase word (html -> HTML)
+// UTF-16 variant: upper-cases a copy of the word and offers it via testsug()
+int SuggestMgr::capchars_utf(char** wlst,
+                             const w_char* word,
+                             int wl,
+                             int ns,
+                             int cpdsuggest) {
+  std::vector<w_char> candidate_utf(word, word + wl);
+  mkallcap_utf(candidate_utf, langnum);
+  std::string candidate;
+  u16_u8(candidate, candidate_utf);
+  return testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
+                 NULL);
+}
+
+// suggestions for an uppercase word (html -> HTML)
+int SuggestMgr::capchars(char** wlst, + const char* word, +
int ns, + int cpdsuggest) { + std::string candidate(word); + mkallcap(candidate, csconv); + return testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); +} + +// suggestions for when chose the wrong char out of a related set +int SuggestMgr::mapchars(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate; + clock_t timelimit; + int timer; + + int wl = strlen(word); + if (wl < 2 || !pAMgr) + return ns; + + int nummap = pAMgr->get_nummap(); + struct mapentry* maptable = pAMgr->get_maptable(); + if (maptable == NULL) + return ns; + + timelimit = clock(); + timer = MINTIMER; + return map_related(word, candidate, 0, wlst, cpdsuggest, ns, + maptable, nummap, &timer, &timelimit); +} + +int SuggestMgr::map_related(const char* word, + std::string& candidate, + int wn, + char** wlst, + int cpdsuggest, + int ns, + const mapentry* maptable, + int nummap, + int* timer, + clock_t* timelimit) { + if (*(word + wn) == '\0') { + int cwrd = 1; + for (int m = 0; m < ns; m++) { + if (candidate == wlst[m]) { + cwrd = 0; + break; + } + } + if ((cwrd) && checkword(candidate.c_str(), candidate.size(), cpdsuggest, timer, timelimit)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate.c_str()); + if (wlst[ns] == NULL) + return -1; + ns++; + } + } + return ns; + } + int in_map = 0; + for (int j = 0; j < nummap; j++) { + for (int k = 0; k < maptable[j].len; k++) { + int len = strlen(maptable[j].set[k]); + if (strncmp(maptable[j].set[k], word + wn, len) == 0) { + in_map = 1; + size_t cn = candidate.size(); + for (int l = 0; l < maptable[j].len; l++) { + candidate.resize(cn); + candidate.append(maptable[j].set[l]); + ns = map_related(word, candidate, wn + len, wlst, + cpdsuggest, ns, maptable, nummap, timer, timelimit); + if (!(*timer)) + return ns; + } + } + } + } + if (!in_map) { + candidate.push_back(*(word + wn)); + ns = map_related(word, candidate, wn + 1, wlst, cpdsuggest, ns, + maptable, nummap, timer, timelimit); + } + return 
ns; +} + +// suggestions for a typical fault of spelling, that +// differs with more, than 1 letter from the right form. +int SuggestMgr::replchars(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate; + int wl = strlen(word); + if (wl < 2 || !pAMgr) + return ns; + int numrep = pAMgr->get_numrep(); + struct replentry* reptable = pAMgr->get_reptable(); + if (reptable == NULL) + return ns; + for (int i = 0; i < numrep; i++) { + const char* r = word; + // search every occurence of the pattern in the word + while ((r = strstr(r, reptable[i].pattern)) != NULL && + (!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) && + (!reptable[i].start || r == word)) { + candidate.assign(word); + candidate.resize(r - word); + candidate.append(reptable[i].pattern2); + int lenp = strlen(reptable[i].pattern); + candidate.append(r + lenp); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + // check REP suggestions with space + size_t sp = candidate.find(' '); + if (sp != std::string::npos) { + size_t prev = 0; + while (sp != std::string::npos) { + std::string prev_chunk = candidate.substr(prev, sp - prev); + if (checkword(prev_chunk.c_str(), prev_chunk.size(), 0, NULL, NULL)) { + int oldns = ns; + std::string post_chunk = candidate.substr(sp + 1); + ns = testsug(wlst, post_chunk.c_str(), post_chunk.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + if (oldns < ns) { + free(wlst[ns - 1]); + wlst[ns - 1] = mystrdup(candidate.c_str()); + if (!wlst[ns - 1]) + return -1; + } + } + prev = sp + 1; + sp = candidate.find(' ', prev); + } + } + r++; // search for the next letter + } + } + return ns; +} + +// perhaps we doubled two characters (pattern aba -> ababa, for example vacation +// -> vacacation) +int SuggestMgr::doubletwochars(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + int state = 0; + int wl = strlen(word); + if (wl < 5 || !pAMgr) + 
return ns; + for (int i = 2; i < wl; i++) { + if (word[i] == word[i - 2]) { + state++; + if (state == 3) { + std::string candidate(word, word + i - 1); + candidate.insert(candidate.end(), word + i + 1, word + wl); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + state = 0; + } + } else { + state = 0; + } + } + return ns; +} + +// perhaps we doubled two characters (pattern aba -> ababa, for example vacation +// -> vacacation) +int SuggestMgr::doubletwochars_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + int state = 0; + if (wl < 5 || !pAMgr) + return ns; + for (int i = 2; i < wl; i++) { + if (word[i] == word[i - 2]) { + state++; + if (state == 3) { + std::vector<w_char> candidate_utf(word, word + i - 1); + candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl); + std::string candidate; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + state = 0; + } + } else { + state = 0; + } + } + return ns; +} + +// error is wrong char in place of correct one (case and keyboard related +// version) +int SuggestMgr::badcharkey(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate(word); + + // swap out each char one by one and try uppercase and neighbor + // keyboard chars in its place to see if that makes a good word + for (size_t i = 0; i < candidate.size(); ++i) { + char tmpc = candidate[i]; + // check with uppercase letters + candidate[i] = csconv[((unsigned char)tmpc)].cupper; + if (tmpc != candidate[i]) { + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + candidate[i] = tmpc; + } + // check neighbor characters in keyboard string + if (!ckey) + continue; + char* loc = strchr(ckey, tmpc); + while (loc) { + if ((loc > ckey) && (*(loc - 1) != '|')) { + 
candidate[i] = *(loc - 1); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + } + if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { + candidate[i] = *(loc + 1); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + } + loc = strchr(loc + 1, tmpc); + } + candidate[i] = tmpc; + } + return ns; +} + +// error is wrong char in place of correct one (case and keyboard related +// version) +int SuggestMgr::badcharkey_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::string candidate; + std::vector<w_char> candidate_utf(word, word + wl); + // swap out each char one by one and try all the tryme + // chars in its place to see if that makes a good word + for (int i = 0; i < wl; i++) { + w_char tmpc = candidate_utf[i]; + // check with uppercase letters + candidate_utf[i] = upper_utf(candidate_utf[i], 1); + if (tmpc != candidate_utf[i]) { + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + candidate_utf[i] = tmpc; + } + // check neighbor characters in keyboard string + if (!ckey) + continue; + w_char* loc = ckey_utf; + while ((loc < (ckey_utf + ckeyl)) && *loc != tmpc) + loc++; + while (loc < (ckey_utf + ckeyl)) { + if ((loc > ckey_utf) && *(loc - 1) != W_VLINE) { + candidate_utf[i] = *(loc - 1); + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + } + if (((loc + 1) < (ckey_utf + ckeyl)) && (*(loc + 1) != W_VLINE)) { + candidate_utf[i] = *(loc + 1); + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + } + do { + loc++; + } while ((loc < (ckey_utf + ckeyl)) && *loc != tmpc); + } + candidate_utf[i] = tmpc; + } 
+ return ns; +} + +// error is wrong char in place of correct one +int SuggestMgr::badchar(char** wlst, const char* word, int ns, int cpdsuggest) { + std::string candidate(word); + clock_t timelimit = clock(); + int timer = MINTIMER; + // swap out each char one by one and try all the tryme + // chars in its place to see if that makes a good word + for (int j = 0; j < ctryl; j++) { + for (std::string::reverse_iterator aI = candidate.rbegin(), aEnd = candidate.rend(); aI != aEnd; ++aI) { + char tmpc = *aI; + if (ctry[j] == tmpc) + continue; + *aI = ctry[j]; + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, &timelimit); + if (ns == -1) + return -1; + if (!timer) + return ns; + *aI = tmpc; + } + } + return ns; +} + +// error is wrong char in place of correct one +int SuggestMgr::badchar_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::vector<w_char> candidate_utf(word, word + wl); + std::string candidate; + clock_t timelimit = clock(); + int timer = MINTIMER; + // swap out each char one by one and try all the tryme + // chars in its place to see if that makes a good word + for (int j = 0; j < ctryl; j++) { + for (int i = wl - 1; i >= 0; i--) { + w_char tmpc = candidate_utf[i]; + if (tmpc == ctry_utf[j]) + continue; + candidate_utf[i] = ctry_utf[j]; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, + &timelimit); + if (ns == -1) + return -1; + if (!timer) + return ns; + candidate_utf[i] = tmpc; + } + } + return ns; +} + +// error is word has an extra letter it does not need +int SuggestMgr::extrachar_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::vector<w_char> candidate_utf(word, word + wl); + if (candidate_utf.size() < 2) + return ns; + // try omitting one char of word at a time + for (size_t i = 0; i < candidate_utf.size(); ++i) { + size_t index = candidate_utf.size() - 1 - i; + w_char tmpc = 
candidate_utf[index]; + candidate_utf.erase(candidate_utf.begin() + index); + std::string candidate; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + candidate_utf.insert(candidate_utf.begin() + index, tmpc); + } + return ns; +} + +// error is word has an extra letter it does not need +int SuggestMgr::extrachar(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate(word); + if (candidate.size() < 2) + return ns; + // try omitting one char of word at a time + for (size_t i = 0; i < candidate.size(); ++i) { + size_t index = candidate.size() - 1 - i; + char tmpc = candidate[index]; + candidate.erase(candidate.begin() + index); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + candidate.insert(candidate.begin() + index, tmpc); + } + return ns; +} + +// error is missing a letter it needs +int SuggestMgr::forgotchar(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate(word); + clock_t timelimit = clock(); + int timer = MINTIMER; + + // try inserting a tryme character before every letter (and the null + // terminator) + for (int k = 0; k < ctryl; ++k) { + for (size_t i = 0; i <= candidate.size(); ++i) { + size_t index = candidate.size() - i; + candidate.insert(candidate.begin() + index, ctry[k]); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, &timelimit); + if (ns == -1) + return -1; + if (!timer) + return ns; + candidate.erase(candidate.begin() + index); + } + } + return ns; +} + +// error is missing a letter it needs +int SuggestMgr::forgotchar_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::vector<w_char> candidate_utf(word, word + wl); + clock_t timelimit = clock(); + int timer = MINTIMER; + + // try inserting a tryme character at the end of the word and before 
every + // letter + for (int k = 0; k < ctryl; ++k) { + for (size_t i = 0; i <= candidate_utf.size(); ++i) { + size_t index = candidate_utf.size() - i; + candidate_utf.insert(candidate_utf.begin() + index, ctry_utf[k]); + std::string candidate; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, + &timelimit); + if (ns == -1) + return -1; + if (!timer) + return ns; + candidate_utf.erase(candidate_utf.begin() + index); + } + } + return ns; +} + +/* error is should have been two words */ +int SuggestMgr::twowords(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + int c1, c2; + int forbidden = 0; + int cwrd; + + int wl = strlen(word); + if (wl < 3) + return ns; + + if (langnum == LANG_hu) + forbidden = check_forbidden(word, wl); + + char* candidate = (char*)malloc(wl + 2); + strcpy(candidate + 1, word); + + // split the string into two pieces after every char + // if both pieces are good words make them a suggestion + for (char* p = candidate + 1; p[1] != '\0'; p++) { + p[-1] = *p; + // go to end of the UTF-8 character + while (utf8 && ((p[1] & 0xc0) == 0x80)) { + *p = p[1]; + p++; + } + if (utf8 && p[1] == '\0') + break; // last UTF-8 character + *p = '\0'; + c1 = checkword(candidate, strlen(candidate), cpdsuggest, NULL, NULL); + if (c1) { + c2 = checkword((p + 1), strlen(p + 1), cpdsuggest, NULL, NULL); + if (c2) { + *p = ' '; + + // spec. 
Hungarian code (need a better compound word support) + if ((langnum == LANG_hu) && !forbidden && + // if 3 repeating letter, use - instead of space + (((p[-1] == p[1]) && + (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || + // or multiple compounding, with more, than 6 syllables + ((c1 == 3) && (c2 >= 2)))) + *p = '-'; + + cwrd = 1; + for (int k = 0; k < ns; k++) { + if (strcmp(candidate, wlst[k]) == 0) { + cwrd = 0; + break; + } + } + if (ns < maxSug) { + if (cwrd) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) { + free(candidate); + return -1; + } + ns++; + } + } else { + free(candidate); + return ns; + } + // add two word suggestion with dash, if TRY string contains + // "a" or "-" + // NOTE: cwrd doesn't modified for REP twoword sugg. + if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && + mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) { + *p = '-'; + for (int k = 0; k < ns; k++) { + if (strcmp(candidate, wlst[k]) == 0) { + cwrd = 0; + break; + } + } + if (ns < maxSug) { + if (cwrd) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) { + free(candidate); + return -1; + } + ns++; + } + } else { + free(candidate); + return ns; + } + } + } + } + } + free(candidate); + return ns; +} + +// error is adjacent letter were swapped +int SuggestMgr::swapchar(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate(word); + if (candidate.size() < 2) + return ns; + + // try swapping adjacent chars one by one + for (size_t i = 0; i < candidate.size() - 1; ++i) { + std::swap(candidate[i], candidate[i+1]); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + std::swap(candidate[i], candidate[i+1]); + } + + // try double swaps for short words + // ahev -> have, owudl -> would + if (candidate.size() == 4 || candidate.size() == 5) { + candidate[0] = word[1]; + candidate[1] = word[0]; + candidate[2] = word[2]; + 
candidate[candidate.size() - 2] = word[candidate.size() - 1]; + candidate[candidate.size() - 1] = word[candidate.size() - 2]; + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + if (candidate.size() == 5) { + candidate[0] = word[0]; + candidate[1] = word[2]; + candidate[2] = word[1]; + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + } + } + + return ns; +} + +// error is adjacent letter were swapped +int SuggestMgr::swapchar_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::vector<w_char> candidate_utf(word, word + wl); + if (candidate_utf.size() < 2) + return ns; + + std::string candidate; + // try swapping adjacent chars one by one + for (size_t i = 0; i < candidate_utf.size() - 1; ++i) { + std::swap(candidate_utf[i], candidate_utf[i+1]); + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + std::swap(candidate_utf[i], candidate_utf[i+1]); + } + + // try double swaps for short words + // ahev -> have, owudl -> would, suodn -> sound + if (candidate_utf.size() == 4 || candidate_utf.size() == 5) { + candidate_utf[0] = word[1]; + candidate_utf[1] = word[0]; + candidate_utf[2] = word[2]; + candidate_utf[candidate_utf.size() - 2] = word[candidate_utf.size() - 1]; + candidate_utf[candidate_utf.size() - 1] = word[candidate_utf.size() - 2]; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + if (candidate_utf.size() == 5) { + candidate_utf[0] = word[0]; + candidate_utf[1] = word[2]; + candidate_utf[2] = word[1]; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + } + } + return ns; +} + +// error is not 
adjacent letter were swapped +int SuggestMgr::longswapchar(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate(word); + // try swapping not adjacent chars one by one + for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { + for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) { + if (abs(std::distance(q, p)) > 1) { + std::swap(*p, *q); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + std::swap(*p, *q); + } + } + } + return ns; +} + +// error is adjacent letter were swapped +int SuggestMgr::longswapchar_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::vector<w_char> candidate_utf(word, word + wl); + // try swapping not adjacent chars + for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { + for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) { + if (abs(std::distance(q, p)) > 1) { + std::swap(*p, *q); + std::string candidate; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + std::swap(*p, *q); + } + } + } + return ns; +} + +// error is a letter was moved +int SuggestMgr::movechar(char** wlst, + const char* word, + int ns, + int cpdsuggest) { + std::string candidate(word); + if (candidate.size() < 2) + return ns; + + // try moving a char + for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { + for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) < 10; ++q) { + std::swap(*q, *(q - 1)); + if (std::distance(p, q) < 2) + continue; // omit swap char + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + } + std::copy(word, word + candidate.size(), candidate.begin()); + } + + for 
(std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) { + for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) < 10; ++q) { + std::swap(*q, *(q - 1)); + if (std::distance(p, q) < 2) + continue; // omit swap char + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); + if (ns == -1) + return -1; + } + std::copy(word, word + candidate.size(), candidate.begin()); + } + + return ns; +} + +// error is a letter was moved +int SuggestMgr::movechar_utf(char** wlst, + const w_char* word, + int wl, + int ns, + int cpdsuggest) { + std::vector<w_char> candidate_utf(word, word + wl); + if (candidate_utf.size() < 2) + return ns; + + // try moving a char + for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { + for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) < 10; ++q) { + std::swap(*q, *(q - 1)); + if (std::distance(p, q) < 2) + continue; // omit swap char + std::string candidate; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + } + std::copy(word, word + candidate_utf.size(), candidate_utf.begin()); + } + + for (std::vector<w_char>::iterator p = candidate_utf.begin() + candidate_utf.size() - 1; p > candidate_utf.begin(); --p) { + for (std::vector<w_char>::iterator q = p - 1; q >= candidate_utf.begin() && std::distance(q, p) < 10; --q) { + std::swap(*q, *(q + 1)); + if (std::distance(q, p) < 2) + continue; // omit swap char + std::string candidate; + u16_u8(candidate, candidate_utf); + ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, + NULL); + if (ns == -1) + return -1; + } + std::copy(word, word + candidate_utf.size(), candidate_utf.begin()); + } + + return ns; +} + +// generate a set of suggestions for very poorly spelled words +int 
SuggestMgr::ngsuggest(char** wlst, + const char* w, + int ns, + HashMgr** pHMgr, + int md) { + int i, j; + int lval; + int sc; + int lp, lpphon; + int nonbmp = 0; + + // exhaustively search through all root words + // keeping track of the MAX_ROOTS most similar root words + struct hentry* roots[MAX_ROOTS]; + char* rootsphon[MAX_ROOTS]; + int scores[MAX_ROOTS]; + int scoresphon[MAX_ROOTS]; + for (i = 0; i < MAX_ROOTS; i++) { + roots[i] = NULL; + scores[i] = -100 * i; + rootsphon[i] = NULL; + scoresphon[i] = -100 * i; + } + lp = MAX_ROOTS - 1; + lpphon = MAX_ROOTS - 1; + int low = NGRAM_LOWERING; + + std::string w2; + const char* word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + w2.assign(w); + if (utf8) + reverseword_utf(w2); + else + reverseword(w2); + word = w2.c_str(); + } + + std::vector<w_char> u8; + int nc = strlen(word); + int n = (utf8) ? u8_u16(u8, word) : nc; + + // set character based ngram suggestion for words with non-BMP Unicode + // characters + if (n == -1) { + utf8 = 0; // XXX not state-free + n = nc; + nonbmp = 1; + low = 0; + } + + struct hentry* hp = NULL; + int col = -1; + phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; + std::string target; + std::string candidate; + if (ph) { + if (utf8) { + std::vector<w_char> _w; + u8_u16(_w, word); + mkallcap_utf(_w, langnum); + u16_u8(candidate, _w); + } else { + candidate.assign(word); + if (!nonbmp) + mkallcap(candidate, csconv); + } + target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n) + } + + FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL; + FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL; + FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; + FLAG onlyincompound = pAMgr ? 
pAMgr->get_onlyincompound() : FLAG_NULL; + + for (i = 0; i < md; i++) { + while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { + if ((hp->astr) && (pAMgr) && + (TESTAFF(hp->astr, forbiddenword, hp->alen) || + TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || + TESTAFF(hp->astr, nosuggest, hp->alen) || + TESTAFF(hp->astr, nongramsuggest, hp->alen) || + TESTAFF(hp->astr, onlyincompound, hp->alen))) + continue; + + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + + leftcommonsubstring(word, HENTRY_WORD(hp)); + + // check special pronounciation + std::string f; + if ((hp->var & H_OPT_PHON) && + copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { + int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + + +leftcommonsubstring(word, f.c_str()); + if (sc2 > sc) + sc = sc2; + } + + int scphon = -20000; + if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { + if (utf8) { + std::vector<w_char> _w; + u8_u16(_w, HENTRY_WORD(hp)); + mkallcap_utf(_w, langnum); + u16_u8(candidate, _w); + } else { + candidate.assign(HENTRY_WORD(hp)); + mkallcap(candidate, csconv); + } + std::string target2 = phonet(candidate, *ph); + scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); + } + + if (sc > scores[lp]) { + scores[lp] = sc; + roots[lp] = hp; + lval = sc; + for (j = 0; j < MAX_ROOTS; j++) + if (scores[j] < lval) { + lp = j; + lval = scores[j]; + } + } + + if (scphon > scoresphon[lpphon]) { + scoresphon[lpphon] = scphon; + rootsphon[lpphon] = HENTRY_WORD(hp); + lval = scphon; + for (j = 0; j < MAX_ROOTS; j++) + if (scoresphon[j] < lval) { + lpphon = j; + lval = scoresphon[j]; + } + } + } + } + + // find minimum threshold for a passable suggestion + // mangle original word three differnt ways + // and score them to generate a minimum acceptable score + int thresh = 0; + for (int sp = 1; sp < 4; sp++) { + if (utf8) { + for (int k = sp; k < n; k += 4) { + u8[k].l = '*'; + u8[k].h = 0; + } + std::string mw; + u16_u8(mw, u8); + thresh = thresh + ngram(n, word, mw, 
NGRAM_ANY_MISMATCH + low); + } else { + std::string mw(word); + for (int k = sp; k < n; k += 4) + mw[k] = '*'; + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); + } + } + thresh = thresh / 3; + thresh--; + + // now expand affixes on each of these root words and + // and use length adjusted ngram scores to select + // possible suggestions + char* guess[MAX_GUESS]; + char* guessorig[MAX_GUESS]; + int gscore[MAX_GUESS]; + for (i = 0; i < MAX_GUESS; i++) { + guess[i] = NULL; + guessorig[i] = NULL; + gscore[i] = -100 * i; + } + + lp = MAX_GUESS - 1; + + struct guessword* glst; + glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword)); + if (!glst) { + if (nonbmp) + utf8 = 1; + return ns; + } + + for (i = 0; i < MAX_ROOTS; i++) { + if (roots[i]) { + struct hentry* rp = roots[i]; + + std::string f; + const char *field = NULL; + if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) + field = f.c_str(); + int nw = pAMgr->expand_rootword( + glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word, + nc, field); + + for (int k = 0; k < nw; k++) { + sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + + leftcommonsubstring(word, glst[k].word); + + if (sc > thresh) { + if (sc > gscore[lp]) { + if (guess[lp]) { + free(guess[lp]); + if (guessorig[lp]) { + free(guessorig[lp]); + guessorig[lp] = NULL; + } + } + gscore[lp] = sc; + guess[lp] = glst[k].word; + guessorig[lp] = glst[k].orig; + lval = sc; + for (j = 0; j < MAX_GUESS; j++) + if (gscore[j] < lval) { + lp = j; + lval = gscore[j]; + } + } else { + free(glst[k].word); + if (glst[k].orig) + free(glst[k].orig); + } + } else { + free(glst[k].word); + if (glst[k].orig) + free(glst[k].orig); + } + } + } + } + free(glst); + + // now we are done generating guesses + // sort in order of decreasing score + + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); + if (ph) + bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); + + // weight suggestions 
with a similarity index, based on + // the longest common subsequent algorithm and resort + + int is_swap = 0; + int re = 0; + double fact = 1.0; + if (pAMgr) { + int maxd = pAMgr->get_maxdiff(); + if (maxd >= 0) + fact = (10.0 - maxd) / 5.0; + } + + for (i = 0; i < MAX_GUESS; i++) { + if (guess[i]) { + // lowering guess[i] + std::string gl; + int len; + if (utf8) { + std::vector<w_char> _w; + len = u8_u16(_w, guess[i]); + mkallsmall_utf(_w, langnum); + u16_u8(gl, _w); + } else { + gl.assign(guess[i]); + if (!nonbmp) + mkallsmall(gl, csconv); + len = strlen(guess[i]); + } + + int _lcs = lcslen(word, gl.c_str()); + + // same characters with different casing + if ((n == len) && (n == _lcs)) { + gscore[i] += 2000; + break; + } + // using 2-gram instead of 3, and other weightening + + re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + + ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + + gscore[i] = + // length of longest common subsequent minus length difference + 2 * _lcs - abs((int)(n - len)) + + // weight length of the left common substring + leftcommonsubstring(word, gl.c_str()) + + // weight equal character positions + (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) + ? 1 + : 0) + + // swap character (not neighboring) + ((is_swap) ? 10 : 0) + + // ngram + ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + + // weighted ngrams + re + + // different limit for dictionaries with PHONE rules + (ph ? (re < len * fact ? -1000 : 0) + : (re < (n + len) * fact ? 
-1000 : 0)); + } + } + + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); + + // phonetic version + if (ph) + for (i = 0; i < MAX_ROOTS; i++) { + if (rootsphon[i]) { + // lowering rootphon[i] + std::string gl; + int len; + if (utf8) { + std::vector<w_char> _w; + len = u8_u16(_w, rootsphon[i]); + mkallsmall_utf(_w, langnum); + u16_u8(gl, _w); + } else { + gl.assign(rootsphon[i]); + if (!nonbmp) + mkallsmall(gl, csconv); + len = strlen(rootsphon[i]); + } + + // heuristic weigthing of ngram scores + scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + + // weight length of the left common substring + leftcommonsubstring(word, gl.c_str()); + } + } + + if (ph) + bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); + + // copy over + int oldns = ns; + + int same = 0; + for (i = 0; i < MAX_GUESS; i++) { + if (guess[i]) { + if ((ns < oldns + maxngramsugs) && (ns < maxSug) && + (!same || (gscore[i] > 1000))) { + int unique = 1; + // leave only excellent suggestions, if exists + if (gscore[i] > 1000) + same = 1; + else if (gscore[i] < -100) { + same = 1; + // keep the best ngram suggestions, unless in ONLYMAXDIFF mode + if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) { + free(guess[i]); + if (guessorig[i]) + free(guessorig[i]); + continue; + } + } + for (j = 0; j < ns; j++) { + // don't suggest previous suggestions or a previous suggestion with + // prefixes or affixes + if ((!guessorig[i] && strstr(guess[i], wlst[j])) || + (guessorig[i] && strstr(guessorig[i], wlst[j])) || + // check forbidden words + !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) { + unique = 0; + break; + } + } + if (unique) { + wlst[ns++] = guess[i]; + if (guessorig[i]) { + free(guess[i]); + wlst[ns - 1] = guessorig[i]; + } + } else { + free(guess[i]); + if (guessorig[i]) + free(guessorig[i]); + } + } else { + free(guess[i]); + if (guessorig[i]) + free(guessorig[i]); + } + } + } + + oldns = ns; + if (ph) + for (i = 0; i < MAX_ROOTS; i++) { + if (rootsphon[i]) { 
// see if a candidate suggestion is spelled correctly
// needs to check both root words and words with affixes
//
// word, len    candidate and its length, passed on to the affix manager
// cpdsuggest   1 selects the compound-only check, any other value the
//              dictionary / affix check
// timer        optional countdown, decremented on every call; may be NULL
// timelimit    optional start time that is compared with clock() whenever
//              the countdown reaches 0; may be NULL
//
// Returns 0 when the candidate is rejected (also when there is no affix
// manager or the time limit is exceeded), otherwise 1, 2 or 3.

// obsolete MySpell-HU modifications:
// return value 2 and 3 marks compounding with hyphen (-)
// `3' marks roots without suffix
int SuggestMgr::checkword(const char* word,
                          int len,
                          int cpdsuggest,
                          int* timer,
                          clock_t* timelimit) {
  struct hentry* rv = NULL;
  struct hentry* rv2 = NULL;
  int nosuffix = 0;

  // check time limit
  // When the countdown hits 0 and more than TIMELIMIT has elapsed, *timer is
  // left at 0, the value callers in this file test for to stop searching.
  // Otherwise the countdown is re-armed with MAXPLUSTIMER.
  if (timer) {
    (*timer)--;
    if (!(*timer) && timelimit) {
      if ((clock() - *timelimit) > TIMELIMIT)
        return 0;
      *timer = MAXPLUSTIMER;
    }
  }

  if (pAMgr) {
    // compound-only mode: accept if compound_check() succeeds and the plain
    // dictionary entry (if any) is not flagged forbidden or nosuggest
    if (cpdsuggest == 1) {
      if (pAMgr->get_compound()) {
        struct hentry* rwords[100];  // buffer for COMPOUND pattern checking
        rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 1,
                                   0);  // EXT
        if (rv &&
            (!(rv2 = pAMgr->lookup(word)) || !rv2->astr ||
             !(TESTAFF(rv2->astr, pAMgr->get_forbiddenword(), rv2->alen) ||
               TESTAFF(rv2->astr, pAMgr->get_nosuggest(), rv2->alen))))
          return 3;  // XXX obsolete categorisation + only ICONV needs affix
                     // flag check?
      }
      return 0;
    }

    rv = pAMgr->lookup(word);

    if (rv) {
      // the first homonym being forbidden or nosuggest rejects the word
      if ((rv->astr) &&
          (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
           TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen)))
        return 0;
      // skip homonyms flagged needaffix, ONLYUPCASEFLAG or onlyincompound;
      // rv ends up NULL if every homonym carries one of these flags
      while (rv) {
        if (rv->astr &&
            (TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
             TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) {
          rv = rv->next_homonym;
        } else
          break;
      }
    } else
      rv = pAMgr->prefix_check(word, len,
                               0);  // only prefix, and prefix + suffix XXX

    // a hit from the lookup or the prefix check is recorded in nosuffix,
    // which is added to the return value 2 below; otherwise try suffixes
    if (rv) {
      nosuffix = 1;
    } else {
      rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0,
                               NULL);  // only suffix
    }

    // last resort, only when have_contclass() is true: the *_twosfx checks
    if (!rv && pAMgr->have_contclass()) {
      rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
      if (!rv)
        rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
    }

    // check forbidden words
    if ((rv) && (rv->astr) &&
        (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
         TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
         TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) ||
         TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen)))
      return 0;

    // accepted: 2 (or 3 with nosuffix) when the entry carries the compound
    // flag, 1 otherwise
    if (rv) {  // XXX obsolete
      if ((pAMgr->get_compoundflag()) &&
          TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen))
        return 2 + nosuffix;
      return 1;
    }
  }
  return 0;
}
char* r = (char*)result; + char* st; + + struct hentry* rv = NULL; + + *result = '\0'; + + if (!pAMgr) + return NULL; + + std::string w2; + const char* word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + w2.assign(w); + if (utf8) + reverseword_utf(w2); + else + reverseword(w2); + word = w2.c_str(); + } + + rv = pAMgr->lookup(word); + + while (rv) { + if ((!rv->astr) || + !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) { + if (!HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, word, MAXLNLEN); + } + if (HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + } + mystrcat(result, "\n", MAXLNLEN); + } + rv = rv->next_homonym; + } + + st = pAMgr->affix_check_morph(word, strlen(word)); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + + if (pAMgr->get_compound() && (*result == '\0')) { + struct hentry* rwords[100]; // buffer for COMPOUND pattern checking + pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, &r, + NULL); + } + + return (*result) ? 
mystrdup(line_uniq(result, MSEP_REC)) : NULL; +} + +/* affixation */ +char* SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { + char result[MAXLNLEN]; + *result = '\0'; + int sfxcount = get_sfxcount(pattern); + + if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) + return NULL; + + if (HENTRY_DATA(rv)) { + char* aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, + HENTRY_DATA(rv), pattern, 0); + if (aff) { + mystrcat(result, aff, MAXLNLEN); + mystrcat(result, "\n", MAXLNLEN); + free(aff); + } + } + + // check all allomorphs + char allomorph[MAXLNLEN]; + char* p = NULL; + if (HENTRY_DATA(rv)) + p = (char*)strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); + while (p) { + struct hentry* rv2 = NULL; + p += MORPH_TAG_LEN; + int plen = fieldlen(p); + strncpy(allomorph, p, plen); + allomorph[plen] = '\0'; + rv2 = pAMgr->lookup(allomorph); + while (rv2) { + // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= + // sfxcount) { + if (HENTRY_DATA(rv2)) { + char* st = (char*)strstr(HENTRY_DATA2(rv2), MORPH_STEM); + if (st && (strncmp(st + MORPH_TAG_LEN, HENTRY_WORD(rv), + fieldlen(st + MORPH_TAG_LEN)) == 0)) { + char* aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, + rv2->alen, HENTRY_DATA(rv2), pattern, 0); + if (aff) { + mystrcat(result, aff, MAXLNLEN); + mystrcat(result, "\n", MAXLNLEN); + free(aff); + } + } + } + rv2 = rv2->next_homonym; + } + p = strstr(p + plen, MORPH_ALLOMORPH); + } + + return (*result) ? 
mystrdup(result) : NULL; +} + +char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) { + if (n == 0 || !pAMgr) + return NULL; + + std::string result2; + std::string newpattern; + struct hentry* rv = NULL; + + // search affixed forms with and without derivational suffixes + while (1) { + for (int k = 0; k < n; k++) { + std::string result; + + // add compound word parts (except the last one) + char* s = (char*)desc[k]; + char* part = strstr(s, MORPH_PART); + if (part) { + char* nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + std::string field; + copy_field(field, part, MORPH_PART); + result.append(field); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } + + char** pl; + std::string tok(s); + size_t pos = tok.find(" | "); + while (pos != std::string::npos) { + tok[pos + 1] = MSEP_ALT; + pos = tok.find(" | ", pos); + } + int pln = line_tok(tok.c_str(), &pl, MSEP_ALT); + for (int i = 0; i < pln; i++) { + // remove inflectional and terminal suffixes + char* is = strstr(pl[i], MORPH_INFL_SFX); + if (is) + *is = '\0'; + char* ts = strstr(pl[i], MORPH_TERM_SFX); + while (ts) { + *ts = '_'; + ts = strstr(pl[i], MORPH_TERM_SFX); + } + char* st = strstr(s, MORPH_STEM); + if (st) { + copy_field(tok, st, MORPH_STEM); + rv = pAMgr->lookup(tok.c_str()); + while (rv) { + std::string newpat(pl[i]); + newpat.append(pattern); + char* sg = suggest_hentry_gen(rv, newpat.c_str()); + if (!sg) + sg = suggest_hentry_gen(rv, pattern); + if (sg) { + char** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + sg = NULL; + for (int j = 0; j < genl; j++) { + result2.push_back(MSEP_REC); + result2.append(result); + if (strstr(pl[i], MORPH_SURF_PFX)) { + std::string field; + copy_field(field, pl[i], MORPH_SURF_PFX); + result2.append(field); + } + result2.append(gen[j]); + } + freelist(&gen, genl); + } + rv = rv->next_homonym; + } + } + } + freelist(&pl, pln); + } + + if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX)) 
+ break; + + newpattern.assign(pattern); + mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX); + pattern = newpattern.c_str(); + } + return (!result2.empty() ? mystrdup(result2.c_str()) : NULL); +} + +// generate an n-gram score comparing s1 and s2 +int SuggestMgr::ngram(int n, + const std::string& s1, + const std::string& s2, + int opt) { + int nscore = 0; + int ns; + int l1; + int l2; + int test = 0; + + if (utf8) { + std::vector<w_char> su1; + std::vector<w_char> su2; + l1 = u8_u16(su1, s1); + l2 = u8_u16(su2, s2); + if ((l2 <= 0) || (l1 == -1)) + return 0; + // lowering dictionary word + if (opt & NGRAM_LOWERING) + mkallsmall_utf(su2, langnum); + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1 - j); i++) { + int k = 0; + for (int l = 0; l <= (l2 - j); l++) { + for (k = 0; k < j; k++) { + w_char& c1 = su1[i + k]; + w_char& c2 = su2[l + k]; + if ((c1.l != c2.l) || (c1.h != c2.h)) + break; + } + if (k == j) { + ns++; + break; + } + } + if (k != j && opt & NGRAM_WEIGHTED) { + ns--; + test++; + if (i == 0 || i == l1 - j) + ns--; // side weight + } + } + nscore = nscore + ns; + if (ns < 2 && !(opt & NGRAM_WEIGHTED)) + break; + } + } else { + l2 = s2.size(); + if (l2 == 0) + return 0; + l1 = s1.size(); + std::string t(s2); + if (opt & NGRAM_LOWERING) + mkallsmall(t, csconv); + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1 - j); i++) { + std::string temp(s1.substr(i, j)); + if (t.find(temp) != std::string::npos) { + ns++; + } else if (opt & NGRAM_WEIGHTED) { + ns--; + test++; + if (i == 0 || i == l1 - j) + ns--; // side weight + } + } + nscore = nscore + ns; + if (ns < 2 && !(opt & NGRAM_WEIGHTED)) + break; + } + } + + ns = 0; + if (opt & NGRAM_LONGER_WORSE) + ns = (l2 - l1) - 2; + if (opt & NGRAM_ANY_MISMATCH) + ns = abs(l2 - l1) - 2; + ns = (nscore - ((ns > 0) ? 
ns : 0)); + return ns; +} + +// length of the left common substring of s1 and (decapitalised) s2 +int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) { + if (utf8) { + std::vector<w_char> su1; + std::vector<w_char> su2; + int l1 = u8_u16(su1, s1); + int l2 = u8_u16(su2, s2); + // decapitalize dictionary word + if (complexprefixes) { + if (su1[l1 - 1] == su2[l2 - 1]) + return 1; + } else { + unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; + unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; + if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) + return 0; + int i; + for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && + (su1[i].h == su2[i].h); + i++) + ; + return i; + } + } else { + if (complexprefixes) { + int l1 = strlen(s1); + int l2 = strlen(s2); + if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) + return 1; + } else if (csconv) { + const char* olds = s1; + // decapitalise dictionary word + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) + return 0; + do { + s1++; + s2++; + } while ((*s1 == *s2) && (*s1 != '\0')); + return (int)(s1 - olds); + } + } + return 0; +} + +int SuggestMgr::commoncharacterpositions(const char* s1, + const char* s2, + int* is_swap) { + int num = 0; + int diff = 0; + int diffpos[2]; + *is_swap = 0; + if (utf8) { + std::vector<w_char> su1; + std::vector<w_char> su2; + int l1 = u8_u16(su1, s1); + int l2 = u8_u16(su2, s2); + + if (l1 <= 0 || l2 <= 0) + return 0; + + // decapitalize dictionary word + if (complexprefixes) { + su2[l2 - 1] = lower_utf(su2[l2 - 1], langnum); + } else { + su2[0] = lower_utf(su2[0], langnum); + } + for (int i = 0; (i < l1) && (i < l2); i++) { + if (su1[i] == su2[i]) { + num++; + } else { + if (diff < 2) + diffpos[diff] = i; + diff++; + } + } + if ((diff == 2) && (l1 == l2) && + (su1[diffpos[0]] == su2[diffpos[1]]) && + (su1[diffpos[1]] == su2[diffpos[0]])) + *is_swap = 1; + } else { + size_t i; + std::string t(s2); + // 
decapitalize dictionary word + if (complexprefixes) { + size_t l2 = t.size(); + t[l2 - 1] = csconv[(unsigned char)t[l2 - 1]].clower; + } else { + mkallsmall(t, csconv); + } + for (i = 0; (*(s1 + i) != 0) && i < t.size(); i++) { + if (*(s1 + i) == t[i]) { + num++; + } else { + if (diff < 2) + diffpos[diff] = i; + diff++; + } + } + if ((diff == 2) && (*(s1 + i) == 0) && i == t.size() && + (*(s1 + diffpos[0]) == t[diffpos[1]]) && + (*(s1 + diffpos[1]) == t[diffpos[0]])) + *is_swap = 1; + } + return num; +} + +int SuggestMgr::mystrlen(const char* word) { + if (utf8) { + std::vector<w_char> w; + return u8_u16(w, word); + } else + return strlen(word); +} + +// sort in decreasing order of score +void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n) { + int m = 1; + while (m < n) { + int j = m; + while (j > 0) { + if (rsc[j - 1] < rsc[j]) { + int sctmp = rsc[j - 1]; + char* wdtmp = rword[j - 1]; + rsc[j - 1] = rsc[j]; + rword[j - 1] = rword[j]; + rsc[j] = sctmp; + rword[j] = wdtmp; + if (rword2) { + wdtmp = rword2[j - 1]; + rword2[j - 1] = rword2[j]; + rword2[j] = wdtmp; + } + j--; + } else + break; + } + m++; + } + return; +} + +// longest common subsequence +void SuggestMgr::lcs(const char* s, + const char* s2, + int* l1, + int* l2, + char** result) { + int n, m; + std::vector<w_char> su; + std::vector<w_char> su2; + char* b; + char* c; + int i; + int j; + if (utf8) { + m = u8_u16(su, s); + n = u8_u16(su2, s2); + } else { + m = strlen(s); + n = strlen(s2); + } + c = (char*)malloc((m + 1) * (n + 1)); + b = (char*)malloc((m + 1) * (n + 1)); + if (!c || !b) { + if (c) + free(c); + if (b) + free(b); + *result = NULL; + return; + } + for (i = 1; i <= m; i++) + c[i * (n + 1)] = 0; + for (j = 0; j <= n; j++) + c[j] = 0; + for (i = 1; i <= m; i++) { + for (j = 1; j <= n; j++) { + if (((utf8) && (su[i - 1] == su2[j - 1])) || + ((!utf8) && (s[i - 1] == s2[j - 1]))) { + c[i * (n + 1) + j] = c[(i - 1) * (n + 1) + j - 1] + 1; + b[i * (n + 1) + j] = LCS_UPLEFT; + } 
else if (c[(i - 1) * (n + 1) + j] >= c[i * (n + 1) + j - 1]) { + c[i * (n + 1) + j] = c[(i - 1) * (n + 1) + j]; + b[i * (n + 1) + j] = LCS_UP; + } else { + c[i * (n + 1) + j] = c[i * (n + 1) + j - 1]; + b[i * (n + 1) + j] = LCS_LEFT; + } + } + } + *result = b; + free(c); + *l1 = m; + *l2 = n; +} + +int SuggestMgr::lcslen(const char* s, const char* s2) { + int m; + int n; + int i; + int j; + char* result; + int len = 0; + lcs(s, s2, &m, &n, &result); + if (!result) + return 0; + i = m; + j = n; + while ((i != 0) && (j != 0)) { + if (result[i * (n + 1) + j] == LCS_UPLEFT) { + len++; + i--; + j--; + } else if (result[i * (n + 1) + j] == LCS_UP) { + i--; + } else + j--; + } + free(result); + return len; +} + +int SuggestMgr::lcslen(const std::string& s, const std::string& s2) { + return lcslen(s.c_str(), s2.c_str()); +} diff --git a/libs/hunspell/src/suggestmgr.hxx b/libs/hunspell/src/suggestmgr.hxx new file mode 100644 index 000000000..675d98eb8 --- /dev/null +++ b/libs/hunspell/src/suggestmgr.hxx @@ -0,0 +1,198 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved. 
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. 
Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */

#ifndef _SUGGESTMGR_HXX_
#define _SUGGESTMGR_HXX_

// sizes of the candidate tables used while collecting n-gram suggestions
#define MAX_ROOTS 100
#define MAX_WORDS 100
#define MAX_GUESS 200
#define MAXNGRAMSUGS 4
// upper bound on the phonetic suggestions appended by ngsuggest()
#define MAXPHONSUGS 2
#define MAXCOMPOUNDSUGS 3

// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function
#define TIMELIMIT (CLOCKS_PER_SEC >> 2)
#define MINTIMER 100
// value checkword() reloads its countdown with while the time limit holds
#define MAXPLUSTIMER 100

// option bits for the `opt` argument of SuggestMgr::ngram()
#define NGRAM_LONGER_WORSE (1 << 0)
#define NGRAM_ANY_MISMATCH (1 << 1)
#define NGRAM_LOWERING (1 << 2)
#define NGRAM_WEIGHTED (1 << 3)

#include "hunvisapi.h"

#include "atypes.hxx"
#include "affixmgr.hxx"
#include "hashmgr.hxx"
#include "langnum.hxx"
#include <time.h>

// moves recorded in the table built by SuggestMgr::lcs()
enum { LCS_UP, LCS_LEFT, LCS_UPLEFT };

// Produces spelling suggestions and morphological analysis / generation on
// top of an AffixMgr.
class LIBHUNSPELL_DLL_EXPORTED SuggestMgr {
 private:
  // declared but not defined here: instances are not meant to be copied
  SuggestMgr(const SuggestMgr&);
  SuggestMgr& operator=(const SuggestMgr&);

 private:
  char* ckey;
  int ckeyl;
  w_char* ckey_utf;

  char* ctry;
  int ctryl;
  w_char* ctry_utf;

  AffixMgr* pAMgr;  // affix manager used for lookups; may be NULL
  int maxSug;       // upper bound on the number of suggestions
  struct cs_info* csconv;  // case conversion table for 8-bit encodings
  int utf8;                // non-zero: words are handled as UTF-8
  int langnum;
  int nosplitsugs;
  int maxngramsugs;
  int maxcpdsugs;
  int complexprefixes;  // non-zero: words are processed reversed

 public:
  SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr);
  ~SuggestMgr();

  int suggest(char*** slst, const char* word, int nsug, int* onlycmpdsug);
  int ngsuggest(char** wlst, const char* word, int ns, HashMgr** pHMgr, int md);
  int suggest_auto(char*** slst, const char* word, int nsug);
  int suggest_stems(char*** slst, const char* word, int nsug);
  int suggest_pos_stems(char*** slst, const char* word, int nsug);

  // these return newly allocated strings (or NULL) that the caller frees
  char* suggest_morph(const char* word);
  char* suggest_gen(char** pl, int pln, const char* pattern);
  char* suggest_morph_for_spelling_error(const char* word);

 private:
  int testsug(char** wlst,
              const char* candidate,
              int wl,
              int ns,
              int cpdsuggest,
              int* timer,
              clock_t* timelimit);
  int checkword(const char*, int, int, int*, clock_t*);
  int check_forbidden(const char*, int);

  int capchars(char**, const char*, int, int);
  int replchars(char**, const char*, int, int);
  int doubletwochars(char**, const char*, int, int);
  int forgotchar(char**, const char*, int, int);
  int swapchar(char**, const char*, int, int);
  int longswapchar(char**, const char*, int, int);
  int movechar(char**, const char*, int, int);
  int extrachar(char**, const char*, int, int);
  int badcharkey(char**, const char*, int, int);
  int badchar(char**, const char*, int, int);
  int twowords(char**, const char*, int, int);
  int fixstems(char**, const char*, int);

  int capchars_utf(char**, const w_char*, int wl, int, int);
  int doubletwochars_utf(char**, const w_char*, int wl, int, int);
  int forgotchar_utf(char**, const w_char*, int wl, int, int);
  int extrachar_utf(char**, const w_char*, int wl, int, int);
  int badcharkey_utf(char**, const w_char*, int wl, int, int);
  int badchar_utf(char**, const w_char*, int wl, int, int);
  int swapchar_utf(char**, const w_char*, int wl, int, int);
  int longswapchar_utf(char**, const w_char*, int, int, int);
  int movechar_utf(char**, const w_char*, int, int, int);

  int mapchars(char**, const char*, int, int);
  int map_related(const char*,
                  std::string&,
                  int,
                  char** wlst,
                  int,
                  int,
                  const mapentry*,
                  int,
                  int*,
                  clock_t*);
  // similarity measures used when ranking candidates
  int ngram(int n, const std::string& s1, const std::string& s2, int opt);
  int mystrlen(const char* word);
  int leftcommonsubstring(const char* s1, const char* s2);
  int commoncharacterpositions(const char* s1, const char* s2, int* is_swap);
  // sorts rsc in decreasing order, permuting rwd (and rwd2 if given) with it
  void bubblesort(char** rwd, char** rwd2, int* rsc, int n);
  // *result receives a malloc'ed move table (NULL on failure); caller frees
  void lcs(const char* s, const char* s2, int* l1, int* l2, char** result);
  int lcslen(const char* s, const char* s2);
  int lcslen(const std::string& s, const std::string& s2);
  char* suggest_hentry_gen(hentry* rv, const char* pattern);
};

#endif
diff --git a/libs/hunspell/src/w_char.hxx b/libs/hunspell/src/w_char.hxx new file mode 100644 index 000000000..336c454f7 --- /dev/null +++ b/libs/hunspell/src/w_char.hxx @@ -0,0 +1,75 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Hunspell, based on MySpell. + * + * The Initial Developers of the Original Code are + * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Portions created by the Initial Developers are Copyright (C) 2002-2005 + * the Initial Developers. All Rights Reserved.
+ * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#ifndef __WCHARHXX__ +#define __WCHARHXX__ + +#ifndef GCC +struct w_char { +#else +struct __attribute__((packed)) w_char { +#endif + unsigned char l; + unsigned char h; + + friend bool operator<(const w_char a, const w_char b) { + unsigned short a_idx = (a.h << 8) + a.l; + unsigned short b_idx = (b.h << 8) + b.l; + return a_idx < b_idx; + } + + friend bool operator==(const w_char a, const w_char b) { + return (((a).l == (b).l) && ((a).h == (b).h)); + } + + friend bool operator!=(const w_char a, const w_char b) { + return !(a == b);; + } +}; + +// two character arrays +struct replentry { + char* pattern; + char* pattern2; + bool start; + bool end; +}; + +#endif diff --git a/libs/moz.build b/libs/moz.build index 457bd37ff..42c1b830b 100644 --- a/libs/moz.build +++ b/libs/moz.build @@ -47,6 +47,9 @@ if CONFIG['MOZ_ENABLE_SKIA']: if CONFIG['MOZ_FFVPX']: DIRS += ['ffvpx'] +if not CONFIG['MOZ_SYSTEM_HUNSPELL']: + DIRS += ['hunspell'] + if CONFIG['MOZ_SCTP']: DIRS += ['sctp'] |