diff options
Diffstat (limited to 'extensions/universalchardet/src/base/JpCntx.h')
-rw-r--r-- | extensions/universalchardet/src/base/JpCntx.h | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/extensions/universalchardet/src/base/JpCntx.h b/extensions/universalchardet/src/base/JpCntx.h new file mode 100644 index 000000000..88e096432 --- /dev/null +++ b/extensions/universalchardet/src/base/JpCntx.h @@ -0,0 +1,107 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef __JPCNTX_H__ +#define __JPCNTX_H__ + +#define NUM_OF_CATEGORY 6 + +#include "nscore.h" + +#define ENOUGH_REL_THRESHOLD 100 +#define MAX_REL_THRESHOLD 1000 + +//hiragana frequency category table +extern const uint8_t jp2CharContext[83][83]; + +class JapaneseContextAnalysis +{ +public: + JapaneseContextAnalysis() {Reset();} + + void HandleData(const char* aBuf, uint32_t aLen); + + void HandleOneChar(const char* aStr, uint32_t aCharLen) + { + int32_t order; + + //if we received enough data, stop here + if (mTotalRel > MAX_REL_THRESHOLD) mDone = true; + if (mDone) return; + + //Only 2-bytes characters are of our interest + order = (aCharLen == 2) ? GetOrder(aStr) : -1; + if (order != -1 && mLastCharOrder != -1) + { + mTotalRel++; + //count this sequence to its category counter + mRelSample[jp2CharContext[mLastCharOrder][order]]++; + } + mLastCharOrder = order; + } + + float GetConfidence(void); + void Reset(); + bool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} + +protected: + virtual int32_t GetOrder(const char* str, uint32_t *charLen) = 0; + virtual int32_t GetOrder(const char* str) = 0; + + //category counters, each integer counts sequences in its category + uint32_t mRelSample[NUM_OF_CATEGORY]; + + //total sequence received + uint32_t mTotalRel; + + //Number of sequences needed to trigger detection + uint32_t mDataThreshold; + + //The order of previous char + int32_t mLastCharOrder; + + //if last byte in current buffer is not the last byte of a character, we + //need to know how many byte to skip in next buffer. + uint32_t mNeedToSkipCharNum; + + //If this flag is set to true, detection is done and conclusion has been made + bool mDone; +}; + + +class SJISContextAnalysis : public JapaneseContextAnalysis +{ + //SJISContextAnalysis(){}; +protected: + int32_t GetOrder(const char* str, uint32_t *charLen); + + int32_t GetOrder(const char* str) + { + //We only interested in Hiragana, so first byte is '\202' + if (*str == '\202' && + (unsigned char)*(str+1) >= (unsigned char)0x9f && + (unsigned char)*(str+1) <= (unsigned char)0xf1) + return (unsigned char)*(str+1) - (unsigned char)0x9f; + return -1; + } +}; + +class EUCJPContextAnalysis : public JapaneseContextAnalysis +{ +protected: + int32_t GetOrder(const char* str, uint32_t *charLen); + int32_t GetOrder(const char* str) + //We only interested in Hiragana, so first byte is '\244' + { + if (*str == '\244' && + (unsigned char)*(str+1) >= (unsigned char)0xa1 && + (unsigned char)*(str+1) <= (unsigned char)0xf3) + return (unsigned char)*(str+1) - (unsigned char)0xa1; + return -1; + } +}; + +#endif /* __JPCNTX_H__ */ + |