diff options
author | Moonchild <moonchild@palemoon.org> | 2021-05-14 12:25:57 +0000 |
---|---|---|
committer | Moonchild <moonchild@palemoon.org> | 2021-05-14 12:25:57 +0000 |
commit | c921ad59d5acfe973199bf742a852db71ebe2b5c (patch) | |
tree | 804e618fd084b1782f77b4ba07031b064b3b8656 /xpcom | |
parent | 349346d0b76aec02354149da0f860d5bf7ec7b83 (diff) | |
download | uxp-c921ad59d5acfe973199bf742a852db71ebe2b5c.tar.gz |
Issue #1772 - Stop using legacy code page conversion for file paths on Linux.
OS.File already only supports UTF-8 paths on non-Windows systems, so this change
makes our different ways of accessing file paths consistent with each other.
This should prevent unexpected crashes in glibc, which expects UTF-8.
This resolves #1772.
Diffstat (limited to 'xpcom')
-rw-r--r-- | xpcom/build/XPCOMInit.cpp | 7 | ||||
-rw-r--r-- | xpcom/io/nsNativeCharsetUtils.cpp | 875 | ||||
-rw-r--r-- | xpcom/io/nsNativeCharsetUtils.h | 24 |
3 files changed, 15 insertions, 891 deletions
diff --git a/xpcom/build/XPCOMInit.cpp b/xpcom/build/XPCOMInit.cpp index 18aed6528a..e96314a1c5 100644 --- a/xpcom/build/XPCOMInit.cpp +++ b/xpcom/build/XPCOMInit.cpp @@ -574,10 +574,6 @@ NS_InitXPCOM2(nsIServiceManager** aResult, setlocale(LC_ALL, ""); } -#if defined(XP_UNIX) - NS_StartupNativeCharsetUtils(); -#endif - NS_StartupLocalFile(); nsDirectoryService::RealInit(); @@ -1020,9 +1016,6 @@ ShutdownXPCOM(nsIServiceManager* aServMgr) // Shutdown nsLocalFile string conversion NS_ShutdownLocalFile(); -#ifdef XP_UNIX - NS_ShutdownNativeCharsetUtils(); -#endif // Shutdown xpcom. This will release all loaders and cause others holding // a refcount to the component manager to release it. diff --git a/xpcom/io/nsNativeCharsetUtils.cpp b/xpcom/io/nsNativeCharsetUtils.cpp index 927e8cd591..3ed20d7f04 100644 --- a/xpcom/io/nsNativeCharsetUtils.cpp +++ b/xpcom/io/nsNativeCharsetUtils.cpp @@ -6,864 +6,9 @@ #include "xpcom-private.h" //----------------------------------------------------------------------------- -// XP_UNIX +// Windows //----------------------------------------------------------------------------- -#if defined(XP_UNIX) - -#include <stdlib.h> // mbtowc, wctomb -#include <locale.h> // setlocale -#include "mozilla/Mutex.h" -#include "nscore.h" -#include "nsAString.h" -#include "nsReadableUtils.h" - -using namespace mozilla; - -// -// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux, -// but that doesn't work for non-BMP characters whether we use '-fshort-wchar' -// or not (see bug 206811 and -// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use -// iconv for all platforms where nltypes.h and nllanginfo.h are present -// along with iconv. 
-// -#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET) -#define USE_ICONV 1 -#else -#define USE_STDCONV 1 -#endif - -static void -isolatin1_to_utf16(const char** aInput, uint32_t* aInputLeft, - char16_t** aOutput, uint32_t* aOutputLeft) -{ - while (*aInputLeft && *aOutputLeft) { - **aOutput = (unsigned char)** aInput; - (*aInput)++; - (*aInputLeft)--; - (*aOutput)++; - (*aOutputLeft)--; - } -} - -static void -utf16_to_isolatin1(const char16_t** aInput, uint32_t* aInputLeft, - char** aOutput, uint32_t* aOutputLeft) -{ - while (*aInputLeft && *aOutputLeft) { - **aOutput = (unsigned char)**aInput; - (*aInput)++; - (*aInputLeft)--; - (*aOutput)++; - (*aOutputLeft)--; - } -} - -//----------------------------------------------------------------------------- -// conversion using iconv -//----------------------------------------------------------------------------- -#if defined(USE_ICONV) -#include <nl_types.h> // CODESET -#include <langinfo.h> // nl_langinfo -#include <iconv.h> // iconv_open, iconv, iconv_close -#include <errno.h> -#include "plstr.h" - -#if defined(HAVE_ICONV_WITH_CONST_INPUT) -#define ICONV_INPUT(x) (x) -#else -#define ICONV_INPUT(x) ((char **)x) -#endif - -// solaris definitely needs this, but we'll enable it by default -// just in case... but we know for sure that iconv(3) in glibc -// doesn't need this. -#if !defined(__GLIBC__) -#define ENABLE_UTF8_FALLBACK_SUPPORT -#endif - -#define INVALID_ICONV_T ((iconv_t)-1) - -static inline size_t -xp_iconv(iconv_t converter, - const char** aInput, size_t* aInputLeft, - char** aOutput, size_t* aOutputLeft) -{ - size_t res, outputAvail = *aOutputLeft; - res = iconv(converter, ICONV_INPUT(aInput), aInputLeft, aOutput, aOutputLeft); - if (res == (size_t)-1) { - // on some platforms (e.g., linux) iconv will fail with - // E2BIG if it cannot convert _all_ of its input. it'll - // still adjust all of the in/out params correctly, so we - // can ignore this error. 
the assumption is that we will - // be called again to complete the conversion. - if ((errno == E2BIG) && (*aOutputLeft < outputAvail)) { - res = 0; - } - } - return res; -} - -static inline void -xp_iconv_reset(iconv_t converter) -{ - // NOTE: the man pages on Solaris claim that you can pass nullptr - // for all parameter to reset the converter, but beware the - // evil Solaris crash if you go down this route >:-) - - const char* zero_char_in_ptr = nullptr; - char* zero_char_out_ptr = nullptr; - size_t zero_size_in = 0; - size_t zero_size_out = 0; - - xp_iconv(converter, - &zero_char_in_ptr, - &zero_size_in, - &zero_char_out_ptr, - &zero_size_out); -} - -static inline iconv_t -xp_iconv_open(const char** to_list, const char** from_list) -{ - iconv_t res; - const char** from_name; - const char** to_name; - - // try all possible combinations to locate a converter. - to_name = to_list; - while (*to_name) { - if (**to_name) { - from_name = from_list; - while (*from_name) { - if (**from_name) { - res = iconv_open(*to_name, *from_name); - if (res != INVALID_ICONV_T) { - return res; - } - } - from_name++; - } - } - to_name++; - } - - return INVALID_ICONV_T; -} - -/* - * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we - * have to use UTF-16 with iconv(3) on platforms where it's supported. - * However, the way UTF-16 and UCS-2 are interpreted varies across platforms - * and implementations of iconv(3). On Tru64, it also depends on the environment - * variable. To avoid the trouble arising from byte-swapping - * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling - * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 - * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness, - * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE' - * and 'UNICODEBIG'. 
It's also not true of Tru64 V4 when the environment - * variable ICONV_BYTEORDER is set to 'big-endian', about which not much - * can be done other than adding a note in the release notes. (bug 206811) - */ -static const char* UTF_16_NAMES[] = { -#if defined(IS_LITTLE_ENDIAN) - "UTF-16LE", -#if defined(__GLIBC__) - "UNICODELITTLE", -#endif - "UCS-2LE", -#else - "UTF-16BE", -#if defined(__GLIBC__) - "UNICODEBIG", -#endif - "UCS-2BE", -#endif - "UTF-16", - "UCS-2", - "UCS2", - "UCS_2", - "ucs-2", - "ucs2", - "ucs_2", - nullptr -}; - -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) -static const char* UTF_8_NAMES[] = { - "UTF-8", - "UTF8", - "UTF_8", - "utf-8", - "utf8", - "utf_8", - nullptr -}; -#endif - -static const char* ISO_8859_1_NAMES[] = { - "ISO-8859-1", -#if !defined(__GLIBC__) - "ISO8859-1", - "ISO88591", - "ISO_8859_1", - "ISO8859_1", - "iso-8859-1", - "iso8859-1", - "iso88591", - "iso_8859_1", - "iso8859_1", -#endif - nullptr -}; - -class nsNativeCharsetConverter -{ -public: - nsNativeCharsetConverter(); - ~nsNativeCharsetConverter(); - - nsresult NativeToUnicode(const char** aInput, uint32_t* aInputLeft, - char16_t** aOutput, uint32_t* aOutputLeft); - nsresult UnicodeToNative(const char16_t** aInput, uint32_t* aInputLeft, - char** aOutput, uint32_t* aOutputLeft); - - static void GlobalInit(); - static void GlobalShutdown(); - static bool IsNativeUTF8(); - -private: - static iconv_t gNativeToUnicode; - static iconv_t gUnicodeToNative; -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - static iconv_t gNativeToUTF8; - static iconv_t gUTF8ToNative; - static iconv_t gUnicodeToUTF8; - static iconv_t gUTF8ToUnicode; -#endif - static Mutex* gLock; - static bool gInitialized; - static bool gIsNativeUTF8; - - static void LazyInit(); - - static void Lock() - { - if (gLock) { - gLock->Lock(); - } - } - static void Unlock() - { - if (gLock) { - gLock->Unlock(); - } - } -}; - -iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T; -iconv_t 
nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T; -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) -iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T; -iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T; -iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T; -iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T; -#endif -Mutex* nsNativeCharsetConverter::gLock = nullptr; -bool nsNativeCharsetConverter::gInitialized = false; -bool nsNativeCharsetConverter::gIsNativeUTF8 = false; - -void -nsNativeCharsetConverter::LazyInit() -{ - // LazyInit may be called before NS_StartupNativeCharsetUtils, but - // the setlocale it does has to be called before nl_langinfo. Like in - // NS_StartupNativeCharsetUtils, assume we are called early enough that - // we are the first to care about the locale's charset. - if (!gLock) { - setlocale(LC_CTYPE, ""); - } - const char* blank_list[] = { "", nullptr }; - const char** native_charset_list = blank_list; - const char* native_charset = nl_langinfo(CODESET); - if (!native_charset) { - NS_ERROR("native charset is unknown"); - // fallback to ISO-8859-1 - native_charset_list = ISO_8859_1_NAMES; - } else { - native_charset_list[0] = native_charset; - } - - // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET) - // return 'UTF-8' (or 'utf-8') - if (!PL_strcasecmp(native_charset, "UTF-8")) { - gIsNativeUTF8 = true; - } - - gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list); - gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES); - -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - if (gNativeToUnicode == INVALID_ICONV_T) { - gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list); - gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES); - NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter"); - NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter"); - } - if (gUnicodeToNative == 
INVALID_ICONV_T) { - gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES); - gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES); - NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter"); - NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter"); - } -#else - NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter"); - NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter"); -#endif - - /* - * On Solaris 8 (and newer?), the iconv modules converting to UCS-2 - * prepend a byte order mark unicode character (BOM, u+FEFF) during - * the first use of the iconv converter. The same is the case of - * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. - * However, we use 'UTF-16LE/BE' in both cases, instead so that we - * should be safe. But just in case... - * - * This dummy conversion gets rid of the BOMs and fixes bug 153562. - */ - char dummy_input[1] = { ' ' }; - char dummy_output[4]; - - if (gNativeToUnicode != INVALID_ICONV_T) { - const char* input = dummy_input; - size_t input_left = sizeof(dummy_input); - char* output = dummy_output; - size_t output_left = sizeof(dummy_output); - - xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left); - } -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - if (gUTF8ToUnicode != INVALID_ICONV_T) { - const char* input = dummy_input; - size_t input_left = sizeof(dummy_input); - char* output = dummy_output; - size_t output_left = sizeof(dummy_output); - - xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left); - } -#endif - - gInitialized = true; -} - -void -nsNativeCharsetConverter::GlobalInit() -{ - gLock = new Mutex("nsNativeCharsetConverter.gLock"); -} - -void -nsNativeCharsetConverter::GlobalShutdown() -{ - delete gLock; - gLock = nullptr; - - if (gNativeToUnicode != INVALID_ICONV_T) { - iconv_close(gNativeToUnicode); - gNativeToUnicode = INVALID_ICONV_T; - } - - if 
(gUnicodeToNative != INVALID_ICONV_T) { - iconv_close(gUnicodeToNative); - gUnicodeToNative = INVALID_ICONV_T; - } - -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - if (gNativeToUTF8 != INVALID_ICONV_T) { - iconv_close(gNativeToUTF8); - gNativeToUTF8 = INVALID_ICONV_T; - } - if (gUTF8ToNative != INVALID_ICONV_T) { - iconv_close(gUTF8ToNative); - gUTF8ToNative = INVALID_ICONV_T; - } - if (gUnicodeToUTF8 != INVALID_ICONV_T) { - iconv_close(gUnicodeToUTF8); - gUnicodeToUTF8 = INVALID_ICONV_T; - } - if (gUTF8ToUnicode != INVALID_ICONV_T) { - iconv_close(gUTF8ToUnicode); - gUTF8ToUnicode = INVALID_ICONV_T; - } -#endif - - gInitialized = false; -} - -nsNativeCharsetConverter::nsNativeCharsetConverter() -{ - Lock(); - if (!gInitialized) { - LazyInit(); - } -} - -nsNativeCharsetConverter::~nsNativeCharsetConverter() -{ - // reset converters for next time - if (gNativeToUnicode != INVALID_ICONV_T) { - xp_iconv_reset(gNativeToUnicode); - } - if (gUnicodeToNative != INVALID_ICONV_T) { - xp_iconv_reset(gUnicodeToNative); - } -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - if (gNativeToUTF8 != INVALID_ICONV_T) { - xp_iconv_reset(gNativeToUTF8); - } - if (gUTF8ToNative != INVALID_ICONV_T) { - xp_iconv_reset(gUTF8ToNative); - } - if (gUnicodeToUTF8 != INVALID_ICONV_T) { - xp_iconv_reset(gUnicodeToUTF8); - } - if (gUTF8ToUnicode != INVALID_ICONV_T) { - xp_iconv_reset(gUTF8ToUnicode); - } -#endif - Unlock(); -} - -nsresult -nsNativeCharsetConverter::NativeToUnicode(const char** aInput, - uint32_t* aInputLeft, - char16_t** aOutput, - uint32_t* aOutputLeft) -{ - size_t res = 0; - size_t inLeft = (size_t)*aInputLeft; - size_t outLeft = (size_t)*aOutputLeft * 2; - - if (gNativeToUnicode != INVALID_ICONV_T) { - - res = xp_iconv(gNativeToUnicode, aInput, &inLeft, (char**)aOutput, &outLeft); - - *aInputLeft = inLeft; - *aOutputLeft = outLeft / 2; - if (res != (size_t)-1) { - return NS_OK; - } - - NS_WARNING("conversion from native to utf-16 failed"); - - // reset converter - 
xp_iconv_reset(gNativeToUnicode); - } -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - else if ((gNativeToUTF8 != INVALID_ICONV_T) && - (gUTF8ToUnicode != INVALID_ICONV_T)) { - // convert first to UTF8, then from UTF8 to UCS2 - const char* in = *aInput; - - char ubuf[1024]; - - // we assume we're always called with enough space in |aOutput|, - // so convert many chars at a time... - while (inLeft) { - char* p = ubuf; - size_t n = sizeof(ubuf); - res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n); - if (res == (size_t)-1) { - NS_ERROR("conversion from native to utf-8 failed"); - break; - } - NS_ASSERTION(outLeft > 0, "bad assumption"); - p = ubuf; - n = sizeof(ubuf) - n; - res = xp_iconv(gUTF8ToUnicode, (const char**)&p, &n, - (char**)aOutput, &outLeft); - if (res == (size_t)-1) { - NS_ERROR("conversion from utf-8 to utf-16 failed"); - break; - } - } - - (*aInput) += (*aInputLeft - inLeft); - *aInputLeft = inLeft; - *aOutputLeft = outLeft / 2; - - if (res != (size_t)-1) { - return NS_OK; - } - - // reset converters - xp_iconv_reset(gNativeToUTF8); - xp_iconv_reset(gUTF8ToUnicode); - } -#endif - - // fallback: zero-pad and hope for the best - // XXX This is lame and we have to do better. 
- isolatin1_to_utf16(aInput, aInputLeft, aOutput, aOutputLeft); - - return NS_OK; -} - -nsresult -nsNativeCharsetConverter::UnicodeToNative(const char16_t** aInput, - uint32_t* aInputLeft, - char** aOutput, - uint32_t* aOutputLeft) -{ - size_t res = 0; - size_t inLeft = (size_t)*aInputLeft * 2; - size_t outLeft = (size_t)*aOutputLeft; - - if (gUnicodeToNative != INVALID_ICONV_T) { - res = xp_iconv(gUnicodeToNative, (const char**)aInput, &inLeft, - aOutput, &outLeft); - - *aInputLeft = inLeft / 2; - *aOutputLeft = outLeft; - if (res != (size_t)-1) { - return NS_OK; - } - - NS_ERROR("iconv failed"); - - // reset converter - xp_iconv_reset(gUnicodeToNative); - } -#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) - else if ((gUnicodeToUTF8 != INVALID_ICONV_T) && - (gUTF8ToNative != INVALID_ICONV_T)) { - const char* in = (const char*)*aInput; - - char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes) - - // convert one uchar at a time... - while (inLeft && outLeft) { - char* p = ubuf; - size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t); - res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n); - if (res == (size_t)-1) { - NS_ERROR("conversion from utf-16 to utf-8 failed"); - break; - } - p = ubuf; - n = sizeof(ubuf) - n; - res = xp_iconv(gUTF8ToNative, (const char**)&p, &n, aOutput, &outLeft); - if (res == (size_t)-1) { - if (errno == E2BIG) { - // not enough room for last uchar... back up and return. - in -= sizeof(char16_t); - res = 0; - } else { - NS_ERROR("conversion from utf-8 to native failed"); - } - break; - } - inLeft -= sizeof(char16_t); - } - - (*aInput) += (*aInputLeft - inLeft / 2); - *aInputLeft = inLeft / 2; - *aOutputLeft = outLeft; - if (res != (size_t)-1) { - return NS_OK; - } - - // reset converters - xp_iconv_reset(gUnicodeToUTF8); - xp_iconv_reset(gUTF8ToNative); - } -#endif - - // fallback: truncate and hope for the best - // XXX This is lame and we have to do better. 
- utf16_to_isolatin1(aInput, aInputLeft, aOutput, aOutputLeft); - - return NS_OK; -} - -bool -nsNativeCharsetConverter::IsNativeUTF8() -{ - if (!gInitialized) { - Lock(); - if (!gInitialized) { - LazyInit(); - } - Unlock(); - } - return gIsNativeUTF8; -} - -#endif // USE_ICONV - -//----------------------------------------------------------------------------- -// conversion using mb[r]towc/wc[r]tomb -//----------------------------------------------------------------------------- -#if defined(USE_STDCONV) -#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) -#include <wchar.h> // mbrtowc, wcrtomb -#endif - -class nsNativeCharsetConverter -{ -public: - nsNativeCharsetConverter(); - - nsresult NativeToUnicode(const char** aInput, uint32_t* aInputLeft, - char16_t** aOutput, uint32_t* aOutputLeft); - nsresult UnicodeToNative(const char16_t** aInput, uint32_t* aInputLeft, - char** aOutput, uint32_t* aOutputLeft); - - static void GlobalInit(); - static void GlobalShutdown() { } - static bool IsNativeUTF8(); - -private: - static bool gWCharIsUnicode; - -#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) - mbstate_t ps; -#endif -}; - -bool nsNativeCharsetConverter::gWCharIsUnicode = false; - -nsNativeCharsetConverter::nsNativeCharsetConverter() -{ -#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) - memset(&ps, 0, sizeof(ps)); -#endif -} - -void -nsNativeCharsetConverter::GlobalInit() -{ - // verify that wchar_t for the current locale is actually unicode. - // if it is not, then we should avoid calling mbtowc/wctomb and - // just fallback on zero-pad/truncation conversion. - // - // this test cannot be done at build time because the encoding of - // wchar_t may depend on the runtime locale. sad, but true!! - // - // so, if wchar_t is unicode then converting an ASCII character - // to wchar_t should not change its numeric value. we'll just - // check what happens with the ASCII 'a' character. - // - // this test is not perfect... 
obviously, it could yield false - // positives, but then at least ASCII text would be converted - // properly (or maybe just the 'a' character) -- oh well :( - - char a = 'a'; - unsigned int w = 0; - - int res = mbtowc((wchar_t*)&w, &a, 1); - - gWCharIsUnicode = (res != -1 && w == 'a'); - -#ifdef DEBUG - if (!gWCharIsUnicode) { - NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)"); - } -#endif -} - -nsresult -nsNativeCharsetConverter::NativeToUnicode(const char** aInput, - uint32_t* aInputLeft, - char16_t** aOutput, - uint32_t* aOutputLeft) -{ - if (gWCharIsUnicode) { - int incr; - - // cannot use wchar_t here since it may have been redefined (e.g., - // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP. - unsigned int tmp = 0; - while (*aInputLeft && *aOutputLeft) { -#ifdef HAVE_MBRTOWC - incr = (int)mbrtowc((wchar_t*)&tmp, *aInput, *aInputLeft, &ps); -#else - // XXX is this thread-safe? - incr = (int)mbtowc((wchar_t*)&tmp, *aInput, *aInputLeft); -#endif - if (incr < 0) { - NS_WARNING("mbtowc failed: possible charset mismatch"); - // zero-pad and hope for the best - tmp = (unsigned char)**aInput; - incr = 1; - } - ** aOutput = (char16_t)tmp; - (*aInput) += incr; - (*aInputLeft) -= incr; - (*aOutput)++; - (*aOutputLeft)--; - } - } else { - // wchar_t isn't unicode, so the best we can do is treat the - // input as if it is isolatin1 :( - isolatin1_to_utf16(aInput, aInputLeft, aOutput, aOutputLeft); - } - - return NS_OK; -} - -nsresult -nsNativeCharsetConverter::UnicodeToNative(const char16_t** aInput, - uint32_t* aInputLeft, - char** aOutput, - uint32_t* aOutputLeft) -{ - if (gWCharIsUnicode) { - int incr; - - while (*aInputLeft && *aOutputLeft >= MB_CUR_MAX) { -#ifdef HAVE_WCRTOMB - incr = (int)wcrtomb(*aOutput, (wchar_t)**aInput, &ps); -#else - // XXX is this thread-safe? 
- incr = (int)wctomb(*aOutput, (wchar_t)**aInput); -#endif - if (incr < 0) { - NS_WARNING("mbtowc failed: possible charset mismatch"); - ** aOutput = (unsigned char)**aInput; // truncate - incr = 1; - } - // most likely we're dead anyways if this assertion should fire - NS_ASSERTION(uint32_t(incr) <= *aOutputLeft, "wrote beyond end of string"); - (*aOutput) += incr; - (*aOutputLeft) -= incr; - (*aInput)++; - (*aInputLeft)--; - } - } else { - // wchar_t isn't unicode, so the best we can do is treat the - // input as if it is isolatin1 :( - utf16_to_isolatin1(aInput, aInputLeft, aOutput, aOutputLeft); - } - - return NS_OK; -} - -// XXX : for now, return false -bool -nsNativeCharsetConverter::IsNativeUTF8() -{ - return false; -} - -#endif // USE_STDCONV - -//----------------------------------------------------------------------------- -// API implementation -//----------------------------------------------------------------------------- - -nsresult -NS_CopyNativeToUnicode(const nsACString& aInput, nsAString& aOutput) -{ - aOutput.Truncate(); - - uint32_t inputLen = aInput.Length(); - - nsACString::const_iterator iter; - aInput.BeginReading(iter); - - // - // OPTIMIZATION: preallocate space for largest possible result; convert - // directly into the result buffer to avoid intermediate buffer copy. - // - // this will generally result in a larger allocation, but that seems - // better than an extra buffer copy. 
- // - if (!aOutput.SetLength(inputLen, fallible)) { - return NS_ERROR_OUT_OF_MEMORY; - } - nsAString::iterator out_iter; - aOutput.BeginWriting(out_iter); - - char16_t* result = out_iter.get(); - uint32_t resultLeft = inputLen; - - const char* buf = iter.get(); - uint32_t bufLeft = inputLen; - - nsNativeCharsetConverter conv; - nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft); - if (NS_SUCCEEDED(rv)) { - NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer"); - aOutput.SetLength(inputLen - resultLeft); - } - return rv; -} - -nsresult -NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput) -{ - aOutput.Truncate(); - - nsAString::const_iterator iter, end; - aInput.BeginReading(iter); - aInput.EndReading(end); - - // cannot easily avoid intermediate buffer copy. - char temp[4096]; - - nsNativeCharsetConverter conv; - - const char16_t* buf = iter.get(); - uint32_t bufLeft = Distance(iter, end); - while (bufLeft) { - char* p = temp; - uint32_t tempLeft = sizeof(temp); - - nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft); - if (NS_FAILED(rv)) { - return rv; - } - - if (tempLeft < sizeof(temp)) { - aOutput.Append(temp, sizeof(temp) - tempLeft); - } - } - return NS_OK; -} - -bool -NS_IsNativeUTF8() -{ - return nsNativeCharsetConverter::IsNativeUTF8(); -} - -void -NS_StartupNativeCharsetUtils() -{ - // - // need to initialize the locale or else charset conversion will fail. - // better not delay this in case some other component alters the locale - // settings. - // - // XXX we assume that we are called early enough that we should - // always be the first to care about the locale's charset. 
- // - setlocale(LC_CTYPE, ""); - - nsNativeCharsetConverter::GlobalInit(); -} - -void -NS_ShutdownNativeCharsetUtils() -{ - nsNativeCharsetConverter::GlobalShutdown(); -} - -//----------------------------------------------------------------------------- -// XP_WIN -//----------------------------------------------------------------------------- -#elif defined(XP_WIN) +#if defined(XP_WIN) #include <windows.h> #include "nsString.h" @@ -980,30 +125,22 @@ NS_ConvertWtoA(const char16_t* aStrInW, int aBufferSizeOut, #else +// Non-windows will always use UTF-8 conversion. + #include "nsReadableUtils.h" nsresult NS_CopyNativeToUnicode(const nsACString& aInput, nsAString& aOutput) { - CopyASCIItoUTF16(aInput, aOutput); + CopyUTF8toUTF16(aInput, aOutput); return NS_OK; } nsresult NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput) { - LossyCopyUTF16toASCII(aInput, aOutput); + CopyUTF16toUTF8(aInput, aOutput); return NS_OK; } -void -NS_StartupNativeCharsetUtils() -{ -} - -void -NS_ShutdownNativeCharsetUtils() -{ -} - #endif diff --git a/xpcom/io/nsNativeCharsetUtils.h b/xpcom/io/nsNativeCharsetUtils.h index bee533cbef..572995aad5 100644 --- a/xpcom/io/nsNativeCharsetUtils.h +++ b/xpcom/io/nsNativeCharsetUtils.h @@ -14,9 +14,13 @@ * *** THESE ARE NOT GENERAL PURPOSE CONVERTERS *** * * * * NS_CopyNativeToUnicode / NS_CopyUnicodeToNative should only be used * - * for converting *FILENAMES* between native and unicode. They are not * + * for converting *FILENAMES* between bytes and UTF-16. They are not * * designed or tested for general encoding converter use. * * * + * On Windows, these functions convert to and from the system's legacy * + * code page, which cannot represent all of Unicode. Elsewhere, these * + * convert to and from UTF-8. 
* + * * \*****************************************************************************/ /** @@ -33,25 +37,15 @@ nsresult NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput); * name in UTF-8 out of nsIFile, we can just use |GetNativeLeafName| rather * than using |GetLeafName| and converting the result to UTF-8 if the file * system encoding is UTF-8. - * On Unix, it depends on the locale and is not known in advance (at the - * compilation time) so that this function needs to be a real function. - * On Windows and other platforms (e.g. OS2), it's never UTF-8. */ -#if defined(XP_UNIX) -bool NS_IsNativeUTF8(); -#else inline bool NS_IsNativeUTF8() { +#ifdef XP_WIN return false; -} +#else + return true; #endif - - -/** - * internal - */ -void NS_StartupNativeCharsetUtils(); -void NS_ShutdownNativeCharsetUtils(); +} #endif // nsNativeCharsetUtils_h__ |