summary refs log tree commit diff
path: root/libraries/libuchardet
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/libuchardet')
-rw-r--r-- libraries/libuchardet/README 4
-rw-r--r-- libraries/libuchardet/libuchardet.SlackBuild 18
-rw-r--r-- libraries/libuchardet/libuchardet.info 8
-rw-r--r-- libraries/libuchardet/slack-desc 4
-rw-r--r-- libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch 116
-rw-r--r-- libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch 30
6 files changed, 17 insertions, 163 deletions
diff --git a/libraries/libuchardet/README b/libraries/libuchardet/README
index 9822181500..62c9882143 100644
--- a/libraries/libuchardet/README
+++ b/libraries/libuchardet/README
@@ -6,3 +6,7 @@ implementation of the universal charset detection library by Mozilla.
uchardet is an encoding detector library, which takes a sequence of
bytes in an unknown character encoding without any additional
information, and attempts to determine the encoding of the text.
+Returned encoding names are iconv-compatible.
+
+It can now detect more charsets, and more reliably than the original
+implementation.
diff --git a/libraries/libuchardet/libuchardet.SlackBuild b/libraries/libuchardet/libuchardet.SlackBuild
index 42e440807c..aa2b6cbbb4 100644
--- a/libraries/libuchardet/libuchardet.SlackBuild
+++ b/libraries/libuchardet/libuchardet.SlackBuild
@@ -1,8 +1,8 @@
#!/bin/sh
-
+#
# Slackware build script for libuchardet.
-
-# Copyright 2015 Edinaldo P. Silva, Rio de Janeiro, Brazil.
+#
+# Copyright 2015-2016 Edinaldo P. Silva, Rio de Janeiro, Brazil.
# All rights reserved.
#
# Redistribution and use of this script, with or without modification, is
@@ -23,13 +23,13 @@
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PRGNAM=libuchardet
-VERSION=${VERSION:-0.0.5}
+VERSION=${VERSION:-0.0.6}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
if [ -z "$ARCH" ]; then
case "$( uname -m )" in
- i?86) ARCH=i486 ;;
+ i?86) ARCH=i586 ;;
arm*) ARCH=arm ;;
*) ARCH=$( uname -m ) ;;
esac
@@ -40,7 +40,7 @@ TMP=${TMP:-/tmp/SBo}
PKG=$TMP/package-$PRGNAM
OUTPUT=${OUTPUT:-/tmp}
-if [ "$ARCH" = "i486" ]; then
+if [ "$ARCH" = "i586" ]; then
SLKCFLAGS="-O2 -march=i686 -mtune=i686"
LIBDIRSUFFIX=""
elif [ "$ARCH" = "i686" ]; then
@@ -62,7 +62,7 @@ rm -rf $PKG
mkdir -p $TMP $PKG $OUTPUT
cd $TMP
rm -rf $SRCNAM-$VERSION
-tar xvf $CWD/$SRCNAM-$VERSION.tar.gz
+tar xvf $CWD/$SRCNAM-$VERSION.tar.xz
cd $SRCNAM-$VERSION
chown -R root:root .
find -L . \
@@ -71,16 +71,12 @@ find -L . \
\( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \
-o -perm 440 -o -perm 400 \) -exec chmod 644 {} \;
-patch -Np1 < $CWD/uchardet-0.0.5-fix-ASCII-detection.patch
-patch -Np1 < $CWD/uchardet-0.0.5-use-proper-package-name.patch
-
cmake \
-DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \
-DCMAKE_INSTALL_PREFIX=/usr \
-DCMAKE_INSTALL_LIBDIR=/usr/lib${LIBDIRSUFFIX} \
.
make
-#make test
make install DESTDIR=$PKG
find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \
diff --git a/libraries/libuchardet/libuchardet.info b/libraries/libuchardet/libuchardet.info
index e58f5e0b7b..1a881d06e5 100644
--- a/libraries/libuchardet/libuchardet.info
+++ b/libraries/libuchardet/libuchardet.info
@@ -1,8 +1,8 @@
PRGNAM="libuchardet"
-VERSION="0.0.5"
-HOMEPAGE="https://github.com/BYVoid/uchardet"
-DOWNLOAD="https://github.com/BYVoid/uchardet/archive/v0.0.5/uchardet-0.0.5.tar.gz"
-MD5SUM="2421993e7b098366bd008d81385150b6"
+VERSION="0.0.6"
+HOMEPAGE="https://www.freedesktop.org/wiki/Software/uchardet/"
+DOWNLOAD="https://www.freedesktop.org/software/uchardet/releases/uchardet-0.0.6.tar.xz"
+MD5SUM="03425c0bbe5faaf399e15e947d3e03c7"
DOWNLOAD_x86_64=""
MD5SUM_x86_64=""
REQUIRES=""
diff --git a/libraries/libuchardet/slack-desc b/libraries/libuchardet/slack-desc
index 11f882aabd..c3d801fbe6 100644
--- a/libraries/libuchardet/slack-desc
+++ b/libraries/libuchardet/slack-desc
@@ -13,7 +13,7 @@ libuchardet: implementation of the universal charset detection library by Mozill
libuchardet: uchardet is an encoding detector library, which takes a sequence of
libuchardet: bytes in an unknown character encoding without any additional
libuchardet: information, and attempts to determine the encoding of the text.
+libuchardet: Returned encoding names are iconv-compatible.
libuchardet:
-libuchardet: Home page: https://github.com/BYVoid/uchardet/
-libuchardet:
+libuchardet: Home page: https://www.freedesktop.org/wiki/Software/uchardet/
libuchardet:
diff --git a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch
deleted file mode 100644
index c82aee866e..0000000000
--- a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch
+++ /dev/null
@@ -1,116 +0,0 @@
-commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
-Author: Jehan <jehan@girinstud.io>
-Date: Sat Dec 5 21:04:20 2015 +0100
-
- Nearly-ASCII text with NBSP is still not ASCII.
-
- There is no "exception" in encoding. The non-breaking space 0xA0 is not
- ASCII, and therefore returning "ASCII" will later create issues (for
- instance trying to re-encode with iconv produces an error).
- This was obviously an explicit decision in original code (according to
- code comments), probably tied to specifity of the original program from
- Mozilla. Now we want strict detection.
- I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
- exception" (note that I could have returned any ISO-8859 charsets since
- they all have this character in common).
-
-diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
-index ab8bae0..ff06b9d 100644
---- a/src/nsUniversalDetector.cpp
-+++ b/src/nsUniversalDetector.cpp
-@@ -47,6 +47,7 @@
-
- nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
- {
-+ mNbspFound = PR_FALSE;
- mDone = PR_FALSE;
- mBestGuess = -1; //illegal value as signal
- mInTag = PR_FALSE;
-@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
- void
- nsUniversalDetector::Reset()
- {
-+ mNbspFound = PR_FALSE;
- mDone = PR_FALSE;
- mBestGuess = -1; //illegal value as signal
- mInTag = PR_FALSE;
-@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
- PRUint32 i;
- for (i = 0; i < aLen; i++)
- {
-- /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
-+ /* If every other character is ASCII or 0xA0, we don't run charset
-+ * probers.
- * 0xA0 (NBSP in a few charset) is apparently a rare exception
-- * of non-ASCII character contained in ASCII text. */
-+ * of non-ASCII character often contained in nearly-ASCII text. */
- if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
- {
- /* We got a non-ASCII byte (high-byte) */
-@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
- }
- else
- {
-- //ok, just pure ascii so far
-- if ( ePureAscii == mInputState &&
-- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
-+ /* Just pure ASCII or NBSP so far. */
-+ if (aBuf[i] == '\xA0')
- {
-- //found escape character or HZ "~{"
-+ /* ASCII with the only exception of NBSP seems quite common.
-+ * I doubt it is really necessary to train a model here, so let's
-+ * just make an exception.
-+ */
-+ mNbspFound = PR_TRUE;
-+ }
-+ else if (mInputState == ePureAscii &&
-+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
-+ {
-+ /* We found an escape character or HZ "~{". */
- mInputState = eEscAscii;
- }
- mLastChar = aBuf[i];
-@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
- mDone = PR_TRUE;
- mDetectedCharset = mEscCharSetProber->GetCharSetName();
- }
-+ else if (mNbspFound)
-+ {
-+ mDetectedCharset = "ISO-8859-1";
-+ }
- else
- {
- /* ASCII with the ESC character (or the sequence "~{") is still
-@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
- break;
-
- default:
-- /* Pure ASCII */
-- mDetectedCharset = "ASCII";
-+ if (mNbspFound)
-+ {
-+ /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
-+ * (though it could have been any ISO-8859 encoding). */
-+ mDetectedCharset = "ISO-8859-1";
-+ }
-+ else
-+ {
-+ /* Pure ASCII */
-+ mDetectedCharset = "ASCII";
-+ }
- break;
- }
- return NS_OK;
-diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
-index 4d9b460..9f0a4b1 100644
---- a/src/nsUniversalDetector.h
-+++ b/src/nsUniversalDetector.h
-@@ -72,6 +72,7 @@ protected:
- virtual void Report(const char* aCharset) = 0;
- virtual void Reset();
- nsInputState mInputState;
-+ PRBool mNbspFound;
- PRBool mDone;
- PRBool mInTag;
- PRBool mStart;
diff --git a/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch b/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch
deleted file mode 100644
index b1ed88991c..0000000000
--- a/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55
-Author: Jehan <jehan@girinstud.io>
-Date: Tue Dec 15 21:40:16 2015 +0100
-
- app: package name wrong in CMakeLists.txt.
-
- Probably coming from a copy-paste error when the build system was
- originally created.
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 0b65c49..4f279e1 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -1,6 +1,6 @@
- ######## Project settings
- cmake_minimum_required(VERSION 2.8)
--set (PACKAGE_NAME opencc)
-+set (PACKAGE_NAME uchardet)
- project (${PACKAGE_NAME} CXX C)
- enable_testing()
-
-@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR)
- set (DIR_ETC ${SYSCONF_INSTALL_DIR})
- endif (DEFINED SYSCONF_INSTALL_DIR)
-
--set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc)
-+set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet)
- set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale)
-
- ######## Configuration