summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPierre Cazenave <pwcazenave at gmail {dot} com>2010-07-30 02:46:02 -0500
committerErik Hanson <erik@slackbuilds.org>2010-07-31 22:31:51 -0500
commite502945912c3ccc6d55a6819bc921cf5f47cc4fd (patch)
tree5c008c3bf671002bdedefa3f8a7fde59d88bb113
parent43cc5518b42cbafb28111cfa607ce8a50e64bb6a (diff)
downloadslackbuilds-e502945912c3ccc6d55a6819bc921cf5f47cc4fd.tar.gz
graphics/ocropus: Added (document analysis and OCR system)
Signed-off-by: Robby Workman <rworkman@slackbuilds.org>
-rw-r--r--graphics/ocropus/README9
-rw-r--r--graphics/ocropus/ocrodata-env.diff15
-rw-r--r--graphics/ocropus/ocropus.SlackBuild109
-rw-r--r--graphics/ocropus/ocropus.info10
-rw-r--r--graphics/ocropus/ocroscript.143
-rw-r--r--graphics/ocropus/slack-desc19
-rw-r--r--graphics/ocropus/usr-local.diff22
7 files changed, 227 insertions, 0 deletions
diff --git a/graphics/ocropus/README b/graphics/ocropus/README
new file mode 100644
index 0000000000..804acf85ff
--- /dev/null
+++ b/graphics/ocropus/README
@@ -0,0 +1,9 @@
+OCRopus is a state-of-the-art document analysis and OCR system, featuring
+pluggable layout analysis, pluggable character recognition, statistical
+natural language modeling, and multi-lingual capabilities.
+
+The system is being developed with the generous support from Google and
+other organizations; the primary developers are at the IUPR Research
+Group at the DFKI Research Center.
+
+This requires tesseract and iulib.
diff --git a/graphics/ocropus/ocrodata-env.diff b/graphics/ocropus/ocrodata-env.diff
new file mode 100644
index 0000000000..04cfd5d9af
--- /dev/null
+++ b/graphics/ocropus/ocrodata-env.diff
@@ -0,0 +1,15 @@
+Description: Respect the OCRODATA environment variable for all lua scripts.
+Author: Jakub Wilk <jwilk@debian.org>
+
+Index: ocropus-0.3.1/ocroscript/ocrotoplevel.cc
+===================================================================
+--- ocropus-0.3.1.orig/ocroscript/ocrotoplevel.cc 2009-11-26 18:47:54.000000000 +0100
++++ ocropus-0.3.1/ocroscript/ocrotoplevel.cc 2009-11-26 18:47:54.000000000 +0100
+@@ -471,6 +471,7 @@
+ lua_call(L, 0, 0);
+
+ // handle OCRODATA environment variable as a directory
++ if(getenv("OCRODATA")) ocrodata = getenv("OCRODATA");
+ lua_pushstring(L, ocrodata);
+ lua_setglobal(L, "ocrodata");
+
diff --git a/graphics/ocropus/ocropus.SlackBuild b/graphics/ocropus/ocropus.SlackBuild
new file mode 100644
index 0000000000..e8c2ce60b0
--- /dev/null
+++ b/graphics/ocropus/ocropus.SlackBuild
@@ -0,0 +1,109 @@
+#!/bin/sh
+
+# Slackware build script for OCRopus.
+
+# Copyright 2010 Pierre Cazenave <pwcazenave {at} gmail [dot] com>
+# All rights reserved.
+#
+# Redistribution and use of this script, with or without modification, is
+# permitted provided that the following conditions are met:
+#
+# 1. Redistributions of this script must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PRGNAM=ocropus
+VERSION=${VERSION:-0.3.1}
+BUILD=${BUILD:-1}
+TAG=${TAG:-_SBo}
+
+DIRVER=${DIRVER:-0.3} # version used in the unpacked source directory name ($PRGNAM-$DIRVER)
+
+if [ -z "$ARCH" ]; then # autodetect only when ARCH was not supplied by the caller
+ case "$( uname -m )" in
+ i?86) ARCH=i486 ;;
+ arm*) ARCH=arm ;;
+ *) ARCH=$( uname -m ) ;;
+ esac
+fi
+
+CWD=$(pwd)
+TMP=${TMP:-/tmp/SBo}
+PKG=$TMP/package-$PRGNAM # staging root; "make install" is pointed here via DESTDIR
+OUTPUT=${OUTPUT:-/tmp}
+
+if [ "$ARCH" = "i486" ]; then
+ SLKCFLAGS="-O2 -march=i486 -mtune=i686"
+ LIBDIRSUFFIX=""
+elif [ "$ARCH" = "i686" ]; then
+ SLKCFLAGS="-O2 -march=i686 -mtune=i686"
+ LIBDIRSUFFIX=""
+elif [ "$ARCH" = "x86_64" ]; then
+ SLKCFLAGS="-O2 -fPIC"
+ LIBDIRSUFFIX="64" # appended to /usr/lib in the --libdir argument below
+else
+ SLKCFLAGS="-O2"
+ LIBDIRSUFFIX=""
+fi
+
+set -e # from here on, stop at the first command that fails
+
+rm -rf $PKG
+mkdir -p $TMP $PKG $OUTPUT
+cd $TMP
+rm -rf $PRGNAM-$DIRVER # clear any existing source tree before unpacking
+tar xvf $CWD/$PRGNAM-$VERSION.tar.gz
+cd $PRGNAM-$DIRVER
+chown -R root:root .
+chmod -R u+w,go+r-w,a-s .
+
+# Debian patch to fix hardcoded /usr/local paths in some source files
+patch -p1 < $CWD/usr-local.diff
+# Debian patch to fix behaviour of the OCRODATA environment variable
+patch -p1 < $CWD/ocrodata-env.diff
+
+CFLAGS="$SLKCFLAGS" \
+CXXFLAGS="$SLKCFLAGS" \
+./configure \
+ --prefix=/usr \
+ --sysconfdir=/etc \
+ --localstatedir=/var \
+ --libdir=/usr/lib${LIBDIRSUFFIX} \
+ --mandir=/usr/man \
+ --docdir=/usr/doc/$PRGNAM-$VERSION \
+ --with-tesseract=/usr \
+ --with-iulib=/usr \
+ --without-fst \
+ --without-SDL \
+ --without-leptonica \
+ --build=$ARCH-slackware-linux
+
+make
+make install DESTDIR=$PKG
+
+find $PKG | xargs file | grep -e "executable" -e "shared object" | grep ELF \
+ | cut -f 1 -d : | xargs strip --strip-unneeded 2> /dev/null || true # best effort: strip failures are ignored
+
+# Add Debian's manpage
+mkdir -p $PKG/usr/man/man1
+gzip -9c $CWD/ocroscript.1 > $PKG/usr/man/man1/ocroscript.1.gz
+
+mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION
+cp -a CHANGES COPYING DIRS INSTALL README $PKG/usr/doc/$PRGNAM-$VERSION
+cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM.SlackBuild # ship a copy of this build script with the docs
+
+mkdir -p $PKG/install
+cat $CWD/slack-desc > $PKG/install/slack-desc
+
+cd $PKG
+/sbin/makepkg -l y -c n $OUTPUT/$PRGNAM-$VERSION-$ARCH-$BUILD$TAG.${PKGTYPE:-tgz}
diff --git a/graphics/ocropus/ocropus.info b/graphics/ocropus/ocropus.info
new file mode 100644
index 0000000000..a38b5bc680
--- /dev/null
+++ b/graphics/ocropus/ocropus.info
@@ -0,0 +1,10 @@
+PRGNAM="ocropus"
+VERSION="0.3.1"
+HOMEPAGE="http://sites.google.com/site/ocropus/"
+DOWNLOAD="http://ocropus.googlecode.com/files/ocropus-0.3.1.tar.gz"
+MD5SUM="2a1b66419ae69ef031d5e6269db15bb5"
+DOWNLOAD_x86_64=""
+MD5SUM_x86_64=""
+MAINTAINER="Pierre Cazenave"
+EMAIL="pwcazenave < at > gmail {dot} com"
+APPROVED="rworkman"
diff --git a/graphics/ocropus/ocroscript.1 b/graphics/ocropus/ocroscript.1
new file mode 100644
index 0000000000..d8087203f7
--- /dev/null
+++ b/graphics/ocropus/ocroscript.1
@@ -0,0 +1,43 @@
+.TH ocroscript 1 "June 06, 2008"
+.SH NAME
+ocroscript \- command line OCR tool
+.SH SYNOPSIS
+.B ocroscript
+.RI "<script> <arguments>"
+.SH DESCRIPTION
+You can see a list of all available commands by looking in the $OCROSCRIPTS
+(/usr/share/ocropus/scripts/ by default) path.
+.PP
+The \(oqrecognize\(cq script uses tesseract for recognition and sends the html-based hOCR
+output to stdout. Tesseract is probably the most mature text recognizer within
+OCRopus at the moment. Natively, Tesseract doesn't do layout analysis, but
+combined with OCRopus, it makes for a pretty good OCR system:
+.RS
+$ ocroscript recognize page.png > page.html
+.RE
+.PP
+Here is a brief summary of the remaining command line commands available.
+You will need to look at the script to see what the command line arguments are:
+.TP
+degrade.lua
+Simple document image degradation
+.TP
+hocr-to-text.lua
+Convert hOCR output to plain text.
+.TP
+line-clean.lua
+Given a line image, remove marginal noise and fix some other problems.
+.TP
+sauvola.lua
+Perform Sauvola thresholding.
+.SH SEE ALSO
+.BR tesseract (1)
+.br
+.PP
+.UR http://code.google.com/p/ocropus/w/list
+.UE
+.SH AUTHOR
+ocroscript was written by Thomas Breuel.
+.PP
+This manual page was written by Jeffrey Ratcliffe <Jeffrey.Ratcliffe@gmail.com>,
+for the Debian project (but may be used by others).
diff --git a/graphics/ocropus/slack-desc b/graphics/ocropus/slack-desc
new file mode 100644
index 0000000000..00aef62c44
--- /dev/null
+++ b/graphics/ocropus/slack-desc
@@ -0,0 +1,19 @@
+# HOW TO EDIT THIS FILE:
+# The "handy ruler" below makes it easier to edit a package description. Line
+# up the first '|' above the ':' following the base package name, and the '|'
+# on the right side marks the last column you can put a character in. You must
+# make exactly 11 lines for the formatting to be correct. It's also
+# customary to leave one space after the ':'.
+
+ |-----handy-ruler------------------------------------------------------|
+ocropus: OCRopus (document analysis and OCR system)
+ocropus:
+ocropus: OCRopus(tm) is a state-of-the-art document analysis and OCR system
+ocropus: featuring pluggable layout analysis, pluggable character recognition,
+ocropus: statistical natural language modeling, and multi-lingual capabilities.
+ocropus:
+ocropus: The system is being developed with the generous support from Google
+ocropus: and other organizations; the primary developers are at the IUPR
+ocropus: Research Group at the DFKI Research Center.
+ocropus:
+ocropus: http://sites.google.com/site/ocropus/
diff --git a/graphics/ocropus/usr-local.diff b/graphics/ocropus/usr-local.diff
new file mode 100644
index 0000000000..0a17478e45
--- /dev/null
+++ b/graphics/ocropus/usr-local.diff
@@ -0,0 +1,22 @@
+Description:
+ Use /usr/share/ocropus/scripts/ and /usr/share/ocropus/ as defaults for
+ OCROSCRIPTS and OCRODATA.
+Author: Jakub Wilk <jwilk@debian.org>
+
+Index: ocropus-0.3.1/ocroscript/ocrotoplevel.cc
+===================================================================
+--- ocropus-0.3.1.orig/ocroscript/ocrotoplevel.cc 2009-11-26 16:56:18.000000000 +0100
++++ ocropus-0.3.1/ocroscript/ocrotoplevel.cc 2009-11-26 17:16:32.000000000 +0100
+@@ -68,10 +68,10 @@
+
+ // FIXME the Jamfile isn't passing this flag, so for now, this is a workaround
+ #ifndef OCROSCRIPTS
+-#define OCROSCRIPTS "/usr/local/share/ocropus/scripts/"
++#define OCROSCRIPTS "/usr/share/ocropus/scripts/"
+ #endif
+ #ifndef OCRODATA
+-#define OCRODATA "/usr/local/share/ocropus/"
++#define OCRODATA "/usr/share/ocropus/"
+ #endif
+
+ const char *ocroscripts = OCROSCRIPTS;