Some time ago I wrote a tutorial on how to install tesseract for Python on Ubuntu 14.04.
Today I ran into new challenges while installing tesseract and python-tesseract on Ubuntu 15.10, so here is how I got it working.
user@server:~$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=15.10
DISTRIB_CODENAME=wily
DISTRIB_DESCRIPTION="Ubuntu 15.10"
Install packages
sudo apt-get install python-distutils-extra tesseract-ocr tesseract-ocr-eng libopencv-dev libtesseract-dev libleptonica-dev python-all-dev swig libcv-dev python-opencv python-numpy python-setuptools build-essential subversion git
sudo apt-get install autoconf automake libtool
sudo apt-get install libpng12-dev libjpeg62-dev libtiff4-dev zlib1g-dev
Download leptonica
wget http://www.leptonica.com/source/leptonica-1.73.tar.gz
tar xvf leptonica-1.73.tar.gz
build it
cd leptonica-1.73
./configure
make
make install
Download tesseract-ocr
wget https://github.com/tesseract-ocr/tesseract/archive/3.04.00.tar.gz
tar xvf 3.04.00.tar.gz
cd tesseract-3.04.00
./autogen.sh
./configure
make
sudo make install
sudo ldconfig
And test:
user@server:~$ tesseract
Usage:
tesseract imagename|stdin outputbase|stdout [options...] [configfile...]
Check out `python-tesseract`
git clone https://bitbucket.org/3togo/python-tesseract.git
The "baseapi_mini.h" file in the ./python-tesseract/src/ folder needs to be updated:
....
class MutableIterator;
line: 85
class TessResultRenderer;
....
line 316:
//void SetImage(const Pix* pix);
void SetImage(Pix* pix);
line 477:
/*
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out);
*/
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
line 493:
/*
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out);
*/
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
The "main.cpp" file in the ./python-tesseract/src/ folder needs to be updated:
line: 15
#include "renderer.h"
line: 64
char* ProcessPagesWrapper(const char* image,tesseract::TessBaseAPI* api) {
const char *data = "";
tesseract::TessTextRenderer renderer(data);
api->ProcessPages(image, NULL, 0, &renderer);
return api->GetUTF8Text();
}
line: 73
char* ProcessPagesPix(const char* image,tesseract::TessBaseAPI* api) {
const char *data = "";
tesseract::TessTextRenderer renderer(data);
int page=0;
Pix *pix;
pix = pixRead(image);
api->ProcessPage(pix, page, NULL, NULL, 0, &renderer);
free(pix->data);
free(pix->text);
return api->GetUTF8Text();
}
line: 86
char* ProcessPagesFileStream(const char* image,tesseract::TessBaseAPI* api) {
Pix *pix;
const char *data = "";
tesseract::TessTextRenderer renderer(data);
int page=0;
FILE *fp=fopen(image,"rb");
pix=pixReadStream(fp,0);
fclose(fp);
api->ProcessPage(pix, page, NULL, NULL, 0, &renderer);
free(pix->data);
free(pix->text);
return api->GetUTF8Text();
}
line 107:
char* ProcessPagesBuffer(char* buffer, int fileLen, tesseract::TessBaseAPI* api) {
FILE *stream;
stream=fmemopen((void*)buffer,fileLen,"rb");
if (stream == NULL) {
puts("cant't open stream using fmemopen");
return (char*)"Error";
}
Pix *pix;
int page=0;
const char *data = "";
tesseract::TessTextRenderer renderer(data);
pix=pixReadStream(stream,0);
if (stream != NULL)
fclose(stream);
api->ProcessPage(pix, page, NULL, NULL, 0, &renderer);
free(pix->data);
free(pix->text);
return api->GetUTF8Text();
}
and build it:
python setup.py clean
python setup.py build
sudo python setup.py install
After that try to run your python example.
If you get an error like this:
Error opening data file ./tessdata/eng.traineddata
Please make sure the TESSDATA_PREFIX environment variable is set to the parent directory of your "tessdata" directory.
Failed loading language 'eng'
Tesseract couldn't load any languages!
AdaptedTemplates != NULL:Error:Assert failed:in file adaptmatch.cpp, line 174
Segmentation fault (core dumped)
You can fix it by patching the "mainblk.cpp" file inside the tesseract-3.04.00/ccutil/ folder as follows:
Find this code in the "mainblk.cpp" file:
if (argv0 != NULL) {
datadir = argv0;
} else {
if (getenv("TESSDATA_PREFIX")) {
datadir = getenv("TESSDATA_PREFIX");
} else {
#ifdef TESSDATA_PREFIX
#define _STR(a) #a
#define _XSTR(a) _STR(a)
datadir = _XSTR(TESSDATA_PREFIX);
#undef _XSTR
#undef _STR
#endif
}
}
// insert code here
// datadir may still be empty:
if (datadir.length() == 0) {
datadir = "./";
Add the following code at the "insert code here" marker:
// Use the TESSDATA_PREFIX environment variable as the data directory when it
// is set.
if (getenv("TESSDATA_PREFIX")) {
datadir = getenv("TESSDATA_PREFIX");
} else {
// check dir with tessdata
// If /usr/share/tesseract-ocr/tessdata exists and is a directory, use its
// parent as the data directory; otherwise datadir is left unchanged.
struct stat sb;
if (stat("/usr/share/tesseract-ocr/tessdata", &sb) == 0 && S_ISDIR(sb.st_mode)) {
datadir = "/usr/share/tesseract-ocr";
}
}
and add the following include:
#include <sys/stat.h>
Rebuild and reinstall tesseract-ocr:
cd tesseract-3.04.00
make
sudo make install
After that, the TESSDATA_PREFIX environment variable is used if it is set; otherwise, if /usr/share/tesseract-ocr/ contains a tessdata folder with files, that folder is used; otherwise the directory of your Python example module (./) is checked for a tessdata folder.
Test the installed python-tesseract using the tests in the test folder:
user@server:~/python-tesseract/src/test$ python test.py
result(ProcessPagesWrapper)= The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
fiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom répida
salta sobre 0 C50 preguieoso.
result(ProcessPagesFileStream)= The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
fiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom répida
salta sobre 0 C50 preguicoso.
size=156302
retStr length=422
result(ProcessPagesRaw) The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
iiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom rapida
salta sobre 0 C50 preguicoso.
len=156302
result(ProcessPagesBuffer)= The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
iiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom rapida
salta sobre 0 C50 preguicoso.
user@server:~/python-tesseract/src/test$ python test2.py
The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
fiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom répida
salta sobre 0 C50 preguieoso.
88
Today I ran into new challenges while installing tesseract and python-tesseract on Ubuntu 15.10, so here is how I got it working.
user@server:~$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=15.10
DISTRIB_CODENAME=wily
DISTRIB_DESCRIPTION="Ubuntu 15.10"
Install packages
sudo apt-get install python-distutils-extra tesseract-ocr tesseract-ocr-eng libopencv-dev libtesseract-dev libleptonica-dev python-all-dev swig libcv-dev python-opencv python-numpy python-setuptools build-essential subversion git
sudo apt-get install autoconf automake libtool
sudo apt-get install libpng12-dev libjpeg62-dev libtiff4-dev zlib1g-dev
Download leptonica
wget http://www.leptonica.com/source/leptonica-1.73.tar.gz
tar xvf leptonica-1.73.tar.gz
build it
cd leptonica-1.73
./configure
make
make install
Download tesseract-ocr
wget https://github.com/tesseract-ocr/tesseract/archive/3.04.00.tar.gz
tar xvf 3.04.00.tar.gz
cd tesseract-3.04.00
./autogen.sh
./configure
make
sudo make install
sudo ldconfig
And test:
user@server:~$ tesseract
Usage:
tesseract imagename|stdin outputbase|stdout [options...] [configfile...]
Check out `python-tesseract`
git clone https://bitbucket.org/3togo/python-tesseract.git
The "baseapi_mini.h" file in the ./python-tesseract/src/ folder needs to be updated:
....
class MutableIterator;
line: 85
class TessResultRenderer;
....
line 316:
//void SetImage(const Pix* pix);
void SetImage(Pix* pix);
line 477:
/*
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out);
*/
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
line 493:
/*
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out);
*/
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
The "main.cpp" file in the ./python-tesseract/src/ folder needs to be updated:
line: 15
#include "renderer.h"
line: 64
char* ProcessPagesWrapper(const char* image,tesseract::TessBaseAPI* api) {
const char *data = "";
tesseract::TessTextRenderer renderer(data);
api->ProcessPages(image, NULL, 0, &renderer);
return api->GetUTF8Text();
}
line: 73
char* ProcessPagesPix(const char* image,tesseract::TessBaseAPI* api) {
const char *data = "";
tesseract::TessTextRenderer renderer(data);
int page=0;
Pix *pix;
pix = pixRead(image);
api->ProcessPage(pix, page, NULL, NULL, 0, &renderer);
free(pix->data);
free(pix->text);
return api->GetUTF8Text();
}
line: 86
char* ProcessPagesFileStream(const char* image,tesseract::TessBaseAPI* api) {
Pix *pix;
const char *data = "";
tesseract::TessTextRenderer renderer(data);
int page=0;
FILE *fp=fopen(image,"rb");
pix=pixReadStream(fp,0);
fclose(fp);
api->ProcessPage(pix, page, NULL, NULL, 0, &renderer);
free(pix->data);
free(pix->text);
return api->GetUTF8Text();
}
line 107:
char* ProcessPagesBuffer(char* buffer, int fileLen, tesseract::TessBaseAPI* api) {
FILE *stream;
stream=fmemopen((void*)buffer,fileLen,"rb");
if (stream == NULL) {
puts("cant't open stream using fmemopen");
return (char*)"Error";
}
Pix *pix;
int page=0;
const char *data = "";
tesseract::TessTextRenderer renderer(data);
pix=pixReadStream(stream,0);
if (stream != NULL)
fclose(stream);
api->ProcessPage(pix, page, NULL, NULL, 0, &renderer);
free(pix->data);
free(pix->text);
return api->GetUTF8Text();
}
and build it:
python setup.py clean
python setup.py build
sudo python setup.py install
After that try to run your python example.
If you get an error like this:
Error opening data file ./tessdata/eng.traineddata
Please make sure the TESSDATA_PREFIX environment variable is set to the parent directory of your "tessdata" directory.
Failed loading language 'eng'
Tesseract couldn't load any languages!
AdaptedTemplates != NULL:Error:Assert failed:in file adaptmatch.cpp, line 174
Segmentation fault (core dumped)
You can fix it by patching the "mainblk.cpp" file inside the tesseract-3.04.00/ccutil/ folder as follows:
Find this code in the "mainblk.cpp" file:
if (argv0 != NULL) {
datadir = argv0;
} else {
if (getenv("TESSDATA_PREFIX")) {
datadir = getenv("TESSDATA_PREFIX");
} else {
#ifdef TESSDATA_PREFIX
#define _STR(a) #a
#define _XSTR(a) _STR(a)
datadir = _XSTR(TESSDATA_PREFIX);
#undef _XSTR
#undef _STR
#endif
}
}
// insert code here
// datadir may still be empty:
if (datadir.length() == 0) {
datadir = "./";
Add the following code at the "insert code here" marker:
// Use the TESSDATA_PREFIX environment variable as the data directory when it
// is set.
if (getenv("TESSDATA_PREFIX")) {
datadir = getenv("TESSDATA_PREFIX");
} else {
// check dir with tessdata
// If /usr/share/tesseract-ocr/tessdata exists and is a directory, use its
// parent as the data directory; otherwise datadir is left unchanged.
struct stat sb;
if (stat("/usr/share/tesseract-ocr/tessdata", &sb) == 0 && S_ISDIR(sb.st_mode)) {
datadir = "/usr/share/tesseract-ocr";
}
}
and add the following include:
#include <sys/stat.h>
Rebuild and reinstall tesseract-ocr:
cd tesseract-3.04.00
make
sudo make install
After that, the TESSDATA_PREFIX environment variable is used if it is set; otherwise, if /usr/share/tesseract-ocr/ contains a tessdata folder with files, that folder is used; otherwise the directory of your Python example module (./) is checked for a tessdata folder.
Test the installed python-tesseract using the tests in the test folder:
user@server:~/python-tesseract/src/test$ python test.py
result(ProcessPagesWrapper)= The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
fiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom répida
salta sobre 0 C50 preguieoso.
result(ProcessPagesFileStream)= The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
fiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom répida
salta sobre 0 C50 preguicoso.
size=156302
retStr length=422
result(ProcessPagesRaw) The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
iiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom rapida
salta sobre 0 C50 preguicoso.
len=156302
result(ProcessPagesBuffer)= The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
iiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom rapida
salta sobre 0 C50 preguicoso.
user@server:~/python-tesseract/src/test$ python test2.py
The (quick) [brown] {fox} jumps!
Over the $43,456.78 <lazy> #90 dog
& duck/goose, as 12.5% of E-mail
from aspammer@website.com is spam.
Der ,,schnelle” braune Fuchs springt
fiber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra i] cane pigro. El zorro
marrén répido salta sobre el perro
perezoso. A raposa marrom répida
salta sobre 0 C50 preguieoso.
88
No comments:
Post a Comment