Skip to content
Commits on Source (38)
ocrizer (1.0.0) jessie; urgency=medium
[ Colomban Wendling ]
* ocrize.py: fix error recovery when converting a PDF to an image fails.
* ocrengines/finereader_ocrizer.py: add post-processing to make the
output ODF files more readable with Orca.
* Fix converting TXT to ODT when LibreOffice is already running.
This fixes the Tesseract backend with the default option when
LibreOffice is already running.
* ocrengines/finereader_ocrizer.py: Fix handling of improperly oriented
PDFs.
* po/fr.po: Update French translation.
[ Alex ARNAUD ]
* ocrize.py: display OCR engine on each call on the console, not only
when scanning a document
* ocrize.py: display an error to the user when it occurs to avoid
waiting indefinitely
* Make USB scanners have a priority over network ones
-- Alex ARNAUD <alexarnaud@hypra.fr> Tue, 09 Apr 2019 12:38:20 +0200
ocrizer (0.4.1) stretch; urgency=medium
[ Alex ARNAUD ]
* change the default desktop icon from the Mate generic one to printer
* Add the category Graphics to the desktop icon to make the software
discoverable on the Mate menu
-- Cyril Brulebois <cyril@debamax.com> Fri, 20 Apr 2018 01:45:41 +0200
ocrizer (0.4) jessie; urgency=medium
[ Alex ARNAUD ]
......@@ -26,6 +57,12 @@ ocrizer (0.4) jessie; urgency=medium
-- Cyril Brulebois <cyril@debamax.com> Tue, 09 Jan 2018 01:38:54 +0100
ocrizer (0.3.1) stretch; urgency=medium
* Rebuild within stretch.
-- Cyril Brulebois <cyril@debamax.com> Thu, 31 Aug 2017 22:55:21 +0200
ocrizer (0.3) jessie; urgency=medium
[ Alex ARNAUD ]
......
......@@ -62,6 +62,7 @@ Depends:
${misc:Depends},
python3,
ocrizer-common (= ${source:Version}),
python3-lxml,
Description: Finereader engine support for ocrizer
This package installs the glue between the ABBYY proprietary command line tool
and ocrizer, and makes finereader an available engine.
usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py
usr/share/ocrize/ocrengines/finereader/odf-post-process.xsl
#!/usr/bin/make -f
# GNU Make makefile to manage translations
#
# Targets:
#   all, update-po  merge the freshly extracted template into every <lang>.po
#   check           validate the catalogues with msgfmt -c
#   clean           remove the generated template

PACKAGE = ocrize
# Kept recursive on purpose: only the .pot recipe needs the version, so
# dpkg-parsechangelog is not run for targets such as "clean".
VERSION = $(shell dpkg-parsechangelog -l ../debian/changelog -S Version)
BUGMAIL = bugs@hypra.fr
LINGUAS = fr

XGETTEXT = xgettext
MSGMERGE = msgmerge
MSGFMT = msgfmt

# Expanded once (:=): the list is used both as prerequisites and in the
# recipe, so a recursive definition would run sed for each expansion.
POTFILES := $(shell sed 's,^,../,' POTFILES.in)
PO_FILES = $(LINGUAS:%=%.po)

# These names are commands, not files; without this a stray file called
# "check" or "clean" would silently disable the target.
.PHONY: all update-po check clean

all: update-po

update-po: $(PO_FILES)

# Extract translatable strings from every source listed in POTFILES.in.
$(PACKAGE).pot: POTFILES.in $(POTFILES)
	$(XGETTEXT) \
		--package-name="$(PACKAGE)" \
		--package-version="$(VERSION)" \
		--msgid-bugs-address="$(BUGMAIL)" \
		-o $@ \
		$(POTFILES)

# msgmerge -U leaves the catalogue untouched when nothing changed, which
# would keep it older than the template and re-trigger this rule on every
# run; bump the timestamp explicitly.
%.po: $(PACKAGE).pot
	$(MSGMERGE) -U $@ $<
	@touch $@

check: $(PO_FILES)
	$(MSGFMT) -c $^ -o /dev/null

clean:
	rm -f $(PACKAGE).pot
usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py
usr/lib/python3/dist-packages/ocrize/ocrengines/tesseract_ocrizer.py
usr/lib/python3/dist-packages/ocrize/ocrize.py
......@@ -5,136 +5,188 @@
#
msgid ""
msgstr ""
"Project-Id-Version: 0.2\n"
"Project-Id-Version: 0.4.2\n"
"Report-Msgid-Bugs-To: bugs@hypra.fr\n"
"POT-Creation-Date: 2016-05-02 16:51+0200\n"
"PO-Revision-Date: 2017-11-09 12:08+0100\n"
"Last-Translator: Alex ARNAUD <alexarnaud@hypra.fr>\n"
"POT-Creation-Date: 2018-12-05 09:59+0100\n"
"PO-Revision-Date: 2018-12-05 10:00+0100\n"
"Last-Translator: Colomban Wendling <cwendling@hypra.fr>\n"
"Language-Team: Hypra team <bugs@hypra.fr>\n"
"Language: French\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=2; plural=(n > 1);\n"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:60
msgid "Attempting to scan a new image"
msgstr "Démarrage de l'acquisition d'image"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:49
msgid "abbyy Finereader executable '/usr/local/bin/abbyyocr11' not available."
msgstr ""
"l'exécutable ABBYY Finereader « /usr/local/bin/abbyyocr11 » n'est pas "
"disponible."
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:50
msgid "abbyy Finereader executable not found"
msgstr "impossible de trouver l'exécutable ABBYY Finereader"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:81
#, python-brace-format
msgid "Failed to extract page count from PDF \"{0}\": {1}"
msgstr "Impossible d'extraire le nombre de pages du fichier PDF « {0} » : {1}"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:93
#, python-brace-format
msgid "File \"{0}\" has one page"
msgid_plural "File \"{0}\" has {1} pages"
msgstr[0] "Le fichier « {0} » a {1} page"
msgstr[1] "Le fichier « {0} » a {1} pages"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:96
#, python-brace-format
msgid "Not converting PDF \"{0}\" to TIFF because it has too many pages"
msgstr ""
"Le fichier PDF « {0} » ne sera pas converti en TIFF car il a trop de pages"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:77
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:78
msgid "Couldn't find any scanner device"
msgstr "Impossible de trouver un scanner"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:103
#, python-brace-format
msgid "Command for converting PDF to TIFF: {0}"
msgstr "Commande pour convertir le PDF en TIFF : {0}"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:81
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:82
msgid "Couldn't find an obvious scanner device"
msgstr "Aucun périphérique d'acquisition évident trouvé"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:107
#, python-brace-format
msgid ""
"Failed to convert PDF \"{0}\" to TIFF: falling back to using PDF directly"
msgstr ""
"Erreur lors de la conversion du PDF « {0} » en TIFF, utilisation du PDF "
"directement à la place"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:110
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:220
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:246
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:265
msgid "An internal error occurred"
msgstr "Une erreur interne s'est produite"
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:116
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:117
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:303
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:304
#, python-format
msgid "Output format %s is not supported"
msgstr "Format de sortie %s non supporté"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:86
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:125
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:312
#, python-format
msgid "Found %s as acquisition device"
msgstr "Trouvé le périférique d'acquisition %s"
msgid "Info: Converting output file %s to %s"
msgstr "Info: Conversion du fichier de sortie %s vers %s"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:94
#: ../usr/lib/python3/dist-packages/ocrize/ocrengines/finereader_ocrizer.py:130
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:317
#, python-format
msgid "Cannot set depth, defaulting to %d"
msgstr ""
"Impossible de paramétrer la profondeur de bits, utilisation du défaut %d"
msgid "Couldn't manage the output format %s"
msgstr "Impossible de gérer le format de sortie %s"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:99
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:106
msgid "Attempting to scan a new image"
msgstr "Démarrage de l'acquisition d'image"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:110
msgid "Chosen scanner : "
msgstr "Scanner sélectionné : "
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:113
#, python-format
msgid "Cannot set mode, defaulting to %s"
msgstr "Impossible de paramétrer le mode couleur, utilisation du défaut %s"
msgid "Found %s as acquisition device"
msgstr "Trouvé le périphérique d'acquisition %s"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:118
msgid "Mode \"color\" not accepted by the driver"
msgstr "Le mode « color » n'est pas accepté par le pilote"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:105
msgid "Cannot set scan area, using default"
msgstr "Impossible de paramétrer la zone de scan, utilisation du défaut"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:127
msgid "Resolution : "
msgstr "Résolution : "
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:108
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:130
msgid "Device parameters: "
msgstr "Paramètres du périférique: "
msgstr "Paramètres du périphérique : "
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:110
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:132
msgid "Scanning a document, please wait"
msgstr "Acquisition du document, merci de patienter"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:130
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:203
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:170
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:171
msgid "Couldn't find any scanner device"
msgstr "Impossible de trouver un scanner"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:174
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:175
msgid "Couldn't find an obvious scanner device"
msgstr "Aucun périphérique d'acquisition évident trouvé"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:192
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:297
#, python-format
msgid "Couldn't manage the input format %s"
msgstr "Impossible de gérer le format d'entrée %s"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:138
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:141
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:144
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:204
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:207
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:210
msgid "scanned_document_"
msgstr "document_scanne_"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:150
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:173
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:183
msgid "An internal error occured"
msgstr "Une erreur interne s'est produit"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:191
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:282
msgid "No input file."
msgstr "Pas de fichier d'entrée"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:192
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:283
msgid "please provide an input file to process"
msgstr "veuillez fournir un fichier d'entrée à traiter"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:209
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:210
#, python-format
msgid "Output format %s is not supported"
msgstr "Format de sortie %s non supporté"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:218
#, python-format
msgid "Info: Converting output file %s to %s"
msgstr "Info: Convertion du fichier de sortie %s vers %s"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:223
#, python-format
msgid "Couldn't manage the output format %s"
msgstr "Impossible de gérer le format de sortie %s"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:340
msgid "Opening libreoffice"
msgstr "Ouverture de LibreOffice"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:250
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:349
msgid "Starting Optical Character Recognition"
msgstr "Démarrage de la Reconnaissance Optique de Caractères"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:253
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:350
msgid "OCR Engine : "
msgstr "Moteur d'OCR : "
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:353
msgid "Treating input image, please wait"
msgstr "Traitement de l'image d'entrée, merci de patienter"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:255
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:355
msgid "Recognizing text from image, please wait"
msgstr "Reconnaissance de texte depuis l'image, merci de patienter"
#. sets self.output_files
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:257
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:358
msgid "Finalising document, please wait"
msgstr "Finalisation du document, merci de patienter"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:261
msgid "Opening libreoffice"
msgstr "Ouverture de LibreOffice"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:366
msgid "An error occurred during Optical Character Recognition"
msgstr ""
"Une erreur est survenue pendant la Reconnaissance Optique de Caractères"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:271
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:368
msgid "finished"
msgstr "Terminé"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:288
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:298
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:385
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:395
msgid "Couldn't find any OCR engine available, exiting"
msgstr "Impossible de trouver un moteur d'OCR disponible"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:304
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:401
msgid "Mate-accessibility OCR tool"
msgstr "Outil de reconnaissance de caractère de Mate-accessibility"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:304
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:401
msgid ""
"The mate-accessibility tool for OCRising documents. includes a scanner "
"function"
......@@ -142,15 +194,15 @@ msgstr ""
"Outil de reconnaissance de caractères (OCR) de mate-accessibility. Inclut "
"une fonction scanner"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:305
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:402
msgid "file(s)"
msgstr "fichier(s)"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:305
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:402
msgid "file(s) to run OCR against"
msgstr "fichier(s) à analyser"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:306
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:403
msgid ""
"scan mode. attempts to scan a file from available device. This option "
"ignores other input files"
......@@ -158,7 +210,7 @@ msgstr ""
"mode scanner. tente de scanner un document depuis les équipements existants. "
"Cette option ignorera les autres fichier(s) en entrée"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:307
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:404
msgid ""
"specify a scanner, understands sane-like devices, eg: 'genesys:"
"libusb:001:002'"
......@@ -166,39 +218,73 @@ msgstr ""
"spécifie un périphérique pour scanner. comprend les formats sane, ex: "
"'genesys:libusb:001:002'"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:308
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:405
msgid "attempt to scan from all scanners"
msgstr "tente de scanner depuis n'importe quel périphérique"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:309
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:406
msgid "List all availables scanner"
msgstr "Liste tous les scanners disponibles"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:407
msgid "scanner resolution"
msgstr "résolution du scanner"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:408
msgid "The OCR engine to run"
msgstr "Le moteur d'OCR à utiliser"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:309
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:408
msgid "engine"
msgstr "moteur"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:310
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:409
msgid "the langage/locale to concentrate research on. eg: en_US"
msgstr ""
"le langage (la locale) sur laquelle concentrer les recherches ex: fr_FR"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:311
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:410
msgid "the desired output format"
msgstr "le format de sortie désiré"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:312
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:411
msgid "batch mode, do not open libreoffice in the end"
msgstr "mode batch, ne pas ouvrir libreoffice à la fin de l'execution"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:313
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:412
msgid "verbosity level"
msgstr "niveau de verbosité"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:314
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:413
msgid "do not log to syslog"
msgstr "ne pas logger dans syslog"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:316
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:414
msgid "do not delete temporary files (useful for debugging)"
msgstr "ne pas supprimer les fichiers temporaires (utile pour le débogage)"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:415
msgid "do not delete use OCR engine (useful for debugging)"
msgstr "ne pas utiliser le moteur d'OCR (utile pour le débogage)"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:416
msgid "do not open LibreOffice"
msgstr "ne pas ouvrir LibreOffice"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:417
msgid "ocrized documents destination directory"
msgstr "dossier de destination des documents reconnus"
#: ../usr/lib/python3/dist-packages/ocrize/ocrize.py:418
msgid "do not post-process Finereader ODF files"
msgstr "ne pas post-traiter les fichiers ODF de Finereader"
#~ msgid "Cannot set scan area, using default"
#~ msgstr "Impossible de paramétrer la zone de scan, utilisation du défaut"
#~ msgid "Cannot set mode, defaulting to %s"
#~ msgstr "Impossible de paramétrer le mode couleur, utilisation du défaut %s"
#~ msgid "Cannot set depth, defaulting to %d"
#~ msgstr ""
#~ "Impossible de paramétrer la profondeur de bits, utilisation du défaut %d"
import os
list_mods = [i[:-3] for i in os.listdir('/usr/lib/python3/dist-packages/ocrize/ocrengines') if i.endswith(".py") and i != "__init__.py"]
import inspect
#base_path = '/usr/lib/python3/dist-packages/ocrize/ocrengines'
base_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
list_mods = [i[:-3] for i in os.listdir(base_path) if i.endswith(".py") and i != "__init__.py"]
__all__ = list_mods
......@@ -31,13 +31,17 @@ from ocrize.ocrize import OCRizer
import os
import tempfile
import subprocess
from zipfile import ZipFile
from shutil import copyfileobj
from lxml import etree
class FinereaderOCRizer(OCRizer):
engine_name = 'finereader'
priority = 2
def __init__(self, args):
OCRizer.__init__(self, args)
self.available_input_formats = [".pdf", ".png", ".jpeg", ".jpg", ".tiff", ".bmp", ".gif", ".dcx", ".jbig2"]
self.available_input_formats = [".tiff", ".png", ".jpeg", ".jpg", ".bmp", ".gif", ".dcx", ".jbig2"]
self.available_output_formats = ['.txt', '.odt', '.htmlversion10defaults', '.htmlunicodedefaults', '.pdf', '.rtf', '.docx', '.xls', '.xlsx', '.xml', '.pptx', '.textversion10defaults', '.textunicodedefaults', '.alto', '.epub', '.fb2', '.odt']
self.available_format_extentions = {'txt':'.txt', 'odt':'.odt', 'htmlversion10defaults':'.html', 'htmlunicodedefaults':'.html', 'pdf':'.pdf', 'rtf':'.rtf', 'docx':'.docx', 'xls':'.xls', 'xlsx':'.xlsx', 'xml':'.xml', 'pptx':'.pptx', 'textversion10defaults':'.txt', 'textunicodedefaults':'.txt', 'alto':'.alto', 'epub':'.epub', 'fb2':'.fb2'}
self.lang_mapping = {'fr_FR':'French', 'en_GB':'English', 'en_US':'English'}
......@@ -59,8 +63,54 @@ class FinereaderOCRizer(OCRizer):
tmp_file = os.path.join(tmp_dir, tmp_name)
self.logger.info("ORCizing file %s", fil)
subprocess.check_call(['abbyyocr11', '-ido', '-rl', self.lang_mapping[self.args.language], '-if', fil, '-f', target_format, '-of', tmp_file], stdout=subprocess.DEVNULL)
if target_format in ['odt'] and not self.args.finereader_no_postproc:
postproc_tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=self.available_format_extentions[target_format])
self._post_process_odf(tmp_file, postproc_tmp_file)
postproc_tmp_file.close()
self.temp_files.append(tmp_file)
tmp_file = postproc_tmp_file.name
self.output_files.append(tmp_file)
def _get_pdf_page_count(self, filename_pdf):
    """Return the page count of a PDF as reported by the ``pdfinfo`` tool.

    :param filename_pdf: path of the PDF file to inspect.
    :returns: the integer found on the first ``Pages:`` line of the
        ``pdfinfo`` output, or 0 when the count cannot be determined
        (tool missing or failing, no ``Pages:`` line, unparsable value).
        A warning is logged in that case.
    """
    try:
        output = subprocess.check_output(["pdfinfo", filename_pdf])
        # pdfinfo may echo non-ASCII metadata; only the "Pages:" line matters.
        for line in output.decode('ASCII', errors='ignore').splitlines():
            if line.startswith("Pages:"):
                return int(line.split(":")[1])
        # Previously an IndexError escaped here instead of the 0 fallback.
        raise ValueError('no "Pages:" line in pdfinfo output')
    except (OSError, subprocess.CalledProcessError, ValueError) as e:
        self.logger.warning(_('Failed to extract page count from PDF "{0}": {1}').format(filename_pdf, e))
        return 0
def pdf_extract_images(self, filename_pdf):
    """ We pretend not to support PDF input because it doesn't properly
    handle pages not properly oriented.  To work around that, we
    convert to TIFF, which doesn't show the issue.
    However, TIFF files can easily grow huge, so we limit the
    conversion to smallish page count, and hope for the best with
    higher page count.

    :param filename_pdf: path of the PDF to convert.
    :returns: path of the generated TIFF (also registered in
        ``self.temp_files``), or ``filename_pdf`` itself when the
        conversion is skipped or fails.
    :raises Exception: any unexpected error, re-raised after tagging it
        with a ``notification_message`` attribute.
    """
    page_count = self._get_pdf_page_count(filename_pdf)
    self.logger.debug(ngettext('File "{0}" has one page',
                               'File "{0}" has {1} pages', page_count).format(filename_pdf, page_count))
    # Only 1 to 9 pages are converted.  A count below 1 also lands here
    # and is reported with the same "too many pages" message.
    if page_count < 1 or page_count >= 10:
        self.logger.debug(_('Not converting PDF "{0}" to TIFF because it has too many pages').format(filename_pdf))
        return filename_pdf

    # Reserve a file name for the output; close the handle right away
    # instead of leaving it to the garbage collector.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.tiff') as tiff_handle:
        tiff_file = tiff_handle.name
    self.temp_files.append(tiff_file)
    try:
        cmd = ['convert', '-density', '150', filename_pdf, '-background', 'white', '-alpha', 'remove', '-depth', '8', '-compress', 'lzw', tiff_file]
        self.logger.debug(_('Command for converting PDF to TIFF: {0}').format(cmd))
        subprocess.check_call(cmd, stdout=subprocess.DEVNULL)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        self.logger.debug(str(e))
        # The placeholder was previously logged verbatim because
        # .format() was never applied to the message.
        self.logger.warning(_('Failed to convert PDF "{0}" to TIFF: falling back to using PDF directly').format(filename_pdf))
        return filename_pdf
    except Exception as e:
        e.notification_message = _("An internal error occurred")
        raise
    return tiff_file
def convert_output(self):
if '.' + self.args.output_ext not in self.available_output_formats:
e = Exception(_("Output format %s is not supported" %self.args.output_ext))
......@@ -80,3 +130,35 @@ class FinereaderOCRizer(OCRizer):
e.notification_message = _("Couldn't manage the output format %s" %ext.replace(".",""))
raise e
self.output_files = endlist
def _post_process_odf(self, src, dst):
    """Copy the ODF archive ``src`` to ``dst``, transforming ``content.xml``.

    ``content.xml`` is run through the ``odf-post-process.xsl`` stylesheet
    found in ``self.engine_datadir``; every other archive member is copied
    unchanged.  Messages left in the stylesheet's error log are forwarded
    to the debug log.

    :param src: ODF file to read (anything ``ZipFile`` accepts for reading).
    :param dst: destination (anything ``ZipFile`` accepts for writing).
    """
    def get_page_width(odf_zip, default='21cm'):
        """Return the page width declared in styles.xml, or ``default``."""
        try:
            styles_info = odf_zip.getinfo('styles.xml')
            with odf_zip.open(styles_info, 'r') as styles:
                tree = etree.parse(styles)
            nsmap = { 'fo': 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0',
                      'office':'urn:oasis:names:tc:opendocument:xmlns:office:1.0',
                      'style': 'urn:oasis:names:tc:opendocument:xmlns:style:1.0' }
            return tree.xpath('//style:page-layout[@style:name=//office:master-styles/style:master-page/@style:page-layout-name]/style:page-layout-properties/@fo:page-width', namespaces=nsmap)[0]
        except (KeyError, IndexError):
            # KeyError: the archive has no styles.xml.
            # IndexError: styles.xml has no matching page-width attribute;
            # this used to escape and abort the whole post-processing.
            return default

    xsl_path = os.path.join(self.engine_datadir, 'odf-post-process.xsl')
    with open(xsl_path, 'rb') as xsl:
        stylesheet = etree.XSLT(etree.parse(xsl))

    with ZipFile(src, 'r') as odf_in:
        with ZipFile(dst, 'w') as odf_out:
            for info in odf_in.infolist():
                with odf_in.open(info, 'r') as i:
                    if info.filename == 'content.xml':
                        parameters = {
                            'page-width': etree.XSLT.strparam(get_page_width(odf_in))
                        }
                        data = bytes(stylesheet(etree.parse(i), **parameters))
                        for e in stylesheet.error_log:
                            self.logger.debug('%s:%s:%s: %s' % (os.path.basename(xsl_path), e.line, e.column, e.message))
                        odf_out.writestr(info, data)
                    else:
                        odf_out.writestr(info, i.read())
......@@ -56,6 +56,8 @@ class OCRizer():
self.input_files = []
self.output_files = []
self.temp_files = []
self.datadir = os.environ.get('OCRIZE_DATADIR', '/usr/share/ocrize')
self.engine_datadir = os.path.join(self.datadir, 'ocrengines/%s' % self.engine_name)
def init_logger(self):
# list to convert verbose argument to the logging numeric level value
......@@ -101,7 +103,6 @@ class OCRizer():
if not self.args.scan_document:
return
self.logger.info(_('OCR Engine : ' + self.args.ocr_engine))
self.send_notification(_("Attempting to scan a new image"))
sane_version = sane.init()
......@@ -142,11 +143,7 @@ class OCRizer():
dev.close()
self.args.files = [self.scan_file] # overrides other input files.
def select_scanner(self):
devices = sane.get_devices()
self.list_scanners(devices)
def filter_scanner(self, devices):
chosen_dev = None
if self.args.chosen_scanner:
for device in devices:
......@@ -161,24 +158,37 @@ class OCRizer():
chosen_dev = device
break
return chosen_dev
def select_scanner(self):
self.list_scanners()
# check local devices
chosen_dev = self.filter_scanner(sane.get_devices(True))
if chosen_dev == None:
if self.args.all_scanners:
try:
chosen_dev = devices[0]
except IndexError as e:
e = Exception(_("Couldn't find any scanner device"))
e.notification_message = _("Couldn't find any scanner device")
# check all devices
devices = sane.get_devices()
chosen_dev = self.filter_scanner(devices)
if chosen_dev == None:
if self.args.all_scanners:
try:
chosen_dev = devices[0]
except IndexError as e:
e = Exception(_("Couldn't find any scanner device"))
e.notification_message = _("Couldn't find any scanner device")
raise e
else:
e = Exception(_("Couldn't find an obvious scanner device"))
e.notification_message = _("Couldn't find an obvious scanner device")
raise e
else:
e = Exception(_("Couldn't find an obvious scanner device"))
e.notification_message = _("Couldn't find an obvious scanner device")
raise e
return chosen_dev
def list_scanners(self, devices):
def list_scanners(self):
if not self.args.list_scanners:
return
devices = sane.get_devices()
for device in devices:
print (device)
sys.exit(0)
......@@ -209,14 +219,17 @@ class OCRizer():
return os.path.join(expended_output_dir, _("scanned_document_") + str(next_num).zfill(3))
def convert_txt_to_odt(self, filename):
    """Convert a text file to ODT with a headless LibreOffice run.

    A throw-away LibreOffice user profile is used (UserInstallation
    pointing at a fresh temporary directory) and removed afterwards.
    The ODT is written by ``soffice`` into the directory of ``filename``.

    :param filename: path of the ``.txt`` file to convert; it is added to
        ``self.temp_files`` for later cleanup.
    :returns: ``filename`` with its trailing ``.txt`` extension (any
        case) replaced by ``.odt``.
    :raises Exception: whatever the ``soffice`` invocation raised, tagged
        with a ``notification_message`` attribute.
    """
    temp_lo_user_dir = tempfile.mkdtemp()
    cmd_args = ["soffice", "-env:UserInstallation=file://" + temp_lo_user_dir, "--headless", "--convert-to", "odt", "--outdir", os.path.split(filename)[0], filename]
    if self.args.verbose >= 3:
        print(" ".join(cmd_args))
    try:
        subprocess.check_call(cmd_args, stdout=subprocess.DEVNULL)
    except Exception as e:
        e.notification_message = _("An internal error occurred")
        raise
    finally:
        shutil.rmtree(temp_lo_user_dir, ignore_errors=True)
    self.temp_files.append(filename) # add for cleanup
    # Anchor on the end of the name: the previous unanchored pattern also
    # rewrote ".txt" occurring in directory names or mid-name.  The raw
    # string avoids the invalid "\." escape.
    return re.sub(r"(?i)\.txt$", ".odt", filename)
......@@ -239,7 +252,7 @@ class OCRizer():
print("convert ", filename, " ", dest_file)
subprocess.check_call(["convert", filename, dest_file], stdout=subprocess.DEVNULL)
except Exception as e:
e.notification_message = _("An internal error occured")
e.notification_message = _("An internal error occurred")
raise e
return filename_pdf
......@@ -257,8 +270,8 @@ class OCRizer():
print(" ".join(cmd_args))
try:
subprocess.check_call(cmd_args, stdout=subprocess.DEVNULL)
except:
e.notification_message = _("An internal error occured")
except Exception as e:
e.notification_message = _("An internal error occurred")
raise e
pdf_images = sorted(glob.glob(tmp_basename_path + "-*.png"))
......@@ -343,6 +356,7 @@ class OCRizer():
def launch(self):
self.send_notification(_("Starting Optical Character Recognition"))
self.logger.info(_('OCR Engine : ') + self.args.ocr_engine)
try:
self.scan()
self.send_notification(_("Treating input image, please wait"))
......@@ -358,10 +372,7 @@ class OCRizer():
except Exception as e:
traceback.print_exc()
self.logger.critical(e)
try:
self.send_notification(e.notification_message)
except:
pass
self.send_notification(_("An error occurred during Optical Character Recognition"))
sys.exit(1)
self.logger.info(_("finished"))
......@@ -407,12 +418,13 @@ def parse_args(argv=None):
parser.add_argument('-l', '--langage', action='store', type=str, default=os.getenv("LANG").split('.')[0], dest="language", help=_("the langage/locale to concentrate research on. eg: en_US"), metavar="lang")
parser.add_argument("-o", "--output-format", action='store', type=str, default='odt', dest="output_ext", help=_("the desired output format"), metavar="format")
parser.add_argument("-b", "--batch", action='store_true', default=False, dest="batch_mode", help=_("batch mode, do not open libreoffice in the end"))
parser.add_argument("-v", "--verbose", action="store", type=int, default=5, dest="verbose", choices=range(6), help=_("verbosity level"))
parser.add_argument("-v", "--verbose", action="store", type=int, default=4, dest="verbose", choices=range(6), help=_("verbosity level"))
parser.add_argument("--no-syslog", action="store_false", default=True, dest="syslog", help=_("do not log to syslog"))
parser.add_argument("--no-delete", action='store_true', default=False, dest="tmp_no_delete", help=_("do not delete temporary files (useful for debugging)"))
parser.add_argument("--no-ocr", action='store_true', default=False, dest="no_ocr", help=_("do not delete use OCR engine (useful for debugging)"))
parser.add_argument("--no-open", action='store_true', default=False, dest="no_open", help=_("do not open LibreOffice"))
parser.add_argument("-d", "--destination", action="store", type=str, default=".", dest="output_dir", help=_("ocrized documents destination directory"))
parser.add_argument("--no-finereader-post-processing", action='store_true', default=False, dest="finereader_no_postproc", help=_("do not post-process Finereader ODF files"))
args = parser.parse_args(argv)
args.ocr_engine_class = OCREngines_available[args.ocr_engine] # pass the class inside the args
......@@ -422,7 +434,7 @@ def parse_args(argv=None):
def start(argv=None):
es = gettext.translation('ocrizer', fallback=True)
es.install()
es.install(names=['ngettext'])
args = parse_args(argv)
args.files = args.files[0]
......
......@@ -2,11 +2,10 @@
Version=1.0
Type=Application
Terminal=false
Categories=Graphics;
Name=scan and OCRize
Comment=scans a document and launches Optical character recognition
Exec=/usr/bin/ocrizer -s -o odt -d ~/Documents/
Icon=mate-application-generic
Icon=printer
Name[fr]=scanner et reconnaitre
Comment[fr]=scanner un document et lancer l'OCR
Exec[fr]=/usr/bin/ocrizer -s -l fr_FR -o odt -d ~/Documents/
Icon[fr]=mate-application-generic
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
xmlns:svg="urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"
xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:table="urn:oasis:names:tc:opendocument:xmlns:table:1.0"
xmlns:draw="urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
version="1.0">
<!-- Serialise the result as non-indented UTF-8 XML whose XML declaration
     carries standalone="yes". -->
<xsl:output
method="xml"
indent="no"
encoding="UTF-8"
standalone="yes"
/>
<!-- Width of the page, as a length carrying a "cm" suffix. It is used to
     compute the right margin of the sections that replace text frames: the
     "cm" suffix is stripped and the frame's width and x offset are subtracted
     from it. Defaults to 21cm; callers of the stylesheet can override it. -->
<xsl:param name="page-width" select="'21cm'" />
<!-- styles: copy the existing automatic styles and append the extra styles
     required by the sections and placeholder paragraphs generated when text
     frames are stripped -->
<xsl:template match="office:automatic-styles">
  <xsl:copy>
    <xsl:apply-templates select="@* | node()"/>

    <!-- Create a style for each section we create, based on the corresponding
         frame properties. The style name is built from generate-id() of the
         frame so it matches the name referenced by the generated
         text:section.
         The right margin is page-width - frame width - frame x offset; the
         arithmetic strips a "cm" suffix, so it only yields a usable value
         when all three lengths are expressed in centimetres. -->
    <xsl:message>Using page-width=<xsl:value-of select="$page-width"/></xsl:message>
    <xsl:for-each select="//draw:frame[draw:text-box]">
      <xsl:element name="style:style">
        <xsl:attribute name="style:family">section</xsl:attribute>
        <xsl:attribute name="style:name">
          <xsl:value-of select="concat('PlaceholderSection', generate-id(.))"/>
        </xsl:attribute>
        <xsl:element name="style:section-properties">
          <xsl:attribute name="fo:margin-left"><xsl:value-of select="@svg:x"/></xsl:attribute>
          <xsl:attribute name="fo:margin-right"><xsl:value-of select="substring-before($page-width, 'cm') - substring-before(@svg:width, 'cm') - substring-before(@svg:x, 'cm')"/>cm</xsl:attribute>
        </xsl:element>
      </xsl:element>
    </xsl:for-each>

    <!-- Create a derived style for each paragraph we strip from around a frame,
         which should be identical but for having a very small line height.
         It's not 0, otherwise if it's the style of the first paragraph in the
         document, arrowing up moves inside the header for some reason, so make
         it simply "very small" (0.001cm and above works, 0.0001cm doesn't).
         It's likely because under a certain size libreoffice considers it
         invisible, and this triggers a bug of some kind.

         Several paragraphs may share the same style name: the derived style
         is emitted only for the first paragraph (in document order) using a
         given name, so no two generated styles carry the same style:name.
         Paragraphs without a text:style-name are skipped, as nothing would
         ever reference a style derived from them. -->
    <xsl:for-each select="(//text:p | //text:h)[draw:frame][@text:style-name]">
      <xsl:variable name="style-name" select="@text:style-name"/>
      <!-- an earlier paragraph is either on the preceding axis or, for nested
           frames, on the ancestor axis -->
      <xsl:if test="not((preceding::text:p | preceding::text:h | ancestor::text:p | ancestor::text:h)[draw:frame][@text:style-name = $style-name])">
        <xsl:element name="style:style">
          <xsl:attribute name="style:family">paragraph</xsl:attribute>
          <xsl:attribute name="style:name">
            <xsl:value-of select="concat('Placeholder', $style-name)"/>
          </xsl:attribute>
          <xsl:attribute name="style:parent-style-name">
            <xsl:value-of select="$style-name"/>
          </xsl:attribute>
          <!-- copy the original style:master-page-name attribute -->
          <xsl:copy-of select="//office:automatic-styles/style:style[@style:name=$style-name]/@style:master-page-name"/>
          <xsl:element name="style:paragraph-properties">
            <xsl:attribute name="fo:line-height">0.001cm</xsl:attribute>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:for-each>
  </xsl:copy>
</xsl:template>
<!-- style: graphic properties using the "run-through" wrap mode make images
     and text overlap; rewrite them with the "dynamic" wrap mode, which does
     what we want. -->
<xsl:template match="office:automatic-styles/style:style/style:graphic-properties[@style:wrap='run-through']">
  <xsl:copy>
    <!-- the replacement wrap mode -->
    <xsl:attribute name="style:wrap">dynamic</xsl:attribute>
    <!-- every other attribute is processed normally -->
    <xsl:apply-templates select="@*[not(local-name() = 'wrap' and namespace-uri() = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0')]"/>
    <!-- then the children -->
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
<!-- Strip empty paragraphs that are likely here only to account for some
     frame sizes and will increase the page contents size for no good reason
     once the frames are stripped.
     A paragraph is considered empty when it has no child element AND no
     non-blank text: a paragraph holding text directly (without any wrapping
     element) carries real content and must be kept.
     The first child of its parent is never stripped. A comment describing
     the removed paragraph is left in its place. -->
<xsl:template match="text:p[not(*) and normalize-space() = ''][preceding-sibling::*]">
  <xsl:comment>Stripped empty &lt;<xsl:value-of select="name()"/> text:style-name="<xsl:value-of select="@text:style-name"/>"&gt;</xsl:comment>
</xsl:template>
<!-- Moves the children of a text:p (or text:h) containing a draw:frame with a
     draw:text-box outside of itself, because we want to replace the frame
     with sections, which do not belong inside a text:p.

     Output, in order:
       1. an empty copy of the paragraph using the "Placeholder" variant of
          its style;
       2. for each child element, in document order: a text:section for each
          stripped frame, and a fresh copy of the paragraph element wrapping
          each run of contiguous non-stripped children.

     NOTE: only element children are walked; text nodes that are direct
     children of the paragraph are not carried over. -->
<xsl:template match="text:p[draw:frame/draw:text-box] |
                     text:h[draw:frame/draw:text-box]">
  <xsl:comment>Removed outer <xsl:value-of select="name()"/></xsl:comment>
  <!-- We keep an empty copy with an altered version of its style so the side
       effects of the style are applied. -->
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <!-- override the style name copied above with its placeholder variant -->
    <xsl:for-each select="@text:style-name">
      <xsl:attribute name="{name(.)}">
        <xsl:value-of select="concat('Placeholder', .)"/>
      </xsl:attribute>
    </xsl:for-each>
  </xsl:copy>
  <xsl:variable name="self" select="."/>
  <!-- walk through the children and pack them inside their own wrapping
       element if appropriate. We wrap contiguous non-frame children together. -->
  <xsl:for-each select="*">
    <!-- 1-based position of this child among its element siblings -->
    <xsl:variable name="item-pos" select="count(preceding-sibling::*) + 1"/>
    <xsl:choose>
      <!-- child is a draw:frame we should strip -->
      <xsl:when test="self::draw:frame[draw:text-box]">
        <xsl:comment>draw:frame</xsl:comment>
        <xsl:comment>draw:text-box</xsl:comment>
        <xsl:element name="text:section">
          <xsl:attribute name="text:name">
            <xsl:value-of select="concat('PlaceholderSection', generate-id())" />
          </xsl:attribute>
          <xsl:attribute name="text:style-name">
            <xsl:value-of select="concat('PlaceholderSection', generate-id())" />
          </xsl:attribute>
          <xsl:apply-templates select="./draw:text-box/*"/>
        </xsl:element>
      </xsl:when>
      <!-- We group everything we can in one single extra element, to avoid
           creating many duplicate wrappers for sibling elements. To do so,
           we match children that start a run: the very first child, or a
           child whose immediately preceding sibling is a stripped frame.
           We then add the outer element back and put this child and its
           following non-stripped siblings inside.
           The test looks at preceding-sibling::*[1] on purpose: indexing
           draw:frame siblings by the child position would pick the N-th
           frame rather than the sibling at position N, which loses or
           duplicates children as soon as frames and other elements are
           interleaved. -->
      <!-- TODO: maybe come up with a selector for the outer for-each that
           avoids the need for matching here? -->
      <xsl:when test="$item-pos = 1 or preceding-sibling::*[1][self::draw:frame[draw:text-box]]">
        <xsl:comment>Re-added <xsl:value-of select="name($self)" /> wrapping element</xsl:comment>
        <!-- FIXME: there's probably a better way to do that.
             Here in order to change the context (and because XSLT 1.0 doesn't have
             the "select" attribute on xsl:copy), we use a xsl:for-each that matches
             a single element in order to switch context -->
        <xsl:for-each select="$self">
          <xsl:copy>
            <!-- We DO NOT copy the style attribute verbatim here, because it might
                 have extra side effects (even crash LO when using master-page-name
                 style attributes, hehe) -->
            <!-- TODO: add the stripped element's attributes here (?) -->
            <!-- copy the mandatory text:outline-level attribute of text:h elements -->
            <xsl:apply-templates select="@text:outline-level"/>
            <!-- copy all the non-stripped contiguous siblings: those at or
                 after the starting position, with no stripped frame between
                 the starting position and themselves -->
            <xsl:apply-templates select="*[not(self::draw:frame[draw:text-box]) and
                                           count(preceding-sibling::*) + 1 >= $item-pos and
                                           not(preceding-sibling::draw:frame[count(preceding-sibling::*) + 1 > $item-pos][draw:text-box])]"/>
          </xsl:copy>
        </xsl:for-each>
      </xsl:when>
    </xsl:choose>
  </xsl:for-each>
</xsl:template>
<!-- Detect frames we can't handle: a draw:frame that has at least one
     draw:text-box child together with at least one child element of another
     type. The problem is reported both on the message channel and as a
     comment in the output, then the frame goes through the regular
     copy-and-recurse processing. -->
<xsl:template match="draw:frame[draw:text-box and *[not(self::draw:text-box)]]">
  <xsl:variable name="warning">Unsupported input with a frame combining children of different types</xsl:variable>
  <xsl:message><xsl:value-of select="$warning"/></xsl:message>
  <xsl:comment><xsl:value-of select="$warning"/></xsl:comment>
  <xsl:call-template name="copy-and-recurse"/>
</xsl:template>
<!-- Just catch cases we don't support yet but likely should, if any: an
     element other than text:p or text:h that directly contains a draw:frame
     whose children are all draw:text-box elements.
     The frame must have at least one draw:text-box child: a frame with no
     child at all also satisfies "as many text boxes as children" (0 = 0) and
     would otherwise be reported as having draw:text-box children.
     The problem is reported both on the message channel and as a comment in
     the output, then the element goes through the regular copy-and-recurse
     processing. -->
<xsl:template match="*[not(self::text:p) and not(self::text:h)][draw:frame[draw:text-box][count(draw:text-box) = count(*)]]">
  <xsl:message>Unsupported input with a draw:frame not inside a text:p or text:h (but a <xsl:value-of select="name()"/>) but with draw:text-box children</xsl:message>
  <xsl:comment>Unsupported input with a draw:frame not inside a text:p or text:h (but a <xsl:value-of select="name()"/>) but with draw:text-box children</xsl:comment>
  <xsl:call-template name="copy-and-recurse"/>
</xsl:template>
<!-- Fallback template: matches any attribute or node that no more specific
     template handles, copies it and keeps applying templates to its
     attributes and children. It is also callable by name
     ("copy-and-recurse") from templates that only report something and then
     want the regular processing. -->
<xsl:template match="@* | node()" name="copy-and-recurse">
  <xsl:copy>
    <!-- attributes first, then child nodes -->
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
</xsl:stylesheet>