Commits

Andy Mikhailenko  committed 20585ee

Added basic text extractors for DjVu, ODF, DOCX, MS Word and MS Excel. None is connected yet (only images+OCR are supported at the moment).

  • Participants
  • Parent commits 04286a6

Comments (0)

Files changed (12)

File orgtool/ext/papers/commands.py

 from tool.cli import Style
 
 from .helpers import args_to_unicode, name_from_path, get_file_hash
-from .schema import Page
-from .ocr import image_to_text
+from .schema import UploadedDocumentPart as Page
+from .extractors.ocr import image_to_text
 
 
 IMAGE_FORMATS = 'JPEG', 'PNG', 'GIF'

File orgtool/ext/papers/extractors/__init__.py

+from .ocr import image_to_text
+from .pdf import PDFExtractor
+
+#extractors = {
+#    'jpg': ImageExtractor,
+#    'pdf': PDFExtractor,
+#}

File orgtool/ext/papers/extractors/dejavu.py

+# -*- coding: utf-8 -*-
+"""
+DjVu extraction
+===============
+
+:dependencies: python-djvulibre_ (which, in turn, requires pyrex)
+
+.. note::
+
+    Text extraction implies existence of a text layer in the DjVu file.
+    Images are *not* OCR'ed. See ocrodjvu_ for hints about how to add a text
+    layer to your DjVu files.
+
+.. _python-djvulibre: http://jwilk.net/software/python-djvulibre
+.. _ocrodjvu: http://jwilk.net/software/ocrodjvu
+
+"""
+# Inspired by the "djvu-dump-text" example from djvu source code
+import sys
+import djvu.decode
+
+
+class DejaVuExtractor(object):
+    """Extracts the embedded text layer from the DjVu file at `path`.
+
+    Thin OO wrapper around :func:`extract_text`; mirrors the interface of
+    the other extractor classes in this package.
+    """
+    def __init__(self, path):
+        self.path = path
+
+    def get_text(self):
+        return extract_text(self.path)
+
+
+def extract_text(path):
+    """Extracts text from a DjVu file.
+
+    :param path: path to the ``*.djvu`` file (string)
+    :returns: whatever :meth:`Context.process` returns for the document
+        (a single string with the extracted text).
+    """
+    context = Context()
+    pages = context.process(path)
+    return pages
+
+
+#--- helpers
+
+def collect_lines(sexpr, level=0):
+    # Recursively walks a DjVu text-layer S-expression and yields text lines.
+    # A list expression looks like (zone-type x1 y1 x2 y2 child...), so the
+    # first five elements are the zone type plus bounding box and the actual
+    # children start at index 5 — presumably why `sexpr[5:]` is used here;
+    # TODO confirm against the djvulibre text-layer spec.
+    # NOTE(review): `djvu.sexpr` is referenced but only `djvu.decode` is
+    # imported at module level — relies on a side-effect import; consider an
+    # explicit `import djvu.sexpr`.
+    # NOTE(review): `level` is threaded through recursion but never used.
+    if sexpr and isinstance(sexpr, djvu.sexpr.ListExpression):
+        for child in sexpr[5:]:
+            lines = list(collect_lines(child, level + 1))
+            yield ' '.join(lines)
+    else:
+        yield sexpr.value
+
+
+class Context(djvu.decode.Context):
+
+    def handle_message(self, message):
+        if isinstance(message, djvu.decode.ErrorMessage):
+            print >>sys.stderr, message
+            sys.exit(1)
+
+    def process(self, path):
+        document = self.new_document(djvu.decode.FileURI(path))
+        document.decoding_job.wait()
+        for page in document.pages:
+            lines = collect_lines(page.text.sexpr)
+            return '\n'.join(line for line in lines if line)

File orgtool/ext/papers/extractors/ms_openxml.py

+# -*- coding: utf-8 -*-
+"""
+MS Office OpenXML extraction
+============================
+
+:dependencies: openxmllib_
+
+.. _openxmllib: http://pypi.python.org/pypi/openxmllib
+
+"""
+import openxmllib
+
+
+class MSOpenXMLExtractor(object):
+    """Extracts indexable text from an MS Office OpenXML file (e.g. DOCX).
+
+    Mirrors the interface of the other extractor classes in this package.
+    """
+    def __init__(self, path):
+        self.doc = load(path)
+
+    def get_text(self):
+        return extract_text(self.doc)
+
+
+def load(self, path):
+    return openxmllib.openXmlDocument(path=path)   # or: file=stream
+
+def extract_text(doc):
+    """Returns space-separated list of unique words found in the various texts
+    of the document.
+
+    :param doc: an ``openxmllib`` document object as returned by :func:`load`
+
+    .. warning::
+
+        This is for indexing/search only.
+
+    """
+    # include_properties=False: index only the body text, not document
+    # metadata (author, title, etc.).
+    return doc.indexableText(include_properties=False)

File orgtool/ext/papers/extractors/msexcel.py

+# -*- coding: utf-8 -*-
+"""
+MS Excel extraction
+===================
+
+:dependencies: xlrd_
+
+.. note::
+
+    Known alternatives for xlrd_: pyExcelerator_, rbco.msexcel_.
+
+.. _xlrd: http://pypi.python.org/pypi/xlrd
+.. _pyExcelerator: http://pypi.python.org/pypi/pyExcelerator
+.. _rbco.msexcel: http://pypi.python.org/pypi/rbco.msexcel
+
+"""
+import csv
+import xlrd
+from StringIO import StringIO
+
+
+class MSExcelExtractor(object):
+    """Extracts cell contents from an MS Excel workbook for indexing.
+
+    Mirrors the interface of the other extractor classes in this package.
+    """
+    def __init__(self, path):
+        self.path = path
+
+    def get_text(self):
+        return load(self.path)
+
+def load(path):
+    """Returns all cells from all sheets of the workbook as pseudo-CSV.
+
+    .. note::
+
+        The rows are delimited by ``\\n`` and the values by a single space
+        (the code joins with ``' '``, not ``\\t``), and no escaping is done,
+        so the output is not guaranteed to be valid CSV. This is done
+        intentionally as the standard `csv` module lacks direct support for
+        Unicode, so proper handling of Unicode would increase the volume of
+        code without adding to the readability and indexability of the output.
+
+    """
+    book = xlrd.open_workbook(path)
+    def get_rows():
+        for sheet in book.sheets():
+            for row_idx in xrange(sheet.nrows):
+                # Coerce every cell value (numbers, dates, ...) to unicode.
+                yield ' '.join(unicode(x) for x in sheet.row_values(row_idx))
+    return '\n'.join(get_rows())
+
+#def load_as_csv(path):
+#    """Returns valid CSV. Yes, it's overkill for indexing purposes.
+#    Does *not* support Unicode (see http://docs.python.org/library/csv.html).
+#    """
+#    book = xlrd.open_workbook(path)
+#    stream = StringIO()
+#    writer = csv.writer(stream, delimiter='\t', lineterminator='\n')
+#    for sheet in book.sheets():
+#        for row_idx in xrange(sheet.nrows):
+#            values = sheet.row_values(row_idx)
+#            writer.writerow(values)
+#    return stream.read()

File orgtool/ext/papers/extractors/msword.py

+# -*- coding: utf-8 -*-
+"""
+MS Word extraction
+==================
+
+:dependencies: antiword_
+
+.. note::
+
+    Known alternatives for Antiword: catdoc_ and wv_.
+
+.. _antiword: http://winfield.demon.nl
+.. _catdoc: http://wagner.pp.ru/~vitus/software/catdoc/
+.. _wv: http://wvware.sourceforge.net
+
+"""
+import subprocess
+
+
+class MSWordExtractor(object):
+    def __init__(self, path):
+        self.path = path
+
+    def get_text(self):
+        proc = Popen(['antiword', '-w', self.path], stdout=subprocess.PIPE)
+        text = proc.read()
+        return text.decode('utf-8')

File orgtool/ext/papers/extractors/ocr.py

+# -*- coding: utf-8 -*-
+# TODO: maybe make an abstraction layer for OCRs
+# see also: https://github.com/hoffstaetter/python-tesseract
+
+import os
+import subprocess
+
+from orgtool.ext.papers.helpers import get_thumbnail
+
+
+OUT_FILENAME = __name__+'auto-cuneiform.txt'
+
+def image_to_text(path, language='eng', fax=False, dotmatrix=False):
+    """Parses image using an OCR utility and returns resulting plain text.
+
+    :param path: path to file (string)
+    :param language: language name (e.g. "English" or "eng" but *not* "en")
+
+    Usage::
+
+        text = image_to_text('scan0015.tiff', language='Russian')
+
+    """
+    if not os.path.exists(path):
+        raise ValueError('File {0} does not exist.'.format(path))
+    args = ['cuneiform', '-o', OUT_FILENAME, path]
+    if language:
+        args += ['-l', language[:3].lower()]
+    if fax:
+        args += ['--fax']
+    if dotmatrix:
+        args += ['--dotmatrix']
+    process = subprocess.Popen(args)
+    return_code = process.wait()
+    if return_code:
+        raise RuntimeError('OCR parsing failed.')
+    f = open(OUT_FILENAME)
+    result = f.read().strip()
+    result = result.decode('utf-8', 'replace')
+    f.close()
+    os.unlink(OUT_FILENAME)
+    return result
+
+
+class OCRExtractor(object):
+    """Extracts text from a raster image by running it through OCR.
+
+    Mirrors the interface of the other extractor classes in this package.
+    """
+    def __init__(self, path):
+        self.path = path
+
+    def get_text(self, **kwargs):
+        # Extra keyword arguments (language, fax, dotmatrix) are passed
+        # straight through to :func:`image_to_text`.
+        return image_to_text(self.path, **kwargs)
+
+    def get_thumbnail(self, width, height):
+        raise NotImplementedError
+        #return get_thumbnail(image,

File orgtool/ext/papers/extractors/odf_plugin.py

+# -*- coding: utf-8 -*-
+"""
+OpenDocument extraction
+=======================
+
+:dependencies: odfpy_
+
+.. _odfpy: http://odfpy.forge.osor.eu
+
+"""
+import odf.opendocument as opendocument
+import odf.text
+
+
+class ODFExtractor(object):
+    """Extracts plain text from an OpenDocument (ODF) file.
+
+    Mirrors the interface of the other extractor classes in this package.
+    """
+    def __init__(self, path):
+        self.doc = load(path)
+
+    def get_text(self):
+        return extract_text(self.doc)
+
+
+def load(self, path):
+    return opendocument.load(path)
+
+def extract_text(doc):
+    "Extracts text from given ODFDocument instance"
+    pararaphs = doc.getElementsByType(odf.text.P)
+    lines = collect_lines(pararaphs)
+    return '\n'.join(lines)
+
+def collect_lines(nodes):
+    """Recursively yields the text (``data``) of every leaf node in `nodes`."""
+    for node in nodes:
+        if node.hasChildNodes():
+            # Descend: in the ODF DOM, text lives only in the leaves.
+            for line in collect_lines(node.childNodes):
+                yield line
+        else:
+            # Only text nodes carry a `data` attribute; skip the rest.
+            if hasattr(node, 'data'):
+                yield node.data

File orgtool/ext/papers/extractors/pdf.py

+# -*- coding: utf-8 -*-
+"""Extraction of plain text from PDF files.
+
+:dependencies: PDFMiner_, ImageMagick_ (no Python bindings)
+
+.. _PDFMiner: http://www.unixuser.org/~euske/python/pdfminer/
+.. _ImageMagick:
+
+Based on:
+
+* http://unixuser.org/~euske/python/pdfminer/programming.html
+* http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to-text
+
+"""
+from StringIO import StringIO
+from pdfminer.pdfparser import PDFParser, PDFDocument
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.converter import TextConverter
+
+
+class PDFExtractor(object):
+
+    def __init__(self, path):
+        self.path = path
+
+    def get_text(self, password='', encoding='utf-8'):
+        # Open a PDF file.
+        fp = open(self.path, 'rb')
+        # Create a PDF parser object associated with the file object.
+        parser = PDFParser(fp)
+        # Create a PDF document object that stores the document structure.
+        doc = PDFDocument()
+        # Connect the parser and document objects.
+        parser.set_document(doc)
+        doc.set_parser(parser)
+        # Supply the password for initialization.
+        # (If no password is set, give an empty string.)
+        doc.initialize(password)
+        # Check if the document allows text extraction. If not, abort.
+        if not doc.is_extractable:
+            raise RuntimeError('PDF text extraction is not allowed')
+        # Create a PDF resource manager object that stores shared resources.
+        rsrcmgr = PDFResourceManager()
+        # Create a PDF device object.
+        outfp = StringIO()
+        device = TextConverter(rsrcmgr, outfp, codec=encoding)
+        # Create a PDF interpreter object.
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        # Process each page contained in the document.
+        for page in doc.get_pages():
+            interpreter.process_page(page)
+        return outfp.getvalue()
+
+    def make_thumbnail(self):
+        # TODO: convert -thumbnail {width}x{height} foo.pdf[0] foo.jpg
+        raise NotImplementedError
+

File orgtool/ext/papers/ocr.py

-# -*- coding: utf-8 -*-
-# TODO: maybe make an abstraction layer for OCRs
-# see also: https://github.com/hoffstaetter/python-tesseract
-
-import os
-import subprocess
-
-
-OUT_FILENAME = __name__+'auto-cuneiform.txt'
-
-def image_to_text(path, language='eng', fax=False, dotmatrix=False):
-    """Parses image using an OCR utility and returns resulting plain text.
-
-    :param path: path to file (string)
-    :param language: language name (e.g. "English" or "eng" but *not* "en")
-
-    Usage::
-
-        text = image_to_text('scan0015.tiff', language='Russian')
-
-    """
-    if not os.path.exists(path):
-        raise ValueError('File {0} does not exist.'.format(path))
-    args = ['cuneiform', '-o', OUT_FILENAME, path]
-    if language:
-        args += ['-l', language[:3].lower()]
-    if fax:
-        args += ['--fax']
-    if dotmatrix:
-        args += ['--dotmatrix']
-    process = subprocess.Popen(args)
-    return_code = process.wait()
-    if return_code:
-        raise RuntimeError('OCR parsing failed.')
-    f = open(OUT_FILENAME)
-    result = f.read().strip()
-    result = result.decode('utf-8', 'replace')
-    f.close()
-    os.unlink(OUT_FILENAME)
-    return result

File orgtool/ext/papers/schema.py

 from .helpers import get_image_base_path
 
 
-class Page(TrackedDocument):
+class UploadedDocumentPart(TrackedDocument):
     "Scanned paper (single page)"
     summary = Field(unicode, required=True)
     details = Field(unicode)
     image = ImageField(get_image_base_path, required=True)
+    #attachment = FileField(get_image_base_path, required=True)
     language = Field(unicode)  # "en", "cze", etc. — mainly for OCR (CuneiForm)
     source_fingerprint = Field(str)  # ETL: not necessarily current file
 
         return u'{0}{1}'.format(ext.env['image_base_url'], self.image.path)
 
 
-class Paper(TrackedDocument):
+class UploadedDocument(TrackedDocument):
     "Scanned document (multiple pages)"
     summary = Field(unicode, required=True)
     details = Field(unicode)
-    pages = Many(Page)
+    pages = Many(UploadedDocumentPart)
+    # TODO:
+    #author = ...
+    #owner = ...
+    #location = ...
+    #date = ...
+

File orgtool/ext/papers/views.py

 from tool.ext.breadcrumbs import entitled
 
 from .helpers import get_thumbnail
-from .schema import Page
+from .schema import UploadedDocumentPart as Page
 
 
 @url('/')