Commits

Anonymous committed c1b8a74

Starting with v1.0

  • Participants
  • Tags v1.0

Comments (0)

Files changed (1)

+# -*- coding: utf-8 -*-
+'''
+TODO:
+-----
+1. [X] It fails when the document doesn't contain text. (Nick Dokos)
+   To fail cleanly and advise that this is not OCR software.
+2. <text:p>   -> This is a typical paragraph.
+     tetxo01 jskdljsd
+     <text:s text:c="6">   -> This means 6 spaces
+    more text (this is a continuation)
+    <text:tab />     -> This is a tab
+   </text:p>
+'''
+
+ERROR = False
+import codecs
+import zipfile
+import os
+import string
+import re
+try:
+    from lxml import etree
+except ImportError:
+    print u'ERROR: install "nxml": http://codespeak.net/lxml/'
+    ERROR = True
+
+try:
+    import OleFileIO_PL as ole
+except ImportError:
+    print u'ERROR: please, install "OleFileIO_PL": http://www.decalage.info/files/OleFileIO_PL-0.18.zip'
+    # http://www.decalage.info/en/python/olefileio
+    ERROR = True
+
+class Ole:
+    def __init__(self,filename=None):
+        self.filename = filename
+        self._data = ole.OleFileIO(self.filename)
+        self.files = self.__getFiles__()
+        self._dict = { '.*AcroExch\.Document\.[0-9]+.*' : { 'ext':'.pdf','name':'CONTENTS'},
+                '.*Word\.Document\.[0-9]+.*' : { 'ext':'.doc','name': None},
+                '.*Excel\.Sheet\.[0-9]+.*'   : { 'ext':'.xls','name': None},
+                '.*PowerPoint\.Show\.[0-9]+.*'   : { 'ext':'.ppt','name': None},
+                       }
+    
+    def __guessFormat__(self):
+        _data = self.__getItem__(0)
+        #print _data
+        _KEY = None
+        for _key in self._dict.keys():
+            _kernel = re.compile(  _key )
+            _tmp = _kernel.findall( _data )
+            if len(_tmp) == 1:
+                _KEY = _key
+        return _KEY
+
+    def __getFiles__(self):
+        return self._data.listdir()
+    
+    def __getItem__(self,_idx):
+        _tmp = self._data.openstream( self.files[_idx] )
+        _data = _tmp.read()
+        return _data
+        
+    def __extractFile__(self,_idx):
+        # Leemos el fichero
+        _data = self.__getItem__(_idx)
+        _fp = open('filaneme.txt','wb')
+        _fp.write( _data )
+        _fp.close()
+    
+    def extractFile(self):
+        _key = self.__guessFormat__()
+        if _key == None:
+            _tmp = self._data.openstream( self.files[0] )
+            _kernel = re.compile('[a-zA-Z]+\.[a-zA-Z]+\.[0-9]+')
+            _tmp = _tmp.read()
+            print 'ERROR: It is not configured for: %s' % _kernel.findall( _tmp )[0]
+            #print self.files
+            #print _tmp
+            return None
+        else:
+            if self._dict[_key]['name'] != None:
+                _tmp = self._data.openstream( self._dict[_key]['name'] )
+                _file = os.path.splitext(self.filename)
+                _ext = self._dict[ _key ]['ext']
+                _filename = _file[0] + _ext
+                #_tmp = self.outfile +_ext
+                #_tmp = os.path.join( self.outdir, _tmp)
+                _fp = open( _filename, 'wb')
+                _fp.write( _tmp.read() )
+                _fp.close()
+    #            print self.filename
+                del(_tmp)
+                self._data.fp.close()
+                os.remove(self.filename)
+                return _filename
+            else:
+                # Símplemente se renombra
+                #print self.files
+                #print self.__getItem__(7)
+                _file = os.path.splitext(self.filename)
+                _ext = self._dict[ _key ]['ext']
+                _filename = _file[0] + _ext                
+                self._data.fp.close()
+                try:
+                    os.rename(self.filename,_filename)
+                except WindowsError:
+                    print 'WARNING: the file already exists: %s' % _filename
+                return _filename
+
+    def __close__(self):
+        self._data.fp.close()
+
+class FileError(Exception):
+    def __init__(self, 
+                 message,
+                 filename ):
+        self.filename = filename
+        self.message = message
+
+    def __str__(self):
+        return u'%s: %s' % ( self.message, self.filename )
+
+class ORGfile:
+    '''This class is used to generate an ORG file'''
+    def __init__(self,
+                 filename = None,
+                 overwrite = False):
+        self.filename = filename
+        self.isWarning = False
+        self.data = [u'-*- mode: org; coding: utf-8 -*-\n' ]
+    
+    def readfromlist(self, _list):
+        #print _list
+        if _list[0][1] == u'Title':
+            _start = 1
+            self.addHeading( level = 1,
+                             text = _list[0][2] )
+        else:
+            _start = 0
+            self.addHeading( level = 1,
+                             text = [u'%s' % os.path.split(self.filename)[1]] )
+            
+        for _i in _list[_start:]:
+            if _i[3]: # Is heading (a dictionary would be more readable)
+                self.addHeading( level = int(_i[0])+1,
+                                 text = _i[2] )
+            elif _i[4]: # Is heading (a dictionary would be more readable)
+                self.addParagraph( text = _i[2] )
+            elif _i[5] == 'table':
+                self.addTable( table = _i[2] )
+            elif _i[5] == 'enumeration':
+                self.addEnumeration( enumeration = _i[2] )
+                
+
+    def addHeading(self, 
+                   level = 1,
+                   text = [] ):
+        #print text
+        # A heading should only have one line.
+        if len( text ) > 1:
+            pass # TODO: juntar todas las líneas
+            # text.replace('\r\n',' ')    
+            #text.replace('\n',' ')
+
+        # Check level is valid
+        if level < 1:
+            text = [u'* [WARNING] ' + text[0]]
+            self.isWarning = True
+        else:
+            text = [u'*'*(level) + u' %s' % text[0]]
+
+        self.data = self.data + text
+
+    def addParagraph(self, 
+                     text = [] ):
+        self.data = self.data + text
+
+    def addTable(self, 
+                 table = []):
+        _list = []
+        if len(table) > 0:
+            # First line
+            _list = _list + [u'|-']
+            _list = _list + self.__addRow__( table[0] )
+            _list = _list + [u'|-']
+        if len(table) > 1:
+            for _row in table[1:]:
+                _list = _list + self.__addRow__( _row )
+            _list = _list + [u'|-']
+
+        self.data = self.data +  _list 
+
+    def __addRow__(self, _row):
+        '''Helper function to show rows'''
+        _list = []
+        _n = 0
+        _flag = True
+        while _flag:
+            _text = u'| '
+            for _cell in _row:
+                _flag = False
+                if len(_cell) > _n:
+                    _text = _text + u'%s |' % _cell[_n]
+                    _flag = True
+                else:
+                    _text = _text + u'   |'
+            if _flag:
+                _list.append( _text )
+            _n = _n + 1
+        return _list
+
+    def addEnumeration(self, 
+                       enumeration = []):
+        _lista = []
+        for _i in enumeration:
+            _lista = _lista + [u' ' * (_i[0]-1) + u'- ' + _i[1][0]]
+            if len(_i[1]) > 1:
+                for _j in _i[1:]:
+                    _lista = _lista + [ _j ]
+
+        self.data = self.data +  _lista
+
+    def clean(self):
+        pass
+
+    def export(self):
+       #try:
+           _tmp = u''
+           for _i in self.data:
+               _tmp = _tmp + u'%s\n' % _i
+           _fp = codecs.open( self.filename, 'w', 'utf-8')
+           _fp.write( _tmp )
+           _fp.close()
+       #except:
+       #    print u'ERROR: the file was processed, but it failed when writting it:\n   %s' % self.filename
+
+class ODTfile:
+    '''This object enables the access to the OpenOffice file'''
+    def __init__(self,
+                 filename = None,
+                 output = None,
+                 overwrite = False,
+                 original = False):
+        '''All the fields are mandatory. All the checks are done outside.
+        '''
+        self.isOverWriter = overwrite
+        self.isOriginal = original
+        if filename != None or output != None or overwrite != None:
+            self.filename = os.path.realpath( filename )
+            self.orgfile = ORGfile( filename = os.path.realpath( output ),
+                                    overwrite = overwrite)
+
+            # Prefix
+            _tmp = os.path.split( self.orgfile.filename )[1]
+            _tmp = _tmp.replace(' ','_') 
+            _tmp = _tmp.replace('.','_')
+            self.prefix = _tmp + '_'
+
+            # Files
+            self.files = self.__readAsZipFile__( )
+            self.xml = None
+            self.ns = { 'office' : "urn:oasis:names:tc:opendocument:xmlns:office:1.0", 
+                        'text'   : "urn:oasis:names:tc:opendocument:xmlns:text:1.0",
+                        'table'  : "urn:oasis:names:tc:opendocument:xmlns:table:1.0",
+                        'draw'   : "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
+                        'xlink'  : "http://www.w3.org/1999/xlink"}
+        else:
+            print '''ERROR - class OO: requires all input fields to be provided'''
+
+    def exportToORG(self):
+        if self.isOpenOfficeFile():
+            self.__processText__()
+            self.orgfile.export()
+        else:
+            print u'ERROR - OO.exportToORG: it is not an OpenOffice Document: %s' % self.filename
+
+    def __readAsZipFile__(self):
+        '''__readAsZipFile__: this function opens the .ODT as a ZIP file''' 
+        try:
+            _data = zipfile.ZipFile( self.filename, "r")
+            return _data
+        except: 
+            print 'ERROR in OO.__readAsZipFile: ODT files should be compressed as ZIP files: %s' % self.filename
+
+    def isOpenOfficeFile(self):
+        '''Verifies if this is an Open Office Document'''
+        # 1. Contiene el fichero "content.xml"
+        _isOK = False
+        if self.files.namelist().__contains__( 'content.xml'):
+
+            # 2. Leemos el fichero 'content.xml' y vemos que tiene el tag "office:document-content"
+            _data = self.files.read( 'content.xml' )
+            self.xml = etree.fromstring( _data )
+            _lista = self.xml.xpath('/office:document-content', 
+                                namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )
+            if len(_lista) == 1:
+                _isOK = True
+        return _isOK 
+
+    def __processText__(self):
+        _data = self.files.read('content.xml')
+        _dataxml = etree.fromstring( _data )
+
+        _body = _dataxml.xpath('/office:document-content/office:body', 
+                             namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )[0]
+
+        _xml = _body.xpath('office:text', 
+                             namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )[0]
+        _list = []
+        for _child in _xml.getchildren():
+            _tmp = self.getData( _child )
+            if _tmp != None:
+                _list = _list + [_tmp]
+
+        self.orgfile.readfromlist( _list )
+        
+
+    def __checkTag__(self,
+                     _xml,
+                     _field,
+                     mode=None):
+        if _xml.tag == '{%s}%s' % (self.ns[mode],_field):
+            return True
+        else:
+            return False
+
+    def getData(self, _child):
+        # Get style
+        try:
+            _style = _child.attrib['{%s}%s' % (self.ns['text'],'style-name')]
+        except KeyError:
+            _style = None
+        
+        # Get text
+        
+        _list = []
+        
+        # Gets the text from frames.
+        for _i in _child.getchildren():
+            if self.__checkTag__(_i,'frame',mode='draw'):
+                _name = _i.attrib['{%s}%s' % (self.ns['draw'],'name')]
+                if _name == None:
+                    _name = u'FrameName'
+                _tmp = self.__processFrame__(_i,_name)
+                _list = _list + ['%s' % _tmp ]
+
+        # Get level
+        try:
+            _level = _child.attrib['{%s}%s' % (self.ns['text'],'outline-level')]
+        except KeyError:
+            _level = None
+
+        # Is paragraph
+        if _child.tag == '{%s}%s' % (self.ns['text'],'p'):
+            _isParagraph = True
+            _list = _list + self.getParagraph( _child, mode = 'p' )
+            _mode = 'paragraph'
+        else:
+            _isParagraph = False
+
+        # Is heading
+        if _child.tag == '{%s}%s' % (self.ns['text'],'h'):
+            _list = _list + self.getParagraph( _child, mode = 'h' )
+            _isHeading = True
+            _mode = 'heading'
+        else:
+            _isHeading = False
+
+        # Enumerations
+        if _child.tag == '{%s}%s' % (self.ns['text'],'list'):
+            _isEnumeration = True
+            _mode = 'enumeration'
+            _list = self.__processEnumeration__( _child)
+        else:
+            _isEnumeration = False
+
+        # Table
+        if _child.tag == '{%s}%s' % (self.ns['table'],'table'):
+            _list = self.getTable( _child )
+            _mode = 'table'
+        # Declarations: at the beginning of the file
+        if _child.tag == '{%s}%s' % (self.ns['text'],'sequence-decls'):
+            return None
+        # level, style, list of sentences, isHeading, isPara, XML in case of error.
+        else:
+            _data = (_level, _style, _list, _isHeading, _isParagraph, _mode, etree.tostring(_child, pretty_print=True) )
+            return  _data
+
+    def getParagraph(self, _child, mode = 'p'):
+       '''It checks this is paragraph and recovers the text.'''
+       _list = []
+       if _child.tag == '{%s}%s' % (self.ns['text'], mode):
+          _text = u''
+          for _i in _child.iter():
+             if _i.tag == '{%s}%s' % (self.ns['text'], mode):
+                if _i.text != None:
+                   _text = _text + _i.text
+             elif _i.tag == '{%s}%s' % (self.ns['text'],'s'):
+                try:
+                   _text = _text +  ' ' * int(_i.attrib['{%s}%s' % (self.ns['text'],'c')])
+                except KeyError:
+                   pass # It is used not only to add spaces.
+             elif _i.tag == '{%s}%s' % (self.ns['text'],'tab'):             
+                _text = _text + '\t'
+             #else:
+             #   print "[WARNING] Currently getParagraph does not implement:\n    %s" % _i
+
+             if _i.tail != None:
+                _text = _text + _i.tail
+          if not self.isOriginal: # We clean the text.
+             _text = _text.replace('\t',' ').replace('\r',' ').replace('\n',' ').strip()
+             _text = _text.split(' ')
+             _text = [_i for _i in _text if _i != u'']
+             _text = string.join(_text, ' ')
+          return [_text]
+       else:
+          return []
+
+    def __processEnumeration__(self,
+                               _child):
+        '''Structure for Enumerations:
+        <text:list>
+           <text:list-item>
+                <text:p>
+        '''
+        _list = []
+        _level = 1
+        _levels = { _child : {'type' : 'list', 'level' : 1}}
+        for _i in _child.iterdescendants(): # We follow all the nested structure.
+            # list-item: mismo nivel que su padre.
+            if _i.tag == '{%s}%s' % (self.ns['text'],'list-item'):
+                _levels.update({ _i : {'type' : 'list-item',
+                                      'level' : _levels[ _i.getparent()]['level']}})
+            # list: si el padre era un list-item, se le suma uno.
+            if _i.tag == '{%s}%s' % (self.ns['text'],'list'):
+                _levels.update({ _i : {'type' : 'list',
+                                      'level' : _levels[ _i.getparent()]['level']+1}})
+                
+            if _i.tag == '{%s}%s' % (self.ns['text'],'p'):
+                _levels.update({ _i : {'type' : 'p',
+                                       'level' : _levels[ _i.getparent()]['level']}})
+                _n = _levels[ _i.getparent()]['level']
+                _tmp = []
+                for _j in _i.itertext():
+                    if _j != None:
+                        
+                        _tmp = _tmp + [u'%s' % _j]
+                        #_list = _list + [ (_n, _tmp )]
+                _list = _list + [ (_n, _tmp) ]
+        return _list
+    
+    def getTable(self,_xml):
+        #<table:table-row>
+        #  <table:table-cell>
+        _list = []
+        _rows = _xml.xpath('table:table-row',namespaces={'table':self.ns['table']})
+        for _row in _rows:
+            _cells = _row.xpath('table:table-cell',namespaces={'table':self.ns['table']})
+            _row = []
+            for _c in _cells:
+                _tmp = []
+                for _i in _c.getchildren():
+                    if self.__checkTag__(_i, 'p',mode='text'):
+                        _tmp1 = []
+                        for _j in _i.itertext():
+                            if _j != None:
+                                _tmp1 = _tmp1 + [ u'%s' % _j ]
+                        _tmp = _tmp + _tmp1
+
+                    elif self.__checkTag__(_i, 'list',mode='text'):
+                        _tmp1 = self.__processList__( _i )
+                        _tmp = _tmp + _tmp1
+                _row.append( _tmp )
+            _list.append(_row)
+        return _list
+
+    def __processFrame__(self,_xml,_name):
+        _output = u''
+        #print 'ok'
+        for _i in _xml.getchildren():
+            if self.__checkTag__(_i,'object-ole',mode='draw'):
+                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
+                _newfilename = self.__fileExtractor__( _filename )
+                _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)
+            elif self.__checkTag__(_i,'image',mode='draw'):
+                # No hacer nada si la imagen es un Object Replacement
+                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
+                if not _filename.__contains__('ObjectReplacements/'):
+                    _newfilename = self.__fileExtractor__( _filename )
+                    _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)          
+        return _output
+    
+    def __fileExtractor__(self,_filename):
+        '''Extract the file an assigns a proper filename to it'''
+        # All paths are relatives. We take away the beginning.
+        if _filename[0:2] == './':
+            _tmp = _filename[2:]
+        else:
+            _tmp = _filename
+        
+        _outdir = os.path.split( self.orgfile.filename)[0]
+
+        # The file is extracted.
+        try:
+           self.files.extract(_tmp, _outdir)
+        except AttributeError:
+           print "ERROR: ZipFile provides 'extract' function in Python 2.6." 
+           print "       Please, consider updating to PYTHON v2.6"
+           raise AttributeError("Zipfile instance has no attribute 'extract'")
+        _extractedfile = os.path.join(_outdir,_filename)
+
+        # Is an OLE object.
+        _toRename = False
+        _ole = None
+        if ole.isOleFile(_extractedfile):
+            _ole = Ole( filename = _extractedfile )
+            _new = _ole.extractFile()
+            _ole.__close__()
+            _extractedfile = _new
+            _toRename = True
+        
+        if _toRename:
+            _extractedfile = os.path.realpath(_extractedfile)
+            _tmp = os.path.split(_new)
+            _tmpname = _tmp[1]
+            _tmpname = _tmpname.replace(' ','_')
+            _new = os.path.join(_tmp[0],_tmpname )
+        
+            if self.isOverWriter:
+                os.remove( _new )
+            try:
+                os.rename(_extractedfile,_new)
+            except WindowsError:
+                print 'ERROR: file already exists: %s' % _new
+            _extractedfile = _new
+
+        # Los ficheros se extraen a sus rutas originales
+        # Crea directorios si fuera necesario.
+        # - Movemos los ficheros al mismo directorio que el .org.
+        # - Usamos como prefijo el nombre del fichero.
+        _tmp = os.path.split( _extractedfile )
+        _newname = self.prefix + _tmp[1]
+        _fullnewname = os.path.join(_outdir,_newname)
+        
+        if os.path.isfile( _fullnewname ) and self.isOverWriter:
+            os.remove( _fullnewname )  # The preexisting file is removed.
+        elif os.path.isfile( _fullnewname ) and not self.isOverWriter:
+            print 'WARNING - The file already exists.'
+            print '  - FILENAME: %s' % _fullnewname
+            print '  - Keeping both files'
+            print '  - Referencing to the old one'
+        else:
+            pass
+        
+        # We try to move the file changing its name. 
+        try:
+            os.rename(_extractedfile,_fullnewname)
+            return _newname
+        except:   
+            return _filename
+
+if __name__ == '__main__':
+    if not ERROR:
+        import optparse
+        _usage = u'''ODT2ORG: This software converts <filename.odt> (Open Office) into <filename.org> (emacs org-mode).
+
+   General usage: <python_with_path> %prog [<options>] <input_file> [<output_file>]
+
+Some examples:
+- Convert "filename.odt" into "outputfile.org":
+   python.exe odt2org.py filename.odt outputfile.org
+
+- Covert  "filename.odt" into "filename.org"
+   python odt2org.py filename.odt
+
+- Using paths:
+   C:\\python26\\python.exe C:\\odt2org\\odt2org.py C:\\MyFiles\\filename.odt
+
+- Forcing overwriting:
+   python odt2org.py -f filename.odt
+
+'''
+        _parser = optparse.OptionParser(usage = _usage, 
+                                        version = "%prog 1.0" )
+
+#        _parser.add_option( "-i", 
+#                           "--inputfile", 
+#                           action="store",
+#                           dest="inputfile",
+#                           help="FILE is an .odt (open office) to be read",
+#                           metavar= "FILE")
+        _parser.add_option( "-f", 
+                           "--force", 
+                           action = "store_true",
+                           dest = "overwrite",
+                           help = "overwrite the output file if it exists")
+        _parser.add_option( "-o", 
+                           "--original", 
+                           action = "store_true",
+                           dest = "original",
+                           help = "it shows spaces/tabs/returns as in the original file")
+
+        (_options, _args) = _parser.parse_args()
+
+        _isOK = True
+        if len(_args) > 0:
+            _inputfile = os.path.realpath( _args[0] )
+        else:
+            _isOK = False
+        
+        if not _isOK:
+            print '''Get the help by writing: 
+
+   python odt2org.py -h
+
+'''
+        
+        if not os.path.isfile( _inputfile):
+            print u'ERROR: the input file does not exist: %s' % _inputfile
+            _isOK = False
+
+        # If the outputfile is not provided, it is used one based in the original.
+        if len( _args ) > 1:
+            _outputfile = _args[1]
+            _outputfile = os.path.realpath( _outputfile )
+        else:
+            _tmp = os.path.splitext( _inputfile )[0]
+            _outputfile = os.path.realpath( _tmp + '.org' )
+            
+        # Check if output directory is valid.
+        if not os.path.isdir( os.path.split(_outputfile)[0] ):
+            print 'ERROR: output dir does not exists: %s' % os.path.split(_outputfile)[0]
+            _isOK = False
+
+        # Overwritting
+        if not _options.overwrite and os.path.isfile( _outputfile ):
+            print u'ERROR: output file already exist: %s' % _outputfile
+            print u'       Use -f or --force to override'
+            _isOK = False
+
+        # We have proper inputfile, outputfile and overwrite option.
+        if _isOK:
+            _odtfile = ODTfile(filename = _inputfile,
+                               output = _outputfile,
+                               overwrite = _options.overwrite,
+                               original = _options.original)
+            _odtfile.exportToORG()
+