Commits

Jose Maria Garcia  committed 591c51b

Refactoring

  • Participants
  • Parent commits 9f424c0

Comments (0)

Files changed (3)

 '''
 
 ERROR = False
-import codecs
-import zipfile
+#import codecs
+#import zipfile
 import os
-import string
-import re
-try:
-    from lxml import etree
-except ImportError:
-    print u'ERROR: install "nxml": http://codespeak.net/lxml/'
-    ERROR = True
-
-try:
-    import OleFileIO_PL as ole
-except ImportError:
-    print u'ERROR: please, install "OleFileIO_PL": http://www.decalage.info/files/OleFileIO_PL-0.18.zip'
-    # http://www.decalage.info/en/python/olefileio
-    ERROR = True
-
-class Ole:
-    def __init__(self,filename=None):
-        self.filename = filename
-        self._data = ole.OleFileIO(self.filename)
-        self.files = self.__getFiles__()
-        self._dict = { '.*AcroExch\.Document\.[0-9]+.*' : { 'ext':'.pdf','name':'CONTENTS'},
-                '.*Word\.Document\.[0-9]+.*' : { 'ext':'.doc','name': None},
-                '.*Excel\.Sheet\.[0-9]+.*'   : { 'ext':'.xls','name': None},
-                '.*PowerPoint\.Show\.[0-9]+.*'   : { 'ext':'.ppt','name': None},
-                       }
-    
-    def __guessFormat__(self):
-        _data = self.__getItem__(0)
-        #print _data
-        _KEY = None
-        for _key in self._dict.keys():
-            _kernel = re.compile(  _key )
-            _tmp = _kernel.findall( _data )
-            if len(_tmp) == 1:
-                _KEY = _key
-        return _KEY
-
-    def __getFiles__(self):
-        return self._data.listdir()
-    
-    def __getItem__(self,_idx):
-        _tmp = self._data.openstream( self.files[_idx] )
-        _data = _tmp.read()
-        return _data
-        
-    def __extractFile__(self,_idx):
-        # Leemos el fichero
-        _data = self.__getItem__(_idx)
-        _fp = open('filaneme.txt','wb')
-        _fp.write( _data )
-        _fp.close()
-    
-    def extractFile(self):
-        _key = self.__guessFormat__()
-        if _key == None:
-            _tmp = self._data.openstream( self.files[0] )
-            _kernel = re.compile('[a-zA-Z]+\.[a-zA-Z]+\.[0-9]+')
-            _tmp = _tmp.read()
-            print 'ERROR: It is not configured for: %s' % _kernel.findall( _tmp )[0]
-            #print self.files
-            #print _tmp
-            return None
-        else:
-            if self._dict[_key]['name'] != None:
-                _tmp = self._data.openstream( self._dict[_key]['name'] )
-                _file = os.path.splitext(self.filename)
-                _ext = self._dict[ _key ]['ext']
-                _filename = _file[0] + _ext
-                #_tmp = self.outfile +_ext
-                #_tmp = os.path.join( self.outdir, _tmp)
-                _fp = open( _filename, 'wb')
-                _fp.write( _tmp.read() )
-                _fp.close()
-    #            print self.filename
-                del(_tmp)
-                self._data.fp.close()
-                os.remove(self.filename)
-                return _filename
-            else:
-                # Símplemente se renombra
-                #print self.files
-                #print self.__getItem__(7)
-                _file = os.path.splitext(self.filename)
-                _ext = self._dict[ _key ]['ext']
-                _filename = _file[0] + _ext                
-                self._data.fp.close()
-                try:
-                    os.rename(self.filename,_filename)
-                except WindowsError:
-                    print 'WARNING: the file already exists: %s' % _filename
-                return _filename
-
-    def __close__(self):
-        self._data.fp.close()
-
-class FileError(Exception):
-    def __init__(self, 
-                 message,
-                 filename ):
-        self.filename = filename
-        self.message = message
-
-    def __str__(self):
-        return u'%s: %s' % ( self.message, self.filename )
-
-class ORGfile:
-    '''This class is used to generate an ORG file'''
-    def __init__(self,
-                 filename = None,
-                 overwrite = False):
-        self.filename = filename
-        self.isWarning = False
-        self.data = [u'-*- mode: org; coding: utf-8 -*-\n' ]
-    
-    def readfromlist(self, _list):
-        #print _list
-        if _list[0][1] == u'Title':
-            _start = 1
-            self.addHeading( level = 1,
-                             text = _list[0][2] )
-        else:
-            _start = 0
-            self.addHeading( level = 1,
-                             text = [u'%s' % os.path.split(self.filename)[1]] )
-            
-        for _i in _list[_start:]:
-            if _i[3]: # Is heading (a dictionary would be more readable)
-                self.addHeading( level = int(_i[0])+1,
-                                 text = _i[2] )
-            elif _i[4]: # Is heading (a dictionary would be more readable)
-                self.addParagraph( text = _i[2] )
-            elif _i[5] == 'table':
-                self.addTable( table = _i[2] )
-            elif _i[5] == 'enumeration':
-                self.addEnumeration( enumeration = _i[2] )
-                
-
-    def addHeading(self, 
-                   level = 1,
-                   text = [] ):
-        #print text
-        # A heading should only have one line.
-        if len( text ) > 1:
-            pass # TODO: juntar todas las líneas
-            # text.replace('\r\n',' ')    
-            #text.replace('\n',' ')
-
-        # Check level is valid
-        if level < 1:
-            text = [u'* [WARNING] ' + text[0]]
-            self.isWarning = True
-        else:
-            text = [u'*'*(level) + u' %s' % text[0]]
-
-        self.data = self.data + text
-
-    def addParagraph(self, 
-                     text = [] ):
-        self.data = self.data + text
-
-    def addTable(self, 
-                 table = []):
-        _list = []
-        if len(table) > 0:
-            # First line
-            _list = _list + [u'|-']
-            _list = _list + self.__addRow__( table[0] )
-            _list = _list + [u'|-']
-        if len(table) > 1:
-            for _row in table[1:]:
-                _list = _list + self.__addRow__( _row )
-            _list = _list + [u'|-']
-
-        self.data = self.data +  _list 
-
-    def __addRow__(self, _row):
-        '''Helper function to show rows'''
-        _list = []
-        _n = 0
-        _flag = True
-        while _flag:
-            _text = u'| '
-            for _cell in _row:
-                _flag = False
-                if len(_cell) > _n:
-                    _text = _text + u'%s |' % _cell[_n]
-                    _flag = True
-                else:
-                    _text = _text + u'   |'
-            if _flag:
-                _list.append( _text )
-            _n = _n + 1
-        return _list
-
-    def addEnumeration(self, 
-                       enumeration = []):
-        _lista = []
-        for _i in enumeration:
-            _lista = _lista + [u' ' * (_i[0]-1) + u'- ' + _i[1][0]]
-            if len(_i[1]) > 1:
-                for _j in _i[1:]:
-                    _lista = _lista + [ _j ]
-
-        self.data = self.data +  _lista
-
-    def clean(self):
-        pass
-
-    def export(self):
-       #try:
-           _tmp = u''
-           for _i in self.data:
-               _tmp = _tmp + u'%s\n' % _i
-           _fp = codecs.open( self.filename, 'w', 'utf-8')
-           _fp.write( _tmp )
-           _fp.close()
-       #except:
-       #    print u'ERROR: the file was processed, but it failed when writting it:\n   %s' % self.filename
-
-class ODTfile:
-    '''This object enables the access to the OpenOffice file'''
-    def __init__(self,
-                 filename = None,
-                 output = None,
-                 overwrite = False,
-                 original = False):
-        '''All the fields are mandatory. All the checks are done outside.
-        '''
-        self.isOverWriter = overwrite
-        self.isOriginal = original
-        if filename != None or output != None or overwrite != None:
-            self.filename = os.path.realpath( filename )
-            self.orgfile = ORGfile( filename = os.path.realpath( output ),
-                                    overwrite = overwrite)
-
-            # Prefix
-            _tmp = os.path.split( self.orgfile.filename )[1]
-            _tmp = _tmp.replace(' ','_') 
-            _tmp = _tmp.replace('.','_')
-            self.prefix = _tmp + '_'
-
-            # Files
-            self.files = self.__readAsZipFile__( )
-            self.xml = None
-            self.ns = { 'office' : "urn:oasis:names:tc:opendocument:xmlns:office:1.0", 
-                        'text'   : "urn:oasis:names:tc:opendocument:xmlns:text:1.0",
-                        'table'  : "urn:oasis:names:tc:opendocument:xmlns:table:1.0",
-                        'draw'   : "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
-                        'xlink'  : "http://www.w3.org/1999/xlink"}
-        else:
-            print '''ERROR - class OO: requires all input fields to be provided'''
-
-    def exportToORG(self):
-        if self.isOpenOfficeFile():
-            self.__processText__()
-            self.orgfile.export()
-        else:
-            print u'ERROR - OO.exportToORG: it is not an OpenOffice Document: %s' % self.filename
-
-    def __readAsZipFile__(self):
-        '''__readAsZipFile__: this function opens the .ODT as a ZIP file''' 
-        try:
-            _data = zipfile.ZipFile( self.filename, "r")
-            return _data
-        except: 
-            print 'ERROR in OO.__readAsZipFile: ODT files should be compressed as ZIP files: %s' % self.filename
-
-    def isOpenOfficeFile(self):
-        '''Verifies if this is an Open Office Document'''
-        # 1. Contiene el fichero "content.xml"
-        _isOK = False
-        if self.files.namelist().__contains__( 'content.xml'):
-
-            # 2. Leemos el fichero 'content.xml' y vemos que tiene el tag "office:document-content"
-            _data = self.files.read( 'content.xml' )
-            self.xml = etree.fromstring( _data )
-            _lista = self.xml.xpath('/office:document-content', 
-                                namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )
-            if len(_lista) == 1:
-                _isOK = True
-        return _isOK 
-
-    def __processText__(self):
-        _data = self.files.read('content.xml')
-        _dataxml = etree.fromstring( _data )
-
-        _body = _dataxml.xpath('/office:document-content/office:body', 
-                             namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )[0]
-
-        _xml = _body.xpath('office:text', 
-                             namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )[0]
-        _list = []
-        for _child in _xml.getchildren():
-            _tmp = self.getData( _child )
-            if _tmp != None:
-                _list = _list + [_tmp]
-
-        self.orgfile.readfromlist( _list )
-        
-
-    def __checkTag__(self,
-                     _xml,
-                     _field,
-                     mode=None):
-        if _xml.tag == '{%s}%s' % (self.ns[mode],_field):
-            return True
-        else:
-            return False
-
-    def getData(self, _child):
-        # Get style
-        try:
-            _style = _child.attrib['{%s}%s' % (self.ns['text'],'style-name')]
-        except KeyError:
-            _style = None
-        
-        # Get text
-        
-        _list = []
-        
-        # Gets the text from frames.
-        for _i in _child.getchildren():
-            if self.__checkTag__(_i,'frame',mode='draw'):
-                _name = _i.attrib['{%s}%s' % (self.ns['draw'],'name')]
-                if _name == None:
-                    _name = u'FrameName'
-                _tmp = self.__processFrame__(_i,_name)
-                _list = _list + ['%s' % _tmp ]
-
-        # Get level
-        try:
-            _level = _child.attrib['{%s}%s' % (self.ns['text'],'outline-level')]
-        except KeyError:
-            _level = None
-
-        # Is paragraph
-        if _child.tag == '{%s}%s' % (self.ns['text'],'p'):
-            _isParagraph = True
-            _list = _list + self.getParagraph( _child, mode = 'p' )
-            _mode = 'paragraph'
-        else:
-            _isParagraph = False
-
-        # Is heading
-        if _child.tag == '{%s}%s' % (self.ns['text'],'h'):
-            _list = _list + self.getParagraph( _child, mode = 'h' )
-            _isHeading = True
-            _mode = 'heading'
-        else:
-            _isHeading = False
-
-        # Enumerations
-        if _child.tag == '{%s}%s' % (self.ns['text'],'list'):
-            _isEnumeration = True
-            _mode = 'enumeration'
-            _list = self.__processEnumeration__( _child)
-        else:
-            _isEnumeration = False
-
-        # Table
-        if _child.tag == '{%s}%s' % (self.ns['table'],'table'):
-            _list = self.getTable( _child )
-            _mode = 'table'
-        # Declarations: at the beginning of the file
-        if _child.tag == '{%s}%s' % (self.ns['text'],'sequence-decls'):
-            return None
-        # level, style, list of sentences, isHeading, isPara, XML in case of error.
-        else:
-            _data = (_level, _style, _list, _isHeading, _isParagraph, _mode, etree.tostring(_child, pretty_print=True) )
-            return  _data
-
-    def getParagraph(self, _child, mode = 'p'):
-       '''It checks this is paragraph and recovers the text.'''
-       _list = []
-       if _child.tag == '{%s}%s' % (self.ns['text'], mode):
-          _text = u''
-          for _i in _child.iter():
-             if _i.tag == '{%s}%s' % (self.ns['text'], mode):
-                if _i.text != None:
-                   _text = _text + _i.text
-             elif _i.tag == '{%s}%s' % (self.ns['text'],'s'):
-                try:
-                   _text = _text +  ' ' * int(_i.attrib['{%s}%s' % (self.ns['text'],'c')])
-                except KeyError:
-                   pass # It is used not only to add spaces.
-             elif _i.tag == '{%s}%s' % (self.ns['text'],'tab'):             
-                _text = _text + '\t'
-             #else:
-             #   print "[WARNING] Currently getParagraph does not implement:\n    %s" % _i
-
-             if _i.tail != None:
-                _text = _text + _i.tail
-          if not self.isOriginal: # We clean the text.
-             _text = _text.replace('\t',' ').replace('\r',' ').replace('\n',' ').strip()
-             _text = _text.split(' ')
-             _text = [_i for _i in _text if _i != u'']
-             _text = string.join(_text, ' ')
-          return [_text]
-       else:
-          return []
-
-    def __processEnumeration__(self,
-                               _child):
-        '''Structure for Enumerations:
-        <text:list>
-           <text:list-item>
-                <text:p>
-        '''
-        _list = []
-        _level = 1
-        _levels = { _child : {'type' : 'list', 'level' : 1}}
-        for _i in _child.iterdescendants(): # We follow all the nested structure.
-            # list-item: mismo nivel que su padre.
-            if _i.tag == '{%s}%s' % (self.ns['text'],'list-item'):
-                _levels.update({ _i : {'type' : 'list-item',
-                                      'level' : _levels[ _i.getparent()]['level']}})
-            # list: si el padre era un list-item, se le suma uno.
-            if _i.tag == '{%s}%s' % (self.ns['text'],'list'):
-                _levels.update({ _i : {'type' : 'list',
-                                      'level' : _levels[ _i.getparent()]['level']+1}})
-                
-            if _i.tag == '{%s}%s' % (self.ns['text'],'p'):
-                _levels.update({ _i : {'type' : 'p',
-                                       'level' : _levels[ _i.getparent()]['level']}})
-                _n = _levels[ _i.getparent()]['level']
-                _tmp = []
-                for _j in _i.itertext():
-                    if _j != None:
-                        
-                        _tmp = _tmp + [u'%s' % _j]
-                        #_list = _list + [ (_n, _tmp )]
-                _list = _list + [ (_n, _tmp) ]
-        return _list
-    
-    def getTable(self,_xml):
-        #<table:table-row>
-        #  <table:table-cell>
-        _list = []
-        _rows = _xml.xpath('table:table-row',namespaces={'table':self.ns['table']})
-        for _row in _rows:
-            _cells = _row.xpath('table:table-cell',namespaces={'table':self.ns['table']})
-            _row = []
-            for _c in _cells:
-                _tmp = []
-                for _i in _c.getchildren():
-                    if self.__checkTag__(_i, 'p',mode='text'):
-                        _tmp1 = []
-                        for _j in _i.itertext():
-                            if _j != None:
-                                _tmp1 = _tmp1 + [ u'%s' % _j ]
-                        _tmp = _tmp + _tmp1
-
-                    elif self.__checkTag__(_i, 'list',mode='text'):
-                        _tmp1 = self.__processList__( _i )
-                        _tmp = _tmp + _tmp1
-                _row.append( _tmp )
-            _list.append(_row)
-        return _list
-
-    def __processFrame__(self,_xml,_name):
-        _output = u''
-        #print 'ok'
-        for _i in _xml.getchildren():
-            if self.__checkTag__(_i,'object-ole',mode='draw'):
-                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
-                _newfilename = self.__fileExtractor__( _filename )
-                _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)
-            elif self.__checkTag__(_i,'image',mode='draw'):
-                # No hacer nada si la imagen es un Object Replacement
-                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
-                if not _filename.__contains__('ObjectReplacements/'):
-                    _newfilename = self.__fileExtractor__( _filename )
-                    _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)          
-        return _output
-    
-    def __fileExtractor__(self,_filename):
-        '''Extract the file an assigns a proper filename to it'''
-        # All paths are relatives. We take away the beginning.
-        if _filename[0:2] == './':
-            _tmp = _filename[2:]
-        else:
-            _tmp = _filename
-        
-        _outdir = os.path.split( self.orgfile.filename)[0]
-
-        # The file is extracted.
-        try:
-           self.files.extract(_tmp, _outdir)
-        except AttributeError:
-           print "ERROR: ZipFile provides 'extract' function in Python 2.6." 
-           print "       Please, consider updating to PYTHON v2.6"
-           raise AttributeError("Zipfile instance has no attribute 'extract'")
-        _extractedfile = os.path.join(_outdir,_filename)
-
-        # Is an OLE object.
-        _toRename = False
-        _ole = None
-        if ole.isOleFile(_extractedfile):
-            _ole = Ole( filename = _extractedfile )
-            _new = _ole.extractFile()
-            _ole.__close__()
-            _extractedfile = _new
-            _toRename = True
-        
-        if _toRename:
-            _extractedfile = os.path.realpath(_extractedfile)
-            _tmp = os.path.split(_new)
-            _tmpname = _tmp[1]
-            _tmpname = _tmpname.replace(' ','_')
-            _new = os.path.join(_tmp[0],_tmpname )
-        
-            if self.isOverWriter:
-                os.remove( _new )
-            try:
-                os.rename(_extractedfile,_new)
-            except WindowsError:
-                print 'ERROR: file already exists: %s' % _new
-            _extractedfile = _new
-
-        # Los ficheros se extraen a sus rutas originales
-        # Crea directorios si fuera necesario.
-        # - Movemos los ficheros al mismo directorio que el .org.
-        # - Usamos como prefijo el nombre del fichero.
-        _tmp = os.path.split( _extractedfile )
-        _newname = self.prefix + _tmp[1]
-        _fullnewname = os.path.join(_outdir,_newname)
-        
-        if os.path.isfile( _fullnewname ) and self.isOverWriter:
-            os.remove( _fullnewname )  # The preexisting file is removed.
-        elif os.path.isfile( _fullnewname ) and not self.isOverWriter:
-            print 'WARNING - The file already exists.'
-            print '  - FILENAME: %s' % _fullnewname
-            print '  - Keeping both files'
-            print '  - Referencing to the old one'
-        else:
-            pass
-        
-        # We try to move the file changing its name. 
-        try:
-            os.rename(_extractedfile,_fullnewname)
-            return _newname
-        except:   
-            return _filename
+#import string
+#import re
+from orgfile import ORGfile
+from odtfile import ODTfile
 
 if __name__ == '__main__':
     if not ERROR:
 
         # We have proper inputfile, outputfile and overwrite option.
         if _isOK:
-            _odtfile = ODTfile(filename = _inputfile,
-                               output = _outputfile,
-                               overwrite = _options.overwrite,
-                               original = _options.original)
-            _odtfile.exportToORG()
+            _odtfile = ODTfile(filename = _inputfile)#,
+                               #output = _outputfile,
+                               #overwrite = _options.overwrite,
+                               #original = _options.original)
+            _list = _odtfile.gen_list()
+
+            _orgfile = ORGfile( filename = os.path.realpath( _outputfile ))
+#                                overwrite = overwrite)
+            _orgfile.readfromlist( _list )
+            _orgfile.export( )
+
        
+# -*- coding: utf-8 -*-
+'''
+TODO:
+-----
+1. [X] It fails when the document doesn't contain text. (Nick Dokos)
+   To fail cleanly and advise the user that this is not OCR software.
+2. <text:p>   -> This is a typical paragraph.
+     tetxo01 jskdljsd
+     <text:s text:c="6">   -> This means 6 spaces
+    more text (this is a continuation)
+    <text:tab />     -> This is a tab
+   </text:p>
+'''
+
+ERROR = False
+import codecs
+import zipfile
+import os
+import string
+import re
+from orgfile import ORGfile
+try:
+    from lxml import etree
+except ImportError:
+    print u'ERROR: install "nxml": http://codespeak.net/lxml/'
+    ERROR = True
+
+try:
+    import OleFileIO_PL as ole
+except ImportError:
+    print u'ERROR: please, install "OleFileIO_PL": http://www.decalage.info/files/OleFileIO_PL-0.18.zip'
+    # http://www.decalage.info/en/python/olefileio
+    ERROR = True
+
+class ODTfileError(Exception):
+    '''Raised when a file cannot be opened or recognised as an ODT document.
+
+    Accepts the usual Exception arguments (typically a single message).
+    '''
+    def __init__(self, *args):
+        super(ODTfileError,self).__init__(*args)
+
+# Alternate spelling kept as an alias for code that refers to ODTFileError.
+ODTFileError = ODTfileError
+
+class ODTfile:
+    '''This class enables the access to OpenOffice files'''
+    def __init__(self,
+                 filename = None):
+        '''Open ``filename`` as a zip archive and check that it is an ODT file.
+
+        filename -- path of the .odt file. It is mandatory even though the
+                    signature gives it a default.
+
+        Raises ODTfileError when no filename is given, when the file cannot
+        be opened as a zip archive, or when isOpenOfficeFile() rejects it.
+        '''
+        # Namespace prefix -> namespace URI, used to build qualified names.
+        self.ns = { 
+           'office' : "urn:oasis:names:tc:opendocument:xmlns:office:1.0", 
+           'text'   : "urn:oasis:names:tc:opendocument:xmlns:text:1.0",
+           'table'  : "urn:oasis:names:tc:opendocument:xmlns:table:1.0",
+           'draw'   : "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
+           'xlink'  : "http://www.w3.org/1999/xlink"
+                  }
+
+        # Qualified name -> short label returned by get_type().
+        self.types = { 
+            '{%s}%s' % (self.ns['text'],'p') : 'paragraph',
+            '{%s}%s' % (self.ns['text'],'h') : 'heading',
+            '{%s}%s' % (self.ns['draw'],'frame') : 'frame',
+            '{%s}%s' % (self.ns['draw'],'name') : 'framename',
+            '{%s}%s' % (self.ns['text'],'outline-level') : 'outline-level',
+            '{%s}%s' % (self.ns['text'],'list'): 'list',
+            '{%s}%s' % (self.ns['text'],'style-name'): 'style-name',
+            '{%s}%s' % (self.ns['table'],'table'): 'table',
+            '{%s}%s' % (self.ns['draw'],'object-ole'): 'object-ole',
+            '{%s}%s' % (self.ns['draw'],'image'): 'image',
+            '{%s}%s' % (self.ns['text'],'s') : 'spaces',
+            '{%s}%s' % (self.ns['text'],'tab') : 'tabs'
+                        }
+
+        if filename is None:
+            raise ODTfileError( u"""Provided ODT file class ODTfile requires a valid filename during initialization.""" )
+
+        self.filename = os.path.realpath( filename )
+
+        # NOTE: the file-name prefix that used to be derived from the ORG
+        # file name is no longer computed here.
+
+        # Files: an ODT document is a zip archive.
+        try:
+            self.files = zipfile.ZipFile( self.filename, "r")
+        except (IOError, zipfile.BadZipfile):
+            # Only "cannot read" / "not a zip" are translated; anything else
+            # is a programming error and must stay visible.
+            raise ODTfileError( u"""ODT files are zip files. The file provided isn't.""" )
+
+        if not self.isOpenOfficeFile():
+            raise ODTfileError( u"""Only compatible with ODT files 1.0 specification.""" )
+
+        self.xml = None
+
+    def isOpenOfficeFile(self):
+        '''Tell whether the opened archive looks like an OpenOffice document.
+
+        Returns True only when the archive has a 'content.xml' member whose
+        root element is office:document-content. As a side effect, the
+        parsed 'content.xml' tree is stored in self.xml when the member exists.
+        '''
+        # 1. The archive must contain the member "content.xml".
+        if 'content.xml' not in self.files.namelist():
+            return False
+
+        # 2. Parse 'content.xml' and look for the "office:document-content" root.
+        _content = self.files.read( 'content.xml' )
+        self.xml = etree.fromstring( _content )
+        _office = { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'}
+        _roots = self.xml.xpath('/office:document-content', 
+                                namespaces= _office )
+        return len(_roots) == 1
+
+    def _get_textbody( self ):
+        """Return the office:text element found under office:body.
+
+        'content.xml' is read from the archive and parsed on every call.
+        The first match of each XPath query is used, so a missing body or
+        text element raises IndexError.
+        """
+        _office = { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'}
+        _root = etree.fromstring( self.files.read('content.xml') )
+
+        _bodies = _root.xpath('/office:document-content/office:body', 
+                              namespaces= _office )
+        _texts = _bodies[0].xpath('office:text', 
+                                  namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )
+        return _texts[0]
+
+    def gen_list(self):
+        """Build the list handed to the ORGfile class.
+
+        Each direct child of the document text body is wrapped in a
+        'child' entry with nesting 1 and expanded through get_newlist().
+        Every expanded entry is printed, then the list is returned.
+        """
+        _entries = []
+        for _node in self._get_textbody().getchildren():
+            _seed = {'child' : _node,
+                     'tags' : {'nesting' : 1}}
+            _expanded = self.get_newlist( [_seed] )
+            if _expanded != None:
+                _entries.append( _expanded )
+
+        for _entry in _entries:
+            print(_entry)
+        return _entries
+
+    def get_newlist(self, _list ):
+        """Expand 'child' entries until only plain entries remain.
+
+        On each pass every entry holding a 'child' key is replaced by the
+        entries analyse_child() returns for it; other entries are kept in
+        place. The final list is returned.
+        """
+        _pending = _list
+        while self.has_children(_pending):
+            _expanded = []
+            for _entry in _pending:
+                if 'child' in _entry:
+                    _expanded += self.analyse_child( _entry )
+                else:
+                    _expanded.append( _entry )
+            _pending = _expanded
+        return _pending
+
+    def is_paragraph(self,_child):
+        '''True when _child.tag is the qualified name text:p.'''
+        _expected = '{%s}%s' % (self.ns['text'],'p')
+        return _child.tag == _expected
+
+    def is_heading(self,_child):
+        '''True when _child.tag is the qualified name text:h.'''
+        _expected = '{%s}%s' % (self.ns['text'],'h')
+        return _child.tag == _expected
+
+    def is_frame(self,_child):
+        '''True when _child.tag is the qualified name draw:frame.'''
+        _expected = '{%s}%s' % (self.ns['draw'],'frame')
+        return _child.tag == _expected
+
+    def is_framename(self,_child):
+        '''True when _child.tag is the qualified name draw:name.'''
+        _expected = '{%s}%s' % (self.ns['draw'],'name')
+        return _child.tag == _expected
+
+    def is_level(self,_child):
+        '''True when _child.tag is the qualified name text:outline-level.'''
+        _expected = '{%s}%s' % (self.ns['text'],'outline-level')
+        return _child.tag == _expected
+
+    def is_enum(self,_child):
+        '''True when _child.tag is the qualified name text:list.'''
+        _expected = '{%s}%s' % (self.ns['text'],'list')
+        return _child.tag == _expected
+
+    def is_style(self,_child):
+        '''True when _child.tag is the qualified name text:style-name.'''
+        _expected = '{%s}%s' % (self.ns['text'],'style-name')
+        return _child.tag == _expected
+
+    def is_table(self,_child):
+        '''True when _child.tag is the qualified name table:table.'''
+        _expected = '{%s}%s' % (self.ns['table'],'table')
+        return _child.tag == _expected
+
+    def is_object_ole(self,_child):
+        '''True when _child.tag is the qualified name draw:object-ole.'''
+        _expected = '{%s}%s' % (self.ns['draw'],'object-ole')
+        return _child.tag == _expected
+
+    def is_image(self,_child):
+        '''True when _child.tag is the qualified name draw:image.'''
+        _expected = '{%s}%s' % (self.ns['draw'],'image')
+        return _child.tag == _expected
+
+
+    def has_children(self, _list):
+        '''True when at least one entry of _list still holds a 'child' key.'''
+        return any('child' in _entry for _entry in _list)
+
+    def analyse_child( self, _dict):
+        """Expand one 'child' entry into text entries and child entries.
+
+        _dict -- {'child': element, 'tags': inherited tags}. 'tags' must
+                 contain an integer 'nesting'. The dictionary passed in is
+                 left untouched.
+
+        Returns a list holding, in this order:
+        - a 'txt' entry for a spaces/tabs element (see get_spaces/get_tabs),
+        - a 'txt' entry with the element text, when there is one,
+        - one 'child' entry per direct child of a known type, tagged with
+          nesting + 1 (unknown children are reported and skipped),
+        - a 'txt' entry with the element tail, when there is one.
+        Every entry carries its own copy of the tags.
+        """
+        _child = _dict['child']
+
+        # Work on a copy: the caller's tags and the entries emitted earlier
+        # must not change when the nesting is adjusted below.
+        _tags = dict( _dict['tags'] )
+        _tags.update( self.get_tags( _child, _child.attrib ) )
+        _type = self.get_type( _child )
+        _tags.update( {'type': _type} )
+
+        _list = []
+        if _type == 'spaces':
+            _list.append( {'txt' : self.get_spaces( _child ), 'tags' : dict(_tags) })
+        elif _type == 'tabs':
+            _list.append( {'txt' : self.get_tabs( _child ), 'tags' : dict(_tags) })
+
+        if _child.text != None:
+            _list.append( {'txt' : _child.text, 'tags' : dict(_tags) })
+
+        # Only direct children are queued: get_newlist() expands them in turn,
+        # so queueing every descendant would process nested nodes repeatedly.
+        _childtags = dict( _tags )
+        _childtags.update( {'nesting' : _tags['nesting'] + 1 } )
+        for _i in _child:
+            if self.get_type( _i ) != 'UNKNOWN':
+                _list.append( {'child' : _i, 'tags' : dict(_childtags) })
+            else:
+                print('WARNING: unkown:  %s' % (_i.tag,))
+
+        # The tail is back at this element's own nesting.
+        if _child.tail != None:
+            _list.append( {'txt' : _child.tail, 'tags' : dict(_tags) })
+
+        return _list
+
+    def get_tags(self, _child, _dict = {}):
+        """Collect the style and outline level of a node.
+
+        _dict decides whether each attribute is considered present; the
+        value itself is read from _child.attrib. Returns a dictionary with
+        the keys 'style' and 'level', each None when the attribute is absent
+        from _dict.
+        """
+        _stylekey = '{%s}%s' % (self.ns['text'],'style-name')
+        _style = _child.attrib[_stylekey] if _stylekey in _dict else None
+
+        _levelkey = '{%s}%s' % (self.ns['text'],'outline-level')
+        _level = _child.attrib[_levelkey] if _levelkey in _dict else None
+
+        return {'style' : _style, 'level' : _level}
+
+    def get_type( self, _child ):
+        """Return the label registered in self.types for _child.tag, or 'UNKNOWN'."""
+        return self.types.get(_child.tag, 'UNKNOWN')
+
+    def getData(self, _child):
+        """Process each child node.
+
+        Returns the tuple
+        (level, style, content, isHeading, isParagraph, mode, xml) where
+        - level / style come from the text:outline-level and text:style-name
+          attributes (None when absent),
+        - content is the list built from frames, paragraph/heading text,
+          the enumeration or the table,
+        - mode is 'paragraph', 'heading', 'enumeration', 'table' or None,
+        - xml is the pretty-printed serialisation of the node.
+        """
+        _style = _child.attrib.get('{%s}%s' % (self.ns['text'],'style-name'))
+        _level = _child.attrib.get('{%s}%s' % (self.ns['text'],'outline-level'))
+
+        _list = []
+        _mode = None
+
+        # Gets the text from frames.
+        for _i in _child.getchildren():
+            if self.is_frame(_i):
+                # A frame without a name falls back to a fixed label.
+                _name = _i.attrib.get('{%s}%s' % (self.ns['draw'],'name'), u'FrameName')
+                _tmp = self.__processFrame__(_i,_name)
+                _list = _list + ['%s' % _tmp ]
+
+        # Paragraph
+        _isParagraph = self.is_paragraph( _child )
+        if _isParagraph:
+            _list = _list + self.getParagraph( _child, mode = 'p' )
+            _mode = 'paragraph'
+
+        # Heading
+        _isHeading = self.is_heading( _child )
+        if _isHeading:
+            _list = _list + self.getParagraph( _child, mode = 'h' )
+            _mode = 'heading'
+
+        # Enumerations replace whatever was collected so far.
+        if self.is_enum( _child ):
+            _mode = 'enumeration'
+            _list = self.__processEnumeration__( _child)
+
+        # Table: same, the content becomes the table rows.
+        if self.is_table( _child ):
+            _list = self.getTable( _child )
+            _mode = 'table'
+
+        return (_level, _style, _list, _isHeading, _isParagraph, _mode, etree.tostring(_child, pretty_print=True) )
+
+    def get_spaces( self, _child):
+        """Deals with <text:s> which deals with extra spaces.
+        """
+        _tag = '{%s}%s' % (self.ns['text'],'c')
+        if _child.attrib.__contains__(_tag):
+            _value = _child.attrib[_tag]
+            _tmp = ' ' * int(_value)
+        else:
+            _tmp = ' '
+        return _tmp
+
+    def get_tabs( self, _child):
+        """Deals with <text:s> which deals with extra spaces.
+        """
+        _tag = '{%s}%s' % (self.ns['text'],'tab')
+        _tmp = '    '
+        #if _child.attrib.__contains__(_tag):
+        #    _value = _child.attrib[_tag]
+        #    _tmp = ' ' * int(_value)
+        #else:
+        #    _tmp = ' '
+        return _tmp
+
+
+    def getParagraph(self, _child, mode = 'p'):
+       '''It checks this is paragraph and recovers the text.'''
+       _list = []
+       if _child.tag == '{%s}%s' % (self.ns['text'], mode):
+          _text = u''
+          for _i in _child.iter():
+             if _i.tag == '{%s}%s' % (self.ns['text'], mode):
+                if _i.text != None:
+                   _text = _text + _i.text
+             elif _i.tag == '{%s}%s' % (self.ns['text'],'s'):
+                try:
+                   _text = _text +  ' ' * int(_i.attrib['{%s}%s' % (self.ns['text'],'c')])
+                except KeyError:
+                   pass # It is used not only to add spaces.
+             elif _i.tag == '{%s}%s' % (self.ns['text'],'tab'):             
+                _text = _text + '\t'
+             #else:
+             #   print "[WARNING] Currently getParagraph does not implement:\n    %s" % _i
+
+             if _i.tail != None:
+                _text = _text + _i.tail
+          #if not self.isOriginal: # We clean the text.
+          #   _text = _text.replace('\t',' ').replace('\r',' ').replace('\n',' ').strip()
+          #   _text = _text.split(' ')
+          #   _text = [_i for _i in _text if _i != u'']
+          #   _text = string.join(_text, ' ')
+          return [_text]
+       else:
+          return []
+
+    def __processEnumeration__(self,
+                               _child):
+        '''Structure for Enumerations:
+        <text:list>
+           <text:list-item>
+                <text:p>
+        '''
+        _list = []
+        _level = 1
+        _levels = { _child : {'type' : 'list', 'level' : 1}}
+        for _i in _child.iterdescendants(): # We follow all the nested structure.
+            # list-item: mismo nivel que su padre.
+            if _i.tag == '{%s}%s' % (self.ns['text'],'list-item'):
+                _levels.update({ _i : {'type' : 'list-item',
+                                      'level' : _levels[ _i.getparent()]['level']}})
+            # list: si el padre era un list-item, se le suma uno.
+            if _i.tag == '{%s}%s' % (self.ns['text'],'list'):
+                _levels.update({ _i : {'type' : 'list',
+                                      'level' : _levels[ _i.getparent()]['level']+1}})
+                
+            if _i.tag == '{%s}%s' % (self.ns['text'],'p'):
+                _levels.update({ _i : {'type' : 'p',
+                                       'level' : _levels[ _i.getparent()]['level']}})
+                _n = _levels[ _i.getparent()]['level']
+                _tmp = []
+                for _j in _i.itertext():
+                    if _j != None:
+                        
+                        _tmp = _tmp + [u'%s' % _j]
+                        #_list = _list + [ (_n, _tmp )]
+                _list = _list + [ (_n, _tmp) ]
+        return _list
+    
+    def getTable(self,_xml):
+        #<table:table-row>
+        #  <table:table-cell>
+        _list = []
+        _rows = _xml.xpath('table:table-row',namespaces={'table':self.ns['table']})
+        for _row in _rows:
+            _cells = _row.xpath('table:table-cell',namespaces={'table':self.ns['table']})
+            _row = []
+            for _c in _cells:
+                _tmp = []
+                for _i in _c.getchildren():
+                    if self.is_paragraph( _i ):
+                        _tmp1 = []
+                        for _j in _i.itertext():
+                            if _j != None:
+                                _tmp1 = _tmp1 + [ u'%s' % _j ]
+                        _tmp = _tmp + _tmp1
+
+                    elif self.is_enum( _i ):
+                        _tmp1 = self.__processList__( _i )
+                        _tmp = _tmp + _tmp1
+                _row.append( _tmp )
+            _list.append(_row)
+        return _list
+
+    def __processFrame__(self,_xml,_name):
+        _output = u''
+        #print 'ok'
+        for _i in _xml.getchildren():
+            if self.is_object_ole(_i ):
+                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
+                _newfilename = self.__fileExtractor__( _filename )
+                _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)
+            elif self.is_image(_i):
+                # No hacer nada si la imagen es un Object Replacement
+                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
+                if not _filename.__contains__('ObjectReplacements/'):
+                    _newfilename = self.__fileExtractor__( _filename )
+                    _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)          
+        return _output
+    
+    def __fileExtractor__(self,_filename):
+        '''Extract the file an assigns a proper filename to it'''
+        # All paths are relatives. We take away the beginning.
+        if _filename[0:2] == './':
+            _tmp = _filename[2:]
+        else:
+            _tmp = _filename
+        
+        _outdir = os.path.split( self.orgfile.filename)[0]
+
+        # The file is extracted.
+        try:
+           self.files.extract(_tmp, _outdir)
+        except AttributeError:
+           print "ERROR: ZipFile provides 'extract' function in Python 2.6." 
+           print "       Please, consider updating to PYTHON v2.6"
+           raise AttributeError("Zipfile instance has no attribute 'extract'")
+        _extractedfile = os.path.join(_outdir,_filename)
+
+        # Is an OLE object.
+        _toRename = False
+        _ole = None
+        if ole.isOleFile(_extractedfile):
+            _ole = Ole( filename = _extractedfile )
+            _new = _ole.extractFile()
+            _ole.__close__()
+            _extractedfile = _new
+            _toRename = True
+        
+        if _toRename:
+            _extractedfile = os.path.realpath(_extractedfile)
+            _tmp = os.path.split(_new)
+            _tmpname = _tmp[1]
+            _tmpname = _tmpname.replace(' ','_')
+            _new = os.path.join(_tmp[0],_tmpname )
+        
+            if self.isOverWriter:
+                os.remove( _new )
+            try:
+                os.rename(_extractedfile,_new)
+            except WindowsError:
+                print 'ERROR: file already exists: %s' % _new
+            _extractedfile = _new
+
+        # Los ficheros se extraen a sus rutas originales
+        # Crea directorios si fuera necesario.
+        # - Movemos los ficheros al mismo directorio que el .org.
+        # - Usamos como prefijo el nombre del fichero.
+        _tmp = os.path.split( _extractedfile )
+        _newname = self.prefix + _tmp[1]
+        _fullnewname = os.path.join(_outdir,_newname)
+        
+        if os.path.isfile( _fullnewname ) and self.isOverWriter:
+            os.remove( _fullnewname )  # The preexisting file is removed.
+        elif os.path.isfile( _fullnewname ) and not self.isOverWriter:
+            print 'WARNING - The file already exists.'
+            print '  - FILENAME: %s' % _fullnewname
+            print '  - Keeping both files'
+            print '  - Referencing to the old one'
+        else:
+            pass
+        
+        # We try to move the file changing its name. 
+        try:
+            os.rename(_extractedfile,_fullnewname)
+            return _newname
+        except:   
+            return _filename
+
+#=================================0
+class Ole:
+    def __init__(self,filename=None):
+        self.filename = filename
+        self._data = ole.OleFileIO(self.filename)
+        self.files = self.__getFiles__()
+        self._dict = { '.*AcroExch\.Document\.[0-9]+.*' : { 'ext':'.pdf','name':'CONTENTS'},
+                '.*Word\.Document\.[0-9]+.*' : { 'ext':'.doc','name': None},
+                '.*Excel\.Sheet\.[0-9]+.*'   : { 'ext':'.xls','name': None},
+                '.*PowerPoint\.Show\.[0-9]+.*'   : { 'ext':'.ppt','name': None},
+                       }
+    
+    def __guessFormat__(self):
+        _data = self.__getItem__(0)
+        #print _data
+        _KEY = None
+        for _key in self._dict.keys():
+            _kernel = re.compile(  _key )
+            _tmp = _kernel.findall( _data )
+            if len(_tmp) == 1:
+                _KEY = _key
+        return _KEY
+
+    def __getFiles__(self):
+        return self._data.listdir()
+    
+    def __getItem__(self,_idx):
+        _tmp = self._data.openstream( self.files[_idx] )
+        _data = _tmp.read()
+        return _data
+        
+    def __extractFile__(self,_idx):
+        # Leemos el fichero
+        _data = self.__getItem__(_idx)
+        _fp = open('filaneme.txt','wb')
+        _fp.write( _data )
+        _fp.close()
+    
+    def extractFile(self):
+        _key = self.__guessFormat__()
+        if _key == None:
+            _tmp = self._data.openstream( self.files[0] )
+            _kernel = re.compile('[a-zA-Z]+\.[a-zA-Z]+\.[0-9]+')
+            _tmp = _tmp.read()
+            print 'ERROR: It is not configured for: %s' % _kernel.findall( _tmp )[0]
+            #print self.files
+            #print _tmp
+            return None
+        else:
+            if self._dict[_key]['name'] != None:
+                _tmp = self._data.openstream( self._dict[_key]['name'] )
+                _file = os.path.splitext(self.filename)
+                _ext = self._dict[ _key ]['ext']
+                _filename = _file[0] + _ext
+                #_tmp = self.outfile +_ext
+                #_tmp = os.path.join( self.outdir, _tmp)
+                _fp = open( _filename, 'wb')
+                _fp.write( _tmp.read() )
+                _fp.close()
+    #            print self.filename
+                del(_tmp)
+                self._data.fp.close()
+                os.remove(self.filename)
+                return _filename
+            else:
+                # Símplemente se renombra
+                #print self.files
+                #print self.__getItem__(7)
+                _file = os.path.splitext(self.filename)
+                _ext = self._dict[ _key ]['ext']
+                _filename = _file[0] + _ext                
+                self._data.fp.close()
+                try:
+                    os.rename(self.filename,_filename)
+                except WindowsError:
+                    print 'WARNING: the file already exists: %s' % _filename
+                return _filename
+
+    def __close__(self):
+        self._data.fp.close()
+
+class FileError(Exception):
+    def __init__(self, 
+                 message,
+                 filename ):
+        self.filename = filename
+        self.message = message
+
+    def __str__(self):
+        return u'%s: %s' % ( self.message, self.filename )
+
+
+
+if __name__ == '__main__':
+    # Nothing is executed when this module is run as a script.
+    pass
+# -*- coding: utf-8 -*-
+#!/usr/python
+'''
+'''
+import codecs
+import os
+
+class ORGfile:
+    '''This class is used to generate an ORG file'''
+    def __init__(self,
+                 filename = None,
+                 overwrite = False):
+        self.filename = filename
+        self.isWarning = False
+        self.data = [u'-*- mode: org; coding: utf-8 -*-\n' ]
+    
+    def readfromlist(self, _list):
+        #print _list
+        if _list[0][1] == u'Title':
+            _start = 1
+            self.addHeading( level = 1,
+                             text = _list[0][2] )
+        else:
+            _start = 0
+            self.addHeading( level = 1,
+                             text = [u'%s' % os.path.split(self.filename)[1]] )
+            
+        for _i in _list[_start:]:
+            if _i[3]: # Is heading (a dictionary would be more readable)
+                self.addHeading( level = int(_i[0])+1,
+                                 text = _i[2] )
+            elif _i[4]: # Is heading (a dictionary would be more readable)
+                self.addParagraph( text = _i[2] )
+            elif _i[5] == 'table':
+                self.addTable( table = _i[2] )
+            elif _i[5] == 'enumeration':
+                self.addEnumeration( enumeration = _i[2] )
+                
+
+    def addHeading(self, 
+                   level = 1,
+                   text = [] ):
+        #print text
+        # A heading should only have one line.
+        if len( text ) > 1:
+            pass # TODO: juntar todas las líneas
+            # text.replace('\r\n',' ')    
+            #text.replace('\n',' ')
+
+        # Check level is valid
+        if level < 1:
+            text = [u'* [WARNING] ' + text[0]]
+            self.isWarning = True
+        else:
+            text = [u'*'*(level) + u' %s' % text[0]]
+
+        self.data = self.data + text
+
+    def addParagraph(self, 
+                     text = [] ):
+        self.data = self.data + text
+
+    def addTable(self, 
+                 table = []):
+        _list = []
+        if len(table) > 0:
+            # First line
+            _list = _list + [u'|-']
+            _list = _list + self.__addRow__( table[0] )
+            _list = _list + [u'|-']
+        if len(table) > 1:
+            for _row in table[1:]:
+                _list = _list + self.__addRow__( _row )
+            _list = _list + [u'|-']
+
+        self.data = self.data +  _list 
+
+    def __addRow__(self, _row):
+        '''Helper function to show rows'''
+        _list = []
+        _n = 0
+        _flag = True
+        while _flag:
+            _text = u'| '
+            for _cell in _row:
+                _flag = False
+                if len(_cell) > _n:
+                    _text = _text + u'%s |' % _cell[_n]
+                    _flag = True
+                else:
+                    _text = _text + u'   |'
+            if _flag:
+                _list.append( _text )
+            _n = _n + 1
+        return _list
+
+    def addEnumeration(self, 
+                       enumeration = []):
+        _lista = []
+        for _i in enumeration:
+            _lista = _lista + [u' ' * (_i[0]-1) + u'- ' + _i[1][0]]
+            if len(_i[1]) > 1:
+                for _j in _i[1:]:
+                    _lista = _lista + [ _j ]
+
+        self.data = self.data +  _lista
+
+    def clean(self):
+        pass
+
+    def export(self):
+       #try:
+           _tmp = u''
+           for _i in self.data:
+               _tmp = _tmp + u'%s\n' % _i
+           _fp = codecs.open( self.filename, 'w', 'utf-8')
+           _fp.write( _tmp )
+           _fp.close()
+       #except:
+       #    print u'ERROR: the file was processed, but it failed when writting it:\n   %s' % self.filename
+
+
+if __name__ == '__main__':
+    # Nothing is executed when this module is run as a script.
+    pass