Commits

Anonymous committed b0afbc4

Much more improved

  • Participants
  • Parent commits 3fa2f69

Comments (0)

Files changed (3)

                                #output = _outputfile,
                                #overwrite = _options.overwrite,
                                #original = _options.original)
-            _list = _odtfile.gen_list()
-
-            _orgfile = ORGfile( filename = os.path.realpath( _outputfile ))
-#                                overwrite = overwrite)
-            _orgfile.readfromlist( _list )
+            _list, _extra = _odtfile.gen_list()
+            #overwrite = False
+            _orgfile = ORGfile( filename = os.path.realpath( _outputfile ),
+                                original = _inputfile,
+                                overwrite = _options.overwrite)
+            _orgfile.read_list( _list,_extra )
             _orgfile.export( )
 
        
 # -*- coding: utf-8 -*-
 '''
-TODO:
------
-1. [X] It fails when the document doesn't contain text. (Nick Dokos)
-   To fail cleanly and advice this is not OCR software.
-2. <text:p>   -> This is a typical paragraph.
-     tetxo01 jskdljsd
-     <text:s text:c="6">   -> This means 6 spaces
-    more text (this is a continuation)
-    <text:tab />     -> This is a tab
-   </text:p>
+TODO: 
+1. Not having OLE should produce only a warning
 '''
 
 ERROR = False
 import codecs
-import zipfile
 import os
 import string
 import re
+import tempfile
 from orgfile import ORGfile
+
+try:
+    import zipfile
+except ImportError:
+    print u"ERROR: package 'zipfile' is mandatory. Install Python2.6 or higher"
+    ERROR=True
+
 try:
     from lxml import etree
 except ImportError:
                  filename = None):
         '''All the fields are mandatory. All the checks are done outside.
         '''
+        self.tmpdir = tempfile.mkdtemp()
         self.ns = { 
            'office' : "urn:oasis:names:tc:opendocument:xmlns:office:1.0", 
            'text'   : "urn:oasis:names:tc:opendocument:xmlns:text:1.0",
         self.types = { 
             '{%s}%s' % (self.ns['text'],'p') : 'paragraph',
             '{%s}%s' % (self.ns['text'],'h') : 'heading',
-            '{%s}%s' % (self.ns['draw'],'frame') : 'frame',
-            '{%s}%s' % (self.ns['draw'],'name') : 'framename',
             '{%s}%s' % (self.ns['text'],'outline-level') : 'outline-level',
             '{%s}%s' % (self.ns['text'],'list'): 'list',
             '{%s}%s' % (self.ns['text'],'list-item'): 'list-item',
             '{%s}%s' % (self.ns['text'],'style-name'): 'style-name',
-            '{%s}%s' % (self.ns['table'],'table'): 'table',
-            '{%s}%s' % (self.ns['draw'],'object-ole'): 'object-ole',
-            '{%s}%s' % (self.ns['draw'],'image'): 'image',
             '{%s}%s' % (self.ns['text'],'s') : 'spaces',
             '{%s}%s' % (self.ns['text'],'tab') : 'tabs',
             '{%s}%s' % (self.ns['text'],'note') : 'note',
             '{%s}%s' % (self.ns['text'],'note-citation') : 'note-citation',
-            '{%s}%s' % (self.ns['text'],'note-body') : 'note-body'
+            '{%s}%s' % (self.ns['text'],'note-body') : 'note-body',
+            '{%s}%s' % (self.ns['draw'],'frame') : 'frame',
+            '{%s}%s' % (self.ns['draw'],'name') : 'framename',
+            '{%s}%s' % (self.ns['draw'],'object-ole'): 'object-ole',
+            '{%s}%s' % (self.ns['draw'],'image'): 'image',
+            '{%s}%s' % (self.ns['xlink'],'href'): 'href',
+            '{%s}%s' % (self.ns['table'],'table'): 'table',
+            '{%s}%s' % (self.ns['table'],'table-row'): 'table-row',
+            '{%s}%s' % (self.ns['table'],'table-cell'): 'table-cell'
                         }
         #
 
         _list = _newlist
 
         # - Regroup: move spaces, tabs into the prior child.
+        _list = self.process_regroup(_list)
+         
+        # - Getting heading and paragraph text.
+        _list = self.process_headings_paragraphs( _list )
+        # LIST
+        _list = self.group_children( _list, tag = 'list', clean= True)
+        _list = self.process_enumeration(_list)
+
+        # TABLE
+        _list = self.group_children( _list, tag = 'table', clean= True)
+        _list = self.process_tables(_list)
+
+        # FRAME
+        # - Don't clean to recover the framename
+        _list = self.group_children( _list, tag = 'frame', clean= False)
+        _list = self.process_framename(_list)
+
+        # PRINT
+        #for _i in _list:
+        #    print _i
+        #print _footnotes
+        # Process Footnotes
+        _tmp = []
+        for _i in _footnotes:
+            print _i
+            _tmp1 = self.process_regroup( _i[1] )
+
+            _tmp1 = self.process_headings_paragraphs( _tmp1 )
+            _tmp.append((_i[0],_tmp1))
+
+        _extra = {'footnotes' : _tmp}
+        return _list, _extra
+
+    def process_regroup(self,_list):
         _newlist = []
         _tmp = u''
         for _i in xrange(len(_list)):
             else:
                 _tmp = _tmp + _item['txt']
             
-        _list = _newlist        
-         
-        # - Getting heading and paragraph text.
+        return  _newlist        
+
+    def process_headings_paragraphs( self,_list ):
         _newlist = []
         for _i in _list:
-            _flag = True
-            # - Heading
+            _isHeading = False
+            _isParagraph = False
+
             try:
                 if self.types[ _i['child'].tag] == 'heading':
-                    _tmp = u''
-                    if _i['pretext'] != None:
-                        _tmp += _i['pretext']
-                    if _i.__contains__('txt'):
-                        if _i['txt'] != None:
-                            _tmp += _i['txt']
-                    if _i['posttext'] != None:
-                        _tmp += _i['posttext']
-                    _tags =  self.get_tags( _i['child'] )
-                    #print _i['child']
-                    _tags.update({'type' : 'heading'})
-                    _tmp = { 'text' : _tmp,
-                             'nesting': _i['nesting'],
-                             'tags' : _tags}
-
-                    _newlist.append(_tmp)
-                    _flag = False
+                    _isHeading = True
             except:
                 pass 
-            # - Paragraph
             try:
                 if self.types[ _i['child'].tag] == 'paragraph':
-                    _tmp = u''
-                    if _i['pretext'] != None:
-                        _tmp += _i['pretext']
-                    if _i.__contains__('txt'):
-                        if _i['txt'] != None:
-                            _tmp += _i['txt']
-                    if _i['posttext'] != None:
-                        _tmp += _i['posttext']
-                    _tags =  self.get_tags( _i['child'] )
-                    _tags.update({'type' : 'paragraph'})
-                    _tmp = { 'text' : _tmp,
-                             'nesting': _i['nesting'],
-                             'tags' : _tags}
-                    _newlist.append(_tmp)
-                    _flag = False
+                    _isParagraph = True
             except:
                 pass 
-            if _flag:
+            
+            if _isHeading or _isParagraph:                    
+                _tmp = u''
+                if _i['pretext'] != None:
+                    _tmp += _i['pretext']
+                if _i.__contains__('txt'):
+                    if _i['txt'] != None:
+                        _tmp += _i['txt']
+                if _i['posttext'] != None:
+                    _tmp += _i['posttext']
+                _tags = self.get_tags( _i['child'])
+                if _isHeading:
+                    _tags.update({'type' : 'heading'})
+                elif _isParagraph:
+                    _tags.update({'type' : 'paragraph'})
+
+                _tmp = { 'text' : _tmp,
+                         'nesting': _i['nesting'],
+                         'tags' : _tags}
+
+                _newlist.append(_tmp)
+
+            else:
                 _newlist.append(_i)
             
-        _list = _newlist        
-        # LIST
-        #_list = self.group_type( _list, tag = 'list', clean= True)
-        _list = self.group_children( _list, tag = 'list', clean= True)
+        return _newlist        
+
+    def process_enumeration(self, _list):
+        """Convert lists into something closer to text.
+        Structure for Enumerations:
+        <text:list>
+           <text:list-item>
+                <text:p>
+        """
+        # Phase 1
         _newlist = []
         for _i in _list:
             if _i.__contains__('list'):
+                # - Clean "list"
                 _tmp = self.clean( _i['list'], tags = ['list'])
+                # - Group "list-item"
+                _tmp = self.group_type( _tmp, tag = 'list-item', clean= True)
                 _tmp1 = _i.copy()
                 _tmp1['list'] = _tmp
+
                 _newlist.append( _tmp1)
             else:
                 _newlist.append( _i)
         _list = _newlist
+        # Phase 2
+        _newlist = []
         for _i in _list:
-            print _i
-        _list = self.process_enumeration(_list)
+            if _i.__contains__('list'):
+                _tmp = []
+                for _j in _i['list']:
+                    if _j.__contains__('list-item'):
+                        _paragraphs = _j['list-item']
+                        _txt = u''
+                        for _k in _paragraphs:
+                            _txt += _k['text'] +'\n'
+                        _txt = _txt[0:-1]
+                        _tmp1 = _paragraphs[0].copy()
+                        _tmp1['text'] = _txt
 
-        # TABLE
-        _list = self.group_type( _list, tag = 'table')
+                        _tmp1['tags']['type'] = 'list-item'
+                        _n = (_tmp1['nesting']-_i['nesting'] )/ 2
+                        _tmp1['tags']['level'] = _n
+                        _tmp1.pop('nesting')
+                        _tmp.append( _tmp1)
+                
+                _newlist += _tmp
+            else:
+                _newlist.append( _i )
 
-        # PRINT
-        #for _i in _list:
-        #    print _i
-        #print _footnotes
-        return _list
+        return _newlist
 
-    def process_enumeration(self, _list):
+    def process_tables(self, _list):
+        """Simplifies the format in which tables are expressed.
         """
+        # Phase 1   
+        _newlist = []
+        for _i in _list:
+            if _i.__contains__('table'):
+                # - Group "table-row"
+                _tmp = self.group_children( _i['table'], 
+                                             tag = 'table-row', 
+                                             clean= True)
+
+                _tmp1 = []
+                for _j in _tmp:
+                    if _j.__contains__('table-row'):
+                        _tmp2 = self.group_children( _j['table-row'], 
+                                             tag = 'table-cell', 
+                                             clean= True)
+                        _tmp3 = _j.copy()
+                        _tmp3['table-row'] = _tmp2
+                        _tmp1.append( _tmp3)
+                    else: 
+                        _tmp1.append( _j)
+
+                _tmp4 = _i.copy()
+                _tmp4['table'] = _tmp1
+                _newlist.append( _tmp4)
+            else:
+                _newlist.append( _i)
+        _list = _newlist
+        
+        # Phase 2: convert into "text"
+        _newlist = []
+        for _i in _list:
+            if _i.__contains__('table'):
+                _table = _i['table']
+                # Get rows.
+                _nrow = 0
+                for _j in _table:
+                    if _j.__contains__('table-row'):
+                        _row = _j['table-row']
+                        # Get cells
+                        for _k in _row:
+                            if _k.__contains__('table-cell'):
+                                _cell = _k['table-cell']
+                                _txt = u''
+                                for _m in _cell:
+                                    _txt += _m['text'] + ' '
+                                _txt = _txt[0:-1]
+                                _tmp = _cell[0].copy()
+                                _tmp['text'] = _txt
+                                _tmp.pop('nesting')
+                                _tmp['tags']['type'] = 'table-cell'
+                                _tmp['tags'].update( { 'nrow' : _nrow })
+                                _newlist.append( _tmp)
+                        _nrow +=1
+            else:
+                _newlist.append( _i ) 
+        return _newlist
+
+    def process_framename(self, _list):
+        # Phase 1
+        _newlist = []
+        _counter = 0
+        for _i in _list:
+            if _i.__contains__('frame'):
+                _frame = _i['frame']
+                # Framename
+                _tags = self.get_tags(_frame[0]['child'])
+                if not _tags.__contains__('framename'): 
+                    _tags.update({'framename' : None}) 
+                # - Images
+                _tmp = self.group_children(_frame[1:],tag = 'image',clean = False)
+                _tmp1 = []
+                for _j in _tmp:
+                    if _j.__contains__('image'):
+                        _images = _j['image']
+                        _n = 0
+                        for _image in _images:
+                            # - Extract reference
+                            _imgtags = self.get_tags( _image['child'])
+                            if _imgtags.__contains__('href'):
+                                _dict = { 'text' : _imgtags['href'],
+                                          'tags' : {'type' : 'image',
+                                                    'style' : None} }
+                                if _tags['framename'] == None:
+                                    _name = 'Img_%.4d' % _counter
+                                    _counter += 1
+                                else:
+                                    if len(_images) >1:
+                                        _name = '%s_%.4d' % (_tags['framename'],_n)
+                                        _n +=1
+                                    else:
+                                        _name = '%s' % _tags['framename']
+                                _dict['tags'].update({'name': _name})
+                                _tmp1.append(_dict)
+                    else:
+                        _tmp1.append(_j)
+                _tmpp = _i.copy()
+                _tmpp['frame'] = _tmp1 
+                _newlist.append(_tmpp)
+            else:
+                _newlist.append( _i )
+        _list = _newlist
+
+        # Phase 2: create text
+        _newlist = []
+        for _i in _list:
+            if _i.__contains__('frame'):
+                _frame = _i['frame']
+                for _j in _frame:
+                    if _j.__contains__('text'):
+                        _newlist.append(_j)
+            else:
+                _newlist.append(_i)
+        _list = _newlist
+
+        # Phase 3: file extractor
+        _newlist = []
+        for _i in _list:
+            if _i.__contains__('tags'):
+                if _i['tags']['type'] == 'image':
+                    _newfilename = self.__fileExtractor__( _i['text'] )
+                    _i['text'] = _newfilename
+                    #print _newfilename
+                    _newlist.append(_i)
+            else:
+                _newlist.append(_i)
+
+        # - If the filename contains 'ObjectReplacements/' then it is not extracted.
+        # Previously this used is_object_ole and also is_image, but I don't understand for what purpose.
+        #_newfilename = self.__fileExtractor__( _filename )
+        # '[[file:%s][%s]]\n' % (_newfilename,_name)
+        return _newlist
+
+    def clean(self, _list, tags = []):
+        """Removes the items whose type is contained in the list 'tags'.
         """
         _newlist = []
         for _i in _list:
-            if _i.__contains__('list'):
-                #print _i,'\n'
-
-                _tmp = self.group_type( _i['list'], 
-                                        tag = 'list-item',
-                                        clean = True)
-  
-                # Clean from 'list' and 'list-item'
-                #_tmp = self.clean(_tmp, tags = ['list','list-item'])
-
-                _tmp1 = _i.copy()
-                _tmp1['list'] = _tmp
-
-                _newlist.append( _tmp1 )
-            else:
-                _newlist.append( _i )
-        #print '\n\n\n'        
-        return _newlist
-
-    def clean(self, _list, tags = []):
-        _newlist = []
-        for _i in _list:
             try:
-                _tag = _i['child'].tag
-                _type = self.types[_tag]
+                _type = self.types[ _i['child'].tag ]
             except:
                 _type = None
             if not tags.__contains__(_type):
                 _newlist.append(_i)
-        return _newlist
-                
+        #if tags == ['list']:
+        #    print _newlist,'\n\n'
+        return _newlist     
 
-    def __processEnumeration__(self,
-                               _child):
-        '''Structure for Enumerations:
-        <text:list>
-           <text:list-item>
-                <text:p>
-        '''
-        _list = []
-        _level = 1
-        _levels = { _child : {'type' : 'list', 'level' : 1}}
-        for _i in _child.iterdescendants(): # We follow all the nested structure.
-            # list-item: mismo nivel que su padre.
-            if _i.tag == '{%s}%s' % (self.ns['text'],'list-item'):
-                _levels.update({ _i : {'type' : 'list-item',
-                                      'level' : _levels[ _i.getparent()]['level']}})
-            # list: si el padre era un list-item, se le suma uno.
-            if _i.tag == '{%s}%s' % (self.ns['text'],'list'):
-                _levels.update({ _i : {'type' : 'list',
-                                      'level' : _levels[ _i.getparent()]['level']+1}})
-                
-            if _i.tag == '{%s}%s' % (self.ns['text'],'p'):
-                _levels.update({ _i : {'type' : 'p',
-                                       'level' : _levels[ _i.getparent()]['level']}})
-                _n = _levels[ _i.getparent()]['level']
-                _tmp = []
-                for _j in _i.itertext():
-                    if _j != None:
-                        
-                        _tmp = _tmp + [u'%s' % _j]
-                        #_list = _list + [ (_n, _tmp )]
-                _list = _list + [ (_n, _tmp) ]
-        return _list
-    def group_recursively(self, _list, tag = 'list'):
-        """Groups recursively up to making dissapear the tag
+    def get_istagged(self,_list,tag = 'list'):
+        """Creates an array stating 'True' where the tag is as specified.
         """
-        
-        pass
+        _tmp = []
+        for _i in _list:
+            try:
+                if self.types[ _i['child'].tag ] == tag:
+                    _tmp.append( True )
+                else:
+                    _tmp.append( False )
+            except:
+                _tmp.append( False )
+        return _tmp
 
-    def has_tag(self):
-        pass
+    def get_number_of_children(self, _list, idx = 0):
+        """Given an item in a list indicated by its position: 'idx', this
+        function returns the number of children it has.
+        """
+        _nesting = _list[idx]['nesting']
+        _count = 0
+        for _i in xrange(idx+1, len(_list)):
+            if _list[_i]['nesting'] > _nesting:
+                _count += 1
+            else:
+                break
+        return _count
+
+    def get_pairs(self,_list, tag= 'list'):  #<<<<<<<<<<<<<<<
+        """Creates a list of pairs. Each pair represents an item having
+        a specified tag and ALL its children (even if the tag is contained
+        again among its children).
+        """
+        _istagged = self.get_istagged( _list,tag) 
+        #if tag == 'table-row':
+        #    print _istagged
+
+        #print _istagged
+        _pairs = []
+        _n = 0
+        for _i in xrange(len(_list)):
+            if len(_pairs) > 0:
+                _limit = _pairs[-1][1]
+            else:
+                _limit = 0
+            #if tag == 'list':
+            #    print _pairs
+            #    print _i, _limit
+            if _istagged[_i] and _i >= _limit:
+                _nesting = _list[_i]['nesting']
+                _n = self.get_number_of_children( _list, idx = _i)
+                _pairs.append( [_i, _i+_n+1])
+        return _pairs
 
     def group_children( self,
                         _list, 
                         tag = 'list',
                         clean = False ):
-        """Groups one tag and all its children.
         """
+        """
+        if clean:
+            _tmp = 1
+        else:
+            _tmp = 0
+        _pairs = self.get_pairs(_list, tag)
         _newlist = []
-        _tmp = []
-        _flag = False
-        _n = 0
-        for _i in _list:
-            #_isTag = False
-            try:
-                if not _flag and self.types[ _i['child'].tag ] == tag:
-                    #_isTag = True
-                    _flag = True
-                    _n = _i['nesting']
-            except:
-                pass
-            
-            #if not _flag and _isTag:
-            #print _n, _i['nesting']
-            if _flag and _i['nesting'] > _n:
-                _tmp.append(_i ) 
-            else:
-                if _tmp != []:
-                    _newlist.append({tag : _tmp, 'nesting' : _n})
-                    _tmp = []
-                    _flag = False 
-                    _n = 0
-                _newlist.append(_i)
-        if _flag and _tmp!=[]:
-            _newlist.append({tag : _tmp, 'nesting' : _n})
-        return _newlist       
+        _ini = 0
+        for _pair in _pairs:
+            _newlist += _list[_ini:_pair[0]]
+            _newlist.append( { tag : _list[_pair[0]+_tmp:_pair[1]], 
+                               'nesting' : _list[_ini]['nesting']})
+            _ini = _pair[1]
+        _newlist += _list[_ini:]
+        return _newlist
 
     def group_type( self,
                     _list, 
                     _nesting = _item['nesting']+1
                     _limits = [_i]                    
                     _tmp = _item
-                    #if tag == 'list-item':
-                    #   print '== OK ==', _limits
-                    #   print _item['nesting']
             except:
                 pass
        
             if _flag and _item != _tmp:
-                #if tag == 'list-item':
-                #    print '<< OK >>', _limits
-                #    print _item['nesting'], _nesting
                 if _item['nesting'] < _nesting:
                     _flag = False
                     _nesting = 0
                 return True
         return False
 
-    def analyse_child( self, _dict):
-        """Each child will contain:
-        - TAG: defines which kind of node the child is.
-        - ATTRIB: dictionary defining node attributes.
-        - CHILDREN: the node may contain children.
-        """
-        _child = _dict['child']
-        _tags = _dict['tags']
-        # TAG: defines the kind node of the child.
-        _tag = _child.tag
-        # ATTRIB: this is a dictionary defining attributes for the node.
-        _dict = _child.attrib 
-        # CHILDREN: it is the content of the node.
-        _list = []
-
-        _type = self.get_type(_child)
-        if _tags.__contains__('type'):
-            _type = _tags['type'] + [_type]
-        else:
-            _type = [_type]
-        _newtags = self.get_tags( _child, _dict)
-        for _key in _newtags.keys():
-            _tags.update({_key : _newtags[_key] })
-        _tags.update( {'type': _type})
-
-        if _type[-1] == 'spaces':
-            _tmp = self.get_spaces( _child )
-            _list.append( {'txt' : _tmp, 'tags' : _tags })
-        elif _type[-1] == 'tabs':
-            _tmp = self.get_tabs( _child )
-            _list.append( {'txt' : _tmp, 'tags' : _tags })
-
-        if _child.text != None:
-            _list.append( {'txt' : _child.text, 'tags' : _tags })
-
-        _nesting = _tags['nesting'] + 1
-        _tags.update({'nesting' : _nesting })
-        for _i in _child.iter():
-            if _i != _child:
-                _childtype = self.get_type(_child)
-                if _childtype != 'UNKNOWN': 
-                    _list.append( {'child' : _i, 'tags' : _tags })
-                else:
-                    print 'WARNING: unkown: ', _child.tag
-        if _child.tail != None:
-            _nesting = _tags['nesting'] - 1
-            _tags.update({'nesting' : _nesting })
-            _list.append( {'txt' : _child.tail, 'tags' : _tags })
-        
-        return _list
 
     def get_tags(self, _child):
         """This function creates a tagged text
         """
-        #print '\n\n>>>>>>> ', _child
         _dict = _child.attrib
-        #print _dict
         _tags = {}
+
         # Style
         _tagstyle = '{%s}%s' % (self.ns['text'],'style-name')
         if _dict.__contains__(_tagstyle):
             _tags.update({'level' : _level })
         else:
             _tags.update({'level' : None} )
+
+        # Framename
+        _tag = '{%s}%s' % (self.ns['draw'],'name')
+        if _dict.__contains__(_tag):
+            _value = _child.attrib[_tag]
+            _id = self.types[ _tag]
+            _tags.update({ _id : _value })
+  
+        # Image > href
+        _tag = '{%s}%s' % (self.ns['xlink'],'href')
+        if _dict.__contains__(_tag):
+            _value = _child.attrib[_tag]
+            _id = self.types[ _tag]
+            _tags.update({ _id : _value })
+ 
+        #else:
+        #    _tags.update({'level' : None} )
+        
         return _tags
 
-    def get_type( self, _child ):
-        """Gets the kind of child"""
-        if self.types.__contains__(_child.tag):
-            return self.types[_child.tag]
-        else:
-            return 'UNKNOWN'
-
-    def getData(self, _child):
-        """Process each child node.
-        """
-        _tag = _child.tag
-
-        # Get text
-        
-        _list = []
-
-        
-        # Gets the text from frames.
-        for _i in _child.getchildren():
-            if is_frame(_i):
-                _name = _i.attrib['{%s}%s' % (self.ns['draw'],'name')]
-                if _name == None:
-                    _name = u'FrameName'
-                _tmp = self.__processFrame__(_i,_name)
-                _list = _list + ['%s' % _tmp ]
-
-        # Enumerations
-        if _child.tag == '{%s}%s' % (self.ns['text'],'list'):
-            _isEnumeration = True
-            _mode = 'enumeration'
-            _list = self.__processEnumeration__( _child)
-        else:
-            _isEnumeration = False
-
-        # Table
-        if self.is_table( _child.tag ):
-            _list = self.getTable( _child )
-            _mode = 'table'
-
-        else:
-            return (_level, _style, _list, _isHeading, _isParagraph, _mode, etree.tostring(_child, pretty_print=True) )
-
     def get_spaces( self, _child):
         """Deals with <text:s> which deals with extra spaces.
         """
         _tmp = '    '
         return _tmp
 
-    def getTable(self,_xml):
-        #<table:table-row>
-        #  <table:table-cell>
-        _list = []
-        _rows = _xml.xpath('table:table-row',namespaces={'table':self.ns['table']})
-        for _row in _rows:
-            _cells = _row.xpath('table:table-cell',namespaces={'table':self.ns['table']})
-            _row = []
-            for _c in _cells:
-                _tmp = []
-                for _i in _c.getchildren():
-                    if self.is_paragraph( _i ):
-                        _tmp1 = []
-                        for _j in _i.itertext():
-                            if _j != None:
-                                _tmp1 = _tmp1 + [ u'%s' % _j ]
-                        _tmp = _tmp + _tmp1
-
-                    elif self.is_enum( _i ):
-                        _tmp1 = self.__processList__( _i )
-                        _tmp = _tmp + _tmp1
-                _row.append( _tmp )
-            _list.append(_row)
-        return _list
-
-    def __processFrame__(self,_xml,_name):
-        _output = u''
-        #print 'ok'
-        for _i in _xml.getchildren():
-            if self.is_object_ole(_i ):
-                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
-                _newfilename = self.__fileExtractor__( _filename )
-                _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)
-            elif self.is_image(_i):
-                # No hacer nada si la imagen es un Object Replacement
-                _filename = _i.attrib['{%s}%s' % (self.ns['xlink'],'href')]
-                if not _filename.__contains__('ObjectReplacements/'):
-                    _newfilename = self.__fileExtractor__( _filename )
-                    _output = _output + '[[file:%s][%s]]\n' % (_newfilename,_name)          
-        return _output
     
     def __fileExtractor__(self,_filename):
         '''Extract the file and assign a proper filename to it'''
-        # All paths are relatives. We take away the beginning.
-        if _filename[0:2] == './':
-            _tmp = _filename[2:]
-        else:
-            _tmp = _filename
-        
-        _outdir = os.path.split( self.orgfile.filename)[0]
+        self.files.extract( _filename,self.tmpdir )
 
-        # The file is extracted.
-        try:
-           self.files.extract(_tmp, _outdir)
-        except AttributeError:
-           print "ERROR: ZipFile provides 'extract' function in Python 2.6." 
-           print "       Please, consider updating to PYTHON v2.6"
-           raise AttributeError("Zipfile instance has no attribute 'extract'")
-        _extractedfile = os.path.join(_outdir,_filename)
+        _extractedfile = os.path.join(self.tmpdir,_filename)
 
         # Is an OLE object.
         _toRename = False
             except WindowsError:
                 print 'ERROR: file already exists: %s' % _new
             _extractedfile = _new
-
+        return _extractedfile
         # The files are extracted to their original paths.
         # Directories are created if necessary.
         # - We move the files to the same directory as the .org file.
         # - We use the file name as a prefix.
-        _tmp = os.path.split( _extractedfile )
-        _newname = self.prefix + _tmp[1]
-        _fullnewname = os.path.join(_outdir,_newname)
+        #_tmp = os.path.split( _extractedfile )
+        #_newname = self.prefix + _tmp[1]
+        #_fullnewname = os.path.join(_outdir,_newname)
         
-        if os.path.isfile( _fullnewname ) and self.isOverWriter:
-            os.remove( _fullnewname )  # The preexisting file is removed.
-        elif os.path.isfile( _fullnewname ) and not self.isOverWriter:
-            print 'WARNING - The file already exists.'
-            print '  - FILENAME: %s' % _fullnewname
-            print '  - Keeping both files'
-            print '  - Referencing to the old one'
-        else:
-            pass
+        #if os.path.isfile( _fullnewname ) and self.isOverWriter:
+        #    os.remove( _fullnewname )  # The preexisting file is removed.
+        #elif os.path.isfile( _fullnewname ) and not self.isOverWriter:
+        #    print 'WARNING - The file already exists.'
+        #    print '  - FILENAME: %s' % _fullnewname
+        #    print '  - Keeping both files'
+        #    print '  - Referencing to the old one'
+        #else:
+        #    pass
         
         # We try to move the file changing its name. 
-        try:
-            os.rename(_extractedfile,_fullnewname)
-            return _newname
-        except:   
-            return _filename
+        #try:
+        #    os.rename(_extractedfile,_fullnewname)
+        #    return _newname
+        #except:   
+        #    return _filename
 
-#=================================0
+#=================================
 class Ole:
     def __init__(self,filename=None):
+        """Open the OLE container at 'filename' and index its contents.
+
+        filename -- path handed unchanged to ole.OleFileIO.
+        """
         self.filename = filename
         self._data = ole.OleFileIO(self.filename)
         self.files = self.__getFiles__()
-        self._dict = { '.*AcroExch\.Document\.[0-9]+.*' : { 'ext':'.pdf','name':'CONTENTS'},
-                '.*Word\.Document\.[0-9]+.*' : { 'ext':'.doc','name': None},
-                '.*Excel\.Sheet\.[0-9]+.*'   : { 'ext':'.xls','name': None},
-                '.*PowerPoint\.Show\.[0-9]+.*'   : { 'ext':'.ppt','name': None},
-                       }
+        # Lookup table: regular expression -> {'ext': file extension, 'name': ...}.
+        # NOTE(review): 'name' looks like the stream holding the payload
+        # ('CONTENTS' for PDF, None otherwise) -- confirm against
+        # __guessFormat__ / extract_file, which are not fully visible here.
+        self._dict = { 
+             '.*AcroExch\.Document\.[0-9]+.*' : { 'ext':'.pdf','name':'CONTENTS'},
+             '.*Word\.Document\.[0-9]+.*' : { 'ext':'.doc','name': None},
+             '.*Excel\.Sheet\.[0-9]+.*'   : { 'ext':'.xls','name': None},
+             '.*PowerPoint\.Show\.[0-9]+.*'   : { 'ext':'.ppt','name': None},
+                     }
     
     def __guessFormat__(self):
         _data = self.__getItem__(0)
         _fp.write( _data )
         _fp.close()
     
-    def extractFile(self):
+    def extract_file(self):
         _key = self.__guessFormat__()
         if _key == None:
             _tmp = self._data.openstream( self.files[0] )
     def __str__(self):
+        """Return the error formatted as '<message>: <filename>'."""
         return u'%s: %s' % ( self.message, self.filename )
 
-
-
 if __name__ == '__main__':
     pass
     '''This class is used to generate an ORG file'''
     def __init__(self,
                  filename = None,
+                 original = None,
                  overwrite = False):
-        self.filename = filename
+        """Create an empty org document bound to an output path.
+
+        filename  -- path of the .org file to write. It is resolved with
+                     os.path.realpath, so a real path string is required
+                     even though the signature defaults to None.
+        original  -- path of the converted source document; only its base
+                     name is kept and used as the top-level heading.
+                     When omitted, the output file name is used instead.
+        overwrite -- whether files that already exist may be replaced.
+        """
+        self.filename = os.path.realpath( filename )
+        # Prefix (output path without extension) for files stored next to
+        # the .org file.
+        self.prefix = os.path.splitext(filename)[0]
+        # os.path.split(None) raises, so fall back to the output file name
+        # when no source document is given.
+        if original is None:
+            original = self.filename
+        self.original = os.path.split(original)[1]
+        self.overwrite = overwrite
         self.isWarning = False
-        self.data = [u'-*- mode: org; coding: utf-8 -*-\n' ]
+        # The document is accumulated as one unicode string.
+        self.data = u'-*- mode: org; coding: utf-8 -*-\n'
+        # Row index of the table being written; -1 means no table is open.
+        self.nrow = -1
     
-    def readfromlist(self, _list):
-        #print _list
-        if _list[0][1] == u'Title':
-            _start = 1
-            self.addHeading( level = 1,
-                             text = _list[0][2] )
+    def read_list(self, _list, _extra):
+        """Render the parsed document items into org-mode text.
+
+        _list  -- sequence of dicts. Items without a 'text' key are
+                  ignored; the others carry a 'tags' dict whose 'type' and
+                  'style' entries select how the item is rendered.
+        _extra -- dict; an optional 'footnotes' entry holds pairs of
+                  (label, list of items with a 'text' key).
+
+        The output is appended to self.data.
+        """
+        # Remove unprocessed stuff
+        _list = [_i for _i in _list if 'text' in _i]
+        # First item should be the original filename
+        self.addHeading( level = 1,
+                         text = u'%s' % self.original )
+
+        # A "Title" paragraph is written at level 2, so when the document
+        # has one, every real heading is pushed one level deeper.
+        _hasTitle = any(_i['tags']['style'] == u'Title' for _i in _list)
+        _n = 1 if _hasTitle else 0
+
+        # Process
+        for _i in _list:
+            _type = _i['tags']['type']
+            # Closing tables
+            if _type != u'table-cell' and self.nrow != -1:
+                self.data += '|\n'
+                self.nrow = -1
+            # Title
+            if _i['tags']['style'] == u'Title':
+                self.addHeading( level = 2,
+                                 text =  _i['text'])
+            elif _type == u'heading':
+                # TODO: 'level' should already arrive as Integer
+                self.addHeading( level = int(_i['tags']['level'])+1+_n,
+                                 text =  _i['text'])
+            elif _type == u'paragraph':
+                self.addParagraph( text = _i['text'] )
+            elif _type == u'table-cell':
+                self.add_cell( _i )
+            elif _type == u'list-item':
+                self.addEnumeration( list_item = _i )
+            elif _type == u'image':
+                self.add_image( _i)
+
+        # A table that is the last item has no following element to
+        # trigger the closing above, so finish its last row here.
+        if self.nrow != -1:
+            self.data += '|\n'
+            self.nrow = -1
+
+        # Footnotes
+        _footnotes = _extra.get('footnotes')
+        if _footnotes:
+            self.data += u'** Footnotes\n'
+            for _label, _items in _footnotes:
+                self.data += _label
+                for _j in _items:
+                    self.data += _j['text'] +u'\n'
+                    
+    def add_image(self, _item):
+        """Move an extracted image next to the .org file and link to it.
+
+        _item -- dict whose 'text' is the path of the extracted image and
+                 whose 'tags'['name'] is used as the link description.
+
+        The image is stored as '<prefix>_<image file name>' and an org
+        link relative to the output directory is appended to self.data.
+        Raises OrgfileError when the destination already exists and
+        overwriting is disabled.
+        """
+        # Local import: shutil is only needed here.
+        import shutil
+        _ini = _item['text']
+        _newfile = self.prefix + '_' + os.path.split(_ini)[1]
+        # Only an existing destination is a conflict; a free name must not
+        # raise.
+        if os.path.isfile( _newfile):
+            if not self.overwrite:
+                raise OrgfileError(u"The file already exists: %s" % _newfile)
+            os.remove(_newfile)
-        else:
-            _start = 0
-            self.addHeading( level = 1,
-                             text = [u'%s' % os.path.split(self.filename)[1]] )
             
-        for _i in _list[_start:]:
-            if _i[3]: # Is heading (a dictionary would be more readable)
-                self.addHeading( level = int(_i[0])+1,
-                                 text = _i[2] )
-            elif _i[4]: # Is heading (a dictionary would be more readable)
-                self.addParagraph( text = _i[2] )
-            elif _i[5] == 'table':
-                self.addTable( table = _i[2] )
-            elif _i[5] == 'enumeration':
-                self.addEnumeration( enumeration = _i[2] )
-                
+        # The source usually sits in a temporary directory that may be on
+        # another filesystem, where os.rename fails; shutil.move copes.
+        shutil.move(_ini, _newfile)
+        _outputdir = os.path.split(self.filename)[0]
+        _relpath = os.path.relpath( _newfile, _outputdir)
+        self.data += u'[[file:%s][%s]]\n' % (_relpath,_item['tags']['name'])
 
     def addHeading(self, 
                    level = 1,
-                   text = [] ):
-        #print text
-        # A heading should only have one line.
-        if len( text ) > 1:
-            pass # TODO: juntar todas las líneas
-            # text.replace('\r\n',' ')    
-            #text.replace('\n',' ')
+                   text = u'' ):
+        """Append an org-mode heading line to the document.
+
+        level -- number of leading stars.
+        text  -- heading text. An org heading must fit on one line, so
+                 every line break is replaced with ' _ '.
+        """
+        _flat = text.replace('\r\n',' _ ').replace('\n',' _ ')
 
-        # Check level is valid
-        if level < 1:
-            text = [u'* [WARNING] ' + text[0]]
-            self.isWarning = True
-        else:
-            text = [u'*'*(level) + u' %s' % text[0]]
-
+        _stars = u'*'*(level)
+        text = _stars + u' %s\n' % _flat
         self.data = self.data + text
 
     def addParagraph(self, 
-                     text = [] ):
-        self.data = self.data + text
+                     text = u'' ):
+        """Append a paragraph followed by a line break to the document.
+
+        text -- paragraph text. The default is an empty string because
+                the document buffer is a string; the previous list
+                default could not be concatenated to it.
+        """
+        self.data = self.data + text + '\n'
 
-    def addTable(self, 
-                 table = []):
-        _list = []
-        if len(table) > 0:
-            # First line
-            _list = _list + [u'|-']
-            _list = _list + self.__addRow__( table[0] )
-            _list = _list + [u'|-']
-        if len(table) > 1:
-            for _row in table[1:]:
-                _list = _list + self.__addRow__( _row )
-            _list = _list + [u'|-']
+    def add_cell(self, _cell):
+        """Append one table cell to the document.
+
+        _cell -- dict with the cell 'text' and its row index under
+                 'tags'['nrow'].
+
+        A cell whose row index is greater than the current row first
+        closes the previous row; any other cell continues the same line.
+        """
+        _row = _cell['tags']['nrow']
+        # Row index 0 resets the row counter.
+        if _row == 0:
+            self.nrow = _row
 
-        self.data = self.data +  _list 
-
-    def __addRow__(self, _row):
-        '''Helper function to show rows'''
-        _list = []
-        _n = 0
-        _flag = True
-        while _flag:
-            _text = u'| '
-            for _cell in _row:
-                _flag = False
-                if len(_cell) > _n:
-                    _text = _text + u'%s |' % _cell[_n]
-                    _flag = True
-                else:
-                    _text = _text + u'   |'
-            if _flag:
-                _list.append( _text )
-            _n = _n + 1
-        return _list
+        _startsRow = _row > self.nrow
+        if _startsRow:
+            _template = u'|\n| %s '
+        else:
+            _template = u'| %s '
+        self.data += _template % _cell['text']
+        if _startsRow:
+            self.nrow = _row
 
     def addEnumeration(self, 
-                       enumeration = []):
-        _lista = []
-        for _i in enumeration:
-            _lista = _lista + [u' ' * (_i[0]-1) + u'- ' + _i[1][0]]
-            if len(_i[1]) > 1:
-                for _j in _i[1:]:
-                    _lista = _lista + [ _j ]
-
-        self.data = self.data +  _lista
-
-    def clean(self):
-        pass
+                       list_item = {}):
+        """Append one '- ' list entry to the document.
+
+        list_item -- dict with the entry 'text' and its nesting depth
+                     under 'tags'['level'] (converted with int()).
+                     Each level beyond the first indents by two spaces.
+        """
+        _depth = int(list_item['tags']['level'])
+        _indent = u'  ' * (_depth - 1)
+        self.data += _indent + u'- ' + list_item['text'] + '\n'
 
     def export(self):
-       #try:
-           _tmp = u''
-           for _i in self.data:
-               _tmp = _tmp + u'%s\n' % _i
-           _fp = codecs.open( self.filename, 'w', 'utf-8')
-           _fp.write( _tmp )
-           _fp.close()
-       #except:
-       #    print u'ERROR: the file was processed, but it failed when writting it:\n   %s' % self.filename
-
+        """Write the accumulated org text to self.filename as UTF-8.
+
+        The file is opened in 'w' mode, so existing content is replaced.
+        Errors from opening or writing propagate to the caller.
+        """
+        _fp = codecs.open( self.filename, 'w', 'utf-8')
+        # Release the handle even when the write fails.
+        try:
+            _fp.write( self.data )
+        finally:
+            _fp.close()
 
 if __name__ == '__main__':
-    pass
+    pass