Commits

Anonymous committed ffe3a0e

Started support for footnotes

Comments (0)

Files changed (1)

             '{%s}%s' % (self.ns['draw'],'name') : 'framename',
             '{%s}%s' % (self.ns['text'],'outline-level') : 'outline-level',
             '{%s}%s' % (self.ns['text'],'list'): 'list',
+            '{%s}%s' % (self.ns['text'],'list-item'): 'list-item',
             '{%s}%s' % (self.ns['text'],'style-name'): 'style-name',
             '{%s}%s' % (self.ns['table'],'table'): 'table',
             '{%s}%s' % (self.ns['draw'],'object-ole'): 'object-ole',
             '{%s}%s' % (self.ns['draw'],'image'): 'image',
             '{%s}%s' % (self.ns['text'],'s') : 'spaces',
-            '{%s}%s' % (self.ns['text'],'tab') : 'tabs'
-
+            '{%s}%s' % (self.ns['text'],'tab') : 'tabs',
+            '{%s}%s' % (self.ns['text'],'note') : 'note',
+            '{%s}%s' % (self.ns['text'],'note-citation') : 'note-citation',
+            '{%s}%s' % (self.ns['text'],'note-body') : 'note-body'
                         }
         #
 
         return _body.xpath('office:text', 
                             namespaces= { 'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'} )[0]
 
+
     def gen_list(self):
         """Creates a list that is understood by ORGfile class.
         """
+        # Pipeline: flatten the text body into one dictionary per element,
+        # then rewrite that flat list in successive passes (spaces/tabs,
+        # footnotes, regrouping of loose text, headings/paragraphs, lists,
+        # tables).  Each pass builds _newlist and then replaces _list.
+        #
+        # Get all the nodes.
+        _xml = self._get_textbody()
+        _list = []
+
+        for _child in _xml.iterdescendants():
+            # Nesting depth = number of ancestors of the element.
+            _level = 0
+            for _i in _child.iterancestors():
+                _level += 1
+            
+            # 'pretext' is the element's .text, 'posttext' is its .tail.
+            _tmp = { 'child' : _child,
+                     'pretext' : _child.text,
+                     'posttext' : _child.tail,
+                     'nesting' : _level}
+            _list.append( _tmp )
+
+        # Extract text from paragraphs, heading, spaces, tabs.
+
+        # - Extract spaces, tabs
+        # Entries whose element type is 'spaces' or 'tabs' are replaced by a
+        # plain {'txt', 'nesting'} entry holding pretext + generated
+        # whitespace + posttext; every other entry is passed through as is.
+        _newlist = []
+        for _i in _list:
+            if _i.__contains__('child'):
+                _child = _i['child']
+                _tag = _child.tag
+                if self.types.__contains__(_tag):
+                    _type = self.types[_tag]
+                    if _type == 'spaces':
+                        _txt = self.get_spaces(_child)
+                        if _i['pretext'] != None:
+                            _txt = _i['pretext'] + _txt
+                        if _i['posttext'] != None:
+                            _txt = _txt + _i['posttext']
+
+                        _tmp = {'txt' : _txt, 'nesting': _i['nesting']}
+                        _newlist.append(_tmp)
+                    elif _type == 'tabs':
+                        _txt = self.get_tabs(_child)
+                        if _i['pretext'] != None:
+                            _txt = _i['pretext'] + _txt
+                        if _i['posttext'] != None:
+                            _txt = _txt + _i['posttext']
+                        _tmp = {'txt' : _txt, 'nesting': _i['nesting']}
+                        _newlist.append(_tmp)
+                    else:
+                        _newlist.append(_i)
+                else:
+                    _newlist.append(_i)
+            else:
+                _newlist.append(_i)
+            
+        _list = _newlist  
+
+        # - Footnotes
+        # group_type() collects the entries belonging to each 'note' element
+        # into a {'note': [...]} entry; each of those is then replaced by a
+        # text entry that carries only the footnote marker.
+        # NOTE(review): _footnotes is filled here but is not returned; the
+        # only other reference is the commented-out print at the end.
+        _footnotes = []
+        _list = self.group_type( _list, tag = 'note')
+        _newlist = []
+        for _i in _list:
+            if _i.__contains__('note'):
+                _txt, _paragraphs = self.get_footnote(_i)
+                _newlist.append( {'txt' : _txt, 
+                                  'nesting' : _i['nesting']})
+                
+                _footnotes.append((_txt, _paragraphs))
+            else:
+                _newlist.append(_i)
+        _list = _newlist
+
+        # - Regroup: move spaces, tabs into the prior child.
+        # Text entries are accumulated in _tmp and stored as 'txt' on the
+        # last element entry once the next element entry is reached.
+        # NOTE(review): text accumulated after the last element entry is
+        # never flushed, and _newlist[-1] assumes an element entry precedes
+        # the first text entry.
+        _newlist = []
+        _tmp = u''
+        for _i in xrange(len(_list)):
+            _item = _list[_i]
+            if _item.__contains__('child'):
+                if _tmp != u'':
+                    _newlist[-1].update({'txt' : _tmp})
+                    _tmp = u''
+                _newlist.append(_item)
+            else:
+                _tmp = _tmp + _item['txt']
+            
+        _list = _newlist        
+         
+        # - Getting heading and paragraph text.
+        # Heading and paragraph entries become {'text', 'nesting', 'tags'}
+        # dictionaries (text = pretext + txt + posttext); entries of any
+        # other type are kept unchanged (_flag stays True).
+        _newlist = []
+        for _i in _list:
+            _flag = True
+            # - Heading
+            # NOTE(review): the bare except skips entries whose type lookup
+            # fails, but it also hides any other error raised in the block.
+            try:
+                if self.types[ _i['child'].tag] == 'heading':
+                    _tmp = u''
+                    if _i['pretext'] != None:
+                        _tmp += _i['pretext']
+                    if _i.__contains__('txt'):
+                        if _i['txt'] != None:
+                            _tmp += _i['txt']
+                    if _i['posttext'] != None:
+                        _tmp += _i['posttext']
+                    _tags =  self.get_tags( _i['child'] )
+                    #print _i['child']
+                    _tags.update({'type' : 'heading'})
+                    _tmp = { 'text' : _tmp,
+                             'nesting': _i['nesting'],
+                             'tags' : _tags}
+
+                    _newlist.append(_tmp)
+                    _flag = False
+            except:
+                pass 
+            # - Paragraph
+            try:
+                if self.types[ _i['child'].tag] == 'paragraph':
+                    _tmp = u''
+                    if _i['pretext'] != None:
+                        _tmp += _i['pretext']
+                    if _i.__contains__('txt'):
+                        if _i['txt'] != None:
+                            _tmp += _i['txt']
+                    if _i['posttext'] != None:
+                        _tmp += _i['posttext']
+                    _tags =  self.get_tags( _i['child'] )
+                    _tags.update({'type' : 'paragraph'})
+                    _tmp = { 'text' : _tmp,
+                             'nesting': _i['nesting'],
+                             'tags' : _tags}
+                    _newlist.append(_tmp)
+                    _flag = False
+            except:
+                pass 
+            if _flag:
+                _newlist.append(_i)
+            
+        _list = _newlist        
+        # LIST
+        _list = self.group_type( _list, tag = 'list', clean= True,groupchildren=True)
+        # NOTE(review): debug output -- prints every entry on each call
+        # (Python 2 print statement).
+        for _i in _list:
+            print _i
+        _list = self.process_enumeration(_list)
+
+        # TABLE
+        _list = self.group_type( _list, tag = 'table')
+
+        # PRINT
+        #for _i in _list:
+        #    print _i
+        #print _footnotes
+        return _list
+
+    def process_enumeration(self, _list):
+        """Regroup the contents of every 'list' entry by list item.
+
+        Entries without a 'list' key are passed through untouched.  For
+        the others a shallow copy is emitted whose 'list' value is the
+        result of group_type() called with tag 'list-item' and clean=True.
+        The input entries themselves are not modified by this method.
+        """
+        _result = []
+        for _entry in _list:
+            if 'list' not in _entry:
+                _result.append(_entry)
+                continue
+            _grouped = self.group_type(_entry['list'],
+                                       tag = 'list-item',
+                                       clean = True)
+            _copy = _entry.copy()
+            _copy['list'] = _grouped
+            _result.append(_copy)
+        return _result
+
+    def clean(self, _list, tags = None):
+        """Return the entries of _list whose element type is not in tags.
+
+        The type of an entry is looked up in self.types using the tag of
+        its 'child' element.  Entries without a usable 'child', or whose
+        tag is not registered in self.types, have type None.
+
+        _list -- list of entry dictionaries.
+        tags  -- types to drop; when omitted nothing is dropped.
+        """
+        if tags is None:
+            tags = ()
+        _newlist = []
+        for _i in _list:
+            try:
+                _type = self.types[_i['child'].tag]
+            except (KeyError, AttributeError, TypeError):
+                # No 'child' key, no .tag attribute, or an unknown tag.
+                _type = None
+            if _type not in tags:
+                _newlist.append(_i)
+        return _newlist
+                
+
+    def __processEnumeration__(self,
+                               _child):
+        '''Flatten a <text:list> element into (level, texts) tuples.
+
+        Structure for Enumerations:
+        <text:list>
+           <text:list-item>
+                <text:p>
+
+        _child -- the outermost <text:list> element; it counts as level 1.
+
+        Returns a list with one (level, texts) tuple per <text:p>
+        descendant, in document order.  texts holds the text fragments
+        yielded by the paragraph's itertext().
+        '''
+        _ns = self.ns['text']
+        _tag_item = '{%s}%s' % (_ns,'list-item')
+        _tag_list = '{%s}%s' % (_ns,'list')
+        _tag_par = '{%s}%s' % (_ns,'p')
+
+        _list = []
+        _levels = { _child : {'type' : 'list', 'level' : 1}}
+        # NOTE(review): every handled element looks its parent up in
+        # _levels, so a list, list-item or paragraph whose parent is some
+        # other kind of element raises KeyError.
+        for _i in _child.iterdescendants(): # We follow all the nested structure.
+            if _i.tag == _tag_item:
+                # list-item: same level as its parent.
+                _levels[_i] = {'type' : 'list-item',
+                               'level' : _levels[ _i.getparent()]['level']}
+            elif _i.tag == _tag_list:
+                # list: one level deeper than its parent (normally a
+                # list-item).
+                _levels[_i] = {'type' : 'list',
+                               'level' : _levels[ _i.getparent()]['level']+1}
+            elif _i.tag == _tag_par:
+                # paragraph: same level as its parent.
+                _n = _levels[ _i.getparent()]['level']
+                _levels[_i] = {'type' : 'p', 'level' : _n}
+                _texts = [u'%s' % _j for _j in _i.itertext() if _j is not None]
+                _list.append( (_n, _texts) )
+        return _list
+    
+
+    def group_type( self,
+                    _list, 
+                    tag = 'note', 
+                    clean = False,
+                    groupchildren = False):
+        """Collapse each run of entries found for *tag* into one entry.
+
+        extract_children() supplies [start, end] index pairs.  The slice
+        _list[start:end] is replaced by {tag: slice, 'nesting': n}, where
+        n is the nesting of the first entry of the slice.  With clean set
+        that first entry is left out of the stored slice.  If the first
+        entry has a 'posttext', it is set to None on that entry and
+        emitted as a separate {'txt', 'nesting'} entry right after the
+        group.  Entries outside the pairs are copied through in order.
+        """
+        _ranges = self.extract_children(_list, tag, groupchildren)
+        if tag == 'list':
+            print '    PAIRS: ',_ranges
+        _result = []
+        _cursor = 0
+        for _pair in _ranges:
+            _start, _stop = _pair[0], _pair[1]
+            # Everything between the previous group and this one.
+            _result.extend(_list[_cursor:_start])
+            _group = _list[_start:_stop]
+            _head = _group[0]
+            _tail = _head['posttext']
+            if _tail is not None:
+                _head['posttext'] = None
+            _depth = _list[_start]['nesting']
+            if clean:
+                _group = _group[1:]
+            _result.append({ tag : _group, 'nesting' : _depth})
+            if _tail is not None:
+                _result.append({'txt' : _tail, 'nesting' : _depth})
+            _cursor = _stop
+        _result.extend(_list[_cursor:])
+        return _result
+
+    def extract_children( self, _list, tag = 'note', groupchildren = False):
+        # Returns a list of [start, end] index pairs into _list, one pair
+        # per range opened by an entry whose element type equals `tag`.
+        #
+        # State kept while scanning:
+        #   _flag    -- True while a range is open.
+        #   _limits  -- [start] of the open range; the end is appended
+        #               when the range is closed.
+        #   _nesting -- nesting of the opening entry plus one; the range
+        #               is closed by the first later entry whose nesting
+        #               is below this value.
+        #   _tmp     -- the entry that opened the current range.
+        _newlist = []
+        _limits = []
+        _flag = False
+        _nesting = 0
+        #if tag == 'list-item':
+        #   print '== OK ==', _limits
+        for _i in xrange(len(_list)):
+            _item = _list[_i]
+            # The bare except skips entries whose type cannot be looked up
+            # (for instance entries without a 'child' key).
+            try:
+                if self.types[ _item['child'].tag ] == tag:
+                    # With groupchildren set, a matching entry whose
+                    # nesting is below the current threshold does not
+                    # open a new range.
+                    # NOTE(review): confirm this is the intended rule for
+                    # nested and sibling elements.
+                    _test = groupchildren and _item['nesting']< _nesting
+                    if not _test:
+                        # A new match closes the range that is still open.
+                        if _flag:
+                            _limits.append(_i)
+                            _newlist.append( _limits)
+                        _flag = True
+                        _nesting = _item['nesting']+1
+                        _limits = [_i]                    
+                        _tmp = _item
+                    #if tag == 'list-item':
+                    #   print '== OK ==', _limits
+                    #   print _item['nesting']
+            except:
+                pass
+       
+            # _tmp is only read while _flag is True, i.e. after it was set.
+            if _flag and _item != _tmp:
+                #if tag == 'list-item':
+                #    print '<< OK >>', _limits
+                #    print _item['nesting'], _nesting
+                if _item['nesting'] < _nesting:
+                    # Left the subtree of the opening entry: close the range.
+                    _flag = False
+                    _nesting = 0
+                    _limits.append(_i)
+
+                    _newlist.append( _limits)
+                    _limits = []
+        # A range still open at the end extends to the end of the list.
+        if _limits != []:
+            _limits.append(len(_list))
+            _newlist.append( _limits)
+         
+        return  _newlist        
+
+    def get_footnote(self, _note):
+        """Build the footnote marker and collect the footnote body entries.
+
+        _note -- dictionary whose 'note' value lists the entries grouped
+                 under one note element.
+
+        Returns (marker, paragraphs).  marker is u'[fn:<id>] ', where <id>
+        is the 'pretext' of the note-citation entry with '[', ']' and ':'
+        replaced by '_'.  paragraphs holds every entry that comes after
+        the note-body entry.  When there is no note-citation entry, or it
+        has no text, <id> is empty instead of raising an error.
+        """
+        _paragraphs = []
+        _id = u''
+        _in_body = False
+
+        for _item in _note['note']:
+            # The note-body entry itself is not collected: the flag is
+            # only raised after this check.
+            if _in_body:
+                _paragraphs.append(_item)
+            try:
+                _type = self.types[_item['child'].tag]
+            except (KeyError, AttributeError, TypeError):
+                # Entry without an element, or an unregistered tag.
+                _type = ''
+            if _type == 'note-citation':
+                _id = _item['pretext'] or u''
+            elif _type == 'note-body':
+                _in_body = True
+        # Replace the characters that are part of the marker syntax itself.
+        for _char in u'[]:':
+            _id = _id.replace(_char, u'_')
+        return u'[fn:%s] ' % _id, _paragraphs
+
+    def gen_list2(self):
+        """Creates a list that is understood by ORGfile class.
+        """
         _xml = self._get_textbody()
         _list = []
 
             _tmp = self.get_newlist( [{'child' : _child,
                                        'tags' : {'nesting' : 1}}] )
             if _tmp != None:
-                _list = _list + [_tmp]
+                _list = _list + _tmp
         for _i in _list:
             print _i
         return _list
 
+
     def get_newlist(self, _list ):
         """
         """
             _list = _newlist
         return _list 
 
-    def is_paragraph(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['text'],'p'):
-            return True
-        else:
-            return False
-
-    def is_heading(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['text'],'h'):
-            return True
-        else:
-            return False
-
-    def is_frame(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['draw'],'frame'):
-            return True
-        else:
-            return False
-
-    def is_framename(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['draw'],'name'):
-            return True
-        else:
-            return False
-
-    def is_level(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['text'],'outline-level'):
-            return True
-        else:
-            return False
-
-    def is_enum(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['text'],'list'):
-            return True
-        else:
-            return False
-
-    def is_style(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['text'],'style-name'):
-            return True
-        else:
-            return False
-
-    def is_table(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['table'],'table'):
-            return True
-        else:
-            return False
-
-    def is_object_ole(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['draw'],'object-ole'):
-            return True
-        else:
-            return False
-
-    def is_image(self,_child):
-        if _child.tag == '{%s}%s' % (self.ns['draw'],'image'):
-            return True
-        else:
-            return False
-
-
     def has_children(self, _list):
         for _i in _list:
             if _i.__contains__('child'):
         _list = []
 
         _type = self.get_type(_child)
+        if _tags.__contains__('type'):
+            _type = _tags['type'] + [_type]
+        else:
+            _type = [_type]
         _newtags = self.get_tags( _child, _dict)
         for _key in _newtags.keys():
             _tags.update({_key : _newtags[_key] })
         _tags.update( {'type': _type})
 
-        if _type == 'spaces':
+        if _type[-1] == 'spaces':
             _tmp = self.get_spaces( _child )
             _list.append( {'txt' : _tmp, 'tags' : _tags })
-        elif _type == 'tabs':
+        elif _type[-1] == 'tabs':
             _tmp = self.get_tabs( _child )
             _list.append( {'txt' : _tmp, 'tags' : _tags })
 
         
         return _list
 
-    def get_tags(self, _child, _dict = {}):
+    def get_tags(self, _child):
         """This function creates a tagged text
         """
+        #print '\n\n>>>>>>> ', _child
+        _dict = _child.attrib
+        #print _dict
         _tags = {}
         # Style
         _tagstyle = '{%s}%s' % (self.ns['text'],'style-name')
         return _tmp
 
     def get_tabs( self, _child):
-        """Deals with <text:s> which deals with extra spaces.
+        """Return the text that stands in for a <text:tab> element: four spaces.
         """
+        # NOTE(review): _tag is built but not used below, and _child is
+        # not inspected.
         _tag = '{%s}%s' % (self.ns['text'],'tab')
         _tmp = '    '
-        #if _child.attrib.__contains__(_tag):
-        #    _value = _child.attrib[_tag]
-        #    _tmp = ' ' * int(_value)
-        #else:
-        #    _tmp = ' '
         return _tmp
 
-
-    def getParagraph(self, _child, mode = 'p'):
-       '''It checks this is paragraph and recovers the text.'''
-       _list = []
-       if _child.tag == '{%s}%s' % (self.ns['text'], mode):
-          _text = u''
-          for _i in _child.iter():
-             if _i.tag == '{%s}%s' % (self.ns['text'], mode):
-                if _i.text != None:
-                   _text = _text + _i.text
-             elif _i.tag == '{%s}%s' % (self.ns['text'],'s'):
-                try:
-                   _text = _text +  ' ' * int(_i.attrib['{%s}%s' % (self.ns['text'],'c')])
-                except KeyError:
-                   pass # It is used not only to add spaces.
-             elif _i.tag == '{%s}%s' % (self.ns['text'],'tab'):             
-                _text = _text + '\t'
-             #else:
-             #   print "[WARNING] Currently getParagraph does not implement:\n    %s" % _i
-
-             if _i.tail != None:
-                _text = _text + _i.tail
-          #if not self.isOriginal: # We clean the text.
-          #   _text = _text.replace('\t',' ').replace('\r',' ').replace('\n',' ').strip()
-          #   _text = _text.split(' ')
-          #   _text = [_i for _i in _text if _i != u'']
-          #   _text = string.join(_text, ' ')
-          return [_text]
-       else:
-          return []
-
-    def __processEnumeration__(self,
-                               _child):
-        '''Structure for Enumerations:
-        <text:list>
-           <text:list-item>
-                <text:p>
-        '''
-        _list = []
-        _level = 1
-        _levels = { _child : {'type' : 'list', 'level' : 1}}
-        for _i in _child.iterdescendants(): # We follow all the nested structure.
-            # list-item: mismo nivel que su padre.
-            if _i.tag == '{%s}%s' % (self.ns['text'],'list-item'):
-                _levels.update({ _i : {'type' : 'list-item',
-                                      'level' : _levels[ _i.getparent()]['level']}})
-            # list: si el padre era un list-item, se le suma uno.
-            if _i.tag == '{%s}%s' % (self.ns['text'],'list'):
-                _levels.update({ _i : {'type' : 'list',
-                                      'level' : _levels[ _i.getparent()]['level']+1}})
-                
-            if _i.tag == '{%s}%s' % (self.ns['text'],'p'):
-                _levels.update({ _i : {'type' : 'p',
-                                       'level' : _levels[ _i.getparent()]['level']}})
-                _n = _levels[ _i.getparent()]['level']
-                _tmp = []
-                for _j in _i.itertext():
-                    if _j != None:
-                        
-                        _tmp = _tmp + [u'%s' % _j]
-                        #_list = _list + [ (_n, _tmp )]
-                _list = _list + [ (_n, _tmp) ]
-        return _list
-    
     def getTable(self,_xml):
         #<table:table-row>
         #  <table:table-cell>