Commits

Anonymous committed 618eca3

More robust

  • Participants
  • Parent commits bc4b7e6

Comments (0)

Files changed (1)

             '{%s}%s' % (self.ns['text'],'style-name'): 'style-name',
             '{%s}%s' % (self.ns['text'],'s') : 'spaces',
             '{%s}%s' % (self.ns['text'],'tab') : 'tabs',
+            '{%s}%s' % (self.ns['text'],'span') : 'span',
             '{%s}%s' % (self.ns['text'],'note') : 'note',
             '{%s}%s' % (self.ns['text'],'note-citation') : 'note-citation',
             '{%s}%s' % (self.ns['text'],'note-body') : 'note-body',
+            '{%s}%s' % (self.ns['text'],'soft-page-break') : 'soft-page-break',
             '{%s}%s' % (self.ns['draw'],'frame') : 'frame',
             '{%s}%s' % (self.ns['draw'],'name') : 'framename',
             '{%s}%s' % (self.ns['draw'],'object-ole'): 'object-ole',
             '{%s}%s' % (self.ns['table'],'table-row'): 'table-row',
             '{%s}%s' % (self.ns['table'],'table-cell'): 'table-cell'
                         }
-        #
+
+        self.garbage = ['soft-page-break']
 
         if filename != None:
             self.filename = os.path.realpath( filename )
                      'nesting' : _level}
             _list.append( _tmp )
 
-        # Extract text from paragraphs, heading, spaces, tabs.
+        # Collect warnings: tags present in the document that are not in self.types
+        _warnings = []
+        for _i in _list:
+            try:
+                self.types[_i['child'].tag]
+            except:
+                _warnings.append(_i['child'].tag)
+        _warnings = set(_warnings)
+        print u"WARNING: following items are present in the document and won't be interpreted:"
+        for _i in _warnings:
+            print u"   %s" % _i
+        print u"\n"
+
+
+        # Ensure the list only contains info that we process.
+        _newlist = []
+        for _i in _list:
+            try:
+                _type = self.types[_i['child'].tag]
+                if not self.garbage.__contains__( _type):
+                    _newlist.append(_i)
+            except:
+                pass
+        _list = _newlist
+
+        # Extract text from paragraphs, heading, spaces, tabs, ...
 
         # - Extract spaces, tabs
         _newlist = []
             
         _list = _newlist  
 
+        # - Extract text:span
+        _newlist = []
+        for _i in _list:
+            try:
+                if self.types[_i['child'].tag] == 'span':
+                    _txt = u''
+                    if _i['pretext'] != None:
+                        _txt += _i['pretext']
+                    if _i['posttext'] != None:
+                        _txt += _i['posttext']
+                    _tmp = {'txt' : _txt, 'nesting': _i['nesting']-1}
+                    _newlist.append(_tmp)
+                else:
+                    _newlist.append(_i)
+            except:
+                _newlist.append(_i)
+
+        _list = _newlist
+
         # - Footnotes
         _footnotes = []
         _list = self.group_type( _list, tag = 'note')
          
         # - Getting heading and paragraph text.
         _list = self.process_headings_paragraphs( _list )
-        # LIST
+        # LIST:
         _list = self.group_children( _list, tag = 'list', clean= True)
         _list = self.process_enumeration(_list)
 
         _list = self.group_children( _list, tag = 'frame', clean= False)
         _list = self.process_framename(_list)
 
-        # PRINT
-        #for _i in _list:
-        #    print _i
-        #print _footnotes
         # Process Footnotes
         _tmp = []
         for _i in _footnotes:
-            print _i
             _tmp1 = self.process_regroup( _i[1] )
 
             _tmp1 = self.process_headings_paragraphs( _tmp1 )
             else:
                 _newlist.append( _i)
         _list = _newlist
+
         # Phase 2
         _newlist = []
         for _i in _list:
             if _i.__contains__('list'):
                 _tmp = []
+
                 for _j in _i['list']:
                     if _j.__contains__('list-item'):
                         _paragraphs = _j['list-item']
                         _txt = u''
                         for _k in _paragraphs:
                             _txt += _k['text'] +'\n'
-                        _txt = _txt[0:-1]
-                        _tmp1 = _paragraphs[0].copy()
-                        _tmp1['text'] = _txt
 
-                        _tmp1['tags']['type'] = 'list-item'
-                        _n = (_tmp1['nesting']-_i['nesting'] )/ 2
-                        _tmp1['tags']['level'] = _n
-                        _tmp1.pop('nesting')
-                        _tmp.append( _tmp1)
-                
+                        if len(_paragraphs) > 0:
+                            _txt = _txt[0:-1]
+                            _tmp1 = _paragraphs[0].copy()
+                            _tmp1['text'] = _txt
+
+                            _tmp1['tags']['type'] = 'list-item'
+                            _n = (_tmp1['nesting']-_i['nesting'] )/ 2
+                            _tmp1['tags']['level'] = _n
+                            _tmp.append( _tmp1)
+
                 _newlist += _tmp
             else:
                 _newlist.append( _i )
                     _i['text'] = _newfilename
                     #print _newfilename
                     _newlist.append(_i)
+                else:
+                    _newlist.append(_i)
             else:
                 _newlist.append(_i)
 
         # Previously this used is_object_ole and also is_image, but I don't understand for what purpose.
         #_newfilename = self.__fileExtractor__( _filename )
         # '[[file:%s][%s]]\n' % (_newfilename,_name)
+
         return _newlist
 
     def clean(self, _list, tags = []):
         _nesting = _list[idx]['nesting']
         _count = 0
         for _i in xrange(idx+1, len(_list)):
+            try:
+                _list[_i]['nesting']
+            except:
+                print u"ERROR: it should contains 'nesting' information:\n", _list[_i]
             if _list[_i]['nesting'] > _nesting:
                 _count += 1
             else:
     
     def __guessFormat__(self):
         _data = self.__getItem__(0)
-        #print _data
         _KEY = None
         for _key in self._dict.keys():
             _kernel = re.compile(  _key )