Commits

Virgil Dupras committed 7b4680e

Fixed a crash that prevented reading of PDFs for which the stream IDs hosting page objects weren't in the xrefs.

Comments (0)

Files changed (4)

 dist
 MANIFEST
 __pycache__
-pdfminer/cmap/*.gz
+pdfminer/cmap/*.gz
+*.egg-info

pdfminer/converter.py

         return item.adv
 
     def handle_undefined_char(self, font, cid):
-        logging.debug('undefined: %r, %r', font, cid)
+        logging.warning('undefined: %r, %r', font, cid)
         return '(cid:%d)' % cid
 
     def receive_layout(self, ltpage):

pdfminer/pdfparser.py

 from .pdftypes import (PDFException, PDFTypeError, PDFNotImplementedError, PDFStream, PDFObjRef,
     resolve1, decipher_all, int_value, str_value, list_value, dict_value, stream_value)
 from .arcfour import Arcfour
-from .utils import choplist, nunpack
-from .utils import decode_text, ObjIdRange
-
+from .utils import choplist, nunpack, decode_text, ObjIdRange
 
 ##  Exceptions
 ##
 class PDFNoValidXRef(PDFSyntaxError): pass
 class PDFNoOutlines(PDFException): pass
 class PDFDestinationNotFound(PDFException): pass
+class PDFAlreadyParsed(PDFException): pass
 class PDFEncryptionError(PDFException): pass
 class PDFPasswordIncorrect(PDFEncryptionError): pass
 
         self._parser = None
         self._cached_objs = {}
         self._parsed_objs = {}
+        self._parsed_everything = False
     
     def _parse_next_object(self, parser):
         # This is a bit awkward and I suspect that it could be a lot more elegant, but it would
         # like these, the last resort is to read all objects at once so that our object reference
         # can finally be resolved. This is slower than the normal method, so ony use this when the
         # xref tables are corrupt/wrong/whatever.
+        if self._parsed_everything:
+            raise PDFAlreadyParsed()
         parser = self._parser
         parser.setpos(0)
         parser.reset()
         self._parse_whole(parser)
+        self._parsed_everything = True
     
     def set_parser(self, parser):
         "Set the document to use a given PDFParser object."
         """
         return self._parse_next_object(self._parser)
     
+    def find_obj_ref(self, objid):
+        for xref in self.xrefs:
+            try:
+                strmid, index = xref.get_pos(objid)
+                return strmid, index
+            except KeyError:
+                pass
+        else:
+            # return null for a nonexistent reference.
+            return None, None
+    
     def getobj(self, objid):
         if not self.xrefs:
             raise PDFException('PDFDocument is not initialized')
             genno = 0
             obj = self._cached_objs[objid]
         else:
-            for xref in self.xrefs:
-                try:
-                    (strmid, index) = xref.get_pos(objid)
-                    break
-                except KeyError:
-                    pass
-            else:
+            strmid, index = self.find_obj_ref(objid)
+            if index is None:
                 handle_error(PDFSyntaxError, 'Cannot locate objid=%r' % objid)
                 # return null for a nonexistent reference.
                 return None
             if strmid:
-                stream = stream_value(self.getobj(strmid))
+                stream = self.getobj(strmid)
+                if stream is None:
+                    return None
+                stream = stream_value(stream)
                 if stream.get('Type') is not LITERAL_OBJSTM:
                     handle_error(PDFSyntaxError, 'Not a stream object: %r' % stream)
                 try:

tools/pdfexplore.py

     @intarg()
     def do_sobj(self, arg):
         "Select object with ID X. The object has to have been read already."
+        obj = None
         if arg in self.doc._cached_objs:
             obj = self.doc._cached_objs[arg]
         elif arg in self.doc._parsed_objs:
             obj = self.doc._parsed_objs[arg]
         else:
             print("Object hasn't been read yet.")
-            return
-        self.current_obj = (arg, 0, obj)
-        self.do_st('')
+            strmid, index = self.doc.find_obj_ref(arg)
+            if index is not None:
+                print("However, our object id is in a xref")
+                if strmid:
+                    print("Stream ID: %d" % strmid)
+                print("Position: %d" % index)
+        if obj is not None:
+            self.current_obj = (arg, 0, obj)
+            self.do_st('')
     
     def do_dbgobj(self, arg):
         "Enter in debug mode with current obj as 'obj' in the local scope."