Commits

Philippe Lagadec  committed ed69b68

version 0.18 2007-12-05

  • Participants
  • Parent commits bfbbbe0

Comments (0)

Files changed (1)

File OleFileIO_PL/OleFileIO_PL.py

     Microsoft Compound Document File Format), such as Microsoft Office
     documents, Image Composer and FlashPix files, Outlook messages, ...
 
-version 0.17 2007-12-04 Philippe Lagadec - http://lagasoft.free.fr
+version 0.18 2007-12-05 Philippe Lagadec - http://lagasoft.free.fr
 
 Project website: http://lagasoft.free.fr/python/olefileio
 
 """
 
 __author__  = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec"
-__date__    = "2007-12-04"
-__version__ = '0.17'
+__date__    = "2007-12-08"
+__version__ = '0.18'
 
 #--- LICENSE ------------------------------------------------------------------
 
 #                      - added safety check in main for large or binary
 #                        properties
 #                      - allow size>0 for storages for some implementations
+# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and
+#                        streams
+#                      - added option '-c' in main to check all streams
 
 #-----------------------------------------------------------------------------
 # TODO (for version 1.0):
 # - add underscore to each private method, to avoid their display in
 #   pydoc/epydoc documentation
 # - replace all raised exceptions with _raise_defect (at least in OleFileIO)
+# - merge code from _OleStream and OleFileIO.getsect to read sectors
+#   (maybe add a class for FAT and MiniFAT ?)
 # - add method to check all streams (follow sectors chains without storing all
 #   stream in memory, and report anomalies)
 # - use _OleDirectoryEntry.kids_dict to improve _find and _list ?
         return    : a StringIO instance containing the OLE stream
         """
         debug('_OleStream.__init__:')
-        debug('  size=%d, offset=%d, sectorsize=%d, len(fat)=%d'
-            %(size,offset,sectorsize,len(fat)))
+        debug('  sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
+            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
+        # for debugging messages, size of file where stream is read:
+        if isinstance(fp, StringIO.StringIO):
+            filesize = len(fp.getvalue())   # file in MiniFAT
+        else:
+            filesize = os.path.getsize(fp.name) # file on disk
         #[PL] To detect malformed documents with FAT loops, we compute the
         # expected number of sectors in the stream:
         unknown_size = False
             unknown_size = True
             debug('  stream with UNKNOWN SIZE')
         nb_sectors = (size + (sectorsize-1)) / sectorsize
+        debug('nb_sectors = %d' % nb_sectors)
         # This number should (at least) be less than the total number of
         # sectors in the given FAT:
         if nb_sectors > len(fat):
         data = []
         # if size is zero, then first sector index should be ENDOFCHAIN:
         if size == 0 and sect != ENDOFCHAIN:
+            debug('size == 0 and sect != ENDOFCHAIN:')
             raise IOError, 'incorrect OLE sector index for empty stream'
         #[PL] A fixed-length for loop is used instead of an undefined while
         # loop to avoid DoS attacks:
                     break
                 else:
                     # else this means that the stream is smaller than declared:
+                    debug('sect=ENDOFCHAIN before expected size')
                     raise IOError, 'incomplete OLE stream'
             # sector index should be within FAT:
             if sect<0 or sect>=len(fat):
-                debug('fp = '+ repr(fp))
-                debug('file size: %d' % os.path.getsize(fp.name))
-                debug('offset=%d, sectorsize=%d, sect=%d, seek=%d, len read=%d, len(fat)=%d' %
-                    (offset, sectorsize, sect, offset + sectorsize * sect, len(sector_data), len(fat)))
+                debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
+                debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
+##                tmp_data = string.join(data, "")
+##                f = open('test_debug.bin', 'wb')
+##                f.write(tmp_data)
+##                f.close()
+##                debug('data read so far: %d bytes' % len(tmp_data))
                 raise IOError, 'incorrect OLE FAT, sector index out of range'
+            #TODO: merge this code with OleFileIO.getsect() ?
             #TODO: check if this works with 4K sectors:
-            fp.seek(offset + sectorsize * sect)
+            try:
+                fp.seek(offset + sectorsize * sect)
+            except:
+                debug('sect=%d, seek=%d, filesize=%d' %
+                    (sect, offset+sectorsize*sect, filesize))
+                raise IOError, 'OLE sector index out of range'
             sector_data = fp.read(sectorsize)
             # [PL] check if there was enough data:
-            if len(sector_data) != sectorsize:
+            # Note: if sector is the last of the file, sometimes it is not a
+            # complete sector (of 512 or 4K), so we may read less than
+            # sectorsize.
+            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
+                debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
+                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
+                debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                 raise IOError, 'incomplete OLE sector'
             data.append(sector_data)
             # jump to next sector in the FAT:
             raise IOError, 'incorrect last sector index in OLE stream'
         data = string.join(data, "")
         # Data is truncated to the actual stream size:
-        if len(data) > size:
+        if len(data) >= size:
             data = data[:size]
             # actual stream size is stored for future use:
             self.size = size
-        else:
+        elif unknown_size:
             # actual stream size was not known, now we know the size of read
             # data:
             self.size = len(data)
+        else:
+            # read data is less than expected:
+            debug('len(data)=%d, size=%d' % (len(data), size))
+            raise IOError, 'OLE stream size is less than declared'
         # when all data is read in memory, StringIO constructor is called
         StringIO.StringIO.__init__(self, data)
         # Then the _OleStream object can be used as a read-only file object.
         # name is converted from unicode to Latin-1:
         self.name = _unicode(name)
 
-        debug('DirEntry SID=%d: %s' % (self.sid, self.name))
+        debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
         debug(' - type: %d' % self.entry_type)
         debug(' - sect: %d' % self.isectStart)
         debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
         only be called for the root object once.
         """
         debug('build_storage_tree: SID=%d - %s - sid_child=%d'
-            % (self.sid, self.name, self.sid_child))
+            % (self.sid, repr(self.name), self.sid_child))
         if self.sid_child != NOSTREAM:
             # if child SID is not NOSTREAM, then this entry is a storage.
             # Let's walk through the tree of children to fill the kids list:
         # get child direntry:
         child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
         debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
-            % (child.sid, child.name, child.sid_left, child.sid_right, child.sid_child))
+            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
         # the directory entries are organized as a red-black tree.
         # (cf. Wikipedia for details)
         # First walk through left side of the tree:
         header_size = struct.calcsize(fmt_header)
         debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
         header1 = header[:header_size]
-        (Sig, clsid, MinorVersion, DllVersion, ByteOrder, SectorShift,
-        MiniSectorShift, Reserved, Reserved1, csectDir, self.csectFat, sectDirStart,
-        signature, MiniSectorCutoff, MiniFatStart, csectMiniFat, self.sectDifStart,
-        self.csectDif) = struct.unpack(fmt_header, header1)
+        (
+            self.Sig,
+            self.clsid,
+            self.MinorVersion,
+            self.DllVersion,
+            self.ByteOrder,
+            self.SectorShift,
+            self.MiniSectorShift,
+            self.Reserved, self.Reserved1,
+            self.csectDir,
+            self.csectFat,
+            self.sectDirStart,
+            self.signature,
+            self.MiniSectorCutoff,
+            self.MiniFatStart,
+            self.csectMiniFat,
+            self.sectDifStart,
+            self.csectDif
+        ) = struct.unpack(fmt_header, header1)
         debug( struct.unpack(fmt_header,    header1))
 
-        if Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
+        if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
             # OLE signature should always be present
             self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
-        if clsid != '\x00'*16:
+        if self.clsid != '\x00'*16:
             # according to AAF specs, CLSID should always be zero
             self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
-        debug( "MinorVersion = %d" % MinorVersion )
-        debug( "DllVersion   = %d" % DllVersion )
-        if DllVersion not in [3, 4]:
+        debug( "MinorVersion = %d" % self.MinorVersion )
+        debug( "DllVersion   = %d" % self.DllVersion )
+        if self.DllVersion not in [3, 4]:
             # version 3: usual format, 512 bytes per sector
             # version 4: large format, 4K per sector
             self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
-        debug( "ByteOrder    = %X" % ByteOrder )
-        if ByteOrder != 0xFFFE:
+        debug( "ByteOrder    = %X" % self.ByteOrder )
+        if self.ByteOrder != 0xFFFE:
             # For now only common little-endian documents are handled correctly
             self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
             # TODO: add big-endian support for documents created on Mac ?
-        SectorSize = 2**SectorShift
-        debug( "SectorSize   = %d" % SectorSize )
-        if SectorSize not in [512, 4096]:
+        self.SectorSize = 2**self.SectorShift
+        debug( "SectorSize   = %d" % self.SectorSize )
+        if self.SectorSize not in [512, 4096]:
             self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header")
-        if (DllVersion==3 and SectorSize!=512) or (DllVersion==4 and SectorSize!=4096):
+        if (self.DllVersion==3 and self.SectorSize!=512) \
+        or (self.DllVersion==4 and self.SectorSize!=4096):
             self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header")
-        MiniSectorSize = 2**MiniSectorShift
-        debug( "MiniSectorSize   = %d" % MiniSectorSize )
-        if MiniSectorSize not in [64]:
+        self.MiniSectorSize = 2**self.MiniSectorShift
+        debug( "MiniSectorSize   = %d" % self.MiniSectorSize )
+        if self.MiniSectorSize not in [64]:
             self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header")
-        if Reserved != 0 or Reserved1 != 0:
+        if self.Reserved != 0 or self.Reserved1 != 0:
             self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
-        debug( "csectDir     = %d" % csectDir )
-        if SectorSize==512 and csectDir!=0:
+        debug( "csectDir     = %d" % self.csectDir )
+        if self.SectorSize==512 and self.csectDir!=0:
             self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header")
         debug( "csectFat     = %d" % self.csectFat )
-        debug( "sectDirStart = %X" % sectDirStart )
-        debug( "signature    = %d" % signature )
+        debug( "sectDirStart = %X" % self.sectDirStart )
+        debug( "signature    = %d" % self.signature )
         # Signature should be zero, BUT some implementations do not follow this
         # rule => only a potential defect:
-        if signature != 0:
+        if self.signature != 0:
             self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)")
-        debug( "MiniSectorCutoff    = %d" % MiniSectorCutoff )
-        debug( "MiniFatStart    = %X" % MiniFatStart )
-        debug( "csectMiniFat    = %d" % csectMiniFat )
-        debug( "sectDifStart    = %X" % self.sectDifStart )
-        debug( "csectDif        = %d" % self.csectDif )
+        debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff )
+        debug( "MiniFatStart     = %X" % self.MiniFatStart )
+        debug( "csectMiniFat     = %d" % self.csectMiniFat )
+        debug( "sectDifStart     = %X" % self.sectDifStart )
+        debug( "csectDif         = %d" % self.csectDif )
 
         # calculate the number of sectors in the file
         # (-1 because header doesn't count)
-        self.nb_sect = (os.path.getsize(filename) / SectorSize) - 1
+        filesize = os.path.getsize(filename)
+        self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1
         debug( "Number of sectors in the file: %d" % self.nb_sect )
 
         # file clsid (probably never used, so we don't store it)
         clsid = _clsid(header[8:24])
-        self.sectorsize = 1 << i16(header, 30)
-        self.minisectorsize = 1 << i16(header, 32)
-        self.minisectorcutoff = i32(header, 56)
+        self.sectorsize = self.SectorSize #1 << i16(header, 30)
+        self.minisectorsize = self.MiniSectorSize  #1 << i16(header, 32)
+        self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56)
 
         # check known streams for duplicate references (these are always in FAT,
         # never in MiniFAT):
-        self._check_duplicate_stream(sectDirStart)
+        self._check_duplicate_stream(self.sectDirStart)
         # check MiniFAT only if it is not empty:
-        if csectMiniFat:
-            self._check_duplicate_stream(MiniFatStart)
+        if self.csectMiniFat:
+            self._check_duplicate_stream(self.MiniFatStart)
         # check DIFAT only if it is not empty:
         if self.csectDif:
             self._check_duplicate_stream(self.sectDifStart)
         self.loadfat(header)
         # Load direcory.  This sets both the direntries list (ordered by sid)
         # and the root (ordered by hierarchy) members.
-        self.loaddirectory(i32(header, 48))
+        self.loaddirectory(self.sectDirStart)#i32(header, 48))
         self.ministream = None
-        self.minifatsect = i32(header, 60)
+        self.minifatsect = self.MiniFatStart #i32(header, 60)
 
 
     def _check_duplicate_stream(self, first_sect, minifat=False):
 ##              # FAT should contain csectFat blocks
 ##              print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat)
 ##              raise IOError, 'incorrect DIFAT'
+        # since FAT is read from fixed-size sectors, it may contain more values
+        # than the actual number of sectors in the file.
+        # Keep only the relevant sector indexes:
+        if len(self.fat) > self.nb_sect:
+            debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
+            self.fat = self.fat[:self.nb_sect]
+        debug('\nFAT:')
         self.dumpfat(self.fat)
 
 
         """
         Load the MiniFAT table.
         """
-        # This is stored in a standard  sub-stream, pointed to by a header
+        # MiniFAT is stored in a standard  sub-stream, pointed to by a header
         # field.
-        s = self._open(self.minifatsect).read()
+        # NOTE: there are two sizes to take into account for this stream:
+        # 1) Stream size is calculated according to the number of sectors
+        #    declared in the OLE header. This allocated stream may be more than
+        #    needed to store the actual sector indexes.
+        # (self.csectMiniFat is the number of sectors of size self.SectorSize)
+        stream_size = self.csectMiniFat * self.SectorSize
+        # 2) Actually used size is calculated by dividing the MiniStream size
+        #    (given by root entry size) by the size of mini sectors, *4 for
+        #    32 bits indexes:
+        nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize
+        used_size = nb_minisectors * 4
+        debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
+            (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors))
+        if used_size > stream_size:
+            # This is not really a problem, but may indicate a wrong implementation:
+            self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
+        # In any case, first read stream_size:
+        s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
         #[PL] Old code replaced by an array:
         #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4))
         self.minifat = array.array('L', s)
-
+        # Then shrink the array to used size, to avoid indexes out of MiniStream:
+        debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
+        self.minifat = self.minifat[:nb_minisectors]
+        debug('loadminifat(): len=%d' % len(self.minifat))
+        debug('\nMiniFAT:')
+        self.dumpfat(self.minifat)
 
     def getsect(self, sect):
         """
         try:
             self.fp.seek(self.sectorsize * (sect+1))
         except:
+            debug('getsect(): sect=%X, seek=%d, filesize=%d' %
+                (sect, self.sectorsize*(sect+1), os.path.getsize(self.fp.name)))
             self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
         sector = self.fp.read(self.sectorsize)
         if len(sector) != self.sectorsize:
+            debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
+                (sect, len(sector), self.sectorsize))
             self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
         return sector
 
         self.root.dump()
 
 
-    def _open(self, start, size = 0x7FFFFFFF):
+    def _open(self, start, size = 0x7FFFFFFF, force_FAT=False):
         """
         Open a stream, either in FAT or MiniFAT according to its size.
         (openstream helper)
         
         start: index of first sector
         size: size of stream (or nothing if size is unknown)
+        force_FAT: if False (default), stream will be opened in FAT or MiniFAT
+                   according to size. If True, it will always be opened in FAT.
         """
+        debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' %
+            (start, size, str(force_FAT)))
         # stream size is compared to the MiniSectorCutoff threshold:
-        if size < self.minisectorcutoff:
+        if size < self.minisectorcutoff and not force_FAT:
             # ministream object
             if not self.ministream:
                 # load MiniFAT if it wasn't already done:
                 self.loadminifat()
                 # The first sector index of the miniFAT stream is stored in the
                 # root directory entry:
-                self.ministream = self._open(self.root.isectStart)
+                size_ministream = self.root.size
+                debug('Opening MiniStream: sect=%d, size=%d' %
+                    (self.root.isectStart, size_ministream))
+                self.ministream = self._open(self.root.isectStart,
+                    size_ministream, force_FAT=True)
             return _OleStream(self.ministream, start, size, 0,
                               self.minisectorsize, self.minifat)
         else:
         print """
 Launched from command line, this script parses OLE files and prints info.
 
-Usage: OleFileIO_PL.py [-d] <file> [file2 ...]
+Usage: OleFileIO_PL.py [-d] [-s] <file> [file2 ...]
 
 Options:
--d : debug mode (display a lot of messages, for developers only)
+-d : debug mode (display a lot of debug information, for developers only)
+-s : check all streams (for debugging purposes)
 """
         sys.exit()
 
+    check_streams = False
     for filename in sys.argv[1:]:
 ##      try:
+            # OPTIONS:
             if filename == '-d':
                 # option to switch debug mode on:
                 set_debug_mode(True)
                 continue
+            if filename == '-c':
+                # option to switch check streams mode on:
+                check_streams = True
+                continue
+
             ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT)
             print "-" * 68
             print filename
                                     v = '(binary data)'
                                     break
                         print "   ", k, v
+                        
+            if check_streams:
+                # Read all streams to check if there are errors:
+                print '\nChecking streams...'
+                for streamname in ole.listdir():
+                    # print name using repr() to convert binary chars to \xNN:
+                    print '-', repr('/'.join(streamname)),'-',
+                    st_type = ole.get_type(streamname)
+                    if st_type == STGTY_STREAM:
+                        print 'size %d' % ole.get_size(streamname)
+                        # just try to read stream in memory:
+                        ole.openstream(streamname)
+                    else:
+                        print 'NOT a stream : type=%d' % st_type
+                print ''
 
             #[PL] Test a few new methods:
             root = ole.get_rootentry_name()