Commits

Anonymous committed 1da49c5

hachoir-parser/misc: total rewrite of OLE parser, MS Office parsers, Word 2.0 parser

  • Participants
  • Parent commits 5b1575c

Comments (0)

Files changed (8)

hachoir-metadata/hachoir_metadata/misc.py

     def extract(self, ole2):
         # Entry point: hands the given ole2 object straight to self._extract().
         self._extract(ole2)
 
-    def _extract(self, fieldset, main_document=True):
-        if main_document:
-            # _feedAll() is needed to make sure that we get all root[*] fragments
+    def _extract(self, fieldset):
+        try:
             fieldset._feedAll()
-            if "root[0]" in fieldset:
-                self.useRoot(fieldset["root[0]"])
-        doc_summary = self.getField(fieldset, main_document, "doc_summary[0]")
+        except StopIteration:
+            pass
+        if "root[0]" in fieldset:
+            self._extract(self.getFragment(fieldset["root[0]"]))
+        doc_summary = self.getField(fieldset, "doc_summary[0]")
         if doc_summary:
             self.useSummary(doc_summary, True)
-        word_doc = self.getField(fieldset, main_document, "word_doc[0]")
+        word_doc = self.getField(fieldset, "word_doc[0]")
         if word_doc:
             self.useWordDocument(word_doc)
-        summary = self.getField(fieldset, main_document, "summary[0]")
+        summary = self.getField(fieldset, "summary[0]")
         if summary:
             self.useSummary(summary, False)
 
-    @fault_tolerant
-    def useRoot(self, root):
-        stream = root.getSubIStream()
+    def getFragment(self, frag):
+        stream = frag.getSubIStream()
         ministream = guessParser(stream)
         if not ministream:
             warning("Unable to create the OLE2 mini stream parser!")
-            return
-        self._extract(ministream, main_document=False)
+            return frag
+        return ministream
 
-    def getField(self, fieldset, main_document, name):
+    def getField(self, fieldset, name):
+        # _feedAll() is needed to make sure that we get all fragments
+        # eg. summary[0], summary[1], ..., summary[n]
+        try:
+            fieldset._feedAll()
+        except StopIteration:
+            pass
         if name not in fieldset:
             return None
-        # _feedAll() is needed to make sure that we get all fragments
-        # eg. summary[0], summary[1], ..., summary[n]
-        fieldset._feedAll()
         field = fieldset[name]
-        if main_document:
-            stream = field.getSubIStream()
-            field = guessParser(stream)
-            if not field:
-                warning("Unable to create the OLE2 parser for %s!" % name)
-                return None
-        return field
+        return self.getFragment(field)
 
     @fault_tolerant
     def useSummary(self, summary, is_doc_summary):
 
     @fault_tolerant
     def useWordDocument(self, doc):
-        self.comment = "Encrypted: %s" % doc["fEncrypted"].value
+        self.comment = "Encrypted: %s" % doc["FIB/fEncrypted"].value
 
     @fault_tolerant
     def useProperty(self, summary, property, is_doc_summary):

hachoir-parser/hachoir_parser/misc/__init__.py

 from hachoir_parser.misc.gnome_keyring import GnomeKeyring
 from hachoir_parser.misc.bplist import BPList
 from hachoir_parser.misc.dsstore import DSStore
+from hachoir_parser.misc.word_doc import WordDocumentParser
+from hachoir_parser.misc.word_2 import Word2DocumentParser

hachoir-parser/hachoir_parser/misc/msoffice.py

 
 Documents:
  - goffice source code
+ - Microsoft Office PowerPoint 97-2007 Binary File Format (.ppt) Specification
+    http://download.microsoft.com/download/0/B/E/0BE8BDD7-E5E8-422A-ABFD-4342ED7AD886/PowerPoint97-2007BinaryFileFormat(ppt)Specification.pdf
 
 Author: Robert Xiao, Victor Stinner
-Creation: 2006-04-23
+Creation: 8 January 2005
 """
 
-from hachoir_parser import HachoirParser
-from hachoir_core.field import FieldSet, RootSeekableFieldSet, RawBytes
-from hachoir_core.endian import LITTLE_ENDIAN
+from hachoir_core.field import (SubFile, FieldSet,
+    UInt8, UInt16, Int32, UInt32, Enum, String, CString,
+    Bits, RawBytes)
+from hachoir_core.text_handler import textHandler, hexadecimal
+from hachoir_parser.misc.ole2_util import OLE2FragmentParser, RawParser
 from hachoir_core.stream import StringInputStream
-from hachoir_parser.misc.msoffice_summary import SummaryFieldSet, CompObj
-from hachoir_parser.misc.word_doc import WordDocumentFieldSet
+from hachoir_parser.misc.msoffice_summary import Summary, CompObj
+from hachoir_parser.misc.word_doc import WordDocumentParser, WordTableParser
 
-PROPERTY_NAME = {
-    u"\5DocumentSummaryInformation": "doc_summary",
-    u"\5SummaryInformation": "summary",
-    u"WordDocument": "word_doc",
-}
-
-class OfficeRootEntry(HachoirParser, RootSeekableFieldSet):
-    PARSER_TAGS = {
-        "description": "Microsoft Office document subfragments",
-    }
-    endian = LITTLE_ENDIAN
-
-    def __init__(self, stream, **args):
-        RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
-        HachoirParser.__init__(self, stream, **args)
-
-    def validate(self):
-        return True
+class RootEntry(OLE2FragmentParser):
+    ENDIAN_CHECK=False
 
     def createFields(self):
         for index, property in enumerate(self.ole2.properties):
             if index == 0:
                 continue
             try:
-                name = PROPERTY_NAME[property["name"].value]
+                name,parser = PROPERTY_NAME[property["name"].value]
             except LookupError:
                 name = property.name+"content"
-            for field in self.parseProperty(index, property, name):
+                parser = RawParser
+            for field in self.parseProperty(property, name, parser):
                 yield field
+    def seekSBlock(self, block):
+        self.seekBit(block * self.ole2.ss_size)
 
-    def parseProperty(self, property_index, property, name_prefix):
+    def parseProperty(self, property, name_prefix, parser=RawParser):
         ole2 = self.ole2
         if not property["size"].value:
             return
         first = None
         previous = None
         size = 0
-        start = property["start"].value
-        chain = ole2.getChain(start, True)
-        blocksize = ole2.ss_size
-        desc_format = "Small blocks %s..%s (%s)"
+        fragment_group = None
+        chain = ole2.getChain(property["start"].value, ole2.ss_fat)
         while True:
             try:
                 block = chain.next()
                 contiguous = False
-                if not first:
+                if first is None:
                     first = block
                     contiguous = True
-                if previous and block == (previous+1):
+                if previous is not None and block == (previous+1):
                     contiguous = True
                 if contiguous:
                     previous = block
-                    size += blocksize
+                    size += ole2.ss_size
                     continue
             except StopIteration:
                 block = None
+            if first is None:
+                break
             self.seekSBlock(first)
-            desc = desc_format % (first, previous, previous-first+1)
-            size = min(size, property["size"].value*8)
-            if name_prefix in ("summary", "doc_summary"):
-                yield SummaryFieldSet(self, name, desc, size=size)
-            elif name_prefix == "word_doc":
-                yield WordDocumentFieldSet(self, name, desc, size=size)
-            elif property_index == 1:
-                yield CompObj(self, "comp_obj", desc, size=size)
-            else:
-                yield RawBytes(self, name, size//8, desc)
+            desc = "Small blocks %s..%s (%s)" % (first, previous, previous-first+1)
+            desc += " of %s bytes" % (ole2.ss_size//8)
+            field = CustomFragment(self, name, size, parser, desc, fragment_group)
+            yield field
+            if not fragment_group:
+                fragment_group = field.group
+                fragment_group.args["datasize"] = property["size"].value
+                fragment_group.args["ole2name"] = property["name"].value
             if block is None:
                 break
             first = block
             previous = block
-            size = ole2.sector_size
-
-    def seekSBlock(self, block):
-        self.seekBit(block * self.ole2.ss_size)
+            size = ole2.ss_size
 
 class FragmentGroup:
     def __init__(self, parser):
         self.items = []
         self.parser = parser
+        self.args = {}
 
     def add(self, item):
         self.items.append(item)
         data = "".join(data)
 
         # FIXME: Use smarter code to send arguments
-        args = {"ole2": self.items[0].root}
-        tags = {"class": self.parser, "args": args}
+        self.args["ole2"] = self.items[0].root
+        tags = {"class": self.parser, "args": self.args}
         tags = tags.iteritems()
         return StringInputStream(data, "<fragment group>", tags=tags)
 
     def _createInputStream(self, **args):
         # Delegates to self.group.createInputStream(); the keyword arguments
         # are accepted but not forwarded.
         return self.group.createInputStream()
 
+class Pictures(OLE2FragmentParser):
+    class Picture(FieldSet):
+        def createFields(self):
+            yield RawBytes(self, "identifier", 4, "some kind of marker (A0461DF0)")
+            yield UInt32(self, "size")
+            yield RawBytes(self, "unknown[]", 16)
+            yield RawBytes(self, "unknown[]", 1)
+            yield SubFile(self, "image", self["size"].value-17, "Image Data")
+    ENDIAN_CHECK=False
+
+    def createFields(self):
+        pos=0
+        while pos//8 < self.datasize:
+            newpic=Pictures.Picture(self, "picture[]")
+            yield newpic
+            pos+=newpic.size
+
+class PowerPointDocument(OLE2FragmentParser):
+    OBJ_TYPES={ 0:"Unknown",
+                1000:"Document",
+                1001:"DocumentAtom",
+                1002:"EndDocument",
+                1003:"SlidePersist",
+                1004:"SlideBase",
+                1005:"SlideBaseAtom",
+                1006:"Slide",
+                1007:"SlideAtom",
+                1008:"Notes",
+                1009:"NotesAtom",
+                1010:"Environment",
+                1011:"SlidePersistAtom",
+                1012:"Scheme",
+                1013:"SchemeAtom",
+                1014:"DocViewInfo",
+                1015:"SSlideLayoutAtom",
+                1016:"MainMaster",
+                1017:"SSSlideInfoAtom",
+                1018:"SlideViewInfo",
+                1019:"GuideAtom",
+                1020:"ViewInfo",
+                1021:"ViewInfoAtom",
+                1022:"SlideViewInfoAtom",
+                1023:"VBAInfo",
+                1024:"VBAInfoAtom",
+                1025:"SSDocInfoAtom",
+                1026:"Summary",
+                1027:"Texture",
+                1028:"VBASlideInfo",
+                1029:"VBASlideInfoAtom",
+                1030:"DocRoutingSlip",
+                1031:"OutlineViewInfo",
+                1032:"SorterViewInfo",
+                1033:"ExObjList",
+                1034:"ExObjListAtom",
+                1035:"PPDrawingGroup", #FIXME: Office Art File Format Docu
+                1036:"PPDrawing", #FIXME: Office Art File Format Docu
+                1038:"Theme",
+                1039:"ColorMapping",
+                1040:"NamedShows", # don't know if container
+                1041:"NamedShow",
+                1042:"NamedShowSlides", # don't know if container
+                1052:"OriginalMainMasterId",
+                1053:"CompositeMasterId",
+                1054:"RoundTripContentMasterInfo12",
+                1055:"RoundTripShapeId12",
+                1056:"RoundTripHFPlaceholder12",
+                1058:"RoundTripContentMasterId12",
+                1059:"RoundTripOArtTextStyles12",
+                1060:"HeaderFooterDefaults12",
+                1061:"DocFlags12",
+                1062:"RoundTripShapeCheckSumForCustomLayouts12",
+                1063:"RoundTripNotesMasterTextStyles12",
+                1064:"RoundTripCustomTableStyles12",
+                2000:"List",
+                2005:"FontCollection",
+                2017:"ListPlaceholder",
+                2019:"BookmarkCollection",
+                2020:"SoundCollection",
+                2021:"SoundCollAtom",
+                2022:"Sound",
+                2023:"SoundData",
+                2025:"BookmarkSeedAtom",
+                2026:"GuideList",
+                2028:"RunArray",
+                2029:"RunArrayAtom",
+                2030:"ArrayElementAtom",
+                2031:"Int4ArrayAtom",
+                2032:"ColorSchemeAtom",
+                3008:"OEShape",
+                3009:"ExObjRefAtom",
+                3011:"OEPlaceholderAtom",
+                3020:"GrColor",
+                3024:"GPointAtom",
+                3025:"GrectAtom",
+                3031:"GRatioAtom",
+                3032:"Gscaling",
+                3034:"GpointAtom",
+                3035:"OEShapeAtom",
+                3037:"OEPlaceholderNewPlaceholderId12",
+                3998:"OutlineTextRefAtom",
+                3999:"TextHeaderAtom",
+                4000:"TextCharsAtom",
+                4001:"StyleTextPropAtom",
+                4002:"BaseTextPropAtom",
+                4003:"TxMasterStyleAtom",
+                4004:"TxCFStyleAtom",
+                4005:"TxPFStyleAtom",
+                4006:"TextRulerAtom",
+                4007:"TextBookmarkAtom",
+                4008:"TextBytesAtom",
+                4009:"TxSIStyleAtom",
+                4010:"TextSpecInfoAtom",
+                4011:"DefaultRulerAtom",
+                4023:"FontEntityAtom",
+                4024:"FontEmbeddedData",
+                4025:"TypeFace",
+                4026:"CString",
+                4027:"ExternalObject",
+                4033:"MetaFile",
+                4034:"ExOleObj",
+                4035:"ExOleObjAtom",
+                4036:"ExPlainLinkAtom",
+                4037:"CorePict",
+                4038:"CorePictAtom",
+                4039:"ExPlainAtom",
+                4040:"SrKinsoku",
+                4041:"HandOut",
+                4044:"ExEmbed",
+                4045:"ExEmbedAtom",
+                4046:"ExLink",
+                4047:"ExLinkAtom_old",
+                4048:"BookmarkEntityAtom",
+                4049:"ExLinkAtom",
+                4050:"SrKinsokuAtom",
+                4051:"ExHyperlinkAtom",
+                4053:"ExPlain",
+                4054:"ExPlainLink",
+                4055:"ExHyperlink",
+                4056:"SlideNumberMCAtom",
+                4057:"HeadersFooters",
+                4058:"HeadersFootersAtom",
+                4062:"RecolorEntryAtom",
+                4063:"TxInteractiveInfoAtom",
+                4065:"EmFormatAtom",
+                4066:"CharFormatAtom",
+                4067:"ParaFormatAtom",
+                4068:"MasterText",
+                4071:"RecolorInfoAtom",
+                4073:"ExQuickTime",
+                4074:"ExQuickTimeMovie",
+                4075:"ExQuickTimeMovieData",
+                4076:"ExSubscription",
+                4077:"ExSubscriptionSection",
+                4078:"ExControl",
+                4080:"SlideListWithText",
+                4081:"AnimationInfoAtom",
+                4082:"InteractiveInfo",
+                4083:"InteractiveInfoAtom",
+                4084:"SlideList",
+                4085:"UserEditAtom",
+                4086:"CurrentUserAtom",
+                4087:"DateTimeMCAtom",
+                4088:"GenericDateMCAtom",
+                4090:"FooterMCAtom",
+                4091:"ExControlAtom",
+                4100:"ExMediaAtom",
+                4101:"ExVideo",
+                4102:"ExAviMovie",
+                4103:"ExMCIMovie",
+                4109:"ExMIDIAudio",
+                4110:"ExCDAudio",
+                4111:"ExWAVAudioEmbedded",
+                4112:"ExWAVAudioLink",
+                4113:"ExOleObjStg",
+                4114:"ExCDAudioAtom",
+                4115:"ExWAVAudioEmbeddedAtom",
+                4116:"AnimationInfoAtom",
+                4117:"RTFDateTimeMCAtom",
+                5000:"ProgTags", # don't know if container
+                5001:"ProgStringTag",
+                5002:"ProgBinaryTag",
+                5003:"BinaryTagData",
+                6000:"PrintOptions",
+                6001:"PersistPtrFullBlock", # don't know if container
+                6002:"PersistPtrIncrementalBlock", # don't know if container
+                10000:"RulerIndentAtom",
+                10001:"GScalingAtom",
+                10002:"GRColorAtom",
+                10003:"GLPointAtom",
+                10004:"GlineAtom",
+                11019:"AnimationAtom12",
+                11021:"AnimationHashAtom12",
+                14100:"SlideSyncInfo12",
+                14101:"SlideSyncInfoAtom12",
+                0xf000:"EscherDggContainer", # Drawing Group Container 
+                0xf006:"EscherDgg",
+                0xf016:"EscherCLSID",
+                0xf00b:"EscherOPT",
+                0xf001:"EscherBStoreContainer",
+                0xf007:"EscherBSE",
+                0xf018:"EscherBlip_START", # Blip types are between 
+                0xf117:"EscherBlip_END", # these two values 
+                0xf002:"EscherDgContainer", # Drawing Container 
+                0xf008:"EscherDg",
+                0xf118:"EscherRegroupItems",
+                0xf120:"EscherColorScheme", # bug in docs 
+                0xf003:"EscherSpgrContainer",
+                0xf004:"EscherSpContainer",
+                0xf009:"EscherSpgr",
+                0xf00a:"EscherSp",
+                0xf00c:"EscherTextbox",
+                0xf00d:"EscherClientTextbox",
+                0xf00e:"EscherAnchor",
+                0xf00f:"EscherChildAnchor",
+                0xf010:"EscherClientAnchor",
+                0xf011:"EscherClientData",
+                0xf005:"EscherSolverContainer",
+                0xf012:"EscherConnectorRule", # bug in docs 
+                0xf013:"EscherAlignRule",
+                0xf014:"EscherArcRule",
+                0xf015:"EscherClientRule",
+                0xf017:"EscherCalloutRule",
+                0xf119:"EscherSelection",
+                0xf11a:"EscherColorMRU",
+                0xf11d:"EscherDeletedPspl", # bug in docs 
+                0xf11e:"EscherSplitMenuColors",
+                0xf11f:"EscherOleObject",
+                0xf122:"EscherUserDefined"}
+    class CurrentUserAtom(FieldSet):
+        def createFields(self):
+            yield UInt32(self, "size")
+            yield textHandler(UInt32(self, "magic", "0xe391c05f for normal PPT, 0xf3d1c4df for encrypted PPT"), hexadecimal)
+            yield UInt32(self, "offsetToCurrentEdit", "Offset in main stream to current edit field")
+            yield UInt16(self, "lenUserName", "Length of user name")
+            yield UInt16(self, "docFileVersion", "1012 for PP97+")
+            yield UInt8(self, "majorVersion", "3 for PP97+")
+            yield UInt8(self, "minorVersion", "0 for PP97+")
+            yield UInt16(self, "unknown")
+            yield String(self, "userName", self["lenUserName"].value, "ANSI version of the username")
+            yield UInt32(self, "relVersion", "Release version: 8 for regular PPT file, 9 for multiple-master PPT file")
+
+    class PowerPointObject(FieldSet):
+        def createFields(self):
+            yield Bits(self, "version", 4)
+            yield Bits(self, "instance", 12)
+            yield Enum(UInt16(self, "type"),PowerPointDocument.OBJ_TYPES)
+            yield UInt32(self, "length")
+            self._size = self["length"].value * 8 + 64
+            obj_type = self["type"].display
+            obj_len = self["length"].value
+            # type 1064 (RoundTripCustomTableStyles12) may appear to be a container, but it is not.
+            if self["version"].value==0xF and self["type"].value != 1064:
+                while (self.current_size)//8 < obj_len+8:
+                    yield PowerPointDocument.PowerPointObject(self, "object[]")
+            elif obj_len:
+                if obj_type=="FontEntityAtom":
+                    yield String(self, "data", obj_len, charset="UTF-16-LE", truncate="\0", strip="\0")
+                elif obj_type=="TextCharsAtom":
+                    yield String(self, "data", obj_len, charset="UTF-16-LE")
+                elif obj_type=="TextBytesAtom":
+                    yield String(self, "data", obj_len, charset="ASCII")
+                elif hasattr(PowerPointDocument, obj_type):
+                    field = getattr(PowerPointDocument, obj_type)(self, "data")
+                    field._size = obj_len * 8
+                    yield field
+                else:
+                    yield RawBytes(self, "data", obj_len)
+        def createDescription(self):
+            if self["version"].value==0xF:
+                return "PowerPoint Object Container; instance %s, type %s"%(self["instance"].value,self["type"].display)
+            return "PowerPoint Object; version %s, instance %s, type %s"%(self["version"].value,self["instance"].value,self["type"].display)
+    ENDIAN_CHECK=False
+    OS_CHECK=False
+    def createFields(self):
+        pos=0
+        while pos//8 < self.datasize:
+            newobj=PowerPointDocument.PowerPointObject(self, "object[]")
+            yield newobj
+            pos+=newobj.size
+
+class CurrentUser(OLE2FragmentParser):
+    def createFields(self):
+        yield PowerPointDocument.PowerPointObject(self, "current_user")
+        if self.current_size < self.size:
+            yield String(self, "unicode_name", self["current_user/data/lenUserName"].value * 2, charset="UTF-16-LE")
+        
+
+class ExcelWorkbook(OLE2FragmentParser):
+    BIFF_TYPES={0x000:"DIMENSIONS_v0",
+                0x200:"DIMENSIONS_v2",
+                0x001:"BLANK_v0",
+                0x201:"BLANK_v2",
+                0x002:"INTEGER",
+                0x003:"NUMBER_v0",
+                0x203:"NUMBER_v2",
+                0x004:"LABEL_v0",
+                0x204:"LABEL_v2",
+                0x005:"BOOLERR_v0",
+                0x205:"BOOLERR_v2",
+                0x006:"FORMULA_v0",
+                0x206:"FORMULA_v2",
+                0x406:"FORMULA_v4",
+                0x007:"STRING_v0",
+                0x207:"STRING_v2",
+                0x008:"ROW_v0",
+                0x208:"ROW_v2",
+                0x009:"BOF_v0",
+                0x209:"BOF_v2",
+                0x409:"BOF_v4",
+                0x809:"BOF_v8",
+                0x00a:"EOF",
+                0x00b:"INDEX_v0",
+                0x20b:"INDEX_v2",
+                0x00c:"CALCCOUNT",
+                0x00d:"CALCMODE",
+                0x00e:"PRECISION",
+                0x00f:"REFMODE",
+                0x010:"DELTA",
+                0x011:"ITERATION",
+                0x012:"PROTECT",
+                0x013:"PASSWORD",
+                0x014:"HEADER",
+                0x015:"FOOTER",
+                0x016:"EXTERNCOUNT",
+                0x017:"EXTERNSHEET",
+                0x018:"NAME_v0",
+                0x218:"NAME_v2",
+                0x019:"WINDOWPROTECT",
+                0x01a:"VERTICALPAGEBREAKS",
+                0x01b:"HORIZONTALPAGEBREAKS",
+                0x01c:"NOTE",
+                0x01d:"SELECTION",
+                0x01e:"FORMAT_v0",
+                0x41e:"FORMAT_v4",
+                0x01f:"FORMATCOUNT",	# Undocumented 
+                0x020:"COLUMNDEFAULT",	# Undocumented 
+                0x021:"ARRAY_v0",
+                0x221:"ARRAY_v2",
+                0x022:"1904",
+                0x023:"EXTERNNAME_v0",
+                0x223:"EXTERNNAME_v2",
+                0x024:"COLWIDTH",	# Undocumented 
+                0x025:"DEFAULTROWHEIGHT_v0",
+                0x225:"DEFAULTROWHEIGHT_v2",
+                0x026:"LEFT_MARGIN",
+                0x027:"RIGHT_MARGIN",
+                0x028:"TOP_MARGIN",
+                0x029:"BOTTOM_MARGIN",
+                0x02a:"PRINTHEADERS",
+                0x02b:"PRINTGRIDLINES",
+                0x02f:"FILEPASS",
+                0x031:"FONT_v0",
+                0x231:"FONT_v2",
+                0x032:"FONTCOUNT",	# Undocumented 
+                0x033:"PRINTSIZE",	# Undocumented 
+                0x036:"TABLE_v0",
+                0x236:"TABLE_v2",
+                0x037:"TABLE2",	# OOo has docs 
+                0x038:"WNDESK",	# Undocumented 
+                0x039:"ZOOM",	# Undocumented 
+                0x03a:"BEGINPREF",	# Undocumented 
+                0x03b:"ENDPREF",	# Undocumented 
+                0x03c:"CONTINUE",
+                0x03d:"WINDOW1",
+                0x03e:"WINDOW2_v0",
+                0x23e:"WINDOW2_v2",
+                0x03f:"PANE_V2",	# Undocumented 
+                0x040:"BACKUP",
+                0x041:"PANE",
+                0x042:"CODEPAGE",
+                0x043:"XF_OLD_v0",
+                0x243:"XF_OLD_v2",
+                0x443:"XF_OLD_v4",
+                0x044:"XF_INDEX",
+                0x045:"FONT_COLOR",
+                0x04d:"PLS",
+                0x050:"DCON",
+                0x051:"DCONREF",
+                0x052:"DCONNAME",
+                0x055:"DEFCOLWIDTH",
+                0x059:"XCT",
+                0x05a:"CRN",
+                0x05b:"FILESHARING",
+                0x05c:"WRITEACCESS",
+                0x05d:"OBJ",
+                0x05e:"UNCALCED",
+                0x05f:"SAVERECALC",
+                0x060:"TEMPLATE",
+                0x061:"INTL",	# Undocumented 
+                0x862:"TAB_COLOR",	# Undocumented, OO calls it SHEETLAYOUT 
+                0x063:"OBJPROTECT",
+                0x07d:"COLINFO",
+                0x27e:"RK", # Odd that there is no 0x7e 
+                0x07f:"IMDATA",
+                0x080:"GUTS",
+                0x081:"WSBOOL",
+                0x082:"GRIDSET",
+                0x083:"HCENTER",
+                0x084:"VCENTER",
+                0x085:"BOUNDSHEET",
+                0x086:"WRITEPROT",
+                0x087:"ADDIN",
+                0x088:"EDG",
+                0x089:"PUB",
+                0x08c:"COUNTRY",
+                0x08d:"HIDEOBJ",
+                0x08e:"BUNDLESOFFSET",	# Undocumented 
+                0x08f:"BUNDLEHEADER",	# Undocumented 
+                0x090:"SORT",
+                0x091:"SUB",
+                0x092:"PALETTE",
+                0x293:"STYLE", # Odd that there is no 0x93 
+                0x094:"LHRECORD",
+                0x095:"LHNGRAPH",
+                0x096:"SOUND",
+                0x097:"SYNC",	# Undocumented 
+                0x098:"LPR",
+                0x099:"STANDARDWIDTH",
+                0x09a:"FNGROUPNAME",
+                0x09b:"FILTERMODE",
+                0x09c:"FNGROUPCOUNT",
+                0x09d:"AUTOFILTERINFO",
+                0x09e:"AUTOFILTER",
+                0x0a0:"SCL",
+                0x0a1:"SETUP",
+                0x0a4:"TOOLBARVER",	# Undocumented 
+                0x0a9:"COORDLIST",
+                0x0ab:"GCW",
+                0x0ae:"SCENMAN",
+                0x0af:"SCENARIO",
+                0x0b0:"SXVIEW",
+                0x0b1:"SXVD",
+                0x0b2:"SXVI",
+                0x0b3:"SXSI",	# Undocumented 
+                0x0b4:"SXIVD",
+                0x0b5:"SXLI",
+                0x0b6:"SXPI",
+                0x0b7:"FACENUM",	# Undocumented
+                0x0b8:"DOCROUTE",
+                0x0b9:"RECIPNAME",
+                0x0ba:"SSLIST",	# Undocumented 
+                0x0bb:"MASKIMDATA",	# Undocumented 
+                0x4bc:"SHRFMLA",
+                0x0bd:"MULRK",
+                0x0be:"MULBLANK",
+                0x0bf:"TOOLBARHDR",	# Undocumented 
+                0x0c0:"TOOLBAREND",	# Undocumented 
+                0x0c1:"MMS",
+                0x0c2:"ADDMENU",
+                0x0c3:"DELMENU",
+                0x0c4:"TIPHISTORY",	# Undocumented 
+                0x0c5:"SXDI",
+                0x0c6:"SXDB",
+                0x0c7:"SXFDB",	# guessed 
+                0x0c8:"SXDDB",	# guessed 
+                0x0c9:"SXNUM",	# guessed 
+                0x0ca:"SXBOOL",	# guessed 
+                0x0cb:"SXERR",	# guessed 
+                0x0cc:"SXINT",	# guessed 
+                0x0cd:"SXSTRING",
+                0x0ce:"SXDTR",	# guessed 
+                0x0cf:"SXNIL",	# guessed 
+                0x0d0:"SXTBL",
+                0x0d1:"SXTBRGIITM",
+                0x0d2:"SXTBPG",
+                0x0d3:"OBPROJ",
+                0x0d5:"SXIDSTM",
+                0x0d6:"RSTRING",
+                0x0d7:"DBCELL",
+                0x0d8:"SXNUMGROUP",	# from OO : numerical grouping in pivot cache field 
+                0x0da:"BOOKBOOL",
+                0x0dc:"PARAMQRY",	# DUPLICATE dc 
+                0x0dc:"SXEXT",	# DUPLICATE dc 
+                0x0dd:"SCENPROTECT",
+                0x0de:"OLESIZE",
+                0x0df:"UDDESC",
+                0x0e0:"XF",
+                0x0e1:"INTERFACEHDR",
+                0x0e2:"INTERFACEEND",
+                0x0e3:"SXVS",
+                0x0e5:"MERGECELLS",	# guessed 
+                0x0e9:"BG_PIC",	# Undocumented 
+                0x0ea:"TABIDCONF",
+                0x0eb:"MS_O_DRAWING_GROUP",
+                0x0ec:"MS_O_DRAWING",
+                0x0ed:"MS_O_DRAWING_SELECTION",
+                0x0ef:"PHONETIC",	# semi-Undocumented 
+                0x0f0:"SXRULE",
+                0x0f1:"SXEX",
+                0x0f2:"SXFILT",
+                0x0f6:"SXNAME",
+                0x0f7:"SXSELECT",
+                0x0f8:"SXPAIR",
+                0x0f9:"SXFMLA",
+                0x0fb:"SXFORMAT",
+                0x0fc:"SST",
+                0x0fd:"LABELSST",
+                0x0ff:"EXTSST",
+                0x100:"SXVDEX",
+                0x103:"SXFORMULA",
+                0x122:"SXDBEX",
+                0x137:"CHTRINSERT",
+                0x138:"CHTRINFO",
+                0x13B:"CHTRCELLCONTENT",
+                0x13d:"TABID",
+                0x140:"CHTRMOVERANGE",
+                0x14D:"CHTRINSERTTAB",
+                0x15F:"LABELRANGES",
+                0x160:"USESELFS",
+                0x161:"DSF",
+                0x162:"XL5MODIFY",
+                0x196:"CHTRHEADER",
+                0x1a5:"FILESHARING2",
+                0x1a9:"USERDBVIEW",
+                0x1aa:"USERSVIEWBEGIN",
+                0x1ab:"USERSVIEWEND",
+                0x1ad:"QSI",
+                0x1ae:"SUPBOOK",
+                0x1af:"PROT4REV",
+                0x1b0:"CONDFMT",
+                0x1b1:"CF",
+                0x1b2:"DVAL",
+                0x1b5:"DCONBIN",
+                0x1b6:"TXO",
+                0x1b7:"REFRESHALL",
+                0x1b8:"HLINK",
+                0x1ba:"CODENAME",	# TYPO in MS Docs 
+                0x1bb:"SXFDBTYPE",
+                0x1bc:"PROT4REVPASS",
+                0x1be:"DV",
+                0x1c0:"XL9FILE",
+                0x1c1:"RECALCID",
+                0x800:"LINK_TIP",	# follows an hlink 
+                0x802:"UNKNOWN_802",	# OO exports it but has not name or docs 
+                0x803:"WQSETT",	# OO named it and can export it, but does not include it in the docs 
+                0x804:"WQTABLES",	# OO named it and can export it, but does not include it in the docs 
+                0x805:"UNKNOWN_805",	# No name or docs, seems related to web query see #153260 for sample 
+                0x810:"PIVOT_AUTOFORMAT",	# Seems to contain pivot table autoformat indicies, plus ?? 
+                0x864:"UNKNOWN_864",	# seems related to pivot tables 
+                0x867:"SHEETPROTECTION",	# OO named it, and has docs 
+                0x868:"RANGEPROTECTION",	# OO named it, no docs yet 
+
+                0x1001:"CHART_units",
+                0x1002:"CHART_chart",
+                0x1003:"CHART_series",
+                0x1006:"CHART_dataformat",
+                0x1007:"CHART_lineformat",
+                0x1009:"CHART_markerformat",
+                0x100a:"CHART_areaformat",
+                0x100b:"CHART_pieformat",
+                0x100c:"CHART_attachedlabel",
+                0x100d:"CHART_seriestext",
+                0x1014:"CHART_chartformat",
+                0x1015:"CHART_legend",
+                0x1016:"CHART_serieslist",
+                0x1017:"CHART_bar",
+                0x1018:"CHART_line",
+                0x1019:"CHART_pie",
+                0x101a:"CHART_area",
+                0x101b:"CHART_scatter",
+                0x101c:"CHART_chartline",
+                0x101d:"CHART_axis",
+                0x101e:"CHART_tick",
+                0x101f:"CHART_valuerange",
+                0x1020:"CHART_catserrange",
+                0x1021:"CHART_axislineformat",
+                0x1022:"CHART_chartformatlink",
+                0x1024:"CHART_defaulttext",
+                0x1025:"CHART_text",
+                0x1026:"CHART_fontx",
+                0x1027:"CHART_objectlink",
+                0x1032:"CHART_frame",
+                0x1033:"CHART_begin",
+                0x1034:"CHART_end",
+                0x1035:"CHART_plotarea",
+                0x103a:"CHART_3d",
+                0x103c:"CHART_picf",
+                0x103d:"CHART_dropbar",
+                0x103e:"CHART_radar",
+                0x103f:"CHART_surf",
+                0x1040:"CHART_radararea",
+                0x1041:"CHART_axisparent",
+                0x1043:"CHART_legendxn",
+                0x1044:"CHART_shtprops",
+                0x1045:"CHART_sertocrt",
+                0x1046:"CHART_axesused",
+                0x1048:"CHART_sbaseref",
+                0x104a:"CHART_serparent",
+                0x104b:"CHART_serauxtrend",
+                0x104e:"CHART_ifmt",
+                0x104f:"CHART_pos",
+                0x1050:"CHART_alruns",
+                0x1051:"CHART_ai",
+                0x105b:"CHART_serauxerrbar",
+                0x105c:"CHART_clrtclient",	# Undocumented 
+                0x105d:"CHART_serfmt",
+                0x105f:"CHART_3dbarshape",	# Undocumented 
+                0x1060:"CHART_fbi",
+                0x1061:"CHART_boppop",
+                0x1062:"CHART_axcext",
+                0x1063:"CHART_dat",
+                0x1064:"CHART_plotgrowth",
+                0x1065:"CHART_siindex",
+                0x1066:"CHART_gelframe",
+                0x1067:"CHART_boppopcustom",}
+    class BIFF(FieldSet):
+        def createFields(self):
+            yield Enum(UInt16(self, "type"),ExcelWorkbook.BIFF_TYPES)
+            yield UInt16(self, "length")
+            if self["length"].value:
+                yield RawBytes(self, "data", self["length"].value)
+        def createDescription(self):
+            return "Excel BIFF; type %s"%self["type"].display
+    def createFields(self):
+        pos=0
+        while pos//8 < self.datasize:
+            newobj=ExcelWorkbook.BIFF(self, "BIFF[]")
+            yield newobj
+            pos+=newobj.size
+
+class ThumbsCatalog(OLE2FragmentParser):
+    class ThumbsEntry(FieldSet):
+        def createFields(self):
+            yield UInt32(self, "size")
+            yield UInt32(self, "index")
+            yield Bits(self, "flags", 8)
+            yield RawBytes(self, "unknown[]", 5)
+            yield UInt16(self, "unknown[]")
+            yield CString(self, "name", charset="UTF-16-LE")
+            if self.current_size // 8 != self['size'].value:
+                yield RawBytes(self, "padding", self['size'].value - self.current_size // 8)
+        def createDescription(self):
+            return "Thumbnail entry for %s"%self["name"].display
+
+    def createFields(self):
+        yield UInt16(self, "unknown[]")
+        yield UInt16(self, "unknown[]")
+        yield UInt32(self, "count")
+        yield UInt32(self, "unknown[]")
+        yield UInt32(self, "unknown[]")
+        for i in xrange(self['count'].value):
+            yield ThumbsCatalog.ThumbsEntry(self, "entry[]")
+
+# Maps an OLE2 property name to a (field name prefix, fragment parser class)
+# pair. Names missing from this table are looked up with a LookupError
+# fallback by the OLE2 parser, which then uses a raw parser instead.
+PROPERTY_NAME = {
+    u"Root Entry": ("root",RootEntry),
+    u"\5DocumentSummaryInformation": ("doc_summary",Summary),
+    u"\5SummaryInformation": ("summary",Summary),
+    u"\1CompObj": ("compobj",CompObj),
+    u"Pictures": ("pictures",Pictures),
+    u"PowerPoint Document": ("powerpointdoc",PowerPointDocument),
+    u"Current User": ("current_user",CurrentUser),
+    u"Workbook": ("workbook",ExcelWorkbook),
+    u"Catalog": ("catalog",ThumbsCatalog),
+    u"WordDocument": ("word_doc",WordDocumentParser),
+    u"0Table": ("table0",WordTableParser),
+    u"1Table": ("table1",WordTableParser),
+}

hachoir-parser/hachoir_parser/misc/msoffice_summary.py

  - Apache POI (HPSF Internals):
    http://poi.apache.org/hpsf/internals.html
 """
+from hachoir_core.endian import BIG_ENDIAN,LITTLE_ENDIAN
 from hachoir_parser import HachoirParser
 from hachoir_core.field import (FieldSet, ParserError,
-    RootSeekableFieldSet, SeekableFieldSet,
+    SeekableFieldSet,
     Bit, Bits, NullBits,
     UInt8, UInt16, UInt32, TimestampWin64, TimedeltaWin64, Enum,
-    Bytes, RawBytes, NullBytes, String,
+    Bytes, RawBytes, NullBytes, PaddingBits, String,
     Int8, Int32, Float32, Float64, PascalString32)
 from hachoir_core.text_handler import textHandler, hexadecimal, filesizeHandler
-from hachoir_core.tools import createDict
-from hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN
+from hachoir_core.tools import createDict, paddingSize
 from hachoir_parser.common.win32 import GUID, PascalStringWin32, CODEPAGE_CHARSET
 from hachoir_parser.image.bmp import BmpHeader, parseImageData
+from hachoir_parser.misc.ole2_util import OLE2FragmentParser
 
 MAX_SECTION_COUNT = 100
 
             yield RawBytes(self, "data", size)
 
 class PropertyContent(FieldSet):
+    class NullHandler(FieldSet):
+        def createFields(self):
+            yield UInt32(self, "unknown[]")
+            yield PascalString32(self, "data")
+        def createValue(self):
+            return self["data"].value
+    class BlobHandler(FieldSet):
+        def createFields(self):
+            self.osconfig = self.parent.osconfig
+            yield UInt32(self, "size")
+            yield UInt32(self, "count")
+            for i in range(self["count"].value):
+                yield PropertyContent(self, "item[]")
+                n=paddingSize(self.current_size,32)
+                if n: yield PaddingBits(self, "padding[]", n)
+    class WidePascalString32(FieldSet):
+        ''' uses number of characters instead of number of bytes '''
+        def __init__(self,parent,name,charset='ASCII'):
+            FieldSet.__init__(self,parent,name)
+            self.charset=charset
+        def createFields(self):
+            yield UInt32(self, "length", "Length of this string")
+            yield String(self, "data", self["length"].value*2, charset=self.charset)
+        def createValue(self):
+            return self["data"].value
+        def createDisplay(self):
+            return 'u'+self["data"].display
     TYPE_LPSTR = 30
     TYPE_INFO = {
         0: ("EMPTY", None),
-        1: ("NULL", None),
+        1: ("NULL", NullHandler),
         2: ("UInt16", UInt16),
         3: ("UInt32", UInt32),
         4: ("Float32", Float32),
         28: ("CARRAY", None),
         29: ("USERDEFINED", None),
         30: ("LPSTR", PascalString32),
-        31: ("LPWSTR", PascalString32),
+        31: ("LPWSTR", WidePascalString32),
         64: ("FILETIME", TimestampWin64),
-        65: ("BLOB", None),
+        65: ("BLOB", BlobHandler),
         66: ("STREAM", None),
         67: ("STORAGE", None),
         68: ("STREAMED_OBJECT", None),
         kw = {}
         try:
             handler = self.TYPE_INFO[tag][1]
-            if handler == PascalString32:
-                osconfig = self.osconfig
+            if handler in (self.WidePascalString32,PascalString32):
+                cur = self
+                while not hasattr(cur,'osconfig'):
+                    cur=cur.parent
+                    if cur is None:
+                        raise LookupError('Cannot find osconfig')
+                osconfig = cur.osconfig
                 if tag == self.TYPE_LPSTR:
                     kw["charset"] = osconfig.charset
                 else:
         except LookupError:
             handler = None
         if not handler:
-            raise ParserError("OLE2: Unable to parse property of type %s" \
+            log.warning("OLE2: Unable to parse property of type %s" \
                 % self["type"].display)
-        if self["is_vector"].value:
+            # raise ParserError(
+        elif self["is_vector"].value:
             yield UInt32(self, "count")
             for index in xrange(self["count"].value):
                 yield handler(self, "item[]", **kw)
         yield String(self, "name", 16)
         yield UInt32(self, "offset")
 
-class BaseSummary:
-    endian = LITTLE_ENDIAN
+class Summary(OLE2FragmentParser):
+    ENDIAN_CHECK=True
 
-    def __init__(self):
-        if self["endian"].value == "\xFF\xFE":
-            self.endian = BIG_ENDIAN
-        elif self["endian"].value == "\xFE\xFF":
-            self.endian = LITTLE_ENDIAN
-        else:
-            raise ParserError("OLE2: Invalid endian value")
-        self.osconfig = OSConfig(self["os_type"].value == OS_MAC)
+    def __init__(self, stream, **args):
+        OLE2FragmentParser.__init__(self, stream, **args)
+        #self.osconfig = OSConfig(self["os_type"].value == OS_MAC)
+        self.osconfig = OSConfig(self.endian == BIG_ENDIAN)
 
     def createFields(self):
-        yield Bytes(self, "endian", 2, "Endian (0xFF 0xFE for Intel)")
+        yield Bytes(self, "endian", 2, "Endian (\\xfe\\xff for little endian)")
         yield UInt16(self, "format", "Format (0)")
         yield UInt8(self, "os_version")
         yield UInt8(self, "os_revision")
         if 0 < size:
             yield NullBytes(self, "end_padding", size)
 
-class SummaryParser(BaseSummary, HachoirParser, RootSeekableFieldSet):
-    PARSER_TAGS = {
-        "description": "Microsoft Office summary",
-    }
+class CompObj(OLE2FragmentParser):
+    ENDIAN_CHECK=True
 
-    def __init__(self, stream, **kw):
-        RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
-        HachoirParser.__init__(self, stream, **kw)
-        BaseSummary.__init__(self)
-
-    def validate(self):
-        return True
-
-class SummaryFieldSet(BaseSummary, FieldSet):
-    def __init__(self, parent, name, description=None, size=None):
-        FieldSet.__init__(self, parent, name, description=description, size=size)
-        BaseSummary.__init__(self)
-
-class CompObj(FieldSet):
-    OS_VERSION = {
-        0x0a03: "Windows 3.1",
-    }
+    def __init__(self, stream, **args):
+        OLE2FragmentParser.__init__(self, stream, **args)
+        self.osconfig = OSConfig(self["os"].value == OS_MAC)
+        
     def createFields(self):
         # Header
         yield UInt16(self, "version", "Version (=1)")
-        yield textHandler(UInt16(self, "endian", "Endian (0xFF 0xFE for Intel)"), hexadecimal)
+        yield Bytes(self, "endian", 2, "Endian (\\xfe\\xff for little endian)")
         yield UInt8(self, "os_version")
         yield UInt8(self, "os_revision")
-        yield Enum(UInt16(self, "os_type"), OS_NAME)
+        yield Enum(UInt16(self, "os"), OS_NAME)
         yield Int32(self, "unused", "(=-1)")
         yield GUID(self, "clsid")
 
         yield PascalString32(self, "user_type", strip="\0")
 
         # Clipboard format
-        if self["os_type"].value == OS_MAC:
+        if self["os"].value == OS_MAC:
             yield Int32(self, "unused[]", "(=-2)")
             yield String(self, "clipboard_format", 4)
         else:
             yield PascalString32(self, "clipboard_format", strip="\0")
-        if self.current_size == self.size:
+        if self._current_size // 8 == self.datasize:
             return
 
         #-- OLE 2.01 ---
         # Program ID
         yield PascalString32(self, "prog_id", strip="\0")
 
-        if self["os_type"].value != OS_MAC:
+        if self["os"].value != OS_MAC:
             # Magic number
             yield textHandler(UInt32(self, "magic", "Magic number (0x71B239F4)"), hexadecimal)
 
             yield PascalStringWin32(self, "clipboard_format_unicode", strip="\0")
             yield PascalStringWin32(self, "prog_id_unicode", strip="\0")
 
-        size = (self.size - self.current_size) // 8
+        size = self.datasize - (self._current_size // 8) # _current_size because current_size returns _current_max_size
         if size:
             yield NullBytes(self, "end_padding", size)
 
+        if self.datasize<self.size//8: yield RawBytes(self,"slack_space",(self.size//8)-self.datasize)

hachoir-parser/hachoir_parser/misc/ole2.py

 """
 Microsoft Office documents parser.
+OLE2 files are also used by many other programs to store data.
 
 Informations:
 * wordole.c of AntiWord program (v0.35)
 from hachoir_core.field import (
     FieldSet, ParserError, SeekableFieldSet, RootSeekableFieldSet,
     UInt8, UInt16, UInt32, UInt64, TimestampWin64, Enum,
-    Bytes, RawBytes, NullBytes, String)
+    Bytes, NullBytes, String)
 from hachoir_core.text_handler import filesizeHandler
-from hachoir_core.endian import LITTLE_ENDIAN
+from hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN
 from hachoir_parser.common.win32 import GUID
-from hachoir_parser.misc.msoffice import CustomFragment, OfficeRootEntry, PROPERTY_NAME
-from hachoir_parser.misc.word_doc import WordDocumentParser
-from hachoir_parser.misc.msoffice_summary import SummaryParser
+from hachoir_parser.misc.msoffice import PROPERTY_NAME, RootEntry, RawParser, CustomFragment
 
 MIN_BIG_BLOCK_LOG2 = 6   # 512 bytes
 MAX_BIG_BLOCK_LOG2 = 14  # 64 kB
         for index in xrange(NB_DIFAT):
             yield SECT(self, "index[%u]" % index)
 
-        for index in xrange(self.count):
+        difat_sect = self.start
+        index = NB_DIFAT
+        entries_per_sect = self.parent.sector_size / 32 - 1
+        for ctr in xrange(self.count):
             # this is relative to real DIFAT start
-            self.seekBit(NB_DIFAT * SECT.static_size+self.parent.sector_size*(self.start+index))
-            for sect_index in xrange(NB_DIFAT*(index+1),NB_DIFAT*(index+2)):
-                yield SECT(self, "index[%u]" % sect_index)
+            self.seekBit(NB_DIFAT*SECT.static_size + self.parent.sector_size*difat_sect)
+            for sect_index in xrange(entries_per_sect):
+                yield SECT(self, "index[%u]" % (index+sect_index))
+            index += entries_per_sect
+            next = SECT(self, "difat[%u]" % ctr)
+            yield next
+            difat_sect = next.value
 
 class Header(FieldSet):
     static_size = 68 * 8
     def createFields(self):
         yield GUID(self, "clsid", "16 bytes GUID used by some apps")
         yield UInt16(self, "ver_min", "Minor version")
-        yield UInt16(self, "ver_maj", "Minor version")
-        yield Bytes(self, "endian", 2, "Endian (0xFFFE for Intel)")
+        yield UInt16(self, "ver_maj", "Major version")
+        yield Bytes(self, "endian", 2, "Endian (\\xfe\\xff for little endian)")
         yield UInt16(self, "bb_shift", "Log, base 2, of the big block size")
         yield UInt16(self, "sb_shift", "Log, base 2, of the small block size")
         yield NullBytes(self, "reserved[]", 6, "(reserved)")
         "id": "ole2",
         "category": "misc",
         "file_ext": (
+            "db",                        # Thumbs.db
             "doc", "dot",                # Microsoft Word
             "ppt", "ppz", "pps", "pot",  # Microsoft Powerpoint
             "xls", "xla",                # Microsoft Excel
         # Parse first property
         for index, property in enumerate(self.properties):
             if index == 0:
-                name = "root"
+                name, parser = 'root', RootEntry
             else:
                 try:
-                    name = PROPERTY_NAME[property["name"].value]
+                    name, parser = PROPERTY_NAME[property["name"].value]
                 except LookupError:
                     name = property.name+"content"
-            for field in self.parseProperty(property, name):
+                    parser = RawParser
+            for field in self.parseProperty(property, name, parser):
                 yield field
 
-    def parseProperty(self, property, name_prefix):
+    def parseProperty(self, property, name_prefix, parser=RawParser):
         if not property["size"].value:
             return
-        if property.name != "property[0]" \
-        and (property["size"].value < self["header/threshold"].value):
-            # Field is stored in the ministream, skip it
+        if property["size"].value < self["header/threshold"].value and name_prefix!='root':
             return
         name = "%s[]" % name_prefix
         first = None
             try:
                 block = chain.next()
                 contiguous = False
-                if not first:
+                if first is None:
                     first = block
                     contiguous = True
-                if previous and block == (previous+1):
+                if previous is not None and block == (previous+1):
                     contiguous = True
                 if contiguous:
                     previous = block
             self.seekBlock(first)
             desc = "Big blocks %s..%s (%s)" % (first, previous, previous-first+1)
             desc += " of %s bytes" % (self.sector_size // 8)
-            if name_prefix in set(("root", "summary", "doc_summary", "word_doc")):
-                if name_prefix == "root":
-                    parser = OfficeRootEntry
-                elif name_prefix == "word_doc":
-                    parser = WordDocumentParser
-                else:
-                    parser = SummaryParser
-                field = CustomFragment(self, name, size, parser, desc, fragment_group)
-                yield field
-                if not fragment_group:
-                    fragment_group = field.group
-            else:
-                yield RawBytes(self, name, size//8, desc)
+            field = CustomFragment(self, name, size, parser, desc, fragment_group)
+            if not fragment_group:
+                fragment_group = field.group
+                fragment_group.args["datasize"] = property["size"].value
+                fragment_group.args["ole2name"] = property["name"].value
+            yield field
             if block is None:
                 break
             first = block
             index = block // items_per_fat
             try:
                 block = fat[index]["index[%u]" % block].value
-            except LookupError:
+            except LookupError, err:
                 break
 
     def readBFAT(self):

hachoir-parser/hachoir_parser/misc/ole2_util.py

+from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN
+from hachoir_core.field import RawBytes, RootSeekableFieldSet, ParserError
+from hachoir_parser import HachoirParser
+
+class OLE2FragmentParser(HachoirParser,RootSeekableFieldSet):
+    tags = {
+        "description": "Microsoft Office document subfragments",
+    }
+    endian = LITTLE_ENDIAN
+
+    ENDIAN_CHECK=False
+
+    def __init__(self, stream, **args):
+        RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
+        HachoirParser.__init__(self, stream, **args)
+        if self.ENDIAN_CHECK:
+            if self["endian"].value == "\xFF\xFE":
+                self.endian = BIG_ENDIAN
+            elif self["endian"].value == "\xFE\xFF":
+                self.endian = LITTLE_ENDIAN
+            else:
+                raise ParserError("OLE2: Invalid endian value")
+
+    def validate(self):
+        if self.ENDIAN_CHECK:
+            if self["endian"].value not in ["\xFF\xFE", "\xFE\xFF"]:
+                return "Unknown endian value %s"%self["endian"].value.encode('hex')
+        return True
+
+class RawParser(OLE2FragmentParser):
+    ENDIAN_CHECK=False
+    OS_CHECK=False
+    def createFields(self):
+        yield RawBytes(self,"rawdata",self.datasize)
+        if self.datasize<self.size//8: yield RawBytes(self,"slack_space",(self.size//8)-self.datasize)

hachoir-parser/hachoir_parser/misc/word_2.py

+"""
+Documents:
+
+* "Microsoft Word for Windows 2.0 Binary Format"
+   http://www.wotsit.org/download.asp?f=word2&sc=275927573
+"""
+
+from hachoir_core.field import (FieldSet, Enum,
+    Bit, Bits,
+    UInt8, Int16, UInt16, UInt32, Int32,
+    NullBytes, Bytes, RawBytes, PascalString16,
+    DateTimeMSDOS32, TimeDateMSDOS32)
+from hachoir_core.endian import LITTLE_ENDIAN
+from hachoir_parser.misc.ole2_util import OLE2FragmentParser
+from hachoir_core.tools import paddingSize
+from hachoir_parser.common.win32_lang_id import LANGUAGE_ID
+TIMESTAMP = DateTimeMSDOS32
+
+class FC_CB(FieldSet):
+    def createFields(self):
+        yield Int32(self, "fc", "File Offset")
+        yield UInt16(self, "cb", "Byte Count")
+    def createValue(self):
+        return (self['fc'].value,self['cb'].value)
+
+class FIB(FieldSet):
+    """File Information Block of a Word 2.0 document.
+
+    The fields are yielded strictly in on-disk order, so the order of the
+    statements below is the binary layout and must not be changed.
+    """
+    def createFields(self):
+        # Identification and version words.
+        yield UInt16(self, "wIdent", "Magic Number")
+        yield UInt16(self, "nFib", "File Information Block (FIB) Version")
+        yield UInt16(self, "nProduct", "Product Version")
+        yield Enum(UInt16(self, "lid", "Language ID"), LANGUAGE_ID)
+        yield Int16(self, "pnNext")
+
+        # Flag bits: 4 + 4 + 1 + 7 = 16 bits in total.
+        yield Bit(self, "fDot", "Is the document a document template?")
+        yield Bit(self, "fGlsy", "Is the document a glossary?")
+        yield Bit(self, "fComplex", "Is the document in Complex format?")
+        yield Bit(self, "fHasPic", "Does the document have embedded images?")
+        yield Bits(self, "cQuickSaves", 4, "Number of times the document was quick-saved")
+        yield Bit(self, "fEncrypted", "Is the document encrypted?")
+        yield Bits(self, "reserved[]", 7)
+
+        yield UInt16(self, "nFibBack")
+        yield UInt32(self, "reserved[]")
+        yield NullBytes(self, "rgwSpare", 6)
+
+        # File offsets delimiting the text.
+        yield UInt32(self, "fcMin", "File offset of first text character")
+        yield UInt32(self, "fcMax", "File offset of last text character + 1")
+        yield Int32(self, "cbMax", "File offset of last byte + 1")
+        yield NullBytes(self, "fcSpare", 16)
+
+        # Lengths of the text streams.
+        yield UInt32(self, "ccpText", "Length of main document text stream")
+        yield Int32(self, "ccpFtn", "Length of footnote subdocument text stream")
+        yield Int32(self, "ccpHdr", "Length of header subdocument text stream")
+        yield Int32(self, "ccpMcr", "Length of macro subdocument text stream")
+        yield Int32(self, "ccpAtn", "Length of annotation subdocument text stream")
+        yield NullBytes(self, "ccpSpare", 16)
+
+        # Table of (file offset, byte count) pairs, one per document structure.
+        yield FC_CB(self, "StshfOrig", "Original STSH allocation")
+        yield FC_CB(self, "Stshf", "Current STSH allocation")
+        yield FC_CB(self, "PlcffndRef", "Footnote reference PLC")
+        yield FC_CB(self, "PlcffndTxt", "Footnote text PLC")
+        yield FC_CB(self, "PlcfandRef", "Annotation reference PLC")
+        yield FC_CB(self, "PlcfandTxt", "Annotation text PLC")
+        yield FC_CB(self, "Plcfsed", "Section descriptor PLC")
+        yield FC_CB(self, "Plcfpgd", "Page descriptor PLC")
+        yield FC_CB(self, "Plcfphe", "Paragraph heights PLC")
+        yield FC_CB(self, "Sttbfglsy", "Glossary string table")
+        yield FC_CB(self, "Plcfglsy", "Glossary PLC")
+        yield FC_CB(self, "Plcfhdd", "Header PLC")
+        yield FC_CB(self, "PlcfbteChpx", "Character property bin table PLC")
+        yield FC_CB(self, "PlcfbtePapx", "Paragraph property bin table PLC")
+        yield FC_CB(self, "Plcfsea", "Private Use PLC")
+        yield FC_CB(self, "Sttbfffn")
+        yield FC_CB(self, "PlcffldMom")
+        yield FC_CB(self, "PlcffldHdr")
+        yield FC_CB(self, "PlcffldFtn")
+        yield FC_CB(self, "PlcffldAtn")
+        yield FC_CB(self, "PlcffldMcr")
+        yield FC_CB(self, "Sttbfbkmk")
+        yield FC_CB(self, "Plcfbkf")
+        yield FC_CB(self, "Plcfbkl")
+        yield FC_CB(self, "Cmds")
+        yield FC_CB(self, "Plcmcr")
+        yield FC_CB(self, "Sttbfmcr")
+        yield FC_CB(self, "PrDrvr", "Printer Driver information")
+        yield FC_CB(self, "PrEnvPort", "Printer environment for Portrait mode")
+        yield FC_CB(self, "PrEnvLand", "Printer environment for Landscape mode")
+        yield FC_CB(self, "Wss", "Window Save State")
+        yield FC_CB(self, "Dop", "Document Property data")
+        yield FC_CB(self, "SttbfAssoc")
+        yield FC_CB(self, "Clx", "'Complex' file format data")
+        yield FC_CB(self, "PlcfpgdFtn", "Footnote page descriptor PLC")
+        yield FC_CB(self, "AutosaveSource", "Original filename for Autosave purposes")
+        yield FC_CB(self, "Spare5")
+        yield FC_CB(self, "Spare6")
+
+        # Trailing words; pnChpFirst is later multiplied by 512 by
+        # Word2DocumentParser to locate the end of the SEPX area.
+        yield Int16(self, "wSpare4")
+        yield UInt16(self, "pnChpFirst")
+        yield UInt16(self, "pnPapFirst")
+        yield UInt16(self, "cpnBteChp", "Count of CHPX FKPs recorded in file")
+        yield UInt16(self, "cpnBtePap", "Count of PAPX FKPs recorded in file")
+
+class SEPX(FieldSet):
+    def createFields(self):
+        yield UInt8(self, "size")
+        self._size=(self['size'].value+1)*8
+        yield RawBytes(self, "raw[]", self['size'].value)
+
+class SEPXGroup(FieldSet):
+    def __init__(self, parent, name, size, description=None):
+        FieldSet.__init__(self, parent, name, description=description)
+        self._size=size*8
+    def createFields(self):
+        while self.current_size < self.size:
+            next=self.stream.readBytes(self.absolute_address+self.current_size,1)
+            if next=='\x00':
+                padding = paddingSize((self.absolute_address + self.current_size)//8, 512)
+                if padding:
+                    yield NullBytes(self, "padding[]", padding)
+                if self.current_size >= self.size: break
+            yield SEPX(self, "sepx[]")
+
+class Word2DocumentParser(OLE2FragmentParser):
+    MAGIC='\xdb\xa5' # 42459
+    PARSER_TAGS = {
+        "id": "word_v2_document",
+        "min_size": 8,
+        "magic": ((MAGIC, 0),),
+        "file_ext": ("doc",),
+        "description": "Microsoft Office Word Version 2.0 document",
+    }
+    endian = LITTLE_ENDIAN
+
+    def __init__(self, stream, **args):
+        OLE2FragmentParser.__init__(self, stream, **args)
+
+    def validate(self):
+        if self.stream.readBytes(0,2) != self.MAGIC:
+            return "Invalid magic."
+        if self['FIB/nFib'].value not in (45,):
+            return "Unknown FIB version."
+        return True
+
+    def createFields(self):
+        yield FIB(self, "FIB", "File Information Block")
+        
+        padding = (self['FIB/fcMin'].value - self.current_size//8)
+        if padding:
+            yield NullBytes(self, "padding[]", padding)
+        if self['FIB/ccpText'].value:
+            yield Bytes(self, "text", self['FIB/ccpText'].value)
+        if self['FIB/ccpFtn'].value:
+            yield Bytes(self, "text_footnote", self['FIB/ccpFtn'].value)
+        if self['FIB/ccpHdr'].value:
+            yield Bytes(self, "text_header", self['FIB/ccpHdr'].value)
+        if self['FIB/ccpMcr'].value:
+            yield Bytes(self, "text_macro", self['FIB/ccpMcr'].value)
+        if self['FIB/ccpAtn'].value:
+            yield Bytes(self, "text_annotation", self['FIB/ccpAtn'].value)
+
+        padding = (self['FIB/fcMax'].value - self.current_size//8)
+        if padding:
+            yield RawBytes(self, "padding[]", padding)
+        
+        sepx_size = (self['FIB/pnChpFirst'].value*512 - self.current_size//8)
+        if sepx_size:
+            yield SEPXGroup(self, "sepx", sepx_size)
+

hachoir-parser/hachoir_parser/misc/word_doc.py

    section. Revised Dec 21 1998, added missing Document Properties (section).
 """
 
-from hachoir_parser import Parser
-from hachoir_core.field import (FieldSet,
+from hachoir_core.field import (FieldSet, Enum,
     Bit, Bits,
     UInt8, Int16, UInt16, UInt32, Int32,
-    NullBytes, RawBytes, PascalString16,
-    DateTimeMSDOS32)
+    NullBytes, Bytes, RawBytes, PascalString8, PascalString16, CString, String,
+    TimestampMac32, TimestampWin64)
+from hachoir_core.text_handler import displayHandler
 from hachoir_core.endian import LITTLE_ENDIAN
+from hachoir_parser import guessParser
+from hachoir_parser.misc.ole2_util import OLE2FragmentParser
+from hachoir_parser.common.win32_lang_id import LANGUAGE_ID
 
-TIMESTAMP = DateTimeMSDOS32
+# Display names for the 16-bit creator IDs stored in the wMagicCreated and
+# wMagicRevised fields of ShortArray.
+CREATOR_ID={0x6A62: "Microsoft Word"}
+class ShortArray(FieldSet):
+    def createFields(self):
+        yield UInt16(self, "csw", "Count of fields in the array of shorts")
+        self._size = self['csw'].value*16+16
+        yield Enum(UInt16(self, "wMagicCreated", "File creator ID"), CREATOR_ID)
+        yield Enum(UInt16(self, "wMagicRevised", "File last modifier ID"), CREATOR_ID)
+        yield UInt16(self, "wMagicCreatePrivate")
+        yield UInt16(self, "wMagicCreatedPrivate")
+        yield UInt16(self, "pnFbpChpFirst_W6")
+        yield UInt16(self, "pnChpFirst_W6")
+        yield UInt16(self, "cpnBteChp_W6")
+        yield UInt16(self, "pnFbpPapFirst_W6")
+        yield UInt16(self, "pnPapFirst_W6")
+        yield UInt16(self, "cpnBtePap_W6")
+        yield UInt16(self, "pnFbpLvcFirst_W6")
+        yield UInt16(self, "pnLvcFirst_W6")
+        yield UInt16(self, "cpnBteLvc_W6")
+        yield Enum(UInt16(self, "lidFE", "Language ID if a Far East version of Word was used"), LANGUAGE_ID)
+        while self.current_size < self.size:
+            yield Int16(self, "unknown[]")
 
-class BaseWordDocument:
+def buildDateHandler(v):
+    md,y=divmod(v,100)
+    m,d=divmod(md,100)
+    if y < 60: y=2000+y
+    else: y=1900+y
+    return "%04i-%02i-%02i"%(y,m,d)
+
+class LongArray(FieldSet):
+    # Counted array of 32-bit fields: a 16-bit count 'clw', the named
+    # entries below, then anonymous Int32 'unknown[]' entries up to the
+    # declared size.
     def createFields(self):
-        yield UInt16(self, "wIdent", 2)
-        yield UInt16(self, "nFib")
-        yield UInt16(self, "nProduct")
-        yield UInt16(self, "lid")
+        yield UInt16(self, "clw", "Count of fields in the array of longs")
+        # Size in bits: 'clw' entries of 32 bits plus the 16-bit count itself.
+        self._size = self['clw'].value*32+16
+        yield Int32(self, "cbMax", "Stream offset of last byte + 1")
+        # The build dates are displayed through buildDateHandler.
+        yield displayHandler(UInt32(self, "lProductCreated", "Date when the creator program was built"),buildDateHandler)
+        yield displayHandler(UInt32(self, "lProductRevised", "Date when the last modifier program was built"),buildDateHandler)
+
+        yield UInt32(self, "ccpText", "Length of main document text stream")
+        yield Int32(self, "ccpFtn", "Length of footnote subdocument text stream")
+        yield Int32(self, "ccpHdr", "Length of header subdocument text stream")
+        yield Int32(self, "ccpMcr", "Length of macro subdocument text stream")
+        yield Int32(self, "ccpAtn", "Length of annotation subdocument text stream")
+        yield Int32(self, "ccpEdn", "Length of endnote subdocument text stream")
+        yield Int32(self, "ccpTxbx", "Length of textbox subdocument text stream")
+        yield Int32(self, "ccpHdrTxbx", "Length of header textbox subdocument text stream")
+        yield Int32(self, "pnFbpChpFirst", "Start of CHPX (Character Property) sector chain (sector = 512-byte 'page')")
+        yield Int32(self, "pnChpFirst", "First CHPX sector")
+        yield Int32(self, "cpnBteChp", "Number of CHPX sectors in the file")
+        yield Int32(self, "pnFbpPapFirst", "Start of PAPX (Paragraph Property) sector chain")
+        yield Int32(self, "pnPapFirst", "First PAPX sector")
+        yield Int32(self, "cpnBtePap", "Number of PAPX sectors in the file")
+        yield Int32(self, "pnFbpLvcFirst", "Start of LVC sector chain")
+        yield Int32(self, "pnLvcFirst", "First LVC sector")
+        yield Int32(self, "cpnBteLvc", "Number of LVC sectors in the file")
+        yield Int32(self, "fcIslandFirst")
+        yield Int32(self, "fcIslandLim")
+        # Fill the rest of the declared array with anonymous signed longs.
+        while self.current_size < self.size:
+            yield Int32(self, "unknown[]")
+
+class FCLCB(FieldSet):  # one FC/LCB pair: a signed 32-bit offset followed by an unsigned 32-bit byte count
+    static_size=64  # two 32-bit fields
+    def createFields(self):
+        yield Int32(self, "fc", "Table Stream Offset")
+        yield UInt32(self, "lcb", "Byte Count")
+    def createValue(self):  # the field's value is the tuple (fc, lcb)
+        return (self['fc'].value,self['lcb'].value)
+
+class FCLCBArray(FieldSet):  # FIB array of FCLCB pairs, prefixed by a 16-bit pair count ("cfclcb")
+    def createFields(self):
+        yield UInt16(self, "cfclcb", "Count of fields in the array of FC/LCB pairs")
+        self._size = self['cfclcb'].value*64+16  # cfclcb pairs of 64 bits each, plus the 16-bit count itself
+        
+        yield FCLCB(self, "StshfOrig", "Original STSH allocation")
+        yield FCLCB(self, "Stshf", "Current STSH allocation")
+        yield FCLCB(self, "PlcffndRef", "Footnote reference (FRD) PLC")
+        yield FCLCB(self, "PlcffndTxt", "Footnote text PLC")
+        yield FCLCB(self, "PlcfandRef", "Annotation reference (ATRD) PLC")
+        yield FCLCB(self, "PlcfandTxt", "Annotation text PLC")
+        yield FCLCB(self, "Plcfsed", "Section descriptor (SED) PLC")
+        yield FCLCB(self, "Plcpad", "No longer used; used to be Plcfpgd (Page descriptor PLC)")
+        yield FCLCB(self, "Plcfphe", "Paragraph heights (PHE) PLC (only for Complex files)")
+        yield FCLCB(self, "Sttbfglsy", "Glossary string table")
+        yield FCLCB(self, "Plcfglsy", "Glossary PLC")
+        yield FCLCB(self, "Plcfhdd", "Header (HDD) PLC")
+        yield FCLCB(self, "PlcfbteChpx", "Character property bin table PLC")
+        yield FCLCB(self, "PlcfbtePapx", "Paragraph property bin table PLC")
+        yield FCLCB(self, "Plcfsea", "Private Use PLC")
+        yield FCLCB(self, "Sttbfffn", "Font information STTB")
+        yield FCLCB(self, "PlcffldMom", "Main document field position (FLD) PLC")
+        yield FCLCB(self, "PlcffldHdr", "Header subdocument field position (FLD) PLC")
+        yield FCLCB(self, "PlcffldFtn", "Footnote subdocument field position (FLD) PLC")
+        yield FCLCB(self, "PlcffldAtn", "Annotation subdocument field position (FLD) PLC")
+        yield FCLCB(self, "PlcffldMcr", "No longer used")
+        yield FCLCB(self, "Sttbfbkmk", "Bookmark names STTB")
+        yield FCLCB(self, "Plcfbkf", "Bookmark begin position (BKF) PLC")
+        yield FCLCB(self, "Plcfbkl", "Bookmark end position (BKL) PLC")
+        yield FCLCB(self, "Cmds", "Macro commands")
+        yield FCLCB(self, "Plcmcr", "No longer used")
+        yield FCLCB(self, "Sttbfmcr", "No longer used")
+        yield FCLCB(self, "PrDrvr", "Printer Driver information")
+        yield FCLCB(self, "PrEnvPort", "Printer environment for Portrait mode")
+        yield FCLCB(self, "PrEnvLand", "Printer environment for Landscape mode")
+        yield FCLCB(self, "Wss", "Window Save State")
+        yield FCLCB(self, "Dop", "Document Property data")
+        yield FCLCB(self, "SttbfAssoc", "Associated strings STTB")
+        yield FCLCB(self, "Clx", "Complex file information")
+        yield FCLCB(self, "PlcfpgdFtn", "Not used")
+        yield FCLCB(self, "AutosaveSource", "Original filename for Autosave purposes")
+        yield FCLCB(self, "GrpXstAtnOwners", "String Group for Annotation Owner Names")
+        yield FCLCB(self, "SttbfAtnbkmk", "Annotation subdocument bookmark names STTB")
+        yield FCLCB(self, "PlcdoaMom", "No longer used")
+        yield FCLCB(self, "PlcdoaHdr", "No longer used")
+        yield FCLCB(self, "PlcspaMom", "Main document File Shape (FSPA) PLC")
+        yield FCLCB(self, "PlcspaHdr", "Header subdocument FSPA PLC")
+        yield FCLCB(self, "PlcfAtnbkf", "Annotation subdocument bookmark begin position (BKF) PLC")
+        yield FCLCB(self, "PlcfAtnbkl", "Annotation subdocument bookmark end position (BKL) PLC")
+        yield FCLCB(self, "Pms", "Print Merge State")
+        yield FCLCB(self, "FormFldSttbs", "Form field values STTB")
+        yield FCLCB(self, "PlcfendRef", "Endnote Reference (FRD) PLC")
+        yield FCLCB(self, "PlcfendTxt", "Endnote Text PLC")
+        yield FCLCB(self, "PlcffldEdn", "Endnote subdocument field position (FLD) PLC)")
+        yield FCLCB(self, "PlcfpgdEdn", "not used")
+        yield FCLCB(self, "DggInfo", "Office Art Object Table Data")
+        yield FCLCB(self, "SttbfRMark", "Editor Author Abbreviations STTB")
+        yield FCLCB(self, "SttbCaption", "Caption Title STTB")
+        yield FCLCB(self, "SttbAutoCaption", "Auto Caption Title STTB")
+        yield FCLCB(self, "Plcfwkb", "WKB PLC")
+        yield FCLCB(self, "Plcfspl", "Spell Check State PLC")
+        yield FCLCB(self, "PlcftxbxTxt", "Text Box Text PLC")
+        yield FCLCB(self, "PlcffldTxbx", "Text Box Reference (FLD) PLC")
+        yield FCLCB(self, "PlcfhdrtxbxTxt", "Header Text Box Text PLC")
+        yield FCLCB(self, "PlcffldHdrTxbx", "Header Text Box Reference (FLD) PLC")
+        yield FCLCB(self, "StwUser", "Macro User storage")
+        yield FCLCB(self, "Sttbttmbd", "Embedded TrueType Font Data")
+        yield FCLCB(self, "Unused")
+        yield FCLCB(self, "PgdMother", "Main text page descriptors PLF")
+        yield FCLCB(self, "BkdMother", "Main text break descriptors PLF")
+        yield FCLCB(self, "PgdFtn", "Footnote text page descriptors PLF")
+        yield FCLCB(self, "BkdFtn", "Footnote text break descriptors PLF")
+        yield FCLCB(self, "PgdEdn", "Endnote text page descriptors PLF")
+        yield FCLCB(self, "BkdEdn", "Endnote text break descriptors PLF")
+        yield FCLCB(self, "SttbfIntlFld", "Field keywords STTB")
+        yield FCLCB(self, "RouteSlip", "Mailer Routing Slip")
+        yield FCLCB(self, "SttbSavedBy", "STTB of names of users who have saved the document")
+        yield FCLCB(self, "SttbFnm", "STTB of filenames of documents referenced by this one")
+        yield FCLCB(self, "PlcfLst", "List Format information PLC")
+        yield FCLCB(self, "PlfLfo", "List Format Override information PLC")
+        yield FCLCB(self, "PlcftxbxBkd", "Main document textbox break table (BKD) PLC")
+        yield FCLCB(self, "PlcftxbxHdrBkd", "Header subdocument textbox break table (BKD) PLC")
+        yield FCLCB(self, "DocUndo", "Undo/Versioning data")
+        yield FCLCB(self, "Rgbuse", "Undo/Versioning data")
+        yield FCLCB(self, "Usp", "Undo/Versioning data")
+        yield FCLCB(self, "Uskf", "Undo/Versioning data")
+        yield FCLCB(self, "PlcupcRgbuse", "Undo/Versioning data")
+        yield FCLCB(self, "PlcupcUsp", "Undo/Versioning data")
+        yield FCLCB(self, "SttbGlsyStyle", "Glossary entry style names STTB")
+        yield FCLCB(self, "Plgosl", "Grammar options PL")
+        yield FCLCB(self, "Plcocx", "OCX data PLC")
+        yield FCLCB(self, "PlcfbteLvc", "Character property bin table PLC")
+        if self['../fMac'].value:  # timestamp encoding is chosen by the fMac bit of the parent field set
+            yield TimestampMac32(self, "ftModified", "Date last modified")
+            yield Int32(self, "padding[]")  # presumably pads the Mac variant to the width of the 64-bit variant - TODO confirm
+        else:
+            yield TimestampWin64(self, "ftModified", "Date last modified")
+        yield FCLCB(self, "Plcflvc", "LVC PLC")
+        yield FCLCB(self, "Plcasumy", "Autosummary PLC")
+        yield FCLCB(self, "Plcfgram", "Grammar check PLC")
+        yield FCLCB(self, "SttbListNames", "List names STTB")
+        yield FCLCB(self, "SttbfUssr", "Undo/Versioning data")
+        while self.current_size < self.size:  # pairs beyond the named ones are exposed as unknown[]
+            yield FCLCB(self, "unknown[]")
+
+class FIB(FieldSet):  # File Information Block: fixed header fields and flag bits, followed by three counted arrays
+    def createFields(self):
+        yield UInt16(self, "wIdent", "Magic Number")
+        yield UInt16(self, "nFib", "File Information Block (FIB) Version")
+        yield UInt16(self, "nProduct", "Product Version")
+        yield Enum(UInt16(self, "lid", "Language ID"), LANGUAGE_ID)
         yield Int16(self, "pnNext")
 
-        yield Bit(self, "fDot")
-        yield Bit(self, "fGlsy")
-        yield Bit(self, "fComplex")
-        yield Bit(self, "fHasPic")
-        yield Bits(self, "cQuickSaves", 4)
-        yield Bit(self, "fEncrypted")
-        yield Bit(self, "fWhichTblStm")
-        yield Bit(self, "fReadOnlyRecommanded")
-        yield Bit(self, "fWriteReservation")
-        yield Bit(self, "fExtChar")
+        yield Bit(self, "fDot", "Is the document a document template?")
+        yield Bit(self, "fGlsy", "Is the document a glossary?")
+        yield Bit(self, "fComplex", "Is the document in Complex format?")
+        yield Bit(self, "fHasPic", "Does the document have embedded images?")
+        yield Bits(self, "cQuickSaves", 4, "Number of times the document was quick-saved")
+        yield Bit(self, "fEncrypted", "Is the document encrypted?")
+        yield Bits(self, "fWhichTblStm", 1, "Which table stream (0Table or 1Table) to use")  # 1-bit Bits rather than Bit; presumably so the value reads as 0/1 - TODO confirm
+        yield Bit(self, "fReadOnlyRecommended", "Should the file be opened read-only?")
+        yield Bit(self, "fWriteReservation", "Is the file write-reserved?")
+        yield Bit(self, "fExtChar", "Does the file use an extended character set?")
         yield Bit(self, "fLoadOverride")
-        yield Bit(self, "fFarEeast")
+        yield Bit(self, "fFarEast")
         yield Bit(self, "fCrypto")
 
-        yield UInt16(self, "nFibBack")
-        yield UInt32(self, "lKey")
-        yield UInt8(self, "envr")
+        yield UInt16(self, "nFibBack", "Document is backwards compatible down to this FIB version")
+        yield UInt32(self, "lKey", "File encryption key (only if fEncrypted)")
+        yield Enum(UInt8(self, "envr", "Document creation environment"), {0:'Word for Windows',1:'Macintosh Word'})
 
-        yield Bit(self, "fMac")
+        yield Bit(self, "fMac", "Was this file last saved on a Mac?")  # read back by FCLCBArray through the path '../fMac'
         yield Bit(self, "fEmptySpecial")
         yield Bit(self, "fLoadOverridePage")
         yield Bit(self, "fFutureSavedUndo")
         yield Bit(self, "fWord97Save")
         yield Bits(self, "fSpare0", 3)
+        CHARSET={0:'Windows ANSI',256:'Macintosh'}  # value names shared by the chse and chsTables fields below
+        yield Enum(UInt16(self, "chse", "Character set for document text"),CHARSET)
+        yield Enum(UInt16(self, "chsTables", "Character set for internal table text"),CHARSET)
+        yield UInt32(self, "fcMin", "File offset for the first character of text")
+        yield UInt32(self, "fcMax", "File offset for the last character of text + 1")
 
-        yield UInt16(self, "chse")
-        yield UInt16(self, "chsTables")
-        yield UInt32(self, "fcMin")
-        yield UInt32(self, "fcMac")
+        yield ShortArray(self, "array1", "Array of shorts")  # the remaining fields live in three nested array field sets
+        yield LongArray(self, "array2", "Array of longs")
+        yield FCLCBArray(self, "array3", "Array of File Offset/Byte Count (FC/LCB) pairs")
 
-        yield PascalString16(self, "file_creator", strip="\0")
+def getRootParser(ole2):  # returns the parser guessed for the sub-stream of the "root[0]" field of ole2
+    return guessParser(ole2["root[0]"].getSubIStream())
 
-        yield NullBytes(self, "reserved[]", 12)
+def getOLE2Parser(ole2, path):  # returns the parser guessed for the sub-stream of the "<path>[0]" field
+    name = path+"[0]"  # only the first fragment with this name is used
+    if name in ole2:
+        fragment = ole2[name]
+    else:
+        fragment = getRootParser(ole2)[name]  # not directly in ole2: look the field up in the root parser instead
+    return guessParser(fragment.getSubIStream())
 
-        yield Int16(self, "lidFE")
-        yield UInt16(self, "clw")
-        yield Int32(self, "cbMac")
-        yield UInt32(self, "lProductCreated")
-        yield TIMESTAMP(self, "lProductRevised")
-
-        yield UInt32(self, "ccpText")
-        yield Int32(self, "ccpFtn")
-        yield Int32(self, "ccpHdr")
-        yield Int32(self, "ccpMcr")
-        yield Int32(self, "ccpAtn")
-        yield Int32(self, "ccpEdn")
-        yield Int32(self, "ccpTxbx")
-        yield Int32(self, "ccpHdrTxbx")
-        yield Int32(self, "pnFbpChpFirst")
-        yield Int32(self, "pnChpFirst")
-        yield Int32(self, "cpnBteChp")
-        yield Int32(self, "pnFbpPapFirst")
-        yield Int32(self, "pnPapFirst")
-        yield Int32(self, "cpnBtePap")
-        yield Int32(self, "pnFbpLvcFirst")
-        yield Int32(self, "pnLvcFirst")
-        yield Int32(self, "cpnBteLvc")
-        yield Int32(self, "fcIslandFirst")
-        yield Int32(self, "fcIslandLim")
-        yield UInt16(self, "cfclcb")
-        yield Int32(self, "fcStshfOrig")
-        yield UInt32(self, "lcbStshfOrig")
-        yield Int32(self, "fcStshf")
-        yield UInt32(self, "lcbStshf")
-
-        yield Int32(self, "fcPlcffndRef")
-        yield UInt32(self, "lcbPlcffndRef")
-        yield Int32(self, "fcPlcffndTxt")
-        yield UInt32(self, "lcbPlcffndTxt")
-        yield Int32(self, "fcPlcfandRef")
-        yield UInt32(self, "lcbPlcfandRef")
-        yield Int32(self, "fcPlcfandTxt")
-        yield UInt32(self, "lcbPlcfandTxt")
-        yield Int32(self, "fcPlcfsed")
-        yield UInt32(self, "lcbPlcfsed")
-        yield Int32(self, "fcPlcpad")
-        yield UInt32(self, "lcbPlcpad")
-        yield Int32(self, "fcPlcfphe")
-        yield UInt32(self, "lcbPlcfphe")
-        yield Int32(self, "fcSttbfglsy")
-        yield UInt32(self, "lcbSttbfglsy")
-        yield Int32(self, "fcPlcfglsy")
-        yield UInt32(self, "lcbPlcfglsy")
-        yield Int32(self, "fcPlcfhdd")
-        yield UInt32(self, "lcbPlcfhdd")
-        yield Int32(self, "fcPlcfbteChpx")
-        yield UInt32(self, "lcbPlcfbteChpx")
-        yield Int32(self, "fcPlcfbtePapx")
-        yield UInt32(self, "lcbPlcfbtePapx")
-        yield Int32(self, "fcPlcfsea")
-        yield UInt32(self, "lcbPlcfsea")
-        yield Int32(self, "fcSttbfffn")
-        yield UInt32(self, "lcbSttbfffn")
-        yield Int32(self, "fcPlcffldMom")
-        yield UInt32(self, "lcbPlcffldMom")
-        yield Int32(self, "fcPlcffldHdr")
-        yield UInt32(self, "lcbPlcffldHdr")
-        yield Int32(self, "fcPlcffldFtn")
-        yield UInt32(self, "lcbPlcffldFtn")
-        yield Int32(self, "fcPlcffldAtn")
-        yield UInt32(self, "lcbPlcffldAtn")
-        yield Int32(self, "fcPlcffldMcr")
-        yield UInt32(self, "lcbPlcffldMcr")
-        yield Int32(self, "fcSttbfbkmk")
-        yield UInt32(self, "lcbSttbfbkmk")
-        yield Int32(self, "fcPlcfbkf")
-        yield UInt32(self, "lcbPlcfbkf")
-        yield Int32(self, "fcPlcfbkl")
-        yield UInt32(self, "lcbPlcfbkl")
-        yield Int32(self, "fcCmds")
-        yield UInt32(self, "lcbCmds")
-        yield Int32(self, "fcPlcmcr")
-        yield UInt32(self, "lcbPlcmcr")
-        yield Int32(self, "fcSttbfmcr")
-        yield UInt32(self, "lcbSttbfmcr")
-        yield Int32(self, "fcPrDrvr")
-        yield UInt32(self, "lcbPrDrvr")
-        yield Int32(self, "fcPrEnvPort")
-        yield UInt32(self, "lcbPrEnvPort")
-        yield Int32(self, "fcPrEnvLand")
-        yield UInt32(self, "lcbPrEnvLand")
-        yield Int32(self, "fcWss")
-        yield UInt32(self, "lcbWss")
-        yield Int32(self, "fcDop")
-        yield UInt32(self, "lcbDop")
-        yield Int32(self, "fcSttbfAssoc")
-        yield UInt32(self, "lcbSttbfAssoc")
-        yield Int32(self, "fcClx")
-        yield UInt32(self, "lcbClx")
-        yield Int32(self, "fcPlcfpgdFtn")
-        yield UInt32(self, "lcbPlcfpgdFtn")
-        yield Int32(self, "fcAutosaveSource")
-        yield UInt32(self, "lcbAutosaveSource")
-        yield Int32(self, "fcGrpXstAtnOwners")
-        yield UInt32(self, "lcbGrpXstAtnOwners")
-        yield Int32(self, "fcSttbfAtnbkmk")
-        yield UInt32(self, "lcbSttbfAtnbkmk")
-        yield Int32(self, "fcPlcdoaMom")
-        yield UInt32(self, "lcbPlcdoaMom")
-        yield Int32(self, "fcPlcdoaHdr")
-        yield UInt32(self, "lcbPlcdoaHdr")
-        yield Int32(self, "fcPlcspaMom")
-        yield UInt32(self, "lcbPlcspaMom")
-        yield Int32(self, "fcPlcspaHdr")
-        yield UInt32(self, "lcbPlcspaHdr")
-        yield Int32(self, "fcPlcfAtnbkf")
-        yield UInt32(self, "lcbPlcfAtnbkf")
-        yield Int32(self, "fcPlcfAtnbkl")
-        yield UInt32(self, "lcbPlcfAtnbkl")
-        yield Int32(self, "fcPms")
-        yield UInt32(self, "lcbPms")
-        yield Int32(self, "fcFormFldSttbs")
-        yield UInt32(self, "lcbFormFldSttbs")
-        yield Int32(self, "fcPlcfendRef")
-        yield UInt32(self, "lcbPlcfendRef")
-        yield Int32(self, "fcPlcfendTxt")
-        yield UInt32(self, "lcbPlcfendTxt")
-        yield Int32(self, "fcPlcffldEdn")
-        yield UInt32(self, "lcbPlcffldEdn")
-        yield Int32(self, "fcPlcfpgdEdn")
-        yield UInt32(self, "lcbPlcfpgdEdn")
-        yield Int32(self, "fcDggInfo")
-        yield UInt32(self, "lcbDggInfo")
-        yield Int32(self, "fcSttbfRMark")
-        yield UInt32(self, "lcbSttbfRMark")
-        yield Int32(self, "fcSttbCaption")
-        yield UInt32(self, "lcbSttbCaption")
-        yield Int32(self, "fcSttbAutoCaption")
-        yield UInt32(self, "lcbSttbAutoCaption")
-        yield Int32(self, "fcPlcfwkb")
-        yield UInt32(self, "lcbPlcfwkb")
-        yield Int32(self, "fcPlcfspl")
-        yield UInt32(self, "lcbPlcfspl")
-        yield Int32(self, "fcPlcftxbxTxt")
-        yield UInt32(self, "lcbPlcftxbxTxt")