Commits

Anonymous committed 0f14f6b

Fixed indexer bug; now handles bad directories

Comments (0)

Files changed (27)

+# -*- coding: utf-8 -*-
+'''
+Open and modify Microsoft Word 2007 docx files (called 'OpenXML' and 'Office OpenXML' by Microsoft)
+
+Part of Python's docx module - http://github.com/mikemaccana/python-docx
+See LICENSE for licensing information.
+'''
+
+from core import Docx
+from elements import *
+# -*- coding: utf-8 -*-
+from datetime import datetime
+import logging
+import os
+from os.path import join
+import re
+import shutil
+from tempfile import NamedTemporaryFile
+from zipfile import ZipFile
+
+from lxml import etree
+import dateutil.parser
+    
+from utils import findTypeParent, dir_to_docx
+from metadata import nsprefixes
+
+log = logging.getLogger(__name__)
+
# Locate the bundled template directory: an installed copy ships as
# 'docx-template' next to this module, a development checkout uses 'template'.
_pkg_dir = os.path.dirname(__file__)
template_dir = join(_pkg_dir, 'docx-template')  # installed layout
if not os.path.isdir(template_dir):
    template_dir = join(_pkg_dir, 'template')  # development layout
+
+    
class Docx(object):
    '''A Word 2007 (.docx) document backed by a private temporary zip copy.'''
    # Maps instance attribute names to the archive member each XML tree is
    # parsed from on load and serialized back to on save().
    trees_and_files = {
        "document": 'word/document.xml',
        "coreprops":'docProps/core.xml',
        "appprops":'docProps/app.xml',
        "contenttypes":'[Content_Types].xml',
        "websettings":'word/webSettings.xml',
        "wordrelationships":'word/_rels/document.xml.rels'
    }
    def __init__(self, f=None):
        '''Open the docx file *f*, or create a new empty document when *f*
        is None (built from the bundled template directory).
        '''
        create_new_doc = f is None
        self._orig_docx = f
        self._tmp_file = NamedTemporaryFile()

        if create_new_doc:
            f = self.__generate_empty_docx()

        # Work on a private copy so the caller's file is never touched.
        shutil.copyfile(f, self._tmp_file.name)
        self._docx = ZipFile(self._tmp_file.name, mode='a')

        # Parse each tracked XML part into an attribute (see trees_and_files).
        for tree, f in self.trees_and_files.items():
            self._load_etree(tree, f)

        self.docbody = self.document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]

        if create_new_doc:
            self.created = datetime.utcnow()
+        
+    def __new__(cls, *args, **kwargs):
+        # Make getters and setter for the core properties
+        def set_coreprop_property(prop, to_python=unicode, to_str=unicode):
+            getter = lambda self: to_python(self._get_coreprop_val(prop))
+            setter = lambda self, val: self._set_coreprop_val(prop, to_str(val))
+            setattr(cls, prop, property(getter, setter))
+            
+        for prop in ['title', 'subject', 'creator', 'description', 
+                     'lastModifiedBy', 'revision']:
+            set_coreprop_property(prop)
+            
+        for datetimeprop in ['created', 'modified']:
+            set_coreprop_property(datetimeprop, 
+                to_python=dateutil.parser.parse,
+                to_str=lambda obj: (obj.isoformat() 
+                                    if hasattr(obj, 'isoformat') 
+                                    else dateutil.parser.parse(obj).isoformat())
+            )
+        return super(Docx, cls).__new__(cls, *args, **kwargs)
+            
+    def append(self, *args, **kwargs):
+        return self.docbody.append(*args, **kwargs)
+    
+    def search(self, search):
+        '''Search a document for a regex, return success / fail result'''
+        document = self.docbody
+        
+        result = False
+        searchre = re.compile(search)
+        for element in document.iter():
+            if element.tag == '{%s}t' % nsprefixes['w']: # t (text) elements
+                if element.text:
+                    if searchre.search(element.text):
+                        result = True
+        return result
+    
+    def replace(self, search, replace):
+        '''Replace all occurences of string with a different string, return updated document'''
+        newdocument = self.docbody
+        searchre = re.compile(search)
+        for element in newdocument.iter():
+            if element.tag == '{%s}t' % nsprefixes['w']: # t (text) elements
+                if element.text:
+                    if searchre.search(element.text):
+                        element.text = re.sub(search,replace,element.text)
+        return newdocument
+    
+    def clean(self):
+        """ Perform misc cleaning operations on documents.
+            Returns cleaned document.
+        """
+    
+        newdocument = self.document
+    
+        # Clean empty text and r tags
+        for t in ('t', 'r'):
+            rmlist = []
+            for element in newdocument.iter():
+                if element.tag == '{%s}%s' % (nsprefixes['w'], t):
+                    if not element.text and not len(element):
+                        rmlist.append(element)
+            for element in rmlist:
+                element.getparent().remove(element)
+    
+        return newdocument
+    
+    def advanced_replace(self, search, replace, max_blocks=3):
+        '''Replace all occurences of string with a different string, return updated document
+    
+        This is a modified version of python-docx.replace() that takes into
+        account blocks of <bs> elements at a time. The replace element can also
+        be a string or an xml etree element.
+    
+        What it does:
+        It searches the entire document body for text blocks.
+        Then scan thos text blocks for replace.
+        Since the text to search could be spawned across multiple text blocks,
+        we need to adopt some sort of algorithm to handle this situation.
+        The smaller matching group of blocks (up to bs) is then adopted.
+        If the matching group has more than one block, blocks other than first
+        are cleared and all the replacement text is put on first block.
+    
+        Examples:
+        original text blocks : [ 'Hel', 'lo,', ' world!' ]
+        search / replace: 'Hello,' / 'Hi!'
+        output blocks : [ 'Hi!', '', ' world!' ]
+    
+        original text blocks : [ 'Hel', 'lo,', ' world!' ]
+        search / replace: 'Hello, world' / 'Hi!'
+        output blocks : [ 'Hi!!', '', '' ]
+    
+        original text blocks : [ 'Hel', 'lo,', ' world!' ]
+        search / replace: 'Hel' / 'Hal'
+        output blocks : [ 'Hal', 'lo,', ' world!' ]
+    
+        @param instance  document: The original document
+        @param str       search: The text to search for (regexp)
+        @param mixed replace: The replacement text or lxml.etree element to
+                              append, or a list of etree elements
+        @param int       max_blocks: See above
+    
+        @return instance The document with replacement applied
+    
+        '''
+        # Enables debug output
+        DEBUG = False
+    
+        newdocument = self.docbody
+    
+        # Compile the search regexp
+        searchre = re.compile(search)
+    
+        # Will match against searchels. Searchels is a list that contains last
+        # n text elements found in the document. 1 < n < max_blocks
+        searchels = []
+    
+        for element in newdocument.iter():
+            if element.tag == '{%s}t' % nsprefixes['w']: # t (text) elements
+                if element.text:
+                    # Add this element to searchels
+                    searchels.append(element)
+                    if len(searchels) > max_blocks:
+                        # Is searchels is too long, remove first elements
+                        searchels.pop(0)
+    
+                    # Search all combinations, of searchels, starting from
+                    # smaller up to bigger ones
+                    # l = search lenght
+                    # s = search start
+                    # e = element IDs to merge
+                    found = False
+                    for l in range(1,len(searchels)+1):
+                        if found:
+                            break
+                        #print "slen:", l
+                        for s in range(len(searchels)):
+                            if found:
+                                break
+                            if s+l <= len(searchels):
+                                e = range(s,s+l)
+                                #print "elems:", e
+                                txtsearch = ''
+                                for k in e:
+                                    txtsearch += searchels[k].text
+    
+                                # Searcs for the text in the whole txtsearch
+                                match = searchre.search(txtsearch)
+                                if match:
+                                    found = True
+    
+                                    # I've found something :)
+                                    if DEBUG:
+                                        log.debug("Found element!")
+                                        log.debug("Search regexp: %s", searchre.pattern)
+                                        log.debug("Requested replacement: %s", replace)
+                                        log.debug("Matched text: %s", txtsearch)
+                                        log.debug( "Matched text (splitted): %s", map(lambda i:i.text,searchels))
+                                        log.debug("Matched at position: %s", match.start())
+                                        log.debug( "matched in elements: %s", e)
+                                        if isinstance(replace, etree._Element):
+                                            log.debug("Will replace with XML CODE")
+                                        elif isinstance(replace (list, tuple)):
+                                            log.debug("Will replace with LIST OF ELEMENTS")
+                                        else:
+                                            log.debug("Will replace with:", re.sub(search,replace,txtsearch))
+    
+                                    curlen = 0
+                                    replaced = False
+                                    for i in e:
+                                        curlen += len(searchels[i].text)
+                                        if curlen > match.start() and not replaced:
+                                            # The match occurred in THIS element. Puth in the
+                                            # whole replaced text
+                                            if isinstance(replace, etree._Element):
+                                                # Convert to a list and process it later
+                                                replace = [ replace, ]
+                                            if isinstance(replace, (list,tuple)):
+                                                # I'm replacing with a list of etree elements
+                                                # clear the text in the tag and append the element after the
+                                                # parent paragraph
+                                                # (because t elements cannot have childs)
+                                                p = findTypeParent(searchels[i], '{%s}p' % nsprefixes['w'])
+                                                searchels[i].text = re.sub(search,'',txtsearch)
+                                                insindex = p.getparent().index(p) + 1
+                                                for r in replace:
+                                                    p.getparent().insert(insindex, r)
+                                                    insindex += 1
+                                            else:
+                                                # Replacing with pure text
+                                                searchels[i].text = re.sub(search,replace,txtsearch)
+                                            replaced = True
+                                            log.debug("Replacing in element #: %s", i)
+                                        else:
+                                            # Clears the other text elements
+                                            searchels[i].text = ''
+        return newdocument
+        
+    def _get_etree(self, xmldoc):
+        return etree.fromstring(self._docx.read(xmldoc))
+        
+    def _load_etree(self, name, xmldoc):
+        setattr(self, name, self._get_etree(xmldoc))
+
+    def template(self, cx, max_blocks=5, raw_document=False):
+        """
+        Accepts a context dictionary (cx) and looks for the dict keys wrapped 
+        in {{key}}. Replaces occurances with the correspoding value from the
+        cx dictionary.
+        
+        example:
+            with the context...
+                cx = {
+                    'name': 'James',
+                    'lang': 'English'
+                }
+            
+            ...and a docx file containing:
+                
+                Hi! My name is {{name}} and I speak {{lang}}
+                
+            Calling `docx.template(cx)` will return a new docx instance (the
+            original is not modified) that looks like:
+            
+                Hi! My name is James and I speak English
+                
+            Note: the template must not have spaces in the curly braces unless
+            the dict key does (i.e., `{{ name }}` will not work unless your
+            dictionary has `{" name ": ...}`)
+                
+        The `raw_document` argument accepts a boolean, which (if True) will 
+        treat the word/document.xml file as a text template (rather than only 
+        replacing text that is visible in the document via a word processor)
+        
+        If you pass `max_blocks=None` you will cause the template function to
+        use `docx.replace()` rather than `docx.advanced_replace()`.
+        
+        When `max_blocks` is a number, it is passed to the advanced replace
+        method as is.   
+        """
+        output = self.copy()
+        
+        if raw_document:
+            raw_doc = etree.tostring(output.document)
+            
+        for key, val in cx.items():
+            key = "{{%s}}" % key
+            if raw_document:
+                raw_doc = raw_doc.replace(key, unicode(val))
+            elif max_blocks is None: 
+                output.replace(key, unicode(val))
+            else:                  
+                output.advanced_replace(key, val, max_blocks=max_blocks)
+            
+        if raw_document:
+            output.document = etree.fromstring(raw_doc)
+            
+        return output
+
    def save(self, dest=None):
        '''Write the document to *dest*, or back into the working copy.

        Re-serializes every tracked XML tree and copies all other archive
        members through unchanged.  When dest is None, the backing temp file
        is rewritten and the zip handle reopened so the instance stays usable.
        '''
        self.modified = datetime.utcnow()

        outf = NamedTemporaryFile()
        out_zip = ZipFile(outf.name, mode='w')

        orig_contents = self._docx.namelist()
        modified_contents = self.trees_and_files.values()

        # Serialize our trees into our zip file
        for tree, dest_file in self.trees_and_files.items():
            log.info('Saving: ' + dest_file)
            out_zip.writestr(dest_file, etree.tostring(getattr(self, tree), pretty_print=True))

        # Copy through every member we did not regenerate above.
        for dest_file in set(orig_contents) - set(modified_contents):
            out_zip.writestr(dest_file, self._docx.read(dest_file))

        # docx file doesn't save properly unless it gets closed
        out_zip.close()
        if dest is not None:
            log.info('Saved new file to: %r', dest)
            shutil.copyfile(outf.name, dest)
            outf.close()
        else:
            # Saving in place: close the old handle before overwriting the
            # temp file it points at.
            self._docx.close()

            shutil.copyfile(outf.name, self._tmp_file.name)

            # reopen the file so it can continue to be used
            self._docx = ZipFile(self._tmp_file.name, mode='a')
+            
+    def copy(self):
+        tmp = NamedTemporaryFile()
+        self.save(tmp.name)
+        docx = self.__class__(tmp.name)
+        docx._orig_docx = self._orig_docx
+        tmp.close()
+        return docx
+    
+    def __del__(self):
+        try: 
+            self.__empty_docx.close()
+        except AttributeError: 
+            pass
+        self._docx.close()
+        self._tmp_file.close()
+        
+    def _get_coreprop(self, tagname):
+        return self.coreprops.xpath("*[local-name()='title']")[0]
+    
+    def _get_coreprop_val(self, tagname):
+        return self._get_coreprop(tagname).text
+    
+    def _set_coreprop_val(self, tagname, val):
+        self._get_coreprop(tagname).text = val
+        
    def __generate_empty_docx(self):
        '''Build an empty .docx from the bundled template directory.

        The archive is written into a NamedTemporaryFile kept on self (so it
        lives as long as this instance); returns the temp file's path.
        '''
        self.__empty_docx = NamedTemporaryFile()
        loc = self.__empty_docx.name

        dir_to_docx(template_dir, loc)

        return loc
+    
+    @property
+    def text(self):
+        '''Return the raw text of a document, as a list of paragraphs.'''
+        document = self.docbody
+        paratextlist = []
+        # Compile a list of all paragraph (p) elements
+        paralist = []
+        for element in document.iter():
+            # Find p (paragraph) elements
+            if element.tag == '{'+nsprefixes['w']+'}p':
+                paralist.append(element)
+        # Since a single sentence might be spread over multiple text elements, iterate through each
+        # paragraph, appending all text (t) children to that paragraphs text.
+        for para in paralist:
+            paratext=u''
+            # Loop through each paragraph
+            for element in para.iter():
+                # Find t (text) elements
+                if element.tag == '{'+nsprefixes['w']+'}t':
+                    if element.text:
+                        paratext = paratext+element.text
+            # Add our completed paragraph text to the list of paragraph text
+            if not len(paratext) == 0:
+                paratextlist.append(paratext)
+        return paratextlist
+from contextlib import contextmanager
+from core import Docx
+import elements
+
# Default document metadata used by the DSL below; start_doc(meta=...)
# replaces this dict wholesale.
meta = {
    "title": "",
    "subject": "",
    "creator": "",
    "keywords": [],
}
+
# Module-level document handle the DSL helpers below operate on.
doc = None

def start_doc(**kwargs):
    '''Initialize the module-level document, optionally replacing `meta`.'''
    global doc, meta

    supplied_meta = kwargs.get("meta")
    if supplied_meta is not None:
        meta = supplied_meta

    doc = Docx()
+    
### DSL

# PEP 8 (E731): named functions instead of lambda assignments; the call
# interface and return values are unchanged.
def h_(level, txt):
    '''Append a heading of the given level to the current document.'''
    return doc.append(elements.heading(txt, level))

def h1(txt):
    '''Append a level-1 heading.'''
    return h_(1, txt)

def h2(txt):
    '''Append a level-2 heading.'''
    return h_(2, txt)

def h3(txt):
    '''Append a level-3 heading.'''
    return h_(3, txt)

def h4(txt):
    '''Append a level-4 heading.'''
    return h_(4, txt)

def p(txt):
    '''Append a body-text paragraph.'''
    return doc.append(elements.paragraph(txt))

def br(**kwargs):
    '''Append a break; kwargs pass through to elements.pagebreak.'''
    return doc.append(elements.pagebreak(**kwargs))
+
def img(src, alt=""):
    # Image support is not wired up yet; the lines below sketch the intended
    # flow (see elements.picture) but are intentionally unreachable.
    raise NotImplementedError
    relationships = None
    relationships, picpara = doc.picture(relationships, src, alt)
    doc.append(picpara)
+
@contextmanager
def ul():
    '''Bulleted-list context manager; yields an item-appending callable.'''
    def li(txt):
        return doc.append(elements.paragraph(txt, style='ListBullet'))
    yield li
+        
@contextmanager
def ol():
    '''Numbered-list context manager; yields an item-appending callable.'''
    def li(txt):
        return doc.append(elements.paragraph(txt, style='ListNumber'))
    yield li
+
@contextmanager
def table():
    '''Table context manager: collects rows, appends the table on exit.

    Yields a `tr` context manager; each `tr` in turn yields a callable that
    appends one cell's text to that row.
    '''
    rows = []

    @contextmanager
    def tr():
        cells = []
        rows.append(cells)
        yield cells.append

    yield tr
    doc.append(elements.table(rows,
        heading=False,
        borders={"all": {'sz': 2, 'color': 'cccccc'}},
        celstyle=[{'fill': "ffffff"}] * len(rows)
    ))
+        
## utility functions...
def write_docx(f):
    '''Copy the DSL `meta` fields onto the document and save it to *f*.'''
    doc.title = str(meta.get('title', ''))
    doc.subject = str(meta.get('subject', ''))
    doc.creator = str(meta.get('creator', ''))
    # NOTE(review): Docx installs no 'keywords' property, so this sets a plain
    # attribute -- it does not appear to persist into core.xml; confirm intent.
    doc.keywords = list(meta.get('keywords', []))

    doc.save(f)
+# -*- coding: utf-8 -*-
+import os
+from os.path import join
+import shutil
+
+from lxml import etree
+try:
+    from PIL import Image
+except ImportError:
+    import Image
+    
+from metadata import TEMPLATE_DIR, nsprefixes
+
def Element(tagname, tagtext=None, nsprefix='w', attributes=None, attrnsprefix=None):
    '''Create and return a namespaced lxml element.

    nsprefix may be a single prefix, a list of prefixes (the first is used
    for the tag, all go into the element's nsmap), or None for no namespace.
    Optional attributes are set under attrnsprefix (or, for 'w' tags, under
    the tag's own namespace).
    '''
    nsmap = None
    if isinstance(nsprefix, list):
        # Build a namespace map from every requested prefix...
        nsmap = dict((prefix, nsprefixes[prefix]) for prefix in nsprefix)
        # ...then use the first one for the tag itself.
        nsprefix = nsprefix[0] # FIXME: rest of code below expects a single prefix
    namespace = '{' + nsprefixes[nsprefix] + '}' if nsprefix else ''
    newelement = etree.Element(namespace + tagname, nsmap=nsmap)
    if attributes:
        # Pick the attribute namespace: an explicit prefix wins; otherwise
        # 'w' tags reuse their own namespace and everything else gets none
        # (it seems every 'w'-prefixed element also namespaces its attributes).
        if attrnsprefix:
            attributenamespace = '{' + nsprefixes[attrnsprefix] + '}'
        elif nsprefix == 'w':
            attributenamespace = namespace
        else:
            attributenamespace = ''
        for attrname, attrvalue in attributes.items():
            newelement.set(attributenamespace + attrname, attrvalue)
    if tagtext:
        newelement.text = tagtext
    return newelement
+
+
def pagebreak(breaktype='page', orient='portrait'):
    '''Insert a break, default 'page'.

    See http://openxmldeveloper.org/forums/thread/4075.aspx

    @param str breaktype: 'page' or 'section'
    @param str orient: for section breaks, 'portrait' or 'landscape'
    @return: the w:p element holding the break
    @raise ValueError: on an unknown breaktype or orient
    '''
    # Need to enumerate different types of page breaks.
    validtypes = ['page', 'section']
    if breaktype not in validtypes:
        raise ValueError('Page break style "%s" not implemented. Valid styles: %s.' % (breaktype, validtypes))
    pagebreak = Element('p')
    if breaktype == 'page':
        run = Element('r')
        br = Element('br', attributes={'type': breaktype})
        run.append(br)
        pagebreak.append(run)
    elif breaktype == 'section':
        pPr = Element('pPr')
        sectPr = Element('sectPr')
        if orient == 'portrait':
            pgSz = Element('pgSz', attributes={'w': '12240', 'h': '15840'})
        elif orient == 'landscape':
            pgSz = Element('pgSz', attributes={'h': '12240', 'w': '15840', 'orient': 'landscape'})
        else:
            # BUG FIX: an unknown orientation previously fell through and
            # crashed below with UnboundLocalError on pgSz.
            raise ValueError('Invalid orientation "%s": use "portrait" or "landscape".' % orient)
        sectPr.append(pgSz)
        pPr.append(sectPr)
        pagebreak.append(pPr)
    return pagebreak
+
def paragraph(paratext, style='BodyText', breakbefore=False, jc='left'):
    '''Build and return a w:p element containing one run per text chunk.

    @param string jc: Paragraph alignment, possible values:
                      left, center, right, both (justified), ...
                      see http://www.schemacentral.com/sc/ooxml/t-w_ST_Jc.html
                      for a full list

    paratext may be a plain string, a list of strings, or a list of
    (text, style) pairs where style is a combination of the characters
    'b', 'u', 'i' (bold / underline / italic).

    example
    paratext = [
        ('some bold text', 'b'),
        ('some normal text', ''),
        ('some italic underlined text', 'iu'),
    ]
    '''
    paragraph = Element('p')

    # Normalize paratext into a list of [t-element, style-string] pairs.
    if isinstance(paratext, list):
        chunks = []
        for pt in paratext:
            if isinstance(pt, (list, tuple)):
                chunks.append([Element('t', tagtext=pt[0]), pt[1]])
            else:
                chunks.append([Element('t', tagtext=pt), ''])
    else:
        chunks = [[Element('t', tagtext=paratext), '']]

    # Paragraph properties: named style plus alignment.
    pPr = Element('pPr')
    pPr.append(Element('pStyle', attributes={'val': style}))
    pPr.append(Element('jc', attributes={'val': jc}))
    paragraph.append(pPr)

    # One run per chunk, with run properties for the requested styles.
    for textel, stylechars in chunks:
        run = Element('r')
        rPr = Element('rPr')
        if 'b' in stylechars:
            rPr.append(Element('b'))
        if 'u' in stylechars:
            rPr.append(Element('u', attributes={'val': 'single'}))
        if 'i' in stylechars:
            rPr.append(Element('i'))
        run.append(rPr)
        # Insert lastRenderedPageBreak for assistive technologies like
        # document narrators to know when a page break occurred.
        if breakbefore:
            run.append(Element('lastRenderedPageBreak'))
        run.append(textel)
        paragraph.append(run)
    # Return the combined paragraph
    return paragraph
+
def heading(headingtext, headinglevel, lang='en'):
    '''Return a w:p element styled as a heading of the given level.

    lang selects the style-name prefix ('en' -> Heading, 'it' -> Titolo).
    '''
    stylenames = {
        'en': 'Heading',
        'it': 'Titolo',
    }
    para = Element('p')
    props = Element('pPr')
    props.append(Element('pStyle',
                         attributes={'val': stylenames[lang] + str(headinglevel)}))
    run = Element('r')
    run.append(Element('t', tagtext=headingtext))
    # Properties first, then the run holding the text.
    para.append(props)
    para.append(run)
    return para
+
def table(contents, heading=True, colw=None, cwunit='dxa', tblw=0, twunit='auto', borders={}, celstyle=None):
    '''Get a list of lists, return a table

        @param list contents: A list of lists describing contents
                              Every item in the list can be a string or a valid
                              XML element itself. It can also be a list. In that case
                              all the listed elements will be merged into the cell.
        @param bool heading: Tells whether the first line should be treated as
                             a heading or not
        @param list colw: A list of integers. The list must have the same
                          element count as the content lines. Specifies column
                          widths in cwunit units
        @param string cwunit: Unit used for column width:
                                'pct': fifties of a percent
                                'dxa': twenties of a point
                                'nil': no width
                                'auto': automagically determined
        @param int tblw: Table width
        @param int twunit: Unit used for table width. Same as cwunit
        @param dict borders: Dictionary defining table border. Supported keys are:
                             'top', 'left', 'bottom', 'right', 'insideH', 'insideV', 'all'
                             When specified, the 'all' key has precedence over others.
                             Each key must define a dict of border attributes:
                             color: The color of the border, in hex or 'auto'
                             space: The space, measured in points
                             sz: The size of the border, in eighths of a point
                             val: The style of the border, see http://www.schemacentral.com/sc/ooxml/t-w_ST_Border.htm
        @param list celstyle: Specify the style for each column, list of dicts.
                              supported keys:
                              'align': specify the alignment, see paragraph documentation,

        @return lxml.etree: Generated XML etree element
    '''
    # NOTE: borders={} is a mutable default; it is only read here, never
    # mutated, so it is safe in practice.
    table = Element('tbl')
    columns = len(contents[0])
    # Table properties
    tableprops = Element('tblPr')
    tablestyle = Element('tblStyle',attributes={'val':'ColorfulGrid-Accent1'})
    tableprops.append(tablestyle)
    tablewidth = Element('tblW',attributes={'w':str(tblw),'type':str(twunit)})
    tableprops.append(tablewidth)
    if len(borders.keys()):
        tableborders = Element('tblBorders')
        for b in ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']:
            if b in borders.keys() or 'all' in borders.keys():
                # 'all' overrides any per-edge definition.
                k = 'all' if 'all' in borders.keys() else b
                attrs = {}
                for a in borders[k].keys():
                    attrs[a] = unicode(borders[k][a])
                borderelem = Element(b,attributes=attrs)
                tableborders.append(borderelem)
        tableprops.append(tableborders)
    tablelook = Element('tblLook',attributes={'val':'0400'})
    tableprops.append(tablelook)
    table.append(tableprops)
    # Table Grid: one gridCol per column, default width 2390 dxa.
    tablegrid = Element('tblGrid')
    for i in range(columns):
        tablegrid.append(Element('gridCol',attributes={'w':str(colw[i]) if colw else '2390'}))
    table.append(tablegrid)
    # Heading Row
    row = Element('tr')
    rowprops = Element('trPr')
    cnfStyle = Element('cnfStyle',attributes={'val':'000000100000'})
    rowprops.append(cnfStyle)
    row.append(rowprops)
    if heading:
        i = 0
        # NOTE(review): this loop rebinds the `heading` parameter to each
        # cell value; the slice below still works because `heading` ends up
        # as a non-empty (truthy) list -- fragile, but preserved as-is.
        for heading in contents[0]:
            cell = Element('tc')
            # Cell properties
            cellprops = Element('tcPr')
            if colw:
                wattr = {'w':str(colw[i]),'type':cwunit}
            else:
                wattr = {'w':'0','type':'auto'}
            cellwidth = Element('tcW',attributes=wattr)
            cellstyle = Element('shd',attributes={'val':'clear','color':'auto','fill':'548DD4','themeFill':'text2','themeFillTint':'99'})
            cellprops.append(cellwidth)
            cellprops.append(cellstyle)
            cell.append(cellprops)
            # Paragraph (Content): normalize to a list of cell parts.
            if not isinstance(heading, (list, tuple)):
                heading = [heading,]
            for h in heading:
                if isinstance(h, etree._Element):
                    cell.append(h)
                else:
                    cell.append(paragraph(h,jc='center'))
            row.append(cell)
            i += 1
        table.append(row)
    # Contents Rows
    for contentrow in contents[1 if heading else 0:]:
        row = Element('tr')
        i = 0
        for content in contentrow:
            cell = Element('tc')
            # Properties
            cellprops = Element('tcPr')
            if colw:
                wattr = {'w':str(colw[i]),'type':cwunit}
            else:
                wattr = {'w':'0','type':'auto'}
            cellwidth = Element('tcW',attributes=wattr)
            cellprops.append(cellwidth)
            cell.append(cellprops)
            # Paragraph (Content): normalize to a list of cell parts.
            if not isinstance(content, (list, tuple)):
                content = [content,]
            for c in content:
                if isinstance(c, etree._Element):
                    cell.append(c)
                else:
                    # Per-column alignment from celstyle, default left.
                    if celstyle and 'align' in celstyle[i].keys():
                        align = celstyle[i]['align']
                    else:
                        align = 'left'
                    cell.append(paragraph(c,jc=align))
            row.append(cell)
            i += 1
        table.append(row)
    return table
+
def picture(relationshiplist, picname, picdescription, pixelwidth=None,
            pixelheight=None, nochangeaspect=True, nochangearrowheads=True):
    '''Take a relationshiplist, picture file name, and return a paragraph
    containing the image and an updated relationshiplist.

    @param list relationshiplist: document relationships; an image
        relationship is appended for the new picture
    @param string picname: path to the image file (may include directories;
        only its base name is used inside the package - bug fix)
    @param string picdescription: alt text stored on the drawing
    @param int pixelwidth, pixelheight: optional display size in pixels; when
        either is missing both are read from the image itself (requires PIL)
    @return (relationshiplist, paragraph) tuple
    '''
    # http://openxmldeveloper.org/articles/462.aspx
    # Copy the file into the media dir. Use the base name so a picname such
    # as 'pics/logo.png' still yields a valid in-package path.
    picbasename = os.path.basename(picname)
    media_dir = join(TEMPLATE_DIR, 'word', 'media')
    if not os.path.isdir(media_dir):
        os.mkdir(media_dir)
    shutil.copyfile(picname, join(media_dir, picbasename))

    # Check if the user has specified a size
    if not pixelwidth or not pixelheight:
        # If not, get info from the picture itself
        pixelwidth, pixelheight = Image.open(picname).size[0:2]

    # OpenXML measures on-screen objects in English Metric Units
    # 1cm = 36000 EMUs
    emuperpixel = 12667
    width = str(pixelwidth * emuperpixel)
    height = str(pixelheight * emuperpixel)

    # Set relationship ID to the first available
    picid = '2'  # NOTE(review): hardcoded drawing id - looks safe for one image per call, confirm for multi-image documents
    picrelid = 'rId' + str(len(relationshiplist) + 1)
    relationshiplist.append([
        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
        'media/' + picbasename])

    # There are 3 main elements inside a picture
    # 1. The Blipfill - specifies how the image fills the picture area (stretch, tile, etc.)
    blipfill = Element('blipFill', nsprefix='pic')
    blipfill.append(Element('blip', nsprefix='a', attrnsprefix='r',
                            attributes={'embed': picrelid}))
    stretch = Element('stretch', nsprefix='a')
    stretch.append(Element('fillRect', nsprefix='a'))
    blipfill.append(Element('srcRect', nsprefix='a'))
    blipfill.append(stretch)

    # 2. The non visual picture properties
    nvpicpr = Element('nvPicPr', nsprefix='pic')
    cnvpr = Element('cNvPr', nsprefix='pic',
                    attributes={'id': '0', 'name': 'Picture 1', 'descr': picbasename})
    nvpicpr.append(cnvpr)
    cnvpicpr = Element('cNvPicPr', nsprefix='pic')
    cnvpicpr.append(Element('picLocks', nsprefix='a',
                    attributes={'noChangeAspect': str(int(nochangeaspect)),
                                'noChangeArrowheads': str(int(nochangearrowheads))}))
    nvpicpr.append(cnvpicpr)

    # 3. The Shape properties
    sppr = Element('spPr', nsprefix='pic', attributes={'bwMode': 'auto'})
    xfrm = Element('xfrm', nsprefix='a')
    xfrm.append(Element('off', nsprefix='a', attributes={'x': '0', 'y': '0'}))
    xfrm.append(Element('ext', nsprefix='a', attributes={'cx': width, 'cy': height}))
    prstgeom = Element('prstGeom', nsprefix='a', attributes={'prst': 'rect'})
    prstgeom.append(Element('avLst', nsprefix='a'))
    sppr.append(xfrm)
    sppr.append(prstgeom)

    # Add our 3 parts to the picture element
    pic = Element('pic', nsprefix='pic')
    pic.append(nvpicpr)
    pic.append(blipfill)
    pic.append(sppr)

    # Now make the supporting elements
    # The following sequence is just: make element, then add its children
    graphicdata = Element('graphicData', nsprefix='a',
        attributes={'uri': 'http://schemas.openxmlformats.org/drawingml/2006/picture'})
    graphicdata.append(pic)
    graphic = Element('graphic', nsprefix='a')
    graphic.append(graphicdata)

    framelocks = Element('graphicFrameLocks', nsprefix='a',
                         attributes={'noChangeAspect': '1'})
    framepr = Element('cNvGraphicFramePr', nsprefix='wp')
    framepr.append(framelocks)
    docpr = Element('docPr', nsprefix='wp',
        attributes={'id': picid, 'name': 'Picture 1', 'descr': picdescription})
    effectextent = Element('effectExtent', nsprefix='wp',
        attributes={'l': '25400', 't': '0', 'r': '0', 'b': '0'})
    extent = Element('extent', nsprefix='wp', attributes={'cx': width, 'cy': height})
    inline = Element('inline',
        attributes={'distT': "0", 'distB': "0", 'distL': "0", 'distR': "0"},
        nsprefix='wp')
    inline.append(extent)
    inline.append(effectextent)
    inline.append(docpr)
    inline.append(framepr)
    inline.append(graphic)
    drawing = Element('drawing')
    drawing.append(inline)
    run = Element('r')
    run.append(drawing)
    para = Element('p')   # renamed local so it doesn't shadow the paragraph() helper
    para.append(run)
    return relationshiplist, para
+
from os.path import abspath, join, dirname

# Absolute location of this package, and of the bundled docx 'template'
# skeleton (the extracted contents of an empty .docx used as a base).
PACKAGE_DIR = abspath(dirname(__file__))
TEMPLATE_DIR = join(PACKAGE_DIR, 'template')
+
# All Word prefixes / namespace matches used in document.xml & core.xml.
# lxml itself only resolves the full namespace URIs (the prefixes are never
# required), but keeping Word's own prefix names makes it easier to compare
# our output against files Word produces.
nsprefixes = {
    # Text Content
    'mv':'urn:schemas-microsoft-com:mac:vml',
    'mo':'http://schemas.microsoft.com/office/mac/office/2008/main',
    've':'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'o':'urn:schemas-microsoft-com:office:office',
    'r':'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'm':'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'v':'urn:schemas-microsoft-com:vml',
    'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10':'urn:schemas-microsoft-com:office:word',
    'wne':'http://schemas.microsoft.com/office/word/2006/wordml',
    # Drawing
    'wp':'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'a':'http://schemas.openxmlformats.org/drawingml/2006/main',
    'pic':'http://schemas.openxmlformats.org/drawingml/2006/picture',
    # Properties (core and extended)
    'cp':"http://schemas.openxmlformats.org/package/2006/metadata/core-properties",
    'dc':"http://purl.org/dc/elements/1.1/",
    'dcterms':"http://purl.org/dc/terms/",
    'dcmitype':"http://purl.org/dc/dcmitype/",
    'xsi':"http://www.w3.org/2001/XMLSchema-instance",
    'ep':'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
    # Content Types (we're just making up our own namespaces here to save time)
    'ct':'http://schemas.openxmlformats.org/package/2006/content-types',
    # Package Relationships (we're just making up our own namespaces here to save time)
    'pr':'http://schemas.openxmlformats.org/package/2006/relationships'
    }
+# -*- coding: utf-8 -*-
+import os
+from os.path import join, abspath
+from zipfile import ZipFile, ZIP_DEFLATED
+
+
def findTypeParent(element, tag):
    """ Finds first parent of element of the given type

    @param object element: etree element
    @param string tag: the (fully qualified) tag of the parent to search for

    @return object element: the found parent or None when not found
    """
    # Bug fix: the original `while True` loop dereferenced p.tag after
    # getparent() returned None at the tree root, raising AttributeError
    # instead of reaching its (unreachable) `return None`.
    p = element.getparent()
    while p is not None:
        if p.tag == tag:
            return p
        p = p.getparent()

    # Walked off the top of the tree without a match
    return None
+    
def dir_to_docx(source_dir, output_loc):
    '''Zip the contents of source_dir into a docx (OPC zip) at output_loc.

    Every file under source_dir is stored with a path relative to
    source_dir, which is the internal layout a .docx expects.

    @param string source_dir: directory containing the unpacked docx tree
    @param string output_loc: path of the .docx file to create
    '''
    # make sure the output ends up where we expect it even after chdir
    output_loc = abspath(output_loc)

    prev_dir = abspath('.')  # save previous working dir
    os.chdir(source_dir)
    try:
        docxfile = ZipFile(output_loc, mode='w', compression=ZIP_DEFLATED)
        try:
            # Add & compress support files
            files_to_ignore = set(['.DS_Store'])  # nuisance from some os's
            for dirpath, dirnames, filenames in os.walk('.'):
                for filename in filenames:
                    if filename in files_to_ignore:
                        continue
                    templatefile = join(dirpath, filename)
                    # walk('.') prefixes every path with './'; strip it so
                    # archive names are package-relative
                    archivename = templatefile[2:]
                    docxfile.write(templatefile, archivename)
        finally:
            docxfile.close()
    finally:
        # always restore the previous working dir, even on error (bug fix:
        # an exception used to leave the process chdir'd into source_dir)
        os.chdir(prev_dir)
 
     def init_eyfo(self):
 
-        if conf['whoosh_index_db_dir']:
-            d = conf['whoosh_index_db_dir']
+        if conf['index_db_dir']:
+            d = conf['index_db_dir']
             if not isdir(d): d = "d:\\temp\\whoosh"   # xxx todo: find a better directory...
-            whoosh_indexer.whoosh_index_db_dir = d
+            whoosh_indexer.index_db_dir = d
 
-        if conf['index_start_folder']:
-            d = conf['index_start_folder']
+        if conf['root_folder']:
+            d = conf['root_folder']
             if not isdir(d): d = "c:\\"   # xxx todo: find a better directory...
-            whoosh_indexer.index_start_folder = d
+            whoosh_indexer.root_folder = d
 
         if conf['windowleft'] and conf['windowtop']:
             self.SetPosition( (conf['windowleft'], conf['windowtop']) )
             return
 
         self.progress_indicator.Play()
-        root = conf['index_start_folder']
-        i = whoosh_indexer.Eyfo_index()
+        root = conf['root_folder']
+        i = whoosh_indexer.Eyfo_index( conf['index_db_dir'], clean=False )
         i.only_fnames = False  #conf['index_names_only']
         i.incremental_index( dirname=root )
         self.progress_indicator.Stop()
         wx.GetApp().Yield(True)   # lets start the progress indicator
 
         self.file_list.DeleteAllItems() # .ClearAll()
-        print conf
-        s = whoosh_indexer.Eyfo_Search()
+        s = whoosh_indexer.Eyfo_Search( conf['index_db_dir'] )
         # todo
         s.only_fnames = (self.where_to_search=='Search &File names only')
         query = self.search_words_combo.Value
     elif os.name == 'posix':
         subprocess.call(('xdg-open', f))
 
+
def dir_size(root, recursive=True):
    '''Return the total size in bytes of the files under *root*.

    @param string root: directory to measure
    @param bool recursive: when True, include files in all subdirectories;
        otherwise only the plain files directly inside *root*
    @return int: sum of file sizes in bytes
    '''
    if recursive:
        # note: inner loop variable renamed so it no longer shadows *root*
        file_walker = (
            os.path.join(top, f)
            for top, _, files in os.walk(root)
            for f in files
        )
        return sum(os.path.getsize(f) for f in file_walker)

    # Bug fix: listdir() yields bare names, so they must be joined with
    # root (the original passed cwd-relative names to getsize()), and
    # subdirectories must be skipped.
    return sum(
        os.path.getsize(os.path.join(root, name))
        for name in os.listdir(root)
        if os.path.isfile(os.path.join(root, name))
    )
+
+
+
def human_readable_numbers(num):
    '''Format a byte count as a short human-readable string.

    e.g. 2048 -> "2.0 KB", 500 -> "500.0 bytes".

    @param num: size in bytes (int or float)
    @return string: formatted value with unit
    '''
    for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Bug fix: values >= 1024 TB used to return the bare float; keep the
    # string contract consistent by formatting as petabytes.
    return "%3.1f PB" % num
+
+
def read_stdout_from_cmd(cmd, additional_env=None):
    ''' Run *cmd* and return whatever it wrote to stdout.

        cmd must be an array for unicode stuff, whose elements are encoded
        as much as possible as unicode; a plain string is passed through to
        Popen unchanged.

        @param dict additional_env: extra environment variables laid over a
            copy of os.environ (the process environment is never mutated)
        @return string: the child's stdout, or '(dry-run)' when DRY_RUN is set
        '''
    env = os.environ.copy()
    if additional_env:  # bug fix: was a shared mutable default argument ({})
        env.update(additional_env)

    log('cmd>std:', cmd)
    # isinstance instead of `type(...) is`: also accepts list/tuple subclasses
    if isinstance(cmd, (list, tuple)):
        cmd = encode_command_arguments(cmd)
    if DRY_RUN:
        return '(dry-run)'
    sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, env=env)
    return sp.stdout.read()
+
+
+
def encode_command_arguments(cmd):
    # Encode unicode arguments to the filesystem encoding so subprocess
    # (Python 2) receives byte strings; non-unicode args pass through as-is.
    # Relies on the module-level `filesystem_encoding` global.
    return [arg.encode(filesystem_encoding) if isinstance(arg,unicode) else arg for arg in cmd]
+
+
def is_unicode_file(f, inspection_level=SIMPLE):
    '''Return the detected unicode encoding name of file *f*, or False.

    At the default SIMPLE level only the first line is sniffed for a
    UTF-16 / UTF-8 byte-order mark; any other level reads up to
    MAX_SIZE_FOR_ENOCDING bytes and asks chardet. Python 2 byte-string
    comparisons are assumed throughout.
    '''
    log('is_unicode_file(%s)' % f)
    s = open(f,'rb').readline()
    if len(s)<2:
        # shorter than the smallest BOM - nothing to detect
        err('is_unicode_file: too small (%d)' % len(s))
        return False
    elif inspection_level==SIMPLE:
        # BOM sniffing: FF FE -> UTF-16 (LE), EF BB BF -> UTF-8
        if len(s)>=2 and s[:2]=='\xff\xfe': return 'utf-16'
        elif len(s)>=3 and s[:3]=='\xef\xbb\xbf': return 'utf-8'
    else:
        # deep inspection: let chardet guess from a larger sample
        fp = open(f, 'rb')
        s = fp.read(MAX_SIZE_FOR_ENOCDING)
        ret = chardet.detect(s)
        enc = ret['encoding']
        if enc in ['utf-16', 'utf-8', 'utf16', 'utf8', ]:
            return enc
    # no BOM found / chardet reported a non-unicode encoding
    return False
+
+
+
def err(s, newline=u'\n'):
    '''Write *s* (coerced to unicode) plus *newline* to stderr.

    Best effort: if the terminal cannot render the message (encoding
    errors are the usual cause), a placeholder is printed instead of
    letting the logging call crash the caller.
    '''
    try:
        sys.stderr.write(unicode(s) + newline)
    except Exception:  # bug fix: was a bare except (also swallowed SystemExit/KeyboardInterrupt)
        print('log msg (not printable on terminal)')
+
+
+
def mmm():
    # debugging stub left in the module; just prints a marker (Python 2 print)
    print "ahhh"

persistant_dict.py

 
    def __init__(self, fname):
        # fname: path of the file backing this persistent dict (bootstrapped
        # to an empty '{}' when absent). can_save gates the automatic save()
        # performed by __setitem__, so bulk updates can batch their writes.
        self.fname=fname
        self.can_save=True
        if not isfile(fname):
            open(fname,'w').write('{}')
        else:
            self.load()
 
+
     def __setitem__(self, key, value):
         x = super(persistant_dict, self).__setitem__(key, value)
-        self.save()
+        if self.can_save: self.save()
         return x
 
     def __delitem__(self, key):
         return None
 
    def update(self, *args, **kwargs):
        # batch update: suppress the per-key save() triggered by __setitem__,
        # then persist once at the end
        self.can_save = False
        super(persistant_dict, self).update(*args, **kwargs)
        self.can_save = True
        self.save()
+
+
+    def update_if_not_empty(self, d):
+        ''' update values only if a value is neither None nor Empty String'''
+        self.can_save = False
+        for k in d:
+            if d[k] is not None and d[k]!='':
+                self[k] = d[k]
+        self.can_save = True
         self.save()
 
     def update_missing(self, d):
         ''' update values only if they do not exist in object to update '''
+        self.can_save
         for k in d:
             if k not in self:
                 #self.__setitem__( k, d[k] )
                 self[k] = d[k]
+        self.can_save = True
         self.save()
 
     # --- additions to dict ------
 #import file_types
-#def archive(filename, temp_dir)
-from whooshlib import *
-#from html2text import html2text
-from library import html2text, get_html_headers
+from library import html2text, get_html_headers, err
 
 extensions = ['html', 'htm', ]
 
-from library import read_stdout_from_cmd
-#from library import html2text
-#from library import get_html_headers
-from whooshlib import err
+from library import read_stdout_from_cmd, err
 import re, sys
 import chardet
 
 import re
-from whooshlib import *
 #from html2text import html2text
-from library import html2text, get_html_headers
+from library import err, html2text, get_html_headers
 
 extensions = ['sql', ]
 namereg = re.compile('`[a-z0-9_]+?`', re.I+re.M+re.S)
 #import file_types
 import library
-from library import detect_encoding
-from whooshlib import err
+from library import detect_encoding, err
 import re
 
 library.DEBUG= True
-from whooshlib import err
+from library import err
 import zipfile
 #import file_types
 #def archive(filename, temp_dir)

whoosh_indexer.py

 '''
 
 import os
-from os.path import join, abspath
+from os.path import expanduser, isdir, isfile, join, basename, abspath, splitext
 from os import listdir
 import sys
 import re
 import time
 from fnmatch import fnmatch as shell_pattern_match
+from argparse import ArgumentParser
+from argparse import RawDescriptionHelpFormatter as formatter
+from library import err
+
 
 
 def install_dependencies():
 
 installed=[]
 modules = {  #   m: mandatory, o: optional, l: local
-    'whooshlib': 'm',
     'dateutil': 'o',
     'lxml': 'o',
     'xlrd': 'o',
 from library import import_file, PKZIP_PREFIX, decode_filename, filelist_to_html_index
 from library import encode_command_arguments, decode_command_arguments, intersection, find_user_dir
 
-from whooshlib import *
 from whoosh.index import open_dir
 from whoosh.index import create_in
 from whoosh.fields import *
 from whoosh.filedb.filestore import FileStorage
 from whoosh.writing import BufferedWriter
 
-PLUGINS_SUB_DIR = '.'
-TEST_DIR = PLUGINS_SUB_DIR
-MAX_RESULTS = 2000
-MAXSIZE=1
-TEST=2
-JUNK = 4
-SIMPLE = 0
-
-whoosh_index_db_dir = 'c:\\temp'       # bad defaults
-index_start_folder = 'C:\\'     # bad defaults
-exclude_patterns = []
-
-
 
class D(UserDict.UserDict):
    # dict-like where looking up an absent key yields None via __missing__
    # (used below for the per-extension metadata table)
    def __missing__(self, item):
        return None
 
-exts = D({
-    'css': D( { MAXSIZE: 0.5, TEST: u'background-url', JUNK: ['width', 'height', 'background', 'color', ], }),
-    'doc': D( { MAXSIZE: 90, TEST: u'מסויים', JUNK: [ ], }),
-    'docx': D( { MAXSIZE: 10, TEST: u'Saadat', JUNK: [ ], }),
-    'htm': D( { MAXSIZE: 1.8, TEST: u'בדיקה', JUNK: [ ], }),
-    'html': D( { MAXSIZE: 1.8, TEST: u'בדיקה', JUNK: [ ], }),
-    'php': D( { MAXSIZE: 0.5, TEST: u'require', JUNK: ['foreach', 'case', 'true', 'false', 'include', 'for', ], }),
-    'py': D( { MAXSIZE: 0.7, TEST: u'include', JUNK: ['def', 'from', 'type',  ], }),
-    'txt': D( { MAXSIZE: 0.5, TEST: u'derigion', JUNK: [ ], }),
-    'xls': D( { MAXSIZE: 4, TEST: u'macedonia', JUNK: [ ], }),
-    'sql': D( { MAXSIZE: 4000, JUNK: ['table', 'update', 'select', 'join', 'create', 'drop', 'left', 'right', 'in', 'where', 'group', 'by', 'order', 'ignore', ], }),
-    'zip': D( { MAXSIZE: 500, TEST: u'veronica', JUNK: ['readme', 'descript', 'ion', ], }),
-})
-exts['wbk'] = exts['doc'].copy()
-parseable_exts = []
-quick_hack_junk_words = ['for', 'as', 'at', 'is', 'are', 'was', 'lol', ':)', 'if',  ]
-
 
 def log_type_and_encoding(varname, value):
     print '-------' + varname + '----------'
             except:
                 err('error in test')
 
-parser_functions = {}
-def load_plugins():
+def load_plugins( raise_hell=False ):
     #plugins_dir = join(os.path.split(sys.argv[0])[0], PLUGINS_SUB_DIR)
     plugins_dir = os.path.split(sys.argv[0])[0]
     if plugins_dir == '': plugins_dir = '.'
     for f in listdir(plugins_dir):
         if f.startswith('plugin_') and f.endswith('.py'):
             f = join(abspath(plugins_dir), f)
-            err('loading plugin ' + f)
-            try:
+            if raise_hell:
                 m = import_file(f)
-            except:
-                err('error in import! skipping '+f)
-                continue
+            else:
+                try:
+                    m = import_file(f)
+                except:
+                    err('error in import! skipping '+f+'   <--------------<')
+                    continue
             #m = __import__(f)
             #import modulename as m
             exts = m.extensions
             for ext in exts:
                 parser_functions[ext] = func
                 parseable_exts.append(ext)
+            err('loading plugin ' + f + ' : great success!')
 
 
-class Eyfo_Base:
 
-    def open_or_create_db(self):
-        #try:
-        storage = FileStorage(self.index_db_dir)        # Create an index
 
-        if self.clean:
-            err('creating a NEW index at:'+ self.index_db_dir)
-            self.ix = storage.create_index(self.schema)
-
-        else:
-            err('opening index from:'+ self.index_db_dir)
-            self.ix = storage.open_index()
-
-
-
-    def __init__(self, clean=False, test=False):
+class Eyfo_Base:
 
+    def __init__(self, index_db_dir, clean=False, test=False):
+        '''  '''
         self.schema = Schema(
             titles=TEXT(stored=True),
             body=TEXT(stored=True),
         self.clean = clean
         self.test = test
         self.only_fnames = False
-        self.index_db_dir = whoosh_index_db_dir
-        print '---------',self.index_db_dir
-        self.root = index_start_folder
+        self.index_db_dir = index_db_dir
+        self.root = root_folder
         self.excludes = ['tmp', 'bak', 'swp', 'pyc', 'pyo', ]
 
         if not self.test:
 
 
    def initialize_index(self):
        '''Make sure the index directory exists, then open (or create) the index.'''
        # debug trace left in on purpose (Python 2 print statement)
        print 'self.index_db_dir',self.index_db_dir
        if not isdir(self.index_db_dir):
            os.makedirs(self.index_db_dir)
        self.open_or_create_db()
 
 
    def open_or_create_db(self):
        '''Bind self.ix to the whoosh index in self.index_db_dir.

        When self.clean is set a brand-new index is created from
        self.schema, otherwise the existing index is opened.
        '''
        storage = FileStorage(self.index_db_dir)        # whoosh on-disk storage

        if self.clean:
            err('creating a NEW index at:'+ self.index_db_dir)
            self.ix = storage.create_index(self.schema)

        else:
            err('opening index from:'+ self.index_db_dir)
            self.ix = storage.open_index()
+
+
+
 class FileData:
     body = u''
     titles = []
     ext=u''
 
     def __init__(self, f=''):
+        '''  '''
         self.fname=f
 
     def titlesstr(self):
+        '''  '''
         try:
             return u' ; \n'.join([unicode(t) for t in self.titles])
         except:
             return ''
 
     def __repr__(self):
+        '''  '''
         s = clean_text(self.body)[:70] + clean_text(self.titlesstr())[:70]
         return s
 
+
+
+
 def clean_text(s):
     s = s.replace('\n', ' ')
     s = s.replace('  ', ' ')
     return s
 
 class Eyfo_index(Eyfo_Base):
-
+    '''  '''
 
    def default_parser(self, f, data):
        '''Fallback parser: read the raw bytes of *f*, run them through
        convert_encoding(), and store the result on data.body.'''
        data.body = open(f, 'rb').read()
        data.body = convert_encoding(data.body)
 
 
     def add_doc(self, writer, f):
-
+        '''  '''
         data = FileData(f)
         err(f)
-        '''
-        try:
+        '''try:
             err(' + ' + f)
         except:
             f = guess_encoding(f, ['cp862', 'cp1255', 'utf-8', 'utf-16', 'latin-1'])
-            err(' + ' + f)
-        '''
+            err(' + ' + f)'''
 
         if data.ext in parseable_exts and not self.only_fnames:
             parser_function = parser_functions[data.ext]
 
         return data
 
+
+
+
     def file_should_be_indexed(self, f):
+        '''  '''
         ext = get_file_extension(f)
 
         if ext in self.excludes:
         return True
 
 
-    def index_files(self, files_to_index=False, indexed_paths=[]):
-        #writer = self.ix.writer()
 
+    def index_files(self, files_to_index=False, indexed_paths=[]):
+        '''  '''
         if self.gui:
             import wx
 
-        try:
+        try:  # only to catch Ctrl-c
+
             writer = BufferedWriter(self.ix, period=120, limit=100)
 
             for root, subFolders, files in os.walk(unicode(self.root)):
                         self.add_doc(writer, f)
 
                     if self.gui:
-                        wx.GetApp().Yield(True)
+                        wx.GetApp().Yield(True)  # i wish it was helpful..
 
                 # commit every subdir
                 writer.commit()
+
         except KeyboardInterrupt:
             err('pressed Ctrl-C')
 
         writer.close()
 
 
-    def incremental_index(self, dirname='' ):
+
+    def incremental_index(self, root_dir='' ):
+        '''  '''
         searcher = self.ix.searcher()
-        if dirname:
-            self.root = dirname
+        if root_dir:
+            if isdir(root_dir):
+                self.root = root_dir
+            else:
+                print 'error! root_dir <%s> is not a directory.' % root_dir
+                return False
 
         # The set of all paths in the index
         indexed_paths = set()
         # -- last change, 07/08/2011 -- writer = self.ix.writer()
 
         # Loop over the stored fields in the index
+        count = 0
         for fields in searcher.all_stored_fields():
             indexed_path = fields['path']
             indexed_paths.add(indexed_path)
-
+            err(count, '  \r')
             if not os.path.exists(indexed_path):
                 # This file was deleted since it was indexed
                 writer.delete_by_term('path', indexed_path)
-
+                count -= 1
             else:
                 # Check if this file was changed since it was indexed
                 indexed_time = fields['time']
                 mtime = os.path.getmtime(indexed_path)
-                #print "mtime ",mtime
-                #print "indexed_time",indexed_time
-                #if mtime > indexed_time:
+                if mtime< 1:
+                    continue
+                    # that is usually a "bug" in the file
+
                 if time.localtime(mtime) > indexed_time.timetuple():
-                    # The file has changed, delete it and add it to the list of
-                    # files to reindex
+                    # The file changed, delete it and add it to the list of files to reindex
                     writer.delete_by_term('path', indexed_path)
                     to_index.add(indexed_path)
+                    count+=1
                 else:
                     print 'time not changed'
 
         writer.commit()
         #writer.cancel()
 
-        self.index_files( files_to_index=to_index, indexed_paths=indexed_paths)
-
+        count = self.index_files( files_to_index=to_index, indexed_paths=indexed_paths)
         #writer.commit()
         #writer.close()
 
+        return count
+
 
 
 class Eyfo_Search(Eyfo_Base):
 
     def search(self, q, gui=False):
+        '''  '''
         searcher = self.ix.searcher()
         q = unicode(q)
 
         results.upgrade_and_extend(allresults)
         '''
 
-
         searcher.close()
         return out
 
 
-load_plugins()
 
 def usage(s=''):
     print '''usage:
 
-    whoosh_indexer.py [action] [args...]
+    '''
+    print s
+    sys.exit()
 
-    actions:
-        index (or "i")
-        reindex (or "r")
-        search (or "s")
-        test (or "t")
 
-    args (must come at the end)
-        only-fnames == don't parse any content, just read filenames
-        no-parse == same
 
-    e.g.
-     > whoosh_indexer.py reindex e:\\path\\files\\
-     > whoosh_indexer.py search momo turtle
-     > whoosh_indexer.py s momo turtle
-     > whoosh_indexer.py t "path\\filename.ext"
 
-    '''
-    print s
-    sys.exit()
 
+def process_cmd_line_args(argv):
+    desc= ''' hello! '''
+    epilog = ''' goodbye. '''
 
-if __name__=='__main__':
+    parser = ArgumentParser( formatter_class=formatter, description=desc, epilog=epilog )
+    parser.add_argument("--action", dest="action",                              default='', help='possible actions:   index (start a new index),  reindex (refresh existing index),  search (by query), test, info')
+    parser.add_argument("--config-file", dest="config_file",                    default='', help="use specific config file")
+    parser.add_argument("--log-file", dest="log_file",                          default='eyfo.log', help="use specific log file")
+    parser.add_argument("--root-dir", dest="root_dir",                          default=None, help='root folder for importing images from')
+    parser.add_argument("--index-db-dir", dest="index_db_dir",                  default=None, help='where the index_db is')
+    parser.add_argument("--silent", action='store_true', dest='silent',         default=False, help="don't print progress details")
+    parser.add_argument("--file-names", action='store_true', dest='only_fnames', default=False, help="search / index only names of files, not their contents")
+    parser.add_argument('query', metavar='query', type=str, nargs='*',          default='', help='search query')
+
+
+    results = parser.parse_args()
+    args = dict(results._get_kwargs())
+    if not args['action']:
+        print "\nMUST have at least --action=xxxx !"
+        parser.print_help()
+        sys.exit(2)
+    return args
+
+
+
+
# =======================================================

# --- defaults ---
# will be overwritten with config file values
PLUGINS_SUB_DIR = '.'
TEST_DIR = PLUGINS_SUB_DIR
MAX_RESULTS = 2000
index_db_dir = 'c:\\temp'       # bad defaults
root_folder = 'C:\\'            # bad defaults
exclude_patterns = []

# --- keys of the per-extension metadata entries in `exts` below ---
# (SIMPLE is also used as the cheap inspection level elsewhere)
MAXSIZE=1
TEST=2
JUNK = 4
SIMPLE = 0

# --- that should probably go to a config file, too
# Per-extension tuning: MAXSIZE caps the file size worth parsing (presumably
# MB - TODO confirm), TEST is a word expected in the sample test file, and
# JUNK lists words to ignore while testing.
exts = D({
    'css': D( { MAXSIZE: 0.5, TEST: u'background-url', JUNK: ['width', 'height', 'background', 'color', ], }),
    'doc': D( { MAXSIZE: 90, TEST: u'מסויים', JUNK: [ ], }),
    'docx': D( { MAXSIZE: 10, TEST: u'Saadat', JUNK: [ ], }),
    'htm': D( { MAXSIZE: 1.8, TEST: u'בדיקה', JUNK: [ ], }),
    'html': D( { MAXSIZE: 1.8, TEST: u'בדיקה', JUNK: [ ], }),
    'php': D( { MAXSIZE: 0.5, TEST: u'require', JUNK: ['foreach', 'case', 'true', 'false', 'include', 'for', ], }),
    'py': D( { MAXSIZE: 0.7, TEST: u'include', JUNK: ['def', 'from', 'type',  ], }),
    'txt': D( { MAXSIZE: 0.5, TEST: u'derigion', JUNK: [ ], }),
    'xls': D( { MAXSIZE: 4, TEST: u'macedonia', JUNK: [ ], }),
    'sql': D( { MAXSIZE: 4000, JUNK: ['table', 'update', 'select', 'join', 'create', 'drop', 'left', 'right', 'in', 'where', 'group', 'by', 'order', 'ignore', ], }),
    'zip': D( { MAXSIZE: 500, TEST: u'veronica', JUNK: ['readme', 'descript', 'ion', ], }),
})

exts['wbk'] = exts['doc'].copy()   # it's exactly the same format
parseable_exts = []
quick_hack_junk_words = ['for', 'as', 'at', 'is', 'are', 'was', 'lol', ':)', 'if',  ]

# ext -> parser callable, filled in by load_plugins()
parser_functions = {}
load_plugins( True )   # True == raise_hell: let plugin import errors propagate
 
-    only_fnames_args = ['-fn', 'np', 'no-parse', 'only-fnames', '--only-filenames']
-    only_fnames = intersection(sys.argv, only_fnames_args)
+
+
+
+
+if __name__=='__main__':
 
     from eyfo_defaults import default_config_values, CONF_FILE_BASE_NAME
     from persistant_dict import persistant_dict
     import library
+    import pprint
+    pp = pprint.PrettyPrinter(indent=3)
+
+    args = process_cmd_line_args(sys.argv)
     configfile = library.find_user_dir( particular_file=CONF_FILE_BASE_NAME, return_file_or_home='file', accept_current_dir=True, create_if_not_exist=True )
     conf = persistant_dict( configfile )
     conf.update_missing( default_config_values )
-    whoosh_index_db_dir = conf['whoosh_index_db_dir']
-    index_start_folder = conf['index_start_folder']
+
+    conf.update_if_not_empty(args)
+
+    action = conf['action']
     exclude_patterns = conf['exclude_patterns']
 
+    if action in ['info', 'version']:
+        print __doc__
+        pp.pprint(conf)
+        d = conf['index_db_dir']
+        print 'index size: ', library.human_readable_numbers( library.dir_size( d ) )
+        print 'index  dir: ', d
+        sys.exit(0)
+
+
     if action in ['test', 't', ]:
         arg=''
         if len(sys.argv) > 2:
             arg = sys.argv[2]
         test_plugins(arg)
+        sys.exit(0)
+
+    for d in ['root_dir', 'index_db_dir']:
+        if not conf[d]:
+            print 'error: %s is empty. Please provide a valid folder (existing or new) on the command line!' % d
+            sys.exit(4)
+
+
+    if action in ['index', 'i']:
+        if isfile(conf['index_db_dir']):
+            usage(conf['index_db_dir'], " is a file, not a folder.")
+        elif not isdir(conf['index_db_dir']):
+            print 'dir not exist, but will be created'
+        else:
+            if not conf['index_db_dir']:
+                usage("error: 'index_db_dir' in config file does not exist, and no index_db_dir provided in the command line")
+        if not isdir(conf['root_folder']):
+            usage("error - root folder not found")
+
+        # remember for next time
+        i = Eyfo_index( conf['index_db_dir'], clean=True )
+        i.root = conf['root_folder']
+        i.only_fnames = conf['only_fnames']
+        i.index_files( indexed_paths = conf['root_folder'])
+        sys.exit(0)
+
 
-    elif action in ['index', 'i']:
-        if len(sys.argv)<3: usage('not enough args for "index"')
-        root = sys.argv[2]
-        i = Eyfo_index(clean=True)
-        i.root = root
-        i.only_fnames = only_fnames
-        i.index_files()
     elif action in ['reindex', 'r']:
-        if len(sys.argv)<3: usage('not enough args for "reindex"')
-        root = sys.argv[2]
-        i = Eyfo_index()
+        i = Eyfo_index( conf['index_db_dir'], clean=False )
         i.gui = False
-        i.only_fnames = only_fnames
-        i.incremental_index( dirname=root )
+        i.only_fnames = conf['only_fnames']
+        i.incremental_index( root_dir = conf['root_folder'])
+
 
     elif action in ['s', 'search', 'find', ]:
-        if len(sys.argv)<3: usage('not enough args for "search"')
-        s = Eyfo_Search()
-        s.only_fnames = only_fnames
-        q = ' '.join(sys.argv[2:])
+        if not isdir(conf['root_dir']):
+            print 'root dir not exist. where is your whoosh db?!'
+            sys.exit(3)
+        s = Eyfo_Search( conf['index_db_dir'] )
+        s.only_fnames = conf['only_fnames']
+        q = conf['query'].join(' ')
         s.search( q, gui=False )
 
     elif action in ['gui', 'search-gui', 'find-gui', ]:
         q=q.strip()
 
         if q:
-
+            # -- woohoo! we got a query! ---
             open(lastqf, 'w').write(q)   # .encode('utf-8'))
 
             s = Eyfo_Search()
             txt = s.search( q, gui=True )
-            #print txt
             html= filelist_to_html_index(txt.splitlines(), title=q, header1=q, subtitle="Eyfo indexer" )
-            #html = unicode(html, 'utf-8', 'replace')
             tmp = tempfile.mktemp(prefix='whoosh', suffix="-index.html")
-            #print html
             open(tmp,'wb').write(html.encode('utf-8', 'ascii'))
-            #cmd = 'showhtml "%s"' % tmp
             cmd = 'start %s' % tmp
             os.system(cmd)
 
 
-    else:
+    else:  # user didn't provide us with valid command line
         usage()
 
+
+
+
+from os import path
+
+from .info import __VERSION__
+
+# <p>Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd</p>
+# <p>This module is part of the xlrd package, which is released under a
+# BSD-style licence.</p>
+
+from . import licences
+
+##
+# <p><b>A Python module for extracting data from MS Excel (TM) spreadsheet files.
+# <br /><br />
+# Version 0.7.4 -- April 2012
+# </b></p>
+#
+# <h2>General information</h2>
+#
+# <h3>Acknowledgements</h3>
+#
+# <p>
+# Development of this module would not have been possible without the document
+# "OpenOffice.org's Documentation of the Microsoft Excel File Format"
+# ("OOo docs" for short).
+# The latest version is available from OpenOffice.org in
+# <a href=http://sc.openoffice.org/excelfileformat.pdf> PDF format</a>
+# and
+# <a href=http://sc.openoffice.org/excelfileformat.odt> ODT format.</a>
+# Small portions of the OOo docs are reproduced in this
+# document. A study of the OOo docs is recommended for those who wish a
+# deeper understanding of the Excel file layout than the xlrd docs can provide.
+# </p>
+#
+# <p>Backporting to Python 2.1 was partially funded by
+#   <a href=http://journyx.com/>
+#       Journyx - provider of timesheet and project accounting solutions.
+#   </a>
+# </p>
+#
+# <p>Provision of formatting information in version 0.6.1 was funded by
+#   <a href=http://www.simplistix.co.uk>
+#       Simplistix Ltd.
+#   </a>
+# </p>
+#
+# <h3>Unicode</h3>
+#
+# <p>This module presents all text strings as Python unicode objects.
+# From Excel 97 onwards, text in Excel spreadsheets has been stored as Unicode.
+# Older files (Excel 95 and earlier) don't keep strings in Unicode;
+# a CODEPAGE record provides a codepage number (for example, 1252) which is
+# used by xlrd to derive the encoding (for same example: "cp1252") which is
+# used to translate to Unicode.</p>
+# <small>
+# <p>If the CODEPAGE record is missing (possible if the file was created
+# by third-party software), xlrd will assume that the encoding is ascii, and keep going.
+# If the actual encoding is not ascii, a UnicodeDecodeError exception will be raised and
+# you will need to determine the encoding yourself, and tell xlrd:
+# <pre>
+#     book = xlrd.open_workbook(..., encoding_override="cp1252")
+# </pre></p>
+# <p>If the CODEPAGE record exists but is wrong (for example, the codepage
+# number is 1251, but the strings are actually encoded in koi8_r),
+# it can be overridden using the same mechanism.
+# The supplied runxlrd.py has a corresponding command-line argument, which
+# may be used for experimentation:
+# <pre>
+#     runxlrd.py -e koi8_r 3rows myfile.xls
+# </pre></p>
+# <p>The first place to look for an encoding ("codec name") is
+# <a href=http://docs.python.org/lib/standard-encodings.html>
+# the Python documentation</a>.
+# </p>
+# </small>
+#
+# <h3>Dates in Excel spreadsheets</h3>
+#
+# <p>In reality, there are no such things. What you have are floating point
+# numbers and pious hope.
+# There are several problems with Excel dates:</p>
+#
+# <p>(1) Dates are not stored as a separate data type; they are stored as
+# floating point numbers and you have to rely on
+# (a) the "number format" applied to them in Excel and/or
+# (b) knowing which cells are supposed to have dates in them.
+# This module helps with (a) by inspecting the
+# format that has been applied to each number cell;
+# if it appears to be a date format, the cell
+# is classified as a date rather than a number. Feedback on this feature,
+# especially from non-English-speaking locales, would be appreciated.</p>
+#
+# <p>(2) Excel for Windows stores dates by default as the number of
+# days (or fraction thereof) since 1899-12-31T00:00:00. Excel for
+# Macintosh uses a default start date of 1904-01-01T00:00:00. The date
+# system can be changed in Excel on a per-workbook basis (for example:
+# Tools -> Options -> Calculation, tick the "1904 date system" box).
+# This is of course a bad idea if there are already dates in the
+# workbook. There is no good reason to change it even if there are no
+# dates in the workbook. Which date system is in use is recorded in the
+# workbook. A workbook transported from Windows to Macintosh (or vice
+# versa) will work correctly with the host Excel. When using this
+# module's xldate_as_tuple function to convert numbers from a workbook,
+# you must use the datemode attribute of the Book object. If you guess,
+# or make a judgement depending on where you believe the workbook was
+# created, you run the risk of being 1462 days out of kilter.</p>
+#
+# <p>Reference:
+# http://support.microsoft.com/default.aspx?scid=KB;EN-US;q180162</p>
+#
+#
+# <p>(3) The Excel implementation of the Windows-default 1900-based date system works on the
+# incorrect premise that 1900 was a leap year. It interprets the number 60 as meaning 1900-02-29,
+# which is not a valid date. Consequently any number less than 61 is ambiguous. Example: is 59 the
+# result of 1900-02-28 entered directly, or is it 1900-03-01 minus 2 days? The OpenOffice.org Calc
+# program "corrects" the Microsoft problem; entering 1900-02-27 causes the number 59 to be stored.
+# Save as an XLS file, then open the file with Excel -- you'll see 1900-02-28 displayed.</p>
+#
+# <p>Reference: http://support.microsoft.com/default.aspx?scid=kb;en-us;214326</p>
+#
+# <p>(4) The Macintosh-default 1904-based date system counts 1904-01-02 as day 1 and 1904-01-01 as day zero.
+# Thus any number such that (0.0 <= number < 1.0) is ambiguous. Is 0.625 a time of day (15:00:00),
+# independent of the calendar,
+# or should it be interpreted as an instant on a particular day (1904-01-01T15:00:00)?
+# The xldate_* functions in this module
+# take the view that such a number is a calendar-independent time of day (like Python's datetime.time type) for both
+# date systems. This is consistent with more recent Microsoft documentation
+# (for example, the help file for Excel 2002 which says that the first day
+# in the 1904 date system is 1904-01-02).</p>
+#
+# <p>(5) Usage of the Excel DATE() function may leave strange dates in a spreadsheet. Quoting the help file,
+# in respect of the 1900 date system: "If year is between 0 (zero) and 1899 (inclusive),
+# Excel adds that value to 1900 to calculate the year. For example, DATE(108,1,2) returns January 2, 2008 (1900+108)."
+# This gimmick, semi-defensible only for arguments up to 99 and only in the pre-Y2K-awareness era,
+# means that DATE(1899, 12, 31) is interpreted as 3799-12-31.</p>
+#
+# <p>For further information, please refer to the documentation for the xldate_* functions.</p>
+#
+# <h3> Named references, constants, formulas, and macros</h3>
+#
+# <p>
+# A name is used to refer to a cell, a group of cells, a constant
+# value, a formula, or a macro. Usually the scope of a name is global
+# across the whole workbook. However it can be local to a worksheet.
+# For example, if the sales figures are in different cells in
+# different sheets, the user may define the name "Sales" in each
+# sheet. There are built-in names, like "Print_Area" and
+# "Print_Titles"; these two are naturally local to a sheet.
+# </p><p>
+# To inspect the names with a user interface like MS Excel, OOo Calc,
+# or Gnumeric, click on Insert/Names/Define. This will show the global
+# names, plus those local to the currently selected sheet.
+# </p><p>
+# A Book object provides two dictionaries (name_map and
+# name_and_scope_map) and a list (name_obj_list) which allow various
+# ways of accessing the Name objects. There is one Name object for
+# each NAME record found in the workbook. Name objects have many
+# attributes, several of which are relevant only when obj.macro is 1.
+# </p><p>
+# In the examples directory you will find namesdemo.xls which
+# showcases the many different ways that names can be used, and
+# xlrdnamesAPIdemo.py which offers 3 different queries for inspecting
+# the names in your files, and shows how to extract whatever a name is
+# referring to. There is currently one "convenience method",
+# Name.cell(), which extracts the value in the case where the name
+# refers to a single cell. More convenience methods are planned. The
+# source code for Name.cell (in __init__.py) is an extra source of
+# information on how the Name attributes hang together.
+# </p>
+#
+# <p><i>Name information is <b>not</b> extracted from files older than
+# Excel 5.0 (Book.biff_version < 50)</i></p>
+#
+# <h3>Formatting</h3>
+#
+# <h4>Introduction</h4>
+#
+# <p>This collection of features, new in xlrd version 0.6.1, is intended
+# to provide the information needed to (1) display/render spreadsheet contents
+# (say) on a screen or in a PDF file, and (2) copy spreadsheet data to another
+# file without losing the ability to display/render it.</p>
+#
+# <h4>The Palette; Colour Indexes</h4>
+#
+# <p>A colour is represented in Excel as a (red, green, blue) ("RGB") tuple
+# with each component in range(256). However it is not possible to access an
+# unlimited number of colours; each spreadsheet is limited to a palette of 64 different
+# colours (24 in Excel 3.0 and 4.0, 8 in Excel 2.0). Colours are referenced by an index
+# ("colour index") into this palette.
+#
+# Colour indexes 0 to 7 represent 8 fixed built-in colours: black, white, red, green, blue,
+# yellow, magenta, and cyan.</p>
+#
+# The remaining colours in the palette (8 to 63 in Excel 5.0 and later)
+# can be changed by the user. In the Excel 2003 UI, Tools/Options/Color presents a palette
+# of 7 rows of 8 colours. The last two rows are reserved for use in charts.<br />
+# The correspondence between this grid and the assigned
+# colour indexes is NOT left-to-right top-to-bottom.<br />
+# Indexes 8 to 15 correspond to changeable
+# parallels of the 8 fixed colours -- for example, index 7 is forever cyan;
+# index 15 starts off being cyan but can be changed by the user.<br />
+#
+# The default colour for each index depends on the file version; tables of the defaults
+# are available in the source code. If the user changes one or more colours,
+# a PALETTE record appears in the XLS file -- it gives the RGB values for *all* changeable
+# indexes.<br />
+# Note that colours can be used in "number formats": "[CYAN]...." and "[COLOR8]...." refer
+# to colour index 7; "[COLOR16]...." will produce cyan
+# unless the user changes colour index 15 to something else.<br />
+#
+# <p>In addition, there are several "magic" colour indexes used by Excel:<br />
+# 0x18 (BIFF3-BIFF4), 0x40 (BIFF5-BIFF8): System window text colour for border lines
+# (used in XF, CF, and WINDOW2 records)<br />
+# 0x19 (BIFF3-BIFF4), 0x41 (BIFF5-BIFF8): System window background colour for pattern background
+# (used in XF and CF records )<br />
+# 0x43: System face colour (dialogue background colour)<br />
+# 0x4D: System window text colour for chart border lines<br />
+# 0x4E: System window background colour for chart areas<br />
+# 0x4F: Automatic colour for chart border lines (seems to be always Black)<br />
+# 0x50: System ToolTip background colour (used in note objects)<br />
+# 0x51: System ToolTip text colour (used in note objects)<br />
+# 0x7FFF: System window text colour for fonts (used in FONT and CF records)<br />
+# Note 0x7FFF appears to be the *default* colour index. It appears quite often in FONT
+# records.<br />
+#
+# <h4>Default Formatting</h4>
+#
+# Default formatting is applied to all empty cells (those not described by a cell record).
+# Firstly row default information (ROW record, Rowinfo class) is used if available.
+# Failing that, column default information (COLINFO record, Colinfo class) is used if available.
+# As a last resort the worksheet/workbook default cell format will be used; this
+# should always be present in an Excel file,
+# described by the XF record with the fixed index 15 (0-based). By default, it uses the
+# worksheet/workbook default cell style, described by the very first XF record (index 0).
+#
+# <h4> Formatting features not included in xlrd version 0.6.1</h4>
+# <ul>
+#   <li>Rich text i.e. strings containing partial <b>bold</b> <i>italic</i>
+#       and <u>underlined</u> text, change of font inside a string, etc.
+#       See OOo docs s3.4 and s3.2.
+#       <i> Rich text is included in version 0.7.2</i></li>
+#   <li>Asian phonetic text (known as "ruby"), used for Japanese furigana. See OOo docs
+#       s3.4.2 (p15)</li>
+#   <li>Conditional formatting. See OOo docs
+#       s5.12, s6.21 (CONDFMT record), s6.16 (CF record)</li>
+#   <li>Miscellaneous sheet-level and book-level items e.g. printing layout, screen panes. </li>
+#   <li>Modern Excel file versions don't keep most of the built-in
+#       "number formats" in the file; Excel loads formats according to the
+#       user's locale. Currently xlrd's emulation of this is limited to
+#       a hard-wired table that applies to the US English locale. This may mean
+#       that currency symbols, date order, thousands separator, decimals separator, etc
+#       are inappropriate. Note that this does not affect users who are copying XLS
+#       files, only those who are visually rendering cells.</li>
+# </ul>
+#
+# <h3>Loading worksheets on demand</h3>
+#
+# <p>This feature, new in version 0.7.1, is governed by the on_demand argument
+# to the open_workbook() function and allows saving memory and time by loading
+# only those sheets that the caller is interested in, and releasing sheets
+# when no longer required.</p>
+#
+# <p>on_demand=False (default): No change. open_workbook() loads global data
+# and all sheets, releases resources no longer required (principally the
+# str or mmap object containing the Workbook stream), and returns.</p>
+#
+# <p>on_demand=True and BIFF version < 5.0: A warning message is emitted,
+# on_demand is recorded as False, and the old process is followed.</p>
+#
+# <p>on_demand=True and BIFF version >= 5.0: open_workbook() loads global
+# data and returns without releasing resources. At this stage, the only
+# information available about sheets is Book.nsheets and Book.sheet_names().</p>
+#
+# <p>Book.sheet_by_name() and Book.sheet_by_index() will load the requested
+# sheet if it is not already loaded.</p>
+#
+# <p>Book.sheets() will load all/any unloaded sheets.</p>
+#
+# <p>The caller may save memory by calling
+# Book.unload_sheet(sheet_name_or_index) when finished with the sheet.
+# This applies irrespective of the state of on_demand.</p>
+#
+# <p>The caller may re-load an unloaded sheet by calling Book.sheet_by_xxxx()
+#  -- except if those required resources have been released (which will
+# have happened automatically when on_demand is false). This is the only
+# case where an exception will be raised.</p>
+#
+# <p>The caller may query the state of a sheet:
+# Book.sheet_loaded(sheet_name_or_index) -> a bool</p>
+#
+# <p> Book.release_resources() may be used to save memory and close
+# any memory-mapped file before proceeding to examine already-loaded
+# sheets. Once resources are released, no further sheets can be loaded.</p>
+#
+# <p> When using on-demand, it is advisable to ensure that
+# Book.release_resources() is always called even if an exception
+# is raised in your own code; otherwise if the input file has been
+# memory-mapped, the mmap.mmap object will not be closed and you will
+# not be able to access the physical file until your Python process
+# terminates. This can be done by calling Book.release_resources()
+# explicitly in the finally suite of a try/finally block.
+# New in xlrd 0.7.2: the Book object is a "context manager", so if
+# using Python 2.5 or later, you can wrap your code in a "with"
+# statement.</p>
+##
+
+import sys, zipfile, pprint
+from . import timemachine
+from .biffh import (
+    XLRDError,
+    biff_text_from_num,
+    error_text_from_code,
+    XL_CELL_BLANK,
+    XL_CELL_TEXT,
+    XL_CELL_BOOLEAN,
+    XL_CELL_ERROR,
+    XL_CELL_EMPTY,
+    XL_CELL_DATE,
+    XL_CELL_NUMBER
+    )
+from .formula import * # is constrained by __all__
+from .book import Book, colname #### TODO #### formula also has `colname` (restricted to 256 cols)
+from .sheet import empty_cell
+from .xldate import XLDateError, xldate_as_tuple
+
+if sys.version.startswith("IronPython"):
+    # print >> sys.stderr, "...importing encodings"
+    import encodings
+
+try:
+    import mmap
+    MMAP_AVAILABLE = 1
+except ImportError:
+    MMAP_AVAILABLE = 0
+USE_MMAP = MMAP_AVAILABLE
+
+##
+#
+# Open a spreadsheet file for data extraction.
+#
+# @param filename The path to the spreadsheet file to be opened.
+#
+# @param logfile An open file to which messages and diagnostics are written.
+#
+# @param verbosity Increases the volume of trace material written to the logfile.
+#
+# @param pickleable Default is true. In Python 2.4 or earlier, setting to false
+# will cause use of array.array objects which save some memory but can't be pickled.
+# In Python 2.5, array.arrays are used unconditionally. Note: if you have large files that
+# you need to read multiple times, it can be much faster to cPickle.dump() the xlrd.Book object