Matt Chaput avatar Matt Chaput committed b3bf9c2

Added Reader.set_caching_policy(), moved caching policy from FileReader to fieldcache.FieldCachingPolicy.
Turned field cache saving back on by default.
Removed check for already deleted docs - issue #95.
Minor docs fixups.
Bumping version.

Comments (0)

Files changed (8)

-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?eclipse-pydev version="1.0"?>
-
-<pydev_project>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 2.5</pydev_property>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.5</pydev_property>
-<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
-<path>/whoosh/src</path>
-<path>/whoosh/tests</path>
-<path>/whoosh/benchmark</path>
-<path>/whoosh/stress</path>
-</pydev_pathproperty>
-</pydev_project>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+
+<pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 2.7</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.5</pydev_property>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/whoosh/src</path>
+<path>/whoosh/tests</path>
+<path>/whoosh/benchmark</path>
+<path>/whoosh/stress</path>
+</pydev_pathproperty>
+</pydev_project>

src/whoosh/__init__.py

 # limitations under the License.
 #===============================================================================
 
-__version__ = (1, 5, 2)
+__version__ = (1, 5, 3)
 
 
 def versionstring(build=True, extra=True):

src/whoosh/filedb/fieldcache.py

 from heapq import heappush, heapreplace
 from struct import Struct
 
-from whoosh.system import _INT_SIZE
+from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
 from whoosh.util import utf8encode
 
 
                 and self.order == other.order
                 and self.texts == other.texts)
     
+    def size(self):
+        """Returns the size in bytes (or as accurate an estimate as is
+        practical, anyway) of this cache.
+        """
+        
+        orderlen = len(self.order)
+        if self.typecode == "B":
+            total = orderlen
+        elif self.typecode in "Ii":
+            total = orderlen * _INT_SIZE
+        elif self.typecode == "f":
+            total = orderlen * _FLOAT_SIZE
+        elif self.typecode in "Qq":
+            total = orderlen * _LONG_SIZE
+        
+        if self.hastexts:
+            total += sum(len(t) for t in self.texts)
+            
+        return total
+    
     # Class constructor for building a field cache from a reader
     
     @classmethod
                 else:
                     order[id] = sortable
         
+        # Compact the order array if possible
+        if hastexts:
+            if len(texts) < 255:
+                newcode = "B"
+            elif len(texts) < 65535:
+                newcode = "H"
+            
+            if newcode != order.typecode:
+                order = array(newcode, order)
+                typecode = newcode
+        
         return cls(order, texts, hastexts=hastexts, typecode=typecode)
     
     # Class constructor for defining a field cache using arbitrary queries
         dbfile.close()
     
 
+# Caching policies
 
class FieldCachingPolicy(object):
    """Base class for field caching policies.
    
    A policy decides how field cache objects are stored and retrieved.
    Subclasses must implement :meth:`put`, :meth:`__contains__`,
    :meth:`is_loaded`, and :meth:`get`; :meth:`delete` is optional.
    """
    
    def put(self, key, obj, save=True):
        """Adds the given cache object under the given key."""
        
        raise NotImplementedError
    
    def __contains__(self, key):
        """Returns True if a cache exists for the given key."""
        
        raise NotImplementedError
    
    def is_loaded(self, key):
        """Returns True if the cache for the given key is in memory."""
        
        raise NotImplementedError
    
    def get(self, key):
        """Returns the cache object for the given key, or None."""
        
        raise NotImplementedError
    
    def delete(self, key):
        """Removes the cache for the given key. The default is a no-op."""
        
        pass
    
+
class NoCaching(FieldCachingPolicy):
    """A caching policy that disables caching entirely: cache objects passed
    to :meth:`put` are discarded, and every lookup misses.
    """
    
    def put(self, key, obj, save=True):
        # Deliberately throw the cache away.
        pass
    
    def __contains__(self, key):
        return False
    
    def is_loaded(self, key):
        return False
    
    def get(self, key):
        return None
    
+
class DefaultFieldCachingPolicy(FieldCachingPolicy):
    """The default caching policy: keeps field caches in a dictionary in
    memory, and (when a storage object is supplied) saves them to/loads them
    from files in the storage.
    """
    
    def __init__(self, basename, storage=None, gzip_caches=False,
                 fcclass=FieldCache):
        """
        :param basename: a prefix for cache file names (for example, the
            name of the segment the caches belong to).
        :param storage: a storage object used to save/load caches. If this
            is None, caches are kept in memory only.
        :param gzip_caches: if True, cache files are written gzipped.
        :param fcclass: the class used to deserialize caches from files.
        """
        
        self.basename = basename
        self.storage = storage
        self.caches = {}
        self.gzip_caches = gzip_caches
        self.fcclass = fcclass
    
    def __contains__(self, key):
        return self.is_loaded(key) or self._file_exists(key)
    
    def _filename(self, key):
        # Name of the (uncompressed) cache file for the given key.
        return "%s.%s.fc" % (self.basename, key)
    
    def _file_exists(self, key):
        # True if either a plain or gzipped cache file exists in storage.
        if not self.storage: return False
        
        filename = self._filename(key)
        gzfilename = filename + ".gz"
        return (self.storage.file_exists(filename)
                or self.storage.file_exists(gzfilename))
    
    def _save(self, key, cache):
        filename = self._filename(key)
        if self.gzip_caches:
            filename += ".gz"
        
        try:
            # excl=True so we never clobber a cache file that another
            # process may already have written. Saving is best-effort.
            f = self.storage.create_file(filename, gzip=self.gzip_caches,
                                         excl=True)
            cache.to_file(f)
            f.close()
        except (IOError, OSError):
            # The original only caught OSError, but in Python 2 ordinary
            # file errors (e.g. a read-only directory) raise IOError and
            # would have crashed this best-effort save.
            pass
    
    def _load(self, key):
        storage = self.storage
        filename = self._filename(key)
        gzfilename = filename + ".gz"
        gzipped = False
        # Prefer the uncompressed file; fall back to the gzipped one.
        if storage.file_exists(gzfilename) and not storage.file_exists(filename):
            filename = gzfilename
            gzipped = True
        
        f = storage.open_file(filename, mapped=False, gzip=gzipped)
        cache = self.fcclass.from_file(f)
        f.close()
        return cache
    
    def is_loaded(self, key):
        return key in self.caches
    
    def put(self, key, cache, save=True):
        self.caches[key] = cache
        if save and self.storage:
            self._save(key, cache)
    
    def get(self, key):
        if key in self.caches:
            return self.caches.get(key)
        elif self._file_exists(key):
            try:
                return self._load(key)
            except (IOError, OSError):
                # Treat an unreadable cache file as a miss.
                return None
        # No cache in memory or on disk; be explicit about the miss.
        return None
    
    def delete(self, key):
        # Only removes the in-memory cache; any saved file is left alone.
        try:
            del self.caches[key]
        except KeyError:
            pass
    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

src/whoosh/filedb/fileindex.py

         if delete:
             if self.deleted is None:
                 self.deleted = set()
-            elif docnum in self.deleted:
-                raise KeyError("Document %s in segment %r is already deleted"
-                               % (docnum, self.name))
-
             self.deleted.add(docnum)
-        else:
-            if self.deleted is None or docnum not in self.deleted:
-                raise KeyError("Document %s is not deleted" % docnum)
-
+        elif self.deleted is not None and docnum in self.deleted:
             self.deleted.clear(docnum)
 
     def is_deleted(self, docnum):

src/whoosh/filedb/filereading.py

 from heapq import nlargest, nsmallest
 from threading import Lock
 
-from whoosh.filedb.fieldcache import FieldCache
+from whoosh.filedb.fieldcache import FieldCache, DefaultFieldCachingPolicy
 from whoosh.filedb.filepostings import FilePostingReader
 from whoosh.filedb.filestore import ReadOnlyError
 from whoosh.filedb.filetables import (TermIndexReader, StoredFieldReader,
 from whoosh.reading import IndexReader, TermNotFound
 from whoosh.util import protected
 
-SAVE_BY_DEFAULT = False
+SAVE_BY_DEFAULT = True
 
 # Reader class
 
         self.dc = segment.doc_count_all()
         assert self.dc == self.storedfields.length
         
-        self.caches = {}
+        self.set_caching_policy()
         
         self.is_closed = False
         self._sync_lock = Lock()
     def supports_caches(self):
         return True
 
-    def _fieldcache_filename(self, fieldname):
-        return "%s.%s.fc" % (self.segment.name, fieldname)
-
-    def _put_fieldcache(self, name, fieldcache):
-        self.caches[name] = fieldcache
-
-    def _load_fieldcache(self, fieldname):
-        storage = self.storage
-        filename = self._fieldcache_filename(fieldname)
-        gzipped = False
+    def set_caching_policy(self, cp=None, save=True, storage=None):
+        """This method lets you control the caching policy of the reader. You
+        can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
+        as the first argument, *or* use the `save` and `storage` keywords to
+        alter the default caching policy::
         
-        # It's possible to load GZip'd caches but for it's MUCH slower,
-        # especially for large caches
-        gzname = filename + ".gz"
-        if storage.file_exists(gzname) and not storage.file_exists(filename):
-            filename = gzname
-            gzipped = True
+            # Use a custom field caching policy object
+            reader.set_caching_policy(MyPolicy())
+            
+            # Use the default caching policy but turn off saving caches to disk
+            reader.set_caching_policy(save=False)
+            
+            # Use the default caching policy but save caches to a custom storage
+            from whoosh.filedb.filestore import FileStorage
+            mystorage = FileStorage("path/to/cachedir")
+            reader.set_caching_policy(storage=mystorage)
         
-        f = storage.open_file(filename, mapped=False, gzip=gzipped)
-        cache = FieldCache.from_file(f)
-        f.close()
-        return cache
-
-    def _cachefile_exists(self, fieldname):
-        storage = self.storage
-        filename = self._fieldcache_filename(fieldname)
-        gzname = filename + ".gz"
-        return storage.file_exists(filename) or storage.file_exists(gzname)
-
-    def _save_fieldcache(self, name, cache):
-        filename = self._fieldcache_filename(name)
-        if self.GZIP_CACHES:
-            filename += ".gz"
+        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
+            object. If this argument is not given, the default caching policy
+            is used.
+        :param save: save field caches to disk for re-use. If a caching policy
+            object is specified using `cp`, this argument is ignored.
+        :param storage: a custom :class:`whoosh.store.Storage` object to use
+            for saving field caches. If a caching policy object is specified
+            using `cp` or `save` is `False`, this argument is ignored. 
+        """
+        
+        if not cp:
+            if save and storage is None:
+                storage = self.storage
+            else:
+                storage = None
+            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)
             
-        f = self.storage.create_file(filename, gzip=self.GZIP_CACHES)
-        cache.to_file(f)
-        f.close()
-
-    def _create_fieldcache(self, fieldname, save=SAVE_BY_DEFAULT, name=None,
-                           default=u''):
-        if name in self.schema:
-            raise Exception("Custom name %r is the name of a field")
-        savename = name if name else fieldname
+        if type(cp) is type:
+            cp = cp()
         
-        if self.fieldcache_available(savename):
-            # Don't recreate the cache if it already exists
-            return None
-        
-        cache = FieldCache.from_field(self, fieldname, default=default)
-        if save and not self.storage.readonly:
-            self._save_fieldcache(savename, cache)
-        return cache
+        self.caching_policy = cp
 
     def define_facets(self, name, qs, save=SAVE_BY_DEFAULT):
         if name in self.schema:
             return
         
         cache = FieldCache.from_lists(qs, self.doc_count_all())
-        if save and not self.storage.readonly:
-            self._save_fieldcache(name, cache)
-        self._put_fieldcache(name, cache)
+        self.caching_policy.put(name, cache, save=save)
 
     def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
         """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
             doesn't already exist.
         """
         
-        if fieldname in self.caches:
-            return self.caches[fieldname]
-        elif self._cachefile_exists(fieldname):
-            fc = self._load_fieldcache(fieldname)
-        else:
-            fc = self._create_fieldcache(fieldname, save=SAVE_BY_DEFAULT)
-        self._put_fieldcache(fieldname, fc)
+        fc = self.caching_policy.get(fieldname)
+        if not fc:
+            fc = FieldCache.from_field(self, fieldname)
+            self.caching_policy.put(fieldname, fc, save=save)
         return fc
     
     def fieldcache_available(self, fieldname):
         memory already or on disk).
         """
         
-        return fieldname in self.caches or self._cachefile_exists(fieldname)
+        return fieldname in self.caching_policy
     
     def fieldcache_loaded(self, fieldname):
         """Returns True if a field cache for the given field is in memory.
         """
         
-        return fieldname in self.caches
+        return self.caching_policy.is_loaded(fieldname)
 
     def unload_fieldcache(self, name):
-        try:
-            del self.caches[name]
-        except:
-            pass
+        self.caching_policy.delete(name)
         
-    def delete_fieldcache(self, name):
-        self.unload_fieldcache(name)
-        filename = self._fieldcache_filename(name)
-        if self.storage.file_exists(filename):
-            try:
-                self.storage.delete_file(filename)
-            except:
-                pass
-
     # Sorting and faceting methods
     
     def key_fn(self, fieldname):

src/whoosh/filedb/filestore.py

         from whoosh.filedb.fileindex import FileIndex
         return FileIndex(self, schema=schema, indexname=indexname)
 
-    def create_file(self, name, **kwargs):
+    def create_file(self, name, excl=False, mode="wb", **kwargs):
         if self.readonly:
             raise ReadOnlyError
         
-        f = StructFile(open(self._fpath(name), "wb"), name=name,
-                       mapped=self.mapped, **kwargs)
+        path = self._fpath(name)
+        if excl:
+            fd = os.open(path, os.O_CREAT | os.O_EXCL)
+            fileobj = os.fdopen(fd, mode)
+        else:
+            fileobj = open(path, mode)
+        
+        f = StructFile(fileobj, name=name, mapped=self.mapped, **kwargs)
         return f
 
     def open_file(self, name, *args, **kwargs):

src/whoosh/reading.py

         """
         
         raise NotImplementedError
+    
+    def set_caching_policy(self, *args, **kwargs):
+        """Sets the field caching policy for this reader.
+        """
+        
+        pass
         
 
 # Fake IndexReader class for empty indexes
     def leaf_readers(self):
         return zip(self.readers, self.doc_offsets)
 
-    
+    def set_caching_policy(self, *args, **kwargs):
+        for r in self.readers:
+            r.set_caching_policy(*args, **kwargs)
 
 
 

src/whoosh/support/filelock.py

 # limitations under the License.
 #===============================================================================
 
+"""
+This module contains classes implementing exclusive locks for platforms with
+fcntl (UNIX and Mac OS X) and Windows. Whoosh originally used directory creation
+as a locking method, but it had the problem that if the program crashed the
+lock directory was left behind and would keep the index locked until it was
+cleaned up. Using OS-level file locks fixes this.
+"""
+
 import errno, os, time
 
-
 def try_for(fn, timeout=5.0, delay=0.1):
     """Calls ``fn`` every ``delay`` seconds until it returns True or ``timeout``
-    seconds elapse. Returns True if the lock was acquired, or False if the timeout
-    was reached.
+    seconds elapse. Returns True if the lock was acquired, or False if the
+    timeout was reached.
 
     :param timeout: Length of time (in seconds) to keep retrying to acquire the
         lock. 0 means return immediately. Only used when blocking is False.
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.