Commits

Matt Chaput committed c9d30b9

Changed code to store temporary files under the index storage dir by default.
Added WrappingCodec class.
Removed accidentally committed debug prints.
Minor cleanups.

Comments (0)

Files changed (14)

src/whoosh/codec/base.py

 This module contains base classes/interfaces for "codec" objects.
 """
 
-import random
 from bisect import bisect_right
 
 from whoosh import columns
 from whoosh.compat import abstractmethod, izip, xrange
 from whoosh.filedb.compound import CompoundStorage
 from whoosh.system import emptybytes
+from whoosh.util import random_name
 
 
 # Exceptions
         raise NotImplementedError
 
 
+class WrappingCodec(Codec):
+    def __init__(self, child):
+        self._child = child
+
+    def per_document_writer(self, storage, segment):
+        return self._child.per_document_writer(storage, segment)
+
+    def field_writer(self, storage, segment):
+        return self._child.field_writer(storage, segment)
+
+    def postings_writer(self, dbfile, byteids=False):
+        return self._child.postings_writer(dbfile, byteids=byteids)
+
+    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
+        return self._child.postings_reader(dbfile, terminfo, format_, term=term,
+                                           scorer=scorer)
+
+    def terms_reader(self, storage, segment):
+        return self._child.terms_reader(storage, segment)
+
+    def per_document_reader(self, storage, segment):
+        return self._child.per_document_reader(storage, segment)
+
+    def supports_graph(self):
+        return self._child.supports_graph()
+
+    def graph_reader(self, storage, segment):
+        return self._child.graph_reader(storage, segment)
+
+    def new_segment(self, storage, indexname):
+        return self._child.new_segment(storage, indexname)
+
+
 # Writer classes
 
 class PerDocumentWriter(object):
     along the way).
     """
 
-    # These must be valid separate characters in CASE-INSENSTIVE filenames
-    IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
     # Extension for compound segment files
     COMPOUND_EXT = ".seg"
 
         self.compound = False
 
     @classmethod
-    def _random_id(cls, size=12):
-        return "".join(random.choice(cls.IDCHARS) for _ in xrange(size))
+    def _random_id(cls, size=16):
+        return random_name(size=size)
 
     def __repr__(self):
-        return "<%s %s>" % (self.__class__.__name__,
-                            self.segment_id())
+        return "<%s %s>" % (self.__class__.__name__, self.segment_id())
 
     def codec(self):
         raise NotImplementedError

src/whoosh/codec/whoosh3.py

         self._storage = storage
         self._segment = segment
 
-        self._cols = compound.CompoundWriter()
+        tempst = storage.temp_storage("%s.tmp" % segment.indexname)
+        self._cols = compound.CompoundWriter(tempst)
         self._colwriters = {}
         self._create_column("_stored", STORED_COLUMN)
 
         self._fieldobj = None
         self._format = None
 
-        self._fieldmap = {}
         _tifile = self._create_file(W3Codec.TERMS_EXT)
         self._tindex = filetables.OrderedHashWriter(_tifile)
-        self._tindex.extras["fieldmap"] = self._fieldmap
+        self._fieldmap = self._tindex.extras["fieldmap"] = {}
 
         self._postfile = self._create_file(W3Codec.POSTS_EXT)
 
         # Minify values
 
         fixedsize = self._format.fixed_value_size()
-        print "format=", self._format, "***fixedsize=", fixedsize
         values = self._values
 
         if fixedsize is None or fixedsize < 0:
 
         # De-minify the values
         fixedsize = self._fixedsize
-        print "data=", self._data, "format=", self.format, "fixed=", fixedsize
         vs = self._data[2]
         if fixedsize is None or fixedsize < 0:
             self._values = vs

src/whoosh/collectors.py

                         break
                     usequality = self._use_block_quality()
                     replacecounter = self.replace
-                    minscore = self.minscore
+
+                    if self.minscore != minscore:
+                        checkquality = True
+                        minscore = self.minscore
+
                 replacecounter -= 1
 
             # If we're using block quality optimizations, and the checkquality

src/whoosh/externalsort.py

     tuples, lists, and dicts).
     """
 
-    filenamechars = "abcdefghijklmnopqrstuvwxyz_1234567890"
-
     def __init__(self, maxsize=1000000, tempdir=None, prefix="",
                  suffix=".run"):
         """
         f = os.fdopen(fd, "wb")
         return path, f
 
-    @staticmethod
-    def _read_run(path):
-        import os.path
-        f = open(path, "rb")
+    def _open_run(self, path):
+        return open(path, "rb")
+
+    def _remove_run(self, path):
+        os.remove(path)
+
+    def _read_run(self, path):
+        f = self._open_run(path)
         try:
             while True:
                 yield load(f)
             return
         finally:
             f.close()
-            os.remove(path)
+            self._remove_run(path)
 
-    @classmethod
-    def _merge_runs(cls, paths):
-        iters = [cls._read_run(path) for path in paths]
+    def _merge_runs(self, paths):
+        iters = [self._read_run(path) for path in paths]
         for item in imerge(iters):
             yield item
 

src/whoosh/filedb/compound.py

 # policies, either expressed or implied, of Matt Chaput.
 
 import errno, sys
-from tempfile import TemporaryFile
 from threading import Lock
 from shutil import copyfileobj
 
 from whoosh.filedb.structfile import BufferFile, StructFile
 from whoosh.filedb.filestore import FileStorage
 from whoosh.system import emptybytes
+from whoosh.util import random_name
 
 
 class CompoundStorage(FileStorage):
             pos = self._pos + where
         elif whence == 2:  # From end
             pos = self._length - where
+        else:
+            raise ValueError
 
         self._pos = pos
 
 
 
 class CompoundWriter(object):
-    def __init__(self, buffersize=32 * 1024):
+    def __init__(self, tempstorage, buffersize=32 * 1024):
         assert isinstance(buffersize, int)
-        self._temp = TemporaryFile()
+        self._tempstorage = tempstorage
+        self._tempname = "%s.ctmp" % random_name()
+        self._temp = tempstorage.create_file(self._tempname, mode="w+b")
         self._buffersize = buffersize
         self._streams = {}
 
 
             yield (name, gen)
         temp.close()
+        self._tempstorage.delete_file(self._tempname)
 
     def save_as_compound(self, dbfile):
         basepos = dbfile.tell()

src/whoosh/filedb/filestore.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-import errno, os, random, sys
+import errno, os, sys, tempfile
 from threading import Lock
 
 from whoosh.compat import BytesIO, memoryview_
 from whoosh.filedb.structfile import BufferFile, StructFile
 from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError
+from whoosh.util import random_name
 from whoosh.util.filelock import FileLock
 
 
 
         raise NotImplementedError
 
-    def create_temp(self):
-        """Creates a randomly-named temporary file.
-
-        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
-        """
-
-        name = hex(random.getrandbits(128))[2:] + ".tmp"
-        return name, self.create_file(name)
-
     def open_file(self, name, *args, **kwargs):
         """Opens a file with the given name in this storage.
 
 
         pass
 
+    def temp_storage(self, name=None):
+        """Creates a new storage object for temporary files. You can call
+        :meth:`Storage.destroy` on the new storage when you're finished with
+        it.
+
+        :param name: a name for the new storage. This may be optional or
+            required depending on the storage implementation.
+        :rtype: :class:`Storage`
+        """
+
+        raise NotImplementedError
+
 
 class OverlayStorage(Storage):
     """Overlays two storage objects. Reads are processed from the first if it
     def create_file(self, *args, **kwargs):
         return self.b.create_file(*args, **kwargs)
 
-    def create_temp(self, *args, **kwargs):
-        return self.b.create_temp(*args, **kwargs)
-
     def open_file(self, name, *args, **kwargs):
         if self.a.file_exists(name):
             return self.a.open_file(name, *args, **kwargs)
         self.a.optimize()
         self.b.optimize()
 
+    def temp_storage(self, name=None):
+        return self.b.temp_storage(name=name)
+
 
 class FileStorage(Storage):
     """Storage object that stores the index as files in a directory on disk.
         # If the given directory does not already exist, try to create it
         try:
             os.makedirs(dirpath)
-        except IOError:
+        except OSError:
             # This is necessary for compatibility between Py2 and Py3
             e = sys.exc_info()[1]
             # If we get an error because the path already exists, ignore it
     def _fpath(self, fname):
         return os.path.abspath(os.path.join(self.folder, fname))
 
-    def clean(self):
+    def clean(self, ignore=False):
         if self.readonly:
             raise ReadOnlyError
 
         path = self.folder
         files = self.list()
         for fname in files:
-            os.remove(os.path.join(path, fname))
+            try:
+                os.remove(os.path.join(path, fname))
+            except OSError:
+                if not ignore:
+                    raise
 
     def list(self):
         try:
     def lock(self, name):
         return FileLock(self._fpath(name))
 
+    def temp_storage(self, name=None):
+        name = name or "%s.tmp" % random_name()
+        path = os.path.join(self.folder, name)
+        tempstore = FileStorage(path)
+        return tempstore.create()
+
 
 class RamStorage(Storage):
     """Storage object that keeps the index in memory.
             self.locks[name] = Lock()
         return self.locks[name]
 
+    def temp_storage(self, name=None):
+        tdir = tempfile.gettempdir()
+        name = name or "%s.tmp" % random_name()
+        path = os.path.join(tdir, name)
+        tempstore = FileStorage(path)
+        return tempstore.create()
+
+
+def copy_storage(sourcestore, deststore):
+    """Copies the files from the source storage object to the destination
+    storage object using ``shutil.copyfileobj``.
+    """
+    from shutil import copyfileobj
+
+    for name in sourcestore.list():
+        with sourcestore.open_file(name) as source:
+            with deststore.create_file(name) as dest:
+                copyfileobj(source, dest)
+
 
 def copy_to_ram(storage):
     """Copies the given FileStorage object into a new RamStorage object.
     :rtype: :class:`RamStorage`
     """
 
-    import shutil
     ram = RamStorage()
-    for name in storage.list():
-        f = storage.open_file(name)
-        r = ram.create_file(name)
-        shutil.copyfileobj(f.file, r.file)
-        f.close()
-        r.close()
+    copy_storage(storage, ram)
     return ram

src/whoosh/filedb/structfile.py

     def __iter__(self):
         return iter(self.file)
 
+    def raw_file(self):
+        return self.file
+
     def read(self, *args, **kwargs):
         return self.file.read(*args, **kwargs)
 
 
 
 class ChecksumFile(StructFile):
-    def __init__(self, child):
-        self._child = child
+    def __init__(self, *args, **kwargs):
+        StructFile.__init__(self, *args, **kwargs)
         self._check = 0
         self._crc32 = __import__("zlib").crc32
 
-    def __getattr__(self, name):
-        if name in self.__dict__:
-            return self.__dict__[name]
-        else:
-            return getattr(self._child, name)
-
     def __iter__(self):
-        for line in self._child:
+        for line in self.file:
             self._check = self._crc32(line, self._check)
             yield line
 
         raise Exception("Cannot seek on a ChecksumFile")
 
     def read(self, *args, **kwargs):
-        b = self._child.read(*args, **kwargs)
+        b = self.file.read(*args, **kwargs)
         self._check = self._crc32(b, self._check)
         return b
 
     def write(self, b):
         self._check = self._crc32(b, self._check)
-        self._child.write(b)
+        self.file.write(b)
 
     def checksum(self):
         return self._check & 0xffffffff

src/whoosh/matching/combo.py

 
 class CombinationMatcher(mcore.Matcher):
     def __init__(self, submatchers, boost=1.0):
-        assert submatchers, submatchers
         self._submatchers = submatchers
         self._boost = boost
 

src/whoosh/multiproc.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from __future__ import with_statement
-import os, tempfile
+import os
 from multiprocessing import Process, Queue, cpu_count
 
 from whoosh.compat import xrange, iteritems, pickle
 from whoosh.codec import base
 from whoosh.writing import PostingPool, SegmentWriter
 from whoosh.externalsort import imerge
+from whoosh.util import random_name
 
 
 def finish_subsegment(writer, k=64):
         # the only command codes is 0=add_document
 
         writer = self.writer
+        tempstorage = writer.temp_storage()
+
         load = pickle.load
-        with open(filename, "rb") as f:
+        with tempstorage.open_file(filename).raw_file() as f:
             for _ in xrange(doc_count):
                 # Load the next pickled tuple from the file
                 code, args = load(f)
                 assert code == 0
                 writer.add_document(**args)
         # Remove the job file
-        os.remove(filename)
+        tempstorage.delete_file(filename)
 
     def cancel(self):
         self.running = False
         docbuffer = self.docbuffer
         dump = pickle.dump
         length = len(docbuffer)
-        fd, filename = tempfile.mkstemp(".doclist")
-        with os.fdopen(fd, "wb") as f:
+
+        filename = "%s.doclist" % random_name()
+        with self.temp_storage().create_file(filename).raw_file() as f:
             for item in docbuffer:
                 dump(item, f, -1)
 
         # Note that SortingPool._read_run() automatically deletes the run file
         # when it's finished
 
-        gen = PostingPool._read_run(path)
+        gen = self.pool._read_run(path)
         # If offset is 0, just return the items unchanged
         if not offset:
             return gen
                                  merge=merge)
 
     def _commit(self, mergetype, optimize, merge):
-        try:
-            # Index the remaining documents in the doc buffer
-            if self.docbuffer:
-                self._enqueue()
-            # Tell the tasks to finish
-            for task in self.tasks:
-                self.jobqueue.put(None)
+        # Index the remaining documents in the doc buffer
+        if self.docbuffer:
+            self._enqueue()
+        # Tell the tasks to finish
+        for task in self.tasks:
+            self.jobqueue.put(None)
 
-            # Merge existing segments
-            finalsegments = self._merge_segments(mergetype, optimize, merge)
+        # Merge existing segments
+        finalsegments = self._merge_segments(mergetype, optimize, merge)
 
-            # Wait for the subtasks to finish
-            for task in self.tasks:
-                task.join()
+        # Wait for the subtasks to finish
+        for task in self.tasks:
+            task.join()
 
-            # Pull a (run_file_name, segment) tuple off the result queue for
-            # each sub-task, representing the final results of the task
-            results = []
-            for task in self.tasks:
-                results.append(self.resultqueue.get(timeout=5))
+        # Pull a (run_file_name, segment) tuple off the result queue for
+        # each sub-task, representing the final results of the task
+        results = []
+        for task in self.tasks:
+            results.append(self.resultqueue.get(timeout=5))
 
-            if self.multisegment:
-                finalsegments += [s for _, s in results]
-                if self._added:
-                    finalsegments.append(self._finalize_segment())
-                else:
-                    self._close_segment()
+        if self.multisegment:
+            finalsegments += [s for _, s in results]
+            if self._added:
+                finalsegments.append(self._finalize_segment())
             else:
-                # Merge the posting sources from the sub-writers and my
-                # postings into this writer
-                self._merge_subsegments(results, mergetype)
                 self._close_segment()
-                self._assemble_segment()
-                finalsegments.append(self.get_segment())
-            self._commit_toc(finalsegments)
-        finally:
-            self._finish()
+            assert self.perdocwriter.is_closed
+        else:
+            # Merge the posting sources from the sub-writers and my
+            # postings into this writer
+            self._merge_subsegments(results, mergetype)
+            self._close_segment()
+            self._assemble_segment()
+            finalsegments.append(self.get_segment())
+            assert self.perdocwriter.is_closed
+
+        self._commit_toc(finalsegments)
+        self._finish()
 
     def _merge_subsegments(self, results, mergetype):
         storage = self.storage
     def _commit(self, mergetype, optimize, merge):
         # Pull a (run_file_name, segment) tuple off the result queue for each
         # sub-task, representing the final results of the task
-        try:
-            # Merge existing segments
-            finalsegments = self._merge_segments(mergetype, optimize, merge)
-            results = []
-            for writer in self.tasks:
-                results.append(finish_subsegment(writer))
-            self._merge_subsegments(results, mergetype)
-            self._close_segment()
-            self._assemble_segment()
-            finalsegments.append(self.get_segment())
-            self._commit_toc(finalsegments)
-        finally:
-            self._finish()
+
+        # Merge existing segments
+        finalsegments = self._merge_segments(mergetype, optimize, merge)
+        results = []
+        for writer in self.tasks:
+            results.append(finish_subsegment(writer))
+
+        self._merge_subsegments(results, mergetype)
+        self._close_segment()
+        self._assemble_segment()
+        finalsegments.append(self.get_segment())
+
+        self._commit_toc(finalsegments)
+        self._finish()
 
 
 # For compatibility with old multiproc module

src/whoosh/reading.py

 
         return None
 
+    def storage(self):
+        """Returns the :class:`whoosh.filedb.filestore.Storage` object used by
+        this reader to read its files. If the reader is atomic
+        (``reader.is_atomic() == True``), this returns None.
+        """
+
+        return None
+
     def is_atomic(self):
         return True
 
         self.schema = schema
         self.is_closed = False
 
-        self._storage = storage
         self._segment = segment
         self._segid = self._segment.segment_id()
         self._gen = generation
             # Use an overlay here instead of just the compound storage, in rare
             # circumstances a segment file may be added after the segment is
             # written
-            self._files = OverlayStorage(files, self._storage)
+            self._storage = OverlayStorage(files, storage)
         else:
-            self._files = storage
+            self._storage = storage
 
         # Get subreaders from codec
         self._codec = codec if codec else segment.codec()
-        self._terms = self._codec.terms_reader(self._files, segment)
-        self._perdoc = self._codec.per_document_reader(self._files, segment)
+        self._terms = self._codec.terms_reader(self._storage, segment)
+        self._perdoc = self._codec.per_document_reader(self._storage, segment)
         self._graph = None  # Lazy open with self._get_graph()
 
     def _get_graph(self):
         if not self._graph:
-            self._graph = self._codec.graph_reader(self._files, self._segment)
+            self._graph = self._codec.graph_reader(self._storage, self._segment)
         return self._graph
 
     def codec(self):
     def segment(self):
         return self._segment
 
+    def storage(self):
+        return self._storage
+
     def has_deletions(self):
         return self._perdoc.has_deletions()
 
 
         # It's possible some weird codec that doesn't use storage might have
         # passed None instead of a storage object
-        if self._files:
-            self._files.close()
+        if self._storage:
+            self._storage.close()
 
         self.is_closed = True
 

src/whoosh/util/__init__.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from __future__ import with_statement
-import re, sys, time
+import random, sys, time
 from bisect import insort, bisect_left
 from functools import wraps
 
-from whoosh.compat import string_type
+
+# These must be valid separate characters in CASE-INSENSITIVE filenames
+IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
 
 
 if sys.platform == 'win32':
     now = time.time
 
 
+def random_name(size=28):
+    return "".join(random.choice(IDCHARS) for _ in xrange(size))
+
+
 def make_binary_tree(fn, args, **kwargs):
     """Takes a function/class that takes two positional arguments and a list of
     arguments and returns a binary tree of results/instances.

src/whoosh/util/numlists.py

 
     def get(self, f, pos, i):
         f.seek(pos)
+        n = None
         for n in self.read_nums(f, i + 1):
             pass
         return n
         """
 
         count = 0
-
+        key = None
         for _ in xrange(n):
             if count == 0:
                 key = f.read_byte()

src/whoosh/writing.py

 from whoosh.fields import UnknownFieldError
 from whoosh.index import LockError
 from whoosh.system import emptybytes
-from whoosh.util import fib
+from whoosh.util import fib, random_name
 from whoosh.util.filelock import try_for
 from whoosh.util.text import utf8encode
 
     # Subclass whoosh.externalsort.SortingPool to use knowledge of
     # postings to set run size in bytes instead of items
 
-    def __init__(self, limitmb=128, **kwargs):
+    namechars = "abcdefghijklmnopqrstuvwxyz0123456789"
+
+    def __init__(self, tempstore, segment, limitmb=128, **kwargs):
         SortingPool.__init__(self, **kwargs)
+        self.tempstore = tempstore
+        self.segment = segment
         self.limit = limitmb * 1024 * 1024
         self.currentsize = 0
 
+    def _new_run(self):
+        path = "%s.run" % random_name()
+        f = self.tempstore.create_file(path).raw_file()
+        return path, f
+
+    def _open_run(self, path):
+        return self.tempstore.open_file(path).raw_file()
+
+    def _remove_run(self, path):
+        return self.tempstore.delete_file(path)
+
     def add(self, item):
         # item = (fieldname, tbytes, docnum, weight, vbytes)
         assert isinstance(item[1], bytes_type), "tbytes=%r" % item[1]
         self._setup_doc_offsets()
 
         # Internals
-        poolprefix = "whoosh_%s_" % self.indexname
-        self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
-        newsegment = self.newsegment = codec.new_segment(self.storage,
-                                                         self.indexname)
+        self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname)
+        newsegment = codec.new_segment(self.storage, self.indexname)
+        self.newsegment = newsegment
         self.compound = compound and newsegment.should_assemble()
         self.is_closed = False
         self._added = False
+        self.pool = PostingPool(self._tempstorage, self.newsegment,
+                                limitmb=limitmb)
 
         # Set up writers
         self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
 
             yield (fieldname, text, newdoc, weight, vbytes)
 
+    def temp_storage(self):
+        return self._tempstorage
+
     def add_field(self, fieldname, fieldspec, **kwargs):
         self._check_state()
         if self._added:
     def _finish(self):
         if self.writelock:
             self.writelock.release()
+        self._tempstorage.destroy()
         self.is_closed = True
         #self.storage.close()
 
         """
 
         self._check_state()
-        try:
-            # Merge old segments if necessary
-            finalsegments = self._merge_segments(mergetype, optimize, merge)
-            if self._added:
-                # Flush the current segment being written and add it to the
-                # list of remaining segments returned by the merge policy
-                # function
-                finalsegments.append(self._finalize_segment())
-            else:
-                # Close segment files
-                self._close_segment()
-            # Write TOC
-            self._commit_toc(finalsegments)
-        finally:
-            # Final cleanup
-            self._finish()
+        # Merge old segments if necessary
+        finalsegments = self._merge_segments(mergetype, optimize, merge)
+        if self._added:
+            # Flush the current segment being written and add it to the
+            # list of remaining segments returned by the merge policy
+            # function
+            finalsegments.append(self._finalize_segment())
+        else:
+            # Close segment files
+            self._close_segment()
+        # Write TOC
+        self._commit_toc(finalsegments)
+
+        # Final cleanup
+        self._finish()
 
     def cancel(self):
         self._check_state()

tests/test_columns.py

               ("b", "ijk"), ("c", "fGgHh"), ("a", "9abc")]
 
     st = RamStorage()
-    msw = compound.CompoundWriter()
+    msw = compound.CompoundWriter(st)
     files = dict((name, msw.create_file(name)) for name in "abc")
     for name, data in domain:
         files[name].write(b(data))
     outfiles = dict((name, BytesIO(value)) for name, value in domain.items())
 
     with TempStorage() as st:
-        msw = compound.CompoundWriter(buffersize=4096)
+        msw = compound.CompoundWriter(st, buffersize=4096)
         mfiles = {}
         for name in domain:
             mfiles[name] = msw.create_file(name)