Matt Chaput committed c9d30b9

Changed code to store temporary files under the index storage dir by default.
Added WrappingCodec class.
Removed accidentally committed debug prints.
Minor cleanups.

  • Parent commit: 9238dd8

Files changed (14)

File src/whoosh/codec/base.py

 This module contains base classes/interfaces for "codec" objects.
 """
 
-import random
 from bisect import bisect_right
 
 from whoosh import columns
 from whoosh.compat import abstractmethod, izip, xrange
 from whoosh.filedb.compound import CompoundStorage
 from whoosh.system import emptybytes
+from whoosh.util import random_name
 
 
 # Exceptions
         raise NotImplementedError
 
 
+class WrappingCodec(Codec):
+    def __init__(self, child):
+        self._child = child
+
+    def per_document_writer(self, storage, segment):
+        return self._child.per_document_writer(storage, segment)
+
+    def field_writer(self, storage, segment):
+        return self._child.field_writer(storage, segment)
+
+    def postings_writer(self, dbfile, byteids=False):
+        return self._child.postings_writer(dbfile, byteids=byteids)
+
+    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
+        return self._child.postings_reader(dbfile, terminfo, format_, term=term,
+                                           scorer=scorer)
+
+    def terms_reader(self, storage, segment):
+        return self._child.terms_reader(storage, segment)
+
+    def per_document_reader(self, storage, segment):
+        return self._child.per_document_reader(storage, segment)
+
+    def supports_graph(self):
+        return self._child.supports_graph()
+
+    def graph_reader(self, storage, segment):
+        return self._child.graph_reader(storage, segment)
+
+    def new_segment(self, storage, indexname):
+        return self._child.new_segment(storage, indexname)
+
+
 # Writer classes
 
 class PerDocumentWriter(object):
     along the way).
     """
 
-    # These must be valid separate characters in CASE-INSENSITIVE filenames
-    IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
     # Extension for compound segment files
     COMPOUND_EXT = ".seg"
 
         self.compound = False
 
     @classmethod
-    def _random_id(cls, size=12):
-        return "".join(random.choice(cls.IDCHARS) for _ in xrange(size))
+    def _random_id(cls, size=16):
+        return random_name(size=size)
 
     def __repr__(self):
-        return "<%s %s>" % (self.__class__.__name__,
-                            self.segment_id())
+        return "<%s %s>" % (self.__class__.__name__, self.segment_id())
 
     def codec(self):
         raise NotImplementedError
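
The wrapper delegates every codec method to its child, so a subclass only
overrides what it needs to change. A minimal sketch of the pattern (the
CountingCodec name and behavior are illustrative, not part of this commit):

    from whoosh.codec.base import WrappingCodec

    class CountingCodec(WrappingCodec):
        # Count how many new segments get created; every other codec
        # method falls through to the wrapped codec
        def __init__(self, child):
            WrappingCodec.__init__(self, child)
            self.segment_count = 0

        def new_segment(self, storage, indexname):
            self.segment_count += 1
            return WrappingCodec.new_segment(self, storage, indexname)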

File src/whoosh/codec/whoosh3.py

         self._storage = storage
         self._segment = segment
 
-        self._cols = compound.CompoundWriter()
+        tempst = storage.temp_storage("%s.tmp" % segment.indexname)
+        self._cols = compound.CompoundWriter(tempst)
         self._colwriters = {}
         self._create_column("_stored", STORED_COLUMN)
 
         self._fieldobj = None
         self._format = None
 
-        self._fieldmap = {}
         _tifile = self._create_file(W3Codec.TERMS_EXT)
         self._tindex = filetables.OrderedHashWriter(_tifile)
-        self._tindex.extras["fieldmap"] = self._fieldmap
+        self._fieldmap = self._tindex.extras["fieldmap"] = {}
 
         self._postfile = self._create_file(W3Codec.POSTS_EXT)
 
         # Minify values
 
         fixedsize = self._format.fixed_value_size()
-        print "format=", self._format, "***fixedsize=", fixedsize
         values = self._values
 
         if fixedsize is None or fixedsize < 0:
 
         # De-minify the values
         fixedsize = self._fixedsize
-        print "data=", self._data, "format=", self.format, "fixed=", fixedsize
         vs = self._data[2]
         if fixedsize is None or fixedsize < 0:
             self._values = vs

File src/whoosh/collectors.py

                         break
                     usequality = self._use_block_quality()
                     replacecounter = self.replace
-                    minscore = self.minscore
+
+                    if self.minscore != minscore:
+                        checkquality = True
+                        minscore = self.minscore
+
                 replacecounter -= 1
 
             # If we're using block quality optimizations, and the checkquality

File src/whoosh/externalsort.py

     tuples, lists, and dicts).
     """
 
-    filenamechars = "abcdefghijklmnopqrstuvwxyz_1234567890"
-
     def __init__(self, maxsize=1000000, tempdir=None, prefix="",
                  suffix=".run"):
         """
         f = os.fdopen(fd, "wb")
         return path, f
 
-    @staticmethod
-    def _read_run(path):
-        import os.path
-        f = open(path, "rb")
+    def _open_run(self, path):
+        return open(path, "rb")
+
+    def _remove_run(self, path):
+        os.remove(path)
+
+    def _read_run(self, path):
+        f = self._open_run(path)
         try:
             while True:
                 yield load(f)
         except EOFError:
             return
         finally:
             f.close()
-            os.remove(path)
+            self._remove_run(path)
 
-    @classmethod
-    def _merge_runs(cls, paths):
-        iters = [cls._read_run(path) for path in paths]
+    def _merge_runs(self, paths):
+        iters = [self._read_run(path) for path in paths]
         for item in imerge(iters):
             yield item
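
Turning _read_run/_merge_runs into instance methods with _open_run and
_remove_run hooks lets a subclass keep its sort runs somewhere other than
bare filesystem paths. A sketch of the pattern, mirroring what PostingPool
does in whoosh.writing below (StoragePool is a hypothetical name):

    from whoosh.externalsort import SortingPool
    from whoosh.util import random_name

    class StoragePool(SortingPool):
        # Keep sort runs in a whoosh Storage object instead of local paths
        def __init__(self, storage, **kwargs):
            SortingPool.__init__(self, **kwargs)
            self.storage = storage

        def _new_run(self):
            name = "%s.run" % random_name()
            return name, self.storage.create_file(name).raw_file()

        def _open_run(self, path):
            return self.storage.open_file(path).raw_file()

        def _remove_run(self, path):
            self.storage.delete_file(path)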
 

File src/whoosh/filedb/compound.py

 # policies, either expressed or implied, of Matt Chaput.
 
 import errno, sys
-from tempfile import TemporaryFile
 from threading import Lock
 from shutil import copyfileobj
 
 from whoosh.filedb.structfile import BufferFile, StructFile
 from whoosh.filedb.filestore import FileStorage
 from whoosh.system import emptybytes
+from whoosh.util import random_name
 
 
 class CompoundStorage(FileStorage):
             pos = self._pos + where
         elif whence == 2:  # From end
             pos = self._length - where
+        else:
+            raise ValueError("Invalid whence value %r" % whence)
 
         self._pos = pos
 
 
 
 class CompoundWriter(object):
-    def __init__(self, buffersize=32 * 1024):
+    def __init__(self, tempstorage, buffersize=32 * 1024):
         assert isinstance(buffersize, int)
-        self._temp = TemporaryFile()
+        self._tempstorage = tempstorage
+        self._tempname = "%s.ctmp" % random_name()
+        self._temp = tempstorage.create_file(self._tempname, mode="w+b")
         self._buffersize = buffersize
         self._streams = {}
 
 
             yield (name, gen)
         temp.close()
+        self._tempstorage.delete_file(self._tempname)
 
     def save_as_compound(self, dbfile):
         basepos = dbfile.tell()
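
CompoundWriter now spools sub-file data into a named file in the given
storage object instead of an anonymous TemporaryFile, so its temp data
lands under the index directory like everything else. Usage only gains the
storage argument, as in the updated tests at the bottom of this commit:

    from whoosh.filedb.filestore import RamStorage
    from whoosh.filedb import compound

    st = RamStorage()
    cw = compound.CompoundWriter(st)    # temp data now lives in st
    f = cw.create_file("terms")
    f.write(b"some bytes")
    cw.save_as_compound(st.create_file("seg1.seg"))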

File src/whoosh/filedb/filestore.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-import errno, os, random, sys
+import errno, os, sys, tempfile
 from threading import Lock
 
 from whoosh.compat import BytesIO, memoryview_
 from whoosh.filedb.structfile import BufferFile, StructFile
 from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError
+from whoosh.util import random_name
 from whoosh.util.filelock import FileLock
 
 
 
         raise NotImplementedError
 
-    def create_temp(self):
-        """Creates a randomly-named temporary file.
-
-        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
-        """
-
-        name = hex(random.getrandbits(128))[2:] + ".tmp"
-        return name, self.create_file(name)
-
     def open_file(self, name, *args, **kwargs):
         """Opens a file with the given name in this storage.
 
 
         pass
 
+    def temp_storage(self, name=None):
+        """Creates a new storage object for temporary files. You can call
+        :meth:`Storage.destroy` on the new storage when you're finished with
+        it.
+
+        :param name: a name for the new storage. This may be optional or
+            required depending on the storage implementation.
+        :rtype: :class:`Storage`
+        """
+
+        raise NotImplementedError
+
 
 class OverlayStorage(Storage):
     """Overlays two storage objects. Reads are processed from the first if it
     def create_file(self, *args, **kwargs):
         return self.b.create_file(*args, **kwargs)
 
-    def create_temp(self, *args, **kwargs):
-        return self.b.create_temp(*args, **kwargs)
-
     def open_file(self, name, *args, **kwargs):
         if self.a.file_exists(name):
             return self.a.open_file(name, *args, **kwargs)
         self.a.optimize()
         self.b.optimize()
 
+    def temp_storage(self, name=None):
+        return self.b.temp_storage(name=name)
+
 
 class FileStorage(Storage):
     """Storage object that stores the index as files in a directory on disk.
         # If the given directory does not already exist, try to create it
         try:
             os.makedirs(dirpath)
-        except IOError:
+        except OSError:
             # This is necessary for compatibility between Py2 and Py3
             e = sys.exc_info()[1]
             # If we get an error because the path already exists, ignore it
     def _fpath(self, fname):
         return os.path.abspath(os.path.join(self.folder, fname))
 
-    def clean(self):
+    def clean(self, ignore=False):
         if self.readonly:
             raise ReadOnlyError
 
         path = self.folder
         files = self.list()
         for fname in files:
-            os.remove(os.path.join(path, fname))
+            try:
+                os.remove(os.path.join(path, fname))
+            except OSError:
+                if not ignore:
+                    raise
 
     def list(self):
         try:
     def lock(self, name):
         return FileLock(self._fpath(name))
 
+    def temp_storage(self, name=None):
+        name = name or "%s.tmp" % random_name()
+        path = os.path.join(self.folder, name)
+        tempstore = FileStorage(path)
+        return tempstore.create()
+
 
 class RamStorage(Storage):
     """Storage object that keeps the index in memory.
             self.locks[name] = Lock()
         return self.locks[name]
 
+    def temp_storage(self, name=None):
+        tdir = tempfile.gettempdir()
+        name = name or "%s.tmp" % random_name()
+        path = os.path.join(tdir, name)
+        tempstore = FileStorage(path)
+        return tempstore.create()
+
+
+def copy_storage(sourcestore, deststore):
+    """Copies the files from the source storage object to the destination
+    storage object using ``shutil.copyfileobj``.
+    """
+    from shutil import copyfileobj
+
+    for name in sourcestore.list():
+        with sourcestore.open_file(name) as source:
+            with deststore.create_file(name) as dest:
+                copyfileobj(source, dest)
+
 
 def copy_to_ram(storage):
     """Copies the given FileStorage object into a new RamStorage object.
     :rtype: :class:`RamStorage`
     """
 
-    import shutil
     ram = RamStorage()
-    for name in storage.list():
-        f = storage.open_file(name)
-        r = ram.create_file(name)
-        shutil.copyfileobj(f.file, r.file)
-        f.close()
-        r.close()
+    copy_storage(storage, ram)
     return ram
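
temp_storage() gives each storage implementation a natural home for
scratch files: FileStorage creates one under the index directory, while
RamStorage falls back to a FileStorage under the system temp dir. A short
sketch of the intended use, per the docstring above:

    from whoosh.filedb.filestore import FileStorage

    st = FileStorage("indexdir").create()
    temp = st.temp_storage()        # e.g. indexdir/<random>.tmp
    f = temp.create_file("scratch")
    f.write(b"intermediate data")
    f.close()
    temp.destroy()                  # clean up when finished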

File src/whoosh/filedb/structfile.py

     def __iter__(self):
         return iter(self.file)
 
+    def raw_file(self):
+        return self.file
+
     def read(self, *args, **kwargs):
         return self.file.read(*args, **kwargs)
 
 
 
 class ChecksumFile(StructFile):
-    def __init__(self, child):
-        self._child = child
+    def __init__(self, *args, **kwargs):
+        StructFile.__init__(self, *args, **kwargs)
         self._check = 0
         self._crc32 = __import__("zlib").crc32
 
-    def __getattr__(self, name):
-        if name in self.__dict__:
-            return self.__dict__[name]
-        else:
-            return getattr(self._child, name)
-
     def __iter__(self):
-        for line in self._child:
+        for line in self.file:
             self._check = self._crc32(line, self._check)
             yield line
 
         raise Exception("Cannot seek on a ChecksumFile")
 
     def read(self, *args, **kwargs):
-        b = self._child.read(*args, **kwargs)
+        b = self.file.read(*args, **kwargs)
         self._check = self._crc32(b, self._check)
         return b
 
     def write(self, b):
         self._check = self._crc32(b, self._check)
-        self._child.write(b)
+        self.file.write(b)
 
     def checksum(self):
         return self._check & 0xffffffff
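
Since ChecksumFile is now a true StructFile subclass that wraps the raw
file object directly, it is constructed like any other StructFile. A
minimal sketch (the file name is arbitrary):

    from whoosh.filedb.structfile import ChecksumFile

    with open("segment.bin", "rb") as raw:
        cf = ChecksumFile(raw)
        while cf.read(4096):        # reading updates the running CRC
            pass
        print(cf.checksum())        # CRC32 as an unsigned 32-bit int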

File src/whoosh/matching/combo.py

 
 class CombinationMatcher(mcore.Matcher):
     def __init__(self, submatchers, boost=1.0):
-        assert submatchers, submatchers
         self._submatchers = submatchers
         self._boost = boost
 

File src/whoosh/multiproc.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from __future__ import with_statement
-import os, tempfile
+import os
 from multiprocessing import Process, Queue, cpu_count
 
 from whoosh.compat import xrange, iteritems, pickle
 from whoosh.codec import base
 from whoosh.writing import PostingPool, SegmentWriter
 from whoosh.externalsort import imerge
+from whoosh.util import random_name
 
 
 def finish_subsegment(writer, k=64):
         # the only command code is 0=add_document
 
         writer = self.writer
+        tempstorage = writer.temp_storage()
+
         load = pickle.load
-        with open(filename, "rb") as f:
+        with tempstorage.open_file(filename).raw_file() as f:
             for _ in xrange(doc_count):
                 # Load the next pickled tuple from the file
                 code, args = load(f)
                 assert code == 0
                 writer.add_document(**args)
         # Remove the job file
-        os.remove(filename)
+        tempstorage.delete_file(filename)
 
     def cancel(self):
         self.running = False
         docbuffer = self.docbuffer
         dump = pickle.dump
         length = len(docbuffer)
-        fd, filename = tempfile.mkstemp(".doclist")
-        with os.fdopen(fd, "wb") as f:
+
+        filename = "%s.doclist" % random_name()
+        with self.temp_storage().create_file(filename).raw_file() as f:
             for item in docbuffer:
                 dump(item, f, -1)
 
         # Note that SortingPool._read_run() automatically deletes the run file
         # when it's finished
 
-        gen = PostingPool._read_run(path)
+        gen = self.pool._read_run(path)
         # If offset is 0, just return the items unchanged
         if not offset:
             return gen
                                  merge=merge)
 
     def _commit(self, mergetype, optimize, merge):
-        try:
-            # Index the remaining documents in the doc buffer
-            if self.docbuffer:
-                self._enqueue()
-            # Tell the tasks to finish
-            for task in self.tasks:
-                self.jobqueue.put(None)
+        # Index the remaining documents in the doc buffer
+        if self.docbuffer:
+            self._enqueue()
+        # Tell the tasks to finish
+        for task in self.tasks:
+            self.jobqueue.put(None)
 
-            # Merge existing segments
-            finalsegments = self._merge_segments(mergetype, optimize, merge)
+        # Merge existing segments
+        finalsegments = self._merge_segments(mergetype, optimize, merge)
 
-            # Wait for the subtasks to finish
-            for task in self.tasks:
-                task.join()
+        # Wait for the subtasks to finish
+        for task in self.tasks:
+            task.join()
 
-            # Pull a (run_file_name, segment) tuple off the result queue for
-            # each sub-task, representing the final results of the task
-            results = []
-            for task in self.tasks:
-                results.append(self.resultqueue.get(timeout=5))
+        # Pull a (run_file_name, segment) tuple off the result queue for
+        # each sub-task, representing the final results of the task
+        results = []
+        for task in self.tasks:
+            results.append(self.resultqueue.get(timeout=5))
 
-            if self.multisegment:
-                finalsegments += [s for _, s in results]
-                if self._added:
-                    finalsegments.append(self._finalize_segment())
-                else:
-                    self._close_segment()
+        if self.multisegment:
+            finalsegments += [s for _, s in results]
+            if self._added:
+                finalsegments.append(self._finalize_segment())
             else:
-                # Merge the posting sources from the sub-writers and my
-                # postings into this writer
-                self._merge_subsegments(results, mergetype)
                 self._close_segment()
-                self._assemble_segment()
-                finalsegments.append(self.get_segment())
-            self._commit_toc(finalsegments)
-        finally:
-            self._finish()
+            assert self.perdocwriter.is_closed
+        else:
+            # Merge the posting sources from the sub-writers and my
+            # postings into this writer
+            self._merge_subsegments(results, mergetype)
+            self._close_segment()
+            self._assemble_segment()
+            finalsegments.append(self.get_segment())
+            assert self.perdocwriter.is_closed
+
+        self._commit_toc(finalsegments)
+        self._finish()
 
     def _merge_subsegments(self, results, mergetype):
         storage = self.storage
     def _commit(self, mergetype, optimize, merge):
         # Pull a (run_file_name, segment) tuple off the result queue for each
         # sub-task, representing the final results of the task
-        try:
-            # Merge existing segments
-            finalsegments = self._merge_segments(mergetype, optimize, merge)
-            results = []
-            for writer in self.tasks:
-                results.append(finish_subsegment(writer))
-            self._merge_subsegments(results, mergetype)
-            self._close_segment()
-            self._assemble_segment()
-            finalsegments.append(self.get_segment())
-            self._commit_toc(finalsegments)
-        finally:
-            self._finish()
+
+        # Merge existing segments
+        finalsegments = self._merge_segments(mergetype, optimize, merge)
+        results = []
+        for writer in self.tasks:
+            results.append(finish_subsegment(writer))
+
+        self._merge_subsegments(results, mergetype)
+        self._close_segment()
+        self._assemble_segment()
+        finalsegments.append(self.get_segment())
+
+        self._commit_toc(finalsegments)
+        self._finish()
 
 
 # For compatibility with old multiproc module
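
With these changes the job files that MpWriter passes to its sub-writer
processes live in the writer's temp storage rather than tempfile.mkstemp
paths. Nothing changes for callers; multiprocess indexing is still
requested through writer() (assuming an existing index in "indexdir";
field names are illustrative):

    from whoosh.index import open_dir

    ix = open_dir("indexdir")
    with ix.writer(procs=4) as w:   # returns an MpWriter
        w.add_document(title=u"Hello", content=u"body text")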

File src/whoosh/reading.py

 
         return None
 
+    def storage(self):
+        """Returns the :class:`whoosh.filedb.filestore.Storage` object used by
+        this reader to read its files. If the reader is not atomic
+        (``reader.is_atomic() == False``), returns None.
+        """
+
+        return None
+
     def is_atomic(self):
         return True
 
         self.schema = schema
         self.is_closed = False
 
-        self._storage = storage
         self._segment = segment
         self._segid = self._segment.segment_id()
         self._gen = generation
             # Use an overlay here instead of just the compound storage;
             # in rare circumstances a segment file may be added after the
             # segment is written
-            self._files = OverlayStorage(files, self._storage)
+            self._storage = OverlayStorage(files, storage)
         else:
-            self._files = storage
+            self._storage = storage
 
         # Get subreaders from codec
         self._codec = codec if codec else segment.codec()
-        self._terms = self._codec.terms_reader(self._files, segment)
-        self._perdoc = self._codec.per_document_reader(self._files, segment)
+        self._terms = self._codec.terms_reader(self._storage, segment)
+        self._perdoc = self._codec.per_document_reader(self._storage, segment)
         self._graph = None  # Lazy open with self._get_graph()
 
     def _get_graph(self):
         if not self._graph:
-            self._graph = self._codec.graph_reader(self._files, self._segment)
+            self._graph = self._codec.graph_reader(self._storage, self._segment)
         return self._graph
 
     def codec(self):
     def segment(self):
         return self._segment
 
+    def storage(self):
+        return self._storage
+
     def has_deletions(self):
         return self._perdoc.has_deletions()
 
 
         # It's possible some weird codec that doesn't use storage might have
         # passed None instead of a storage object
-        if self._files:
-            self._files.close()
+        if self._storage:
+            self._storage.close()
 
         self.is_closed = True
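
The new storage() accessor exposes the (possibly overlaid) storage a
SegmentReader reads from; readers without a single backing storage keep
the base implementation and return None. A quick sketch (assuming an open
index ix):

    r = ix.reader()
    st = r.storage()    # Storage for a segment reader, or None
    if st is not None:
        print(st)
    r.close()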
 

File src/whoosh/util/__init__.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from __future__ import with_statement
-import re, sys, time
+import random, sys, time
 from bisect import insort, bisect_left
 from functools import wraps
 
-from whoosh.compat import string_type
+from whoosh.compat import xrange
+
+
+# These must be valid separate characters in CASE-INSENSITIVE filenames
+IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
 
 
 if sys.platform == 'win32':
     now = time.time
 
 
+def random_name(size=28):
+    return "".join(random.choice(IDCHARS) for _ in xrange(size))
+
+
 def make_binary_tree(fn, args, **kwargs):
     """Takes a function/class that takes two positional arguments and a list of
     arguments and returns a binary tree of results/instances.

File src/whoosh/util/numlists.py

 
     def get(self, f, pos, i):
         f.seek(pos)
+        n = None
         for n in self.read_nums(f, i + 1):
             pass
         return n
         """
 
         count = 0
-
+        key = None
         for _ in xrange(n):
             if count == 0:
                 key = f.read_byte()

File src/whoosh/writing.py

 from whoosh.fields import UnknownFieldError
 from whoosh.index import LockError
 from whoosh.system import emptybytes
-from whoosh.util import fib
+from whoosh.util import fib, random_name
 from whoosh.util.filelock import try_for
 from whoosh.util.text import utf8encode
 
     # Subclass whoosh.externalsort.SortingPool to use knowledge of
     # postings to set run size in bytes instead of items
 
-    def __init__(self, limitmb=128, **kwargs):
+    namechars = "abcdefghijklmnopqrstuvwxyz0123456789"
+
+    def __init__(self, tempstore, segment, limitmb=128, **kwargs):
         SortingPool.__init__(self, **kwargs)
+        self.tempstore = tempstore
+        self.segment = segment
         self.limit = limitmb * 1024 * 1024
         self.currentsize = 0
 
+    def _new_run(self):
+        path = "%s.run" % random_name()
+        f = self.tempstore.create_file(path).raw_file()
+        return path, f
+
+    def _open_run(self, path):
+        return self.tempstore.open_file(path).raw_file()
+
+    def _remove_run(self, path):
+        return self.tempstore.delete_file(path)
+
     def add(self, item):
         # item = (fieldname, tbytes, docnum, weight, vbytes)
         assert isinstance(item[1], bytes_type), "tbytes=%r" % item[1]
         self._setup_doc_offsets()
 
         # Internals
-        poolprefix = "whoosh_%s_" % self.indexname
-        self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
-        newsegment = self.newsegment = codec.new_segment(self.storage,
-                                                         self.indexname)
+        self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname)
+        newsegment = codec.new_segment(self.storage, self.indexname)
+        self.newsegment = newsegment
         self.compound = compound and newsegment.should_assemble()
         self.is_closed = False
         self._added = False
+        self.pool = PostingPool(self._tempstorage, self.newsegment,
+                                limitmb=limitmb)
 
         # Set up writers
         self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
 
             yield (fieldname, text, newdoc, weight, vbytes)
 
+    def temp_storage(self):
+        return self._tempstorage
+
     def add_field(self, fieldname, fieldspec, **kwargs):
         self._check_state()
         if self._added:
     def _finish(self):
         if self.writelock:
             self.writelock.release()
+        self._tempstorage.destroy()
         self.is_closed = True
         #self.storage.close()
 
         """
 
         self._check_state()
-        try:
-            # Merge old segments if necessary
-            finalsegments = self._merge_segments(mergetype, optimize, merge)
-            if self._added:
-                # Flush the current segment being written and add it to the
-                # list of remaining segments returned by the merge policy
-                # function
-                finalsegments.append(self._finalize_segment())
-            else:
-                # Close segment files
-                self._close_segment()
-            # Write TOC
-            self._commit_toc(finalsegments)
-        finally:
-            # Final cleanup
-            self._finish()
+        # Merge old segments if necessary
+        finalsegments = self._merge_segments(mergetype, optimize, merge)
+        if self._added:
+            # Flush the current segment being written and add it to the
+            # list of remaining segments returned by the merge policy
+            # function
+            finalsegments.append(self._finalize_segment())
+        else:
+            # Close segment files
+            self._close_segment()
+        # Write TOC
+        self._commit_toc(finalsegments)
+
+        # Final cleanup
+        self._finish()
 
     def cancel(self):
         self._check_state()
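
At the writer level, SegmentWriter now owns an "<indexname>.tmp" storage
created via storage.temp_storage(), hands it to PostingPool for its run
files, and destroys it in _finish(). A sketch of the lifecycle (assuming
an open index ix):

    w = ix.writer()
    tempst = w.temp_storage()   # lives under the index storage dir
    # add_document() calls spool posting runs into tempst
    w.commit()                  # _finish() destroys the temp storage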

File tests/test_columns.py

               ("b", "ijk"), ("c", "fGgHh"), ("a", "9abc")]
 
     st = RamStorage()
-    msw = compound.CompoundWriter()
+    msw = compound.CompoundWriter(st)
     files = dict((name, msw.create_file(name)) for name in "abc")
     for name, data in domain:
         files[name].write(b(data))
     outfiles = dict((name, BytesIO(value)) for name, value in domain.items())
 
     with TempStorage() as st:
-        msw = compound.CompoundWriter(buffersize=4096)
+        msw = compound.CompoundWriter(st, buffersize=4096)
         mfiles = {}
         for name in domain:
             mfiles[name] = msw.create_file(name)