Commits

Matt Chaput committed 71e4c1e

Writer now assembles segment files into a single compound file by default.
(Use myindex.writer(compound=False) to disable.)
Removed uses of mmap except for the compound file.
Fixed bugs in file cleanup and RamIndex.
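
As a quick usage sketch (not part of this changeset; the directory and field
names are made up), the new default and the opt-out flag look like this:

    import os
    from whoosh import fields, index

    schema = fields.Schema(title=fields.TEXT(stored=True))
    if not os.path.exists("indexdir"):  # "indexdir" is just an example path
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)

    # Default: on commit the writer packs the new segment's files into a
    # single ".seg" compound file.
    with ix.writer() as w:
        w.add_document(title=u"alfa bravo charlie")

    # Pass compound=False to keep the individual per-segment files instead.
    with ix.writer(compound=False) as w:
        w.add_document(title=u"delta echo foxtrot")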

Files changed (16)

src/whoosh/codec/base.py

 
 from array import array
 from struct import Struct, pack
+from bisect import bisect_right
 
 from whoosh.compat import loads, dumps, b, bytes_type, string_type, xrange
 from whoosh.matching import Matcher, ReadTooFar
 # Base classes
 
 class Codec(object):
-    def __init__(self, storage):
-        self.storage = storage
-
     # Per document value writer
-    def per_document_writer(self, segment):
+    def per_document_writer(self, storage, segment):
         raise NotImplementedError
 
     # Inverted index writer
-    def field_writer(self, segment):
+    def field_writer(self, storage, segment):
         raise NotImplementedError
 
     # Readers
 
-    def terms_reader(self, segment):
+    def terms_reader(self, storage, segment):
         raise NotImplementedError
 
-    def lengths_reader(self, segment):
+    def lengths_reader(self, storage, segment):
         raise NotImplementedError
 
-    def vector_reader(self, segment):
+    def vector_reader(self, storage, segment):
         raise NotImplementedError
 
-    def stored_fields_reader(self, segment):
+    def stored_fields_reader(self, storage, segment):
         raise NotImplementedError
 
-    def graph_reader(self, segment):
+    def graph_reader(self, storage, segment):
         raise NotImplementedError
 
     # Generations
 
-    def commit_toc(self, indexname, schema, segments, generation):
+    def commit_toc(self, storage, indexname, schema, segments, generation):
         raise NotImplementedError
 
 
         # items = (fieldname, text, docnum, weight, valuestring) ...
         lastfn = None
         lasttext = None
-        getlen = lengths.get
+        dfl = lengths.doc_field_length
         for fieldname, text, docnum, weight, valuestring in items:
             # Items where docnum is None indicate words that should be added
             # to the spelling graph
                     lastfn = fieldname
                 start_term(text)
                 lasttext = text
-            length = getlen(docnum, fieldname)
+            length = dfl(docnum, fieldname)
             add(docnum, weight, valuestring, length)
         if lasttext is not None:
             finish_term()
 
 
 class LengthsReader(object):
-    def get(self, docnum, fieldname):
+    def doc_field_length(self, docnum, fieldname, default=0):
         raise NotImplementedError
 
     def field_length(self, fieldname):
         pass
 
 
+class MultiLengths(LengthsReader):
+    def __init__(self, lengths):
+        self.lengths = lengths
+        self.doc_offsets = []
+        self._count = 0
+        for lr in self.lengths:
+            self.doc_offsets.append(self._count)
+            self._count += lr.doc_count_all()
+        self.is_closed = False
+
+    def _document_reader(self, docnum):
+        return max(0, bisect_right(self.doc_offsets, docnum) - 1)
+
+    def _reader_and_docnum(self, docnum):
+        lnum = self._document_reader(docnum)
+        offset = self.doc_offsets[lnum]
+        return lnum, docnum - offset
+
+    def doc_count_all(self):
+        return self._count
+
+    def doc_field_length(self, docnum, fieldname, default=0):
+        x, y = self._reader_and_docnum(docnum)
+        return self.lengths[x].doc_field_length(y, fieldname, default=default)
+
+    def min_field_length(self):
+        return min(lr.min_field_length() for lr in self.lengths)
+
+    def max_field_length(self):
+        return max(lr.max_field_length() for lr in self.lengths)
+
+    def close(self):
+        for lr in self.lengths:
+            lr.close()
+        self.is_closed = True
+
+
 class StoredFieldsReader(object):
     def __iter__(self):
         raise NotImplementedError

src/whoosh/codec/legacy.py

         return block
 
     def read_ids(self):
-        dataoffset = self.dataoffset
-        string = self.postfile.map[dataoffset:dataoffset + self.idslen]
+        self.postfile.seek(self.dataoffset)
+        string = self.postfile.read(self.idslen)
         self.ids = deminimize_ids(string)
 
     def read_weights(self):
             return [1.0] * self.count
         else:
             offset = self.dataoffset + self.idslen
-            string = self.postfile.map[offset:offset + self.weightslen]
+            self.postfile.seek(offset)
+            string = self.postfile.read(self.weightslen)
             return deminimize_weights(self.count, string, self.cmp)
 
     def read_values(self):
             return [None] * self.count
         else:
             offset = self.dataoffset + self.idslen + self.weightslen
-            string = self.postfile.map[offset:self.nextoffset]
+            self.postfile.seek(offset)
+            string = self.postfile.read(self.nextoffset - offset)
             return deminimize_values(postingsize, self.count, string, self.cmp)
 
 
 
         postingsize = self.postingsize
         if postingsize != 0:
-            values_string = postfile.map[startoffset:endoffset]
+            postfile.seek(startoffset)
+            values_string = postfile.read(endoffset - startoffset)
 
             if self.wtslen:
                 # Values string is compressed

src/whoosh/codec/standard.py

 from whoosh.codec.base import (minimize_ids, deminimize_ids, minimize_weights,
                                deminimize_weights, minimize_values,
                                deminimize_values)
-from whoosh.filedb.fileindex import TOC, clean_files
+from whoosh.filedb.fileindex import Segment, TOC, clean_files
 from whoosh.filedb.filetables import CodedOrderedWriter, CodedOrderedReader
 from whoosh.matching import ListMatcher
 from whoosh.reading import TermNotFound
+from whoosh.store import Storage
 from whoosh.support.dawg import GraphWriter, GraphReader
 from whoosh.system import (pack_ushort, pack_long, unpack_ushort, unpack_long,
                            _INT_SIZE, _LONG_SIZE)
     VPOSTS_EXT = ".vps"  # Vector postings
     STORED_EXT = ".sto"  # Stored fields file
 
-    def __init__(self, storage, blocklimit=128, compression=3,
-                 loadlengths=False, inlinelimit=1):
-        self.storage = storage
+    def __init__(self, blocklimit=128, compression=3, loadlengths=False,
+                 inlinelimit=1):
         self.blocklimit = blocklimit
         self.compression = compression
         self.loadlengths = loadlengths
         self.inlinelimit = inlinelimit
 
     # Per-document value writer
-    def per_document_writer(self, segment):
-        return StdPerDocWriter(self.storage, segment,
-                               blocklimit=self.blocklimit,
+    def per_document_writer(self, storage, segment):
+        return StdPerDocWriter(storage, segment, blocklimit=self.blocklimit,
                                compression=self.compression)
 
     # Inverted index writer
-    def field_writer(self, segment):
-        return StdFieldWriter(self.storage, segment,
-                               blocklimit=self.blocklimit,
-                               compression=self.compression,
-                               inlinelimit=self.inlinelimit)
+    def field_writer(self, storage, segment):
+        return StdFieldWriter(storage, segment, blocklimit=self.blocklimit,
+                              compression=self.compression,
+                              inlinelimit=self.inlinelimit)
 
     # Readers
 
-    def terms_reader(self, segment):
-        tifile = segment.open_file(self.storage, self.TERMS_EXT)
-        postfile = segment.open_file(self.storage, self.POSTS_EXT, mapped=False)
+    def terms_reader(self, storage, segment):
+        tifile = segment.open_file(storage, self.TERMS_EXT)
+        postfile = segment.open_file(storage, self.POSTS_EXT)
         return StdTermsReader(tifile, postfile)
 
-    def lengths_reader(self, segment):
-        flfile = segment.open_file(self.storage, self.LENGTHS_EXT)
+    def lengths_reader(self, storage, segment):
+        flfile = segment.open_file(storage, self.LENGTHS_EXT)
         doccount = segment.doc_count_all()
 
         # Check the first byte of the file to see if it's an old format
             lengths = OnDiskLengths(flfile, doccount)
         return lengths
 
-    def vector_reader(self, segment):
-        vifile = segment.open_file(self.storage, self.VECTOR_EXT)
-        postfile = segment.open_file(self.storage, self.VPOSTS_EXT, mapped=False)
+    def vector_reader(self, storage, segment):
+        vifile = segment.open_file(storage, self.VECTOR_EXT)
+        postfile = segment.open_file(storage, self.VPOSTS_EXT)
         return StdVectorReader(vifile, postfile)
 
-    def stored_fields_reader(self, segment):
-        sffile = segment.open_file(self.storage, self.STORED_EXT, mapped=False)
+    def stored_fields_reader(self, storage, segment):
+        sffile = segment.open_file(storage, self.STORED_EXT)
         return StoredFieldReader(sffile)
 
-    def graph_reader(self, segment):
-        dawgfile = segment.open_file(self.storage, self.DAWG_EXT, mapped=False)
+    def graph_reader(self, storage, segment):
+        dawgfile = segment.open_file(storage, self.DAWG_EXT)
         return GraphReader(dawgfile)
 
     # Generations
 
-    def commit_toc(self, indexname, schema, segments, generation):
+    def commit_toc(self, storage, indexname, schema, segments, generation,
+                   clean=True):
         toc = TOC(schema, segments, generation)
-        toc.write(self.storage, indexname)
+        toc.write(storage, indexname)
         # Delete leftover files
-        clean_files(self.storage, indexname, generation, segments)
+        if clean:
+            clean_files(storage, indexname, generation, segments)
 
 
 # Per-document value writer
 
 class StdPerDocWriter(base.PerDocumentWriter):
     def __init__(self, storage, segment, blocklimit=128, compression=3):
+        if not isinstance(blocklimit, int):
+            raise ValueError
         self.storage = storage
         self.segment = segment
         self.blocklimit = blocklimit
 
         # We'll wait to create the vector files until someone actually tries
         # to add a vector
+        self.vindex = self.vpostfile = None
+
+    def _make_vector_files(self):
         vifile = self.segment.create_file(self.storage, StdCodec.VECTOR_EXT)
         self.vindex = VectorWriter(vifile)
         self.vpostfile = self.segment.create_file(self.storage, StdCodec.VPOSTS_EXT)
         return StdBlock(postingsize, stringids=True)
 
     def add_vector_items(self, fieldname, fieldobj, items):
+        if self.vindex is None:
+            self._make_vector_files()
+
         # items = (text, freq, weight, valuestring) ...
         postfile = self.vpostfile
         blocklimit = self.blocklimit
 class StdFieldWriter(base.FieldWriter):
     def __init__(self, storage, segment, blocklimit=128, compression=3,
                  inlinelimit=1):
+        assert isinstance(storage, Storage)
+        assert isinstance(segment, Segment)
+        assert isinstance(blocklimit, int)
+        assert isinstance(compression, int)
+        assert isinstance(inlinelimit, int)
+
         self.storage = storage
         self.segment = segment
         self.fieldname = None
         for fieldname in self.starts:
             self.starts[fieldname] += eoh
 
+    def doc_count_all(self):
+        return self._count
+
     def field_length(self, fieldname):
         return self.totals.get(fieldname, 0)
 
 
     # Get
 
-    def get(self, docnum, fieldname, default=0):
+    def doc_field_length(self, docnum, fieldname, default=0):
         try:
             arry = self.lengths[fieldname]
         except KeyError:
         self.dbfile = dbfile
         self._read_header(dbfile, doccount)
 
-    def get(self, docnum, fieldname, default=0):
+    def doc_field_length(self, docnum, fieldname, default=0):
         try:
             start = self.starts[fieldname]
         except KeyError:
             raise Exception("Error reading %r @%s %s < %s"
                             % (dbfile, start, len(ptr), stored_pointer_size))
         position, length = unpack_stored_pointer(ptr)
-        vlist = loads(dbfile.map[position:position + length] + b("."))
+        dbfile.seek(position)
+        vlist = loads(dbfile.read(length) + b("."))
 
         names = self.names
         # Recreate a dictionary by putting the field names and values back
 
     def read_ids(self):
         offset = self.dataoffset
-        idstring = self.postfile.map[offset:offset + self.idslen]
+        self.postfile.seek(offset)
+        idstring = self.postfile.read(self.idslen)
         ids = deminimize_ids(self.idcode, self.count, idstring,
                              self.compression)
         self.ids = ids
             weights = [1.0] * self.count
         else:
             offset = self.dataoffset + self.idslen
-            wtstring = self.postfile.map[offset:offset + self.weightslen]
+            self.postfile.seek(offset)
+            wtstring = self.postfile.read(self.weightslen)
             weights = deminimize_weights(self.count, wtstring,
                                          self.compression)
         self.weights = weights
             values = [None] * self.count
         else:
             offset = self.dataoffset + self.idslen + self.weightslen
-            vstring = self.postfile.map[offset:self.nextoffset]
+            self.postfile.seek(offset)
+            vstring = self.postfile.read(self.nextoffset - offset)
             values = deminimize_values(postingsize, self.count, vstring,
                                        self.compression)
         self.values = values

src/whoosh/filedb/compound.py

+# Copyright 2011 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+import mmap
+from threading import Lock
+from shutil import copyfileobj
+
+from whoosh.compat import BytesIO, PY3
+from whoosh.filedb.structfile import StructFile
+from whoosh.filedb.filestore import FileStorage
+
+
+class CompoundStorage(FileStorage):
+    readonly = True
+
+    def __init__(self, store, name, basepos=0):
+        self.name = name
+        f = store.open_file(name)
+        f.seek(basepos)
+
+        self.diroffset = f.read_long()
+        self.dirlength = f.read_int()
+        f.seek(self.diroffset)
+        self.dir = f.read_pickle()
+        self.options = f.read_pickle()
+
+        if store.supports_mmap:
+            self.source = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+        else:
+            # Can't mmap files in this storage object, so we'll have to take
+            # the hit and read the whole file as a string :(
+            f.seek(basepos)
+            self.source = f.read(self.diroffset)
+        f.close()
+        self.locks = {}
+
+    def __repr__(self):
+        return "<%s (%s)>" % (self.__class__.__name__, self.name)
+
+    def open_file(self, name, *args, **kwargs):
+        info = self.dir[name]
+        offset = info["offset"]
+        length = info["length"]
+        if PY3:
+            buf = memoryview(self.source)[offset:offset + length]
+        else:
+            buf = buffer(self.source, offset, length)
+        f = StructFile(BytesIO(buf), name=name)
+        return f
+
+    def list(self):
+        return list(self.dir.keys())
+
+    def file_exists(self, name):
+        return name in self.dir
+
+    def file_length(self, name):
+        info = self.dir[name]
+        return info["length"]
+
+    def file_modified(self, name):
+        info = self.dir[name]
+        return info["modified"]
+
+    def lock(self, name):
+        if name not in self.locks:
+            self.locks[name] = Lock()
+        return self.locks[name]
+
+    @staticmethod
+    def assemble(out, store, names, **options):
+        assert names, names
+
+        dir = {}
+        basepos = out.tell()
+        out.write_long(0)  # Directory position
+        out.write_int(0)  # Directory length
+
+        # Copy the files into the compound file
+        for name in names:
+            if name.endswith(".toc") or name.endswith(".seg"):
+                raise Exception(name)
+
+        for name in names:
+            offset = out.tell()
+            length = store.file_length(name)
+            modified = store.file_modified(name)
+            dir[name] = {"offset": offset, "length": length,
+                         "modified": modified}
+            f = store.open_file(name)
+            copyfileobj(f, out)
+            f.close()
+
+        dirpos = out.tell()  # Remember the start of the directory
+        out.write_pickle(dir)  # Write the directory
+        out.write_pickle(options)
+        endpos = out.tell()  # Remember the end of the directory
+        out.flush()
+        out.seek(basepos)  # Seek back to the start
+        out.write_long(dirpos)  # Directory position
+        out.write_int(endpos - dirpos)  # Directory length
+
+        out.close()
+
+
+
+
+
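
To make the new module concrete, here is a rough sketch (not part of the
commit) of assembling and reopening a compound file with RamStorage; the file
names and contents are invented, and the calls assume the APIs shown in this
diff:

    from whoosh.compat import b
    from whoosh.filedb.compound import CompoundStorage
    from whoosh.filedb.filestore import RamStorage

    st = RamStorage()

    # Write two fake "segment" files into the storage.
    for fname, data in [("_demo_1.trm", b("term data")),
                        ("_demo_1.pst", b("posting data"))]:
        f = st.create_file(fname)
        f.write(data)
        f.close()

    # Pack them into a single compound file...
    out = st.create_file("_demo_1.seg")
    CompoundStorage.assemble(out, st, ["_demo_1.trm", "_demo_1.pst"])

    # ...then open that file as a read-only storage object.
    cst = CompoundStorage(st, "_demo_1.seg")
    assert cst.file_exists("_demo_1.trm")
    assert cst.open_file("_demo_1.trm").read() == b("term data")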

src/whoosh/filedb/fieldcache.py

             filename = gzfilename
             gzipped = True
 
-        f = storage.open_file(filename, mapped=False, gzip=gzipped)
+        f = storage.open_file(filename, gzip=gzipped)
         cache = self.fcclass.from_file(f)
         f.close()
         return cache

src/whoosh/filedb/fileindex.py

 from whoosh import __version__
 from whoosh.compat import pickle, integer_types, string_type, xrange
 from whoosh.fields import ensure_schema
+from whoosh.filedb.compound import CompoundStorage
 from whoosh.index import (Index, EmptyIndexError, IndexVersionError,
                           _DEF_INDEX_NAME)
 from whoosh.reading import EmptyReader, MultiReader
 
     @classmethod
     def _segment_pattern(cls, indexname):
-        return re.compile("(_%s_[0-9a-z]+)[.][a-z]+" % indexname)
+        return re.compile("_(%s_[0-9a-z]+)[.][a-z]+" % indexname)
 
     @classmethod
     def _latest_generation(cls, storage, indexname):
     # probably be deleted eventually by a later call to clean_files.
 
     current_segment_names = set(s.segment_id() for s in segments)
-
     tocpattern = TOC._pattern(indexname)
     segpattern = TOC._segment_pattern(indexname)
 
     todelete = set()
     for filename in storage:
+        if filename.startswith("."):
+            continue
         tocm = tocpattern.match(filename)
         segm = segpattern.match(filename)
         if tocm:
     def is_empty(self):
         return len(self._read_toc().segments) == 0
 
-    def optimize(self):
-        w = self.writer()
+    def optimize(self, **kwargs):
+        w = self.writer(**kwargs)
         w.commit(optimize=True)
 
     # searcher
 
     def writer(self, procs=1, **kwargs):
         if procs > 1:
-            from whoosh.filedb.multiproc2 import MpWriter
+            from whoosh.filedb.multiproc import MpWriter
             return MpWriter(self, **kwargs)
         else:
             from whoosh.filedb.filewriting import SegmentWriter
     along the way).
     """
 
+    # These must be valid separate characters in CASE-INSENSITIVE filenames
     IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
+    # Extension for compound segment files
+    COMPOUND_EXT = ".seg"
 
     @classmethod
     def _random_id(cls, size=12):
         self.doccount = doccount
         self.segid = self._random_id() if segid is None else segid
         self.deleted = deleted
+        self.compound = False
 
     def __repr__(self):
         return "<%s %s>" % (self.__class__.__name__, getattr(self, "segid", ""))
             return "%s_%s" % (self.indexname, self.segid)
 
     def make_filename(self, ext):
-        return "%s%s" % (self.segment_id(), ext)
+        return "_%s%s" % (self.segment_id(), ext)
+
+    def list_files(self, storage):
+        prefix = "_%s." % self.segment_id()
+        return [name for name in storage.list() if name.startswith(prefix)]
 
     def create_file(self, storage, ext, **kwargs):
         """Convenience method to create a new file in the given storage named
         fname = self.make_filename(ext)
         return storage.open_file(fname, **kwargs)
 
+    def create_compound_file(self, storage):
+        segfiles = self.list_files(storage)
+        assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles)
+        cfile = self.create_file(storage, self.COMPOUND_EXT)
+        CompoundStorage.assemble(cfile, storage, segfiles)
+        for name in segfiles:
+            storage.delete_file(name)
+
+    def open_compound_file(self, storage):
+        name = self.make_filename(self.COMPOUND_EXT)
+        return CompoundStorage(storage, name)
+
     def doc_count_all(self):
         """
         :returns: the total number of documents, DELETED OR UNDELETED, in this

src/whoosh/filedb/filereading.py

 
 from bisect import bisect_left
 
-from whoosh.compat import iteritems, string_type, integer_types, xrange
+from whoosh.compat import iteritems, xrange
+from whoosh.filedb.compound import CompoundStorage
 from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.fieldcache import FieldCache, DefaultFieldCachingPolicy
-from whoosh.matching import FilterMatcher, ListMatcher
+from whoosh.matching import FilterMatcher
 from whoosh.reading import IndexReader, TermNotFound
+from whoosh.store import OverlayStorage
 from whoosh.support import dawg
 
 
         self._dc = segment.doc_count()
         self._dc_all = segment.doc_count_all()
         if hasattr(self.segment, "segment_id"):
-            self.segid = str(self.segment.segment_id())
+            self.segid = self.segment.segment_id()
         else:
             self.segid = Segment._random_id()
 
+        # self.files is a storage object from which to load the segment files.
+        # This is different from the general storage (which will be used for
+        # caches) if the segment is in a compound file.
+        if segment.compound:
+            # Use an overlay here instead of just the compound storage because
+            # in rare circumstances a segment file may be added after the
+            # segment is written
+            self.files = OverlayStorage(segment.open_compound_file(storage),
+                                        self.storage)
+        else:
+            self.files = storage
+
         # Get microreaders from codec
         if codec is None:
             from whoosh.codec.standard import StdCodec
-            codec = StdCodec(self.storage)
+            codec = StdCodec()
         self._codec = codec
-        self._terms = codec.terms_reader(self.segment)
-        self._lengths = codec.lengths_reader(self.segment)
-        self._stored = codec.stored_fields_reader(self.segment)
+        self._terms = codec.terms_reader(self.files, self.segment)
+        self._lengths = codec.lengths_reader(self.files, self.segment)
+        self._stored = codec.stored_fields_reader(self.files, self.segment)
         self._vectors = None  # Lazy open with self._open_vectors()
         self._graph = None  # Lazy open with self._open_dawg()
 
     def _open_vectors(self):
         if self._vectors:
             return
-        self._vectors = self._codec.vector_reader(self.segment)
+        self._vectors = self._codec.vector_reader(self.files, self.segment)
 
     def _open_dawg(self):
         if self._graph:
             return
-        self._graph = self._codec.graph_reader(self.segment)
+        self._graph = self._codec.graph_reader(self.files, self.segment)
 
     def has_deletions(self):
         return self._has_deletions
         return self._lengths.max_field_length(fieldname)
 
     def doc_field_length(self, docnum, fieldname, default=0):
-        return self._lengths.get(docnum, fieldname, default=default)
+        return self._lengths.doc_field_length(docnum, fieldname,
+                                              default=default)
 
     def has_vector(self, docnum, fieldname):
         if self.schema[fieldname].vector:
-            self._open_vectors()
+            try:
+                self._open_vectors()
+            except (NameError, IOError):
+                return False
             return (docnum, fieldname) in self._vectors
         else:
             return False
             return False
         if not self.schema[fieldname].spelling:
             return False
-        self._open_dawg()
+        try:
+            self._open_dawg()
+        except (NameError, IOError):
+            return False
         return self._graph.has_root(fieldname)
 
     def word_graph(self, fieldname):

src/whoosh/filedb/filestore.py

     pass
 
 
+def create_index(storage, schema, indexname):
+    from whoosh.filedb.fileindex import TOC, FileIndex
+
+    if storage.readonly:
+        raise ReadOnlyError
+    TOC.create(storage, schema, indexname)
+    return FileIndex(storage, schema, indexname)
+
+
+def open_index(storage, schema, indexname):
+    from whoosh.filedb.fileindex import FileIndex
+
+    return FileIndex(storage, schema=schema, indexname=indexname)
+
 class FileStorage(Storage):
     """Storage object that stores the index as files in a directory on disk.
     """
 
-    def __init__(self, path, mapped=True, readonly=False):
+    supports_mmap = True
+
+    def __init__(self, path, mapped=False, readonly=False):
         self.folder = path
         self.mapped = mapped
         self.readonly = readonly
             raise IOError("Directory %s does not exist" % path)
 
     def create_index(self, schema, indexname=_DEF_INDEX_NAME):
-        from whoosh.filedb.fileindex import TOC, FileIndex
-
-        if self.readonly:
-            raise ReadOnlyError
-        TOC.create(self, schema, indexname)
-        return FileIndex(self, schema, indexname)
+        return create_index(self, schema, indexname)
 
     def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
-        from whoosh.filedb.fileindex import FileIndex
-
-        return FileIndex(self, schema=schema, indexname=indexname)
+        return open_index(self, schema, indexname)
 
     def create_file(self, name, excl=False, mode="wb", **kwargs):
         if self.readonly:
         else:
             fileobj = open(path, mode)
 
-        f = StructFile(fileobj, name=name, mapped=self.mapped, **kwargs)
+        f = StructFile(fileobj, name=name, **kwargs)
         return f
 
     def open_file(self, name, *args, **kwargs):
-        try:
-            f = StructFile(open(self._fpath(name), "rb"), name=name, *args,
-                           **kwargs)
-        except IOError:
-            #print("Tried to open %r, files=%r" % (name, self.list()))
-            raise
+        f = StructFile(open(self._fpath(name), "rb"), name=name, *args,
+                       **kwargs)
         return f
 
     def _fpath(self, fname):
-        return os.path.join(self.folder, fname)
+        return os.path.abspath(os.path.join(self.folder, fname))
 
     def clean(self):
+        if self.readonly:
+            raise ReadOnlyError
+
         path = self.folder
         if not os.path.exists(path):
             os.mkdir(path)
 
         files = self.list()
-        for file in files:
-            os.remove(os.path.join(path, file))
+        for fname in files:
+            os.remove(os.path.join(path, fname))
 
     def list(self):
         try:
         return os.path.getsize(self._fpath(name))
 
     def delete_file(self, name):
+        if self.readonly:
+            raise ReadOnlyError
+
         os.remove(self._fpath(name))
 
-    def rename_file(self, frm, to, safe=False):
-        if os.path.exists(self._fpath(to)):
+    def rename_file(self, oldname, newname, safe=False):
+        if self.readonly:
+            raise ReadOnlyError
+
+        if os.path.exists(self._fpath(newname)):
             if safe:
-                raise NameError("File %r exists" % to)
+                raise NameError("File %r exists" % newname)
             else:
-                os.remove(self._fpath(to))
-        os.rename(self._fpath(frm), self._fpath(to))
+                os.remove(self._fpath(newname))
+        os.rename(self._fpath(oldname), self._fpath(newname))
 
     def lock(self, name):
         return FileLock(self._fpath(name))
         return "%s(%s)" % (self.__class__.__name__, repr(self.folder))
 
 
-class RamStorage(FileStorage):
+class RamStorage(Storage):
     """Storage object that keeps the index in memory.
     """
 
+    supports_mmap = False
+
     def __init__(self):
         self.files = {}
         self.locks = {}
         self.folder = ''
 
+    def create_index(self, schema, indexname=_DEF_INDEX_NAME):
+        return create_index(self, schema, indexname)
+
+    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
+        return open_index(self, schema, indexname)
+
     def list(self):
         return list(self.files.keys())
 
 
     def file_length(self, name):
         if name not in self.files:
-            raise NameError
+            raise NameError(name)
         return len(self.files[name])
 
+    def file_modified(self, name):
+        return -1
+
     def delete_file(self, name):
         if name not in self.files:
-            raise NameError
+            raise NameError(name)
         del self.files[name]
 
     def rename_file(self, name, newname, safe=False):
         if name not in self.files:
-            raise NameError("File %r does not exist" % name)
+            raise NameError(name)
         if safe and newname in self.files:
             raise NameError("File %r exists" % newname)
 
 
     def open_file(self, name, *args, **kwargs):
         if name not in self.files:
-            raise NameError("No such file %r" % name)
+            raise NameError(name)
         return StructFile(BytesIO(self.files[name]), name=name, *args,
                           **kwargs)
 

src/whoosh/filedb/filetables.py

 class HashReader(object):
     def __init__(self, dbfile):
         self.dbfile = dbfile
-        self.map = dbfile.map
 
         dbfile.seek(0)
         magic = dbfile.read(4)
     def close(self):
         if self.is_closed:
             raise Exception("Tried to close %r twice" % self)
-        del self.map
         self.dbfile.close()
         self.is_closed = True
 
     def read(self, position, length):
-        return self.map[position:position + length]
+        self.dbfile.seek(position)
+        return self.dbfile.read(length)
 
     def _ranges(self, pos=None):
         if pos is None:

src/whoosh/filedb/filewriting.py

 from whoosh.store import LockError
 from whoosh.support.filelock import try_for
 from whoosh.support.externalsort import SortingPool
-from whoosh.util import fib, utf8encode
+from whoosh.util import fib
 from whoosh.writing import IndexWriter, IndexingError
 
 
         self.currentsize = 0
 
 
+def renumber_postings(reader, startdoc, docmap):
+    for fieldname, text, docnum, weight, value in reader.iter_postings():
+        newdoc = docmap[docnum] if docmap else startdoc + docnum
+        yield (fieldname, text, newdoc, weight, value)
+
+
 # Writer object
 
 class SegmentWriter(IndexWriter):
     def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
-                 limitmb=128, docbase=0, codec=None, **kwargs):
+                 limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
         # Lock the index
         self.writelock = None
         if _lk:
         self._setup_doc_offsets()
 
         # Internals
+        self.compound = compound
         poolprefix = "whoosh_%s_" % self.indexname
         self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
-        self.newsegment = Segment(self.indexname, 0)
+        newsegment = self.newsegment = Segment(self.indexname, 0)
         self.is_closed = False
         self._added = False
 
         # Set up writers
         if codec is None:
             from whoosh.codec.standard import StdCodec
-            codec = StdCodec(self.storage)
+            codec = StdCodec()
         self.codec = codec
-        self.perdocwriter = codec.per_document_writer(self.newsegment)
-        self.fieldwriter = codec.field_writer(self.newsegment)
+        self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
+        self.fieldwriter = codec.field_writer(self.storage, newsegment)
 
     def _setup_doc_offsets(self):
         self._doc_offsets = []
                 yield (fieldname, text, newdoc, weight, valuestring)
         self.fieldwriter.add_postings(schema, lengths, gen())
 
-    def add_reader(self, reader):
-        self._check_state()
-        schema = self.schema
-        perdocwriter = self.perdocwriter
+    def _make_docmap(self, reader, newdoc):
+        # If the reader has deletions, make a dictionary mapping the docnums
+        # of undeleted documents to new sequential docnums starting at newdoc
         hasdel = reader.has_deletions()
         if hasdel:
-            # Documents will be renumbered because the deleted documents will
-            # be skipped, so keep a mapping between old and new docnums
             docmap = {}
+            for docnum in reader.all_doc_ids():
+                if reader.is_deleted(docnum):
+                    continue
+                docmap[docnum] = newdoc
+                newdoc += 1
         else:
             docmap = None
+            newdoc += reader.doc_count_all()
+        # Return the map and the new lowest unused document number
+        return docmap, newdoc
+
+    def _merge_per_doc(self, reader, docmap):
+        schema = self.schema
+        newdoc = self.docnum
+        perdocwriter = self.perdocwriter
         sharedfields = set(schema.names()) & set(reader.schema.names())
 
-        # Add per-document values
-        startdoc = newdoc = self.docnum
         for docnum in reader.all_doc_ids():
             # Skip deleted documents
-            if (not hasdel) or (not reader.is_deleted(docnum)):
-                if hasdel:
-                    docmap[docnum] = newdoc
+            if docmap and docnum not in docmap:
+                continue
+            # Renumber around deletions
+            if docmap:
+                newdoc = docmap[docnum]
 
-                # Get the stored fields
-                d = reader.stored_fields(docnum)
-                # Start a new document in the writer
-                perdocwriter.start_doc(newdoc)
-                # For each field in the document, copy its stored value,
-                # length, and vectors (if any) to the writer
-                for fieldname in sharedfields:
-                    field = schema[fieldname]
-                    length = (reader.doc_field_length(docnum, fieldname, 0)
-                              if field.scorable else 0)
-                    perdocwriter.add_field(fieldname, field, d.get(fieldname),
-                                           length)
-                    if field.vector and reader.has_vector(docnum, fieldname):
-                        v = reader.vector(docnum, fieldname)
-                        perdocwriter.add_vector_matcher(fieldname, field, v)
-                # Finish the new document 
-                perdocwriter.finish_doc()
-                newdoc += 1
-        self.docnum = newdoc
+            # Get the stored fields
+            d = reader.stored_fields(docnum)
+            # Start a new document in the writer
+            perdocwriter.start_doc(newdoc)
+            # For each field in the document, copy its stored value,
+            # length, and vectors (if any) to the writer
+            for fieldname in sharedfields:
+                field = schema[fieldname]
+                length = (reader.doc_field_length(docnum, fieldname, 0)
+                          if field.scorable else 0)
+                perdocwriter.add_field(fieldname, field, d.get(fieldname),
+                                       length)
+                if field.vector and reader.has_vector(docnum, fieldname):
+                    v = reader.vector(docnum, fieldname)
+                    perdocwriter.add_vector_matcher(fieldname, field, v)
+            # Finish the new document 
+            perdocwriter.finish_doc()
+            newdoc += 1
 
+    def _merge_fields(self, reader, docmap):
         # Add inverted index postings to the pool, renumbering document number
         # references as necessary
         add_post = self.pool.add
-        for fieldname, text, docnum, weight, value in reader.iter_postings():
-            # Remap the document number if necessary
-            newdoc = docmap[docnum] if hasdel else startdoc + docnum
-            # Add the posting to the pool
-            add_post((fieldname, text, newdoc, weight, value))
+        # Note: iter_postings() only yields postings for undeleted docs
+        for p in renumber_postings(reader, self.docnum, docmap):
+            add_post(p)
 
+    def add_reader(self, reader):
+        self._check_state()
+
+        # Make a docnum map to renumber around deleted documents
+        docmap, newdoc = self._make_docmap(reader, self.docnum)
+        # Add per-document values
+        self._merge_per_doc(reader, docmap)
+        # Add field postings
+        self._merge_fields(reader, docmap)
+
+        self.docnum = newdoc
         self._added = True
 
     def _check_fields(self, schema, fieldnames):
 
         self._check_state()
         schema = self.schema
+        storage = self.storage
         try:
             if mergetype:
                 pass
             else:
                 self.pool.cleanup()
 
-            # Close all files, write a new TOC with the new segment list, and
-            # release the lock.
+            # Close all files
             self._close_all()
-            self.codec.commit_toc(self.indexname, self.schema, finalsegments,
-                                  self.generation)
+
+            if self._added and self.compound:
+                # Assemble the segment files into a compound file
+                newsegment.create_compound_file(storage)
+                newsegment.compound = True
+
+            # Write a new TOC with the new segment list (and delete old files)
+            self.codec.commit_toc(storage, self.indexname, schema,
+                                  finalsegments, self.generation)
         finally:
             if self.writelock:
                 self.writelock.release()

src/whoosh/filedb/structfile.py

     "write_varint" and "write_long".
     """
 
-    def __init__(self, fileobj, name=None, onclose=None, mapped=True,
-                 gzip=False):
+    def __init__(self, fileobj, name=None, onclose=None, gzip=False):
 
         if gzip:
             fileobj = GzipFile(fileobj=fileobj)
                 setattr(self, attr, getattr(fileobj, attr))
 
         self.is_real = not gzip and hasattr(fileobj, "fileno")
-
-        # If mapped is True, set the 'map' attribute to a memory-mapped
-        # representation of the file. Otherwise, the fake 'map' that set up by
-        # the base class will be used.
-        if (mapped and self.is_real
-            and hasattr(fileobj, "mode") and "r" in fileobj.mode):
-            fd = fileobj.fileno()
-            self.size = os.fstat(fd).st_size
-            if self.size > 0:
-                import mmap
-
-                try:
-                    self.map = mmap.mmap(fd, self.size,
-                                         access=mmap.ACCESS_READ)
-                except OSError:
-                    self._setup_fake_map()
-        else:
-            self._setup_fake_map()
+        if self.is_real:
+            self.fileno = fileobj.fileno
 
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self._name)
         """Flushes the buffer of the wrapped file. This is a no-op if the
         wrapped file does not have a flush method.
         """
+
         if hasattr(self.file, "flush"):
             self.file.flush()
 
     def close(self):
-        """Closes the wrapped file. This is a no-op if the wrapped file does
-        not have a close method.
+        """Closes the wrapped file.
         """
 
         if self.is_closed:
             raise Exception("This file is already closed")
-        del self.map
         if self.onclose:
             self.onclose(self)
         if hasattr(self.file, "close"):
             self.file.close()
         self.is_closed = True
 
-    def _setup_fake_map(self):
-        _self = self
-
-        class fakemap(object):
-            def __getitem__(self, slc):
-                if isinstance(slc, integer_types):
-                    _self.seek(slc)
-                    return _self.read(1)[0]
-                else:
-                    _self.seek(slc.start)
-                    return _self.read(slc.stop - slc.start)
-
-        self.map = fakemap()
-
     def write_string(self, s):
         """Writes a string to the wrapped file. This method writes the length
         of the string first, so you can read the string back without having to
     def read_byte(self):
         return ord(self.file.read(1))
 
-    def get_byte(self, position):
-        v = self.map[position]
-        if PY3:  # Getting an item returns an int
-            return v
-        else:  # Getting an item returns a 1-character str
-            return ord(v[0])
-
     def write_8bitfloat(self, f, mantissabits=5, zeroexp=2):
         """Writes a byte-sized representation of floating point value f to the
         wrapped file.
             a.byteswap()
         return a
 
+    def get_byte(self, position):
+        self.file.seek(position)
+        return self.read_byte()
+
     def get_sbyte(self, position):
-        return unpack_sbyte(self.map[position:position + 1])[0]
+        self.file.seek(position)
+        return self.read_sbyte()
 
     def get_int(self, position):
-        return unpack_int(self.map[position:position + _INT_SIZE])[0]
+        self.file.seek(position)
+        return self.read_int()
 
     def get_uint(self, position):
-        return unpack_uint(self.map[position:position + _INT_SIZE])[0]
+        self.file.seek(position)
+        return self.read_uint()
 
     def get_ushort(self, position):
-        return unpack_ushort(self.map[position:position + _SHORT_SIZE])[0]
+        self.file.seek(position)
+        return self.read_ushort()
 
     def get_long(self, position):
-        return unpack_long(self.map[position:position + _LONG_SIZE])[0]
+        self.file.seek(position)
+        return self.read_long()
 
     def get_float(self, position):
-        return unpack_float(self.map[position:position + _FLOAT_SIZE])[0]
+        self.file.seek(position)
+        return self.read_float()
 
     def get_array(self, position, typecode, length):
-        source = self.map[position:position + length * _SIZEMAP[typecode]]
-        a = array(typecode)
-        a.fromstring(source)
-        if IS_LITTLE:
-            a.byteswap()
-        return a
+        self.file.seek(position)
+        return self.read_array(typecode, length)
+
+
+
+

src/whoosh/index.py

     return storage.create_index(schema, indexname)
 
 
-def open_dir(dirname, indexname=None, mapped=True, readonly=False):
+def open_dir(dirname, indexname=None, readonly=False):
     """Convenience function for opening an index in a directory. Takes care of
     creating a FileStorage object for you. dirname is the filename of the
     directory containing the index. indexname is the name of the index to
         index.
     :param indexname: the name of the index to create; you only need to specify
         this if you have multiple indexes within the same storage object.
-    :param mapped: whether to use memory mapping to speed up disk reading.
-    :returns: :class:`Index`
     """
 
     if indexname is None:
         indexname = _DEF_INDEX_NAME
 
     from whoosh.filedb.filestore import FileStorage
-    storage = FileStorage(dirname, mapped=mapped, readonly=readonly)
+    storage = FileStorage(dirname, readonly=readonly)
     return storage.open_index(indexname)
 
 

src/whoosh/store.py

     """
 
     readonly = False
+    supports_mmap = False
 
     def __iter__(self):
         return iter(self.list())
 
     def optimize(self):
         pass
+
+
+class OverlayStorage(Storage):
+    """Overlays two storage objects. Reads are processed from the first if it
+    has the named file, otherwise the second. Writes always go to the second.
+    """
+
+    def __init__(self, a, b):
+        self.a = a
+        self.b = b
+
+    def create_index(self, *args, **kwargs):
+        return self.b.create_index(*args, **kwargs)
+
+    def open_index(self, *args, **kwargs):
+        return self.a.open_index(*args, **kwargs)
+
+    def create_file(self, *args, **kwargs):
+        return self.b.create_file(*args, **kwargs)
+
+    def create_temp(self, *args, **kwargs):
+        return self.b.create_temp(*args, **kwargs)
+
+    def open_file(self, name, *args, **kwargs):
+        if self.a.file_exists(name):
+            return self.a.open_file(name, *args, **kwargs)
+        else:
+            return self.b.open_file(name, *args, **kwargs)
+
+    def list(self):
+        return list(set(self.a.list()) | set(self.b.list()))
+
+    def file_exists(self, name):
+        return self.a.file_exists(name) or self.b.file_exists(name)
+
+    def file_modified(self, name):
+        if self.a.file_exists(name):
+            return self.a.file_modified(name)
+        else:
+            return self.b.file_modified(name)
+
+    def file_length(self, name):
+        if self.a.file_exists(name):
+            return self.a.file_length(name)
+        else:
+            return self.b.file_length(name)
+
+    def delete_file(self, name):
+        return self.b.delete_file(name)
+
+    def rename_file(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def lock(self, name):
+        return self.b.lock(name)
+
+    def close(self):
+        self.a.close()
+        self.b.close()
+
+    def optimize(self):
+        self.a.optimize()
+        self.b.optimize()
+
+
+
+
+
+
+
+
+
+
+
+
+
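
And a sketch (again not part of the commit, with invented file names) of how
OverlayStorage is meant to combine a segment's compound file with the regular
storage, as the reader in filereading.py above does:

    from whoosh.compat import b
    from whoosh.filedb.compound import CompoundStorage
    from whoosh.filedb.filestore import RamStorage
    from whoosh.store import OverlayStorage

    st = RamStorage()
    f = st.create_file("_demo_1.trm")
    f.write(b("term data"))
    f.close()

    # Assemble the segment file into a compound file and delete the original,
    # roughly what Segment.create_compound_file() does.
    CompoundStorage.assemble(st.create_file("_demo_1.seg"), st, ["_demo_1.trm"])
    st.delete_file("_demo_1.trm")

    # Reads try the compound storage first and fall back to the regular
    # storage; new files (e.g. field caches) are created in the regular one.
    ov = OverlayStorage(CompoundStorage(st, "_demo_1.seg"), st)
    assert ov.open_file("_demo_1.trm").read() == b("term data")
    cache = ov.create_file("_demo_1.fc")
    cache.write(b("cache data"))
    cache.close()
    assert "_demo_1.fc" in st.list()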

tests/test_codecs.py

 
 def _make_codec(**kwargs):
     st = RamStorage()
-    codec = standard.StdCodec(st, **kwargs)
+    codec = standard.StdCodec(**kwargs)
     seg = Segment("test")
-    return codec, seg
+    return st, codec, seg
 
 def test_termkey():
     from whoosh.codec.standard import TermIndexWriter
 
 def test_docwriter_one():
     field = fields.TEXT(stored=True)
-    codec, seg = _make_codec()
-    dw = codec.per_document_writer(seg)
+    st, codec, seg = _make_codec()
+    dw = codec.per_document_writer(st, seg)
     dw.start_doc(0)
     dw.add_field("text", field, "Testing one two three", 4)
     dw.finish_doc()
     dw.close()
     seg.doccount = 1
 
-    lr = codec.lengths_reader(seg)
-    assert_equal(lr.get(0, "text"), 4)
+    lr = codec.lengths_reader(st, seg)
+    assert_equal(lr.doc_field_length(0, "text"), 4)
 
-    sr = codec.stored_fields_reader(seg)
+    sr = codec.stored_fields_reader(st, seg)
     assert_equal(sr[0], {"text": "Testing one two three"})
 
 def test_docwriter_two():
     field = fields.TEXT(stored=True)
-    codec, seg = _make_codec()
-    dw = codec.per_document_writer(seg)
+    st, codec, seg = _make_codec()
+    dw = codec.per_document_writer(st, seg)
     dw.start_doc(0)
     dw.add_field("title", field, ("a", "b"), 2)
     dw.add_field("text", field, "Testing one two three", 4)
     dw.close()
     seg.doccount = 2
 
-    lr = codec.lengths_reader(seg)
-    assert_equal(lr.get(0, "title"), 2)
-    assert_equal(lr.get(0, "text"), 4)
-    assert_equal(lr.get(1, "title"), 3)
-    assert_equal(lr.get(1, "text"), 1)
+    lr = codec.lengths_reader(st, seg)
+    assert_equal(lr.doc_field_length(0, "title"), 2)
+    assert_equal(lr.doc_field_length(0, "text"), 4)
+    assert_equal(lr.doc_field_length(1, "title"), 3)
+    assert_equal(lr.doc_field_length(1, "text"), 1)
 
-    sr = codec.stored_fields_reader(seg)
+    sr = codec.stored_fields_reader(st, seg)
     assert_equal(sr[0], {"title": ("a", "b"), "text": "Testing one two three"})
     assert_equal(sr[1], {"title": "The second document", "text": 500})
 
 def test_vector():
     field = fields.TEXT(vector=True)
-    codec, seg = _make_codec()
-    dw = codec.per_document_writer(seg)
+    st, codec, seg = _make_codec()
+    dw = codec.per_document_writer(st, seg)
     dw.start_doc(0)
     dw.add_field("title", field, None, 1)
     dw.add_vector_items("title", field, [(u("alfa"), 1, 1.0, "t1"),
     dw.close()
     seg.doccount = 1
 
-    sf = codec.stored_fields_reader(seg)
+    sf = codec.stored_fields_reader(st, seg)
     assert_equal(sf[0], {})
 
-    vr = codec.vector_reader(seg)
+    vr = codec.vector_reader(st, seg)
     m = vr.matcher(0, "title", field.vector)
     assert m.is_active()
     ps = []
 
 def test_vector_values():
     field = fields.TEXT(vector=formats.Frequency())
-    codec, seg = _make_codec()
+    st, codec, seg = _make_codec()
     content = u("alfa bravo charlie alfa")
 
-    dw = codec.per_document_writer(seg)
+    dw = codec.per_document_writer(st, seg)
     dw.start_doc(0)
     vals = sorted(field.vector.word_values(content, field.analyzer))
     dw.add_vector_items("f1", field, vals)
     dw.finish_doc()
     dw.close()
 
-    vr = codec.vector_reader(seg)
+    vr = codec.vector_reader(st, seg)
     m = vr.matcher(0, "f1", field.vector)
     assert_equal(list(m.items_as("frequency")), [("alfa", 2), ("bravo", 1),
                                                  ("charlie", 1)])
 
 def test_no_lengths():
     f1 = fields.ID()
-    codec, seg = _make_codec()
-    dw = codec.per_document_writer(seg)
+    st, codec, seg = _make_codec()
+    dw = codec.per_document_writer(st, seg)
     dw.start_doc(0)
     dw.add_field("name", f1, None, None)
     dw.finish_doc()
     dw.close()
     seg.doccount = 3
 
-    lr = codec.lengths_reader(seg)
-    assert_equal(lr.get(0, "name"), 0)
-    assert_equal(lr.get(1, "name"), 0)
-    assert_equal(lr.get(2, "name"), 0)
+    lr = codec.lengths_reader(st, seg)
+    assert_equal(lr.doc_field_length(0, "name"), 0)
+    assert_equal(lr.doc_field_length(1, "name"), 0)
+    assert_equal(lr.doc_field_length(2, "name"), 0)
 
 def test_store_zero():
     f1 = fields.ID(stored=True)
-    codec, seg = _make_codec()
-    dw = codec.per_document_writer(seg)
+    st, codec, seg = _make_codec()
+    dw = codec.per_document_writer(st, seg)
     dw.start_doc(0)
     dw.add_field("name", f1, 0, None)
     dw.finish_doc()
     dw.close()
     seg.doccount = 1
 
-    sr = codec.stored_fields_reader(seg)
+    sr = codec.stored_fields_reader(st, seg)
     assert_equal(sr[0], {"name": 0})
 
 def test_fieldwriter_single_term():
     field = fields.TEXT()
-    codec, seg = _make_codec()
+    st, codec, seg = _make_codec()
 
-    fw = codec.field_writer(seg)
+    fw = codec.field_writer(st, seg)
     fw.start_field("text", field)
     fw.start_term(u("alfa"))
     fw.add(0, 1.5, b("test"), 1)
     fw.finish_field()
     fw.close()
 
-    tr = codec.terms_reader(seg)
+    tr = codec.terms_reader(st, seg)
     assert ("text", "alfa") in tr
     ti = tr.terminfo("text", "alfa")
     assert_equal(ti.weight(), 1.5)
 
 def test_fieldwriter_two_terms():
     field = fields.TEXT()
-    codec, seg = _make_codec()
+    st, codec, seg = _make_codec()
 
-    fw = codec.field_writer(seg)
+    fw = codec.field_writer(st, seg)
     fw.start_field("text", field)
     fw.start_term(u("alfa"))
     fw.add(0, 2.0, b("test1"), 2)
     fw.finish_field()
     fw.close()
 
-    tr = codec.terms_reader(seg)
+    tr = codec.terms_reader(st, seg)
     assert ("text", "alfa") in tr
     ti = tr.terminfo("text", "alfa")
     assert_equal(ti.weight(), 3.0)
 
 def test_fieldwriter_multiblock():
     field = fields.TEXT()
-    codec, seg = _make_codec(blocklimit=2)
+    st, codec, seg = _make_codec(blocklimit=2)
 
-    fw = codec.field_writer(seg)
+    fw = codec.field_writer(st, seg)
     fw.start_field("text", field)
     fw.start_term(u("alfa"))
     fw.add(0, 2.0, b("test1"), 2)
     fw.finish_field()
     fw.close()
 
-    tr = codec.terms_reader(seg)
+    tr = codec.terms_reader(st, seg)
     ti = tr.terminfo("text", "alfa")
     assert_equal(ti.weight(), 15.0)
     assert_equal(ti.doc_frequency(), 5)
 
 def test_term_values():
     field = fields.TEXT(phrase=False)
-    codec, seg = _make_codec()
+    st, codec, seg = _make_codec()
     content = u("alfa bravo charlie alfa")
 
-    fw = codec.field_writer(seg)
+    fw = codec.field_writer(st, seg)
     fw.start_field("f1", field)
     for text, freq, weight, val in sorted(field.index(content)):
         fw.start_term(text)
     fw.finish_field()
     fw.close()
 
-    tr = codec.terms_reader(seg)
+    tr = codec.terms_reader(st, seg)
     ps = [(text, ti.weight(), ti.doc_frequency()) for text, ti in tr.items()]
     assert_equal(ps, [(("f1", "alfa"), 2.0, 1), (("f1", "bravo"), 1.0, 1),
                       (("f1", "charlie"), 1.0, 1)])
     _random_docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412,
                        900, 905, 1024, 1800, 2048, 15000]
     with TempStorage("skip") as st:
-        codec = standard.StdCodec(st)
+        codec = standard.StdCodec()
         seg = Segment("")
         field = fields.TEXT()
 
-        fw = codec.field_writer(seg)
+        fw = codec.field_writer(st, seg)
         fw.start_field("f1", field)
         fw.start_term(u("test"))
         for n in _random_docnums:
         fw.finish_field()
         fw.close()
 
-        tr = codec.terms_reader(seg)
+        tr = codec.terms_reader(st, seg)
         m = tr.matcher("f1", "test", field.format)
         assert_equal(m.id(), 1)
         m.skip_to(220)
 
 def test_spelled_field():
     field = fields.TEXT(spelling=True)
-    codec, seg = _make_codec()
+    st, codec, seg = _make_codec()
 
-    fw = codec.field_writer(seg)
+    fw = codec.field_writer(st, seg)
     fw.start_field("text", field)
     fw.start_term(u("special"))
     fw.add(0, 1.0, b("test1"), 1)
     fw.finish_field()
     fw.close()
 
-    gr = codec.graph_reader(seg)
+    gr = codec.graph_reader(st, seg)
     assert gr.has_root("text")
     cur = gr.cursor("text")
     assert_equal(list(cur.flatten_strings()), ["special", "specific"])
     from whoosh.analysis import StemmingAnalyzer
 
     field = fields.TEXT(analyzer=StemmingAnalyzer(), spelling=True)
-    codec, seg = _make_codec()
+    st, codec, seg = _make_codec()
 
-    fw = codec.field_writer(seg)
+    fw = codec.field_writer(st, seg)
     fw.start_field("text", field)
     fw.start_term(u("special"))
     fw.add(0, 1.0, b("test1"), 1)
     fw.finish_field()
     fw.close()
 
-    tr = codec.terms_reader(seg)
+    tr = codec.terms_reader(st, seg)
     assert_equal(list(tr.keys()), [("text", "special"), ("text", "specific")])
 
-    cur = codec.graph_reader(seg).cursor("text")
+    cur = codec.graph_reader(st, seg).cursor("text")
     assert_equal(list(cur.flatten_strings()), ["specials", "specifically"])
 
 

tests/test_postings.py

 
 def _roundtrip(content, format_, astype, ana=None):
     with TempStorage("roundtrip") as st:
-        codec = StdCodec(st)
+        codec = StdCodec()
         seg = Segment("")
         ana = ana or analysis.StandardAnalyzer()
         field = fields.FieldType(format=format_, analyzer=ana)
 
-        fw = codec.field_writer(seg)
+        fw = codec.field_writer(st, seg)
         fw.start_field("f1", field)
         for text, _, weight, valuestring in sorted(field.index(content)):
             fw.start_term(text)
         fw.finish_field()
         fw.close()
 
-        tr = codec.terms_reader(seg)
+        tr = codec.terms_reader(st, seg)
         ps = []
         for fieldname, text in tr.keys():
             m = tr.matcher(fieldname, text, format_)

tests/test_writing.py

         with ix.searcher() as s:
             fs = s.document(b=u("india"))
             assert_equal(fs, {"b": "india", "cat": "juliet"})
+
+def test_add_reader():
+    schema = fields.Schema(i=fields.ID(stored=True, unique=True),
+                           a=fields.TEXT(stored=True, spelling=True),
+                           b=fields.TEXT(vector=True))
+    with TempIndex(schema, "addreader") as ix:
+        with ix.writer() as w:
+            w.add_document(i=u("0"), a=u("alfa bravo charlie delta"),
+                           b=u("able baker coxwell dog"))
+            w.add_document(i=u("1"), a=u("bravo charlie delta echo"),
+                           b=u("elf fabio gong hiker"))
+            w.add_document(i=u("2"), a=u("charlie delta echo foxtrot"),
+                           b=u("india joker king loopy"))
+            w.add_document(i=u("3"), a=u("delta echo foxtrot golf"),
+                           b=u("mister noogie oompah pancake"))
+
+        with ix.writer() as w:
+            w.delete_by_term("i", "1")
+            w.delete_by_term("i", "3")
+
+        with ix.writer() as w:
+            w.add_document(i=u("4"), a=u("hotel india juliet kilo"),
+                           b=u("quick rhubarb soggy trap"))
+            w.add_document(i=u("5"), a=u("india juliet kilo lima"),
+                           b=u("umber violet weird xray"))
+
+        with ix.reader() as r:
+            assert_equal(r.doc_count_all(), 4)
+
+            sfs = list(r.all_stored_fields())
+            assert_equal(sfs, [{"i": u("4"), "a": u("hotel india juliet kilo")},
+                               {"i": u("5"), "a": u("india juliet kilo lima")},
+                               {"i": u("0"), "a": u("alfa bravo charlie delta")},
+                               {"i": u("2"), "a": u("charlie delta echo foxtrot")},
+                               ])
+
+            assert_equal(list(r.lexicon("a")),
+                         ["alfa", "bravo", "charlie", "delta", "echo",
+                          "foxtrot", "hotel", "india", "juliet", "kilo", "lima"])
+
+            vs = []
+            for docnum in r.all_doc_ids():
+                v = r.vector(docnum, "b")
+                vs.append(list(v.all_ids()))
+            assert_equal(vs, [["quick", "rhubarb", "soggy", "trap"],
+                              ["umber", "violet", "weird", "xray"],
+                              ["able", "baker", "coxwell", "dog"],
+                              ["india", "joker", "king", "loopy"]
+                              ])
+
+            gr = r.word_graph("a")
+            assert_equal(list(gr.flatten_strings()),
+                         ["alfa", "bravo", "charlie", "delta", "echo",
+                          "foxtrot", "hotel", "india", "juliet", "kilo",
+                          "lima", ])
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+