Commits

Matt Chaput committed 6f79230

Big refactoring to make filedb use a pluggable codec for writing to and reading from disk.
This is still unstable: multiprocessing support isn't finished yet, and it may not work with old indices.
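
As a rough sketch of how the new pluggable interface is meant to be used (illustrative only,
not part of this changeset): a codec subclasses whoosh.codec.base.Codec and hands back its own
writer and reader objects from the factory methods. The names MyCodec and MyFieldWriter below
are hypothetical.

    # Illustrative sketch, not part of this commit. Assumes the abstract
    # classes added in src/whoosh/codec/base.py below; MyCodec and
    # MyFieldWriter are hypothetical names.
    from whoosh.codec import base

    class MyFieldWriter(base.FieldWriter):
        def start_field(self, fieldname, fieldobj):
            pass  # set up per-field state

        def start_term(self, text):
            pass  # begin a new term's posting list

        def add(self, docnum, weight, valuestring, length):
            pass  # write one posting

        def finish_term(self):
            pass  # flush the finished term

    class MyCodec(base.Codec):
        def field_writer(self, segment):
            # Return the inverted-index writer for this segment
            return MyFieldWriter()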

  • Parent commits 5e885af

Files changed (39)

File setup.py

 from whoosh import __version__, versionstring
 
 setup(
-    name = "Whoosh",
-    version = versionstring(),
-    package_dir = {'': 'src'},
-    packages = ["whoosh", "whoosh.filedb", "whoosh.lang", "whoosh.qparser", "whoosh.support"],
+    name="Whoosh",
+    version=versionstring(),
+    package_dir={'': 'src'},
+    packages=["whoosh", "whoosh.codec", "whoosh.filedb", "whoosh.lang",
+              "whoosh.qparser", "whoosh.support"],
 
-    author = "Matt Chaput",
-    author_email = "matt@whoosh.ca",
+    author="Matt Chaput",
+    author_email="matt@whoosh.ca",
 
-    description = "Fast, pure-Python full text indexing, search, and spell checking library.",
-    long_description = open("README.txt").read(),
+    description="Fast, pure-Python full text indexing, search, and spell checking library.",
+    long_description=open("README.txt").read(),
 
-    license = "Two-clause BSD license",
-    keywords = "index search text spell",
-    url = "http://bitbucket.org/mchaput/whoosh",
+    license="Two-clause BSD license",
+    keywords="index search text spell",
+    url="http://bitbucket.org/mchaput/whoosh",
 
-    zip_safe = True,
-    test_suite = "nose.collector",
+    zip_safe=True,
+    test_suite="nose.collector",
 
-    classifiers = [
+    classifiers=[
     "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Developers",
     "License :: OSI Approved :: BSD License",

File src/whoosh/analysis.py

             newec = buf[-1][3]  # end char of last item in buffer
             parts.insert(insertat, (newtext, newpos, newsc, newec))
 
-        for item in parts[:]:
+        for item in list(parts):
             # item = (text, pos, startchar, endchar)
             text = item[0]
             pos = item[1]
         delim = self.delim
         attr = self.attr
         default = self.default
-        typ = self.type
+        type_ = self.type
 
         for t in tokens:
             text = t.text
             pos = text.find(delim)
             if pos > -1:
-                setattr(t, attr, typ(text[pos + 1:]))
+                setattr(t, attr, type_(text[pos + 1:]))
+                if t.chars:
+                    t.endchar -= len(t.text) - pos
                 t.text = text[:pos]
             else:
                 setattr(t, attr, default)

File src/whoosh/codec/__init__.py

Empty file added.

File src/whoosh/codec/base.py

+# Copyright 2011 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+
+from array import array
+from struct import Struct, pack
+
+from whoosh.compat import loads, dumps, b, bytes_type, string_type, xrange
+from whoosh.matching import Matcher, ReadTooFar
+from whoosh.reading import TermInfo
+from whoosh.spans import Span
+from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_long, unpack_long,
+                           IS_LITTLE)
+from whoosh.util import byte_to_length, length_to_byte
+
+
+try:
+    from zlib import compress, decompress
+    can_compress = True
+except ImportError:
+    can_compress = False
+
+
+# Base classes
+
+class Codec(object):
+    def __init__(self, storage):
+        self.storage = storage
+
+    # Per document value writer
+    def per_document_writer(self, segment):
+        raise NotImplementedError
+
+    # Inverted index writer
+    def field_writer(self, segment):
+        raise NotImplementedError
+
+    # Readers
+
+    def terms_reader(self, segment):
+        raise NotImplementedError
+
+    def lengths_reader(self, segment):
+        raise NotImplementedError
+
+    def vector_reader(self, segment):
+        raise NotImplementedError
+
+    def stored_fields_reader(self, segment):
+        raise NotImplementedError
+
+    def word_graph_reader(self, segment):
+        raise NotImplementedError
+
+    # Generations
+
+    def commit_toc(self, indexname, schema, segments, generation):
+        raise NotImplementedError
+
+
+# Writer classes
+
+class PerDocumentWriter(object):
+    def start_doc(self, docnum):
+        raise NotImplementedError
+
+    def add_field(self, fieldname, fieldobj, value, length):
+        raise NotImplementedError
+
+    def add_vector_items(self, fieldname, fieldobj, items):
+        raise NotImplementedError
+
+    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
+        raise NotImplementedError
+
+    def finish_doc(self):
+        pass
+
+    def lengths_reader(self):
+        raise NotImplementedError
+
+
+class FieldWriter(object):
+    def add_iter(self, schema, lengths, items):
+        # items = (fieldname, text, docnum, weight, valuestring) ...
+        lastfn = None
+        lasttext = None
+        getlen = lengths.get
+        add = self.add
+        for fieldname, text, docnum, weight, valuestring in items:
+            # Items where docnum is None indicate words that should be added
+            # to the spelling graph
+            if docnum is None and (fieldname != lastfn or text != lasttext):
+                self.add_spell_word(fieldname, text)
+                lastfn = fieldname
+                lasttext = text
+                continue
+
+            if fieldname < lastfn or (fieldname == lastfn and text < lasttext):
+                raise Exception("Postings are out of order: %r:%s .. %r:%s" %
+                                (lastfn, lasttext, fieldname, text))
+            if fieldname != lastfn or text != lasttext:
+                if lasttext is not None:
+                    self.finish_term()
+                if fieldname != lastfn:
+                    if lastfn is not None:
+                        self.finish_field()
+                    self.start_field(fieldname, schema[fieldname])
+                    lastfn = fieldname
+                self.start_term(text)
+                lasttext = text
+            length = getlen(docnum, fieldname)
+            add(docnum, weight, valuestring, length)
+        if lasttext is not None:
+            self.finish_term()
+            self.finish_field()
+
+    def start_field(self, fieldname, fieldobj):
+        raise NotImplementedError
+
+    def start_term(self, text):
+        raise NotImplementedError
+
+    def add(self, docnum, weight, valuestring, length):
+        raise NotImplementedError
+
+    def add_spell_word(self, fieldname, text):
+        raise NotImplementedError
+
+    def finish_term(self):
+        raise NotImplementedError
+
+    def finish_field(self):
+        pass
+
+    def close(self):
+        pass
+
+
+# Reader classes
+
+class TermsReader(object):
+    def __contains__(self, term):
+        raise NotImplementedError
+
+    def terminfo(self, fieldname, text):
+        raise NotImplementedError
+
+    def word_graph(self, fieldname, text):
+        raise NotImplementedError
+
+    def matcher(self, fieldname, text, fmt):
+        raise NotImplementedError
+
+    def close(self):
+        pass
+
+
+class VectorReader(object):
+    def __contains__(self, key):
+        raise NotImplementedError
+
+    def matcher(self, docnum, fieldname, format_):
+        raise NotImplementedError
+
+
+class LengthsReader(object):
+    def get(self, docnum, fieldname):
+        raise NotImplementedError
+
+    def field_length(self, fieldname):
+        raise NotImplementedError
+
+    def min_field_length(self, fieldname):
+        raise NotImplementedError
+
+    def max_field_length(self, fieldname):
+        raise NotImplementedError
+
+    def close(self):
+        pass
+
+
+class StoredFieldsReader(object):
+    def __iter__(self):
+        raise NotImplementedError
+
+    def __getitem__(self, docnum):
+        raise NotImplementedError
+
+    def cell(self, docnum, fieldname):
+        fielddict = self.get(docnum)
+        return fielddict.get(fieldname)
+
+    def column(self, fieldname):
+        for fielddict in self:
+            yield fielddict.get(fieldname)
+
+    def close(self):
+        pass
+
+
+# File posting matcher middleware
+
+class FilePostingMatcher(Matcher):
+    # Subclasses need to set
+    #   self._term -- (fieldname, text) or None
+    #   self.scorer -- a Scorer object or None
+    #   self.format -- Format object for the posting values
+
+    def __repr__(self):
+        return "%s(%r, %r, %s)" % (self.__class__.__name__, str(self.postfile),
+                                   self.term(), self.is_active())
+
+    def term(self):
+        return self._term
+
+    def items_as(self, astype):
+        decoder = self.format.decoder(astype)
+        for id, value in self.all_items():
+            yield (id, decoder(value))
+
+    def supports(self, astype):
+        return self.format.supports(astype)
+
+    def value_as(self, astype):
+        decoder = self.format.decoder(astype)
+        return decoder(self.value())
+
+    def spans(self):
+        if self.supports("characters"):
+            return [Span(pos, startchar=startchar, endchar=endchar)
+                    for pos, startchar, endchar in self.value_as("characters")]
+        elif self.supports("positions"):
+            return [Span(pos) for pos in self.value_as("positions")]
+        else:
+            raise Exception("Field does not support positions (%r)"
+                            % self._term)
+
+    def supports_block_quality(self):
+        return self.scorer and self.scorer.supports_block_quality()
+
+    def max_quality(self):
+        return self.scorer.max_quality
+
+    def block_quality(self):
+        return self.scorer.block_quality(self)
+
+
+class BlockPostingMatcher(FilePostingMatcher):
+    # Subclasses need to set
+    #   self.block -- BlockBase object for the current block
+    #   self.i -- Numerical index to the current place in the block
+    # And implement
+    #   _read_block()
+    #   _next_block()
+    #   _skip_to_block()
+
+    def id(self):
+        return self.block.ids[self.i]
+
+    def weight(self):
+        weights = self.block.weights
+        if weights is None:
+            weights = self.block.read_weights()
+        return weights[self.i]
+
+    def value(self):
+        values = self.block.values
+        if values is None:
+            values = self.block.read_values()
+        return values[self.i]
+
+    def all_ids(self):
+        nextoffset = self.baseoffset
+        for _ in xrange(self.blockcount):
+            block = self._read_block(nextoffset)
+            nextoffset = block.nextoffset
+            ids = block.read_ids()
+            for id in ids:
+                yield id
+
+    def next(self):
+        if self.i == self.block.count - 1:
+            self._next_block()
+            return True
+        else:
+            self.i += 1
+            return False
+
+    def skip_to(self, id):
+        if not self.is_active():
+            raise ReadTooFar
+
+        i = self.i
+        # If we're already in the block with the target ID, do nothing
+        if id <= self.block.ids[i]:
+            return
+
+        # Skip to the block that would contain the target ID
+        if id > self.block.maxid:
+            self._skip_to_block(lambda: id > self.block.maxid)
+        if not self.is_active():
+            return
+
+        # Iterate through the IDs in the block until we find or pass the
+        # target
+        ids = self.block.ids
+        i = self.i
+        while ids[i] < id:
+            i += 1
+            if i == len(ids):
+                self._active = False
+                return
+        self.i = i
+
+    def skip_to_quality(self, minquality):
+        bq = self.block_quality
+        if bq() > minquality:
+            return 0
+        return self._skip_to_block(lambda: bq() <= minquality)
+
+    def block_min_length(self):
+        return self.block.min_length()
+
+    def block_max_length(self):
+        return self.block.max_length()
+
+    def block_max_weight(self):
+        return self.block.max_weight()
+
+    def block_max_wol(self):
+        return self.block.max_wol()
+
+
+# File TermInfo
+
+NO_ID = 0xffffffff
+
+
+class FileTermInfo(TermInfo):
+    # Freq, Doc freq, min len, max length, max weight, unused, min ID, max ID
+    struct = Struct("!fIBBffII")
+
+    def __init__(self, *args, **kwargs):
+        self.postings = None
+        if "postings" in kwargs:
+            self.postings = kwargs["postings"]
+            del kwargs["postings"]
+        TermInfo.__init__(self, *args, **kwargs)
+
+    # filedb specific methods
+
+    def add_block(self, block):
+        self._weight += sum(block.weights)
+        self._df += len(block)
+
+        ml = block.min_length()
+        if self._minlength is None:
+            self._minlength = ml
+        else:
+            self._minlength = min(self._minlength, ml)
+
+        self._maxlength = max(self._maxlength, block.max_length())
+        self._maxweight = max(self._maxweight, block.max_weight())
+        if self._minid is None:
+            self._minid = block.ids[0]
+        self._maxid = block.ids[-1]
+
+    def to_string(self):
+        # Encode the lengths as 0-255 values
+        ml = 0 if self._minlength is None else length_to_byte(self._minlength)
+        xl = length_to_byte(self._maxlength)
+        # Convert None values to the out-of-band NO_ID constant so they can be
+        # stored as unsigned ints
+        mid = NO_ID if self._minid is None else self._minid
+        xid = NO_ID if self._maxid is None else self._maxid
+
+        # Pack the term info into bytes
+        st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight,
+                              0, mid, xid)
+
+        if isinstance(self.postings, tuple):
+            # Postings are inlined - dump them using the pickle protocol
+            isinlined = 1
+            st += dumps(self.postings, -1)[2:-1]
+        else:
+            # Append postings pointer as long to end of term info bytes
+            isinlined = 0
+            # It's possible for a term info to not have a pointer to postings
+            # on disk, in which case postings will be None. Convert a None
+            # value to -1 so it can be stored as a long.
+            p = -1 if self.postings is None else self.postings
+            st += pack_long(p)
+
+        # Prepend byte indicating whether the postings are inlined to the term
+        # info bytes
+        return pack("B", isinlined) + st
+
+    @classmethod
+    def from_string(cls, s):
+        assert isinstance(s, bytes_type)
+
+        if isinstance(s, string_type):
+            hbyte = ord(s[0])  # Python 2.x - str
+        else:
+            hbyte = s[0]  # Python 3 - bytes
+
+        if hbyte < 2:
+            st = cls.struct
+            # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID
+            w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
+            mid = None if mid == NO_ID else mid
+            xid = None if xid == NO_ID else xid
+            # Postings
+            pstr = s[st.size + 1:]
+            if hbyte == 0:
+                p = unpack_long(pstr)[0]
+            else:
+                p = loads(pstr + b("."))
+        else:
+            # Old format was encoded as a variable length pickled tuple
+            v = loads(s + b("."))
+            if len(v) == 1:
+                w = df = 1
+                p = v[0]
+            elif len(v) == 2:
+                w = df = v[1]
+                p = v[0]
+            else:
+                w, p, df = v
+            # Fake values for stats which weren't stored before
+            ml = 1
+            xl = 255
+            xw = 999999999
+            mid = -1
+            xid = -1
+
+        ml = byte_to_length(ml)
+        xl = byte_to_length(xl)
+        obj = cls(w, df, ml, xl, xw, mid, xid)
+        obj.postings = p
+        return obj
+
+    @classmethod
+    def read_weight(cls, dbfile, datapos):
+        return dbfile.get_float(datapos + 1)
+
+    @classmethod
+    def read_doc_freq(cls, dbfile, datapos):
+        return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE)
+
+    @classmethod
+    def read_min_and_max_length(cls, dbfile, datapos):
+        lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
+        ml = byte_to_length(dbfile.get_byte(lenpos))
+        xl = byte_to_length(dbfile.get_byte(lenpos + 1))
+        return ml, xl
+
+    @classmethod
+    def read_max_weight(cls, dbfile, datapos):
+        weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2
+        return dbfile.get_float(weightspos)
+
+
+# Posting block format
+
+class BlockBase(object):
+    def __init__(self, postingsize, stringids=False):
+        self.postingsize = postingsize
+        self.stringids = stringids
+        self.ids = [] if stringids else array("I")
+        self.weights = array("f")
+        self.values = None
+
+        self.minlength = None
+        self.maxlength = 0
+        self.maxweight = 0
+
+    def __len__(self):
+        return len(self.ids)
+
+    def __nonzero__(self):
+        return bool(self.ids)
+
+    def min_id(self):
+        if self.ids:
+            return self.ids[0]
+        else:
+            raise IndexError
+
+    def max_id(self):
+        if self.ids:
+            return self.ids[-1]
+        else:
+            raise IndexError
+
+    def min_length(self):
+        return self.minlength
+
+    def max_length(self):
+        return self.maxlength
+
+    def max_weight(self):
+        return self.maxweight
+
+    def add(self, id_, weight, valuestring, length=None):
+        self.ids.append(id_)
+        self.weights.append(weight)
+        if weight > self.maxweight:
+            self.maxweight = weight
+        if valuestring:
+            if self.values is None:
+                self.values = []
+            self.values.append(valuestring)
+        if length:
+            if self.minlength is None or length < self.minlength:
+                self.minlength = length
+            if length > self.maxlength:
+                self.maxlength = length
+
+    def to_file(self, postfile):
+        raise NotImplementedError
+
+    @classmethod
+    def from_file(cls, postfile):
+        raise NotImplementedError
+
+
+# Utility functions
+
+def minimize_ids(arry, stringids, compression=0):
+    amax = arry[-1]
+
+    if stringids:
+        typecode = ''
+        string = dumps(arry)
+    else:
+        typecode = code = arry.typecode
+        if amax <= 255:
+            typecode = "B"
+        elif amax <= 65535:
+            typecode = "H"
+        if typecode != code:
+            arry = array(typecode, iter(arry))
+        if not IS_LITTLE:
+            arry.byteswap()
+        string = arry.tostring()
+    if compression:
+        string = compress(string, compression)
+    return (typecode, string)
+
+def deminimize_ids(typecode, count, string, compression=0):
+    if compression:
+        string = decompress(string)
+    if typecode == '':
+        return loads(string)
+    else:
+        arry = array(typecode)
+        arry.fromstring(string)
+        if not IS_LITTLE:
+            arry.byteswap()
+        return arry
+
+def minimize_weights(weights, compression=0):
+    if all(w == 1.0 for w in weights):
+        string = ""
+    else:
+        if not IS_LITTLE:
+            weights.byteswap()
+        string = weights.tostring()
+    if string and compression:
+        string = compress(string, compression)
+    return string
+
+def deminimize_weights(count, string, compression=0):
+    if not string:
+        return array("f", (1.0 for _ in xrange(count)))
+    if compression:
+        string = decompress(string)
+    arry = array("f")
+    arry.fromstring(string)
+    if not IS_LITTLE:
+        arry.byteswap()
+    return arry
+
+def minimize_values(postingsize, values, compression=0):
+    if postingsize < 0:
+        string = dumps(values, -1)[2:]
+    elif postingsize == 0:
+        string = b('')
+    else:
+        string = b('').join(values)
+    if string and compression:
+        string = compress(string, compression)
+    return string
+
+def deminimize_values(postingsize, count, string, compression=0):
+    if compression:
+        string = decompress(string)
+
+    if postingsize < 0:
+        return loads(string)
+    elif postingsize == 0:
+        return [None] * count
+    else:
+        return [string[i:i + postingsize] for i
+                in xrange(0, len(string), postingsize)]
+
+
+
+
+
+
+
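
As an aside on the FieldWriter.add_iter contract defined above (illustrative, not part of the
diff): postings must arrive sorted by field name and then term text, a docnum of None marks a
spelling-only word, and "lengths" can be any object with a get(docnum, fieldname) method. A toy
subclass that prints the calls add_iter drives might look like this; all names below are
hypothetical and the example is written for Python 2.x.

    # Illustrative sketch, not part of this commit.
    from whoosh.codec import base

    class PrintingFieldWriter(base.FieldWriter):
        def start_field(self, fieldname, fieldobj):
            print("start_field %s" % fieldname)

        def start_term(self, text):
            print("  start_term %r" % text)

        def add(self, docnum, weight, valuestring, length):
            print("    posting doc=%s weight=%s len=%s" % (docnum, weight, length))

        def finish_term(self):
            print("  finish_term")

    class FakeLengths(object):
        def get(self, docnum, fieldname):
            return 1

    # (fieldname, text, docnum, weight, valuestring) tuples, pre-sorted
    items = [
        ("title", u"search", 0, 1.0, None),
        ("title", u"search", 4, 2.0, None),
        ("title", u"whoosh", 2, 1.0, None),
    ]
    # The schema argument only needs item access here, so a dict stands in
    PrintingFieldWriter().add_iter({"title": None}, FakeLengths(), items)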

File src/whoosh/codec/legacy.py

+# Copyright 2011 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+
+from array import array
+from struct import Struct
+
+from whoosh.compat import load, xrange
+from whoosh.codec import base
+from whoosh.codec.base import (deminimize_ids, deminimize_weights,
+                               deminimize_values)
+from whoosh.system import _INT_SIZE, _FLOAT_SIZE, IS_LITTLE
+from whoosh.util import byte_to_length, utf8decode
+
+
+try:
+    from zlib import decompress
+except ImportError:
+    pass
+
+
+# Old field lengths format
+
+def load_old_lengths(obj, dbfile, doccount):
+    fieldcount = dbfile.read_ushort()  # Number of fields
+    for _ in xrange(fieldcount):
+        fieldname = dbfile.read_string().decode("utf-8")
+        obj.lengths[fieldname] = dbfile.read_array("B", doccount)
+        # Old format didn't store totals, so fake it by adding up the codes
+        obj.totals[fieldname] = sum(byte_to_length(b) for b
+                                    in obj.lengths[fieldname])
+    dbfile.close()
+
+
+# Old block formats
+
+def old_block_type(magic):
+    if magic == "Blk2":
+        return Block2
+    elif magic == "\x0eB\xff\xff":
+        return Block1
+    else:
+        raise Exception("Unknown block header %r" % magic)
+
+
+class Block2(base.BlockBase):
+    _struct = Struct("<iBBcBiiffHBBB")
+
+    @classmethod
+    def from_file(cls, postfile, postingsize, stringids=False):
+        start = postfile.tell()
+        block = cls(postingsize, stringids=stringids)
+        block.postfile = postfile
+        header = cls._struct.unpack(postfile.read(cls._struct.size))
+        block.nextoffset = start + header[0]
+        block.cmp = header[1]
+        block.count = header[2]
+        block.idcode = header[3].decode("Latin1")
+        block.idslen = header[5]
+        block.wtslen = header[6]
+        block.maxweight = header[7]
+        block.maxlength = byte_to_length(header[11])
+        block.minlength = byte_to_length(header[12])
+
+        block.maxid = load(postfile) if stringids else postfile.read_uint()
+        block.dataoffset = postfile.tell()
+        return block
+
+    def read_ids(self):
+        dataoffset = self.dataoffset
+        string = self.postfile.map[dataoffset:dataoffset + self.idslen]
+        self.ids = deminimize_ids(self.idcode, self.count, string, self.cmp)
+        return self.ids
+
+    def read_weights(self):
+        if self.wtslen == 0:
+            return [1.0] * self.count
+        else:
+            offset = self.dataoffset + self.idslen
+            string = self.postfile.map[offset:offset + self.wtslen]
+            return deminimize_weights(self.count, string, self.cmp)
+
+    def read_values(self):
+        postingsize = self.postingsize
+        if postingsize == 0:
+            return [None] * self.count
+        else:
+            offset = self.dataoffset + self.idslen + self.wtslen
+            string = self.postfile.map[offset:self.nextoffset]
+            return deminimize_values(postingsize, self.count, string, self.cmp)
+
+
+class Block1(base.BlockBase):
+    _struct = Struct("!BBHiHHBfffB")
+
+    @classmethod
+    def from_file(cls, postfile, stringids=False):
+        pos = postfile.tell()
+        block = cls(postfile, stringids=stringids)
+        block.postfile = postfile
+        header = cls._struct.unpack(postfile.read(cls._struct.size))
+        block.nextoffset = pos + header[3]
+        block.idslen = header[4]
+        block.wtslen = header[5]
+        block.count = header[6]
+        block.maxweight = header[7]
+        block.minlength = byte_to_length(header[10])
+
+        if stringids:
+            block.maxid = utf8decode(postfile.read_string())[0]
+        else:
+            block.maxid = postfile.read_uint()
+        block.dataoffset = postfile.tell()
+        return block
+
+    def read_ids(self):
+        postfile = self.postfile
+        offset = self.dataoffset
+        postcount = self.count
+        postfile.seek(offset)
+
+        if self.stringids:
+            rs = postfile.read_string
+            ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
+            newoffset = postfile.tell()
+        elif self.idslen:
+            ids = array("I")
+            ids.fromstring(decompress(postfile.read(self.idslen)))
+            if IS_LITTLE:
+                ids.byteswap()
+            newoffset = offset + self.idslen
+        else:
+            ids = postfile.read_array("I", postcount)
+            newoffset = offset + _INT_SIZE * postcount
+
+        self.ids = ids
+        self.weights_offset = newoffset
+        return ids
+
+    def read_weights(self):
+        postfile = self.postfile
+        offset = self.weights_offset
+        postfile.seek(offset)
+        weightslen = self.wtslen
+        postcount = self.count
+
+        if weightslen == 1:
+            weights = None
+            newoffset = offset
+        elif weightslen:
+            weights = array("f")
+            weights.fromstring(decompress(postfile.read(weightslen)))
+            if IS_LITTLE:
+                weights.byteswap()
+            newoffset = offset + weightslen
+        else:
+            weights = postfile.get_array(offset, "f", postcount)
+            newoffset = offset + _FLOAT_SIZE * postcount
+
+        self.weights = weights
+        self.values_offset = newoffset
+        return weights
+
+    def read_values(self):
+        postfile = self.postfile
+        startoffset = self.values_offset
+        endoffset = self.nextoffset
+        postcount = self.count
+
+        postingsize = self.postingsize
+        if postingsize != 0:
+            values_string = postfile.map[startoffset:endoffset]
+
+            if self.wtslen:
+                # Values string is compressed
+                values_string = decompress(values_string)
+
+            if postingsize < 0:
+                # Pull the array of value lengths off the front of the string
+                lengths = array("i")
+                lengths.fromstring(values_string[:_INT_SIZE * postcount])
+                values_string = values_string[_INT_SIZE * postcount:]
+
+            # Chop up the block string into individual valuestrings
+            if postingsize > 0:
+                # Format has a fixed posting size, just chop up the values
+                # equally
+                values = [values_string[i * postingsize: i * postingsize + postingsize]
+                          for i in xrange(postcount)]
+            else:
+                # Format has a variable posting size, use the array of lengths
+                # to chop up the values.
+                pos = 0
+                values = []
+                for length in lengths:
+                    values.append(values_string[pos:pos + length])
+                    pos += length
+        else:
+            # Format does not store values (i.e. Existence), just create fake
+            # values
+            values = (None,) * postcount
+
+        self.values = values
+

File src/whoosh/codec/standard.py

+# Copyright 2011 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+
+from array import array
+from collections import defaultdict
+from struct import Struct
+
+from whoosh.compat import (loads, dumps, xrange, iteritems, itervalues, b,
+                           bytes_type, integer_types)
+from whoosh.codec import base
+from whoosh.codec.base import (minimize_ids, deminimize_ids, minimize_weights,
+                               deminimize_weights, minimize_values,
+                               deminimize_values)
+from whoosh.filedb.fileindex import TOC, clean_files
+from whoosh.filedb.filetables import CodedOrderedWriter, CodedOrderedReader
+from whoosh.matching import ListMatcher
+from whoosh.reading import TermNotFound
+from whoosh.support.dawg import DawgBuilder, DiskNode
+from whoosh.system import (pack_ushort, pack_long, unpack_ushort, unpack_long,
+                           _INT_SIZE, _LONG_SIZE)
+from whoosh.util import byte_to_length, length_to_byte, utf8encode, utf8decode
+
+
+# Standard codec top-level object
+
+class StdCodec(base.Codec):
+    TERMS_EXT = ".trm"  # Term index
+    POSTS_EXT = ".pst"  # Term postings
+    DAWG_EXT = ".dag"  # Spelling graph file
+    LENGTHS_EXT = ".fln"  # Field lengths file
+    VECTOR_EXT = ".vec"  # Vector index
+    VPOSTS_EXT = ".vps"  # Vector postings
+    STORED_EXT = ".sto"  # Stored fields file
+
+    def __init__(self, storage, blocklimit=128, compression=3,
+                 loadlengths=False, inlinelimit=1):
+        self.storage = storage
+        self.blocklimit = blocklimit
+        self.compression = compression
+        self.loadlengths = loadlengths
+        self.inlinelimit = inlinelimit
+
+    # Per-document value writer
+    def per_document_writer(self, segment):
+        return StdPerDocWriter(self.storage, segment,
+                               blocklimit=self.blocklimit,
+                               compression=self.compression)
+
+    # Inverted index writer
+    def field_writer(self, segment):
+        return StdFieldWriter(self.storage, segment,
+                               blocklimit=self.blocklimit,
+                               compression=self.compression,
+                               inlinelimit=self.inlinelimit)
+
+    # Readers
+
+    def terms_reader(self, segment):
+        tifile = segment.open_file(self.storage, self.TERMS_EXT)
+        postfile = segment.open_file(self.storage, self.POSTS_EXT, mapped=False)
+        return StdTermsReader(tifile, postfile)
+
+    def lengths_reader(self, segment):
+        flfile = segment.open_file(self.storage, self.LENGTHS_EXT)
+        doccount = segment.doc_count_all()
+
+        # Check the first byte of the file to see if it's an old format
+        firstbyte = flfile.read(1)
+        flfile.seek(0)
+        if firstbyte != b("~"):
+            from whoosh.codec.legacy import load_old_lengths
+            lengths = load_old_lengths(InMemoryLengths(), flfile, doccount)
+        elif self.loadlengths:
+            lengths = InMemoryLengths.from_file(flfile, doccount)
+        else:
+            lengths = OnDiskLengths(flfile, doccount)
+        return lengths
+
+    def vector_reader(self, segment):
+        vifile = segment.open_file(self.storage, self.VECTOR_EXT)
+        postfile = segment.open_file(self.storage, self.VPOSTS_EXT, mapped=False)
+        return StdVectorReader(vifile, postfile)
+
+    def stored_fields_reader(self, segment):
+        sffile = segment.open_file(self.storage, self.STORED_EXT, mapped=False)
+        return StoredFieldReader(sffile)
+
+    def word_graph(self, segment):
+        dawgfile = segment.open_file(self.storage, self.DAWG_EXT, mapped=False)
+        return DiskNode.load(dawgfile, expand=False)
+
+    # Generations
+
+    def commit_toc(self, indexname, schema, segments, generation):
+        toc = TOC(schema, segments, generation)
+        toc.write(self.storage, indexname)
+        # Delete leftover files
+        clean_files(self.storage, indexname, generation, segments)
+
+
+# Per-document value writer
+
+class StdPerDocWriter(base.PerDocumentWriter):
+    def __init__(self, storage, segment, blocklimit=128, compression=3):
+        self.storage = storage
+        self.segment = segment
+        self.blocklimit = blocklimit
+        self.compression = compression
+        self.doccount = 0
+
+        sffile = segment.create_file(storage, StdCodec.STORED_EXT)
+        self.stored = StoredFieldWriter(sffile)
+        self.storedfields = None
+
+        self.flfile = segment.create_file(storage, StdCodec.LENGTHS_EXT)
+        self.lengths = InMemoryLengths()
+
+        # TODO: defer creating the vector files until a vector is actually
+        # added; for now they are created up front
+        vifile = self.segment.create_file(self.storage, StdCodec.VECTOR_EXT)
+        self.vindex = VectorWriter(vifile)
+        self.vpostfile = self.segment.create_file(self.storage, StdCodec.VPOSTS_EXT)
+
+    def start_doc(self, docnum):
+        self.docnum = docnum
+        self.storedfields = {}
+        self.doccount = max(self.doccount, docnum + 1)
+
+    def add_field(self, fieldname, fieldobj, value, length):
+        if length:
+            self.lengths.add(self.docnum, fieldname, length)
+        if value is not None:
+            self.storedfields[fieldname] = value
+
+    def _new_block(self, vformat):
+        postingsize = vformat.posting_size
+        return StdBlock(postingsize, stringids=True)
+
+    def add_vector_items(self, fieldname, fieldobj, items):
+        # items = (text, freq, weight, valuestring) ...
+        postfile = self.vpostfile
+        blocklimit = self.blocklimit
+        block = self._new_block(fieldobj.vector)
+
+        startoffset = postfile.tell()
+        postfile.write(block.magic)  # Magic number
+        blockcount = 0
+        postfile.write_uint(0)  # Placeholder for block count
+
+        countdown = blocklimit
+        for text, _, weight, valuestring in items:
+            block.add(text, weight, valuestring)
+            countdown -= 1
+            if countdown == 0:
+                block.to_file(postfile, compression=self.compression)
+                block = self._new_block(fieldobj.vector)
+                blockcount += 1
+                countdown = blocklimit
+        # If there are leftover items in the current block, write them out
+        if block:
+            block.to_file(postfile, compression=self.compression)
+            blockcount += 1
+
+        # Seek back to the start of this list of posting blocks and write the
+        # number of blocks
+        postfile.flush()
+        here = postfile.tell()
+        postfile.seek(startoffset + 4)
+        postfile.write_uint(blockcount)
+        postfile.seek(here)
+
+        # Add to the index
+        self.vindex.add((self.docnum, fieldname), startoffset)
+
+    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
+        def readitems():
+            while vmatcher.is_active():
+                text = vmatcher.id()
+                weight = vmatcher.weight()
+                valuestring = vmatcher.value()
+                yield (text, None, weight, valuestring)
+                vmatcher.next()
+        self.add_vector_items(fieldname, fieldobj, readitems())
+
+    def finish_doc(self):
+        self.stored.add(self.storedfields)
+        self.storedfields = None
+
+    def lengths_reader(self):
+        return self.lengths
+
+    def close(self):
+        if self.storedfields is not None:
+            self.stored.add(self.storedfields)
+        self.stored.close()
+        self.lengths.to_file(self.flfile, self.doccount)
+        if self.vindex:
+            self.vindex.close()
+            self.vpostfile.close()
+
+
+# Inverted index writer
+
+class StdFieldWriter(base.FieldWriter):
+    def __init__(self, storage, segment, blocklimit=128, compression=3,
+                 inlinelimit=1):
+        self.storage = storage
+        self.segment = segment
+        self.fieldname = None
+        self.text = None
+        self.field = None
+        self.format = None
+        self.spelling = False
+
+        tifile = segment.create_file(storage, StdCodec.TERMS_EXT)
+        self.termsindex = TermIndexWriter(tifile)
+        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)
+
+        # We'll wait to create the DAWG builder until someone actually adds
+        # a spelled field
+        self.dawg = None
+
+        self.blocklimit = blocklimit
+        self.compression = compression
+        self.inlinelimit = inlinelimit
+        self.block = None
+        self.terminfo = None
+
+    def _make_dawg_files(self):
+        dawgfile = self.segment.create_file(self.storage, StdCodec.DAWG_EXT)
+        self.dawg = DawgBuilder(dawgfile, field_root=True)
+
+    def _reset_block(self):
+        self.block = StdBlock(self.format.posting_size)
+
+    def _write_block(self):
+        self.terminfo.add_block(self.block)
+        self.block.to_file(self.postfile, compression=self.compression)
+        self._reset_block()
+        self.blockcount += 1
+
+    def _start_blocklist(self):
+        postfile = self.postfile
+        self._reset_block()
+
+        # Magic number
+        self.startoffset = postfile.tell()
+        postfile.write(StdBlock.magic)
+        # Placeholder for block count
+        self.blockcount = 0
+        postfile.write_uint(0)
+
+    def _finish_blocklist(self):
+        if self.block:
+            self._write_block()
+
+        # Seek back to the start of this list of posting blocks and write the
+        # number of blocks
+        postfile = self.postfile
+        postfile.flush()
+        here = postfile.tell()
+        postfile.seek(self.startoffset + 4)
+        postfile.write_uint(self.blockcount)
+        postfile.seek(here)
+
+        self.block = None
+
+    def start_field(self, fieldname, fieldobj):
+        self.fieldname = fieldname
+        self.field = fieldobj
+        self.format = fieldobj.format
+        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()
+
+    def start_term(self, text):
+        if self.block is not None:
+            raise Exception("Called start_term in a block")
+        self.text = text
+        self.terminfo = base.FileTermInfo()
+        if self.spelling:
+            if self.dawg is None:
+                self._make_dawg_files()
+            self.dawg.insert((self.fieldname,) + tuple(text))
+        self._start_blocklist()
+
+    def add(self, docnum, weight, valuestring, length):
+        self.block.add(docnum, weight, valuestring, length)
+        if len(self.block) > self.blocklimit:
+            self._write_block()
+
+    def add_spell_word(self, fieldname, text):
+        if self.dawg is None:
+            self._make_dawg_files()
+        self.dawg.insert((fieldname,) + tuple(text))
+
+    def finish_term(self):
+        if self.block is None:
+            raise Exception("Called finish_term when not in a block")
+        block = self.block
+        terminfo = self.terminfo
+        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
+            # Inline the single block
+            terminfo.add_block(block)
+            vals = None if not block.values else tuple(block.values)
+            postings = (tuple(block.ids), tuple(block.weights), vals)
+        else:
+            self._finish_blocklist()
+            postings = self.startoffset
+
+        self.block = None
+        terminfo.postings = postings
+        self.termsindex.add((self.fieldname, self.text), terminfo)
+
+    def close(self):
+        self.termsindex.close()
+        self.postfile.close()
+        if self.dawg is not None:
+            self.dawg.close()
+
+
+# Matcher
+
+class PostingMatcher(base.BlockPostingMatcher):
+    def __init__(self, postfile, startoffset, fmt, scorer=None, term=None,
+                 stringids=False):
+        self.postfile = postfile
+        self.startoffset = startoffset
+        self.format = fmt
+        self.scorer = scorer
+        self._term = term
+        self.stringids = stringids
+
+        postfile.seek(startoffset)
+        magic = postfile.read(4)
+        if magic != StdBlock.magic:
+            from whoosh.codec.legacy import old_block_type
+            self.blockclass = old_block_type(magic)
+        else:
+            self.blockclass = StdBlock
+
+        self.blockcount = postfile.read_uint()
+        self.baseoffset = postfile.tell()
+
+        self._active = True
+        self.currentblock = -1
+        self._next_block()
+
+    def is_active(self):
+        return self._active
+
+    def _read_block(self, offset):
+        pf = self.postfile
+        pf.seek(offset)
+        return self.blockclass.from_file(pf, self.format.posting_size,
+                                         stringids=self.stringids)
+
+    def _consume_block(self):
+        self.block.read_ids()
+        self.block.read_weights()
+        self.i = 0
+
+    def _next_block(self, consume=True):
+        if not (self.currentblock < self.blockcount):
+            raise Exception("No next block")
+
+        self.currentblock += 1
+        if self.currentblock == self.blockcount:
+            self._active = False
+            return
+
+        if self.currentblock == 0:
+            pos = self.baseoffset
+        else:
+            pos = self.block.nextoffset
+
+        self.block = self._read_block(pos)
+        if consume:
+            self._consume_block()
+
+    def _skip_to_block(self, targetfn):
+        skipped = 0
+        while self._active and targetfn():
+            self._next_block(consume=False)
+            skipped += 1
+
+        if self._active:
+            self._consume_block()
+
+        return skipped
+
+    def score(self):
+        return self.scorer.score(self)
+
+
+# Tables
+
+# Term index
+
+class TermIndexWriter(CodedOrderedWriter):
+    def __init__(self, dbfile):
+        super(TermIndexWriter, self).__init__(dbfile)
+        self.fieldcounter = 0
+        self.fieldmap = {}
+
+    def keycoder(self, key):
+        # Encode term
+        fieldmap = self.fieldmap
+        fieldname, text = key
+
+        if fieldname in fieldmap:
+            fieldnum = fieldmap[fieldname]
+        else:
+            fieldnum = self.fieldcounter
+            fieldmap[fieldname] = fieldnum
+            self.fieldcounter += 1
+
+        key = pack_ushort(fieldnum) + utf8encode(text)[0]
+        return key
+
+    def valuecoder(self, terminfo):
+        return terminfo.to_string()
+
+    def close(self):
+        self._write_hashes()
+        dbfile = self.dbfile
+
+        dbfile.write_uint(len(self.index))
+        for n in self.index:
+            dbfile.write_long(n)
+        dbfile.write_pickle(self.fieldmap)
+
+        self._write_directory()
+        self.dbfile.close()
+
+
+class PostingIndexBase(CodedOrderedReader):
+    # Shared base class for terms index and vector index readers
+    def __init__(self, dbfile, postfile):
+        CodedOrderedReader.__init__(self, dbfile)
+        self.postfile = postfile
+
+        dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
+        self.fieldmap = dbfile.read_pickle()
+        self.names = [None] * len(self.fieldmap)
+        for name, num in iteritems(self.fieldmap):
+            self.names[num] = name
+
+
+class StdTermsReader(PostingIndexBase):
+    # Implements whoosh.codec.base.TermsReader
+
+    def terminfo(self, fieldname, text):
+        return self[fieldname, text]
+
+    def matcher(self, fieldname, text, format_, scorer=None):
+        # Note this does not filter out deleted documents; a higher level is
+        # expected to wrap this matcher to eliminate deleted docs
+        pf = self.postfile
+        term = (fieldname, text)
+        try:
+            terminfo = self[term]
+        except KeyError:
+            raise TermNotFound("No term %s:%r" % (fieldname, text))
+
+        p = terminfo.postings
+        if isinstance(p, integer_types):
+            # terminfo.postings is an offset into the posting file
+            pr = PostingMatcher(pf, p, format_, scorer=scorer, term=term)
+        else:
+            # terminfo.postings is an inlined tuple of (ids, weights, values)
+            docids, weights, values = p
+            pr = ListMatcher(docids, weights, values, format_, scorer=scorer,
+                             term=term, terminfo=terminfo)
+        return pr
+
+    def keycoder(self, key):
+        fieldname, text = key
+        fnum = self.fieldmap.get(fieldname, 65535)
+        return pack_ushort(fnum) + utf8encode(text)[0]
+
+    def keydecoder(self, v):
+        assert isinstance(v, bytes_type)
+        return (self.names[unpack_ushort(v[:2])[0]], utf8decode(v[2:])[0])
+
+    def valuedecoder(self, v):
+        assert isinstance(v, bytes_type)
+        return base.FileTermInfo.from_string(v)
+
+    def frequency(self, key):
+        datapos = self.range_for_key(key)[0]
+        return base.FileTermInfo.read_weight(self.dbfile, datapos)
+
+    def doc_frequency(self, key):
+        datapos = self.range_for_key(key)[0]
+        return base.FileTermInfo.read_doc_freq(self.dbfile, datapos)
+
+
+# Vector index
+
+# docnum, fieldnum
+_vectorkey_struct = Struct("!IH")
+
+
+class VectorWriter(TermIndexWriter):
+    def keycoder(self, key):
+        fieldmap = self.fieldmap
+        docnum, fieldname = key
+
+        if fieldname in fieldmap:
+            fieldnum = fieldmap[fieldname]
+        else:
+            fieldnum = self.fieldcounter
+            fieldmap[fieldname] = fieldnum
+            self.fieldcounter += 1
+
+        return _vectorkey_struct.pack(docnum, fieldnum)
+
+    def valuecoder(self, offset):
+        return pack_long(offset)
+
+
+class StdVectorReader(PostingIndexBase):
+    # Implements whoosh.codec.base.VectorReader
+
+    def matcher(self, docnum, fieldname, format_):
+        pf = self.postfile
+        offset = self[(docnum, fieldname)]
+        pr = PostingMatcher(pf, offset, format_, stringids=True)
+        return pr
+
+    def keycoder(self, key):
+        return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]])
+
+    def keydecoder(self, v):
+        docnum, fieldnum = _vectorkey_struct.unpack(v)
+        return (docnum, self.names[fieldnum])
+
+    def valuedecoder(self, v):
+        return unpack_long(v)[0]
+
+
+# Field lengths
+
+class LengthsBase(base.LengthsReader):
+    magic = b("~LN1")
+
+    def __init__(self):
+        self.starts = {}
+        self.totals = {}
+        self.minlens = {}
+        self.maxlens = {}
+
+    def _read_header(self, dbfile, doccount):
+        first = dbfile.read(4)  # Magic
+        assert first == self.magic
+        version = dbfile.read_int()  # Version number
+        assert version == 1
+
+        dc = dbfile.read_uint()  # Number of documents saved
+        if doccount is None:
+            doccount = dc
+        assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
+        self._count = doccount
+
+        fieldcount = dbfile.read_ushort()  # Number of fields
+        # Read per-field info
+        for i in xrange(fieldcount):
+            fieldname = dbfile.read_string().decode('utf-8')
+            self.totals[fieldname] = dbfile.read_long()
+            self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
+            self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
+            self.starts[fieldname] = i * doccount
+
+        # Add header length to per-field offsets
+        eoh = dbfile.tell()  # End of header
+        for fieldname in self.starts:
+            self.starts[fieldname] += eoh
+
+    def field_length(self, fieldname):
+        return self.totals.get(fieldname, 0)
+
+    def min_field_length(self, fieldname):
+        return self.minlens.get(fieldname, 0)
+
+    def max_field_length(self, fieldname):
+        return self.maxlens.get(fieldname, 0)
+
+
+class InMemoryLengths(LengthsBase):
+    def __init__(self):
+        LengthsBase.__init__(self)
+        self.totals = defaultdict(int)
+        self.lengths = {}
+        self._count = 0
+
+    # IO
+
+    def to_file(self, dbfile, doccount):
+        self._pad_arrays(doccount)
+        fieldnames = list(self.lengths.keys())
+
+        dbfile.write(self.magic)
+        dbfile.write_int(1)  # Format version number
+        dbfile.write_uint(doccount)  # Number of documents
+        dbfile.write_ushort(len(self.lengths))  # Number of fields
+
+        # Write per-field info
+        for fieldname in fieldnames:
+            dbfile.write_string(fieldname.encode('utf-8'))  # Fieldname
+            dbfile.write_long(self.field_length(fieldname))
+            dbfile.write_byte(length_to_byte(self.min_field_length(fieldname)))
+            dbfile.write_byte(length_to_byte(self.max_field_length(fieldname)))
+
+        # Write byte arrays
+        for fieldname in fieldnames:
+            dbfile.write_array(self.lengths[fieldname])
+        dbfile.close()
+
+    @classmethod
+    def from_file(cls, dbfile, doccount=None):
+        obj = cls()
+        obj._read_header(dbfile, doccount)
+        for fieldname, start in iteritems(obj.starts):
+            obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count)
+        dbfile.close()
+        return obj
+
+    # Get
+
+    def get(self, docnum, fieldname, default=0):
+        try:
+            arry = self.lengths[fieldname]
+        except KeyError:
+            return default
+        if docnum >= len(arry):
+            return default
+        return byte_to_length(arry[docnum])
+
+    # Min/max cache setup -- not meant to be called while adding
+
+    def _minmax(self, fieldname, op, cache):
+        if fieldname in cache:
+            return cache[fieldname]
+        else:
+            ls = self.lengths[fieldname]
+            if ls:
+                result = byte_to_length(op(ls))
+            else:
+                result = 0
+            cache[fieldname] = result
+            return result
+
+    def min_field_length(self, fieldname):
+        return self._minmax(fieldname, min, self.minlens)
+
+    def max_field_length(self, fieldname):
+        return self._minmax(fieldname, max, self.maxlens)
+
+    # Add
+
+    def _create_field(self, fieldname, docnum):
+        dc = max(self._count, docnum + 1)
+        self.lengths[fieldname] = array("B", (0 for _ in xrange(dc)))
+        self._count = dc
+
+    def _pad_arrays(self, doccount):
+        # Pad out arrays to full length
+        for fieldname in self.lengths.keys():
+            arry = self.lengths[fieldname]
+            if len(arry) < doccount:
+                for _ in xrange(doccount - len(arry)):
+                    arry.append(0)
+        self._count = doccount
+
+    def add(self, docnum, fieldname, length):
+        lengths = self.lengths
+        if length:
+            if fieldname not in lengths:
+                self._create_field(fieldname, docnum)
+
+            arry = self.lengths[fieldname]
+            count = docnum + 1
+            if len(arry) < count:
+                for _ in xrange(count - len(arry)):
+                    arry.append(0)
+            if count > self._count:
+                self._count = count
+            byte = length_to_byte(length)
+            arry[docnum] = byte
+            self.totals[fieldname] += length
+
+    def add_other(self, other):
+        lengths = self.lengths
+        totals = self.totals
+        doccount = self._count
+        for fname in other.lengths:
+            if fname not in lengths:
+                lengths[fname] = array("B")
+        self._pad_arrays(doccount)
+
+        for fname in other.lengths:
+            lengths[fname].extend(other.lengths[fname])
+        self._count = doccount + other._count
+        self._pad_arrays(self._count)
+
+        for fname in other.totals:
+            totals[fname] += other.totals[fname]
+
+
+class OnDiskLengths(LengthsBase):
+    def __init__(self, dbfile, doccount=None):
+        LengthsBase.__init__(self)
+        self.dbfile = dbfile
+        self._read_header(dbfile, doccount)
+
+    def get(self, docnum, fieldname, default=0):
+        try:
+            start = self.starts[fieldname]
+        except KeyError:
+            return default
+        return byte_to_length(self.dbfile.get_byte(start + docnum))
+
+    def close(self):
+        self.dbfile.close()
+
+
+# Stored fields
+
+_stored_pointer_struct = Struct("!qI")  # offset, length
+stored_pointer_size = _stored_pointer_struct.size
+pack_stored_pointer = _stored_pointer_struct.pack
+unpack_stored_pointer = _stored_pointer_struct.unpack
+
+
+class StoredFieldWriter(object):
+    def __init__(self, dbfile):
+        self.dbfile = dbfile
+        self.length = 0
+        self.directory = []
+
+        self.dbfile.write_long(0)
+        self.dbfile.write_uint(0)
+
+        self.names = []
+        self.name_map = {}
+
+    def add(self, vdict):
+        f = self.dbfile
+        names = self.names
+        name_map = self.name_map
+
+        vlist = [None] * len(names)
+        for k, v in iteritems(vdict):
+            if k in name_map:
+                vlist[name_map[k]] = v
+            else:
+                name_map[k] = len(names)
+                names.append(k)
+                vlist.append(v)
+
+        # [2:-1] strips the pickle protocol header and trailing "." (the reader adds the "." back)
+        vstring = dumps(tuple(vlist), -1)[2:-1]
+        self.length += 1
+        self.directory.append(pack_stored_pointer(f.tell(), len(vstring)))
+        f.write(vstring)
+
+    def close(self):
+        f = self.dbfile
+        dirpos = f.tell()
+        f.write_pickle(self.names)
+        for pair in self.directory:
+            f.write(pair)
+        f.flush()
+        f.seek(0)
+        f.write_long(dirpos)
+        f.write_uint(self.length)
+        f.close()
+
+
+class StoredFieldReader(object):
+    def __init__(self, dbfile):
+        self.dbfile = dbfile
+
+        dbfile.seek(0)
+        dirpos = dbfile.read_long()
+        self.length = dbfile.read_uint()
+
+        dbfile.seek(dirpos)
+
+        nameobj = dbfile.read_pickle()
+        if isinstance(nameobj, dict):
+            # Previous versions stored the list of names as a map of names to
+            # positions... it seemed to make sense at the time...
+            self.names = [None] * len(nameobj)
+            for name, pos in iteritems(nameobj):
+                self.names[pos] = name
+        else:
+            self.names = nameobj
+        self.directory_offset = dbfile.tell()
+
+    def close(self):
+        self.dbfile.close()
+
+    def __getitem__(self, num):
+        if num > self.length - 1:
+            raise IndexError("Tried to get document %s but the file only "
+                             "contains %s documents" % (num, self.length))
+
+        dbfile = self.dbfile
+        start = self.directory_offset + num * stored_pointer_size
+        dbfile.seek(start)
+        ptr = dbfile.read(stored_pointer_size)
+        if len(ptr) != stored_pointer_size:
+            raise Exception("Error reading stored field pointer from %r at %s:"
+                            " got %s bytes, expected %s"
+                            % (dbfile, start, len(ptr), stored_pointer_size))
+        position, length = unpack_stored_pointer(ptr)
+        vlist = loads(dbfile.map[position:position + length] + b("."))
+
+        names = self.names
+        # Recreate a dictionary by putting the field names and values back
+        # together by position. We can't just use dict(zip(...)) because we
+        # want to filter out the None values.
+        vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
+                     if vlist[i] is not None)
+        return vdict
+
+
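StoredFieldWriter.add grows a shared list of field names and pickles each document's values as a positional tuple, and StoredFieldReader.__getitem__ reverses that mapping while dropping the None placeholders. A minimal sketch of just that name/position bookkeeping, with the pickling and the (offset, length) directory left out:

    names, name_map = [], {}

    def encode(vdict):
        vlist = [None] * len(names)
        for k, v in vdict.items():
            if k in name_map:
                vlist[name_map[k]] = v
            else:
                name_map[k] = len(names)
                names.append(k)
                vlist.append(v)
        return vlist

    def decode(vlist):
        return dict((names[i], vlist[i]) for i in range(len(vlist))
                    if vlist[i] is not None)

    first = encode({"title": "a"})             # -> ["a"]
    second = encode({"title": "b", "num": 2})  # -> ["b", 2]
    assert decode(first) == {"title": "a"}
    assert decode(second) == {"title": "b", "num": 2}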
+# Posting blocks
+
+class StdBlock(base.BlockBase):
+    magic = b("Blk3")
+
+    infokeys = ("count", "maxid", "maxweight", "minlength", "maxlength",
+                "idcode", "compression", "idslen", "weightslen")
+
+    def to_file(self, postfile, compression=3):
+        ids = self.ids
+        idcode, idstring = minimize_ids(ids, self.stringids, compression)
+        wtstring = minimize_weights(self.weights, compression)
+        vstring = minimize_values(self.postingsize, self.values, compression)
+
+        info = (len(ids), ids[-1], self.maxweight,
+                length_to_byte(self.minlength), length_to_byte(self.maxlength),
+                idcode, compression, len(idstring), len(wtstring))
+        infostring = dumps(info, -1)
+
+        # Length of the block contents, i.e. the offset delta to the next block
+        postfile.write_uint(len(infostring) + len(idstring) + len(wtstring)
+                            + len(vstring))
+        # Block contents
+        postfile.write(infostring)
+        postfile.write(idstring)
+        postfile.write(wtstring)
+        postfile.write(vstring)
+
+    @classmethod
+    def from_file(cls, postfile, postingsize, stringids=False):
+        block = cls(postingsize, stringids=stringids)
+        block.postfile = postfile
+
+        delta = postfile.read_uint()
+        block.nextoffset = postfile.tell() + delta
+        info = postfile.read_pickle()
+        block.dataoffset = postfile.tell()
+
+        for key, value in zip(cls.infokeys, info):
+            if key in ("minlength", "maxlength"):
+                value = byte_to_length(value)
+            setattr(block, key, value)
+
+        return block
+
+    def read_ids(self):
+        offset = self.dataoffset
+        idstring = self.postfile.map[offset:offset + self.idslen]
+        ids = deminimize_ids(self.idcode, self.count, idstring,
+                             self.compression)
+        self.ids = ids
+        return ids
+
+    def read_weights(self):
+        if self.weightslen == 0:
+            weights = [1.0] * self.count
+        else:
+            offset = self.dataoffset + self.idslen
+            wtstring = self.postfile.map[offset:offset + self.weightslen]
+            weights = deminimize_weights(self.count, wtstring,
+                                         self.compression)
+        self.weights = weights
+        return weights
+
+    def read_values(self):
+        postingsize = self.postingsize
+        if postingsize == 0:
+            values = [None] * self.count
+        else:
+            offset = self.dataoffset + self.idslen + self.weightslen
+            vstring = self.postfile.map[offset:self.nextoffset]
+            values = deminimize_values(postingsize, self.count, vstring,
+                                       self.compression)
+        self.values = values
+        return values
+
+
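to_file prefixes each block with a uint holding the byte length of everything that follows (the info pickle plus the id, weight and value strings), which is how from_file computes nextoffset. A hedged sketch of walking a term's blocks in order, assuming the caller already knows the block count (e.g. from the term info) and that postfile behaves like whoosh's structfile (tell/seek/map):

    def iter_blocks(postfile, postingsize, blockcount, stringids=False):
        # Yields StdBlock objects in file order; read_ids/read_weights/
        # read_values can then be called lazily on each block
        for _ in range(blockcount):
            block = StdBlock.from_file(postfile, postingsize,
                                       stringids=stringids)
            yield block
            postfile.seek(block.nextoffset)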

File src/whoosh/compat.py

     from cPickle import dumps, loads, dump, load
     string_type = basestring
     text_type = unicode
+    bytes_type = str
     unichr = unichr
     from urllib import urlretrieve
 
     StringIO = io.StringIO
     string_type = str
     text_type = str
+    bytes_type = bytes
     unichr = chr
     from urllib.request import urlretrieve
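With bytes_type defined on both the Python 2 and Python 3 branches, callers can test for raw byte strings without version checks. A small usage sketch (the helper name is illustrative):

    from whoosh.compat import bytes_type

    def is_bytes(value):
        # str on Python 2, bytes on Python 3
        return isinstance(value, bytes_type)

    assert is_bytes(b"raw")
    assert not is_bytes(u"text")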
 

File src/whoosh/fields.py

                             % (self.__class__.__name__, self))
         if not isinstance(value, (text_type, list, tuple)):
             raise ValueError("%r is not unicode or sequence" % value)
-        assert isinstance(self.format, formats.Format), type(self.format)
-        return self.format.word_values(value, self.analyzer, mode="index",
-                                       **kwargs)
+        assert isinstance(self.format, formats.Format)
+
+        if "mode" not in kwargs:
+            kwargs["mode"] = "index"
+        return self.format.word_values(value, self.analyzer, **kwargs)
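Because the mode is now just a default supplied through kwargs rather than hard-coded, callers can override it and it is passed straight through to word_values. A hedged sketch, assuming `field` is a configured schema field instance (names here are illustrative):

    items = list(field.index(u"alpha beta"))                # mode defaults to "index"
    items = list(field.index(u"alpha beta", mode="query"))  # caller-supplied mode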
 
     def index_(self, fieldname, value, **kwargs):
         for w, freq, weight, value in self.index(value, **kwargs):
         for shift in xrange(0, bitlen, self.shift_step):
             yield self.to_text(num, shift=shift)
 
-    def index(self, num):
+    def index(self, num, **kwargs):
         # If the user gave us a list of numbers, recurse on the list
         if isinstance(num, (list, tuple)):
             items = []
             raise ValueError("%r is not a boolean")
         return self.strings[int(bit)]
 
-    def index(self, bit):
+    def index(self, bit, **kwargs):
         bit = bool(bit)
         # word, freq, weight, valuestring
         return [(self.strings[int(bit)], 1, 1.0, '')]

File src/whoosh/filedb/fileindex.py

         storage.rename_file(tempfilename, tocfilename, safe=True)
 
 
-def _clean_files(storage, indexname, gen, segments):
+def clean_files(storage, indexname, gen, segments):
     # Attempts to remove unused index files (called when a new generation
     # is created). If existing Index and/or reader objects have the files
     # open, they may not be deleted immediately (i.e. on Windows) but will
     along the way).
     """
 
-    EXTENSIONS = {"dawg": "dag",
-                  "fieldlengths": "fln",
-                  "storedfields": "sto",
-                  "termsindex": "trm",
-                  "termposts": "pst",
-                  "vectorindex": "vec",
-                  "vectorposts": "vps"}
     IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
 
     @classmethod
     def _random_id(cls, size=12):
         return "".join(random.choice(cls.IDCHARS) for _ in xrange(size))
 
-    def __init__(self, indexname, doccount, segid=None, deleted=None):
+    def __init__(self, indexname, doccount=0, segid=None, deleted=None):
         """
         :param name: The name of the segment (the Index object computes this
             from its name and the generation).
         self.deleted = deleted
 
     def __repr__(self):
-        return "<%s %r %s>" % (self.__class__.__name__, self.name,
-                               getattr(self, "segid", ""))
-
-    def __getattr__(self, name):
-        # Capture accesses to e.g. Segment.fieldlengths_filename and return
-        # the appropriate filename
-        part2 = "_filename"
-        if name.endswith(part2):
-            part1 = name[:0 - len(part2)]
-            if part1 in self.EXTENSIONS:
-                return self.make_filename(self.EXTENSIONS[part1])
-
-        raise AttributeError(name)
+        return "<%s %s>" % (self.__class__.__name__, getattr(self, "segid", ""))
 
     def segment_id(self):
         if hasattr(self, "name"):
             return "%s_%s" % (self.indexname, self.segid)
 
     def make_filename(self, ext):
-        return "%s.%s" % (self.segment_id(), ext)
+        return "%s%s" % (self.segment_id(), ext)
+
+    def create_file(self, storage, ext, **kwargs):
+        """Convenience method to create a new file in the given storage named
+        with this segment's ID and the given extension. Any keyword arguments
+        are passed to the storage's create_file method.
+        """
+
+        fname = self.make_filename(ext)
+        return storage.create_file(fname, **kwargs)
+
+    def open_file(self, storage, ext, **kwargs):
+        """Convenience method to open a file in the given storage named with
+        this segment's ID and the given extension. Any keyword arguments are
+        passed to the storage's open_file method.
+        """
+
+        fname = self.make_filename(ext)
+        return storage.open_file(fname, **kwargs)
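These helpers replace the old EXTENSIONS / __getattr__ filename magic: the codec now picks its own extension strings and passes them in explicitly, and make_filename simply concatenates, so the extension includes its own dot. A hedged usage sketch, assuming the class shown here is the filedb Segment and that storage is a whoosh storage object (the ".pst" extension is illustrative; the real strings live in the codec):

    seg = Segment("MAIN")                        # doccount now defaults to 0
    postfile = seg.create_file(storage, ".pst")  # e.g. "MAIN_<12-char id>.pst"
    postfile.write_uint(0)
    postfile.close()

    postfile = seg.open_file(storage, ".pst")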
 
     def doc_count_all(self):
         """

File src/whoosh/filedb/filepostings.py

-# Copyright 2010 Matt Chaput. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#    1. Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#
-#    2. Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
-# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# The views and conclusions contained in the software and documentation are
-# those of the authors and should not be interpreted as representing official
-# policies, either expressed or implied, of Matt Chaput.
-
-from whoosh.compat import integer_types, xrange
-from whoosh.formats import Format
-from whoosh.writing import PostingWriter
-from whoosh.matching import Matcher, ReadTooFar
-from whoosh.spans import Span
-from whoosh.system import _INT_SIZE
-from whoosh.filedb import postblocks
-from whoosh.filedb.filetables import FileTermInfo
-
-
-class FilePostingWriter(PostingWriter):
-    blockclass = postblocks.current
-
-    def __init__(self, postfile, stringids=False, blocklimit=128,
-                 compression=3):
-        self.postfile = postfile
-        self.stringids = stringids