Commits

Matt Chaput committed 9db6fa3

Moved several functions included for backwards compatibility to compat from util.
Fixed bug in StructFile.get_array().
Fixed references to filedb.fileindex.Segment.
Added "gint" variable-length integer compression functions to util.

  • Participants
  • Parent commits 6456ee7

Comments (0)

Files changed (22)

File src/whoosh/compat.py

 import sys
 
+
+# Run time aliasing of Python2/3 differences
+
 if sys.version_info[0] < 3:
     PY3 = False
 
         else:
             return mv
 
+
+# Implementations missing from older versions of Python
+
+try:
+    from itertools import permutations  # @UnusedImport
+except ImportError:
+    # Python 2.5
+    def permutations(iterable, r=None):
+        pool = tuple(iterable)
+        n = len(pool)
+        r = n if r is None else r
+        if r > n:
+            return
+        indices = range(n)
+        cycles = range(n, n - r, -1)
+        yield tuple(pool[i] for i in indices[:r])
+        while n:
+            for i in reversed(range(r)):
+                cycles[i] -= 1
+                if cycles[i] == 0:
+                    indices[i:] = indices[i + 1:] + indices[i:i + 1]
+                    cycles[i] = n - i
+                else:
+                    j = cycles[i]
+                    indices[i], indices[-j] = indices[-j], indices[i]
+                    yield tuple(pool[i] for i in indices[:r])
+                    break
+            else:
+                return
+
+
+try:
+    # Python 2.6-2.7
+    from itertools import izip_longest  # @UnusedImport
+except ImportError:
+    try:
+        # Python 3.0
+        from itertools import zip_longest as izip_longest  # @UnusedImport
+    except ImportError:
+        # Python 2.5
+        from itertools import chain, izip, repeat
+
+        def izip_longest(*args, **kwds):
+            fillvalue = kwds.get('fillvalue')
+
+            def sentinel(counter=([fillvalue] * (len(args) - 1)).pop):
+                yield counter()
+
+            fillers = repeat(fillvalue)
+            iters = [chain(it, sentinel(), fillers) for it in args]
+            try:
+                for tup in izip(*iters):
+                    yield tup
+            except IndexError:
+                pass
+
+
+try:
+    from operator import methodcaller  # @UnusedImport
+except ImportError:
+    # Python 2.5
+    def methodcaller(name, *args, **kwargs):
+        def caller(obj):
+            return getattr(obj, name)(*args, **kwargs)
+        return caller
+
+
+try:
+    from abc import abstractmethod  # @UnusedImport
+except ImportError:
+    # Python 2.5
+    def abstractmethod(funcobj):
+        """A decorator indicating abstract methods.
+        """
+        funcobj.__isabstractmethod__ = True
+        return funcobj
+
+
+

File src/whoosh/filedb/fileindex.py

                 sleep(0.05)
 
 
-from whoosh.codec.whoosh2 import W2Segment as Segment  # @UnusedImport
+# from whoosh.codec.whoosh2 import W2Segment as Segment  # @UnusedImport
 

File src/whoosh/filedb/filereading.py

 
 from whoosh.compat import iteritems, xrange
 from whoosh.filedb.compound import CompoundStorage
-from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.fieldcache import FieldCache, DefaultFieldCachingPolicy
 from whoosh.matching import FilterMatcher
 from whoosh.reading import IndexReader, TermNotFound
         if hasattr(self.segment, "segment_id"):
             self.segid = self.segment.segment_id()
         else:
+            from whoosh.codec.base import Segment
             self.segid = Segment._random_id()
 
         # self.files is a storage object from which to load the segment files.

File src/whoosh/filedb/filewriting.py

 from bisect import bisect_right
 
 from whoosh.fields import UnknownFieldError
-from whoosh.filedb.fileindex import Segment
 from whoosh.store import LockError
 from whoosh.support.filelock import try_for
 from whoosh.support.externalsort import SortingPool
     def add_postings(self, lengths, items, startdoc, docmap):
         # items = (fieldname, text, docnum, weight, valuestring) ...
         schema = self.schema
+
         # Make a generator to strip out deleted fields and renumber the docs
         # before passing them down to the field writer
         def gen():
                 if field.vector and reader.has_vector(docnum, fieldname):
                     v = reader.vector(docnum, fieldname)
                     perdocwriter.add_vector_matcher(fieldname, field, v)
-            # Finish the new document 
+            # Finish the new document
             perdocwriter.finish_doc()
             newdoc += 1
 

File src/whoosh/filedb/structfile.py

 
     def get_array(self, position, typecode, length):
         self.file.seek(position)
-        self.read_array(typecode, length)
+        return self.read_array(typecode, length)
 
 
 

File src/whoosh/matching/mcore.py

 from itertools import repeat
 
 from whoosh.compat import izip, xrange
-from whoosh.util import abstractmethod
+from whoosh.compat import abstractmethod
 
 
 # Exceptions

File src/whoosh/query/qcore.py

 from whoosh import matching
 from whoosh.compat import u
 from whoosh.reading import TermNotFound
-from whoosh.util import methodcaller
+from whoosh.compat import methodcaller
 
 
 # Exceptions

File src/whoosh/reading.py

 from whoosh.support.levenshtein import distance
 from whoosh.util import ClosableMixin
 from whoosh.matching import MultiMatcher
-from whoosh.util import abstractmethod
+from whoosh.compat import abstractmethod
 
 
 # Exceptions

File src/whoosh/support/bitvector.py

 from array import array
 from bisect import bisect_left, bisect_right, insort
 
-from whoosh.compat import integer_types, izip, xrange
+from whoosh.compat import integer_types, izip, izip_longest, xrange
 
 
 # Number of '1' bits in each byte (0-255)
             bits[-1] = bits[-1] & mask
 
     def _logic(self, obj, op, other):
-        from whoosh.util import izip_longest
-
         objbits = obj.bits
         for i, (byte1, byte2) in enumerate(izip_longest(objbits, other.bits,
                                                         fillvalue=0)):

File src/whoosh/system.py

 _uint_struct = Struct("!I")
 _long_struct = Struct("!q")
 _float_struct = Struct("!f")
+_ushort_le_struct = Struct("<H")
+_uint_le_struct = Struct("<I")
 
 pack_byte = _byte_struct.pack
 pack_sbyte = _sbyte_struct.pack
 pack_uint = _uint_struct.pack
 pack_long = _long_struct.pack
 pack_float = _float_struct.pack
+pack_ushort_le = _ushort_le_struct.pack
+pack_uint_le = _uint_le_struct.pack
 
 unpack_byte = _byte_struct.unpack  # ord() might be faster
 unpack_sbyte = _sbyte_struct.unpack
 unpack_uint = _uint_struct.unpack
 unpack_long = _long_struct.unpack
 unpack_float = _float_struct.unpack
+unpack_ushort_le = _ushort_le_struct.unpack
+unpack_uint_le = _uint_le_struct.unpack
 
 if sys.version_info[0] < 3:
     emptybytes = ""

File src/whoosh/util.py

 import sys
 import time
 from array import array
-from bisect import insort, bisect_left
+from bisect import insort, bisect_left, bisect_right
 from copy import copy
 from functools import wraps
 from struct import pack, unpack
 
 from whoosh.compat import xrange, u, b, string_type
 from whoosh.system import IS_LITTLE
-
-
-try:
-    from itertools import permutations  # @UnusedImport
-except ImportError:
-    # Python 2.5
-    def permutations(iterable, r=None):
-        pool = tuple(iterable)
-        n = len(pool)
-        r = n if r is None else r
-        if r > n:
-            return
-        indices = range(n)
-        cycles = range(n, n - r, -1)
-        yield tuple(pool[i] for i in indices[:r])
-        while n:
-            for i in reversed(range(r)):
-                cycles[i] -= 1
-                if cycles[i] == 0:
-                    indices[i:] = indices[i + 1:] + indices[i:i + 1]
-                    cycles[i] = n - i
-                else:
-                    j = cycles[i]
-                    indices[i], indices[-j] = indices[-j], indices[i]
-                    yield tuple(pool[i] for i in indices[:r])
-                    break
-            else:
-                return
-
-try:
-    # Python 2.6-2.7
-    from itertools import izip_longest  # @UnusedImport
-except ImportError:
-    try:
-        # Python 3.0
-        from itertools import zip_longest as izip_longest
-    except ImportError:
-        # Python 2.5
-        from itertools import chain, izip, repeat
-
-        def izip_longest(*args, **kwds):
-            fillvalue = kwds.get('fillvalue')
-
-            def sentinel(counter=([fillvalue] * (len(args) - 1)).pop):
-                yield counter()
-
-            fillers = repeat(fillvalue)
-            iters = [chain(it, sentinel(), fillers) for it in args]
-            try:
-                for tup in izip(*iters):
-                    yield tup
-            except IndexError:
-                pass
-
-try:
-    from operator import methodcaller  # @UnusedImport
-except ImportError:
-    # Python 2.5
-    def methodcaller(name, *args, **kwargs):
-        def caller(obj):
-            return getattr(obj, name)(*args, **kwargs)
-        return caller
+from whoosh.system import pack_ushort_le, pack_uint_le
+from whoosh.system import unpack_ushort_le, unpack_uint_le
 
 
 if sys.platform == 'win32':
     return i
 
 
+# Google Packed Ints algorithm: a set of four numbers is preceded by a "key"
+# byte, which encodes how many bytes each of the next four integers use
+# (stored in the byte as four 2-bit numbers)
+
+# Number of future bytes to expect after a "key" byte value of N -- used to
+# skip ahead from a key byte
+
+_gint_lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6,
+7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 5,
+6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9,
+10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
+13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7,
+8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11,
+12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13,
+14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10,
+8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10,
+11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12,
+13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16])
+
+
+def key_to_sizes(key):
+    """Returns a list of the sizes of the next four numbers given a key byte.
+    """
+
+    return [(key >> (i * 2) & 3) + 1 for i in xrange(4)]
+
+
+def write_gints(dbfile, nums):
+    """Write the integers in the iterator nums to bytes stream dbfile.
+    """
+
+    buf = array("c")
+    count = 0
+    key = 0
+    for v in nums:
+        shift = count * 2
+        if v < 256:
+            buf.append(chr(v))
+        elif v < 65536:
+            key |= 1 << shift
+            buf.extend(pack_ushort_le(v))
+        elif v < 16777216:
+            key |= 2 << shift
+            buf.extend(pack_uint_le(v)[:3])
+        else:
+            key |= 3 << shift
+            buf.extend(pack_uint_le(v))
+
+        count += 1
+        if count == 4:
+            #print bin(key), repr(buf)
+            dbfile.write(chr(key))
+            dbfile.write(buf.tostring())
+            count = 0
+            key = 0
+            del buf[0:len(buf)]
+
+    if count:
+        dbfile.write(chr(key))
+        dbfile.write(buf.tostring())
+
+
+def read_gints(dbfile, n):
+    """Read N integers from the bytes stream dbfile. Expects that the file
+    starts at a key byte.
+    """
+
+    count = 0
+    read = dbfile.read
+    for _ in xrange(n):
+        if count == 0:
+            key = ord(dbfile.read(1))
+        code = key >> (count * 2) & 3
+        if code == 0:
+            yield ord(read(1))
+        elif code == 1:
+            yield unpack_ushort_le(read(2))[0]
+        elif code == 2:
+            yield unpack_uint_le(read(3) + "\x00")[0]
+        else:
+            yield unpack_uint_le(read(4))[0]
+
+        count = (count + 1) % 4
+
+
+def index_gints(dbfile, n, factor):
+    """"Creates an index of skips over a list of g-ints. Returns a list of
+    the ordinal of the number at the start of each skip block, and a
+    corresponding list of the starting positions in the file of the block.
+    """
+
+    ords = array("i")
+    poses = array("i")
+    count = 0
+    pos = dbfile.tell()
+    while count < n:
+        key = ord(dbfile.read(1))
+        if not count % factor:
+            ords.append(count)
+            poses.append(pos)
+        delta = _gint_lens[key]
+        dbfile.seek(delta, 1)
+        pos += delta + 1
+        count += 4
+    return ords, poses
+
+
+def get_gint(dbfile, i, ords, poses):
+    """Given a byte stream, the ordinal position to get, and the skip lists
+    created by index_gints, returns the value of the i-th number in the
+    stream.
+    """
+
+    # Find which skip block the number is in
+    j = bisect_right(ords, i) - 1
+    # Ordinal position within the block: the ordinal position of the start of
+    # the block minus the ordinal position we're looking for
+    wbi = i - ords[j]
+    # Seek to the start of the block
+    dbfile.seek(poses[j])
+
+    # Skip as many 4-int sets as we can
+    while wbi // 4 >= 1:
+        key = ord(dbfile.read(1))
+        dbfile.seek(_gint_lens[key], 1)
+        wbi -= 4
+
+    # Read enough numbers serially to get the one we're looking for
+    for n in read_gints(dbfile, wbi + 1):
+        pass
+    # Return the last read number
+    return n
+
+
 # Fibonacci function
 
 _fib_cache = {}
     return re.compile(pattern, re.UNICODE | flags)
 
 
-try:
-    from abc import abstractmethod  # @UnusedImport
-except ImportError:
-    def abstractmethod(funcobj):
-        """A decorator indicating abstract methods.
-        """
-        funcobj.__isabstractmethod__ = True
-        return funcobj
 
 
+
+

File src/whoosh/writing.py

 import time
 from contextlib import contextmanager
 
+from whoosh.compat import abstractmethod
 from whoosh.store import LockError
-from whoosh.util import abstractmethod, synchronized
+from whoosh.util import synchronized
 
 
 # Exceptions

File tests/test_codecs.py

 from whoosh.codec.base import FileTermInfo
 from whoosh.codec import default_codec
 from whoosh.filedb.filestore import RamStorage
-from whoosh.filedb.fileindex import Segment
 from whoosh.support.testing import TempStorage
 from whoosh.util import byte_to_length, length_to_byte
 
 def _make_codec(**kwargs):
     st = RamStorage()
     codec = default_codec(**kwargs)
-    seg = Segment("test")
+    seg = codec.new_segment(st, "test")
     return st, codec, seg
 
 
 
 def test_stored_fields():
     codec = default_codec()
-    seg = Segment("test")
     fieldobj = fields.TEXT(stored=True)
     with TempStorage("storedfields") as st:
+        seg = codec.new_segment(st, "test")
+
         dw = codec.per_document_writer(st, seg)
         dw.start_doc(0)
         dw.add_field("a", fieldobj, "hello", 1)

File tests/test_indexing.py

 from datetime import datetime
 
 from whoosh import fields, query
-from whoosh.compat import u, xrange, text_type, PY3
+from whoosh.compat import u, xrange, text_type, PY3, permutations
 from whoosh.filedb.filestore import RamStorage
 from whoosh.filedb.filewriting import NO_MERGE
-from whoosh.util import length_to_byte, byte_to_length, permutations
+from whoosh.util import length_to_byte, byte_to_length
 from whoosh.writing import IndexingError
 from whoosh.support.testing import TempIndex
 

File tests/test_matching.py

 from nose.tools import assert_equal, assert_not_equal  # @UnresolvedImport
 
 from whoosh import fields, matching, query
-from whoosh.compat import u, xrange
+from whoosh.compat import u, xrange, permutations
 from whoosh.filedb.filestore import RamStorage
 from whoosh.query import And, Term
-from whoosh.util import make_binary_tree, permutations
+from whoosh.util import make_binary_tree
 
 
 def _keys(searcher, docnums):

File tests/test_mpwriter.py

 from nose.tools import assert_equal  # @UnresolvedImport
 
 from whoosh import fields, query
-from whoosh.compat import u, xrange
+from whoosh.compat import u, xrange, permutations
 from whoosh.support.testing import TempIndex, skip_if
-from whoosh.util import permutations, length_to_byte, byte_to_length
+from whoosh.util import length_to_byte, byte_to_length
 
 
 def no_multi():

File tests/test_postings.py

 from whoosh import analysis, fields
 from whoosh.compat import xrange, u
 from whoosh.codec import default_codec
-from whoosh.filedb.fileindex import Segment
 from whoosh.formats import (Characters, CharacterBoosts, Existence, Frequency,
                             Positions, PositionBoosts)
 from whoosh.support.testing import TempStorage
 def _roundtrip(content, format_, astype, ana=None):
     with TempStorage("roundtrip") as st:
         codec = default_codec()
-        seg = Segment("")
+        seg = codec.new_segment(st, "")
         ana = ana or analysis.StandardAnalyzer()
         field = fields.FieldType(format=format_, analyzer=ana)
 

File tests/test_results.py

 from nose.tools import assert_equal, assert_not_equal, assert_raises
 
 from whoosh import analysis, fields, formats, highlight, qparser, query
-from whoosh.compat import u, xrange, text_type
+from whoosh.compat import u, xrange, text_type, permutations
 from whoosh.filedb.filestore import RamStorage
-from whoosh.util import permutations
 
 
 def test_score_retrieval():

File tests/test_searching.py

 from nose.tools import assert_equal, assert_raises  # @UnresolvedImport
 
 from whoosh import analysis, fields, index, qparser, query, searching, scoring
-from whoosh.compat import u, xrange, text_type
+from whoosh.compat import u, xrange, text_type, permutations
 from whoosh.filedb.filestore import RamStorage
-from whoosh.util import permutations
 
 
 def make_index():

File tests/test_spans.py

 from nose.tools import assert_equal  # @UnresolvedImport
 
 from whoosh import analysis, fields, formats, spans
-from whoosh.compat import u, xrange
+from whoosh.compat import u, xrange, permutations
 from whoosh.filedb.filestore import RamStorage
 from whoosh.query import And, Or, Term, Phrase
-from whoosh.util import permutations
 
 
 domain = ("alfa", "bravo", "bravo", "charlie", "delta", "echo")

File tests/test_spelling.py

 from __future__ import with_statement
 import gzip
 
-from nose.tools import assert_equal, assert_not_equal, assert_raises  #@UnresolvedImport
+from nose.tools import assert_equal, assert_not_equal, assert_raises
 
 from whoosh import analysis, fields, highlight, spelling
-from whoosh.compat import u
+from whoosh.compat import u, permutations
 from whoosh.filedb.filestore import RamStorage
 from whoosh.qparser import QueryParser
 from whoosh.support import dawg
 from whoosh.support.testing import TempStorage
-from whoosh.util import permutations
 
 
 def words_to_corrector(words):

File tests/test_weightings.py

 from nose.tools import assert_equal  # @UnresolvedImport
 
 from whoosh import fields, query, scoring
-from whoosh.compat import u, xrange
+from whoosh.compat import u, xrange, permutations
 from whoosh.filedb.filestore import RamStorage
-from whoosh.util import permutations
 
 
 def _weighting_classes(ignore):