Commits

Matt Chaput committed a4b2068

Term index and vector files now use 16-bit field numbers instead of full field names.
Changed the architecture of the term index and term vector readers/writers.
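
In outline, each field name is interned to a 16-bit number on first use, and a term key becomes the packed number followed by the UTF-8 term text. A minimal standalone sketch of the scheme (the real coders live on the new classes in filetables.py below; the function names here are illustrative):

    from struct import pack, unpack

    fieldmap = {}   # fieldname -> 16-bit field number, assigned on first use

    def encode_termkey(fieldname, text):
        fieldnum = fieldmap.setdefault(fieldname, len(fieldmap))
        return pack("!H", fieldnum) + text.encode("utf-8")

    def decode_termkey(names, key):
        # names: list mapping each field number back to its field name
        return (names[unpack("!H", key[:2])[0]], key[2:].decode("utf-8"))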

  • Parent commits 01a8dd8


Files changed (7)

File benchmark/enron.py

 from __future__ import division
-from bz2 import compress, decompress
+from zlib import compress, decompress
 from email import message_from_string
 import os.path, tarfile
 from marshal import dump, load
     for d in get_cached_messages(cache):
         skipc -= 1
         if not skipc:
-            d["_stored_body"] = compress(d["body"])
+            d["_stored_body"] = compress(d["body"], 9)
             w.add_document(**d)
             skipc = skip
             c += 1
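
For reference, the benchmark now compresses stored message bodies with zlib at maximum compression instead of bz2, trading some compression ratio for much faster decompression. A minimal sketch of the round trip:

    from zlib import compress, decompress

    body = u"Message text".encode("utf-8")
    stored = compress(body, 9)        # level 9: smallest zlib output, slowest
    assert decompress(stored) == body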

File src/whoosh/filedb/filereading.py

 from threading import Lock
 
 from whoosh.filedb.filepostings import FilePostingReader
-from whoosh.filedb.filetables import (CodedOrderedReader, StoredFieldReader,
-                                      LengthReader, CodedHashReader)
-from whoosh.filedb import misc
+from whoosh.filedb.filetables import (TermIndexReader, StoredFieldReader,
+                                      LengthReader, TermVectorReader)
 from whoosh.matching import ExcludeMatcher
 from whoosh.reading import IndexReader, TermNotFound
 from whoosh.util import protected
         
         # Term index
         tf = storage.open_file(segment.termsindex_filename)
-        self.termsindex = CodedOrderedReader(tf, keycoder=misc.encode_termkey,
-                                             keydecoder=misc.decode_termkey,
-                                             valuedecoder=misc.decode_terminfo)
+        self.termsindex = TermIndexReader(tf)
         
         # Term postings file, vector index, and vector postings: lazy load
         self.postfile = None
         
         # Vector index
         vf = storage.open_file(segment.vectorindex_filename)
-        self.vectorindex = CodedHashReader(vf, keycoder=misc.encode_vectorkey,
-                                           keydecoder=misc.decode_vectorkey,
-                                           valuedecoder=misc.decode_vectoroffset)
+        self.vectorindex = TermVectorReader(vf)
         
         # Vector postings file
         self.vpostfile = storage.open_file(segment.vectorposts_filename,
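
Since the readers now own their key and value coders, constructing one takes only the file. A usage sketch mirroring the updated tests (the filename is illustrative):

    from whoosh.filedb.filestore import RamStorage
    from whoosh.filedb.filetables import TermIndexWriter, TermIndexReader

    st = RamStorage()
    tw = TermIndexWriter(st.create_file("demo.trm"))
    tw.add(("title", u"render"), (1.0, 0, 1))   # (weight, offset, postcount)
    tw.close()

    tr = TermIndexReader(st.open_file("demo.trm"))
    assert ("title", u"render") in tr
    assert tr.get(("title", u"render")) == (1.0, 0, 1)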

File src/whoosh/filedb/filetables.py

 from cPickle import loads, dumps
 from struct import Struct
 
-from whoosh.filedb.misc import enpickle, depickle
-from whoosh.system import _INT_SIZE, _LONG_SIZE
-from whoosh.util import byte_to_length
+from whoosh.system import (_INT_SIZE, _LONG_SIZE, pack_ushort, pack_uint,
+                           pack_long, unpack_ushort, unpack_uint, unpack_long)
+from whoosh.util import byte_to_length, utf8encode, utf8decode
 
 
+_4GB = 4 * 1024 * 1024 * 1024
+
 #def cdb_hash(key):
 #    h = 5381L
 #    for c in key:
     def close(self):
         self._write_hashes()
         dbfile = self.dbfile
-        index = self.index
         
-        dbfile.write_uint(len(index))
-        for pos in index:
-            dbfile.write_long(pos)
+        dbfile.write_uint(len(self.index))
+        for n in self.index:
+            dbfile.write_long(n)
         
         self._write_directory()
         self.dbfile.close()
 
 
 class CodedHashWriter(HashWriter):
-    def __init__(self, dbfile, keycoder=None, valuecoder=None):
+    # Abstract base class, subclass must implement keycoder and valuecoder
+    
+    def __init__(self, dbfile):
         sup = super(CodedHashWriter, self)
         sup.__init__(dbfile)
-        self.keycoder = keycoder or str
-        self.valuecoder = valuecoder or enpickle
 
         self._add = sup.add
         
         
 
 class CodedHashReader(HashReader):
-    def __init__(self, dbfile, keycoder=None, keydecoder=None,
-                 valuedecoder=None):
+    # Abstract base class; subclasses must implement keycoder, keydecoder,
+    # and valuedecoder
+    
+    def __init__(self, dbfile):
         sup = super(CodedHashReader, self)
         sup.__init__(dbfile)
-        self.keycoder = keycoder or str
-        self.keydecoder = keydecoder or int
-        self.valuedecoder = valuedecoder or depickle
 
         self._items = sup.items
         self._keys = sup.keys
 
 
 class CodedOrderedWriter(OrderedHashWriter):
-    def __init__(self, dbfile, keycoder=None, valuecoder=None):
+    # Abstract base class, subclasses must implement keycoder and valuecoder
+    
+    def __init__(self, dbfile):
         sup = super(CodedOrderedWriter, self)
         sup.__init__(dbfile)
-        self.keycoder = keycoder or str
-        self.valuecoder = valuecoder or enpickle
-
         self._add = sup.add
 
     def add(self, key, data):
 
 
 class CodedOrderedReader(OrderedHashReader):
-    def __init__(self, dbfile, keycoder=None, keydecoder=None,
-                 valuedecoder=None):
+    # Abstract base class, subclasses must implement keycoder, keydecoder,
+    # and valuedecoder
+    
+    def __init__(self, dbfile):
         sup = super(CodedOrderedReader, self)
         sup.__init__(dbfile)
-        self.keycoder = keycoder or str
-        self.keydecoder = keydecoder or int
-        self.valuedecoder = valuedecoder or depickle
 
         self._items = sup.items
         self._items_from = sup.items_from
         return self.valuedecoder(self._getitem(k))
 
     def __contains__(self, key):
-        return self._contains(self.keycoder(key))
+        try:
+            codedkey = self.keycoder(key)
+        except KeyError:
+            return False
+        return self._contains(codedkey)
 
     def get(self, key, default=None):
         k = self.keycoder(key)
             yield kd(k)
 
 
+# weight, offset, postcount
+_terminfo_struct0 = Struct("!BIB")
+_terminfo_struct1 = Struct("!fII")
+_terminfo_struct2 = Struct("!fqI")
+
+class TermIndexWriter(CodedOrderedWriter):
+    def __init__(self, dbfile):
+        super(TermIndexWriter, self).__init__(dbfile)
+        self.fieldcounter = 0
+        self.fieldmap = {}
+    
+    def keycoder(self, key):
+        # Encode term
+        fieldmap = self.fieldmap
+        fieldname, text = key
+        
+        if fieldname in fieldmap:
+            fieldnum = fieldmap[fieldname]
+        else:
+            fieldnum = self.fieldcounter
+            fieldmap[fieldname] = fieldnum
+            self.fieldcounter += 1
+        
+        key = pack_ushort(fieldnum) + utf8encode(text)[0]
+        return key
+    
+    def valuecoder(self, data):
+        # Encode term info
+        w, offset, df = data
+        if offset < _4GB:
+            iw = int(w)
+            if w == 1 and df == 1:
+                return pack_uint(offset)
+            elif w == iw and w <= 255 and df <= 255:
+                return _terminfo_struct0.pack(iw, offset, df)
+            else:
+                return _terminfo_struct1.pack(w, offset, df)
+        else:
+            return _terminfo_struct2.pack(w, offset, df)
+    
+    def close(self):
+        self._write_hashes()
+        dbfile = self.dbfile
+        
+        dbfile.write_uint(len(self.index))
+        for n in self.index:
+            dbfile.write_long(n)
+        dbfile.write_pickle(self.fieldmap)
+        
+        self._write_directory()
+        self.dbfile.close()
+
+
+class TermIndexReader(CodedOrderedReader):
+    def __init__(self, dbfile):
+        super(TermIndexReader, self).__init__(dbfile)
+        
+        dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
+        self.fieldmap = dbfile.read_pickle()
+        self.names = [None] * len(self.fieldmap)
+        for name, num in self.fieldmap.iteritems():
+            self.names[num] = name
+    
+    def keycoder(self, key):
+        return pack_ushort(self.fieldmap[key[0]]) + utf8encode(key[1])[0]
+        
+    def keydecoder(self, v):
+        return (self.names[unpack_ushort(v[:2])[0]], utf8decode(v[2:])[0])
+    
+    def valuedecoder(self, v):
+        if len(v) == _INT_SIZE:
+            return (1.0, unpack_uint(v)[0], 1)
+        elif len(v) == _terminfo_struct0.size:
+            return _terminfo_struct0.unpack(v)
+        elif len(v) == _terminfo_struct1.size:
+            return _terminfo_struct1.unpack(v)
+        else:
+            return _terminfo_struct2.unpack(v)
+    
+
+# docnum, fieldnum
+_vectorkey_struct = Struct("!IH")
+
+class TermVectorWriter(TermIndexWriter):
+    def keycoder(self, key):
+        fieldmap = self.fieldmap
+        docnum, fieldname = key
+        
+        if fieldname in fieldmap:
+            fieldnum = fieldmap[fieldname]
+        else:
+            fieldnum = self.fieldcounter
+            fieldmap[fieldname] = fieldnum
+            self.fieldcounter += 1
+        
+        return _vectorkey_struct.pack(docnum, fieldnum)
+    
+    def valuecoder(self, offset):
+        return pack_long(offset)
+        
+
+class TermVectorReader(TermIndexReader):
+    def keycoder(self, key):
+        return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]])
+        
+    def keydecoder(self, v):
+        docnum, fieldnum = _vectorkey_struct.unpack(v)
+        return (docnum, self.names[fieldnum])
+    
+    def valuedecoder(self, v):
+        return unpack_long(v)[0]
+    
+
 class LengthWriter(object):
     def __init__(self, dbfile, doccount, lengths=None):
         self.dbfile = dbfile
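
The terminfo value encoding picks the smallest of four fixed layouts, and the decoder dispatches on the encoded length. A standalone sketch of the same logic, with the expected sizes:

    from struct import pack

    _4GB = 4 * 1024 * 1024 * 1024

    def encode_terminfo(w, offset, df):
        # Mirrors TermIndexWriter.valuecoder above.
        if offset < _4GB:
            iw = int(w)
            if w == 1 and df == 1:
                return pack("!I", offset)            # 4 bytes: offset only
            elif w == iw and w <= 255 and df <= 255:
                return pack("!BIB", iw, offset, df)  # 6 bytes: small ints
            return pack("!fII", w, offset, df)       # 12 bytes: float weight
        return pack("!fqI", w, offset, df)           # 16 bytes: 64-bit offset

    assert len(encode_terminfo(1.0, 500, 1)) == 4
    assert len(encode_terminfo(3.0, 500, 3)) == 6
    assert len(encode_terminfo(2.5, 500, 3)) == 12
    assert len(encode_terminfo(2.5, 5 * _4GB, 3)) == 16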

File src/whoosh/filedb/filewriting.py

 from whoosh.fields import UnknownFieldError
 from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.filepostings import FilePostingWriter
-from whoosh.filedb.filetables import (StoredFieldWriter, CodedOrderedWriter,
-                                      CodedHashWriter)
-from whoosh.filedb import misc
+from whoosh.filedb.filetables import (TermIndexWriter, StoredFieldWriter,
+                                      TermVectorWriter)
 from whoosh.filedb.pools import TempfilePool
 from whoosh.reading import TermNotFound
 from whoosh.store import LockError
         
         # Terms index
         tf = self.storage.create_file(segment.termsindex_filename)
-        self.termsindex = CodedOrderedWriter(tf,
-                                             keycoder=misc.encode_termkey,
-                                             valuecoder=misc.encode_terminfo)
+        self.termsindex = TermIndexWriter(tf)
         
         # Term postings file
         pf = self.storage.create_file(segment.termposts_filename)
         if self.schema.has_vectored_fields():
             # Vector index
             vf = self.storage.create_file(segment.vectorindex_filename)
-            self.vectorindex = CodedHashWriter(vf,
-                                               keycoder=misc.encode_vectorkey,
-                                               valuecoder=misc.encode_vectoroffset)
+            self.vectorindex = TermVectorWriter(vf)
             
             # Vector posting file
             vpf = self.storage.create_file(segment.vectorposts_filename)
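
Vector keys get the same treatment: instead of pack_uint(docnum) plus the raw field name, a key is now a fixed six-byte pack of (docnum, fieldnum). A quick sketch:

    from struct import Struct

    _vectorkey = Struct("!IH")    # docnum, 16-bit field number
    key = _vectorkey.pack(4592, 0)
    assert len(key) == 6          # fixed size regardless of field name length
    assert _vectorkey.unpack(key) == (4592, 0)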

File src/whoosh/filedb/misc.py

-#===============================================================================
-# Copyright 2010 Matt Chaput
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#    http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-import struct
-from cPickle import loads, dumps
-from marshal import dumps as mdumps
-from marshal import loads as mloads
-
-from whoosh.system import (pack_uint, unpack_uint, pack_long, unpack_long,
-                           _INT_SIZE)
-from whoosh.util import utf8encode, utf8decode
-
-
-def encode_termkey(term):
-    fieldname, text = term
-    return "%s %s" % (utf8encode(fieldname)[0], utf8encode(text)[0])
-def decode_termkey(key):
-    fieldname, text = key.split(" ", 1)
-    return (utf8decode(fieldname)[0], utf8decode(text)[0])
-
-
-_terminfo_struct0 = struct.Struct("!BIB")
-_terminfo_struct1 = struct.Struct("!fqI") # weight, offset, postcount
-_4gb = 4 * 1024 * 1024 * 1024
-def encode_terminfo(w_off_df):
-    w, offset, df = w_off_df
-    if offset < _4gb:
-        iw = int(w)
-        if w == 1 and df == 1 :
-            return pack_uint(offset)
-        elif w == iw and w <= 255 and df <= 255:
-            return _terminfo_struct0.pack(iw, offset, df)
-    return _terminfo_struct1.pack(w, offset, df)
-def decode_terminfo(v):
-    if len(v) == _INT_SIZE:
-        return (1.0, unpack_uint(v)[0], 1)
-    elif len(v) == _INT_SIZE + 2:
-        return _terminfo_struct0.unpack(v)
-    else:
-        return _terminfo_struct1.unpack(v)
-
-
-def encode_vectorkey(docnum_and_fieldname):
-    docnum, fieldname = docnum_and_fieldname
-    return pack_uint(docnum) + fieldname
-
-def decode_vectorkey(key):
-    return unpack_uint(key[:_INT_SIZE]), key[_INT_SIZE:]
-
-encode_vectoroffset = pack_long
-decode_vectoroffset = lambda x: unpack_long(x)[0]
-
-
-encode_docnum = pack_uint
-decode_docnum = lambda x: unpack_uint(x)[0]
-
-enpickle = lambda data: dumps(data, -1)
-depickle = loads
-
-enmarshal = mdumps
-demarshal = mloads

File src/whoosh/searching.py

     
     >>> r = searcher.search(query.Term("content", "render"))
     >>> r[0]
-    Hit{title=u"Rendering the scene"}
+    <Hit {title=u"Rendering the scene"}>
     >>> r[0].docnum
     4592L
     >>> r[0].score

File tests/test_tables.py

 from os.path import exists
 from shutil import rmtree
 
-from whoosh.filedb.filestore import FileStorage
+from whoosh.filedb.filestore import FileStorage, RamStorage
 from whoosh.filedb.filetables import (HashReader, HashWriter,
                                       OrderedHashWriter, OrderedHashReader,
-                                      StoredFieldWriter, StoredFieldReader)
-from whoosh.filedb.misc import encode_termkey, decode_termkey
+                                      StoredFieldWriter, StoredFieldReader,
+                                      TermIndexWriter, TermIndexReader)
 
 
 class TestTables(unittest.TestCase):
                 pass
     
     def test_termkey(self):
-        term = ("alfa", u"bravo")
-        self.assertEqual(term, decode_termkey(encode_termkey(term)))
+        st = RamStorage()
+        tw = TermIndexWriter(st.create_file("test.trm"))
+        tw.add(("alfa", u"bravo"), (1.0, 2, 3))
+        tw.add((u"alfa", u"æï�ú"), (4.0, 5, 6))
+        tw.add((u"text", u"日本語"), (7.0, 8, 9))
+        tw.close()
         
-        term = ("text", u"hello there")
-        self.assertEqual(term, decode_termkey(encode_termkey(term)))
-        
-    def test_unicode_termkey(self):
-        term = (u"alfa", u"æï�ú")
-        self.assertEqual(term, decode_termkey(encode_termkey(term)))
-        
-        term = (u"text", u"日本語")
-        self.assertEqual(term, decode_termkey(encode_termkey(term)))
+        tr = TermIndexReader(st.open_file("test.trm"))
+        self.assertTrue(("alfa", u"bravo") in tr)
+        self.assertTrue((u"alfa", u"æï�ú") in tr)
+        self.assertTrue((u"text", u"日本語") in tr)
         
     def test_random_termkeys(self):
         def random_fieldname():
         def random_token():
             return "".join(unichr(random.randint(0, 0xd7ff)) for _ in xrange(1, 20))
         
-        for _ in xrange(1000):
-            term = (random_fieldname(), random_token())
-            self.assertEqual(term, decode_termkey(encode_termkey(term)), term)
-    
+        domain = sorted([(random_fieldname(), random_token()) for _ in xrange(1000)])
+        
+        st = RamStorage()
+        tw = TermIndexWriter(st.create_file("test.trm"))
+        for term in domain:
+            tw.add(term, (1.0, 0, 1))
+        tw.close()
+        
+        tr = TermIndexReader(st.open_file("test.trm"))
+        for term in domain:
+            self.assertTrue(term in tr)
+        
     def test_hash(self):
         st = self.make_storage("testindex")
         hwf = st.create_file("test.hsh")
         self.assertEqual(sfr[1], {"a": "one", "b": "two"})
         
         self.destroy_dir("testindex")
+        
+    def test_termindex(self):
+        terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"),
+                 ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")]
+        st = RamStorage()
+        
+        tw = TermIndexWriter(st.create_file("test.trm"))
+        for i, t in enumerate(terms):
+            tw.add(t, (1.0, i * 1000, 1))
+        tw.close()
+        
+        tr = TermIndexReader(st.open_file("test.trm"))
+        for i, (t1, t2) in enumerate(zip(tr.keys(), terms)):
+            self.assertEqual(t1, t2)
+            self.assertEqual(tr.get(t1), (1.0, i * 1000, 1))
+        
     
 
 if __name__ == '__main__':