Commits

Matt Chaput committed 12ae1ad

Speed improvements for overlapping facets.
OverlappingFieldCategorizer now uses term vectors if available.
StoredFieldFacet/Categorizer now support allow_overlap by splitting the stored value.
Added vector keyword argument to KEYWORD field type.
Raise KeyError instead of generic Exception when a given document does not have a term vector for a given field.
Added stress testing code for faceting a large tag collection.
See issue #185.

Comments (0)

Files changed (5)

                          scorable=bool, unique=bool, field_boost=float)
 
     def __init__(self, stored=False, lowercase=False, commas=False,
-                 scorable=False, unique=False, field_boost=1.0,
+                 vector=None, scorable=False, unique=False, field_boost=1.0,
                  spelling=False):
         """
         :param stored: Whether to store the value of the field with the
         self.unique = unique
         self.spelling = spelling
 
+        if vector:
+            if type(vector) is type:
+                vector = vector()
+            elif isinstance(vector, formats.Format):
+                pass
+            else:
+                vector = self.format
+        else:
+            vector = None
+        self.vector = vector
+
 
 class TEXT(FieldType):
     """Configured field type for text fields (for example, the body text of an

src/whoosh/filedb/filereading.py

             raise Exception("No vectors are stored for field %r" % fieldname)
 
         self._open_vectors()
-        offset = self.vectorindex.get((docnum, fieldname))
-        if offset is None:
-            raise Exception("No vector found for document"
-                            " %s field %r" % (docnum, fieldname))
+        try:
+            # NOTE: must use subscript lookup here -- dict.get() returns None
+            # instead of raising KeyError, which would defeat this handler.
+            offset = self.vectorindex[(docnum, fieldname)]
+        except KeyError:
+            raise KeyError("No vector found for document "
+                           "%s field %r" % (docnum, fieldname))
 
         return FilePostingReader(self.vpostfile, offset, vformat,
                                  stringids=True)
 import os.path
 import sys
 
-from whoosh import fields, store
-
 
 _DEF_INDEX_NAME = "MAIN"
 

src/whoosh/sorting.py

 
         def __init__(self, fieldname):
             self.fieldname = fieldname
+            self.use_vectors = False
 
         def set_searcher(self, searcher, docoffset):
             fieldname = self.fieldname
             field = searcher.schema[fieldname]
             reader = searcher.reader()
 
-            self.lists = [[] for _ in xrange(dc)]
-            for t, _ in field.sortable_values(reader, fieldname):
-                postings = reader.postings(fieldname, t)
-                for docid in postings.all_ids():
-                    self.lists[docid].append(t)
+            if field.vector:
+                # If the field was indexed with term vectors, use the vectors
+                # to get the list of values in each matched document
+                self.use_vectors = True
+                self.searcher = searcher
+            else:
+                # Otherwise, cache the values in each document in a huge list
+                # of lists
+                self.use_vectors = False
+                self.lists = [[] for _ in xrange(dc)]
+                for t, _ in field.sortable_values(reader, fieldname):
+                    postings = reader.postings(fieldname, t)
+                    for docid in postings.all_ids():
+                        self.lists[docid].append(t)
 
         def keys_for_id(self, docid):
-            return self.lists[docid] or None
+            if self.use_vectors:
+                try:
+                    v = self.searcher.vector(docid, self.fieldname)
+                    return list(v.all_ids())
+                except KeyError:
+                    return None
+            else:
+                return self.lists[docid] or None
 
         def key_for_id(self, docid):
-            ls = self.lists[docid]
-            if ls:
-                return ls[0]
+            if self.use_vectors:
+                try:
+                    v = self.searcher.vector(docid, self.fieldname)
+                    return v.id()
+                except KeyError:
+                    return None
             else:
-                return None
+                ls = self.lists[docid]
+                if ls:
+                    return ls[0]
+                else:
+                    return None
 
 
 class QueryFacet(FacetType):
 
 class StoredFieldFacet(FacetType):
     """Lets you sort/group using the value in an unindexed, stored field (e.g.
-    STORED). This is slower than using an indexed field.
+    STORED). This is usually slower than using an indexed field.
+    
+    For fields where the stored value is a space-separated list of keywords,
+    (e.g. ``"tag1 tag2 tag3"``), you can use the ``allow_overlap`` keyword
+    argument to allow overlapped faceting on the result of calling the
+    ``split()`` method on the field value (or calling a custom split function
+    if one is supplied).
     """
 
-    def __init__(self, fieldname):
+    def __init__(self, fieldname, allow_overlap=False, split_fn=None):
+        """
+        :param fieldname: the name of the stored field.
+        :param allow_overlap: if True, when grouping, allow documents to appear
+            in multiple groups when they have multiple terms in the field. The
+            categorizer uses ``string.split()`` or the custom ``split_fn`` to
+            convert the stored value into a list of facet values.
+        :param split_fn: a custom function to split a stored field value into
+            facet values. If not supplied, the categorizer simply calls the
+            value's ``split()`` method.
+        """
+
         self.fieldname = fieldname
+        self.allow_overlap = allow_overlap
+        self.split_fn = split_fn
 
     def categorizer(self, searcher):
-        return self.StoredFieldCategorizer(self.fieldname)
+        return self.StoredFieldCategorizer(self.fieldname, self.allow_overlap,
+                                           self.split_fn)
 
     class StoredFieldCategorizer(Categorizer):
-        def __init__(self, fieldname):
+        def __init__(self, fieldname, allow_overlap, split_fn):
             self.fieldname = fieldname
+            self.allow_overlap = allow_overlap
+            self.split_fn = split_fn
 
         def set_searcher(self, searcher, docoffset):
             self.searcher = searcher
 
+        def keys_for_id(self, docid):
+            value = self.searcher.stored_fields(docid).get(self.fieldname)
+            if self.split_fn:
+                return self.split_fn(value)
+            else:
+                return value.split()
+
         def key_for_id(self, docid):
             fields = self.searcher.stored_fields(docid)
-            return fields[self.fieldname]
+            return fields.get(self.fieldname)
 
 
 class MultiFacet(FacetType):

stress/test_bigfacet.py

+from __future__ import with_statement
+import os.path, random, string
+import sqlite3 as sqlite
+
+from whoosh import fields, formats, index, query, sorting
+from whoosh.util import now
+
+
+tagcount = 100
+doccount = 500000
+dirname = "tagindex"
+
+schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence()))
+
+if not os.path.exists(dirname):
+    os.mkdir(dirname)
+
+reindex = False
+if reindex or not index.exists_in(dirname):
+    tags = []
+    for _ in xrange(tagcount):
+        tag = u"".join(random.choice(string.ascii_lowercase) for _ in xrange(5))
+        tags.append(tag)
+
+    ix = index.create_in(dirname, schema)
+    t = now()
+    with ix.writer() as w:
+        for i in xrange(doccount):
+            doc = u" ".join(random.sample(tags, random.randint(10, 20)))
+            w.add_document(tags=doc)
+            if not i % 10000:
+                print i
+    print now() - t
+
+
+ix = index.open_dir(dirname)
+with ix.searcher() as s:
+    tags = list(s.lexicon("tags"))
+    facet = sorting.FieldFacet("tags", allow_overlap=True)
+    qtag = random.choice(tags)
+    print "tag=", qtag
+    q = query.Term("tags", qtag)
+    r = s.search(q, groupedby={"tags": facet})
+    print r.runtime
+
+    facet = sorting.StoredFieldFacet("tags", allow_overlap=True)
+    r = s.search(q, groupedby={"tags": facet})
+    print r.runtime
+
+