Commits

Matt Chaput committed ce5438a

Fixed new sorting/faceting architecture so all tests pass.

Comments (0)

Files changed (7)

src/whoosh/filedb/filepostings.py

             self._write_block()
 
     def finish(self, inlinelimit=1):
+        assert isinstance(inlinelimit, integer_types)
         if self.block is None:
             raise Exception("Called finish() when not in a block")
 

src/whoosh/filedb/filewriting.py

 except ImportError:
     has_sqlite = False
 
-from whoosh.compat import iteritems, text_type
+from whoosh.compat import integer_types, iteritems, text_type
 from whoosh.fields import UnknownFieldError
 from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.filepostings import FilePostingWriter
         pf = self.storage.create_file(segment.termposts_filename)
         pw = FilePostingWriter(pf, blocklimit=blocklimit)
         # Terms writer
-        self.termswriter = TermsWriter(self.schema, ti, pw, self.dawg,
-                                       self.wordsets)
+        self.termswriter = TermsWriter(self.schema, ti, pw, self.dawg)
         
         if self.schema.has_vectored_fields():
             # Vector index
         
         # Posting lists with <= this number of postings will be inlined into
         # the terms index instead of being written to the posting file
+        assert isinstance(inlinelimit, integer_types)
         self.inlinelimit = inlinelimit
         
         self.spelling = False

src/whoosh/query.py

         reader = searcher.reader()
         
         if fieldname in (None, "", "*"):
+            # This takes into account deletions
             doclist = list(reader.all_doc_ids())
         elif reader.supports_caches() and reader.fieldcache_available(self.fieldname):
             # If the reader has a field cache, use it to quickly get the list
         return ListMatcher(doclist, all_weights=self.boost)
 
 
-
-
 class _NullQuery(Query):
     "Represents a query that won't match anything."
     

src/whoosh/searching.py

         self.timer = None
         self.timedout = True
     
-    def collect(self, docid):
+    def collect(self, id, offsetid):
         docset = self.docset
         if docset is not None:
-            docset.add(docid)
+            docset.add(offsetid)
         
         if self.facets is not None:
             groups = self.groups
             for name, catter in self.categorizers.items():
-                key = catter.key_for_id(docid)
+                key = catter.key_for_id(id)
                 if self.groupids:
                     if name not in groups:
                         groups[name] = defaultdict(list)
-                    groups[name][key].append(docid)
+                    groups[name][key].append(offsetid)
                 else:
                     if name not in groups:
                         groups[name] = defaultdict(int)
         self._reset()
         self._set_timer()
         
-        facet = sorting.MultiFacet(sortedby)
+        facet = sorting.MultiFacet.from_sortedby(sortedby)
         catter = facet.categorizer(searcher)
         keyfn = catter.key_for_matcher
         t = now()
             id = matcher.id()
             offsetid = id + offset
             
-            if allow and offsetid not in allow:
-                continue
-            if restrict and offsetid in restrict:
-                continue
-            
-            collect(offsetid)
-            if scorefn:
-                score = scorefn(matcher)
-            else:
-                score = matcher.score()
-            yield (score, offsetid)
+            # Check whether the document is filtered
+            if ((not allow or offsetid in allow)
+                and (not restrict or offsetid not in restrict)):
+                # Collect and yield this document
+                collect(id, offsetid)
+                if scorefn:
+                    score = scorefn(matcher)
+                else:
+                    score = matcher.score()
+                yield (score, offsetid)
             
             # Check whether the time limit expired
             if self.timedout:

src/whoosh/sorting.py

             score = matcher.score()
             if self.use_final:
                 score = self.final(self.searcher, matcher.id(), score)
-            return score
+            # Negate the score so higher values sort first
+            return 0 - score
 
 
 class FunctionFacet(FacetType):
             return self.array[docid + self.docoffset]
 
 
-class QueryFacet(object):
+class QueryFacet(FacetType):
     def __init__(self, querydict, other="none"):
         self.querydict = querydict
         self.other = other
     
     def categorizer(self, searcher):
-        return self.QueryCategorizer(searcher, self.querydict, self.other)
+        return self.QueryCategorizer(self.querydict, self.other)
     
     class QueryCategorizer(Categorizer):
-        def __init__(self, searcher, querydict, other):
-            self.docsets = dict((qname, set(q.docs(searcher)))
-                                for qname, q in querydict)
+        def __init__(self, querydict, other):
+            self.querydict = querydict
             self.other = other
+            
+        def set_searcher(self, searcher, offset):
+            self.docsets = {}
+            for qname, q in self.querydict.items():
+                docset = set(q.docs(searcher))
+                self.docsets[qname] = docset
+            self.offset = offset
         
         def key_for_id(self, docid):
-            for qname, docset in enumerate(self.docsets):
-                if docid in docset:
+            if docid > 0: raise Exception
+            print "docid=", docid, "docsets=", self.docsets
+            for qname in self.docsets:
+                if docid in self.docsets[qname]:
                     return qname
             return self.other
 
 
 class MultiFacet(FacetType):
-    def __init__(self, *items):
-        self.facets = list(items)
-    
+    def __init__(self, items=None):
+        self.facets = []
+        if items:
+            for item in items:
+                self._add(item)
+            
     @classmethod
     def from_sortedby(cls, sortedby):
         multi = cls()
-        def _add(item):
-            if isinstance(sortedby, FacetType):
-                multi.add_facet(sortedby)
-            elif isinstance(sortedby, string_type):
-                multi.add_field(sortedby)
-            else:
-                raise Exception("Don't know what to do with facet %r" % item)
-        
         if isinstance(sortedby, (list, tuple)) or hasattr(sortedby, "__iter__"):
             for item in sortedby:
-                _add(item)
+                multi._add(item)
         else:
-            _add(sortedby)
-        
+            multi._add(sortedby)
         return multi
     
+    def _add(self, item):
+        if isinstance(item, FacetType):
+            self.add_facet(item)
+        elif isinstance(item, string_type):
+            self.add_field(item)
+        else:
+            raise Exception("Don't know what to do with facet %r" % (item, ))
+    
     def add_field(self, fieldname, reverse=False):
         self.facets.append(FieldFacet(fieldname, reverse=reverse))
         return self

tests/test_classify.py

             docnum = s.document_number(id=u("1"))
             r = s.more_like(docnum, "text", **kwargs)
             assert_equal([hit["id"] for hit in r], ["6", "2", "3"])
-            
+    
     schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True))
     _check(schema)
 

tests/test_sorting.py

 
 from nose.tools import assert_equal  #@UnresolvedImport
 
-from whoosh import fields, query
+from whoosh import fields, query, sorting
 from whoosh.compat import u, xrange, long_type
 from whoosh.filedb.filestore import RamStorage
 from whoosh.support.testing import skip_if_unavailable, skip_if, TempIndex
     try_sort("id",  lambda d: d["id"], limit=5, reverse=True)
 
 def test_multisort():
-    try_sort(("tag", "id"), lambda d: (d["tag"], d["id"]))
-    try_sort(("tag", "id"), lambda d: (d["tag"], d["id"]), reverse=True)
-    try_sort(("tag", "id"), lambda d: (d["tag"], d["id"]), limit=5)
-    try_sort(("tag", "id"), lambda d: (d["tag"], d["id"]), reverse=True, limit=5)
+    mf = sorting.MultiFacet(["tag", "id"])
+    try_sort(mf, lambda d: (d["tag"], d["id"]))
+    try_sort(mf, lambda d: (d["tag"], d["id"]), reverse=True)
+    try_sort(mf, lambda d: (d["tag"], d["id"]), limit=5)
+    try_sort(mf, lambda d: (d["tag"], d["id"]), reverse=True, limit=5)
 
 def test_numeric():
     try_sort("num", lambda d: d["num"])
             assert_equal(len(rp), 0)
             assert rp.is_last_page()
 
+def test_score_facet():
+    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT, c=fields.ID)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c"))
+    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c"))
+    w.commit()
+    w = ix.writer()
+    w.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c"))
+    w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c"))
+    w.commit(merge=False)
+    w = ix.writer()
+    w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
+    w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
+    w.commit(merge=False)
+    
+    with ix.searcher() as s:
+        facet = sorting.MultiFacet(["b", sorting.ScoreFacet()])
+        r = s.search(q=query.Term("a", u("alfa")), sortedby=facet)
+        assert_equal([h["id"] for h in r], [6, 4, 5, 2, 1, 3])
+
+def test_function_facet():
+    schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    domain = ("alfa", "bravo", "charlie")
+    count = 1
+    for w1 in domain:
+        for w2 in domain:
+            for w3 in domain:
+                for w4 in domain:
+                    w.add_document(id=count, text=u(" ").join((w1, w2, w3, w4)))
+                    count += 1
+    w.commit()
+    
+    def fn(searcher, docnum):
+        v = dict(searcher.vector_as("frequency", docnum, "text"))
+        # Give high score to documents that have equal number of "alfa"
+        # and "bravo". Negate value so higher values sort first
+        return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0))
+    
+    with ix.searcher() as s:
+        q = query.And([query.Term("text", u("alfa")), query.Term("text", u("bravo"))])
+        
+        fnfacet = sorting.FunctionFacet(fn)
+        r = s.search(q, sortedby=fnfacet)
+        texts = [hit["text"] for hit in r]
+        for t in texts[:10]:
+            tks = t.split()
+            assert_equal(tks.count("alfa"), tks.count("bravo"))
+
+def test_numeric_field_facet():
+    schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC, v2=fields.NUMERIC)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=1, v1=2, v2=100)
+    w.add_document(id=2, v1=1, v2=50)
+    w.commit()
+    w = ix.writer()
+    w.add_document(id=3, v1=2, v2=200)
+    w.add_document(id=4, v1=1, v2=100)
+    w.commit()
+    w = ix.writer(merge=False)
+    w.add_document(id=5, v1=2, v2=50)
+    w.add_document(id=6, v1=1, v2=200)
+    w.commit()
+    
+    with ix.searcher() as s:
+        mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True)
+        r = s.search(query.Every(), sortedby=mf)
+        assert_equal([hit["id"] for hit in r], [6, 4, 2, 3, 1, 5])
+
+def test_query_facet():
+    schema = fields.Schema(id=fields.STORED, v=fields.ID)
+    ix = RamStorage().create_index(schema)
+    for i, ltr in enumerate(u("iacgbehdf")):
+        w = ix.writer()
+        w.add_document(id=i, v=ltr)
+        w.commit(merge=False)
+    
+    with ix.searcher() as s:
+        q1 = query.TermRange("v", "a", "c")
+        q2 = query.TermRange("v", "d", "f")
+        q3 = query.TermRange("v", "g", "i")
+        
+        assert_equal([hit["id"] for hit in s.search(q1)], [1, 2, 4])
+        assert_equal([hit["id"] for hit in s.search(q2)], [5, 7, 8])
+        assert_equal([hit["id"] for hit in s.search(q3)], [0, 3, 6])
+        
+        facet = sorting.QueryFacet({"a-c": q1, "d-f": q2, "g-i": q3})
+        r = s.search(query.Every(), groupedby=facet)
+        # If you specify a facet withou a name, it's automatically called
+        # "facet"
+        print r.groups("facet")
+        assert_equal(r.groups("facet"), {"a-c": [1, 2, 4],
+                                         "d-f": [5, 7, 8],
+                                         "g-i": [0, 3, 6]})
+
 @skip_if_unavailable("multiprocessing")
 @skip_if(lambda: True)
 def test_mp_fieldcache():
             
             from whoosh.sorting import MultiFacet
             
-            facet = MultiFacet("tag", "size")
+            facet = MultiFacet(["tag", "size"])
             r = s.search(query.Every(), groupedby={"tag/size" : facet})
             cats = r.groups(("tag/size"))
             assert_equal(cats, correct)
             cs = s.sorter()
             cs.add_field("price")
             cs.add_field("quant", reverse=True)
-            print("crit=", cs.criteria)
-            print("is_simple=", cs.is_simple())
             r = cs.sort_query(query.Every(), limit=None)
             assert_equal([hit["name"] for hit in r], list(u("DCAFBE")))
             
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j or Alt+j (next) and Ctrl+k or Alt+k (previous), and view the selected file with Ctrl+o or Alt+o.