Commits

Matt Chaput  committed 553a1e7

Added a filter argument to Searcher.more_like() and fixed a performance problem in it.

Split filter argument to Collector.search() into two arguments, allow and restrict, and check them separately. This is more efficient for large collections than e.g. making a filter with all but one document turned on.
Added mask argument to Searcher.search() which corresponds to restrict on Collector.search().

Fixes issues #138, #139.

  • Participants
  • Parent commits a39b081

Comments (0)

Files changed (2)

File src/whoosh/searching.py

         return expander.expanded_terms(numterms, normalize=normalize)
 
     def more_like(self, docnum, fieldname, text=None, top=10, numterms=5,
-                  model=classify.Bo1Model, normalize=False):
+                  model=classify.Bo1Model, normalize=False, filter=None):
         """Returns a :class:`Results` object containing documents similar to
         the given document, based on "key terms" in the given field::
         
         :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use
             to compute "key terms".
         :param normalize: whether to normalize term weights.
+        :param filter: a query, Results object, or set of docnums. The results
+            will only contain documents that are also in the filter object.
         """
         
         if text:
         q = query.Or([query.Term(fieldname, word, boost=weight)
                       for word, weight in kts])
         
-        # Filter the original document out of the results using a bit vector
-        # with every bit set except the one for this document
-        size = self.doc_count_all()
-        comb = BitVector(size, [n for n in xrange(self.doc_count_all())
-                                if n != docnum])
-        return self.search(q, limit=top, filter=comb)
+        return self.search(q, limit=top, filter=filter, mask=set([docnum]))
 
     def search_page(self, query, pagenum, pagelen=10, **kwargs):
         """This method is like the :meth:`Searcher.search` method, but returns
         return groups
     
     def search(self, q, limit=10, sortedby=None, reverse=False, groupedby=None,
-               optimize=True, scored=True, filter=None, collector=None):
+               optimize=True, scored=True, filter=None, mask=None,
+               collector=None):
         """Runs the query represented by the ``q`` object and returns a
         Results object.
         
             collect the found documents.
         :param filter: a query, Results object, or set of docnums. The results
             will only contain documents that are also in the filter object.
+        :param mask: a query, Results object, or set of docnums. The
+            results will not contain documents that are also in the mask
+            object.
         :rtype: :class:`Results`
         """
 
             collector.scored = scored
             collector.reverse = reverse
         
-        return collector.search(self, q, filter=filter)
+        return collector.search(self, q, allow=filter, restrict=mask)
         
 
 class Collector(object):
         self.timesup = False
         self.timer = None
     
-    def search(self, searcher, q, filter=None):
+    def search(self, searcher, q, allow=None, restrict=None):
         """Top-level method call which uses the given :class:`Searcher` and
         :class:`whoosh.query.Query` objects to return a :class:`Results`
         object.
         if self.limit and self.limit > searcher.doc_count_all():
             self.limit = None
         
-        self._comb = None
-        if filter:
-            self.add_filter(filter)
-        
+        self._allow = None
+        self._restrict = None
+        if allow:
+            self._allow = self._searcher._filter_to_comb(allow)
+        if restrict:
+            self._restrict = self._searcher._filter_to_comb(restrict)
+            
         if self.timelimit:
             self.timer = threading.Timer(self.timelimit, self._timestop)
             self.timer.start()
         # flag inside the add_(all|top)_matches loops.
         self.timesup = True
     
-    def add_filter(self, obj):
-        c = self._searcher._filter_to_comb(obj)
-        if self._comb is None:
-            self._comb = set()
-        self._comb |= c
-    
     def add_searcher(self, searcher, q):
         """Adds the documents from the given searcher with the given query to
         the collector. This is called by the :meth:`Collector.search` method.
         items = self._items
         usequality = self.usequality
         score = self.score
-        comb = self._comb
+        allow = self._allow
+        restrict = self._restrict
         timelimited = bool(self.timelimit)
         greedy = self.greedy
         
             if timelimited and not greedy and self.timesup:
                 raise TimeLimit
             
-            if comb and offsetid not in comb:
+            if allow and offsetid not in allow:
+                continue
+            if restrict and offsetid in restrict:
                 continue
             
             if len(items) < limit:
         items = self._items
         scored = self.scored
         score = self.score
-        comb = self._comb
+        allow = self._allow
+        restrict = self._restrict
         timelimited = bool(self.timelimit)
         greedy = self.greedy
         reverse = self.reverse
             if timelimited and not greedy and self.timesup:
                 raise TimeLimit
             
-            if comb and offsetid not in comb:
+            if allow and offsetid not in allow:
+                continue
+            if restrict and offsetid in restrict:
                 continue
             
             if keyfns:
                                        formatter=formatter, order=order)
     
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
-                       model=classify.Bo1Model, normalize=True):
+                       model=classify.Bo1Model, normalize=True, filter=None):
         """Returns a new Results object containing documents similar to this
         hit, based on "key terms" in the given field::
         
         
         return self.searcher.more_like(self.docnum, fieldname, text=text,
                                        top=top, numterms=numterms, model=model,
-                                       normalize=normalize)
+                                       normalize=normalize, filter=filter)
     
     def __repr__(self):
         return "<%s %r>" % (self.__class__.__name__, self.fields())

File tests/test_classify.py

     schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
     _check(schema, text=docs[0])
 
+def test_more_like():
+    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=u"1", text=u"alfa bravo charlie")
+    w.add_document(id=u"2", text=u"bravo charlie delta")
+    w.add_document(id=u"3", text=u"echo")
+    w.add_document(id=u"4", text=u"delta echo foxtrot")
+    w.add_document(id=u"5", text=u"echo echo echo")
+    w.add_document(id=u"6", text=u"foxtrot golf hotel")
+    w.add_document(id=u"7", text=u"golf hotel india")
+    w.commit()
+    
+    with ix.searcher() as s:
+        docnum = s.document_number(id="3")
+        r = s.more_like(docnum, "text")
+        assert_equal([hit["id"] for hit in r], ["5", "4"])
+        
 
 
 
 
+
+
+
+
+
+
+
+
+