Commits

Matt Chaput  committed 04b5b1c

Added Searcher.more_like() and Hit.more_like_this() methods to make it easier to find documents similar to a given document based on its key terms.
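
For orientation, a quick sketch of the new calls (assuming an index ix with path, title, and content fields; myquery stands in for any query object):

    with ix.searcher() as searcher:
        # Find documents similar to a document you already know
        docnum = searcher.document_number(path=u"/a/b/c")
        for hit in searcher.more_like(docnum, "content"):
            print hit["title"]

        # Or hop from a search hit to similar documents
        for hit in searcher.search(myquery):
            for subhit in hit.more_like_this("content", top=3):
                print "  ", subhit["title"]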

  • Parent commits beb11f6
  • Branches morelikethis

Files changed (5)

File src/whoosh/classify.py

         
         :param number: The number of terms to return.
         :param normalize: Whether to normalize the weights.
-        :*returns*: A list of ("term", weight) tuples.
+        :returns: A list of ("term", weight) tuples.
         """
         
         model = self.model
         else:
             norm = maxweight
         tlist = [(weight / norm, t) for weight, t in tlist]
-        tlist.sort(reverse=True)
+        tlist.sort(key=lambda x: (0 - x[0], x[1]))
         
         return [(t, weight) for weight, t in tlist[:number]]
 

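The new sort key keeps the weights in descending order but breaks ties alphabetically by term, where reverse=True broke ties reverse-alphabetically; a standalone illustration with made-up weights:

    tlist = [(0.5, "bravo"), (0.5, "alfa"), (0.9, "charlie")]
    tlist.sort(key=lambda x: (0 - x[0], x[1]))
    # -> [(0.9, 'charlie'), (0.5, 'alfa'), (0.5, 'bravo')]
    # reverse=True would have ordered the tied pair 'bravo', 'alfa'
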
File src/whoosh/filedb/filereading.py

 
     @protected
     def has_vector(self, docnum, fieldname):
-        self._open_vectors()
-        return (docnum, fieldname) in self.vectorindex
+        if self.schema[fieldname].vector:
+            self._open_vectors()
+            return (docnum, fieldname) in self.vectorindex
+        else:
+            return False
 
     @protected
     def __iter__(self):
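
With the schema check in place, has_vector() no longer calls _open_vectors() unconditionally, so asking about an unvectored field simply returns False. A throwaway sketch against a RamStorage index (field name illustrative):

    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(title=fields.TEXT)  # no vectored fields
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(title=u"alfa bravo")
    w.commit()

    with ix.searcher() as s:
        # With the old code this would try to open vector files that
        # were never created; now it checks the schema and returns False
        print s.reader().has_vector(0, "title")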

File src/whoosh/query.py

         
         return ListMatcher(doclist, all_weights=self.boost)
 
-            
+
 class NullQuery(Query):
     "Represents a query that won't match anything."
     def __call__(self):

File src/whoosh/searching.py

 import copy
 import threading
 from collections import defaultdict
-from heapq import nlargest, nsmallest, heappush, heapreplace
+from heapq import heappush, heapreplace
 from math import ceil
 
 from whoosh import classify, highlight, query, scoring
 from whoosh.reading import TermNotFound
-from whoosh.support.bitvector import BitSet
+from whoosh.support.bitvector import BitSet, BitVector
 from whoosh.util import now, lru_cache
 
 
     def _filter_to_comb(self, obj):
         if obj is None:
             return None
-        if isinstance(obj, set):
+        if isinstance(obj, (set, BitVector, BitSet)):
             c = obj
         elif isinstance(obj, Results):
             c = obj.docset
         expander.add_text(text)
         return expander.expanded_terms(numterms, normalize=normalize)
 
+    def more_like(self, docnum, fieldname, text=None, top=10, numterms=5,
+                  model=classify.Bo1Model, normalize=False):
+        """Returns a :class:`Results` object containing documents similar to
+        the given document, based on "key terms" in the given field::
+        
+            # Get the document number of the document you're interested in
+            docnum = searcher.document_number(path=u"/a/b/c")
+            
+            r = searcher.more_like(docnum, "content")
+        
+            print "Documents like", searcher.stored_fields(docnum)["title"]
+            for hit in r:
+                print hit["title"]
+        
+        :param docnum: the document number of the document to match against.
+        :param fieldname: the name of the field to use to test similarity.
+        :param text: by default, the method will attempt to load the contents
+            of the field from the stored fields for the document, or from a
+            term vector. If the field isn't stored or vectored in the index,
+            but you have access to the text another way (for example, loading
+            from a file or a database), you can supply it using the ``text``
+            parameter.
+        :param top: the number of results to return.
+        :param numterms: the number of "key terms" to extract from the
+            document and search for. Using more terms is slower but can
+            return more (and potentially more accurate) results.
+        :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use
+            to compute "key terms".
+        :param normalize: whether to normalize term weights.
+        """
+        
+        if text:
+            kts = self.key_terms_from_text(fieldname, text, numterms=numterms,
+                                           model=model, normalize=normalize)
+        else:
+            kts = self.key_terms([docnum], fieldname, numterms=numterms,
+                                 model=model, normalize=normalize)
+        # Create an Or query from the key terms
+        q = query.Or([query.Term(fieldname, word, boost=weight)
+                      for word, weight in kts])
+        
+        # Filter the original document out of the results using a bit vector
+        # with every bit set except the one for this document
+        size = self.doc_count_all()
+        comb = BitVector(size, [n for n in xrange(size) if n != docnum])
+        return self.search(q, limit=top, filter=comb)
+
     def search_page(self, query, pagenum, pagelen=10, **kwargs):
         if pagenum < 1:
             raise ValueError("pagenum must be >= 1")
                                        top=top, fragmenter=fragmenter,
                                        formatter=formatter, order=order)
     
+    def more_like_this(self, fieldname, text=None, top=10, numterms=5,
+                       model=classify.Bo1Model, normalize=True):
+        """Returns a new Results object containing documents similar to this
+        hit, based on "key terms" in the given field::
+        
+            r = searcher.search(myquery)
+            for hit in r:
+                print hit["title"]
+                print "Top 3 similar documents:"
+                for subhit in hit.more_like_this("content", top=3):
+                    print "  ", subhit["title"]
+
+        :param fieldname: the name of the field to use to test similarity.
+        :param text: by default, the method will attempt to load the contents
+            of the field from the stored fields for the document, or from a
+            term vector. If the field isn't stored or vectored in the index,
+            but you have access to the text another way (for example, loading
+            from a file or a database), you can supply it using the ``text``
+            parameter.
+        :param top: the number of results to return.
+        :param numterms: the number of "key terms" to extract from the hit
+            and search for. Using more terms is slower but can return more
+            (and potentially more accurate) results.
+        :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use
+            to compute "key terms".
+        :param normalize: whether to normalize term weights.
+        """
+        
+        return self.searcher.more_like(self.docnum, fieldname, text=text,
+                                       top=top, numterms=numterms,
+                                       model=model, normalize=normalize)
+    
     def __repr__(self):
         return "<%s %r>" % (self.__class__.__name__, self.fields())
     
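A note on the filter plumbing: _filter_to_comb now accepts BitVector and BitSet alongside plain sets, which is what lets more_like() hand its exclusion vector straight to search(). User code can get the same effect with an ordinary set (a sketch; myquery and the excluded docnum are placeholders):

    exclude = 3
    allowed = set(n for n in xrange(searcher.doc_count_all())
                  if n != exclude)
    results = searcher.search(myquery, filter=allowed)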

File tests/test_results.py

             r = s.search(q, limit=3)
             self.assertEqual(len(r), count)
 
+    def test_more_like_this(self):
+        docs = [u"alfa bravo charlie delta echo foxtrot golf",
+                u"delta echo foxtrot golf hotel india juliet",
+                u"echo foxtrot golf hotel india juliet kilo",
+                u"foxtrot golf hotel india juliet kilo lima",
+                u"golf hotel india juliet kilo lima mike",
+                u"foxtrot golf hotel india alfa bravo charlie"]
+        
+        def _check(schema, **kwargs):
+            ix = RamStorage().create_index(schema)
+            with ix.writer() as w:
+                for i, text in enumerate(docs):
+                    w.add_document(id=unicode(i + 1), text=text)
+    
+            with ix.searcher() as s:
+                docnum = s.document_number(id=u"1")
+                r = s.more_like(docnum, "text", **kwargs)
+                self.assertEqual([hit["id"] for hit in r], ["6", "2", "3"])
+                
+        schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True))
+        _check(schema)
 
-
-
-
-
+        ana = analysis.StandardAnalyzer()
+        schema = fields.Schema(id=fields.ID(stored=True),
+                               text=fields.TEXT(analyzer=ana, vector=formats.Frequency(ana)))
+        _check(schema)
+        
+        schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
+        _check(schema, text=docs[0])