Commits

Matt Chaput committed 12ad088 Merge

Merged in eventh/whoosh/distinctive-terms (pull request #10)

  • Participants
  • Parent commits 60eb503, 05380b7

Comments (0)

Files changed (2)

File src/whoosh/reading.py

 """This module contains classes that allow reading from an index.
 """
 
+from math import log
 from bisect import bisect_right
 from heapq import heapify, heapreplace, heappop, nlargest
 
                in self.iter_prefix(fieldname, prefix))
         return nlargest(number, gen)
 
-    def most_distinctive_terms(self, fieldname, number=5, prefix=None):
+    def most_distinctive_terms(self, fieldname, number=5, prefix=''):
         """Returns the top 'number' terms with the highest `tf*idf` scores as
         a list of (score, text) tuples.
         """
 
-        gen = ((terminfo.weight() * (1.0 / terminfo.doc_frequency()), text)
+        N = float(self.doc_count())
+        gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text)
                for text, terminfo in self.iter_prefix(fieldname, prefix))
         return nlargest(number, gen)
 

File tests/test_reading.py

                  [(6, u('aa')), (5, u('bb')), (4, u('ee')), (3, u('cc')), (2, u('dd'))])
     assert_equal(list(reader.most_frequent_terms("content", prefix="a")),
                  [(6, u('aa')), (2, u('ax')), (1, u('ab'))])
+    assert_equal(list(reader.most_distinctive_terms("content", 3)),
+                 [(1.3862943611198906, u('ax')), (0.6931471805599453, u('ab')), (0.0, u('ee'))])
 
 def test_vector_postings():
     s = fields.Schema(id=fields.ID(stored=True, unique=True),