Commits

Matt Chaput committed 662b14f

Fixed up documentation on scoring a bit.
Added scoring.FunctionWeighting.

  • Participants
  • Parent commits 299aa9c

Comments (0)

Files changed (4)

File docs/source/api/scoring.rst

 
 .. automodule:: whoosh.scoring
 
-Scoring algorithm classes
-=========================
 
-.. autoclass:: Weighting
+Base classes
+============
+
+.. autoclass:: WeightingModel
     :members:
 
 .. autoclass:: BaseScorer
+    :members:
+
 .. autoclass:: WeightScorer
 .. autoclass:: WeightLengthScorer
 
+
+Scoring algorithm classes
+=========================
+
 .. autoclass:: BM25F
 
 .. autoclass:: TF_IDF
 Scoring utility classes
 =======================
 
+.. autoclass:: FunctionWeighting
+
 .. autoclass:: MultiWeighting
 
 .. autoclass:: ReverseWeighting

File docs/source/recipes.rst

         ...
 
 
-Sorting
-=======
+Sorting and scoring
+===================
 
 See :doc:`facets`.
 
 
+Score results based on the position of the matched term
+-------------------------------------------------------
+
+The following scoring function uses the position of the first occurrence of a
+term in each document to calculate the score, so documents with the given term
+earlier in the document will score higher::
+
+    from whoosh import scoring
+
+    def pos_score_fn(searcher, fieldname, text, matcher):
+        poses = matcher.value_as("positions")
+        return 1.0 / (poses[0] + 1)
+        
+    pos_weighting = scoring.FunctionWeighting(pos_score_fn)
+    searcher = myindex.searcher(weighting=pos_weighting)    
+
+
 Results
 =======
 

File src/whoosh/scoring.py

 
 class BM25FScorer(WeightLengthScorer):
     def __init__(self, searcher, fieldname, text, B, K1, qf=1):
-        parent = searcher.get_parent()
+        # IDF and average field length are global statistics, so get them from
+        # the top-level searcher
+        parent = searcher.get_parent()  # Returns self if no parent
         self.idf = parent.idf(fieldname, text)
         self.avgfl = parent.avg_field_length(fieldname) or 1
+
         self.B = B
         self.K1 = K1
         self.qf = qf
 
 class DFreeScorer(WeightLengthScorer):
     def __init__(self, searcher, fieldname, text, qf=1):
-        parent = searcher.get_parent()
+        # Total term weight and total field length are global statistics, so
+        # get them from the top-level searcher
+        parent = searcher.get_parent()  # Returns self if no parent
         self.cf = parent.weight(fieldname, text)
         self.fl = parent.field_length(fieldname)
+
         self.qf = qf
         self.setup(searcher, fieldname, text)
 
 
 class PL2Scorer(WeightLengthScorer):
     def __init__(self, searcher, fieldname, text, c, qf=1):
-        parent = searcher.get_parent()
+        # Total term frequency, document count, and average field length are
+        # global statistics, so get them from the top-level searcher
+        parent = searcher.get_parent()  # Returns self if no parent
         self.cf = parent.frequency(fieldname, text)
         self.dc = parent.doc_count_all()
         self.avgfl = parent.avg_field_length(fieldname) or 1
+
         self.c = c
         self.qf = qf
         self.setup(searcher, fieldname, text)
 
 class TF_IDF(WeightingModel):
     def scorer(self, searcher, fieldname, text, qf=1):
-        parent = searcher.get_parent()
+        # IDF is a global statistic, so get it from the top-level searcher
+        parent = searcher.get_parent()  # Returns self if no parent
         idf = parent.idf(fieldname, text)
+
         maxweight = searcher.term_info(fieldname, text).max_weight()
         return TF_IDFScorer(maxweight, idf)
 
     """This class provides backwards-compatibility with the old weighting
     class architecture, so any existing custom scorers don't need to be
     rewritten.
-    
-    It may also be useful for quick experimentation since you only need to
-    override the ``score()`` method to try a scoring algorithm, without having
-    to create an inner Scorer class::
-    
-        class MyWeighting(Weighting):
-            def score(searcher, fieldname, text, docnum, weight):
-                # Return the docnum as the score, for some reason
-                return docnum
-                
-        mysearcher = myindex.searcher(weighting=MyWeighting)
     """
 
     def scorer(self, searcher, fieldname, text, qf=1):
                                     matcher.id(), matcher.weight())
 
 
+class FunctionWeighting(WeightingModel):
+    """Uses a supplied function to do the scoring. For simple scoring functions
+    and experiments this may be simpler to use than writing a full weighting
+    model class and scorer class.
+    
+    The function should accept the arguments
+    ``searcher, fieldname, text, matcher``.
+    
+    For example, the following function will score documents based on the
+    earliest position of the query term in the document::
+    
+        def pos_score_fn(searcher, fieldname, text, matcher):
+            poses = matcher.value_as("positions")
+            return 1.0 / (poses[0] + 1)
+        
+        pos_weighting = scoring.FunctionWeighting(pos_score_fn)
+        searcher = myindex.searcher(weighting=pos_weighting)
+        
+    Note that the searcher passed to the function may be a per-segment searcher
+    for performance reasons. If you want to get global statistics inside the
+    function, you should use ``searcher.get_parent()`` to get the top-level
+    searcher. (However, if you are using global statistics, you should probably
+    write a real model/scorer combo so you can cache them on the object.)
+    """
+
+    def __init__(self, fn):
+        self.fn = fn
+
+    def scorer(self, searcher, fieldname, text, qf=1):
+        return self.FunctionScorer(self.fn, searcher, fieldname, text, qf=qf)
+
+    class FunctionScorer(BaseScorer):
+        def __init__(self, fn, searcher, fieldname, text, qf=1):
+            self.fn = fn
+            self.searcher = searcher
+            self.fieldname = fieldname
+            self.text = text
+            self.qf = qf
+
+        def score(self, matcher):
+            return self.fn(self.searcher, self.fieldname, self.text, matcher)
+
+
 class MultiWeighting(WeightingModel):
     """Chooses from multiple scoring algorithms based on the field.
     """

File tests/test_searching.py

                        ('key', 'alfa', 2, 2.0, 2), ('key', 'alfa', 0, 6.0, 6),
                        ('key', 'alfa', 1, 1.0, 1), ('key', 'alfa', 2, 5.0, 5)])
 
+def test_pos_scorer():
+    ana = analysis.SimpleAnalyzer()
+    schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=0, key=u("0 0 1 0 0 0"))
+    w.add_document(id=1, key=u("0 0 0 1 0 0"))
+    w.add_document(id=2, key=u("0 1 0 0 0 0"))
+    w.commit()
+    w = ix.writer()
+    w.add_document(id=3, key=u("0 0 0 0 0 1"))
+    w.add_document(id=4, key=u("1 0 0 0 0 0"))
+    w.add_document(id=5, key=u("0 0 0 0 1 0"))
+    w.commit(merge=False)
 
+    def pos_score_fn(searcher, fieldname, text, matcher):
+        poses = matcher.value_as("positions")
+        return 1.0 / (poses[0] + 1)
+    pos_weighting = scoring.FunctionWeighting(pos_score_fn)
+
+    s = ix.searcher(weighting=pos_weighting)
+    r = s.search(query.Term("key", "1"))
+    assert_equal([hit["id"] for hit in r], [4, 2, 0, 1, 5, 3])
+