Commits

Matt Chaput committed 55f9c48

Added minscore keyword argument to Hit.highlights() method.
Fixed fragment scoring logic to be >= minscore instead of > minscore.
Removed short-circuit in highlight method when no terms matched.
Added note to WholeFragmenter docs about always returning the fragment.
Fixes issue #216.

Comments (0)

Files changed (3)

src/whoosh/highlight.py

     """Doesn't fragment the token stream. This object just returns the entire
     stream as one "fragment". This is useful if you want to highlight
     the entire text.
+
+    Note that even if you use the `WholeFragmenter`, the highlight code will
+    return no fragment if no terms matched in the given field. To return the
+    whole fragment even in that case, call `highlights()` with `minscore=0`::
+
+        # Query where no terms match in the "text" field
+        q = query.Term("tag", "new")
+
+        r = mysearcher.search(q)
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        # Since no terms in the "text" field matched, we get no fragments back
+        assert r[0].highlights("text") == ""
+
+        # If we lower the minimum score to 0, we get a fragment even though it
+        # has no matching terms
+        assert r[0].highlights("text", minscore=0) == "This is the text field."
+
     """
 
     def __init__(self, charlimit=DEFAULT_CHARLIMIT):
 def top_fragments(fragments, count, scorer, order, minscore=1):
     scored_fragments = ((scorer(f), f) for f in fragments)
     scored_fragments = nlargest(count, scored_fragments)
-    best_fragments = [sf for score, sf in scored_fragments if score > minscore]
+    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
     best_fragments.sort(key=order)
     return best_fragments
 
                     assert m.id() == docnum
                     cache[docnum][text] = m.value_as("characters")
 
-    def highlight_hit(self, hitobj, fieldname, text=None, top=3):
+    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
         results = hitobj.results
         schema = results.searcher.schema
         field = schema[fieldname]
         # Convert bytes to unicode
         words = frozenset(from_bytes(term[1]) for term in bterms)
 
-        if not words:
-            # No terms matches in this field
-            return self.formatter.format([])
+        # if not words:
+        #     # No terms matches in this field
+        #     return self.formatter.format([])
 
         # If we can do "pinpoint" highlighting...
         if self.can_load_chars(results, fieldname):
             tokens = set_matched_filter(tokens, words)
             fragments = self.fragmenter.fragment_tokens(text, tokens)
 
-        fragments = top_fragments(fragments, top, self.scorer, self.order)
+        fragments = top_fragments(fragments, top, self.scorer, self.order,
+                                  minscore=minscore)
         output = self.formatter.format(fragments)
         return output

src/whoosh/searching.py

             raise NoTermsException
         return self.results.docterms[self.docnum]
 
-    def highlights(self, fieldname, text=None, top=3):
+    def highlights(self, fieldname, text=None, top=3, minscore=1):
         """Returns highlighted snippets from the given field::
 
             r = searcher.search(myquery)
             access to the text another way (for example, loading from a file or
             a database), you can supply it using the ``text`` parameter.
         :param top: the maximum number of fragments to return.
+        :param minscore: the minimum score (inclusive) a fragment must have
+            to appear in the highlights. Pass ``minscore=0`` to also return
+            fragments that contain no matched terms.
         """
 
         hliter = self.results.highlighter
-        return hliter.highlight_hit(self, fieldname, text=text, top=top)
+        return hliter.highlight_hit(self, fieldname, text=text, top=top,
+                                    minscore=minscore)
 
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
                        model=classify.Bo1Model, normalize=True, filter=None):

tests/test_highlighting.py

                                  fragmenter=highlight.ContextFragmenter(),
                                  formatter=highlight.UppercaseFormatter())
     assert result == "INDEXED!\n1"
+
+
+def test_whole_noterms():
+    schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"),
+                       tag=u("foo"))
+
+    with ix.searcher() as s:
+        r = s.search(query.Term("text", u("delta")))
+        assert len(r) == 1
+
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        hi = r[0].highlights("text")
+        assert hi == u("alfa bravo charlie DELTA echo foxtrot golf")
+
+        r = s.search(query.Term("tag", u("foo")))
+        assert len(r) == 1
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        hi = r[0].highlights("text")
+        assert hi == u("")
+
+        hi = r[0].highlights("text", minscore=0)
+        assert hi == u("alfa bravo charlie delta echo foxtrot golf")