Commits

Matt Chaput committed bb72cba

Fixed highlighting of terms matched by a "multiterm" query such as Wildcard.
Added "expand" keyword argument to Query.existing_terms().
Fixes issue #164.

Comments (0)

Files changed (5)

src/whoosh/highlight.py

         if results.has_matched_terms() is None:
             terms = hitobj.matched_terms()
         else:
-            terms = results.query_terms()
+            terms = results.query_terms(expand=True)
         # Get the words searched for in the field
         words = set(termtext for fname, termtext in terms if fname == fieldname)
         if not words:

src/whoosh/query.py

                 if phrases or not isinstance(q, Phrase):
                     termset.update(q.terms())
         return termset
+    
+    def _existing_terms_helper(self, ixreader, termset, reverse):
+        if termset is None:
+            termset = set()
+        if reverse:
+            test = lambda t: t not in ixreader
+        else:
+            test = lambda t: t in ixreader
         
+        return termset, test
+    
     def existing_terms(self, ixreader, termset=None, reverse=False,
-                       phrases=True):
+                       phrases=True, expand=False):
         """Returns a set of all terms in this query tree that exist in the
         given ixreader.
         
         :param reverse: If True, this method adds *missing* terms rather than
             *existing* terms to the set.
         :param phrases: Whether to add words found in Phrase queries.
+        :param expand: If True, queries that match multiple terms
+            (such as :class:`Wildcard` and :class:`Prefix`) will return all
+            matching expansions.
         :rtype: set
         """
-
-        if termset is None:
-            termset = set()
-        if reverse:
-            test = lambda t: t not in ixreader
+        
+        # By default, this method calls all_terms() and then filters based on
+        # the contents of the reader. Subclasses that need to use the reader to
+        # generate the terms (i.e. MultiTerm) need to override this
+        # implementation
+        
+        termset, test = self._existing_terms_helper(ixreader, termset, reverse)
+        if self.is_leaf():
+            gen = self.all_terms(phrases=phrases)
+            termset.update(t for t in gen if test(t))
         else:
-            test = lambda t: t in ixreader
-        
-        termset.update(t for t in self.all_terms(phrases=phrases) if test(t))
+            for q in self.children():
+                q.existing_terms(ixreader, termset, reverse, phrases, expand)
         return termset
 
     def leaves(self):
         return min(ixreader.doc_frequency(self.fieldname, text)
                    for text in self._words(ixreader))
 
+    def existing_terms(self, ixreader, termset=None, reverse=False,
+                       phrases=True, expand=False):
+        termset, test = self._existing_terms_helper(ixreader, termset, reverse)
+        
+        if not expand:
+            return termset
+        fieldname = self.field()
+        if fieldname is None:
+            return termset
+        
+        for word in self._words(ixreader):
+            term = (fieldname, word)
+            if test(term):
+                termset.add(term)
+        return termset
+
     def matcher(self, searcher):
         fieldname = self.fieldname
         reader = searcher.reader()

src/whoosh/searching.py

         """
         return self.top_n[n][1]
 
-    def query_terms(self):
-        return self.q.existing_terms(self.searcher.reader())
+    def query_terms(self, expand=False):
+        return self.q.existing_terms(self.searcher.reader(), expand=expand)
 
     def has_matched_terms(self):
         """Returns True if the search recorded which terms matched in which
         :param top: the maximum number of fragments to return.
         """
         
-        hhit = self.results.highlighter.highlight_hit
-        return hhit(self, fieldname, text=text, top=top)
+        hliter = self.results.highlighter
+        return hliter.highlight_hit(self, fieldname, text=text, top=top)
     
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
                        model=classify.Bo1Model, normalize=True, filter=None):

tests/test_highlighting.py

         hi.fragmenter.autotrim = True
         assert_equal(hi.highlight_hit(hit, "text"), "golf hotel india JULIET kilo lima mike")
 
+def test_highlight_wildcards():
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("alfa bravo charlie delta cookie echo"))
+    
+    with ix.searcher() as s:
+        qp = qparser.QueryParser("text", ix.schema)
+        q = qp.parse(u("c*"))
+        r = s.search(q)
+        assert_equal(r.scored_length(), 1)
+        r.formatter = highlight.UppercaseFormatter()
+        hit = r[0]
+        assert_equal(hit.highlights("text"), "alfa bravo CHARLIE delta COOKIE echo")
 
 
+
+
+

tests/test_queries.py

 
 import copy
 
-from whoosh import fields
+from whoosh import fields, query
 from whoosh.compat import u
 from whoosh.filedb.filestore import RamStorage
 from whoosh.qparser import QueryParser
 
 def test_existing_terms():
     s = fields.Schema(key=fields.ID, value=fields.TEXT)
-    st = RamStorage()
-    ix = st.create_index(s)
+    ix = RamStorage().create_index(s)
     
     w = ix.writer()
     w.add_document(key=u("a"), value=u("alfa bravo charlie delta echo"))
     q = QueryParser("value", None).parse(u('alfa hotel tango "sierra bravo"'))
     
     ts = q.existing_terms(r, phrases=False)
-    print("ts=", sorted(ts))
     assert_equal(sorted(ts), [("value", "alfa"), ("value", "hotel")])
     
     ts = q.existing_terms(r)
     q.existing_terms(r, ts, reverse=True)
     assert_equal(sorted(ts), [("value", "sierra"), ("value", "tango")])
 
+def test_wildcard_existing_terms():
+    s = fields.Schema(key=fields.ID, value=fields.TEXT)
+    ix = RamStorage().create_index(s)
+    
+    w = ix.writer()
+    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
+    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
+    w.commit()
+    r = ix.reader()
+    qp = QueryParser("value", ix.schema)
+    
+    def words(terms):
+        z = []
+        for t in terms:
+            assert t[0] == "value"
+            z.append(t[1])
+        return " ".join(sorted(z))
+    
+    q = qp.parse(u("b*"))
+    ts = q.existing_terms(r)
+    assert_equal(ts, set())
+    ts = q.existing_terms(r, expand=True)
+    assert_equal(words(ts), "bear boggle bravo")
+    
+    q = qp.parse(u("[a TO f]"))
+    ts = q.existing_terms(r)
+    assert_equal(ts, set())
+    ts = q.existing_terms(r, expand=True)
+    assert_equal(words(ts), "alfa bear boggle bravo charlie delta echo")
+    
+    q = query.Variations("value", "render")
+    ts = q.existing_terms(r, expand=False)
+    assert_equal(ts, set())
+    ts = q.existing_terms(r, expand=True)
+    assert_equal(words(ts), "render rendering renders")
+    
 def test_replace():
     q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2), Variations("a", "b", boost=2.0)])
     q = q.replace("a", "b", "BB")
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.