Matt Chaput committed 3a52259

Added optional recording of matching terms per document, off by default.
Removed TermTrackingCollector.
Added matcher methods to find terms in tree.
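
A minimal sketch of how the new feature is used, pieced together from the
docstrings in this commit (``ix`` and ``myquery`` are assumed to exist; the
"content" field and "apple" term are hypothetical):

    with ix.searcher() as searcher:
        # terms=True tells the collector to record which query terms
        # matched each document (off by default)
        results = searcher.search(myquery, terms=True)

        # Query terms that matched at least one of the top N documents
        print(results.matched_terms())

        for hit in results:
            # Query terms that matched in this particular document
            print(hit.matched_terms())
            # Check a single query term against this document
            print(hit.contains_term("content", "apple"))
            # With recorded terms, highlights() can return None instead
            # of retokenizing a document with no matching terms
            print(hit.highlights("content", force=False))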

  • Parent commits 6dd85af

Files changed (11)

File src/whoosh/filedb/filepostings.py

         
 
 class FilePostingReader(Matcher):
-    def __init__(self, postfile, offset, format, scorer=None,
-                 fieldname=None, text=None, stringids=False):
+    def __init__(self, postfile, offset, format, scorer=None, term=None,
+                 stringids=False):
         
         assert isinstance(offset, integer_types), "offset is %r/%s" % (offset, type(offset))
         assert isinstance(format, Format), "format is %r/%s" % (format, type(format))
         self.supports_chars = self.format.supports("characters")
         self.supports_poses = self.format.supports("positions")
         self.scorer = scorer
-        self.fieldname = fieldname
-        self.text = text
+        self._term = term
         self.stringids = stringids
         
         magic = postfile.get_int(offset)
         self._next_block()
 
     def __repr__(self):
-        r = "%s(%r, %r, %r, %s" % (self.__class__.__name__, str(self.postfile),
-                                   self.fieldname, self.text, self.is_active())
+        r = "%s(%r, %r, %s" % (self.__class__.__name__, str(self.postfile),
+                                   self._term, self.is_active())
         if self.is_active():
             r += ", %r" % self.id()
         r += ")"
 
     def copy(self):
         return self.__class__(self.postfile, self.startoffset, self.format,
-                              scorer=self.scorer, fieldname=self.fieldname,
-                              text=self.text, stringids=self.stringids)
+                              scorer=self.scorer, term=self._term,
+                              stringids=self.stringids)
 
     def is_active(self):
         return self._active
 
+    def term(self):
+        return self._term
+
     def id(self):
         return self.block.ids[self.i]
 
         elif self.supports_poses:
             return [Span(pos) for pos in self.value_as("positions")]
         else:
-            raise Exception("Field does not support positions (%r)" % self.fieldname)
+            raise Exception("Field does not support positions (%r)" % self._term)
 
     def weight(self):
         weights = self.block.weights

File src/whoosh/filedb/filereading.py

         postings = terminfo.postings
         if isinstance(postings, integer_types):
             postreader = FilePostingReader(self.postfile, postings, format,
-                                           scorer=scorer, fieldname=fieldname,
-                                           text=text)
+                                           scorer=scorer, term=(fieldname, text))
         else:
             docids, weights, values = postings
             postreader = ListMatcher(docids, weights, values, format,
-                                     scorer=scorer)
+                                     scorer=scorer, term=(fieldname, text))
         
         deleted = self.segment.deleted
         if deleted:

File src/whoosh/matching.py

         
         raise NotImplementedError
     
+    def term(self):
+        """Returns a ("fieldname", "termtext") tuple for the term this matcher
+        matches, or None if this matcher is not a term matcher.
+        """
+        
+        return None
+    
+    def term_matchers(self):
+        """Returns an iterator of term matchers in this tree.
+        """
+        
+        if self.term() is not None:
+            yield self
+        else:
+            for cm in self.children():
+                for m in cm.term_matchers():
+                    yield m
+    
+    def matching_terms(self, id=None):
+        """Returns an iterator of ("fieldname", "termtext") tuples for the
+        CURRENTLY MATCHING term matchers in this tree.
+        """
+        
+        if not self.is_active():
+            return
+        
+        if id is None:
+            id = self.id()
+        elif id != self.id():
+            return
+        
+        t = self.term()
+        if t is None:
+            for c in self.children():
+                for t in c.matching_terms(id):
+                    yield t
+        else:
+            yield t
+    
+    def children(self):
+        """Returns an (possibly empty) list of the submatchers of this
+        matcher.
+        """
+        
+        return []
+    
     def replace(self, minquality=0):
         """Returns a possibly-simplified version of this matcher. For example,
         if one of the children of a UnionMatcher is no longer active, calling
     """
     
     def __init__(self, ids, weights=None, values=None, format=None,
-                 scorer=None, position=0, all_weights=None):
+                 scorer=None, position=0, all_weights=None, term=None):
         """
         :param ids: a list of doc IDs.
         :param weights: a list of weights corresponding to the list of IDs.
             format of the field.
         :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring
             the postings.
+        :param term: a ("fieldname", "text") tuple, or None if this is not a
+            term matcher.
         """
         
         self._ids = ids
         self._i = position
         self._format = format
         self._scorer = scorer
+        self._term = term
     
     def __repr__(self):
         return "<%s>" % self.__class__.__name__
     def is_active(self):
         return self._i < len(self._ids)
     
+    def term(self):
+        return self._term
+    
     def copy(self):
         return self.__class__(self._ids, self._weights, self._values,
                               self._format, self._scorer, self._i,
     def is_active(self):
         return self.child.is_active()
     
+    def children(self):
+        return [self.child]
+    
     def supports(self, astype):
         return self.child.supports(astype)
     
     def is_active(self):
         return self.current < len(self.matchers)
     
+    def children(self):
+        return [self.matchers[self.current]]
+    
     def _next_matcher(self):
         matchers = self.matchers
         while self.current < len(matchers) and not matchers[self.current].is_active():
                               boost=self.boost)
     
     def _replacement(self, newchild):
-        return self.__class__(newchild, self._ids, exclude=self._exclude, boost=self.boost)
+        return self.__class__(newchild, self._ids, exclude=self._exclude,
+                              boost=self.boost)
     
     def _find_next(self):
         child = self.child
     def __repr__(self):
         return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b)
 
+    def children(self):
+        return [self.a, self.b]
+
     def copy(self):
         return self.__class__(self.a.copy(), self.b.copy())
 
     
     def score(self):
         return self._score
-    
+
+
+
+
 
 #class PhraseMatcher(WrappingMatcher):
 #    """Matches postings where a list of sub-matchers occur next to each other

File src/whoosh/query.py

             return m
         else:
             ids = array("I", m.all_ids())
-            return ListMatcher(ids, all_weights=self.score)
+            return ListMatcher(ids, all_weights=self.score, term=m.term())
     
 
 class BinaryQuery(CompoundQuery):

File src/whoosh/searching.py

 class TimeLimit(Exception):
     pass
 
+class NoTermsException(Exception):
+    """Exception raised you try to access matched terms on a :class:`Results`
+    object was created without them. To record which terms matched in which
+    document, you need to call the :meth:`Searcher.search` method with
+    ``terms=True``.
+    """
+    
+    message = "Results were created without recording terms"
 
 # Searcher class
 
                 yield docnum
     
     def search(self, q, limit=10, sortedby=None, reverse=False, groupedby=None,
-               optimize=True, filter=None, mask=None, groupids=True):
+               optimize=True, filter=None, mask=None, groupids=True,
+               terms=False):
         """Runs the query represented by the ``query`` object and returns a
         Results object.
         
             document numbers associated with that key. To map to a simple count
             of the number of documents instead of a list, use
             ``groupids=False``.
+        :param terms: if True, record which terms were found in each matching
+            document. You can use :meth:`Results.matched_terms` or
+            :meth:`Hit.matched_terms` to see which query terms matched, or
+            :meth:`Hit.contains_term` to check whether a hit contains a
+            particular term.
         :rtype: :class:`Results`
         """
 
             raise ValueError("limit must be >= 1")
 
         collector = Collector(limit=limit, usequality=optimize,
-                              groupedby=groupedby, reverse=reverse,
-                              groupids=groupids)
+                              groupedby=groupedby, groupids=groupids,
+                              terms=terms)
         
         if sortedby:
-            return collector.sort(self, q, sortedby, allow=filter,
-                                  restrict=mask)
+            return collector.sort(self, q, sortedby, reverse=reverse,
+                                  allow=filter, restrict=mask)
         else:
             return collector.search(self, q, allow=filter, restrict=mask)
     
         
 
 class Collector(object):
-    def __init__(self, limit=10, usequality=True, replace=10, groupedby=None,
-                 timelimit=None, greedy=False, reverse=False, groupids=True):
-        """A Collector finds the matching documents, scores them, collects them
-        into a list, and produces a Results object from them.
-        
-        Normally you do not need to instantiate an instance of the base
-        Collector class, the :meth:`Searcher.search` method does that for you.
-        
-        If you create a custom Collector instance or subclass you can use its
-        ``search()`` method instead of :meth:`Searcher.search`::
-        
-            mycollector = MyCollector()
-            results = mycollector.search(mysearcher, myquery)
-        
-        **Do not** re-use or share Collector instances between searches. You
-        should create a new Collector instance for each search.
-        
-        To limit the amount of time a search can take, pass the number of
-        seconds to the ``timelimit`` keyword argument::
-        
-            # Limit the search to 4.5 seconds
-            col = Collector(timelimit=4.5, greedy=False)
-            # If this call takes more than 4.5 seconds, it will raise a
-            # whoosh.searching.TimeLimit exception
-            try:
-                r = searcher.search(myquery, collector=col)
-            except TimeLimit, tl:
-                # You can still retrieve partial results from the collector
-                r = col.results()
-        
-        If the ``greedy`` keyword is ``True``, the collector will finish adding
-        the most recent hit before raising the ``TimeLimit`` exception.
+    """A Collector finds the matching documents, scores them, collects them
+    into a list, and produces a Results object from them.
+    
+    Normally you do not need to instantiate an instance of the base
+    Collector class, the :meth:`Searcher.search` method does that for you.
+    
+    If you create a custom Collector instance or subclass you can use its
+    ``search()`` method instead of :meth:`Searcher.search`::
+    
+        mycollector = MyCollector()
+        results = mycollector.search(mysearcher, myquery)
+    
+    **Do not** re-use or share Collector instances between searches. You
+    should create a new Collector instance for each search.
+    
+    To limit the amount of time a search can take, pass the number of
+    seconds to the ``timelimit`` keyword argument::
+    
+        # Limit the search to 4.5 seconds
+        col = Collector(timelimit=4.5, greedy=False)
+        # If this call takes more than 4.5 seconds, it will raise a
+        # whoosh.searching.TimeLimit exception
+        try:
+            r = col.search(searcher, myquery)
+        except TimeLimit as tl:
+            # You can still retrieve partial results from the collector
+            r = col.results()
+    
+    If the ``greedy`` keyword is ``True``, the collector will finish adding
+    the most recent hit before raising the ``TimeLimit`` exception.
+    """
+    
+    def __init__(self, limit=10, usequality=True, groupedby=None,
+                 groupids=True, timelimit=None, greedy=False, terms=False,
+                 replace=10):
+        """
+        :param limit: the maximum number of hits to collect. If this is None,
+            collect all hits.
+        :param usequality: whether to use block quality optimizations when
+            available. This is mostly useful for debugging purposes.
+        :param groupedby: see :doc:`/facets` for information.
+        :param groupids: if True, saves lists of document IDs for facets. If
+            False, only saves a count of the number of documents in each group.
+        :param timelimit: the maximum amount of time (in possibly fractional
+            seconds) to allow for searching. If the search takes longer than
+            this, it will raise a ``TimeLimit`` exception.
+        :param greedy: if ``True``, the collector will finish adding the most
+            recent hit before raising the ``TimeLimit`` exception.
+        :param terms: if ``True``, record which terms matched in each document.
         """
         
         self.limit = limit
         self.replace = replace
         self.timelimit = timelimit
         self.greedy = greedy
-        self.reverse = reverse
         self.groupids = groupids
+        self.termlists = defaultdict(set) if terms else None
         
         self.facets = None
         if groupedby:
         does not use the final() method, etc.).
         """
         
-        use = (self.usequality and not searcher.weighting.use_final
+        use = (self.usequality
+               and not searcher.weighting.use_final
                and not self.should_add_all())
         if matcher:
             use = use and matcher.supports_block_quality()
         matcher = q.matcher(searcher)
         usequality = self.use_block_quality(searcher, matcher)
         
-        for score, offsetid in self.pull_matches(searcher, matcher, offset,
-                                                 scorefn, usequality):
+        for score, offsetid in self.pull_matches(matcher, offset, scorefn,
+                                                 usequality):
             # Document numbers are negated before putting them in the heap so
             # that higher document numbers have lower "priority" in the queue.
             # Lower document numbers should always come before higher document
                     heapreplace(items, (score, negated_offsetid))
                     self.minscore = items[0][0]
     
-    def pull_matches(self, searcher, matcher, offset, scorefn, usequality):
+    def pull_matches(self, matcher, offset, scorefn, usequality):
         """Low-level method yields (docid, score) pairs from the given matcher.
         Called by :meth:`Collector.add_matches`.
         """
         replacecounter = 0
         timelimited = bool(self.timelimit)
         
+        termlists = self.termlists
+        recordterms = termlists is not None
+        if recordterms:
+            termmatchers = list(matcher.term_matchers())
+        else:
+            termmatchers = None
+        
         # A flag to indicate whether we should check block quality at the start
         # of the next loop
         checkquality = True
                         break
                     replacecounter = replace
                     minscore = self.minscore
+                    if recordterms:
+                        termmatchers = list(matcher.term_matchers())
                 replacecounter -= 1
             
             # Check whether the time limit expired since the last match
                     score = matcher.score()
                 yield (score, offsetid)
             
+            # If recording terms, add the document to the termlists
+            if recordterms:
+                for m in termmatchers:
+                    if m.is_active() and m.id() == id:
+                        termlists[m.term()].add(offsetid)
+            
             # Check whether the time limit expired
             if timelimited and self.timedout:
                 raise TimeLimit
             # again.
             checkquality = matcher.next()
     
-    def sort(self, searcher, q, sortedby, allow=None, restrict=None):
+    def sort(self, searcher, q, sortedby, reverse=False, allow=None,
+             restrict=None):
         self.searcher = searcher
         self.q = q
         self.docset = set()
         
         items = self.items
         limit = self.limit
-        heapfn = nlargest if self.reverse else nsmallest
+        heapfn = nlargest if reverse else nsmallest
         addall = self.should_add_all()
         
         facet = sorting.MultiFacet.from_sortedby(sortedby)
             catter.set_searcher(s, offset)
             matcher = q.matcher(s)
             
-            if catter.requires_matcher:
-                ls = list(self.pull_matches(s, matcher, offset,
+            if catter.requires_matcher or self.termlists is not None:
+                ls = list(self.pull_matches(matcher, offset,
                                             catter.key_for_matcher, False))
             else:
                 ls = list(self.pull_unscored_matches(matcher, offset,
         
         self.items = items
         self.runtime = now() - t
-        return self.results(scores=False)
+        return self.results(scores=False, reverse=reverse)
     
     def pull_unscored_matches(self, matcher, offset, keyfn):
         allow = self.allow
             if timelimited and self.timedout:
                 raise TimeLimit
     
-    def results(self, scores=True):
+    def results(self, scores=True, reverse=False):
         """Returns the current results from the collector. This is useful for
         getting the results out of a collector that was stopped by a time
         limit exception.
             # Sort by negated scores so that higher scores go first, then by
             # document number to keep the order stable when documents have the
             # same score
-            items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse)
+            items.sort(key=lambda x: (0 - x[0], x[1]))
         else:
-            items = sorted(self.items, reverse=self.reverse)
+            items = sorted(self.items, reverse=reverse)
         
         return Results(self.searcher, self.q, items, self.docset,
                        groups=self.groups, runtime=self.runtime,
-                       filter=self.allow, mask=self.restrict)
-
-
-class TermTrackingCollector(Collector):
-    """This is an experiment. For a more straightforward but possibly slightly
-    slower method for determining if a given term was found, see
-    :meth:`Results.contains_term`.
-    
-    This collector records which parts of the query matched which documents
-    in the final results. The results for each part of the query are available
-    as a dictionary in the ``catalog`` attribute of the collector after the
-    search, where the keys are representations of the parts of the query and
-    the values are sets of document numbers that matched that part of the
-    query.
-    
-    This feature is experimental and may change in future releases.
-    
-    How to choose a key to represent query objects in the ``catalog``
-    dictionary was not entirely clear. The current implementation uses the
-    unicode representation of the query object, which usually returns something
-    at least recognizable (for example, ``unicode(Term("f", u"a")) == u"f:a"``
-    and ``unicode(Prefix("f", "b")) == u"f:b*"``).
-    
-    >>> myparser = qparser.QueryParser("content", myindex.schema)
-    >>> myquery = myparser.parse(u"apple OR bear NOT camel")
-    >>> col = TermTrackingCollector()
-    >>> results = col.search(searcher, myquery)
-    >>> # The docnums in the results that contained "apple"
-    >>> col.catalog["content:apple"]
-    set([1, 2, 3])
-    >>> for hit in results:
-    ...     print hit.rank, ":", hit["title"]
-    ...     for key, docset in col.catalog.keys():
-    ...         if hit.docnum in docset:
-    ...             print "   - Contains", key
-    """
-    
-    # This collector works by rewriting the query with "TaggedQuery" wrappers
-    # around the leaf nodes before it searches. When base collector generates
-    # a matcher tree from the query tree, these wrappers "phone home" to this
-    # collector and register the leaf matchers. Then, when collecting hits, the
-    # collector checks with the leaf matchers at each hit to see if they are
-    # matching the current document.
-    
-    def __init__(self, *args, **kwargs):
-        super(TermTrackingCollector, self).__init__(*args, **kwargs)
-        self.catalog = {}
-    
-    def should_add_all(self):
-        # If you're using this collector, you need to examine all documents
-        return True
-    
-    def add_matches(self, searcher, q, offset, scorefn):
-        sup = super(TermTrackingCollector, self)
-        self.matchers = []
-        q = self._tag(q)
-        return sup.add_matches(searcher, q, offset, scorefn)
-    
-    def pull_matches(self, searcher, matcher, offset, scorefn, usequality):
-        super_method = super(TermTrackingCollector, self).pull_matches
-        
-        for score, offsetid in super_method(searcher, matcher, offset,
-                                            scorefn, usequality):
-            for key, m in self.matchers:
-                if m.is_active() and m.id() == offsetid - offset:
-                    if key not in self.catalog:
-                        self.catalog[key] = set()
-                    self.catalog[key].add(offsetid)
-            
-            yield (score, offsetid)
-    
-    def _tag(self, q):
-        # Takes a query and returns a copy of the query with a TaggedQuery
-        # wrapper around any leaf nodes in the query tree
-        if isinstance(q, query.Not):
-            return q
-        elif q.is_leaf():
-            return TermTrackingCollector.TaggedQuery(q, self)
-        else:
-            return q.apply(self._tag)
-        
-    def _tag_matcher(self, key, m):
-        # This method is called from the TaggedQuery wrappers that the _tag
-        # method added to the query
-        self.matchers.append((key, m))
-        
-    class TaggedQuery(query.WrappingQuery):
-        # The only purpose of this query wrapper is to "call home" to the
-        # TrackingCollector instance when the child query generates a matcher
-        # so the TrackingCollector can register it
-        
-        def __init__(self, child, tracker):
-            self.child = child
-            self.tracker = tracker
-        
-        def matcher(self, searcher):
-            m = self.child.matcher(searcher)
-            self.tracker._tag_matcher(text_type(self.child), m)
-            return m
+                       filter=self.allow, mask=self.restrict,
+                       termlists=self.termlists)
 
 
 class Results(object):
     """
 
     def __init__(self, searcher, q, top_n, docset, groups=None, runtime=-1,
-                 filter=None, mask=None):
+                 filter=None, mask=None, termlists=None):
         """
         :param searcher: the :class:`Searcher` object that produced these
             results.
         self.runtime = runtime
         self._filter = filter
         self._mask = mask
-        self._terms = None
+        self._termlists = termlists
         
         self.fragmenter = highlight.ContextFragmenter()
         self.fragment_scorer = highlight.BasicFragmentScorer()
         
         return ((docnum, score) for score, docnum in self.top_n)
 
-    def terms(self):
-        if self._terms is  None:
-            self._terms = self.q.existing_terms(self.searcher.reader())
-        return self._terms
-
     def fields(self, n):
         """Returns the stored fields for the document at the ``n`` th position
         in the results. Use :meth:`Results.docnum` if you want the raw
         """
         return self.top_n[n][1]
 
+    def matched_terms(self):
+        """Returns the set of ``("fieldname", "text")`` tuples representing
+        terms from the query that matched one or more of the TOP N documents
+        (this does not report terms for documents that match the query but did
+        not score high enough to make the top N results). You can compare this
+        set to the terms from the original query to find terms which didn't
+        occur in any matching documents.
+        
+        This is only valid if you used ``terms=True`` in the search call to
+        record matching terms. Otherwise it will raise an exception.
+        
+        >>> q = myparser.parse("alfa OR bravo OR charlie")
+        >>> results = searcher.search(q, terms=True)
+        >>> results.matched_terms()
+        set([("content", "alfa"), ("content", "charlie")])
+        >>> q.all_terms() - results.matched_terms()
+        set([("content", "bravo")])
+        """
+        
+        if self._termlists is None:
+            raise NoTermsException
+        return set(self._termlists.keys())
+
     def highlights(self, n, fieldname, text=None, top=3, fragmenter=None,
-                   formatter=None, order=highlight.FIRST):
+                   formatter=None, order=highlight.FIRST, force=True):
         """Returns highlighted snippets for the document in the Nth position
         in the results. It is usually more convenient to call this method on a
         Hit object instead of the Results.
         fragmenter = fragmenter or self.fragmenter
         formatter = formatter or self.formatter
         
-        terms = set(ttext for fname, ttext in self.terms()
-                    if fname == fieldname)
+        if self._termlists is None:
+            terms = self.q.existing_terms(self.searcher.reader())
+        else:
+            terms = self.matched_terms()
+        terms = set(ttext for fname, ttext in terms if fname == fieldname)
+        if not terms and not force:
+            return None
+        
         return highlight.highlight(text, terms, analyzer, fragmenter,
-                                   formatter, top=top,
-                                   scorer=self.fragment_scorer, order=order)
+                                   formatter, top=top, order=order,
+                                   scorer=self.fragment_scorer)
 
     def key_terms(self, fieldname, docs=10, numterms=5,
                   model=classify.Bo1Model, normalize=True):
         self.docset = docs | otherdocs
         self.top_n = arein + notin + other
 
-    def contains_term(self, fieldname, text):
-        """Returns True if the given term exists in at least one of the
-        documents in this results set.
-        """
-        
-        docset = self.docs()
-        minid = min(docset)
-        maxid = max(docset)
-        
-        field = self.searcher.schema[fieldname]
-        text = field.to_text(text)
-        postings = self.searcher.postings(fieldname, text)
-        postings.skip_to(minid)
-        for id in postings.all_ids():
-            if id in docset:
-                return True
-            if id >= maxid:
-                break
-        return False
-
 
 class Hit(object):
     """Represents a single search result ("hit") in a Results object.
             self._fields = self.searcher.stored_fields(self.docnum)
         return self._fields
     
+    def matched_terms(self):
+        """Returns the set of ``("fieldname", "text")`` tuples representing
+        terms from the query that matched in this document. You can
+        compare this set to the terms from the original query to find terms
+        which didn't occur in this document.
+        
+        This is only valid if you used ``terms=True`` in the search call to
+        record matching terms. Otherwise it will raise an exception.
+        
+        >>> q = myparser.parse("alfa OR bravo OR charlie")
+        >>> results = searcher.search(q, terms=True)
+        >>> for hit in results:
+        ...   print(hit["title"])
+        ...   print("Contains:", hit.matched_terms())
+        ...   print("Doesn't contain:", q.all_terms() - hit.matched_terms())
+        """
+        
+        termlists = self.results._termlists
+        if termlists is None:
+            raise NoTermsException
+        
+        # termlists maps terms->set of docnums, so we have to check every term
+        # to see if this document is in its list
+        s = set()
+        for term in termlists.keys():
+            if self.docnum in termlists[term]:
+                s.add(term)
+        return s
+    
     def highlights(self, fieldname, text=None, top=3, fragmenter=None,
-                   formatter=None, order=highlight.FIRST):
+                   formatter=None, order=highlight.FIRST, force=True):
         """Returns highlighted snippets from the given field::
         
             r = searcher.search(myquery)
             :func:`whoosh.highlight.LONGER`,
             :func:`whoosh.highlight.SHORTER`, or a custom sorting function. The
             default is ``highlight.FIRST``.
+        :param force: if True (the default), returns "highlights" even if the
+            document does not contain any matching terms. If False, returns
+            None instead of highlights when the document does not contain any
+            matching terms. This can save time by avoiding retokenizing large
+            amounts of text.
         """
         
         return self.results.highlights(self.rank, fieldname, text=text,
                                        top=top, fragmenter=fragmenter,
-                                       formatter=formatter, order=order)
+                                       formatter=formatter, order=order,
+                                       force=force)
     
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
                        model=classify.Bo1Model, normalize=True, filter=None):
                                        top=top, numterms=numterms, model=model,
                                        normalize=normalize, filter=filter)
     
+    def contains_term(self, fieldname, text):
+        """Returns True if the given query term exists in this document. This
+        only works for terms that were in the original query.
+        """
+        
+        termlists = self.results._termlists
+        if termlists is not None:
+            term = (fieldname, text)
+            if term in termlists:
+                docset = termlists[term]
+                return self.docnum in docset
+        
+        return False
+    
     def __repr__(self):
         return "<%s %r>" % (self.__class__.__name__, self.fields())
     

File src/whoosh/sorting.py

                    groupedby=None):
         from whoosh.searching import Collector
         
-        collector = Collector(limit=limit, groupedby=groupedby, reverse=reverse)
-        return collector.sort(self.searcher, q, self.multi, allow=filter,
-                              restrict=mask)
+        collector = Collector(limit=limit, groupedby=groupedby)
+        return collector.sort(self.searcher, q, self.multi, reverse=reverse,
+                              allow=filter, restrict=mask)
     
 
 # Faceting objects

File tests/test_highlighting.py

         # Parse the user query
         parser = qparser.QueryParser("title", schema=ix.schema)
         q = parser.parse(u("man"))
-        r = s.search(q)
+        r = s.search(q, terms=True)
         assert_equal(len(r), 2)
         
         r.fragmenter = highlight.WholeFragmenter()

File tests/test_matching.py

 
 from nose.tools import assert_equal, assert_not_equal  #@UnresolvedImport
 
-from whoosh import fields
+from whoosh import fields, matching, query
 from whoosh.compat import u
 from whoosh.filedb.filestore import RamStorage
-from whoosh.matching import *
 from whoosh.query import And, Term
-from whoosh.util import make_binary_tree
+from whoosh.util import make_binary_tree, permutations
 
 
 def _keys(searcher, docnums):
     return sorted([searcher.stored_fields(docnum)['key'] for docnum in docnums])
 
 def test_nullmatcher():
-    nm = NullMatcher()
+    nm = matching.NullMatcher()
     assert not nm.is_active()
     assert_equal(list(nm.all_ids()), [])
 
 def test_listmatcher():
     ids = [1, 2, 5, 9, 10]
     
-    lm = ListMatcher(ids)
+    lm = matching.ListMatcher(ids)
     ls = []
     while lm.is_active():
         ls.append((lm.id(), lm.score()))
         lm.next()
     assert_equal(ls, [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)])
     
-    lm = ListMatcher(ids)
+    lm = matching.ListMatcher(ids)
     assert_equal(list(lm.all_ids()), ids)
     
-    lm = ListMatcher(ids, position=3)
+    lm = matching.ListMatcher(ids, position=3)
     ls = []
     while lm.is_active():
         ls.append(lm.id())
         lm.next()
     assert_equal(ls, [9, 10])
     
-    lm = ListMatcher(ids)
+    lm = matching.ListMatcher(ids)
     for _ in xrange(3):
         lm.next()
     lm = lm.copy()
     assert_equal(ls, [9, 10])
 
 def test_wrapper():
-    wm = WrappingMatcher(ListMatcher([1, 2, 5, 9, 10]), boost=2.0)
+    wm = matching.WrappingMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), boost=2.0)
     ls = []
     while wm.is_active():
         ls.append((wm.id(), wm.score()))
     assert_equal(ls, [(1, 2.0), (2, 2.0), (5, 2.0), (9, 2.0), (10, 2.0)])
     
     ids = [1, 2, 5, 9, 10]
-    wm = WrappingMatcher(ListMatcher(ids), boost=2.0)
+    wm = matching.WrappingMatcher(matching.ListMatcher(ids), boost=2.0)
     assert_equal(list(wm.all_ids()), ids)
 
 def test_filter():
-    lm = lambda: ListMatcher(list(range(2, 10)))
+    lm = lambda: matching.ListMatcher(list(range(2, 10)))
     
-    fm = FilterMatcher(lm(), frozenset([3, 9]))
+    fm = matching.FilterMatcher(lm(), frozenset([3, 9]))
     assert_equal(list(fm.all_ids()), [3, 9])
     
-    fm = FilterMatcher(lm(), frozenset([1, 5, 9, 13]))
+    fm = matching.FilterMatcher(lm(), frozenset([1, 5, 9, 13]))
     assert_equal(list(fm.all_ids()), [5, 9])
 
 def test_exclude():
-    em = FilterMatcher(ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
+    em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     assert_equal(list(em.all_ids()), [1, 5, 10])
     
-    em = FilterMatcher(ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
+    em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     assert_equal(list(em.all_ids()), [1, 5, 10])
     
-    em = FilterMatcher(ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
+    em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     em.next()
     em.next()
     em = em.copy()
     assert_equal(ls, [10])
 
 def test_simple_union():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    um = UnionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    um = matching.UnionMatcher(lm1, lm2)
     ls = []
     while um.is_active():
         ls.append((um.id(), um.score()))
         um.next()
     assert_equal(ls, [(0, 1.0), (1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    um = UnionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    um = matching.UnionMatcher(lm1, lm2)
     assert_equal(list(um.all_ids()), [0, 1, 4, 10, 20, 90])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    um = UnionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    um = matching.UnionMatcher(lm1, lm2)
     um.next()
     um.next()
     um = um.copy()
     assert_equal(ls, [4, 10, 20, 90])
     
 def test_simple_intersection():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    im = IntersectionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    im = matching.IntersectionMatcher(lm1, lm2)
     ls = []
     while im.is_active():
         ls.append((im.id(), im.score()))
         im.next()
     assert_equal(ls, [(4, 2.0), (20, 2.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    im = IntersectionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    im = matching.IntersectionMatcher(lm1, lm2)
     assert_equal(list(im.all_ids()), [4, 20])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    im = IntersectionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    im = matching.IntersectionMatcher(lm1, lm2)
     im.next()
     im.next()
     im = im.copy()
     assert_equal(ls, [])
 
 def test_andnot():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    anm = AndNotMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    anm = matching.AndNotMatcher(lm1, lm2)
     ls = []
     while anm.is_active():
         ls.append((anm.id(), anm.score()))
         anm.next()
     assert_equal(ls, [(1, 1.0), (10, 1.0), (90, 1.0)])
     
-    echo_lm = ListMatcher([0, 1, 2, 3, 4])
-    bravo_lm = ListMatcher([0, 1])
-    anm = AndNotMatcher(echo_lm, bravo_lm)
+    echo_lm = matching.ListMatcher([0, 1, 2, 3, 4])
+    bravo_lm = matching.ListMatcher([0, 1])
+    anm = matching.AndNotMatcher(echo_lm, bravo_lm)
     assert_equal(list(anm.all_ids()), [2, 3, 4])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    anm = AndNotMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    anm = matching.AndNotMatcher(lm1, lm2)
     assert_equal(list(anm.all_ids()), [1, 10, 90])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    anm = AndNotMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    anm = matching.AndNotMatcher(lm1, lm2)
     anm.next()
     anm.next()
     anm = anm.copy()
     assert_equal(ls, [90])
 
 def test_require():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    rm = RequireMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    rm = matching.RequireMatcher(lm1, lm2)
     ls = []
     while rm.is_active():
         ls.append((rm.id(), rm.score()))
         rm.next()
     assert_equal(ls, [(4, 1.0), (20, 1.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    rm = RequireMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    rm = matching.RequireMatcher(lm1, lm2)
     assert_equal(list(rm.all_ids()), [4, 20])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    rm = RequireMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    rm = matching.RequireMatcher(lm1, lm2)
     rm.next()
     rm.next()
     rm = rm.copy()
     assert_equal(ls, [])
 
 def test_andmaybe():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    amm = AndMaybeMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    amm = matching.AndMaybeMatcher(lm1, lm2)
     ls = []
     while amm.is_active():
         ls.append((amm.id(), amm.score()))
         amm.next()
     assert_equal(ls, [(1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    amm = AndMaybeMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    amm = matching.AndMaybeMatcher(lm1, lm2)
     assert_equal(list(amm.all_ids()), [1, 4, 10, 20, 90])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    amm = AndMaybeMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    amm = matching.AndMaybeMatcher(lm1, lm2)
     amm.next()
     amm.next()
     amm = amm.copy()
             assert_equal(_keys(s, ids1), target)
 
 def test_union():
-    s1 = ListMatcher([1, 2, 3, 4, 5, 6, 7, 8])
-    s2 = ListMatcher([2, 4, 8, 10, 20, 30])
-    s3 = ListMatcher([10, 100, 200])
+    s1 = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8])
+    s2 = matching.ListMatcher([2, 4, 8, 10, 20, 30])
+    s3 = matching.ListMatcher([10, 100, 200])
     target = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 100, 200]
-    um = UnionMatcher(s1, UnionMatcher(s2, s3))
+    um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3))
     assert_equal(target, list(um.all_ids()))
     
 def test_union_scores():
-    s1 = ListMatcher([1, 2, 3])
-    s2 = ListMatcher([2, 4, 8])
-    s3 = ListMatcher([2, 3, 8])
+    s1 = matching.ListMatcher([1, 2, 3])
+    s2 = matching.ListMatcher([2, 4, 8])
+    s3 = matching.ListMatcher([2, 3, 8])
     target = [(1, 1.0), (2, 3.0), (3, 2.0), (4, 1.0), (8, 2.0)]
-    um = UnionMatcher(s1, UnionMatcher(s2, s3))
+    um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3))
     result = []
     while um.is_active():
         result.append((um.id(), um.score()))
         for _ in xrange(randint(*clauselimits)):
             nums = sample(vals, randint(*rangelimits))
             target = target.union(nums)
-            matchers.append(ListMatcher(sorted(nums)))
+            matchers.append(matching.ListMatcher(sorted(nums)))
         target = sorted(target)
-        um = make_binary_tree(UnionMatcher, matchers)
+        um = make_binary_tree(matching.UnionMatcher, matchers)
         assert_equal(list(um.all_ids()), target)
 
 def test_inverse():
-    s = ListMatcher([1, 5, 10, 11, 13])
-    inv = InverseMatcher(s, 15)
+    s = matching.ListMatcher([1, 5, 10, 11, 13])
+    inv = matching.InverseMatcher(s, 15)
     ids = []
     while inv.is_active():
         ids.append(inv.id())
     assert_equal(ids, [0, 2, 3, 4, 6, 7, 8, 9, 12, 14])
     
 def test_inverse_skip():
-    s = ListMatcher([1, 5, 10, 11, 13])
-    inv = InverseMatcher(s, 15)
+    s = matching.ListMatcher([1, 5, 10, 11, 13])
+    inv = matching.InverseMatcher(s, 15)
     inv.skip_to(8)
     
     ids = []
     assert_equal([8, 9, 12, 14], ids)
 
 def test_empty_andnot():
-    pos = NullMatcher()
-    neg = NullMatcher()
-    anm = AndNotMatcher(pos, neg)
+    pos = matching.NullMatcher()
+    neg = matching.NullMatcher()
+    anm = matching.AndNotMatcher(pos, neg)
     assert not anm.is_active()
     assert_equal(list(anm.all_ids()), [])
     
-    pos = ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-    neg = NullMatcher()
-    ans = AndNotMatcher(pos, neg)
+    pos = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    neg = matching.NullMatcher()
+    ans = matching.AndNotMatcher(pos, neg)
     ids = list(ans.all_ids())
     assert_equal(ids, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
 
         negset = frozenset(negs)
         matched = [n for n in rng if n not in negset]
         
-        pos = ListMatcher(rng)
-        neg = ListMatcher(negs)
+        pos = matching.ListMatcher(rng)
+        neg = matching.ListMatcher(negs)
         
-        anm = AndNotMatcher(pos, neg)
+        anm = matching.AndNotMatcher(pos, neg)
         ids = list(anm.all_ids())
         assert_equal(ids, matched)
 
+def test_current_terms():
+    domain = u("alfa bravo charlie delta").split()
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    for ls in permutations(domain, 3):
+        w.add_document(text=" ".join(ls), _stored_text=ls)
+    w.commit()
+    
+    with ix.searcher() as s:
+        q = query.And([query.Term("text", "alfa"), query.Term("text", "charlie")])
+        m = q.matcher(s)
+    
+        while m.is_active():
+            assert_equal(sorted(m.matching_terms()), [("text", "alfa"), ("text", "charlie")])
+            m.next()
 
 
 
+
+
+
+
+

File tests/test_queries.py

     w.commit()
     
     s = ix.searcher()
-    r = s.search(Term('content', u('train')))
+    r = s.search(Term('content', u('train')), terms=True)
     assert_equal(len(r), 1)
     assert_equal(r[0]["id"], "2")
     assert_equal(r[0].highlights("content"), 'India for a life changing <b class="match term0">train</b> journey')

File tests/test_results.py

     with ix.searcher() as s:
         qp = qparser.QueryParser("text", ix.schema)
         q = qp.parse(u("key"))
-        r = s.search(q)
+        r = s.search(q, terms=True)
         r.formatter = highlight.UppercaseFormatter()
         
         assert_equal(sorted([hit.highlights("text") for hit in r]), sorted(target))
     w.commit()
     
     q = query.Or([query.Term("text", "bravo"), query.Term("text", "charlie")])
-    r = ix.searcher().search(q)
-    assert not r.contains_term("text", "alfa")
-    assert r.contains_term("text", "bravo")
-    assert r.contains_term("text", "charlie")
-    assert r.contains_term("text", "delta")
-    assert r.contains_term("text", "echo")
-    assert not r.contains_term("text", "foxtrot")
+    r = ix.searcher().search(q, terms=True)
+    for hit in r:
+        assert not hit.contains_term("text", "alfa")
+        assert (hit.contains_term("text", "bravo")
+                or hit.contains_term("text", "charlie"))
+        assert not hit.contains_term("text", "foxtrot")
 
-
-
+def test_terms():
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(text=u("alfa sierra tango"))
+    w.add_document(text=u("bravo charlie delta"))
+    w.add_document(text=u("charlie delta echo"))
+    w.add_document(text=u("delta echo foxtrot"))
+    w.commit()
+    
+    qp = qparser.QueryParser("text", ix.schema)
+    q = qp.parse(u("(bravo AND charlie) OR foxtrot OR missing"))
+    r = ix.searcher().search(q, terms=True)
+    
+    def txts(tset):
+        return sorted(t[1] for t in tset)
+    
+    assert_equal(txts(r.matched_terms()), ["bravo", "charlie", "foxtrot"])
+    for hit in r:
+        value = hit["text"]
+        for txt in txts(hit.matched_terms()):
+            assert txt in value
+    
 
 
 

File tests/test_searching.py

             if "bravo charlie delta" in hit["title"]:
                 assert hit.score > 100.0
 
-def test_trackingcollector():
-    schema = fields.Schema(text=fields.TEXT(stored=True))
-    ix = RamStorage().create_index(schema)
-    domain = u("alfa bravo charlie delta echo").split()
-    w = ix.writer()
-    for ls in list(permutations(domain, 3))[::2]:
-        w.add_document(text=u(" ").join(ls))
-    w.commit()
-    
-    with ix.searcher() as s:
-        q = Or([Term("text", u("alfa")),Term("text", u("bravo")),
-                Not(Term("text", "charlie"))])
-        
-        col = searching.TermTrackingCollector()
-        _ = col.search(s, q)
-        
-        for docnum in col.catalog["text:alfa"]:
-            words = s.stored_fields(docnum)["text"].split()
-            assert "alfa" in words
-            assert "charlie" not in words
-        
-        for docnum in col.catalog["text:bravo"]:
-            words = s.stored_fields(docnum)["text"].split()
-            assert "bravo" in words
-            assert "charlie" not in words
-
 def test_filter():
     schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
     ix = RamStorage().create_index(schema)