Matt Chaput committed 3a52259

Added optional recording of matching terms per document, off by default.
Removed TermTrackingCollector.
Added matcher methods to find terms in tree.


Files changed (11)

src/whoosh/filedb/filepostings.py

         
 
 class FilePostingReader(Matcher):
-    def __init__(self, postfile, offset, format, scorer=None,
-                 fieldname=None, text=None, stringids=False):
+    def __init__(self, postfile, offset, format, scorer=None, term=None,
+                 stringids=False):
         
         assert isinstance(offset, integer_types), "offset is %r/%s" % (offset, type(offset))
         assert isinstance(format, Format), "format is %r/%s" % (format, type(format))
         self.supports_chars = self.format.supports("characters")
         self.supports_poses = self.format.supports("positions")
         self.scorer = scorer
-        self.fieldname = fieldname
-        self.text = text
+        self._term = term
         self.stringids = stringids
         
         magic = postfile.get_int(offset)
         self._next_block()
 
     def __repr__(self):
-        r = "%s(%r, %r, %r, %s" % (self.__class__.__name__, str(self.postfile),
-                                   self.fieldname, self.text, self.is_active())
+        r = "%s(%r, %r, %s" % (self.__class__.__name__, str(self.postfile),
+                               self._term, self.is_active())
         if self.is_active():
             r += ", %r" % self.id()
         r += ")"
 
     def copy(self):
         return self.__class__(self.postfile, self.startoffset, self.format,
-                              scorer=self.scorer, fieldname=self.fieldname,
-                              text=self.text, stringids=self.stringids)
+                              scorer=self.scorer, term=self._term,
+                              stringids=self.stringids)
 
     def is_active(self):
         return self._active
 
+    def term(self):
+        return self._term
+
     def id(self):
         return self.block.ids[self.i]
 
         elif self.supports_poses:
             return [Span(pos) for pos in self.value_as("positions")]
         else:
-            raise Exception("Field does not support positions (%r)" % self.fieldname)
+            raise Exception("Field does not support positions (%r)" % self._term)
 
     def weight(self):
         weights = self.block.weights

src/whoosh/filedb/filereading.py

         postings = terminfo.postings
         if isinstance(postings, integer_types):
             postreader = FilePostingReader(self.postfile, postings, format,
-                                           scorer=scorer, fieldname=fieldname,
-                                           text=text)
+                                           scorer=scorer, term=(fieldname, text))
         else:
             docids, weights, values = postings
             postreader = ListMatcher(docids, weights, values, format,
-                                     scorer=scorer)
+                                     scorer=scorer, term=(fieldname, text))
         
         deleted = self.segment.deleted
         if deleted:

src/whoosh/matching.py

         
         raise NotImplementedError
     
+    def term(self):
+        """Returns a ("fieldname", "termtext") tuple for the term this matcher
+        matches, or None if this matcher is not a term matcher.
+        """
+        
+        return None
+    
+    def term_matchers(self):
+        """Returns an iterator of term matchers in this tree.
+        """
+        
+        if self.term() is not None:
+            yield self
+        else:
+            for cm in self.children():
+                for m in cm.term_matchers():
+                    yield m
+    
+    def matching_terms(self, id=None):
+        """Returns an iterator of ("fieldname", "termtext") tuples for the
+        CURRENTLY MATCHING term matchers in this tree.
+        """
+        
+        if not self.is_active():
+            return
+        
+        if id is None:
+            id = self.id()
+        elif id != self.id():
+            return
+        
+        t = self.term()
+        if t is None:
+            for c in self.children():
+                for t in c.matching_terms(id):
+                    yield t
+        else:
+            yield t
+    
+    def children(self):
+        """Returns a (possibly empty) list of the submatchers of this
+        matcher.
+        """
+        
+        return []
+    
     def replace(self, minquality=0):
         """Returns a possibly-simplified version of this matcher. For example,
         if one of the children of a UnionMatcher is no longer active, calling
     """
     
     def __init__(self, ids, weights=None, values=None, format=None,
-                 scorer=None, position=0, all_weights=None):
+                 scorer=None, position=0, all_weights=None, term=None):
         """
         :param ids: a list of doc IDs.
         :param weights: a list of weights corresponding to the list of IDs.
             format of the field.
         :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring
             the postings.
+        :param term: a ("fieldname", "text") tuple, or None if this is not a
+            term matcher.
         """
         
         self._ids = ids
         self._i = position
         self._format = format
         self._scorer = scorer
+        self._term = term
     
     def __repr__(self):
         return "<%s>" % self.__class__.__name__
     def is_active(self):
         return self._i < len(self._ids)
     
+    def term(self):
+        return self._term
+    
     def copy(self):
         return self.__class__(self._ids, self._weights, self._values,
                               self._format, self._scorer, self._i,
     def is_active(self):
         return self.child.is_active()
     
+    def children(self):
+        return [self.child]
+    
     def supports(self, astype):
         return self.child.supports(astype)
     
     def is_active(self):
         return self.current < len(self.matchers)
     
+    def children(self):
+        return [self.matchers[self.current]]
+    
     def _next_matcher(self):
         matchers = self.matchers
         while self.current < len(matchers) and not matchers[self.current].is_active():
                               boost=self.boost)
     
     def _replacement(self, newchild):
-        return self.__class__(newchild, self._ids, exclude=self._exclude, boost=self.boost)
+        return self.__class__(newchild, self._ids, exclude=self._exclude,
+                              boost=self.boost)
     
     def _find_next(self):
         child = self.child
     def __repr__(self):
         return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b)
 
+    def children(self):
+        return [self.a, self.b]
+
     def copy(self):
         return self.__class__(self.a.copy(), self.b.copy())
 
     
     def score(self):
         return self._score
-    
+
+
+
+
 
 #class PhraseMatcher(WrappingMatcher):
 #    """Matches postings where a list of sub-matchers occur next to each other

src/whoosh/query.py

             return m
         else:
             ids = array("I", m.all_ids())
-            return ListMatcher(ids, all_weights=self.score)
+            return ListMatcher(ids, all_weights=self.score, term=m.term())
     
 
 class BinaryQuery(CompoundQuery):

src/whoosh/searching.py

 class TimeLimit(Exception):
     pass
 
+class NoTermsException(Exception):
+    """Exception raised when you try to access matched terms on a
+    :class:`Results` object that was created without them. To record which
+    terms matched in which document, call the :meth:`Searcher.search` method
+    with ``terms=True``.
+    """
+    
+    message = "Results were created without recording terms"
 
 # Searcher class
 
                 yield docnum
     
     def search(self, q, limit=10, sortedby=None, reverse=False, groupedby=None,
-               optimize=True, filter=None, mask=None, groupids=True):
+               optimize=True, filter=None, mask=None, groupids=True,
+               terms=False):
         """Runs the query represented by the ``query`` object and returns a
         Results object.
         
             document numbers associated with that key. To map to a simple count
             of the number of documents instead of a list, use
             ``groupids=False``.
+        :param terms: if True, record which terms were found in each matching
+            document. You can use :meth:`Results.matched_terms` or
+            :meth:`Hit.matched_terms` to get the matched terms, or
+            :meth:`Hit.contains_term` to check whether a hit contains a
+            particular term.
         :rtype: :class:`Results`
         """
 
             raise ValueError("limit must be >= 1")
 
         collector = Collector(limit=limit, usequality=optimize,
-                              groupedby=groupedby, reverse=reverse,
-                              groupids=groupids)
+                              groupedby=groupedby, groupids=groupids,
+                              terms=terms)
         
         if sortedby:
-            return collector.sort(self, q, sortedby, allow=filter,
-                                  restrict=mask)
+            return collector.sort(self, q, sortedby, reverse=reverse,
+                                  allow=filter, restrict=mask)
         else:
             return collector.search(self, q, allow=filter, restrict=mask)
     
         
 
 class Collector(object):
-    def __init__(self, limit=10, usequality=True, replace=10, groupedby=None,
-                 timelimit=None, greedy=False, reverse=False, groupids=True):
-        """A Collector finds the matching documents, scores them, collects them
-        into a list, and produces a Results object from them.
-        
-        Normally you do not need to instantiate an instance of the base
-        Collector class, the :meth:`Searcher.search` method does that for you.
-        
-        If you create a custom Collector instance or subclass you can use its
-        ``search()`` method instead of :meth:`Searcher.search`::
-        
-            mycollector = MyCollector()
-            results = mycollector.search(mysearcher, myquery)
-        
-        **Do not** re-use or share Collector instances between searches. You
-        should create a new Collector instance for each search.
-        
-        To limit the amount of time a search can take, pass the number of
-        seconds to the ``timelimit`` keyword argument::
-        
-            # Limit the search to 4.5 seconds
-            col = Collector(timelimit=4.5, greedy=False)
-            # If this call takes more than 4.5 seconds, it will raise a
-            # whoosh.searching.TimeLimit exception
-            try:
-                r = searcher.search(myquery, collector=col)
-            except TimeLimit, tl:
-                # You can still retrieve partial results from the collector
-                r = col.results()
-        
-        If the ``greedy`` keyword is ``True``, the collector will finish adding
-        the most recent hit before raising the ``TimeLimit`` exception.
+    """A Collector finds the matching documents, scores them, collects them
+    into a list, and produces a Results object from them.
+    
+    Normally you do not need to instantiate an instance of the base
+    Collector class, the :meth:`Searcher.search` method does that for you.
+    
+    If you create a custom Collector instance or subclass you can use its
+    ``search()`` method instead of :meth:`Searcher.search`::
+    
+        mycollector = MyCollector()
+        results = mycollector.search(mysearcher, myquery)
+    
+    **Do not** re-use or share Collector instances between searches. You
+    should create a new Collector instance for each search.
+    
+    To limit the amount of time a search can take, pass the number of
+    seconds to the ``timelimit`` keyword argument::
+    
+        # Limit the search to 4.5 seconds
+        col = Collector(timelimit=4.5, greedy=False)
+        # If this call takes more than 4.5 seconds, it will raise a
+        # whoosh.searching.TimeLimit exception
+        try:
+            r = searcher.search(myquery, collector=col)
+        except TimeLimit:
+            # You can still retrieve partial results from the collector
+            r = col.results()
+    
+    If the ``greedy`` keyword is ``True``, the collector will finish adding
+    the most recent hit before raising the ``TimeLimit`` exception.
+    """
+    
+    def __init__(self, limit=10, usequality=True, groupedby=None,
+                 groupids=True, timelimit=None, greedy=False, terms=False,
+                 replace=10):
+        """
+        :param limit: the maximum number of hits to collect. If this is None,
+            collect all hits.
+        :param usequality: whether to use block quality optimizations when
+            available. This is mostly useful for debugging purposes.
+        :param groupedby: see :doc:`/facets` for information.
+        :param groupids: if True, saves lists of document IDs for facets. If
+            False, only saves a count of the number of documents in each group.
+        :param timelimit: the maximum amount of time (in possibly fractional
+            seconds) to allow for searching. If the search takes longer than
+            this, it will raise a ``TimeLimit`` exception.
+        :param greedy: if ``True``, the collector will finish adding the most
+            recent hit before raising the ``TimeLimit`` exception.
+        :param terms: if ``True``, record which terms matched in each document.
         """
         
         self.limit = limit
         self.replace = replace
         self.timelimit = timelimit
         self.greedy = greedy
-        self.reverse = reverse
         self.groupids = groupids
+        self.termlists = defaultdict(set) if terms else None
         
         self.facets = None
         if groupedby:
         does not use the final() method, etc.).
         """
         
-        use = (self.usequality and not searcher.weighting.use_final
+        use = (self.usequality
+               and not searcher.weighting.use_final
                and not self.should_add_all())
         if matcher:
             use = use and matcher.supports_block_quality()
         matcher = q.matcher(searcher)
         usequality = self.use_block_quality(searcher, matcher)
         
-        for score, offsetid in self.pull_matches(searcher, matcher, offset,
-                                                 scorefn, usequality):
+        for score, offsetid in self.pull_matches(matcher, offset, scorefn,
+                                                 usequality):
             # Document numbers are negated before putting them in the heap so
             # that higher document numbers have lower "priority" in the queue.
             # Lower document numbers should always come before higher document
                     heapreplace(items, (score, negated_offsetid))
                     self.minscore = items[0][0]
     
-    def pull_matches(self, searcher, matcher, offset, scorefn, usequality):
+    def pull_matches(self, matcher, offset, scorefn, usequality):
         """Low-level method that yields (score, docid) pairs from the given
         matcher. Called by :meth:`Collector.add_matches`.
         """
         replacecounter = 0
         timelimited = bool(self.timelimit)
         
+        termlists = self.termlists
+        recordterms = termlists is not None
+        if recordterms:
+            termmatchers = list(matcher.term_matchers())
+        else:
+            termmatchers = None
+        
         # A flag to indicate whether we should check block quality at the start
         # of the next loop
         checkquality = True
                         break
                     replacecounter = replace
                     minscore = self.minscore
+                    if recordterms:
+                        termmatchers = list(matcher.term_matchers())
                 replacecounter -= 1
             
             # Check whether the time limit expired since the last match
                     score = matcher.score()
                 yield (score, offsetid)
             
+            # If recording terms, add the document to the termlists
+            if recordterms:
+                for m in termmatchers:
+                    if m.is_active() and m.id() == id:
+                        termlists[m.term()].add(offsetid)
+            
             # Check whether the time limit expired
             if timelimited and self.timedout:
                 raise TimeLimit
             # again.
             checkquality = matcher.next()
     
-    def sort(self, searcher, q, sortedby, allow=None, restrict=None):
+    def sort(self, searcher, q, sortedby, reverse=False, allow=None,
+             restrict=None):
         self.searcher = searcher
         self.q = q
         self.docset = set()
         
         items = self.items
         limit = self.limit
-        heapfn = nlargest if self.reverse else nsmallest
+        heapfn = nlargest if reverse else nsmallest
         addall = self.should_add_all()
         
         facet = sorting.MultiFacet.from_sortedby(sortedby)
             catter.set_searcher(s, offset)
             matcher = q.matcher(s)
             
-            if catter.requires_matcher:
-                ls = list(self.pull_matches(s, matcher, offset,
+            if catter.requires_matcher or self.termlists is not None:
+                ls = list(self.pull_matches(matcher, offset,
                                             catter.key_for_matcher, False))
             else:
                 ls = list(self.pull_unscored_matches(matcher, offset,
         
         self.items = items
         self.runtime = now() - t
-        return self.results(scores=False)
+        return self.results(scores=False, reverse=reverse)
     
     def pull_unscored_matches(self, matcher, offset, keyfn):
         allow = self.allow
             if timelimited and self.timedout:
                 raise TimeLimit
     
-    def results(self, scores=True):
+    def results(self, scores=True, reverse=False):
         """Returns the current results from the collector. This is useful for
         getting the results out of a collector that was stopped by a time
         limit exception.
             # Sort by negated scores so that higher scores go first, then by
             # document number to keep the order stable when documents have the
             # same score
-            items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse)
+            items.sort(key=lambda x: (0 - x[0], x[1]))
         else:
-            items = sorted(self.items, reverse=self.reverse)
+            items = sorted(self.items, reverse=reverse)
         
         return Results(self.searcher, self.q, items, self.docset,
                        groups=self.groups, runtime=self.runtime,
-                       filter=self.allow, mask=self.restrict)
-
-
-class TermTrackingCollector(Collector):
-    """This is an experiment. For a more straightforward but possibly slightly
-    slower method for determining if a given term was found, see
-    :meth:`Results.contains_term`.
-    
-    This collector records which parts of the query matched which documents
-    in the final results. The results for each part of the query are available
-    as a dictionary in the ``catalog`` attribute of the collector after the
-    search, where the keys are representations of the parts of the query and
-    the values are sets of document numbers that matched that part of the
-    query.
-    
-    This feature is experimental and may change in future releases.
-    
-    How to choose a key to represent query objects in the ``catalog``
-    dictionary was not entirely clear. The current implementation uses the
-    unicode representation of the query object, which usually returns something
-    at least recognizable (for example, ``unicode(Term("f", u"a")) == u"f:a"``
-    and ``unicode(Prefix("f", "b")) == u"f:b*"``).
-    
-    >>> myparser = qparser.QueryParser("content", myindex.schema)
-    >>> myquery = myparser.parse(u"apple OR bear NOT camel")
-    >>> col = TermTrackingCollector()
-    >>> results = col.search(searcher, myquery)
-    >>> # The docnums in the results that contained "apple"
-    >>> col.catalog["content:apple"]
-    set([1, 2, 3])
-    >>> for hit in results:
-    ...     print hit.rank, ":", hit["title"]
-    ...     for key, docset in col.catalog.keys():
-    ...         if hit.docnum in docset:
-    ...             print "   - Contains", key
-    """
-    
-    # This collector works by rewriting the query with "TaggedQuery" wrappers
-    # around the leaf nodes before it searches. When base collector generates
-    # a matcher tree from the query tree, these wrappers "phone home" to this
-    # collector and register the leaf matchers. Then, when collecting hits, the
-    # collector checks with the leaf matchers at each hit to see if they are
-    # matching the current document.
-    
-    def __init__(self, *args, **kwargs):
-        super(TermTrackingCollector, self).__init__(*args, **kwargs)
-        self.catalog = {}
-    
-    def should_add_all(self):
-        # If you're using this collector, you need to examine all documents
-        return True
-    
-    def add_matches(self, searcher, q, offset, scorefn):
-        sup = super(TermTrackingCollector, self)
-        self.matchers = []
-        q = self._tag(q)
-        return sup.add_matches(searcher, q, offset, scorefn)
-    
-    def pull_matches(self, searcher, matcher, offset, scorefn, usequality):
-        super_method = super(TermTrackingCollector, self).pull_matches
-        
-        for score, offsetid in super_method(searcher, matcher, offset,
-                                            scorefn, usequality):
-            for key, m in self.matchers:
-                if m.is_active() and m.id() == offsetid - offset:
-                    if key not in self.catalog:
-                        self.catalog[key] = set()
-                    self.catalog[key].add(offsetid)
-            
-            yield (score, offsetid)
-    
-    def _tag(self, q):
-        # Takes a query and returns a copy of the query with a TaggedQuery
-        # wrapper around any leaf nodes in the query tree
-        if isinstance(q, query.Not):
-            return q
-        elif q.is_leaf():
-            return TermTrackingCollector.TaggedQuery(q, self)
-        else:
-            return q.apply(self._tag)
-        
-    def _tag_matcher(self, key, m):
-        # This method is called from the TaggedQuery wrappers that the _tag
-        # method added to the query
-        self.matchers.append((key, m))
-        
-    class TaggedQuery(query.WrappingQuery):
-        # The only purpose of this query wrapper is to "call home" to the
-        # TrackingCollector instance when the child query generates a matcher
-        # so the TrackingCollector can register it
-        
-        def __init__(self, child, tracker):
-            self.child = child
-            self.tracker = tracker
-        
-        def matcher(self, searcher):
-            m = self.child.matcher(searcher)
-            self.tracker._tag_matcher(text_type(self.child), m)
-            return m
+                       filter=self.allow, mask=self.restrict,
+                       termlists=self.termlists)
 
 
 class Results(object):
     """
 
     def __init__(self, searcher, q, top_n, docset, groups=None, runtime=-1,
-                 filter=None, mask=None):
+                 filter=None, mask=None, termlists=None):
         """
         :param searcher: the :class:`Searcher` object that produced these
             results.
         self.runtime = runtime
         self._filter = filter
         self._mask = mask
-        self._terms = None
+        self._termlists = termlists
         
         self.fragmenter = highlight.ContextFragmenter()
         self.fragment_scorer = highlight.BasicFragmentScorer()
         
         return ((docnum, score) for score, docnum in self.top_n)
 
-    def terms(self):
-        if self._terms is  None:
-            self._terms = self.q.existing_terms(self.searcher.reader())
-        return self._terms
-
     def fields(self, n):
         """Returns the stored fields for the document at the ``n`` th position
         in the results. Use :meth:`Results.docnum` if you want the raw
         """
         return self.top_n[n][1]
 
+    def matched_terms(self):
+        """Returns the set of ``("fieldname", "text")`` tuples representing
+        terms from the query that matched one or more of the TOP N documents
+        (this does not report terms for documents that match the query but did
+        not score high enough to make the top N results). You can compare this
+        set to the terms from the original query to find terms which didn't
+        occur in any matching documents.
+        
+        This is only valid if you used ``terms=True`` in the search call to
+        record matching terms. Otherwise it will raise an exception.
+        
+        >>> q = myparser.parse("alfa OR bravo OR charlie")
+        >>> results = searcher.search(q, terms=True)
+        >>> results.matched_terms()
+        set([("content", "alfa"), ("content", "charlie")])
+        >>> q.all_terms() - results.matched_terms()
+        set([("content", "bravo")])
+        """
+        
+        if self._termlists is None:
+            raise NoTermsException
+        return set(self._termlists.keys())
+
     def highlights(self, n, fieldname, text=None, top=3, fragmenter=None,
-                   formatter=None, order=highlight.FIRST):
+                   formatter=None, order=highlight.FIRST, force=True):
         """Returns highlighted snippets for the document in the Nth position
         in the results. It is usually more convenient to call this method on a
         Hit object instead of the Results.
         fragmenter = fragmenter or self.fragmenter
         formatter = formatter or self.formatter
         
-        terms = set(ttext for fname, ttext in self.terms()
-                    if fname == fieldname)
+        if self._termlists is None:
+            terms = self.q.existing_terms(self.searcher.reader())
+        else:
+            terms = self.matched_terms()
+        terms = set(ttext for fname, ttext in terms if fname == fieldname)
+        if not terms and not force:
+            return None
+        
         return highlight.highlight(text, terms, analyzer, fragmenter,
-                                   formatter, top=top,
-                                   scorer=self.fragment_scorer, order=order)
+                                   formatter, top=top, order=order,
+                                   scorer=self.fragment_scorer)
 
     def key_terms(self, fieldname, docs=10, numterms=5,
                   model=classify.Bo1Model, normalize=True):
         self.docset = docs | otherdocs
         self.top_n = arein + notin + other
 
-    def contains_term(self, fieldname, text):
-        """Returns True if the given term exists in at least one of the
-        documents in this results set.
-        """
-        
-        docset = self.docs()
-        minid = min(docset)
-        maxid = max(docset)
-        
-        field = self.searcher.schema[fieldname]
-        text = field.to_text(text)
-        postings = self.searcher.postings(fieldname, text)
-        postings.skip_to(minid)
-        for id in postings.all_ids():
-            if id in docset:
-                return True
-            if id >= maxid:
-                break
-        return False
-
 
 class Hit(object):
     """Represents a single search result ("hit") in a Results object.
             self._fields = self.searcher.stored_fields(self.docnum)
         return self._fields
     
+    def matched_terms(self):
+        """Returns the set of ``("fieldname", "text")`` tuples representing
+        terms from the query that matched in this document. You can
+        compare this set to the terms from the original query to find terms
+        which didn't occur in this document.
+        
+        This is only valid if you used ``terms=True`` in the search call to
+        record matching terms. Otherwise it will raise an exception.
+        
+        >>> q = myparser.parse("alfa OR bravo OR charlie")
+        >>> results = searcher.search(q, terms=True)
+        >>> for hit in results:
+        ...   print(hit["title"])
+        ...   print("Contains:", hit.matched_terms())
+        ...   print("Doesn't contain:", q.all_terms() - hit.matched_terms())
+        """
+        
+        termlists = self.results._termlists
+        if termlists is None:
+            raise NoTermsException
+        
+        # termlists maps terms->set of docnums, so we have to check every term
+        # to see if this document is in its list
+        s = set()
+        for term in termlists.keys():
+            if self.docnum in termlists[term]:
+                s.add(term)
+        return s
+    
     def highlights(self, fieldname, text=None, top=3, fragmenter=None,
-                   formatter=None, order=highlight.FIRST):
+                   formatter=None, order=highlight.FIRST, force=True):
         """Returns highlighted snippets from the given field::
         
             r = searcher.search(myquery)
             :func:`whoosh.highlight.LONGER`,
             :func:`whoosh.highlight.SHORTER`, or a custom sorting function. The
             default is ``highlight.FIRST``.
+        :param force: if True (the default), returns "highlights" even if the
+            document does not contain any matching terms. If False, returns
+            None instead of highlights when the document does not contain any
+            matching terms. This can save time by avoiding retokenizing large
+            amounts of text.
         """
         
         return self.results.highlights(self.rank, fieldname, text=text,
                                        top=top, fragmenter=fragmenter,
-                                       formatter=formatter, order=order)
+                                       formatter=formatter, order=order,
+                                       force=force)
     
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
                        model=classify.Bo1Model, normalize=True, filter=None):
                                        top=top, numterms=numterms, model=model,
                                        normalize=normalize, filter=filter)
     
+    def contains_term(self, fieldname, text):
+        """Returns True if the given query term exists in this document. This
+        only works for terms that were in the original query.
+        """
+        
+        termlists = self.results._termlists
+        if termlists is not None:
+            term = (fieldname, text)
+            if term in termlists:
+                docset = termlists[term]
+                return self.docnum in docset
+        
+        return False
+    
     def __repr__(self):
         return "<%s %r>" % (self.__class__.__name__, self.fields())
     

src/whoosh/sorting.py

                    groupedby=None):
         from whoosh.searching import Collector
         
-        collector = Collector(limit=limit, groupedby=groupedby, reverse=reverse)
-        return collector.sort(self.searcher, q, self.multi, allow=filter,
-                              restrict=mask)
+        collector = Collector(limit=limit, groupedby=groupedby)
+        return collector.sort(self.searcher, q, self.multi, reverse=reverse,
+                              allow=filter, restrict=mask)
     
 
 # Faceting objects

tests/test_highlighting.py

         # Parse the user query
         parser = qparser.QueryParser("title", schema=ix.schema)
         q = parser.parse(u("man"))
-        r = s.search(q)
+        r = s.search(q, terms=True)
         assert_equal(len(r), 2)
         
         r.fragmenter = highlight.WholeFragmenter()

tests/test_matching.py

 
 from nose.tools import assert_equal, assert_not_equal  #@UnresolvedImport
 
-from whoosh import fields
+from whoosh import fields, matching, query
 from whoosh.compat import u
 from whoosh.filedb.filestore import RamStorage
-from whoosh.matching import *
 from whoosh.query import And, Term
-from whoosh.util import make_binary_tree
+from whoosh.util import make_binary_tree, permutations
 
 
 def _keys(searcher, docnums):
     return sorted([searcher.stored_fields(docnum)['key'] for docnum in docnums])
 
 def test_nullmatcher():
-    nm = NullMatcher()
+    nm = matching.NullMatcher()
     assert not nm.is_active()
     assert_equal(list(nm.all_ids()), [])
 
 def test_listmatcher():
     ids = [1, 2, 5, 9, 10]
     
-    lm = ListMatcher(ids)
+    lm = matching.ListMatcher(ids)
     ls = []
     while lm.is_active():
         ls.append((lm.id(), lm.score()))
         lm.next()
     assert_equal(ls, [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)])
     
-    lm = ListMatcher(ids)
+    lm = matching.ListMatcher(ids)
     assert_equal(list(lm.all_ids()), ids)
     
-    lm = ListMatcher(ids, position=3)
+    lm = matching.ListMatcher(ids, position=3)
     ls = []
     while lm.is_active():
         ls.append(lm.id())
         lm.next()
     assert_equal(ls, [9, 10])
     
-    lm = ListMatcher(ids)
+    lm = matching.ListMatcher(ids)
     for _ in xrange(3):
         lm.next()
     lm = lm.copy()
     assert_equal(ls, [9, 10])
 
 def test_wrapper():
-    wm = WrappingMatcher(ListMatcher([1, 2, 5, 9, 10]), boost=2.0)
+    wm = matching.WrappingMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), boost=2.0)
     ls = []
     while wm.is_active():
         ls.append((wm.id(), wm.score()))
     assert_equal(ls, [(1, 2.0), (2, 2.0), (5, 2.0), (9, 2.0), (10, 2.0)])
     
     ids = [1, 2, 5, 9, 10]
-    wm = WrappingMatcher(ListMatcher(ids), boost=2.0)
+    wm = matching.WrappingMatcher(matching.ListMatcher(ids), boost=2.0)
     assert_equal(list(wm.all_ids()), ids)
 
 def test_filter():
-    lm = lambda: ListMatcher(list(range(2, 10)))
+    lm = lambda: matching.ListMatcher(list(range(2, 10)))
     
-    fm = FilterMatcher(lm(), frozenset([3, 9]))
+    fm = matching.FilterMatcher(lm(), frozenset([3, 9]))
     assert_equal(list(fm.all_ids()), [3, 9])
     
-    fm = FilterMatcher(lm(), frozenset([1, 5, 9, 13]))
+    fm = matching.FilterMatcher(lm(), frozenset([1, 5, 9, 13]))
     assert_equal(list(fm.all_ids()), [5, 9])
 
 def test_exclude():
-    em = FilterMatcher(ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
+    em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     assert_equal(list(em.all_ids()), [1, 5, 10])
     
-    em = FilterMatcher(ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
+    em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     assert_equal(list(em.all_ids()), [1, 5, 10])
     
-    em = FilterMatcher(ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
+    em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     em.next()
     em.next()
     em = em.copy()
     assert_equal(ls, [10])
 
 def test_simple_union():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    um = UnionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    um = matching.UnionMatcher(lm1, lm2)
     ls = []
     while um.is_active():
         ls.append((um.id(), um.score()))
         um.next()
     assert_equal(ls, [(0, 1.0), (1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    um = UnionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    um = matching.UnionMatcher(lm1, lm2)
     assert_equal(list(um.all_ids()), [0, 1, 4, 10, 20, 90])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    um = UnionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    um = matching.UnionMatcher(lm1, lm2)
     um.next()
     um.next()
     um = um.copy()
     assert_equal(ls, [4, 10, 20, 90])
     
 def test_simple_intersection():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    im = IntersectionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    im = matching.IntersectionMatcher(lm1, lm2)
     ls = []
     while im.is_active():
         ls.append((im.id(), im.score()))
         im.next()
     assert_equal(ls, [(4, 2.0), (20, 2.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    im = IntersectionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    im = matching.IntersectionMatcher(lm1, lm2)
     assert_equal(list(im.all_ids()), [4, 20])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    im = IntersectionMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    im = matching.IntersectionMatcher(lm1, lm2)
     im.next()
     im.next()
     im = im.copy()
     assert_equal(ls, [])
 
 def test_andnot():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    anm = AndNotMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    anm = matching.AndNotMatcher(lm1, lm2)
     ls = []
     while anm.is_active():
         ls.append((anm.id(), anm.score()))
         anm.next()
     assert_equal(ls, [(1, 1.0), (10, 1.0), (90, 1.0)])
     
-    echo_lm = ListMatcher([0, 1, 2, 3, 4])
-    bravo_lm = ListMatcher([0, 1])
-    anm = AndNotMatcher(echo_lm, bravo_lm)
+    echo_lm = matching.ListMatcher([0, 1, 2, 3, 4])
+    bravo_lm = matching.ListMatcher([0, 1])
+    anm = matching.AndNotMatcher(echo_lm, bravo_lm)
     assert_equal(list(anm.all_ids()), [2, 3, 4])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    anm = AndNotMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    anm = matching.AndNotMatcher(lm1, lm2)
     assert_equal(list(anm.all_ids()), [1, 10, 90])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    anm = AndNotMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    anm = matching.AndNotMatcher(lm1, lm2)
     anm.next()
     anm.next()
     anm = anm.copy()
     assert_equal(ls, [90])
 
 def test_require():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    rm = RequireMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    rm = matching.RequireMatcher(lm1, lm2)
     ls = []
     while rm.is_active():
         ls.append((rm.id(), rm.score()))
         rm.next()
     assert_equal(ls, [(4, 1.0), (20, 1.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    rm = RequireMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    rm = matching.RequireMatcher(lm1, lm2)
     assert_equal(list(rm.all_ids()), [4, 20])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    rm = RequireMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    rm = matching.RequireMatcher(lm1, lm2)
     rm.next()
     rm.next()
     rm = rm.copy()
     assert_equal(ls, [])
 
 def test_andmaybe():
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    amm = AndMaybeMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    amm = matching.AndMaybeMatcher(lm1, lm2)
     ls = []
     while amm.is_active():
         ls.append((amm.id(), amm.score()))
         amm.next()
     assert_equal(ls, [(1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    amm = AndMaybeMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    amm = matching.AndMaybeMatcher(lm1, lm2)
     assert_equal(list(amm.all_ids()), [1, 4, 10, 20, 90])
     
-    lm1 = ListMatcher([1, 4, 10, 20, 90])
-    lm2 = ListMatcher([0, 4, 20])
-    amm = AndMaybeMatcher(lm1, lm2)
+    lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
+    lm2 = matching.ListMatcher([0, 4, 20])
+    amm = matching.AndMaybeMatcher(lm1, lm2)
     amm.next()
     amm.next()
     amm = amm.copy()
             assert_equal(_keys(s, ids1), target)
 
 def test_union():
-    s1 = ListMatcher([1, 2, 3, 4, 5, 6, 7, 8])
-    s2 = ListMatcher([2, 4, 8, 10, 20, 30])
-    s3 = ListMatcher([10, 100, 200])
+    s1 = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8])
+    s2 = matching.ListMatcher([2, 4, 8, 10, 20, 30])
+    s3 = matching.ListMatcher([10, 100, 200])
     target = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 100, 200]
-    um = UnionMatcher(s1, UnionMatcher(s2, s3))
+    um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3))
     assert_equal(target, list(um.all_ids()))
     
 def test_union_scores():
-    s1 = ListMatcher([1, 2, 3])
-    s2 = ListMatcher([2, 4, 8])
-    s3 = ListMatcher([2, 3, 8])
+    s1 = matching.ListMatcher([1, 2, 3])
+    s2 = matching.ListMatcher([2, 4, 8])
+    s3 = matching.ListMatcher([2, 3, 8])
     target = [(1, 1.0), (2, 3.0), (3, 2.0), (4, 1.0), (8, 2.0)]
-    um = UnionMatcher(s1, UnionMatcher(s2, s3))
+    um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3))
     result = []
     while um.is_active():
         result.append((um.id(), um.score()))
         for _ in xrange(randint(*clauselimits)):
             nums = sample(vals, randint(*rangelimits))
             target = target.union(nums)
-            matchers.append(ListMatcher(sorted(nums)))
+            matchers.append(matching.ListMatcher(sorted(nums)))
         target = sorted(target)
-        um = make_binary_tree(UnionMatcher, matchers)
+        um = make_binary_tree(matching.UnionMatcher, matchers)
         assert_equal(list(um.all_ids()), target)
 
 def test_inverse():
-    s = ListMatcher([1, 5, 10, 11, 13])
-    inv = InverseMatcher(s, 15)
+    s = matching.ListMatcher([1, 5, 10, 11, 13])
+    inv = matching.InverseMatcher(s, 15)
     ids = []
     while inv.is_active():
         ids.append(inv.id())
     assert_equal(ids, [0, 2, 3, 4, 6, 7, 8, 9, 12, 14])
     
 def test_inverse_skip():
-    s = ListMatcher([1, 5, 10, 11, 13])
-    inv = InverseMatcher(s, 15)
+    s = matching.ListMatcher([1, 5, 10, 11, 13])
+    inv = matching.InverseMatcher(s, 15)
     inv.skip_to(8)
     
     ids = []
     assert_equal([8, 9, 12, 14], ids)
 
 def test_empty_andnot():
-    pos = NullMatcher()
-    neg = NullMatcher()
-    anm = AndNotMatcher(pos, neg)
+    pos = matching.NullMatcher()
+    neg = matching.NullMatcher()
+    anm = matching.AndNotMatcher(pos, neg)
     assert not anm.is_active()
     assert_equal(list(anm.all_ids()), [])
     
-    pos = ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-    neg = NullMatcher()
-    ans = AndNotMatcher(pos, neg)
+    pos = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    neg = matching.NullMatcher()
+    ans = matching.AndNotMatcher(pos, neg)
     ids = list(ans.all_ids())
     assert_equal(ids, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
 
         negset = frozenset(negs)
         matched = [n for n in rng if n not in negset]
         
-        pos = ListMatcher(rng)
-        neg = ListMatcher(negs)
+        pos = matching.ListMatcher(rng)
+        neg = matching.ListMatcher(negs)
         
-        anm = AndNotMatcher(pos, neg)
+        anm = matching.AndNotMatcher(pos, neg)
         ids = list(anm.all_ids())
         assert_equal(ids, matched)
 
+def test_current_terms():
+    domain = u("alfa bravo charlie delta").split()
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    for ls in permutations(domain, 3):
+        w.add_document(text=" ".join(ls), _stored_text=ls)
+    w.commit()
+    
+    with ix.searcher() as s:
+        q = query.And([query.Term("text", "alfa"), query.Term("text", "charlie")])
+        m = q.matcher(s)
+    
+        while m.is_active():
+            assert_equal(sorted(m.matching_terms()), [("text", "alfa"), ("text", "charlie")])
+            m.next()
 
 
 
+
+
+
+
+

tests/test_queries.py

     w.commit()
     
     s = ix.searcher()
-    r = s.search(Term('content', u('train')))
+    r = s.search(Term('content', u('train')), terms=True)
     assert_equal(len(r), 1)
     assert_equal(r[0]["id"], "2")
     assert_equal(r[0].highlights("content"), 'India for a life changing <b class="match term0">train</b> journey')

tests/test_results.py

     with ix.searcher() as s:
         qp = qparser.QueryParser("text", ix.schema)
         q = qp.parse(u("key"))
-        r = s.search(q)
+        r = s.search(q, terms=True)
         r.formatter = highlight.UppercaseFormatter()
         
         assert_equal(sorted([hit.highlights("text") for hit in r]), sorted(target))
     w.commit()
     
     q = query.Or([query.Term("text", "bravo"), query.Term("text", "charlie")])
-    r = ix.searcher().search(q)
-    assert not r.contains_term("text", "alfa")
-    assert r.contains_term("text", "bravo")
-    assert r.contains_term("text", "charlie")
-    assert r.contains_term("text", "delta")
-    assert r.contains_term("text", "echo")
-    assert not r.contains_term("text", "foxtrot")
+    r = ix.searcher().search(q, terms=True)
+    for hit in r:
+        assert not hit.contains_term("text", "alfa")
+        assert (hit.contains_term("text", "bravo")
+                or hit.contains_term("text", "charlie"))
+        assert not hit.contains_term("text", "foxtrot")
 
-
-
+def test_terms():
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(text=u("alfa sierra tango"))
+    w.add_document(text=u("bravo charlie delta"))
+    w.add_document(text=u("charlie delta echo"))
+    w.add_document(text=u("delta echo foxtrot"))
+    w.commit()
+    
+    qp = qparser.QueryParser("text", ix.schema)
+    q = qp.parse(u("(bravo AND charlie) OR foxtrot OR missing"))
+    r = ix.searcher().search(q, terms=True)
+    
+    def txts(tset):
+        return sorted(t[1] for t in tset)
+    
+    assert_equal(txts(r.matched_terms()), ["bravo", "charlie", "foxtrot"])
+    for hit in r:
+        value = hit["text"]
+        for txt in txts(hit.matched_terms()):
+            assert txt in value
+    
 
 
 

tests/test_searching.py

             if "bravo charlie delta" in hit["title"]:
                 assert hit.score > 100.0
 
-def test_trackingcollector():
-    schema = fields.Schema(text=fields.TEXT(stored=True))
-    ix = RamStorage().create_index(schema)
-    domain = u("alfa bravo charlie delta echo").split()
-    w = ix.writer()
-    for ls in list(permutations(domain, 3))[::2]:
-        w.add_document(text=u(" ").join(ls))
-    w.commit()
-    
-    with ix.searcher() as s:
-        q = Or([Term("text", u("alfa")),Term("text", u("bravo")),
-                Not(Term("text", "charlie"))])
-        
-        col = searching.TermTrackingCollector()
-        _ = col.search(s, q)
-        
-        for docnum in col.catalog["text:alfa"]:
-            words = s.stored_fields(docnum)["text"].split()
-            assert "alfa" in words
-            assert "charlie" not in words
-        
-        for docnum in col.catalog["text:bravo"]:
-            words = s.stored_fields(docnum)["text"].split()
-            assert "bravo" in words
-            assert "charlie" not in words
-
 def test_filter():
     schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
     ix = RamStorage().create_index(schema)