1. pombredanne
  2. whoosh

Commits

Matt Chaput  committed 40cafca

Improved performance when creating tons of SpanNear queries.
Fixes issue #345.

The main performance difference in the test code was actually due to
a fix: in Whoosh 2.5, the Phrase query was fixed to call to_bytes() on
the phrase words. When making phrases 100s or 1000s of words long, the
difference was very noticeable.

Made two fixes to improve the performance:

1. Fixed logic in Phrase.matcher() to fail early on a missing word,
instead of calling to_bytes() on all words and THEN checking if
they're missing. This made a huge difference in the test code because
it was mostly searching for non-existant words.

2. Added a new SpanNear2 query type and changed Phrase to use it
instead of SpanNear. SpanNear2 has slightly less overhead.

These two changes make the test code actually run faster than in 2.4.

  • Participants
  • Parent commits 54c0271
  • Branches default

Comments (0)

Files changed (4)

File docs/source/api/query.rst

View file
  • Ignore whitespace
 .. autoclass:: SpanQuery
 .. autoclass:: SpanFirst
 .. autoclass:: SpanNear
+.. autoclass:: SpanNear2
 .. autoclass:: SpanNot
 .. autoclass:: SpanOr
 .. autoclass:: SpanContains

File src/whoosh/query/positional.py

View file
  • Ignore whitespace
         return self._and_query().estimate_min_size(ixreader)
 
     def matcher(self, searcher, context=None):
+        from whoosh.query import Term, SpanNear2
+
         fieldname = self.fieldname
-        reader = searcher.reader()
-
         if fieldname not in searcher.schema:
             return matching.NullMatcher()
+
         field = searcher.schema[fieldname]
-
-        words = [field.to_bytes(word) for word in self.words]
-
-        # Shortcut the query if one of the words doesn't exist.
-        for word in words:
-            if (fieldname, word) not in reader:
-                return matching.NullMatcher()
-
         if not field.format or not field.format.supports("positions"):
             raise qcore.QueryError("Phrase search: %r field has no positions"
                                    % self.fieldname)
 
-        # Construct a tree of SpanNear queries representing the words in the
-        # phrase and return its matcher
-        from whoosh.query.spans import SpanNear
+        terms = []
+        # Build a list of Term queries from the words in the phrase
+        reader = searcher.reader()
+        for word in self.words:
+            word = field.to_bytes(word)
+            if (fieldname, word) not in reader:
+                # Shortcut the query if one of the words doesn't exist.
+                return matching.NullMatcher()
+            terms.append(Term(fieldname, word))
 
-        q = SpanNear.phrase(fieldname, words, slop=self.slop)
+        # Create the equivalent SpanNear2 query from the terms
+        q = SpanNear2(terms, slop=self.slop, ordered=True, mindist=1)
+        # Get the matcher
         m = q.matcher(searcher, context)
+
         if self.boost != 1.0:
             m = matching.WrappingMatcher(m, boost=self.boost)
         return m

File src/whoosh/query/spans.py

View file
  • Ignore whitespace
             return self.start - span.end
 
 
+def bisect_spans(spans, start):
+    lo = 0
+    hi = len(spans)
+    while lo < hi:
+        mid = (lo + hi) // 2
+        if spans[mid].start < start:
+            lo = mid + 1
+        else:
+            hi = mid
+    return lo
+
+
 # Base matchers
 
 class SpanWrappingMatcher(wrappers.WrappingMatcher):
     def copy(self):
         return self.__class__(self.a.copy(), self.b.copy())
 
+    def depth(self):
+        return 1 + max(self.a.depth(), self.b.depth())
+
     def replace(self, minquality=0):
         # TODO: fix this
         if not self.is_active():
 
 
 class SpanNear(SpanQuery):
-    """Matches queries that occur near each other. By default, only matches
+    """
+    Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2
+    takes a list of sub-queries instead of requiring you to create a binary
+    tree of query objects.
+
+    Matches queries that occur near each other. By default, only matches
     queries that occur right next to each other (slop=1) and in order
     (ordered=True).
 
 
                     # Check the distance between the spans
                     dist = aspan.distance_to(bspan)
-                    if dist >= mindist and dist <= slop:
+                    if mindist <= dist <= slop:
                         spans.add(aspan.to(bspan))
 
             return sorted(spans)
 
 
+class SpanNear2(SpanQuery):
+    """
+    Matches queries that occur near each other. By default, only matches
+    queries that occur right next to each other (slop=1) and in order
+    (ordered=True).
+
+    New code should use this query type instead of :class:`SpanNear`.
+
+    (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of
+    requiring you to build a binary tree of query objects. This query should
+    also be slightly faster due to less overhead.)
+
+    For example, to find documents where "whoosh" occurs next to "library"
+    in the "text" field::
+
+        from whoosh import query, spans
+        t1 = query.Term("text", "whoosh")
+        t2 = query.Term("text", "library")
+        q = spans.SpanNear2([t1, t2])
+
+    To find documents where "whoosh" occurs at most 5 positions before
+    "library"::
+
+        q = spans.SpanNear2([t1, t2], slop=5)
+
+    To find documents where "whoosh" occurs at most 5 positions before or after
+    "library"::
+
+        q = spans.SpanNear2(t1, t2, slop=5, ordered=False)
+    """
+
+    def __init__(self, qs, slop=1, ordered=True, mindist=1):
+        """
+        :param qs: a sequence of sub-queries to match.
+        :param slop: the number of positions within which the queries must
+            occur. Default is 1, meaning the queries must occur right next
+            to each other.
+        :param ordered: whether a must occur before b. Default is True.
+        :pram mindist: the minimum distance allowed between the queries.
+        """
+
+        self.qs = qs
+        self.slop = slop
+        self.ordered = ordered
+        self.mindist = mindist
+
+    def __repr__(self):
+        return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
+                % (self.__class__.__name__, self.qs, self.slop, self.ordered,
+                   self.mindist))
+
+    def __eq__(self, other):
+        return (other and self.__class__ == other.__class__
+                and self.qs == other.qs and self.slop == other.slop
+                and self.ordered == other.ordered
+                and self.mindist == other.mindist)
+
+    def __hash__(self):
+        h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist)
+        for q in self.qs:
+            h ^= hash(q)
+        return h
+
+    def is_leaf(self):
+        return False
+
+    def children(self):
+        return self.qs
+
+    def apply(self, fn):
+        return self.__class__([fn(q) for q in self.qs], slop=self.slop,
+                              ordered=self.ordered, mindist=self.mindist)
+
+    def matcher(self, searcher, context=None):
+        ms = [q.matcher(searcher, context) for q in self.qs]
+        return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered,
+                                     mindist=self.mindist)
+
+    class SpanNear2Matcher(SpanWrappingMatcher):
+        def __init__(self, ms, slop=1, ordered=True, mindist=1):
+            self.ms = ms
+            self.slop = slop
+            self.ordered = ordered
+            self.mindist = mindist
+            isect = make_binary_tree(binary.IntersectionMatcher, ms)
+            super(SpanNear2.SpanNear2Matcher, self).__init__(isect)
+
+        def copy(self):
+            return self.__class__([m.copy() for m in self.ms], slop=self.slop,
+                                  ordered=self.ordered, mindist=self.mindist)
+
+        def replace(self, minquality=0):
+            # TODO: fix this
+            if not self.is_active():
+                return mcore.NullMatcher()
+            return self
+
+        def _get_spans(self):
+            slop = self.slop
+            mindist = self.mindist
+            ordered = self.ordered
+            ms = self.ms
+
+            aspans = ms[0].spans()
+            i = 1
+            while i < len(ms) and aspans:
+                bspans = ms[i].spans()
+                spans = set()
+                for aspan in aspans:
+                    # Use a binary search to find the first position we should
+                    # start looking for possible matches
+                    if ordered:
+                        start = aspan.start
+                    else:
+                        start = max(0, aspan.start - slop)
+                    j = bisect_spans(bspans, start)
+
+                    while j < len(bspans):
+                        bspan = bspans[j]
+                        j += 1
+
+                        if (bspan.end < aspan.start - slop
+                            or (ordered and aspan.start > bspan.start)):
+                            # B is too far in front of A, or B is in front of A
+                            # *at all* when ordered is True
+                            continue
+                        if bspan.start > aspan.end + slop:
+                            # B is too far from A. Since spans are listed in
+                            # start position order, we know that all spans after
+                            # this one will also be too far.
+                            break
+
+                        # Check the distance between the spans
+                        dist = aspan.distance_to(bspan)
+                        if mindist <= dist <= slop:
+                            spans.add(aspan.to(bspan))
+                aspans = sorted(spans)
+                i += 1
+
+            if i == len(ms):
+                return aspans
+            else:
+                return []
+
+
 class SpanOr(SpanQuery):
     """Matches documents that match any of a list of sub-queries. Unlike
     query.Or, this class merges together matching spans from the different
 
         def _get_spans(self):
             return self.a.spans()
+
+
+
+
+

File tests/test_searching.py

View file
  • Ignore whitespace
         q = query.Phrase("value", [u("little"), u("miss"), u("muffet"),
                                    u("sat"), u("tuffet")])
         m = q.matcher(s)
-        assert m.__class__.__name__ == "SpanNearMatcher"
+        assert m.__class__.__name__ == "SpanNear2Matcher"
 
         r = s.search(q)
         assert names(r) == ["A"]
         assert r[0]["name"] == u("close")
 
 
-
-