Commits

Matt Chaput committed 7f4d75c

Added Sequence query type. Added ComplexPhrasePlugin to create sequence queries.

Comments (0)

Files changed (3)

src/whoosh/qparser/plugins.py

         def __init__(self, maxdist):
             # Store the caller-supplied value unchanged on the instance.
             # NOTE(review): presumably the maximum edit distance for fuzzy
             # matching -- confirm against the enclosing class.
             self.maxdist = maxdist
 
+        def __repr__(self):
+            # Render as "<~N>", where N is self.maxdist formatted with %d.
+            return "<~%d>" % (self.maxdist,)
+
     class FuzzyTermNode(syntax.TextNode):
         qclass = query.FuzzyTerm
 
             self.endchar = wordnode.endchar
             self.maxdist = maxdist
 
+        def r(self):
+            # Text form of this node: self.text, a space, then "~" followed
+            # by self.maxdist formatted with %d.
+            return "%s ~%d" % (self.text, self.maxdist)
+
         def query(self, parser):
             q = syntax.TextNode.query(self, parser)
             q.maxdist = self.maxdist
         return [(self.PhraseTagger(self.expr), 0)]
 
 
+class ComplexPhrasePlugin(Plugin):
+    """Adds the ability to group arbitrary queries inside double quotes to
+    produce a query matching the individual sub-queries in sequence.
+    
+    To enable this plugin, first remove the default PhrasePlugin, then add
+    this plugin::
+    
+        qp = qparser.QueryParser("field", my_schema)
+        qp.remove_plugin_class(qparser.PhrasePlugin)
+        qp.add_plugin(qparser.ComplexPhrasePlugin())
+    
+    This enables parsing "phrases" such as::
+    
+        "(jon OR john OR jonathan~1) smith*"
+    """
+
+    def __init__(self, expr='["]'):
+        """
+        :param expr: a regular expression for the marker at the start and end
+            of a phrase. The default is the double-quotes character.
+        """
+
+        self.expr = expr
+
+    class ComplexPhraseNode(syntax.GroupNode):
+        qclass = query.Sequence
+
+    class QuoteNode(syntax.MarkerNode):
+        pass
+
+    class QuoteTagger(RegexTagger):
+        def create(self, parser, match):
+            return ComplexPhrasePlugin.QuoteNode()
+
+    def taggers(self, parser):
+        return [(self.QuoteTagger(self.expr), 0)]
+
+    def filters(self, parser):
+        return [(self.do_quotes, 650)]
+
+    def do_quotes(self, parser, group):
+        newgroup = group.empty_copy()
+        phrase = None
+        for node in group:
+            if isinstance(node, syntax.GroupNode):
+                node = self.do_quotes(parser, node)
+
+            if isinstance(node, self.QuoteNode):
+                if phrase is None:
+                    # Start a new phrase
+                    phrase = []
+                else:
+                    # End the current phrase
+                    newgroup.append(self.ComplexPhraseNode(phrase))
+                    phrase = None
+            elif phrase is None:
+                # Not in a phrase, add directly
+                newgroup.append(node)
+            else:
+                # In a phrase, add it to the buffer
+                phrase.append(node)
+
+        # We can end up with buffered nodes if there was an unbalanced quote;
+        # just put the nodes back into the group
+        if phrase is not None:
+            newgroup.extend(phrase)
+
+        return newgroup
+
+
 class RangePlugin(Plugin):
     """Adds the ability to specify term ranges.
     """

src/whoosh/query/positional.py

 from whoosh.query import qcore, terms, nary
 
 
+class Sequence(nary.CompoundQuery):
+    """Matches documents containing a list of sub-queries in adjacent
+    positions.
+    
+    This object has no sanity check to prevent you from using queries in
+    different fields.
+    """
+
+    JOINT = " NEAR "
+    intersect_merge = True
+
+    def __init__(self, subqueries, slop=1, ordered=True, boost=1.0):
+        """
+        :param subqueries: a list of :class:`whoosh.query.Query` objects to
+            match in sequence.
+        :param slop: the maximum difference in position allowed between the
+            subqueries.
+        :param ordered: if True, the position differences between subqueries
+            must be positive (that is, each subquery in the list must appear
+            after the previous subquery in the document).
+        :param boost: a boost factor to add to the score of documents matching
+            this query.
+        """
+
+        nary.CompoundQuery.__init__(self, subqueries, boost=boost)
+        self.slop = slop
+        self.ordered = ordered
+
+    def __eq__(self, other):
+        return (other and type(self) is type(other)
+                and self.subqueries == other.subqueries
+                and self.boost == other.boost)
+
+    def __repr__(self):
+        return "%s(%r, slop=%d, boost=%f)" % (self.__class__.__name__,
+                                              self.subqueries, self.slop,
+                                              self.boost)
+
+    def __hash__(self):
+        h = hash(self.slop) ^ hash(self.boost)
+        for q in self.subqueries:
+            h ^= hash(q)
+        return h
+
+    def normalize(self):
+        # Because the subqueries are in sequence, we can't do the fancy merging
+        # that CompoundQuery does
+        return self.__class__([q.normalize() for q in self.subqueries],
+                              self.slop, self.ordered, self.boost)
+
+    def _and_query(self):
+        return nary.And([terms.Term(self.fieldname, word)
+                         for word in self.words])
+
+    def estimate_size(self, ixreader):
+        return self._and_query().estimate_size(ixreader)
+
+    def estimate_min_size(self, ixreader):
+        return self._and_query().estimate_min_size(ixreader)
+
+    def _matcher(self, subs, searcher, context):
+        from whoosh.query.spans import SpanNear
+
+        return self._tree_matcher(subs, SpanNear.SpanNearMatcher, searcher,
+                                  context, None, slop=self.slop,
+                                  ordered=self.ordered)
+
+
+class Ordered(Sequence):
+    """Matches documents containing a list of sub-queries in the given order.
+    """
+
+    JOINT = " BEFORE "
+
+    def _matcher(self, subs, searcher, context):
+        from whoosh.query.spans import SpanBefore
+
+        return self._tree_matcher(subs, SpanBefore._Matcher, searcher,
+                                  context, None)
+
+
 class Phrase(qcore.Query):
     """Matches documents containing a given phrase."""
 
         self.char_ranges = char_ranges
 
     def __eq__(self, other):
-        return (other and self.__class__ is other.__class__ and
-                self.fieldname == other.fieldname and self.words == other.words
-                and self.slop == other.slop and self.boost == other.boost)
+        return (other and self.__class__ is other.__class__
+                and self.fieldname == other.fieldname
+                and self.words == other.words
+                and self.slop == other.slop
+                and self.boost == other.boost)
 
     def __repr__(self):
         return "%s(%r, %r, slop=%s, boost=%f)" % (self.__class__.__name__,
 
     def __unicode__(self):
         return u('%s:"%s"') % (self.fieldname, u(" ").join(self.words))
+
     __str__ = __unicode__
 
     def __hash__(self):
         return m
 
 
-class Ordered(nary.And):
-    """Matches documents containing a list of sub-queries in the given order.
-    """
 
-    JOINT = " BEFORE "
 
-    def _matcher(self, subs, searcher, context):
-        from whoosh.query.spans import SpanBefore
 
-        return self._tree_matcher(subs, SpanBefore._Matcher, searcher,
-                                  context, None)
 
-
-

tests/test_parse_plugins.py

                 continue
             plist = [p for p in pis[:j] if p is not first] + [first]
             qp = qparser.QueryParser("text", None, plugins=plist)
-            try:
-                qp.parse(qs)
-            except Exception:
-                e = sys.exc_info()[1]
-                raise Exception(str(e) + " combo: %s %r" % (count, plist))
+            qp.parse(qs)
             count += 1
 
 
                  '(content:alfa AND (reverse:bravo OR reverse:ovarb))')
 
 
+def test_complex_phrase():
+    qp = default.QueryParser("f", None)
+    qp.remove_plugin_class(plugins.PhrasePlugin)
+    qp.add_plugin(plugins.FuzzyTermPlugin())
+    qp.add_plugin(plugins.ComplexPhrasePlugin())
 
+    q = qp.parse(u('alfa "bravo charlie~2 (delta OR echo)" foxtrot'))
+    assert_equal(q.__unicode__(),
+                 "(f:alfa AND "
+                 "(f:bravo NEAR f:charlie~2 NEAR (f:delta OR f:echo))"
+                 " AND f:foxtrot)")
+    assert_equal(q[1].__class__, query.Sequence)
 
+    q = qp.parse(u('alfa "bravo charlie~2 d?lt*'))
+    assert_equal(q[0].text, "alfa")
+    assert_equal(q[1].text, "bravo")
+    assert_equal(q[2].__class__, query.FuzzyTerm)
+    assert_equal(q[3].__class__, query.Wildcard)
 
+    q = qp.parse(u('alfa "bravo charlie~2" d?lt* "[a TO z] [0 TO 9]" echo'))
+    assert_equal(q.__unicode__(),
+                 "(f:alfa AND "
+                 "(f:bravo NEAR f:charlie~2)"
+                 " AND f:d?lt* AND "
+                 "(f:[a TO z] NEAR f:[0 TO 9])"
+                 " AND f:echo)")
+    assert_equal(q[0].text, "alfa")
+    assert_equal(q[1].__class__, query.Sequence)
+    assert_equal(q[2].__class__, query.Wildcard)
+    assert_equal(q[3].__class__, query.Sequence)
+    assert_equal(q[3][0].__class__, query.TermRange)
+    assert_equal(q[3][1].__class__, query.TermRange)
+    assert_equal(q[4].text, "echo")
 
 
 
 
 
 
+
+