Commits

Matt Chaput  committed c063b12

Fixed Regex query type, added RegexPlugin and PseudoFieldPlugin.
Fixes bugs #189 and #190.

  • Participants
  • Parent commits d9899f2

Comments (0)

Files changed (7)

File docs/source/api/qparser.rst

 .. autoclass:: SingleQuotePlugin
 .. autoclass:: PrefixPlugin
 .. autoclass:: WildcardPlugin
+.. autoclass:: RegexPlugin
 .. autoclass:: BoostPlugin
 .. autoclass:: GroupPlugin
 .. autoclass:: EveryPlugin

File docs/source/api/query.rst

 .. autoclass:: Not
 .. autoclass:: Prefix
 .. autoclass:: Wildcard
+.. autoclass:: Regex
 .. autoclass:: TermRange
 .. autoclass:: NumericRange
 .. autoclass:: DateRange

File docs/source/releases/2_0.rst

 Whoosh 2.x release notes
 ========================
 
+Whoosh 2.3
+==========
+
+* Added a :class:`whoosh.query.Regex` term query type, similar to
+  :class:`whoosh.query.Wildcard`. The parser does not allow regex term queries
+  by default. You need to add the :class:`whoosh.qparser.RegexPlugin` plugin.
+  After you add the plugin, you can use ``r"expression"`` query syntax for
+  regular expression term queries. For example, ``r"foo.*bar"``.
+
+* Added the :class:`whoosh.qparser.PseudoFieldPlugin` parser plugin. This
+  plugin lets you create "pseudo-fields" that run a transform function on
+  whatever query syntax the user applies the field to. This is fairly advanced
+  functionality right now; I'm trying to think of ways to make its power easier
+  to access.
+
+
 Whoosh 2.2
 ==========
 

File src/whoosh/qparser/plugins.py

     nodetype = WildcardNode
 
 
+class RegexPlugin(TaggingPlugin):
+    """Adds the ability to specify regular expression term queries.
+    
+    The default syntax for a regular expression term is ``r"termexpr"``.
+    
+    >>> qp = qparser.QueryParser("content", myschema)
+    >>> qp.add_plugin(qparser.RegexPlugin())
+    >>> q = qp.parse('foo title:r"bar+"')
+    """
+
+    class RegexNode(syntax.TextNode):
+        """Syntax node for a regular expression term."""
+
+        # NOTE(review): presumably the query class the parser builds from
+        # this node -- confirm against syntax.TextNode.
+        qclass = query.Regex
+
+        def r(self):
+            """Return a short description of this node: the word ``Regex``
+            followed by the repr of the node's text.
+            """
+            return "Regex %r" % self.text
+
+    # Matches r"..." and captures everything between the quotes in the named
+    # group ``text``. The captured text is any run of characters other than a
+    # double quote, so the expression itself cannot contain a double quote.
+    expr = 'r"(?P<text>[^"]*)"'
+    nodetype = RegexNode
+
+
 class BoostPlugin(TaggingPlugin):
     """Adds the ability to boost clauses of the query using the circumflex.
     
                     continue
             newgroup.append(node)
         return newgroup
+
+
+class PseudoFieldPlugin(Plugin):
+    """This is an advanced plugin that lets you define "pseudo-fields" the user
+    can use in their queries. When the parser encounters one of these fields,
+    it runs a given function on the following node in the abstract syntax tree.
+    
+    Unfortunately writing the transform function(s) requires knowledge of the
+    parser's abstract syntax tree classes. A transform function takes a
+    :class:`whoosh.qparser.SyntaxNode` and returns a
+    :class:`~whoosh.qparser.SyntaxNode` (or None if the node should be removed
+    instead of transformed).
+    
+    Some things you can do in the transform function::
+    
+        from whoosh import qparser
+    
+        def my_xform_fn(node):
+            # Is this a text node?
+            if node.has_text:
+                # Change the node's text
+                node.text = node.text + "foo"
+            
+                # Change the node into a prefix query
+                node = qparser.PrefixPlugin.PrefixNode(node.text)
+                
+                # Set the field the node should search in
+                node.set_fieldname("title")
+                
+                return node
+            else:
+                # If the pseudo-field wasn't applied to a text node (e.g.
+                # it preceded a group, as in ``pfield:(a OR b)`` ), remove the
+                # node. Alternatively you could just ``return node`` here to
+                # leave the non-text node intact.
+                return None
+    
+    In the following example, if the user types ``regex:foo.bar``, the function
+    transforms the text in the pseudo-field "regex" into a regular expression
+    query in the "content" field::
+    
+        from whoosh import qparser
+        
+        def regex_maker(node):
+            if node.has_text:
+                node = qparser.RegexPlugin.RegexNode(node.text)
+                node.set_fieldname("content")
+                return node
+    
+        qp = qparser.QueryParser("content", myindex.schema)
+        qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker}))
+        q = qp.parse("alfa regex:br.vo")
+    
+    The name of the "pseudo" field can be the same as an actual field. Imagine
+    the schema has a field named ``reverse``, and you want the user to be able
+    to type ``reverse:foo`` and transform it to ``reverse:(foo OR oof)``::
+        
+        def rev_text(node):
+            if node.has_text:
+                # Create a word node for the reversed text
+                revtext = node.text[::-1]  # Reverse the text
+                rnode = qparser.WordNode(revtext)
+                
+                # Put the original node and the reversed node in an OrGroup
+                group = qparser.OrGroup([node, rnode])
+                
+                # Need to set the fieldname here because the PseudoFieldPlugin
+                # removes the field name syntax
+                group.set_fieldname("reverse")
+                
+                return group
+        
+        qp = qparser.QueryParser("content", myindex.schema)
+        qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text}))
+        q = qp.parse("alfa reverse:bravo")
+    
+    Note that transforming the query like this can potentially really confuse
+    the spell checker!
+    
+    This plugin works as a filter, so it can only operate on the query after it
+    has been parsed into an abstract syntax tree. For parsing control (i.e. to
+    give a pseudo-field its own special syntax), you would need to write your
+    own parsing plugin.
+    """
+
+    def __init__(self, xform_map):
+        """
+        :param xform_map: a dictionary mapping pseudo-field names to transform
+            functions. The function should take a
+            :class:`whoosh.qparser.SyntaxNode` as an argument, and return a
+            :class:`~whoosh.qparser.SyntaxNode`. If the function returns None,
+            the node will be removed from the query.
+        """
+
+        self.xform_map = xform_map
+
+    def filters(self, parser):
+        """Return a one-item list pairing :meth:`do_pseudofield` with the
+        number 99.
+
+        :param parser: not used by this method.
+        """
+
+        # Run before the fieldname filter (100)
+        return [(self.do_pseudofield, 99)]
+
+    def do_pseudofield(self, parser, group):
+        """Return a copy of ``group`` with the pseudo-field transforms applied.
+
+        Nested groups are processed recursively. A ``FieldnameNode`` whose
+        ``fieldname`` is a key of ``self.xform_map`` is dropped, and the mapped
+        function is called on the node that follows it. If the function
+        returns None that following node is dropped as well; otherwise the
+        returned node replaces it and is given the replaced node's
+        ``startchar`` and ``endchar``.
+
+        :param parser: only passed through to the recursive call.
+        :param group: the group node to filter; it is iterated, not modified
+            in place by this method.
+        """
+
+        xform_map = self.xform_map
+
+        newgroup = group.empty_copy()
+        # Transform function waiting to be applied to the next node, if any
+        xform_next = None
+        for node in group:
+            if isinstance(node, syntax.GroupNode):
+                # Filter the subgroup first; a pending transform (below) then
+                # receives the already-filtered subgroup
+                node = self.do_pseudofield(parser, node)
+            elif (isinstance(node, syntax.FieldnameNode)
+                  and node.fieldname in xform_map):
+                # Remember the transform and drop the field name node itself
+                xform_next = xform_map[node.fieldname]
+                continue
+
+            if xform_next:
+                newnode = xform_next(node)
+                # A transform applies to exactly one following node
+                xform_next = None
+                if newnode is None:
+                    # The transform asked for the node to be removed
+                    continue
+                else:
+                    # Give the replacement the character range of the node it
+                    # replaces
+                    newnode.set_range(node.startchar, node.endchar)
+                    node = newnode
+
+            newgroup.append(node)
+
+        return newgroup
+

File src/whoosh/qparser/syntax.py

         for node in self.nodes:
             node.set_fieldname(name, override=override)
 
+    def set_range(self, startchar, endchar):
+        """Call ``set_range(startchar, endchar)`` on every node in
+        ``self.nodes`` and return this object.
+
+        Every child is given the same ``startchar`` and ``endchar``.
+        """
+        for node in self.nodes:
+            node.set_range(startchar, endchar)
+        return self
+
     # List-like methods
 
     def __nonzero__(self):

File src/whoosh/query.py

     def _get_prefix(self, text):
         if "|" in text:
             return ""
-        if text.startswith("^") or text.startswith("\\A"):
+        if text.startswith("^"):
             text = text[1:]
+        elif text.startswith("\\A"):
+            text = text[2:]
 
-        return PatternQuery._find_prefx(self, text)
+        return PatternQuery._find_prefix(self, text)
 
 
 class ExpandingTerm(MultiTerm):

File tests/test_parse_plugins.py

-from __future__ import with_statement, print_function
+from __future__ import with_statement
 import inspect
 from datetime import datetime
 import sys
 
 def test_combos():
     qs = 'w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) (apple ANDNOT bear)^2.3'
-    
+
     init_args = {plugins.MultifieldPlugin: (["content", "title"], {"content": 1.0, "title": 1.2}),
-                 plugins.FieldAliasPlugin: ({"content": ("text", "body")}, ),
-                 plugins.MultifieldPlugin: (["title", "content"], ),
-                 plugins.CopyFieldPlugin: ({"name": "phone"}, ),
+                 plugins.FieldAliasPlugin: ({"content": ("text", "body")},),
+                 plugins.MultifieldPlugin: (["title", "content"],),
+                 plugins.CopyFieldPlugin: ({"name": "phone"},),
+                 plugins.PseudoFieldPlugin: ({"name": lambda x: x}),
                  }
-    
+
     pis = _plugin_classes(())
     for i, plugin in enumerate(pis):
         try:
             pis[i] = plugin(*init_args.get(plugin, ()))
         except TypeError:
             raise TypeError("Error instantiating %s" % plugin)
-    
+
     count = 0
     for i, first in enumerate(pis):
         for j in xrange(len(pis)):
 def test_dateparser():
     schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME)
     qp = default.QueryParser("text", schema)
-    
+
     errs = []
     def cb(arg):
         errs.append(arg)
     basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
     qp.add_plugin(dateparse.DateParserPlugin(basedate, callback=cb))
-    
+
     q = qp.parse(u("hello date:'last tuesday'"))
     assert_equal(q.__class__, query.And)
     assert_equal(q[1].__class__, query.DateRange)
     assert_equal(q[1].startdate, adatetime(2010, 9, 14).floor())
     assert_equal(q[1].enddate, adatetime(2010, 9, 14).ceil())
-    
+
     q = qp.parse(u("date:'3am to 5pm'"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 9, 20, 3).floor())
     assert_equal(q.enddate, adatetime(2010, 9, 20, 17).ceil())
-    
+
     q = qp.parse(u("date:blah"))
     assert_equal(q, query.NullQuery)
     assert_equal(errs[0], "blah")
-    
+
     q = qp.parse(u("hello date:blarg"))
     assert_equal(q.__unicode__(), "(text:hello AND <_NullQuery>)")
     assert_equal(q[1].error, "blarg")
     assert_equal(errs[1], "blarg")
-    
+
     q = qp.parse(u("hello date:20055x10"))
     assert_equal(q.__unicode__(), "(text:hello AND <_NullQuery>)")
     assert_equal(q[1].error, "20055x10")
     assert_equal(errs[2], "20055x10")
-    
+
     q = qp.parse(u("hello date:'2005 19 32'"))
     assert_equal(q.__unicode__(), "(text:hello AND <_NullQuery>)")
     assert_equal(q[1].error, "2005 19 32")
     assert_equal(errs[3], "2005 19 32")
-    
+
     q = qp.parse(u("date:'march 24 to dec 12'"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 24).floor())
     assert_equal(q.enddate, adatetime(2010, 12, 12).ceil())
-    
+
     q = qp.parse(u("date:('30 june' OR '10 july') quick"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 2)
     assert_equal(q[0].__class__, query.Or)
     assert_equal(q[0][0].__class__, query.DateRange)
     assert_equal(q[0][1].__class__, query.DateRange)
-    
+
 def test_date_range():
     schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME)
     qp = qparser.QueryParser("text", schema)
     basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
     qp.add_plugin(dateparse.DateParserPlugin(basedate))
-    
+
     q = qp.parse(u("date:['30 march' to 'next wednesday']"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 30).floor())
     assert_equal(q.enddate, adatetime(2010, 9, 22).ceil())
-    
+
     q = qp.parse(u("date:[to 'next wednesday']"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, None)
     assert_equal(q.enddate, adatetime(2010, 9, 22).ceil())
-    
+
     q = qp.parse(u("date:['30 march' to]"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 30).floor())
     assert_equal(q.enddate, None)
-    
+
     print("!!!!!!!!!!!!!!!!!!!!")
     q = qp.parse(u("date:[30 march to next wednesday]"))
     print("q=", q)
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 30).floor())
     assert_equal(q.enddate, adatetime(2010, 9, 22).ceil())
-    
+
     q = qp.parse(u("date:[to next wednesday]"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, None)
     assert_equal(q.enddate, adatetime(2010, 9, 22).ceil())
-    
+
     q = qp.parse(u("date:[30 march to]"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 30).floor())
 def test_daterange_empty_field():
     schema = fields.Schema(test=fields.DATETIME)
     ix = RamStorage().create_index(schema)
-                    
+
     writer = ix.writer()
     writer.add_document(test=None)
     writer.commit()
-    
+
     with ix.searcher() as s:
         q = query.DateRange("test", datetime.fromtimestamp(0), datetime.today())
         r = s.search(q)
     qp = qparser.QueryParser("text", schema)
     basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
     qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True))
-    
+
     q = qp.parse(u("hello date:last tuesday"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 2)
     assert_equal(q[1].__class__, query.DateRange)
     assert_equal(q[1].startdate, adatetime(2010, 9, 14).floor())
     assert_equal(q[1].enddate, adatetime(2010, 9, 14).ceil())
-    
+
     q = qp.parse(u("date:mar 29 1972 hello"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 2)
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2005, 3, 2).floor())
     assert_equal(q.enddate, adatetime(2005, 3, 2).ceil())
-    
+
     q = qp.parse(u("date:'2005' march 2"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 3)
     assert_equal(q[1].__class__, query.Term)
     assert_equal(q[1].fieldname, "text")
     assert_equal(q[1].text, "march")
-    
+
     q = qp.parse(u("date:march 24 to dec 12"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 24).floor())
     assert_equal(q.enddate, adatetime(2010, 12, 12).ceil())
-    
+
     q = qp.parse(u("date:5:10pm"))
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 9, 20, 17, 10).floor())
     assert_equal(q.enddate, adatetime(2010, 9, 20, 17, 10).ceil())
-    
+
     q = qp.parse(u("(date:30 june OR date:10 july) quick"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 2)
     assert_equal(q[0].__class__, query.Or)
     assert_equal(q[0][0].__class__, query.DateRange)
     assert_equal(q[0][1].__class__, query.DateRange)
-    
+
 def test_prefix_plugin():
     schema = fields.Schema(id=fields.ID, text=fields.TEXT)
     ix = RamStorage().create_index(schema)
-    
+
     w = ix.writer()
     w.add_document(id=u("1"), text=u("alfa"))
     w.add_document(id=u("2"), text=u("bravo"))
     w.add_document(id=u("3"), text=u("buono"))
     w.commit()
-    
+
     with ix.searcher() as s:
         qp = qparser.QueryParser("text", schema)
         qp.remove_plugin_class(plugins.WildcardPlugin)
         qp.add_plugin(plugins.PrefixPlugin)
-        
+
         q = qp.parse(u("b*"))
         r = s.search(q, limit=None)
         assert_equal(len(r), 2)
-        
+
         q = qp.parse(u("br*"))
         r = s.search(q, limit=None)
         assert_equal(len(r), 1)
-    
+
 def test_custom_tokens():
     qp = qparser.QueryParser("text", None)
     qp.remove_plugin_class(plugins.OperatorsPlugin)
-    
+
     cp = plugins.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~",
                                  Not="-")
     qp.add_plugin(cp)
-    
+
     q = qp.parse("this | that")
     assert_equal(q.__class__, query.Or)
     assert_equal(q[0].__class__, query.Term)
     assert_equal(q[0].text, "this")
     assert_equal(q[1].__class__, query.Term)
     assert_equal(q[1].text, "that")
-    
+
     q = qp.parse("this&!that")
     assert_equal(q.__class__, query.AndNot)
     assert_equal(q.a.__class__, query.Term)
     assert_equal(q.a.text, "this")
     assert_equal(q.b.__class__, query.Term)
     assert_equal(q.b.text, "that")
-    
+
     q = qp.parse("alfa -bravo NOT charlie")
     assert_equal(len(q), 4)
     assert_equal(q[1].__class__, query.Not)
     assert_equal(q[1].query.text, "bravo")
     assert_equal(q[2].text, "NOT")
-    
+
 def test_copyfield():
     qp = qparser.QueryParser("a", None)
     qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, None))
     assert_equal(text_type(qp.parse("hello b:matt")), "(a:hello AND b:matt AND c:matt)")
-    
+
     qp = qparser.QueryParser("a", None)
     qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup))
     assert_equal(text_type(qp.parse("hello b:matt")), "(a:hello AND (b:matt ANDMAYBE c:matt))")
-    
+
     qp = qparser.QueryParser("a", None)
     qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup))
     assert_equal(text_type(qp.parse("hello (there OR b:matt)")), "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))")
-    
+
     qp = qparser.QueryParser("a", None)
     qp.add_plugin(plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup))
     assert_equal(text_type(qp.parse("hello there")), "((a:hello OR c:hello) AND (a:there OR c:there))")
-    
+
     qp = qparser.QueryParser("a", None)
     qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, mirror=True))
     assert_equal(text_type(qp.parse("hello c:matt")), "(a:hello AND (c:matt OR b:matt))")
-    
+
     qp = qparser.QueryParser("a", None)
     qp.add_plugin(plugins.CopyFieldPlugin({"c": "a"}, mirror=True))
     assert_equal(text_type(qp.parse("hello c:matt")), "((a:hello OR c:hello) AND (c:matt OR a:matt))")
-    
+
     ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter()
     fmt = formats.Frequency()
     schema = fields.Schema(name=fields.KEYWORD, name_phone=fields.FieldType(fmt, ana, multitoken_query="or"))
     qp = qparser.QueryParser("name", schema)
     qp.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"}))
     assert_equal(text_type(qp.parse(u("spruce view"))), "((name:spruce OR name_phone:SPRS) AND (name:view OR name_phone:F OR name_phone:FF))")
-    
+
 def test_gtlt():
     schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC,
                            c=fields.KEYWORD,
     qp = qparser.QueryParser("a", schema)
     qp.add_plugin(plugins.GtLtPlugin())
     qp.add_plugin(dateparse.DateParserPlugin())
-    
+
     q = qp.parse(u("a:hello b:>100 c:<=z there"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 4)
     assert_equal(q[1], query.NumericRange("b", 100, None, startexcl=True))
     assert_equal(q[2], query.TermRange("c", None, 'z'))
     assert_equal(q[3], query.Term("a", "there"))
-    
+
     q = qp.parse(u("hello e:>'29 mar 2001' there"))
     assert_equal(q.__class__, query.And)
     assert_equal(len(q), 3)
     # As of this writing, date ranges don't support startexcl/endexcl
     assert_equal(q[1], query.DateRange("e", datetime(2001, 3, 29, 0, 0), None))
     assert_equal(q[2], query.Term("a", "there"))
-    
+
     q = qp.parse(u("a:> alfa c:<= bravo"))
     assert_equal(text_type(q), "(a:a: AND a:alfa AND a:c: AND a:bravo)")
-    
+
     qp.remove_plugin_class(plugins.FieldsPlugin)
     qp.remove_plugin_class(plugins.RangePlugin)
     q = qp.parse(u("hello a:>500 there"))
     assert_equal(text_type(q), "(a:hello AND a:a: AND a:500 AND a:there)")
+
+def test_regex():
+    """Check that RegexPlugin's ``r"..."`` syntax is parsed into regex terms
+    in both a KEYWORD field and a TEXT field.
+    """
+    schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT)
+    qp = qparser.QueryParser("a", schema)
+    qp.add_plugin(plugins.RegexPlugin())
+
+    # Without the r"..." syntax: field a keeps "foo-bar" as one term while
+    # field b ends up with separate "foo" and "bar" terms
+    q = qp.parse(u("a:foo-bar b:foo-bar"))
+    assert_equal(q.__unicode__(), '(a:foo-bar AND b:foo AND b:bar)')
+
+    # With the r"..." syntax the expression stays whole in both fields
+    q = qp.parse(u('a:r"foo-bar" b:r"foo-bar"'))
+    assert_equal(q.__unicode__(), '(a:r"foo-bar" AND b:r"foo-bar")')
+
+def test_pseudofield():
+    """Check that PseudoFieldPlugin replaces the node following a pseudo-field
+    with whatever the mapped transform function returns.
+    """
+    schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT)
+
+    def regex_maker(node):
+        # Turn a text node into a regex node searching the "content" field.
+        # Implicitly returns None for nodes without text.
+        if node.has_text:
+            node = qparser.RegexPlugin.RegexNode(node.text)
+            node.set_fieldname("content")
+            return node
+
+    qp = qparser.QueryParser("a", schema)
+    qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker}))
+    q = qp.parse(u("alfa regex:br.vo"))
+    # Note that "content" is not a field of the schema defined above
+    assert_equal(q.__unicode__(), '(a:alfa AND content:r"br.vo")')
+
+    def rev_text(node):
+        # Replace a text node with an OR group of the text and its reverse
+        if node.has_text:
+            # Create a word node for the reversed text
+            revtext = node.text[::-1]  # Reverse the text
+            rnode = qparser.WordNode(revtext)
+            # Duplicate the original node's start and end char
+            rnode.set_range(node.startchar, node.endchar)
+
+            # Put the original node and the reversed node in an OrGroup
+            group = qparser.OrGroup([node, rnode])
+
+            # Need to set the fieldname here because the PseudoFieldPlugin
+            # removes the field name syntax
+            group.set_fieldname("reverse")
+
+            return group
+
+    qp = qparser.QueryParser("content", schema)
+    qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text}))
+    q = qp.parse(u("alfa reverse:bravo"))
+    assert_equal(q.__unicode__(), '(content:alfa AND (reverse:bravo OR reverse:ovarb))')
+
+
+
+
+
+
+
+
+
+
+
+