Commits

Matt Chaput committed af1dd04

Changed wildcard plugin to anneal ? and * with surrounding text.
This should fix a huge class of bugs. The way it used to work was pretty dumb.

  • Participants
  • Parent commits 006e695

Comments (0)

Files changed (3)

File src/whoosh/qparser/plugins.py

 
     def filters(self, parser):
         """Should return a list of ``(filter_function, priority)`` tuples to
-        add to parser.
+        add to parser. Lower priority numbers run first.
         
         Filter functions will be called with ``(parser, groupnode)`` and should
         return a group node.
 
 
 class WildcardPlugin(TaggingPlugin):
+    # \u055E = Armenian question mark
+    # \u061F = Arabic question mark
+    # \u1367 = Ethiopic question mark
+    qmarks = u("?\u055E\u061F\u1367")
+    expr = "(?P<text>[*%s])" % qmarks
+
+    def filters(self, parser):
+        # Run early, but definitely before multifield plugin
+        return [(self.do_wildcards, 50)]
+
+    def do_wildcards(self, parser, group):
+        i = 0
+        while i < len(group):
+            node = group[i]
+            if isinstance(node, self.WildcardNode):
+                if i < len(group) - 1 and group[i + 1].is_text():
+                    nextnode = group.pop(i + 1)
+                    node.text += nextnode.text
+                if i > 0 and group[i - 1].is_text():
+                    prevnode = group.pop(i - 1)
+                    node.text = prevnode.text + node.text
+                else:
+                    i += 1
+            else:
+                if isinstance(node, syntax.GroupNode):
+                    self.do_wildcards(parser, node)
+                i += 1
+
+        for i in xrange(len(group)):
+            node = group[i]
+            if isinstance(node, self.WildcardNode):
+                text = node.text
+                if len(text) > 1 and not any(qm in text for qm in self.qmarks):
+                    if text.find("*") == len(text) - 1:
+                        newnode = PrefixPlugin.PrefixNode(text[:-1])
+                        newnode.startchar = node.startchar
+                        newnode.endchar = node.endchar
+                        group[i] = newnode
+        return group
+
     class WildcardNode(syntax.TextNode):
         # Note that this node inherits tokenize = False from TextNode,
         # so the text in this node will not be analyzed... just passed
         # straight to the query
 
-        # TODO: instead of parsing a "wildcard word", create marker nodes for
-        # individual ? and * characters. This will have to wait for a more
-        # advanced wikiparser-like parser.
-
         qclass = query.Wildcard
 
         def r(self):
             return "Wild %r" % self.text
 
-    # Any number of word chars, followed by at least one question mark or
-    # star, followed by any number of word chars, question marks, or stars
-    # \u055E = Armenian question mark
-    # \u061F = Arabic question mark
-    # \u1367 = Ethiopic question mark
-    qms = u("\u055E\u061F\u1367")
-    expr = u("(?P<text>(\\w|[-])*[*?%s](\\w|[-*?%s])*)") % (qms, qms)
     nodetype = WildcardNode
 
 
                 self.reverse[value] = key
 
     def filters(self, parser):
+        # Run before fields plugin at 100
         return [(self.do_aliases, 90)]
 
     def do_aliases(self, parser, group):

File src/whoosh/qparser/syntax.py

 
         return False
 
+    def is_text(self):
+        return False
+
     def set_fieldname(self, name, override=False):
         """Sets the fieldname associated with this node. If ``override`` is
         False (the default), the fieldname will only be replaced if this node
     def extend(self, vs):
         self.nodes.extend(vs)
 
-    def pop(self):
-        return self.nodes.pop()
+    def pop(self, *args, **kwargs):
+        return self.nodes.pop(*args, **kwargs)
 
     def reverse(self):
         self.nodes.reverse()
     def r(self):
         return "%s %r" % (self.__class__.__name__, self.text)
 
+    def is_text(self):
+        return True
+
     def query(self, parser):
         fieldname = self.fieldname or parser.fieldname
         termclass = self.qclass or parser.termclass

File tests/test_parsing.py

                  "<AndGroup <None:'a'>, <None:'b'*>, <None:'c'>>")
 
 
-def test_wild():
-    p = default.QueryParser("t", None, [plugins.WhitespacePlugin(),
-                                        plugins.WildcardPlugin()])
-    assert_equal(repr(p.process("a b*c? d")),
-                 "<AndGroup <None:'a'>, <None:Wild 'b*c?'>, <None:'d'>>")
-
-
 def test_range():
     p = default.QueryParser("t", None, [plugins.WhitespacePlugin(),
                                         plugins.RangePlugin()])
     assert_equal(q.__unicode__(), "(t:alfa AND t:bravo^24.0 AND t:charlie)")
 
 
-def test_wildcard1():
+def test_wild():
+    qp = default.QueryParser("t", None, [plugins.WhitespacePlugin(),
+                                         plugins.WildcardPlugin()])
+    assert_equal(repr(qp.process("a b*c? d")),
+                 "<AndGroup <None:'a'>, <None:Wild 'b*c?'>, <None:'d'>>")
+    assert_equal(repr(qp.process("a * ? d")),
+                 "<AndGroup <None:'a'>, <None:Wild '*'>, "
+                 "<None:Wild '?'>, <None:'d'>>")
+
+    # Wildcard words mixed with plain terms: expect 4 subqueries, the last a plain Term
     qp = default.QueryParser("content", None)
     q = qp.parse(u("hello *the?e* ?star*s? test"))
     assert_equal(len(q), 4)
     assert_equal(q[3].__class__, query.Term)
     assert_equal(q[3].text, "test")
 
-
-def test_wildcard2():
+    # A lone wildcard word should parse to a single Wildcard query
     qp = default.QueryParser("content", None)
     q = qp.parse(u("*the?e*"))
     assert_equal(q.__class__, query.Wildcard)
 
 
 def test_dash():
-    ana = analysis.StandardAnalyzer("[ \t\r\n()*?]+")
+    ana = analysis.StandardAnalyzer("[^ \t\r\n()*?]+")
     schema = fields.Schema(title=fields.TEXT(analyzer=ana),
-                           text=fields.TEXT(analyzer=ana), time=fields.ID)
+                           text=fields.TEXT(analyzer=ana),
+                           time=fields.ID)
     qtext = u("*Ben-Hayden*")
 
     qp = default.QueryParser("text", schema)