Commits

Matt Chaput committed 2d6cb63

Changed the default multitoken_query value to "default", which simply uses the parser's default grouping. Fixes issue #158.
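
In practice, a query-string word that analyzes into several tokens is now joined with the parser's default group instead of being truncated to its first token. A minimal sketch of the new behavior, assuming the whoosh API shown in the diffs below:

    from whoosh import fields
    from whoosh.qparser import QueryParser

    # TEXT fields now default to multitoken_query="default" (previously "first").
    schema = fields.Schema(text=fields.TEXT())
    parser = QueryParser("text", schema)  # the parser's default grouping is AND

    # "chaw-bacon" analyzes into two tokens, "chaw" and "bacon". With the old
    # default only the first token survived; now the tokens are joined using
    # the parser's group, giving And([Term("text", "chaw"), Term("text", "bacon")]).
    q = parser.parse("chaw-bacon")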

  • Parent commits 18cf204


Files changed (4)

File src/whoosh/fields.py

 import sys
 from decimal import Decimal
 
+from whoosh import formats
 from whoosh.analysis import (IDAnalyzer, RegexAnalyzer, KeywordAnalyzer,
                              StandardAnalyzer, NgramAnalyzer, Tokenizer,
                              NgramWordAnalyzer, Analyzer)
 from whoosh.compat import (with_metaclass, itervalues, string_type, u,
                            integer_types, long_type, text_type, xrange, PY3)
-from whoosh.formats import Format, Existence, Frequency, Positions
 from whoosh.support.numeric import (int_to_text, text_to_int, long_to_text,
                                     text_to_long, float_to_text, text_to_float,
                                     )
       a "word" in a user query parses into multiple tokens. The string is
       interpreted by the query parser. The strings understood by the default
       query parser are "first" (use first token only), "and" (join the tokens
-      with an AND query), "or" (join the tokens with OR), and "phrase" (join
-      the tokens with a phrase query).
-     
+      with an AND query), "or" (join the tokens with OR), "phrase" (join
+      the tokens with a phrase query), and "default" (use the query parser's
+      default join type).
+    
     The constructor for the base field type simply lets you supply your own
     configured field format, vector format, and scorable and stored values.
     Subclasses may configure some or all of this for you.
     
     analyzer = format = vector = scorable = stored = unique = None
     indexed = True
-    multitoken_query = "first"
+    multitoken_query = "default"
     sortable_typecode = None
     spelling = False
     
-    __inittypes__ = dict(format=Format, vector=Format,
+    __inittypes__ = dict(format=formats.Format, vector=formats.Format,
                          scorable=bool, stored=bool, unique=bool)
     
     def __init__(self, format, analyzer, vector=None, scorable=False,
-                 stored=False, unique=False, multitoken_query="first"):
-        assert isinstance(format, Format)
+                 stored=False, unique=False, multitoken_query="default"):
+        assert isinstance(format, formats.Format)
         assert isinstance(analyzer, Analyzer)
         
         self.format = format
             raise Exception("%s field cannot index without a format" % self.__class__)
         if not isinstance(value, (text_type, list, tuple)):
             raise ValueError("%r is not unicode or sequence" % value)
-        assert isinstance(self.format, Format), type(self.format)
+        assert isinstance(self.format, formats.Format), type(self.format)
         return self.format.word_values(value, self.analyzer, mode="index", **kwargs)
     
     def process_text(self, qstring, mode='', **kwargs):
         """
         
         self.analyzer = IDAnalyzer()
-        self.format = Existence(field_boost=field_boost)
+        self.format = formats.Existence(field_boost=field_boost)
         self.stored = stored
         self.unique = unique
         self.spelling = spelling
         
         expression = expression or re.compile(r"[^\r\n\t ,;]+")
         self.analyzer = RegexAnalyzer(expression=expression)
-        self.format = Existence(field_boost=field_boost)
+        self.format = formats.Existence(field_boost=field_boost)
         self.stored = stored
         self.unique = unique
         self.spelling = spelling
         self.shift_step = shift_step
         self.signed = signed
         self.analyzer = IDAnalyzer()
-        self.format = Existence(field_boost=field_boost)
+        self.format = formats.Existence(field_boost=field_boost)
     
     def sortable_default(self):
         return NUMERIC_DEFAULTS[self.sortable_typecode]
         
         self.stored = stored
         self.field_boost = field_boost
-        self.format = Existence(field_boost=field_boost)
+        self.format = formats.Existence(field_boost=field_boost)
     
     def to_text(self, bit):
         if isinstance(bit, string_type):
         """
         
         self.analyzer = KeywordAnalyzer(lowercase=lowercase, commas=commas)
-        self.format = Frequency(field_boost=field_boost)
+        self.format = formats.Frequency(field_boost=field_boost)
         self.scorable = scorable
         self.stored = stored
         self.unique = unique
                          stored=bool, field_boost=float)
     
     def __init__(self, analyzer=None, phrase=True, vector=None, stored=False,
-                 field_boost=1.0, multitoken_query="first", spelling=False):
+                 field_boost=1.0, multitoken_query="default", spelling=False,
+                 chars=False):
         """
         :param analyzer: The analysis.Analyzer to use to index the field
             contents. See the analysis module for more information. If you omit
         
         self.analyzer = analyzer or StandardAnalyzer()
         
+        if chars:
+            formatclass = formats.Characters
-        if phrase:
+        elif phrase:
-            formatclass = Positions
+            formatclass = formats.Positions
         else:
-            formatclass = Frequency
+            formatclass = formats.Frequency
         self.format = formatclass(field_boost=field_boost)
         
         if vector:
             if type(vector) is type:
                 vector = vector()
-            elif isinstance(vector, Format):
+            elif isinstance(vector, formats.Format):
                 pass
             else:
                 vector = formatclass()
             searching. The default is off.
         """
         
-        formatclass = Frequency
+        formatclass = formats.Frequency
         if phrase:
-            formatclass = Positions
+            formatclass = formats.Positions
         
         self.analyzer = NgramAnalyzer(minsize, maxsize)
         self.format = formatclass(field_boost=field_boost)
         """
         
         self.analyzer = NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at)
-        self.format = Frequency(field_boost=field_boost)
+        self.format = formats.Frequency(field_boost=field_boost)
         self.stored = stored
         self.queryor = queryor
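
The docstring near the top of this file lists the strings a field's multitoken_query attribute may take. As a usage sketch (not part of this commit, assuming the TEXT constructor shown in this diff):

    from whoosh import fields

    # Hypothetical schema illustrating the per-field multitoken_query options.
    schema = fields.Schema(
        title=fields.TEXT(multitoken_query="phrase"),  # join the tokens as a phrase query
        body=fields.TEXT(multitoken_query="or"),       # join the tokens with OR
        tags=fields.TEXT(),                            # "default": follow the parser's grouping
    )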
 

File src/whoosh/qparser/default.py

 
 from whoosh import query
 from whoosh.qparser import syntax
-from whoosh.qparser.common import print_debug
+from whoosh.qparser.common import print_debug, QueryParserError
 
 
 # Query parser object
     And([Term("content", u"hello"), Term("content", u"there")])
     """
     
-    _multitoken_query_map = {"and": query.And, "or": query.Or}
-    
     def __init__(self, fieldname, schema, plugins=None, termclass=query.Term,
                  phraseclass=query.Phrase, group=syntax.AndGroup):
         """
         # Return the sorted list without the priorities
         return [item for item, _ in items_and_priorities]
     
-    def multitoken_query(self, name, texts, fieldname, termclass, boost):
-        name = name.lower()
-        if name == "phrase":
+    def multitoken_query(self, spec, texts, fieldname, termclass, boost):
+        """Returns a query for multiple texts. This method implements the
+        intention specified in the field's ``multitoken_query`` attribute,
+        which specifies what to do when strings that look like single terms
+        to the parser turn out to yield multiple tokens when analyzed.
+        
+        :param spec: a string describing how to join the text strings into a
+            query. This is usually the value of the field's
+            ``multitoken_query`` attribute.
+        :param texts: a list of token strings.
+        :param fieldname: the name of the field.
+        :param termclass: the query class to use for single terms.
+        :param boost: the boost of the original term in the query string; it
+            should be applied to the returned query object.
+        """
+        
+        spec = spec.lower()
+        if spec == "first":
+            # Throw away all but the first token
+            return termclass(fieldname, texts[0], boost=boost)
+        elif spec == "phrase":
+            # Turn the tokens into a phrase query
             return self.phraseclass(fieldname, texts, boost=boost)
         else:
-            qclass = self._multitoken_query_map.get(name)
-            if qclass:
-                return qclass([termclass(fieldname, t, boost=boost)
-                               for t in texts])
+            if spec == "default":
+                qclass = self.group.qclass
+            elif spec == "and":
+                qclass = query.And
+            elif spec == "or":
+                qclass = query.Or
+            else:
+                raise QueryParserError("Unknown multitoken_query value %r" % spec)
+            return qclass([termclass(fieldname, t, boost=boost)
+                           for t in texts])
     
     def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True,
                    removestops=True):
             # multitoken_query attribute to decide what query class, if any, to
             # use to put the tokens together
             if len(texts) > 1:
-                mtq = self.multitoken_query(field.multitoken_query, texts,
-                                            fieldname, termclass, boost)
-                if mtq:
-                    return mtq
+                return self.multitoken_query(field.multitoken_query, texts,
+                                             fieldname, termclass, boost)
                 
             # It's possible field.process_text() will return an empty list (for
             # example, on a stop word)
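
Because the "default" branch above defers to self.group.qclass, the join class now follows whatever grouping the parser was constructed with. A hedged sketch (OrGroup lives alongside the AndGroup used as the constructor default; this example is not part of the commit):

    from whoosh import fields
    from whoosh.qparser import QueryParser, OrGroup

    schema = fields.Schema(text=fields.TEXT())            # multitoken_query="default"
    parser = QueryParser("text", schema, group=OrGroup)

    # A word that analyzes into several tokens is joined with Or here,
    # because OrGroup.qclass is query.Or.
    q = parser.parse("chaw-bacon")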

File tests/test_parsing.py

     assert_equal(q[0][1].text, "ello")
     assert_equal(q[1].text, "tom")
 
-def test_multitoken_words():
+def test_multitoken_default():
+    textfield = fields.TEXT()
+    assert textfield.multitoken_query == "default"
+    schema = fields.Schema(text=textfield)
+    parser = default.QueryParser('text', schema)
+    qstring = u("chaw-bacon")
+    
+    texts = list(schema["text"].process_text(qstring))
+    assert_equal(texts, ["chaw", "bacon"])
+    
+    q = parser.parse(qstring)
+    print("q=", q.__unicode__())
+    assert_equal(q.__class__, query.And)
+    assert_equal(len(q), 2)
+    assert_equal(q[0].__class__, query.Term)
+    assert_equal(q[0].text, "chaw")
+    assert_equal(q[1].__class__, query.Term)
+    assert_equal(q[1].text, "bacon")
+
+def test_multitoken_or():
     textfield = fields.TEXT()
     textfield.multitoken_query = "or"
     schema = fields.Schema(text=textfield)

File tests/test_reading.py

 
 from nose.tools import assert_equal  #@UnresolvedImport
 
-from whoosh import analysis, fields, reading
+from whoosh import analysis, fields, formats, reading
 from whoosh.compat import u, xrange
 from whoosh.filedb.filereading import SegmentReader
 from whoosh.filedb.filestore import RamStorage
 
 def test_vector_postings():
     s = fields.Schema(id=fields.ID(stored=True, unique=True),
-                      content=fields.TEXT(vector=fields.Positions(analyzer=analysis.StandardAnalyzer())))
+                      content=fields.TEXT(vector=formats.Positions(analyzer=analysis.StandardAnalyzer())))
     st = RamStorage()
     ix = st.create_index(s)
     
     assert_equal(r.doc_count_all(), 8)
 
 def test_reader_subclasses():
-    def is_abstract(attr):
-        return hasattr(attr, "__isabstractmethod__") and getattr(attr, "__isabstractmethod__")
-    def check_methods(base, subclass):
-        for attrname in dir(base):
-            if attrname.startswith("_"):
-                continue
-            attr = getattr(base, attrname)
-            if is_abstract(attr):
-                oattr = getattr(subclass, attrname)
-                assert not is_abstract(oattr), "%s.%s not overridden" % (subclass.__name__, attrname)
+    from whoosh.support.testing import check_abstract_methods
     
-    check_methods(reading.IndexReader, SegmentReader)
-    check_methods(reading.IndexReader, reading.MultiReader)
-    check_methods(reading.IndexReader, reading.EmptyReader)
-    check_methods(reading.IndexReader, RamIndex)
+    check_abstract_methods(reading.IndexReader, SegmentReader)
+    check_abstract_methods(reading.IndexReader, reading.MultiReader)
+    check_abstract_methods(reading.IndexReader, reading.EmptyReader)
+    check_abstract_methods(reading.IndexReader, RamIndex)