Commits

Matt Chaput committed aede423 Merge

Merging with bitbucket mainline.

  • Parent commits e183a2b, 2ef80e4
  • Branches 2.4x

Files changed (11)

benchmark/marc21.py

 
 
 def uni(v):
-    return u"" if v is None else v.decode("utf8", "replace")
+    return u"" if v is None else v.decode("utf-8", "replace")
 
 
 # Indexing and searching
                    glob=options.glob)
 
     if args:
-        qstring = " ".join(args).decode("utf8")
+        qstring = " ".join(args).decode("utf-8")
         limit = int(options.limit)
         if limit < 1:
             limit = None

src/whoosh/analysis.py

-# coding: utf8
+# coding: utf-8
 
 # Copyright 2007 Matt Chaput. All rights reserved.
 #
                 this = 1
             elif text.isdigit():
                 this = 2
+            else:
+                this = None
 
             # Is this the same type as the previous part?
             if (buf and (this == last == 1 and mergewords)

src/whoosh/codec/base.py

 from whoosh.reading import TermInfo
 from whoosh.spans import Span
 from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_long, unpack_long,
-                           IS_LITTLE)
+                           IS_LITTLE, emptybytes)
 from whoosh.util import byte_to_length, length_to_byte
 
 
 
 class FieldWriter(object):
     def add_postings(self, schema, lengths, items):
+        # This method translates a generator of (fieldname, btext, docnum, w, v)
+        # postings into calls to start_field(), start_term(), add(),
+        # finish_term(), finish_field(), etc.
+
         start_field = self.start_field
         start_term = self.start_term
         add = self.add
         finish_term = self.finish_term
         finish_field = self.finish_field
 
-        # items = (fieldname, text, docnum, weight, valuestring) ...
+        if lengths:
+            dfl = lengths.doc_field_length
+        else:
+            dfl = lambda docnum, fieldname: 0
+
+        # The fieldname of the previous posting
         lastfn = None
+        # The bytes text of the previous posting
         lasttext = None
-        dfl = lengths.doc_field_length
-        for fieldname, text, docnum, weight, valuestring in items:
-            # Items where docnum is None indicate words that should be added
-            # to the spelling graph
-            if docnum is None and (fieldname != lastfn or text != lasttext):
-                self.add_spell_word(fieldname, text)
+        # The (fieldname, btext) of the previous spelling posting
+        lastspell = None
+        for fieldname, btext, docnum, weight, value in items:
+            # Check for out-of-order postings. This is convoluted because Python
+            # 3 removed the ability to compare a string to None
+            if lastfn is not None and fieldname < lastfn:
+                raise Exception("Field %r .. %r" % (lastfn, fieldname))
+            if fieldname == lastfn and lasttext and btext < lasttext:
+                raise Exception("Term %s:%r .. %s:%r"
+                                % (lastfn, lasttext, fieldname, btext))
+
+            # If the fieldname of this posting is different from the last one,
+            # tell the writer we're starting a new field
+            if fieldname != lastfn:
+                if lasttext is not None:
+                    finish_term()
+                if lastfn is not None and fieldname != lastfn:
+                    finish_field()
+                start_field(fieldname, schema[fieldname])
                 lastfn = fieldname
-                lasttext = text
+                lasttext = None
+
+            # HACK: items where docnum=None indicate words that should be added
+            # to the spelling graph, not the postings
+            if docnum is None:
+                spellterm = (fieldname, btext)
+                # There can be duplicates of spelling terms, so only add a spell
+                # term if it's greater than the last one
+                if lastspell is None or spellterm > lastspell:
+                    # TODO: how to decode the btext bytes?
+                    self.add_spell_word(fieldname, btext.decode("utf-8"))
+                    lastspell = spellterm
                 continue
 
-            # This comparison is so convoluted because Python 3 removed the
-            # ability to compare a string to None
-            if ((lastfn is not None and fieldname < lastfn)
-                or (fieldname == lastfn and lasttext is not None
-                    and text < lasttext)):
-                raise Exception("Postings are out of order: %r:%s .. %r:%s" %
-                                (lastfn, lasttext, fieldname, text))
-            if fieldname != lastfn or text != lasttext:
+            # If this term is different from the term in the previous posting,
+            # tell the writer to start a new term
+            if btext != lasttext:
                 if lasttext is not None:
                     finish_term()
-                if fieldname != lastfn:
-                    if lastfn is not None:
-                        finish_field()
-                    start_field(fieldname, schema[fieldname])
-                    lastfn = fieldname
-                start_term(text)
-                lasttext = text
+                start_term(btext)
+                lasttext = btext
+
+            # Add this posting
             length = dfl(docnum, fieldname)
-            add(docnum, weight, valuestring, length)
+            if value is None:
+                value = emptybytes
+            add(docnum, weight, value, length)
+
         if lasttext is not None:
             finish_term()
+        if lastfn is not None:
             finish_field()
 
     def start_field(self, fieldname, fieldobj):
     """Do not instantiate this object directly. It is used by the Index object
     to hold information about a segment. A list of objects of this class are
     pickled as part of the TOC file.
-    
+
     The TOC file stores a minimal amount of information -- mostly a list of
     Segment objects. Segments are the real reverse indexes. Having multiple
     segments allows quick incremental indexing: just create a new segment for
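
The reworked add_postings() above expects its postings already sorted by field name and then by term bytes, treats docnum=None entries as spelling-graph-only words, falls back to a length of 0 when no length store is passed, and substitutes emptybytes for a None value. A minimal sketch of that contract follows; the writer and schema objects, field names, terms, and documents here are invented for illustration and are not part of this commit:

    # Hypothetical posting stream in the (fieldname, btext, docnum, weight,
    # value) shape that FieldWriter.add_postings() consumes. Postings must
    # arrive sorted by field name, then by term bytes.
    postings = [
        ("content", b"alfa", 0, 1.0, None),       # term b"alfa" in document 0
        ("content", b"alfa", 2, 2.0, None),       # same term, later document
        ("content", b"bravo", None, None, None),  # docnum=None: spelling graph only
        ("content", b"bravo", 1, 1.0, None),
        ("title", b"alfa", 0, 1.0, None),         # new field triggers start_field()
    ]

    # `writer` stands for any concrete FieldWriter subclass and `schema` for
    # the index schema (add_postings() looks up schema[fieldname]); both are
    # assumed to exist here. Passing lengths=None makes every posting use a
    # doc-field length of 0, and value=None is replaced with emptybytes
    # before being handed to add().
    writer.add_postings(schema, None, postings)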

src/whoosh/lang/phonetic.py

-#encoding: utf8
+#encoding: utf-8
 
 """
 This module contains quasi-phonetic encoders for words in different languages.

src/whoosh/query/nary.py

         return sum(q.estimate_size(ixreader) for q in self.subqueries)
 
     def estimate_min_size(self, ixreader):
-        subs, nots = self._split_queries()
-        subs_min = min(q.estimate_min_size(ixreader) for q in subs)
-        if nots:
-            nots_sum = sum(q.estimate_size(ixreader) for q in nots)
-            subs_min = max(0, subs_min - nots_sum)
-        return subs_min
+        from whoosh.query import Not
+
+        subs = self.subqueries
+        for sub in subs:
+            if isinstance(sub, Not):
+                return 0
+
+        return min(q.estimate_min_size(ixreader) for q in subs)
 
     def normalize(self):
         from whoosh.query import Every, TermRange, NumericRange
 
         return self.__class__(subqs, boost=self.boost)
 
-    def _split_queries(self):
-        from whoosh.query import Not
+    def simplify(self, ixreader):
+        subs = self.subqueries
+        if subs:
+            q = self.__class__([subq.simplify(ixreader) for subq in subs],
+                                boost=self.boost).normalize()
+        else:
+            q = qcore.NullQuery
+        return q
 
-        subs = [q for q in self.subqueries if not isinstance(q, Not)]
-        nots = [q.query for q in self.subqueries if isinstance(q, Not)]
-        return (subs, nots)
+    def matcher(self, searcher, weighting=None):
+        # This method does a little sanity checking and then passes the info
+        # down to the _matcher() method which subclasses must implement
 
-    def simplify(self, ixreader):
-        subs, nots = self._split_queries()
+        subs = self.subqueries
+        if not subs:
+            return matching.NullMatcher()
 
-        if subs:
-            subs = self.__class__([subq.simplify(ixreader) for subq in subs],
-                                  boost=self.boost).normalize()
-            if nots:
-                nots = Or(nots).simplify().normalize()
-                return AndNot(subs, nots)
-            else:
-                return subs
+        if len(subs) == 1:
+            m = subs[0].matcher(searcher, weighting=weighting)
         else:
-            return qcore.NullQuery
+            m = self._matcher(subs, searcher, weighting)
+        return m
 
-    def _matcher(self, matchercls, q_weight_fn, searcher, weighting=None,
-                 **kwargs):
+    def _matcher(self, subs, searcher, weighting):
+        # Subclasses must implement this method
+
+        raise NotImplementedError
+
+    def _tree_matcher(self, subs, mcls, searcher, weighting, q_weight_fn,
+                      **kwargs):
         # q_weight_fn is a function which is called on each query and returns a
         # "weight" value which is used to build a huffman-like matcher tree. If
         # q_weight_fn is None, an order-preserving binary tree is used instead.
 
-        # Pull any queries inside a Not() out into their own list
-        subs, nots = self._split_queries()
-
-        if not subs:
-            return matching.NullMatcher()
-
         # Create a matcher from the list of subqueries
         subms = [q.matcher(searcher, weighting=weighting) for q in subs]
         if len(subms) == 1:
             m = subms[0]
         elif q_weight_fn is None:
-            m = make_binary_tree(matchercls, subms)
+            m = make_binary_tree(mcls, subms, **kwargs)
         else:
             w_subms = [(q_weight_fn(q), m) for q, m in zip(subs, subms)]
-            m = make_weighted_tree(matchercls, w_subms)
-
-        # If there were queries inside Not(), make a matcher for them and
-        # wrap the matchers in an AndNotMatcher
-        if nots:
-            if len(nots) == 1:
-                notm = nots[0].matcher(searcher)
-            else:
-                r = searcher.reader()
-                notms = [(q.estimate_size(r), q.matcher(searcher))
-                         for q in nots]
-                notm = make_weighted_tree(matching.UnionMatcher, notms)
-
-            if notm.is_active():
-                m = matching.AndNotMatcher(m, notm)
+            m = make_weighted_tree(mcls, w_subms, **kwargs)
 
         # If this query had a boost, add a wrapping matcher to apply the boost
         if self.boost != 1.0:
     def estimate_size(self, ixreader):
         return min(q.estimate_size(ixreader) for q in self.subqueries)
 
-    def matcher(self, searcher, weighting=None):
+    def _matcher(self, subs, searcher, weighting):
         r = searcher.reader()
-        return self._matcher(matching.IntersectionMatcher,
-                             lambda q: 0 - q.estimate_size(r), searcher,
-                             weighting=weighting)
+        q_weight_fn = lambda q: 0 - q.estimate_size(r)
+        return self._tree_matcher(subs, matching.IntersectionMatcher, searcher,
+                                  weighting, q_weight_fn)
 
 
 class Or(CompoundQuery):
         else:
             return set()
 
-    def matcher(self, searcher, weighting=None):
+    def _matcher(self, subs, searcher, weighting):
+        # Make a binary tree of UnionMatcher objects
         r = searcher.reader()
-        return self._matcher(self.matcher_class, lambda q: q.estimate_size(r),
-                             searcher, weighting=weighting)
+        q_weight_fn = lambda q: q.estimate_size(r)
+        m = self._tree_matcher(subs, matching.UnionMatcher, searcher,
+                               weighting, q_weight_fn)
+        return m
 
 
 class DisjunctionMax(CompoundQuery):
         else:
             return set()
 
-    def matcher(self, searcher, weighting=None):
+    def _matcher(self, subs, searcher, weighting):
         r = searcher.reader()
-        return self._matcher(matching.DisjunctionMaxMatcher,
-                             lambda q: q.estimate_size(r), searcher,
-                             weighting=weighting, tiebreak=self.tiebreak)
+        q_weight_fn = lambda q: q.estimate_size(r)
+        return self._tree_matcher(subs, matching.DisjunctionMaxMatcher,
+                                  searcher, weighting, q_weight_fn,
+                                  tiebreak=self.tiebreak)
 
 
 # Boolean queries
 
     JOINT = " OTHERWISE "
 
-    def matcher(self, searcher):
-        m = self.a.matcher(searcher)
+    def matcher(self, searcher, weighting=None):
+        m = self.a.matcher(searcher, weighting=weighting)
         if not m.is_active():
-            m = self.b.matcher(searcher)
+            m = self.b.matcher(searcher, weighting=weighting)
         return m
 
 

src/whoosh/query/positional.py

     def matcher(self, searcher, weighting=None):
         from whoosh.spans import SpanBefore
 
-        return self._matcher(SpanBefore._Matcher, None, searcher,
-                             weighting=weighting)
+        return self._tree_matcher(self.subqueries, SpanBefore._Matcher, searcher,
+                                  weighting, None)
 
 
 

src/whoosh/support/bench.py

                                           schema=ix.schema)
 
     def query(self):
-        qstring = " ".join(self.args).decode("utf8")
+        qstring = " ".join(self.args).decode("utf-8")
         return self.parser.parse(qstring)
 
     def find(self, q):

src/whoosh/util.py

     the prefix it shares with a, followed by the suffix encoded as UTF-8.
     """
     i = first_diff(a, b)
-    return chr(i) + b[i:].encode("utf8")
+    return chr(i) + b[i:].encode("utf-8")
 
 
 def prefix_encode_all(ls):
     last = u('')
     for w in ls:
         i = first_diff(last, w)
-        yield chr(i) + w[i:].encode("utf8")
+        yield chr(i) + w[i:].encode("utf-8")
         last = w
 
 
     last = u('')
     for w in ls:
         i = ord(w[0])
-        decoded = last[:i] + w[1:].decode("utf8")
+        decoded = last[:i] + w[1:].decode("utf-8")
         yield decoded
         last = decoded
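
The prefix-coding helpers above store each word as a single byte giving the length of the prefix it shares with the previous word, followed by the differing suffix encoded as UTF-8; decoding reverses this by copying that many characters from the previously decoded word. A small worked example (the word list is invented for illustration, and this assumes the Python 2 string semantics the module targets):

    from whoosh.util import prefix_encode_all

    # "alfa"   shares nothing with the empty string -> chr(0) + "alfa"
    # "alpaca" shares "al" (2 chars) with "alfa"    -> chr(2) + "paca"
    # "bravo"  shares nothing with "alpaca"         -> chr(0) + "bravo"
    for encoded in prefix_encode_all([u"alfa", u"alpaca", u"bravo"]):
        print(repr(encoded))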
 

tests/test_queries.py

         assert_equal(q._find_prefix(q.text), "a")
 
 
+def test_or_nots1():
+    # Issue #285
+    schema = fields.Schema(a=fields.KEYWORD(stored=True),
+                           b=fields.KEYWORD(stored=True))
+    st = RamStorage()
+    ix = st.create_index(schema)
+    with ix.writer() as w:
+        w.add_document(a=u("alfa"), b=u("charlie"))
 
+    with ix.searcher() as s:
+        q = query.And([query.Term("a", "alfa"),
+                       query.Or([query.Not(query.Term("b", "bravo")),
+                                 query.Not(query.Term("b", "charlie"))
+                                 ])
+                       ])
+        r = s.search(q)
+        assert len(r) == 1
 
 
+def test_or_nots2():
+    # Issue #286
+    schema = fields.Schema(a=fields.KEYWORD(stored=True),
+                           b=fields.KEYWORD(stored=True))
+    st = RamStorage()
+    ix = st.create_index(schema)
+    with ix.writer() as w:
+        w.add_document(b=u("bravo"))
 
+    with ix.searcher() as s:
+        q = query.Or([query.Term("a", "alfa"),
+                      query.Not(query.Term("b", "alfa"))
+                      ])
+        r = s.search(q)
+        assert len(r) == 1
 
 
 
 
 
 
+
+
+
+
+
+

tests/test_searching.py

 from nose.tools import assert_equal, assert_raises  # @UnresolvedImport
 
 from whoosh import analysis, fields, index, qparser, query, searching, scoring
-from whoosh.compat import u, xrange, text_type, permutations
+from whoosh.compat import b, u, xrange, text_type, permutations
 from whoosh.filedb.filestore import RamStorage
 
 
 
 
 def test_not():
-    _run_query(query.Or([query.Term("value", u("red")),
-                         query.Term("name", u("yellow")),
-                         query.Not(query.Term("name", u("quick")))]),
+    _run_query(query.And([query.Or([query.Term("value", u("red")),
+                                    query.Term("name", u("yellow"))]),
+                          query.Not(query.Term("name", u("quick")))]),
                [u("A"), u("E")])
 
 
         assert_equal([(i, [0]) for i in xrange(200)], items)
 
 
+def test_groupedby_with_terms():
+    schema = fields.Schema(content=fields.TEXT, organism=fields.ID)
+    ix = RamStorage().create_index(schema)
 
+    with ix.writer() as w:
+        w.add_document(organism=u("mus"), content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00"))
+        w.add_document(organism=u("mus"), content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study"))
+        w.add_document(organism=u("hs"), content=u("This is the first document we've added!"))
 
+    with ix.searcher() as s:
+        q = qparser.QueryParser("content", schema=ix.schema).parse(u("IPFSTD1"))
+        r = s.search(q, groupedby=["organism"], terms=True)
+        assert len(r) == 2
+        assert r.groups("organism") == {"mus": [1, 0]}
+        assert r.has_matched_terms()
+        assert r.matched_terms() == set([('content', b('ipfstd1'))])
 
 
 
 
 
 
+

tests/test_tables.py

-# encoding: utf8
+# encoding: utf-8
 
 from __future__ import with_statement
 import random