Matt Chaput committed 98915f9

Fixed a problem with sub-searchers using the parent searcher's doc count. Possibly related to issue #237.
Minor refactorings.
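
The gist of the fix: a sub-searcher created for one segment inherited the parent searcher's document count, so code that sizes itself from doc_count_all() could run past the end of the segment. A minimal sketch of the two counts, assuming the Whoosh 2.x API (RamStorage, Searcher.subsearchers):

    # Sketch only: per-segment vs. whole-index doc counts.
    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(a=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u"alfa")
    w.commit()
    w = ix.writer()
    w.add_document(a=u"bravo")
    w.commit(merge=False)  # keep two segments

    with ix.searcher() as s:
        print(s.doc_count_all())  # 2: the whole index
        for segment_searcher, offset in s.subsearchers:
            # Each segment holds one document; before this fix a
            # sub-searcher could report the parent's count (2) here.
            print(segment_searcher.doc_count_all())  # 1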

  • Parent commits: ea939ef

Files changed (5)

File src/whoosh/query.py

         # as And and Or do special handling of Not subqueries.
         reader = searcher.reader()
         child = self.query.matcher(searcher)
-        return InverseMatcher(child, searcher.doc_count_all(),
+        return InverseMatcher(child, reader.doc_count_all(),
                               missing=reader.is_deleted)
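
InverseMatcher yields every document its child matcher does not match, so the limit it is given has to describe the same ID space as the reader the child matcher was built on; with the parent's count, a sub-searcher over one segment could yield docnums past the segment's last document. A simplified, hypothetical stand-in to show the invariant (not the real implementation):

    class SimpleInverseMatcher(object):
        def __init__(self, child_ids, limit, missing):
            self._matched = set(child_ids)
            self._limit = limit      # must be this reader's doc_count_all()
            self._missing = missing  # e.g. reader.is_deleted

        def all_ids(self):
            for docnum in range(self._limit):
                if docnum not in self._matched and not self._missing(docnum):
                    yield docnum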
 
 

File src/whoosh/scoring.py

         """Returns the inverse document frequency of the given term.
         """
 
-        n = searcher.doc_frequency(fieldname, text)
-        return log((searcher.doc_count_all()) / (n + 1)) + 1
+        parent = searcher.get_parent()
+        n = parent.doc_frequency(fieldname, text)
+        dc = parent.doc_count_all()
+        return log(dc / (n + 1)) + 1
 
     def scorer(self, searcher, fieldname, text, qf=1):
         """Returns an instance of :class:`whoosh.scoring.Scorer` configured

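Routing the statistics through searcher.get_parent() (which returns the searcher itself when there is no parent) keeps IDF global: the document frequency n and the doc count are taken from the whole index no matter which segment is being scored, so a term scores the same everywhere. A rough numeric sketch with made-up counts, using the natural log as the formula above does:

    from math import log

    # Whole index: 1000 docs, term appears in 9 of them.
    global_idf = log(1000 / (9 + 1)) + 1  # ~5.61 in every segment

    # Per-segment stats (two segments of 500) would disagree:
    seg1_idf = log(500 / (9 + 1)) + 1     # ~4.91
    seg2_idf = log(500 / (0 + 1)) + 1     # ~7.21
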
File src/whoosh/searching.py

         self.is_closed = False
         self._closereader = closereader
         self._ix = fromindex
+        self._doccount = self.ixreader.doc_count_all()
 
         if parent:
             self.parent = weakref.ref(parent)
             self.schema = parent.schema
-            self._doccount = parent._doccount
             self._idf_cache = parent._idf_cache
             self._filter_cache = parent._filter_cache
         else:
             self.parent = None
             self.schema = self.ixreader.schema
-            self._doccount = self.ixreader.doc_count_all()
             self._idf_cache = {}
             self._filter_cache = {}
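
With this change _doccount always comes from the searcher's own reader; only the schema and the IDF/filter caches are still shared with the parent. Assuming the two-segment index from the sketch above, the per-segment counts now add up:

    with ix.searcher() as s:
        total = sum(sub.doc_count_all() for sub, _ in s.subsearchers)
        assert s.doc_count_all() == total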
 
             # again.
             checkquality = matcher.next()
 
-    def sort(self, searcher, q, sortedby, reverse=False, allow=None,
+    def sort(self, global_searcher, q, sortedby, reverse=False, allow=None,
              restrict=None):
-        self.searcher = searcher
+        self.searcher = global_searcher
         self.q = q
         self.docset = set()
         self._set_filters(allow, restrict)
         addall = self.should_add_all()
 
         facet = sorting.MultiFacet.from_sortedby(sortedby)
-        catter = facet.categorizer(searcher)
+        catter = facet.categorizer(global_searcher)
         t = now()
 
-        if searcher.is_atomic():
-            searchers = [(searcher, 0)]
+        if global_searcher.is_atomic():
+            searchers = [(global_searcher, 0)]
         else:
-            searchers = searcher.subsearchers
+            searchers = global_searcher.subsearchers
 
-        for s, offset in searchers:
-            self.subsearcher = s
-            self._set_categorizers(s, offset)
-            catter.set_searcher(s, offset)
+        for segment_searcher, offset in searchers:
+            self.subsearcher = segment_searcher
+            self._set_categorizers(segment_searcher, offset)
+            catter.set_searcher(segment_searcher, offset)
 
             if catter.requires_matcher or self.termlists:
                 ls = list(self.pull_matches(q, offset, catter.key_for_matcher))
             else:
-                ls = list(self.pull_unscored_matches(q, offset,
-                                                     catter.key_for_id))
+                kfi = catter.key_for_id
+                ls = list(self.pull_unscored_matches(q, offset, kfi))
 
             if addall:
                 items.extend(ls)
         timelimited = bool(self.timelimit)
 
         matcher = q.matcher(self.subsearcher)
-        for id in matcher.all_ids():
+        for docnum in matcher.all_ids():
             # Check whether the time limit expired since the last match
             if timelimited and self.timedout and not self.greedy:
                 raise TimeLimit
 
             # The current document ID 
-            offsetid = id + offset
+            offsetid = docnum + offset
 
             # Check whether the document is filtered
             if ((not allow or offsetid in allow)
                 and (not restrict or offsetid not in restrict)):
                 # Collect and yield this document
-                key = keyfn(id)
-                collect(id, offsetid, key)
+                key = keyfn(docnum)
+                collect(docnum, offsetid, key)
                 yield (key, offsetid)
 
             # Check whether the time limit expired
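
(Renaming id to docnum also stops shadowing the builtin.) The offset arithmetic is the usual segment-to-global mapping: a matcher yields segment-relative docnums, and adding the segment's starting offset gives the index-wide docnum that the allow/restrict sets are expressed in. For example, with segments of sizes 3 and 4 the offsets are 0 and 3, so local docnum 2 in the second segment is global docnum 5:

    offsets = [0, 3]
    assert 2 + offsets[1] == 5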

File src/whoosh/sorting.py

 
     maptype = None
 
-    def categorizer(self, searcher):
+    def categorizer(self, global_searcher):
         """Returns a :class:`Categorizer` corresponding to this facet.
+
+        :param global_searcher: A parent searcher. You can use this searcher if
+            you need global document ID references.
         """
 
         raise NotImplementedError
     allow_overlap = False
     requires_matcher = False
 
-    def set_searcher(self, searcher, docoffset):
+    def set_searcher(self, segment_searcher, docoffset):
         """Called by the collector when the collector moves to a new segment.
-        The ``searcher`` will be atomic. The ``docoffset`` is the offset of
-        the segment's document numbers relative to the entire index. You can
-        use the offset to get absolute index docnums by adding the offset to
-        segment-relative docnums.
+        The ``segment_searcher`` will be atomic. The ``docoffset`` is the
+        offset of the segment's document numbers relative to the entire index.
+        You can use the offset to get absolute index docnums by adding the
+        offset to segment-relative docnums.
         """
 
         pass
     def default_name(self):
         return self.fieldname
 
-    def categorizer(self, searcher):
+    def categorizer(self, global_searcher):
         from whoosh.fields import NUMERIC, DATETIME
 
         # The searcher we're passed here may wrap a multireader, but the
         # Categorizer.set_searcher method call
         fieldname = self.fieldname
         field = None
-        if fieldname in searcher.schema:
-            field = searcher.schema[fieldname]
-        hascache = searcher.reader().supports_caches()
+        if fieldname in global_searcher.schema:
+            field = global_searcher.schema[fieldname]
+        hascache = global_searcher.reader().supports_caches()
 
         if self.allow_overlap:
             return self.OverlappingFieldCategorizer(fieldname)
         else:
             # If the reader does not support field caches or we need to
             # reverse-sort a string field, we need to do more work
-            return self.NoCacheFieldCategorizer(searcher, fieldname,
+            return self.NoCacheFieldCategorizer(global_searcher, fieldname,
                                                 self.reverse)
 
     class FieldCategorizer(Categorizer):
         def __init__(self, fieldname):
             self.fieldname = fieldname
 
-        def set_searcher(self, searcher, docoffset):
-            self.fieldcache = searcher.reader().fieldcache(self.fieldname)
+        def set_searcher(self, segment_searcher, docoffset):
+            r = segment_searcher.reader()
+            self.fieldcache = r.fieldcache(self.fieldname)
 
         def key_for_id(self, docid):
             return self.fieldcache.key_for(docid)
             self.fieldname = fieldname
             self.reverse = reverse
 
-        def set_searcher(self, searcher, docoffset):
-            self.default = searcher.schema[self.fieldname].sortable_default()
-            self.fieldcache = searcher.reader().fieldcache(self.fieldname)
+        def set_searcher(self, segment_searcher, docoffset):
+            r = segment_searcher.reader()
+            fieldobj = segment_searcher.schema[self.fieldname]
+            self.default = fieldobj.sortable_default()
+            self.fieldcache = r.fieldcache(self.fieldname)
 
         def key_for_id(self, docid):
             value = self.fieldcache.key_for(docid)
         order).
         """
 
-        def __init__(self, searcher, fieldname, reverse):
+        def __init__(self, global_searcher, fieldname, reverse):
             # Cache the relative positions of all docs with the given field
             # across the entire index
-            reader = searcher.reader()
+            reader = global_searcher.reader()
             dc = reader.doc_count_all()
             arry = array("i", [dc + 1] * dc)
-            field = searcher.schema[fieldname]
-            values = field.sortable_values(reader, fieldname)
+            fieldobj = global_searcher.schema[fieldname]
+            values = fieldobj.sortable_values(reader, fieldname)
+
+            values = list(values)
+
             for i, (t, _) in enumerate(values):
                 if reverse:
                     i = dc - i
                     arry[docid] = i
             self.array = arry
 
-        def set_searcher(self, searcher, docoffset):
+        def set_searcher(self, segment_searcher, docoffset):
             self.docoffset = docoffset
 
         def key_for_id(self, docid):
-            return self.array[docid + self.docoffset]
+            arry = self.array
+            offset = self.docoffset
+            global_id = offset + docid
+            assert docid >= 0
+            assert global_id < len(arry), ("%s + %s >= %s"
+                                           % (docid, offset, len(arry)))
+            return arry[global_id]
 
     class OverlappingFieldCategorizer(Categorizer):
         allow_overlap = True
             self.fieldname = fieldname
             self.use_vectors = False
 
-        def set_searcher(self, searcher, docoffset):
+        def set_searcher(self, segment_searcher, docoffset):
             fieldname = self.fieldname
-            dc = searcher.doc_count_all()
-            field = searcher.schema[fieldname]
-            reader = searcher.reader()
+            dc = segment_searcher.doc_count_all()
+            field = segment_searcher.schema[fieldname]
+            reader = segment_searcher.reader()
 
             if field.vector:
                 # If the field was indexed with term vectors, use the vectors
                 # to get the list of values in each matched document
                 self.use_vectors = True
-                self.searcher = searcher
+                self.segment_searcher = segment_searcher
             else:
                 # Otherwise, cache the values in each document in a huge list
                 # of lists
         def keys_for_id(self, docid):
             if self.use_vectors:
                 try:
-                    v = self.searcher.vector(docid, self.fieldname)
+                    v = self.segment_searcher.vector(docid, self.fieldname)
                     return list(v.all_ids())
                 except KeyError:
                     return None
         def key_for_id(self, docid):
             if self.use_vectors:
                 try:
-                    v = self.searcher.vector(docid, self.fieldname)
+                    v = self.segment_searcher.vector(docid, self.fieldname)
                     return v.id()
                 except KeyError:
                     return None
         self.other = other
         self.maptype = maptype
 
-    def categorizer(self, searcher):
+    def categorizer(self, global_searcher):
         return self.QueryCategorizer(self.querydict, self.other)
 
     class QueryCategorizer(Categorizer):
             self.other = other
             self.allow_overlap = allow_overlap
 
-        def set_searcher(self, searcher, offset):
+        def set_searcher(self, segment_searcher, offset):
             self.docsets = {}
             for qname, q in self.querydict.items():
-                docset = set(q.docs(searcher))
+                docset = set(q.docs(segment_searcher))
                 if docset:
                     self.docsets[qname] = docset
             self.offset = offset
 
             cstart = cend
 
-    def categorizer(self, searcher):
-        return QueryFacet(self.querydict).categorizer(searcher)
+    def categorizer(self, global_searcher):
+        return QueryFacet(self.querydict).categorizer(global_searcher)
 
 
 class DateRangeFacet(RangeFacet):
         results = searcher.search(myquery, sortedby=tag_score)
     """
 
-    def categorizer(self, searcher):
-        return self.ScoreCategorizer(searcher)
+    def categorizer(self, global_searcher):
+        return self.ScoreCategorizer(global_searcher)
 
     class ScoreCategorizer(Categorizer):
         requires_matcher = True
 
-        def __init__(self, searcher):
-            w = searcher.weighting
+        def __init__(self, global_searcher):
+            w = global_searcher.weighting
             self.use_final = w.use_final
             if w.use_final:
                 self.final = w.final
 
-        def set_searcher(self, searcher, offset):
-            self.searcher = searcher
+        def set_searcher(self, segment_searcher, offset):
+            self.segment_searcher = segment_searcher
 
         def key_for_matcher(self, matcher):
             score = matcher.score()
             if self.use_final:
-                score = self.final(self.searcher, matcher.id(), score)
+                score = self.final(self.segment_searcher, matcher.id(), score)
             # Negate the score so higher values sort first
             return 0 - score
 
         self.fn = fn
         self.maptype = maptype
 
-    def categorizer(self, searcher):
-        return self.FunctionCategorizer(searcher, self.fn)
+    def categorizer(self, global_searcher):
+        return self.FunctionCategorizer(global_searcher, self.fn)
 
     class FunctionCategorizer(Categorizer):
-        def __init__(self, searcher, fn):
-            self.searcher = searcher
+        def __init__(self, global_searcher, fn):
+            self.global_searcher = global_searcher
             self.fn = fn
 
-        def set_searcher(self, searcher, docoffset):
+        def set_searcher(self, segment_searcher, docoffset):
             self.offset = docoffset
 
         def key_for_id(self, docid):
-            return self.fn(self.searcher, docid + self.offset)
+            return self.fn(self.global_searcher, docid + self.offset)
 
 
 class StoredFieldFacet(FacetType):
     def default_name(self):
         return self.fieldname
 
-    def categorizer(self, searcher):
+    def categorizer(self, global_searcher):
         return self.StoredFieldCategorizer(self.fieldname, self.allow_overlap,
                                            self.split_fn)
 
             self.allow_overlap = allow_overlap
             self.split_fn = split_fn
 
-        def set_searcher(self, searcher, docoffset):
-            self.searcher = searcher
+        def set_searcher(self, segment_searcher, docoffset):
+            self.segment_searcher = segment_searcher
 
         def keys_for_id(self, docid):
-            value = self.searcher.stored_fields(docid).get(self.fieldname)
+            d = self.segment_searcher.stored_fields(docid)
+            value = d.get(self.fieldname)
             if self.split_fn:
                 return self.split_fn(value)
             else:
                 return value.split()
 
         def key_for_id(self, docid):
-            fields = self.searcher.stored_fields(docid)
-            return fields.get(self.fieldname)
+            d = self.segment_searcher.stored_fields(docid)
+            return d.get(self.fieldname)
 
 
 class MultiFacet(FacetType):
         multi = cls()
         if isinstance(sortedby, string_type):
             multi._add(sortedby)
-        elif (isinstance(sortedby, (list, tuple)) or hasattr(sortedby,
-                                                             "__iter__")):
+        elif (isinstance(sortedby, (list, tuple))
+              or hasattr(sortedby, "__iter__")):
             for item in sortedby:
                 multi._add(item)
         else:
         self.facets.append(facet)
         return self
 
-    def categorizer(self, searcher):
+    def categorizer(self, global_searcher):
         if not self.facets:
             raise Exception("No facets")
         elif len(self.facets) == 1:
-            catter = self.facets[0].categorizer(searcher)
+            catter = self.facets[0].categorizer(global_searcher)
         else:
-            catter = self.MultiCategorizer([facet.categorizer(searcher)
+            catter = self.MultiCategorizer([facet.categorizer(global_searcher)
                                             for facet in self.facets])
         return catter
 
         def requires_matcher(self):
             return any(c.requires_matcher for c in self.catters)
 
-        def set_searcher(self, searcher, docoffset):
+        def set_searcher(self, segment_searcher, docoffset):
             for catter in self.catters:
-                catter.set_searcher(searcher, docoffset)
+                catter.set_searcher(segment_searcher, docoffset)
 
         def key_for_matcher(self, matcher):
             return tuple(catter.key_for_matcher(matcher)
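
The sorting.py changes are mostly renames that make the two searcher roles explicit: global_searcher is the top-level searcher a categorizer is built from, while segment_searcher is the atomic searcher passed to set_searcher() for each segment. NoCacheFieldCategorizer is where the distinction matters most: it precomputes one array over the whole index from the global searcher (materializing sortable_values() with list() in case it returns a lazy iterator), and each segment then reads that array at offset + docid. The new assertions catch exactly the out-of-range reads the old doc-count confusion produced. The pattern, reduced to a sketch with made-up sizes:

    from array import array

    dc = 7                            # doc_count_all() for the whole index
    arry = array("i", [dc + 1] * dc)  # default key sorts missing docs last
    # ... arry[docid] filled in from the index-wide sort order ...

    def key_for_id(docid, docoffset):
        global_id = docoffset + docid  # segment-relative -> index-wide
        assert 0 <= global_id < len(arry)
        return arry[global_id]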

File tests/test_sorting.py

         assert_equal(gs["bear"], 3)
 
 
+def test_nocachefield_segments():
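+    # Regression test: reverse-sorting a plain ID field forces the
+    # NoCacheFieldCategorizer path; three segments plus a deletion is
+    # the layout that triggered out-of-range reads before this fix.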
+    schema = fields.Schema(a=fields.ID(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(a=u("bravo"))
+    w.add_document(a=u("echo"))
+    w.add_document(a=u("juliet"))
+    w.commit()
+    w = ix.writer()
+    w.add_document(a=u("kilo"))
+    w.add_document(a=u("foxtrot"))
+    w.add_document(a=u("charlie"))
+    w.commit(merge=False)
+    w = ix.writer()
+    w.delete_by_term("a", u("echo"))
+    w.add_document(a=u("alfa"))
+    w.add_document(a=u("india"))
+    w.add_document(a=u("delta"))
+    w.commit(merge=False)
 
+    with ix.searcher() as s:
+        q = query.TermRange("a", u("bravo"), u("k"))
+        facet = sorting.FieldFacet("a", reverse=True)
+
+        cat = facet.categorizer(s)
+        assert_equal(cat.__class__, sorting.FieldFacet.NoCacheFieldCategorizer)
+
+        r = s.search(q, sortedby=facet)
+        assert_equal([hit["a"] for hit in r],
+                     ["juliet", "india", "foxtrot", "delta", "charlie",
+                      "bravo"])
+
+        mq = query.Or([query.Term("a", u("bravo")),
+                       query.Term("a", u("delta"))])
+        anq = query.AndNot(q, mq)
+        r = s.search(anq, sortedby=facet)
+        assert_equal([hit["a"] for hit in r],
+                     ["juliet", "india", "foxtrot", "charlie"])
+
+        mq = query.Or([query.Term("a", u("bravo")),
+                       query.Term("a", u("delta"))])
+        r = s.search(q, mask=mq, sortedby=facet)
+        assert_equal([hit["a"] for hit in r],
+                     ["juliet", "india", "foxtrot", "charlie"])
+
+        fq = query.Or([query.Term("a", u("alfa")),
+                       query.Term("a", u("charlie")),
+                       query.Term("a", u("echo")),
+                       query.Term("a", u("india")),
+                       ])
+        r = s.search(query.Every(), filter=fq, sortedby=facet)
+        assert_equal([hit["a"] for hit in r],
+                     ["india", "charlie", "alfa"])
+
+        nq = query.Not(query.Or([query.Term("a", u("alfa")),
+                                 query.Term("a", u("india"))]))
+        r = s.search(query.Every(), filter=nq, sortedby=facet)
+        assert_equal([hit["a"] for hit in r],
+                     ["kilo", "juliet", "foxtrot", "delta", "charlie",
+                      "bravo"])