Matt Chaput committed 1a1ef21

Fixed bugs in ramindex and ListMatcher handling of term statistics.

Comments (0)

Files changed (6)

src/whoosh/filedb/filereading.py

         else:
             docids, weights, values = postings
             postreader = ListMatcher(docids, weights, values, format,
-                                     scorer=scorer, term=(fieldname, text))
+                                     scorer=scorer, term=(fieldname, text),
+                                     terminfo=terminfo)
         
         deleted = self.segment.deleted
         if deleted:

src/whoosh/matching.py

     """
     
     def __init__(self, ids, weights=None, values=None, format=None,
-                 scorer=None, position=0, all_weights=None, term=None):
+                 scorer=None, position=0, all_weights=None, term=None,
+                 terminfo=None):
         """
         :param ids: a list of doc IDs.
         :param weights: a list of weights corresponding to the list of IDs.
         self._format = format
         self._scorer = scorer
         self._term = term
+        self._terminfo = terminfo
     
     def __repr__(self):
         return "<%s>" % self.__class__.__name__
             return 1.0
     
     def block_min_length(self):
-        return self._minlength
+        return self._terminfo.min_length()
     
     def block_max_length(self):
-        return self._maxlength
+        return self._terminfo.max_length()
     
     def block_max_weight(self):
         if self._all_weights:
             return self._all_weights
         elif self._weights:
             return max(self._weights)
+        elif self._terminfo is not None:
+            return self._terminfo.max_weight()
         else:
             return 1.0
     
     def block_max_wol(self):
-        return self.block_max_weight() / self.block_min_length()
+        return self._terminfo.max_wol()
     
     def score(self):
         if self._scorer:

src/whoosh/ramindex.py

     def postings(self, fieldname, text, scorer=None):
         self._test_field(fieldname)
         try:
-            postings = self.invindex[fieldname][text]
+            terminfo = self.term_info(fieldname, text)
         except KeyError:
             raise TermNotFound((fieldname, text))
         
+        format = self.schema[fieldname].format
+        postings = self.invindex[fieldname][text]
         excludeset = self.deleted
-        format = self.schema[fieldname].format
         if excludeset:
             postings = [x for x in postings if x[0] not in excludeset]
             if not postings:
                 return NullMatcher()
         ids, weights, values = zip_(*postings)
-        return ListMatcher(ids, weights, values, format=format)
+        lm = ListMatcher(ids, weights, values, format=format, scorer=scorer,
+                         term=(fieldname, text), terminfo=terminfo)
+        return lm
     
     def reader(self):
         return self
                     # Count of UNIQUE terms in the value
                     unique = 0
                     
+                    words = []
                     for w, freq, weight, valuestring in field.index(value):
                         weight *= fieldboost
                         
+                        words.append((w, weight))
                         if w not in fielddict:
                             fielddict[w] = []
                         fielddict[w].append((self.docnum, weight, valuestring))
                         
                         usage += 44 + len(valuestring)
                         
+                        # Record max weight and max wol
                         # min_length, max_length, max_weight, max_wol
                         wol = weight / count
-                        if (name, w) in termstats:
-                            ts = termstats[name, w]
+                    
+                    for w, weight in words:
+                        ts = termstats.get((name, w))
+                        # Record term stats for each term in this document
+                        wol = weight / count
+                        if ts is None:
+                            termstats[name, w] = [count, count, weight, wol]
+                        else:
                             if count < ts[0]:
                                 ts[0] = count
                             if count > ts[1]:
                                 ts[2] = weight
                             if wol > ts[3]:
                                 ts[3] = wol
-                        else:
-                            termstats[name, w] = [count, count, weight, wol]
                     
                     if field.scorable:
                         fieldlengths[self.docnum, name] = count

src/whoosh/scoring.py

 
 
 class WeightLengthScorer(BaseScorer):
-    """Base class for scorers where the only per-document variables are weight
-    and length.
+    """Base class for scorers where the only per-document variables are term
+    weight and field length.
     
     Subclasses should follow this pattern:
     

src/whoosh/writing.py

     To get a writer for a particular index, call
     :meth:`~whoosh.index.Index.writer` on the Index object.
     
-    >>> writer = my_index.writer()
+    >>> writer = myindex.writer()
     
     You can use this object as a context manager. If an exception is thrown
-    from within the context it calls cancel(), otherwise it calls commit() when
-    the context exits.
+    from within the context it calls :meth:`~IndexWriter.cancel` to clean up
+    temporary files, otherwise it calls :meth:`~IndexWriter.commit` when the
+    context exits.
+    
+    >>> with myindex.writer() as w:
+    ...     w.add_document(title="First document", content="Hello there.")
+    ...     w.add_document(title="Second document", content="This is easy!")
     """
     
     def __enter__(self):

tests/test_ramindex.py

             assert_equal([d["id"] for d in r], result)
         
         _runq(query.Term("text", u("format")), ["format", "vector"])
-        _runq(query.Term("text", u("the")), ["fieldtype", "format", "vector", "const", "stored"])
+        _runq(query.Term("text", u("the")), ["fieldtype", "format", "const", "vector", "stored"])
         _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
         _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
         _runq(query.TermRange("id", u("c"), u("s")), ["fieldtype", "format", "const"])
         _runq(query.NumericRange("subs", 10, 100), ["fieldtype", "format", "vector", "scorable"])
-        _runq(query.Phrase("text", ["this", "field"]), ["scorable", "stored", "unique"], limit=None)
+        _runq(query.Phrase("text", ["this", "field"]), ["scorable", "unique", "stored"], limit=None)
         _runq(query.Every(), ["fieldtype", "format", "vector", "scorable", "stored", "unique", "const"])
         _runq(query.Every("subs"), ["fieldtype", "format", "vector", "scorable", "stored", "unique", "const"])
 
     assert_raises(TermNotFound, ix.doc_frequency, "content", "foo")
     assert_equal(ix.doc_frequency("id", "foo"), 0)
 
+def test_missing_postings():
+    schema = fields.Schema(id=fields.ID)
+    ix = RamIndex(schema)
+    ix.add_document(id=u("one"))
+    assert_raises(TermNotFound, ix.postings, "content", "foo")
+    assert_raises(TermNotFound, ix.postings, "id", "foo")
 
+def test_block_info():
+    schema = fields.Schema(key=fields.KEYWORD)
+    ix = RamIndex(schema)
+    ix.add_document(key=u("alfa bravo charlie"))
+    ix.add_document(key=u("bravo delta"))
+    ix.add_document(key=u("charlie delta echo foxtrot"))
+    ix.add_document(key=u("delta echo foxtrot golf hotel india"))
+    ix.add_document(key=u("echo foxtrot golf hotel india juliet alfa bravo"))
+    s = ix.searcher()
+    p = s.postings("key", "bravo")
+    assert p.supports_block_quality()
+    assert_equal(p.block_min_length(), 2)
+    assert_equal(p.block_max_length(), 8)
+    assert_equal(p.block_max_wol(), 0.5)
+    
 
 
 
 
 
 
-
-
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.