Commits

Matt Chaput committed d5b5196

Starting "better quality" (betterq) branch.

Comments (0)

Files changed (8)

src/whoosh/filedb/filepostings.py

             return 0
         return self._skip_to_block(lambda: bq() <= minquality)
     
-    def block_maxweight(self):
-        return self.block.maxweight
+    def block_min_length(self):
+        return self.block.min_length()
     
-    def block_maxwol(self):
-        return self.block.maxwol
+    def block_max_length(self):
+        return self.block.max_length()
     
-    def block_maxid(self):
-        return self.block.maxid
+    def block_max_weight(self):
+        return self.block.max_weight()
     
-    def block_minlength(self):
-        return self.block.minlength
+    def block_max_wol(self):
+        return self.block.max_wol()
+    
+    def block_max_id(self):
+        return self.block.max_id()
     
     def score(self):
         return self.scorer.score(self)

src/whoosh/filedb/filetables.py

 from cPickle import loads, dumps
 from struct import Struct
 
-from whoosh.system import (_INT_SIZE, _LONG_SIZE, pack_ushort, pack_uint,
-                           pack_long, unpack_ushort, unpack_uint, unpack_long)
-from whoosh.util import byte_to_length, utf8encode, utf8decode
+from whoosh.matching import ListMatcher
+from whoosh.reading import TermStats
+from whoosh.system import (_INT_SIZE, _LONG_SIZE, _FLOAT_SIZE, pack_ushort,
+                           pack_uint, pack_long, unpack_ushort, unpack_uint,
+                           unpack_long)
+from whoosh.util import byte_to_length, length_to_byte, utf8encode, utf8decode
 
 
 _4GB = 4 * 1024 * 1024 * 1024
 
     def all(self, key):
         read = self.read
-        for datapos, datalen in self._get_ranges(key):
+        for datapos, datalen in self.ranges_for_key(key):
             yield read(datapos, datalen)
 
     def __contains__(self, key):
-        for _ in self._get_ranges(key):
+        for _ in self.ranges_for_key(key):
             return True
         return False
 
         keylen = self.dbfile.get_uint(pos)
         return self.read(pos + lengths_size, keylen)
 
-    def _get_ranges(self, key):
+    def ranges_for_key(self, key):
         read = self.read
         pointer_size = self.pointer_size
         keyhash = self.hash_func(key)
                 if keylen == len(key):
                     if key == read(pos + lengths_size, keylen):
                         yield (pos + lengths_size + keylen, datalen)
-                        
+    
+    def range_for_key(self, key):
+        for item in self.ranges_for_key(key):
+            return item
+    
     def end_of_hashes(self):
         if self.old_format:
             lastpos, lastnum = self.buckets[255]
         return values
 
 
+# TermInfo
+
+class TermInfo(TermStats):
+    struct = Struct("!IIBBff")
+    
+    def __init__(self, freq, docfreq, minlength, maxlength, maxweight, maxwol,
+                 postings):
+        self._freq = freq
+        self._docfreq = docfreq
+        self._minlength = minlength
+        self._maxlength = maxlength
+        self._maxweight = maxweight
+        self._maxwol = maxwol
+        self._postings = postings
+    
+    def to_string(self):
+        ml = length_to_byte(self._minlength)
+        xl = length_to_byte(self._maxlength)
+        st = self.struct.pack(self._freq, self._docfreq, ml, xl, self._maxweight,
+                              self._maxwol)
+        
+        if isinstance(self._postings, tuple):
+            magic = 1
+            st += dumps(self._postings, -1)[2:-1]
+        else:
+            magic = 0
+            st += pack_long(self._postings)
+        return chr(magic) + st
+
+    def postings(self, postfile, format, scorer=None):
+        p = self._postings
+        if isinstance(p, tuple):
+            docids, weights, values = p
+            pr = ListMatcher(docids, weights, values, format)
+
+    @classmethod
+    def from_string(cls, s):
+        hbyte = ord(s[0])
+        if hbyte < 2:
+            # Freq, Doc freq, min length, max length, max weight, max WOL
+            f, df, ml, xl, xw, xwol = cls.struct.unpack(s[1:])
+            ml = byte_to_length(ml)
+            xl = byte_to_length(xl)
+            # Postings
+            pstr = s[cls.struct.size + 1:]
+            if hbyte == 0:
+                p = unpack_long(pstr)
+            else:
+                p = loads(pstr + ".")
+        else:
+            raise Exception("Unknown struct header %s" % hbyte)
+        
+        return cls(f, df, ml, xl, xw, xwol, p)
+    
+    @classmethod
+    def read_frequency(cls, dbfile, datapos):
+        return dbfile.get_uint(datapos + 1)
+    
+    @classmethod
+    def read_doc_freq(cls, dbfile, datapos):
+        return dbfile.get_uint(datapos + 1 + _INT_SIZE)
+    
+    @classmethod
+    def read_min_and_max_length(cls, dbfile, datapos):
+        lenpos = datapos + 1 + (_INT_SIZE * 2)
+        ml = byte_to_length(dbfile.get_byte(lenpos))
+        xl = byte_to_length(dbfile.get_byte(lenpos + 1))
+        return ml, xl
+    
+    @classmethod
+    def read_max_weight(cls, dbfile, datapos):
+        return dbfile.get_float(datapos + 1 + (_INT_SIZE * 2) + 2)
+    
+    @classmethod
+    def read_max_wol(cls, dbfile, datapos):
+        return dbfile.get_float(datapos + 1 + (_INT_SIZE * 2) + 2 + _FLOAT_SIZE)
+    
+
 # Utility functions
 
 #def dump_hash(hashreader):

src/whoosh/filedb/postblocks.py

     def __nonzero__(self):
         return bool(self.ids)
     
+    def min_length(self):
+        if self.lengths:
+            return min(self.lengths)
+        return self._minlength
+    
+    def max_length(self):
+        if self.lengths:
+            return max(self.lengths)
+        return self._maxlength
+    
+    def max_weight(self):
+        return max(self.weights)
+    
+    def max_wol(self):
+        if self.lengths:
+            return max(w / l for w, l in zip(self.weights, self.lengths))
+        return 0.0
+    
     def stats(self):
         # Calculate block statistics
         maxweight = max(self.weights)
 
 
 current = Block2
-magic_map = {Block1.magic: Block1, Block2.magic: Block2}
+block_types = (Block1, Block2)
+magic_map = dict((b.magic, b) for b in block_types)

src/whoosh/formats.py

 from whoosh.analysis import unstopped, entoken
 from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_uint, unpack_uint,
                            pack_float, unpack_float)
-from whoosh.util import float_to_byte, byte_to_float
+from whoosh.util import (float_to_byte, byte_to_float)
 
 
 # Format base class
 
 
 
+
+    
+
+
+
+

src/whoosh/matching.py

         
         raise NotImplementedError
     
-    def replace(self):
+    def replace(self, minquality=0):
         """Returns a possibly-simplified version of this matcher. For example,
         if one of the children of a UnionMatcher is no longer active, calling
         this method on the UnionMatcher will return the other child.
     """
     
     def __init__(self, ids, weights=None, values=None, format=None,
-                 scorer=None, position=0, all_weights=None,
-                 maxwol=0.0, minlength=0):
+                 scorer=None, position=0, all_weights=None, termstats=None):
         """
         :param ids: a list of doc IDs.
         :param weights: a list of weights corresponding to the list of IDs.
         self._i = position
         self._format = format
         self._scorer = scorer
-        self._maxwol = maxwol
-        self._minlength = minlength
+        self._stats = termstats
     
     def __repr__(self):
         return "<%s>" % self.__class__.__name__
         else:
             return 1.0
     
-    def block_maxweight(self):
+    def block_min_length(self):
+        return self._stats.min_length()
+    
+    def block_max_length(self):
+        return self._stats.max_length()
+    
+    def block_max_weight(self):
         if self._all_weights:
             return self._all_weights
         elif self._weights:
         else:
             return 1.0
     
-    def block_maxwol(self):
-        return self._maxwol
+    def block_max_wol(self):
+        return self._stats.max_wol()
     
-    def block_maxid(self):
+    def block_max_id(self):
         return max(self._ids)
     
-    def block_minlength(self):
-        return self._minlength
-    
     def score(self):
         if self._scorer:
             return self._scorer.score(self)
     def _replacement(self, newchild):
         return self.__class__(newchild, boost=self.boost)
     
-    def replace(self):
-        r = self.child.replace()
+    def replace(self, minquality=0):
+        # Replace the child matcher
+        r = self.child.replace(minquality)
         if not r.is_active():
+            # If the replaced child is inactive, return an inactive matcher
             return NullMatcher()
-        if r is not self.child:
+        elif r is not self.child:
+            # If the child changed, return a new wrapper on the new child
             try:
+                # Subclasses of WrappingMatcher can override _replacement() to
+                # get the __init__ signature they need
                 return self._replacement(r)
             except TypeError, e:
                 raise TypeError("Class %s got exception %s trying "
         else:
             return self
     
+    def max_quality(self):
+        return self.child.max_quality()
+    
     def id(self):
         return self.child.id()
     
         else:
             return 0
     
-    def replace(self):
+    def replace(self, minquality=0):
+        if minquality:
+            # Skip sub-matchers that don't have a high enough max quality to
+            # contribute
+            while (self.is_active()
+                   and self.matchers[self.current].max_quality() < minquality):
+                self._next_matcher()
+        
         if not self.is_active():
             return NullMatcher()
+        
         # TODO: Possible optimization: if the last matcher is current, replace
         # this with the last matcher, but wrap it with a matcher that adds the
         # offset. Have to check whether that's actually faster, though.
         return self
     
+    def max_quality(self):
+        return self.matchers[self.current].max_quality()
+    
     def id(self):
         current = self.current
         return self.matchers[current].id() + self.offsets[current]
     added together.
     """
     
+    def max_quality(self):
+        q = 0.0
+        if self.a.is_active():
+            q += self.a.max_quality()
+        if self.b.is_active():
+            q += self.b.max_quality()
+        return q
+    
     def quality(self):
         q = 0.0
         if self.a.is_active():
     """Matches the union (OR) of the postings in the two sub-matchers.
     """
     
-    def replace(self):
-        a = self.a.replace()
-        b = self.b.replace()
-        
+    def replace(self, minquality=0):
+        a = self.a
+        b = self.b
         a_active = a.is_active()
         b_active = b.is_active()
+        
+        # If neither sub-matcher on its own has a high enough max quality to
+        # contribute, convert to an intersection matcher
+        if (minquality and a_active and b_active
+            and a.max_quality() < minquality and b.max_quality() < minquality):
+            return IntersectionMatcher(a, b).replace(minquality)
+        
+        # If one or both of the sub-matchers are inactive, convert
         if not (a_active or b_active):
             return NullMatcher()
-        if not a_active:
-            return b
-        if not b_active:
-            return a
+        elif not a_active:
+            return b.replace(minquality)
+        elif not b_active:
+            return a.replace(minquality)
         
+        # Can't pass minquality down here
+        a = a.replace()
+        b = b.replace()
+        # If one of the sub-matchers changed, return a new union
         if a is not self.a or b is not self.b:
             return self.__class__(a, b)
-        return self
+        else:
+            return self
     
     def is_active(self):
         return self.a.is_active() or self.b.is_active()
         return self.__class__(self.a.copy(), self.b.copy(),
                               tiebreak=self.tiebreak)
     
+    def replace(self, minquality=0):
+        a = self.a
+        b = self.b
+        a_active = a.is_active()
+        b_active = b.is_active()
+        
+        # DisMax takes the max of the sub-matcher qualities instead of adding
+        # them, so we need special logic here
+        if minquality and a_active and b_active:
+            a_max = a.max_quality()
+            b_max = b.max_quality()
+            
+            if a_max < minquality and b_max < minquality:
+                # If neither sub-matcher has a high enough max quality to
+                # contribute, return an inactive matcher
+                return NullMatcher()
+            elif b_max < minquality:
+                # If the b matcher can't contribute, return a
+                return a.replace(minquality)
+            elif a_max < minquality:
+                # If the a matcher can't contribute, return b
+                return b.replace(minquality)
+        
+        if not (a_active or b_active):
+            return NullMatcher()
+        elif not a_active:
+            return b.replace(minquality)
+        elif not b_active:
+            return a.replace(minquality)
+        
+        # We CAN pass the minquality down here, since we don't add the two
+        # scores together
+        a = a.replace(minquality)
+        b = b.replace(minquality)
+        a_active = a.is_active()
+        b_active = b.is_active()
+        # It's kind of tedious to check for inactive sub-matchers all over
+        # again here after we replace them, but it's probably better than
+        # returning a replacement with an inactive sub-matcher
+        if not (a_active and b_active):
+            return NullMatcher()
+        elif not a_active:
+            return b
+        elif not b_active:
+            return a
+        elif a is not self.a or b is not self.b:
+            # If one of the sub-matchers changed, return a new DisMax
+            return self.__class__(a, b)
+        else:
+            return self
+    
+    def max_quality(self):
+        return max(self.a.max_quality(), self.b.max_quality())
+    
     def score(self):
         if not self.a.is_active():
             return self.b.score()
     def skip_to_quality(self, minquality):
         a = self.a
         b = self.b
-        minquality = minquality
         
         # Short circuit if one matcher is inactive
         if not a.is_active():
             and self.a.id() != self.b.id()):
             self._find_next()
     
-    def replace(self):
-        a = self.a.replace()
-        b = self.b.replace()
+    def replace(self, minquality=0):
+        a = self.a
+        b = self.b
+        a_active = a.is_active()
+        b_active = b.is_active()
         
-        a_active = a
-        b_active = b.is_active()
         if not (a_active and b_active):
+            # Intersection matcher requires that both sub-matchers be active
+            return NullMatcher()
+        elif minquality and a.max_quality() + b.max_quality() < minquality:
+            # If the combined quality of the sub-matchers can't contribute,
+            # return an inactive matcher
             return NullMatcher()
         
+        # Can't pass the minquality down here
+        a = a.replace()
+        b = b.replace()
         if a is not self.a or b is not self.b:
             return self.__class__(a, b)
-        return self
+        else:
+            return self
     
     def is_active(self):
         return self.a.is_active() and self.b.is_active()
         
         return r
     
-    def replace(self):
+    def replace(self, minquality=0):
         if not self.a.is_active():
+            # The a matcher is required, so if it's inactive, return an
+            # inactive matcher
             return NullMatcher()
-        if not self.b.is_active():
-            return self.a.replace()
-        return self
+        elif (minquality
+              and self.a.max_quality() < minquality):
+            # If the quality of the required matcher isn't high enough to
+            # contribute, return an inactive matcher
+            return NullMatcher()
+        elif not self.b.is_active():
+            # If the prohibited matcher is inactive, convert to just the
+            # required matcher
+            return self.a.replace(minquality)
+        
+        a = self.a.replace(minquality)
+        b = self.b.replace()
+        if a is not self.a or b is not self.b:
+            # If one of the sub-matchers was replaced, return a new AndNot
+            return self.__class__(a, b)
+        else:
+            return self
+    
+    def max_quality(self):
+        return self.a.max_quality()
     
     def quality(self):
         return self.a.quality()
     def copy(self):
         return self.__class__(self.a.copy(), self.b.copy())
     
-    def replace(self):
+    def replace(self, minquality=0):
         if not self.child.is_active():
+            # If one of the sub-matchers is inactive, go inactive
             return NullMatcher()
-        return self
+        elif minquality and self.a.max_quality() < minquality:
+            # If the required matcher doesn't have a high enough max quality
+            # to possibly contribute, return an inactive matcher
+            return NullMatcher()
+        
+        new_a = self.a.replace(minquality)
+        new_b = self.b.replace()
+        if not new_a.is_active():
+            return NullMatcher()
+        elif new_a is not self.a or new_b is not self.b:
+            # If one of the sub-matchers changed, return a new Require
+            return self.__class__(new_a, self.b)
+        else:
+            return self
+    
+    def max_quality(self):
+        return self.a.max_quality()
     
     def quality(self):
         return self.a.quality()
             rb = self.b.skip_to(id)
         return ra or rb
     
-    def replace(self):
-        ar = self.a.replace()
-        br = self.b.replace()
-        if not ar.is_active():
+    def replace(self, minquality=0):
+        a = self.a
+        b = self.b
+        a_active = a.is_active()
+        b_active = b.is_active()
+        
+        if not a_active:
             return NullMatcher()
-        if not br.is_active():
-            return ar
-        if ar is not self.a or br is not self.b:
-            return self.__class__(ar, br)
-        return self
+        elif (minquality and b_active
+              and a.max_quality() + b.max_quality() < minquality):
+            # If the combined max quality of the sub-matchers isn't high
+            # enough to possibly contribute, return an inactive matcher
+            return NullMatcher()
+        elif not b_active:
+            return a.replace(minquality)
+        
+        # Can't pass the minquality down here
+        new_a = a.replace()
+        new_b = b.replace()
+        if new_a is not a or new_b is not b:
+            # If one of the sub-matchers changed, return a new AndMaybe
+            return self.__class__(new_a, new_b)
+        else:
+            return self
     
     def skip_to_quality(self, minquality):
         a = self.a
 #    def copy(self):
 #        return self.__class__(self.wordmatchers[:], slop=self.slop, boost=self.boost)
 #    
-#    def replace(self):
+#    def replace(self, minquality=0):
 #        if not self.is_active():
 #            return NullMatcher()
 #        return self

src/whoosh/reading.py

             r.set_caching_policy(*args, **kwargs)
 
         
+# Term stats class
 
+class TermStats(object):
+    def frequency(self):
+        return self._freq
+    
+    def doc_frequency(self):
+        return self._doc_freq
+    
+    def min_length(self):
+        return self._minlength
+    
+    def max_length(self):
+        return self._maxlength
+    
+    def max_weight(self):
+        return self._maxweight
+    
+    def max_wol(self):
+        return self._maxwol
+    
 
 
 

src/whoosh/scoring.py

         return matcher.weight() / self.dfl(matcher.id())
     
     def block_quality(self, matcher):
-        return matcher.block_maxwol()
+        return matcher.block_max_wol()
 
 
 # WeightScorer
         return matcher.weight()
     
     def block_quality(self, matcher):
-        return matcher.block_maxweight()
+        return matcher.block_max_weight()
 
 
 # WeightingModel implementations

src/whoosh/util.py

 
 
 def varint(i):
-    """Encodes the given integer into a string of the minimum number  of bytes.
+    """Encodes the given integer into a string of the minimum number of bytes.
     """
     if i < len(_varint_cache):
         return _varint_cache[i]