Matt Chaput avatar Matt Chaput committed ce6302d

Finished work on tiered numeric ranges.

Comments (0)

Files changed (4)

src/whoosh/query.py

 
 
 class NumericRange(Query):
+    """A range query for NUMERIC fields. Takes advantage of tiered indexing
+    to speed up large ranges by matching at a high resolution at the edges of
+    the range and a low resolution in the middle.
+    
+    >>> # Match numbers from 10 to 5925 in the "number" field.
+    >>> nr = NumericRange("number", 10, 5925)
+    """
+    
     def __init__(self, fieldname, start, end, startexcl=False, endexcl=False,
                  boost=1.0):
         """
         :param fieldname: The name of the field to search.
-        :param start: Match terms equal to or greater than this.
-        :param end: Match terms equal to or less than this.
+        :param start: Match terms equal to or greater than this number. This
+            should be a number type, not a string.
+        :param end: Match terms equal to or less than this number. This should
+            be a number type, not a string.
         :param startexcl: If True, the range start is exclusive. If False, the
             range start is inclusive.
         :param endexcl: If True, the range end is exclusive. If False, the
         return NumericRange(self.fieldname, self.start, self.end,
                             self.startexcl, self.endexcl, boost=self.boost)
     
-    def _query(self, searcher):
+    def simplify(self, ixreader):
+        return self._or_query(ixreader).simplify(ixreader)
+    
+    def estimate_size(self, ixreader):
+        return self._or_query(ixreader).estimate_size(ixreader)
+    
+    def docs(self, searcher, exclude_docs=None):
+        q = self._or_query(searcher.reader())
+        return q.docs(searcher, exclude_docs=exclude_docs)
+    
+    def _or_query(self, ixreader):
         from whoosh.fields import NUMERIC
-        from whoosh.support.numeric import split_range
+        from whoosh.support.numeric import tiered_ranges
         
-        field = searcher.field(self.fieldname)
+        field = ixreader.field(self.fieldname)
         if not isinstance(field, NUMERIC):
             raise Exception("NumericRange: field %r is not numeric" % self.fieldname)
         
-        if field.type is int:
-            valsize = 32
+        subqueries = []
+        # Get the term ranges for the different resolutions
+        for starttext, endtext in tiered_ranges(field.type, self.start,
+                                                self.end, field.shift_step):
+            if starttext == endtext:
+                subq = Term(self.fieldname, starttext)
+            else:
+                subq = TermRange(self.fieldname, starttext, endtext)
+            subqueries.append(subq)
+        
+        if len(subqueries) == 1:
+            return subqueries[0] 
+        elif subqueries:
+            return Or(subqueries, boost=self.boost)
         else:
-            valsize = 64
+            return NullQuery
+        
+    def matcher(self, searcher, exclude_docs=None):
+        q = self._or_query(searcher.reader())
+        return q.matcher(searcher, exclude_docs=exclude_docs)
         
 
 class Variations(MultiTerm):

src/whoosh/support/numeric.py

 # The functions for 7 bit encoding are still available (to_7bit and from_7bit)
 # if needed.
 
-def int_to_text(x, shift=0):
-    x += (1 << 16) - 1
-    if shift:
-        x >>= shift
-    return chr(shift) + u"%08x" % x
-
-def text_to_int(text):
-    if len(text) == 9:
-        text = text[1:]
-    x = int(text, 16)
-    x -= (1 << 16) - 1
-    return x
-
-def long_to_text(x, shift=0):
-    x += (1 << 32) - 1
-    if shift:
-        x >>= shift
-    return chr(shift) + u"%016x" % x
-
-def text_to_long(text):
-    if len(text) == 17:
-        text = text[1:]
-    x = long(text, 16)
-    x -= (1 << 32) - 1
-    return x
-
 _dstruct = struct.Struct("<d")
 _qstruct = struct.Struct("<q")
 _dpack, _dunpack = _dstruct.pack, _dstruct.unpack
 _qpack, _qunpack = _qstruct.pack, _qstruct.unpack
-def float_to_long(x, shift=0):
+
+# Functions for converting numbers to and from sortable representations
+
+def int_to_sortable_int(x):
+    x += 1 << 31
+    assert x >= 0
+    return x
+def sortable_int_to_int(x):
+    x -= 1 << 31
+    return x
+def long_to_sortable_long(x):
+    x += 1 << 63
+    assert x >= 0
+    return x
+def sortable_long_to_long(x):
+    x -= 1 << 63
+    return x
+def float_to_sortable_long(x):
     x = _qunpack(_dpack(x))[0]
-    x += (1 << (8 << 2)) - 1
-    if shift:
-        x >>= shift
+    if x<0:
+        x ^= 0x7fffffffffffffff
+    x += 1 << 63
+    assert x >= 0
     return x
-    
-def long_to_float(x):
-    x -= (1 << (8 << 2)) - 1
-    return _dunpack(_qpack(x))[0]
-
-def float_to_text(x, shift=0):
-    x = _qunpack(_dpack(x))[0]
-    x += (1 << (8 << 2)) - 1
-    if shift:
-        x >>= shift
-    return u"%016x" % x
-
-def text_to_float(text):
-    x = long(text, 16)
-    x -= (1 << (8 << 2)) - 1
+def sortable_long_to_float(x):
+    x -= 1 << 63
+    if x < 0:
+        x ^= 0x7fffffffffffffff
     x = _dunpack(_qpack(x))[0]
     return x
 
+# Functions for converting numbers to and from text
+
+def int_to_text(x, shift=0):
+    x = int_to_sortable_int(x)
+    return sortable_int_to_text(x, shift)
+
+def text_to_int(text):
+    x = text_to_sortable_int(text)
+    x = sortable_int_to_int(x)
+    return x
+
+def long_to_text(x, shift=0):
+    x = long_to_sortable_long(x)
+    return sortable_long_to_text(x, shift)
+
+def text_to_long(text):
+    x = text_to_sortable_long(text)
+    x = sortable_long_to_long(x)
+    return x
+
+def float_to_text(x, shift=0):
+    x = float_to_sortable_long(x)
+    return sortable_long_to_text(x, shift)
+
+def text_to_float(text):
+    x = text_to_sortable_long(text)
+    x = sortable_long_to_float(x)
+    return x
+
+# Functions for converting sortable representations to and from text
+
+def sortable_int_to_text(x, shift=0):
+    if shift:
+        x >>= shift
+    return chr(shift) + u"%08x" % x #struct.pack(">I", x)#
+def sortable_long_to_text(x, shift=0):
+    if shift:
+        x >>= shift
+    return chr(shift) + u"%016x" % x #struct.pack(">Q", x)#
+def text_to_sortable_int(text):
+    assert len(text) == 9
+    #return struct.unpack(">I", text[1:])[0]
+    return int(text[1:], 16)
+def text_to_sortable_long(text):
+    assert len(text) == 17
+    #return struct.unpack(">Q", text[1:])[0]
+    return long(text[1:], 16)
+
+
+# Functions for generating tiered ranges
+
+def tiered_ranges(numtype, start, end, shift_step):
+    # First, convert the start and end of the range to sortable representations
+    if numtype is int:
+        valsize = 32
+        start = int_to_sortable_int(start)
+        end = int_to_sortable_int(end)
+        to_text = sortable_int_to_text
+    else:
+        valsize = 64
+        if numtype is long:
+            start = long_to_sortable_long(start)
+            end = long_to_sortable_long(end)
+        elif numtype is float:
+            # Convert floats to longs
+            start = float_to_sortable_long(start)
+            end = float_to_sortable_long(end)
+        to_text = sortable_long_to_text
+    
+    # Yield the term ranges for the different resolutions
+    for rstart, rend, shift in split_range(valsize, shift_step, start, end):
+        starttext = to_text(rstart, shift=shift)
+        endtext = to_text(rend, shift=shift)
+        
+        yield (starttext, endtext)
 
 # Functions for encoding numeric values as sequences of 7-bit ascii characters
 

tests/test_fields.py

         floatf = fields.NUMERIC(float, shift_step=0)
         
         def roundtrip(obj, num):
-            self.assertEqual(obj.from_text(obj.to_text(num)), num)
+            self.assertAlmostEqual(obj.from_text(obj.to_text(num)), num, 4)
         
         roundtrip(intf, 0)
         roundtrip(intf, 12345)
         roundtrip(floatf, -582.592)
         roundtrip(floatf, -99.42)
         
+    def test_numeric_sort(self):
+        intf = fields.NUMERIC(int, shift_step=0)
+        longf = fields.NUMERIC(long, shift_step=0)
+        floatf = fields.NUMERIC(float, shift_step=0)
+        
         from random import shuffle
         def roundtrip_sort(obj, start, end, step):
             count = start
             shuffle(scrabled)
             round = [obj.from_text(t) for t
                      in sorted([obj.to_text(n) for n in scrabled])]
-            self.assertEqual(round, rng)
+            for n1, n2 in zip(round, rng):
+                self.assertAlmostEqual(n1, n2, 2, "n1=%r n2=%r type=%s" % (n1, n2, obj.type))
         
         roundtrip_sort(intf, -100, 100, 1)
         roundtrip_sort(longf, -58902, 58249, 43)
         self.assertEqual(r[0]["id"], "b")
     
     def test_numeric_range(self):
-        schema = fields.Schema(id=fields.ID(stored=True), number=fields.NUMERIC)
-        ix = RamStorage().create_index(schema)
+        from whoosh.util import now
+        def test_type(t, start, end, step, teststart, testend):
+            fld = fields.NUMERIC(t)
+            schema = fields.Schema(id=fields.STORED, number=fld)
+            ix = RamStorage().create_index(schema)
+            
+            w = ix.writer()
+            n = start
+            while n <= end:
+                w.add_document(id=n, number=n)
+                n += step
+            w.commit()
+            
+            qp = qparser.QueryParser("number", schema=schema)
+            
+            q = qp.parse("[%s to *]" % teststart)
+            self.assertEqual(q, query.NullQuery)
+            
+            q = qp.parse("[%s to]" % teststart)
+            self.assertEqual(q.__class__, query.NumericRange)
+            self.assertEqual(q.start, teststart)
+            self.assertEqual(q.end, None)
+            
+            q = qp.parse("[to %s]" % testend)
+            self.assertEqual(q.__class__, query.NumericRange)
+            self.assertEqual(q.start, None)
+            self.assertEqual(q.end, testend)
+            
+            q = qp.parse("[%s to %s]" % (teststart, testend))
+            self.assertEqual(q.__class__, query.NumericRange)
+            self.assertEqual(q.start, teststart)
+            self.assertEqual(q.end, testend)
+            
+            s = ix.searcher()
+            self.assertEqual(q._or_query(s.reader()).__class__, query.Or)
+            rng = []
+            count = teststart
+            while count <= testend:
+                rng.append(count)
+                count += step
+            
+            found = [s.stored_fields(d)["id"] for d in q.docs(s)]
+            self.assertEqual(found, rng)
         
-        w = ix.writer()
-        for n in xrange(-500, 500, 3):
-            w.add_document(id=unicode(n), number=n)
-        w.commit()
-        
-        qp = qparser.QueryParser("number", schema=schema)
-        
-        q = qp.parse("[10 to *]")
-        self.assertEqual(q, query.NullQuery)
-        
-        q = qp.parse("[to 400]")
-        self.assertEqual(q.__class__, query.NumericRange)
-        self.assertEqual(q.start, None)
-        self.assertEqual(q.end, 400)
-        
-        q = qp.parse("[10 to]")
-        self.assertEqual(q.__class__, query.NumericRange)
-        self.assertEqual(q.start, 10)
-        self.assertEqual(q.end, None)
-        
-        q = qp.parse("[10 to 400]")
-        self.assertEqual(q.__class__, query.NumericRange)
-        self.assertEqual(q.start, 10)
-        self.assertEqual(q.end, 400)
-        
-        
+        test_type(float, -50.0, 50.0, 0.5, -45.5, 39.0)
+        test_type(int, -5, 500, 1, 10, 400)
+        test_type(int, -500, 500, 5, -350, 280)
+        test_type(long, -1000, 1000, 5, -900, 90)
+    
     def test_datetime(self):
         schema = fields.Schema(id=fields.ID(stored=True),
                                date=fields.DATETIME(stored=True))

tests/test_parsing.py

         self.assertEqual(q.__class__, query.TermRange)
         self.assertEqual(q.start, "d")
         self.assertEqual(q.fieldname, "name")
+    
+    def test_empty_numeric_range(self):
+        schema = fields.Schema(name=fields.TEXT, num=fields.NUMERIC,
+                               date=fields.DATETIME)
+        qp = qparser.QueryParser("text", schema=schema)
+        
+        for fname in ("num", "name", "date"):
+            q = qp.parse("%s:[to]" % fname)
+            self.assertEqual(q.__class__, query.TermRange)
+            self.assertEqual(q.start, '')
+            self.assertEqual(q.end, u'\uffff')
         
     def test_stopped(self):
         schema = fields.Schema(text = fields.TEXT)
         q = parser.parse(u"a OR ((b AND c AND d AND e) OR f OR g) ANDNOT h")
         self.assertEqual(unicode(q), u"(a:a OR ((a:b AND a:c AND a:d AND a:e) OR a:f OR a:g) ANDNOT a:h)")
         
+    def test_math(self):
+        tk = analysis.RegexTokenizer(r"([A-Za-z_\\]+)|(-?[0-9.]+)|([^A-Za-z_0-9. \t\r\n]+)")
+        ana = tk | analysis.LowercaseFilter()
+        print [t.text for t in ana(u"x2+6x+7")]
+        
+        schema = fields.Schema(a=fields.TEXT(analyzer=ana))
+        parser = qparser.QueryParser("a", schema=schema)
+        print parser.parse(u"+")
+        print parser.parse(u"5+4")
+
+        
 
 
 if __name__ == '__main__':
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.