Commits

Matt Chaput committed 507ad38

Created whoosh.support.numeric to prepare for tiered numeric indexing. Minor fixes.

Files changed (8)

 from whoosh import analysis, fields, index, qparser, query
 from whoosh.util import now
 
-ana = analysis.StemmingAnalyzer()
+#ana = analysis.StemmingAnalyzer()
+ana = analysis.StandardAnalyzer()
 schema = fields.Schema(id=fields.ID(stored=True),
                        headline=fields.STORED,
                        text=fields.TEXT(analyzer=ana, stored=True))
src/whoosh/fields.py

                              StandardAnalyzer, NgramAnalyzer, Tokenizer,
                              NgramWordAnalyzer, Analyzer)
 from whoosh.formats import Format, Existence, Frequency, Positions
+from whoosh.support.numeric import (int_to_text, text_to_int, long_to_text,
+                                    text_to_long, float_to_text, text_to_float)
 
 
 # Exceptions
     >>> w.commit()
     """
     
-    def __init__(self, type=int, stored=False, unique=False, field_boost=1.0):
+    def __init__(self, type=int, stored=False, unique=False, field_boost=1.0,
+                 small=True):
         """
         :param type: the type of numbers that can be stored in this field: one
             of ``int``, ``long``, or ``float``.
         """
         
         self.type = type
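+        # Bind the conversion functions once at construction time instead of
+        # looking them up by name with getattr() on every call, as index()
+        # and to_text() did before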
+        if self.type is int:
+            self._to_text = int_to_text
+            self._from_text = text_to_int
+        elif self.type is long:
+            self._to_text = long_to_text
+            self._from_text = text_to_long
+        elif self.type is float:
+            self._to_text = float_to_text
+            self._from_text = text_to_float
+        else:
+            raise TypeError("type must be int, long, or float, not %r"
+                            % self.type)
+        
         self.stored = stored
         self.unique = unique
+        self.small = small
         self.format = Existence(analyzer=IDAnalyzer(), field_boost=field_boost)
     
     def index(self, num):
-        method = getattr(self, self.type.__name__ + "_to_text")
+        _to_text = self._to_text
         # word, freq, weight, valuestring
-        return [(method(num), 1, 1.0, '')]
+        return [(_to_text(num), 1, 1.0, '')]
     
     def to_text(self, x):
-        ntype = self.type
-        method = getattr(self, ntype.__name__ + "_to_text")
-        return method(ntype(x))
+        return self._to_text(self.type(x))
     
     def process_text(self, text, **kwargs):
         return (self.to_text(text),)
         from whoosh import query
         return query.Term(fieldname, self.to_text(qstring), boost=boost)
     
-    @staticmethod
-    def int_to_text(x):
-        x += (1 << (4 << 2)) - 1 # 4 means 32-bits
-        return u"%08x" % x
-    
-    @staticmethod
-    def text_to_int(text):
-        x = int(text, 16)
-        x -= (1 << (4 << 2)) - 1
-        return x
-    
-    @staticmethod
-    def long_to_text(x):
-        x += (1 << (8 << 2)) - 1
-        return u"%016x" % x
-    
-    @staticmethod
-    def text_to_long(text):
-        x = long(text, 16)
-        x -= (1 << (8 << 2)) - 1
-        return x
-    
-    @staticmethod
-    def float_to_text(x):
-        x = struct.unpack("<q", struct.pack("<d", x))[0]
-        x += (1 << (8 << 2)) - 1
-        return u"%016x" % x
-    
-    @staticmethod
-    def text_to_float(text):
-        x = long(text, 16)
-        x -= (1 << (8 << 2)) - 1
-        x = struct.unpack("<d", struct.pack("<q", x))[0]
-        return x
-    
 
 class DATETIME(FieldType):
     """Special field type that lets you index datetime objects. The field

src/whoosh/matching.py

         """Moves this matcher to the next posting.
         """
         
-        raise NotImplementedError
+        raise NotImplementedError(self.__class__.__name__)
     
     def weight(self):
         """Returns the weight of the current posting.
     def matcher(self, searcher, exclude_docs=None):
         fieldname = self.fieldname
         qs = [Term(fieldname, word) for word in self._words(searcher.reader())]
-        if qs:
-            return Or(qs).matcher(searcher, exclude_docs=exclude_docs)
+        if not qs:
+            return NullMatcher()
+        
+        if len(qs) == 1:
+            q = qs[0]
         else:
-            return NullMatcher()
-
+            q = Or(qs)
+        return q.matcher(searcher, exclude_docs=exclude_docs)
+        
 
 # Concrete classes
 
         fieldname = self.fieldname
         s = set()
         
+        # This is a hacky hack, but just create an in-memory set of all the
+        # document numbers of every term in the field
         for text in searcher.lexicon(fieldname):
             pr = searcher.postings(fieldname, text)
-            s = s.union(pr.all_ids())
-        
+            s.update(pr.all_ids())
         if exclude_docs:
-            s.difference(exclude_docs)
+            s.difference_update(exclude_docs)
         
         return ListMatcher(sorted(s), weight=self.boost)
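
The practical effect of the matcher rewrite above, sketched with a hypothetical prefix query (Prefix is an existing MultiTerm subclass, and searcher stands for any open searcher):

    from whoosh import query

    q = query.Prefix("content", u"brav")
    # If the reader holds exactly one word starting with "brav", this now
    # returns that Term's matcher directly rather than a one-clause Or;
    # if no words match, it returns a NullMatcher.
    m = q.matcher(searcher, exclude_docs=None)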
 

src/whoosh/support/numeric.py

+#===============================================================================
+# Copyright 2010 Matt Chaput
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+import struct
+from array import array
+
+
+def split_range(valsize, step, minbound, maxbound):
+    """Splits a range of numbers (from ``minbound`` to ``maxbound``, inclusive)
+    into a sequence of trie ranges of the form ``(start, end, shift)``.
+    The consumer of these tuples is expected to shift the ``start`` and ``end``
+    right by ``shift``.
+    
+    This is used for generating term ranges for a numeric field. The queries
+    for the edges of the range are generated at high precision and large blocks
+    in the middle are generated at low precision.
+    """
+    
+    shift = 0
+    while True:
+        diff = 1 << (shift + step)
+        mask = ((1 << step) - 1) << shift
+        
+        haslower = (minbound & mask) != 0
+        hasupper = (maxbound & mask) != mask
+        
+        not_mask = ~mask & ((1 << valsize+1) - 1)
+        nextmin = (minbound + diff if haslower else minbound) & not_mask
+        nextmax = (maxbound - diff if hasupper else maxbound) & not_mask
+        
+        if shift + step >= valsize or nextmin > nextmax:
+            yield (minbound, maxbound | ((1 << shift) - 1), shift)
+            break
+        
+        if haslower:
+            yield (minbound, (minbound | mask) | ((1 << shift) - 1), shift)
+        if hasupper:
+            yield (maxbound & not_mask, maxbound | ((1 << shift) - 1), shift)
+        
+        minbound = nextmin
+        maxbound = nextmax
+        shift += step
+
+
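+# Placeholder for the tiered numeric indexing mentioned in the commit
+# message; not implemented yet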
+def index_numbers(nums, ntype, step):
+    pass
+
+# These functions use hexadecimal strings to encode the numbers, rather than
+# converting them to text using a 7-bit encoding, because while the hex
+# representation uses more space (8 bytes as opposed to 5 bytes for a 32 bit
+# number), it's 5 times faster to encode/decode.
+#
+# The functions for 7 bit encoding are still available (to_7bit and from_7bit)
+# if needed.
+
+def int_to_text(x):
+    x += (1 << (4 << 2)) - 1 # 4 means 32-bits
+    return u"%08x" % x
+
+def text_to_int(text):
+    x = int(text, 16)
+    x -= (1 << (4 << 2)) - 1
+    return x
+
+def long_to_text(x):
+    x += (1 << (8 << 2)) - 1
+    return u"%016x" % x
+
+def text_to_long(text):
+    x = long(text, 16)
+    x -= (1 << (8 << 2)) - 1
+    return x
+
+def float_to_text(x):
+    x = struct.unpack("<q", struct.pack("<d", x))[0]
+    x += (1 << (8 << 2)) - 1
+    return u"%016x" % x
+
+def text_to_float(text):
+    x = long(text, 16)
+    x -= (1 << (8 << 2)) - 1
+    x = struct.unpack("<d", struct.pack("<q", x))[0]
+    return x
+
+
+# Functions for encoding numeric values as sequences of 7-bit ascii characters
+
+def to_7bit(x, islong):
+    if not islong:
+        shift = 31
+        nchars = 5
+    else:
+        shift = 62
+        nchars = 10
+
+    buffer = array("c", "\x00" * nchars)
+    x += (1 << shift) - 1
+    while x:
+        buffer[nchars - 1] = chr(x & 0x7f)
+        x >>= 7
+        nchars -= 1
+    return buffer.tostring()
+
+def from_7bit(text):
+    if len(text) == 5:
+        shift = 31
+    elif len(text) == 10:
+        shift = 62
+    else:
+        raise ValueError("text is not 5 or 10 bytes")
+
+    x = 0
+    for char in text:
+        x <<= 7
+        char = ord(char)
+        if char > 0x7f:
+            raise ValueError("byte %d is not a valid 7-bit character" % char)
+        x |= char
+    x -= (1 << shift) - 1
+    return int(x)
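
A sketch of how these pieces fit together (illustrative bounds, not part of the commit). split_range yields (start, end, shift) tuples, and the consumer shifts the bounds right by shift to get the reduced-precision terms for each block; the 7-bit codec simply round-trips:

    from whoosh.support.numeric import split_range, to_7bit, from_7bit

    for start, end, shift in split_range(32, 4, 10, 1000):
        # this block covers the values start..end, indexed at a precision of
        # (32 - shift) bits as the terms start >> shift through end >> shift
        print shift, start >> shift, end >> shift

    print from_7bit(to_7bit(42, False))  # 42, round-tripped through 5 ascii bytes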

src/whoosh/util.py

         last = decoded
 
 
-def to_7bit(x, islong):
-    if not islong:
-        shift = 31
-        nchars = 5
-    else:
-        shift = 62
-        nchars = 10
-
-    buffer = array("c", "\x00" * nchars)
-    x += (1 << shift) - 1
-    while x:
-        buffer[nchars - 1] = chr(x & 0x7f)
-        x >>= 7
-        nchars -= 1
-        return buffer.tostring()
-
-def from_7bit(text):
-    if len(text) == 5:
-        shift = 31
-    elif len(text) == 10:
-        shift = 62
-    else:
-        raise ValueError("text is not 5 or 10 bytes")
-
-    x = 0
-    for char in text:
-        x <<= 7
-        char = ord(char)
-        if char > 0x7f:
-            raise Exception
-        x |= char
-    x -= (1 << shift) - 1
-    return x
-
-
 _nkre = re.compile(r"\D+|\d+", re.UNICODE)
 def _nkconv(i):
     try:

tests/test_results.py

         self.assertEqual(r.pagecount, 1)
         self.assertEqual(r.pagenum, 1)
     
+    def test_resultspage(self):
+        schema = fields.Schema(id=fields.STORED, content=fields.TEXT)
+        st = RamStorage()
+        ix = st.create_index(schema)
+        
+        domain = ("alfa", "bravo", "bravo", "charlie", "delta")
+        w = ix.writer()
+        i = 0
+        for lst in permutations(domain, 3):
+            w.add_document(id=unicode(i), content=u" ".join(lst))
+            i += 1
+        w.commit()
+        
+        s = ix.searcher()
+        q = query.Term("content", u"bravo")
+        r = s.search(q, limit=10)
+        tops = list(r)
+        
+        rp = s.search_page(q, 1, pagelen=5)
+        self.assertEqual(list(rp), tops[0:5])
+        
+        rp = s.search_page(q, 2, pagelen=5)
+        self.assertEqual(list(rp), tops[5:10])
+        
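+        # permutations(domain, 3) yields P(5, 3) = 60 documents; only the 6
+        # orderings of the three non-"bravo" words lack "bravo", so the query
+        # matches 54 documents and ceil(54 / 10) = 6 pages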
+        rp = s.search_page(q, 1, pagelen=10)
+        self.assertEqual(len(rp), 54)
+        self.assertEqual(rp.pagecount, 6)
+        rp = s.search_page(q, 6, pagelen=10)
+        self.assertEqual(len(list(rp)), 4)
+        self.assertTrue(rp.is_last_page())
+        
+        self.assertRaises(ValueError, s.search_page, q, 0)
+        self.assertRaises(ValueError, s.search_page, q, 7)
+        
+        rp = s.search_page(query.Term("content", u"glonk"), 1)
+        self.assertEqual(len(rp), 0)
+        self.assertTrue(rp.is_last_page())
+    
     def test_keyterms(self):
         ana = analysis.StandardAnalyzer()
         vectorformat = formats.Frequency(ana)

tests/test_searching.py

         s = s.refresh()
         self.assertTrue(s.up_to_date())
 
-    def test_resultspage(self):
-        schema = fields.Schema(id=fields.STORED, content=fields.TEXT)
-        st = RamStorage()
-        ix = st.create_index(schema)
+    def test_find_missing(self):
+        schema = fields.Schema(id=fields.ID, text=fields.KEYWORD(stored=True))
+        ix = RamStorage().create_index(schema)
         
-        domain = ("alfa", "bravo", "bravo", "charlie", "delta")
         w = ix.writer()
-        i = 0
-        for lst in permutations(domain, 3):
-            w.add_document(id=unicode(i), content=u" ".join(lst))
-            i += 1
+        w.add_document(id=u"1", text=u"alfa")
+        w.add_document(id=u"2", text=u"bravo")
+        w.add_document(text=u"charlie")
+        w.add_document(id=u"4", text=u"delta")
+        w.add_document(text=u"echo")
+        w.add_document(id=u"6", text=u"foxtrot")
+        w.add_document(text=u"golf")
         w.commit()
         
         s = ix.searcher()
-        q = query.Term("content", u"bravo")
-        r = s.search(q, limit=10)
-        tops = list(r)
-        
-        rp = s.search_page(q, 1, pagelen=5)
-        self.assertEqual(list(rp), tops[0:5])
-        
-        rp = s.search_page(q, 2, pagelen=5)
-        self.assertEqual(list(rp), tops[5:10])
-        
-        rp = s.search_page(q, 1, pagelen=10)
-        self.assertEqual(len(rp), 54)
-        self.assertEqual(rp.pagecount, 6)
-        rp = s.search_page(q, 6, pagelen=10)
-        self.assertEqual(len(list(rp)), 4)
-        self.assertTrue(rp.is_last_page())
-        
-        self.assertRaises(ValueError, s.search_page, q, 0)
-        self.assertRaises(ValueError, s.search_page, q, 7)
-        
-        rp = s.search_page(query.Term("content", "glonk"), 1)
-        self.assertEqual(len(rp), 0)
-        self.assertTrue(rp.is_last_page())
-        
+        qp = qparser.QueryParser("text", schema=schema)
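+        # "id:*" expands to every document that has a term in the id field,
+        # so negating it finds the documents indexed without an id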
+        q = qp.parse(u"NOT id:*")
+        r = s.search(q, limit=None)
+        self.assertEqual(list(h["text"] for h in r), ["charlie", "echo", "golf"])
 
 
 if __name__ == '__main__':