Matt Chaput committed 507ad38

Created whoosh.support.numeric to prepare for tiered numeric indexing. Minor fixes.

Comments (0)

Files changed (8)

benchmark/reuters.py

 from whoosh import analysis, fields, index, qparser, query
 from whoosh.util import now
 
-ana = analysis.StemmingAnalyzer()
+#ana = analysis.StemmingAnalyzer()
+ana = analysis.StandardAnalyzer()
 schema = fields.Schema(id=fields.ID(stored=True),
                        headline=fields.STORED,
                        text=fields.TEXT(analyzer=ana, stored=True))

src/whoosh/fields.py

                              StandardAnalyzer, NgramAnalyzer, Tokenizer,
                              NgramWordAnalyzer, Analyzer)
 from whoosh.formats import Format, Existence, Frequency, Positions
+from whoosh.support.numeric import (int_to_text, text_to_int, long_to_text,
+                                    text_to_long, float_to_text, text_to_float)
 
 
 # Exceptions
     >>> w.commit()
     """
     
-    def __init__(self, type=int, stored=False, unique=False, field_boost=1.0):
+    def __init__(self, type=int, stored=False, unique=False, field_boost=1.0,
+                 small=True):
         """
         :param type: the type of numbers that can be stored in this field: one
             of ``int``, ``long``, or ``float``.
         """
         
         self.type = type
+        if self.type is int:
+            self._to_text = int_to_text
+            self._from_text = text_to_int
+        elif self.type is long:
+            self._to_text = long_to_text
+            self._from_text = text_to_long
+        elif self.type is float:
+            self._to_text =  float_to_text
+            self._from_text = text_to_float
+        
         self.stored = stored
         self.unique = unique
+        self.small = small
         self.format = Existence(analyzer=IDAnalyzer(), field_boost=field_boost)
     
     def index(self, num):
-        method = getattr(self, self.type.__name__ + "_to_text")
+        _to_text = self._to_text
         # word, freq, weight, valuestring
-        return [(method(num), 1, 1.0, '')]
+        return [(_to_text(num), 1, 1.0, '')]
     
     def to_text(self, x):
-        ntype = self.type
-        method = getattr(self, ntype.__name__ + "_to_text")
-        return method(ntype(x))
+        return self._to_text(self.type(x))
     
     def process_text(self, text, **kwargs):
         return (self.to_text(text),)
         from whoosh import query
         return query.Term(fieldname, self.to_text(qstring), boost=boost)
     
-    @staticmethod
-    def int_to_text(x):
-        x += (1 << (4 << 2)) - 1 # 4 means 32-bits
-        return u"%08x" % x
-    
-    @staticmethod
-    def text_to_int(text):
-        x = int(text, 16)
-        x -= (1 << (4 << 2)) - 1
-        return x
-    
-    @staticmethod
-    def long_to_text(x):
-        x += (1 << (8 << 2)) - 1
-        return u"%016x" % x
-    
-    @staticmethod
-    def text_to_long(text):
-        x = long(text, 16)
-        x -= (1 << (8 << 2)) - 1
-        return x
-    
-    @staticmethod
-    def float_to_text(x):
-        x = struct.unpack("<q", struct.pack("<d", x))[0]
-        x += (1 << (8 << 2)) - 1
-        return u"%016x" % x
-    
-    @staticmethod
-    def text_to_float(text):
-        x = long(text, 16)
-        x -= (1 << (8 << 2)) - 1
-        x = struct.unpack("<d", struct.pack("<q", x))[0]
-        return x
-    
 
 class DATETIME(FieldType):
     """Special field type that lets you index datetime objects. The field

src/whoosh/matching.py

         """Moves this matcher to the next posting.
         """
         
-        raise NotImplementedError
+        raise NotImplementedError(self.__class__.__name__)
     
     def weight(self):
         """Returns the weight of the current posting.

src/whoosh/query.py

     def matcher(self, searcher, exclude_docs=None):
         fieldname = self.fieldname
         qs = [Term(fieldname, word) for word in self._words(searcher.reader())]
-        if qs:
-            return Or(qs).matcher(searcher, exclude_docs=exclude_docs)
+        if not qs: return NullMatcher()
+        
+        if len(qs) == 1:
+            q = qs[0]
         else:
-            return NullMatcher()
-
+            q = Or(qs)
+        return q.matcher(searcher, exclude_docs=exclude_docs)
+        
 
 # Concrete classes
 
         fieldname = self.fieldname
         s = set()
         
+        # This is a hacky hack, but just create an in-memory set of all the
+        # document numbers of every term in the field
         for text in searcher.lexicon(fieldname):
             pr = searcher.postings(fieldname, text)
-            s = s.union(pr.all_ids())
-        
+            s.update(pr.all_ids())
         if exclude_docs:
-            s.difference(exclude_docs)
+            s.difference_update(exclude_docs)
         
         return ListMatcher(sorted(s), weight=self.boost)
 

src/whoosh/support/numeric.py

+#===============================================================================
+# Copyright 2010 Matt Chaput
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+import struct
+from array import array
+
+
+def split_range(valsize, step, minbound, maxbound):
+    """Splits a range of numbers (from ``minbound`` to ``maxbound``, inclusive)
+    into a sequence of trie ranges of the form ``(start, end, shift)``.
+    The consumer of these tuples is expected to shift the ``start`` and ``end``
+    right by ``shift``.
+    
+    This is used for generating term ranges for a numeric field. The queries
+    for the edges of the range are generated at high precision and large blocks
+    in the middle are generated at low precision.
+    """
+    
+    shift = 0
+    while True:
+        diff = 1 << (shift + step)
+        mask = ((1 << step) - 1) << shift
+        
+        haslower = (minbound & mask) != 0
+        hasupper = (maxbound & mask) != mask
+        
+        not_mask = ~mask & ((1 << valsize+1) - 1)
+        nextmin = (minbound + diff if haslower else minbound) & not_mask
+        nextmax = (maxbound - diff if hasupper else maxbound) & not_mask
+        
+        if shift + step >= valsize or nextmin > nextmax:
+            yield (minbound, maxbound | ((1 << shift) - 1), shift)
+            break
+        
+        if haslower:
+            yield (minbound, (minbound | mask) | ((1 << shift) - 1), shift)
+        if hasupper:
+            yield (maxbound & not_mask, maxbound | ((1 << shift) - 1), shift)
+        
+        minbound = nextmin
+        maxbound = nextmax
+        shift += step
+
+
+def index_numbers(nums, ntype, step):
+    pass
+
+# These functions use hexadecimal strings to encode the numbers, rather than
+# converting them to text using a 7-bit encoding, because while the hex
+# representation uses more space (8 bytes as opposed to 5 bytes for a 32 bit
+# number), it's 5 times faster to encode/decode.
+#
+# The functions for 7 bit encoding are still available (to_7bit and from_7bit)
+# if needed.
+
+def int_to_text(x):
+    x += (1 << (4 << 2)) - 1 # 4 means 32-bits
+    return u"%08x" % x
+
+def text_to_int(text):
+    x = int(text, 16)
+    x -= (1 << (4 << 2)) - 1
+    return x
+
+def long_to_text(x):
+    x += (1 << (8 << 2)) - 1
+    return u"%016x" % x
+
+def text_to_long(text):
+    x = long(text, 16)
+    x -= (1 << (8 << 2)) - 1
+    return x
+
+def float_to_text(x):
+    x = struct.unpack("<q", struct.pack("<d", x))[0]
+    x += (1 << (8 << 2)) - 1
+    return u"%016x" % x
+
+def text_to_float(text):
+    x = long(text, 16)
+    x -= (1 << (8 << 2)) - 1
+    x = struct.unpack("<d", struct.pack("<q", x))[0]
+    return x
+
+
+# Functions for encoding numeric values as sequences of 7-bit ascii characters
+
+def to_7bit(x, islong):
+    if not islong:
+        shift = 31
+        nchars = 5
+    else:
+        shift = 62
+        nchars = 10
+
+    buffer = array("c", "\x00" * nchars)
+    x += (1 << shift) - 1
+    while x:
+        buffer[nchars - 1] = chr(x & 0x7f)
+        x >>= 7
+        nchars -= 1
+    return buffer.tostring()
+
+def from_7bit(text):
+    if len(text) == 5:
+        shift = 31
+    elif len(text) == 10:
+        shift = 62
+    else:
+        raise ValueError("text is not 5 or 10 bytes")
+
+    x = 0
+    for char in text:
+        x <<= 7
+        char = ord(char)
+        if char > 0x7f:
+            raise Exception
+        x |= char
+    x -= (1 << shift) - 1
+    return int(x)

src/whoosh/util.py

         last = decoded
 
 
-def to_7bit(x, islong):
-    if not islong:
-        shift = 31
-        nchars = 5
-    else:
-        shift = 62
-        nchars = 10
-
-    buffer = array("c", "\x00" * nchars)
-    x += (1 << shift) - 1
-    while x:
-        buffer[nchars - 1] = chr(x & 0x7f)
-        x >>= 7
-        nchars -= 1
-        return buffer.tostring()
-
-def from_7bit(text):
-    if len(text) == 5:
-        shift = 31
-    elif len(text) == 10:
-        shift = 62
-    else:
-        raise ValueError("text is not 5 or 10 bytes")
-
-    x = 0
-    for char in text:
-        x <<= 7
-        char = ord(char)
-        if char > 0x7f:
-            raise Exception
-        x |= char
-    x -= (1 << shift) - 1
-    return x
-
-
 _nkre = re.compile(r"\D+|\d+", re.UNICODE)
 def _nkconv(i):
     try:

tests/test_results.py

         self.assertEqual(r.pagecount, 1)
         self.assertEqual(r.pagenum, 1)
     
+    def test_resultspage(self):
+        schema = fields.Schema(id=fields.STORED, content=fields.TEXT)
+        st = RamStorage()
+        ix = st.create_index(schema)
+        
+        domain = ("alfa", "bravo", "bravo", "charlie", "delta")
+        w = ix.writer()
+        i = 0
+        for lst in permutations(domain, 3):
+            w.add_document(id=unicode(i), content=u" ".join(lst))
+            i += 1
+        w.commit()
+        
+        s = ix.searcher()
+        q = query.Term("content", u"bravo")
+        r = s.search(q, limit=10)
+        tops = list(r)
+        
+        rp = s.search_page(q, 1, pagelen=5)
+        self.assertEqual(list(rp), tops[0:5])
+        
+        rp = s.search_page(q, 2, pagelen=5)
+        self.assertEqual(list(rp), tops[5:10])
+        
+        rp = s.search_page(q, 1, pagelen=10)
+        self.assertEqual(len(rp), 54)
+        self.assertEqual(rp.pagecount, 6)
+        rp = s.search_page(q, 6, pagelen=10)
+        self.assertEqual(len(list(rp)), 4)
+        self.assertTrue(rp.is_last_page())
+        
+        self.assertRaises(ValueError, s.search_page, q, 0)
+        self.assertRaises(ValueError, s.search_page, q, 7)
+        
+        rp = s.search_page(query.Term("content", "glonk"), 1)
+        self.assertEqual(len(rp), 0)
+        self.assertTrue(rp.is_last_page())
+    
     def test_keyterms(self):
         ana = analysis.StandardAnalyzer()
         vectorformat = formats.Frequency(ana)

tests/test_searching.py

         s = s.refresh()
         self.assertTrue(s.up_to_date())
 
-    def test_resultspage(self):
-        schema = fields.Schema(id=fields.STORED, content=fields.TEXT)
-        st = RamStorage()
-        ix = st.create_index(schema)
+    def test_find_missing(self):
+        schema = fields.Schema(id=fields.ID, text=fields.KEYWORD(stored=True))
+        ix = RamStorage().create_index(schema)
         
-        domain = ("alfa", "bravo", "bravo", "charlie", "delta")
         w = ix.writer()
-        i = 0
-        for lst in permutations(domain, 3):
-            w.add_document(id=unicode(i), content=u" ".join(lst))
-            i += 1
+        w.add_document(id=u"1", text=u"alfa")
+        w.add_document(id=u"2", text=u"bravo")
+        w.add_document(text=u"charlie")
+        w.add_document(id=u"4", text=u"delta")
+        w.add_document(text=u"echo")
+        w.add_document(id=u"6", text=u"foxtrot")
+        w.add_document(text=u"golf")
         w.commit()
         
         s = ix.searcher()
-        q = query.Term("content", u"bravo")
-        r = s.search(q, limit=10)
-        tops = list(r)
-        
-        rp = s.search_page(q, 1, pagelen=5)
-        self.assertEqual(list(rp), tops[0:5])
-        
-        rp = s.search_page(q, 2, pagelen=5)
-        self.assertEqual(list(rp), tops[5:10])
-        
-        rp = s.search_page(q, 1, pagelen=10)
-        self.assertEqual(len(rp), 54)
-        self.assertEqual(rp.pagecount, 6)
-        rp = s.search_page(q, 6, pagelen=10)
-        self.assertEqual(len(list(rp)), 4)
-        self.assertTrue(rp.is_last_page())
-        
-        self.assertRaises(ValueError, s.search_page, q, 0)
-        self.assertRaises(ValueError, s.search_page, q, 7)
-        
-        rp = s.search_page(query.Term("content", "glonk"), 1)
-        self.assertEqual(len(rp), 0)
-        self.assertTrue(rp.is_last_page())
-        
+        qp = qparser.QueryParser("text", schema=schema)
+        q = qp.parse(u"NOT id:*")
+        r = s.search(q, limit=None)
+        self.assertEqual(list(h["text"] for h in r), ["charlie", "echo", "golf"])
+
+
+
+
+
+
 
 
 if __name__ == '__main__':
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j or Alt+j (next) and Ctrl+k or Alt+k (previous), and view the file with Ctrl+o or Alt+o.