
Commits

Matt Chaput committed 54b57d7

Added range facets.
Added better "missing value" handling for faceting numeric and date fields.
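
For context, a minimal usage sketch of the new facet types, modeled on the tests added in this commit (the schema and documents here are illustrative only):

# Illustrative sketch based on tests/test_sorting.py below.
from whoosh import fields, query, sorting
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(id=0, price=200)
w.add_document(id=1)              # no price: grouped under the "missing" key
w.add_document(id=2, price=50)
w.commit()

with ix.searcher() as s:
    # Group hits into numeric buckets 100 wide between 0 and 1000
    rf = sorting.RangeFacet("price", 0, 1000, 100)
    r = s.search(query.Every(), groupedby={"price": rf})
    print(r.groups("price"))
    # e.g. {(200, 300): [0], (0, 100): [2], None: [1]}
    # DateRangeFacet works the same way with datetime bounds and a
    # timedelta gap (see test_daterange_facet below).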

  • Parent commits f36a348
  • Branches flexisort


Files changed (7)

File src/whoosh/fields.py

 from whoosh.support.times import datetime_to_long
 
 
+# "Default" values to indicate missing values when sorting and faceting numeric
+# fields. There's no "out-of-band" value possible (except for floats, where we
+# use NaN), so we try to be conspicuous at least by using the maximum possible
+# value
+NUMERIC_DEFAULTS = {"b": 2**7-1, "B": 2**8-1, "h": 2**15-1, "H": 2**16-1,
+                    "i": 2**31-1, "I": 2**32-1, "q": 2**63-1, "Q": 2**64-1,
+                    "f": float("nan"), "d": float("nan"),
+                    }
+DEFAULT_LONG = NUMERIC_DEFAULTS["q"]
+
 # Exceptions
 
 class FieldConfigurationError(Exception):
     analyzer = format = vector = scorable = stored = unique = None
     indexed = True
     multitoken_query = "first"
-    sortable_type = text_type
     sortable_typecode = None
     spelling = False
     
         else:
             return False
     
+    def sortable_default(self):
+        """Returns a default value to use for "missing" values when sorting or
+        faceting in this field.
+        """
+        
+        return u("")
+    
     def to_text(self, value):
         """Returns a textual representation of the value. Non-textual fields
         (such as NUMERIC and DATETIME) will override this to encode objects
         for sorting. The default implementation simply returns the texts of all
         terms in the field.
         
-        The value of the field's ``sortable_type`` attribute should contain the
-        type of the second item (the sortable value) in the pairs, e.g.
-        ``unicode`` or ``int``.
-        
         This can be overridden by field types such as NUMERIC where some values
         in a field are not useful for sorting, and where the sortable values
         can be expressed more compactly as numbers.
         """
         
         self.type = type
-        if self.type is int:
-            self.sortable_type = int
-            if PY3:
-                self._to_text = long_to_text
-                self._from_text = text_to_long
-                self.sortable_typecode = "q" if signed else "Q"
-            else:
-                self._to_text = int_to_text
-                self._from_text = text_to_int
-                self.sortable_typecode = "i" if signed else "I"
-        elif self.type is long_type:
+        if self.type is long_type:
+            # This will catch the Python 3 int type
             self._to_text = long_to_text
             self._from_text = text_to_long
-            self.sortable_type = long_type
             self.sortable_typecode = "q" if signed else "Q"
+        elif self.type is int:
+            self._to_text = int_to_text
+            self._from_text = text_to_int
+            self.sortable_typecode = "i" if signed else "I"
         elif self.type is float:
             self._to_text = float_to_text
             self._from_text = text_to_float
         self.analyzer = IDAnalyzer()
         self.format = Existence(field_boost=field_boost)
     
+    def sortable_default(self):
+        return NUMERIC_DEFAULTS[self.sortable_typecode]
+    
     def _tiers(self, num):
         t = self.type
         if t is int and not PY3:
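
A quick sketch of the new sortable_default() hook (assuming this commit is applied); the numeric sentinel depends on the field's typecode, so the int value below is 2**31 - 1 on Python 2 (typecode "i") or 2**63 - 1 where the long/Python 3 int path applies (typecode "q"):

# Sketch of Field.sortable_default() and NUMERIC.sortable_default().
from whoosh import fields

print(repr(fields.ID().sortable_default()))   # u'' -- the base Field default

num = fields.NUMERIC(int)
print(num.sortable_default())                 # 2**31 - 1 or 2**63 - 1, per typecode

flt = fields.NUMERIC(float)
print(flt.sortable_default())                 # nan -- floats have a true out-of-band value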

File src/whoosh/filedb/fieldcache.py

     # Class constructor for building a field cache from a reader
     
     @classmethod
-    def from_field(cls, ixreader, fieldname, default=u("")):
+    def from_field(cls, ixreader, fieldname):
         """Creates an in-memory field cache from a reader.
         
         >>> r = ix.reader()
         texts = None
         if hastexts:
             typecode = "I"
-            texts = [default]
+            texts = [field.sortable_default()]
+            defaultnum = 0
         else:
             typecode = field.sortable_typecode
+            defaultnum = field.sortable_default()
         
         doccount = ixreader.doc_count_all()
         # Python does not support arrays of long long see Issue 1172711
         if typecode.lower() == "q":
-            order = [0] * doccount
+            order = [defaultnum] * doccount
         else:
-            order = array(typecode, [0] * doccount)
+            order = array(typecode, [defaultnum] * doccount)
         
         enum = enumerate(field.sortable_values(ixreader, fieldname))
         for i, (text, sortable) in enum:

File src/whoosh/qparser/plugins.py

             prev_field_node = None
             
             for node in group:
-                if (isinstance(node, fnclass) and node.fieldname not in schema):
+                if isinstance(node, fnclass) and node.fieldname not in schema:
                     prev_field_node = node
                     continue
                 elif prev_field_node:

File src/whoosh/searching.py

         if self.facets is not None:
             groups = self.groups
             for name, catter in self.categorizers.items():
-                key = catter.key_for_id(id)
+                key = catter.key_to_name(catter.key_for_id(id))
                 if self.groupids:
                     if name not in groups:
                         groups[name] = defaultdict(list)
         
         if name not in self._groups:
             raise KeyError("%r not in group names %r" % (name, self._groups.keys()))
-        return self._groups[name]
+        return dict(self._groups[name])
     
     def _load_docs(self):
         # If self.docset is None, that means this results object was created

File src/whoosh/sorting.py

 
 from array import array
 
-from whoosh.compat import string_type
+from whoosh.compat import string_type, xrange
+from whoosh.fields import DEFAULT_LONG
+from whoosh.support.times import (long_to_datetime, datetime_to_long,
+                                  timedelta_to_usecs)
 
 
 class Sorter(object):
     def key_for_id(self, docid):
         raise NotImplementedError
     
+    def key_to_name(self, key):
+        return key
+    
 
 class ScoreFacet(FacetType):
     def categorizer(self, searcher):
         self.reverse = reverse
     
     def categorizer(self, searcher):
-        from whoosh.fields import NUMERIC
+        from whoosh.fields import NUMERIC, DATETIME
         
         # The searcher we're passed here may wrap a multireader, but the
         # actual key functions will always be called per-segment following a
         # Categorizer.set_searcher method call
         fieldname = self.fieldname
-        reader = searcher.reader()
         schema = searcher.schema
-        if fieldname in schema and isinstance(schema[fieldname], NUMERIC):
+        if fieldname in schema and isinstance(schema[fieldname], DATETIME):
+            # Return a subclass of NumericFieldCategorizer that formats dates
+            return self.DateFieldCategorizer(fieldname, self.reverse)
+        elif fieldname in schema and isinstance(schema[fieldname], NUMERIC):
             # Numeric fields are naturally reversible
-            return self.NumericFieldCategorizer(reader, fieldname, self.reverse)
+            return self.NumericFieldCategorizer(fieldname, self.reverse)
         elif self.reverse:
             # If we need to "reverse" a string field, we need to do more work
-            return self.RevFieldCategorizer(reader, fieldname, self.reverse)
+            return self.RevFieldCategorizer(searcher, fieldname, self.reverse)
         else:
             # Straightforward: use the field cache to sort/categorize
             return self.FieldCategorizer(fieldname)
         
         def key_for_id(self, docid):
             return self.fieldcache.key_for(docid)
-    
+        
     class NumericFieldCategorizer(Categorizer):
-        def __init__(self, reader, fieldname, reverse):
+        def __init__(self, fieldname, reverse):
             self.fieldname = fieldname
             self.reverse = reverse
         
         def set_searcher(self, searcher, docoffset):
+            self.default = searcher.schema[self.fieldname].sortable_default()
             self.fieldcache = searcher.reader().fieldcache(self.fieldname)
         
         def key_for_id(self, docid):
                 return 0 - value
             else:
                 return value
+        
+        def key_to_name(self, key):
+            if key == self.default:
+                return None
+            else:
+                return key
+    
+    class DateFieldCategorizer(NumericFieldCategorizer):
+        def key_to_name(self, key):
+            if key == DEFAULT_LONG:
+                return None
+            else:
+                return long_to_datetime(key)
     
     class RevFieldCategorizer(Categorizer):
         def __init__(self, reader, fieldname, reverse):
 
 
 class QueryFacet(FacetType):
-    def __init__(self, querydict, other="none"):
+    def __init__(self, querydict, other=None):
         self.querydict = querydict
         self.other = other
     
             self.docsets = {}
             for qname, q in self.querydict.items():
                 docset = set(q.docs(searcher))
-                self.docsets[qname] = docset
+                if docset:
+                    self.docsets[qname] = docset
             self.offset = offset
         
         def key_for_id(self, docid):
-            if docid > 0: raise Exception
-            print "docid=", docid, "docsets=", self.docsets
             for qname in self.docsets:
                 if docid in self.docsets[qname]:
                     return qname
             return self.other
 
 
+class RangeFacet(QueryFacet):
+    def __init__(self, fieldname, start, end, gap):
+        self.fieldname = fieldname
+        self.start = start
+        self.end = end
+        self.gap = gap
+        self._queries()
+    
+    def _range_name(self, startval, endval):
+        return (startval, endval)
+    
+    def _queries(self):
+        from whoosh import query
+        
+        self.querydict = {}
+        gap = self.gap
+        cstart = self.start
+        while cstart < self.end:
+            cend = min(self.end, cstart + gap)
+            rangename = self._range_name(cstart, cend)
+            q = query.NumericRange(self.fieldname, cstart, cend, endexcl=True)
+            self.querydict[rangename] = q
+            
+            cstart += gap
+    
+    def categorizer(self, searcher):
+        return QueryFacet(self.querydict).categorizer(searcher)
+    
+
+class DateRangeFacet(RangeFacet):
+    def __init__(self, fieldname, startdate, enddate, delta):
+        self.fieldname = fieldname
+        self.start = datetime_to_long(startdate)
+        self.end = datetime_to_long(enddate)
+        self.gap = timedelta_to_usecs(delta)
+        self._queries()
+    
+    def _range_name(self, startval, endval):
+        return (long_to_datetime(startval), long_to_datetime(endval))
+    
+
 class MultiFacet(FacetType):
     def __init__(self, items=None):
         self.facets = []
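
To make the bucketing concrete, this is roughly how RangeFacet("price", 0, 1000, 100) builds its query dictionary (a standalone sketch mirroring _queries() above, not an additional API):

# Sketch mirroring RangeFacet._queries(): each (start, end) tuple is a group
# name mapped to a NumericRange query with an exclusive upper bound.
start, end, gap = 0, 1000, 100
names = []
cstart = start
while cstart < end:
    cend = min(end, cstart + gap)
    names.append((cstart, cend))
    cstart += gap

print(names)
# [(0, 100), (100, 200), ..., (900, 1000)]
# RangeFacet stores query.NumericRange("price", cstart, cend, endexcl=True)
# under each key, then delegates categorization to QueryFacet.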

File src/whoosh/support/times.py

         return (current_wday + 7 - wday) % 7 * -1
 
 
+def timedelta_to_usecs(td):
+    total = td.days * 86400000000  # Microseconds in a day
+    total += td.seconds * 1000000  # Microseconds in a second
+    total += td.microseconds
+    return total
+
+
 def datetime_to_long(dt):
     """Converts a datetime object to a long integer representing the number
     of microseconds since ``datetime.min``.
     """
     
-    td = dt - dt.min
-    total = td.days * 86400000000  # Microseconds in a day
-    total += td.seconds * 1000000  # Microseconds in a second
-    total += td.microseconds
-    return total
+    return timedelta_to_usecs(dt - dt.min)
 
 
 def long_to_datetime(x):
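
For reference, the conversion added above is plain microsecond arithmetic; the five-day gap used by test_daterange_facet below works out as:

# Quick check of timedelta_to_usecs() as defined above.
from datetime import timedelta

td = timedelta(days=5)
usecs = td.days * 86400000000 + td.seconds * 1000000 + td.microseconds
print(usecs)   # 432000000000, i.e. 5 * 86,400,000,000 microseconds per day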

File tests/test_sorting.py

                                          "d-f": [5, 7, 8],
                                          "g-i": [0, 3, 6]})
 
+def test_missing_field_facet():
+    schema = fields.Schema(id=fields.STORED, tag=fields.ID)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=0, tag=u("alfa"))
+    w.add_document(id=1, tag=u("alfa"))
+    w.add_document(id=2)
+    w.add_document(id=3, tag=u("bravo"))
+    w.add_document(id=4)
+    w.commit()
+    
+    with ix.searcher() as s:
+        r = s.search(query.Every(), groupedby="tag")
+        assert_equal(r.groups("tag"), {'': [2, 4], 'bravo': [3], 'alfa': [0, 1]})
+
+def test_missing_numeric_facet():
+    schema = fields.Schema(id=fields.STORED, tag=fields.NUMERIC)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=0, tag=1)
+    w.add_document(id=1, tag=1)
+    w.add_document(id=2)
+    w.add_document(id=3, tag=0)
+    w.add_document(id=4)
+    w.commit()
+    
+    with ix.searcher() as s:
+        r = s.search(query.Every(), groupedby="tag")
+        assert_equal(r.groups("tag"), {None: [2, 4], 0: [3], 1: [0, 1]})
+
+def test_date_facet():
+    from datetime import datetime
+    
+    schema = fields.Schema(id=fields.STORED, date=fields.DATETIME)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    d1 = datetime(2011, 7, 13)
+    d2 = datetime(1984, 3, 29)
+    w.add_document(id=0, date=d1)
+    w.add_document(id=1, date=d1)
+    w.add_document(id=2)
+    w.add_document(id=3, date=d2)
+    w.add_document(id=4)
+    w.commit()
+    
+    with ix.searcher() as s:
+        r = s.search(query.Every(), groupedby="date")
+        assert_equal(r.groups("date"),  {d1: [0, 1], d2: [3], None: [2, 4]})
+
+def test_range_facet():
+    schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=0, price=200)
+    w.add_document(id=1, price=100)
+    w.add_document(id=2)
+    w.add_document(id=3, price=50)
+    w.add_document(id=4, price=500)
+    w.add_document(id=5, price=125)
+    w.commit()
+    
+    with ix.searcher() as s:
+        rf = sorting.RangeFacet("price", 0, 1000, 100)
+        r = s.search(query.Every(), groupedby={"price": rf})
+        assert_equal(r.groups("price"), {(0, 100): [3], (100, 200): [1, 5],
+                                         (200, 300): [0], (500, 600): [4],
+                                         None: [2]})
+
+def test_daterange_facet():
+    from datetime import datetime, timedelta
+    
+    schema = fields.Schema(id=fields.STORED, date=fields.DATETIME)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=0, date=datetime(2001, 1, 15))
+    w.add_document(id=1, date=datetime(2001, 1, 10))
+    w.add_document(id=2)
+    w.add_document(id=3, date=datetime(2001, 1, 3))
+    w.add_document(id=4, date=datetime(2001, 1, 8))
+    w.add_document(id=5, date=datetime(2001, 1, 6))
+    w.commit()
+    
+    with ix.searcher() as s:
+        rf = sorting.DateRangeFacet("date", datetime(2001, 1, 1),
+                                    datetime(2001, 1, 20), timedelta(days=5))
+        r = s.search(query.Every(), groupedby={"date": rf})
+        dt = datetime
+        assert_equal(r.groups("date"), {(dt(2001, 1, 1, 0, 0), dt(2001, 1, 6, 0, 0)): [3],
+                                        (dt(2001, 1, 6, 0, 0), dt(2001, 1, 11, 0, 0)): [1, 4, 5],
+                                        (dt(2001, 1, 11, 0, 0), dt(2001, 1, 16, 0, 0)): [0],
+                                        None: [2]})
+
 @skip_if_unavailable("multiprocessing")
 @skip_if(lambda: True)
 def test_mp_fieldcache():