Commits

Matt Chaput committed c6af0c1

If a field has a column_type, but a particular segment doesn't have a column file,
the SegmentReader will now return an EmptyColumnReader with the column object's
default value. This required changes to column-related APIs to change how
reversed sorting keys are requested.
Fixes issue #378.

  • Participants
  • Parent commits 50e1704

Comments (0)

Files changed (4)

src/whoosh/columns.py

 
         return self.Reader(dbfile, basepos, length, doccount)
 
-    def default_value(self):
+    def default_value(self, reverse=False):
         """Returns the default value for this column type.
         """
 
     def __getitem__(self, docnum):
         raise NotImplementedError
 
-    def sort_key(self, docnum, reverse=False):
+    def sort_key(self, docnum):
         return self[docnum]
 
     def __iter__(self):
     def load(self):
         return list(self)
 
+    def set_reverse(self):
+        raise NotImplementedError
+
 
 # Arbitrary bytes column
 
         return self.Reader(dbfile, basepos, length, doccount, self._typecode,
                            self._default)
 
+    def default_value(self, reverse=False):
+        v = self._default
+        if reverse:
+            v = 0 - v
+        return v
+
     class Writer(FixedBytesColumn.Writer):
         def __init__(self, dbfile, typecode, default):
             self._dbfile = dbfile
             self._basepos = basepos
             self._doccount = doccount
             self._default = default
+            self._reverse = False
 
             self._typecode = typecode
             self._unpack = struct.Struct("!" + typecode).unpack
             s = FixedBytesColumn.Reader.__getitem__(self, docnum)
             return self._unpack(s)[0]
 
-        def sort_key(self, docnum, reverse=False):
+        def sort_key(self, docnum):
             key = self[docnum]
-            if reverse:
+            if self._reverse:
                 key = 0 - key
             return key
 
             else:
                 return array(self._typecode, self)
 
+        def set_reverse(self):
+            self._reverse = True
+
 
 # Column of boolean values
 
     def writer(self, dbfile):
         return self.Writer(dbfile, self._compressat)
 
+    def default_value(self, reverse=False):
+        return self._default ^ reverse
+
     class Writer(ColumnWriter):
         def __init__(self, dbfile, compressat):
             self._dbfile = dbfile
             self._basepos = basepos
             self._length = length
             self._doccount = doccount
+            self._reverse = False
 
             compressed = dbfile.get_byte(basepos + (length - 1))
             if compressed:
         def __getitem__(self, i):
             return i in self._bitset
 
-        def sort_key(self, docnum, reverse=False):
-            return int(self[docnum] ^ reverse)
+        def sort_key(self, docnum):
+            return int(self[docnum] ^ self._reverse)
 
         def __iter__(self):
             i = 0
                 self._bitset = BitSet.from_bytes(bs)
             return self
 
+        def set_reverse(self):
+            self._reverse = True
+
 
 # Compressed variants
 
     def __getitem__(self, docnum):
         return self._translate(self._reader[docnum])
 
-    def sort_key(self, docnum, reverse=False):
-        return self._reader.sort_key(docnum, reverse=reverse)
+    def sort_key(self, docnum):
+        return self._reader.sort_key(docnum)
 
     def __iter__(self):
         translate = self._translate
         return (translate(v) for v in self._reader)
 
+    def set_reverse(self):
+        self._reader.set_reverse()
+
 
 # Column wrappers
 
     def __getitem__(self, docnum):
         return self._child[docnum]
 
-    def sort_key(self, docnum, reverse=False):
-        return self._child.sort_key(docnum, reverse=reverse)
+    def sort_key(self, docnum):
+        return self._child.sort_key(docnum)
 
     def __iter__(self):
         return iter(self._child)
     def load(self):
         return list(self)
 
+    def set_reverse(self):
+        self._child.set_reverse()
+
 
 class ClampedNumericColumn(WrappedColumn):
     """An experimental wrapper type for NumericColumn that clamps out-of-range
 
 
 class ListColumnReader(ColumnReader):
-    def sort_key(self, docnum, reverse=False):
+    def sort_key(self, docnum):
         return self[docnum][0]
 
     def __iter__(self):

src/whoosh/reading.py

     def has_column(self, fieldname):
         return False
 
-    def column_reader(self, fieldname):
+    def column_reader(self, fieldname, column=None, reverse=False,
+                      translate=False):
+        """
+
+        :param fieldname: the name of the field for which to get a reader.
+        :param column: if passed, use this Column object instead of the one
+            associated with the field in the Schema.
+        :param reverse: if passed, reverses the order of keys returned by the
+            reader's ``sort_key()`` method. If the column type is not
+            reversible, this will raise a ``NotImplementedError``.
+        :param translate: if True, wrap the reader to call the field's
+            ``from_bytes()`` method on the returned values.
+        :return: a :class:`whoosh.columns.ColumnReader` object.
+        """
+
         raise NotImplementedError
 
 
         coltype = self.schema[fieldname].column_type
         return coltype and self._perdoc.has_column(fieldname)
 
-    def column_reader(self, fieldname, column=None, translate=True):
+    def column_reader(self, fieldname, column=None, reverse=False,
+                      translate=True):
         if self.is_closed:
             raise ReaderClosed
+
         fieldobj = self.schema[fieldname]
-        if not self.has_column(fieldname):
-            raise Exception("No column for field %r" % fieldname)
+        column = column or fieldobj.column_type
+        if not column:
+            raise Exception("No column for field %r in %r"
+                            % (fieldname, self))
 
-        ctype = column or fieldobj.column_type
-        creader = self._perdoc.column_reader(fieldname, ctype)
+        if self._perdoc.has_column(fieldname):
+            creader = self._perdoc.column_reader(fieldname, column)
+            if reverse:
+                creader.set_reverse()
+        else:
+            # This segment doesn't have a column file for this field, so create
+            # a fake column reader that always returns the default value.
+            default = column.default_value(reverse)
+            creader = columns.EmptyColumnReader(default, self.doc_count_all())
+
         if translate:
             # Wrap the column in a Translator to give the caller
             # nice values instead of sortable representations
     def has_column(self, fieldname):
         return any(r.has_column(fieldname) for r in self.readers)
 
-    def column_reader(self, fieldname, translate=True):
-        column = self.schema[fieldname].column_type
+    def column_reader(self, fieldname, column=None, reverse=False,
+                      translate=True):
+        column = column or self.schema[fieldname].column_type
         if not column:
             raise Exception("Field %r has no column type" % (fieldname,))
 
-        default = column.default_value()
-        doccount = self.doc_count_all()
-
         creaders = []
         for r in self.readers:
-            if r.has_column(fieldname):
-                creaders.append(r.column_reader(fieldname, translate=translate))
-            else:
-                creaders.append(columns.EmptyColumnReader(default, doccount))
-
+            cr = r.column_reader(fieldname, column=column, reverse=reverse,
+                                 translate=translate)
+            creaders.append(cr)
         return columns.MultiColumnReader(creaders)

src/whoosh/sorting.py

     def __init__(self, global_searcher, fieldname, reverse=False):
         self._fieldname = fieldname
         self._fieldobj = global_searcher.schema[self._fieldname]
+        self._column_type = self._fieldobj.column_type
         self._reverse = reverse
 
+        # The column reader is set in set_searcher() as we iterate over the
+        # sub-searchers
+        self._creader = None
+
     def __repr__(self):
         return "%s(%r, %r, reverse=%r)" % (self.__class__.__name__,
                                            self._fieldobj, self._fieldname,
 
     def set_searcher(self, segment_searcher, docoffset):
         r = segment_searcher.reader()
-        self._creader = r.column_reader(self._fieldname, translate=False)
+        self._creader = r.column_reader(self._fieldname,
+                                        reverse=self._reverse,
+                                        translate=False)
 
     def key_for(self, matcher, segment_docnum):
-        return self._creader.sort_key(segment_docnum, self._reverse)
+        return self._creader.sort_key(segment_docnum)
 
     def key_to_name(self, key):
         return self._fieldobj.from_column_value(key)
         self._use_column = (reader.has_column(fieldname)
                             and field.column_type.stores_lists())
 
+        # These are set in set_searcher() as we iterate over the sub-searchers
+        self._segment_searcher = None
+        self._creader = None
+        self._lists = None
+
     def set_searcher(self, segment_searcher, docoffset):
         fieldname = self._fieldname
         self._segment_searcher = segment_searcher
                 self._add(item)
         self.maptype = maptype
 
+    def __repr__(self):
+        return "%s(%r, %r)" % (self.__class__.__name__,
+                               self.facets,
+                               self.maptype)
+
     @classmethod
     def from_sortedby(cls, sortedby):
         multi = cls()

tests/test_sorting.py

         assert chapr[0] == "alfa"
         assert pricer[0] == 100
 
+
+def test_missing_column():
+    from whoosh import collectors
+
+    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(id=0, tags=u("alfa bravo charlie"))
+        w.add_document(id=1, tags=u("bravo charlie delta"))
+        w.add_document(id=2, tags=u("charlie delta echo"))
+        w.merge = False
+
+    with ix.writer() as w:
+        w.add_field("age", fields.NUMERIC(sortable=True))
+
+        w.add_document(id=3, tags=u("delta echo foxtrot"), age=10)
+        w.add_document(id=4, tags=u("echo foxtrot golf"), age=5)
+        w.add_document(id=5, tags=u("foxtrot golf alfa"), age=20)
+        w.merge = False
+
+    with ix.writer() as w:
+        w.add_document(id=6, tags=u("golf alfa bravo"), age=2)
+        w.add_document(id=7, tags=u("alfa hotel india"), age=50)
+        w.add_document(id=8, tags=u("hotel india bravo"), age=15)
+        w.merge = False
+
+    with ix.searcher() as s:
+        assert not s.is_atomic()
+
+        q = query.Term("tags", u("alfa"))
+
+        # Have to use yucky low-level collector API to make sure we used a
+        # ColumnCategorizer to do the sorting
+        c = s.collector(sortedby="age")
+        assert isinstance(c, collectors.SortingCollector)
+        s.search_with_collector(q, c)
+        assert isinstance(c.categorizer, sorting.ColumnCategorizer)
+
+        r = c.results()
+        assert [hit["id"] for hit in r] == [6, 5, 7, 0]
+
+        r = s.search(q, sortedby="age", reverse=True)
+        assert [hit["id"] for hit in r] == [0, 7, 5, 6]
+