Commits

Matt Chaput  committed bfd43ff

Reverted change to generate synthetic columns, since it didn't work for numeric fields.
Fixed some obsolete package references.

  • Participants
  • Parent commits 96433ea

Comments (0)

Files changed (9)

File docs/source/releases/2_0.rst

 * Whoosh 2.5 will read existing indexes, but segments created by 2.5 will not
   be readable by older versions of Whoosh.
 
-* You should now specify ``sortable=True`` on fields that you plan on using to
-  sort search results.
-  Note that you can still sort on fields without specifying ``sortable=True``,
-  however the first sort will be slow as Whoosh caches a column in memory.
+* As a replacement for field caches to speed up sorting, Whoosh now supports
+  adding a ``sortable=True`` keyword argument to fields. This makes Whoosh store
+  a sortable representation of the field's values in a "column" format
+  (which associates a "key" value with each document). This is more robust,
+  efficient, and customizable than the old behavior.
+  You should now specify ``sortable=True`` on fields that you plan on using to
+  sort or group search results.
 
-  Instead of using field caches to speed up sorting, Whoosh now supports adding
-  a ``sortable=True`` keyword argument to fields. This makes Whoosh store a
-  sortable representation of the field's values in a "column" format
-  (associating a "key" value with each document). This is more robust,
-  efficient, and customizable than the old behavior.
+  (You can still sort/group on fields that don't have ``sortable=True``;
+  however, doing so will use more RAM and be slower, because Whoosh caches the
+  field values in memory.)
 
-  Fields that use ``sortable=True`` can avoid specifying ``stored=True`` and the
+  Fields that use ``sortable=True`` can avoid specifying ``stored=True``. The
   field's value will still be available on ``Hit`` objects (the value will be
   retrieved from the column instead of from the stored fields). This may
   actually be faster for certain types of values.
 * Whoosh will now detect common types of OR queries and use optimized read-ahead
   matchers to speed them up by several times.
 
-* Whoosh now includes pure-python implementations of the Snowball stemmers and
+* Whoosh now includes pure-Python implementations of the Snowball stemmers and
   stop word lists for various languages adapted from NLTK. These are available
   through the :class:`whoosh.analysis.LanguageAnalyzer` analyzer or through the
   ``lang=`` keyword argument to the

File src/whoosh/columns.py

 from whoosh.compat import dumps, loads
 from whoosh.filedb.structfile import StructFile
 from whoosh.idsets import BitSet, OnDiskBitSet
-from whoosh.system import emptybytes, _INT_SIZE
+from whoosh.system import emptybytes
 from whoosh.util.cache import lru_cache
 from whoosh.util.numeric import typecode_max, typecode_min
 from whoosh.util.numlists import GrowableArray
-from whoosh.util.varints import varint, varint_to_int
+from whoosh.util.varints import varint
 
 
 # Utility functions
         return (translate(v) for v in self._reader)
 
 
-# Fake column reader for fields without columns
-
-class PostingColumnReader(ColumnReader):
-    """
-    Builds a synthetic column for fields that weren't indexed with column
-    storage. This object reads every posting for every term in the field, so
-    building it is quite expensive and the reader should cache it once it's
-    built.
-    """
-
-    def __init__(self, reader, fieldname):
-        self._length = reader.doc_count_all()
-        # Dictionary mapping document IDs to values
-        self._values = values = {}
-
-        fieldobj = reader.schema[fieldname]
-        self._frombytes =fieldobj.from_bytes
-
-        # Read the terms in the field in sorted order
-
-        btexts = fieldobj.sortable_terms(reader, fieldname)
-        for btext in btexts:
-            # Read the document IDs containing this term
-            # Global doc ids
-            postings = reader.postings(fieldname, btext)
-            for docid in postings.all_ids():
-                values[docid] = btext
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, docnum):
-        return self._values.get(docnum, emptybytes)
-
-    def __iter__(self):
-        values = self._values
-        for docnum in xrange(self._length):
-            yield values.get(docnum, emptybytes)
-
-
 # Column wrappers
 
 class WrappedColumn(Column):

File src/whoosh/reading.py

             self._storage = OverlayStorage(files, storage)
         else:
             self._storage = storage
-        self._column_cache = {}
 
         # Get subreaders from codec
         self._codec = codec if codec else segment.codec()
 
     def column_reader(self, fieldname, column=None, translate=True):
         fieldobj = self.schema[fieldname]
-        if self.has_column(fieldname):
-            ctype = column or fieldobj.column_type
-            creader = self._perdoc.column_reader(fieldname, ctype)
-            if translate:
-                # Wrap the column in a Translator to give the caller
-                # nice values instead of sortable representations
-                fcv = fieldobj.from_column_value
-                creader = columns.TranslatingColumnReader(creader, fcv)
-        else:
-            # If the field wasn't indexed with a column, create one from
-            # postings and cache it
-            if fieldname in self._column_cache:
-                creader = self._column_cache[fieldname]
-            else:
-                creader = columns.PostingColumnReader(self, fieldname)
-                self._column_cache[fieldname] = creader
+        if not self.has_column(fieldname):
+            raise Exception("No column for field %r" % fieldname)
+
+        ctype = column or fieldobj.column_type
+        creader = self._perdoc.column_reader(fieldname, ctype)
+        if translate:
+            # Wrap the column in a Translator to give the caller
+            # nice values instead of sortable representations
+            fcv = fieldobj.from_column_value
+            creader = columns.TranslatingColumnReader(creader, fcv)
 
         return creader
 
         return any(r.has_word_graph(fieldname) for r in self.readers)
 
     def word_graph(self, fieldname):
-        from whoosh.fst import UnionNode
+        from whoosh.automata.fst import UnionNode
         from whoosh.util import make_binary_tree
 
         if not self.has_word_graph(fieldname):
     def has_column(self, fieldname):
         return any(r.has_column(fieldname) for r in self.readers)
 
-    def column_reader(self, fieldname):
+    def column_reader(self, fieldname, translate=True):
         column = self.schema[fieldname].column_type
         if not column:
             raise Exception("Field %r has no column type" % (fieldname,))
+
         default = column.default_value()
         doccount = self.doc_count_all()
 
         creaders = []
         for r in self.readers:
             if r.has_column(fieldname):
-                creaders.append(r.column_reader(fieldname))
+                creaders.append(r.column_reader(fieldname, translate=translate))
             else:
                 creaders.append(columns.EmptyColumnReader(default, doccount))
 

File src/whoosh/searching.py

         self._closereader = closereader
         self._ix = fromindex
         self._doccount = self.ixreader.doc_count_all()
+        # Cache of (values, array) pairs built by PostingCategorizer, keyed by
+        # field name (supports fields without columns)
+        self._field_caches = {}
 
         if parent:
             self.parent = weakref.ref(parent)

File src/whoosh/sorting.py

         fieldname = self.fieldname
         fieldobj = global_searcher.schema[fieldname]
 
+        # If we're grouping with allow_overlap=True, all we can use is
+        # OverlappingCategorizer
         if self.allow_overlap:
-            # If we're grouping with allow_overlap=True, all we can use is
-            # OverlappingCategorizer
-            c = OverlappingCategorizer(global_searcher, fieldname)
+            return OverlappingCategorizer(global_searcher, fieldname)
+
+        if global_searcher.reader().has_column(fieldname):
+            coltype = fieldobj.column_type
+            if coltype.reversible or not self.reverse:
+                c = ColumnCategorizer(global_searcher, fieldname, self.reverse)
+            else:
+                c = ReversedColumnCategorizer(global_searcher, fieldname)
         else:
-            # Check if the field has a real column and if it's reversible
-            reversible = False
-            if global_searcher.reader().has_column(fieldname):
-                reversible = fieldobj.column_type.reversible
-
-            # If the facet is reversed and the column isn't reversible, we have
-            # to use a ReversedColumnCategorizer
-            if self.reverse and not reversible:
-                c = ReversedColumnCategorizer(global_searcher, fieldname)
-            else:
-                c = ColumnCategorizer(global_searcher, fieldname, self.reverse)
-
+            c = PostingCategorizer(global_searcher, fieldname,
+                                   self.reverse)
         return c
 
 
         self._fieldobj = global_searcher.schema[self._fieldname]
         self._reverse = reverse
 
+    def __repr__(self):
+        return "%s(%r, %r, reverse=%r)" % (self.__class__.__name__,
+                                           self._fieldobj, self._fieldname,
+                                           self._reverse)
+
     def set_searcher(self, segment_searcher, docoffset):
         r = segment_searcher.reader()
         self._creader = r.column_reader(self._fieldname, translate=False)
                 return None
 
 
+class PostingCategorizer(Categorizer):
+    """
+    Categorizer for fields that don't store column values. This is very
+    inefficient. Instead of relying on this categorizer you should plan for
+    which fields you'll want to sort on and set ``sortable=True`` in their
+    field type.
+
+    This object builds an array caching the order of all documents according to
+    the field, then uses the cached order as a numeric key. This is useful when
+    a field cache is not available, and also for reversed fields (since field
+    cache keys for non-numeric fields are arbitrary data, it's not possible to
+    "negate" them to reverse the sort order).
+    """
+
+    def __init__(self, global_searcher, fieldname, reverse):
+        self.reverse = reverse
+
+        if fieldname in global_searcher._field_caches:
+            self.values, self.array = global_searcher._field_caches[fieldname]
+        else:
+            # Cache the relative positions of all docs with the given field
+            # across the entire index
+            reader = global_searcher.reader()
+            dc = reader.doc_count_all()
+            self._fieldobj = global_searcher.schema[fieldname]
+            from_bytes = self._fieldobj.from_bytes
+
+            self.values = []
+            self.array = array("i", [dc + 1] * dc)
+
+            btexts = self._fieldobj.sortable_terms(reader, fieldname)
+            for i, btext in enumerate(btexts):
+                self.values.append(from_bytes(btext))
+                # Get global docids from global reader
+                postings = reader.postings(fieldname, btext)
+                for docid in postings.all_ids():
+                    self.array[docid] = i
+
+            global_searcher._field_caches[fieldname] = (self.values, self.array)
+
+    def set_searcher(self, segment_searcher, docoffset):
+        self._searcher = segment_searcher
+        self.docoffset = docoffset
+
+    def key_for(self, matcher, segment_docnum):
+        global_docnum = self.docoffset + segment_docnum
+        i = self.array[global_docnum]
+        if self.reverse:
+            i = len(self.values) - i
+        return i
+
+    def key_to_name(self, i):
+        if i >= len(self.values):
+            return None
+        if self.reverse:
+            i = len(self.values) - i
+        return self.values[i]
+
+
 # Special facet types
 
 class QueryFacet(FacetType):

File src/whoosh/spelling.py

 from collections import defaultdict
 from heapq import heappush, heapreplace
 
-from whoosh import analysis, fields, fst, highlight, query, scoring
+from whoosh import analysis, fields, highlight, query, scoring
+from whoosh.automata import fst
 from whoosh.compat import xrange, string_type
 from whoosh.support.levenshtein import distance
 from whoosh.util.text import utf8encode

File tests/test_dawg.py

 import random
 from array import array
 
-from whoosh import fst
+from whoosh.automata import fst
 from whoosh.compat import b, u, xrange, array_tobytes
 from whoosh.filedb.filestore import RamStorage
 from whoosh.util.testing import TempStorage

File tests/test_highlighting.py

     result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa,
                                  fragmenter=highlight.ContextFragmenter(),
                                  formatter=highlight.UppercaseFormatter())
-    assert result == "INDEXED!"
+    assert result == "INDEXED!\n1"

File tests/test_sorting.py

 
 
 def test_date_facet():
+    from whoosh import columns
+
     schema = fields.Schema(id=fields.STORED, date=fields.DATETIME)
+    dc = schema["date"].default_column()
+    assert isinstance(dc, columns.NumericColumn)
+
     ix = RamStorage().create_index(schema)
     w = ix.writer()
     d1 = datetime(2011, 7, 13)
         q = query.TermRange("a", u("bravo"), u("k"))
         facet = sorting.FieldFacet("a", reverse=True)
 
-        cat = facet.categorizer(s)
-        assert cat.__class__ == sorting.PostingCategorizer
-
         r = s.search(q, sortedby=facet)
         assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "delta", "charlie", "bravo"]
 
             facet.add_field("title", reverse=True)
 
             r = s.search(query.Every(), sortedby=facet)
-            assert [hit["title"] for hit in r] == ["Visual and Statistical Thinking",
-                                                   "Cognitive Style of Powerpoint",
-                                                   "Beautiful Evidence",
-                                                   "Visual Explanations",
-                                                   "Visual Display of Quantitative Information, The",
-                                                   "Envisioning Information",
-                                                   ]
+            target = ["Visual and Statistical Thinking",
+                      "Cognitive Style of Powerpoint",
+                      "Beautiful Evidence",
+                      "Visual Explanations",
+                      "Visual Display of Quantitative Information, The",
+                      "Envisioning Information",
+                      ]
+            assert [hit["title"] for hit in r] == target
 
     # Single segment
     ix = RamStorage().create_index(schema)