Commits

Matt Chaput committed c492da8

Starting 'flexisort' branch

  • Parent commits 7a6e56f
  • Branches flexisort


Files changed (4)

File src/whoosh/filedb/filereading.py

         return self.dc
 
     def stored_fields(self, docnum):
+        assert docnum >= 0
         schema = self.schema
         return dict(item for item
                     in iteritems(self.storedfields[docnum])

File src/whoosh/searching.py

 import threading
 import weakref
 from collections import defaultdict
-from heapq import heappush, heapreplace
+from heapq import heappush, heapreplace, nlargest, nsmallest
 from math import ceil
 
-from whoosh import classify, highlight, query, scoring
+from whoosh import classify, highlight, query, scoring, sorting
 from whoosh.compat import (iteritems, itervalues, iterkeys, xrange, text_type,
                            string_type)
 from whoosh.reading import TermNotFound
         to get sorted search results.
         """
         
-        from whoosh.sorting import Sorter
-        
-        return Sorter(self, *args, **kwargs)
+        return sorting.Sorter(self, *args, **kwargs)
 
     def sort_query_using(self, q, fn, filter=None):
         """Returns a :class:`Results` object with the documents matching the
         :param limit: the maximum number of documents to score. If you're only
             interested in the top N documents, you can set limit=N to limit the
             scoring for a faster search.
-        :param sortedby: the name of a field to sort by, or a tuple of field
-            names to sort by multiple fields. This is a shortcut for using a
-            :class:`whoosh.sorting.Sorter` object to do a simple sort. To do
-            complex sorts (where different fields are sorted in different
-            directions), use :meth:`Searcher.sorter` to get a sorter and use it
-            to perform the sorted search.
+        :param sortedby: the name of a field to sort by, a list of field
+            names, or a :class:`whoosh.sorting.FacetType` object.
         :param reverse: Reverses the direction of the sort.
-        :param groupedby: a list of field names or facet names. If this
-            argument is not None, you can use the :meth:`Results.groups` method
-            on the results object to retrieve a dictionary mapping field/facet
-            values to document numbers.
+        :param groupedby: a list of field names or facet names, or a
+            :class:`whoosh.sorting.Facets` object. If you supply this
+            argument, you can use the :meth:`Results.groups` method on the
+            results object to retrieve a dictionary mapping field/facet values
+            to document numbers.
         :param optimize: use optimizations to get faster results when possible.
         :param filter: a query, Results object, or set of docnums. The results
             will only contain documents that are also in the filter object.
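
For illustration, a sketch of how the reworked arguments fit together, based on the docstrings above (``ix``, ``q``, and the field names here are hypothetical):

    from whoosh import sorting

    with ix.searcher() as s:
        # sortedby takes a field name, a list of names, or a FacetType
        r = s.search(q, sortedby="price", reverse=True, limit=20)
        # groupedby takes field names or a sorting.Facets object
        r = s.search(q, groupedby="tag")
        groups = r.groups("tag")  # maps each tag value to a list of docnums
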
         if limit is not None and limit < 1:
             raise ValueError("limit must be >= 1")
 
-        if sortedby is not None:
-            sorter = self.sorter(sortedby=sortedby)
-            return sorter.sort_query(q, limit=limit, reverse=reverse,
-                                     filter=filter)
-        
         collector = Collector(limit=limit, usequality=optimize,
                               groupedby=groupedby, reverse=reverse)
-        return collector.search(self, q, allow=filter, restrict=mask)
         
+        if sortedby:
+            return collector.sort(self, q, sortedby, allow=filter,
+                                  restrict=mask)
+        else:
+            return collector.search(self, q, allow=filter, restrict=mask)
+    
     def correct_query(self, q, qstring, correctors=None, allfields=False,
                       terms=None, prefix=0, maxdist=2):
         """Returns a corrected version of the given user query using a default
 
 class Collector(object):
     def __init__(self, limit=10, usequality=True, replace=10, groupedby=None,
-                 timelimit=None, greedy=False, reverse=False):
+                 timelimit=None, greedy=False, reverse=False, groupids=True):
         """A Collector finds the matching documents, scores them, collects them
         into a list, and produces a Results object from them.
         
         self.timelimit = timelimit
         self.greedy = greedy
         self.reverse = reverse
+        self.groupids = groupids
         
-        # The groupedby attribute is expected to be a sequence of field names
-        if isinstance(groupedby, string_type):
-            groupedby = (groupedby, )
-        self.groupedby = groupedby
+        self.facets = None
+        if groupedby:
+            self.facets = sorting.Facets.from_groupedby(groupedby)
     
     def should_add_all(self):
         """Returns True if this collector needs to add all found documents (for
         limit = self.limit
         if limit:
             limit = min(limit, self.searcher.doc_count_all())
-        return not limit or self.groupedby
+        return not limit
     
     def use_block_quality(self, searcher, matcher=None):
         """Returns True if this collector can use block quality optimizations
             use = use and matcher.supports_block_quality()
         return use
     
-    def score(self, searcher, matcher):
-        """Called to compute the score for the current document in the given
-        :class:`whoosh.matching.Matcher`.
-        """
+    def _score_fn(self, searcher):
+        w = searcher.weighting
+        if w.use_final:
+            def scorefn(matcher):
+                score = matcher.score()
+                return w.final(searcher, matcher.id(), score)
+        else:
+            scorefn = None
+        return scorefn
+    
+    def _set_categorizers(self, searcher, offset):
+        if self.facets:
+            self.categorizers = dict((name, facet.categorizer(searcher))
+                                     for name, facet in self.facets.items())
+            for catter in self.categorizers.values():
+                catter.set_searcher(searcher, offset)
+    
+    def _set_filters(self, allow, restrict):
+        if allow:
+            allow = self.searcher._filter_to_comb(allow)
+        self.allow = allow
+        if restrict:
+            restrict = self.searcher._filter_to_comb(restrict)
+        self.restrict = restrict
+    
+    def _set_timer(self):
+        # If this collector is time limited, start the timer thread
+        self.timer = None
+        if self.timelimit:
+            self.timer = threading.Timer(self.timelimit, self._timestop)
+            self.timer.start()
+    
+    def _reset(self):
+        self.groups = {}
+        self.items = []
+        self.timedout = False
+        self.runtime = -1
+        self.minscore = None
+    
+    def _timestop(self):
+        # Called by the Timer when the time limit expires. Set an attribute on
+        # the collector to indicate that the timer has expired and the
+        # collector should raise a TimeLimit exception at the next consistent
+        # state.
+        self.timer = None
+        self.timedout = True
+    
+    def collect(self, docid):
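+        """Called for each matching document: adds it to the docset (when
+        one is being kept) and, if grouping is enabled, files it under each
+        facet's key, either as a list of document ids or as a count
+        (depending on ``groupids``).
+        """
+        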
+        docset = self.docset
+        if docset is not None:
+            docset.add(docid)
         
-        w = searcher.weighting
-        score = matcher.score()
-        if w.use_final:
-            score = w.final(searcher, matcher.id(), score)
-        return score
+        if self.facets is not None:
+            groups = self.groups
+            for name, catter in self.categorizers.items():
+                key = catter.key_for_id(docid)
+                if self.groupids:
+                    if name not in groups:
+                        groups[name] = defaultdict(list)
+                    groups[name][key].append(docid)
+                else:
+                    if name not in groups:
+                        groups[name] = defaultdict(int)
+                    groups[name][key] += 1
+    
+    def sort(self, searcher, q, sortedby, allow=None, restrict=None):
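+        """Top-level entry point for sorted (unscored) searches: finds
+        documents matching ``q`` and orders them by the ``sortedby``
+        facet(s) rather than by score. Used by :meth:`Searcher.search` when
+        ``sortedby`` is given.
+        """
+        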
+        self.searcher = searcher
+        self.q = q
+        self.docset = set()
+        self._set_filters(allow, restrict)
+        self._reset()
+        self._set_timer()
+        
+        facet = sorting.MultiFacet.from_sortedby(sortedby)
+        catter = facet.categorizer(searcher)
+        keyfn = catter.key_for_matcher
+        t = now()
+        if searcher.is_atomic():
+            self._set_categorizers(searcher, 0)
+            catter.set_searcher(searcher, 0)
+            self.add_sorted_matches(searcher, q, 0, keyfn)
+        else:
+            for s, offset in searcher.subsearchers:
+                self._set_categorizers(s, offset)
+                catter.set_searcher(s, offset)
+                self.add_sorted_matches(s, q, offset, keyfn)
+        # If we started a time limit timer thread, cancel it
+        if self.timelimit and self.timer:
+            self.timer.cancel()
+        
+        self.runtime = now() - t
+        return self.results(scores=False)
+    
+    def add_sorted_matches(self, searcher, q, offset, keyfn):
+        items = self.items
+        limit = self.limit
+        reverse = self.reverse
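+        # Keep the "limit" best keys: the smallest normally, the largest
+        # when the sort is reversed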
+        heapfn = nlargest if reverse else nsmallest
+        addall = self.should_add_all()
+        matcher = q.matcher(searcher)
+        
+        ls = list(self.pull_matches(searcher, matcher, offset, keyfn, False))
+        if addall:
+            items.extend(ls)
+        else:
+            self.items = heapfn(limit, items + ls)
     
     def search(self, searcher, q, allow=None, restrict=None):
         """Top-level method call which uses the given :class:`Searcher` and
         
         self.searcher = searcher
         self.q = q
-        
-        if allow:
-            allow = searcher._filter_to_comb(allow)
-        self.allow = allow
-        if restrict:
-            restrict = searcher._filter_to_comb(restrict)
-        self.restrict = restrict
-        
-        self.groups = {}
-        self.items = []
-        self.minscore = None
-        self.timedout = False
-        self.runtime = -1
+        self._set_filters(allow, restrict)
+        self._reset()
+        self._set_timer()
         
         # If we're not using block quality, then we can add every document
         # number to a set as we see it, because we're not skipping low-quality
         # blocks
         self.docset = set() if not self.use_block_quality(searcher) else None
         
-        # If this collector is time limited, start the timer thread
-        self.timer = None
-        if self.timelimit:
-            self.timer = threading.Timer(self.timelimit, self._timestop)
-            self.timer.start()
-        
         # Perform the search
         t = now()
         if searcher.is_atomic():
-            self.add_matches(searcher, q)
+            scorefn = self._score_fn(searcher)
+            self._set_categorizers(searcher, 0)
+            self.add_matches(searcher, q, 0, scorefn)
         else:
             for s, offset in searcher.subsearchers:
-                self.add_matches(s, q, offset=offset)
+                scorefn = self._score_fn(s)
+                self._set_categorizers(s, offset)
+                self.add_matches(s, q, offset, scorefn)
         
         # If we started a time limit timer thread, cancel it
         if self.timelimit and self.timer:
         self.runtime = now() - t
         return self.results()
     
-    def _timestop(self):
-        # Called by the Timer when the time limit expires. Set an attribute on
-        # the collector to indicate that the timer has expired and the
-        # collector should raise a TimeLimit exception at the next consistent
-        # state.
-        self.timer = None
-        self.timedout = True
-    
-    def add_matches(self, searcher, q, offset=0):
-        allow = self.allow
-        restrict = self.restrict
+    def add_matches(self, searcher, q, offset, scorefn):
         items = self.items
-        groups = self.groups
         limit = self.limit
         addall = self.should_add_all()
         matcher = q.matcher(searcher)
         usequality = self.use_block_quality(searcher, matcher)
         
-        # If this collector has grouping enabled, set up the key functions
-        keyfns = None
-        if self.groupedby:
-            keyfns = {}
-            for name in self.groupedby:
-                keyfns[name] = searcher.reader().key_fn(name)
-        
-        for offsetid, score in self.pull_matches(searcher, matcher, usequality,
-                                                 offset):
-            if allow and offsetid not in allow:
-                continue
-            if restrict and offsetid in restrict:
-                continue
-            
-            if keyfns:
-                for name, keyfn in iteritems(keyfns):
-                    if name not in groups:
-                        groups[name] = defaultdict(list)
-                    key = keyfn(offsetid - offset)
-                    groups[name][key].append(offsetid)
-            
+        for score, offsetid in self.pull_matches(searcher, matcher, offset,
+                                                 scorefn, usequality):
             # Document numbers are negated before putting them in the heap so
             # that higher document numbers have lower "priority" in the queue.
             # Lower document numbers should always come before higher document
                 if score > items[0][0]:
                     heapreplace(items, (score, negated_offsetid))
                     self.minscore = items[0][0]
-                    
-    def pull_matches(self, searcher, matcher, usequality, offset):
+    
+    def pull_matches(self, searcher, matcher, offset, scorefn, usequality):
         """Low-level method yields (docid, score) pairs from the given matcher.
         Called by :meth:`Collector.add_matches`.
         """
         
-        scorefn = self.score
+        allow = self.allow
+        restrict = self.restrict
         replace = self.replace
-        docset = self.docset
+        collect = self.collect
         minscore = self.minscore
         replacecounter = 0
         timelimited = bool(self.timelimit)
             # The current document ID 
             id = matcher.id()
             offsetid = id + offset
-            # If we're keeping track of IDs encountered, add this one
-            if docset is not None:
-                docset.add(offsetid)
             
-            # If we're using quality optimizations, check whether the current
-            # posting has higher quality than the minimum before yielding it.
-            score = scorefn(searcher, matcher)
-            yield (offsetid, score)
+            if allow and offsetid not in allow:
+                continue
+            if restrict and offsetid in restrict:
+                continue
+            
+            collect(offsetid)
+            if scorefn:
+                score = scorefn(matcher)
+            else:
+                score = matcher.score()
+            yield (score, offsetid)
             
             # Check whether the time limit expired
             if self.timedout:
             # matcher has entered a new block, so we should check block quality
             # again.
             checkquality = matcher.next()
-            
-                    
-    def results(self):
+    
+    def results(self, scores=True):
         """Returns the current results from the collector. This is useful for
         getting the results out of a collector that was stopped by a time
-        limit exception.
+        limit exception. If ``scores`` is False, the collected items are
+        (sortkey, docnum) pairs and are sorted by key rather than by score.
         """
         
-        # Docnums are stored as negative for reasons too obscure to go into
-        # here, re-negate them before returning
-        items = [(x[0], 0 - x[1]) for x in self.items]
+        if scores:
+            # Docnums are stored as negative for reasons too obscure to go into
+            # here, re-negate them before returning
+            items = [(x[0], 0 - x[1]) for x in self.items]
         
-        # Sort by negated scores so that higher scores go first, then by
-        # document number to keep the order stable when documents have the same
-        # score
-        items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse)
+            # Sort by negated scores so that higher scores go first, then by
+            # document number to keep the order stable when documents have the
+            # same score
+            items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse)
+        else:
+            items = sorted(self.items, reverse=self.reverse)
         
         return Results(self.searcher, self.q, items, self.docset,
                        groups=self.groups, runtime=self.runtime,
         # If you're using this collector, you need to examine all documents
         return True
     
-    def add_matches(self, searcher, q, offset=0):
+    def add_matches(self, searcher, q, offset, scorefn):
         sup = super(TermTrackingCollector, self)
         self.matchers = []
         q = self._tag(q)
-        return sup.add_matches(searcher, q, offset=offset)
+        return sup.add_matches(searcher, q, offset, scorefn)
     
-    def pull_matches(self, searcher, matcher, usequality, offset):
+    def pull_matches(self, searcher, matcher, offset, scorefn, usequality):
         super_method = super(TermTrackingCollector, self).pull_matches
         
-        for offsetid, score in super_method(searcher, matcher, usequality,
-                                            offset):
+        for score, offsetid in super_method(searcher, matcher, offset,
+                                            scorefn, usequality):
             for key, m in self.matchers:
                 if m.is_active() and m.id() == offsetid - offset:
                     if key not in self.catalog:
                         self.catalog[key] = set()
                     self.catalog[key].add(offsetid)
             
-            yield (offsetid, score)
+            yield (score, offsetid)
     
     def _tag(self, q):
         # Takes a query and returns a copy of the query with a TaggedQuery
         fields associated with a document ID.
         """
         
+        if name not in self._groups:
+            raise KeyError("%r not in group names %r"
+                           % (name, self._groups.keys()))
         return self._groups[name]
     
     def _load_docs(self):

File src/whoosh/sorting.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from array import array
-from heapq import nlargest, nsmallest
 
 from whoosh.compat import string_type
-from whoosh.searching import Results
-from whoosh.util import now
 
 
 class Sorter(object):
     sorter object to see the updates.
     """
 
-    def __init__(self, searcher, criteria=None, sortedby=None):
+    def __init__(self, searcher, sortedby=None):
         """
         :param searcher: a :class:`whoosh.searching.Searcher` object to use for
             searching.
-        :param criteria: a list of ``(fieldname, reversed)`` tuples, where the
-            second value in each tuple is a boolean indicating whether to
-            reverse the order of the sort for that field. Alternatively you can
-            use the :meth:`Sorter.add_field` method on the instantiated sorter.
-        :param sortedby: a convenience that generates a proper "criteria" list
-            from a fieldname string or list of fieldnames, to set up the sorter
-            for a simple search.
+        :param sortedby: a convenience argument that sets up the sorter's
+            facet list from a fieldname string or list of fieldnames, for a
+            simple search.
         """
         
         self.searcher = searcher
-        self.criteria = criteria or []
+        self.facetlist = []
         if sortedby:
             if isinstance(sortedby, string_type):
                 sortedby = [sortedby]
-            for fieldname in sortedby:
-                self.criteria.append((fieldname, False))
+            for fieldname in sortedby:
+                self.add_field(fieldname)
         
-        self.arrays = None
-
     def add_field(self, fieldname, reverse=False):
         """Adds a field to the sorting criteria. Results are sorted by the
         fields in the order you add them. For example, if you do::
         :param reverse: if True, reverses the natural ordering of the field.
         """
         
-        self.criteria.append((fieldname, reverse))
+        self.add_facet(FieldFacet(fieldname, reverse=reverse))
     
-    def is_simple(self):
-        """Returns ``True`` if this is a "simple" sort (all the fields are
-        sorted in the same direction).
-        """
-        
-        if len(self.criteria) < 2:
-            return True
-        
-        firstdir = self.criteria[0][1]
-        return all(c[1] == firstdir for c in self.criteria)
+    def add_facet(self, facet):
+        self.facetlist.append(facet)
     
-    def _results(self, q, docnums, docset, runtime):
-        top_n = [(None, docnum) for docnum in docnums]
-        return Results(self.searcher, q, top_n, docset, runtime=runtime)
-    
-    def _simple_sort_query(self, q, limit=None, reverse=False, filter=None):
-        # If the direction of all sort fields is the same, we can use field
-        # caches to do the sorting
-        
-        t = now()
-        docset = set()
-        sortedby = [c[0] for c in self.criteria]
-        reverse = self.criteria[0][1] ^ reverse
-        comb = self.searcher._filter_to_comb(filter)
-        
-        if self.searcher.subsearchers:
-            heap = []
-            
-            # I wish I could actually do a heap thing here, but the Python heap
-            # queue only works with greater-than, and I haven't thought of a
-            # smart way to get around that yet, so I'm being dumb and using
-            # nlargest/nsmallest on the heap + each subreader list :(
-            op = nlargest if reverse else nsmallest
-            
-            for s, offset in self.searcher.subsearchers:
-                # This searcher is wrapping a MultiReader, so push the sorting
-                # down to the leaf readers and then combine the results.
-                docnums = [docnum for docnum in q.docs(s)
-                           if (not comb) or docnum + offset in comb]
-                
-                # Add the docnums to the docset
-                docset.update(docnums)
-                
-                # Ask the reader to return a list of (key, docnum) pairs to
-                # sort by. If limit=None, the returned list is not sorted. If
-                # limit=True, it is sorted.
-                r = s.reader()
-                srt = r.key_docs_by(sortedby, docnums, limit, reverse=reverse,
-                                    offset=offset)
-                if limit:
-                    # Pick the "limit" smallest/largest items from the current
-                    # and new list
-                    heap = op(limit, heap + srt)
-                else:
-                    # If limit=None, we'll just add everything to the "heap"
-                    # and sort it at the end.
-                    heap.extend(srt)
-            
-            # Sort the heap and take the docnums
-            docnums = [docnum for _, docnum in sorted(heap, reverse=reverse)]
-            
-        else:
-            # This searcher is wrapping an atomic reader, so we don't need to
-            # get tricky combining the results of multiple readers, just ask
-            # the reader to sort the results.
-            r = self.searcher.reader()
-            docnums = [docnum for docnum in q.docs(self.searcher)
-                       if (not comb) or docnum in comb]
-            docnums = r.sort_docs_by(sortedby, docnums, reverse=reverse)
-            docset = set(docnums)
-            
-            # I artificially enforce the limit here, even thought the current
-            # implementation can't use it, so that the results don't change
-            # based on single- vs- multi-segment.
-            docnums = docnums[:limit]
-        
-        runtime = now() - t
-        return self._results(q, docnums, docset, runtime)
-    
-    def _complex_cache(self):
-        self.arrays = []
-        r = self.searcher.reader()
-        for name, reverse in self.criteria:
-            arry = array("i", [0] * r.doc_count_all())
-            field = self.searcher.schema[name]
-            for i, (t, _) in enumerate(field.sortable_values(r, name)):
-                if reverse:
-                    i = 0 - i
-                postings = r.postings(name, t)
-                for docid in postings.all_ids():
-                    arry[docid] = i
-            self.arrays.append(arry)
-
-    def _complex_key_fn(self, docnum):
-        return tuple(arry[docnum] for arry in self.arrays)
-
-    def _complex_sort_query(self, q, limit=None, reverse=False, filter=None):
-        t = now()
-        if self.arrays is None:
-            self._complex_cache()
-        comb = self.searcher._filter_to_comb(filter)
-        docnums = [docnum for docnum in self.searcher.docs_for_query(q)
-                   if (not comb) or docnum in comb]
-        docnums.sort(key=self._complex_key_fn, reverse=reverse)
-        docset = set(docnums)
-        
-        # I artificially enforce the limit here, even thought the current
-        # implementation can't use it, so that the results don't change based
-        # on single- vs- multi-segment.
-        if limit:
-            docnums = docnums[:limit]
-        runtime = now() - t
-        return self._results(q, docnums, docset, runtime)
-
-    def sort_query(self, q, limit=None, reverse=False, filter=None):
+    def sort_query(self, q, limit=None, reverse=False, filter=None, mask=None,
+                   groupedby=None):
         """Returns a :class:`whoosh.searching.Results` object for the given
         query, sorted according to the fields set up using the
         :meth:`Sorter.add_field` method.
         :meth:`whoosh.searching.Searcher.search` method.
         """
         
-        if self.is_simple():
-            meth = self._simple_sort_query
+        from whoosh.searching import Collector
+        
+        if len(self.facetlist) == 0:
+            raise Exception("No facets added for sorting")
+        elif len(self.facetlist) == 1:
+            facet = self.facetlist[0]
         else:
-            meth = self._complex_sort_query
-            
-        return meth(q, limit, reverse, filter)
+            facet = MultiFacet(*self.facetlist)
+        
+        collector = Collector(limit=limit, groupedby=groupedby, reverse=reverse)
+        return collector.sort(self.searcher, q, facet, allow=filter,
+                              restrict=mask)
+
+
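
A sketch of the Sorter workflow under the new facet-based API (the searcher, field names, and query are hypothetical):

    sorter = searcher.sorter()
    sorter.add_field("chapter")
    sorter.add_field("price", reverse=True)  # mixed directions are allowed
    r = sorter.sort_query(query.Term("tag", u("novel")), limit=20)
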
+# Faceting objects
+
+class FacetType(object):
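+    """Base class for facet types. A facet's job is to produce a
+    :class:`Categorizer` that computes a sorting/grouping key for each
+    matched document.
+    """
+    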
+    def categorizer(self, searcher):
+        raise NotImplementedError
     
 
+class Categorizer(object):
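+    """Base class for categorizers: turns documents into sort/grouping
+    keys. The collector calls :meth:`set_searcher` with each (sub-)searcher
+    before asking for keys, so keys can be computed per-segment.
+    """
+    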
+    def set_searcher(self, searcher, docoffset):
+        pass
+    
+    def key_for_matcher(self, matcher):
+        return self.key_for_id(matcher.id())
+    
+    def key_for_id(self, docid):
+        raise NotImplementedError
+    
 
+class ScoreFacet(FacetType):
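+    """Facet that uses the document's relevance score as the key, applying
+    the weighting's ``final()`` adjustment when the model defines one.
+    """
+    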
+    def categorizer(self, searcher):
+        return self.ScoreCategorizer(searcher)
+    
+    class ScoreCategorizer(Categorizer):
+        def __init__(self, searcher):
+            w = searcher.weighting
+            self.use_final = w.use_final
+            if w.use_final:
+                self.final = w.final
+        
+        def set_searcher(self, searcher, offset):
+            self.searcher = searcher
+    
+        def key_for_matcher(self, matcher):
+            score = matcher.score()
+            if self.use_final:
+                score = self.final(self.searcher, matcher.id(), score)
+            return score
 
+
+class FunctionFacet(FacetType):
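+    """Facet that computes keys using an arbitrary function taking
+    ``(searcher, docid)`` and returning a sortable key.
+    """
+    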
+    def __init__(self, fn):
+        self.fn = fn
+    
+    def categorizer(self, searcher):
+        return self.FunctionCategorizer(searcher, self.fn)
+    
+    class FunctionCategorizer(Categorizer):
+        def __init__(self, searcher, fn):
+            self.fn = fn
+        
+        def set_searcher(self, searcher, docoffset):
+            self.searcher = searcher
+            self.offset = docoffset
+        
+        def key_for_id(self, docid):
+            return self.fn(self.searcher, docid + self.offset)
+
+
+class FieldFacet(FacetType):
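+    """Facet that uses the indexed values of a field as keys (via the
+    reader's field cache). Numeric fields can be reversed cheaply by
+    negating values; other fields fall back to a precomputed reverse
+    ordering of the field's terms.
+    """
+    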
+    def __init__(self, fieldname, reverse=False):
+        self.fieldname = fieldname
+        self.reverse = reverse
+    
+    def categorizer(self, searcher):
+        from whoosh.fields import NUMERIC
+        
+        # The searcher we're passed here may wrap a multireader, but the
+        # actual key functions will always be called per-segment following a
+        # Categorizer.set_searcher method call
+        fieldname = self.fieldname
+        reader = searcher.reader()
+        schema = searcher.schema
+        if fieldname in schema and isinstance(schema[fieldname], NUMERIC):
+            # Numeric fields are naturally reversible
+            return self.NumericFieldCategorizer(reader, fieldname, self.reverse)
+        elif self.reverse:
+            # If we need to "reverse" a string field, we need to do more work
+            return self.RevFieldCategorizer(reader, fieldname, self.reverse)
+        else:
+            # Straightforward: use the field cache to sort/categorize
+            return self.FieldCategorizer(fieldname)
+    
+    class FieldCategorizer(Categorizer):
+        def __init__(self, fieldname):
+            self.fieldname = fieldname
+        
+        def set_searcher(self, searcher, docoffset):
+            self.fieldcache = searcher.reader().fieldcache(self.fieldname)
+        
+        def key_for_id(self, docid):
+            return self.fieldcache.key_for(docid)
+    
+    class NumericFieldCategorizer(Categorizer):
+        def __init__(self, reader, fieldname, reverse):
+            self.fieldname = fieldname
+            self.reverse = reverse
+        
+        def set_searcher(self, searcher, docoffset):
+            self.fieldcache = searcher.reader().fieldcache(self.fieldname)
+        
+        def key_for_id(self, docid):
+            value = self.fieldcache.key_for(docid)
+            if self.reverse:
+                return 0 - value
+            else:
+                return value
+    
+    class RevFieldCategorizer(Categorizer):
+        def __init__(self, reader, fieldname, reverse):
+            # Cache the relative positions of all docs with the given field
+            # across the entire index
+            dc = reader.doc_count_all()
+            arry = array("i", [0] * dc)
+            field = reader.schema[fieldname]
+            for i, (t, _) in enumerate(field.sortable_values(reader, fieldname)):
+                if reverse:
+                    i = 0 - i
+                postings = reader.postings(fieldname, t)
+                for docid in postings.all_ids():
+                    arry[docid] = i
+            self.array = arry
+            
+        def set_searcher(self, searcher, docoffset):
+            self.searcher = searcher
+            self.docoffset = docoffset
+        
+        def key_for_id(self, docid):
+            return self.array[docid + self.docoffset]
+
+
+class QueryFacet(FacetType):
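+    """Facet that categorizes documents according to which of a set of
+    named queries they match; documents matching none get the ``other``
+    key.
+    """
+    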
+    def __init__(self, querydict, other="none"):
+        self.querydict = querydict
+        self.other = other
+    
+    def categorizer(self, searcher):
+        return self.QueryCategorizer(searcher, self.querydict, self.other)
+    
+    class QueryCategorizer(Categorizer):
+        def __init__(self, searcher, querydict, other):
+            self.docsets = dict((qname, set(q.docs(searcher)))
+                                for qname, q in querydict.items())
+            self.other = other
+        
+        def key_for_id(self, docid):
+            for qname, docset in self.docsets.items():
+                if docid in docset:
+                    return qname
+            return self.other
+
+
+class MultiFacet(FacetType):
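+    """Facet that combines the keys of multiple sub-facets into a tuple,
+    for sorting/grouping on more than one criterion.
+    """
+    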
+    def __init__(self, *items):
+        self.facets = list(items)
+    
+    @classmethod
+    def from_sortedby(cls, sortedby):
+        multi = cls()
+        def _add(item):
+            if isinstance(item, FacetType):
+                multi.add_facet(item)
+            elif isinstance(item, string_type):
+                multi.add_field(item)
+            else:
+                raise Exception("Don't know what to do with facet %r" % item)
+        
+        # Check for strings first, since strings are iterable too
+        if isinstance(sortedby, string_type):
+            _add(sortedby)
+        elif isinstance(sortedby, (list, tuple)) or hasattr(sortedby, "__iter__"):
+            for item in sortedby:
+                _add(item)
+        else:
+            _add(sortedby)
+        
+        return multi
+    
+    def add_field(self, fieldname, reverse=False):
+        self.facets.append(FieldFacet(fieldname, reverse=reverse))
+        return self
+    
+    def add_query(self, querydict, other="none"):
+        self.facets.append(QueryFacet(querydict, other=other))
+        return self
+    
+    def add_function(self, fn):
+        self.facets.append(FunctionFacet(fn))
+        return self
+    
+    def add_facet(self, facet):
+        if not isinstance(facet, FacetType):
+            raise Exception("%r is not a FacetType object" % facet)
+        self.facets.append(facet)
+        return self
+    
+    def categorizer(self, searcher):
+        if not self.facets:
+            raise Exception("No facets")
+        elif len(self.facets) == 1:
+            catter = self.facets[0].categorizer(searcher)
+        else:
+            catter = self.MultiCategorizer([facet.categorizer(searcher)
+                                            for facet in self.facets])
+        return catter
+    
+    class MultiCategorizer(Categorizer):
+        def __init__(self, catters):
+            self.catters = catters
+        
+        def set_searcher(self, searcher, docoffset):
+            for catter in self.catters:
+                catter.set_searcher(searcher, docoffset)
+        
+        def key_for_matcher(self, matcher):
+            return tuple(catter.key_for_matcher(matcher)
+                         for catter in self.catters)
+        
+        def key_for_id(self, docid):
+            return tuple(catter.key_for_id(docid) for catter in self.catters)
+
+
+class Facets(object):
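+    """Maps names to facet objects, for use as the ``groupedby`` argument
+    to :meth:`whoosh.searching.Searcher.search`.
+    """
+    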
+    def __init__(self):
+        self.facets = {}
+    
+    @classmethod
+    def from_groupedby(cls, groupedby):
+        facets = cls()
+        if isinstance(groupedby, (cls, dict)):
+            facets.add_facets(groupedby)
+        elif isinstance(groupedby, string_type):
+            facets.add_field(groupedby)
+        elif isinstance(groupedby, FacetType):
+            facets.add_facet("facet", groupedby)
+        elif isinstance(groupedby, (list, tuple)):
+            for item in groupedby:
+                facets.add_facets(cls.from_groupedby(item))
+        else:
+            raise Exception("Don't know what to do with groupedby=%r" % groupedby)
+        
+        return facets
+    
+    def items(self):
+        return self.facets.items()
+    
+    def add_facet(self, name, facet):
+        if not isinstance(facet, FacetType):
+            raise Exception("%r:%r is not a facet" % (name, facet))
+        self.facets[name] = facet
+        return self
+    
+    def add_facets(self, facets, replace=True):
+        if not isinstance(facets, (dict, Facets)):
+            raise Exception("%r is not a Facets object or dict" % facets)
+        for name, facet in facets.items():
+            if replace or name not in self.facets:
+                self.facets[name] = facet
+        return self
+    
+    def add_field(self, fieldname, reverse=False):
+        self.facets[fieldname] = FieldFacet(fieldname, reverse=reverse)
+        return self
+    
+    def add_query(self, name, querydict, other="none"):
+        self.facets[name] = QueryFacet(querydict, other=other)
+        return self
+    
+    def add_score(self):
+        self.facets["_score"] = ScoreFacet()
+        return self
+    
+    def add_function(self, name, fn):
+        self.facets[name] = FunctionFacet(fn)
+        return self
+    
+    def key_function(self, searcher, name):
+        facet = self.facets[name]
+        catter = facet.categorizer(searcher)
+        return catter.key_for_id
+
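
For reference, a sketch of how the facet objects compose (the field names and queries are hypothetical):

    from whoosh import sorting
    from whoosh.compat import u
    from whoosh.query import TermRange

    # Sort by chapter, then by price descending
    multi = (sorting.MultiFacet()
             .add_field("chapter")
             .add_field("price", reverse=True))

    # Group documents into named price bands, with a catch-all key
    bands = sorting.QueryFacet({"cheap": TermRange("price", u("000"), u("100")),
                                "pricey": TermRange("price", u("100"), u("999"))},
                               other="unknown")
    facets = sorting.Facets().add_facet("band", bands).add_field("tag")
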

File tests/test_sorting.py

             w.add_document(ev=u("a"), **doc)
         w.commit(merge=False)
 
-def try_sort(sortedby, key, q=None, limit=None, reverse=False):
+def try_sort(sortedby, key, q=None, limit=None, reverse=False, debug=False):
     if q is None: q = query.Term("ev", u("a"))
     
     correct = [d["id"] for d in sorted(docs, key=key, reverse=reverse)][:limit]
             with ix.searcher() as s:
                 r = s.search(q, sortedby=sortedby, limit=limit, reverse=reverse)
                 rids = [d["id"] for d in r]
+                if debug:
+                    print("fn=", fn)
+                    print("rids=", rids)
+                    print("correct=", correct)
                 assert_equal(rids, correct)
 
 
         w.commit()
         
         with ix.searcher() as s:
+            r = s.search(query.Every(), sortedby="key", limit=5)
+            assert_equal(r.scored_length(), 5)
+            assert_equal(len(r), s.doc_count_all())
+            
             rp = s.search_page(query.Every(), 1, pagelen=5, sortedby="key")
             assert_equal("".join([h["key"] for h in rp]), "abcde")
             assert_equal(rp[10:], [])
     check(make_single_index)
     check(make_multi_index)
 
-def test_query_facets():
+def test_define_facets():
     schema = fields.Schema(value=fields.ID(stored=True))
     with TempIndex(schema, "queryfacets") as ix:
         w = ix.writer()
             cats = s.categorize_query(query.Every(), ("tag", "size"))
             assert_equal(cats, correct)
             
-            r = s.search(query.Every(), groupedby=[("tag", "size")])
-            cats = r.groups(("tag", "size"))
+            from whoosh.sorting import MultiFacet
+            
+            facet = MultiFacet().add_field("tag").add_field("size")
+            r = s.search(query.Every(), groupedby={"tag/size": facet})
+            cats = r.groups("tag/size")
             assert_equal(cats, correct)
 
 def test_sort_filter():