Commits

Matt Chaput committed 0e919ee Merge

Merge with mainline.

  • Parent commits b17e714, 44ed3d8
  • Branches nested

Files changed (21)

File .hgtags

 9b9108fd23fd3a0a35be171c9b170560023c9491 2.2
 dee059118c86964dfcc8e72a8a3fbd29eae4cd01 2.2.1
 9a84c1a1d557b809e46808768f2c451e2560c5cd 2.2.2
+5d1064ce4c8550fe7dda58b17ab389347d6cbb77 2.3
+19c2df0a94efd8fdf7be8ea480f3cdd219a06c7a 2.3.1

File docs/source/api/sorting.rst

 .. autoclass:: Facets
     :members:
 
+
+FacetType objects
+=================
+
+.. autoclass:: FacetMap
+    :members:
+.. autoclass:: OrderedList
+.. autoclass:: UnorderedList
+.. autoclass:: Count
+.. autoclass:: Best
+
+

File docs/source/facets.rst

 
 A ``FacetType`` object
     Uses this object to group the documents. See below for the available facet
-    types. The facet name will automatically be set to ``"facet"``.
+    types.
 
 A field name string
     Converts the field name into a ``FieldFacet`` (see below) and uses it to
     sort the documents. The name of the field is used as the facet name.
 
+A list or tuple of field name strings
+    Sets up multiple field grouping criteria.
+
 A dictionary mapping facet names to FacetType objects
-    Sets up multiple grouping crieteria.
+    Sets up multiple grouping criteria.
 
 A ``Facets`` object
     This object is a lot like using a dictionary, but has some convenience
     methods to make setting up multiple groupings a little easier.
 
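As a quick illustration of the accepted forms (a sketch; field and facet
names are illustrative)::

    # A single field name
    results = searcher.search(myquery, groupedby="size")

    # A list of field names
    results = searcher.search(myquery, groupedby=["tag", "size"])

    # A FacetType object
    results = searcher.search(myquery, groupedby=sorting.FieldFacet("size"))

    # A dictionary mapping facet names to FacetType objects
    results = searcher.search(myquery,
                              groupedby={"size": sorting.FieldFacet("size")})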
+
 Examples
 --------
 
 
     cats = sorting.FieldFacet("category")
     tags = sorting.FieldFacet("tags", allow_overlap=True)
-    results = searcher.search(myquery, groupedby={"cats": cats, "tags": tags})
+    results = searcher.search(myquery, groupedby={"category": cats, "tags": tags})
     
     # ...or, using a Facets object has a little less duplication
     facets = sorting.Facets()
 --------------------------
 
 The ``Results.groups("facetname")`` method returns a dictionary mapping
-category names to lists of **document IDs**.
+category names to lists of **document IDs**::
 
-    {"small": [1, 2, 4, 5, 8], "medium": [0, 3, 6], "large": [7, 9]}
+    myfacets = sorting.Facets().add_field("size").add_field("tag")
+    results = mysearcher.search(myquery, groupedby=myfacets)
+    results.groups("size")
+    # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]}
 
-The ``Searcher`` object's ``stored_fields()`` method takes a document number
-and returns the document's stored fields as a dictionary::
+If there is only one facet, you can just use ``Results.groups()`` with no
+argument to access its groups::
+
+    results = mysearcher.search(myquery, groupedby=myfunctionfacet)
+    results.groups()
+
+By default, the values in the dictionary returned by ``groups()`` are lists of
+document numbers in the same relative order as in the results. You can use the
+``Searcher`` object's ``stored_fields()`` method to take a document number and
+return the document's stored fields as a dictionary::
 
     for category_name in categories:
         print "Top 5 documents in the %s category" % category_name
         if len(doclist) > 5:
             print "  (%s more)" % (len(doclist) - 5)
 
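A fuller sketch of the loop above (a minimal sketch; ``categories`` and
``doclist`` are assumed from the surrounding context)::

    categories = results.groups("category")
    for category_name in categories:
        doclist = categories[category_name]
        print "Top 5 documents in the %s category" % category_name
        for docnum in doclist[:5]:
            print "  ", searcher.stored_fields(docnum)
        if len(doclist) > 5:
            print "  (%s more)" % (len(doclist) - 5)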
-(You can use ``Searcher.stored_fields(docnum)`` to get the stored fields
-associated with a document number.)
+If you want different information about the groups, for example just the count
+of documents in each group, or you don't need the groups to be ordered, you can
+specify a :class:`whoosh.sorting.FacetMap` type or instance with the
+``maptype`` keyword argument when creating the ``FacetType``::
 
-If you just want to **count** the number of documents in each group, instead of
-generating a full list of the documents, use the ``groupids=False`` keyword
-argument::
+    # This is the same as the default
+    myfacet = FieldFacet("size", maptype=sorting.OrderedList)
+    results = mysearcher.search(myquery, groupedby=myfacet)
+    results.groups()
+    # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]}
+    
+    # Don't sort the groups to match the order of documents in the results
+    # (faster)
+    myfacet = FieldFacet("size", maptype=sorting.UnorderedList)
+    results = mysearcher.search(myquery, groupedby=myfacet)
+    results.groups()
+    # {"small": [1, 2, 4, 5, 8], "medium": [0, 3, 6], "large": [7, 9]}
 
-    results = searcher.search(myquery, groupedby="size")
-    groups = results.groups("size")
+    # Only count the documents in each group
+    myfacet = FieldFacet("size", maptype=sorting.Count)
+    results = mysearcher.search(myquery, groupedby=myfacet)
+    results.groups()
     # {"small": 5, "medium": 3, "large": 2}
+    
+    # Only remember the "best" document in each group
+    myfacet = FieldFacet("size", maptype=sorting.Best)
+    results = mysearcher.search(myquery, groupedby=myfacet)
+    results.groups()
+    # {"small": 8, "medium": 3, "large": 7}
 
-To generate multiple groupings, you can name multiple fields in the list you
-pass to the `groups` keyword::
+Alternatively you can specify a ``maptype`` argument in the
+``Searcher.search()`` method call which applies to all facets::
 
-    # Generate separate groupings for the "tag" and "size" fields
-    results = searcher.search(myquery, groupedby=["tag", "size"])
-    
-    # Get the groupings by "tag"
-    tag_groups = results.groups("tag")
-    
-    # Get the groupings by "size"
-    size_groups = results.groups("size")
+    results = mysearcher.search(myquery, groupedby=["size", "tag"],
+                                maptype=sorting.Count)
+
+(You can override this overall ``maptype`` argument on individual facets by
+specifying the ``maptype`` argument for them as well.)
 
 
 Facet types

File docs/source/releases/2_0.rst

   functionality right now; I'm trying to think of ways to make its power easier
   to access.
 
+* The documents in the lists in the dictionary returned by ``Results.groups()``
+  by default are now in the same relative order as in the results. This makes
+  it much easier to display the "top N" results in each category, for example.
+
+* The ``groupids`` keyword argument to ``Searcher.search`` has been removed.
+  Instead you can now pass a :class:`whoosh.sorting.FacetMap` object to the
+  ``Searcher.search`` method's ``maptype`` argument to control how faceted
+  documents are grouped, and/or set the ``maptype`` argument on individual
+  :class:`whoosh.sorting.FacetType` objects to set custom grouping per facet.
+  See :doc:`../facets` for more information.
+
+* Calling ``Searcher.documents()`` or ``Searcher.document_numbers()`` with no
+  arguments now yields all documents/numbers (see the sketch after this
+  list).
+
+* Calling ``Writer.update_document()`` with no unique fields is now equivalent
+  to calling ``Writer.add_document()`` with the same arguments.
+
+* Fixed a problem with keyword expansion where the code was building a cache
+  that was fast on small indexes, but unacceptably slow on large indexes.
+
+* Added the hyphen (``-``) to the list of characters that match a "wildcard"
+  token, to make parsing slightly more predictable. A true fix will have to
+  wait for another parser rewrite.
+
+* Fixed an unused ``__future__`` import and use of ``float("nan")`` which were
+  breaking under Python 2.5.
+
+* Fixed a bug where vectored fields with only one term stored an empty term
+  vector.
+
+* Various other bug fixes.
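
A short sketch of the convenience behaviors noted above (assuming an open
searcher and a writer on a schema with a non-unique ``title`` field)::

    # Yields the stored fields of every document in the index
    for fields in searcher.documents():
        print fields

    # No unique fields are involved, so this behaves like add_document()
    writer.update_document(title=u"a new document")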
 
 Whoosh 2.2
 ==========

File src/whoosh/__init__.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-__version__ = (2, 2, 2)
+__version__ = (2, 3, 1)
 
 
 def versionstring(build=True, extra=True):

File src/whoosh/classify.py

                           self.ixreader.field_length(fieldname))
         self.model = model
 
-        # Cache the collection frequency of every term in this field. This
-        # turns out to be much faster than reading each individual weight
-        # from the term index as we add words.
-        self.collection_freq = dict((word, ti.weight()) for word, ti
-                                    in self.ixreader.iter_field(fieldname))
-
         # Maps words to their weight in the top N documents.
         self.topN_weight = defaultdict(float)
 
         """
 
         model = self.model
+        fieldname = self.fieldname
+        ixreader = self.ixreader
         tlist = []
         maxweight = 0
-        collection_freq = self.collection_freq
+
+        # If no terms have been added, return an empty list
+        if not self.topN_weight:
+            return []
 
         for word, weight in iteritems(self.topN_weight):
-            if word in collection_freq:
-                score = model.score(weight, collection_freq[word],
-                                    self.top_total)
+            if (fieldname, word) in ixreader:
+                cf = ixreader.frequency(fieldname, word)
+                score = model.score(weight, cf, self.top_total)
                 if score > maxweight:
                     maxweight = score
                 tlist.append((score, word))

File src/whoosh/fields.py

     
     The FieldType object supports the following attributes:
     
-    * format (fields.Format): the storage format for the field's contents.
+    * format (formats.Format): the storage format for the field's contents.
     
     * analyzer (analysis.Analyzer): the analyzer to use to turn text into
       terms.
     
-    * vector (fields.Format): the storage format for the field's vectors
+    * vector (formats.Format): the storage format for the field's vectors
       (forward index), or None if the field should not store vectors.
     
     * scorable (boolean): whether searches against this field may be scored.
     return schema
 
 
+def merge_fielddict(d1, d2):
+    keyset = set(d1.keys()) | set(d2.keys())
+    out = {}
+    for name in keyset:
+        field1 = d1.get(name)
+        field2 = d2.get(name)
+        if field1 and field2 and field1 != field2:
+            raise Exception("Inconsistent field %r: %r != %r"
+                            % (name, field1, field2))
+        out[name] = field1 or field2
+    return out
+
+
+def merge_schema(s1, s2):
+    schema = Schema()
+    schema._fields = merge_fielddict(s1._fields, s2._fields)
+    schema._dyn_fields = merge_fielddict(s1._dyn_fields, s2._dyn_fields)
+    return schema
+
+
+def merge_schemas(schemas):
+    schema = schemas[0]
+    for i in xrange(1, len(schemas)):
+        schema = merge_schema(schema, schemas[i])
+    return schema
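
# Usage sketch for the helpers above (schemas are illustrative):
#
#     s1 = fields.Schema(id=fields.ID(stored=True), body=fields.TEXT)
#     s2 = fields.Schema(id=fields.ID(stored=True), tag=fields.KEYWORD)
#     merged = merge_schemas([s1, s2])
#     # "merged" has id, body, and tag; two conflicting definitions of the
#     # same field name would raise an Exception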

File src/whoosh/filedb/fieldcache.py

     return arry
 
 
+def make_array(typecode, size=0, default=None):
+    if typecode.lower() == "q":
+        # Python does not support arrays of long long; see Issue 1172711
+        if default is not None and size:
+            arry = [default] * size
+        else:
+            arry = []
+    else:
+        if default is not None and size:
+            arry = array(typecode, (default for _ in xrange(size)))
+        else:
+            arry = array(typecode)
+    return arry
+
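# Behavior sketch for make_array (values illustrative):
#
#     make_array("i", 3, 0)  # -> array('i', [0, 0, 0])
#     make_array("q", 3, 0)  # -> [0, 0, 0] (plain list; array() can't
#                            #    hold long longs, per the note above)
#     make_array("B")        # -> array('B')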
+
 class FieldCache(object):
     """Keeps a list of the sorted text values of a field and an array of ints
     where each place in the array corresponds to a document, and the value
         :param default: the value to use for documents without the field.
         """
 
-        self.order = order or array(self.code)
+        self.order = order or make_array(typecode)
+        self.typecode = typecode
+
         self.hastexts = hastexts
         self.texts = None
         if hastexts:
             self.texts = texts or [default]
-        self.typecode = typecode
 
     def __eq__(self, other):
         return (other and self.__class__ is other.__class__
             defaultnum = field.sortable_default()
 
         doccount = ixreader.doc_count_all()
-        # Python does not support arrays of long long see Issue 1172711
-        if typecode.lower() == "q":
-            order = [defaultnum] * doccount
-        else:
-            order = array(typecode, [defaultnum] * doccount)
+        order = make_array(typecode, doccount, defaultnum)
 
         enum = enumerate(field.sortable_values(ixreader, fieldname))
         for i, (text, sortable) in enum:
     def __init__(self, dbfile, size=0, hastexts=True, code="I",
                  default=u('\uFFFF')):
         self.dbfile = dbfile
-        self.order = array(self.code, [0] * size)
         self.hastexts = hastexts
         self.code = code
+        self.order = make_array(code, size, 0)
 
         self.key = 0
         self.keycount = 1
         return cache
 
     def is_loaded(self, key):
-        if key in self.caches:
-            return True
-
-        with self.sharedlock:
-            return key in self.shared_cache
+        return key in self.caches or key in self.shared_cache
 
     def put(self, key, cache, save=True):
         self.caches[key] = cache
 
         if self._file_exists(key):
             try:
-                return self._load(key)
+                fc = self._load(key)
+                self.put(key, fc)
+                return fc
             except (OSError, BadFieldCache):
                 return None
 

File src/whoosh/filedb/filereading.py

         if not cp:
             if save and storage is None:
                 storage = self.storage
-            else:
+            elif not save:
                 storage = None
             cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)
 

File src/whoosh/filedb/filewriting.py

         vpostwriter = self.vpostwriter
         offset = vpostwriter.start(self.schema[fieldname].vector)
         for text, weight, valuestring in vlist:
-            assert isinstance(text, text_type), "%r is not unicode" % text
+            #assert isinstance(text, text_type), "%r is not unicode" % text
             vpostwriter.write(text, weight, valuestring, 0)
-        vpostwriter.finish()
-
+        vpostwriter.finish(inlinelimit=0)
         self.vectorindex.add((docnum, fieldname), offset)
 
     def _add_vector_reader(self, docnum, fieldname, vreader):
             vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(),
                               0)
             vreader.next()
-        vpostwriter.finish()
-
+        vpostwriter.finish(inlinelimit=0)
         self.vectorindex.add((docnum, fieldname), offset)
 
     def _close_all(self):

File src/whoosh/qparser/plugins.py

 
 class WildcardPlugin(TaggingPlugin):
     class WildcardNode(syntax.TextNode):
+        # Note that this node inherits tokenize = False from TextNode,
+        # so the text in this node will not be analyzed... just passed
+        # straight to the query
+
+        # TODO: instead of parsing a "wildcard word", create marker nodes for
+        # individual ? and * characters. This will have to wait for a more
+        # advanced wikiparser-like parser.
+
         qclass = query.Wildcard
 
         def r(self):
     # \u061F = Arabic question mark
     # \u1367 = Ethiopic question mark
     qms = u("\u055E\u061F\u1367")
-    expr = u("(?P<text>\\w*[*?%s](\\w|[*?%s])*)") % (qms, qms)
+    expr = u("(?P<text>(\\w|[-])*[*?%s](\\w|[-*?%s])*)") % (qms, qms)
     nodetype = WildcardNode
 
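# Effect of the widened pattern (sketch; mirrors the new test_dash test
# added below): a hyphenated wildcard word now stays one Wildcard query,
# e.g. u"*ben-hayden*" parses to Wildcard('text', u'*ben-hayden*') instead
# of being split at the dash.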
 

File src/whoosh/searching.py

 from whoosh.compat import (iteritems, itervalues, iterkeys, xrange, text_type,
                            string_type)
 from whoosh.reading import TermNotFound
-from whoosh.support.bitvector import BitSet
+from whoosh.support.bitvector import BitSet, DocIdSet
 from whoosh.util import now, lru_cache
 
 
 
     def documents(self, **kw):
         """Convenience method returns the stored fields of a document
-        matching the given keyword arguments, where the keyword keys are
-        field names and the values are terms that must appear in the field.
+        matching the given keyword arguments, where the keyword keys are field
+        names and the values are terms that must appear in the field.
         
-        Returns a generator of dictionaries containing the
-        stored fields of any documents matching the keyword arguments.
+        Returns a generator of dictionaries containing the stored fields of any
+        documents matching the keyword arguments. If you do not specify any
+        arguments (``Searcher.documents()``), this method will yield **all**
+        documents.
         
         >>> for stored_fields in searcher.documents(emailto=u"matt@whoosh.ca"):
         ...   print "Email subject:", stored_fields['subject']
         subqueries = []
         for key, value in iteritems(kw):
             subqueries.append(query.Term(key, value))
-        return query.And(subqueries).normalize()
+        if subqueries:
+            q = query.And(subqueries).normalize()
+        else:
+            q = query.Every()
+        return q
 
     def document_number(self, **kw):
         """Returns the document number of the document matching the given
     def document_numbers(self, **kw):
         """Returns a generator of the document numbers for documents matching
         the given keyword arguments, where the keyword keys are field names and
-        the values are terms that must appear in the field.
+        the values are terms that must appear in the field. If you do not
+        specify any arguments (``Searcher.document_numbers()``), this method
+        will yield **all** document numbers.
         
         >>> docnums = list(searcher.document_numbers(emailto="matt@whoosh.ca"))
         """
 
-        if len(kw) == 0:
-            return []
-
         self._kw_to_text(kw)
         return self.docs_for_query(self._query_for_kw(kw))
 
     def _filter_to_comb(self, obj):
         if obj is None:
             return None
-        if isinstance(obj, (set, Bits)):
+        if isinstance(obj, (set, DocIdSet)):
             c = obj
         elif isinstance(obj, Results):
             c = obj.docset
                 yield docnum
 
     def search(self, q, limit=10, sortedby=None, reverse=False, groupedby=None,
-               optimize=True, filter=None, mask=None, groupids=True,
-               terms=False):
+               optimize=True, filter=None, mask=None, terms=False,
+               maptype=None):
         """Runs the query represented by the ``query`` object and returns a
         Results object.
         
             will only contain documents that are also in the filter object.
         :param mask: a query, Results object, or set of docnums. The results
             will not contain documents that are also in the mask object.
-        :param groupids: by default, faceting groups map keys to lists of
-            document numbers associated with that key. To map to a simple count
-            of the number of documents instead of a list, use
-            ``groupids=False``.
         :param terms: if True, record which terms were found in each matching
             document. You can use :meth:`Results.contains_term` or
             :meth:`Hit.contains_term` to check whether a hit contains a
             particular term.
+        :param maptype: by default, the result of faceting with ``groupedby``
+            is a dictionary mapping group names to ordered lists of document
+            numbers in the group. You can pass a
+            :class:`whoosh.sorting.FacetMap` subclass to this keyword argument
+            to specify a different (usually faster) method for grouping. For
+            example, ``maptype=sorting.Count`` would store only the count of
+            documents in each group, instead of the full list of document IDs.
         :rtype: :class:`Results`
         """
 
             raise ValueError("limit must be >= 1")
 
         collector = Collector(limit=limit, usequality=optimize,
-                              groupedby=groupedby, groupids=groupids,
-                              terms=terms)
+                              groupedby=groupedby, terms=terms,
+                              maptype=maptype)
 
         if sortedby:
             return collector.sort(self, q, sortedby, reverse=reverse,
     """
 
     def __init__(self, limit=10, usequality=True, groupedby=None,
-                 groupids=True, timelimit=None, greedy=False, terms=False,
-                 replace=10):
+                 timelimit=None, greedy=False, terms=False, replace=10,
+                 maptype=None):
         """
         :param limit: the maximum number of hits to collect. If this is None,
             collect all hits.
         :param usequality: whether to use block quality optimizations when
             available. This is mostly useful for debugging purposes.
         :param groupedby: see :doc:`/facets` for information.
-        :param groupids: if True, saves lists of document IDs for facets. If
-            False, only saves a count of the number of documents in each group.
         :param timelimit: the maximum amount of time (in possibly fractional
             seconds) to allow for searching. If the search takes longer than
             this, it will raise a ``TimeLimit`` exception.
         :param greedy: if ``True``, the collector will finish adding the most
             recent hit before raising the ``TimeLimit`` exception.
         :param terms: if ``True``, record which terms matched in each document.
+        :param maptype: a :class:`whoosh.sorting.FacetMap` type to use for all
+            facets that don't specify their own.
         """
 
         self.limit = limit
         self.replace = replace
         self.timelimit = timelimit
         self.greedy = greedy
-        self.groupids = groupids
+        self.maptype = maptype
         self.termlists = defaultdict(set) if terms else None
 
         self.facets = None
         return scorefn
 
     def _set_categorizers(self, searcher, offset):
-        groups = self.groups
         if self.facets:
-            self.categorizers = dict((name, facet.categorizer(searcher))
-                                     for name, facet in self.facets.items())
-
-            for name, catter in self.categorizers.items():
-                if self.groupids and name not in groups:
-                    groups[name] = defaultdict(list)
-                elif name not in groups:
-                    groups[name] = defaultdict(int)
-
+            self.categorizers = {}
+            for name, facet in self.facets.items():
+                catter = facet.categorizer(searcher)
                 catter.set_searcher(searcher, offset)
+                self.categorizers[name] = catter
 
     def _set_filters(self, allow, restrict):
         if allow:
             self.timer.start()
 
     def _reset(self):
-        self.groups = {}
+        self.facetmaps = {}
         self.items = []
         self.timedout = False
         self.runtime = -1
         self.minscore = None
+        if self.facets:
+            self.facetmaps = dict((facetname, facet.map(self.maptype))
+                                  for facetname, facet in self.facets.items())
+        else:
+            self.facetmaps = {}
 
     def _timestop(self):
         # Called by the Timer when the time limit expires. Set an attribute on
         self.timer = None
         self.timedout = True
 
-    def _add_to_group(self, name, key, offsetid):
-        if self.groupids:
-            self.groups[name][key].append(offsetid)
-        else:
-            self.groups[name][key] += 1
-
-    def collect(self, id, offsetid):
+    def collect(self, id, offsetid, sortkey):
         docset = self.docset
         if docset is not None:
             docset.add(offsetid)
 
         if self.facets is not None:
-            add = self._add_to_group
             for name, catter in self.categorizers.items():
+                add = self.facetmaps[name].add
                 if catter.allow_overlap:
                     for key in catter.keys_for_id(id):
-                        add(name, catter.key_to_name(key), offsetid)
+                        add(catter.key_to_name(key), offsetid, sortkey)
                 else:
                     key = catter.key_to_name(catter.key_for_id(id))
-                    add(name, key, offsetid)
+                    add(key, offsetid, sortkey)
 
     def search(self, searcher, q, allow=None, restrict=None):
         """Top-level method call which uses the given :class:`Searcher` and
             if ((not allow or offsetid in allow)
                 and (not restrict or offsetid not in restrict)):
                 # Collect and yield this document
-                collect(id, offsetid)
                 if scorefn:
                     score = scorefn(matcher)
+                    collect(id, offsetid, score)
                 else:
                     score = matcher.score()
+                    collect(id, offsetid, 0 - score)
                 yield (score, offsetid)
 
             # If recording terms, add the document to the termlists
             if ((not allow or offsetid in allow)
                 and (not restrict or offsetid not in restrict)):
                 # Collect and yield this document
-                collect(id, offsetid)
-                yield (keyfn(id), offsetid)
+                key = keyfn(id)
+                collect(id, offsetid, key)
+                yield (key, offsetid)
 
             # Check whether the time limit expired
             if timelimited and self.timedout:
             items = sorted(self.items, reverse=reverse)
 
         return Results(self.searcher, self.q, items, self.docset,
-                       groups=self.groups, runtime=self.runtime,
+                       facetmaps=self.facetmaps, runtime=self.runtime,
                        filter=self.allow, mask=self.restrict,
                        termlists=self.termlists)
 
     so keeps all files used by it open.
     """
 
-    def __init__(self, searcher, q, top_n, docset, groups=None, runtime= -1,
+    def __init__(self, searcher, q, top_n, docset, facetmaps=None, runtime= -1,
                  filter=None, mask=None, termlists=None, highlighter=None):
         """
         :param searcher: the :class:`Searcher` object that produced these
         self.q = q
         self.top_n = top_n
         self.docset = docset
-        self._groups = groups or {}
+        self._facetmaps = facetmaps or {}
         self.runtime = runtime
         self._filter = filter
         self._mask = mask
 
         return self.searcher.stored_fields(self.top_n[n][1])
 
-    def groups(self, name):
-        """If you generating groupings for the results by using the `groups`
-        keyword to the `search()` method, you can use this method to retrieve
-        the groups.
+    def facet_names(self):
+        """Returns the available facet names, for use with the ``groups()``
+        method.
+        """
+
+        return self._facetmaps.keys()
+
+    def groups(self, name=None):
+        """If you generated facet groupings for the results using the
+        `groupedby` keyword argument to the ``search()`` method, you can use
+        this method to retrieve the groups. You can use the ``facet_names()``
+        method to get the list of available facet names.
         
-        >>> results = searcher.search(my_query, groups=["tag"])
+        >>> results = searcher.search(my_query, groupedby=["tag", "price"])
+        >>> results.facet_names()
+        ["tag", "price"]
         >>> results.groups("tag")
+        {"new": [12, 1, 4], "apple": [3, 10, 5], "search": [11]}
         
-        Returns a dictionary mapping category names to lists of document IDs.
+        If you only used one facet, you can call the method without a facet
+        name to get the groups for the facet.
         
-        >>> groups = results.groups("tag")
-        >>> groups['new']
-        set([1, 4, 12])
+        >>> results = searcher.search(my_query, groupedby="tag")
+        >>> results.groups()
+        {"new": [12, 1, 4], "apple": [3, 10, 5, 0], "search": [11]}
+        
+        By default, this returns a dictionary mapping category names to a list
+        of document numbers, in the same relative order as they appear in the
+        results.
+        
+        >>> results = mysearcher.search(myquery, groupedby="tag")
+        >>> docnums = results.groups()
+        >>> docnums['new']
+        [12, 1, 4]
         
         You can then use :meth:`Searcher.stored_fields` to get the stored
         fields associated with a document ID.
+        
+        If you specified a different ``maptype`` for the facet when you
+        searched, the values in the dictionary depend on the
+        :class:`whoosh.sorting.FacetMap`.
+        
+        >>> myfacet = sorting.FieldFacet("tag", maptype=sorting.Count)
+        >>> results = mysearcher.search(myquery, groupedby=myfacet)
+        >>> counts = results.groups()
+        {"new": 3, "apple": 4, "search": 1}
         """
 
-        if name not in self._groups:
-            raise KeyError("%r not in group names %r"
-                           % (name, self._groups.keys()))
-        return dict(self._groups[name])
+        if (name is None or name == "facet") and len(self._facetmaps) == 1:
+            name = list(self._facetmaps.keys())[0]
+        elif name not in self._facetmaps:
+            raise KeyError("%r not in facet names %r"
+                           % (name, self.facet_names()))
+        return self._facetmaps[name].as_dict()
 
     def _load_docs(self):
         # If self.docset is None, that means this results object was created

File src/whoosh/sorting.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from array import array
+from collections import defaultdict
 
-from whoosh.compat import string_type, u, xrange
+from whoosh.compat import string_type, u, xrange, iteritems
 from whoosh.fields import DEFAULT_LONG
 from whoosh.support.times import (long_to_datetime, datetime_to_long,
                                   timedelta_to_usecs)
     """Base class for "facets", aspects that can be sorted/faceted.
     """
 
+    maptype = None
+
     def categorizer(self, searcher):
         """Returns a :class:`Categorizer` corresponding to this facet.
         """
 
         raise NotImplementedError
 
+    def map(self, default=None):
+        t = self.maptype
+        if t is None:
+            t = default
+
+        if t is None:
+            return OrderedList()
+        elif type(t) is type:
+            return t()
+        else:
+            return t
+
+    def default_name(self):
+        return "facet"
+
 
 class Categorizer(object):
     """Base class for categorizer objects which compute a key value for a
     This facet returns different categorizers based on the field type.
     """
 
-    def __init__(self, fieldname, reverse=False, allow_overlap=False):
+    def __init__(self, fieldname, reverse=False, allow_overlap=False,
+                 maptype=None):
         """
         :param fieldname: the name of the field to sort/facet on.
         :param reverse: if True, when sorting, reverse the sort order of this
         self.fieldname = fieldname
         self.reverse = reverse
         self.allow_overlap = allow_overlap
+        self.maptype = maptype
+
+    def default_name(self):
+        return self.fieldname
 
     def categorizer(self, searcher):
         from whoosh.fields import NUMERIC, DATETIME
     """Sorts/facets based on the results of a series of queries.
     """
 
-    def __init__(self, querydict, other=None, allow_overlap=False):
+    def __init__(self, querydict, other=None, allow_overlap=False,
+                 maptype=None):
         """
         :param querydict: a dictionary mapping keys to
             :class:`whoosh.query.Query` objects.
 
         self.querydict = querydict
         self.other = other
+        self.maptype = maptype
 
     def categorizer(self, searcher):
         return self.QueryCategorizer(self.querydict, self.other)
     at the end.
     """
 
-    def __init__(self, fieldname, start, end, gap, hardend=False):
+    def __init__(self, fieldname, start, end, gap, hardend=False,
+                 maptype=None):
         """
         :param fieldname: the numeric field to sort/facet on.
         :param start: the start of the entire range.
         self.end = end
         self.gap = gap
         self.hardend = hardend
+        self.maptype = maptype
         self._queries()
 
+    def default_name(self):
+        return self.fieldname
+
     def _rangetype(self):
         from whoosh import query
 
         lengths = FunctionFacet(fn)
     """
 
-    def __init__(self, fn):
+    def __init__(self, fn, maptype=None):
         self.fn = fn
+        self.maptype = maptype
 
     def categorizer(self, searcher):
         return self.FunctionCategorizer(searcher, self.fn)
     if one is supplied).
     """
 
-    def __init__(self, fieldname, allow_overlap=False, split_fn=None):
+    def __init__(self, fieldname, allow_overlap=False, split_fn=None,
+                 maptype=None):
         """
         :param fieldname: the name of the stored field.
         :param allow_overlap: if True, when grouping, allow documents to appear
         self.fieldname = fieldname
         self.allow_overlap = allow_overlap
         self.split_fn = split_fn
+        self.maptype = maptype
+
+    def default_name(self):
+        return self.fieldname
 
     def categorizer(self, searcher):
         return self.StoredFieldCategorizer(self.fieldname, self.allow_overlap,
                          "n-z": TermRange("name", "n", "z")})
     """
 
-    def __init__(self, items=None):
+    def __init__(self, items=None, maptype=None):
         self.facets = {}
         if items:
             for item in items:
                 self._add(item)
+        self.maptype = maptype
 
     @classmethod
     def from_sortedby(cls, sortedby):
         elif isinstance(groupedby, string_type):
             facets.add_field(groupedby)
         elif isinstance(groupedby, FacetType):
-            facets.add_facet("facet", groupedby)
+            facets.add_facet(groupedby.default_name(), groupedby)
         elif isinstance(groupedby, (list, tuple)):
             for item in groupedby:
                 facets.add_facets(cls.from_groupedby(item))
 
         return facets
 
+    def names(self):
+        """Returns an iterator of the facet names in this object.
+        """
+
+        return iter(self.facets)
+
     def items(self):
         """Returns a list of (facetname, facetobject) tuples for the facets in
         this object.
 
         return self.facets.items()
 
-    def add_field(self, fieldname, allow_overlap=False):
+    def add_field(self, fieldname, **kwargs):
         """Adds a :class:`FieldFacet` for the given field name (the field name
         is automatically used as the facet name).
         """
 
-        self.facets[fieldname] = FieldFacet(fieldname,
-                                            allow_overlap=allow_overlap)
+        self.facets[fieldname] = FieldFacet(fieldname, **kwargs)
         return self
 
-    def add_query(self, name, querydict, other=None, allow_overlap=False):
+    def add_query(self, name, querydict, **kwargs):
         """Adds a :class:`QueryFacet` under the given ``name``.
         
         :param name: a name for the facet.
             :class:`whoosh.query.Query` objects.
         """
 
-        self.facets[name] = QueryFacet(querydict, other=other,
-                                       allow_overlap=allow_overlap)
+        self.facets[name] = QueryFacet(querydict, **kwargs)
         return self
 
     def add_facet(self, name, facet):
         return self
 
 
+# Objects for holding facet groups
+
+class FacetMap(object):
+    """Base class for objects holding the results of grouping search results by
+    a Facet. Use an object's ``as_dict()`` method to access the results.
+    
+    You can pass a subclass of this to the ``maptype`` keyword argument when
+    creating a ``FacetType`` object to specify what information the facet
+    should record about the group. For example::
+    
+        # Record each document in each group in its sorted order
+        myfacet = FieldFacet("size", maptype=OrderedList)
+        
+        # Record only the count of documents in each group
+        myfacet = FieldFacet("size", maptype=Count)
+    """
+
+    def add(self, groupname, docid, sortkey):
+        """Adds a document to the facet results.
+        
+        :param groupname: the name of the group to add this document to.
+        :param docid: the document number of the document to add.
+        :param sortkey: a value representing the sort position of the document
+            in the full results.
+        """
+
+        raise NotImplementedError
+
+    def as_dict(self):
+        """Returns a dictionary object mapping group names to
+        implementation-specific values. For example, the value might be a list
+        of document numbers, or an integer representing the number of documents
+        in the group.
+        """
+
+        raise NotImplementedError
+
+
+class OrderedList(FacetMap):
+    """Stores a list of document numbers for each group, in the same order as
+    they appear in the search results.
+    
+    The ``as_dict`` method returns a dictionary mapping group names to lists
+    of document numbers.
+    """
+
+    def __init__(self):
+        self.dict = defaultdict(list)
+
+    def add(self, groupname, docid, sortkey):
+        self.dict[groupname].append((sortkey, docid))
+
+    def as_dict(self):
+        d = {}
+        for key, items in iteritems(self.dict):
+            d[key] = [docnum for _, docnum in sorted(items)]
+        return d
+
+
+class UnorderedList(FacetMap):
+    """Stores a list of document numbers for each group, in arbitrary order.
+    This is slightly faster and uses less memory than
+    :class:`OrderedList` if you don't care about the ordering of the
+    documents within groups.
+    
+    The ``as_dict`` method returns a dictionary mapping group names to lists
+    of document numbers.
+    """
+
+    def __init__(self):
+        self.dict = defaultdict(list)
+
+    def add(self, groupname, docid, sortkey):
+        self.dict[groupname].append(docid)
+
+    def as_dict(self):
+        return dict(self.dict)
+
+
+class Count(FacetMap):
+    """Stores the number of documents in each group.
+    
+    The ``as_dict`` method returns a dictionary mapping group names to
+    integers.
+    """
+
+    def __init__(self):
+        self.dict = defaultdict(int)
+
+    def add(self, groupname, docid, sortkey):
+        self.dict[groupname] += 1
+
+    def as_dict(self):
+        return dict(self.dict)
+
+
+class Best(FacetMap):
+    """Stores the "best" document in each group (that is, the one that appears
+    highest in the results).
+    
+    The ``as_dict`` method returns a dictionary mapping group names to
+    document numbers.
+    """
+
+    def __init__(self):
+        self.bestids = {}
+        self.bestkeys = {}
+
+    def add(self, groupname, docid, sortkey):
+        if groupname not in self.bestids or sortkey < self.bestkeys[groupname]:
+            self.bestids[groupname] = docid
+            self.bestkeys[groupname] = sortkey
+
+    def as_dict(self):
+        return self.bestids
+
+
 #
 #
 #

File src/whoosh/support/base85.py

+"""
+This module contains generic base85 encoding and decoding functions. The
+whoosh.support.numeric module contains faster variants for encoding and
+decoding integers.
+
+Modified from:
+http://paste.lisp.org/display/72815
+"""
+
+import struct
+
+from whoosh.compat import xrange
+
+
+# Instead of using the character set from the ascii85 algorithm, I put the
+# characters in order so that the encoded text sorts properly (my life would be
+# a lot easier if they had just done that from the start)
+b85chars = ("!$%&*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            "^_abcdefghijklmnopqrstuvwxyz{|}~")
+b85dec = {}
+for i in range(len(b85chars)):
+    b85dec[b85chars[i]] = i
+
+
+# Integer encoding and decoding functions
+
+def to_base85(x, islong=False):
+    "Encodes the given integer using base 85."
+
+    size = 10 if islong else 5
+    rems = ""
+    for i in xrange(size):
+        rems = b85chars[x % 85] + rems
+        x //= 85
+    return rems
+
+
+def from_base85(text):
+    "Decodes the given base 85 text into an integer."
+
+    acc = 0
+    for c in text:
+        acc = acc * 85 + b85dec[c]
+    return acc
+
+
+# Bytes encoding and decoding functions
+
+def b85encode(text, pad=False):
+    l = len(text)
+    r = l % 4
+    if r:
+        text += '\0' * (4 - r)
+    longs = len(text) >> 2
+    out = []
+    words = struct.unpack('>' + 'L' * longs, text[0:longs * 4])
+    for word in words:
+        rems = [0, 0, 0, 0, 0]
+        for i in range(4, -1, -1):
+            rems[i] = b85chars[word % 85]
+            word //= 85  # floor division; also correct under Python 3
+        out.extend(rems)
+
+    out = ''.join(out)
+    if pad:
+        return out
+
+    # Trim padding
+    olen = l % 4
+    if olen:
+        olen += 1
+    olen += l // 4 * 5
+    return out[0:olen]
+
+
+def b85decode(text):
+    l = len(text)
+    out = []
+    for i in range(0, len(text), 5):
+        chunk = text[i:i + 5]
+        acc = 0
+        for j in range(len(chunk)):
+            try:
+                acc = acc * 85 + b85dec[chunk[j]]
+            except KeyError:
+                raise TypeError('Bad base85 character at byte %d' % (i + j))
+        if acc > 4294967295:
+            raise OverflowError('Base85 overflow in hunk starting at byte %d' % i)
+        out.append(acc)
+
+    # Pad final chunk if necessary
+    cl = l % 5
+    if cl:
+        acc *= 85 ** (5 - cl)
+        if cl > 1:
+            acc += 0xffffff >> (cl - 2) * 8
+        out[-1] = acc
+
+    out = struct.pack('>' + 'L' * ((l + 4) // 5), *out)
+    if cl:
+        out = out[:-(5 - cl)]
+
+    return out
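
# Round-trip sketch for the integer helpers (values illustrative):
#
#     x = 123456789
#     s = to_base85(x)             # always 5 characters for islong=False
#     assert from_base85(s) == x
#     # the custom character order makes encodings sort like the integers:
#     assert to_base85(10) < to_base85(1000)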

File src/whoosh/support/numeric.py

 import struct
 from array import array
 
-from whoosh.compat import long_type, xrange, PY3
+from whoosh.compat import long_type, PY3
+from whoosh.support.base85 import to_base85, from_base85
 
 _istruct = struct.Struct(">i")
 _qstruct = struct.Struct(">q")
         yield (starttext, endtext)
 
 
-# Functions for encoding numeric values as sequences of 7-bit ascii characters
-
-# Instead of using the character set from the ascii85 algorithm, I put the
-# characters in order so that the encoded text sorts properly (my life would be
-# a lot easier if they had just done that from the start)
-_b85chars = ("!$%&*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-             "^_abcdefghijklmnopqrstuvwxyz{|}~")
-_b85dec = {}
-for i in range(len(_b85chars)):
-    _b85dec[_b85chars[i]] = i
-
-
-def to_base85(x, islong=False):
-    "Encodes the given integer using base 85."
-
-    size = 10 if islong else 5
-    rems = ""
-    for i in xrange(size):
-        rems = _b85chars[x % 85] + rems
-        x //= 85
-    return rems
-
-
-def from_base85(text):
-    "Decodes the given base 85 text into an integer."
-
-    acc = 0
-    for c in text:
-        acc = acc * 85 + _b85dec[c]
-    return acc
-
-
 # Older, slower number-to-ascii functions
 
 def to_7bit(x, islong):

File src/whoosh/writing.py

File contents unchanged.

File stress/test_bigfacet.py

 
 tagcount = 100
 doccount = 500000
-dirname = "tagindex"
+dirname = "testindex"
 
 schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence()))
 

File tests/test_parsing.py

     assert_equal(q[1].fieldname, "title")
     assert_equal(q[0].text, "*john*")
     assert_equal(q[1].text, "blog")
+
+def test_dash():
+    ana = analysis.StandardAnalyzer("[ \t\r\n()*?]+")
+    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
+                           text=fields.TEXT(analyzer=ana), time=fields.ID)
+    qtext = u("*Ben-Hayden*")
+
+    qp = default.QueryParser("text", schema)
+    q = qp.parse(qtext)
+    assert_equal(repr(q), "Wildcard('text', u'*ben-hayden*')")
+
+    qp = default.MultifieldParser(["title", "text", "time"], schema)
+    q = qp.parse(qtext)
+    assert_equal(repr(q), "Or([Wildcard('title', u'*ben-hayden*'), Wildcard('text', u'*ben-hayden*'), Wildcard('time', u'*Ben-Hayden*')])")
+
+

File tests/test_searching.py

 def test_docs_method():
     ix = make_index()
     with ix.searcher() as s:
-        assert_equal(_get_keys(s.documents(name="yellow")), [u("A"), u("E")])
-        assert_equal(_get_keys(s.documents(value="red")), [u("A"), u("D")])
+        assert_equal(_get_keys(s.documents(name="yellow")), ["A", "E"])
+        assert_equal(_get_keys(s.documents(value="red")), ["A", "D"])
+        assert_equal(_get_keys(s.documents()), ["A", "B", "C", "D", "E"])
 
 def test_term():
     _run_query(Term("name", u("yellow")), [u("A"), u("E")])

File tests/test_sorting.py

             multiprocessing.Process.__init__(self)
             self.storage = storage
             self.indexname = indexname
-            
+
         def run(self):
             ix = self.storage.open_index(self.indexname)
             with ix.searcher() as s:
 
 
 docs = ({"id": u("zulu"), "num": 100, "tag": u("one"), "frac": 0.75},
-        {"id": u("xray"), "num": -5, "tag": u("three"), "frac": 2.0},
+        {"id": u("xray"), "num":-5, "tag": u("three"), "frac": 2.0},
         {"id": u("yankee"), "num": 3, "tag": u("two"), "frac": 5.5},
-        
+
         {"id": u("alfa"), "num": 7, "tag": u("three"), "frac": 2.25},
         {"id": u("tango"), "num": 2, "tag": u("two"), "frac": 1.75},
-        {"id": u("foxtrot"), "num": -800, "tag": u("two"), "frac": 3.25},
-        
+        {"id": u("foxtrot"), "num":-800, "tag": u("two"), "frac": 3.25},
+
         {"id": u("sierra"), "num": 1, "tag": u("one"), "frac": 4.75},
         {"id": u("whiskey"), "num": 0, "tag": u("three"), "frac": 5.25},
         {"id": u("bravo"), "num": 582045, "tag": u("three"), "frac": 1.25},
 def make_multi_index(ix):
     for i in xrange(0, len(docs), 3):
         w = ix.writer()
-        for doc in docs[i:i+3]:
+        for doc in docs[i:i + 3]:
             w.add_document(ev=u("a"), **doc)
         w.commit(merge=False)
 
 def try_sort(sortedby, key, q=None, limit=None, reverse=False):
     if q is None: q = query.Term("ev", u("a"))
-    
+
     correct = [d["id"] for d in sorted(docs, key=key, reverse=reverse)][:limit]
-    
+
     for fn in (make_single_index, make_multi_index):
         with TempIndex(get_schema()) as ix:
             fn(ix)
         w.add_document(tag=u("juliet"))
         w.add_document(tag=u("romeo"))
         w.commit()
-        
+
         with ix.reader() as r:
             _ = r.fieldcache("tag")
             assert_equal(list(r.lexicon("tag")), ["alfa", "juliet", "romeo", "sierra"])
 
 # 
 
+def test_persistent_cache():
+    schema = fields.Schema(id=fields.ID(stored=True))
+    st = RamStorage()
+    ix = st.create_index(schema)
+    with ix.writer() as w:
+        for term in u("charlie alfa echo bravo delta").split():
+            w.add_document(id=term)
+
+    ix = st.open_index()
+    with ix.reader() as r:
+        _ = r.fieldcache("id")
+        del _
+
+    ix = st.open_index()
+    with ix.reader() as r:
+        assert r.fieldcache_available("id")
+        assert not r.fieldcache_loaded("id")
+        fc = r.fieldcache("id")
+        assert r.fieldcache_loaded("id")
+        assert_equal(list(fc.order), [3, 1, 5, 2, 4])
+        assert_equal(list(fc.texts), [u('\uffff'), u'alfa', u'bravo',
+                                      u'charlie', u'delta', u'echo'])
+
 def test_float_cache():
     schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(type=float))
     with TempIndex(schema, "floatcache") as ix:
         w = ix.writer()
         w.add_document(id=1, num=1.5)
-        w.add_document(id=2, num=-8.25)
+        w.add_document(id=2, num= -8.25)
         w.add_document(id=3, num=0.75)
         w.commit()
-        
+
         with ix.reader() as r:
             r.fieldcache("num")
+            assert r.fieldcache_loaded("num")
             r.unload_fieldcache("num")
-            
+            assert not r.fieldcache_loaded("num")
+            assert r.fieldcache_available("num")
+
             fc = r.fieldcache("num")
             assert not fc.hastexts
             assert_equal(fc.texts, None)
     with TempIndex(schema, "longcache") as ix:
         w = ix.writer()
         w.add_document(id=1, num=2858205080241)
-        w.add_document(id=2, num=-3572050858202)
+        w.add_document(id=2, num= -3572050858202)
         w.add_document(id=3, num=4985020582043)
         w.commit()
-        
+
         with ix.reader() as r:
             r.fieldcache("num")
+            assert r.fieldcache_loaded("num")
             r.unload_fieldcache("num")
-            
+            assert not r.fieldcache_loaded("num")
+            assert r.fieldcache_available("num")
+
             fc = r.fieldcache("num")
             assert not fc.hastexts
             assert_equal(fc.texts, None)
         make_single_index(ix)
         r1 = ix.reader()
         fc1 = r1.fieldcache("id")
-        
+
         r2 = ix.reader()
         fc2 = r2.fieldcache("id")
-        
+
         assert fc1 is fc2
-        
+
         r3 = ix.reader()
         assert r3.fieldcache_loaded("id")
-        
+
         r1.close()
         r2.close()
         del r1, fc1, r2, fc2
         import gc
         gc.collect()
-        
+
         assert not r3.fieldcache_loaded("id")
         r3.close()
 
         for char in domain:
             w.add_document(key=char)
         w.commit()
-        
+
         tasks = [MPFCTask(ix.storage, ix.indexname) for _ in xrange(4)]
         for task in tasks:
             task.start()
     try_sort("id", lambda d: d["id"])
     try_sort("id", lambda d: d["id"], limit=5)
     try_sort("id", lambda d: d["id"], reverse=True)
-    try_sort("id",  lambda d: d["id"], limit=5, reverse=True)
+    try_sort("id", lambda d: d["id"], limit=5, reverse=True)
 
 def test_multisort():
     mf = sorting.MultiFacet(["tag", "id"])
         w.add_document(id=2)
         w.add_document(id=3)
         w.commit()
-        
+
         with ix.searcher() as s:
             r = s.search(query.Every(), sortedby="key")
             assert_equal([h["id"] for h in r], [1, 2, 3])
     with TempIndex(schema, "pagesorted") as ix:
         domain = list(u("abcdefghijklmnopqrstuvwxyz"))
         random.shuffle(domain)
-        
+
         w = ix.writer()
         for char in domain:
             w.add_document(key=char)
         w.commit()
-        
+
         with ix.searcher() as s:
             r = s.search(query.Every(), sortedby="key", limit=5)
             assert_equal(r.scored_length(), 5)
             assert_equal(len(r), s.doc_count_all())
-            
+
             rp = s.search_page(query.Every(), 1, pagelen=5, sortedby="key")
             assert_equal("".join([h["key"] for h in rp]), "abcde")
             assert_equal(rp[10:], [])
-            
+
             rp = s.search_page(query.Term("key", "glonk"), 1, pagelen=5, sortedby="key")
             assert_equal(len(rp), 0)
             assert rp.is_last_page()
     w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
     w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
     w.commit(merge=False)
-    
+
     with ix.searcher() as s:
         facet = sorting.MultiFacet(["b", sorting.ScoreFacet()])
         r = s.search(q=query.Term("a", u("alfa")), sortedby=facet)
                     w.add_document(id=count, text=u(" ").join((w1, w2, w3, w4)))
                     count += 1
     w.commit()
-    
+
     def fn(searcher, docnum):
         v = dict(searcher.vector_as("frequency", docnum, "text"))
         # Give high score to documents that have equal number of "alfa"
         # and "bravo". Negate value so higher values sort first
         return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0))
-    
+
     with ix.searcher() as s:
         q = query.And([query.Term("text", u("alfa")), query.Term("text", u("bravo"))])
-        
+
         fnfacet = sorting.FunctionFacet(fn)
         r = s.search(q, sortedby=fnfacet)
         texts = [hit["text"] for hit in r]
     w.add_document(id=5, v1=2, v2=50)
     w.add_document(id=6, v1=1, v2=200)
     w.commit()
-    
+
     with ix.searcher() as s:
         mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True)
         r = s.search(query.Every(), sortedby=mf)
         w = ix.writer()
         w.add_document(id=i, v=ltr)
         w.commit(merge=False)
-    
+
     with ix.searcher() as s:
         q1 = query.TermRange("v", "a", "c")
         q2 = query.TermRange("v", "d", "f")
         q3 = query.TermRange("v", "g", "i")
-        
+
         assert_equal([hit["id"] for hit in s.search(q1)], [1, 2, 4])
         assert_equal([hit["id"] for hit in s.search(q2)], [5, 7, 8])
         assert_equal([hit["id"] for hit in s.search(q3)], [0, 3, 6])
-        
+
         facet = sorting.QueryFacet({"a-c": q1, "d-f": q2, "g-i": q3})
         r = s.search(query.Every(), groupedby=facet)
         # If you specify a facet without a name, it's automatically called
         for i, ltr in enumerate(domain):
             v = "%s %s" % (ltr, domain[0 - i])
             w.add_document(v=v)
-    
+
     with ix.searcher() as s:
         q1 = query.TermRange("v", "a", "c")
         q2 = query.TermRange("v", "d", "f")
         q3 = query.TermRange("v", "g", "i")
-        
+
         facets = sorting.Facets()
         facets.add_query("myfacet", {"a-c": q1, "d-f": q2, "g-i": q3}, allow_overlap=True)
         r = s.search(query.Every(), groupedby=facets)
     w.add_document(id=3, tag=u("bravo"))
     w.add_document(id=4)
     w.commit()
-    
+
     with ix.searcher() as s:
         r = s.search(query.Every(), groupedby="tag")
         assert_equal(r.groups("tag"), {None: [2, 4], 'bravo': [3], 'alfa': [0, 1]})
     w.add_document(id=3, tag=0)
     w.add_document(id=4)
     w.commit()
-    
+
     with ix.searcher() as s:
         r = s.search(query.Every(), groupedby="tag")
         assert_equal(r.groups("tag"), {None: [2, 4], 0: [3], 1: [0, 1]})
     w.add_document(id=3, date=d2)
     w.add_document(id=4)
     w.commit()
-    
+
     with ix.searcher() as s:
         r = s.search(query.Every(), groupedby="date")
-        assert_equal(r.groups("date"),  {d1: [0, 1], d2: [3], None: [2, 4]})
+        assert_equal(r.groups("date"), {d1: [0, 1], d2: [3], None: [2, 4]})
 
 def test_range_facet():
     schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC)
     w.add_document(id=4, price=500)
     w.add_document(id=5, price=125)
     w.commit()
-    
+
     with ix.searcher() as s:
         rf = sorting.RangeFacet("price", 0, 1000, 100)
         r = s.search(query.Every(), groupedby={"price": rf})
     for i in range(10):
         w.add_document(id=i, num=i)
     w.commit()
-    
+
     with ix.searcher() as s:
-        rf = sorting.RangeFacet("num", 0, 1000, [1,2,3])
+        rf = sorting.RangeFacet("num", 0, 1000, [1, 2, 3])
         r = s.search(query.Every(), groupedby={"num": rf})
         assert_equal(r.groups("num"), {(0, 1): [0],
                                        (1, 3): [1, 2],
     w.add_document(id=4, date=datetime(2001, 1, 8))
     w.add_document(id=5, date=datetime(2001, 1, 6))
     w.commit()
-    
+
     with ix.searcher() as s:
         rf = sorting.DateRangeFacet("date", datetime(2001, 1, 1),
                                     datetime(2001, 1, 20), timedelta(days=5))
 def test_relative_daterange():
     from whoosh.support.relativedelta import relativedelta
     dt = datetime
-    
+
     schema = fields.Schema(id=fields.STORED, date=fields.DATETIME)
     ix = RamStorage().create_index(schema)
     basedate = datetime(2001, 1, 1)
             w.add_document(id=count, date=basedate)
             basedate += timedelta(days=14, hours=16)
             count += 1
-    
+
     with ix.searcher() as s:
         gap = relativedelta(months=1)
         rf = sorting.DateRangeFacet("date", dt(2001, 1, 1), dt(2001, 12, 31), gap)
         w.add_document(id=2, tags=u("charlie delta echo"))
         w.add_document(id=3, tags=u("delta echo alfa"))
         w.add_document(id=4, tags=u("echo alfa bravo"))
-    
+
     with ix.searcher() as s:
         of = sorting.FieldFacet("tags", allow_overlap=True)
         r = s.search(query.Every(), groupedby={"tags": of})
                         == [(u('one'), [0, 6]),
                             (u('three'), [1, 3, 7, 8]),
                             (u('two'), [2, 4, 5])])
-    
+
     check(make_single_index)
     check(make_multi_index)
 
         w.add_document(tag=u("alfa"), size=u("medium"))
         w.add_document(tag=u("bravo"), size=u("medium"))
         w.commit()
-        
+
         correct = {(u('bravo'), u('medium')): [1, 5], (u('alfa'), u('large')): [2],
                    (u('alfa'), u('medium')): [4], (u('alfa'), u('small')): [0],
                    (u('bravo'), u('small')): [3]}
-        
+
         with ix.searcher() as s:
             facet = sorting.MultiFacet(["tag", "size"])
             r = s.search(query.Every(), groupedby={"tag/size" : facet})
         group = groups[i % len(groups)]
         source.append({"key": key, "group": group})
     source.sort(key=lambda x: (x["key"], x["group"]))
-    
+
     sample = source[:]
     random.shuffle(sample)
-    
+
     with TempIndex(schema, "sortfilter") as ix:
         w = ix.writer()
         for i, fs in enumerate(sample):
                 w.commit(merge=False)
                 w = ix.writer()
         w.commit()
-        
+
         fq = query.Term("group", u("bravo"))
-        
+
         with ix.searcher() as s:
             r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=20)
             assert_equal([h.fields() for h in r],
                          [d for d in source if d["group"] == "bravo"][:20])
-            
+
             fq = query.Term("group", u("bravo"))
             r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=None)
             assert_equal([h.fields() for h in r],
                          [d for d in source if d["group"] == "bravo"])
-            
+
         ix.optimize()
-        
+
         with ix.searcher() as s:
             r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=20)
             assert_equal([h.fields() for h in r],
                          [d for d in source if d["group"] == "bravo"][:20])
-            
+
             fq = query.Term("group", u("bravo"))
             r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=None)
             assert_equal([h.fields() for h in r],
     schema = fields.Schema(name=fields.ID(stored=True),
                            price=fields.NUMERIC,
                            quant=fields.NUMERIC)
-    
+
     with TempIndex(schema, "customsort") as ix:
         w = ix.writer()
         w.add_document(name=u("A"), price=200, quant=9)
         w.add_document(name=u("B"), price=250, quant=11)
         w.add_document(name=u("C"), price=200, quant=10)
         w.commit()
-        
+
         with ix.searcher() as s:
             cs = s.sorter()
             cs.add_field("price")
             cs.add_field("quant", reverse=True)
             r = cs.sort_query(query.Every(), limit=None)
             assert_equal([hit["name"] for hit in r], list(u("DCAFBE")))
-            
+
 def test_sorting_function():
     schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True))
     ix = RamStorage().create_index(schema)
                     w.add_document(id=count, text=u(" ").join((w1, w2, w3, w4)))
                     count += 1
     w.commit()
-    
+
     def fn(searcher, docnum):
         v = dict(searcher.vector_as("frequency", docnum, "text"))
         # Sort documents that have equal number of "alfa"
         # and "bravo" first
         return 0 - 1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)
     fnfacet = sorting.FunctionFacet(fn)
-    
+
     with ix.searcher() as s:
         q = query.And([query.Term("text", u("alfa")), query.Term("text", u("bravo"))])
         results = s.search(q, sortedby=fnfacet)
             tks = t.split()
             assert_equal(tks.count("alfa"), tks.count("bravo"))
 
+def test_sorted_groups():
+    schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
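+        # Each document repeats "blah" once more than the previous one, so
+        # documents with higher ids score higher for the term "blah".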
+        w.add_document(a=0, b=u("blah"), c=u("apple"))
+        w.add_document(a=1, b=u("blah blah"), c=u("bear"))
+        w.add_document(a=2, b=u("blah blah blah"), c=u("apple"))
+        w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear"))
+        w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple"))
+        w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear"))
 
-#def test_custom_sort2():
-#    from array import array
-#    from whoosh.searching import Results
-#    
-#    class CustomSort(object):
-#        def __init__(self, *criteria):
-#            self.criteria = criteria
-#            self.arrays = None
-#            
-#        def cache(self, searcher):
-#            self.arrays = []
-#            r = searcher.reader()
-#            for name, reverse in self.criteria:
-#                arry = array("i", [0] * r.doc_count_all())
-#                field = ix.schema[name]
-#                for i, (token, _) in enumerate(field.sortable_values(r, name)):
-#                    if reverse: i = 0 - i
-#                    postings = r.postings(name, token)
-#                    for docid in postings.all_ids():
-#                        arry[docid] = i
-#                self.arrays.append(arry)
-#                
-#        def key_fn(self, docnum):
-#            return tuple(arry[docnum] for arry in self.arrays)
-#        
-#        def sort_query(self, searcher, q):
-#            if self.arrays is None:
-#                self.cache(searcher)
-#            
-#            return self._results(searcher, q, searcher.docs_for_query(q))
-#        
-#        def sort_all(self, searcher):
-#            if self.arrays is None:
-#                self.cache(searcher)
-#            
-#            return self._results(searcher, None, searcher.reader().all_doc_ids())
-#            
-#        def _results(self, searcher, q, docnums):
-#            docnums = sorted(docnums, key=self.key_fn)
-#            return Results(searcher, q, [(None, docnum) for docnum in docnums], None)
-#            
-#    
-#    schema = fields.Schema(name=fields.ID(stored=True),
-#                           price=fields.NUMERIC,
-#                           quant=fields.NUMERIC)
-#    
-#    with TempIndex(schema, "customsort") as ix:
-#        w = ix.writer()
-#        w.add_document(name=u("A"), price=200, quant=9)
-#        w.add_document(name=u("E"), price=300, quant=4)
-#        w.add_document(name=u("F"), price=200, quant=8)
-#        w.add_document(name=u("D"), price=150, quant=5)
-#        w.add_document(name=u("B"), price=250, quant=11)
-#        w.add_document(name=u("C"), price=200, quant=10)
-#        w.commit()
-#        
-#        cs = CustomSort(("price", False), ("quant", True))
-#        with ix.searcher() as s:
-#            assert_equal([hit["name"] for hit in cs.sort_query(s, query.Every())],
-#                          list("DCAFBE"))
+    with ix.searcher() as s:
+        q = query.Term("b", "blah")
+        r = s.search(q, groupedby="c")
+        gs = r.groups("c")
+        assert_equal(gs["apple"], [4, 2, 0])
+        assert_equal(gs["bear"], [5, 3, 1])
+
+def test_group_types():
+    schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(a=0, b=u("blah"), c=u("apple"))
+        w.add_document(a=1, b=u("blah blah"), c=u("bear"))
+        w.add_document(a=2, b=u("blah blah blah"), c=u("apple"))
+        w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear"))
+        w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple"))
+        w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear"))
+        w.add_document(a=6, b=u("blah blah blah blah blah blah blah"), c=u("apple"))
+
+    with ix.searcher() as s:
+        q = query.Term("b", "blah")
+
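+        # UnorderedList stores each group's document numbers in document
+        # order rather than in score order.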
+        f = sorting.FieldFacet("c", maptype=sorting.UnorderedList)
+        r = s.search(q, groupedby=f)
+        gs = r.groups("c")
+        assert_equal(gs["apple"], [0, 2, 4, 6])
+        assert_equal(gs["bear"], [1, 3, 5])
+
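+        # Count maps each group name to the number of matching documents.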
+        f = sorting.FieldFacet("c", maptype=sorting.Count)
+        r = s.search(q, groupedby=f)
+        gs = r.groups("c")
+        assert_equal(gs["apple"], 4)
+        assert_equal(gs["bear"], 3)
+
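+        # Best maps each group name to its single highest-scoring document.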
+        f = sorting.FieldFacet("c", maptype=sorting.Best)
+        r = s.search(q, groupedby=f)
+        gs = r.groups()
+        assert_equal(gs["apple"], 6)
+        assert_equal(gs["bear"], 5)
+
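+        # A maptype passed to search() applies to the search's facets, here
+        # the FieldFacet created from the "c" field name.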
+        r = s.search(q, groupedby="c", maptype=sorting.Count)
+        gs = r.groups()
+        assert_equal(gs["apple"], 4)
+        assert_equal(gs["bear"], 3)

File tests/test_vectors.py

 from whoosh.support.testing import TempIndex
 
 
+def test_single_term():
+    schema = fields.Schema(text=fields.TEXT(vector=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("TEST TEST TEST"))
+    with ix.searcher() as s:
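+        # The vector matcher should be positioned on the term's posting.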
+        v = s.vector(0, "text")
+        assert v.is_active()
+
 def test_vector_reading():
-    schema = fields.Schema(title = fields.TEXT,
-                           content = fields.TEXT(vector=formats.Frequency()))
-    
+    schema = fields.Schema(title=fields.TEXT,
+                           content=fields.TEXT(vector=formats.Frequency()))
+
     with TempIndex(schema, "vectorreading") as ix:
         writer = ix.writer()
         writer.add_document(title=u("one"),
                             content=u("This is the story of the black hole story"))
         writer.commit()
-        
+
         with ix.reader() as r:
             assert_equal(list(r.vector_as("frequency", 0, "content")),
                              [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])
 
 def test_vector_merge():
-    schema = fields.Schema(title = fields.TEXT,
-                           content = fields.TEXT(vector=formats.Frequency()))
-    
+    schema = fields.Schema(title=fields.TEXT,
+                           content=fields.TEXT(vector=formats.Frequency()))
+
     with TempIndex(schema, "vectormerge") as ix:
         writer = ix.writer()
         writer.add_document(title=u("one"),
                             content=u("This is the story of the black hole story"))
         writer.commit()
-        
+
         writer = ix.writer()
         writer.add_document(title=u("two"),
                             content=u("You can read along in your book"))
         writer.commit()
-        
+
         with ix.searcher() as s:
             r = s.reader()
-        
+
             docnum = s.document_number(title=u("one"))
             vec = list(r.vector_as("frequency", docnum, "content"))
             assert_equal(vec, [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])
-        
+
             docnum = s.document_number(title=u("two"))
-        
+
             vec = list(r.vector_as("frequency", docnum, "content"))
             assert_equal(vec, [(u('along'), 1), (u('book'), 1), (u('read'), 1)])
-        
+
 def test_vector_unicode():
-    schema = fields.Schema(content = fields.TEXT(vector=formats.Frequency()))
+    schema = fields.Schema(content=fields.TEXT(vector=formats.Frequency()))
     ix = RamStorage().create_index(schema)
-    
+
     writer = ix.writer()
     writer.add_document(content=u("\u1234\u2345\u3456 \u4567\u5678\u6789"))
     writer.add_document(content=u("\u0123\u1234\u4567 \u4567\u5678\u6789"))
     writer.commit()
-    
+
     writer = ix.writer()
     writer.add_document(content=u("\u2345\u3456\u4567 \u789a\u789b\u789c"))
     writer.add_document(content=u("\u0123\u1234\u4567 \u2345\u3456\u4567"))
     writer.commit()
-    
+
     with ix.reader() as r:
         vec = list(r.vector_as("frequency", 0, "content"))
         assert_equal(vec, [(u('\u3456\u4567'), 1), (u('\u789a\u789b\u789c'), 1)])