Commits

ma...@bondi.sidefx.com committed badf2a5

Changed implementation of sorting on column-less fields to make and cache a fake column.
Added missing MemPerDocReader.has_column() in memory.py.
Added warning text in codec.base.PerDocWriter.add_column_value().
Added explicit codec argument in column-related tests to allow switching default codec.
Minor docstring changes.
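
The user-visible effect, as a minimal sketch (hypothetical field names; "name" is deliberately indexed without sortable=True so the new fake-column path is exercised):

from whoosh import fields, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(name=fields.ID(stored=True))  # note: no sortable=True
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(name=u("bravo"))
    w.add_document(name=u("alfa"))

with ix.searcher() as s:
    # The reader synthesizes a column for "name" from its postings and
    # caches it, so the sort works without column storage (slowly, once)
    r = s.search(query.Every(), sortedby="name")
    assert [hit["name"] for hit in r] == [u("alfa"), u("bravo")]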

  • Parent commits 31bc740


Files changed (12)

 
 [pytest]
 ; --tb= traceback print mode (long/short/line/native/no)
-addopts = --pep8 -rs --tb=native
+addopts = -rs --tb=native
 
 norecursedirs = .hg .tox _build tmp* env* benchmark stress
 minversion = 2.0

File src/whoosh/codec/base.py

 
     @abstractmethod
     def add_column_value(self, fieldname, columnobj, value):
-        raise NotImplementedError
+        raise NotImplementedError("Codec does not implement writing columns")
 
     @abstractmethod
     def add_vector_items(self, fieldname, fieldobj, items):

File src/whoosh/codec/memory.py

     def supports_columns(self):
         return True
 
+    def has_column(self, fieldname):
+        filename = "%s.c" % fieldname
+        return self._storage.file_exists(filename)
+
     def column_reader(self, fieldname, column):
         filename = "%s.c" % fieldname
         colfile = self._storage.open_file(filename)
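
MemPerDocReader.has_column() fills a gap in the codec's per-document reader API; at the public IndexReader level the same check looks like this (a sketch mirroring the explicit-codec pattern used in the tests below; field names are hypothetical):

from whoosh import fields
from whoosh.codec.whoosh3 import W3Codec
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(tag=fields.ID(sortable=True), body=fields.TEXT)
ix = RamStorage().create_index(schema)
with ix.writer(codec=W3Codec()) as w:
    w.add_document(tag=u("x"), body=u("hello world"))

r = ix.reader()
assert r.has_column("tag")       # "tag" was indexed with a column
assert not r.has_column("body")  # "body" was not; the reader can fake one
r.close()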

File src/whoosh/codec/whoosh2.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-import struct, sys
+"""
+This module implements a "codec" for writing/reading Whoosh 2 indexes.
+"""
+
+import struct
+import sys
 from array import array
 from binascii import crc32
 from collections import defaultdict

File src/whoosh/codec/whoosh3.py

 # policies, either expressed or implied, of Matt Chaput.
 
 """
-This module implements a "codec" for writing/reading Whoosh 3 indexes.
+This module implements a "codec" for writing/reading Whoosh X indexes.
 """
 
 import struct

File src/whoosh/columns.py

         return (translate(v) for v in self._reader)
 
 
+# Fake column reader for fields without columns
+
+class PostingColumnReader(ColumnReader):
+    """
+    Builds a synthetic column for fields that weren't indexed with column
+    storage. This object reads every posting for every term in the field, so
+    building it is quite expensive and the reader should cache it once it's
+    built.
+    """
+
+    def __init__(self, reader, fieldname):
+        self._length = reader.doc_count_all()
+        # Dictionary mapping document IDs to values
+        self._values = values = {}
+
+        fieldobj = reader.schema[fieldname]
+        self._frombytes = fieldobj.from_bytes
+
+        # Read the terms in the field in sorted order
+
+        btexts = fieldobj.sortable_terms(reader, fieldname)
+        for btext in btexts:
+            # Read the (global) document IDs containing this term
+            postings = reader.postings(fieldname, btext)
+            for docid in postings.all_ids():
+                values[docid] = btext
+
+    def sort_key(self, docnum, reverse=False):
+        return self._frombytes(self._values.get(docnum, emptybytes))
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, docnum):
+        return self._values.get(docnum, emptybytes)
+
+    def __iter__(self):
+        values = self._values
+        for docnum in xrange(self._length):
+            yield values.get(docnum, emptybytes)
+
+
 # Column wrappers
 
 class WrappedColumn(Column):
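
A usage sketch for the new class (hypothetical two-document index): PostingColumnReader exposes the same interface as other ColumnReader objects, with the cost front-loaded into __init__, which is why reading.py below caches instances per field.

from whoosh import columns, fields
from whoosh.compat import b, u
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(title=fields.ID(stored=True))  # no column storage
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(title=u("bravo"))
    w.add_document(title=u("alfa"))

r = ix.reader()
creader = columns.PostingColumnReader(r, "title")
assert len(creader) == 2
# __getitem__/__iter__ return raw sortable bytes in docnum order...
assert list(creader) == [b("bravo"), b("alfa")]
# ...while sort_key() decodes through the field's from_bytes()
assert creader.sort_key(1) == u("alfa")
r.close()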

File src/whoosh/reading.py

 
         # self.files is a storage object from which to load the segment files.
         # This is different from the general storage (which will be used for
-        # cahces) if the segment is in a compound file.
+        # caches) if the segment is in a compound file.
         if segment.is_compound():
             # Open the compound file as a storage object
             files = segment.open_compound_file(storage)
             self._storage = OverlayStorage(files, storage)
         else:
             self._storage = storage
+        self._column_cache = {}
 
         # Get subreaders from codec
         self._codec = codec if codec else segment.codec()
                                             prefix=prefix)
         gr = self._get_graph()
         return fst.within(gr, text, k=maxdist, prefix=prefix,
-                           address=self._graph.root(fieldname))
+                          address=self._graph.root(fieldname))
 
     # Column methods
 
 
     def column_reader(self, fieldname, column=None):
         fieldobj = self.schema[fieldname]
-        column = column or fieldobj.column_type
-        reader = self._perdoc.column_reader(fieldname, column)
+        if self.has_column(fieldname):
+            ctype = column or fieldobj.column_type
+            creader = self._perdoc.column_reader(fieldname, ctype)
 
-        translate = fieldobj.from_column_value
-        creader = columns.TranslatingColumnReader(reader, translate)
+            # Wrap the column in a translator to present nice values to the
+            # caller instead of sortable column values
+            translate = fieldobj.from_column_value
+            creader = columns.TranslatingColumnReader(creader, translate)
+        else:
+            # If the field wasn't indexed with a column, create one from
+            # postings and cache it
+            if fieldname in self._column_cache:
+                creader = self._column_cache[fieldname]
+            else:
+                creader = columns.PostingColumnReader(self, fieldname)
+                self._column_cache[fieldname] = creader
+
         return creader
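
A sketch of the caching behavior (hypothetical field): note that on this fallback path the caller gets raw sortable bytes, since only the real-column branch wraps its reader in a TranslatingColumnReader.

from whoosh import fields
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(body=fields.TEXT)  # no column for "body"
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(body=u("alfa"))

r = ix.reader()
c1 = r.column_reader("body")
c2 = r.column_reader("body")
assert c1 is c2  # the PostingColumnReader was built once, then cached
r.close()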
 
 
     def iter_from(self, fieldname, text):
         return iter([])
 
-    def iter_field(self, fieldname):
+    def iter_field(self, fieldname, prefix=''):
         return iter([])
 
-    def iter_prefix(self, fieldname):
+    def iter_prefix(self, fieldname, prefix=''):
         return iter([])
 
     def lexicon(self, fieldname):

File src/whoosh/sorting.py

         fieldname = self.fieldname
         fieldobj = global_searcher.schema[fieldname]
 
-        # If we're grouping with allow_overlap=True, all we can use is
-        # OverlappingCategorizer
         if self.allow_overlap:
+            # If we're grouping with allow_overlap=True, all we can use is
+            # OverlappingCategorizer
             c = OverlappingCategorizer(global_searcher, fieldname)
         else:
+            # Check if the field has a real column and if it's reversible
+            reversible = False
             if global_searcher.reader().has_column(fieldname):
-                coltype = fieldobj.column_type
-                if coltype.reversible or not self.reverse:
-                    c = ColumnCategorizer(global_searcher, fieldname,
-                                          self.reverse)
-                else:
-                    c = ReversedColumnCategorizer(global_searcher, fieldname)
+                reversible = fieldobj.column_type.reversible
+
+            # If the facet is reversed and the column isn't reversible, we have
+            # to use a ReversedColumnCategorizer
+            if self.reverse and not reversible:
+                c = ReversedColumnCategorizer(global_searcher, fieldname)
             else:
-                c = PostingCategorizer(global_searcher, fieldname,
-                                            self.reverse)
+                c = ColumnCategorizer(global_searcher, fieldname, self.reverse)
+
         return c
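
With this change the selection reduces to three cases; a sketch of which categorizer each facet now gets (hypothetical field names):

from whoosh import sorting

# allow_overlap=True always gets OverlappingCategorizer
over = sorting.FieldFacet("tags", allow_overlap=True)

# reverse=True without a reversible column gets ReversedColumnCategorizer
rev = sorting.FieldFacet("price", reverse=True)

# everything else gets ColumnCategorizer, backed by a real column if the
# field has one, or by the reader's cached fake column otherwise
plain = sorting.FieldFacet("price")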
 
 
         return ColumnCategorizer.key_to_name(self, key)
 
 
-class PostingCategorizer(Categorizer):
-    """Categorizer for fields that don't store column values. This is very
-    inefficient. Instead of relying on this categorizer you should plan for
-    which fields you'll want to sort on and set ``sortable=True`` in their
-    field type.
-
-    This object builds an array caching the order of all documents according to
-    the field, then uses the cached order as a numeric key. This is useful when
-    a field cache is not available, and also for reversed fields (since field
-    cache keys for non-numeric fields are arbitrary data, it's not possible to
-    "negate" them to reverse the sort order).
-    """
-
-    def __init__(self, global_searcher, fieldname, reverse):
-        # Cache the relative positions of all docs with the given field
-        # across the entire index
-        reader = global_searcher.reader()
-        dc = reader.doc_count_all()
-        self._fieldobj = global_searcher.schema[fieldname]
-        from_bytes = self._fieldobj.from_bytes
-
-        self.values = []
-        self.array = array("i", [dc + 1] * dc)
-
-        btexts = self._fieldobj.sortable_terms(reader, fieldname)
-        for i, btext in enumerate(btexts):
-            self.values.append(from_bytes(btext))
-            if reverse:
-                i = dc - i
-
-            # Get global docids from global reader
-            postings = reader.postings(fieldname, btext)
-            for docid in postings.all_ids():
-                self.array[docid] = i
-
-        if reverse:
-            self.values.reverse()
-
-    def set_searcher(self, segment_searcher, docoffset):
-        self._searcher = segment_searcher
-        self.docoffset = docoffset
-
-    def key_for(self, matcher, segment_docnum):
-        global_docnum = self.docoffset + segment_docnum
-        return self.array[global_docnum]
-
-    def key_to_name(self, key):
-        if key >= len(self.values):
-            return None
-        return self.values[key]
-
-
 class OverlappingCategorizer(Categorizer):
     allow_overlap = True
 

File tests/test_codecs.py

 def test_plaintext_codec():
     pytest.importorskip("ast")
     from whoosh.codec.plaintext import PlainTextCodec
+    from whoosh.codec.whoosh3 import W3Codec
 
     ana = analysis.StemmingAnalyzer()
     schema = fields.Schema(a=fields.TEXT(vector=True, sortable=True),
 
     st = RamStorage()
     ix = st.create_index(schema)
-    with ix.writer() as w:
+    with ix.writer(codec=W3Codec()) as w:
         w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                        d=u("quelling whining echoing"))
         w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
     c_values = [cfield.from_bytes(t) for t in c_sortables]
     assert c_values == [-200, -100, 100, 200, 300]
 
+    assert reader.has_column("c")
     c_values = list(reader.column_reader("c"))
     assert c_values == [100, 200, 300, -100, -200]
 

File tests/test_columns.py

 import inspect, random, sys
 
 from whoosh import columns, fields, query
+from whoosh.codec.whoosh3 import W3Codec
 from whoosh.compat import b, u, BytesIO, bytes_type, text_type
 from whoosh.compat import izip, xrange, dumps, loads
 from whoosh.filedb import compound
     schema = fields.Schema(s=fields.TEXT(sortable=True),
                            n=fields.NUMERIC(sortable=True))
     ix = RamStorage().create_index(schema)
-    with ix.writer() as w:
+    with ix.writer(codec=W3Codec()) as w:
         w.add_document(s=u("alfa foxtrot charlie").split(), n=[100, 200, 300])
         w.add_document(s=u("juliet bravo india").split(), n=[10, 20, 30])
 
     schema = fields.Schema(a=fields.TEXT(sortable=True),
                            b=fields.COLUMN(columns.RefBytesColumn()))
     with TempIndex(schema, "columnfield") as ix:
-        with ix.writer() as w:
+        with ix.writer(codec=W3Codec()) as w:
             w.add_document(a=u("alfa bravo"), b=b("charlie delta"))
             w.add_document(a=u("bravo charlie"), b=b("delta echo"))
             w.add_document(a=u("charlie delta"), b=b("echo foxtrot"))
                            a=fields.ID(sortable=True),
                            b=fields.NUMERIC(sortable=True))
     with TempIndex(schema, "columnquery") as ix:
-        with ix.writer() as w:
+        with ix.writer(codec=W3Codec()) as w:
             w.add_document(id=1, a=u("alfa"), b=10)
             w.add_document(id=2, a=u("bravo"), b=20)
             w.add_document(id=3, a=u("charlie"), b=30)

File tests/test_results.py

 import pytest
 
 from whoosh import analysis, fields, formats, highlight, qparser, query
+from whoosh.codec.whoosh3 import W3Codec
 from whoosh.compat import u, xrange, text_type, permutations
 from whoosh.filedb.filestore import RamStorage
 
     # With column
     schema = fields.Schema(text=fields.TEXT(sortable=True))
     ix = RamStorage().create_index(schema)
-    with ix.writer() as w:
+    with ix.writer(codec=W3Codec()) as w:
         w.add_document(text=u("alfa bravo charlie"))
 
     with ix.searcher() as s:

File tests/test_searching.py

 import pytest
 
 from whoosh import analysis, fields, index, qparser, query, searching, scoring
+from whoosh.codec.whoosh3 import W3Codec
 from whoosh.compat import b, u, text_type
 from whoosh.compat import xrange, permutations, izip_longest
 from whoosh.filedb.filestore import RamStorage
                            size=fields.NUMERIC,
                            tag=fields.KEYWORD(sortable=True))
     ix = RamStorage().create_index(schema)
+    with ix.writer(codec=W3Codec()) as w:
+        for id, text, size, tag in domain:
+            w.add_document(id=u(id), text=u(text), size=size, tag=u(tag))
+
+    with ix.searcher() as s:
+        q = query.Term("text", "blah")
+        r = s.search(q, limit=None)
+        assert " ".join(hit["id"] for hit in r) == "f c a d h b g"
+
+        col = s.collector(limit=3)
+        col = collectors.CollapseCollector(col, "tag")
+        s.search_with_collector(q, col)
+        r = col.results()
+        assert " ".join(hit["id"] for hit in r) == "f c h"
+
+        col = s.collector(limit=None)
+        col = collectors.CollapseCollector(col, "tag")
+        s.search_with_collector(q, col)
+        r = col.results()
+        assert " ".join(hit["id"] for hit in r) == "f c h b g"
+
+        r = s.search(query.Every(), sortedby="size")
+        assert " ".join(hit["id"] for hit in r) == "e c b d a f h g"
+
+        col = s.collector(sortedby="size")
+        col = collectors.CollapseCollector(col, "tag")
+        s.search_with_collector(query.Every(), col)
+        r = col.results()
+        assert " ".join(hit["id"] for hit in r) == "e c b d h g"
+
+
+def test_collapse_nocolumn():
+    from whoosh import collectors
+
+    # id, text, size, tag
+    domain = [("a", "blah blah blah", 5, "x"),
+              ("b", "blah", 3, "y"),
+              ("c", "blah blah blah blah", 2, "z"),
+              ("d", "blah blah", 4, "x"),
+              ("e", "bloop", 1, "-"),
+              ("f", "blah blah blah blah blah", 6, "x"),
+              ("g", "blah", 8, "w"),
+              ("h", "blah blah", 7, "=")]
+
+    schema = fields.Schema(id=fields.STORED, text=fields.TEXT,
+                           size=fields.NUMERIC,
+                           tag=fields.KEYWORD)
+    ix = RamStorage().create_index(schema)
     with ix.writer() as w:
         for id, text, size, tag in domain:
             w.add_document(id=u(id), text=u(text), size=size, tag=u(tag))
     schema = fields.Schema(key=fields.ID(sortable=True),
                            word=fields.ID(stored=True))
     ix = RamStorage().create_index(schema)
+    with ix.writer(codec=W3Codec()) as w:
+        for word in domain:
+            w.add_document(key=word[0], word=word)
+
+    with ix.searcher() as s:
+        q = query.Every()
+
+        def check(r):
+            words = " ".join(hit["word"] for hit in r)
+            assert words == "alfa bravo charlie delta echo foxtrot golf"
+            assert r.scored_length() == 7
+            assert len(r) == 7
+
+        r = s.search(q, collapse="key", collapse_limit=1, limit=None)
+        check(r)
+
+        r = s.search(q, collapse="key", collapse_limit=1, limit=50)
+        check(r)
+
+        r = s.search(q, collapse="key", collapse_limit=1, limit=10)
+        check(r)
+
+
+def test_collapse_length_nocolumn():
+    domain = u("alfa apple agnostic aplomb arc "
+               "bravo big braid beer "
+               "charlie crouch car "
+               "delta dog "
+               "echo "
+               "foxtrot fold flip "
+               "golf gym goop"
+               ).split()
+
+    schema = fields.Schema(key=fields.ID(),
+                           word=fields.ID(stored=True))
+    ix = RamStorage().create_index(schema)
     with ix.writer() as w:
         for word in domain:
             w.add_document(key=word[0], word=word)
                            rating=fields.NUMERIC(sortable=True),
                            tag=fields.ID(sortable=True))
     ix = RamStorage().create_index(schema)
+    with ix.writer(codec=W3Codec()) as w:
+        w.add_document(id="a", price=10, rating=1, tag=u("x"))
+        w.add_document(id="b", price=80, rating=3, tag=u("y"))
+        w.add_document(id="c", price=60, rating=1, tag=u("z"))
+        w.add_document(id="d", price=30, rating=2)
+        w.add_document(id="e", price=50, rating=3, tag=u("x"))
+        w.add_document(id="f", price=20, rating=1, tag=u("y"))
+        w.add_document(id="g", price=50, rating=2, tag=u("z"))
+        w.add_document(id="h", price=90, rating=5)
+        w.add_document(id="i", price=50, rating=5, tag=u("x"))
+        w.add_document(id="j", price=40, rating=1, tag=u("y"))
+        w.add_document(id="k", price=50, rating=4, tag=u("z"))
+        w.add_document(id="l", price=70, rating=2)
+
+    with ix.searcher() as s:
+        def check(kwargs, target):
+            r = s.search(query.Every(), limit=None, **kwargs)
+            assert " ".join(hit["id"] for hit in r) == target
+
+        price = sorting.FieldFacet("price", reverse=True)
+        rating = sorting.FieldFacet("rating", reverse=True)
+        tag = sorting.FieldFacet("tag")
+
+        check(dict(sortedby=price), "h b l c e g i k j d f a")
+        check(dict(sortedby=price, collapse=tag), "h b l c e d")
+        check(dict(sortedby=price, collapse=tag, collapse_order=rating),
+              "h b l i k d")
+
+
+def test_collapse_order_nocolumn():
+    from whoosh import sorting
+
+    schema = fields.Schema(id=fields.STORED,
+                           price=fields.NUMERIC(),
+                           rating=fields.NUMERIC(),
+                           tag=fields.ID())
+    ix = RamStorage().create_index(schema)
     with ix.writer() as w:
         w.add_document(id="a", price=10, rating=1, tag=u("x"))
         w.add_document(id="b", price=80, rating=3, tag=u("y"))