Commits

Matt Chaput committed cd4af80

More fixes to unicode handling in FSA/FST.
Fixed lots of bytes/unicode and other Python 3 compatibility issues.
Unit tests pass on Python 3.

  • Parent commits 524dc1e

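In short: GraphWriter.insert() now accepts bytes, unicode, or a sequence of bytestrings directly (replacing insert_string()), and the reader/cursor side gains flatten_strings(), prefix_string() and peek_key_string() helpers that decode keys back to unicode. A minimal sketch of the updated API, pieced together from the docstrings and tests in this commit:

    from whoosh.filedb.filestore import RamStorage
    from whoosh.support import dawg

    st = RamStorage()
    gw = dawg.GraphWriter(st.create_file("test"))
    gw.insert("alfa")     # unicode, bytes, or sequences of bytestrings all work now
    gw.insert("bravo")    # keys must still be inserted in sorted order
    gw.close()

    gr = dawg.GraphReader(st.open_file("test"))
    cur = gr.cursor()
    print(list(cur.flatten_strings()))        # ["alfa", "bravo"], decoded to unicode
    print(set(dawg.within(gr, "alfo", k=1)))  # {"alfa"} -- suggestions are decoded strings too
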
Files changed (26)

File src/whoosh/codec/base.py

 from array import array
 from struct import Struct, pack
 
-from whoosh.compat import loads, dumps, b, bytes_type, string_type
+from whoosh.compat import loads, dumps, b, bytes_type, string_type, xrange
 from whoosh.matching import Matcher, ReadTooFar
 from whoosh.reading import TermInfo
 from whoosh.spans import Span
                 lasttext = text
                 continue
 
-            if fieldname < lastfn or (fieldname == lastfn and text < lasttext):
+            # This comparison is so convoluted because Python 3 removed the
+            # ability to compare a string to None
+            if ((lastfn is not None and fieldname < lastfn)
+                or (fieldname == lastfn and lasttext is not None
+                    and text < lasttext)):
                 raise Exception("Postings are out of order: %r:%s .. %r:%s" %
                                 (lastfn, lasttext, fieldname, text))
             if fieldname != lastfn or text != lasttext:
 
 def minimize_weights(weights, compression=0):
     if all(w == 1.0 for w in weights):
-        string = ""
+        string = b("")
     else:
         if not IS_LITTLE:
             weights.byteswap()

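The guarded comparison above works around a Python 3 change: ordering comparisons between str and None now raise TypeError, so the None sentinels used for the first posting have to be checked explicitly before comparing. An illustrative sketch of the same pattern (simplified types, not the codec's actual ones):

    def out_of_order(lastfn, lasttext, fieldname, text):
        # Same guard pattern as above: only compare against the previous
        # field/text once they have been set to something other than None.
        return ((lastfn is not None and fieldname < lastfn)
                or (fieldname == lastfn
                    and lasttext is not None and text < lasttext))

    print(out_of_order(None, None, "body", "alpha"))      # False: first posting
    print(out_of_order("body", "beta", "body", "alpha"))  # True: text went backward
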
File src/whoosh/codec/standard.py

         self.text = text
         self.terminfo = base.FileTermInfo()
         if self.spelling:
-            self.dawg.insert_string(utf8encode(text)[0])
+            self.dawg.insert(text)
         self._start_blocklist()
 
     def add(self, docnum, weight, valuestring, length):
     def add_spell_word(self, fieldname, text):
         if self.dawg is None:
             self._make_dawg_files()
-        self.dawg.insert_string(utf8encode(text)[0])
+        self.dawg.insert(text)
 
     def finish_term(self):
         if self.block is None:

File src/whoosh/compat.py

     from urllib.request import urlretrieve
 
     def u(s):
+        if isinstance(s, bytes):
+            return s.decode("ascii")
         return s
 
     def with_metaclass(meta, base=object):
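
For reference, this is the Python 3 branch of whoosh.compat: with the added isinstance check, u() now tolerates being handed bytes instead of assuming its argument is already text. A quick sketch of the Python 3 behaviour:

    from whoosh.compat import u

    print(u(b"alfa"))   # "alfa" -- bytes are decoded as ASCII on Python 3
    print(u("alfa"))    # "alfa" -- text passes straight through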

File src/whoosh/filedb/filetables.py

 from hashlib import md5  #@UnresolvedImport
 from struct import Struct
 
-from whoosh.compat import long_type, xrange, b, text_type
+from whoosh.compat import long_type, xrange, b, bytes_type
 from whoosh.system import _INT_SIZE, _LONG_SIZE
 
 
         write = dbfile.write
 
         for key, value in items:
-            if isinstance(key, text_type):
-                key = key.encode('latin-1')
-            if isinstance(value, text_type):
-                value = value.encode('latin-1')
+            if not isinstance(key, bytes_type):
+                raise TypeError("Key %r should be bytes" % key)
+            if not isinstance(value, bytes_type):
+                raise TypeError("Value %r should be bytes" % value)
             write(pack_lengths(len(key), len(value)))
             write(key)
             write(value)
     def ranges_for_key(self, key):
         read = self.read
         pointer_size = self.pointer_size
-        if isinstance(key, text_type):
-            key = key.encode('latin-1')
+        if not isinstance(key, bytes_type):
+            raise TypeError("Key %r should be bytes" % key)
         keyhash = self.hash_func(key)
         hpos, hslots = self._hashtable_info(keyhash)
         if not hslots:
         lk = self.lastkey or b('')
 
         for key, value in items:
-            if isinstance(key, text_type):
-                key = key.encode('latin-1')
-            if isinstance(value, text_type):
-                value = value.encode('latin-1')
+            if not isinstance(key, bytes_type):
+                raise TypeError("Key %r should be bytes" % key)
+            if not isinstance(value, bytes_type):
+                raise TypeError("Value %r should be bytes" % value)
             if key <= lk:
                 raise ValueError("Keys must increase: %r .. %r" % (lk, key))
             lk = key
         indexbase = self.indexbase
         lo = 0
         hi = self.length
-        if isinstance(key, text_type):
-            key = key.encode('latin-1')
+        if not isinstance(key, bytes_type):
+            raise TypeError("Key %r should be bytes" % key)
         while lo < hi:
             mid = (lo + hi) // 2
             midkey = key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE))
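
The pattern in this file changes from "encode text keys as latin-1 on the fly" to "require callers to pass bytes and fail loudly otherwise". The updated tests later in this commit (tests/test_tables.py) show the new calling convention; a minimal sketch:

    from whoosh.compat import b
    from whoosh.filedb.filestore import RamStorage
    from whoosh.filedb.filetables import HashReader, HashWriter

    st = RamStorage()
    hw = HashWriter(st.create_file("test.hsh"))
    hw.add(b("foo"), b("bar"))      # keys and values must already be bytes
    hw.close()

    hr = HashReader(st.open_file("test.hsh"))
    print(hr.get(b("foo")))         # b"bar"
    # On Python 3, hw.add("foo", "bar") would now raise TypeError instead of
    # silently encoding the strings as latin-1.
    hr.close()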

File src/whoosh/filedb/filewriting.py

     from whoosh.filedb.filereading import SegmentReader
 
     newsegments = []
-    sorted_segment_list = sorted((s.doc_count_all(), s) for s in segments)
+    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
     total_docs = 0
-    for i, (count, seg) in enumerate(sorted_segment_list):
+    for i, seg in enumerate(sorted_segment_list):
+        count = seg.doc_count_all()
         if count > 0:
             total_docs += count
             if total_docs < fib(i + 5):
         for fieldname in fieldnames:
             gw.start_field(fieldname)
             for word in r.lexicon(fieldname):
-                gw.insert_string(utf8encode(word)[0])
+                gw.insert(word)
             gw.finish_field()
         gw.close()
 
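The merge-policy change above replaces a decorate-sort of (count, segment) tuples with a key function: on Python 3, when two segments have the same document count the tuple sort falls back to comparing the segment objects themselves, which raises TypeError for types that define no ordering. A self-contained sketch with a stand-in Segment class (illustrative only):

    class Segment(object):                  # stand-in, not the real whoosh class
        def __init__(self, doc_count):
            self._doc_count = doc_count
        def doc_count_all(self):
            return self._doc_count

    segments = [Segment(10), Segment(10), Segment(3)]

    # sorted((s.doc_count_all(), s) for s in segments) would raise TypeError
    # on Python 3 as soon as two counts tie; sorting on a key avoids it.
    ordered = sorted(segments, key=lambda s: s.doc_count_all())
    print([s.doc_count_all() for s in ordered])   # [3, 10, 10]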

File src/whoosh/filedb/structfile.py

 
 from whoosh.compat import dump as dump_pickle
 from whoosh.compat import load as load_pickle
-from whoosh.compat import integer_types, b
+from whoosh.compat import PY3, integer_types, b
 from whoosh.system import (_INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE,
                            pack_byte, pack_sbyte, pack_ushort, pack_int,
                            pack_uint, pack_long, pack_float, unpack_byte,
         _self = self
 
         class fakemap(object):
-            def __getitem__(self, slice):
-                if isinstance(slice, integer_types):
-                    _self.seek(slice)
-                    return _self.read(1)
+            def __getitem__(self, slc):
+                if isinstance(slc, integer_types):
+                    _self.seek(slc)
+                    return _self.read(1)[0]
                 else:
-                    _self.seek(slice.start)
-                    return _self.read(slice.stop - slice.start)
+                    _self.seek(slc.start)
+                    return _self.read(slc.stop - slc.start)
 
         self.map = fakemap()
 
         return ord(self.file.read(1))
 
     def get_byte(self, position):
-        return ord(self.map[position])
+        v = self.map[position]
+        if PY3:  # Getting an item returns an int
+            return v
+        else:  # Getting an item returns a 1-character str
+            return ord(v[0])
 
     def write_8bitfloat(self, f, mantissabits=5, zeroexp=2):
         """Writes a byte-sized representation of floating point value f to the

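The fakemap/get_byte rework above comes down to one Python 3 difference: indexing a bytes object with an integer returns an int, where Python 2 returned a one-character str that had to be passed through ord(). A minimal illustration:

    data = b"whoosh"
    first = data[0]           # Python 3: 119 (an int); Python 2: "w" (a str)
    if isinstance(first, int):
        value = first         # Python 3 path
    else:
        value = ord(first)    # Python 2 path
    print(value)              # 119 on both versions
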
File src/whoosh/lang/wordnet.py

 
 
 if __name__ == "__main__":
-    from time import clock
     from whoosh.filedb.filestore import FileStorage
     st = FileStorage("c:/testindex")
 
-#    t = clock()
 #    th = Thesaurus.from_filename("c:/wordnet/wn_s.pl")
-#    print(clock() - t)
 #    
-#    t = clock()
 #    th.to_storage(st)
-#    print(clock() - t)
 #    
 #    t = clock()
 #    print th.synonyms("light")
 #    print(clock() - t)
 
-    t = clock()
     th = Thesaurus.from_storage(st)
-    print(clock() - t)
-
-    t = clock()
     print(th.synonyms("hail"))
-    print(clock() - t)

File src/whoosh/searching.py

         
         >>> stored_fields = searcher.document(path=u"/a/b")
         >>> if stored_fields:
-        ...   print stored_fields['title']
+        ...   print(stored_fields['title'])
         ... else:
-        ...   print "There is no document with the path /a/b"
+        ...   print("There is no document with the path /a/b")
         """
 
         for p in self.documents(**kw):
         documents.
         
         >>> for stored_fields in searcher.documents(emailto=u"matt@whoosh.ca"):
-        ...   print "Email subject:", stored_fields['subject']
+        ...   print("Email subject:", stored_fields['subject'])
         """
 
         ixreader = self.ixreader
         
             corrector = searcher.corrector("fieldname")
             for word in words:
-                print corrector.suggest(word)
+                print(corrector.suggest(word))
         
         :param limit: only return up to this many suggestions. If there are not
             enough terms in the field within ``maxdist`` of the given word, the
             
             r = searcher.more_like(docnum)
         
-            print "Documents like", searcher.stored_fields(docnum)["title"]
+            print("Documents like", searcher.stored_fields(docnum)["title"])
             for hit in r:
-                print hit["title"]
+                print(hit["title"])
         
         :param fieldname: the name of the field to use to test similarity.
         :param text: by default, the method will attempt to load the contents
             pagelen = int(request.get("perpage", 10))
             
             results = searcher.search_page(query, pagenum, pagelen=pagelen)
-            print "Page %d of %d" % (results.pagenum, results.pagecount)
+            print("Page %d of %d" % (results.pagenum, results.pagecount))
             print ("Showing results %d-%d of %d" 
                    % (results.offset + 1, results.offset + results.pagelen + 1,
                       len(results)))
             for hit in results:
-                print "%d: %s" % (hit.rank + 1, hit["title"])
+                print("%d: %s" % (hit.rank + 1, hit["title"]))
         
         (Note that results.pagelen might be less than the pagelen argument if
         there aren't enough results to fill a page.)
         """
 
         if (name is None or name == "facet") and len(self._facetmaps) == 1:
-            name = self._facetmaps.keys()[0]
+            # If there's only one facet, just use it; convert keys() to list
+            # for Python 3
+            name = list(self._facetmaps.keys())[0]
         elif name not in self._facetmaps:
             raise KeyError("%r not in facet names %r"
                            % (name, self.facet_names()))
         
             r = searcher.search(myquery)
             for hit in r:
-                print hit["title"]
-                print "Top 3 similar documents:"
+                print(hit["title"])
+                print("Top 3 similar documents:")
                 for subhit in hit.more_like_this("content", top=3):
-                  print "  ", subhit["title"]
+                  print("  ", subhit["title"])
                   
         :param fieldname: the name of the field to use to test similarity.
         :param text: by default, the method will attempt to load the contents

File src/whoosh/sorting.py

     @classmethod
     def from_sortedby(cls, sortedby):
         multi = cls()
-        if (isinstance(sortedby, (list, tuple))
-            or hasattr(sortedby, "__iter__")):
+        if isinstance(sortedby, string_type):
+            multi._add(sortedby)
+        elif (isinstance(sortedby, (list, tuple)) or hasattr(sortedby,
+                                                             "__iter__")):
             for item in sortedby:
                 multi._add(item)
         else:

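The extra isinstance check above matters because Python 3 strings have an __iter__ method: without it, a bare field name passed as sortedby would fall into the iterable branch and be split into individual characters. A small sketch of the same ordering of checks (hypothetical helper, not whoosh's API):

    def normalize_sortedby(sortedby):
        # Hypothetical: strings must be tested before the generic
        # "is it iterable?" check, because str has __iter__ on Python 3.
        if isinstance(sortedby, str):
            return [sortedby]
        elif hasattr(sortedby, "__iter__"):
            return list(sortedby)
        raise ValueError("Don't know how to sort by %r" % (sortedby,))

    print(normalize_sortedby("price"))            # ['price']
    print(normalize_sortedby(["price", "size"]))  # ['price', 'size']
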
File src/whoosh/spelling.py

     for word in wordlist:
         if strip:
             word = word.strip()
-        gw.insert_string(word)
+        gw.insert(word)
     gw.close()
 
 

File src/whoosh/support/dawg.py

 from array import array
 from hashlib import sha1  #@UnresolvedImport
 
-from whoosh.compat import (b, BytesIO, xrange, iteritems, iterkeys, bytes_type,
-                           izip)
+from whoosh.compat import (b, u, BytesIO, xrange, iteritems, iterkeys,
+                           bytes_type, text_type, izip)
 from whoosh.filedb.structfile import StructFile
-from whoosh.system import (_INT_SIZE, pack_byte, pack_ushort, pack_int,
-                           pack_uint, pack_long)
+from whoosh.system import (_INT_SIZE, pack_byte, pack_int, pack_uint,
+                           pack_long)
 from whoosh.util import utf8encode, utf8decode, varint
 
 
 
     @staticmethod
     def read(dbfile):
-        typecode = b(dbfile.read(1))
+        typecode = u(dbfile.read(1))
         length = dbfile.read_int()
         return dbfile.read_array(typecode, length)
 
     @staticmethod
     def skip(dbfile):
-        typecode = b(dbfile.read(1))
+        typecode = u(dbfile.read(1))
         length = dbfile.read_int()
         a = array(typecode)
         dbfile.seek(length * a.itemsize, 1)
             for result in node.flatten(sofar + key):
                 yield result
 
+    def flatten_strings(self):
+        return (utf8decode(k)[0] for k in self.flatten())
+
 
 class ComboNode(Node):
     """Base class for nodes that blend the nodes of two different graphs.
 
         return emptybytes.join(self.prefix())
 
+    def prefix_string(self):
+        """Returns the labels of the path from the root to the current arc as
+        a decoded unicode string.
+        """
+
+        return utf8decode(self.prefix_bytes())[0]
+
     def peek_key(self):
         """Returns a sequence of label bytes representing the next closest
         key in the graph.
 
         return emptybytes.join(self.peek_key())
 
+    def peek_key_string(self):
+        """Returns the next closest key in the graph as a decoded unicode
+        string.
+        """
+
+        return utf8decode(self.peek_key_bytes())[0]
+
     def stopped(self):
         """Returns True if the current arc leads to a stop state.
         """
         for key in self.flatten():
             yield key, self.value()
 
+    def flatten_strings(self):
+        return (utf8decode(k)[0] for k in self.flatten())
+
     def find_path(self, path):
         """Follows the labels in the given path, starting at the current
         position.
         """
 
+        path = to_labels(path)
         _switch_to = self.switch_to
         _follow = self.follow
         _stopped = self.stopped
 
     # Override: more efficient implementation manipulating the stack
     def skip_to(self, key):
+        key = to_labels(key)
         stack = self.stack
         if not stack:
             raise InactiveCursor
 class GraphWriter(object):
     """Writes an FSA/FST graph to disk.
     
-    Call ``insert_string(bytes)`` to insert keys into the graph. You must
+    Call ``insert(key)`` to insert keys into the graph. You must
     insert keys in sorted order. Call ``close()`` to finish the graph and close
     the file.
     
     >>> gw = GraphWriter(my_file)
-    >>> gw.insert_string("alfa")
-    >>> gw.insert_string("bravo")
-    >>> gw.insert_string("charlie")
+    >>> gw.insert("alfa")
+    >>> gw.insert("bravo")
+    >>> gw.insert("charlie")
     >>> gw.close()
     
     The graph writer can write separate graphs for multiple fields. Use
     
     >>> gw = GraphWriter(my_file)
     >>> gw.start_field("content")
-    >>> gw.insert_u16("alfalfa")
-    >>> gw.insert_u16("apple")
+    >>> gw.insert("alfalfa")
+    >>> gw.insert("apple")
     >>> gw.finish_field()
     >>> gw.start_field("title")
-    >>> gw.insert_u16("artichoke")
+    >>> gw.insert("artichoke")
     >>> gw.finish_field()
     >>> gw.close()
     """
         dbfile.close()
 
     def insert(self, key, value=None):
-        """Inserts the given sequence of bytestrings as a key.
+        """Inserts the given key into the graph.
         
-        This will work with Python 2 ``str`` objects but WON'T work with Python
-        3 ``bytes`` objects because they act like sequences of numbers, not
-        bytestrings. For consistency, instead use ``insert_string()`` which
-        accepts both bytes and unicode.
-        
-        :param key: a sequence of bytestrings.
+        :param key: a sequence of bytes objects, a bytes object, or a string.
         :param value: an optional value to encode in the graph along with the
             key. If the writer was not instantiated with a value type, passing
             a value here will raise an error.
         if self.fieldname is None:
             raise Exception("Inserted %r before starting a field" % key)
         self._inserted = True
+        key = to_labels(key)  # Python 3 sucks
 
         vtype = self.vtype
         lastkey = self.lastkey
         nodes = self.nodes
         if len(key) < 1:
-            raise KeyError("Can't store a null key %r" % key)
+            raise KeyError("Can't store a null key %r" % (key,))
         if lastkey and lastkey > key:
             raise KeyError("Keys out of order %r..%r" % (lastkey, key))
 
 
         self.lastkey = key
 
-    def insert_string(self, key, value=None):
-        """This method converts the given ``key`` string into a sequence of
-        UTF-8 encoded bytestrings for each character and passes it to the
-        ``insert()`` method. It should work with bytes and all string
-        representations.
-        """
-
-        # I hate the Python 3 bytes object so friggin much
-        if isinstance(key, bytes_type):
-            k = [key[i:i + 1] for i in xrange(len(key))]
-        else:
-            k = [utf8encode(key[i:i + 1])[0] for i in xrange(len(key))]
-        self.insert(k, value=value)
-
     def _freeze_tail(self, prefixlen):
         nodes = self.nodes
         lastkey = self.lastkey
                     for arc in self.iter_arcs(address))
 
     def find_path(self, path, arc=None):
+        path = to_labels(path)
+
         if arc:
             address = arc.target
         else:
 
         self._root = None
         if rootname is None and len(self.roots) == 1:
-            rootname = self.roots.keys()[0]
+            # If there's only one root, just use it. Have to wrap a list around
+            # the keys() method here because of Python 3.
+            rootname = list(self.roots.keys())[0]
         if rootname is not None:
             self._root = self.root(rootname)
 
             return None
 
 
+def to_labels(key):
+    """Takes a string and returns a list of bytestrings, suitable for use as
+    a key or path in an FSA/FST graph.
+    """
+
+    # Convert to tuples of bytestrings (must be tuples so they can be hashed)
+    keytype = type(key)
+
+    # I hate the Python 3 bytes object so friggin much
+    if keytype is tuple or keytype is list:
+        if not all(isinstance(e, bytes_type) for e in key):
+            raise TypeError("%r contains a non-bytestring")
+        if keytype is list:
+            key = tuple(key)
+    elif isinstance(key, bytes_type):
+        key = tuple(key[i:i + 1] for i in xrange(len(key)))
+    elif isinstance(key, text_type):
+        key = tuple(utf8encode(key[i:i + 1])[0] for i in xrange(len(key)))
+    else:
+        raise TypeError("Don't know how to convert %r" % key)
+    return key
+
+
 # Within edit distance function
 
 def within(graph, text, k=1, prefix=0, address=None):
     ``prefix`` characters of ``text``.
     """
 
+    text = to_labels(text)
     if address is None:
         address = graph._root
 
     sofar = emptybytes
     accept = False
     if prefix:
-        sofar = text[:prefix]
-        arc = graph.find_path(sofar)
+        prefixchars = text[:prefix]
+        arc = graph.find_path(prefixchars)
         if arc is None:
             return
+        sofar = emptybytes.join(prefixchars)
         address = arc.target
         accept = arc.accept
 
         # us to the end and still within K), and we're in the accept state,
         # yield the current result
         if (len(text) - i <= k) and accept:
-            yield sofar
+            yield utf8decode(sofar)[0]
+
         # If we're in the stop state, give up
         if address is None:
             continue

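Most of the dawg.py changes funnel through the new to_labels() helper: bytes, unicode, or a sequence of bytestrings are all normalized to a hashable tuple of bytestrings (one per byte, or one UTF-8 encoded chunk per character), and within() now accepts plain strings and yields decoded strings. A short sketch of what the helper returns (reprs shown as on Python 3):

    from whoosh.compat import b, u
    from whoosh.support import dawg

    print(dawg.to_labels(b("abc")))
    # (b'a', b'b', b'c') -- one single-byte bytestring per byte

    print(dawg.to_labels(u("caf\xe9")))
    # (b'c', b'a', b'f', b'\xc3\xa9') -- each character UTF-8 encoded separately

    print(dawg.to_labels([b("a"), b("b")]))
    # (b'a', b'b') -- lists/tuples of bytestrings pass through as a tuple
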
File src/whoosh/support/filelock.py

             return False
 
     def release(self):
+        if self.fd is None:
+            raise Exception("Lock was not acquired")
+
         import fcntl  #@UnresolvedImport
         fcntl.flock(self.fd, fcntl.LOCK_UN)
         os.close(self.fd)

File src/whoosh/support/testing.py

                                 % (subclass.__name__, attrname))
 
 
-class Timing(object):
-    def __init__(self):
-        pass
 
-    def __enter__(self):
-        self.t = now()
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if not exc_type:
-            print "%0.8f" % (now() - self.t)
-

File src/whoosh/writing.py

     def __del__(self):
         if hasattr(self, "writer") and self.writer:
             if not self.writer.is_closed:
-                self.writer.cancel()
+                try:
+                    self.writer.cancel()
+                except:
+                    pass
             del self.writer
 
     def _create_ramindex(self):

File tests/test_codecs.py

 from __future__ import with_statement
 import random
+from array import array
 
 from nose.tools import assert_equal  #@UnresolvedImport
 
         return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))
 
     def random_token():
-        return "".join(unichr(random.randint(0, 0xd7ff)) for _ in xrange(1, 20))
+        a = array("H", (random.randint(0, 0xd7ff) for _ in xrange(1, 20)))
+        return a.tostring().decode("utf-16")
 
     domain = sorted([(random_fieldname(), random_token()) for _ in xrange(1000)])
 
     assert_equal(block.max_id(), 4)
     assert_equal(list(block.ids), [0, 1, 2, 3, 4])
     assert_equal(list(block.read_weights()), [2.0, 5.0, 3.0, 4.0, 1.0])
-    assert_equal(list(block.read_values()), ["test1", "test2", "test3", "test4", "test5"])
+    assert_equal(list(block.read_values()), [b("test1"), b("test2"), b("test3"),
+                                             b("test4"), b("test5")])
 
     f = st.create_file("test")
     block = standard.StdBlock(0)
     gr = codec.graph_reader(seg)
     assert gr.has_root("text")
     cur = gr.cursor("text")
-    assert_equal(list(cur.flatten()), ["special", "specific"])
+    assert_equal(list(cur.flatten_strings()), ["special", "specific"])
 
 def test_special_spelled_field():
     from whoosh.analysis import StemmingAnalyzer
     assert_equal(list(tr.keys()), [("text", "special"), ("text", "specific")])
 
     cur = codec.graph_reader(seg).cursor("text")
-    assert_equal(list(cur.flatten()), ["specials", "specifically"])
+    assert_equal(list(cur.flatten_strings()), ["specials", "specifically"])
 
 

File tests/test_dawg.py

 import random
 from array import array
 
-from whoosh.compat import b, u
+from whoosh.compat import b, u, xrange
 from whoosh.filedb.filestore import RamStorage
 from whoosh.support import dawg
 from whoosh.support.testing import TempStorage
     f = st.create_file("test")
     gw = dawg.GraphWriter(f)
     for key in keys:
-        gw.insert_string(key)
+        gw.insert(key)
     gw.close()
     return st
 
 
 
 def enlist(string):
-    return [part.encode("utf8") for part in string.split()]
+    return string.split()
 
 #
 
 def test_empty_key():
     gw = dawg.GraphWriter(RamStorage().create_file("test"))
     assert_raises(KeyError, gw.insert, b(""))
+    assert_raises(KeyError, gw.insert, "")
+    assert_raises(KeyError, gw.insert, u(""))
+    assert_raises(KeyError, gw.insert, [])
 
 def test_keys_out_of_order():
     f = RamStorage().create_file("test")
     gw = dawg.GraphWriter(f)
-    gw.insert_string("alfa")
-    assert_raises(KeyError, gw.insert_string, "abba")
+    gw.insert("alfa")
+    assert_raises(KeyError, gw.insert, "abba")
 
 def test_duplicate_keys():
     st = gwrite(enlist("alfa bravo bravo bravo charlie"))
     cur = dawg.Cursor(greader(st))
-    assert_equal(list(cur.flatten()), ["alfa", "bravo", "charlie"])
+    assert_equal(list(cur.flatten_strings()), ["alfa", "bravo", "charlie"])
 
 def test_inactive_raise():
     st = gwrite(enlist("alfa bravo charlie"))
     assert_raises(dawg.InactiveCursor, cur.skip_to, b("a"))
     assert_raises(dawg.InactiveCursor, list, cur.flatten())
     assert_raises(dawg.InactiveCursor, list, cur.flatten_v())
+    assert_raises(dawg.InactiveCursor, list, cur.flatten_strings())
     assert_raises(dawg.InactiveCursor, cur.find_path, b("a"))
 
 def test_types():
         f = st.create_file("test")
         gw = dawg.GraphWriter(f, vtype=t)
         for key, value in domain:
-            gw.insert_string(key, value)
+            gw.insert(key, value)
         gw.close()
 
         f = st.open_file("test")
     with TempStorage() as st:
         gwrite(words, st)
         cur = dawg.Cursor(greader(st))
-        assert_equal(list(cur.flatten()), words)
+        assert_equal(list(cur.flatten_strings()), words)
 
 def test_random():
     def randstring():
         f = st.create_file("test")
         gw = dawg.GraphWriter(f)
         gw.start_field("f1")
-        gw.insert_string("a")
-        gw.insert_string("aa")
-        gw.insert_string("ab")
+        gw.insert("a")
+        gw.insert("aa")
+        gw.insert("ab")
         gw.finish_field()
         gw.start_field("f2")
-        gw.insert_string("ba")
-        gw.insert_string("baa")
-        gw.insert_string("bab")
+        gw.insert("ba")
+        gw.insert("baa")
+        gw.insert("bab")
         gw.close()
 
         gr = dawg.GraphReader(st.open_file("test"))
         cur1 = dawg.Cursor(gr, gr.root("f1"))
         cur2 = dawg.Cursor(gr, gr.root("f2"))
-        assert_equal(list(cur1.flatten()), [b("a"), b("aa"), b("ab")])
-        assert_equal(list(cur2.flatten()), [b("ba"), b("baa"), b("bab")])
+        assert_equal(list(cur1.flatten_strings()), ["a", "aa", "ab"])
+        assert_equal(list(cur2.flatten_strings()), ["ba", "baa", "bab"])
 
 def test_within():
     with TempStorage() as st:
         gwrite(enlist("0 00 000 001 01 010 011 1 10 100 101 11 110 111"), st)
         gr = greader(st)
         s = set(dawg.within(gr, "01", k=1))
-    assert_equal(s, set([b("0"), b("00"), b("01"), b("011"), b("010"),
-                         b("001"), b("10"), b("101"), b("1"), b("11")]))
+    assert_equal(s, set(["0", "00", "01", "011", "010",
+                         "001", "10", "101", "1", "11"]))
 
 def test_within_match():
     st = gwrite(enlist("abc def ghi"))
     gr = greader(st)
-    assert_equal(set(dawg.within(gr, b("def"))), set([b("def")]))
+    assert_equal(set(dawg.within(gr, "def")), set(["def"]))
 
 def test_within_insert():
     st = gwrite(enlist("00 01 10 11"))
     gr = greader(st)
     s = set(dawg.within(gr, "0"))
-    assert_equal(s, set([b("00"), b("01"), b("10")]))
+    assert_equal(s, set(["00", "01", "10"]))
 
 def test_within_delete():
     st = gwrite(enlist("abc def ghi"))
     gr = greader(st)
-    assert_equal(set(dawg.within(gr, b("df"))), set([b("def")]))
+    assert_equal(set(dawg.within(gr, "df")), set(["def"]))
 
     st = gwrite(enlist("0"))
     gr = greader(st)
-    assert_equal(list(dawg.within(gr, b("01"))), [b("0")])
+    assert_equal(list(dawg.within(gr, "01")), ["0"])
 
 def test_within_replace():
     st = gwrite(enlist("abc def ghi"))
     gr = greader(st)
-    assert_equal(set(dawg.within(gr, b("dez"))), set([b("def")]))
+    assert_equal(set(dawg.within(gr, "dez")), set(["def"]))
 
     st = gwrite(enlist("00 01 10 11"))
     gr = greader(st)
-    s = set(dawg.within(gr, b("00")))
-    assert_equal(s, set([b("00"), b("10"), b("01")]), s)
+    s = set(dawg.within(gr, "00"))
+    assert_equal(s, set(["00", "10", "01"]), s)
 
 def test_within_transpose():
     st = gwrite(enlist("abc def ghi"))
     gr = greader(st)
-    s = set(dawg.within(gr, b("dfe")))
-    assert_equal(s, set([b("def")]))
+    s = set(dawg.within(gr, "dfe"))
+    assert_equal(s, set(["def"]))
 
 def test_within_k2():
     st = gwrite(enlist("abc bac cba"))
     gr = greader(st)
-    s = set(dawg.within(gr, b("cb"), k=2))
-    assert_equal(s, set([b("abc"), b("cba")]))
+    s = set(dawg.within(gr, "cb", k=2))
+    assert_equal(s, set(["abc", "cba"]))
 
 def test_within_prefix():
     st = gwrite(enlist("aabc aadc babc badc"))
     gr = greader(st)
-    s = set(dawg.within(gr, b("aaxc"), prefix=2))
-    assert_equal(s, set([b("aabc"), b("aadc")]))
+    s = set(dawg.within(gr, "aaxc", prefix=2))
+    assert_equal(s, set(["aabc", "aadc"]))
 
 def test_skip():
     st = gwrite(enlist("abcd abfg cdqr1 cdqr12 cdxy wxyz"))
     while not cur.stopped(): cur.follow()
     assert_equal(cur.prefix_bytes(), b("abcd"))
     assert cur.accept()
-    cur._pop_to_prefix("abzz")
-    assert_equal(cur.prefix_bytes(), b("abf"))
 
     cur = gr.cursor()
     while not cur.stopped(): cur.follow()
     st = RamStorage()
     gw = dawg.GraphWriter(st.create_file("test"))
     for key in domain:
-        gw.insert_string(key)
+        gw.insert(key)
     gw.close()
 
     cur = dawg.GraphReader(st.open_file("test")).cursor()
     st = RamStorage()
     gw = dawg.GraphWriter(st.create_file("test"))
     for key in domain:
-        gw.insert_string(key)
+        gw.insert(key)
     gw.close()
 
     cur = dawg.GraphReader(st.open_file("test")).cursor()
-    assert_equal(list(cur.flatten()), domain)
+    assert_equal(list(cur.flatten_strings()), domain)
 
 def test_within_unicode():
     domain = [u("\u280b\u2817\u2801\u281d\u2809\u2811"),
     st = RamStorage()
     gw = dawg.GraphWriter(st.create_file("test"))
     for key in domain:
-        gw.insert_string(key)
+        gw.insert(key)
     gw.close()
 
     gr = dawg.GraphReader(st.open_file("test"))

File tests/test_highlighting.py

     w.commit()
 
     with ix.searcher() as s:
-        for x in s.lexicon("text"):
-            print x
         assert ("text", "5000") in s.reader()
         hit = s.search(query.Term("text", "5000"))[0]
         assert_equal(hit.highlights("text"), 'Our BabbleTron<b class="match term0">5000</b> is great')

File tests/test_matching.py

 from nose.tools import assert_equal, assert_not_equal  #@UnresolvedImport
 
 from whoosh import fields, matching, query
-from whoosh.compat import u
+from whoosh.compat import u, xrange
 from whoosh.filedb.filestore import RamStorage
 from whoosh.query import And, Term
 from whoosh.util import make_binary_tree, permutations
 
 def test_listmatcher():
     ids = [1, 2, 5, 9, 10]
-    
+
     lm = matching.ListMatcher(ids)
     ls = []
     while lm.is_active():
         ls.append((lm.id(), lm.score()))
         lm.next()
     assert_equal(ls, [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)])
-    
+
     lm = matching.ListMatcher(ids)
     assert_equal(list(lm.all_ids()), ids)
-    
+
     lm = matching.ListMatcher(ids, position=3)
     ls = []
     while lm.is_active():
         ls.append(lm.id())
         lm.next()
     assert_equal(ls, [9, 10])
-    
+
     lm = matching.ListMatcher(ids)
     for _ in xrange(3):
         lm.next()
         ls.append((wm.id(), wm.score()))
         wm.next()
     assert_equal(ls, [(1, 2.0), (2, 2.0), (5, 2.0), (9, 2.0), (10, 2.0)])
-    
+
     ids = [1, 2, 5, 9, 10]
     wm = matching.WrappingMatcher(matching.ListMatcher(ids), boost=2.0)
     assert_equal(list(wm.all_ids()), ids)
 
 def test_filter():
     lm = lambda: matching.ListMatcher(list(range(2, 10)))
-    
+
     fm = matching.FilterMatcher(lm(), frozenset([3, 9]))
     assert_equal(list(fm.all_ids()), [3, 9])
-    
+
     fm = matching.FilterMatcher(lm(), frozenset([1, 5, 9, 13]))
     assert_equal(list(fm.all_ids()), [5, 9])
 
 def test_exclude():
     em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     assert_equal(list(em.all_ids()), [1, 5, 10])
-    
+
     em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     assert_equal(list(em.all_ids()), [1, 5, 10])
-    
+
     em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True)
     em.next()
     em.next()
         ls.append((um.id(), um.score()))
         um.next()
     assert_equal(ls, [(0, 1.0), (1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     um = matching.UnionMatcher(lm1, lm2)
     assert_equal(list(um.all_ids()), [0, 1, 4, 10, 20, 90])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     um = matching.UnionMatcher(lm1, lm2)
         ls.append(um.id())
         um.next()
     assert_equal(ls, [4, 10, 20, 90])
-    
+
 def test_simple_intersection():
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
         ls.append((im.id(), im.score()))
         im.next()
     assert_equal(ls, [(4, 2.0), (20, 2.0)])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     im = matching.IntersectionMatcher(lm1, lm2)
     assert_equal(list(im.all_ids()), [4, 20])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     im = matching.IntersectionMatcher(lm1, lm2)
         ls.append((anm.id(), anm.score()))
         anm.next()
     assert_equal(ls, [(1, 1.0), (10, 1.0), (90, 1.0)])
-    
+
     echo_lm = matching.ListMatcher([0, 1, 2, 3, 4])
     bravo_lm = matching.ListMatcher([0, 1])
     anm = matching.AndNotMatcher(echo_lm, bravo_lm)
     assert_equal(list(anm.all_ids()), [2, 3, 4])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     anm = matching.AndNotMatcher(lm1, lm2)
     assert_equal(list(anm.all_ids()), [1, 10, 90])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     anm = matching.AndNotMatcher(lm1, lm2)
         ls.append((rm.id(), rm.score()))
         rm.next()
     assert_equal(ls, [(4, 1.0), (20, 1.0)])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     rm = matching.RequireMatcher(lm1, lm2)
     assert_equal(list(rm.all_ids()), [4, 20])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     rm = matching.RequireMatcher(lm1, lm2)
         ls.append((amm.id(), amm.score()))
         amm.next()
     assert_equal(ls, [(1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     amm = matching.AndMaybeMatcher(lm1, lm2)
     assert_equal(list(amm.all_ids()), [1, 4, 10, 20, 90])
-    
+
     lm1 = matching.ListMatcher([1, 4, 10, 20, 90])
     lm2 = matching.ListMatcher([0, 4, 20])
     amm = matching.AndMaybeMatcher(lm1, lm2)
     assert_equal(ls, [10, 20, 90])
 
 def test_intersection():
-    schema = fields.Schema(key = fields.ID(stored=True), value = fields.TEXT(stored=True))
+    schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT(stored=True))
     st = RamStorage()
     ix = st.create_index(schema)
-    
+
     w = ix.writer()
     w.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
     w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
     w.add_document(key=u("c"), value=u("charlie delta golf hotel"))
     w.commit()
-    
+
     w = ix.writer()
     w.add_document(key=u("d"), value=u("india alpha bravo charlie"))
     w.add_document(key=u("e"), value=u("delta bravo india bravo"))
     w.commit()
-    
+
     with ix.searcher() as s:
         q = And([Term("value", u("bravo")), Term("value", u("delta"))])
         m = q.matcher(s)
         assert_equal(_keys(s, m.all_ids()), ["a", "e"])
-        
+
         q = And([Term("value", u("bravo")), Term("value", u("alpha"))])
         m = q.matcher(s)
         assert_equal(_keys(s, m.all_ids()), ["a", "b", "d"])
-    
+
 def test_random_intersections():
     domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
               u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), u("kilo"),
     docsperseg = 50
     fieldlimits = (3, 10)
     documents = []
-    
+
     schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
     st = RamStorage()
     ix = st.create_index(schema)
-    
+
     # Create docsperseg * segments documents containing random words from
     # the domain list. Add the documents to the index, but also keep them
     # in the "documents" list for the sanity check
             documents.append((docnum, doc))
         w.commit()
     assert_not_equal(len(ix._segments()), 1)
-    
+
     testcount = 20
     testlimits = (2, 5)
-    
+
     with ix.searcher() as s:
         for i in xrange(s.doc_count_all()):
             assert_not_equal(s.stored_fields(i).get("key"), None)
-        
+
         for _ in xrange(testcount):
             # Create a random list of words and manually do an intersection of
             # items in "documents" that contain the words ("target").
                 if all((doc.find(w) > -1) for w in words):
                     target.append(docnum)
             target.sort()
-            
+
             # Create a query from the list of words and get two matchers from
             # it.
             q = And([Term("value", w) for w in words])
             m1 = q.matcher(s)
             m2 = q.matcher(s)
-            
+
             # Try getting the list of IDs from all_ids()
             ids1 = list(m1.all_ids())
-            
+
             # Try getting the list of IDs using id()/next()
             ids2 = []
             while m2.is_active():
                 ids2.append(m2.id())
                 m2.next()
-            
+
             # Check that the two methods return the same list
             assert_equal(ids1, ids2)
-            
+
             # Check that the IDs match the ones we manually calculated
             assert_equal(_keys(s, ids1), target)
 
     target = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 100, 200]
     um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3))
     assert_equal(target, list(um.all_ids()))
-    
+
 def test_union_scores():
     s1 = matching.ListMatcher([1, 2, 3])
     s2 = matching.ListMatcher([2, 4, 8])
     testcount = 100
     rangelimits = (2, 10)
     clauselimits = (2, 10)
-    
+
     vals = list(range(100))
-    
+
     for _ in xrange(testcount):
         target = set()
         matchers = []
         ids.append(inv.id())
         inv.next()
     assert_equal(ids, [0, 2, 3, 4, 6, 7, 8, 9, 12, 14])
-    
+
 def test_inverse_skip():
     s = matching.ListMatcher([1, 5, 10, 11, 13])
     inv = matching.InverseMatcher(s, 15)
     inv.skip_to(8)
-    
+
     ids = []
     while inv.is_active():
         ids.append(inv.id())
     anm = matching.AndNotMatcher(pos, neg)
     assert not anm.is_active()
     assert_equal(list(anm.all_ids()), [])
-    
+
     pos = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     neg = matching.NullMatcher()
     ans = matching.AndNotMatcher(pos, neg)
 def test_random_andnot():
     testcount = 100
     rangesize = 100
-    
+
     rng = list(range(rangesize))
-    
+
     for _ in xrange(testcount):
-        negs = sorted(sample(rng, randint(0, rangesize-1)))
+        negs = sorted(sample(rng, randint(0, rangesize - 1)))
         negset = frozenset(negs)
         matched = [n for n in rng if n not in negset]
-        
+
         pos = matching.ListMatcher(rng)
         neg = matching.ListMatcher(negs)
-        
+
         anm = matching.AndNotMatcher(pos, neg)
         ids = list(anm.all_ids())
         assert_equal(ids, matched)
     for ls in permutations(domain, 3):
         w.add_document(text=" ".join(ls), _stored_text=ls)
     w.commit()
-    
+
     with ix.searcher() as s:
         q = query.And([query.Term("text", "alfa"), query.Term("text", "charlie")])
         m = q.matcher(s)
-    
+
         while m.is_active():
             assert_equal(sorted(m.matching_terms()), [("text", "alfa"), ("text", "charlie")])
             m.next()

File tests/test_parsing.py

 def test_singlequote_multitoken():
     schema = fields.Schema(text=fields.TEXT(multitoken_query="or"))
     parser = default.QueryParser("text", schema)
-    q = parser.parse(u"foo bar")
+    q = parser.parse(u("foo bar"))
     assert_equal(q.__unicode__(), "(text:foo AND text:bar)")
 
-    q = parser.parse(u"'foo bar'")  # single quotes
+    q = parser.parse(u("'foo bar'"))  # single quotes
     assert_equal(q.__unicode__(), "(text:foo OR text:bar)")
 
 def test_operator_queries():
 
     qp = default.QueryParser("text", schema)
     q = qp.parse(qtext)
-    assert_equal(repr(q), "Wildcard('text', u'*ben-hayden*')")
+    assert_equal(q.__class__, query.Wildcard)
+    assert_equal(q.fieldname, "text")
+    assert_equal(q.text, "*ben-hayden*")
 
     qp = default.MultifieldParser(["title", "text", "time"], schema)
     q = qp.parse(qtext)
-    assert_equal(repr(q), "Or([Wildcard('title', u'*ben-hayden*'), Wildcard('text', u'*ben-hayden*'), Wildcard('time', u'*Ben-Hayden*')])")
+    assert_equal(q.__unicode__(),
+                 "(title:*ben-hayden* OR text:*ben-hayden* OR time:*Ben-Hayden*)")
 
 

File tests/test_reading.py

             w.commit(merge=False)
 
         def fn():
-            for _ in xrange(10):
+            for _ in xrange(5):
                 r = ix.reader()
+                assert_equal(list(r.lexicon("text")),
+                             ["document", "five", "four", "one", "test", "three", "two"])
                 r.close()
 
-        ths = [threading.Thread(target=fn) for _ in xrange(10)]
+        ths = [threading.Thread(target=fn) for _ in xrange(5)]
         for th in ths:
             th.start()
         for th in ths:

File tests/test_results.py

     hl = highlight.Highlighter()
     ucf = highlight.UppercaseFormatter()
     r.highlighter = hl
-    print "fmt=", r.formatter
     r.formatter = ucf
-    print r.formatter
     assert hl.formatter is ucf
 
 def test_snippets():

File tests/test_sorting.py

         fc = r.fieldcache("id")
         assert r.fieldcache_loaded("id")
         assert_equal(list(fc.order), [3, 1, 5, 2, 4])
-        assert_equal(list(fc.texts), [u('\uffff'), u'alfa', u'bravo',
-                                      u'charlie', u'delta', u'echo'])
+        assert_equal(list(fc.texts), [u('\uffff'), 'alfa', 'bravo',
+                                      'charlie', 'delta', 'echo'])
 
 def test_float_cache():
     schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(type=float))

File tests/test_spans.py

 
 def get_index():
     global _ix
-    
+
     if _ix is not None:
         return _ix
-    
+
     charfield = fields.FieldType(formats.Characters(), analysis.SimpleAnalyzer(),
                                  scorable=True, stored=True)
     schema = fields.Schema(text=charfield)
     st = RamStorage()
     _ix = st.create_index(schema)
-    
+
     w = _ix.writer()
     for ls in permutations(domain, 4):
         w.add_document(text=u(" ").join(ls), _stored_text=ls)
     w.commit()
-    
+
     return _ix
 
 def test_multimatcher():
     schema = fields.Schema(content=fields.TEXT(stored=True))
     ix = RamStorage().create_index(schema)
-    
+
     domain = ("alfa", "bravo", "charlie", "delta")
-    
+
     for _ in xrange(3):
         w = ix.writer()
         for ls in permutations(domain):
             w.add_document(content=u(" ").join(ls))
         w.commit(merge=False)
-    
+
     q = Term("content", "bravo")
     with ix.searcher() as s:
         m = q.matcher(s)
 def test_excludematcher():
     schema = fields.Schema(content=fields.TEXT(stored=True))
     ix = RamStorage().create_index(schema)
-    
+
     domain = ("alfa", "bravo", "charlie", "delta")
-    
+
     for _ in xrange(3):
         w = ix.writer()
         for ls in permutations(domain):
             w.add_document(content=u(" ").join(ls))
         w.commit(merge=False)
-    
+
     w = ix.writer()
     w.delete_document(5)
     w.delete_document(10)
     w.delete_document(28)
     w.commit(merge=False)
-    
+
     q = Term("content", "bravo")
     with ix.searcher() as s:
         m = q.matcher(s)
             for span in spans:
                 assert_equal(content[span.start], "bravo")
             m.next()
-    
+
 
 def test_span_term():
     ix = get_index()
     with ix.searcher() as s:
         alllists = [d["text"] for d in s.all_stored_fields()]
-        
+
         for word in domain:
             q = Term("text", word)
             m = q.matcher(s)
-            
+
             ids = set()
             while m.is_active():
                 id = m.id()
                 ids.add(id)
                 original = list(s.stored_fields(id)["text"])
                 assert word in original, "%r not in %r" % (word, original)
-                
+
                 if word != "bravo":
                     assert_equal(len(sps), 1)
                 assert_equal(original.index(word), sps[0].start)
                 assert_equal(original.index(word), sps[0].end)
                 m.next()
-        
+
             for i, ls in enumerate(alllists):
                 if word in ls:
                     assert i in ids
                 else:
                     assert i not in ids
-                
+
 def test_span_first():
     ix = get_index()
     with ix.searcher() as s:
                 assert_equal(sps[0].start, 0)
                 assert_equal(sps[0].end, 0)
                 m.next()
-                
+
         q = spans.SpanFirst(Term("text", "bravo"), limit=1)
         m = q.matcher(s)
         while m.is_active():
             for sp in m.spans():
                 assert_equal(orig[sp.start], "bravo")
             m.next()
-        
+
 def test_span_near():
     ix = get_index()
     with ix.searcher() as s:
             while m.is_active():
                 yield s.stored_fields(m.id())["text"], m.spans()
                 m.next()
-                
+
         for orig, sps in test(spans.SpanNear(Term("text", "alfa"), Term("text", "bravo"), ordered=True)):
             assert_equal(orig[sps[0].start], "alfa")
             assert_equal(orig[sps[0].end], "bravo")
-            
+
         for orig, sps in test(spans.SpanNear(Term("text", "alfa"), Term("text", "bravo"), ordered=False)):
             first = orig[sps[0].start]
             second = orig[sps[0].end]
             assert (first == "alfa" and second == "bravo"
                             or (first == "bravo" and second == "alfa"))
-            
+
         for orig, sps in test(spans.SpanNear(Term("text", "bravo"), Term("text", "bravo"), ordered=True)):
             text = " ".join(orig)
             assert text.find("bravo bravo") > -1
-            
+
         q = spans.SpanNear(spans.SpanNear(Term("text", "alfa"), Term("text", "charlie")), Term("text", "echo"))
         for orig, sps in test(q):
             text = " ".join(orig)
             assert text.find("alfa charlie echo") > -1
-            
+
         q = spans.SpanNear(Or([Term("text", "alfa"), Term("text", "charlie")]), Term("text", "echo"), ordered=True)
         for orig, sps in test(q):
             text = " ".join(orig)
     w.add_document(text=u("alfa charlie bravo delta echo"))
     w.add_document(text=u("echo delta alfa foxtrot"))
     w.commit()
-    
+
     with ix.searcher() as s:
         q = spans.SpanNear(Term("text", "bravo"), Term("text", "charlie"), ordered=False)
         r = sorted(d["text"] for d in s.search(q))
         assert_equal(r, [u('alfa bravo charlie delta echo'),
                              u('alfa charlie bravo delta echo')])
-    
+
 def test_span_near2():
     ana = analysis.SimpleAnalyzer()
     schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True))
     w = ix.writer()
     w.add_document(text=u("The Lucene library is by Doug Cutting and Whoosh was made by Matt Chaput"))
     w.commit()
-    
+
     nq1 = spans.SpanNear(Term("text", "lucene"), Term("text", "doug"), slop=5)
     nq2 = spans.SpanNear(nq1, Term("text", "whoosh"), slop=4)
-    
+
     with ix.searcher() as s:
         m = nq2.matcher(s)
         assert_equal(m.spans(), [spans.Span(1, 8)])
-    
+
 def test_span_not():
     ix = get_index()
     with ix.searcher() as s:
             if "bravo" in orig:
                 assert orig.index("bravo") != i1 + 1
             m.next()
-        
+
 def test_span_or():
     ix = get_index()
     with ix.searcher() as s:
     with ix.searcher() as s:
         nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), slop=3)
         cq = spans.SpanContains(nq, Term("text", "echo"))
-        
+
         m = cq.matcher(s)
         ls = []
         while m.is_active():
             m.next()
         ls.sort()
         assert_equal(ls, ['alfa bravo echo charlie', 'alfa bravo echo charlie',
-                              'alfa delta echo charlie', 'alfa echo bravo charlie',
-                              'alfa echo bravo charlie', 'alfa echo charlie bravo',
-                              'alfa echo charlie bravo', 'alfa echo charlie delta',
-                              'alfa echo delta charlie', 'bravo alfa echo charlie',
-                              'bravo alfa echo charlie', 'delta alfa echo charlie'])
+                          'alfa delta echo charlie', 'alfa echo bravo charlie',
+                          'alfa echo bravo charlie', 'alfa echo charlie bravo',
+                          'alfa echo charlie bravo', 'alfa echo charlie delta',
+                          'alfa echo delta charlie', 'bravo alfa echo charlie',
+                          'bravo alfa echo charlie', 'delta alfa echo charlie'])
 
 def test_span_before():
     ix = get_index()
                 v = orig[span.start]
                 assert v == "bravo" or v == "alfa"
             m.next()
-        
+
 def test_regular_and():
     ix = get_index()
     with ix.searcher() as s:

File tests/test_spelling.py

     with ix.reader() as r:
         assert not r.is_atomic()
         assert r.has_word_graph("text")
-        words = list(r.word_graph("text").flatten())
+        words = list(r.word_graph("text").flatten_strings())
         assert_equal(words, sorted(domain))
 
         corr = r.corrector("text")
         assert r.is_atomic()
         assert_equal(list(r.lexicon("text")), sorted(domain))
         assert r.has_word_graph("text")
-        words = list(r.word_graph("text").flatten())
+        words = list(r.word_graph("text").flatten_strings())
         assert_equal(words, sorted(domain))
 
         corr = r.corrector("text")
     with ix.reader() as r:
         assert_equal(list(r.lexicon("text")),
                      ["model", "reaction", "render", "shade"])
-        assert_equal(list(r.word_graph("text").flatten()),
+        assert_equal(list(r.word_graph("text").flatten_strings()),
                      ["modeling", "reactions", "rendering", "shading"])
 
 def test_spelling_field_order():

File tests/test_tables.py

 
 from nose.tools import assert_equal  #@UnresolvedImport
 
-from whoosh.compat import u, b, xrange, iteritems, unichr
-from whoosh.filedb.filestore import RamStorage
+from whoosh.compat import b, xrange, iteritems
 from whoosh.filedb.filetables import (HashReader, HashWriter,
                                       OrderedHashWriter, OrderedHashReader)
 from whoosh.support.testing import TempStorage
 
 
-def randstring(domain, minlen, maxlen):
-    return "".join(random.sample(domain, random.randint(minlen, maxlen)))
-
-
 def test_hash():
     with TempStorage("hash") as st:
         hwf = st.create_file("test.hsh")
         hw = HashWriter(hwf)
-        hw.add("foo", "bar")
-        hw.add("glonk", "baz")
+        hw.add(b("foo"), b("bar"))
+        hw.add(b("glonk"), b("baz"))
         hw.close()
 
         hrf = st.open_file("test.hsh")
         hr = HashReader(hrf)
-        assert_equal(hr.get("foo"), b("bar"))
-        assert_equal(hr.get("baz"), None)
+        assert_equal(hr.get(b("foo")), b("bar"))
+        assert_equal(hr.get(b("baz")), None)
         hr.close()
 
 def test_hash_contents():
-    samp = set((('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
-               ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
-               ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
-               ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray')))
+    samp = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
+            ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
+            ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
+            ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray'),
+            ]
+    # Convert to bytes
+    samp = set((b(k), b(v)) for k, v in samp)
 
     with TempStorage("hashcontents") as st:
         hwf = st.create_file("test.hsh")
         hr.close()
 
 def test_random_hash():
+    from string import ascii_letters as domain
+
+    times = 1000
+    minlen = 1
+    maxlen = len(domain)
+
+    def randstring():
+        s = "".join(random.sample(domain, random.randint(minlen, maxlen)))
+        return b(s)
+
     with TempStorage("randomhash") as st:
-        domain = "abcdefghijklmnopqrstuvwxyz"
-        domain += domain.upper()
-        times = 1000
-        minlen = 1
-        maxlen = len(domain)
-
-        samp = dict((randstring(domain, minlen, maxlen),
-                     randstring(domain, minlen, maxlen)) for _ in xrange(times))
+        samp = dict((randstring(), randstring()) for _ in xrange(times))
 
         hwf = st.create_file("test.hsh")
         hw = HashWriter(hwf)
         hrf = st.open_file("test.hsh")
         hr = HashReader(hrf)
         for k in keys:
-            v = hr[k]
-            assert_equal(v, b(samp[k]))
+            assert_equal(hr[k], samp[k])
         hr.close()
 
 def test_ordered_hash():
     with TempStorage("orderedhash") as st:
         hwf = st.create_file("test.hsh")
         hw = HashWriter(hwf)
-        hw.add_all(("%08x" % x, str(x)) for x in xrange(times))
+        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
         hw.close()
 
         keys = list(range(times))
         hrf = st.open_file("test.hsh")
         hr = HashReader(hrf)
         for x in keys:
-            assert_equal(hr["%08x" % x], b(str(x)))
+            assert_equal(hr[b("%08x" % x)], b(str(x)))
         hr.close()
 
 def test_ordered_closest():
     keys = ['alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
             'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november']
-    values = [''] * len(keys)
+    # Make into bytes for Python 3
+    keys = [b(k) for k in keys]
+    values = [b('')] * len(keys)
 
     with TempStorage("orderedclosest") as st:
         hwf = st.create_file("test.hsh")
         hrf = st.open_file("test.hsh")
         hr = OrderedHashReader(hrf)
         ck = hr.closest_key
-        assert_equal(ck(''), b('alfa'))
-        assert_equal(ck(' '), b('alfa'))
-        assert_equal(ck('alfa'), b('alfa'))
-        assert_equal(ck('bravot'), b('charlie'))
-        assert_equal(ck('charlie'), b('charlie'))
-        assert_equal(ck('kiloton'), b('lima'))
-        assert_equal(ck('oskar'), None)
-        assert_equal(list(hr.keys()), [b(k) for k in keys])
-        assert_equal(list(hr.values()), [b(v) for v in values])
-        assert_equal(list(hr.keys_from('f')), [b(k) for k in keys[5:]])
+        assert_equal(ck(b('')), b('alfa'))
+        assert_equal(ck(b(' ')), b('alfa'))
+        assert_equal(ck(b('alfa')), b('alfa'))
+        assert_equal(ck(b('bravot')), b('charlie'))
+        assert_equal(ck(b('charlie')), b('charlie'))
+        assert_equal(ck(b('kiloton')), b('lima'))
+        assert_equal(ck(b('oskar')), None)
+        assert_equal(list(hr.keys()), keys)
+        assert_equal(list(hr.values()), values)
+        assert_equal(list(hr.keys_from(b('f'))), keys[5:])
         hr.close()
 
 

File tests/test_vectors.py

     writer.commit()
 
     with ix.reader() as r:
-        for x in r.all_terms():
-            print x
         vec = list(r.vector_as("frequency", 0, "content"))
         assert_equal(vec, [(u('\u13ac\u13ad\u13ae'), 1),
                            (u('\u13af\u13b0\u13b1'), 1)])