Commits

Vinay Sajip committed f5c1a0a

Interim update. All tests pass on 2.7; on 3.2 there is 1 skip and 3 failures.

  • Parent commits 273279b


Files changed (82)
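
The port follows one pattern throughout: Python 2-only constructs (print statements, unicode/basestring/long, dict.iteritems(), iterator.next(), cPickle, urllib.urlretrieve) are replaced by names re-exported from a new whoosh.compat module, and u(...)/b(...) helpers stand in for the u"..." and b"..." literal prefixes. A rough, illustrative sketch of the pattern (the helper names come from src/whoosh/compat.py further down; the describe() function is invented for the example):

    # Illustrative only: a module written against whoosh.compat so the same
    # source runs on Python 2 and 3. The helpers are from compat.py below;
    # describe() is a made-up example, not part of the commit.
    from whoosh.compat import u, b, iteritems, text_type, xrange

    def describe(mapping):
        # iteritems() hides dict.iteritems() (2.x) vs dict.items() (3.x)
        for key, value in iteritems(mapping):
            assert isinstance(key, text_type)  # unicode on 2, str on 3
            print("%s=%r" % (key, value))      # print() works on both

    token = u("hello")   # u("...") replaces the u"..." literal prefix
    magic = b("HASH")    # b("...") yields bytes on both versions
    for i in xrange(3):  # xrange on 2, range on 3
        describe({token: i})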

File benchmark/enron.py

 import os.path, tarfile
 from email import message_from_string
 from marshal import dump, load
-from urllib import urlretrieve
 from zlib import compress, decompress
 
 try:
     pass
 
 from whoosh import analysis, fields
+from whoosh.compat import urlretrieve, next
 from whoosh.support.bench import Bench, Spec
 from whoosh.util import now
 
     # the messages in an easier-to-digest format
     
     def download_archive(self, archive):
-        print "Downloading Enron email archive to %r..." % archive
+        print("Downloading Enron email archive to %r..." % archive)
         t = now()
         urlretrieve(self.enron_archive_url, archive)
-        print "Downloaded in ", now() - t, "seconds"
+        print("Downloaded in ", now() - t, "seconds")
     
     @staticmethod
     def get_texts(archive):
         archive = tarfile.open(archive, "r:gz")
         while True:
-            entry = archive.next()
+            entry = next(archive)
             archive.members = []
             if entry is None:
                 break
             yield d
     
     def cache_messages(self, archive, cache):
-        print "Caching messages in %s..." % cache
+        print("Caching messages in %s..." % cache)
         
         if not os.path.exists(archive):
             raise Exception("Archive file %r does not exist" % archive)
         for d in self.get_messages(archive):
             c += 1
             dump(d, f)
-            if not c % 1000: print c
+            if not c % 1000: print(c)
         f.close()
-        print "Cached messages in ", now() - t, "seconds"
+        print("Cached messages in ", now() - t, "seconds")
 
     def setup(self):
         archive = os.path.abspath(os.path.join(self.options.dir, self.enron_archive_filename))
         if not os.path.exists(archive):
             self.download_archive(archive)
         else:
-            print "Archive is OK"
+            print("Archive is OK")
         
         if not os.path.exists(cache):
             self.cache_messages(archive, cache)
         else:
-            print "Cache is OK"
+            print("Cache is OK")
     
     def documents(self):
         if not os.path.exists(self.cache_filename):

File benchmark/wikipedia.py

             if not self.pagecount % 1000:
                 n = now()
                 t = n - self.stime
-                print self.pagecount, self.textcount, n - self.stime_block, t/60
+                print(self.pagecount, self.textcount, n - self.stime_block, t/60)
                 self.stime_block = n
         elif name == "title":
             self.intitle = False
 t = now()
 h = WPHandler()
 parser = xml.sax.parse(f, h)
-print now() - t
-print h.pagecount
+print(now() - t)
+print(h.pagecount)
 

File scripts/release.py

 import sys, os.path
-from ConfigParser import ConfigParser
+try:
+    from ConfigParser import ConfigParser
+except ImportError:
+    from configparser import ConfigParser
 from optparse import OptionParser
 from os import system
 

File src/whoosh/analysis.py

 from collections import deque
 from itertools import chain
 
+from whoosh.compat import (callable, iteritems, string_type, text_type, u,
+                           unichr, xrange, next)
 from whoosh.lang.dmetaphone import double_metaphone
 from whoosh.lang.porter import stem
 from whoosh.util import lru_cache, unbound_cache
     
     def __repr__(self):
         parms = ", ".join("%s=%r" % (name, value)
-                          for name, value in self.__dict__.iteritems())
+                          for name, value in iteritems(self.__dict__))
         return "%s(%s)" % (self.__class__.__name__, parms)
         
     def copy(self):
         if self.__dict__:
             attrs = ", ".join("%s=%r" % (key, value)
                               for key, value
-                              in self.__dict__.iteritems())
+                              in iteritems(self.__dict__))
         return self.__class__.__name__ + "(%s)" % attrs
 
 
     untokenized fields, such as a document's path.
     
     >>> idt = IDTokenizer()
-    >>> [token.text for token in idt(u"/a/b 123 alpha")]
-    [u"/a/b 123 alpha"]
+    >>> [token.text for token in idt(u("/a/b 123 alpha"))] == [u("/a/b 123 alpha")]
+    True
     """
     
     def __call__(self, value, positions=False, chars=False,
                  keeporiginal=False, removestops=True,
                  start_pos=0, start_char=0, mode='',
                  **kwargs):
-        assert isinstance(value, unicode), "%r is not unicode" % value
+        assert isinstance(value, text_type), "%r is not unicode" % value
         t = Token(positions, chars, removestops=removestops, mode=mode)
         t.text = value
         t.boost = 1.0
     Uses a regular expression to extract tokens from text.
     
     >>> rex = RegexTokenizer()
-    >>> [token.text for token in rex(u"hi there 3.141 big-time under_score")]
-    [u"hi", u"there", u"3.141", u"big", u"time", u"under_score"]
+    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))] == [u("hi"), u("there"), u("3.141"), u("big"), u("time"), u("under_score")]
+    True
     """
     
-    __inittypes__ = dict(expression=unicode, gaps=bool)
+    __inittypes__ = dict(expression=text_type, gaps=bool)
     
     def __init__(self, expression=default_pattern, gaps=False):
         """
             than matching on the expression.
         """
         
-        if isinstance(expression, basestring):
+        if isinstance(expression, string_type):
             self.expression = re.compile(expression, re.UNICODE)
         else:
             self.expression = expression
         :param tokenize: if True, the text should be tokenized.
         """
         
-        assert isinstance(value, unicode), "%r is not unicode" % value
+        assert isinstance(value, text_type), "%r is not unicode" % value
         
         t = Token(positions, chars, removestops=removestops, mode=mode)
         if not tokenize:
         :param tokenize: if True, the text should be tokenized.
         """
         
-        assert isinstance(value, unicode), "%r is not unicode" % value
+        assert isinstance(value, text_type), "%r is not unicode" % value
         
         t = Token(positions, chars, removestops=removestops, mode=mode)
         if not tokenize:
                 t.endchar = start_char + len(value)
             yield t
         else:
-            text = u""
+            text = u("")
             charmap = self.charmap
             pos = start_pos
             startchar = currentchar = start_char
                             t.endchar = currentchar
                         yield t
                     startchar = currentchar + 1
-                    text = u""
+                    text = u("")
                     
                 currentchar += 1
             
     """Returns a RegexTokenizer that splits tokens by whitespace.
     
     >>> sst = SpaceSeparatedTokenizer()
-    >>> [token.text for token in sst(u"hi there big-time, what's up")]
-    [u"hi", u"there", u"big-time,", u"what's", u"up"]
+    >>> [token.text for token in sst(u("hi there big-time, what's up"))] == [u("hi"), u("there"), u("big-time,"), u("what's"), u("up")]
+    True
     """
     
     return RegexTokenizer(r"[^ \t\r\n]+")
     expression.
     
     >>> cst = CommaSeparatedTokenizer()
-    >>> [token.text for token in cst(u"hi there, what's , up")]
-    [u"hi there", u"what's", u"up"]
+    >>> [token.text for token in cst(u("hi there, what's , up"))] == [u("hi there"), u("what's"), u("up")]
+    True
     """
     
     return RegexTokenizer(r"[^,]+") | StripFilter()
     """Splits input text into N-grams instead of words.
     
     >>> ngt = NgramTokenizer(4)
-    >>> [token.text for token in ngt(u"hi there")]
-    [u"hi t", u"i th", u" the", u"ther", u"here"]
-    
+    >>> [token.text for token in ngt(u("hi there"))] == [u("hi t"), u("i th"), u(" the"), u("ther"), u("here")]
+    True
+
     Note that this tokenizer does NOT use a regular expression to extract
     words, so the grams emitted by it will contain whitespace, punctuation,
     etc. You may want to massage the input or add a custom filter to this
     def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                  removestops=True, start_pos=0, start_char=0, mode='',
                  **kwargs):
-        assert isinstance(value, unicode), "%r is not unicode" % value
+        assert isinstance(value, text_type), "%r is not unicode" % value
         
         inlen = len(value)
         t = Token(positions, chars, removestops=removestops, mode=mode)
     
     def __call__(self, tokens):
         # Only selects on the first token
-        t = tokens.next()
+        t = next(tokens)
         filter = self.filters[t.mode]
         return filter(chain([t], tokens))
         
     """Reverses the text of each token.
     
     >>> ana = RegexTokenizer() | ReverseTextFilter()
-    >>> [token.text for token in ana(u"hello there")]
-    [u"olleh", u"ereht"]
+    >>> [token.text for token in ana(u("hello there"))] == [u("olleh"), u("ereht")]
+    True
     """
     
     def __call__(self, tokens):
     """Uses unicode.lower() to lowercase token text.
     
     >>> rext = RegexTokenizer()
-    >>> stream = rext(u"This is a TEST")
-    >>> [token.text for token in LowercaseFilter(stream)]
-    [u"this", u"is", u"a", u"test"]
+    >>> stream = rext(u("This is a TEST"))
+    >>> [token.text for token in LowercaseFilter(stream)] == [u("this"), u("is"), u("a"), u("test")]
+    True
     """
     
     def __call__(self, tokens):
     default removes them).
     
     >>> rext = RegexTokenizer()
-    >>> stream = rext(u"this is a test")
+    >>> stream = rext(u("this is a test"))
     >>> stopper = StopFilter()
-    >>> [token.text for token in stopper(stream)]
-    [u"this", u"test"]
+    >>> [token.text for token in stopper(stream)] == [u("this"), u("test")]
+    True
     
     """
 
     single word in the index.
     
     >>> stemmer = RegexTokenizer() | StemFilter()
-    >>> [token.text for token in stemmer(u"fundamentally willows")]
-    [u"fundament", u"willow"]
+    >>> [token.text for token in stemmer(u("fundamentally willows"))] == [u("fundament"), u("willow")]
+    True
     
     You can pass your own stemming function to the StemFilter. The default
     is the Porter stemming algorithm for English.
     """Splits token text into N-grams.
     
     >>> rext = RegexTokenizer()
-    >>> stream = rext(u"hello there")
+    >>> stream = rext(u("hello there"))
     >>> ngf = NgramFilter(4)
-    >>> [token.text for token in ngf(stream)]
-    [u"hell", u"ello", u"ther", u"here"]
+    >>> [token.text for token in ngf(stream)] == [u("hell"), u("ello"), u("ther"), u("here")]
+    True
     
     """
     
     lowers = re.escape("".join(lowers))
     letters = uppers + lowers
     
-    __inittypes__ = dict(delims=unicode, splitwords=bool, splitnums=bool,
+    __inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool,
                          mergewords=bool, mergenums=bool)
     
-    def __init__(self, delims=u"-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+",
+    def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"),
                  splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
         """
         self.delims = re.escape(delims)
         
         # Expression for splitting at delimiter characters
-        self.splitter = re.compile(u"[%s]+" % (self.delims,), re.UNICODE)
+        self.splitter = re.compile(u("[%s]+") % (self.delims,), re.UNICODE)
         # Expression for removing "'s" from the end of sub-words
-        dispat = u"(?<=[%s])'[Ss](?=$|[%s])" % (self.letters, self.delims)
+        dispat = u("(?<=[%s])'[Ss](?=$|[%s])") % (self.letters, self.delims)
         self.disposses = re.compile(dispat, re.UNICODE)
         
         # Expression for finding case and letter-number transitions
-        lower2upper = u"[%s][%s]" % (self.lowers, self.uppers)
-        letter2digit = u"[%s][%s]" % (self.letters, self.digits)
-        digit2letter = u"[%s][%s]" % (self.digits, self.letters)
+        lower2upper = u("[%s][%s]") % (self.lowers, self.uppers)
+        letter2digit = u("[%s][%s]") % (self.letters, self.digits)
+        digit2letter = u("[%s][%s]") % (self.digits, self.letters)
         if splitwords and splitnums:
-            splitpat = u"(%s|%s|%s)" % (lower2upper, letter2digit, digit2letter)
+            splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit, digit2letter)
             self.boundary = re.compile(splitpat, re.UNICODE)
         elif splitwords:
-            self.boundary = re.compile(unicode(lower2upper), re.UNICODE)
+            self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
         elif splitnums:
-            numpat = u"(%s|%s)" % (letter2digit, digit2letter)
+            numpat = u("(%s|%s)") % (letter2digit, digit2letter)
             self.boundary = re.compile(numpat, re.UNICODE)
         
         self.splitting = splitwords or splitnums
                 if len(buf) > 1:
                     # If the buffer has at least two parts in it, merge them
                     # and add them to the original list of parts.
-                    parts.insert(insertat, (pos - 1, u"".join(buf)))
+                    parts.insert(insertat, (pos - 1, u("").join(buf)))
                     insertat += 1
                 # Reset the buffer
                 buf = [part]
         # If there are parts left in the buffer at the end, merge them and add
         # them to the original list.
         if len(buf) > 1:
-            parts.append((pos, u"".join(buf)))
+            parts.append((pos, u("").join(buf)))
     
     def __call__(self, tokens):
         disposses = self.disposses.sub
     
     >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost")
     >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter()
-    >>> for t in ana(u"image render^2 file^0.5")
-    ...    print "%r %f" % (t.text, t.boost)
+    >>> for t in ana(u("image render^2 file^0.5"))
+    ...    print("%r %f" % (t.text, t.boost))
     'image' 1.0
     'render' 2.0
     'file' 0.5
         :param replacement: the substitution text.
         """
         
-        if isinstance(pattern, basestring):
+        if isinstance(pattern, string_type):
             pattern = re.compile(pattern, re.UNICODE)
         self.pattern = pattern
         self.replacement = replacement
     """Parses whitespace- or comma-separated tokens.
     
     >>> ana = KeywordAnalyzer()
-    >>> [token.text for token in ana(u"Hello there, this is a TEST")]
-    [u"Hello", u"there,", u"this", u"is", u"a", u"TEST"]
+    >>> [token.text for token in ana(u("Hello there, this is a TEST"))] == [u("Hello"), u("there,"), u("this"), u("is"), u("a"), u("TEST")]
+    True
     
     :param lowercase: whether to lowercase the tokens.
     :param commas: if True, items are separated by commas rather than whitespace.
     """
     
     return RegexTokenizer(expression=expression, gaps=gaps)
-RegexAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool)
+RegexAnalyzer.__inittypes__ = dict(expression=text_type, gaps=bool)
 
 
 def SimpleAnalyzer(expression=default_pattern, gaps=False):
     """Composes a RegexTokenizer with a LowercaseFilter.
     
     >>> ana = SimpleAnalyzer()
-    >>> [token.text for token in ana(u"Hello there, this is a TEST")]
-    [u"hello", u"there", u"this", u"is", u"a", u"test"]
+    >>> [token.text for token in ana(u("Hello there, this is a TEST"))] == [u("hello"), u("there"), u("this"), u("is"), u("a"), u("test")]
+    True
     
     :param expression: The regular expression pattern to use to extract tokens.
     :param gaps: If True, the tokenizer *splits* on the expression, rather
     """
     
     return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
-SimpleAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool)
+SimpleAnalyzer.__inittypes__ = dict(expression=text_type, gaps=bool)
 
 
 def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
     StopFilter.
     
     >>> ana = StandardAnalyzer()
-    >>> [token.text for token in ana(u"Testing is testing and testing")]
-    [u"testing", u"testing", u"testing"]
-    
+    >>> [token.text for token in ana(u("Testing is testing and testing"))] == [u("testing"), u("testing"), u("testing")]
+    True
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param stoplist: A list of stop words. Set this to None to disable
         the stop word filter.
         chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                    maxsize=maxsize)
     return chain
-StandardAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool,
+StandardAnalyzer.__inittypes__ = dict(expression=text_type, gaps=bool,
                                       stoplist=list, minsize=int, maxsize=int)
 
 
     filter, and a stemming filter.
     
     >>> ana = StemmingAnalyzer()
-    >>> [token.text for token in ana(u"Testing is testing and testing")]
-    [u"test", u"test", u"test"]
+    >>> [token.text for token in ana(u("Testing is testing and testing"))] == [u("test"), u("test"), u("test")]
+    True
     
     :param expression: The regular expression pattern to use to extract tokens.
     :param stoplist: A list of stop words. Set this to None to disable
         chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                    maxsize=maxsize)
     return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)
-StemmingAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool,
+StemmingAnalyzer.__inittypes__ = dict(expression=text_type, gaps=bool,
                                       stoplist=list, minsize=int, maxsize=int)
 
 
     StopFilter.
     
     >>> ana = FancyAnalyzer()
-    >>> [token.text for token in ana(u"Should I call getInt or get_real?")]
-    [u"should", u"call", u"getInt", u"get", u"int", u"get_real", u"get", u"real"]
+    >>> [token.text for token in ana(u("Should I call getInt or get_real?"))] == [u("should"), u("call"), u("getInt"), u("get"), u("int"), u("get_real"), u("get"), u("real")]
+    True
     
     :param expression: The regular expression pattern to use to extract tokens.
     :param stoplist: A list of stop words. Set this to None to disable
     swf = StopFilter(stoplist=stoplist, minsize=minsize)
     
     return ret | iwf | lcf | swf
-FancyAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool,
+FancyAnalyzer.__inittypes__ = dict(expression=text_type, gaps=bool,
                                    stoplist=list, minsize=int, maxsize=int)
 
 
     """Composes an NgramTokenizer and a LowercaseFilter.
     
     >>> ana = NgramAnalyzer(4)
-    >>> [token.text for token in ana(u"hi there")]
-    [u"hi t", u"i th", u" the", u"ther", u"here"]
+    >>> [token.text for token in ana(u("hi there"))] == [u("hi t"), u("i th"), u(" the"), u("ther"), u("here")]
+    True
     """
     
     return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()

File src/whoosh/classify.py

 from collections import defaultdict
 from math import log, sqrt
 
+from whoosh.compat import xrange, iteritems
 
 # Expansion models
 
         maxweight = 0
         collection_freq = self.collection_freq
         
-        for word, weight in self.topN_weight.iteritems():
+        for word, weight in iteritems(self.topN_weight):
             if word in collection_freq:
                 score = model.score(weight, collection_freq[word], self.top_total)
                 if score > maxweight:
                 yield item
                 
     def dump(self, tab=0):
-        print "%s-" % (" " * tab, )
+        print("%s-" % (" " * tab, ))
         for item in self.items:
             if isinstance(item, Cluster):
                 item.dump(tab + 2)
             else:
-                print "%s%r" % (" " * tab, item)
+                print("%s%r" % (" " * tab, item))
     
 
 class HierarchicalClustering(object):
     for shingle in (input[i:i + size]
                     for i in xrange(len(input) - (size - 1))):
         d[shingle] += 1
-    return d.iteritems()
+    return iteritems(d)
 
 
 def simhash(features, hashbits=32):

File src/whoosh/compat.py

+import sys
+
+if sys.version_info[0] < 3:
+    PY3 = False
+    
+    def b(s):
+        return s
+
+    import cStringIO as StringIO
+    StringIO = BytesIO = StringIO.StringIO
+    callable = callable
+    integer_types = (int, long)
+    iteritems = lambda o: o.iteritems()
+    itervalues = lambda o: o.itervalues()
+    iterkeys = lambda o: o.iterkeys()
+    from itertools import izip
+    long_type = long
+    next = lambda o: o.next()
+    import cPickle as pickle
+    from cPickle import dumps, loads, dump, load
+    string_type = basestring
+    text_type = unicode
+    unichr = unichr
+    from urllib import urlretrieve
+
+    def u(s):
+        return unicode(s, "unicode_escape")
+
+    def with_metaclass(meta, base=object):
+        class _WhooshBase(base):
+            __metaclass__ = meta
+        return _WhooshBase
+
+    xrange = xrange
+    zip_ = zip
+else:
+    PY3 = True
+    import collections
+
+    def b(s):
+        return s.encode("latin-1")
+
+    import io
+    BytesIO = io.BytesIO
+    callable = lambda o: isinstance(o, collections.Callable)
+    exec_ = eval("exec")
+    integer_types = (int,)
+    iteritems = lambda o: o.items()
+    itervalues = lambda o: o.values()
+    iterkeys = lambda o: iter(o.keys())
+    izip = zip
+    long_type = int
+    next = next
+    import pickle
+    from pickle import dumps, loads, dump, load
+    StringIO = io.StringIO
+    string_type = str
+    text_type = str
+    unichr = chr
+    from urllib.request import urlretrieve
+
+    def u(s):
+        return s
+
+    def with_metaclass(meta, base=object):
+        ns = dict(base=base, meta=meta)
+        exec_("""class _WhooshBase(base, metaclass=meta):
+    pass""", ns)
+        return ns["_WhooshBase"]
+
+    xrange = range
+    zip_ = lambda *args: list(zip(*args))
+
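
Most of these shims are simple aliases; the one worth a note is with_metaclass(), because the metaclass declaration syntax itself differs (a __metaclass__ attribute on 2.x, a metaclass= keyword on 3.x). It builds a temporary base class with the desired metaclass so a single class statement works on both versions; fields.py below uses it as with_metaclass(MetaSchema, Schema). A minimal illustration with an invented metaclass:

    # Illustrative only: Meta and Point are made-up names, not part of Whoosh.
    from whoosh.compat import with_metaclass

    class Meta(type):
        # Record which non-dunder attributes the class body declared.
        def __new__(mcs, name, bases, attrs):
            cls = super(Meta, mcs).__new__(mcs, name, bases, attrs)
            cls.declared = sorted(k for k in attrs if not k.startswith("__"))
            return cls

    # Equivalent to "__metaclass__ = Meta" on 2.x and
    # "class Point(object, metaclass=Meta)" on 3.x.
    class Point(with_metaclass(Meta, object)):
        x = 0
        y = 0

    print(Point.declared)  # ['x', 'y']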

File src/whoosh/fields.py

 import datetime
 import fnmatch
 import re
+import sys
 from decimal import Decimal
 
 from whoosh.analysis import (IDAnalyzer, RegexAnalyzer, KeywordAnalyzer,
                              StandardAnalyzer, NgramAnalyzer, Tokenizer,
                              NgramWordAnalyzer, Analyzer)
+from whoosh.compat import (with_metaclass, itervalues, string_type, u,
+                           integer_types, long_type, text_type, xrange, PY3)
 from whoosh.formats import Format, Existence, Frequency, Positions
 from whoosh.support.numeric import (int_to_text, text_to_int, long_to_text,
                                     text_to_long, float_to_text, text_to_float,
     format = vector = scorable = stored = unique = None
     indexed = True
     multitoken_query = "first"
-    sortable_type = unicode
+    sortable_type = text_type
     sortable_typecode = None
     
     __inittypes__ = dict(format=Format, vector=Format,
         
         if not self.format:
             raise Exception("%s field cannot index without a format" % self.__class__)
-        if not isinstance(value, unicode):
+        if not isinstance(value, text_type):
             raise ValueError("%r is not unicode" % value)
         return self.format.word_values(value, mode="index", **kwargs)
     
         
         self.type = type
         if self.type is int:
-            self._to_text = int_to_text
-            self._from_text = text_to_int
             self.sortable_type = int
-            self.sortable_typecode = "i" if signed else "I"
-        elif self.type is long:
+            if PY3:
+                self._to_text = long_to_text
+                self._from_text = text_to_long
+                self.sortable_typecode = "q" if signed else "Q"
+            else:
+                self._to_text = int_to_text
+                self._from_text = text_to_int
+                self.sortable_typecode = "i" if signed else "I"
+        elif self.type is long_type:
             self._to_text = long_to_text
             self._from_text = text_to_long
-            self.sortable_type = long
+            self.sortable_type = long_type
             self.sortable_typecode = "q" if signed else "Q"
         elif self.type is float:
             self._to_text = float_to_text
         
         try:
             text = self.to_text(qstring)
-        except Exception, e:
+        except Exception:
+            e = sys.exc_info()[1]
             raise QueryParserError(e)
         
         return query.Term(fieldname, text, boost=boost)
                 start = self.from_text(self.to_text(start))
             if end is not None:
                 end = self.from_text(self.to_text(end))
-        except Exception, e:
+        except Exception:
+            e = sys.exc_info()[1]
             raise QueryParserError(e)
         
         return query.NumericRange(fieldname, start, end, startexcl, endexcl,
         :param unique: Whether the value of this field is unique per-document.
         """
         
-        super(DATETIME, self).__init__(type=long, stored=stored, unique=unique,
-                                       shift_step=8)
+        super(DATETIME, self).__init__(type=long_type, stored=stored,
+                                       unique=unique, shift_step=8)
     
     def to_text(self, x, shift=0):
         if isinstance(x, datetime.datetime):
             x = datetime_to_long(x)
-        elif not isinstance(x, (int, long)):
+        elif not isinstance(x, integer_types):
             raise ValueError("DATETIME.to_text field doesn't know what to do "
                              "with %r" % x)
         
     >>> w.add_document(path="/a", done=False)
     >>> w.commit()
     """
-    
-    strings = (u"f", u"t")
-    trues = frozenset((u"t", u"true", u"yes", u"1"))
-    falses = frozenset((u"f", u"false", u"no", u"0"))
+
+    strings = (u("f"), u("t"))    
+    trues = frozenset((u("t"), u("true"), u("yes"), u("1")))
+    falses = frozenset((u("f"), u("false"), u("no"), u("0")))
     
     __inittypes__ = dict(stored=bool)
     
         self.format = Existence(None)
     
     def to_text(self, bit):
-        if isinstance(bit, basestring):
+        if isinstance(bit, string_type):
             bit = bit in self.trues
         elif not isinstance(bit, bool):
             raise ValueError("%r is not a boolean")
         
         # Create the class
         special_attrs = {}
-        for key in attrs.keys():
+        for key in list(attrs.keys()):
             if key.startswith("__"):
                 special_attrs[key] = attrs.pop(key)
         new_class = super_new(cls, name, bases, special_attrs)
     
     def __eq__(self, other):
         return (other.__class__ is self.__class__
-                and self.items() == other.items())
+                and list(self.items()) == list(other.items()))
     
     def __repr__(self):
         return "<%s: %r>" % (self.__class__.__name__, self.names())
         """Returns the field objects in this schema.
         """
         
-        return self._fields.itervalues()
+        return iter(itervalues(self._fields))
     
     def __getitem__(self, name):
         """Returns the field associated with the given field name.
         if name in self._fields:
             return self._fields[name]
         
-        for expr, fieldtype in self._dyn_fields.itervalues():
+        for expr, fieldtype in itervalues(self._dyn_fields):
             if expr.match(name):
                 return fieldtype
         
         if type(fieldtype) is type:
             try:
                 fieldtype = fieldtype()
-            except Exception, e:
+            except Exception:
+                e = sys.exc_info()[1]
                 raise FieldConfigurationError("Error: %s instantiating field %r: %r" % (e, name, fieldtype))
         
         if not isinstance(fieldtype, FieldType):
             return field.format.analyzer
 
 
-class SchemaClass(Schema):
-    __metaclass__ = MetaSchema
+class SchemaClass(with_metaclass(MetaSchema, Schema)):
     
     """Allows you to define a schema using declarative syntax, similar to
     Django models::

File src/whoosh/filedb/fieldcache.py

 from heapq import heappush, heapreplace
 from struct import Struct
 
+from whoosh.compat import u, b, xrange
 from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
 from whoosh.util import utf8encode
 
     each document with a value through the array.
     """
     
-    def __init__(self, order=None, texts=None, hastexts=True, default=u"",
+    def __init__(self, order=None, texts=None, hastexts=True, default=u(""),
                  typecode="I"):
         """
         :param order: an array of ints.
     # Class constructor for building a field cache from a reader
     
     @classmethod
-    def from_field(cls, ixreader, fieldname, default=u""):
+    def from_field(cls, ixreader, fieldname, default=u("")):
         """Creates an in-memory field cache from a reader.
         
         >>> r = ix.reader()
     # Class constructor for defining a field cache using arbitrary queries
     
     @classmethod
-    def from_lists(cls, doclists, doccount, default=u""):
+    def from_lists(cls, doclists, doccount, default=u("")):
         texts = sorted(doclists.keys())
         order = array("I", [0] * doccount)
         
         # Write a tag at the start of the file indicating the file write is in
         # progress, to warn other processes that might open the file. We'll
         # seek back and change this when the file is done.
-        dbfile.write("-")
+        dbfile.write(b("-"))
         
         dbfile.write_uint(len(self.order))  # Number of documents
         
         else:
             dbfile.write_uint(0)  # No texts
         
-        dbfile.write(self.typecode)
+        dbfile.write(b(self.typecode))
         write_qsafe_array(self.typecode, self.order, dbfile)
         dbfile.flush()
         
         # Seek back and change the tag byte at the start of the file
         dbfile.seek(0)
-        dbfile.write("+")
+        dbfile.write(b("+"))
     
     # Field cache operations
     
 # Streaming cache file writer
 
 class FieldCacheWriter(object):
-    def __init__(self, dbfile, size=0, hastexts=True, code="I", default=u""):
+    def __init__(self, dbfile, size=0, hastexts=True, code="I", default=u("")):
         self.dbfile = dbfile
         self.order = array(self.code, [0] * size)
         self.hastexts = hastexts
         self.keycount = 1
         
         self.tagpos = dbfile.tell()
-        dbfile.write("-")
+        dbfile.write(b("-"))
         self.start = dbfile.tell()
         dbfile.write_uint(0)  # Number of docs
         dbfile.write_uint(0)  # Number of texts
         
         if self.hastexts:
             # Start the pickled list of texts
-            dbfile.write("(" + pickled_unicode(default))
+            dbfile.write(b("(") + pickled_unicode(default))
     
     def add_key(self, value):
         if self.hastexts:
         keycount = self.keycount
         
         # Finish the pickled list of texts
-        dbfile.write("l.")
+        dbfile.write(b("l."))
         
         # Compact the order array if possible
         if self.hastexts:
         
         # Seek back and write the finished file tag
         dbfile.seek(self.tagpos)
-        dbfile.write("+")
+        dbfile.write(b("+"))
         
         dbfile.close()
     

File src/whoosh/filedb/fileindex.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-import cPickle
 import re
 import uuid
 from time import time
 from threading import Lock
 
 from whoosh import __version__
+from whoosh.compat import pickle, integer_types, string_type, iteritems
 from whoosh.fields import ensure_schema
 from whoosh.index import Index, EmptyIndexError, IndexVersionError, _DEF_INDEX_NAME
 from whoosh.reading import EmptyReader, MultiReader
     for num in __version__[:3]:
         stream.write_varint(num)
 
-    stream.write_string(cPickle.dumps(schema, -1))
+    stream.write_string(pickle.dumps(schema, -1))
     stream.write_int(gen)
     stream.write_int(segment_counter)
     stream.write_pickle(segments)
 
 class Toc(object):
     def __init__(self, **kwargs):
-        for name, value in kwargs.iteritems():
+        for name, value in iteritems(kwargs):
             setattr(self, name, value)
         
 
     if schema:
         stream.skip_string()
     else:
-        schema = cPickle.loads(stream.read_string())
+        schema = pickle.loads(stream.read_string())
     schema = ensure_schema(schema)
     
     # Generation
     def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):
         if not isinstance(storage, Storage):
             raise ValueError("%r is not a Storage object" % storage)
-        if not isinstance(indexname, (str, unicode)):
+        if not isinstance(indexname, string_type):
             raise ValueError("indexname %r is not a string" % indexname)
         
         if schema:
             deleted documents exist in this segment.
         """
 
-        assert isinstance(name, basestring)
-        assert isinstance(doccount, (int, long))
+        assert isinstance(name, string_type)
+        assert isinstance(doccount, integer_types)
         assert fieldlength_totals is None or isinstance(fieldlength_totals, dict), "fl_totals=%r" % fieldlength_totals
         assert fieldlength_maxes is None or isinstance(fieldlength_maxes, dict), "fl_maxes=%r" % fieldlength_maxes
         
             return False
         return docnum in self.deleted
 
+    def __lt__(self, other):
+        return id(self) < id(other)
 
 
 
 
 
 
+

File src/whoosh/filedb/filepostings.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
+from whoosh.compat import integer_types, xrange
 from whoosh.formats import Format
 from whoosh.writing import PostingWriter
 from whoosh.matching import Matcher, ReadTooFar
     def __init__(self, postfile, offset, format, scorer=None,
                  fieldname=None, text=None, stringids=False):
         
-        assert isinstance(offset, (int, long)), "offset is %r/%s" % (offset, type(offset))
+        assert isinstance(offset, integer_types), "offset is %r/%s" % (offset, type(offset))
         assert isinstance(format, Format), "format is %r/%s" % (format, type(format))
         
         self.postfile = postfile
             self.i += 1
             return False
 
+    __next__ = next
+
     def skip_to(self, id):
         if not self.is_active():
             raise ReadTooFar

File src/whoosh/filedb/filereading.py

 from heapq import nlargest, nsmallest
 from threading import Lock
 
+from whoosh.compat import iteritems, string_type, integer_types, next, xrange
 from whoosh.filedb.fieldcache import FieldCache, DefaultFieldCachingPolicy
 from whoosh.filedb.filepostings import FilePostingReader
 from whoosh.filedb.filetables import (TermIndexReader, StoredFieldReader,
     def stored_fields(self, docnum):
         schema = self.schema
         return dict(item for item
-                    in self.storedfields[docnum].iteritems()
+                    in iteritems(self.storedfields[docnum])
                     if item[0] in schema)
 
     @protected
             fieldcache = self.fieldcache(fieldname)
             it = iter(fieldcache.texts)
             # The first value in fieldcache.texts is the default; throw it away
-            it.next()
+            next(it)
             return it
         
         return self.expand_prefix(fieldname, '')
             raise TermNotFound("%s:%r" % (fieldname, text))
 
         format = self.schema[fieldname].format
-        if isinstance(offset, (int, long)):
+        if isinstance(offset, integer_types):
             postreader = FilePostingReader(self.postfile, offset, format,
                                            scorer=scorer, fieldname=fieldname,
                                            text=text)
     # Sorting and faceting methods
     
     def key_fn(self, fields):
-        if isinstance(fields, basestring):
+        if isinstance(fields, string_type):
             fields = (fields, )
         
         if len(fields) > 1:

File src/whoosh/filedb/filestore.py

 # policies, either expressed or implied, of Matt Chaput.
 
 import os
-from cStringIO import StringIO
 from threading import Lock
 
+from whoosh.compat import BytesIO
 from whoosh.index import _DEF_INDEX_NAME
 from whoosh.store import Storage
 from whoosh.support.filelock import FileLock
         try:
             f = StructFile(open(self._fpath(name), "rb"), name=name, *args, **kwargs)
         except IOError:
-            print "Tried to open %r, files=%r" % (name, self.list())
+            print("Tried to open %r, files=%r" % (name, self.list()))
             raise
         return f
 
         self.folder = ''
 
     def list(self):
-        return self.files.keys()
+        return list(self.files.keys())
 
     def clean(self):
         self.files = {}
     def create_file(self, name, **kwargs):
         def onclose_fn(sfile):
             self.files[name] = sfile.file.getvalue()
-        f = StructFile(StringIO(), name=name, onclose=onclose_fn)
+        f = StructFile(BytesIO(), name=name, onclose=onclose_fn)
         return f
 
     def open_file(self, name, *args, **kwargs):
         if name not in self.files:
             raise NameError("No such file %r" % name)
-        return StructFile(StringIO(self.files[name]), name=name, *args, **kwargs)
+        return StructFile(BytesIO(self.files[name]), name=name, *args, **kwargs)
 
     def lock(self, name):
         if name not in self.locks:

File src/whoosh/filedb/filetables.py

 
 from array import array
 from collections import defaultdict
-from cPickle import loads, dumps
 from hashlib import md5
 from struct import Struct
 
+from whoosh.compat import (loads, dumps, long_type, xrange, iteritems,
+                           b, text_type)
 from whoosh.system import (_INT_SIZE, _LONG_SIZE, pack_ushort, pack_uint,
                            pack_long, unpack_ushort, unpack_uint, unpack_long)
 from whoosh.util import byte_to_length, utf8encode, utf8decode
 _4GB = 4 * 1024 * 1024 * 1024
 
 def cdb_hash(key):
-    h = 5381L
+    h = long_type(5381)
     for c in key:
         h = (h + (h << 5)) & 0xffffffff ^ ord(c)
     return h
         write = dbfile.write
 
         for key, value in items:
+            key = key.encode('latin-1')
+            value = value.encode('latin-1')
             write(pack_lengths(len(key), len(value)))
             write(key)
             write(value)
 
         dbfile.seek(0)
         if self.format:
-            dbfile.write("HASH")
+            dbfile.write(b("HASH"))
             dbfile.write_byte(self.format)
-            dbfile.write("\x00\x00\x00")  # Unused
+            dbfile.write(b("\x00\x00\x00"))  # Unused
             dbfile.write_long(self._end_of_hashes)
         
         for position, numslots in directory:
         
         dbfile.seek(0)
         magic = dbfile.read(4)
-        if magic == "HASH":
+        if magic == b("HASH"):
             self.header_size = 16 + 256 * header_entry_size
             _pointer_struct = Struct("!Iq")  # Hash value, position
             self.format = dbfile.read_byte()
             yield (keypos, keylen, datapos, datalen)
 
     def __iter__(self):
-        return self.items()
+        return iter(self.items())
 
     def items(self):
         read = self.read
         for keypos, keylen, datapos, datalen in self._ranges():
-            yield (read(keypos, keylen), read(datapos, datalen))
+            key = read(keypos, keylen).decode('latin-1')
+            value = read(datapos, datalen).decode('latin-1')
+            yield (key, value)
 
     def keys(self):
         read = self.read
         return self.buckets[keyhash & 255]
 
     def _key_position(self, key):
-        keyhash = self.hash_func(key)
+        keyhash = self.hash_func(key.encode('latin-1'))
         hpos, hslots = self._hashtable_info(keyhash)
         if not hslots:
             raise KeyError(key)
     def _get_ranges(self, key):
         read = self.read
         pointer_size = self.pointer_size
+        if isinstance(key, text_type):
+            key = key.encode('latin-1')
         keyhash = self.hash_func(key)
         hpos, hslots = self._hashtable_info(keyhash)
         if not hslots:
         write = dbfile.write
 
         index = self.index
-        lk = self.lastkey
+        lk = self.lastkey or b('')
 
         for key, value in items:
+            if isinstance(key, text_type):
+                key = key.encode('latin-1')
+            if isinstance(value, text_type):
+                value = value.encode('latin-1')
             if key <= lk:
                 raise ValueError("Keys must increase: %r .. %r" % (lk, key))
             lk = key
         indexbase = self.indexbase
         lo = 0
         hi = self.length
+        if isinstance(key, text_type):
+            key = key.encode('latin-1')
         while lo < hi:
             mid = (lo + hi) // 2
             midkey = key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE))
         dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
         self.fieldmap = dbfile.read_pickle()
         self.names = [None] * len(self.fieldmap)
-        for name, num in self.fieldmap.iteritems():
+        for name, num in iteritems(self.fieldmap):
             self.names[num] = name
     
     def keycoder(self, key):
         return pack_ushort(fnum) + utf8encode(text)[0]
         
     def keydecoder(self, v):
+        if isinstance(v, text_type):
+            v = v.encode('latin-1')
         return (self.names[unpack_ushort(v[:2])[0]], utf8decode(v[2:])[0])
     
     def valuedecoder(self, v):
-        v = loads(v + ".")
+        if isinstance(v, text_type):
+            v = v.encode('latin-1')
+        v = loads(v + b("."))
         if len(v) == 1:
             return (1, v[0], 1)
         elif len(v) == 2:
     
     def close(self):
         self.dbfile.write_ushort(len(self.lengths))
-        for fieldname, arry in self.lengths.iteritems():
-            self.dbfile.write_string(fieldname)
+        for fieldname, arry in iteritems(self.lengths):
+            self.dbfile.write_string(fieldname.encode('utf-8'))
             self.dbfile.write_array(arry)
         self.dbfile.close()
         
             self.lengths = {}
             count = dbfile.read_ushort()
             for _ in xrange(count):
-                fieldname = dbfile.read_string()
+                fieldname = dbfile.read_string().decode('utf-8')
                 self.lengths[fieldname] = dbfile.read_array("B", self.doccount)
             dbfile.close()
     
         name_map = self.name_map
         
         vlist = [None] * len(name_map)
-        for k, v in values.iteritems():
+        for k, v in iteritems(values):
             if k in name_map:
                 vlist[name_map[k]] = v
             else:
         dbfile.seek(pos)
         name_map = dbfile.read_pickle()
         self.names = [None] * len(name_map)
-        for name, pos in name_map.iteritems():
+        for name, pos in iteritems(name_map):
             self.names[pos] = name
         self.directory_offset = dbfile.tell()
         
         if len(ptr) != stored_pointer_size:
             raise Exception("Error reading %r @%s %s < %s" % (dbfile, start, len(ptr), stored_pointer_size))
         position, length = unpack_stored_pointer(ptr)
-        vlist = loads(dbfile.map[position:position + length] + ".")
+        vlist = loads(dbfile.map[position:position + length] + b("."))
         
         names = self.names
         # Recreate a dictionary by putting the field names and values back

File src/whoosh/filedb/filewriting.py

 from bisect import bisect_right
 from collections import defaultdict
 
+from whoosh.compat import iteritems, next, text_type
 from whoosh.fields import UnknownFieldError
 from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.filepostings import FilePostingWriter
         for docnum in reader.all_doc_ids():
             if (not has_deletions) or (not reader.is_deleted(docnum)):
                 d = dict(item for item
-                         in reader.stored_fields(docnum).iteritems()
+                         in iteritems(reader.stored_fields(docnum))
                          if item[0] in fieldnames)
                 # We have to append a dictionary for every document, even if
                 # it's empty.
                     
                     self.pool.add_posting(fieldname, text, newdoc,
                                           postreader.weight(), valuestring)
-                    postreader.next()
+                    next(postreader)
                     
         self._added = True
     
         vpostwriter = self.vpostwriter
         offset = vpostwriter.start(self.schema[fieldname].vector)
         for text, weight, valuestring in vlist:
-            assert isinstance(text, unicode), "%r is not unicode" % text
+            assert isinstance(text, text_type), "%r is not unicode" % text
             vpostwriter.write(text, weight, valuestring, 0)
         vpostwriter.finish()
         
         while vreader.is_active():
             # text, weight, valuestring, fieldlen
             vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(), 0)
-            vreader.next()
+            next(vreader)
         vpostwriter.finish()
         
         self.vectorindex.add((docnum, fieldname), offset)
         self.offset = None
     
     def _new_term(self, fieldname, text):
-        lastfn = self.lastfn
-        lasttext = self.lasttext
+        lastfn = self.lastfn or ''
+        lasttext = self.lasttext or ''
         if fieldname < lastfn or (fieldname == lastfn and text < lasttext):
             raise Exception("Postings are out of order: %r:%s .. %r:%s" %
                             (lastfn, lasttext, fieldname, text))
                 newdoc = offset + docnum
             totalweight += weight
             postwrite(newdoc, weight, valuestring, getlen(docnum, fieldname))
-            matcher.next()
+            next(matcher)
         self.weight += totalweight
     
     def add_iter(self, postiter, getlen, offset=0, docmap=None):

File src/whoosh/filedb/gae.py

     ix = DataStoreStorage().open_index()
 """
 
-from cStringIO import StringIO
-
 from google.appengine.api import memcache
 from google.appengine.ext import db
 
+from whoosh.compat import BytesIO
 from whoosh.store import Storage
 from whoosh.filedb.fileindex import _create_index, FileIndex, _DEF_INDEX_NAME
 from whoosh.filedb.filestore import ReadOnlyError
 
 
 class DatastoreFile(db.Model):
-    """A file-like object that is backed by a StringIO() object whose contents
+    """A file-like object that is backed by a BytesIO() object whose contents
     is loaded from a BlobProperty in the app engine datastore.
     """
     
 
     def __init__(self, *args, **kwargs):
         super(DatastoreFile, self).__init__(*args, **kwargs)
-        self.data = StringIO()
+        self.data = BytesIO()
 
     @classmethod
     def loadfile(cls, name):
             memcache.set(name, file.value, namespace="DatastoreFile")
         else:
             file = cls(value=value)
-        file.data = StringIO(file.value)
+        file.data = BytesIO(file.value)
         return file
 
     def close(self):

File src/whoosh/filedb/multiproc.py

 import os
 import tempfile
 from multiprocessing import Process, Queue, cpu_count
-from cPickle import dump, load
 
+from whoosh.compat import dump, load, xrange, iteritems
 from whoosh.filedb.filetables import LengthWriter, LengthReader
 from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.filewriting import SegmentWriter
             taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
             runs.extend(taskruns)
             lenfilenames.append(lenfilename)
-            for fieldnum, total in flentotals.iteritems():
+            for fieldnum, total in iteritems(flentotals):
                 _fieldlength_totals[fieldnum] += total
-            for fieldnum, length in flenmaxes.iteritems():
+            for fieldnum, length in iteritems(flenmaxes):
                 if length > self._fieldlength_maxes.get(fieldnum, 0):
                     self._fieldlength_maxes[fieldnum] = length
         
  
  
  
- 
+ 

File src/whoosh/filedb/pools.py

 from marshal import load, dump
 #import sqlite3 as sqlite
 
+from whoosh.compat import long_type, iteritems, xrange, text_type, PY3
 from whoosh.filedb.filetables import LengthWriter, LengthReader
 from whoosh.util import length_to_byte
 
         if obj is None:
             return 8
         t = type(obj)
-        if t is int:
+        if t is int and not PY3:
             return 12
         elif t is float:
             return 16
-        elif t is long:
+        elif t is long_type:
             return 16
         elif t is str:
             return 21 + len(obj)
-        elif t is unicode:
+        elif t is text_type:
             return 26 + 2 * len(obj)
 
 
             buff = []
             take = min(atatime, count)
             for _ in xrange(take):
+                #print('*** before', f.tell())
                 buff.append(load(f))
+                #print('*** after', f.tell())
             count -= take
             for item in buff:
                 yield item
         return con
     
     def flush(self):
-        for fieldname, lst in self.postbuf.iteritems():
+        for fieldname, lst in iteritems(self.postbuf):
             con = self._con(fieldname)
             con.executemany("insert into postings values (?, ?, ?, ?)", lst)
             con.commit()
         self.postbuf = defaultdict(list)
         self.bufsize = 0
         self._flushed = True
-        print "flushed"
+        print("flushed")
     
     def add_posting(self, fieldname, text, docnum, weight, valuestring):
         self.postbuf[fieldname].append((text, docnum, weight, valuestring))

File src/whoosh/filedb/postblocks.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from array import array
-from cPickle import dumps, load, loads
 from struct import Struct
 
 try:
 except ImportError:
     can_compress = False
 
+from whoosh.compat import dumps, load, loads, xrange, b, u, PY3
 from whoosh.system import _INT_SIZE, _FLOAT_SIZE, pack_uint, IS_LITTLE
 from whoosh.util import utf8decode, length_to_byte, byte_to_length
 
     
     def __nonzero__(self):
         return bool(self.ids)
-    
+
+    __bool__ = __nonzero__
+
     def stats(self):
         # Calculate block statistics
         maxweight = max(self.weights)
         block.maxwol = header[8]
         block.minlen = byte_to_length(header[10])
         
+        if PY3:
+            block.typecode = block.typecode.decode('latin-1')
         if stringids:
             block.maxid = load(postfile)
         else:
         
         # Weights
         if all(w == 1.0 for w in weights):
-            weights_string = ''
+            weights_string = b('')
         else:
             if not IS_LITTLE:
                 weights.byteswap()
         if posting_size < 0:
             values_string = dumps(values, -1)[2:]
         elif posting_size == 0:
-            values_string = ''
+            values_string = b('')
         else:
-            values_string = "".join(values)
+            values_string = b("").join(values)
         if values_string and compression:
             values_string = compress(values_string, compression)
         
         minlen_byte = length_to_byte(minlength)
         blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                          len(weights_string), len(values_string)))
-        header = self._struct.pack(blocksize, flags, postcount, typecode,
+        header = self._struct.pack(blocksize, flags, postcount, typecode.encode('latin-1'),
                                    0, len(ids_string), len(weights_string),
                                    maxweight, maxwol, 0, minlen_byte)
         

File src/whoosh/filedb/structfile.py

 import os
 from array import array
 from copy import copy
-from cPickle import dump as dump_pickle
-from cPickle import load as load_pickle
 from struct import calcsize
 from gzip import GzipFile
 
+from whoosh.compat import dump as dump_pickle
+from whoosh.compat import load as load_pickle
+from whoosh.compat import integer_types, b
 from whoosh.system import (_INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE,
                            pack_sbyte, pack_ushort, pack_int, pack_uint,
                            pack_long, pack_float,
                     self._setup_fake_map()
         else:
             self._setup_fake_map()
-            
+
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self._name)
 
         
         class fakemap(object):
             def __getitem__(self, slice):
-                if isinstance(slice, (int, long)):
+                if isinstance(slice, integer_types):
                     _self.seek(slice)
                     return _self.read(1)
                 else:
         """Writes a single byte to the wrapped file, shortcut for
         ``file.write(chr(n))``.
         """
-        self.file.write(chr(n))
+        self.file.write(b(chr(n)))
 
     def read_byte(self):
         return ord(self.file.read(1))

File src/whoosh/formats.py

 """
 
 from collections import defaultdict
-from cPickle import dumps, loads
 
 from whoosh.analysis import unstopped, entoken
+from whoosh.compat import iteritems, dumps, loads, b
 from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_uint, unpack_uint,
                            pack_float, unpack_float)
 from whoosh.util import float_to_byte, byte_to_float
         
         encode = self.encode
         return ((w, freq, weights[w] * fb, encode(freq))
-                for w, freq in freqs.iteritems())
+                for w, freq in iteritems(freqs))
 
     def encode(self, freq):
         return pack_uint(freq)
         
         encode = self.encode
         return ((w, freq, weights[w] * doc_boost * fb, encode((freq, doc_boost)))
-                for w, freq in freqs.iteritems())
+                for w, freq in iteritems(freqs))
     
     def encode(self, freq_docboost):
         freq, docboost = freq_docboost
         
         encode = self.encode
         return ((w, len(poslist), weights[w] * fb, encode(poslist))
-                for w, poslist in poses.iteritems())
+                for w, poslist in iteritems(poses))
     
     def encode(self, positions):
         codes = []
         return pack_uint(len(codes)) + dumps(codes, -1)[2:-1]
     
     def decode_positions(self, valuestring):
-        codes = loads(valuestring[_INT_SIZE:] + ".")
+        codes = loads(valuestring[_INT_SIZE:] + b("."))
         position = 0
         positions = []
         for code in codes:
         
         encode = self.encode
         return ((w, len(ls), weights[w] * fb, encode(ls))
-                for w, ls in seen.iteritems())
+                for w, ls in iteritems(seen))
     
     def encode(self, posns_chars):
         # posns_chars = [(pos, startchar, endchar), ...]
         return pack_uint(len(posns_chars)) + dumps(codes, -1)[2:-1]
     
     def decode_characters(self, valuestring):
-        codes = loads(valuestring[_INT_SIZE:] + ".")
+        codes = loads(valuestring[_INT_SIZE:] + b("."))
         position = 0
         endchar = 0
         posns_chars = []
         return posns_chars
     
     def decode_positions(self, valuestring):
-        codes = loads(valuestring[_INT_SIZE:] + ".")
+        codes = loads(valuestring[_INT_SIZE:] + b("."))
         position = 0
         posns = []
         for code in codes:
         
         encode = self.encode
         return ((w, len(poslist), sum(p[1] for p in poslist) * fb, encode(poslist))
-                for w, poslist in seen.iteritems())
+                for w, poslist in iteritems(seen))
     
     def encode(self, posns_boosts):
         # posns_boosts = [(pos, boost), ...]
                 + dumps(codes, -1)[2:-1])
         
     def decode_position_boosts(self, valuestring):
-        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
+        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
         position = 0
         posns_boosts = []
         for code in codes:
         return posns_boosts
     
     def decode_positions(self, valuestring):
-        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
+        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
         position = 0
         posns = []
         for code in codes:
         
         encode = self.encode
         return ((w, len(poslist), sum(p[3] for p in poslist) * fb, encode(poslist))
-                for w, poslist in seen.iteritems())
+                for w, poslist in iteritems(seen))
     
     def encode(self, posns_chars_boosts):
         # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
                 + dumps(codes, -1)[2:-1])
         
     def decode_character_boosts(self, valuestring):
-        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
+        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
         position = 0
         endchar = 0
         posn_char_boosts = []

File src/whoosh/highlight.py

         return any(t.matched for t in self.tokens)
     
 
+    def __lt__(self, other):
+        return id(self) < id(other)
+
 # Filters
 
 def copyandmatchfilter(termset, tokens):

File src/whoosh/index.py

 
 from __future__ import division
 import os.path
+import sys
 
 from whoosh import fields, store
 
         
         ix = storage.open_index(indexname)
         return (ix.release, ix.version)
-    except IndexVersionError, e:
+    except IndexVersionError:
+        e = sys.exc_info()[1]
         return (None, e.version)
 
 

File src/whoosh/lang/dmetaphone.py

 
 import re
 
+from whoosh.compat import u
 
 vowels = frozenset("AEIOUY")
 slavo_germ_exp = re.compile("W|K|CZ|WITZ")
                         next = ('K', 2)
                     else:  # default for 'C'
                         next = ('K', 1)
-        elif ch == u'Ç':
+        elif ch == u('\xc7'):
             next = ('S', 1)
         elif ch == 'D':
             if text[pos:pos + 2] == 'DG':
                 next = ('N', 2)
             else:
                 next = ('N', 1)
-        elif ch == u'Ñ':
+        elif ch == u('\xd1'):
             next = ('N', 1)
         elif ch == 'P':
             if text[pos + 1] == 'H':
     for name in names.keys():
         assert (double_metaphone(name) == names[name]), 'For "%s" function returned %s. Should be %s.' % (name, double_metaphone(name), names[name])
         
-        
+        

File src/whoosh/lang/morph_en.py

 
 import re
 
+from whoosh.compat import xrange, iteritems
 # Rule exceptions
 
 exceptions = [
         match = p.search(word)
         if match:
             # Get the named group that matched
-            num = int([k for k, v in match.groupdict().iteritems()
+            num = int([k for k, v in iteritems(match.groupdict())
                        if v is not None and k.startswith("_g")][0][2:])
             # Get the positional groups for the matched group (all other
             # positional groups are None)
     import time
     t = time.clock()
     s = variations("rendering")
-    print time.clock() - t
-    print len(s)
+    print(time.clock() - t)
+    print(len(s))
     

File src/whoosh/lang/phonetic.py

 
 import re
 
+from whoosh.compat import iteritems
+
 # This soundex implementation is adapted from the recipe here:
 # http://code.activestate.com/recipes/52213/
 
 
 # Create a dictionary mapping arabic characters to digits
 _arabic_codes = {}
-for chars, code in {'\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a': "0",
+for chars, code in iteritems({'\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a': "0",
                     '\u0641\u0628': "1",
                     '\u062c\u0632\u0633\u0635\u0638\u0642\u0643': "2",
                     '\u062a\u062b\u062f\u0630\u0636\u0637': "3",
                     '\u0644': "4",
                     '\u0645\u0646': "5",
                     '\u0631': "6",
-                    }.iteritems():
+                    }):
     for char in chars:
         _arabic_codes[char] = code
 
     return r
 
 if __name__ == "__main__":
-    print soundex_esp("solidad")
+    print(soundex_esp("solidad"))
     

File src/whoosh/lang/porter.py

     return w
 
 if __name__ == '__main__':
-    print stem("fundamentally")
+    print(stem("fundamentally"))
     
     
     

File src/whoosh/lang/wordnet.py

 
 from collections import defaultdict
 
+from whoosh.compat import iterkeys, text_type
 from whoosh.fields import Schema, ID, STORED
 
 
     schema = Schema(word=ID, syns=STORED)
     ix = storage.create_index(schema, indexname=indexname)
     w = ix.writer()
-    for word in word2nums.iterkeys():
+    for word in iterkeys(word2nums):
         syns = synonyms(word2nums, num2words, word)
-        w.add_document(word=unicode(word), syns=syns)
+        w.add_document(word=text_type(word), syns=syns)
     w.commit()
     return ix
 
     
 #    t = clock()
 #    th = Thesaurus.from_filename("c:/wordnet/wn_s.pl")
-#    print clock() - t
+#    print(clock() - t)
 #    
 #    t = clock()
 #    th.to_storage(st)
-#    print clock() - t
+#    print(clock() - t)
 #    
 #    t = clock()
 #    print th.synonyms("light")
-#    print clock() - t
+#    print(clock() - t)
     
     t = clock()
     th = Thesaurus.from_storage(st)
-    print clock() - t
+    print(clock() - t)
     
     t = clock()
-    print th.synonyms("hail")
-    print clock() - t
+    print(th.synonyms("hail"))
+    print(clock() - t)
     
-    
+    

File src/whoosh/matching.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-from itertools import izip, repeat
+from itertools import repeat
+import sys
 
+from whoosh.compat import izip, next, xrange
 
 """
 This module contains "matcher" classes. Matchers deal with posting lists. The
         i = 0
         while self.is_active():
             yield self.id()
-            self.next()
+            next(self)
             i += 1
             if i == 10: