Matt Chaput committed 38295c2

Re-commit all changes made to the "scratch" hg repo onto the end of the converted repo.
Bumped revision number to 0.3.16.
Docstring margin cleanups.
Fixed file lock code to close the file descriptor when the lock is released.
Added implementations of Lovins and Paice-Husk stemming algorithms.
Added varint_to_int function to whoosh.util.
Added .hgignore and Eclipse project files to source control.
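
The varint_to_int helper mentioned above is not shown in this diff. Purely for orientation, here is a hedged sketch of what a variable-length integer decoder conventionally looks like (7 data bits per byte, high bit as a continuation flag); the signature and the actual whoosh.util implementation may differ.

    def varint_to_int(data):
        # Illustrative only: decode a 7-bits-per-byte varint from a byte
        # string, where the high bit of each byte means "more bytes follow".
        result = 0
        shift = 0
        for char in data:
            byte = ord(char)
            result |= (byte & 0x7f) << shift
            if not byte & 0x80:
                break
            shift += 7
        return result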


Files changed (34)

+syntax: glob
+*.pyc
+*~
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>whoosh</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+
+<pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.6</pydev_property>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/whoosh/src</path>
+<path>/whoosh/tests</path>
+</pydev_pathproperty>
+</pydev_project>

src/whoosh/__init__.py

 # limitations under the License.
 #===============================================================================
 
-__version__ = (0, 3, 15)
+__version__ = (0, 3, 16)
 
 
 def versionstring(build=True, extra=True):

src/whoosh/analysis.py

 # limitations under the License.
 #===============================================================================
 
-"""Classes and functions for turning a piece of text into
-an indexable stream of "tokens" (usually equivalent to words). There are
-three general types of classes/functions involved in analysis:
+"""Classes and functions for turning a piece of text into an indexable stream
+of "tokens" (usually equivalent to words). There are three general types of
+classes/functions involved in analysis:
 
-* Tokenizers are always at the start of the text processing pipeline.
-  They take a string and yield Token objects (actually, the same token
-  object over and over, for performance reasons) corresponding to the
-  tokens (words) in the text.
+* Tokenizers are always at the start of the text processing pipeline. They take
+  a string and yield Token objects (actually, the same token object over and
+  over, for performance reasons) corresponding to the tokens (words) in the
+  text.
       
-  Every tokenizer is a callable that takes a string and returns a
-  generator of tokens.
+  Every tokenizer is a callable that takes a string and returns a generator of
+  tokens.
       
 * Filters take the tokens from the tokenizer and perform various
-  transformations on them. For example, the LowercaseFilter converts
-  all tokens to lowercase, which is usually necessary when indexing
-  regular English text.
+  transformations on them. For example, the LowercaseFilter converts all tokens
+  to lowercase, which is usually necessary when indexing regular English text.
       
-  Every filter is a callable that takes a token generator and returns
-  a token generator.
+  Every filter is a callable that takes a token generator and returns a token
+  generator.
       
-* Analyzers are convenience functions/classes that "package up" a
-  tokenizer and zero or more filters into a single unit, so you
-  don't have to construct the tokenizer-filter-filter-etc. pipeline
-  yourself. For example, the StandardAnalyzer combines a RegexTokenizer,
-  LowercaseFilter, and StopFilter.
+* Analyzers are convenience functions/classes that "package up" a tokenizer and
+  zero or more filters into a single unit, so you don't have to construct the
+  tokenizer-filter-filter-etc. pipeline yourself. For example, the
+  StandardAnalyzer combines a RegexTokenizer, LowercaseFilter, and StopFilter.
     
-  Every analyzer is a callable that takes a string and returns a
-  token generator. (So Tokenizers can be used as Analyzers if you
-  don't need any filtering).
+  Every analyzer is a callable that takes a string and returns a token
+  generator. (So Tokenizers can be used as Analyzers if you don't need any
+  filtering).
   
-You can implement an analyzer as a custom class or function, or compose tokenizers
-and filters together using the ``|`` character::
+You can implement an analyzer as a custom class or function, or compose
+tokenizers and filters together using the ``|`` character::
 
     my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
     
-The first item must be a tokenizer and the rest must be filters (you can't put a
-filter first or a tokenizer after the first item).
+The first item must be a tokenizer and the rest must be filters (you can't put
+a filter first or a tokenizer after the first item).
 """
 
 from array import array
 
 from whoosh.lang.porter import stem
 
-# Default list of stop words (words so common it's usually
-# wasteful to index them). This list is used by the StopFilter
-# class, which allows you to supply an optional list to override
-# this one.
+# Default list of stop words (words so common it's usually wasteful to index
+# them). This list is used by the StopFilter class, which allows you to supply
+# an optional list to override this one.
 
 STOP_WORDS = frozenset(("the", "to", "of", "a", "and", "is", "in", "this",
-                        "you", "for", "be", "on", "or", "will", "if", "can", "are",
-                        "that", "by", "with", "it", "as", "from", "an", "when",
-                        "not", "may", "tbd", "us", "we", "yet"))
+                        "you", "for", "be", "on", "or", "will", "if", "can",
+                        "are", "that", "by", "with", "it", "as", "from", "an",
+                        "when", "not", "may", "tbd", "us", "we", "yet"))
 
 
 # Utility functions
 
 def unstopped(tokenstream):
-    """Removes tokens from a token stream where token.stopped = True."""
+    """Removes tokens from a token stream where token.stopped = True.
+    """
     return (t for t in tokenstream if not t.stopped)
 
 
 
 class Token(object):
     """
-    Represents a "token" (usually a word) extracted from the source text
-    being indexed.
+    Represents a "token" (usually a word) extracted from the source text being
+    indexed.
     
     See "Advanced analysis" in the user guide for more information.
     
     ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
     each time.
     
-    This trick means that consumers of tokens (i.e. filters) must
-    never try to hold onto the token object between loop iterations, or convert
-    the token generator into a list.
-    Instead, save the attributes between iterations, not the object::
+    This trick means that consumers of tokens (i.e. filters) must never try to
+    hold onto the token object between loop iterations, or convert the token
+    generator into a list. Instead, save the attributes between iterations,
+    not the object::
     
         def RemoveDuplicatesFilter(self, stream):
             # Removes duplicate words.
                     yield token
                 lasttext = token.text
 
+    ...or, call token.copy() to get a copy of the token object.
     """
     
-    def __init__(self, positions = False, chars = False, boosts = False, removestops = True,
-                 mode = '', **kwargs):
+    def __init__(self, positions=False, chars=False, boosts=False,
+                 removestops=True, mode='', **kwargs):
         """
-        :param positions: Whether tokens should have the token position in
-            the 'pos' attribute.
-        :param chars: Whether tokens should have character offsets
-            in the 'startchar' and 'endchar' attributes.
-        :param boosts: whether the tokens should have per-token boosts
-            in the 'boost' attribute.
-        :param removestops: whether to remove stop words from the stream
-            (if the tokens pass through a stop filter).
+        :param positions: Whether tokens should have the token position in the
+            'pos' attribute.
+        :param chars: Whether tokens should have character offsets in the
+            'startchar' and 'endchar' attributes.
+        :param boosts: whether the tokens should have per-token boosts in the
+            'boost' attribute.
+        :param removestops: whether to remove stop words from the stream (if
+            the tokens pass through a stop filter).
         :param mode: contains a string describing the purpose for which the
             analyzer is being called, i.e. 'index' or 'query'.
         """
         self.__dict__.update(kwargs)
     
     def __repr__(self):
-        return "%s(%s)" % (self.__class__.__name__,
-                           ", ".join(["%s=%r" % (name, value)
-                                      for name, value in self.__dict__.iteritems()]))
+        parms = ", ".join("%s=%r" % (name, value)
+                          for name, value in self.__dict__.iteritems())
+        return "%s(%s)" % (self.__class__.__name__, parms)
         
     def copy(self):
         return copy.copy(self)
     def __repr__(self):
         attrs = ""
         if self.__dict__:
-            attrs = ", ".join("%s=%r" % (key, value) for key, value in self.__dict__.iteritems())
+            attrs = ", ".join("%s=%r" % (key, value)
+                              for key, value
+                              in self.__dict__.iteritems())
         return self.__class__.__name__ + "(%s)" % attrs
 
 
 
 
 class IDTokenizer(Tokenizer):
-    """Yields the entire input string as a single token. For use
-    in indexed but untokenized fields, such as a document's path.
+    """Yields the entire input string as a single token. For use in indexed but
+    untokenized fields, such as a document's path.
     
     >>> idt = IDTokenizer()
     >>> [token.text for token in idt(u"/a/b 123 alpha")]
     [u"/a/b 123 alpha"]
     """
     
-    def __call__(self, value, positions = False, chars = False,
-                 keeporiginal = False, removestops = True,
-                 start_pos = 0, start_char = 0, mode='',
+    def __call__(self, value, positions=False, chars=False,
+                 keeporiginal=False, removestops=True,
+                 start_pos=0, start_char=0, mode='',
                  **kwargs):
         assert isinstance(value, unicode), "%r is not unicode" % value
-        t = Token(positions, chars, removestops = removestops, mode=mode)
+        t = Token(positions, chars, removestops=removestops, mode=mode)
         t.text = value
         if keeporiginal:
             t.original = value
     
     __inittypes__ = dict(expression=unicode, gaps=bool)
     
-    def __init__(self, expression = r"\w+(\.?\w+)*", gaps=False):
+    def __init__(self, expression=r"\w+(\.?\w+)*", gaps=False):
         """
         :param expression: A regular expression object or string. Each match
             of the expression equals a token. Group 0 (the entire matched text)
                 return True
         return False
     
-    def __call__(self, value, positions = False, chars = False,
-                 keeporiginal = False, removestops = True,
-                 start_pos = 0, start_char = 0,
-                 tokenize = True, mode = '', **kwargs):
+    def __call__(self, value, positions=False, chars=False,
+                 keeporiginal=False, removestops=True,
+                 start_pos=0, start_char=0,
+                 tokenize=True, mode='', **kwargs):
         """
         :param value: The unicode string to tokenize.
         :param positions: Whether to record token positions in the token.
 
 
 class CharsetTokenizer(Tokenizer):
-    """Tokenizes and translates text according to a character mapping object. Characters
-    that map to None are considered token break characters. For all other characters the
-    map is used to translate the character. This is useful for case and accent folding.
+    """Tokenizes and translates text according to a character mapping object.
+    Characters that map to None are considered token break characters. For all
+    other characters the map is used to translate the character. This is useful
+    for case and accent folding.
     
     This tokenizer loops character-by-character and so will likely be much
     slower than :class:`RegexTokenizer`.
     
-    One way to get a character mapping object is to convert a Sphinx charset table file
-    using :func:`whoosh.support.charset.charset_table_to_dict`.
+    One way to get a character mapping object is to convert a Sphinx charset
+    table file using :func:`whoosh.support.charset.charset_table_to_dict`.
     
     >>> from whoosh.support.charset import charset_table_to_dict, default_charset
     >>> charmap = charset_table_to_dict(default_charset)
     
     def __init__(self, charmap):
         """
-        :param charmap: a mapping from integer character numbers to unicode characters,
-            as used by the unicode.translate() method.
+        :param charmap: a mapping from integer character numbers to unicode
+            characters, as used by the unicode.translate() method.
         """
         self.charmap = charmap
     
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and self.charmap == other.charmap
+        return (other
+                and self.__class__ is other.__class__
+                and self.charmap == other.charmap)
 
-    def __call__(self, value, positions = False, chars = False,
-             keeporiginal = False, removestops = True,
-             start_pos = 0, start_char = 0,
-             tokenize = True, mode = '', **kwargs):
+    def __call__(self, value, positions=False, chars=False,
+                 keeporiginal=False, removestops=True,
+                 start_pos=0, start_char=0,
+                 tokenize=True, mode='', **kwargs):
         """
         :param value: The unicode string to tokenize.
         :param positions: Whether to record token positions in the token.
 def CommaSeparatedTokenizer():
     """Splits tokens by commas.
     
-    Note that the tokenizer calls unicode.strip() on each match
-    of the regular expression.
+    Note that the tokenizer calls unicode.strip() on each match of the regular
+    expression.
     
     >>> cst = CommaSeparatedTokenizer()
     >>> [token.text for token in cst(u"hi there, what's , up")]
     >>> [token.text for token in ngt(u"hi there")]
     [u"hi t", u"i th", u" the", u"ther", u"here"]
     
-    Note that this tokenizer does NOT use a regular expression to extract words,
-    so the grams emitted by it will contain whitespace, punctuation, etc. You may
-    want to massage the input or add a custom filter to this tokenizer's output.
+    Note that this tokenizer does NOT use a regular expression to extract
+    words, so the grams emitted by it will contain whitespace, punctuation,
+    etc. You may want to massage the input or add a custom filter to this
+    tokenizer's output.
     
     Alternatively, if you only want sub-word grams without whitespace, you
     could combine a RegexTokenizer with NgramFilter instead.
     
     __inittypes__ = dict(minsize=int, maxsize=int)
     
-    def __init__(self, minsize, maxsize = None):
+    def __init__(self, minsize, maxsize=None):
         """
         :param minsize: The minimum size of the N-grams.
         :param maxsize: The maximum size of the N-grams. If you omit
                 return True
         return False
     
-    def __call__(self, value, positions = False, chars = False,
-                 keeporiginal = False, removestops = True,
-                 start_pos = 0, start_char = 0,
+    def __call__(self, value, positions=False, chars=False,
+                 keeporiginal=False, removestops=True,
+                 start_pos=0, start_char=0,
                  **kwargs):
         assert isinstance(value, unicode), "%r is not unicode" % value
         
         inlen = len(value)
-        t = Token(positions, chars, removestops = removestops)
+        t = Token(positions, chars, removestops=removestops)
         pos = start_pos
         for start in xrange(0, inlen - self.min + 1):
             for size in xrange(self.min, self.max + 1):
 # Filters
 
 class Filter(Composable):
-    """Base class for Filter objects. A Filter subclass must implement
-    a __call__ method that takes a single argument, which is an iterator
-    of Token objects, and yield a series of Token objects in return.
+    """Base class for Filter objects. A Filter subclass must implement a
+    __call__ method that takes a single argument, which is an iterator of Token
+    objects, and yield a series of Token objects in return.
     """
     
     def __eq__(self, other):
 
 
 class RecordFilter(Filter):
-    """A debug filter that remembers the tokens that pass through
-    it, and stores them in the 'tokens' attribute.
+    """A debug filter that remembers the tokens that pass through it, and
+    stores them in the 'tokens' attribute.
     """
     
     def __init__(self):
         self.filters = kwargs
     
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and self.filters == other.filters
+        return (other
+                and self.__class__ is other.__class__
+                and self.filters == other.filters)
     
     def __call__(self, tokens):
         # Only selects on the first token
 
 
 class StopFilter(Filter):
-    """Marks "stop" words (words too common to index) in the stream (and by default
-    removes them).
+    """Marks "stop" words (words too common to index) in the stream (and by
+    default removes them).
     
     >>> rext = RegexTokenizer()
     >>> stream = rext(u"this is a test")
 
     __inittypes__ = dict(stoplist=list, minsize=int, renumber=bool)
 
-    def __init__(self, stoplist = STOP_WORDS, minsize = 2,
-                 renumber = True):
+    def __init__(self, stoplist=STOP_WORDS, minsize=2,
+                 renumber=True):
         """
         :param stoplist: A collection of words to remove from the stream.
             This is converted to a frozenset. The default is a list of
         self.renumber = renumber
     
     def __eq__(self, other):
-        if self.__class__ is other.__class__:
-            if self.stops == other.stops and self.min == other.min and self.renumber == other.renumber:
-                return True
-        return False
+        return (other
+                and self.__class__ is other.__class__
+                and self.stops == other.stops
+                and self.min == other.min
+                and self.renumber == other.renumber)
     
     def __call__(self, tokens):
         assert hasattr(tokens, "__iter__")
 
 
 class StemFilter(Filter):
-    """Stems (removes suffixes from) the text of tokens using the Porter stemming
-    algorithm. Stemming attempts to reduce multiple forms of the same root word
-    (for example, "rendering", "renders", "rendered", etc.) to a single word in
-    the index.
+    """Stems (removes suffixes from) the text of tokens using the Porter
+    stemming algorithm. Stemming attempts to reduce multiple forms of the same
+    root word (for example, "rendering", "renders", "rendered", etc.) to a
+    single word in the index.
     
     >>> rext = RegexTokenizer()
     >>> stream = rext(u"fundamentally willows")
     
     __inittypes__ = dict(stemfn=object, ignore=list)
     
-    def __init__(self, stemfn = stem, ignore = None):
+    def __init__(self, stemfn=stem, ignore=None):
         """
         :param stemfn: the function to use for stemming.
-        :param ignore: a set/list of words that should not be stemmed. This
-            is converted into a frozenset. If you omit this argument, all tokens
+        :param ignore: a set/list of words that should not be stemmed. This is
+            converted into a frozenset. If you omit this argument, all tokens
             are stemmed.
         """
         
             self.ignores = frozenset(ignore)
     
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and self.stemfn == other.stemfn
+        return (other
+                and self.__class__ is other.__class__
+                and self.stemfn == other.stemfn)
     
     def __call__(self, tokens):
         assert hasattr(tokens, "__iter__")
 
 
 class CharsetFilter(Filter):
-    """Translates the text of tokens by calling unicode.translate() using the supplied
-    character mapping object. This is useful for case and accent folding.
+    """Translates the text of tokens by calling unicode.translate() using the
+    supplied character mapping object. This is useful for case and accent
+    folding.
     
-    One way to get a character mapping object is to convert a Sphinx charset table file
-    using :func:`whoosh.support.charset.charset_table_to_dict`.
+    One way to get a character mapping object is to convert a Sphinx charset
+    table file using :func:`whoosh.support.charset.charset_table_to_dict`.
     
     >>> from whoosh.support.charset import charset_table_to_dict, default_charset
     >>> retokenizer = RegexTokenizer()
     
     def __init__(self, charmap):
         """
-        :param charmap: a mapping from integer character numbers to unicode characters,
-            as required by the unicode.translate() method.
+        :param charmap: a mapping from integer character numbers to unicode
+            characters, as required by the unicode.translate() method.
         """
         self.charmap = charmap
     
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and self.charmap == other.charmap
+        return (other
+                and self.__class__ is other.__class__
+                and self.charmap == other.charmap)
     
     def __call__(self, tokens):
         assert hasattr(tokens, "__iter__")
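
To complete the accent-folding example the docstring starts (a sketch; charset_table_to_dict and default_charset are the names referenced in the docstring above):

    >>> from whoosh.support.charset import charset_table_to_dict, default_charset
    >>> charmap = charset_table_to_dict(default_charset)
    >>> analyzer = RegexTokenizer() | LowercaseFilter() | CharsetFilter(charmap)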
     
     __inittypes__ = dict(minsize=int, maxsize=int)
     
-    def __init__(self, minsize, maxsize = None):
+    def __init__(self, minsize, maxsize=None):
         """
         :param minsize: The minimum size of the N-grams.
-        :param maxsize: The maximum size of the N-grams. If you omit
-            this parameter, maxsize == minsize.
+        :param maxsize: The maximum size of the N-grams. If you omit this
+            parameter, maxsize == minsize.
         """
         
         self.min = minsize
 
 
 class IntraWordFilter(Filter):
-    """Splits words into subwords and performs optional transformations on subword groups.
-    This filter is funtionally based on yonik's WordDelimiterFilter in Solr, but shares no
-    code with it.
+    """Splits words into subwords and performs optional transformations on
+    subword groups. This filter is functionally based on yonik's
+    WordDelimiterFilter in Solr, but shares no code with it.
     
     * Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`.
-    * When splitwords=True, split on case transitions, e.g. `PowerShot` -> `Power`, `Shot`.
-    * When splitnums=True, split on letter-number transitions, e.g. `SD500` -> `SD`, `500`.
+    * When splitwords=True, split on case transitions,
+      e.g. `PowerShot` -> `Power`, `Shot`.
+    * When splitnums=True, split on letter-number transitions,
+      e.g. `SD500` -> `SD`, `500`.
     * Leading and trailing delimiter characters are ignored.
-    * Trailing possesive "'s" removed from subwords, e.g. `O'Neil's` -> `O`, `Neil`.
+    * Trailing possessive "'s" removed from subwords,
+      e.g. `O'Neil's` -> `O`, `Neil`.
     
     The mergewords and mergenums arguments turn on merging of subwords.
     
     When the merge arguments are false, subwords are not merged.
     
-    * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token positions).
+    * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token
+      positions).
     
-    When one or both of the merge arguments are true, consecutive runs of alphabetic
-    and/or numeric subwords are merged into an additional token with the same position
-    as the last sub-word.
+    When one or both of the merge arguments are true, consecutive runs of
+    alphabetic and/or numeric subwords are merged into an additional token with
+    the same position as the last sub-word.
     
     * `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot`
     * `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC`
-    * `Super-Duper-XL500-42-AutoCoder!` -> 0:`Super`, 1:`Duper`, 2:`XL`, 2:`SuperDuperXL`,
-      `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`, `6`:`AutoCoder`
+    * `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`,
+      `2`:`SuperDuperXL`,
+      `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`,
+      `6`:`AutoCoder`
     
-    When using this filter you should use a tokenizer that only splits on whitespace,
-    so the tokenizer does not remove intra-word delimiters before this filter can see them,
-    and put this filter before any use of LowercaseFilter.
+    When using this filter you should use a tokenizer that only splits on
+    whitespace, so the tokenizer does not remove intra-word delimiters before
+    this filter can see them, and put this filter before any use of
+    LowercaseFilter.
     
     >>> analyzer = RegexTokenizer(r"\\S+") | IntraWordFilter() | LowercaseFilter()
     
-    One use for this filter is to help match different written representations of a
-    concept. For example, if the source text contained `wi-fi`, you probably want
-    `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this is to specify
-    mergewords=True and/or mergenums=True in the analyzer used for indexing, and
-    mergewords=False / mergenums=False in the analyzer used for querying.
+    One use for this filter is to help match different written representations
+    of a concept. For example, if the source text contained `wi-fi`, you
+    probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this
+    is to specify mergewords=True and/or mergenums=True in the analyzer used
+    for indexing, and mergewords=False / mergenums=False in the analyzer used
+    for querying.
     
     >>> iwf = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True),
                           query=IntraWordFilter(mergewords=False, mergenums=False))
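
A hedged sketch of what the recommended whitespace-tokenizer pipeline yields for a hyphenated term (ordering assumed from the merge description above; the merged token shares the position of the last subword):

    >>> ana = RegexTokenizer(r"\S+") | IntraWordFilter(mergewords=True) | LowercaseFilter()
    >>> [t.text for t in ana(u"Wi-Fi")]
    [u"wi", u"fi", u"wifi"]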
     digits = array("u")
     uppers = array("u")
     lowers = array("u")
-    for n in xrange(2**16-1):
+    for n in xrange(2 ** 16 - 1):
         ch = unichr(n)
         if ch.islower(): lowers.append(ch)
         elif ch.isupper(): uppers.append(ch)
                  mergewords=False, mergenums=False):
         """
         :param delims: a string of delimiter characters.
-        :param splitwords: if True, split at case transitions, e.g. `PowerShot` -> `Power`, `Shot`
-        :param splitnums: if True, split at letter-number transitions, e.g. `SD500` -> `SD`, `500`
-        :param mergewords: merge consecutive runs of alphabetic subwords into an
-            additional token with the same position as the last subword.
+        :param splitwords: if True, split at case transitions,
+            e.g. `PowerShot` -> `Power`, `Shot`
+        :param splitnums: if True, split at letter-number transitions,
+            e.g. `SD500` -> `SD`, `500`
+        :param mergewords: merge consecutive runs of alphabetic subwords into
+            an additional token with the same position as the last subword.
         :param mergenums: merge consecutive runs of numeric subwords into an
             additional token with the same position as the last subword.
         """
         self.delims = re.escape(delims)
         
         # Expression for splitting at delimiter characters
-        self.splitter = re.compile(u"[%s]+" % (self.delims, ), re.UNICODE)
+        self.splitter = re.compile(u"[%s]+" % (self.delims,), re.UNICODE)
         # Expression for removing "'s" from the end of sub-words
-        self.disposses = re.compile(u"(?<=[%s])'[Ss](?=$|[%s])" % (self.letters,
-                                                                   self.delims), re.UNICODE)
+        dispat = u"(?<=[%s])'[Ss](?=$|[%s])" % (self.letters, self.delims)
+        self.disposses = re.compile(dispat, re.UNICODE)
         
         # Expression for finding case and letter-number transitions
         lower2upper = u"[%s][%s]" % (self.lowers, self.uppers)
         letter2digit = u"[%s][%s]" % (self.letters, self.digits)
         digit2letter = u"[%s][%s]" % (self.digits, self.letters)
         if splitwords and splitnums:
-            self.boundary = re.compile(u"(%s|%s|%s)" % (lower2upper,
-                                                        letter2digit,
-                                                        digit2letter), re.UNICODE)
+            splitpat = u"(%s|%s|%s)" % (lower2upper, letter2digit, digit2letter)
+            self.boundary = re.compile(splitpat, re.UNICODE)
         elif splitwords:
-            self.boundary = re.compile(u"%s" % (lower2upper, ), re.UNICODE)
+            self.boundary = re.compile(unicode(lower2upper), re.UNICODE)
         elif splitnums:
-            self.boundary = re.compile(u"(%s|%s)" % (letter2digit,
-                                                     digit2letter), re.UNICODE)
+            numpat = u"(%s|%s)" % (letter2digit, digit2letter)
+            self.boundary = re.compile(numpat, re.UNICODE)
         
         self.splitting = splitwords or splitnums
         self.mergewords = mergewords
             splitted = self.splitter.split(string)
             
             for run in splitted:
-                # For each delimited run of characters, find the
-                # boundaries (e.g. lower->upper, letter->num, num->letter)
-                # and split between them.
+                # For each delimited run of characters, find the boundaries
+                # (e.g. lower->upper, letter->num, num->letter) and split
+                # between them.
                 start = 0
                 for match in boundaries(run):
                     middle = match.start() + 1
             # Is this the same type as the previous part?
             if buf and (this == last == 1 and mergewords)\
             or (this == last == 2 and mergenums):
-                # This part is the same type as the previous.
-                # Add it to the buffer of parts to merge.
+                # This part is the same type as the previous. Add it to the
+                # buffer of parts to merge.
                 buf.append(part)
             else:
                 # This part is different than the previous.
                 if len(buf) > 1:
-                    # If the buffer has at least two parts in
-                    # it, merge them and add them to the original
-                    # list of parts.
-                    parts.insert(insertat, (pos-1, u"".join(buf)))
+                    # If the buffer has at least two parts in it, merge them
+                    # and add them to the original list of parts.
+                    parts.insert(insertat, (pos - 1, u"".join(buf)))
                     insertat += 1
                 # Reset the buffer
                 buf = [part]
                 last = this
             insertat += 1
         
-        # If there are parts left in the buffer at the end,
-        # merge them and add them to the original list.
+        # If there are parts left in the buffer at the end, merge them and add
+        # them to the original list.
         if len(buf) > 1:
             parts.append((pos, u"".join(buf)))
     
         mergewords = self.mergewords
         mergenums = self.mergenums
         
-        # This filter renumbers tokens as it expands them.
-        # New position counter.
+        # This filter renumbers tokens as it expands them. New position
+        # counter.
         
         newpos = None
         for t in tokens:
             text = t.text
             
-            # If this is the first token we've seen, use it to set
-            # the new position counter
+            # If this is the first token we've seen, use it to set the new
+            # position counter
             if newpos is None:
                 if t.positions:
                     newpos = t.pos
                     # Token doesn't have positions, just use 0
                     newpos = 0
             
-            if (text.isalpha() and (text.islower() or text.isupper())) or text.isdigit():
-                # Short-circuit the common cases of no delimiters, no case transitions,
-                # only digits, etc.
+            if (text.isalpha()
+                and (text.islower() or text.isupper())) or text.isdigit():
+                # Short-circuit the common cases of no delimiters, no case
+                # transitions, only digits, etc.
                 t.pos = newpos
                 yield t
                 newpos += 1
             else:
-                # Should we check for an apos before doing the
-                # disposses step? Or is the re faster?
-                # if "'" in text:
+                # Should we check for an apos before doing the disposses step?
+                # Or is the re faster?
+                # if "'" in text:
                 text = disposses("", text)
                 
-                # Split the token text on delimiters, word and/or number boundaries,
-                # and give the split parts positions
-                parts = [(newpos + i, part) for i, part in enumerate(self.split(text))]
+                # Split the token text on delimiters, word and/or number
+                # boundaries, and give the split parts positions
+                parts = [(newpos + i, part)
+                         for i, part in enumerate(self.split(text))]
                 
                 # Did the split yield more than one part?
                 if len(parts) > 1:
-                    # If the options are set, merge consecutive runs of all-letters
-                    # and/or all-numbers.
+                    # If the options are set, merge consecutive runs of all-
+                    # letters and/or all-numbers.
                     if mergewords or mergenums:
                         merge(parts)
                     
                     # Set the new position counter based on the last part
                     newpos = parts[-1][0] + 1
                 else:
-                    # The split only gave one part, so just yield the 
+                    # The split only gave one part, so just yield the
                     # "dispossesed" text.
                     t.text = text
                     t.pos = newpos
 
 
 class CamelFilter(Filter):
-    """Splits CamelCased words into multiple words.
-    This filter is deprecated, use IntraWordFilter instead.
+    """Splits CamelCased words into multiple words. This filter is deprecated,
+    use IntraWordFilter instead.
     
     >>> rext = RegexTokenizer()
     >>> stream = rext(u"call getProcessedToken")
     >>> [token.text for token in CamelFilter(stream)]
     [u"call", u"getProcessedToken", u"get", u"Processed", u"Token"]
     
-    Obviously this filter needs to precede LowercaseFilter if they
-    are both in a filter chain.
+    Obviously this filter needs to precede LowercaseFilter if they are both in
+    a filter chain.
     """
     
     camel_exp = re.compile("[A-Z][a-z]*|[a-z]+|[0-9]+")
             yield t
             text = t.text
             
-            if text and not text.islower() and not text.isupper() and not text.isdigit():
+            if (text
+                and not text.islower()
+                and not text.isupper()
+                and not text.isdigit()):
                 chars = t.chars
                 if chars:
                     oldstart = t.startchar
 
 
 class UnderscoreFilter(Filter):
-    """Splits words with underscores into multiple words.
-    This filter is deprecated, use IntraWordFilter instead.
+    """Splits words with underscores into multiple words. This filter is
+    deprecated, use IntraWordFilter instead.
     
     >>> rext = RegexTokenizer()
     >>> stream = rext(u"call get_processed_token")
     >>> [token.text for token in CamelFilter(stream)]
     [u"call", u"get_processed_token", u"get", u"processed", u"token"]
     
-    Obviously you should not split words on underscores in the
-    tokenizer if you want to use this filter.
+    Obviously you should not split words on underscores in the tokenizer if you
+    want to use this filter.
     """
     
     underscore_exp = re.compile("[A-Z][a-z]*|[a-z]+|[0-9]+")
     
     For example, if you added a filter:
     
-      BoostTextFilter("\\^([0-9.]+)$")
+        BoostTextFilter("\\^([0-9.]+)$")
     
     The user could then write keywords with an optional boost encoded in them,
     like this:
     
     (Of course, you might want to write a better pattern for the number part.)
     
-     - Note that the pattern is run on EACH TOKEN, not the source text as a whole.
+     * Note that the pattern is run on EACH TOKEN, not the source text as a
+       whole.
      
-     - Because this filter runs a regular expression match on every token,
+     * Because this filter runs a regular expression match on every token,
        for performance reasons it is probably only suitable for short fields.
        
-     - You may use this filter in a Frequency-formatted field, where
-       the Frequency format object has boost_as_freq = True. Bear in mind that
-       in that case, you can only use integer "boosts".
+     * You may use this filter in a Frequency-formatted field, where the
+       Frequency format object has boost_as_freq = True. Bear in mind that in
+       that case, you can only use integer "boosts".
     """
     
-    def __init__(self, expression, group = 1, default = 1.0):
+    def __init__(self, expression, group=1, default=1.0):
         """
         :param expression: a compiled regular expression object representing
-        the pattern to look for within each token.
+            the pattern to look for within each token.
         :param group: the group name or number to use as the boost number
             (what to pass to match.group()). The string value of this group is
             passed to float().
         self.default = default
     
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__\
-        and self.expression == other.expression and self.default == other.default\
-        and self.group == other.group
+        return (other
+                and self.__class__ is other.__class__
+                and self.expression == other.expression
+                and self.default == other.default
+                and self.group == other.group)
     
     def __call__(self, tokens):
         expression = self.expression
 # Analyzers
 
 class Analyzer(Composable):
-    """
-    Abstract base class for analyzers. Since the analyzer protocol is just
+    """ Abstract base class for analyzers. Since the analyzer protocol is just
     __call__, this is pretty simple -- it mostly exists to provide common
     implementations of __repr__ and __eq__.
     """
         return "%s()" % self.__class__.__name__
 
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and self.__dict__ == other.__dict__
+        return (other
+                and self.__class__ is other.__class__
+                and self.__dict__ == other.__dict__)
 
     def __call__(self, value, **kwargs):
         raise NotImplementedError
         return len(self.items)
     
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and self.items == other.items
+        return (other
+                and self.__class__ is other.__class__
+                and self.items == other.items)
     
     def clean(self):
         for item in self.items:
                 item.clean()
 
 
-def IDAnalyzer(lowercase = False):
-    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if desired.
+def IDAnalyzer(lowercase=False):
+    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
+    desired.
     """
     
     tokenizer = IDTokenizer()
 IDAnalyzer.__inittypes__ = dict(lowercase=bool)
 
 
-def KeywordAnalyzer(lowercase = False, commas = False):
+def KeywordAnalyzer(lowercase=False, commas=False):
     """Parses space-separated tokens.
     
     >>> ana = KeywordAnalyzer()
     return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
 SimpleAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool)
 
-def StandardAnalyzer(expression=r"\w+(\.?\w+)*", stoplist = STOP_WORDS, minsize = 2, gaps=False):
-    """Composes a RegexTokenizer with a LowercaseFilter and optional StopFilter.
+def StandardAnalyzer(expression=r"\w+(\.?\w+)*", stoplist=STOP_WORDS,
+                     minsize=2, gaps=False):
+    """Composes a RegexTokenizer with a LowercaseFilter and optional
+    StopFilter.
     
     >>> ana = StandardAnalyzer()
     >>> [token.text for token in ana(u"Testing is testing and testing")]
         than matching on the expression.
     """
     
-    chain = RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
+    ret = RegexTokenizer(expression=expression, gaps=gaps)
+    chain = ret | LowercaseFilter()
     if stoplist is not None:
-        chain = chain | StopFilter(stoplist = stoplist, minsize = minsize)
+        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize)
     return chain
-StandardAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool, stoplist=list, minsize=int)
+StandardAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool,
+                                      stoplist=list, minsize=int)
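
Passing stoplist=None skips the StopFilter entirely, as a quick sketch shows:

    >>> ana = StandardAnalyzer(stoplist=None)
    >>> [t.text for t in ana(u"This is a test")]
    [u"this", u"is", u"a", u"test"]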
 
 
-def StemmingAnalyzer(expression=r"\w+(\.?\w+)*", stoplist=STOP_WORDS, minsize=2, gaps=False):
-    """Composes a RegexTokenizer with a lower case filter, an optional stop filter,
-    and a stemming filter.
+def StemmingAnalyzer(expression=r"\w+(\.?\w+)*", stoplist=STOP_WORDS,
+                     minsize=2, gaps=False, stemfn=stem, ignore=None):
+    """Composes a RegexTokenizer with a lower case filter, an optional stop
+    filter, and a stemming filter.
     
     >>> ana = StemmingAnalyzer()
     >>> [token.text for token in ana(u"Testing is testing and testing")]
         than matching on the expression.
     """
     
-    chain = RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
+    ret = RegexTokenizer(expression=expression, gaps=gaps)
+    chain = ret | LowercaseFilter()
     if stoplist is not None:
-        chain = chain | StopFilter(stoplist = stoplist, minsize = minsize)
-    return chain | StemFilter()
-StemmingAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool, stoplist=list, minsize=int)
+        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize)
+    return chain | StemFilter(stemfn=stemfn, ignore=ignore)
+StemmingAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool,
+                                      stoplist=list, minsize=int)
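
With the stemfn and ignore parameters added here, callers can protect words from stemming or swap in another stemmer. A sketch using the default Porter stem function (expected output assumes the usual Porter behaviour):

    >>> ana = StemmingAnalyzer(ignore=["rendering"])
    >>> [t.text for t in ana(u"rendering renders rendered")]
    [u"rendering", u"render", u"render"]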
 
 
-def FancyAnalyzer(expression=r"\s+", stoplist = STOP_WORDS, minsize = 2, gaps=True,
+def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2, gaps=True,
                   splitwords=True, splitnums=True,
                   mergewords=False, mergenums=False):
-    """Composes a RegexTokenizer with a CamelFilter, UnderscoreFilter, LowercaseFilter,
-    and StopFilter.
+    """Composes a RegexTokenizer with a CamelFilter, UnderscoreFilter,
+    LowercaseFilter, and StopFilter.
     
     >>> ana = FancyAnalyzer()
     >>> [token.text for token in ana(u"Should I call getInt or get_real?")]
         than matching on the expression.
     """
     
-    return RegexTokenizer(expression=expression, gaps=gaps) | \
-           IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
-                           mergewords=mergewords, mergenums=mergenums) | \
-           LowercaseFilter() | \
-           StopFilter(stoplist = stoplist, minsize = minsize)
-FancyAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool, stoplist=list, minsize=int)
+    ret = RegexTokenizer(expression=expression, gaps=gaps)
+    iwf = IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
+                          mergewords=mergewords, mergenums=mergenums)
+    lcf = LowercaseFilter()
+    swf = StopFilter(stoplist=stoplist, minsize=minsize)
+    
+    return ret | iwf | lcf | swf
+FancyAnalyzer.__inittypes__ = dict(expression=unicode, gaps=bool,
+                                   stoplist=list, minsize=int)
 
 
-def NgramAnalyzer(minsize, maxsize = None):
+def NgramAnalyzer(minsize, maxsize=None):
     """Composes an NgramTokenizer and a LowercaseFilter.
     
     >>> ana = NgramAnalyzer(4)
     [u"hi t", u"i th", u" the", u"ther", u"here"]
     """
     
-    return NgramTokenizer(minsize, maxsize = maxsize) | LowercaseFilter()
+    return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()
 NgramAnalyzer.__inittypes__ = dict(minsize=int, maxsize=int)
 
 

src/whoosh/classify.py

 # limitations under the License.
 #===============================================================================
 
-"""Classes and functions for classifying and extracting information from documents.
+"""Classes and functions for classifying and extracting information from
+documents.
 """
 
 from collections import defaultdict
 
 
 class Expander(object):
-    """Uses an ExpansionModel to expand the set of query terms based on
-    the top N result documents.
+    """Uses an ExpansionModel to expand the set of query terms based on the top
+    N result documents.
     """
     
-    def __init__(self, ixreader, fieldname, model = Bo1Model):
+    def __init__(self, ixreader, fieldname, model=Bo1Model):
         """
         :param reader: A :class:whoosh.reading.IndexReader object.
         :param fieldname: The name of the field in which to search.
             model = model(ixreader, fieldname)
         self.model = model
         
-        # Cache the collection frequency of every term in this
-        # field. This turns out to be much faster than reading each
-        # individual weight from the term index as we add words.
+        # Cache the collection frequency of every term in this field. This
+        # turns out to be much faster than reading each individual weight from
+        # the term index as we add words.
         self.collection_freq = dict((word, freq) for word, _, freq
                                       in ixreader.iter_field(fieldname))
         
             
         self.top_total += total_weight
     
-    def expanded_terms(self, number, normalize = True):
+    def expanded_terms(self, number, normalize=True):
         """Returns the N most important terms in the vectors added so far.
         
         :param number: The number of terms to return.
         else:
             norm = maxweight
         tlist = [(weight / norm, t) for weight, t in tlist]
-        tlist.sort(reverse = True)
+        tlist.sort(reverse=True)
         
         return [(t, weight) for weight, t in tlist[:number]]
 

src/whoosh/fields.py

 # limitations under the License.
 #===============================================================================
 
-"""
-Contains functions and classes related to fields.
+""" Contains functions and classes related to fields.
 """
 
 import datetime, re, struct
 
-from whoosh.analysis import IDAnalyzer, RegexAnalyzer, KeywordAnalyzer
-from whoosh.analysis import StandardAnalyzer, NgramAnalyzer
+from whoosh.analysis import (IDAnalyzer, RegexAnalyzer, KeywordAnalyzer,
+                             StandardAnalyzer, NgramAnalyzer)
 from whoosh.formats import Format, Existence, Frequency, Positions
 
 # Exceptions
       fields marked as 'unique' to find the previous version of a document
       being updated.
       
-    The constructor for the base field type simply lets you supply your
-    own configured field format, vector format, and scorable and stored
-    values. Subclasses may configure some or all of this for you.
+    The constructor for the base field type simply lets you supply your own
+    configured field format, vector format, and scorable and stored values.
+    Subclasses may configure some or all of this for you.
     
     """
     
     __inittypes__ = dict(format=Format, vector=Format,
                          scorable=bool, stored=bool, unique=bool)
     
-    def __init__(self, format, vector = None,
-                 scorable = False, stored = False,
-                 unique = False):
+    def __init__(self, format, vector=None,
+                 scorable=False, stored=False,
+                 unique=False):
         self.format = format
         self.vector = vector
         self.scorable = scorable
         self.unique = unique
     
     def __repr__(self):
-        return "%s(format=%r, vector=%r, scorable=%s, stored=%s, unique=%s)"\
-        % (self.__class__.__name__, self.format, self.vector,
-           self.scorable, self.stored, self.unique)
+        temp = "%s(format=%r, vector=%r, scorable=%s, stored=%s, unique=%s)"
+        return temp % (self.__class__.__name__, self.format, self.vector,
+                       self.scorable, self.stored, self.unique)
     
     def __eq__(self, other):
         return all((isinstance(other, FieldType),
                     (self.unique == other.unique)))
     
     def clean(self):
-        """Clears any cached information in the field and any child objects."""
+        """Clears any cached information in the field and any child objects.
+        """
         
         if self.format and hasattr(self.format, "clean"):
             self.format.clean()
     def process_text(self, qstring, mode='', **kwargs):
         if not self.format:
             raise Exception("%s field has no format" % self)
-        return (t.text for t in self.format.analyze(qstring, mode=mode, **kwargs))
+        return (t.text for t
+                in self.format.analyze(qstring, mode=mode, **kwargs))
     
 
 class ID(FieldType):
     """Configured field type that indexes the entire value of the field as one
-    token. This is useful for data you don't want to tokenize, such as the
-    path of a file.
+    token. This is useful for data you don't want to tokenize, such as the path
+    of a file.
     """
     
     __inittypes__ = dict(stored=bool, unique=bool, field_boost=float)
         """
         :param stored: Whether the value of this field is stored with the document.
         """
-        self.format = Existence(analyzer=IDAnalyzer(), field_boost= field_boost)
+        self.format = Existence(analyzer=IDAnalyzer(), field_boost=field_boost)
         self.stored = stored
         self.unique = unique
 
     
     def __init__(self, stored=False, unique=False, expression=None, field_boost=1.0):
         """
-        :param stored: Whether the value of this field is stored with the document.
+        :param stored: Whether the value of this field is stored with the
+            document.
         :param unique: Whether the value of this field is unique per-document.
-        :param expression: The regular expression object to use to extract tokens.
-            The default expression breaks tokens on CRs, LFs, tabs, spaces, commas,
-            and semicolons.
+        :param expression: The regular expression object to use to extract
+            tokens. The default expression breaks tokens on CRs, LFs, tabs,
+            spaces, commas, and semicolons.
         """
         
         expression = expression or re.compile(r"[^\r\n\t ,;]+")
-        analyzer = RegexAnalyzer(expression = expression)
-        self.format = Existence(analyzer = analyzer, field_boost = field_boost)
+        analyzer = RegexAnalyzer(expression=expression)
+        self.format = Existence(analyzer=analyzer, field_boost=field_boost)
         self.stored = stored
         self.unique = unique
 
         self.type = type
         self.stored = stored
         self.unique = unique
-        self.format = Existence(analyzer=IDAnalyzer(), field_boost= field_boost)
+        self.format = Existence(analyzer=IDAnalyzer(), field_boost=field_boost)
     
     def index(self, num):
         method = getattr(self, self.type.__name__ + "_to_text")
         return method(ntype(x))
     
     def process_text(self, text, **kwargs):
-        return (self.to_text(text), )
+        return (self.to_text(text),)
     
     def parse_query(self, fieldname, qstring, boost=1.0):
         from whoosh import query
     
     @staticmethod
     def int_to_text(x):
-        x += (1<<(4<<2))-1 # 4 means 32-bits
+        x += (1 << (4 << 2)) - 1 # 4 means 32-bits
         return u"%08x" % x
     
     @staticmethod
     def text_to_int(text):
         x = int(text, 16)
-        x -= (1<<(4<<2))-1
+        x -= (1 << (4 << 2)) - 1
         return x
     
     @staticmethod
     def long_to_text(x):
-        x += (1<<(8<<2))-1
+        x += (1 << (8 << 2)) - 1
         return u"%016x" % x
     
     @staticmethod
     def text_to_long(text):
         x = long(text, 16)
-        x -= (1<<(8<<2))-1
+        x -= (1 << (8 << 2)) - 1
         return x
     
     @staticmethod
     def float_to_text(x):
         x = struct.unpack("<q", struct.pack("<d", x))[0]
-        x += (1<<(8<<2))-1
+        x += (1 << (8 << 2)) - 1
         return u"%016x" % x
     
     @staticmethod
     def text_to_float(text):
         x = long(text, 16)
-        x -= (1<<(8<<2))-1
+        x -= (1 << (8 << 2)) - 1
         x = struct.unpack("<d", struct.pack("<q", x))[0]
         return x
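
To make the sortable hex encoding concrete, a short worked example that follows directly from the arithmetic in int_to_text/text_to_int as written:

    # int_to_text(0):   0 + (1 << (4 << 2)) - 1 = 65535  ->  u"0000ffff"
    # int_to_text(-1): -1 + 65535             = 65534  ->  u"0000fffe"
    # text_to_int(u"0000ffff"): 0xffff - 65535 = 0
    # so (small) negative values sort lexicographically before positive ones.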
     
     
     def __init__(self, stored=False, unique=False):
         """
-        :param stored: Whether the value of this field is stored with the document.
+        :param stored: Whether the value of this field is stored with the
+            document.
         :param unique: Whether the value of this field is unique per-document.
         """
         
     
     def process_text(self, text, **kwargs):
         text = text.replace(" ", "").replace(":", "").replace("-", "").replace(".", "")
-        return (text, )
+        return (text,)
     
     def parse_query(self, fieldname, qstring, boost=1.0):
         text = self.process_text(qstring)
     
 
 class KEYWORD(FieldType):
-    """Configured field type for fields containing space-separated or comma-separated
-    keyword-like data (such as tags). The default is to not store positional information
-    (so phrase searching is not allowed in this field) and to not make the field scorable.
+    """Configured field type for fields containing space-separated or
+    comma-separated keyword-like data (such as tags). The default is to not
+    store positional information (so phrase searching is not allowed in this
+    field) and to not make the field scorable.
     """
     
     __inittypes__ = dict(stored=bool, lowercase=bool, commas=bool, scorable=bool,
                          unique=bool, field_boost=float)
     
-    def __init__(self, stored = False, lowercase = False, commas = False,
-                 scorable = False, unique = False, field_boost = 1.0):
+    def __init__(self, stored=False, lowercase=False, commas=False,
+                 scorable=False, unique=False, field_boost=1.0):
         """
-        :param stored: Whether to store the value of the field with the document.
+        :param stored: Whether to store the value of the field with the
+            document.
         :param comma: Whether this is a comma-separated field. If this is False
             (the default), it is treated as a space-separated field.
         :param scorable: Whether this field is scorable.
         """
         
-        ana = KeywordAnalyzer(lowercase = lowercase, commas = commas)
-        self.format = Frequency(analyzer = ana, field_boost = field_boost)
+        ana = KeywordAnalyzer(lowercase=lowercase, commas=commas)
+        self.format = Frequency(analyzer=ana, field_boost=field_boost)
         self.scorable = scorable
         self.stored = stored
         self.unique = unique
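
A usage sketch of the KEYWORD type with the parameters documented above:

    >>> tags = KEYWORD(lowercase=True, commas=True, scorable=True)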
 
 
 class TEXT(FieldType):
-    """Configured field type for text fields (for example, the body text of an article). The
-    default is to store positional information to allow phrase searching. This field type
-    is always scorable.
+    """Configured field type for text fields (for example, the body text of an
+    article). The default is to store positional information to allow phrase
+    searching. This field type is always scorable.
     """
     
     __inittypes__ = dict(analyzer=object, phrase=bool, vector=Format,
                          stored=bool, field_boost=float)
     
-    def __init__(self, analyzer = None, phrase = True, vector = None,
-                 stored = False, field_boost = 1.0):
+    def __init__(self, analyzer=None, phrase=True, vector=None,
+                 stored=False, field_boost=1.0):
         """
-        :param stored: Whether to store the value of this field with the document. Since
-            this field type generally contains a lot of text, you should avoid storing it
-            with the document unless you need to, for example to allow fast excerpts in the
-            search results.
-        :param phrase: Whether the store positional information to allow phrase searching.
-        :param analyzer: The analysis.Analyzer to use to index the field contents. See the
-            analysis module for more information. If you omit this argument, the field uses
-            analysis.StandardAnalyzer.
+        :param stored: Whether to store the value of this field with the
+            document. Since this field type generally contains a lot of text,
+            you should avoid storing it with the document unless you need to,
+            for example to allow fast excerpts in the search results.
+        :param phrase: Whether to store positional information to allow phrase
+            searching.
+        :param analyzer: The analysis.Analyzer to use to index the field
+            contents. See the analysis module for more information. If you omit
+            this argument, the field uses analysis.StandardAnalyzer.
         """
         
         ana = analyzer or StandardAnalyzer()
             formatclass = Positions
         else:
             formatclass = Frequency
-        self.format = formatclass(analyzer = ana, field_boost = field_boost)
+        self.format = formatclass(analyzer=ana, field_boost=field_boost)
         self.vector = vector
         
         self.scorable = True
 
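A hedged sketch of the options the TEXT docstring describes (variable names invented; StandardAnalyzer comes from whoosh.analysis, as the docstring notes)::

    from whoosh.fields import TEXT
    from whoosh.analysis import StandardAnalyzer

    body = TEXT(stored=True)                    # keep positions, store value for excerpts
    notes = TEXT(phrase=False)                  # frequencies only; no phrase searching
    custom = TEXT(analyzer=StandardAnalyzer())  # explicit analyzer, same as the default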
 
 class NGRAM(FieldType):
-    """Configured field that indexes text as N-grams. For example, with a field type
-    NGRAM(3,4), the value "hello" will be indexed as tokens
+    """Configured field that indexes text as N-grams. For example, with a field
+    type NGRAM(3,4), the value "hello" will be indexed as tokens
     "hel", "hell", "ell", "ello", "llo".
     """
     
     __inittypes__ = dict(minsize=int, maxsize=int, stored=bool, field_boost=float)
     
-    def __init__(self, minsize = 2, maxsize = 4, stored = False, field_boost = 1.0):
+    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0):
         """
-        :param stored: Whether to store the value of this field with the document. Since
-            this field type generally contains a lot of text, you should avoid storing it
-            with the document unless you need to, for example to allow fast excerpts in the
-            search results.
+        :param stored: Whether to store the value of this field with the
+            document. Since this field type generally contains a lot of text,
+            you should avoid storing it with the document unless you need to,
+            for example to allow fast excerpts in the search results.
         :param minsize: The minimum length of the N-grams.
         :param maxsize: The maximum length of the N-grams.
         """
         
-        self.format = Frequency(analyzer = NgramAnalyzer(minsize, maxsize),
-                                field_boost = field_boost)
+        self.format = Frequency(analyzer=NgramAnalyzer(minsize, maxsize),
+                                field_boost=field_boost)
         self.scorable = True
         self.stored = stored
 
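The decomposition the NGRAM docstring describes can be reproduced in a few lines; this is only an illustration of the token set, not the actual NgramAnalyzer implementation::

    def ngrams(text, minsize, maxsize):
        for start in xrange(len(text)):
            for size in xrange(minsize, maxsize + 1):
                if start + size <= len(text):
                    yield text[start:start + size]

    assert list(ngrams("hello", 3, 4)) == ["hel", "hell", "ell", "ello", "llo"]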
     """Represents the collection of fields in an index. Maps field names to
     FieldType objects which define the behavior of each field.
     
-    Low-level parts of the index use field numbers instead of field names
-    for compactness. This class has several methods for converting between
-    the field name, field number, and field object itself.
+    Low-level parts of the index use field numbers instead of field names for
+    compactness. This class has several methods for converting between the
+    field name, field number, and field object itself.
     """
     
     def __init__(self, **fields):
-        """
-        All keyword arguments to the constructor are treated as fieldname = fieldtype
-        pairs. The fieldtype can be an instantiated FieldType object, or a FieldType
-        sub-class (in which case the Schema will instantiate it with the default
-        constructor before adding it).
+        """ All keyword arguments to the constructor are treated as fieldname =
+        fieldtype pairs. The fieldtype can be an instantiated FieldType object,
+        or a FieldType sub-class (in which case the Schema will instantiate it
+        with the default constructor before adding it).
         
         For example::
         
         return "<Schema: %s>" % repr(self._names)
     
     def __iter__(self):
-        """
-        Yields the sequence of fields in this schema.
+        """Yields the sequence of fields in this schema.
         """
         
         return iter(self._by_number)
     
     def __getitem__(self, id):
-        """
-        Returns the field associated with the given field name or number.
+        """Returns the field associated with the given field name or number.
         
         :param id: A field name or field number.
         """
         return self._by_number[id]
     
     def __len__(self):
-        """
-        Returns the number of fields in this schema.
+        """Returns the number of fields in this schema.
         """
         return len(self._by_number)
     
     def __contains__(self, fieldname):
-        """
-        Returns True if a field by the given name is in this schema.
+        """Returns True if a field by the given name is in this schema.
         
         :param fieldname: The name of the field.
         """
         return copy.deepcopy(self)
     
     def field_by_name(self, name):
-        """
-        Returns the field object associated with the given name.
+        """Returns the field object associated with the given name.
         
         :param name: The name of the field to retrieve.
         """
         return self._by_name[name]
     
     def field_by_number(self, number):
-        """
-        Returns the field object associated with the given number.
+        """Returns the field object associated with the given number.
         
         :param number: The number of the field to retrieve.
         """
         return self._by_number[number]
     
     def fields(self):
-        """
-        Yields ("fieldname", field_object) pairs for the fields
-        in this schema.
+        """Yields ("fieldname", field_object) pairs for the fields in this
+        schema.
         """
         return self._by_name.iteritems()
     
     def field_names(self):
-        """
-        Returns a list of the names of the fields in this schema.
+        """Returns a list of the names of the fields in this schema.
         """
         return self._names
     
     def add(self, name, fieldtype):
-        """
-        Adds a field to this schema. This is a low-level method; use keyword
+        """Adds a field to this schema. This is a low-level method; use keyword
         arguments to the Schema constructor to create the fields instead.
         
         :param name: The name of the field.
-        :param fieldtype: An instantiated fields.FieldType object, or a FieldType subclass.
-            If you pass an instantiated object, the schema will use that as the field
-            configuration for this field. If you pass a FieldType subclass, the schema
-            will automatically instantiate it with the default constructor.
+        :param fieldtype: An instantiated fields.FieldType object, or a
+            FieldType subclass. If you pass an instantiated object, the schema
+            will use that as the field configuration for this field. If you
+            pass a FieldType subclass, the schema will automatically
+            instantiate it with the default constructor.
         """
         
         if name.startswith("_"):
             return self.name_to_number(id)
     
     def to_name(self, id):
+        """Given a field name or number, returns the field's name.
+        """
         if isinstance(id, int):
             return self.number_to_name(id)
         else:

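Tying the pieces of this file together, a hedged sketch of the name/number conversions the Schema docstring mentions (field names invented; to_number is assumed to be the counterpart of the to_name method shown above)::

    from whoosh.fields import Schema, TEXT, KEYWORD

    schema = Schema(title=TEXT(stored=True), tags=KEYWORD(commas=True))

    num = schema.to_number("title")         # field name -> internal field number
    assert schema.to_name(num) == "title"   # ...and back again
    assert "tags" in schema                 # __contains__ checks by name
    assert len(schema) == 2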
src/whoosh/filedb/fileindex.py

 # well as Index for convenience, so they're broken out here.
 
 class SegmentDeletionMixin(object):
-    """Mix-in for classes that support deleting documents from self.segments."""
-    
-    def delete_document(self, docnum, delete = True):
+    """Mix-in for classes that support deleting documents from self.segments.
+    """
+
+    def delete_document(self, docnum, delete=True):
         """Deletes a document by number."""
-        self.segments.delete_document(docnum, delete = delete)
-    
+        self.segments.delete_document(docnum, delete=delete)
+
     def deleted_count(self):
         """Returns the total number of deleted documents in this index.
         """
         return self.segments.deleted_count()
-    
+
     def is_deleted(self, docnum):
         """Returns True if a given document number is deleted but
         not yet optimized out of the index.
         """
         return self.segments.is_deleted(docnum)
-    
+
     def has_deletions(self):
         """Returns True if this index has documents that are marked
         deleted but haven't been optimized out of the index yet.
         """
         return self.segments.has_deletions()
-    
+
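A minimal usage sketch of the mix-in's deletion methods, assuming ix is an open FileIndex and docnum a valid document number obtained elsewhere::

    if not ix.is_deleted(docnum):
        ix.delete_document(docnum)     # only marks the document; nothing is removed yet

    print ix.deleted_count(), ix.has_deletions()
    ix.commit()   # per SegmentSet.delete_document, deletions reach disk on commit()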
 
 class FileIndex(SegmentDeletionMixin, Index):
-    def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME):
+    def __init__(self, storage, schema, create=False,
+                 indexname=_DEF_INDEX_NAME):
         self.storage = storage
         self.indexname = indexname
-        
+
         if schema is not None and not isinstance(schema, Schema):
             raise ValueError("%r is not a Schema object" % schema)
-        
+
         self.generation = self.latest_generation()
-        
+
         if create:
             if schema is None:
                 raise IndexError("To create an index you must specify a schema")
-            
+
             self.schema = schema
             self.generation = 0
             self.segment_counter = 0
             self.segments = SegmentSet()
-            
+
             # Clear existing files
             prefix = "_%s_" % self.indexname
             for filename in self.storage:
                 if filename.startswith(prefix):
                     storage.delete_file(filename)
-            
+
             self._write()
         elif self.generation >= 0:
             self._read(schema)
         else:
             raise EmptyIndexError("No index named %r in storage %r" % (indexname, storage))
-        
+
         # Open a reader for this index. This is used by the
         # deletion methods, but mostly it's to keep the underlying
         # files open so they don't get deleted from underneath us.
         self._searcher = self.searcher()
-        
+
         self.segment_num_lock = Lock()
-    
+
     def __repr__(self):
-        return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname)
-    
+        return "%s(%r, %r)" % (self.__class__.__name__,
+                               self.storage, self.indexname)
+
     def __del__(self):
-        if hasattr(self, "_searcher") and self._searcher and not self._searcher.is_closed:
+        if (hasattr(self, "_searcher")
+            and self._searcher
+            and not self._searcher.is_closed):
             self._searcher.close()
-            
+
     def close(self):
         self._searcher.close()
-        
+
     def latest_generation(self):
         pattern = _toc_pattern(self.indexname)
-        
+
         max = -1
         for filename in self.storage:
             m = pattern.match(filename)
                 num = int(m.group(1))
                 if num > max: max = num
         return max
-    
+
     def refresh(self):
         if not self.up_to_date():
-            return self.__class__(self.storage, self.schema, indexname = self.indexname)
+            return self.__class__(self.storage, self.schema,
+                                  indexname=self.indexname)
         else:
             return self
-        
+
     def up_to_date(self):
         return self.generation == self.latest_generation()
-    
+
     def _write(self):
         # Writes the content of this index to the .toc file.
         for field in self.schema:
             field.clean()
         #stream = self.storage.create_file(self._toc_filename())
-        
+
         # Use a temporary file for atomic write.
         tocfilename = self._toc_filename()
         tempfilename = '%s.%s' % (tocfilename, time())
         stream = self.storage.create_file(tempfilename)
-        
+
         stream.write_varint(_INT_SIZE)
         stream.write_varint(_FLOAT_SIZE)
         stream.write_int(-12345)
-        
+
         stream.write_int(_INDEX_VERSION)
         for num in __version__[:3]:
             stream.write_varint(num)
-        
+
         stream.write_string(cPickle.dumps(self.schema, -1))
         stream.write_int(self.generation)
         stream.write_int(self.segment_counter)
         stream.write_pickle(self.segments)
         stream.close()
-        
+
         # Rename temporary file to the proper filename
         self.storage.rename_file(tempfilename, self._toc_filename(), safe=True)
-    
+
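The temporary-file-then-rename in _write() is the usual atomic-replace pattern; an os-level sketch for illustration (the index itself goes through its Storage object, so the names below are not Whoosh APIs)::

    import os
    from time import time

    def atomic_write(path, data):
        temp = "%s.%s" % (path, time())    # unique temporary name next to the target
        with open(temp, "wb") as f:
            f.write(data)
        os.rename(temp, path)              # atomic on POSIX; Windows needs the target gone first

Readers opening the TOC therefore see either the complete old file or the complete new one, never a half-written table of contents.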
     def _read(self, schema):
         # Reads the content of this index from the .toc file.
         stream = self.storage.open_file(self._toc_filename())
-        
+
         if stream.read_varint() != _INT_SIZE or \
            stream.read_varint() != _FLOAT_SIZE:
             raise IndexError("Index was created on an architecture with different data sizes")
-        
+
         if not stream.read_int() == -12345:
             raise IndexError("Number misread: byte order problem")
-        
+
         version = stream.read_int()
         if version != _INDEX_VERSION:
             raise IndexVersionError("Can't read format %s" % version, version)
         self.release = (stream.read_varint(),
                         stream.read_varint(),
                         stream.read_varint())
-        
-        # If the user supplied a schema object with the constructor,
-        # don't load the pickled schema from the saved index.
+
+        # If the user supplied a schema object with the constructor, don't load
+        # the pickled schema from the saved index.
         if schema:
             self.schema = schema
             stream.skip_string()
         else:
             self.schema = cPickle.loads(stream.read_string())
-        
+
         generation = stream.read_int()
         assert generation == self.generation
         self.segment_counter = stream.read_int()
         self.segments = stream.read_pickle()
         stream.close()
-    
+
     def _next_segment_name(self):
         #Returns the name of the next segment in sequence.
         if self.segment_num_lock.acquire():
                 self.segment_num_lock.release()
         else:
             raise LockError
-    
+
     def _toc_filename(self):
-        # Returns the computed filename of the TOC for this
-        # index name and generation.
+        # Returns the computed filename of the TOC for this index name and
+        # generation.
         return "_%s_%s.toc" % (self.indexname, self.generation)
-    
+
     def last_modified(self):
         return self.storage.file_modified(self._toc_filename())
-    
+
     def is_empty(self):
         return len(self.segments) == 0
-    
+
     def optimize(self):
         if len(self.segments) < 2 and not self.segments.has_deletions():
             return
-        
+
         from whoosh.filedb.filewriting import OPTIMIZE
         w = self.writer()
         w.commit(OPTIMIZE)
-        
-    def commit(self, new_segments = None):
+
+    def commit(self, new_segments=None):
         self._searcher.close()
-        
+
         if not self.up_to_date():
             raise OutOfDateError
-        
+
         if new_segments:
             self.segments = new_segments
-        
+
         self.generation += 1
         self._write()
         self._clean_files()
-        
+
         self._searcher = self.searcher()
-        
+
     def _clean_files(self):
         # Attempts to remove unused index files (called when a new generation
         # is created). If existing Index and/or reader objects have the files
-        # open, they may not get deleted immediately (i.e. on Windows)
-        # but will probably be deleted eventually by a later call to clean_files.
-        
+        # open, they may not get deleted immediately (i.e. on Windows) but will
+        # probably be deleted eventually by a later call to clean_files.
+
         storage = self.storage
         current_segment_names = set([s.name for s in self.segments])
-        
+
         tocpattern = _toc_pattern(self.indexname)
         segpattern = _segment_pattern(self.indexname)
-        
+
         for filename in storage:
             m = tocpattern.match(filename)
             if m:
                         except OSError:
                             # Another process still has this file open
                             pass
-                        
+
     def doc_count_all(self):
         return self.segments.doc_count_all()
-    
+
     def doc_count(self):
         return self.segments.doc_count()
-    
+
     def field_length(self, fieldid):
         fieldnum = self.schema.to_number(fieldid)
         return sum(s.field_length(fieldnum) for s in self.segments)
-    
+
     def reader(self):
         return self.segments.reader(self.storage, self.schema)
-    
+
     def writer(self, **kwargs):
         from whoosh.filedb.filewriting import FileIndexWriter
         return FileIndexWriter(self, **kwargs)
-        
+
 
 # SegmentSet object
 
     object to keep track of the segments in the index.
     """
 
-    def __init__(self, segments = None):
+    def __init__(self, segments=None):
         if segments is None:
             self.segments = []
         else:
             self.segments = segments
-        
+
         self._doc_offsets = self.doc_offsets()
-    
+
     def __repr__(self):
         return repr(self.segments)
-    
+
     def __len__(self):
-        """:returns: the number of segments in this set."""
+        """
+        :returns: the number of segments in this set.
+        """
         return len(self.segments)
-    
+
     def __iter__(self):
         return iter(self.segments)
-    
+
     def __getitem__(self, n):
         return self.segments.__getitem__(n)
-    
+
     def append(self, segment):
         """Adds a segment to this set."""
-        
+
         self.segments.append(segment)
         self._doc_offsets = self.doc_offsets()
-    
+
     def _document_segment(self, docnum):
         """Returns the index.Segment object containing the given document
         number.
         """
-        
+
         offsets = self._doc_offsets
         if len(offsets) == 1: return 0
         return bisect_right(offsets, docnum) - 1
-    
+
     def _segment_and_docnum(self, docnum):
-        """Returns an (index.Segment, segment_docnum) pair for the
-        segment containing the given document number.
+        """Returns an (index.Segment, segment_docnum) pair for the segment
+        containing the given document number.
         """
-        
+
         segmentnum = self._document_segment(docnum)
         offset = self._doc_offsets[segmentnum]
         segment = self.segments[segmentnum]
         return segment, docnum - offset
-    
+
     def copy(self):
         """:returns: a deep copy of this set."""
         return self.__class__([s.copy() for s in self.segments])
-    
+
     def doc_offsets(self):
         # Recomputes the document offset list. This must be called if you
         # change self.segments.
             offsets.append(base)
             base += s.doc_count_all()
         return offsets
-    
+
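The offset list built above turns a set-wide document number into a (segment, local docnum) pair with a single bisect; a self-contained sketch with made-up segment sizes::

    from bisect import bisect_right

    doc_counts = [10, 5, 20]          # hypothetical doc_count_all() per segment
    offsets, base = [], 0
    for count in doc_counts:          # mirrors doc_offsets()
        offsets.append(base)
        base += count                 # offsets == [0, 10, 15]

    def segment_and_docnum(docnum):
        segmentnum = bisect_right(offsets, docnum) - 1
        return segmentnum, docnum - offsets[segmentnum]

    assert segment_and_docnum(12) == (1, 2)   # document 12 is local doc 2 of segment 1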
     def doc_count_all(self):
         """
-        :returns: the total number of documents, DELETED or
-            UNDELETED, in this set.
+        :returns: the total number of documents, DELETED or UNDELETED, in this
+            set.
         """
         return sum(s.doc_count_all() for s in self.segments)
-    
+
     def doc_count(self):
         """
         :returns: the number of undeleted documents in this set.
         """
         return sum(s.doc_count() for s in self.segments)
-    
-    
+
+
     def has_deletions(self):
         """
-        :returns: True if this index has documents that are marked
-            deleted but haven't been optimized out of the index yet.
-            This includes deletions that haven't been written to disk
-            with Index.commit() yet.
+        :returns: True if this index has documents that are marked deleted but
+            haven't been optimized out of the index yet. This includes
+            deletions that haven't been written to disk with Index.commit()
+            yet.
         """
         return any(s.has_deletions() for s in self.segments)
-    
-    def delete_document(self, docnum, delete = True):
+
+    def delete_document(self, docnum, delete=True):
         """Deletes a document by number.
 
         You must call Index.commit() for the deletion to be written to disk.
         """
-        
+
         segment, segdocnum = self._segment_and_docnum(docnum)
-        segment.delete_document(segdocnum, delete = delete)
-    
+        segment.delete_document(segdocnum, delete=delete)
+
     def deleted_count(self):
         """
         :returns: the total number of deleted documents in this index.
         """
         return sum(s.deleted_count() for s in self.segments)
-    
+
     def is_deleted(self, docnum):
         """
         :returns: True if a given document number is deleted but not yet
             optimized out of the index.
         """
-        
+
         segment, segdocnum = self._segment_and_docnum(docnum)
         return segment.is_deleted(segdocnum)
-    
+
     def reader(self, storage, schema):
         from whoosh.filedb.filereading import SegmentReader
         segments = self.segments
             readers = [SegmentReader(storage, segment, schema)
                        for segment in segments]
             return MultiReader(readers, self._doc_offsets, schema)
-    
+
 
 class Segment(object):
-    """Do not instantiate this object directly. It is used by the Index
-    object to hold information about a segment. A list of objects of this
-    class are pickled as part of the TOC file.
+    """Do not instantiate this object directly. It is used by the Index object
+    to hold information about a segment. A list of objects of this class is
+    pickled as part of the TOC file.
     
     The TOC file stores a minimal amount of information -- mostly a list of
     Segment objects. Segments are the real reverse indexes. Having multiple
     contents of existing segments into one (removing any deleted documents
     along the way).
     """
-    
-    def __init__(self, name, max_doc, field_length_totals, deleted = None):
+
+    def __init__(self, name, max_doc, field_length_totals, deleted=None):
         """
-        :param name: The name of the segment (the Index object computes this from its
-            name and the generation).
+        :param name: The name of the segment (the Index object computes this
+            from its name and the generation).
         :param max_doc: The maximum document number in the segment.
-        :param field_length_totals: A dictionary mapping field numbers to the total
-            number of terms in that field across all documents in the segment.
-        :param deleted: A set of deleted document numbers, or None if no deleted
-            documents exist in this segment.
+        :param field_length_totals: A dictionary mapping field numbers to the
+            total number of terms in that field across all documents in the
+            segment.
+        :param deleted: A set of deleted document numbers, or None if no
+            deleted documents exist in this segment.
         """
-        
+
         self.name = name
         self.max_doc = max_doc
         self.field_length_totals = field_length_totals
         self.deleted = deleted
-        
+
         self.doclen_filename = self.name + ".dci"
         self.docs_filename = self.name + ".dcz"
         self.term_filename = self.name + ".tiz"
         self.vector_filename = self.name + ".fvz"
         self.posts_filename = self.name + ".pst"
         self.vectorposts_filename = self.name + ".vps"
-    
+
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self.name)
-    
+
     def copy(self):
         if self.deleted:
             deleted = set(self.deleted)
         return Segment(self.name, self.max_doc,
                        self.field_length_totals,
                        deleted)
-    
+
     def doc_count_all(self):
         """
-        :returns: the total number of documents, DELETED OR UNDELETED,
-            in this segment.
+        :returns: the total number of documents, DELETED OR UNDELETED, in this
+            segment.
         """
         return self.max_doc
-    
+
     def doc_count(self):
-        """:returns: the number of (undeleted) documents in this segment."""
+        """
+        :returns: the number of (undeleted) documents in this segment.
+        """
         return self.max_doc - self.deleted_count()
-    
+
     def has_deletions(self):
-        """:returns: True if any documents in this segment are deleted."""
+        """
+        :returns: True if any documents in this segment are deleted.
+        """
         return self.deleted_count() > 0
-    
+
     def deleted_count(self):
-        """:returns: the total number of deleted documents in this segment."""
+        """
+        :returns: the total number of deleted documents in this segment.
+        """
         if self.deleted is None: return 0
         return len(self.deleted)
-    
+
     def field_length(self, fieldnum):
         """
         :param fieldnum: the internal number of the field.
             documents in this segment.
         """
         return self.field_length_totals.get(fieldnum, 0)
-    
-    def delete_document(self, docnum, delete = True):
+
+    def delete_document(self, docnum, delete=True):
         """Deletes the given document number. The document is not actually
         removed from the index until it is optimized.
 
         :param docnum: The document number to delete.
         :param delete: If False, this undeletes a deleted document.
         """
-        
+
         if delete:
             if self.deleted is None:
                 self.deleted = set()
             elif docnum in self.deleted:
                 raise KeyError("Document %s in segment %r is already deleted"
                                % (docnum, self.name))
-            
+
             self.deleted.add(docnum)
         else: