Matt Chaput committed e1706fb

Fixed up docstrings.
Changed uses of isinstance(x, type) to callable(x) for flexibility.
util.ClosableMixin makes classes with a close() method into context managers.

store.py: made Storage.lock() return True on success.
postpool.py: commented out MemoryPool.
qparser.py: improved handling of analyzers.
query.py: Variations now computes variants on-the-fly.
searching.py: Fixed up __getitem__.
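
The ClosableMixin mentioned above is not shown in this diff; as a rough sketch
(assuming util.py defines it along these lines), it only needs the two context
manager hooks and delegates cleanup to the class's existing close() method:

    class ClosableMixin(object):
        """Mix-in for objects with a close() method, allowing them to be
        used as a context manager (i.e. with the 'with' statement)."""

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            self.close()

This is what lets classify.py below replace its try/finally around
ix.term_reader() with a plain "with" block.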

Comments (0)

Files changed (15)

src/whoosh/__init__.py

 # limitations under the License.
 #===============================================================================
 
+from whoosh.index import open_dir, create_index_in

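A quick sketch of what these package-level re-exports enable (the directory
name "indexdir" here is just an example):

    import whoosh
    from whoosh import fields

    # create_index_in() now also accepts fieldname=fieldtype keyword
    # arguments when no Schema object is passed (see index.py below).
    ix = whoosh.create_index_in("indexdir", content = fields.TEXT)

    # Later, reopen the same index without touching whoosh.index directly.
    ix = whoosh.open_dir("indexdir")
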
src/whoosh/analysis.py

 #===============================================================================
 
 """
-TFunctions and classes for turning a piece of text into an
-indexable stream of words.
+Classes and functions for turning a piece of text into
+an indexable stream of "tokens" (usually equivalent to words). There are
+three general types of classes/functions involved in analysis:
 
-This module defines three types of functions/classes:
-
-Tokenizers: callables that take a string and yield tokenized "words".
-
-Filters: callables that take a "word" generator and filter it.
-
-Analyzers: a callable that combines a tokenizer and filters for
-convenience.
+    - Tokenizers are always at the start of the text processing pipeline.
+      They take a string and yield Token objects (actually, the same token
+      object over and over, for performance reasons) corresponding to the
+      tokens (words) in the text.
+      
+      Every tokenizer is simply a callable that takes a string and returns a
+      generator of tokens.
+      
+    - Filters take the tokens from the tokenizer and perform various
+      transformations on them. For example, the LowerCaseFilter converts
+      all tokens to lowercase, which is usually necessary when indexing
+      regular English text.
+      
+      Every filter is a callable that takes a token generator and returns
+      a token generator.
+      
+    - Analyzers are convenience functions/classes that "package up" a
+      tokenizer and zero or more filters into a single unit, so you
+      don't have to construct the tokenizer-filter-filter-etc. pipeline
+      yourself. For example, the StandardAnalyzer combines a RegexTokenizer,
+      LowerCaseFilter, and StopFilter.
+    
+      Every analyzer is simply a callable that takes a string and returns a
+      token generator.
 """
 
 import re
 # Token object
 
 class Token(object):
-    __slots__ = ("positions", "chars",
-                 "orig", "text", "pos", "startchar", "endchar",
-                 "stopped")
+    """
+    Represents a "token" (usually a word) extracted from the source text
+    being indexed.
     
-    def __init__(self, positions, chars):
+    Because object instantiation in Python is slow, tokenizers should create
+    ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
+    each time.
+    
+    This trick means that consumers of tokens (i.e. filters) must
+    never try to hold onto the token object between loop iterations, or convert
+    the token generator into a list.
+    Instead, save the attributes between iterations, not the object::
+    
+        def RemoveDuplicatesFilter(stream):
+            # Removes consecutive duplicate words.
+            lasttext = None
+            for token in stream:
+                # Only yield the token if its text doesn't
+                # match the previous token.
+                if lasttext != token.text:
+                    yield token
+                lasttext = token.text
+    
+    The Token object supports the following attributes:
+    
+        - text (string): The text of this token.
+        - original (string): The original text of the token, set by the tokenizer
+          and never modified by filters.
+        - positions (boolean): whether this token contains a position. If this
+          is True, the 'pos' attribute should be set to the index of the token
+          (e.g. for the first token, pos = 0, for the second token, pos = 1, etc.)
+        - chars (boolean): whether this token contains character offsets. If this
+          is True, the 'startchar' and 'endchar' attributes should be set to the
+          starting character offset and the ending character offset of this token.
+        - stopped (boolean): whether this token has been stopped by a stop-word
+          filter (not currently used).
+        - boosts (boolean): whether this token contains a per-token boost. If this
+          is True, the 'boost' attribute should be set to the current boost factor.
+    """
+    
+    __slots__ = ("positions", "chars", "boosts",
+                 "original", "text", "pos", "startchar", "endchar",
+                 "stopped", "boost")
+    
+    def __init__(self, positions, chars, boosts = False):
+        """
+        @param positions: Whether this token should have the token position in
+            the 'pos' attribute.
+        @param chars: Whether this token should have the token's character offsets
+            in the 'startchar' and 'endchar' attributes.
+        @param boosts: Whether this token should have a per-token boost in the
+            'boost' attribute.
+        """
+        
         self.positions = positions
         self.chars = chars
+        self.boosts = boosts
         self.stopped = False
-
-# Support functions
-
-def gram(text, min, max):
-    """
-    Breaks a text into N-grams. min is the minimum size of the N-grams,
-    max is the maximum size. For example, gram("hello", 3, 4) will yield
-    ["hel", "ell", "llo", "hell", "ello"]
-    """
-    
-    inLen = len(text)
-    for g in xrange(min, max + 1):
-        pos = 0
-        limit = inLen - g + 1
-        for pos in xrange(0, limit):
-            yield text[pos:pos + g]
+        self.boost = 1.0
 
 # Tokenizers
 
 def IDTokenizer(value, positions = False, chars = False, start_pos = 0, start_char = 0):
     """
-    Returns the entire input string as a single token. For use
+    Yields the entire input string as a single token. For use
     in indexed but untokenized fields, such as a document's path.
     """
     
     t = Token(positions, chars)
-    t.orig = t.text = value
+    t.original = t.text = value
     if positions:
         t.pos = start_pos + 1
     if chars:
     Uses a regular expression to extract tokens from text.
     """
     
-    default_expression = re.compile("\w+", re.UNICODE)
+    _default_expression = re.compile("\w+", re.UNICODE)
     
     def __init__(self, expression = None):
-        self.expression = expression or self.default_expression
+        """
+        @param expression: A compiled regular expression object. Each match
+            of the expression equals a token. For example, the expression
+            re.compile("[A-Za-z0-9]+") would give tokens that only contain
+            letters and numbers. Group 0 (the entire matched text) is used
+            as the text of the token. If you require more complicated handling
+            of the expression match, simply write your own tokenizer.
+        @type expression: re.RegexObject
+        """
         
+        self.expression = expression or self._default_expression
+    
     def __call__(self, value, positions = False, chars = False,
                  start_pos = 0, start_char = 0):
+        """
+        @param value: The text to tokenize.
+        @param positions: Whether to record token positions in the token.
+        @param chars: Whether to record character offsets in the token.
+        @param start_pos: The position number of the first token. For example,
+            if you set start_pos=2, the tokens will be numbered 2,3,4,...
+            instead of 0,1,2,...
+        @param start_char: The offset of the first character of the first
+            token. For example, if you set start_char=2, the text "aaa bbb"
+            will have chars (2,5),(6,9) instead of (0,3),(4,7).
+        @type value: string
+        """
+        
         t = Token(positions, chars)
         
         for pos, match in enumerate(self.expression.finditer(value)):
-            t.orig = t.text = match.group(0)
+            t.original = t.text = match.group(0)
             t.stopped = False
             if positions:
                 t.pos = start_pos + pos
     Splits tokens by whitespace.
     """
     
-    default_expression = re.compile("[^ \t\r\n]+")
+    _default_expression = re.compile("[^ \t\r\n]+")
 
 
 class CommaSeparatedTokenizer(RegexTokenizer):
     Splits tokens by commas with optional whitespace.
     """
     
-    default_expression = re.compile("[^,]+")
+    _default_expression = re.compile("[^,]+")
     
     def __call__(self, value, positions = False, chars = False,
                  start_pos = 0, start_char = 0):
         t = Token(positions, chars)
         
         for pos, match in enumerate(self.expression.finditer(value)):
-            t.orig = t.text = match.group(0).strip()
+            t.original = t.text = match.group(0).strip()
             t.stopped = False
             if positions:
                 t.pos = start_pos + pos
 
 class NgramTokenizer(object):
     """
-    Splits input text into Ngrams instead of words.
+    Splits input text into N-grams instead of words. For example,
+    NgramTokenizer(3, 4)("hello") will yield token texts
+    "hel", "hell", "ell", "ello", "llo".
+    
+    Note that this tokenizer does NOT use a regular expression
+    to extract words, so the grams emitted by it will contain
+    whitespace, punctuation, etc. You may want to add a custom filter
+    to this tokenizer's output. Alternatively, if you only want
+    sub-word grams without whitespace, you could use RegexTokenizer
+    with NgramFilter instead.
     """
     
     def __init__(self, minsize, maxsize = None):
         """
-        min is the minimum length of the Ngrams to output, max is the
-        maximum length to output. normalize is a regular expression that
-        is globally replaced by spaces (used to eliminate punctuation).
+        @param minsize: The minimum size of the N-grams.
+        @param maxsize: The maximum size of the N-grams. If you omit
+            this parameter, maxsize == minsize.
         """
         
         self.min = minsize
                 end = start + size
                 if end > inlen: continue
                 
-                t.orig = t.text = value[start:end]
                 t.stopped = False
                 if positions:
                     t.pos = pos
 # Filters
 
 def PassFilter(tokens):
+    """
+    An identity filter: passes the tokens through untouched.
+    """
+    
     for t in tokens:
         yield t
 
 
+class NgramFilter(object):
+    """
+    Splits token text into N-grams. For example,
+    NgramFilter(3, 4), for a token with text "hello", will yield token texts
+    "hel", "hell", "ell", "ello", "llo".
+    """
+    
+    def __init__(self, minsize, maxsize = None):
+        """
+        @param minsize: The minimum size of the N-grams.
+        @param maxsize: The maximum size of the N-grams. If you omit
+            this parameter, maxsize == minsize.
+        """
+        
+        self.min = minsize
+        self.max = maxsize or minsize
+        
+    def __call__(self, tokens):
+        for t in tokens:
+            text, chars = t.text, t.chars
+            if chars:
+                startchar = t.startchar
+            # Token positions don't mean much for N-grams,
+            # so we'll leave the token's original position
+            # untouched.
+            
+            for start in xrange(0, len(text) - self.min + 1):
+                for size in xrange(self.min, self.max + 1):
+                    end = start + size
+                    if end > len(text): continue
+                    
+                    t.text = text[start:end]
+                    
+                    if chars:
+                        t.startchar = startchar + start
+                        t.endchar = startchar + end
+                        
+                    yield t
+
+
 class StemFilter(object):
     """
-    Stems (removes suffixes from) words using the Porter stemming algorithm.
-    Stemming attempts to reduce multiple forms of the same root word (for
-    example, "rendering", "renders", "rendered", etc.) to a single word in
+    Stems (removes suffixes from) the text of tokens using the Porter stemming
+    algorithm. Stemming attempts to reduce multiple forms of the same root word
+    (for example, "rendering", "renders", "rendered", etc.) to a single word in
     the index.
+    
+    Note that I recommend you use a strategy of morphologically expanding the
+    query terms (see query.Variations) rather than stemming the indexed words.
     """
     
     def __init__(self, ignore = None):
         """
-        ignore is a sequence of words to avoid stemming; the default
-        is to stem all words.
+        @param ignore: a collection of words that should not be stemmed. This
+            is converted into a frozenset. If you omit this argument, all tokens
+            are stemmed.
+        @type ignore: sequence 
         """
         
         self.cache = {}
             self.ignores = frozenset(ignore)
     
     def clear(self):
+        """
+        This filter memoizes previously stemmed words to greatly speed up
+        stemming. This method clears the cache of previously stemmed words.
+        """
         self.cache.clear()
     
     def __call__(self, tokens):
                         t.endchar = oldstart + match.end()
                     yield t
 
+
 _underscore_exp = re.compile("[A-Z][a-z]*|[a-z]+|[0-9]+")
 def UnderscoreFilter(tokens):
     """
 
     def __init__(self, stoplist = STOP_WORDS, minsize = 2):
         """
-        Stoplist is a sequence of words to remove from the stream (this
-        is converted to a frozenset); the default is a list of common
-        stop words (analysis.STOP_WORDS). minsize is a minimum length
-        requirement for any word; the default is 2. Words smaller than
-        minsize are removed from the stream.
+        @param stoplist: A collection of words to remove from the stream.
+            This is converted to a frozenset. The default is a list of
+            common stop words.
+        @param minsize: The minimum length of token texts. Tokens with
+            text smaller than this will be stopped.
+        @type stoplist: sequence
         """
         
-        if not isinstance(stoplist, (set, frozenset)):
-            stoplist = frozenset(stoplist)
-        self.stops = stoplist
+        if stoplist is None:
+            self.stops = frozenset()
+        else:
+            self.stops = frozenset(stoplist)
         self.min = minsize
     
     def __call__(self, tokens):
             if len(text) >= minsize and text not in stoplist:
                 yield t
 
+
 def LowerCaseFilter(tokens):
     """
-    Lowercases (using .lower()) words in the stream.
+    Uses str.lower() to lowercase token text. For example, tokens
+    "This","is","a","TEST" become "this","is","a","test".
     """
     
     for t in tokens:
 # Analyzers
 
 class Analyzer(object):
+    """
+    Abstract base class for analyzers.
+    """
+    
     def __repr__(self):
         return "%s()" % self.__class__.__name__
 
+    def __call__(self, value):
+        raise NotImplementedError
+
 
 class IDAnalyzer(Analyzer):
+    """
+    Yields the original text as a single token. This is useful for fields
+    you don't want to tokenize, such as the path of a file.
+    """
+    
     def __init__(self, strip = True):
+        """
+        @param strip: Whether to use str.strip() to strip whitespace
+            from the value before yielding it as a token.
+        @type strip: boolean
+        """
         self.strip = strip
     
     def __call__(self, value, **kwargs):
 
 
 class SpaceSeparatedAnalyzer(Analyzer):
+    """
+    Parses space-separated tokens.
+    """
+    
     def __init__(self):
         self.tokenizer = SpaceSeparatedTokenizer()
     
 
 
 class CommaSeparatedAnalyzer(Analyzer):
+    """
+    Parses comma-separated tokens (with optional whitespace surrounding
+    the commas).
+    """
+    
     def __init__(self):
         self.tokenizer = CommaSeparatedTokenizer()
         
     """
     
     def __init__(self, stoplist = STOP_WORDS, minsize = 2):
+        """
+        @param stoplist: See analysis.StopFilter.
+        @param minsize: See analysis.StopFilter.
+        """
+        
         self.tokenizer = RegexTokenizer()
         
         if stoplist is None:
 
 class FancyAnalyzer(Analyzer):
     """
-    Uses a RegexTokenizer (by default) and applies a CamelFilter,
+    Uses a RegexTokenizer and applies a CamelFilter,
     UnderscoreFilter, LowerCaseFilter, and StopFilter.
     """
     
     def __init__(self, stoplist = STOP_WORDS, minsize = 2):
+        """
+        @param stoplist: See analysis.StopFilter.
+        @param minsize: See analysis.StopFilter.
+        """
+        
         self.tokenizer = RegexTokenizer()
         self.stopper = StopFilter(stoplist = stoplist, minsize = minsize)
         
     """
     
     def __init__(self, minsize, maxsize = None):
+        """
+        See analysis.NgramTokenizer.
+        """
         self.tokenizer = NgramTokenizer(minsize, maxsize = maxsize)
         
     def __call__(self, value, positions = False, chars = False):
 
 
 if __name__ == '__main__':
-    import timeit
-    
-    fix = """
-from whoosh.analysis import CamelFilter, FancyAnalyzer, StandardAnalyzer
-d = open("/Volumes/Storage/Development/help/documents/nodes/sop/copy.txt").read()
-sa = StandardAnalyzer()
-fa = FancyAnalyzer()
-"""
-    
-    t = timeit.Timer("l = [t.text for t in sa(d)]", fix)
-    print t.timeit(100)
-    
-    t = timeit.Timer("l = [t.text for t in fa(d)]", fix)
-    print t.timeit(100)
+    pass
 
 
 
 
 
 
-

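As a sketch of the pipeline described in the new analysis.py docstrings, the
pieces compose by simple call chaining; this is all StandardAnalyzer does for
you (output shown assuming "the" is in the default stop list):

    from whoosh.analysis import RegexTokenizer, LowerCaseFilter, StopFilter

    tokenize = RegexTokenizer()
    stopper = StopFilter()

    def my_analyzer(value):
        # The tokenizer yields Token objects; each filter takes a token
        # generator and returns a token generator, so they chain.
        return stopper(LowerCaseFilter(tokenize(value)))

    print [t.text for t in my_analyzer(u"Rendering THE documents")]
    # -> [u"rendering", u"documents"]
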
src/whoosh/classify.py

 Classes and functions for classifying and extracting information from documents.
 """
 
-from __future__ import division
+from __future__ import division, with_statement
 from collections import defaultdict
 from math import log
 
     
     def __init__(self, ix, fieldname, model = None):
         """
-        ix is an Index object.
-        model is an ExpansionModel object. The default is Bo1Model.
+        @param ix: The index to search.
+        @param fieldname: The name of the field in which to search.
+        @param model: The model to use for expanding the query terms. If you
+            omit this parameter, the expander uses Bo1Model by default.
+        @type ix: index.Index
+        @type fieldname: string
+        @type model: classify.ExpansionModel
         """
         
         self.index = ix
         
         if model is None:
             self.model = Bo1Model(ix)
-        elif isinstance(model, type):
+        elif callable(model):
             self.model = model(ix)
         else:
             self.model = model
         # Cache the collection weight of every term in this
         # field. This turns out to be much faster than reading each
         # individual weight from the term index as we add words.
-        tr = ix.term_reader()
-        try:
+        with ix.term_reader() as tr:
             collection_weight = {}
             for word in tr.lexicon(fieldname):
                 collection_weight[word] = tr.term_count(fieldname, word)
             self.collection_weight = collection_weight
-        finally:
-            tr.close()
         
         # Maps words to their weight in the top N documents.
         self.topN_weight = defaultdict(float)
     def add(self, term_vector):
         """
         Adds forward-index information about one of the "top N" documents.
-        term_vector is a dictionary mapping term text to weight in the document.
+        
+        @param term_vector: a dictionary mapping term text to weight in the document.
         """
         
         total_weight = 0
             
         self.top_total += total_weight
     
-    def expanded_terms(self, number, normalize = True, min_docs = 2):
+    def expanded_terms(self, number, normalize = True):
+        """
+        Returns the N most important terms in the vectors added so far.
+        
+        @param number: The number of terms to return.
+        @param normalize: Whether to normalize the weights.
+        @return: A list of ("term", weight) tuples.
+        """
+        
         model = self.model
         tlist = []
         maxweight = 0
         
         return [(t, weight) for weight, t in tlist[:number]]
 
+
 # Expansion models
 
 class ExpansionModel(object):

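A hypothetical use of the Expander API above (ix, top_docnums, and
vector_for() are stand-ins for an open index, the document numbers of your
"top N" results, and whatever code you use to fetch a document's term vector
as a {text: weight} dict):

    from whoosh import classify

    # Passing the model class itself works now that the constructor
    # checks callable(model) instead of isinstance(model, type).
    expander = classify.Expander(ix, "content", model = classify.Bo1Model)

    for docnum in top_docnums:
        expander.add(vector_for(docnum))

    print expander.expanded_terms(10)   # -> [("term", weight), ...]
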
src/whoosh/fields.py

 
 """
 This module contains functions and classes related to fields.
+
+
 """
 
 from collections import defaultdict
 class FieldConfigurationError(Exception):
     pass
 
+
 # Field Types
 
 class FieldType(object):
+    """
+    Represents a field configuration.
+    
+    The FieldType object supports the following attributes:
+    
+        - format (fields.Format): the storage format for the field's contents.
+        
+        - vector (fields.Format): the storage format for the field's vectors.
+        
+        - scorable (boolean): whether searches against this field may be scored.
+          This controls whether the index stores per-document field lengths for
+          this field.
+          
+        - stored (boolean): whether the content of this field is stored for each
+          document. For example, in addition to indexing the title of a document,
+          you usually want to store the title so it can be presented as part of
+          the search results.
+      
+    The constructor for the base field type simply lets you supply your
+    own configured field format, vector format, and scorable and stored
+    values. Subclasses may configure some or all of this for you.
+    """
+    
     format = vector = scorable = stored = None
     
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-class CUSTOM(FieldType):
     def __init__(self, format, vector = None,
                  scorable = False, stored = False):
         self.format = format
         self.vector = vector
         self.scorable = scorable
+        self.stored = stored
 
 
 class ID(FieldType):
+    """
+    Configured field type that indexes the entire value of the field as one
+    token. This is useful for data you don't want to tokenize, such as the
+    path of a file.
+    """
+    
     def __init__(self, stored = False):
+        """
+        @param stored: Whether the value of this field is stored with the document.
+        """
         self.format = Existance(analyzer = analysis.IDAnalyzer())
         self.stored = stored
 
 
 class STORED(FieldType):
+    """
+    Configured field type for fields you want to store but not index.
+    """
+    
     def __init__(self):
         self.format = Stored()
         self.stored = True
 
 
 class KEYWORD(FieldType):
+    """
+    Configured field type for fields containing space-separated or comma-separated
+    keyword-like data (such as tags). The default is to not store positional information
+    (so phrase searching is not allowed in this field) and to not make the field scorable.
+    """
+    
     def __init__(self, stored = False, comma = False, scorable = False):
+        """
+        @param stored: Whether to store the value of the field with the document.
+        @param comma: Whether this is a comma-separated field. If this is False
+            (the default), it is treated as a space-separated field.
+        @param scorable: Whether this field is scorable.
+        """
+        
         ana = analysis.CommaSeparatedAnalyzer if comma else analysis.SpaceSeparatedAnalyzer()
         self.format = Frequency(analyzer = ana)
         self.scorable = scorable
 
 
 class TEXT(FieldType):
+    """
+    Configured field type for text fields (for example, the body text of an article). The
+    default is to store positional information to allow phrase searching. This field type
+    is always scorable.
+    """
+    
     def __init__(self, stored = False, phrase = True, analyzer = None):
+        """
+        @param stored: Whether to store the value of this field with the document. Since
+            this field type generally contains a lot of text, you should avoid storing it
+            with the document unless you need to, for example to allow fast excerpts in the
+            search results.
+        @param phrase: Whether to store positional information to allow phrase searching.
+        @param analyzer: The analyzer to use to index the field contents. See the analysis
+            module for more information. If you omit this argument, the field uses
+            analysis.StandardAnalyzer.
+        @type analyzer: analysis.Analyzer
+        """
+        
         ana = analyzer or analysis.StandardAnalyzer()
         self.format = Frequency(analyzer = ana)
         
 
 
 class NGRAM(FieldType):
-    def __init__(self, stored = False, minsize = 2, maxsize = 4):
+    """
+    Configured field that indexes text as N-grams. For example, with a field type
+    NGRAM(3,4), the value "hello" will be indexed as tokens
+    "hel", "hell", "ell", "ello", "llo".
+    """
+    
+    def __init__(self, minsize = 2, maxsize = 4, stored = False):
+        """
+        @param stored: Whether to store the value of this field with the document. Since
+            this field type generally contains a lot of text, you should avoid storing it
+            with the document unless you need to, for example to allow fast excerpts in the
+            search results.
+        @param minsize: The minimum length of the N-grams.
+        @param maxsize: The maximum length of the N-grams.
+        """
+        
         self.format = Frequency(analyzer = analysis.NgramAnalyzer(minsize, maxsize))
         self.scorable = True
         self.stored = stored
 
 class Schema(object):
     """
-    Represents the fields in an index. Maps names to FieldType objects
-    which define the behavior of each field.
+    Represents the collection of fields in an index. Maps field names to
+    FieldType objects which define the behavior of each field.
+    
+    Low-level parts of the index use field numbers instead of field names
+    for compactness. This class has several methods for converting between
+    the field name, field number, and field object itself.
     """
     
     def __init__(self, **fields):
+        """
+        All keyword arguments to the constructor are treated as fieldname = fieldtype
+        pairs. The fieldtype can be an instantiated FieldType object, or a FieldType
+        sub-class (in which case the Schema will instantiate it with the default
+        constructor before adding it).
+        
+        For example::
+        
+            s = Schema(content = TEXT,
+                       title = TEXT(stored = True),
+                       tags = KEYWORD(stored = True))
+        """
+        
         self._by_number = []
         self._names = []
         self._by_name = {}
         return "<Schema: %s>" % repr(self._names)
     
     def __iter__(self):
+        """
+        Yields the sequence of fields in this schema.
+        """
+        
         return iter(self._by_number)
     
     def __getitem__(self, id):
+        """
+        Returns the field associated with the given field name or number.
+        
+        @param id: A field name or field number.
+        """
+        
         if isinstance(id, basestring):
             return self._by_name[id]
         return self._by_number[id]
     
     def __len__(self):
+        """
+        Returns the number of fields in this schema.
+        """
         return len(self._by_number)
     
-    def __contains__(self, field):
-        return field in self._by_name
+    def __contains__(self, fieldname):
+        """
+        Returns True if a field by the given name is in this schema.
+        
+        @param fieldname: The name of the field.
+        @type fieldname: string
+        """
+        return fieldname in self._by_name
     
     def field_by_name(self, name):
+        """
+        Returns the field object associated with the given name.
+        
+        @param name: The name of the field to retrieve.
+        """
         return self._by_name[name]
     
     def field_by_number(self, number):
+        """
+        Returns the field object associated with the given number.
+        
+        @param number: The number of the field to retrieve.
+        """
         return self._by_number[number]
     
     def fields(self):
+        """
+        Yields ("fieldname", field_object) pairs for the fields
+        in this schema.
+        """
         return self._by_name.iteritems()
     
     def field_names(self):
         """
         return self._names
     
-    def add(self, name, fieldtype, **kwargs):
+    def add(self, name, fieldtype):
         """
         Adds a field to this schema.
+        
+        @param name: The name of the field.
+        @param fieldtype: An instantiated FieldType object, or a FieldType subclass.
+            If you pass an instantiated object, the schema will use that as the field
+            configuration for this field. If you pass a FieldType subclass, the schema
+            will automatically instantiate it with the default constructor.
+        @type fieldtype: fields.FieldType
         """
         
         if name.startswith("_"):
         elif name in self._by_name:
             raise FieldConfigurationError("Schema already has a field named %s" % name)
         
-        if isinstance(fieldtype, type):
-            fieldtype = fieldtype(**kwargs)
+        if callable(fieldtype):
+            fieldtype = fieldtype()
         if not isinstance(fieldtype, FieldType):
             raise FieldConfigurationError("%r is not a FieldType object" % fieldtype)
         
 
 class Format(object):
     """
-    Abstract base class representing a field in an indexed document.
+    Abstract base class representing a storage format for a field or vector.
+    Format objects are responsible for writing and reading the low-level
+    representation of a field, and control what kind/level of information
+    to store about the indexed field.
     """
     
     def __init__(self, analyzer, field_boost = 1.0, **options):
         """
-        analyzer is an analyzer object to use to
-        index this field (see the analysis module). Set the analyzer
-        to None if the field should not be indexed/searchable.
-        field_boost is a floating point factor to apply to the score of any
-        results from this field. stored controls whether the contents of this
-        field are stored in the index.
+        @param analyzer: The analyzer object to use to index this field.
+            See the analysis module for more information. If this value
+            is None, the field is not indexed/searchable.
+        @param field_boost: A constant boost factor to add to the score
+            of all queries matching terms in this field.
+        @type analyzer: analysis.Analyzer
+        @type field_boost: float
         """
         
         self.analyzer = analyzer
     
     def word_datas(self, value, **kwargs):
         """
-        Yields a series of "data" tuples from a string.
-        Applies the field's analyzer to get a stream of tokens from
-        the string, then turns the stream of words into a stream of
-        (word, freq, data) tuples, where "data" is field-specific information
-        about the word. The data may also be the frequency (eg in
-        a Frequency field, 'freq' and 'data' would be the same in the absence
-        of any boost).
+        Takes the text value to be indexed and yields a series of
+        ("tokentext", frequency, data) tuples, where frequency is the number
+        of times "tokentext" appeared in the value, and data is field-specific
+        posting data for the token. For example, in a Frequency format, data
+        would be the same as frequency; in a Positions format, data would be a
+        list of token positions at which "tokentext" occurred.
+        
+        @param value: The text to index.
+        @type value: unicode
         """
         raise NotImplementedError
     
         
         raise NotImplementedError
     
-    def data_to_frequency(self, data):
+    def supports(self, name):
         """
-        Returns the 'data' interpreted as term frequency.
+        Returns True if this format supports interpreting its posting
+        data as 'name' (e.g. "frequency" or "positions").
         """
-        raise NotImplementedError
-
-    def data_to_weight(self, data):
+        return hasattr(self, "data_to_" + name)
+    
+    def data_to(self, data, name):
         """
-        Returns the 'data' interpreted as a term weight.
+        Interprets the given data as 'name', where 'name' is for example
+        "frequency" or "positions". This object must have a corresponding
+        .data_to_<name>() method.
         """
-        raise NotImplementedError
-
-    def supports(self, name):
-        return hasattr(self, "data_to_" + name)
+        return getattr(self, "data_to_"+name)(data)
     
 
 # Concrete field classes
     """
     Only indexes whether a given term occurred in
     a given document; it does not store frequencies or positions.
-    For example, use this format to store a field like "filepath".
+    This is useful for fields that should be searchable but not
+    scorable, such as a file path.
     """
     
     def __init__(self, analyzer, field_boost = 1.0, **options):
-        """
-        analyzer is an analyzer object to use to
-        index this field (see the analysis module). field_boost is a
-        floating point factor to apply to the score of any results
-        from this field. stored controls whether the contents of this
-        field are stored in the index. indexed controls whether the
-        contents of this field are searchable.
-        """
-        
         self.analyzer = analyzer
         self.field_boost = field_boost
         self.options = options
     allow phrase searching and "near" queries.
     """
     
+    _supports = ("frequency", "weight", "positions")
+    
     def word_datas(self, value, start_pos = 0, **kwargs):
         seen = defaultdict(list)
         for t in self.analyzer(value, positions = True, start_pos = start_pos):
 
 class PositionBoosts(Format):
     """
-    A format that stores position and per-position boost information
+    A format that stores positions and per-position boost information
     in each posting.
     """
     
-    def word_datas(self, value, start_pos = 0, boosts = None, **kwargs):
-        if boosts is None: boosts = {}
-        
+    def word_datas(self, value, start_pos = 0, **kwargs):
         seen = defaultdict(iter)
         for t in self.analyzer(value, positions = True, start_pos = start_pos):
             pos = t.pos
-            seen[t.text].append((pos, boosts.get(pos, 1.0)))
+            boost = 1.0
+            if t.boosts:
+                boost = t.boost
+            seen[t.text].append((pos, boost))
         
         return ((w, len(poslist), poslist) for w, poslist in seen.iteritems())
     
             pos_list.append((pos_base, stream.read_8bitfloat())) # , self.options.get("limit", 8)
         return (freq, pos_list)
 
+    def data_to_frequency(self, data):
+        return len(data)
+    
+    def data_to_weight(self, data):
+        return len(data) * sum(d[1] for d in data) * self.field_boost
+
     def data_to_positions(self, data):
-        return [d[0] for d in data[1]]
+        return [d[0] for d in data]
 
     def data_to_position_boosts(self, data):
-        return data[1]
+        return data
 
 
 if __name__ == '__main__':

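A short sketch of the configured field types above in use; note that a bare
FieldType subclass and a configured instance can be mixed freely, since
Schema.add() now instantiates anything callable with its default constructor:

    from whoosh import fields

    schema = fields.Schema(path = fields.ID(stored = True),
                           tags = fields.KEYWORD(stored = True, comma = True),
                           title = fields.TEXT(stored = True),
                           content = fields.TEXT,
                           grams = fields.NGRAM(minsize = 2, maxsize = 4))
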
src/whoosh/index.py

 # limitations under the License.
 #===============================================================================
 
-"""
-Contains the main functions/classes for creating, maintaining, and using
+"""Contains the main functions/classes for creating, maintaining, and using
 an index.
 """
 
 import re
 from bisect import bisect_right
 import cPickle
+from threading import Lock
 
 from whoosh import fields, store
+from whoosh.util import synchronized
 
 
 _DEF_INDEX_NAME = "MAIN"
 # Exceptions
 
 class OutOfDateError(Exception):
-    """
-    Raised when you try to commit changes to an index which is not
+    """Raised when you try to commit changes to an index which is not
     the latest generation.
     """
     pass
 
 class EmptyIndexError(Exception):
-    """
-    Raised when you try to work with an index that has no indexed terms.
+    """Raised when you try to work with an index that has no indexed terms.
     """
     pass
 
 class IndexLockedError(Exception):
-    """
-    Raised when you try to write to or lock an already-locked index (or
+    """Raised when you try to write to or lock an already-locked index (or
     one that was accidentally left in a locked state).
     """
     pass
 
 class IndexError(Exception):
-    """
-    Generic index error.
-    """
+    """Generic index error."""
     pass
 
 # Utility functions
 
 def _toc_pattern(indexname):
-    """
-    Returns a regular expression object that matches TOC filenames.
+    """Returns a regular expression object that matches TOC filenames.
     name is the name of the index.
     """
     
     return re.compile("_%s_([0-9]+).toc" % indexname)
 
 def _segment_pattern(indexname):
-    """
-    Returns a regular expression object that matches segment filenames.
+    """Returns a regular expression object that matches segment filenames.
     name is the name of the index.
     """
     
     return re.compile("(_%s_[0-9]+).(%s)" % (indexname, _EXTENSIONS))
 
-def create_index_in(dirname, schema, indexname = None):
-    """
-    Convenience function to create an index in a directory. Takes care of creating
+def create_index_in(dirname, schema = None, indexname = None, **kwargs):
+    """Convenience function to create an index in a directory. Takes care of creating
     a FileStorage object for you. dirname is the filename of the directory in
     which to create the index. schema is a fields.Schema object describing the
     index's fields. indexname is the name of the index to create; you only need to
     Returns an Index object.
     """
     
-    if indexname is None:
+    if not indexname:
         indexname = _DEF_INDEX_NAME
     
     storage = store.FileStorage(dirname)
+    if schema is None:
+        schema = fields.Schema(**kwargs)
+    
     return Index(storage, schema, indexname = indexname)
 
 def open_dir(dirname, indexname = None):
-    """
-    Convenience function for opening an index in a directory. Takes care of creating
+    """Convenience function for opening an index in a directory. Takes care of creating
     a FileStorage object for you. dirname is the filename of the directory in
     containing the index. indexname is the name of the index to create; you only need to
     specify this if you have multiple indexes within the same storage object.
 # documents from self.segments. These methods are on IndexWriter as
 # well as Index for convenience, so they're broken out here.
 
-class SupportsDeletion(object):
+class DeletionMixin(object):
     """Mix-in for classes that support deleting documents from self.segments."""
     
     def delete_document(self, docnum, delete = True):
-        """
-        Deletes a document by number.
-        """
+        """Deletes a document by number."""
         self.segments.delete_document(docnum, delete = delete)
     
     def deleted_count(self):
-        """
-        Returns the total number of deleted documents in this index.
+        """Returns the total number of deleted documents in this index.
         """
         return self.segments.deleted_count()
     
     def is_deleted(self, docnum):
-        """
-        Returns True if a given document number is deleted but
+        """Returns True if a given document number is deleted but
         not yet optimized out of the index.
         """
         return self.segments.is_deleted(docnum)
     
     def has_deletions(self):
-        """
-        Returns True if this index has documents that are marked
+        """Returns True if this index has documents that are marked
         deleted but haven't been optimized out of the index yet.
         """
         return self.segments.has_deletions()
     
     def delete_by_term(self, fieldname, text, searcher = None):
-        """
-        Deletes any documents containing "term" in the "fieldname"
+        """Deletes any documents containing "term" in the "fieldname"
         field. This is useful when you have an indexed field containing
         a unique ID (such as "pathname") for each document.
         
         return self.delete_by_query(q, searcher = searcher)
     
     def delete_by_query(self, q, searcher = None):
-        """
-        Deletes any documents matching a query object.
+        """Deletes any documents matching a query object.
         
         Note that this method opens and closes a Searcher. If you are calling
         this method repeatedly (for example, deleting changed documents before
         
         return count
 
-
 # Index class
 
-class Index(object, SupportsDeletion):
-    """
-    Represents (a generation of) an index. You must create the index using
-    index.create() or index.create_index_in() before you can instantiate this
-    object (otherwise it will raise index.EmptyIndexError).
+class Index(DeletionMixin, object):
+    """Represents an indexed collection of documents.
     """
     
     def __init__(self, storage, schema = None, create = False, indexname = _DEF_INDEX_NAME):
         """
-        
+        @param storage: The Storage object in which this index resides.
+            See the store module for more details.
+        @param schema: A Schema object defining the fields of this index. If you omit
+            this argument for an existing index, the object will load the pickled Schema
+            object that was saved with the index. If you are creating a new index
+            (create = True), you must supply this argument.
+        @param create: Whether to create a new index. If this is True, you must supply
+            a Schema instance using the schema keyword argument.
+        @param indexname: An optional name to use for the index. Use this if you need
+            to keep multiple indexes in the same storage object.
+        @type storage: store.Storage
+        @type schema: fields.Schema
         """
         
         self.storage = storage
         if schema is not None and not isinstance(schema, fields.Schema):
             raise ValueError("%r is not a Schema object" % schema)
         
+        self.generation = self.latest_generation()
+        if self.generation < 0:
+            create = True
+        
         if create:
             if schema is None:
                 raise IndexError("To create an index you must specify a schema")
         else:
             self._read(schema)
             
+        self.segment_num_lock = Lock()
+            
     def latest_generation(self):
-        """
-        Returns the generation number of the latest generation of this
+        """Returns the generation number of the latest generation of this
         index.
         """
         
         return max
     
     def refresh(self):
-        """
-        Returns a new Index object representing the latest generation
+        """Returns a new Index object representing the latest generation
         of this index (if this object is the latest generation, returns
         self).
+        @return: index.Index
         """
         
         if not self.up_to_date():
             return self
     
     def up_to_date(self):
-        """
-        Returns True if this object represents the latest generation of
+        """Returns True if this object represents the latest generation of
         this index. Returns False if this object is not the latest
         generation (that is, someone else has updated the index since
         you opened this object).
     
     def _read(self, schema):
         # Reads the content of this index from the .toc file.
-        self.generation = self.latest_generation()
         stream = self.storage.open_file(self._toc_filename())
         
         # If the user supplied a schema object with the constructor,
         self.segments = stream.read_pickle()
         stream.close()
     
-    def next_segment_name(self):
+    def _next_segment_name(self):
         #Returns the name of the next segment in sequence.
-        
-        self.segment_counter += 1
-        return "_%s_%s" % (self.indexname, self.segment_counter)
+        if self.segment_num_lock.acquire():
+            try:
+                self.segment_counter += 1
+                return "_%s_%s" % (self.indexname, self.segment_counter)
+            finally:
+                self.segment_num_lock.release()
+        else:
+            raise IndexLockedError
     
     def _toc_filename(self):
+        # Returns the computed filename of the TOC for this
+        # index name and generation.
         return "_%s_%s.toc" % (self.indexname, self.generation)
     
     def last_modified(self):
-        """
-        Returns the last modified time of the .toc file.
+        """Returns the last modified time of the .toc file.
         """
         return self.storage.file_modified(self._toc_filename())
     
         return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname)
     
     def lock(self):
+        """Locks this index for writing, or raises an error if the index
+        is already locked. Returns true if the index was successfully
+        locked.
         """
-        Locks this index for writing, or raises IndexLockedError if the index
-        is already locked.
-        """
-        
-        self.storage.lock("_%s_LOCK" % self.indexname)
-        return True
+        return self.storage.lock("_%s_LOCK" % self.indexname)
     
     def unlock(self):
-        """
-        Unlocks the index. Only call this if you were the one who locked
+        """Unlocks the index. Only call this if you were the one who locked
         it (without getting an exception) in the first place!
         """
-        
-        try:
-            self.storage.unlock("_%s_LOCK" % self.indexname)
-        except:
-            pass
+        self.storage.unlock("_%s_LOCK" % self.indexname)
     
     def is_empty(self):
-        """
-        Returns True if this index is empty (that is, it has never
+        """Returns True if this index is empty (that is, it has never
         had any documents successfully written to it.
         """
-        
         return len(self.segments) == 0
     
     def optimize(self):
-        """
-        Optimizes this index's segments.
-        
-        This opens and closes a writing.IndexWriter object, so it may
-        fail if the index is already locked for writing.
+        """Optimizes this index's segments. This will fail if the index
+        is already locked for writing.
         """
         
         if len(self.segments) < 2 and not self.segments.has_deletions():
         w.close()
     
     def commit(self, new_segments = None):
-        """
-        Commits pending deletions to this index object.
+        """Commits pending edits (such as deletions) to this index object.
         Raises OutOfDateError if this index is not the latest generation
-        (that is, if some code has updated the index since you opened
+        (that is, if someone has updated the index since you opened
         this object).
+        
+        @param new_segments: a replacement SegmentSet. This is used by
+            IndexWriter to update the index after it finishes
+            writing.
         """
         
         if not self.up_to_date():
         self.clean_files()
     
     def clean_files(self):
-        """
-        Attempts to remove unused index files (called when a new generation
+        """Attempts to remove unused index files (called when a new generation
         is created). If existing Index and/or reader objects have the files
         open, they may not get deleted immediately (i.e. on Windows)
         but will probably be deleted eventually by a later call to clean_files.
                         storage.delete_file(filename)
     
     def doc_count_all(self):
-        """
-        Returns the total number of documents, DELETED OR UNDELETED,
+        """Returns the total number of documents, DELETED OR UNDELETED,
         in this index.
         """
         return self.segments.doc_count_all()
     
     def doc_count(self):
-        """
-        Returns the total number of UNDELETED documents in this index.
+        """Returns the total number of UNDELETED documents in this index.
         """
         return self.segments.doc_count()
     
-    def max_count(self):
-        """
-        Returns the maximum term weight in this index.
+    def max_weight(self):
+        """Returns the maximum term weight in this index.
         This is used by some scoring algorithms.
         """
-        return max(s.max_count for s in self.segments)
+        return self.segments.max_weight()
     
     def total_term_count(self):
-        """
-        Returns the total term count across all fields in all documents.
+        """Returns the total term count across all fields in all documents.
         This is used by some scoring algorithms. Note that this
         necessarily includes terms in deleted documents.
         """
-        return sum(s.term_count for s in self.segments)
+        return self.segments.total_term_count()
     
     def field_length(self, fieldnum):
-        """
-        Returns the total number of terms in a given field (the "field length").
+        """Returns the total number of terms in a given field (the "field length").
         This is used by some scoring algorithms. Note that this
         necessarily includes terms in deleted documents.
         """
         return sum(s.field_length(fieldnum) for s in self.segments)
     
     def term_reader(self):
-        """
-        Returns a TermReader object for this index.
+        """Returns a TermReader object for this index.
+        
+        @return: reading.TermReader
         """
         
         from whoosh import reading
             return reading.MultiTermReader(self.storage, segments, self.schema)
     
     def doc_reader(self):
-        """
-        Returns a DocReader object for this index.
+        """Returns a DocReader object for this index.
+        
+        @return: reading.DocReader
         """
         
         from whoosh import reading
             return reading.MultiDocReader(self.storage, segments, schema)
     
     def searcher(self):
-        """
-        Returns a Searcher object for this index.
+        """Returns a Searcher object for this index.
+        
+        @return: searching.Searcher
         """
         
         from whoosh.searching import Searcher
         return Searcher(self)
     
+    def writer(self):
+        """Returns an IndexWriter object for this index.
+        
+        @return: writing.IndexWriter
+        """
+        from whoosh.writing import IndexWriter
+        return IndexWriter(self)
+    
     def find(self, querystring, parser = None, **kwargs):
+        """Parses querystring, runs the query in this index, and returns a
+        Results object. Any additional keyword arguments are passed to
+        Searcher.search() along with the parsed query.
+
+        @param querystring: The query string to parse and search for.
+        @param parser: A Parser object to use to parse 'querystring'.
+            The default is to use a standard qparser.QueryParser.
+            This object must implement a parse(str) method which returns a
+            query.Query instance.
+        @return: searching.Results
         """
-        Searches for querystring and returns a Results object. By default,
-        this method uses a standard qparser.QueryParser object to parse the
-        querystring. You can specify a different parser using the parser
-        keyword argument. This object must implement a 'parse' method which
-        takes a query string as the sole argument and returns a query.Query
-        object.
-        """
-        
+
         if parser is None:
             from whoosh.qparser import QueryParser
             parser = QueryParser(self.schema)
             
         return self.searcher().search(parser.parse(querystring), **kwargs)
     
+    
+
+
+# SegmentSet object
+
 
 class SegmentSet(object):
+    """This class is never instantiated by the user. It is used by the Index
+    object to keep track of the segments in the index.
+    """
+
     def __init__(self, segments = None):
         if segments is None:
             self.segments = []
         self._doc_offsets = self.doc_offsets()
     
     def __len__(self):
+        """@return: the number of segments in this set."""
         return len(self.segments)
     
     def __iter__(self):
         return iter(self.segments)
     
     def append(self, segment):
+        """Adds a segment to this set."""
+        
         if self._doc_offsets:
             self._doc_offsets.append(self._doc_offsets[-1] + segment.doc_count_all())
         else:
         return self.segments.__getitem__(n)
     
     def _document_segment(self, docnum):
-        """
-        Returns the index.Segment object containing the given document
+        """Returns the index.Segment object containing the given document
         number.
         """
         
         return bisect_right(offsets, docnum) - 1
     
     def _segment_and_docnum(self, docnum):
-        """Returns an (index.Segment, segment_docnum) tuple for the
-        given document number.
+        """Returns an (index.Segment, segment_docnum) pair for the
+        segment containing the given document number.
         """
         
         segmentnum = self._document_segment(docnum)
         return segment, docnum - offset
     
     def copy(self):
-        """Returns a deep copy of this set."""
+        """@return: a deep copy of this set."""
         return self.__class__([s.copy() for s in self.segments])
     
     def doc_offsets(self):
+        # Recomputes the document offset list. This must be called if you
+        # change self.segments.
         offsets = []
         base = 0
         for s in self.segments:
     
     def doc_count_all(self):
         """
-        Returns the total number of documents, DELETED or
-        UNDELETED, in this set.
+        @return: the total number of documents, DELETED or
+            UNDELETED, in this set.
         """
         return sum(s.doc_count_all() for s in self.segments)
     
     def doc_count(self):
         """
-        Returns the number of undeleted documents.
+        @return: the number of undeleted documents in this set.
         """
         return sum(s.doc_count() for s in self.segments)
     
+    
+    def max_weight(self):
+        """
+        @return: the maximum frequency of any term in the set.
+        """
+        return max(s.max_weight for s in self.segments)
+    
+    def total_term_count(self):
+        """
+        @return: the total number of terms in the set. Note that this
+            necessarily includes deleted documents.
+        """
+        return sum(s.term_count for s in self.segments)
+    
     def has_deletions(self):
         """
-        Returns True if this index has documents that are marked
-        deleted but haven't been optimized out of the index yet.
-        This includes deletions that haven't been written to disk
-        with Index.commit() yet.
+        @return: True if this index has documents that are marked
+            deleted but haven't been optimized out of the index yet.
+            This includes deletions that haven't been written to disk
+            with Index.commit() yet.
         """
         return any(s.has_deletions() for s in self.segments)
     
     def delete_document(self, docnum, delete = True):
-        """
-        Deletes a document by number.
+        """Deletes a document by number.
 
         You must call Index.commit() for the deletion to be written to disk.
         """
     
     def deleted_count(self):
         """
-        Returns the total number of deleted documents in this index.
+        @return: the total number of deleted documents in this index.
         """
         return sum(s.deleted_count() for s in self.segments)
     
     def is_deleted(self, docnum):
         """
-        Returns True if a given document number is deleted but
-        not yet optimized out of the index.
-        
-        You must call Index.() for the deletion to be written to disk.
+        @return: True if a given document number is deleted but not yet
+            optimized out of the index.
         """
         
         segment, segdocnum = self._segment_and_docnum(docnum)
     
 
 class Segment(object):
-    """
-    Do not instantiate this object directly. It is used by the Index
+    """Do not instantiate this object directly. It is used by the Index
     object to hold information about a segment. A list of objects of this
     class are pickled as part of the TOC file.
     
     along the way).
     """
     
-    def __init__(self, name, max_doc, term_count, max_count, field_counts, deleted = None):
+    def __init__(self, name, max_doc, term_count, max_weight, field_counts, deleted = None):
         """
-        name is the name of the segment (the Index object computes this from its
-        name and the generation). max_doc is the maximum document number in the
-        segment.
-        term_count is the total count of all terms in all documents. max_count is
-        the maximum count of any term in the segment. deleted is a set of deleted
-        document numbers, or None if no documents are deleted in this segment.
+        @param name: The name of the segment (the Index object computes this from its
+            name and the generation).
+        @param max_doc: The maximum document number in the segment.
+        @param term_count: Total count of all terms in all documents.
+        @param max_weight: The maximum frequency (weight) of any term in
+            the segment.
+        @param field_counts: A dictionary mapping field numbers to the
+            total number of terms in that field.
+        @param deleted: A collection of deleted document numbers, or None
+            if no deleted documents exist in this segment.
         """
         
         self.name = name
         self.max_doc = max_doc
         self.term_count = term_count
-        self.max_count = max_count
+        self.max_weight = max_weight
         self.field_counts = field_counts
         self.deleted = deleted
         
     
     def copy(self):
         return Segment(self.name, self.max_doc,
-                       self.term_count, self.max_count, self.field_counts,
+                       self.term_count, self.max_weight, self.field_counts,
                        self.deleted)
     
     def doc_count_all(self):
         """
-        Returns the total number of documents, DELETED OR UNDELETED,
-        in this segment.
+        @return: the total number of documents, DELETED OR UNDELETED,
+            in this segment.
         """
         return self.max_doc
     
     def doc_count(self):
-        """
-        Returns the number of (undeleted) documents in this segment.
-        """
+        """@return: the number of (undeleted) documents in this segment."""
         return self.max_doc - self.deleted_count()
     
     def has_deletions(self):
-        """
-        Returns True if any documents in this segment are deleted.
-        """
+        """@return: True if any documents in this segment are deleted."""
         return self.deleted_count() > 0
     
     def deleted_count(self):
-        """
-        Returns the total number of deleted documents in this segment.
-        """
+        """@return: the total number of deleted documents in this segment."""
         if self.deleted is None: return 0
         return len(self.deleted)
     
     def field_length(self, fieldnum):
+        """Returns the total number of terms in the given field."""
         return self.field_counts.get(fieldnum, 0)
     
-    def delete_document(self, docnum, delete = True):
-        """
-        Deletes the given document number. The document is not actually
+    def delete_document(self, docnum, undelete = False):
+        """Deletes the given document number. The document is not actually
         removed from the index until it is optimized.
-        if delete = False, this undeletes a deleted document.
+
+        @param docnum: The document number to delete.
+        @param undelete: If True, this undeletes a deleted document.
         """
         
-        if delete:
+        if not undelete:
             if self.deleted is None:
                 self.deleted = set()
             elif docnum in self.deleted:
             self.deleted.remove(docnum)
     
     def is_deleted(self, docnum):
-        """
-        Returns True if the given document number is deleted.
-        """
+        """@return: True if the given document number is deleted."""
+        
         if self.deleted is None: return False
         return docnum in self.deleted
 
 # Debugging functions
 
         
-if __name__ == '__main__':
+if __name__ == '__main__':
     pass
     
     

src/whoosh/passages.py

 excerpts from result documents, similar to what Google displays under
 each result link.
 
-This module is unfinished.
+This module is still experimental and unfinished.
 """
 
 from __future__ import division

src/whoosh/postpool.py

 # limitations under the License.
 #===============================================================================
 
-"""
-This module implements the KinoSearch indexing model.
+"""Support functions and classes implementing the KinoSearch-like external sort
+merging model. This module does not contain any user-level objects.
 """
 
 import cPickle, struct, tempfile
 
 from whoosh import structfile
 
-_intSize = struct.calcsize("!i")
+_int_size = struct.calcsize("!i")
 
 # Utility functions
 
 def encode_posting(fieldNum, text, doc, data):
-    """
-    Encodes a posting as a string, for sorting.
-    """
+    """Encodes a posting as a string, for sorting."""
     
     return "".join([struct.pack("!i", fieldNum),
                     text.encode("utf8"),
                     ])
 
 def decode_posting(posting):
-    """
-    Decodes an encoded posting string into a
+    """Decodes an encoded posting string into a
     (field_number, text, document_number, data) tuple.
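+    
+    The layout (mirroring encode_posting) is a "!i"-packed field number,
+    the UTF-8 encoded term text, a chr(0) separator, a "!i"-packed document
+    number, and finally the pickled data.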
     """
     
-    pointer = 0
+    field_num = struct.unpack("!i", posting[:_int_size])[0]
     
-    field_num = struct.unpack("!i", posting[pointer:pointer + _intSize])[0]
-    pointer += _intSize
+    zero = posting.find(chr(0), _int_size)
+    text = posting[_int_size:zero].decode("utf8")
     
-    zero = posting.find(chr(0), pointer)
-    text = posting[pointer:zero].decode("utf8")
-    pointer = zero + 1
+    docstart = zero + 1
+    docend = docstart + _int_size
+    doc = struct.unpack("!i", posting[docstart:docend])[0]
     
-    doc = struct.unpack("!i", posting[pointer:pointer + _intSize])[0]
-    pointer += _intSize
-    
-    data = cPickle.loads(posting[pointer:])
+    data = cPickle.loads(posting[docend:])
     
     return field_num, text, doc, data
 
 # Classes
 
 class RunReader(object):
-    """
-    An iterator that yields posting strings from a "run" on disk.
+    """An iterator that yields posting strings from a "run" on disk.
     This class buffers the reads to improve efficiency.
     """
     
         self.finished = False
         
     def _fill(self):
-        """
-        Clears and refills the buffer.
-        """
+        # Clears and refills the buffer.
         
         # If this reader is exhausted, do nothing.
         if self.finished:
         self.finished = True
 
 
-class RamPostingPool(object):
-    """
-    An experimental alternate implementation of PostingPool that
-    just keeps everything in memory instead of doing an external
-    sort on disk. This is very memory inefficient and, as it turns
-    out, not much faster.
-    """
-    
-    def __init__(self):
-        self.postings = []
-
-    def add_posting(self, field_num, text, doc, data):
-        self.postings.append((field_num, text, doc, data))
-        
-    def __iter__(self):
-        return iter(sorted(self.postings))
+#class RamPostingPool(object):
+#    """
+#    An experimental alternate implementation of PostingPool that
+#    just keeps everything in memory instead of doing an external
+#    sort on disk. This is very memory inefficient and, as it turns
+#    out, not much faster.
+#    """
+#
+#    def __init__(self):
+#        self.postings = []
+#
+#    def add_posting(self, field_num, text, doc, data):
+#        self.postings.append((field_num, text, doc, data))
+#
+#    def __iter__(self):
+#        return iter(sorted(self.postings))
 
 
 

src/whoosh/qparser.py

+from whoosh.support.pyparsing import \
+Group, Combine, Suppress, Regex, OneOrMore, Forward, Word, alphanums, Keyword,\
+Empty, StringEnd, ParserElement
+
+import analysis, query
+
 """
 This module contains the default search query parser.
 
 
 This parser handles:
 
-* 'and', 'or', 'not'
-* grouping with parentheses
-* quoted phrase searching
-* wildcards at the end of a search prefix (help*);
-
-TO DO:
-    The parser currently works by FIRST allowing pyparsing to build an
-    abstract syntax tree (AST), and then walking the AST with the
-    eval* functions to replace the AST nodes with query.* objects.
-    This is inefficient and should be replaced by attaching pyparsing
-    parseAction methods on the rules to generate query.* objects
-    directly. However, this isn't straightforward, and I don't have
-    time to work on it now. -- MattChaput
+    - 'and', 'or', 'not'
+    - grouping with parentheses
+    - quoted phrase searching
+    - wildcards at the end of a search prefix (help*)
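+
+For example, a query string along these lines (the terms are only
+illustrative) combines all of the above::
+
+    help* and ("hello there" or tutorial) and not draft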
 
 This parser is based on the searchparser example code available at:
 
 
 This code was made available by the authors under the following copyright
 and conditions:
-"""
 
 # Copyright (c) 2006, Estrate, the Netherlands
 # All rights reserved.
 # - Steven Mooij
 # - Rudolph Froger
 # - Paul McGuire
-
-from whoosh.support.pyparsing import \
-Group, Combine, Suppress, Regex, OneOrMore, Forward, Word, alphanums, Keyword,\
-Empty, StringEnd, ParserElement
-
-import analysis, query
+"""
 
 def _makeParser():
     ParserElement.setDefaultWhitespaceChars(" \n\t\r'-")
 
 parser = _makeParser()
 
+
+# Query parser objects
+
 class QueryParser(object):
     def __init__(self, default_field, schema = None,
+                 analyzer = analysis.SimpleAnalyzer,
                  conjunction = query.And,
                  multiword_conjunction = query.Or,
                  termclass = query.Term,
                  **kwargs):
+        """
+        The query parser needs to analyze the query text the same way the
+        source text was analyzed when it was indexed. You can either pass
+        the index's Schema object using the 'schema' keyword (in which case
+        the parser will use the analyzer associated with each field), or
+        specify a default analyzer for all fields using the 'analyzer' keyword.
+        In either case, you can specify an "override" analyzer for specific
+        fields by passing a <fieldname>_analyzer keyword argument with
+        an Analyzer instance for each field you want to override.
+
+        @param default_field: Use this as the field for any terms without
+            an explicit field. For example, if the query string is
+            "hello f1:there" and the default field is "f2", the parsed
+            query will be as if the user had entered "f2:hello f1:there".
+            This argument is required.
+        @param schema: The schema of the Index where this query will be
+            run. This is used to know which analyzers to use to analyze
+            the query text. If you can't or don't want to specify a schema,
+            you can specify a default analyzer for all fields using the
+            analyzer keyword argument, and overrides using <name>_analyzer
+            keyword arguments.
+        @param analyzer: The analyzer to use to analyze query text if
+            the schema argument is None.
+        @param conjunction: Use this query class to join together clauses
+            where the user has not explicitly specified a join. For example,
+            if this is query.And, the query string "a b c" will be parsed as
+            "a AND b AND c". If this is query.Or, the string will be parsed as
+            "a OR b OR c".
+        @param multiword_conjunction: Use this query class to join together
+            sub-words when an analyzer parses a query term into multiple
+            tokens.
+        @param termclass: Use this query class for bare terms. For example,
+            query.Term or query.Variations.
+
+        @type default_field: string
+        @type schema: fields.Schema
+        @type analyzer: analysis.Analyzer
+        @type conjunction: query.Query
+        @type multiword_conjunction: query.Query
+        @type termclass: query.Query
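+
+        A minimal usage sketch (the index object and field names here are
+        only illustrative)::
+
+            parser = QueryParser("content", schema = my_index.schema)
+            q = parser.parse(u"hello f1:there")
+            # With the default conjunction this is roughly equivalent to
+            # And([Term("content", u"hello"), Term("f1", u"there")])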
+        """
+
         self.schema = schema
-        self.default_field = default_field or schema.number_to_name(0)
-        
-        self.conjunction = conjunction
-        self.multiword_conjunction = multiword_conjunction
-        self.termclass = termclass
-        
-        if schema is not None:
-            self._build_field_analyzers(kwargs)
-    
-    def _build_field_analyzers(self, kwargs):
-        # Initialize the field->analyzer map with the analyzer
-        # associated with each field.
-        self.field_analyzers = dict((fname, field.format.analyzer)
-                                    for fname, field in self.schema.fields())
-        
-        # Look for overrides in the keyword arguments
+        self.default_field = default_field
+
+        # Work out the analyzers to use
+        if not schema and not analyzer:
+            raise Exception("You must specify 'schema' and/or 'analyzer'")
+
+        # If the analyzer is a class, instantiate it
+        if callable(analyzer):
+            analyzer = analyzer()
+
+        self.analyzer = analyzer
+        self.field_analyzers = {}
+        if schema:
+            self.field_analyzers = dict((fname, field.format.analyzer)
+                                        for fname, field in self.schema.fields())
+
+        # Look in the keyword arguments for analyzer overrides
         for k, v in kwargs.iteritems():
             if k.endswith("_analyzer"):
                 fieldname = k[:-9]
                     self.field_analyzers[fieldname] = v
                 else:
                     raise KeyError("Found keyword argument %r but there is no field %r" % (k, fieldname))
-    
+
+        self.conjunction = conjunction
+        self.multiword_conjunction = multiword_conjunction
+        self.termclass = termclass
+        
     def _analyzer(self, fieldname):
-        if self.schema:
-            return self.field_analyzers[fieldname or self.default_field]
-        return analysis.SimpleAnalyzer()
-    
+        # Returns the analyzer associated with a field name.
+
+        # If fieldname is None, that means use the default field
+        fieldname = fieldname or self.default_field
+
+        if fieldname in self.field_analyzers:
+            return self.field_analyzers[fieldname]
+        else:
+            return self.analyzer
+
+    # These methods are called by the parsing code to generate query
+    # objects. They are useful for subclassing.
+
     def make_terms(self, fieldname, words):
-        return self.multiword_conjunction([self.make_term(fieldname, w) for w in words])
+        return self.multiword_conjunction([self.make_term(fieldname, w)
+                                           for w in words])
     
     def make_term(self, fieldname, text):
         return self.termclass(fieldname or self.default_field, text)
     
     def parse(self, input, normalize = True):
         ast = parser(input)[0]
-        q = self.eval(ast, None)
+        q = self._eval(ast, None)
         if normalize:
             q = q.normalize()
         return q
     
-    def eval(self, node, fieldname):
+    def _eval(self, node, fieldname):
+        # Get the name of the AST node and call the corresponding
+        # method to get a query object
         name = node.getName()
-        return getattr(self, name)(node, fieldname)
-        
-    def Toplevel(self, node, fieldname):
-        return self.conjunction([self.eval(s, fieldname) for s in node])
+        return getattr(self, "_" + name)(node, fieldname)
 
-    def Word(self, node, fieldname):
+    # These methods take the AST from pyparsing, extract the
+    # relevant data, and call the appropriate make_* methods to
+    # create query objects.
+
+    def _Toplevel(self, node, fieldname):
+        return self.conjunction([self._eval(s, fieldname) for s in node])
+
+    def _Word(self, node, fieldname):
         analyzer = self._analyzer(fieldname)
         words = list(analyzer.words(node[0]))
         
         else:
             return self.make_terms(fieldname, words)
     
-    def Quotes(self, node, fieldname):
+    def _Quotes(self, node, fieldname):
         return self.make_phrase(fieldname, [n[0] for n in node])
 
-    def Prefix(self, node, fieldname):
+    def _Prefix(self, node, fieldname):
         return self.make_prefix(fieldname, node[0])
     
-    def Range(self, node, fieldname):
+    def _Range(self, node, fieldname):
         return self.make_range(fieldname, node[0][0], node[1][0])
     
-    def Wildcard(self, node, fieldname):
+    def _Wildcard(self, node, fieldname):
         return self.make_wildcard(fieldname, node[0])
     
-    def And(self, node, fieldname):
-        return self.make_and([self.eval(s, fieldname) for s in node])
+    def _And(self, node, fieldname):
+        return self.make_and([self._eval(s, fieldname) for s in node])
     
-    def Or(self, node, fieldname):
-        return self.make_or([self.eval(s, fieldname) for s in node])
+    def _Or(self, node, fieldname):
+        return self.make_or([self._eval(s, fieldname) for s in node])
     
-    def Not(self, node, fieldname):
-        return self.make_not(self.eval(node[0], fieldname))
+    def _Not(self, node, fieldname):
+        return self.make_not(self._eval(node[0], fieldname))
     
-    def Group(self, node, fieldname):
-        return self.conjunction([self.eval(s, fieldname) for s in node])
+    def _Group(self, node, fieldname):
+        return self.conjunction([self._eval(s, fieldname) for s in node])
     
-    def Field(self, node, fieldname):
-        return self.eval(node[1], node[0])
+    def _Field(self, node, fieldname):
+        return self._eval(node[1], node[0])
 
 
 class MultifieldParser(QueryParser):
-    def __init__(self, schema, fieldnames,
+    """A subclass of QueryParser. Instead of assigning unfielded clauses
+    to a default field, this class transforms them into an OR clause that
+    searches a list of fields. For example, if the list of multi-fields
+    is "f1", "f2" and the query string is "hello there", the class will
+    parse "(f1:hello OR f2:hello) (f1:there OR f2:there)". This is very
+    useful when you have two textual fields (e.g. "title" and "content")
+    you want to search by default.
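+
+    A usage sketch (the index object and field names are only illustrative)::
+
+        parser = MultifieldParser(["title", "content"], schema = my_index.schema)
+        q = parser.parse(u"hello there")
+        # roughly (title:hello OR content:hello) AND (title:there OR content:there)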
+    """
+
+    def __init__(self, fieldnames, schema = None,
+                 analyzer = None,
                  conjunction = query.And,
                  multiword_conjunction = query.Or,
                  termclass = query.Term,
                  **kwargs):
-        self.conjunction = conjunction
-        self.termclass = termclass
-        self.multiword_conjunction = multiword_conjunction
-        self.schema = schema
+        super(MultifieldParser, self).__init__(fieldnames[0],
+                                               schema = schema,
+                                               analyzer = analyzer,
+                                               conjunction = conjunction,
+                                               multiword_conjunction = multiword_conjunction,
+                                               termclass = termclass,
+                                               **kwargs)
         self.fieldnames = fieldnames
+        self.field_values = {}
+
+    # Override the superclass's make_* methods with versions that convert
+    # the clauses to multifield ORs.
+
+    def _make(self, typename, fieldname, data):
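+        # If the user explicitly specified a field, build a single query
+        # of the given type; otherwise OR the query across all the fields
+        # in self.fieldnames (see the class docstring above).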
+        if fieldname is not None:
+            return typename(fieldname, data)
         
-        self.field_values = dict([(fieldname, 1.0) for fieldname in fieldnames])
-        for k, v in kwargs.iteritems():
-            if not k.endswith("_analyzer") and k not in self.field_values:
-                raise KeyError("You specified a value for field %r but did not include the field" % k)
-            self.field_values[k] = v
-            
-        self._build_field_analyzers(kwargs)
-    
-    def _analyzer(self, fieldname):
-        if fieldname is None:
-            return self.field_analyzers[self.fieldnames[0]]
-        else:
-            return self.field_analyzers[fieldname]
-    
-    def _make(self, type, fieldname, data):
-        if fieldname is not None:
-            return type(fieldname, data)
-        
-        return query.Or([type(fn, data, boost = self.field_values[fn])
+        return query.Or([typename(fn, data, boost = self.field_values.get(fn, 1.0))
                          for fn in self.fieldnames])
     
     def make_term(self, fieldname, text):
         return self._make(query.Wildcard, fieldname, text)
     
     def make_phrase(self, fieldname, texts):
-        return query.Or([super(self.__class__, self).make_phrase(fn, texts, boost = self.field_values[fn])