Commits

Matt Chaput committed c7cb797

Expanded docstrings and reformatted for Sphinx.

Removed imports into whoosh package namespace, since these caused problems with circular imports,
as importing *anything* triggered importing whoosh.index.
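
For example, user code that previously relied on the re-exports should now import these
names from whoosh.index directly. A minimal sketch (the schema and directory name are
only illustrative):

    from whoosh.index import create_in, open_dir
    from whoosh.fields import Schema, TEXT, ID

    schema = Schema(path=ID(stored=True), content=TEXT)  # illustrative schema
    ix = create_in("indexdir", schema=schema)            # "indexdir" is a placeholder
    ix = open_dir("indexdir")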

Added whoosh.__version__ tuple.
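
A quick runtime check against the new tuple (sketch):

    import whoosh

    # __version__ is a plain tuple such as (0, 1, 24), so it compares naturally.
    if whoosh.__version__ < (0, 1, 24):
        raise RuntimeError("this code expects Whoosh 0.1.24 or later")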

Added index.version and index.version_in functions to get package version and format version
numbers for an index.
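
Rough usage sketch ("indexdir" is a placeholder path; the values shown are the ones
used by this release):

    from whoosh import index, store

    # From a directory path...
    release, fmt = index.version_in("indexdir")   # e.g. ((0, 1, 24), -102)

    # ...or from an existing Storage object.
    storage = store.FileStorage("indexdir")
    release, fmt = index.version(storage)

    if fmt != -102:
        print("index uses a different on-disk format; consider re-indexing")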

Replaced reference to WindowsError in index.py with OSError to avoid dealing with it on
other platforms.
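
WindowsError only exists on Windows, where it is a subclass of OSError, so catching
OSError behaves the same on every platform. Illustrative pattern (the variable name is
made up), mirroring the change in index.py:

    import os

    try:
        os.remove(segment_filename)  # hypothetical path variable
    except OSError:
        # On Windows this also catches WindowsError (a subclass of OSError),
        # e.g. when another process still has the file open.
        pass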

Changed TermRange and binary boolean query interfaces to accept two arguments instead of
requiring a sequence of length 2.
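
A hedged sketch of the new call style; the exact set of affected classes is not shown
in this diff, so TermRange and AndNot are used here only as examples:

    from whoosh.query import Term, TermRange, AndNot

    # New: pass the two endpoints / subqueries as separate arguments...
    q1 = TermRange("title", u"apple", u"bear")
    q2 = AndNot(Term("content", u"render"), Term("content", u"wireframe"))

    # ...rather than wrapping them in a length-2 sequence as before,
    # e.g. TermRange("title", (u"apple", u"bear")).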

Fixed MultiDocReader.vector() to use the proper number of arguments, fixes #46.

Added "create" keyword argument to spelling.SpellChecker.index() to allow clearing the
spelling dictionary.
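
Sketch of the new keyword in use; the SpellChecker constructor arguments are assumed
here, not taken from this diff:

    from whoosh import store
    from whoosh.spelling import SpellChecker

    storage = store.FileStorage("indexdir")  # placeholder path
    speller = SpellChecker(storage)          # assumed constructor signature

    # create=True clears any existing spelling dictionary and starts a
    # fresh one before new words are added.
    speller.index(create=True)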

Comments (0)

Files changed (15)

src/whoosh/__init__.py

 # limitations under the License.
 #===============================================================================
 
-from whoosh.index import open_dir, create_in
+__version__ = (0, 1, 24)

src/whoosh/analysis.py

 an indexable stream of "tokens" (usually equivalent to words). There are
 three general types of classes/functions involved in analysis:
 
-    - Tokenizers are always at the start of the text processing pipeline.
-      They take a string and yield Token objects (actually, the same token
-      object over and over, for performance reasons) corresponding to the
-      tokens (words) in the text.
+* Tokenizers are always at the start of the text processing pipeline.
+  They take a string and yield Token objects (actually, the same token
+  object over and over, for performance reasons) corresponding to the
+  tokens (words) in the text.
       
-      Every tokenizer is a callable that takes a string and returns a
-      generator of tokens.
+  Every tokenizer is a callable that takes a string and returns a
+  generator of tokens.
       
-    - Filters take the tokens from the tokenizer and perform various
-      transformations on them. For example, the LowercaseFilter converts
-      all tokens to lowercase, which is usually necessary when indexing
-      regular English text.
+* Filters take the tokens from the tokenizer and perform various
+  transformations on them. For example, the LowercaseFilter converts
+  all tokens to lowercase, which is usually necessary when indexing
+  regular English text.
       
-      Every filter is a callable that takes a token generator and returns
-      a token generator.
+  Every filter is a callable that takes a token generator and returns
+  a token generator.
       
-    - Analyzers are convenience functions/classes that "package up" a
-      tokenizer and zero or more filters into a single unit, so you
-      don't have to construct the tokenizer-filter-filter-etc. pipeline
-      yourself. For example, the StandardAnalyzer combines a RegexTokenizer,
-      LowercaseFilter, and StopFilter.
+* Analyzers are convenience functions/classes that "package up" a
+  tokenizer and zero or more filters into a single unit, so you
+  don't have to construct the tokenizer-filter-filter-etc. pipeline
+  yourself. For example, the StandardAnalyzer combines a RegexTokenizer,
+  LowercaseFilter, and StopFilter.
     
-      Every analyzer is a callable that takes a string and returns a
-      token generator. (So Tokenizers can be used as Analyzers if you
-      don't need any filtering).
+  Every analyzer is a callable that takes a string and returns a
+  token generator. (So Tokenizers can be used as Analyzers if you
+  don't need any filtering).
 """
 
 import copy, re
     Represents a "token" (usually a word) extracted from the source text
     being indexed.
     
+    See "Advaned analysis" in the user guide for more information.
+    
     Because object instantiation in Python is slow, tokenizers should create
     ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
     each time.
                 if lasttext != token.text:
                     yield token
                 lasttext = token.text
-    
-    The Token object supports the following attributes:
-    
-        - text (string): The text of this token.
-        - original (string): The original text of the token, set by the tokenizer
-          and never modified by filters.
-        - positions (boolean): whether this token contains a position. If this
-          is True, the 'pos' attribute should be set to the index of the token
-          (e.g. for the first token, pos = 0, for the second token, pos = 1, etc.)
-        - chars (boolean): whether this token contains character offsets. If this
-          is True, the 'startchar' and 'endchar' attributes should be set to the
-          starting character offset and the ending character offset of this token.
-        - stopped (boolean): whether this token has been stopped by a stop-word
-          filter (not currently used).
-        - boosts (boolean): whether this token contains a per-token boost. If this
-          is True, the 'boost' attribute should be set to the current boost factor.
-        - removestops (boolean): whether stopped tokens should be removed from
-          the token stream. If this is true, the 'stopped' attribute will indicate
-          whether the current token is a "stop" word.
+
     """
     
     def __init__(self, positions = False, chars = False, boosts = False, removestops = True,
                  **kwargs):
         """
-        :positions: Whether this token should have the token position in
+        :param positions: Whether tokens should have the token position in
             the 'pos' attribute.
-        :chars: Whether this token should have the token's character offsets
+        :param chars: Whether tokens should have character offsets
             in the 'startchar' and 'endchar' attributes.
+        :param boosts: whether the tokens should have per-token boosts
+            in the 'boost' attribute.
+        :param removestops: whether to remove stop words from the stream
+            (if the tokens pass through a stop filter).
         """
         
         self.positions = positions
     """
     Yields the entire input string as a single token. For use
     in indexed but untokenized fields, such as a document's path.
+    
+    >>> [token.text for token in IDTokenizer(u"/a/b 123 alpha")]
+    [u"/a/b 123 alpha"]
     """
     
     t = Token(positions, chars, removestops = removestops)
 class RegexTokenizer(object):
     """
     Uses a regular expression to extract tokens from text.
+    
+    >>> rex = RegexTokenizer()
+    >>> [token.text for token in rex(u"hi there 3.141 big-time under_score")]
+    [u"hi", u"there", u"3.141", u"big", u"time", u"under_score"]
     """
     
-    _default_expression = re.compile(r"\w+(\.?\w+)*", re.UNICODE)
-    
-    def __init__(self, expression = None):
+    def __init__(self, expression = r"\w+(\.?\w+)*"):
         """
-        :expression: A compiled regular expression object. Each match
-            of the expression equals a token. For example, the expression
-            re.compile("[A-Za-z0-9]+") would give tokens that only contain
-            letters and numbers. Group 0 (the entire matched text) is used
-            as the text of the token. If you require more complicated handling
-            of the expression match, simply write your own tokenizer.
+        :param expression: A regular expression object or string. Each match
+            of the expression equals a token. Group 0 (the entire matched text)
+            is used as the text of the token. If you require more complicated
+            handling of the expression match, simply write your own tokenizer.
         """
         
-        self.expression = expression or self._default_expression
+        if isinstance(expression, basestring):
+            self.expression = re.compile(expression)
+        else:
+            self.expression = expression
     
     def __call__(self, value, positions = False, chars = False,
                  keeporiginal = False, removestops = True,
                  start_pos = 0, start_char = 0):
         """
-        :value: The unicode string to tokenize.
-        :positions: Whether to record token positions in the token.
-        :chars: Whether to record character offsets in the token.
-        :start_pos: The position number of the first token. For example,
+        :param value: The unicode string to tokenize.
+        :param positions: Whether to record token positions in the token.
+        :param chars: Whether to record character offsets in the token.
+        :param start_pos: The position number of the first token. For example,
             if you set start_pos=2, the tokens will be numbered 2,3,4,...
             instead of 0,1,2,...
-        :start_char: The offset of the first character of the first
+        :param start_char: The offset of the first character of the first
             token. For example, if you set start_char=2, the text "aaa bbb"
             will have chars (2,5),(6,9) instead (0,3),(4,7).
         """
 
 class SpaceSeparatedTokenizer(RegexTokenizer):
     """Splits tokens by whitespace.
+    
+    >>> sst = SpaceSeparatedTokenizer()
+    >>> [token.text for token in sst(u"hi there big-time, what's up")]
+    [u"hi", u"there", u"big-time,", u"what's", u"up"]
+    
     """
     
-    _default_expression = re.compile("[^ \t\r\n]+")
+    def __init__(self, expression = r"[^ \t\r\n]+"):
+        super(SpaceSeparatedTokenizer, self).__init__(expression=expression)
 
 
 class CommaSeparatedTokenizer(RegexTokenizer):
-    """Splits tokens by commas surrounded by optional whitespace.
+    """Splits tokens by commas.
+    
+    Note that the tokenizer calls unicode.strip() on each match
+    of the regular expression.
+    
+    >>> cst = CommaSeparatedTokenizer()
+    >>> [token.text for token in cst(u"hi there, what's , up")]
+    [u"hi there", u"what's", u"up"]
     """
     
-    _default_expression = re.compile("[^,]+")
+    def __init__(self, expression = r"[^,]+"):
+        super(CommaSeparatedTokenizer, self).__init__(expression=expression)
     
     def __call__(self, value, **kwargs):
         for t in super(self.__class__, self).__call__(value, **kwargs):
 
 
 class NgramTokenizer(object):
-    """Splits input text into N-grams instead of words. For example,
-    NgramTokenizer(3, 4)("hello") will yield token texts
-    "hel", "hell", "ell", "ello", "llo".
+    """Splits input text into N-grams instead of words.
+    
+    >>> ngt = NgramTokenizer(4)
+    >>> [token.text for token in ngt(u"hi there")]
+    [u"hi t", u"i th", u" the", u"ther", u"here"]
     
     Note that this tokenizer does NOT use a regular expression to extract words,
     so the grams emitted by it will contain whitespace, punctuation, etc. You may
     want to massage the input or add a custom filter to this tokenizer's output.
     
     Alternatively, if you only want sub-word grams without whitespace, you
-    could use RegexTokenizer with NgramFilter instead.
+    could combine a RegexTokenizer with NgramFilter instead.
     """
     
     def __init__(self, minsize, maxsize = None):
         """
-        :minsize: The minimum size of the N-grams.
-        :maxsize: The maximum size of the N-grams. If you omit
+        :param minsize: The minimum size of the N-grams.
+        :param maxsize: The maximum size of the N-grams. If you omit
             this parameter, maxsize == minsize.
         """
         
 
 
 class NgramFilter(object):
-    """Splits token text into N-grams. For example,
-    NgramFilter(3, 4), for token "hello" will yield token texts
-    "hel", "hell", "ell", "ello", "llo".
+    """Splits token text into N-grams.
+    
+    >>> rext = RegexTokenizer()
+    >>> stream = rext(u"hello there")
+    >>> ngf = NgramFilter(4)
+    >>> [token.text for token in ngf(stream)]
+    [u"hell", u"ello", u"ther", u"here"]
+    
     """
     
     def __init__(self, minsize, maxsize = None):
         """
-        :minsize: The minimum size of the N-grams.
-        :maxsize: The maximum size of the N-grams. If you omit
+        :param minsize: The minimum size of the N-grams.
+        :param maxsize: The maximum size of the N-grams. If you omit
             this parameter, maxsize == minsize.
         """
         
     (for example, "rendering", "renders", "rendered", etc.) to a single word in
     the index.
     
-    Note that I recommend you use a strategy of morphologically expanding the
-    query terms (see query.Variations) rather than stemming the indexed words.
+    >>> rext = RegexTokenizer()
+    >>> stream = rext(u"fundamentally willows")
+    >>> stemmer = StemFilter()
+    >>> [token.text for token in stemmer(stream)]
+    [u"fundament", u"willow"]
     """
     
     def __init__(self, ignore = None):
         """
-        :ignore: a set/list of words that should not be stemmed. This
+        :param ignore: a set/list of words that should not be stemmed. This
             is converted into a frozenset. If you omit this argument, all tokens
             are stemmed.
         """
 
 _camel_exp = re.compile("[A-Z][a-z]*|[a-z]+|[0-9]+")
 def CamelFilter(tokens):
-    """Splits CamelCased words into multiple words. For example,
-    the string "getProcessedToken" yields tokens
-    "getProcessedToken", "get", "Processed", and "Token".
+    """Splits CamelCased words into multiple words.
     
-    Obviously this filter needs to precede LowercaseFilter in a filter
-    chain.
+    >>> rext = RegexTokenizer()
+    >>> stream = rext(u"call getProcessedToken")
+    >>> [token.text for token in CamelFilter(stream)]
+    [u"call", u"getProcessedToken", u"get", u"Processed", u"Token"]
+    
+    Obviously this filter needs to precede LowercaseFilter if they
+    are both in a filter chain.
     """
     
     for t in tokens:
 
 _underscore_exp = re.compile("[A-Z][a-z]*|[a-z]+|[0-9]+")
 def UnderscoreFilter(tokens):
-    """Splits words with underscores into multiple words. For example,
-    the string "get_processed_token" yields tokens
-    "get_processed_token", "get", "processed", and "token".
+    """Splits words with underscores into multiple words.
+    
+    >>> rext = RegexTokenizer()
+    >>> stream = rext(u"call get_processed_token")
+    >>> [token.text for token in UnderscoreFilter(stream)]
+    [u"call", u"get_processed_token", u"get", u"processed", u"token"]
     
     Obviously you should not split words on underscores in the
     tokenizer if you want to use this filter.
 class StopFilter(object):
     """Marks "stop" words (words too common to index) in the stream (and by default
     removes them).
+    
+    >>> rext = RegexTokenizer()
+    >>> stream = rext(u"this is a test")
+    >>> stopper = StopFilter()
+    >>> [token.text for token in stopper(stream)]
+    [u"this", u"test"]
+    
     """
 
     def __init__(self, stoplist = STOP_WORDS, minsize = 2,
                  renumber = True):
         """
-        :stoplist: A collection of words to remove from the stream.
+        :param stoplist: A collection of words to remove from the stream.
             This is converted to a frozenset. The default is a list of
             common stop words.
-        :minsize: The minimum length of token texts. Tokens with
+        :param minsize: The minimum length of token texts. Tokens with
             text smaller than this will be stopped.
-        :renumber: Change the 'pos' attribute of unstopped tokens
+        :param renumber: Change the 'pos' attribute of unstopped tokens
             to reflect their position with the stopped words removed.
-        :remove: Whether to remove the stopped words from the stream
+        :param remove: Whether to remove the stopped words from the stream
             entirely. This is not normally necessary, since the indexing
             code will ignore tokens it receives with stopped=True.
         """
 
 
 def LowercaseFilter(tokens):
-    """Uses str.lower() to lowercase token text. For example, tokens
-    "This","is","a","TEST" become "this","is","a","test".
+    """Uses unicode.lower() to lowercase token text.
+    
+    >>> rext = RegexTokenizer()
+    >>> stream = rext(u"This is a TEST")
+    >>> [token.text for token in LowercaseFilter(stream)]
+    [u"this", u"is", u"a", u"test"]
     """
     
     for t in tokens:
     
     def __init__(self, expression, group = 1, default = 1.0):
         """
-        :expression: a compiled regular expression object representing
+        :param expression: a compiled regular expression object representing
         the pattern to look for within each token.
-        :group: the group name or number to use as the boost number
+        :param group: the group name or number to use as the boost number
             (what to pass to match.group()). The string value of this group is
             passed to float().
-        :default: the default boost to use for tokens that don't have
+        :param default: the default boost to use for tokens that don't have
             the marker.
         """
         
 
 class Analyzer(object):
     """
-    Abstract base class for analyzers.
+    Abstract base class for analyzers. Since the analyzer protocol is just
+    __call__, this is pretty simple -- it mostly exists to provide common
+    implementations of __repr__ and __eq__.
     """
     
     def __repr__(self):
     """
     Yields the original text as a single token. This is useful for fields
     you don't want to tokenize, such as the path of a file.
+    
+    >>> ana = IDAnalyzer()
+    >>> [token.text for token in ana(u"Hello there, this is a TEST")
+    [u"Hello there, this is a TEST"]
     """
     
     def __init__(self, strip = True, lowercase = False):
         """
-        :strip: Whether to use str.strip() to strip whitespace
+        :param strip: Whether to use str.strip() to strip whitespace
             from the value before yielding it as a token.
-        :lowercase: Whether to convert the token to lowercase
+        :param lowercase: Whether to convert the token to lowercase
             before indexing.
         """
         self.strip = strip
 
 class KeywordAnalyzer(Analyzer):
     """Parses space-separated tokens.
+    
+    >>> ana = KeywordAnalyzer()
+    >>> [token.text for token in ana(u"Hello there, this is a TEST")]
+    [u"Hello", u"there,", u"this", u"is", u"a", u"TEST"]
     """
     
     def __init__(self, lowercase = False, commas = False):
 class RegexAnalyzer(Analyzer):
     """Uses a RegexTokenizer, applies no filters.
     
-    :expression: The regular expression pattern to use to extract tokens.
+    >>> ana = RegexAnalyzer()
+    >>> [token.text for token in ana(u"hi there 3.141 big-time under_score")]
+    [u"hi", u"there", u"3.141", u"big", u"time", u"under_score"]
     """
     
-    def __init__(self, expression = None):
+    def __init__(self, expression = r"\w+(\.?\w+)*"):
+        """
+        :param expression: The regular expression pattern to use to extract tokens.
+        """
         self.tokenizer = RegexTokenizer(expression = expression)
         
     def __call__(self, value, **kwargs):
 class SimpleAnalyzer(Analyzer):
     """Uses a RegexTokenizer and applies a LowercaseFilter.
     
-    :expression: The regular expression pattern to use to extract tokens.
+    >>> ana = SimpleAnalyzer()
+    >>> [token.text for token in ana(u"Hello there, this is a TEST")]
+    [u"hello", u"there", u"this", u"is", u"a", u"test"]
     """
     
     def __init__(self, expression = None):
+        """
+        :param expression: The regular expression pattern to use to extract tokens.
+        """
         self.tokenizer = RegexTokenizer(expression = expression)
         
     def __call__(self, value, **kwargs):
 
 
 class StemmingAnalyzer(Analyzer):
+    """Uses a RegexTokenizer and applies a lower case filter,
+    an optional stop filter, and then a stemming filter.
+    
+    >>> ana = StemmingAnalyzer()
+    >>> [token.text for token in ana(u"Testing is testing and testing")]
+    [u"test", u"test", u"test"]
+    """
+    
     def __init__(self, stoplist = STOP_WORDS, minsize = 2):
+        """
+        :param stoplist: A list of stop words. Set this to None to disable
+            the stop word filter.
+        :param minsize: Words smaller than this are removed from the stream.
+        """
+        
         self.tokenizer = RegexTokenizer()
         self.stemfilter = StemFilter()
         self.stopper = None
 
 
 class StandardAnalyzer(Analyzer):
-    """Uses a RegexTokenizer and applies a LowercaseFilter and StopFilter.
+    """Uses a RegexTokenizer and applies a LowercaseFilter and optional StopFilter.
+    
+    >>> ana = StandardAnalyzer()
+    >>> [token.text for token in ana(u"Testing is testing and testing")]
+    [u"testing", u"testing", u"testing"]
     """
     
     def __init__(self, stoplist = STOP_WORDS, minsize = 2):
         """
-        :stoplist: See analysis.StopFilter.
-        :minsize: See analysis.StopFilter.
+        :param stoplist: A list of stop words. Set this to None to disable
+            the stop word filter.
+        :param minsize: Words smaller than this are removed from the stream.
         """
         
         self.tokenizer = RegexTokenizer()
 class FancyAnalyzer(Analyzer):
     """Uses a RegexTokenizer and applies a CamelFilter,
     UnderscoreFilter, LowercaseFilter, and StopFilter.
+    
+    >>> ana = FancyAnalyzer()
+    >>> [token.text for token in ana(u"Should I call getInt or get_real?")]
+    [u"should", u"call", u"getInt", u"get", u"int", u"get_real", u"get", u"real"]
     """
     
     def __init__(self, stoplist = STOP_WORDS, minsize = 2):
         """
-        :stoplist: See analysis.StopFilter.
-        :minsize: See analysis.StopFilter.
+        :param stoplist: See analysis.StopFilter.
+        :param minsize: See analysis.StopFilter.
         """
         
         self.tokenizer = RegexTokenizer()
 
 class NgramAnalyzer(Analyzer):
     """Uses an NgramTokenizer and applies a LowercaseFilter.
+    
+    >>> ana = NgramAnalyzer(4)
+    >>> [token.text for token in ana(u"hi there")]
+    [u"hi t", u"i th", u" the", u"ther", u"here"]
     """
     
     def __init__(self, minsize, maxsize = None):
         return LowercaseFilter(self.tokenizer(value, **kwargs))
     
 
-if __name__ == '__main__':
-    import time
-    txt = open("/Volumes/Storage/Development/help/documents/nodes/sop/copy.txt", "rb").read().decode("utf8")
-    st = time.time()
-    print [t.text for t in StopFilter()(LowercaseFilter(RegexTokenizer()(txt, positions = True)))]
-    print time.time() - st
 
-    st = time.time()
-    print [t.text for t in StopFilter(remove = False)(LowercaseFilter(RegexTokenizer()(txt, positions = True)))]
-    print time.time() - st
     
 
 

src/whoosh/classify.py

     
     def __init__(self, searcher, fieldname, model = Bo1Model):
         """
-        :searcher: A searching.Searcher object for the index.
-        :fieldname: The name of the field in which to search.
-        :model: (classify.ExpansionModel) The model to use for expanding
+        :param searcher: A searching.Searcher object for the index.
+        :param fieldname: The name of the field in which to search.
+        :param model: (classify.ExpansionModel) The model to use for expanding
             the query terms. If you omit this parameter, the expander uses
             scoring.Bo1Model by default.
         """
     def add(self, vector):
         """Adds forward-index information about one of the "top N" documents.
         
-        :vector: A series of (text, weight) tuples, such as is
+        :param vector: A series of (text, weight) tuples, such as is
             returned by DocReader.vector_as(docnum, fieldnum, "weight").
         """
         
     def expanded_terms(self, number, normalize = True):
         """Returns the N most important terms in the vectors added so far.
         
-        :number: The number of terms to return.
-        :normalize: Whether to normalize the weights.
+        :param number: The number of terms to return.
+        :param normalize: Whether to normalize the weights.
         :*returns*: A list of ("term", weight) tuples.
         """
         

src/whoosh/fields.py

     
     The FieldType object supports the following attributes:
     
-        - format (fields.Format): the storage format for the field's contents.
-        
-        - vector (fields.Format): the storage format for the field's vectors
-          (forward index), or None if the field should not store vectors.
-        
-        - scorable (boolean): whether searches against this field may be scored.
-          This controls whether the index stores per-document field lengths for
-          this field.
+    * format (fields.Format): the storage format for the field's contents.
+    
+    * vector (fields.Format): the storage format for the field's vectors
+      (forward index), or None if the field should not store vectors.
+    
+    * scorable (boolean): whether searches against this field may be scored.
+      This controls whether the index stores per-document field lengths for
+      this field.
           
-        - stored (boolean): whether the content of this field is stored for each
-          document. For example, in addition to indexing the title of a document,
-          you usually want to store the title so it can be presented as part of
-          the search results.
-          
-        - unique (boolean): whether this field's value is unique to each document.
-          For example, 'path' or 'ID'. IndexWriter.update_document() will use
-          fields marked as 'unique' to find the previous version of a document
-          being updated.
+    * stored (boolean): whether the content of this field is stored for each
+      document. For example, in addition to indexing the title of a document,
+      you usually want to store the title so it can be presented as part of
+      the search results.
+         
+    * unique (boolean): whether this field's value is unique to each document.
+      For example, 'path' or 'ID'. IndexWriter.update_document() will use
+      fields marked as 'unique' to find the previous version of a document
+      being updated.
       
     The constructor for the base field type simply lets you supply your
     own configured field format, vector format, and scorable and stored
     values. Subclasses may configure some or all of this for you.
+    
     """
     
     format = vector = scorable = stored = unique = None
     
     def __init__(self, stored = False, unique = False):
         """
-        :stored: Whether the value of this field is stored with the document.
+        :param stored: Whether the value of this field is stored with the document.
         """
         self.format = Existence(analyzer = IDAnalyzer())
         self.stored = stored
 class IDLIST(FieldType):
     """Configured field type for fields containing IDs separated by whitespace
     and/or punctuation.
-    
-    :stored: Whether the value of this field is stored with the document.
-    :unique: Whether the value of this field is unique per-document.
-    :expression: The regular expression object to use to extract tokens.
-        The default expression breaks tokens on CRs, LFs, tabs, spaces, commas,
-        and semicolons.
     """
     
     def __init__(self, stored = False, unique = False, expression = None):
+        """
+        :param stored: Whether the value of this field is stored with the document.
+        :param unique: Whether the value of this field is unique per-document.
+        :param expression: The regular expression object to use to extract tokens.
+            The default expression breaks tokens on CRs, LFs, tabs, spaces, commas,
+            and semicolons.
+        """
+        
         expression = expression or re.compile(r"[^\r\n\t ,;]+")
         analyzer = RegexAnalyzer(expression = expression)
         self.format = Existence(analyzer = analyzer)
     def __init__(self, stored = False, lowercase = False, commas = False,
                  scorable = False, unique = False, field_boost = 1.0):
         """
-        :stored: Whether to store the value of the field with the document.
-        :comma: Whether this is a comma-separated field. If this is False
+        :param stored: Whether to store the value of the field with the document.
+        :param commas: Whether this is a comma-separated field. If this is False
             (the default), it is treated as a space-separated field.
-        :scorable: Whether this field is scorable.
+        :param scorable: Whether this field is scorable.
         """
         
         ana = KeywordAnalyzer(lowercase = lowercase, commas = commas)
     def __init__(self, analyzer = None, phrase = True, vector = None,
                  stored = False, field_boost = 1.0):
         """
-        :stored: Whether to store the value of this field with the document. Since
+        :param stored: Whether to store the value of this field with the document. Since
             this field type generally contains a lot of text, you should avoid storing it
             with the document unless you need to, for example to allow fast excerpts in the
             search results.
-        :phrase: Whether the store positional information to allow phrase searching.
-        :analyzer: The analysis.Analyzer to use to index the field contents. See the
+        :param phrase: Whether to store positional information to allow phrase searching.
+        :param analyzer: The analysis.Analyzer to use to index the field contents. See the
             analysis module for more information. If you omit this argument, the field uses
             analysis.StandardAnalyzer.
         """
     
     def __init__(self, minsize = 2, maxsize = 4, stored = False):
         """
-        :stored: Whether to store the value of this field with the document. Since
+        :param stored: Whether to store the value of this field with the document. Since
             this field type generally contains a lot of text, you should avoid storing it
             with the document unless you need to, for example to allow fast excerpts in the
             search results.
-        :minsize: The minimum length of the N-grams.
-        :maxsize: The maximum length of the N-grams.
+        :param minsize: The minimum length of the N-grams.
+        :param maxsize: The maximum length of the N-grams.
         """
         
         self.format = Frequency(analyzer = NgramAnalyzer(minsize, maxsize))
         """
         Returns the field associated with the given field name or number.
         
-        :id: A field name or field number.
+        :param id: A field name or field number.
         """
         
         if isinstance(id, basestring):
         """
         Returns True if a field by the given name is in this schema.
         
-        :fieldname: The name of the field.
+        :param fieldname: The name of the field.
         """
         return fieldname in self._by_name
     
         """
         Returns the field object associated with the given name.
         
-        :name: The name of the field to retrieve.
+        :param name: The name of the field to retrieve.
         """
         return self._by_name[name]
     
         """
         Returns the field object associated with the given number.
         
-        :number: The number of the field to retrieve.
+        :param number: The number of the field to retrieve.
         """
         return self._by_number[number]
     
     
     def add(self, name, fieldtype):
         """
-        Adds a field to this schema.
+        Adds a field to this schema. This is a low-level method; use keyword
+        arguments to the Schema constructor to create the fields instead.
         
-        :name: The name of the field.
-        :fieldtype: An instantiated fields.FieldType object, or a FieldType subclass.
+        :param name: The name of the field.
+        :param fieldtype: An instantiated fields.FieldType object, or a FieldType subclass.
             If you pass an instantiated object, the schema will use that as the field
             configuration for this field. If you pass a FieldType subclass, the schema
             will automatically instantiate it with the default constructor.
     
     def __init__(self, analyzer, field_boost = 1.0, **options):
         """
-        :analyzer: The analysis.Analyzer object to use to index this field.
+        :param analyzer: The analysis.Analyzer object to use to index this field.
             See the analysis module for more information. If this value
             is None, the field is not indexed/searchable.
-        :field_boost: A constant boost factor to scale to the score
+        :param field_boost: A constant boost factor to scale to the score
             of all queries matching terms in this field.
         """
         
         would be the same as frequency; in a Positions format, data would be a
         list of token positions at which "tokentext" occurred.
         
-        :value: The unicode text to index.
+        :param value: The unicode text to index.
         """
         raise NotImplementedError
     
     
     def __init__(self, analyzer, field_boost = 1.0, boost_as_freq = False, **options):
         """
-        :analyzer: The analysis.Analyzer object to use to index this field.
+        :param analyzer: The analysis.Analyzer object to use to index this field.
             See the analysis module for more information. If this value
             is None, the field is not indexed/searchable.
-        :field_boost: A constant boost factor to scale to the score
+        :param field_boost: A constant boost factor to scale to the score
             of all queries matching terms in this field.
-        :boost_as_freq: if True, take the integer value of each token's
+        :param boost_as_freq: if True, take the integer value of each token's
             boost attribute and use it as the token's frequency.
         """
         

src/whoosh/highlight.py

 from __future__ import division
 from heapq import nlargest
 
+"""
+The highlight module contains classes and functions for displaying short
+excerpts from hit documents in the search results you present to the user, with
+query terms highlighted.
+"""
 
 # Fragment object
 
 class Fragment(object):
+    """Represents a fragment (extract) from a hit document. This object is mainly
+    used to keep track of the start and end points of the fragment; it does not
+    contain the text of the fragment or do much else.
+    """
+    
     def __init__(self, tokens, charsbefore = 0, charsafter = 0, textlen = 999999):
+        """
+        :param tokens: list of the Token objects representing the matched terms. 
+        :param charsbefore: approx. how many characters before the start of the first
+            matched term to include in the fragment.
+        :param charsafter: approx. how many characters after the end of the last
+            matched term to include in the fragment.
+        :param textlen: length in characters of the document text.
+        """
+        
+        #: index of the first character of the fragment in the original document
         self.startchar = max(0, tokens[0].startchar - charsbefore)
+        #: index after the last character of the fragment in the original document
         self.endchar = min(textlen, tokens[-1].endchar + charsafter)
         self.matches = [t for t in tokens if t.matched]
         self.matched_terms = frozenset(t.text for t in self.matches)
     
     def __init__(self, size = 70):
         """
-        :size: size (in characters) to chunk to. The chunking is based on
+        :param size: size (in characters) to chunk to. The chunking is based on
             tokens, so the fragments will usually be smaller.
         """
         self.size = 70
 
 
 class SentenceFragmenter(object):
-    """"Breaks the text up on sentence end punctuation characters (".", "!", or "?").
+    """Breaks the text up on sentence end punctuation characters (".", "!", or "?").
     This object works by looking in the original text for a sentence end as the next
     character after each token's 'endchar'.
     """
     
     def __init__(self, maxchars = 200, sentencechars = ".!?"):
         """
-        :maxchars: The maximum number of characters allowed in a fragment.
+        :param maxchars: The maximum number of characters allowed in a fragment.
         """
         
         self.maxchars = maxchars
     
     def __init__(self, termset, maxchars = 200, charsbefore = 20, charsafter = 20):
         """
-        :termset: A collection (probably a set or frozenset) containing the
+        :param termset: A collection (probably a set or frozenset) containing the
             terms you want to match to token.text attributes.
-        :maxchars: The maximum number of characters allowed in a fragment.
-        :charsbefore: The number of extra characters of context to add before
+        :param maxchars: The maximum number of characters allowed in a fragment.
+        :param charsbefore: The number of extra characters of context to add before
             the first matched term.
-        :charsafter: The number of extra characters of context to add after
+        :param charsafter: The number of extra characters of context to add after
             the last matched term.
         """
         
 #class VectorFragmenter(object):
 #    def __init__(self, termmap, maxchars = 200, charsbefore = 20, charsafter = 20):
 #        """
-#        :termmap: A dictionary mapping the terms you're looking for to
+#        :param termmap: A dictionary mapping the terms you're looking for to
 #            lists of either (posn, startchar, endchar) or
 #            (posn, startchar, endchar, boost) tuples.
-#        :maxchars: The maximum number of characters allowed in a fragment.
-#        :charsbefore: The number of extra characters of context to add before
+#        :param maxchars: The maximum number of characters allowed in a fragment.
+#        :param charsbefore: The number of extra characters of context to add before
 #            the first matched term.
-#        :charsafter: The number of extra characters of context to add after
+#        :param charsafter: The number of extra characters of context to add after
 #            the last matched term.
 #        """
 #        

src/whoosh/index.py

 from threading import Lock
 from array import array
 
-from whoosh import fields, store
+from whoosh import __version__, fields, store
 
 
 _DEF_INDEX_NAME = "MAIN"
 _EXTENSIONS = "dci|dcz|tiz|fvz"
 
-_index_version = -101
+_index_version = -102
 
 _int_size = array("i").itemsize
 _ulong_size = array("L").itemsize
 
 # Exceptions
 
-class OutOfDateError(Exception):
+class IndexError(Exception):
+    """Generic index error."""
+
+class IndexVersionError(IndexError):
+    """Raised when you try to open an index using a format that the
+    current version of Whoosh cannot read. That is, the index you're
+    trying to open is neither backward nor forward compatible with this
+    version of Whoosh.
+    """
+    
+    def __init__(self, msg, version, release=None):
+        super(IndexVersionError, self).__init__(msg)
+        self.version = version
+        self.release = release
+
+class OutOfDateError(IndexError):
     """Raised when you try to commit changes to an index which is not
     the latest generation.
     """
-    pass
 
-class EmptyIndexError(Exception):
+class EmptyIndexError(IndexError):
     """Raised when you try to work with an index that has no indexed terms.
     """
-    pass
 
-class IndexLockedError(Exception):
+class IndexLockedError(IndexError):
     """Raised when you try to write to or lock an already-locked index (or
     one that was accidentally left in a locked state).
     """
-    pass
 
-class IndexError(Exception):
-    """Generic index error."""
-    pass
 
 
 # Utility functions
     
     return re.compile("(_%s_[0-9]+).(%s)" % (indexname, _EXTENSIONS))
 
+# User functions
+
 def create_in(dirname, schema = None, indexname = None, **kwargs):
     """Convenience function to create an index in a directory. Takes care of creating
     a FileStorage object for you. dirname is the filename of the directory in
     
     If you specify both a schema and keyword arguments, the schema wins.
     
-    Returns an Index object.
+    :returns: :class:`Index`
     """
     
     if not indexname:
     containing the index. indexname is the name of the index to create; you only need to
     specify this if you have multiple indexes within the same storage object.
     
-    Returns an Index object.
+    :returns: :class:`Index`
     """
     
     if indexname is None:
     return Index(store.FileStorage(dirname), indexname = indexname)
 
 def exists_in(dirname, indexname = None):
-    """Returns True if dirname contains a Whoosh index."""
+    """Returns True if dirname contains a Whoosh index.
     
-    if indexname is None:
-        indexname = _DEF_INDEX_NAME
+    :param dirname: the file path of a directory.
+    :param indexname: the name of the index. If None, the default index name is used.
+    :rtype: bool
+    """
     
     if os.path.exists(dirname):
         try:
-            ix = open_dir(dirname)
+            ix = open_dir(dirname, indexname=indexname)
             return ix.latest_generation() > -1
         except EmptyIndexError:
             pass
 
     return False
 
-def exists(storage, indexname):
+def exists(storage, indexname = None):
+    """Returns True if the given Storage object contains a Whoosh
+    index.
+    
+    :param storage: a store.Storage object.
+    :param indexname: the name of the index. If None, the default index name is used.
+    :rtype: bool
+    """
+    
     if indexname is None:
         indexname = _DEF_INDEX_NAME
         
     
     return False
 
+def version_in(dirname, indexname = None):
+    """Returns a tuple of (release_version, format_version), where
+    release_version is the release version number of the Whoosh code that
+    created the index -- e.g. (0, 1, 24) -- and format_version is the
+    version number of the on-disk format used for the index -- e.g. -102.
+    
+    The second number (format version) may be useful for figuring out if you
+    need to recreate an index because the format has changed. However, you
+    can just try to open the index and see if you get an IndexVersionError
+    exception.
+    
+    Note that the release and format version are available as attributes
+    on the Index object in Index.release and Index.version.
+    
+    :param dirname: the file path of a directory containing an index.
+    :param indexname: the name of the index. If None, the default index name is used.
+    :returns: ((major_ver, minor_ver, build_ver), format_ver)
+    """
+    
+    storage = store.FileStorage(dirname)
+    return version(storage, indexname=indexname)
+    
+
+def version(storage, indexname = None):
+    """Returns a tuple of (release_version, format_version), where
+    release_version is the release version number of the Whoosh code that
+    created the index -- e.g. (0, 1, 24) -- and format_version is the
+    version number of the on-disk format used for the index -- e.g. -102.
+    
+    The second number (format version) may be useful for figuring out if you
+    need to recreate an index because the format has changed. However, you
+    can just try to open the index and see if you get an IndexVersionError
+    exception.
+    
+    Note that the release and format version are available as attributes
+    on the Index object in Index.release and Index.version.
+    
+    :param storage: a store.Storage object.
+    :param indexname: the name of the index. If None, the default index name is used.
+    :returns: ((major_ver, minor_ver, build_ver), format_ver)
+    """
+    
+    try:
+        if indexname is None:
+            indexname = _DEF_INDEX_NAME
+        ix = Index(storage, indexname=indexname)
+        return (ix.release, ix.version)
+    except IndexVersionError, e:
+        return (e.release, e.version)
+
 
 # A mix-in that adds methods for deleting
 # documents from self.segments. These methods are on IndexWriter as
         field. This is useful when you have an indexed field containing
         a unique ID (such as "pathname") for each document.
         
-        :*returns*: the number of documents deleted.
+        :returns: the number of documents deleted.
         """
         
         from whoosh.query import Term
     def delete_by_query(self, q):
         """Deletes any documents matching a query object.
         
-        :*returns*: the number of documents deleted.
+        :returns: the number of documents deleted.
         """
         
         count = 0
     
     def __init__(self, storage, schema = None, create = False, indexname = _DEF_INDEX_NAME):
         """
-        :storage: The store.Storage object in which this index resides.
+        :param storage: The :class:`whoosh.store.Storage` object in which this index resides.
             See the store module for more details.
-        :schema: A fields.Schema object defining the fields of this index. If you omit
+        :param schema: A :class:`whoosh.fields.Schema` object defining the fields of this index. If you omit
             this argument for an existing index, the object will load the pickled Schema
             object that was saved with the index. If you are creating a new index
             (create = True), you must supply this argument.
-        :create: Whether to create a new index. If this is True, you must supply
+        :param create: Whether to create a new index. If this is True, you must supply
             a Schema instance using the schema keyword argument.
-        :indexname: An optional name to use for the index. Use this if you need
+        :param indexname: An optional name to use for the index. Use this if you need
             to keep multiple indexes in the same storage object.
         """
         
         """Returns a new Index object representing the latest generation
         of this index (if this object is the latest generation, returns
         self).
-        :*returns*: index.Index
+        
+        :returns: :class:`Index`
         """
         
         if not self.up_to_date():
         this index. Returns False if this object is not the latest
         generation (that is, someone else has updated the index since
         you opened this object).
+        
+        :rtype: bool
         """
         return self.generation == self.latest_generation()
     
         stream.write_varint(_float_size)
         stream.write_string(byteorder)
         
+        for num in __version__[:3]:
+            stream.write_varint(num)
         stream.write_int(_index_version)
         stream.write_string(cPickle.dumps(self.schema, -1))
         stream.write_int(self.generation)
             raise IndexError("Index was created on a different architecture")
         
         version = stream.read_int()
-        if version != _index_version:
-            raise IndexError("Don't know how to read index version %s" % version)
+        if version > -101:
+            # This index was created by an older version of Whoosh
+            raise IndexVersionError("Can't read old format %s" % version,
+                                    version)
+        elif version < _index_version:
+            # This index was created by a future version of Whoosh
+            raise IndexVersionError("Can't read newer format %s" )
+        elif version == -101:
+            # Backward compatibility: format -101 didn't write out the release
+            # number.
+            release = None
+        else:
+            release = (stream.read_varint(),
+                       stream.read_varint(),
+                       stream.read_varint())
+            
+        self.release = release
+        self.version = version
         
         # If the user supplied a schema object with the constructor,
         # don't load the pickled schema from the saved index.
     
     def lock(self):
         """Locks this index for writing, or raises an error if the index
-        is already locked. Returns true if the index was successfully
+        is already locked. Returns True if the index was successfully
         locked.
+        
+        :rtype: bool
         """
         return self.storage.lock("_%s_LOCK" % self.indexname)
     
     def is_empty(self):
         """Returns True if this index is empty (that is, it has never
         had any documents successfully written to it.)
+        
+        :rtype: bool
         """
         return len(self.segments) == 0
     
     def optimize(self):
         """Optimizes this index's segments. This will fail if the index
-        is already locked for writing.
+        is already locked for writing. If the index only has one segment
+        this does nothing.
         """
         
         if len(self.segments) < 2 and not self.segments.has_deletions():
         (that is, if someone has updated the index since you opened
         this object).
         
-        :new_segments: a replacement SegmentSet. This is used by
+        :param new_segments: a replacement SegmentSet. This is used by
             IndexWriter to update the index after it finishes
             writing.
         """
                 if num != self.generation:
                     try:
                         storage.delete_file(filename)
-                    except WindowsError:
+                    except OSError:
                         # Another process still has this file open
                         pass
             else:
                     if name not in current_segment_names:
                         try:
                             storage.delete_file(filename)
-                        except WindowsError:
+                        except OSError:
                             # Another process still has this file open
                             pass
     
         return sum(s.field_length(fieldnum) for s in self.segments)
     
     def term_reader(self):
-        """Returns a TermReader object for this index.
+        """Returns a TermReader object for this index. This is a low-level
+        method; users should obtain a :class:`whoosh.searching.Searcher`
+        with Index.searcher() instead.
         
-        :*returns*: reading.TermReader
+        :rtype: :class:`whoosh.reading.TermReader`
         """
         
         from whoosh import reading
             return reading.MultiTermReader(term_readers, doc_offsets, self.schema)
     
     def doc_reader(self):
-        """Returns a DocReader object for this index.
+        """Returns a DocReader object for this index. This is a low-level
+        method; users should obtain a :class:`whoosh.searching.Searcher`
+        with Index.searcher() instead.
         
-        :*returns*: reading.DocReader
+        :rtype: :class:`whoosh.reading.DocReader`
         """
         
         from whoosh import reading
         """Returns a Searcher object for this index. Keyword arguments
         are passed to the Searcher object's constructor.
         
-        :*returns*: searching.Searcher
+        :rtype: :class:`whoosh.searching.Searcher`
         """
         
         from whoosh.searching import Searcher
     def writer(self, **kwargs):
         """Returns an IndexWriter object for this index.
         
-        :*returns*: writing.IndexWriter
+        :rtype: :class:`whoosh.writing.IndexWriter`
         """
         from whoosh.writing import IndexWriter
         return IndexWriter(self, **kwargs)
         Result object. Any additional keyword arguments are passed to
         Searcher.search() along with the parsed query.
 
-        :querystring: The query string to parse and search for.
-        :parser: A Parser object to use to parse 'querystring'.
+        :param querystring: The query string to parse and search for.
+        :param parser: A Parser object to use to parse 'querystring'.
             The default is to use a standard qparser.QueryParser.
             This object must implement a parse(str) method which returns a
-            query.Query instance.
-        :*returns*: searching.Results
+            :class:`whoosh.query.Query` instance.
+        :rtype: :class:`whoosh.searching.Results`
         """
 
         if parser is None:
         return repr(self.segments)
     
     def __len__(self):
-        """:*returns*: the number of segments in this set."""
+        """:returns: the number of segments in this set."""
         return len(self.segments)
     
     def __iter__(self):
         return segment, docnum - offset
     
     def copy(self):
-        """:*returns*: a deep copy of this set."""
+        """:returns: a deep copy of this set."""
         return self.__class__([s.copy() for s in self.segments])
     
     def doc_offsets(self):
     
     def doc_count_all(self):
         """
-        :*returns*: the total number of documents, DELETED or
+        :returns: the total number of documents, DELETED or
             UNDELETED, in this set.
         """
         return sum(s.doc_count_all() for s in self.segments)
     
     def doc_count(self):
         """
-        :*returns*: the number of undeleted documents in this set.
+        :returns: the number of undeleted documents in this set.
         """
         return sum(s.doc_count() for s in self.segments)
     
     
     def max_weight(self):
         """
-        :*returns*: the maximum frequency of any term in the set.
+        :returns: the maximum frequency of any term in the set.
         """
         
         if not self.segments:
     
     def has_deletions(self):
         """
-        :*returns*: True if this index has documents that are marked
+        :returns: True if this index has documents that are marked
             deleted but haven't been optimized out of the index yet.
             This includes deletions that haven't been written to disk
             with Index.commit() yet.
     
     def deleted_count(self):
         """
-        :*returns*: the total number of deleted documents in this index.
+        :returns: the total number of deleted documents in this index.
         """
         return sum(s.deleted_count() for s in self.segments)
     
     def is_deleted(self, docnum):
         """
-        :*returns*: True if a given document number is deleted but not yet
+        :returns: True if a given document number is deleted but not yet
             optimized out of the index.
         """
         
     def __init__(self, name, max_doc, max_weight, field_length_totals,
                  deleted = None):
         """
-        :name: The name of the segment (the Index object computes this from its
+        :param name: The name of the segment (the Index object computes this from its
             name and the generation).
-        :max_doc: The maximum document number in the segment.
-        :term_count: Total count of all terms in all documents.
-        :max_weight: The maximum weight of any term in the segment. This is used
+        :param max_doc: The maximum document number in the segment.
+        :param term_count: Total count of all terms in all documents.
+        :param max_weight: The maximum weight of any term in the segment. This is used
             by some scoring algorithms.
-        :field_length_totals: A dictionary mapping field numbers to the total
+        :param field_length_totals: A dictionary mapping field numbers to the total
             number of terms in that field across all documents in the segment.
-        :deleted: A collection of deleted document numbers, or None
+        :param deleted: A collection of deleted document numbers, or None
             if no deleted documents exist in this segment.
         """
         
     
     def doc_count_all(self):
         """
-        :*returns*: the total number of documents, DELETED OR UNDELETED,
+        :returns: the total number of documents, DELETED OR UNDELETED,
             in this segment.
         """
         return self.max_doc
     
     def doc_count(self):
-        """:*returns*: the number of (undeleted) documents in this segment."""
+        """:returns: the number of (undeleted) documents in this segment."""
         return self.max_doc - self.deleted_count()
     
     def has_deletions(self):
-        """:*returns*: True if any documents in this segment are deleted."""
+        """:returns: True if any documents in this segment are deleted."""
         return self.deleted_count() > 0
     
     def deleted_count(self):
-        """:*returns*: the total number of deleted documents in this segment."""
+        """:returns: the total number of deleted documents in this segment."""
         if self.deleted is None: return 0
         return len(self.deleted)
     
     def field_length(self, fieldnum):
         """
-        :fieldnum: the internal number of the field.
-        :*returns*: the total number of terms in the given field across all
+        :param fieldnum: the internal number of the field.
+        :returns: the total number of terms in the given field across all
             documents in this segment.
         """
         return self.field_length_totals.get(fieldnum, 0)
         """Deletes the given document number. The document is not actually
         removed from the index until it is optimized.
 
-        :docnum: The document number to delete.
-        :delete: If False, this undeletes a deleted document.
+        :param docnum: The document number to delete.
+        :param delete: If False, this undeletes a deleted document.
         """
         
         if delete:
             self.deleted.remove(docnum)
     
     def is_deleted(self, docnum):
-        """:*returns*: True if the given document number is deleted."""
+        """:returns: True if the given document number is deleted."""
         
         if self.deleted is None: return False
         return docnum in self.deleted
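
A quick sketch of the delete/undelete flag in use, assuming the method documented above is the segment's delete_document(docnum, delete=True):

    seg.delete_document(5)                  # mark document 5 as deleted
    assert seg.is_deleted(5)
    seg.delete_document(5, delete = False)  # undelete it again
    assert not seg.is_deleted(5)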

src/whoosh/passages.py

 #    
 #    Translated into Python from the passages engine of the Minion search engine.
 #    
-#    :words: List of the search words.
-#    :poslist: List of lists, where each sublist contains the positions
+#    :param words: List of the search words.
+#    :param poslist: List of lists, where each sublist contains the positions
 #        at which the corresponding search word (from the 'words' list) was found.
-#    :maxmissing: The maximum number of missing words allowed. The default
+#    :param maxmissing: The maximum number of missing words allowed. The default
 #        is the number of words in 'words'. Set this to 0 to only find passages
 #        containing all the search words.
-#    :minwindow: The minimum size for passages (in words).
-#    :maxwindow: The maximum size for passages (in words).
+#    :param minwindow: The minimum size for passages (in words).
+#    :param maxwindow: The maximum size for passages (in words).
 #    """
 #    
 #    if maxmissing is None:
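
For readability, a sketch of the data shapes these (commented-out) parameters describe; the values are hypothetical:

    words = [u"whoosh", u"index"]
    # poslist[i] holds the positions at which words[i] was found in the document
    poslist = [[3, 17, 40], [5, 41]]
    maxmissing = 0    # only accept passages containing every search word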

src/whoosh/postpool.py

     
     def __init__(self, stream, count, buffer_size):
         """
-        :stream: the file from which to read.
-        :count: the number of postings in the stream.
-        :buffer_size: the size (in bytes) of the read buffer to use.
+        :param stream: the file from which to read.
+        :param count: the number of postings in the stream.
+        :param buffer_size: the size (in bytes) of the read buffer to use.
         """
         
         self.stream = stream
     
     def __init__(self, limit):
         """
-        :limit: the maximum amount of memory to use at once
+        :param limit: the maximum amount of memory to use at once
             for adding postings and the merge sort.
         """
         

src/whoosh/qparser.py

 
 This parser handles:
 
-    - 'and', 'or', 'not'
-    - grouping with parentheses
-    - quoted phrase searching
-    - wildcards at the end of a search prefix, e.g. help*
-    - ranges, e.g. a..b
+* 'and', 'or', 'not'
+* grouping with parentheses
+* quoted phrase searching
+* wildcards at the end of a search prefix, e.g. help*
+* ranges, e.g. a..b
 
-This parser is based on the searchparser example code available at:
+This parser was originally based on the searchparser example code available at:
 
 http://pyparsing.wikispaces.com/space/showimage/searchparser.py
+"""
 
-The code upon which this parser is based was made available by the authors under
-the following copyright and conditions:
+# The code upon which this parser was based was made available by the authors under
+# the following copyright and conditions:
 
 # Copyright (c) 2006, Estrate, the Netherlands
 # All rights reserved.
 # - Steven Mooij
 # - Rudolph Froger
 # - Paul McGuire
-"""
 
 def _make_default_parser():
     ParserElement.setDefaultWhitespaceChars(" \n\t\r'")
         valid queries. It may also raise a variety of exceptions if the input
         string is malformed.
         
-        :input: the unicode string to parse.
-        :normalize: whether to call normalize() on the query object/tree
+        :param input: the unicode string to parse.
+        :param normalize: whether to call normalize() on the query object/tree
             before returning it. This should be left on unless you're trying to
             debug the parser output.
+        :rtype: :class:`whoosh.query.Query`
         """
         
         self.stopped_words = set()
                  termclass = query.Term,
                  schema = None):
         """
-        :default_field: Use this as the field for any terms without
+        :param default_field: Use this as the field for any terms without
             an explicit field. For example, if the query string is
             "hello f1:there" and the default field is "f2", the parsed
             query will be as if the user had entered "f2:hello f1:there".
             This argument is required.
-        :conjuction: Use this query.Query class to join together clauses
+        :param conjunction: Use this query.Query class to join together clauses
             where the user has not explicitly specified a join. For example,
             if this is query.And, the query string "a b c" will be parsed as
             "a AND b AND c". If this is query.Or, the string will be parsed as
             "a OR b OR c".
-        :termclass: Use this query.Query class for bare terms. For example,
+        :param termclass: Use this query.Query class for bare terms. For example,
             query.Term or query.Variations.
-        :schema: An optional fields.Schema object. If this argument is present,
+        :param schema: An optional fields.Schema object. If this argument is present,
             the analyzer for the appropriate field will be run on terms/phrases
             before they are turned into query objects.
         """
         fieldname = fieldname or self.default_field
         start = self._analyze(fieldname, start)
         end = self._analyze(fieldname, end)
-        return query.TermRange(fieldname or self.default_field, (start, end))
+        return query.TermRange(fieldname or self.default_field, start, end)
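
Per this change (and the commit message), TermRange now takes the start and end terms as two separate arguments instead of a sequence of length 2:

    # old interface:  query.TermRange("path", (u"/a", u"/b"))
    # new interface:
    q = query.TermRange("path", u"/a", u"/b")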
     
     def make_and(self, qs):
         return query.And(qs)
     
     def __init__(self, default_field, termclass = query.Term, schema = None):
         """
-        :default_field: Use this as the field for any terms without
+        :param default_field: Use this as the field for any terms without
             an explicit field. For example, if the query string is
             "hello f1:there" and the default field is "f2", the parsed
             query will be as if the user had entered "f2:hello f1:there".
             This argument is required.
-        :termclass: Use this query class for bare terms. For example,
+        :param termclass: Use this query class for bare terms. For example,
             query.Term or query.Variations.
-        :schema: An optional fields.Schema object. If this argument is present,
+        :param schema: An optional fields.Schema object. If this argument is present,
             the analyzer for the appropriate field will be run on terms/phrases
             before they are turned into query objects.
         """
     def __init__(self, fieldname, minchars, maxchars, discardspaces = False,
                  analyzerclass = analysis.NgramAnalyzer):
         """
-        :fieldname: The field to search.
-        :minchars: The minimum gram size the text was indexed with.
-        :maxchars: The maximum gram size the text was indexed with.
-        :discardspaces: If False, grams containing spaces are made into optional
+        :param fieldname: The field to search.
+        :param minchars: The minimum gram size the text was indexed with.
+        :param maxchars: The maximum gram size the text was indexed with.
+        :param discardspaces: If False, grams containing spaces are made into optional
             clauses of the query. If True, grams containing spaces are ignored.
-        :analyzerclass: An analyzer class. The default is the standard NgramAnalyzer.
+        :param analyzerclass: An analyzer class. The default is the standard NgramAnalyzer.
             The parser will instantiate this analyzer with the gram size set to the maximum
             usable size based on the input string.
         """
         self.analyzerclass = analyzerclass
     
     def parse(self, input):
-        """Parses the input string and returns a Query object/tree.
-        
-        This method may return None if the input string does not result in any
-        valid queries. It may also raise a variety of exceptions if the input
-        string is malformed.
-        
-        :input: the unicode string to parse.
-        """
-        
         required = []
         optional = []
         gramsize = max(self.minchars, min(self.maxchars, len(input)))
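
The gram size used for the query is simply the input length clamped to the indexed range; a worked example with hypothetical values:

    minchars, maxchars = 2, 4
    text = u"abc"
    gramsize = max(minchars, min(maxchars, len(text)))   # -> 3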
 
 
 
-if __name__=='__main__':
-    from whoosh.fields import Schema, TEXT, NGRAM, ID
-    s = Schema(content = TEXT, path=ID)
-    
-    qp = QueryParser("content", schema = s)
-    pn = qp.parse(u'hello there', normalize = False)
-    print "pn=", pn
-    if pn:
-        nn = pn.normalize()
-        print "nn=", nn
     
 
 

src/whoosh/query.py

             
     return nvector
 
-# 
+# Exceptions
 
 class QueryError(Exception):
     """
     pass
 
 
+# Base classes
+
+
 class Query(object):
     """
     Abstract base class for all queries.
+    
+    Note that this base class implements __or__, __and__, and __sub__ to allow
+    slightly more convenient composition of query objects::
+    
+        >>> Term("content", u"a") | Term("content", u"b")
+        Or([Term("content", u"a"), Term("content", u"b")])
+        
+        >>> Term("content", u"a") & Term("content", u"b")
+        And([Term("content", u"a"), Term("content", u"b")])
+        
+        >>> Term("content", u"a") - Term("content", u"b")
+        And([Term("content", u"a"), Not(Term("content", u"b"))])
     """
     
     def __or__(self, query):
+        """Allows you to use | between query objects to wrap them in an Or query.
+        """
         return Or([self, query]).normalize()
     
     def __and__(self, query):
+        """Allows you to use & between query objects to wrap them in an Or query.
+        """
         return And([self, query]).normalize()
     
     def __sub__(self, query):
+        """Allows you to use - between query objects to add the right-hand query
+        as a "NOT" query.
+        """
+        
         q = And([self, Not(query)])
         return q.normalize()
     
     def all_terms(self, termset):
-        """
-        Adds the term(s) in this query (and its subqueries, where
-        applicable) to termset. Note that unlike existing_terms(),
-        this method will not add terms from queries that require
-        a TermReader to calculate their terms, such as Prefix and
-        Wildcard.
+        """Takes a set and recursively adds all terms in this query tree
+        to the set (this method *does not* return a sequence!).
+        
+        This method simply operates on the query itself, without reference
+        to an index (unlike existing_terms()), so it will *not* add terms
+        that require an index to compute, such as Prefix and Wildcard.
+        
+        >>> termset = set()
+        >>> q = And([Term("content", u"render"), Term("path", u"/a/b")])
+        >>> q.all_terms(termset)
+        >>> termset
+        set([("content", u"render"), ("path", u"/a/b")])
+        
+        :param termset: The set to add the terms to.
         """
         pass
     
     def existing_terms(self, searcher, termset, reverse = False):
-        """
-        Adds the term(s) in the query (and its subqueries, where
-        applicable) IF AND AS EXIST IN THE INDEX to termset.
-        If reverse is True, this method returns MISSING terms rather
-        than existing terms.
+        """Takes a set and recursively adds all terms in this query tree
+        to the set *if* they exist in the index represented by the
+        given Searcher (this method *does not* return a sequence!).
+        
+        This method references the Searcher to expand Prefix and Wildcard
+        queries, and only adds terms that actually exist in the index
+        (unless reverse=True).
+        
+        >>> searcher = my_index.searcher()
+        >>> termset = set()
+        >>> q = And([Or([Term("content", u"render"), Term("content", u"rendering")]),
+                     Prefix("path", u"/a/")])
+        >>> q.existing_terms(searcher, termset)
+        >>> termset
+        set([("content", u"render"), ("path", u"/a/b"), ("path", u"/a/c")])
+        
+        :param searcher: A :class:`whoosh.searching.Searcher` object.
+        :param termset: The set to add the terms to.
+        :param reverse: If True, this method adds *missing* terms
+            rather than *existing* terms to the set.
         """
         raise NotImplementedError
     
     def estimate_size(self, searcher):
-        """
-        Returns an estimate of how many documents this query could potentially
+        """Returns an estimate of how many documents this query could potentially
         match (for example, the estimated size of a simple term query is the
         document frequency of the term). It is permissible to overestimate, but
         not to underestimate.
         raise NotImplementedError
     
     def docs(self, searcher, exclude_docs = None):
-        """
-        Runs this query on the index represented by 'searcher'.
-        Yields a sequence of docnums. The base method simply forwards to
-        doc_scores() and throws away the scores, but if possible specific
-        implementations should use a more efficient method to avoid scoring
-        the hits.
+        """Low-level method. Yields a sequence of docnums matching this query.
         
-        exclude_docs is a BitVector of documents to exclude from the results.
+        The base method simply forwards to doc_scores() and throws away the scores,
+        but if possible specific implementations use a more efficient method
+        to avoid scoring the hits.
+        
+        >>> list(my_query.docs(searcher))
+        [10, 34, 78, 103]
+        
+        :param searcher: A :class:`whoosh.searching.Searcher` object.
+        :param exclude_docs: A :class:`~whoosh.support.bitvector.BitVector`
+            of documents to exclude from the results.
         """
         
         return (docnum for docnum, _ in self.doc_scores(searcher,
                                                         exclude_docs = exclude_docs))
     
     def doc_scores(self, searcher, weighting = None, exclude_docs = None):
-        """
-        Runs this query on the index represented by 'searcher'.
-        Yields a sequence of (docnum, score) pairs.
+        """Low-level method. Yields a sequence of (docnum, score) pairs, *not in order*.
         
-        exclude_docs is a BitVector of documents to exclude from the results.
+        >>> list(my_query.doc_scores(searcher))
+        [(10, 0.73), (34, 2.54), (78, 0.05), (103, 12.84)]
+        
+        :param searcher: A :class:`whoosh.searching.Searcher` object.
+        :param weighting: A :class:`whoosh.scoring.Weighting` object, or None.
+        :param exclude_docs: A :class:`~whoosh.support.bitvector.BitVector`
+            of documents to exclude from the results.
         """
         raise NotImplementedError
     
     def normalize(self):
-        """
-        Returns a recursively "normalized" form of this query. The normalized
-        form removes redundancy and empty queries. For example,
-        AND(AND(a, b), c, Or()) -> AND(a, b, c).
+        """Returns a recursively "normalized" form of this query. The normalized
+        form removes redundancy and empty queries. This is called automatically
+        on query trees created by the query parser, but you may want to call it
+        yourself if you're writing your own parser or building your own queries.
+        
+        >>> q = And([And([Term("f", u"a"), Term("f", u"b")]), Term("f", u"c"), Or([])])
+        >>> q.normalize()
+        And([Term("f", u"a"), Term("f", u"b"), Term("f", u"c")])
+        
+        Note that this returns a *new, normalized* query. It *does not* modify the
+        original query "in place".
         """
         return self
     
     def replace(self, oldtext, newtext):
-        """
-        Returns a copy of this query with oldtext replaced by newtext
-        (if oldtext was in this query).
+        """Returns a copy of this query with oldtext replaced by newtext
+        (if oldtext was anywhere in this query).
+        
+        Note that this returns a *new* query with the given text replaced.
+        It *does not* modify the original query "in place".
         """
         return self
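
For example (Term overrides replace() below; the base class simply returns itself unchanged):

    q = Term("content", u"render")
    q2 = q.replace(u"render", u"rendering")   # a new Term("content", u"rendering")
    # q itself is left untouched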
     
 
-class MultifieldTerm(Query):
-    def __init__(self, fieldnames, text, boost = 1.0):
-        self.fieldnames = fieldnames
-        self.text = text
-        self.boost = boost
-            
-    def __repr__(self):
-        return "%s(%r, %r, boost = %s)" % (self.fieldnames, self.text, self.boost)
-
-    def __unicode__(self):
-        return u"(%s):%s" % (u"|".join(self.fieldnames), self.text)
-    
-    def all_terms(self, termset):
-        for fn in self.fieldnames:
-            termset.add((fn, self.text))
-    
-    def existing_terms(self, searcher, termset, reverse = False):
-        for fn in self.fieldnames:
-            t = (fn, self.text)
-            contains = t in searcher
-            if reverse: contains = not contains
-            if contains:
-                termset.add(t)
-    
-    def estimate_size(self, searcher):
-        max_df = 0
-        text = self.text
-        
-        for fieldname in self.fieldnames:
-            fieldnum = searcher.fieldname_to_num(fieldname)
-            df = searcher.doc_frequency(fieldnum, text)
-            if df > max_df:
-                max_df = df
-                
-        return max_df
-    
-    def docs(self, searcher, exclude_docs = None):
-        vector = BitVector(searcher.doc_count_all())
-        text = self.text
-        
-        for fieldname in self.fieldnames:
-            fieldnum = searcher.fieldname_to_num(fieldname)
-            
-            if (fieldnum, text) in searcher:
-                for docnum, _ in searcher.postings(fieldnum, self.text,
-                                                      exclude_docs = exclude_docs):
-                    vector.set(docnum)
-                
-        return iter(vector)
-    
-    def doc_scores(self, searcher, weighting = None, exclude_docs = None):
-        text = self.text
-        weighting = weighting or searcher.weighting
-        
-        accumulators = defaultdict(float)
-        for fieldname in self.fieldnames:
-            fieldnum = searcher.fieldname_to_num(fieldname)
-            if (fieldnum, text) in searcher:
-                for docnum, weight in searcher.weights(fieldnum, text,
-                                                       exclude_docs = exclude_docs,
-                                                       boost = self.boost):
-                    accumulators[docnum] += weighting.score(searcher, fieldnum, text, docnum, weight)
-        
-        return accumulators.iteritems()
-    
-
 class SimpleQuery(Query):
     """
     Abstract base class for simple (single term) queries.
     """
     
     def __init__(self, fieldname, text, boost = 1.0):
-        """
-        fieldname is the name of the field to search. text is the text
-        of the term to search for. boost is a boost factor to apply to
-        the raw scores of any documents matched by this query.
-        """
-        
         self.fieldname = fieldname
         self.text = text
         self.boost = boost
             termset.add((fieldname, text))
 
 
-class Term(SimpleQuery):
-    """
-    Matches documents containing the given term (fieldname+text pair).
-    """
-    
-    def replace(self, oldtext, newtext):
-        if self.text == oldtext:
-            return Term(self.fieldname, newtext, boost = self.boost)
-        else:
-            return self
-    
-    def estimate_size(self, searcher):
-        fieldnum = searcher.fieldname_to_num(self.fieldname)
-        return searcher.doc_frequency(fieldnum, self.text)
-    
-    def docs(self, searcher, exclude_docs = None):
-        fieldnum = searcher.fieldname_to_num(self.fieldname)
-        text = self.text
-        
-        if (fieldnum, text) in searcher:
-            for docnum, _ in searcher.postings(fieldnum, text, exclude_docs = exclude_docs):
-                yield docnum
-    
-    def doc_scores(self, searcher, weighting = None, exclude_docs = None):
-        fieldnum = searcher.fieldname_to_num(self.fieldname)
-        text = self.text
-        boost = self.boost
-        if (fieldnum, text) in searcher:
-            weighting = weighting or searcher.weighting
-            for docnum, weight in searcher.weights(fieldnum, self.text,
-                                                   exclude_docs = exclude_docs):
-                yield docnum, weighting.score(searcher, fieldnum, text, docnum,
-                                              weight * boost)
-
-
 class CompoundQuery(Query):
     """
     Abstract base class for queries that combine or manipulate the results of
     """
     
     def __init__(self, subqueries, boost = 1.0):
-        """
-        subqueries is a list of queries to combine.
-        boost is a boost factor that should be applied to the raw score of
-        results matched by this query.
-        """
-        
         self.subqueries = subqueries
         self._notqueries = None
         self.boost = boost
                 subqs.append(s)
         
         return self.__class__(subqs)
-    
 
-class Require(CompoundQuery):
-    """Binary query returns results from the first query that also appear in the
-    second query, but only uses the scores from the first query. This lets you
-    filter results without affecting scores.
+
+class MultiTerm(Query):
+    """
+    Abstract base class for queries that operate on multiple
+    terms in the same field.
     """
     
-    JOINT = " REQUIRE "
+    def __init__(self, fieldname, words, boost = 1.0):
+        self.fieldname = fieldname
+        self.words = words
+        self.boost = boost
     
-    def __init__(self, subqueries, boost = 1.0):
-        assert len(subqueries) == 2
-        self.subqueries = subqueries
+    def __repr__(self):
+        return "%s(%r, %r)" % (self.__class__.__name__,
+                               self.fieldname, self.words)
+    
+    def _or_query(self, searcher):
+        fn = self.fieldname
+        return Or([Term(fn, word) for word in self._words(searcher)])
+    
+    def normalize(self):
+        return self.__class__(self.fieldname,
+                              [w for w in self.words if w is not None],
+                              boost = self.boost)
+    
+    def _words(self, searcher):
+        return self.words
+    
+    def all_terms(self, termset):
+        fieldname = self.fieldname
+        for word in self.words:
+            termset.add((fieldname, word))
+    
+    def existing_terms(self, searcher, termset, reverse = False):
+        fieldname = self.fieldname
+        for word in self._words(searcher):
+            t = (fieldname, word)
+            contains = t in searcher
+            if reverse: contains = not contains
+            if contains:
+                termset.add(t)
+    
+    def estimate_size(self, searcher):
+        fieldnum = searcher.fieldname_to_num(self.fieldname)
+        return sum(searcher.doc_frequency(fieldnum, text)
+                   for text in self._words(searcher))
+
+    def docs(self, searcher, exclude_docs = None):
+        return self._or_query(searcher).docs(searcher, exclude_docs = exclude_docs)
+
+    def doc_scores(self, searcher, weighting = None, exclude_docs = None):
+        return self._or_query(searcher).doc_scores(searcher,
+                                                    weighting = weighting,
+                                                    exclude_docs = exclude_docs)
+
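To illustrate the contract, here is a minimal hypothetical subclass; the class name and __unicode__ form are invented for the example, everything else follows the base class above:

    class InList(MultiTerm):
        """Matches documents containing any of a fixed list of words in a field."""

        def __unicode__(self):
            return u"%s:(%s)" % (self.fieldname, u"|".join(self.words))

    # Searching behaves like Or([Term("tag", w) for w in words]), via _or_query():
    q = InList("tag", [u"red", u"green", u"blue"])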
+
+class ExpandingTerm(MultiTerm):
+    """
+    Abstract base class for queries that take one term and expand it into
+    multiple terms, such as Prefix and Wildcard.
+    """
+    
+    def __init__(self, fieldname, text, boost = 1.0):
+        self.fieldname = fieldname
+        self.text = text
         self.boost = boost
+    
+    def __repr__(self):
+        return "%s(%r, %r)" % (self.__class__.__name__,
+                               self.fieldname, self.text)
+    
+    def __unicode__(self):
+        return "%s:%s*" % (self.fieldname, self.text)
+
+    def all_terms(self, termset):
+        termset.add((self.fieldname, self.text))
+    
+    def normalize(self):
+        return self
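
A subclass such as Prefix or Wildcard would typically override _words() to expand self.text against the index; a rough sketch, where expand_terms() is a hypothetical helper rather than a real searcher method:

    class PrefixLike(ExpandingTerm):
        def _words(self, searcher):
            # Expand the stored text into the matching indexed terms.
            return expand_terms(searcher, self.fieldname, self.text)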
+
+
+# Concrete classes
+
+
+class Term(SimpleQuery):
+    """
+    Matches documents containing the given term (fieldname+text pair).
+    
+    >>> Term("content", u"render")
+    """
+    
+    def replace(self, oldtext, newtext):
+        if self.text == oldtext:
+            return Term(self.fieldname, newtext, boost = self.boost)
+        else:
+            return self
+    
+    def estimate_size(self, searcher):
+        fieldnum = searcher.fieldname_to_num(self.fieldname)
+        return searcher.doc_frequency(fieldnum, self.text)
+    
+    def docs(self, searcher, exclude_docs = None):
+        fieldnum = searcher.fieldname_to_num(self.fieldname)
+        text = self.text
         
-    def docs(self, searcher, exclude_docs = None):
-        return And(self.subqueries).docs(searcher, exclude_docs = exclude_docs)
+        if (fieldnum, text) in searcher:
+            for docnum, _ in searcher.postings(fieldnum, text, exclude_docs = exclude_docs):
+                yield docnum
     
     def doc_scores(self, searcher, weighting = None, exclude_docs = None):
-        query, filterquery = self.subqueries
-        
-        filter = BitVector(searcher.doc_count_all())
-        for docnum in filterquery.docs(searcher, exclude_docs = exclude_docs):
-            filter.set(