Commits

Matt Chaput  committed 7dec8b1

Large number of wide-ranging improvements that should have been checked in individually.
Started adding unit tests.

  • Parent commits fb1fe45


Files changed (24)

File src/whoosh/analysis.py

 #===============================================================================
 
 """
-This module contains functions and classes for turning a piece of
-text into an indexable stream of words.
+Functions and classes for turning a piece of text into an
+indexable stream of words.
 
 This module defines three types of functions/classes:
 
 
 Filters: callables that take a "word" generator and filter it.
 
-Analyzers: classes that implement Analyzer.words() and
-Analyzer.positioned_words(). Analyzers package up a tokenizer and
-zero or more filters into a high-level interface used by other code.
-When you create an index, you specify an Analyzer for each field.
+Analyzers: callables that combine a tokenizer and filters for
+convenience.
 """
 
 import re
 
-from lang.porter import stem
+from whoosh.lang.porter import stem
 
 # Default list of stop words (words so common it's usually
 # wasteful to index them). This list is used by the StopFilter
               "that", "by", "with", "it", "as", "from", "an", "when",
               "not", "may", "tbd", "yet"]
 
+# Token object
+
+class Token(object):
+    __slots__ = ("positions", "chars",
+                 "orig", "text", "pos", "startchar", "endchar",
+                 "stopped")
+    
+    def __init__(self, positions, chars):
+        self.positions = positions
+        self.chars = chars
+        self.stopped = False
+
 # Support functions
 
 def gram(text, min, max):
 
 # Tokenizers
 
-def SimpleTokenizer(text):
+def IDTokenizer(value, positions = False, chars = False, start_pos = 0, start_char = 0):
     """
-    Uses a regular expression to pull words out of text.
+    Returns the entire input string as a single token. For use
+    in indexed but untokenized fields, such as a document's path.
     """
     
-    exp = re.compile(r"\W", re.UNICODE)
-    for w in exp.split(text):
-        if w and len(w) > 0:
-            yield w
+    t = Token(positions, chars)
+    t.orig = t.text = value
+    if positions:
+        t.pos = start_pos + 1
+    if chars:
+        t.startchar = start_char
+        t.endchar = start_char + len(value)
+    yield t
+    
 
-_space_split_exp = re.compile(r"(\s|,)+")
-def ListTokenizer(text):
+class RegexTokenizer(object):
     """
-    Instead of splitting words by ALL punctuation and whitespace, this
-    tokenizer only splits by whitespace and commas. This is useful for
-    lists of IDs.
+    Uses a regular expression to extract tokens from text.
     """
     
-    for w in _space_split_exp.split(text):
-        if w and len(w) > 0:
-            yield w
-            
-_comma_split_exp = re.compile("\s*,\s*")
-def CommaTokenizer(text):
+    default_expression = re.compile("\w+", re.UNICODE)
+    
+    def __init__(self, expression = None):
+        self.expression = expression or self.default_expression
+        
+    def __call__(self, value, positions = False, chars = False,
+                 start_pos = 0, start_char = 0):
+        t = Token(positions, chars)
+        
+        for pos, match in enumerate(self.expression.finditer(value)):
+            t.orig = t.text = match.group(0)
+            t.stopped = False
+            if positions:
+                t.pos = start_pos + pos
+            if chars:
+                t.startchar = start_char + match.start()
+                t.endchar = start_char + match.end()
+            yield t
+
+
+class SpaceSeparatedTokenizer(RegexTokenizer):
     """
-    Instead of splitting words by ALL punctuation and whitespace, this
-    tokenizer only splits by commas. This is useful for lists of tokens
-    that might contain spaces.
+    Splits tokens by whitespace.
     """
     
-    for w in _comma_split_exp.split(text):
-        if w and len(w) > 0:
-            yield w
+    default_expression = re.compile("[^ \t\r\n]+")
+
+
+class CommaSeparatedTokenizer(RegexTokenizer):
+    """
+    Splits tokens by commas with optional whitespace.
+    """
+    
+    default_expression = re.compile("[^,]+")
+    
+    def __call__(self, value, positions = False, chars = False,
+                 start_pos = 0, start_char = 0):
+        t = Token(positions, chars)
+        
+        for pos, match in enumerate(self.expression.finditer(value)):
+            t.orig = t.text = match.group(0).strip()
+            t.stopped = False
+            if positions:
+                t.pos = start_pos + pos
+            if chars:
+                t.startchar = start_char + match.start()
+                t.endchar = start_char + match.end()
+            yield t
+
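
For illustration, a minimal sketch of driving the new token-based tokenizers directly (the sample strings and the results shown in comments are assumptions for illustration, not part of the commit):

    from whoosh.analysis import IDTokenizer, RegexTokenizer, CommaSeparatedTokenizer

    # Tokenizers are generators that yield Token objects. Each tokenizer
    # creates a single Token and re-uses it for every match, so read the
    # attributes you need (text, pos, startchar, ...) inside the loop
    # instead of keeping the yielded objects around.
    rt = RegexTokenizer()
    for t in rt(u"Hello there Ahoy", positions = True, chars = True):
        print t.text, t.pos, (t.startchar, t.endchar)
    # Hello 0 (0, 5) / there 1 (6, 11) / Ahoy 2 (12, 16)

    ct = CommaSeparatedTokenizer()
    print [t.text for t in ct(u"alfa, bravo charlie, delta")]
    # [u'alfa', u'bravo charlie', u'delta']

    # IDTokenizer passes the whole value through as a single token, for
    # indexed-but-untokenized fields such as paths.
    print [t.text for t in IDTokenizer(u"/docs/readme.txt")]
    # [u'/docs/readme.txt']
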
 
 class NgramTokenizer(object):
     """
     Splits input text into Ngrams instead of words.
     """
     
-    def __init__(self, min, max, normalize = r"\W+"):
+    def __init__(self, minsize, maxsize = None):
         """
         min is the minimum length of the Ngrams to output, max is the
         maximum length to output. normalize is a regular expression that
         is globally replaced by spaces (used to eliminate punctuation).
         """
         
-        self.min = min
-        self.max = max
+        self.min = minsize
+        self.max = maxsize or minsize
         
-        self.normalize = normalize
-        if normalize:
-            self.normalize_exp = re.compile(normalize)
-    
-    def __call__(self, text):
-        if self.normalize:
-            text = self.normalize_exp.sub(" ", " %s " % text).strip()
-        return gram(text, self.min, self.max)
+    def __call__(self, value, positions = False, chars = False,
+                 start_pos = 0, start_char = 0):
+        inLen = len(value)
+        t = Token(positions, chars)
+        
+        pos = start_pos
+        for size in xrange(self.min, self.max + 1):
+            limit = inLen - size + 1
+            for start in xrange(0, limit):
+                end = start + size
+                t.orig = t.text = value[start : end]
+                t.stopped = False
+                if positions:
+                    t.pos = pos
+                if chars:
+                    t.startchar = start_char + start
+                    t.endchar = start_char + end
+                yield t
+                pos += 1
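
A quick sketch of the N-gram output (sample input assumed for illustration):

    from whoosh.analysis import NgramTokenizer

    nt = NgramTokenizer(3)            # minsize = maxsize = 3
    print [t.text for t in nt(u"whoosh")]
    # [u'who', u'hoo', u'oos', u'osh']

    nt = NgramTokenizer(2, 3)         # all 2-grams, then all 3-grams
    print [t.text for t in nt(u"abc")]
    # [u'ab', u'bc', u'abc']
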
 
 # Filters
 
     def clear(self):
         self.cache.clear()
     
-    def __call__(self, ws):
+    def __call__(self, tokens):
         cache = self.cache
         ignores = self.ignores
         
-        for w in ws:
-            if w in ignores:
-                yield w
-            elif w in cache:
-                yield cache[w]
+        for t in tokens:
+            if t.stopped:
+                yield t
+                continue
+            
+            text = t.text
+            if text in ignores:
+                yield t
+            elif text in cache:
+                t.text = cache[text]
+                yield t
             else:
-                s = stem(w)
-                cache[w] = s
+                t.text = s = stem(text)
+                cache[text] = s
                 yield s
 
+
 _camel_exp = re.compile("[A-Z][a-z]*|[a-z]+|[0-9]+")
-def CamelFilter(ws):
+def CamelFilter(tokens):
     """
     Splits CamelCased words into multiple words. For example,
-    splits "getProcessedToken" into "get", "Processed", and "Token".
+    the string "getProcessedToken" yields tokens
+    "getProcessedToken", "get", "Processed", and "Token".
+    
+    Obviously this filter needs to precede LowerCaseFilter in a filter
+    chain.
     """
     
-    for w in ws:
-        yield w
-        for match in _camel_exp.finditer(w):
-            sw = match.group(0)
-            if sw != w:
-                yield sw
+    for t in tokens:
+        yield t
+        text = t.text
+        
+        if text and not text.islower() and not text.isupper() and not text.isdigit():
+            chars = t.chars
+            if chars:
+                oldstart = t.startchar
+            
+            for match in _camel_exp.finditer(text):
+                sub = match.group(0)
+                if sub != text:
+                    t.text = sub
+                    if chars:
+                        t.startchar = oldstart + match.start()
+                        t.endchar = oldstart + match.end()
+                    yield t
+
 
 class StopFilter(object):
     """
         self.stops = frozenset(stoplist)
         self.min = minsize
     
-    def __call__(self, ws):
+    def __call__(self, tokens):
         stoplist = self.stops
         minsize = self.min
         
-        for w in ws:
-            if len(w) > minsize and not w in stoplist:
-                yield w
+        for t in tokens:
+            text = t.text
+            if len(text) < minsize or text in stoplist:
+                t.stopped = True
+            yield t
 
-def LowerCaseFilter(ws):
+
+def LowerCaseFilter(tokens):
     """
-    Lowercases (using str.lower()) words in the stream.
+    Lowercases (using .lower()) words in the stream.
     """
     
-    for w in ws:
-        yield w.lower()
+    for t in tokens:
+        t.text = t.text.lower()
+        yield t
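
Filters now pass Token objects along instead of plain strings, and StopFilter marks stop words with t.stopped = True rather than dropping them. A minimal sketch of chaining filters by hand (the sample text and stop list are assumptions):

    from whoosh.analysis import RegexTokenizer, CamelFilter, LowerCaseFilter, StopFilter

    tokenizer = RegexTokenizer()
    stopper = StopFilter(stoplist = ["and", "or"], minsize = 2)

    stream = stopper(LowerCaseFilter(CamelFilter(tokenizer(u"renderFrame and saveImage"))))
    for t in stream:
        print t.text, t.stopped
    # renderframe False / render False / frame False / and True /
    # saveimage False / save False / image False
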
 
 # Analyzers
 
 class Analyzer(object):
-    """
-    Base class for "analyzers" -- classes that package up
-    a tokenizer and zero or more filters to provide higher-level
-    functionality.
-    """
+    def __repr__(self):
+        return "%s()" % self.__class__.__name__
+
+
+class IDAnalyzer(Analyzer):
+    def __init__(self, strip = True):
+        self.strip = strip
     
-    def filter(self, ws):
-        """
-        If a derived class accepts the default tokenizer
-        (SimpleTokenizer) used by the base class, it only needs
-        to override this method. Otherwise they can override
-        Analyzer.words() instead for complete control.
-        """
+    def __call__(self, value, **kwargs):
+        if self.strip: value = value.strip()
+        return IDTokenizer(value, **kwargs)
+
+
+class SpaceSeparatedAnalyzer(Analyzer):
+    def __init__(self):
+        self.tokenizer = SpaceSeparatedTokenizer()
+    
+    def __call__(self, value, **kwargs):
+        return self.tokenizer(value, **kwargs)
+
+
+class CommaSeparatedAnalyzer(Analyzer):
+    def __init__(self):
+        self.tokenizer = CommaSeparatedTokenizer()
         
-        return ws
-    
-    def words(self, text):
-        """
-        Takes the text to index and yields a series of terms.
-        """
-        
-        return self.filter(SimpleTokenizer(text))
-    
-    def position_words(self, text, start_pos = 0):
-        """
-        Takes the text to index and yields a series of (position, term)
-        tuples. The base method simply enumerates the terms from the
-        words() method, but if you want something more complex you can
-        override this method.
-        start_pos is the base position to start numbering at.
-        """
-        
-        for i, w in enumerate(self.words(text)):
-            yield (start_pos + i, w)
+    def __call__(self, value, **kwargs):
+        return self.tokenizer(value, **kwargs)
+
 
 class SimpleAnalyzer(Analyzer):
     """
-    Simple analyzer: does nothing but return the result of the
-    SimpleTokenizer.
-    """
-    
-    def words(self, text):
-        return SimpleTokenizer(text)
-
-class IDAnalyzer(Analyzer):
-    """
-    Does no tokenization or analysis of the text at all: simply passes it
-    through as a single term.
+    Uses a RegexTokenizer and applies a LowerCaseFilter.
     """
     
     def __init__(self):
-        self.tokenizer = None
-        self.filters = []
-    
-    def words(self, text):
-        yield text
+        self.tokenizer = RegexTokenizer()
+        
+    def __call__(self, value, **kwargs):
+        return LowerCaseFilter(self.tokenizer(value, **kwargs))
 
-class KeywordAnalyzer(Analyzer):
+
+class StandardAnalyzer(Analyzer):
     """
-    Simple analyzer: does nothing but return the result of the
-    ListTokenizer.
+    Uses a RegexTokenizer (by default) and applies a LowerCaseFilter
+    and StopFilter.
     """
     
-    def words(self, text):
-        return ListTokenizer(text)
+    def __init__(self, stoplist = None, minsize = 2):
+        self.tokenizer = RegexTokenizer()
+        self.stopper = StopFilter(stoplist = stoplist, minsize = minsize)
+        
+    def __call__(self, value, **kwargs):
+        return self.stopper(LowerCaseFilter(
+                            self.tokenizer(value, **kwargs)))
 
-class CommaAnalyzer(Analyzer):
+
+class FancyAnalyzer(Analyzer):
     """
-    Simple analyzer: does nothing but return the result of the
-    CommaTokenizer.
+    Uses a RegexTokenizer (by default) and applies a CamelFilter,
+    LowerCaseFilter, and StopFilter.
     """
     
-    def words(self, text):
-        return CommaTokenizer(text)
+    def __init__(self, stoplist = None, minsize = 2):
+        self.tokenizer = RegexTokenizer()
+        self.stopper = StopFilter(stoplist = stoplist, minsize = minsize)
+        
+    def __call__(self, value, **kwargs):
+        return self.stopper(LowerCaseFilter(
+                            CamelFilter(
+                            self.tokenizer(value, **kwargs))))
 
-class LCAnalyzer(Analyzer):
+
+class NgramAnalyzer(Analyzer):
     """
-    Filters SimpleTokenizer through the LowerCaseFilter.
+    Uses an NgramTokenizer and applies a LowerCaseFilter.
     """
     
-    def filter(self, ws):
-        return LowerCaseFilter(ws)
-
-class StopAnalyzer(Analyzer):
-    """
-    Filters SimpleTokenizer through LowerCaseFilter and StopFilter.
-    """
-    
-    def __init__(self, stopwords = None):
-        """
-        stopwords is a sequence of words not to index; the default
-        is a list of common words.
-        """
+    def __init__(self, minsize, maxsize = None):
+        self.tokenizer = NgramTokenizer(minsize, maxsize = maxsize)
         
-        self.stopwords = stopwords
-        self.stopper = StopFilter(stopwords)
-    
-    def filter(self, ws):
-        return self.stopper(LowerCaseFilter(ws))
-    
-class StemmingAnalyzer(Analyzer):
-    """
-    Filters SimpleTokenizer through LowerCaseFilter, StopFilter,
-    and StemFilter.
-    """
-    
-    def __init__(self, stopwords = None):
-        """
-        stopwords is a sequence of words not to index; the default
-        is a list of common words.
-        """
-        
-        self.stemmer = StemFilter()
-        self.stopper = StopFilter(stopwords)
-    
-    def clear(self):
-        """
-        Releases memory used by the stem cache.
-        """
-        
-        self.stemmer.clear()
-    
-    def filter(self, ws):
-        return self.stemmer(self.stopper(LowerCaseFilter(CamelFilter(ws))))
-    
-class NgramAnalyzer(Analyzer):
-    """
-    Converts a string into a stream of (lower-case) N-grams
-    instead of words.
-    """
-    
-    def __init__(self, min = 3, max = None, normalize = r"\W+"):
-        """
-        min is the minimum length of the Ngrams to output, max is the
-        maximum length to output. normalize is a regular expression that
-        is globally replaced by spaces (used to eliminate punctuation).
-        """
-        
-        if max is None: max = min
-        assert type(min) == type(max) == int
-        self.min = min
-        self.max = max
-        
-        self.tokenizer = NgramTokenizer(min, max, normalize = normalize)
-    
-    def words(self, text):
-        for w in self.filter(self.tokenizer(text)):
-            yield w
-    
-    def filter(self, ws):
-        return LowerCaseFilter(ws)
+    def __call__(self, value, positions = False, chars = False):
+        return LowerCaseFilter(self.tokenizer(value,
+                                              positions = positions, chars = chars))
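
A sketch of the analyzers in use (the sample text is an assumption, and whether a particular word is flagged depends on the default stop list):

    from whoosh.analysis import StandardAnalyzer, FancyAnalyzer

    sa = StandardAnalyzer()
    # Stopped words are still yielded, just flagged, so filter on t.stopped
    # if you only want the indexable words.
    print [t.text for t in sa(u"The quick brown fox") if not t.stopped]
    # [u'quick', u'brown', u'fox']  (assuming "the" is in the default stop list)

    fa = FancyAnalyzer()
    print [t.text for t in fa(u"getProcessedToken")]
    # [u'getprocessedtoken', u'get', u'processed', u'token']
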
 
 
 if __name__ == '__main__':
-    import time
-    import index
-    from collections import defaultdict
+    import timeit
     
-    st = time.time()
-    map = defaultdict(list)
-    ix = index.open_dir("../index")
-    tr = ix.term_reader()
+    fix = """
+from whoosh.analysis import CamelFilter, FancyAnalyzer, StandardAnalyzer
+d = open("/Volumes/Storage/Development/help/documents/nodes/sop/copy.txt").read()
+sa = StandardAnalyzer()
+fa = FancyAnalyzer()
+"""
     
-    c = 0
-    for t in tr.field_words("content"):
-        map[stem(t)].append(t)
-        c += 1
+    t = timeit.Timer("l = [t.text for t in sa(d)]", fix)
+    print t.timeit(100)
     
-    print time.time() - st
-    print "\n".join("%r %r" % (stm, lst) for stm, lst in map.iteritems())
+    t = timeit.Timer("l = [t.text for t in fa(d)]", fix)
+    print t.timeit(100)
 
 
 

File src/whoosh/fields.py

 """
 
 from collections import defaultdict
+from whoosh import analysis
 
-# Base class
+# Exceptions
 
-class Field(object):
+class FieldConfigurationError(Exception):
+    pass
+
+# Field Types
+
+class FieldType(object):
+    format = vector = scorable = stored = None
+    
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError
+
+
+class CUSTOM(FieldType):
+    def __init__(self, format, vector = None,
+                 scorable = False, stored = False):
+        self.format = format
+        self.vector = vector
+        self.scorable = scorable
+        self.stored = stored
+
+
+class ID(FieldType):
+    def __init__(self, stored = False):
+        self.format = Existance(analyzer = analysis.IDAnalyzer())
+        self.stored = stored
+
+
+class STORED(FieldType):
+    def __init__(self):
+        self.format = Stored()
+        self.stored = True
+
+
+class KEYWORD(FieldType):
+    def __init__(self, stored = False, comma = False, scorable = False):
+        ana = analysis.CommaSeparatedAnalyzer() if comma else analysis.SpaceSeparatedAnalyzer()
+        self.format = Frequency(analyzer = ana)
+        self.scorable = scorable
+        self.stored = stored
+
+
+class TEXT(FieldType):
+    def __init__(self, stored = False, phrase = True, analyzer = None):
+        ana = analyzer or analysis.StandardAnalyzer()
+        self.format = Frequency(analyzer = ana)
+        
+        if phrase:
+            self.vector = Positions(analyzer = ana)
+        
+        self.scorable = True
+        self.stored = stored
+
+
+class NGRAM(FieldType):
+    def __init__(self, stored = False, minsize = 2, maxsize = 4):
+        self.format = Frequency(analyzer = analysis.NgramAnalyzer(minsize, maxsize))
+        self.scorable = True
+        self.stored = stored
+
+
+# Schema class
+
+class Schema(object):
+    """
+    Represents the fields in an index. Maps names to FieldType objects
+    which define the behavior of each field.
+    """
+    
+    def __init__(self, **fields):
+        self._by_number = []
+        self._names = []
+        self._by_name = {}
+        self._numbers = {}
+        
+        for name in sorted(fields.keys()):
+            self.add(name, fields[name])
+        
+    def __repr__(self):
+        return "<Schema: %s>" % repr(self.names)
+    
+    def __iter__(self):
+        return iter(self._by_number)
+    
+    def __getitem__(self, id):
+        if isinstance(id, basestring):
+            return self._by_name[id]
+        return self._by_number[id]
+    
+    def __len__(self):
+        return len(self._by_number)
+    
+    def __contains__(self, field):
+        return field in self._by_name
+    
+    def field_by_name(self, name):
+        return self._by_name[name]
+    
+    def field_by_number(self, number):
+        return self._by_number[number]
+    
+    def fields(self):
+        return self._by_name.iteritems()
+    
+    def names(self):
+        return self._names
+    
+    def add(self, name, fieldtype, **kwargs):
+        """
+        Adds a field to this schema.
+        """
+        
+        if name.startswith("_"):
+            raise FieldConfigurationError("Field names cannot start with an underscore")
+        elif name in self._by_name:
+            raise FieldConfigurationError("Schema already has a field named %s" % name)
+        
+        if isinstance(fieldtype, type):
+            fieldtype = fieldtype(**kwargs)
+        if not isinstance(fieldtype, FieldType):
+            raise FieldConfigurationError("%r is not a FieldType object" % fieldtype)
+        
+        fnum = len(self._by_number)
+        self._numbers[name] = fnum
+        self._by_number.append(fieldtype)
+        self._names.append(name)
+        self._by_name[name] = fieldtype
+        
+    def field_names(self):
+        """
+        Returns a list of the names of the fields in this schema.
+        """
+        return self._names
+    
+    def name_to_number(self, name):
+        """
+        Given a field name, returns the field's number.
+        """
+        return self._numbers[name]
+    
+    def number_to_name(self, number):
+        """
+        Given a field number, returns the field's name.
+        """
+        return self._names[number]
+    
+    def is_vectored(self, fieldnum):
+        """
+        Returns True if the given field stores vector information.
+        """
+        return self._by_number[fieldnum].vector is not None
+    
+    def has_vectored_fields(self):
+        """
+        Returns True if any of the fields in this schema store term vectors.
+        """
+        return any(ftype.vector for ftype in self._by_number)
+    
+    def vectored_fields(self):
+        """
+        Returns a list of field numbers corresponding to the fields that are
+        vectored.
+        """
+        return [i for i, ftype in enumerate(self._by_number) if ftype.vector]
+    
+    def is_scorable(self, fieldnum):
+        """
+        Returns True if the given field stores length information.
+        """
+        return self._by_number[fieldnum].scorable
+    
+    def scorable_fields(self):
+        """
+        Returns a list of field numbers corresponding to the fields that
+        store length information.
+        """
+        return [i for i, field in enumerate(self) if field.scorable]
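
A sketch of putting the new field types and Schema together (the field names are assumptions for illustration):

    from whoosh import fields

    schema = fields.Schema(title = fields.TEXT(stored = True),
                           path = fields.ID(stored = True),
                           tags = fields.KEYWORD,    # a bare class is instantiated by add()
                           content = fields.TEXT)

    print schema.names()                  # ['content', 'path', 'tags', 'title']
    print schema.name_to_number("title")  # 3 (fields are numbered in sorted name order)
    print schema.scorable_fields()        # [0, 3] -- the two TEXT fields
    print schema.has_vectored_fields()    # True -- TEXT(phrase = True) adds a Positions vector
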
+
+
+# Format base class
+
+class Format(object):
     """
     Abstract base class representing a field in an indexed document.
     """
     
-    def __init__(self, name, analyzer, field_boost = 1.0,
-                 stored = False, indexed = True,
-                 vector = None, **options):
+    def __init__(self, analyzer, field_boost = 1.0, **options):
         """
-        name is the name of the field, such as "contents" or "title".
-        analyzer is an INSTANCED analyzer (not a class) to use to
-        index this field (see the analysis module). field_boost is a
-        floating point factor to apply to the score of any results
-        from this field. stored controls whether the contents of this
-        field are stored in the index. indexed controls whether the
-        contents of this field are searchable.
+        analyzer is an analyzer object to use to
+        index this field (see the analysis module). Set the analyzer
+        to None if the field should not be indexed/searchable.
+        field_boost is a floating point factor to apply to the score of any
+        results from this field.
         """
         
-        self._check_name(name)
-        self.name = name
         self.analyzer = analyzer
         self.field_boost = field_boost
-        self.indexed = indexed
-        self.stored = stored
-        self.number = -1
         self.options = options
         
-        if isinstance(vector, type):
-            vector = vector(self.analyzer)
-        self.vector = vector
+    def __repr__(self):
+        return "%s(%r, boost = %s)" % (self.__class__.__name__,
+                                       self.analyzer, self.field_boost)
     
-    def _check_name(self, name):
-        if name.startswith("_"):
-            raise ValueError("Field names cannot start with an underscore")
-    
-    def __repr__(self):
-        return "%s(%r, %r, boost = %s)" % (self.__class__.__name__,
-                                           self.name,
-                                           self.analyzer,
-                                           self.field_boost)
-    
-    def __eq__(self, other):
-        if not hasattr(other, "name"): return False
-        if not hasattr(other, "analyzer"): return False
-        if not hasattr(other, "boost"): return False
-        return other.__class__ is self.__class__\
-            and other.name == self.name\
-            and other.analyzer == self.analyzer\
-            and other.boost == self.boost
-
     def word_datas(self, value, **kwargs):
         """
         Yields a series of "data" tuples from a string.
-        Applies the field's analyzer to get a stream of terms from
+        Applies the field's analyzer to get a stream of tokens from
         the string, then turns the stream of words into a stream of
         (word, freq, data) tuples, where "data" is field-specific information
-        about the word. This may include the frequency also (eg in
-        a FrequencyField, 'freq' and 'data' would be the same in the absence
+        about the word. The data may also be the frequency (eg in
+        a Frequency field, 'freq' and 'data' would be the same in the absence
         of any boost).
         """
-        
         raise NotImplementedError
     
     def write_postvalue(self, stream, data):
         
         raise NotImplementedError
     
+    def data_to_frequency(self, data):
+        """
+        Returns the 'data' interpreted as term frequency.
+        """
+        raise NotImplementedError
+
     def data_to_weight(self, data):
         """
-        Takes a data string and returns the weight component,
-        if any.
+        Returns the 'data' interpreted as a term weight.
         """
-        
-        raise NotImplementedError
-    
-    def data_to_positions(self, data):
-        """
-        Takes a data string and returns the position list,
-        if any.
-        """
-        
-        raise NotImplementedError
-    
-    def data_to_position_boosts(self, data):
-        """
-        Takes a data string and returns the (position, weight)
-        list, if any.
-        """
-        
         raise NotImplementedError
 
-    def has_positions(self):
-        return False
+    def supports(self, name):
+        return hasattr(self, "data_to_" + name)
+    
 
 # Concrete field classes
 
-class StoredField(Field):
+class Stored(Format):
     """
-    A Field that's stored but not indexed.
+    A field that's stored but not indexed.
     """
     
-    stored = True
-    indexed = False
-    vector = None
     analyzer = None
     
-    def __init__(self, name, **options):
-        self._check_name(name)
-        self.name = name
+    def __init__(self, **options):
         self.options = options
         
+    def __repr__(self):
+        return "%s()" % self.__class__.__name__
+        
 
-class IDField(Field):
+class Existance(Format):
     """
-    A Field that only indexes whether a given term occurred in
+    Only indexes whether a given term occurred in
     a given document; it does not store frequencies or positions.
-    For example, use this Field type to store a field like "filepath".
+    For example, use this format to store a field like "filepath".
     """
     
+    def __init__(self, analyzer, field_boost = 1.0, **options):
+        """
+        analyzer is an analyzer object to use to
+        index this field (see the analysis module). field_boost is a
+        floating point factor to apply to the score of any results
+        from this field.
+        """
+        
+        self.analyzer = analyzer
+        self.field_boost = field_boost
+        self.options = options
+    
     def word_datas(self, value, **kwargs):
         seen = set()
-        for w in self.analyzer.words(value):
-            seen.add(w)
+        for t in self.analyzer(value):
+            seen.add(t.text)
         
         return ((w, 1, None) for w in seen)
     
     def read_postvalue(self, stream):
         return None
     
+    def data_to_frequency(self, data):
+        return 1
+    
     def data_to_weight(self, data):
         return self.field_boost
 
 
-class FrequencyField(Field):
+class Frequency(Format):
     """
-    A Field that stores frequency information in each posting.
+    Stores frequency information for each posting.
     """
     
     def word_datas(self, value, **kwargs):
         seen = defaultdict(int)
-        for w in self.analyzer.words(value):
-            seen[w] += 1
+        for t in self.analyzer(value):
+            seen[t.text] += 1
         
         return ((w, freq, freq) for w, freq in seen.iteritems())
 
     def read_postvalue(self, stream):
         return stream.read_varint()
     
+    def data_to_frequency(self, data):
+        return data
+    
     def data_to_weight(self, data):
         return data * self.field_boost
 
 
-class DocBoostField(FrequencyField):
+class DocBoosts(Frequency):
     """
     A Field that stores frequency and per-document boost information
-    in each posting.
+    for each posting.
     """
     
     def word_datas(self, value, doc_boost = 1.0, **kwargs):
         seen = defaultdict(int)
-        for w in self.analyzer.words(value):
+        for w in self.analyzer(value):
             seen[w] += 1
         
         return ((w, freq, (freq, doc_boost)) for w, freq in seen.iteritems())
     def read_postvalue(self, stream):
         return (stream.read_varint(), stream.read_8bitfloat()) # , self.options.get("limit", 8)
     
+    def data_to_frequency(self, data):
+        return data[0]
+    
     def data_to_weight(self, data):
         return data[0] * data[1] * self.field_boost
 
 
-class PositionField(Field):
+# Vector formats
+
+class Positions(Format):
     """
-    A Field that stores position information in each posting, to
+    A vector that stores position information in each posting, to
     allow phrase searching and "near" queries.
     """
     
     def word_datas(self, value, start_pos = 0, **kwargs):
         seen = defaultdict(list)
-        
-        for pos, w in self.analyzer.position_words(value, start_pos = start_pos):
-            seen[w].append(start_pos + pos)
+        for t in self.analyzer(value, positions = True, start_pos = start_pos):
+            seen[t.text].append(start_pos + t.pos)
         
         return ((w, len(poslist), poslist) for w, poslist in seen.iteritems())
     
             pos_list.append(pos_base)
         return pos_list
     
+    def data_to_frequency(self, data):
+        return len(data)
+    
     def data_to_weight(self, data):
         return len(data) * self.field_boost
     
     def data_to_positions(self, data):
         return data
+
+
+class Characters(Format):
+    """
+    Stores token position and character start and end information
+    for each posting.
+    """
     
-    def has_positions(self):
-        return True
+    def word_datas(self, value, start_pos = 0, start_char = 0, **kwargs):
+        seen = defaultdict(list)
+        
+        for t in self.analyzer(value, positions = True, chars = True,
+                               start_pos = start_pos, start_char = start_char):
+            seen[t.text].append((t.pos, start_char + t.startchar, start_char + t.endchar))
+        
+        return ((w, len(ls), ls) for w, ls in seen.iteritems())
+    
+    def write_postvalue(self, stream, data):
+        pos_base = 0
+        char_base = 0
+        stream.write_varint(len(data))
+        for pos, startchar, endchar in data:
+            stream.write_varint(pos - pos_base)
+            pos_base = pos
+            
+            stream.write_varint(startchar - char_base)
+            stream.write_varint(endchar - startchar)
+            char_base = endchar
+        
+        return len(data)
+            
+    def read_postvalue(self, stream):
+        pos_base = 0
+        char_base = 0
+        ls = []
+        for i in xrange(stream.read_varint()): #@UnusedVariable
+            pos_base += stream.read_varint()
+            
+            char_base += stream.read_varint()
+            startchar = char_base
+            char_base += stream.read_varint() # End char
+            
+            ls.append((pos_base, startchar, char_base))
+        
+        return ls
+    
+    def data_to_frequency(self, data):
+        return len(data)
+    
+    def data_to_weight(self, data):
+        return len(data) * self.field_boost
+    
+    def data_to_positions(self, data):
+        return (pos for pos, _, _ in data)
+    
+    def data_to_characters(self, data):
+        return ((sc, ec) for _, sc, ec in data)
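
For illustration, a sketch of how a format turns analyzed text into postings (the sample value is an assumption, and the iteration order of the postings is not defined):

    from whoosh.analysis import SimpleAnalyzer
    from whoosh.fields import Positions

    fmt = Positions(analyzer = SimpleAnalyzer())
    for word, freq, data in fmt.word_datas(u"hello there hello"):
        print word, freq, data, fmt.data_to_weight(data)
    # hello 2 [0, 2] 2.0
    # there 1 [1] 1.0
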
 
 
-class PositionBoostField(PositionField):
+class PositionBoosts(Format):
     """
-    A Field that stores position and per-position boost information
+    A format that stores position and per-position boost information
     in each posting.
     """
     
         if boosts is None: boosts = {}
         
         seen = defaultdict(iter)
-        for pos, w in self.analyzer.position_words(value, start_pos = start_pos):
-            seen[w].append((pos, boosts.get(pos, 1.0)))
+        for t in self.analyzer(value, positions = True, start_pos = start_pos):
+            pos = t.pos
+            seen[t.text].append((pos, boosts.get(pos, 1.0)))
         
         return ((w, len(poslist), poslist) for w, poslist in seen.iteritems())
     
     def data_to_position_boosts(self, data):
         return data[1]
 
-# Term Vector classes
-
-class TermVector(object):
-    has_positions = False
-    
-    def __init__(self, analyzer):
-        self.analyzer = analyzer
-    
-    def _entry_writer(self, postingfile, data):
-        raise NotImplementedError
-    
-    def _entry_reader(self, postingfile):
-        raise NotImplementedError
-    
-    def _entry_skipper(self, postingfile):
-        self._entry_reader(postingfile)
-    
-    def add(self, table, docnum, fieldnum, value, start_pos = 0):
-        raise NotImplementedError
-    
-    def base_data(self, table, docnum, fieldnum):
-        return table.postings((docnum, fieldnum),
-                              readfn = self._entry_reader)
-    
-    def base_data_from(self, table, docnum, fieldnum, startid):
-        return table.postings_from((docnum, fieldnum), startid,
-                                   readfn = self._entry_reader,
-                                   skipfn = self._entry_skipper)
-
-
-class FrequencyVector(TermVector):
-    def _entry_writer(self, postingfile, freq):
-        postingfile.write_varint(freq)
-    
-    def _entry_reader(self, postingfile):
-        return postingfile.read_varint()
-    
-    def add(self, table, docnum, fieldnum, value, start_pos = 0):
-        freqs = defaultdict(int)
-        
-        for w in self.analyzer.words(value):
-            freqs[w] += 1
-        
-        for word, freq in sorted(freqs.iteritems()):
-            table.write_posting(word, freq,
-                                writefn = self._entry_writer)
-        table.add_row((docnum, fieldnum))
-        
-    def freqs(self, table, docnum, fieldnum):
-        return self.base_data(table, docnum, fieldnum)
-
-
-class PositionVector(TermVector):
-    has_positions = True
-    
-    def _entry_writer(self, postingfile, poslist):
-        base = 0
-        postingfile.write_varint(len(poslist))
-        for pos in poslist:
-            postingfile.write_varint(pos - base)
-            base = pos
-        
-    def _entry_reader(self, postingfile):
-        length = postingfile.read_varint()
-        result = []
-        base = 0
-        for _ in xrange(0, length):
-            base += postingfile.read_varint()
-            result.append(base)
-        return tuple(result)
-    
-    def add(self, table, docnum, fieldnum, value, start_pos = 0):
-        positions = defaultdict(list)
-        
-        for pos, w in enumerate(self.analyzer.words(value)):
-            positions[w].append(pos + start_pos)
-        
-        for word, poslist in sorted(positions.iteritems()):
-            table.write_posting(word, tuple(poslist),
-                                writefn = self._entry_writer)
-        table.add_row((docnum, fieldnum))
-
-    def freqs(self, table, docnum, fieldnum):
-        for w, posns in self.base_data(table, docnum, fieldnum):
-            yield w, len(posns)
-            
-    def positions(self, table, docnum, fieldnum):
-        return self.base_data(table, docnum, fieldnum)
-    
-    def positions_from(self, table, docnum, fieldnum, startid):
-        return self.base_data_from(table, docnum, fieldnum, startid)
-
 
 if __name__ == '__main__':
     pass

File src/whoosh/index.py

 #===============================================================================
 
 """
-This module contains the main functions/classes for maintaining an index.
+Contains the main functions/classes for creating, maintaining, and using
+an index.
 """
 
 from __future__ import division
 import re
 from bisect import bisect_right
+import cPickle
 
-import reading, store, writing
+from whoosh import store
 
 
 _DEF_INDEX_NAME = "MAIN"
     """
     pass
 
-class EmptyIndex(Exception):
+class EmptyIndexError(Exception):
     """
     Raised when you try to work with an index that has no indexed terms.
     """
     pass
 
-class IndexLocked(Exception):
+class IndexLockedError(Exception):
     """
     Raised when you try to write to or lock an already-locked index (or
     one that was accidentally left in a locked state).
 
 # Utility functions
 
-def toc_pattern(indexname):
+def _toc_pattern(indexname):
     """
     Returns a regular expression object that matches TOC filenames.
     name is the name of the index.
     
     return re.compile("_%s_([0-9]+).toc" % indexname)
 
-def segment_pattern(indexname):
+def _segment_pattern(indexname):
     """
     Returns a regular expression object that matches segment filenames.
     name is the name of the index.
     
     return re.compile("(_%s_[0-9]+).(%s)" % (indexname, _EXTENSIONS))
 
-def _last_generation(storage, indexname):
-    """
-    Utility function to find the most recent generation number of the index.
-    storage is the storage object containing the index. indexname is the name of
-    the index.
-    """
-    
-    pattern = toc_pattern(indexname)
-    
-    max = -1
-    for filename in storage:
-        m = pattern.match(filename)
-        if m:
-            num = int(m.group(1))
-            if num > max: max = num
-    return max
-
-def clear_index(storage, indexname):
-    """
-    Clears all information from an index!
-    storage is the storage object containing the index. indexname is the name of
-    the index.
-    """
-    
-    prefix = "_%s_" % indexname
-    for filename in storage:
-        if filename.startswith(prefix):
-            storage.delete_file(filename)
-
-def create(storage, schema, indexname = _DEF_INDEX_NAME):
-    """
-    Initializes necessary files for a new index.
-    storage is the storage object in which to create the index.
-    schema is an index.Schema object describing the index's fields.
-    indexname is the name of the index to create; you only need to
-    specify this if you are creating multiple indexes within the
-    same storage object.
-    
-    Returns an index.Index object.
-    """
-    
-    clear_index(storage, indexname)
-    _write_index_file(storage, indexname, 0, [], schema, 0)
-    ix = Index(storage, indexname)
-    if ix.is_locked():
-        ix.unlock()
-    return ix
-
-def _write_index_file(storage, indexname, generation, segments, schema, counter):
-    """
-    Utility function writes an index TOC file using the informaiton supplied in the
-    arguments.
-    """
-    stream = storage.create_file("_%s_%s.toc" % (indexname, generation))
-    stream.write_pickle((segments, schema, counter))
-    stream.close()
-
-def _toc_name(name, generation):
-    """
-    Utility function returns the filename for the TOC file given an index name
-    and a generation number.
-    """
-    
-    return "_%s_%s.toc" % (name, generation)
-
-def _read_index_file(storage, name, generation):
-    """
-    Utility function reads the contents of an index TOC file and returns the
-    information inside as a tuple of ([index.Segment], index.Schema, counter)
-    """
-    
-    stream = storage.open_file(_toc_name(name, generation))
-    segments, schema, counter = stream.read_pickle()
-    stream.close()
-    return segments, schema, counter
-
-def _last_modified(storage, name):
-    """
-    Utility function takes a storage object and the name of an index an returns
-    the last modified time of the index.
-    """
-    
-    gen = _last_generation(storage, name)
-    return storage.file_modified(_toc_name(name, gen))
-
-
 def create_index_in(dirname, schema, indexname = None):
     """
     Convenience function to create an index in a directory. Takes care of creating
     a FileStorage object for you. dirname is the filename of the directory in
-    which to create the index. schema is an index.Schema object describing the
+    which to create the index. schema is a fields.Schema object describing the
     index's fields. indexname is the name of the index to create; you only need to
     specify this if you are creating multiple indexes within the
     same storage object.
     
-    Returns an index.Index object.
+    Returns an Index object.
     """
     
     if indexname is None:
         indexname = _DEF_INDEX_NAME
     
     storage = store.FileStorage(dirname)
-    return create(storage, schema, indexname = indexname)
+    return Index(storage, schema, create = True, indexname = indexname)
 
 def open_dir(dirname, indexname = None):
     """
     containing the index. indexname is the name of the index to create; you only need to
     specify this if you have multiple indexes within the same storage object.
     
-    Returns an index.Index object.
+    Returns an Index object.
     """
     
     if indexname is None:
     
     return Index(store.FileStorage(dirname), indexname = indexname)
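
A sketch of tying this together (the directory name is an assumption and is assumed to already exist):

    from whoosh import fields, index

    schema = fields.Schema(title = fields.TEXT(stored = True),
                           content = fields.TEXT)

    # Create a brand-new, empty index in a directory.
    ix = index.create_index_in("indexdir", schema)
    print ix, ix.up_to_date()

    # Later, reopen the latest generation of the same index.
    ix = index.open_dir("indexdir")
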
 
-def has_index(dirname, indexname = None):
-    """
-    Returns whether a given directory contains a valid index.
-    indexname is the name of the index to create; you only need to
-    specify this if you have multiple indexes within the same storage object.
-    """
-    
-    if indexname is None:
-        indexname = _DEF_INDEX_NAME
-        
-    gen = _last_generation(store.FileStorage(dirname), indexname)
-    return gen >= 0
-
-# Classes
-
-class Schema(object):
-    """
-    Represents the fields in an index.
-    """
-    
-    def __init__(self, *fields):
-        """
-        The positional arguments to he constructor must be INSTANTIATED fields.Field
-        objects (not classes) representing the fields of an index.
-        """
-        
-        self.by_number = []
-        self.by_name = {}
-        
-        for field in fields:
-            self.add(field)
-    
-    def __repr__(self):
-        return "<Schema: %r>" % self.by_number
-    
-    def __iter__(self):
-        return iter(self.by_number)
-    
-    def add(self, field):
-        """
-        Adds a fields.Field object to this schema.
-        """
-        
-        if self.by_name.has_key(field.name):
-            raise Exception("Schema already has a field named %s" % field.name)
-        
-        num = len(self.by_number)
-        field.number = num
-        self.by_number.append(field)
-        self.by_name[field.name] = field
-    
-    def field_names(self):
-        """
-        Returns a list of the names of the fields in this schema.
-        """
-        return self.by_name.keys()
-    
-    def name_to_number(self, name):
-        """
-        Given a field name, returns the field's number.
-        """
-        return self.by_name[name].number
-    
-    def number_to_name(self, number):
-        """
-        Given a field number, returns the field's name.
-        """
-        return self.by_number[number].name
-    
-    def has_name(self, name):
-        """
-        Returns True if this schema has a field by the given name.
-        """
-        return name in self.by_name
-    
-    def has_field(self, field):
-        """
-        Returns True if this schema contains the given fields.Field object.
-        """
-        return self.has_name(field.name) and self.by_name[field.name] == field
-    
-    def has_vectors(self):
-        """
-        Returns True if any of the fields in this schema store term vectors.
-        """
-        return any(field.vector for field in self)
-    
-    def vectored_fields(self):
-        """
-        Returns a list of field numbers corresponding to the fields that are
-        vectored.
-        """
-        return [field.number for field in self if field.vector]
-
 
 class Index(object):
     """
     Represents (a generation of) an index. You must create the index using
     index.create() or index.create_index_in() before you can instantiate this
-    object (otherwise it will raise index.EmptyIndex).
+    object (otherwise it will raise index.EmptyIndexError).
     """
     
-    def __init__(self, storage, indexname = None):
-        """
-        storage is a storage object in which this index is stored.
-        indexname is the name of the index; you only need to
-        specify this if you have multiple indexes within the
-        same storage object.
+    def __init__(self, storage, schema = None, create = False, indexname = _DEF_INDEX_NAME):
         """
         
-        if indexname is None:
-            indexname = _DEF_INDEX_NAME
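+        storage is a storage object (e.g. store.FileStorage) in which this
+        index is stored. schema is a fields.Schema object defining the
+        index's fields; it is required when create is True, and when opening
+        an existing index it is normally read from the index itself, though
+        you can pass one here to override it. create is whether to create a
+        new, empty index, clearing any existing files for this index name.
+        indexname is the name of the index; you only need to specify this if
+        you have multiple indexes within the same storage object.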
+        """
         
         self.storage = storage
-        self.name = indexname
+        self.indexname = indexname
         
-        self.generation = _last_generation(storage, indexname)
-        if self.generation >= 0:
-            self.reload()
+        if create:
+            if schema is None:
+                raise IndexError("To create an index you must specify a schema")
+            
+            self.schema = schema
+            self.generation = 0
+            self.segment_counter = 0
+            self.segments = SegmentSet()
+            
+            # Clear existing files
+            prefix = "_%s_" % self.indexname
+            for filename in self.storage:
+                if filename.startswith(prefix):
+                    storage.delete_file(filename)
+            
+            self._write()
         else:
-            raise EmptyIndex
+            self._read(schema)
+            
+    def latest_generation(self):
+        """
+        Returns the generation number of the latest generation of this
+        index.
+        """
         
-        self._dr = self._tr = None
+        pattern = _toc_pattern(self.indexname)
         
+        max = -1
+        for filename in self.storage:
+            m = pattern.match(filename)
+            if m:
+                num = int(m.group(1))
+                if num > max: max = num
+        return max
+    
+    def refresh(self):
+        """
+        Returns a new Index object representing the latest generation
+        of this index (if this object is the latest generation, returns
+        self).
+        """
+        
+        if not self.up_to_date():
+            return self.__class__(self.storage, indexname = self.indexname)
+        else:
+            return self
+    
+    def up_to_date(self):
+        """
+        Returns True if this object represents the latest generation of
+        this index. Returns False if this object is not the latest
+        generation (that is, someone else has updated the index since
+        you opened this object).
+        """
+        return self.generation == self.latest_generation()
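
For example, a long-running process can pick up changes committed by another process (sketch; ix is assumed to be an Index opened earlier):

    if not ix.up_to_date():
        ix = ix.refresh()
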
+    
+    def _write(self):
+        # Writes the content of this index to the .toc file.
+        stream = self.storage.create_file(self._toc_filename())
+        stream.write_string(cPickle.dumps(self.schema, -1))
+        stream.write_int(self.generation)
+        stream.write_int(self.segment_counter)
+        stream.write_pickle(self.segments)
+        stream.close()
+    
+    def _read(self, schema):
+        # Reads the content of this index from the .toc file.
+        stream = self.storage.open_file(self._toc_filename())
+        
+        # If the user supplied a schema object with the constructor,
+        # don't load the pickled schema from the saved index.
+        if schema:
+            self.schema = schema
+            stream.skip_string()
+        else:
+            self.schema = cPickle.loads(stream.read_string())
+            
+        self.generation = stream.read_int()
+        self.segment_counter = stream.read_int()
+        self.segments = stream.read_pickle()
+        stream.close()
+    
+    def next_segment_name(self):
+        #Returns the name of the next segment in sequence.
+        
+        self.segment_counter += 1
+        return "_%s_%s" % (self.indexname, self.segment_counter)
+    
+    def _toc_filename(self):
+        return "_%s_%s.toc" % (self.indexname, self.generation)
+    
+    def last_modified(self):
+        """
+        Returns the last modified time of the .toc file.
+        """
+        return self.storage.file_modified(self._toc_filename())
+    
     def __repr__(self):
-        return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.name)
+        return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname)
     
     def lock(self):
         """
-        Locks this index for writing, or raises IndexLocked if the index
+        Locks this index for writing, or raises IndexLockedError if the index
         is already locked.
         """
         
-        try:
-            self.storage.make_dir("_%s_LOCK" % self.name)
-            return True
-        except:
-            raise IndexLocked
-    
-    def is_locked(self):
-        """
-        Returns True if this index is currently locked for writing.
-        """
-        
-        return self.storage.file_exists("_%s_LOCK" % self.name)
+        self.storage.lock("_%s_LOCK" % self.indexname)
+        return True
     
     def unlock(self):
         """
         Unlocks the index. Only call this if you were the one who locked
-        it (without getting an IndexLocked exception) in the first place!
+        it (without getting an exception) in the first place!
         """
         
         try:
-            self.storage.remove_dir("_%s_LOCK" % self.name)
+            self.storage.unlock("_%s_LOCK" % self.indexname)
         except:
             pass
     
     def is_empty(self):
         """
         Returns True if this index is empty (that is, it has never
-        had any documents sucessfully written to it.
+        had any documents successfully written to it).
         """
         
         return len(self.segments) == 0
     
-    def field_by_name(self, name):
+    def optimize(self):
         """
-        Given a field name, returns the fields.Field object
-        from this index's schema.
+        Optimizes this index's segments.
+        
+        This opens and closes a writing.IndexWriter object, so it may
+        fail if the index is already locked for writing.
         """
-        return self.schema.by_name[name]
+        
+        if len(self.segments) < 2 and not self.segments.has_deletions():
+            return
+        
+        from whoosh import writing
+        w = writing.IndexWriter(self)
+        w.optimize()
+        w.close()
     
-    def fieldnum_by_name(self, name):
+    def commit(self, newsegments = None):
         """
-        Given a field name, returns the field number in this
-        index's schema.
+        Commits pending edits (such as deletions) to this index object.
+        Raises OutOfDateError if this index is not the latest generation
+        (that is, if some code has updated the index since you opened
+        this object).
         """
-        return self.schema.name_to_number(name)
+        
+        if not self.up_to_date():
+            raise OutOfDateError
+        
+        if newsegments:
+            self.segments = newsegments
+        
+        self.generation += 1
+        self._write()
+        self.clean_files()
+    
+    def clean_files(self):
+        """
+        Attempts to remove unused index files (called when a new generation
+        is created). If existing Index and/or reader objects have the files
+        open, they may not get deleted immediately (e.g. on Windows)
+        but will probably be deleted eventually by a later call to clean_files.
+        """
+        
+        storage = self.storage
+        current_segment_names = set([s.name for s in self.segments])
+        
+        tocpattern = _toc_pattern(self.indexname)
+        segpattern = _segment_pattern(self.indexname)
+        
+        for filename in storage:
+            m = tocpattern.match(filename)
+            if m:
+                num = int(m.group(1))
+                if num != self.generation:
+                    storage.delete_file(filename)
+            else:
+                m = segpattern.match(filename)
+                if m:
+                    name = m.group(1)
+                    if name not in current_segment_names:
+                        storage.delete_file(filename)
     
     def doc_count_all(self):
         """
         Returns the total number of documents, DELETED OR UNDELETED,
         in this index.
         """
-        return sum(s.max_doc for s in self.segments)
+        return self.segments.doc_count_all()
     
     def doc_count(self):
         """
         Returns the total number of UNDELETED documents in this index.
         """
-        return sum(s.doc_count() for s in self.segments)
+        return self.segments.doc_count()
     
     def max_count(self):
         """
         """
         return max(s.max_count for s in self.segments)
     
-    def term_count(self):
+    def total_term_count(self):
         """
         Returns the total term count across all fields in all documents.
-        This is used by some scoring algorithms.
+        This is used by some scoring algorithms. Note that this
+        necessarily includes terms in deleted documents.
         """
         return sum(s.term_count for s in self.segments)
     
     def field_length(self, fieldnum):
         """
         Returns the total number of terms in a given field (the "field length").
-        This is used by some scoring algorithms.
+        This is used by some scoring algorithms. Note that this
+        necessarily includes terms in deleted documents.
         """
         
         if isinstance(fieldnum, basestring):
             fieldnum = self.schema.number_to_name(fieldnum)
         
-        return sum(s.field_counts.get(fieldnum, 0) for s in self.segments)
+        return sum(s.field_length(fieldnum) for s in self.segments)
     
-    def sibling(self, indexname):
+    def term_reader(self):
         """
-        Convenience function to get another index in the same storage
-        object as this one. This is only useful if you have multiple
-        indexes in the same storage object.
-        
-        Returns an index.Index object.
+        Returns a TermReader object for this index.
         """
         
-        return Index(self.storage, indexname = indexname)
-    
-    def term_reader(self):
-        segs = self.segments
+        from whoosh import reading
+        segments = self.segments
         
-        if len(segs) == 1:
-            segment = segs[0]
-            return reading.TermReader(self.storage, segment, self.schema)
+        if len(segments) == 1:
+            return reading.TermReader(self.storage, segments[0], self.schema)
         else:
-            term_readers = [reading.TermReader(self.storage, s, self.schema)
-                            for s in segs]
-            return reading.MultiTermReader(term_readers, self.doc_offsets)
+            return reading.MultiTermReader(self.storage, segments, self.schema)
     
     def doc_reader(self):
-        schema = self.schema
-        if len(self.segments) == 1:
-            return reading.DocReader(self.storage, self.segments[0], schema)
-        else:
-            doc_readers = [reading.DocReader(self.storage, s, schema)
-                           for s in self.segments]
-            return reading.MultiDocReader(doc_readers, self.doc_offsets)
-    
-    def find(self, querystring):
-        import searching, qparser
-        s = searching.Searcher(self)
-        pq = qparser.QueryParser(self.schema).parse(querystring)
-        return s.search(pq)
-    
-    def doc(self, **kw):
         """
-        Convenience function returns the stored fields of a document
-        matching the given keyword arguments, where the keyword keys are
-        field names and the values are terms that must appear in the field.
-        
-        Where Index.docs() returns a generator, this function returns either
-        a dictionary or None. Use it when you assume the given keyword arguments
-        either match zero or one documents (i.e. at least one of the fields is
-        a unique key).
-        
-        This method opens and closes a temporary searcher for each call and
-        forwards to its equivalent method. If you are calling it multiple times
-        in a row, you should open your own searcher instead.
+        Returns a DocReader object for this index.
         """
         
-        for p in self.docs(**kw):
-            return p
+        from whoosh import reading
+        schema = self.schema
+        segments = self.segments
+        if len(segments) == 1:
+            return reading.DocReader(self.storage, segments[0], schema)
+        else:
+            return reading.MultiDocReader(self.storage, segments, schema)
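
This commit drops the term_exists() and stored() convenience methods further down in favor of using the readers directly. Below is a minimal usage sketch, not part of this commit; the index object "ix" and the "content" field are assumptions, while the membership test and item access follow the patterns the deleted helpers used.

# Illustrative sketch only; "ix" and the "content" field are assumptions.
tr = ix.term_reader()
try:
    # A TermReader supports membership tests on (fieldname, text) tuples,
    # which is what the removed term_exists() helper relied on.
    has_term = ("content", u"whoosh") in tr
finally:
    tr.close()

dr = ix.doc_reader()
try:
    # A DocReader supports item access by document number to get the
    # stored fields, which is what the removed stored() helper relied on.
    fields = dr[0]
finally:
    dr.close()
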
     
-    def docs(self, **kw):
+    def searcher(self):
         """
-        Convenience function returns the stored fields of a document
-        matching the given keyword arguments, where the keyword keys are
-        field names and the values are terms that must appear in the field.
-        
-        Returns a list (not a generator, so as not to keep the readers open)
-        of dictionaries containing the stored fields of any documents matching
-        the keyword arguments.
-        
-        This method opens and closes a temporary searcher for each call and
-        forwards to its equivalent method. If you are calling it multiple times
-        in a row, you should open your own searcher instead.
+        Returns a Searcher object for this index.
         """
         
-        import searching
-        s = searching.Searcher(self)
-        try:
-            return s.docs(**kw)
-        finally:
-            s.close()
-        
+        from whoosh.searching import Searcher
+        return Searcher(self)
     
-    def term_exists(self, fieldname, text):
+    def find(self, querystring, parser = None, **kwargs):
         """
-        Returns True if the given term exists in this index.
-        
-        Note that this convenience method opens and closes a temporary TermReader.
-        If you are planning to call this multiple times, it's more efficient to
-        create your own TermReader and use 'term in term_reader'.
+        Searches for querystring and returns a Results object. By default,
+        this method uses a standard qparser.QueryParser object to parse the
+        querystring. You can specify a different parser using the parser
+        keyword argument. This object must implement a 'parse' method which
+        takes a query string as the sole argument and returns a query.Query
+        object.
         """
         
-        tr = self.term_reader()
-        try:
-            return (fieldname, text) in tr
-        finally:
-            tr.close()
+        if parser is None:
+            from whoosh.qparser import QueryParser
+            parser = QueryParser(self.schema)
+            
+        return self.searcher().search(parser.parse(querystring), **kwargs)
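
A short usage sketch for find(), not part of this commit. The open index "ix" and the LoggingParser wrapper are assumptions; the wrapper only relies on the documented contract that a parser expose a parse(querystring) method returning a query.Query object.

# Illustrative sketch only; "ix" and LoggingParser are assumptions.
results = ix.find(u"hello world")    # parsed with the default qparser.QueryParser

class LoggingParser(object):
    # Any object with a parse(querystring) -> query.Query method is accepted.
    def __init__(self, schema):
        from whoosh.qparser import QueryParser
        self._parser = QueryParser(schema)
    def parse(self, querystring):
        print "parsing:", querystring    # Python 2 syntax, as in the rest of this code base
        return self._parser.parse(querystring)

results = ix.find(u"hello world", parser = LoggingParser(ix.schema))
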
     
-    def stored(self, docnum):
-        """
-        Returns the stored fields of the given document number.
-        
-        Note that this convenience method opens and closes a temporary DocReader.
-        If you are planning to call it multiple times, it's more efficient to
-        create your own DocReader.
-        """
-        
-        dr = self.doc_reader()
-        try:
-            fields = dr[docnum]
-        finally:
-            dr.close()
-        
-        return fields
-    
-    def up_to_date(self):
-        """
-        Returns true if this object represents the current generation of
-        the index.
-        """
-        
-        return self.generation == _last_generation(self.storage, self.name)
-    
-    def last_modified(self):
-        """
-        Returns the last modified time of this index.
-        """
-        
-        return _last_modified(self.storage, self.name)
-    
-    def next_segment_name(self):
-        """
-        Returns the name of the next segment in sequence.
-        """
-        
-        self.counter += 1
-        return "_%s_%s" % (self.name, self.counter)
-    
-    def reload(self):
-        """
-        Reloads information from this index/generation's files on disk.
-        This will NOT update the object to a later generation.
-        """
-        
-        segments, self.schema, self.counter = _read_index_file(self.storage, self.name, self.generation)
-        self._set_segments(segments)
-    
-    def refresh(self):
-        """
-        Returns the latest generation of this index.
-        """
-        return self.__class__(self.storage, indexname = self.name)
-    
-    def _set_segments(self, segments):
-        """
-        Sets this object's segment information. This is called by a writer
-        to update the Index object's information after the writer commits.
-        """
-        
-        self.segments = segments
-        
-        self.doc_offsets = []
-        self.max_doc = 0
-        
-        for segment in self.segments:
-            self.doc_offsets.append(self.max_doc)
-            self.max_doc += segment.max_doc
-    
-    def _add_segment_tuples(self, segtuples):
-        segments = [Segment(name, maxdoc, termcount, maxcount, dict(fieldcounts))
-                    for name, maxdoc, termcount, maxcount, fieldcounts
-                    in segtuples]
-        self._set_segments(self.segments + segments)
-    
-    def _document_segment(self, docnum):
-        """
-        Returns the index.Segment object containing the given document
-        number.
-        """
-        
-        if len(self.doc_offsets) == 1: return 0
-        return bisect_right(self.doc_offsets, docnum) - 1
-    
-    def _segment_and_docnum(self, docnum):
-        """
-        Returns an (index.Segment, segment_docnum) tuple for the
-        given document number.
-        """
-        
-        segmentnum = self._document_segment(docnum)
-        offset = self.doc_offsets[segmentnum]
-        segment = self.segments[segmentnum]
-        return segment, docnum - offset
-    
-    def delete_document(self, docnum):
+    def delete_document(self, docnum, delete = True):
         """
         Deletes a document by number.
 
         You must call Index.commit() for the deletion to be written to disk.
         """
-        
-        segment, segdocnum = self._segment_and_docnum(docnum)
-        segment.delete_document(segdocnum)
+        self.segments.delete_document(docnum, delete = delete)
+    
+    def deleted_count(self):
+        """
+        Returns the total number of deleted documents in this index.
+        """
+        return self.segments.deleted_count()
     
     def is_deleted(self, docnum):
         """
         Returns True if a given document number is deleted but
         not yet optimized out of the index.
         
-        You must call Index.() for the deletion to be written to disk.
+        You must call Index.commit() for the deletion to be written to disk.
         """
-        
-        segment, segdocnum = self._segment_and_docnum(docnum)
-        return segment.is_deleted(segdocnum)
+        return self.segments.is_deleted(docnum)
+    
+    def has_deletions(self):
+        """
+        Returns True if this index has documents that are marked
+        deleted but haven't been optimized out of the index yet.
+        This includes deletions that haven't been written to disk
+        with Index.commit() yet.
+        """
+        return self.segments.has_deletions()
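
A sketch of the deletion workflow described by the docstrings above, not part of this commit; "ix" is a hypothetical open Index, and treating delete=False as an "undelete" switch is an assumption based on the keyword's default value.

# Illustrative sketch only; "ix" is an assumption.
ix.delete_document(5)                  # mark document 5 as deleted
assert ix.is_deleted(5)
assert ix.has_deletions()
count = ix.deleted_count()             # number of documents marked deleted
ix.delete_document(5, delete = False)  # assumed to clear the deletion mark
ix.delete_document(5)
ix.commit()                            # deletions reach disk only on commit()
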
     
     def delete_by_term(self, fieldname, text, searcher = None):
         """
         """
         
         import query
-        q = query.Term(fieldname, text, searcher = searcher)
-        return self.delete_by_query(q)
+        q = query.Term(fieldname, text)
+        return self.delete_by_query(q, searcher = searcher)
     
     def delete_by_query(self, q, searcher = None):
         """
         finally:
             if searcher is None:
                 s.close()
+        
+        return count
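
And a sketch of deleting by content rather than by document number, not part of this commit; "ix" and the "path" field are assumptions.

# Illustrative sketch only; "ix" and the "path" field are assumptions.
n = ix.delete_by_term("path", u"/a/b/c")    # returns the number of documents deleted

from whoosh import query
n = ix.delete_by_query(query.Term("path", u"/a/b/c"))

ix.commit()
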
+
+
+class SegmentSet(object):
+    def __init__(self, segments = None):
+        if segments is None:
+            self.segments = []
+        else:
+            self.segments = segments
+        
+        self._doc_offsets = self.doc_offsets()
+            
+    def __len__(self):
+        return len(self.segments)
+    
+    def __iter__(self):
+        return iter(self.segments)
+    
+    def append(self, segment):
+        if self._doc_offsets:
+            self._doc_offsets.append(self._doc_offsets[-1] + self.segments[-1].doc_count_all())
+        else:
+            self._doc_offsets = [0]
+        self.segments.append(segment)
+    
+    def __getitem__(self, n):
+        return self.segments.__getitem__(n)
+    
+    def _document_segment(self, docnum):
+        """
+        Returns the index.Segment object containing the given document
+        number.
+        """
+        
+        offsets = self._doc_offsets
+        if len(offsets) == 1: return 0
+        return bisect_right(offsets, docnum) - 1
+    
+    def _segment_and_docnum(self, docnum):
+        """
+        Returns an (index.Segment, segment_docnum) tuple for the
+        given document number.
+        """
+        
+        segmentnum = self._document_segment(docnum)
+        offset = self._doc_offsets[segmentnum]
+        segment = self.segments[segmentnum]
+        return segment, docnum - offset
+    
+    def copy(self):
+        return SegmentSet([s.copy() for s in self.segments])
+    
+    def doc_offsets(self):
+        offsets = []
+        base = 0
+        for s in self.segments:
+            offsets.append(base)
+            base += s.doc_count_all()
+        return offsets
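
The offsets built here are what let _document_segment() and _segment_and_docnum() above map an index-wide document number to a (segment, local docnum) pair via bisect_right. A worked example, not part of this commit, using hypothetical segment sizes:

# Worked example only: segments holding 10, 5 and 20 documents
# produce doc_offsets() == [0, 10, 15].
from bisect import bisect_right

offsets = [0, 10, 15]
docnum = 12                                      # index-wide document number
segmentnum = bisect_right(offsets, docnum) - 1   # -> 1, the second segment
segment_docnum = docnum - offsets[segmentnum]    # -> 2 within that segment
assert (segmentnum, segment_docnum) == (1, 2)
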
+    
+    def doc_count_all(self):
+        """
+        Returns the total number of documents, DELETED or
+        UNDELETED, in this set.
+        """
+        return sum(s.doc_count_all() for s in self.segments)
+    
+    def doc_count(self):
+        """
+        Returns the number of undeleted documents.
+        """
+        return sum(s.doc_count() for s in self.segments)
     
     def has_deletions(self):
         """
         This includes deletions that haven't been written to disk
         with Index.commit() yet.
         """
-        
-        for segment in self.segments:
-            if segment.has_deletions(): return True
-        return False
+        return any(s.has_deletions() for s in self.segments)
     
-    def optimize(self):
+    def delete_document(self, docnum, delete = True):
         """
-        Optimizes this index's segments.
-        
-        This opens and closes a writing.IndexWriter object, so it may