Commits

Matt Chaput committed 95da3b3

Key value implementation.

Files changed (162)

benchmark/reuters.py

 import gzip, os.path
 
 from whoosh import analysis, fields, index, qparser, query
-from whoosh.support.bench import Bench, Spec
+from whoosh.kv.pylmdb import LMDB
+from whoosh.kv.sqlite import Sqlite
+from whoosh.compat import xrange
 from whoosh.util import now
 
 
-class Reuters(Spec):
-    name = "reuters"
-    filename = "reuters21578.txt.gz"
-    main_field = "text"
-    headline_text = "headline"
+ana = analysis.StandardAnalyzer()
+schema = fields.Schema(id=fields.ID(stored=True),
+                       headline=fields.STORED,
+                       text=fields.TEXT(analyzer=ana, stored=True))
 
-    def whoosh_schema(self):
-        #ana = analysis.StemmingAnalyzer()
-        ana = analysis.StandardAnalyzer()
-        schema = fields.Schema(id=fields.ID(stored=True),
-                               headline=fields.STORED,
-                               text=fields.TEXT(analyzer=ana, stored=True))
-        return schema
 
-    def zcatalog_setup(self, cat):
-        from zcatalog import indexes  #@UnresolvedImport
-        cat["id"] = indexes.FieldIndex(field_name="id")
-        cat["headline"] = indexes.TextIndex(field_name="headline")
-        cat["body"] = indexes.TextIndex(field_name="text")
+def documents():
+    dirpath = os.path.dirname(__file__)
+    path = os.path.join(dirpath, "reuters21578.txt.gz")
+    f = gzip.GzipFile(path)
+    for line in f:
+        aid, text = line.decode("latin1").split("\t")
+        yield {"id": aid, "text": text, "headline": text[:70]}
 
-    def documents(self):
-        path = os.path.join(self.options.dir, self.filename)
-        f = gzip.GzipFile(path)
 
-        for line in f:
-            id, text = line.decode("latin1").split("\t")
-            yield {"id": id, "text": text, "headline": text[:70]}
+indexdir = "/Users/matt/dev/reuters.index"
+dbclass = None
 
 
-if __name__ == "__main__":
-    Bench().run(Reuters)
+def make_index():
+    t = now()
+    ix = index.create_in(indexdir, schema, dbclass=dbclass, clear=True)
+    with ix.writer() as w:
+        for doc in documents():
+            w.add_document(**doc)
+    print(now() - t)
+
+
+def run_search():
+    ix = index.open_dir(indexdir, dbclass=dbclass)
+    with ix.searcher() as s:
+        words = list(s.lexicon("text"))
+        domain = [words[i] for i
+                  in xrange(0, len(words), int(len(words) / 1000.0))]
+        print(s.doc_count(), len(domain))
+
+        t = now()
+        for word in domain:
+            q = query.Term("text", word)
+            r = s.search(q)
+        print(now() - t)
+
+
+from cProfile import run
+
+make_index()
+# run("run_search()", sort="time")
+run_search()
+
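For illustration only (not part of the commit): based on the imports and the dbclass/clear arguments used in the benchmark above, pointing an ordinary index at one of the new key-value backends would presumably look something like this sketch (the directory name is hypothetical):

    import os
    from whoosh import fields, index
    from whoosh.kv.sqlite import Sqlite

    schema = fields.Schema(id=fields.ID(stored=True),
                           text=fields.TEXT(stored=True))
    dirname = "kv_test.index"  # hypothetical location
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    # dbclass selects the key-value backend; clear=True wipes any existing index
    ix = index.create_in(dirname, schema, dbclass=Sqlite, clear=True)
    with ix.writer() as w:
        w.add_document(id=u"1", text=u"hello key value world")
    with ix.searcher() as s:
        print(s.doc_count())  # expected: 1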

docs/source/recipes.rst

 ------------------------------------
 ::
 
-    # Including documents that are deleted but not yet optimized away
-    numdocs = searcher.doc_count_all()
-
-    # Not including deleted documents
     numdocs = searcher.doc_count()
 
 

docs/source/tech/backend.rst

 
   * :meth:`whoosh.index.Index.delete_document`
 
-  * :meth:`whoosh.index.Index.doc_count_all` -- if the backend has delayed
-    deletion.
-
 * Indexes that require/support versioning/transactions *may* implement the following methods.
 
   * :meth:`whoosh.index.Index.latest_generation`
 
   * :meth:`whoosh.reading.IndexReader.stored_fields`
 
-  * :meth:`whoosh.reading.IndexReader.doc_count_all`
 
   * :meth:`whoosh.reading.IndexReader.doc_count`
 

scripts/read_checkpoint.py

                 print(hit.fields())
         assert len(r) == 1, len(r)
         assert r[0]["dt"] == dt
-
-print("Done")

src/whoosh/__init__.py

 
 
 def versionstring(build=True, extra=True):
-    """Returns the version number of Whoosh as a string.
+    """
+    Returns the version number of Whoosh as a string.
 
     :param build: Whether to include the build number in the string.
     :param extra: Whether to include alpha/beta/rc etc. tags. Only

src/whoosh/analysis/__init__.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-"""Classes and functions for turning a piece of text into an indexable stream
+"""
+Classes and functions for turning a piece of text into an indexable stream
 of "tokens" (usually equivalent to words). There are three general classes
 involved in analysis:
 
 a filter first or a tokenizer after the first item).
 """
 
-from whoosh.analysis.acore import *
+from whoosh.analysis.analysis import *
 from whoosh.analysis.tokenizers import *
 from whoosh.analysis.filters import *
 from whoosh.analysis.morph import *

src/whoosh/analysis/acore.py

-# Copyright 2007 Matt Chaput. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#    1. Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#
-#    2. Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
-# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# The views and conclusions contained in the software and documentation are
-# those of the authors and should not be interpreted as representing official
-# policies, either expressed or implied, of Matt Chaput.
-
-from whoosh.compat import iteritems
-
-
-# Exceptions
-
-class CompositionError(Exception):
-    pass
-
-
-# Utility functions
-
-def unstopped(tokenstream):
-    """Removes tokens from a token stream where token.stopped = True.
-    """
-    return (t for t in tokenstream if not t.stopped)
-
-
-def entoken(textstream, positions=False, chars=False, start_pos=0,
-            start_char=0, **kwargs):
-    """Takes a sequence of unicode strings and yields a series of Token objects
-    (actually the same Token object over and over, for performance reasons),
-    with the attributes filled in with reasonable values (for example, if
-    ``positions`` or ``chars`` is True, the function assumes each token was
-    separated by one space).
-    """
-
-    pos = start_pos
-    char = start_char
-    t = Token(positions=positions, chars=chars, **kwargs)
-
-    for text in textstream:
-        t.text = text
-
-        if positions:
-            t.pos = pos
-            pos += 1
-
-        if chars:
-            t.startchar = char
-            char = char + len(text)
-            t.endchar = char
-
-        yield t
-
-
-# Token object
-
-class Token(object):
-    """
-    Represents a "token" (usually a word) extracted from the source text being
-    indexed.
-
-    See "Advanced analysis" in the user guide for more information.
-
-    Because object instantiation in Python is slow, tokenizers should create
-    ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
-    each time.
-
-    This trick means that consumers of tokens (i.e. filters) must never try to
-    hold onto the token object between loop iterations, or convert the token
-    generator into a list. Instead, save the attributes between iterations,
-    not the object::
-
-        def RemoveDuplicatesFilter(self, stream):
-            # Removes duplicate words.
-            lasttext = None
-            for token in stream:
-                # Only yield the token if its text doesn't
-                # match the previous token.
-                if lasttext != token.text:
-                    yield token
-                lasttext = token.text
-
-    ...or, call token.copy() to get a copy of the token object.
-    """
-
-    def __init__(self, positions=False, chars=False, removestops=True, mode='',
-                 **kwargs):
-        """
-        :param positions: Whether tokens should have the token position in the
-            'pos' attribute.
-        :param chars: Whether tokens should have character offsets in the
-            'startchar' and 'endchar' attributes.
-        :param removestops: whether to remove stop words from the stream (if
-            the tokens pass through a stop filter).
-        :param mode: contains a string describing the purpose for which the
-            analyzer is being called, i.e. 'index' or 'query'.
-        """
-
-        self.positions = positions
-        self.chars = chars
-        self.stopped = False
-        self.boost = 1.0
-        self.removestops = removestops
-        self.mode = mode
-        self.__dict__.update(kwargs)
-
-    def __repr__(self):
-        parms = ", ".join("%s=%r" % (name, value)
-                          for name, value in iteritems(self.__dict__))
-        return "%s(%s)" % (self.__class__.__name__, parms)
-
-    def copy(self):
-        # This is faster than using the copy module
-        return Token(**self.__dict__)
-
-
-# Composition support
-
-class Composable(object):
-    is_morph = False
-
-    def __or__(self, other):
-        from whoosh.analysis.analyzers import CompositeAnalyzer
-
-        if not isinstance(other, Composable):
-            raise TypeError("%r is not composable with %r" % (self, other))
-        return CompositeAnalyzer(self, other)
-
-    def __repr__(self):
-        attrs = ""
-        if self.__dict__:
-            attrs = ", ".join("%s=%r" % (key, value)
-                              for key, value
-                              in iteritems(self.__dict__))
-        return self.__class__.__name__ + "(%s)" % attrs
-
-    def has_morph(self):
-        return self.is_morph

src/whoosh/analysis/analysis.py

+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.compat import iteritems
+
+
+# Exceptions
+
+class CompositionError(Exception):
+    pass
+
+
+# Utility functions
+
+def unstopped(tokenstream):
+    """
+    Removes tokens from a token stream where token.stopped = True.
+    """
+    return (t for t in tokenstream if not t.stopped)
+
+
+def entoken(textstream, positions=False, chars=False, start_pos=0,
+            start_char=0, **kwargs):
+    """
+    Takes a sequence of unicode strings and yields a series of Token objects
+    (actually the same Token object over and over, for performance reasons),
+    with the attributes filled in with reasonable values (for example, if
+    ``positions`` or ``chars`` is True, the function assumes each token was
+    separated by one space).
+    """
+
+    pos = start_pos
+    char = start_char
+    t = Token(positions=positions, chars=chars, **kwargs)
+
+    for text in textstream:
+        t.text = text
+
+        if positions:
+            t.pos = pos
+            pos += 1
+
+        if chars:
+            t.startchar = char
+            char = char + len(text)
+            t.endchar = char
+
+        yield t
+
+
+# Token object
+
+class Token(object):
+    """
+    Represents a "token" (usually a word) extracted from the source text being
+    indexed.
+
+    See "Advanced analysis" in the user guide for more information.
+
+    Because object instantiation in Python is slow, tokenizers should create
+    ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
+    each time.
+
+    This trick means that consumers of tokens (i.e. filters) must never try to
+    hold onto the token object between loop iterations, or convert the token
+    generator into a list. Instead, save the attributes between iterations,
+    not the object::
+
+        def RemoveDuplicatesFilter(self, stream):
+            # Removes duplicate words.
+            lasttext = None
+            for token in stream:
+                # Only yield the token if its text doesn't
+                # match the previous token.
+                if lasttext != token.text:
+                    yield token
+                lasttext = token.text
+
+    ...or, call token.copy() to get a copy of the token object.
+    """
+
+    def __init__(self, positions=False, chars=False, removestops=True, mode='',
+                 payload=None, nomorph=False, **kwargs):
+        """
+        :param positions: Whether tokens should have the token position in the
+            'pos' attribute.
+        :param chars: Whether tokens should have character offsets in the
+            'startchar' and 'endchar' attributes.
+        :param removestops: whether to remove stop words from the stream (if
+            the tokens pass through a stop filter).
+        :param mode: contains a string describing the purpose for which the
+            analyzer is being called, i.e. 'index' or 'query'.
+        :param nomorph: Whether filters that transform the text of the token
+            should keep an un-transformed version in the ``unmorphed``
+            attribute.
+        """
+
+        self.boost = 1.0
+        self.positions = positions
+        self.chars = chars
+        self.payload = payload
+        self.stopped = False
+        self.removestops = removestops
+        self.mode = mode
+        self.nomorph = nomorph
+        self.__dict__.update(kwargs)
+
+    def __repr__(self):
+        parms = ", ".join("%s=%r" % (name, value)
+                          for name, value in iteritems(self.__dict__))
+        return "%s(%s)" % (self.__class__.__name__, parms)
+
+    def copy(self):
+        # This is faster than using the copy module
+        return Token(**self.__dict__)
+
+
+# Composition support
+
+class Composable(object):
+    is_morph = False
+
+    def __or__(self, other):
+        from whoosh.analysis.analyzers import CompositeAnalyzer
+
+        if not isinstance(other, Composable):
+            raise TypeError("%r is not composable with %r" % (self, other))
+        return CompositeAnalyzer(self, other)
+
+    def __repr__(self):
+        attrs = ""
+        if self.__dict__:
+            attrs = ", ".join("%s=%r" % (key, value)
+                              for key, value
+                              in iteritems(self.__dict__))
+        return self.__class__.__name__ + "(%s)" % attrs
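
As an aside, the Token docstring above warns that tokenizers reuse a single Token instance, so anything that buffers tokens across loop iterations has to copy them. A minimal hypothetical sketch of that rule (not part of the commit):

    def pairwise_texts(stream):
        # Yields (previous, current) text pairs from a token stream
        prev = None
        for t in stream:
            if prev is not None:
                yield prev.text, t.text
            prev = t.copy()  # copy(), because `t` is the same reused object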

src/whoosh/analysis/analyzers.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-from whoosh.analysis.acore import Composable, CompositionError
+from whoosh.analysis import Composable, CompositionError
 from whoosh.analysis.tokenizers import Tokenizer
 from whoosh.analysis.filters import LowercaseFilter
 from whoosh.analysis.filters import StopFilter, STOP_WORDS
 # Analyzers
 
 class Analyzer(Composable):
-    """ Abstract base class for analyzers.
+    """
+    Abstract base class for analyzers.
     """
 
     def __repr__(self):
     def clean(self):
         pass
 
+    @property
+    def is_morph(self):
+        return False
+
 
 class CompositeAnalyzer(Analyzer):
     def __init__(self, *composables):
         return "%s(%s)" % (self.__class__.__name__,
                            ", ".join(repr(item) for item in self.items))
 
-    def __call__(self, value, no_morph=False, **kwargs):
+    def __call__(self, value, **kwargs):
         items = self.items
         # Start with tokenizer
         gen = items[0](value, **kwargs)
         # Run filters
-        for item in items[1:]:
-            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
-                gen = item(gen)
+        for filter_ in items[1:]:
+            if filter_.is_morph and kwargs.get("nomorph"):
+                continue
+            gen = filter_(gen)
         return gen
 
     def __getitem__(self, item):
                 and self.__class__ is other.__class__
                 and self.items == other.items)
 
+    @property
+    def is_morph(self):
+        return any(item.is_morph for item in self.items)
+
     def clean(self):
         for item in self.items:
             if hasattr(item, "clean"):
                 item.clean()
 
-    def has_morph(self):
-        return any(item.is_morph for item in self.items)
-
 
 # Functions that return composed analyzers
 
 def IDAnalyzer(lowercase=False):
-    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
+    """
+    Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
     desired.
     """
 
 
 
 def KeywordAnalyzer(lowercase=False, commas=False):
-    """Parses whitespace- or comma-separated tokens.
+    """
+    Parses whitespace- or comma-separated tokens.
 
     >>> ana = KeywordAnalyzer()
     >>> [token.text for token in ana("Hello there, this is a TEST")]
 
 
 def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
-    """Deprecated, just use a RegexTokenizer directly.
+    """
+    Deprecated, just use a RegexTokenizer directly.
     """
 
     return RegexTokenizer(expression=expression, gaps=gaps)
 
 
 def SimpleAnalyzer(expression=default_pattern, gaps=False):
-    """Composes a RegexTokenizer with a LowercaseFilter.
+    """
+    Composes a RegexTokenizer with a LowercaseFilter.
 
     >>> ana = SimpleAnalyzer()
     >>> [token.text for token in ana("Hello there, this is a TEST")]
 
 def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                      minsize=2, maxsize=None, gaps=False):
-    """Composes a RegexTokenizer with a LowercaseFilter and optional
+    """
+    Composes a RegexTokenizer with a LowercaseFilter and optional
     StopFilter.
 
     >>> ana = StandardAnalyzer()
 def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                      minsize=2, maxsize=None, gaps=False, stemfn=stem,
                      ignore=None, cachesize=50000):
-    """Composes a RegexTokenizer with a lower case filter, an optional stop
+    """
+    Composes a RegexTokenizer with a lower case filter, an optional stop
     filter, and a stemming filter.
 
     >>> ana = StemmingAnalyzer()
 def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                   maxsize=None, gaps=True, splitwords=True, splitnums=True,
                   mergewords=False, mergenums=False):
-    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
+    """
+    Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
     StopFilter.
 
     >>> ana = FancyAnalyzer()
 
 def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                      cachesize=50000):
-    """Configures a simple analyzer for the given language, with a
+    """
+    Configures a simple analyzer for the given language, with a
     LowercaseFilter, StopFilter, and StemFilter.
 
     >>> ana = LanguageAnalyzer("es")
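
For reference, the reworked CompositeAnalyzer.__call__ above skips any filter whose is_morph is true when the caller passes nomorph. Assuming the package still exports StemmingAnalyzer at the top level, usage would presumably look like this sketch (not part of the commit):

    from whoosh.analysis import StemmingAnalyzer

    ana = StemmingAnalyzer()
    # Normal call: the StemFilter runs, so both words should stem to "render"
    print([t.text for t in ana(u"rendering renders")])
    # With nomorph=True, morphological filters are skipped entirely
    print([t.text for t in ana(u"rendering renders", nomorph=True)])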

src/whoosh/analysis/filters.py

 from itertools import chain
 
 from whoosh.compat import next, xrange
-from whoosh.analysis.acore import Composable
+from whoosh.analysis import Composable
 from whoosh.util.text import rcompile
 
 
 ) | (                      # or...
     \w+([:.]?\w+)*         # word characters, with opt. internal colons/dots
 )
-""", verbose=True)
+"""
+, verbose=True)
 
 
 # Filters
 
 class Filter(Composable):
-    """Base class for Filter objects. A Filter subclass must implement a
+    """
+    Base class for Filter objects. A Filter subclass must implement a
     filter() method that takes a single argument, which is an iterator of Token
     objects, and yield a series of Token objects in return.
-
-    Filters that do morphological transformation of tokens (e.g. stemming)
-    should set their ``is_morph`` attribute to True.
     """
 
     def __eq__(self, other):
 
 
 class PassFilter(Filter):
-    """An identity filter: passes the tokens through untouched.
+    """
+    An identity filter: passes the tokens through untouched.
     """
 
     def __call__(self, tokens):
 
 
 class LoggingFilter(Filter):
-    """Prints the contents of every filter that passes through as a debug
+    """
+    Prints the contents of every filter that passes through as a debug
     log entry.
     """
 
 
 
 class MultiFilter(Filter):
-    """Chooses one of two or more sub-filters based on the 'mode' attribute
+    """
+    Chooses one of two or more sub-filters based on the 'mode' attribute
     of the token stream.
     """
 
     default_filter = PassFilter()
 
     def __init__(self, **kwargs):
-        """Use keyword arguments to associate mode attribute values with
+        """
+        Use keyword arguments to associate mode attribute values with
         instantiated filters.
 
         >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False)
 
 
 class TeeFilter(Filter):
-    """Interleaves the results of two or more filters (or filter chains).
+    """
+    Interleaves the results of two or more filters (or filter chains).
 
     NOTE: because it needs to create copies of each token for each sub-filter,
     this filter is quite slow.
 
 
 class ReverseTextFilter(Filter):
-    """Reverses the text of each token.
+    """
+    Reverses the text of each token.
 
     >>> ana = RegexTokenizer() | ReverseTextFilter()
     >>> [token.text for token in ana("hello there")]
 
 
 class LowercaseFilter(Filter):
-    """Uses unicode.lower() to lowercase token text.
+    """
+    Uses unicode.lower() to lowercase token text.
 
     >>> rext = RegexTokenizer()
     >>> stream = rext("This is a TEST")
 
 
 class StripFilter(Filter):
-    """Calls unicode.strip() on the token text.
+    """
+    Calls unicode.strip() on the token text.
     """
 
     def __call__(self, tokens):
 
 
 class StopFilter(Filter):
-    """Marks "stop" words (words too common to index) in the stream (and by
+    """
+    Marks "stop" words (words too common to index) in the stream (and by
     default removes them).
 
     Make sure you precede this filter with a :class:`LowercaseFilter`.
 
 
 class CharsetFilter(Filter):
-    """Translates the text of tokens by calling unicode.translate() using the
+    """
+    Translates the text of tokens by calling unicode.translate() using the
     supplied character mapping object. This is useful for case and accent
     folding.
 
 
 
 class DelimitedAttributeFilter(Filter):
-    """Looks for delimiter characters in the text of each token and stores the
+    """
+    Looks for delimiter characters in the text of each token and stores the
     data after the delimiter in a named attribute on the token.
 
     The defaults are set up to use the ``^`` character as a delimiter and store
 
     >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost")
     >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter()
-    >>> for t in ana(u("image render^2 file^0.5"))
+    >>> for t in ana(u"image render^2 file^0.5"):
     ...    print("%r %f" % (t.text, t.boost))
     'image' 1.0
     'render' 2.0
 
 
 class SubstitutionFilter(Filter):
-    """Performs a regular expression substitution on the token text.
+    """
+    Performs a regular expression substitution on the token text.
 
     This is especially useful for removing text from tokens, for example
     hyphens::

src/whoosh/analysis/intraword.py

 
 
 class CompoundWordFilter(Filter):
-    """Given a set of words (or any object with a ``__contains__`` method),
+    """
+    Given a set of words (or any object with a ``__contains__`` method),
     break any tokens in the stream that are composites of words in the word set
     into their individual parts.
 
 
 
 class BiWordFilter(Filter):
-    """Merges adjacent tokens into "bi-word" tokens, so that for example::
+    """
+    Merges adjacent tokens into "bi-word" tokens, so that for example::
 
         "the", "sign", "of", "four"
 
 
 
 class ShingleFilter(Filter):
-    """Merges a certain number of adjacent tokens into multi-word tokens, so
+    """
+    Merges a certain number of adjacent tokens into multi-word tokens, so
     that for example::
 
         "better", "a", "witty", "fool", "than", "a", "foolish", "wit"
 
 
 class IntraWordFilter(Filter):
-    """Splits words into subwords and performs optional transformations on
+    """
+    Splits words into subwords and performs optional transformations on
     subword groups. This filter is functionally based on yonik's
     WordDelimiterFilter in Solr, but shares no code with it.
 
     (See :class:`MultiFilter`.)
     """
 
-    is_morph = True
-
     __inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool,
                          mergewords=bool, mergenums=bool)
 
+    is_morph = True
+
     def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"),
                  splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
         self.delims = re.escape(delims)
 
         # Expression for text between delimiter characters
-        self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE)
+        self.between = re.compile(u"[^%s]+" % (self.delims,), re.UNICODE)
         # Expression for removing "'s" from the end of sub-words
-        dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase,
+        dispat = u"(?<=[%s%s])'[Ss](?=$|[%s])" % (lowercase, uppercase,
                                                     self.delims)
         self.possessive = re.compile(dispat, re.UNICODE)
 
         # Expression for finding case and letter-number transitions
-        lower2upper = u("[%s][%s]") % (lowercase, uppercase)
-        letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits)
-        digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase)
+        lower2upper = u"[%s][%s]" % (lowercase, uppercase)
+        letter2digit = u"[%s%s][%s]" % (lowercase, uppercase, digits)
+        digit2letter = u"[%s][%s%s]" % (digits, lowercase, uppercase)
         if splitwords and splitnums:
-            splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit,
+            splitpat = u"(%s|%s|%s)" % (lower2upper, letter2digit,
                                           digit2letter)
             self.boundary = re.compile(splitpat, re.UNICODE)
         elif splitwords:
             self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
         elif splitnums:
-            numpat = u("(%s|%s)") % (letter2digit, digit2letter)
+            numpat = u"(%s|%s)" % (letter2digit, digit2letter)
             self.boundary = re.compile(numpat, re.UNICODE)
 
         self.splitting = splitwords or splitnums
         # counter.
         newpos = None
         for t in tokens:
+            if t.nomorph:
+                yield t
+                continue
+
             text = t.text
-
             # If this is the first token we've seen, use it to set the new
             # position counter
             if newpos is None:

src/whoosh/analysis/morph.py

 
 
 class StemFilter(Filter):
-    """Stems (removes suffixes from) the text of tokens using the Porter
+    """
+    Stems (removes suffixes from) the text of tokens using the Porter
     stemming algorithm. Stemming attempts to reduce multiple forms of the same
     root word (for example, "rendering", "renders", "rendered", etc.) to a
     single word in the index.
     stemmers in that library.
     """
 
-    __inittypes__ = dict(stemfn=object, ignore=list)
-
     is_morph = True
 
     def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
         # Can't pickle a dynamic function, so we have to remove the _stem
         # attribute from the state
         return dict([(k, self.__dict__[k]) for k in self.__dict__
-                      if k != "_stem"])
+                     if k != "_stem"])
 
     def __setstate__(self, state):
         # Check for old instances of StemFilter class, which didn't have a
         ignore = self.ignore
 
         for t in tokens:
+            if t.nomorph:
+                yield t
+                continue
+
             if not t.stopped:
                 text = t.text
                 if text not in ignore:
 
 
 class PyStemmerFilter(StemFilter):
-    """This is a simple subclass of StemFilter that works with the py-stemmer
+    """
+    This is a simple subclass of StemFilter that works with the py-stemmer
     third-party library. You must have the py-stemmer library installed to use
     this filter.
 
         self._stem = self._get_stemmer_fn()
 
     def algorithms(self):
-        """Returns a list of stemming algorithms provided by the py-stemmer
+        """
+        Returns a list of stemming algorithms provided by the py-stemmer
         library.
         """
 
 
 
 class DoubleMetaphoneFilter(Filter):
-    """Transforms the text of the tokens using Lawrence Philips's Double
+    """
+    Transforms the text of the tokens using Lawrence Philips's Double
     Metaphone algorithm. This algorithm attempts to encode words in such a way
     that similar-sounding words reduce to the same code. This may be useful for
     fields containing the names of people and places, and other uses where
         combine = self.combine
 
         for t in tokens:
+            if t.nomorph:
+                yield t
+                continue
+
             if combine:
                 yield t
 

src/whoosh/analysis/ngrams.py

 
 from whoosh.compat import text_type
 from whoosh.compat import xrange
-from whoosh.analysis.acore import Token
+from whoosh.analysis import Token
 from whoosh.analysis.filters import Filter, LowercaseFilter
 from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer
 
 # Tokenizer
 
 class NgramTokenizer(Tokenizer):
-    """Splits input text into N-grams instead of words.
+    """
+    Splits input text into N-grams instead of words.
 
     >>> ngt = NgramTokenizer(4)
     >>> [token.text for token in ngt("hi there")]
 # Filter
 
 class NgramFilter(Filter):
-    """Splits token text into N-grams.
+    """
+    Splits token text into N-grams.
 
     >>> rext = RegexTokenizer()
     >>> stream = rext("hello there")
 # Analyzers
 
 def NgramAnalyzer(minsize, maxsize=None):
-    """Composes an NgramTokenizer and a LowercaseFilter.
+    """
+    Composes an NgramTokenizer and a LowercaseFilter.
 
     >>> ana = NgramAnalyzer(4)
     >>> [token.text for token in ana("hi there")]

src/whoosh/analysis/tokenizers.py

 # policies, either expressed or implied, of Matt Chaput.
 
 from whoosh.compat import u, text_type
-from whoosh.analysis.acore import Composable, Token
+from whoosh.analysis import Composable, Token
 from whoosh.util.text import rcompile
 
 
 
 # Tokenizers
 
-
 class Tokenizer(Composable):
-    """Base class for Tokenizers.
+    """
+    Base class for Tokenizers.
     """
 
     def __eq__(self, other):
 
 
 class IDTokenizer(Tokenizer):
-    """Yields the entire input string as a single token. For use in indexed but
+    """
+    Yields the entire input string as a single token. For use in indexed but
     untokenized fields, such as a document's path.
 
     >>> idt = IDTokenizer()
     Uses a regular expression to extract tokens from text.
 
     >>> rex = RegexTokenizer()
-    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
+    >>> [token.text for token in rex(u"hi there 3.141 big-time under_score")]
     ["hi", "there", "3.141", "big", "time", "under_score"]
     """
 
 
     def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                  removestops=True, start_pos=0, start_char=0, tokenize=True,
-                 mode='', **kwargs):
+                 mode='', boost=1.0, **kwargs):
         """
         :param value: The unicode string to tokenize.
         :param positions: Whether to record token positions in the token.
                   **kwargs)
         if not tokenize:
             t.original = t.text = value
-            t.boost = 1.0
+            t.boost = boost
             if positions:
                 t.pos = start_pos
             if chars:
             # The default: expression matches are used as tokens
             for pos, match in enumerate(self.expression.finditer(value)):
                 t.text = match.group(0)
-                t.boost = 1.0
+                t.boost = boost
                 if keeporiginal:
                     t.original = t.text
                 t.stopped = False
                 text = value[start:end]
                 if text:
                     t.text = text
-                    t.boost = 1.0
+                    t.boost = boost
                     if keeporiginal:
                         t.original = t.text
                     t.stopped = False
             # yield the last bit of text as a final token.
             if prevend < len(value):
                 t.text = value[prevend:]
-                t.boost = 1.0
+                t.boost = boost
                 if keeporiginal:
                     t.original = t.text
                 t.stopped = False
 
 
 class CharsetTokenizer(Tokenizer):
-    """Tokenizes and translates text according to a character mapping object.
+    """
+    Tokenizes and translates text according to a character mapping object.
     Characters that map to None are considered token break characters. For all
     other characters the map is used to translate the character. This is useful
     for case and accent folding.
                 t.endchar = start_char + len(value)
             yield t
         else:
-            text = u("")
+            text = u""
             charmap = self.charmap
             pos = start_pos
             startchar = currentchar = start_char
                             t.endchar = currentchar
                         yield t
                     startchar = currentchar + 1
-                    text = u("")
+                    text = u""
 
                 currentchar += 1
 
 
 
 def SpaceSeparatedTokenizer():
-    """Returns a RegexTokenizer that splits tokens by whitespace.
+    """
+    Returns a RegexTokenizer that splits tokens by whitespace.
 
     >>> sst = SpaceSeparatedTokenizer()
     >>> [token.text for token in sst("hi there big-time, what's up")]
 
 
 def CommaSeparatedTokenizer():
-    """Splits tokens by commas.
+    """
+    Splits tokens by commas.
 
     Note that the tokenizer calls unicode.strip() on each match of the regular
     expression.
 
 
 class PathTokenizer(Tokenizer):
-    """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens
+    """
+    A simple tokenizer that given a string ``"/a/b/c"`` yields tokens
     ``["/a", "/a/b", "/a/b/c"]``.
     """
 

src/whoosh/automata/fsa.py

+from __future__ import print_function
+
+import itertools
+import operator
+import sys
+from bisect import bisect_left
+from collections import defaultdict
+
+from whoosh.compat import iteritems, next, text_type, unichr, xrange
+
+
+unull = unichr(0)
+
+
+# Marker constants
+
+class Marker(object):
+    def __init__(self, name):
+        self.name = name
+
+    def __repr__(self):
+        return "<%s>" % self.name
+
+
+EPSILON = Marker("EPSILON")
+ANY = Marker("ANY")
+
+
+# Base class
+
+class FSA(object):
+    def __init__(self, initial):
+        self.initial = initial
+        self.transitions = {}
+        self.final_states = set()
+
+    def __len__(self):
+        return len(self.all_states())
+
+    def __eq__(self, other):
+        if self.initial != other.initial:
+            return False
+        if self.final_states != other.final_states:
+            return False
+        st = self.transitions
+        ot = other.transitions
+        if list(st) != list(ot):
+            return False
+        for key in st:
+            if st[key] != ot[key]:
+                return False
+        return True
+
+    def all_states(self):
+        stateset = set(self.transitions)
+        for src, trans in iteritems(self.transitions):
+            stateset.update(trans.values())
+        return stateset
+
+    def all_labels(self):
+        labels = set()
+        for src, trans in iteritems(self.transitions):
+            labels.update(trans)
+        return labels
+
+    def get_labels(self, src):
+        return iter(self.transitions.get(src, []))
+
+    def generate_all(self, state=None, sofar=""):
+        state = self.start() if state is None else state
+        if self.is_final(state):
+            yield sofar
+        for label in sorted(self.get_labels(state)):
+            newstate = self.next_state(state, label)
+            for string in self.generate_all(newstate, sofar + label):
+                yield string
+
+    def start(self):
+        return self.initial
+
+    def next_state(self, state, label):
+        raise NotImplementedError
+
+    def is_final(self, state):
+        raise NotImplementedError
+
+    def add_transition(self, src, label, dest):
+        raise NotImplementedError
+
+    def add_final_state(self, state):
+        raise NotImplementedError
+
+    def to_dfa(self):
+        raise NotImplementedError
+
+    def accept(self, string, debug=False):
+        state = self.start()
+        if debug:
+            print("INIT:", state)
+
+        for label in string:
+            if debug:
+                print("  ", state, "->", label, "->")
+
+            state = self.next_state(state, label)
+            if not state:
+                break
+
+        if debug:
+            print(" END:", state, self.is_final(state))
+        return self.is_final(state)
+
+    def append(self, fsa):
+        self.transitions.update(fsa.transitions)
+        for state in self.final_states:
+            self.add_transition(state, EPSILON, fsa.initial)
+        self.final_states = fsa.final_states
+
+
+# Implementations
+
+class NFA(FSA):
+    def __init__(self, initial):
+        self.transitions = {}
+        self.final_states = set()
+        self.initial = initial
+
+    def dump(self, stream=sys.stdout):
+        starts = self.start()
+        for src in self.transitions:
+            beg = "@" if src in starts else " "
+            print(beg, src, file=stream)
+            xs = self.transitions[src]
+            for label in xs:
+                dests = xs[label]
+                end = "||" if self.is_final(dests) else ""
+                print("    ->", label, "->", dests, end, file=stream)
+
+    def start(self):
+        return frozenset(self._expand(set([self.initial])))
+
+    def add_transition(self, src, label, dest):
+        self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest)
+
+    def add_final_state(self, state):
+        self.final_states.add(state)
+
+    def triples(self):
+        for src, trans in iteritems(self.transitions):
+            for label, dests in iteritems(trans):
+                for dest in dests:
+                    yield src, label, dest
+
+    def is_final(self, states):
+        return bool(self.final_states.intersection(states))
+
+    def _expand(self, states):
+        transitions = self.transitions
+        frontier = set(states)
+        while frontier:
+            state = frontier.pop()
+            if state in transitions and EPSILON in transitions[state]:
+                new_states = transitions[state][EPSILON].difference(states)
+                frontier.update(new_states)
+                states.update(new_states)
+        return states
+
+    def next_state(self, states, label):
+        transitions = self.transitions
+        dest_states = set()
+        for state in states:
+            if state in transitions:
+                xs = transitions[state]
+                if label in xs:
+                    dest_states.update(xs[label])
+                if ANY in xs:
+                    dest_states.update(xs[ANY])
+        return frozenset(self._expand(dest_states))
+
+    def get_labels(self, states):
+        transitions = self.transitions
+        labels = set()
+        for state in states:
+            if state in transitions:
+                labels.update(transitions[state])
+        return labels
+
+    def embed(self, other):
+        # Copy all transitions from the other NFA into this one
+        for s, othertrans in iteritems(other.transitions):
+            trans = self.transitions.setdefault(s, {})
+            for label, otherdests in iteritems(othertrans):
+                dests = trans.setdefault(label, set())
+                dests.update(otherdests)
+
+    def insert(self, src, other, dest):
+        self.embed(other)
+
+        # Connect src to the other NFA's initial state, and the other
+        # NFA's final states to dest
+        self.add_transition(src, EPSILON, other.initial)
+        for finalstate in other.final_states:
+            self.add_transition(finalstate, EPSILON, dest)
+
+    def to_dfa(self):
+        dfa = DFA(self.start())
+        frontier = [self.start()]
+        seen = set()
+        while frontier:
+            current = frontier.pop()
+            if self.is_final(current):
+                dfa.add_final_state(current)
+            labels = self.get_labels(current)
+            for label in labels:
+                if label is EPSILON:
+                    continue
+                new_state = self.next_state(current, label)
+                if new_state not in seen:
+                    frontier.append(new_state)
+                    seen.add(new_state)
+                    if self.is_final(new_state):
+                        dfa.add_final_state(new_state)
+                if label is ANY:
+                    dfa.set_default_transition(current, new_state)
+                else:
+                    dfa.add_transition(current, label, new_state)
+        return dfa
+
+
+class DFA(FSA):
+    def __init__(self, initial):
+        self.initial = initial
+        self.transitions = {}
+        self.defaults = {}
+        self.final_states = set()
+        self.outlabels = {}
+
+    def dump(self, stream=sys.stdout):
+        for src in sorted(self.transitions):
+            beg = "@" if src == self.initial else " "
+            print(beg, src, file=stream)
+            xs = self.transitions[src]
+            for label in sorted(xs):
+                dest = xs[label]
+                end = "||" if self.is_final(dest) else ""
+                print("    ->", label, "->", dest, end, file=stream)
+        print("DEF", self.defaults)
+
+    def start(self):
+        return self.initial
+
+    def add_transition(self, src, label, dest):
+        self.transitions.setdefault(src, {})[label] = dest
+
+    def set_default_transition(self, src, dest):
+        self.defaults[src] = dest
+
+    def add_final_state(self, state):
+        self.final_states.add(state)
+
+    def is_final(self, state):
+        return state in self.final_states
+
+    def next_state(self, src, label):
+        trans = self.transitions.get(src, {})
+        return trans.get(label, self.defaults.get(src, None))
+
+    def next_valid_string(self, string, asbytes=False):
+        state = self.start()
+        stack = []
+
+        # Follow the DFA as far as possible
+        for i, label in enumerate(string):
+            stack.append((string[:i], state, label))
+            state = self.next_state(state, label)
+            if not state:
+                break
+        else:
+            stack.append((string[:i + 1], state, None))
+
+        if self.is_final(state):
+            # Word is already valid
+            return string
+
+        # Perform a 'wall following' search for the lexicographically smallest
+        # accepting state.
+        while stack:
+            path, state, label = stack.pop()
+            label = self.find_next_edge(state, label, asbytes=asbytes)
+            if label:
+                path += label
+                state = self.next_state(state, label)
+                if self.is_final(state):
+                    return path
+                stack.append((path, state, None))
+        return None
+
+    def find_next_edge(self, s, label, asbytes):
+        if label is None:
+            label = b"\x00" if asbytes else u'\0'
+        else:
+            label = (label + 1) if asbytes else unichr(ord(label) + 1)
+        trans = self.transitions.get(s, {})
+        if label in trans or s in self.defaults:
+            return label
+
+        try:
+            labels = self.outlabels[s]
+        except KeyError:
+            self.outlabels[s] = labels = sorted(trans)
+
+        pos = bisect_left(labels, label)
+        if pos < len(labels):
+            return labels[pos]
+        return None
+
+    def reachable_from(self, src, inclusive=True):
+        transitions = self.transitions
+
+        reached = set()
+        if inclusive:
+            reached.add(src)
+
+        stack = [src]
+        seen = set()
+        while stack:
+            src = stack.pop()
+            seen.add(src)
+            for _, dest in iteritems(transitions[src]):
+                reached.add(dest)
+                if dest not in seen:
+                    stack.append(dest)
+        return reached
+
+    def minimize(self):
+        transitions = self.transitions
+        initial = self.initial
+
+        # Step 1: Delete unreachable states
+        reachable = self.reachable_from(initial)
+        for src in list(transitions):
+            if src not in reachable:
+                del transitions[src]
+        final_states = self.final_states.intersection(reachable)
+        labels = self.all_labels()
+
+        # Step 2: Partition the states into equivalence sets
+        changed = True
+        parts = [final_states, reachable - final_states]
+        while changed:
+            changed = False
+            for i in xrange(len(parts)):
+                part = parts[i]
+                changed_part = False
+                for label in labels:
+                    next_part = None
+                    new_part = set()
+                    for state in part:
+                        dest = transitions[state].get(label)
+                        if dest is not None:
+                            if next_part is None:
+                                for p in parts:
+                                    if dest in p:
+                                        next_part = p
+                            elif dest not in next_part:
+                                new_part.add(state)
+                                changed = True
+                                changed_part = True
+                    if changed_part:
+                        old_part = part - new_part
+                        parts.pop(i)
+                        parts.append(old_part)
+                        parts.append(new_part)
+                        break
+
+        # Choose one state from each equivalence set and map all equivalent
+        # states to it
+        new_trans = {}
+
+        # Create mapping
+        mapping = {}
+        new_initial = None
+        for part in parts:
+            representative = part.pop()
+            if representative is initial:
+                new_initial = representative
+            mapping[representative] = representative
+            new_trans[representative] = {}
+            for state in part:
+                if state is initial:
+                    new_initial = representative
+                mapping[state] = representative
+        assert new_initial is not None
+
+        # Apply mapping to existing transitions
+        new_finals = set(mapping[s] for s in final_states)
+        for state, d in iteritems(new_trans):
+            trans = transitions[state]
+            for label, dest in iteritems(trans):
+                d[label] = mapping[dest]
+
+        # Remove dead states - non-final states with no outgoing arcs except
+        # to themselves
+        non_final_srcs = [src for src in new_trans if src not in new_finals]
+        removing = set()
+        for src in non_final_srcs:
+            dests = set(new_trans[src].values())
+            dests.discard(src)
+            if not dests:
+                removing.add(src)
+                del new_trans[src]
+        # Delete transitions to removed dead states
+        for t in new_trans.values():
+            for label in list(t):
+                if t[label] in removing:
+                    del t[label]
+
+        self.transitions = new_trans
+        self.initial = new_initial
+        self.final_states = new_finals
+
+    def to_dfa(self):
+        return self
+
+
+# Useful functions
+
+def renumber_dfa(dfa, base=0):
+    c = itertools.count(base)
+    mapping = {}
+
+    def remap(state):
+        if state in mapping:
+            newnum = mapping[state]
+        else:
+            newnum = next(c)
+            mapping[state] = newnum
+        return newnum
+
+    newdfa = DFA(remap(dfa.initial))
+    for src, trans in iteritems(dfa.transitions):
+        for label, dest in iteritems(trans):
+            newdfa.add_transition(remap(src), label, remap(dest))
+    for finalstate in dfa.final_states:
+        newdfa.add_final_state(remap(finalstate))
+    for src, dest in iteritems(dfa.defaults):
+        newdfa.set_default_transition(remap(src), remap(dest))
+    return newdfa
+
+
+def u_to_utf8(dfa, base=0):
+    c = itertools.count(base)
+    transitions = dfa.transitions
+
+    for src, trans in iteritems(transitions):
+        trans = transitions[src]
+        for label, dest in list(iteritems(trans)):
+            if label is EPSILON:
+                continue
+            elif label is ANY:
+                raise Exception
+            else:
+                assert isinstance(label, text_type)
+                label8 = label.encode("utf8")
+                for i, byte in enumerate(label8):
+                    if i < len(label8) - 1:
+                        st = next(c)
+                        dfa.add_transition(src, byte, st)
+                        src = st
+                    else:
+                        dfa.add_transition(src, byte, dest)
+                del trans[label]
+
+
+def find_all_matches(dfa, lookup_func, first=unull):
+    """
+    Uses the given DFA and lookup_func to find all words in a database that
+    the DFA accepts (for example, all words within a certain edit distance,
+    if the DFA encodes a Levenshtein automaton).
+
+    :param dfa: the DFA to intersect with the database.
+    :param lookup_func: a single-argument function that returns the first word
+        in the database that is greater than or equal to its argument, or None
+        if there is no such word.
+    :param first: the string at which to start the search.
+
+    Yields each database word accepted by the DFA.
+    """
+
+    match = dfa.next_valid_string(first)
+    while match:
+        key = lookup_func(match)
+        if key is None:
+            return
+        if match == key:
+            yield match
+            key += unull
+        match = dfa.next_valid_string(key)
+
+
+# Construction functions
+
+def reverse_nfa(n):
+    s = object()
+    nfa = NFA(s)
+    for src, trans in iteritems(n.transitions):
+        for label, destset in iteritems(trans):
+            for dest in destset:
+                nfa.add_transition(dest, label, src)
+    for finalstate in n.final_states:
+        nfa.add_transition(s, EPSILON, finalstate)
+    nfa.add_final_state(n.initial)
+    return nfa
+
+
+def product(dfa1, op, dfa2):
+    dfa1 = dfa1.to_dfa()
+    dfa2 = dfa2.to_dfa()
+    start = (dfa1.start(), dfa2.start())
+    dfa = DFA(start)
+    stack = [start]
+    while stack:
+        src = stack.pop()
+        state1, state2 = src
+        trans1 = set(dfa1.transitions[state1])
+        trans2 = set(dfa2.transitions[state2])
+        for label in trans1.intersection(trans2):
+            # Don't overwrite state1/state2: they're still needed unchanged
+            # for the remaining labels of this source pair
+            next1 = dfa1.next_state(state1, label)
+            next2 = dfa2.next_state(state2, label)
+            if op(next1 is not None, next2 is not None):
+                dest = (next1, next2)
+                dfa.add_transition(src, label, dest)
+                stack.append(dest)
+                if op(dfa1.is_final(next1), dfa2.is_final(next2)):
+                    dfa.add_final_state(dest)
+    return dfa
+
+
+def intersection(dfa1, dfa2):
+    return product(dfa1, operator.and_, dfa2)
+
+
+def union(dfa1, dfa2):
+    return product(dfa1, operator.or_, dfa2)
+
+
+def epsilon_nfa():
+    return basic_nfa(EPSILON)
+
+
+def dot_nfa():
+    return basic_nfa(ANY)
+
+
+def basic_nfa(label):
+    s = object()
+    e = object()
+    nfa = NFA(s)
+    nfa.add_transition(s, label, e)
+    nfa.add_final_state(e)
+    return nfa
+
+
+def charset_nfa(labels):
+    s = object()
+    e = object()
+    nfa = NFA(s)
+    for label in labels:
+        nfa.add_transition(s, label, e)
+    nfa.add_final_state(e)
+    return nfa
+
+
+def string_nfa(string):
+    s = object()
+    e = object()
+    nfa = NFA(s)
+    for label in string:
+        e = object()
+        nfa.add_transition(s, label, e)
+        s = e
+    nfa.add_final_state(e)
+    return nfa
+
+
+def choice_nfa(n1, n2):
+    s = object()
+    e = object()
+    nfa = NFA(s)
+    #   -> nfa1 -
+    #  /         \
+    # s           e
+    #  \         /
+    #   -> nfa2 -
+    nfa.insert(s, n1, e)
+    nfa.insert(s, n2, e)
+    nfa.add_final_state(e)
+    return nfa
+
+
+def concat_nfa(n1, n2):
+    s = object()
+    m = object()
+    e = object()
+    nfa = NFA(s)
+    nfa.insert(s, n1, m)
+    nfa.insert(m, n2, e)
+    nfa.add_final_state(e)
+    return nfa
+
+
+def star_nfa(n):
+    s = object()
+    e = object()
+    nfa = NFA(s)
+    #   -----<-----
+    #  /           \
+    # s ---> n ---> e
+    #  \           /
+    #   ----->-----
+
+    nfa.insert(s, n, e)
+    nfa.add_transition(s, EPSILON, e)
+    for finalstate in n.final_states:
+        nfa.add_transition(finalstate, EPSILON, s)
+    nfa.add_final_state(e)
+    return nfa
+
+
+def plus_nfa(n):
+    return concat_nfa(n, star_nfa(n))
+
+
+def optional_nfa(n):
+    return choice_nfa(n, epsilon_nfa())
+
+
+# Daciuk Mihov DFA construction algorithm
+
+class DMNode(object):
+    def __init__(self, n):
+        self.n = n
+        self.arcs = {}
+        self.final = False
+
+    def __repr__(self):
+        return "<%s, %r>" % (self.n, self.tuple())
+
+    def __hash__(self):
+        return hash(self.tuple())
+
+    def tuple(self):
+        arcs = tuple(sorted(iteritems(self.arcs)))
+        return arcs, self.final
+
+
+def strings_dfa(strings):
+    dfa = DFA(0)
+    c = itertools.count(1)
+
+    last = ""
+    seen = {}
+    nodes = [DMNode(0)]
+
+    for string in strings:
+        if string <= last:
+            raise Exception("Strings must be in order")
+        if not string:
+            raise Exception("Can't add empty string")
+
+        # Find the common prefix with the previous string
+        i = 0
+        while i < len(last) and i < len(string) and last[i] == string[i]:
+            i += 1
+        prefixlen = i
+
+        # Freeze the transitions after the prefix, since they're not shared
+        add_suffix(dfa, nodes, last, prefixlen + 1, seen)
+
+        # Create new nodes for the substring after the prefix
+        for label in string[prefixlen:]:
+            node = DMNode(next(c))
+            # Create an arc from the previous node to this node
+            nodes[-1].arcs[label] = node.n
+            nodes.append(node)
+        # Mark the last node as an accept state
+        nodes[-1].final = True
+
+        last = string
+
+    if len(nodes) > 1:
+        add_suffix(dfa, nodes, last, 0, seen)
+    return dfa
+
+
+def add_suffix(dfa, nodes, last, downto, seen):
+    while len(nodes) > downto:
+        node = nodes.pop()
+        tup = node.tuple()
+
+        # If a node just like this one (final/nonfinal, same arcs to same
+        # destinations) is already seen, replace with it
+        try:
+            this = seen[tup]
+        except KeyError:
+            this = node.n
+            if node.final:
+                dfa.add_final_state(this)
+            seen[tup] = this
+        else:
+            # If we replaced the node with an already seen one, fix the parent
+            # node's pointer to this
+            parent = nodes[-1]
+            inlabel = last[len(nodes) - 1]
+            parent.arcs[inlabel] = this
+
+        # Add the node's transitions to the DFA
+        for label, dest in iteritems(node.arcs):
+            dfa.add_transition(this, label, dest)
+
+
+
+
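For illustration only (not part of the commit): a sketch of how strings_dfa() and find_all_matches() above fit together, intersecting an automaton with a sorted word list. The word list and lookup function here are hypothetical stand-ins for a term dictionary:

    from bisect import bisect_left
    from whoosh.automata.fsa import strings_dfa, find_all_matches

    # Hypothetical "database" of indexed terms, in sorted order
    words = ["apple", "banana", "cherry"]

    def lookup(key):
        # First word in the database >= key, or None when we run off the end
        i = bisect_left(words, key)
        return words[i] if i < len(words) else None

    # The automaton's strings must also be given in sorted order
    dfa = strings_dfa(["banana", "cherry", "durian"])
    print(list(find_all_matches(dfa, lookup)))  # expected: ['banana', 'cherry']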

src/whoosh/automata/fst.py

-# Copyright 2009 Matt Chaput. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#    1. Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#<