Source

whoosh / src / whoosh / analysis / morph.py

Full commit
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.filters import Filter
from whoosh.compat import integer_types
from whoosh.lang.dmetaphone import double_metaphone
from whoosh.lang.porter import stem
from whoosh.util.cache import lfu_cache, unbound_cache


class StemFilter(Filter):
    """Stems (removes suffixes from) the text of tokens using the Porter
    stemming algorithm. Stemming attempts to reduce multiple forms of the same
    root word (for example, "rendering", "renders", "rendered", etc.) to a
    single word in the index.

    >>> stemmer = RegexTokenizer() | StemFilter()
    >>> [token.text for token in stemmer("fundamentally willows")]
    ["fundament", "willow"]

    You can pass your own stemming function to the StemFilter. The default
    is the Porter stemming algorithm for English.

    >>> stemfilter = StemFilter(stem_function)

    By default, this class wraps an LRU cache around the stemming function. The
    ``cachesize`` keyword argument sets the size of the cache. To make the
    cache unbounded (the class caches every input), use ``cachesize=-1``. To
    disable caching, use ``cachesize=None``.

    If you compile and install the py-stemmer library, the
    :class:`PyStemmerFilter` provides slightly easier access to the language
    stemmers in that library.
    """

    __inittypes__ = dict(stemfn=object, ignore=list)

    is_morph = True

    def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
        """
        :param stemfn: the function to use for stemming.
        :param lang: if not None, overrides the stemfn with a language stemmer
            from the ``whoosh.lang.snowball`` package.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache. Use ``-1`` for
            an unbounded cache, or ``None`` for no caching.
        """

        self.stemfn = stemfn
        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        # clear() sets the _stem attr to a cached wrapper around self.stemfn
        self.clear()

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                      if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 50000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "lang" not in state:
            self.lang = None
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self.clear()

    def clear(self):
        if self.lang:
            from whoosh.lang import stemmer_for_language
            stemfn = stemmer_for_language(self.lang)
        else:
            stemfn = self.stemfn

        if isinstance(self.cachesize, integer_types) and self.cachesize != 0:
            if self.cachesize < 0:
                self._stem = unbound_cache(stemfn)
            elif self.cachesize > 1:
                self._stem = lfu_cache(self.cachesize)(stemfn)
        else:
            self._stem = stemfn

    def cache_info(self):
        if self.cachesize <= 1:
            return None
        return self._stem.cache_info()

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.stemfn == other.stemfn)

    def __call__(self, tokens):
        stemfn = self._stem
        ignore = self.ignore

        for t in tokens:
            if not t.stopped:
                text = t.text
                if text not in ignore:
                    t.text = stemfn(text)
            yield t


class PyStemmerFilter(StemFilter):
    """This is a simple subclass of StemFilter that works with the py-stemmer
    third-party library. You must have the py-stemmer library installed to use
    this filter.

    >>> PyStemmerFilter("spanish")
    """

    def __init__(self, lang="english", ignore=None, cachesize=10000):
        """
        :param lang: a string identifying the stemming algorithm to use. You
            can get a list of available algorithms by with the
            :meth:`PyStemmerFilter.algorithms` method. The identification
            strings are directly from the py-stemmer library.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache.
        """

        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        self._stem = self._get_stemmer_fn()

    def algorithms(self):
        """Returns a list of stemming algorithms provided by the py-stemmer
        library.
        """

        import Stemmer  # @UnresolvedImport

        return Stemmer.algorithms()

    def cache_info(self):
        return None

    def _get_stemmer_fn(self):
        import Stemmer  # @UnresolvedImport

        stemmer = Stemmer.Stemmer(self.lang)
        stemmer.maxCacheSize = self.cachesize
        return stemmer.stemWord

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 10000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self._stem = self._get_stemmer_fn()


class DoubleMetaphoneFilter(Filter):
    """Transforms the text of the tokens using Lawrence Philips's Double
    Metaphone algorithm. This algorithm attempts to encode words in such a way
    that similar-sounding words reduce to the same code. This may be useful for
    fields containing the names of people and places, and other uses where
    tolerance of spelling differences is desireable.
    """

    is_morph = True

    def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
        """
        :param primary_boost: the boost to apply to the token containing the
            primary code.
        :param secondary_boost: the boost to apply to the token containing the
            secondary code, if any.
        :param combine: if True, the original unencoded tokens are kept in the
            stream, preceding the encoded tokens.
        """

        self.primary_boost = primary_boost
        self.secondary_boost = secondary_boost
        self.combine = combine

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.primary_boost == other.primary_boost)

    def __call__(self, tokens):
        primary_boost = self.primary_boost
        secondary_boost = self.secondary_boost
        combine = self.combine

        for t in tokens:
            if combine:
                yield t

            primary, secondary = double_metaphone(t.text)
            b = t.boost
            # Overwrite the token's text and boost and yield it
            if primary:
                t.text = primary
                t.boost = b * primary_boost
                yield t
            if secondary:
                t.text = secondary
                t.boost = b * secondary_boost
                yield t