Commits

Thomas Wanschik  committed 5f92a91

moved the code into a new module called search

  • Participants
  • Parent commits 66b9d8a

Comments (0)

Files changed (25)

File .hgignore

-syntax: glob
-.project
-.pydevproject
-.settings
-*~
-*.orig
-*.pyc
-*.pyo
-*.swp
-*.tmp
-_generated_media
-.dynamic_media
-desktop.ini
-settings_overrides.py
-nbproject
-django

File __init__.py

Empty file removed.

File backends/__init__.py

Empty file removed.

File backends/gae_background_tasks.py

-from django.conf import settings
-from django.db import models
-from google.appengine.ext import deferred
-
-default_search_queue = getattr(settings, 'DEFAULT_SEARCH_QUEUE', 'default')
-
-def update_relation_index(search_index_field, parent_pk, delete):
-    # pass only the field / model names to the background task to transfer less
-    # data
-    app_label = search_index_field.model_class._meta.app_label
-    object_name = search_index_field.model_class._meta.object_name
-    deferred.defer(update, app_label, object_name, search_index_field.name,
-        parent_pk, delete, _queue=default_search_queue)
-
-def update(app_label, object_name, field_name, parent_pk, delete):
-    model = models.get_model(app_label, object_name)
-    update_property = model._meta.get_field_by_name(field_name)[0]
-    update_property.update_relation_index(parent_pk, delete)

File backends/immediate_update.py

-def update_relation_index(search_index_field, parent_pk, delete):
-    search_index_field.update_relation_index(parent_pk, delete)

File core.py

-from django.conf import settings
-from django.core.exceptions import ObjectDoesNotExist
-from django.db import models
-from django.db.models import signals
-from djangotoolbox.fields import ListField
-from djangotoolbox.utils import getattr_by_path
-from copy import copy
-import re
-import string
-
-_PUNCTUATION_REGEX = re.compile(
-    '[' + re.escape(string.punctuation.replace('-', '').replace(
-        '_', '').replace('#', '')) + ']')
-_PUNCTUATION_SEARCH_REGEX = re.compile(
-    '[' + re.escape(string.punctuation.replace('_', '').replace(
-        '#', '')) + ']')
-
-# Various base indexers
-def startswith(words, indexing, **kwargs):
-    """Allows for word prefix search."""
-    if not indexing:
-        # In search mode we simply match search terms exactly
-        return words
-    # In indexing mode we add all prefixes ('h', 'he', ..., 'hello')
-    result = []
-    for word in words:
-        result.extend([word[:count].strip(u'-')
-                       for count in range(1, len(word)+1)])
-    return result
-
-def porter_stemmer(words, language, **kwargs):
-    """Porter-stemmer in various languages."""
-    languages = [language,]
-    if '-' in language:
-        languages.append(language.split('-')[0])
-
-    # Fall back to English
-    languages.append('en')
-
-    # Find a stemmer for this language
-    for language in languages:
-        try:
-            stem = __import__('search.porter_stemmers.%s' % language,
-                                 {}, {}, ['']).stem
-        except:
-            continue
-        break
-
-    result = []
-    for word in words:
-        result.append(stem(word))
-    return result
-
-stop_words = {
-    'en': set(('a', 'an', 'and', 'or', 'the', 'these', 'those', 'whose', 'to')),
-    'de': set(('ein', 'eine', 'eines', 'einer', 'einem', 'einen', 'den',
-               'der', 'die', 'das', 'dieser', 'dieses', 'diese', 'diesen',
-               'deren', 'und', 'oder'))
-}
-
-def get_stop_words(language):
-    if language not in stop_words and '-' in language:
-        language = language.split('-', 1)[0]
-    return stop_words.get(language, set())
-
-def non_stop(words, indexing, language, **kwargs):
-    """Removes stop words from search query."""
-    if indexing:
-        return words
-    return list(set(words) - get_stop_words(language))
-
-def porter_stemmer_non_stop(words, **kwargs):
-    """Combines porter_stemmer with non_stop."""
-    return porter_stemmer(non_stop(words, **kwargs), **kwargs)
-
-# Language handler
-def site_language(instance, **kwargs):
-    """The default language handler tries to determine the language from
-    fields in the model instance."""
-
-    # Check if there's a language attribute
-    if hasattr(instance, 'language'):
-        return instance.language
-    if hasattr(instance, 'lang'):
-        return instance.lang
-
-    # Fall back to default language
-    return settings.LANGUAGE_CODE
-
-def default_splitter(text, indexing=False, **kwargs):
-    """
-    Returns an array of keywords that are included
-    in the query. All characters besides letters, numbers
-    and '_' are split characters. The character '-' is a special
-    case: two words separated by '-' create an additional keyword
-    consisting of both words without separation (see example).
-
-    Examples:
-    - text='word1/word2 word3'
-      returns ['word1', 'word2', 'word3']
-    - text='word1/word2-word3'
-      returns ['word1', 'word2', 'word3', 'word2word3']
-    """
-    if not text:
-        return []
-    if not indexing:
-        return _PUNCTUATION_SEARCH_REGEX.sub(u' ', text.lower()).split()
-    keywords = []
-    for word in set(_PUNCTUATION_REGEX.sub(u' ', text.lower()).split()):
-        if not word:
-            continue
-        if '-' not in word:
-            keywords.append(word)
-        else:
-            keywords.extend(get_word_combinations(word))
-    return keywords
-
-def get_word_combinations(word):
-    """
-    'one-two-three'
-    =>
-    ['one', 'two', 'three', 'onetwo', 'twothree', 'onetwothree']
-    """
-    permutations = []
-    parts = [part for part in word.split(u'-') if part]
-    for count in range(1, len(parts) + 1):
-        for index in range(len(parts) - count + 1):
-            permutations.append(u''.join(parts[index:index+count]))
-    return permutations
-
-class DictEmu(object):
-    def __init__(self, data):
-        self.data = data
-    def __getitem__(self, key):
-        return getattr(self.data, key)
-
-class StringListField(ListField):
-    def __init__(self, *args, **kwargs):
-        # TODO: provide some property in the settings which tells us which
-        # model field to use for field type in order to let other backends
-        # use other max_lengths, ...
-        kwargs['field_type'] = models.CharField(max_length=500)
-        super(StringListField, self).__init__(*args, **kwargs)
-
-    def contribute_to_class(self, cls, name):
-        # XXX: Use contribute_to_class in order to add the model_class to the field
-        self.model_class = cls
-        super(StringListField, self).contribute_to_class(cls, name)
-
-class SearchableListField(StringListField):
-    """
-    This is basically a string ListField with search support.
-    """
-    def filter(self, values):
-        """Returns a query for the given values (creates '=' filters for this
-        field. Additionally filters can be applied afterwards via chaining."""
-
-        if not isinstance(values, (tuple, list)):
-            values = (values,)
-        filtered = self.model_class.objects.all()
-        for value in set(values):
-            filter = {self.name:value}
-            filtered = filtered.filter(**filter)
-        return filtered
-
-    def search(self, query, indexer=None, splitter=None,
-            language=settings.LANGUAGE_CODE):
-        if not splitter:
-            splitter = default_splitter
-        words = splitter(query, indexing=False, language=language)
-        if indexer:
-            words = indexer(words, indexing=False, language=language)
-        # Optimize query
-        words = set(words)
-        if len(words) >= 4:
-            words -= get_stop_words(language)
-        # Don't allow empty queries
-        if not words and query:
-            # This query will never find anything
-            return self.filter(()).filter({self.name:' '})
-        return self.filter(sorted(words))
-
-class SearchIndexField(SearchableListField):
-    """
-    Simple full-text index for the given fields.
-
-    If "relation_index" is True the index will be stored in a separate entity.
-
-    With "integrate" you can add fields to your relation index,
-    so they can be searched, too.
-
-    With "filters" you can specify when a values index should be created.
-    """
-    # TODO: filters has to be extended (maybe a function) to allow Django's
-    # QuerySet methods like exclude
-    def __init__(self, fields_to_index, indexer=None, splitter=default_splitter,
-            relation_index=True, integrate='*', filters={},
-            language=site_language, **kwargs):
-        if integrate is None:
-            integrate = ()
-        if integrate == '*' and not relation_index:
-            integrate = ()
-        if isinstance(fields_to_index, basestring):
-            fields_to_index = (fields_to_index,)
-        self.fields_to_index = fields_to_index
-        if isinstance(integrate, basestring):
-            integrate = (integrate,)
-        self.filters = filters
-        self.integrate = integrate
-        self.splitter = splitter
-        self.indexer = indexer
-        self.language = language
-        self.relation_index = relation_index
-        if len(fields_to_index) == 0:
-            raise ValueError('No fields specified for index!')
-        super(SearchIndexField, self).__init__(**kwargs)
-
-    def should_index(self, values):
-        # Check if filter doesn't match
-        if not values:
-            return False
-        for filter, value in self.filters.items():
-            attr, op = filter, 'exact'
-            if '__' in filter:
-                attr, op = filter.rsplit('__', 1)
-            op = op.lower()
-            if (op == 'exact' and values[attr] != value or
-#                    op == '!=' and values[attr] == value or
-                    op == 'in' and values[attr] not in value or
-                    op == 'lt' and values[attr] >= value or
-                    op == 'lte' and values[attr] > value or
-                    op == 'gt' and values[attr] <= value or
-                    op == 'gte' and values[attr] < value):
-                return False
-            elif op not in ('exact', 'in', 'lt', 'lte', 'gte', 'gt'):
-                raise ValueError('Invalid search index filter: %s %s' % (filter, value))
-        return True
-
-#    @commit_locked
-    def update_relation_index(self, parent_pk, delete=False):
-        model = self._relation_index_model
-        try:
-            index = model.objects.get(pk=parent_pk)
-        except ObjectDoesNotExist:
-            index = None
-
-        if not delete:
-            try:
-                parent = self.model_class.objects.get(pk=parent_pk)
-            except ObjectDoesNotExist:
-                parent = None
-
-            values = None
-            if parent:
-                values = self.get_index_values(parent)
-
-        # Remove index if it's not needed, anymore
-        if delete or not self.should_index(values):
-            if index:
-                index.delete()
-            return
-
-        # Update/create index
-        if not index:
-            index = model(pk=parent_pk, **values)
-
-        # This guarantees that we also set virtual @properties
-        for key, value in values.items():
-            setattr(index, key, value)
-
-        index.save()
-
-    def create_index_model(self):
-        attrs = dict(__module__=self.__module__)
-        # By default we integrate everything when using relation index
-        if self.relation_index and self.integrate == ('*',):
-            self.integrate = tuple(field.name
-                                   for field in self.model_class._meta.fields
-                                   if not isinstance(field, SearchIndexField))
-
-        for field_name in self.integrate:
-            field = self.model_class._meta.get_field_by_name(field_name)[0]
-            field = copy(field)
-            attrs[field_name] = field
-            if hasattr(field, 'related_name'):
-                attrs[field_name].related_name = '_sidx_%s_%s_set_' % (
-                    self.model_class._meta.object_name.lower(),
-                    self.name,
-                )
-        index_name = self.name
-        attrs[index_name] = SearchIndexField(self.fields_to_index,
-            splitter=self.splitter, indexer=self.indexer,
-            language=self.language, relation_index=False)
-        if self.relation_index:
-            owner = self
-            def __init__(self, *args, **kwargs):
-                # Save some space: don't copy the whole indexed text into the
-                # relation index field unless the field gets integrated.
-                field_names = [field.name for field in self._meta.fields]
-                owner_field_names = [field.name
-                                     for field in owner.model_class._meta.fields]
-                for key, value in kwargs.items():
-                    if key in field_names or key not in owner_field_names:
-                        continue
-                    setattr(self, key, value)
-                    del kwargs[key]
-                models.Model.__init__(self, *args, **kwargs)
-            attrs['__init__'] = __init__
-            self._relation_index_model = type(
-                'RelationIndex_%s_%s_%s' % (self.model_class._meta.app_label,
-                                            self.model_class._meta.object_name,
-                                            self.name),
-                (models.Model,), attrs)
-
-    def get_index_values(self, model_instance):
-        filters = []
-        for filter in self.filters.keys():
-            if '__' in filter:
-                filters.append(filter.rsplit('__')[0])
-            else:
-                filters.append(filter)
-        filters = tuple(filters)
-        values = {}
-        for field_name in set(self.fields_to_index + self.integrate + filters):
-            instance = self.model_class._meta.get_field_by_name(field_name)[0]
-            if isinstance(instance, models.ForeignKey):
-                value = instance.pre_save(model_instance, False)
-            else:
-                value = getattr(model_instance, field_name)
-            if field_name == self.fields_to_index[0] and \
-                    isinstance(value, (list, tuple)):
-                value = sorted(value)
-            values[field_name] = value
-        return values
-
-    def pre_save(self, model_instance, add):
-        if self.filters and not self.should_index(DictEmu(model_instance)) \
-                or self.relation_index:
-            return []
-
-        language = self.language
-        if callable(language):
-            language = language(model_instance, property=self)
-
-        index = []
-        for field in self.fields_to_index:
-            values = getattr_by_path(model_instance, field, None)
-            if not values:
-                values = ()
-            elif not isinstance(values, (list, tuple)):
-                values = (values,)
-            for value in values:
-                index.extend(self.splitter(value, indexing=True, language=language))
-        if self.indexer:
-            index = self.indexer(index, indexing=True, language=language)
-        # Sort index to make debugging easier
-        setattr(model_instance, self.name, sorted(set(index)))
-        return index
-
-    def contribute_to_class(self, cls, name):
-        attrs = {name:self}
-        def search(self, query, language=settings.LANGUAGE_CODE):
-            return getattr(self, name).search(query, language)
-        attrs['search'] = search
-        setattr(cls, name, type('Indexes', (models.Manager, ), attrs)())
-        super(SearchIndexField, self).contribute_to_class(cls, name)
-
-    def search(self, query, language=settings.LANGUAGE_CODE):
-        if self.relation_index:
-            items = self._relation_index_model._meta.get_field_by_name(
-                self.name)[0].search(query, language=language).values('pk')
-            return RelationIndexQuery(self, items)
-        return super(SearchIndexField, self).search(query, splitter=self.splitter,
-            indexer=self.indexer, language=language)
-
-def post(delete, sender, instance, **kwargs):
-    for field in sender._meta.fields:
-        if isinstance(field, SearchIndexField):
-            if field.relation_index:
-                backend = load_backend()
-                backend.update_relation_index(field, instance.pk, delete)
-
-def post_save(sender, instance, **kwargs):
-    post(False, sender, instance, **kwargs)
-
-def post_delete(sender, instance, **kwargs):
-    post(True, sender, instance, **kwargs)
-
-def load_backend():
-    backend = getattr(settings, 'BACKEND', 'search.backends.immediate_update')
-    import_list = []
-    if '.' in backend:
-        import_list = [backend.rsplit('.', 1)[1]]
-    return __import__(backend, globals(), locals(), import_list)
-
-def install_index_model(sender, **kwargs):
-    needs_relation_index = False
-    for field in sender._meta.fields:
-        if isinstance(field, SearchIndexField) and field.relation_index:
-            field.create_index_model()
-            needs_relation_index = True
-    if needs_relation_index:
-        signals.post_save.connect(post_save, sender=sender)
-        signals.post_delete.connect(post_delete, sender=sender)
-signals.class_prepared.connect(install_index_model)
-
-# TODO: Refactor QueryTraits using Django's QuerySet
-class QueryTraits(object):
-    def __iter__(self):
-        return iter(self[:301])
-
-    def __len__(self):
-        return self.count()
-
-    def get(self, *args, **kwargs):
-        result = self[:1]
-        if result:
-            return result[0]
-        raise ObjectDoesNotExist
-
-class RelationIndexQuery(QueryTraits):
-    """Combines the results of multiple queries by appending the queries in the
-    given order."""
-    def __init__(self, field, query):
-        self.model = field.model_class
-        self.field = field
-        self.query = query
-
-    def order(self, *args, **kwargs):
-        self.query = self.query.order(*args, **kwargs)
-
-    def filter(self, *args, **kwargs):
-        self.query = self.query.filter(*args, **kwargs)
-        return self
-
-    def __getitem__(self, index):
-        pks = [instance.pk if isinstance(instance, models.Model) else instance['pk']
-                for instance in self.query[index]]
-        return [item for item in self.model.objects.filter(pk__in=pks) if item]
-
-    def count(self):
-        return self.query.count()
-
-    # TODO: add keys_only query
-#    def values(self, fields):
-#        pass

File models.py

Empty file removed.

File porter_stemmers/__init__.py

Empty file removed.

File porter_stemmers/de.py

-# -*- coding: utf-8 -*-
-#   Eine Pythonimplementation des Porter-Stemmers für Deutsch (Orginal unter http://snowball.tartarus.org/texts/germanic.html)
-#
-#   Modifiziert/optimiert/gefixt von Waldemar Kornewald
-#
-#   Ersteller dieser Version: (c) by kristall 'ät' c-base.org       http://kristall.crew.c-base.org/porter_de.py
-#
-#   Der Algorithmus in (englischem) Prosa unter http://snowball.tartarus.org/algorithms/german/stemmer.html
-#
-#   Wikipedia zum Porter-Stemmer: http://de.wikipedia.org/wiki/Porter-Stemmer-Algorithmus
-#
-#   Lizenz: Diese Software steht unter der BSD License (siehe http://www.opensource.org/licenses/bsd-license.html).
-#   Ursprünglicher Autor: (c) by Dr. Martin Porter 
-#
-#
-###
-
-#   Wer mit Strings arbeitet, sollte dieses Modul laden
-import string
-
-#   Die Stopliste; Wörter in dieser Liste werden nicht 'gestemmt', wenn stop  = 'True' an die Funktion übergeben wurde
-stopliste = (u'aber', u'alle', u'allem', u'allen', u'aller', u'alles', u'als', u'also', u'am', u'an', u'ander', u'andere', u'anderem',
-        u'anderen', u'anderer', u'anderes', u'anderm', u'andern', u'anders', u'auch', u'auf', u'aus', u'bei', u'bin', u'bis', u'bist',
-        u'da', u'damit', u'dann', u'der', u'den', u'des', u'dem', u'die', u'das', u'dass', u'daß', u'derselbe', u'derselben', u'denselben',
-        u'desselben', u'demselben', u'dieselbe', u'dieselben', u'dasselbe', u'dazu', u'dein', u'deine', u'deinem', u'deinen', u'deiner',
-        u'deines', u'denn', u'derer', u'dessen', u'dich', u'dir', u'du', u'dies', u'diese', u'diesem', u'diesen', u'dieser', u'dieses',
-        u'doch', u'dort', u'durch', u'ein', u'eine', u'einem', u'einen', u'einer', u'eines', u'einig', u'einige', u'einigem', u'einigen', 
-        u'einiger', u'einiges', u'einmal', u'er', u'ihn', u'ihm', u'es', u'etwas', u'euer', u'eure', u'eurem', u'euren', u'eurer', u'eures',
-        u'für', u'gegen', u'gewesen', u'hab', u'habe', u'haben', u'hat', u'hatte', u'hatten', u'hier', u'hin', u'hinter', u'ich', u'mich',
-        u'mir', u'ihr', u'ihre', u'ihrem', u'ihren', u'ihrer', u'ihres', u'euch', u'im', u'in', u'indem', u'ins', u'ist', u'jede', u'jedem',
-        u'jeden', u'jeder', u'jedes', u'jene', u'jenem', u'jenen', u'jener', u'jenes', u'jetzt', u'kann', u'kein', u'keine', u'keinem', 
-        u'keinen', u'keiner', u'keines', u'können', u'könnte', u'machen', u'man', u'manche', u'manchem', u'manchen', u'mancher', 
-        u'manches', u'mein', u'meine', u'meinem', u'meinen', u'meiner', u'meines', u'mit', u'muss', u'musste', u'muß', u'mußte', u'nach',
-        u'nicht', u'nichts', u'noch', u'nun', u'nur', u'ob', u'oder', u'ohne', u'sehr', u'sein', u'seine', u'seinem', u'seinen', u'seiner',
-        u'seines', u'selbst', u'sich', u'sie', u'ihnen', u'sind', u'so', u'solche', u'solchem', u'solchen', u'solcher', u'solches', u'soll',
-        u'sollte', u'sondern', u'sonst', u'über', u'um', u'und', u'uns', u'unse', u'unsem', u'unsen', u'unser', u'unses', u'unter', u'viel',
-        u'vom', u'von', u'vor', u'während', u'war', u'waren', u'warst', u'was', u'weg', u'weil', u'weiter', u'welche', u'welchem', 
-        u'welchen', u'welcher', u'welches', u'wenn', u'werde', u'werden', u'wie', u'wieder', u'will', u'wir', u'wird', u'wirst', u'wo',
-        u'wollem', u'wollte', u'würde', u'würden', u'zu', u'zum', u'zur', u'zwar', u'zwischen')
-
-#   Die Funktion stem nimmt ein Wort und versucht dies durch Regelanwendung zu verkürzen. Wenn Stop auf 'True' gesetzt wird, werden Wörter in der Stopliste nicht 'gestemmt'.
-def stem(wort, stop=True):
-    #   ACHTUNG: für den Stemmer gilt 'y' als Vokal.
-    vokale = u'aeiouyäüö'
-    #   ACHTUNG: 'U' und 'Y' gelten als Konsonaten.
-    konsonanten = u'bcdfghjklmnpqrstvwxzßUY'
-    #   Konsonanten die vor einer 's'-Endung stehen dürfen.
-    s_endung = u'bdfghklmnrt'
-    #   Konsonanten die vor einer 'st'-Endung stehen dürfen.
-    st_endung = u'bdfghklmnt'
-    #   Zu r1 & r2 siehe http://snowball.tartarus.org/texts/r1r2.html, p1 & p2 sind die Start'p'ositionen von r1 & r2 im String
-    r1 = u''
-    p1 = 0
-    r2 = u''
-    p2 = 0
-    #   Wortstämme werden klein geschrieben
-    wort = wort.lower()
-    #   Wenn 'stop' und Wort in Stopliste gib 'wort' zurück 
-    if stop == True and wort in stopliste:
-        return end_stemming(wort.replace(u'ß', u'ss'))
-    # Ersetze alle 'ß' durch 'ss'
-    wort = wort.replace(u'ß', u'ss')
-    #   Schützenswerte 'u' bzw. 'y' werden durch 'U' bzw. 'Y' ersetzt
-    for e in map(None, wort, range(len(wort))):
-        if e[1] == 0: continue
-        if u'u' in e:
-            try:
-                if ((wort[(e[1]-1)] in vokale) and (wort[(e[1]+1)] in vokale)): wort = wort[:e[1]] + u'U' + wort[(e[1]+1):]
-            except : pass
-        if  u'y' in e:
-            try:
-                if ((wort[(e[1]-1)] in vokale) and (wort[(e[1]+1)] in vokale)): wort = wort[:e[1]] + u'Y' + wort[(e[1]+1):]
-            except: pass
-    #   r1, r2, p1 & p2 werden mit Werten belegt
-    try:
-        Bedingung = False
-        for e in map(None, wort, range(len(wort))):
-            if e[0] in vokale: Bedingung = True
-            if ((e[0] in konsonanten) and (Bedingung)):
-                p1 = e[1] + 1 
-                r1 = wort[p1:]
-                break
-        Bedingung = False
-        for e in map(None, r1, range(len(r1))):
-            if e[0] in vokale: Bedingung = True
-            if ((e[0] in konsonanten) and (Bedingung)):
-                p2 = e[1] + 1 
-                r2 = r1[p2:]
-                break
-        if ((p1 < 3)and(p1 > 0)):
-            p1 = 3
-            r1 = wort[p1:]
-        if p1 == 0:
-            return end_stemming(wort)
-    except: pass
-    #   Die Schritte 1 bis 3 d) 'stemmen' das übergebene Wort. 
-    #   Schritt 1
-    eSuffixe_1 = [u'e', u'em', u'en', u'ern', u'er', u'es']
-    eSonst_1 = [u's']
-    try:
-        for e in eSuffixe_1:
-            if e in r1[-(len(e)):]:
-                wort = wort[:-(len(e))]
-                r1 = r1[:-(len(e))]
-                r2 = r2[:-(len(e))]
-                break
-        else:
-            if r1[-1] in eSonst_1:
-                if wort[-2] in s_endung:
-                    wort = wort[:-1]
-                    r1 = r1[:-1]
-                    r2 = r2[:-1]
-    except: pass
-    #   Schritt 2
-    eSuffixe_2 = [u'est', u'er', u'en']
-    eSonst_2 = [u'st']
-    try:
-        for e in eSuffixe_2:
-            if e in r1[-len(e):]:
-                wort = wort[:-len(e)]
-                r1 = r1[:-len(e)]
-                r2 = r2[:-len(e)]
-                break
-        else:
-            if r1[-2:] in eSonst_2:             
-                if wort[-3] in st_endung:
-                    if len(wort) > 5:
-                        wort = wort[:-2]
-                        r1 = r1[:-2]
-                        r2 = r2[:-2]
-    except:pass
-    #   Schritt 3 a)
-    dSuffixe_1 = [u'end', u'ung']
-    try:
-        for e in dSuffixe_1:
-            if e in r2[-(len(e)):]:
-                if u'ig' in r2[-(len(e)+2):-(len(e))]:
-                    if u'e' in wort[-(len(e)+3)]:
-                        wort = wort[:-(len(e))]
-                        r1 = r1[:-(len(e))]
-                        r2 = r2[:-(len(e))]
-                        break
-                    else:
-                        wort = wort[:-(len(e)+2)]
-                        r2 = r2[:-(len(e)+2)]
-                        r1 = r1[:-(len(e)+2)]
-                        break
-                else:
-                    wort = wort[:-(len(e))]
-                    r2 = r2[:-(len(e))]
-                    r1 = r1[:-(len(e))]
-                return end_stemming(wort)
-    except: pass
-    #   Schritt 3 b)
-    dSuffixe_2 = [u'ig', u'ik', u'isch']
-    try:
-        for e in dSuffixe_2:
-            if e in r2[-(len(e)):]:
-                if ((u'e' in wort[-(len(e)+1)])):
-                    pass
-                else:
-                    wort = wort[:-(len(e))]
-                    r2 = r2[:-(len(e))]
-                    r1 = r1[:-(len(e))]
-                    break
-    except: pass
-    #   Schritt 3 c)
-    dSuffixe_3 = [u'lich', u'heit']
-    sonder_1 = [u'er', u'en']
-    try: 
-        for e in dSuffixe_3:
-            if e in r2[-(len(e)):]:
-                for i in sonder_1:
-                    if i in r1[-(len(e)+len(i)):-(len(e))]:
-                        wort = wort[:-(len(e)+len(i))]
-                        r1 = r1[:-(len(e)+len(i))]
-                        r2 = r2[:-(len(e)+len(i))]
-                        break
-                else:
-                    wort = wort[:-(len(e))]
-                    r1 = r1[:-(len(e))]
-                    r2 = r2[:-(len(e))]
-                    break
-                        
-    except: pass
-    #   Schritt 3 d)
-    dSuffixe_4 = [u'keit']
-    sonder_2 = [u'lich', u'ig']
-    try:
-        for e in dSuffixe_4:
-            if e in r2[-(len(e)):]:
-                for i in sonder_2:
-                    if i in r2[-(len(e)+len(i)):-(len(e))]:
-                        wort = wort[:-(len(e)+len(i))]
-                        break
-                else:
-                    wort = wort[:-(len(e))]
-                                    
-    except: pass
-    return end_stemming(wort)
-
-#  end_stemming verwandelt u'ä', u'ö', u'ü' in den "Grundvokal" und macht 'U' bzw. 'Y' klein. 
-def end_stemming(wort):
-    return wort.replace(u'ä', u'a').replace(u'ö', u'o').replace(
-        u'ü', u'u').replace(u'U', u'u').replace(u'Y', u'y')

File porter_stemmers/en.py

-# Copyright (c) 2008 Michael Dirolf (mike at dirolf dot com)
-
-# Permission is hereby granted, free of charge, to any person
-# obtaining a copy of this software and associated documentation
-# files (the "Software"), to deal in the Software without
-# restriction, including without limitation the rights to use,
-# copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following
-# conditions:
-
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-
-"""pyporter2: An implementation of the Porter2 stemming algorithm.
-
-See http://snowball.tartarus.org/algorithms/english/stemmer.html"""
-import unittest, re
-
-regexp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
-def get_r1(word):
-    # exceptional forms
-    if word.startswith('gener') or word.startswith('arsen'):
-        return 5
-    if word.startswith('commun'):
-        return 6
-
-    # normal form
-    match = regexp.match(word)
-    if match:
-        return match.start(1)
-    return len(word)
-
-def get_r2(word):
-    match = regexp.match(word, get_r1(word))
-    if match:
-        return match.start(1)
-    return len(word)
-
-def ends_with_short_syllable(word):
-    if len(word) == 2:
-        if re.match(r"^[aeiouy][^aeiouy]$", word):
-            return True
-    if re.match(r".*[^aeiouy][aeiouy][^aeiouywxY]$", word):
-        return True
-    return False
-
-def is_short_word(word):
-    if ends_with_short_syllable(word):
-        if get_r1(word) == len(word):
-            return True
-    return False
-
-def remove_initial_apostrophe(word):
-    if word.startswith("'"):
-        return word[1:]
-    return word
-
-def capitalize_consonant_ys(word):
-    if word.startswith('y'):
-        word = 'Y' + word[1:]
-    return re.sub(r"([aeiouy])y", '\g<1>Y', word)
-
-def step_0(word):
-    if word.endswith("'s'"):
-        return word[:-3]
-    if word.endswith("'s"):
-        return word[:-2]
-    if word.endswith("'"):
-        return word[:-1]
-    return word
-
-def step_1a(word):
-    if word.endswith('sses'):
-        return word[:-4] + 'ss'
-    if word.endswith('ied') or word.endswith('ies'):
-        if len(word) > 4:
-            return word[:-3] + 'i'
-        else:
-            return word[:-3] + 'ie'
-    if word.endswith('us') or word.endswith('ss'):
-        return word
-    if word.endswith('s'):
-        preceding = word[:-1]
-        if re.search(r"[aeiouy].", preceding):
-            return preceding
-        return word
-    return word
-
-def step_1b(word, r1):
-    if word.endswith('eedly'):
-        if len(word) - 5 >= r1:
-            return word[:-3]
-        return word
-    if word.endswith('eed'):
-        if len(word) - 3 >= r1:
-            return word[:-1]
-        return word
-
-    def ends_with_double(word):
-        doubles = ['bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt']
-        for double in doubles:
-            if word.endswith(double):
-                return True
-        return False
-
-    def step_1b_helper(word):
-        if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
-            return word + 'e'
-        if ends_with_double(word):
-            return word[:-1]
-        if is_short_word(word):
-            return word + 'e'
-        return word
-
-    suffixes = ['ed', 'edly', 'ing', 'ingly']
-    for suffix in suffixes:
-        if word.endswith(suffix):
-            preceding = word[:-len(suffix)]
-            if re.search(r"[aeiouy]", preceding):
-                return step_1b_helper(preceding)
-            return word
-
-    return word
-
-def step_1c(word):
-    if word.endswith('y') or word.endswith('Y'):
-        if word[-2] not in 'aeiouy':
-            if len(word) > 2:
-                return word[:-1] + 'i'
-    return word
-
-def step_2(word, r1):
-    def step_2_helper(end, repl, prev):
-        if word.endswith(end):
-            if len(word) - len(end) >= r1:
-                if prev == []:
-                    return word[:-len(end)] + repl
-                for p in prev:
-                    if word[:-len(end)].endswith(p):
-                        return word[:-len(end)] + repl
-            return word
-        return None
-
-    triples = [('ization', 'ize', []),
-               ('ational', 'ate', []),
-               ('fulness', 'ful', []),
-               ('ousness', 'ous', []),
-               ('iveness', 'ive', []),
-               ('tional', 'tion', []),
-               ('biliti', 'ble', []),
-               ('lessli', 'less', []),
-               ('entli', 'ent', []),
-               ('ation', 'ate', []),
-               ('alism', 'al', []),
-               ('aliti', 'al', []),
-               ('ousli', 'ous', []),
-               ('iviti', 'ive', []),
-               ('fulli', 'ful', []),
-               ('enci', 'ence', []),
-               ('anci', 'ance', []),
-               ('abli', 'able', []),
-               ('izer', 'ize', []),
-               ('ator', 'ate', []),
-               ('alli', 'al', []),
-               ('bli', 'ble', []),
-               ('ogi', 'og', ['l']),
-               ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])]
-
-    for trip in triples:
-        attempt = step_2_helper(trip[0], trip[1], trip[2])
-        if attempt:
-            return attempt
-
-    return word
-
-def step_3(word, r1, r2):
-    def step_3_helper(end, repl, r2_necessary):
-        if word.endswith(end):
-            if len(word) - len(end) >= r1:
-                if not r2_necessary:
-                    return word[:-len(end)] + repl
-                else:
-                    if len(word) - len(end) >= r2:
-                        return word[:-len(end)] + repl
-            return word
-        return None
-
-    triples = [('ational', 'ate', False),
-               ('tional', 'tion', False),
-               ('alize', 'al', False),
-               ('icate', 'ic', False),
-               ('iciti', 'ic', False),
-               ('ative', '', True),
-               ('ical', 'ic', False),
-               ('ness', '', False),
-               ('ful', '', False)]
-
-    for trip in triples:
-        attempt = step_3_helper(trip[0], trip[1], trip[2])
-        if attempt:
-            return attempt
-
-    return word
-
-def step_4(word, r2):
-    delete_list = ['al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize']
-
-    for end in delete_list:
-        if word.endswith(end):
-            if len(word) - len(end) >= r2:
-                return word[:-len(end)]
-            return word
-
-    if word.endswith('sion') or word.endswith('tion'):
-        if len(word) - 3 >= r2:
-            return word[:-3]
-
-    return word
-
-def step_5(word, r1, r2):
-    if word.endswith('l'):
-        if len(word) - 1 >= r2 and word[-2] == 'l':
-            return word[:-1]
-        return word
-
-    if word.endswith('e'):
-        if len(word) - 1 >= r2:
-            return word[:-1]
-        if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
-            return word[:-1]
-
-    return word
-
-def normalize_ys(word):
-    return word.replace('Y', 'y')
-
-exceptional_forms = {'skis': 'ski',
-                    'skies': 'sky',
-                    'dying': 'die',
-                    'lying': 'lie',
-                    'tying': 'tie',
-                    'idly': 'idl',
-                    'gently': 'gentl',
-                    'ugly': 'ugli',
-                    'early': 'earli',
-                    'only': 'onli',
-                    'singly': 'singl',
-                    'sky': 'sky',
-                    'news': 'news',
-                    'howe': 'howe',
-                    'atlas': 'atlas',
-                    'cosmos': 'cosmos',
-                    'bias': 'bias',
-                    'andes': 'andes'}
-
-exceptional_early_exit_post_1a = ['inning', 'outing', 'canning', 'herring', 'earring', 'proceed', 'exceed', 'succeed']
-
-def stem(word):
-    """The main entry point in the old version of the API."""
-    return Stemmer._stem(word)
-
-def algorithms():
-    """Get a list of the names of the available stemming algorithms.
-
-    The only algorithm currently supported is the "english", or porter2,
-    algorithm.
-    """
-    return ['english']
-
-def version ():
-    """Get the version number of the stemming module.
-
-    This is the version number of the Stemmer module as a whole (not for an
-    individual algorithm).
-    """
-    return '1.0.0'
-
-class Stemmer:
-    """An instance of a stemming algorithm.
-
-    When creating a Stemmer object, there is one required argument:
-    the name of the algorithm to use in the new stemmer. A list of the
-    valid algorithm names may be obtained by calling the algorithms()
-    function in this module. In addition, the appropriate stemming algorithm
-    for a given language may be obtained by using the 2 or 3 letter ISO 639
-    language codes.
-    """
-    max_cache_size = 10000
-
-    def __init__ (self, algorithm, cache_size=None):
-        if algorithm not in ['english', 'eng', 'en']:
-            raise KeyError("Stemming algorithm '%s' not found" % algorithm)
-        if cache_size:
-            self.max_cache_size = cache_size
-
-    def stemWord(self, word):
-        """Stem a word.
-
-        This takes a single argument, word, which should either be a UTF-8
-        encoded string, or a unicode object.
-
-        The result is the stemmed form of the word. If the word supplied
-        was a unicode object, the result will be a unicode object: if the
-        word supplied was a string, the result will be a UTF-8 encoded string.
-        """
-        return Stemmer._stem(word)
-
-    def stemWords(self, words):
-        """Stem a list of words.
-
-        This takes a single argument, words, which must be a sequence,
-        iterator, generator or similar.
-
-        The entries in words should either be UTF-8 encoded strings,
-        or a unicode objects.
-
-        The result is a list of the stemmed forms of the words. If the word
-        supplied was a unicode object, the stemmed form will be a unicode
-        object: if the word supplied was a string, the stemmed form will
-        be a UTF-8 encoded string.
-        """
-        return [self.stemWord(word) for word in words]
-
-    @classmethod
-    def _stem(cls, word):
-        was_unicode = False
-
-        if len(word) <= 2:
-            return word
-
-        if isinstance(word, unicode):
-            was_unicode = True
-            word = word.encode('utf-8')
-
-        word = remove_initial_apostrophe(word)
-
-        # handle some exceptional forms
-        if word in exceptional_forms:
-            return exceptional_forms[word]
-
-        word = capitalize_consonant_ys(word)
-        r1 = get_r1(word)
-        r2 = get_r2(word)
-        word = step_0(word)
-        word = step_1a(word)
-
-        # handle some more exceptional forms
-        if word in exceptional_early_exit_post_1a:
-            return word
-
-        word = step_1b(word, r1)
-        word = step_1c(word)
-        word = step_2(word, r1)
-        word = step_3(word, r1, r2)
-        word = step_4(word, r2)
-        word = step_5(word, r1, r2)
-        word = normalize_ys(word)
-
-        if was_unicode:
-            return word.decode('utf-8')
-        return word
-
-class TestPorter2(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def testModule(self):
-        self.assertEqual(algorithms(), ['english'])
-        self.assertEqual(version(), '1.0.0')
-        self.assertRaises(KeyError, Stemmer, 'porter')
-        self.assertRaises(KeyError, Stemmer, 'random')
-        stemmer = Stemmer('english')
-        stemmer = Stemmer('en')
-        stemmer = Stemmer('eng')
-
-    def testDeprecation(self):
-        self.assertRaises(DeprecationWarning, stem, 'stemming')
-
-    def testGetR1(self):
-        self.assertEqual(get_r1('beautiful'), 5)
-        self.assertEqual(get_r1('beauty'), 5)
-        self.assertEqual(get_r1('beau'), 4)
-        self.assertEqual(get_r1('animadversion'), 2)
-        self.assertEqual(get_r1('sprinkled'), 5)
-        self.assertEqual(get_r1('eucharist'), 3)
-
-        # test exceptional forms
-        self.assertEqual(get_r1('gener'), 5)
-        self.assertEqual(get_r1('generous'), 5)
-        self.assertEqual(get_r1('generousity'), 5)
-        self.assertEqual(get_r1('general'), 5)
-        self.assertEqual(get_r1('generally'), 5)
-        self.assertEqual(get_r1('generality'), 5)
-        self.assertEqual(get_r1('commun'), 6)
-        self.assertEqual(get_r1('communist'), 6)
-        self.assertEqual(get_r1('communal'), 6)
-        self.assertEqual(get_r1('communistic'), 6)
-        self.assertEqual(get_r1('arsen'), 5)
-        self.assertEqual(get_r1('arsenic'), 5)
-        self.assertEqual(get_r1('arsenal'), 5)
-        self.assertEqual(get_r1('arsenality'), 5)
-
-    def testGetR2(self):
-        self.assertEqual(get_r2('beautiful'), 7)
-        self.assertEqual(get_r2('beauty'), 6)
-        self.assertEqual(get_r2('beau'), 4)
-        self.assertEqual(get_r2('animadversion'), 4)
-        self.assertEqual(get_r2('sprinkled'), 9)
-        self.assertEqual(get_r2('eucharist'), 6)
-
-    def testEndsWithShortSyllable(self):
-        self.assertEqual(ends_with_short_syllable(''), False)
-        self.assertEqual(ends_with_short_syllable('rap'), True)
-        self.assertEqual(ends_with_short_syllable('trap'), True)
-        self.assertEqual(ends_with_short_syllable('entrap'), True)
-        self.assertEqual(ends_with_short_syllable('ow'), True)
-        self.assertEqual(ends_with_short_syllable('on'), True)
-        self.assertEqual(ends_with_short_syllable('at'), True)
-        self.assertEqual(ends_with_short_syllable('uproot'), False)
-        self.assertEqual(ends_with_short_syllable('bestow'), False)
-        self.assertEqual(ends_with_short_syllable('disturb'), False)
-
-    def testIsShortWord(self):
-        self.assertEqual(is_short_word(''), False)
-        self.assertEqual(is_short_word('bed'), True)
-        self.assertEqual(is_short_word('shed'), True)
-        self.assertEqual(is_short_word('shred'), True)
-        self.assertEqual(is_short_word('bead'), False)
-        self.assertEqual(is_short_word('embed'), False)
-        self.assertEqual(is_short_word('beds'), False)
-
-    def testRemoveInitialApostrophe(self):
-        self.assertEqual(remove_initial_apostrophe(''), '')
-        self.assertEqual(remove_initial_apostrophe('mike'), 'mike')
-        self.assertEqual(remove_initial_apostrophe('\'mike'), 'mike')
-        self.assertEqual(remove_initial_apostrophe('\'mi\'e'), 'mi\'e')
-        self.assertEqual(remove_initial_apostrophe('\'til'), 'til')
-
-    def testCapitalizeConsonantYs(self):
-        self.assertEqual(capitalize_consonant_ys(''), '')
-        self.assertEqual(capitalize_consonant_ys('mike'), 'mike')
-        self.assertEqual(capitalize_consonant_ys('youth'), 'Youth')
-        self.assertEqual(capitalize_consonant_ys('boy'), 'boY')
-        self.assertEqual(capitalize_consonant_ys('boyish'), 'boYish')
-        self.assertEqual(capitalize_consonant_ys('fly'), 'fly')
-        self.assertEqual(capitalize_consonant_ys('flying'), 'flying')
-        self.assertEqual(capitalize_consonant_ys('syzygy'), 'syzygy')
-        self.assertEqual(capitalize_consonant_ys('sayyid'), 'saYyid')
-
-    def testStep0(self):
-        self.assertEqual(step_0(''), '')
-        self.assertEqual(step_0('mike'), 'mike')
-        self.assertEqual(step_0('dog\'s'), 'dog')
-        self.assertEqual(step_0('dog\'s\''), 'dog')
-        self.assertEqual(step_0('dog\''), 'dog')
-
-    def testStep1a(self):
-        self.assertEqual(step_1a(''), '')
-        self.assertEqual(step_1a('caresses'), 'caress')
-        self.assertEqual(step_1a('sses'), 'ss')
-        self.assertEqual(step_1a('ssesmike'), 'ssesmike')
-        self.assertEqual(step_1a('tied'), 'tie')
-        self.assertEqual(step_1a('cries'), 'cri')
-        self.assertEqual(step_1a('ties'), 'tie')
-        self.assertEqual(step_1a('hurried'), 'hurri')
-        self.assertEqual(step_1a('gas'), 'gas')
-        self.assertEqual(step_1a('this'), 'this')
-        self.assertEqual(step_1a('gaps'), 'gap')
-        self.assertEqual(step_1a('kiwis'), 'kiwi')
-        self.assertEqual(step_1a('bus'), 'bus')
-        self.assertEqual(step_1a('mikeus'), 'mikeus')
-        self.assertEqual(step_1a('mikess'), 'mikess')
-        self.assertEqual(step_1a('truss'), 'truss')
-
-    def testStep1b(self):
-        self.assertEqual(step_1b('', 0), '')
-        self.assertEqual(step_1b('ed', 0), 'ed')
-        self.assertEqual(step_1b('eed', 1), 'eed')
-        self.assertEqual(step_1b('ing', 0), 'ing')
-        self.assertEqual(step_1b('heed', 2), 'heed')
-        self.assertEqual(step_1b('coheed', 2), 'cohee')
-        self.assertEqual(step_1b('coheed', 3), 'cohee')
-        self.assertEqual(step_1b('heedly', 3), 'heedly')
-        self.assertEqual(step_1b('heedly', 0), 'hee')
-        self.assertEqual(step_1b('shred', 0), 'shred')
-        self.assertEqual(step_1b('luxuriated', 0), 'luxuriate')
-        self.assertEqual(step_1b('luxuriatedly', 0), 'luxuriate')
-        self.assertEqual(step_1b('luxuriating', 0), 'luxuriate')
-        self.assertEqual(step_1b('luxuriatingly', 0), 'luxuriate')
-        self.assertEqual(step_1b('disabled', 0), 'disable')
-        self.assertEqual(step_1b('disablingly', 0), 'disable')
-        self.assertEqual(step_1b('cauterizedly', 0), 'cauterize')
-        self.assertEqual(step_1b('cauterizing', 0), 'cauterize')
-        self.assertEqual(step_1b('hopped', 0), 'hop')
-        self.assertEqual(step_1b('clubbing', 0), 'club')
-        self.assertEqual(step_1b('troddedly', 0), 'trod')
-        self.assertEqual(step_1b('puffingly', 0), 'puf')
-        self.assertEqual(step_1b('hagged', 0), 'hag')
-        self.assertEqual(step_1b('spamming', 0), 'spam')
-        self.assertEqual(step_1b('shunnedly', 0), 'shun')
-        self.assertEqual(step_1b('torred', 0), 'tor')
-        self.assertEqual(step_1b('catted', 0), 'cat')
-        self.assertEqual(step_1b('exazzedly', 0), 'exazz')
-        self.assertEqual(step_1b('hoped', 0), 'hope')
-        self.assertEqual(step_1b('hopedly', 0), 'hope')
-        self.assertEqual(step_1b('hoping', 0), 'hope')
-        self.assertEqual(step_1b('hopingly', 0), 'hope')
-        self.assertEqual(step_1b('coped', 0), 'cope')
-
-    def testStep1c(self):
-        self.assertEqual(step_1c(''), '')
-        self.assertEqual(step_1c('cry'), 'cri')
-        self.assertEqual(step_1c('by'), 'by')
-        self.assertEqual(step_1c('say'), 'say')
-        self.assertEqual(step_1c('crY'), 'cri')
-        self.assertEqual(step_1c('bY'), 'bY')
-        self.assertEqual(step_1c('saY'), 'saY')
-
-    def testStep2(self):
-        self.assertEqual(step_2('', 0), '')
-        self.assertEqual(step_2('mike', 0), 'mike')
-        self.assertEqual(step_2('emotional', 2), 'emotion')
-        self.assertEqual(step_2('emotional', 4), 'emotional')
-        self.assertEqual(step_2('fenci', 1), 'fence')
-        self.assertEqual(step_2('fenci', 2), 'fenci')
-        self.assertEqual(step_2('necromanci', 3), 'necromance')
-        self.assertEqual(step_2('necromanci', 7), 'necromanci')
-        self.assertEqual(step_2('disabli', 3), 'disable')
-        self.assertEqual(step_2('disabli', 4), 'disabli')
-        self.assertEqual(step_2('evidentli', 2), 'evident')
-        self.assertEqual(step_2('evidentli', 5), 'evidentli')
-        self.assertEqual(step_2('kaizer', 2), 'kaize')
-        self.assertEqual(step_2('kaizer', 3), 'kaizer')
-        self.assertEqual(step_2('kaization', 2), 'kaize')
-        self.assertEqual(step_2('kaization', 8), 'kaization')
-        self.assertEqual(step_2('operational', 2), 'operate')
-        self.assertEqual(step_2('operational', 5), 'operational')
-        self.assertEqual(step_2('operation', 2), 'operate')
-        self.assertEqual(step_2('operation', 5), 'operation')
-        self.assertEqual(step_2('operator', 2), 'operate')
-        self.assertEqual(step_2('operator', 5), 'operator')
-        self.assertEqual(step_2('rationalism', 3), 'rational')
-        self.assertEqual(step_2('rationalism', 7), 'rationalism')
-        self.assertEqual(step_2('rationaliti', 3), 'rational')
-        self.assertEqual(step_2('rationaliti', 7), 'rationaliti')
-        self.assertEqual(step_2('rationalli', 3), 'rational')
-        self.assertEqual(step_2('rationalli', 7), 'rationalli')
-        self.assertEqual(step_2('gratefulness', 4), 'grateful')
-        self.assertEqual(step_2('gratefulness', 6), 'gratefulness')
-        self.assertEqual(step_2('obviousli', 2), 'obvious')
-        self.assertEqual(step_2('obviousli', 5), 'obviousli')
-        self.assertEqual(step_2('obviousness', 2), 'obvious')
-        self.assertEqual(step_2('obviousness', 5), 'obviousness')
-        self.assertEqual(step_2('responsiveness', 7), 'responsive')
-        self.assertEqual(step_2('responsiveness', 8), 'responsiveness')
-        self.assertEqual(step_2('responsiviti', 3), 'responsive')
-        self.assertEqual(step_2('responsiviti', 10), 'responsiviti')
-        self.assertEqual(step_2('abiliti', 1), 'able')
-        self.assertEqual(step_2('abiliti', 2), 'abiliti')
-        self.assertEqual(step_2('cebli', 2), 'ceble')
-        self.assertEqual(step_2('cebli', 3), 'cebli')
-        self.assertEqual(step_2('apogi', 2), 'apogi')
-        self.assertEqual(step_2('illogi', 2), 'illog')
-        self.assertEqual(step_2('illogi', 4), 'illogi')
-        self.assertEqual(step_2('gracefulli', 4), 'graceful')
-        self.assertEqual(step_2('gracefulli', 6), 'gracefulli')
-        self.assertEqual(step_2('classlessli', 4), 'classless')
-        self.assertEqual(step_2('classlessli', 6), 'classlessli')
-        self.assertEqual(step_2('cali', 0), 'cali')
-        self.assertEqual(step_2('acli', 0), 'ac')
-        self.assertEqual(step_2('acli', 3), 'acli')
-        self.assertEqual(step_2('adli', 0), 'ad')
-        self.assertEqual(step_2('beli', 0), 'be')
-        self.assertEqual(step_2('agli', 2), 'ag')
-        self.assertEqual(step_2('agli', 3), 'agli')
-        self.assertEqual(step_2('thli', 0), 'th')
-        self.assertEqual(step_2('likli', 0), 'lik')
-        self.assertEqual(step_2('homili', 0), 'homili')
-        self.assertEqual(step_2('tamli', 2), 'tam')
-        self.assertEqual(step_2('openli', 0), 'open')
-        self.assertEqual(step_2('earli', 3), 'ear')
-        self.assertEqual(step_2('earli', 4), 'earli')
-        self.assertEqual(step_2('tartli', 2), 'tart')
-
-    def testStep3(self):
-        self.assertEqual(step_3('', 0, 0), '')
-        self.assertEqual(step_3('mike', 0, 0), 'mike')
-        self.assertEqual(step_3('relational', 3, 0), 'relate')
-        self.assertEqual(step_3('relational', 4, 9), 'relational')
-        self.assertEqual(step_3('emotional', 2, 9), 'emotion')
-        self.assertEqual(step_3('emotional', 4, 0), 'emotional')
-        self.assertEqual(step_3('rationalize', 3, 0), 'rational')
-        self.assertEqual(step_3('rationalize',7, 9), 'rationalize')
-        self.assertEqual(step_3('intricate', 2, 9), 'intric')
-        self.assertEqual(step_3('intricate', 7, 0), 'intricate')
-        self.assertEqual(step_3('intriciti', 2, 0), 'intric')
-        self.assertEqual(step_3('intriciti', 5, 9), 'intriciti')
-        self.assertEqual(step_3('intrical', 4, 9), 'intric')
-        self.assertEqual(step_3('intrical', 5, 0), 'intrical')
-        self.assertEqual(step_3('youthful', 4, 0), 'youth')
-        self.assertEqual(step_3('youthful', 6, 0), 'youthful')
-        self.assertEqual(step_3('happiness', 3, 0), 'happi')
-        self.assertEqual(step_3('happiness', 6, 0), 'happiness')
-        self.assertEqual(step_3('decorative', 3, 5), 'decor')
-        self.assertEqual(step_3('decorative', 3, 6), 'decorative')
-        self.assertEqual(step_3('decorative', 6, 5), 'decorative')
-
-    def testStep4(self):
-        self.assertEqual(step_4('', 0), '')
-        self.assertEqual(step_4('mike', 0), 'mike')
-        self.assertEqual(step_4('penal', 3), 'pen')
-        self.assertEqual(step_4('penal', 4), 'penal')
-        self.assertEqual(step_4('pance', 1), 'p')
-        self.assertEqual(step_4('pance', 2), 'pance')
-        self.assertEqual(step_4('dence', 0), 'd')
-        self.assertEqual(step_4('dence', 4), 'dence')
-        self.assertEqual(step_4('header', 3), 'head')
-        self.assertEqual(step_4('header', 5), 'header')
-        self.assertEqual(step_4('graphic', 5), 'graph')
-        self.assertEqual(step_4('graphic', 6), 'graphic')
-        self.assertEqual(step_4('table', 0), 't')
-        self.assertEqual(step_4('table', 2), 'table')
-        self.assertEqual(step_4('quible', 1), 'qu')
-        self.assertEqual(step_4('quible', 3), 'quible')
-        self.assertEqual(step_4('recant', 1), 'rec')
-        self.assertEqual(step_4('recant', 5), 'recant')
-        self.assertEqual(step_4('lement', 0), 'l')
-        self.assertEqual(step_4('lement', 2), 'lement')
-        self.assertEqual(step_4('ment', 0), '')
-        self.assertEqual(step_4('ment', 1), 'ment')
-        self.assertEqual(step_4('ent', 0), '')
-        self.assertEqual(step_4('ent', 2), 'ent')
-        self.assertEqual(step_4('schism', 3), 'sch')
-        self.assertEqual(step_4('schism', 4), 'schism')
-        self.assertEqual(step_4('kate', 1), 'k')
-        self.assertEqual(step_4('kate', 2), 'kate')
-        self.assertEqual(step_4('citi', 0), 'c')
-        self.assertEqual(step_4('citi', 3), 'citi')
-        self.assertEqual(step_4('lous', 1), 'l')
-        self.assertEqual(step_4('lous', 2), 'lous')
-        self.assertEqual(step_4('hive', 0), 'h')
-        self.assertEqual(step_4('hive', 3), 'hive')
-        self.assertEqual(step_4('ize', 0), '')
-        self.assertEqual(step_4('ize', 1), 'ize')
-
-    def testStep5(self):
-        self.assertEqual(step_5('mik', 0, 0), 'mik')
-        self.assertEqual(step_5('mike', 5, 3), 'mik')
-        self.assertEqual(step_5('mike', 5, 4), 'mike')
-        self.assertEqual(step_5('mike', 3, 4), 'mike')
-        self.assertEqual(step_5('mixe', 3, 4), 'mix')
-        self.assertEqual(step_5('recall', 7, 5), 'recal')
-        self.assertEqual(step_5('recal', 0, 4), 'recal')
-        self.assertEqual(step_5('recall', 0, 6), 'recall')
-
-    def testNormalizeYs(self):
-        self.assertEqual(normalize_ys(''), '')
-        self.assertEqual(normalize_ys('mike'), 'mike')
-        self.assertEqual(normalize_ys('syzygy'), 'syzygy')
-        self.assertEqual(normalize_ys('sYzygY'), 'syzygy')
-        self.assertEqual(normalize_ys('MiKe'), 'MiKe')
-        self.assertEqual(normalize_ys('MDirYol'), 'MDiryol')
-
-    def testStem(self):
-        stemmer = Stemmer('english')
-        self.assertEqual(stemmer.stemWord(''), '')
-
-        # some normal case tests
-        self.assertEqual(stemmer.stemWord('mike'), 'mike')
-        self.assertEqual(stemmer.stemWord('consign'), 'consign')
-        self.assertEqual(stemmer.stemWord('consigned'), 'consign')
-        self.assertEqual(stemmer.stemWord('consigning'), 'consign')
-        self.assertEqual(stemmer.stemWord('consignment'), 'consign')
-        self.assertEqual(stemmer.stemWord('consist'), 'consist')
-        self.assertEqual(stemmer.stemWord('consisted'), 'consist')
-        self.assertEqual(stemmer.stemWord('consistency'), 'consist')
-        self.assertEqual(stemmer.stemWord('consistent'), 'consist')
-        self.assertEqual(stemmer.stemWord('consistently'), 'consist')
-        self.assertEqual(stemmer.stemWord('consisting'), 'consist')
-        self.assertEqual(stemmer.stemWord('consists'), 'consist')
-
-        # exceptionalWord form tests
-        self.assertEqual(stemmer.stemWord('skis'), 'ski')
-        self.assertEqual(stemmer.stemWord('skies'), 'sky')
-        self.assertEqual(stemmer.stemWord('dying'), 'die')
-        self.assertEqual(stemmer.stemWord('lying'), 'lie')
-        self.assertEqual(stemmer.stemWord('tying'), 'tie')
-        self.assertEqual(stemmer.stemWord('idly'), 'idl')
-        self.assertEqual(stemmer.stemWord('gently'), 'gentl')
-        self.assertEqual(stemmer.stemWord('ugly'), 'ugli')
-        self.assertEqual(stemmer.stemWord('early'), 'earli')
-        self.assertEqual(stemmer.stemWord('only'), 'onli')
-        self.assertEqual(stemmer.stemWord('singly'), 'singl')
-        self.assertEqual(stemmer.stemWord('sky'), 'sky')
-        self.assertEqual(stemmer.stemWord('news'), 'news')
-        self.assertEqual(stemmer.stemWord('howe'), 'howe')
-        self.assertEqual(stemmer.stemWord('atlas'), 'atlas')
-        self.assertEqual(stemmer.stemWord('cosmos'), 'cosmos')
-        self.assertEqual(stemmer.stemWord('bias'), 'bias')
-        self.assertEqual(stemmer.stemWord('andes'), 'andes')
-        self.assertEqual(stemmer.stemWord('innings'), 'inning')
-        self.assertEqual(stemmer.stemWord('outing'), 'outing')
-        self.assertEqual(stemmer.stemWord('canninger'), 'canning')
-        self.assertEqual(stemmer.stemWord('herrings'), 'herring')
-        self.assertEqual(stemmer.stemWord('earring'), 'earring')
-        self.assertEqual(stemmer.stemWord('proceeder'), 'proceed')
-        self.assertEqual(stemmer.stemWord('exceeding'), 'exceed')
-        self.assertEqual(stemmer.stemWord('succeeds'), 'succeed')
-
-        # hardcore test
-        infile = open('./en-voc.txt', 'r')
-        outfile = open('./en-stemmedvoc.txt', 'r')
-        while True:
-            word = infile.readline()
-            output = outfile.readline()
-            if word == '':
-                break
-            word = word[:-1]
-            output = output[:-1]
-            self.assertEqual(stemmer.stemWord(word), output)
-
-if __name__ == '__main__':
-    unittest.main()

File search/.hgignore

+syntax: glob
+.project
+.pydevproject
+.settings
+*~
+*.orig
+*.pyc
+*.pyo
+*.swp
+*.tmp
+_generated_media
+.dynamic_media
+desktop.ini
+settings_overrides.py
+nbproject
+django

File search/__init__.py

Empty file added.

File search/__init__.pyc

Binary file added.

File search/backends/__init__.py

Empty file added.

File search/backends/gae_background_tasks.py

+from django.conf import settings
+from django.db import models
+from google.appengine.ext import deferred
+
+default_search_queue = getattr(settings, 'DEFAULT_SEARCH_QUEUE', 'default')
+
+def update_relation_index(search_index_field, parent_pk, delete):
+    # pass only the field / model names to the background task to transfer less
+    # data
+    app_label = search_index_field.model_class._meta.app_label
+    object_name = search_index_field.model_class._meta.object_name
+    deferred.defer(update, app_label, object_name, search_index_field.name,
+        parent_pk, delete, _queue=default_search_queue)
+
+def update(app_label, object_name, field_name, parent_pk, delete):
+    model = models.get_model(app_label, object_name)
+    update_property = model._meta.get_field_by_name(field_name)[0]
+    update_property.update_relation_index(parent_pk, delete)

File search/backends/immediate_update.py

+def update_relation_index(search_index_field, parent_pk, delete):
+    search_index_field.update_relation_index(parent_pk, delete)

File search/core.py

+from django.conf import settings
+from django.core.exceptions import ObjectDoesNotExist
+from django.db import models
+from django.db.models import signals
+from djangotoolbox.fields import ListField
+from djangotoolbox.utils import getattr_by_path
+from copy import copy
+import re
+import string
+
+_PUNCTUATION_REGEX = re.compile(
+    '[' + re.escape(string.punctuation.replace('-', '').replace(
+        '_', '').replace('#', '')) + ']')
+_PUNCTUATION_SEARCH_REGEX = re.compile(
+    '[' + re.escape(string.punctuation.replace('_', '').replace(
+        '#', '')) + ']')
+
+# Various base indexers
+def startswith(words, indexing, **kwargs):
+    """Allows for word prefix search."""
+    if not indexing:
+        # In search mode we simply match search terms exactly
+        return words
+    # In indexing mode we add all prefixes ('h', 'he', ..., 'hello')
+    result = []
+    for word in words:
+        result.extend([word[:count].strip(u'-')
+                       for count in range(1, len(word)+1)])
+    return result
+
+def porter_stemmer(words, language, **kwargs):
+    """Porter-stemmer in various languages."""
+    languages = [language,]
+    if '-' in language:
+        languages.append(language.split('-')[0])
+
+    # Fall back to English
+    languages.append('en')
+
+    # Find a stemmer for this language
+    for language in languages:
+        try:
+            stem = __import__('search.porter_stemmers.%s' % language,
+                                 {}, {}, ['']).stem
+        except:
+            continue
+        break
+
+    result = []
+    for word in words:
+        result.append(stem(word))
+    return result
+
+stop_words = {
+    'en': set(('a', 'an', 'and', 'or', 'the', 'these', 'those', 'whose', 'to')),
+    'de': set(('ein', 'eine', 'eines', 'einer', 'einem', 'einen', 'den',
+               'der', 'die', 'das', 'dieser', 'dieses', 'diese', 'diesen',
+               'deren', 'und', 'oder'))
+}
+
+def get_stop_words(language):
+    if language not in stop_words and '-' in language:
+        language = language.split('-', 1)[0]
+    return stop_words.get(language, set())
+
+def non_stop(words, indexing, language, **kwargs):
+    """Removes stop words from search query."""
+    if indexing:
+        return words
+    return list(set(words) - get_stop_words(language))
+
+def porter_stemmer_non_stop(words, **kwargs):
+    """Combines porter_stemmer with non_stop."""
+    return porter_stemmer(non_stop(words, **kwargs), **kwargs)
+
+# Language handler
+def site_language(instance, **kwargs):
+    """The default language handler tries to determine the language from
+    fields in the model instance."""
+
+    # Check if there's a language attribute
+    if hasattr(instance, 'language'):
+        return instance.language
+    if hasattr(instance, 'lang'):
+        return instance.lang
+
+    # Fall back to default language
+    return settings.LANGUAGE_CODE
+
+def default_splitter(text, indexing=False, **kwargs):
+    """
+    Returns a list of the keywords contained in the given text. All
+    characters other than letters, numbers, and '_' are split characters.
+    The character '-' is a special case: two words separated by '-'
+    create an additional keyword consisting of both words without
+    separation (see the examples).
+
+    Examples:
+    - text='word1/word2 word3'
+      returns ['word1', 'word2', 'word3']
+    - text='word1/word2-word3'
+      returns ['word1', 'word2', 'word3', 'word2word3']
+    """
+    if not text:
+        return []
+    if not indexing:
+        return _PUNCTUATION_SEARCH_REGEX.sub(u' ', text.lower()).split()
+    keywords = []
+    for word in set(_PUNCTUATION_REGEX.sub(u' ', text.lower()).split()):
+        if not word:
+            continue
+        if '-' not in word:
+            keywords.append(word)
+        else:
+            keywords.extend(get_word_combinations(word))
+    return keywords
+
def get_word_combinations(word):
    """
    Joins every contiguous run of the '-'-separated parts of ``word``,
    shortest runs first:

    'one-two-three'
    =>
    ['one', 'two', 'three', 'onetwo', 'twothree', 'onetwothree']
    """
    parts = [piece for piece in word.split(u'-') if piece]
    total = len(parts)
    combos = []
    for run_length in range(1, total + 1):
        for start in range(total - run_length + 1):
            combos.append(u''.join(parts[start:start + run_length]))
    return combos
+
class DictEmu(object):
    """Wraps an object so its attributes can be read with dict-style
    subscription: DictEmu(obj)['attr'] == obj.attr."""

    def __init__(self, data):
        self.data = data

    def __getitem__(self, key):
        return getattr(self.data, key)
+
class StringListField(ListField):
    """A ListField whose items are stored as strings
    (CharField with max_length=500)."""

    def __init__(self, *args, **kwargs):
        # TODO: provide some property in the settings which tells us which
        # model field to use for field type in order to let other backends
        # use other max_lengths,...
        kwargs['field_type'] = models.CharField(max_length=500)
        super(StringListField, self).__init__(*args, **kwargs)

    def contribute_to_class(self, cls, name):
        # XXX: Use contribute_to_class in order to add the model_class to the field
        self.model_class = cls
        super(StringListField, self).contribute_to_class(cls, name)
+
class SearchableListField(StringListField):
    """
    This is basically a string ListField with search support.
    """
    def filter(self, values):
        """Returns a queryset with an '=' filter on this field for each
        given value. Additional filters can be applied afterwards via
        chaining."""
        if not isinstance(values, (tuple, list)):
            values = (values,)
        filtered = self.model_class.objects.all()
        for value in set(values):
            filtered = filtered.filter(**{self.name: value})
        return filtered

    def search(self, query, indexer=None, splitter=None,
            language=settings.LANGUAGE_CODE):
        """Splits (and optionally post-processes via ``indexer``) the query
        string and returns a queryset matching all resulting keywords."""
        if not splitter:
            splitter = default_splitter
        words = splitter(query, indexing=False, language=language)
        if indexer:
            words = indexer(words, indexing=False, language=language)
        # Optimize query: with enough keywords we can afford to drop
        # stop words and still get meaningful results.
        words = set(words)
        if len(words) >= 4:
            words -= get_stop_words(language)
        # Don't allow empty queries
        if not words and query:
            # A non-empty query that yielded no keywords can never match
            # anything; filter on a single space (never indexed) so the
            # query returns no results.
            # Fixed: QuerySet.filter() takes the lookup as **kwargs, not
            # as a positional dict (cf. self.filter above).
            return self.filter(()).filter(**{self.name: ' '})
        return self.filter(sorted(words))
+
class SearchIndexField(SearchableListField):
    """
    Simple full-text index for the given fields.

    If "relation_index" is True the index will be stored in a separate entity.

    With "integrate" you can add fields to your relation index,
    so they can be searched, too.

    With "filters" you can specify when a values index should be created.
    """
    # TODO: filters has to be extended (maybe a function) to allow Django's
    # QuerySet methods like exclude
    def __init__(self, fields_to_index, indexer=None, splitter=default_splitter,
            relation_index=True, integrate='*', filters={},
            language=site_language, **kwargs):
        # NOTE(review): mutable default for "filters" — safe only as long
        # as no caller mutates the dict; consider filters=None in a
        # follow-up change.
        if integrate is None:
            integrate = ()
        if integrate == '*' and not relation_index:
            # Integrating everything only makes sense with a relation index.
            integrate = ()
        if isinstance(fields_to_index, basestring):
            fields_to_index = (fields_to_index,)
        self.fields_to_index = fields_to_index
        if isinstance(integrate, basestring):
            integrate = (integrate,)
        self.filters = filters
        self.integrate = integrate
        self.splitter = splitter
        self.indexer = indexer
        self.language = language
        self.relation_index = relation_index
        if len(fields_to_index) == 0:
            raise ValueError('No fields specified for index!')
        super(SearchIndexField, self).__init__(**kwargs)

    def should_index(self, values):
        """Returns True if an index should exist for the given mapping of
        field values, i.e. values are present and every entry in
        self.filters matches. Raises ValueError on an unknown operator."""
        # Check if filter doesn't match
        if not values:
            return False
        for filter, value in self.filters.items():
            # "attr__op" style lookups; bare names mean an exact match.
            attr, op = filter, 'exact'
            if '__' in filter:
                attr, op = filter.rsplit('__', 1)
            op = op.lower()
            if (op == 'exact' and values[attr] != value or
#                    op == '!=' and values[attr] == value or
                    op == 'in' and values[attr] not in value or
                    op == 'lt' and values[attr] >= value or
                    op == 'lte' and values[attr] > value or
                    op == 'gt' and values[attr] <= value or
                    op == 'gte' and values[attr] < value):
                return False
            elif op not in ('exact', 'in', 'lt', 'lte', 'gte', 'gt'):
                raise ValueError('Invalid search index filter: %s %s' % (filter, value))
        return True

#    @commit_locked
    def update_relation_index(self, parent_pk, delete=False):
        """Synchronizes the relation index entity for the parent instance
        with the given primary key: creates, updates, or deletes it."""
        model = self._relation_index_model
        try:
            index = model.objects.get(pk=parent_pk)
        except ObjectDoesNotExist:
            index = None

        if not delete:
            try:
                parent = self.model_class.objects.get(pk=parent_pk)
            except ObjectDoesNotExist:
                parent = None

            values = None
            if parent:
                values = self.get_index_values(parent)

        # Remove index if it's not needed, anymore
        # (when delete is True, "values" is unbound, but short-circuit
        # evaluation keeps should_index from ever seeing it)
        if delete or not self.should_index(values):
            if index:
                index.delete()
            return

        # Update/create index
        if not index:
            index = model(pk=parent_pk, **values)

        # This guarantees that we also set virtual @properties
        for key, value in values.items():
            setattr(index, key, value)

        index.save()

    def create_index_model(self):
        """Dynamically builds the RelationIndex_* model class that stores a
        copy of the integrated fields plus the actual keyword index, and
        assigns it to self._relation_index_model."""
        attrs = dict(__module__=self.__module__)
        # By default we integrate everything when using relation index
        if self.relation_index and self.integrate == ('*',):
            self.integrate = tuple(field.name
                                   for field in self.model_class._meta.fields
                                   if not isinstance(field, SearchIndexField))

        for field_name in self.integrate:
            field = self.model_class._meta.get_field_by_name(field_name)[0]
            field = copy(field)
            attrs[field_name] = field
            if hasattr(field, 'related_name'):
                # Give copied relations a unique related_name so they don't
                # clash with the reverse accessor of the original model.
                attrs[field_name].related_name = '_sidx_%s_%s_set_' % (
                    self.model_class._meta.object_name.lower(),
                    self.name,
                )
        index_name = self.name
        # The index model gets its own non-relation SearchIndexField under
        # the same name, so searches hit the copied data directly.
        attrs[index_name] = SearchIndexField(self.fields_to_index,
            splitter=self.splitter, indexer=self.indexer,
            language=self.language, relation_index=False)
        if self.relation_index:
            owner = self
            def __init__(self, *args, **kwargs):
                # Save some space: don't copy the whole indexed text into the
                # relation index field unless the field gets integrated.
                field_names = [field.name for field in self._meta.fields]
                owner_field_names = [field.name
                                     for field in owner.model_class._meta.fields]
                for key, value in kwargs.items():
                    if key in field_names or key not in owner_field_names:
                        continue
                    setattr(self, key, value)
                    del kwargs[key]
                models.Model.__init__(self, *args, **kwargs)
            attrs['__init__'] = __init__
            self._relation_index_model = type(
                'RelationIndex_%s_%s_%s' % (self.model_class._meta.app_label,
                                            self.model_class._meta.object_name,
                                            self.name),
                (models.Model,), attrs)

    def get_index_values(self, model_instance):
        """Collects {field_name: value} for every field the index needs:
        the indexed fields, the integrated fields, and the attributes
        referenced by self.filters."""
        filters = []
        for filter in self.filters.keys():
            if '__' in filter:
                filters.append(filter.rsplit('__')[0])
            else:
                filters.append(filter)
        filters = tuple(filters)
        values = {}
        for field_name in set(self.fields_to_index + self.integrate + filters):
            instance = self.model_class._meta.get_field_by_name(field_name)[0]
            if isinstance(instance, models.ForeignKey):
                # Store the raw foreign key value instead of loading the
                # related object.
                value = instance.pre_save(model_instance, False)
            else:
                value = getattr(model_instance, field_name)
            if field_name == self.fields_to_index[0] and \
                    isinstance(value, (list, tuple)):
                value = sorted(value)
            values[field_name] = value
        return values

    def pre_save(self, model_instance, add):
        """Builds and stores the sorted keyword list for the instance.
        Returns [] when the filters don't match or when a relation index
        handles indexing instead."""
        if self.filters and not self.should_index(DictEmu(model_instance)) \
                or self.relation_index:
            return []

        # self.language may be a per-instance handler (e.g. site_language).
        language = self.language
        if callable(language):
            language = language(model_instance, property=self)

        index = []
        for field in self.fields_to_index:
            values = getattr_by_path(model_instance, field, None)
            if not values:
                values = ()
            elif not isinstance(values, (list, tuple)):
                values = (values,)
            for value in values:
                index.extend(self.splitter(value, indexing=True, language=language))
        if self.indexer:
            index = self.indexer(index, indexing=True, language=language)
        # Sort index to make debugging easier
        setattr(model_instance, self.name, sorted(set(index)))
        return index

    def contribute_to_class(self, cls, name):
        """Additionally installs a manager-like "Indexes" object on the model
        class under the field's name, exposing a search() shortcut."""
        attrs = {name:self}
        def search(self, query, language=settings.LANGUAGE_CODE):
            return getattr(self, name).search(query, language)
        attrs['search'] = search
        setattr(cls, name, type('Indexes', (models.Manager, ), attrs)())
        super(SearchIndexField, self).contribute_to_class(cls, name)

    def search(self, query, language=settings.LANGUAGE_CODE):
        """Runs a search. With a relation index this queries the separate
        index model for matching pks and wraps them in a
        RelationIndexQuery; otherwise falls back to the inherited
        in-entity search."""
        if self.relation_index:
            items = self._relation_index_model._meta.get_field_by_name(
                self.name)[0].search(query, language=language).values('pk')
            return RelationIndexQuery(self, items)
        return super(SearchIndexField, self).search(query, splitter=self.splitter,
            indexer=self.indexer, language=language)
+
def post(delete, sender, instance, **kwargs):
    """Shared body of the post_save/post_delete receivers: asks the
    configured backend to refresh (or remove, when ``delete`` is True)
    the relation index of every relation-indexed SearchIndexField on
    the sender model."""
    for field in sender._meta.fields:
        if isinstance(field, SearchIndexField) and field.relation_index:
            load_backend().update_relation_index(field, instance.pk, delete)
+
def post_save(sender, instance, **kwargs):
    """post_save receiver: refresh the instance's relation indexes."""
    post(False, sender, instance, **kwargs)
+
def post_delete(sender, instance, **kwargs):
    """post_delete receiver: remove the instance's relation indexes."""
    post(True, sender, instance, **kwargs)
+
def load_backend():
    """Imports and returns the search backend module named by
    settings.BACKEND (defaults to the immediate-update backend)."""
    path = getattr(settings, 'BACKEND', 'search.backends.immediate_update')
    # A non-empty fromlist makes __import__ return the submodule itself
    # rather than the top-level package.
    fromlist = [path.rsplit('.', 1)[1]] if '.' in path else []
    return __import__(path, globals(), locals(), fromlist)
+
def install_index_model(sender, **kwargs):
    """class_prepared receiver: builds the relation index models for the
    sender and, when at least one exists, wires up the save/delete
    signals that keep them in sync."""
    relation_fields = [field for field in sender._meta.fields
                       if isinstance(field, SearchIndexField) and
                          field.relation_index]
    for field in relation_fields:
        field.create_index_model()
    if relation_fields:
        signals.post_save.connect(post_save, sender=sender)
        signals.post_delete.connect(post_delete, sender=sender)
signals.class_prepared.connect(install_index_model)
+
# TODO: Refactor QueryTraits using Django's QuerySet
class QueryTraits(object):
    """Mixin adding iteration, len() and get() on top of a subclass's
    __getitem__ and count() implementations."""

    def __iter__(self):
        # Iterate over at most the first 301 results.
        return iter(self[:301])

    def __len__(self):
        return self.count()

    def get(self, *args, **kwargs):
        """Returns the first result, or raises ObjectDoesNotExist when
        there is none. Extra arguments are accepted but ignored."""
        head = self[:1]
        if not head:
            raise ObjectDoesNotExist
        return head[0]
+
class RelationIndexQuery(QueryTraits):
    """Lazy query against a relation index: slicing resolves matching
    index pks back to instances of the indexed model."""

    def __init__(self, field, query):
        self.model = field.model_class
        self.field = field
        self.query = query

    def order(self, *args, **kwargs):
        self.query = self.query.order(*args, **kwargs)

    def filter(self, *args, **kwargs):
        self.query = self.query.filter(*args, **kwargs)
        return self

    def __getitem__(self, index):
        # The underlying query may yield model instances or {'pk': ...}
        # dicts (from .values('pk')); extract the pk either way.
        pks = []
        for entry in self.query[index]:
            if isinstance(entry, models.Model):
                pks.append(entry.pk)
            else:
                pks.append(entry['pk'])
        return [item for item in self.model.objects.filter(pk__in=pks) if item]

    def count(self):
        return self.query.count()

    # TODO: add keys_only query
#    def values(self, fields):
#        pass
+
+    # TODO: add keys_only query
+#    def values(self, fields):
+#        pass

File search/models.py

Empty file added.

File search/models.pyc

Binary file added.

File search/porter_stemmers/__init__.py

Empty file added.

File search/porter_stemmers/de.py

+# -*- coding: utf-8 -*-
+#   Eine Pythonimplementation des Porter-Stemmers für Deutsch (Orginal unter http://snowball.tartarus.org/texts/germanic.html)
+#
+#   Modifiziert/optimiert/gefixt von Waldemar Kornewald
+#
+#   Ersteller dieser Version: (c) by kristall 'ät' c-base.org       http://kristall.crew.c-base.org/porter_de.py
+#
+#   Der Algorithmus in (englischem) Prosa unter http://snowball.tartarus.org/algorithms/german/stemmer.html
+#
+#   Wikipedia zum Porter-Stemmer: http://de.wikipedia.org/wiki/Porter-Stemmer-Algorithmus
+#
+#   Lizenz: Diese Software steht unter der BSD License (siehe http://www.opensource.org/licenses/bsd-license.html).
+#   Ursprünglicher Autor: (c) by Dr. Martin Porter 
+#
+#
+###
+
+#   Wer mit Strings arbeitet, sollte dieses Modul laden
+import string
+
#   The stop list: words in this tuple are not stemmed when stop=True is
#   passed to stem().
stopliste = (u'aber', u'alle', u'allem', u'allen', u'aller', u'alles', u'als', u'also', u'am', u'an', u'ander', u'andere', u'anderem',
        u'anderen', u'anderer', u'anderes', u'anderm', u'andern', u'anders', u'auch', u'auf', u'aus', u'bei', u'bin', u'bis', u'bist',
        u'da', u'damit', u'dann', u'der', u'den', u'des', u'dem', u'die', u'das', u'dass', u'daß', u'derselbe', u'derselben', u'denselben',
        u'desselben', u'demselben', u'dieselbe', u'dieselben', u'dasselbe', u'dazu', u'dein', u'deine', u'deinem', u'deinen', u'deiner',
        u'deines', u'denn', u'derer', u'dessen', u'dich', u'dir', u'du', u'dies', u'diese', u'diesem', u'diesen', u'dieser', u'dieses',
        u'doch', u'dort', u'durch', u'ein', u'eine', u'einem', u'einen', u'einer', u'eines', u'einig', u'einige', u'einigem', u'einigen', 
        u'einiger', u'einiges', u'einmal', u'er', u'ihn', u'ihm', u'es', u'etwas', u'euer', u'eure', u'eurem', u'euren', u'eurer', u'eures',
        u'für', u'gegen', u'gewesen', u'hab', u'habe', u'haben', u'hat', u'hatte', u'hatten', u'hier', u'hin', u'hinter', u'ich', u'mich',
        u'mir', u'ihr', u'ihre', u'ihrem', u'ihren', u'ihrer', u'ihres', u'euch', u'im', u'in', u'indem', u'ins', u'ist', u'jede', u'jedem',
        u'jeden', u'jeder', u'jedes', u'jene', u'jenem', u'jenen', u'jener', u'jenes', u'jetzt', u'kann', u'kein', u'keine', u'keinem', 
        u'keinen', u'keiner', u'keines', u'können', u'könnte', u'machen', u'man', u'manche', u'manchem', u'manchen', u'mancher', 
        u'manches', u'mein', u'meine', u'meinem', u'meinen', u'meiner', u'meines', u'mit', u'muss', u'musste', u'muß', u'mußte', u'nach',
        u'nicht', u'nichts', u'noch', u'nun', u'nur', u'ob', u'oder', u'ohne', u'sehr', u'sein', u'seine', u'seinem', u'seinen', u'seiner',
        u'seines', u'selbst', u'sich', u'sie', u'ihnen', u'sind', u'so', u'solche', u'solchem', u'solchen', u'solcher', u'solches', u'soll',
        u'sollte', u'sondern', u'sonst', u'über', u'um', u'und', u'uns', u'unse', u'unsem', u'unsen', u'unser', u'unses', u'unter', u'viel',
        u'vom', u'von', u'vor', u'während', u'war', u'waren', u'warst', u'was', u'weg', u'weil', u'weiter', u'welche', u'welchem', 
        u'welchen', u'welcher', u'welches', u'wenn', u'werde', u'werden', u'wie', u'wieder', u'will', u'wir', u'wird', u'wirst', u'wo',
        u'wollem', u'wollte', u'würde', u'würden', u'zu', u'zum', u'zur', u'zwar', u'zwischen')
+
+#   Die Funktion stem nimmt ein Wort und versucht dies durch Regelanwendung zu verkürzen. Wenn Stop auf 'True' gesetzt wird, werden Wörter in der Stopliste nicht 'gestemmt'.
+def stem(wort, stop=True):
+    #   ACHTUNG: für den Stemmer gilt 'y' als Vokal.
+    vokale = u'aeiouyäüö'
+    #   ACHTUNG: 'U' und 'Y' gelten als Konsonaten.
+    konsonanten = u'bcdfghjklmnpqrstvwxzßUY'
+    #   Konsonanten die vor einer 's'-Endung stehen dürfen.
+    s_endung = u'bdfghklmnrt'
+    #   Konsonanten die vor einer 'st'-Endung stehen dürfen.
+    st_endung = u'bdfghklmnt'
+    #   Zu r1 & r2 siehe http://snowball.tartarus.org/texts/r1r2.html, p1 & p2 sind die Start'p'ositionen von r1 & r2 im String
+    r1 = u''
+    p1 = 0
+    r2 = u''
+    p2 = 0
+    #   Wortstämme werden klein geschrieben
+    wort = wort.lower()
+    #   Wenn 'stop' und Wort in Stopliste gib 'wort' zurück 
+    if stop == True and wort in stopliste:
+        return end_stemming(wort.replace(u'ß', u'ss'))
+    # Ersetze alle 'ß' durch 'ss'
+    wort = wort.replace(u'ß', u'ss')
+    #   Schützenswerte 'u' bzw. 'y' werden durch 'U' bzw. 'Y' ersetzt
+    for e in map(None, wort, range(len(wort))):
+        if e[1] == 0: continue
+        if u'u' in e:
+            try:
+                if ((wort[(e[1]-1)] in vokale) and (wort[(e[1]+1)] in vokale)): wort = wort[:e[1]] + u'U' + wort[(e[1]+1):]
+            except : pass
+        if  u'y' in e:
+            try:
+                if ((wort[(e[1]-1)] in vokale) and (wort[(e[1]+1)] in vokale)): wort = wort[:e[1]] + u'Y' + wort[(e[1]+1):]
+            except: pass
+    #   r1, r2, p1 & p2 werden mit Werten belegt
+    try:
+        Bedingung = False
+        for e in map(None, wort, range(len(wort))):
+            if e[0] in vokale: Bedingung = True
+            if ((e[0] in konsonanten) and (Bedingung)):
+                p1 = e[1] + 1 
+                r1 = wort[p1:]
+                break
+        Bedingung = False
+        for e in map(None, r1, range(len(r1))):
+            if e[0] in vokale: Bedingung = True
+            if ((e[0] in konsonanten) and (Bedingung)):
+                p2 = e[1] + 1 
+                r2 = r1[p2:]
+                break
+        if ((p1 < 3)and(p1 > 0)):
+            p1 = 3
+            r1 = wort[p1:]
+        if p1 == 0:
+            return end_stemming(wort)
+    except: pass
+    #   Die Schritte 1 bis 3 d) 'stemmen' das übergebene Wort. 
+    #   Schritt 1
+    eSuffixe_1 = [u'e', u'em', u'en', u'ern', u'er', u'es']
+    eSonst_1 = [u's']
+    try:
+        for e in eSuffixe_1:
+            if e in r1[-(len(e)):]:
+                wort = wort[:-(len(e))]
+                r1 = r1[:-(len(e))]
+                r2 = r2[:-(len(e))]
+                break
+        else:
+            if r1[-1] in eSonst_1:
+                if wort[-2] in s_endung:
+                    wort = wort[:-1]
+                    r1 = r1[:-1]
+                    r2 = r2[:-1]
+    except: pass
+    #   Schritt 2
+    eSuffixe_2 = [u'est', u'er', u'en']
+    eSonst_2 = [u'st']
+    try:
+        for e in eSuffixe_2:
+            if e in r1[-len(e):]:
+                wort = wort[:-len(e)]
+                r1 = r1[:-len(e)]
+                r2 = r2[:-len(e)]
+                break
+        else:
+            if r1[-2:] in eSonst_2:             
+                if wort[-3] in st_endung:
+                    if len(wort) > 5:
+                        wort = wort[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+    except:pass
+    #   Schritt 3 a)
+    dSuffixe_1 = [u'end', u'ung']
+    try: