# Hosting-page breadcrumb: Luke Plant / sphinx
#
# sphinx / sphinx / search.py

# -*- coding: utf-8 -*-

"""
    Create a search index for offline search.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD.
"""
import re
import pickle

from docutils.nodes import Text, NodeVisitor

from sphinx.util.stemmer import PorterStemmer
from sphinx.util.json import dump_json, load_json

# Matches runs of word characters. The UNICODE flag is passed explicitly
# rather than as a trailing inline ``(?u)``: global inline flags that are
# not at the start of the pattern are rejected by newer Python versions.
word_re = re.compile(r'\w+', re.UNICODE)

class Stemmer(PorterStemmer):
    """
    All those porter stemmer implementations look hideous.
    make at least the stem method nicer.
    """

    def stem(self, word):
        """Return the stem of *word*.

        Thin wrapper around `PorterStemmer.stem` so callers only have to
        pass the word itself; ``0`` and ``len(word) - 1`` are forwarded as
        the two extra positional arguments (presumably the first and last
        index of the span to stem -- confirm against `PorterStemmer`).
        """
        return PorterStemmer.stem(self, word, 0, len(word) - 1)

class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    """

    def __init__(self, document):
        """Set up the visitor for *document* with an empty word list."""
        NodeVisitor.__init__(self, document)
        # Words found so far, in visiting order (duplicates are kept).
        self.found_words = []

    def dispatch_visit(self, node):
        """Collect the words of *node* if it is a plain `Text` node.

        Nodes of any other class are ignored. The check is an exact class
        comparison, so subclasses of `Text` are not collected.
        """
        if node.__class__ is Text:
            # NOTE(review): the body of this branch was missing from the
            # source; restored as "split the node text with ``word_re`` and
            # remember every word", which is what `IndexBuilder.feed`
            # expects to find in `found_words`.
            self.found_words.extend(word_re.findall(node.astext()))

class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # format name -> (dumper, loader). `dump` calls the first callable,
    # `load` the second.
    formats = {
        'json':     (dump_json, load_json),
        'pickle':   (pickle.dumps, pickle.loads),
    }

    def __init__(self):
        """Create an empty index."""
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}

    def load(self, stream, format):
        """Reconstruct from frozen data.

        :param stream: object with a ``read()`` method returning the
                       serialized index.
        :param format: key into `formats` (``'json'`` or ``'pickle'``).

        The frozen data is the three-item list produced by `freeze`:
        filenames, titles, and a word -> list-of-filename-indices mapping.
        """
        # NOTE(security): with the 'pickle' format this unpickles whatever
        # the stream contains; only load index files from trusted sources.
        frozen = self.formats[format][1](stream.read())
        index2fn = frozen[0]
        self._titles = dict(zip(frozen[0], frozen[1]))
        # Translate the stored filename indices back into filenames.
        self._mapping = dict((k, set(index2fn[i] for i in v))
                             for (k, v) in frozen[2].items())

    def dump(self, stream, format):
        """Dump the frozen index to a stream.

        :param stream: object with a ``write()`` method.
        :param format: key into `formats` (``'json'`` or ``'pickle'``).
        """
        # Mirror image of `load`: serialize `freeze()` with the dumper
        # registered for the format.
        stream.write(self.formats[format][0](self.freeze()))

    def freeze(self):
        """
        Create a useable data structure. You can pass this output
        to the `SearchFrontend` to search the index.

        The result is a list ``[filenames, titles, words]`` where
        ``titles[i]`` belongs to ``filenames[i]`` and ``words`` maps each
        stemmed word to a list of indices into ``filenames``.
        """
        # Materialize real lists so both sequences share one ordering and
        # can be serialized and indexed.
        fns = list(self._titles)
        titles = [self._titles[fn] for fn in fns]
        fn2index = dict((f, i) for (i, f) in enumerate(fns))
        return [
            fns,
            titles,
            dict((k, [fn2index[fn] for fn in v])
                 for (k, v) in self._mapping.items()),
        ]

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        # Drop the removed files from every word's filename set as well.
        for wordnames in self._mapping.values():
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index.

        :param filename: name under which the document is indexed.
        :param title: document title; its words are indexed too.
        :param doctree: document tree whose text nodes are indexed.
        """
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        # The visitor only collects words while the tree is being walked.
        doctree.walk(visitor)
        for word in word_re.findall(title) + visitor.found_words:
            # Words are stored lower-cased and stemmed, so lookups must
            # normalize query words the same way.
            stemmed = self._stemmer.stem(word.lower())
            self._mapping.setdefault(stemmed, set()).add(filename)

class SearchFrontend(object):
    """
    This class acts as a frontend for the search index. It can search
    a searchindex as provided by `IndexBuilder`.
    """

    def __init__(self, index):
        """Unpack *index*, a three-item sequence of filenames, titles and
        a word -> filename-indices mapping.
        """
        self.filenames, self.titles, self.words = index
        self._stemmer = Stemmer()

    def query(self, required, excluded):
        """Return the documents matching already-normalized words.

        :param required: words that must all occur in a document.
        :param excluded: words that must not occur in a document.
        :return: list of ``(filename, title)`` tuples sorted
                 case-insensitively by title.
        """
        file_map = {}
        for word in required:
            if word not in self.words:
                # A required word that is not indexed can never be matched,
                # so no document can satisfy the query.
                return []
            for fid in self.words[word]:
                file_map.setdefault(fid, set()).add(word)

        # Keep the files that contain every required word and none of the
        # excluded ones.
        return sorted(
            ((self.filenames[fid], self.titles[fid])
             for fid, words in file_map.items()
             if len(words) == len(required) and not
                any(fid in self.words.get(word, ()) for word in excluded)),
            key=lambda x: x[1].lower())

    def search(self, searchstring):
        """Search for the whitespace-separated words in *searchstring*.

        A word prefixed with ``-`` is excluded from the results; every
        other word is required. Returns what `query` returns.
        """
        required = set()
        excluded = set()
        for word in searchstring.split():
            if word.startswith('-'):
                storage = excluded
                word = word[1:]
            else:
                storage = required
            if not word:
                # A lone "-" carries no search term.
                continue
            # Assumes the index keys are lower-cased stems, so the query
            # words are normalized the same way -- TODO confirm against the
            # code that builds the index.
            storage.add(self._stemmer.stem(word.lower()))

        return self.query(required, excluded)