Commits

Georg Brandl committed bb92596

#273: Add an API for adding full-text search support for languages other than English. Add support for Japanese.

Based on the implementation by SHIBUKAWA Yoshiki in https://bitbucket.org/shibu/sphinx/.

  • Participants
  • Parent commits c81f578

Comments (0)

Files changed (13)

 * Benjamin Peterson -- unittests
 * T. Powers -- HTML output improvements
 * Stefan Seefeld -- toctree improvements
+* Shibukawa Yoshiki -- pluggable search API and Japanese search
 * Antonio Valentino -- qthelp builder
 * Pauli Virtanen -- autodoc improvements, autosummary extension
 * Stefan van der Walt -- autosummary extension
   requests and allow configuring the timeout.  New config values:
   :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
 
+* #273: Add an API for adding full-text search support for languages
+  other than English.  Add support for Japanese.
+
 * #221: Add Swedish locale.
 
 * Added ``inline`` option to graphviz directives, and fixed the
 
    .. versionadded:: 1.0
 
+.. confval:: html_search_language
+
+   Language to be used for generating the HTML full-text search index.  This
+   defaults to the global language selected with :confval:`language`.  If there
+   is no support for this language, ``"en"`` is used which selects the English
+   language.
+
+   Support is present for these languages:
+
+   * ``en`` -- English
+   * ``ja`` -- Japanese
+
+   .. versionadded:: 1.1
+
+.. confval:: html_search_options
+
+   A dictionary with options for the search language support, empty by default.
+   The meaning of these options depends on the language selected.
+
+   The English support has no options.
+
+   The Japanese support has these options:
+
+   * ``type`` -- ``'mecab'`` or ``'default'`` (selects either MeCab or
+     TinySegmenter word splitter algorithm)
+   * ``dic_enc`` -- the encoding for the MeCab algorithm
+   * ``dict`` -- the dictionary to use for the MeCab algorithm
+   * ``lib`` -- the library name for finding the MeCab library via ctypes if the
+     Python binding is not installed
+
+   .. versionadded:: 1.1
+
 .. confval:: htmlhelp_basename
 
    Output file base name for HTML help builder.  Default is ``'pydoc'``.

doc/ext/appapi.rst

 
    .. versionadded:: 0.6
 
+.. method:: Sphinx.add_search_language(cls)
+
+   Add *cls*, which must be a subclass of :class:`sphinx.search.SearchLanguage`,
+   as a support language for building the HTML full-text search index.  The
+   class must have a *lang* attribute that indicates the language it should be
+   used for.  See :confval:`html_search_language`.
+
+   .. versionadded:: 1.1
+
 .. method:: Sphinx.connect(event, callback)
 
    Register *callback* to be called when *event* is emitted.  For details on

sphinx/application.py

         from sphinx.ext import autodoc
         autodoc.AutoDirective._special_attrgetters[type] = getter
 
+    def add_search_language(self, cls):
+        from sphinx.search import languages, SearchLanguage
+        assert isinstance(cls, SearchLanguage)
+        languages[cls.lang] = cls
+
 
 class TemplateBridge(object):
     """

sphinx/builders/html.py

             if jsfile:
                 copyfile(jsfile, path.join(self.outdir, '_static',
                                            'translations.js'))
+
+        # add context items for search function used in searchtools.js_t
+        ctx = self.globalcontext.copy()
+        ctx.update(self.indexer.globalcontext_for_searchtool())
+
         # then, copy over theme-supplied static files
         if self.theme:
             themeentries = [path.join(themepath, 'static')
                             for themepath in self.theme.get_dirchain()[::-1]]
             for entry in themeentries:
                 copy_static_entry(entry, path.join(self.outdir, '_static'),
-                                  self, self.globalcontext)
+                                  self, ctx)
         # then, copy over all user-supplied static files
         staticentries = [path.join(self.confdir, spath)
                          for spath in self.config.html_static_path]
                 self.warn('html_static_path entry %r does not exist' % entry)
                 continue
             copy_static_entry(entry, path.join(self.outdir, '_static'), self,
-                              self.globalcontext, exclude_matchers=matchers)
+                              ctx, exclude_matchers=matchers)
         # copy logo and favicon files if not already in static path
         if self.config.html_logo:
             logobase = path.basename(self.config.html_logo)
         html_output_encoding = ('utf-8', 'html'),
         html_compact_lists = (True, 'html'),
         html_secnumber_suffix = ('. ', 'html'),
+        html_search_language = (None, 'html'),
+        html_search_options = ({}, 'html'),
 
         # HTML help only options
         htmlhelp_basename = (lambda self: make_filename(self.project), None),

sphinx/search.py

-# -*- coding: utf-8 -*-
-"""
-    sphinx.search
-    ~~~~~~~~~~~~~
-
-    Create a search index for offline search.
-
-    :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
-    :license: BSD, see LICENSE for details.
-"""
-import re
-import cPickle as pickle
-
-from docutils.nodes import comment, Text, NodeVisitor, SkipNode
-
-from sphinx.util import jsdump, rpartition
-try:
-    # http://bitbucket.org/methane/porterstemmer/
-    from porterstemmer import Stemmer as CStemmer
-    CSTEMMER = True
-except ImportError:
-    from sphinx.util.stemmer import PorterStemmer
-    CSTEMMER = False
-
-
-word_re = re.compile(r'\w+(?u)')
-
-stopwords = set("""
-a  and  are  as  at
-be  but  by
-for
-if  in  into  is  it
-near  no  not
-of  on  or
-such
-that  the  their  then  there  these  they  this  to
-was  will  with
-""".split())
-
-
-class _JavaScriptIndex(object):
-    """
-    The search index as javascript file that calls a function
-    on the documentation search object to register the index.
-    """
-
-    PREFIX = 'Search.setIndex('
-    SUFFIX = ')'
-
-    def dumps(self, data):
-        return self.PREFIX + jsdump.dumps(data) + self.SUFFIX
-
-    def loads(self, s):
-        data = s[len(self.PREFIX):-len(self.SUFFIX)]
-        if not data or not s.startswith(self.PREFIX) or not \
-           s.endswith(self.SUFFIX):
-            raise ValueError('invalid data')
-        return jsdump.loads(data)
-
-    def dump(self, data, f):
-        f.write(self.dumps(data))
-
-    def load(self, f):
-        return self.loads(f.read())
-
-
-js_index = _JavaScriptIndex()
-
-
-if CSTEMMER:
-    class Stemmer(CStemmer):
-
-        def stem(self, word):
-            return self(word.lower())
-
-else:
-    class Stemmer(PorterStemmer):
-        """
-        All those porter stemmer implementations look hideous.
-        make at least the stem method nicer.
-        """
-
-        def stem(self, word):
-            word = word.lower()
-            return PorterStemmer.stem(self, word, 0, len(word) - 1)
-
-
-
-class WordCollector(NodeVisitor):
-    """
-    A special visitor that collects words for the `IndexBuilder`.
-    """
-
-    def __init__(self, document):
-        NodeVisitor.__init__(self, document)
-        self.found_words = []
-
-    def dispatch_visit(self, node):
-        if node.__class__ is comment:
-            raise SkipNode
-        if node.__class__ is Text:
-            self.found_words.extend(word_re.findall(node.astext()))
-
-
-class IndexBuilder(object):
-    """
-    Helper class that creates a searchindex based on the doctrees
-    passed to the `feed` method.
-    """
-    formats = {
-        'jsdump':   jsdump,
-        'pickle':   pickle
-    }
-
-    def __init__(self, env):
-        self.env = env
-        self._stemmer = Stemmer()
-        # filename -> title
-        self._titles = {}
-        # stemmed word -> set(filenames)
-        self._mapping = {}
-        # objtype -> index
-        self._objtypes = {}
-        # objtype index -> objname (localized)
-        self._objnames = {}
-
-    def load(self, stream, format):
-        """Reconstruct from frozen data."""
-        if isinstance(format, basestring):
-            format = self.formats[format]
-        frozen = format.load(stream)
-        # if an old index is present, we treat it as not existing.
-        if not isinstance(frozen, dict):
-            raise ValueError('old format')
-        index2fn = frozen['filenames']
-        self._titles = dict(zip(index2fn, frozen['titles']))
-        self._mapping = {}
-        for k, v in frozen['terms'].iteritems():
-            if isinstance(v, int):
-                self._mapping[k] = set([index2fn[v]])
-            else:
-                self._mapping[k] = set(index2fn[i] for i in v)
-        # no need to load keywords/objtypes
-
-    def dump(self, stream, format):
-        """Dump the frozen index to a stream."""
-        if isinstance(format, basestring):
-            format = self.formats[format]
-        format.dump(self.freeze(), stream)
-
-    def get_objects(self, fn2index):
-        rv = {}
-        otypes = self._objtypes
-        onames = self._objnames
-        for domainname, domain in self.env.domains.iteritems():
-            for fullname, dispname, type, docname, anchor, prio in \
-                    domain.get_objects():
-                # XXX use dispname?
-                if docname not in fn2index:
-                    continue
-                if prio < 0:
-                    continue
-                # XXX splitting at dot is kind of Python specific
-                prefix, name = rpartition(fullname, '.')
-                pdict = rv.setdefault(prefix, {})
-                try:
-                    i = otypes[domainname, type]
-                except KeyError:
-                    i = len(otypes)
-                    otypes[domainname, type] = i
-                    otype = domain.object_types.get(type)
-                    if otype:
-                        # use unicode() to fire translation proxies
-                        onames[i] = unicode(domain.get_type_name(otype))
-                    else:
-                        onames[i] = type
-                pdict[name] = (fn2index[docname], i, prio)
-        return rv
-
-    def get_terms(self, fn2index):
-        rv = {}
-        for k, v in self._mapping.iteritems():
-            if len(v) == 1:
-                fn, = v
-                if fn in fn2index:
-                    rv[k] = fn2index[fn]
-            else:
-                rv[k] = [fn2index[fn] for fn in v if fn in fn2index]
-        return rv
-
-    def freeze(self):
-        """Create a usable data structure for serializing."""
-        filenames = self._titles.keys()
-        titles = self._titles.values()
-        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
-        terms = self.get_terms(fn2index)
-        objects = self.get_objects(fn2index)  # populates _objtypes
-        objtypes = dict((v, k[0] + ':' + k[1])
-                        for (k, v) in self._objtypes.iteritems())
-        objnames = self._objnames
-        return dict(filenames=filenames, titles=titles, terms=terms,
-                    objects=objects, objtypes=objtypes, objnames=objnames)
-
-    def prune(self, filenames):
-        """Remove data for all filenames not in the list."""
-        new_titles = {}
-        for filename in filenames:
-            if filename in self._titles:
-                new_titles[filename] = self._titles[filename]
-        self._titles = new_titles
-        for wordnames in self._mapping.itervalues():
-            wordnames.intersection_update(filenames)
-
-    def feed(self, filename, title, doctree):
-        """Feed a doctree to the index."""
-        self._titles[filename] = title
-
-        visitor = WordCollector(doctree)
-        doctree.walk(visitor)
-
-        def add_term(word, stem=self._stemmer.stem):
-            word = stem(word)
-            if len(word) < 3 or word in stopwords or word.isdigit():
-                return
-            self._mapping.setdefault(word, set()).add(filename)
-
-        for word in word_re.findall(title):
-            add_term(word)
-
-        for word in visitor.found_words:
-            add_term(word)

sphinx/search/__init__.py

+# -*- coding: utf-8 -*-
+"""
+    sphinx.search
+    ~~~~~~~~~~~~~
+
+    Create a full-text search index for offline search.
+
+    :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+import re
+import cPickle as pickle
+
+from docutils.nodes import comment, Text, NodeVisitor, SkipNode
+
+from sphinx.util import jsdump, rpartition
+
+
class SearchLanguage(object):
    """
    Base class for search natural-language preprocessors.

    To add support for a new language, subclass this, override the methods
    as needed, and set the `lang` class attribute to the language code
    (e.g. 'en', 'fr' and so on).

    .. attribute:: stopwords

       A set of stop words of the target language, empty by default.  These
       words are excluded from the search index and the set is also embedded
       in the generated JavaScript.

    .. attribute:: js_stemmer_code

       JavaScript source for the client-side stemmer.  The class it defines
       must be named ``Stemmer`` and must have a ``stemWord`` method.  The
       string is embedded as-is in searchtools.js and preprocesses the words
       that readers type into the search box.  The default implementation
       does no stemming.
    """
    lang = None
    stopwords = set()
    js_stemmer_code = """
/**
 * Dummy stemmer for languages without stemming rules.
 */
var Stemmer = function() {
  this.stemWord = function(w) {
    return w;
  }
}
"""

    # Pass the Unicode flag as an argument: the inline form r'\w+(?u)'
    # places a global flag at the end of the pattern, which has been an
    # error since Python 3.11.  re.UNICODE is semantically identical.
    _word_re = re.compile(r'\w+', re.UNICODE)

    def __init__(self, options):
        # *options* is the html_search_options config dict.
        self.options = options
        self.init(options)

    def init(self, options):
        """
        Initialize the class with the options the user has given.
        """

    def split(self, input):
        """
        This method splits a sentence into words.  Default splitter splits
        input at word boundaries (non-word characters), which should be
        enough for most languages except CJK languages.
        """
        return self._word_re.findall(input)

    def stem(self, word):
        """
        This method implements the stemming algorithm of the Python version.

        Default implementation does nothing.  You should implement this if
        the language has any stemming rules.

        It is used to preprocess search words before registering them in the
        search index.  The stemming of the Python version and the JS version
        (given in the js_stemmer_code attribute) must be compatible.
        """
        return word

    def word_filter(self, word):
        """
        Return true if the target word should be registered in the search
        index.  This method is called after stemming.  It rejects short
        hiragana-only fragments (code points 0x3041-0x3093) and short,
        stop-listed or purely numeric Latin words.
        """
        return not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
            (ord(word[0]) < 256 and (len(word) < 3 or word in self.stopwords or
                                     word.isdigit())))
+
+from sphinx.search import en, ja
+
+languages = {
+    'en': en.SearchEnglish,
+    'ja': ja.SearchJapanese,
+}
+
+
+class _JavaScriptIndex(object):
+    """
+    The search index as javascript file that calls a function
+    on the documentation search object to register the index.
+    """
+
+    PREFIX = 'Search.setIndex('
+    SUFFIX = ')'
+
+    def dumps(self, data):
+        return self.PREFIX + jsdump.dumps(data) + self.SUFFIX
+
+    def loads(self, s):
+        data = s[len(self.PREFIX):-len(self.SUFFIX)]
+        if not data or not s.startswith(self.PREFIX) or not \
+           s.endswith(self.SUFFIX):
+            raise ValueError('invalid data')
+        return jsdump.loads(data)
+
+    def dump(self, data, f):
+        f.write(self.dumps(data))
+
+    def load(self, f):
+        return self.loads(f.read())
+
+
+js_index = _JavaScriptIndex()
+
+
class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.

    *lang* is a SearchLanguage instance whose ``split`` method is used to
    break text nodes into words.
    """

    def __init__(self, document, lang):
        NodeVisitor.__init__(self, document)
        # Words found so far, in document order (duplicates possible).
        self.found_words = []
        self.lang = lang

    def dispatch_visit(self, node):
        # Do not index words inside comment nodes; SkipNode makes the
        # walker skip the whole subtree.
        if node.__class__ is comment:
            raise SkipNode
        # Collect words from every text node via the language's splitter.
        # Note: exact class identity is checked, not isinstance().
        if node.__class__ is Text:
            self.found_words.extend(self.lang.split(node.astext()))
+
+
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.

    NOTE(review): Python 2 era code (`iteritems`, `basestring`,
    `unicode`); keep that in mind when reading the dict iteration below.
    """
    # Supported serialization formats for the frozen index.
    formats = {
        'jsdump':   jsdump,
        'pickle':   pickle
    }

    def __init__(self, env):
        # Build environment: provides config values and domains.
        self.env = env
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # objtype -> index
        self._objtypes = {}
        # objtype index -> objname (localized)
        self._objnames = {}
        # add language-specific SearchLanguage instance;
        # falls back to English when the selected language is unsupported
        search_language = env.config.html_search_language or env.config.language
        if not search_language or search_language not in languages:
            search_language = 'en'
        self.lang = languages[search_language](env.config.html_search_options)

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, basestring):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))
        self._mapping = {}
        # terms are stored either as a single file index (int) or a list
        for k, v in frozen['terms'].iteritems():
            if isinstance(v, int):
                self._mapping[k] = set([index2fn[v]])
            else:
                self._mapping[k] = set(index2fn[i] for i in v)
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        """Return {prefix: {name: (fileindex, objtype index, priority)}}
        for all documented objects; fills self._objtypes/_objnames as a
        side effect."""
        rv = {}
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in self.env.domains.iteritems():
            for fullname, dispname, type, docname, anchor, prio in \
                    domain.get_objects():
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                # XXX splitting at dot is kind of Python specific
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    i = otypes[domainname, type]
                except KeyError:
                    # first occurrence of this (domain, type): assign the
                    # next free index and remember a localized display name
                    i = len(otypes)
                    otypes[domainname, type] = i
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[i] = unicode(domain.get_type_name(otype))
                    else:
                        onames[i] = type
                pdict[name] = (fn2index[docname], i, prio)
        return rv

    def get_terms(self, fn2index):
        """Map each indexed term to a file index, or to a list of file
        indexes when the term occurs in several files."""
        rv = {}
        for k, v in self._mapping.iteritems():
            if len(v) == 1:
                fn, = v
                if fn in fn2index:
                    rv[k] = fn2index[fn]
            else:
                rv[k] = [fn2index[fn] for fn in v if fn in fn2index]
        return rv

    def freeze(self):
        """Create a usable data structure for serializing."""
        # keys() and values() iterate in the same order here
        filenames = self._titles.keys()
        titles = self._titles.values()
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        terms = self.get_terms(fn2index)
        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in self._objtypes.iteritems())
        objnames = self._objnames
        return dict(filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames)

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        for wordnames in self._mapping.itervalues():
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        def add_term(word, stem=self._mapping and self.lang.stem or self.lang.stem):
            # stem first, then let the language decide whether to index it
            word = stem(word)
            if self.lang.word_filter(word):
                self._mapping.setdefault(word, set()).add(filename)

        # words from the title are indexed as well as the body text
        for word in self.lang.split(title):
            add_term(word)

        for word in visitor.found_words:
            add_term(word)

    def globalcontext_for_searchtool(self):
        """Return the template context items consumed by searchtools.js_t
        (client-side stemmer code and the stop-word set as JS)."""
        return dict(
            search_language_stemming_code = self.lang.js_stemmer_code,
            search_language_stop_words = jsdump.dumps(self.lang.stopwords),
        )

sphinx/search/en.py

+# -*- coding: utf-8 -*-
+"""
+    sphinx.search.en
+    ~~~~~~~~~~~~~~~~
+
+    English search language: includes the JS porter stemmer.
+
+    :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+from sphinx.search import SearchLanguage
+
+try:
+    # http://bitbucket.org/methane/porterstemmer/
+    from porterstemmer import Stemmer as CStemmer
+    CSTEMMER = True
+except ImportError:
+    from sphinx.util.stemmer import PorterStemmer
+    CSTEMMER = False
+
+
+english_stopwords = set("""
+a  and  are  as  at
+be  but  by
+for
+if  in  into  is  it
+near  no  not
+of  on  or
+such
+that  the  their  then  there  these  they  this  to
+was  will  with
+""".split())
+
+js_porter_stemmer = """
+/**
+ * Porter Stemmer
+ */
+var Stemmer = function() {
+
+  var step2list = {
+    ational: 'ate',
+    tional: 'tion',
+    enci: 'ence',
+    anci: 'ance',
+    izer: 'ize',
+    bli: 'ble',
+    alli: 'al',
+    entli: 'ent',
+    eli: 'e',
+    ousli: 'ous',
+    ization: 'ize',
+    ation: 'ate',
+    ator: 'ate',
+    alism: 'al',
+    iveness: 'ive',
+    fulness: 'ful',
+    ousness: 'ous',
+    aliti: 'al',
+    iviti: 'ive',
+    biliti: 'ble',
+    logi: 'log'
+  };
+
+  var step3list = {
+    icate: 'ic',
+    ative: '',
+    alize: 'al',
+    iciti: 'ic',
+    ical: 'ic',
+    ful: '',
+    ness: ''
+  };
+
+  var c = "[^aeiou]";          // consonant
+  var v = "[aeiouy]";          // vowel
+  var C = c + "[^aeiouy]*";    // consonant sequence
+  var V = v + "[aeiou]*";      // vowel sequence
+
+  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
+  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
+  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
+  var s_v   = "^(" + C + ")?" + v;                         // vowel in stem
+
+  this.stemWord = function (w) {
+    var stem;
+    var suffix;
+    var firstch;
+    var origword = w;
+
+    if (w.length < 3)
+      return w;
+
+    var re;
+    var re2;
+    var re3;
+    var re4;
+
+    firstch = w.substr(0,1);
+    if (firstch == "y")
+      w = firstch.toUpperCase() + w.substr(1);
+
+    // Step 1a
+    re = /^(.+?)(ss|i)es$/;
+    re2 = /^(.+?)([^s])s$/;
+
+    if (re.test(w))
+      w = w.replace(re,"$1$2");
+    else if (re2.test(w))
+      w = w.replace(re2,"$1$2");
+
+    // Step 1b
+    re = /^(.+?)eed$/;
+    re2 = /^(.+?)(ed|ing)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      re = new RegExp(mgr0);
+      if (re.test(fp[1])) {
+        re = /.$/;
+        w = w.replace(re,"");
+      }
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1];
+      re2 = new RegExp(s_v);
+      if (re2.test(stem)) {
+        w = stem;
+        re2 = /(at|bl|iz)$/;
+        re3 = new RegExp("([^aeiouylsz])\\1$");
+        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+        if (re2.test(w))
+          w = w + "e";
+        else if (re3.test(w)) {
+          re = /.$/;
+          w = w.replace(re,"");
+        }
+        else if (re4.test(w))
+          w = w + "e";
+      }
+    }
+
+    // Step 1c
+    re = /^(.+?)y$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(s_v);
+      if (re.test(stem))
+        w = stem + "i";
+    }
+
+    // Step 2
+    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step2list[suffix];
+    }
+
+    // Step 3
+    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step3list[suffix];
+    }
+
+    // Step 4
+    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
+    re2 = /^(.+?)(s|t)(ion)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      if (re.test(stem))
+        w = stem;
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1] + fp[2];
+      re2 = new RegExp(mgr1);
+      if (re2.test(stem))
+        w = stem;
+    }
+
+    // Step 5
+    re = /^(.+?)e$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      re2 = new RegExp(meq1);
+      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
+        w = stem;
+    }
+    re = /ll$/;
+    re2 = new RegExp(mgr1);
+    if (re.test(w) && re2.test(w)) {
+      re = /.$/;
+      w = w.replace(re,"");
+    }
+
+    // and turn initial Y back to y
+    if (firstch == "y")
+      w = firstch.toLowerCase() + w.substr(1);
+    return w;
+  }
+}
+"""
+
+
class SearchEnglish(SearchLanguage):
    # English search support: Porter stemming + the stop-word list above.
    lang = 'en'
    js_stemmer_code = js_porter_stemmer
    stopwords = english_stopwords

    def init(self, options):
        """Pick the fastest available Porter stemmer implementation.

        Uses the C extension (CSTEMMER, from the porterstemmer package)
        when it was importable at module load time, otherwise the
        pure-Python PorterStemmer fallback.  *options* is unused here.
        """
        if CSTEMMER:
            class Stemmer(CStemmer):
                def stem(self, word):
                    # the C stemmer is callable and expects lowercase input
                    return self(word.lower())
        else:
            class Stemmer(PorterStemmer):
                """All those porter stemmer implementations look hideous;
                make at least the stem method nicer.
                """
                def stem(self, word):
                    word = word.lower()
                    return PorterStemmer.stem(self, word, 0, len(word) - 1)

        self.stemmer = Stemmer()

    def stem(self, word):
        # both Stemmer variants lowercase the word before stemming
        return self.stemmer.stem(word)

sphinx/search/ja.py

+# -*- coding: utf-8 -*-
+"""
+    sphinx.search.ja
+    ~~~~~~~~~~~~~~~~
+
+    Japanese search language: includes routine to split words.
+
+    :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+# Python Version of TinySegmenter
+# (http://chasen.org/~taku/software/TinySegmenter/)
+# TinySegmenter is super compact Japanese tokenizer.
+#
+# TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
+# Python Version was developed by xnights <programming.magic(at)gmail.com>.
+# For details, see http://programming-magic.com/?id=170
+
+import os
+import re
+import sys
+
+try:
+    import MeCab
+    native_module = True
+except ImportError:
+    native_module = False
+
+from sphinx.search import SearchLanguage
+
+
class MecabBinder(object):
    """Word splitter driving the MeCab morphological analyzer.

    Uses the native Python binding when the ``MeCab`` module is importable
    (module-level ``native_module`` flag), otherwise loads the MeCab C
    library via ctypes.

    Options: ``dic_enc`` (dictionary encoding, default utf-8), ``dict``
    (dictionary path), ``lib`` (library name or path for the ctypes
    fallback).

    :raises RuntimeError: if the MeCab library cannot be located or
        initialized in the ctypes fallback.
    """

    def __init__(self, options):
        self.ctypes_libmecab = None
        self.ctypes_mecab = None
        if not native_module:
            self.init_ctypes(options)
        else:
            self.init_native(options)
        self.dict_encode = options.get('dic_enc', 'utf-8')

    def split(self, input):
        """Split *input* into a list of words using MeCab's wakati mode."""
        # MeCab works on byte strings in the dictionary encoding.
        input2 = input.encode(self.dict_encode)
        if native_module:
            result = self.native.parse(input2)
        else:
            # bug fix: pass the *encoded* string, exactly as in the
            # native branch above (the original passed the raw input)
            result = self.ctypes_libmecab.mecab_sparse_tostr(
                self.ctypes_mecab, input2)
        return result.decode(self.dict_encode).split(' ')

    def init_native(self, options):
        """Set up the MeCab tagger through the native Python binding."""
        # -Owakati makes MeCab emit words separated by single spaces
        param = '-Owakati'
        dicpath = options.get('dict')
        if dicpath:
            param += ' -d %s' % dicpath
        self.native = MeCab.Tagger(param)

    def init_ctypes(self, options):
        """Set up the MeCab tagger by loading libmecab through ctypes."""
        import ctypes.util

        lib = options.get('lib')

        # Locate the MeCab shared library.
        if lib is None:
            # no option given: search the default library name
            if sys.platform.startswith('win'):
                libname = 'libmecab.dll'
            else:
                libname = 'mecab'
            libpath = ctypes.util.find_library(libname)
        elif os.path.basename(lib) == lib:
            # bare library name: let ctypes search for it
            libpath = ctypes.util.find_library(lib)
        else:
            # explicit path: use it only if it exists
            libpath = None
            if os.path.exists(lib):
                libpath = lib
        if libpath is None:
            raise RuntimeError('MeCab dynamic library is not available')

        param = 'mecab -Owakati'
        dicpath = options.get('dict')
        if dicpath:
            param += ' -d %s' % dicpath

        self.ctypes_libmecab = ctypes.CDLL(libpath)
        # Declare prototypes.  Without restype = c_void_p the tagger
        # handle returned by mecab_new2 would be truncated to a C int on
        # 64-bit platforms.
        self.ctypes_libmecab.mecab_new2.argtypes = (ctypes.c_char_p,)
        self.ctypes_libmecab.mecab_new2.restype = ctypes.c_void_p
        self.ctypes_libmecab.mecab_sparse_tostr.argtypes = \
            (ctypes.c_void_p, ctypes.c_char_p)
        self.ctypes_libmecab.mecab_sparse_tostr.restype = ctypes.c_char_p
        # bug fix: the library handle is self.ctypes_libmecab; the
        # original referenced a nonexistent self.libmecab attribute
        self.ctypes_mecab = self.ctypes_libmecab.mecab_new2(param)
        if self.ctypes_mecab is None:
            # mecab_new2 returns NULL on failure (e.g. bad dictionary)
            raise RuntimeError('MeCab initialization failed')

    def __del__(self):
        # Only the ctypes path allocates a tagger that must be destroyed;
        # guard both handles so a half-failed init does not crash here.
        if self.ctypes_libmecab and self.ctypes_mecab:
            self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
+
+
class TinySegmenter(object):
    """
    Pure-Python port of TinySegmenter, a compact machine-learned word
    segmenter for Japanese text that needs no dictionary.

    The ``*__`` attributes below are the feature weights trained for the
    original JavaScript implementation; they are data, not code -- do not
    edit them by hand.
    """

    # Character-class patterns, matched IN ORDER by ctype_():  the kanji
    # numeral class (M) must be tested before the generic kanji range (H),
    # which also matches those characters.  A tuple keeps this precedence
    # deterministic (a plain dict left the iteration order undefined).
    patterns_ = tuple((re.compile(pattern), value) for pattern, value in (
        (u'[一二三四五六七八九十百千万億兆]', u'M'),
        (u'[一-龠々〆ヵヶ]', u'H'),
        (u'[ぁ-ん]', u'I'),
        (u'[ァ-ヴーア-ン゙ー]', u'K'),
        (u'[a-zA-Za-zA-Z]', u'A'),
        (u'[0-90-9]', u'N'),
    ))
    BIAS__ = -332
    BC1__ = {u'HH':6,u'II':2461,u'KH':406,u'OH':-1378}
    BC2__ = {u'AA':-3267,u'AI':2744,u'AN':-878,u'HH':-4070,u'HM':-1711,u'HN':4012,u'HO':3761,u'IA':1327,u'IH':-1184,u'II':-1332,u'IK':1721,u'IO':5492,u'KI':3831,u'KK':-8741,u'MH':-3132,u'MK':3334,u'OO':-2920}
    BC3__ = {u'HH':996,u'HI':626,u'HK':-721,u'HN':-1307,u'HO':-836,u'IH':-301,u'KK':2762,u'MK':1079,u'MM':4034,u'OA':-1652,u'OH':266}
    BP1__ = {u'BB':295,u'OB':304,u'OO':-125,u'UB':352}
    BP2__ = {u'BO':60,u'OO':-1762}
    BQ1__ = {u'BHH':1150,u'BHM':1521,u'BII':-1158,u'BIM':886,u'BMH':1208,u'BNH':449,u'BOH':-91,u'BOO':-2597,u'OHI':451,u'OIH':-296,u'OKA':1851,u'OKH':-1020,u'OKK':904,u'OOO':2965}
    BQ2__ = {u'BHH':118,u'BHI':-1159,u'BHM':466,u'BIH':-919,u'BKK':-1720,u'BKO':864,u'OHH':-1139,u'OHM':-181,u'OIH':153,u'UHI':-1146}
    BQ3__ = {u'BHH':-792,u'BHI':2664,u'BII':-299,u'BKI':419,u'BMH':937,u'BMM':8335,u'BNN':998,u'BOH':775,u'OHH':2174,u'OHM':439,u'OII':280,u'OKH':1798,u'OKI':-793,u'OKO':-2242,u'OMH':-2402,u'OOO':11699}
    BQ4__ = {u'BHH':-3895,u'BIH':3761,u'BII':-4654,u'BIK':1348,u'BKK':-1806,u'BMI':-3385,u'BOO':-12396,u'OAH':926,u'OHH':266,u'OHK':-2036,u'ONN':-973}
    BW1__ = {u',と':660,u',同':727,u'B1あ':1404,u'B1同':542,u'、と':660,u'、同':727,u'」と':1682,u'あっ':1505,u'いう':1743,u'いっ':-2055,u'いる':672,u'うし':-4817,u'うん':665,u'から':3472,u'がら':600,u'こう':-790,u'こと':2083,u'こん':-1262,u'さら':-4143,u'さん':4573,u'した':2641,u'して':1104,u'すで':-3399,u'そこ':1977,u'それ':-871,u'たち':1122,u'ため':601,u'った':3463,u'つい':-802,u'てい':805,u'てき':1249,u'でき':1127,u'です':3445,u'では':844,u'とい':-4915,u'とみ':1922,u'どこ':3887,u'ない':5713,u'なっ':3015,u'など':7379,u'なん':-1113,u'にし':2468,u'には':1498,u'にも':1671,u'に対':-912,u'の一':-501,u'の中':741,u'ませ':2448,u'まで':1711,u'まま':2600,u'まる':-2155,u'やむ':-1947,u'よっ':-2565,u'れた':2369,u'れで':-913,u'をし':1860,u'を見':731,u'亡く':-1886,u'京都':2558,u'取り':-2784,u'大き':-2604,u'大阪':1497,u'平方':-2314,u'引き':-1336,u'日本':-195,u'本当':-2423,u'毎日':-2113,u'目指':-724,u'B1あ':1404,u'B1同':542,u'」と':1682}
    BW2__ = {u'..':-11822,u'11':-669,u'――':-5730,u'−−':-13175,u'いう':-1609,u'うか':2490,u'かし':-1350,u'かも':-602,u'から':-7194,u'かれ':4612,u'がい':853,u'がら':-3198,u'きた':1941,u'くな':-1597,u'こと':-8392,u'この':-4193,u'させ':4533,u'され':13168,u'さん':-3977,u'しい':-1819,u'しか':-545,u'した':5078,u'して':972,u'しな':939,u'その':-3744,u'たい':-1253,u'たた':-662,u'ただ':-3857,u'たち':-786,u'たと':1224,u'たは':-939,u'った':4589,u'って':1647,u'っと':-2094,u'てい':6144,u'てき':3640,u'てく':2551,u'ては':-3110,u'ても':-3065,u'でい':2666,u'でき':-1528,u'でし':-3828,u'です':-4761,u'でも':-4203,u'とい':1890,u'とこ':-1746,u'とと':-2279,u'との':720,u'とみ':5168,u'とも':-3941,u'ない':-2488,u'なが':-1313,u'など':-6509,u'なの':2614,u'なん':3099,u'にお':-1615,u'にし':2748,u'にな':2454,u'によ':-7236,u'に対':-14943,u'に従':-4688,u'に関':-11388,u'のか':2093,u'ので':-7059,u'のに':-6041,u'のの':-6125,u'はい':1073,u'はが':-1033,u'はず':-2532,u'ばれ':1813,u'まし':-1316,u'まで':-6621,u'まれ':5409,u'めて':-3153,u'もい':2230,u'もの':-10713,u'らか':-944,u'らし':-1611,u'らに':-1897,u'りし':651,u'りま':1620,u'れた':4270,u'れて':849,u'れば':4114,u'ろう':6067,u'われ':7901,u'を通':-11877,u'んだ':728,u'んな':-4115,u'一人':602,u'一方':-1375,u'一日':970,u'一部':-1051,u'上が':-4479,u'会社':-1116,u'出て':2163,u'分の':-7758,u'同党':970,u'同日':-913,u'大阪':-2471,u'委員':-1250,u'少な':-1050,u'年度':-8669,u'年間':-1626,u'府県':-2363,u'手権':-1982,u'新聞':-4066,u'日新':-722,u'日本':-7068,u'日米':3372,u'曜日':-601,u'朝鮮':-2355,u'本人':-2697,u'東京':-1543,u'然と':-1384,u'社会':-1276,u'立て':-990,u'第に':-1612,u'米国':-4268,u'11':-669}
    BW3__ = {u'あた':-2194,u'あり':719,u'ある':3846,u'い.':-1185,u'い。':-1185,u'いい':5308,u'いえ':2079,u'いく':3029,u'いた':2056,u'いっ':1883,u'いる':5600,u'いわ':1527,u'うち':1117,u'うと':4798,u'えと':1454,u'か.':2857,u'か。':2857,u'かけ':-743,u'かっ':-4098,u'かに':-669,u'から':6520,u'かり':-2670,u'が,':1816,u'が、':1816,u'がき':-4855,u'がけ':-1127,u'がっ':-913,u'がら':-4977,u'がり':-2064,u'きた':1645,u'けど':1374,u'こと':7397,u'この':1542,u'ころ':-2757,u'さい':-714,u'さを':976,u'し,':1557,u'し、':1557,u'しい':-3714,u'した':3562,u'して':1449,u'しな':2608,u'しま':1200,u'す.':-1310,u'す。':-1310,u'する':6521,u'ず,':3426,u'ず、':3426,u'ずに':841,u'そう':428,u'た.':8875,u'た。':8875,u'たい':-594,u'たの':812,u'たり':-1183,u'たる':-853,u'だ.':4098,u'だ。':4098,u'だっ':1004,u'った':-4748,u'って':300,u'てい':6240,u'てお':855,u'ても':302,u'です':1437,u'でに':-1482,u'では':2295,u'とう':-1387,u'とし':2266,u'との':541,u'とも':-3543,u'どう':4664,u'ない':1796,u'なく':-903,u'など':2135,u'に,':-1021,u'に、':-1021,u'にし':1771,u'にな':1906,u'には':2644,u'の,':-724,u'の、':-724,u'の子':-1000,u'は,':1337,u'は、':1337,u'べき':2181,u'まし':1113,u'ます':6943,u'まっ':-1549,u'まで':6154,u'まれ':-793,u'らし':1479,u'られ':6820,u'るる':3818,u'れ,':854,u'れ、':854,u'れた':1850,u'れて':1375,u'れば':-3246,u'れる':1091,u'われ':-605,u'んだ':606,u'んで':798,u'カ月':990,u'会議':860,u'入り':1232,u'大会':2217,u'始め':1681,u'市':965,u'新聞':-5055,u'日,':974,u'日、':974,u'社会':2024,u'カ月':990}
    TC1__ = {u'AAA':1093,u'HHH':1029,u'HHM':580,u'HII':998,u'HOH':-390,u'HOM':-331,u'IHI':1169,u'IOH':-142,u'IOI':-1015,u'IOM':467,u'MMH':187,u'OOI':-1832}
    TC2__ = {u'HHO':2088,u'HII':-1023,u'HMM':-1154,u'IHI':-1965,u'KKH':703,u'OII':-2649}
    TC3__ = {u'AAA':-294,u'HHH':346,u'HHI':-341,u'HII':-1088,u'HIK':731,u'HOH':-1486,u'IHH':128,u'IHI':-3041,u'IHO':-1935,u'IIH':-825,u'IIM':-1035,u'IOI':-542,u'KHH':-1216,u'KKA':491,u'KKH':-1217,u'KOK':-1009,u'MHH':-2694,u'MHM':-457,u'MHO':123,u'MMH':-471,u'NNH':-1689,u'NNO':662,u'OHO':-3393}
    TC4__ = {u'HHH':-203,u'HHI':1344,u'HHK':365,u'HHM':-122,u'HHN':182,u'HHO':669,u'HIH':804,u'HII':679,u'HOH':446,u'IHH':695,u'IHO':-2324,u'IIH':321,u'III':1497,u'IIO':656,u'IOO':54,u'KAK':4845,u'KKA':3386,u'KKK':3065,u'MHH':-405,u'MHI':201,u'MMH':-241,u'MMM':661,u'MOM':841}
    TQ1__ = {u'BHHH':-227,u'BHHI':316,u'BHIH':-132,u'BIHH':60,u'BIII':1595,u'BNHH':-744,u'BOHH':225,u'BOOO':-908,u'OAKK':482,u'OHHH':281,u'OHIH':249,u'OIHI':200,u'OIIH':-68}
    TQ2__ = {u'BIHH':-1401,u'BIII':-1033,u'BKAK':-543,u'BOOO':-5591}
    TQ3__ = {u'BHHH':478,u'BHHM':-1073,u'BHIH':222,u'BHII':-504,u'BIIH':-116,u'BIII':-105,u'BMHI':-863,u'BMHM':-464,u'BOMH':620,u'OHHH':346,u'OHHI':1729,u'OHII':997,u'OHMH':481,u'OIHH':623,u'OIIH':1344,u'OKAK':2792,u'OKHH':587,u'OKKA':679,u'OOHH':110,u'OOII':-685}
    TQ4__ = {u'BHHH':-721,u'BHHM':-3604,u'BHII':-966,u'BIIH':-607,u'BIII':-2181,u'OAAA':-2763,u'OAKK':180,u'OHHH':-294,u'OHHI':2446,u'OHHO':480,u'OHIH':-1573,u'OIHH':1935,u'OIHI':-493,u'OIIH':626,u'OIII':-4007,u'OKAK':-8156}
    TW1__ = {u'につい':-4681,u'東京都':2026}
    TW2__ = {u'ある程':-2049,u'いった':-1256,u'ころが':-2434,u'しょう':3873,u'その後':-4430,u'だって':-1049,u'ていた':1833,u'として':-4657,u'ともに':-4517,u'もので':1882,u'一気に':-792,u'初めて':-1512,u'同時に':-8097,u'大きな':-1255,u'対して':-2721,u'社会党':-3216}
    TW3__ = {u'いただ':-1734,u'してい':1314,u'として':-4314,u'につい':-5483,u'にとっ':-5989,u'に当た':-6247,u'ので,':-727,u'ので、':-727,u'のもの':-600,u'れから':-3752,u'十二月':-2287}
    TW4__ = {u'いう.':8576,u'いう。':8576,u'からな':-2348,u'してい':2958,u'たが,':1516,u'たが、':1516,u'ている':1538,u'という':1349,u'ました':5543,u'ません':1097,u'ようと':-4258,u'よると':5865}
    UC1__ = {u'A':484,u'K':93,u'M':645,u'O':-505}
    UC2__ = {u'A':819,u'H':1059,u'I':409,u'M':3987,u'N':5775,u'O':646}
    UC3__ = {u'A':-1370,u'I':2311}
    UC4__ = {u'A':-2643,u'H':1809,u'I':-1032,u'K':-3450,u'M':3565,u'N':3876,u'O':6646}
    UC5__ = {u'H':313,u'I':-1238,u'K':-799,u'M':539,u'O':-831}
    UC6__ = {u'H':-506,u'I':-253,u'K':87,u'M':247,u'O':-387}
    UP1__ = {u'O':-214}
    UP2__ = {u'B':69,u'O':935}
    UP3__ = {u'B':189}
    UQ1__ = {u'BH':21,u'BI':-12,u'BK':-99,u'BN':142,u'BO':-56,u'OH':-95,u'OI':477,u'OK':410,u'OO':-2422}
    UQ2__ = {u'BH':216,u'BI':113,u'OK':1759}
    UQ3__ = {u'BA':-479,u'BH':42,u'BI':1913,u'BK':-7198,u'BM':3160,u'BN':6427,u'BO':14761,u'OI':-827,u'ON':-3212}
    UW1__ = {u',':156,u'、':156,u'「':-463,u'あ':-941,u'う':-127,u'が':-553,u'き':121,u'こ':505,u'で':-201,u'と':-547,u'ど':-123,u'に':-789,u'の':-185,u'は':-847,u'も':-466,u'や':-470,u'よ':182,u'ら':-292,u'り':208,u'れ':169,u'を':-446,u'ん':-137,u'・':-135,u'主':-402,u'京':-268,u'区':-912,u'午':871,u'国':-460,u'大':561,u'委':729,u'市':-411,u'日':-141,u'理':361,u'生':-408,u'県':-386,u'都':-718,u'「':-463,u'・':-135}
    UW2__ = {u',':-829,u'、':-829,u'〇':892,u'「':-645,u'」':3145,u'あ':-538,u'い':505,u'う':134,u'お':-502,u'か':1454,u'が':-856,u'く':-412,u'こ':1141,u'さ':878,u'ざ':540,u'し':1529,u'す':-675,u'せ':300,u'そ':-1011,u'た':188,u'だ':1837,u'つ':-949,u'て':-291,u'で':-268,u'と':-981,u'ど':1273,u'な':1063,u'に':-1764,u'の':130,u'は':-409,u'ひ':-1273,u'べ':1261,u'ま':600,u'も':-1263,u'や':-402,u'よ':1639,u'り':-579,u'る':-694,u'れ':571,u'を':-2516,u'ん':2095,u'ア':-587,u'カ':306,u'キ':568,u'ッ':831,u'三':-758,u'不':-2150,u'世':-302,u'中':-968,u'主':-861,u'事':492,u'人':-123,u'会':978,u'保':362,u'入':548,u'初':-3025,u'副':-1566,u'北':-3414,u'区':-422,u'大':-1769,u'天':-865,u'太':-483,u'子':-1519,u'学':760,u'実':1023,u'小':-2009,u'市':-813,u'年':-1060,u'強':1067,u'手':-1519,u'揺':-1033,u'政':1522,u'文':-1355,u'新':-1682,u'日':-1815,u'明':-1462,u'最':-630,u'朝':-1843,u'本':-1650,u'東':-931,u'果':-665,u'次':-2378,u'民':-180,u'気':-1740,u'理':752,u'発':529,u'目':-1584,u'相':-242,u'県':-1165,u'立':-763,u'第':810,u'米':509,u'自':-1353,u'行':838,u'西':-744,u'見':-3874,u'調':1010,u'議':1198,u'込':3041,u'開':1758,u'間':-1257,u'「':-645,u'」':3145,u'ッ':831,u'ア':-587,u'カ':306,u'キ':568}
    UW3__ = {u',':4889,u'1':-800,u'−':-1723,u'、':4889,u'々':-2311,u'〇':5827,u'」':2670,u'〓':-3573,u'あ':-2696,u'い':1006,u'う':2342,u'え':1983,u'お':-4864,u'か':-1163,u'が':3271,u'く':1004,u'け':388,u'げ':401,u'こ':-3552,u'ご':-3116,u'さ':-1058,u'し':-395,u'す':584,u'せ':3685,u'そ':-5228,u'た':842,u'ち':-521,u'っ':-1444,u'つ':-1081,u'て':6167,u'で':2318,u'と':1691,u'ど':-899,u'な':-2788,u'に':2745,u'の':4056,u'は':4555,u'ひ':-2171,u'ふ':-1798,u'へ':1199,u'ほ':-5516,u'ま':-4384,u'み':-120,u'め':1205,u'も':2323,u'や':-788,u'よ':-202,u'ら':727,u'り':649,u'る':5905,u'れ':2773,u'わ':-1207,u'を':6620,u'ん':-518,u'ア':551,u'グ':1319,u'ス':874,u'ッ':-1350,u'ト':521,u'ム':1109,u'ル':1591,u'ロ':2201,u'ン':278,u'・':-3794,u'一':-1619,u'下':-1759,u'世':-2087,u'両':3815,u'中':653,u'主':-758,u'予':-1193,u'二':974,u'人':2742,u'今':792,u'他':1889,u'以':-1368,u'低':811,u'何':4265,u'作':-361,u'保':-2439,u'元':4858,u'党':3593,u'全':1574,u'公':-3030,u'六':755,u'共':-1880,u'円':5807,u'再':3095,u'分':457,u'初':2475,u'別':1129,u'前':2286,u'副':4437,u'力':365,u'動':-949,u'務':-1872,u'化':1327,u'北':-1038,u'区':4646,u'千':-2309,u'午':-783,u'協':-1006,u'口':483,u'右':1233,u'各':3588,u'合':-241,u'同':3906,u'和':-837,u'員':4513,u'国':642,u'型':1389,u'場':1219,u'外':-241,u'妻':2016,u'学':-1356,u'安':-423,u'実':-1008,u'家':1078,u'小':-513,u'少':-3102,u'州':1155,u'市':3197,u'平':-1804,u'年':2416,u'広':-1030,u'府':1605,u'度':1452,u'建':-2352,u'当':-3885,u'得':1905,u'思':-1291,u'性':1822,u'戸':-488,u'指':-3973,u'政':-2013,u'教':-1479,u'数':3222,u'文':-1489,u'新':1764,u'日':2099,u'旧':5792,u'昨':-661,u'時':-1248,u'曜':-951,u'最':-937,u'月':4125,u'期':360,u'李':3094,u'村':364,u'東':-805,u'核':5156,u'森':2438,u'業':484,u'氏':2613,u'民':-1694,u'決':-1073,u'法':1868,u'海':-495,u'無':979,u'物':461,u'特':-3850,u'生':-273,u'用':914,u'町':1215,u'的':7313,u'直':-1835,u'省':792,u'県':6293,u'知':-1528,u'私':4231,u'税':401,u'立':-960,u'第':1201,u'米':7767,u'系':3066,u'約':3663,u'級':1384,u'統':-4229,u'総':1163,u'線':1255,u'者':6457,u'能':725,u'自':-2869,u'英':785,u'見':1044,u'調':-562,u'財':-733,u'費':1777,u'車':1835,u'軍':1375,u'込':-1504,u'通':-1136,u'選':-681,u'郎':1026,u'郡':4404,u'部':1200,u'金':2163,u'長':421,u'開':-1432,u'間':1302,u'関':-1282,u'雨':2009,u'電':-1045,u'非':2066,u'駅':1620,u'1':-800,u'」':2670,u'・':-3794,u'ッ':-1350,u'ア':551,u'グ':1319,u'ス':874,u'ト':521,u'ム':1109,u'ル':1591,u'ロ':2201,u'ン':278}
    UW4__ = {u',':3930,u'.':3508,u'―':-4841,u'、':3930,u'。':3508,u'〇':4999,u'「':1895,u'」':3798,u'〓':-5156,u'あ':4752,u'い':-3435,u'う':-640,u'え':-2514,u'お':2405,u'か':530,u'が':6006,u'き':-4482,u'ぎ':-3821,u'く':-3788,u'け':-4376,u'げ':-4734,u'こ':2255,u'ご':1979,u'さ':2864,u'し':-843,u'じ':-2506,u'す':-731,u'ず':1251,u'せ':181,u'そ':4091,u'た':5034,u'だ':5408,u'ち':-3654,u'っ':-5882,u'つ':-1659,u'て':3994,u'で':7410,u'と':4547,u'な':5433,u'に':6499,u'ぬ':1853,u'ね':1413,u'の':7396,u'は':8578,u'ば':1940,u'ひ':4249,u'び':-4134,u'ふ':1345,u'へ':6665,u'べ':-744,u'ほ':1464,u'ま':1051,u'み':-2082,u'む':-882,u'め':-5046,u'も':4169,u'ゃ':-2666,u'や':2795,u'ょ':-1544,u'よ':3351,u'ら':-2922,u'り':-9726,u'る':-14896,u'れ':-2613,u'ろ':-4570,u'わ':-1783,u'を':13150,u'ん':-2352,u'カ':2145,u'コ':1789,u'セ':1287,u'ッ':-724,u'ト':-403,u'メ':-1635,u'ラ':-881,u'リ':-541,u'ル':-856,u'ン':-3637,u'・':-4371,u'ー':-11870,u'一':-2069,u'中':2210,u'予':782,u'事':-190,u'井':-1768,u'人':1036,u'以':544,u'会':950,u'体':-1286,u'作':530,u'側':4292,u'先':601,u'党':-2006,u'共':-1212,u'内':584,u'円':788,u'初':1347,u'前':1623,u'副':3879,u'力':-302,u'動':-740,u'務':-2715,u'化':776,u'区':4517,u'協':1013,u'参':1555,u'合':-1834,u'和':-681,u'員':-910,u'器':-851,u'回':1500,u'国':-619,u'園':-1200,u'地':866,u'場':-1410,u'塁':-2094,u'士':-1413,u'多':1067,u'大':571,u'子':-4802,u'学':-1397,u'定':-1057,u'寺':-809,u'小':1910,u'屋':-1328,u'山':-1500,u'島':-2056,u'川':-2667,u'市':2771,u'年':374,u'庁':-4556,u'後':456,u'性':553,u'感':916,u'所':-1566,u'支':856,u'改':787,u'政':2182,u'教':704,u'文':522,u'方':-856,u'日':1798,u'時':1829,u'最':845,u'月':-9066,u'木':-485,u'来':-442,u'校':-360,u'業':-1043,u'氏':5388,u'民':-2716,u'気':-910,u'沢':-939,u'済':-543,u'物':-735,u'率':672,u'球':-1267,u'生':-1286,u'産':-1101,u'田':-2900,u'町':1826,u'的':2586,u'目':922,u'省':-3485,u'県':2997,u'空':-867,u'立':-2112,u'第':788,u'米':2937,u'系':786,u'約':2171,u'経':1146,u'統':-1169,u'総':940,u'線':-994,u'署':749,u'者':2145,u'能':-730,u'般':-852,u'行':-792,u'規':792,u'警':-1184,u'議':-244,u'谷':-1000,u'賞':730,u'車':-1481,u'軍':1158,u'輪':-1433,u'込':-3370,u'近':929,u'道':-1291,u'選':2596,u'郎':-4866,u'都':1192,u'野':-1100,u'銀':-2213,u'長':357,u'間':-2344,u'院':-2297,u'際':-2604,u'電':-878,u'領':-1659,u'題':-792,u'館':-1984,u'首':1749,u'高':2120,u'「':1895,u'」':3798,u'・':-4371,u'ッ':-724,u'ー':-11870,u'カ':2145,u'コ':1789,u'セ':1287,u'ト':-403,u'メ':-1635,u'ラ':-881,u'リ':-541,u'ル':-856,u'ン':-3637}
    UW5__ = {u',':465,u'.':-299,u'1':-514,u'E2':-32768,u']':-2762,u'、':465,u'。':-299,u'「':363,u'あ':1655,u'い':331,u'う':-503,u'え':1199,u'お':527,u'か':647,u'が':-421,u'き':1624,u'ぎ':1971,u'く':312,u'げ':-983,u'さ':-1537,u'し':-1371,u'す':-852,u'だ':-1186,u'ち':1093,u'っ':52,u'つ':921,u'て':-18,u'で':-850,u'と':-127,u'ど':1682,u'な':-787,u'に':-1224,u'の':-635,u'は':-578,u'べ':1001,u'み':502,u'め':865,u'ゃ':3350,u'ょ':854,u'り':-208,u'る':429,u'れ':504,u'わ':419,u'を':-1264,u'ん':327,u'イ':241,u'ル':451,u'ン':-343,u'中':-871,u'京':722,u'会':-1153,u'党':-654,u'務':3519,u'区':-901,u'告':848,u'員':2104,u'大':-1296,u'学':-548,u'定':1785,u'嵐':-1304,u'市':-2991,u'席':921,u'年':1763,u'思':872,u'所':-814,u'挙':1618,u'新':-1682,u'日':218,u'月':-4353,u'査':932,u'格':1356,u'機':-1508,u'氏':-1347,u'田':240,u'町':-3912,u'的':-3149,u'相':1319,u'省':-1052,u'県':-4003,u'研':-997,u'社':-278,u'空':-813,u'統':1955,u'者':-2233,u'表':663,u'語':-1073,u'議':1219,u'選':-1018,u'郎':-368,u'長':786,u'間':1191,u'題':2368,u'館':-689,u'1':-514,u'E2':-32768,u'「':363,u'イ':241,u'ル':451,u'ン':-343}
    UW6__ = {u',':227,u'.':808,u'1':-270,u'E1':306,u'、':227,u'。':808,u'あ':-307,u'う':189,u'か':241,u'が':-73,u'く':-121,u'こ':-200,u'じ':1782,u'す':383,u'た':-428,u'っ':573,u'て':-1014,u'で':101,u'と':-105,u'な':-253,u'に':-149,u'の':-417,u'は':-236,u'も':-206,u'り':187,u'る':-135,u'を':195,u'ル':-673,u'ン':-496,u'一':-277,u'中':201,u'件':-800,u'会':624,u'前':302,u'区':1792,u'員':-1212,u'委':798,u'学':-960,u'市':887,u'広':-695,u'後':535,u'業':-697,u'相':753,u'社':-507,u'福':974,u'空':-822,u'者':1811,u'連':463,u'郎':1082,u'1':-270,u'E1':306,u'ル':-673,u'ン':-496}

    def ctype_(self, char):
        """Return the character-type code of *char*: one of ``M`` (kanji
        numeral), ``H`` (kanji), ``I`` (hiragana), ``K`` (katakana),
        ``A`` (latin), ``N`` (digit) or ``O`` (other)."""
        for pattern, value in self.patterns_:
            if pattern.match(char):
                return value
        return u'O'

    def ts_(self, table, key):
        """Look up *key* in the weight *table*; missing features score 0."""
        return table.get(key, 0)

    def split(self, input):
        """Split the unicode string *input* into a list of words.

        Scans the text with a six-character window; at each position the
        feature weights vote on whether a word boundary precedes the
        current character (positive total score = boundary).
        """
        if not input:
            return []

        result = []
        # Pad with three begin/end sentinels so the six-character feature
        # window is always fully populated.
        seg = [u'B3', u'B2', u'B1']
        ctype = [u'O', u'O', u'O']
        for t in input:
            seg.append(t)
            ctype.append(self.ctype_(t))
        seg.append(u'E1')
        seg.append(u'E2')
        seg.append(u'E3')
        ctype.append(u'O')
        ctype.append(u'O')
        ctype.append(u'O')
        word = seg[3]
        # p1..p3: boundary decisions at the three previous positions
        # (U = unknown, B = boundary, O = no boundary).
        p1 = u'U'
        p2 = u'U'
        p3 = u'U'

        for i in range(4, len(seg) - 3):
            score = self.BIAS__
            w1 = seg[i-3]
            w2 = seg[i-2]
            w3 = seg[i-1]
            w4 = seg[i]
            w5 = seg[i+1]
            w6 = seg[i+2]
            c1 = ctype[i-3]
            c2 = ctype[i-2]
            c3 = ctype[i-1]
            c4 = ctype[i]
            c5 = ctype[i+1]
            c6 = ctype[i+2]
            score += self.ts_(self.UP1__, p1)
            score += self.ts_(self.UP2__, p2)
            score += self.ts_(self.UP3__, p3)
            score += self.ts_(self.BP1__, p1 + p2)
            score += self.ts_(self.BP2__, p2 + p3)
            score += self.ts_(self.UW1__, w1)
            score += self.ts_(self.UW2__, w2)
            score += self.ts_(self.UW3__, w3)
            score += self.ts_(self.UW4__, w4)
            score += self.ts_(self.UW5__, w5)
            score += self.ts_(self.UW6__, w6)
            score += self.ts_(self.BW1__, w2 + w3)
            score += self.ts_(self.BW2__, w3 + w4)
            score += self.ts_(self.BW3__, w4 + w5)
            score += self.ts_(self.TW1__, w1 + w2 + w3)
            score += self.ts_(self.TW2__, w2 + w3 + w4)
            score += self.ts_(self.TW3__, w3 + w4 + w5)
            score += self.ts_(self.TW4__, w4 + w5 + w6)
            score += self.ts_(self.UC1__, c1)
            score += self.ts_(self.UC2__, c2)
            score += self.ts_(self.UC3__, c3)
            score += self.ts_(self.UC4__, c4)
            score += self.ts_(self.UC5__, c5)
            score += self.ts_(self.UC6__, c6)
            score += self.ts_(self.BC1__, c2 + c3)
            score += self.ts_(self.BC2__, c3 + c4)
            score += self.ts_(self.BC3__, c4 + c5)
            score += self.ts_(self.TC1__, c1 + c2 + c3)
            score += self.ts_(self.TC2__, c2 + c3 + c4)
            score += self.ts_(self.TC3__, c3 + c4 + c5)
            score += self.ts_(self.TC4__, c4 + c5 + c6)
#           score += self.ts_(self.TC5__, c4 + c5 + c6)
            score += self.ts_(self.UQ1__, p1 + c1)
            score += self.ts_(self.UQ2__, p2 + c2)
            # Fixed: the original port mistakenly applied UQ1__ here a
            # second time; the reference implementation uses UQ3__ for the
            # (p3, c3) feature.
            score += self.ts_(self.UQ3__, p3 + c3)
            score += self.ts_(self.BQ1__, p2 + c2 + c3)
            score += self.ts_(self.BQ2__, p2 + c3 + c4)
            score += self.ts_(self.BQ3__, p3 + c2 + c3)
            score += self.ts_(self.BQ4__, p3 + c3 + c4)
            score += self.ts_(self.TQ1__, p2 + c1 + c2 + c3)
            score += self.ts_(self.TQ2__, p2 + c2 + c3 + c4)
            score += self.ts_(self.TQ3__, p3 + c1 + c2 + c3)
            score += self.ts_(self.TQ4__, p3 + c2 + c3 + c4)
            p = u'O'
            if score > 0:
                # Positive score: a word boundary precedes seg[i].
                result.append(word)
                word = u''
                p = u'B'
            p1 = p2
            p2 = p3
            p3 = p
            word += seg[i]

        result.append(word)
        return result
+
+
class SearchJapanese(SearchLanguage):
    """
    Japanese search implementation: uses no stemmer, but word splitting is
    quite complicated.
    """
    lang = 'ja'

    def init(self, options):
        """Pick the word splitter: ``'mecab'`` selects the ctypes-based
        MeCab binding, ``'default'`` the pure-Python TinySegmenter."""
        splitter_type = options.get('type', 'default')
        if splitter_type not in ('mecab', 'default'):
            raise ValueError(("Japanese tokenizer's type should be 'mecab'"
                              " or 'default'"))
        self.libmecab = None
        self.splitter = (MecabBinder(options) if splitter_type == 'mecab'
                         else TinySegmenter())

    def split(self, input):
        """Delegate word splitting to the configured splitter."""
        return self.splitter.split(input)

    def word_filter(self, stemmed_word):
        """Index only words longer than a single character."""
        return len(stemmed_word) > 1

sphinx/themes/basic/static/searchtools.js

-/*
- * searchtools.js
- * ~~~~~~~~~~~~~~
- *
- * Sphinx JavaScript utilties for the full-text search.
- *
- * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
- */
-
-/**
- * helper function to return a node containing the
- * search summary for a given text. keywords is a list
- * of stemmed words, hlwords is the list of normal, unstemmed
- * words. the first one is used to find the occurance, the
- * latter for highlighting it.
- */
-
-jQuery.makeSearchSummary = function(text, keywords, hlwords) {
-  var textLower = text.toLowerCase();
-  var start = 0;
-  $.each(keywords, function() {
-    var i = textLower.indexOf(this.toLowerCase());
-    if (i > -1)
-      start = i;
-  });
-  start = Math.max(start - 120, 0);
-  var excerpt = ((start > 0) ? '...' : '') +
-  $.trim(text.substr(start, 240)) +
-  ((start + 240 - text.length) ? '...' : '');
-  var rv = $('<div class="context"></div>').text(excerpt);
-  $.each(hlwords, function() {
-    rv = rv.highlightText(this, 'highlighted');
-  });
-  return rv;
-}
-
-/**
- * Porter Stemmer
- */
-var PorterStemmer = function() {
-
-  var step2list = {
-    ational: 'ate',
-    tional: 'tion',
-    enci: 'ence',
-    anci: 'ance',
-    izer: 'ize',
-    bli: 'ble',
-    alli: 'al',
-    entli: 'ent',
-    eli: 'e',
-    ousli: 'ous',
-    ization: 'ize',
-    ation: 'ate',
-    ator: 'ate',
-    alism: 'al',
-    iveness: 'ive',
-    fulness: 'ful',
-    ousness: 'ous',
-    aliti: 'al',
-    iviti: 'ive',
-    biliti: 'ble',
-    logi: 'log'
-  };
-
-  var step3list = {
-    icate: 'ic',
-    ative: '',
-    alize: 'al',
-    iciti: 'ic',
-    ical: 'ic',
-    ful: '',
-    ness: ''
-  };
-
-  var c = "[^aeiou]";          // consonant
-  var v = "[aeiouy]";          // vowel
-  var C = c + "[^aeiouy]*";    // consonant sequence
-  var V = v + "[aeiou]*";      // vowel sequence
-
-  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
-  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
-  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
-  var s_v   = "^(" + C + ")?" + v;                         // vowel in stem
-
-  this.stemWord = function (w) {
-    var stem;
-    var suffix;
-    var firstch;
-    var origword = w;
-
-    if (w.length < 3)
-      return w;
-
-    var re;
-    var re2;
-    var re3;
-    var re4;
-
-    firstch = w.substr(0,1);
-    if (firstch == "y")
-      w = firstch.toUpperCase() + w.substr(1);
-
-    // Step 1a
-    re = /^(.+?)(ss|i)es$/;
-    re2 = /^(.+?)([^s])s$/;
-
-    if (re.test(w))
-      w = w.replace(re,"$1$2");
-    else if (re2.test(w))
-      w = w.replace(re2,"$1$2");
-
-    // Step 1b
-    re = /^(.+?)eed$/;
-    re2 = /^(.+?)(ed|ing)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      re = new RegExp(mgr0);
-      if (re.test(fp[1])) {
-        re = /.$/;
-        w = w.replace(re,"");
-      }
-    }
-    else if (re2.test(w)) {
-      var fp = re2.exec(w);
-      stem = fp[1];
-      re2 = new RegExp(s_v);
-      if (re2.test(stem)) {
-        w = stem;
-        re2 = /(at|bl|iz)$/;
-        re3 = new RegExp("([^aeiouylsz])\\1$");
-        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
-        if (re2.test(w))
-          w = w + "e";
-        else if (re3.test(w)) {
-          re = /.$/;
-          w = w.replace(re,"");
-        }
-        else if (re4.test(w))
-          w = w + "e";
-      }
-    }
-
-    // Step 1c
-    re = /^(.+?)y$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      re = new RegExp(s_v);
-      if (re.test(stem))
-        w = stem + "i";
-    }
-
-    // Step 2
-    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      suffix = fp[2];
-      re = new RegExp(mgr0);
-      if (re.test(stem))
-        w = stem + step2list[suffix];
-    }
-
-    // Step 3
-    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      suffix = fp[2];
-      re = new RegExp(mgr0);
-      if (re.test(stem))
-        w = stem + step3list[suffix];
-    }
-
-    // Step 4
-    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
-    re2 = /^(.+?)(s|t)(ion)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      re = new RegExp(mgr1);
-      if (re.test(stem))
-        w = stem;
-    }
-    else if (re2.test(w)) {
-      var fp = re2.exec(w);
-      stem = fp[1] + fp[2];
-      re2 = new RegExp(mgr1);
-      if (re2.test(stem))
-        w = stem;
-    }
-
-    // Step 5
-    re = /^(.+?)e$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      re = new RegExp(mgr1);
-      re2 = new RegExp(meq1);
-      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
-      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
-        w = stem;
-    }
-    re = /ll$/;
-    re2 = new RegExp(mgr1);
-    if (re.test(w) && re2.test(w)) {
-      re = /.$/;
-      w = w.replace(re,"");
-    }
-
-    // and turn initial Y back to y
-    if (firstch == "y")
-      w = firstch.toLowerCase() + w.substr(1);
-    return w;
-  }
-}
-
-
-/**
- * Search Module
- */
-var Search = {
-
-  _index : null,
-  _queued_query : null,
-  _pulse_status : -1,
-
-  init : function() {
-      var params = $.getQueryParameters();
-      if (params.q) {
-          var query = params.q[0];
-          $('input[name="q"]')[0].value = query;
-          this.performSearch(query);
-      }
-  },
-
-  loadIndex : function(url) {
-    $.ajax({type: "GET", url: url, data: null, success: null,
-            dataType: "script", cache: true});
-  },
-
-  setIndex : function(index) {
-    var q;
-    this._index = index;
-    if ((q = this._queued_query) !== null) {
-      this._queued_query = null;
-      Search.query(q);
-    }
-  },
-
-  hasIndex : function() {
-      return this._index !== null;
-  },
-
-  deferQuery : function(query) {
-      this._queued_query = query;
-  },
-
-  stopPulse : function() {
-      this._pulse_status = 0;
-  },
-
-  startPulse : function() {
-    if (this._pulse_status >= 0)
-        return;
-    function pulse() {
-      Search._pulse_status = (Search._pulse_status + 1) % 4;
-      var dotString = '';
-      for (var i = 0; i < Search._pulse_status; i++)
-        dotString += '.';
-      Search.dots.text(dotString);
-      if (Search._pulse_status > -1)
-        window.setTimeout(pulse, 500);
-    };
-    pulse();
-  },
-
-  /**
-   * perform a search for something
-   */
-  performSearch : function(query) {
-    // create the required interface elements
-    this.out = $('#search-results');
-    this.title = $('<h2>' + _('Searching') + '</h2>').appendTo(this.out);
-    this.dots = $('<span></span>').appendTo(this.title);
-    this.status = $('<p style="display: none"></p>').appendTo(this.out);
-    this.output = $('<ul class="search"/>').appendTo(this.out);
-
-    $('#search-progress').text(_('Preparing search...'));
-    this.startPulse();
-
-    // index already loaded, the browser was quick!
-    if (this.hasIndex())
-      this.query(query);
-    else
-      this.deferQuery(query);
-  },
-
-  query : function(query) {
-    var stopwords = ['and', 'then', 'into', 'it', 'as', 'are', 'in',
-                     'if', 'for', 'no', 'there', 'their', 'was', 'is',
-                     'be', 'to', 'that', 'but', 'they', 'not', 'such',
-                     'with', 'by', 'a', 'on', 'these', 'of', 'will',
-                     'this', 'near', 'the', 'or', 'at'];
-
-    // stem the searchterms and add them to the correct list
-    var stemmer = new PorterStemmer();
-    var searchterms = [];
-    var excluded = [];
-    var hlterms = [];
-    var tmp = query.split(/\s+/);
-    var object = (tmp.length == 1) ? tmp[0].toLowerCase() : null;
-    for (var i = 0; i < tmp.length; i++) {
-      if ($u.indexOf(stopwords, tmp[i]) != -1 || tmp[i].match(/^\d+$/) ||
-          tmp[i] == "") {
-        // skip this "word"
-        continue;
-      }
-      // stem the word
-      var word = stemmer.stemWord(tmp[i]).toLowerCase();
-      // select the correct list
-      if (word[0] == '-') {
-        var toAppend = excluded;
-        word = word.substr(1);
-      }
-      else {
-        var toAppend = searchterms;
-        hlterms.push(tmp[i].toLowerCase());
-      }
-      // only add if not already in the list
-      if (!$.contains(toAppend, word))
-        toAppend.push(word);
-    };
-    var highlightstring = '?highlight=' + $.urlencode(hlterms.join(" "));
-
-    // console.debug('SEARCH: searching for:');
-    // console.info('required: ', searchterms);
-    // console.info('excluded: ', excluded);
-
-    // prepare search
-    var filenames = this._index.filenames;
-    var titles = this._index.titles;
-    var terms = this._index.terms;
-    var objects = this._index.objects;
-    var objtypes = this._index.objtypes;
-    var objnames = this._index.objnames;
-    var fileMap = {};
-    var files = null;
-    // different result priorities
-    var importantResults = [];
-    var objectResults = [];
-    var regularResults = [];
-    var unimportantResults = [];
-    $('#search-progress').empty();
-
-    // lookup as object
-    if (object != null) {
-      for (var prefix in objects) {
-        for (var name in objects[prefix]) {
-          var fullname = (prefix ? prefix + '.' : '') + name;
-          if (fullname.toLowerCase().indexOf(object) > -1) {
-            match = objects[prefix][name];
-            descr = objnames[match[1]] + _(', in ') + titles[match[0]];
-            // XXX the generated anchors are not generally correct
-            // XXX there may be custom prefixes
-            result = [filenames[match[0]], fullname, '#'+fullname, descr];
-            switch (match[2]) {
-            case 1: objectResults.push(result); break;
-            case 0: importantResults.push(result); break;
-            case 2: unimportantResults.push(result); break;
-            }
-          }
-        }
-      }
-    }
-
-    // sort results descending
-    objectResults.sort(function(a, b) {
-      return (a[1] > b[1]) ? -1 : ((a[1] < b[1]) ? 1 : 0);
-    });
-
-    importantResults.sort(function(a, b) {
-      return (a[1] > b[1]) ? -1 : ((a[1] < b[1]) ? 1 : 0);
-    });
-
-    unimportantResults.sort(function(a, b) {
-      return (a[1] > b[1]) ? -1 : ((a[1] < b[1]) ? 1 : 0);
-    });
-
-
-    // perform the search on the required terms
-    for (var i = 0; i < searchterms.length; i++) {
-      var word = searchterms[i];
-      // no match but word was a required one
-      if ((files = terms[word]) == null)
-        break;
-      if (files.length == undefined) {
-        files = [files];
-      }
-      // create the mapping
-      for (var j = 0; j < files.length; j++) {
-        var file = files[j];
-        if (file in fileMap)
-          fileMap[file].push(word);
-        else
-          fileMap[file] = [word];
-      }
-    }
-
-    // now check if the files don't contain excluded terms
-    for (var file in fileMap) {
-      var valid = true;
-
-      // check if all requirements are matched
-      if (fileMap[file].length != searchterms.length)
-        continue;
-
-      // ensure that none of the excluded terms is in the
-      // search result.
-      for (var i = 0; i < excluded.length; i++) {
-        if (terms[excluded[i]] == file ||
-            $.contains(terms[excluded[i]] || [], file)) {
-          valid = false;
-          break;
-        }
-      }
-
-      // if we have still a valid result we can add it
-      // to the result list
-      if (valid)
-        regularResults.push([filenames[file], titles[file], '', null]);
-    }
-
-    // delete unused variables in order to not waste
-    // memory until list is retrieved completely
-    delete filenames, titles, terms;
-
-    // now sort the regular results descending by title
-    regularResults.sort(function(a, b) {
-      var left = a[1].toLowerCase();
-      var right = b[1].toLowerCase();
-      return (left > right) ? -1 : ((left < right) ? 1 : 0);
-    });
-
-    // combine all results
-    var results = unimportantResults.concat(regularResults)
-      .concat(objectResults).concat(importantResults);
-
-    // print the results
-    var resultCount = results.length;
-    function displayNextItem() {
-      // results left, load the summary and display it
-      if (results.length) {
-        var item = results.pop();
-        var listItem = $('<li style="display:none"></li>');
-        if (DOCUMENTATION_OPTIONS.FILE_SUFFIX == '') {
-          // dirhtml builder
-          var dirname = item[0] + '/';
-          if (dirname.match(/\/index\/$/)) {
-            dirname = dirname.substring(0, dirname.length-6);
-          } else if (dirname == 'index/') {
-            dirname = '';
-          }
-          listItem.append($('<a/>').attr('href',
-            DOCUMENTATION_OPTIONS.URL_ROOT + dirname +
-            highlightstring + item[2]).html(item[1]));
-        } else {
-          // normal html builders
-          listItem.append($('<a/>').attr('href',
-            item[0] + DOCUMENTATION_OPTIONS.FILE_SUFFIX +
-            highlightstring + item[2]).html(item[1]));
-        }
-        if (item[3]) {
-          listItem.append($('<span> (' + item[3] + ')</span>'));
-          Search.output.append(listItem);
-          listItem.slideDown(5, function() {
-            displayNextItem();
-          });
-        } else if (DOCUMENTATION_OPTIONS.HAS_SOURCE) {
-          $.get(DOCUMENTATION_OPTIONS.URL_ROOT + '_sources/' +
-                item[0] + '.txt', function(data) {
-            if (data != '') {
-              listItem.append($.makeSearchSummary(data, searchterms, hlterms));
-              Search.output.append(listItem);
-            }
-            listItem.slideDown(5, function() {
-              displayNextItem();
-            });
-          });
-        } else {
-          // no source available, just display title
-          Search.output.append(listItem);
-          listItem.slideDown(5, function() {
-            displayNextItem();
-          });
-        }
-      }
-      // search finished, update title and status message
-      else {
-        Search.stopPulse();
-        Search.title.text(_('Search Results'));
-        if (!resultCount)
-          Search.status.text(_('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.'));
-        else
-            Search.status.text(_('Search finished, found %s page(s) matching the search query.').replace('%s', resultCount));
-        Search.status.fadeIn(500);
-      }
-    }
-    displayNextItem();
-  }
-}
-
-$(document).ready(function() {
-  Search.init();
-});

sphinx/themes/basic/static/searchtools.js_t

+/*
+ * searchtools.js_t
+ * ~~~~~~~~~~~~~~~~
+ *
+ * Sphinx JavaScript utilities for the full-text search.
+ *
+ * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+/**
+ * helper function to return a node containing the
+ * search summary for a given text. keywords is a list
+ * of stemmed words, hlwords is the list of normal, unstemmed
+ * words. the first one is used to find the occurrence, the
+ * latter for highlighting it.
+ */
+
+jQuery.makeSearchSummary = function(text, keywords, hlwords) {
+  var textLower = text.toLowerCase();
+  var start = 0;
+  $.each(keywords, function() {
+    var i = textLower.indexOf(this.toLowerCase());
+    if (i > -1)
+      start = i;
+  });
+  start = Math.max(start - 120, 0);
+  var excerpt = ((start > 0) ? '...' : '') +
+  $.trim(text.substr(start, 240)) +
+  ((start + 240 - text.length) ? '...' : '');
+  var rv = $('<div class="context"></div>').text(excerpt);
+  $.each(hlwords, function() {
+    rv = rv.highlightText(this, 'highlighted');
+  });
+  return rv;
+}
+
+{{ search_language_stemming_code|safe }}
+
+/**
+ * Search Module
+ */
+var Search = {
+
+  _index : null,
+  _queued_query : null,
+  _pulse_status : -1,
+
+  init : function() {
+      var params = $.getQueryParameters();
+      if (params.q) {
+          var query = params.q[0];
+          $('input[name="q"]')[0].value = query;
+          this.performSearch(query);
+      }
+  },
+
+  loadIndex : function(url) {
+    $.ajax({type: "GET", url: url, data: null, success: null,
+            dataType: "script", cache: true});
+  },
+
+  setIndex : function(index) {
+    var q;
+    this._index = index;
+    if ((q = this._queued_query) !== null) {
+      this._queued_query = null;
+      Search.query(q);
+    }
+  },
+
+  hasIndex : function() {
+      return this._index !== null;
+  },
+
+  deferQuery : function(query) {
+      this._queued_query = query;
+  },
+
+  stopPulse : function() {
+      this._pulse_status = 0;
+  },
+
+  startPulse : function() {
+    if (this._pulse_status >= 0)
+        return;
+    function pulse() {
+      Search._pulse_status = (Search._pulse_status + 1) % 4;
+      var dotString = '';
+      for (var i = 0; i < Search._pulse_status; i++)
+        dotString += '.';
+      Search.dots.text(dotString);
+      if (Search._pulse_status > -1)
+        window.setTimeout(pulse, 500);
+    };
+    pulse();
+  },
+
+  /**
+   * perform a search for something
+   */
+  performSearch : function(query) {
+    // create the required interface elements
+    this.out = $('#search-results');
+    this.title = $('<h2>' + _('Searching') + '</h2>').appendTo(this.out);
+    this.dots = $('<span></span>').appendTo(this.title);
+    this.status = $('<p style="display: none"></p>').appendTo(this.out);
+    this.output = $('<ul class="search"/>').appendTo(this.out);
+
+    $('#search-progress').text(_('Preparing search...'));
+    this.startPulse();
+
+    // index already loaded, the browser was quick!
+    if (this.hasIndex())
+      this.query(query);
+    else
+      this.deferQuery(query);
+  },
+
+  query : function(query) {
+    var stopwords = {{ search_language_stop_words }};
+
+    // Stem the searchterms and add them to the correct list
+    var stemmer = new Stemmer();
+    var searchterms = [];
+    var excluded = [];
+    var hlterms = [];
+    var tmp = query.split(/\s+/);
+    var object = (tmp.length == 1) ? tmp[0].toLowerCase() : null;
+    for (var i = 0; i < tmp.length; i++) {
+      if ($u.indexOf(stopwords, tmp[i]) != -1 || tmp[i].match(/^\d+$/) ||
+          tmp[i] == "") {
+        // skip this "word"
+        continue;
+      }
+      // stem the word
+      var word = stemmer.stemWord(tmp[i]).toLowerCase();
+      // select the correct list
+      if (word[0] == '-') {
+        var toAppend = excluded;
+        word = word.substr(1);
+      }
+      else {
+        var toAppend = searchterms;
+        hlterms.push(tmp[i].toLowerCase());
+      }
+      // only add if not already in the list
+      if (!$.contains(toAppend, word))
+        toAppend.push(word);
+    };
+    var highlightstring = '?highlight=' + $.urlencode(hlterms.join(" "));
+
+    // console.debug('SEARCH: searching for:');
+    // console.info('required: ', searchterms);
+    // console.info('excluded: ', excluded);
+
+    // prepare search
+    var filenames = this._index.filenames;
+    var titles = this._index.titles;
+    var terms = this._index.terms;
+    var objects = this._index.objects;
+    var objtypes = this._index.objtypes;
+    var objnames = this._index.objnames;
+    var fileMap = {};
+    var files = null;
+    // different result priorities
+    var importantResults = [];
+    var objectResults = [];
+    var regularResults = [];
+    var unimportantResults = [];
+    $('#search-progress').empty();
+
+    // lookup as object
+    if (object != null) {
+      for (var prefix in objects) {
+        for (var name in objects[prefix]) {
+          var fullname = (prefix ? prefix + '.' : '') + name;
+          if (fullname.toLowerCase().indexOf(object) > -1) {
+            match = objects[prefix][name];
+            descr = objnames[match[1]] + _(', in ') + titles[match[0]];
+            // XXX the generated anchors are not generally correct
+            // XXX there may be custom prefixes
+            result = [filenames[match[0]], fullname, '#'+fullname, descr];
+            switch (match[2]) {
+            case 1: objectResults.push(result); break;
+            case 0: importantResults.push(result); break;
+            case 2: unimportantResults.push(result); break;
+            }
+          }
+        }
+      }
+    }
+
+    // sort results descending
+    objectResults.sort(function(a, b) {
+      return (a[1] > b[1]) ? -1 : ((a[1] < b[1]) ? 1 : 0);
+    });
+
+    importantResults.sort(function(a, b) {
+      return (a[1] > b[1]) ? -1 : ((a[1] < b[1]) ? 1 : 0);
+    });
+
+    unimportantResults.sort(function(a, b) {
+      return (a[1] > b[1]) ? -1 : ((a[1] < b[1]) ? 1 : 0);
+    });
+
+
+    // perform the search on the required terms
+    for (var i = 0; i < searchterms.length; i++) {
+      var word = searchterms[i];
+      // no match but word was a required one
+      if ((files = terms[word]) == null)
+        break;
+      if (files.length == undefined) {
+        files = [files];
+      }
+      // create the mapping
+      for (var j = 0; j < files.length; j++) {
+        var file = files[j];
+        if (file in fileMap)
+          fileMap[file].push(word);
+        else
+          fileMap[file] = [word];
+      }
+    }
+
+    // now check if the files don't contain excluded terms
+    for (var file in fileMap) {
+      var valid = true;
+
+      // check if all requirements are matched
+      if (fileMap[file].length != searchterms.length)
+        continue;
+
+      // ensure that none of the excluded terms is in the
+      // search result.
+      for (var i = 0; i < excluded.length; i++) {
+        if (terms[excluded[i]] == file ||
+            $.contains(terms[excluded[i]] || [], file)) {
+          valid = false;
+          break;
+        }
+      }
+
+      // if we have still a valid result we can add it
+      // to the result list
+      if (valid)
+        regularResults.push([filenames[file], titles[file], '', null]);
+    }
+
+    // delete unused variables in order to not waste
+    // memory until list is retrieved completely
+    delete filenames, titles, terms;
+
+    // now sort the regular results descending by title
+    regularResults.sort(function(a, b) {
+      var left = a[1].toLowerCase();
+      var right = b[1].toLowerCase();
+      return (left > right) ? -1 : ((left < right) ? 1 : 0);
+    });
+
+    // combine all results
+    var results = unimportantResults.concat(regularResults)
+      .concat(objectResults).concat(importantResults);
+
+    // print the results
+    var resultCount = results.length;
+    function displayNextItem() {
+      // results left, load the summary and display it
+      if (results.length) {
+        var item = results.pop();
+        var listItem = $('<li style="display:none"></li>');
+        if (DOCUMENTATION_OPTIONS.FILE_SUFFIX == '') {
+          // dirhtml builder
+          var dirname = item[0] + '/';
+          if (dirname.match(/\/index\/$/)) {
+            dirname = dirname.substring(0, dirname.length-6);
+          } else if (dirname == 'index/') {
+            dirname = '';
+          }
+          listItem.append($('<a/>').attr('href',
+            DOCUMENTATION_OPTIONS.URL_ROOT + dirname +
+            highlightstring + item[2]).html(item[1]));
+        } else {
+          // normal html builders
+          listItem.append($('<a/>').attr('href',
+            item[0] + DOCUMENTATION_OPTIONS.FILE_SUFFIX +
+            highlightstring + item[2]).html(item[1]));
+        }
+        if (item[3]) {
+          listItem.append($('<span> (' + item[3] + ')</span>'));
+          Search.output.append(listItem);
+          listItem.slideDown(5, function() {
+            displayNextItem();
+          });
+        } else if (DOCUMENTATION_OPTIONS.HAS_SOURCE) {
+          $.get(DOCUMENTATION_OPTIONS.URL_ROOT + '_sources/' +
+                item[0] + '.txt', function(data) {
+            if (data != '') {
+              listItem.append($.makeSearchSummary(data, searchterms, hlterms));
+              Search.output.append(listItem);
+            }
+            listItem.slideDown(5, function() {
+              displayNextItem();
+            });
+          });
+        } else {
+          // no source available, just display title
+          Search.output.append(listItem);
+          listItem.slideDown(5, function() {
+            displayNextItem();
+          });
+        }
+      }
<