Commits

mitsuhiko  committed b687b3c Merge

Merged search with main.

  • Parent commits 627a304, ac6f838

Files changed (4)

File solace/search.py

+# -*- coding: utf-8 -*-
+"""
+    solace.search
+    ~~~~~~~~~~~~~
+
+    Implements simple search support based on Whoosh and Xapian.  The
+    search is currently limited to topic results only, which keeps the
+    implementation a lot simpler.
+
+    :copyright: (c) 2009 by Plurk Inc., see AUTHORS for more details.
+    :license: BSD, see LICENSE for more details.
+"""
+from __future__ import with_statement
+import re
+from threading import Lock
+from itertools import chain
+from werkzeug import import_string
+from solace.i18n import _
+from solace import settings
+
+
+_engine = None
+_engine_lock = Lock()
+_token_re = re.compile(r'''(?ux)
+    (?P<operator>[()-]) |
+    (?P<string>"[^"]*") |
+    (?P<arg>[^\s()]+)
+''')
+# used by the engines to split text into indexable words
+_word_re = re.compile(r'(?u)\w+')
+
+
+def get_engine():
+    """Creates or returns the engine."""
+    global _engine
+    with _engine_lock:
+        if _engine is None:
+            _engine = import_string(settings.SEARCH_ENGINE)()
+        return _engine
+
+
+def refresh_engine():
+    """Gets rid of the existing engine.  Useful for unittesting."""
+    global _engine
+    _engine = None
+
+
+class _QType(type):
+    """Metaclass for Q expressions that attaches the type as a
+    string to the class based on the class name.
+    """
+
+    def __new__(cls, name, bases, d):
+        if name[0] == 'Q':
+            d['type'] = name[1:].lower()
+        return type.__new__(cls, name, bases, d)
+
+
+class _QExpr(object):
+    """Baseclass for query expressions."""
+    __metaclass__ = _QType
+
+    def __unicode__(self):
+        return u'?'
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return '<Query \'%s\'>' % self
+
+
+class _QBinExpr(_QExpr):
+
+    def __init__(self, left, right):
+        self.left = left
+        self.right = right
+
+
+class QEmpty(_QExpr):
+    """An empty query."""
+
+    def __unicode__(self):
+        return u'<empty clause>'
+
+
+class QTerm(_QExpr):
+    """A searchterm."""
+
+    def __init__(self, s, pos):
+        self.s = s
+        self.pos = pos
+
+    def __unicode__(self):
+        return self.s
+
+
+class QPhrase(QTerm):
+    """A phrase of multiple terms."""
+
+    def __unicode__(self):
+        return '"%s"' % self.s
+
+
+class QConcat(_QBinExpr):
+    """Standard term concatenation.  Matches like an AND but will a fallback
+    to OR if AND does not find enough matches.
+    """
+
+    def __unicode__(self):
+        return u'%s %s' % (self.left, self.right)
+
+
+class QOr(_QBinExpr):
+    """Matches if one of the expressions match."""
+
+    def __unicode__(self):
+        return u'(%s OR %s)' % (self.left, self.right)
+
+
+class QAnd(_QBinExpr):
+    """Matches if both expressions match."""
+
+    def __unicode__(self):
+        return u'(%s AND %s)' % (self.left, self.right)
+
+
+class QAndNot(QAnd):
+    """Matches if the first and not the second match."""
+
+    def __unicode__(self):
+        return u'(%s AND NOT %s)' % (self.left, self.right)
+
+
+def parse_query(q):
+    """Parses a query into an abstract node tree that can be used in
+    the engines.
+
+    >>> parse_query("foo AND bar")
+    <Query '(foo AND bar)'>
+    >>> parse_query("foo AND bar OR baz")
+    <Query '((foo AND bar) OR baz)'>
+    >>> parse_query("foo AND (bar OR baz)")
+    <Query '(foo AND (bar OR baz))'>
+    >>> parse_query("foo AND NOT (bar OR baz)")
+    <Query '(foo AND NOT (bar OR baz))'>
+    >>> parse_query("foo NOT (bar OR baz)")
+    <Query '(foo AND NOT (bar OR baz))'>
+    >>> parse_query('"foo AND bar" OR baz')
+    <Query '("foo AND bar" OR baz)'>
+    """
+    return _QueryParser(q).parse_concat(False)
+
+
+class _QueryParser(object):
+    """Parses a query string.  The syntax for queries depends on the
+    language of the current user.  As a rule of thumb, the English
+    keywords will always work.
+    """
+
+    def __init__(self, q):
+        # the keywords for the query parser.  We always support the
+        # English keywords and additionally the translated ones.
+        keywords = {_(u'AND'): 'and', _(u'OR'): 'or', _(u'NOT'): 'not',
+                    u'AND': 'and', u'OR': 'or', u'NOT': 'not'}
+
+        # tokenize the query
+        tokens = []
+        for match in _token_re.finditer(q):
+            for key, value in match.groupdict().iteritems():
+                if value is not None:
+                    if key == 'arg' and value in keywords:
+                        tokens.append(('keyword', keywords[value]))
+                    elif key == 'string':
+                        tokens.append(('longarg', value[1:-1]))
+                    else:
+                        tokens.append((key, value))
+                    break
+
+        tokens.reverse()
+        self.tokens = tokens
+        self.term_pos = 0
+
+    def parse_concat(self, paren_expr):
+        args = []
+        while self.tokens:
+            if paren_expr and self.tokens[-1] == ('operator', ')'):
+                self.tokens.pop()
+                break
+            args.append(self.parse_or())
+        if not args:
+            return QEmpty()
+        elif len(args) == 1:
+            return args[-1]
+        return reduce(lambda a, b: QConcat(a, b), args)
+
+    def parse_or(self):
+        q = self.parse_and()
+        while self.tokens and self.tokens[-1] == ('keyword', 'or'):
+            self.tokens.pop()
+            q = QOr(q, self.parse_and())
+        return q
+
+    def parse_and(self):
+        q = self.parse_not()
+        while self.tokens and self.tokens[-1] == ('keyword', 'and'):
+            self.tokens.pop()
+            if self.tokens and self.tokens[-1] == ('keyword', 'not'):
+                self.tokens.pop()
+                cls = QAndNot
+            else:
+                cls = QAnd
+            q = cls(q, self.parse_not())
+        return q
+
+    def parse_not(self):
+        q = self.parse_primary()
+        while self.tokens and self.tokens[-1] == ('keyword', 'not'):
+            self.tokens.pop()
+            q = QAndNot(q, self.parse_primary())
+        return q
+
+    def parse_primary(self):
+        while self.tokens:
+            tt, val = self.tokens.pop()
+            if tt in ('arg', 'longarg'):
+                self.term_pos += 1
+                if tt == 'arg':
+                    cls = QTerm
+                else:
+                    cls = QPhrase
+                return cls(val, self.term_pos)
+            if tt == 'operator' and val == '(':
+                return self.parse_concat(True)
+        return QEmpty()
+
+
+def update_search(changes):
+    """Updates the search index."""
+    engine = get_engine()
+    for model, operation in reversed(changes):
+        if isinstance(model, Topic):
+            if operation == 'insert':
+                engine.add_topic(model)
+            elif operation == 'delete':
+                engine.remove_topic(model)
+            elif operation == 'update':
+                engine.update_topic(model)
+        elif isinstance(model, Post) and not model.is_question:
+            if operation == 'insert':
+                engine.add_post(model)
+            elif operation == 'delete':
+                engine.remove_post(model)
+            elif operation == 'update':
+                engine.update_post(model)
+
+
+class SearchEngine(object):
+    """Baseclass for all search engines."""
+
+    def add_topic(self, topic):
+        pass
+
+    def remove_topic(self, topic):
+        pass
+
+    def update_topic(self, topic):
+        pass
+
+    def add_post(self, post):
+        pass
+
+    def remove_post(self, post):
+        pass
+
+    def update_post(self, post):
+        pass
+
+    def get_stemmer(self, locale=None):
+        return lambda x: x
+
+    def stem(self, word, locale=None):
+        return self.get_stemmer(locale)(word)
+
+    def iter_stemmed_words(self, text, locale=None):
+        stemmer = self.get_stemmer(locale)
+        for word in _word_re.findall(text):
+            yield stemmer(word.lower())
+
+
+class XapianEngine(SearchEngine):
+    """An engine that uses Xapian."""
+
+    def __init__(self):
+        import xapian
+        self._xap = xapian
+        self._stemmers = {}
+
+    def get_stemmer(self, locale=None):
+        """Use the xapian stemmers."""
+        if locale is not None:
+            locale = str(locale)
+        stemmer = self._stemmers.get(locale)
+        if stemmer is not None:
+            return stemmer
+        try:
+            stemmer = self._xap.Stem(locale or 'en')
+        except self._xap.InvalidArgumentError:
+            stemmer = self._xap.Stem('en')
+        self._stemmers[locale] = stemmer
+        return stemmer
+
+    def _get_connection(self, writable=False):
+        """Return a connection to the Xapian database."""
+        if writable:
+            return self._xap.WritableDatabase(settings.XAPIAN_DATABASE,
+                                          self._xap.DB_CREATE_OR_OPEN)
+        return self._xap.Database(settings.XAPIAN_DATABASE)
+
+    def _index_topic(self, topic, doc):
+        word_iter = chain(
+            self.iter_stemmed_words(topic.title, topic.locale),
+            self.iter_stemmed_words(topic.question.text, topic.locale)
+        )
+        for idx, word in enumerate(word_iter):
+            doc.add_posting(word, idx)
+
+    def _topic_term(self, topic):
+        return '_TOPIC_%d' % topic.id
+
+    def _find_topic(self, topic, con=None):
+        if con is None:
+            con = self._get_connection()
+        enq = self._xap.Enquire(con)
+        q = self._xap.Query(self._topic_term(topic))
+        enq.set_query(q)
+        rv = list(enq.get_mset(0, 1))
+        if rv:
+            return con.get_document(rv[0].get_docid())
+
+    def add_topic(self, topic):
+        doc = self._xap.Document()
+        doc.add_term(self._topic_term(topic))
+        self._index_topic(topic, doc)
+        con = self._get_connection(writable=True)
+        con.add_document(doc)
+        con.flush()
+
+    def update_topic(self, topic):
+        con = self._get_connection(writable=True)
+        doc = self._find_topic(topic, con)
+        # let's just say that's intentional
+        if doc is None:
+            return
+
+        self._index_topic(topic, doc)
+        con.replace_document(doc.get_docid(), doc)
+        con.flush()
+
+    def remove_topic(self, topic):
+        con = self._get_connection(writable=True)
+        doc = self._find_topic(topic, con)
+        if doc is not None:
+            con.delete_document(doc.get_docid())
+            con.flush()
+
+
+# database signal handlers
+from solace.models import Post, Topic
+from solace.signals import after_models_committed
+after_models_committed.connect(update_search)
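
Note that this commit only covers indexing: XapianEngine gains add/update/remove hooks but no method yet that turns the parsed Q tree into an actual Xapian query.  Below is a minimal sketch of how such a translation could look, relying only on the node "type" attribute set by _QType and on standard Xapian query operators; the function name q_to_xapian and the treatment of QConcat as a plain AND (the docstring asks for an OR fallback) are assumptions, not part of the commit.

    import xapian
    from solace.search import parse_query

    def q_to_xapian(node, stemmer):
        """Recursively translate a parse_query() node tree into a
        xapian.Query.  The stemmer is expected to be what
        XapianEngine.get_stemmer() returns (a callable xapian.Stem).
        """
        if node.type == 'empty':
            return xapian.Query()        # matches nothing
        if node.type == 'term':
            return xapian.Query(stemmer(node.s.lower()))
        if node.type == 'phrase':
            # postings are stemmed lowercase words, so treat the
            # quoted phrase the same way
            words = [stemmer(w.lower()) for w in node.s.split()]
            return xapian.Query(xapian.Query.OP_PHRASE, words)
        ops = {'concat': xapian.Query.OP_AND,   # simplified, see above
               'and': xapian.Query.OP_AND,
               'or': xapian.Query.OP_OR,
               'andnot': xapian.Query.OP_AND_NOT}
        return xapian.Query(ops[node.type],
                            q_to_xapian(node.left, stemmer),
                            q_to_xapian(node.right, stemmer))

    # usage sketch: feed the result to an Enquire object on a read-only
    # connection, the same way _find_topic does for the _TOPIC_<id> terms
    # xq = q_to_xapian(parse_query(u'foo AND bar'), engine.get_stemmer())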

File solace/settings.py

 #: use TLS for SMTP?
 SMTP_USE_TLS = False
 
+#: the search engine to use.
+SEARCH_ENGINE = 'solace.search.XapianEngine'
+
+#: if xapian is used as search engine, this is the database it
+#: will use.
+XAPIAN_DATABASE = '/tmp/solace.xapdb'
+
 #: the default language that is assumed if the client does not send
 #: a language information etc.  This language also has to be listed
 #: in the LANGUAGE_SECTIONS list.
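
Because get_engine() resolves SEARCH_ENGINE with Werkzeug's import_string, any importable SearchEngine subclass can be plugged in through this setting.  The stub below is a hypothetical example for deployments without Xapian installed; the module path and class name are invented for illustration.  Note that get_engine() caches the instance, so code that switches the setting at runtime also has to call refresh_engine().

    # e.g. in a hypothetical module myapp/nullsearch.py
    from solace.search import SearchEngine

    class NullEngine(SearchEngine):
        """Keeps the no-op hooks of the base class, so the indexing
        calls triggered by after_models_committed do nothing."""

    # and in the settings:
    # SEARCH_ENGINE = 'myapp.nullsearch.NullEngine'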

File solace/tests/__init__.py

 
 def suite():
     from solace.tests import models, querycount, kb_views, core_views, \
-         templating, signals, link_check, validation
+         templating, signals, link_check, validation, search
     suite = unittest.TestSuite()
     suite.addTest(models.suite())
     suite.addTest(querycount.suite())
     suite.addTest(signals.suite())
     suite.addTest(link_check.suite())
     suite.addTest(validation.suite())
+    suite.addTest(search.suite())
     return suite

File solace/tests/search.py

+# -*- coding: utf-8 -*-
+"""
+    solace.tests.search
+    ~~~~~~~~~~~~~~~~~~~
+
+    Test the search.
+
+    :copyright: (c) 2009 by Plurk Inc., see AUTHORS for more details.
+    :license: BSD, see LICENSE for more details.
+"""
+import re
+import unittest
+import doctest
+from solace.tests import SolaceTestCase
+
+from solace import search, settings
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(doctest.DocTestSuite(search))
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='suite')
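
So far this module only re-runs the doctests embedded in solace.search.  A regular TestCase for tokenizer edge cases could live next to it; the sketch below relies only on the imports already present in this file, and the class and test names are invented, not part of the commit.

    class QueryParserTestCase(unittest.TestCase):

        def test_empty_query(self):
            # whitespace-only input collapses to the empty clause
            self.assertTrue(isinstance(search.parse_query(u'   '),
                                       search.QEmpty))

        def test_phrase_keeps_keywords(self):
            # keywords inside quotes must not act as operators
            q = search.parse_query(u'"foo AND bar"')
            self.assertTrue(isinstance(q, search.QPhrase))
            self.assertEqual(q.s, u'foo AND bar')

The suite() function would then additionally register it with suite.addTest(unittest.makeSuite(QueryParserTestCase)).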