Commits

Scott Wilson committed 9cb9231

This one was applied.
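
For context: the patch below factors query-parser setup out into a reusable prepare_query_parser() method and adds estimate(), spell(), suggest(), and similar() helpers to the xodb Xapian backend. A minimal sketch of the headline similar() call, assuming `backend` is an already-configured xodb Xapian backend and `session` an open session (both stand-in names, not part of the patch):

    # Sketch only: `backend` and `session` are assumed stand-ins; the
    # method name and signature come from the diff below.
    for obj in backend.similar(session, "solar power storage",
                               limit=10, lang="en"):
        # Yields objects loaded via session.load(), in relevance order.
        print obj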

  • Parent commit 7b50f08


Files changed (2)

-similar-select.diff
 # Placed by Bitbucket

File similar-select.diff

-diff -r 1ef5087a649e xodb/backends/xapian.py
---- a/xodb/backends/xapian.py	Tue Apr 21 18:17:28 2009 -0700
-+++ b/xodb/backends/xapian.py	Fri Apr 24 14:01:46 2009 +0200
-@@ -3,10 +3,11 @@
- import math
- import operator
- from operator import itemgetter
-+import string
- import unicodedata
- import translitcodec
- import xapian
--from xapian import MatchDecider, QueryParser
-+from xapian import MatchDecider, QueryParser, Query
- from cPickle import dumps, loads
- 
- from xodb import Schema, Backend, Attribute
-@@ -401,6 +406,20 @@
-         if self._writable:
-             self.xapian.commit_transaction()
- 
-+    def prepare_query_parser(self, lang=None, default_op=xapian.Query.OP_AND):
-+        qp = xapian.QueryParser()
-+        qp.set_database(self.xapian)
-+        qp.set_default_op(default_op)
-+        if self.relevance_prefixes:
-+            for key, value in self.relevance_prefixes.items():
-+                qp.add_prefix(key, value)
-+        if self.boolean_prefixes:
-+            for key, value in self.boolean_prefixes.items():
-+                qp.add_boolean_prefix(key, value)
-+        if lang is not None:
-+            qp.set_stemmer(xapian.Stem(lang))
-+        return qp
-+
-     def query(self, session, query, offset=0, limit=None, order=None,
-               reverse=False, lang=None, partial=False, check=0,
-               match_decider=None, match_spy=None, filter=None,
-@@ -416,15 +435,7 @@
-             if query == "":
-                 query = xapian.Query("")
-             else:
--                qp = xapian.QueryParser()
--                qp.set_database(self.xapian)
--                qp.set_default_op(default_op)
--                if self.relevance_prefixes:
--                    for key, value in self.relevance_prefixes.items():
--                        qp.add_prefix(key, value)
--                if self.boolean_prefixes:
--                    for key, value in self.boolean_prefixes.items():
--                        qp.add_boolean_prefix(key, value)
-+                qp = self.prepare_query_parser(lang, default_op)
-                 query = qp.parse_query(query, parser_flags)
-         else:
-             # it's a list of oids, let's fetch them all in one query
-@@ -435,6 +446,145 @@
-         enq = xapian.Enquire(self.xapian)
-         enq.set_query(query)
- 
-+        return self._get_result(session, enq, offset, limit, order, reverse,
-+                                check, filter, match_decider, match_spy)
-+
-+    def estimate(self, query, limit=0, lang=None, partial=False,
-+                 parser_flags=default_parser_flags):
-+        """Estimate the number of documents that will be yielded with the
-+        given query.  
-+
-+        Limit tells the estimator the minimum number of documents to
-+        consider.  A zero limit means check all documents in the db."""
-+        self.xapian.reopen()
-+        enq = xapian.Enquire(self.xapian)
-+
-+        if limit == 0:
-+            limit = self.xapian.get_doccount()
-+
-+        if isinstance(query, basestring):
-+            if query == "":
-+                query = xapian.Query("")
-+            else:
-+                qp = self.prepare_query_parser(lang)
-+                query = qp.parse_query(query, parser_flags)
-+        else:
-+            # it's a list of oids, let's fetch them all in one query
-+            query = xapian.Query(xapian.Query.OP_OR, list(query))
-+
-+        enq.set_query(query)
-+        return enq.get_mset(0, 0, limit).get_matches_estimated()
-+
-+    def term_freq(self, term):
-+        """
-+        Return a count of the number of documents indexed for a given
-+        term.  Useful for testing.
-+        """
-+        self.xapian.reopen()
-+        return self.xapian.get_termfreq(term)
-+
-+    def get_doccount(self):
-+        """
-+        Return the number of indexed documents, handy for tests and
-+        sanity checks.
-+        """
-+        self.xapian.reopen()
-+        return self.xapian.get_doccount()
-+
-+    def describe_query(self, query, lang=None,
-+                       default_op=xapian.Query.OP_AND):
-+        """
-+        Describe the parsed query.
-+        """
-+        qp = self.prepare_query_parser(lang, default_op)
-+        q = qp.parse_query(query, default_parser_flags)
-+        return q.get_description()
-+
-+    def spell(self, query, lang=None):
-+        """
-+        Suggest a query string with corrected spelling.
-+        """
-+        self.xapian.reopen()
-+        qp = self.prepare_query_parser(lang)
-+        qp.parse_query(query, xapian.QueryParser.FLAG_SPELLING_CORRECTION)
-+        return qp.get_corrected_query_string().decode('utf8')
-+
-+    def suggest(self, query, offset=0, limit=None, lang=None):
-+        """
-+        Suggest terms that would possibly yield more relevant results
-+        for the given query.
-+        """
-+        self.xapian.reopen()
-+        enq = xapian.Enquire(self.xapian)
-+
-+        qp = self.prepare_query_parser(lang)
-+
-+        if limit is None:
-+            limit = self.xapian.get_doccount()
-+
-+        enq.set_query(qp.parse_query(query))
-+        mset = enq.get_mset(offset, limit)
-+        rset = xapian.RSet()
-+        for m in mset:
-+            rset.add_document(m[xapian.MSET_DID])
-+
-+        if lang is not None:
-+            eset = enq.get_eset(limit, rset, Decider(lang))
-+        else:
-+            eset = enq.get_eset(limit, rset)
-+
-+        for item in eset.items:
-+            yield (item[0].decode('utf8'), item[1])
-+
-+    def similar(self, session, query, offset=0, limit=None, order=None,
-+                reverse=False, lang=None, check=0,
-+                match_decider=None, match_spy=None, terms=None,
-+                default_op=xapian.Query.OP_AND):
-+        """ Find documents in the database most relevant to the given terms.
-+
-+        'query' - If *terms* is None, *query* is passed to self.suggest
-+        to get a list of terms.  If *terms* is set, that term list is
-+        used directly and *query* is parsed and ANDed with the elite
-+        term set query.
-+
-+        'limit' - The number of records to return.
-+
-+        'lang' - The language stemmer the parser should use.
-+
-+        'match_decider' - A MatchDecider to apply to the query.
-+
-+        'terms' - Optional term list from which to derive the
-+        elite set of terms to match on.
-+        """
-+        self.xapian.reopen()
-+
-+        if not terms:
-+            suggested_terms = self.suggest(
-+                query, limit=limit, lang=lang)
-+            terms = [term[0] for term in suggested_terms]
-+            query = None
-+        elif query:
-+            qp = self.prepare_query_parser(lang, default_op)
-+            query = qp.parse_query(query, default_parser_flags)
-+
-+        enq = xapian.Enquire(self.xapian)
-+
-+        if limit is None:
-+            limit = self.xapian.get_doccount()
-+
-+        similar_query = Query(Query.OP_ELITE_SET, terms, limit)
-+        if query:
-+            similar_query = Query(Query.OP_AND, [similar_query, query])
-+        enq.set_query(similar_query)
-+
-+        return self._get_result(session, enq, offset, limit, order, reverse,
-+                                check, None, match_decider, match_spy)
-+
-+
-+
-+    def _get_result(self, session, enq, offset, limit, order, reverse, check,
-+                    filter=None, match_decider=None, match_spy=None):
-+
-         if order is not None:
-             if isinstance(order, basestring):
-                 try:
-@@ -452,7 +602,8 @@
-         tries = 0
-         while True:
-             try:
--                mset = enq.get_mset(offset, limit, check, None, match_decider, match_spy)
-+                mset = enq.get_mset(
-+                    offset, limit, check, None, match_decider, match_spy)
-                 break
-             except xapian.DatabaseModifiedError:
-                 if tries > RETRY_LIMIT:
-@@ -460,60 +611,55 @@
-                 self.xapian.reopen()
-                 tries += 1
- 
-+        return self._return_objects(session, mset, filter)
-+
-+
-+    def _return_objects(self, session, mset, filter=None):
-         filter = filter or self.filter
-         for record in mset:
-             doc = record[xapian.MSET_DOCUMENT]
--            obj =  session.load(doc.get_value(0), state_only=self.state_only)
-+            obj = session.load(doc.get_value(0), state_only=self.state_only)
-             if filter:
-                 yield filter(obj)
-             else:
-                 yield obj
- 
--    def estimate(self, query, limit=0, lang=None, partial=False, 
--                 parser_flags=default_parser_flags):
--        """Estimate the number of documents that will be yielded with the
--        given query.  
-+    def term_counter(self, prefixes):
-+        """Construct a term count match spy with this instance's prefix dict."""
-+        prefix_map = self.relevance_prefixes.copy()
-+        prefix_map.update(self.boolean_prefixes)
-+        return TermCountMatchSpy(prefixes, prefix_map)
- 
--        Limit tells the estimator the minimum number of documents to
--        consider.  A zero limit means check all documents in the db."""
--        self.xapian.reopen()
--        enq = xapian.Enquire(self.xapian)
- 
--        if limit == 0:
--            limit = self.xapian.get_doccount()
-+class Decider(xapian.ExpandDecider):
-+    """
-+    A Xapian ExpandDecider that decides which terms to keep and which
-+    to discard when expanding a query via suggest().  As
-+    a place to start, we throw out:
- 
--        if isinstance(query, basestring):
--            if query == "":
--                query = xapian.Query("")
--            else:
--                qp = xapian.QueryParser()
--                qp.set_database(self.xapian)
--                if lang is not None:
--                    qp.set_stemmer(xapian.Stem(lang))
--                else:
--                    qp.set_stemmer(xapian.Stem("none"))
--                if self.relevance_prefixes:
--                    for key, value in self.relevance_prefixes.items():
--                        qp.add_prefix(key, value)
--                if self.boolean_prefixes:
--                    for key, value in self.boolean_prefixes.items():
--                        qp.add_boolean_prefix(key, value)
--                query = qp.parse_query(query, parser_flags)
-+      - Terms that begin with an uppercase letter or digit.  This
-+        filters out prefixed terms and stemmed forms.
-+
-+      - Terms shorter than min_length chars, which are likely irrelevant.
-+
-+      - Stopwords for the given language.  The default is English; pass
-+        None for the lang argument if no stopping is desired.
-+    """
-+
-+    min_length = 5
-+    nostart = unicode(string.uppercase+string.digits)
-+
-+    def __init__(self, lang="en"):
-+        super(Decider, self).__init__()
-+        if lang in snowball.stoppers:
-+            self.stopper = snowball.stoppers[lang]
-         else:
--            # it's a list of oids, let's fetch them all in one query
--            query = xapian.Query(xapian.Query.OP_OR, [oid for oid in query])
-+            self.stopper = lambda term: False
- 
--
--        enq.set_query(query)
--        return enq.get_mset(0, 0, limit).get_matches_estimated()
--
--    def term_freq(self, term):
--        """
--        Return a count of the number of documents indexed for a given
--        term.  Useful for testing.
--        """
--        self.xapian.reopen()
--        return self.xapian.get_termfreq(term)
-+    def __call__(self, term):
-+        if term[0] in self.nostart or len(term) < self.min_length or self.stopper(term):
-+            return False
-+        return True
- 
- 
- class TermCountMatchSpy(MatchDecider):
-@@ -538,6 +684,8 @@
- 
-         for prefix_name in self.terms:
-             terms = self.terms[prefix_name]
-+            if prefix_name not in self.prefix_dictionary:
-+                continue
-             prefix_code = self.prefix_dictionary[prefix_name]
-             prefix_len = len(prefix_code)
-             for term in doc:
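
The other helpers added above are lighter-weight. A hedged usage sketch under the same assumption of a configured `backend`; the sample query strings are invented:

    # spell() returns a spelling-corrected version of the query string.
    corrected = backend.spell("solr povver", lang="en")
    # estimate() returns an estimated match count without loading objects.
    count = backend.estimate(corrected)
    # suggest() yields (term, weight) expansion candidates that survive
    # the Decider's prefix/length/stopword checks.
    for term, weight in backend.suggest(corrected, limit=20, lang="en"):
        print term, weight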