Commits

Anonymous committed a42e399 Merge

merged main repo

  • Parent commits 323267d, 61c6af3

Files changed (13)

File src/whoosh/__init__.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-__version__ = (1, 8, 0)
+__version__ = (1, 8, 2)
 
 
 def versionstring(build=True, extra=True):

File src/whoosh/filedb/filewriting.py

 
 class SegmentWriter(IndexWriter):
     def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
-                 timeout=0.0, delay=0.1, name=None, lock=True, **poolargs):
+                 timeout=0.0, delay=0.1, name=None, _lk=True, **poolargs):
         
         self.writelock = None
-        if lock:
+        if _lk:
             self.writelock = ix.lock("WRITELOCK")
             if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
                 raise LockError

File src/whoosh/filedb/gae.py

 """
 
 from cStringIO import StringIO
-from threading import Lock
 
 from google.appengine.api import memcache
 from google.appengine.ext import db
 
-from whoosh.store import Storage, LockError
+from whoosh.store import Storage
 from whoosh.filedb.fileindex import _create_index, FileIndex, _DEF_INDEX_NAME
 from whoosh.filedb.filestore import ReadOnlyError
 from whoosh.filedb.structfile import StructFile
         return self.data.getvalue()
 
 
+class MemcacheLock(object):
+    def __init__(self, name):
+        self.name = name
+        
+    def acquire(self, blocking=False):
+        val = memcache.add(self.name, "L", 360, namespace="whooshlocks")
+        
+        if blocking and not val:
+            # Simulate blocking by retrying the acquire over and over
+            import time
+            while not val:
+                time.sleep(0.1)
+                val = memcache.add(self.name, "L", 360, namespace="whooshlocks")
+        
+        return val
+    
+    def release(self):
+        memcache.delete(self.name, namespace="whooshlocks")
+
+
 class DatastoreStorage(Storage):
     """An implementation of :class:`whoosh.store.Storage` that stores files in
     the app engine datastore as blob properties.
     """
 
-    def __init__(self):
-        self.locks = {}
-
     def create_index(self, schema, indexname=_DEF_INDEX_NAME):
         if self.readonly:
             raise ReadOnlyError
         return len(DatastoreFile.get_by_key_name(name).value)
 
     def delete_file(self, name):
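+        # Also evict any copy of the file's data cached in memcache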
+        memcache.delete(name, namespace="DatastoreFile")
         return DatastoreFile.get_by_key_name(name).delete()
 
     def rename_file(self, name, newname, safe=False):
         return StructFile(DatastoreFile.loadfile(name))
 
     def lock(self, name):
-        if name not in self.locks:
-            self.locks[name] = Lock()
-        if not self.locks[name].acquire(False):
-            raise LockError("Could not lock %r" % name)
-        return self.locks[name]
+        return MemcacheLock(name)
 
-    def unlock(self, name):
-        if name in self.locks:
-            self.locks[name].release()
-        else:
-            raise LockError("No lock named %r" % name)
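
The new MemcacheLock replaces the per-process threading.Lock objects that DatastoreStorage used to keep in a dict: memcache.add is atomic and only succeeds when the key is absent, which makes it usable as a mutex visible across App Engine instances, with the 360-second expiry acting as a safety valve if a holder dies without releasing. A minimal usage sketch, assuming the class above::

    lock = MemcacheLock("WRITELOCK")
    if lock.acquire():  # non-blocking by default; returns False if already held
        try:
            pass  # ... write to the index ...
        finally:
            lock.release()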
 
 

File src/whoosh/filedb/multiproc.py

     def run(self):
         jobqueue = self.jobqueue
         ix = self.storage.open_index(self.indexname)
-        writer = self.writer = SegmentWriter(ix, lock=False, name=self.segname,
+        writer = self.writer = SegmentWriter(ix, _lk=False, name=self.segname,
                                              **self.kwargs)
         
         if self.firstjob:

File src/whoosh/matching.py

         self._i += 1
         while self._i < len(self._ids) and self.quality() <= minquality:
             self._i += 1
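+        # skip_to_quality() returns the number of posting blocks skipped;
+        # a ListMatcher has no blocks, so it always reports 0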
+        return 0
     
     def id(self):
         return self._ids[self._i]
         
         # Short circuit if one matcher is inactive
         if not a.is_active():
-            sk = b.skip_to_quality(minquality)
-            return sk
+            return b.skip_to_quality(minquality)
         elif not b.is_active():
             return a.skip_to_quality(minquality)
         

File src/whoosh/qparser/plugins.py

     def do_multifield(self, parser, stream):
         newstream = stream.empty()
         for t in stream:
-            if isinstance(t, BasicSyntax) and t.fieldname is None:
+            if isinstance(t, Group):
+                t = self.do_multifield(parser, t)
+            elif isinstance(t, BasicSyntax) and t.fieldname is None:
                 t = OrGroup([t.set_fieldname(fn).set_boost(self.boosts.get(fn, 1.0))
                              for fn in self.fieldnames])
             newstream.append(t)
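
The new Group branch makes do_multifield recurse, so unfielded terms nested inside parenthesized groups are now expanded across the parser's field list too, instead of only top-level terms. A rough sketch of the effect, using a hypothetical two-field schema::

    from whoosh import fields, qparser

    schema = fields.Schema(title=fields.TEXT, body=fields.TEXT)
    qp = qparser.MultifieldParser(["title", "body"], schema)
    q = qp.parse(u"hello (nested world)")
    # "hello", "nested" and "world" should each expand to
    # (title:<term> OR body:<term>); see test_multifield in
    # tests/test_parsing.py below for the exact expected output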

File src/whoosh/query.py

 from whoosh.reading import TermNotFound
 from whoosh.support.levenshtein import relative
 from whoosh.support.times import datetime_to_long
-from whoosh.util import make_binary_tree, methodcaller
+from whoosh.util import make_binary_tree, make_weighted_tree, methodcaller
 
 
 # Exceptions
                              phrases=phrases)
         return termset
 
+    def requires(self):
+        """Returns a set of queries that are *known* to be required to match
+        for the entire query to match. Note that other queries might also turn
+        out to be required but not be determinable by examining the static
+        query.
+        
+        >>> a = Term("f", u"a")
+        >>> b = Term("f", u"b")
+        >>> And([a, b]).requires()
+        set([Term("f", u"a"), Term("f", u"b")])
+        >>> Or([a, b]).requires()
+        set([])
+        >>> AndMaybe(a, b).requires()
+        set([Term("f", u"a")])
+        >>> a.requires()
+        set([Term("f", u"a")])
+        """
+        
+        # Subclasses should implement the _add_required_to(qset) method
+        
+        return set([self])
+    
     def field(self):
         """Returns the field this query matches in, or None if this query does
         not match in a single field.
         return self.child.existing_terms(ixreader, termset=termset,
                                          reverse=reverse, phrases=phrases)
     
+    def requires(self):
+        return self.child.requires()
+    
     def field(self):
         return self.child.field()
     
         else:
             return NullQuery
 
-    def _matcher(self, matchercls, searcher, **kwargs):
+    def _matcher(self, matchercls, q_weight_fn, searcher, **kwargs):
+        # q_weight_fn is called on each subquery and returns a "weight" value
+        # used to build a Huffman-like matcher tree. If q_weight_fn is None,
+        # an order-preserving binary tree is used instead.
+        
         # Pull any queries inside a Not() out into their own list
         subs, nots = self._split_queries()
         
         if not subs:
             return NullMatcher()
         
-        # Sort the list of subqueries by their estimated size
-        r = searcher.reader()
-        subs.sort(key=lambda q: q.estimate_size(r))
-        
         # Create a matcher from the list of subqueries
-        subms = [subq.matcher(searcher) for subq in subs]
-        if len(subms) == 1:
-            m = subms[0]
+        if len(subs) == 1:
+            m = subs[0].matcher(searcher)
+        elif q_weight_fn is None:
+            subms = [q.matcher(searcher) for q in subs]
+            m = make_binary_tree(matchercls, subms)
         else:
-            m = make_binary_tree(matchercls, subms)
+            subms = [(q_weight_fn(q), q.matcher(searcher)) for q in subs]
+            m = make_weighted_tree(matchercls, subms)
         
         # If there were queries inside Not(), make a matcher for them and
         # wrap the matchers in an AndNotMatcher
         if nots:
-            notms = [subq.matcher(searcher) for subq in nots]
-            if len(notms) == 1:
-                notm = notms[0]
+            if len(nots) == 1:
+                notm = nots[0].matcher(searcher)
             else:
-                notm = make_binary_tree(UnionMatcher, notms)
-                
+                notms = [(q.estimate_size(), q.matcher(searcher))
+                         for q in nots]
+                notm = make_weighted_tree(UnionMatcher, notms)
+            
             if notm.is_active():
                 m = AndNotMatcher(m, notm)
         
         # If this query had a boost, add a wrapping matcher to apply the boost
         if self.boost != 1.0:
             m = WrappingMatcher(m, self.boost)
-            
+        
         return m
 
 
     JOINT = " AND "
     intersect_merge = True
 
+    def requires(self):
+        s = set()
+        for q in self.subqueries:
+            s |= q.requires()
+        return s
+        
     def estimate_size(self, ixreader):
         return min(q.estimate_size(ixreader) for q in self.subqueries)
 
     def matcher(self, searcher):
-        return self._matcher(IntersectionMatcher, searcher)
+        r = searcher.reader()
+        return self._matcher(IntersectionMatcher,
+                             lambda q: 0 - q.estimate_size(r), searcher)
 
 
 class Or(CompoundQuery):
             norm.minmatch = self.minmatch
         return norm
 
+    def requires(self):
+        if len(self.subqueries) == 1:
+            return self.subqueries[0].requires()
+        else:
+            return set()
+
     def matcher(self, searcher):
-        return self._matcher(UnionMatcher, searcher)
+        r = searcher.reader()
+        return self._matcher(UnionMatcher, lambda q: q.estimate_size(r),
+                             searcher)
 
 
 class DisjunctionMax(CompoundQuery):
             norm.tiebreak = self.tiebreak
         return norm
     
+    def requires(self):
+        if len(self.subqueries) == 1:
+            return self.subqueries[0].requires()
+        else:
+            return set()
+    
     def matcher(self, searcher):
-        return self._matcher(DisjunctionMaxMatcher, searcher,
+        r = searcher.reader()
+        return self._matcher(DisjunctionMaxMatcher,
+                             lambda q: q.estimate_size(r), searcher,
                              tiebreak=self.tiebreak)
 
 
     
     def matcher(self, searcher):
         from spans import SpanBefore
-        return self._matcher(SpanBefore._Matcher, searcher)
+        
+        return self._matcher(SpanBefore._Matcher, None, searcher)
 
 
 class Every(Query):
     JOINT = " REQUIRE "
     matcherclass = RequireMatcher
 
+    def requires(self):
+        return self.a.requires() | self.b.requires()
+
     def estimate_size(self, ixreader):
         return self.b.estimate_size(ixreader)
     
             return a
         return self.__class__(a, b, boost=self.boost)
 
+    def requires(self):
+        return self.a.requires()
+
     def estimate_min_size(self, ixreader):
         return self.subqueries[0].estimate_min_size(ixreader)
 
     def _existing_terms(self, ixreader, termset, reverse=False, phrases=True):
         self.a.existing_terms(ixreader, termset, reverse=reverse,
                               phrases=phrases)
+        
+    def requires(self):
+        return self.a.requires()
 
 
 class Otherwise(BinaryQuery):
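
A note on the weight functions passed to _matcher above: make_weighted_tree (added in src/whoosh/util.py below) always merges the two lowest-weight entries first, so low weights end up deepest in the tree. And weights each subquery by its negated estimated size, which leaves the smallest (most selective) matcher nearest the root of the intersection tree, while Or and DisjunctionMax weight by estimated size directly. A toy illustration with made-up sizes, using a plain tuple as the combining function::

    from whoosh.util import make_weighted_tree

    sizes = {"rare": 10, "mid": 500, "common": 10000}
    # And-style weights: negate the estimated sizes
    tree = make_weighted_tree(lambda a, b: (a, b),
                              [(0 - n, name) for name, n in sizes.items()])
    # -> (("common", "mid"), "rare"): the rarest (smallest) matcher
    # ends up as a direct child of the root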

File src/whoosh/searching.py

         return self.search(q, limit=top, filter=comb)
 
     def search_page(self, query, pagenum, pagelen=10, **kwargs):
+        """This method is Like the :meth:`Searcher.search` method, but returns
+        a :class:`ResultsPage` object. This is a convenience function for
+        getting a certain "page" of the results for the given query, which is
+        often useful in web search interfaces.
+        
+        For example::
+        
+            querystring = request.get("q")
+            query = queryparser.parse("content", querystring)
+            
+            pagenum = int(request.get("page", 1))
+            pagelen = int(request.get("perpage", 10))
+            
+            results = searcher.search_page(query, pagenum, pagelen=pagelen)
+            print "Page %d of %d" % (results.pagenum, results.pagecount)
+            print ("Showing results %d-%d of %d" 
+                   % (results.offset + 1, results.offset + results.pagelen,
+                      len(results)))
+            for hit in results:
+                print "%d: %s" % (hit.rank + 1, hit["title"])
+        
+        (Note that results.pagelen might be less than the pagelen argument if
+        there aren't enough results to fill a page.)
+        
+        Any additional keyword arguments you supply are passed through to
+        :meth:`Searcher.search`. For example, you can get paged results of a
+        sorted search::
+        
+            results = searcher.search_page(q, 2, sortedby="date", reverse=True)
+        
+        Currently, searching for page 100 with pagelen of 10 takes the same
+        amount of time as using :meth:`Searcher.search` to find the first 1000
+        results. That is, this method does not have any special optimizations
+        or efficiencies for getting a page from the middle of the full results
+        list. (A future enhancement may allow using previous page results to
+        improve the efficiency of finding the next page.)
+        
+        This method will raise a ``ValueError`` if you ask for a page number
+        higher than the number of pages in the results.
+        
+        :param query: the :class:`whoosh.query.Query` object to match.
+        :param pagenum: the page number to retrieve, starting at ``1`` for the
+            first page.
+        :param pagelen: the number of results per page.
+        :returns: :class:`ResultsPage`
+        """
+        
         if pagenum < 1:
             raise ValueError("pagenum must be >= 1")
-        results = self.search(query, limit=pagenum * pagelen, **kwargs)
+        
+        # Turn off optimization to prevent items from jumping around in the
+        # order as the limit is increased (optimized results are not stable)
+        if "optimize" in kwargs:
+            del kwargs["optimize"]
+        results = self.search(query, limit=pagenum * pagelen, optimize=False,
+                              **kwargs)
         return ResultsPage(results, pagenum, pagelen)
 
     def find(self, defaultfield, querystring, **kwargs):
         """Runs the query represented by the ``query`` object and returns a
         Results object.
         
+        When using optimizations, the order of results with very similar scores
+        can change as the limit changes. To ensure a stable ordering of
+        documents no matter what the value of ``limit``, use
+        ``optimize=False``.
+        
         :param query: a :class:`whoosh.query.Query` object.
         :param limit: the maximum number of documents to score. If you're only
             interested in the top N documents, you can set limit=N to limit the
         :param id: the document number of the document.
         """
         
-        # This method is only called by add_all_matches
-        self._items.append((score, id))
+        # This method is only called by add_all_matches. Note: the document
+        # number is negated to match the output of add_top_matches
+        self._items.append((score, 0 - id))
         self.docset.add(id)
     
     def should_add_all(self):
         timelimited = bool(self.timelimit)
         greedy = self.greedy
         
+        # Note that document numbers are negated before putting them in the
+        # heap so that higher document numbers have lower "priority" in the
+        # queue. Lower document numbers should always come before higher
+        # document numbers with the same score to keep the order stable.
         for id, quality in self.pull_matches(matcher, usequality):
             if timelimited and not greedy and self.timesup:
                 raise TimeLimit
             
             if len(items) < limit:
                 # The heap isn't full, so just add this document
-                heappush(items, (score(searcher, matcher), offsetid, quality))
+                heappush(items, (score(searcher, matcher), 0 - offsetid, quality))
             
-            elif quality > self.minquality:
+            elif not usequality or quality > self.minquality:
                 # The heap is full, but the posting quality indicates
                 # this document is good enough to make the top N, so
                 # calculate its true score and add it to the heap
                 
                 s = score(searcher, matcher)
                 if s > items[0][0]:
-                    heapreplace(items, (s, offsetid, quality))
+                    heapreplace(items, (s, 0 - offsetid, quality))
                     self.minquality = items[0][2]
                     
             if timelimited and self.timesup:
         """Returns the collected hits as a list of (score, docid) pairs.
         """
         
-        # Turn the heap into a sorted list by sorting by score first (subtract
-        # from 0 to put highest scores first) and then by document number (to
-        # enforce a consistent ordering of documents with equal score)
-        items = self._items
+        # Docnums were negated before being pushed onto the heap (so that
+        # lower docnums win ties between equal scores); re-negate them
+        # before returning
+        items = [(x[0], 0 - x[1]) for x in self._items]
+        
+        # Sort by negated scores so that higher scores go first, then by
+        # document number to keep the order stable when documents have the same
+        # score
         if self.scored or self.reverse:
-            items = sorted(self._items, key=lambda x: (0 - x[0], x[1]),
-                           reverse=self.reverse)
+            items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse)
+        
         return items
     
     def results(self, runtime=None):
         :param normalize: whether to normalize term weights.
         """
         
-        return self.searcher.more_like(self.docnum, text=text, top=top,
-                                       numterms=numterms, model=model,
+        return self.searcher.more_like(self.docnum, fieldname, text=text,
+                                       top=top, numterms=numterms, model=model,
                                        normalize=normalize)
     
     def __repr__(self):
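
The docnum negation threaded through this file is the heart of the stability fix: heapq is a min-heap, so among entries with equal scores the smallest tuple is evicted first, and with raw docnums that would evict the *lowest* document number. Negating the docnum reverses the tie-break, so the highest docnum is evicted first and lower docnums survive. A standalone sketch of the behavior::

    import heapq

    items = []
    for docnum in (7, 2, 5):
        heapq.heappush(items, (1.0, 0 - docnum))  # identical scores
    # The heap minimum is (1.0, -7), so the *highest* docnum is the first
    # to be evicted when the heap is full; ties therefore resolve in
    # favor of lower document numbers
    print heapq.heappop(items)  # (1.0, -7)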

File src/whoosh/util.py

 import sys
 import time
 from array import array
+from bisect import insort
 from copy import copy
 from functools import wraps
 from math import log
 
 def make_binary_tree(fn, args, **kwargs):
     """Takes a function/class that takes two positional arguments and a list of
-    arguments and returns a binary tree of instances.
+    arguments and returns a binary tree of results/instances.
     
     >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
     UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3))
               make_binary_tree(fn, args[half:], **kwargs), **kwargs)
 
 
+def make_weighted_tree(fn, ls, **kwargs):
+    """Takes a function/class that takes two positional arguments and a list of
+    (weight, argument) tuples and returns a Huffman-like weighted tree of
+    results/instances.
+    """
+    
+    if not ls:
+        raise ValueError("Called make_weighted_tree with empty list")
+    
+    ls.sort()
+    while len(ls) > 1:
+        a = ls.pop(0)
+        b = ls.pop(0)
+        insort(ls, (a[0] + b[0], fn(a[1], b[1])))
+    return ls[0][1]
+
+
 # Varint cache
 
 # Build a cache of the varint byte sequences for the first N integers, so we
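
make_binary_tree above documents its output shape with a doctest; a similar sketch for the new function may help. The two lowest-weight entries are merged first and reinserted with their combined weight, Huffman-style, so heavier entries stay shallower::

    >>> def fn(a, b):
    ...     return "(%s+%s)" % (a, b)
    >>> make_weighted_tree(fn, [(3, "c"), (1, "a"), (2, "b")])
    '((a+b)+c)'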

File src/whoosh/writing.py

         writer = self.writer
         while writer is None:
             try:
-                writer = self.writerfn(**self.writerargs)
+                writer = self.index.writer(**self.writerargs)
             except LockError:
                 time.sleep(self.delay)
         for method, args, kwargs in self.events:

File tests/test_parsing.py

 from nose.tools import assert_equal
 
 from whoosh import analysis, fields, qparser, query
+from whoosh.qparser import dateparse
 
 
 def test_empty_querystring():
     assert_equal(q.__class__, query.Term)
     assert_equal(q.fieldname, "title")
     assert_equal(q.text, "test")
+
+def test_multifield():
+    schema = fields.Schema(content=fields.TEXT, title=fields.TEXT,
+                           cat=fields.KEYWORD, date=fields.DATETIME)
+    
+    qs = u"time (cinema muza cat:place) OR (cinema muza cat:event)"
+    qp = qparser.MultifieldParser(['content', 'title'], schema)
+    q = qp.parse(qs)
+    assert_equal(unicode(q), "((content:time OR title:time) AND (((content:cinema OR title:cinema) AND (content:muza OR title:muza) AND cat:place) OR ((content:cinema OR title:cinema) AND (content:muza OR title:muza) AND cat:event)))")
     
 def test_fieldname_chars():
     s = fields.Schema(abc123=fields.TEXT, nisbah=fields.KEYWORD)
     assert_equal(q[0].text, "*john*")
     assert_equal(q[1].text, "blog")
 

File tests/test_queries.py

     # do(SpanBefore)
     # do(SpanCondition)
 
+def test_requires():
+    a = Term("f", u"a")
+    b = Term("f", u"b")
+    assert_equal(And([a, b]).requires(), set([a, b]))
+    assert_equal(Or([a, b]).requires(), set())
+    assert_equal(AndMaybe(a, b).requires(), set([a]))
+    assert_equal(a.requires(), set([a]))
 

File tests/test_results.py

         r = s.search(q, limit=3)
         assert_equal(len(r), count)
 
-
+def test_stability():
+    schema = fields.Schema(text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    domain = u"alfa bravo charlie delta".split()
+    w = ix.writer()
+    for ls in permutations(domain, 3):
+        w.add_document(text=u" ".join(ls))
+    w.commit()
+    
+    with ix.searcher() as s:
+        q = query.Term("text", u"bravo")
+        last = []
+        for i in xrange(s.doc_frequency("text", u"bravo")):
+            # Only un-optimized results are stable
+            r = s.search(q, limit=i + 1, optimize=False)
+            docnums = [hit.docnum for hit in r]
+            assert_equal(docnums[:-1], last)
+            last = docnums