Matt Chaput committed dda6774 Merge

Merging changes from default branch.

  • Parent commits 37cff9a, c45a5a6
  • Branches dawg


Files changed (8)

File docs/source/index.rst

+    threads

File docs/source/threads.rst

+====================================
+Concurrency, locking, and versioning
+====================================
+
+The following refers to the default ``filedb`` backend.
+
+Concurrency
+===========
+
+The ``FileIndex`` object is "stateless" and should be share-able between
+threads.
+
+A Reader object (which underlies the Searcher object) wraps open files, and
+individual methods often rely on consistent file cursor positions (e.g. they
+do two reads in a row, so if another thread moves the cursor between the two
+read calls Bad Things would happen). It's best to use one Reader/Searcher per
+thread in your code.
+
+(I've attempted to add synchronization locks around the appropriate bits of the
+code, but I haven't tested Whoosh in a shared multithreaded environment, so you
+should avoid sharing Readers/Searchers.)
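+
+For example, each thread can open its own short-lived searcher (a sketch;
+the index directory, field name, and query text are placeholders)::
+
+    import threading
+    from whoosh import index
+    from whoosh.query import Term
+
+    ix ="indexdir")
+
+    def search_thread():
+        # Only the FileIndex object 'ix' is shared; each thread opens
+        # (and, via the with statement, closes) its own searcher
+        with ix.searcher() as s:
+            results ="content", u"whoosh"))
+
+    threads = [threading.Thread(target=search_thread) for _ in xrange(4)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()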
+Readers/Searchers tend to cache information (such as field caches for sorting),
+so if you can share one across multiple search requests, it's a big performance
+win.
+
+Locking
+=======
+
+Only one thread/process can write to an index at a time. When you open a writer,
+it locks the index. If you try to open a writer on the same index in another
+thread/process, it will raise ``whoosh.store.LockError``.
+
+In a multi-threaded or multi-process environment your code needs to be aware
+that opening a writer may raise this exception if a writer is already open.
+Whoosh includes a couple of example implementations
+(:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`)
+of ways to work around the write lock.
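+
+For example, a sketch of falling back to ``AsyncWriter`` when the index is
+already locked (the field name and value are placeholders)::
+
+    from whoosh.store import LockError
+    from whoosh.writing import AsyncWriter
+
+    try:
+        writer = ix.writer()
+    except LockError:
+        # Another thread/process holds the write lock; AsyncWriter
+        # acquires it in a background thread instead of raising
+        writer = AsyncWriter(ix)
+    writer.add_document(title=u"Document title")
+    writer.commit()
+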
+While the writer is open and during the commit, **the index is still available
+for reading**. Existing readers are unaffected and new readers can open the
+current index normally.
+
+Lock files
+----------
+
+Locking the index is accomplished by acquiring an exclusive file lock on the
+``<indexname>_WRITELOCK`` file in the index directory. The file is not deleted
+after the file lock is released, so the fact that the file exists **does not**
+mean the index is locked.
+
+A second lock file called ``<indexname>_READLOCK`` exists which is used to keep
+a writer from deleting segments at the exact moment a new reader is trying to
+open them.
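+
+If you need to know whether the index is actually write-locked, try acquiring
+the lock instead of checking for the file. A sketch, using the same ``lock()``
+method the backend itself uses::
+
+    lock = ix.lock("WRITELOCK")
+    if lock.acquire(False):
+        # Nobody held the write lock; release it again immediately
+        lock.release()
+    else:
+        # Another thread/process is currently writing to the index
+        pass
+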
+Versioning
+==========
+
+When you open a reader/searcher, the reader represents a view of the **current
+version** of the index. If someone writes changes to the index, any readers
+that are already open **will not** pick up the changes automatically. A reader
+always sees the index as it existed when the reader was opened.
+
+If you are re-using a Searcher across multiple search requests, you can check
+whether the Searcher is a view of the latest version of the index using
+:meth:`whoosh.searching.Searcher.up_to_date`. If the searcher is not up to date,
+you can get an up-to-date copy of the searcher using
+:meth:`whoosh.searching.Searcher.refresh`::
+
+    # If 'searcher' is not up-to-date, replace it
+    searcher = searcher.refresh()
+
+(If the searcher has the latest version of the index, ``refresh()`` simply
+returns it.)
+
+Calling ``Searcher.refresh()`` is more efficient than closing the searcher and
+opening a new one, since it will re-use any underlying readers and caches that
+haven't changed.
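+
+For example, a long-lived searcher can be refreshed between requests (a
+sketch; ``get_next_query()`` and ``handle_results()`` are placeholders)::
+
+    searcher = ix.searcher()
+    while True:
+        q = get_next_query()
+        if not searcher.up_to_date():
+            # Re-uses the underlying readers and caches that haven't changed
+            searcher = searcher.refresh()
+        handle_results(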

File src/whoosh/filedb/

         # Lock the index so nobody can delete a segment while we're in the
         # middle of creating the reader
         lock = self.lock("READLOCK")
-        lock.acquire(True)
+        # Try to acquire the "reader" lock, which prevents a writer from
+        # deleting segments out from under us. If another reader already has
+        # the lock, just pray.
+        #
+        # TODO: replace this with a re-entrant file lock, if possible.
+        gotit = lock.acquire(False)
        try:
            # Read the information from the TOC file
            info = self._read_toc()
            return self._reader(, info.schema, info.segments,
                                info.generation, reuse=reuse)
        finally:
-            lock.release()
+            if gotit:
+                lock.release()
 class Segment(object):

File src/whoosh/

         return expander.expanded_terms(numterms, normalize=normalize)
     def more_like(self, docnum, fieldname, text=None, top=10, numterms=5,
-                  model=classify.Bo1Model, normalize=False):
+                  model=classify.Bo1Model, normalize=False, filter=None):
         """Returns a :class:`Results` object containing documents similar to
         the given document, based on "key terms" in the given field::
         :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use
             to compute "key terms".
         :param normalize: whether to normalize term weights.
+        :param filter: a query, Results object, or set of docnums. The results
+            will only contain documents that are also in the filter object.
         if text:
         q = query.Or([query.Term(fieldname, word, boost=weight)
                       for word, weight in kts])
-        # Filter the original document out of the results using a bit vector
-        # with every bit set except the one for this document
-        size = self.doc_count_all()
-        comb = BitVector(size, [n for n in xrange(self.doc_count_all())
-                                if n != docnum])
-        return, limit=top, filter=comb)
+        return, limit=top, filter=filter, mask=set([docnum]))
     def search_page(self, query, pagenum, pagelen=10, **kwargs):
         """This method is Like the :meth:`` method, but returns
         return groups
     def search(self, q, limit=10, sortedby=None, reverse=False, groupedby=None,
-               optimize=True, scored=True, filter=None, collector=None):
+               optimize=True, scored=True, filter=None, mask=None,
+               collector=None):
         """Runs the query represented by the ``query`` object and returns a
         Results object.
             collect the found documents.
         :param filter: a query, Results object, or set of docnums. The results
             will only contain documents that are also in the filter object.
+        :param mask: a query, Results object, or set of docnums. The
+            results will not contain documents that are also in the mask
+            object.
         :rtype: :class:`Results`
             collector.scored = scored
             collector.reverse = reverse
-        return, q, filter=filter)
+        return, q, allow=filter, restrict=mask)
 class Collector(object):
         self.timesup = False
         self.timer = None
-    def search(self, searcher, q, filter=None):
+    def search(self, searcher, q, allow=None, restrict=None):
         """Top-level method call which uses the given :class:`Searcher` and
         :class:`whoosh.query.Query` objects to return a :class:`Results`
         if self.limit and self.limit > searcher.doc_count_all():
             self.limit = None
-        self._comb = None
-        if filter:
-            self.add_filter(filter)
+        self._allow = None
+        self._restrict = None
+        if allow:
+            self._allow = self._searcher._filter_to_comb(allow)
+        if restrict:
+            self._restrict = self._searcher._filter_to_comb(restrict)
         if self.timelimit:
             self.timer = threading.Timer(self.timelimit, self._timestop)
         # flag inside the add_(all|top)_matches loops.
         self.timesup = True
-    def add_filter(self, obj):
-        c = self._searcher._filter_to_comb(obj)
-        if self._comb is None:
-            self._comb = set()
-        self._comb |= c
     def add_searcher(self, searcher, q):
         """Adds the documents from the given searcher with the given query to
         the collector. This is called by the :meth:`` method.
         # This method is only called by add_all_matches. Note: the document
         # number is negated to match the output of add_top_matches
         self._items.append((score, 0 - id))
-        self.docset.add(id)
     def should_add_all(self):
         """Returns True if this collector needs to add all found documents (for
         items = self._items
         usequality = self.usequality
         score = self.score
-        comb = self._comb
+        allow = self._allow
+        restrict = self._restrict
         timelimited = bool(self.timelimit)
         greedy = self.greedy
         # heap so that higher document numbers have lower "priority" in the
         # queue. Lower document numbers should always come before higher
         # document numbers with the same score to keep the order stable.
-        for id, quality in self.pull_matches(matcher, usequality):
+        for offsetid, quality in self.pull_matches(matcher, usequality, offset):
             if timelimited and not greedy and self.timesup:
                 raise TimeLimit
-            offsetid = id + offset
-            if comb and offsetid not in comb:
+            if allow and offsetid not in allow:
+                continue
+            if restrict and offsetid in restrict:
+                continue
             if len(items) < limit:
         items = self._items
         scored = self.scored
         score = self.score
-        comb = self._comb
+        allow = self._allow
+        restrict = self._restrict
         timelimited = bool(self.timelimit)
         greedy = self.greedy
         reverse = self.reverse
             for name in self.groupedby:
                 keyfns[name] = searcher.reader().key_fn(name)
-        for id, _ in self.pull_matches(matcher, False):
+        for offsetid, _ in self.pull_matches(matcher, False, offset):
             if timelimited and not greedy and self.timesup:
                 raise TimeLimit
-            offsetid = id + offset
-            if comb and offsetid not in comb:
+            if allow and offsetid not in allow:
+                continue
+            if restrict and offsetid in restrict:
+                continue
             if keyfns:
                 for name, keyfn in keyfns.iteritems():
                     if name not in self.groups:
                         self.groups[name] = defaultdict(list)
-                    key = keyfn(id)
-                    self.groups[name][key].append(id)
+                    key = keyfn(offsetid - offset)
+                    self.groups[name][key].append(offsetid)
             scr = 0
             if scored:
             if timelimited and self.timesup:
                 raise TimeLimit
-    def pull_matches(self, matcher, usequality):
+    def pull_matches(self, matcher, usequality, offset):
         """Low-level method yields (docid, quality) pairs from the given
         matcher. Called by :meth:`Collector.add_top_matches` and
         :meth:`Collector.add_all_matches`. If ``usequality`` is False or the
             # The current document ID 
             id =
+            offsetid = id + offset
             if not usequality:
-                docset.add(id)
+                docset.add(offsetid)
             # If we're using quality optimizations, check whether the current
             # posting has higher quality than the minimum before yielding it.
             if usequality:
                 postingquality = matcher.quality()
                 if postingquality > self.minquality:
-                    yield (id, postingquality)
+                    yield (offsetid, postingquality)
            else:
-                yield (id, None)
+                yield (offsetid, None)
             # Move to the next document. This method returns True if the
             # matcher has entered a new block, so we should check block quality
                                        formatter=formatter, order=order)
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
-                       model=classify.Bo1Model, normalize=True):
+                       model=classify.Bo1Model, normalize=True, filter=None):
         """Returns a new Results object containing documents similar to this
         hit, based on "key terms" in the given field::
         return self.searcher.more_like(self.docnum, fieldname, text=text,
                                        top=top, numterms=numterms, model=model,
-                                       normalize=normalize)
+                                       normalize=normalize, filter=filter)
     def __repr__(self):
         return "<%s %r>" % (self.__class__.__name__, self.fields())

File tests/

     schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
     _check(schema, text=docs[0])
+def test_more_like():
+    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=u"1", text=u"alfa bravo charlie")
+    w.add_document(id=u"2", text=u"bravo charlie delta")
+    w.add_document(id=u"3", text=u"echo")
+    w.add_document(id=u"4", text=u"delta echo foxtrot")
+    w.add_document(id=u"5", text=u"echo echo echo")
+    w.add_document(id=u"6", text=u"foxtrot golf hotel")
+    w.add_document(id=u"7", text=u"golf hotel india")
+    w.commit()
+    with ix.searcher() as s:
+        docnum = s.document_number(id="3")
+        r = s.more_like(docnum, "text")
+        assert_equal([hit["id"] for hit in r], ["5", "4"])

File tests/

 from import assert_equal
+import threading
+
+from whoosh import fields
 from import try_for
-from whoosh.util import length_to_byte, byte_to_length
-from import TempStorage
+from whoosh.util import length_to_byte, byte_to_length, now
+from import TempStorage, TempIndex
 def test_filelock_simple():
     assert_equal(test.cache_info(), (3, 7, 5, 5))
     assert_equal(test.cache_info(), (0, 0, 5, 0))
+def test_readlock():
+    schema = fields.Schema(text=fields.TEXT)
+    with TempIndex(schema, "readlock") as ix:
+        for num in u"one two three four five".split():
+            w = ix.writer()
+            w.add_document(text=u"Test document %s" % num)
+            w.commit(merge=False)
+        def fn():
+            for _ in xrange(10):
+                r = ix.reader()
+                r.close()
+        ths = [threading.Thread(target=fn) for _ in xrange(10)]
+        for th in ths:
+            th.start()
+        for th in ths:
+            th.join()

File tests/

         r ="words", u"alfa"))
         r.filter("words", u"bottom")))
         check(r, "4")
+def test_extend_empty():
+    schema = fields.Schema(id=fields.STORED, words=fields.KEYWORD)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id=1, words=u"alfa bravo charlie")
+    w.add_document(id=1, words=u"bravo charlie delta")
+    w.add_document(id=1, words=u"charlie delta echo")
+    w.add_document(id=1, words=u"delta echo foxtrot")
+    w.add_document(id=1, words=u"echo foxtrot golf")
+    w.commit()
+    with ix.searcher() as s:
+        r1 ="words", u"hotel"))
+        assert_equal(len(r1), 0)
+        r1c = r1.copy()
+        assert_equal(len(r1c), 0)
+        r2 ="words", u"delta"))
+        assert_equal(len(r2), 3)
+        r2c = r2.copy()
+        assert_equal(len(r2c), 3)
+        r1.extend(r2)
+        assert_equal(len(r1), 3)
+        r1c.extend(r2c)
+        assert_equal(len(r1c), 3)
 def test_pages():
     from whoosh.scoring import Frequency

File tests/

 from datetime import datetime, timedelta
-from whoosh import analysis, fields, index, qparser, searching, scoring
+from whoosh import analysis, fields, index, qparser, query, searching, scoring
 from whoosh.filedb.filestore import RamStorage
 from whoosh.query import *
 from whoosh.util import permutations
         print "titles2=", titles
         assert "Juliet Kilo Bravo" in titles
+def test_collect_limit():
+    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    w.add_document(id="a", text=u"alfa bravo charlie delta echo")
+    w.add_document(id="b", text=u"bravo charlie delta echo foxtrot")
+    w.add_document(id="c", text=u"charlie delta echo foxtrot golf")
+    w.add_document(id="d", text=u"delta echo foxtrot golf hotel")
+    w.add_document(id="e", text=u"echo foxtrot golf hotel india")
+    w.commit()
+    with ix.searcher() as s:
+        r ="text", u"golf"), limit=10)
+        assert_equal(len(r), 3)
+        count = 0
+        for _ in r:
+            count += 1
+        assert_equal(count, 3)
+    w = ix.writer()
+    w.add_document(id="f", text=u"foxtrot golf hotel india juliet")
+    w.add_document(id="g", text=u"golf hotel india juliet kilo")
+    w.add_document(id="h", text=u"hotel india juliet kilo lima")
+    w.add_document(id="i", text=u"india juliet kilo lima mike")
+    w.add_document(id="j", text=u"juliet kilo lima mike november")
+    w.commit(merge=False)
+    with ix.searcher() as s:
+        r ="text", u"golf"), limit=20)
+        assert_equal(len(r), 5)
+        count = 0
+        for _ in r:
+            count += 1
+        assert_equal(count, 5)