Commits

Matt Chaput committed ec3c6a2 Merge

Merging cleanup2.4 branch into mainline.

  • Parent commits 3ee3e97, 2eb0159

Files changed (15)

 9a84c1a1d557b809e46808768f2c451e2560c5cd 2.2.2
 5d1064ce4c8550fe7dda58b17ab389347d6cbb77 2.3
 19c2df0a94efd8fdf7be8ea480f3cdd219a06c7a 2.3.1
+303cef16ed5e01e6ab681a8adb9497ad00be02c4 2.3.2
   full-text search solution I know of.
 * Pluggable scoring algorithm (including BM25F), text analysis, storage,
   posting format, etc.
-* Powerful query language parsed by pyparsing.
+* Powerful query language.
 * Pure Python spell-checker (as far as I know, the only one). 
 
 Whoosh might be useful in the following circumstances:

docs/source/batch.rst

 
 The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the
 number of processors the writer will use for the indexing pool (via the
-``multiprocessing`` module).
+``multiprocessing`` module)::
 
     from whoosh import index
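
The literal block is truncated in this excerpt; as a rough sketch of how the
documented ``procs`` usage typically continues (the index directory, field
name, and value are illustrative, not part of the diff):

    from whoosh import index

    ix = index.open_dir("indexdir")          # illustrative index location
    writer = ix.writer(procs=4)              # indexing pool uses 4 processes
    writer.add_document(title=u"Hello")      # assumes a schema with a "title" field
    writer.commit()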
     

docs/source/releases/2_0.rst

 Whoosh 2.x release notes
 ========================
 
+Whoosh 2.3.2
+============
+
+* Fixes a bug in the BM25F scoring function, improving the precision of
+  search results.
+
+* Fixes issues #203, #205, #206, #208, #209, #212.
+
+
+Whoosh 2.3.1
+============
+
+* Fixes issue #200.
+
+
 Whoosh 2.3
 ==========
 

src/whoosh/__init__.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-__version__ = (2, 3, 1)
+__version__ = (2, 3, 2)
 
 
 def versionstring(build=True, extra=True):

src/whoosh/analysis.py

                     t.text = stemfn(text)
             yield t
 
-
 class PyStemmerFilter(StemFilter):
     """This is a simple subclass of StemFilter that works with the py-stemmer
     third-party library. You must have the py-stemmer library installed to use
     this filter.
-    
+
     >>> PyStemmerFilter("spanish")
     """
 
         :param cachesize: the maximum number of words to cache.
         """
 
-        import Stemmer  #@UnresolvedImport
-
-        stemmer = Stemmer.Stemmer(lang)
-        stemmer.maxCacheSize = cachesize
-        self._stem = stemmer.stemWord
+        self.lang = lang
         self.ignore = frozenset() if ignore is None else frozenset(ignore)
+        self.cachesize = cachesize
+        self._stem = self._get_stemmer_fn()
 
     def algorithms(self):
         """Returns a list of stemming algorithms provided by the py-stemmer
     def cache_info(self):
         return None
 
+    def _get_stemmer_fn(self):
+        import Stemmer  #@UnresolvedImport
+
+        stemmer = Stemmer.Stemmer(self.lang)
+        stemmer.maxCacheSize = self.cachesize
+        return stemmer.stemWord
+
+    def __getstate__(self):
+        # Can't pickle a dynamic function, so we have to remove the _stem
+        # attribute from the state
+        return dict([(k, self.__dict__[k]) for k in self.__dict__
+                     if k != "_stem"])
+
+    def __setstate__(self, state):
+        # Check for old instances of StemFilter class, which didn't have a
+        # cachesize attribute and pickled the cache attribute
+        if "cachesize" not in state:
+            self.cachesize = 10000
+        if "ignores" in state:
+            self.ignore = state["ignores"]
+        elif "ignore" not in state:
+            self.ignore = frozenset()
+        if "cache" in state:
+            del state["cache"]
+
+        self.__dict__.update(state)
+        # Set the _stem attribute
+        self._stem = self._get_stemmer_fn()
+
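
A brief sketch of what the new ``__getstate__``/``__setstate__`` pair enables:
pickling an analyzer that contains a PyStemmerFilter (illustrative only;
assumes the py-stemmer package is installed and that "rains falling" stems to
"rain"/"fall"):

    import pickle
    from whoosh import analysis

    ana = analysis.RegexTokenizer() | analysis.PyStemmerFilter("english")
    data = pickle.dumps(ana)        # __getstate__ drops the unpicklable _stem
    ana2 = pickle.loads(data)       # __setstate__ rebuilds the stemmer function
    print([t.text for t in ana2(u"rains falling")])  # expected: ["rain", "fall"]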
 
 class CharsetFilter(Filter):
     """Translates the text of tokens by calling unicode.translate() using the

src/whoosh/filedb/multiproc.py

 from multiprocessing import Process, Queue, cpu_count
 
 from whoosh.compat import dump, load, xrange, iteritems
-from whoosh.filedb.filetables import Lengths
+from whoosh.filedb.filetables import LengthWriter, LengthReader
 from whoosh.filedb.fileindex import Segment
 from whoosh.filedb.filewriting import SegmentWriter
 from whoosh.filedb.pools import (imerge, read_run, PoolBase, TempfilePool)
 from whoosh.writing import IndexWriter
 
 
-## Multiprocessing writer
-#
-#class SegmentWritingTask(Process):
-#    def __init__(self, storage, indexname, segname, kwargs, jobqueue,
-#                 resultqueue, firstjob=None):
-#        Process.__init__(self)
-#        self.storage = storage
-#        self.indexname = indexname
-#        self.segname = segname
-#        self.kwargs = kwargs
-#        self.jobqueue = jobqueue
-#        self.resultqueue = resultqueue
-#        self.firstjob = firstjob
-#
-#        self.segment = None
-#        self.running = True
-#
-#    def _add_file(self, args):
-#        writer = self.writer
-#        filename, length = args
-#        f = open(filename, "rb")
-#        for _ in xrange(length):
-#            writer.add_document(**load(f))
-#        f.close()
-#        os.remove(filename)
-#
-#    def run(self):
-#        jobqueue = self.jobqueue
-#        ix = self.storage.open_index(self.indexname)
-#        writer = self.writer = SegmentWriter(ix, _lk=False, name=self.segname,
-#                                             **self.kwargs)
-#
-#        if self.firstjob:
-#            self._add_file(self.firstjob)
-#
-#        while self.running:
-#            args = jobqueue.get()
-#            if args is None:
-#                break
-#            self._add_file(args)
-#
-#        if not self.running:
-#            writer.cancel()
-#        else:
-#            writer.pool.finish(writer.termswriter, writer.docnum,
-#                               writer.lengthfile)
-#            writer._close_all()
-#            self.resultqueue.put(writer._getsegment())
-#
-#    def cancel(self):
-#        self.running = False
-#
-#
-#class MultiSegmentWriter(IndexWriter):
-#    def __init__(self, ix, procs=None, batchsize=100, dir=None, **kwargs):
-#        self.index = ix
-#        self.procs = procs or cpu_count()
-#        self.bufferlimit = batchsize
-#        self.dir = dir
-#        self.kwargs = kwargs
-#        self.kwargs["dir"] = dir
-#
-#        self.segnames = []
-#        self.tasks = []
-#        self.jobqueue = Queue(self.procs * 4)
-#        self.resultqueue = Queue()
-#        self.docbuffer = []
-#
-#        self.writelock = ix.lock("WRITELOCK")
-#        self.writelock.acquire()
-#
-#        info = ix._read_toc()
-#        self.schema = info.schema
-#        self.segment_number = info.segment_counter
-#        self.generation = info.generation + 1
-#        self.segments = info.segments
-#        self.storage = ix.storage
-#
-#    def _new_task(self, firstjob):
-#        ix = self.index
-#        self.segment_number += 1
-#        segmentname = Segment.basename(ix.indexname, self.segment_number)
-#        task = SegmentWritingTask(ix.storage, ix.indexname, segmentname,
-#                                  self.kwargs, self.jobqueue,
-#                                  self.resultqueue, firstjob)
-#        self.tasks.append(task)
-#        task.start()
-#        return task
-#
-#    def _enqueue(self):
-#        doclist = self.docbuffer
-#        fd, filename = tempfile.mkstemp(".doclist", dir=self.dir)
-#        f = os.fdopen(fd, "wb")
-#        for doc in doclist:
-#            dump(doc, f, -1)
-#        f.close()
-#        args = (filename, len(doclist))
-#
-#        if len(self.tasks) < self.procs:
-#            self._new_task(args)
-#        else:
-#            self.jobqueue.put(args)
-#
-#        self.docbuffer = []
-#
-#    def cancel(self):
-#        try:
-#            for task in self.tasks:
-#                task.cancel()
-#        finally:
-#            self.writelock.release()
-#
-#    def add_document(self, **fields):
-#        self.docbuffer.append(fields)
-#        if len(self.docbuffer) >= self.bufferlimit:
-#            self._enqueue()
-#
-#    def commit(self, **kwargs):
-#        try:
-#            # index the remaining stuff in self.docbuffer
-#            self._enqueue()
-#
-#            for task in self.tasks:
-#                self.jobqueue.put(None)
-#
-#            for task in self.tasks:
-#                task.join()
-#
-#            for task in self.tasks:
-#                taskseg = self.resultqueue.get()
-#                assert isinstance(taskseg, Segment), type(taskseg)
-#                self.segments.append(taskseg)
-#
-#            self.jobqueue.close()
-#            self.resultqueue.close()
-#
-#            from whoosh.filedb.fileindex import _write_toc, _clean_files
-#            _write_toc(self.storage, self.schema, self.index.indexname,
-#                       self.generation, self.segment_number, self.segments)
-#
-#            # Delete leftover files
-#            _clean_files(self.storage, self.index.indexname,
-#                         self.generation, self.segments)
-#        finally:
-#            self.writelock.release()
-#
-#
-## Multiprocessing pool
-#
-#class PoolWritingTask(Process):
-#    def __init__(self, schema, dir, jobqueue, resultqueue, limitmb,
-#                 firstjob=None):
-#        Process.__init__(self)
-#        self.schema = schema
-#        self.dir = dir
-#        self.jobqueue = jobqueue
-#        self.resultqueue = resultqueue
-#        self.limitmb = limitmb
-#        self.firstjob = firstjob
-#
-#    def _add_file(self, filename, length):
-#        subpool = self.subpool
-#        f = open(filename, "rb")
-#        for _ in xrange(length):
-#            code, args = load(f)
-#            if code == 0:
-#                subpool.add_content(*args)
-#            elif code == 1:
-#                subpool.add_posting(*args)
-#            elif code == 2:
-#                subpool.add_field_length(*args)
-#        f.close()
-#        os.remove(filename)
-#
-#    def run(self):
-#        jobqueue = self.jobqueue
-#        rqueue = self.resultqueue
-#        subpool = self.subpool = TempfilePool(self.schema,
-#                                              limitmb=self.limitmb,
-#                                              dir=self.dir)
-#
-#        if self.firstjob:
-#            self._add_file(*self.firstjob)
-#
-#        while True:
-#            arg1, arg2 = jobqueue.get()
-#            if arg1 is None:
-#                doccount = arg2
-#                break
-#            else:
-#                self._add_file(arg1, arg2)
-#
-#        lenfd, lenfilename = tempfile.mkstemp(".lengths", dir=subpool.dir)
-#        lenf = os.fdopen(lenfd, "wb")
-#        subpool._write_lengths(StructFile(lenf), doccount)
-#        subpool.dump_run()
-#        rqueue.put((subpool.runs, subpool.fieldlength_totals(),
-#                    subpool.fieldlength_mins(), subpool.fieldlength_maxes(),
-#                    lenfilename))
-#
-#
-#class MultiPool(PoolBase):
-#    def __init__(self, schema, dir=None, procs=2, limitmb=32, batchsize=100,
-#                 **kw):
-#        PoolBase.__init__(self, schema, dir=dir)
-#        self._make_dir()
-#
-#        self.procs = procs
-#        self.limitmb = limitmb
-#        self.jobqueue = Queue(self.procs * 4)
-#        self.resultqueue = Queue()
-#        self.tasks = []
-#        self.buffer = []
-#        self.bufferlimit = batchsize
-#
-#    def _new_task(self, firstjob):
-#        task = PoolWritingTask(self.schema, self.dir, self.jobqueue,
-#                               self.resultqueue, self.limitmb,
-#                               firstjob=firstjob)
-#        self.tasks.append(task)
-#        task.start()
-#        return task
-#
-#    def _enqueue(self):
-#        commandlist = self.buffer
-#        fd, filename = tempfile.mkstemp(".commands", dir=self.dir)
-#        f = os.fdopen(fd, "wb")
-#        for command in commandlist:
-#            dump(command, f, -1)
-#        f.close()
-#        args = (filename, len(commandlist))
-#
-#        if len(self.tasks) < self.procs:
-#            self._new_task(args)
-#        else:
-#            self.jobqueue.put(args)
-#
-#        self.buffer = []
-#
-#    def _append(self, item):
-#        self.buffer.append(item)
-#        if len(self.buffer) > self.bufferlimit:
-#            self._enqueue()
-#
-#    def add_content(self, *args):
-#        self._append((0, args))
-#
-#    def add_posting(self, *args):
-#        self.postingqueue.put((1, args))
-#
-#    def add_field_length(self, *args):
-#        self.postingqueue.put((2, args))
-#
-#    def cancel(self):
-#        for task in self.tasks:
-#            task.terminate()
-#        self.cleanup()
-#
-#    def cleanup(self):
-#        self._clean_temp_dir()
-#
-#    def finish(self, termswriter, doccount, lengthfile):
-#        if self.buffer:
-#            self._enqueue()
-#
-#        _fieldlength_totals = self._fieldlength_totals
-#        if not self.tasks:
-#            return
-#
-#        jobqueue = self.jobqueue
-#        rqueue = self.resultqueue
-#
-#        for task in self.tasks:
-#            jobqueue.put((None, doccount))
-#
-#        for task in self.tasks:
-#            task.join()
-#
-#        runs = []
-#        lenfilenames = []
-#        for task in self.tasks:
-#            truns, flentotals, flenmins, flenmaxes, lenfilename = rqueue.get()
-#            runs.extend(truns)
-#            lenfilenames.append(lenfilename)
-#            for fieldname, total in iteritems(flentotals):
-#                _fieldlength_totals[fieldname] += total
-#
-#            for fieldname, length in iteritems(flenmins):
-#                if length < self._fieldlength_maxes.get(fieldname, 9999999999):
-#                    self._fieldlength_mins[fieldname] = length
-#
-#            for fieldname, length in flenmaxes.iteritems():
-#                if length > self._fieldlength_maxes.get(fieldname, 0):
-#                    self._fieldlength_maxes[fieldname] = length
-#
-#        jobqueue.close()
-#        rqueue.close()
-#
-#        lengths = Lengths()
-#        for lenfilename in lenfilenames:
-#            sublengths = Lengths.from_file(StructFile(open(lenfilename, "rb")),
-#                                           doccount)
-#            lengths.add_all(sublengths)
-#            os.remove(lenfilename)
-#        lengths.to_file(lengthfile, doccount)
-#
-##        if len(runs) >= self.procs * 2:
-##            pool = Pool(self.procs)
-##            tempname = lambda: tempfile.mktemp(suffix=".run", dir=self.dir)
-##            while len(runs) >= self.procs * 2:
-##                runs2 = [(runs[i:i+4], tempname())
-##                         for i in xrange(0, len(runs), 4)]
-##                if len(runs) % 4:
-##                    last = runs2.pop()[0]
-##                    runs2[-1][0].extend(last)
-##                runs = pool.map(merge_runs, runs2)
-##            pool.close()
-#
-#        iterator = imerge([read_run(rname, count) for rname, count in runs])
-#        total = sum(count for runname, count in runs)
-#        termswriter.add_iter(iterator, lengths.get)
-#        for runname, count in runs:
-#            os.remove(runname)
-#
-#        self.cleanup()
+# Multiprocessing writer
+
+class SegmentWritingTask(Process):
+    def __init__(self, storage, indexname, segname, kwargs, jobqueue,
+                 resultqueue, firstjob=None):
+        Process.__init__(self)
+        self.storage = storage
+        self.indexname = indexname
+        self.segname = segname
+        self.kwargs = kwargs
+        self.jobqueue = jobqueue
+        self.resultqueue = resultqueue
+        self.firstjob = firstjob
+
+        self.segment = None
+        self.running = True
+
+    def _add_file(self, args):
+        writer = self.writer
+        filename, length = args
+        f = open(filename, "rb")
+        for _ in xrange(length):
+            writer.add_document(**load(f))
+        f.close()
+        os.remove(filename)
+
+    def run(self):
+        jobqueue = self.jobqueue
+        ix = self.storage.open_index(self.indexname)
+        writer = self.writer = SegmentWriter(ix, _lk=False, name=self.segname,
+                                             **self.kwargs)
+
+        if self.firstjob:
+            self._add_file(self.firstjob)
+
+        while self.running:
+            args = jobqueue.get()
+            if args is None:
+                break
+            self._add_file(args)
+
+        if not self.running:
+            writer.cancel()
+        else:
+            writer.pool.finish(writer.termswriter, writer.docnum,
+                               writer.lengthfile)
+            writer._close_all()
+            self.resultqueue.put(writer._getsegment())
+
+    def cancel(self):
+        self.running = False
+
+
+class MultiSegmentWriter(IndexWriter):
+    def __init__(self, ix, procs=None, batchsize=100, dir=None, **kwargs):
+        self.index = ix
+        self.procs = procs or cpu_count()
+        self.bufferlimit = batchsize
+        self.dir = dir
+        self.kwargs = kwargs
+        self.kwargs["dir"] = dir
+
+        self.segnames = []
+        self.tasks = []
+        self.jobqueue = Queue(self.procs * 4)
+        self.resultqueue = Queue()
+        self.docbuffer = []
+
+        self.writelock = ix.lock("WRITELOCK")
+        self.writelock.acquire()
+
+        info = ix._read_toc()
+        self.schema = info.schema
+        self.segment_number = info.segment_counter
+        self.generation = info.generation + 1
+        self.segments = info.segments
+        self.storage = ix.storage
+
+    def _new_task(self, firstjob):
+        ix = self.index
+        self.segment_number += 1
+        segmentname = Segment.basename(ix.indexname, self.segment_number)
+        task = SegmentWritingTask(ix.storage, ix.indexname, segmentname,
+                                  self.kwargs, self.jobqueue,
+                                  self.resultqueue, firstjob)
+        self.tasks.append(task)
+        task.start()
+        return task
+
+    def _enqueue(self):
+        doclist = self.docbuffer
+        fd, filename = tempfile.mkstemp(".doclist", dir=self.dir)
+        f = os.fdopen(fd, "wb")
+        for doc in doclist:
+            dump(doc, f, -1)
+        f.close()
+        args = (filename, len(doclist))
+
+        if len(self.tasks) < self.procs:
+            self._new_task(args)
+        else:
+            self.jobqueue.put(args)
+
+        self.docbuffer = []
+
+    def cancel(self):
+        try:
+            for task in self.tasks:
+                task.cancel()
+        finally:
+            self.writelock.release()
+
+    def add_document(self, **fields):
+        self.docbuffer.append(fields)
+        if len(self.docbuffer) >= self.bufferlimit:
+            self._enqueue()
+
+    def commit(self, **kwargs):
+        try:
+            # index the remaining stuff in self.docbuffer
+            self._enqueue()
+
+            for task in self.tasks:
+                self.jobqueue.put(None)
+
+            for task in self.tasks:
+                task.join()
+
+            for task in self.tasks:
+                taskseg = self.resultqueue.get()
+                assert isinstance(taskseg, Segment), type(taskseg)
+                self.segments.append(taskseg)
+
+            self.jobqueue.close()
+            self.resultqueue.close()
+
+            from whoosh.filedb.fileindex import _write_toc, _clean_files
+            _write_toc(self.storage, self.schema, self.index.indexname,
+                       self.generation, self.segment_number, self.segments)
+
+            # Delete leftover files
+            _clean_files(self.storage, self.index.indexname,
+                         self.generation, self.segments)
+        finally:
+            self.writelock.release()
+
+
+# Multiprocessing pool
+
+class PoolWritingTask(Process):
+    def __init__(self, schema, dir, jobqueue, resultqueue, limitmb,
+                 firstjob=None):
+        Process.__init__(self)
+        self.schema = schema
+        self.dir = dir
+        self.jobqueue = jobqueue
+        self.resultqueue = resultqueue
+        self.limitmb = limitmb
+        self.firstjob = firstjob
+
+    def _add_file(self, filename, length):
+        subpool = self.subpool
+        f = open(filename, "rb")
+        for _ in xrange(length):
+            code, args = load(f)
+            if code == 0:
+                subpool.add_content(*args)
+            elif code == 1:
+                subpool.add_posting(*args)
+            elif code == 2:
+                subpool.add_field_length(*args)
+        f.close()
+        os.remove(filename)
+
+    def run(self):
+        jobqueue = self.jobqueue
+        rqueue = self.resultqueue
+        subpool = self.subpool = TempfilePool(self.schema,
+                                              limitmb=self.limitmb,
+                                              dir=self.dir)
+
+        if self.firstjob:
+            self._add_file(*self.firstjob)
+
+        while True:
+            arg1, arg2 = jobqueue.get()
+            if arg1 is None:
+                doccount = arg2
+                break
+            else:
+                self._add_file(arg1, arg2)
+
+        lenfd, lenfilename = tempfile.mkstemp(".lengths", dir=subpool.dir)
+        lenf = os.fdopen(lenfd, "wb")
+        subpool._write_lengths(StructFile(lenf), doccount)
+        subpool.dump_run()
+        rqueue.put((subpool.runs, subpool.fieldlength_totals(),
+                    subpool.fieldlength_mins(), subpool.fieldlength_maxes(),
+                    lenfilename))
+
+
+class MultiPool(PoolBase):
+    def __init__(self, schema, dir=None, procs=2, limitmb=32, batchsize=100,
+                 **kw):
+        PoolBase.__init__(self, schema, dir=dir)
+        self._make_dir()
+
+        self.procs = procs
+        self.limitmb = limitmb
+        self.jobqueue = Queue(self.procs * 4)
+        self.resultqueue = Queue()
+        self.tasks = []
+        self.buffer = []
+        self.bufferlimit = batchsize
+
+    def _new_task(self, firstjob):
+        task = PoolWritingTask(self.schema, self.dir, self.jobqueue,
+                               self.resultqueue, self.limitmb,
+                               firstjob=firstjob)
+        self.tasks.append(task)
+        task.start()
+        return task
+
+    def _enqueue(self):
+        commandlist = self.buffer
+        fd, filename = tempfile.mkstemp(".commands", dir=self.dir)
+        f = os.fdopen(fd, "wb")
+        for command in commandlist:
+            dump(command, f, -1)
+        f.close()
+        args = (filename, len(commandlist))
+
+        if len(self.tasks) < self.procs:
+            self._new_task(args)
+        else:
+            self.jobqueue.put(args)
+
+        self.buffer = []
+
+    def _append(self, item):
+        self.buffer.append(item)
+        if len(self.buffer) > self.bufferlimit:
+            self._enqueue()
+
+    def add_content(self, *args):
+        self._append((0, args))
+
+    def add_posting(self, *args):
+        self._append((1, args))
+
+    def add_field_length(self, *args):
+        self._append((2, args))
+
+    def cancel(self):
+        for task in self.tasks:
+            task.terminate()
+        self.cleanup()
+
+    def cleanup(self):
+        self._clean_temp_dir()
+
+    def finish(self, termswriter, doccount, lengthfile):
+        if self.buffer:
+            self._enqueue()
+
+        _fieldlength_totals = self._fieldlength_totals
+        if not self.tasks:
+            return
+
+        jobqueue = self.jobqueue
+        rqueue = self.resultqueue
+
+        for task in self.tasks:
+            jobqueue.put((None, doccount))
+
+        for task in self.tasks:
+            task.join()
+
+        runs = []
+        lenfilenames = []
+        for task in self.tasks:
+            truns, flentotals, flenmins, flenmaxes, lenfilename = rqueue.get()
+            runs.extend(truns)
+            lenfilenames.append(lenfilename)
+            for fieldname, total in iteritems(flentotals):
+                _fieldlength_totals[fieldname] += total
+
+            for fieldname, length in iteritems(flenmins):
+                if length < self._fieldlength_maxes.get(fieldname, 9999999999):
+                    self._fieldlength_mins[fieldname] = length
+
+            for fieldname, length in flenmaxes.iteritems():
+                if length > self._fieldlength_maxes.get(fieldname, 0):
+                    self._fieldlength_maxes[fieldname] = length
+
+        jobqueue.close()
+        rqueue.close()
+
+        lw = LengthWriter(lengthfile, doccount)
+        for lenfilename in lenfilenames:
+            sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
+                                      doccount)
+            lw.add_all(sublengths)
+            os.remove(lenfilename)
+        lw.close()
+        lengths = lw.reader()
+
+#        if len(runs) >= self.procs * 2:
+#            pool = Pool(self.procs)
+#            tempname = lambda: tempfile.mktemp(suffix=".run", dir=self.dir)
+#            while len(runs) >= self.procs * 2:
+#                runs2 = [(runs[i:i+4], tempname())
+#                         for i in xrange(0, len(runs), 4)]
+#                if len(runs) % 4:
+#                    last = runs2.pop()[0]
+#                    runs2[-1][0].extend(last)
+#                runs = pool.map(merge_runs, runs2)
+#            pool.close()
+
+        iterator = imerge([read_run(rname, count) for rname, count in runs])
+        total = sum(count for runname, count in runs)
+        termswriter.add_iter(iterator, lengths.get)
+        for runname, count in runs:
+            os.remove(runname)
+
+        self.cleanup()
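
For context, a minimal, illustrative sketch of how the restored
MultiSegmentWriter might be used directly (the index path and field name are
assumptions, not taken from the diff):

    from whoosh import index
    from whoosh.filedb.multiproc import MultiSegmentWriter

    ix = index.open_dir("indexdir")            # illustrative path
    writer = MultiSegmentWriter(ix, procs=4)   # each process writes its own segment
    writer.add_document(title=u"Hello")        # assumes a schema with a "title" field
    writer.commit()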

src/whoosh/qparser/common.py

 parser modules.
 """
 
+from __future__ import print_function
 from whoosh.compat import string_type
 
 

src/whoosh/qparser/plugins.py

     (?P<start>
         ('[^']*?'\s+)             # single-quoted 
         |                         # or
-        (.+?(?=[Tt][Oo]))         # everything until "to"
+        ([^\]}]+?(?=[Tt][Oo]))    # everything until "to"
     )?
     [Tt][Oo]                      # "to"
     (?P<end>
         (\s+'[^']*?')             # single-quoted
         |                         # or
-        ((.+?)(?=]|}))            # everything until "]" or "}"
+        ([^\]}]+?)                # everything until "]" or "}"
     )?
     (?P<close>}|])                # Close paren
     """, verbose=True)

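To illustrate the bracket forms this pattern has to recognize, a quick parsing
sketch (the schema and field names are illustrative; it mirrors the new
tests/test_parsing.py cases further down):

    from whoosh import fields, qparser

    schema = fields.Schema(text=fields.TEXT, start=fields.NUMERIC)
    qp = qparser.QueryParser("text", schema)
    print(qp.parse(u"start:[2008 to]"))       # open-ended range
    print(qp.parse(u"start:[2011 to 2012]"))  # bounded range
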
src/whoosh/query.py

         raise NotImplementedError
 
     def _find_prefix(self, text):
+        # Subclasses/instances should set the SPECIAL_CHARS attribute to a set
+        # of characters that mark the end of the literal prefix
         specialchars = self.SPECIAL_CHARS
         for i, char in enumerate(self.text):
             if char in specialchars:
         else:
             return self
 
+    # _words() implemented in PatternQuery
+
 
 class Regex(PatternQuery):
     """Matches documents that contain any terms that match a regular
 
     __str__ = __unicode__
 
-    def _get_pattern(self, text):
-        return text
+    def _get_pattern(self):
+        return self.text
 
-    def _get_prefix(self, text):
+    def _find_prefix(self, text):
         if "|" in text:
             return ""
         if text.startswith("^"):
 
         return PatternQuery._find_prefix(self, text)
 
+    # _words() implemented in PatternQuery
+
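
A small sketch of the prefix optimization these methods support, matching the
behaviour exercised in tests/test_queries.py below:

    from whoosh import query

    # _find_prefix() returns the literal characters before the first special
    # character, so the query only has to scan terms with that prefix
    assert query.Wildcard("word", u"a*[ae]")._find_prefix(u"a*[ae]") == "a"
    assert query.Regex("word", u"am.*[ae]")._find_prefix(u"am.*[ae]") == "am"
    assert query.Regex("word", u"able|ago")._find_prefix(u"able|ago") == ""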
 
 class ExpandingTerm(MultiTerm):
     """Intermediate base class for queries such as FuzzyTerm and Variations

src/whoosh/scoring.py

     # avgfl - average field length across documents in collection
     # B, K1 - free paramters
 
-    return idf * ((tf * (K1 + 1)) / (tf + K1 * (1 - B + B * (fl / avgfl))))
+    return idf * ((tf * (K1 + 1)) / (tf + K1 * ((1 - B) + B * fl / avgfl)))
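
A quick numeric sanity check of the saturation term above, using made-up
values (B=0.75, K1=1.2, idf=1.5, tf=3, fl=100, avgfl=120):

    B, K1, idf, tf, fl, avgfl = 0.75, 1.2, 1.5, 3.0, 100.0, 120.0
    score = idf * ((tf * (K1 + 1)) / (tf + K1 * ((1 - B) + B * fl / avgfl)))
    # (1 - B) + B * fl / avgfl == 0.875, so score == 1.5 * 6.6 / 4.05 ≈ 2.44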
 
 
 class BM25F(WeightingModel):

tests/test_analysis.py

+from __future__ import with_statement
+
 from nose.tools import assert_equal  #@UnresolvedImport
 
 from whoosh import analysis, fields, qparser
 from whoosh.compat import u, unichr
 from whoosh.filedb.filestore import RamStorage
+from whoosh.support.testing import skip_if_unavailable
 
 
 def test_regextokenizer():
     value = u("AAAaaaBBBbbbCCCcccDDDddd")
-    
+
     rex = analysis.RegexTokenizer("[A-Z]+")
     assert_equal([t.text for t in rex(value)], ["AAA", "BBB", "CCC", "DDD"])
-    
+
     rex = analysis.RegexTokenizer("[A-Z]+", gaps=True)
     assert_equal([t.text for t in rex(value)], ["aaa", "bbb", "ccc", "ddd"])
 
         for t in tokens:
             t.text = t.text.upper()
             yield t
-            
+
     analyzer = analysis.RegexTokenizer() | filter
     assert_equal([t.text for t in analyzer(u("abc def"))], ["ABC", "DEF"])
 
 def test_shared_composition():
     shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
-    
+
     ana1 = shared | analysis.NgramFilter(3)
     ana2 = shared | analysis.DoubleMetaphoneFilter()
-    
+
     assert_equal([t.text for t in ana1(u("hello"))], ["hel", "ell", "llo"])
     assert_equal([t.text for t in ana2(u("hello"))], ["HL"])
 
     ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
     result = " ".join([t.text for t in ana(target)])
     assert_equal(result, "alfa aflA bravo ovarB charlie eilrahC")
-    
+
     class ucfilter(analysis.Filter):
         def __call__(self, tokens):
             for t in tokens:
                 t.text = t.text.upper()
                 yield t
-    
+
     f2 = analysis.ReverseTextFilter() | ucfilter()
     ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
     result = " ".join([t.text for t in ana(target)])
     assert_equal(result, "alfa AFLA bravo OVARB charlie EILRAHC")
-    
+
     f1 = analysis.PassFilter()
     f2 = analysis.BiWordFilter()
     ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2) | analysis.LowercaseFilter()
 def test_intraword():
     iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
     ana = analysis.RegexTokenizer(r"\S+") | iwf
-    
+
     def check(text, ls):
         assert_equal([(t.pos, t.text) for t in ana(text)], ls)
-        
+
     check(u("PowerShot)"), [(0, "Power"), (1, "Shot"), (1, "PowerShot")])
     check(u("A's+B's&C's"), [(0, "A"), (1, "B"), (2, "C"), (2, "ABC")])
     check(u("Super-Duper-XL500-42-AutoCoder!"),
 def test_intraword_chars():
     iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
     ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()
-    
+
     target = u("WiKiWo-rd")
     tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)]
     assert_equal(tokens, [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6),
                           ("rd", 7, 9), ("wikiword", 0, 9)])
-    
+
     target = u("Zo WiKiWo-rd")
     tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)]
     assert_equal(tokens, [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7),
 def test_intraword_possessive():
     iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
     ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()
-    
+
     target = u("O'Malley's-Bar")
     tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)]
     assert_equal(tokens, [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14),
 
 def test_word_segments():
     wordset = set(u("alfa bravo charlie delta").split())
-    
+
     cwf = analysis.CompoundWordFilter(wordset, keep_compound=True)
     ana = analysis.RegexTokenizer(r"\S+") | cwf
     target = u("alfacharlie bravodelta delto bravo subalfa")
     tokens = [t.text for t in ana(target)]
     assert_equal(tokens, ["alfacharlie", "alfa", "charlie", "bravodelta",
                           "bravo", "delta", "delto", "bravo", "subalfa"])
-    
+
     cwf = analysis.CompoundWordFilter(wordset, keep_compound=False)
     ana = analysis.RegexTokenizer(r"\S+") | cwf
     target = u("alfacharlie bravodelta delto bravo subalfa")
     tokens = [t.text for t in ana(target)]
     assert_equal(tokens, ["alfa", "charlie", "bravo", "delta", "delto", "bravo", "subalfa"])
-    
+
     #target = u("alfacharlie bravodelta")
     #tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)]
     #assert_equal(tokens, [("alfa", 0, 4), ("charlie", 4, 11), ("bravo", 12, 17), ("delta", 17, 22)])
     assert_equal(["the-sign", "sign-of", "of-four"], [t.text for t in result])
     assert_equal([(0, 8), (4, 11), (9, 16)], [(t.startchar, t.endchar) for t in result])
     assert_equal([0, 1, 2], [t.pos for t in result])
-    
+
     result = [t.copy() for t in ana(u("single"))]
     assert_equal(len(result), 1)
     assert_equal(result[0].text, "single")
     assert_equal([t.pos for t in results], list(range(len(results))))
     for t in results:
         assert_equal(t.text, source[t.startchar:t.endchar])
-    
+
 def test_unicode_blocks():
     from whoosh.support.unicode import blocks, blockname, blocknum
-    
+
     assert_equal(blockname(u('a')), 'Basic Latin')
     assert_equal(blockname(unichr(0x0b80)), 'Tamil')
     assert_equal(blockname(unichr(2048)), None)
     assert_equal(blocknum(unichr(2048)), None)
     assert_equal(blocknum(u('a')), blocks.Basic_Latin)  #@UndefinedVariable
     assert_equal(blocknum(unichr(0x0b80)), blocks.Tamil)  #@UndefinedVariable
-    
+
 def test_double_metaphone():
     mf = analysis.RegexTokenizer() | analysis.LowercaseFilter() | analysis.DoubleMetaphoneFilter()
     results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
     assert_equal(results, [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)])
-    
+
     mf = analysis.RegexTokenizer() | analysis.LowercaseFilter() | analysis.DoubleMetaphoneFilter(combine=True)
     results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
     assert_equal(results, [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
     mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "")
     assert_equal([t.text for t in mf(u("one-two th-re-ee four"))],
                  ["onetwo", "threee", "four"])
-    
+
     mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1")
     assert_equal([t.text for t in mf(u("a=b c=d ef"))], ["b=a", "d=c", "ef"])
 
     ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
     results = [(t.text, t.boost) for t in ana(u("image render^2 file^0.5"))]
     assert_equal(results, [("image", 1.0), ("render", 2.0), ("file", 0.5)])
-    
+
 def test_porter2():
     from whoosh.lang.porter2 import stem
-    
+
     plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
                'died', 'agreed', 'owned', 'humbled', 'sized',
                'meeting', 'stating', 'siezing', 'itemization',
                'sensational', 'traditional', 'reference', 'colonizer',
                'plotted']
     singles = [stem(w) for w in plurals]
-    
+
     assert_equal(singles, ['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre',
                            'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item',
                            'sensat', 'tradit', 'refer', 'colon', 'plot'])
     assert_equal(stem("bill's"), "bill")
     assert_equal(stem("y's"), "y")
 
+@skip_if_unavailable("Stemmer")
+def test_pystemmer():
+    ana = (analysis.RegexTokenizer()
+           | analysis.LowercaseFilter()
+           | analysis.PyStemmerFilter())
+    schema = fields.Schema(text=fields.TEXT(analyzer=ana))
+    st = RamStorage()
+
+    ix = st.create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("rains falling strangely"))
+
+    ix = st.open_index()
+    with ix.writer() as w:
+        w.add_document(text=u("pains stalling strongly"))
+
+    ix = st.open_index()
+    with ix.reader() as r:
+        print(list(r.lexicon("text")))
+        assert_equal(list(r.lexicon("text")), ["fall", "pain", "rain", "stall",
+                                               "strang", "strong"])
+
 def test_url():
     sample = u("Visit http://bitbucket.org/mchaput/whoosh or urn:isbn:5930502 or http://www.apple.com/.")
-    
+
     for ana in (analysis.SimpleAnalyzer(analysis.url_pattern),
                 analysis.StandardAnalyzer(analysis.url_pattern, stoplist=None)):
         ts = [t.text for t in ana(sample)]
            | analysis.DoubleMetaphoneFilter(combine=True))
     namefield = fields.TEXT(analyzer=ana, multitoken_query="or")
     schema = fields.Schema(id=fields.STORED, name=namefield)
-    
+
     ix = RamStorage().create_index(schema)
     w = ix.writer()
     w.add_document(id=u("one"), name=u("Leif Ericson"))
     w.commit()
-    
+
     s = ix.searcher()
     qp = qparser.QueryParser("name", schema)
     q = qp.parse(u("leaf eriksen"), normalize=False)
     ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
     kw = {"positions": True}
     assert_equal([t.pos for t in formats.tokens(u("alfa bravo charlie delta"), ana, kw)], [0, 1, 2, 3])
-    
+
     kw["start_pos"] = 3
     ts = [t.copy() for t in formats.tokens(u("A B C D").split(), ana, kw)]
     assert_equal(" ".join([t.text for t in ts]), "A B C D")
     # text is all delimiters
     tokens = [t.text for t in ana(u(":-("))]
     assert_equal(tokens, [])
-    
+
     # text has consecutive delimiters
     tokens = [t.text for t in ana(u("LOL:)"))]
     assert_equal(tokens, ["LOL"])

tests/test_parse_plugins.py

     assert_equal(q.startdate, adatetime(2010, 3, 30).floor())
     assert_equal(q.enddate, None)
 
-    print("!!!!!!!!!!!!!!!!!!!!")
     q = qp.parse(u("date:[30 march to next wednesday]"))
     print("q=", q)
     assert_equal(q.__class__, query.DateRange)
     assert_equal(q.startdate, adatetime(2010, 3, 30).floor())
     assert_equal(q.enddate, None)
 
+def test_daterange_multi():
+    schema = fields.Schema(text=fields.TEXT, start=fields.DATETIME, end=fields.DATETIME)
+    qp = qparser.QueryParser("text", schema)
+    basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
+    qp.add_plugin(dateparse.DateParserPlugin(basedate))
+
+    q = qp.parse("start:[2008 to] AND end:[2011 to 2011]")
+    assert_equal(q.__class__, query.And)
+    assert_equal(q[0].__class__, query.DateRange)
+    assert_equal(q[1].__class__, query.DateRange)
+    assert_equal(q[0].startdate, adatetime(2008).floor())
+    assert_equal(q[0].enddate, None)
+    assert_equal(q[1].startdate, adatetime(2011).floor())
+    assert_equal(q[1].enddate, adatetime(2011).ceil())
+
 def test_daterange_empty_field():
     schema = fields.Schema(test=fields.DATETIME)
     ix = RamStorage().create_index(schema)

tests/test_parsing.py

     assert_equal(q.start, None)
     assert_equal(q.end, None)
 
+def test_numrange_multi():
+    schema = fields.Schema(text=fields.TEXT, start=fields.NUMERIC, end=fields.NUMERIC)
+    qp = default.QueryParser("text", schema)
+
+    q = qp.parse("start:[2008 to]")
+    assert_equal(q.__class__, query.NumericRange)
+    assert_equal(q.fieldname, "start")
+    assert_equal(q.start, 2008)
+    assert_equal(q.end, None)
+
+    q = qp.parse("start:[2011 to 2012]")
+    assert_equal(q.__class__.__name__, "NumericRange")
+    assert_equal(q.fieldname, "start")
+    assert_equal(q.start, 2011)
+    assert_equal(q.end, 2012)
+
+    q = qp.parse("start:[2008 to] AND end:[2011 to 2012]")
+    assert_equal(q.__class__, query.And)
+    assert_equal(q[0].__class__, query.NumericRange)
+    assert_equal(q[1].__class__, query.NumericRange)
+    assert_equal(q[0].start, 2008)
+    assert_equal(q[0].end, None)
+    assert_equal(q[1].start, 2011)
+    assert_equal(q[1].end, 2012)
+
 def test_nonexistant_fieldnames():
     # Need an analyzer that won't mangle a URL
     a = analysis.SimpleAnalyzer("\\S+")

tests/test_queries.py

+from __future__ import with_statement
+
 from nose.tools import assert_equal, assert_not_equal  #@UnresolvedImport
 
 import copy
     r = s.search(DateRange('released', datetime(2007, 1, 1), None))
     assert_equal(len(r), 1)
     assert_equal(r[0].highlights("content"), '')
+
+def test_patterns():
+    domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo "
+               "alembic all amiga amount ampere").split()
+    schema = fields.Schema(word=fields.KEYWORD(stored=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        for word in domain:
+            w.add_document(word=word)
+
+    with ix.reader() as r:
+        assert_equal(list(r.lexicon("word")), domain)
+
+        assert_equal(list(r.expand_prefix("word", "al")), ["alembic", "all"])
+        q = query.Prefix("word", "al")
+        assert_equal(q.simplify(r).__unicode__(), "(word:alembic OR word:all)")
+
+        q = query.Wildcard("word", "a*[ae]")
+        assert_equal(q.simplify(r).__unicode__(),
+                     "(word:able OR word:acre OR word:adage OR word:amiga OR word:ampere)")
+        assert_equal(q._find_prefix(q.text), "a")
+
+        q = query.Regex("word", "am.*[ae]")
+        assert_equal(q.simplify(r).__unicode__(), "(word:amiga OR word:ampere)")
+        assert_equal(q._find_prefix(q.text), "am")
+
+        q = query.Regex("word", "able|ago")
+        assert_equal(q.simplify(r).__unicode__(), "(word:able OR word:ago)")
+        assert_equal(q._find_prefix(q.text), "")
+
+