Matt Chaput committed de28475 Merge

Merged bug fixes from default branch.

Files changed (38)

 303cef16ed5e01e6ab681a8adb9497ad00be02c4 2.3.2
 dc819b811cf9361865d2068f4b675e198fce38f2 2.4
 efc7fb29500f792cdd308b990ebbb6a5ceecba22 2.5
+00a347c14207793d07ee69a8575ab40b693c1eaa 2.5.1
 1999121eed1d96ada1fa603a88268cdd28da99d0 2.5.2

src/whoosh/analysis/acore.py

 from whoosh.compat import iteritems
 
 
+# Exceptions
+
+class CompositionError(Exception):
+    pass
+
+
 # Utility functions
 
 def unstopped(tokenstream):

src/whoosh/analysis/analyzers.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-from whoosh.analysis.acore import Composable
+from whoosh.analysis.acore import Composable, CompositionError
+from whoosh.analysis.tokenizers import Tokenizer
 from whoosh.analysis.filters import LowercaseFilter
 from whoosh.analysis.filters import StopFilter, STOP_WORDS
 from whoosh.analysis.morph import StemFilter
 class CompositeAnalyzer(Analyzer):
     def __init__(self, *composables):
         self.items = []
+
         for comp in composables:
             if isinstance(comp, CompositeAnalyzer):
                 self.items.extend(comp.items)
             else:
                 self.items.append(comp)
 
+        # Tokenizers must start a chain, and then only filters after that
+        # (because analyzers take a string and return a generator of tokens,
+        # and filters take and return generators of tokens)
+        for item in self.items[1:]:
+            if isinstance(item, Tokenizer):
+                raise CompositionError("Only one tokenizer allowed at the start"
+                                       " of the analyzer: %r" % self.items)
+
     def __repr__(self):
         return "%s(%s)" % (self.__class__.__name__,
                            ", ".join(repr(item) for item in self.items))
     >>> [token.text for token in ana("Por el mar corren las liebres")]
     ['mar', 'corr', 'liebr']
 
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stemmer` and
+    :func:`whoosh.lang.has_stopwords` to check if a given language has a
+    stemming function and/or stop word list available.
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param gaps: If True, the tokenizer *splits* on the expression, rather
         than matching on the expression.
     """
 
     from whoosh.lang import NoStemmer, NoStopWords
-    from whoosh.lang import stopwords_for_language
 
     # Make the start of the chain
     chain = (RegexTokenizer(expression=expression, gaps=gaps)
 
     # Add a stop word filter
     try:
-        stoplist = stopwords_for_language(lang)
-        chain = chain | StopFilter(stoplist=stoplist)
+        chain = chain | StopFilter(lang=lang)
     except NoStopWords:
         pass
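
A minimal sketch of the new composition check (it mirrors test_issue358 in tests/test_analysis.py; all names are existing whoosh.analysis classes):

    from whoosh import analysis

    # A tokenizer may only appear at the start of a chain. Piping in a second
    # tokenizer (here via StandardAnalyzer, which itself begins with one) now
    # raises CompositionError instead of failing later during analysis.
    tokenizer = analysis.RegexTokenizer(r"\w+")
    try:
        ana = tokenizer | analysis.StandardAnalyzer()
    except analysis.CompositionError as err:
        print("invalid chain:", err)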
 

src/whoosh/analysis/filters.py

     """Marks "stop" words (words too common to index) in the stream (and by
     default removes them).
 
-    >>> rext = RegexTokenizer()
-    >>> stream = rext("this is a test")
-    >>> stopper = StopFilter()
-    >>> [token.text for token in stopper(stream)]
-    ["this", "test"]
+    Make sure you precede this filter with a :class:`LowercaseFilter`, since
+    the stop word lists are all lowercase.
+
+    >>> stopper = RegexTokenizer() | StopFilter()
+    >>> [token.text for token in stopper(u"this is a test")]
+    ["test"]
+    >>> es_stopper = RegexTokenizer() | StopFilter(lang="es")
+    >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")]
+    ["lapiz", "mesa"]
+
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stopwords` to check if a given language
+    has a stop word list available.
     """
 
-    __inittypes__ = dict(stoplist=list, minsize=int, maxsize=int,
-                         renumber=bool)
-
     def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None,
-                 renumber=True):
+                 renumber=True, lang=None):
         """
         :param stoplist: A collection of words to remove from the stream.
             This is converted to a frozenset. The default is a list of
             common English stop words.
         :param minsize: The minimum length of token texts. Tokens with
-            text smaller than this will be stopped.
+            text smaller than this will be stopped. The default is 2.
         :param maxsize: The maximum length of token texts. Tokens with text
             larger than this will be stopped. Use None to allow any length.
         :param renumber: Change the 'pos' attribute of unstopped tokens
             to reflect their position with the stopped words removed.
-        :param remove: Whether to remove the stopped words from the stream
-            entirely. This is not normally necessary, since the indexing
-            code will ignore tokens it receives with stopped=True.
+        :param lang: Automatically get a list of stop words for the given
+            language.
         """
 
-        if stoplist is None:
-            self.stops = frozenset()
-        else:
-            self.stops = frozenset(stoplist)
+        stops = set()
+        if stoplist:
+            stops.update(stoplist)
+        if lang:
+            from whoosh.lang import stopwords_for_language
+
+            stops.update(stopwords_for_language(lang))
+
+        self.stops = frozenset(stops)
         self.min = minsize
         self.max = maxsize
         self.renumber = renumber
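
A minimal sketch of the new ``lang`` keyword; note that an explicit ``stoplist`` and the language's built-in list are merged rather than one replacing the other (assumes the Spanish stop list ships with whoosh.lang):

    from whoosh import analysis

    # "lapiz" comes from the custom stoplist; the rest are stopped by the
    # built-in Spanish list pulled in by lang="es".
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter(stoplist=["lapiz"], lang="es"))
    print([t.text for t in ana(u"el lapiz es en la mesa")])  # ["mesa"]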

src/whoosh/analysis/intraword.py

             return tk
 
         for token in tokens:
-            buf.append(token.copy())
-            if len(buf) == size:
-                atleastone = True
-                yield make_token()
-                buf.popleft()
+            if not token.stopped:
+                buf.append(token.copy())
+                if len(buf) == size:
+                    atleastone = True
+                    yield make_token()
+                    buf.popleft()
 
         # If no shingles were emitted, that is, the token stream had fewer than
         # 'size' tokens, then emit a single token with whatever tokens there
         # were
-        if not atleastone:
+        if not atleastone and buf:
             yield make_token()
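
A minimal sketch of the new behaviour, mirroring test_shingle_stopwords in tests/test_analysis.py: stopped tokens are now skipped instead of being folded into shingles.

    from whoosh import analysis

    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.ShingleFilter(size=3))
    print([t.text for t in ana(u"some other stuff and then some things To Check")])
    # ['some-other-stuff', 'other-stuff-then', 'stuff-then-some',
    #  'then-some-things', 'some-things-check']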
 
 

src/whoosh/analysis/morph.py

 
     >>> stemfilter = StemFilter(stem_function)
 
+    You can also use one of the Snowball stemming functions by passing the
+    `lang` keyword argument.
+
+    >>> stemfilter = StemFilter(lang="ru")
+
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stemmer` to check if a given language has
+    a stemming function available.
+
     By default, this class wraps an LRU cache around the stemming function. The
     ``cachesize`` keyword argument sets the size of the cache. To make the
     cache unbounded (the class caches every input), use ``cachesize=-1``. To

src/whoosh/analysis/tokenizers.py

     def __init__(self, expression="[^/]+"):
         self.expr = rcompile(expression)
 
-    def __call__(self, value, **kwargs):
-        assert isinstance(value, text_type), "%r is not unicode" % value
-        token = Token(**kwargs)
-        for match in self.expr.finditer(value):
-            token.text = value[:match.end()]
-            yield token
+    def __call__(self, value, positions=False, start_pos=0, **kwargs):
+        assert isinstance(value, text_type), "%r is not unicode" % value
+        token = Token(positions, **kwargs)
+        pos = start_pos
+        for match in self.expr.finditer(value):
+            token.text = value[:match.end()]
+            if positions:
+                token.pos = pos
+                pos += 1
+            yield token
+
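
A minimal sketch of the tokenizer with the new ``positions`` support; PathTokenizer yields a growing prefix of the path for each token, and the expected output shown is an assumption based on the code above:

    from whoosh import analysis

    pt = analysis.PathTokenizer()
    print([(t.text, t.pos) for t in pt(u"/alfa/bravo/charlie", positions=True)])
    # [('/alfa', 0), ('/alfa/bravo', 1), ('/alfa/bravo/charlie', 2)]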

src/whoosh/collectors.py

 generally a good idea to create a new collector for each search.
 """
 
+import os
+import signal
 import threading
 from array import array
 from bisect import insort
         negated = 0 - global_docnum
         items = self.items
 
-        # Search through the results for the document and remove it
+        # Remove the document if it's on the list (it may not be since
+        # TopCollector forgets documents that don't make the top N list)
         for i in xrange(len(items)):
             if items[i][1] == negated:
                 items.pop(i)
                 # Restore the heap invariant
                 heapify(items)
-                self.minscore = items[0][0]
+                self.minscore = items[0][0] if items else 0
                 return
 
-        # The document wasn't on the list... somebody's confused!
-        raise KeyError(global_docnum)
-
     def results(self):
         # The items are stored (positive score, negative docnum) so the heap
         # keeps the highest scores and lowest docnums, in order from lowest to
 
         # We can still get partial results from the collector
         print(tlc.results())
+
+    IMPORTANT: On Unix systems (systems where signal.SIGALRM is defined), the
+    code uses a signal to stop searching immediately when the time limit is
+    reached. On Windows, the OS does not support this functionality, so the
+    search only checks the elapsed time between each matched document; if a
+    single matcher is slow, the search can overrun the time limit.
     """
 
     def __init__(self, child, timelimit, greedy=False):
         self.child = child
         self.timelimit = timelimit
         self.greedy = greedy
+        self.use_alarm = hasattr(signal, "SIGALRM")
 
     def prepare(self, top_searcher, q, context):
         self.child.prepare(top_searcher, q, context)
 
+        self.timedout = False
+        if self.use_alarm:
+            signal.signal(signal.SIGALRM, self._was_signaled)
+
         # Start a timer thread. If the timer fires, it will call this object's
         # _timestop() method
-        self.timedout = False
         self.timer = threading.Timer(self.timelimit, self._timestop)
         self.timer.start()
 
     def _timestop(self):
+        # Called when the timer expires
         self.timer = None
         # Set an attribute that will be noticed in the collect_matches() loop
         self.timedout = True
 
+        if self.use_alarm:
+            os.kill(os.getpid(), signal.SIGALRM)
+
+    def _was_signaled(self, signum, frame):
+        raise TimeLimit
+
     def collect_matches(self):
         child = self.child
         greedy = self.greedy
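
A minimal, self-contained sketch of using TimeLimitCollector (the same pattern as test_timelimit in tests/test_collector.py); on Unix the new SIGALRM path interrupts a slow matcher immediately, while on Windows the limit is only checked between matched documents:

    from whoosh import collectors, fields, query
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u"alfa bravo")

    with ix.searcher() as s:
        c = collectors.TimeLimitCollector(s.collector(limit=None), timelimit=2.0)
        try:
            s.search_with_collector(query.Term("text", u"alfa"), c)
        except collectors.TimeLimit:
            pass  # partial results are still available from the collector
        print(len(c.results()))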

src/whoosh/columns.py

 
             self._read_lengths()
             # Create an array of offsets into the strings using the lengths
-            offsets = array("i", (0,))
+            offsets = array("L", (0,))
             for length in self._lengths:
                 offsets.append(offsets[-1] + length)
             self._offsets = offsets

src/whoosh/fields.py

         self.signed = signed
         self.analyzer = analysis.IDAnalyzer()
         self.format = formats.Existence(field_boost=field_boost)
-
-        # Calculate the minimum and maximum possible values for error checking
-        self.min_value = from_sortable(numtype, bits, signed, 0)
-        self.max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1)
+        self.min_value, self.max_value = self._min_max()
 
         # Column configuration
         if default is None:
 
     def __getstate__(self):
         d = self.__dict__.copy()
-        del d["_struct"]
+        if "_struct" in d:
+            del d["_struct"]
         return d
 
     def __setstate__(self, d):
         self.__dict__.update(d)
         self._struct = struct.Struct(">" + self.sortable_typecode)
+        if "min_value" not in d:
+            d["min_value"], d["max_value"] = self._min_max()
+
+    def _min_max(self):
+        numtype = self.numtype
+        bits = self.bits
+        signed = self.signed
+
+        # Calculate the minimum and maximum possible values for error checking
+        min_value = from_sortable(numtype, bits, signed, 0)
+        max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1)
+
+        return min_value, max_value
 
     def default_column(self):
         return columns.NumericColumn(self.sortable_typecode,
         dc = self.decimal_places
         if dc and isinstance(x, (string_type, Decimal)):
             x = Decimal(x) * (10 ** dc)
-        x = self.numtype(x)
+
+        try:
+            x = self.numtype(x)
+        except OverflowError:
+            raise ValueError("Value %r overflowed number type %r"
+                             % (x, self.numtype))
 
         if x < self.min_value or x > self.max_value:
             raise ValueError("Numeric field value %s out of range [%s, %s]"

src/whoosh/highlight.py

         ec = self.endchar
         fsc = fragment.startchar
         fec = fragment.endchar
-        return (fsc > sc and fsc < ec) or (fec > sc and fec < ec)
+        return (sc < fsc < ec) or (sc < fec < ec)
 
     def overlapped_length(self, fragment):
         sc = self.startchar
     """Doesn't fragment the token stream. This object just returns the entire
     entire stream as one "fragment". This is useful if you want to highlight
     the entire text.
+
+    Note that even if you use the `WholeFragmenter`, the highlight code will
+    return no fragment if no terms matched in the given field. To return the
+    whole fragment even in that case, call `highlights()` with `minscore=0`::
+
+        # Query where no terms match in the "text" field
+        q = query.Term("tag", "new")
+
+        r = mysearcher.search(q)
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        # Since no terms in the "text" field matched, we get no fragments back
+        assert r[0].highlights("text") == ""
+
+        # If we lower the minimum score to 0, we get a fragment even though it
+        # has no matching terms
+        assert r[0].highlights("text", minscore=0) == "This is the text field."
+
     """
 
     def __init__(self, charlimit=DEFAULT_CHARLIMIT):
 
 def SCORE(fragment):
     "Sorts higher scored passages first."
-    return None
+    return 1
 
 
 def FIRST(fragment):
 def top_fragments(fragments, count, scorer, order, minscore=1):
     scored_fragments = ((scorer(f), f) for f in fragments)
     scored_fragments = nlargest(count, scored_fragments)
-    best_fragments = [sf for score, sf in scored_fragments if score > minscore]
+    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
     best_fragments.sort(key=order)
     return best_fragments
 
                     assert m.id() == docnum
                     cache[docnum][text] = m.value_as("characters")
 
-    def highlight_hit(self, hitobj, fieldname, text=None, top=3):
+    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
         results = hitobj.results
         schema = results.searcher.schema
         field = schema[fieldname]
         # Convert bytes to unicode
         words = frozenset(from_bytes(term[1]) for term in bterms)
 
-        if not words:
-            # No terms matches in this field
-            return self.formatter.format([])
+        # if not words:
+        #     # No terms matches in this field
+        #     return self.formatter.format([])
 
         # If we can do "pinpoint" highlighting...
         if self.can_load_chars(results, fieldname):
         else:
             # Retokenize the text
             analyzer = results.searcher.schema[fieldname].analyzer
-            tokens = analyzer(text, chars=True, mode="query",
+            tokens = analyzer(text, positions=True, chars=True, mode="query",
                               removestops=False)
             # Set Token.matched attribute for tokens that match a query term
             tokens = set_matched_filter(tokens, words)
             fragments = self.fragmenter.fragment_tokens(text, tokens)
 
-        fragments = top_fragments(fragments, top, self.scorer, self.order)
+        fragments = top_fragments(fragments, top, self.scorer, self.order,
+                                  minscore=minscore)
         output = self.formatter.format(fragments)
         return output

src/whoosh/lang/__init__.py

 
 # Getter functions
 
+def has_stemmer(lang):
+    try:
+        return bool(stemmer_for_language(lang))
+    except NoStemmer:
+        return False
+
+
+def has_stopwords(lang):
+    try:
+        return bool(stopwords_for_language(lang))
+    except NoStopWords:
+        return False
+
+
 def stemmer_for_language(lang):
     if lang == "en_porter":
         # Original porter stemming algorithm is several times faster than the
     if tlc in snowball_classes:
         return snowball_classes[tlc]().stem
 
-    raise Exception("No stemmer available for %r" % lang)
+    raise NoStemmer("No stemmer available for %r" % lang)
 
 
 def stopwords_for_language(lang):
     if tlc in stoplists:
         return stoplists[tlc]
 
-    raise Exception("No stop-word list available for %r" % lang)
+    raise NoStopWords("No stop-word list available for %r" % lang)
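
A minimal sketch of the new helper functions (the "ru" and "xx" language codes are just illustrative):

    from whoosh import lang

    # True when a Snowball stemmer / stop word list is bundled for the code;
    # an unknown code simply returns False.
    print(lang.has_stemmer("ru"))
    print(lang.has_stopwords("xx"))   # False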

src/whoosh/lang/stopwords.py

 # coding=utf-8
 
+from __future__ import unicode_literals
+
 # Stopwords Corpus
 #
 # This module contains lists of stop words for several languages.  These
 
 
 stoplists = {
-    "da": frozenset(u"""
+    "da": frozenset("""
     og i jeg det at en den til er som på de med han af for ikke der var mig
     sig men et har om vi min havde ham hun nu over da fra du ud sin dem os
     op man hans hvor eller hvad skal selv her alle vil blev kunne ind når
     hvis din nogle hos blive mange ad bliver hendes været thi jer sådan
     """.split()),
 
-    "nl": frozenset(u"""
+    "nl": frozenset("""
     de en van ik te dat die in een hij het niet zijn is was op aan met als
     voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich
     bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u
     iemand geweest andere
     """.split()),
 
-    "en": frozenset(u"""
+    "en": frozenset("""
     i me my myself we our ours ourselves you your yours yourself yourselves
     he him his himself she her hers herself it its itself they them their
     theirs themselves what which who whom this that these those am is are
     same so than too very s t can will just don should now
     """.split()),
 
-    "fi": frozenset(u"""
+    "fi": frozenset("""
     olla olen olet on olemme olette ovat ole oli olisi olisit olisin
     olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet
     en et ei emme ette eivät minä minun minut minua minussa minusta minuun
     noin poikki yli kun niin nyt itse
     """.split()),
 
-    "fr": frozenset(u"""
+    "fr": frozenset("""
     au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma
     mais me même mes moi mon ne nos notre nous on ou par pas pour qu que
     qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l
     eusses eût eussions eussiez eussent
     """.split()),
 
-    "de": frozenset(u"""
+    "de": frozenset("""
     aber alle allem allen aller alles als also am an ander andere anderem
     anderen anderer anderes anderm andern anderr anders auch auf aus bei bin
     bis bist da damit dann der den des dem die das daß derselbe derselben
     wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen
     """.split()),
 
-    "hu": frozenset(u"""
+    "hu": frozenset("""
     a ahogy ahol aki akik akkor alatt által általában amely amelyek
     amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át
     abban ahhoz annak arra arról az azok azon azt azzal azért aztán
     voltunk vissza vele viszont volna
     """.split()),
 
-    "it": frozenset(u"""
+    "it": frozenset("""
     ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli
     dall dagl dalla dalle di del dello dei degli dell degl della delle in
     nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull
     stesse stessimo stessero stando
     """.split()),
 
-    "no": frozenset(u"""
+    "no": frozenset("""
     og i jeg det at en et den til er som på de med han av ikke ikkje der
     så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved
     fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl
     sia sidan so somt somme um upp vere vore verte vort varte vart
     """.split()),
 
-    "pt": frozenset(u"""
+    "pt": frozenset("""
     de a o que e do da em um para com não uma os no se na por mais as dos
     como mas ao ele das à seu sua ou quando muito nos já eu também só
     pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse
     teria teríamos teriam
     """.split()),
 
-    "ru": frozenset(u"""
+    "ru": frozenset("""
     и в во не что он на я с со как а то все она
     так его но да ты к у же вы за бы по только
     ее мне было вот от меня еще нет о из ему
     всегда конечно всю между
     """.split()),
 
-    "es": frozenset(u"""
+    "es": frozenset("""
     de la que el en y a los del se las por un para con no una su al lo como
     más pero sus le ya o este sí porque esta entre cuando muy sin sobre
     también me hasta hay donde quien desde todo nos durante todos uno les
     tenidas tened
     """.split()),
 
-    "sv": frozenset(u"""
+    "sv": frozenset("""
     och det att i en jag hon som han på den med var sig för så till är
     men ett om hade de av icke mig du henne då sin nu har inte hans honom
     skulle hennes där min man ej vid kunde något från ut när efter upp
     vilket sitta sådana vart dina vars vårt våra ert era vilkas
     """.split()),
 
-    "tr": frozenset(u"""
+    "tr": frozenset("""
     acaba ama aslında az bazı belki biri birkaç birşey biz bu çok
     çünkü da daha de defa diye eğer en gibi hem hep hepsi her hiç için
     ile ise kez ki kim mı mu mü nasıl ne neden nerde nerede nereye niçin

src/whoosh/matching/combo.py

             self._find_next()
             return
 
-        # Advance all submatchers
+        # Advance all active submatchers
         submatchers = self._submatchers
         active = False
         for subm in submatchers:
-            subm.skip_to(docnum)
-            active = active or subm.is_active()
+            if subm.is_active():
+                subm.skip_to(docnum)
 
-        if active:
+        if any(subm.is_active() for subm in submatchers):
             # Rebuffer
             self._docnum = self._min_id()
             self._read_part()

src/whoosh/matching/mcore.py

     def reset(self):
         self._i = 0
 
+    def skip_to(self, id):
+        if not self.is_active():
+            raise ReadTooFar
+        if id < self.id():
+            return
+
+        while self._i < len(self._ids) and self._ids[self._i] < id:
+            self._i += 1
+
     def term(self):
         return self._term
 
                               self._all_weights)
 
     def replace(self, minquality=0):
-        if not self.is_active() or (minquality
-                                    and self.max_quality() < minquality):
+        if not self.is_active():
+            return NullMatcher()
+        elif minquality and self.max_quality() < minquality:
             return NullMatcher()
         else:
             return self
     def max_quality(self):
         # This matcher treats all postings in the list as one "block", so the
         # block quality is the same as the quality of the entire list
-        return self._scorer.block_quality(self)
+        if self._scorer:
+            return self._scorer.block_quality(self)
+        else:
+            return self.block_max_weight()
 
     def block_quality(self):
         return self._scorer.block_quality(self)
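
A minimal sketch of the new ListMatcher.skip_to() (ListMatcher is constructed the same way in test_arrayunion2 in tests/test_matching.py):

    from whoosh import matching

    lm = matching.ListMatcher([1, 5, 10, 50])
    lm.skip_to(7)      # advance to the first posting with id >= 7
    print(lm.id())     # 10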

src/whoosh/query/compound.py

     """
 
     def __init__(self, subqueries, boost=1.0):
+        for subq in subqueries:
+            if not isinstance(subq, qcore.Query):
+                raise qcore.QueryError("%r is not a query" % subq)
         self.subqueries = subqueries
         self.boost = boost
 
 
     def __unicode__(self):
         r = u("(")
-        r += (self.JOINT).join([text_type(s) for s in self.subqueries])
+        r += self.JOINT.join([text_type(s) for s in self.subqueries])
         r += u(")")
         return r
 
     __str__ = __unicode__
 
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and\
-        self.subqueries == other.subqueries and\
-        self.boost == other.boost
+        return (other
+                and self.__class__ is other.__class__
+                and self.subqueries == other.subqueries
+                and self.boost == other.boost)
 
     def __getitem__(self, i):
         return self.subqueries.__getitem__(i)
         subqs = []
         seenqs = set()
         for s in subqueries:
-            if (not isinstance(s, Every) and s.field() in everyfields):
+            if not isinstance(s, Every) and s.field() in everyfields:
                 continue
             if s in seenqs:
                 continue
             raise ValueError("Unknown matcher_type %r" % self.matcher_type)
 
         return cls(subs, boost=self.boost, minmatch=self.minmatch,
-                    scale=self.scale).matcher(searcher, context)
+                   scale=self.scale).matcher(searcher, context)
 
 
 class DefaultOr(Or):

src/whoosh/query/positional.py

         # Build a list of Term queries from the words in the phrase
         reader = searcher.reader()
         for word in self.words:
-            word = field.to_bytes(word)
+            try:
+                word = field.to_bytes(word)
+            except ValueError:
+                return matching.NullMatcher()
+
             if (fieldname, word) not in reader:
                 # Shortcut the query if one of the words doesn't exist.
                 return matching.NullMatcher()

src/whoosh/query/qcore.py

 
         return iter(())
 
-    def expanded_terms(self, ixreader):
-        return self.terms()
+    def expanded_terms(self, ixreader, phrases=True):
+        return self.terms(phrases=phrases)
 
     def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None):
         """Returns a set of all byteterms in this query tree that exist in
                 continue
 
             if expand:
-                terms = q.expanded_terms(ixreader)
+                terms = q.expanded_terms(ixreader, phrases=phrases)
             else:
-                terms = q.terms(phrases)
+                terms = q.terms(phrases=phrases)
 
             for fieldname, text in terms:
                 if (fieldname, text) in termset:
                     continue
+
                 if fieldname in schema:
                     field = schema[fieldname]
-                    btext = field.to_bytes(text)
+
+                    try:
+                        btext = field.to_bytes(text)
+                    except ValueError:
+                        continue
+
                     if (fieldname, btext) in ixreader:
                         termset.add((fieldname, btext))
         return termset

src/whoosh/query/ranges.py

         if self.start is None:
             start = b("")
         else:
-            start = field.to_bytes(self.start)
+            try:
+                start = field.to_bytes(self.start)
+            except ValueError:
+                return
+
         if self.end is None:
             end = b("\xFF\xFF\xFF\xFF")
         else:
-            end = field.to_bytes(self.end)
+            try:
+                end = field.to_bytes(self.end)
+            except ValueError:
+                return
 
         for fname, t in ixreader.terms_from(fieldname, start):
             if fname != fieldname:

src/whoosh/query/terms.py

         fieldname = self.fieldname
         if fieldname not in ixreader.schema:
             return 0
+
         field = ixreader.schema[fieldname]
-        text = field.to_bytes(self.text)
+        try:
+            text = field.to_bytes(self.text)
+        except ValueError:
+            return 0
+
         return ixreader.doc_frequency(fieldname, text)
 
     def matcher(self, searcher, context=None):
             return matching.NullMatcher()
 
         field = searcher.schema[fieldname]
-        text = field.to_bytes(text)
+        try:
+            text = field.to_bytes(text)
+        except ValueError:
+            return matching.NullMatcher()
 
         if (self.fieldname, text) in searcher.reader():
             if context is None:
     def _btexts(self, ixreader):
         raise NotImplementedError(self.__class__.__name__)
 
-    def expanded_terms(self, ixreader):
+    def expanded_terms(self, ixreader, phrases=False):
         fieldname = self.field()
         if fieldname:
             for btext in self._btexts(ixreader):
         fieldname = self.fieldname
         to_bytes = ixreader.schema[fieldname].to_bytes
         for word in variations(self.text):
-            btext = to_bytes(word)
+            try:
+                btext = to_bytes(word)
+            except ValueError:
+                continue
+
             if (fieldname, btext) in ixreader:
                 yield btext
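
A minimal sketch of the new behaviour when a term value cannot be encoded for its field (it mirrors test_issue_355 in tests/test_queries.py): the query now simply matches nothing instead of raising from to_bytes().

    from whoosh import fields, query
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(seats=fields.NUMERIC(bits=8, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(seats=10)

    with ix.searcher() as s:
        # A word in a numeric field can't be converted to bytes.
        print(len(s.search(query.Term("seats", u"maker"))))  # 0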
 

src/whoosh/searching.py

             # Wrap it with a TimeLimitCollector with a time limit of
             # 10.5 seconds
             from whoosh.collectors import TimeLimitCollector
-            c = TimeLimitedCollector(c, 10.5)
+            c = TimeLimitCollector(c, 10.5)
 
             # Search using the custom collector
             results = mysearcher.search_with_collector(myquery, c)
 
     def key_terms(self, fieldname, docs=10, numterms=5,
                   model=classify.Bo1Model, normalize=True):
-        """Returns the 'numterms' most important terms from the top 'numdocs'
+        """Returns the 'numterms' most important terms from the top 'docs'
         documents in these results. "Most important" is generally defined as
         terms that occur frequently in the top hits but relatively infrequently
         in the collection as a whole.
         :param fieldname: Look at the terms in this field. This field must
             store vectors.
         :param docs: Look at this many of the top documents of the results.
-        :param terms: Return this number of important terms.
+        :param numterms: Return this number of important terms.
         :param model: The classify.ExpansionModel to use. See the classify
             module.
         :returns: list of unicode strings.
             raise NoTermsException
         return self.results.docterms[self.docnum]
 
-    def highlights(self, fieldname, text=None, top=3):
+    def highlights(self, fieldname, text=None, top=3, minscore=1):
         """Returns highlighted snippets from the given field::
 
             r = searcher.search(myquery)
             access to the text another way (for example, loading from a file or
             a database), you can supply it using the ``text`` parameter.
         :param top: the maximum number of fragments to return.
+        :param minscore: the minimum score for fragments to appear in the
+            highlights.
         """
 
         hliter = self.results.highlighter
-        return hliter.highlight_hit(self, fieldname, text=text, top=top)
+        return hliter.highlight_hit(self, fieldname, text=text, top=top,
+                                    minscore=minscore)
 
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
                        model=classify.Bo1Model, normalize=True, filter=None):
     >>> for i, fields in enumerate(page):
     ...   print("%s. %r" % (page.offset + i + 1, fields))
     >>> mysearcher.close()
+
+    To set highlighter attributes (for example ``formatter``), access the
+    underlying :class:`Results` object::
+
+        page.results.formatter = highlight.UppercaseFormatter()
+
     """
 
     def __init__(self, results, pagenum, pagelen=10):

src/whoosh/writing.py

         try:
             count = 0
             for docnum in s.docs_for_query(q, for_deletion=True):
-                if not self.is_deleted(docnum):
-                    self.delete_document(docnum)
-                    count += 1
+                self.delete_document(docnum)
+                count += 1
         finally:
             if not searcher:
                 s.close()
         # Start timer
         if self.period:
             self.timer = threading.Timer(self.period, self.commit)
+            self.timer.start()
 
     def _make_ram_index(self):
         from whoosh.codec.memory import MemoryCodec
             self.writer = self.index.writer(**self.writerargs)
             if self.period:
                 self.timer = threading.Timer(self.period, self.commit)
+                self.timer.start()
 
     def add_reader(self, reader):
         # Pass through to the underlying on-disk index
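
A minimal usage sketch for the periodic commit timer, which is now actually started (the 30-second period and the schema are just illustrative):

    from whoosh import fields, writing
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(id=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    # Commit automatically every 30 seconds or every 10 buffered documents,
    # whichever comes first.
    w = writing.BufferedWriter(ix, period=30, limit=10)
    try:
        w.add_document(id=u"1")
    finally:
        w.close()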

tests/test_analysis.py

 import pytest
 
 from whoosh import analysis, fields, qparser
-from whoosh.compat import u, unichr
+from whoosh.compat import b, u, unichr
 from whoosh.compat import dumps
 from whoosh.filedb.filestore import RamStorage
 
                                            "/alfa/bravo/charlie/delta"]
 
 
+def test_path_tokenizer2():
+    path_field = fields.TEXT(analyzer=analysis.PathTokenizer())
+    st = RamStorage()
+    schema = fields.Schema(path=path_field)
+    index = st.create_index(schema)
+
+    with index.writer() as writer:
+        writer.add_document(path=u('/alfa/brvo/charlie/delta/'))
+        writer.add_document(path=u('/home/user/file.txt'))
+    assert not index.is_empty()
+
+    with index.reader() as reader:
+        items = list(reader.all_terms())
+    assert 'path' in [field for field, value in items]
+    assert b('/alfa') in [value for field, value in items]
+
+
 def test_composition1():
     ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
     assert ca.__class__.__name__ == "CompositeAnalyzer"
                       ("lm", 14, 16)]
 
 
+@pytest.mark.skipif("sys.version_info < (2,6)")
 def test_language_analyzer():
     domain = [("da", u("Jeg gik mig over s\xf8 og land"),
                [u('gik'), u('s\xf8'), u('land')]),
         assert words == target
 
 
+@pytest.mark.skipif("sys.version_info < (2,6)")
 def test_la_pickleability():
     ana = analysis.LanguageAnalyzer("en")
     _ = dumps(ana, -1)
     _ = dumps(ana, -1)
 
 
+def test_shingle_stopwords():
+    # Note that the stop list is None here
+    ana = (analysis.RegexTokenizer()
+           | analysis.StopFilter(stoplist=None, minsize=3)
+           | analysis.ShingleFilter(size=3))
+
+    texts = [t.text for t
+             in ana(u("some other stuff and then some things To Check     "))]
+    assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then",
+                     "and-then-some", "then-some-things", "some-things-Check"]
+
+    # Use a stop list here
+    ana = (analysis.RegexTokenizer()
+           | analysis.LowercaseFilter()
+           | analysis.StopFilter()
+           | analysis.ShingleFilter(size=3))
+
+    texts = [t.text for t
+             in ana(u("some other stuff and then some things To Check     "))]
+    assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some",
+                     "then-some-things", "some-things-check"]
+
+
+def test_biword_stopwords():
+    # Note that the stop list is None here
+    ana = (analysis.RegexTokenizer()
+           | analysis.StopFilter(stoplist=None, minsize=3)
+           | analysis.BiWordFilter())
+
+    texts = [t.text for t in ana(u("stuff and then some"))]
+    assert texts == ["stuff-and", "and-then", "then-some"]
+
+    # Use a stop list here
+    ana = (analysis.RegexTokenizer()
+           | analysis.LowercaseFilter()
+           | analysis.StopFilter()
+           | analysis.BiWordFilter())
+
+    texts = [t.text for t in ana(u("stuff and then some"))]
+    assert texts == ["stuff-then", "then-some"]
+
+
+@pytest.mark.skipif("sys.version_info < (2,6)")
+def test_stop_lang():
+    stopper = analysis.RegexTokenizer() | analysis.StopFilter()
+    ls = [token.text for token in stopper(u("this is a test"))]
+    assert ls == [u("test")]
+
+    es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
+    ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))]
+    assert ls == ["lapiz", "mesa"]
+
+
+def test_issue358():
+    t = analysis.RegexTokenizer("\w+")
+    with pytest.raises(analysis.CompositionError):
+        _ = t | analysis.StandardAnalyzer()

tests/test_codecs.py

         assert (" ".join(s.field_terms("a"))
                 == "alfa bravo charlie delta echo foxtrot india")
 
-        assert reader.doc_field_length(2, "a"), 3
+        assert reader.doc_field_length(2, "a") == 3
 
         cfield = schema["c"]
         assert type(cfield), fields.NUMERIC

tests/test_collector.py

 from __future__ import with_statement
 
-from whoosh import fields, qparser, query
-from whoosh.compat import b, u
+import pytest
+
+from whoosh import collectors, fields, query, searching
+from whoosh.compat import b, u, xrange
 from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 def test_add():
         assert len(r) == 0
 
 
-def test_daterange_matched_terms():
-    from whoosh.qparser import GtLtPlugin
-    from datetime import datetime
+def test_timelimit():
+    schema = fields.Schema(text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    w = ix.writer()
+    for _ in xrange(50):
+        w.add_document(text=u("alfa"))
+    w.commit()
 
-    schema = fields.Schema(id=fields.KEYWORD(stored=True),
-                           body=fields.TEXT,
-                           num=fields.NUMERIC(stored=True, unique=True),
-                           created=fields.DATETIME(stored=True))
-    ix = RamStorage().create_index(schema)
+    import time
+    from whoosh import collectors, matching
 
-    with ix.writer() as w:
-        w.add_document(id=u"one", body=u"this and this", num='5',
-                       created=datetime.now())
-        w.add_document(id=u"three", body=u"that and that", num='7',
-                       created=datetime.now())
-        w.add_document(id=u"two", body=u"this and that", num='6',
-                       created=datetime.now())
+    class SlowMatcher(matching.WrappingMatcher):
+        def next(self):
+            time.sleep(0.02)
+            self.child.next()
+
+    class SlowQuery(query.WrappingQuery):
+        def matcher(self, searcher, context=None):
+            return SlowMatcher(self.child.matcher(searcher, context))
 
     with ix.searcher() as s:
-        parser = qparser.QueryParser("body", ix.schema)
-        parser.add_plugin(GtLtPlugin())
-        q = parser.parse(u"created:>='2013-07-01'")
-        r = s.search(q, terms=True)
+        oq = query.Term("text", u("alfa"))
+        sq = SlowQuery(oq)
 
-        assert r.has_matched_terms()
-        termlist = r[0].matched_terms()
-        assert len(termlist) == 1
-        pair = termlist[0]
-        assert pair[0] == "created"
-        assert pair[1] == b("(\x00\x00\x00\x00\x00\x80\xe1\xa3")
+        col = collectors.TimeLimitCollector(s.collector(limit=None),
+                                            timelimit=0.1)
+        with pytest.raises(searching.TimeLimit):
+            s.search_with_collector(sq, col)
 
+        col = collectors.TimeLimitCollector(s.collector(limit=40),
+                                            timelimit=0.1)
+        with pytest.raises(collectors.TimeLimit):
+            s.search_with_collector(sq, col)
+
+        col = collectors.TimeLimitCollector(s.collector(limit=None),
+                                            timelimit=0.25)
+        try:
+            s.search_with_collector(sq, col)
+            assert False  # Shouldn't get here
+        except collectors.TimeLimit:
+            r = col.results()
+            assert r.scored_length() > 0
+
+        col = collectors.TimeLimitCollector(s.collector(limit=None),
+                                            timelimit=0.5)
+        s.search_with_collector(oq, col)
+        assert col.results().runtime < 0.5
+
+
+@pytest.mark.skipif("not hasattr(__import__('signal'), 'SIGALRM')")
+def test_timelimit_alarm():
+    import time
+    from whoosh import matching
+
+    class SlowMatcher(matching.Matcher):
+        def __init__(self):
+            self._id = 0
+
+        def id(self):
+            return self._id
+
+        def is_active(self):
+            return self._id == 0
+
+        def next(self):
+            time.sleep(10)
+            self._id = 1
+
+        def score(self):
+            return 1.0
+
+    class SlowQuery(query.Query):
+        def matcher(self, searcher, context=None):
+            return SlowMatcher()
+
+    schema = fields.Schema(text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("Hello"))
+
+    with ix.searcher() as s:
+        q = SlowQuery()
+
+        t = time.time()
+        c = s.collector()
+        c = collectors.TimeLimitCollector(c, 0.2)
+        with pytest.raises(searching.TimeLimit):
+            _ = s.search_with_collector(q, c)
+        assert time.time() - t < 0.5
+
+
+
+

tests/test_columns.py

     domain = {}
     for _ in xrange(100):
         name = randstring(random.randint(5, 10))
-        value = randstring(10000)
+        value = randstring(2500)
         domain[name] = value
 
     outfiles = dict((name, BytesIO(value)) for name, value in domain.items())
 
     with TempStorage() as st:
-        msw = compound.CompoundWriter(st, buffersize=4096)
+        msw = compound.CompoundWriter(st, buffersize=1024)
         mfiles = {}
         for name in domain:
             mfiles[name] = msw.create_file(name)
         f = st.create_file("test")
         cw = col.writer(f)
         for i in xrange(size):
-            cw.add(i, str(i).encode("latin1"))
+            cw.add(i, hex(i).encode("latin1"))
         cw.finish(size)
         length = f.tell()
         f.close()
             v = cr[i]
             # Column ignores additional unique values after 65535
             if i <= 65535 - 1:
-                assert v == str(i).encode("latin1")
+                assert v == hex(i).encode("latin1")
             else:
                 assert v == b('')
         f.close()
 
             assert len(w) == 2
             assert issubclass(w[-1].category, UserWarning)
-    else:
-        rw(65537)

tests/test_dawg.py

 
 def test_random():
     def randstring():
-        length = random.randint(1, 10)
+        length = random.randint(1, 5)
         a = array("B", (random.randint(0, 255) for _ in xrange(length)))
         return array_tobytes(a)
-    keys = sorted(randstring() for _ in xrange(1000))
+    keys = sorted(randstring() for _ in xrange(100))
 
     with TempStorage() as st:
         gwrite(keys, st)

tests/test_highlighting.py

                                  fragmenter=highlight.ContextFragmenter(),
                                  formatter=highlight.UppercaseFormatter())
     assert result == "INDEXED!\n1"
+
+
+def test_whole_noterms():
+    schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"),
+                       tag=u("foo"))
+
+    with ix.searcher() as s:
+        r = s.search(query.Term("text", u("delta")))
+        assert len(r) == 1
+
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        hi = r[0].highlights("text")
+        assert hi == u("alfa bravo charlie DELTA echo foxtrot golf")
+
+        r = s.search(query.Term("tag", u("foo")))
+        assert len(r) == 1
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        hi = r[0].highlights("text")
+        assert hi == u("")
+
+        hi = r[0].highlights("text", minscore=0)
+        assert hi == u("alfa bravo charlie delta echo foxtrot golf")

tests/test_indexing.py

         assert not ix.is_empty()
 
 
-def _check_writer(name, writer_fn):
+def test_simple_indexing():
     schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
     domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
               u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
               u("kilo"), u("lima"), u("mike"), u("november"))
     docs = defaultdict(list)
-    with TempIndex(schema, name) as ix:
-        w = writer_fn(ix)
-        for i in xrange(1000):
-            smp = random.sample(domain, 5)
-            for word in smp:
-                docs[word].append(i)
-            w.add_document(text=u(" ").join(smp), id=i)
-        w.commit()
+    with TempIndex(schema, "simple") as ix:
+        with ix.writer() as w:
+            for i in xrange(100):
+                smp = random.sample(domain, 5)
+                for word in smp:
+                    docs[word].append(i)
+                w.add_document(text=u(" ").join(smp), id=i)
 
         with ix.searcher() as s:
             for word in domain:
                 assert rset == docs[word]
 
 
-def test_simple_indexing():
-    _check_writer("simplew", lambda ix: ix.writer())
-
-
 def test_integrity():
     s = fields.Schema(name=fields.TEXT, value=fields.TEXT)
     st = RamStorage()
     schema = fields.Schema(num=fields.NUMERIC(unique=True, stored=True),
                            text=fields.ID(stored=True))
     with TempIndex(schema, "updatenum") as ix:
-        nums = list(range(10)) * 3
+        nums = list(range(5)) * 3
         random.shuffle(nums)
         for num in nums:
             with ix.writer() as w:
         with ix.searcher() as s:
             results = [d["text"] for _, d in s.iter_docs()]
             results = " ".join(sorted(results))
-            assert results == "0 1 2 3 4 5 6 7 8 9"
+            assert results == "0 1 2 3 4"
 
 
 def test_reindex():
 
     with TempIndex(schema, "globlenmerge") as ix:
         with ix.writer() as w:
-            w.add_document(title=u"First document", path=u"/a",
-                           content_text=u"This is the first document we've added!")
+            w.add_document(title=u("First document"), path=u("/a"),
+                           content_text=u("This is the first document we've added!"))
 
         with ix.writer() as w:
-            w.add_document(title=u"Second document", path=u"/b",
-                           content_text=u"The second document is even more interesting!")
+            w.add_document(title=u("Second document"), path=u("/b"),
+                           content_text=u("The second document is even more interesting!"))
 
         with ix.searcher() as s:
             docnum = s.document_number(path="/a")

tests/test_matching.py

     assert aum.id() == 50
     aum.skip_to(550)
     assert aum.id() == 600
+
+
+def test_arrayunion2():
+    l1 = matching.ListMatcher([1, 2])
+    l2 = matching.ListMatcher([1, 2, 10, 20])
+    l3 = matching.ListMatcher([1, 5, 10, 50])
+    aum = matching.ArrayUnionMatcher([l1, l2, l3], 51, partsize=2)
+
+    assert aum.id() == 1
+    assert not l1.is_active()
+    aum.skip_to(50)
+    assert aum.id() == 50
+

tests/test_mpwriter.py

     docs = []
     # A ring buffer for creating string values
     buf = deque()
-    for ls in permutations(u("abcdef")):
+    for ls in permutations(u("abcd")):
         word = "".join(ls)
         # Remember this word is in the index (to check lexicon)
         words.append(word)

tests/test_queries.py

 from __future__ import with_statement
 import copy
 
+import pytest
+
 from whoosh import fields, qparser, query
 from whoosh.compat import b, u
 from whoosh.filedb.filestore import RamStorage
         else:
             return Or([nq(level - 1), nq(level - 1), nq(level - 1)])
 
-    q = nq(7)
+    q = nq(5)
     q = q.normalize()
     assert q == Or([Term("a", u("a")), Term("a", u("b"))])
 
         r2 = [hit["id"] for hit in s.search(q2, sortedby="id")]
 
         assert r1 == r2 == [4]
+
+
+def test_none_in_compounds():
+    with pytest.raises(query.QueryError):
+        _ = query.And([query.Term("a", "b"), None, query.Term("c", "d")])
+
+
+def test_issue_355():
+    schema = fields.Schema(seats=fields.NUMERIC(bits=8, stored=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(seats=0)
+        w.add_document(seats=10)
+        w.add_document(seats=20)
+
+    with ix.searcher() as s:
+        # Passing a bytestring for a numeric field
+        q = Term("seats", b("maker"))
+        r1 = [hit["seats"] for hit in s.search(q, limit=5)]
+
+        # Passing a unicode string for a numeric field
+        q = Term("seats", u("maker"))
+        r2 = [hit["seats"] for hit in s.search(q, limit=5)]
+
+        # Passing a value too large for the numeric field
+        q = Term("seats", 260)
+        r3 = [hit["seats"] for hit in s.search(q, limit=5)]
+
+        assert r1 == r2 == r3 == []
+

tests/test_reading.py

         self.ix = ix
 
     def run(self):
-        for _ in xrange(200):
+        for _ in xrange(50):
             r = self.ix.reader()
             r.close()
 
         self.ix = ix
 
     def run(self):
-        for _ in xrange(20):
+        for _ in xrange(10):
             w = self.ix.writer()
             w.add_document(text=random.sample(self.domain, 4))
             w.commit()
-            time.sleep(0.05)
+            time.sleep(0.01)
 
 
 def test_delete_recovery():

tests/test_results.py

         assert c[1] == "bravo"
         assert s.reader().has_word_graph("key")
         assert s.suggest("key", "brovo") == ["bravo"]
+
+
+def test_paged_highlights():
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("alfa bravo charlie delta echo foxtrot"))
+        w.add_document(text=u("bravo charlie delta echo foxtrot golf"))
+        w.add_document(text=u("charlie delta echo foxtrot golf hotel"))
+        w.add_document(text=u("delta echo foxtrot golf hotel india"))
+        w.add_document(text=u("echo foxtrot golf hotel india juliet"))
+        w.add_document(text=u("foxtrot golf hotel india juliet kilo"))
+
+    with ix.searcher() as s:
+        q = query.Term("text", u("alfa"))
+        page = s.search_page(q, 1, pagelen=3)
+
+        page.results.fragmenter = highlight.WholeFragmenter()
+        page.results.formatter = highlight.UppercaseFormatter()
+        hi = page[0].highlights("text")
+        assert hi == u("ALFA bravo charlie delta echo foxtrot")
+
+
+def test_phrase_keywords():
+    schema = fields.Schema(text=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("alfa bravo charlie delta"))
+        w.add_document(text=u("bravo charlie delta echo"))
+        w.add_document(text=u("charlie delta echo foxtrot"))
+        w.add_document(text=u("delta echo foxtrot alfa"))
+        w.add_document(text=u("echo foxtrot alfa bravo"))
+
+    with ix.searcher() as s:
+        q = query.Phrase("text", u("alfa bravo").split())
+        r = s.search(q)
+        assert len(r) == 2
+        kts = " ".join(t for t, score in r.key_terms("text"))
+        assert kts == "alfa bravo charlie foxtrot delta"

tests/test_searching.py

+#encoding: utf-8
+
 from __future__ import with_statement
 import copy
 from datetime import datetime, timedelta
 
 
 def test_ors():
-    domain = u("alfa bravo charlie delta echo foxtrot").split()
+    domain = u("alfa bravo charlie delta").split()
     s = fields.Schema(num=fields.STORED, text=fields.TEXT)
     st = RamStorage()
     ix = st.create_index(s)
 
 
 def test_open_numeric_ranges():
-    domain = range(0, 10000, 7)
+    domain = range(0, 1000, 7)
 
     schema = fields.Schema(num=fields.NUMERIC(stored=True))
     ix = RamStorage().create_index(schema)
         r = [hit["num"] for hit in s.search(q, limit=None)]
         assert r == [n for n in domain if n >= 100]
 
-        q = qp.parse("[to 5000]")
+        q = qp.parse("[to 500]")
         r = [hit["num"] for hit in s.search(q, limit=None)]
-        assert r == [n for n in domain if n <= 5000]
+        assert r == [n for n in domain if n <= 500]
 
 
 def test_open_date_ranges():
         assert [d["id"] for d in r] == [1, 2, 5, 7, ]
 
 
-def test_timelimit():
-    schema = fields.Schema(text=fields.TEXT)
-    ix = RamStorage().create_index(schema)
-    w = ix.writer()
-    for _ in xrange(50):
-        w.add_document(text=u("alfa"))
-    w.commit()
-
-    import time
-    from whoosh import collectors, matching
-
-    class SlowMatcher(matching.WrappingMatcher):
-        def next(self):
-            time.sleep(0.02)
-            self.child.next()
-
-    class SlowQuery(query.WrappingQuery):
-        def matcher(self, searcher, context=None):
-            return SlowMatcher(self.child.matcher(searcher, context))
-
-    with ix.searcher() as s:
-        oq = query.Term("text", u("alfa"))
-        sq = SlowQuery(oq)
-
-        col = collectors.TimeLimitCollector(s.collector(limit=None),
-                                            timelimit=0.1)
-        with pytest.raises(searching.TimeLimit):
-            s.search_with_collector(sq, col)
-
-        col = collectors.TimeLimitCollector(s.collector(limit=40),
-                                            timelimit=0.1)
-        with pytest.raises(collectors.TimeLimit):
-            s.search_with_collector(sq, col)
-
-        col = collectors.TimeLimitCollector(s.collector(limit=None),
-                                            timelimit=0.25)
-        try:
-            s.search_with_collector(sq, col)
-            assert False  # Shouldn't get here
-        except collectors.TimeLimit:
-            r = col.results()
-            assert r.scored_length() > 0
-
-        col = collectors.TimeLimitCollector(s.collector(limit=None),
-                                            timelimit=0.5)
-        s.search_with_collector(oq, col)
-        assert col.results().runtime < 0.5
-
-
 def test_fieldboost():
     schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
     ix = RamStorage().create_index(schema)

tests/test_sorting.py

         q = query.Term("ev", u("a"))
 
     correct = [d["id"] for d in sorted(docs, key=key, reverse=reverse)][:limit]
+    schema = get_schema()
 
     for fn in (make_single_index, make_multi_index):
-        with TempIndex(get_schema()) as ix:
-            fn(ix)
-            with ix.searcher() as s:
-                r = s.search(q, sortedby=sortedby, limit=limit,
-                             reverse=reverse)
-                rids = [d["id"] for d in r]
-                assert rids == correct
+        ix = RamStorage().create_index(schema)
+        fn(ix)
+        with ix.searcher() as s:
+            r = s.search(q, sortedby=sortedby, limit=limit,
+                         reverse=reverse)
+            rids = [d["id"] for d in r]
+            assert rids == correct
 
 
 def test_sortedby():

tests/test_tables.py

 
 
 def test_random_access():
-    times = 10000
+    times = 1000
     with TempStorage("orderedhash") as st:
         hw = HashWriter(st.create_file("test.hsh"))
         hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))

tests/test_writing.py

 def test_buffered():
     schema = fields.Schema(id=fields.ID, text=fields.TEXT)
     with TempIndex(schema, "buffered") as ix:
-        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
-                  u("foxtrot"), u("golf"), u("hotel"), u("india"))
+        domain = u("alfa bravo charlie delta echo foxtrot golf hotel india")
+        domain = domain.split()
 
         w = writing.BufferedWriter(ix, period=None, limit=10,
                                    commitargs={"merge": False})
-        for i in xrange(100):
+        for i in xrange(20):
             w.add_document(id=text_type(i),
                            text=u(" ").join(random.sample(domain, 5)))
-        time.sleep(0.5)
+        time.sleep(0.1)
         w.close()
 
-        assert len(ix._segments()) == 10
+        assert len(ix._segments()) == 2
 
 
 def test_buffered_search():
     with TempIndex(schema, "buffthreads") as ix:
         class SimWriter(threading.Thread):
             def run(self):
-                for _ in xrange(10):
+                for _ in xrange(5):
                     w.update_document(name=random.choice(domain))
                     time.sleep(random.uniform(0.01, 0.1))
 
         w = writing.BufferedWriter(ix, limit=10)
-        threads = [SimWriter() for _ in xrange(10)]
+        threads = [SimWriter() for _ in xrange(5)]
         for thread in threads:
             thread.start()
         for thread in threads: