Commits

pombredanne committed 57a744e Merge

Merged latest from Matt into default

  • Parent commits 49b47f4, 55f9c48

Files changed (22)

 303cef16ed5e01e6ab681a8adb9497ad00be02c4 2.3.2
 dc819b811cf9361865d2068f4b675e198fce38f2 2.4
 efc7fb29500f792cdd308b990ebbb6a5ceecba22 2.5
+00a347c14207793d07ee69a8575ab40b693c1eaa 2.5.1

File src/whoosh/analysis/analyzers.py

     >>> [token.text for token in ana("Por el mar corren las liebres")]
     ['mar', 'corr', 'liebr']
 
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stemmer` and
+    :func:`whoosh.lang.has_stopwords` to check if a given language has a
+    stemming function and/or stop word list available.
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param gaps: If True, the tokenizer *splits* on the expression, rather
         than matching on the expression.
     """
 
     from whoosh.lang import NoStemmer, NoStopWords
-    from whoosh.lang import stopwords_for_language
 
     # Make the start of the chain
     chain = (RegexTokenizer(expression=expression, gaps=gaps)
 
     # Add a stop word filter
     try:
-        stoplist = stopwords_for_language(lang)
-        chain = chain | StopFilter(stoplist=stoplist)
+        chain = chain | StopFilter(lang=lang)
     except NoStopWords:
         pass
 

File src/whoosh/analysis/filters.py

     """Marks "stop" words (words too common to index) in the stream (and by
     default removes them).
 
-    >>> rext = RegexTokenizer()
-    >>> stream = rext("this is a test")
-    >>> stopper = StopFilter()
-    >>> [token.text for token in stopper(stream)]
-    ["this", "test"]
+    Make sure you precede this filter with a :class:`LowercaseFilter`.
+
+    >>> stopper = RegexTokenizer() | StopFilter()
+    >>> [token.text for token in stopper(u"this is a test")]
+    ["test"]
+    >>> es_stopper = RegexTokenizer() | StopFilter(lang="es")
+    >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")]
+    ["lapiz", "mesa"]
+
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stopwords` to check if a given language
+    has a stop word list available.
     """
 
-    __inittypes__ = dict(stoplist=list, minsize=int, maxsize=int,
-                         renumber=bool)
-
     def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None,
-                 renumber=True):
+                 renumber=True, lang=None):
         """
         :param stoplist: A collection of words to remove from the stream.
             This is converted to a frozenset. The default is a list of
             common English stop words.
         :param minsize: The minimum length of token texts. Tokens with
-            text smaller than this will be stopped.
+            text smaller than this will be stopped. The default is 2.
         :param maxsize: The maximum length of token texts. Tokens with text
             larger than this will be stopped. Use None to allow any length.
         :param renumber: Change the 'pos' attribute of unstopped tokens
             to reflect their position with the stopped words removed.
+        :param lang: Automatically include the stop word list for the given
+            language, merged with ``stoplist``.
         """
 
-        if stoplist is None:
-            self.stops = frozenset()
-        else:
-            self.stops = frozenset(stoplist)
+        stops = set()
+        if stoplist:
+            stops.update(stoplist)
+        if lang:
+            from whoosh.lang import stopwords_for_language
+
+            stops.update(stopwords_for_language(lang))
+
+        self.stops = frozenset(stops)
         self.min = minsize
         self.max = maxsize
         self.renumber = renumber
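
A minimal sketch of the merged behaviour; the extra stop word "spam" is purely illustrative:

    from whoosh import analysis

    # Words from the explicit stoplist and from the English stop word list
    # are merged into a single frozenset before filtering
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter(stoplist=["spam"], lang="en"))
    print([t.text for t in ana(u"this spam is a test")])  # ['test']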

File src/whoosh/analysis/intraword.py

             return tk
 
         for token in tokens:
-            buf.append(token.copy())
-            if len(buf) == size:
-                atleastone = True
-                yield make_token()
-                buf.popleft()
+            if not token.stopped:
+                buf.append(token.copy())
+                if len(buf) == size:
+                    atleastone = True
+                    yield make_token()
+                    buf.popleft()
 
         # If no shingles were emitted, that is, the token stream had fewer than
         # 'size' tokens, then emit a single token with whatever tokens there
         # were
-        if not atleastone:
+        if not atleastone and buf:
             yield make_token()
 
 

File src/whoosh/analysis/morph.py

 
     >>> stemfilter = StemFilter(stem_function)
 
+    You can also use one of the Snowball stemming functions by passing the
+    `lang` keyword argument.
+
+    >>> stemfilter = StemFilter(lang="ru")
+
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stemmer` to check if a given language has
+    a stemming function available.
+
     By default, this class wraps an LRU cache around the stemming function. The
     ``cachesize`` keyword argument sets the size of the cache. To make the
     cache unbounded (the class caches every input), use ``cachesize=-1``. To
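
A sketch of a stemming chain built with the new ``lang`` keyword (the stemmed forms shown are typical Snowball English output):

    from whoosh import analysis

    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StemFilter(lang="en"))
    print([t.text for t in ana(u"rendering shaded surfaces")])
    # e.g. ['render', 'shade', 'surfac']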

File src/whoosh/analysis/tokenizers.py

     def __init__(self, expression="[^/]+"):
         self.expr = rcompile(expression)
 
-    def __call__(self, value, **kwargs):
-        assert isinstance(value, text_type), "%r is not unicode" % value
-        token = Token(**kwargs)
-        for match in self.expr.finditer(value):
-            token.text = value[:match.end()]
-            yield token
+    def __call__(self, value, positions=False, start_pos=0, **kwargs):
+        assert isinstance(value, text_type), "%r is not unicode" % value
+        token = Token(positions, **kwargs)
+        pos = start_pos
+        for match in self.expr.finditer(value):
+            token.text = value[:match.end()]
+            if positions:
+                token.pos = pos
+                pos += 1
+            yield token
+
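
A quick sketch of what the reworked tokenizer yields with ``positions=True`` (output shown for illustration):

    from whoosh import analysis

    pt = analysis.PathTokenizer()
    print([(t.text, t.pos) for t in pt(u"/alfa/bravo/charlie", positions=True)])
    # [('/alfa', 0), ('/alfa/bravo', 1), ('/alfa/bravo/charlie', 2)]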

File src/whoosh/columns.py

 
             self._read_lengths()
             # Create an array of offsets into the strings using the lengths
-            offsets = array("i", (0,))
+            offsets = array("L", (0,))
             for length in self._lengths:
                 offsets.append(offsets[-1] + length)
             self._offsets = offsets
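
The offsets are a running sum of the stored lengths; the "L" (unsigned long) typecode gives them more headroom than the signed "i" typecode. A standalone sketch of the same construction:

    from array import array

    lengths = [3, 10, 7]
    offsets = array("L", (0,))
    for length in lengths:
        offsets.append(offsets[-1] + length)
    print(list(offsets))  # [0, 3, 13, 20]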

File src/whoosh/fields.py

         self.signed = signed
         self.analyzer = analysis.IDAnalyzer()
         self.format = formats.Existence(field_boost=field_boost)
-
-        # Calculate the minimum and maximum possible values for error checking
-        self.min_value = from_sortable(numtype, bits, signed, 0)
-        self.max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1)
+        self.min_value, self.max_value = self._min_max()
 
         # Column configuration
         if default is None:
     def __setstate__(self, d):
         self.__dict__.update(d)
         self._struct = struct.Struct(">" + self.sortable_typecode)
+        if "min_value" not in d:
+            self.min_value, self.max_value = self._min_max()
+
+    def _min_max(self):
+        numtype = self.numtype
+        bits = self.bits
+        signed = self.signed
+
+        # Calculate the minimum and maximum possible values for error checking
+        min_value = from_sortable(numtype, bits, signed, 0)
+        max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1)
+
+        return min_value, max_value
 
     def default_column(self):
         return columns.NumericColumn(self.sortable_typecode,
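
For the default signed 32-bit integer field the computed bounds are the familiar two's-complement limits. A sketch using the same helper (assuming ``from_sortable`` is imported from ``whoosh.util.numeric``, as in current sources):

    from whoosh.util.numeric import from_sortable

    bits, signed = 32, True
    print(from_sortable(int, bits, signed, 0))              # -2147483648
    print(from_sortable(int, bits, signed, 2 ** bits - 1))  # 2147483647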

File src/whoosh/highlight.py

         ec = self.endchar
         fsc = fragment.startchar
         fec = fragment.endchar
-        return (fsc > sc and fsc < ec) or (fec > sc and fec < ec)
+        return (sc < fsc < ec) or (sc < fec < ec)
 
     def overlapped_length(self, fragment):
         sc = self.startchar
     """Doesn't fragment the token stream. This object just returns the entire
     stream as one "fragment". This is useful if you want to highlight
     the entire text.
+
+    Note that even if you use the `WholeFragmenter`, the highlight code will
+    return no fragment if no terms matched in the given field. To return the
+    whole fragment even in that case, call `highlights()` with `minscore=0`::
+
+        # Query where no terms match in the "text" field
+        q = query.Term("tag", "new")
+
+        r = mysearcher.search(q)
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        # Since no terms in the "text" field matched, we get no fragments back
+        assert r[0].highlights("text") == ""
+
+        # If we lower the minimum score to 0, we get a fragment even though it
+        # has no matching terms
+        assert r[0].highlights("text", minscore=0) == "This is the text field."
+
     """
 
     def __init__(self, charlimit=DEFAULT_CHARLIMIT):
 
 def SCORE(fragment):
     "Sorts higher scored passages first."
-    return None
+    return 1
 
 
 def FIRST(fragment):
 def top_fragments(fragments, count, scorer, order, minscore=1):
     scored_fragments = ((scorer(f), f) for f in fragments)
     scored_fragments = nlargest(count, scored_fragments)
-    best_fragments = [sf for score, sf in scored_fragments if score > minscore]
+    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
     best_fragments.sort(key=order)
     return best_fragments
 
                     assert m.id() == docnum
                     cache[docnum][text] = m.value_as("characters")
 
-    def highlight_hit(self, hitobj, fieldname, text=None, top=3):
+    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
         results = hitobj.results
         schema = results.searcher.schema
         field = schema[fieldname]
         # Convert bytes to unicode
         words = frozenset(from_bytes(term[1]) for term in bterms)
 
-        if not words:
-            # No terms matches in this field
-            return self.formatter.format([])
+        # if not words:
+        #     # No terms matches in this field
+        #     return self.formatter.format([])
 
         # If we can do "pinpoint" highlighting...
         if self.can_load_chars(results, fieldname):
         else:
             # Retokenize the text
             analyzer = results.searcher.schema[fieldname].analyzer
-            tokens = analyzer(text, chars=True, mode="query",
+            tokens = analyzer(text, positions=True, chars=True, mode="query",
                               removestops=False)
             # Set Token.matched attribute for tokens that match a query term
             tokens = set_matched_filter(tokens, words)
             fragments = self.fragmenter.fragment_tokens(text, tokens)
 
-        fragments = top_fragments(fragments, top, self.scorer, self.order)
+        fragments = top_fragments(fragments, top, self.scorer, self.order,
+                                  minscore=minscore)
         output = self.formatter.format(fragments)
         return output

File src/whoosh/lang/__init__.py

 
 # Getter functions
 
+def has_stemmer(lang):
+    try:
+        return bool(stemmer_for_language(lang))
+    except NoStemmer:
+        return False
+
+
+def has_stopwords(lang):
+    try:
+        return bool(stopwords_for_language(lang))
+    except NoStopWords:
+        return False
+
+
 def stemmer_for_language(lang):
     if lang == "en_porter":
         # Original porter stemming algorithm is several times faster than the
     if tlc in snowball_classes:
         return snowball_classes[tlc]().stem
 
-    raise Exception("No stemmer available for %r" % lang)
+    raise NoStemmer("No stemmer available for %r" % lang)
 
 
 def stopwords_for_language(lang):
     if tlc in stoplists:
         return stoplists[tlc]
 
-    raise Exception("No stop-word list available for %r" % lang)
+    raise NoStopWords("No stop-word list available for %r" % lang)
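
A quick sketch of the new helpers (return values shown for illustration):

    from whoosh.lang import has_stemmer, has_stopwords, languages

    print(has_stemmer("ru"))    # True  -- a Snowball Russian stemmer exists
    print(has_stopwords("ru"))  # True  -- "ru" has an entry in stoplists
    print(has_stemmer("xx"))    # False -- NoStemmer is caught internally
    print(sorted(lang for lang in languages if has_stopwords(lang)))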

File src/whoosh/lang/stopwords.py

 # coding=utf-8
 
+from __future__ import unicode_literals
+
 # Stopwords Corpus
 #
 # This module contains lists of stop words for several languages.  These
 
 
 stoplists = {
-    "da": frozenset(u"""
+    "da": frozenset("""
     og i jeg det at en den til er som på de med han af for ikke der var mig
     sig men et har om vi min havde ham hun nu over da fra du ud sin dem os
     op man hans hvor eller hvad skal selv her alle vil blev kunne ind når
     hvis din nogle hos blive mange ad bliver hendes været thi jer sådan
     """.split()),
 
-    "nl": frozenset(u"""
+    "nl": frozenset("""
     de en van ik te dat die in een hij het niet zijn is was op aan met als
     voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich
     bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u
     iemand geweest andere
     """.split()),
 
-    "en": frozenset(u"""
+    "en": frozenset("""
     i me my myself we our ours ourselves you your yours yourself yourselves
     he him his himself she her hers herself it its itself they them their
     theirs themselves what which who whom this that these those am is are
     same so than too very s t can will just don should now
     """.split()),
 
-    "fi": frozenset(u"""
+    "fi": frozenset("""
     olla olen olet on olemme olette ovat ole oli olisi olisit olisin
     olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet
     en et ei emme ette eivät minä minun minut minua minussa minusta minuun
     noin poikki yli kun niin nyt itse
     """.split()),
 
-    "fr": frozenset(u"""
+    "fr": frozenset("""
     au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma
     mais me même mes moi mon ne nos notre nous on ou par pas pour qu que
     qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l
     eusses eût eussions eussiez eussent
     """.split()),
 
-    "de": frozenset(u"""
+    "de": frozenset("""
     aber alle allem allen aller alles als also am an ander andere anderem
     anderen anderer anderes anderm andern anderr anders auch auf aus bei bin
     bis bist da damit dann der den des dem die das daß derselbe derselben
     wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen
     """.split()),
 
-    "hu": frozenset(u"""
+    "hu": frozenset("""
     a ahogy ahol aki akik akkor alatt által általában amely amelyek
     amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át
     abban ahhoz annak arra arról az azok azon azt azzal azért aztán
     voltunk vissza vele viszont volna
     """.split()),
 
-    "it": frozenset(u"""
+    "it": frozenset("""
     ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli
     dall dagl dalla dalle di del dello dei degli dell degl della delle in
     nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull
     stesse stessimo stessero stando
     """.split()),
 
-    "no": frozenset(u"""
+    "no": frozenset("""
     og i jeg det at en et den til er som på de med han av ikke ikkje der
     så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved
     fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl
     sia sidan so somt somme um upp vere vore verte vort varte vart
     """.split()),
 
-    "pt": frozenset(u"""
+    "pt": frozenset("""
     de a o que e do da em um para com não uma os no se na por mais as dos
     como mas ao ele das à seu sua ou quando muito nos já eu também só
     pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse
     teria teríamos teriam
     """.split()),
 
-    "ru": frozenset(u"""
+    "ru": frozenset("""
     и в во не что он на я с со как а то все она
     так его но да ты к у же вы за бы по только
     ее мне было вот от меня еще нет о из ему
     всегда конечно всю между
     """.split()),
 
-    "es": frozenset(u"""
+    "es": frozenset("""
     de la que el en y a los del se las por un para con no una su al lo como
     más pero sus le ya o este sí porque esta entre cuando muy sin sobre
     también me hasta hay donde quien desde todo nos durante todos uno les
     tenidas tened
     """.split()),
 
-    "sv": frozenset(u"""
+    "sv": frozenset("""
     och det att i en jag hon som han på den med var sig för så till är
     men ett om hade de av icke mig du henne då sin nu har inte hans honom
     skulle hennes där min man ej vid kunde något från ut när efter upp
     vilket sitta sådana vart dina vars vårt våra ert era vilkas
     """.split()),
 
-    "tr": frozenset(u"""
+    "tr": frozenset("""
     acaba ama aslında az bazı belki biri birkaç birşey biz bu çok
     çünkü da daha de defa diye eğer en gibi hem hep hepsi her hiç için
     ile ise kez ki kim mı mu mü nasıl ne neden nerde nerede nereye niçin

File src/whoosh/query/compound.py

     """
 
     def __init__(self, subqueries, boost=1.0):
+        for subq in subqueries:
+            if not isinstance(subq, qcore.Query):
+                raise qcore.QueryError("%r is not a query" % subq)
         self.subqueries = subqueries
         self.boost = boost
 
 
     def __unicode__(self):
         r = u("(")
-        r += (self.JOINT).join([text_type(s) for s in self.subqueries])
+        r += self.JOINT.join([text_type(s) for s in self.subqueries])
         r += u(")")
         return r
 
     __str__ = __unicode__
 
     def __eq__(self, other):
-        return other and self.__class__ is other.__class__ and\
-        self.subqueries == other.subqueries and\
-        self.boost == other.boost
+        return (other
+                and self.__class__ is other.__class__
+                and self.subqueries == other.subqueries
+                and self.boost == other.boost)
 
     def __getitem__(self, i):
         return self.subqueries.__getitem__(i)
         subqs = []
         seenqs = set()
         for s in subqueries:
-            if (not isinstance(s, Every) and s.field() in everyfields):
+            if not isinstance(s, Every) and s.field() in everyfields:
                 continue
             if s in seenqs:
                 continue
             raise ValueError("Unknown matcher_type %r" % self.matcher_type)
 
         return cls(subs, boost=self.boost, minmatch=self.minmatch,
-                    scale=self.scale).matcher(searcher, context)
+                   scale=self.scale).matcher(searcher, context)
 
 
 class DefaultOr(Or):

File src/whoosh/query/qcore.py

 
         return iter(())
 
-    def expanded_terms(self, ixreader):
-        return self.terms()
+    def expanded_terms(self, ixreader, phrases=True):
+        return self.terms(phrases=phrases)
 
     def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None):
         """Returns a set of all byteterms in this query tree that exist in
                 continue
 
             if expand:
-                terms = q.expanded_terms(ixreader)
+                terms = q.expanded_terms(ixreader, phrases=phrases)
             else:
-                terms = q.terms(phrases)
+                terms = q.terms(phrases=phrases)
 
             for fieldname, text in terms:
                 if (fieldname, text) in termset:

File src/whoosh/query/terms.py

     def _btexts(self, ixreader):
         raise NotImplementedError(self.__class__.__name__)
 
-    def expanded_terms(self, ixreader):
+    def expanded_terms(self, ixreader, phrases=False):
         fieldname = self.field()
         if fieldname:
             for btext in self._btexts(ixreader):

File src/whoosh/searching.py

             raise NoTermsException
         return self.results.docterms[self.docnum]
 
-    def highlights(self, fieldname, text=None, top=3):
+    def highlights(self, fieldname, text=None, top=3, minscore=1):
         """Returns highlighted snippets from the given field::
 
             r = searcher.search(myquery)
             access to the text another way (for example, loading from a file or
             a database), you can supply it using the ``text`` parameter.
         :param top: the maximum number of fragments to return.
+        :param minscore: the minimum score for fragments to appear in the
+            highlights.
         """
 
         hliter = self.results.highlighter
-        return hliter.highlight_hit(self, fieldname, text=text, top=top)
+        return hliter.highlight_hit(self, fieldname, text=text, top=top,
+                                    minscore=minscore)
 
     def more_like_this(self, fieldname, text=None, top=10, numterms=5,
                        model=classify.Bo1Model, normalize=True, filter=None):
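
A sketch of the new keyword in the style of the docstring above (``searcher`` and ``myquery`` as in that example):

    r = searcher.search(myquery)
    for hit in r:
        # minscore=0 returns a snippet even when every fragment scores
        # below the default threshold of 1
        print(hit.highlights("content", minscore=0))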

File tests/test_analysis.py

 import pytest
 
 from whoosh import analysis, fields, qparser
-from whoosh.compat import u, unichr
+from whoosh.compat import b, u, unichr
 from whoosh.compat import dumps
 from whoosh.filedb.filestore import RamStorage
 
                                            "/alfa/bravo/charlie/delta"]
 
 
+def test_path_tokenizer2():
+    path_field = fields.TEXT(analyzer=analysis.PathTokenizer())
+    st = RamStorage()
+    schema = fields.Schema(path=path_field)
+    index = st.create_index(schema)
+
+    with index.writer() as writer:
+        writer.add_document(path=u'/alfa/brvo/charlie/delta/')
+        writer.add_document(path=u'/home/user/file.txt')
+    assert not index.is_empty()
+
+    with index.reader() as reader:
+        items = list(reader.all_terms())
+    assert 'path' in [field for field, value in items]
+    assert b('/alfa') in [value for field, value in items]
+
+
 def test_composition1():
     ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
     assert ca.__class__.__name__ == "CompositeAnalyzer"
                       ("lm", 14, 16)]
 
 
+@pytest.mark.skipif("sys.version_info < (2,6)")
 def test_language_analyzer():
     domain = [("da", u("Jeg gik mig over s\xf8 og land"),
                [u('gik'), u('s\xf8'), u('land')]),
         assert words == target
 
 
+@pytest.mark.skipif("sys.version_info < (2,6)")
 def test_la_pickleability():
     ana = analysis.LanguageAnalyzer("en")
     _ = dumps(ana, -1)
     _ = dumps(ana, -1)
 
 
+def test_shingle_stopwords():
+    # Note that the stop list is None here
+    ana = (analysis.RegexTokenizer()
+           | analysis.StopFilter(stoplist=None, minsize=3)
+           | analysis.ShingleFilter(size=3))
+
+    texts = [t.text for t
+             in ana(u("some other stuff and then some things To Check     "))]
+    assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then",
+                     "and-then-some", "then-some-things", "some-things-Check"]
+
+    # Use a stop list here
+    ana = (analysis.RegexTokenizer()
+           | analysis.LowercaseFilter()
+           | analysis.StopFilter()
+           | analysis.ShingleFilter(size=3))
+
+    texts = [t.text for t
+             in ana(u("some other stuff and then some things To Check     "))]
+    assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some",
+                     "then-some-things", "some-things-check"]
+
+
+def test_biword_stopwords():
+    # Note that the stop list is None here
+    ana = (analysis.RegexTokenizer()
+           | analysis.StopFilter(stoplist=None, minsize=3)
+           | analysis.BiWordFilter())
+
+    texts = [t.text for t in ana(u("stuff and then some"))]
+    assert texts == ["stuff-and", "and-then", "then-some"]
+
+    # Use a stop list here
+    ana = (analysis.RegexTokenizer()
+           | analysis.LowercaseFilter()
+           | analysis.StopFilter()
+           | analysis.BiWordFilter())
+
+    texts = [t.text for t in ana(u("stuff and then some"))]
+    assert texts == ["stuff-then", "then-some"]
+
+
+@pytest.mark.skipif("sys.version_info < (2,6)")
+def test_stop_lang():
+    stopper = analysis.RegexTokenizer() | analysis.StopFilter()
+    ls = [token.text for token in stopper(u("this is a test"))]
+    assert ls == [u("test")]
+
+    es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
+    ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))]
+    assert ls == ["lapiz", "mesa"]

File tests/test_collector.py

         assert len(r) == 0
 
 
-def test_daterange_matched_terms():
-    from whoosh.qparser import GtLtPlugin
-    from datetime import datetime
 
-    schema = fields.Schema(id=fields.KEYWORD(stored=True),
-                           body=fields.TEXT,
-                           num=fields.NUMERIC(stored=True, unique=True),
-                           created=fields.DATETIME(stored=True))
-    ix = RamStorage().create_index(schema)
 
-    with ix.writer() as w:
-        w.add_document(id=u"one", body=u"this and this", num='5',
-                       created=datetime.now())
-        w.add_document(id=u"three", body=u"that and that", num='7',
-                       created=datetime.now())
-        w.add_document(id=u"two", body=u"this and that", num='6',
-                       created=datetime.now())
-
-    with ix.searcher() as s:
-        parser = qparser.QueryParser("body", ix.schema)
-        parser.add_plugin(GtLtPlugin())
-        q = parser.parse(u"created:>='2013-07-01'")
-        r = s.search(q, terms=True)
-
-        assert r.has_matched_terms()
-        termlist = r[0].matched_terms()
-        assert len(termlist) == 1
-        pair = termlist[0]
-        assert pair[0] == "created"
-        assert pair[1] == b("(\x00\x00\x00\x00\x00\x80\xe1\xa3")
-

File tests/test_columns.py

 
             assert len(w) == 2
             assert issubclass(w[-1].category, UserWarning)
-    else:
-        rw(65537)

File tests/test_highlighting.py

                                  fragmenter=highlight.ContextFragmenter(),
                                  formatter=highlight.UppercaseFormatter())
     assert result == "INDEXED!\n1"
+
+
+def test_whole_noterms():
+    schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"),
+                       tag=u("foo"))
+
+    with ix.searcher() as s:
+        r = s.search(query.Term("text", u("delta")))
+        assert len(r) == 1
+
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        hi = r[0].highlights("text")
+        assert hi == u("alfa bravo charlie DELTA echo foxtrot golf")
+
+        r = s.search(query.Term("tag", u("foo")))
+        assert len(r) == 1
+        r.fragmenter = highlight.WholeFragmenter()
+        r.formatter = highlight.UppercaseFormatter()
+        hi = r[0].highlights("text")
+        assert hi == u("")
+
+        hi = r[0].highlights("text", minscore=0)
+        assert hi == u("alfa bravo charlie delta echo foxtrot golf")

File tests/test_indexing.py

 
     with TempIndex(schema, "globlenmerge") as ix:
         with ix.writer() as w:
-            w.add_document(title=u"First document", path=u"/a",
-                           content_text=u"This is the first document we've added!")
+            w.add_document(title=u("First document"), path=u("/a"),
+                           content_text=u("This is the first document we've added!"))
 
         with ix.writer() as w:
-            w.add_document(title=u"Second document", path=u"/b",
-                           content_text=u"The second document is even more interesting!")
+            w.add_document(title=u("Second document"), path=u("/b"),
+                           content_text=u("The second document is even more interesting!"))
 
         with ix.searcher() as s:
             docnum = s.document_number(path="/a")

File tests/test_queries.py

 from __future__ import with_statement
 import copy
 
+import pytest
+
 from whoosh import fields, qparser, query
 from whoosh.compat import b, u
 from whoosh.filedb.filestore import RamStorage
         r2 = [hit["id"] for hit in s.search(q2, sortedby="id")]
 
         assert r1 == r2 == [4]
+
+
+def test_none_in_compounds():
+    with pytest.raises(query.QueryError):
+        _ = query.And([query.Term("a", "b"), None, query.Term("c", "d")])
+

File tests/test_searching.py

+#encoding: utf-8
+
 from __future__ import with_statement
 import copy
 from datetime import datetime, timedelta