Commits

Matt Chaput committed e4ef1fa Merge

Merging bug fixes from default branch.

Files changed (14)

src/whoosh/collectors.py

 """
 
 import os
-import signal
 import threading
 from array import array
 from bisect import insort
     is slow the search could exceed the time limit.
     """
 
-    def __init__(self, child, timelimit, greedy=False):
+    def __init__(self, child, timelimit, greedy=False, use_alarm=True):
         """
         :param child: the collector to wrap.
         :param timelimit: the maximum amount of time (in seconds) to
             raise a ``TimeLimit`` exception.
         :param greedy: if ``True``, the collector will finish adding the most
             recent hit before raising the ``TimeLimit`` exception.
+        :param use_alarm: if ``True`` (the default), the collector will try to
+            use signal.SIGALRM (on UNIX).
         """
         self.child = child
         self.timelimit = timelimit
         self.greedy = greedy
-        self.use_alarm = hasattr(signal, "SIGALRM")
+
+        if use_alarm:
+            import signal
+            self.use_alarm = use_alarm and hasattr(signal, "SIGALRM")
+        else:
+            self.use_alarm = False
+
+        self.timer = None
+        self.timedout = False
 
     def prepare(self, top_searcher, q, context):
         self.child.prepare(top_searcher, q, context)
 
         self.timedout = False
         if self.use_alarm:
+            import signal
             signal.signal(signal.SIGALRM, self._was_signaled)
 
         # Start a timer thread. If the timer fires, it will call this object's
         self.timedout = True
 
         if self.use_alarm:
+            import signal
             os.kill(os.getpid(), signal.SIGALRM)
 
     def _was_signaled(self, signum, frame):
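
For context, a minimal usage sketch of the wrapped collector with the new use_alarm flag, following the pattern in the Whoosh documentation (ix and myquery stand in for an existing index and an already-parsed query):

    from whoosh.collectors import TimeLimitCollector, TimeLimit

    with ix.searcher() as s:
        # Wrap a normal collector; pass use_alarm=False where SIGALRM is
        # unavailable or undesirable (e.g. non-main threads, Windows).
        c = s.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=0.5, use_alarm=False)
        try:
            s.search_with_collector(myquery, tlc)
        except TimeLimit:
            pass  # whatever was collected before the timeout is still usable
        results = tlc.results()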

src/whoosh/fields.py

         this behavior.
         """
 
-        wordset = sorted(set(token.text for token
-                             in self.analyzer(value, no_morph=True)))
-        return iter(wordset)
+        if isinstance(value, (list, tuple)):
+            words = value
+        else:
+            words = [token.text for token
+                     in self.analyzer(value, no_morph=True)]
+
+        return iter(sorted(set(words)))
 
     def has_morph(self):
         """Returns True if this field by default performs morphological

src/whoosh/highlight.py

     tokens = analyzer(text, chars=True, mode=mode, removestops=False)
     tokens = set_matched_filter(tokens, termset)
     fragments = fragmenter.fragment_tokens(text, tokens)
-    fragments = top_fragments(fragments, top, scorer, order)
+    fragments = top_fragments(fragments, top, scorer, order, minscore)
     return formatter(text, fragments)
 
 
         # Convert bytes to unicode
         words = frozenset(from_bytes(term[1]) for term in bterms)
 
-        # if not words:
-        #     # No terms matches in this field
-        #     return self.formatter.format([])
-
         # If we can do "pinpoint" highlighting...
         if self.can_load_chars(results, fieldname):
             # Build the docnum->[(startchar, endchar),] map

src/whoosh/qparser/plugins.py

         return node
 
     def filters(self, parser):
-        return [(self.clean_boost, 0), (self.do_boost, 700)]
+        return [(self.clean_boost, 0), (self.do_boost, 510)]
 
     def clean_boost(self, parser, group):
         """This filter finds any BoostNodes in positions where they can't boost
     """, verbose=True)
 
     class FuzzinessNode(syntax.SyntaxNode):
-        def __init__(self, maxdist, prefix, original):
+        def __init__(self, maxdist, prefixlength, original):
             self.maxdist = maxdist
-            self.prefix = prefix
+            self.prefixlength = prefixlength
             self.original = original
 
         def __repr__(self):
-            return "<~%d>" % (self.maxdist,)
+            return "<~%d/%d>" % (self.maxdist, self.prefixlength)
 
     class FuzzyTermNode(syntax.TextNode):
         qclass = query.FuzzyTerm
 
-        def __init__(self, wordnode, maxdist, prefix):
+        def __init__(self, wordnode, maxdist, prefixlength):
             self.fieldname = wordnode.fieldname
             self.text = wordnode.text
             self.boost = wordnode.boost
             self.startchar = wordnode.startchar
             self.endchar = wordnode.endchar
             self.maxdist = maxdist
-            self.prefix = prefix
+            self.prefixlength = prefixlength
 
         def r(self):
-            return "%r ~%d" % (self.text, self.maxdist)
+            return "%r ~%d/%d" % (self.text, self.maxdist, self.prefixlength)
 
         def query(self, parser):
             # Use the superclass's query() method to create a FuzzyTerm query
             q = syntax.TextNode.query(self, parser)
             # Set FuzzyTerm-specific attributes
             q.maxdist = self.maxdist
-            q.prefix = self.prefix
+            q.prefixlength = self.prefixlength
             return q
 
     def create(self, parser, match):
         maxdist = int(mdstr) if mdstr else 1
 
         pstr = match.group("prefix")
-        prefix = int(pstr) if pstr else 0
+        prefixlength = int(pstr) if pstr else 0
 
-        return self.FuzzinessNode(maxdist, prefix, match.group(0))
+        return self.FuzzinessNode(maxdist, prefixlength, match.group(0))
 
     def filters(self, parser):
         return [(self.do_fuzzyterms, 0)]
                 nextnode = group[i + 1]
                 if isinstance(nextnode, self.FuzzinessNode):
                     node = self.FuzzyTermNode(node, nextnode.maxdist,
-                                              nextnode.prefix)
+                                              nextnode.prefixlength)
                     i += 1
             if isinstance(node, self.FuzzinessNode):
                 node = syntax.to_word(node)
                 and isinstance(group[i + 1], syntax.GroupNode)):
                 nextnode = group[i + 1]
                 node.nodes = list(self.do_functions(parser, nextnode))
+
+                if nextnode.boost != 1:
+                    node.set_boost(nextnode.boost)
+
                 i += 1
             elif isinstance(node, syntax.GroupNode):
                 node = self.do_functions(parser, node)
         return [(FnTagger(self.expr, self.QuoteNode, "quote"), 0)]
 
     def filters(self, parser):
-        return [(self.do_quotes, 650)]
+        return [(self.do_quotes, 550)]
 
     def do_quotes(self, parser, group):
         # New group to copy nodes into
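
The FuzzyTermPlugin changes above rename prefix to prefixlength and carry it through the ~maxdist/prefixlength query syntax. A minimal parse-only sketch, assuming a plain TEXT field named content (indexing and searching with this syntax is exercised by test_fuzzy_prefix below):

    from whoosh import fields
    from whoosh.qparser import QueryParser, FuzzyTermPlugin

    schema = fields.Schema(content=fields.TEXT)
    parser = QueryParser("content", schema)
    parser.add_plugin(FuzzyTermPlugin())

    # "first" within 2 edits, requiring the first 3 characters to match
    q = parser.parse(u"first~2/3")
    assert q.maxdist == 2 and q.prefixlength == 3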

src/whoosh/qparser/syntax.py

 
     def query(self, parser):
         assert len(self.nodes) == 2
-        q = self.qclass(self.nodes[0].query(parser),
-                        self.nodes[1].query(parser))
+
+        qa = self.nodes[0].query(parser)
+        qb = self.nodes[1].query(parser)
+        if qa is None and qb is None:
+            q = query.NullQuery
+        elif qa is None:
+            q = qb
+        elif qb is None:
+            q = qa
+        else:
+            q = self.qclass(qa, qb)
+
         return attach(q, self)
 
 

src/whoosh/query/positional.py

                               self.slop, self.ordered, self.boost)
 
     def _and_query(self):
-        return compound.And([terms.Term(self.fieldname, word)
-                             for word in self.words])
+        return compound.And(self.subqueries)
 
     def estimate_size(self, ixreader):
         return self._and_query().estimate_size(ixreader)

src/whoosh/searching.py

 
         if not self.has_matched_terms():
             raise NoTermsException
-
         return set(self.termdocs.keys())
 
     def _get_fragmenter(self):
 
         if not self.results.has_matched_terms():
             raise NoTermsException
-        return self.results.docterms[self.docnum]
+        return self.results.docterms.get(self.docnum, [])
 
     def highlights(self, fieldname, text=None, top=3, minscore=1):
         """Returns highlighted snippets from the given field::

src/whoosh/writing.py

         for fieldname, fieldobj in self.schema.items():
             if (fieldobj.separate_spelling()
                 and reader.has_word_graph(fieldname)):
-                for word in reader.word_graph(fieldname).flatten():
+
+                gr = reader._get_graph()
+                cursor = gr.cursor(fieldname)
+                for word in cursor.flatten():
                     # Adding a post where docnum=None marks it as a separate
                     # spelling word
                     add_post((fieldname, word, -1, -1, emptybytes))
         clean_files(self.storage, self.indexname, self.generation, segments)
 
     def _finish(self):
+        self._tempstorage.destroy()
         if self.writelock:
             self.writelock.release()
-        self._tempstorage.destroy()
         self.is_closed = True
         #self.storage.close()
 

tests/test_parse_plugins.py

     assert q.text == "bob~"
 
 
+def test_fuzzy_prefix():
+    from whoosh import scoring
+
+    schema = fields.Schema(title=fields.TEXT(stored=True),
+                           content=fields.TEXT(spelling=True))
+
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        # Match -> first
+        w.add_document(title=u("First"),
+                       content=u("This is the first document we've added!"))
+        # No match
+        w.add_document(title=u("Second"),
+                       content=u("The second one is even more interesting! filst"))
+        # Match -> first
+        w.add_document(title=u("Third"),
+                       content=u("The world first line we've added!"))
+        # Match -> zeroth
+        w.add_document(title=u("Fourth"),
+                       content=u("The second one is alaways comes after zeroth!"))
+        # Match -> fire is within 2 edits (substitution + deletion) of first
+        w.add_document(title=u("Fifth"),
+                       content=u("The fire is beautiful"))
+
+    from whoosh.qparser import QueryParser, FuzzyTermPlugin
+    parser = QueryParser("content", ix.schema)
+    parser.add_plugin(FuzzyTermPlugin())
+    q = parser.parse("first~2/3 OR zeroth", debug=False)
+
+    assert isinstance(q, query.Or)
+    ft = q[0]
+    assert isinstance(ft, query.FuzzyTerm)
+    assert ft.maxdist == 2
+    assert ft.prefixlength == 3
+
+    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
+        results = searcher.search(q)
+        assert len(results) == 4
+        assert " ".join(hit["title"] for hit in results) == "Fourth First Third Fifth"
+
+
 def test_function_plugin():
     class FakeQuery(query.Query):
         def __init__(self, children, *args, **kwargs):
     assert q[1].slop == 2
     assert q[1][1].__class__ == query.FuzzyTerm
     assert q[1][1].maxdist == 3
+
+
+def test_sequence_andmaybe():
+    qp = default.QueryParser("f", None)
+    qp.remove_plugin_class(plugins.PhrasePlugin)
+    qp.add_plugins([plugins.FuzzyTermPlugin(), plugins.SequencePlugin()])
+
+    q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"'))
+    assert isinstance(q, query.AndMaybe)
+    assert q[0] == query.Term("f", u("Dahmen"))
+    assert q[1] == query.Sequence([query.Term("f", u("Besov")),
+                                   query.Term("f", u("Spaces"))])
+

tests/test_parsing.py

 
 def test_unicode_num():
     schema = fields.Schema(num=fields.NUMERIC)
-    parser = default.QueryParser(u"num", schema=schema)
-    q = parser.parse(u"num:1")
+    parser = default.QueryParser(u("num"), schema=schema)
+    q = parser.parse(u("num:1"))
 
     _ = text_type(q)
+
+
+def test_phrase_andmaybe():
+    qp = default.QueryParser("f", None)
+
+    q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"'))
+    assert isinstance(q, query.AndMaybe)
+    assert q[0] == query.Term("f", u("Dahmen"))
+    assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")])
+
+
+def test_phrase_boost():
+    qp = default.QueryParser("f", None)
+    q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"^9'))
+    assert isinstance(q, query.AndMaybe)
+    assert q[0] == query.Term("f", u("Dahmen"))
+    assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")], boost=9)
+
+
+def test_andmaybe_none():
+    schema = fields.Schema(f=fields.TEXT, year=fields.NUMERIC)
+    qp = default.QueryParser("f", schema)
+    _ = qp.parse(u("Dahmen ANDMAYBE @year:[2000 TO]"))
+

tests/test_queries.py

 
         assert r1 == r2 == r3 == []
 
+
+def test_sequence():
+    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(id=0, text=u("alfa bravo charlie delta echo"))
+        w.add_document(id=1, text=u("bravo charlie delta echo alfa"))
+        w.add_document(id=2, text=u("charlie delta echo bravo"))
+        w.add_document(id=3, text=u("delta echo charlie"))
+        w.add_document(id=4, text=u("echo delta"))
+
+    with ix.searcher() as s:
+        seq = query.Sequence([query.Term("text", u("echo")),
+                              query.Term("text", u("alfa"))])
+        q = query.And([query.Term("text", u("bravo")), seq])
+
+        r = s.search(q, limit=4)
+        assert len(r) == 1
+        assert r[0]["id"] == 1
+
+
+def test_andmaybe():
+    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(id=0, text=u("alfa bravo charlie delta echo"))
+        w.add_document(id=1, text=u("bravo charlie delta echo alfa"))
+        w.add_document(id=2, text=u("charlie delta echo bravo"))
+        w.add_document(id=3, text=u("delta echo charlie"))
+        w.add_document(id=4, text=u("echo delta"))
+
+    qp = qparser.QueryParser("text", schema)
+    q = qp.parse(u('bravo ANDMAYBE "echo alfa"'))
+
+    with ix.searcher() as s:
+        r = s.search(q)
+        assert len(r) == 3
+        assert [hit["id"] for hit in r] == [1, 2, 0]
+

tests/test_results.py

         assert len(r) == 2
         kts = " ".join(t for t, score in r.key_terms("text"))
         assert kts == "alfa bravo charlie foxtrot delta"
+
+
+def test_every_keywords():
+    schema = fields.Schema(title=fields.TEXT, content=fields.TEXT(stored=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(title=u("alfa"), content=u("bravo"))
+        w.add_document(title=u("charlie"), content=u("delta"))
+
+    with ix.searcher() as s:
+        q = qparser.QueryParser("content", ix.schema).parse("*")
+        assert isinstance(q, query.Every)
+
+        r = s.search(q, terms=True)
+        assert len(r) == 2
+        hit = r[0]
+        assert hit["content"] == "bravo"
+        assert hit.highlights("content") == ""

tests/test_spelling.py

     assert c.format_string(highlight.UppercaseFormatter()) == "dworska"
 
 
+def test_very_long_words():
+    import sys
+    length = int(sys.getrecursionlimit() * 1.5)
+
+    strings1 = [u(chr(i) * length) for i in range(65, 70)]
+    strings2 = [u(chr(i) * length) for i in range(71, 75)]
+
+    ana = analysis.StemmingAnalyzer()
+    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        for string in strings1:
+            w.add_document(text=string)
+
+    with ix.writer() as w:
+        for string in strings2:
+            w.add_document(text=string)
+        w.optimize = True
+
+

tests/test_writing.py

         assert s.doc_count_all() == 1
         assert list(s.reader().lexicon("a")) == [b("bar"), b("baz"), b("foo")]
 
+
+def test_spellable_list():
+    # Make sure a spellable field works with a list of pre-analyzed tokens
+
+    ana = analysis.StemmingAnalyzer()
+    schema = fields.Schema(Location=fields.STORED, Lang=fields.STORED,
+                           Title=fields.TEXT(spelling=True, analyzer=ana))
+    ix = RamStorage().create_index(schema)
+
+    doc = {'Location': '1000/123', 'Lang': 'E',
+           'Title': ['Introduction', 'Numerical', 'Analysis']}
+
+    with ix.writer() as w:
+        w.add_document(**doc)
+
+
+def test_zero_procs():
+    schema = fields.Schema(text=fields.TEXT)
+    ix = RamStorage().create_index(schema)
+    with ix.writer(procs=0) as w:
+        assert isinstance(w, writing.IndexWriter)
+
+    with ix.writer(procs=1) as w:
+        assert isinstance(w, writing.IndexWriter)