Commits

Matt Chaput committed 1a8ab31

Fixed a sorting error. Fixed a bug where the original word could appear in the suggestions.
Updated the docstring to state explicitly that the original word will not be included in the suggestions.
Fixes #174.
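
As a minimal sketch of the fixed behavior (the word list is illustrative;
GraphCorrector.from_word_list() and suggest() appear in the diff below, and
u() is the compat helper the tests use):

    from whoosh.compat import u
    from whoosh.spelling import GraphCorrector

    wordlist = sorted(u("bake bike bone book").split())
    gc = GraphCorrector.from_word_list(wordlist)
    # Before this commit, "book" could come back as a suggestion for
    # itself; suggest() now seeds its ``seen`` set with the input word.
    assert u("book") not in gc.suggest(u("book"), maxdist=2)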


Files changed (3)

src/whoosh/reading.py

         """
 
         from whoosh.spelling import ReaderCorrector
+
         return ReaderCorrector(self, fieldname)
 
     def terms_within(self, fieldname, text, maxdist, prefix=0, seen=None):
             for word in self.expand_prefix(fieldname, text[:prefix]):
                 if word in seen:
                     continue
-                if (word == text
-                    or distance(word, text, limit=maxdist) <= maxdist):
+                k = distance(word, text, limit=maxdist)
+                if k <= maxdist:
                     yield word
                     seen.add(word)
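
A note on this hunk: the input word trivially matches itself (its edit
distance is 0), so removing the old ``word == text`` short-circuit alone
would not keep it out of the results; the real fix is in
``Corrector.suggest()`` below, which now pre-seeds ``seen`` with the
original text so it is skipped before the distance test. A small sketch of
the invariant, assuming the bounded distance the updated tests imply
("koala" is found for "kaola" even at maxdist=1):

    from whoosh.support.levenshtein import distance

    assert distance("koala", "koala", limit=1) == 0  # the word itself
    assert distance("kaola", "koala", limit=1) <= 1  # one transposition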
 

src/whoosh/spelling.py

 from whoosh.support.levenshtein import distance
 
 
-# Suggestion scorers
-
-def simple_scorer(word, cost):
-    """Ranks suggestions by the edit distance.
-    """
-
-    return (cost, 0)
-
+# Corrector objects
 
 class Corrector(object):
     """Base class for spelling correction objects. Concrete sub-classes should
 
     def suggest(self, text, limit=5, maxdist=2, prefix=0):
         """
-        :param text: the text to check.
+        :param text: the text to check. This word will **not** be added to the
+            suggestions, even if it appears in the word graph.
         :param limit: only return up to this many suggestions. If there are not
             enough terms in the field within ``maxdist`` of the given word, the
             returned list will be shorter than this number.
         _suggestions = self._suggestions
 
         heap = []
-        seen = set()
+        seen = set([text])
         for k in xrange(1, maxdist + 1):
             for item in _suggestions(text, k, prefix, seen):
+                # Note that the *higher* scores (item[0]) are better!
                 if len(heap) < limit:
                     heappush(heap, item)
-                elif item < heap[0]:
+                elif item > heap[0]:
                     heapreplace(heap, item)
 
             # If the heap is already at the required length, don't bother going
             if len(heap) >= limit:
                 break
 
-        return [sug for _, sug in sorted(heap)]
+        sugs = sorted(heap, key=lambda item: (0 - item[0], item[1]))
+        return [sug for _, sug in sugs]
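
The selection above is the standard bounded top-k pattern: a min-heap of at
most ``limit`` items whose smallest element is evicted whenever a better
suggestion arrives. A minimal self-contained sketch of the same idea
(function and variable names are illustrative, not part of the commit):

    from heapq import heappush, heapreplace

    def top_k(scored_words, k):
        # Keep the k largest (score, word) pairs; heap[0] is always the
        # smallest item on the min-heap, so any larger item displaces it.
        heap = []
        for item in scored_words:
            if len(heap) < k:
                heappush(heap, item)
            elif item > heap[0]:
                heapreplace(heap, item)
        # Best first: descending score, then alphabetically for ties.
        return [w for _, w in sorted(heap, key=lambda it: (0 - it[0], it[1]))]

    assert top_k([(-2, "bb"), (-1, "aa"), (-1, "ab"), (-3, "cc")], 2) == ["aa", "ab"]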
 
     def _suggestions(self, text, maxdist, prefix, seen):
         """Low-level method that yields a series of (score, "suggestion")
         freq = self.reader.frequency
         for sug in self.reader.terms_within(fieldname, text, maxdist,
                                             prefix=prefix, seen=seen):
-            yield ((maxdist, 0 - freq(fieldname, sug)), sug)
+            # Higher scores are better: negate the distance and add a small
+            # inverse-frequency penalty so more frequent words rank higher
+            score = 0 - (maxdist + (1.0 / freq(fieldname, sug) * 0.5))
+            yield (score, sug)
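
To make the ordering concrete (illustrative numbers, not from the tests): at
maxdist=1, a suggestion with frequency 2 scores -(1 + 0.5/2) = -1.25 and one
with frequency 10 scores -(1 + 0.5/10) = -1.05, so the more frequent word
ranks first; any suggestion only found at maxdist=2 scores below -2, under
every distance-1 word.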
 
 
 class GraphCorrector(Corrector):
     By default ranks suggestions based on the edit distance.
     """
 
-    def __init__(self, word_graph, ranking=None):
+    def __init__(self, word_graph):
         self.word_graph = word_graph
-        self.ranking = ranking or simple_scorer
 
     def _suggestions(self, text, maxdist, prefix, seen):
-        ranking = self.ranking
         for sug in dawg.within(self.word_graph, text, maxdist, prefix=prefix,
                                seen=seen):
-            yield (ranking(sug, maxdist), sug)
+            # Higher scores are better, so negate the edit distance
+            yield (0 - maxdist, sug)
 
     def to_file(self, f):
         """
         dawg.DawgWriter(f).write(root)
 
     @classmethod
-    def from_word_list(cls, wordlist, ranking=None, strip=True):
+    def from_word_list(cls, wordlist, strip=True):
         dw = dawg.DawgBuilder(reduced=False)
         for word in wordlist:
             if strip:
                 word = word.strip()
             dw.insert(word)
         dw.finish()
-        return cls(dw.root, ranking=ranking)
+        return cls(dw.root)
 
     @classmethod
-    def from_graph_file(cls, dbfile, ranking=None):
+    def from_graph_file(cls, dbfile):
         dr = dawg.DiskNode.load(dbfile)
-        return cls(dr, ranking=ranking)
+        return cls(dr)
 
 
 class MultiCorrector(Corrector):

tests/test_spelling.py

 from __future__ import with_statement
 import gzip
 
-from nose.tools import assert_equal, assert_raises  #@UnresolvedImport
+from nose.tools import assert_equal, assert_not_equal, assert_raises  #@UnresolvedImport
 
 from whoosh import analysis, fields, highlight, spelling
 from whoosh.compat import u
 
 def test_dawg():
     from whoosh.support.dawg import DawgBuilder
-    
+
     with TempStorage() as st:
         df = st.create_file("test.dawg")
-        
+
         dw = DawgBuilder(field_root=True)
         dw.insert(["test"] + list("special"))
         dw.insert(["test"] + list("specials"))
         dw.write(df)
-        
+
         assert_equal(list(dawg.flatten(dw.root.edge("test"))), ["special", "specials"])
 
 def test_fields_out_of_order():
     dw.insert("bravo")
     assert_raises(Exception, dw.insert, "baker")
     dw.finish()
-    
+
     dw = dawg.DawgBuilder(field_root=True)
     dw.insert(["bravo"] + list("test"))
     dw.insert(["alfa"] + list("test"))
                        "low", "zone", "xylophone", "crown",
                        "vale", "brown", "neat", "meat", "reduction",
                        "blunder", "preaction"])
-    
+
     sp = spelling.GraphCorrector.from_word_list(wordlist)
     sugs = sp.suggest("reoction", maxdist=2)
     assert_equal(sugs, ["reaction", "preaction", "reduction"])
     w.add_document(text=u("leader libra ooala paster"))
     w.add_document(text=u("feeder lorry zoala baster"))
     w.commit()
-    
+
     with ix.reader() as r:
         sp = spelling.ReaderCorrector(r, "text")
-        assert_equal(sp.suggest(u("kaola"), maxdist=1), [u('koala')])
-        assert_equal(sp.suggest(u("kaola"), maxdist=2), [u('koala'), u('kaori'), u('ooala'), u('zoala')])
+        assert_equal(sp.suggest(u("kaola"), maxdist=1), ['koala'])
+        assert_equal(sp.suggest(u("kaola"), maxdist=2), ['koala', 'kaori', 'ooala', 'zoala'])
 
 def test_reader_corrector():
     schema = fields.Schema(text=fields.TEXT(spelling=True))
     w.add_document(text=u("leader libra ooala paster"))
     w.add_document(text=u("feeder lorry zoala baster"))
     w.commit()
-    
+
     with ix.reader() as r:
         assert r.has_word_graph("text")
         sp = spelling.ReaderCorrector(r, "text")
     w.add_document(text1=u("leader libra ooala paster"), text2=u("alpha"))
     w.add_document(text1=u("feeder lorry zoala baster"), text2=u("olfo"))
     w.commit()
-    
+
     with ix.reader() as r:
         assert not r.has_word_graph("text1")
         assert not r.has_word_graph("text2")
-    
+
     from whoosh.filedb.filewriting import add_spelling
     add_spelling(ix, ["text1", "text2"])
-    
+
     with ix.reader() as r:
         assert r.has_word_graph("text1")
         assert r.has_word_graph("text2")
-        
+
         sp = spelling.ReaderCorrector(r, "text1")
         assert_equal(sp.suggest(u("kaola"), maxdist=1), [u('koala')])
         assert_equal(sp.suggest(u("kaola"), maxdist=2), [u('koala'), u('kaori'), u('ooala'), u('zoala')])
         w = ix.writer()
         w.add_document(text=word)
         w.commit(merge=False)
-    
+
     with ix.reader() as r:
         assert not r.is_atomic()
         words = list(dawg.flatten(r.word_graph("text")))
 
         corr = r.corrector("text")
         assert_equal(corr.suggest("specail", maxdist=2), ["special", "specials"])
-        
+
 def test_multicorrector():
     schema = fields.Schema(text=fields.TEXT(spelling=True))
     ix = RamStorage().create_index(schema)
         w.commit(merge=False)
 
     c1 = ix.reader().corrector("text")
-    
+
     wordlist = sorted(u("bear bare beer sprung").split())
     c2 = spelling.GraphCorrector.from_word_list(wordlist)
-    
+
     mc = spelling.MultiCorrector([c1, c2])
     assert_equal(mc.suggest("specail"), ["special", "specials"])
     assert_equal(mc.suggest("beur"), ["bear", "beer"])
-    assert_equal(mc.suggest("sprang"), ["spring", "sprung"])
+    assert_equal(mc.suggest("sprang"), ["sprung", "spring"])
 
 def test_wordlist():
     domain = sorted("special specious spectacular spongy spring specials".split())
 
 def test_wordfile():
     import os.path
-    
+
     files = os.listdir(".")
     testdir = "tests"
     fname = "english-words.10.gz"
         path = fname
     else:
         return
-    
+
     if not os.path.exists(path):
         return
     wordfile = gzip.open(path, "r")
     cor = spelling.GraphCorrector.from_word_list(word.decode("latin-1")
                                                  for word in wordfile)
     wordfile.close()
-    
+
     #dawg.dump_dawg(cor.word_graph)
     assert_equal(cor.suggest("specail"), ["special"])
-    
+
     st = RamStorage()
     gf = st.create_file("test.dawg")
     cor.to_file(gf)
-    
+
     gf = st.open_file("test.dawg")
     cor = spelling.GraphCorrector.from_graph_file(gf)
-    
+
     assert_equal(cor.suggest("specail", maxdist=1), ["special"])
     gf.close()
 
 def test_query_highlight():
     qp = QueryParser("a", None)
     hf = highlight.HtmlFormatter()
-    
+
     def do(text, terms):
         q = qp.parse(text)
         tks = [tk for tk in q.all_tokens() if tk.text in terms]
                 assert False, tk
         fragment = highlight.Fragment(text, tks)
         return hf.format_fragment(fragment)
-    
+
     assert_equal(do("a b c d", ["b"]),
                  'a <strong class="match term0">b</strong> c d')
     assert_equal(do('a (x:b OR y:"c d") e', ("b", "c")),
 
 def test_query_terms():
     qp = QueryParser("a", None)
-    
+
     q = qp.parse("alfa b:(bravo OR c:charlie) delta")
     assert_equal(sorted(q.iter_all_terms()), [("a", "alfa"), ("a", "delta"),
                                               ("b", "bravo"), ("c", "charlie")])
-    
+
     q = qp.parse("alfa brav*")
     assert_equal(sorted(q.iter_all_terms()), [("a", "alfa")])
-    
+
     q = qp.parse('a b:("b c" d)^2 e')
     tokens = [(t.fieldname, t.text, t.boost) for t in q.all_tokens()]
     assert_equal(tokens, [('a', 'a', 1.0), ('b', 'b', 2.0), ('b', 'c', 2.0),
     w.add_document(a=u("golf hotel india juliet"))
     w.add_document(a=u("juliet kilo lima mike"))
     w.commit()
-    
+
     s = ix.searcher()
     qp = QueryParser("a", ix.schema)
     qtext = u('alpha ("brovo november" OR b:dolta) detail')
     q = qp.parse(qtext, ix.schema)
-    
+
     c = s.correct_query(q, qtext)
     assert_equal(c.query.__unicode__(), '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)')
     assert_equal(c.string, 'alfa ("bravo november" OR b:dolta) detail')
     c = s.correct_query(q, qtext)
     assert_equal(c.query.__unicode__(), '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)')
     assert_equal(c.string, 'alfa b:("brovo november" a:delta) detail')
-    
+
     hf = highlight.HtmlFormatter(classname="c")
     assert_equal(c.format_string(hf), '<strong class="c term0">alfa</strong> b:("brovo november" a:<strong class="c term1">delta</strong>) detail')
-    
+
 def test_bypass_stemming():
     from whoosh.support.dawg import flatten
-    
+
     ana = analysis.StemmingAnalyzer()
     schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
     ix = RamStorage().create_index(schema)
     w = ix.writer()
     w.add_document(text=u("rendering shading modeling reactions"))
     w.commit()
-    
+
     with ix.reader() as r:
         assert_equal(list(r.lexicon("text")), ["model", "reaction", "render", "shade"])
         assert_equal(list(flatten(r.word_graph("text"))), ["modeling", "reactions", "rendering", "shading"])
                            c=fields.TEXT, d=fields.TEXT(analyzer=ana),
                            e=fields.TEXT(analyzer=ana), f=fields.TEXT)
     ix = RamStorage().create_index(schema)
-    
+
     domain = u("alfa bravo charlie delta").split()
     w = ix.writer()
     for ls in permutations(domain):
         value = " ".join(ls)
         w.add_document(a=value, b=value, c=value, d=value, e=value, f=value)
     w.commit()
+
+def test_find_self():
+    wordlist = sorted(u("book bake bike bone").split())
+    gc = spelling.GraphCorrector.from_word_list(wordlist)
+    assert_not_equal(gc.suggest("book")[0], "book")
+    assert_not_equal(gc.suggest("bake")[0], "bake")
+    assert_not_equal(gc.suggest("bike")[0], "bike")
+    assert_not_equal(gc.suggest("bone")[0], "bone")
+