Matt Chaput committed 0bde35c

Remove assert for spell words not in index since they can be separate spelling words. Fixes issue #296.
Use Field.from_bytes() to decode separate spelling words instead of utf8decode.

Comments (0)

Files changed (3)


         lasttext = None
         # The (fieldname, btext) of the previous spelling posting
         lastspell = None
+        # The field object for the current field
+        fieldobj = None
         for fieldname, btext, docnum, weight, value in items:
             # Check for out-of-order postings. This is convoluted because Python
             # 3 removed the ability to compare a string to None
                 if lastfn is not None and fieldname != lastfn:
-                start_field(fieldname, schema[fieldname])
+                fieldobj = schema[fieldname]
+                start_field(fieldname, fieldobj)
                 lastfn = fieldname
                 lasttext = None
                 # There can be duplicates of spelling terms, so only add a spell
                 # term if it's greater than the last one
                 if lastspell is None or spellterm > lastspell:
-                    # TODO: how to decode the btext bytes?
-                    self.add_spell_word(fieldname, btext.decode("utf8"))
+                    spellword = fieldobj.from_bytes(btext)
+                    self.add_spell_word(fieldname, spellword)
                     lastspell = spellterm


         for sug in self.reader.terms_within(fieldname, text, maxdist,
             # Higher scores are better, so negate the distance and frequency
-            f = freq(fieldname, sug)
-            assert f, "Suggestion %s:%r not in index" % (fieldname, sug)
+            # TODO: store spelling frequencies in the graph
+            f = freq(fieldname, sug) or 1
             score = 0 - (maxdist + (1.0 / f * 0.5))
             yield (score, sug)


             assert sorted(sugs) == ["aa12", "aa34", "aa56", "aa78"]
+def test_missing_suggestion():
+    ana = analysis.StemmingAnalyzer()
+    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True),
+                           organism=fields.ID)
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(organism=u("hs"), content=u("cells"))
+        w.add_document(organism=u("hs"), content=u("cell"))
+    with ix.searcher() as s:
+        r = s.reader()
+        assert r.has_word_graph("content")
+        gr = r.word_graph("content")
+        assert list(gr.flatten()) == ["cell", "cells"]
+        c = s.corrector("content")
+        # Note that corrector won't suggest the word you submit even though it's
+        # in the index
+        assert c.suggest("cell") == ["cells"]
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.