Commits

Matt Chaput committed 1c5cea8

Fixed an uninitialized variable when NgramFilter at="end" is called with chars=True. Fixes issue #226.
Added a more informative error message for the assert in spelling.py.

  • Participants
  • Parent commits f81f923

Comments (0)

Files changed (3)

src/whoosh/analysis.py

                     t.text = stemfn(text)
             yield t
 
+
 class PyStemmerFilter(StemFilter):
     """This is a simple subclass of StemFilter that works with the py-stemmer
     third-party library. You must have the py-stemmer library installed to use
         library.
         """
 
-        import Stemmer  #@UnresolvedImport
+        import Stemmer  # @UnresolvedImport
 
         return Stemmer.algorithms()
 
         return None
 
     def _get_stemmer_fn(self):
-        import Stemmer  #@UnresolvedImport
+        import Stemmer  # @UnresolvedImport
 
         stemmer = Stemmer.Stemmer(self.lang)
         stemmer.maxCacheSize = self.cachesize
                         yield t
 
                 elif at == 1:
+                    if chars:
+                        original_startchar = t.startchar
                     start = max(0, len(text) - self.max)
                     for i in xrange(start, len(text) - self.min + 1):
                         t.text = text[i:]
                         if chars:
-                            t.startchar = t.endchar - size
+                            t.startchar = original_startchar + i
                         yield t
                 else:
                     for start in xrange(0, len(text) - self.min + 1):

src/whoosh/spelling.py

                                             prefix=prefix):
             # Higher scores are better, so negate the distance and frequency
             f = freq(fieldname, sug)
-            assert f
+            assert f, "Suggestion %s:%r not in index" % (fieldname, sug)
             score = 0 - (maxdist + (1.0 / f * 0.5))
             yield (score, sug)
 

tests/test_analysis.py

     # text has consecutive delimiters
     tokens = [t.text for t in ana(u("LOL:)"))]
     assert_equal(tokens, ["LOL"])
+
+def test_ngrams():
+    s = u("abcdefg h ij klm")
+    tk = analysis.RegexTokenizer(r"\S+")
+
+    def dotest(f):
+        ana = tk | f
+        tokens = ana(s, positions=True, chars=True)
+        return "/".join(t.text for t in tokens)
+
+    f = analysis.NgramFilter(3, 4)
+    assert_equal(dotest(f), "abc/abcd/bcd/bcde/cde/cdef/def/defg/efg/klm")
+
+    f = analysis.NgramFilter(3, 4, at="start")
+    assert_equal(dotest(f), "abc/abcd/klm")
+
+    f = analysis.NgramFilter(3, 4, at="end")
+    assert_equal(dotest(f), "defg/efg/klm")
+
+    ana = tk | analysis.NgramFilter(2, 5, at="end")
+    tokens = [(t.text, t.startchar, t.endchar) for t in ana(s, chars=True)]
+    assert_equal(tokens, [("cdefg", 2, 7), ("defg", 3, 7), ("efg", 4, 7),
+                          ("fg", 5, 7), ("ij", 10, 12), ("klm", 13, 16),
+                          ("lm", 14, 16)])
+
+
+