1. Matt Chaput
  2. whoosh


Matt Chaput  committed 14de108

Raise an error when a bad analyzer chain is constructed.
See issue #358.

  • Participants
  • Parent commits 2e77e72
  • Branches default

Comments (0)

Files changed (3)

File src/whoosh/analysis/acore.py

View file
  • Ignore whitespace
 from whoosh.compat import iteritems
+# Exceptions
+class CompositionError(Exception):
+    pass
 # Utility functions
 def unstopped(tokenstream):

File src/whoosh/analysis/analyzers.py

View file
  • Ignore whitespace
 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
-from whoosh.analysis.acore import Composable
+from whoosh.analysis.acore import Composable, CompositionError
+from whoosh.analysis.tokenizers import Tokenizer
 from whoosh.analysis.filters import LowercaseFilter
 from whoosh.analysis.filters import StopFilter, STOP_WORDS
 from whoosh.analysis.morph import StemFilter
 class CompositeAnalyzer(Analyzer):
     def __init__(self, *composables):
         self.items = []
         for comp in composables:
             if isinstance(comp, CompositeAnalyzer):
+        # Tokenizers must start a chain, and then only filters after that
+        # (because tokenizers take a string and return a generator of tokens,
+        # and filters take and return generators of tokens)
+        for item in self.items[1:]:
+            if isinstance(item, Tokenizer):
+                raise CompositionError("Only one tokenizer allowed at the start"
+                                       " of the analyzer: %r" % self.items)
     def __repr__(self):
         return "%s(%s)" % (self.__class__.__name__,
                            ", ".join(repr(item) for item in self.items))

File tests/test_analysis.py

View file
  • Ignore whitespace
     es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
     ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))]
     assert ls == ["lapiz", "mesa"]
+def test_issue358():
+    t = analysis.RegexTokenizer("\w+")
+    with pytest.raises(analysis.CompositionError):
+        _ = t | analysis.StandardAnalyzer()