Commits

Jan Schrewe committed 3e46a70

Allow pickling of the PyStemmerFilter class.

Comments (0)

Files changed (1)

src/whoosh/analysis.py

                     t.text = stemfn(text)
             yield t
 
-
 class PyStemmerFilter(StemFilter):
     """This is a simple subclass of StemFilter that works with the py-stemmer
     third-party library. You must have the py-stemmer library installed to use
     this filter.
-    
+
     >>> PyStemmerFilter("spanish")
     """
 
 
         import Stemmer  #@UnresolvedImport
 
-        stemmer = Stemmer.Stemmer(lang)
-        stemmer.maxCacheSize = cachesize
-        self._stem = stemmer.stemWord
-        self.ignore = frozenset() if ignore is None else frozenset(ignore)
+        self.lang = lang
+        self.maxCacheSize = cachesize
+        super(PyStemmerFilter, self).__init__(stemfn=self._get_stemmer_fn(), ignore=ignore, cachesize=0)
 
     def algorithms(self):
         """Returns a list of stemming algorithms provided by the py-stemmer
     def cache_info(self):
         return None
 
+    def _get_stemmer_fn(self):
+        """Builds a py-stemmer ``Stemmer`` object for ``self.lang``, applies
+        ``self.maxCacheSize`` to it, and returns its bound ``stemWord``
+        method.
+        """
+
+        # Imported inside the method so the third-party library is only
+        # needed when a stemming function is actually built.
+        import Stemmer  #@UnresolvedImport
+
+        pystemmer = Stemmer.Stemmer(self.lang)
+        pystemmer.maxCacheSize = self.maxCacheSize
+        return pystemmer.stemWord
+
+    def __getstate__(self):
+        """Returns a copy of the instance dictionary without the ``_stem``
+        and ``stemfn`` entries. Those hold dynamically created functions,
+        which can't be pickled.
+        """
+
+        unpicklable = ("_stem", "stemfn")
+        return dict((name, value) for name, value in self.__dict__.items()
+                    if name not in unpicklable)
+
+    def __setstate__(self, state):
+        """Restores the filter from a pickled ``state`` dictionary and
+        rebuilds the stemming function, which ``__getstate__`` leaves out of
+        the pickle.
+
+        :param state: the attribute dictionary produced by ``__getstate__``
+            (or by an older version of the class).
+        """
+
+        # Fill in defaults for attributes that pickles written by older
+        # versions of the class may lack. Each default is guarded by its own
+        # key so that a state carrying only one of them is still completed.
+        if "cachesize" not in state:
+            self.cachesize = 0
+        if "maxCacheSize" not in state:
+            self.maxCacheSize = 50000
+        # Older pickles stored the ignore set under the name "ignores"
+        if "ignores" in state:
+            self.ignore = state["ignores"]
+        elif "ignore" not in state:
+            self.ignore = frozenset()
+        # Older pickles also stored the cache itself; don't restore it
+        if "cache" in state:
+            del state["cache"]
+
+        self.__dict__.update(state)
+        # Rebuild the stemfn attribute, which was removed from the pickled
+        # state because the function can't be pickled
+        self.stemfn = self._get_stemmer_fn()
+        # NOTE(review): clear() is inherited and not visible here; presumably
+        # it re-creates the _stem attribute from stemfn -- confirm.
+        self.clear()
+
 
 class CharsetFilter(Filter):
     """Translates the text of tokens by calling unicode.translate() using the