Commits

Evgeniy Tatarkin committed f3a4d8b

fix example and minimal app

Files changed (2)

docs/quickstart.rst

 For a minimal application all you need is to define your crawler
 by inheriting :class:`BaseCrawler`::
 
-    from pomp.core.base import BaseCrawler
+    import re
+    from pomp.core.base import BaseCrawler, BasePipeline
     from pomp.contrib import SimpleDownloader
 
 
+    python_sentence_re = re.compile('[\w\s]{0,}python[\s\w]{0,}', re.I | re.M)
+
+
     class MyCrawler(BaseCrawler):
+        """Extract all sentences with `python` word"""
         ENTRY_URL = 'http://python.org/news' # entry point
 
         def extract_items(self, response):
-            response.body = response.body.decode('utf-8')
-            return None
+            for i in python_sentence_re.findall(response.body.decode('utf-8')):
+                yield i.strip()
 
         def next_url(self, response):
             return None # one page crawler, stop crawl
 
 
+    class PrintPipeline(BasePipeline):
+        def process(self, item):
+            print('Sentence:', item)
+
+
     if __name__ == '__main__':
         from pomp.core.engine import Pomp
 
         pomp = Pomp(
             downloader=SimpleDownloader(),
+            pipelines=(PrintPipeline(),)
         )
 
         pomp.pump(MyCrawler())
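
Since pipelines receive every item the crawler yields, persisting results only needs another subclass of :class:`BasePipeline`. The sketch below is a hypothetical variation, not part of this commit: the ``SaveToFilePipeline`` name and the ``sentences.txt`` path are made up for illustration, and it assumes that overriding ``process(item)`` alone is enough, exactly as ``PrintPipeline`` does in the quickstart above::

    from pomp.core.base import BasePipeline

    class SaveToFilePipeline(BasePipeline):
        """Hypothetical pipeline: append each extracted sentence to a file."""

        def __init__(self, path='sentences.txt'):
            self.path = path

        def process(self, item):
            # item is the plain string yielded by MyCrawler.extract_items
            with open(self.path, 'a') as f:
                f.write(item + '\n')

It could then be passed alongside the print pipeline, e.g. ``pipelines=(PrintPipeline(), SaveToFilePipeline())``.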

examples/pythonnews.py

     ENTRY_URL = 'http://python.org/news/'
 
     def extract_items(self, response):
-        response.body = response.body.decode('utf-8')
-        for i in news_re.findall(response.body):
+        for i in news_re.findall(response.body.decode('utf-8')):
             item = PythonNewsItem()
             item.title, item.published = i[0], i[2]
             yield item