Commits

Evgeniy Tatarkin  committed b15f7d3

add simple downloader, example

  • Participants
  • Parent commits d922278

Comments (0)

Files changed (2)

File examples/pythonnews.py

+"""
+Extract python news from python.org
+"""
+import re
+from pomp.core.base import BaseCrawler, BasePipeline
+from pomp.core.item import Item, Field
+from pomp.contrib import SimpleDownloader
+
+
+# Matches one news entry. Group 1 is the content of an <h2 class="news">
+# heading, group 2 is everything between that heading and the following
+# <div class="pubdate">, and group 3 is the content of that pubdate div.
+news_re = re.compile(r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')
+
+
+class PythonNewsItem(Item):
+    title = Field()
+    published = Field()
+
+    def __repr__(self):
+        return '%s\n\t%s\n' % (
+            self.title,
+            self.published,
+        )
+
+
+class PythonNewsCrawler(BaseCrawler):
+    ENTRY_URL = 'http://python.org/news/'
+
+    def extract_items(self, url, page):
+
+        for i in news_re.findall(page):
+            item = PythonNewsItem()
+            item.title, item.published = i[0], i[2]
+            yield item
+
+    def next_url(self, page):
+        return None # one page crawler
+
+
+class PrintPipeline(BasePipeline):
+    """Pipeline that prints each item passed to ``process``."""
+
+    def process(self, item):
+        """Print ``item``; nothing is returned."""
+        print(item)
+
+
+if __name__ == '__main__':
+    from pomp.core.engine import Pomp
+
+    pomp = Pomp(
+        downloader=SimpleDownloader(),
+        pipelines=[PrintPipeline()],
+    )
+
+    pomp.pump(PythonNewsCrawler())

File pomp/contrib/__init__.py

+"""
+Standard downloaders
+"""
+import urllib2
+from pomp.core.base import BaseDownloader
+
+
+class SimpleDownloader(BaseDownloader):
+
+    TIMEOUT = 5
+
+    def get(self, url):
+        response = urllib2.urlopen(url, timeout=self.TIMEOUT)
+        return response.read()