Commits

Evgeniy Tatarkin committed e638428

base crawler and downloader

  • Participants
  • Parent commits 43c22dc

Comments (0)

Files changed (5)

+syntax: glob
+
+*.pyc
+*~
+*egg*

File pomp/core/__init__.py

Empty file added.

File pomp/core/base.py

+"""
+Base classes
+"""
+
+class BaseCrawler(object):
+    """Base class for crawlers.
+
+    Subclasses are expected to set ``ENTRY_URL`` and to override
+    ``next_url`` and ``extract_items``; both raise
+    ``NotImplementedError`` in this base class.
+    """
+
+    # Entry point of the crawl; ``None`` until a subclass overrides it.
+    ENTRY_URL = None
+
+    def next_url(self, page):
+        """Hook that receives a downloaded page; must be overridden.
+
+        NOTE(review): presumably returns the next url to crawl (or
+        ``None`` to stop) -- this base class does not define it; confirm.
+        """
+        raise NotImplementedError()
+
+    def process(self, page):
+        """Handle a downloaded page by delegating to ``extract_items``.
+
+        The value produced by ``extract_items`` is not returned.
+        """
+        self.extract_items(page)
+
+    def extract_items(self, page):
+        """Hook called by ``process`` with the page; must be overridden."""
+        raise NotImplementedError()
+
+
+class BaseDownloader(object):
+    """Base class for downloaders; subclasses must override ``get``."""
+
+    def get(self, url):
+        """Hook that receives a url; raises ``NotImplementedError`` here.
+
+        NOTE(review): presumably returns the content fetched from
+        ``url`` -- the return type is not fixed by this class; confirm.
+        """
+        raise NotImplementedError()

File pomp/core/engine.py

+"""
+Engine
+"""
+
+
+class Pomp(object):
+    """Engine that drives a crawler using the supplied downloader."""
+
+    def __init__(self, downloader):
+        """Store ``downloader``; its ``get`` method is used by ``pump``."""
+
+        self.downloader = downloader
+
+    def pump(self, crawler):
+        """Run a single crawl step for ``crawler``.
+
+        Downloads ``crawler.ENTRY_URL``, passes the resulting page to
+        ``crawler.process`` and then calls ``crawler.next_url`` with the
+        same page. Nothing is returned.
+        """
+
+        # fetch entry url
+        page = self.downloader.get(crawler.ENTRY_URL)
+
+        # process page
+        crawler.process(page)
+
+        # get next url (the returned value is not used yet)
+        crawler.next_url(page)

File tests/test_simple_crawler.py

+from pomp.core.base import BaseCrawler, BaseDownloader
+from pomp.core.engine import Pomp
+
+
+class TestSimplerCrawler(object):
+
+    def test_crawler(self):
+
+        class PythonOrgCrawler(BaseCrawler):
+            ENTRY_URL = "http://python.org/"
+
+            def next_url(self, current_page):
+                return None
+
+            def extract_items(self, page):
+                return ['1', '2', '3', '4', '5']
+
+        class DummyDownloader(BaseDownloader):
+
+            def get(slef, url):
+                return '<html><head></head><body></body></html>'
+
+        pomp = Pomp(downloader=DummyDownloader())
+
+        pomp.pump(PythonOrgCrawler())