Evgeniy Tatarkin committed 4becf06

recursive crawl


Files changed (3)

pomp/core/base.py

 """
 Base classes
 """
+import types
+
+
+def iterator(var):
+    if isinstance(var, types.StringTypes):
+        return iter((var,))
+    return iter(var)
+
 
 class BaseCrawler(object):
     ENTRY_URL = None
     def next_url(self, page):
         raise NotImplementedError()
 
-    def process(self, page):
-        self.extract_items(page)
+    def process(self, url, page):
+        self.extract_items(url, page)
 
-    def extract_items(self, page):
+    def extract_items(self, url, page):
         raise NotImplementedError()
 
 
 class BaseDownloader(object):
 
+    def process(self, urls, callback, crawler):
+        map(lambda url: callback(crawler, url, self.get(url)), urls)
+
     def get(self, url):
         raise NotImplementedError()

pomp/core/engine.py

 """
 Engine
 """
+from pomp.core.base import iterator
 
 
 class Pomp(object):
 
         self.downloader = downloader
 
+    def response_callback(self, crawler, url, page):
+
+        crawler.process(url, page)
+
+        urls = crawler.next_url(page)
+
+        if urls:
+            self.downloader.process(
+                iterator(urls),
+                self.response_callback,
+                crawler
+            )
+
     def pump(self, crawler):
 
-        # fetch entry url
-        page = self.downloader.get(crawler.ENTRY_URL)
-
-        # process page
-        crawler.process(page)
-
-        # get next url
-        crawler.next_url(page)
+        self.downloader.process(
+            iterator(crawler.ENTRY_URL),
+            self.response_callback,
+            crawler
+        )
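With response_callback in place, the crawl becomes a simple recursion: pump seeds the downloader with ENTRY_URL, each downloaded page is handed to the crawler, and whatever next_url returns is fed back into downloader.process until it returns a falsy value. A minimal end-to-end sketch under the assumptions of this commit (the CannedDownloader and TwoPageCrawler names, the canned pages, and the positional downloader argument to Pomp are illustrative, not part of the codebase; Python 2 syntax to match the surrounding code):

    from pomp.core.base import BaseCrawler, BaseDownloader
    from pomp.core.engine import Pomp


    class CannedDownloader(BaseDownloader):
        # Hypothetical downloader that serves in-memory pages, no network needed.
        PAGES = {
            'http://example.com/': 'see http://example.com/next',
            'http://example.com/next': 'last page',
        }

        def get(self, url):
            return self.PAGES[url]


    class TwoPageCrawler(BaseCrawler):
        # Hypothetical crawler: follows one link from the entry page, then stops.
        ENTRY_URL = 'http://example.com/'

        def extract_items(self, url, page):
            print 'extracted from %s: %r' % (url, page)

        def next_url(self, page):
            # Return the next URL to fetch, or a falsy value to end the recursion.
            return 'http://example.com/next' if 'next' in page else None


    Pomp(CannedDownloader()).pump(TwoPageCrawler())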

tests/test_simple_crawler.py

             f3 = Field()
 
         class DummyCrawler(BaseCrawler):
-            ENTRY_URL = "http://python.org/"
+            ENTRY_URL = (
+                "http://python.org/1",
+                "http://python.org/2"
+            )
 
-            def next_url(self, current_page):
-                return None
+            def __init__(self):
+                super(DummyCrawler, self).__init__()
+                self.crawled_urls = []
 
-            def extract_items(self, page):
+            def next_url(self, page):
+                url = 'http://python.org/1/1'
+                return url if url not in self.crawled_urls else None
+
+            def extract_items(self, url, page):
+                self.crawled_urls.append(url)
                 item = DummyItem()
                 item.f1 = '1'
                 item.f2 = '2'
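
The reworked DummyCrawler now starts from two entry URLs and records every url it sees, so the recursive hop to http://python.org/1/1 can be checked against crawled_urls. A sketch of how it might be driven through the new engine, assuming a trivial downloader whose get simply echoes the url back as the page (the DummyDownloader below is illustrative and not part of this hunk):

    class DummyDownloader(BaseDownloader):
        # Hypothetical stand-in: the "page" is just the url itself.
        def get(self, url):
            return url


    crawler = DummyCrawler()
    Pomp(DummyDownloader()).pump(crawler)

    # Expected crawl order: the first entry URL, the recursively discovered
    # http://python.org/1/1 (followed only once, since next_url stops
    # returning it after it has been crawled), then the second entry URL.
    assert crawler.crawled_urls == [
        'http://python.org/1',
        'http://python.org/1/1',
        'http://python.org/2',
    ]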