Evgeniy Tatarkin avatar Evgeniy Tatarkin committed 99e9ee7

downloader.get returns a tuple of (url, page_content)

Comments (0)

Files changed (3)

pomp/contrib/__init__.py

 
     def get(self, url):
         response = urlopen(url, timeout=self.TIMEOUT)
-        return response.read()
+        return url, response.read()
 
 
 class ThreadedDownloader(SimpleDownloader):
 
     def __init__(self, pool_size=5):
         self.workers_pool = ThreadPool(processes=pool_size)
-
+    
     def process(self, urls, callback, crawler):
         pages = self.workers_pool.map(self.get, urls)
         return filter(
             None,
-            list(map(lambda res: callback(crawler, None, res), pages))
+            list(map(lambda res: callback(crawler, *res), pages))
         )

pomp/core/base.py

         # return next urls
         return filter(
             None,
-            list(map(lambda url: callback(crawler, url, self.get(url)), urls))
+            list(map(lambda url: callback(crawler, *self.get(url)), urls))
         )
 
     def get(self, url):

tests/test_simple_crawler.py

 
 logging.basicConfig(level=logging.DEBUG)
 
-
 class DummyItem(Item):
     value = Field()
     url = Field()
 class DummyDownloader(BaseDownloader):
 
     def get(slef, url):
-        return '<html><head></head><body></body></html>'
+        return url, '<html><head></head><body></body></html>'
 
 
 class TestSimplerCrawler(object):
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.