Commits

Evgeniy Tatarkin  committed 99e9ee7

downloader.get returns a tuple of (url, page_content)

  • Participants
  • Parent commits ce38b5e

Comments (0)

Files changed (3)

File pomp/contrib/__init__.py

 
     def get(self, url):
         response = urlopen(url, timeout=self.TIMEOUT)
-        return response.read()
+        return url, response.read()
 
 
 class ThreadedDownloader(SimpleDownloader):
 
     def __init__(self, pool_size=5):
         self.workers_pool = ThreadPool(processes=pool_size)
-
+    
     def process(self, urls, callback, crawler):
         pages = self.workers_pool.map(self.get, urls)
         return filter(
             None,
-            list(map(lambda res: callback(crawler, None, res), pages))
+            list(map(lambda res: callback(crawler, *res), pages))
         )

File pomp/core/base.py

         # return next urls
         return filter(
             None,
-            list(map(lambda url: callback(crawler, url, self.get(url)), urls))
+            list(map(lambda url: callback(crawler, *self.get(url)), urls))
         )
 
     def get(self, url):

File tests/test_simple_crawler.py

 
 logging.basicConfig(level=logging.DEBUG)
 
-
 class DummyItem(Item):
     value = Field()
     url = Field()
 class DummyDownloader(BaseDownloader):
 
     def get(slef, url):
-        return '<html><head></head><body></body></html>'
+        return url, '<html><head></head><body></body></html>'
 
 
 class TestSimplerCrawler(object):