Commits

Evgeniy Tatarkin committed e3b4bb8

urllib now in contrib.urllibtools, rename SimpleDownloader to UrllibDownloader

Comments (0)

Files changed (10)

 
 Release date to be decided.
 
+- rename SimpleDownloader to UrllibDownloader
+- urllib code now in contrib/urllibtools.py
 - ENTRY_URL renamed to ENTRY_REQUESTS
 - next_url renamed to next_requests
 - async support
 Urllib
 ``````
 
-.. automodule:: pomp.contrib
+.. automodule:: pomp.contrib.urllibtools
     :members:
 
 
 
     if __name__ == '__main__':
         from pomp.core.engine import Pomp
-        from pomp.contrib import SimpleDownloader
+        from pomp.contrib.urllibtools import UrllibDownloader
 
         pomp = Pomp(
-            downloader=SimpleDownloader(),
+            downloader=UrllibDownloader(),
         )
 
         pomp.pump(MyCrawler())
             return item # return item for following processing
 
     pomp = Pomp(
-        downloader=SimpleDownloader(),
+        downloader=UrllibDownloader(),
         pipelines=(FilterPipeline(), PrintPipeline(),)
     )
 

examples/01_pythonnews.py

 import logging
 from pomp.core.base import BaseCrawler, BasePipeline
 from pomp.core.item import Item, Field
-from pomp.contrib import SimpleDownloader
+from pomp.contrib.urllibtools import UrllibDownloader
 
 
 logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
     from pomp.core.engine import Pomp
 
     pomp = Pomp(
-        downloader=SimpleDownloader(),
+        downloader=UrllibDownloader(),
         pipelines=[PrintPipeline()],
     )
 

examples/02_livejournal.py

 from pomp.core.base import BaseCrawler
 from pomp.contrib.pipelines import CsvPipeline
 from pomp.core.base import BasePipeline, BaseDownloaderMiddleware
-from pomp.contrib import UrllibHttpRequest
+from pomp.contrib.urllibtools import UrllibHttpRequest
 from pomp.core.item import Item, Field
 
 

pomp/contrib/__init__.py

-"""
-Simple downloaders and middlewares for fetching data by standard
-`urlopen` function from `urllib` package for python3.x
-or `urllib2` for python2.7+
-"""
-try:
-    from urllib.request import urlopen, Request
-except ImportError:
-    from urllib2 import urlopen, Request
-
-import logging
-from multiprocessing.pool import ThreadPool
-
-from pomp.core.base import BaseDownloader, BaseHttpRequest, \
-    BaseHttpResponse, BaseDownloaderMiddleware, BaseDownloadException
-from pomp.core.utils import iterator
-
-
-log = logging.getLogger('pomp.contrib.urllib')
-
-
-class SimpleDownloader(BaseDownloader):
-    """Simplest downloader
-
-    :param timeout: request timeout in seconds
-    """
-
-    def __init__(self, timeout=5, middlewares=None):
-        super(SimpleDownloader, self).__init__(middlewares=middlewares)
-        # insert urllib adpter middleware by default
-        self.middlewares.insert(0, UrllibAdapterMiddleware())
-        self.timeout = timeout
-
-    def get(self, requests):
-        responses = []
-        for request in iterator(requests):
-            response = self._fetch(request)
-            responses.append(response)
-        return responses
-
-    def _fetch(self, request):
-        try:
-            res = urlopen(request, timeout=self.timeout)
-            return UrllibHttpResponse(request, res)
-        except Exception as e:
-            log.exception('Exception on %s', request)
-            return BaseDownloadException(request, exception=e)
-
-
-class ThreadedDownloader(SimpleDownloader):
-    """Threaded downloader by `ThreadPool` from `multiprocessing.pool`
-    package.
-
-    :param pool_size: count of workers in pool
-    :param timeout: request timeout in seconds
-    """
-
-    def __init__(self, pool_size=5, timeout=5, middlewares=None):
-        self.workers_pool = ThreadPool(processes=pool_size)
-        super(ThreadedDownloader, self).__init__(middlewares=middlewares)
-
-    def get(self, requests):
-        return self.workers_pool.map(self._fetch, requests)
-
-
-class UrllibHttpRequest(Request, BaseHttpRequest):
-    """Adapter for urllib request to :class:`pomp.core.base.BaseHttpRequest`"""
-
-    @property
-    def url(self):
-        return self.get_full_url()
-
-
-class UrllibHttpResponse(BaseHttpResponse):
-    """Adapter for urllib response to
-    :class:`pomp.core.base.BaseHttpResponse`"""
-
-    def __init__(self, request, response):
-        self.req = request
-        self.resp = response
-
-        if not isinstance(response, Exception):
-            self.body = self.resp.read()
-
-    @property
-    def request(self):
-        return self.req
-
-    @property
-    def response(self):
-        return self.resp
-
-
-class UrllibAdapterMiddleware(BaseDownloaderMiddleware):
-    """Middlerware for adapting urllib.Request
-    to :class:`pomp.core.base.BaseHttpRequest`
-    """
-
-    def process_request(self, req):
-        if isinstance(req, BaseHttpRequest):
-            return req
-        return UrllibHttpRequest(req)
-
-    def process_response(self, response):
-        return response

pomp/contrib/concurrenttools.py

 import defer
 
 from pomp.core.base import BaseDownloadException
-from pomp.contrib import SimpleDownloader
+from pomp.contrib.urllibtools import UrllibDownloader
 
 
 log = logging.getLogger('pomp.contrib.concurrent')
 
 
-class ConcurrentUrllibDownloader(SimpleDownloader):
+class ConcurrentUrllibDownloader(UrllibDownloader):
     """Concurrent ThreadPoolExecutor downloader for fetching data by urllib
     :class:`pomp.contrib.SimpleDownloader`
 

pomp/contrib/urllibtools.py

+"""
+Simple downloaders and middlewares for fetching data by standard
+`urlopen` function from `urllib` package for python3.x
+or `urllib2` for python2.7+
+"""
+try:
+    from urllib.request import urlopen, Request
+except ImportError:
+    from urllib2 import urlopen, Request
+
+import logging
+from multiprocessing.pool import ThreadPool
+
+from pomp.core.base import BaseDownloader, BaseHttpRequest, \
+    BaseHttpResponse, BaseDownloaderMiddleware, BaseDownloadException
+from pomp.core.utils import iterator
+
+
+log = logging.getLogger('pomp.contrib.urllib')
+
+
+class UrllibDownloader(BaseDownloader):
+    """Simplest downloader
+
+    :param timeout: request timeout in seconds
+    """
+
+    def __init__(self, timeout=5, middlewares=None):
+        super(UrllibDownloader, self).__init__(middlewares=middlewares)
+        # insert urllib adpter middleware by default
+        self.middlewares.insert(0, UrllibAdapterMiddleware())
+        self.timeout = timeout
+
+    def get(self, requests):
+        responses = []
+        for request in iterator(requests):
+            response = self._fetch(request)
+            responses.append(response)
+        return responses
+
+    def _fetch(self, request):
+        try:
+            res = urlopen(request, timeout=self.timeout)
+            return UrllibHttpResponse(request, res)
+        except Exception as e:
+            log.exception('Exception on %s', request)
+            return BaseDownloadException(request, exception=e)
+
+
+class ThreadedDownloader(UrllibDownloader):
+    """Threaded downloader by `ThreadPool` from `multiprocessing.pool`
+    package.
+
+    :param pool_size: count of workers in pool
+    :param timeout: request timeout in seconds
+    """
+
+    def __init__(self, pool_size=5, timeout=5, middlewares=None):
+        self.workers_pool = ThreadPool(processes=pool_size)
+        super(ThreadedDownloader, self).__init__(middlewares=middlewares)
+
+    def get(self, requests):
+        return self.workers_pool.map(self._fetch, requests)
+
+
+class UrllibHttpRequest(Request, BaseHttpRequest):
+    """Adapter for urllib request to :class:`pomp.core.base.BaseHttpRequest`"""
+
+    @property
+    def url(self):
+        return self.get_full_url()
+
+
+class UrllibHttpResponse(BaseHttpResponse):
+    """Adapter for urllib response to
+    :class:`pomp.core.base.BaseHttpResponse`"""
+
+    def __init__(self, request, response):
+        self.req = request
+        self.resp = response
+
+        if not isinstance(response, Exception):
+            self.body = self.resp.read()
+
+    @property
+    def request(self):
+        return self.req
+
+    @property
+    def response(self):
+        return self.resp
+
+
+class UrllibAdapterMiddleware(BaseDownloaderMiddleware):
+    """Middlerware for adapting urllib.Request
+    to :class:`pomp.core.base.BaseHttpRequest`
+    """
+
+    def process_request(self, req):
+        if isinstance(req, BaseHttpRequest):
+            return req
+        return UrllibHttpRequest(req)
+
+    def process_response(self, response):
+        return response

tests/test_contrib_concurrent.py

 from nose.tools import assert_set_equal
 
 from pomp.core.engine import Pomp
-from pomp.contrib import UrllibHttpRequest
+from pomp.contrib.urllibtools import UrllibHttpRequest
 
 try:
     from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader

tests/test_contrib_urllib.py

 from nose.tools import assert_set_equal, assert_equal
 from pomp.core.base import BaseCrawler, BaseDownloaderMiddleware
 from pomp.core.engine import Pomp
-from pomp.contrib import SimpleDownloader, ThreadedDownloader
+from pomp.contrib.urllibtools import UrllibDownloader, ThreadedDownloader
 
 from mockserver import HttpServer, make_sitemap
 from tools import DummyCrawler
 
         catch_exception_middleware = CatchException()
         pomp = Pomp(
-            downloader=SimpleDownloader(
+            downloader=UrllibDownloader(
                 middlewares=[catch_exception_middleware]),
             pipelines=[],
         )