Commits

Evgeniy Tatarkin  committed 7deace6

document base classes and urllib contribs

  • Parent commits ec7bfc0


Files changed (4)

File docs/api.rst

 This part of the documentation documents all the public classes and
 functions in pomp.
 
-Interface classes
-`````````````````
-.. module:: pomp.core.base
+Contrib
+*******
 
-.. autoclass:: BaseCrawler
-    :members: extract_items, next_url
-
-.. autoclass:: BaseDownloader
+.. automodule:: pomp.contrib
     :members: 
 
-.. autoclass:: BaseHttpRequest
+Interfaces
+**********
+
+.. automodule:: pomp.core.base
     :members:
-
-.. autoclass:: BaseHttpResponse
-    :members: 
-
-.. autoclass:: BaseDownloaderMiddleware
-    :members:
-
-.. autoclass:: BasePipeline
-    :members: 
-
-.. autoclass:: BaseDownloadException
-    :members:  

File docs/quickstart.rst

         def next_url(self, response):
             return None # one page crawler, stop crawl
 
+
     if __name__ == '__main__':
         from pomp.core.engine import Pomp
 
         )
 
         pomp.pump(MyCrawler())
+
+
+Item pipelines
+--------------
+
+
+Custom downloader
+-----------------
+
+
+Downloader middleware
+---------------------
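
The three new quickstart sections are added here as empty stubs. As a hint of
what the "Item pipelines" section will cover, a minimal pipeline might look
like the sketch below; PrintPipeline is illustrative and not part of this
commit, and it only relies on the BasePipeline interface documented further
down in this changeset.

    from pomp.core.base import BasePipeline

    class PrintPipeline(BasePipeline):
        """Hypothetical pipeline: print every extracted item."""

        def start(self):
            pass  # open files or database connections here if needed

        def process(self, item):
            print(item)   # filter/change/store the item
            return item   # return None to drop the item

        def stop(self):
            pass  # close files or database connections here

How the pipeline is handed to Pomp is not shown in the truncated quickstart
snippet above, so the engine wiring is left out here.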

File pomp/contrib/__init__.py

 """
-Standart downloaders
+Urllib
+``````
+
+Simple downloaders and middlewares for fetching URLs with the standard
+`urlopen` function from the `urllib` package on Python 3.x,
+or from `urllib2` on Python 2.7+.
 """
 try:
     from urllib.request import urlopen, Request
 
 
 class SimpleDownloader(BaseDownloader):
+    """Simplest downloader
+    
+    :param timeout: request timeout in seconds
+    """
 
-    TIMEOUT = 5
-
-    def __init__(self, *args, **kwargs):
-        super(SimpleDownloader, self).__init__(*args, **kwargs)
+    def __init__(self, timeout=5, middlewares=None):
+        super(SimpleDownloader, self).__init__(middlewares=middlewares)
         # insert urllib adapter middleware by default
         self.middlewares.insert(0, UrllibAdapterMiddleware())
+        self.timeout = timeout
 
     def get(self, requests):
         responses = []
             responses.append(response)
         return responses
 
-    def _fetch(self, request, timeout=TIMEOUT):
+    def _fetch(self, request):
         try:
-            res = urlopen(request.url, timeout=timeout)
+            res = urlopen(request.url, timeout=self.timeout)
             return UrllibHttpResponse(request, res)
         except Exception as e:
             log.exception('Exception on %s', request)
 
 
 class ThreadedDownloader(SimpleDownloader):
+    """Threaded downloader by `ThreadPool` from `multiprocessing.pool`
+    package.
 
-    def __init__(self, pool_size=5, *args, **kwargs):
+    :param pool_size: number of workers in the pool
+    :param timeout: request timeout in seconds
+    """
+
+    def __init__(self, pool_size=5, timeout=5, middlewares=None):
         self.workers_pool = ThreadPool(processes=pool_size)
-        super(ThreadedDownloader, self).__init__(*args, **kwargs)
-    
+        super(ThreadedDownloader, self).__init__(middlewares=middlewares)
+
     def get(self, requests):
         return self.workers_pool.map(self._fetch, requests)
 
 
 class UrllibHttpRequest(BaseHttpRequest):
+    """Adapter for urllib request to :class:`pomp.core.base.BaseHttpRequest`""" 
 
     def __init__(self, url):
         self.request = url if isinstance(url, Request) else Request(url)
 
 
 class UrllibHttpResponse(BaseHttpResponse):
+    """Adapter for urllib response to :class:`pomp.core.base.BaseHttpResponse`""" 
 
     def __init__(self, request, response):
         self.req = request
 
 
 class UrllibAdapterMiddleware(BaseDownloaderMiddleware):
+    """Middlerware for adapting urllib.Request 
+    to :class:`pomp.core.base.BaseHttpRequest`
+    """
 
     def process_request(self, req):
         if isinstance(req, BaseHttpRequest):
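
As a rough usage sketch for these contrib classes (illustrative only: the Pomp
engine normally drives the downloader, the diff above truncates parts of the
module, and the error-handling comment follows the BaseDownloader.get
docstring rather than code visible here):

    from pomp.contrib import ThreadedDownloader, UrllibHttpRequest

    # pool of 3 worker threads, 10 second timeout per request
    downloader = ThreadedDownloader(pool_size=3, timeout=10)

    requests = [UrllibHttpRequest('http://python.org/')]
    # each result should be an UrllibHttpResponse, or a
    # BaseDownloadException if the fetch failed
    responses = downloader.get(requests)

SimpleDownloader and ThreadedDownloader insert UrllibAdapterMiddleware by
default, which adapts incoming requests to UrllibHttpRequest before fetching
(its body is truncated in this diff).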

File pomp/core/base.py

 """
-Base classes
+Base classes
+
+ .. note::
+
+    All of these classes must be overridden
 """
 CRAWL_DEPTH_FIRST_METHOD = 'depth'
 CRAWL_WIDTH_FIRST_METHOD = 'width'
 
+
 class BaseCrawler(object):
-    """Base crawler class.
+    """Crawler interface
 
     Crawler must resolve two main tasks:
 
 
     - ``depth first`` is pomp.core.base.CRAWL_DEPTH_FIRST_METHOD (default)
     - ``width first`` is pomp.core.base.CRAWL_WIDTH_FIRST_METHOD
-
-    .. note::
-
-        This class must be overridden
     """
     ENTRY_URL = None
     CRAWL_METHOD = CRAWL_DEPTH_FIRST_METHOD
 
 
 class BaseDownloader(object):
-    """Base downloader class.
+    """Downloader interface
      
     Downloader must resolve one main task - execute request 
     and fetch response.
             if response:
                 yield callback(crawler, response)
 
-    def get(self, url):
-        """Execute request
+    def get(self, requests):
+        """Execute requests
 
-        :param url: url or instance of :class:`BaseHttpRequest`
-        :rtype: instance of :class:`BaseHttpResponse` or 
+        :param requests: URLs or instances of :class:`BaseHttpRequest`
+        :rtype: instances of :class:`BaseHttpResponse` or 
                 :class:`BaseDownloadException`
         """
         raise NotImplementedError()
 
 
 class BasePipeline(object):
+    """Pipeline interface
+
+    The main goals of a pipe are:
+
+    - filter items
+    - change items
+    - store items
+    """ 
 
     def start(self):
+        """Initialize pipe
+
+        Open files and database connections etc.
+        
+        """
         pass
 
     def process(self, item):
+        """Process extracted item
+        
+        :param item: extracted item
+        :rtype: item or ``None`` if this item must be skipped
+        """
         raise NotImplementedError()
 
     def stop(self):
+        """Finalize pipe
+        
+        Close files and database connections etc.
+        """
         pass
 
 
 class BaseDownloaderMiddleware(object):
+    """Downloader middleware interface"""
 
     def process_request(self, request):
+        """Change request before it will be executed by downloader
+
+        :param request: instance of :class:`BaseHttpRequest`
+        :rtype: changed request or ``None`` to skip
+                execution of this request
+        """
         raise NotImplementedError()
  
     def process_response(self, response):
+        """Change response before it will be sent to crawler for exctracting
+        items
+
+        :param response: instance of :class:`BaseHttpResponse`
+                         or :class:`BaseDownloadException`
+        :rtype: changed response or ``None`` to skip
+                processing of this response
+        """ 
         raise NotImplementedError() 
 
 
 class BaseHttpRequest(object):
+    """Request interface"""
 
     def __init__(self, *args, **kwargs):
         for key, value in kwargs.items():
 
     @property
     def url(self):
+        """Requested URL"""
         raise NotImplementedError()
 
 
 class BaseHttpResponse(object):
+    """Response interface"""
 
     def __init__(self, *args, **kwargs):
         for key, value in kwargs.items():
 
     @property
     def request(self):
-        raise NotImplementedError() 
+        """Request :class:`BaseHttpRequest`"""
+        raise NotImplementedError()
 
     @property
     def response(self):
-        return self.req
+        """Response :class:`BaseHttpResponse`"""
+        raise NotImplementedError()
 
 
 class BaseDownloadException(Exception):
+    """Download exception interface
+    
+    :param request: request that raised this exception
+    :param exception: original exception
+    """
 
     def __init__(self, request, exception):
         self.request = request
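
To round out the interfaces above, a downloader middleware implementation
might look like the following sketch. DropFailedMiddleware and its behavior
are assumptions, not part of this commit; it uses the
process_request/process_response spelling implemented by
UrllibAdapterMiddleware in pomp.contrib.

    import logging

    from pomp.core.base import BaseDownloaderMiddleware, BaseDownloadException

    log = logging.getLogger(__name__)

    class DropFailedMiddleware(BaseDownloaderMiddleware):
        """Hypothetical middleware: pass requests through unchanged and
        drop responses that failed to download."""

        def process_request(self, request):
            return request  # returning None would skip this request

        def process_response(self, response):
            if isinstance(response, BaseDownloadException):
                log.warning('dropping failed download: %s', response.request)
                return None  # None means the crawler never sees this response
            return response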