Commits

Evgeniy Tatarkin committed 93ae1a1

docs

Comments (0)

Files changed (8)

 Contrib
 *******
 
+Urllib
+``````
+
 .. automodule:: pomp.contrib
     :members:
 
+
+Twisted
+```````
+
+.. automodule:: pomp.contrib.twistedtools
+    :members:
+    :mockimport: twisted, zope
+    :mocktype: protocol.Protocol
+
+
+Concurrent future
+`````````````````
+
+.. automodule:: pomp.contrib.concurrenttools
+    :members: 
+
+
+.. _contrib-pipelines:
+
+Simple pipelines
+````````````````
+
 .. automodule:: pomp.contrib.pipelines
     :members:
 
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 
-import sys, os
+import sys
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc']#, 'sphinx.ext.viewcode']
+extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.mockautodoc',]#, 'sphinx.ext.viewcode']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

docs/quickstart.rst

 Quickstart
 ==========
 
-.. currentmodule:: pomp.core.base
+.. currentmodule:: pomp.core
 
 Pomp is fun to use, incredibly easy for basic applications.
 
 ---------------------
 
 For a minimal application all you need is to define your crawler 
-by inherit :class:`BaseCrawler`::
+by inheriting from :class:`base.BaseCrawler`::
 
     import re
-    from pomp.core.base import BaseCrawler, BasePipeline
-    from pomp.contrib import SimpleDownloader
+    from pomp.core.base import BaseCrawler
 
 
     python_sentence_re = re.compile('[\w\s]{0,}python[\s\w]{0,}', re.I | re.M)
 
         def extract_items(self, response):
             for i in python_sentence_re.findall(response.body.decode('utf-8')):
-                yield i.strip()
+                item = i.strip()
+                print(item)
+                return item
 
         def next_url(self, response):
             return None # one page crawler, stop crawl
 
 
-    class PrintPipeline(BasePipeline):
-        def process(self, item):
-            print('Sentence:', item)
-
-
     if __name__ == '__main__':
         from pomp.core.engine import Pomp
+        from pomp.contrib import SimpleDownloader
 
         pomp = Pomp(
             downloader=SimpleDownloader(),
-            pipelines=(PrintPipeline(),)
         )
 
         pomp.pump(MyCrawler())
 Item pipelines
 --------------
 
+To process extracted items pomp provides a pipeline mechanism.
+Define a pipe by subclassing :class:`base.BasePipeline` and pass
+it to the :class:`engine.Pomp` constructor.
+
+Pipes are called one by one, in the order they were passed in.
+
+Example pipelines that filter out items shorter than 10 characters and
+print each sentence::
+
+    class FilterPipeline(BasePipeline):
+        def process(self, item):
+            # None - skip item for following processing
+            return None if len(item) < 10 else item
+
+    class PrintPipeline(BasePipeline):
+        def process(self, item):
+            print('Sentence:', item, ' length:', len(item))
+            return item # return item for following processing
+
+    pomp = Pomp(
+        downloader=SimpleDownloader(),
+        pipelines=(FilterPipeline(), PrintPipeline(),)
+    )
+
+See :ref:`contrib-pipelines`
+
 
 Custom downloader
 -----------------
+To download data from a source, the application can define a custom
+downloader that implements special protocols or strategies.
+
+A custom downloader must be a subclass of :class:`base.BaseDownloader`.
+
+For example, a downloader fetching data with the requests_ package::
+
+    import requests as requestslib
+    from pomp.core.base import BaseDownloader, BaseDownloadException
+    from pomp.core.base import BaseHttpRequest, BaseHttpResponse
+
+    from pomp.core.utils import iterator
+
+
+    class ReqRequest(BaseHttpRequest):
+        def __init__(self, url):
+            self._url = url
+        
+        @property
+        def url(self):
+            return self._url
+
+
+    class ReqResponse(BaseHttpResponse):
+        def __init__(self, request, response):
+            self.req = request
+            self.resp = response
+
+            if not isinstance(response, Exception):
+                self.body = self.resp.text
+
+        @property
+        def request(self):
+            return self.req
+
+        @property
+        def response(self):
+            return self.resp 
+
+
+    class RequestsDownloader(BaseDownloader):
+
+        def get(self, requests):
+            responses = []
+            for request in iterator(requests):
+                response = self._fetch(request)
+                responses.append(response)
+            return responses
+
+        def _fetch(self, request):
+            try:
+                res = requestslib.get(request.url)
+                return ReqResponse(request, res)
+            except Exception as e:
+                print('Exception on %s: %s', request, e)
+                return BaseDownloadException(request, exception=e)
+
+
+    if __name__ == '__main__':
+        from pomp.core.base import BaseCrawler
+        from pomp.core.engine import Pomp
+
+        class Crawler(BaseCrawler):
+            ENTRY_URL = ReqRequest('http://python.org/news/')
+
+            def extract_items(self, response):
+                print(response.body)
+
+            def next_url(self, response):
+                return None # one page crawler
+
+        pomp = Pomp(
+            downloader=RequestsDownloader(),
+        )
+
+        pomp.pump(Crawler())
 
 
 Downloader middleware
 ---------------------
+
+To hook a request before it is executed by the downloader, or a response
+before it is passed to the crawler, pomp provides a middleware framework.
+
+A middleware must be a subclass of :class:`base.BaseDownloaderMiddleware`.
+
+
+Each request is passed through the middlewares one by one, in order, before
+it reaches the downloader.
+Each response/exception is passed back through the middlewares one by one
+in reverse order.
+
+For example, a statistics middleware::
+
+    from pomp.core.base import BaseDownloaderMiddleware
+
+    class StatisticMiddleware(BaseDownloaderMiddleware):
+        def __init__(self):
+            self.requests = self.responses = self.exceptions = 0
+
+        def process_request(self, request):
+            self.requests += 1
+            return request
+
+        def process_response(self, response):
+            self.responses += 1
+            return response
+
+        def process_exception(self, exception):
+            self.exceptions += 1
+            return exception
+
+
+.. _requests: http://docs.python-requests.org/

pomp/contrib/__init__.py

 """
-Urllib
-``````
-
-Simple downloaders and middlewares for fetching urls by standard 
+Simple downloaders and middlewares for fetching data by standard 
 `urlopen` function from `urllib` package for python3.x
 or `urllib2` for python2.7+
 """

pomp/contrib/concurrenttools.py

 """
-Concurrent
-``````````
-
 Concurrent downloaders and middlewares for fetching urls by standard 
 `concurrent` package for python3
 """
 
 
 class ConcurrentUrllibDownloader(SimpleDownloader):
-    """
+    """Concurrent ThreadPoolExecutor downloader for fetching data by urllib
+    :class:`pomp.contrib.SimpleDownloader`
+
+    :param pool_size: size of ThreadPoolExecutor
+    :param timeout: request timeout in seconds
     """
 
     def __init__(self, pool_size=5, timeout=5, middlewares=None):

pomp/contrib/pipelines.py

 """
 Simple pipelines
-````````````````
-
-Simple pipelines
 """ 
 import csv
 import codecs

pomp/contrib/twistedtools.py

 """
-Twisted
-```````
-
-Simple downloaders and middlewares for fetching urls by Twisted.
+Simple downloaders and middlewares for fetching data by Twisted.
 """
 import urllib
 import logging
 from twisted.web.client import Agent
 from twisted.web.iweb import IBodyProducer
 from zope.interface import implements
-from twisted.web.http_headers import Headers 
+from twisted.web.http_headers import Headers
 
 from pomp.core.base import BaseDownloader, BaseHttpRequest, BaseHttpResponse, \
     BaseDownloadException 
 
 
 class TwistedDownloader(BaseDownloader):
+    """Download urls by twisted.web.client.Agent
 
+    :param reactor: twisted reactor
+    :param timeout: request timeout in seconds
+    """
     def __init__(self, reactor, timeout=5, middlewares=None):
         super(TwistedDownloader, self).__init__(middlewares=middlewares)
         self.reactor = reactor
 
 
 class TwistedHttpRequest(BaseHttpRequest):
+    """Adapter for twisted request to :class:`pomp.core.base.BaseHttpRequest`
+    
+    Map params to twisted.web.client.Agent().request(method, url, headers, data)
 
+    :param url: request url
+    :param data: request data
+    :param headers: request headers
+    :param method: request method
+    """ 
     def __init__(self, url, data=None, headers=None, method='GET'):
         self._url = url.encode('utf-8')
         self.data = StringProducer(urllib.urlencode(data)) if data else None
 
 
 class TwistedHttpResponse(BaseHttpResponse):
-
+    """Adapter for twisted response to :class:`pomp.core.base.BaseHttpResponse`""" 
     def __init__(self, request, response):
         self.req = request
         self.resp = response
 
 
 class SimpleReceiver(protocol.Protocol):
-
     def __init__(s, d):
         s.buf = ''; s.d = d
 

pomp/core/base.py

     def next_url(self, page):
         """Getting next urls for processing.
  
+        Called after `extract_items` method.
+
         :param page: the instance of :class:`BaseHttpResponse`
         :rtype: ``None`` or one url or list of urls. If ``None`` returned
                 this mean that page have not any urls to following
 
         :param requests: urls or instances of :class:`BaseHttpRequest`
         :rtype: instances of :class:`BaseHttpResponse` or 
-                :class:`BaseDownloadException`
+                :class:`BaseDownloadException` or deferred for async behavior
         """
         raise NotImplementedError()
 
 
         :param exception: instance of :class:`BaseDownloadException`
         :rtype: changed response or ``None`` to skip
-                processing of this response
+                processing of this exception
         """ 
         return exception