Commits

Evgeniy Tatarkin committed ec7bfc0

docs

Comments (0)

Files changed (4)

 API
 ---
 
-.. module:: pomp.core.base
-
 This part of the documentation documents all the public classes and
 functions in pomp.
 
+Interface classes
+`````````````````
+.. module:: pomp.core.base
+
 .. autoclass:: BaseCrawler
     :members: extract_items, next_url
+
+.. autoclass:: BaseDownloader
+    :members: 
+
+.. autoclass:: BaseHttpRequest
+    :members:
+
+.. autoclass:: BaseHttpResponse
+    :members: 
+
+.. autoclass:: BaseDownloaderMiddleware
+    :members:
+
+.. autoclass:: BasePipeline
+    :members: 
+
+.. autoclass:: BaseDownloadException
+    :members:  
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
+extensions = ['sphinx.ext.autodoc']#, 'sphinx.ext.viewcode']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

docs/quickstart.rst

 A Minimal Application
 ---------------------
 
-For a minimal application all you need is to define you crawler inherited 
-from :class:`BaseCrawler`::
+For a minimal application all you need is to define your crawler
+by inheriting from :class:`BaseCrawler`::
 
     from pomp.core.base import BaseCrawler
     from pomp.contrib import SimpleDownloader

pomp/core/base.py

 CRAWL_WIDTH_FIRST_METHOD = 'width'
 
 class BaseCrawler(object):
+    """Base crawler class.
+
+    A crawler must accomplish two main tasks:
+
+    - Extract data from the response
+    - Extract the next urls for further processing from the response
+
+    Each crawler must have a starting point - an entry url.
+    To set the entry url, declare it as the class attribute ``ENTRY_URL`` like this::
+
+        class MyGoogleCrawler(BaseCrawler):
+            ENTRY_URL = 'http://google.com/'
+            ...
+
+    ``ENTRY_URL`` may be a list of urls or a list of requests
+    (instances of :class:`BaseHttpRequest`).
+
+    A crawler may choose which crawling method to use by setting the class
+    attribute ``CRAWL_METHOD`` with one of the values:
+
+    - ``depth first`` is pomp.core.base.CRAWL_DEPTH_FIRST_METHOD (default)
+    - ``width first`` is pomp.core.base.CRAWL_WIDTH_FIRST_METHOD
+
+    .. note::
+
+        This class must be overridden
+    """
     ENTRY_URL = None
     CRAWL_METHOD = CRAWL_DEPTH_FIRST_METHOD
 
     def next_url(self, page):
+        """Get the next urls for processing.
+
+        :param page: the instance of :class:`BaseHttpResponse`
+        :rtype: ``None``, one url, or a list of urls. If ``None`` is
+                returned, the page has no urls for further
+                processing
+        """
         raise NotImplementedError()
 
     def process(self, response):
         return self.extract_items(response)
 
     def extract_items(self, url, page):
+        """Extract items - parse page.
+
+        :param url: the url of the downloaded page
+        :param page: the instance of :class:`BaseHttpResponse`
+        :rtype: one data item or a list of items
+        """    
         raise NotImplementedError()
 
     @property
 
 
 class BaseDownloader(object):
+    """Base downloader class.
+
+    A downloader must accomplish one main task - execute a request
+    and fetch the response.
+
+    :param middlewares: list of middlewares, instances
+                        of :class:`BaseDownloaderMiddleware`
+    """
 
     def __init__(self, middlewares=None):
         self.middlewares = middlewares or []
 
     def prepare(self):
+        """Prepare downloader before start processing"""
         self.request_middlewares = self.middlewares
         self.response_middlewares = self.middlewares[:]
         self.response_middlewares.reverse()
                 yield callback(crawler, response)
 
     def get(self, url):
+        """Execute a request.
+
+        :param url: url or instance of :class:`BaseHttpRequest`
+        :rtype: instance of :class:`BaseHttpResponse` or
+                :class:`BaseDownloadException`
+        """
         raise NotImplementedError()