Commits

Evgeniy Tatarkin committed 16fd741

change docs


Files changed (7)

-pomp
+Pomp
 ====
 
-pomp is a Screen scraping and web crawling framework. Like `Scrapy`_, but more simple.
+Pomp is a screen scraping and web crawling framework.
+
+Inspired by `Scrapy`_, but with a simpler implementation and without a hard `Twisted`_ dependency.
 
 Features:
 
-* One file applications
-* Engines - threaded, greenlet/gevent, Twisted, Tornado etc.
+* one-file applications
+* networking backend of your choice:
 
-pomp is inspired by `Scrapy`_ but simpler implementation and without hard `Twisted`_ dependency.
+ - `urllib`_
+ - `Twisted`_ (py2.x only)
+ - `futures`_ (standard library on py3.2+, or the backport package)
+ - your own method
+
+* content parsing by your own method
+
 
 Roadmap:
 
 * `gevent`_ support
 * `Tornado`_ support
 
+See `Pomp examples`_ for more.
+
 Pomp is written and maintained by Evgeniy Tatarkin and is licensed under the BSD license.
 
+.. _urllib: http://docs.python.org/3.3/library/urllib.html
 .. _Scrapy: http://scrapy.org/
 .. _Twisted: http://twistedmatrix.com/
-.. _gevent:  http://www.gevent.org/
-.. _Tornado:  http://www.tornadoweb.org/
+.. _gevent: http://www.gevent.org/
+.. _Tornado: http://www.tornadoweb.org/
+.. _futures: http://pythonhosted.org/futures/
+.. _Pomp examples:
+   https://bitbucket.org/estin/pomp/src/tip/examples?at=default
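
The "networking by choice" bullet above amounts to the ``downloader``
argument of the engine. A minimal sketch assembled from the examples in this
commit (``UrllibDownloader`` ships in ``pomp.contrib.urllibtools``;
``MyCrawler`` is the crawler from docs/examples/minimalapp.py below)::

    from pomp.core.engine import Pomp
    from pomp.contrib.urllibtools import UrllibDownloader

    # the crawler code stays the same; only the transport changes
    pomp = Pomp(downloader=UrllibDownloader())
    pomp.pump(MyCrawler())

    # swapping in another backend is a one-line change, e.g.
    # pomp = Pomp(downloader=RequestsDownloader())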

docs/examples/customdowloader.py

+import requests as requestslib
+from pomp.core.base import BaseDownloader, BaseDownloadException
+from pomp.core.base import BaseHttpRequest, BaseHttpResponse
+
+from pomp.core.utils import iterator
+
+
+class ReqRequest(BaseHttpRequest):
+    """Adapt a plain url to pomp's request interface."""
+
+    def __init__(self, url):
+        self._url = url
+
+    @property
+    def url(self):
+        return self._url
+
+
+class ReqResponse(BaseHttpResponse):
+    """Wrap a requests.Response, or the exception raised instead of it."""
+
+    def __init__(self, request, response):
+        self.req = request
+        self.resp = response
+
+        if not isinstance(response, Exception):
+            self.body = self.resp.text
+
+    @property
+    def request(self):
+        return self.req
+
+    @property
+    def response(self):
+        return self.resp
+
+
+class RequestsDownloader(BaseDownloader):
+    """Fetch responses one by one with the requests library."""
+
+    def get(self, requests):
+        responses = []
+        for request in iterator(requests):
+            response = self._fetch(request)
+            responses.append(response)
+        return responses
+
+    def _fetch(self, request):
+        try:
+            res = requestslib.get(request.url)
+            return ReqResponse(request, res)
+        except Exception as e:
+            print('Exception on %s: %s' % (request, e))
+            # wrap the error instead of raising, so one failed request
+            # does not abort the whole batch
+            return BaseDownloadException(request, exception=e)
+
+
+if __name__ == '__main__':
+    from pomp.core.base import BaseCrawler
+    from pomp.core.engine import Pomp
+
+    class Crawler(BaseCrawler):
+        ENTRY_REQUESTS = ReqRequest('http://python.org/news/')
+
+        def extract_items(self, response):
+            print(response.body)
+
+        def next_requests(self, response):
+            return None  # one page crawler
+
+    pomp = Pomp(
+        downloader=RequestsDownloader(),
+    )
+
+    pomp.pump(Crawler())
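
Because ``_fetch`` traps errors and returns them wrapped in
``BaseDownloadException``, a concurrent variant of the downloader above is a
small change. A hedged sketch, not part of this commit, assuming the engine
accepts the same list-of-responses shape that ``RequestsDownloader.get``
already returns; it uses ``concurrent.futures`` (standard on py3.2+, the
futures backport on py2.x)::

    from concurrent.futures import ThreadPoolExecutor


    class ConcurrentRequestsDownloader(RequestsDownloader):
        """Fetch a batch of requests in worker threads."""
        WORKERS = 4

        def get(self, requests):
            with ThreadPoolExecutor(max_workers=self.WORKERS) as pool:
                # map() keeps request order; _fetch never raises, so one
                # failed request cannot break the whole batch
                return list(pool.map(self._fetch, iterator(requests)))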

docs/examples/minimalapp.py

+import re
+from pomp.core.base import BaseCrawler
+
+
+python_sentence_re = re.compile(r'[\w\s]*python[\s\w]*', re.I | re.M)
+
+
+class MyCrawler(BaseCrawler):
+    """Extract all sentences with `python` word"""
+    ENTRY_REQUESTS = 'http://python.org/news'  # entry point
+
+    def extract_items(self, response):
+        # yield every match instead of returning after the first one
+        for i in python_sentence_re.findall(response.body.decode('utf-8')):
+            item = i.strip()
+            print(item)
+            yield item
+
+    def next_requests(self, response):
+        return None  # one page crawler, stop crawl
+
+
+if __name__ == '__main__':
+    from pomp.core.engine import Pomp
+    from pomp.contrib.urllibtools import UrllibDownloader
+
+    pomp = Pomp(
+        downloader=UrllibDownloader(),
+    )
+
+    pomp.pump(MyCrawler())
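
Turning the one-page crawler into a multi-page one means returning further
requests from ``next_requests`` instead of ``None``. A hedged sketch, not
part of this commit: it assumes plain URL strings are accepted (as
``ENTRY_REQUESTS`` above is) and that an iterable of them continues the
crawl; the link regex is illustrative only::

    link_re = re.compile(r'href="(http://python\.org/news[^"]*)"')


    class NewsCrawler(MyCrawler):

        def next_requests(self, response):
            # follow links found on the page; an empty result ends the
            # crawl (a real crawler would also track visited urls)
            return link_re.findall(response.body.decode('utf-8'))
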
 :orphan:
 
-Pomp
-====
+.. include:: ../README.rst
 
-.. module:: pomp.engine
-
-Pomp is an screen scraping framework like `Scrapy`_ but more simple and
-without hard `Twisted`_ dependency.
-
-Main goal of project - easy extracting structured data from lots of sources
 
 .. _Scrapy: http://scrapy.org/
 .. _Twisted: http://twistedmatrix.com/

docs/quickstart.rst

 ---------------------
 
 For a minimal application, all you need is to define your crawler
-by inherit :class:`base.BaseCrawler`::
+by inheriting from :class:`base.BaseCrawler`:
 
-    import re
-    from pomp.core.base import BaseCrawler
 
+.. literalinclude:: examples/minimalapp.py
 
-    python_sentence_re = re.compile('[\w\s]{0,}python[\s\w]{0,}', re.I | re.M)
-
-
-    class MyCrawler(BaseCrawler):
-        """Extract all sentences with `python` word"""
-        ENTRY_REQUESTS = 'http://python.org/news' # entry point
-
-        def extract_items(self, response):
-            for i in python_sentence_re.findall(response.body.decode('utf-8')):
-                item = i.strip()
-                print(item)
-                return item
-
-        def next_requests(self, response):
-            return None # one page crawler, stop crawl
-
-
-    if __name__ == '__main__':
-        from pomp.core.engine import Pomp
-        from pomp.contrib.urllibtools import UrllibDownloader
-
-        pomp = Pomp(
-            downloader=UrllibDownloader(),
-        )
-
-        pomp.pump(MyCrawler())
 
 
 Item pipelines
 
 A custom downloader must be a subclass of :class:`base.BaseDownloader`.
 
-For example downloader fetching data by requests_ package::
+For example, a downloader that fetches data with the requests_ package:
 
-    import requests as requestslib
-    from pomp.core.base import BaseDownloader, BaseDownloadException
-    from pomp.core.base import BaseHttpRequest, BaseHttpResponse
 
-    from pomp.core.utils import iterator
-
-
-    class ReqRequest(BaseHttpRequest):
-        def __init__(self, url):
-            self._url = url
-        
-        @property
-        def url(self):
-            return self._url
-
-
-    class ReqResponse(BaseHttpResponse):
-        def __init__(self, request, response):
-            self.req = request
-            self.resp = response
-
-            if not isinstance(response, Exception):
-                self.body = self.resp.text
-
-        @property
-        def request(self):
-            return self.req
-
-        @property
-        def response(self):
-            return self.resp 
-
-
-    class RequestsDownloader(BaseDownloader):
-
-        def get(self, requests):
-            responses = []
-            for request in iterator(requests):
-                response = self._fetch(request)
-                responses.append(response)
-            return responses
-
-        def _fetch(self, request):
-            try:
-                res = requestslib.get(request.url)
-                return ReqResponse(request, res)
-            except Exception as e:
-                print('Exception on %s: %s', request, e)
-                return BaseDownloadException(request, exception=e)
-
-
-    if __name__ == '__main__':
-        from pomp.core.base import BaseCrawler
-        from pomp.core.engine import Pomp
-
-        class Crawler(BaseCrawler):
-            ENTRY_REQUESTS = ReqRequest('http://python.org/news/')
-
-            def extract_items(self, response):
-                print(response.body)
-
-            def next_requests(self, response):
-                return None # one page crawler
-
-        pomp = Pomp(
-            downloader=RequestsDownloader(),
-        )
-
-        pomp.pump(Crawler())
+.. literalinclude:: examples/customdowloader.py
 
 
 Downloader middleware
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
         'Programming Language :: Python :: 3',
     ],
 )
 [testenv:examples]
 deps=
     lxml
+    requests
 commands=
+    python -c"import sys, subprocess; sys.exit(subprocess.call('python docs/examples/minimalapp.py', shell=True, stdout=subprocess.PIPE))"
+    python -c"import sys, subprocess; sys.exit(subprocess.call('python docs/examples/customdowloader.py', shell=True, stdout=subprocess.PIPE))"
     python -c"import sys, subprocess; sys.exit(subprocess.call('python examples/01_pythonnews.py', shell=True, stdout=subprocess.PIPE))"
     python -c"import sys, subprocess; sys.exit(subprocess.call('python examples/02_livejournal.py', shell=True, stdout=subprocess.PIPE))"
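
Each of those one-liners runs an example script, discards its stdout, and
propagates the exit code, so a broken example fails the tox run. A
standalone equivalent, as a hypothetical helper script not part of this
commit::

    # check_examples.py - run each example, discard output, fail on error
    import subprocess
    import sys

    for script in (
        'docs/examples/minimalapp.py',
        'docs/examples/customdowloader.py',
    ):
        code = subprocess.call(
            [sys.executable, script], stdout=subprocess.PIPE)
        if code:
            sys.exit(code)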