Commits

Evgeniy Tatarkin committed d7c3e47

rename ENTRY_URL to ENTRY_REQUESTS, fix tests

Files changed (10)

 
 Release date to be decided.
 
+- ENTRY_URL renamed to ENTRY_REQUESTS
 - next_url renamed to next_requests
 - async support
 - Twisted support
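
A minimal before/after sketch of the two renames in this release, assuming a crawler that overrides both the entry attribute and the old ``next_url`` hook (the exact ``next_requests`` signature is an assumption based on the changelog entry above)::

    from pomp.core.base import BaseCrawler

    # before this commit (old names)
    class OldCrawler(BaseCrawler):
        ENTRY_URL = 'http://python.org/news'

        def next_url(self, response):
            return None  # no further requests

    # after: only the names change
    class NewCrawler(BaseCrawler):
        ENTRY_REQUESTS = 'http://python.org/news'

        def next_requests(self, response):
            return None  # no further requests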

docs/quickstart.rst

 
     class MyCrawler(BaseCrawler):
         """Extract all sentences with `python` word"""
-        ENTRY_URL = 'http://python.org/news' # entry point
+        ENTRY_REQUESTS = 'http://python.org/news' # entry point
 
         def extract_items(self, response):
             for i in python_sentence_re.findall(response.body.decode('utf-8')):
         from pomp.core.engine import Pomp
 
         class Crawler(BaseCrawler):
-            ENTRY_URL = ReqRequest('http://python.org/news/')
+            ENTRY_REQUESTS = ReqRequest('http://python.org/news/')
 
             def extract_items(self, response):
                 print(response.body)
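
Pieced together from the quickstart fragments above, a self-contained sketch of a crawler after the rename; the ``SimpleDownloader`` import and the ``downloader=`` keyword are assumptions about the contrib API and may differ in this revision::

    import re

    from pomp.core.base import BaseCrawler
    from pomp.core.engine import Pomp
    from pomp.contrib import SimpleDownloader  # assumed urllib-based downloader

    python_sentence_re = re.compile(r'[\w\s]*python[\s\w]*', re.I | re.M)

    class MyCrawler(BaseCrawler):
        """Extract all sentences with `python` word"""
        ENTRY_REQUESTS = 'http://python.org/news'  # entry point

        def extract_items(self, response):
            for i in python_sentence_re.findall(response.body.decode('utf-8')):
                yield i.strip()

    if __name__ == '__main__':
        pomp = Pomp(
            downloader=SimpleDownloader(),
            pipelines=[],  # extracted items are handed to pipelines, none needed here
        )
        pomp.pump(MyCrawler())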

pomp/core/base.py

     - Extract next urls for following processing from response
 
     Each crawler must have starting point - entry url.
-    To set entry url declare them as class attribute ``ENTRY_URL`` like that::
+    To set the entry url, declare it as the class attribute ``ENTRY_REQUESTS``, for example::
 
         class MyGoogleCrawler(BaseCrawler):
-            ENTRY_URL = 'http://google.com/'
+            ENTRY_REQUESTS = 'http://google.com/'
             ...
 
-    ``ENTRY_URL`` may be list of urls or list of requests
+    ``ENTRY_REQUESTS`` may be a list of urls or a list of requests
     (instances of :class:`BaseHttpRequest`).
 
     Crawler may choose which method for crawling to use by setting class 
     - ``depth first`` is pomp.core.base.CRAWL_DEPTH_FIRST_METHOD (default)
     - ``width first`` is pomp.core.base.CRAWL_WIDTH_FIRST_METHOD
     """
-    ENTRY_URL = None
+    ENTRY_REQUESTS = None
     CRAWL_METHOD = CRAWL_DEPTH_FIRST_METHOD
 
     def __init__(self):
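
The docstring above allows either plain url strings or request objects; a short sketch combining both forms with the width-first crawl method (``ReqRequest`` is the request class used in the quickstart above, but its import path here is an assumption)::

    from pomp.core.base import BaseCrawler, CRAWL_WIDTH_FIRST_METHOD
    from pomp.contrib.request import ReqRequest  # import path is an assumption

    class NewsCrawler(BaseCrawler):
        # a mix of url strings and BaseHttpRequest instances is accepted
        ENTRY_REQUESTS = [
            'http://python.org/news',
            ReqRequest('http://python.org/news/'),
        ]
        CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD  # depth-first is the default

        def extract_items(self, response):
            return []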

pomp/core/engine.py

         self.stop_deferred = defer.Deferred()
 
         next_requests = self.downloader.process(
-            iterator(crawler.ENTRY_URL),
+            iterator(crawler.ENTRY_REQUESTS),
             self.response_callback,
             crawler
         )
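
The engine passes ``crawler.ENTRY_REQUESTS`` through an ``iterator()`` helper before handing it to the downloader, which is presumably what lets ``ENTRY_REQUESTS`` be a single url or request as well as a list of them. A rough sketch of such a normalizing helper (illustrative only, not the actual pomp.core implementation)::

    def iterator(value):
        """Wrap a lone url/request into an iterable; pass real iterables through."""
        if isinstance(value, str):
            return iter([value])
        try:
            return iter(value)
        except TypeError:
            return iter([value])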

tests/test_contrib_concurrent.py

             pipelines=[],
         )
 
-        DummyCrawler.ENTRY_URL = '/root'
-        pomp.pump(DummyCrawler())
+        class Crawler(DummyCrawler):
+            ENTRY_REQUESTS = '/root'
+
+        pomp.pump(Crawler())
 
         assert_set_equal(
             set([r.url.replace(self.httpd.location, '') \

tests/test_contrib_twisted.py

             pipelines=[PrintPipeline()],
         )
 
-        DummyCrawler.ENTRY_URL = '/root'
+        class Crawler(DummyCrawler):
+            ENTRY_REQUESTS = '/root'
 
         done_defer = defer.Deferred()
-        d = pomp.pump(DummyCrawler())
+        d = pomp.pump(Crawler())
 
         d.add_callback(done_defer.callback)
 
             pipelines=[PrintPipeline()],
         )
 
-        DummyCrawler.ENTRY_URL = '/root'
+        class Crawler(DummyCrawler):
+            ENTRY_REQUESTS = '/root' 
 
         done_defer = defer.Deferred()
-        d = pomp.pump(DummyCrawler())
+        d = pomp.pump(Crawler())
 
         d.add_callback(done_defer.callback)
         #d.add_callback(done_defer.callback)
             pipelines=[PrintPipeline()],
         )
 
-        DummyCrawler.ENTRY_URL = '/sleep'
+        class Crawler(DummyCrawler):
+            ENTRY_REQUESTS = '/sleep'
 
         done_defer = defer.Deferred()
-        d = pomp.pump(DummyCrawler())
+        d = pomp.pump(Crawler())
 
         d.add_callback(done_defer.callback)
 

tests/test_contrib_urllib.py

             pipelines=[],
         )
 
-        DummyCrawler.ENTRY_URL = '/root'
-        pomp.pump(DummyCrawler())
+        class Crawler(DummyCrawler):
+            ENTRY_REQUESTS = '/root'
+
+        pomp.pump(Crawler())
 
         assert_set_equal(
             set([r.url.replace(self.httpd.location, '') \
             pipelines=[],
         )
 
-        MockCrawler.ENTRY_URL = [
+        MockCrawler.ENTRY_REQUESTS = [
             'https://123.456.789.01:8081/fake_url',
             '%s/root' % self.httpd.location,
         ]

tests/test_middleware.py

 logging.basicConfig(level=logging.DEBUG)
 
 
+class Crawler(DummyCrawler):
+    ENTRY_REQUESTS = '/'
+
+
 class RaiseOnRequestMiddleware(BaseDownloaderMiddleware):
     def process_request(self, request):
         raise Exception('Some exception on Request')
         ]),
     )
 
-    pomp.pump(DummyCrawler())
+    pomp.pump(Crawler())
 
     assert_equal(len(collect_middleware.exceptions), 1)
     assert_equal(len(collect_middleware.requests), 0)
         ]),
     )
 
-    pomp.pump(DummyCrawler())
+    pomp.pump(Crawler())
 
     assert_equal(len(collect_middleware.exceptions), 1)
     assert_equal(len(collect_middleware.requests), 1)
         ]),
     )
 
-    pomp.pump(DummyCrawler())
+    pomp.pump(Crawler())
 
     assert_equal(len(collect_middleware.exceptions), 1)
     assert_equal(len(collect_middleware.requests), 0)
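
The recurring change in the tests above is to define a small subclass instead of mutating the shared ``DummyCrawler`` class attribute, presumably so the entry requests do not leak between tests. The pattern in isolation::

    # before: assignment on the shared class persisted across tests
    DummyCrawler.ENTRY_URL = '/root'
    pomp.pump(DummyCrawler())

    # after: each test owns its entry requests via a throwaway subclass
    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())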

tests/test_simple_crawler.py

 
 
 class Crawler(DummyCrawler):
-    ENTRY_URL = (
+    ENTRY_REQUESTS = (
         "http://python.org/1",
         "http://python.org/2"
     )
 
 
 class DummyCrawler(BaseCrawler):
-    ENTRY_URL = None
+    ENTRY_REQUESTS = None
     CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD
 
     def __init__(self):