Evgeniy Tatarkin committed c3990ce

tests tools

Files changed (6)

tests/test_contrib_concurrent.py

-import json
 import logging
 from nose import SkipTest
 from nose.tools import assert_set_equal
-from pomp.core.base import BaseCrawler, BaseDownloaderMiddleware
-from pomp.core.base import CRAWL_WIDTH_FIRST_METHOD
 
 from pomp.core.engine import Pomp
+from pomp.contrib import UrllibHttpRequest
 
 try:
     from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader
 except ImportError:
     raise SkipTest('concurrent.futures not available')
 
+from tools import DummyCrawler
+from tools import RequestResponseMiddleware, CollectRequestResponseMiddleware
 from mockserver import HttpServer, make_sitemap
 
 logging.basicConfig(level=logging.DEBUG)
 
 
-class DummyCrawler(BaseCrawler):
-    ENTRY_URL = None
-    CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD
-
-    def __init__(self):
-        super(DummyCrawler, self).__init__()
-
-    def next_url(self, response):
-        return response.body.get('links', [])
-
-    def extract_items(self, response):
-        return
-
-
-class RequestResponseMiddleware(BaseDownloaderMiddleware):
-
-    def __init__(self, prefix_url=None):
-        self.requested_urls = []
-        self.prefix_url = prefix_url
-    
-    def process_request(self, url):
-        self.requested_urls.append(url)
-        url = '%s%s' % (self.prefix_url, url) \
-            if self.prefix_url else url
-        return url
-    
-    def process_response(self, response):
-        response.body = json.loads(response.body.decode('utf-8'))
-        return response
-
-
 class TestContribConcurrent(object):
 
     @classmethod
         cls.httpd.stop()
 
     def test_concurrent_urllib_downloader(self):
-        req_resp_midlleware = RequestResponseMiddleware(prefix_url=self.httpd.location)
+        req_resp_middleware = RequestResponseMiddleware(
+            prefix_url=self.httpd.location,
+            request_factory=UrllibHttpRequest,
+        )
 
-        downloader = ConcurrentUrllibDownloader()
+        collect_middleware = CollectRequestResponseMiddleware()
+
+        downloader = ConcurrentUrllibDownloader(
+            middlewares=[collect_middleware]
+        )
 
-        downloader.middlewares.insert(0, req_resp_midlleware)
+        downloader.middlewares.insert(0, req_resp_middleware)
 
         pomp.pump(DummyCrawler())
 
         assert_set_equal(
-            set(req_resp_midlleware.requested_urls),
+            set([r.url.replace(self.httpd.location, '') \
+                for r in collect_middleware.requests]),
             set(self.httpd.sitemap.keys())
         )

tests/test_contrib_twisted.py

-import json
 import logging
 from nose import SkipTest
 from nose.tools import assert_set_equal
 except ImportError:
     raise SkipTest('twisted not installed')
 
-from pomp.core.base import BaseCrawler, BaseDownloaderMiddleware, \
-    BasePipeline, BaseDownloadException
+from pomp.core.base import BaseDownloadException
 from pomp.core.engine import Pomp
 from pomp.contrib.twistedtools import TwistedDownloader, TwistedHttpRequest
-from pomp.core.base import CRAWL_WIDTH_FIRST_METHOD
-from pomp.core.item import Item, Field
 
+from tools import DummyCrawler, RequestResponseMiddleware, \
+    CollectRequestResponseMiddleware, PrintPipeline
 from mockserver import HttpServer, make_sitemap
 
+
 logging.basicConfig(level=logging.DEBUG)
 
 
-class DummyItem(Item):
-    value = Field()
-    url = Field()
-
-    def __repr__(self):
-        return '<DummyItem(%s, %s)>' % (self.url, self.value)
-
-
-class DummyCrawler(BaseCrawler):
-    ENTRY_URL = None
-    CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD
-
-    def __init__(self):
-        super(DummyCrawler, self).__init__()
-
-    def next_url(self, response):
-        res = response.body.get('links', [])
-        return res
-
-    def extract_items(self, response):
-        item = DummyItem()
-        item.value = 1
-        item.url = response.request.url
-        yield item 
-
-
-class RequestResponseMiddleware(BaseDownloaderMiddleware):
-
-    def __init__(self, prefix_url=None):
-        self.prefix_url = prefix_url
-    
-    def process_request(self, url):
-        url = '%s%s' % (self.prefix_url, url) \
-            if self.prefix_url else url
-        return TwistedHttpRequest(url)
-    
-    def process_response(self, response):
-        response.body = json.loads(response.body.decode('utf-8'))
-        return response
-
-
-class CollectRequestResponseMiddleware(BaseDownloaderMiddleware):
-
-    def __init__(self, prefix_url=None):
-        self.requests = []
-        self.responses = []
-        self.exceptions = []
-
-    def process_request(self, request):
-        self.requests.append(request)
-        return request
-
-    def process_response(self, response):
-        self.responses.append(response)
-        return response
-    
-    def process_exception(self, exception):
-        self.exceptions.append(exception)
-        return exception
-
-
-class PrintPipeline(BasePipeline):
-
-    def process(self, crawler, item):
-        print('Pipeline:', item)
-
-
 class TestContribTwisted(object):
 
     @classmethod
     @deferred(timeout=1.0)
     def test_downloader(self):
 
-        req_resp_middleware = RequestResponseMiddleware(prefix_url=self.httpd.location)
+        req_resp_middleware = RequestResponseMiddleware(
+            prefix_url=self.httpd.location,
+            request_factory=TwistedHttpRequest,
+        )
         collect_middleware = CollectRequestResponseMiddleware()
         downloader = TwistedDownloader(reactor, middlewares=[collect_middleware])
 
     @deferred(timeout=1.0)
     def test_exceptions(self):
 
-        req_resp_midlleware = RequestResponseMiddleware(prefix_url='ivalid url')
+        req_resp_middleware = RequestResponseMiddleware(
+            prefix_url='invalid url',
+            request_factory=TwistedHttpRequest,
+        )
         collect_middleware = CollectRequestResponseMiddleware()
 
         downloader = TwistedDownloader(reactor, middlewares=[collect_middleware])
 
-        downloader.middlewares.insert(0, req_resp_midlleware)
+        downloader.middlewares.insert(0, req_resp_middleware)
 
         pomp = Pomp(
             downloader=downloader,
     @deferred(timeout=1.0)
     def test_timeout(self):
 
-        req_resp_midlleware = RequestResponseMiddleware(prefix_url=self.httpd.location)
+        req_resp_middleware = RequestResponseMiddleware(
+            prefix_url=self.httpd.location,
+            request_factory=TwistedHttpRequest,
+        )
         collect_middleware = CollectRequestResponseMiddleware()
 
         downloader = TwistedDownloader(reactor,

tests/test_contrib_urllib.py

-import json
 import logging
 from nose.tools import assert_set_equal, assert_equal
 from pomp.core.base import BaseCrawler, BaseDownloaderMiddleware
 from pomp.core.engine import Pomp
 from pomp.contrib import SimpleDownloader, ThreadedDownloader
-from pomp.core.base import CRAWL_WIDTH_FIRST_METHOD
 
 from mockserver import HttpServer, make_sitemap
+from tools import DummyCrawler
+from tools import RequestResponseMiddleware, CollectRequestResponseMiddleware
+
 
 logging.basicConfig(level=logging.DEBUG)
 
 
-class DummyCrawler(BaseCrawler):
-    ENTRY_URL = None
-    CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD
-
-    def __init__(self):
-        super(DummyCrawler, self).__init__()
-
-    def next_url(self, response):
-        return response.body.get('links', [])
-
-    def extract_items(self, response):
-        return
-
-
-class RequestResponseMiddleware(BaseDownloaderMiddleware):
-
-    def __init__(self, prefix_url=None):
-        self.requested_urls = []
-        self.prefix_url = prefix_url
-    
-    def process_request(self, url):
-        self.requested_urls.append(url)
-        url = '%s%s' % (self.prefix_url, url) \
-            if self.prefix_url else url
-        return url
-    
-    def process_response(self, response):
-        response.body = json.loads(response.body.decode('utf-8'))
-        return response
-
-
 class TestContribUrllib(object):
 
     @classmethod
         cls.httpd.stop()
 
     def test_thread_pooled_downloader(self):
-        req_resp_midlleware = RequestResponseMiddleware(prefix_url=self.httpd.location)
+        req_resp_middleware = RequestResponseMiddleware(
+            prefix_url=self.httpd.location,
+            request_factory=lambda x: x,
+        )
 
-        downloader = ThreadedDownloader()
+        collect_middleware = CollectRequestResponseMiddleware()
+
+        downloader = ThreadedDownloader(
+            middlewares=[collect_middleware]
+        )
 
-        downloader.middlewares.insert(0, req_resp_midlleware)
+        downloader.middlewares.insert(0, req_resp_middleware)
 
         pomp.pump(DummyCrawler())
 
         assert_set_equal(
-            set(req_resp_midlleware.requested_urls),
+            set([r.url.replace(self.httpd.location, '') \
+                for r in collect_middleware.requests]),
             set(self.httpd.sitemap.keys())
         )
 
+
     def test_exception_handling(self):
 
         class CatchException(BaseDownloaderMiddleware):

tests/test_middleware.py

 import logging
 from nose.tools import assert_equal
-from pomp.core.base import BaseCrawler, BaseDownloader
-from pomp.core.base import BaseDownloaderMiddleware, BaseHttpResponse
+from pomp.core.base import BaseDownloaderMiddleware
 
 from pomp.core.engine import Pomp
 
+from tools import DummyDownloader, DummyCrawler
+
 
 logging.basicConfig(level=logging.DEBUG)
 
 
-class DummyCrawler(BaseCrawler):
-    ENTRY_URL = (
-        "http://python.org/1",
-    )
-
-    def next_url(self, response):
-        pass
-
-    def extract_items(self, response):
-        return []
-
-
-class DummyDownloader(BaseDownloader):
-
-    def get(self, requests):
-        for request in requests:
-            response = DummyResponse(request, 'some html code')
-            yield response 
-
-
-class DummyResponse(BaseHttpResponse):
-    
-    def __init__(self, request, response):
-        self.req = request
-        self.resp = response
-
-    @property
-    def request(self):
-        return self.req
-
-    @property
-    def response(self):
-        return self.response 
-
-
 class RaiseOnRequestMiddleware(BaseDownloaderMiddleware):
     def process_request(self, request):
         raise Exception('Some exception on Request')

tests/test_simple_crawler.py

 import logging
 from nose.tools import assert_equal
-from pomp.core.base import BaseCrawler, BaseDownloader, BasePipeline, \
-    BaseDownloaderMiddleware, BaseHttpRequest, BaseHttpResponse
+from pomp.core.base import BasePipeline
 from pomp.core.base import CRAWL_WIDTH_FIRST_METHOD
 from pomp.core.engine import Pomp
-from pomp.core.item import Item, Field
+
+
+from tools import DummyCrawler, DummyDownloader, DummyRequest
+from tools import RequestResponseMiddleware
 
 
 logging.basicConfig(level=logging.DEBUG)
 
-class DummyItem(Item):
-    value = Field()
-    url = Field()
 
-    def __repr__(self):
-        return '<DummyItem(%s, %s)>' % (self.url, self.value)
+url_to_request_middleware = RequestResponseMiddleware(
+    request_factory=DummyRequest,
+    bodyjson=False
+)
 
 
-class DummyCrawler(BaseCrawler):
+class Crawler(DummyCrawler):
     ENTRY_URL = (
         "http://python.org/1",
         "http://python.org/2"
     )
 
     def __init__(self):
-        super(DummyCrawler, self).__init__()
+        super(Crawler, self).__init__()
         self.crawled_urls = []
 
     def next_url(self, response):
         self.crawled_urls.append(url)
         return result
 
-    def extract_items(self, response):
-        item = DummyItem()
-        item.value = 1
-        item.url = response.request.url
-        yield item
-
-
-class DummyDownloader(BaseDownloader):
-
-    def get(self, requests):
-        for request in requests:
-            response = DummyResponse(request, 'some html code')
-            yield response
-
-
-class DummyRequest(BaseHttpRequest):
-
-    def __init__(self, url):
-        self.request = url
-
-    @property
-    def url(self):
-        return self.request
-
-
-class DummyResponse(BaseHttpResponse):
-    
-    def __init__(self, request, response):
-        self.req = request
-        self.resp = response
-
-    @property
-    def request(self):
-        return self.req
-
-    @property
-    def response(self):
-        return self.response
-
-
-class UrlToRequestMiddleware(BaseDownloaderMiddleware):
-
-    def process_request(self, req):
-        if isinstance(req, BaseHttpRequest):
-            return req
-        return DummyRequest(url=req)
-
-    def process_response(self, response):
-        return response
-
 
 class TestSimpleCrawler(object):
 
         road = RoadPipeline()
 
         pomp = Pomp(
-            downloader=DummyDownloader(middlewares=[UrlToRequestMiddleware()]),
+            downloader=DummyDownloader(middlewares=[url_to_request_middleware]),
             pipelines=[
                 road,
             ],
         )
 
         # Depth first method
-        pomp.pump(DummyCrawler())
+        pomp.pump(Crawler())
 
         assert_equal(set([item.url for item in road.collection]), set([
             'http://python.org/1',
         # Width first method
         road.reset()
 
-        class DummyWidthCrawler(DummyCrawler):
+        class DummyWidthCrawler(Crawler):
             CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD
 
         pomp.pump(DummyWidthCrawler())
         result = []
 
         pomp = Pomp(
-            downloader=DummyDownloader(middlewares=[UrlToRequestMiddleware()]),
+            downloader=DummyDownloader(middlewares=[url_to_request_middleware]),
             pipelines=[
                 IncPipeline(),
                 FilterPipeline(),
             ],
         )
 
-        pomp.pump(DummyCrawler())
+        pomp.pump(Crawler())
 
         assert_equal([(item.url, item.value) for item in result], [
             ('http://python.org/1', 2),
             ('http://python.org/2', 2),
         ])
-

tests/tools.py

+import json
+from pomp.core.base import BaseCrawler, BaseDownloaderMiddleware, BasePipeline
+from pomp.core.base import BaseHttpRequest, BaseHttpResponse
+from pomp.core.base import BaseDownloader
+from pomp.core.base import CRAWL_WIDTH_FIRST_METHOD
+from pomp.core.item import Item, Field
+
+
+class DummyItem(Item):
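+    # Test item carrying a value and the url of the page it came from.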
+    value = Field()
+    url = Field()
+
+    def __repr__(self):
+        return '<DummyItem(%s, %s)>' % (self.url, self.value)
+
+
+class DummyCrawler(BaseCrawler):
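+    # Width-first crawler stub: follows the 'links' list from each
+    # response body and emits one DummyItem per fetched page.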
+    ENTRY_URL = None
+    CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD
+
+    def __init__(self):
+        super(DummyCrawler, self).__init__()
+
+    def next_url(self, response):
+        return response.body.get('links', [])
+
+    def extract_items(self, response):
+        item = DummyItem()
+        item.value = 1
+        item.url = response.request.url
+        yield item
+
+
+class DummyDownloader(BaseDownloader):
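+    # Fake downloader: answers every request with a canned
+    # DummyResponse instead of touching the network.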
+
+    def get(self, requests):
+        for request in requests:
+            response = DummyResponse(request, 'some html code')
+            yield response
+
+
+class RequestResponseMiddleware(BaseDownloaderMiddleware):
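+    # Builds requests from plain urls: prepends prefix_url when given,
+    # wraps the result with the injected request_factory and, when
+    # bodyjson is set, decodes JSON response bodies.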
+
+    def __init__(self, request_factory, prefix_url=None, bodyjson=True):
+        self.bodyjson = bodyjson
+        self.prefix_url = prefix_url
+        self.request_factory = request_factory
+    
+    def process_request(self, url):
+        url = '%s%s' % (self.prefix_url, url) \
+            if self.prefix_url else url
+        return self.request_factory(url)
+    
+    def process_response(self, response):
+        if self.bodyjson:
+            response.body = json.loads(response.body.decode('utf-8'))
+        return response
+
+
+class CollectRequestResponseMiddleware(BaseDownloaderMiddleware):
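+    # Records every request, response and exception passing through,
+    # so tests can assert on the collected traffic afterwards.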
+
+    def __init__(self):
+        self.requests = []
+        self.responses = []
+        self.exceptions = []
+
+    def process_request(self, request):
+        self.requests.append(request)
+        return request
+
+    def process_response(self, response):
+        self.responses.append(response)
+        return response
+    
+    def process_exception(self, exception):
+        self.exceptions.append(exception)
+        return exception
+
+
+class PrintPipeline(BasePipeline):
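+    # Debug pipeline that simply prints each item it processes.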
+
+    def process(self, crawler, item):
+        print('Pipeline:', item)
+
+
+class DummyRequest(BaseHttpRequest):
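+    # Bare request object that only carries a url.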
+
+    def __init__(self, url):
+        self.request = url
+
+    @property
+    def url(self):
+        return self.request
+
+
+class DummyResponse(BaseHttpResponse):
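+    # Bare response wrapper exposing the originating request and the
+    # raw response payload.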
+    
+    def __init__(self, request, response):
+        self.req = request
+        self.resp = response
+
+    @property
+    def request(self):
+        return self.req
+
+    @property
+    def response(self):
+        return self.resp