Commits

Evgeniy Tatarkin committed aab1f01

pep8 cleanup

  • Parent commit 9a1d250

Files changed (16)

File examples/livejournal.py

 # -*- coding: utf-8 -*-
 """
     LiveJournal friends of http://grrm.livejournal.com/ user
-    
+
     requires: lxml
 
     store csv data to /tmp/friends.csv
 
     def process_response(self, response):
         if self.encoding:
-            response.tree = html.fromstring(response.body.decode(self.encoding))
+            response.tree = html.fromstring(
+                response.body.decode(self.encoding))
         else:
             response.tree = html.fromstring(response.body)
         return response
     )
 
     FRIENDS_XPATH = '//dl[contains(@data-widget-options, "socconns")]' \
-    '/dd[@class="b-profile-group-body"]' \
-    '/div[@class="b-tabs-content"]/a'
+                    '/dd[@class="b-profile-group-body"]' \
+                    '/div[@class="b-tabs-content"]/a'
 
     def __init__(self, max_level=2, friends_limit=2):
         """LiveJournal spider
         self._next_requests = []
         super(LJFriendSpider, self).__init__()
 
-
     def extract_items(self, response):
         items = []
         k = 0
             item.username = i.text
 
             # associate parsed user with hist friend from parent request
-            item.friend_to = response.request.username 
+            item.friend_to = response.request.username
             items.append(item)
 
             # follow to item.username
-            if response.request.level < self.max_level and k < self.friend_limit:
+            if response.request.level < self.max_level \
+                    and k < self.friend_limit:
                 # generate new url to follow
                 url = i.get('href') + self.QS
                 self._next_requests.append(FriendLevelRequest(
         return items
 
     def next_requests(self, response):
-        # when users parsed pomp call next_url method for getting next targets
+        # when users parsed pomp call next_url method
+        # for getting next targets
         def _urls():
             if self._next_requests:
                 yield self._next_requests.pop()

File examples/pythonnews.py

 
 
 logging.basicConfig(level=logging.DEBUG)
-news_re = re.compile(r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')
+news_re = re.compile(
+    r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')
 
 
 class PythonNewsItem(Item):
             yield item
 
     def next_requests(self, response):
-        return None # one page crawler
+        return None  # one page crawler
 
 
 class PrintPipeline(BasePipeline):

File pomp/contrib/__init__.py

 """
-Simple downloaders and middlewares for fetching data by standard 
+Simple downloaders and middlewares for fetching data by standard
 `urlopen` function from `urllib` package for python3.x
 or `urllib2` for python2.7+
 """
 
 class SimpleDownloader(BaseDownloader):
     """Simplest downloader
-    
+
     :param timeout: request timeout in seconds
     """
 
 
 
 class UrllibHttpRequest(Request, BaseHttpRequest):
-    """Adapter for urllib request to :class:`pomp.core.base.BaseHttpRequest`""" 
+    """Adapter for urllib request to :class:`pomp.core.base.BaseHttpRequest`"""
 
     @property
     def url(self):
 
 
 class UrllibHttpResponse(BaseHttpResponse):
-    """Adapter for urllib response to :class:`pomp.core.base.BaseHttpResponse`""" 
+    """Adapter for urllib response to
+    :class:`pomp.core.base.BaseHttpResponse`"""
 
     def __init__(self, request, response):
         self.req = request
 
 
 class UrllibAdapterMiddleware(BaseDownloaderMiddleware):
-    """Middlerware for adapting urllib.Request 
+    """Middlerware for adapting urllib.Request
     to :class:`pomp.core.base.BaseHttpRequest`
     """
 

File pomp/contrib/concurrenttools.py

 """
-Concurrent downloaders and middlewares for fetching urls by standard 
+Concurrent downloaders and middlewares for fetching urls by standard
 `concurrent` package for python3
 """
 import logging
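
In the tests of this changeset ConcurrentUrllibDownloader appears to be used as a drop-in replacement for SimpleDownloader on python3. A sketch, assuming only the constructor style shared with the other downloaders; any worker-pool options are not visible in this diff.

    # Hypothetical drop-in replacement for SimpleDownloader (python3 only)
    from pomp.core.engine import Pomp
    from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader

    pomp = Pomp(
        downloader=ConcurrentUrllibDownloader(),
        pipelines=[],
    )
    # pomp.pump(OnePageCrawler())  # crawler from the previous sketch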

File pomp/contrib/pipelines.py

 """
 Simple pipelines
-""" 
+"""
 import csv
 import codecs
 from pomp.core.base import BasePipeline
         # no close file if it not opened by this instance
         self._need_close = False
 
-
     def start(self, crawler):
         if isstring(self.output_file):
             self.csvfile = codecs.open(self.output_file, 'w', encoding='utf-8')
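
The csv pipeline above follows the BasePipeline lifecycle: open resources in start(), handle each extracted item, release resources in stop(). A custom pipeline might look like the sketch below; the name and (crawler, item) signature of the per-item hook are assumptions, as that part of the interface is not shown in this diff.

    # Hypothetical pipeline sketch; the process() hook is assumed
    import codecs

    from pomp.core.base import BasePipeline


    class FileDumpPipeline(BasePipeline):

        def __init__(self, output_file):
            self.output_file = output_file
            self.fd = None

        def start(self, crawler):
            # open the target file once, before crawling starts
            self.fd = codecs.open(self.output_file, 'w', encoding='utf-8')

        def process(self, crawler, item):  # assumed hook name and signature
            self.fd.write(u'%s\n' % item)
            return item

        def stop(self, crawler):
            # close the file after the crawler finishes
            self.fd.close()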

File pomp/contrib/twistedtools.py

 from twisted.web.http_headers import Headers
 
 from pomp.core.base import BaseDownloader, BaseHttpRequest, BaseHttpResponse, \
-    BaseDownloadException 
+    BaseDownloadException
 from pomp.core.utils import iterator
 
 
         # Set timeout to request
         # on timeout will be errorBack with CancelledError
         watchdog = self.reactor.callLater(self.timeout, d.cancel)
+
         def _reset_timeout(res):
             if watchdog.active():
                 watchdog.cancel()
 
 class TwistedHttpRequest(BaseHttpRequest):
     """Adapter for twisted request to :class:`pomp.core.base.BaseHttpRequest`
-    
-    Map params to twisted.web.client.Agent().request(method, url, headers, data)
+
+    Map params to
+    twisted.web.client.Agent().request(method, url, headers, data)
 
     :param url: request url
     :param data: request data
     :param header: request headers
     :param method: request method
-    """ 
+    """
     def __init__(self, url, data=None, headers=None, method='GET'):
         self._url = url.encode('utf-8')
         self.data = StringProducer(urllib.urlencode(data)) if data else None
 
 
 class TwistedHttpResponse(BaseHttpResponse):
-    """Adapter for twisted request to :class:`pomp.core.base.BaseHttpResponse`""" 
+    """Adapter for twisted request to
+    :class:`pomp.core.base.BaseHttpResponse`"""
     def __init__(self, request, response):
         self.req = request
         self.resp = response
 
     @property
     def response(self):
-        return self.resp 
+        return self.resp
 
 
 class SimpleReceiver(protocol.Protocol):
     def __init__(s, d):
-        s.buf = ''; s.d = d
+        s.buf = ''
+        s.d = d
 
     def dataReceived(s, data):
         s.buf += data
 
     def connectionLost(s, reason):
-        # TODO: test if reason is twisted.web.client.ResponseDone, if not, do an errback
-        s.d.callback(s.buf) 
+        # TODO: test if reason is twisted.web.client.ResponseDone,
+        # if not, do an errback
+        s.d.callback(s.buf)
 
 
 class StringProducer(object):
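
A rough sketch of driving the engine through the Twisted downloader; the constructor arguments mirror the tests in this changeset, while the crawler, the url, and the use of the deferred returned by pump() to stop the reactor are assumptions.

    # Hypothetical Twisted wiring sketch, not part of this commit
    from twisted.internet import reactor

    from pomp.core.base import BaseCrawler
    from pomp.core.engine import Pomp
    from pomp.contrib.twistedtools import TwistedDownloader, TwistedHttpRequest


    class SinglePageCrawler(BaseCrawler):
        # the request is built explicitly, so no adapter middleware is needed
        ENTRY_REQUESTS = TwistedHttpRequest('http://python.org/news')

        def extract_items(self, response):
            return []  # nothing extracted in this sketch

        def next_requests(self, response):
            return None


    pomp = Pomp(
        downloader=TwistedDownloader(reactor, timeout=5),
        pipelines=[],
    )

    # pump() returns a deferred fired when crawling stops
    pomp.pump(SinglePageCrawler()).add_callback(lambda _: reactor.stop())
    reactor.run()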

File pomp/core/base.py

     - Extract next urls for following processing from response
 
     Each crawler must have starting point - entry url.
-    To set entry url declare them as class attribute ``ENTRY_REQUESTS`` like that::
+    To set entry url declare them as class attribute ``ENTRY_REQUESTS``
+    like that::
 
         class MyGoogleCrawler(BaseCrawler):
             ENTRY_REQUESTS = 'http://google.com/'
     ``ENTRY_REQUESTS`` may be list of urls or list of requests
     (instances of :class:`BaseHttpRequest`).
 
-    Crawler may choose which method for crawling to use by setting class 
+    Crawler may choose which method for crawling to use by setting class
     attribute ``CRAWL_METHOD`` with values:
 
     - ``depth first`` is pomp.core.base.CRAWL_DEPTH_FIRST_METHOD (default)
 
     def next_requests(self, page):
         """Getting next requests for processing.
- 
+
         Called after `extract_items` method.
 
         :param page: the instance of :class:`BaseHttpResponse`
         :param url: the url of downloaded page
         :param page: the instance of :class:`BaseHttpResponse`
         :rtype: one data item or list of items
-        """    
+        """
         raise NotImplementedError()
 
     def is_depth_first(self):
 
 class BaseDownloader(object):
     """Downloader interface
-     
-    Downloader must resolve one main task - execute request 
+
+    Downloader must resolve one main task - execute request
     and fetch response.
 
     :param middlewares: list of middlewares, instances
                 try:
                     request = middleware.process_request(request)
                 except Exception as e:
-                    log.exception('Exception on process %s by %s',
-                        request, middleware)
-                    request = None # stop processing request by middlewares
+                    log.exception(
+                        'Exception on process %s by %s', request, middleware)
+                    request = None  # stop processing request by middlewares
                     self._process_exception(
                         BaseDownloadException(request, exception=e)
                     )
                 try:
                     response = getattr(middleware, func)(response)
                 except Exception as e:
-                    log.exception('Exception on process %s by %s',
-                        response, middleware)
-                    response = None # stop processing response by middlewares
+                    log.exception(
+                        'Exception on process %s by %s', response, middleware)
+                    response = None  # stop processing response by middlewares
                     self._process_exception(
                         BaseDownloadException(response, exception=e)
                     )
             if response and not is_error:
                 return callback(crawler, response)
 
-        crawler.dive(len(requests)) # dive in
+        crawler.dive(len(requests))  # dive in
         for response in self.get(requests):
-            if isinstance(response, defer.Deferred): # async behavior
+            if isinstance(response, defer.Deferred):  # async behavior
                 def _(res):
-                    crawler.dive(-1) # dive out
+                    crawler.dive(-1)  # dive out
                     return res
                 response.add_callback(_)
                 response.add_callback(_process_resp)
                 yield response
-            else: # sync behavior
-                crawler.dive(-1) # dive out
+            else:  # sync behavior
+                crawler.dive(-1)  # dive out
                 yield _process_resp(response)
 
     def _process_exception(self, exception):
             try:
                 value = middleware.process_exception(exception)
             except Exception:
-                log.exception('Exception on prcess %s by %s',
-                    exception, middleware)
-            if not value: # stop processing exception
+                log.exception(
+                    'Exception on prcess %s by %s', exception, middleware)
+            if not value:  # stop processing exception
                 break
 
     def get(self, requests):
         """Execute requests
 
         :param requests: urls or instances of :class:`BaseHttpRequest`
-        :rtype: instances of :class:`BaseHttpResponse` or 
+        :rtype: instances of :class:`BaseHttpResponse` or
                 :class:`BaseDownloadException` or deferred for async behavior
         """
         raise NotImplementedError()
     - filter items
     - change items
     - store items
-    """ 
+    """
 
     def start(self, crawler):
         """Initialize pipe
-        
+
         Open files and database connections etc.
-        
+
         :param crawler: crawler who extract items
         """
         pass
 
     def stop(self, crawler):
         """Finalize pipe
-        
+
         Close files and database connections etc.
-        
+
         :param crawler: crawler who extract items
         """
         pass
                 execution of this request
         """
         return request
- 
+
     def process_response(self, response):
         """Change response before it will be sent to crawler for exctracting
         items
         :param response: instance of :class:`BaseHttpResponse`
         :rtype: changed response or ``None`` to skip
                 processing of this response
-        """ 
+        """
         return response
 
     def process_exception(self, exception):
         :param exception: instance of :class:`BaseDownloadException`
         :rtype: changed response or ``None`` to skip
                 processing of this exception
-        """ 
+        """
         return exception
 
 
 
 class BaseDownloadException(Exception):
     """Download exception interface
-    
+
     :param request: request raises this exception
     :param exception: original exception
     """

File pomp/core/engine.py

 Engine
 """
 import logging
-import itertools
 
 import defer
-
 from pomp.core.utils import iterator, DeferredList
 
 
     - Downloader implementation with middlewares
     - Item pipelines
     - Crawler
-    
+
     :param downloader: :class:`pomp.core.base.BaseDownloader`
-    :param pipelines: list of item pipelines :class:`pomp.core.base.BasePipeline`
+    :param pipelines: list of item pipelines
+                      :class:`pomp.core.base.BasePipeline`
     """
 
     def __init__(self, downloader, pipelines=None):
                 if not self.stoped and not crawler.in_process():
                     self._stop(crawler)
 
-            return None # end of recursion
+            return None  # end of recursion
         else:
             return urls
 
     def pump(self, crawler):
         """Start crawling
-        
+
         :param crawler: crawler to execute :class:`pomp.core.base.BaseCrawler`
         """
 
         return self.stop_deferred
 
     def _call_next_requests(self, next_requests, crawler):
-        deferreds = [n for n in next_requests if n and isinstance(n, defer.Deferred)]
-        if deferreds: # async behavior
+        deferreds = [
+            n for n in next_requests if n and isinstance(n, defer.Deferred)]
+        if deferreds:  # async behavior
             d = DeferredList(deferreds)
             d.add_callback(self._on_next_requests, crawler)
-        else: # sync behavior
+        else:  # sync behavior
             self._on_next_requests(next_requests, crawler)
 
     def _on_next_requests(self, next_requests, crawler):
             pipe.stop(crawler)
 
         log.info('Stop crawler: %s', crawler)
-        self.stop_deferred.callback(None) 
+        self.stop_deferred.callback(None)
 
File setup.py

 setup(
     name='pomp',
-    version = ":versiontools:pomp:",
+    version=":versiontools:pomp:",
     url='http://bitbucket.org/estin/pomp',
     license='BSD',
     author='Evgeniy Tatarkin',
     packages=find_packages(),
     zip_safe=False,
     platforms='any',
-    tests_require=['nose >= 1.0',],
+    tests_require=['nose >= 1.0', ],
     test_suite='nose.collector',
-    setup_requires=['versiontools >= 1.8',],
-    install_requires=['defer',],
+    setup_requires=['versiontools >= 1.8', ],
+    install_requires=['defer', ],
     classifiers=[
         'Intended Audience :: Developers',
         'License :: OSI Approved :: BSD License',

File tests/mockserver.py

         response = sitemap_app.sitemap.get(requested_url)
     log.debug('Requested url: %s, response: %s', requested_url, response)
     ret = [response.encode('utf-8')]
-    return ret 
+    return ret
 
 
 def make_reponse_body(items, links):
             child_url = '%s/%s' % (entry, child)
             make_entry(child_url, sitemap, links_on_page if level > 1 else 0)
             make_sitemap(level=level - 1, sitemap=sitemap, entry=child_url)
-            
+
     return sitemap
 
 
     def start(self):
         log.debug('Start http server: %s', self)
         self.process.start()
-    
+
     def stop(self):
         log.debug('Stop http server: %s', self)
         self.process.terminate()

File tests/test_contrib_concurrent.py

 try:
     from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader
 except ImportError:
-    raise SkipTest('concurrent future not available') 
+    raise SkipTest('concurrent future not available')
 
 from tools import DummyCrawler
 from tools import RequestResponseMiddleware, CollectRequestResponseMiddleware
         pomp.pump(Crawler())
 
         assert_set_equal(
-            set([r.url.replace(self.httpd.location, '') \
+            set([r.url.replace(self.httpd.location, '')
                 for r in collect_middleware.requests]),
             set(self.httpd.sitemap.keys())
         )

File tests/test_contrib_pipelines.py

     field4 = Field()
 
 
-
 class TestContribPipelines(object):
 
     def test_csv_pipeline(self):
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL
             )
-            
+
             # prepare pipe
             pipe.start(None)
 

File tests/test_contrib_twisted.py

             request_factory=TwistedHttpRequest,
         )
         collect_middleware = CollectRequestResponseMiddleware()
-        downloader = TwistedDownloader(reactor, middlewares=[collect_middleware])
+        downloader = TwistedDownloader(
+            reactor, middlewares=[collect_middleware])
 
         downloader.middlewares.insert(0, req_resp_middleware)
 
 
         def check(x):
             assert_set_equal(
-                set([r.url.replace(self.httpd.location, '') \
-                    for r in  collect_middleware.requests]),
+                set([r.url.replace(self.httpd.location, '')
+                    for r in collect_middleware.requests]),
                 set(self.httpd.sitemap.keys())
             )
 
         req_resp_middleware = RequestResponseMiddleware(
             prefix_url='invalid url',
             request_factory=TwistedHttpRequest,
-        ) 
+        )
         collect_middleware = CollectRequestResponseMiddleware()
 
-        downloader = TwistedDownloader(reactor, middlewares=[collect_middleware])
+        downloader = TwistedDownloader(
+            reactor, middlewares=[collect_middleware])
 
         downloader.middlewares.insert(0, req_resp_middleware)
 
         )
 
         class Crawler(DummyCrawler):
-            ENTRY_REQUESTS = '/root' 
+            ENTRY_REQUESTS = '/root'
 
         done_defer = defer.Deferred()
         d = pomp.pump(Crawler())
 
         def check(x):
             assert len(collect_middleware.exceptions) == 1
-            assert isinstance(collect_middleware.exceptions[0], 
-                BaseDownloadException)
+            assert isinstance(
+                collect_middleware.exceptions[0], BaseDownloadException)
 
         done_defer.addCallback(check)
-        return done_defer 
-
+        return done_defer
 
     @deferred(timeout=1.0)
     def test_timeout(self):
         )
         collect_middleware = CollectRequestResponseMiddleware()
 
-        downloader = TwistedDownloader(reactor,
+        downloader = TwistedDownloader(
+            reactor,
             timeout=0.5,
             middlewares=[collect_middleware]
         )

File tests/test_contrib_urllib.py

         pomp.pump(Crawler())
 
         assert_set_equal(
-            set([r.url.replace(self.httpd.location, '') \
+            set([r.url.replace(self.httpd.location, '')
                 for r in collect_middleware.requests]),
             set(self.httpd.sitemap.keys())
         )
 
-
     def test_exception_handling(self):
 
         class CatchException(BaseDownloaderMiddleware):
 
         catch_exception_middleware = CatchException()
         pomp = Pomp(
-            downloader=SimpleDownloader(middlewares=[catch_exception_middleware]),
+            downloader=SimpleDownloader(
+                middlewares=[catch_exception_middleware]),
             pipelines=[],
         )
 

File tests/test_middleware.py

 
 class RaiseOnResponseMiddleware(BaseDownloaderMiddleware):
     def process_response(self, response):
-        raise Exception('Some exception on Response') 
+        raise Exception('Some exception on Response')
 
 
 class RaiseOnExceptionMiddleware(BaseDownloaderMiddleware):
     def process_response(self, response):
         self.responses.append(response)
         return response
-    
+
     def process_exception(self, exception):
         self.exceptions.append(exception)
-        return exception 
+        return exception
 
 
 def test_exception_on_processing_request():
 
     assert_equal(len(collect_middleware.exceptions), 1)
     assert_equal(len(collect_middleware.requests), 1)
-    assert_equal(len(collect_middleware.responses), 1) 
+    assert_equal(len(collect_middleware.responses), 1)
 
 
 def test_exception_on_processing_exception():

File tests/tools.py

 from pomp.core.base import BaseHttpRequest, BaseHttpResponse
 from pomp.core.base import BaseDownloader
 from pomp.core.base import CRAWL_WIDTH_FIRST_METHOD
-from pomp.core.item import Item, Field 
+from pomp.core.item import Item, Field
 
 
 class DummyItem(Item):
         item = DummyItem()
         item.value = 1
         item.url = response.request.url
-        yield item 
+        yield item
 
 
 class DummyDownloader(BaseDownloader):
         self.bodyjson = bodyjson
         self.prefix_url = prefix_url
         self.request_factory = request_factory
-    
+
     def process_request(self, url):
         url = '%s%s' % (self.prefix_url, url) \
             if self.prefix_url else url
         return self.request_factory(url)
-    
+
     def process_response(self, response):
         if self.bodyjson:
             response.body = json.loads(response.body.decode('utf-8'))
     def process_response(self, response):
         self.responses.append(response)
         return response
-    
+
     def process_exception(self, exception):
         self.exceptions.append(exception)
         return exception
 
 
 class DummyResponse(BaseHttpResponse):
-    
+
     def __init__(self, request, response):
         self.req = request
         self.resp = response
 
     @property
     def response(self):
-        return self.response 
+        return self.response