Commits

Chris Adams committed b6594f7

Added a default timeout to Retriever and Spider

This makes it easy for things like check_site.py to have a command-line option to change the request timeout.

Comments (0)

Files changed (2)

bin/check_site.py

     parser = optparse.OptionParser(__doc__.strip())
 
     parser.add_option("--max-connections", type="int", default="2", help="Set the number of simultaneous connections to the remote server(s)")
+    parser.add_option("--timeout", type="int", default="15", help="Set the number of seconds to wait for a request to load")
     parser.add_option("--format", dest="report_format", default="text", help='Generate the report as HTML or text')
     parser.add_option("-o", "--report", "--output", dest="report_file", default=sys.stdout, help='Save report to a file instead of stdout')
     parser.add_option("--follow-offsite-redirects", action="store_true", default=False, help="Follow redirects which lead to outside servers to check for 404s")
             logging.critical("Cannot perform HTML validation. Try `pip install pytidylib` or see http://countergram.com/software/pytidylib")
             sys.exit(42)
 
-    spider = QASpider(validate_html=options.validate_html, max_simultaneous_connections=options.max_connections)
+    spider = QASpider(
+        validate_html=options.validate_html,
+        max_simultaneous_connections=options.max_connections,
+        default_request_timeout=options.timeout
+    )
     spider.skip_media = options.skip_media
     spider.skip_resources = options.skip_resources
     spider.follow_offsite_redirects = options.follow_offsite_redirects

webtoolbox/clients.py

         request_args = {
                 "follow_redirects": False,
                 "max_redirects": 5,
-                "request_timeout": 15
+                "request_timeout": self.default_request_timeout
         }
         request_args.update(kwargs)
 
     # servers but you might want to check them for reporting purposes:
     follow_offsite_redirects = False
 
+    #: This is the default time in seconds which we'll wait to receive a response:
+    default_request_timeout = 15
+
     #: All urls processed by this spider as a URL-keyed list of :class:URLStatus elements
     site_structure = defaultdict(URLStatus)
     url_history = set()
 
     redirect_map = {}
 
-    def __init__(self, log_name="Spider", **kwargs):
+    def __init__(self, log_name="Spider", default_request_timeout=15, **kwargs):
         """Create a new Spider, optionally with a custom logging name"""
         super(Spider, self).__init__(**kwargs)
 
+        self.default_request_timeout = default_request_timeout
+
         self.log = logging.getLogger(log_name)
 
         self.response_processors.append(self.process_page)