Chris Adams committed 8b7faa4

Added an option to follow offsite redirects

Comments (0)

Files changed (2)

bin/check_site.py

 
 """
 
-from cgi import escape
 from collections import defaultdict
 
 import logging
 import optparse
-import os
 import re
 import sys
 import time
     parser.add_option("--max-connections", type="int", default="2", help="Set the number of simultaneous connections to the remote server(s)")
     parser.add_option("--format", dest="report_format", default="text", help='Generate the report as HTML or text')
     parser.add_option("-o", "--report", "--output", dest="report_file", default=sys.stdout, help='Save report to a file instead of stdout')
+    parser.add_option("--follow-offsite-redirects", action="store_true", default=False, help="Follow redirects which lead to outside servers to check for 404s")
     parser.add_option("--validate-html", action="store_true", default=False, help="Validate HTML using tidylib")
     parser.add_option("--skip-media", action="store_true", default=False, help="Skip media files: <img>, <object>, etc.")
     parser.add_option("--skip-resources", action="store_true", default=False, help="Skip resources: <script>, <link>")
             sys.exit(42)
 
     spider = QASpider(validate_html=options.validate_html, max_simultaneous_connections=options.max_connections)
-    spider.skip_media     = options.skip_media
+    spider.skip_media = options.skip_media
     spider.skip_resources = options.skip_resources
+    spider.follow_offsite_redirects = options.follow_offsite_redirects
 
     if options.skip_link_re:
         i = options.skip_link_re

webtoolbox/clients.py

     # links or simply record them.
     allowed_hosts = set()
 
+    #: This flag controls whether we'll follow redirects to pages which are
+    #: not in allowed_hosts. It defaults to off to avoid hammering third-party
+    #: servers, but you might want to check them for reporting purposes.
+    follow_offsite_redirects = False
+
     #: All urls processed by this spider as a URL-keyed list of :class:URLStatus elements
     site_structure = defaultdict(URLStatus)
     url_history = set()
         if url != request.url:
             if not parsed_url.netloc or parsed_url.netloc in self.allowed_hosts:
                 self.queue(url)
+            elif self.follow_offsite_redirects:
+                self.queue(url)
             else:
-                # TODO: Add an option to follow links for off-site redirect validation
                 self.log.info("Not following external redirect from %s to %s", request.url, url)
             return
         elif response.error:
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.