Commits

Lynn Rees committed 5f48859

- added more tuning and comments

  • Parent commits fd2482f

Files changed (1)

 #! /usr/bin/env python
 
-## Copyright (c) 1999 - 2002 L. C. Rees.  All rights reserved.
+## Copyright (c) 1999 - 2003 L. C. Rees.  All rights reserved.
 ## See COPYRIGHT file for license terms.
 
 __name__ = 'spider'
     'badurlreport', 'badhtmlreport', 'redirectreport', 'outsidereport',
     'unhttpreport']
 
-'''Provides FTP and Website crawling, reporting, and mirroring in one
-convenient module.'''
+'''FTP and Web crawling, reporting, and mirroring in one convenient module.'''
 
 from __future__ import generators
 
     import urllib as _ulib
     import urlparse as _uparse
     from os import path as _path 
-    from ftplib import FTP as _ftp      
+    from ftplib import FTP as _ftp
+    from time import strftime as _formtime
+    from time import localtime as _localtime
     from ftplib import error_perm as _ftperr
     from sgmllib import SGMLParser as _sparser
     from sgmllib import SGMLParseError as _sperror
     except ImportError: pass
 
     def __init__(self, base=None, width=None, depth=None):
-        '''Initializes a Spider instance and base values
+        '''Initializes a Spider instance and its base attributes
 
         Arguments:
         base -- URL to crawl (default: None)
-        width -- maximum resources to crawl (default: None) 
+        width -- maximum resources to crawl (default: None)
         depth -- how deep in a hierarchy to crawl (default: None)'''
+        self._bdsig, self._bfsig, self._session = None, None, None
         if base: self.base = base
         else: self.base = None
         if width: self.width = width
         else: self.width = None
         if depth: self.depth = depth
-        else: self.depth = None
-        self._bdsig, self._bfsig, self._session = None, None, None
+        else: self.depth = None       
         
     def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
-        '''Returns an FTP client session
+        '''Returns FTP client session
 
         Arguments:
         base -- FTP server URL
                     import sys
                     sys.exit(0)
 
-        su, ftp, ftperr = self._uparse.urlsplit(base), self._ftp, self._ftperr
-        self._name, self._password = name, password
+        # Assignments
+        su, ftp = self._uparse.urlsplit(base), self._ftp
+        self._name, self._password, ftperr = name, password, self._ftperr
         # Set URL, path, and strip 'ftp://' off
         base, path = su[1], '/'.join([su[2], ''])
         try: session = ftp(base, name, password)
         if path: session.cwd(path)
         return session
 
-    def ftpmirror(self, b=None, l=None, t=None, w=200, d=6, n='anonymous', p=None):
+    def ftpmirror(self, l, t=None, b=None, w=200, d=6, n='anonymous', p=None):
         '''Mirrors an FTP site on a local filesystem
-        
+
         Arguments:
+        l -- local filesystem path
         b -- FTP server URL (default: None)
-        l -- local filesystem path (default: None)
         t -- number of download threads (default: None)
         w -- maximum amount of resources to crawl (default: 200)   
         d -- depth in hierarchy to crawl (default: 6)             
         n -- login username (default: 'anonymous')
         p -- login password (default: None)'''
-        return ftppaths(b, w, d, n, p), ftpurls(), self._session
+        if b: ftppaths(b, w, d, n, p)
+        return self.paths, ftpurls(), self._session
 
     def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns a list of FTP URLs
         d -- depth in hierarchy to crawl (default: 6)               
         n -- login username (default: 'anonymous')
         p -- login password (default: None)'''
-        if b: ftppaths(b, w, d, n, p)
-        # Get rid of trailing '/' in base if present before joining
-        if self.base[-1] == '/': base = self.base[:-1]
-        else: base = self.base
+        if b:
+            ftppaths(b, w, d, n, p)
+            # Get rid of trailing '/' in base if present before joining
+            if b[-1] == '/': base = b[:-1]
+            else: base = b
+        else:           
+            base = self.base
+            # Get rid of trailing '/' in base if present before joining
+            if base[-1] == '/': base = self.base[:-1]
         paths = self.paths
         self.urls = [''.join([base, i]) for i in paths]
         return self.urls
         from string import letters, digits
         from random import choice, randint
         jibber = ''.join([letters, digits])
-        ru = ''.join([choice(jibber) for x in range(randint(1, 30))]) 
-        # Builds bad URL signature of a bad file URL
+        ru = ''.join([choice(jibber) for x in range(randint(1, 30))])
+        # Builds signature of a bad URL for a file
         self._bfsig.extend(badurl(urljoin(base, '%s.html' % ru)))
         parser.reset()
-        # Builds bad URL signature of a bad directory URL
-        self._bdsig.extend(badurl(urljoin(base,'%s/' % ru)))    
+        # Builds signature of a bad URL for a directory
+        self._bdsig.extend(badurl(urljoin(base,'%s/' % ru)))
 
     def _webfactory(self):
         '''Returns UrlExtract instance'''        
                 if img: self.urls.extend(img)
         
             def start_link(self, attrs):
-                '''Collects URLs from link tags'''
-                
+                '''Collects URLs from link tags'''                
                 link = [v for k, v in attrs if k in ('href', 'src')]
                 if link: self.urls.extend(link)
         
 
         return UrlExtract()            
 
-    def _urlresolve(self, urllist, base):
-        '''Resolve full URL relative to base URL
+    def _urlresolve(self, urls, base):
+        '''Yields full URLs resolved relative to a base URL
 
         Arguments:
-        urllist -- list of extracted URLs
-        base -- base URL'''
-        # Make globals local
+        urls -- list of raw URLs
+        base -- referring URL'''
+        # Assignments
         cache, visited, webopen = self._cache, self._visited, self._webopen
         sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
         urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
-        outside, redirs = self.outside, self.redirs
+        outside, redirs, unhttp = self.outside, self.redirs, self.unhttp
+        # Supported protocols
         supported = ['HTTP', 'http', 'HTTPS', 'https', 'FTP', 'ftp']
-        # Prepare base for joining
-        tbase, unhttp = base.replace(base.split('/')[-1], ''), self.unhttp 
-        for url in urllist:
+        # Strip file off base URL for joining
+        tbase = base.replace(base.split('/')[-1], '') 
+        for url in urls:
             if url not in visited:
+                # Remove whitespace from URL
                 if url.find(' ') != -1:
                     visited[url], url = 1, url.replace(' ', '')
                     if url in visited: continue
                 # Remove fragments i.e. 'http:foo/bar#frag'
                 if url.find('#') != -1:
                     visited[url], url = 1, urldefrag(url)[0]
-                    if url in visited: continue               
+                    if url in visited: continue
+                # Process full URLs i.e. 'http://foo/bar'
                 if url.find(':') != -1:
                     urlseg = urlsplit(url)
                     # Block non-FTP, HTTP URLs
                     if urlseg[0] not in supported:
+                        # Log as non-FTP/HTTP URL
                         unhttp[url], visited[url] = 1, 1
                         continue
-                    # If URL is not from root domain, block it
+                    # If URL is not in root domain, block it
                     if urlseg[1] not in sb:
                         visited[url], outside[url] = 1, 1                        
                         continue
                     elif not urlseg[2] and urlseg[1] == sb:
                         visited[url] = 1
                         continue
-                # Handle relative URLs
+                # Handle relative URLs i.e. ../foo/bar
                 elif url.find(':') == -1:
                     # Join root domain and relative URL
                     visited[url], url = 1, urljoin(tbase, url)
                     if url in visited: continue
-                rurl = webopen((url, base))                 
+                # Test URL by attempting to open it
+                rurl = webopen((url, base))
                 if rurl and rurl[0] not in visited:
-                    visited[url], turl = 1, rurl[0]
-                    visited[turl] = 1
+                    # Get URL
+                    turl = rurl[0]
+                    visited[url], visited[turl] = 1, 1
+                    # If URL resolved to a different URL, process it
                     if turl != url:
                         urlseg = urlsplit(turl)
+                        # If URL is not in root domain, block it
                         if urlseg[1] not in sb:
+                            # Log as a redirected internal URL
                             redirs[(url, turl)] = 1
                             continue
+                        # Block duplicate root URLs
                         elif not urlseg[2] and urlseg[1] == sb: continue
+                    # If URL exceeds depth, don't process 
                     if len(turl.split('/')) >= depth: continue
+                    # Otherwise put in cache and yield url
                     else:
                         cache[turl] = rurl[1]
-                        yield turl
-                else:
-                    visited[url] = 1
-                    continue
+                        yield turl               
 
     def _webopen(self, base):
+        '''Verifies URL and returns actual URL and extracted child URLs
+
+        Arguments:
+        base -- tuple containing a URL and its referring URL'''
+        # Assignments
         good = self._good        
         try:
             # If webspiders can access URL, open it            
             if self._robot.can_fetch('*', base[0]):
                 url = self._ulib.urlopen(base[0])
-            else: return False
+            # Otherwise, mark as visited and abort
+            else:
+                self._visited[base[0]] = 1
+                return False
         # If HTTP error, log bad URL and abort
         except IOError:
             self._visited[base[0]] = 1
             self.badurls.append((base[1], base[0]))
             return False
+        # Get real URL
         tbase = url.geturl()
-        if tbase != base[0]:                
-            if base[0] in good:
-                del good[base[0]]
-                good[tbase] = 1
-            base = (tbase, base[1])    
-        # URLs with mimetype 'text/html" scanned for links
+        # Change URL if different from old URL
+        if tbase != base[0]: base = (tbase, base[1])    
+        # URLs with mimetype 'text/html' are scanned for child URLs
         if url.headers.type == 'text/html':
             # Feed parser
             parser = self._webfactory()
             try: parser.feed(url.read())
-            # Log URL if HTML so broken SGML parser can't parse it 
+            # Log URL if SGML parser can't parse it 
             except self._sperror:
                 self._visited[base[0]], self.badhtml[base[0]] = 1, 1
                 return False
             url.close()
             parser.close()            
+            # Return URL and extracted urls if it's good
+            if not parser.badurl: return base[0], parser.urls
             # If the URL is bad (after BadUrl), stop processing and log URL
-            if not parser.badurl: return base[0], parser.urls
             else:
                 self._visited[base[0]] = 1
                 self.badurls.append((base[1], base[0]))
                 return False
+        # Return URL of non-HTML resources and empty list
         else:
             url.close()
-            return base[0], []        
+            return base[0], []
                 
     def _webwalk(self):
-        '''Extracts URLs under a base URL'''
+        '''Yields good URLs from under a base URL'''
+        # Assignments
         webwalk, cache = self._webwalk, self._cache
         width, urlresolve = self.width, self._urlresolve
+        # End processing if cache is empty
         while cache:
+            # Fetch item from cache
             base, urls = cache.popitem()
-            yield base
+            # If item has child URLs, process them and yield good URLs
             if urls:
-                for url in urlresolve(urls, base):
-                    if url: yield url
+                for url in urlresolve(urls, base): yield url
 
     def weburls(self, base=None, width=200, depth=5):
         '''Returns a list of child URLs.
         
-        Arguments: (default: None)
-        base -- base URL that is going to be crawled
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
-        # Make globals local
+        Arguments:
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
+        # Assignments
         self._visited, self._good, self._cache, self.badurls = {}, {}, {}, [] 
         self.redirs, self.outside, self.badhtml, self.unhttp = {}, {}, {}, {}        
-        webwalk, good, self._robot = self._webwalk, self._good, self._rparser()       
-        # Resolve real base URL (no redirection aliases)
+        webwalk, good, self._robot = self._webwalk, self._good, self._rparser()
+        uparse = self._uparse 
+        # Use global base if present
         if not base: base = self.base
-        uparse, tbase = self._uparse, self._webopen((base, ''))
-        if tbase[0] != base: base = tbase[0]
-        # Ensure there's a trailing '/' in base URL
-        if base[-1] != '/':
-            url = list(uparse.urlsplit(base))
-            url[1] = ''.join([url[1], '/'])
-            base = uparse.urlunsplit(url)
-        self._cache[base], self._visited[base], good[base] = tbase[1], 1, 1
-        self.base, self._sb = base, base.split('/') 
+        # Verify URL
+        tbase = self._webopen((base, ''))
+        # If URL is good...
+        if tbase:
+            # Change base URL if different
+            if tbase[0] != base: base = tbase[0]            
+            # Ensure there's a trailing '/' in base URL
+            if base[-1] != '/':
+                url = list(uparse.urlsplit(base))
+                url[1] = ''.join([url[1], '/'])
+                base = uparse.urlunsplit(url)
+            # Put in cache, visited list
+            self._cache[base], self._visited[base] = tbase[1], 1
+            # Store base URL and its split form; add to verified URL list
+            self.base, self._sb, good[base] = base, base.split('/'), 1
+        # If URL is bad, abort and raise error
+        else:
+            raise IOError, "URL does not exist."
+            return False
+        # Assign width
         if self.width and width == 200: width = self.width
-        else: self.width = width
-        if self.depth and depth == 6: self.depth = self.depth + len(self._sb)
+        # Adjust depth to account for length of base URL
+        if self.depth and depth == 6: self.depth += len(self._sb)
         else: self.depth = depth + len(self._sb)
-        # Set robot limits
+        # Get robot limits
         self._robot.set_url(''.join([base, 'robots.txt']))
         self._robot.read()
         # Get signature of bad URL
         self._webtest()
-        # Get good URLs
+        # Get good URLs as long as total width isn't exceeded
         try:
             for item in webwalk():
                 if len(good) <= width: good[item] = 1
                 elif len(good) >= width: break
+        # If user interrupts crawl, return what's done
         except KeyboardInterrupt: pass
+        # Get URLs, sort them, and return list
         self.urls = good.keys()
         self.urls.sort()
         return self.urls
     
     def webpaths(self, base=None, width=200, depth=5):
-        '''Crawls an HTTP site and returns a list of child paths.
+        '''Returns a list of web paths.
         
         Arguments:
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
 
         def pathize():            
-            '''Strips base URL from full URLs to produce paths'''
-            
+            '''Strips base URL from full URLs to produce paths'''            
             for url in urls:
                 # Remove base URL from path list
                 url = url.replace(self.base, '')
-                if not url: url = '/index.html'
+                # Add default name 'index.html' to root URLs and directories
+                if not url: url = 'index.html'
                 elif url[-1] == '/': url = ''.join([url, 'index.html'])
                 # Verify removal of base URL and remove it if found
-                if url.find('http:') != -1: url = urlsplit(url)[2:][0]                
+                if url.find(':') != -1: url = urlsplit(url)[2:][0]                
                 yield url
 
+        # Assignments
         urlsplit = self._uparse.urlsplit
-        # Run weburls if base is passed as an argument
+        # Run weburls if base passed as an argument
         if base: self.weburls(base, width, depth)
         # Strip off trailing resource or query from base URL
         if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1])
         self.paths = list(pathize())
         return self.paths
         
-    def webmirror(self, base=None, root=None, t=None, width=200, depth=5):
+    def webmirror(self, root=None, t=None, base=None, width=200, depth=5):
         '''Mirrors a website on a local filesystem
 
         Arguments:
-        base -- base URL to crawl (default: None)
         root -- local filesystem path (default: None)
         t -- number of threads (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
-        self.webspider(base, width, depth)
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
+        if base: self.webspider(base, width, depth)
         return self._mirror((self.paths, self.urls), root, t)
     
     def webspider(self, base=None, width=200, depth=5):
         '''Returns two lists of child URLs and paths
         
         Arguments:
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
-        return self.weburls(base, width, depth), self.webpaths()
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
+        if base: self.weburls(base, width, depth)
+        return self.webpaths(), self.urls
 
     def badurlreport(self, file=None, base=None, width=200, depth=5):
         '''Pretties up a list of bad URLs
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
+        # Format report if information is available
         if self.badurls:
-            header = 'Broken links under %s on %s:\n'
+            # Number of bad URLs
+            amount = str(len(self.badurls))
+            header = '%s broken URLs under %s on %s:\n'
+            # Print referring URL pointing to bad URL
             body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.badurls])
-            report = self._formatreport(header, body, file)
+            report = self._formatreport(amount, header, body, file)
+            # Return if just getting string
             if report: return report
 
     def badhtmlreport(self, file=None, base=None, width=200, depth=5):
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
+        # Format report if information is available
         if self.badhtml:
-            header = 'Unparsable HTML URLs under %s on %s:\n'
+            amount = str(len(self.badhtml))
+            header = '%s unparsable HTML URLs under %s on %s:\n'
             body = '\n'.join(self.badhtml)
-            report = self._formatreport(header, body, file)
+            report = self._formatreport(amount, header, body, file)
+            # Return if just getting string
             if report: return report
 
     def redirectreport(self, file=None, base=None, width=200, depth=5):
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
+        # Format report if information is available
         if self.redirs:
-            header = 'Redirects to external URLs under %s on %s:\n'
+            amount = str(len(self.redirs))
+            header = '%s redirects to external URLs under %s on %s:\n'
+            # Print referring URL pointing to new URL
             body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs])
-            report = self._formatreport(header, body, file)
+            report = self._formatreport(amount, header, body, file)
+            # Return if just getting string
             if report: return report
 
     def outsidereport(self, file=None, base=None, width=200, depth=5):
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
+        # Format report if information is available
         if self.outside:
-            header = 'Links to external URLs under %s on %s:\n'
+            amount = str(len(self.outside))
+            header = '%s links to external URLs under %s on %s:\n'
             body = '\n'.join(self.outside)
-            report = self._formatreport(header, body, file)
+            report = self._formatreport(amount, header, body, file)
+            # Return if just getting string
             if report: return report            
 
     def unhttpreport(self, file=None, base=None, width=200, depth=5):
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
+        # Format report if information is available
         if self.unhttp:
-            header = 'Non-FTP and non-HTTP URLs under %s on %s:\n'
+            amount = str(len(self.unhttp))
+            header = '%s non-FTP/non-HTTP URLs under %s on %s:\n'
             body = '\n'.join(self.unhttp)
-            report = self._formatreport(header, body, file)
+            report = self._formatreport(amount, header, body, file)
+            # Return if just getting string
             if report: return report
 
     def urlreport(self, file=None, base=None, width=200, depth=5):
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
+        # Format report if information is available
         if self.urls:
-            header = 'Verified URLs under %s on %s:\n'
+            amount = str(len(self.urls))
+            header = '%s verified URLs under %s on %s:\n'
             body = '\n'.join(self.urls)
-            report = self._formatreport(header, body, file)
+            report = self._formatreport(amount, header, body, file)
+            # Return if just getting string
             if report: return report
 
     def webreport(self, file=None, base=None, width=200, depth=5, *vargs):
         
         Arguments:
         file -- output file for report (default: None)
-        base -- base URL to crawl (default: None)        
-        width -- how many remote resources to crawl (default: 200)
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        vargs -- report sections to exclude'''
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)
+        vargs -- report sections to include or exclude
+        To override defaults:
+        To include a section add 'badhtml', 'redirs', 'outside', or 'unhttp'
+        To exclude a section add 'badurls' or "urls"'''
         if base: self.weburls(base, width, depth)
-        badurls, badhtml, redirs, urls, outside, unhttp = 1, 1, 1, 1, 1, 1
-        comp = []
+        # Defaults for report
+        badurls, badhtml, redirs, urls, outside, unhttp = 1, 0, 0, 1, 0, 0
+        # Create compilation list
+        compile = []
+        # Override default report settings if argument is passed to vargs
         for arg in vargs:
             if arg == 'badurls': badurls = 0
-            elif arg == 'badhtml': badhtml = 0
-            elif arg == 'redirs': redirs = 0
+            elif arg == 'badhtml': badhtml = 1
+            elif arg == 'redirs': redirs = 1
             elif arg == 'urls': urls = 0
-            elif arg == 'outside': outside = 0
-            elif arg == 'unhttp': unhttp = 0        
+            elif arg == 'outside': outside = 1
+            elif arg == 'unhttp': unhttp = 1
+        # Compile report
         if badurls:
             badurls = self.badurlreport()
-            if badurls: comp.append(badurls)
+            if badurls: compile.append(badurls)
         if urls:
             urls = self.urlreport()
-            if urls: comp.append(urls)
+            if urls: compile.append(urls)
         if outside:
             outside = self.outsidereport()
-            if outside: comp.append(outside)
+            if outside: compile.append(outside)
         if redirs:
             redirs = self.redirectreport()
-            if redirs: comp.append(redirs)
+            if redirs: compile.append(redirs)
         if badhtml:
             badhtml = self.badhtmlreport()
-            if badhtml: comp.append(badhtml)        
+            if badhtml: compile.append(badhtml)        
         if unhttp:
             unhttp = self.unhttpreport()
-            if unhttp: comp.append(unhttp)
-        report = '\n\n'.join(comp)
+            if unhttp: compile.append(unhttp)
+        # Make report
+        report = '\n\n'.join(compile)
+        # Write to file if argument present
         if file: open(file, 'w').write(report)
+        # Or return string
         else: return report
         
-    def _formatreport(self, header, body, file=None):
+    def _formatreport(self, amount, header, body, file=None):
         '''Generic prettifier with date/time stamper
         
         Arguments:
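+        amount -- number of items being reported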
         header -- title of report
         body -- body of report
         file -- output file for report (default: None)'''
-        from time import strftime, localtime
+        # Get current time
+        localtime, strftime = self._localtime, self._formtime        
         curtime = strftime('%A, %B %d, %Y at %I:%M %p', localtime())
-        header = header % (self.base, curtime)
+        # Make section header
+        header = header % (amount, self.base, curtime)
+        # Add header to body
         report = '\n'.join([header, body])
+        # Write to file if argument present
         if file: open(file, 'w').write(report)
+        # Or return string
         else: return report    
 
     def _mirror(self, lists, root=None, threads=None):
                         for thread in pool:
                             if not thread.isAlive(): pool.remove(thread)
                             
-        
+
 # Instance of Spider enables exporting Spider's methods as standalone functions
 _inst = Spider()
 ftpurls = _inst.ftpurls
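
For context, a minimal usage sketch of the module-level functions set up by the export block above. This is a hypothetical example, not part of the commit: it assumes the module is importable as spider and that weburls and webreport are exported the same way as ftpurls (the diff is truncated at that point); the URLs and the report file name are placeholders. Only the function signatures are taken from the diff above.

    # Hypothetical usage sketch of spider.py's standalone functions
    # (placeholder URLs and file name; signatures from the diff above)
    import spider

    # Crawl a site and collect verified child URLs (base URL, width, depth)
    urls = spider.weburls('http://www.example.com/', 200, 5)

    # Write a combined report to a file; passing None for base reuses the
    # last crawl, and 'badhtml' adds an optional section (see webreport)
    spider.webreport('report.txt', None, 200, 5, 'badhtml')

    # List the URLs of resources on an FTP server (anonymous login)
    ftp = spider.ftpurls('ftp://ftp.example.com/', 200, 6)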