Commits

Lynn Rees committed 59ec3bf

- web reporting framework

  • Parent commits c74b4b2

Files changed (1)
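
The diff below puts width before depth in the web crawl signatures (web width
now defaults to 200 instead of 100), tracks external redirects, and adds report
accessors plus pretty-printing formatters. A minimal sketch of the new reporting
surface, assuming the module imports as spider (its file name is not shown in
this diff) and that webreport is bound at module level like the other names in
__all__:

    import spider  # hypothetical module name; not shown in this diff

    # Raw report: good URLs, (referrer, bad URL) pairs, unparsed HTML, redirects
    good, bad, badhtml, redirects = spider.webreport('http://example.org/',
                                                     width=50, depth=3)

    # Formatted report: written to 'file' when given, else returned as a string
    spider.prettywebreport('http://example.org/', width=50, depth=3,
                           file='report.txt')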

 __author__ = 'L.C. Rees (xanimal@users.sf.net)'
 __all__ = ['badurls', 'badhtml', 'ftpurls', 'ftppaths', 'weburls',
     'ftpmirror', 'ftpspider', 'webpaths', 'webreport', 'webmirror',
-    'webspider', 'urlresolve']
+    'webspider', 'redirects', 'prettyurls', 'prettybadurls', 'prettybadhtml',
+    'prettyredirects', 'prettywebreport']
 
 '''This module provides FTP and Web spiders and mirroring utilities in one
 convenient module. FTP sites are crawled by walking a remote directory tree.
         from threading import Thread as _thread
         from threading import BoundedSemaphore as _lock
     except ImportError: pass
-    _weballow = ('HTTP', 'http', 'FTP', 'ftp')
 
     def __init__(self):
         self._bflist, self._bdlist, self._robot = None, None, None        
         
-    def _ftpconnect(self, url, name='anonymous', password=None, attempts=3):
+    def _ftpopen(self, url, name='anonymous', password=None, attempts=3):
         '''Returns an FTP client session
 
         Arguments:
         if path: session.cwd(path)
         return session
 
-    def ftpmirror(self, u, r=None, t=None, d=6, w=200, n='anonymous', p=None):
+    def ftpmirror(self, u, r=None, w=200, d=6, n='anonymous', p=None, t=None):
         '''Crawls an FTP site and mirrors its content on a local filesystem
         
         Arguments:
         n -- FTP user name (default: 'anonymous')
         p -- FTP password (default: None)'''
         # Pass ftpspider and root to base '_mirror' function
-        return self._mirror(self.ftpspider(u, d, w, n, p), r, t)
+        return self._mirror(self.ftpspider(u, w, d, n, p), r, t)
 
-    def ftppaths(self, url, depth=6, width=200, name='anonymous', pw=None):
+    def ftppaths(self, url, width=200, depth=6, name='anonymous', pw=None):
         '''Crawls an FTP site and returns a list of paths.
         
         Arguments:
 
             Arguments:
             rdir -- remote directory list'''
-            rlist = list()
+            rlist = []
+            rappend = rlist.append
             for rl in rdir:
                 # Split remote file based on whitespace
                 ri = rl.split()[-1]
                 # Add tuple of remote item type, permissions & name to rlist
-                if ri not in ('.', '..'): rlist.append((rl[0], rl[7], ri))
+                if ri not in ('.', '..'): rappend((rl[0], rl[7], ri))
             return rlist
        
         def visitftp():
             wd = pwd()
             if wd[-1] != '/': wd = '/'.join([wd, ''])
             # Add working directory to list of visited directories
-            dlist[wd], rlist = 1, list()
+            dlist[wd], rlist = 1, []
             # Get list of current directory's contents
             retr('LIST -a', rlist.append)
             for url in sortftp(rlist):
                             visitftp()
 
         # Set depth and clear visited file and directory dictionaries
-        depth, flist, dlist, self._base = depth + 1, dict(), dict(), url 
+        depth, flist, dlist, self._base = depth + 1, {}, {}, url 
         # Connect to FTP site
-        ftp = self._ftpconnect(url, name, pw)        
+        ftp = self._ftpopen(url, name, pw)        
         # Avoid outside namespace lookups
         cwd, pwd, retr, self._session = ftp.cwd, ftp.pwd, ftp.retrlines, ftp
         # Walk FTP site
         self._plist = flist.keys()
         return self._plist
 
-    def ftpspider(self, url, depth=6, width=200, name='anonymous', pw=None):
+    def ftpspider(self, url, width=200, depth=6, name='anonymous', pw=None):
         '''Crawls an FTP site and returns lists of full URLs and paths
         
         Arguments:
         name -- FTP user name (default: 'anonymous')
         pw -- FTP password (default: None)'''
         # Return list of paths (ftppaths) and full URLs (ftpurls) 
-        return ftppaths(url, depth, width, name, pw), ftpurls(), self._session
+        return ftppaths(url, width, depth, name, pw), ftpurls(), self._session
 
-    def ftpurls(self, url=None, depth=6, width=200, name='anonymous', pw=None):
+    def ftpurls(self, url=None, width=200, depth=6, name='anonymous', pw=None):
         '''Crawls an FTP site and returns a list of full URLs
         
         Arguments:
         name -- FTP user name (default: 'anonymous')
         pw -- FTP password (default: None)'''
         # Run ftppaths if URL passed as argument
-        if url: ftppaths(url, depth, width, name, pw)
+        if url: ftppaths(url, width, depth, name, pw)
         # Return a list of full URLs
         base, plist = self._base, self._plist
         plist.sort()
         return [''.join([base, i]) for i in plist]
 
-    def _webfactory(self, use=None):
+    def _webfactory(self):
         '''Returns the needed class as an object
 
         Arguments:
             def reset(self):
                 '''Resets SGML parser and clears lists'''
                 Spider._sparser.reset(self)
-                self.urls, self.text, self.badurl = list(), list(), None
+                self.urls, self.text, self.badurl = [], [], None
         
             def start_a(self, attrs):
                 '''Collects URLs from a tags'''
             def reset(self):
                 '''Resets the SGML parser and clears the list text'''
                 Spider._sparser.reset(self)
-                self.text = list()        
+                self.text = []        
 
         def badurl(url):
             '''Runs URL opening and parsing for the BadUrl class.
             Arguments:                
             url -- Bad URL to open and parse'''
             # Read URL 
-            parser.feed(ulib.urlopen(url).read())
+            parser.feed(urlopen(url).read())
             parser.close()
             return parser.text
-                
-        base, ulib, uparse = self._base, self._ulib, self._uparse
-        parser, self._bflist, self._bdlist = BadUrl(), list(), list()
+
+        # Make globals local                
+        base, self._bflist, urljoin = self._base, [], self._uparse.urljoin
+        parser, urlopen, self._bdlist = BadUrl(), self._ulib.urlopen, []
         # Generate random string of jibber
         from string import letters, digits
         from random import choice, randint
         jibber = ''.join([letters, digits])
         ru = ''.join([choice(jibber) for x in range(randint(1, 30))]) 
         # Builds bad URL signature of a bad file URL
-        self._bflist.extend(badurl(uparse.urljoin(base, '%s.html' % ru)))
+        self._bflist.extend(badurl(urljoin(base, '%s.html' % ru)))
         parser.reset()
         # Builds bad URL signature of a bad directory URL
-        self._bdlist.extend(badurl(uparse.urljoin(base,'%s/' % ru)))        
+        self._bdlist.extend(badurl(urljoin(base,'%s/' % ru)))        
 
     def _urlresolve(self, urllist, base):
         '''Resolve full URL relative to base URL
         Arguments:
         urllist -- list of extracted URLs
         base -- base URL'''
-        uparse, vlist, webopen = self._uparse, self._vlist, self._webopen
-        sr, sb, weballow = self._sb, base.split('/'), self._weballow
-        cache, depth = self._cache, self._depth
-        # Get root domain of 'base'
-        try: tbase = base.replace(sb[-1], '')
-        except ValueError: pass
+        # Make globals local
+        cache, visited, webopen = self._cache, self._visited, self._webopen
+        sr, depth, urljoin = self._sb[2], self._depth, self._uparse.urljoin
+        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
+        tbase, redirect = base.replace(base.split('/')[-1], ''), self._redirect
         for url in urllist:
-            # Ensures URL hasn't been processed by checking vlist
-            if url not in vlist:                
+            if url not in visited:
                 if url.find(' ') != -1:
-                    vlist[url], url = 1, url.replace(' ', '')
-                    if url in vlist: continue
-                # Remove fragments i.e. "http:foo/bar#frag"
+                    visited[url], url = None, url.replace(' ', '')
+                    if url in visited: continue
+                # Remove fragments i.e. 'http:foo/bar#frag'
                 if url.find('#') != -1:
-                    vlist[url], url = 1, uparse.urldefrag(url)[0]
-                    if url in vlist: continue
-                # If URL is not from root domain, block it
+                    visited[url], url = None, urldefrag(url)[0]
+                    if url in visited: continue               
                 if url.find(':') != -1:
-                    urlseg = uparse.urlsplit(url[0])
-                    if urlseg[0] not in weballow:
-                        vlist[url] = 1
+                    urlseg = urlsplit(url)
+                    # Block non-FTP, HTTP URLs
+                    if urlseg[0] not in ['HTTP', 'http', 'FTP', 'ftp']:
+                        visited[url] = None
                         continue
-                    if urlseg[1] not in sr[2]:
-                        vlist[url] = 1
+                    # If URL is not from root domain, block it
+                    if urlseg[1] not in sr:
+                        visited[url] = None
                         continue
-                    elif not urlseg[2] and urlseg[1] == sr[2]:
-                        vlist[url] = 1
+                    # Block duplicate root URLs
+                    elif not urlseg[2] and urlseg[1] == sr:
+                        visited[url] = None
                         continue
-                # Handle relative URLs without '.' or '..'
+                # Handle relative URLs
                 elif url.find(':') == -1:
-                    vlist[url] = 1
                     # Join root domain and relative URL
-                    url = uparse.urljoin(tbase, url)
-                    if url in vlist: continue
-                if url not in vlist:
-                    rurl = webopen((url, base))
-                    if rurl and rurl[0] not in vlist:
-                        vlist[url], vlist[rurl[0]] = 1, 1
-                        if rurl[0] != url:
-                            urlseg = uparse.urlsplit(rurl[0])
-                            if urlseg[1] not in sr[2]: continue
-                            elif not urlseg[2] and urlseg[1] == sr[2]: continue
-                        if len(rurl[0].split('/')) >= depth: continue
-                        else:
-                            cache[rurl[0]] = rurl[1]
-                            yield rurl[0], base
+                    visited[url], url = None, urljoin(tbase, url)
+                    if url in visited: continue
+                rurl = webopen((url, base))                 
+                if rurl and rurl[0] not in visited:
+                    visited[url], turl = None, rurl[0]
+                    visited[turl] = None
+                    if turl != url:
+                        urlseg = urlsplit(turl)
+                        if urlseg[1] not in sr:
+                            redirect[(url, turl)] = None
+                            continue
+                        elif not urlseg[2] and urlseg[1] == sr: continue
+                    if len(turl.split('/')) >= depth: continue
                     else:
-                        vlist[url] = 1
-                        continue
+                        cache[turl] = rurl[1]
+                        yield turl
+                else:
+                    visited[url] = None
+                    continue
 
     def _webopen(self, base):
-        vlist, fudict = self._vlist, self._fudict        
+        visited, good = self._visited, self._good        
         try:
             # If webspiders can access URL, open it
             if self._robot:
             else: url = self._ulib.urlopen(base[0])            
         # If HTTP error, log bad URL and abort
         except IOError:
-            vlist[base[0]] = 1
-            self._badurls.append((base[1], base[0]))
+            visited[base[0]] = None
+            self._bad.append((base[1], base[0]))
             return False
         tbase = url.geturl()
         if tbase != base[0]:                
-            if base[0] in fudict:
-                del fudict[base[0]]
-                fudict[tbase] = 1
+            if base[0] in good:
+                del good[base[0]]
+                good[tbase] = None
             base = (tbase, base[1])    
         # URLs with mimetype 'text/html" scanned for links
         if url.headers.type == 'text/html':
             try: parser.feed(url.read())
             # Log URL if HTML so broken SGML parser can't parse it 
             except self._sperror:
-                vlist[base[0]] = 1
+                visited[base[0]] = None
                 self._badhtml.append(base[0])
                 return False
             url.close()
             # If the URL is bad (after BadUrl), stop processing and log URL
             if not parser.badurl: return base[0], parser.urls
             else:
-                vlist[base[0]] = 1
-                self._badurls.append((base[1], base[0]))
+                visited[base[0]] = None
+                self._bad.append((base[1], base[0]))
                 return False
         else:
             url.close()
-            return base[0], list()        
+            return base[0], []        
                 
     def _webextract(self, base):
         '''Extracts URLs from HTML documents and puts them in a list
 
         base -- base URL searched for references to other URLS'''
         # Avoid outside namespace lookups
-        cache = self._cache
+        cache, urlresolve = self._cache, self._urlresolve
         #if base[0] in cache:
-        urls = cache.get(base[0])
-        del cache[base[0]]
+        urls = cache.get(base)
+        del cache[base]
         #else: urls = self._webopen(base)[1]; print base
         if urls:
             # Final and potential list of all URLs 
-            for url in self._urlresolve(urls, base[0]): yield url
+            for url in urlresolve(urls, base): yield url
         else: return
 
     def _webwalk(self, source):
 
         Arguments:                
         source -- source to search for URLs'''
-        for goodurl in self._webextract(source):
+        webextract, webwalk = self._webextract, self._webwalk
+        for goodurl in webextract(source):
             if goodurl:
                 yield goodurl
-                for childurl in self._webwalk(goodurl): yield childurl
+                for childurl in webwalk(goodurl): yield childurl
 
-    def weburls(self, base, depth=5, width=100):
+    def weburls(self, base, width=200, depth=5):
         '''Crawls a URL via HTTP and returns a full list of child URLs.
         
         Arguments:
         base -- base URL that is going to be crawled
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
-        self._vlist, self._fudict, self._badurls, self._cache = {}, {}, [], {}
-        self._badhtml, uparse, fudict = [], self._uparse, self._fudict
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
+        # Initialize crawl state; localize the good URL dictionary
+        self._visited, self._good, self._cache, self._bad = {}, {}, {}, [] 
+        self._redirect, good, self._badhtml = {}, self._good, []
         # Resolve real base URL (no redirection aliases)
-        tbase = self._webopen((base, ''))
-        base, self._cache[tbase[0]] = tbase[0], tbase[1]
+        uparse, tbase = self._uparse, self._webopen((base, ''))
+        if tbase[0] != base: base = tbase[0]
         # Ensure there's a trailing '/' in base URL
         if base[-1] != '/':
             url = list(uparse.urlsplit(base))
             url[1] = ''.join([url[1], '/'])
             base = uparse.urlunsplit(url)
-        self._base, self._vlist[base], self._sb = base, 1, base.split('/')
-        self._depth, fudict[(base, '')] = depth + len(self._sb), 1
+        self._cache[base], self._visited[base] = tbase[1], None
+        self._base, self._sb, webwalk = base, base.split('/'), self._webwalk
+        self._depth, good[base] = depth + len(self._sb), None
         # Set robot limits
         self._robot = self._robotparser(''.join([base, 'robots.txt']))
         self._robot.read()
         self._webtest()
         # Get good URLs
         try:
-            for item in self._webwalk((base, '')):
-                # Extend fulist as long as max resource count not exceeded
+            for item in webwalk(base):
+                # Extend good as long as max resource count not exceeded
                 if item:
-                    if len(fudict) <= width: fudict[item] = 1
-                    elif len(fudict) >= width: break
+                    if len(good) <= width: good[item] = None
+                    elif len(good) >= width: break
         except KeyboardInterrupt: pass
-        self._fulist = [i[0] for i in fudict.keys()]
+        self._fulist = good.keys()
         self._fulist.sort()
         return self._fulist
     
-    def webpaths(self, base=None, depth=5, width=100):
+    def webpaths(self, base=None, width=200, depth=5):
         '''Crawls an HTTP site and returns a list of child paths.
         
         Arguments:
-        base -- base URL to crawled
+        base -- base URL to crawl
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
+        width -- how many remote resources to crawl (default: 200)'''
 
                 # Remove base URL from path list
                 url = url.replace(self._base, '')
                 # Verify removal of base URL and remove it if found
-                if url.find('http:') != -1: url = uparse.urlsplit(url)[2:][0]
+                if url.find('http:') != -1: url = urlsplit(url)[2:][0]
                 yield url
 
-        uparse = self._uparse
+        urlsplit = self._uparse.urlsplit
         # Run weburls if base is passed as an argument
-        if base: self.weburls(base, depth, width)
+        if base: self.weburls(base, width, depth)
         # Strip off trailing resource or query from base URL
         # Return path list after stripping base URL
         return list(pathize())
         
-    def webmirror(self, base, root=None, t=None, depth=5, width=100):
+    def webmirror(self, base, root=None, width=200, depth=5, t=None):
         '''Crawls an HTTP site and mirrors its contents on local filesystem
 
         Arguments:
         t -- number of threads to spawn (default: None)
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
+        width -- how many remote resources to crawl (default: 200)'''
-        return self._mirror(self.webspider(base, depth, width), root, t)
+        return self._mirror(self.webspider(base, width, depth), root, t)
     
-    def webspider(self, base, depth=5, width=100):
-        '''Crawls a URL via HTTP and returns two lists of child URLs.
+    def webspider(self, base, width=200, depth=5):
+        '''Crawls a URL via HTTP and returns two lists of child URLs
         
         Arguments:
         base -- base URL to crawl
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
+        width -- how many remote resources to crawl (default: 200)'''
-        self.weburls(base, depth, width)
+        self.weburls(base, width, depth)
         pulist = self.webpaths()
         pulist.sort()
         return pulist, self._fulist
 
-    def badurls(self, base, depth=5, width=100):
-        '''Crawls a URL via HTTP and returns a lisk of bad child URLs.
+    def badurls(self, base, width=200, depth=5):
+        '''Crawls a URL via HTTP and returns a list of bad child URLs
         
         Arguments:
         base -- base URL to crawl
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
+        width -- how many remote resources to crawl (default: 200)'''
-        self.weburls(base, depth, width)
-        return [' -> '.join([i[0], i[1]]) for i in self._badurls]
+        self.weburls(base, width, depth)
+        return self._bad
 
-    def badhtml(self, base, depth=5, width=100):
-        '''Crawls a URL via HTTP and returns a lisk of bad child URLs.
+    def badhtml(self, base, width=200, depth=5):
+        '''Crawls a URL via HTTP and returns a list of URLs with unparsable HTML
         
         Arguments:
         base -- base URL to crawl
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
-        self.weburls(base, depth, width)
-        return self._badhtml.keys()
+        width -- how many remote resources to crawl (default: 200)'''
+        self.weburls(base, width, depth)
+        return self._badhtml
 
-    def webreport(self, base, depth=5, width=100):
-        '''Returns a tuple containing good URLs, bad URLs, and bad HTML
+    def redirects(self, base, width=200, depth=5):
+        '''Returns source and target URL pairs for external redirects
         
         Arguments:
         base -- base URL to crawl
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
+        width -- how many remote resources to crawl (default: 200)'''
+        self.weburls(base, width, depth)
+        return self._redirect    
+
+    def webreport(self, base, width=200, depth=5):
+        '''Returns a tuple containing good URLs, bad URLs, bad HTML
+        and redirects
+        
+        Arguments:
+        base -- base URL to crawl
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
-        self.weburls(base, depth, width)
+        self.weburls(base, width, depth)
-        return self._fulist, self._badurls, self._badhtml
+        return self._fulist, self._bad, self._badhtml, self._redirect
+
+    def prettybadurls(self, base, width=200, depth=5, file=None):
+        '''Pretties up a list of bad URLs
+        
+        Arguments:
+        base -- base URL to crawl
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)
+        file -- output file for report (default: None)'''
+        import time
+        bad = self.badurls(base, width, depth)
+        headstring = 'Bad URLs under %s as of %s\n'
+        header = headstring % (self._base, time.ctime())
+        body = '\n'.join([' -> '.join([i[0], i[1]]) for i in bad])
+        report = '\n'.join([header, body])
+        if file: open(file, 'w').write(report)
+        else: return report
+
+    def prettybadhtml(self, base, width=200, depth=5, file=None):
+        '''Pretties up a list of unparsed HTML URLs
+        
+        Arguments:
+        base -- base URL to crawl
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)
+        file -- output file for report (default: None)'''
+        import time
+        badhtml = self.badhtml(base, width, depth)
+        headstring = 'Unparsed HTML under %s as of %s\n'
+        header =  headstring % (self._base, time.ctime())
+        body = '\n'.join(badhtml)        
+        report = '\n'.join([header, body])
+        if file: open(file, 'w').write(report)
+        else: return report
+
+    def prettyredirects(self, base, width=200, depth=5, file=None):
+        '''Pretties up a list of URLs redirected to an external URL
+        
+        Arguments:
+        base -- base URL to crawl
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)
+        file -- output file for report (default: None)'''
+        import time
+        redirects = self.redirects(base, width, depth)
+        headstring = 'Externally redirected URLs under %s as of %s\n\n'
+        header = headstring % (self._base, time.ctime())
+        body = '\n'.join([' -> '.join([i[0], i[1]]) for i in redirects])
+        report = '\n'.join([header, body])
+        if file: open(file, 'w').write(report)
+        else: return report
+
+    def prettyurls(self, base, width=200, depth=5, file=None):
+        '''Pretties up a list of all URLs under a URL
+        
+        Arguments:
+        base -- base URL to crawl
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)
+        file -- output file for report (default: None)'''
+        import time
+        urls = self.weburls(base, width, depth)
+        headstring = 'Good URLs under %s as of %s\n'
+        header =  headstring % (self._base, time.ctime())
+        body = '\n'.join(urls)
+        report = '\n'.join([header, body])
+        if file: open(file, 'w').write(report)
+        else: return report
+
+    def prettywebreport(self, base, width=200, depth=5, file=None):
+        '''Pretties up a list of all gathered information under a URL
+        
+        Arguments:
+        base -- base URL to crawl
+        width -- how many remote resources to crawl (default: 200)
+        depth -- how deep in HTTP hierarchy to crawl (default: 5)
+        file -- output file for report (default: None)'''
+        import time
+        urls, badurls, badhtml, redirects = self.webreport(base, width, depth)
+        basetime = (self._base, time.ctime())
+        buhstring = 'Bad URLs under %s as of %s\n'
+        bhhstring = 'Unparsed HTML under %s as of %s\n'        
+        rhstring = 'Externally redirected URLs under %s as of %s\n'
+        ustring = 'Good URLs under %s as of %s\n'
+        buh, bhh = buhstring % basetime, bhhstring % basetime
+        rh, uh = rhstring % basetime, ustring % basetime
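+        # Section bodies: URL pairs joined with ' -> ', URL lists by newline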
+        bub = '\n'.join([' -> '.join([i[0], i[1]]) for i in badurls])
+        rb = '\n'.join([' -> '.join([i[0], i[1]]) for i in redirects])
+        bhb, ub = '\n'.join(badhtml), '\n'.join(urls)
+        report = '\n'.join([buh, bub, bhh, bhb, rh, rb, uh, ub])
+        if file: open(file, 'w').write(report)
+        else: return report
 
     def _hthread(self, source):
         '''Recursively extracts all URLs within a base URL
             lock.release()
             return threat        
 
-        Thread, pool, lock = self._thread, list(), self._mlock
-        fudict, hthread = self._fudict, self._hthread
+        Thread, pool, lock = self._thread, [], self._mlock
+        good, hthread = self._good, self._hthread
         newurls = self._textract(source)
         # If URL is good and newurls returns a list of good URLs...
-        if self._width >= len(fudict):
+        if self._width >= len(good):
             if newurls:
                 lock.acquire()
-                fudict[source[0]] = 1
+                good[source[0]] = 1
                 lock.release()
                 while newurls: pool.append(hethread(newurls.popitem()))
                 for thread in pool: thread.start()
 
         base -- base URL searched for references to other URLS'''
         # Avoid outside namespace lookups        
-        resolve, vlist, depth = self._urlresolve, self._vlist, self._depth
+        resolve, visited, depth = self._urlresolve, self._visited, self._depth
         lock = self._mlock
         # Test if robots can download
         try:            
         # If HTTP error, log as bad URL and abort.
         except IOError:
             lock.acquire()
-            vlist[base[0]] = 1           
-            self._badurls.append((base[1], base[0]))
+            visited[base[0]] = 1           
+            self._bad.append((base[1], base[0]))
             lock.release()
             return False
         # Only URLs with mimetype 'text/html" are processed
             # Log URL if HTML so broken SGML parser can't parse it 
             except self._sperror:
                 lock.acquire()
-                vlist[base[0]] = 1
+                visited[base[0]] = 1
                 self._badhtml.append(base[0])
                 lock.release()
                 return False 
             # If the URL is bad (after BadUrl), stop processing and log URL
             if parser.badurl:
                 lock.acquire()
-                vlist[base[0]] = 1
-                self._badurls.append((base[1], base[0]))
+                visited[base[0]] = 1
+                self._bad.append((base[1], base[0]))
                 lock.release()
                 return False
             # Otherwise assume the URL is good...
             else:
                 # Final and potential list of all URLs 
-                fdict = dict()
+                fdict = {}
                 lock.acquire()
                 for url in resolve(parser.urls, base[0]):
                     # Block URLS that exceed allowed depth
-                    if len(url[0].split('/')) >= depth: vlist[url[0]] = 1
+                    if len(url[0].split('/')) >= depth: visited[url[0]] = 1
                     # Put others in final list
-                    elif url not in vlist:
+                    elif url not in visited:
                         # Ensure visited URLs are never visited again
-                        vlist[url[0]] = 1
+                        visited[url[0]] = 1
                         # Add processed URL to final list of URLs
                         fdict[url[0]] = url[1]
                 lock.release()
         # If a URL does not have the MIME type "text/html", return
         else: return False
 
-    def webthread(self, base, depth=5, width=100):
+    def webthread(self, base, width=200, depth=5):
         '''Crawls a URL via HTTP and returns a full list of child URLs.
         
         Arguments:
         base -- base URL that is going to be crawled
         depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
+        width -- how many remote resources to crawl (default: 200)'''
-        self._vlist, self._fudict, self._badurls = {}, {}, []
-        uparse, fudict, self._badhtml = self._uparse, self._fudict, []       
+        self._visited, self._good, self._bad = {}, {}, []
+        uparse, good, self._badhtml = self._uparse, self._good, []       
         # Resolve real base URL (no redirection aliases)
         base, self._mlock = self._ulib.urlopen(base).geturl(), _lock(9)
         # Ensure there's a trailing '/' in base URL
             url = list(uparse.urlsplit(base))
             url[1] = ''.join([url[1], '/'])
             base = uparse.urlunsplit(url)
-        fudict[base] = 1
-        self._base, self._vlist[base], self._sb = base, 1, base.split('/')
+        good[base] = 1
+        self._base, self._visited[base], self._sb = base, 1, base.split('/')
         self._depth, self._width = depth + len(self._sb), width
         # Set robot limits
         self._robot = self._robotparser(''.join([base, 'robots.txt']))
         self._webtest()
         # Get good URLs
         self._hthread((base, ''))
-        self._fulist = fudict.keys()
+        self._fulist = good.keys()
         self._fulist.sort()
-        return self._fulist, self._badurls
+        return self._fulist, self._bad
 
     def _mirror(self, lists, root=None, threads=None):
         '''Mirrors a site on a local filesystem based on lists passed to it
                 # Open local file
                 local = open(np, 'wb')
                 # Download using FTP session
-                ftp = ftpconnect(base, name, password)
+                ftp = ftpopen(base, name, password)
                 ftp.retrbinary('RETR %s' % op, local.write)
                 ftp.close()
                 # Close local file
         ulib, makedirs, sep = self._ulib, self._os.makedirs, self._os.sep
         normcase, split = self._path.normcase, self._path.split
         exists, isdir = self._path.exists, self._path.isdir
-        ftpconnect = self._ftpconnect
+        ftpopen = self._ftpopen
         # Create local names for thread class and thread pool
-        if threads: Thread, pool = self._thread, list()
+        if threads: Thread, pool = self._thread, []
         # Localize name and password if exists
         try: base, name, password = self._base, self._name, self._password
         except AttributeError: pass
 ftpspider = _inst.ftpspider
 webmirror = _inst.webmirror
 webspider = _inst.webspider
-webthread = _inst.webthread
+webthread = _inst.webthread
+redirects = _inst.redirects
+prettyurls = _inst.prettyurls
+prettybadurls = _inst.prettybadurls
+prettybadhtml = _inst.prettybadhtml
+prettyredirects = _inst.prettyredirects
+prettywebreport = _inst.prettywebreport
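
A companion sketch of the raw accessors, under the same module-name assumption;
badurls is presumed bound at module level via __all__ (only some bindings are
visible above), and each accessor runs its own crawl:

    import spider  # hypothetical module name

    # badurls returns (referring URL, bad URL) pairs gathered during the crawl
    for referrer, badurl in spider.badurls('http://example.org/', width=50,
                                           depth=3):
        print '%s -> %s' % (referrer, badurl)

    # redirects is keyed by (source URL, external target URL) pairs
    for source, target in spider.redirects('http://example.org/', width=50,
                                           depth=3):
        print '%s -> %s' % (source, target)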