Lynn Rees / psilib

Commits

Lynn Rees committed f734b1c

- another multithreading experiment

  • Parent commits 75d099a
  • Branches default
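
The commit wires a thread count, t, through the public crawl and report methods and adds the _webthreads, _threadwalk, and _threadresolve machinery. A minimal sketch of driving the new entry point, assuming spider.py is importable and using an illustrative target URL (neither is taken from the commit):

    # Sketch only: the import path and target URL are illustrative.
    from spider import Spider

    crawler = Spider()
    # Passing t makes weburls() dispatch to _webthreads(), which runs the
    # crawl in batches of worker threads via _threadwalk(). In this
    # revision the non-threaded path, _singleweb(), is still a stub.
    urls = crawler.weburls('http://example.com/', w=50, d=3, t=4)
    print urls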


Files changed (1)

File spider.py

     # Use threads if available 
     try: from threading import Thread as _thread
     except ImportError: pass
-    # Use faster SGMLParser if available
-    try: from sgmlop import SGMLParser as _newparser
-    except ImportError: from sgmllib import SGMLParser as _oldparser
     
     # HTML tags with URLs
-    _urltags = ['a', 'img', 'link', 'script', 'iframe', 'object', 'embed',
-        'area', 'frame', 'applet', 'input', 'base', 'div', 'layer', 'ilayer',
-        'bgsound']
+    _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
+        'embed':1, 'area':1, 'frame':1, 'applet':1, 'input':1, 'base':1,
+        'div':1, 'layer':1, 'ilayer':1, 'bgsound':1}
     # Supported protocols
-    _supported = ['HTTP', 'http', 'HTTPS', 'https', 'FTP', 'ftp']
+    _supported = {'HTTP':1, 'http':1, 'HTTPS':1, 'https':1, 'FTP':1, 'ftp':1}
     # HTML attributes with URLs
-    _urlattrs = ['href', 'src', 'data']
+    _urlattrs = {'href':1, 'src':1, 'data':1}
 
     def __init__(self, base=None, width=None, depth=None):
         '''Initializes a Spider instance and its base attributes
         Arguments:
         base -- URL to crawl (default: None)
         width -- maximum resources to crawl (default: None)
-        depth -- how deep in a hierarchy to crawl (default: None)'''                       
-        bfsig, bdsig = self._bfsig, self._bdsig
-        urltags, urlattrs = self._urltags, self._urlattrs
-
-        # Use different classes if faster SGML Parser is available
-        if self._newparser:
-
-            class UrlExtract:
-                '''Extracts URLs from a SGMLish document'''            
-                
-                def __init__(self):
-                    '''Resets SGML parser and clears lists'''
-                    self.urls, self.text, self.badurl = [], [], None
-    
-                def handle_data(self, data):
-                    '''Handles non-markup data
-    
-                    Arguments:
-                    data -- non-markup data in a SGMLish document'''            
-                    # Get first 5 lines of non-markup data
-                    if len(self.text) <= 5: self.text.append(data)
-                    # Compare signature of known bad URL to a new web page
-                    if self.text == bfsig: self.badurl = 1
-                    elif self.text == bdsig: self.badurl = 1
-                    
-                def finish_starttag(self, tag, attrs):
-                    '''Extracts URL bearing tags
-    
-                    Arguments:
-                    tag -- tag name
-                    attrs -- dictionary with attributes'''
-                    if tag in urltags:
-                        # Get key, vale in attributes if they match
-                        url = [v for k, v in attrs if k in urlattrs]
-                        if url: self.urls.extend(url)
-
-            class BadUrl:            
-                '''Collects results of intentionally incorrect URLs'''
-
-                def __init__(self):
-                    '''Resets SGML parser and clears lists'''
-                    self.text = []            
-                
-                def handle_data(self, data):
-                    '''Collects lines to profile not found responses'''
-                    # Adds first 5 lines of non-markup data to list "text"
-                    if len(self.text) <= 5: self.text.append(data)
-
-        else:
-            
-            class UrlExtract(self._oldparser):
-                '''Extracts URLs from a SGMLish document'''            
-                
-                def reset(self):
-                    '''Resets SGML parser and clears lists'''
-                    Spider._oldparser.reset(self)
-                    self.urls, self.text, self.badurl = [], [], None
-    
-                def handle_data(self, data):
-                    '''Handles non-markup data
-    
-                    Arguments:
-                    data -- non-markup data in a SGMLish document'''            
-                    # Get first 5 lines of non-markup data
-                    if len(self.text) <= 5: self.text.append(data)
-                    # Compare signature of known bad URL to a new web page
-                    if self.text == bfsig: self.badurl = 1
-                    elif self.text == bdsig: self.badurl = 1
-                    
-                def finish_starttag(self, tag, attrs):
-                    '''Extracts URL bearing tags
-    
-                    Arguments:
-                    tag -- tag name
-                    attrs -- dictionary with attributes'''
-                    if tag in urltags:
-                        # Get key, vale in attributes if they match
-                        url = [v for k, v in attrs if k in urlattrs]
-                        if url: self.urls.extend(url)
-
-            class BadUrl(self._oldparser):            
-                '''Collects results of intentionally incorrect URLs'''
-
-                def reset(self):
-                    '''Resets SGML parser and clears lists'''
-                    Spider._oldparser.reset(self)
-                    self.text = []            
-                
-                def handle_data(self, data):
-                    '''Collects lines to profile not found responses'''
-                    # Adds first 5 lines of non-markup data to list "text"
-                    if len(self.text) <= 5: self.text.append(data)
-                    
+        depth -- how deep in a hierarchy to crawl (default: None)'''                             
         if base: self.base = base
         else: self.base = None
         if width: self.width = width
         else: self.width = None
         if depth: self.depth = depth
-        else: self.depth = None 
-        self._UrlExtract, self._BadUrl = UrlExtract, BadUrl                    
+        else: self.depth = None
         
     def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
         '''Returns FTP client session
                     return session
                 # Too many login attempts? End program
                 elif attempts <= tries:
-                    print 'Permission denied'
+                    raise IOError, 'Permission denied.'
                     import sys
                     sys.exit(0)           
 
         self.urls = [''.join([base, i]) for i in paths]
         return self.urls
 
+    def _classpicker(self, old=None):
+        self._bfsig, self._bdsig = [], []
+        bfsig, bdsig = self._bfsig, self._bdsig
+        urltags, urlattrs = self._urltags, self._urlattrs
+        # Use faster SGMLParser if available
+        try:
+            from sgmlop import SGMLParser as newparser
+            self._newparser = newparser
+        except ImportError:
+            from sgmllib import SGMLParser as _oldparser
+            old = 1
+        # Use different classes if faster SGML Parser is available
+        if old:
+            from sgmllib import SGMLParser as _oldparser
+            old, self._newparser = 1, None
+            class UrlExtract(_oldparser):
+                '''Extracts URLs from a SGMLish document'''
+                def reset(self):
+                    '''Resets SGML parser and clears lists'''
+                    _oldparser.reset(self)
+                    self.urls, self.text, self.badurl = [], [], None
+                def handle_data(self, data):
+                    '''Handles non-markup data'''            
+                    # Get first 5 lines of non-markup data
+                    if len(self.text) <= 5: self.text.append(data)
+                    # Compare signature of known bad URL to a new web page
+                    if self.text == bfsig: self.badurl = 1
+                    elif self.text == bdsig: self.badurl = 1
+                def finish_starttag(self, tag, attrs):
+                    '''Extracts URL bearing tags'''
+                    if tag in urltags:
+                        # Get key, value in attributes if they match
+                        url = [v for k, v in attrs if k in urlattrs]
+                        if url: self.urls.extend(url)
+            class BadUrl(_oldparser):            
+                '''Collects results of intentionally incorrect URLs'''
+                def reset(self):
+                    '''Resets SGML parser and clears lists'''
+                    _oldparser.reset(self)
+                    self.text = []
+                def handle_data(self, data):
+                    '''Collects lines to profile not found responses'''
+                    # Adds first 5 lines of non-markup data to list 'text'
+                    if len(self.text) <= 5: self.text.append(data)
+        else:
+            class UrlExtract:
+                '''Extracts URLs from a SGMLish document'''            
+                def __init__(self):
+                    '''Resets SGML parser and clears lists'''
+                    self.urls, self.text, self.badurl = [], [], None
+                def handle_data(self, data):
+                    '''Handles non-markup data'''            
+                    # Get first 5 lines of non-markup data
+                    if len(self.text) <= 5: self.text.append(data)
+                    # Compare signature of known bad URL to a new web page
+                    if self.text == bfsig: self.badurl = 1
+                    elif self.text == bdsig: self.badurl = 1
+                def finish_starttag(self, tag, attrs):
+                    '''Extracts URL bearing tags'''
+                    if tag in urltags:
+                        # Get key, value in attributes if they match
+                        url = [v for k, v in attrs if k in urlattrs]
+                        if url: self.urls.extend(url)
+            class BadUrl:            
+                '''Collects results of intentionally incorrect URLs'''
+                def __init__(self):
+                    '''Resets SGML parser and clears lists'''
+                    self.text = []            
+                def handle_data(self, data):
+                    '''Collects lines to profile not found responses'''
+                    # Adds first 5 lines of non-markup data to list 'text'
+                    if len(self.text) <= 5: self.text.append(data)
+        self._UrlExtract, self._BadUrl = UrlExtract, BadUrl
+
     def _webtest(self):
         '''Generates signatures for identifying bad URLs'''
 
             return urlget.text
 
         # Make globals local
-        base, self._bfsig, urljoin = self.base, [], self._uparse.urljoin
-        urlopen, self._bdsig, BadUrl = self._ulib.urlopen, [], self._BadUrl
+        base, urljoin = self.base, self._uparse.urljoin
+        urlopen, BadUrl = self._ulib.urlopen, self._BadUrl
         # Generate random string of jibber
         from string import letters, digits
         from random import choice, randint
             urlget.close()
         return urlget.badurl, urlget.urls
 
-    def _urlresolve(self, urls, base):
-        '''Returns a full URL relative to a base URL
-
-        Arguments:
-        urls -- list of raw URLs
-        base -- referring URL'''
-        # Assignments
-        cache, visited, webopen = self._cache, self._visited, self._webopen
-        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
-        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
-        outside, redirs, unhttp = self.outside, self.redirs, self.unhttp
-        supported = self._supported
-        # Strip file off base URL for joining
-        tbase = base.replace(base.split('/')[-1], '') 
-        for url in urls:
-            if url not in visited:
-                # Remove whitespace from URL
-                if url.find(' ') != -1:
-                    visited[url], url = 1, url.replace(' ', '')
-                    if url in visited: continue
-                # Remove fragments i.e. 'http:foo/bar#frag'
-                if url.find('#') != -1:
-                    visited[url], url = 1, urldefrag(url)[0]
-                    if url in visited: continue
-                # Process full URLs i.e. 'http://foo/bar
-                if url.find(':') != -1:
-                    urlseg = urlsplit(url)
-                    # Block non-FTP, HTTP URLs
-                    if urlseg[0] not in supported:
-                        # Log as non-FTP/HTTP URL
-                        unhttp[url], visited[url] = 1, 1
-                        continue
-                    # If URL is not in root domain, block it
-                    if urlseg[1] not in sb:
-                        visited[url], outside[url] = 1, 1                        
-                        continue
-                    # Block duplicate root URLs
-                    elif not urlseg[2] and urlseg[1] == sb:
-                        visited[url] = 1
-                        continue
-                # Handle relative URLs i.e. ../foo/bar
-                elif url.find(':') == -1:
-                    # Join root domain and relative URL
-                    visited[url], url = 1, urljoin(tbase, url)
-                    if url in visited: continue
-                # Test URL by attempting to open it
-                rurl = webopen((url, base))
-                if rurl and rurl[0] not in visited:
-                    # Get URL
-                    turl, rawurls = rurl
-                    visited[url], visited[turl] = 1, 1
-                    # If URL resolved to a different URL, process it
-                    if turl != url:
-                        urlseg = urlsplit(turl)
-                        # If URL is not in root domain, block it
-                        if urlseg[1] not in sb:
-                            # Log as a redirected internal URL
-                            redirs[(url, turl)] = 1
-                            continue
-                        # Block duplicate root URLs
-                        elif not urlseg[2] and urlseg[1] == sb: continue
-                    # If URL exceeds depth, don't process 
-                    if len(turl.split('/')) >= depth: continue
-                    # Otherwise put in cache and yield url
-                    else:
-                        newurls = {}
-                        # Eliminate duplicates
-                        for rawurl in rawurls:
-                            # Eliminated known visited URLs
-                            if rawurl not in visited: newurls[rawurl] = 1
-                        # Put new urls in cache if present
-                        if newurls: cache[turl] = newurls
-                        yield turl
-
     def _webopen(self, base):
         '''Verifies URL and returns actual URL and extracted child URLs
 
         else:
             url.close()
             return cbase, []
-                
-    def _webwalk(self):
-        '''Yields good URLs from under a base URL'''
-        # Assignments
-        cache, urlresolve = self._cache, self._urlresolve
-        # End processing if cache is empty
-        while cache:
-            # Fetch item from cache
-            base, urls = cache.popitem()
-            # If item has child URLs, process them and yield good URLs
-            if urls:
-                for url in urlresolve(urls, base): yield url
 
-    def weburls(self, base=None, width=200, depth=5):
-        '''Returns a list of child URLs.
-        
-        Arguments:
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        # Assignments
-        self._visited, self._good, self._cache, self.badurls = {}, {}, {}, []
-        self.redirs, self.outside, self.badhtml, self.unhttp = {}, {}, {}, {}
-        webwalk, good, self._robot = self._webwalk, self._good, self._rparser()
-        uparse = self._uparse 
-        # Use global base if present
-        if not base: base = self.base
-        # Verify URL
-        tbase =  self._webopen((base, ''))
-        # If URL is good...
-        if tbase:
-            # Change base URL if different
-            if tbase[0] != base: base = tbase[0]            
-            # Ensure there's a trailing '/' in base URL
-            if base[-1] != '/':
-                url = list(uparse.urlsplit(base))
-                url[1] = ''.join([url[1], '/'])
-                base = uparse.urlunsplit(url)
-            # Put in cache, visited list
-            self._cache[base], self._visited[base] = tbase[1], 1
-            # Make base URL, get split, and put in verified URL list
-            self.base, self._sb, good[base] = base, base.split('/'), 1
-        # If URL is bad, abort and raise error
-        else:
-            raise IOError, "URL is invalid"
-        # Assign width
-        if self.width and width == 200: width = self.width
-        # Adjust dept to length of base URL
-        if self.depth and depth == 6: self.depth += len(self._sb)
-        else: self.depth = depth + len(self._sb)
-        # Get robot limits
-        self._robot.set_url(''.join([base, 'robots.txt']))
-        self._robot.read()
-        # Get signature of bad URL
-        self._webtest()
-        # Get good URLs as long as total width isn't exceeded
-        try:
-            for item in webwalk():
-                if len(good) <= width: good[item] = 1
-                elif len(good) >= width: break
-        # If user interrupts crawl, return what's done
-        except KeyboardInterrupt: pass
-        # Get URLs, sort them, and return list
-        self.urls = good.keys()
-        self.urls.sort()
-        return self.urls
-    
-    def webpaths(self, base=None, width=200, depth=5):
+    def weburls(self, b=None, w=200, d=5, t=None):
         '''Returns a list of web paths.
         
         Arguments:
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads to spawn'''
+        if t: self._webthreads(b, w, d, t)
+        else: self._singleweb(b, w, d)
+        return self.urls
+
+    def _singleweb(self, b=None, w=200, d=5, t=None):
+        '''Returns a list of child URLs.
+        
+        Arguments:
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads to spawn'''    
+        pass
+    
+    def webpaths(self, b=None, w=200, d=5, t=None):
+        '''Returns a list of web paths.
+        
+        Arguments:
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
 
         def pathize():            
             '''Strips base URL from full URLs to produce paths'''            
         # Assignments
         urlsplit = self._uparse.urlsplit
         # Run weburls if base passed as an argument
-        if base: self.weburls(base, width, depth)
+        if b: self.weburls(b, w, d, t)
         # Strip off trailing resource or query from base URL
         if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1])
         urls = self.urls
         base -- base web URL (default: None)
         width -- amount of resources to crawl (default: 200)
         depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.webspider(base, width, depth)
+        if base: self.webspider(base, width, depth, t)
         return self._mirror((self.paths, self.urls), root, t)
     
-    def webspider(self, base=None, width=200, depth=5):
+    def webspider(self, b=None, w=200, d=5, t=None):
         '''Returns two lists of child URLs and paths
         
         Arguments:
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         return self.webpaths(), self.urls
 
-    def badurlreport(self, file=None, base=None, width=200, depth=5):
+    def badurlreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of bad URLs
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.badurls:
             # Number of bad URLs
             header = '%s broken URLs under %s on %s:\n'
             # Print referring URL pointing to bad URL
             body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.badurls])
-            report = self._formatreport(amount, header, body, file)
+            report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
 
-    def badhtmlreport(self, file=None, base=None, width=200, depth=5):
+    def badhtmlreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of unparsed HTML URLs
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.badhtml:
             amount = str(len(self.badhtml))
             header = '%s unparsable HTML URLs under %s on %s:\n'
             body = '\n'.join(self.badhtml)
-            report = self._formatreport(amount, header, body, file)
+            report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
 
-    def redirectreport(self, file=None, base=None, width=200, depth=5):
+    def redirectreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of URLs redirected to an external URL
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.redirs:
             amount = str(len(self.redirs))
             header = '%s redirects to external URLs under %s on %s:\n'
             # Print referring URL pointing to new URL
             body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs])
-            report = self._formatreport(amount, header, body, file)
+            report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
 
-    def outsidereport(self, file=None, base=None, width=200, depth=5):
+    def outsidereport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of outside URLs referenced under the base URL
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.outside:
             amount = str(len(self.outside))
             header = '%s links to external URLs under %s on %s:\n'
             body = '\n'.join(self.outside)
-            report = self._formatreport(amount, header, body, file)
+            report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report            
 
-    def unhttpreport(self, file=None, base=None, width=200, depth=5):
+    def unhttpreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of non-HTTP/FTP URLs
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.unhttp:
             amount = str(len(self.unhttp))
             header = '%s non-FTP/non-HTTP URLs under %s on %s:\n'
             body = '\n'.join(self.unhttp)
-            report = self._formatreport(amount, header, body, file)
+            report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
 
-    def urlreport(self, file=None, base=None, width=200, depth=5):
+    def urlreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of all URLs under a URL
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.weburls(base, width, depth)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)'''
+        if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.urls:
             amount = str(len(self.urls))
             header = '%s verified URLs under %s on %s:\n'
             body = '\n'.join(self.urls)
-            report = self._formatreport(amount, header, body, file)
+            report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
 
-    def webreport(self, file=None, base=None, width=200, depth=5, *vargs):
+    def webreport(self, f=None, b=None, w=200, d=5, t=None, *vargs):
         '''Pretties up a list of logged information under a URL
         
         Arguments:
-        file -- output file for report (default: None)
-        base -- base web URL (default: None)
-        width -- amount of resources to crawl (default: 200)
-        depth -- depth in hierarchy to crawl (default: 5)
+        f -- output file for report (default: None)
+        b -- base web URL (default: None)
+        w -- amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 5)
+        t -- number of threads (default: None)
         vargs -- report sections to include or exclude
         To override defaults:
         To include a section add 'badhtml', 'redirs', 'outside', or 'unhttp'
         To exclude a section add 'badurls' or "urls"'''
-        if base: self.weburls(base, width, depth)
+        if b: self.weburls(b, w, d, t)
         # Defaults for report
         badurls, badhtml, redirs, urls, outside, unhttp = 1, 0, 0, 1, 0, 0
         # Create compilation list
         # Make report
         report = '\n\n'.join(compile)
         # Write to file if argument present
-        if file: open(file, 'w').write(report)
+        if f: open(f, 'w').write(report)
         # Or return string
         else: return report
         
         # Write to file if argument present
         if file: open(file, 'w').write(report)
         # Or return string
-        else: return report    
+        else: return report
+
+    def _threadresolve(self, url, base):
+        '''Resolves a raw URL against its referring URL and records good URLs
+
+        Arguments:
+        url -- raw URL
+        base -- referring URL'''
+        # Assignments
+        cache, visited, webopen = self._cache, self._visited, self._webopen
+        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
+        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
+        outside, redirs, unhttp = self.outside, self.redirs, self.unhttp
+        supported, good = self._supported, self._good
+        # Strip file off base URL for joining
+        tbase = base.replace(base.split('/')[-1], '') 
+        # for url in urls:
+        if url not in visited:
+            # Remove whitespace from URL
+            if url.find(' ') != -1:
+                visited[url], url = 1, url.replace(' ', '')
+                if url in visited: return False
+            # Remove fragments i.e. 'http:foo/bar#frag'
+            if url.find('#') != -1:
+                visited[url], url = 1, urldefrag(url)[0]
+                if url in visited: return False
+            # Process full URLs i.e. 'http://foo/bar'
+            if url.find(':') != -1:
+                urlseg = urlsplit(url)
+                # Block non-FTP, HTTP URLs
+                if urlseg[0] not in supported:
+                    # Log as non-FTP/HTTP URL
+                    unhttp[url], visited[url] = 1, 1
+                    return False
+                # If URL is not in root domain, block it
+                if urlseg[1] not in sb:
+                    visited[url], outside[url] = 1, 1                        
+                    return False
+                # Block duplicate root URLs
+                elif not urlseg[2] and urlseg[1] == sb:
+                    visited[url] = 1
+                    return False
+            # Handle relative URLs i.e. ../foo/bar
+            elif url.find(':') == -1:
+                # Join root domain and relative URL
+                visited[url], url = 1, urljoin(tbase, url)
+                if url in visited: return False
+            # Test URL by attempting to open it
+            rurl = webopen((url, base))
+            if rurl and rurl[0] not in visited:
+                # Get URL
+                turl, rawurls = rurl
+                visited[url], visited[turl] = 1, 1
+                # If URL resolved to a different URL, process it
+                if turl != url:
+                    urlseg = urlsplit(turl)
+                    # If URL is not in root domain, block it
+                    if urlseg[1] not in sb:
+                        # Log as a redirected internal URL
+                        redirs[(url, turl)] = 1
+                        return False
+                    # Block duplicate root URLs
+                    elif not urlseg[2] and urlseg[1] == sb: return False
+                # If URL exceeds depth, don't process 
+                if len(turl.split('/')) >= depth: return False
+                # Otherwise put in cache and yield url
+                else:
+                    if rawurls:
+                        for rawurl in rawurls:
+                            if rawurl not in visited: cache[rawurl] = turl
+                    good[turl] = 1        
 
     def _mirror(self, lists, root=None, threads=None):
         '''Mirrors a site on a local filesystem based on lists passed to it
                         for thread in pool:
                             if not thread.isAlive(): pool.remove(thread)
                             
+    def _threadwalk(self, threads=3):
+        '''Yields good URLs from under a base URL
+        
+        Arguments:
+        threads -- number of threads to run (default: 3)'''
+
+        def urlthread(url, base):
+            '''Wraps the URL resolver in a thread and adds it to the pool'''
+            dthread = Thread(target=threadresolve, args=(url, base))
+            pool.append(dthread)
+        
+        pool, cache, threadresolve = [], self._cache, self._threadresolve
+        Thread, width, good = self._thread, self.width, self._good
+        while cache:
+            if len(good) <= width:
+                url, base = cache.popitem()
+                if url: urlthread(url, base)
+                if len(pool) == threads or threads >= len(cache):
+                    for thread in pool: thread.start()
+                    while pool:
+                        for thread in pool:
+                            if not thread.isAlive(): pool.remove(thread)
+            elif len(good) >= width: break
+
+    def _webthreads(self, base=None, width=200, depth=5, threads=3):
+        '''Returns a list of child URLs.
+        
+        Arguments:
+        base -- base web URL (default: None)
+        width -- amount of resources to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)
+        threads -- number of threads to run (default: 3)'''
+        # Assignments
+        self._visited, self._good, self._cache, self.badurls = {}, {}, {}, []
+        self.redirs, self.outside, self.badhtml, self.unhttp = {}, {}, {}, {}
+        uparse, self._robot = self._uparse, self._rparser()
+        good, cache, robot = self._good, self._cache, self._robot
+        # sgmlop crashes Python after too many iterations
+        if width > 5000: self._classpicker(1)
+        else: self._classpicker() 
+        # Use class base if present
+        if not base: base = self.base
+        # Verify URL
+        tbase, rawurls = self._webopen((base, ''))
+        if tbase:
+            # Change base URL if different from resolved URL
+            if tbase != base: base = tbase
+            # Ensure base URL has a trailing '/' 
+            if base[-1] != '/':
+                url = list(uparse.urlsplit(base))
+                url[1] = ''.join([url[1], '/'])
+                base = uparse.urlunsplit(url)
+            # Put raw URLs in cache
+            for rawurl in rawurls: cache[rawurl] = base
+            # Make base URL, get split, and put in verified URL list
+            self.base, self._sb = base, base.split('/')
+            self._visited[base], good[base] = 1, 1
+        # Bad URL? Abort and throw exception
+        else: raise IOError, "URL is invalid."
+        # Assign width
+        if self.width and width == 200: width = self.width
+        else: self.width = width
+        # Adjust depth to length of base URL
+        if self.depth and depth == 6: self.depth += len(self._sb)
+        else: self.depth = depth + len(self._sb)
+        # Get robot permissions
+        robot.set_url(''.join([base, 'robots.txt']))
+        robot.read()
+        # Get signature of bad URL
+        self._webtest()
+        # Get good URLs as long as total width isn't exceeded
+        try: self._threadwalk(threads)
+        # If user interrupts crawl, return what's done
+        except KeyboardInterrupt: pass
+        # Get URLs, sort them, and return list
+        self.urls = good.keys()
+        self.urls.sort()
+        return self.urls                            
+                            
 
 # Instance of Spider enables exporting Spider's methods as standalone functions
 _inst = Spider()
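
The closing comment implies that the rest of the module (not shown in this diff) rebinds the instance's methods as module-level functions. The exported names below are an assumption, not part of the commit; a sketch of that pattern:

    # Hypothetical export bindings; the names actually published by
    # spider.py are not visible in this commit view.
    weburls = _inst.weburls
    webpaths = _inst.webpaths
    webspider = _inst.webspider
    badurlreport = _inst.badurlreport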