Commits

Lynn Rees committed 08a8d84

- made the core of the web extractor faster and simpler (see the sketch below)
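The commit drops the recursive _webextract/_webwalk pair in favour of a single generator that drains the page cache iteratively. Below is a minimal, self-contained sketch of that pattern, with a toy link graph standing in for the spider's real fetching and URL resolution; everything except the cache-draining loop and the good/width bookkeeping is illustrative, not taken from the module.

    # Sketch of the cache-driven walk this commit switches to; the helper
    # names and the toy link graph are assumptions for illustration only.
    try:                                    # Python 2, as the module assumes
        from urlparse import urljoin
    except ImportError:                     # Python 3 fallback
        from urllib.parse import urljoin

    def webwalk(cache, resolve):
        '''Drain a {base: urls} cache, yielding each base and its children.

        resolve() may queue newly discovered pages back into the cache,
        so the loop runs until the whole reachable set has been drained.'''
        while cache:
            base, urls = cache.popitem()
            yield base
            for url in resolve(urls or [], base):
                if url: yield url

    # Hypothetical link graph standing in for fetched documents.
    pages = {'http://example.com/': ['a.html', 'b.html'],
             'http://example.com/a.html': ['c.html']}
    cache = {'http://example.com/': pages['http://example.com/']}
    seen = set(cache)

    def resolve(urls, base):
        '''Join relative links, skip what was already seen, queue the rest.'''
        for url in urls:
            full = urljoin(base, url)
            if full not in seen:
                seen.add(full)
                cache[full] = pages.get(full, [])
                yield full

    good = {}
    for item in webwalk(cache, resolve):
        good[item] = 1                      # dict keys keep the result unique
    print(sorted(good))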

Files changed (1)

     from ftplib import error_perm as _ftperr
     from sgmllib import SGMLParser as _sparser
     from sgmllib import SGMLParseError as _sperror
-    from robotparser import RobotFileParser as _robotparser
+    from robotparser import RobotFileParser as _rparser
     # Use threads if available 
     try: from threading import Thread as _thread
     except ImportError: pass
         cache, visited, webopen = self._cache, self._visited, self._webopen
         sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
         urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
-        outside, redirects = self.outside, self.redirects
+        outside, redirs = self.outside, self.redirs
         supported = ['HTTP', 'http', 'HTTPS', 'https', 'FTP', 'ftp']
         # Prepare base for joining
         tbase, unhttp = base.replace(base.split('/')[-1], ''), self.unhttp 
                     if turl != url:
                         urlseg = urlsplit(turl)
                         if urlseg[1] not in sb:
-                            redirects[(url, turl)] = 1
+                            redirs[(url, turl)] = 1
                             continue
                         elif not urlseg[2] and urlseg[1] == sb: continue
                     if len(turl.split('/')) >= depth: continue
             url.close()
             return base[0], []        
                 
-    def _webextract(self, base):
-        '''Extracts URLs from HTML documents and puts them in a list
-        Based on 'urllist.py' by Mark Pilgrim
-
-        base -- base URL searched for references to other URLS'''
-        # Avoid outside namespace lookups
-        cache, urlresolve = self._cache, self._urlresolve
-        #if base[0] in cache:
-        urls = cache.get(base)
-        del cache[base]
-        #else: urls = self._webopen(base)[1]; print base
-        if urls:
-            # Final and potential list of all URLs 
-            for url in urlresolve(urls, base): yield url
-        else: return
-
-    def _webwalk(self, source):
-        '''Recursively extracts all URLs within a base URL
-
-        Arguments:                
-        source -- source to search for URLs'''
-        webextract, webwalk = self._webextract, self._webwalk
-        for goodurl in webextract(source):
-            if goodurl:
-                yield goodurl
-                for childurl in webwalk(goodurl): yield childurl
+    def _webwalk(self):
+        '''Extracts URLs under a base URL'''
+        webwalk, cache = self._webwalk, self._cache
+        width, urlresolve = self.width, self._urlresolve
+        while cache:
+            base, urls = cache.popitem()
+            yield base
+            if urls:
+                for url in urlresolve(urls, base):
+                    if url: yield url
 
     def weburls(self, base=None, width=200, depth=5):
         '''Returns a list of child URLs.
         depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
         # Make globals local
         self._visited, self._good, self._cache, self.badurls = {}, {}, {}, [] 
-        self.redirects, good, self.badhtml = {}, self._good, {}        
-        self._robot, webwalk, self.outside = None, self._webwalk, {}
-        self.unhttp, self._robot = {}, self._robotparser() 
+        self.redirs, self.outside, self.badhtml, self.unhttp = {}, {}, {}, {}        
+        webwalk, good, self._robot = self._webwalk, self._good, self._rparser()       
         # Resolve real base URL (no redirection aliases)
         if not base: base = self.base
         uparse, tbase = self._uparse, self._webopen((base, ''))
         self._cache[base], self._visited[base], good[base] = tbase[1], 1, 1
         self.base, self._sb = base, base.split('/') 
         if self.width and width == 200: width = self.width
+        else: self.width = width
         if self.depth and depth == 6: self.depth = self.depth + len(self._sb)
         else: self.depth = depth + len(self._sb)
         # Set robot limits
         self._webtest()
         # Get good URLs
         try:
-            for item in webwalk(base):
-                # Extend good as long as max resource count not exceeded
-                if item:
-                    if len(good) <= width: good[item] = None
-                    elif len(good) >= width: break
+            for item in webwalk():
+                if len(good) <= width: good[item] = 1
+                elif len(good) >= width: break
         except KeyboardInterrupt: pass
         self.urls = good.keys()
         self.urls.sort()
         width -- how many remote resources to crawl (default: 200)
         depth -- how deep in HTTP hierarchy to crawl (default: 5)'''
         if base: self.weburls(base, width, depth)
-        if self.redirects:
+        if self.redirs:
             header = 'Redirects to external URLs under %s on %s:\n'
-            body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirects])
+            body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs])
             report = self._formatreport(header, body, file)
             if report: return report
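Assuming the surrounding class is instantiated elsewhere in the module, driving the rewritten crawler would look roughly like the following; only weburls(base, width, depth) and self.urls are visible in this diff, so the class name and bare constructor are hypothetical.

    # Hypothetical driver for the crawler shown above.
    crawler = Spider()                      # class name assumed, not in the diff
    crawler.weburls('http://example.com/', width=50, depth=3)
    for url in crawler.urls: print(url)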