Lynn Rees / psilib

Commits

Lynn Rees committed 5aaa454

- include original URL in broken link report, i.e. referrer -> original URL -> resolved URL (as sketched below)

  • Parent commits 9ba3572
  • Branches default
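
The gist of the change: each entry appended to self.badurls grows from a (referrer, bad URL) pair into a (referrer, original URL, resolved URL) triple, so the broken-link report shows the link exactly as it appeared in the referring page as well as the URL it ultimately resolved to. A minimal sketch of the two shapes, using hypothetical example.org URLs that are not from the repository (the real call sites are in _webopen and _urlverify in the diff below):

    # Before this commit: (referrer, bad URL)
    old_entry = ('http://example.org/index.html', 'http://example.org/missing.html')

    # After this commit: (referrer, original URL, resolved URL)
    new_entry = ('http://example.org/index.html',    # page that contained the link
                 'missing.html',                      # link exactly as written in that page
                 'http://example.org/missing.html')   # URL it resolved to before failing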

Files changed (1)

File spider.py

         # Return badurl marker and list of child URLs
         return urlget.badurl, urlget.urls
 
-    def _webopen(self, base):
+    def _webopen(self, newurl, base, oldurl):
         '''Verifies URL and returns actual URL and extracted child URLs
 
         Arguments:
-        base -- tuple containing a URL and its referring URL'''
+        newurl -- newly resolved URL
+        base -- the referring URL
+        oldurl -- the original URL'''
         # Assignments
-        good, cbase = self._good, base[0]        
+        good = self._good        
         try:
             # If webspiders can access URL, open it            
-            if self._robot.can_fetch('*', cbase):
-                url = self._ulib.urlopen(cbase)
+            if self._robot.can_fetch('*', newurl):
+                url = self._ulib.urlopen(newurl)
             # Otherwise, mark as visited and abort
             else:
-                self._visited[cbase] = 1
-                return False
+                self._visited[newurl] = 1
+                return 0, 0
         # If HTTP error, log bad URL and abort
         except IOError:
-            self._visited[cbase] = 1
-            self.badurls.append((base[1], cbase))
-            return False
+            self._visited[newurl] = 1
+            self.badurls.append((base, oldurl, newurl))
+            return 0, 0
         # Get real URL
-        newbase = url.geturl()
-        # Change URL if different from old URL
-        if newbase != cbase: cbase, base = newbase, (newbase, base[1])    
+        newurl = url.geturl()
         # URLs with mimetype 'text/html' are scanned for URLs
         if url.headers.type == 'text/html':
             # Feed parser
             try: badurl, urls = self._webparser(contents)
             # Log URL if SGML parser can't parse it 
             except self._sperror:
-                self._visited[cbase], self.badhtm[cbase] = 1, 1
-                return False
+                self._visited[newurl], self.badhtm[newurl] = 1, 1
+                return 0, 0
             url.close()
             # Return URL and extracted urls if it's good
-            if not badurl: return cbase, urls
+            if not badurl: return newurl, urls
             # If the URL is bad (after BadUrl), stop processing and log URL
             else:
-                self._visited[cbase] = 1
-                self.badurls.append((base[1], cbase))
-                return False
+                self._visited[newurl] = 1
+                self.badurls.append((base, oldurl, newurl))
+                return 0, 0
         # Return URL of non-HTML resources and empty list
         else:
             url.close()
-            return cbase, []
+            return newurl, []
+
+    def _urlverify(self, url, base, newbase):
+        '''Returns a full URL relative to a base URL
+
+        Arguments:
+        url -- a raw URL
+        base -- referring URL
+        newbase -- temporary version of referring URL for joining'''
+        # Assignments
+        visited, webopen, other = self._visited, self._webopen, self.other
+        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
+        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
+        outside, redirs, supported = self.outside, self.redirs, self._supported
+        if url not in visited:
+            # Remove whitespace from URL
+            if url.find(' ') != -1:
+                visited[url], nurl = 1, url.replace(' ', '')
+                if nurl in visited: return 0, 0
+            # Remove fragments i.e. 'http://foo/bar#frag'
+            if url.find('#') != -1:
+                visited[url], nurl = 1, urldefrag(url)[0]
+                if nurl in visited: return 0, 0
+            # Process full URLs i.e. 'http://foo/bar'
+            if url.find(':') != -1:
+                urlseg = urlsplit(url)
+                # Block non-FTP, HTTP URLs
+                if urlseg[0] not in supported:
+                    # Log as non-FTP/HTTP URL
+                    other[url], visited[url] = 1, 1
+                    return 0, 0
+                # If URL is not in root domain, block it
+                elif urlseg[1] not in sb:
+                    visited[url], outside[url] = 1, base                        
+                    return 0, 0
+                # Block duplicate root URLs
+                elif not urlseg[2] and urlseg[1] == sb:
+                    visited[url] = 1
+                    return 0, 0
+                nurl = url
+            # Handle relative URLs i.e. ../foo/bar
+            elif url.find(':') == -1:
+                # Join root domain and relative URL
+                visited[url], nurl = 1, urljoin(newbase, url)
+                if nurl in visited: return 0, 0
+            # Test URL by attempting to open it
+            turl, rawurls = webopen(nurl, base, url)
+            if turl and turl not in visited:
+                visited[nurl], visited[turl]  = 1, 1
+                # If URL resolved to a different URL, process it
+                if turl != nurl:
+                    urlseg = urlsplit(turl)
+                    # If URL is not in root domain, block it
+                    if urlseg[1] not in sb:
+                        # Log as a redirected internal URL
+                        redirs[(nurl, turl)] = base
+                        return 0, 0
+                    # Block duplicate root URLs
+                    elif not urlseg[2] and urlseg[1] == sb: return 0, 0
+                # If URL exceeds depth, don't process 
+                if len(turl.split('/')) >= depth: return 0, 0
+                # Otherwise return URL
+                else:
+                    if rawurls: return turl, rawurls
+                    else: return turl, []
+            else: return 0, 0
+        else: return 0, 0        
 
     def _genverify(self, urls, base):
         '''Verifies a list of full URLs relative to a base URL
                 # Put new URLs in cache if present
                 if newurls: cache[url] = newurls
             # Yield new URL
-            if url: yield url
+            if url: yield url, base
 
     def _multiverify(self, url, base):
         '''Verifies a full URL relative to a base URL
                 # Put new URLs in cache if present
                 if rawurl not in visited: cache[rawurl] = url
         # Put URL in list of good URLs
-        if url: self._good[url] = 1
-
-    def _urlverify(self, url, base, newbase):
-        '''Returns a full URL relative to a base URL
-
-        Arguments:
-        urls -- list of raw URLs
-        base -- referring URL
-        newbase -- temporary version of referring URL for joining'''
-        # Assignments
-        visited, webopen, other = self._visited, self._webopen, self.other
-        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
-        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
-        outside, redirs, supported = self.outside, self.redirs, self._supported
-        if url not in visited:
-            # Remove whitespace from URL
-            if url.find(' ') != -1:
-                visited[url], url = 1, url.replace(' ', '')
-                if url in visited: return 0, 0
-            # Remove fragments i.e. 'http:foo/bar#frag'
-            if url.find('#') != -1:
-                visited[url], url = 1, urldefrag(url)[0]
-                if url in visited: return 0, 0
-            # Process full URLs i.e. 'http://foo/bar
-            if url.find(':') != -1:
-                urlseg = urlsplit(url)
-                # Block non-FTP, HTTP URLs
-                if urlseg[0] not in supported:
-                    # Log as non-FTP/HTTP URL
-                    other[url], visited[url] = 1, 1
-                    return 0, 0
-                # If URL is not in root domain, block it
-                if urlseg[1] not in sb:
-                    visited[url], outside[url] = 1, 1                        
-                    return 0, 0
-                # Block duplicate root URLs
-                elif not urlseg[2] and urlseg[1] == sb:
-                    visited[url] = 1
-                    return 0, 0
-            # Handle relative URLs i.e. ../foo/bar
-            elif url.find(':') == -1:
-                # Join root domain and relative URL
-                visited[url], url = 1, urljoin(newbase, url)
-                if url in visited: return 0, 0
-            # Test URL by attempting to open it
-            rurl = webopen((url, base))
-            if rurl and rurl[0] not in visited:
-                # Get URL
-                turl, rawurls = rurl
-                visited[url], visited[turl] = 1, 1
-                # If URL resolved to a different URL, process it
-                if turl != url:
-                    urlseg = urlsplit(turl)
-                    # If URL is not in root domain, block it
-                    if urlseg[1] not in sb:
-                        # Log as a redirected internal URL
-                        redirs[(url, turl)] = 1
-                        return 0, 0
-                    # Block duplicate root URLs
-                    elif not urlseg[2] and urlseg[1] == sb: return 0, 0
-                # If URL exceeds depth, don't process 
-                if len(turl.split('/')) >= depth: return 0, 0
-                # Otherwise return URL
-                else:
-                    if rawurls: return turl, rawurls
-                    else: return turl, []
-            else: return 0,0
-        else: return 0, 0
+        if url: self._good[url] = base
 
     def _onewalk(self):
         '''Yields good URLs from under a base URL'''
         # Use global base if present
         if not base: base = self.base
         # Verify URL and get child URLs
-        newbase, rawurls = self._webopen((base, ''))
+        newbase, rawurls = self._webopen(base, '', base)
         if newbase:
             # Change base URL if different
-            if newbase != base: base = newbase            
+            base = newbase            
             # Ensure there's a trailing '/' in base URL
             if base[-1] != '/':
                 url = list(uparse.urlsplit(base))
             # Make base URL, get split, and put in verified URL list
             self.base, self._sb = base, base.split('/')
             self._visited[base], good[base] = 1, 1
-        # If URL is bad, abort and raise error
-        else: raise IOError, "URL is invalid"
+        # If URL is bad, abort and raise exception
+        else: raise IOError, "Invalid URL"
         # Adjust depth to length of base URL
         if self.depth and depth == 6: self.depth += len(self._sb)
         else: self.depth = depth + len(self._sb)
             else:
                 for item in onewalk():
                     # Don't exceed maximum width
-                    if len(good) <= width: good[item] = 1
+                    if len(good) <= width: good[item[0]] = item[1]
                     elif len(good) >= width: break
         # If user interrupts crawl, return what's done
         except KeyboardInterrupt: pass
         # Get URLs, sort them, and return list
         self.urls = good.keys()
         self.urls.sort()
-        return self.urls                        
+        return self.urls
 
     def webpaths(self, b=None, w=200, d=5, t=None):
         '''Returns a list of web paths.
         if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.badurls:
+            badurls = self.badurls
             # Number of bad URLs
             amount = str(len(self.badurls))
             header = '%s broken URLs under %s on %s:\n'
             # Print referring URL pointing to bad URL
-            body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.badurls])
+            body = '\n'.join([' -> '.join([i[0], i[1], i[2]]) for i in badurls])
             report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
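
For reference, a rough sketch of how a report line reads with the new three-part entries. The URLs, count, and timestamp below are made up; only the header template and the ' -> ' join mirror the code in this diff:

    # Hypothetical data; only the formatting follows the diff above
    badurls = [('http://example.org/index.html', 'missing.html',
                'http://example.org/missing.html')]
    amount = str(len(badurls))
    header = '%s broken URLs under %s on %s:\n'
    body = '\n'.join([' -> '.join([i[0], i[1], i[2]]) for i in badurls])
    print(header % (amount, 'http://example.org/', 'Mon Jan 01 00:00:00 2001') + body)
    # 1 broken URLs under http://example.org/ on Mon Jan 01 00:00:00 2001:
    # http://example.org/index.html -> missing.html -> http://example.org/missing.html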