Commits

Lynn Rees committed b9737f3

- make information collected on websites more flexible
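
A minimal usage sketch of the new interface (the spider class itself is not shown in this hunk, so the `Spider()` instance and the base URL below are placeholders). Bad-URL logging stays on by default; the other collections are opt-in through the new *vargs arguments:

    spider = Spider()
    # Opt in to unparsable-HTML and external-link logging, opt out of bad URLs
    urls = spider.weburls('http://example.com/', 200, 5, None, 'badhtm', 'out', 'badurl')
    print spider.badhtms.keys()   # URLs the SGML parser could not handle
    print spider.outs.keys()      # external URLs; each maps to its referring page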

Comments (0)

Files changed (1)

         return urlget.badurl, urlget.urls
 
     def _webopen(self, newurl, base, oldurl):
-        '''Verifies URL and returns actual URL and extracted child URLs
+        '''Returns real URL and extracted child URLs
 
         Arguments:
         newurl -- newly resolved URL
-        base -- the referring URL
-        oldurl - the original URL'''
+        base -- referring URL
+        oldurl -- original URL'''
         # Assignments
-        good = self._good        
+        good = self._good
         try:
-            # If webspiders can access URL, open it            
+            # If webspiders can access URL, open it
             if self._robot.can_fetch('*', newurl):
                 url = self._ulib.urlopen(newurl)
             # Otherwise, mark as visited and abort
         # If HTTP error, log bad URL and abort
         except IOError:
             self._visited[newurl] = 1
-            self.badurls.append((base, oldurl, newurl))
+            if self._badurl: self.badurls.append((base, oldurl, newurl))
             return 0, 0
         # Get real URL
         newurl = url.geturl()
             try: badurl, urls = self._webparser(contents)
             # Log URL if SGML parser can't parse it 
             except self._sperror:
-                self._visited[newurl], self.badhtm[newurl] = 1, 1
+                self._visited[newurl] = 1
+                if self._badhtm: self.badhtms[newurl] = 1
                 return 0, 0
             url.close()
             # Return URL and extracted urls if it's good
             # If the URL is bad (after BadUrl), stop processing and log URL
             else:
                 self._visited[newurl] = 1
-                self.badurls.append((base, oldurl, newurl))
+                if self._badurl: self.badurls.append((base, oldurl, newurl))
                 return 0, 0
         # Return URL of non-HTML resources and empty list
         else:
         base -- referring URL
         newbase -- temporary version of referring URL for joining'''
         # Assignments
-        visited, webopen, other = self._visited, self._webopen, self.other
-        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
+        visited, webopen, others = self._visited, self._webopen, self.others
         urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
-        outside, redirs, supported = self.outside, self.redirs, self._supported
+        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
+        outs, redirs, supported = self.outs, self.redirs, self._supported
+        redir, other, out = self._redir, self._other, self._out
         if url not in visited:
             # Remove whitespace from URL
             if url.find(' ') != -1:
                 urlseg = urlsplit(url)
                 # Block non-FTP, HTTP URLs
                 if urlseg[0] not in supported:
-                    # Log as non-FTP/HTTP URL
-                    other[url], visited[url] = 1, 1
+                    visited[url] = 1
+                    if other: others[url] = 1
                     return 0, 0
                 # If URL is not in root domain, block it
                 elif urlseg[1] not in sb:
-                    visited[url], outside[url] = 1, base                        
+                    visited[url] = 1
+                    if out: outs[url] = base
                     return 0, 0
                 # Block duplicate root URLs
                 elif not urlseg[2] and urlseg[1] == sb:
                     # If URL is not in root domain, block it
                     if urlseg[1] not in sb:
                         # Log as a redirected internal URL
-                        redirs[(nurl, turl)] = base
+                        if redir: redirs[(nurl, turl)] = base
                         return 0, 0
                     # Block duplicate root URLs
                     elif not urlseg[2] and urlseg[1] == sb: return 0, 0
             # Fetch item from cache
             base, urls = cache.popitem()
             # If item has child URLs, process them and yield good URLs
-            if urls:
+            if urls: 
                 for url in genverify(urls, base): yield url
 
     def _multiwalk(self, threads):
             '''Spawns a thread containing a multiverify function
 
             Arguments:
-
             url -- URL to verify
             base -- referring URL'''
             # Create instance of Thread
             # End if width reached
             elif len(good) >= width: break
 
-    def weburls(self, base=None, width=200, depth=5, thread=None):
+    def weburls(self, base=None, width=200, depth=5, thread=None, *vargs):
         '''Returns a list of web URLs.
         
         Arguments:
         base -- base web URL (default: None)
         width -- amount of resources to crawl (default: 200)
         depth -- depth in hierarchy to crawl (default: 5)
-        thread -- number of threads to run (default: None)'''
-        # Assignments
-        self._visited, self._good, self._cache, self.badurls = {}, {}, {}, []
-        self.redirs, self.outside, self.badhtm, self.other = {}, {}, {}, {}
-        onewalk, good, self._robot = self._onewalk, self._good, self._rparser()
+        thread -- number of threads to run (default: None)
+        vargs -- information to include or exclude
+        To override defaults:
+        To include information add 'badhtm', 'redir', 'out', or 'other'
+        To exclude information add "badurl"'''
+        # Default information gathering
+        self._badurl, self._badhtm, self._redir = 1, 0, 0
+        self._out, self._other = 0, 0
+        self.badurls, self.badhtms, self.redirs = [], None, None
+        self.outs, self.others = None, None
+        # Override default settings if argument is passed to vargs
+        for arg in vargs:
+            if arg == 'badurl': self._badurl, self.badurls = 0, None
+            elif arg == 'badhtm': self._badhtm, self.badhtms = 1, {}
+            elif arg == 'redir': self._redir, self.redirs = 1, {}
+            elif arg == 'out': self._out, self.outs = 1, {}
+            elif arg == 'other': self._other, self.others = 1, {}
+        # Assignments
+        self._visited, self._good, self._cache = {}, {}, {}
+        onewalk, self._robot = self._onewalk, self._rparser()
         uparse, robot, multiwalk = self._uparse, self._robot, self._multiwalk
-        cache = self._cache
+        cache, good = self._cache, self._good
         # Assign width
         if self.width and width == 200: width = self.width
         else: self.width = width
         newbase, rawurls = self._webopen(base, '', base)
         if newbase:
             # Change base URL if different
-            base = newbase            
+            base, newurls = newbase, {}
             # Ensure there's a trailing '/' in base URL
             if base[-1] != '/':
                 url = list(uparse.urlsplit(base))
                 url[1] = ''.join([url[1], '/'])
-                base = uparse.urlunsplit(url)            
+                base = uparse.urlunsplit(url)
             # Eliminate duplicates and put raw URLs in cache
-            newurls = {}
             for rawurl in rawurls: newurls[rawurl] = 1
             if newurls:
                 # Cache URLs individually if threads are desired
                 else: cache[base] = newurls
             # Make base URL, get split, and put in verified URL list
             self.base, self._sb = base, base.split('/')
-            self._visited[base], good[base] = 1, 1
+            self._visited[base], good[base] = 1, base
         # If URL is bad, abort and raise exception
-        else: raise IOError, "Invalid URL"
+        else: raise IOError, 'Invalid URL'
         # Adjust depth to length of base URL
         if self.depth and depth == 6: self.depth += len(self._sb)
         else: self.depth = depth + len(self._sb)
         self.urls.sort()
         return self.urls
 
-    def webpaths(self, b=None, w=200, d=5, t=None):
+    def webpaths(self, b=None, w=200, d=5, t=None, *vargs):
         '''Returns a list of web paths.
         
         Arguments:
         b -- base web URL (default: None)
         w -- amount of resources to crawl (default: 200)
         d -- depth in hierarchy to crawl (default: 5)
-        t -- number of threads (default: None)'''
+        t -- number of threads (default: None)
+        vargs -- information to include or exclude'''
 
         def pathize():            
             '''Strips base URL from full URLs to produce paths'''            
         # Assignments
         urlsplit = self._uparse.urlsplit
         # Run weburls if base passed as an argument
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, *vargs)
         # Strip off trailing resource or query from base URL
         if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1])
         urls = self.urls
         base -- base web URL (default: None)
         width -- amount of resources to crawl (default: 200)
         depth -- depth in hierarchy to crawl (default: 5)'''
-        if base: self.webspider(base, width, depth, t)
+        if base: self.webspider(base, width, depth, t, 'badurl')
         return self._mirror((self.paths, self.urls), root, t)
     
-    def webspider(self, b=None, w=200, d=5, t=None):
+    def webspider(self, b=None, w=200, d=5, t=None, *vargs):
         '''Returns two lists of child paths and URLs
         
         Arguments:
         b -- base web URL (default: None)
         w -- amount of resources to crawl (default: 200)
         d -- depth in hierarchy to crawl (default: 5)
-        t -- number of threads (default: None)'''
+        t -- number of threads (default: None)
+        vargs -- information to include or exclude'''
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, *vargs)
         return self.webpaths(), self.urls
 
         if b: self.weburls(b, w, d, t)
         # Format report if information is available
         if self.badurls:
-            badurls = self.badurls
             # Number of bad URLs
             amount = str(len(self.badurls))
             header = '%s broken URLs under %s on %s:\n'
             # Print referring URL pointing to bad URL
-            body = '\n'.join([' -> '.join([i[0], i[1], i[2]]) for i in badurls])
+            body = '\n'.join([' -> '.join(i) for i in self.badurls])
             report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
         w -- amount of resources to crawl (default: 200)
         d -- depth in hierarchy to crawl (default: 5)
         t -- number of threads (default: None)'''
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, 'badhtm')
         # Format report if information is available
-        if self.badhtm:
-            amount = str(len(self.badhtm))
+        if self.badhtms:
+            amount = str(len(self.badhtms))
             header = '%s unparsable HTML URLs under %s on %s:\n'
-            body = '\n'.join(self.badhtm)
+            body = '\n'.join(self.badhtms)
             report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
         w -- amount of resources to crawl (default: 200)
         d -- depth in hierarchy to crawl (default: 5)
         t -- number of threads (default: None)'''
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, 'redir')
         # Format report if information is available
         if self.redirs:
             amount = str(len(self.redirs))
             header = '%s redirects to external URLs under %s on %s:\n'
             # Print referring URL pointing to new URL
-            body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs])
+            body = '\n'.join([' -> '.join(i) for i in self.redirs])
             report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
         w -- amount of resources to crawl (default: 200)
         d -- depth in hierarchy to crawl (default: 5)
         t -- number of threads (default: None)'''
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, 'out')
         # Format report if information is available
-        if self.outside:
-            amount = str(len(self.outside))
+        if self.outs:
+            amount = str(len(self.outs))
             header = '%s links to external URLs under %s on %s:\n'
-            body = '\n'.join(self.outside)
+            body = '\n'.join(self.outs)
             report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report            
         w -- amount of resources to crawl (default: 200)
         d -- depth in hierarchy to crawl (default: 5)
         t -- number of threads (default: None)'''
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, 'other')
         # Format report if information is available
-        if self.other:
-            amount = str(len(self.other))
+        if self.others:
+            amount = str(len(self.others))
             header = '%s non-FTP/non-HTTP URLs under %s on %s:\n'
-            body = '\n'.join(self.other)
+            body = '\n'.join(self.others)
             report = self._formatreport(amount, header, body, f)
             # Return if just getting string
             if report: return report
         t -- number of threads (default: None)
         vargs -- report sections to include or exclude
         To override defaults:
-        To include a section add 'badhtm', 'redirs', 'outside', or 'other'
-        To exclude a section add 'badurls' or "urls"'''
+        To include a section add 'badhtm', 'redir', 'out', or 'other'
+        To exclude a section add 'badurl' or "url"'''
-        if b: self.weburls(b, w, d, t)
+        if b: self.weburls(b, w, d, t, *vargs)
         # Defaults for report
-        badurls, badhtm, redirs, urls, outside, other = 1, 0, 0, 1, 0, 0
+        badurl, badhtm, redir, url, out, other = 1, 0, 0, 1, 0, 0
         # Create compilation list
         compile = []
         # Override default report settings if argument is passed to vargs
         for arg in vargs:
-            if arg == 'badurls': badurls = 0
+            if arg == 'badurl': badurl = 0
             elif arg == 'badhtm': badhtm = 1
-            elif arg == 'redirs': redirs = 1
-            elif arg == 'urls': urls = 0
-            elif arg == 'outside': outside = 1
+            elif arg == 'redir': redir = 1
+            elif arg == 'url': url = 0
+            elif arg == 'out': out = 1
             elif arg == 'other': other = 1
         # Compile report
-        if badurls:
+        if badurl:
             badurls = self.badurlreport()
             if badurls: compile.append(badurls)
-        if urls:
+        if url:
             urls = self.urlreport()
             if urls: compile.append(urls)
-        if outside:
-            outside = self.outreport()
-            if outside: compile.append(outside)
-        if redirs:
+        if out:
+            outs = self.outreport()
+            if outs: compile.append(outs)
+        if redir:
             redirs = self.redireport()
             if redirs: compile.append(redirs)
         if badhtm:
-            badhtm = self.badhtmreport()
-            if badhtm: compile.append(badhtm)        
+            badhtms = self.badhtmreport()
+            if badhtms: compile.append(badhtms)
         if other:
-            other = self.othereport()
-            if other: compile.append(other)
+            others = self.othereport()
+            if others: compile.append(others)
         # Make report
         report = '\n\n'.join(compile)
         # Write to file if argument present
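
The report side follows the same convention. Reusing the placeholder `spider` instance from the sketch above (the combined report method's name and full signature are not visible in this hunk, so `webreport(b, w, d, t, *vargs)` is assumed from its argument docstring):

    # Add the redirect and external-link sections, drop the plain URL list
    print spider.webreport('http://example.com/', 200, 5, None, 'redir', 'out', 'url')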