Lynn Rees committed 9ba3572

- pre-release cleanups

Files changed (2)

 __author__ = 'L.C. Rees (xanimal@users.sf.net)'
 __all__ = ['ftpurls', 'ftppaths', 'weburls', 'ftpmirror', 'ftpspider',
     'webpaths', 'webreport', 'webmirror', 'webspider', 'urlreport',
-    'badurlreport', 'badhtmreport', 'redirsreport', 'outreport',
-    'otherreport']
+    'badurlreport', 'badhtmreport', 'redireport', 'outreport', 'othereport']
 
-'''FTP and Web crawling, reporting, and mirroring in one convenient module.'''
+'''Multithreaded crawling, reporting, and mirroring for Web and FTP.'''
 
 from __future__ import generators
 
 class Spider:
 
     '''HTTP and FTP crawling, reporting, and checking'''
-    _bdsig, _bfsig, _session, _newparser = None, None, None, None
+    
     import os as _os
     import urllib as _ulib
     import urlparse as _uparse
     # Use threads if available 
     try: from threading import Thread as _thread
     except ImportError: pass
-    
+    _bdsig, _bfsig, _session, _newparser = None, None, None, None
     # HTML tags with URLs
     _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
         'embed':1, 'area':1, 'frame':1, 'applet':1, 'input':1, 'base':1,
         attempts -- number of login attempts to try (default: 3)'''
 
         def ftpprompt(tries=0):        
-            '''Prompts for FTP username and password'''
+            '''Prompts for FTP username and password
+
+            Arguments:
+            tries -- number of login attempts'''
             tries += 1
             try:
                 self._name = raw_input('Enter login name: ')
                     sys.exit(0)           
 
         # Assignments
+        self._name, self._password, ftperr = name, password, self._ftperr
         su, ftp = self._uparse.urlsplit(base), self._ftp
-        self._name, self._password, ftperr = name, password, self._ftperr
         # Set URL, path, and strip 'ftp://' off
         base, path = su[1], '/'.join([su[2], ''])
         try: session = ftp(base, name, password)
             # Get rid of trailing '/' in base if present before joining
             if base[-1] == '/': base = base[:-1]
         paths = self.paths
+        # Add FTP URL
         self.urls = [''.join([base, i]) for i in paths]
         return self.urls
 
-    def _classpicker(self, old=None):
+    def _parserpick(self, old=None):
+        '''Builds URL-extracting classes using the sgmlop parser or the classic sgmllib parser
+
+        Arguments:
+        old -- use classic sgmllib SGMLParser'''
+        # Assignments
+        urltags, urlattrs = self._urltags, self._urlattrs
+        # Lists for bad file and bad directory signatures
         self._bfsig, self._bdsig = [], []
-        bfsig, bdsig = self._bfsig, self._bdsig
-        urltags, urlattrs = self._urltags, self._urlattrs
+        bfsig, bdsig = self._bfsig, self._bdsig        
         # Use faster SGMLParser if available
         try:
             from sgmlop import SGMLParser as newparser
             self._newparser = newparser
+        # If unavailable, use classic SGML parser
         except ImportError:
-            from sgmllib import SGMLParser as _oldparser
+            from sgmllib import SGMLParser as oldparser
             old = 1
-        # Use different classes if faster SGML Parser is available
+        # Classes using classic sgmllib SGML Parser
         if old:
-            from sgmllib import SGMLParser as _oldparser
-            old, self._newparser = 1, None
-            class UrlExtract(_oldparser):
+            from sgmllib import SGMLParser as oldparser
+            # Remove sgmlop parser if present
+            self._newparser = None
+            # UrlExtract class using classic parser
+            class UrlExtract(oldparser):
                 '''Extracts URLs from a SGMLish document'''
                 def reset(self):
                     '''Resets SGML parser and clears lists'''
-                    _oldparser.reset(self)
+                    oldparser.reset(self)
                     self.urls, self.text, self.badurl = [], [], None
                 def handle_data(self, data):
                     '''Handles non-markup data'''            
                         # Get key, value in attributes if they match
                         url = [v for k, v in attrs if k in urlattrs]
                         if url: self.urls.extend(url)
-            class BadUrl(_oldparser):            
+            # BadUrl class using classic parser
+            class BadUrl(oldparser):            
                 '''Collects results of intentionally incorrect URLs'''
                 def reset(self):
                     '''Resets SGML parser and clears lists'''
-                    _oldparser.reset(self)
+                    oldparser.reset(self)
                     self.text = []
                 def handle_data(self, data):
-                    '''Collects lines to profile not found responses'''
-                    # Adds first 5 lines of non-markup data to list 'text'
+                    '''Collects lines to profile bad URLs'''
+                    # Adds first 5 lines of non-markup data to text
                     if len(self.text) <= 5: self.text.append(data)
+        # If no old flag, use SGMLParser from sgmlop and related classes
         else:
+            # UrlExtract class using sgmlop parser
             class UrlExtract:
                 '''Extracts URLs from a SGMLish document'''            
                 def __init__(self):
                         # Get key, value in attributes if they match
                         url = [v for k, v in attrs if k in urlattrs]
                         if url: self.urls.extend(url)
+            # BadUrl class using sgmlop parser
             class BadUrl:            
                 '''Collects results of intentionally incorrect URLs'''
                 def __init__(self):
                     '''Collects lines to profile not found responses'''
                     # Adds first 5 lines of non-markup data to list 'text'
                     if len(self.text) <= 5: self.text.append(data)
+        # Make resulting classes available class wide
         self._UrlExtract, self._BadUrl = UrlExtract, BadUrl
 
     def _webtest(self):
             url -- Bad URL to open and parse'''
             # Use different classes if faster SGML Parser is available
             if self._newparser:
+                # sgmlop parser must have a handler passed to it
                 parser, urlget = self._newparser(), BadUrl()
+                # Pass handler (sgmlop cannot be subclassed)
                 parser.register(urlget)
                 parser.feed(urlopen(url).read())
                 parser.close()
+            # Use classic parser
             else:
                 urlget = BadUrl()
                 urlget.feed(urlopen(url).read())
                 urlget.close()
+            # Return signature of bad URL
             return urlget.text
 
         # Make globals local
             urlget = self._UrlExtract()
             urlget.feed(html)
             urlget.close()
+        # Return badurl marker and list of child URLs
         return urlget.badurl, urlget.urls
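
The hunk above shows the two driving styles _parserpick switches between: an sgmlop parser gets a handler object passed to register(), while the classic sgmllib parser is subclassed and fed directly. The standalone sketch below is an editor's illustration, not part of this changeset; it assumes Python 2.x, relies only on the register/feed/close calls shown above, and the TextGrab class and sample markup are invented for the example.

try:
    # sgmlop cannot be subclassed, so a plain handler object is registered
    from sgmlop import SGMLParser
    class TextGrab:
        '''Collects non-markup data'''
        def __init__(self): self.text = []
        def handle_data(self, data): self.text.append(data)
    parser, grab = SGMLParser(), TextGrab()
    parser.register(grab)
    parser.feed('<html><body>404 Not Found</body></html>')
    parser.close()
except ImportError:
    # Classic parser: subclass it and feed it directly
    from sgmllib import SGMLParser
    class TextGrab(SGMLParser):
        '''Collects non-markup data'''
        def reset(self):
            SGMLParser.reset(self)
            self.text = []
        def handle_data(self, data): self.text.append(data)
    grab = TextGrab()
    grab.feed('<html><body>404 Not Found</body></html>')
    grab.close()
print grab.text
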
 
     def _webopen(self, base):
             self.badurls.append((base[1], cbase))
             return False
         # Get real URL
-        tbase = url.geturl()
+        newbase = url.geturl()
         # Change URL if different from old URL
-        if tbase != cbase: cbase, base = tbase, (tbase, base[1])    
+        if newbase != cbase: cbase, base = newbase, (newbase, base[1])    
         # URLs with mimetype 'text/html' are scanned for URLs
         if url.headers.type == 'text/html':
             # Feed parser
         urls -- list of raw URLs
         base -- referring URL'''
         # Assignments
-        cache, visit, urlresolve = self._cache, self._visited, self._urlverify
+        cache, visit, urlverify = self._cache, self._visited, self._urlverify
         # Strip file off base URL for joining
-        tbase = base.replace(base.split('/')[-1], '') 
+        newbase = base.replace(base.split('/')[-1], '') 
         for url in urls:
-            turl, rawurls = urlresolve(url, base, tbase)
+            # Get resolved url and raw child URLs
+            url, rawurls = urlverify(url, base, newbase)
+            # Handle any child URLs
             if rawurls:
                 newurls = {}
-                # Eliminate duplicates
+                # Eliminate duplicate URLs
                 for rawurl in rawurls:
-                    # Eliminated known visited URLs
+                    # Eliminate known visited URLs
                     if rawurl not in visit: newurls[rawurl] = 1
-                # Put new urls in cache if present
-                if newurls: cache[turl] = newurls
-            if turl: yield turl
+                # Put new URLs in cache if present
+                if newurls: cache[url] = newurls
+            # Yield new URL
+            if url: yield url
 
     def _multiverify(self, url, base):
         '''Verifies a full URL relative to a base URL
         url -- a raw URL
         base -- referring URL'''
         # Assignments
-        good, cache, visited, = self._good, self._cache, self._visited
+        cache, visited = self._cache, self._visited
         # Strip file off base URL for joining
-        tbase = base.replace(base.split('/')[-1], '') 
-        turl, rawurls = self._urlverify(url, base, tbase)
+        newbase = base.replace(base.split('/')[-1], '')
+        # Get resolved url and raw child URLs
+        url, rawurls = self._urlverify(url, base, newbase)
+        # Handle any child URLs
         if rawurls:
+            # Eliminate known visited URLs and duplicates
             for rawurl in rawurls:
-                if rawurl not in visited: cache[rawurl] = turl
-        if turl: good[turl] = 1
+                # Put new URLs in cache if present
+                if rawurl not in visited: cache[rawurl] = url
+        # Put URL in list of good URLs
+        if url: self._good[url] = 1
 
-    def _urlverify(self, url, base, tbase):
+    def _urlverify(self, url, base, newbase):
         '''Returns a full URL relative to a base URL
 
         Arguments:
         url -- a raw URL
         base -- referring URL
-        tbase -- temporary version of referring URL for joining'''
+        newbase -- temporary version of referring URL for joining'''
         # Assignments
         visited, webopen, other = self._visited, self._webopen, self.other
         sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
             # Handle relative URLs i.e. ../foo/bar
             elif url.find(':') == -1:
                 # Join root domain and relative URL
-                visited[url], url = 1, urljoin(tbase, url)
+                visited[url], url = 1, urljoin(newbase, url)
                 if url in visited: return 0, 0
             # Test URL by attempting to open it
             rurl = webopen((url, base))
                     elif not urlseg[2] and urlseg[1] == sb: return 0, 0
                 # If URL exceeds depth, don't process 
                 if len(turl.split('/')) >= depth: return 0, 0
-                # Otherwise put in cache and yield url
+                # Otherwise return URL
                 else:
                     if rawurls: return turl, rawurls
                     else: return turl, []
                 for url in genverify(urls, base): yield url
 
     def _multiwalk(self, threads):
-        '''Yields good URLs from under a base URL
+        '''Extracts good URLs from under a base URL
         
         Arguments:
         threads -- number of threads to run'''
 
         def urlthread(url, base):
-            '''Spawns a thread containing multiverify function'''
+            '''Spawns a thread running the _multiverify method
+
+            Arguments:
+            url -- URL to verify
+            base -- referring URL'''
+            # Create instance of Thread
             dthread = Thread(target=multiverify, args=(url, base))
+            # Put in pool
             pool.append(dthread)
-        
+
+        # Assignments        
         pool, cache, multiverify = [], self._cache, self._multiverify
         Thread, width, good = self._thread, self.width, self._good
+        # Process URLs until the cache is empty
         while cache:
+            # Process URLs as long as width not exceeded
             if len(good) <= width:
+                # Fetch item from cache
                 url, base = cache.popitem()
+                # Make thread
                 if url: urlthread(url, base)
+                # Run threads once pool size is reached
                 if len(pool) == threads or threads >= len(cache):
+                    # Start threads
                     for thread in pool: thread.start()
+                    # Empty thread pool as threads complete
                     while pool:
                         for thread in pool:
                             if not thread.isAlive(): pool.remove(thread)
+            # End if width reached
             elif len(good) >= width: break
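
For reference, the pool pattern these new comments describe can be run on its own. The sketch below is an editor's illustration, not taken from the module: it mirrors _multiwalk by queuing Thread objects built with target/args, starting the batch once the pool is full or the remaining jobs fit, then draining the pool as threads stop reporting isAlive(). The work function and job list are placeholders.

from threading import Thread
import time

def work(url, base):
    '''Placeholder for _multiverify(url, base)'''
    time.sleep(0.1)

jobs = [('http://localhost/a', 'http://localhost/'),
        ('http://localhost/b', 'http://localhost/'),
        ('http://localhost/c', 'http://localhost/'),
        ('http://localhost/d', 'http://localhost/')]
pool, threads = [], 2
while jobs:
    url, base = jobs.pop()
    pool.append(Thread(target=work, args=(url, base)))
    # Run the batch once the pool is full or the jobs are nearly done
    if len(pool) == threads or threads >= len(jobs):
        for thread in pool: thread.start()
        # Empty the pool as threads complete
        while pool:
            for thread in pool[:]:
                if not thread.isAlive(): pool.remove(thread)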
 
     def weburls(self, base=None, width=200, depth=5, thread=None):
         if self.width and width == 200: width = self.width
         else: self.width = width
         # sgmlop crashes Python after too many iterations
-        if width > 5000: self._classpicker(1)
-        else: self._classpicker() 
+        if width > 5000: self._parserpick(1)
+        else: self._parserpick() 
         # Use global base if present
         if not base: base = self.base
-        # Verify URL
-        tbase, rawurls = self._webopen((base, ''))
-        if tbase:
+        # Verify URL and get child URLs
+        newbase, rawurls = self._webopen((base, ''))
+        if newbase:
             # Change base URL if different
-            if tbase != base: base = tbase            
+            if newbase != base: base = newbase            
             # Ensure there's a trailing '/' in base URL
             if base[-1] != '/':
                 url = list(uparse.urlsplit(base))
             newurls = {}
             for rawurl in rawurls: newurls[rawurl] = 1
             if newurls:
+                # Cache URLs individually if threads are desired
                 if thread:
                     for newurl in newurls: cache[newurl] = base
+                # Cache in group if no threads
                 else: cache[base] = newurls
             # Make base URL, get split, and put in verified URL list
             self.base, self._sb = base, base.split('/')
         self._webtest()
         # Get good URLs as long as total width isn't exceeded
         try:
+            # Multiwalk if threaded
             if thread: self._multiwalk(thread)
+            # Otherwise, use single thread
             else:
                 for item in onewalk():
+                    # Don't exceed maximum width
                     if len(good) <= width: good[item] = 1
                     elif len(good) >= width: break
         # If user interrupts crawl, return what's done
             # Return if just getting string
             if report: return report
 
-    def redirsreport(self, f=None, b=None, w=200, d=5, t=None):
+    def redireport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of URLs redirected to an external URL
         
         Arguments:
             # Return if just getting string
             if report: return report            
 
-    def otherreport(self, f=None, b=None, w=200, d=5, t=None):
+    def othereport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of non-HTTP/FTP URLs
         
         Arguments:
             outside = self.outreport()
             if outside: compile.append(outside)
         if redirs:
-            redirs = self.redirsreport()
+            redirs = self.redireport()
             if redirs: compile.append(redirs)
         if badhtm:
             badhtm = self.badhtmreport()
             if badhtm: compile.append(badhtm)        
         if other:
-            other = self.otherreport()
+            other = self.othereport()
             if other: compile.append(other)
         # Make report
         report = '\n\n'.join(compile)
                     while pool:
                         for thread in pool:
                             if not thread.isAlive(): pool.remove(thread)
-                            
+
 
 # Instance of Spider enables exporting Spider's methods as standalone functions
 _inst = Spider()
 webreport = _inst.webreport
 urlreport = _inst.urlreport
 outreport = _inst.outreport
-redirsreport = _inst.redirsreport
-otherreport = _inst.otherreport
+redireport = _inst.redireport
+othereport = _inst.othereport
 badurlreport = _inst.badurlreport
 badhtmreport = _inst.badhtmreport
     spider.urlreport('e:\\web2.txt', 'http://localhost/',)
     spider.badurlreport('e:\\web3.txt', 'http://localhost/')
     spider.badhtmreport('e:\\web4.txt', 'http://localhost/')
-    spider.redirsreport('e:\\web5.txt', 'http://localhost/')
+    spider.redireport('e:\\web5.txt', 'http://localhost/')
     spider.outreport('e:\\web6.txt', 'http://localhost')
-    spider.otherreport('e:\\web7.txt', 'http://localhost/')
+    spider.othereport('e:\\web7.txt', 'http://localhost/')
     a = spider.Spider('ftp://localhost/', 200, 16)
     a.ftppaths()
     print 1; pprint(a.paths)
     a.urlreport('e:\\web2.txt')
     a.badurlreport('e:\\web3.txt')
     a.badhtmreport('e:\\web4.txt')
-    a.redirsreport('e:\\web5.txt')
+    a.redireport('e:\\web5.txt')
     a.outreport('e:\\web6.txt')
-    a.otherreport('e:\\web7.txt')
+    a.othereport('e:\\web7.txt')