Commits

Lynn Rees committed 7a14073

- polishing

  • Parent commits 9bcfddc

Files changed (1)

 ## See COPYRIGHT file for license terms.
 
 __name__ = 'spider'
-__version__ = '0.45'
+__version__ = '0.5'
 __author__ = 'L.C. Rees (xanimal@users.sf.net)'
 __all__ = ['ftpurls', 'ftppaths', 'weburls', 'ftpmirror', 'ftpspider',
     'webpaths', 'webreport', 'webmirror', 'webspider', 'urlreport',
     'badurlreport', 'badhtmlreport', 'redirectreport', 'outsidereport',
     'unhttpreport']
 
-'''This module provides FTP and Web spiders and mirroring utilities in one
-convenient module. FTP sites are crawled by walking a remote directory tree.
-Websites are crawled by visiting URLs extracted from HTML pages. Downloads
-during mirroring are multi-threaded. Other features included logging bad URLs
-and their referring URL and finding horrifically malformed HTML.'''
+'''Provides FTP and Website crawling, reporting, and mirroring in one
+convenient module.'''
 
 from __future__ import generators
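
The names in __all__ are exported as plain module-level functions, so the usual entry point is a single call. A minimal quick-start sketch, assuming the module imports as spider; the FTP host and the limits below are hypothetical:

    import spider

    # Crawl a hypothetical anonymous FTP area: at most 50 resources, 3 levels deep
    paths = spider.ftppaths('ftp://ftp.example.org/pub/', w=50, d=3)
    for path in paths:
        print path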
 
 
 class Spider:
 
-    '''Network protocol spiders, reporting, and checking'''
+    '''HTTP and FTP crawling, reporting, and checking'''
     
+    import os as _os
     import urllib as _ulib
     import urlparse as _uparse
-    from ftplib import FTP as _ftp    
+    from os import path as _path 
+    from ftplib import FTP as _ftp      
     from ftplib import error_perm as _ftperr
     from sgmllib import SGMLParser as _sparser
     from sgmllib import SGMLParseError as _sperror
     from robotparser import RobotFileParser as _robotparser
-    import os as _os
-    from os import path as _path
     # Use threads if available 
-    try:
-        from threading import Thread as _thread
-        from threading import BoundedSemaphore as _lock
+    try: from threading import Thread as _thread
     except ImportError: pass
 
     def __init__(self, base=None, width=None, depth=None):
-        '''Initializes Spider class and base values
+        '''Initializes a Spider instance and base values
 
         Arguments:
-        base -- FTP site to crawl (default: None)
+        base -- URL to crawl (default: None)
         width -- maximum resources to crawl (default: None) 
         depth -- how deep in a hierarchy to crawl (default: None)'''
         if base: self.base = base
         else: self.base = None
         if width: self.width = width
         else: self.width = None
         if depth: self.depth = depth
         else: self.depth = None
-        self._bdlist, self._bflist, self._session = None, None, None
+        self._bdsig, self._bfsig, self._session = None, None, None
         
     def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
         '''Returns an FTP client session
 
         Arguments:
-        base -- FTP server address
+        base -- FTP server URL
         name -- login name (default: 'anonymous')
         password -- login password (default: None)
-        attempts -- number of login failures to allow (default: 3)'''
+        attempts -- number of login attempts to try (default: 3)'''
 
         def ftpprompt(tries=0):
             '''Prompts for FTP username and password'''
             tries += 1
             try:
                 self._name = raw_input('Enter login name: ')
                 self._password = raw_input('Enter password: ')
                 session = ftp(base, self._name, self._password)
                 return session
-            # Retry login depending on number of attempts
+            # If login attempt fails, retry login
             except ftperr:               
                 if attempts >= tries:
                     session = ftpprompt(tries)
                     return session
-                # If login name and password incorrect, end program
+                # Too many login attempts? End program
                 elif attempts <= tries:
                     print 'Permission denied'
                     import sys
                     sys.exit(0)
 
-        # Strip 'ftp://' from URL
         su, ftp, ftperr = self._uparse.urlsplit(base), self._ftp, self._ftperr
         self._name, self._password = name, password
-        # Set URL, path
+        # Set URL, path, and strip 'ftp://' off
         base, path = su[1], '/'.join([su[2], ''])
-        # Connect if arguments are correct
         try: session = ftp(base, name, password)
-        # Otherwise, prompt for username, password
+        # Prompt for username, password if initial arguments are incorrect
         except ftperr: session = ftpprompt()
         # Change to remote path if it exists
         if path: session.cwd(path)
         return session
 
-    def ftpmirror(self, b=None, r=None, t=None, w=200, d=6, n='anonymous', p=None):
-        '''Mirrors an FTP site's content on a local filesystem
+    def ftpmirror(self, b=None, l=None, t=None, w=200, d=6, n='anonymous', p=None):
+        '''Mirrors an FTP site on a local filesystem
         
         Arguments:
-        b -- FTP site to crawl (default: None)
-        r -- local filesystem path (default: None)
-        t -- number of threads to spawn (default: None)
-        w -- maximum resources to crawl (default: 200)   
-        d -- how deep in FTP hierarchy to crawl (default: 6)             
-        n -- FTP user name (default: 'anonymous')
-        p -- FTP password (default: None)'''
+        b -- FTP server URL (default: None)
+        l -- local filesystem path (default: None)
+        t -- number of download threads (default: None)
+        w -- maximum number of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 6)             
+        n -- login username (default: 'anonymous')
+        p -- login password (default: None)'''
         if b: self.ftpspider(b, w, d, n, p)
-        return self._mirror((self.paths, self.urls), r, t)
+        return self._mirror((self.paths, self.urls), l, t)
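
A usage sketch for the signature above, with a hypothetical server and local path; the thread count and limits are illustrative only:

    import spider

    # Mirror a hypothetical FTP area into a local directory with 4 download threads
    spider.ftpmirror('ftp://ftp.example.org/pub/', '/tmp/pub-mirror', t=4, w=50, d=3)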
 
     def ftppaths(self, b=None, w=200, d=6, n='anonymous', p=None):
-        '''Crawls an FTP site and returns a list of paths.
+        '''Returns a list of FTP paths.
         
         Arguments:
-        b -- FTP site to crawl (default: None)
-        w -- maximum resources to crawl (default: 200) 
-        d -- how deep in FTP hierarchy to crawl (default: 6)               
-        n -- FTP user name (default: 'anonymous')
-        p -- FTP password (default: None)'''
+        b -- FTP server URL (default: None)
+        w -- maximum number of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 6)               
+        n -- login username (default: 'anonymous')
+        p -- login password (default: None)'''
         
         def sortftp(rdir):
-            '''Returns a list of FTP items marked as files or directories
+            '''Returns a list of entries marked as files or directories
 
             Arguments:
             rdir -- remote directory list'''
             return rlist
        
         def visitftp():
-            '''Visits an FTP site and extracts a list of its contents'''
+            '''Extracts contents of an FTP directory'''
             wd = pwd()
             if wd[-1] != '/': wd = '/'.join([wd, ''])
-            # Add working directory to list of visited directories
+            # Add present working directory to visited directories
             dirs[wd], rlist = None, []
             # Get list of current directory's contents
             retr('LIST -a', rlist.append)
                             # Run 'visitftp' on new directory
                             visitftp()
                             
+        # Use classwide attributes if set
         if b: self.base = b
         else: b = self.base
+        # Use classwide width if different from method default
         if self.width and w == 200: width = self.width
         else: width = w
+        # Use classwide depth if different from method default
         if self.depth and d == 6: depth = self.depth + 1
         else: depth = d + 1
+        # File and directory dicts
         files, dirs = {}, {}
-        # Connect to FTP site
+        # Use existing FTP client session if present
         if self._session: ftp = self._session
+        # Create new FTP client session if necessary
         else:
             ftp = self._ftpopen(b, n, p)
             self._session = ftp
         cwd, pwd, retr = ftp.cwd, ftp.pwd, ftp.retrlines
         # Walk FTP site
         visitftp()
-        # Make path list out of files keys and return it
+        # Make path list out of files' keys and return it
         self.paths = files.keys()
         self.paths.sort()
         return self.paths
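
A sketch of calling this method on a dedicated Spider instance rather than through the module-level alias; the host is hypothetical. The paths are also cached on the instance, and the FTP client session is kept in self._session for the other ftp* methods:

    import spider

    s = spider.Spider()
    paths = s.ftppaths('ftp://ftp.example.org/pub/', w=50, d=3)
    # Results are kept on the instance as well
    assert paths == s.paths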
 
     def ftpspider(self, b=None, w=200, d=6, n='anonymous', p=None):
-        '''Returns two lists of URLs and paths and an active FTP session
+        '''Returns lists of URLs and paths plus a live FTP client session
         
         Arguments:
-        b -- FTP site to crawl (default: None)
-        w -- maximum resources to crawl (default: 200)
-        d -- how deep in FTP hierarchy to crawl (default: 6)                
-        n -- FTP user name (default: 'anonymous')
-        p -- FTP password (default: None)'''
+        b -- FTP server URL (default: None)
+        w -- maximum number of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 6)               
+        n -- login username (default: 'anonymous')
+        p -- login password (default: None)'''
         return self.ftppaths(b, w, d, n, p), self.ftpurls(), self._session
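
A sketch of consuming the three-part return value; the host is hypothetical, and the live FTP session is closed explicitly when done:

    import spider

    paths, urls, session = spider.ftpspider('ftp://ftp.example.org/pub/', w=50, d=3)
    print '%d paths, %d URLs' % (len(paths), len(urls))
    # The third item is the live ftplib.FTP session; quit it when finished
    session.quit()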
 
     def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns a list of FTP URLs
         
         Arguments:
-        b -- Remote FTP address (default: None)
-        w -- maximum amount of resources to crawl (default: 200)
-        d -- how deep in FTP hierarchy to crawl (default: 6)
-        n -- FTP user name (default: 'anonymous')
-        p -- FTP password (default: None)'''
+        b -- FTP server URL (default: None)
+        w -- maximum number of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 6)               
+        n -- login username (default: 'anonymous')
+        p -- login password (default: None)'''
         if b: ftppaths(b, w, d, n, p)
-        base, paths = self.base, self.paths
+        # Get rid of trailing '/' in base if present before joining
+        if self.base[-1] == '/': base = self.base[:-1]
+        else: base = self.base
+        paths = self.paths
         self.urls = [''.join([base, i]) for i in paths]
         return self.urls
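
A sketch of the join performed above: each URL is simply the crawl base concatenated with a crawled path. The host is hypothetical:

    import spider

    urls = spider.ftpurls('ftp://ftp.example.org/', w=50, d=3)
    for url in urls[:5]:
        print url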
 
         '''Generates signatures for identifying bad URLs'''
 
         class BadUrl(self._sparser):
+            
             '''Collects results of intentionally incorrect URLs'''
             
             def handle_data(self, data):
                 '''Collects lines to profile not found responses'''
-                # Adds first 5 lines of non-markup data to the list "text"
+                # Adds first 5 lines of non-markup data to list "text"
                 if len(self.text) <= 5: self.text.append(data)
     
             def reset(self):
                 self.text = []        
 
         def badurl(url):
-            '''Runs URL opening and parsing for the BadUrl class.
+            '''Returns first 5 lines of a bad URL
 
             Arguments:                
             url -- Bad URL to open and parse'''
-            # Read URL 
             parser.feed(urlopen(url).read())
             parser.close()
             return parser.text
 
         # Make globals local                
-        base, self._bflist, urljoin = self.base, [], self._uparse.urljoin
-        parser, urlopen, self._bdlist = BadUrl(), self._ulib.urlopen, []
+        base, self._bfsig, urljoin = self.base, [], self._uparse.urljoin
+        parser, urlopen, self._bdsig = BadUrl(), self._ulib.urlopen, []
         # Generate random string of jibber
         from string import letters, digits
         from random import choice, randint
         jibber = ''.join([letters, digits])
         ru = ''.join([choice(jibber) for x in range(randint(1, 30))]) 
         # Builds bad URL signature of a bad file URL
-        self._bflist.extend(badurl(urljoin(base, '%s.html' % ru)))
+        self._bfsig.extend(badurl(urljoin(base, '%s.html' % ru)))
         parser.reset()
         # Builds bad URL signature of a bad directory URL
-        self._bdlist.extend(badurl(urljoin(base,'%s/' % ru)))    
+        self._bdsig.extend(badurl(urljoin(base,'%s/' % ru)))    
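
The two signatures built above exist to recognize "not found" responses by their content: urllib.urlopen returns the error page body rather than raising for a missing resource, so the opening non-markup text of a deliberately bogus file URL and a bogus directory URL become fingerprints, and UrlExtract (below) flags any crawled page whose opening text matches either one. A minimal standalone sketch of the same idea, not part of the module's API, with a hypothetical host:

    import urllib, random, string
    from sgmllib import SGMLParser

    class FirstLines(SGMLParser):
        '''Collects the first few lines of non-markup text from a page'''
        def reset(self):
            SGMLParser.reset(self)
            self.text = []
        def handle_data(self, data):
            if len(self.text) <= 5: self.text.append(data)

    def firstlines(url):
        parser = FirstLines()
        parser.feed(urllib.urlopen(url).read())
        parser.close()
        return parser.text

    # Fingerprint a URL that cannot exist on the (hypothetical) site
    bogus = ''.join([random.choice(string.letters) for i in range(20)])
    badsig = firstlines('http://www.example.org/%s.html' % bogus)
    # A page whose opening text matches the fingerprint is a bad URL
    print firstlines('http://www.example.org/missing.html') == badsig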
 
     def _webfactory(self):
-        '''Returns the needed class as an object'''        
-        bflist, bdlist = self._bflist, self._bdlist
+        '''Returns a UrlExtract instance'''
+        bfsig, bdsig = self._bfsig, self._bdsig
 
         class UrlExtract(self._sparser):
+            
             '''Extracts URLs from HTML documents'''
 
             def handle_data(self, data):
                 # Get first 5 lines of non-markup data
                 if len(self.text) <= 5: self.text.append(data)
                 # Compare signature of known bad URL to a new web page
-                if self.text == bflist: self.badurl = 1
-                elif self.text == bdlist: self.badurl = 1
+                if self.text == bfsig: self.badurl = 1
+                elif self.text == bdsig: self.badurl = 1
     
             def reset(self):
                 '''Resets SGML parser and clears lists'''
         # Make globals local
         cache, visited, webopen = self._cache, self._visited, self._webopen
         sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
-        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag        
-        outside, redirects = self.outside,  self.redirects
+        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
+        outside, redirects = self.outside, self.redirects
         supported = ['HTTP', 'http', 'HTTPS', 'https', 'FTP', 'ftp']
         # Prepare base for joining
         tbase, unhttp = base.replace(base.split('/')[-1], ''), self.unhttp 
         report = '\n\n'.join(comp)
         if file: open(file, 'w').write(report)
         else: return report
-
-    def _hthread(self, source):
-        '''Recursively extracts all URLs within a base URL
-
-        Arguments:                
-        source -- source to search for URLs'''
-
-        def hethread(url):
-            '''Spawns a thread containing the download function'''
-            # Create thread
-            lock.acquire()
-            threat = Thread(target=hthread, args=(url,))
-            lock.release()
-            return threat        
-
-        Thread, pool, lock = self._thread, [], self._mlock
-        good, hthread = self._good, self._hthread
-        newurls = self._textract(source)
-        # If URL is good and newurls returns a list of good URLs...
-        if self.width >= len(good):
-            if newurls:
-                lock.acquire()
-                good[source[0]] = 1
-                lock.release()
-                while newurls: pool.append(hethread(newurls.popitem()))
-                for thread in pool: thread.start()
-                while pool:
-                    for thread in pool:
-                        if not thread.isAlive(): pool.remove(thread)
-                return None
-        else:
-            import sys
-            sys.exit(0)
-
-    def _textract(self, base):
-        '''Extracts URLs from HTML documents and puts them in a list
-        Based on 'urllist.py' by Mark Pilgrim
-
-        base -- base URL searched for references to other URLS'''
-        # Avoid outside namespace lookups        
-        resolve, visited, depth = self._urlresolve, self._visited, self.depth
-        lock = self._mlock
-        # Test if robots can download
-        try:            
-            if self._robot.can_fetch('*', base[0]):
-                lock.acquire()
-                url = self._ulib.urlopen(base[0])
-                lock.release()
-        # If HTTP error, log as bad URL and abort.
-        except IOError:
-            lock.acquire()
-            visited[base[0]] = 1           
-            self.badurls.append((base[1], base[0]))
-            lock.release()
-            return False
-        # Only URLs with mimetype 'text/html" are processed
-        if url.headers.type == 'text/html':
-            # Feed parser
-            lock.acquire()
-            parser = self._webfactory()
-            lock.release()
-            try: parser.feed(url.read())
-            # Log URL if HTML so broken SGML parser can't parse it 
-            except self._sperror:
-                lock.acquire()
-                visited[base[0]] = 1
-                self.badhtml.append(base[0])
-                lock.release()
-                return False 
-            parser.close()
-            url.close()
-            # If the URL is bad (after BadUrl), stop processing and log URL
-            if parser.badurl:
-                lock.acquire()
-                visited[base[0]] = 1
-                self.badurls.append((base[1], base[0]))
-                lock.release()
-                return False
-            # Otherwise assume the URL is good...
-            else:
-                # Final and potential list of all URLs 
-                fdict = {}
-                lock.acquire()
-                for url in resolve(parser.urls, base[0]):
-                    # Block URLS that exceed allowed depth
-                    if len(url[0].split('/')) >= depth: visited[url[0]] = 1
-                    # Put others in final list
-                    elif url not in visited:
-                        # Ensure visited URLs are never visited again
-                        visited[url[0]] = 1
-                        # Add processed URL to final list of URLs
-                        fdict[url[0]] = url[1]
-                lock.release()
-                return fdict            
-        # If a URL does not have the MIME type "text/html", return
-        else: return False
-
-    def webthread(self, base, width=200, depth=5):
-        '''Crawls a URL via HTTP and returns a full list of child URLs.
         
-        Arguments:
-        base -- base URL that is going to be crawled
-        depth -- how deep in HTTP hierarchy to crawl (default: 5)
-        width -- how many remote resources to crawl (default: 100)'''
-        self._visited, self._good, self.badurls = {}, {}, []
-        uparse, good, self.badhtml = self._uparse, self._good, []       
-        # Resolve real base URL (no redirection aliases)
-        base, self._mlock = self._ulib.urlopen(base).geturl(), _lock(9)
-        # Ensure there's a trailing '/' in base URL
-        if base[-1] != '/':
-            url = list(uparse.urlsplit(base))
-            url[1] = ''.join([url[1], '/'])
-            base = uparse.urlunsplit(url)
-        good[base] = 1
-        self.base, self._visited[base], self._sb = base, 1, base.split('/')
-        self.depth, self.width = depth + len(self._sb), width
-        # Set robot limits
-        self._robot = self._robotparser(''.join([base, 'robots.txt']))
-        self._robot.read()
-        # Find good URLs under base URL
-        self._webtest()
-        # Get good URLs
-        self._hthread((base, ''))
-        self.urls = good.keys()
-        self.urls.sort()
-        return self.urls, self.badurls
-
     def _formatreport(self, header, body, file=None):
         '''Generic prettifier with date/time stamper
         
 ftpspider = _inst.ftpspider
 webmirror = _inst.webmirror
 webspider = _inst.webspider
-webthread = _inst.webthread
 webreport = _inst.webreport
 urlreport = _inst.urlreport
 unhttpreport = _inst.unhttpreport
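
The names bound at module level here are methods of what appears to be a single shared Spider instance (_inst), so one-off scripts can call them directly, while code that needs independent crawl settings or FTP sessions can instantiate Spider itself. A sketch with hypothetical hosts:

    import spider

    # Separate instances keep their own base, width, depth, and FTP session
    a = spider.Spider('ftp://ftp.example.org/', width=50, depth=3)
    b = spider.Spider('ftp://ftp.example.net/', width=50, depth=3)
    print len(a.ftppaths()), len(b.ftppaths())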