Commits

Lynn Rees committed eaab93e

- general bug swatting

  • Parent commits 9779314


Files changed (1)

 
     import urlparse as _uparse
     import urllib as _ulib
-    import ftplib as _ftplib
-    from robotparser import RobotFileParser as _robotparser 
-    from ftplib import FTP as _ftp
+    from ftplib import FTP as _ftp    
+    from ftplib import error_perm as _ftperr
     from sgmllib import SGMLParser as _sparser
-    # SGML exception handler
     from sgmllib import SGMLParseError as _sperror    
     import os as _os
     from os import path as _path
     # Use threads if available
+    from robotparser import RobotFileParser as _robotparser 
     try:
         from threading import Thread as _thread
         from threading import BoundedSemaphore as _lock
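
The import hunk above leans on the old Python 2 "use threads if available" idiom flagged by the comment. A minimal sketch of that guard, assuming Python 2 where dummy_threading ships in the standard library (the fallback shown is one common shape, not necessarily this module's own except clause):

    # Fall back to the no-op stand-ins when the interpreter
    # was built without thread support; dummy_threading mirrors
    # threading's interface with dummy locks.
    try:
        from threading import Thread as _thread
        from threading import BoundedSemaphore as _lock
    except ImportError:
        from dummy_threading import Thread as _thread
        from dummy_threading import BoundedSemaphore as _lock
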
         else: self.width = None
         if depth: self.depth = depth
         else: self.depth = None
-        self._bdlist, self._bflist = None, None
+        self._bdlist, self._bflist, self._session = None, None, None
         
     def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
         '''Returns an FTP client session
             try:
                 self._name = raw_input('Enter login name: ')
                 self._password = raw_input('Enter password: ')
-                session = self._ftp(base, self._name, self._password)
+                session = ftp(base, self._name, self._password)
                 return session
             # Retry login depending on number of attempts
-            except self._ftplib.error_perm:               
+            except ftperr:               
                 if attempts >= tries:
                     session = ftpprompt(tries)
                     return session
                     sys.exit(0)
 
         # Strip 'ftp://' from URL
-        su = self._uparse.urlsplit(base)
+        su, ftp, ftperr = self._uparse.urlsplit(base), self._ftp, self._ftperr
+        self._name, self._password = name, password
         # Set URL, path
         base, path = su[1], '/'.join([su[2], ''])
         # Connect if arguments are correct
-        try: session = self._ftp(base, self._name, self._password)
+        try: session = ftp(base, name, password)
         # Otherwise, prompt for username, password
-        except self._ftplib.error_perm: session = ftpprompt()
+        except ftperr: session = ftpprompt()
         # Change to remote path if it exists
         if path: session.cwd(path)
         return session
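
The pattern in `_ftpopen` is connect-then-reprompt: try the supplied credentials, catch ftplib.error_perm on a rejected login, and ask the user again up to a fixed number of attempts. A self-contained sketch of that flow (the function name and loop shape here are illustrative, not the module's own):

    import ftplib

    def open_ftp(host, name='anonymous', password=None, attempts=3):
        for _ in range(attempts):
            try:
                # FTP(host, user, passwd) connects and logs in at once
                return ftplib.FTP(host, name, password)
            except ftplib.error_perm:
                # Credentials rejected: prompt and retry
                name = raw_input('Enter login name: ')
                password = raw_input('Enter password: ')
        raise SystemExit('Too many failed login attempts')
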
         d -- how deep in FTP hierarchy to crawl (default: 6)             
         n -- FTP user name (default: 'anonymous')
         p -- FTP password (default: None)'''
-        # Pass ftpspider and root to base '_mirror' function
-        if base: self.ftpspider(b, w, d, n, p)
+        if b: self.ftpspider(b, w, d, n, p)
         return self._mirror((self.paths, self.urls), r, t)
 
     def ftppaths(self, b=None, w=200, d=6, n='anonymous', p=None):
 
             Arguments:
             rdir -- remote directory list'''
-            rlist, rappend = [], rlist.append
+            rlist = []
+            rappend = rlist.append
             for rl in rdir:
                 # Split remote file based on whitespace
                 ri = rl.split()[-1]
                             # Run 'visitftp' on new directory
                             visitftp()
                             
-        if base: self.base = base
-        else: base = self.base
+        if b: self.base = b
+        else: b = self.base
         if self.width and w == 200: width = self.width
         else: width = w
         if self.depth and d == 6: depth = self.depth + 1
         else: depth = d + 1
-        files, dirs = {}, {}        
+        files, dirs = {}, {}
         # Connect to FTP site
-        ftp = self._ftpopen(b, n, p)        
+        if self._session: ftp = self._session
+        else:
+            ftp = self._ftpopen(b, n, p)
+            self._session = ftp
         # Avoid outside namespace lookups
-        cwd, pwd, retr, self._session = ftp.cwd, ftp.pwd, ftp.retrlines, ftp
+        cwd, pwd, retr = ftp.cwd, ftp.pwd, ftp.retrlines
         # Walk FTP site
         visitftp()
         # Make path list out of files keys and return it
         self.paths = files.keys()
         return self.paths
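
ftppaths now caches the live connection on the instance so repeated crawls reuse one login instead of reconnecting each time. The memoization is just a None check before connecting; a stripped-down sketch of the pattern (class and method names here are illustrative):

    import ftplib

    class _SessionCache(object):
        def __init__(self):
            # No session until the first connection is made
            self._session = None

        def _connect(self, host, name='anonymous', password=None):
            # Reuse the cached session; log in only once
            if self._session is None:
                self._session = ftplib.FTP(host, name, password)
            return self._session
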
 
-    def ftpspider(self, b, w=200, depth=6, n='anonymous', p=None):
+    def ftpspider(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns two lists of URLs and paths and an active FTP session
         
         Arguments:
         d -- how deep in FTP hierarchy to crawl (default: 6)                
         n -- FTP user name (default: 'anonymous')
         p -- FTP password (default: None)'''
-        return ftppaths(url, width, depth, name, pw), ftpurls(), self._session
+        return self.ftppaths(b, w, d, n, p), self.ftpurls(), self._session
 
     def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns a list of URLs
         n -- FTP user name (default: 'anonymous')
         p -- FTP password (default: None)'''
         # Run ftppaths if URL passed as argument
-        if url: ftppaths(url, width, depth, name, pw)
-        base, urls = self.base, self.urls
-        self.paths = [''.join([base, i]) for i in paths]
-        self.paths.sort()
-        return self.paths
+        if b: self.ftppaths(b, w, d, n, p)
+        base, paths = self.base, self.paths
+        paths.sort()
+        self.urls = [''.join([base, i]) for i in paths]
+        return self.urls
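
ftpurls rebuilds full URLs by sorting the stored paths and prefixing each with the base URL, and the results now land in self.urls rather than clobbering self.paths. For example (the host is made up for illustration):

    base = 'ftp://ftp.example.com/'          # illustrative base URL
    paths = ['pub/b.txt', 'pub/a.txt']
    paths.sort()
    urls = [''.join([base, i]) for i in paths]
    # -> ['ftp://ftp.example.com/pub/a.txt', 'ftp://ftp.example.com/pub/b.txt']
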
 
     def _webtest(self):
         '''Generates signatures for identifying bad URLs'''
 
         def pathize():            
             '''Strips base URL from full URLs to produce paths'''
-            for url in self.urls:
+            urls = self.urls
+            for url in urls:
                 # Remove base URL from path list
                 url = url.replace(self.base, '')
+                if not url: url = '/index.html'
+                elif url[-1] == '/': url = ''.join([url, 'index.html'])
                 # Verify removal of base URL and remove it if found
-                if url.find('http:') != -1: url = urlsplit(url)[2:][0]
+                if url.find('http:') != -1: url = urlsplit(url)[2:][0]                
                 yield url
 
         urlsplit = self._uparse.urlsplit
         if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1])
         # Return path list after stripping base URL
         self.paths = list(pathize())
-        self.paths.sort()
         return self.paths
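
pathize is a generator that strips the base URL from each child URL and, per the new lines above, maps a bare base to '/index.html' and a trailing slash to an explicit index page. Restated as a standalone sketch (Python 2, urlparse module; the free-standing signature is an assumption for illustration):

    from urlparse import urlsplit

    def pathize(urls, base):
        for url in urls:
            # Drop the base URL, leaving a site-relative path
            url = url.replace(base, '')
            # Directory URLs become explicit index pages
            if not url: url = '/index.html'
            elif url[-1] == '/': url = ''.join([url, 'index.html'])
            # An absolute URL survived the replace: keep only its path
            if url.find('http:') != -1: url = urlsplit(url)[2]
            yield url
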
         
     def webmirror(self, base=None, root=None, t=None, width=200, depth=5):
         self.webspider(base, width, depth)
         return self._mirror((self.paths, self.urls), root, t)
     
-    def webspider(self, base, width=200, depth=5):
+    def webspider(self, base=None, width=200, depth=5):
         '''Returns two lists of child URLs and paths
         
         Arguments:
             # Sync with the URL for oldpath
             url = urls[paths.index(oldpath)]
             # Create name of local copy
-            newpath = normcase(oldpath).lstrip(sep)
+            newpath = normcase(oldpath).lstrip(sep)            
             # Get directory name
             dirname = split(newpath)[0]
             # If the directory exists, download the file directly
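
The mirroring step derives a local file name by normalizing the remote path, stripping its leading separator, and splitting off the directory part. On a POSIX system, for example (the path is illustrative):

    from os import sep
    from os.path import normcase, split

    oldpath = '/pub/docs/readme.txt'             # illustrative remote path
    newpath = normcase(oldpath).lstrip(sep)      # 'pub/docs/readme.txt'
    dirname = split(newpath)[0]                  # 'pub/docs'
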