Commits

Lynn Rees committed 32ba7f6

- begin merging in experimental asynchronous thing

  • Participants
  • Parent commits b9737f3

Comments (0)

Files changed (1)

     
     import os as _os
     import urllib as _ulib
-    import urlparse as _uparse
+    import asyncore as _async
+    import urlparse as _uparse
     from os import path as _path 
     from ftplib import FTP as _ftp
+    import mimetools as _mimetools
+    import traceback as _traceback
+    from socket import AF_INET as _afinet
     from time import strftime as _formtime
-    from time import localtime as _localtime
-    from ftplib import error_perm as _ftperr        
-    from sgmllib import SGMLParseError as _sperror
-    from robotparser import RobotFileParser as _rparser
+    from time import localtime as _localtime
+    from ftplib import error_perm as _ftperr
+    from cStringIO import StringIO as _stringio
+    from socket import SOCK_STREAM as _sockstream
+    from sgmllib import SGMLParseError as _sperror
+    from robotparser import RobotFileParser as _rparser
     # Use threads if available 
     try: from threading import Thread as _thread
     except ImportError: pass
+    
     _bdsig, _bfsig, _session, _newparser = None, None, None, None
     # HTML tags with URLs
     _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
     _supported = {'HTTP':1, 'http':1, 'HTTPS':1, 'https':1, 'FTP':1, 'ftp':1}
     # HTML attributes with URLs
     _urlattrs = {'href':1, 'src':1, 'data':1}
+    # HTTP redirect codes
+    _rcodes = {'300':1, '301':1, '302':1, '303':1, '304':1, '305':1, '306':1}
+    # Good HTTP codes
+    _okcodes = {'100':1, '101':1, '200':1, '201':1, '202':1, '203':1, '204':1,
+        '205':1, '206':1}
 
     def __init__(self, base=None, width=None, depth=None):
         '''Initializes a Spider instance and its base attributes
         Arguments:
         base -- URL to crawl (default: None)
         width -- maximum resources to crawl (default: None)
-        depth -- how deep in a hierarchy to crawl (default: None)'''                             
+        depth -- how deep in a hierarchy to crawl (default: None)'''
         if base: self.base = base
         else: self.base = None
         if width: self.width = width
         else: self.width = None
         if depth: self.depth = depth
         else: self.depth = None
+        # Localize attributes used by the nested asynchronous classes below
+        urlparse, stringio = self._uparse.urlparse, self._stringio
+        sockstream, afinet = self._sockstream, self._afinet
+        rcodes, okcodes = self._rcodes, self._okcodes
+        badurl, badhtm = self._badurl, self._badhtm
+        sperror, visited = self._sperror, self._visited
+        mimetools, traceback = self._mimetools, self._traceback
+        badhtms = self._badhtms
         
+        class AsyncHttp(self._async.dispatcher_with_send):
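+            '''Asynchronously fetches a URL and feeds the response to a consumer'''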
+            
+            def __init__(self, url, consumer):
+                self._async.dispatcher_with_send.__init__(self)
+                self.url, self.consumer = url, consumer
+                getstring = 'GET %s HTTP/1.0\r\nHost: %s\r\n\r\n'
+                scheme, host, path, params, query, fragment = urlparse(url)
+                assert scheme == 'http', 'only supports HTTP requests'
+                try:
+                    host, port = host.split(':', 1)
+                    port = int(port)
+                except (TypeError, ValueError): port = 80
+                if not path: path = '/'
+                if params: path = ';'.join([path, params])
+                if query: path = '?'.join([path, query])
+                self.request = getstring % (path, host)
+                self.host, self.port, self.status = host, port, None
+                self.header, self.data = None, ''
+                self.create_socket(afinet, sockstream)
+                self.connect((host, port))
+
+            def handle_connect(self):
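+                # Send the buffered GET request once the connection is established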
+                self.send(self.request)
+
+            def handle_error(self):
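+                # Dump the traceback and drop the connection on unexpected errors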
+                traceback.print_exc()
+                self.close()
+
+            def handle_expt(self):
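+                # Treat exceptional conditions as end of response, but still notify
+                # the consumer's http_header handler if it has one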
+                self.close()
+                try: http_header = self.consumer.http_header
+                except AttributeError: pass
+                else: http_header(self)
+
+            def handle_read(self):
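+                # Read a chunk; until the headers are parsed, buffer the data and
+                # look for the blank line that ends the HTTP header block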
+                data = self.recv(2048)
+                if not self.header:
+                    self.data = ''.join([self.data, data])
+                    try: i = self.data.index('\r\n\r\n')
+                    except ValueError: return
+                    else:
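+                        # Parse the status line and headers, then hand them to the consumer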
+                        fp = stringio(self.data[:i+4])
+                        status = fp.readline()
+                        self.status = status.split(' ', 2)
+                        self.header = mimetools.Message(fp)
+                        data = self.data[i+4:]
+                        self.data = ''
+                        self.type = self.header['Content-Type']
+                        self.consumer.url = self.url
+                        self.consumer.type = self.type
+                        try: http_header = self.consumer.http_header
+                        except AttributeError: pass
+                        else: http_header(self)
+                        if not self.connected: return
+                try: self.consumer.feed(data)
+                except sperror:
+                    visited[self.url] = 1
+                    if badhtm: badhtms[self.url] = 1
+                
+            def handle_close(self):
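+                # Tell the consumer the response is complete, then close the socket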
+                self.consumer.close()
+                self.close()
+
+        class Redir:
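+            '''Base consumer that follows HTTP redirects and flags bad responses'''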
+            
+            def __init__(self, consumer):
+                self.consumer, self.url, self.type = consumer, None, None
+                self.badurl = None
+                
+            def http_header(self, request):
+                # Pass non-redirect responses on to the consumer, flagging bad statuses
+                if request.status is None or request.status[1] not in rcodes:
+                    if request.status and request.status[1] not in okcodes:
+                        self.badurl = 1
+                    try: http_header = self.consumer.http_header
+                    except AttributeError: pass
+                    else: return http_header(request)
+                else:
+                    if 'location' in request.header:
+                        url = request.header['location']
+                    elif 'uri' in request.header:
+                        url = request.header['uri']
+                    request.close()
+                    # Re-issue the request at the redirect target
+                    AsyncHttp(url, self)
+
+        class ExRedir(Redir):
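+            '''Redirect-following consumer that extracts URLs from HTML responses'''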
+
+            def feed(self, data):
+                if self.type == 'text/html': self.consumer.feed(data)
+
+            def close(self):
+                # A bad response from either the redirect handler or the parser marks the URL bad
+                if self.badurl or self.consumer.badurl:
+                    visited[self.url] = 1
+                    if badurl: badurls.append((base, oldurl, newurl))
+                elif self.consumer.urls:
+                    stash[self.url] = self.consumer.urls
+                else: stash[self.url] = []
+                self.consumer.close()
+
+        class BadRedir(Redir):
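+            '''Redirect-following consumer that collects text from intentionally bad URLs'''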
+
+            def feed(self, data):
+                self.consumer.feed(data)
+
+            def close(self):
+                stash.append(self.consumer.text)
+                self.consumer.close()
+
+        self._BadRedir, self._ExRedir = BadRedir, ExRedir
+        self._AsyncHttp = AsyncHttp
+
     def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
         '''Returns FTP client session
 
         password -- login password (default: None)
         attempts -- number of login attempts to try (default: 3)'''
 
-        def ftpprompt(tries=0):        
+        def ftpprompt(tries=0):
             '''Prompts for FTP username and password
 
             Arguments:
                 elif attempts <= tries:
                     raise IOError, 'Permission denied.'
                     import sys
-                    sys.exit(0)           
+                    sys.exit(0)
 
         # Assignments
         self._name, self._password, ftperr = name, password, self._ftperr
         l -- local filesystem path (default: None)
         b -- FTP server URL (default: None)
         t -- number of download threads (default: None)
-        w -- maximum amount of resources to crawl (default: 200)   
-        d -- depth in hierarchy to crawl (default: 6)             
+        w -- maximum amount of resources to crawl (default: 200)
+        d -- depth in hierarchy to crawl (default: 6)
         n -- login username (default: 'anonymous')
         p -- login password (default: None)'''
         if b: self.ftpspider(b, w, d, n, p)
 
     def ftppaths(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns a list of FTP paths.
-        
+
         Arguments:
         b -- FTP server URL (default: None)
         w -- maximum amount of resources to crawl (default: 200) 
-        d -- depth in hierarchy to crawl (default: 6)               
+        d -- depth in hierarchy to crawl (default: 6)
         n -- login username (default: 'anonymous')
         p -- login password (default: None)'''
-        
+
         def sortftp(rdir):
             '''Returns a list of entries marked as files or directories
 
                 # Add tuple of remote item type, permissions & name to rlist
                 if ri not in ('.', '..'): rappend((rl[0], rl[7], ri))
             return rlist
-       
+
         def visitftp():
             '''Extracts contents of an FTP directory'''
             wd = pwd()
                             cwd(purl)
                             # Run 'visitftp' on new directory
                             visitftp()
-                            
+
         # Use classwide attributes if set
         if b: self.base = b
         else: b = self.base
 
     def ftpspider(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns lists of URLs and paths plus a live FTP client session
-        
+
         Arguments:
         b -- FTP server URL (default: None)
         w -- maximum amount of resources to crawl (default: 200) 
-        d -- depth in hierarchy to crawl (default: 6)               
+        d -- depth in hierarchy to crawl (default: 6)
         n -- login username (default: 'anonymous')
         p -- login password (default: None)'''
         if b: ftppaths(b, w, d, n, p)
 
     def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None):
         '''Returns a list of FTP URLs
-        
+
         Arguments:
         b -- FTP server URL (default: None)
         w -- maximum amount of resources to crawl (default: 200) 
-        d -- depth in hierarchy to crawl (default: 6)               
+        d -- depth in hierarchy to crawl (default: 6)
         n -- login username (default: 'anonymous')
         p -- login password (default: None)'''
         if b:
             ftppaths(b, w, d, n, p)
             # Get rid of trailing '/' in base if present before joining
-            if b[-1] == '/': base = b[:-1]       
-        else:           
+            if b[-1] == '/': base = b[:-1]
+        else:
             base = self.base
             # Get rid of trailing '/' in base if present before joining
             if base[-1] == '/': base = self.base[:-1]
         urltags, urlattrs = self._urltags, self._urlattrs
         # Lists for bad file and bad directory signatures
         self._bfsig, self._bdsig = [], []
-        bfsig, bdsig = self._bfsig, self._bdsig        
+        bfsig, bdsig = self._bfsig, self._bdsig
         # Use faster SGMLParser if available
         try:
             from sgmlop import SGMLParser as newparser
                     oldparser.reset(self)
                     self.urls, self.text, self.badurl = [], [], None
                 def handle_data(self, data):
-                    '''Handles non-markup data'''            
+                    '''Handles non-markup data'''
                     # Get first 5 lines of non-markup data
                     if len(self.text) <= 5: self.text.append(data)
                     # Compare signature of known bad URL to a new web page
                         url = [v for k, v in attrs if k in urlattrs]
                         if url: self.urls.extend(url)
             # BadUrl class using classic parser
-            class BadUrl(oldparser):            
+            class BadUrl(oldparser):
                 '''Collects results of intentionally incorrect URLs'''
                 def reset(self):
                     '''Resets SGML parser and clears lists'''
         else:
             # UrlExtract class using sgmlop parser
             class UrlExtract:
-                '''Extracts URLs from a SGMLish document'''            
+                '''Extracts URLs from a SGMLish document'''
                 def __init__(self):
                     '''Resets SGML parser and clears lists'''
                     self.urls, self.text, self.badurl = [], [], None
                 def handle_data(self, data):
-                    '''Handles non-markup data'''            
+                    '''Handles non-markup data'''
                     # Get first 5 lines of non-markup data
                     if len(self.text) <= 5: self.text.append(data)
                     # Compare signature of known bad URL to a new web page
                         url = [v for k, v in attrs if k in urlattrs]
                         if url: self.urls.extend(url)
             # BadUrl class using sgmlop parser
-            class BadUrl:            
+            class BadUrl:
                 '''Collects results of intentionally incorrect URLs'''
                 def __init__(self):
                     '''Resets SGML parser and clears lists'''
-                    self.text = []            
+                    self.text = []
                 def handle_data(self, data):
                     '''Collects lines to profile not found responses'''
                     # Adds first 5 lines of non-markup data to list 'text'
         # Make resulting classes available class wide
         self._UrlExtract, self._BadUrl = UrlExtract, BadUrl
 
+    def _makeasync(self, urllist):
+        '''Queues asynchronous fetches of a list of URLs and runs the event loop
+
+        Arguments:
+        urllist -- list of URLs to fetch'''
+        for url in urllist:
+            # Only fetch URLs that robots.txt permits
+            if self._robot.can_fetch('*', url):
+                consumer = self._ExRedir(self._UrlExtract())
+                self._AsyncHttp(url, consumer)
+            # Otherwise mark the URL visited and skip it
+            else: self._visited[url] = 1
+        self._async.loop()
+
+    def webtest2(self, base):
+        '''Generates signatures for identifying bad URLs'''
+        # Assignments
+        BadUrl, BadRedir = self._BadUrl, self._BadRedir
+        urljoin, AsyncHttp = self._uparse.urljoin, self._AsyncHttp
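+        # Stash for the bad URL signature text collected by the BadRedir consumers below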
+        stash = []
+        # Generate random string of jibber
+        from string import letters, digits
+        from random import choice, randint
+        jibber = ''.join([letters, digits])
+        ru = ''.join([choice(jibber) for x in range(randint(1, 30))])
+        # Builds signature of a bad URL for a file
+        AsyncHttp(urljoin(base, '%s.html' % ru), BadRedir(BadUrl()))
+        # Builds signature of a bad URL for a directory
+        AsyncHttp(urljoin(base, '%s/' % ru), BadRedir(BadUrl()))
+        # Builds signature of a bad URL for a query
+        AsyncHttp(urljoin(base, '%s?%s' % (ru, ru)), BadRedir(BadUrl()))
+        # Run the asyncore event loop until the three requests complete
+        self._async.loop()
+        # Store signatures
+        self._bfsig.extend(stash[0])
+        self._bdsig.extend(stash[1])
+        self._bqsig.extend(stash[2])
+
     def _webtest(self):
         '''Generates signatures for identifying bad URLs'''
 
         def badurl(url):
             '''Returns first 5 lines of a bad URL
 
-            Arguments:                
+            Arguments:
             url -- Bad URL to open and parse'''
             # Use different classes if faster SGML Parser is available
             if self._newparser:
         self._bfsig.extend(badurl(urljoin(base, '%s.html' % ru)))
         # Builds signature of a bad URL for a directory
         self._bdsig.extend(badurl(urljoin(base,'%s/' % ru)))
+        # Builds signature of a bad URL for a query
+        self._bqsig.extend(badurl(urljoin(base, '%s?%s' % (ru, ru))))
 
     def _webparser(self, html):
         '''Parses HTML and returns bad URL indicator and extracted URLs
         newurl -- newly resolved URL
         base -- referring URL
         oldurl - original URL'''
-        # Assignments
-        good = self._good
         try:
             # If webspiders can access URL, open it
             if self._robot.can_fetch('*', newurl):
             # Log URL if SGML parser can't parse it 
             except self._sperror:
                 self._visited[newurl] = 1
-                if self._badhtm: self.badhtms[newurl] = 1                
+                if self._badhtm: self.badhtms[newurl] = 1
                 return 0, 0
             url.close()
             # Return URL and extracted urls if it's good
                     if rawurls: return turl, rawurls
                     else: return turl, []
             else: return 0,0
-        else: return 0, 0        
+        else: return 0, 0
 
     def _genverify(self, urls, base):
         '''Verifies a list of full URL relative to a base URL
 
     def _multiwalk(self, threads):
         '''Extracts good URLs from under a base URL
-        
+
         Arguments:
         threads -- number of threads to run'''
 
             # Put in pool
             pool.append(dthread)
 
-        # Assignments        
+        # Assignments
         pool, cache, multiverify = [], self._cache, self._multiverify
         Thread, width, good = self._thread, self.width, self._good
         # End processing if cache is empty
 
     def weburls(self, base=None, width=200, depth=5, thread=None, *vargs):
         '''Returns a list of web paths.
-        
+
         Arguments:
         base -- base web URL (default: None)
         width -- amount of resources to crawl (default: 200)
         newbase, rawurls = self._webopen(base, '', base)
         if newbase:
             # Change base URL if different
-            base, newurls = newbase, {}           
+            base, newurls = newbase, {}
             # Ensure there's a trailing '/' in base URL
             if base[-1] != '/':
                 url = list(uparse.urlsplit(base))
         t -- number of threads (default: None)
         vargs -- information to include or exclude'''
 
-        def pathize():            
-            '''Strips base URL from full URLs to produce paths'''            
+        def pathize():
+            '''Strips base URL from full URLs to produce paths'''
             for url in urls:
                 # Remove base URL from path list
                 url = url.replace(self.base, '')
         # Return path list after stripping base URL
         self.paths = list(pathize())
         return self.paths
-        
+
     def webmirror(self, root=None, t=None, base=None, width=200, depth=5):
         '''Mirrors a website on a local filesystem
 
 
     def badurlreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of bad URLs
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
 
     def badhtmreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of unparsed HTML URLs
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
 
     def redireport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of URLs redirected to an external URL
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
 
     def outreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of outside URLs referenced under the base URL
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
 
     def othereport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of non-HTTP/FTP URLs
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
 
     def urlreport(self, f=None, b=None, w=200, d=5, t=None):
         '''Pretties up a list of all URLs under a URL
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
 
     def webreport(self, f=None, b=None, w=200, d=5, t=None, *vargs):
         '''Pretties up a list of logged information under a URL
-        
+
         Arguments:
         f -- output file for report (default: None)
         b -- base web URL (default: None)
         
     def _formatreport(self, amount, header, body, file=None):
         '''Generic prettifier with date/time stamper
-        
+
         Arguments:
         header -- title of report
         body -- body of report
         file -- output file for report (default: None)'''
         # Get current time
-        localtime, strftime = self._localtime, self._formtime        
+        localtime, strftime = self._localtime, self._formtime
         curtime = strftime('%A, %B %d, %Y at %I:%M %p', localtime())
         # Make section header
         header = header % (amount, self.base, curtime)
                 local.close()
             # Use normal urlretrieve if no FTP required
             else: ulib.urlretrieve(url, np)
-            
+
         def dlthread(url, np, op):
             '''Spawns a thread containing the download function'''
             # Create thread
             dthread = Thread(target=download, args=(url, np, op))
             # Add to thread pool
             pool.append(dthread)
-                
+
         # Extract path and URL lists
         paths, urls = lists
         # Avoid outside namespace lookups
             # Sync with the URL for oldpath
             url = urls[paths.index(oldpath)]
             # Create name of local copy
-            newpath = normcase(oldpath).lstrip(sep)            
+            newpath = normcase(oldpath).lstrip(sep)
             # Get directory name
             dirname = split(newpath)[0]
-            # If the directory exists, download the file directly        
+            # If the directory exists, download the file directly
             if exists(dirname):
                 if isdir(dirname):
                     if threads: dlthread(url, newpath, oldpath)