Lynn Rees committed 8634ca2

- checkin for async networking
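
The checkin moves URL fetching onto asyncore dispatchers. A rough, self-contained sketch of that pattern (not the module's code; FetchUrl and EchoConsumer are made-up names): a dispatcher_with_send subclass writes an HTTP/1.0 GET once the socket connects and pushes whatever it reads into a consumer object exposing feed()/close(), while asyncore.loop() drives every open dispatcher concurrently.

    import asyncore, socket
    from urlparse import urlparse

    class EchoConsumer:
        '''Toy consumer: just accumulates whatever the dispatcher feeds it'''
        def __init__(self): self.data = ''
        def feed(self, data): self.data = ''.join([self.data, data])
        def close(self): print '%s bytes received' % len(self.data)

    class FetchUrl(asyncore.dispatcher_with_send):
        '''Bare-bones analogue of the AsyncHttp dispatcher in this diff'''
        def __init__(self, url, consumer):
            asyncore.dispatcher_with_send.__init__(self)
            self.consumer = consumer
            scheme, host, path = urlparse(url)[:3]
            self.request = 'GET %s HTTP/1.0\r\nHost: %s\r\n\r\n' % (path or '/', host)
            self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
            self.connect((host, 80))
        def handle_connect(self):
            self.send(self.request)
        def handle_read(self):
            # Response head and body arrive together here; the module's
            # AsyncHttp additionally splits the head off into self.header
            self.consumer.feed(self.recv(8192))
        def handle_close(self):
            self.consumer.close()
            self.close()

    # Several dispatchers can be created up front; loop() serves them all
    FetchUrl('http://example.com/', EchoConsumer())
    asyncore.loop()
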


Files changed (1)

     except ImportError: pass
     
     _bdsig, _bfsig, _session, _newparser = None, None, None, None
+    _badurl, _badhtm, _visited, _badhtms = None, None, {}, []
+    _bqsig = None
     # HTML tags with URLs
     _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
         'embed':1, 'area':1, 'frame':1, 'applet':1, 'input':1, 'base':1,
         rcodes, okcodes = self._rcodes, self._okcodes
         badurl, badhtm = self._badurl, self._badhtm
         sperror, visited = self._sperror, self._visited
-        badhtms = self._badhtms 
+        badhtms, async, traceback = self._badhtms, self._async, self._traceback 
+        mimetools = self._mimetools
         
-        class AsyncHttp(self._async.dispatcher_with_send):
+        class AsyncHttp(async.dispatcher_with_send):
             
-            def __init__(self, url, consumer):
-                self._async.dispatcher_with_send.__init__(self)
+            def __init__(self, url, consumer, base=None, oldurl=None, nurl=None):
+                async.dispatcher_with_send.__init__(self)
                 self.url, self.consumer = url, consumer
+                if base: self.base = base
+                else: self.base = None
+                if oldurl: self.oldurl = oldurl
+                else: self.oldurl = None
+                if nurl: self.nurl = nurl
+                else: self.nurl = self.url
                 getstring = 'GET %s HTTP/1.0\r\nHost: %s\r\n\r\n'
                 scheme, host, path, params, query, fragment = urlparse(url)
                 assert scheme == 'http', 'only supports HTTP requests'
-                try:
-                    host, port = host.split(':', 1)
-                    port = int(port)
-                except (TypeError, ValueError): port = 80
                 if not path: path = '/'
                 if params: path = ';'.join([path, params])
                 if query: path = '?'.join([path, query])
                 self.request = getstring % (path, host)
-                self.host, self.port, self.status = host, port, None
+                self.host, self.port, self.status = host, 80, None
                 self.header, self.data = None, ''
                 self.create_socket(afinet, sockstream)
-                self.connect((host, port))
+                self.connect((host, self.port))
 
             def handle_connect(self):
                 self.send(self.request)
                         self.type = self.header['Content-Type']
                         self.consumer.url = self.url
                         self.consumer.type = self.type
+                        self.consumer.nurl = self.nurl
+                        if self.base: self.consumer.base = self.base
+                        if self.oldurl: self.consumer.oldurl = self.oldurl
                         try: http_header = self.consumer.http_header
                         except AttributeError: pass
                         else: http_header(self)
                         if not self.connected: return
                 try: self.consumer.feed(data)
-                except self._sperror:
+                except sperror:
                     visited[self.url] = 1
                     if badhtm: badhtms[self.url] = 1
                 
 
         class Redir:
             
-            def __init__(self, consumer):
+            def __init__(self, consumer, stash):
                 self.consumer, self.url, self.type = consumer, None, None
-                self.badurl = None
+                self.badurl, self.oldurl, self.base = None, None, None
+                self.nurl, self.stash = None, stash
                 
             def http_header(self, request):
                 if request.status is None or request.status[1] not in rcodes:
                     try: http_header = self.consumer.http_header
                     except AttributeError: pass
                     else: return http_header(request)
-                    if request.status[1] not in goodhttp: self.badurl = 1
+                    if request.status[1] not in okcodes:
+                        self.consumer.badurl = 1
                 else:
                     if 'location' in request.header:
                         url = request.header['location']
                     elif 'uri' in request.header:
                         url = request.header['uri']
                     request.close()
-                    AsyncHTTP(url, self)
+                    AsyncHttp(url, self, self.base, self.oldurl, self.url)
 
         class ExRedir(Redir):
 
             def feed(self, data):
-                if self.type == 'text/html': self.consumer.feed(data)
+                if 'text/html' in self.type: self.consumer.feed(data)
 
             def close(self):
-                if self.badurl:
-                    self._visited[newurl] = 1
-                    if badurl: badurls.append((base, oldurl, newurl))
-                elif self.consumer.badurl:
-                    self._visited[newurl] = 1
-                    if badurl: badurls.append((base, oldurl, newurl))                    
+                if self.consumer.badurl:
+                    visited[self.url] = 1
+                    if badurl:
+                        badurls.append((self.base, self.oldurl, self.url))
                 elif self.consumer.urls:
-                    stash[self.url] = self.consumer.urls
-                else: stash[self.url] = []
+                    self.stash[(self.nurl, self.url)] = self.consumer.urls
+                else: self.stash[(self.nurl, self.url)] = []
                 self.consumer.close()
 
         class BadRedir(Redir):
                 self.consumer.feed(data)
 
             def close(self):
-                stash.append(self.consumer.text)
+                self.stash.append(self.consumer.text)
                 self.consumer.close()
 
         self._BadRedir, self._ExRedir = BadRedir, ExRedir
-        self._AsyncHttp = AsyncHttp                 
+        self._AsyncHttp = AsyncHttp
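
The header mapping consulted above (self.header['Content-Type'], 'location', 'uri') and the status tuple the Redir classes check are built when the response head arrives; that parsing step is not shown in this hunk, but the hunk does pull mimetools into scope, which suggests something along these lines (the response text below is hard-coded purely for illustration):

    import mimetools
    from StringIO import StringIO

    rawhead = ('HTTP/1.0 302 Found\r\n'
        'Content-Type: text/html\r\n'
        'Location: http://example.com/elsewhere\r\n')
    # First line is the status line; the rest are RFC 822 style headers
    statusline, headtext = rawhead.split('\r\n', 1)
    status = statusline.split(' ', 2)
    header = mimetools.Message(StringIO(headtext))
    print status[1]               # '302' -- compared against rcodes/okcodes
    print header['Content-Type']  # 'text/html'
    print 'location' in header    # True -- header keys are case-insensitive
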
 
     def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
         '''Returns FTP client session
         # Assignments
         urltags, urlattrs = self._urltags, self._urlattrs
         # Lists for bad file and bad directory signatures
-        self._bfsig, self._bdsig = [], []
-        bfsig, bdsig = self._bfsig, self._bdsig
+        self._bfsig, self._bdsig, self._bqsig = [], [], []
+        bfsig, bdsig, bqsig = self._bfsig, self._bdsig, self._bqsig
         # Use faster SGMLParser if available
         try:
             from sgmlop import SGMLParser as newparser
                     # Compare signature of known bad URL to a new web page
                     if self.text == bfsig: self.badurl = 1
                     elif self.text == bdsig: self.badurl = 1
+                    elif self.text == bqsig: self.badurl = 1
                 def finish_starttag(self, tag, attrs):
                     '''Extracts URL bearing tags'''
                     if tag in urltags:
                         # Get key, vale in attributes if they match
-                        url = [v for k, v in attrs if k in urlattrs]
+                        url = [v for k, v in attrs if k in urlattrs]                        
                         if url: self.urls.extend(url)
             # BadUrl class using classic parser
             class BadUrl(oldparser):
                     # Compare signature of known bad URL to a new web page
                     if self.text == bfsig: self.badurl = 1
                     elif self.text == bdsig: self.badurl = 1
+                    elif self.text == bqsig: self.badurl = 1
                 def finish_starttag(self, tag, attrs):
                     '''Extracts URL bearing tags'''
                     if tag in urltags:
         # Make resulting classes available class wide
         self._UrlExtract, self._BadUrl = UrlExtract, BadUrl
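
UrlExtract and BadUrl are SGML parsers (sgmlop-backed when available, plain sgmllib otherwise) whose finish_starttag hook harvests the attributes named in _urltags/_urlattrs. A cut-down sketch of the same idea using only the stock sgmllib parser and its unknown_starttag hook; LinkExtract and its tag/attribute tables are illustrative, not the module's:

    from sgmllib import SGMLParser

    class LinkExtract(SGMLParser):
        '''Collects URL-bearing attribute values from interesting tags'''
        urltags = {'a': 1, 'img': 1, 'link': 1, 'script': 1}
        urlattrs = {'href': 1, 'src': 1}
        def reset(self):
            SGMLParser.reset(self)
            self.urls = []
        def unknown_starttag(self, tag, attrs):
            if tag in self.urltags:
                # attrs is a list of (name, value) pairs, names lowercased
                self.urls.extend([v for k, v in attrs if k in self.urlattrs])

    parser = LinkExtract()
    parser.feed('<a href="/next.html">next</a><img src="logo.gif">')
    parser.close()
    print parser.urls    # ['/next.html', 'logo.gif']
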
 
-    def _makeasync(urllist):
-        for url in urllist:
-            if self._robot.can_fetch('*', url):
-                consumer = self._ExRedir(UrlExtract())
-                self._AsyncHttp(url, consumer)
-            else:
-                self._visited[url] = 1
-                continue
-        asyncore.loop()
-
-    def webtest2(base):
+    def _webtest2(self):
         '''Generates signatures for identifying bad URLs'''
         # Assignments
+        base = self.base
         BadUrl, BadRedir = self._BadUrl, self._BadRedir
         urljoin, AsyncHttp = self._uparse.urljoin, self._AsyncHttp
         stash = []
         jibber = ''.join([letters, digits])
         ru = ''.join([choice(jibber) for x in range(randint(1, 30))])
         # Builds signature of a bad URL for a file
-        AsyncHttp(urljoin(base, '%s.html' % ru), BadRedir(BadUrl()))
+        AsyncHttp(urljoin(base, '%s.html' % ru), BadRedir(BadUrl(), stash))
         # Builds signature of a bad URL for a directory
-        AsyncHttp(urljoin(base, '%s/' % ru), BadRedir(BadUrl()))
+        AsyncHttp(urljoin(base, '%s/' % ru), BadRedir(BadUrl(), stash))
         # Builds signature of a bad URL for a query
-        AsyncHttp(urljoin(base, '%s?%s' % (ru, ru)), BadRedir(BadUrl()))
+        AsyncHttp(urljoin(base, '%s?%s' % (ru, ru)), BadRedir(BadUrl(), stash))
         # Loop
-        asyncore.loop()
+        self._async.loop()
         # Store signatures
         self._bfsig.extend(stash[0])
         self._bdsig.extend(stash[1])
-        self._bqsig.extend(stash[2])        
+        self._bqsig.extend(stash[2])
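
_webtest2 appears to rely on servers answering a nonexistent file, directory, and query with the same custom error page rather than a bare 404; the parsed content of those three responses becomes the _bfsig/_bdsig/_bqsig signatures that BadUrl later compares new pages against. The probe URLs themselves are just random slugs joined onto the base, for example:

    from random import choice, randint
    from string import letters, digits
    from urlparse import urljoin

    base = 'http://example.com/site/index.html'    # illustrative base URL
    jibber = ''.join([letters, digits])
    ru = ''.join([choice(jibber) for x in range(randint(1, 30))])
    print urljoin(base, '%s.html' % ru)      # bad file probe
    print urljoin(base, '%s/' % ru)          # bad directory probe
    print urljoin(base, '%s?%s' % (ru, ru))  # bad query probe
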
 
     def _webtest(self):
         '''Generates signatures for identifying bad URLs'''
             url.close()
             return newurl, []
 
+    def _genverify2(self, urllist, base):
+        '''Yields verified URLs from a list of raw URLs
+
+        Arguments:
+        urllist -- list of raw URLs
+        base -- referring URL'''
+        # Assignments
+        visited, webopen, others = self._visited, self._webopen, self.others
+        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
+        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin        
+        outs, redirs, supported = self.outs, self.redirs, self._supported
+        redir, other, out = self._redir, self._other, self._out
+        cache, toprocess, robot = self._cache, [], self._robot
+        AsyncHttp, ExRedir = self._AsyncHttp, self._ExRedir
+        UrlExtract, stash = self._UrlExtract, {}
+        # Strip file off base URL for joining
+        newbase = base.replace(base.split('/')[-1], '')
+        # Handle any child URLs
+        for url in urllist:
+            if url not in visited:
+                # Remove whitespace from URL
+                if url.find(' ') != -1:
+                    visited[url], nurl = 1, url.replace(' ', '')
+                    if nurl in visited: continue
+                # Remove fragments i.e. 'http:foo/bar#frag'
+                if url.find('#') != -1:
+                    visited[url], nurl = 1, urldefrag(url)[0]
+                    if nurl in visited: continue
+                # Process full URLs i.e. 'http://foo/bar'
+                if url.find(':') != -1:
+                    urlseg = urlsplit(url)
+                    # Block non-FTP, HTTP URLs
+                    if urlseg[0] not in supported:
+                        visited[url] = 1
+                        if other: others[url] = 1
+                        continue
+                    # If URL is not in root domain, block it
+                    elif urlseg[1] not in sb:
+                        visited[url] = 1
+                        if out: outs[url] = base
+                        continue
+                    # Block duplicate root URLs
+                    elif not urlseg[2] and urlseg[1] == sb:
+                        visited[url] = 1
+                        continue
+                    nurl = url
+                # Handle relative URLs i.e. ../foo/bar
+                elif url.find(':') == -1:
+                    # Join root domain and relative URL
+                    visited[url], nurl = 1, urljoin(newbase, url)
+                    if nurl in visited: continue
+                toprocess.append((nurl, base, url))
+        for nurl, base, url in toprocess:
+            if robot.can_fetch('*', nurl):
+                consumer = ExRedir(UrlExtract(), stash)
+                AsyncHttp(nurl, consumer, base, url)
+            else: visited[nurl] = 1
+        self._async.loop()
+        print stash
+        for turl, nurl in stash:
+            if turl not in visited:
+                visited[nurl], visited[turl]  = 1, 1
+                # If URL resolved to a different URL, process it
+                if turl != nurl:
+                    urlseg = urlsplit(turl)
+                    # If URL is not in root domain, block it
+                    if urlseg[1] not in sb:
+                        # Log as a redirected internal URL
+                        if redir: redirs[(nurl, turl)] = base
+                        continue
+                    # Block duplicate root URLs
+                    elif not urlseg[2] and urlseg[1] == sb: continue
+                # If URL exceeds depth, don't process 
+                if len(turl.split('/')) >= depth: continue
+                # Otherwise return URL
+                else:
+                    newurls, rawurls = {}, stash.get((turl, nurl))
+                    if rawurls:
+                        for rawurl in rawurls:
+                            if rawurl not in visited: newurls[rawurl] = 1
+                        cache[turl] = newurls
+                    print len(visited)
+                    yield turl, base
+
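
The normalization steps in _genverify2 (drop fragments, resolve relative links against the base with its filename stripped, split out the scheme and host for the _supported and root-domain checks) all come straight from urlparse; a quick illustration of the calls involved, using an invented base URL:

    from urlparse import urlsplit, urldefrag, urljoin

    base = 'http://example.com/docs/index.html'        # illustrative
    newbase = base.replace(base.split('/')[-1], '')    # 'http://example.com/docs/'
    print urldefrag('http://example.com/docs/a.html#top')[0]  # fragment stripped
    print urljoin(newbase, '../img/logo.gif')   # relative URL resolved
    print urlsplit('ftp://example.com/pub/')[0] # scheme, tested against _supported
    print urlsplit('http://example.com')[2]     # empty path marks a duplicate root
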
     def _urlverify(self, url, base, newbase):
         '''Returns a full URL relative to a base URL
 
     def _onewalk(self):
         '''Yields good URLs from under a base URL'''
         # Assignments
-        cache, genverify = self._cache, self._genverify
+        cache, genverify2 = self._cache, self._genverify2
         # End processing if cache is empty
         while cache:
             # Fetch item from cache
             base, urls = cache.popitem()
             # If item has child URLs, process them and yield good URLs
             if urls: 
-                for url in genverify(urls, base): yield url
+                for url in genverify2(urls, base): yield url
 
     def _multiwalk(self, threads):
         '''Extracts good URLs from under a base URL
         if self.width and width == 200: width = self.width
         else: self.width = width
         # sgmlop crashes Python after too many iterations
-        if width > 5000: self._parserpick(1)
-        else: self._parserpick() 
+        #if width > 5000:
+        self._parserpick(1)
+        #else: self._parserpick() 
         # Use global base if present
         if not base: base = self.base
         # Verify URL and get child URLs
         robot.set_url(''.join([base, 'robots.txt']))
         robot.read()
         # Get signature of bad URL
-        self._webtest()
+        self._webtest2()
         # Get good URLs as long as total width isn't exceeded
         try:
             # Multiwalk if threaded