Lynn Rees / psilib

Commits

Lynn Rees committed db8dcc1

- added single-threaded crawling (usage sketch below)

  • Parent commits 700da08
  • Branches default
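
This commit routes single-threaded crawling through the existing weburls() entry point: when no thread count t is passed, weburls() delegates to the new _singleweb() method (see the diff below). A minimal usage sketch, assuming the class in spider.py is named Spider and can be constructed without arguments (neither detail is shown in this commit):

    # Illustrative only -- the class name Spider and its no-argument
    # constructor are assumptions, not part of this commit.
    from spider import Spider

    crawler = Spider()
    # With t left as None, weburls() takes the new single-threaded path.
    urls = crawler.weburls(b='http://example.com/', w=200, d=5)
    for url in urls:
        print url    # Python 2 print statement, matching the diff's era

Passing a thread count t instead would route the call to _webthreads(), per the unchanged weburls() body shown at the bottom of the diff.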


Files changed (1)

File spider.py
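
The added methods implement a cache-driven crawl: _singleweb() verifies the base URL and seeds self._cache with its raw links, _webwalk() pops pending pages off that cache, and _urlresolve() cleans, filters, and verifies each link, pushing every newly verified page's own links back into the cache before yielding it. A simplified, standalone sketch of that pattern follows; fetch_links() is a hypothetical stand-in for the module's _webopen()/_urlresolve() plumbing, and the redirect, off-site, depth, and scheme bookkeeping is omitted:

    # Simplified sketch of the cache-driven walk used by _webwalk/_urlresolve.
    # fetch_links(url, base) is assumed to return (resolved_url, child_urls)
    # on success or None when the URL cannot be opened; base is the referring
    # page, as in the real _webopen call.
    def webwalk(cache, visited, fetch_links):
        while cache:                        # stop once no pages are left to expand
            base, urls = cache.popitem()    # take any pending page and its raw links
            for url in urls:
                if url in visited: continue
                visited[url] = 1
                result = fetch_links(url, base)   # verify the link actually opens
                if not result: continue
                resolved, children = result
                visited[resolved] = 1
                # Queue the page's unvisited children for a later pass
                new = dict((c, 1) for c in children if c not in visited)
                if new: cache[resolved] = new
                yield resolved              # hand back one verified URL at a time

Because the cache and visited structures are plain dictionaries, the real _urlresolve() can record extra per-URL state (redirects, off-site links, non-HTTP schemes) in sibling dicts without changing this loop.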

             url.close()
             return cbase, []
 
+    def _urlresolve(self, urls, base):
+        '''Yields full URLs resolved relative to a base URL
+
+        Arguments:
+        urls -- list of raw URLs
+        base -- referring URL'''
+        # Assignments
+        cache, visited, webopen = self._cache, self._visited, self._webopen
+        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
+        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
+        outside, redirs, unhttp = self.outside, self.redirs, self.unhttp
+        supported = self._supported
+        # Strip file off base URL for joining
+        tbase = base.replace(base.split('/')[-1], '') 
+        for url in urls:
+            if url not in visited:
+                # Remove whitespace from URL
+                if url.find(' ') != -1:
+                    visited[url], url = 1, url.replace(' ', '')
+                    if url in visited: continue
+                # Remove fragments i.e. 'http:foo/bar#frag'
+                if url.find('#') != -1:
+                    visited[url], url = 1, urldefrag(url)[0]
+                    if url in visited: continue
+                # Process full URLs i.e. 'http://foo/bar'
+                if url.find(':') != -1:
+                    urlseg = urlsplit(url)
+                    # Block URLs whose scheme is neither FTP nor HTTP
+                    if urlseg[0] not in supported:
+                        # Log as non-FTP/HTTP URL
+                        unhttp[url], visited[url] = 1, 1
+                        continue
+                    # If URL is not in root domain, block it
+                    if urlseg[1] not in sb:
+                        visited[url], outside[url] = 1, 1                        
+                        continue
+                    # Block duplicate root URLs
+                    elif not urlseg[2] and urlseg[1] == sb:
+                        visited[url] = 1
+                        continue
+                # Handle relative URLs i.e. ../foo/bar
+                elif url.find(':') == -1:
+                    # Join root domain and relative URL
+                    visited[url], url = 1, urljoin(tbase, url)
+                    if url in visited: continue
+                # Test URL by attempting to open it
+                rurl = webopen((url, base))
+                if rurl and rurl[0] not in visited:
+                    # Unpack the resolved URL and its raw child URLs
+                    turl, rawurls = rurl
+                    visited[url], visited[turl] = 1, 1
+                    # If URL resolved to a different URL, process it
+                    if turl != url:
+                        urlseg = urlsplit(turl)
+                        # If URL is not in root domain, block it
+                        if urlseg[1] not in sb:
+                            # Log as a redirected internal URL
+                            redirs[(url, turl)] = 1
+                            continue
+                        # Block duplicate root URLs
+                        elif not urlseg[2] and urlseg[1] == sb: continue
+                    # If URL exceeds depth, don't process 
+                    if len(turl.split('/')) >= depth: continue
+                    # Otherwise put in cache and yield url
+                    else:
+                        newurls = {}
+                        # Eliminate duplicates
+                        for rawurl in rawurls:
+                            # Eliminate known visited URLs
+                            if rawurl not in visited: newurls[rawurl] = 1
+                        # Put new urls in cache if present
+                        if newurls: cache[turl] = newurls
+                        yield turl
+
+    def _webwalk(self):
+        '''Yields good URLs from under a base URL'''
+        # Assignments
+        cache, urlresolve = self._cache, self._urlresolve
+        # End processing if cache is empty
+        while cache:
+            # Fetch item from cache
+            base, urls = cache.popitem()
+            # If item has child URLs, process them and yield good URLs
+            if urls:
+                for url in urlresolve(urls, base): yield url
+
+    def _singleweb(self, base=None, width=200, depth=5):
+        '''Returns a list of web URLs.
+        
+        Arguments:
+        base -- base web URL (default: None)
+        width -- maximum number of URLs to crawl (default: 200)
+        depth -- depth in hierarchy to crawl (default: 5)'''
+        # Assignments
+        self._visited, self._good, self._cache, self.badurls = {}, {}, {}, []
+        self.redirs, self.outside, self.badhtml, self.unhttp = {}, {}, {}, {}
+        webwalk, good, self._robot = self._webwalk, self._good, self._rparser()
+        uparse, robot = self._uparse, self._robot
+        # sgmlop crashes Python after too many iterations
+        if width > 5000: self._classpicker(1)
+        else: self._classpicker() 
+        # Use global base if present
+        if not base: base = self.base
+        # Verify URL
+        tbase = self._webopen((base, ''))
+        # If URL is good...
+        if tbase:
+            # Change base URL if different
+            if tbase[0] != base: base = tbase[0]            
+            # Ensure there's a trailing '/' in base URL
+            if base[-1] != '/':
+                url = list(uparse.urlsplit(base))
+                url[1] = ''.join([url[1], '/'])
+                base = uparse.urlunsplit(url)
+            # Put in cache, visited list
+            self._cache[base], self._visited[base] = tbase[1], 1
+            # Store base URL and its split form, and mark the base as verified
+            self.base, self._sb, good[base] = base, base.split('/'), 1
+        # If URL is bad, abort and raise error
+        else:
+            raise IOError, "URL is invalid"
+        # Assign width
+        if self.width and width == 200: width = self.width
+        # Adjust depth by the length of the base URL
+        if self.depth and depth == 5: self.depth += len(self._sb)
+        else: self.depth = depth + len(self._sb)
+        # Get robot limits
+        robot.set_url(''.join([base, 'robots.txt']))
+        robot.read()
+        # Get signature of bad URL
+        self._webtest()
+        # Get good URLs as long as total width isn't exceeded
+        try:
+            for item in webwalk():
+                if len(good) <= width: good[item] = 1
+                elif len(good) >= width: break
+        # If user interrupts crawl, return what's done
+        except KeyboardInterrupt: pass
+        # Get URLs, sort them, and return list
+        self.urls = good.keys()
+        self.urls.sort()
+        return self.urls                        
+
     def weburls(self, b=None, w=200, d=5, t=None):
         '''Returns a list of web paths.
         
         if t: self._webthreads(b, w, d, t)
         else: self._singleweb(b, w, d)
         return self.urls
-
-    def _singleweb(self, b=None, w=200, d=5, t=None):
-        '''Returns a list of web paths.
-        
-        Arguments:
-        b -- base web URL (default: None)
-        w -- amount of resources to crawl (default: 200)
-        d -- depth in hierarchy to crawl (default: 5)
-        t -- number of threads to spawn'''    
-        pass
     
     def webpaths(self, b=None, w=200, d=5, t=None):
         '''Returns a list of web paths.