Commits

Lynn Rees committed c74b4b2

more caching optimizations


Files changed (1)

         base -- base URL'''
         uparse, vlist, webopen = self._uparse, self._vlist, self._webopen
         sr, sb, weballow = self._sb, base.split('/'), self._weballow
-        cache = self._cache
+        cache, depth = self._cache, self._depth
         # Get root domain of 'base'
         try: tbase = base.replace(sb[-1], '')
         except ValueError: pass
                 if url not in vlist:
                     rurl = webopen((url, base))
                     if rurl and rurl[0] not in vlist:
+                        # Mark both the raw and the resolved URL visited
+                        vlist[url], vlist[rurl[0]] = 1, 1
                         if rurl[0] != url:
                             urlseg = uparse.urlsplit(rurl[0])
-                            if urlseg[1] not in sr[2]:
-                                vlist[rurl[0]] = 1
-                                continue
-                            elif not urlseg[2] and urlseg[1] == sr[2]:
-                                vlist[rurl[0]] = 1
-                                continue
-                        elif len(rurl) == 2:
+                            # Skip URLs outside the root domain and the
+                            # bare server root itself
+                            if urlseg[1] not in sr[2]: continue
+                            elif not urlseg[2] and urlseg[1] == sr[2]: continue
+                        # Block URLs exceeding the allowed depth
+                        if len(rurl[0].split('/')) >= depth: continue
+                        else:
                             cache[rurl[0]] = rurl[1]
                             yield rurl[0], base
-                        else: yield rurl, base
                     else:
                         vlist[url] = 1
                         continue
         vlist, fudict = self._vlist, self._fudict
         try:
             # If webspiders can access URL, open it
-            if not self._robot: url = self._ulib.urlopen(base[0])
-            elif self._robot.can_fetch('*', base[0]):
-                url = self._ulib.urlopen(base[0])
-            else: return False
+            if self._robot:
+                # A robots.txt parser is set: fetch only if allowed
+                if self._robot.can_fetch('*', base[0]):
+                    url = self._ulib.urlopen(base[0])
+                else: return False
+            else: url = self._ulib.urlopen(base[0])
         # If HTTP error, log bad URL and abort
         except IOError:
             vlist[base[0]] = 1
 
         base -- base URL searched for references to other URLs'''
         # Avoid outside namespace lookups
-        vlist, depth, cache = self._vlist, self._depth, self._cache
-        if base[0] in cache:
-            urls = cache.get(base[0])
-            del cache[base[0]]
-        else: urls = self._webopen(base)[1]
+        cache = self._cache
+        # _urlresolve cached this page's URL list before yielding it,
+        # so read the list back and evict the entry to bound memory use
+        urls = cache.get(base[0])
+        del cache[base[0]]
         if urls:
             # Yield each URL resolved against this base
-            for url in self._urlresolve(urls, base[0]):
-                # Block URLS exceeding allowed depth
-                if len(url[0].split('/')) >= depth: vlist[url[0]] = 1
-                # Add processed URL to final list of URLs
-                elif url[0] not in vlist:
-                    vlist[url[0]] = 1
-                    yield url
+            for url in self._urlresolve(urls, base[0]): yield url
         else: return
 
     def _webwalk(self, source):
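The point of the commit is the hand-off between _urlresolve, which now stores each page's parsed link list in self._cache just before yielding the page's URL, and _urlextract, which reads the entry back and deletes it. A minimal sketch of that one-shot cache pattern, using a plain dict and made-up names (remember and consume are not from the spider):

    cache = {}

    def remember(url, links):
        # producer side: store the links parsed from a page under its
        # resolved URL before handing that URL onward
        cache[url] = links

    def consume(url):
        # consumer side: each entry is read exactly once; evicting on
        # read keeps the cache from growing with visited pages
        return cache.pop(url, None)

    remember('http://example.com/', ['http://example.com/a'])
    print(consume('http://example.com/'))  # ['http://example.com/a']
    print(consume('http://example.com/'))  # None -- entry was evicted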
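The _webopen hunk reorders the robots.txt gate so the configured parser, if any, is consulted before the URL is opened. The same control flow, sketched with Python 3's urllib for illustration (polite_open is an invented name, not part of the spider):

    from urllib import request, robotparser

    def polite_open(url, robot=None):
        # with a parser configured, fetch only if '*' agents are
        # allowed; with no parser, open the URL directly, mirroring
        # the reordered check in _webopen above
        if robot:
            if robot.can_fetch('*', url):
                return request.urlopen(url)
            return False
        return request.urlopen(url)

    # usage (requires network access):
    # rp = robotparser.RobotFileParser('http://example.com/robots.txt')
    # rp.read()
    # page = polite_open('http://example.com/', rp)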
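The depth guard also moved from _urlextract into _urlresolve, and it measures depth as the number of '/'-separated segments in a URL, not its length: 'http://host/a/b' splits into five pieces, so it passes a depth limit of six. A tiny illustration (within_depth is a made-up helper):

    def within_depth(url, depth):
        # 'http://host/a/b'.split('/') gives
        # ['http:', '', 'host', 'a', 'b'] -- five segments
        return len(url.split('/')) < depth

    print(within_depth('http://host/a/b', 6))    # True: 5 segments
    print(within_depth('http://host/a/b/c', 6))  # False: 6 segments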