Commits

Anonymous committed fc1d09f

Refactor to avoid recursion and have more sensible methods.

Also add CML files.

  • Parent commits 253908b
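
Before this change, add_urls() recursed from each feed down to its prev-archive link; the refactor flattens that into a plain loop over the feed archive. A minimal sketch of the pattern, where the feed URL, the limit and the per-entry handling are placeholders for BASEURL, MAXFILE and the CrystalGet parsing done in the real script:

    import feedparser

    def get_prev_archive(fparse):
        # The feed's 'prev-archive' link points at the next-older feed, if any
        for l in fparse.feed.links:
            if l.get('rel') == 'prev-archive':
                return l['href']

    feedurl = "http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed.xml"
    urls = []
    while feedurl and len(urls) < 100:
        fparse = feedparser.parse(feedurl)
        for entry in fparse.entries:
            # the real script hands entry.link to CrystalGet.parse_page()
            # and collects the cif/CML urls it returns via get_urls()
            urls.append(entry.link)
        feedurl = get_prev_archive(fparse)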

Files changed (1)

File crystalget.py

 
 # Where to start searching
 BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed.xml"
+BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed-6549.xml"
+BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed-6335.xml"
+BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed-5237.xml"
 # The maximum number of cifs to download.
-MAXCIF=200
+MAXFILE=1000000
 # Download directory - set to something other than blank to download the files
 # to a directory other than where the script is run from.
-DOWNLOAD_DIRECTORY=""
+DOWNLOAD_DIRECTORY="/Users/jmht/Documents/quixote/repositories/crystalget/files"
 
 class CrystalGet(HTMLParser):
     """Parse a CrystalEye page to extract the url of the cached cif file.
     def __init__(self):
         self.recording = 0
         self.tmpfile = None
-        self.ciffile = None
+        self.urlList = []
+        self.baseUrl = None
         HTMLParser.__init__(self)
 
-    def resetVars(self):
+    def reset_vars(self, pageUrl):
         self.recording = 0
         self.tmpfile = None
-        self.ciffile = None
+        self.urlList = []
+        self.baseUrl = os.path.split(pageUrl)[0]
+        
+    def get_urls(self):
+        return self.urlList
 
-    def cifUrl(self, url):
-        self.resetVars()
-        req = urlopen(url)
+    def parse_page(self, url):
+        #print "parse_page ",url
+        self.reset_vars(url)
+        try:
+            req = urlopen(url)
+        except Exception, e:
+            # Need to encode to deal with possible dodgy characters
+            print "Error accessing url: %s\n%s" % (url.encode('ascii','replace'),e)
+            return
+        # Calls handle_starttag, _data etc.
         self.feed(req.read())
-        if self.ciffile:
-            base=os.path.split(url)[0]
-            self.ciffile=os.path.join(base,self.ciffile)
-            return self.ciffile
-
+            
+    def full_url(self,shortUrl):
+        return os.path.join(self.baseUrl,shortUrl)
+        
     def handle_starttag(self, tag, attrs):
         """Set recording whenever we encounter a tag so handle_data can process it"""
         if tag == 'a' and attrs:
                     self.recording = 1
 
     def handle_data(self, data):
-        """We use the anchor enclosing the cached tag as the sign we've hit the cif url"""
-        if self.recording and data == "cached":
-            self.ciffile = os.path.basename(self.tmpfile)
+        """We use the anchor enclosing the cached tag as the sign we've hit the file url"""
+        if self.recording and ( data == "cached" or data == "Raw CML" or data == "Complete CML"):
+            newurl = self.full_url(self.tmpfile)
+            #print "Adding url: ",newurl
+            self.urlList.append(newurl)
 
     def handle_endtag(self, tag):
         if self.recording:
             self.recording = 0
 
+# End CrystalGet
+
+# List of urls for the files
+urls = []
+# next feedurl to process
+feedurl=BASEURL
+
+# Log file recording all the file urls we find
+file_list = open("fileList.txt", 'w')
+
+
+def log_files(urls):
+    global file_list
+    for f in urls:
+        file_list.write(f+"\n")
+
+def parse_entry(entry):
+    global urls
+    try:
+        # Need to instantiate here each time - otherwise random parsing errors
+        cg = CrystalGet()
+        #urls.append(cg.parse_page(entry.link))
+        cg.parse_page(entry.link)
+        myFiles = cg.get_urls()
+        #print "got files ",myFiles
+        if myFiles:
+            log_files(myFiles)
+            urls += myFiles
+    except HTMLParseError, e:
+        print("Parse error for file: %s\n%s") % (entry.link,e)
+
+def parse_feed(feed_url):
+    fparse = feedparser.parse(feed_url)
+    for entry in fparse.entries:
+        parse_entry(entry)
+    return fparse
 
 
 def get_prev_archive(fparse):
     for l in fparse.feed.links:
         if (l.has_key('rel') and l['rel'] == 'prev-archive'):
             return l['href']
+        
 
-def add_urls(urls,feedurl):
-    """"Recurse down the feeds using the CrystalGet class to extract the urls from the page."""
-    print "parsing ",feedurl
-    fparse = feedparser.parse(feedurl)
-    for entry in fparse.entries:
-        if (len(urls) < MAXCIF):
-            try:
-                # Need to instantiate here each time - otherwise random parsing errors
-                cg = CrystalGet()
-                urls.append(cg.cifUrl(entry.link))
-            except HTMLParseError, e:
-                print("Parse error for file: %s\n%s") % (l.link,e)
-    
-    feedurl=None
-    # Get next feed
-    if ( len(urls) < MAXCIF ):
-        feedurl=get_prev_archive(fparse)
-        if feedurl:
-            urls,feedurl = add_urls(urls,feedurl)
-        
-    return (urls,feedurl)
+# Loop through feeds
+while (feedurl and len(urls) < MAXFILE):
+    print "Parsing feed: ",feedurl
+    fparse = parse_feed(feedurl)
+    feedurl=get_prev_archive(fparse)
 
-# List of urls for the files
-urls = []
-# next feedurl to process
-feedurl=BASEURL
+# Finished parsing so close file
+file_list.close()
 
-# Get the list of urls for the cif files
-urls,feedurl = add_urls(urls,feedurl)
+# Comment out the sys.exit() below to download the files, otherwise we just get a list of urls
+sys.exit()
 
 # Change to the download directory if one was set
 if (os.access(DOWNLOAD_DIRECTORY, os.F_OK)):