Commits

jmht  committed bb0573f

Initial commit of the crystalget.py script.

  • Participants

Comments (0)

Files changed (1)

File crystalget.py

+"""
+Script to scrape cif files from the CrystalEye web site:
+http://wwmm.ch.cam.ac.uk/crystaleye/
+
+The script uses the "all" atom feed:
+http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed.xml
+
+It requires the feedparser package, which is available from:
+http://code.google.com/p/feedparser/
+You don't need to install it, just copy the feedparser.py file into the same
+directory as this script and run the script with:
+python ./crystalget.py
+
+Copyright (c) 2010, Jens Thomas
+This code is released under a Creative Commons Attribution 3.0 Unported License.
+"""
+
+import sys
+import os
+import feedparser
+
+from HTMLParser import HTMLParser,HTMLParseError
+from urllib2 import urlopen
+
# Where to start searching: the CrystalEye "all" atom feed.
BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed.xml"
# The maximum number of cifs to download.
MAXCIF=200
# Download directory - set to something other than blank to download the files
# to a directory other than where the script is run from.
DOWNLOAD_DIRECTORY=""
+
class CrystalGet(HTMLParser):
    """Parse a CrystalEye page to extract the url of the cached cif file.

    Edit the handle_data method if you want to extract one of the other files.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # All per-page parsing state is set up in one place so __init__ and
        # resetVars cannot drift apart.
        self.resetVars()

    def resetVars(self):
        """Clear the per-page parsing state so the instance can be reused."""
        self.recording = 0   # 1 while inside an <a href=...> anchor
        self.tmpfile = None  # href of the anchor currently being parsed
        self.ciffile = None  # cif file name/url once the "cached" link is found

    def cifUrl(self, url):
        """Fetch the page at url and return the absolute url of its cached
        cif file, or None if no "cached" link was found on the page."""
        self.resetVars()
        req = urlopen(url)
        try:
            self.feed(req.read())
        finally:
            # Always release the connection, even if parsing raises.
            req.close()
        if self.ciffile:
            # Build the cif url with plain string operations: os.path.join
            # would insert backslashes on Windows and corrupt the url.
            base = url.rsplit('/', 1)[0]
            self.ciffile = base + '/' + self.ciffile
            return self.ciffile

    def handle_starttag(self, tag, attrs):
        """Remember the href of each anchor so handle_data can decide whether
        it is the "cached" cif link."""
        if tag == 'a' and attrs:
            for name, value in attrs:
                if name == "href":
                    self.tmpfile = value
                    self.recording = 1

    def handle_data(self, data):
        """The anchor whose text is "cached" points at the cif file."""
        if self.recording and data == "cached":
            # Keep only the file name; use '/' splitting, not os.path, so the
            # result is the same on every platform.
            self.ciffile = self.tmpfile.rsplit('/', 1)[-1]

    def handle_endtag(self, tag):
        if self.recording:
            self.recording = 0
+
+
def add_urls(urls, feedurl):
    """Recurse down the feeds using the CrystalGet class to extract the cif
    urls from each entry page.

    urls    - list the cif urls are appended to (modified in place).
    feedurl - url of the atom feed to parse.

    Returns a (urls, feedurl) tuple, where feedurl is the last feed url
    followed, or None when no further feed was available.
    """
    print("parsing %s" % feedurl)
    d = feedparser.parse(feedurl)
    for entry in d.entries:
        if len(urls) >= MAXCIF:
            # We have enough cifs - no point parsing the remaining entries.
            break
        try:
            # Need to instantiate here each time - otherwise random parsing errors
            cg = CrystalGet()
            cifurl = cg.cifUrl(entry.link)
        except HTMLParseError as e:
            print("Parse error for file: %s\n%s" % (entry.link, e))
            continue
        # cifUrl returns None when no "cached" link was found on the page;
        # appending None would crash the download loop later on.
        if cifurl:
            urls.append(cifurl)

    feedurl = None
    # The third feed link is assumed to point at the next (older) feed in the
    # chain - TODO confirm against the CrystalEye feed structure.
    if len(urls) < MAXCIF and len(d.feed.links) >= 3:
        feedurl = d.feed.links[2]['href']
        urls, feedurl = add_urls(urls, feedurl)

    return (urls, feedurl)
+
# List of urls for the cif files
urls = []
# next feedurl to process
feedurl = BASEURL

# Get the list of urls for the cif files
urls, feedurl = add_urls(urls, feedurl)

# Change to the download directory if one was set
if os.access(DOWNLOAD_DIRECTORY, os.F_OK):
    print("Downloading files to directory: %s" % DOWNLOAD_DIRECTORY)
    os.chdir(DOWNLOAD_DIRECTORY)

# Download the files
for url in urls:
    # Guard against None/empty entries so one bad page cannot abort the
    # whole download run.
    if not url:
        continue
    # Derive the file name with '/' splitting, not os.path.split, so the
    # result is correct on every platform.
    fname = url.rsplit('/', 1)[1]
    print("downloading: %s" % fname)
    u = urlopen(url)
    try:
        data = u.read()
    finally:
        u.close()
    # Write in binary mode so the bytes land on disk exactly as downloaded
    # (text mode would translate line endings on Windows).
    f = open(fname, 'wb')
    try:
        f.write(data)
    finally:
        f.close()