Commits

Anonymous committed d092163

Updated to deal with 'moiety' entries where the cif file is not directly
linked to on the summary page.

  • Participants
  • Parent commits fc1d09f

Comments (0)

Files changed (1)

File crystalget.py

 
 # Where to start searching
 BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed.xml"
-BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed-6549.xml"
-BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed-6335.xml"
-BASEURL="http://wwmm.ch.cam.ac.uk/crystaleye/feed/all/feed-5237.xml"
+#BASEURL="file:////Users/jmht/Documents/quixote/repositories/crystalget/feed-5237.xml"
 # The maximum number of cifs to download.
 MAXFILE=1000000
 # Download directory - set to something other than blank to download the files
 DOWNLOAD_DIRECTORY="/Users/jmht/Documents/quixote/repositories/crystalget/files"
 
 class CrystalGet(HTMLParser):
-    """Parse a CrystalEye page to extract the url of the cached cif file.
+    """Parse a CrystalEye page to extract the url of the cif and cml files.
        Edit the handle_data method if you want to extract one of the other files.
     """
 
 
 # End CrystalGet
 
+class MoietyGet(HTMLParser):
+    """Parse a directory listing to extract the url of a cif file.
+
+       Feed the listing's HTML to the parser, then call get_cifFile() to
+       retrieve the href of the last anchor seen whose target ends in ".cif"
+       (None if no such anchor was encountered).
+    """
+    # href of the most recently seen <a> tag ending in ".cif"; None until one is found
+    cifFile = None
+        
+    def handle_starttag(self, tag, attrs):
+        """Record the href of any <a> tag that points at a .cif file.
+
+           Each matching link overwrites the previous one, so only the last
+           .cif href in the document is kept.
+        """
+        if tag == 'a' and attrs:
+            for name, value in attrs:
+                if name == "href" and str(value).endswith(".cif"):
+                    self.cifFile = str(value)
+    
+    def get_cifFile(self):
+        """Return the recorded .cif href, or None if none was found."""
+        return self.cifFile
+
+# End MoietyGet
+
+
+
 # List of urls for the files
 urls = []
 # next feedurl to process
     global file_list
     for f in urls:
         file_list.write(f+"\n")
+        
+def get_moiety_cif(pageUrl):
+    dir = os.path.split(pageUrl)[0]
+    m = MoietyGet()
+    try:
+        req = urlopen(dir)
+    except Exception, e:
+        # Need to encode to deal with possible dodgy characters
+        print "Error accessing moiety url: %s\n%s" % (dir.encode('ascii','replace'),e)
+        return
+    # Calls handle_starttag, _data etc.
+    m.feed(req.read())
+    return os.path.join(dir,m.get_cifFile())
 
 def parse_entry(entry):
     global urls
         # Need to instantiate here each time - otherwise random parsing errors
         cg = CrystalGet()
         #urls.append(cg.parse_page(entry.link))
+        print "Parsing page: ",entry.link
         cg.parse_page(entry.link)
         myFiles = cg.get_urls()
         #print "got files ",myFiles
         if myFiles:
+            # For files with moieties, the CIF is held in the containing directory
+            # and there is no link on the webpage
+            if len(myFiles) == 2:
+                cifUrl = get_moiety_cif(entry.link)
+                if cifUrl:
+                    myFiles.append(cifUrl)
+                else:
+                    print "FAILED TO GET ADDITION CIF FOR: ",entry.link
             log_files(myFiles)
             urls += myFiles
     except HTMLParseError, e: