1. Giovanni Marco Dall'Olio
  2. dbcline_mechanize

Commits

Giovanni Marco Dall'Olio  committed 01343c0

ADD: draft script to query dbcline

  • Participants
  • Branches default

Comments (0)

Files changed (1)

File query_dbcline.py

View file
  • Ignore whitespace
+#!/usr/bin/env python
+"""
+Draft script to query the dbCLINE web application with mechanize.
+
+For each requested gene the script submits the search form found on the
+dbCLINE main page and prints the SNP summary extracted from the result.
+"""
+import mechanize
+import cookielib
+import html2text
+import re
+import time
+
+# Main page of the dbCLINE web application; browse_dbcline() opens this URL
+# and submits the first form it finds there.
+dbcline_url = "http://genapps2.uchicago.edu:8081/dbcline/main.jsp"
+
+
+
+def initialize_browser():
+
+    br = mechanize.Browser()
+    # Cookie Jar
+    cj = cookielib.LWPCookieJar()
+    br.set_cookiejar(cj)
+
+    # Browser options
+    br.set_handle_equiv(True)
+    br.set_handle_gzip(True)
+    br.set_handle_redirect(True)
+    br.set_handle_referer(True)
+    br.set_handle_robots(False)
+
+    # Follows refresh 0 but not hangs on refresh > 0
+    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
+
+    # Want debugging messages?
+#    br.set_debug_http(True)
+#    br.set_debug_redirects(True)
+#    br.set_debug_responses(True)
+
+    # User-Agent (this is cheating, ok?)
+    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
+    br.addheaders.append(('email', 'giovanni.dallolio@upf.edu'))
+
+    return br
+
+
+def browse_dbcline(br, genes=['GCS1']):
+    r = br.open(dbcline_url)
+
+    for gene in genes:
+        br.select_form(nr=0)
+        br.form['landmark'] = gene
+        br.form['variable'] = ['-9']
+        br.submit()
+        parse_response(gene, br.response().read())
+        return br.response().read()
+        time.sleep(2)
+        br.back()
+
+
+# Matches the summary text "<n> SNPs found in dbCLINE in gene ... (...)."
+# followed by "<a>/<b> SNPs pass filters." and captures the "<a>/<b>" part.
+# NOTE(review): the surrounding newlines suggest the pattern targets a
+# particular page layout - confirm against a real response.
+nsnps_re = re.compile("\n\d+ SNPs found in dbCLINE in gene .* \(.*\).\n\n(\d+/\d+) SNPs pass filters.\n")
+# Matches one SNP table row: captures the text starting with "rs" inside the
+# "Link to HGDP Selection Browser" <abbr>, then, after a "Chr..." cell, the
+# contents of the four following right-aligned <td> cells.
+snp_re = re.compile('<abbr title="Link to HGDP Selection Browser">(rs.*)</abbr></a></td>\n\n\n<td>Chr.*</td>\n<td align="right">(.*)</td>\n<td align="right">(.*)</td>\n<td align="right">(.*)</td>\n<td align="right">(.*)</td>')
+def parse_response(genename, response):
+    nsnps = nsnps_re.findall(response)[0]
+    print 'snps for gene %s: %s' % (genename, nsnps)
+    if nsnps[0] != '0':
+        print snp_re.findall(response)
+
+
+
+def pretty_print_page(br):
+    print html2text.html2text(br.response().read())
+
+
+
+if __name__ == '__main__':
+    br = initialize_browser()
+    resp = browse_dbcline(br, genes = ['GCS1'])
+#    browse_dbcline(br, genes = ['GCS1', 'MGAT3', 'ALG12'])
+