Commits

Faheem Mitha committed f886dea

Rename getfasta.py to getfasta. Change to downloading a single fasta file from EBI for all gene segments corresponding to an RSS dataset.

Comments (0)

Files changed (2)

+#!/usr/bin/python
+
+def wgetcmd(accnumlst, dataset):
+    """
+    Make a single wget request to download the fasta records for the
+    accession numbers in accnumlst, and append the result to
+    data/<dataset>.fasta.
+
+    accnumlst -- sequence of accession number strings. They are joined
+                 with "|" into one IMGTLIGM-ID query.
+    dataset   -- dataset name; used as the base name of the output file.
+
+    Nothing is requested when accnumlst is empty. Returns None. The exit
+    status of wget is not checked. IOError/OSError propagates if the
+    output file cannot be opened or wget cannot be started.
+    """
+    import subprocess
+    accnums = "|".join(accnumlst)
+    if not accnums:
+        # An empty ID list would still hit the server with a meaningless
+        # query, so skip the request entirely.
+        return
+    url = "http://srs.ebi.ac.uk/srsbin/cgi-bin/wgetz?[IMGTLIGM-ID:%s]+-view+FastaSeqs+-ascii" % accnums
+    # Pass an argument list instead of a shell string so that neither the
+    # accession numbers nor the dataset name are interpreted by a shell.
+    # Append mode plays the role of the shell's ">>": successive calls
+    # accumulate into the same file.
+    with open("data/%s.fasta" % dataset, "ab") as outfile:
+        subprocess.call(["wget", "-c", url, "-O", "-"], stdout=outfile)
+
+def getfasta(dataset, maxaccnums=30):
+    """
+    Download the fasta records for every accession number listed in the
+    'datafiles_info' entry of 'dataset' in the configuration.
+
+    The accession numbers are de-duplicated, sorted and split into
+    batches of at most 'maxaccnums'. One wgetcmd call is made per batch;
+    wgetcmd appends each download to data/<dataset>.fasta, so the batches
+    end up concatenated in a single file. NOTE: because output is
+    appended, an existing data/<dataset>.fasta is not truncated first.
+
+    dataset    -- key into the configuration returned by utils.get_conf().
+    maxaccnums -- batch size. The default of 30 is the maximum number of
+                  accession numbers that ebi accepts in a single request.
+    """
+    from corrmodel import utils
+    conf = utils.get_conf()
+    datafiles_info = conf[dataset]["datafiles_info"]
+    # The first field of each datafiles_info entry is the accession number.
+    accnumlst = sorted(set(d[0] for d in datafiles_info))
+    # Stepping through the start offsets yields only non-empty batches, so
+    # no request is made with an empty ID list when the count is an exact
+    # multiple of the batch size (or zero).
+    for start in range(0, len(accnumlst), maxaccnums):
+        wgetcmd(accnumlst[start:start + maxaccnums], dataset)
+
+if __name__ == '__main__':
+    import argparse
+    # Command line: a single positional argument naming the dataset whose
+    # fasta records should be downloaded.
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    cli.add_argument('dataset', help='name of dataset')
+    getfasta(cli.parse_args().dataset)

getfasta.py

-#!/usr/bin/python
-
-def downloadfasta(accnum):
-    """
-    Run curl to post a query for accession number 'accnum' to the IMGT
-    IMGTlect.jv CGI and save the response as <accnum>.fasta.tmp in the
-    current directory.
-
-    NOTE: the Session field (client address, ident, time) is hard-wired
-    in the command string. The exit status of curl is not checked.
-    """
-    import os
-    curlcmd = 'curl -v -d livret=9 -d SelElem="^Selection nb=~0~/>" -d Session="^LocalSession client=~59.183.58.138~ code=~0~ appliName=~IMGTlect~ ident=~1357235333356~ time=~875~/>" -d l01p01c01="Y" -d l08p01c02="V" -d l09p01c02=%(accnum)s -d l09p01c03="6 Sequence (FASTA format)" -o %(accnum)s.fasta.tmp http://www.imgt.org/cgi-bin/IMGTlect.jv'%{'accnum':accnum}
-    os.system(curlcmd)
-
-def cleanfasta(accnum):
-    """
-    Extract the fasta text from <accnum>.fasta.tmp into <accnum>.fasta
-    and sanity-check the result.
-
-    The lines strictly between a line containing "<pre>" and the next
-    line containing "</pre>" are copied to the output file. Every record
-    parsed from the output must have an id whose first "|"-separated
-    field equals accnum.
-
-    Returns the name of the cleaned fasta file.
-    Raises ValueError if the temporary file contains no "<pre>" marker,
-    or if a record id does not start with the accession number.
-    """
-    from Bio import SeqIO
-    print "accnum", accnum
-    print "type(accnum)", type(accnum)
-    tmpfastafile = accnum+".fasta.tmp"
-    fastafile = accnum+".fasta"
-    # Read the download once; 'with' closes the file even when we abort.
-    with open(tmpfastafile) as tff:
-        lines = tff.readlines()
-    if not any("<pre>" in line for line in lines):
-        raise ValueError("Bad accession number. Abort")
-    infasta = False
-    with open(fastafile, 'w') as ff:
-        for line in lines:
-            # Test order matters: the closing marker line is dropped, and
-            # the opening marker line itself is never written.
-            if "</pre>" in line:
-                infasta = False
-            if infasta:
-                ff.write(line)
-            if "<pre>" in line:
-                infasta = True
-    # run sanity check on fasta file
-    with open(fastafile, "rU") as handle:
-        # only one record is expected, but every record found is checked
-        for record in SeqIO.parse(handle, "fasta"):
-            if record.id.split("|")[0] != accnum:
-                raise ValueError("fasta file id is not accession number. Abort")
-    return fastafile
-
-def getfasta(accnum):
-    """
-    Download the raw page for accession number 'accnum', then clean it.
-    Returns whatever cleanfasta returns (the cleaned fasta file name).
-    """
-    downloadfasta(accnum)
-    fastafile = cleanfasta(accnum)
-    return fastafile
-
-if __name__ == '__main__':
-    import sys
-    from optparse import OptionParser
-    # Command line: exactly one positional argument, the accession number.
-    parser = OptionParser(usage="'%prog accession_number'")
-    args = parser.parse_args()[1]
-
-    if len(args) == 1:
-        getfasta(args[0])
-    else:
-        # Wrong argument count: explain, show usage, exit with failure.
-        print "incorrect number of arguments"
-        parser.print_help()
-        sys.exit(1)