Commits

beroe committed 94fe24b Merge

Merge remote-tracking branch 'origin/master'

Comments (0)

Files changed (1)

molecular/gb2qiime.py

Binary file added.
+#!/usr/bin/env python
+
+__author__ = "Mike McCann"
+__copyright__ = "Copyright 2014, MBARI"
+__license__ = "GPL"
+__maintainer__ = "Mike McCann"
+__email__ = "mccann at mbari.org"
+__status__ = "Development"
+__doc__ = '''
+
+Script to get sequence data from genbank or other server and convert to the input
+files that the Qiime toolkit can read.
+
+Mike McCann
+MBARI 11 April 2014
+
+@var __date__: Date of last svn commit
+@undocumented: __doc__ parser
+@author: __author__
+@status: __status__
+@license: __license__
+'''
+
+import os
+import sys
+import csv
+from collections import defaultdict
+
+
+# From: Shannon Johnson <sjohnson@mbari.org>
+# Subject: NGS database build
+# Date: April 7, 2014 3:29:51 PM PDT
+# To: Mike McCann <mccann@mbari.org>, Kevin Gomes <kgomes@mbari.org>
+# 
+# hi guys,
+# 
+# ok in english, exactly what i want (please please with sugar on top)..
+# 
+# on tempbox (in Mikes box) are three files... the first is the GB_COI.fasta datafile i downloaded from GenBank. 
+# I need that file to become like the two 28S data files. The first is a FASTA file, the second is the taxonomy file. 
+# In a perfect world, the script? would query genbank (https://www.ncbi.nlm.nih.gov/) or bold (http://www.boldsystems.org), 
+# grab the data and reformat it for qiime. realistically, i can go download the data i want, then run the script to 
+# reformat it to work with qiime. 
+# 
+# ok- the most relevant websites you need are: 
+# 
+# qiime.org
+# 
+# taxonomy files: 
+# ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
+# 
+# retraining the RDP taxonomy classifier in Qiime (query the correct taxonomy database)
+# http://qiime.org/tutorials/retraining_rdp.html
+# 
+# 
+# ahhh anything else?? 
+# 
+# thank you!!!
+
+# From: Shannon Johnson <sjohnson@mbari.org>
+# Subject: Re: NGS database build
+# Date: April 17, 2014 10:17:15 AM PDT
+# To: Mike McCann <mccann@mbari.org>
+# 
+# ok we are almost there... when i execute it in QIIME i get this error..
+# 
+# MacQIIME alviniconcha:SIMZ-COI_rawdata $ assign_taxonomy.py -i otus/rep_set/COI_seqs_rep_set.fasta -t COI_Database/GB_COI_taxonomy.txt -r COI_Database/GB_COI_sequence.fasta -o otus/RDP_assigned_taxonomy/ -m rdp
+# Traceback (most recent call last):
+# File "/macqiime/QIIME/bin/assign_taxonomy.py", line 349, in <module>
+#     main()
+# File "/macqiime/QIIME/bin/assign_taxonomy.py", line 327, in main
+#     log_path=log_path)
+# File "/macqiime/lib/python2.7/site-packages/qiime/assign_taxonomy.py", line 425, in __call__
+#     taxonomy_file, training_seqs_file = self._generate_training_files()
+# File "/macqiime/lib/python2.7/site-packages/qiime/assign_taxonomy.py", line 466, in _generate_training_files
+#     seq_id, lineage_str = map(strip, line.split('\t'))
+# ValueError: need more than 1 value to unpack
+# MacQIIME alviniconcha:SIMZ-COI_rawdata $ 
+# 
+# 
+# 
+# what i think it is looking for is the actual taxonomic assignments.. kingdom,phylum,superfamily,blah blah blah.. in our COI datafile we 
+# only have the species (which is what we are actually after) but it wants more... we can also get rid of the mitochondrial COI gene for 
+# cytochrome c oxidase I, partial cds, haplotype: COI_Bma_14 bits... i usually do that with reg expressions...
+# 
+# ok one easier option might be to use the download from bold.. i will put it in your temp box. it actually has the taxonomy linked 
+# into the sequence file (along with a bunch of other crap..) but it might be easier to parse into the two files that qiime wants...
+# 
+# its called iBOL_phase_4.75_COI.tsv
+# 
+# also, i will put the taxonomy 'dump' file in temp box too- incase the bold database requires more work than it is worth to re-write the script!
+# 
+# its called the gi_taxid_nucl.dump.gz
+# 
+# we are almost there! i am sooooo excited!!!
+# 
+# :))))
+
+
+class QiimeData(object):
+
+    taxHash = defaultdict(lambda:0)
+
+    def convertFASTA(self):
+        '''
+        Produce files that Qiime can use.  Start with a file looking like:
+
+$ head -23 GB_COI.fasta 
+>gi|377806846|gb|JN021270.1| Escarpia sp. 4 JMP-2012 COI (COI) gene, partial cds; mitochondrial
+TTGGAATCTGAACAGGACTAGTAGCCACGAGAATGAGACTCCTAATTCGAGCTGAGCTTGGACAACCTGG
+AACTCTTCTAGGAGACGATCAAATTTATAATTGCCTTATTACCGCTCATGGTCTATTAATGATATTTTTT
+GTAGTCCTACCTATTTTAATAGGAGGATTTGGAAATTGACTAGTTCCCTTAATACTAGGAGCTCCAGACA
+TGGCTTTTCCCCGGATTAATAATCTTGGGTTCTGACTTATTCCCCCCGCAGTAATTCTCCTAGTAATATC
+CGCTTTTATCGAAAAAGGGGCTGGAACAGGATGAACTGTCTACCCTCCTTTAGCCTCTAATATTGCCCAT
+GCAGGGCCATGCATTGATTTAGCTATTTTTGCCCTTCATTTATCCGGAGTATCCTCAATTCTAGCCTCTA
+TCAACTTTATTACAACTGTAATAAATATACGATATAAAGGTCTTCGACTAGAACGAGTTCCTTTATTTGT
+ATGAAGAGTAAAACTAACTGCAGTTCTTCTTCTTCTCTCAATTCCAGTTCTTGCCGGTGGACTTACTATA
+CTTCTCACCGATCGAAATTTAAATACGTCCTTCTTTGACCCCGCAGGAGGAGGGGACCCAGTTC
+
+>gi|377806844|gb|JN021269.1| Escarpia sp. 3 JMP-2012 COI (COI) gene, partial cds; mitochondrial
+GGTCAACAAATCATAAAGATATTGGAACCTTATACTTTCTAGTTGGAATCTGAACAGGACTAGTAGCCAC
+GAGAATGAGACTCCTAATTCGAGCTGAGCTTGGACAACCTGGAACTCTTCTAGGAGACGATCAAATTTAT
+AATTGCCTTATTACCGCTCATGGCCTATTAATGATATTTTTTGTAGTCCTACCTATTTTAATAGGAGGAT
+TTGGAAATTGACTAGTTCCCTTAATACTAGGAGCTCCAGACATGGCTTTTCCCCGGATTAATAATCTTGG
+GTTCTGACTTATTCCCCCCGCAGTAATTCTCCTAGTAATATCCGCTTTTATCGAAAAAGGGGCTGGAACA
+GGATGAACTGTCTACCCTCCTTTAGCCTCTAATATTGCCCATGCAGGGCCATGCATTGATTTAGCTATTT
+TTGCCCTTCATTTATCCGGAGTATCCTCAATTCTAGCCTCTATCAACTTTATTACAACTGTAATAAATAT
+ACGATATAAAGGTCTTCGACTAGAACGAGTTCCTCTATTTGTATGAAGAGTAAAACTAACTGCAGTTCTT
+CTTCTTCTCTCAATTCCAGTTCTTGCCGGTGGACTTACTATACTTCTCACCGATCGAAATTTAAATACGT
+CCTTCTTTGACCCCGCAGGAGGAGGGGACCCAGTTCTATATCAACACTTATTCTGATTTTTTGGTCACCC
+
+        Output files that look like:
+
+$ head -4 LSURef_115_sequence.fasta
+>FJ805841.1.4128
+GGTCAAGCGAAATACAGCTGACGGTGGATACCTAGACACGCAGAGGCGAAGAAGGACGTGGCGACCGACGAAAAGCTTCGGGGAGCTGGCAGCAAGCACTGAGCCGAAGATATCCGAATGGGGCAACCCCCTTGTACAGCCTGTTGAATCCATAGACAGGTATGAGCCAACCCAGCCAACTGAAACATCTTAGTAGCTGGAGGAAAAGAAATCAAACGAGATTCTCCCAGTAGTGGTGAGCGAAAGGAGAACAGCCTAAACCAGAGTACAAGTATTCTGGGGTAGTGGGACAGCGAGATGGAAGCACTAGACTAGACGAAACGATTGACAAATCGTACCAAAGAGGGTGATAGTCCCGTAGTCGAAAGTTGAAGTGCTACTAGCTGAATCCCGAGTAGCTAGGGACACGGGAAATCCCTAGTGAATCAGCCGAGACCACTCGGTAAGGCTAAATACTACTGCGTGAGCGATAGAGCAACAGTACCGCGAGGGAAAGGTGAAAAGAACCCCGATAAGGGGAGTGAAATAGAACATGAAACCGTCAGTTGACAAGCAGTGGAAGGATGATTCAACGTCCGACCGCGTGCCTGTTGAAGAATGAGCCGGCGAGTTATAGTCACTGGTATGGCGAATAGCCACAGCGAAAGCGAGTCTGAAGTGGGCGAATTATCAGTGTATATAGACCCGAACCTGGGTGATCTAACCATGGCCAGGATGAAGCTTGGGTAACACCAAGTGGAGGTCCGCACCGACCGATGTTGAAAAATCGGCGGATGAGCTGTGGTTAGGGGTGAAATGCCAATCGAACCCAGAGCTAGCTGGTTCTCCCCGAAATGTGTTGAGGCGCAGCGGTAAGTGATAATTGCTGGGGGTAAAGCACTGTTTCGGTGCGGGCGGCCTCAAGCTGTACCAAATCGAGACAAACTCAGAATACCAGCAACAATCTTACTAGTGAGACGGTGGGGGATAAGCTTCATCGTCAAGAGGGAAACAGCCCAGACCGCCAGCTAAGGTCCCCAAATCAAAGTTAAGTGGCAAAGGAGGTGGGAGTGCACAGACAACCAGGAGGTTTGCCTAGAAGCAGCCATCCTTGAAAGAGTGCGTAATAGCTCACTGGTCAAGCGCTCCTGCGCCGAAAATGAACGGGACTAAACTTTGTACCGAAGCTGCGGACTTGTAGCGATACAAGTGGTAGGGGAGCGTTCTGCGATAGGGTGAAGCACTAGCGGCAAGCAGGTGTGGACGAAGCAGAAGTGAGAATGTCGGCTTGAGTAGCGCAAATATTGGTGAGAATCCAATACCCCGAAACCCTAAGGGTTCCTCCACAAGGCTCGTCCGTGGAGGGTTAGTCAGGACCTAAGGCGAGGCCGAACGGCGTAGTCGATGGACAACGGGCGAACAATCCCGTACTGAAGATTAGTTGTGCAGAGGGACGGAGAAGGCACGCACCAGCCAGATGTTGGTTACTGGCGCAATTAACCGAGGCATCGAGAAGCGGCGAAAACGCTTCGAGCTGGGGTTAAGAGACCGAGGGTCTACGGACCCGAAGTGGTAGCGGTCAAGCTTCCAAGAAAAGCTCTAAACACGTTAACTAATCGTCACCTGTACCCAAAACCGACACAGGTAGGGAGGTAGAGAATACTAAGGGGCGCGAGATAACTCTCTCTAAGGAACTCGGCAACATGGCCCCGTAACTTCGGAAGAAGGGGTGCCTACGCAAGTAGGTCGCAGTGAAGAGATCCAGGCGACTGTTTACCAAAAACACAGGTCTCCGCAAAGTCAATAGACGAGGTATGGGGGCTGACGCCTGCCCAGTGCCGGAAGGTTAAGGAAGTCGGTCAGGGCCTTGAGCCTAAAGCTGGCGACCGAAGCCCCGGTGAACGGCGGCCGTAACTATAACGGTCCTAAGGTAGCGAAATTCCTTGTCGGGTAAGTTCCGACCCGCACGAAAGGCGTAACGATCTGGATGGTGTCTCAGAGAGAGACTCGGCGAAATAGGAATGTCTGTGAAGATACGGACTACCTGCACCTGGACAGAAAGACCCTATGAAGCTTTACTGTAGCCTGGAATGGTGTTCGGGCTTTGCTTGCGCAGAATAGGTGGGAGGCAATGATTCACTCCTTGTGGGGAGTGAGGAGCCGCAATGTGAGATACCACTCTGGCGAGGCTAGAATTCTAACCCGCGACCGTTATCCGGTCGGGGAACAGTTTCAGGTGGGCAGTTTGACTGGGGCGGTCGCCTCCTAAAGTGTAACGGAGGCGCGCAAAGGTTCCCTCAGGCTGGTTGGAAATCAGCCGCAGAGTGTAAAGGCAAAAGGGAGCTTGACTGCAAGAGTGACAACTCGCGCAGGGTGGAAACACGGCCTTAGTGATCCGACGGTAGTGAGTGGAAGCGCCGTCGCTCAACGGATAAAAGTTACTCTAGGGATAACAGGCTGATCTCCCCCAAGAGTCCACATCGACGGGGAGGTTTGGCACCTCGATGTCGGCTCATCGCAACCTGGGG
+>AJ506156.139292.142107
+TTCAAACGAGGAAAGGCTTACGGTGGATACCTAGGCACCCAGAGACGAGGAAGGGCGTAGCAAGCGACGAAATGCTTCGGGGAGTTGAAAATAAGCATAGATCCGGAGATTCCCGAATGGGTCAACCTTTCGAACTGCTGCTGAATCCATGGGCAGGCAAGAGACAACCTGGCGAACTGAAACATCTTAGTAGCCAGAGGAAAAGAAAGCAAAAGCGATTCCCGTAGTAGCGGCGAGCGAAATGGGAGCAGCCTAAACCGTGAAAACGGGGTTGTGGGAGAGCAATACAAGCGTCGTGCTGCTAGGCGAAGCGGTTGAGTACTGCACCCTAGATGGCGAGAGTCCAGTAACCGAAAGCATCACTGGCTTACGCTCTGACCCGAGTAGCATGGGGCACGTGGAATCCCGTGTGAATCAGCAAGGACCACCTTGTAAGGCTAAATACTCCTGGGTGACCGATAGCGAAGTAGTACCGTGAGGGAAAGGTGAAAAGAACCCCCATCGGGGAGTGAAATAGAACATGAAACCGTGAGCTCCCAAGCAGTGGGAGGAGAATTTGATCTCTGACCGCGTGCCTGTTGAAGAATGAGCCGGCGACTCATAGGCAGTGGCTTGGTTAAGGGAACCCACCGGAGCCGTAGCGAAAGCGAGTCTTCATAGGGCAATTGTCACTGCTTATGGACCCGAACCTGGGTGATCTATCCATGACCAGGATGAAGCTTGGGTGAAACTAAGTGGAGGTCCGAACCGACTGATGTTGAAGAATCAGCGGATGAGTTGTGGTTAGGGGTGAAATGCCACTCGAACCCAGAGCTAGCTGGTTCTCCCCGAAATGCGTTGAGGCGCAGCAGTTGACTGGACATCTAGGGGTAAAGCACTGTTTCGGTGCGGGCCGCGAGAGCGGTACCAAATCGAGGCAAACTCTGAATACTAGATATGACCCCAAAATAACAGGGGTAAAGGTCGGCCAGTGAGACGATGGGGGATAAGCTTCATCGTCGAGAGGGAAACAGCCCGGATCACCAGCTAAGGCCCCTAAATGACCGCTCAGTGATAAAGGAGGTAGGGGTGCAGAGACAGCCAGGAGGTTTGCCTAGAAGCAGCCACCCTTGAAAGAGTGCGTAATAGCTCACTGATCGAGCGCTCTTGCGCCGAAGATGAACGGGGCTAAGCGATCTGCCGAAGCTGTGGGATGTAAAAATGCATCGGTAGGGGAGCGTTCCGCCTTAGAGTGAAGCACCCGCGCGAGCAGGTGTGGACGAAGCGGAAGCGAGAATGTCGGCTTGAGTAACGCAAACATTGGTGAGAATCCAATGCCCCGAAAACCTAAGGGTTCCTCCGCAAGGTTCGTCCACGGAGGGTGAGTCAGGGCCTAAGATCAGGCCGAAAGGCGTAGTCGATGGACAACAGGTGAATATTCCTGTACTACCCCTTGTTGGTCCCGAGGGACGGAGGAGGCTAGGTTAGCCGAAAGGTGGTTTATCGGTTCAAGGACGCAAGGTGACCTTGCTTTTTTTTTCAGGGTAAGAAGGGGTAGAGGAAATGCCTCGAGCCAATGTCCGAGTACCAGGCGCTACGGCGCTGAAGTAACCCATGCCATACTCCCAGGAAAAGCTCGAACGACCTTCAACAAAAGGGTACCTGTACCCGAAACCAACACAGGTGGGTAGGTAGAGAATACCTAGGGGCACGAGACAACTCTCTCTAAGGAACTCGGCAAAATAGCCCCGTAACTTCGGGAGAAGGGGTGCCTCCTCACAAAGGGGGTCGCAGTGACCAGGCCCGGGCGACTGTTTACCAAAAACACAGGTCTCCGCAAAGTCGTAAGACCATGTATGGGGGCTGACGCCTGCCCAGTGCCGGAAGGTCAAGGAAGTTGGTGACCTGATGACAGGGGAGCCGGCGACCGAAGCCCCGGTGAACGGCGGCCGTAACTATAACGGTCCTAAGGTAGCGAAATTCCTTGTCGGGTAAGTTCCGACCCGCACGAAAGGCGTAACGATCTGGGCACTGTCTCGGAGAGAGGCTCGGTGAAATAGACATGTCTGTGAAGATGCGGACTACCTGCACCTGGACAGAAAGACCCTATGAAGCTTTACTGTTCCCTGGGATTGGCTTTGGGCCTTTCCTGCGCAGCTTAGGTGGAGGGCGAAGAAGGCCTCCTTCCGGGGGGGCCCGAGCCATCAGTGAGATACCACTCTGGAAGAGCTAGAATTCTAACCTTGTGTCAGGACCTACGGGCCAAGGGACAGTCTCAGGTAGACAGTTTCTATGGGGCGTAGGCCTCCCAAAAGGTAACGGAGGCGTGCAAAGGTTTCCTCGGGCCGGACGGAGATTGGCCCTCGAGTGCAAAGGCAGAAGGGAGCTTGACTGCAAGACCCACCCGTCGAGCAGGGACGAAAGTCGGCCTTAGTGATCCGACGGTGCCGAGTGGAAGGGCCGTCGCTCAACGGATAAAAGTTACTCTAGGGATAACAGGCTGATCTTCCCCAAGAGTTCACATCGACGGGAAGGTTTGGCACCTCGATGTCGGCTCTTCGCCACCTGGGGCGGTAGTACGTTCCAAGGGTTGGGCTGTTCGCCCATTAAAGCGGTACGTGAGCTGGGTTCAGAACGTCGTGAGACAGTTCGGTCCATATCCGGTGTGGGCGTTAGAGCATTGAGAGGACCTTTCCCTAGTACGAGAGGACCGGGAAGGACGCACCTCTGGTGTACCAGTTATCGTGCCCACGGTAAACGCTGGGTAGCCAAGTGCGGAGCGGATAACTGCTGAAAGCATCTAAGTAGTAAGCCCACCCCAAGATGAGTGCTCTCCT
+
+        and
+
+$ head LSURef_115_taxonomy_euk_new1.txt
+AY752988.1.3060 Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Ancyromonadida
+GU001166.2428.6009      Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Ancyromonadida
+GU001167.971.4531       Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Apusomonadida
+GU001168.1648.5734      Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Apusomonadida
+GU001170.833.4150       Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Apusomonadida
+DQ980467.1.3250 Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Apusomonadida
+AY752987.1.3065 Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Apusomonadida
+ADVD01001051.6294.9716  Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;Apusomonadida
+GU001155.1.3156 Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;o_Apusozoa
+GU001157.1.2802 Bikonta;Apusozoa;u_Apusozoa;c_Apusozoa;l_Apusozoa;o_Apusozoa
+        '''    
+
+        sequenceFH = open(self.args.Qsequence, 'w')
+        taxonomyFH = open(self.args.Qtaxonomy, 'w')
+
+        # Read each line from input file from GenBank. Parse header and write the correct portions to the output files.
+        seq = ''
+        count = 0
+        lastTaxonomy = ''
+        for line in open(self.args.input):
+            if line[0] == '>':
+                # Write to the output data assembled from the last sequence, if there is taxonomy
+                if lastTaxonomy:
+                    if seq:
+                        sequenceFH.write('>' + accession + '\n')
+                        sequenceFH.write(seq.replace('\n', '') + '\n')
+                    taxonomyFH.write('%s\t%s\n' % (accession, lastTaxonomy))
+
+                # From Wikipedia: GenBank: gi|gi-number|gb|accession|locus
+                gi, gi_number, gb, accession, locus = line.split('|')
+
+                # Take first 3 items of locus and apply fixTaxonomy() for just the species
+                lastTaxonomy = self.fixTaxonomy('_'.join(locus.strip().split(' ')[:3]), numItems=1)
+                seq = ''
+
+                count += 1
+                if self.args.verbosity > 0:
+                    if count % 100000 == 0:
+                        print 'Sequences converted: %s' % count
+            else:
+                seq += line
+
+        # Write last sequence
+        if lastTaxonomy and seq:
+            sequenceFH.write('>' + accession + '\n')
+            sequenceFH.write(seq.replace('\n', '') + '\n')
+            taxonomyFH.write('%s\t%s\n' % (accession, lastTaxonomy))
+
+        sequenceFH.close()
+        taxonomyFH.close()
+
+    def convertBOLD(self):
+        '''
+        Produce files that Qiime can use from a BOLD file looking like:
+
+$ head -3 iBOL_phase_4.75_COI.tsv
+processid       sampleid        museumid        fieldid bin_guid        bin_name        vouchertype     inst_reg        phylum_reg      class_reg       order_reg   family_reg      subfamily_reg   genus_reg       species_reg     taxonomist_reg  collectors      collectiondate  lifestage       lat     lon     sitsector   region  province_reg    country_reg     fundingsrc      seqentryid      seqdataid       marker_code     nucraw  aminoraw        seq_update      total_trace_count   high_trace_count        accession
+TREMA2883-10.COI-5P     Tb.IN..L.BUR.10-0184.2  slides 814 1151 (45.20 45.26)   Tb.IN..L.BUR.10-0184.2  BOLD:AAM7874    BIN127874               "Concordia University, Montreal"    Platyhelminthes Cestoda Tetrabothriidea                         Tetrabothriidea sp. BOLD:AAM7874                        2010-03-07 49.317   -123.1                                  Canada  iBOL:NA- Tetrabothriidea        1540292 3667036 COI-5P  CGTATCGGTATGATTTATACTATAATTGGTATATGGTCTGGGTTCATAGGTCTAAGTTTTAGTATGTTGATTCGTATAAATTTCTTGGAACCTTACTATAATTTAATTTCTTTAGACTGCTATAAATTCCTTATCACTAATCATGGTATTATTATGATTTTCTTTTTTCTTATGCCTATACTAATAGGTGGTTTTGGTAAATATCTTGTTCCTTTATTAAGAGGGTTGTCTGACTTGAGTCTACCTCGTATAAATGCGTTAAGCGCTTGATTATTAATACCTTCAATCTTATTTTTGACAATAAGCATGGTGATGGGCGCTGGTGTTGGTTGAACTTTTTACCCTCCTTTGTCTTCATCATTGTTTACAGGAAGTAAGGGAGTGGACTTTCTTTTATTTTCTTTACATTTGGCTGGTGTTTCTAGTATACTTAGATCAATTAAATTTATATGTACTCTTTATACTATATTTGGTAGAAATGTTTCTTCTCGTAGTTCTATAGTTTTATGATCATATTTGTTTACTTCTATGTTATTATTATTAACACTTCCTGTTCTGGCTGCTGCCATTACTATGTTGTTATTT       RIGMIYTIIGIWSGFIGLSFSMLIRINFLEPYYNLISLDCYNFLITNHGIIMIFFFLMPILIGGFGNYLVPLLSGLSDLSLPRINALSAWLLIPSILFLTISMVMGAGVGWTFYPPLSSSLFTGSKGVDFLLFSLHLAGVSSILSSINFICTLYTIFGSNVSSRSSIVLWSYLFTSMLLLLTLPVLAAAITMLLF 2014-02-05      2       2
+FLUKE2186-12.COI-5P     C.IN.Ss.LEZ.1.1 slide 1161 1162 (59.87) C.IN.Ss.LEZ.1.1 BOLD:ACK9811    BIN629811               Biodiversity Institute of Ontario  Platyhelminthes  Cestoda Trypanorhyncha                          Trypanorhyncha sp. BOLD:ACK9811         Maria Gregori i Casamayor       2010-11-24      A  Spain    iBOL:NA- Trypanorhyncha 2839092 4959447 COI-5P  CGTGTTGGAATTTTTTATACTCTTGTGGGGGTTTGGTCCGGTTTTGTAGGATTAAGATTTAGGATAATGATACGTATAAATTTCATCGAGCCTTATTTTAATGTTATATCTTCTGATTGCTATAAATTTTTAATAACTTATCACGGCATTATAATGATATTTTTTTTTTTGATGCCAGTTTTAGTGGGTGGATTTGGAAAATATTTAATACCCTTGCTGTGTGGGCTATCTGATTTAAACCTGCCGCGACTTAAAGCTTTAAGGGCTTGGCTTTTAATACCGTCGGTATTATTTTTATTAATAAGCATTTTTTTAGGTGCCGGGATAGGGTGAACACTTTACCCGCCTCTTTCGTCTTCTTTATTTGCTAGAAGTAAGGGAGTTGATTTTTTGATGTTTTCTTTGCATTTGGCGGGGGTTTCAAGGTTGTTAGGTTCTATAAATTTTATATGCACTGTTTACGGGTCTTTTAAAAAAAGCACAGTTTCACGCGGTTCTATAATTTTGTGGGCTTATGTATTTACATCGGTCCTATTATTAATTTCATTGCCCGTTTTGGCTGCAGCCATAACTATGCTTTTATTC       RVGIFYTLVGVWSGFVGLSFSIMIRINFIEPYFNVISSDCYNFLITYHGIIMIFFFLMPVLVGGFGNYLIPLLCGLSDLNLPRLNALSAWLLIPSVLFLLISIFLGAGIGWTLYPPLSSSLFASSKGVDFLMFSLHLAGVSSLLGSINFICTVYGSFNNSTVSRGSIILWAYVFTSVLLLISLPVLAAAITMLLF 2014-02-05      2       1
+
+        '''
+
+        sequenceFH = open(self.args.Qsequence, 'w')
+        taxonomyFH = open(self.args.Qtaxonomy, 'w')
+
+        # Read BOLD .tsv file into dictionary for each record and write appropriate things to Qiime files
+        count = 0
+        reader = csv.DictReader(open(self.args.input), delimiter="\t")
+        for r in reader:
+            taxonomy = '%s;%s;%s;%s;%s;%s\n' % ( r['phylum_reg'] if r['phylum_reg'] else 'p__', 
+                                                 r['class_reg'] if r['class_reg'] else 'c__', 
+                                                 r['order_reg'] if r['order_reg'] else 'o__', 
+                                                 r['family_reg'] if r['family_reg'] else 'f__', 
+                                                 r['genus_reg'] if r['genus_reg'] else 'g__', 
+                                                 r['species_reg'].strip().split(' ')[0] if r['species_reg'] else 's__')
+            taxonomy = self.fixTaxonomy(taxonomy)
+            if taxonomy:
+                taxonomyFH.write('%s\t%s\n' % (r['bin_guid'], taxonomy))
+                sequenceFH.write('>' + r['bin_guid'] + '\n')
+                sequenceFH.write(r['nucraw'] + '\n')
+
+                count += 1
+                if self.args.verbosity > 0:
+                    if count % 10000 == 0:
+                        print 'Sequences converted: %s' % count
+
+        sequenceFH.close()
+        taxonomyFH.close()
+
+    def fixTaxonomy(self, taxonomy, numItems=6):
+        '''
+        Given a semicolon separated white space filled string of classes make sure that there are exactly 6 items.
+        If necessary prepend the list with 'Dummy__' items to pad to 6 items.
+        '''
+        # Replace white space and assemble just the last numItems into a ; separated string
+        taxonomy = ';'.join(taxonomy.replace(' ', '').replace('\n', '').split(';')[-numItems:])
+
+        # Pad begining of string to make sure there are numItems items
+        items = taxonomy.split(';')
+        if len(items) < numItems:
+            if self.args.verbosity > 1:
+                print "Padding out taxonomy as only %s items in original" % len(items)
+            taxonomy = 'Dummy__;' * (numItems - len(items)) + taxonomy
+
+        # Detect duplicates and take action according to arguments specified
+        if self.args.duplicates == 'remove' or self.args.duplicates == 'modify':
+            self.taxHash[taxonomy] += 1
+            if self.args.duplicates == 'remove' and self.taxHash[taxonomy] > 1:
+                taxonomy = ''
+                if self.args.verbosity > 2:
+                    print "Removing duplicate taxonomy: " + taxonomy
+            if self.args.duplicates == 'modify' and self.taxHash[taxonomy] > 1:
+                taxonomy = '%s-%s' % (taxonomy, self.taxHash[taxonomy])
+                if self.args.verbosity > 2:
+                    print "Modifying duplicate taxonomy: " + taxonomy
+
+        return taxonomy
+            
+    def convertGenBank(self):
+        '''
+        Produce files that Qiime can use from a GenBank file looking like:
+
+medusa:Shan's data mccann$ head -64 sequence.gb
+LOCUS       JN021270                 624 bp    DNA     linear   INV 22-JUL-2013
+DEFINITION  Escarpia sp. 4 JMP-2012 COI (COI) gene, partial cds; mitochondrial.
+ACCESSION   JN021270
+VERSION     JN021270.1  GI:377806846
+KEYWORDS    .
+SOURCE      mitochondrion Escarpia sp. 4 JMP-2012
+  ORGANISM  Escarpia sp. 4 JMP-2012
+            Eukaryota; Metazoa; Lophotrochozoa; Annelida; Polychaeta; Palpata;
+            Canalipalpata; Sabellida; Siboglinidae; Escarpia.
+REFERENCE   1  (bases 1 to 624)
+  AUTHORS   Raggi,L., Schubotz,F., Hinrichs,K.U., Dubilier,N. and Petersen,J.M.
+  TITLE     Bacterial symbionts of Bathymodiolus mussels and Escarpia tubeworms
+            from Chapopote, an asphalt seep in the southern Gulf of Mexico
+  JOURNAL   Environ. Microbiol. 15 (7), 1969-1987 (2013)
+   PUBMED   23279012
+REFERENCE   2  (bases 1 to 624)
+  AUTHORS   Raggi,L., Schubotz,F., Hinrichs,K.-U., Petersen,J.M., Felden,J. and
+            Dubilier,N.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (25-MAY-2011) Symbiosis group - Molecular Ecology, Max
+            Planck Institute for Marine Microbiology, Celsiusstr. 1, Bremen,
+            Bremen 28359, Germany
+FEATURES             Location/Qualifiers
+     source          1..624
+                     /organism="Escarpia sp. 4 JMP-2012"
+                     /organelle="mitochondrion"
+                     /mol_type="genomic DNA"
+                     /isolation_source="tubeworm from an asphaltic cold seep at
+                     2915m depth; Chapopote, Gulf of Mexico"
+                     /specimen_voucher="MPI-MM GoM_Chapopote,_Meteor_M67/2_83
+                     Tubeworm 83-4"
+                     /db_xref="taxon:1134767"
+                     /clone="GoM_Chap_COI_TbW83-4"
+                     /lat_lon="21.54 N 93.26 W"
+                     /collection_date="14-Apr-2006"
+                     /collected_by="A. Boetius/J. Felden"
+     gene            <1..>624
+                     /gene="COI"
+     CDS             <1..>624
+                     /gene="COI"
+                     /note="mitochondrial cytochrome oxidase I"
+                     /codon_start=3
+                     /transl_table=5
+                     /product="COI"
+                     /protein_id="AFB76520.1"
+                     /db_xref="GI:377806847"
+                     /translation="GIWTGLVATSMSLLIRAELGQPGTLLGDDQIYNCLITAHGLLMM
+                     FFVVLPILMGGFGNWLVPLMLGAPDMAFPRINNLGFWLIPPAVILLVMSAFIEKGAGT
+                     GWTVYPPLASNIAHAGPCIDLAIFALHLSGVSSILASINFITTVMNMRYKGLRLERVP
+                     LFVWSVKLTAVLLLLSIPVLAGGLTMLLTDRNLNTSFFDPAGGGDPV"
+ORIGIN      
+        1 ttggaatctg aacaggacta gtagccacga gaatgagact cctaattcga gctgagcttg
+       61 gacaacctgg aactcttcta ggagacgatc aaatttataa ttgccttatt accgctcatg
+      121 gtctattaat gatatttttt gtagtcctac ctattttaat aggaggattt ggaaattgac
+      181 tagttccctt aatactagga gctccagaca tggcttttcc ccggattaat aatcttgggt
+      241 tctgacttat tccccccgca gtaattctcc tagtaatatc cgcttttatc gaaaaagggg
+      301 ctggaacagg atgaactgtc taccctcctt tagcctctaa tattgcccat gcagggccat
+      361 gcattgattt agctattttt gcccttcatt tatccggagt atcctcaatt ctagcctcta
+      421 tcaactttat tacaactgta ataaatatac gatataaagg tcttcgacta gaacgagttc
+      481 ctttatttgt atgaagagta aaactaactg cagttcttct tcttctctca attccagttc
+      541 ttgccggtgg acttactata cttctcaccg atcgaaattt aaatacgtcc ttctttgacc
+      601 ccgcaggagg aggggaccca gttc
+//
+
+        '''
+
+        sequenceFH = open(self.args.Qsequence, 'w')
+        taxonomyFH = open(self.args.Qtaxonomy, 'w')
+
+        # Read each line from input file from GenBank. Parse header and write the correct portions to the output files.
+        seq = ''
+        taxonomy = ''
+        count = 0
+        originFlag = False
+        taxonomyFlag = False
+        for line in open(self.args.input):
+            if line.startswith('ACCESSION'):
+                accession = line.split()[-1].strip()
+
+            if line.lstrip().startswith('ORGANISM'):
+                # Read species line and lines until line does not begin with a space
+                species = '_'.join(line.strip().split(' ')[3:])
+                taxonomyFlag = True
+            if taxonomyFlag:
+                if line.startswith(' '):
+                    if not line.lstrip().startswith('ORGANISM'):
+                        taxonomy += line
+                else:
+                    taxonomyFlag = False
+                    taxonomy = taxonomy.strip()
+                    if taxonomy[-1] == '.':
+                        taxonomy = taxonomy[:-1]
+                    taxonomy += ';' + species
+
+            if line.startswith('ORIGIN'):
+                originFlag = True
+            if originFlag and not line.startswith('ORIGIN'):
+                if line.startswith(' '):
+                    seq += ''.join(line.lstrip().split()[1:])
+                else:
+                    originFlag = False
+            if line.strip() == '//' and seq:
+                taxonomy = self.fixTaxonomy(taxonomy)
+                if seq and taxonomy:
+                    sequenceFH.write('>%s\n%s\n' % (accession, seq.upper()))
+                    taxonomyFH.write('%s\t%s\n' % (accession, taxonomy))
+                    count += 1
+                    if self.args.verbosity > 0:
+                        if count % 10000 == 0:
+                            print 'Sequences converted: %s' % count
+                    seq = ''
+                taxonomy = ''
+
+        sequenceFH.close()
+        taxonomyFH.close()
+
+
+    def process_command_line(self):
+        import argparse
+        from argparse import RawTextHelpFormatter
+
+        examples = 'Examples:' + '\n\n'
+        examples += sys.argv[0] + ' --format FASTA -i GB_COI.fasta --Qsequence GB_COI_sequence.fasta --Qtaxonomy GB_COI_taxonomy.txt\n'
+        examples += sys.argv[0] + ' -i iBOL_phase_4.75_COI.tsv -s iBOL_phase_4.75_COI_sequence.fasta -t iBOL_phase_4.75_COI_taxonomy.txt -v 1\n'
+        examples += sys.argv[0] + ' --format GenBank -i sequence.gb -s sequence.fasta -t sequence.txt -d modify -v 2\n'
+        examples += sys.argv[0] + ' --format GenBank -i sequence.gb -s sequence.fasta -t sequence.txt -d modify -v 3\n'
+        examples += '\n'
+
+        parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,
+                                         description='Convert genetic sequence file to files that Qiime can import',
+                                         epilog=examples)
+
+        parser.add_argument('-i', '--input', action='store', help='Input file name', required=True)
+        parser.add_argument('--format', nargs='?', choices=['FASTA', 'GenBank', 'BOLD'], help='Format of input file, default=BOLD', default='BOLD')
+        parser.add_argument('-s', '--Qsequence', action='store', help='Output fasta sequence file name for Qiime', required=True)
+        parser.add_argument('-t', '--Qtaxonomy', action='store', help='Output taxonomy file name for Qiime', required=True)
+        parser.add_argument('-d', '--duplicates', nargs='?', choices=['remove', 'modify'], action='store', help='What to do with duplicate taxonomy')
+        parser.add_argument('-v', '--verbosity', nargs='?', choices=[1,2,3], type=int, help='Turn on verbose output. Higher number = more output.', const=1)
+
+        self.args = parser.parse_args()
+
+        # Construct a command line string for use in provenance metadata
+        self.commandline = ""
+        for item in sys.argv:
+            if item == '':
+                # Preserve empty string specifications in the command line
+                self.commandline += "''" + ' '
+            else:
+                self.commandline += item + ' '
+
+
+if __name__ == '__main__':
+
+    qd = QiimeData()
+    qd.process_command_line()
+
+    if qd.args.format == 'FASTA':
+        qd.convertFASTA()
+    elif qd.args.format == 'GenBank':
+        qd.convertGenBank()
+    else:
+        qd.convertBOLD()
+
+