Commits

Davide Cittaro committed 2e1e0b6

better handling of CTX and ITX

  • Participants
  • Parent commits 4fb1611

Comments (0)

Files changed (1)

File breakdancer2vcf.py

       # We cannot report the appropriate inverted or inserted sequence, we use simple notation
       self._alt = "<%s>" % self._type
     else:
+      self._id = "bnd_%s" % ''.join(random.choice(string.ascii_uppercase + string.digits) for x in range(5))
+      self._mate_id = "bnd_%s" % ''.join(random.choice(string.ascii_uppercase + string.digits) for x in range(5))
+
       # ITX and CTX... good luck with this!  
       # determine the orientation	
       alt_pos = "%s:%d" % (self._chr2, self._pos2)
-      ref = genome.fetch(reference = self._chr1, start = self._pos1, end = self._pos1 + 1).upper()
-      if self._orientation1:
-        # reverse
-        if self._orientation2:
-          # double reverse... 
-          self._alt = "[%s[%s" % (ref, alt_pos)
-        else:
-          self._alt = "]%s]%s" % (ref, alt_pos)
+      self._mate_ref = genome.fetch(reference = self._chr2, start = self._pos2, end = self._pos2 + 1).upper()
+      mate_alt_pos = "%s:%d" % (self._chr1, self._pos1)
+      
+      # build info for the mate
+      if self._orientation1 != self._orientation2:
+        self._alt = "%s[%s[" % (self._ref, alt_pos)
+        self._mate_alt = "]%s]%s" % (self._mate_ref, mate_alt_pos)
       else:
-        # forward    
-        if self._orientation2:
-          self._alt = "%s[%s[" % (ref, alt_pos)
-        else:
-          # double forward
-          self._alt = "%s]%s]" % (ref, alt_pos)
+        self._alt = "%s]%s]" % (self._ref, alt_pos)  
+        self._mate_alt = "]%s]%s" % (self._mate_ref, mate_alt_pos)
 
  def chromosome(self):
     """Return the chromosome name stored in ``self._chr1``.

     NOTE(review): ``_chr1`` appears to be the chromosome of the first
     breakpoint of the event (``_chr2`` being the mate's) -- confirm
     against the constructor, which is not visible here.
     """
     return self._chr1
   def build_vcf_string(self, samples=None):
     sample_fields = '' 
     if self._type in ['CTX', 'ITX']:
-      self._id = "bnd_%s" % ''.join(random.choice(string.ascii_uppercase + string.digits) for x in range(5))
-      rev_id = "bnd_%s" % ''.join(random.choice(string.ascii_uppercase + string.digits) for x in range(5))
-      info_field = "DP=%d;AF:%.2f;SVTYPE=BND;BDTYPE=%s" % (self._num_reads, self._af, self._type)
+      info_field = "DP=%d;AF:%.2f;SVTYPE=BND;BDTYPE=%s;MATEID=%s" % (self._num_reads, self._af, self._type, self._mate_id)
     else:
       info_field = "DP=%d;AF=%.2f;SVTYPE=%s;SVLEN=%d;END=%d" % (self._num_reads, self._af, self._type, abs(self._size), self._pos2)
       
 
     # for ITX and CTX also "reversed" bp should be returned
     if self._type in ['CTX', 'ITX']:
-      rev_alt_pos = "%s:%d" % (self._chr1, self._pos1)
-      rev_ref = genome.fetch(reference = self._chr2, start = self._pos2, end = self._pos2 + 1).upper()
-      if self._orientation2:
-        # reverse
-        if self._orientation1:
-          # double reverse... 
-          rev_alt = "[%s[%s" % (rev_ref, rev_alt_pos)
-        else:
-          rev_alt = "]%s]%s" % (rev_ref, rev_alt_pos)
-      else:
-        # forward    
-        if self._orientation1:
-          rev_alt = "%s[%s[" % (rev_ref, rev_alt_pos)
-        else:
-          # double forward
-          rev_alt = "%s]%s]" % (rev_ref, rev_alt_pos)
-      rev_fields = "%s\t%d\t%s\t%s\t%s\t%d\t.\t%s" % (self._chr2, self._pos2, rev_id, rev_ref, rev_alt, self._score, info_field)    
+      info_field = "DP=%d;AF:%.2f;SVTYPE=BND;BDTYPE=%s;MATEID=%s" % (self._num_reads, self._af, self._type, self._id)
+      rev_fields = "%s\t%d\t%s\t%s\t%s\t%d\t.\t%s" % (self._chr2, self._pos2, self._mate_id, self._mate_ref, self._mate_alt, self._score, info_field)    
       if samples: 
         rev_fields += sample_fields #these are the same 
       # join
   options.output.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="SV type (BND, DEL, INS, INV)">\n')
   options.output.write('##INFO=<ID=BDTYPE,Number=1,Type=String,Description="Original SV type from BD for BND (CTX, ITX)">\n')
   options.output.write('##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">\n')
-#  options.output.write('##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file">\n')
+  options.output.write('##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of the mate breakpoint">\n')
   options.output.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
   options.output.write('##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise variant">\n')
   options.output.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')