Commits

Davide Cittaro  committed 748adae

simple pileup converter

  • Participants
  • Parent commits c2a5adf

Comments (0)

Files changed (1)

File pileup2vcf.py

+import sys
+import argparse
+import pysam
+
+
+def argsort(seq):
+  return sorted(range(len(seq)), key=seq.__getitem__)
+
+def plp2ac(pfields):
+  t = pfields.split()
+  coverage = float(t[3])
+  pstring = t[4]
+  n_ins = pstring.count('+')
+  n_del = pstring.count('-')
+  n_ref = pstring.count(',') + pstring.count('.')
+  n_nt = {'A':0,'C':0,'G':0,'T':0,'N':0,'a':0,'c':0,'g':0,'t':0,'n':0}
+  if n_ins > 0:
+    return (n_ref, n_ins)
+  if n_del > 0:
+    return (n_ref, n_del)
+  for nt in 'ACGTNacgtn':  
+    n_nt[nt] = pstring.count(nt)
+  return (n_ref, max(n_nt.values()))
+
+def plp2af(pfields):
+  t = pfields.split()
+  coverage = float(t[3])
+  pstring = t[4]
+  n_ins = pstring.count('+')
+  n_del = pstring.count('-')
+  n_nt = {'A':0,'C':0,'G':0,'T':0,'N':0,'a':0,'c':0,'g':0,'t':0,'n':0}
+  if n_ins > 0:
+    return n_ins / coverage
+  if n_del > 0:
+    return n_del / coverage
+  for nt in 'ACGTNacgtn':  
+    n_nt[nt] = pstring.count(nt)
+  return max(n_nt.values()) / coverage
+
+def get_n_samples(plp):
+  return (len(plp.split()) -  3) / 3
+
+def print_header(n_samples):
+  sys.stdout.write("##fileformat=VCFv4.1\n")
+  # inser more INFO
+  sys.stdout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
+  sys.stdout.write('##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allele depth">\n')
+  sys.stdout.write('##FORMAT=<ID=DP,Number=2,Type=Integer,Description="Sample read depth">\n')
+  h_samples = '\t'.join(["SAMPLE%03d" % x for x in range(n_samples)])
+  sys.stdout.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % h_samples)
+
+
+def process_sample(sample_data):
+  dp = int(sample_data[0])
+  pstring = sample_data[1]
+  a_count = {}
+  ref_count = 0
+  
+  indel_pstring = pstring.upper().replace('.','').replace(',','').replace('$','').replace(']','').replace('^', '').replace('>', '').replace('<','')
+  for x in range(10):
+    indel_pstring = indel_pstring.replace(str(x), '')
+  
+  
+  n_ins = pstring.count('+')
+  n_del = pstring.count('-')
+  
+  if n_ins:
+    indels =   ["+" + x for x in indel_pstring.split('+') if len(x) > 0]
+    for x in indels: 
+      a_count[x] = indels.count(x)  
+  if n_del:  
+    indels = ["-" + x for x in indel_pstring.split('-') if len(x) > 0]
+    for x in indels:
+      a_count[x] = indels.count(x)  
+  
+  if n_ins == 0 and n_del == 0:
+    for nt in 'ACGTacgt':  
+      cnt = pstring.count(nt)
+      if cnt > 0:
+        a_count[nt.upper()] = pstring.count(nt)
+  ref_count = pstring.count(',') + pstring.count('.')
+  return (dp, a_count, ref_count)
+  
+def hetr(ac):
+  h = 0
+  s = sum(ac.values())
+  if s == 0:
+    return 1
+  acv = [float(x)/s for x in ac.values()]
+  for v in acv:
+    h = h + v**2
+  return 1 - h  
+    
+  
+def build_alleles(chrom, pos, genome, alts):
+  alts = [x for x in alts if '*' not in x]
+  n_del = sum([x.count('-') for x in alts])
+  n_ins = sum([x.count('+') for x in alts])
+  max_ins = max_del = 0
+  start = pos - 1
+  end = start + 1
+  allele_map = {}
+
+  if n_ins > 0:
+    max_ins = max([len(x) for x in alts if '+' in  x]) - 1 
+  if n_del > 0:  
+    max_del = max([len(x) for x in alts if '-' in  x]) - 1
+  if max_ins or max_del:  
+    end = start + abs(max_ins - max_del) + 1
+  ref = genome.fetch(reference = chrom, start = start, end = end).upper()
+  # rewrite alternates
+  for x in range(len(alts)):
+    if '-' in alts[x]:
+      t = ref[:-len(alts[x]) + 1]
+      allele_map[alts[x]] = t
+    elif '+' in alts[x]:
+      t = ref + alts[x][1:]
+      allele_map[alts[x]] = t
+    else:
+      t = ref.replace(ref[0], alts[x], 1)  
+      allele_map[alts[x]] = t
+  return ref, allele_map   
+    
+
+def process_line(line, n_samples, genome):
+  max_alleles = 2
+  
+  alternate_alleles = []
+  sample_depth_list = {}
+  sample_alleles = {}
+  ref_alleles = {}
+  new_sample_alleles = {}
+  
+  grand_count = {}
+
+  f = line.strip().split()
+  chrom = f[0]
+  pos = int(f[1])
+  ref = f[2]
+  
+  for x in range(n_samples):
+    s = 3 + (3 * x)
+    e = s + 3
+    s_dp, s_ac, r_c = process_sample(f[s:e])
+    sample_depth_list[x] = s_dp
+    sample_alleles[x] = s_ac
+    ref_alleles[x] = r_c
+    for k in s_ac.keys():
+      try:
+        grand_count[k] += s_ac[k]
+      except KeyError:
+        grand_count[k] = s_ac[k]
+
+  n_alleles = len(grand_count.keys())
+  tot_count = grand_count.values()
+  count_sort = argsort(tot_count)
+  count_sort = count_sort[::-1]
+  p = 0
+  while len(alternate_alleles) <= max_alleles - 1:
+    try:
+      v = tot_count[count_sort[p]]  
+      alternate_alleles += [x for x in grand_count.keys() if grand_count[x] == v]
+      p += 1
+    except IndexError:
+      break
+  
+  ref, allele_map = build_alleles(chrom, pos, genome, alternate_alleles)
+  alt = []
+  # build first
+  
+  for sample in sample_alleles.keys():
+    new_sample_alleles[sample] = {ref:ref_alleles[sample]}
+    
+
+  for k in grand_count:
+    if grand_count[k] == tot_count[count_sort[0]]:
+      try:
+        alt.append(allele_map[k])
+        for sample in sample_alleles.keys():
+          try:
+            new_sample_alleles[sample][allele_map[k]] = sample_alleles[sample][k]
+          except KeyError:
+            new_sample_alleles[sample][allele_map[k]] = 0
+      except KeyError:
+        continue  
+      break
+
+  if len(tot_count) > 1:
+    for k in grand_count:
+      if grand_count[k] == tot_count[count_sort[1]]:
+        try:
+          alt.append(allele_map[k])
+          try:
+            new_sample_alleles[sample][allele_map[k]] = sample_alleles[sample][k]
+          except KeyError:
+            new_sample_alleles[sample][allele_map[k]] = 0
+        except KeyError:
+          continue  
+        break
+  alt = list(set(alt))
+  allele_list = [ref] + alt
+  
+  samples = {}
+  
+  for sample in range(n_samples):
+    h = hetr(new_sample_alleles[sample])
+    is_hom = False
+    if h < 0.25:
+      is_hom = True
+    ac = new_sample_alleles[sample].values()
+    ac_order = sorted(range(len(ac)), key=ac.__getitem__)
+    ac_order = ac_order[::-1]
+    alleles = new_sample_alleles[sample].keys()
+    # first allele
+    all1 = allele_list.index(alleles[ac_order[0]])
+    ac1 = ac[ac_order[0]]
+    try:
+      all2 = allele_list.index(alleles[ac_order[1]])
+      ac2 = ac[ac_order[1]]
+    except IndexError:
+      all2 = all1
+      ac2 = 0
+    if all1 > all2:
+      all1, all2 = all2, all1
+      ac1, ac2 = ac2, ac1
+    if min(ac2, ac1) == 0:
+      if ac2 > ac1:
+        all1 = all2
+      else:
+        all2 = all1  
+          
+    samples[sample] = ["%d/%d" % (all1, all2), "%d,%d" % (ac1, ac2), str(sample_depth_list[sample])]
+  return chrom, pos, ref, ','.join(alt), samples
+  
+  
+def vcf_print(data):
+#  sys.stdout.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % h_samples)
+
+  spool = "%s\t%d\t.\t%s\t%s\t255\tPASS\t.\tGT:AD:DP" % (data[0], data[1], data[2], data[3])
+  samples = data[4].keys()
+  samples.sort()
+  for sample in samples:
+    spool = "%s\t%s:%s:%s" % (spool, data[4][sample][0], data[4][sample][1], data[4][sample][2])
+  sys.stdout.write("%s\n" % spool)  
+  
+def plp2vcf():
+  option_parser = argparse.ArgumentParser(
+  description = "Converts a pileup into a vcf", 
+  prog="pileup2vcf")
+  option_parser.add_argument("-v", "--version", action="version", version="%(prog)s 0.1")
+  option_parser.add_argument("-i", "--input", help="input pileup file", action="store", type=argparse.FileType('r'), default=sys.stdin) 
+  option_parser.add_argument("-r", "--ref", help="Reference Genoem", action="store", required=True) 
+
+  # parse arguments
+  cli_options = option_parser.parse_args()
+  
+  genome = pysam.Fastafile(cli_options.ref)
+  line = cli_options.input.readline()
+  
+  n_samples = get_n_samples(line)
+  
+  print_header(n_samples)
+  
+  vcf_print(process_line(line, n_samples, genome))
+  for line in cli_options.input:
+    vcf_print(process_line(line, n_samples, genome))
+  
+  
+
+  
+  
+
+if __name__ == '__main__':
+  plp2vcf()