Source

utils / vcf2table.py

Full commit
#!/usr/bin/env python2.7

import sys
import vcf
import openpyxl

# fields to dump
#    VALIDATED GENOTYPES EFFECTS

# NOTE(review): not referenced anywhere in the visible source; presumably meant
# as a cap on the number of effect entries written per record -- confirm or remove.
_MAX_EFFECT = 24

def get_efflist(eff):
  """Reduce an ``EFF`` annotation to the most severe effect per gene.

  Each annotation entry is expected to look like
  ``EFFECT(IMPACT|class|codon_change|aa_change|aa_length|GENE|...)``.
  After splitting, field 0 is the effect name, field 1 the impact, field 3
  the codon change, field 4 the amino-acid change and field 6 the gene name.

  Args:
    eff: the annotation, either as one comma-separated string or as a
      list/tuple of individual entries (VCF parsers may already have split
      a multi-valued INFO field). ``None`` or an empty value is accepted.

  Returns:
    A flat list ``[gene, codon_change, aa_change, effect, gene, ...]`` with
    four items per gene, genes in order of first appearance. For each gene
    the entry with the highest impact (HIGH > MODERATE > LOW > MODIFIER >
    anything else) is kept; on a tie the first such entry wins.
    Entries with fewer than seven fields carry no gene name and are skipped.
  """
  # Larger number = more severe; unrecognised impacts rank below all of these.
  impact_rank = {'MODIFIER': 0, 'LOW': 1, 'MODERATE': 2, 'HIGH': 3}

  if not eff:
    return []
  if isinstance(eff, (list, tuple)):
    entries = eff
  else:
    entries = eff.split(',')

  gene_order = []  # first-seen order, so the output does not depend on dict ordering
  best = {}        # gene -> (rank, [codon_change, aa_change, effect])
  for entry in entries:
    if not entry:
      continue
    fields = entry.replace("(", "|").replace("|)", "").split('|')
    if len(fields) < 7:
      continue
    gene = fields[6]
    rank = impact_rank.get(fields[1], -1)
    if gene not in best:
      gene_order.append(gene)
    elif rank <= best[gene][0]:
      # An equally or more severe effect is already recorded for this gene.
      continue
    best[gene] = (rank, [fields[3], fields[4], fields[0]])

  results = []
  for gene in gene_order:
    results.append(gene)
    results.extend(best[gene][1])
  return results
  


    
#  alt_alleles = all.split(',')
#  effs = [x.replace("(","|").replace("|)","").split('|') for x in  eff.split(',')]
#  high = [x for x in effs if x[1] == 'HIGH']
#  moderate = [x for x in effs if x[1] == 'MODERATE']
#  low = [x for x in effs if x[1] == 'LOW']
#  modifier = [x for x in effs if x[1] == 'MODIFIER']
#  # get the highest impact
#  if len(high) > 0:
#    effout = high
#  elif len(moderate) > 0:
#    effout = moderate
#  elif len(low) > 0:
#    effout = low
#  else:
#    effout = modifier
#  # remove redundancy
#  gene_effect = {}
#  gene_codon = {}
#  gene_aa = {}
#  for x in effout:
#    try:
#      gene_effect[x[6]].append(x[0])
#    except KeyError:
#      gene_effect[x[6]] = [x[0]]
#      gene_aa[x[6]] = [x[4]]  
#      gene_codon[x[6]] = [x[3]]  
#    gene_codon[x[6]].append(x[3])
#    gene_aa[x[6]].append(x[4])
#  results = ''  
#  for gene in gene_effect.keys():
#    n = len(gene_effect[gene])
#    t_data = []
#    for x in range(n):
#      t_data.append('\t'.join([gene_codon[gene][x], gene_aa[gene][x], gene_effect[gene][x] ]))
#    t_data = list(set(t_data))
#    results = results + gene + '\t' + '\t'.join(t_data)
#  return results  
  


# Usage: vcf2table.py <input.vcf> <comma-separated INFO fields> <output.xlsx>
#
# Writes one spreadsheet row per VCF record: the fixed record columns, the
# requested INFO fields, three columns per sample (genotype plus allele
# depths) and finally the per-gene effect summary from get_efflist().

def _write_row(sheet, row_number, values):
  """Write `values` into consecutive cells of row `row_number`, starting at column A."""
  for idx, value in enumerate(values):
    col = openpyxl.cell.get_column_letter(1 + idx)
    sheet.cell("%s%s" % (col, row_number)).value = value


vcf_path = sys.argv[1]
fields_in_record = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER']
fields_in_INFO = sys.argv[2].split(',')
fileout = sys.argv[3]
wb = openpyxl.workbook.Workbook()
ws = wb.worksheets[0]
#fields_in_INFO = ['DP','MQ','VQSLOD','AC','AF','InbreedingCoeff','GMAF','VALIDATED','dbnsfpAncestral_allele','','LOF','dbnsfpPolyphen2_HVAR_pred','dbnsfpSIFT_score','dbnsfpGERP++_RS','dbnsfpGERP++_NR','','isPolymorphic','Phigene','Phiscore','Phiclass','TG_gene','TG_rank','dbnsfpUniprot_acc','dbnsfpEnsembl_transcriptid']

# Excel limits worksheet names to 31 characters and forbids \ / * ? : [ ],
# so use the file name without its directory and replace the forbidden characters.
title = vcf_path.replace('\\', '/').split('/')[-1]
for bad in '*?:[]':
  title = title.replace(bad, '_')
ws.title = title[:31]

# The context manager closes the input file even if a record fails to convert.
with open(vcf_path, 'r') as vcf_handle:
  parser = vcf.Reader(vcf_handle)
  samples = parser.samples

  header = fields_in_record + fields_in_INFO
  for s in samples:
    header.append("%s:GT" % s)
    header.append("%s:A1" % s)
    header.append("%s:A2" % s)
  # NOTE(review): get_efflist() returns four values per gene, so a record
  # annotated for several genes extends past these four header columns.
  header = header + ["GENE","CODON","AA","EFFECT"]

  row = 1
  _write_row(ws, row, header)

  for record in parser:
    # An empty or missing FILTER is reported as PASS; when several filters
    # are set, all of them are kept (';'-separated) instead of only the first.
    filters = record.FILTER
    if not filters:
      FILTER = "PASS"
    elif isinstance(filters, (list, tuple)):
      FILTER = ';'.join([str(f) for f in filters])
    else:
      FILTER = str(filters)

    alt = ','.join([str(a) for a in record.ALT])
    line = [str(x) for x in (record.CHROM, record.POS, record.ID, record.REF, alt, record.QUAL, FILTER)]

    for k in fields_in_INFO:
      v = record.INFO.get(k, '')
      if isinstance(v, list):
        v = ','.join([str(x) for x in v])
      else:
        v = str(v)
      line.append(v)

    # get samples
    for s in record.samples:
      genotype = s['GT']
      try:
        ad = s['AD']
      except AttributeError:
        # This call carries no AD value.
        ad = None
      if not genotype:
        s_data = ['./.', '0', '0']
      elif not ad:
        # Genotype present but no depths: still emit the two depth columns.
        s_data = [genotype, '0', '0']
      else:
        # NOTE(review): if AD holds more than two values (presumably possible
        # at multi-allelic sites) the following columns shift relative to the
        # header -- confirm the intended layout.
        s_data = [genotype] + list(ad)
      line = line + s_data

    # get effects; records without an EFF annotation simply get no effect columns
    eff = record.INFO.get('EFF')
    if isinstance(eff, (list, tuple)):
      # get_efflist() takes the annotation as one comma-separated string.
      eff = ','.join([str(e) for e in eff if e])
    if eff:
      line = line + get_efflist(eff)

    row += 1
    _write_row(ws, row, line)

# All data lives on the first worksheet; the former `ws.create_sheet()` call
# was dropped because create_sheet is a Workbook method, not a Worksheet one.
wb.save(filename = fileout)