1. Davide Cittaro
  2. utils

Commits

Davide Cittaro  committed 59dec39

use allele frequencies for distance calculation

  • Participants
  • Parent commits 260852b
  • Branches default

Comments (0)

Files changed (1)

File vcf2dm.py

View file
 import vcf
 import numpy as np
 import argparse
+#import scipy.cluster.hierarchy
+#import matplotlib
+#matplotlib.use("Agg")
+#import matplotlib.pyplot as plt
+#from Bio import Phylo
 
 def order_samples(vcf_samples, samples):
   # ensure samples are in the same order than vcf...
   # silly function, no check...
-  new_samples = samples[:]
+  new_samples = []
+  idx = []
   for s in samples:
-    new_samples[vcf_samples.index(s)] = s
+    idx.append(vcf_samples.index(s))
+  idx.sort()
+  for x in idx:
+    new_samples.append(vcf_samples[x])  
+    
   return new_samples  
 
 def print_matrix():
 
   option_parser = argparse.ArgumentParser(
   description = "create a distance matrix for phylip from multiple bed files", 
-  prog="bed2dm.py")
+  prog="vcf2dm.py")
   option_parser.add_argument("--version", action="version", version="%(prog)s 0.1")
   option_parser.add_argument("-v", "--vcf", help="Input VCF file", action='store', type=file, required=True)
-  option_parser.add_argument("-m", "--maxalleles", help="Max number of alleles to track", default=6)
+  option_parser.add_argument("-m", "--maxalleles", help="Max number of alleles to track", type=int, default=6)
   option_parser.add_argument("-s", "--samples", help="Only calculate for these samples", nargs="+")
+  option_parser.add_argument("-f", "--usefreq", help="Use allele freq in place of counts", default=False, action="store_true")
 
   # parse arguments
   cli_options = option_parser.parse_args()
 
   # read vcf file
   vcf_parser = vcf.Reader(cli_options.vcf)
-  if len(cli_options.samples) == 0:
+  if not cli_options.samples:
     samples = vcf_parser.samples
   else:
     samples = cli_options.samples
       adc = [0 for x in range(cli_options.maxalleles)]
       if sample.called:
         if sample['AD']:
-          adc[:len(sample['AD'])] = sample['AD']
+          try:
+            adc[:len(sample['AD'])] = sample['AD']
+          except TypeError:
+            # single alleles...
+            adc[0] = sample['AD']  
       row.append(adc)
     data.append(row)
   
   data = np.array(data)
+  if cli_options.usefreq:
+    data = data.astype(float)
+    data_sum = np.sum(data, axis=2)
+    for x in range(cli_options.maxalleles):
+      data[:,:,x] = data[:,:,x] / data_sum
+    data[np.isnan(data)] = 0.
   
   dmatrix = np.zeros(len(samples)**2).reshape((len(samples), len(samples)))  
 
     line = ' '.join(["%.4f" % y for y in dmatrix[x]])
     sys.stdout.write("%-10s %s\n" % (samples[x][:10], line))    
     
+#  clusters = scipy.cluster.hierarchy.linkage(dmatrix, method='average')
+  
+  # build a newick tree and write it out
+    
+    
 if __name__ == '__main__': 
   print_matrix()