Commits

Davide Cittaro  committed d05572b

added heterozigosity measure and probability distance for vcf2dm

  • Participants
  • Parent commits 72abd17

Comments (0)

Files changed (1)

 import numpy as np
 import argparse
 import scipy.stats
-#import scipy.cluster.hierarchy
-#import matplotlib
-#matplotlib.use("Agg")
-#import matplotlib.pyplot as plt
-#from Bio import Phylo
+import math
+import bx.intervals
+
+def fill_ad(sample, max_alleles):
+  adc = [0 for x in range(max_alleles)]
+  if sample.called:
+    try:
+      if sample['AD']:
+        try:
+          adc[:len(sample['AD'])] = sample['AD']
+        except TypeError:
+          adc[0] = sample['AD'] 
+    except AttributeError:
+      None       
+  return adc        
 
 
 def log_fact(n):
-  return np.sum([np.log(x) for x in range(1, n + 1 )])
+  return np.floor(sum([math.log(x) for x in np.arange(1, n + 1 )]))
 
 def log_binom(n, k):
   if k > n:
   return log_fact(n) - log_fact(k) - log_fact(n - k)
 
 def log_mh_p(X, Y):
+  den = 0.0
+  num = 0.0
+  
   grand_sum = np.sum(X) + np.sum(Y)
   s1_sum = np.sum(X)
   den = log_binom(grand_sum, s1_sum)
-  num = 0.0
+
   for x in range(len(X)):
     a_sum = X[x] + Y[x]
     num += log_binom(a_sum, X[x])
-  return num - den
-  
+  return den - num
 
-    
 
-def mh_p(X, Y):
-  # return p associated to the multivariate hypergeometric test.
-  # hope this is right
-  grand_sum = np.sum(X) + np.sum(Y)
-  s1_sum = np.sum(X)
-  den = scipy.special.binom(grand_sum, s1_sum)
-  num = 1.0
-  for x in range(len(X)):
-    a_sum = X[x] + Y[x]
-    num *= scipy.special.binom(a_sum, X[x])
-  return num / den
 
 def order_samples(vcf_samples, samples):
   # ensure samples are in the same order than vcf...
     
   return new_samples  
 
-def print_matrix():
+def print_distance():
 
   option_parser = argparse.ArgumentParser(
   description = "create a distance matrix for phylip from multiple bed files", 
   option_parser.add_argument("-m", "--maxalleles", help="Max number of alleles to track", type=int, default=6)
   option_parser.add_argument("-s", "--samples", help="Only calculate for these samples", nargs="+")
   option_parser.add_argument("-p", "--useprob", help="Use probabilty for distance", default=False, action="store_true")
+  option_parser.add_argument("-y", "--usehet", help="Use heterozigosity measure", default=False, action="store_true")
+  option_parser.add_argument("-i", "--intervals", help="Create a bedgraph output on the provided interval files", type=file)
+  option_parser.add_argument("-c", "--coupled", help="If creating a bedgraph, use these two samples for comparinson", nargs=2)
+
 
   # parse arguments
   cli_options = option_parser.parse_args()
 
+  # if
+
   # read vcf file
   vcf_parser = vcf.Reader(cli_options.vcf)
   if not cli_options.samples:
 
   # we start with a python list, easier to update...
   data = []
+
+  if cli_options.intervals:
+    #prepare samples
+    sample_idx = []
+    for s in cli_options.coupled:
+      sample_idx.append(vcf_parser.samples.index(s))
+    #prepare the intervals  
+    intervals = {}
+    # create a bedgraph
+    for line in cli_options.intervals:
+      fields = line.split()
+      chrom, start, end = fields[0], int(fields[1]), int(fields[2])
+      try:
+        intervals[chrom].add(start, end, {'start':start, 'end':end, 'ad1' :[], 'ad2':[]})
+      except KeyError:
+        intervals[chrom] = bx.intervals.Intersecter()
+        intervals[chrom].add(start, end, {'start':start, 'end':end, 'ad1' :[], 'ad2':[]})
   
   for record in vcf_parser:
     row = []
-    for sample in record.samples:
-      if not sample.sample in samples: continue
-      adc = [0 for x in range(cli_options.maxalleles)]
-      if sample.called:
-        if sample['AD']:
+    if cli_options.intervals:
+      this_d = 0.0
+      sample1 = record.samples[sample_idx[0]]
+      sample2 = record.samples[sample_idx[1]]
+      try:
+        node = intervals[record.CHROM].find(record.POS,record.POS)
+      except KeyError:
+        continue  
+      if len(node):
+        node[0]['ad1'].append(fill_ad(sample1, cli_options.maxalleles))
+        node[0]['ad2'].append(fill_ad(sample2, cli_options.maxalleles))
+    else:
+      if not cli_options.usehet:  
+        for sample in record.samples:
+          if not sample.sample in samples: continue
+          adc = fill_ad(sample, cli_options.maxalleles)
+          row.append(adc)
+        data.append(row)
+      else:
+        for sample in record.samples:
+          # if not using probabilities, calculate heterozigosity
           try:
-            adc[:len(sample['AD'])] = sample['AD']
-          except TypeError:
-            # single alleles...
-            adc[0] = sample['AD']  
-      row.append(adc)
-    data.append(row)
-  
+            adc = fill_ad(sample, 6)
+            f = [x / float(sum(adc)) for x in adc]
+          except ZeroDivisionError:
+            f = adc
+          row.append(1 - np.sum([x**2 for x in f]))  
+        data.append(row)  
   data = np.array(data)
   
-  dmatrix = np.zeros(len(samples)**2).reshape((len(samples), len(samples)))  
-
-  if cli_options.useprob:
-    for x in range(len(samples)-1):
-      for y in range(x + 1, len(samples)):
-        dist = 0.0
-        for z in range(np.shape(data)[0]):
-          dist += -log_mh_p(data[z, x], data[z, y]))
-        dmatrix[x, y] = dist
-        dmatrix[y, x] = dist
-  else:      
+  if cli_options.intervals:
+    #write a bedgraph
+    for chrom in intervals.keys():
+      i_tree = intervals[chrom]
+      start = 0
+      while 1:
+        node = i_tree.find(start, start + 1)
+        if len(node) == 0:
+          # there's no interval after this
+          break
+        start = node[0]['start']
+        end = node[0]['end']
+        node[0]['ad1'] = np.array(node[0]['ad1'])
+        node[0]['ad2'] = np.array(node[0]['ad2'])
+        distance = 0.0
+        if cli_options.useprob:
+          for x in range(len(node[0]['ad1'])):
+            distance += log_mg_p(node[0]['ad1'][x], node[0]['ad2'][x])
+        else:
+          distance = np.linalg.norm(node[0]['ad1'] - node[0]['ad2'])
+        if distance != 0:  
+          sys.stdout.write("%s\t%d\t%d\t%d\n" % (chrom, start , end, distance))
+        start = end 
+        
+  else:  
+    # calculate big matrix and output
+    dmatrix = np.zeros(len(samples)**2).reshape((len(samples), len(samples)))  
+  
+    if cli_options.useprob:
+      for x in range(len(samples)-1):
+        for y in range(x + 1, len(samples)):
+          dist = 0.0
+          for z in range(np.shape(data)[0]):
+            dist += log_mh_p(data[z, x], data[z, y])
+          dmatrix[x, y] = dist
+          dmatrix[y, x] = dist
+    else:      
+      for x in range(len(samples)):
+        for y in range(x, len(samples)):
+          dist = np.linalg.norm(data[:,x] - data[:,y])
+          dmatrix[x, y] = dist
+          dmatrix[y, x] = dist
+  
+    #write dmatrix for phylip
+    sys.stdout.write("%d\n" % len(samples))
     for x in range(len(samples)):
-      for y in range(x, len(samples)):
-        dist = np.linalg.norm(data[:,x] - data[:,y])
-        dmatrix[x, y] = dist
-        dmatrix[y, x] = dist
-
-  #write dmatrix for phylip
-  sys.stdout.write("%d\n" % len(samples))
-  for x in range(len(samples)):
-    line = ' '.join(["%.4f" % y for y in dmatrix[x]])
-    sys.stdout.write("%-10s %s\n" % (samples[x][:10], line))    
-    
-#  clusters = scipy.cluster.hierarchy.linkage(dmatrix, method='average')
-  
-  # build a newick tree and write it out
+      line = ' '.join(["%.4f" % y for y in dmatrix[x]])
+      sys.stdout.write("%-10s %s\n" % (samples[x][:10], line))    
     
     
 if __name__ == '__main__': 
-  print_matrix()
+  print_distance()