Source

utils / estimate_distribution.py

Full commit
#!/usr/bin/env python

import sys
import os
import pysam
import getopt
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import scipy.stats as SS

_version = '0.2'
_max_coverage = 4096
errors = {
1 : "Please, provide an alignment in BAM format", 
2 : "Please, provide a regions file in BED format"
}

def print_usage():
  sys.stdout.write("""
Usage:

  estimate_distribution.py [-b BED file] [-z] -f BAM file
  
""")
    
def update_stats(old_mean, old_variance, value, count):
  mean = old_mean + (value - old_mean) / count
  variance = old_variance + (value - old_mean)*(value - mean)
  return (mean, variance)


def binomial_parameters(m, n):
# this function estimate parameters for a binomial distribution
# given a mean value m and a number of elements n
  p = m / n
  return (n, p)

def negative_binomial_parameters(m, s):
# this function estimate parameters for a negative binomial distribution
# given a mean value m and a variance s
  r = abs(m**2 / (np.sqrt(s) - m))
  p = r / (r + m)
  return (r, p)


def estimate_distribution(bam_file, bed_file = None, keep_zero = True, upper_limit = 0, plot_distr = True):

  actual_coverage = np.zeros(_max_coverage, dtype=np.double)	# will store the actual coverage
  whole_genome = False											# A flag to differentiate whole genome or targeted regions
  regions = []													# the list of regions that will be used for estimating
  count_array = [] 												# This is really inefficient from memory point of view, but it takes ages to pileup
  mean = 1.0 													# this should be relatively safe for running average
  ss = 0.0														# sample variance for running average
  k = 1															# a counter
  
  try:
    bamh = pysam.Samfile(bam_file, 'rb')						
  except ValueError:  
    sys.stderr.write("%s\n" % errors[1])						
    sys.exit(1)

  if bed_file:
    whole_genome = False
    for line in open(bed_file, 'r'):
      f = line.split()
      chrom = f[0]
      start = int(f[1])
      end = int(f[2])
      regions.append((chrom, start, end))
  else:
    whole_genome = True
    for sq in bamh.header['SQ']:
      regions.append((sq['SN'], 0, sq['LN']))

  for region in regions:
    length = region[2] - region[1]
    count_array.append(np.zeros(length, dtype=np.uint16))
    if whole_genome:
      sys.stderr.write("Analyzing chromosome %s\n" % (region[0]))
    else:
      sys.stderr.write("Analyzing region %s\t%d\t%d\n" % (region))
# here I was using runnning average but it is slooow      
    old_pos = region[1]
    p_iter = bamh.pileup(*region)
    for p in p_iter:
      cov_idx = min(_max_coverage-1, p.n)
      if keep_zero and p.pos - old_pos > 1:
        nZ = p.pos - old_pos
        old_mean = mean
        mean = (old_mean * k) / (k + nZ)
        ss = ss + (old_mean * mean)
        k += nZ
        actual_coverage[0] += nZ
      else:  
        (mean, ss) = update_stats(mean, ss, p.n, k)
        k += 1
        actual_coverage[cov_idx] += 1
# so let's try the "all in memory" strategy...
#      for r in bamh.fetch(*region):
#        offset = r.qstart - region[1]
#        count_array[-1][offset:offset + r.alen] += 1
  variance = ss / (k - 1)
  sys.stdout.write("Mean: %.3e, Variance: %.3e\n" % (mean, variance))
  nb_r, nb_p = negative_binomial_parameters(mean, variance)
  b_n, b_p = binomial_parameters(mean, variance)  
  sys.stdout.write("Negative Binomial parameters are r: %3e p: %3e\n" % (nb_r, nb_p))
  sys.stdout.write("         Binomial parameters are n: %3e p: %3e\n" % (b_n, b_p))
  sys.stdout.write("          Poisson parameters are l: %3e\n" % (mean))
  
  # plot actual coverage
  actual_coverage = actual_coverage / np.sum(actual_coverage)
  plt.plot(actual_coverage, label="Actual Coverage", color='black', linestyle=' ', marker='o')
  
  # plot distributions
  stdev = np.sqrt(variance)
  xmin = 0
  xmax = np.int(max(mean, nb_r) + stdev / 2)
  if xmax < 10:
    xmax = 10
  if upper_limit:
    xmax = upper_limit    
  x = range(xmin, xmax)
  poisson = SS.distributions.poisson(mean)
  nbinom = SS.distributions.nbinom(nb_r, nb_p)
  binom = SS.distributions.binom(b_n, b_p)

  yp = poisson.pmf(x)
  yb = binom.pmf(x)
  ynb = nbinom.pmf(x)
  
  ymax = max(yp.max(), yb.max(), ynb.max(), actual_coverage.max())	* 1.1
  
  if plot_distr:
    plt.plot(x, yp, label="Poisson", color='blue', linestyle=' ', marker='o')
    plt.plot(x, yb, label="Binomial", color='green', linestyle=' ', marker='o')
    plt.plot(x, ynb, label="Negative Binomial", color='red', linestyle=' ', marker='o')
  plt.title("Distributions for %s" % (os.path.basename(bam_file)))
  plt.ylim(0, ymax)
  plt.xlim(-1, xmax)
  plt.grid()
  plt.rcParams['legend.fontsize'] = 'small'
  plt.legend(loc=0, ncol=2, scatterpoints=1 , numpoints=1)
  plt.savefig(os.path.basename(bam_file.replace(".bam", ".png")))
  
  # close fh
  bamh.close()
  
def process_arguments():
  
  regions_file = None
  bam_file = None
  keep_zero = True
  upper_limit = 0
  plot_distr = True
 
  # read options
  options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:zl:n')
  for o, a in options:
    if o == '-b':
      regions_file = os.path.expanduser(a)
    if o == '-v':
      sys.stdout.write("Version %s\n" % _version)
      sys.exit(0)
    if o == '-h':
      print_usage()
      sys.exit(0)
    if o == '-f':
      bam_file = os.path.expanduser(a)    
    if o == '-z':
      keep_zero = False  
    if o == '-l':
      upper_limit = int(a)  
    if  o == '-n':
      plot_distr = False

  # some sanity checks:
  if not bam_file or not os.path.exists(bam_file):
    sys.stderr.write("%s\n" % errors[1])
    sys.exit(1)
  
  if regions_file and not os.path.exists(regions_file):
    sys.stderr.write("%s\n" % errors[2])
    sys.exit(1)

  estimate_distribution(bam_file, regions_file, keep_zero, upper_limit, plot_distr)    



if __name__ == '__main__':
  process_arguments()