Source

utils / estimate_distribution.py

Full commit
#!/usr/bin/env python

import sys
import os
import pysam
import getopt
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import scipy.stats as SS

_version = '0.1'
errors = {
1 : "Please, provide an alignment in BAM format", 
2 : "Please, provide a regions file in BED format"
}

def print_usage():
  sys.stdout.write("""
Usage:

  estimate_distribution.py [-b BED file] -f BAM file
  
""")
    
def update_stats(old_mean, old_variance, value, count):
  mean = old_mean + (value - old_mean) / count
  variance = old_variance + (value - old_mean)*(value - mean)
  return (mean, variance)


def binomial_parameters(m, n):
  p = m / n
  return (n, p)

def negative_binomial_parameters(m, s):
  r = abs(m**2 / (np.sqrt(s) - m))
  p = r / (r + m)
  return (r, p)


def estimate_distribution(bam_file, bed_file = None, keep_zero = True):

  whole_genome = False
  regions = []
  count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup
  mean = 1.0 #this should be relatively safe...
  ss = 0.0
  k = 1
  
  try:
    bamh = pysam.Samfile(bam_file, 'rb')
  except ValueError:  
    sys.stderr.write("%s\n" % errors[1])
    sys.exit(1)

  if bed_file:
    whole_genome = False
    for line in open(bed_file, 'r'):
      f = line.split()
      chrom = f[0]
      start = int(f[1])
      end = int(f[2])
      regions.append((chrom, start, end))
  else:
    whole_genome = True
    for sq in bamh.header['SQ']:
      regions.append((sq['SN'], 0, sq['LN']))

  for region in regions:
    length = region[2] - region[1]
    count_array.append(np.zeros(length, dtype=np.uint16))
    if whole_genome:
      sys.stderr.write("Analyzing chromosome %s\n" % (region[0]))
    else:
      sys.stderr.write("Analyzing region %s\t%d\t%d\n" % (region))
# here I was using runnning average but it is slooow      
    old_pos = region[1]
    p_iter = bamh.pileup(*region)
    for p in p_iter:
      if keep_zero and p.pos - old_pos > 1:
        nZ = p.pos - old_pos
        old_mean = mean
        mean = (old_mean * k) / (k + nZ)
        ss = ss + (old_mean * mean)
        k += nZ
      else:  
        (mean, ss) = update_stats(mean, ss, p.n, k)
        k += 1
# so let's try the "all in memory" strategy...
#      for r in bamh.fetch(*region):
#        offset = r.qstart - region[1]
#        count_array[-1][offset:offset + r.alen] += 1
  variance = ss / (k - 1)
  sys.stdout.write("Mean: %.3e, Variance: %.3e\n" % (mean, variance))
  nb_r, nb_p = negative_binomial_parameters(mean, variance)
  b_n, b_p = binomial_parameters(mean, variance)  
  sys.stdout.write("Negative Binomial parameters are r: %3e p: %3e\n" % (nb_r, nb_p))
  sys.stdout.write("         Binomial parameters are n: %3e p: %3e\n" % (b_n, b_p))
  sys.stdout.write("          Poisson parameters are l: %3e\n" % (mean))
  
  # plot distributions
  stdev = np.sqrt(variance)
  xmin = 0
  xmax = np.int(max(mean, nb_r) + 2 * stdev)
  x = range(xmin, xmax)
  poisson = SS.distributions.poisson(mean)
  nbinom = SS.distributions.nbinom(nb_r, nb_p)
  binom = SS.distributions.binom(b_n, b_p)
  
  yp = poisson.pmf(x)
  yb = binom.pmf(x)
  ynb = nbinom.pmf(x)
  
  ymax = max(yp.max(), yb.max(), ynb.max())	* 1.1
  
  plt.plot(x, yp, label="Poisson")
  plt.plot(x, yb, label="Binomial")
  plt.plot(x, ynb, label="Negative Binomial")
  plt.title("Distributions for %s" % (bam_file))
  plt.ylim(0, ymax)
  plt.grid()
  plt.legend()
  plt.savefig(bam_file.replace(".bam", ".png"))
  
  # close fh
  bamh.close()
  
def process_arguments():
  
  regions_file = None
  bam_file = None
  keep_zero = True
  
  # read options
  options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:z')
  for o, a in options:
    if o == '-b':
      regions_file = os.path.expanduser(a)
    if o == '-v':
      sys.stdout.write("Version %s\n" % _version)
      sys.exit(0)
    if o == '-h':
      print_usage()
      sys.exit(0)
    if o == '-f':
      bam_file = os.path.expanduser(a)    
    if o == '-z':
      keep_zero = False  

  # some sanity checks:
  if not bam_file or not os.path.exists(bam_file):
    sys.stderr.write("%s\n" % errors[1])
    sys.exit(1)
  
  if regions_file and not os.path.exists(regions_file):
    sys.stderr.write("%s\n" % errors[2])
    sys.exit(1)

  estimate_distribution(bam_file, regions_file, keep_zero)    



if __name__ == '__main__':
  process_arguments()