# utils

committed bbaa733

various

# estimate_distribution.py

` import matplotlib.pyplot as plt`
` import scipy.stats as SS`
` `
`-_version = '0.1'`
`+_version = '0.2'`
`+_max_coverage = 4096`
` errors = {`
` 1 : "Please, provide an alignment in BAM format", `
` 2 : "Please, provide a regions file in BED format"`
`   sys.stdout.write("""`
` Usage:`
` `
`-  estimate_distribution.py [-b BED file] -f BAM file`
`+  estimate_distribution.py [-b BED file] [-z] -f BAM file`
`   `
` """)`
`     `
` `
` `
` def binomial_parameters(m, n):`
`+# this function estimate parameters for a binomial distribution`
`+# given a mean value m and a number of elements n`
`   p = m / n`
`   return (n, p)`
` `
` def negative_binomial_parameters(m, s):`
`+# this function estimate parameters for a negative binomial distribution`
`+# given a mean value m and a variance s`
`   r = abs(m**2 / (np.sqrt(s) - m))`
`   p = r / (r + m)`
`   return (r, p)`
` `
` `
`-def estimate_distribution(bam_file, bed_file = None, keep_zero = True):`
`+def estimate_distribution(bam_file, bed_file = None, keep_zero = True, upper_limit = 0, plot_distr = True):`
` `
`-  whole_genome = False`
`-  regions = []`
`-  count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup`
`-  mean = 1.0 #this should be relatively safe...`
`-  ss = 0.0`
`-  k = 1`
`+  actual_coverage = np.zeros(_max_coverage, dtype=np.double)	# will store the actual coverage`
`+  whole_genome = False											# A flag to differentiate whole genome or targeted regions`
`+  regions = []													# the list of regions that will be used for estimating`
`+  count_array = [] 												# This is really inefficient from memory point of view, but it takes ages to pileup`
`+  mean = 1.0 													# this should be relatively safe for running average`
`+  ss = 0.0														# sample variance for running average`
`+  k = 1															# a counter`
`   `
`   try:`
`-    bamh = pysam.Samfile(bam_file, 'rb')`
`+    bamh = pysam.Samfile(bam_file, 'rb')						`
`   except ValueError:  `
`-    sys.stderr.write("%s\n" % errors[1])`
`+    sys.stderr.write("%s\n" % errors[1])						`
`     sys.exit(1)`
` `
`   if bed_file:`
`     old_pos = region[1]`
`     p_iter = bamh.pileup(*region)`
`     for p in p_iter:`
`+      cov_idx = min(_max_coverage-1, p.n)`
`       if keep_zero and p.pos - old_pos > 1:`
`         nZ = p.pos - old_pos`
`         old_mean = mean`
`         mean = (old_mean * k) / (k + nZ)`
`         ss = ss + (old_mean * mean)`
`         k += nZ`
`+        actual_coverage[0] += nZ`
`       else:  `
`         (mean, ss) = update_stats(mean, ss, p.n, k)`
`         k += 1`
`+        actual_coverage[cov_idx] += 1`
` # so let's try the "all in memory" strategy...`
` #      for r in bamh.fetch(*region):`
` #        offset = r.qstart - region[1]`
`   sys.stdout.write("         Binomial parameters are n: %3e p: %3e\n" % (b_n, b_p))`
`   sys.stdout.write("          Poisson parameters are l: %3e\n" % (mean))`
`   `
`+  # plot actual coverage`
`+  actual_coverage = actual_coverage / np.sum(actual_coverage)`
`+  plt.plot(actual_coverage, label="Actual Coverage", color='black', linestyle=' ', marker='o')`
`+  `
`   # plot distributions`
`   stdev = np.sqrt(variance)`
`   xmin = 0`
`-  xmax = np.int(max(mean, nb_r) + 2 * stdev)`
`+  xmax = np.int(max(mean, nb_r) + stdev / 2)`
`+  if xmax < 10:`
`+    xmax = 10`
`+  if upper_limit:`
`+    xmax = upper_limit    `
`   x = range(xmin, xmax)`
`   poisson = SS.distributions.poisson(mean)`
`   nbinom = SS.distributions.nbinom(nb_r, nb_p)`
`   binom = SS.distributions.binom(b_n, b_p)`
`-  `
`+`
`   yp = poisson.pmf(x)`
`   yb = binom.pmf(x)`
`   ynb = nbinom.pmf(x)`
`   `
`-  ymax = max(yp.max(), yb.max(), ynb.max())	* 1.1`
`+  ymax = max(yp.max(), yb.max(), ynb.max(), actual_coverage.max())	* 1.1`
`   `
`-  plt.plot(x, yp, label="Poisson")`
`-  plt.plot(x, yb, label="Binomial")`
`-  plt.plot(x, ynb, label="Negative Binomial")`
`-  plt.title("Distributions for %s" % (bam_file))`
`+  if plot_distr:`
`+    plt.plot(x, yp, label="Poisson", color='blue', linestyle=' ', marker='o')`
`+    plt.plot(x, yb, label="Binomial", color='green', linestyle=' ', marker='o')`
`+    plt.plot(x, ynb, label="Negative Binomial", color='red', linestyle=' ', marker='o')`
`+  plt.title("Distributions for %s" % (os.path.basename(bam_file)))`
`   plt.ylim(0, ymax)`
`+  plt.xlim(-1, xmax)`
`   plt.grid()`
`-  plt.legend()`
`-  plt.savefig(bam_file.replace(".bam", ".png"))`
`+  plt.rcParams['legend.fontsize'] = 'small'`
`+  plt.legend(loc=0, ncol=2, scatterpoints=1 , numpoints=1)`
`+  plt.savefig(os.path.basename(bam_file.replace(".bam", ".png")))`
`   `
`   # close fh`
`   bamh.close()`
`   regions_file = None`
`   bam_file = None`
`   keep_zero = True`
`-  `
`+  upper_limit = 0`
`+  plot_distr = True`
`+ `
`   # read options`
`-  options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:z')`
`+  options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:zl:n')`
`   for o, a in options:`
`     if o == '-b':`
`       regions_file = os.path.expanduser(a)`
`       bam_file = os.path.expanduser(a)    `
`     if o == '-z':`
`       keep_zero = False  `
`+    if o == '-l':`
`+      upper_limit = int(a)  `
`+    if  o == '-n':`
`+      plot_distr = False`
` `
`   # some sanity checks:`
`   if not bam_file or not os.path.exists(bam_file):`
`     sys.stderr.write("%s\n" % errors[2])`
`     sys.exit(1)`
` `
`-  estimate_distribution(bam_file, regions_file, keep_zero)    `
`+  estimate_distribution(bam_file, regions_file, keep_zero, upper_limit, plot_distr)    `
` `
` `
` `
` if __name__ == '__main__':`
`   process_arguments()`
` `
`+`
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.