# HG changeset patch # User dawe # Date 1345445542 -7200 # Node ID ad70f056b3b0244c0bca8df6153d533f763f1bec # Parent f9979a94ce10a784b02067c90e1f1d775ee53985 added actual coverage plot diff --git a/estimate_distribution.py b/estimate_distribution.py --- a/estimate_distribution.py +++ b/estimate_distribution.py @@ -11,6 +11,7 @@ import scipy.stats as SS _version = '0.1' +_max_coverage = 4096 errors = { 1 : "Please, provide an alignment in BAM format", 2 : "Please, provide a regions file in BED format" @@ -20,7 +21,7 @@ sys.stdout.write(""" Usage: - estimate_distribution.py [-b BED file] -f BAM file + estimate_distribution.py [-b BED file] [-z] -f BAM file """) @@ -31,10 +32,14 @@ def binomial_parameters(m, n): +# this function estimate parameters for a binomial distribution +# given a mean value m and a number of elements n p = m / n return (n, p) def negative_binomial_parameters(m, s): +# this function estimate parameters for a negative binomial distribution +# given a mean value m and a variance s r = abs(m**2 / (np.sqrt(s) - m)) p = r / (r + m) return (r, p) @@ -42,17 +47,18 @@ def estimate_distribution(bam_file, bed_file = None, keep_zero = True): - whole_genome = False - regions = [] - count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup - mean = 1.0 #this should be relatively safe... - ss = 0.0 - k = 1 + actual_coverage = np.zeros(_max_coverage, dtype=np.double) # will store the actual coverage + whole_genome = False # A flag to differentiate whole genome or targeted regions + regions = [] # the list of regions that will be used for estimating + count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup + mean = 1.0 # this should be relatively safe for running average + ss = 0.0 # sample variance for running average + k = 1 # a counter try: - bamh = pysam.Samfile(bam_file, 'rb') + bamh = pysam.Samfile(bam_file, 'rb') except ValueError: - sys.stderr.write("%s\n" % errors[1]) + sys.stderr.write("%s\n" % errors[1]) sys.exit(1) if bed_file: @@ -79,15 +85,18 @@ old_pos = region[1] p_iter = bamh.pileup(*region) for p in p_iter: + cov_idx = min(_max_coverage, p.n) if keep_zero and p.pos - old_pos > 1: nZ = p.pos - old_pos old_mean = mean mean = (old_mean * k) / (k + nZ) ss = ss + (old_mean * mean) k += nZ + actual_coverage[0] += nZ else: (mean, ss) = update_stats(mean, ss, p.n, k) k += 1 + actual_coverage[cov_idx] += 1 # so let's try the "all in memory" strategy... # for r in bamh.fetch(*region): # offset = r.qstart - region[1] @@ -100,6 +109,10 @@ sys.stdout.write(" Binomial parameters are n: %3e p: %3e\n" % (b_n, b_p)) sys.stdout.write(" Poisson parameters are l: %3e\n" % (mean)) + # plot actual coverage + actual_coverage = actual_coverage / np.sum(actual_coverage) + plt.plot(actual_coverage, label="Actual Coverage", color='black', linestyle='dashed') + # plot distributions stdev = np.sqrt(variance) xmin = 0 @@ -113,11 +126,11 @@ yb = binom.pmf(x) ynb = nbinom.pmf(x) - ymax = max(yp.max(), yb.max(), ynb.max()) * 1.1 + ymax = max(yp.max(), yb.max(), ynb.max(), actual_coverage.max()) * 1.1 - plt.plot(x, yp, label="Poisson") - plt.plot(x, yb, label="Binomial") - plt.plot(x, ynb, label="Negative Binomial") + plt.plot(x, yp, label="Poisson", color='blue') + plt.plot(x, yb, label="Binomial", color='green') + plt.plot(x, ynb, label="Negative Binomial", color='red') plt.title("Distributions for %s" % (bam_file)) plt.ylim(0, ymax) plt.grid()