# HG changeset patch # User dawe # Date 1347540283 -7200 # Node ID bbaa733880f29de4753fab58280251f5069806ba # Parent 431a8dfbff43d65aa2ac704e002f02a19669ebe4 various diff --git a/estimate_distribution.py b/estimate_distribution.py --- a/estimate_distribution.py +++ b/estimate_distribution.py @@ -10,7 +10,8 @@ import matplotlib.pyplot as plt import scipy.stats as SS -_version = '0.1' +_version = '0.2' +_max_coverage = 4096 errors = { 1 : "Please, provide an alignment in BAM format", 2 : "Please, provide a regions file in BED format" @@ -20,7 +21,7 @@ sys.stdout.write(""" Usage: - estimate_distribution.py [-b BED file] -f BAM file + estimate_distribution.py [-b BED file] [-z] -f BAM file """) @@ -31,28 +32,33 @@ def binomial_parameters(m, n): +# this function estimate parameters for a binomial distribution +# given a mean value m and a number of elements n p = m / n return (n, p) def negative_binomial_parameters(m, s): +# this function estimate parameters for a negative binomial distribution +# given a mean value m and a variance s r = abs(m**2 / (np.sqrt(s) - m)) p = r / (r + m) return (r, p) -def estimate_distribution(bam_file, bed_file = None, keep_zero = True): +def estimate_distribution(bam_file, bed_file = None, keep_zero = True, upper_limit = 0, plot_distr = True): - whole_genome = False - regions = [] - count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup - mean = 1.0 #this should be relatively safe... - ss = 0.0 - k = 1 + actual_coverage = np.zeros(_max_coverage, dtype=np.double) # will store the actual coverage + whole_genome = False # A flag to differentiate whole genome or targeted regions + regions = [] # the list of regions that will be used for estimating + count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup + mean = 1.0 # this should be relatively safe for running average + ss = 0.0 # sample variance for running average + k = 1 # a counter try: - bamh = pysam.Samfile(bam_file, 'rb') + bamh = pysam.Samfile(bam_file, 'rb') except ValueError: - sys.stderr.write("%s\n" % errors[1]) + sys.stderr.write("%s\n" % errors[1]) sys.exit(1) if bed_file: @@ -79,15 +85,18 @@ old_pos = region[1] p_iter = bamh.pileup(*region) for p in p_iter: + cov_idx = min(_max_coverage-1, p.n) if keep_zero and p.pos - old_pos > 1: nZ = p.pos - old_pos old_mean = mean mean = (old_mean * k) / (k + nZ) ss = ss + (old_mean * mean) k += nZ + actual_coverage[0] += nZ else: (mean, ss) = update_stats(mean, ss, p.n, k) k += 1 + actual_coverage[cov_idx] += 1 # so let's try the "all in memory" strategy... # for r in bamh.fetch(*region): # offset = r.qstart - region[1] @@ -100,29 +109,40 @@ sys.stdout.write(" Binomial parameters are n: %3e p: %3e\n" % (b_n, b_p)) sys.stdout.write(" Poisson parameters are l: %3e\n" % (mean)) + # plot actual coverage + actual_coverage = actual_coverage / np.sum(actual_coverage) + plt.plot(actual_coverage, label="Actual Coverage", color='black', linestyle=' ', marker='o') + # plot distributions stdev = np.sqrt(variance) xmin = 0 - xmax = np.int(max(mean, nb_r) + 2 * stdev) + xmax = np.int(max(mean, nb_r) + stdev / 2) + if xmax < 10: + xmax = 10 + if upper_limit: + xmax = upper_limit x = range(xmin, xmax) poisson = SS.distributions.poisson(mean) nbinom = SS.distributions.nbinom(nb_r, nb_p) binom = SS.distributions.binom(b_n, b_p) - + yp = poisson.pmf(x) yb = binom.pmf(x) ynb = nbinom.pmf(x) - ymax = max(yp.max(), yb.max(), ynb.max()) * 1.1 + ymax = max(yp.max(), yb.max(), ynb.max(), actual_coverage.max()) * 1.1 - plt.plot(x, yp, label="Poisson") - plt.plot(x, yb, label="Binomial") - plt.plot(x, ynb, label="Negative Binomial") - plt.title("Distributions for %s" % (bam_file)) + if plot_distr: + plt.plot(x, yp, label="Poisson", color='blue', linestyle=' ', marker='o') + plt.plot(x, yb, label="Binomial", color='green', linestyle=' ', marker='o') + plt.plot(x, ynb, label="Negative Binomial", color='red', linestyle=' ', marker='o') + plt.title("Distributions for %s" % (os.path.basename(bam_file))) plt.ylim(0, ymax) + plt.xlim(-1, xmax) plt.grid() - plt.legend() - plt.savefig(bam_file.replace(".bam", ".png")) + plt.rcParams['legend.fontsize'] = 'small' + plt.legend(loc=0, ncol=2, scatterpoints=1 , numpoints=1) + plt.savefig(os.path.basename(bam_file.replace(".bam", ".png"))) # close fh bamh.close() @@ -132,9 +152,11 @@ regions_file = None bam_file = None keep_zero = True - + upper_limit = 0 + plot_distr = True + # read options - options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:z') + options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:zl:n') for o, a in options: if o == '-b': regions_file = os.path.expanduser(a) @@ -148,6 +170,10 @@ bam_file = os.path.expanduser(a) if o == '-z': keep_zero = False + if o == '-l': + upper_limit = int(a) + if o == '-n': + plot_distr = False # some sanity checks: if not bam_file or not os.path.exists(bam_file): @@ -158,10 +184,11 @@ sys.stderr.write("%s\n" % errors[2]) sys.exit(1) - estimate_distribution(bam_file, regions_file, keep_zero) + estimate_distribution(bam_file, regions_file, keep_zero, upper_limit, plot_distr) if __name__ == '__main__': process_arguments() +