# utils / estimate_distribution.py

 ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167``` ```#!/usr/bin/env python import sys import os import pysam import getopt import numpy as np import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import scipy.stats as SS _version = '0.1' errors = { 1 : "Please, provide an alignment in BAM format", 2 : "Please, provide a regions file in BED format" } def print_usage(): sys.stdout.write(""" Usage: estimate_distribution.py [-b BED file] -f BAM file """) def update_stats(old_mean, old_variance, value, count): mean = old_mean + (value - old_mean) / count variance = old_variance + (value - old_mean)*(value - mean) return (mean, variance) def binomial_parameters(m, n): p = m / n return (n, p) def negative_binomial_parameters(m, s): r = abs(m**2 / (np.sqrt(s) - m)) p = r / (r + m) return (r, p) def estimate_distribution(bam_file, bed_file = None, keep_zero = True): whole_genome = False regions = [] count_array = [] # This is really inefficient from memory point of view, but it takes ages to pileup mean = 1.0 #this should be relatively safe... ss = 0.0 k = 1 try: bamh = pysam.Samfile(bam_file, 'rb') except ValueError: sys.stderr.write("%s\n" % errors[1]) sys.exit(1) if bed_file: whole_genome = False for line in open(bed_file, 'r'): f = line.split() chrom = f[0] start = int(f[1]) end = int(f[2]) regions.append((chrom, start, end)) else: whole_genome = True for sq in bamh.header['SQ']: regions.append((sq['SN'], 0, sq['LN'])) for region in regions: length = region[2] - region[1] count_array.append(np.zeros(length, dtype=np.uint16)) if whole_genome: sys.stderr.write("Analyzing chromosome %s\n" % (region[0])) else: sys.stderr.write("Analyzing region %s\t%d\t%d\n" % (region)) # here I was using runnning average but it is slooow old_pos = region[1] p_iter = bamh.pileup(*region) for p in p_iter: if keep_zero and p.pos - old_pos > 1: nZ = p.pos - old_pos old_mean = mean mean = (old_mean * k) / (k + nZ) ss = ss + (old_mean * mean) k += nZ else: (mean, ss) = update_stats(mean, ss, p.n, k) k += 1 # so let's try the "all in memory" strategy... # for r in bamh.fetch(*region): # offset = r.qstart - region[1] # count_array[-1][offset:offset + r.alen] += 1 variance = ss / (k - 1) sys.stdout.write("Mean: %.3e, Variance: %.3e\n" % (mean, variance)) nb_r, nb_p = negative_binomial_parameters(mean, variance) b_n, b_p = binomial_parameters(mean, variance) sys.stdout.write("Negative Binomial parameters are r: %3e p: %3e\n" % (nb_r, nb_p)) sys.stdout.write(" Binomial parameters are n: %3e p: %3e\n" % (b_n, b_p)) sys.stdout.write(" Poisson parameters are l: %3e\n" % (mean)) # plot distributions stdev = np.sqrt(variance) xmin = 0 xmax = np.int(max(mean, nb_r) + 2 * stdev) x = range(xmin, xmax) poisson = SS.distributions.poisson(mean) nbinom = SS.distributions.nbinom(nb_r, nb_p) binom = SS.distributions.binom(b_n, b_p) yp = poisson.pmf(x) yb = binom.pmf(x) ynb = nbinom.pmf(x) ymax = max(yp.max(), yb.max(), ynb.max()) * 1.1 plt.plot(x, yp, label="Poisson") plt.plot(x, yb, label="Binomial") plt.plot(x, ynb, label="Negative Binomial") plt.title("Distributions for %s" % (bam_file)) plt.ylim(0, ymax) plt.grid() plt.legend() plt.savefig(bam_file.replace(".bam", ".png")) # close fh bamh.close() def process_arguments(): regions_file = None bam_file = None keep_zero = True # read options options, arguments = getopt.getopt(sys.argv[1:], 'b:vhf:z') for o, a in options: if o == '-b': regions_file = os.path.expanduser(a) if o == '-v': sys.stdout.write("Version %s\n" % _version) sys.exit(0) if o == '-h': print_usage() sys.exit(0) if o == '-f': bam_file = os.path.expanduser(a) if o == '-z': keep_zero = False # some sanity checks: if not bam_file or not os.path.exists(bam_file): sys.stderr.write("%s\n" % errors[1]) sys.exit(1) if regions_file and not os.path.exists(regions_file): sys.stderr.write("%s\n" % errors[2]) sys.exit(1) estimate_distribution(bam_file, regions_file, keep_zero) if __name__ == '__main__': process_arguments() ```