Commits

Inti Pedroso committed b45afdc

Added the file re-organization that had not been committed before. Fixed a minor bug when no CNVs were detected on a segment. Added extra filters on CNV calls: calls whose confidence interval overlaps 1 and calls whose mean is close to 1 (i.e., with a hard-coded genotype == 1) are now removed (see the sketch below).
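
For reference, a minimal sketch of the two new filters described above, mirroring the filter block in _find_segment_positions below (assumes a pandas DataFrame of per-segment calls with the lower_ci_cnv, upper_ci_cnv and mean_cnv columns; the filter_cnv_calls name is illustrative only and not part of this commit):

import numpy as np

def filter_cnv_calls(cnv_regions):
    # keep a call only if its confidence interval does not straddle 1
    ci_clear_of_one = ((cnv_regions.lower_ci_cnv > 1) & (cnv_regions.upper_ci_cnv > 1)) | \
                      ((cnv_regions.lower_ci_cnv < 1) & (cnv_regions.upper_ci_cnv < 1))
    cnv_regions = cnv_regions[ci_clear_of_one]
    # drop calls whose mean rounds to the copy-neutral genotype of 1
    genotype = np.round(cnv_regions.mean_cnv / 0.5) * 0.5
    return cnv_regions[genotype != 1]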

  • Parent commit fc3bc6b
  • Branches develop, feature/correction_gcv2, feature/docs, release/0.01

Comments (0)

Files changed (2)

File utilities/cnv_caller.py

+from __future__ import division
+import sys
+sys.path.append('/home/inti/My_Soft/')
+sys.path.append('/home/pedrosoi/APP/')
+import pyhsmm
+from pyhsmm.util.text import progprint_xrange
+
+import pandas as pd
+import numpy as np
+import time 
+import copy
+observed_states = ''
+genome_average_mean = ''
+log_w = ''
+states_means = ''
+
+class iCC():
+  def __init__(self):
+    print time.ctime(), "Initializing CNV caller object"
+    # set some default values
+    self.split_in_gaps = False
+    self.min_gap_size = 0
+    self.max_segment_size = 5000
+    self.min_segment_size = 500 
+    self.min_group_size = 500
+    self.Nmax = 0
+    self.hyper_priors = {}
+    self.prior_dist = {}
+    self.working_distn = None
+    self.MCMCopts = dict(niter = 100, burnin = 50, sample_freq = 5)
+    self.sampled_models = {}
+    self.changepoint_counts = {}
+    self.observed_states_counts = {}
+    self.model_opts = dict(alphakappa_a_0=50.,
+			    alphakappa_b_0=1.,
+			    rho_a_0=5000.,
+			    rho_b_0=1., # higher rho_a_0 means stickier
+			    gamma_a_0=1.,
+			    gamma_b_0=3.,# higher rho_a_0 means stickier, rhos is the self-transition fraction and is distributed as a Beta
+			    init_state_concentration=50.
+			   )
+    self.model = {}
+    
+  def _restart(self):
+    self.read_data()
+    self.set_hyperpriors()
+    self.define_prior_distns()
+    self.set_model()
+    self.load_data_into_model()
+
+    
+  def set_model(self):
+    import pyhsmm
+    self.model[self.working_distn] =   pyhsmm.models.StickyHMMEigen(
+						      obs_distns     = self.prior_dist[self.working_distn],
+						      alphakappa_a_0=50.,alphakappa_b_0=1.,
+						      rho_a_0=5000.,rho_b_0=1., # higher rho_a_0 means stickier
+						      gamma_a_0=1.,gamma_b_0=3.,# higher rho_a_0 means stickier, rhos is the self-transition fraction and is distributed as a Beta
+						      init_state_concentration=50.)
+						      #alphakappa_a_0 = self.model_opts['alphakappa_a_0'],
+						      #alphakappa_b_0 = self.model_opts['alphakappa_b_0'],
+						      #rho_a_0        = self.model_opts['rho_a_0'],
+						      #rho_b_0        = self.model_opts['rho_b_0'], # higher rho_a_0 means stickier
+						      #gamma_a_0      = self.model_opts['gamma_a_0'],
+						      #gamma_b_0      = self.model_opts['gamma_b_0'],# higher rho_a_0 means stickier, rhos is the self-transition fraction and is distributed as a Beta
+						      #init_state_concentration = 50.)#self.model_opts['init_state_concentration']# this implies a tight prior on gamma controlling the stick-breaking process. The stick-breaking process is a Beta(1,gamma)
+					
+  
+  def load_data_into_model(self,transform = None):
+    print time.ctime(), 'Loading data'
+    for i in self.data.keys():
+      data = self.data[i]
+      if transform is not None:
+	data = transform(data)
+      self.model[self.working_distn].add_data(data)
+    print time.ctime(), '   ... done'
+    
+  def sample(self,save = False,niter = None, freq = None, parallel=False,numtoresample='all',all_at = 0.1, verbose = False):
+    number_of_iterations = self.MCMCopts['niter']
+    save_freq = self.MCMCopts['sample_freq']
+    if niter is not None:
+      number_of_iterations = niter
+    if freq is not None:
+      save_freq = freq
+
+    for itr in progprint_xrange(number_of_iterations):
+      if parallel:
+	n_to_resample = numtoresample
+	if (itr % all_at) == 0:
+	  n_to_resample = 'all'
+	self.model[self.working_distn].resample_model_parallel2(numtoresample=n_to_resample)
+      else:
+	self.model[self.working_distn].resample_model()
+      if save:
+	if self.working_distn not in self.sampled_models.keys():
+	  self.sampled_models[self.working_distn] = {}
+	if (itr % save_freq) == 0:
+	  self.sampled_models[self.working_distn][itr] = copy.deepcopy(self.model[self.working_distn])
+      if verbose:
+	m = self.model[self.working_distn] 
+	print itr, m.trans_distn.alpha, m.trans_distn.gamma, m.trans_distn.rho, m.trans_distn.kappa, m.trans_distn.beta.max()
+
+  def _define_data_storage(self):
+    self.changepoint_counts[self.working_distn] = {}
+    self.observed_states_counts[self.working_distn] = {}
+    for idx, s_list in enumerate(self.model[self.working_distn].states_list):
+      self.changepoint_counts[self.working_distn][idx] = np.zeros_like(s_list.data)
+      # observed states at sequences
+      self.observed_states_counts[self.working_distn][idx] = np.zeros((self.Nmax,len(s_list.data)))
+  
+  def _update_changepoint_and_observed_states_counts(self):
+    for m in self.sampled_models[self.working_distn].values():
+      for idx, s_list in enumerate(m.states_list):
+	self.changepoint_counts[self.working_distn][idx] += np.concatenate(( (0,), np.diff(s_list.stateseq) != 0 ))
+	for i, j in enumerate(s_list.stateseq):
+	  self.observed_states_counts[self.working_distn][idx][j,i] += 1
+
+  def _normalize_counts(self):
+    Nmodels = len(self.sampled_models[self.working_distn])
+    for idx in self.observed_states_counts[self.working_distn].keys():
+	for row_idx in xrange(self.observed_states_counts[self.working_distn][idx].shape[0]):
+	  self.observed_states_counts[self.working_distn][idx][row_idx] /= Nmodels
+	self.changepoint_counts[self.working_distn][idx] /= Nmodels
+	
+  def _calculate_average_states_mean_var(self):
+    self.states_moments = {}
+    self.states_params = {}
+    states_means = ''
+    states_var = ''
+    if self.working_distn == 'negative_binomial':
+      Rs = np.zeros(self.Nmax)
+      Ps = np.zeros(self.Nmax)
+      for m in self.sampled_models[self.working_distn].values():
+	for i,distn in enumerate(m.obs_distns):
+	  Rs[i] += distn.r
+	  Ps[i] += distn.p
+      Nmodels = len(self.sampled_models[self.working_distn])
+      Rs /= Nmodels
+      Ps /= Nmodels
+      
+      states_means = [ Rs[i]*Ps[i]/(1-Ps[i]) for i in xrange(self.Nmax) ]
+      states_var   = [ Rs[i]*Ps[i]/(1-Ps[i])**2 for i in xrange(self.Nmax) ]
+      self.states_params[self.working_distn] = {}
+      self.states_params[self.working_distn]['Rs'] = Rs
+      self.states_params[self.working_distn]['Ps'] = Ps
+
+    elif self.working_distn == 'gaussian':
+      MUs = np.zeros(self.Nmax)
+      SIGMASQs = np.zeros(self.Nmax)
+      for m in self.sampled_models[self.working_distn].values():
+	for i,distn in enumerate(m.obs_distns):
+	  MUs[i] += distn.mu
+	  SIGMASQs[i] += distn.sigmasq
+      Nmodels = len(self.sampled_models[self.working_distn])
+      MUs /= Nmodels
+      SIGMASQs /= Nmodels
+      self.states_params[self.working_distn] = {}
+      self.states_params[self.working_distn]['mu'] = MUs
+      self.states_params[self.working_distn]['sigmasq'] = SIGMASQs
+      states_means = MUs 
+      states_var = SIGMASQs
+    else:
+      raise NotImplementedError("distribution [%s] not implemented" % self.working_distn)
+    
+    states_means = np.hstack(states_means)
+    states_var = np.hstack(states_var)
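+    # columns of states_moments: 0 = state mean, 1 = state variance, 2 = upper 95% bound, 3 = lower 95% bound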
+    self.states_moments[self.working_distn] = np.zeros((self.Nmax,4))
+    self.states_moments[self.working_distn][:,0] = states_means
+    self.states_moments[self.working_distn][:,1] = states_var
+    self.states_moments[self.working_distn][:,2] = states_means + 1.96*np.sqrt(states_var)
+    self.states_moments[self.working_distn][:,3] = states_means - 1.96*np.sqrt(states_var)
+    
+  
+  def _data_median(self):
+    values = pyhsmm.util.stats.flattendata([l.data for l in self.model[self.working_distn].states_list ])
+    weights = pyhsmm.util.stats.flattendata([self.data_files[idx].post_prob_of_cnv.values for idx in self.data_files.keys()])
+    weights -= 1
+    self.data_mean, self.data_variance = self.weighted_avg_and_std(values, weights=weights)
+    #self.data_mean,self.data_variance = self._fixed_effect_combined_stat(self.states_moments[self.working_distn][:,0],self.states_moments[self.working_distn][:,1],np.exp(self.states_log_prob_of_CN1))
+    self._data_lower_ci = self.data_mean - 1.96*np.sqrt(self.data_variance)
+    self._data_upper_ci = self.data_mean + 1.96*np.sqrt(self.data_variance)
+    
+
+  def _states_log_prob_of_CN1(self):
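+    # log of the overall fraction of data points assigned to each state across posterior samples;
+    # used as the probability that a state represents copy number 1 (assumes most of the genome is copy neutral)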
+    Nmodels = len(self.sampled_models[self.working_distn])
+    log_w = np.vstack([ np.apply_along_axis(np.sum,1,self.observed_states_counts[self.working_distn][idx]*Nmodels) for idx in self.observed_states_counts[self.working_distn].keys()  ])
+    if len(self.observed_states_counts[self.working_distn].keys()) > 1:
+      log_w = np.apply_along_axis(np.sum,0,log_w)      
+    else:
+      log_w = log_w[0]
+    log_w = np.log(log_w)
+    log_w -= np.logaddexp.reduce(log_w)
+    self.states_log_prob_of_CN1 = log_w
+
+  def _post_prob_of_cnv(self):
+    print time.ctime(),"   Calculating prob of CNV" 
+    for idx in self.observed_states_counts[self.working_distn].keys():
+      self.data_files[idx]['post_prob_of_cnv'] = np.hstack([ np.dot(self.observed_states_counts[self.working_distn][idx][:,row_idx],self.observed_states_counts[self.working_distn][idx][:,row_idx]) - np.dot(self.observed_states_counts[self.working_distn][idx][:,row_idx], np.exp(self.states_log_prob_of_CN1)) for row_idx in xrange(len(self.model[self.working_distn].states_list[idx].data)) ])
+      self.data_files[idx]['post_prob_of_cnv'][ self.data_files[idx]['post_prob_of_cnv'] < 0 ] = 0
+      self.data_files[idx]['post_prob_of_cnv'][ self.data_files[idx]['post_prob_of_cnv'] > 1 ] = 1
+	
+  def _mean_ci_for_cnv(self):
+    for idx in self.observed_states_counts[self.working_distn].keys():
+	print time.ctime(),"   Working on dataset [", idx+1,"] of [",self.Ndata_files,"]" 
+	self.data_files[idx]['cp_count'] = self.changepoint_counts[self.working_distn][idx]
+	for col in ['cnv_call','cnv_call_lower_ci','cnv_call_upper_ci' ]:
+	  self.data_files[idx][col] = np.nan
+	for row_idx in xrange(self.observed_states_counts[self.working_distn][idx].shape[1]):
+          # calculate mean and sd of CNVs
+	  m,v = self._fixed_effect_combined_stat(self.states_moments[self.working_distn][:,0],self.states_moments[self.working_distn][:,1],self.observed_states_counts[self.working_distn][idx][:,row_idx])
+	  m_lci = m - 1.96*np.sqrt(v)
+	  m_uci = m + 1.96*np.sqrt(v)
+	  # store the cnv params
+	  self.data_files[idx]['cnv_call'][row_idx] = np.double(m)/np.double(self.data_mean)
+	  self.data_files[idx]['cnv_call_lower_ci'][row_idx] = m_lci/self.data_mean # self._data_lower_ci 
+	  self.data_files[idx]['cnv_call_upper_ci'][row_idx] = m_uci/self.data_mean # self._data_upper_ci
+  
+  def _summarize_cnv_regions(self,x,idx):
+    back = np.zeros(9)
+    # chromosome
+    back[0] = x.ix[x.irow(0).name,'chr']
+    # start
+    back[1] = x.ix[x.irow(0).name,'start']
+    # end
+    back[2] = x.ix[x.irow(len(x)-1).name,'end']
+    # n_data_points
+    back[3] = len(x)
+    # size  
+    back[4] = back[2] - back[1]
+    # mean and var
+    w = np.apply_along_axis( np.sum,1,self.observed_states_counts[self.working_distn][idx][:,x.index] )
+    w /= w.sum()
+    cn_m, cn_v = self._fixed_effect_combined_stat(self.states_moments[self.working_distn][:,0],self.states_moments[self.working_distn][:,1],w)
+    back[5] = cn_m/self.data_mean
+    back[6] = (cn_m - 1.96*np.sqrt(cn_v))/self._data_lower_ci
+    back[7] = (cn_m + 1.96*np.sqrt(cn_v))/self._data_upper_ci
+    cnv_post_prob = np.dot(w,w) - np.dot(np.exp(self.states_log_prob_of_CN1),w) #1 - np.exp(np.apply_along_axis( np.logaddexp.reduce,1,self.states_log_prob_of_CN1 + np.log(w) ) )
+    back[8] = cnv_post_prob
+    return pd.DataFrame(back).T
+
+  def _find_segment_positions(self,prob=0.9, min_size=0,max_size=np.inf, filter_overlap_1 = True, filter_genotype_eq_1 = True):
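+    # group consecutive bins with post_prob_of_cnv > prob into candidate CNV segments,
+    # summarize each segment and apply the size, CI-overlap and genotype filters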
+    calls = {}
+    for file_idx in self.data_files.keys():
+      f = self.data_files[file_idx]
+      cnv_regions = f[f.post_prob_of_cnv > prob]
+      step = np.diff(cnv_regions.start.values)
+      normal_step = np.median(step)
+      cnv_regions['step'] = 0
+      cnv_regions['step'][1:] = step
+      ids = 0
+      cnv_regions['cnv_id'] = 0
+      for i in xrange(len(cnv_regions)):
+	true_row_id = cnv_regions.irow(i).name 
+	if cnv_regions.ix[true_row_id,'step'] > normal_step:
+	  ids +=1
+	cnv_regions.ix[true_row_id,'cnv_id'] = ids
+
+      cnv_regions = cnv_regions.groupby('cnv_id').apply(lambda x: self._summarize_cnv_regions(x,file_idx))
+      if cnv_regions.empty:
+        # no CNV regions detected on this data segment; skip to the next one
+        continue
+      cnv_regions.columns = ['chromosome','start','end','n_dat_point','size','mean_cnv','lower_ci_cnv','upper_ci_cnv','cnv_post_prob']
+      cnv_regions['genotype'] = np.round(cnv_regions['mean_cnv']/0.5)*0.5
+	# filter
+      # use ['size'] (the segment length column); .size would be the DataFrame attribute
+      cnv_regions = cnv_regions[cnv_regions['size'] > min_size]
+      cnv_regions = cnv_regions[cnv_regions['size'] < max_size]
+      if filter_overlap_1:
+	both_over_one = (cnv_regions.lower_ci_cnv > 1) * (cnv_regions.upper_ci_cnv > 1) == True
+	both_under_one = (cnv_regions.lower_ci_cnv < 1) * (cnv_regions.upper_ci_cnv < 1) == True
+	cnv_regions = cnv_regions[both_over_one +  both_under_one]
+      if filter_genotype_eq_1:
+	cnv_regions = cnv_regions[cnv_regions.genotype != 1]
+      calls[file_idx] = cnv_regions.reset_index()
+    self.cnv_call = {}
+    self.cnv_call[self.working_distn] = calls
+  
+  def process_samples(self):
+    print time.ctime(),"Processing samples from the posterior"
+    self._define_data_storage()
+    # count number of times at each data point there is a change in state
+    #self._update_changepoint_counts()
+    #self._update_observed_states_counts()
+    self._update_changepoint_and_observed_states_counts()
+    # Normalize the counts to be frequencies.
+    self._normalize_counts()
+    # calculate the prob for each post-sample of a state representing copy number 1
+    self._states_log_prob_of_CN1()
+    # calculate prob of cnv at each data point
+    self._post_prob_of_cnv()
+    # calculate the mean and var of the states
+    self._calculate_average_states_mean_var()
+    # calculate the median of the data
+    if not hasattr(self,'data_mean'):
+      self._data_median()
+    print time.ctime(),"Summarising CNV calls"
+    self._mean_ci_for_cnv()
+    print time.ctime(), "Calling CNVs"
+    self._find_segment_positions()
+    print time.ctime(), "   ... done"
+
+
+  def focus_on_cnvs(self,prob = 0.5,flank = 100, burnin=0,niter=0,freq=0):
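+    # restrict the data tables and model state sequences to bins with post_prob_of_cnv > prob,
+    # plus `flank` bins of context on each side, then optionally resample so the MCMC
+    # concentrates on candidate CNV regions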
+    self.process_samples()
+    self.sampled_models = {}
+    self.observed_states_counts = {}
+    self.changepoint_counts = {}
+    if not hasattr(self ,'_original_data'):
+      self._original_data = {}
+      self._last_states_list = []
+      self._cnv_regions = {}
+      for idx in self.data_files.keys():
+	self._original_data[idx] = self.data_files[idx].copy()
+      self._last_states_list = copy.deepcopy(self.model[self.working_distn].states_list)
+	
+    for idx in self.data_files.keys():
+      # replace the original data file table to be able to call cnvs next time without problems
+      indexes = self.data_files[idx][self.data_files[idx].post_prob_of_cnv > prob].index.values
+      indexes = np.unique(np.ravel([[range(i,i+flank),range(i-flank,i)] for i in indexes ]))
+      indexes = indexes[indexes >= 0]
+      indexes = indexes[indexes <= np.max(self.data_files[idx].index.values)]
+      self.data_files[idx] = self.data_files[idx].ix[indexes,:]
+      # store the data index for the cnv regions
+      self._cnv_regions[idx] = self.data_files[idx].index.values
+      # update the data stored on the model 
+      self.model[self.working_distn].states_list[idx].stateseq = self.model[self.working_distn].states_list[idx].stateseq[ self._cnv_regions[idx] ]
+      self.model[self.working_distn].states_list[idx].data = self.model[self.working_distn].states_list[idx].data[ self._cnv_regions[idx] ]
+      # reset index of table
+      if hasattr(self.data_files[idx] ,'level_0'):
+	self.data_files[idx] = self.data_files[idx].drop(['level_0'],1)
+      self.data_files[idx] = self.data_files[idx].reset_index()
+      
+    if burnin > 0:
+      self.sample(niter = burnin,save=False)
+    if niter > 0:
+      self.sample(niter = niter,save=True, freq = freq)
+      self.process_samples()
+
+  def unfocus_on_cnvs(self,burnin=0,niter=0,freq=0):
+    # save the state seq we have now
+    focused_state_list = copy.deepcopy(self.model[self.working_distn].states_list)
+    # recover the last states sequence
+    self.model[self.working_distn].states_list = copy.deepcopy(self._last_states_list)
+    # empty this object
+    self._last_states_list = []
+    # overwrite the state values for the data points we have been working on
+    for idx in xrange(len(self.model[self.working_distn].states_list)):
+      self.model[self.working_distn].states_list[idx].stateseq[ self._cnv_regions[idx] ] = focused_state_list[idx].stateseq
+
+    # recover the original data
+    self.data_files = self._original_data
+    self._original_data = {}
+    
+    # if desired, run some sampling to update the genome-wide estimates
+    if burnin > 0:
+      self.sample(niter = burnin,save=False)
+    if niter > 0:
+      self.sampled_models = {}
+      self.sample(niter = niter,save=True, freq = freq)
+      self.process_samples()
+
+    
+  @staticmethod
+  def _sample_nbinomial(r,p,size=1):
+    return np.random.poisson(np.random.gamma(r,p/(1-p),size=size))
+
+    
+  def set_files(self,files):
+    self.files = files
+  
+  def define_prior_distns(self,distn = None):
+    import pyhsmm
+    if distn is not None:
+      self.working_distn = distn
+    if self.working_distn == 'negative_binomial':
+      self.prior_dist[self.working_distn] = [pyhsmm.distributions.NegativeBinomial(**self.hyper_priors[self.working_distn]) for idx in range(self.Nmax)]
+    elif self.working_distn == 'gaussian':
+      self.prior_dist[self.working_distn] = [pyhsmm.distributions.ScalarGaussianNonconjNIX(**self.hyper_priors[self.working_distn]) for idx in range(self.Nmax)]
+    else:
+      print "Distribution [",distn,"] not implemented"
+    
+
+  def set_Nmax(self,Nmax = 0):
+    if Nmax == 0:
+      alldata = np.hstack([ self.data[i] for i in self.data.keys()])
+      self.Nmax = int(2*(round(alldata.max()/np.mean(alldata)/0.5) + 1))
+      # put a limit on the maximum number of states
+      if self.Nmax > 100:
+	  self.Nmax = 100
+    else:
+      self.Nmax = Nmax
+
+    print time.ctime(), "Using weak limit approximation with a max number of states of [",self.Nmax,"]"
+
+
+  def set_hyperpriors(self,distn = None,tolerance = 0.4):
+    import pyhsmm
+    import pyhsmm.basic.distributions as distributions
+    import scipy as sp
+    import scipy.stats as ss
+    import random
+    if distn is not None:
+      self.working_distn = distn
+
+    print time.ctime(), "Getting hyper-prior estimates from the data"
+    alldata = np.hstack([ self.data[i] for i in self.data.keys()])
+    # keep only positive counts first so that ratio and alldata stay the same length
+    alldata = alldata[alldata > 0]
+    ratio = alldata/np.median(alldata)
+    alldata = alldata[ratio > 0.5 + tolerance]
+    ratio = alldata/np.median(alldata)
+    alldata = alldata[ratio < 1.5 - tolerance]
+    if self.working_distn == 'negative_binomial':
+      Rs = []
+      Ps = []
+      nb = distributions.NegativeBinomial(5.,20.,1.,1.)
+      small_data = np.array(random.sample(alldata,1000))# np.random.poisson(np.median(alldata),100) 
+      # short burnin
+      for i in xrange(1000):
+	  nb.resample(small_data)
+
+      for i in xrange(100):
+	  nb.resample(small_data)
+	  Rs.append(nb.r)
+	  Ps.append(nb.p)
+      fit_gamma_alpha,fit_gamma_loc,fit_gamma_beta=ss.gamma.fit(Rs,loc=0,scale=1)
+      fit_beta_a, fit_beta_b, fit_beta_loc, fit_beta_scale = ss.beta.fit(Ps,loc=0,scale=1)
+      self.hyper_priors[self.working_distn] = { 'k_0': fit_gamma_alpha, 'theta_0': fit_gamma_beta , 'alpha_0': fit_beta_a, 'beta_0': fit_beta_b }
+    elif self.working_distn == 'gaussian':
+      log_data = np.log(alldata[alldata > 0])
+      del(alldata)
+      means_prior_mean = np.mean(log_data)
+      means_prior_sd = np.max(log_data) - 0
+      self.hyper_priors[self.working_distn] = dict(
+                      mu_0=means_prior_mean, tausq_0=(1/(means_prior_sd**2)), # mean and variance for the prior for means
+                      sigmasq_0=1, nu_0=1) # mean and number of pseudo-observations for variances
+                                                   # setting a large sigmasq_0 and increasing
+                                                   # nu_0 will encourage less segmentation
+      
+    print time.ctime(), "   ... done"
+
+  def read_data(self):
+    datatospread = {}
+    counter = 0
+    print time.ctime(), len(self.files), " files to load"
+    data_files = {}
+    for file in self.files:
+      table = pd.read_table(file,names=['chr','start','end','counts'],sep="\t",na_values=['inf','-inf'])
+      table['counts'] = table['counts'].astype(float)
+      # FILTER DATA
+      # TODO: this sets NaN to 0; needs to change later on!!!
+      table = table.fillna(0)
+      # remove infinity values
+      table = table[table['counts'] != np.inf]
+      table = table[table['counts'] != -np.inf]
+      # rm rows with null counts
+      table = table[table['counts'].isnull() == False]
+      # rm values < 0. This may need to change if using some other type of values
+      table = table[table['counts'] >=0 ]
+      # remove values way off the mean?
+      #table = table[table['counts'] < 1000*np.median(table['counts'][table['counts'] > 0] )]
+      if self.split_in_gaps:
+	steps = np.diff(table['start'].values)
+	normal_step = np.median(steps)
+	if self.min_gap_size > 0:
+	  normal_step = self.min_gap_size
+	group_size = 0
+	labels = np.zeros_like(table['counts'].values)
+	for i,s in enumerate(steps):
+	  if s > normal_step:
+	    if group_size > self.min_group_size:
+	      counter +=1
+	      group_size = 0
+	    else:
+	      group_size += 1
+	  else:
+	    group_size +=1
+	  labels[i] = counter
+	# assign last data point to last cluster
+	labels[-1] = counter
+	# assign data to dict to load in data model later on
+	table['group_label'] = labels
+	if group_size < self.min_group_size:
+	  table['group_label'][table['group_label'] == counter] = counter-1
+	  counter -= 1
+      else:
+	table['group_label'] = counter
+	
+      table_grouped = table.groupby('group_label')
+
+      groups_idx = table_grouped.indices
+      for idx in groups_idx:
+	idx = int(idx)
+	datatospread[idx] = table['counts'][ groups_idx[idx] ].values.astype(int)
+	data_files[idx] = table_grouped.get_group( idx ).reset_index()
+      # increase counter to start new files on new data sequence
+      counter += 1
+
+    self.data = datatospread
+    self.data_files = data_files    
+    self.Ndata_files = len(self.data)
+    self.rename_chromosomes()
+    print time.ctime(), "   data loaded into [",len(self.data),"] data sequences"
+    print time.ctime(), "   ... done"
+
+  def rename_chromosomes(self):
+    alt_name_dict = {'X' : 23,'Y' : 24,'XY':25,'MT':26,'20':202}
+    for idx in self.data_files.keys():
+      for alt_name in alt_name_dict.keys():
+	self.data_files[idx].chr.values[self.data_files[idx].chr.values.astype(str) == alt_name] = alt_name_dict[alt_name]
+
+
+  def make_bed(self,filename):
+    if len(self.cnv_call[self.working_distn]) == 0:
+      print time.ctime(), "No CNVs detected"
+    else:
+      calls = pd.concat(self.cnv_call[self.working_distn].values())
+      calls = calls.sort(['chromosome','start'])
+      cols_to_print = ['chromosome','start','end','n_dat_point','size','mean_cnv','lower_ci_cnv','upper_ci_cnv','cnv_post_prob','cnv_post_prob_lower_ci','cnv_post_prob_upper_ci','genotype']
+      calls.ix[:,cols_to_print].to_csv(filename, header=True,index=None,sep="\t")
+
+  def print_summary(self,filename,how='all'):
+    data = pd.concat(self.data_files.values())
+    if how == 'all':
+      out = ''.join((filename,'.cnv_calls_summary.txt'))
+      data.to_csv(out,header=True,index=None,sep="\t")
+    elif how == 'by_chr':
+      index = data.groupby('chr').indices
+      for chr_id in index.keys():
+	out = ''.join((filename,'.cnv_calls_summary.',str(chr_id),'.txt'))
+	data.ix[index[chr_id],:].to_csv(out,header=True,index=None,sep="\t")
+
+  
+
+  def find_segment_positions(data_files, prob=0.9, min_size=0,max_size=np.inf, observed_states = observed_states, genome_average = genome_average_mean,log_w=log_w,states_means=states_means):
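+    # legacy module-level variant of _find_segment_positions; relies on the observed_states,
+    # genome_average_mean, log_w and states_means globals defined at the top of this file
+    # and on an external summarize_cnv_regions helper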
+    calls = {}
+    for file_idx in data_files.keys():
+      f = data_files[file_idx]
+      cnv_regions = f[f.post_prob_of_cnv > prob]
+      step = np.diff(cnv_regions.start.values)
+      normal_step = np.median(step)
+      cnv_regions['step'] = 0
+      cnv_regions['step'][1:] = step
+      ids = 0
+      cnv_regions['cnv_id'] = 0
+      for i in xrange(len(cnv_regions)):
+	true_row_id = cnv_regions.irow(i).name 
+	if cnv_regions.ix[true_row_id,'step'] > normal_step:
+	  ids +=1
+	cnv_regions.ix[true_row_id,'cnv_id'] = ids
+
+      cnv_regions = cnv_regions.groupby('cnv_id').apply(lambda x: summarize_cnv_regions(x,observed_states[file_idx],genome_average = genome_average_mean,log_w=log_w,states_means=states_means))
+      cnv_regions.columns = ['chromosome','start','end','n_dat_point','size','mean_cnv','lower_ci_cnv','upper_ci_cnv','cnv_post_prob','cnv_post_prob_lower_ci','cnv_post_prob_upper_ci']
+      cnv_regions['genotype'] = np.round(cnv_regions['mean_cnv']/0.5)*0.5
+      # filter
+      cnv_regions = cnv_regions[cnv_regions['size'] > min_size]
+      cnv_regions = cnv_regions[cnv_regions['size'] < max_size]
+      calls[file_idx] = cnv_regions.reset_index()
+    return calls
+
+      
+  @staticmethod
+  def _changepoint_indicators_from_stateseq(stateseq):
+      '''
+      returns a binary array "ret" of the same size as stateseq
+      where ret[i]=1 iff there's a changepoint between times i-1 and i
+      '''
+      return np.concatenate(( (0,), np.diff(stateseq) != 0 ))
+
+  @staticmethod
+  def approx_k_0(mean_counts):
+    ''' Makes the assumption that p = 0.5, corresponding to a flat prior on p, i.e., Beta(1,1),
+        and that theta_0 = k_0/2
+    '''
+    return np.sqrt(mean_counts*2) + 0.001
+
+
+  @staticmethod
+  def bma(mu, sd, pr_M):
+    # Bayesian model averaging of means; note that 'sd' is used as a variance
+    # in the law-of-total-variance computation below
+    average = np.average(mu, weights=pr_M)
+    var_avg = -1*average**2
+    for k in xrange(len(pr_M)):
+      var_avg += (sd[k] + mu[k]**2)*pr_M[k]
+    return average, var_avg
+
+  def weighted_avg_and_std(self,values, weights):
+      """
+          Returns the weighted average and variance.
+	  
+	  values, weights -- Numpy ndarrays with the same shape.
+	  """
+      average = np.average(values, weights=weights)
+      variance = np.dot(weights, (values-average)**2)/weights.sum()  # Fast and numerically precise
+      return average, variance
+    
+  def _fixed_effect_combined_stat(self,mu,sigma,w):
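+    # fixed-effect combination of the state means: weighted mean and, for weights that
+    # sum to 1, the corresponding variance sum(w_i**2 * sigma_i)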
+    b_fixed = np.dot(mu,w)/w.sum()
+    v_fixed = np.dot(w*w,sigma)
+    return b_fixed, v_fixed
+    
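
For orientation, a minimal driver sketch for the iCC class above. This is not part of the commit; the file names and sampler settings are placeholders, and it assumes pyhsmm is installed, that utilities/ is importable as a package, and that the input is the tab-separated chr/start/end/counts table expected by read_data:

from utilities.cnv_caller import iCC

caller = iCC()
caller.set_files(['sample1.counts.bed'])           # hypothetical input file
caller.read_data()
caller.set_Nmax()                                  # weak-limit number of states chosen from the data
caller.set_hyperpriors(distn='negative_binomial')  # or 'gaussian'
caller.define_prior_distns()
caller.set_model()
caller.load_data_into_model()
caller.sample(niter=100, save=True, freq=5)        # MCMC, keeping every 5th sampled model
caller.process_samples()                           # posterior CNV probabilities and segment calls
caller.make_bed('sample1.cnv_calls.bed')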

File utilities/plot.py

+def plot_cnv_profile(table,cnv_segments=None,data_mean = None):
+    from matplotlib import pylab as plt
+    import pandas as pd
+    import numpy as np
+    # get the range of data on x and y axis
+    Xs = (table.end.values + table.start.values)/2
+    if data_mean is None:
+      data_mean = np.median(table['counts'])
+    Ys = table['counts']/data_mean
+    xmin, xmax = Xs.min(), Xs.max()
+    ymin, ymax = Ys.min(), Ys.max()
+    ymin = 0
+    plt.xlim(xmin=xmin,xmax=xmax)
+    # add horizontal grid lines
+    for cn in set([0.5/5*y_i for y_i in xrange(0, 1 + int(round(ymax+0.5))*10,5)]):
+      plt.axhline(y=cn,color='gray', linestyle='--')
+    # plot the raw count data normalized by the genome median
+    plt.plot(Xs,Ys,'g+')
+    
+    # plot the confidence interval shaded area for the posterior probability of being a CNV
+    #plt.fill_between(x=Xs,y1=table['post_prob_of_cnv_upper_ci'].values,y2=table['post_prob_of_cnv_lower_ci'].values, color='red', alpha=0.2, edgecolor='white' );
+    # plot the post-prob of being a CNV
+    plt.plot(Xs,table['post_prob_of_cnv'],'r-');
+    
+    # plot the estimated mean and CI for each bin
+    plt.errorbar(Xs,table['cnv_call'],yerr = [table['cnv_call']-table['cnv_call_lower_ci'], table['cnv_call_upper_ci']-table['cnv_call']], fmt='co');
+    if type(cnv_segments) == pd.core.frame.DataFrame:
+      plt.hlines(y=cnv_segments.genotype,colors='purple', linewidth = 2,linestyle='solid',xmin=cnv_segments.start,xmax=cnv_segments.end)
+      for row in cnv_segments.iterrows():
+	plt.fill_between(x=[row[1][3],row[1][4]], y1=row[1][9], y2=row[1][8],color='purple', alpha=0.2, edgecolor='white' )
+    ## make the plot
+    plt.show()
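
Continuing the driver sketch after cnv_caller.py above, a hedged example of plotting one data sequence with its CNV calls (the index 0 is illustrative; cnv_call and data_mean exist once process_samples() has run):

from utilities.plot import plot_cnv_profile

idx = 0
plot_cnv_profile(caller.data_files[idx],
                 cnv_segments=caller.cnv_call[caller.working_distn].get(idx),
                 data_mean=caller.data_mean)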