Commits

Inti Pedroso committed 10c389c

rm some files

Comments (0)

Files changed (8)

utilities/__init__.pyc

Binary file removed.

utilities/cnv_caller.pyc

Binary file removed.

utilities/cnv_caller.py~

-from __future__ import division
-import sys
-sys.path.append('/home/inti/My_Soft/')
-sys.path.append('/home/pedrosoi/APP/')
-import pyhsmm
-from pyhsmm.util.text import progprint_xrange
-
-import pandas as pd
-import numpy as np
-import time 
-import copy
-observed_states = ''
-genome_average_mean = ''
-log_w = ''
-states_means = ''
-
-class iCC():
-  def __init__(self):
-    print time.ctime(), "Initializing CNV caller object"
-    # set some default values
-    self.split_in_gaps = False
-    self.min_gap_size = 0
-    self.max_segment_size = 5000
-    self.min_segment_size = 500 
-    self.min_group_size = 500
-    self.Nmax = 0
-    self.hyper_priors = {}
-    self.prior_dist = {}
-    self.working_distn = None
-    self.MCMCopts = dict(niter = 100, burnin = 50, sample_freq = 5)
-    self.sampled_models = {}
-    self.changepoint_counts = {}
-    self.observed_states_counts = {}
-    self.model_opts = dict(alphakappa_a_0=50.,
-			    alphakappa_b_0=1.,
-			    rho_a_0=5000.,
-			    rho_b_0=1., # higher rho_a_0 means stickier
-			    gamma_a_0=1.,
-			    gamma_b_0=3.,# higher rho_a_0 means stickier, rhos is the self-transition fraction and is distributed as a Beta
-			    init_state_concentration=50.
-			   )
-    self.model = {}
-    
-  def _restart(self):
-    self.read_data()
-    self.set_hyperpriors()
-    self.define_prior_distns()
-    self.set_model()
-    self.load_data_into_model()
-
-    
-  def set_model(self):
-    import pyhsmm
-    self.model[self.working_distn] =   pyhsmm.models.StickyHMMEigen(
-						      obs_distns     = self.prior_dist[self.working_distn],
-						      alphakappa_a_0=50.,alphakappa_b_0=1.,
-						      rho_a_0=5000.,rho_b_0=1., # higher rho_a_0 means stickier
-						      gamma_a_0=1.,gamma_b_0=3.,# higher rho_a_0 means stickier, rhos is the self-transition fraction and is distributed as a Beta
-						      init_state_concentration=50.)
-						      #alphakappa_a_0 = self.model_opts['alphakappa_a_0'],
-						      #alphakappa_b_0 = self.model_opts['alphakappa_b_0'],
-						      #rho_a_0        = self.model_opts['rho_a_0'],
-						      #rho_b_0        = self.model_opts['rho_b_0'], # higher rho_a_0 means stickier
-						      #gamma_a_0      = self.model_opts['gamma_a_0'],
-						      #gamma_b_0      = self.model_opts['gamma_b_0'],# higher rho_a_0 means stickier, rhos is the self-transition fraction and is distributed as a Beta
-						      #init_state_concentration = 50.)#self.model_opts['init_state_concentration']# this implies a tight prior on gamma controlling the stick-breaking process. The stick-breaking process is a Beta(1,gamma)
-					
-  
-  def load_data_into_model(self,transform = None):
-    print time.ctime(), 'Loading data'
-    for i in self.data.keys():
-      data = self.data[i]
-      if transform is not None:
-	data = transform(data)
-      self.model[self.working_distn].add_data(data)
-    print time.ctime(), '   ... done'
-    
-  def sample(self,save = False,niter = None, freq = None, parallel=False,numtoresample='all',all_at = 0.1, verbose = False):
-    number_of_interations = self.MCMCopts['niter']
-    save_freq =  self.MCMCopts['sample_freq']
-    if niter is not None:
-      number_of_interations = niter
-    if freq is not None:
-      save_freq = freq
-     
-    for itr in progprint_xrange(number_of_interations):
-      if parallel:
-	n_to_resample = numtoresample
-	if (itr % all_at) == 0:
-	  n_to_resample = 'all'
-	self.model[self.working_distn].resample_model_parallel2(numtoresample=n_to_resample)
-      else:
-	self.model[self.working_distn].resample_model()
-      if save:
-	if self.working_distn not in self.sampled_models.keys():
-	  self.sampled_models[self.working_distn] = {}
-	if (itr % save_freq) == 0:
-	  self.sampled_models[self.working_distn][itr] = copy.deepcopy(self.model[self.working_distn])
-      if verbose:
-	m = self.model[self.working_distn] 
-	print itr, m.trans_distn.alpha, m.trans_distn.gamma, m.trans_distn.rho, m.trans_distn.kappa, m.trans_distn.beta.max()
-
-  def _define_data_storage(self):
-    self.changepoint_counts[self.working_distn] = {}
-    self.observed_states_counts[self.working_distn] = {}
-    for idx, s_list in enumerate(self.model[self.working_distn].states_list):
-      self.changepoint_counts[self.working_distn][idx] = np.zeros_like(s_list.data)
-      # observed states at sequences
-      self.observed_states_counts[self.working_distn][idx] = np.zeros((self.Nmax,len(s_list.data)))
-  
-  def _update_changepoint_and_observed_states_counts(self):
-    for m in self.sampled_models[self.working_distn].values():
-      for idx, s_list in enumerate(m.states_list):
-	self.changepoint_counts[self.working_distn][idx] += np.concatenate(( (0,), np.diff(s_list.stateseq) != 0 ))
-	for i, j in enumerate(s_list.stateseq):
-	  self.observed_states_counts[self.working_distn][idx][j,i] += 1
-
-  def _normalize_counts(self):
-    Nmodels = len(self.sampled_models[self.working_distn])
-    for idx in self.observed_states_counts[self.working_distn].keys():
-	for row_idx in xrange(self.observed_states_counts[self.working_distn][idx].shape[0]):
-	  self.observed_states_counts[self.working_distn][idx][row_idx] /= Nmodels
-	self.changepoint_counts[self.working_distn][idx] /= Nmodels
-	
-  def _calculate_average_states_mean_var(self):
-    self.states_moments = {}
-    self.states_params = {}
-    states_means = ''
-    states_var = ''
-    if self.working_distn == 'negative_binomial':
-      Rs = np.zeros(self.Nmax)
-      Ps = np.zeros(self.Nmax)
-      for m in self.sampled_models[self.working_distn].values():
-	for i,distn in enumerate(m.obs_distns):
-	  Rs[i] += distn.r
-	  Ps[i] += distn.p
-      Nmodels = len(self.sampled_models[self.working_distn])
-      Rs /= Nmodels
-      Ps /= Nmodels
-      
-      states_means = [ Rs[i]*Ps[i]/(1-Ps[i]) for i in xrange(self.Nmax) ]
-      states_var   = [ Rs[i]*Ps[i]/(1-Ps[i])**2 for i in xrange(self.Nmax) ]
-      self.states_params[self.working_distn] = {}
-      self.states_params[self.working_distn]['Rs'] = Rs
-      self.states_params[self.working_distn]['Ps'] = Ps
-
-    elif self.working_distn == 'gaussian':
-      MUs = np.zeros(self.Nmax)
-      SIGMASQs = np.zeros(self.Nmax)
-      for m in self.sampled_models[self.working_distn].values():
-	for i,distn in enumerate(m.obs_distns):
-	  MUs[i] += distn.mu
-	  SIGMASQs[i] += distn.sigmasq
-      Nmodels = len(self.sampled_models[self.working_distn])
-      MUs /= Nmodels
-      SIGMASQs /= Nmodels
-      self.states_params[self.working_distn] = {}
-      self.states_params[self.working_distn]['mu'] = MUs
-      self.states_params[self.working_distn]['sigmasq'] = SIGMASQs
-      states_means = MUs 
-      states_var = SIGMASQs
-    else:
-      raise NotImplementedError
-    
-    states_means = np.hstack(states_means)
-    states_var = np.hstack(states_var)
-    self.states_moments[self.working_distn] = np.zeros((self.Nmax,4))
-    self.states_moments[self.working_distn][:,0] = states_means
-    self.states_moments[self.working_distn][:,1] = states_var
-    self.states_moments[self.working_distn][:,2] = states_means + 1.96*np.sqrt(states_var)
-    self.states_moments[self.working_distn][:,3] = states_means - 1.96*np.sqrt(states_var)
-    
-  
-  def _data_median(self):
-    values = pyhsmm.util.stats.flattendata([l.data for l in self.model[self.working_distn].states_list ])
-    weights = pyhsmm.util.stats.flattendata([self.data_files[idx].post_prob_of_cnv.values for idx in self.data_files.keys()])
-    weights -= 1
-    self.data_mean, self.data_variance = self.weighted_avg_and_std(values, weights=weights)
-    #self.data_mean,self.data_variance = self._fixed_effect_combined_stat(self.states_moments[self.working_distn][:,0],self.states_moments[self.working_distn][:,1],np.exp(self.states_log_prob_of_CN1))
-    self._data_lower_ci = self.data_mean - 1.96*np.sqrt(self.data_variance)
-    self._data_upper_ci = self.data_mean + 1.96*np.sqrt(self.data_variance)
-    
-
-  def _states_log_prob_of_CN1(self):
-    Nmodels = len(self.sampled_models[self.working_distn])
-    log_w = np.vstack([ np.apply_along_axis(np.sum,1,self.observed_states_counts[self.working_distn][idx]*Nmodels) for idx in self.observed_states_counts[self.working_distn].keys()  ])
-    if len(self.observed_states_counts[self.working_distn].keys()) > 1:
-      log_w = np.apply_along_axis(np.sum,0,log_w)      
-    else:
-      log_w = log_w[0]
-    log_w = np.log(log_w)
-    log_w -= np.logaddexp.reduce(log_w)
-    self.states_log_prob_of_CN1 = log_w
-
-  def _post_prob_of_cnv(self):
-    print time.ctime(),"   Calculating prob of CNV" 
-    for idx in self.observed_states_counts[self.working_distn].keys():
-      self.data_files[idx]['post_prob_of_cnv'] = np.hstack([ np.dot(self.observed_states_counts[self.working_distn][idx][:,row_idx],self.observed_states_counts[self.working_distn][idx][:,row_idx]) - np.dot(self.observed_states_counts[self.working_distn][idx][:,row_idx], np.exp(self.states_log_prob_of_CN1)) for row_idx in xrange(len(self.model[self.working_distn].states_list[idx].data)) ])
-      self.data_files[idx]['post_prob_of_cnv'][ self.data_files[idx]['post_prob_of_cnv'] < 0 ] = 0
-      self.data_files[idx]['post_prob_of_cnv'][ self.data_files[idx]['post_prob_of_cnv'] > 1 ] = 1
-	
-  def _mean_ci_for_cnv(self):
-    for idx in self.observed_states_counts[self.working_distn].keys():
-	print time.ctime(),"   Working on dataset [", idx+1,"] of [",self.Ndata_files,"]" 
-	self.data_files[idx]['cp_count'] = self.changepoint_counts[self.working_distn][idx]
-	for col in ['cnv_call','cnv_call_lower_ci','cnv_call_upper_ci' ]:
-	  self.data_files[idx][col] = np.nan
-	for row_idx in xrange(self.observed_states_counts[self.working_distn][idx].shape[1]):
-	  # calculate mean and sd of CNVs
-	  m,v = self._fixed_effect_combined_stat(self.states_moments[self.working_distn][:,0],self.states_moments[self.working_distn][:,1],self.observed_states_counts[self.working_distn][idx][:,row_idx])
-	  m_lci = m - 1.96*np.sqrt(v)
-	  m_uci = m + 1.96*np.sqrt(v)
-	  # store the cnv params
-	  self.data_files[idx]['cnv_call'][row_idx] = np.double(m)/np.double(self.data_mean)
-	  self.data_files[idx]['cnv_call_lower_ci'][row_idx] = m_lci/self.data_mean # self._data_lower_ci 
-	  self.data_files[idx]['cnv_call_upper_ci'][row_idx] = m_uci/self.data_mean # self._data_upper_ci
-  
-  def _summarize_cnv_regions(self,x,idx):
-    back = np.zeros(9)
-    # chromosome
-    back[0] = x.ix[x.irow(0).name,'chr']
-    # start
-    back[1] = x.ix[x.irow(0).name,'start']
-    # end
-    back[2] = x.ix[x.irow(len(x)-1).name,'end']
-    # n_data_points
-    back[3] = len(x)
-    # size  
-    back[4] = back[2] - back[1]
-    # mean and var
-    w = np.apply_along_axis( np.sum,1,self.observed_states_counts[self.working_distn][idx][:,x.index] )
-    w /= w.sum()
-    cn_m, cn_v = self._fixed_effect_combined_stat(self.states_moments[self.working_distn][:,0],self.states_moments[self.working_distn][:,1],w)
-    back[5] = cn_m/self.data_mean
-    back[6] = (cn_m - 1.96*np.sqrt(cn_v))/self._data_lower_ci
-    back[7] = (cn_m + 1.96*np.sqrt(cn_v))/self._data_upper_ci
-    cnv_post_prob = np.dot(w,w) - np.dot(np.exp(self.states_log_prob_of_CN1),w) #1 - np.exp(np.apply_along_axis( np.logaddexp.reduce,1,self.states_log_prob_of_CN1 + np.log(w) ) )
-    back[8] = cnv_post_prob
-    return pd.DataFrame(back).T
-
-  def _find_segment_positions(self,prob=0.9, min_size=0,max_size=np.inf, filter_overlap_1 = True, filter_genotype_eq_1 = True):
-    calls = {}
-    for file_idx in self.data_files.keys():
-      f = self.data_files[file_idx]
-      cnv_regions = f[f.post_prob_of_cnv > prob]
-      step = np.diff(cnv_regions.start.values)
-      normal_step = np.median(step)
-      cnv_regions['step'] = 0
-      cnv_regions['step'][1:] = step
-      ids = 0
-      cnv_regions['cnv_id'] = 0
-      for i in xrange(len(cnv_regions)):
-	true_row_id = cnv_regions.irow(i).name 
-	if cnv_regions.ix[true_row_id,'step'] > normal_step:
-	  ids +=1
-	cnv_regions.ix[true_row_id,'cnv_id'] = ids
-
-      cnv_regions = cnv_regions.groupby('cnv_id').apply(lambda x: self._summarize_cnv_regions(x,file_idx))
-      if cnv_regions.empty:
-	break
-      cnv_regions.columns = ['chromosome','start','end','n_dat_point','size','mean_cnv','lower_ci_cnv','upper_ci_cnv','cnv_post_prob']
-      cnv_regions['genotype'] = np.round(cnv_regions['mean_cnv']/0.5)*0.5
-	# filter
-      cnv_regions = cnv_regions[cnv_regions.size > min_size]
-      cnv_regions = cnv_regions[cnv_regions.size < max_size]
-      if filter_overlap_1:
-	both_over_one = (cnv_regions.lower_ci_cnv > 1) * (cnv_regions.upper_ci_cnv > 1) == True
-	both_under_one = (cnv_regions.lower_ci_cnv < 1) * (cnv_regions.upper_ci_cnv < 1) == True
-	cnv_regions = cnv_regions[both_over_one +  both_under_one]
-      if filter_genotype_eq_1:
-	cnv_regions = cnv_regions[cnv_regions.genotype != 1]
-      calls[file_idx] = cnv_regions.reset_index()
-    self.cnv_call = {}
-    self.cnv_call[self.working_distn] = calls
-  
-  def process_samples(self):
-    print time.ctime(),"Processing samples from the posterior"
-    self._define_data_storage()
-    # count number of times at each data point there is a change in state
-    #self._update_changepoint_counts()
-    #self._update_observed_states_counts()
-    self._update_changepoint_and_observed_states_counts()
-    # Normalize the counts to be frequencies.
-    self._normalize_counts()
-    # calculate the prob for each post-sample of a state representing copy number 1
-    self._states_log_prob_of_CN1()
-    # calculate prob of cnv at each data point
-    self._post_prob_of_cnv()
-    # calculate the mean and var of the states
-    self._calculate_average_states_mean_var()
-    # calculate the median of the data
-    if not hasattr(self,'data_mean'):
-      self._data_median()
-    print time.ctime(),"Summarising CNV calls"
-    self._mean_ci_for_cnv()
-    print time.ctime(), "Calling CNVs"
-    self._find_segment_positions()
-    print time.ctime(), "   ... done"
-
-
-  def focus_on_cnvs(self,prob = 0.5,flank = 100, burnin=0,niter=0,freq=0):
-    self.process_samples()
-    self.sampled_models = {}
-    self.observed_states_counts = {}
-    self.changepoint_counts = {}
-    if not hasattr(self ,'_original_data'):
-      self._original_data = {}
-      self._last_states_list = []
-      self._cnv_regions = {}
-      for idx in self.data_files.keys():
-	self._original_data[idx] = self.data_files[idx].copy()
-      self._last_states_list = copy.deepcopy(self.model[self.working_distn].states_list)
-	
-    for idx in self.data_files.keys():
-      # replace the original data file table to be able to call cnvs next time without problems
-      indexes = self.data_files[idx][self.data_files[idx].post_prob_of_cnv > prob].index.values
-      indexes = np.unique(np.ravel([[range(i,i+flank),range(i-flank,i)] for i in indexes ]))
-      indexes = indexes[indexes >= 0]
-      indexes = indexes[indexes <= np.max(self.data_files[idx].index.values)]
-      self.data_files[idx] = self.data_files[idx].ix[indexes,:]
-      # store the data index for the cnv regions
-      self._cnv_regions[idx] = self.data_files[idx].index.values
-      # update the data stored on the model 
-      self.model[self.working_distn].states_list[idx].stateseq = self.model[self.working_distn].states_list[idx].stateseq[ self._cnv_regions[idx] ]
-      self.model[self.working_distn].states_list[idx].data = self.model[self.working_distn].states_list[idx].data[ self._cnv_regions[idx] ]
-      # reset index of table
-      if hasattr(self.data_files[idx] ,'level_0'):
-	self.data_files[idx] = self.data_files[idx].drop(['level_0'],1)
-      self.data_files[idx] = self.data_files[idx].reset_index()
-      
-    if burnin > 0:
-      self.sample(niter = burnin,save=False)
-    if niter > 0:
-      self.sample(niter = niter,save=True, freq = freq)
-      self.process_samples()
-
-  def unfocus_on_cnvs(self,burnin=0,niter=0,freq=0):
-    # save the state seq we have now
-    focused_state_list = copy.deepcopy(self.model[self.working_distn].states_list)
-    # recover the last states sequence
-    self.model[self.working_distn].states_list = copy.deepcopy(self._last_states_list)
-    # empty this object
-    self._last_states_list = []
-    # overwrite the state values for the data points where we have been working on
-    for idx in xrange(len(self.model[self.working_distn].states_list)):
-      self.model[self.working_distn].states_list[idx].stateseq[ self._cnv_regions[idx] ] = focused_state_list[idx].stateseq
-
-    # recover the original data
-    self.data_files = self._original_data
-    self._original_data = {}
-    
-    # if desired, run some sampling to update the genome-wide estimates
-    if burnin > 0:
-      self.sample(niter = burnin,save=False)
-    if niter > 0:
-      self.sampled_models = {}
-      self.sample(niter = niter,save=True, freq = freq)
-      self.process_samples()
-
-    
-  def _sample_nbinomial(r,p,size=1):
-    return np.random.poisson(np.random.gamma(r,p/(1-p),size=size))
-
-    
-  def set_files(self,files):
-    self.files = files
-  
-  def define_prior_distns(self,distn = None):
-    import pyhsmm
-    if distn is not None:
-      self.working_distn = distn
-    if self.working_distn == 'negative_binomial':
-      self.prior_dist[self.working_distn] = [pyhsmm.distributions.NegativeBinomial(**self.hyper_priors[self.working_distn]) for idx in range(self.Nmax)]
-    elif self.working_distn == 'gaussian':
-      self.prior_dist[self.working_distn] = [pyhsmm.distributions.ScalarGaussianNonconjNIX(**self.hyper_priors[self.working_distn]) for idx in range(self.Nmax)]
-    else:
-      print "Distribution [",distn,"] not implemented"
-    
-
-  def set_Nmax(self,Nmax = 0):
-    if Nmax == 0:
-      alldata = np.hstack([ self.data[i] for i in self.data.keys()])
-      self.Nmax = int(2*(round(alldata.max()/np.mean(alldata)/0.5) + 1))
-      # put a limit on the maximum number of states
-      if self.Nmax > 100:
-	  self.Nmax = 100
-    else:
-      self.Nmax = Nmax
-
-    print time.ctime(), "Using weak limit approximation with a max number of states of [",self.Nmax,"]"
-
-
-  def set_hyperpriors(self,distn = None,tolerance = 0.4):
-    import pyhsmm
-    import pyhsmm.basic.distributions as distributions
-    import scipy as sp
-    import scipy.stats as ss
-    import random
-    if distn is not None:
-      self.working_distn = distn
-
-    print time.ctime(), "Getting hyper-prior estimates from the data"
-    alldata = np.hstack([ self.data[i] for i in self.data.keys()])
-    ratio = alldata[alldata > 0]/np.median(alldata[alldata > 0])
-    alldata = alldata[ratio > 0.5 + tolerance ]
-    ratio = alldata[alldata > 0]/np.median(alldata[alldata > 0])
-    alldata = alldata[ratio < 1.5 - tolerance]
-    if self.working_distn == 'negative_binomial':
-      Rs = []
-      Ps = []
-      nb = distributions.NegativeBinomial(5.,20.,1.,1.)
-      small_data = np.array(random.sample(alldata,1000))# np.random.poisson(np.median(alldata),100) 
-      # short burnin
-      for i in xrange(1000):
-	  nb.resample(small_data)
-
-      for i in xrange(100):
-	  nb.resample(small_data)
-	  Rs.append(nb.r)
-	  Ps.append(nb.p)
-      fit_gamma_alpha,fit_gamma_loc,fit_gamma_beta=ss.gamma.fit(Rs,loc=0,scale=1)
-      fit_beta_a, fit_beta_b, fit_beta_loc, fit_beta_scale = ss.beta.fit(Ps,loc=0,scale=1)
-      self.hyper_priors[self.working_distn] = { 'k_0': fit_gamma_alpha, 'theta_0': fit_gamma_beta , 'alpha_0': fit_beta_a, 'beta_0': fit_beta_b }
-    elif self.working_distn == 'gaussian':
-      log_data = np.log(alldata[alldata > 0])
-      del(alldata)
-      means_prior_mean = np.mean(log_data)
-      means_prior_sd = np.max(log_data) - 0
-      self.hyper_priors[self.working_distn] = dict(
-                      mu_0=means_prior_mean, tausq_0=(1/(means_prior_sd**2)), # mean and variance for the prior for means
-                      sigmasq_0=1, nu_0=1) # mean and number of pseudo-observations for variances
-                                                   # setting a large sigmasq_0 and increasing
-                                                   # nu_0 will encourage less segmentation
-      
-    print time.ctime(), "   ... done"
-
-  def read_data(self):
-    datatospread = {}
-    counter = 0
-    print time.ctime(), len(self.files), " files to load"
-    data_files = {}
-    for file in self.files:
-      table = pd.read_table(file,names=['chr','start','end','counts'],sep="\t",na_values=['inf','-inf'])
-      table['counts'] = table['counts'].astype(float)
-      # FILTER DATA
-      # TODO: this set NaN to 0 needs to change later on!!!
-      table = table.fillna(0)
-      # remove infinity values
-      table = table[table['counts'] != np.inf]
-      table = table[table['counts'] != -np.inf]
-      # rm rows with counts == 0
-      table = table[table['counts'].isnull() == False]
-      # rm values < 0. This may need to change if using some other type of values
-      table = table[table['counts'] >=0 ]
-      # remove values way off the mean?
-      #table = table[table['counts'] < 1000*np.median(table['counts'][table['counts'] > 0] )]
-      if self.split_in_gaps:
-	steps = np.diff(table['start'].values)
-	normal_step = np.median(steps)
-	if self.min_gap_size > 0:
-	  normal_step = self.min_gap_size
-	group_size = 0
-	labels = np.zeros_like(table['counts'].values)
-	for i,s in enumerate(steps):
-	  if s > normal_step:
-	    if group_size > self.min_group_size:
-	      counter +=1
-	      group_size = 0
-	    else:
-	      group_size += 1
-	  else:
-	    group_size +=1
-	  labels[i] = counter
-	# assign last data point to last cluster
-	labels[-1] = counter
-	# assign data to dict to load in data model later on
-	table['group_label'] = labels
-	if group_size < self.min_group_size:
-	  table['group_label'][table['group_label'] == counter] = counter-1
-	  counter -= 1
-      else:
-	table['group_label'] = counter
-	
-      table_grouped = table.groupby('group_label')
-
-      groups_idx = table_grouped.indices
-      for idx in groups_idx:
-	idx = int(idx)
-	datatospread[idx] = table['counts'][ groups_idx[idx] ].values.astype(int)
-	data_files[idx] = table_grouped.get_group( idx ).reset_index()
-      # increase counter to start new files on new data sequence
-      counter += 1
-
-    self.data = datatospread
-    self.data_files = data_files    
-    self.Ndata_files = len(self.data)
-    self.rename_chromosomes()
-    print time.ctime(), "   data loaded into [",len(self.data),"] data sequences"
-    print time.ctime(), "   ... done"
-
-  def rename_chromosomes(self):
-    alt_name_dict = {'X' : 23,'Y' : 24,'XY':25,'MT':26,'20':202}
-    for idx in self.data_files.keys():
-      for alt_name in alt_name_dict.keys():
-	self.data_files[idx].chr.values[self.data_files[idx].chr.values.astype(str) == alt_name] = alt_name_dict[alt_name]
-
-
-  def make_bed(self,filename):
-    if len(self.cnv_call[self.working_distn]) == 0:
-      print time.ctime(), "No CNVs detected"
-    else:
-      calls = pd.concat(self.cnv_call[self.working_distn].values())
-      calls = calls.sort(['chromosome','start'])
-      cols_to_print = ['chromosome','start','end','n_dat_point','size','mean_cnv','lower_ci_cnv','upper_ci_cnv','cnv_post_prob','cnv_post_prob_lower_ci','cnv_post_prob_upper_ci','genotype']
-      calls.ix[:,cols_to_print].to_csv(filename, header=True,index=None,sep="\t")
-
-  def print_summary(self,filename,how='all'):
-    data = pd.concat(self.data_files.values())
-    if how == 'all':
-      out = ''.join((filename,'.cnv_calls_summary.txt'))
-      data.to_csv(out,header=True,index=None,sep="\t")
-    elif how == 'by_chr':
-      index = data.groupby('chr').indices
-      for chr_id in index.keys():
-	out = ''.join((filename,'.cnv_calls_summary.',str(chr_id),'.txt'))
-	data.ix[index[chr_id],:].to_csv(out,header=True,index=None,sep="\t")
-
-  
-
-  def find_segment_positions(data_files, prob=0.9, min_size=0,max_size=np.inf, observed_states = observed_states, genome_average = genome_average_mean,log_w=log_w,states_means=states_means):
-    calls = {}
-    for file_idx in data_files.keys():
-      f = data_files[file_idx]
-      cnv_regions = f[f.post_prob_of_cnv > prob]
-      step = np.diff(cnv_regions.start.values)
-      normal_step = np.median(step)
-      cnv_regions['step'] = 0
-      cnv_regions['step'][1:] = step
-      ids = 0
-      cnv_regions['cnv_id'] = 0
-      for i in xrange(len(cnv_regions)):
-	true_row_id = cnv_regions.irow(i).name 
-	if cnv_regions.ix[true_row_id,'step'] > normal_step:
-	  ids +=1
-	cnv_regions.ix[true_row_id,'cnv_id'] = ids
-
-      cnv_regions = cnv_regions.groupby('cnv_id').apply(lambda x: summarize_cnv_regions(x,observed_states[file_idx],genome_average = genome_average_mean,log_w=log_w,states_means=states_means))
-      cnv_regions.columns = ['chromosome','start','end','n_dat_point','size','mean_cnv','lower_ci_cnv','upper_ci_cnv','cnv_post_prob','cnv_post_prob_lower_ci','cnv_post_prob_upper_ci']
-      cnv_regions['genotype'] = np.round(cnv_regions['mean_cnv']/0.5)*0.5
-      # filter
-      cnv_regions = cnv_regions[cnv_regions.size > min_size]
-      cnv_regions = cnv_regions[cnv_regions.size < max_size]
-      calls[file_idx] = cnv_regions.reset_index()
-    return calls
-
-      
-  def _changepoint_indicators_from_stateseq(stateseq):
-      '''
-      returns a binary array "ret" of the same size as stateseq
-      where ret[i]=1 iff there's a changepoint between times i-1 and i
-      '''
-      return np.concatenate(( (0,), np.diff(stateseq) != 0 ))
-
-  def approx_k_0(x):
-    ''' Will make the assumption that p = 0.5, corresponding to a flat prior on p, i.e., Beta(1,1).
-	and that theta_0 = k_0/2
-    '''
-    return np.sqrt(mean_counts*2) + 0.001
-
-
-  def bma(mu, sd, pr_M):
-    average = np.average(mu, weights=pr_M)
-    var_avg = -1*average**2
-    for k in xrange(len(pr_M)):
-      var_avg += (sd[k] + mu[k]**2)*pr_M[k]
-    return average, var_avg
-
-  def weighted_avg_and_std(self,values, weights):
-      """
-	  Returns the weighted average and variance.
-	  
-	  values, weights -- Numpy ndarrays with the same shape.
-	  """
-      average = np.average(values, weights=weights)
-      variance = np.dot(weights, (values-average)**2)/weights.sum()  # Fast and numerically precise
-      return average, variance
-    
-  def _fixed_effect_combined_stat(self,mu,sigma,w):
-    b_fixed = np.dot(mu,w)/w.sum()
-    v_fixed = np.dot(w*w,sigma)
-    return b_fixed, v_fixed
-    
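Note on the removed cnv_caller.py~ above: _sample_nbinomial draws negative binomial counts as a Gamma-Poisson mixture, and _calculate_average_states_mean_var summarises each state with the closed-form moments mean = r*p/(1-p) and variance = r*p/(1-p)^2. A minimal, self-contained sketch (not part of the repository; r, p and the sample size are illustrative values) checking that the simulated draws match those moments:

import numpy as np

r, p = 5.0, 0.6                                        # illustrative negative binomial parameters
lam = np.random.gamma(r, p / (1.0 - p), size=200000)   # Gamma(shape=r, scale=p/(1-p)), as in _sample_nbinomial
x = np.random.poisson(lam)                             # Poisson draws mixed over Gamma give negative binomial counts
print(x.mean(), r * p / (1.0 - p))                     # empirical vs closed-form mean r*p/(1-p)
print(x.var(), r * p / (1.0 - p) ** 2)                 # empirical vs closed-form variance r*p/(1-p)^2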

utilities/functions.pyc

Binary file removed.

utilities/functions.py~

-import pandas as pd
-import numpy as np
-import time 
-import copy
-
-
-def plot_cnv_profile(table,cnv_segments=None,data_mean = None):
-    from matplotlib import pylab as plt
-    # get the range of data on x and y axis
-    Xs = (table.end.values + table.start.values)/2
-    if data_mean is None:
-      data_mean = np.median(table['counts'])
-    Ys = table['counts']/data_mean
-    xmin, xmax = Xs.min(), Xs.max()
-    ymin, ymax = Ys.min(), Ys.max()
-    ymin = 0
-    plt.xlim(xmin=xmin,xmax=xmax)
-    # add horizontal grid lines
-    for cn in set([0.5/5*y_i for y_i in xrange(0, 1 + int(round(ymax+0.5))*10,5)]):
-      plt.axhline(y=cn,color='gray', linestyle='--')
-    # plot the raw count data normalized by the genome median
-    plt.plot(Xs,Ys,'g+')
-    
-    # plot the confidence interval shaded area for the posterior probability of being a CNV
-    #plt.fill_between(x=Xs,y1=table['post_prob_of_cnv_upper_ci'].values,y2=table['post_prob_of_cnv_lower_ci'].values, color='red', alpha=0.2, edgecolor='white' );
-    # plot the post-prob of being a CNV
-    plt.plot(Xs,table['post_prob_of_cnv'],'r-');
-    
-    # plot the estimated mean and CI for each bin
-    plt.errorbar(Xs,table['cnv_call'],yerr = [table['cnv_call_lower_ci']-table['cnv_call'], table['cnv_call']-table['cnv_call_upper_ci']  ], fmt='co');
-    if type(cnv_segments) == pd.core.frame.DataFrame:
-      plt.hlines(y=cnv_segments.genotype,colors='purple', linewidth = 2,linestyle='solid',xmin=cnv_segments.start,xmax=cnv_segments.end)
-      for row in cnv_segments.iterrows():
-	plt.fill_between(x=[row[1][3],row[1][4]], y1=row[1][9], y2=row[1][8],color='purple', alpha=0.2, edgecolor='white' )
-    ## make the plot
-    plt.show()
-
-
-def start_ipcluster(N,profile = None):
-  import os
-  if profile is None:
-    profile = ''.join(('PID',str(os.getpid())))
-  print time.ctime(),'Starting Ipython Cluster with [',N,'] engines and profile name [',profile,']'
-  cmd = ''.join(('ipcluster start --daemonize=True --n=',str(N),' --profile=',profile))
-  os.system(cmd)
-  os.system('sleep 10')
-  return profile
-
-def stop_ipcluster(profile=None):
-  import os
-  if profile is None:
-    profile = ''.join(('PID',str(os.getpid())))
-  print time.ctime(),'Stopping Ipython Cluster with profile name [',profile,']'
-  cmd = ''.join(('ipcluster stop --profile=',profile))
-  os.system(cmd)
-
-    
-def unique_rows2(A):
-  import pandas as pd
-  return pd.DataFrame(A.T).drop_duplicates().T.values
-   
-def ratio_of_gaussians(mu_z,sigma_z,mu_w,sigma_w,rho):
-  s = rho * sigma_z/sigma_w
-  h = sigma_z *((1 - rho*rho)**0.5)
-  r = sigma_w/h
-  a =  (mu_z - s*mu_w)/h  #(mu_z/sigma_z - rho*(mu_w/sigma_w))/(1-rho*rho)**0.5
-  b = mu_w/sigma_w
-  if a < 0:
-    a = -a
-    r = -r
-  r_mu = a/(1.01*b - 0.2713)
-  r_sigma = (a**2 + 1)/(b**2 + 0.108*b  - 3.795) - r_mu**2 
-  r_mu = s+r_mu/r
-  r_sigma = np.sqrt(r_sigma)/abs(r)
-  return r_mu, r_sigma
-
-
-def g_t(t,mu_z,sig_z,mu_w,sig_w,rho):
-  v=np.sqrt(sig_z*sig_z-2*t*sig_z*sig_w*rho+t*t*sig_w*sig_w)
-  w=(t*mu_w-mu_z)/v
-  v=(sig_z*(mu_w*sig_z-mu_z*rho*sig_w)+t*sig_w*(mu_z*sig_w-sig_z*rho*mu_w))/(v*v*v)
-  return .39894228*exp(-.5*w*w)*v
-
-def marsaglia():
-  muz=30.5;muw=32;sigz=5;sigw=4;rho=.8;
-  s=rho*sigz/sigw;
-  r=sigw/(sigz*sqrt(1-rho*rho));
-  b=muw/sigw;
-  a=(muz/sigz-rho*muw/sigw)/sqrt(1-rho*rho);
-  if(a<0):
-    a=-a;
-    r=-r;
-  print 'a=',a,';b=',b, ';s=',s,';r=',r
-  appmu=a/(1.01*b-.2713);
-  appsig=np.sqrt((a*a+1)/(b*b+.108*b-3.795)-appmu*appmu);
-  return appmu, appsig
-
-
-
-
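Note on the removed functions.py~ above: ratio_of_gaussians and marsaglia implement Marsaglia's moment approximation for the ratio of two correlated Gaussians. A minimal Monte Carlo sketch (not from the repository; it reuses the example constants hard-coded in marsaglia) that produces the empirical mean and standard deviation the approximation is meant to reproduce:

import numpy as np

mu_z, mu_w, sig_z, sig_w, rho = 30.5, 32.0, 5.0, 4.0, 0.8     # constants from marsaglia()
cov = [[sig_z**2, rho*sig_z*sig_w],
       [rho*sig_z*sig_w, sig_w**2]]                           # covariance of the correlated pair (z, w)
z, w = np.random.multivariate_normal([mu_z, mu_w], cov, size=200000).T
ratio = z / w                                                 # the ratio whose moments the approximation targets
print(ratio.mean(), ratio.std())                              # empirical mean and sd of z/w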

utilities/genomeDBs.pyc

Binary file removed.

utilities/plot.pyc

Binary file removed.

utilities/plot.py~

Empty file removed.