Commits

Inti Pedroso committed 291ff6b

all bones of single position correction for GC are implemented. TODO: write code to correct the counts by looping over the bam file ... this will be slow ... ouch!

  • Participants
  • Parent commits 21936f4
  • Branches feature/correction_gcv2

Comments (0)

Files changed (2)

File utilities/cnv_caller.py

 	  self.observed_states_counts[self.working_distn][idx][row_idx] /= Nmodels
 	self.changepoint_counts[self.working_distn][idx] /= Nmodels
 	
+    
   def _calculate_average_states_mean_var(self):
     self.states_moments = {}
     self.states_params = {}

File utilities/functions.py

     cnvs = pd.read_table(''.join((filename,str(chr),'.bed')))
     utilities.functions.make_big_bed(d,cnvs,filename=''.join((filename,str(chr),'.bedGraph')))
 
+
def gc_content(seq):
  """Return the fraction of G/C bases in *seq* as a float in [0, 1].

  Fixes two defects of the original:
  - counts bases case-insensitively, so soft-masked (lower-case)
    sequence returned by pysam's Fastafile.fetch is handled correctly
    instead of being reported as GC = 0;
  - returns 0.0 for an empty sequence instead of raising
    ZeroDivisionError.
  """
  if not seq:
    return 0.0
  # Normalise case once; reference FASTA may be soft-masked.
  seq = seq.upper()
  return np.sum([seq.count(letter) for letter in ('G', 'C')]) / float(len(seq))
+
def get_pos_ones(chrom,size):
  # Record, for *chrom*, the number of positions in [1, size] whose
  # mappability score in the bigWig track is exactly 1.  Result is
  # stored by side effect into ones_size[chrom]; nothing is returned.
  # NOTE(review): `ones_size` (dict) and `bwh` (BigWigFile) are not
  # parameters -- as written in this commit they are locals of
  # site_model_gc_correction, so this function will raise NameError
  # unless they are made module-level. Confirm intended scoping.
  # NOTE(review): the .astype(np.int32) before len() is a no-op for
  # counting -- presumably left over from a version that kept the
  # positions themselves.
  ones_size[chrom] = len(np.where(bwh.get_as_array(chrom, 1,size) == 1)[0].astype(np.int32))
+
def summarise_by_chromosome(chrom, precision = 1000):
  """Sample uniquely-mappable positions on *chrom* and summarise read
  counts binned by approximate GC content.

  Returns a DataFrame with one row per GC bin: 'chrom', 'S_gc' (the
  integer GC bin label), 'N_gc' (number of sampled sites in the bin)
  and 'F_gc' (summed read counts over those sites).

  NOTE(review): depends on many names from an enclosing scope
  (ones_size, all_one_nt, target, c_size, bwh, ref, a, l, samfile,
  random) -- in this commit they are locals of
  site_model_gc_correction, so this will NameError unless promoted to
  globals. Confirm before use.
  """
  # Sample this chromosome proportionally to its share of the
  # genome-wide mappability-1 positions, targeting `target` sites total.
  n_sample = int(ones_size[chrom]/float(all_one_nt)*target)
  chrom_size = c_size[c_size.chrom == chrom]['size'].values[0]
  # Draw n_sample random positions whose mappability score is exactly 1.
  # NOTE(review): random.sample on a numpy array -- works on Python 2
  # but is fragile; verify.
  ctable = pd.DataFrame( { 'chromosome': chrom, 'gc':np.nan,  'position': random.sample(np.where(bwh.get_as_array(chrom, 1,chrom_size) == 1)[0].astype(np.int32),n_sample)})
  # GC fraction of the l-bp window starting a bp after each sampled
  # position; chromosome names are 'chr'-stripped to match the
  # reference FASTA naming.
  ctable['gc'] = ctable.apply(lambda x: gc_content(ref.fetch(reference=x['chromosome'].strip('chr'), start=x['position'] + a, end=x['position'] + a + l))  ,axis=1)
  # Discretise GC fraction into integer bins (precision bins over [0,1]).
  ctable['gc_approx'] = (ctable['gc']*precision).astype(int)
  # Number of reads overlapping the single base at each sampled position.
  ctable['counts'] = ctable.apply(lambda x: samfile.count(reference=x['chromosome'].strip('chr'), start=x['position'], end=x['position'] +1), axis=1)
  ctable_grouped = ctable.groupby('gc_approx')
  # One summary row per GC bin (pd.unique(group.gc_approx) is a single
  # value per group since we grouped on that column).
  return pd.concat(pd.DataFrame({'chrom':chrom,'S_gc': pd.unique(group.gc_approx), 'N_gc': len(group),'F_gc': np.sum(group.counts.values)}) for name, group in ctable_grouped)
+
def site_model_gc_correction(self):
  """Estimate a per-GC-bin read-count rate (lambda_gc) for
  single-position GC-content correction.

  Scheme: sample uniquely-mappable sites genome-wide (mappability == 1
  in a 100-mer bigWig track), bin each sampled site by the GC content
  of its surrounding window, and compute the mean observed read count
  per GC bin.  (This is the sampling-based lambda_gc estimate in the
  style of Benjamini & Speed's GC correction.)

  NOTE(review): this function may continue past the visible diff hunk;
  review notes below cover only what is shown here.
  """
  import pysam
  import bx.bbi.bigwig_file
  import cruzdb
  import pandas as pd
  import numpy as np
  import tables
  import random
  import time

  # Autosomes only; sex chromosomes are excluded from the GC model.
  chromosomes_to_use = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21','chr22']

  # read bigWig file with mappability scores
  bwh = bx.bbi.bigwig_file.BigWigFile(open("wgEncodeCrgMapabilityAlign100mer.bigWig", "rb"))

  # read chromosome sizes
  c_size = pd.read_table('resources/hg19.genome')
  print time.ctime(),"   '-> Extrating position mapability equal to 1"
  # get positions with mappability equal to 1
  # NOTE(review): get_pos_ones writes into ones_size/bwh by name, but
  # those are locals here -- the helper will not see them unless it is
  # nested in this function or they become globals. Confirm.
  ones_size = {}
  for chrom in chromosomes_to_use:
    # apply() used purely for its side effect on ones_size.
    c_size[c_size.chrom == chrom].apply(lambda x:  get_pos_ones(x['chrom'],x['size']) ,axis=1 )

  all_one_nt = np.sum(ones_size.values())
  # set how many sites you want to sample in total
  target = 50000
  # a = offset from the sampled position; l = GC window length (bp).
  a = 0
  l = 100
  # read reference genome in fasta format
  # NOTE(review): return value is discarded -- `ref`, which
  # summarise_by_chromosome needs, is never bound. Likely should be
  # `ref = pysam.Fastafile('human_g1k_v37.fasta')`.
  pysam.Fastafile('human_g1k_v37.fasta')
  # read bam file
  samfile = pysam.Samfile( "NA12878/NA12878.mapped.ILLUMINA.bwa.CEU.low_coverage.20120522.bam", "rb" )

  # NOTE(review): `chrom` here is the stale loop variable from the
  # mappability loop above (always 'chr22'); the message is misleading.
  print time.ctime(),"   '-> Extrating summary for alignments in chromosome [",chrom,"]"

  summary_table = pd.concat([summarise_by_chromosome(chrom,precision = 10) for chrom in chromosomes_to_use])
  # Collapse per-chromosome rows into genome-wide totals per GC bin.
  S_gc_table = summary_table.groupby('S_gc').apply(lambda x: x[['F_gc','N_gc']].sum(axis=0,numeric_only=True))
  # NOTE(review): float() of a pandas Series is invalid for length > 1
  # and the trailing .astype(float) on a Python float is a no-op;
  # likely intended: S_gc_table['F_gc'] / S_gc_table['N_gc'].astype(float).
  S_gc_table['lambda_gc'] = S_gc_table['F_gc']/float(S_gc_table['N_gc']).astype(float)