Commits

Inti Pedroso committed bd785f1

new version of normalise script

  • Parent commit 10c389c
  • Branches: develop, feature/correction_gcv2, feature/docs, release/0.01

Comments (0)

Files changed (2)

File normalise_counts.py

+#import numpy as np
+#import os
+#import pandas as pd
+#import pybedtools
+#from Bio import SeqIO
+#import sys
+
+
+import argparse
+#import functions
+
 import numpy as np
-import os
 import pandas as pd
-import pybedtools
-from Bio import SeqIO
-import sys
+import pysam
+import time
 
-def mean_from_wig(chr_name,start,end):
-  return np.median([ data[2] for data in bw.get(chr_name,int(start), int(end))  ])
+print time.ctime(), "Started"
 
-def normalise_by_gc(table):
-  table['coverage'] + 1
-  median_all_gc = np.median(table.ix[table['coverage']>0,'coverage'])
-  table['int_pct_gc'] = (bins_plus_nucl_comp['pct_gc']*100).astype(int)
-  median_by_gc = table.groupby('int_pct_gc').apply(lambda x: np.median(x['coverage']))
-  table['coverage_gc'] = table.apply(lambda x: x['coverage']*(median_all_gc/median_by_gc[x['int_pct_gc']]),axis=1)
-
-  #table['mean_mappability'] = table.apply(  lambda x:  mean_from_wig(x['chr'], x['start'],x['end']),  axis=1)
-  #table = table.fillna(0)
-  #table['mean_mappability'] = (table['mean_mappability']*100).astype(int)
-  #median_all_mappability = np.median(table.ix[table['coverage_gc']>0,'coverage_gc'])
-  #median_by_mappability = table.groupby('mean_mappability').apply(lambda x: np.median(x['coverage_gc']))
-  #table['coverage_gc_mappability'] = table.apply(lambda x: x['coverage_gc']*(median_all_mappability/median_by_mappability[x['mean_mappability']]),axis=1)
-  table['normalised'] = table['coverage_gc']
-  #table = table.fillna(0)
-  return table
-
-def read_cov_table(file):
-  table = pd.read_table(file, names=['chr','start','end','coverage','cov_bp','region_length','frac_bases_covered'])
-  #table['bin_mean_pos'] = (table['start'] + table['end'])/2
-  table['log_cov'] = np.log10(table['coverage'] + 1)
-  #table = table.reindex_axis(table.ix[['chr','start','end']],axis=1)
-  return table
+parser = argparse.ArgumentParser(description='Call copy number variants using a sticky Hidden Markov Model')
+
+# Input Files
+parser.add_argument('--bam',required=True)
+parser.add_argument('--bed',default=0)
+
+#Output files
+parser.add_argument('--output', required=True)
+
+#Options
+parser.add_argument('--chr', required=False)
+parser.add_argument('--ref', default = 0)
+parser.add_argument('--ref_folder',default = 0)
+parser.add_argument('--win_size',type=int,default=1000)
+args = parser.parse_args()
 
 def make_bins(chr,max_value,size):
   end = max_value - 1
   return b
 
 
-genome_fasta = "human_g1k_v37.fasta"
-win_size = int(sys.argv[2])
-#print 'Retrieving chromosome size from bam file'
-import pysam
-f = pysam.Samfile(sys.argv[1],'rb')
-chr_sizes = {}
-for seq in f.header['SQ']:
-    #chr_sizes[seq['SN'].strip('chr')] = seq['LN']   
-    chr_sizes[seq['SN']] = seq['LN']
-f.close()
-
-
-tmp = {}
-tmp[sys.argv[3]] = chr_sizes[sys.argv[3]]
-chr_sizes = tmp
-#handle = open(genome_fasta, "rU")
-#for record in SeqIO.parse(handle, "fasta") :
-#    chr_sizes[ record.id ] = len(record.seq) 
-#handle.close()
-
-# lets make some bins
-print "Building bins"
-bins = pd.concat([ make_bins(chr,chr_sizes[chr], win_size) for chr in chr_sizes ])
-#bins['chr'] = bins['chr'].str.strip('chr')
-bins = pybedtools.BedTool(bins.to_string(index=None,header=False),from_string=True)
-bins_bed_file = ''.join((genome_fasta.strip('fasta'),str(np.int(win_size/1000)),"kb",".bed"))
-#bins.saveas(bins_bed_file) 
-#calculate coverage
-bam = sys.argv[1] #'/home/inti/APP/forestSV/inst/scripts/NA12878.chrom20.ILLUMINA.bwa.CEU.high_coverage.20100311.bam'
-
-# name output files
-kb_string = ''.join((str(np.int(win_size/1000)),"kb"))
-out = sys.argv[1].strip('bam')
-out = ''.join((out,kb_string,'.',sys.argv[3]))
-
-name_bins_coverage = ''.join((out,".coverage.bed"))
-name_bins_nuc_comp = ''.join((out,".nclcomposition.bed"))
-name_normalised = ''.join((out,".normalized.bed"))
-
-bam = pybedtools.BedTool(bam)
-print "Calculating coverage for each bin"
-
-bins_coverage = bam.coverage(bins).saveas( name_bins_coverage )
-# calculate the gc content and other nucleodite composition features of the bins
-print "Calculating nucleotide content for each bin"
-#withchr = '/home/inti/RESOURCES/fasta/human_g1k_v37.fasta'
-nochr='human_g1k_v37.fasta'
-bins_nuc_comp = bins_coverage.nucleotide_content(nochr).saveas(name_bins_nuc_comp) 
-
-#from bx.bbi.bigwig_file import BigWigFile
-#bw = BigWigFile( open("wgEncodeCrgMapabilityAlign100mer.bigWig") )
-
-bins_plus_nucl_comp = pd.read_table(name_bins_nuc_comp,names=['chr','start','end','coverage','cov_bp','region_length','frac_bases_covered','pct_at','pct_gc','num_A','num_C','num_G','num_T','num_N','num_oth','seq_len'])
-bins_plus_nucl_comp = bins_plus_nucl_comp[bins_plus_nucl_comp['num_N'] == 0]
-print "Normalizing by GC content"
-bins_plus_nucl_comp = normalise_by_gc(bins_plus_nucl_comp)
-bins_plus_nucl_comp = bins_plus_nucl_comp.sort(['chr', 'start'], ascending=[1, 1])
-bins_plus_nucl_comp[['chr','start','end','normalised']].fillna(0).to_csv(name_normalised,header=None,index=None,sep="\t")
-print "Done"
+def calculate_coverage(pysam_obj,bed):
+  # reads overlapping each window; assumes numeric chromosome names (e.g. 1..22)
+  return bed.apply( lambda x: pysam_obj.count(reference=str(int(x['chr'])),start=int(x['start']),end=int(x['end'])) ,axis=1 )
+
+def calculate_gc_content(pysam_fasta_obj,bed):
+  # GC fraction of the reference sequence under each window
+  return bed.apply( lambda x: gc_content(pysam_fasta_obj.fetch(reference=str(int(x['chr'])),start=int(x['start']),end=int(x['end']))) ,axis=1 )
+
+def gc_content(seq):
+  return np.sum([seq.count(letter) for letter in ['G','C'] ])/float(len(seq))
+
+def normalise_by_gc(table):
+  # rescale each window by (global median coverage)/(median coverage at its GC%)
+  table['coverage'] += 1
+  median_all_gc = np.median(table.ix[table['coverage']>0,'coverage']).astype(int)
+  table['gc_intervals'] = (table['gc']*100).astype(int)
+  median_by_gc = table.groupby('gc_intervals').apply(lambda x: np.median(x['coverage']))
+  table['normalised'] = table.coverage.values * median_all_gc/median_by_gc[table['gc_intervals']].values
+  table = table.fillna(0)
+  table.ix[np.isinf(table['normalised']), 'normalised'] = 0.0
+  return table['normalised']
+
+print time.ctime(), "Reading bam file"
+
+bam = pysam.Samfile(str(args.bam)) #('NA12878/NA12878.mapped.ILLUMINA.bwa.CEU.low_coverage.20120522.bam')
+ref_filename = bam.header['SQ'][0]['UR'].split('/')[-1]
+
+
+if args.bed == 0:
+  chr_sizes = {}
+  for seq in bam.header['SQ']:
+      chr_sizes[seq['SN']] = seq['LN']
+  tmp = {}
+  tmp[str(args.chr)] = chr_sizes[str(args.chr)]
+  chr_sizes = tmp
+  print time.ctime(), "Building bins"
+  bed = pd.concat([ make_bins(chr,chr_sizes[chr], args.win_size) for chr in chr_sizes ])
+
+else:
+  print time.ctime(), "Reading bed file"
+  bed = pd.read_table(str(args.bed))#('NA12878.1kb.chr20.cnv_calls_summary.txt')
+
+print time.ctime(), "Calculating coverage"
+bed['coverage'] = calculate_coverage(bam,bed)
+
+print time.ctime(), "Reading reference genome fasta"
+ref = ''
+if args.ref != 0:
+  ref = pysam.Fastafile(args.ref) #/home/inti/RESOURCES/ref_genomes/hs37d5.fa.gz
+else:
+  ref = pysam.Fastafile(''.join((args.ref_folder,ref_filename)))
+
+print time.ctime(), "Calculating GC content for bins"
+bed['gc'] = calculate_gc_content(ref,bed)
+print time.ctime(), "Normalizing by GC content"
+bed['counts'] = normalise_by_gc(bed)
+bed = bed[['chr','start','end','coverage','counts']]
+bed.to_csv(args.output,header=None,index=None,sep="\t")
+print time.ctime(),"Done"
+
+#genome_fasta = "human_g1k_v37.fasta"
+#win_size = int(sys.argv[2])
+##print 'Retrieving chromosome size from bam file'
+#import pysam
+#f = pysam.Samfile(sys.argv[1],'rb')
+#chr_sizes = {}
+#for seq in f.header['SQ']:
+    ##chr_sizes[seq['SN'].strip('chr')] = seq['LN']   
+    #chr_sizes[seq['SN']] = seq['LN']
+#f.close()
+
+
+#tmp = {}
+#tmp[sys.argv[3]] = chr_sizes[sys.argv[3]]
+#chr_sizes = tmp
+##handle = open(genome_fasta, "rU")
+##for record in SeqIO.parse(handle, "fasta") :
+##    chr_sizes[ record.id ] = len(record.seq) 
+##handle.close()
+
+## lets make some bins
+#print "Building bins"
+#bins = pd.concat([ make_bins(chr,chr_sizes[chr], win_size) for chr in chr_sizes ])
+##bins['chr'] = bins['chr'].str.strip('chr')
+#bins = pybedtools.BedTool(bins.to_string(index=None,header=False),from_string=True)
+#bins_bed_file = ''.join((genome_fasta.strip('fasta'),str(np.int(win_size/1000)),"kb",".bed"))
+##bins.saveas(bins_bed_file) 
+##calculate coverage
+#bam = sys.argv[1] #'/home/inti/APP/forestSV/inst/scripts/NA12878.chrom20.ILLUMINA.bwa.CEU.high_coverage.20100311.bam'
+
+## name output files
+#kb_string = ''.join((str(np.int(win_size/1000)),"kb"))
+#out = sys.argv[1].strip('bam')
+#out = ''.join((out,kb_string,'.',sys.argv[3]))
+
+#name_bins_coverage = ''.join((out,".coverage.bed"))
+#name_bins_nuc_comp = ''.join((out,".nclcomposition.bed"))
+#name_normalised = ''.join((out,".normalized.bed"))
+
+#bam = pybedtools.BedTool(bam)
+#print "Calculating coverage for each bin"
+
+#bins_coverage = bam.coverage(bins).saveas( name_bins_coverage )
+## calculate the gc content and other nucleodite composition features of the bins
+#print "Calculating nucleotide content for each bin"
+##withchr = '/home/inti/RESOURCES/fasta/human_g1k_v37.fasta'
+#nochr='human_g1k_v37.fasta'
+#bins_nuc_comp = bins_coverage.nucleotide_content(nochr).saveas(name_bins_nuc_comp) 
+
+##from bx.bbi.bigwig_file import BigWigFile
+##bw = BigWigFile( open("wgEncodeCrgMapabilityAlign100mer.bigWig") )
+
+#bins_plus_nucl_comp = pd.read_table(name_bins_nuc_comp,names=['chr','start','end','coverage','cov_bp','region_length','frac_bases_covered','pct_at','pct_gc','num_A','num_C','num_G','num_T','num_N','num_oth','seq_len'])
+#bins_plus_nucl_comp = bins_plus_nucl_comp[bins_plus_nucl_comp['num_N'] == 0]
+#print "Normalizing by GC content"
+#bins_plus_nucl_comp = normalise_by_gc(bins_plus_nucl_comp)
+#bins_plus_nucl_comp = bins_plus_nucl_comp.sort(['chr', 'start'], ascending=[1, 1])
+#bins_plus_nucl_comp[['chr','start','end','normalised']].fillna(0).to_csv(name_normalised,header=None,index=None,sep="\t")
 

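Both files carry a normalise_by_gc helper implementing a median-of-ratios correction: each window's coverage is scaled by (median coverage over covered windows)/(median coverage of windows sharing its integer GC percentage). As an illustration with made-up numbers: if the overall median is 30 reads per window and windows at 60% GC have a median of 20, a 60%-GC window holding 18 reads is rescaled to 18 * 30/20 = 27, making GC-poor and GC-rich windows comparable before CNV calling.
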
File utilities/functions.py

 import time 
 import copy
 
+
+import numpy as np
+import pandas as pd
+
+#### FUNCTIONS TO NORMALISE COUNTS
+def mean_from_wig(chr_name,start,end):
+  # expects a module-level BigWigFile handle named `bw` (bx-python); note it computes a median despite the name
+  return np.median([ data[2] for data in bw.get(chr_name,int(start), int(end)) ])
+
+def normalise_by_gc(table):
+  table['coverage'] += 1
+  median_all_gc = np.median(table.ix[table['coverage']>0,'coverage']).astype(int)
+  table['gc_intervals'] = (table['gc']*100).astype(int)
+  median_by_gc = table.groupby('gc_intervals').apply(lambda x: np.median(x['coverage']))
+  return table.coverage.values * median_all_gc/median_by_gc[table['gc_intervals']].values
+
+def read_cov_table(file):
+  table = pd.read_table(file, names=['chr','start','end','coverage','cov_bp','region_length','frac_bases_covered'])
+  #table['bin_mean_pos'] = (table['start'] + table['end'])/2
+  table['log_cov'] = np.log10(table['coverage'] + 1)
+  #table = table.reindex_axis(table.ix[['chr','start','end']],axis=1)
+  return table
+
+def make_bins(chr,max_value,size):
+  # split [0, max_value) into consecutive windows of length `size`
+  end = max_value - 1
+  if size > max_value:
+    return pd.DataFrame( { 'chr': chr, 'start': 0, 'end': max_value }, columns = ['chr','start','end'], index=['0'] )
+  counter = 0
+  start = counter*size
+  starts = []
+  ends = []
+  while end < max_value:
+    starts.append(start)
+    end = (counter + 1)*size + 1 #start + size
+    if end > max_value:
+      end = max_value
+    ends.append(end)
+    start = (counter + 1)*size
+    if start > max_value:
+      ends[-1] = max_value
+    counter += 1
+
+  # merge a short trailing window into its neighbour
+  if (ends[-1] - starts[-1]) < 0.5*size:
+    if len(ends) < 3:
+      starts = [0]
+      ends = [max_value]
+    else:
+      ends.pop()
+      starts.pop()
+      ends[-1] = max_value
+  b = pd.DataFrame( { 'chr': chr, 'start': starts, 'end': ends }, columns = ['chr','start','end'] )
+  return b
+
+####### 
+
+
 def start_ipcluster(N,profile = None):
   import os
   if profile is None: