Commits

Hanfei Sun committed 4e0d3b9

Add the function for QC_mappable_reads_cnt

Comments (0)

Files changed (2)

-# Time-stamp: <2011-11-17 16:58:33 sunhf>
+# Time-stamp: <2011-11-18 18:30:51 sunhf>
 import os
 import sys
 import subprocess
     @type  command: str
     @param command: the command you want to run, for example, "ls -l"
     """
-    info ("[Run->] %s" % command)
+    info ("[Run ->] %s" % command)
     try:
         subpcall (command, shell=True)
     except KeyboardInterrupt:
         raise KeyboardInterruptError()
-
+def run_cmd_pipe ( command ):
+    """
+    Run a command and save the command's string to the log file
+    @type  command: str
+    @param command: the command you want to run, for example, "ls -l"
+    """
+    info ("[Run <->] %s" % command)
+    try:
+        return subprocess.Popen(command,shell=True,stdout=-1).stdout.read()
+    except KeyboardInterrupt:
+        raise KeyboardInterruptError()
 
 def check_cmd( command ):
     """
 #! /usr/bin/env python
-# Time-stamp: <2011-11-17 21:28:05 sunhf>
+# Time-stamp: <2011-11-18 19:46:11 sunhf>
 """Rnaseq pipeline main script
 
 Copyright (c) 2011 Hanfei Sun <hfsun.tju@gmail.com>
 import sys
 import string
 import re
+import subprocess
 from ConfigParser import SafeConfigParser 
 
 # try:
 space_join = lambda *args:" ".join(args)
 List_append = list.append
 parser = SafeConfigParser()
-pjoin=os.path.join
+pjoin = os.path.join
+get_output = run_cmd_pipe
 #############
 # main part #
 #############
                 List_append (self.cmd_list,
                              space_join(conf['tophat.main'], '-G', self.gtf_file, '-o', a_dir, "-p","5",
                                         "-r", conf['pairdefine.mate_inner_distance'], self.ebwt_index, 
-                                        two_fastq[0],two_fastq[1], conf['tophat.extend_option']))
+                                        two_fastq[0], two_fastq[1], conf['tophat.extend_option']))
 
     def check_data(self):
         if not ModuleBase.check_data(self):
 
             if a_fastq != '@pair':
                 self.data_path[a_tag]=a_fastq
-                List_append( self.cmd_list, space_join(conf['fastqc.main'], '-q','-o', a_dir, a_fastq))
+                List_append( self.cmd_list,
+                             space_join(conf['fastqc.main'], '-q','-o', a_dir, a_fastq))
             else:
                 two_fastq = conf['paired_datalist'][a_tag]
                 self.data_path[a_tag]=two_fastq
-                List_append( self.cmd_list, space_join(conf['fastqc.main'], '-q','-o', a_dir, two_fastq[0]))
-                List_append( self.cmd_list, space_join(conf['fastqc.main'], '-q','-o', a_dir, two_fastq[1]))                
+                List_append( self.cmd_list,
+                             space_join(conf['fastqc.main'], '-q','-o', a_dir, two_fastq[0]))
+                List_append( self.cmd_list,
+                             space_join(conf['fastqc.main'], '-q','-o', a_dir, two_fastq[1]))                
 class FastqcMapped(ModuleBase):
     ''' Use Fastqc to do the Quality Control for the mapped BAM file'''
-    def __init__(self, conf,source="Tophat"):
+    def __init__(self, conf, source="Tophat"):
         ModuleBase.__init__(self, name="quality control mapped", conf=conf)
         List_append (self.tool_path, conf["fastqc.main"])
-        self.tag2bam=lambda tag:self.find_path(conf['output.outputdir'], tag, source,"accepted_hits.bam")
+        self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'], tag, source,"accepted_hits.bam")
         for a_tag in conf['datalist'].keys():
             self.data_path[a_tag]=self.tag2bam(a_tag)
 
     def __init__(self, conf, source="Tophat"):
         ModuleBase.__init__(self, name="Convert from Bam to Bed", conf=conf)
         
-        self.source=source
+        self.source = source
         # used to get the source, for example, the output bam file of Tophat
-        self.tag2bam=lambda tag:self.find_path(conf['output.outputdir'], tag,
+        self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'], tag,
                                                self.source,"accepted_hits.bam")
         # generate the path of the input file from a tag, such as 'heart' or 'sample1'
         for a_tag in conf['datalist']:
             self.data_path[a_tag]=self.tag2bam(a_tag)
         List_append (self.tool_path, conf["tophat.bam2bed"])
 
-        tag2bed=lambda tag:self.find_path(conf['output.outputdir'], tag,"Bam2bed","accepted_hits.bed")
-        tag2dir=lambda tag:self.find_path(conf['output.outputdir'], tag,"Bam2bed")
+        tag2bed = lambda tag:self.find_path(conf['output.outputdir'], tag,"Bam2bed","accepted_hits.bed")
+        tag2dir = lambda tag:self.find_path(conf['output.outputdir'], tag,"Bam2bed")
 
         for a_tag,a_bam in self.data_path.items():
             a_dir = tag2dir(a_tag)
         ModuleBase.__init__(self, name="Cufflinks transcripts finding", conf=conf)
 
         self.gtf_file = conf["%s.gtf" %conf['specie.version']]        
-        self.tag2bam=lambda tag:self.find_path(conf['output.outputdir'], tag, source,"accepted_hits.bam")
+        self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'], tag, source,"accepted_hits.bam")
         for a_tag in conf['datalist']:
             self.data_path[a_tag]=self.tag2bam(a_tag)
         List_append (self.tool_path, conf["cufflinks.main"])
         for a_tag,a_bam in self.data_path.items():
             a_dir = self.find_path(conf['output.outputdir'], a_tag,"Cufflinks")
             a_dir = self.get_dir(a_dir)
-            List_append (self.cmd_list, space_join(conf['cufflinks.main'],'-G', self.gtf_file,
-                                                   '--output-dir', a_dir, a_bam))
+            List_append (self.cmd_list,
+                         space_join(conf['cufflinks.main'],'-G', self.gtf_file,
+                                    '--output-dir', a_dir, a_bam))
         
 class gfold(ModuleBase):
     '''
     '''        
     def __init__(self, conf, source="Tophat"):
         ModuleBase.__init__(self, name="gfold differential genes finding", conf=conf)
-        source=source
-        self.tag2bam=lambda tag:self.find_path(conf['output.outputdir'], tag,
+        self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'], tag,
                                                source,"accepted_hits.bam")        
         self.annotation_file = conf["%s.gpf" %conf['specie.version']]
         for a_compare,two_group in conf['comparelist'].items():
         self.cmd_list_count=[]
         self.cmd_list_diff=[]
         comma_join = lambda x:",".join(x)
-        tag2dir=lambda tag:self.find_path(conf['output.outputdir'], tag,"gfold") # to dir
-        tag2prefix=lambda tag:tag2dir(tag)+"/"+tag 
-        tag2cnt=lambda tag:tag2prefix(tag)+".cnt"
-        tag2diff=lambda tag:tag2prefix(tag)+".diff"
-        bam2tag=lambda bam:bam.split("/")[1]
+        tag2dir = lambda tag:self.find_path(conf['output.outputdir'], tag,"gfold") # to dir
+        tag2prefix = lambda tag:tag2dir(tag)+"/"+tag 
+        tag2cnt = lambda tag:tag2prefix(tag)+".cnt"
+        tag2diff = lambda tag:tag2prefix(tag)+".diff"
+        bam2tag = lambda bam:bam.split("/")[1]
 
         tags=[]
         for a_compare,two_group in self.data_path.items():
             self.get_dir(tag2dir(a_compare))
             for i in range(0,2):
                 for a_bam in two_group[i]:
-                    a_tag=bam2tag(a_bam)
+                    a_tag = bam2tag(a_bam)
 
                     # check whether this tag (data) has already been counted
                     if a_tag in tags: continue
         if not self.multi_process_:
             map(run_cmd, self.cmd_list_count + self.cmd_list_diff)
         else:
-            p=Pool(self.process_num)
+            p = Pool(self.process_num)
             p.map(run_cmd, self.cmd_list_count)
             p.map(run_cmd, self.cmd_list_diff)
 
 class Summary_tophat(ModuleBase):
-    def __init__(self,conf, source="Tophat"):
-        ModuleBase.__init__(self, name="Make the graph of evenness for quality control",conf=conf)
+    def __init__(self, conf, source="Tophat"):
+        ModuleBase.__init__(self, name="Make the graph of evenness for quality control", conf=conf)
         # Evenness hasn't supported multi process yet
         self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'], tag, source,"accepted_hits.bam")
-        self.tag2dir = lambda tag:self.find_path(conf['output.outputdir'], tag,"Summary_mappped")
+        self.tag2dir = lambda tag:self.find_path(conf['output.outputdir'], tag,"Summary_mapped")
         self.tag2even = lambda tag:self.tag2dir(tag)+"/"+tag
         self.tag2rscript = lambda tag:self.tag2even(tag)+".r"
         
             self.data_path[a_tag] = self.tag2bam(a_tag)
         self.gtf_file = conf["%s.gtf" %conf['specie.version']]
 
-        List_append (self.tool_path,conf['gfold.samtools'])
-        List_append (self.tool_path,pjoin(conf['scripts.rootdir'],"evenness_qclib.py"))
-        List_append (self.tool_path,pjoin(conf['scripts.rootdir'],"Rlib/evenness_summary.r"))
+        List_append (self.tool_path, conf['gfold.samtools'])
+        List_append (self.tool_path, pjoin(conf['scripts.rootdir'],"evenness_qclib.py"))
+        List_append (self.tool_path, pjoin(conf['scripts.rootdir'],"Rlib/evenness_summary.r"))
         List_append (self.lib_path, self.gtf_file)
 
         self.for_summary=""
-        self.summary_output_dir=self.find_path(conf['output.outputdir'], "Summary")
-        self.summary_output_dir=self.get_dir(self.summary_output_dir)
+        self.summary_output_dir = self.find_path(conf['output.outputdir'], "Summary")
+        self.summary_output_dir = self.get_dir(self.summary_output_dir)
         for a_tag,a_bam in self.data_path.items():
             a_dir = self.tag2dir(a_tag)
             a_dir = self.get_dir(a_dir)
 
             List_append (self.cmd_list,
                          space_join(conf['gfold.samtools'],'view', a_bam,"|",
-                                    "python",pjoin(conf['scripts.rootdir'],"evenness_qclib.py")
-                                    ,self.gtf_file, an_output))
+                                    "python", pjoin(conf['scripts.rootdir'],"evenness_qclib.py"),
+                                    self.gtf_file, an_output))
             
             self.for_summary+=an_output+" "+a_tag+" "
 
-        self.conf=conf
+        self.conf = conf
+    def qc_for_mapped_cnt(self):
+        '''Calculate how many counts are mapped or uniq mapped'''
+        self.tag2mappableQC = lambda tag:self.tag2dir(tag)+"/"+tag+".mappable_cnt"
+        self.tag2fastq = lambda tag: self.conf['datalist'][tag] if self.conf['datalist'][tag]!="@pair" else self.conf['pairlist'][tag][0]
+        get_first_string = lambda cmd: get_output(cmd).split(" ")[0]
+        for a_tag,a_bam in self.data_path.items():
+            an_output = self.tag2mappableQC(a_tag)
+            a_fastq = self.tag2fastq(a_tag)
+            with open(an_output,"w") as mappable_file:
+                qc_content=""
+                mapped_cnt = int(get_first_string(space_join(self.conf['gfold.samtools'],'view', a_bam,"|",
+                                                             "cut","-f1","|"                                                           
+                                                             "uniq","|",
+                                                             "wc","-l",)))
+                uniq_cnt = int(get_first_string(space_join(self.conf['gfold.samtools'], 'view', a_bam,"|",
+                                                           "cut","-f1","|"
+                                                           "uniq","-u","|",
+                                                           "wc","-l",)))
+
+                raw_cnt = int(get_first_string(space_join("wc","-l", a_fastq)))/4
+                percent = (mapped_cnt/float(raw_cnt))*100
+                qc_content+="QC for %s (BAM:%s FASTQ:%s)\n"%(a_tag, a_bam, a_fastq)
+                qc_content+="%s mapped reads (count 1 time for multiple mapping)\n"%mapped_cnt
+                qc_content+="%s unique reads (eliminate mutiple mapping)\n"%uniq_cnt
+                qc_content+="%s total reads\n"%raw_cnt
+                qc_content+="%%%.2f percent reads are mappable\n"%percent
+                qc_content+="Result file is at %s\n"%an_output
+                print "-"*20
+                print qc_content
+                print "-"*20                
+                mappable_file.write(qc_content)
             
     def main(self):
         ModuleBase.main(self)
-        run_cmd(space_join("Rscript",pjoin(self.conf['scripts.rootdir'],"Rlib/evenness_summary.r"),
-                self.for_summary,self.summary_output_dir+"/evenness.pdf"))
-        
+        run_cmd(space_join("Rscript", pjoin(self.conf['scripts.rootdir'],"Rlib/evenness_summary.r"),
+                self.for_summary, self.summary_output_dir+"/evenness.pdf"))
+        self.qc_for_mapped_cnt()
+                
+            
 class Summary_cufflinks(ModuleBase):
-    def __init__(self,conf,source="Cufflinks"):
+    def __init__(self, conf, source="Cufflinks"):
         ModuleBase.__init__(self, name="Correlation of the expression of genes", conf=conf)
-        self.tag2exp=self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'], tag, source,"genes.fpkm_tracking")
+        self.tag2exp = self.tag2bam = lambda tag:self.find_path(conf['output.outputdir'],
+                                                              tag, source,"genes.fpkm_tracking")
         
         for a_tag in conf['datalist']:
             self.data_path[a_tag] = self.tag2exp(a_tag)
         self.outputdir = self.find_path(conf['output.outputdir'], "Summary")            
         self.get_dir(self.outputdir)
 
-        cmd_pairs=space_join(*[an_exp+" "+a_tag for a_tag,an_exp in self.data_path.items()])
+        cmd_pairs = space_join(*[an_exp+" "+a_tag for a_tag,an_exp in self.data_path.items()])
         List_append(self.cmd_list,
                     space_join("Rscript","Rlib/summary.r",
                                cmd_pairs,
                                self.outputdir+"/"+"rpkm_correlation"))
             
 class Summary_Fastqc(ModuleBase):
-    def __init__(self,conf,source="Fastqc"):
+    def __init__(self, conf, source="Fastqc"):
         ModuleBase.__init__(self, name="Correlation of the expression of genes", conf=conf)
         self.tag2realname = lambda tag:self.parse_name(conf['datalist'][tag])
         self.tag2source = lambda tag:self.find_path(conf['output.outputdir'], tag, source,
                                                    self.tag2realname(tag)+"_fastqc/fastqc_data.txt")
         for a_tag in conf['datalist']:
             with open(self.tag2source(a_tag)) as qc_data:
-                          m=re.search(r">>Per sequence quality scores.+?Count\n(.+?)\n>>END",t,re.S)
+                          m = re.search(r">>Per sequence quality scores.+?Count\n(.+?)\n>>END", t, re.S)
                           
-alias_dict=dict(fastqc_raw=FastqcRaw, fastqc_mapped=FastqcMapped,tophat=Tophat, cufflinks=Cufflinks,
-                gfold=gfold, bam2bed= Bam2Bed, summary_mapped= Summary_tophat,
-                summary_rpkm=Summary_cufflinks, summary_naiveqc= Summary_Fastqc)
+alias_dict = dict(fastqc_raw=FastqcRaw, fastqc_mapped=FastqcMapped, tophat=Tophat, cufflinks=Cufflinks,
+                  gfold=gfold, bam2bed= Bam2Bed, summary_mapped= Summary_tophat,
+                  summary_rpkm=Summary_cufflinks, summary_naiveqc= Summary_Fastqc)
 def read_config(configFile):
     """Read configuration file and parse it into a dictionary.
 
     conf['datalist']={}
     conf['paired_datalist']={}
     
-    comma_split=lambda x:x.split(",")
+    comma_split = lambda x:x.split(",")
     trim = lambda x,prefix:x[len(prefix):].lower()
     
     for a_term in conf:
         if a_term.startswith(compare_pre):
-            compare_groups=conf[a_term].split(':')
+            compare_groups = conf[a_term].split(':')
             if len(compare_groups)==2:
                 conf[new_compare_pre][trim(a_term,compare_pre)] = map(comma_split,
                                                                       map(string.lower, compare_groups))
     conf['steplist']=[alias_dict[alias] for alias in conf['steps.steps'].split(",")]
 
 def dependency_check(conf):
-    dependency_dict=dict(fastqc=[], tophat=[], cufflinks=["tophat"],
-                         gfold=["tophat"], bam2bed=["tophat"], summary_rpkm=["cufflinks"],
-                         summary_naiveqc=["fastqc"], summary_mapping=["tophat"])
+    dependency_dict = dict(fastqc=[], tophat=[], cufflinks=["tophat"],
+                           gfold=["tophat"], bam2bed=["tophat"], summary_rpkm=["cufflinks"],
+                           summary_naiveqc=["fastqc"], summary_mapping=["tophat"])
     step_list_=conf['steps.steps'].strip().split(",")
     pre_step_list=[]
     for a_cursor in step_list_:
-        deps=dependency_dict.get(a_cursor,[])
+        deps = dependency_dict.get(a_cursor,[])
         for a_dep in deps:
             if a_dep not in pre_step_list:
                 a_tip=("{0} need {1} as a dependency, please make sure you have run {1} previously.".format(a_cursor,a_dep))