Hanfei Sun avatar Hanfei Sun committed 75f3033

Improve the summary_mapped function to be faster

Comments (0)

Files changed (5)

+# Time-stamp: <2011-11-21 20:13:26 sunhf>
+
+The files in this folder are for an RNA-Seq pipeline. 
+
+
+Install
+----------
+1. Clone the pipeline from bitbucket:
+$ hg clone https://bitbucket.org/hanfeisun/rna-seq-pipeline/
+
+2. Make a copy of the origal rna.sample.conf as your own:
+$ cp rna.sample.conf myRNA_pipeline.conf
+
+3. Modify the 'ROOTDIR' key in 'Scrips' section to the download
+folder. For example, you put the downloaded 'rna-seq-pipeline' folder
+under '/scripts/pipeline' folder, then you should set the 'ROOTDIR' in
+your conf file like this:
+
+[Scripts]
+ROOTDIR=/scripts/pipeline/rna-seq-pipeline/
+
+4. OK, that's all.
+
 How to use
 ----------
-
 for test:
 
 Just type in
 
 for your own use:
 
-Create a config file follows the pattern in rna.sample.conf,
-and then type in:
+Create a config file follows the pattern in rna.sample.conf, and then
+type in:
 $ python rnaseq_pipe.py your_config.conf
 
 Only want to have a look at the quality control
 $ python rnaseq_pipe.py your_config.conf q
 
+Tips
+----------
+1. The pipeline can check whether you have made the right order of the
+steps. For example, you should run Tophat before Cufflinks as
+Cufflinks' input is Tophat's output. So if not sure about the order of
+the steps in the pipeline, set the 'strict' key in 'strict' section to
+True. The pipeline will stop immediately when it found something wrong
+in the order.
+
+
+
+
+rna-seq-pipeline 0.11		Nov 2011	Hanfei Sun

evenness_qclib.py

         cleanGenes = self.cleanGenes
 
         for gene_id in self.diffChrGene.keys():
-            print "Warning, gene %s has transcripts on different chromosomes. Transcripts on chromosome %s are kept for clean genes" % (gene_id, rawGenes[gene_id].chrom)
+            print "Warn: gene %s has transcripts on different chromosomes. Transcripts on chromosome %s are kept; " % (gene_id, rawGenes[gene_id].chrom),
         print "Warning,", self.skippedExonCnt, "exons have been skipped due to genes with isoforms on different chromosomes."
 
         print "Regrouping clean genes by chromosome"
-# Time-stamp: <2011-11-18 18:30:51 sunhf>
+# Time-stamp: <2011-11-21 22:59:45 sunhf>
 import os
 import sys
 import subprocess
         print "adsfdfasd"
         print >>sys.stderr, "Execution failed:", e
         return False
+
+def list_or_tuple(x):  
+    return isinstance(x, (list, tuple))  
+def flatten(sequence, to_expand=list_or_tuple):
+    '''
+    used to flatten a nested list
+
+    >>> for x in flatten([1, 2, [3, [  ], 4, [5, 6], 7, [8,], ], 9]):print x,
+    1 2 3 4 5 6 7 8 9
+    '''
+    for item in sequence:  
+        if to_expand(item):  
+            for subitem in flatten(item, to_expand):  
+                yield subitem  
+        else:  
+            yield item 
     
 class ModuleBase():
     '''
         return True
     def check_data(self):
         ''' Check the data_path variable '''
-        for a_data in self.data_path.values():
-            if isinstance(a_data, str):
+        for a_data_family in self.data_path.values():
+            # if there are several data paths under one key, it's
+            # called data family
+            if isinstance(a_data_family,str):
+                a_data_family=[a_data_family]
+            for a_data in flatten(a_data_family):
                 if not os.path.isfile(a_data):
                     error("Data %s doesn't exists ---> %s"%(a_data, self.name))
                     return False
                 else:
                     info("(data-valid) %s ---> %s"%(a_data, self.name))
-            elif isinstance(a_data, list):
-                for a_data_ in a_data:
-                    if isinstance(a_data_, list):
-                        for a_data__ in a_data_:
-                            if not os.path.isfile(a_data__):
-                                error("Data %s doesn't exists ---> %s"%(a_data_, self.name))
-                                return False
-                            else:
-                                info("(data-valid) %s ---> %s"%(a_data__, self.name))
-                    else:
-                        if not os.path.isfile(a_data_):
-                            error("Data %s doesn't exists ---> %s"%(a_data_, self.name))
-                            return False
-                        else:
-                            info("(data-valid) %s ---> %s"%(a_data_, self.name))
         return True
     def check_output_dir(self):
         ''' Check the output_dir_ptah variable ''' 
         print self.help_str
     def main(self, arguments=None):
         ''' Start a step using the specified tool and data '''
+        if self.cmd_list==[]:
+            warn("No command input @ %s"%self.name)
+            return False
         if not self.multi_process_:
             map(run_cmd, self.cmd_list)
         else:
     def parse_name(self, file_path):
         ''' Get prefix name of a file but no folder name '''
         return os.path.splitext(file_path)[0].split("/")[-1]
+    
 # Define a variable with a fastq path, which is used in [comparison]
 # section below
 
-RNAi = /mnt/Storage/home/liuj/Han_histone/prdm14/prdm14_RNAi.fastq
-control = /mnt/Storage/home/liuj/Han_histone/prdm14/prdm14_control.fastq
+RNAi = /mnt/Storage/home/Sam/Han_histone/prdm14/prdm14_RNAi.fastq
+control = /mnt/Storage/home/Sam/Han_histone/prdm14/prdm14_control.fastq
 
 
 [comparison]
 #
 # fastqc_raw <= None
 # fastqc_mapped <= tophat
+# tophat <= None
 # gfold <= tophat
 # cufflinks <= tophat
 # summary_rpkm <= cufflinks
-# summary_mappped <= tophat
+# summary_mapped <= tophat
 # bam2bed <=tophat
 
 # a complete step order may be like the following
 #! /usr/bin/env python
-# Time-stamp: <2011-11-18 19:46:11 sunhf>
+# Time-stamp: <2011-11-22 00:38:32 sunhf>
 """Rnaseq pipeline main script
 
 Copyright (c) 2011 Hanfei Sun <hfsun.tju@gmail.com>
 import subprocess
 from ConfigParser import SafeConfigParser 
 
-# try:
-#     import rpy
-# except:
-#     print "Need to install rpy first"
-#     print "If you have already install rpy, please specify the RHOME for the r library"
-#     sys.exit(1)
-    
 from modulebase import *
 
 
             error("bowtie genome index can't be found!")
             return False
         return True
+    
 class FastqcRaw(ModuleBase):
     ''' Use Fastqc to do the Quality Control for the raw Fastq file'''
     def __init__(self, conf):
         ModuleBase.__init__(self, name="quality control", conf=conf)
+        self.conf=conf
         List_append (self.tool_path, conf["fastqc.main"])
-        for a_tag,a_fastq in conf['datalist'].items():
+        self.tag2fastqs = lambda tag: [self.conf['datalist'][tag]] if self.conf['datalist'][tag]!="@pair" else self.conf['paired_datalist'][tag]
+        for a_tag in conf['datalist'].keys():
             a_dir = self.find_path(self.output_dir_path, a_tag, "FastqcRaw")
             a_dir = self.get_dir(a_dir)
-
-            if a_fastq != '@pair':
-                self.data_path[a_tag]=a_fastq
+            
+            self.data_path[a_tag]=[]
+            for a_fastq in self.tag2fastqs(a_tag):
+                self.data_path[a_tag].append(a_fastq)
                 List_append( self.cmd_list,
                              space_join(conf['fastqc.main'], '-q','-o', a_dir, a_fastq))
-            else:
-                two_fastq = conf['paired_datalist'][a_tag]
-                self.data_path[a_tag]=two_fastq
-                List_append( self.cmd_list,
-                             space_join(conf['fastqc.main'], '-q','-o', a_dir, two_fastq[0]))
-                List_append( self.cmd_list,
-                             space_join(conf['fastqc.main'], '-q','-o', a_dir, two_fastq[1]))                
+    def qc_result_summary(self):
+        self.tag2realnames = lambda tag:map(self.parse_name,self.tag2fastqs(tag))
+        self.realname2source = lambda tag,realname:self.find_path(self.conf['output.outputdir'], tag,"FastqcRaw",
+                                                   realname+"_fastqc/fastqc_data.txt")
+        for a_tag in self.conf['datalist']:
+            for a_realname in self.tag2realnames(a_tag):
+                print self.realname2source(a_tag,a_realname)
+    def main(self):
+        ModuleBase.main(self)
+        self.qc_result_summary()
+        
+                
 class FastqcMapped(ModuleBase):
     ''' Use Fastqc to do the Quality Control for the mapped BAM file'''
     def __init__(self, conf, source="Tophat"):
         self.tag2dir = lambda tag:self.find_path(conf['output.outputdir'], tag,"Summary_mapped")
         self.tag2even = lambda tag:self.tag2dir(tag)+"/"+tag
         self.tag2rscript = lambda tag:self.tag2even(tag)+".r"
+        self.tag2mm = lambda tag:self.tag2dir(tag)+"/"+tag+".multimapped"
+        # save the temp result for calculate uniq maping rate
         
         for a_tag in conf['datalist']:
             self.data_path[a_tag] = self.tag2bam(a_tag)
                          space_join(conf['gfold.samtools'],'view', a_bam,"|",
                                     "python", pjoin(conf['scripts.rootdir'],"evenness_qclib.py"),
                                     self.gtf_file, an_output))
+            List_append (self.cmd_list,
+                         space_join(self.conf['gfold.samtools'],'view', a_bam,"|",
+                                    "cut","-f1","|",
+                                    "sort","|",
+                                    "uniq","-c",">",self.tag2mm(a_tag)))
             
             self.for_summary+=an_output+" "+a_tag+" "
 
     def qc_for_mapped_cnt(self):
         '''Calculate how many counts are mapped or uniq mapped'''
         self.tag2mappableQC = lambda tag:self.tag2dir(tag)+"/"+tag+".mappable_cnt"
-        self.tag2fastq = lambda tag: self.conf['datalist'][tag] if self.conf['datalist'][tag]!="@pair" else self.conf['pairlist'][tag][0]
+        self.tag2fastq = lambda tag: self.conf['datalist'][tag] if self.conf['datalist'][tag]!="@pair" else self.conf['paired_datalist'][tag][0]
         get_first_string = lambda cmd: get_output(cmd).split(" ")[0]
         for a_tag,a_bam in self.data_path.items():
             an_output = self.tag2mappableQC(a_tag)
             a_fastq = self.tag2fastq(a_tag)
             with open(an_output,"w") as mappable_file:
                 qc_content=""
-                mapped_cnt = int(get_first_string(space_join(self.conf['gfold.samtools'],'view', a_bam,"|",
-                                                             "cut","-f1","|"                                                           
-                                                             "uniq","|",
-                                                             "wc","-l",)))
-                uniq_cnt = int(get_first_string(space_join(self.conf['gfold.samtools'], 'view', a_bam,"|",
-                                                           "cut","-f1","|"
-                                                           "uniq","-u","|",
-                                                           "wc","-l",)))
+                mapped_cnt = int(get_first_string(space_join("cat", self.tag2mm(a_tag),"|"
+                                                             "wc","-l")))
+                uniq_cnt = int(get_first_string(space_join("cat", self.tag2mm(a_tag),"|",
+                                                           "grep","\ 1\ ","|",
+                                                             "wc","-l")))
 
                 raw_cnt = int(get_first_string(space_join("wc","-l", a_fastq)))/4
                 percent = (mapped_cnt/float(raw_cnt))*100
                                cmd_pairs,
                                self.outputdir+"/"+"rpkm_correlation"))
             
-class Summary_Fastqc(ModuleBase):
-    def __init__(self, conf, source="Fastqc"):
-        ModuleBase.__init__(self, name="Correlation of the expression of genes", conf=conf)
-        self.tag2realname = lambda tag:self.parse_name(conf['datalist'][tag])
-        self.tag2source = lambda tag:self.find_path(conf['output.outputdir'], tag, source,
-                                                   self.tag2realname(tag)+"_fastqc/fastqc_data.txt")
-        for a_tag in conf['datalist']:
-            with open(self.tag2source(a_tag)) as qc_data:
-                          m = re.search(r">>Per sequence quality scores.+?Count\n(.+?)\n>>END", t, re.S)
+
                           
 alias_dict = dict(fastqc_raw=FastqcRaw, fastqc_mapped=FastqcMapped, tophat=Tophat, cufflinks=Cufflinks,
                   gfold=gfold, bam2bed= Bam2Bed, summary_mapped= Summary_tophat,
-                  summary_rpkm=Summary_cufflinks, summary_naiveqc= Summary_Fastqc)
+                  summary_rpkm=Summary_cufflinks)
+def dependency_check(conf):
+    dependency_dict = dict(fastqc_raw=[], tophat=[], fastqc_mapped=["tophat"], cufflinks=["tophat"],
+                           gfold=["tophat"], bam2bed=["tophat"], summary_rpkm=["cufflinks"],
+                           summary_naiveqc=["fastqc_raw"], summary_mapped=["tophat"])
+    step_list_=conf['steps.steps'].strip().split(",")
+    pre_step_list=[]
+    for a_cursor in step_list_:
+        deps = dependency_dict.get(a_cursor,[])
+        for a_dep in deps:
+            if a_dep not in pre_step_list:
+                a_tip=("{0} need {1} as a dependency, please make sure you have run {1} previously.".format(a_cursor,a_dep))                
+                if conf.get("strict.strict","False") == "False":
+                    warn(a_tip)
+                else:
+                    error(a_tip)
+                    sys.exit(1)
+
+        if a_cursor not in pre_step_list:
+            pre_step_list.append(a_cursor)
+
 def read_config(configFile):
     """Read configuration file and parse it into a dictionary.
 
 
     conf['steplist']=[alias_dict[alias] for alias in conf['steps.steps'].split(",")]
 
-def dependency_check(conf):
-    dependency_dict = dict(fastqc=[], tophat=[], cufflinks=["tophat"],
-                           gfold=["tophat"], bam2bed=["tophat"], summary_rpkm=["cufflinks"],
-                           summary_naiveqc=["fastqc"], summary_mapping=["tophat"])
-    step_list_=conf['steps.steps'].strip().split(",")
-    pre_step_list=[]
-    for a_cursor in step_list_:
-        deps = dependency_dict.get(a_cursor,[])
-        for a_dep in deps:
-            if a_dep not in pre_step_list:
-                a_tip=("{0} need {1} as a dependency, please make sure you have run {1} previously.".format(a_cursor,a_dep))                
-                if conf.get("strict.strict","False") == "False":
-                    warn(a_tip)
-                else:
-                    error(a_tip)
-                    sys.exit(1)
-
-        if a_cursor not in pre_step_list:
-            pre_step_list.append(a_cursor)
 
 ###########
 # running #
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.