Commits

duanxk committed ee5b2a0

Fastqc_stat_scripts_by_duanxk

Comments (0)

Files changed (1)

Memberscript/duanxklib/Fastqc_stat_for_a_new_dataset_NEWMETHOD_Final_Final.py

+import os
+import sys
+import re
+from optparse import OptionParser
+import logging
+
+# ------------------------------------
+# constants
+# ------------------------------------
+
+logfhd = open("log","w")
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(levelname)-5s @ %(asctime)s: %(message)s ',
+                    datefmt='%a, %d %b %Y %H:%M:%S',
+                    stream=sys.stderr
+                    )
+
+error   = logging.critical        # function alias
+warn    = logging.warning
+
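+# info() writes the message both to the logging stream and to the local "log" file.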
+def info(a):
+    logging.info(a)
+    logfhd.write(a+"\n")
+    logfhd.flush()
+    
+#from subprocess import Popen, PIPE
+#def calculate_peak(inf):
+
+def process_new(new_dataset_path_list,fpath):
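+    """Run FastQC (--extract) on every .bam file found in the new dataset
+    folders, writing the extracted results into fpath; return the list of
+    bam file names (exit if none are found)."""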
+    info("Begin to process the bam files of the new dataset...")
+    bamfiles=[]
+    for new_dataset_path in new_dataset_path_list:
+        dirs=os.listdir(new_dataset_path)
+        for d in dirs:
+            if d.endswith('.bam'):  # only real .bam files, not e.g. .bam.bai indexes
+                bamfiles.append(d)
+                os.system("/mnt/Storage/home/dcadmin/FastQC/fastqc "+new_dataset_path+d+" --extract -o "+fpath)
+            else:
+                continue
+    info("Finish processing the bam files of the new dataset...")
+    if len(bamfiles):
+        return bamfiles
+    else:
+        error("No bam files found in the new dataset folders!")
+        sys.exit(1)
+
+def get_new_scores(path,bamfiles):
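+    """For each new bam file, parse its extracted fastqc_data.txt under path and
+    record the quality score at which the cumulative read fraction, counted from
+    the highest score down, first exceeds 50%. Return the list of these peak scores."""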
+    #baml=process_new(new_dataset_path)
+    npeakl=[]
+    for b in bamfiles:
+        fn=b.split('.')[0]+'_fastqc/fastqc_data.txt'
+        (newseq,nquality_dict)=parse_fastqc_data(path+fn)
+        nt=sum(nquality_dict.values())
+        #print nt
+        #print nquality_dict
+        n=0
+        #print sorted(nquality_dict.items(),key=lambda e:e[0],reverse=True)
+        for item in sorted(nquality_dict.items(),key=lambda e:e[0],reverse=True):
+            n=n+item[1]
+            percentage=n/float(nt)
+            if percentage > 0.5:
+                peak=int(item[0])
+                npeakl.append(peak)
+                break
+            else:
+                continue
+    #print npeakl
+    return npeakl
+    
+def parse_fastqc_data(infile):
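+    """Parse a FastQC fastqc_data.txt file; return (sequence length,
+    {quality score: read count}) taken from the '>>Per sequence quality scores'
+    module."""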
+    data = open(infile).readlines()
+    seqlen = 0
+    seqquality = []
+    quality_flag = 0
+    for line in data:
+        if line.startswith('#') or not line:
+            continue
+        hits = re.findall("Sequence length\t(\d+)", line)
+        if hits and len(hits)==1 and not seqlen:
+            seqlen = int(hits[0])
+        if line.startswith(">>Per sequence quality"):
+            quality_flag = 1
+        if quality_flag:
+            seqquality.append(line)
+        if line.startswith(">>END_MODULE") and quality_flag:
+            quality_flag = 0
+
+    quality_dict = {}
+    for i in seqquality[1:-1]:
+        #print i
+        i2 = i.strip().split('\t')
+        quality_dict[int(i2[0])] = float(i2[1])
+
+    return (seqlen, quality_dict)
+#---------------------------------------------------main()-------------------------------------------------   
+def main():
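+    """Parse the command line, optionally rerun FastQC over all DC data, collect the
+    quality-score peaks of the existing DC results and of the new datasets, write the
+    R plotting/summary scripts, run them, and produce the final quartile report."""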
+    usage = "python %prog <new datasets folder path(split by ',')> <output folder path> [options]"
+    description = "This script generates FastQC stat results for a new dataset and compares them with all existing DC FastQC results."
+
+    optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False)
+    optparser.add_option("-h","--help",action="help",
+                         help="Show this help message and exit.")
+    optparser.add_option("-r","--rerun_fastqc",dest="rerun_fastqc",action='store_true',
+                         help="If given, rerun FastQC to refresh all DC FastQC results before doing the stats.")
+    (options,args) = optparser.parse_args()
+    if len(args) < 2:
+        optparser.print_help()
+        sys.exit(1)
+    # use the positional arguments parsed by optparse, not raw sys.argv
+    inputpathl = args[0].split(',')
+    new_dataset_path_list=[]
+    for inputpath in inputpathl:
+        if os.path.isdir(inputpath):
+            if inputpath.endswith('/'):
+                new_dataset_path_list.append(inputpath)
+            else:
+                new_dataset_path_list.append(inputpath+'/')
+        else:
+            error('The new dataset folder you gave is not a valid directory: '+inputpath)
+            sys.exit(1)
+    if os.path.isdir(args[1]):
+        if args[1].endswith('/'):
+            outdir=args[1]
+        else:
+            outdir=args[1]+'/'
+    else:
+        error('The output folder you gave is not a valid directory: '+args[1])
+        sys.exit(1)
+    rerun=options.rerun_fastqc
+    if rerun:
+        info("Checking which DC datasets have not been run through FastQC yet, then running FastQC on them...")
+        os.system("python /mnt/Storage/home/dcadmin/HgDnase_run_fastqc.py")
+        info("Finish checking HgDnase data.")
+        os.system("python /mnt/Storage/home/dcadmin/Hg_run_fastqc.py")
+        info("Finish checking HgChIPseq data.")
+        os.system("python /mnt/Storage/home/dcadmin/MmDnase_run_fastqc.py")
+        info("Finish checking MmDnase data.")
+        os.system("python /mnt/Storage/home/dcadmin/Mm_run_fastqc.py")
+        info("Finish checking MmChIPseq data.")
+        info("Do fastqc stat now...")
+    else:
+        info("Skip rerunning FastQC; doing stats for the new data now...")
+    
+    
+    #new_dataset_path="/mnt/Storage/home/dcadmin/DC_results/HgChIPseq/dataset2597/"
+    count=0
+    #dic={}
+    peaklist=[]
+    #oufb=open('/mnt/Storage/home/dcadmin/DC_results/fastqc_summay_boxplot.r','w')
+    oufd=open(outdir+'add_new_fastqc_summay_density.r','w')
+    oufe=open(outdir+'fastqc_summay_ecdf.r','w')
+    ouft=open(outdir+'Sequence_Quality_score_summary.r','w')
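+    # Collect the >50% cumulative quality-score peak for every existing DC FastQC result.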
+    path="/mnt/Storage/home/dcadmin/DC_results/All_fastqc_result/"
+    dirs=os.listdir(path)
+    for d in dirs:
+        if not d.endswith(".zip"):
+            count=count+1
+            # inf is an absolute path, so there is no need to chdir into the result folder
+            inf=path+d+"/fastqc_data.txt"
+            #print inf
+            (seqlen,quality_dict)=parse_fastqc_data(inf)
+            total_reads=sum(quality_dict.values())
+            #print total_reads
+            n=0
+            for item in  sorted(quality_dict.items(),key=lambda e:e[0],reverse=True):
+                n=n+item[1]
+                percentage=n/float(total_reads)
+                #print percentage
+                if percentage > 0.5:
+                    peak=int(item[0])
+                    peaklist.append(peak)
+                    break
+                else:
+                    continue
+    
+        else:
+            pass
+
+    oufd.write("setwd('%s')\n" %outdir)
+    oufe.write("setwd('%s')\n" %outdir)
+    ouft.write("setwd('%s')\n" %outdir)
+    oufd.write("sequence_quality_score<-c(%s)\n" %str(peaklist)[1:-1])
+    oufe.write("sequence_quality_score<-c(%s)\n" %str(peaklist)[1:-1])
+    ouft.write("sequence_quality_score<-c(%s)\n" %str(peaklist)[1:-1])
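+    # Density plot of all DC quality scores; each new dataset is marked with a dashed vertical line.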
+    oufd.write("pdf('Sequence_quality_score_density_plot.pdf')\n")
+    oufd.write("plot(density(sequence_quality_score),main='Sequence Quality Score Density')\n")
+    bamfiles=process_new(new_dataset_path_list,path)
+    npeakl=get_new_scores(path,bamfiles)
+    col=['#FFB5C5','#5CACEE','#7CFC00','#FFD700','#8B475D','#8E388E','#FF6347','#FF83FA','#EEB422','#CD7054']
+    pch=[21,22,24,25,21,22,24,25,21,22,24,25,21,22,24,25]
+    i=0
+    for p in npeakl:
+    #print p
+        # wrap the colour index in case there are more new bam files than colours
+        oufd.write("abline(v=%d,lty=2,col='%s',lwd=2)\n" %(int(p),col[i % len(col)]))
+        i=i+1
+    oufd.write("legend('topright',c(%s),lty=2,col=c(%s),lwd=2)\n" %(str(bamfiles)[1:-1],str(col[:len(bamfiles)])[1:-1]))
+    oufd.write("dev.off()\n")
+    
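+    # Cumulative-percentage (ECDF) plot; each new dataset is marked as a point on the DC curve.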
+    oufe.write("ecdf(sequence_quality_score)->fn\n")
+    oufe.write("fn(sequence_quality_score)->density\n")
+    oufe.write("cbind(sequence_quality_score,density)->fndd\n")
+    oufe.write("fndd2<-fndd[order(fndd[,1]),]\n")
+    oufe.write("pdf('Sequence_quality_score_Cumulative_percentage.pdf')\n")
+    oufe.write("plot(fndd2,type='b',pch=18,col=2,main='Sequence Quality Score Cumulative Percentage')\n")
+    oufe.write("abline(v=25,lty=2,col='gray60')\n")
+    oufe.write("text(25,1,'cutoff', col = 'gray60')\n")
+    j=0
+    for p in npeakl:
+    #print p
+        oufe.write("points(%d,fn(%d),pch=%d,bg='%s')\n" %(int(p),int(p),int(pch[j % len(pch)]),col[j % len(col)]))
+        j=j+1
+    oufe.write("legend('topleft',c(%s),pch=c(%s),pt.bg=c(%s))\n" %(str(bamfiles)[1:-1],str(pch[:len(bamfiles)])[1:-1],str(col[:len(bamfiles)])[1:-1]))
+    oufe.write("dev.off()\n")
+    oufe.close()
+    oufd.close()
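+    # Summary script: write the quartile summary of all DC scores to Sequence_quality_score_summary.txt.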
+    ouft.write("ecdf(sequence_quality_score)->fn\n")
+    ouft.write("fn(sequence_quality_score)->density\n")
+    ouft.write("cbind(sequence_quality_score,density)->fndd\n")
+    ouft.write("write('#The summary of all sequence quality score in DC :','Sequence_quality_score_summary.txt')\n")
+    ouft.write("write(summary(fndd)[1:6],'Sequence_quality_score_summary.txt',append=T)\n")
+    ouft.close()
+    os.system('Rscript '+outdir+'add_new_fastqc_summay_density.r')
+    os.system('Rscript '+outdir+'fastqc_summay_ecdf.r')
+    os.system('Rscript '+outdir+'Sequence_Quality_score_summary.r')
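+    # Parse the R summary output and place each new dataset's peak score into its DC quartile,
+    # with a Pass/Fail call against the cutoff of 25.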
+    ropen=open(outdir+'Sequence_quality_score_summary.txt','rU')
+    topen=open(outdir+'Sequence_quality_score_summary_final.txt','w')
+    quater=[]
+    for liner in ropen:
+        if liner.startswith('Min.'):
+            quater.append(liner.strip().split(':')[1])
+        elif liner.startswith('1st Qu.'):
+            quater.append(liner.strip().split(':')[1])
+        elif liner.startswith('Median'):
+            quater.append(liner.strip().split(':')[1])
+        elif liner.startswith('3rd Qu.'):
+            quater.append(liner.strip().split(':')[1])
+        elif liner.startswith('Max.'):
+            quater.append(liner.strip().split(':')[1])
+        else:
+            pass
+    topen.write("#The summary of the sequence quality score:\n")
+    topen.write('\t1st Qu.\t2nd Qu.\t3rd Qu.\t4th Qu.\n')
+    topen.write('All in DC\t'+'-'.join(quater[0:2])+'\t'+'-'.join(quater[1:3])+'\t'+'-'.join(quater[2:4])+'\t'+'-'.join(quater[3:5])+'\n')
+    j=0
+    for p in npeakl:
+        topen.write("%s:" %bamfiles[j])
+        if p <= float(quater[1]):
+            if p<25:  ## Please update this cutoff according to the updated cumulative plot!
+                topen.write("\t"+str(p)+"(Fail)\t\t\t\n")
+            else:
+                topen.write("\t"+str(p)+"(Pass)\t\t\t\n")
+        elif float(quater[1])<p<=float(quater[2]):
+            if p<25:  ## Please update this cutoff according to the updated cumulative plot!
+                topen.write("\t\t"+str(p)+"(Fail)\t\t\n")
+            else:
+                topen.write("\t\t"+str(p)+"(Pass)\t\t\n")
+            
+        elif float(quater[2])<p<=float(quater[3]):
+            if p<25:
+                topen.write("\t\t\t"+str(p)+"(Fail)\t\n")
+            else:
+                topen.write("\t\t\t"+str(p)+"(Good)\t\n")
+        else:
+            topen.write("\t\t\t\t"+str(p)+"(Pretty Good)\n")
+        j=j+1
+    ropen.close()
+    topen.close()
+    
+    os.remove(outdir+'Sequence_quality_score_summary.txt')
+
+if __name__ == '__main__':
+
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupted me! ;-) See you!\n")
+        sys.exit(0)