Commits

Ross Lazarus committed 9e204bc

First working version of an automated 2x2 factorial model generator
Requires user to specify the two main models and the difference is also returned - the interaction

  • Participants
  • Parent commits 578ae68

Comments (0)

Files changed (1)

tools/rgenetics/rgedgeRFactorial.xml

   <description>digital DGE Factorial design</description>
   <command interpreter="python">
      rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR" 
-    --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes"
+    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
   </command>
   <inputs>
     <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
        help="Use the DGE matrix preparation tool to create these matrices from BAM files and a BED file of contigs"/>
-    <param name="title" type="text" value="DGE" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
+    <param name="title" type="text" value="Factorial DGE" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
       <sanitizer invalid_char="">
         <valid initial="string.letters,string.digits"><add value="_" /> </valid>
       </sanitizer>
     </param>
   </inputs>
   <outputs>
-    <data format="tabular" name="outtab" label="${input1.name}_${title}.xls"/>
+    <data format="tabular" name="outtab1" label="${treatment1_name}-${control1_name}_topTable.xls"/>
+    <data format="tabular" name="outtab2" label="${treatment2_name}-${control2_name}_topTable.xls"/>
+    <data format="tabular" name="outtab3" label="${treatment1_name}-${control1_name}-${treatment2_name}-${control2_name}_topTable.xls"/>
     <data format="html" name="html_file" label="${input1.name}_${title}.html"/>
   </outputs>
 <configfiles>
 <configfile name="runme">
 # edgeR.Rscript
-# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
+# new nov 2012 for 2x2 factorial designs 
+# updated nov 2011 for R 2.14.0 and edgeR 2.4.0 by ross
 # Performs DGE on a count table containing n replicates of two conditions
 #
-# Parameters
-#
-# 1 - Output Dir
-
-# Original edgeR code by: S.Lunke and A.Kaspi
 
 reallybig = log10(.Machine\$double.xmax)
 reallysmall = log10(.Machine\$double.xmin)
 require('gplots')
 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
 {
-# Perform clustering for significant pvalues after controlling FWER
     samples = colnames(cmat)
     gu = unique(group)
     if (length(gu) == 2) {
     print(paste('pcols',pcols))
     gn = rownames(cmat)
     dm = cmat[(! is.na(gn)),] 
-    # remove unlabelled hm rows
     nprobes = nrow(dm)
-    # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
     if (nprobes &gt; nsamp) {
       dm =dm[1:nsamp,]
-      #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
     }
     newcolnames = substr(colnames(dm),1,20)
     colnames(dm) = newcolnames
 
 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
 {
-    # for 2 groups only was
-    #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
-    #pcols = unlist(lapply(group,col.map))
     gu = unique(group)
     colours = rainbow(length(gu),start=0.3,end=0.6)
     pcols = colours[match(group,gu)]
 }
 
 qqPlot = function(descr='Title',pvector, ...)
-# stolen from https://gist.github.com/703512
 {
     o = -log10(sort(pvector,decreasing=F))
     e = -log10( 1:length(o)/length(o) )
         }
         
 boxPlot = function(rawrs,cleanrs,maint,myTitle)
-{   # 
+{    
         newcolnames = substr(colnames(rawrs),1,15)
         colnames(rawrs) = newcolnames
         newcolnames = substr(colnames(cleanrs),1,15)
         pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_')
         defpar = par(no.readonly=T)
         pdf(pdfname,height=6,width=8)
-        par(mfrow=c(1,2)) # 2 rows 1 col
-        # layout(matrix(c(1,1,0,2), 2, 2, byrow = TRUE))
+        par(mfrow=c(1,2)) 
         boxplot(rawrs,main=paste('Before:',maint),col="maroon",las=3,cex.axis=0.4)
         grid(col="blue")
         lrs = log(cleanrs,10) 
 }
 
 cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
+{   
         pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
         defpar = par(no.readonly=T)
         pdf(pdfname)
         par(mfrow=c(2,1))
         lrs = log(rawrs,10) 
         lim = max(lrs)
-        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
+        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="Reads (log)",
              ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
         grid(col="blue")
         lrs = log(cleanrs,10) 
-        hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
+        hist(lrs,breaks=100,main=paste('After:',maint),xlab="Reads (log)",
              ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
         grid(col="blue")
         dev.off()
 }
 
 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
+{   
         pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
         pdf(pdfname)
         par(mfrow=c(2,1))
 
 
 
-edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priorn=5,fdrthresh=0.05,outputdir='.',
+edgeIt = function (Count_Matrix,group,outtab1,outtab2,outtab3,fdrtype='fdr',priorn=5,fdrthresh=0.05,outputdir='.',
                    myTitle='edgeR',libSize=c(),useQuantile=T,filterquantile=0.2) {
     
-    # Error handling
     if (length(unique(group))!=4){
     print.noquote("Number of conditions identified in experiment does not equal 4 - full 2x2 factorial not possible")
     q()
     }
     require(edgeR)
     mt = paste(unlist(strsplit(myTitle,' ')),collapse="")
-    expfact = factor(group,levels=unique(group)) # assume has been ordered so keep order
+    expfact = factor(group,levels=unique(group)) 
     sampleTypes = levels(expfact)
     mycontrast = NA
     colnamesDesign = list()
     nscut = round(ncol(Count_Matrix)/2)
     colTotmillionreads = colSums(Count_Matrix)/1e6
     rawrs = rowSums(Count_Matrix)
-    nonzerod = Count_Matrix[(rawrs &gt; 0),] # remove all zero count genes
+    nonzerod = Count_Matrix[(rawrs &gt; 0),] 
     nzN = nrow(nonzerod)
     nzrs = rowSums(nonzerod)
     zN = allN - nzN
-    print.noquote('# Quantiles for non-zero row counts:')
+    print.noquote('Quantiles for non-zero row counts:')
     print.noquote(quantile(nzrs,probs=seq(0,1,0.1)))
     if (useQuantile == "T")
     {
-      gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) &gt1= 1) &gt;= nscut
+      gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) &gt;= 1) &gt;= nscut
       lo = colSums(Count_Matrix[!gt1rpin3,])
       workCM = Count_Matrix[gt1rpin3,]
       cleanrs = rowSums(workCM)
       maint = paste('Filter below',filterquantile,'quantile')
     }
     cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-    print.noquote(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')))
+    print.noquote(paste("Total low count contigs per sample = ",paste(lo,collapse=',')))
     rsums = rowSums(workCM)
-    # Setup DGEList object
     allttn = c()
     colnamesDesign = list()
     l = sampleTypes
     mydesign = model.matrix(~0+expfact)
     colnames(mydesign) = sampleTypes   
     mycontrast = makeContrasts(contrasts=colnamesDesign,levels=mydesign)
-    print.noquote('## design matrix=')
+    print.noquote('Design matrix=')
     print(mydesign,quote=F)
-    print.noquote('## contrast matrix=')
+    print.noquote('Contrast matrix=')
     print(mycontrast,quote=F)
-    print.noquote('## samples=')
+    print.noquote('Samples=')
     print(colnames(workCM),quote=F)
     DGEList = DGEList(counts=workCM, group = group)
-    #Extract T v C names
-    print.noquote(paste('prior.n =',priorn))
-    # Outfile name - we need something more predictable than
-    # outfile = paste(Out_Dir,"/",TName,"_Vs_",CName, sep="")
-    # since it needs to be renamed and squirted into the history so added as a parameter
+    print.noquote(paste("prior.df =",priorn))
     DGEList = calcNormFactors(DGEList)
     DGEList = estimateCommonDisp(DGEList)
-    comdisp = DGEList$common.dispersion
+    comdisp = DGEList\$common.dispersion
     DGEList = estimateTagwiseDisp(DGEList,prior.df=priorn, trend="movingave", grid.length=1000)
     estpriorn = getPriorN(DGEList)
     print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-    normData = (1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList)))
+    normData = (1e+06 * DGEList\$counts/expandAsMatrix(DGEList\$samples\$lib.size, dim(DGEList)))
     colnames(normData) = paste( colnames(normData),'N',sep="_")
     boxPlot(rawrs=log(nonzerod),cleanrs=log(normData),maint='TMM Normalisation (log scale)',myTitle=myTitle)
-    # Plot MDS
     ug = unique(group)
-    sample_colors =  match(DGEList$samples$group,ug) #ifelse (DGEList$samples$group==group[1], 1, 2)
-    pdf(paste(outputdir,paste(fname,"MDSplot.pdf",sep='_'),sep='/'))
+    sample_colors =  match(DGEList\$samples\$group,ug) 
+    pdf(paste(outputdir,paste(mt,"MDSplot.pdf",sep='_'),sep='/'))
     plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
     legend(x="topleft", legend = c(group[1],group[length(group)]),col=c(1,2), pch=19)
     grid(col="blue")
     print(mydesign)
     DGLM = glmFit(DGEList, mydesign)
     goodness = gof(DGLM, pcutoff=fdrthresh)
-    if (sum(goodness$outlier) &gt; 0) {
-        print.noquote('## GLM outliers:')
-        rownames(DE)[goodness$outlier]
-        z = limma::zscoreGamma(goodness$gof.statistic, shape=goodness$df/2, scale=2)
+    if (sum(goodness\$outlier) &gt; 0) {
+        print.noquote('GLM outliers:')
+        rownames(DE)[goodness\$outlier]
+        z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
         pdf(paste(outputdir,paste(fname,"GoodnessofFit.pdf",sep='_'),sep='/'))
         qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
         abline(0,1,lwd=3)
-        points(qq$x[goodness$outlier],qq$y[goodness$outlier], pch=16, col="dodgerblue")
+        points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
         dev.off()
-    } else { print('## No GLM fit outlier genes found\n')}
-    options(width=132)
+    } else { print('No GLM fit outlier genes found\n')}
+    outtabs = c(outtab1,outtab2,outtab3)
     for (coeff in c(1:3))
         {
         DE = glmLRT(DGLM,contrast=mycontrast[,coeff])
         cont = paste(unlist(strsplit(colnames(mycontrast)[coeff],' ')),collapse="")
-        #Prepare our output file
-        output = cbind(Name=as.character(rownames(DGEList$counts)),
-        DE$table,adj.p.value=p.adjust(DE$table$PValue, method=fdrtype),
-        Dispersion=DGEList$tagwise.dispersion,totreads=rsums,normData,
-        DGEList$counts)
-        soutput = output[order(output$PVal),] # sorted into p value order - for quick toptable
-        nreads = soutput$totreads # ordered same way
-        print('# writing output')
-        #Write output
-        write.table(soutput,paste(mt,'_',cont,'_topTable.xls',sep=''), quote=FALSE, sep="\t",row.names=F)
-        print.noquote(paste("# Top tags for",cont))
+        output = cbind(Name=as.character(rownames(DGEList\$counts)),
+        DE\$table,adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+        Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData,
+        DGEList\$counts)
+        soutput = output[order(output\$PVal),] 
+        nreads = soutput\$totreads 
+        write.table(soutput,outtabs[coeff], quote=FALSE, sep="\t",row.names=F)
+        print.noquote(paste("Top tags for",cont))
         genecards="&lt;a href='http://www.genecards.org/index.php?path=/Search/keyword/"
         tt = topTags(DE,n=nrow(DE))
-        rn = rownames(tt$table)
+        rn = rownames(tt\$table)
         rn = paste(genecards,rn,"'&gt;",rn,'&lt;/a&gt;',sep="")
-        tt$table = cbind(tt$table,ntotreads=nreads,geneCardslink=rn) # add to end so table isn't laid out strangely
+        tt\$table = cbind(tt\$table,ntotreads=nreads,geneCardslink=rn) 
         print.noquote(tt[1:50,])
-        # Plot MAplot
-        deTags = rownames(output[output$adj.p.value &lt; fdrthresh,])
+        deTags = rownames(output[output\$adj.p.value &lt; fdrthresh,])
         nsig = length(deTags)
-        print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
+        print(paste(nsig,'tags significant at adj p=',fdrthresh),quote=F)
         outSmear = paste(outputdir,paste(mt,cont,"Smearplot.pdf",sep='_'),sep='/')
         outMain = paste("Smear Plot",cont,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-        smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
-        qqPlot(descr=paste(myTitle,cont),pvector=DE$table$PValue)
+        smearPlot(DGEList=DE,deTags=deTags, outSmear=outSmear, outMain = outMain)
+        qqPlot(descr=paste(myTitle,cont),pvector=DE\$table\$PValue)
         }
-}       #Done
-
+}
 
 rossDecide = function (object, adjust.method="BH", p.value=0.05,verbose=T) 
-    # modified from AgiMicroRna code rml august 2012
 {
-    if (!is(object, "DGEExact") & !is(object, "DGELRT")) 
+    if (!is(object, "DGEExact") &amp; !is(object, "DGELRT")) 
         stop("Need DGEExact or DGELRT object")
     DE = decideTestsDGE(object, adjust.method = adjust.method, p.value = p.value)
     sumDE = summary(DE)
 options(width=512)
 Out_Dir = "$html_file.files_path"
 Input =  "$input1"
-TreatmentName1 = "$treatment1_name"
-TreatmentCols1 = "$Treat1_cols"
-ControlName1 = "$control1_name"
-ControlCols1= "$Control1_cols"
-TreatmentName2 = "$treatment2_name"
-TreatmentCols2 = "$Treat2_cols"
-ControlName2 = "$control2_name"
-ControlCols2= "$Control2_cols"
-outputfilename = "$outtab"
+Treatment1Name = "$treatment1_name"
+Treatment1Cols = "$Treat1_cols"
+Control1Name = "$control1_name"
+Control1Cols= "$Control1_cols"
+Treatment2Name = "$treatment2_name"
+Treatment2Cols = "$Treat2_cols"
+Control2Name = "$control2_name"
+Control2Cols= "$Control2_cols"
+outtab1 = "$outtab1"
+outtab2 = "$outtab2"
+outtab3 = "$outtab3"
 fdrtype = "$fdrtype"
 priorn = $priorn
 fdrthresh = $fdrthresh
 useNDfilt = "$useNDfilt"
-fQ = $fQ # non-differential centile cutoff
+fQ = $fQ 
 myTitle = "$title"
+TCols1           = as.numeric(strsplit(Treatment1Cols,",")[[1]])-1
+CCols1           = as.numeric(strsplit(Control1Cols,",")[[1]])-1 
+TCols2           = as.numeric(strsplit(Treatment2Cols,",")[[1]])-1
+CCols2           = as.numeric(strsplit(Control2Cols,",")[[1]])-1 
+print.noquote(paste('# got TCols1=',paste(TCols1,collapse=',')))
+print.noquote(paste('# got CCols1=',paste(CCols1,collapse=',')))
+print.noquote(paste('# got TCols2=',paste(TCols2,collapse=',')))
+print.noquote(paste('# got CCols2=',paste(CCols2,collapse=',')))
 
-# Create output dir if non existent
 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')                           #Load tab file assume header
+Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')                           
 rn = rownames(Count_Matrix)
 nsamp = length(colnames(Count_Matrix))
 islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
+LibSizes = Count_Matrix[subset(rn,islib),][1] 
 Count_Matrix = Count_Matrix[subset(rn,! islib),]
 group = rep('',nsamp)
 for (i in c(1:nsamp)) {
          else {s2 = Control2Name}
     group[i] = paste(s1,s2,sep='_')
 }
-if (priorn &lt;= 0) {priorn = ceiling(20/(length(group)-1))} # estimate prior.n if not provided
-# see http://comments.gmane.org/gmane.comp.lang.r.sequencing/2009
+if (priorn &lt;= 0) {priorn = ceiling(20/(length(group)-1))} 
 cn = colnames(Count_Matrix)
-# colnames are LTAR_AAV_CTRL_14dfastq...
-results = edgeIt(Count_Matrix,group=group,outputfilename,fdrtype,priorn,fdrthresh,
-                 Out_Dir,myTitle,libSize=c(),useQuantile=useQuantile,filterquantile=fQ) #Run the main function
-# for the log
+results = edgeIt(Count_Matrix,group=group,outtab1=outtab1,outtab2=outtab2,outtab3=outtab3,fdrtype,priorn,fdrthresh,
+                 Out_Dir,myTitle,libSize=c(),useQuantile=useNDfilt,filterquantile=fQ) 
+options(width=80)
 sessionInfo()
 
 </configfile>
 **What it does**
 
 Performs digital gene expression factorial analysis between two factors - typically a treatment applied to 2 different cell types, or two treatments applied in a factorial design.
-Data with the four groups is supplied as a count matrix.
+Data with the four groups is supplied as a count matrix and the two primary comparisons are defined by selecting samples to represent
+control and treatment (whatever that means) for each comparison. The interaction is then defined as the difference between those two 
+comparisons. All comparisons are reported as separate tabular spreadsheets ordered by p value and a comprehensive summary is
+provided in the html output
 
 **Input**