Source

Cistrome-Harvard-dev2 / tools / expression / eiplot.py

"""
exptf.py
A galaxy tool to find the top-n highest expressed TFs
"""
import os.path
import sys
import tempfile
import os
import re
import string
import subprocess

rprog="""rplot=function(tdir, title, txtfname, logfpath, output1, output2)
#  rcall = "('%s','%s')" % (tdir, txtfname, logfpath)
# /home/c104611/cistrome/dist/galaxy/database/tmp
# /home/c104611/cistrome/src/gx/GX/SmallExpression.txt
# /home/c104611/cistrome/dist/galaxy/database/tmp/test.dat
{
sink(file=file(logfpath, "at"), type="message")
sink(file=file(logfpath, "at"), type="output")

newfiles = c()
newnames = c()
newtypes = c()

setwd(tdir)

library(oligo)
library(affy)
library(genefilter)
library(geneplotter)
library(limma)
library(RColorBrewer)
library(genefilter)

palette(brewer.pal(8, "Dark2"))
pdffname = file.path(tdir, title) #provides full file path

#for output
newnames = c(newnames, title)
newfiles = c(newfiles, pdffname)
newtypes = c(newtypes, 'pdf')

x <- read.table(output1,header=TRUE,sep="\t")
xx <- read.table(output2,header=TRUE,sep="\t")

require(graphics)

par(mfrow=c(3,1))
pdf(file=pdffname, paper='a4')

boxplot(x[,2],xx[,2], names= TRUE,las=1, cex.axis=0.7, main="Expression Intensity of two Factors", col=c("lightblue","lightgreen"))
hist(x[,2], col='lightblue', names= FALSE, xlab="Intensity", main="Distribution of Intensity Factor1")
hist(xx[,2], col='lightgreen', names= FALSE,xlab="Intensity",  main="Distribution of Intensity Factor2")

dev.off()

sink()
return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes))
}
"""

mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory

def findExpression(inputExprFilePath, inputSampleName, inputCompareGeneFilePath, inputPartialFilePath, refChoice, logFileName, output1, output2):

    #read infpath
    #drop header
    #get sselect sample name
    #Get Full/Partial Reference Genes and write into file
        #genes = extract 1st column (genes) in vector
        #exp_val = extract n-th column in vector
        #map(exp_val, gene)
        #sort exp_val
        #loop sorted_exp_val and print gene until sorted_exp_val < cutoff
    #write into log file as matrix

    #read expression index file
    f = file(inputExprFilePath, 'r')
    exprFilelines = f.readlines()
    f.close()      
   
    #find sample_name & remove if doubel quotes
    header_line =  string.strip(exprFilelines[0])

    samples = []
    x_re = re.compile(r'\t')
    headerItems = x_re.split(header_line)

    for (sample) in headerItems:
        sample = string.strip(sample, '"')
        samples.append(sample)

    sample_name = samples[inputSampleName-1]
    
    #drop header
    exprFilelines = exprFilelines[1:]

    #prepare mapping for ALL & Partial    
    expr_genes_arr = []
    exp_val_arr = []
    full_mapping = {}

    partial_exp_val_arr = []
    partial_mapping = {}

    if refChoice=="All":
        #when reference genes are ALL
        for (line) in exprFilelines:
            columns = line.strip().split()
            geneName = columns[0].strip('"')            
            value = float(columns[inputSampleName])
            expr_genes_arr.append(geneName)
            exp_val_arr.append(value)
            full_mapping[value] = geneName
    else:
        #when reference genes are partial
        #read (gene = expr_value) from expression index file
        fpartial = file(inputPartialFilePath, 'r')
        partial_genes = fpartial.readlines()
        fpartial.close()
        
        #Match inputPartialFileFenes with expression_index file genes and list out
        for (partialGeneName) in partial_genes:
            partialGeneName = partialGeneName.strip().strip('"')
            for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
                columns = eachLine.strip().split()
                exprGeneName = columns[0].strip('"')
                exprValue = float(columns[inputSampleName])
                #check and add partial gene
                if exprGeneName==partialGeneName:
                    partial_exp_val_arr.append(exprValue)
                    partial_mapping[exprValue] = exprGeneName

    #write mapping into logs
    out1 = file(output1, 'w')

    #write mapping. 1st Preference to ALL    
    if len(exp_val_arr)>0:  
        exp_val_arr.sort()
        exp_val_arr.reverse()
        out1.write("%s\t%s\n" % ("GeneName",sample_name ))
        n=0
        for (evalue) in exp_val_arr:
            out1.write("%s\t%f" % (full_mapping[evalue], evalue)) # maintains value & gene
            n=n+1
            #if n<len(exp_val_arr):
            out1.write("\n")
    else:
        
        partial_exp_val_arr.sort()
        partial_exp_val_arr.reverse()
        out1.write("%s\t%s\n" % ("GeneName",sample_name ))
        m=0
        for (pevalue) in partial_exp_val_arr:
            out1.write("%s\t%f" % (partial_mapping[pevalue], pevalue)) # maintains value & gene
            m=m+1            
            out1.write("\n")

    out1.close()

    #read compare gene lists expression index
    #read expression index file
    cf = file(inputCompareGeneFilePath, 'r')
    comp_genes = cf.readlines()
    cf.close()

    comp_exp_val_arr = []
    comp_mapping = {}
    #Match inputPartialFileFenes with expression_index file genes and list out
    for (cGeneName) in comp_genes:
        cGeneName = cGeneName.strip().strip('"')
        for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
            columns = eachLine.strip().split()
            cExprGeneName = columns[0].strip('"')
            cExprValue = float(columns[inputSampleName])
            #check and add partial gene
            if cExprGeneName==cGeneName:
                comp_exp_val_arr.append(cExprValue)
                comp_mapping[cExprValue] = cExprGeneName
                
    #write mapping into logs
    out2 = file(output2, 'w')

    #write mapping. 1st Preference to ALL
    if len(comp_exp_val_arr)>0:
        comp_exp_val_arr.sort()
        comp_exp_val_arr.reverse()
        out2.write("%s\t%s\n" % ("GeneName",sample_name ))
        i=0
        for (cevalue) in comp_exp_val_arr:
            i = i+1
            out2.write("%s\t%f" % (comp_mapping[cevalue], cevalue)) # maintains value & gene           
            out2.write("\n")
    out2.close()
       
            
def main():
    """called as 
<command interpreter="python">
exptf.py '$expVal' '$i' '$cutoff' '$logmeta'
</command>
    """
    nparm = 8
    appname = sys.argv[0]

   # print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
    
    if (len(sys.argv) < nparm):
        print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
        print 'usage: %s expression_values n out_file' % (appname)
        sys.exit(1)
        # short command line error
    title = sys.argv[1].strip()
    inputExprFilePath = sys.argv[2].strip()
    sampleName = int(sys.argv[3].strip())
    geneList1 =  sys.argv[4].strip()
    refChoice = sys.argv[5].strip()
    geneList2 = sys.argv[6].strip()
    logFileName = sys.argv[7].strip()
    
    #create temp directory
    title = title.strip(' ').replace('  ',' ').replace(' ','_')
    tdir = tempfile.mkdtemp(prefix=title)
    title = title + '.pdf'   

    #dirName = os.path.dirname(tdir)
    
    out1 = os.path.join(tdir, "out1.txt")
    out2 = os.path.join(tdir, "out2.txt")

    #remove temp files if already exists
    if os.path.exists(out1):
        os.remove(out1)
        
    if os.path.exists(out2):
        os.remove(out2)
    
    #create temp files
    output1_file = open(out1, "w")    
    output1_file.close()

    output2_file = open(out2, "w")    
    output2_file.close()

    #output1 = sys.argv[6].strip()
    #output2 = sys.argv[6].strip()

    print 'title:%s'% (title)
    print 'ExpressionFilePath:%s'% (inputExprFilePath)
    print 'Genes To be compared:%s'% (geneList1)
    print 'Choice:%s'% (refChoice)
    print 'Partial Genes:%s'% (geneList2)
    print 'logfilename:%s'% (logFileName)
    print 'output1:%s'% (out1)
    print 'output2:%s'% (out2)

    #write {gene expr_val) into tmp logfname
    #or
    #write {partial_gene expr_val) into tmp logFileName
    findExpression(inputExprFilePath, sampleName, geneList1, geneList2, refChoice, logFileName, out1, out2)
   
    # call R function & read return array
    rprogname = 'rplot'
    rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,inputExprFilePath,logFileName,out1,out2)
    print '##rcall=',rcall
    
    p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
    p.communicate(rprog + "\n" + rcall)
    makenew = getFilenames(tdir, title)

    newfiles = makenew[0]
    newnames = makenew[1]
    newtypes = makenew[2]

    #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process()
    logf = file(logFileName,'a')
    s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall)
    logf.write(s)
    try:
        outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))]
    except:
        outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1
    tmpString = '\n%s\t%s\n' % (tmp, tdir)
    logf.write(tmpString)
    logf.write('\n'.join(outlist))
    logf.write('\n')
    logf.close()
    
def getFilenames(tdir, title):
    return [[os.path.join(tdir, title)], [title], ['pdf']]

if __name__ == "__main__":
    main()