1. cistrome
  2. Cistrome-Applications-Harvard

Source

Cistrome-Applications-Harvard / cistrome-extra-apps / Scripts / conservation_plot.py

#!/usr/bin/env python
# Time-stamp: <2011-07-15 21:12:20 Jian Ma>

"""Description: Draw correlation plot for many wiggle files.

Copyright (c) 2008 Tao Liu <taoliu@jimmy.harvard.edu>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).

@status:  experimental
@version: $Revision$
@author:  Tao Liu
@contact: taoliu@jimmy.harvard.edu
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
import re
import logging
import subprocess
import math
from optparse import OptionParser

from CistromeAP.taolib.CoreLib.Parser import WiggleIO, BedIO
from CistromeAP.taolib.CoreLib.BasicStat.Func import * 

try:
    from bx.bbi.bigwig_file import BigWigFile
except:
    sys.stderr.write("Need bx-python!")
    sys.exit()

# ------------------------------------
# constants
# ------------------------------------
logging.basicConfig(level=20,
                    format='%(levelname)-5s @ %(asctime)s: %(message)s ',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    stream=sys.stderr,
                    filemode="w"
                    )
#bigWigSummary = 'bigWigSummary'

# ------------------------------------
# Misc functions
# ------------------------------------

error  = logging.critical		# function alias
warn   = logging.warning
debug  = logging.debug
info   = logging.info

# ------------------------------------
# Main function
# ------------------------------------
def main():
    usage = "usage: %prog <-d path> [options] <bed files> ..."
    description = "Draw conservation plot for many bed files."
    
    optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False)
    optparser.add_option('-H','--height', dest='height',type='int',default=10, help="height of plot")
    optparser.add_option('-W','--width',dest='width',type='int',default=10, help="width of plot")
    optparser.add_option('-w',dest='w',type='int',default=1000, help="window width centered at middle of bed regions,default: 1000")    
    optparser.add_option('-t','--title',dest='title',help="title of the figure. Default: 'Average Phastcons around the Center of Sites'",default= 'Average Phastcons around the Center of Sites')
    optparser.add_option('-d','--phasdb',dest='phasdb',help= 'The directory to store phastcons scores in the server')
    optparser.add_option("-l","--bed-label",dest="bedlabel",type="string",action="append",
                         help="the BED file labels in the figure. No space is allowed. This option should be used same times as -w option, and please input them in the same order as BED files. default: will use the BED file filename as labels.")
    optparser.add_option("-h","--help",action="help",help="Show this help message and exit.")
    
    (options,bedfiles) = optparser.parse_args()
    options.pf_res = options.w / 100 # get 100 points to plot
    options.w = options.pf_res * 100 # trim

    bedfiles = map(os.path.abspath,bedfiles)
    bedfilenames = map(os.path.basename,bedfiles)

    bedfilenum = len(bedfiles)

    if bedfilenum < 1 or not options.phasdb:
        optparser.print_help()
        sys.exit(1)

    if options.bedlabel and len(options.bedlabel) == bedfilenum:
        bedlabel = options.bedlabel
    else:                               # or use the filename
        bedlabel = map(lambda x:os.path.basename(x),bedfiles)

    if options.height < 10:
        error("Height can not be lower than 10!")
        sys.exit(1)
    if options.width < 10:
        error("Width can not be smaller than 10!")
        sys.exit(1)

    # check the files
    for f in bedfiles:
        if not os.path.isfile(f):
            error("%s is not valid!" % f)
            sys.exit(1)

    # check phastcons db
    if not os.path.isdir(options.phasdb):
        error("%s is not valid!" % options.phasdb)
        sys.exit(1)

    # change wd to phastcons db path
    olddir = os.path.abspath('.')
    os.chdir(options.phasdb)

    phas_chrnames = []
    
    files_phasdb = os.listdir('.')
    for file_phasdb in files_phasdb:
        if file_phasdb.endswith('.bw'):
            name = file_phasdb.rstrip('.bw')
            phas_chrnames.append(name)

    if not phas_chrnames:
        error("%s has no valid phastcons db bw files!" % options.phasdb)
        sys.exit(1)
        
    info("number of bed files: %d" % bedfilenum)

    avgValues = []

    # for each bed file
    for f in bedfiles:
        info("extract phastcons scores using %s" % f)
        scores = extract_phastcons(f,phas_chrnames, options.w, options.pf_res)
        avgValues.append(scores)
    makeBmpFile(avgValues,olddir,options.height,options.width,options.w,options.pf_res,options.title,bedlabel)

def extract_phastcons ( bedfile, phas_chrnames, width, pf_res ):
    """Extract phastcons scores from a bed file.

    Return the average scores
    """
    info("read bed file...")
    bfhd = open(bedfile)
    bed = BedIO.parse_BED(bfhd)

    # calculate the middle point of bed regions then extend left and right by 1/2 width
    bchrs = bed.peaks.keys()
    bchrs.sort()

    chrs = []
    for c in phas_chrnames:
        if c in bchrs:
            chrs.append(c)

    sumscores = []
    for chrom in chrs:
        info("processing chromosome: %s" %chrom)
        pchrom = bed.peaks[chrom]
        bw = BigWigFile(open(chrom+'.bw', 'rb'))
        for i in range(len(pchrom)):
            mid = int((pchrom[i][0]+pchrom[i][1])/2)
            left = int(mid - width/2)
            right = int(mid + width/2)
            
            if left < 0:
                left = 0
                right = width
            
            summarize = bw.summarize(chrom, left, right, width/pf_res)
            if not summarize:
                continue
            dat = summarize.sum_data / summarize.valid_count
            #dat = dat.strip().split('\t')
            sumscores.append(dat)
            
    sumscores = map(list, zip(*sumscores))
    sumscores = [[t2 for t2 in t if not math.isnan(t2)] for t in sumscores]
    try:
        conscores = [sum(t)/len(t) for t in sumscores]
    except ZeroDivisionError:
        conscores = [0] * (width/pf_res)

    return  conscores
        
def makeBmpFile(avgValues, wd, h,w, width, pf_res, title, bedlabel):
    
    #creating R file in which to write the rscript which defines the correlation plot
    #create and save the file in the current working directory

    fileName = os.path.join(wd, 'tmp')
    rFile = open(fileName+'.R','w')
    bmpname = fileName+'.pdf'
    rscript = 'sink(file=file("/dev/null", "w"), type="message")\n'
    rscript += 'sink(file=file("/dev/null", "w"), type="output")\n'    
    rscript += 'pdf("%s",height=%d,width=%d)\n' %(bmpname,h,w)
    xInfo = range(int(-width/2),int(width/2), pf_res)
    rscript += 'x<-c('+','.join(map(str,xInfo[:-1]))+')\n' # throw the last point which may be buggy
    for i in range(len(avgValues)):
        avgscores = avgValues[i]
        tmpname = 'y'+str(i)
        rscript += tmpname+'<-c('+','.join(map(str,avgscores[:-1]))+')\n' # throw the last point which may be buggy

    tmplist = []
    for i in range(len(avgValues)):
        tmplist.append( "y%d" % i )
    
    rscript += "ymax <- max("+ ",".join(tmplist) +")\n"
    rscript += "ymin <- min("+ ",".join(tmplist) +")\n"    
    rscript += "yquart <- (ymax-ymin)/4\n"

    rscript += 'plot(x,y0,type="l",col=rainbow(%d)[1],main=\"%s\",xlab="Distance from the Center (bp)",ylab="Average Phastcons",ylim=c(ymin-yquart,ymax+yquart))\n' % (len(avgValues),title)
    for i in range(1,len(avgValues)):
        rscript += 'lines(x,y'+str(i)+',col=rainbow(%d)[%d])\n' % (len(avgValues),i+1)
    rscript += 'abline(v=0)\n'
    legend_list = map(lambda x:"'"+x+"'", bedlabel)
    rscript += 'legend("topright",c(%s),col=rainbow(%d),lty=c(%s))\n' % (','.join(legend_list),len(avgValues),','.join(['1']*len(avgValues)))
        
    rscript += 'dev.off()\n'
    rFile.write(rscript)
    rFile.close()
    #executing the R file and forming the pdf file
    data = subprocess.call(['Rscript',fileName+'.R'])

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt me! ;-) See you!\n")
        sys.exit(0)