Wiki

Clone wiki

Flink / Estimating and plotting posterior distributions

Estimating and plotting posterior distributions

Estimation output

The estimation output is a file listing the parameter values samples during the MCMC chain. Each row of this file corresponds to one multi-dimensional parameter vector sampled from the posterior distribution. Each column lists the sampled parameter values.

An example of such a file looks like:

 iterations      Group_A_lnkappa Group_A_lnMu    Group_A_lnNu    Pop_A0_beta       Pop_A1_beta
 300100           -5.34993        -0.0780549      -0.405734       -12.4582        -4.44505
 300200           -5.34633        -0.0837091      -0.280782       -12.2368        -4.4878
 300300           -5.30717        -0.0776627      -0.395413       -10.8117        -4.44051
 300400           -5.31584        -0.078312       -0.276065       -10.5736        -4.48414

The file only contains the parameters that are estimated (see Parameter Estimation (task estimate)).

Plotting posterior distributions

We recommend using R to plot posterior distributions from the output of Flink.

Step 1) Load output file into R To load an output file "Finches_MCMC_hierarchicalParameters.txt" into R, use a command like:

data <- read.table("Finches_MCMC_hierarchicalParameters.txt", header=T);

Note that this may take a long time if the file is huge. But since only a few thousand values are required to plot posterior distributions, it is recommended to subsample the MCMC output using the argument thinning (see Parameter Estimation (task estimate)).

Step 2) Plot posterior distributions To plot the posterior distribution of the parameters, simply use the function density in R. For the kappa values of the groups, for instance:

 plot(density(data$Group_A_lnkappa),main=paste("lnkappa distribution"), xlab=expression(paste(log~kappa)), ylab="Posterior Density")
 lines(density(data$Group_B_lnkappa),col="green")
 legend("topright", inset=.05,  nome<-c('GroupA','GroupB'), col=c('black','green'), lwd=1.5,cex=1.2)

Assessing Convergence

Markov Chain Monte Carlo is a technique to generate samples from a particular distribution, in our case the multidimensional posterior distribution. It is important that the MCMC chain has converged.
Convergence can be assessed by looking at the trace of the MCMC for log-likelihood as well as different parameters. An example to see the convergence of the different parameters is given typing in R:

data <- read.table("Finches_MCMC_hierarchicalParameters.txt", header=T);
par(mfrow=c(1,3),mar=c(5.5,6,1,1.5),mgp=c(3.5,1.5,0))
plot(data$Group_A_alpha_max, type='l', ylab="Group_A_alpha_max");
plot(data$Pop_A0_beta, type='l', ylab="Pop_A0_beta");
plot(data$Group_A_lnkappa, type='l', ylab="Group_A_lnkappa");

This will result in a picture like this one:

example.jpg

Plotting coefficients of selection and print loci under selection

The information to determine if a locus is under selection are included in the files Posterior_alphas_group_"number".txt for the groups and in Posterior_A.txt for the higher hierarchy. An example of a script to plot genome-wide the minus the logarithm in base 10 of the false discovery rate (FDR) for a model of 2 groups and the higher hierarchy is provided below

 args <- commandArgs(trailingOnly = TRUE);
 name <- args[1];
 rm(args);

 pdf(paste("plotSel",name,".pdf",sep=""),width = 14, height = 3.6);
 layout(matrix(c(1,1,1,2,2,2,3,3,3), 9,1, byrow = TRUE))
 par(mar=c(1,6.5,1.0,2.0),mgp=c(3.5,1.5,0),oma=c(6,0,0,0))

 library(data.table)
 freq_pos<-read.table(paste("inputFlink.txt",sep=""),header=F,skip = 2)
 numloci<-nrow(freq_pos)
 divPostProb<-vector(mode="numeric",length=numloci)
 balPostProb<-vector(mode="numeric",length=numloci)

 calcDivSel <- function(data,numloci,divPostProb) {
   rows<-nrow(data)
   divPostProb=(colSums(data[,2:(numloci+1)]>0))/(rows+1)
 }

 calcBalSel <- function(data,numloci,balPostProb) {
   rows<-nrow(data)
   balPostProb=(colSums(data[,2:(numloci+1)]<0))/(rows+1)
 }

 plotdata <- function(freq,divPostProb,balPostProb,namepop) {
   plot(-log(1.0-divPostProb,base=10),main=,ylab="",xlab="",cex.main=2.5,las=1, type='l',ylim=c(0,4),cex.lab=2.5,cex.axis=2,xaxs="i",yaxs="i",col="orange2",xaxt="n",yaxt="n")
   lines(-log(1.0-balPostProb,base=10),col="dodgerblue")
   legend("topright",legend=namepop, bty ="n", pch=NA,cex=1.6,inset=c(-0.0001,-0.15)) 
   mtext(expression(paste(-log[10]~q)), side=2, line=4,cex=1.3)
   axis(2, seq(0,4,2),las=2,cex.axis=2)
   abline(h=2.0,lty="dashed") 
 }

 data_flink_g<-fread("Posterior_A.txt",header=T)
 divPostProb<-calcDivSel(data_flink_g,numloci,divPostProb)
 balPostProb<-calcBalSel(data_flink_g,numloci,balPostProb)
 plotdata(freq_pos$V2,divPostProb,balPostProb,"Higher hierarchy")

 data_flink_g<-fread("Posterior_alphas_group_0.txt",header=T)
 divPostProb<-calcDivSel(data_flink_g,numloci,divPostProb)
 balPostProb<-calcBalSel(data_flink_g,numloci,balPostProb)
 plotdata(freq_pos$V2,divPostProb,balPostProb,"Group 0")

 data_flink_g<-fread("Posterior_alphas_group_1.txt",header=T)
 divPostProb<-calcDivSel(data_flink_g,numloci,divPostProb)
 balPostProb<-calcBalSel(data_flink_g,numloci,balPostProb)
 plotdata(freq_pos$V2,divPostProb,balPostProb,"Group 1")
 axis(1, las=TRUE, cex.axis=2.0)
 mtext(paste("Locus",sep=""), side=1, line=4,cex=1.6,outer=T)

 dev.off();

Script to detect loci under selection. You need to specify the selection (divergent or balancing) and if you want to analyze the higher hierarchy you need to write Hierarchy as group

 args <- commandArgs(trailingOnly = TRUE);
 name <- args[1];  #chromosome or group of scaffolds
 group <- args[2];  #give a name for the group (dolphin example: Coastal or Pelagic)
 index <- args[3];  #index of the group in the Flink output. It starts from 0
 sel <-args[4];  #divergent or balancing
 prob <-as.numeric(args[5]); #suggested value 0.99 that corresponds to FDR=0.01
 rm(args);

 if((sel != "divergent") && (sel != "balancing")) {
    stop("type a proper kind of selection: divergent or balancing!")
 }

 library(data.table)
 freq_pos<-read.table(paste("reg",name,"/inputFlink.txt",sep=""),header=F,skip = 2)
 numloci<-nrow(freq_pos)
 if(group == "Hierarchy"){
    data_flink<-read.table(paste("reg",name,"/Posterior_A.txt",sep=""),header=T)
 }else{
    data_flink<-read.table(paste("reg",name,"/Posterior_alphas_group_",index,".txt",sep=""),header=T)
 }

 flink_sel<-vector(mode="numeric",length=numloci)
 rows<-nrow(data_flink)

 if(sel == "divergent"){
    flink_sel=colSums(data_flink[,2:(numloci+1)]>0)/rows
 }else{
    flink_sel=colSums(data_flink[,2:(numloci+1)]<0)/rows
 }

 lim=0
 locus=1
 numreg=0
 while(locus <= (numloci-1)){
 j=0
 if(flink_sel[locus] >= prob){
    selection=sel
    j=j+1
    numreg=numreg+1
    while((flink_sel[locus+j] >= 0.99) & (locus+j<=numloci)){
        j=j+1
    }
    start=as.numeric(freq_pos$V2[locus])
    end=as.numeric(freq_pos$V2[locus+j-1])
    diff=end-start
    if(diff>=lim){
        i=1
        while(i<=j){
             write(c(name, numreg, as.character(freq_pos$V1[locus]), freq_pos$V2[locus], flink_sel[locus], selection, group), file=paste("coefSel",sel,group,".txt",sep=""), ncolumns=7, append=TRUE, sep = "\t")
             locus=locus+1
             i=i+1
             }
       }
    }
    locus=locus+1
 }

Example of how to run it:

 Rscript findSel.r chr1 Pelagic 0 divergent 0.99

Updated