Wiki
Clone wikiFlink / Estimating and plotting posterior distributions
Estimating and plotting posterior distributions
Estimation output
The estimation output is a file listing the parameter values samples during the MCMC chain. Each row of this file corresponds to one multi-dimensional parameter vector sampled from the posterior distribution. Each column lists the sampled parameter values.
An example of such a file looks like:
iterations Group_A_lnkappa Group_A_lnMu Group_A_lnNu Pop_A0_beta Pop_A1_beta
300100 -5.34993 -0.0780549 -0.405734 -12.4582 -4.44505
300200 -5.34633 -0.0837091 -0.280782 -12.2368 -4.4878
300300 -5.30717 -0.0776627 -0.395413 -10.8117 -4.44051
300400 -5.31584 -0.078312 -0.276065 -10.5736 -4.48414
The file only contains the parameters that are estimated (see Parameter Estimation (task estimate)).
Plotting posterior distributions
We recommend using R to plot posterior distributions from the output of Flink.
Step 1) Load output file into R To load an output file "Finches_MCMC_hierarchicalParameters.txt" into R, use a command like:
data <- read.table("Finches_MCMC_hierarchicalParameters.txt", header=T);
Note that this may take a long time if the file is huge. But since only a few thousand values are required to plot posterior distributions, it is recommended to subsample the MCMC output using the argument thinning (see Parameter Estimation (task estimate)).
Step 2) Plot posterior distributions To plot the posterior distribution of the parameters, simply use the function density in R. For the kappa values of the groups, for instance:
plot(density(data$Group_A_lnkappa),main=paste("lnkappa distribution"), xlab=expression(paste(log~kappa)), ylab="Posterior Density")
lines(density(data$Group_B_lnkappa),col="green")
legend("topright", inset=.05, nome<-c('GroupA','GroupB'), col=c('black','green'), lwd=1.5,cex=1.2)
Assessing Convergence
Markov Chain Monte Carlo is a technique to generate samples from a particular distribution, in our case the multidimensional posterior distribution. It is important that the MCMC chain has converged.
Convergence can be assessed by looking at the trace of the MCMC for log-likelihood as well as different parameters. An example to see the convergence of the different parameters is given typing in R:
data <- read.table("Finches_MCMC_hierarchicalParameters.txt", header=T);
par(mfrow=c(1,3),mar=c(5.5,6,1,1.5),mgp=c(3.5,1.5,0))
plot(data$Group_A_alpha_max, type='l', ylab="Group_A_alpha_max");
plot(data$Pop_A0_beta, type='l', ylab="Pop_A0_beta");
plot(data$Group_A_lnkappa, type='l', ylab="Group_A_lnkappa");
This will result in a picture like this one:
Plotting coefficients of selection and print loci under selection
The information to determine if a locus is under selection are included in the files Posterior_alphas_group_"number".txt for the groups and in Posterior_A.txt for the higher hierarchy. An example of a script to plot genome-wide the minus the logarithm in base 10 of the false discovery rate (FDR) for a model of 2 groups and the higher hierarchy is provided below
args <- commandArgs(trailingOnly = TRUE);
name <- args[1];
rm(args);
pdf(paste("plotSel",name,".pdf",sep=""),width = 14, height = 3.6);
layout(matrix(c(1,1,1,2,2,2,3,3,3), 9,1, byrow = TRUE))
par(mar=c(1,6.5,1.0,2.0),mgp=c(3.5,1.5,0),oma=c(6,0,0,0))
library(data.table)
freq_pos<-read.table(paste("inputFlink.txt",sep=""),header=F,skip = 2)
numloci<-nrow(freq_pos)
divPostProb<-vector(mode="numeric",length=numloci)
balPostProb<-vector(mode="numeric",length=numloci)
calcDivSel <- function(data,numloci,divPostProb) {
rows<-nrow(data)
divPostProb=(colSums(data[,2:(numloci+1)]>0))/(rows+1)
}
calcBalSel <- function(data,numloci,balPostProb) {
rows<-nrow(data)
balPostProb=(colSums(data[,2:(numloci+1)]<0))/(rows+1)
}
plotdata <- function(freq,divPostProb,balPostProb,namepop) {
plot(-log(1.0-divPostProb,base=10),main=,ylab="",xlab="",cex.main=2.5,las=1, type='l',ylim=c(0,4),cex.lab=2.5,cex.axis=2,xaxs="i",yaxs="i",col="orange2",xaxt="n",yaxt="n")
lines(-log(1.0-balPostProb,base=10),col="dodgerblue")
legend("topright",legend=namepop, bty ="n", pch=NA,cex=1.6,inset=c(-0.0001,-0.15))
mtext(expression(paste(-log[10]~q)), side=2, line=4,cex=1.3)
axis(2, seq(0,4,2),las=2,cex.axis=2)
abline(h=2.0,lty="dashed")
}
data_flink_g<-fread("Posterior_A.txt",header=T)
divPostProb<-calcDivSel(data_flink_g,numloci,divPostProb)
balPostProb<-calcBalSel(data_flink_g,numloci,balPostProb)
plotdata(freq_pos$V2,divPostProb,balPostProb,"Higher hierarchy")
data_flink_g<-fread("Posterior_alphas_group_0.txt",header=T)
divPostProb<-calcDivSel(data_flink_g,numloci,divPostProb)
balPostProb<-calcBalSel(data_flink_g,numloci,balPostProb)
plotdata(freq_pos$V2,divPostProb,balPostProb,"Group 0")
data_flink_g<-fread("Posterior_alphas_group_1.txt",header=T)
divPostProb<-calcDivSel(data_flink_g,numloci,divPostProb)
balPostProb<-calcBalSel(data_flink_g,numloci,balPostProb)
plotdata(freq_pos$V2,divPostProb,balPostProb,"Group 1")
axis(1, las=TRUE, cex.axis=2.0)
mtext(paste("Locus",sep=""), side=1, line=4,cex=1.6,outer=T)
dev.off();
Script to detect loci under selection. You need to specify the selection (divergent or balancing) and if you want to analyze the higher hierarchy you need to write Hierarchy as group
args <- commandArgs(trailingOnly = TRUE);
name <- args[1]; #chromosome or group of scaffolds
group <- args[2]; #give a name for the group (dolphin example: Coastal or Pelagic)
index <- args[3]; #index of the group in the Flink output. It starts from 0
sel <-args[4]; #divergent or balancing
prob <-as.numeric(args[5]); #suggested value 0.99 that corresponds to FDR=0.01
rm(args);
if((sel != "divergent") && (sel != "balancing")) {
stop("type a proper kind of selection: divergent or balancing!")
}
library(data.table)
freq_pos<-read.table(paste("reg",name,"/inputFlink.txt",sep=""),header=F,skip = 2)
numloci<-nrow(freq_pos)
if(group == "Hierarchy"){
data_flink<-read.table(paste("reg",name,"/Posterior_A.txt",sep=""),header=T)
}else{
data_flink<-read.table(paste("reg",name,"/Posterior_alphas_group_",index,".txt",sep=""),header=T)
}
flink_sel<-vector(mode="numeric",length=numloci)
rows<-nrow(data_flink)
if(sel == "divergent"){
flink_sel=colSums(data_flink[,2:(numloci+1)]>0)/rows
}else{
flink_sel=colSums(data_flink[,2:(numloci+1)]<0)/rows
}
lim=0
locus=1
numreg=0
while(locus <= (numloci-1)){
j=0
if(flink_sel[locus] >= prob){
selection=sel
j=j+1
numreg=numreg+1
while((flink_sel[locus+j] >= 0.99) & (locus+j<=numloci)){
j=j+1
}
start=as.numeric(freq_pos$V2[locus])
end=as.numeric(freq_pos$V2[locus+j-1])
diff=end-start
if(diff>=lim){
i=1
while(i<=j){
write(c(name, numreg, as.character(freq_pos$V1[locus]), freq_pos$V2[locus], flink_sel[locus], selection, group), file=paste("coefSel",sel,group,".txt",sep=""), ncolumns=7, append=TRUE, sep = "\t")
locus=locus+1
i=i+1
}
}
}
locus=locus+1
}
Example of how to run it:
Rscript findSel.r chr1 Pelagic 0 divergent 0.99
Updated