Commits

Anonymous committed 4bc026d

Updates and changes to the CGES QC and consensus scripts

  • Parent commits 8836f57


Files changed (21)

tools/cges_qc/R/QCplots.R

+## there is an array of input files:
+##  * vcftools imiss
+##  * vcftools lmiss
+##  * vcftools frq
+##  * vcftools TsTv
+##  * plink imendel
+##  * plink lmendel
+
+### LIBRARIES ###
+#################
+
+library(ggplot2)
+
+
+### FUNCTIONS ###
+#################
+
+
+## plot a comparison of freebayes (FQ) and GATK (GQ) QUAL scores from a vcftools .INFO table
+qual.plot <- function(fpath, name){
+    dat <- read.table(fpath, na.strings='-', header=T,
+      colClasses=c(rep('character',4), rep('numeric',3)))
+    plt <- ggplot(dat, aes(x=log10(FQ), y=log10(GQ))) + geom_point(alpha=0.4) +
+      geom_abline(aes(intercept=0, slope=1)) +
+      labs(title=paste(name, "QUAL score comparison"), x="log10 freebayes QUAL",
+          y="log10 GATK QUAL")
+    show(plt)
+}
+
+
+### INPUTS ###
+##############
+
+atlas.base <-
+'/nas40t0/vasya/consensus_call/galaxy.consensus/ts_exomes_qc/atlas_exonBed_filter/atlas_exome_bed' 
+freebayes.base <-
+'/nas40t0/vasya/consensus_call/galaxy.consensus/ts_exomes_qc/freebayes_exonBed_filter/freebayes_03-25_exome_bed'
+gatk.base <-
+'/nas40t0/vasya/consensus_call/galaxy.consensus/ts_exomes_qc/gatk_exonBed_filter/gatk_05-05_exome_bed'
+loose.consensus.base <-
+'/nas40t0/vasya/consensus_call/galaxy.consensus/ts_exomes_qc/loose_consensus_exonBed/consensus.loose.exonBed'
+strict.consensus.base <-
+'/nas40t0/vasya/consensus_call/galaxy.consensus/ts_exomes_qc/strict_consensus_exonBed/consensus.strict.exonBed'
+
+bases <- c(atlas.base, freebayes.base, gatk.base, strict.consensus.base,
+  loose.consensus.base)
+
+base.names <- c('atlas', 'freebayes', 'gatk', 'strict', 'loose')
+
+
+## calculate missingness spectra
+lmiss <- c()
+total.loci <- 0
+for (i in seq_along(bases)) {
+    file.path <- paste(bases[i], '.lmiss', sep='')
+    lmiss.dat <- read.table(file.path, header=T)
+    lmiss <- c(lmiss, list(lmiss.dat$F_MISS))
+    total.loci <- total.loci + length(lmiss.dat$F_MISS)
+}
+miss.df <- rbind(data.frame(dataset=base.names[1], missf=unlist(lmiss[1])),
+        data.frame(dataset=base.names[2], missf=unlist(lmiss[2])),
+        data.frame(dataset=base.names[3], missf=unlist(lmiss[3])),
+        data.frame(dataset=base.names[4], missf=unlist(lmiss[4])),
+        data.frame(dataset=base.names[5], missf=unlist(lmiss[5])))
+ 
+
+
+## locus MAF spectrum
+mafs <- c()
+for (i in seq_along(bases)) {
+    file.path <- paste(bases[i], '.frq', sep='')
+    maf.dat <- read.table(file.path, header=T)
+    mafs <- c(mafs, list(maf.dat$MAF))
+}
+maf.df <- rbind(data.frame(dataset=base.names[1], maf=unlist(mafs[1])),
+        data.frame(dataset=base.names[2], maf=unlist(mafs[2])),
+        data.frame(dataset=base.names[3], maf=unlist(mafs[3])),
+        data.frame(dataset=base.names[4], maf=unlist(mafs[4])),
+        data.frame(dataset=base.names[5], maf=unlist(mafs[5])))
+
+## genome Ts/Tv ratio
+tstv.dat <- vector(mode="list", length(base.names)) 
+names(tstv.dat) <- base.names
+for (i in seq_along(bases)) {
+    file.path <- paste(bases[i], '.tstv.TsTv.summary', sep='')
+    dat <- read.table(file.path, header=T)
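+    ## rows 7 and 8 of the vcftools TsTv summary table hold the Ts and Tv counts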
+    tstv.dat[base.names[i]] <- dat$COUNT[7] / dat$COUNT[8]
+}
+
+
+
+## locus mendelian inconsistencies and number of variants
+lmendel.dat <- vector(mode="list", length(base.names))
+names(lmendel.dat) <- base.names
+mvar.dat <- vector(mode="list", length(base.names))
+names(mvar.dat) <- base.names
+for (i in seq_along(bases)) {
+    file.path <- paste(bases[i], '.lmendel', sep='')
+    dat <- read.table(file.path, header=T)
+    lmendel.dat[base.names[i]] <- sum(as.numeric(dat$N))
+    mvar.dat[base.names[i]] <- length(dat$SNP)
+}
+
+
+
+
+### PLOT GENERATION
+## note: we step through the plots interactively, advancing with Enter (readline())
+
+## missingness
+#plt2 <- ggplot(miss.df, aes(x=missf, color=dataset)) + stat_ecdf(geom="smooth")
+#show(plt2)
+
+readline()
+plt2 <- ggplot(miss.df, aes(x=missf, fill=dataset)) + geom_histogram() +
+    facet_grid(dataset~.) + labs(title="Missingness Spectra",
+        x="rate of missing genotypes")
+show(plt2)
+
+## regular histogram of missingness
+#plt <- ggplot(miss.df, aes(x=missf, fill=dataset)) +
+#  geom_histogram(colour="black", position="dodge",
+#    aes(y=..count../ total.loci )) +
+#  scale_fill_manual(values=c("blue","green","red","orange", "purple"))
+#show(plt)
+
+## MAF
+#readline()
+#plt <- ggplot(maf.df, aes(x=maf, fill=dataset)) +
+#  geom_histogram(position="dodge", binwidth = 0.05) 
+#show(plt)
+
+readline()
+plt <- ggplot(maf.df, aes(x=maf, fill=dataset)) + geom_histogram() +
+    facet_grid(dataset~.) + labs(title="Minor Allele Frequency Distributions",
+        x="maf")
+show(plt)
+
+## separate out individual data sets
+readline()
+## this was an attempt to reweight the densities by number of variants
+#plt <- ggplot(maf.df, aes(x=maf, color=dataset)) +
+#    geom_density(data=subset(maf.df, dataset=='freebayes'), adjust=8,
+#      weights=1/length(which(maf.df$dataset=='freebayes'))) +
+#    geom_density(data=subset(maf.df, dataset=='atlas'), adjust=0.5,
+#      weights=1/length(which(maf.df$dataset=='atlas'))) +
+#    geom_density(data=subset(maf.df, dataset=='gatk'), adjust=0.5,
+#      weights=1/length(which(maf.df$dataset=='gatk'))) +
+#    geom_density(data=subset(maf.df, dataset=='strict'), adjust=1,
+#      weights=1/length(which(maf.df$dataset=='strict'))) +
+#    geom_density(data=subset(maf.df, dataset=='loose'), adjust=0.5,
+#      weights=1/length(which(maf.df$dataset=='loose'))) 
+plt <- ggplot(maf.df, aes(x=maf, color=dataset)) +
+    geom_density(data=subset(maf.df, dataset=='freebayes'), adjust=8) +
+    geom_density(data=subset(maf.df, dataset=='atlas'), adjust=0.5) +
+    geom_density(data=subset(maf.df, dataset=='gatk'), adjust=0.5) +
+    geom_density(data=subset(maf.df, dataset=='strict'), adjust=1) +
+    geom_density(data=subset(maf.df, dataset=='loose'), adjust=0.5) 
+show(plt)
+
+## experimental ecdf representation
+readline()
+plt2 <- ggplot(maf.df, aes(x=maf, color=dataset)) + stat_ecdf(geom="smooth") +
+    labs(title="Minor Allele Frequency Cumulative Distribution Function")
+show(plt2)
+
+## ts/tv
+readline()
+tstv.df <- data.frame(dataset=base.names, tstv=unlist(tstv.dat) )
+plt <- ggplot(tstv.df, aes(y=tstv, x=dataset)) + geom_bar(stat="identity") +
+    labs(title="Ts/Tv Ratio", y="ts/tv ratio")
+show(plt)
+
+## number of variants
+readline()
+mvar.df <- data.frame(dataset=base.names, mvar=unlist(mvar.dat))
+plt <- ggplot(mvar.df, aes(x=dataset, y=mvar)) + geom_bar(stat="identity") +
+    labs(title="Number of Variants", y="number of variants")
+show(plt)
+
+## mendel errors
+readline()
+lmendel.df <- data.frame(dataset=base.names,
+    lmendel.rate=unlist(lmendel.dat)/unlist(mvar.dat),
+    lmendel=unlist(lmendel.dat))
+plt <- ggplot(lmendel.df, aes(x=dataset, y=lmendel.rate)) + geom_bar(stat="identity") +
+    labs(title="Mendelian Error Rate", y="mendel error rate")
+show(plt)
+
+
+## QUAL distributions
+readline()
+qual.plot(paste(strict.consensus.base, '.INFO', sep=''), 'strict')
+
+readline()
+qual.plot(paste(loose.consensus.base, '.INFO', sep=''), 'loose')
+
+## HWE
+plot.new()
+text(0.5, 0.5, 'HWE data not yet available')
+
+
+
+

tools/cges_qc/R/het.R

+library(ggplot2)
+library(plyr)
+
+"%&%" <- function(a,b) paste(a, b, sep="")
+
+extract_tstv <- function(x) {
+  return( x$COUNT[7] / x$COUNT[8] )
+}
+
+parse_file <- function(x) {
+  read.table(x, header=T)
+}
+
+args <- commandArgs(trailing=TRUE)
+
+atlas.base <- args[1]
+gatk.base <- args[2]
+freebayes.base <- args[3]
+cges.base <- args[4]
+pdf.file <- args[5]
+
+callers <- list( 'Atlas', 'GATK', 'Freebayes', 'CGES')
+files <- list(atlas.base %&% '.het',
+              gatk.base %&% '.het',
+              freebayes.base %&% '.het',
+              cges.base %&% '.het')
+
+
+## read in tables
+dat <- lapply(files, parse_file)
+
+## drop unnecessary columns
+dat <- Map( function(dataf) dataf[ ,c("INDV", "F") ], dat )
+
+## give each caller's F column a unique name
+dat <- Map( function(dataf, newname) rename(dataf, c("F"=newname)), dat, callers )
+
+## Merge them into a single data frame properly ordered by sample
+dat <- Reduce( function(...) merge(..., by.x=c('INDV'), by.y=c('INDV')), dat )
+
+## friendly form for plotting
+plt.dat <- stack( dat, select = unlist(callers))
+
+
+plt <- ggplot(plt.dat, aes(y=values, x=rep(1:length(dat$INDV), length(callers)), color = ind)) +
+      facet_grid( ind ~ . ,scale= "free") +
+      geom_point(show_guide = FALSE) +
+      xlab('Sample') +
+      ylab('F statistic') +
+      theme_bw()
+
+pdf(pdf.file)
+show(plt)
+dev.off()
+

tools/cges_qc/R/maf.R

+library(ggplot2)
+
+"%&%" <- function(a,b) paste(a, b, sep="")
+
+parse_file <- function(x) {
+  read.table(x, header=T)
+}
+
+args <- commandArgs(trailing=TRUE)
+
+atlas.base <- args[1]
+gatk.base <- args[2]
+freebayes.base <- args[3]
+cges.base <- args[4]
+pdf.file <- args[5]
+
+callers <- list( 'Atlas', 'GATK', 'Freebayes', 'CGES')
+files <- list(atlas.base %&% '.frq',
+              gatk.base %&% '.frq',
+              freebayes.base %&% '.frq',
+              cges.base %&% '.frq')
+
+## read in tables
+dat <- lapply(files, parse_file)
+
+## drop unnecessary columns
+dat <- Map( function(dataf) dataf[ ,"MAF" ], dat )
+
+## and ye shall be named
+names(dat) <- callers
+
+plt.dat <- stack(dat)
+
+plt <- ggplot(plt.dat, aes(x=values, fill=ind)) + facet_grid( ind ~ . ,scale= "free") +
+      geom_histogram(show_guide = FALSE) +
+      xlab('MAF') +
+      ylab('Counts')
+
+plt2 <- ggplot(plt.dat, aes(values, colour=ind)) + stat_ecdf() +
+      labs(x='MAF', y='Proportion of Variants with MAF <= x') +
+      theme_bw()
+
+pdf(pdf.file)
+show(plt)
+show(plt2)
+dev.off()
+

tools/cges_qc/R/mendel.R

+library(ggplot2)
+library(gridExtra)
+
+calc_mendel <- function(x) {
+  return( with(x, length(which(N>0))/length(N)) )
+}
+
+"%&%" <- function(a,b) paste(a, b, sep="")
+
+
+args <- commandArgs(trailing=TRUE)
+
+atlas.base <- args[1]
+gatk.base <- args[2]
+freebayes.base <- args[3]
+cges.base <- args[4]
+pdf.file <- args[5]
+
+## look at mendelian inconsistencies per trio
+atlas.fmen <- read.table(atlas.base %&% ".fmendel", header=T)
+gatk.fmen <- read.table(gatk.base %&% ".fmendel", header=T)
+freebayes.fmen <- read.table(freebayes.base %&% ".fmendel", header=T)
+consensus.fmen <- read.table(cges.base %&% ".fmendel", header=T)
+
+## look at mendelian inconsistencies per locus
+atlas.lmen <- read.table(atlas.base %&% ".lmendel", header=T)
+gatk.lmen <- read.table(gatk.base %&% ".lmendel", header=T)
+freebayes.lmen <- read.table(freebayes.base %&% ".lmendel", header=T)
+consensus.lmen <- read.table(cges.base %&% ".lmendel", header=T)
+
+## count the total number of variants in the set 
+freebayes.mvar <- length(freebayes.lmen$N)
+consensus.mvar <- length(consensus.lmen$N)
+atlas.mvar <- length(atlas.lmen$N)
+gatk.mvar <- length(gatk.lmen$N)
+
+
+
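+## denominator for the per-trio rate: (sites in the set) x (children across all families) = total proband genotypes tested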
+trio_mvar <- list(atlas.mvar*sum(as.numeric(atlas.fmen$CHLD)),
+             gatk.mvar*sum(as.numeric(gatk.fmen$CHLD)),
+             freebayes.mvar*sum(as.numeric(freebayes.fmen$CHLD)),
+             consensus.mvar*sum(as.numeric(consensus.fmen$CHLD)))
+
+locus_mvar <- list(atlas.mvar, gatk.mvar, freebayes.mvar, consensus.mvar)
+
+## format numerical data
+atlas.lmen$N <- as.numeric(atlas.lmen$N)
+gatk.lmen$N <- as.numeric(gatk.lmen$N)
+freebayes.lmen$N <- as.numeric(freebayes.lmen$N)
+consensus.lmen$N <- as.numeric(consensus.lmen$N)
+
+atlas.fmen$N <- as.numeric(atlas.fmen$N)
+gatk.fmen$N <- as.numeric(gatk.fmen$N)
+freebayes.fmen$N <- as.numeric(freebayes.fmen$N)
+consensus.fmen$N <- as.numeric(consensus.fmen$N)
+
+
+## throw data into one large dataframe
+callers <- list( 'Atlas', 'GATK', 'Freebayes', 'CGES')
+fmendel.dat <- list(atlas.fmen$N, gatk.fmen$N, freebayes.fmen$N, consensus.fmen$N)
+lmendel.dat <- list(which(atlas.lmen$N>0), which(gatk.lmen$N > 0), which(freebayes.lmen$N > 0), which(consensus.lmen$N > 0))
+mendel <- data.frame( trio_error_rate = (unlist(lapply(fmendel.dat, sum)) / unlist(trio_mvar)) * 100,
+                      locus_error_rate = unlist(lapply(lmendel.dat, length)) / unlist(locus_mvar) * 100,
+                      Callers = unlist(callers)) 
+
+mendel$locus_order_callers <- reorder(mendel$Callers, mendel$locus_error_rate)
+mendel$trio_order_callers <- reorder(mendel$Callers, mendel$trio_error_rate)
+
+
+## plot dat graph
+locus_plt <- ggplot( mendel, aes(x=locus_order_callers, y=locus_error_rate, fill=Callers) ) + 
+          geom_bar(stat="identity", show_guide = FALSE) +
+          theme_bw() +
+          xlab("Caller") +
+          ylab("% of Sites with at Least One Mendelian Error") 
+
+trio_plt <- ggplot( mendel, aes(x=trio_order_callers, y=trio_error_rate, fill=Callers) ) + 
+          geom_bar(stat="identity", show_guide = FALSE) +
+          theme_bw() +
+          xlab("Caller") +
+          ylab("% of Proband Genotypes with Mendelian Errors") 
+
+pdf(pdf.file)
+show(locus_plt)
+show(trio_plt)
+dev.off()

tools/cges_qc/R/missing.R

+library(ggplot2)
+library(plyr)
+
+"%&%" <- function(a,b) paste(a, b, sep="")
+
+parse_imen <- function(x) {
+  read.table(x, header=T)
+}
+
+args <- commandArgs(trailing=TRUE)
+
+atlas.base <- args[1]
+gatk.base <- args[2]
+freebayes.base <- args[3]
+cges.base <- args[4]
+pdf.file <- args[5]
+
+sets <- list('ATLAS', 'GATK', 'Freebayes', 'CGES')
+files <- list(atlas.base %&% '.imiss',
+              gatk.base %&% '.imiss',
+              freebayes.base %&% '.imiss',
+              cges.base %&% '.imiss')
+
+## read in tables
+dat <- lapply(files, parse_imen)
+
+## drop unnecessary columns
+dat <- Map( function(x) x[,!(names(x) %in% c('MISS_PHENO','N_MISS','N_GENO'))], dat )
+
+## give each caller's F_MISS column a unique name
+dat <- Map( function(dataf, newname) rename(dataf, c("F_MISS"=newname)), dat, sets )
+
+## Merge them into a single data frame properly ordered by sample
+dat <- Reduce( function(...) merge(..., by.x=c('FID','IID'), by.y=c('FID', 'IID')), dat )
+
+## friendly form for plotting
+plt.dat <- stack( dat, select = unlist(sets) )
+
+
+plt <- ggplot(plt.dat, aes(x=values, fill = ind)) +
+  facet_grid( ind ~ . ,scale= "free") +
+  geom_histogram(binwidth=0.003) +
+  xlab('Rate of Missing Genotypes') + ylab('# of Samples')
+
+
+pdf(pdf.file)
+show(plt)
+dev.off()
+

tools/cges_qc/R/rediscovery.R

+library(ggplot2)
+library(gridExtra)
+
+"%&%" <- function(a,b) paste(a, b, sep="")
+
+#' Build the file path for the 1000 Genomes (.kg) rediscovery summary
+make_kg_fpath <- function(base) {
+  fpath <- paste( base, '.kg', sep = '' )
+  return(fpath)
+}
+
+make_evs_fpath <- function(base) {
+  fpath <- paste( base, '.evs', sep = '' )
+  return(fpath)
+}
+
+
+args <- commandArgs(trailing=TRUE)
+
+
+
+atlas.base <- args[1]
+gatk.base <- args[2]
+freebayes.base <- args[3]
+cges.base <- args[4]
+pdf.file <- args[5]
+
+
+set.names <- list( 'Atlas', 'GATK', 'Freebayes', 'CGES')
+call.sets <- list( atlas.base,
+                  gatk.base,
+                  freebayes.base,
+                  cges.base)
+
+
+evs.fpaths <-  mapply(make_evs_fpath, call.sets)
+evs.dat <- lapply( evs.fpaths, read.table )
+evs <- data.frame( evs = unlist( lapply(evs.dat, subset, select=V2) ),
+                  Callers = unlist(set.names))
+evs$order_sets <- reorder(evs$Callers, evs$evs)
+
+
+evs.plt <- ggplot( evs, aes(x=order_sets, y=evs, fill=Callers) ) +
+        geom_bar(stat="identity", show_guide = FALSE) +
+        labs(x="Caller", y="% of Variants Found in Exome Variant Server") +
+        theme_bw()
+
+kg.fpaths <-  mapply(make_kg_fpath, call.sets)
+kg.dat <- lapply( kg.fpaths, read.table )
+
+kg <- data.frame( kg = unlist( lapply(kg.dat, subset, select=V2) ),
+                   Callers = unlist(set.names) )
+kg$order_sets <- reorder(kg$Callers, kg$kg)
+
+
+kg.plt <- ggplot( kg, aes(x=order_sets, y=kg, fill=Callers) ) +
+        geom_bar(stat="identity", show_guide = FALSE) +
+        labs(x="Caller", y="% of Variants Found in 1000 Genomes") +
+        theme_bw()
+
+
+pdf(pdf.file)
+show(kg.plt)
+show(evs.plt)
+dev.off()

tools/cges_qc/R/tstv.R

+library(ggplot2)
+
+"%&%" <- function(a,b) paste(a, b, sep="")
+
+extract_tstv <- function(x) {
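+  ## rows 7 and 8 of the vcftools .TsTv.summary table hold the Ts and Tv counts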
+  return( x$COUNT[7] / x$COUNT[8] )
+}
+
+args <- commandArgs(trailing=TRUE)
+
+atlas.base <- args[1]
+gatk.base <- args[2]
+freebayes.base <- args[3]
+cges.base <- args[4]
+pdf.file <- args[5]
+
+atlas.tstv <- read.table(atlas.base %&% ".TsTv.summary", header=T)
+gatk.tstv <- read.table(gatk.base %&% ".TsTv.summary", header=T)
+freebayes.tstv <- read.table(freebayes.base %&% ".TsTv.summary", header=T)
+consensus.tstv <- read.table(cges.base %&% ".TsTv.summary", header=T)
+
+callers <- list('Atlas', 'GATK', 'Freebayes', 'CGES')
+tstv.dat <- list(atlas.tstv, gatk.tstv, freebayes.tstv, consensus.tstv)
+
+
+tstv <- data.frame( tstv = unlist( lapply(tstv.dat, extract_tstv) ),
+                    Callers = unlist(callers))
+tstv$order_callers <- reorder(tstv$Callers, tstv$tstv)
+
+plt <- ggplot( tstv, aes(x=order_callers, y=tstv, fill=Callers) ) +
+        geom_bar(stat="identity", show_guide = FALSE) +
+        labs(x="Callers", y="Ts/Tv") +
+        theme_bw()
+ 
+pdf(pdf.file)
+show(plt)
+dev.off()
+
+

tools/cges_qc/README.md

+CGES QC
+===========
+
+Quality summary pipeline for the consensus caller. The program takes four VCF files as input and produces a variety of plots as output. The four inputs are VCF files generated by Atlas, the GATK Unified Genotyper, and Freebayes, plus a consensus set generated by [CGES](https://github.com/vtrubets/galaxy.consensus). The tool itself is written for easy integration with Galaxy.
+
+### Plots produced:
+  * Mendel inconsistencies
+  * Sample Missingness
+  * Sample F-statistics (summarizing heterozygosity)
+  * Minor allele frequency spectra
+  * Transition-transversion mutation ratio
+
+
+Extensions:
+============
+
+A few plots reported in the manuscript are not yet produced here:
+  * Site missingness spectra
+  * Site/sample concordance plots for the consensus
+  * Rediscovery plots (the open issue is packaging the reference sets)
+
+
+Options:
+========
+
+    -h, --help            show this help message and exit
+    --cges-vcf=CGES       File path for CGES VCF for which to generate QC
+                          metrics.
+    --atlas-vcf=ATLAS     File path for ATLAS VCF for which to generate QC
+                          metrics.
+    --gatk-vcf=GATK       File path for GATK VCF for which to generate QC
+                          metrics.
+    --freebayes-vcf=FREEBAYES
+                          File path for Freebayes VCF for which to generate QC
+                          metrics.
+    --ped-file=PEDFILE    Pedigree file for samples (Optional).
+    --tstv-out=TSTVOUT    Output file location for TsTv plots PDF.
+    --het-out=HETOUT      Output file location for heterozygosity plots PDF.
+    --maf-out=MAFOUT      Output file location for minor allele frequency plots
+                          PDF.
+    --miss-out=MISSOUT    Output file location for missingness plots PDF.
+    --rediscover-out=REDISCOVEROUT
+                          Output file location for rediscovery rate plots PDF.
+    --mendel-out=MENDELOUT
+                          Output file location for Mendel inconsistency plots
+                          PDF.
+    --temp-dir=TEMPDIR    Directory for writing intermediate analysis files.
+
+
+
+Usage:
+======
+
+Note: this tool wraps a number of other programs: PLINK, vcftools, and R code files run with Rscript. To run it, these programs must be on your search path.
+
+The tool is a simple Python executable where input VCF files and output plot PDFs are specified explicitly. A temporary directory is also required; intermediate files produced by PLINK and vcftools are written there.
+
+A full summary can be generated with the command:
+
+    python python/qc.pipeline.py \
+      --atlas-vcf "test/atlas.test.vcf" \
+      --gatk-vcf "test/gatk.test.vcf" \
+      --freebayes-vcf "test/freebayes.test.vcf" \
+      --cges-vcf "test/cges.test.vcf" \
+      --ped-file "test/test.pedigree.txt" \
+      --tstv-out "test/tstv.dat" \
+      --het-out "test/het.dat" \
+      --maf-out "test/maf.dat" \
+      --miss-out "test/miss.dat" \
+      --mendel-out "test/mendel.dat" \
+      --temp-dir "test/" 
+

tools/cges_qc/cges_qc.xml

+<?xml version="1.0"?>
+
+<tool name="CGES-QC" id="cgesqc">
+  <description>
+  Quality summary pipeline for the consensus caller. The program takes four VCF files as input and produces a variety of plots as output. The four inputs are VCF files generated by Atlas, the GATK Unified Genotyper, and Freebayes, plus a consensus set generated by CGES. The tool itself is written for easy integration with Galaxy.
+  </description>
+
+  <command interpreter="python">
+    python/qc.pipeline.py 
+      --atlas-vcf $atlasVcf
+      --gatk-vcf $gatkVcf
+      --freebayes-vcf $freebayesVcf
+      --cges-vcf $cgesVcf
+      --temp-dir /scratch/galaxy/cox/tmp/
+    #if str($pedfile):
+      --ped-file $pedfile
+    #else:
+      none
+    #end if
+    #if str($tstvOut):
+      --tstv-out $tstvOut
+    #else:
+      none
+    #end if
+    #if str($hetOut):
+      --het-out $hetOut
+    #else:
+      none
+    #end if
+    #if str($mafOut):
+      --maf-out $mafOut
+    #else:
+      none
+    #end if
+    #if str($missOut):
+      --miss-out $missOut
+    #else:
+      none
+    #end if
+    #if str($mendelOut):
+      --mendel-out $mendelOut
+    #else:
+      none
+    #end if
+  </command>
+
+  <inputs>
+    <param name="atlasVcf" type="data" format="vcf" label="Select a VCF file produced by Atlas-SNP2" help="A VCF file produced by Atlas-SNP2 (Required)"/>
+    <param name="gatkVcf" type="data" format="vcf" label="Select a VCF file produced by GATK's Unified Genotyper" help="A VCF file produced by the GATK Unified Genotyper (Required)"/>
+    <param name="freebayesVcf" type="data" format="vcf" label="Select a VCF file produced by Freebayes" help="A VCF file produced by Freebayes (Required)"/>
+    <param name="cgessVcf" type="data" format="vcf" label="Select a VCF file produced by CGES consensus" help="A VCF file produced by CGES under consensus thresholds (Required)"/>
+    <param name="pedfile" type="data" format="txt" label="Pedigree file" help="A text file specifying familial relationships among samples."/>
+    <param name="tstvOut" type="data" format="txt" label="Pedigree file" help="A text file specifying familial relationships among samples."/>
+  
+    <param name="tstvOut" type="boolean" falsevalue="" truevalue="True" label="Output TsTv plot"/>
+    <param name="hetOut" type="boolean" falsevalue="" truevalue="True" label="Output F statistic plot"/>
+    <param name="mafOut" type="boolean" falsevalue="" truevalue="True" label="Output MAF plot"/>
+    <param name="missOut" type="boolean" falsevalue="" truevalue="True" label="Output missingness plot"/>
+    <param name="mendelOut" type="boolean" falsevalue="" truevalue="True" label="Output mendel plot"/>
+    
+  </inputs>
+
+  <outputs>
+    <data name="tstvOut" format="pdf" label="TsTv plot" >
+      <filter>tstvOut == "True"</filter>
+    </data>
+    <data name="hetOut" format="pdf" label="F-statistic plot">
+      <filter>hetOut == "True"</filter>
+    </data>
+    <data name="mafOut" format="pdf" label="MAF plot" >
+      <filter>mafOut == "True"</filter>
+    </data>
+    <data name="missOut" format="pdf" label="Missingness plot" >
+      <filter>missOut == "True"</filter>
+    </data>
+    <data name="mendelOut" format="pdf" label="Mendelian inconsistency plot" >
+      <filter>mendelOut == "True"</filter>
+    </data>
+  </outputs>
+
+  <help>
+    Command Line Options:
+
+    -h, --help            show this help message and exit
+    --cges-vcf=CGES       File path for CGES VCF for which to generate QC
+                          metrics.
+    --atlas-vcf=ATLAS     File path for ATLAS VCF for which to generate QC metrics.
+    --gatk-vcf=GATK       File path for GATK VCF for which to generate QC metrics.
+    --freebayes-vcf=FREEBAYES
+                          File path for Freebayes VCF for which to generate QC
+                          metrics.
+    --ped-file=PEDFILE    Pedigree file for samples (Optional).
+    --tstv-out=TSTVOUT    Output file location for TsTv plots PDF.
+    --het-out=HETOUT      Output file location for heterozygosity plots PDF.
+    --maf-out=MAFOUT      Output file location for minor allele frequency plots
+                          PDF.
+    --miss-out=MISSOUT    Output file location for missingness plots PDF.
+    --rediscover-out=REDISCOVEROUT
+                          Output file location for rediscovery rate plots PDF.
+    --mendel-out=MENDELOUT
+                          Output file location for Mendel inconsistency plots
+                          PDF.
+    --temp-dir=TEMPDIR    Directory for writing intermediate analysis files.
+  </help>
+
+</tool>
+
+
+

tools/cges_qc/python/qc.pipeline.py

+#!/usr/bin/env python
+
+import subprocess as sp
+import gzip as gz
+import optparse as opt
+import cPickle
+import vcf
+import os
+
+def smart_open(f):
+    '''
+    Open compressed files if compressed
+    '''
+    if f.endswith('.gz'):
+        return gz.open(f)
+    else:
+        return open(f)
+
+def smart_vcftools(call):
+    '''
+    Analog to smart open for calling vcftools with a gzipped vcf
+    '''
+    for idx, element in enumerate(call):
+        if element.endswith('.vcf.gz'):
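+            ## the '--vcf' flag immediately precedes the filename, so rewrite it to '--gzvcf' in place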
+            call[idx-1] = '--gzvcf'
+    return call
+
+def makeID(rec, altall):
+  var = rec.CHROM.lstrip('chr') + ':' + str(rec.POS) + '.' + rec.REF + '.' + str(altall)
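+  ## also build the ID with REF and ALT swapped, so allele flips in the reference set still match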
+  altvar = rec.CHROM.lstrip('chr') + ':' + str(rec.POS) + '.' + str(altall) + '.' + rec.REF
+  return var, altvar
+
+def calc_rediscovery(evs, vcfFile):
+  '''
+  Calculate the variant rediscovery rate in a VCF file.
+  '''
+  count = 0
+  total = 0
+  vcfReader = vcf.Reader(smart_open(vcfFile))
+  for rec in vcfReader:
+    for allele in rec.ALT:
+      var, altvar = makeID(rec, allele)
+      if (var in evs) or (altvar in evs):
+        count += 1
+      total += 1
+  return (vcfFile.split('/')[-1], str(float(count)/float(total)))
+
+def make_plink_data(vcfFile, pedFile, temp, pedOut=None, mapOut=None):
+  '''
+  Given a VCF file, recode into PLINK files and clean up auxiliary files.
+
+  If a pedigree file is supplied, recode the ped file with that information.
+
+  '''
+  fnull = open(os.devnull, 'w')
+
+  ## convert vcf to plink formatted files
+  convertCall = ['vcftools', '--vcf', vcfFile, '--plink', '--recode', '--out',
+      temp ]
+  convertCall = smart_vcftools(convertCall)
+  sp.call(convertCall, stdout=fnull)
+
+  if pedFile:   
+    ## pull pedigree information from previous run
+    mothers = {}
+    fathers = {}
+    families = {}
+    sexes = {}
+    with open(pedFile) as fconn:
+      for line in fconn:
+        line = line.split(',')
+        mother = line[3]
+        father = line[2]
+        if mother=='Not Sequenced':
+            mother = '0'
+        if father=='Not Sequenced':
+            father = '0'
+        fathers[line[1]] = father
+        mothers[line[1]] = mother
+        family = line[0] 
+        families[line[1]] = family
+        sexes[line[1]] = line[4]
+
+    ## add pedigree information to plink ped file
+    plinkFile = temp + '.pedigree.ped'
+    plinkConn = open(plinkFile, 'w')
+    with smart_open(temp + '.ped') as pedPlinkFile:
+        for line in pedPlinkFile:
+            line = line.strip().split('\t')
+            iid = line[1]
+            if fathers.get(iid):
+                line[2] = fathers[iid]
+            if mothers.get(iid):
+                line[3] = mothers[iid]
+            if families.get(iid):
+                line[0] = families[iid]
+            if sexes.get(iid):
+                line[4] = sexes[iid]
+            ## add a fake phenotype
+            line[5] = '2'
+            print >> plinkConn, '\t'.join(line)
+    plinkConn.close()
+
+    ## move outputs and clean up vestigial files
+    sp.call(['rm', temp+'.recode.vcf'])
+    sp.call(['rm', temp+'.ped'])
+    sp.call(['mv', temp+'.pedigree.ped', temp+'.ped'])
+    if pedOut and mapOut:
+      sp.call(['mv', temp+'.ped', pedOut])
+      sp.call(['mv',  temp+'.map', mapOut])
+    
+
+
+def make_mendel_data(inputPed, inputMap, temp, trioOut=None, locusOut=None):
+  '''
+  Given a set of PLINK files, generate mendel stats, and move files to specified location.
+  '''
+  fnull = open(os.devnull, 'w')
+  ## calculate mendelian errors
+  mendelCall = ['plink',
+    '--ped', inputPed,
+    '--map', inputMap,
+    '--mendel',
+    '--allow-no-sex',
+    '--out', temp]
+  sp.call(mendelCall, stdout = fnull)
+
+  ## clean up and move files
+  sp.call(['rm', temp+'.log'])
+  sp.call(['rm', temp+'.imendel'])
+  sp.call(['rm', temp+'.nosex'])
+  if trioOut and locusOut:
+    sp.call(['mv', temp+'.fmendel', trioOut])
+    sp.call(['mv', temp+'.lmendel', locusOut])
+
+
+def make_tstv_data(vcfFile, temp, tstvOut=None):
+  fnull = open(os.devnull, 'w')
+  ## identify the total number of variants in the VCF file
+  nsnp = len( [ rec for rec in vcf.Reader(smart_open(vcfFile)) ] )
+
+  ## get the ts/tv ratio
+  tstvCall = ['vcftools', '--vcf', vcfFile, '--TsTv', str(nsnp), '--out',
+      temp]
+  tstvCall = smart_vcftools(tstvCall)
+  sp.call(tstvCall, stdout = fnull) 
+ 
+  ## clean up files
+  sp.call(['rm', temp+'.log'])
+  sp.call(['rm', temp+'.TsTv'])
+  if tstvOut:
+    sp.call(['mv', temp+'.TsTv.summary', tstvOut])
+
+def make_het_data(vcfFile, temp, hetOut=None):
+  fnull = open(os.devnull, 'w')
+  ## Calculate sample heterozygosity
+  hetCall = ['vcftools', '--vcf', vcfFile, '--het', '--out', temp ]
+  hetCall = smart_vcftools(hetCall)
+  sp.call(hetCall, stdout = fnull)
+
+  ## clean up files
+  sp.call(['rm', temp+'.log'])
+  if hetOut:
+    sp.call(['mv', temp+'.het', hetOut])
+
+def make_maf_data(plinkMap, plinkPed, temp, mafOut=None):
+  fnull = open(os.devnull, 'w')
+  ## generate minor allele frequency distribution
+  mafCall = ['plink',
+    '--ped', plinkPed,
+    '--map', plinkMap,
+    '--freq',
+    '--allow-no-sex',
+    '--out', temp]
+  sp.call(mafCall, stdout=fnull)
+
+  ## clean up files
+  sp.call(['rm', temp+'.log'])
+  sp.call(['rm', temp+'.nosex'])
+  if mafOut:
+    sp.call(['mv', temp+'.frq', mafOut])
+
+
+def make_miss_data(plinkMap, plinkPed, temp, missOut=None): 
+  fnull = open(os.devnull, 'w')
+  ## get the missingness rates
+  missCall = ['plink',
+    '--ped', plinkPed,
+    '--map', plinkMap,
+    '--missing',
+    '--out', temp]
+  sp.call(missCall, stdout = fnull) 
+ 
+  ## clean up files
+  sp.call(['rm', temp+'.log'])
+  sp.call(['rm', temp+'.nosex'])
+  if missOut:
+    sp.call(['mv', temp+'.lmiss', missOut])
+
+
+def make_hardy_data(plinkMap, plinkPed, temp, hardyOut=None):
+  fnull = open(os.devnull, 'w')
+  ## get hardy weinberg estimates
+  hweCall = ['plink',
+    '--ped', plinkPed,
+    '--map', plinkMap,
+    '--hardy',
+    '--out', temp]
+  sp.call(hweCall, stdout = fnull)
+
+  ## clean up files
+  sp.call(['rm', temp+'.log'])
+  sp.call(['rm', temp+'.nosex'])
+  if hardyOut:
+    sp.call(['mv', temp+'.hwe', hardyOut])
+    
+
+def make_rediscovery_data(vcfFile, evsOut, kgOut):
+  fnull = open(os.devnull, 'w')
+  ## calculate EVS rediscovery rate
+  evsVar = cPickle.load(open('/nas40t0/vasya/exome_variant_server/ESP6500SI-V2-SSA137.snps.set.pickle', 'rb'))
+  evsRes = calc_rediscovery(evsVar, vcfFile)
+  del evsVar
+  print >> open(evsOut, 'w'), '\t'.join(evsRes)
+
+  ## calculate 1kG rediscovery rate
+  kgVar = cPickle.load(open('/nas40t0/vasya/1kG/ALL_1000G_phase1integrated_v3_impute_var.pickle'))
+  kgRes = calc_rediscovery(kgVar, vcfFile) 
+  del kgVar
+  print >> open(kgOut, 'w'), '\t'.join(kgRes)
+
+
+def compile_resource_descr(name, vcf, tmpdir):
+  '''
+  Make a dictionary of the file descriptions associated with a VCF file.
+  '''
+  resources = { "vcf": vcf,
+                "ped": tmpdir + name + '.ped' ,
+                "map": tmpdir + name + '.map',
+                "temp": tmpdir + name }
+  return resources
+
+def main():
+  ## parse command line arguments
+  parser = opt.OptionParser()
+  parser.add_option('--cges-vcf', dest = 'cges', action = 'store', 
+      help = 'File path for CGES VCF for which to generate QC metrics.')
+  parser.add_option('--atlas-vcf', dest = 'atlas', action = 'store', 
+      help = 'File path for ATLAS VCF for which to generate QC metrics.')
+  parser.add_option('--gatk-vcf', dest = 'gatk', action = 'store', 
+      help = 'File path for GATK VCF for which to generate QC metrics.')
+  parser.add_option('--freebayes-vcf', dest = 'freebayes', action = 'store', 
+      help = 'File path for Freebayes VCF for which to generate QC metrics.')
+  parser.add_option('--ped-file', dest = 'pedFile', action = 'store', 
+      help = 'Pedigree file for samples (Optional).')
+  parser.add_option('--tstv-out', dest = 'tstvOut', action = 'store', 
+      help = 'Output file location for TsTv plots PDF.')
+  parser.add_option('--het-out', dest = 'hetOut', action = 'store', 
+      help = 'Output file location for heterozygosity plots PDF.')
+  parser.add_option('--maf-out', dest = 'mafOut', action = 'store', 
+      help = 'Output file location for minor allele frequency plots PDF.')
+  parser.add_option('--miss-out', dest = 'missOut', action = 'store', 
+      help = 'Output file location for missingness plots PDF.')
+  parser.add_option('--rediscover-out', dest = 'rediscoverOut', action = 'store',
+      help = 'Output file location for rediscovery rate plots PDF.')
+  parser.add_option('--mendel-out', dest = 'mendelOut', action = 'store', 
+      help = 'Output file location for Mendel inconsistency plots PDF.')
+  parser.add_option('--temp-dir', dest = 'tempDir', action = 'store',
+      help = 'Directory for writing intermediate analysis files.')
+  (options, args) = parser.parse_args()
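+  ## vars() exposes the parsed options as a dict so each branch's VCF can be looked up by name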
+  opt_dict = vars(options)
+  
+  fnull = open(os.devnull, 'w')
+
+  ## give each branch a name
+  branches = ['atlas', 'gatk', 'freebayes', 'cges']
+
+  ## store resource locations for each branch
+  resources = dict()
+  for branch in branches:
+    resources[branch] = compile_resource_descr(name = branch, vcf = opt_dict[branch], tmpdir = options.tempDir)
+
+  ## check if we need to recode into PLINK files
+  if options.mendelOut or options.mafOut or options.missOut:
+    for branchData in resources.values():
+      make_plink_data(vcfFile=branchData['vcf'], pedFile=options.pedFile, temp=branchData['temp'])
+
+  if options.mendelOut:
+    ## generate data
+    for branchData in resources.values():
+      make_mendel_data(inputPed=branchData['ped'], inputMap=branchData['map'], temp=branchData['temp'])
+    ## make plots PDF by executing an R script with passed in arguments
+    sp.call(['Rscript', 'R/mendel.R', resources['atlas']['temp'], resources['gatk']['temp'], resources['freebayes']['temp'], resources['cges']['temp'], options.mendelOut], stdout = fnull)
+
+  if options.tstvOut:
+    ## generate data
+    for branchData in resources.values():
+      make_tstv_data(vcfFile=branchData['vcf'], temp=branchData['temp'])
+    ## generate plot PDF
+    sp.call(['Rscript', 'R/tstv.R', resources['atlas']['temp'], resources['gatk']['temp'], resources['freebayes']['temp'], resources['cges']['temp'], options.tstvOut], stdout = fnull)
+
+  if options.hetOut:
+    for branchData in resources.values():
+      make_het_data(vcfFile=branchData['vcf'], temp=branchData['temp'])
+    sp.call(['Rscript', 'R/het.R', resources['atlas']['temp'], resources['gatk']['temp'], resources['freebayes']['temp'], resources['cges']['temp'], options.hetOut], stdout = fnull)
+
+  if options.mafOut:
+    for branchData in resources.values():
+      make_maf_data(plinkMap=branchData['map'], plinkPed=branchData['ped'], temp=branchData['temp'])
+    sp.call(['Rscript', 'R/maf.R', resources['atlas']['temp'], resources['gatk']['temp'], resources['freebayes']['temp'], resources['cges']['temp'], options.mafOut])
+     
+  if options.missOut:
+    for branchData in resources.values(): 
+      make_miss_data(plinkMap=branchData['map'], plinkPed=branchData['ped'], temp=branchData['temp'])
+    sp.call(['Rscript', 'R/missing.R', resources['atlas']['temp'], resources['gatk']['temp'], resources['freebayes']['temp'], resources['cges']['temp'], options.missOut])
+
+#  if options.rediscoverOut:
+#    make_rediscovery_data()
+    
+
+
+
+if __name__ == '__main__':
+    main()

tools/cges_qc/python/vcf.QC.pipeline.py

+'''
+A pipeline for generating a set of QC metrics for an arbitrary VCF.
+
+Target Metrics:
+    Current:
+        * ts/tv ratio
+        * sample missingness rate
+        * site missingness rate
+        * sample non-consensus rate
+        * site non-consensus rate
+        * MAF distribution
+
+    Planned:
+        * combined QUAL
+        * mendelian error
+
+programs used:
+    * vcftools
+    * PLINK
+
+
+'''
+
+## MODULES
+import vcf
+import optparse as opt
+import subprocess as sp
+import numpy as np
+import sys
+
+## CLASSES
+class vcfReport():
+    '''
+    A quality report generated from the vcf file.
+    '''
+
+    def __init__(self, vcfFile, mvar):
+        self.vcfReader = vcf.Reader(open(vcfFile, 'r'))
+        self.nsam = len(self.vcfReader.samples)
+        self.mvar = mvar
+        self.vcfFile = vcfFile
+        #: number of transitions
+        self.ts = 0
+        #: number of transversions
+        self.tv = 0
+        #: list of MAF
+        self.mafDist = np.empty( self.mvar, dtype=float)
+        #: list of call rates
+        self.callrateDist = np.empty( self.mvar, dtype=float)
+        #: list of site consensus-calling rates
+        self.siteConsRate = np.empty( self.mvar, dtype=float)
+        #: total number of called genotypes
+        self.genotypesCalled = 0
+        #: {sample ID : sample # of calls}
+        self.sampleCallTotal = np.zeros( (self.nsam, 1), dtype=int)
+        
+        ## go ahead and generate the report upon initialization
+        self.__generateReport()
+
+
+    def __generateReport(self):
+        '''
+        Iterate and process records in the vcf file.
+        Calculate mendelian errors using PLINK.
+        '''
+        for i, record in enumerate(self.vcfReader):
+            self.__processRecord(record, i)
+
+        self.findMendelErrors()
+
+    def findMendelErrors(self):
+      '''
+      Reformat VCF as plink files, and add pedigree information.
+      Calculate the rates of mendelian errors from there.
+      
+      Inputs:
+          - output directory for intermediate files
+          - file mapping SM to RG for our samples
+          - file containing pedigree information
+
+      '''
+      
+
+
+    def __processRecord(self, record, recidx):
+        '''
+        Update sample level statistics:
+            * total number of called genotypes
+            * non-consensus rate
+            * mendelian error rate
+
+        Store site level statistics:
+            * missingness rate
+            * non-consensus rate
+            * mendelian error rate
+            * minor allele frequency
+        '''
+
+
+        ## per sample totals
+        consCount = 0
+        for idx, sample in enumerate(record.samples):
+          ## check if genotype is called
+          if sample.gt_type is not None:
+              self.sampleCallTotal[idx] += 1
+          ## check if genotype was called consensus
+          if getattr(sample.data ,'CN') == 'T':
+              consCount += 1
+
+        ## global call total
+        self.genotypesCalled += record.num_called
+            
+        ## is site a transition variant?
+        if record.is_transition:
+            self.ts += 1
+        else:
+            self.tv += 1
+
+        ## calculate and record maf: the minor allele frequency is the
+        ## smaller of the two allele frequencies
+        maf = min(record.aaf, 1.0 - record.aaf)
+        self.mafDist[recidx] = maf
+
+        ## calculate and record missingness per variant
+        self.callrateDist[recidx] = record.call_rate
+
+        ## if consensus rate is available -- record a consensus rate
+        self.siteConsRate[recidx] = float(consCount) / record.num_called
+        
+
+### FUNCTIONS ###
+#################
+
+## quickly find the number of variants in a vcf (not yet implemented)
+def num_variants():
+    ## shell equivalent: grep -v '^#' test.consensus.vcf | wc -l
+    return None
+
+## quickly find the number of samples in a vcf
+def num_samples(vcfReader):
+    return len(vcfReader.samples)
+
+
+### MAIN ###
+############
+
+def main():
+
+    ## parse input arguments
+    parser = opt.OptionParser()
+    parser.add_option('--vcf', dest = 'vcfFile', help = 'Target VCF file.')
+    parser.add_option('--num-variants', dest = 'mvar', help = 'Number of variants in target VCF file.')
+    (options, args) = parser.parse_args()
+    vcfFile = options.vcfFile    
+    mvar = int(options.mvar)
+
+    ## create a QC report object
+    qcreport = vcfReport(vcfFile, mvar)
+
+    ## TODO :: we want to wrap this in a nice PDF report
+    ## potential libraries:
+    ##  - sphinx
+    ##  - pod
+    ##  - reportLab
+
+
+if __name__ == "__main__":
+    main()
+ 
+
+
+
+
+
+

tools/consensus_caller/galaxy.consensus/Gemfile

+source 'https://rubygems.org'
+gem 'github-pages'

tools/consensus_caller/galaxy.consensus/consensus_tool/allele_walker.py

+class allele_walker:
+  '''
+  Given a set of site concordant records, call consensus on variants with matching alleles.
+  '''
+
+  def __init__(self, recordSet):
+    self.recordSet = recordSet
+
+
+

tools/consensus_caller/galaxy.consensus/consensus_tool/extract_regions.py

+import pysam
+
+def get_regions(vcf):
+  '''
+  Function to find (contig, start, end) for contig in VCF
+  '''
+
+  tabixReader = pysam.Tabixfile(vcf)
+
+  regions = list()
+  for contig in tabixReader.contigs:
+    ## _parseRegion resolves a bare contig name to (tid, start, end)
+    row = tabixReader._parseRegion(region=contig)
+    start = row[-2]
+    end = row[-1]
+    regions.append((contig, start, end))
+  return regions
+  
+  
+
+
+

tools/consensus_caller/galaxy.consensus/consensus_tool/my_exceptions.py

+class ambiguousConcordance(Exception):
+  '''
+  Simple exception thrown when a sample/site contains multiple genotypes at a given threshold.
+  This class is defined to make this handling more explicit than using base exceptions.
+  '''
+  def __init__(self, value):
+     self.value = value
+
+  def __str__(self):
+     return self.value
+  
+class discordant(Exception):
+  '''
+  Simple exception thrown when a collection of genotypes is not concordant.
+  '''
+  def __init__(self, value):
+      self.value = value
+  def __str__(self):
+      return self.value
+

tools/consensus_caller/galaxy.consensus/consensus_tool/vcf_ensemble.py

+import vcf as pyvcf
+import itertools as it
+from variant_ensemble import variant_ensemble
+
+class simple_variant:
+  '''
+  Minimum representation of a variant.
+  '''
+
+  def __init__(self, rec, ALT):
+    '''
+    Take a pyVCF _Record and make a string ID. Must provide an explicit alternate allele ALT.
+    '''
+    self.ID = rec.CHROM + ':' + str(rec.POS) + ':' + str(rec.REF) + ':' + str(ALT)
+    self.contig = rec.CHROM
+    self.pos = rec.POS
+  def __hash__(self):
+    return hash(self.ID)
+  def __eq__(self, other):
+    if self.ID == other.ID: return True
+    else: return False
+  def __str__(self):
+    return self.ID    
+
+
+class vcf_ensemble:
+  '''
+  Represents an arbitrary number of VCF files describing the same data set.
+  '''
+  @property
+  def samples(self):
+    '''
+    Samples common to all input VCF files.
+    '''
+    ## cache the intersection so repeated access does not re-open the VCFs
+    if self._samples is None:
+      sampleSets = [ set(x.samples) for x in self.vcfReaders ]
+      self._samples = reduce( lambda x,y: x.intersection(y), sampleSets )
+    return self._samples
+  
+  
+  @property
+  def variants(self):
+    '''
+    List of variant sets found in each caller.
+    '''
+    variantSets = []
+    readers = [ pyvcf.Reader(open(x), prepend_chr = False) for x in self.vcfs]
+    for x in readers:
+      variants = set()
+      for rec in x:
+        for alt in rec.ALT:
+          variants.add(simple_variant(rec, alt))
+
+      variantSets.append(variants)
+    return variantSets
+
+  @property
+  def vcfReaders(self):
+    '''
+    Connections to VCF files.
+    '''
+    return [ pyvcf.Reader(open(x), prepend_chr = False) for x in self.vcfs ]
+
+
+  def __init__(self, *args, **kwargs):
+    self.vcfs = kwargs.get('vcfList')
+    self.ignoreMissing = kwargs.get('ignoreMissing')
+    self._samples = None
+
+  def _concordant_sites(self, threshold):
+    '''
+    Find variants that agree at some threshold.
+
+    This reduces to a union of intersections: for every combination of
+    `threshold` VCF variant sets, intersect the sets within the combination,
+    then take the union over all combinations.
+    '''
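+    ## Example (hypothetical): with three callers A, B, C and threshold 2,
+    ## this returns (A & B) | (A & C) | (B & C), i.e. every variant reported
+    ## by at least two of the three callers.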
+    sets = set()
+    for comb in it.combinations(self.variants, threshold):
+      sets.add( frozenset(reduce( lambda x, y: x.intersection(y), comb )) ) 
+    return reduce( lambda x, y: x.union(y), sets )
+
+  def concordant_variants(self, siteThresh, genoThresh):
+    '''
+    Reduce the ensemble of VCFs into a set of variant sites that agree at some threshold.
+    '''
+    variants = self._concordant_sites(siteThresh)
+    samples = self.samples
+    for variant in variants:
+      records = [ x.fetch(variant.contig, variant.pos) for x in self.vcfReaders ]
+      ## clean up missing records for a given threshold
+      records = [ x for x in records if x ]
+      ensemble = variant_ensemble(recordSet=[ x for x in records if x], samples=samples, threshold = genoThresh, ignoreMissing = self.ignoreMissing)
+      yield records, ensemble.set_concordance()
+
+
+

tools/consensus_caller/galaxy.consensus/index.html

+<!doctype html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="chrome=1">
+    <title>Consensus Genotyper for Exome Sequencing by vtrubets</title>
+
+    <link rel="stylesheet" href="stylesheets/styles.css">
+    <link rel="stylesheet" href="stylesheets/pygment_trac.css">
+    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+    <!--[if lt IE 9]>
+    <script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
+    <![endif]-->
+  </head>
+  <body>
+    <div class="wrapper">
+      <header>
+        <h1>Consensus Genotyper for Exome Sequencing</h1>
+        <p>Ensemble variant calling implementation and Galaxy wrappers.</p>
+
+        <p class="view"><a href="https://github.com/vtrubets/galaxy.consensus">View the Project on GitHub <small>vtrubets/galaxy.consensus</small></a></p>
+
+
+        <ul>
+          <li><a href="https://github.com/vtrubets/galaxy.consensus/zipball/master">Download <strong>ZIP File</strong></a></li>
+          <li><a href="https://github.com/vtrubets/galaxy.consensus/tarball/master">Download <strong>TAR Ball</strong></a></li>
+          <li><a href="https://github.com/vtrubets/galaxy.consensus">View On <strong>GitHub</strong></a></li>
+        </ul>
+      </header>
+      <section>
+        <h1>
+<a name="introduction" class="anchor" href="#introduction"><span class="octicon octicon-link"></span></a>Introduction:</h1>
+
+<p>This is an initial implementation of a two stage voting scheme among variant calling algorithms. Given a set of VCF files produced by various algorithms, sites are selected if they are seen among all callers. Genotypes among these sites are then selected as those that match among all callers. Currently, a user can input any number of sorted VCF files, and a strict consensus of variant sites and genotypes will be generated.</p>
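+
+<p>As a minimal sketch (an illustration, not the shipped implementation), assume each caller's calls have been reduced to a set of (chrom, pos, ref, alt) tuples; the strict site-level vote is then just a set intersection:</p>
+
+<pre><code># hypothetical helper: variant_sets holds one set of (chrom, pos, ref, alt) tuples per caller
+def strict_site_consensus(variant_sets):
+    # a site survives only if every caller reports it
+    return reduce(lambda a, b: a.intersection(b), variant_sets)
+</code></pre>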
+
+<p>Any VCF can be used as long as it can be parsed by <a href="https://github.com/jamescasbon/PyVCF">James Casbon's pyVCF module</a>.</p>
+
+<h1>
+<a name="options" class="anchor" href="#options"><span class="octicon octicon-link"></span></a>Options:</h1>
+
+<pre><code>usage: consensus_genotyper.py [-h] VCFS [VCFS ...]
+
+Find sites and genotypes which agree among an arbitrary number of VCF files.
+
+positional arguments:
+  VCFS        List of VCF files for input.
+
+optional arguments:
+  -h, --help  show this help message and exit
+</code></pre>
+
+<h1>
+<a name="usage" class="anchor" href="#usage"><span class="octicon octicon-link"></span></a>Usage:</h1>
+
+<p>Test data is located in the data/ directory. The following command:</p>
+
+<pre><code>python ./consensus_tool/consensus_genotyper.py ./data/*vcf &gt; test.output.vcf</code></pre>
+
+<p>This will take the three test files in the data directory and generate a strict consensus of sites and genotypes (i.e. 3/3 files contain the variant site, and 3/3 files agree on the genotype for a sample at that site).</p>
+
+<p>Some things to keep in mind: </p>
+
+<ul>
+<li>Multi-sample VCF files are currently supported, and the output will contain only samples which are found in all input files.</li>
+<li>Files must be sorted by physical position. This can be achieved with any VCF utility, such as <a href="http://vcftools.sourceforge.net/perl_module.html#vcf-sort">vcf-sort in vcftools</a>. The caller works by iterating simultaneously across all input files until a matching variant record is found. If a VCF file is not sorted consistently with the others, it is unlikely that any overlapping sites will be found.</li>
+<li>Missing data on the genotype level is ignored if actual genotypes are available in other VCF files. Missing data is produced only if all sites are missing, or if genotypes do not agree among all call sets.</li>
+</ul><h1>
+<a name="planned-extensions" class="anchor" href="#planned-extensions"><span class="octicon octicon-link"></span></a>Planned Extensions:</h1>
+
+<ul>
+<li>Operating on specific regions using the tabix index.</li>
+<li>Support for multi-allelic sites.</li>
+<li>Outputting variant sites which are discordant between callers. This is potentially interesting variation.</li>
+<li>The ability to specify concordance thresholds on the site and genotype level. This could be particularly helpful if one set of variants is markedly different from others, or if one is interested in finding the union of call sets rather than an intersection.</li>
+<li>The ability to preserve information from input VCF files. I'm thinking that it would help to specify this information in a high level configuration file. This would allow you to do things like propagate QUAL scores and compute with them downstream.</li>
+</ul>
+      </section>
+      <footer>
+        <p>This project is maintained by <a href="https://github.com/vtrubets">vtrubets</a></p>
+        <p><small>Hosted on GitHub Pages &mdash; Theme by <a href="https://github.com/orderedlist">orderedlist</a></small></p>
+      </footer>
+    </div>
+    <script src="javascripts/scale.fix.js"></script>
+    
+  </body>
+</html>

tools/consensus_caller/galaxy.consensus/javascripts/scale.fix.js

+var metas = document.getElementsByTagName('meta');
+var i;
+if (navigator.userAgent.match(/iPhone/i)) {
+  for (i=0; i<metas.length; i++) {
+    if (metas[i].name == "viewport") {
+      metas[i].content = "width=device-width, minimum-scale=1.0, maximum-scale=1.0";
+    }
+  }
+  document.addEventListener("gesturestart", gestureStart, false);
+}
+function gestureStart() {
+  for (i=0; i<metas.length; i++) {
+    if (metas[i].name == "viewport") {
+      metas[i].content = "width=device-width, minimum-scale=0.25, maximum-scale=1.6";
+    }
+  }
+}

tools/consensus_caller/galaxy.consensus/params.json

+{"name":"Consensus Genotyper for Exome Sequencing","tagline":"Ensemble variant calling implementation and Galaxy wrappers.","body":"Introduction:\r\n==============================\r\n\r\nThis is an initial implementation of a two stage voting scheme among variant calling algorithms. Given a set of VCF files produced by various algorithms, sites are selected if they are seen among all callers. Genotypes among these sites are then selected as those that match among all callers. Currently, a user can input any number of sorted VCF files, and a strict consensus of variant sites and genotypes will be generated.\r\n\r\nAny VCF can be used as long as it can be parsed by [James Casbon's pyVCF module](https://github.com/jamescasbon/PyVCF).\r\n\r\nOptions:\r\n========\r\n\r\n    usage: consensus_genotyper.py [-h] VCFS [VCFS ...]\r\n\r\n    Find sites and genotypes which aggree among an arbitrary number of VCF files.\r\n    \r\n    positional arguments:\r\n      VCFS        List of VCF files for input.\r\n\r\n    optional arguments:\r\n      -h, --help  show this help message and exit\r\n\r\n\r\n\r\nUsage:\r\n========\r\n\r\nTest data is located in the data/ directory. The following command:\r\n\r\npython ./consensus_tool/consensus_genotyper.py ./data/*vcf > test.output.vcf\r\n\r\nWill take the three test files in the data directory and generate a strict consensus of sites and genotypes (i.e. 3/3 files containt the variant site, and 3/3 files agree on the genotype for a sampple at that site).\r\n\r\nSome things to keep in mind: \r\n* Multi-sample VCF files are currently supported, and the output will contain only samples which are found in all input files.\r\n* Files must be sorted by physical position. This can be achieved using any VCF utility such as (vcf-sort in vcftools)[http://vcftools.sourceforge.net/perl_module.html#vcf-sort]. The caller works by iterating simultaneously across all input files until a matching variant record is found. If a VCF file is not sorted similarly, it is unlikely that any overlapping sites will be found.\r\n* Missing data on the genotype level is ignored if actual genotypes are available in other VCF files. Missing data is produced only if all sites are missing, or if genotypes do not agree among all call sets.\r\n\r\n\r\nPlanned Extensions:\r\n===================\r\n* Operating on specific regions using the tabix index.\r\n* Support for multi-allelic sites.\r\n* Outputting variant sites which are discordant between callers. This is potentially interesting variation.\r\n* The ability to specify concordance thresholds on the site and genotype level. This could be particularly helpful if one set of variants is markedly different from others, or if one is interested in finding the union of call sets rather than an intersection.\r\n* The ability to preserve information from input VCF files. I'm thinking that it would help to specify this information in a high level configuration file. This would allow you to do things like propagate QUAL scores and compute with them downstream.\r\n\r\n","google":"","note":"Don't delete this file! It's used internally to help with page regeneration."}

tools/consensus_caller/galaxy.consensus/stylesheets/pygment_trac.css

+.highlight  { background: #ffffff; }
+.highlight .c { color: #999988; font-style: italic } /* Comment */
+.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
+.highlight .k { font-weight: bold } /* Keyword */
+.highlight .o { font-weight: bold } /* Operator */
+.highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */
+.highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */
+.highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */
+.highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */
+.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
+.highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */
+.highlight .ge { font-style: italic } /* Generic.Emph */
+.highlight .gr { color: #aa0000 } /* Generic.Error */
+.highlight .gh { color: #999999 } /* Generic.Heading */
+.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
+.highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */
+.highlight .go { color: #888888 } /* Generic.Output */
+.highlight .gp { color: #555555 } /* Generic.Prompt */
+.highlight .gs { font-weight: bold } /* Generic.Strong */
+.highlight .gu { color: #800080; font-weight: bold; } /* Generic.Subheading */
+.highlight .gt { color: #aa0000 } /* Generic.Traceback */
+.highlight .kc { font-weight: bold } /* Keyword.Constant */
+.highlight .kd { font-weight: bold } /* Keyword.Declaration */
+.highlight .kn { font-weight: bold } /* Keyword.Namespace */
+.highlight .kp { font-weight: bold } /* Keyword.Pseudo */
+.highlight .kr { font-weight: bold } /* Keyword.Reserved */
+.highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */
+.highlight .m { color: #009999 } /* Literal.Number */
+.highlight .s { color: #d14 } /* Literal.String */
+.highlight .na { color: #008080 } /* Name.Attribute */
+.highlight .nb { color: #0086B3 } /* Name.Builtin */
+.highlight .nc { color: #445588; font-weight: bold } /* Name.Class */
+.highlight .no { color: #008080 } /* Name.Constant */
+.highlight .ni { color: #800080 } /* Name.Entity */
+.highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */
+.highlight .nf { color: #990000; font-weight: bold } /* Name.Function */
+.highlight .nn { color: #555555 } /* Name.Namespace */
+.highlight .nt { color: #000080 } /* Name.Tag */
+.highlight .nv { color: #008080 } /* Name.Variable */
+.highlight .ow { font-weight: bold } /* Operator.Word */
+.highlight .w { color: #bbbbbb } /* Text.Whitespace */
+.highlight .mf { color: #009999 } /* Literal.Number.Float */
+.highlight .mh { color: #009999 } /* Literal.Number.Hex */
+.highlight .mi { color: #009999 } /* Literal.Number.Integer */
+.highlight .mo { color: #009999 } /* Literal.Number.Oct */
+.highlight .sb { color: #d14 } /* Literal.String.Backtick */
+.highlight .sc { color: #d14 } /* Literal.String.Char */
+.highlight .sd { color: #d14 } /* Literal.String.Doc */
+.highlight .s2 { color: #d14 } /* Literal.String.Double */
+.highlight .se { color: #d14 } /* Literal.String.Escape */
+.highlight .sh { color: #d14 } /* Literal.String.Heredoc */
+.highlight .si { color: #d14 } /* Literal.String.Interpol */
+.highlight .sx { color: #d14 } /* Literal.String.Other */
+.highlight .sr { color: #009926 } /* Literal.String.Regex */
+.highlight .s1 { color: #d14 } /* Literal.String.Single */
+.highlight .ss { color: #990073 } /* Literal.String.Symbol */
+.highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */
+.highlight .vc { color: #008080 } /* Name.Variable.Class */
+.highlight .vg { color: #008080 } /* Name.Variable.Global */
+.highlight .vi { color: #008080 } /* Name.Variable.Instance */
+.highlight .il { color: #009999 } /* Literal.Number.Integer.Long */
+
+.type-csharp .highlight .k { color: #0000FF }
+.type-csharp .highlight .kt { color: #0000FF }
+.type-csharp .highlight .nf { color: #000000; font-weight: normal }
+.type-csharp .highlight .nc { color: #2B91AF }
+.type-csharp .highlight .nn { color: #000000 }
+.type-csharp .highlight .s { color: #A31515 }
+.type-csharp .highlight .sc { color: #A31515 }

tools/consensus_caller/galaxy.consensus/stylesheets/styles.css

+@import url(https://fonts.googleapis.com/css?family=Lato:300italic,700italic,300,700);
+
+body {
+  padding:50px;
+  font:14px/1.5 Lato, "Helvetica Neue", Helvetica, Arial, sans-serif;
+  color:#777;
+  font-weight:300;
+}
+
+h1, h2, h3, h4, h5, h6 {
+  color:#222;
+  margin:0 0 20px;
+}
+
+p, ul, ol, table, pre, dl {
+  margin:0 0 20px;
+}
+
+h1, h2, h3 {
+  line-height:1.1;
+}
+
+h1 {
+  font-size:28px;
+}
+
+h2 {
+  color:#393939;
+}
+
+h3, h4, h5, h6 {
+  color:#494949;
+}
+
+a {
+  color:#39c;
+  font-weight:400;
+  text-decoration:none;
+}
+
+a small {
+  font-size:11px;
+  color:#777;
+  margin-top:-0.6em;
+  display:block;
+}
+
+.wrapper {
+  width:860px;
+  margin:0 auto;
+}
+
+blockquote {
+  border-left:1px solid #e5e5e5;
+  margin:0;
+  padding:0 0 0 20px;
+  font-style:italic;
+}
+
+code, pre {
+  font-family:Monaco, Bitstream Vera Sans Mono, Lucida Console, Terminal;
+  color:#333;
+  font-size:12px;
+}
+
+pre {
+  padding:8px 15px;
+  background: #f8f8f8;  
+  border-radius:5px;
+  border:1px solid #e5e5e5;
+  overflow-x: auto;
+}
+
+table {
+  width:100%;
+  border-collapse:collapse;
+}
+
+th, td {
+  text-align:left;
+  padding:5px 10px;
+  border-bottom:1px solid #e5e5e5;
+}
+
+dt {
+  color:#444;
+  font-weight:700;
+}
+
+th {
+  color:#444;
+}
+
+img {
+  max-width:100%;
+}
+
+header {
+  width:270px;
+  float:left;
+  position:fixed;
+}
+
+header ul {
+  list-style:none;
+  height:40px;
+  
+  padding:0;
+  
+  background: #eee;
+  background: -moz-linear-gradient(top, #f8f8f8 0%, #dddddd 100%);
+  background: -webkit-gradient(linear, left top, left bottom, color-stop(0%,#f8f8f8), color-stop(100%,#dddddd));
+  background: -webkit-linear-gradient(top, #f8f8f8 0%,#dddddd 100%);
+  background: -o-linear-gradient(top, #f8f8f8 0%,#dddddd 100%);
+  background: -ms-linear-gradient(top, #f8f8f8 0%,#dddddd 100%);
+  background: linear-gradient(top, #f8f8f8 0%,#dddddd 100%);
+  
+  border-radius:5px;
+  border:1px solid #d2d2d2;
+  box-shadow:inset #fff 0 1px 0, inset rgba(0,0,0,0.03) 0 -1px 0;
+  width:270px;
+}
+
+header li {
+  width:89px;
+  float:left;
+  border-right:1px solid #d2d2d2;
+  height:40px;
+}
+
+header ul a {
+  line-height:1;
+  font-size:11px;
+  color:#999;
+  display:block;
+  text-align:center;
+  padding-top:6px;
+  height:40px;
+}
+
+strong {
+  color:#222;
+  font-weight:700;
+}
+
+header ul li + li {
+  width:88px;
+  border-left:1px solid #fff;
+}
+
+header ul li + li + li {
+  border-right:none;
+  width:89px;
+}
+
+header ul a strong {
+  font-size:14px;
+  display:block;
+  color:#222;
+}
+
+section {
+  width:500px;
+  float:right;
+  padding-bottom:50px;
+}
+
+small {
+  font-size:11px;
+}
+
+hr {
+  border:0;
+  background:#e5e5e5;
+  height:1px;
+  margin:0 0 20px;
+}
+
+footer {
+  width:270px;
+  float:left;
+  position:fixed;
+  bottom:50px;
+}
+
+@media print, screen and (max-width: 960px) {
+  
+  div.wrapper {
+    width:auto;
+    margin:0;
+  }
+  
+  header, section, footer {
+    float:none;
+    position:static;
+    width:auto;
+  }
+  
+  header {
+    padding-right:320px;
+  }
+  
+  section {
+    border:1px solid #e5e5e5;
+    border-width:1px 0;
+    padding:20px 0;
+    margin:0 0 20px;
+  }
+  
+  header a small {
+    display:inline;
+  }
+  
+  header ul {
+    position:absolute;
+    right:50px;
+    top:52px;
+  }
+}
+
+@media print, screen and (max-width: 720px) {
+  body {
+    word-wrap:break-word;
+  }
+  
+  header {
+    padding:0;
+  }
+  
+  header ul, header p.view {
+    position:static;
+  }
+  
+  pre, code {
+    word-wrap:normal;
+  }
+}
+
+@media print, screen and (max-width: 480px) {
+  body {
+    padding:15px;
+  }
+  
+  header ul {
+    display:none;
+  }
+}
+
+@media print {
+  body {
+    padding:0.4in;
+    font-size:12pt;
+    color:#444;
+  }
+}