Wiki

Here is an example pipeline for aligning DNA sequencing (WGS/WXS) and related preprocessing steps. These steps are according to GATK best practices (https://software.broadinstitute.org/gatk/best-practices/) at the time of creating this document.

Briefly, the steps are as the following:

1- Alignment (bwa mem) by lane (http://bio-bwa.sourceforge.net/)

2- Sorting (PICARD) by lane (http://software.broadinstitute.org/cancer/software/genepattern/modules/docs/Picard.SortSam)

3- Merging bam files (by lane -> by sample)

4- Marking duplicates (PICARD) by sample

5- Base quality recalibration (GATK) by sample.

Note that realignment around indels is skipped here since the planned downstream analysis is performed with MuTect2/Haplotype Caller which perform this step on the go.

6- Collect statistics e.g. coverage and alignment efficiency.

Inputs for the pipeline include:

A list of fastq files, optionally quality controlled.
Reference genome in fasta format.
dbSNP and gold standard indels from 1kg in vcf format which can be downloaded from GATK resource bundle (ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/).

Outputs of the pipeline are:

Aligned BAM files ready for downstream analyses.
Alignment statistics

Dependencies:

bwa, GATK, PICARD, bedtools

#!/usr/bin/env anduril
//$OPT --threads 100
//$OPT --wrapper slurm-prefix

import anduril.builtin._
import anduril.tools._
import org.anduril.runtime._
import anduril.microarray._
import anduril.sequencing._

object alignment {

  // The following list variable should be a list of fastq files with the following columns:
  // Sample: sample id, this will be the same for all files from the same sample
  // Key: file id, this is unique for each file
  // library: sequencing library ID
  // Reads: path to reads fastq files
  // Mates: path to mates fastq files

  //This creates the list variable described above:
  val list     = INPUT(path="files/list.csv") 

  //Now we will do the same for the other needed input files: reference, dbsnp vcf file and gold standard 
  //indels and set up the needed dependencies

  // reference genome fasta file
  val reference = INPUT(path="files/GRCh38.d1.vd1.fa")

  // dbsnp vcf file
  val dbsnp = INPUT(path="files/dbsnp_144_sorted.hg38.vcf")

  // Mills and 1000G gold standard indels
  val indelsGold = INPUT(path="files/Mills_and_1000G_gold_standard.indels.hg38.vcf")

  val gatk = "path/to/gatk/folder"
  val picard = "path/to/picard/folder"
  val bedtools = "path/to/bedtools/binary/folder"
  // bwa should be in your PATH


  //The following section initializes NamedMap objects that will be used later on to name the items 
  //inserted to them

  val reads   = NamedMap[INPUT]("reads")
  val mates   = NamedMap[INPUT]("mates")
  val aligned = NamedMap[BashEvaluate]("aligned")
  val sorted  = NamedMap[BashEvaluate]("sorted")
  val sortedOut = NamedMap[Any]("sortedOut")

  //This will iterate over the different files in the list variable
  for ( rowMap <- iterCSV(list) ) {

    val Sample  = rowMap("Sample")
    val key      = rowMap("Key")
    val library = rowMap("library")

    //Here the NamedMap objects are used to name the different columns in the fasq file
    reads(key) = INPUT(path=rowMap("Reads"))
    mates(key) = INPUT(path=rowMap("Mates"))

  // step 1

    //Alignment is done using the bwa aligner with bash commands
    aligned(key) = BashEvaluate(var1=reads(key),
                                var2=mates(key),
                                var3=reference,
                                param1=Sample,
                                param2=library,
                                param3=key,
                                script="""
                                       bwa mem -M -t 1 -R "@RG\\tID:@param3@\\tPL:ILLUMINA\\tLB:@param2@\\tSM:@param1@" @var3@ @var1@ @var2@ > @out1@
                                       """)
    aligned(key)._keep = false
    aligned(key)._filename("out1","aligned.bam")


  // step2

    //Sorting by using PICARD SamSort with bash commands
    sorted(key) = BashEvaluate(var1=aligned(key).out1,
                               var2=reference,
                               param1 = picard,
                               script="""
                                      java -Xmx4g -jar @param1@/picard.jar SortSam VALIDATION_STRINGENCY=SILENT SORT_ORDER=coordinate \
                                      CREATE_INDEX=true CREATE_MD5_FILE=true INPUT=@var1@ OUTPUT=@out1@ TMP_DIR=$( gettempdir )
                                      """)
    sorted(key)._filename("out1","sorted.bam")
    sorted(key)._custom("memory") = "8000"

    sortedOut(key) = sorted(key).out1
  }  

  val bamCSV = Array2CSV(in=sortedOut)

  // add sample ID to keys
  val bamCSVperSample = CSVTransformer(csv1=bamCSV,
                                       csv2=list,
                                       transform1="""
                                                  csv <- merge(csv1,csv2[,1:2],by=1,sort=F)
                                                  data.frame(Sample=csv[,3],File=csv[,2],Key=csv[,1])
                                                  """)

  // split to array by sample ID
  val bySampleArray = REvaluate(table1=bamCSVperSample,
                                script="""
                                       array.out <- split(table1[,c(3,2)],table1[,1])
                                       table.out <- data.frame()
                                       """)

  val bySampleCSV = Array2CSV(in = bySampleArray.outArray)


  val statSummary = NamedMap[Any]("statSummary")
  val outBam = NamedMap[Any]("outBam")

  for ((k,v) <- iterArray(bySampleArray.outArray))
  {
    withName(k) 
    {   

    val sampleBamsCSV = bySampleArray.outArray(k)

    val sampleBamsArray = CSV2Array(in = sampleBamsCSV) 

 // The rest of the steps are done simply by using different anduril components 

 // step 3   
    val merged = BamCombiner(in = sampleBamsArray,
                             picard=picard)
    merged._keep = false

 // step 4   
    val marked = DuplicateMarker(in=merged.out,
                                 picard=picard)

 // step 5   
    val recal = BaseRecalibrator(reference = reference,
                                 bam = marked.outBam,
                                 dbsnp = dbsnp,
                                 mask = indelsGold,
                                 gatk = gatk,
                                 plot= false)

 // step 6   
    val stats = BamStats(bam = recal.alignment,
                         refGenome = reference,
                         markStats = marked.outMetrics,
                         targeted=false, // if your data is from exome set to true and provide targets input
                         sampleName=k,
                         maxCoverage=500,
                         picard=picard,
                         bedtools=bedtools)                                      

    statSummary(k) = stats.summary
    outBam(k) = recal.alignment  
      }                            


  }   
  // statistics summary output
  val finalSummary = CSVJoin(in = statSummary)

  // csv having paths to BAM files
  val outBamCSV = Array2CSV(in = outBam)


}