Wiki

Clone wiki

sequencing / Applications_DNAseq_alignment

Here is an example pipeline for aligning DNA sequencing (WGS/WXS) and related preprocessing steps. These steps are according to GATK best practices (https://software.broadinstitute.org/gatk/best-practices/) at the time of creating this document.

Briefly, the steps are as the following:

1- Alignment (bwa mem) by lane (http://bio-bwa.sourceforge.net/)

2- Sorting (PICARD) by lane (http://software.broadinstitute.org/cancer/software/genepattern/modules/docs/Picard.SortSam)

3- Merging bam files (by lane -> by sample)

4- Marking duplicates (PICARD) by sample

5- Base quality recalibration (GATK) by sample.

Note that realignment around indels is skipped here since the planned downstream analysis is performed with MuTect2/Haplotype Caller which perform this step on the go.

6- Collect statistics e.g. coverage and alignment efficiency.

Inputs for the pipeline include:

  • A list of fastq files, optionally quality controlled.

  • Reference genome in fasta format.

  • dbSNP and gold standard indels from 1kg in vcf format which can be downloaded from GATK resource bundle (ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/).

Outputs of the pipeline are:

  • Aligned BAM files ready for downstream analyses.

  • Alignment statistics

Dependencies:

bwa, GATK, PICARD, bedtools

#!/usr/bin/env anduril
//$OPT --threads 100
//$OPT --wrapper slurm-prefix

import anduril.builtin._
import anduril.tools._
import org.anduril.runtime._
import anduril.microarray._
import anduril.sequencing._

object alignment {

  // The following list variable should be a list of fastq files with the following columns:
  // Sample: sample id, this will be the same for all files from the same sample
  // Key: file id, this is unique for each file
  // library: sequencing library ID
  // Reads: path to reads fastq files
  // Mates: path to mates fastq files

  //This creates the list variable described above:
  val list     = INPUT(path="files/list.csv") 

  //Now we will do the same for the other needed input files: reference, dbsnp vcf file and gold standard 
  //indels and set up the needed dependencies

  // reference genome fasta file
  val reference = INPUT(path="files/GRCh38.d1.vd1.fa")

  // dbsnp vcf file
  val dbsnp = INPUT(path="files/dbsnp_144_sorted.hg38.vcf")

  // Mills and 1000G gold standard indels
  val indelsGold = INPUT(path="files/Mills_and_1000G_gold_standard.indels.hg38.vcf")

  val gatk = "path/to/gatk/folder"
  val picard = "path/to/picard/folder"
  val bedtools = "path/to/bedtools/binary/folder"
  // bwa should be in your PATH


  //The following section initializes NamedMap objects that will be used later on to name the items 
  //inserted to them

  val reads   = NamedMap[INPUT]("reads")
  val mates   = NamedMap[INPUT]("mates")
  val aligned = NamedMap[BashEvaluate]("aligned")
  val sorted  = NamedMap[BashEvaluate]("sorted")
  val sortedOut = NamedMap[Any]("sortedOut")

  //This will iterate over the different files in the list variable
  for ( rowMap <- iterCSV(list) ) {

    val Sample  = rowMap("Sample")
    val key      = rowMap("Key")
    val library = rowMap("library")

    //Here the NamedMap objects are used to name the different columns in the fasq file
    reads(key) = INPUT(path=rowMap("Reads"))
    mates(key) = INPUT(path=rowMap("Mates"))

  // step 1

    //Alignment is done using the bwa aligner with bash commands
    aligned(key) = BashEvaluate(var1=reads(key),
                                var2=mates(key),
                                var3=reference,
                                param1=Sample,
                                param2=library,
                                param3=key,
                                script="""
                                       bwa mem -M -t 1 -R "@RG\\tID:@param3@\\tPL:ILLUMINA\\tLB:@param2@\\tSM:@param1@" @var3@ @var1@ @var2@ > @out1@
                                       """)
    aligned(key)._keep = false
    aligned(key)._filename("out1","aligned.bam")


  // step2

    //Sorting by using PICARD SamSort with bash commands
    sorted(key) = BashEvaluate(var1=aligned(key).out1,
                               var2=reference,
                               param1 = picard,
                               script="""
                                      java -Xmx4g -jar @param1@/picard.jar SortSam VALIDATION_STRINGENCY=SILENT SORT_ORDER=coordinate \
                                      CREATE_INDEX=true CREATE_MD5_FILE=true INPUT=@var1@ OUTPUT=@out1@ TMP_DIR=$( gettempdir )
                                      """)
    sorted(key)._filename("out1","sorted.bam")
    sorted(key)._custom("memory") = "8000"

    sortedOut(key) = sorted(key).out1
  }  

  val bamCSV = Array2CSV(in=sortedOut)

  // add sample ID to keys
  val bamCSVperSample = CSVTransformer(csv1=bamCSV,
                                       csv2=list,
                                       transform1="""
                                                  csv <- merge(csv1,csv2[,1:2],by=1,sort=F)
                                                  data.frame(Sample=csv[,3],File=csv[,2],Key=csv[,1])
                                                  """)

  // split to array by sample ID
  val bySampleArray = REvaluate(table1=bamCSVperSample,
                                script="""
                                       array.out <- split(table1[,c(3,2)],table1[,1])
                                       table.out <- data.frame()
                                       """)

  val bySampleCSV = Array2CSV(in = bySampleArray.outArray)


  val statSummary = NamedMap[Any]("statSummary")
  val outBam = NamedMap[Any]("outBam")

  for ((k,v) <- iterArray(bySampleArray.outArray))
  {
    withName(k) 
    {   

    val sampleBamsCSV = bySampleArray.outArray(k)

    val sampleBamsArray = CSV2Array(in = sampleBamsCSV) 

 // The rest of the steps are done simply by using different anduril components 

 // step 3   
    val merged = BamCombiner(in = sampleBamsArray,
                             picard=picard)
    merged._keep = false

 // step 4   
    val marked = DuplicateMarker(in=merged.out,
                                 picard=picard)

 // step 5   
    val recal = BaseRecalibrator(reference = reference,
                                 bam = marked.outBam,
                                 dbsnp = dbsnp,
                                 mask = indelsGold,
                                 gatk = gatk,
                                 plot= false)

 // step 6   
    val stats = BamStats(bam = recal.alignment,
                         refGenome = reference,
                         markStats = marked.outMetrics,
                         targeted=false, // if your data is from exome set to true and provide targets input
                         sampleName=k,
                         maxCoverage=500,
                         picard=picard,
                         bedtools=bedtools)                                      

    statSummary(k) = stats.summary
    outBam(k) = recal.alignment  
      }                            


  }   
  // statistics summary output
  val finalSummary = CSVJoin(in = statSummary)

  // csv having paths to BAM files
  val outBamCSV = Array2CSV(in = outBam)


}

Updated