Wiki
Clone wikisequencing / Applications_DNAseq_alignment
Here is an example pipeline for aligning DNA sequencing (WGS/WXS) and related preprocessing steps. These steps are according to GATK best practices (https://software.broadinstitute.org/gatk/best-practices/) at the time of creating this document.
Briefly, the steps are as the following:
1- Alignment (bwa mem) by lane (http://bio-bwa.sourceforge.net/)
2- Sorting (PICARD) by lane (http://software.broadinstitute.org/cancer/software/genepattern/modules/docs/Picard.SortSam)
3- Merging bam files (by lane -> by sample)
4- Marking duplicates (PICARD) by sample
5- Base quality recalibration (GATK) by sample.
Note that realignment around indels is skipped here since the planned downstream analysis is performed with MuTect2/Haplotype Caller which perform this step on the go.
6- Collect statistics e.g. coverage and alignment efficiency.
Inputs for the pipeline include:
-
A list of fastq files, optionally quality controlled.
-
Reference genome in fasta format.
-
dbSNP and gold standard indels from 1kg in vcf format which can be downloaded from GATK resource bundle (ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/).
Outputs of the pipeline are:
-
Aligned BAM files ready for downstream analyses.
-
Alignment statistics
Dependencies:
bwa, GATK, PICARD, bedtools
#!/usr/bin/env anduril //$OPT --threads 100 //$OPT --wrapper slurm-prefix import anduril.builtin._ import anduril.tools._ import org.anduril.runtime._ import anduril.microarray._ import anduril.sequencing._ object alignment { // The following list variable should be a list of fastq files with the following columns: // Sample: sample id, this will be the same for all files from the same sample // Key: file id, this is unique for each file // library: sequencing library ID // Reads: path to reads fastq files // Mates: path to mates fastq files //This creates the list variable described above: val list = INPUT(path="files/list.csv") //Now we will do the same for the other needed input files: reference, dbsnp vcf file and gold standard //indels and set up the needed dependencies // reference genome fasta file val reference = INPUT(path="files/GRCh38.d1.vd1.fa") // dbsnp vcf file val dbsnp = INPUT(path="files/dbsnp_144_sorted.hg38.vcf") // Mills and 1000G gold standard indels val indelsGold = INPUT(path="files/Mills_and_1000G_gold_standard.indels.hg38.vcf") val gatk = "path/to/gatk/folder" val picard = "path/to/picard/folder" val bedtools = "path/to/bedtools/binary/folder" // bwa should be in your PATH //The following section initializes NamedMap objects that will be used later on to name the items //inserted to them val reads = NamedMap[INPUT]("reads") val mates = NamedMap[INPUT]("mates") val aligned = NamedMap[BashEvaluate]("aligned") val sorted = NamedMap[BashEvaluate]("sorted") val sortedOut = NamedMap[Any]("sortedOut") //This will iterate over the different files in the list variable for ( rowMap <- iterCSV(list) ) { val Sample = rowMap("Sample") val key = rowMap("Key") val library = rowMap("library") //Here the NamedMap objects are used to name the different columns in the fasq file reads(key) = INPUT(path=rowMap("Reads")) mates(key) = INPUT(path=rowMap("Mates")) // step 1 //Alignment is done using the bwa aligner with bash commands aligned(key) = BashEvaluate(var1=reads(key), var2=mates(key), var3=reference, param1=Sample, param2=library, param3=key, script=""" bwa mem -M -t 1 -R "@RG\\tID:@param3@\\tPL:ILLUMINA\\tLB:@param2@\\tSM:@param1@" @var3@ @var1@ @var2@ > @out1@ """) aligned(key)._keep = false aligned(key)._filename("out1","aligned.bam") // step2 //Sorting by using PICARD SamSort with bash commands sorted(key) = BashEvaluate(var1=aligned(key).out1, var2=reference, param1 = picard, script=""" java -Xmx4g -jar @param1@/picard.jar SortSam VALIDATION_STRINGENCY=SILENT SORT_ORDER=coordinate \ CREATE_INDEX=true CREATE_MD5_FILE=true INPUT=@var1@ OUTPUT=@out1@ TMP_DIR=$( gettempdir ) """) sorted(key)._filename("out1","sorted.bam") sorted(key)._custom("memory") = "8000" sortedOut(key) = sorted(key).out1 } val bamCSV = Array2CSV(in=sortedOut) // add sample ID to keys val bamCSVperSample = CSVTransformer(csv1=bamCSV, csv2=list, transform1=""" csv <- merge(csv1,csv2[,1:2],by=1,sort=F) data.frame(Sample=csv[,3],File=csv[,2],Key=csv[,1]) """) // split to array by sample ID val bySampleArray = REvaluate(table1=bamCSVperSample, script=""" array.out <- split(table1[,c(3,2)],table1[,1]) table.out <- data.frame() """) val bySampleCSV = Array2CSV(in = bySampleArray.outArray) val statSummary = NamedMap[Any]("statSummary") val outBam = NamedMap[Any]("outBam") for ((k,v) <- iterArray(bySampleArray.outArray)) { withName(k) { val sampleBamsCSV = bySampleArray.outArray(k) val sampleBamsArray = CSV2Array(in = sampleBamsCSV) // The rest of the steps are done simply by using different anduril components // step 3 val merged = BamCombiner(in = sampleBamsArray, picard=picard) merged._keep = false // step 4 val marked = DuplicateMarker(in=merged.out, picard=picard) // step 5 val recal = BaseRecalibrator(reference = reference, bam = marked.outBam, dbsnp = dbsnp, mask = indelsGold, gatk = gatk, plot= false) // step 6 val stats = BamStats(bam = recal.alignment, refGenome = reference, markStats = marked.outMetrics, targeted=false, // if your data is from exome set to true and provide targets input sampleName=k, maxCoverage=500, picard=picard, bedtools=bedtools) statSummary(k) = stats.summary outBam(k) = recal.alignment } } // statistics summary output val finalSummary = CSVJoin(in = statSummary) // csv having paths to BAM files val outBamCSV = Array2CSV(in = outBam) }
Updated