Source

utils / GATK.makefile

Full commit
GATK=/usr/local/bin/GenomeAnalysisTK.jar
R=/home/dcittaro/usr/bin/R
MarkDuplicates=/usr/local/bin/MarkDuplicates.jar
MergeSam=/usr/local/bin/MergeSamFiles.jar
dbsnp=/usr/local/scratch/genomes/hg19/annotation/dbsnp-hg19-136.vcf.gz
1kg_omni=/usr/local/scratch/genomes/hg19/annotation/1000G_omni2.5.hg19.sites.vcf.gz
hapmap=/usr/local/scratch/genomes/hg19/annotation/hapmap_3.3.hg19.sites.vcf.gz
mills=/usr/local/scratch/genomes/hg19/annotation/Mills_Devine_2hit.indels.hg19.sites.vcf.gz
ref_genome=/usr/local/scratch/genomes/hg19/fa/hg19.fa
experiment_name=SNV


# GATK parameters
CALL_CONF = 30.0
WHOLE_GENOME = 1


bamfiles := $(filter-out $(wildcard $(experiment_name)*.bam), $(wildcard *.bam))
bamfiles := $(filter-out $(wildcard *_dedup.bam),$(bamfiles))
bamdedup := $(patsubst %.bam,%_dedup.bam,$(bamfiles))

ifeq ($(bamdedup), '')
bamdedup := $(join $(bamdedup), $(wildcard *_dedup.bam))
endif

# chromosomes should be read from BAM header...
chromosomes := $(shell samtools view -H $(bamfiles) | cut -f2 | grep SN | cut -d":" -f2)
split_realign := $(patsubst %,$(experiment_name).%.realign.bam,$(chromosomes))
split_recal := $(patsubst %,$(experiment_name).%.realign.recal.bam,$(chromosomes))

all: $(experiment_name).final.vcf CovariateAnalysis VQSRPlot
	@echo "All Done"

dedup: $(bamdedup)
	@echo "Created all dedup'ed files:" $^
	touch $<

merge: $(experiment_name).bam
	@echo "Merge of single files done:" $<
	touch $<

realign: $(experiment_name).realign.bam
	@echo "Local realignment done:" $<
	touch $<

recalibrate: $(experiment_name).realign.recal.bam
	@echo "Created recalibrated file:" $<
	touch $<

variants: $(experiment_name).vcf
	@echo "Created variants file:" $<
	touch $<

vqsr: $(experiment_name).final.vcf
	@echo "Performed VQSR for all variants:" $<
	touch $<

$(bamdedup): $(bamfiles)
	java -jar $(MarkDuplicates) I=$(patsubst %_dedup.bam,%.bam,$@) \
	O=$@ \
	CREATE_INDEX=false \
	VALIDATION_STRINGENCY=SILENT \
	ASSUME_SORTED=true \
	REMOVE_DUPLICATES=true \
	METRICS_FILE=$(patsubst %_dedup.bam,%.metrics,$@)

$(experiment_name).bam: *_dedup.bam $(bamdedup)
	java -jar $(MergeSam) $(foreach bamfile,$^,I=$(bamfile)) \
	O=$(experiment_name).bam \
	CREATE_INDEX=true \
	MSD=true \
	VALIDATION_STRINGENCY=SILENT \
	ASSUME_SORTED=true \
	USE_THREADING=true

$(experiment_name).intervals: $(experiment_name).bam
	java -Xmx8g -jar $(GATK) \
	-I $< \
	-T RealignerTargetCreator \
	-R $(ref_genome) \
	-o $@ \
	--known $(dbsnp)

$(experiment_name).%.realign.bam: $(experiment_name).intervals $(experiment_name).bam
	$(eval chrom := $(subst .realign.bam,,$(subst $(experiment_name).,,$@)))
	@echo "Realign chromosome" $(chrom)
	java -Xmx8g -jar $(GATK) \
	-I $(experiment_name).bam \
	-R $(ref_genome) \
	-L $(chrom) \
	-T IndelRealigner \
	-targetIntervals $< \
	-o $@ \
	-known $(dbsnp)


$(experiment_name).realign.bam: $(split_realign)
	@echo "Merging" $*
	java -jar $(MergeSam) $(foreach bamfile,$+,I=$(bamfile)) \
	O=$@ \
	CREATE_INDEX=true \
	MSD=true \
	VALIDATION_STRINGENCY=SILENT \
	ASSUME_SORTED=true \
	USE_THREADING=true
	@rm -f $(split_realign)

$(experiment_name).recal.csv: $(experiment_name).realign.bam
	java -Xmx16g -jar $(GATK) \
	-R $(ref_genome) \
	-knownSites $(dbsnp) \
	-I $< \
	-T CountCovariates \
	-nt 8 \
	-cov ReadGroupCovariate \
	-cov QualityScoreCovariate \
	-cov CycleCovariate \
	-cov DinucCovariate \
	-cov MappingQualityCovariate \
	-recalFile $@

$(experiment_name).%.realign.recal.bam: $(experiment_name).recal.csv $(experiment_name).realign.bam
	$(eval chrom := $(subst .realign.recal.bam,,$(subst $(experiment_name).,,$@)))
	@echo "Recalibrating chromosome" $(chrom)
	java -Xmx16g -jar $(GATK) \
	-R $(ref_genome) \
	-I $(experiment_name).realign.bam \
	-T TableRecalibration \
	--doNotWriteOriginalQuals \
	-o $@ \
	-L $(chrom) \
	-recalFile $<

$(experiment_name).realign.recal.bam: $(split_recal)
	@echo "Merging" $*
	java -jar $(MergeSam) $(foreach bamfile,$+,I=$(bamfile)) \
	O=$@ \
	CREATE_INDEX=true \
	MSD=true \
	VALIDATION_STRINGENCY=SILENT \
	ASSUME_SORTED=true \
	USE_THREADING=true
	@rm -f $(split_recal)


$(experiment_name).vcf: $(experiment_name).realign.recal.bam
	java -Xmx64g -jar $(GATK) \
	-R $(ref_genome) \
	--dbsnp $(dbsnp) \
	-I $< \
	-T UnifiedGenotyper \
	-nt 8 \
	-stand_call_conf $(CALL_CONF) \
	-glm BOTH \
	-maxAlleles 7 \
	-o $@

ifeq ($(WHOLE_GENOME),1)
$(experiment_name).snp.recal.csv $(experiment_name).snp.tranches $(experiment_name).snp.plot.R: $(experiment_name).vcf
	java -Xmx16g -jar $(GATK) \
	-T VariantRecalibrator \
	-R $(ref_genome) \
	-input $< \
	-resource:hapmap,VCF,known=false,training=true,truth=true,prior=15.0 $(hapmap) \
	-resource:omni,VCF,known=false,training=true,truth=false,prior=12.0 $(okg_omni) \
	-resource:dbsnp,VCF,known=true,training=false,truth=false,prior=6.0 $(dbsnp) \
	-an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an DP \
	-mode SNP \
	-recalFile $(experiment_name).snp.recal.csv \
	-tranchesFile $(experiment_name).snp.tranches \
	-rscriptFile $(experiment_name).snp.plot.R

$(experiment_name)_snp.vcf: $(experiment_name).vcf $(experiment_name).snp.recal.csv $(experiment_name).snp.tranches
	java -Xmx16g -jar $(GATK) \
	-T ApplyRecalibration \
	-R $(ref_genome) \
	-input $< \
	--ts_filter_level 0.95 \
	-tranchesFile $(experiment_name).snp.tranches \
	-recalFile $(experiment_name).snp.recal.csv \
	--mode SNP \
	-o $@

$(experiment_name).indel.recal.csv $(experiment_name).indel.tranches $(experiment_name).indel.plot.R: $(experiment_name).vcf
	java -Xmx16g -jar $(GATK) \
	-T VariantRecalibrator \
	-R $(ref_genome) \
	-input $< \
	-resource:mills,VCF,known=true,training=true,truth=true,prior=12.0 $(mills) \
	-an QD -an FS -an HaplotypeScore -an ReadPosRankSum \
	-mode INDEL \
	-recalFile $(experiment_name).indel.recal.csv \
	-tranchesFile $(experiment_name).indel.tranches \
	-rscriptFile $(experiment_name).indel.plot.R

$(experiment_name)_indel.vcf: $(experiment_name).vcf $(experiment_name).indel.recal.csv $(experiment_name).indel.tranches
	java -Xmx16g -jar $(GATK) \
	-T ApplyRecalibration \
	-R $(ref_genome) \
	-input $< \
	--ts_filter_level 0.95 \
	-tranchesFile $(experiment_name).indel.tranches \
	-recalFile $(experiment_name).indel.recal.csv \
	--mode INDEL \
	-o $@

else
$(experiment_name).snp.recal.csv $(experiment_name).snp.tranches $(experiment_name).snp.plot.R: $(experiment_name).vcf
	java -Xmx16g -jar $(GATK) \
	-T VariantRecalibrator \
	-R $(ref_genome) \
	-input $< \
	--maxGaussians 4 \
	-percentBad 0.05 \
	-resource:hapmap,VCF,known=false,training=true,truth=true,prior=15.0 $(hapmap) \
	-resource:omni,VCF,known=false,training=true,truth=false,prior=12.0 $(okg_omni) \
	-resource:dbsnp,VCF,known=true,training=false,truth=false,prior=6.0 $(dbsnp) \
	-an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an DP \
	-mode SNP \
	-recalFile $(experiment_name).snp.recal.csv \
	-tranchesFile $(experiment_name).snp.tranches \
	-rscriptFile $(experiment_name).snp.plot.R

$(experiment_name)_snp.vcf: $(experiment_name).vcf $(experiment_name).snp.recal.csv $(experiment_name).snp.tranches
	java -Xmx16g -jar $(GATK) \
	-T ApplyRecalibration \
	-R $(ref_genome) \
	-input $< \
	--ts_filter_level 0.95 \
	-tranchesFile $(experiment_name).snp.tranches \
	-recalFile $(experiment_name).snp.recal.csv \
	--mode SNP \
	-o $@

$(experiment_name)_indel.vcf: $(experiment_name).vcf
	java -Xmx16g -jar $(GATK) \
	-R $(ref_genome) \
	-T VariantFiltration \
	-o $@ \
	--variant $< \
	--filterExpression "QD < 2.0" \
	--filterExpression "ReadPosRankSum < -20.0" \
	--filterExpression "FS > 200.0" \
	--filterName QDFilter \
	--filterName ReadPosFilter \
	--filterName FSFilter \

endif

$(experiment_name).final.vcf: $(experiment_name)_snp.vcf $(experiment_name)_indel.vcf
	java -Xmx16g -jar ${GATK} \
	-R $(ref_genome) \
	-T CombineVariants \
	--variant $(experiment_name)_snp.vcf \
	--variant $(experiment_name)_indel.vcf  \
	--assumeIdenticalSamples \
	-o $@ \
	-genotypeMergeOptions UNSORTED


CovariateAnalysis: $(experiment_name).recal.csv
	@mkdir -p Plots
	java -Xmx16g -jar  /usr/local/bin/AnalyzeCovariates.jar \
	-recalFile $< \
	-outputDir Plots 
	@touch $@

VQSRPlot: vqsr
	@mkdir -p Plots
	$(R) --vanilla < $(experiment_name).snp.plot.R
	@mv $(experiment_name).snp.plot.R.pdf Plots
ifeq ($(WHOLE_GENOME), 1)
	$(R) --vanilla < $(experiment_name).indel.plot.R
	@mv $(experiment_name).indel.plot.R.pdf Plots
endif
	touch $@