Commits

Daniel Blankenberg committed 88afa15

Add SAMTools mpileup.

Comments (0)

Files changed (8)

datatypes_conf.xml.sample

       <converter file="vcf_to_summary_tree_converter.xml" target_datatype="summary_tree"/>
       <display file="igv/vcf.xml" />
     </datatype>
+    <datatype extension="bcf" type="galaxy.datatypes.binary:Binary" subclass="True"/>
     <datatype extension="wsf" type="galaxy.datatypes.wsf:SnpFile" display_in_upload="true"/>
     <datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/>
     <datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true">

test-data/samtools/mpileup/samtools_mpileup_out_1.log

+[mpileup] 1 samples in 1 input files
+<mpileup> Set max per-file depth to 8000

test-data/samtools/mpileup/samtools_mpileup_out_1.pileup

+phiX174	1411	A	1	^P.	$
+phiX174	1412	G	3	.^D.^F.	"$$
+phiX174	1413	C	5	...^D.^F.	"""$$
+phiX174	1414	G	6	.....^F.	#####$
+phiX174	1415	C	7	......^F.	%%%%%%&
+phiX174	1416	C	8	.......^F.	$$$$$$$$
+phiX174	1417	G	9	........^F.	"#######$
+phiX174	1418	T	10	.........^F.	"""""""""$
+phiX174	1419	G	10	..........	"""""'&'%$
+phiX174	1420	G	10	..........	""""""""""
+phiX174	1421	A	10	..........	""""""""""
+phiX174	1422	T	10	..........	""""""""""
+phiX174	1423	G	10	..........	"""""""""#
+phiX174	1424	C	10	..A.AAAAAA	%"""""""""
+phiX174	1425	C	10	..........	$$$"""""""
+phiX174	1426	T	10	..........	#####"""""
+phiX174	1427	G	10	..........	######""""
+phiX174	1428	A	10	..........	""""""""""
+phiX174	1429	C	10	..........	((((((&(""
+phiX174	1430	C	10	..........	$$$$$$$$$"
+phiX174	1431	G	10	..........	##########
+phiX174	1432	T	10	..........	""""""""""
+phiX174	1433	A	10	..........	##########
+phiX174	1434	C	10	..........	((((((&(%$
+phiX174	1435	C	10	..........	$$$$$$$$$$
+phiX174	1436	G	10	..........	##########
+phiX174	1437	A	10	..........	"""""""""!
+phiX174	1438	G	10	..........	"""""####!
+phiX174	1439	G	10	..........	"""""""""!
+phiX174	1440	C	10	..........	"""""""""!
+phiX174	1441	T	10	..........	""""""""#!
+phiX174	1442	A	10	..........	$$$%%%&&%!
+phiX174	1443	A	10	.-1C.-1C..-1C......	"""""""""!
+phiX174	1444	C	10	**.*......	&%"!"""""!
+phiX174	1445	C	10	..........	&%&!%%%&%!
+phiX174	1446	C	10	..........	"""!"""""!
+phiX174	1447	T	10	.$..$.......	#"#!"""""!
+phiX174	1448	A	8	.$..$.....	#!#%%$$!
+phiX174	1449	A	6	.$.$....	!""""!
+phiX174	1450	T	4	.$...	"""!
+phiX174	1451	G	3	.$..	#"!
+phiX174	1452	A	2	.$.	"!
+phiX174	1453	G	1	.$	!

test-data/samtools/mpileup/samtools_mpileup_out_2.bcf

Binary file added.

tool_conf.xml.sample

     <tool file="samtools/sam_to_bam.xml" />
     <tool file="samtools/bam_to_sam.xml" />
     <tool file="samtools/sam_merge.xml" />
+    <tool file="samtools/samtools_mpileup.xml" />
     <tool file="samtools/sam_pileup.xml" />
     <tool file="samtools/pileup_parser.xml" />
     <tool file="samtools/pileup_interval.xml" />

tool_data_table_conf.xml.sample

         <columns>value, name, path</columns>
         <file path="tool-data/perm_color_index.loc" />
     </table>
+    <!-- Location of SAMTools indexes and other files -->
+    <table name="sam_fa_indexes" comment_char="#">
+        <columns>line_type, value, path</columns>
+        <file path="tool-data/sam_fa_indices.loc" />
+    </table>
     <!-- Location of Picard dict file and other files -->
     <table name="picard_indexes" comment_char="#">
         <columns>value, dbkey, name, path</columns>

tools/samtools/samtools_mpileup.xml

+<tool id="samtools_mpileup" name="MPileup" version="0.0.1">
+  <description>SNP and indel caller</description>
+  <requirements>
+      <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">samtools_wrapper.py
+    -p 'samtools mpileup'
+    --stdout "${output_log}"
+    #if $reference_source.reference_source_selector != "history":
+        -p '-f "${reference_source.ref_file.fields.path}"'
+    #else:
+        -d "-f" "${reference_source.ref_file}" "fa" "reference_input"
+    #end if
+    #for $i, $input_bam in enumerate( $reference_source.input_bams ):
+        -d " " "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "bam_input_${i}"
+        -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "bam_input_${i}" ##hardcode galaxy ext type as bam_index
+    #end for
+    -p '
+    #if str( $advanced_options.advanced_options_selector ) == "advanced":
+        ${advanced_options.skip_anomalous_read_pairs}
+        ${advanced_options.disable_probabilistic_realignment}
+        -C "${advanced_options.coefficient_for_downgrading}"
+        -d "${advanced_options.max_reads_per_bam}"
+        ${advanced_options.extended_BAQ_computation}
+        #if str( $advanced_options.position_list ) != 'None':
+          -l "${advanced_options.position_list}"
+        #end if
+        -q "${advanced_options.minimum_mapping_quality}"
+        -Q "${advanced_options.minimum_base_quality}"
+        #if str( $advanced_options.region_string ):
+            -r "${advanced_options.region_string}"
+        #end if
+        ${advanced_options.output_per_sample_read_depth}
+        ${advanced_options.output_per_sample_strand_bias_p_value}
+    #end if
+    #if str( $genotype_likelihood_computation_type.genotype_likelihood_computation_type_selector ) == 'perform_genotype_likelihood_computation':
+        ##-g or -u
+        -g
+        -e "${genotype_likelihood_computation_type.gap_extension_sequencing_error_probability}"
+        -h "${genotype_likelihood_computation_type.coefficient_for_modeling_homopolymer_errors}"
+        #if str( $genotype_likelihood_computation_type.perform_indel_calling.perform_indel_calling_selector ) == 'perform_indel_calling':
+            -L "${genotype_likelihood_computation_type.perform_indel_calling.skip_indel_calling_above_sample_depth}"
+        #else:
+            -I
+        #end if
+        -o "${genotype_likelihood_computation_type.gap_open_sequencing_error_probability}"
+        #if len( $genotype_likelihood_computation_type.platform_list_repeat ):
+            -P "${ ",".join( [ str( platform.platform_entry ) for platform in $genotype_likelihood_computation_type.platform_list_repeat ] ) }"
+        #end if
+    #end if
+    &gt; "${output_mpileup}"
+    '
+  </command>
+  <inputs>
+    <conditional name="reference_source">
+      <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
+        <option value="cached">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="cached">
+        <repeat name="input_bams" title="BAM file" min="1">
+            <param name="input_bam" type="data" format="bam" label="BAM file">
+              <validator type="unspecified_build" />
+              <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /> <!-- fixme!!! this needs to be a select -->
+            </param>
+        </repeat>
+        <param name="ref_file" type="select" label="Using reference genome">
+          <options from_data_table="sam_fa_indexes">
+            <!-- <filter type="data_meta" key="dbkey" ref="input_bam" column="value"/> does not yet work in a repeat...--> 
+          </options>
+        </param>
+      </when>
+      <when value="history"> <!-- FIX ME!!!! -->
+        <repeat name="input_bams" title="BAM file" min="1">
+            <param name="input_bam" type="data" format="bam" label="BAM file" >
+              <validator type="metadata" check="bam_index" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue."/>
+            </param>
+        </repeat>
+        <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+      </when>
+    </conditional>
+
+    
+    <conditional name="genotype_likelihood_computation_type">
+      <param name="genotype_likelihood_computation_type_selector" type="select" label="Genotype Likelihood Computation">
+        <option value="perform_genotype_likelihood_computation">Perform genotype likelihood computation</option>
+        <option value="do_not_perform_genotype_likelihood_computation" selected="True">Do not perform genotype likelihood computation</option>
+      </param>
+      <when value="perform_genotype_likelihood_computation">
+          <param name="gap_extension_sequencing_error_probability" type="integer" value="20" label="Phred-scaled gap extension sequencing error probability" />
+          <param name="coefficient_for_modeling_homopolymer_errors" type="integer" value="100" label="Coefficient for modeling homopolymer errors." />
+          <conditional name="perform_indel_calling">
+            <param name="perform_indel_calling_selector" type="select" label="Perform INDEL calling">
+              <option value="perform_indel_calling" selected="True">Perform INDEL calling</option>
+              <option value="do_not_perform_indel_calling">Do not perform INDEL calling</option>
+            </param>
+            <when value="perform_indel_calling">
+              <param name="skip_indel_calling_above_sample_depth" type="integer" value="250" label="Skip INDEL calling if the average per-sample depth is above" />
+            </when>
+            <when value="do_no_perform_indel_calling" />
+          </conditional>
+          <param name="gap_open_sequencing_error_probability" type="integer" value="40" label="Phred-scaled gap open sequencing error probability" />
+          <repeat name="platform_list_repeat" title="Platform for INDEL candidates">
+            <param name="platform_entry" type="text" value="" label="Platform to use for INDEL candidates" />
+          </repeat>
+      </when>
+      <when value="do_not_perform_genotype_likelihood_computation">
+          <!-- Do nothing here -->
+      </when>
+    </conditional>
+    <conditional name="advanced_options">
+      <param name="advanced_options_selector" type="select" label="Set advanced options">
+        <option value="basic" selected="True">Basic</option>
+        <option value="advanced">Advanced</option>
+      </param>
+      <when value="advanced">
+        <param name="skip_anomalous_read_pairs" type="boolean" truevalue="-A" falsevalue="" checked="False" label="Do not skip anomalous read pairs in variant calling" />
+        <param name="disable_probabilistic_realignment" type="boolean" truevalue="-B" falsevalue="" checked="False" label="	Disable probabilistic realignment for the computation of base alignment quality (BAQ)" />
+        <param name="coefficient_for_downgrading" type="integer" value="0" label="Coefficient for downgrading mapping quality for reads containing excessive mismatches" />
+        <param name="max_reads_per_bam" type="integer" value="250" label="Max reads per BAM" />
+        <param name="extended_BAQ_computation" type="boolean" truevalue="-E" falsevalue="" checked="False" label="Extended BAQ computation" />
+        <param name="position_list" type="data" format="bed" label="List of regions or sites on which to operate" optional="True" />
+        <param name="minimum_mapping_quality" type="integer" value="0" label="Minimum mapping quality for an alignment to be used" />
+        <param name="minimum_base_quality" type="integer" value="13" label="Minimum base quality for a base to be considered" />
+        <param name="region_string" type="text" value="" label="Only generate pileup in region" />
+        <param name="output_per_sample_read_depth" type="boolean" truevalue="-D" falsevalue="" checked="False" label="Output per-sample read depth" />
+        <param name="output_per_sample_strand_bias_p_value" type="boolean" truevalue="-S" falsevalue="" checked="False" label="Output per-sample Phred-scaled strand bias P-value" />
+      </when>
+      <when value="basic" />
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="pileup" name="output_mpileup" label="${tool.name} on ${on_string}">
+      <change_format>
+        <when input="genotype_likelihood_computation_type.genotype_likelihood_computation_type_selector" value="perform_genotype_likelihood_computation" format="bcf" />
+      </change_format>
+    </data>
+    <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
+  </outputs>
+  <tests>
+      <test>
+          <param name="reference_source_selector" value="history" />
+          <param name="ref_file" value="phiX.fasta" ftype="fasta" />
+          <param name="input_bam" value="gatk/gatk_table_recalibration/gatk_table_recalibration_out_1.bam" ftype="bam" />
+          <param name="genotype_likelihood_computation_type_selector" value="do_not_perform_genotype_likelihood_computation" />
+          <param name="advanced_options_selector" value="basic" />
+          <output name="output_mpileup" file="samtools/mpileup/samtools_mpileup_out_1.pileup" /> 
+          <output name="output_log" file="samtools/mpileup/samtools_mpileup_out_1.log" />
+      </test>
+      <test>
+          <param name="reference_source_selector" value="history" />
+          <param name="ref_file" value="phiX.fasta" ftype="fasta" />
+          <param name="input_bam" value="gatk/gatk_table_recalibration/gatk_table_recalibration_out_1.bam" ftype="bam" />
+          <param name="genotype_likelihood_computation_type_selector" value="perform_genotype_likelihood_computation" />
+          <param name="gap_extension_sequencing_error_probability" value="20" />
+          <param name="coefficient_for_modeling_homopolymer_errors" value="100" />
+          <param name="perform_indel_calling_selector" value="perform_indel_calling" />
+          <param name="skip_indel_calling_above_sample_depth" value="250" />
+          <param name="gap_open_sequencing_error_probability" value="40" />
+          <param name="platform_list_repeat" value="0" />
+          <param name="advanced_options_selector" value="basic" />
+          <output name="output_mpileup" file="samtools/mpileup/samtools_mpileup_out_2.bcf" /> 
+          <output name="output_log" file="samtools/mpileup/samtools_mpileup_out_1.log" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+ Generate BCF or pileup for one or multiple BAM files. Alignment records are grouped by sample identifiers in @RG header lines. If sample identifiers are absent, each input file is regarded as one sample. 
+
+------
+
+**Settings**::
+
+ Input Options:
+ -6 	Assume the quality is in the Illumina 1.3+ encoding.
+ -A Do not skip anomalous read pairs in variant calling.
+ -B 	Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being misaligned. Applying this option greatly helps to reduce false SNPs caused by misalignments.
+ -b FILE 	List of input BAM files, one file per line [null]
+ -C INT 	Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0]
+ -d INT 	At a position, read maximally INT reads per input BAM. [250]
+ -E 	Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt specificity a little bit.
+ -f FILE 	The faidx-indexed reference file in the FASTA format. The file can be optionally compressed by razip. [null]
+ -l FILE 	BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null]
+ -q INT 	Minimum mapping quality for an alignment to be used [0]
+ -Q INT 	Minimum base quality for a base to be considered [13]
+ -r STR 	Only generate pileup in region STR [all sites]
+ Output Options:
+ 	
+ -D 	Output per-sample read depth
+ -g 	Compute genotype likelihoods and output them in the binary call format (BCF).
+ -S 	Output per-sample Phred-scaled strand bias P-value
+ -u 	Similar to -g except that the output is uncompressed BCF, which is preferred for piping.
+ 
+ Options for Genotype Likelihood Computation (for -g or -u):
+  	
+ -e INT 	Phred-scaled gap extension sequencing error probability. Reducing INT leads to longer indels. [20]
+ -h INT 	Coefficient for modeling homopolymer errors. Given an l-long homopolymer run, the sequencing error of an indel of size s is modeled as INT*s/l. [100]
+ -I 	Do not perform INDEL calling
+ -L INT 	Skip INDEL calling if the average per-sample depth is above INT. [250]
+ -o INT 	Phred-scaled gap open sequencing error probability. Reducing INT leads to more indel calls. [40]
+ -P STR 	Comma dilimited list of platforms (determined by @RG-PL) from which indel candidates are obtained. It is recommended to collect indel candidates from sequencing technologies that have low indel error rate such as ILLUMINA. [all]
+
+------
+
+**Citation**
+
+For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505943&gt;`_
+
+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
+
+  </help>
+</tool>

tools/samtools/samtools_wrapper.py

+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A wrapper script for running SAMTools commands.
+"""
+
+import sys, optparse, os, tempfile, subprocess, shutil
+from string import Template
+
+GALAXY_EXT_TO_SAMTOOLS_EXT = { 'bam_index':'bam.bai', } #items not listed here will use the galaxy extension as-is
+GALAXY_EXT_TO_SAMTOOLS_FILE_TYPE = GALAXY_EXT_TO_SAMTOOLS_EXT #for now, these are the same, but could be different if needed
+DEFAULT_SAMTOOLS_PREFIX = "SAMTools_file"
+CHUNK_SIZE = 2**20 #1mb
+
+
+def cleanup_before_exit( tmp_dir ):
+    if tmp_dir and os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+
+def SAMTOOLS_filename_from_galaxy( galaxy_filename, galaxy_ext, target_dir = None, prefix = None ):
+    suffix = GALAXY_EXT_TO_SAMTOOLS_EXT.get( galaxy_ext, galaxy_ext )
+    if prefix is None:
+        prefix = DEFAULT_SAMTOOLS_PREFIX
+    if target_dir is None:
+        target_dir = os.getcwd()
+    SAMTools_filename = os.path.join( target_dir, "%s.%s" % ( prefix, suffix ) )
+    os.symlink( galaxy_filename, SAMTools_filename )
+    return SAMTools_filename
+
+def SAMTOOLS_filetype_argument_substitution( argument, galaxy_ext ):
+    return argument % dict( file_type = GALAXY_EXT_TO_SAMTOOLS_FILE_TYPE.get( galaxy_ext, galaxy_ext ) )
+
+def open_file_from_option( filename, mode = 'rb' ):
+    if filename:
+        return open( filename, mode = mode )
+    return None
+
+def html_report_from_directory( html_out, dir ):
+    html_out.write( '<html>\n<head>\n<title>Galaxy - SAMTOOLS Output</title>\n</head>\n<body>\n<p/>\n<ul>\n' )
+    for fname in sorted( os.listdir( dir ) ):
+        html_out.write(  '<li><a href="%s">%s</a></li>\n' % ( fname, fname ) )
+    html_out.write( '</ul>\n</body>\n</html>\n' )
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to SAMTOOLS, without any modification.' )
+    parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' )
+    parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' )
+    parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
+    parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"')
+    (options, args) = parser.parse_args()
+    
+    tmp_dir = tempfile.mkdtemp( prefix='tmp-SAMTOOLS-' )
+    
+    #set up stdout and stderr output options
+    stdout = open_file_from_option( options.stdout, mode = 'wb' )
+    stderr = open_file_from_option( options.stderr, mode = 'wb' )
+    #if no stderr file is specified, we'll use our own
+    if stderr is None:
+        stderr = tempfile.NamedTemporaryFile( prefix="SAMTOOLS-stderr-", dir=tmp_dir )
+    
+    if options.pass_through_options:
+        cmd = ' '.join( options.pass_through_options )
+    else:
+        cmd = ''
+    return_code = None
+    if options.datasets:
+        for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets:
+            SAMTools_filename = SAMTOOLS_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix )
+            if dataset_arg:
+                if '>' in cmd:
+                    cmd = cmd.replace( '>', '  %s "%s" >' % ( SAMTOOLS_filetype_argument_substitution( dataset_arg, galaxy_ext ), SAMTools_filename ), 1 )
+                else:
+                    cmd = '%s %s "%s"' % ( cmd, SAMTOOLS_filetype_argument_substitution( dataset_arg, galaxy_ext ), SAMTools_filename )
+            #auto index fasta files:
+            if galaxy_ext == 'fa':
+                index_cmd = 'samtools faidx %s' % ( SAMTools_filename )
+                proc = subprocess.Popen( args=index_cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
+                return_code = proc.wait()
+                if return_code:
+                    break
+    if return_code is None or not return_code:
+        proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
+        return_code = proc.wait()
+    if return_code:
+        stderr_target = sys.stderr
+    else:
+        if stdout:
+            stderr_target = stdout
+        else:
+            stderr_target = sys.stdout
+    stderr.flush()
+    stderr.seek(0)
+    while True:
+        chunk = stderr.read( CHUNK_SIZE )
+        if chunk:
+            stderr_target.write( chunk )
+        else:
+            break
+    stderr.close()
+    #generate html reports
+    if options.html_report_from_directory:
+        for ( html_filename, html_dir ) in options.html_report_from_directory:
+            html_report_from_directory( open( html_filename, 'wb' ), html_dir )
+    
+    cleanup_before_exit( tmp_dir )
+
+if __name__=="__main__": __main__()