boliu / galaxy-globus-cloudman

Commits

boliu  committed bfcb9f3 Draft

add FreeBayes, SNAP, modify vcfPrinter, BWA, GATK, tool_data_table_conf.xml, sam_fa_indices.loc

  • Parent commits fec1903
  • Branches default


Files changed (21)

File tool-data/novoalign_indices.loc

+hg19	hg19	hg19	/mnt/galaxyData/hg19.novoindex
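
The .loc file is plain tab-separated text with the four columns declared by the novoalign_indices data table added later in this commit (value, dbkey, name, path). A second entry, hypothetical and shown only to illustrate the layout, would look like:

    #value	dbkey	name	path
    mm9	mm9	mm9	/mnt/galaxyData/mm9.novoindex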

File tool_conf.xml

     <tool file="proteonics/validator.xml"/>
   </section>
 
+  <section name="SNAP" id="SNAP">
+    <tool file="snap/snap-index.xml"/>
+    <tool file="snap/snap-alignment.xml"/>
+  </section>
+
+  <section name="FreeBayes" id="FreeBayes">
+    <tool file="freebayes/freebayes.xml"/>
+  </section>
+
   <section name="NGS: RNA Analysis" id="ngs-rna-tools">
-    
     <label text="RNA-seq" id="rna_seq" />
     <tool file="ngs_rna/tophat_wrapper.xml" />
     <tool file="ngs_rna/tophat_wrapper_condor.xml" />
     <tool file="ngs_rna/tophat_color_wrapper.xml" />
     <tool file="ngs_rna/cufflinks_wrapper.xml" />
     <tool file="ngs_rna/cuffcompare_wrapper.xml" />
- 	<tool file="ngs_rna/cuffmerge_wrapper.xml" />
+    <tool file="ngs_rna/cuffmerge_wrapper.xml" />
     <tool file="ngs_rna/cuffdiff_wrapper.xml" />
     <!-- Trinity is very memory-intensive and should only be enabled/run
     on instances with sufficient resources. 

File tool_data_table_conf.xml

         <file path="/mnt/galaxyIndices/galaxy/tool-data/liftOver.loc" />
     </table>
 
+    <!-- Locations of novoalign indexes -->
+    <table name="novoalign_indices" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/novoalign_indices.loc" />
+    </table>
 </tables>
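
With this table in place, a tool definition can populate a reference select list straight from the .loc file via a from_data_table options block. A minimal sketch (parameter name and label are illustrative, not taken from this commit), following the same pattern the FreeBayes wrapper below uses for its sam_fa_indexes table:

    <param name="reference" type="select" label="Select a Novoalign index">
      <options from_data_table="novoalign_indices"/>
      <validator type="no_options" message="No Novoalign indexes are available"/>
    </param>

The selected entry's path column is then reachable in the command template as ${reference.fields.path}.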

File tools/atlas2/vcfPrinter-bak(one input).py

+#!/usr/bin/env python
+import sys, os, commands, string, subprocess, shutil
+
+galaxyhome=os.environ.get('GALAXY_HOME')
+
+input_VCF = sys.argv[1]
+input_BAM = sys.argv[2]
+input_FASTA = " -r " + sys.argv[3]
+
+if sys.argv[4]!="none":
+    n = " " + sys.argv[4]
+else:
+    n = ""
+
+if sys.argv[5]!="none":
+    p = " " + sys.argv[5]
+else:
+    p = ""
+
+outputVCF = sys.argv[6]
+
+output_FOLDER = outputVCF.split('.')[0]+'_files'
+
+if not os.path.exists(output_FOLDER):
+    os.makedirs(output_FOLDER)
+
+#vcfPrinter.rb requires that the input VCF and BAM files share the same base name, with .vcf/.bam extensions.
+# Copy the input VCF file to output_FOLDER/file.vcf
+new_VCF = os.path.join( output_FOLDER, "file.vcf" )
+shutil.copyfile( input_VCF, new_VCF )
+# Copy the input BAM file to output_FOLDER/file.bam
+new_BAM = os.path.join( output_FOLDER, "file.bam" )
+shutil.copyfile( input_BAM, new_BAM )
+
+parameter = " -i " + new_VCF +  " -b " + new_BAM + input_FASTA + n + p + " -o " + outputVCF
+#print parameter
+
+
+##uncomment when running on VM
+os.chdir(galaxyhome + "/tools/atlas2/")     #uncomment when running on cluster
+#os.chdir("/media/Work/galaxy-proteonics/tools/atlas2/")    #for local running
+
+command="ruby ./vcfPrinter/vcfPrinter.rb " + parameter
+
+print command
+
+proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
+# wait for vcfPrinter.rb to finish and surface any errors to Galaxy
+stderr_output = proc.communicate()[1]
+if proc.returncode:
+    sys.stderr.write( stderr_output )
+
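
Together with the tool XML that follows, this wrapper expects six positional arguments: the input VCF, the input BAM, the reference FASTA, either -n or the literal string none, either -p or none, and the output VCF path. A hypothetical manual invocation (file names are placeholders; GALAXY_HOME must be set because the script changes into $GALAXY_HOME/tools/atlas2/):

    export GALAXY_HOME=/path/to/galaxy
    python "vcfPrinter-bak(one input).py" sample.vcf sample.bam hg19.fa none none multi_sample.vcf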

File tools/atlas2/vcfPrinter-bak(one input).xml

+<?xml version="1.0"?>
+
+<tool name="vcfPrinter" id="vcfPrinter_id">
+  <description>
+  </description>
+
+  <command interpreter="python">vcfPrinter.py
+   $inputVCF
+   $inputBAM
+   $inputFASTA
+   
+   #if $n:
+	-n 
+   #else:
+	none 
+   #end if
+
+   #if $p:
+	-p
+   #else:
+	none 
+   #end if
+
+   $outputVCF
+
+  </command>
+
+  <inputs>
+
+    <param name="inputVCF" type="data" format="vcf" label="-i, Select a input VCF file" />
+    <param name="inputBAM" type="data" format="bam" label="-b, Select a input BAM file" help="The BAM files need to be indexed."/>
+    <param name="inputFASTA" type="data" format="fasta" label="-r, Select a reference fasta file" />
+
+    <param name="n" type="boolean" checked="false" label="-n" help="if checked, will not run pileup, hence will only merge data from single sample VCF. Samples without at a site will not have read level information gathered from samtools pileup making it significantly faster."/>
+    <param name="p" type="boolean" checked="false" label="-p" help="if checked, will not merge non-PASS variants, based on the PASS tag in filter field"/>
+
+  </inputs>
+
+  <outputs>
+    <data name="outputVCF" format="vcf" label="Output of vcfPrinter.vcf" />
+  </outputs>
+
+  <help>
+vcfPrinter_ is used to generate a multi-sample Variant Call Format (VCF) file from single-sample VCF files.
+
+.. _vcfPrinter: http://www.hgsc.bcm.tmc.edu/cascade-tech-software_atlas2_snp_indel_calling_pipeline-ti.hgsc
+
+**Usage**::
+
+  ruby vcfPrinter.rb -i "/home/user/data/file.vcf" -o /home/user/data/outfile.vcf -b "/home/user/data/file.bam" -r /home/user/ref/human_b36_male.fa
+
+
+  </help>
+
+</tool>

File tools/atlas2/vcfPrinter.py

-#!/usr/bin/env python
-
-import sys, os, commands, string, subprocess
-
-galaxyhome=os.environ.get('GALAXY_HOME')
-
-'''
-n = len(sys.argv)
-for i in range(0,n):
-    print sys.argv[i]
-'''
-'''
-SOLiD-SNP-caller [in.bam] [ref.fa] [.bed region] > [output.vcf]
-'''
-
-input_BAM = " " + sys.argv[1]
-input_FASTA = " " + sys.argv[2]
-
-if sys.argv[3]!="none":
-    input_BED = " " + sys.argv[3]
-else:
-    input_BED = ""    
-
-output = " > " + sys.argv[4]
-
-
-parameter = input_BAM + input_FASTA + input_BED + output
-#print parameter
-
-
-##uncomment when running on VM
-
-os.chdir(galaxyhome + "/tools/atlas2/")     #uncomment when running on cluster
-#os.chdir("/home/liubo/")    #for local running
-
-command="./SOLiD-SNP-caller/SOLiD-SNP-caller " + parameter
-
-#command = "SOLiD-SNP-caller [in.bam] [ref.fa] [.bed region] > [output.vcf]"
-
-
-print command
-
-subprocess.call(command,shell=True)
-
-

File tools/atlas2/vcfPrinter.xml

-<?xml version="1.0"?>
-
-<tool name="vcfPrinter" id="vcfPrinter_id">
-  <description>
-  </description>
-
-  <command>ruby ${GALAXY_DATA_INDEX_DIR}/../tools/atlas2/vcfPrinter/vcfPrinter.rb   
-   -i "$inputVCF"
-   -b "$inputBAM"
-   -r $inputFASTA
-   
-   #if $n == "true"
-	-n 
-   #end if
-
-   #if $p == "true"
-	-p 
-   #end if
-
-   -o $outputVCF
-
-  </command>
-
-  <inputs>
-
-    <param name="inputVCF" type="data" format="vcf" label="-i, Select a input VCF file" />
-    <param name="inputBAM" type="data" format="bam" label="-b, Select a input BAM file" help="The BAM files need to be indexed."/>
-    <param name="inputFASTA" type="data" format="fasta" label="-r, Select a reference fasta file" />
-
-    <param name="n" type="boolean" checked="false" label="-n" help="if checked, will not run pileup, hence will only merge data from single sample VCF. Samples without at a site will not have read level information gathered from samtools pileup making it significantly faster."/>
-    <param name="p" type="boolean" checked="false" label="-p" help="if checked, will not merge non-PASS variants, based on the PASS tag in filter field"/>
-
-  </inputs>
-
-  <outputs>
-    <data name="outputVCF" format="vcf" label="Output of vcfPrinter.vcf" />
-  </outputs>
-
-  <help>
-vcfPrinter_ is used to generate multi-sample Variant Call Format (VCF) file from single sample VCF files.
- .. _vcfPrinter: http://www.hgsc.bcm.tmc.edu/cascade-tech-software_atlas2_snp_indel_calling_pipeline-ti.hgsc
-
-**Usage:**
-
-ruby vcfPrinter.rb -i "/home/user/data/*.vcf" -o /home/user/data/outfile.vcf -b "/home/user/data/*.bam" -r /home/user/ref/human_b36_male.fa
-
-
-  </help>
-
-</tool>

File tools/freebayes/freebayes.xml

+<?xml version="1.0"?>
+<tool id="freebayes" name="FreeBayes" version="0.0.3">
+  <requirements>
+    <requirement type="package" version="0.9.6_9608597d12e127c847ae03aa03440ab63992fedf">freebayes</requirement>
+    <requirement type="package" version="0.1.18">samtools</requirement>
+  </requirements>
+  <description> - Bayesian genetic variant detector</description>
+  <command>
+    ##set up input files
+    #set $reference_fasta_filename = "localref.fa"
+    #if str( $reference_source.reference_source_selector ) == "history":
+        ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &amp;&amp;
+        samtools faidx "${reference_fasta_filename}" 2&gt;&amp;1 || echo "Error running samtools faidx for FreeBayes" &gt;&amp;2 &amp;&amp;
+    #else:
+        #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+    #end if
+    #for $bam_count, $input_bam in enumerate( $reference_source.input_bams ):
+        ln -s "${input_bam.input_bam}" "localbam_${bam_count}.bam" &amp;&amp;
+        ln -s "${input_bam.input_bam.metadata.bam_index}" "localbam_${bam_count}.bam.bai" &amp;&amp;
+    #end for
+    ##finished setting up inputs
+    
+    ##start FreeBayes commandline
+    freebayes
+    #for $bam_count, $input_bam in enumerate( $reference_source.input_bams ):
+        --bam "localbam_${bam_count}.bam"
+    #end for
+    --fasta-reference "${reference_fasta_filename}" 
+    
+    ##outputs
+    --vcf "${output_vcf}"
+    
+    ##advanced options
+    #if str( $options_type.options_type_selector ) == "advanced":
+        ##additional outputs
+        #if $options_type.output_trace_option:
+            --trace "${output_trace}"
+        #end if
+        #if $options_type.output_failed_alleles_option:
+            --failed-alleles "${output_failed_alleles_bed}"
+        #end if
+        
+        ##additional inputs
+        #if str( $options_type.target_limit_type.target_limit_type_selector ) == "limit_by_target_file":
+            --targets "${options_type.target_limit_type.input_target_bed}"
+        #elif str( $options_type.target_limit_type.target_limit_type_selector ) == "limit_by_region":
+            --region "${options_type.target_limit_type.region_chromosome}:${options_type.target_limit_type.region_start}..${options_type.target_limit_type.region_end}"
+        #end if
+        #if $options_type.input_sample_file:
+            --samples "${options_type.input_sample_file}"
+        #end if
+        #if $options_type.input_populations_file:
+            --populations "${options_type.input_populations_file}"
+        #end if
+        #if $options_type.input_cnv_map_bed:
+            --cnv-map "${options_type.input_cnv_map_bed}"
+        #end if
+        #if str( $options_type.input_variant_type.input_variant_type_selector ) == "provide_vcf":
+            --variant-input "${options_type.input_variant_type.input_variant_vcf}"
+            ${options_type.input_variant_type.only_use_input_alleles}
+        #end if
+        #if $options_type.haplotype_basis_alleles:
+            --haplotype-basis-alleles "${options_type.haplotype_basis_alleles}"
+        #end if
+        
+        
+        ##reporting
+        #if str( $options_type.section_reporting_type.section_reporting_type_selector ) == "set":
+            --pvar "${options_type.section_reporting_type.pvar}"
+            ${options_type.section_reporting_type.show_reference_repeats}
+        #end if
+        
+        ##population model
+        #if str( $options_type.section_population_model_type.section_population_model_type_selector ) == "set":
+            --theta "${options_type.section_population_model_type.theta}"
+            --ploidy "${options_type.section_population_model_type.ploidy}"
+            ${options_type.section_population_model_type.pooled}
+        #end if
+        
+        ##reference allele
+        #if str( $options_type.use_reference_allele_type.use_reference_allele_type_selector ) == "include_reference_allele":
+            --use-reference-allele
+            ${options_type.use_reference_allele_type.diploid_reference}
+            --reference-quality "${options_type.use_reference_allele_type.reference_quality_mq},${options_type.use_reference_allele_type.reference_quality_bq}"
+        #end if
+        
+        ##allele scope
+        #if str( $options_type.section_allele_scope_type.section_allele_scope_type_selector ) == "set":
+            ${options_type.section_allele_scope_type.no_snps}
+            ${options_type.section_allele_scope_type.no_indels}
+            ${options_type.section_allele_scope_type.no_mnps}
+            ${options_type.section_allele_scope_type.no_complex}
+            --use-best-n-alleles "${options_type.section_allele_scope_type.use_best_n_alleles}"
+            #if $options_type.section_allele_scope_type.max_complex_gap:
+                --max-complex-gap "${options_type.section_allele_scope_type.max_complex_gap}"
+            #end if
+        #end if
+        
+        ##indel realignment
+        ${options_type.left_align_indels}
+        
+        ##input filters
+        #if str( $options_type.section_input_filters_type.section_input_filters_type_selector ) == "set":
+            ${options_type.section_input_filters_type.use_duplicate_reads}
+            #if str( $options_type.section_input_filters_type.quality_filter_type.quality_filter_type_selector ) == "apply_filters":
+                --min-mapping-quality "${options_type.section_input_filters_type.quality_filter_type.min_mapping_quality}"
+                --min-base-quality "${options_type.section_input_filters_type.quality_filter_type.min_base_quality}"
+                --min-supporting-quality "${options_type.section_input_filters_type.quality_filter_type.min_supporting_quality_mq},${options_type.section_input_filters_type.quality_filter_type.min_supporting_quality_bq}"
+            #elif str( $options_type.section_input_filters_type.quality_filter_type.quality_filter_type_selector ) == "standard_filters":
+                --standard-filters
+            #end if
+            --mismatch-base-quality-threshold "${options_type.section_input_filters_type.mismatch_base_quality_threshold}"
+            #if $options_type.section_input_filters_type.read_mismatch_limit:
+                --read-mismatch-limit "${options_type.section_input_filters_type.read_mismatch_limit}"
+            #end if
+            --read-max-mismatch-fraction "${options_type.section_input_filters_type.read_max_mismatch_fraction}"
+            #if $options_type.section_input_filters_type.read_snp_limit:
+                --read-snp-limit "${options_type.section_input_filters_type.read_snp_limit}"
+            #end if
+            #if $options_type.section_input_filters_type.read_indel_limit:
+                --read-indel-limit "${options_type.section_input_filters_type.read_indel_limit}"
+            #end if
+            --indel-exclusion-window "${options_type.section_input_filters_type.indel_exclusion_window}"
+            --min-alternate-fraction "${options_type.section_input_filters_type.min_alternate_fraction}"
+            --min-alternate-count "${options_type.section_input_filters_type.min_alternate_count}"
+            --min-alternate-qsum "${options_type.section_input_filters_type.min_alternate_qsum}"
+            --min-alternate-total "${options_type.section_input_filters_type.min_alternate_total}"
+            --min-coverage "${options_type.section_input_filters_type.min_coverage}"
+        #end if
+        
+        ##bayesian priors
+        #if str( $options_type.section_bayesian_priors_type.section_bayesian_priors_type_selector ) == "set":
+            ${options_type.section_bayesian_priors_type.no_ewens_priors}
+            ${options_type.section_bayesian_priors_type.no_population_priors}
+            ${options_type.section_bayesian_priors_type.hwe_priors}
+        #end if
+        
+        ##observation prior expectations
+        #if str( $options_type.section_observation_prior_expectations_type.section_observation_prior_expectations_type_selector ) == "set":
+            ${options_type.section_observation_prior_expectations_type.binomial_obs_priors}
+            ${options_type.section_observation_prior_expectations_type.allele_balance_priors}
+        #end if
+        
+        ##algorithmic features
+        #if str( $options_type.section_algorithmic_features_type.section_algorithmic_features_type_selector ) == "set":
+            --site-selection-max-iterations "${options_type.section_algorithmic_features_type.site_selection_max_iterations}"
+            --genotyping-max-iterations "${options_type.section_algorithmic_features_type.genotyping_max_iterations}"
+            --genotyping-max-banddepth "${options_type.section_algorithmic_features_type.genotyping_max_banddepth}"
+            --posterior-integration-limits "${options_type.section_algorithmic_features_type.posterior_integration_limits_n},${options_type.section_algorithmic_features_type.posterior_integration_limits_m}"
+            ${options_type.section_algorithmic_features_type.no_permute}
+            ${options_type.section_algorithmic_features_type.exclude_unobserved_genotypes}
+            #if $options_type.section_algorithmic_features_type.genotype_variant_threshold:
+                --genotype-variant-threshold "${options_type.section_algorithmic_features_type.genotype_variant_threshold}"
+            #end if
+            ${options_type.section_algorithmic_features_type.use_mapping_quality}
+            --read-dependence-factor "${options_type.section_algorithmic_features_type.read_dependence_factor}"
+            ${options_type.section_algorithmic_features_type.no_marginals}
+        #end if
+        
+    #end if
+  </command>
+  <inputs>
+    <conditional name="reference_source">
+      <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
+        <option value="cached">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="cached">
+        <repeat name="input_bams" title="Sample BAM file" min="1">
+            <param name="input_bam" type="data" format="bam" label="BAM file">
+              <validator type="unspecified_build" />
+              <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." />
+            </param>
+        </repeat>
+        <param name="ref_file" type="select" label="Using reference genome">
+          <options from_data_table="sam_fa_indexes">
+            <!-- <filter type="sam_fa_indexes" key="dbkey" ref="input_bam" column="value"/> does not yet work in a repeat...--> 
+          </options>
+          <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+        </param>
+      </when>
+      <when value="history"> <!-- FIX ME!!!! -->
+        <repeat name="input_bams" title="Sample BAM file" min="1">
+            <param name="input_bam" type="data" format="bam" label="BAM file" />
+        </repeat>
+        <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+      </when>
+    </conditional>
+    
+    <conditional name="options_type">
+      <param name="options_type_selector" type="select" label="Basic or Advanced options">
+        <option value="basic" selected="True">Basic</option>
+        <option value="advanced">Advanced</option>
+      </param>
+      <when value="basic">
+        <!-- Do nothing here -->
+      </when>
+      <when value="advanced">
+        
+        <!-- output -->
+        <param name="output_failed_alleles_option" type="boolean" truevalue="--failed-alleles" falsevalue="" checked="False" label="Write out failed alleles file" />
+        <param name="output_trace_option" type="boolean" truevalue="--trace" falsevalue="" checked="False" label="Write out algorithm trace file" />
+        
+        
+        <!-- input -->
+        <conditional name="target_limit_type">
+          <param name="target_limit_type_selector" type="select" label="Limit analysis to listed targets">
+            <option value="do_not_limit" selected="True">Do not limit</option>
+            <option value="limit_by_target_file">Limit by target file</option>
+            <option value="limit_by_region">Limit to region</option>
+          </param>
+          <when value="do_not_limit">
+            <!-- Do nothing here -->
+          </when>
+          <when value="limit_by_target_file">
+            <param name="input_target_bed" type="data" format="bed" label="Limit analysis to targets listed in the BED-format FILE." />
+          </when>
+          <when value="limit_by_region">
+            <param name="region_chromosome" type="text" label="Region Chromosome" value="" /> <!--only once? -->
+            <param name="region_start" type="integer" label="Region Start" value="" />
+            <param name="region_end" type="integer" label="Region End" value="" />
+          </when>
+        </conditional>
+        <param name="input_sample_file" type="data" format="txt" label="Limit analysis to samples listed (one per line) in the FILE" optional="True" />
+        <param name="input_populations_file" type="data" format="txt" label="Populations File" optional="True" />
+        <param name="input_cnv_map_bed" type="data" format="bed" label="Read a copy number map from the BED file FILE" optional="True" />
+        <conditional name="input_variant_type">
+          <param name="input_variant_type_selector" type="select" label="Provide variants file">
+            <option value="do_not_provide" selected="True">Do not provide</option>
+            <option value="provide_vcf">Provide VCF file</option>
+          </param>
+          <when value="do_not_provide">
+            <!-- Do nothing here -->
+          </when>
+          <when value="provide_vcf">
+            <param name="input_variant_vcf" type="data" format="vcf" label="Use variants reported in VCF file as input to the algorithm" />
+            <param name="only_use_input_alleles" type="boolean" truevalue="--only-use-input-alleles" falsevalue="" checked="False" label="Only provide variant calls and genotype likelihoods for sites in VCF" />
+          </when>
+        </conditional>
+        <param name="haplotype_basis_alleles" type="data" format="vcf" label="Only use variant alleles provided in this input VCF for the construction of complex or haplotype alleles" optional="True" />
+        
+        <!-- reporting -->
+        <conditional name="section_reporting_type">
+          <param name="section_reporting_type_selector" type="select" label="Set Reporting options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="pvar" type="float" label="Report sites if the probability that there is a polymorphism at the site is greater" value="0.0001" />
+            <param name="show_reference_repeats" type="boolean" truevalue="--show-reference-repeats" falsevalue="" checked="False" label="Calculate and show information about reference repeats" />
+          </when>
+        </conditional>
+        
+        
+        <!-- population model -->
+        <conditional name="section_population_model_type">
+          <param name="section_population_model_type_selector" type="select" label="Set population model options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="theta" type="float" label="expected mutation rate or pairwise nucleotide diversity among the population" value="0.001" help="This serves as the single parameter to the Ewens Sampling Formula prior model"/>
+            <param name="ploidy" type="integer" label="default ploidy for the analysis" value="2" />
+            <param name="pooled" type="boolean" truevalue="--pooled" falsevalue="" checked="False" label="Assume that samples result from pooled sequencing" help="When using this flag, set --ploidy to the number of alleles in each sample." />
+          </when>
+        </conditional>
+        
+        <!-- reference allele -->
+            <conditional name="use_reference_allele_type">
+              <param name="use_reference_allele_type_selector" type="select" label="Include the reference allele in the analysis">
+                <option value="do_not_include_reference_allele" selected="True">Do not include</option>
+                <option value="include_reference_allele">Include</option>
+              </param>
+              <when value="do_not_include_reference_allele">
+                <!-- Do nothing here -->
+              </when>
+              <when value="include_reference_allele">
+                <param name="diploid_reference" type="boolean" truevalue="--diploid-reference" falsevalue="" checked="False" label="Treat reference as diploid" />
+                <param name="reference_quality_mq" type="integer" label="Assign mapping quality" value="100" />
+                <param name="reference_quality_bq" type="integer" label="Assign base quality" value="60" />
+              </when>
+            </conditional>     
+        
+        <!-- allele scope -->
+        <conditional name="section_allele_scope_type">
+          <param name="section_allele_scope_type_selector" type="select" label="Set allele scope options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="no_snps" type="boolean" truevalue="--no-snps" falsevalue="" checked="False" label="Ignore SNP alleles" />
+            <param name="no_indels" type="boolean" truevalue="--no-indels" falsevalue="" checked="False" label="Ignore insertion and deletion alleles" />
+            <param name="no_mnps" type="boolean" truevalue="--no-mnps" falsevalue="" checked="False" label="Ignore multi-nuceotide polymorphisms, MNPs" />
+            <param name="no_complex" type="boolean" truevalue="--no-complex" falsevalue="" checked="False" label="Ignore complex events (composites of other classes)" />
+            <param name="use_best_n_alleles" type="integer" label="Evaluate only the best N SNP alleles" value="0" min="0" help="Ranked by sum of supporting quality scores; Set to 0 to use all" />
+            <param name="max_complex_gap" type="integer" label="Allow complex alleles with contiguous embedded matches of up to this length" value="" optional="True"/>
+          </when>
+        </conditional>
+        
+        <!-- indel realignment -->
+        <param name="left_align_indels" type="boolean" truevalue="--left-align-indels" falsevalue="" checked="False" label="Left-realign and merge gaps embedded in reads" />
+        
+        <!-- input filters -->
+        <conditional name="section_input_filters_type">
+          <param name="section_input_filters_type_selector" type="select" label="Set input filters options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="use_duplicate_reads" type="boolean" truevalue="--use-duplicate-reads" falsevalue="" checked="False" label="Include duplicate-marked alignments in the analysis" />
+            <conditional name="quality_filter_type">
+              <param name="quality_filter_type_selector" type="select" label="Apply Quality filters">
+                <option value="standard_filters" selected="True">Apply standard</option>
+                <option value="apply_filters">Apply specified</option>
+              </param>
+              <when value="standard_filters">
+                <!-- Do nothing here --> <!-- standard-filters -->
+              </when>
+              <when value="apply_filters">
+                <param name="min_mapping_quality" type="integer" label="Exclude alignments from analysis if they have a mapping quality less than" value="0" />
+                <param name="min_base_quality" type="integer" label="Exclude alleles from analysis if their supporting base quality less than" value="0" />
+                <param name="min_supporting_quality_mq" type="integer" label="In order to consider an alternate allele, at least one supporting alignment must have mapping quality" value="0" />
+                <param name="min_supporting_quality_bq" type="integer" label="In order to consider an alternate allele, at least one supporting alignment must have base quality" value="0" />
+              </when>
+            </conditional>
+            <param name="mismatch_base_quality_threshold" type="integer" label="Count mismatches toward read-mismatch-limit if the base quality of the mismatch is &gt;=" value="10" />
+            <param name="read_mismatch_limit" type="integer" label="Exclude reads with more than N mismatches where each mismatch has base quality &gt;= mismatch-base-quality-threshold" value="" optional="True" />
+            <param name="read_max_mismatch_fraction" type="float" label="Exclude reads with more than N [0,1] fraction of mismatches where each mismatch has base quality &gt;= mismatch-base-quality-threshold" value="1.0" />
+            <param name="read_snp_limit" type="integer" label="Exclude reads with more than N base mismatches, ignoring gaps with quality &gt;= mismatch-base-quality-threshold" value="" optional="True" />
+            <param name="read_indel_limit" type="integer" label="Exclude reads with more than N separate gaps" value="" optional="True" />
+            <param name="indel_exclusion_window" type="integer" label="Ignore portions of alignments this many bases from a putative insertion or deletion allele" value="0" />
+            <param name="min_alternate_fraction" type="float" label="Require at least this fraction of observations supporting an alternate allele within a single individual in the in order to evaluate the position" value="0" />
+            <param name="min_alternate_count" type="integer" label="Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position" value="1" />
+            <param name="min_alternate_qsum" type="integer" label="Require at least this sum of quality of observations supporting an alternate allele within a single individual in order to evaluate the position" value="0" />
+            <param name="min_alternate_total" type="integer" label="Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis" value="1" />
+            <param name="min_coverage" type="integer" label="Require at least this coverage to process a site" value="0" />
+          </when>
+        </conditional>
+        
+        
+        <!-- bayesian priors -->
+        <conditional name="section_bayesian_priors_type">
+          <param name="section_bayesian_priors_type_selector" type="select" label="Set bayesian priors options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="no_ewens_priors" type="boolean" truevalue="--no-ewens-priors" falsevalue="" checked="False" label="Turns off the Ewens' Sampling Formula component of the priors" />
+            <param name="no_population_priors" type="boolean" truevalue="--no-population-priors" falsevalue="" checked="False" label="No population priors" help="Equivalent to --pooled --no-ewens-priors" />
+            <param name="hwe_priors" type="boolean" truevalue="--hwe-priors" falsevalue="" checked="False" label="Use the probability of the combination arising under HWE given the allele frequency as estimated by observation frequency" />
+          </when>
+        </conditional>
+        
+        <!-- observation prior expectations -->
+        <conditional name="section_observation_prior_expectations_type">
+          <param name="section_observation_prior_expectations_type_selector" type="select" label="Set observation prior expectations options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="binomial_obs_priors" type="boolean" truevalue="--binomial-obs-priors" falsevalue="" checked="False" label="Incorporate expectations about osbervations into the priors, Uses read placement probability, strand balance probability, and read position (5'-3') probability" />
+            <param name="allele_balance_priors" type="boolean" truevalue="--allele-balance-priors" falsevalue="" checked="False" label="Use aggregate probability of observation balance between alleles as a component of the priors.  Best for observations with minimal inherent reference bias" />
+          </when>
+        </conditional>
+        
+        
+        <!-- algorithmic features -->
+        <conditional name="section_algorithmic_features_type">
+          <param name="section_algorithmic_features_type_selector" type="select" label="Set algorithmic features options">
+            <option value="do_not_set" selected="True">Do not set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="do_not_set">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="site_selection_max_iterations" type="integer" label="Uses hill-climbing algorithm to search posterior space for N iterations to determine if the site should be evaluated." value="5" help="Set to 0 to prevent use of this algorithm for site selection, and to a low integer for improvide site selection at a slight performance penalty" />
+            <param name="genotyping_max_iterations" type="integer" label="Iterate no more than N times during genotyping step" value="25" />
+            <param name="genotyping_max_banddepth" type="integer" label="Integrate no deeper than the Nth best genotype by likelihood when genotyping" value="6" />
+            <param name="posterior_integration_limits_n" type="integer" label="Posteriror integration limit N" help="Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood." value="1" />
+            <param name="posterior_integration_limits_m" type="integer" label="Posteriror integration limit M" help="Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood." value="3" />
+            <param name="no_permute" type="boolean" truevalue="--no-permute" falsevalue="" checked="False" label="Do not scale prior probability of genotype combination given allele frequency by the number of permutations of included genotypes" />
+            <param name="exclude_unobserved_genotypes" type="boolean" truevalue="--exclude-unobserved-genotypes" falsevalue="" checked="False" label="Skip sample genotypings for which the sample has no supporting reads" />
+            <param name="genotype_variant_threshold" type="integer" label="Limit posterior integration to samples where the second-best genotype likelihood is no more than log(N) from the highest genotype likelihood for the sample" value="" optional="True" />
+            <param name="use_mapping_quality" type="boolean" truevalue="--use-mapping-quality" falsevalue="" checked="False" label="Use mapping quality of alleles when calculating data likelihoods" />
+            <param name="read_dependence_factor" type="float" label="Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations" value="0.9" />
+            <param name="no_marginals" type="boolean" truevalue="--no-marginals" falsevalue="" checked="False" label="Do not calculate the marginal probability of genotypes.  Saves time and improves scaling performance in large populations" />
+          </when>
+        </conditional>
+        
+        
+      </when>
+    </conditional>
+    
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output_vcf" label="${tool.name} on ${on_string} (variants)" />
+    <data format="bed" name="output_failed_alleles_bed" label="${tool.name} on ${on_string} (failed alleles)">
+        <filter>options_type['options_type_selector'] == "advanced" and options_type['output_failed_alleles_option'] is True</filter>
+    </data>
+    <data format="txt" name="output_trace" label="${tool.name} on ${on_string} (trace)">
+        <filter>options_type['options_type_selector'] == "advanced" and options_type['output_trace_option'] is True</filter>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+     <param name="reference_source_selector" value="history" />
+      <param name="ref_file" ftype="fasta" value="phiX.fasta"/>
+      <param name="input_bam" ftype="bam" value="fake_phiX_reads_1.bam"/>
+      <param name="options_type_selector" value="basic"/>
+      <output name="output_vcf" file="freebayes_out_1.vcf.contains" compare="contains"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool uses FreeBayes to call SNPs given a reference sequence and a BAM alignment file.
+
+FreeBayes is a high-performance, flexible, and open-source Bayesian genetic variant detector. It operates on BAM alignment files, which are produced by most contemporary short-read aligners.
+
+In addition to substantial performance improvements over its predecessors (PolyBayes, GigaBayes, and BamBayes), it expands the scope of SNP and small-indel variant calling to populations of individuals with heterogeneous copy number. FreeBayes is currently under active development. 
+
+Go `here &lt;http://bioinformatics.bc.edu/marthlab/FreeBayes&gt;`_ for details on FreeBayes.
+
+------
+
+**Inputs**
+
+FreeBayes accepts an input aligned BAM file.
+
+
+**Outputs**
+
+The output is in the VCF format.
+
+-------
+
+**Settings**::
+
+  input and output:
+
+   -b --bam FILE   Add FILE to the set of BAM files to be analyzed.
+   -c --stdin      Read BAM input on stdin.
+   -v --vcf FILE   Output VCF-format results to FILE.
+   -f --fasta-reference FILE
+                   Use FILE as the reference sequence for analysis.
+                   An index file (FILE.fai) will be created if none exists.
+                   If neither --targets nor --region are specified, FreeBayes
+                   will analyze every position in this reference.
+   -t --targets FILE
+                   Limit analysis to targets listed in the BED-format FILE.
+   -r --region &lt;chrom&gt;:&lt;start_position&gt;..&lt;end_position&gt;
+                   Limit analysis to the specified region, 0-base coordinates,
+                   end_position not included (same as BED format).
+   -s --samples FILE
+                   Limit analysis to samples listed (one per line) in the FILE.
+                   By default FreeBayes will analyze all samples in its input
+                   BAM files.
+   --populations FILE
+                   Each line of FILE should list a sample and a population which
+                   it is part of.  The population-based bayesian inference model
+                   will then be partitioned on the basis of the populations.
+   -A --cnv-map FILE
+                   Read a copy number map from the BED file FILE, which has
+                   the format:
+                      reference sequence, start, end, sample name, copy number
+                   ... for each region in each sample which does not have the
+                   default copy number as set by --ploidy.
+   -L --trace FILE  Output an algorithmic trace to FILE.
+   --failed-alleles FILE
+                   Write a BED file of the analyzed positions which do not
+                   pass --pvar to FILE.
+   -@ --variant-input VCF
+                   Use variants reported in VCF file as input to the algorithm.
+                   A report will be generated for every record in the VCF file.
+   -l --only-use-input-alleles
+                   Only provide variant calls and genotype likelihoods for sites
+                   and alleles which are provided in the VCF input, and provide
+                   output in the VCF for all input alleles, not just those which
+                   have support in the data.
+   --haplotype-basis-alleles VCF
+                   When specified, only variant alleles provided in this input
+                   VCF will be used for the construction of complex or haplotype
+                   alleles.
+
+  reporting:
+
+   -P --pvar N     Report sites if the probability that there is a polymorphism
+                   at the site is greater than N.  default: 0.0001
+   -_ --show-reference-repeats
+                   Calculate and show information about reference repeats in
+                   the VCF output.
+
+  population model:
+
+   -T --theta N    The expected mutation rate or pairwise nucleotide diversity
+                   among the population under analysis.  This serves as the
+                   single parameter to the Ewens Sampling Formula prior model
+                   default: 0.001
+   -p --ploidy N   Sets the default ploidy for the analysis to N.  default: 2
+   -J --pooled     Assume that samples result from pooled sequencing.
+                   When using this flag, set --ploidy to the number of
+                   alleles in each sample.
+
+  reference allele:
+
+   -Z --use-reference-allele
+                   This flag includes the reference allele in the analysis as
+                   if it is another sample from the same population.
+   -H --diploid-reference
+                   If using the reference sequence as a sample (-Z),
+                   treat it as diploid.  default: false (reference is haploid)
+   --reference-quality MQ,BQ
+                   Assign mapping quality of MQ to the reference allele at each
+                   site and base quality of BQ.  default: 100,60
+
+  allele scope:
+
+   -I --no-snps    Ignore SNP alleles.
+   -i --no-indels  Ignore insertion and deletion alleles.
+   -X --no-mnps    Ignore multi-nucleotide polymorphisms, MNPs.
+   -u --no-complex Ignore complex events (composites of other classes).
+   -n --use-best-n-alleles N
+                   Evaluate only the best N SNP alleles, ranked by sum of
+                   supporting quality scores.  (Set to 0 to use all; default: all)
+   -E --max-complex-gap N
+                   Allow complex alleles with contiguous embedded matches of up
+                   to this length.
+
+  indel realignment:
+
+   -O --left-align-indels
+                   Left-realign and merge gaps embedded in reads. default: false
+
+  input filters:
+
+   -4 --use-duplicate-reads
+                   Include duplicate-marked alignments in the analysis.
+                   default: exclude duplicates
+   -m --min-mapping-quality Q
+                   Exclude alignments from analysis if they have a mapping
+                   quality less than Q.  default: 30
+   -q --min-base-quality Q
+                   Exclude alleles from analysis if their supporting base
+                   quality is less than Q.  default: 20
+   -R --min-supporting-quality MQ,BQ
+                   In order to consider an alternate allele, at least one supporting
+                   alignment must have mapping quality MQ, and one supporting 
+                   allele must have base quality BQ. default: 0,0, unset
+   -Q --mismatch-base-quality-threshold Q
+                   Count mismatches toward --read-mismatch-limit if the base
+                   quality of the mismatch is &gt;= Q.  default: 10
+   -U --read-mismatch-limit N
+                   Exclude reads with more than N mismatches where each mismatch
+                   has base quality &gt;= mismatch-base-quality-threshold.
+                   default: ~unbounded
+   -z --read-max-mismatch-fraction N
+                   Exclude reads with more than N [0,1] fraction of mismatches where
+                   each mismatch has base quality &gt;= mismatch-base-quality-threshold
+                   default: 1.0
+   -$ --read-snp-limit N
+                   Exclude reads with more than N base mismatches, ignoring gaps
+                   with quality &gt;= mismatch-base-quality-threshold.
+                   default: ~unbounded
+   -e --read-indel-limit N
+                   Exclude reads with more than N separate gaps.
+                   default: ~unbounded
+   -0 --standard-filters  Use stringent input base and mapping quality filters
+                   Equivalent to -m 30 -q 20 -R 0 -S 0
+   -x --indel-exclusion-window
+                   Ignore portions of alignments this many bases from a
+                   putative insertion or deletion allele.  default: 0
+   -F --min-alternate-fraction N
+                   Require at least this fraction of observations supporting
+                   an alternate allele within a single individual in order
+                   to evaluate the position.  default: 0.0
+   -C --min-alternate-count N
+                   Require at least this count of observations supporting
+                   an alternate allele within a single individual in order
+                   to evaluate the position.  default: 1
+   -3 --min-alternate-qsum N
+                   Require at least this sum of quality of observations supporting
+                   an alternate allele within a single individual in order
+                   to evaluate the position.  default: 0
+   -G --min-alternate-total N
+                   Require at least this count of observations supporting
+                   an alternate allele within the total population in order
+                   to use the allele in analysis.  default: 1
+   -! --min-coverage N
+                   Require at least this coverage to process a site.  default: 0
+
+  bayesian priors:
+
+   -Y --no-ewens-priors
+                   Turns off the Ewens' Sampling Formula component of the priors.
+   -k --no-population-priors
+                   Equivalent to --pooled --no-ewens-priors
+   -w --hwe-priors Use the probability of the combination arising under HWE given
+                   the allele frequency as estimated by observation frequency.
+
+  observation prior expectations:
+
+   -V --binomial-obs-priors
+                   Incorporate expectations about observations into the priors.
+                   Uses read placement probability, strand balance probability,
+                   and read position (5'-3') probability.
+   -a --allele-balance-priors
+                   Use aggregate probability of observation balance between alleles
+                   as a component of the priors.  Best for observations with minimal
+                   inherent reference bias.
+
+  algorithmic features:
+
+   -M --site-selection-max-iterations N
+                   Uses hill-climbing algorithm to search posterior space for N
+                   iterations to determine if the site should be evaluated.  Set to 0
+                   to prevent use of this algorithm for site selection, and
+                   to a low integer for improved site selection at a slight
+                   performance penalty. default: 5.
+   -B --genotyping-max-iterations N
+                   Iterate no more than N times during genotyping step. default: 25.
+   --genotyping-max-banddepth N
+                   Integrate no deeper than the Nth best genotype by likelihood when
+                   genotyping. default: 6.
+   -W --posterior-integration-limits N,M
+                   Integrate all genotype combinations in our posterior space
+                   which include no more than N samples with their Mth best
+                   data likelihood. default: 1,3.
+   -K --no-permute
+                   Do not scale prior probability of genotype combination given allele
+                   frequency by the number of permutations of included genotypes.
+   -N --exclude-unobserved-genotypes
+                   Skip sample genotypings for which the sample has no supporting reads.
+   -S --genotype-variant-threshold N
+                   Limit posterior integration to samples where the second-best
+                   genotype likelihood is no more than log(N) from the highest
+                   genotype likelihood for the sample.  default: ~unbounded
+   -j --use-mapping-quality
+                   Use mapping quality of alleles when calculating data likelihoods.
+   -D --read-dependence-factor N
+                   Incorporate non-independence of reads by scaling successive
+                   observations by this factor during data likelihood
+                   calculations.  default: 0.9
+   -= --no-marginals
+                   Do not calculate the marginal probability of genotypes.  Saves
+                   time and improves scaling performance in large populations.
+
+
+------
+
+**Citation**
+
+For the underlying tool, please cite `Erik Garrison and Gabor Marth. Haplotype-based variant detection from short-read sequencing &lt;http://arxiv.org/abs/1207.3907&gt;`_.
+
+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
+
+  </help>
+</tool>
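
For orientation, a Basic run against a locally cached reference expands the Cheetah template above into a single shell command of roughly this shape (all paths are illustrative placeholders filled in by Galaxy at runtime from the selected datasets and the sam_fa_indexes table):

    ln -s "/galaxy/datasets/reads.bam" "localbam_0.bam" &&
    ln -s "/galaxy/datasets/reads.bam.bai" "localbam_0.bam.bai" &&
    freebayes --bam "localbam_0.bam" --fasta-reference "/mnt/galaxyIndices/hg19.fa" --vcf "/galaxy/datasets/output.vcf"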

File tools/gatk-condor/gatk_wrapper.py

 import sys, optparse, os, tempfile, subprocess, shutil
 from binascii import unhexlify
 from string import Template
+import time  #liubo added
 
 GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbSNP', 'picard_interval_list':'interval_list' } #items not listed here will use the galaxy extension as-is
 GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed
         html_out.write(  '<li><a href="%s">%s</a></li>\n' % ( fname, fname ) )
     html_out.write( '</ul>\n</body>\n</html>\n' )
 
+def index_bam_files( bam_filenames, tmp_dir ):
+    for bam_filename in bam_filenames:
+        bam_index_filename = "%s.bai" % bam_filename
+        if not os.path.exists( bam_index_filename ):
+            #need to index this bam file
+            stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name
+            command = 'samtools index %s %s' % ( bam_filename, bam_index_filename )
+            proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
+            return_code = proc.wait()
+            if return_code:
+                for line in open( stderr_name ):
+                    print >> sys.stderr, line
+                os.unlink( stderr_name ) #clean up
+                cleanup_before_exit( tmp_dir )
+                raise Exception( "Error indexing BAM file" )
+            os.unlink( stderr_name ) #clean up
+
 def __main__():
+    #liubo added, print current time for calculating execution time of GATK
+    print "Start time:"
+    print time.strftime('%d/%m/%Y %H:%M:%S\n', time.localtime(time.time()))
+
     #Parse Command Line
     parser = optparse.OptionParser()
     parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' )
     parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
     parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"')
     (options, args) = parser.parse_args()
-    
+   
+##liubo added
+    print options
+ 
     tmp_dir = tempfile.mkdtemp( prefix='tmp-gatk-' )
     if options.pass_through_options:
         cmd = ' '.join( options.pass_through_options )
         cmd = cmd.replace( 'java ', 'java -Xmx%s ' % ( options.max_jvm_heap ), 1 )
     elif options.max_jvm_heap_fraction is not None:
         cmd = cmd.replace( 'java ', 'java -XX:DefaultMaxRAMFraction=%s  -XX:+UseParallelGC ' % ( options.max_jvm_heap_fraction ), 1 )
+    bam_filenames = []
     if options.datasets:
         for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets:
             gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix )
             if dataset_arg:
                 cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename )
+            if galaxy_ext == "bam":
+                bam_filenames.append( gatk_filename )
+    index_bam_files( bam_filenames, tmp_dir )
     #set up stdout and stderr output options
     stdout = open_file_from_option( options.stdout, mode = 'wb' )
     stderr = open_file_from_option( options.stderr, mode = 'wb' )
     #if no stderr file is specified, we'll use our own
     if stderr is None:
         stderr = tempfile.NamedTemporaryFile( prefix="gatk-stderr-", dir=tmp_dir )
-    
+   
+    ##liubo added
+    print cmd
+ 
     proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
     return_code = proc.wait()
     
     
     cleanup_before_exit( tmp_dir )
 
+    #liubo added, print current time for calculating execution time of GATK
+    print "\n\nEnd time:"
+    print time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
 if __name__=="__main__": __main__()

File tools/gatk/gatk_wrapper.py

 import sys, optparse, os, tempfile, subprocess, shutil
 from binascii import unhexlify
 from string import Template
+import time  #liubo added
 
 GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbSNP', 'picard_interval_list':'interval_list' } #items not listed here will use the galaxy extension as-is
 GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed
             os.unlink( stderr_name ) #clean up
 
 def __main__():
+    #liubo added, print current time for calculating execution time of GATK
+    print "Start time:"
+    print time.strftime('%d/%m/%Y %H:%M:%S\n', time.localtime(time.time()))
+
     #Parse Command Line
     parser = optparse.OptionParser()
     parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' )
     
     cleanup_before_exit( tmp_dir )
 
+    #liubo added, print current time for calculating execution time of GATK
+    print "\n\nEnd time:"
+    print time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
 if __name__=="__main__": __main__()

File tools/novoalign/novoalign.xml

 	</requirements>
 	<description>for Illumina</description>
 	<parallelism method="basic"></parallelism>
-	<command>/nfs/software/galaxy/tools/novoalign/novoalign.py /nfs/software/galaxy/tools/novoalign/novoalign
+	<command>${GALAXY_DATA_INDEX_DIR}/../tools/novoalign/novoalign.py ${GALAXY_DATA_INDEX_DIR}/../tools/novoalign/novoalign
 		-f $fastqFile1
 
 		#if $pairedOrSingle.pairedEnd == "true"

File tools/peak_calling/macs_wrapper-condor.py

+import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "simplejson" )
+import simplejson
+
+#modification for MACS V1.4.2
+
+CHUNK_SIZE = 1024
+
+def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
+    out = open( target_filename, 'wb' )
+    for filename in glob.glob( glob_path ):
+        fh = gzip.open( filename, 'rb' )
+        while True:
+            data = fh.read( CHUNK_SIZE )
+            if data:
+                out.write( data )
+            else:
+                break
+        fh.close()
+        if delete:
+            os.unlink( filename )
+    out.close()
+
+def xls_to_interval( xls_file, interval_file, header = None ):
+    out = open( interval_file, 'wb' )
+    if header:
+        out.write( '#%s\n' % header )
+    wrote_header = False
+    #From macs readme: Coordinates in XLS is 1-based which is different with BED format.
+    for line in open( xls_file ):
+        #keep all existing comment lines
+        if line.startswith( '#' ):
+            out.write( line )
+        elif not wrote_header:
+            out.write( '#%s' % line )
+            wrote_header = True
+        else:
+            fields = line.split( '\t' )
+            if len( fields ) > 1:
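+                #XLS start coordinates are 1-based; subtract 1 to get the 0-based start used by interval/BED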
+                fields[1] = str( int( fields[1] ) - 1 )
+            out.write( '\t'.join( fields ) )
+    out.close()
+
+def main():
+    #liubo: argument indices are shifted by one relative to the original macs_wrapper.py because this script is launched through condor_run.py
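+    #   sys.argv[2] = JSON options file written by the tool's <configfile>
+    #   sys.argv[3] = peaks BED output
+    #   sys.argv[4] = HTML report output
+    #   sys.argv[5] = directory for additional MACS output files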
+    options = simplejson.load( open( sys.argv[2] ) )
+    output_bed = sys.argv[3]
+    output_extra_html = sys.argv[4]
+    output_extra_path = sys.argv[5]
+    
+    experiment_name = '_'.join( options['experiment_name'].split() ) #save the experiment name; macs uses it for output filenames, and gzipping the wig files fails if it contains spaces (macs does not escape them), so replace all whitespace via split/join
+    #cmdline = "macs -t %s" % ",".join( options['input_chipseq'] )   
+    cmdline = "macs14 -t %s" % ",".join( options['input_chipseq'] )    #change to macs14 for MACS V1.4.2
+    if options['input_control']:
+        cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )
+#    cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s --lambdaset='%s' %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['lambdaset'], options['futurefdr'] )
+    cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['futurefdr'] )
+
+    if 'wig' in options:
+        wigextend = int( options['wig']['wigextend']  )
+        if wigextend >= 0:
+            wigextend = "--wigextend='%s'" % wigextend
+        else:
+            wigextend = ''
+        cmdline = "%s --wig %s --space='%s'" % ( cmdline, wigextend, options['wig']['space'] )
+    if 'nomodel' in options:
+        cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] )
+    if 'diag' in options:
+        cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] )
+    
+    tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user
+    stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report
+    proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
+    proc.wait()
+    #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log
+    #Do not terminate if error code, allow dataset (e.g. log) creation and cleanup
+    if proc.returncode:
+        stderr_f = open( stderr_name )
+        while True:
+            chunk = stderr_f.read( CHUNK_SIZE )
+            if not chunk:
+                stderr_f.close()
+                break
+            sys.stderr.write( chunk )
+    
+    #run R to create pdf from model script
+    if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
+        cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
+        proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir )
+        proc.wait()
+    
+    
+    #move bed out to proper output file
+    created_bed_name =  os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name )
+    if os.path.exists( created_bed_name ):
+        shutil.move( created_bed_name, output_bed )
+    
+    #parse xls files to interval files as needed
+    if options['xls_to_interval']:
+        create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name )
+        if os.path.exists( create_peak_xls_file ):
+            xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' )
+        create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name )
+        if os.path.exists( create_peak_xls_file ):
+            xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' )
+    
+    #merge and move wig files as needed, delete gz'd files and remove emptied dirs
+    if 'wig' in options:
+        wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name )
+        if os.path.exists( wig_base_dir ):
+            #treatment
+            treatment_dir = os.path.join( wig_base_dir, "treat" )
+            if os.path.exists( treatment_dir ):
+                gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True )
+                os.rmdir( treatment_dir )
+            #control
+            if options['input_control']:
+                control_dir = os.path.join( wig_base_dir, "control" )
+                if os.path.exists( control_dir ):
+                    gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True )
+                    os.rmdir( control_dir )
+            os.rmdir( wig_base_dir )
+    
+    #move all remaining files to extra files path of html file output to allow user download
+    out_html = open( output_extra_html, 'wb' )
+    out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
+    os.mkdir( output_extra_path )
+    for filename in sorted( os.listdir( tmp_dir ) ):
+        shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
+        out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
+    out_html.write( '</ul></p>\n' )
+    out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() )
+    out_html.write( '</body></html>\n' )
+    out_html.close()
+    
+    os.unlink( stderr_name )
+    os.rmdir( tmp_dir )
+
+if __name__ == "__main__": main()

File tools/peak_calling/macs_wrapper-condor.xml

View file
+<tool id="peakcalling_macs-condor" name="MACS (via Condor)" version="1.0.1">
+  <description>Model-based Analysis of ChIP-Seq</description>
+  <command interpreter="python">condor_run.py  
+${GALAXY_DATA_INDEX_DIR}/../tools/peak_calling/macs_wrapper-condor.py 
+$options_file $output_bed_file $output_extra_files $output_extra_files.files_path
+  </command>
+
+  <requirements>
+    <requirement type="python-module">macs</requirement>
+    <requirement type="package">macs</requirement>
+  </requirements>
+  <inputs>
+    <param name="experiment_name" type="text" value="MACS in Galaxy" size="50" label="Experiment Name"/>
+    <conditional name="input_type">
+      <param name="input_type_selector" type="select" label="Paired End Sequencing">
+        <option value="paired_end">Paired End (requires elandmulti format)</option>
+        <option value="single_end" selected="true">Single End</option>
+      </param>
+      <when value="paired_end">
+        <param name="input_chipseq_file1" type="data" format="elandmulti" label="ChIP-Seq Tag File 1" />
+        <param name="input_chipseq_file2" type="data" format="elandmulti" label="ChIP-Seq Tag File 2" />
+        <param name="input_control_file1" type="data" format="elandmulti" optional="True" label="ChIP-Seq Control File 1" />
+        <param name="input_control_file2" type="data" format="elandmulti" optional="True" label="ChIP-Seq Control File 2" />
+        <param name="petdist" type="integer" label="Best distance between Pair-End Tags" value="200"/>
+      </when>
+      <when value="single_end">
+        <param name="input_chipseq_file1" type="data" format="bed,sam,bam,eland,elandmulti" label="ChIP-Seq Tag File" />
+        <param name="input_control_file1" type="data" format="bed,sam,bam,eland,elandmulti" optional="True" label="ChIP-Seq Control File" />
+      </when>
+    </conditional>
+<!--    <param name="gsize" type="float" label="Effective genome size" value="2.7e+9" help="default: 2.7e+9"/>  modified by liubo, some values are 'hs'-->
+    <param name="gsize" type="text" label="Effective genome size" value="hs" help="Either a number such as 1.0e+9 or 1000000000, or a shortcut: 'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7), 'dm' for fruit fly (1.2e8). Default: hs"/>
+
+    <param name="tsize" type="integer" label="Tag size" value="25"/>
+    <param name="bw" type="integer" label="Band width" value="300"/>
+    <param name="pvalue" type="float" label="Pvalue cutoff for peak detection" value="1e-5" help="default: 1e-5"/>
+<!--    <param name="mfold" type="integer" label="Select the regions with MFOLD high-confidence enrichment ratio against background to build model" value="32"/> -->
+    <param name="mfold" type="text" label="Select the regions with MFOLD high-confidence enrichment ratio against background to build model" value="10,30"/>
+
+    <param name="xls_to_interval" label="Parse xls files into distinct interval files" type="boolean" truevalue="create" falsevalue="do_not_create" checked="False"/>
+    <conditional name="wig_type">
+      <param name="wig_type_selector" type="select" label="Save shifted raw tag count at every bp into a wiggle file">
+        <option value="wig">Save</option>
+        <option value="no_wig" selected="true">Do not create wig file (faster)</option>
+      </param>
+      <when value="wig">
+        <param name="wigextend" type="integer" label="Extend tag from its middle point to a wigextend size fragment." value="-1" help="Use value less than 0 for default (modeled d)"/>
+        <param name="space" type="integer" label="Resolution for saving wiggle files" value="10"/>
+      </when>
+      <when value="no_wig">
+        <!-- do nothing here -->
+      </when>
+    </conditional>
+    <param name="nolambda" label="Use fixed background lambda as local lambda for every peak region" type="boolean" truevalue="--nolambda" falsevalue="" checked="False" help="up to 9X more time consuming"/>
+    <param name="lambdaset" type="text" label="3 levels of regions around the peak region to calculate the maximum lambda as local lambda" value="1000,5000,10000" size="50"/>
+    <conditional name="nomodel_type">
+      <param name="nomodel_type_selector" type="select" label="Build Model">
+        <option value="nomodel">Do not build the shifting model</option>
+        <option value="create_model" selected="true">Build the shifting model</option>
+      </param>
+      <when value="nomodel">
+        <param name="shiftsize" type="integer" label="Arbitrary shift size in bp" value="100"/>
+      </when>
+      <when value="create_model">
+        <!-- do nothing here -->
+      </when>
+    </conditional>
+    <conditional name="diag_type">
+      <param name="diag_type_selector" type="select" label="Diagnosis report" help="up to 9X more time consuming">
+        <option value="diag">Produce a diagnosis report</option>
+        <option value="no_diag" selected="true">Do not produce report (faster)</option>
+      </param>
+      <when value="diag">
+        <param name="fe-min" type="integer" label="Min fold enrichment to consider" value="0"/>
+        <param name="fe-max" type="integer" label="Max fold enrichment to consider" value="32"/>
+        <param name="fe-step" type="integer" label="Fold enrichment step" value="20"/>
+      </when>
+      <when value="no_diag">
+        <!-- do nothing here -->
+      </when>
+    </conditional>
+    <param name="futurefdr" label="Perform the new peak detection method (futurefdr)" type="boolean" truevalue="--futurefdr" falsevalue="" checked="False" help="The default method only considers the peak location and the 1k, 5k, and 10k regions in the control data, whereas the new method also considers the 5k and 10k regions in the treatment data when calculating local bias."/>
+  </inputs>
+  <outputs>
+    <data name="output_bed_file" format="bed" label="${tool.name} on ${on_string} (peaks: bed)"/>
+    <data name="output_xls_to_interval_peaks_file" format="interval" label="${tool.name} on ${on_string} (peaks: interval)">
+      <filter>xls_to_interval is True</filter>
+    </data>
+    <data name="output_xls_to_interval_negative_peaks_file" format="interval" label="${tool.name} on ${on_string} (negative peaks: interval)">
+      <filter>xls_to_interval is True</filter>
+      <filter>input_type['input_control_file1'] is not None</filter>
+    </data>
+    <data name="output_treatment_wig_file" format="wig" label="${tool.name} on ${on_string} (treatment: wig)">
+      <filter>wig_type['wig_type_selector']=='wig'</filter>
+    </data>
+    <data name="output_control_wig_file" format="wig" label="${tool.name} on ${on_string} (control: wig)">
+      <filter>wig_type['wig_type_selector'] == 'wig'</filter>
+      <filter>input_type['input_control_file1'] is not None</filter>
+    </data>
+    <data name="output_extra_files" format="html" label="${tool.name} on ${on_string} (html report)"/>
+  </outputs>
+  <configfiles>
+    <configfile name="options_file">&lt;%
+import simplejson
+%&gt;
+#set $__options = { 'experiment_name':str( $experiment_name ), 'gsize': str( $gsize ), 'tsize':str( $tsize ), 'bw':str( $bw ), 'pvalue':str( $pvalue ), 'mfold':str( $mfold ), 'nolambda':str( $nolambda ), 'lambdaset': str( $lambdaset ), 'futurefdr':str( $futurefdr ) }
+#if str( $xls_to_interval ) == 'create':
+#set $__options['xls_to_interval'] = { 'peaks_file': str( $output_xls_to_interval_peaks_file ), 'negative_peaks_file': str( $output_xls_to_interval_negative_peaks_file ) }
+#else:
+#set $__options['xls_to_interval'] = False
+#end if
+##treatment/tag input files and format
+#set $__options['input_chipseq'] = [ str( $input_type['input_chipseq_file1'] ) ]
+#if  $input_type['input_type_selector'] == 'paired_end':
+#set $_hole = __options['input_chipseq'].append( str( $input_type['input_chipseq_file2'] ) )
+#set $__options['format'] = 'ELANDMULTIPET'
+#else:
+#set $__options['format'] = $input_type['input_chipseq_file1'].extension.upper()
+#end if
+##control/input files
+#set $__options['input_control'] = []
+#if str( $input_type['input_control_file1'] ) != 'None':
+#set $_hole = __options['input_control'].append( str( $input_type['input_control_file1'] ) )
+#end if
+#if $input_type['input_type_selector'] == 'paired_end' and str( $input_type['input_control_file2'] ) != 'None':
+#set $_hole = __options['input_control'].append( str( $input_type['input_control_file2'] ) )
+#end if
+##wig options
+#if $wig_type['wig_type_selector'] == 'wig':
+#set $__options['wig'] = {}
+#set $__options['wig']['wigextend'] = str( $wig_type['wigextend'] )
+#set $__options['wig']['space'] = str( $wig_type['space'] )
+#set  $__options['wig']['output_treatment_file'] = str( $output_treatment_wig_file )
+#if $input_type['input_control_file1'] is not None:
+#set  $__options['wig']['output_control_file'] = str( $output_control_wig_file )
+#end if
+#end if
+##model options
+#if $nomodel_type['nomodel_type_selector'] == 'nomodel':
+#set $__options['nomodel'] = str( $nomodel_type['shiftsize'] )
+#end if
+##diag options
+#if $diag_type['diag_type_selector'] == 'diag':
+#set $__options['diag'] = { 'fe-min':str( $diag_type['fe-min'] ), 'fe-max':str( $diag_type['fe-max'] ), 'fe-step':str( $diag_type['fe-step'] ) }
+#end if
+${ simplejson.dumps( __options ) }
+    </configfile>
+  </configfiles>
+  <tests>
+    <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="13" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" />
+      <param name="wig_type_selector" value="no_wig" />
+      <param name="nolambda"/>
+      <param name="lambdaset" value="1000,5000,10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_1_out.html" compare="re_match" >
+        <extra_files type="file" name="Galaxy_Test_Run_model.pdf" value="peakcalling_macs/test2/Galaxy_Test_Run_model.pdf" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_model.r" value="peakcalling_macs/test2/Galaxy_Test_Run_model.r" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_model.r.log" value="peakcalling_macs/test2/Galaxy_Test_Run_model.r.log"/>
+        <extra_files type="file" name="Galaxy_Test_Run_negative_peaks.xls" value="peakcalling_macs/test2/Galaxy_Test_Run_negative_peaks.xls" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_peaks.xls" value="peakcalling_macs/test2/Galaxy_Test_Run_peaks.xls" compare="re_match"/>
+      </output>
+    </test>
+    <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="13" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" value="true" />
+      <param name="wig_type_selector" value="no_wig" />
+      <param name="nolambda"/>
+      <param name="lambdaset" value="1000,5000,10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_xls_to_interval_peaks_file" file="peakcalling_macs/macs_test_2_peaks_out.interval" lines_diff="4" />
+      <output name="output_xls_to_interval_negative_peaks_file" file="peakcalling_macs/macs_test_2_neg_peaks_out.interval" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_1_out.html" compare="re_match" >
+        <extra_files type="directory" value="peakcalling_macs/test2/" compare="re_match"/>
+      </output>
+    </test>
+    <!-- <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="13" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" value="true" />
+      <param name="wig_type_selector" value="wig" />
+      <param name="wigextend" value="-1" />
+      <param name="space" value="10" />
+      <param name="nolambda"/>
+      <param name="lambdaset" value="1000,5000,10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_xls_to_interval_peaks_file" file="peakcalling_macs/macs_test_2_peaks_out.interval" lines_diff="4" />
+      <output name="output_xls_to_interval_negative_peaks_file" file="macs_test_2_neg_peaks_out.interval" />
+      <output name="output_treatment_wig_file" file="peakcalling_macs/macs_test_3_treatment_out.wig" />
+      <output name="output_control_wig_file" file="peakcalling_macs/macs_test_3_control_out.wig" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_3_out.html" compare="re_match" >
+        <extra_files type="directory" value="peakcalling_macs/test2/" compare="re_match"/>
+      </output>
+    </test> -->
+  </tests>
+  <help>
+**What it does**
+
+This tool allows ChIP-seq peak calling using MACS.
+
+Depending upon selected options, 2 to 6 history items will be created; the first output will be a standard BED file and the last will be an HTML report containing links to download additional files generated by MACS. Up to two each of wig and interval files can be optionally created; the interval files are parsed from the xls output.
+
+View the original MACS documentation: http://liulab.dfci.harvard.edu/MACS/00README.html.
+
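+Internally the wrapper invokes MACS 1.4.2 as ``macs14``; a representative (illustrative) command line, built from the options above, looks like::
+
+  macs14 -t chip_tags.bed -c control_tags.bed --format=BED --name='Experiment' --gsize=hs --tsize=25 --bw=300 --pvalue=1e-5 --mfold=10,30
+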
+------
+
+**Citation**
+
+For the underlying tool, please cite `Zhang Y, Liu T, Meyer CA, Eeckhoute J, Johnson DS, Bernstein BE, Nusbaum C, Myers RM, Brown M, Li W, Liu XS. Model-based analysis of ChIP-Seq (MACS). Genome Biol. 2008;9(9):R137. &lt;http://www.ncbi.nlm.nih.gov/pubmed/18798982&gt;`_
+
+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
+
+  </help>
+</tool>

File tools/samtools/sam_to_bam-condor.xml

View file
+<tool id="sam_to_bam-condor" name="SAM-to-BAM (via Condor)" version="1.1.2">
+  <description>converts SAM format to BAM format</description>
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">condor_run.py
+    ${GALAXY_DATA_INDEX_DIR}/../tools/samtools/sam_to_bam.py
+      --input1=$source.input1
+      #if $source.index_source == "history":
+        --dbkey=${ref_file.metadata.dbkey} 
+        --ref_file=$source.ref_file
+      #else
+        --dbkey=${input1.metadata.dbkey} 
+      #end if
+      --output1=$output1
+      --index_dir=${GALAXY_DATA_INDEX_DIR}
+  </command>
+  <inputs>
+    <conditional name="source">
+      <param name="index_source" type="select" label="Choose the source for the reference list">
+        <option value="cached">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="cached">
+        <param name="input1" type="data" format="sam" metadata_name="dbkey" label="SAM File to Convert">
+           <validator type="unspecified_build" />
+           <validator type="dataset_metadata_in_file" filename="sam_fa_indices.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="index" />
+        </param>
+      </when>
+      <when value="history">
+        <param name="input1" type="data" format="sam" label="Convert SAM file" />
+        <param name="ref_file" type="data" format="fasta" metadata_name="dbkey" label="Using reference file" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output1" label="${tool.name} on ${on_string}: converted BAM">
+      <actions>
+        <conditional name="source.index_source">
+          <when value="cached">
+            <action type="metadata" name="dbkey">
+              <option type="from_param" name="source.input1" param_attribute="dbkey" />
+            </action>
+          </when>
+          <when value="history">
+            <action type="metadata" name="dbkey">
+              <option type="from_param" name="source.ref_file" param_attribute="dbkey" />
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!--
+      Sam-to-Bam command:
+      cp test-data/chr_m.fasta .
+      samtools faidx chr_m.fasta
+      samtools view -hbt chr_m.fasta.fai -o unsorted.bam test-data/sam_to_bam_in1.sam
+      samtools sort unsorted.bam sam_to_bam_out1
+      chr_m.fasta is the reference file (chrM from equCab2)
+      -->
+      <param name="index_source" value="history" /> 
+      <param name="input1" value="sam_to_bam_in1.sam" ftype="sam" />
+      <param name="ref_file" value="chr_m.fasta" ftype="fasta" dbkey="equCab2" />
+      <output name="output1" file="sam_to_bam_out1.bam" ftype="bam" />
+    </test>
+    <test>
+      <!--
+      Sam-to-Bam command:
+      samtools view -hbt chr_m.fasta.fai -o unsorted.bam test-data/sam_to_bam_in1.sam
+      samtools sort unsorted.bam sam_to_bam_out2
+      chr_m.fasta is the reference file and the index chr_m.fasta.fai 
+      these should be in the same directory, and chrM is from equCab2
+      -->
+      <param name="index_source" value="cached" />
+      <param name="input1" value="sam_to_bam_in1.sam" ftype="sam" dbkey="chrM" />
+      <output name="output1" file="sam_to_bam_out2.bam" ftype="bam" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses the SAMTools_ toolkit to produce an indexed BAM file based on a sorted input SAM file.
+
+.. _SAMTools: http://samtools.sourceforge.net/samtools.shtml
+
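+The conversion is essentially the two samtools commands shown in the test comments above; roughly (file names here are illustrative)::
+
+  samtools view -hbt reference.fasta.fai -o unsorted.bam input.sam
+  samtools sort unsorted.bam output_prefix
+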
+------
+
+**Citation**
+
+For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505943&gt;`_
+
+  </help>
+</tool>

File tools/samtools/samtools_sort-condor.xml

View file
+<tool id="samtools_sort-condor" name="Sort (via Condor)" version="1.0.3">
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <description>BAM file</description>
+  <command interpreter="python">condor_run.py
+    ${GALAXY_DATA_INDEX_DIR}/../tools/samtools/samtools_sort.py
+      --input=$input
+      --output=$output
+  </command>
+  <inputs>
+    <param name="input" type="data" format="bam" label="BAM file to sort" />
+  </inputs>
+  <outputs>
+    <data format="bam" name="output" label="${tool.name} on ${on_string}: sorted BAM" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool uses the SAMTools_ toolkit to sort a BAM alignment file.
+
+.. _SAMTools: http://samtools.sourceforge.net/samtools.shtml
+
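+For reference, samtools' sort command takes a BAM file and an output prefix, e.g. (names are illustrative)::
+
+  samtools sort input.bam sorted_output
+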
+------
+
+**Citation**
+
+For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505943&gt;`_
+
+  </help>
+</tool>

File tools/snap/snap

Binary file added.

File tools/snap/snap-alignment.py

View file
+#!/usr/bin/env python
+import sys, os, commands, string, subprocess, shutil
+
+galaxyhome=os.environ.get('GALAXY_HOME')
+
+
+n = len(sys.argv)
+for i in range(0,n):
+    print sys.argv[i]
+
+
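+#sys.argv[1] is the primary dataset produced by SNAP Build Index; the actual index
+#files are expected in that dataset's companion "<dataset>_files" directory.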
+IndexDIR = sys.argv[1]
+Index_FOLDER = IndexDIR.split('.')[0]+'_files'
+
+output = " -o " + sys.argv[2]
+
+if len(sys.argv)>4:
+    inputFASTA = "paired " + Index_FOLDER + " " + sys.argv[3] + " " + sys.argv[4]
+else:
+    inputFASTA = "single " + Index_FOLDER + " " + sys.argv[3]
+
+
+##uncomment when running on VM
+os.chdir(galaxyhome + "/tools/snap/")     #uncomment when running on cluster
+#os.chdir("/media/Work/galaxy-proteonics/tools/snap/")    #for local running
+
+command="./snap " + inputFASTA + output
+
+print command
+
+proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
+stderr_output = proc.communicate()[1]  #wait for snap to finish and capture its stderr
+if proc.returncode != 0:
+    sys.stderr.write( stderr_output )
+    sys.exit( proc.returncode )
+

File tools/snap/snap-alignment.xml

View file
+<?xml version="1.0"?>
+
+<tool name="SNAP Alignment" id="SNAP_Alignment_id">
+  <description>
+  </description>
+
+  <command interpreter="python">
+   snap-alignment.py
+
+   $IndexDIR
+
+   $output
+
+    #if $singlePaired.sPaired == "single":
+	$singlePaired.input1
+    #else:
+        $singlePaired.input1 $singlePaired.input2
+    #end if   
+
+  </command>
+
+  <inputs>
+        <conditional name="singlePaired">
+            <param name="sPaired" type="select" label="Is this library single-end or paired-end?">
+              <option value="single">Single-end</option>
+              <option value="paired">Paired-end</option>
+            </param>
+            <when value="single">
+                <param format="fastqsanger" name="input1" type="data" label="FASTQ file" />
+            </when>
+            <when value="paired">
+                <param format="fastqsanger" name="input1" type="data" label="FASTQ file, forward reads" />
+                <param format="fastqsanger" name="input2" type="data" label="FASTQ file, reverse reads" />
+            </when>
+        </conditional>
+
+    <param name="IndexDIR" type="data" label="Select an index dataset" help="This should be the output of SNAP Build Index"/>
+
+  </inputs>
+
+  <outputs>
+    <data name="output" format="sam" label="${tool.name} on ${on_string}.sam" />
+  </outputs>
+
+  <help>
+SNAP_ is a new sequence aligner that is 10-100x faster and simultaneously more accurate than existing tools like BWA, Bowtie2 and SOAP2. It runs on commodity x86 processors, and supports a rich error model that lets it cheaply match reads with more differences from the reference than other tools. This gives SNAP up to 2x lower error rates than existing tools and lets it match larger mutations that they may miss.
+
+.. _SNAP: http://snap.cs.berkeley.edu/
+
+SNAP was developed by a team from the UC Berkeley AMP Lab, Microsoft, and UCSF.
+
+SNAP 0.13.4 for Linux (64-bit)
+
+**Usage:**
+
+To build an index::
+
+  snap index hg19.fa index-dir
+
+To align unpaired reads::
+
+  snap single index-dir reads.fq -o output.sam
+
+To align paired-end reads::
+
+  snap paired index-dir read1.fq read2.fq -o output.sam
+
+
+  </help>
+
+</tool>

File tools/snap/snap-index.py

View file
+#!/usr/bin/env python
+import sys, os, commands, string, subprocess, shutil
+
+galaxyhome=os.environ.get('GALAXY_HOME')
+
+
+n = len(sys.argv)
+for i in range(0,n):
+    print sys.argv[i]
+
+
+inputFASTA = sys.argv[1]
+
+#the wrapper XML passes "none" when no seed size was entered; otherwise forward it via -s
+#(assumption: snap accepts "-s <seedSize>" after the positional arguments, as described in the tool's help text)
+if sys.argv[2]!="none":
+    seedSize = " -s " + sys.argv[2]
+else:
+    seedSize = ""
+
+outputIndex = sys.argv[3]
+
+outputIndex_FOLDER = outputIndex.split('.')[0]+'_files'
+
+if not os.path.exists(outputIndex_FOLDER):
+    os.makedirs(outputIndex_FOLDER)
+
+
+##uncomment when running on VM
+os.chdir(galaxyhome + "/tools/snap/")     #uncomment when running on cluster
+#os.chdir("/media/Work/galaxy-proteonics/tools/snap/")    #for local running
+
+command="./snap index " + inputFASTA + " " + outputIndex_FOLDER + seedSize
+
+print command
+
+proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
+stderr_output = proc.communicate()[1]  #wait for snap to finish and capture its stderr
+if proc.returncode != 0:
+    sys.stderr.write( stderr_output )
+    sys.exit( proc.returncode )
+

File tools/snap/snap-index.xml

View file
+<?xml version="1.0"?>
+
+<tool name="SNAP Build Index" id="SNAP_build_index_id">
+  <description>
+  </description>
+
+  <command interpreter="python">
+   snap-index.py
+
+   $inputFASTA
+
+    #if str( $seedSize ):
+        $seedSize
+    #else:
+	none
+    #end if   
+
+   $outputIndex
+
+  </command>
+
+  <inputs>
+    <param name="inputFASTA" type="data" format="fasta" label="Select a fasta file" />
+    <param name="seedSize" type="text" value="" label="Seed size (-s)" help="Optional -s value. A seed size of 20 is recommended for 100 bp reads and 22 for longer reads."/>
+
+  </inputs>
+
+  <outputs>
+    <data name="outputIndex" label="${tool.name} on ${on_string}" />
+  </outputs>
+
+  <help>
+SNAP_ is a new sequence aligner that is 10-100x faster and simultaneously more accurate than existing tools like BWA, Bowtie2 and SOAP2. It runs on commodity x86 processors, and supports a rich error model that lets it cheaply match reads with more differences from the reference than other tools. This gives SNAP up to 2x lower error rates than existing tools and lets it match larger mutations that they may miss.
+
+.. _SNAP: http://snap.cs.berkeley.edu/
+
+SNAP was developed by a team from the UC Berkeley AMP Lab, Microsoft, and UCSF.
+
+SNAP 0.13.4 for Linux (64-bit)
+
+**Usage:**
+
+To build an index::
+
+  snap index hg19.fa index-dir
+
+To align unpaired reads::
+
+  snap single index-dir reads.fq -o output.sam
+
+To align paired-end reads::
+
+  snap paired index-dir read1.fq read2.fq -o output.sam
+
+
+  </help>
+
+</tool>

File tools/sr_mapping/bwa_wrapper.py

View file
 """
 
 import optparse, os, shutil, subprocess, sys, tempfile
+import time  #liubo added
 
 def stop_err( msg ):
     sys.stderr.write( '%s\n' % msg )
     raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
 
 def __main__():
+    #liubo added: print the start time so BWA execution time can be computed from the job log
+    print "Start time:"
+    print time.strftime('%d/%m/%Y %H:%M:%S\n', time.localtime(time.time()))
+
     #Parse Command Line
     parser = optparse.OptionParser()
     parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
         try:
             tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
             tmp_stderr = open( tmp, 'wb' )
+
+            print cmd1  #liubo added
+
             proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
             returncode = proc.wait()
             tmp_stderr.close()
             try:
                 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
                 tmp_stderr = open( tmp, 'wb' )
+
+                print cmd2  #liubo added
+
                 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
                 returncode = proc.wait()
                 tmp_stderr.close()
                 if cmd2b:
                     tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
                     tmp_stderr = open( tmp, 'wb' )
+
+                    print cmd2b  #liubo added
+
                     proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
                     returncode = proc.wait()
                     tmp_stderr.close()
             try:
                 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
                 tmp_stderr = open( tmp, 'wb' )
+
+                print cmd3  #liubo added
+
                 proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
                 returncode = proc.wait()
                 tmp_stderr.close()
         if os.path.exists( tmp_dir ):
             shutil.rmtree( tmp_dir )
 
+    #liubo added: print the end time so BWA execution time can be computed from the job log
+    print "\n\nEnd time:"
+    print time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
 if __name__=="__main__": __main__()
+