Commits

Daniel Blankenberg committed 926c93b

Add Slice BAM by regions tool.

  • Participants
  • Parent commits 5e53e5d

Comments (0)

Files changed (3)

tool_conf.xml.sample

     <tool file="samtools/pileup_interval.xml" />
     <tool file="samtools/samtools_flagstat.xml" />
     <tool file="samtools/samtools_rmdup.xml" />
+    <tool file="samtools/samtools_slice_bam.xml" />
   </section>
   <section name="NGS: GATK Tools (beta)" id="gatk">
     <label text="Alignment Utilities" id="gatk_bam_utilities"/>

tools/samtools/samtools_slice_bam.py

+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A wrapper script for slicing a BAM file by provided BED file using SAMTools.
+%prog input_filename.sam output_filename.bam
+"""
+#TODO: Confirm that the sort is necessary e.g. if input regions are out of order
+
+
+import sys, optparse, os, tempfile, subprocess, shutil
+
+CHUNK_SIZE = 2**20 #1mb
+
+def cleanup_before_exit( tmp_dir ):
+    if tmp_dir and os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    (options, args) = parser.parse_args()
+    
+    assert len( args ) == 4, "Invalid command line: samtools_slice_bam.py input.bam input.bam.bai input.interval output.bam"
+    input_bam_filename, input_index_filename, input_interval_filename, output_bam_filename = args
+    
+    tmp_dir = tempfile.mkdtemp( prefix='tmp-samtools_slice_bam-' )
+    
+    tmp_input_bam_filename = os.path.join( tmp_dir, 'input_bam.bam' )
+    os.symlink( input_bam_filename, tmp_input_bam_filename )
+    os.symlink( input_index_filename, "%s.bai" % tmp_input_bam_filename )
+    
+    #Slice BAM
+    unsorted_bam_filename = os.path.join( tmp_dir, 'unsorted.bam' )
+    unsorted_stderr_filename = os.path.join( tmp_dir, 'unsorted.stderr' )
+    cmd = 'samtools view -b -L "%s" "%s" > "%s"' % ( input_interval_filename, tmp_input_bam_filename, unsorted_bam_filename )
+    proc = subprocess.Popen( args=cmd, stderr=open( unsorted_stderr_filename, 'wb' ), shell=True, cwd=tmp_dir )
+    return_code = proc.wait()
+    if return_code:
+        stderr_target = sys.stderr
+    else:
+        stderr_target = sys.stdout
+    stderr = open( unsorted_stderr_filename )
+    while True:
+        chunk = stderr.read( CHUNK_SIZE )
+        if chunk:
+            stderr_target.write( chunk )
+        else:
+            break
+    stderr.close()
+    
+    #sort sam, so indexing will not fail
+    #TODO: confirm if sorting is necessary (is original BAM order maintained, or does the output follow the order of input intervals?)
+    sorted_stderr_filename = os.path.join( tmp_dir, 'sorted.stderr' )
+    sorting_prefix = os.path.join( tmp_dir, 'sorted_bam' )
+    cmd = 'samtools sort -o "%s" "%s" > "%s"' % ( unsorted_bam_filename, sorting_prefix, output_bam_filename )
+    proc = subprocess.Popen( args=cmd, stderr=open( sorted_stderr_filename, 'wb' ), shell=True, cwd=tmp_dir )
+    return_code = proc.wait()
+    
+    if return_code:
+        stderr_target = sys.stderr
+    else:
+        stderr_target = sys.stdout
+    stderr = open( sorted_stderr_filename )
+    while True:
+        chunk = stderr.read( CHUNK_SIZE )
+        if chunk:
+            stderr_target.write( chunk )
+        else:
+            break
+    stderr.close()
+    
+    cleanup_before_exit( tmp_dir )
+
+if __name__=="__main__": __main__()

tools/samtools/samtools_slice_bam.xml

+<tool id="samtools_slice_bam" name="Slice BAM" version="0.0.1">
+  <description>by provided regions</description>
+  <requirements>
+      <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">samtools_slice_bam.py
+    "${input_bam}"
+    "${input_bam.metadata.bam_index}"
+    "${input_interval}"
+    "${output_bam}"
+  </command>
+  <inputs>
+    <param name="input_bam" type="data" format="bam" label="BAM file" />
+    <param name="input_interval" type="data" format="bed" label="BED file" />
+  </inputs>
+  <outputs>
+    <data format="bam" name="output_bam"/>
+  </outputs>
+  <tests>
+      <test>
+          <param name="input_bam" value="gatk/fake_phiX_reads_1.bam" ftype="bam" />
+          <param name="input_interval" value="gatk/fake_phiX_variant_locations.bed" ftype="bed" />
+          <output name="output_log" file="gatk/fake_phiX_reads_1.bam" ftype="bam" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+ Accepts an input BAM file and an input BED file and creates an output BAM file containing only those alignments that overlap the provided BED intervals.
+
+------
+
+**Citation**
+
+For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505943&gt;`_
+
+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
+
+  </help>
+</tool>