Commits

James Taylor committed da1ad4e

Cleaning and documenting some of these operations

  • Participants
  • Parent commits c16ddd9

Comments (0)

Files changed (7)

File bed_complement.py

 #!/usr/bin/env python
 
 """
-Complement the regions of a bed file
+Complement the regions of a bed file. Requires a file that maps source names
+to sizes. This should be in the simple LEN file format (each line contains
+a source name followed by a size, separated by whitespace).
 
 usage: %prog bed_file chrom_length_file
 """
+
 import sys
-from warnings import warn
+
 from bx.bitset import *
 from bx.bitset_builders import *
+
 import cookbook.doc_optparse
 
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
 def read_len( f ):
     """Read a 'LEN' file and return a mapping from chromosome to length"""
     mapping = dict()

File bed_coverage.py

 #!/usr/bin/env python
 
 """
-Print number of bases covered by intervals in a bed file
+Print number of bases covered by intervals in a bed file. Bed files can be
+provided on the command line or to stdin.
 
 usage: %prog bed files ...
 """
 from bx.bitset import BinnedBitSet
 from itertools import *
 
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
 bed_filenames = sys.argv[1:]
 if bed_filenames:
     input = chain( * imap( open, bed_filenames ) )
 else:
     input = sys.stdin
 
-last_chrom = None
-last_bitset = None
-bitsets = dict() 
-
-for line in input:
-        if line.startswith("#") or line.startswith("track"): continue
-        fields = line.split()
-        if fields[0] != last_chrom:
-            if fields[0] not in bitsets:
-                bitsets[fields[0]] = BinnedBitSet()
-            last_chrom = fields[0]
-            last_bitset = bitsets[fields[0]]
-        start, end = int( fields[1] ), int( fields[2] )
-        if start > end: print >>sys.stderr, "Bed interval start after end: " + line.strip()
-        last_bitset.set_range( start, end-start )
+bitsets = binned_bitsets_from_file( open( in_fname ) )
 
 total = 0
 for chrom in bitsets:

File bed_intersect.py

 Find regions of first bed file that overlap regions in a second bed file
 
 usage: %prog bed_file_1 bed_file_2
-    -m, --mincols=N: Require this much overlap 
-    -u, --upstream_pad=N: upstream interval padding
-    -d, --downstream_pad=N: downstream interval padding
+    -m, --mincols=N: Require this much overlap (default 1bp)
+    -u, --upstream_pad=N: upstream interval padding (default 0bp)
+    -d, --downstream_pad=N: downstream interval padding (default 0bp)
     -v, --reverse: Print regions that DO NOT overlap
 """
+
 import sys
 from warnings import warn
-from bx.bitset_builders import binned_bitsets_from_file
+
+from bx.bitset import *
+from bx.bitset_builders import *
+
 import cookbook.doc_optparse
 
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
 mincols = 1
 upstream_pad = 0
 downstream_pad = 0

File bed_intersect_basewise.py

 #!/usr/bin/env python
 
 """
-Find regions of first bed file that overlap regions in a second bed file
+Find regions of first bed file that overlap regions in a second bed file. This
+program performs a base-by-base intersection, so only runs of bases that are
+covered in both of the inputs will be output.
 
 usage: %prog bed_file_1 bed_file_2
 """
 import sys
 from warnings import warn
-from bx.bitset import BinnedBitSet
+from bx.bitset import *
+from bx.bitset_builders import *
 import cookbook.doc_optparse
 
-def read_bed( f ):
-    last_chrom = None
-    last_bitset = None
-    bitsets = dict() 
-    for line in f:
-        fields = line.split()
-        if fields[0] != last_chrom:
-            if fields[0] not in bitsets:
-                bitsets[fields[0]] = BinnedBitSet() 
-            last_chrom = fields[0]
-            last_bitset = bitsets[fields[0]]
-        start, end = int( fields[1] ), int( fields[2] )
-        if start > end: warn( "Bed interval start after end!" )
-        last_bitset.set_range( start, end-start )
-    return bitsets   
+import pkg_resources
+pkg_resources.require( "bx-python" )
 
 options, args = cookbook.doc_optparse.parse( __doc__ )
 try:
 except:
     cookbook.doc_optparse.exit()
 
-bits1 = read_bed( open( in_fname ) )
-bits2 = read_bed( open( in2_fname ) )
+bits1 = binned_bitsets_from_file( open( in_fname ) )
+bits2 = binned_bitsets_from_file( open( in2_fname ) )
 
 bitsets = dict()
 

File bed_merge_overlapping.py

 #!/usr/bin/env python
 
 """
-Merge any overlapping regions of bed files.
+Merge any overlapping regions of bed files. Bed files can be provided on the
+command line or on stdin. Merged regions are always reported on the '+' 
+strand, and any fields beyond chrom/start/stop are lost. 
 
 usage: %prog bed files ...
 """
 
 import psyco_full
 import sys
-from bx.bitset import BinnedBitSet
+
+from bx.bitset import *
+from bx.bitset_builders import *
 from itertools import *
 
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
 bed_filenames = sys.argv[1:]
 if bed_filenames:
     input = chain( * imap( open, bed_filenames ) )
 else:
     input = sys.stdin
 
-last_chrom = None
-last_bitset = None
-bitsets = dict() 
-
-for line in input:
-        if line.startswith("#") or line.startswith("track"): continue
-        fields = line.split()
-        if fields[0] != last_chrom:
-            if fields[0] not in bitsets:
-                bitsets[fields[0]] = BinnedBitSet()
-            last_chrom = fields[0]
-            last_bitset = bitsets[fields[0]]
-        start, end = int( fields[1] ), int( fields[2] )
-        if start > end: print >>sys.stderr, "Bed interval start after end: " + line.strip()
-        last_bitset.set_range( start, end-start )
+bitsets = binned_bitsets_from_file( input )
 
 for chrom in bitsets:
     bits = bitsets[chrom]

File bed_subtract_basewise.py

 from bx.bitset_builders import binned_bitsets_from_file
 import cookbook.doc_optparse
 
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
 def read_len( f ):
     """Read a 'LEN' file and return a mapping from chromosome to length"""
     mapping = dict()

File bx/bitset_builders.py

 from warnings import warn
 from bx.bitset import *
 
-def binned_bitsets_from_file( f, chrom_col=0, start_col=1, end_col=2, upstream_pad=0, downstream_pad=0, lens={} ):
-    """Read a file into a dictionary of bitsets"""
+def binned_bitsets_from_file( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={} ):
+    """
+    Read a file into a dictionary of bitsets. The defaults arguments 
+    
+    - 'f' should be a file like object (or any iterable containing strings)
+    - 'chrom_col', 'start_col', and 'end_col' must exist in each line. 
+    - 'strand_col' is optional, any line without it will be assumed to be '+'
+    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
+      chromosomes will be assumed to be the maximum size
+    """
     last_chrom = None
     last_bitset = None
     bitsets = dict() 
     for line in f:
-        if line.startswith("#"): continue
+        if line.startswith("#"):
+            continue
         fields = line.split()
+        strand = "+"
+        if len(fields) > strand_col:
+            if fields[strand_col] == "-": strand = "-"
         chrom = fields[chrom_col]
         if chrom != last_chrom:
             if chrom not in bitsets:
             last_chrom = chrom
             last_bitset = bitsets[chrom]
         start, end = int( fields[start_col] ), int( fields[end_col] )
+        # Switch to '+' strand coordinates if not already
+        if strand == '-':
+            start = size - end
+            end = size - start
         if upstream_pad: start = max( 0, start - upstream_pad )
-        if downstream_pad: end = min( MAX, end + downstream_pad )
-#        if start > end: warn( "Interval start after end!" )
+        if downstream_pad: end = min( size, end + downstream_pad )
+        if start > end: warn( "Interval start after end!" )
         last_bitset.set_range( start, end-start )
     return bitsets