Commits

James Taylor committed f15fb4e

Added:

- bed_complement.py: complements regions in a bed file (requires a LEN file)

- bed_intersect_basewise: base-by-base 'AND' of two bed files

Modified:

- bed_intersect: Now takes upstream/downstream padding to duplicate the
'proximity' operation in galaxy. Ouput can be reversed
(printing regions that do not overlap the bitset)

- maf_tile: minor fixes, no longer uses the '*' character for regions
lacking local alignment.

- bx/bitset.pyx: utility function for creating a dictionary of bitsets
from a tabular data file

- bx/binned_array.py: added a get_range operation that more efficiently
walks the bins. Python slice syntax now supported.

  • Participants
  • Parent commits 54c058a

Comments (0)

Files changed (7)

bed_complement.py

+#!/usr/bin/env python
+
+"""
+Complement the regions of a bed file
+
+usage: %prog bed_file chrom_length_file
+"""
+import sys
+from warnings import warn
+from bx.bitset import *
+import cookbook.doc_optparse
+
+def read_len( f ):
+    """Read a 'LEN' file and return a mapping from chromosome to length"""
+    mapping = dict()
+    for line in f:
+        fields = line.split()
+        mapping[ fields[0] ] = int( fields[1] )
+    return mapping
+
+options, args = cookbook.doc_optparse.parse( __doc__ )
+try:
+    in_fname, len_fname = args
+except:
+    cookbook.doc_optparse.exit()
+
+bitsets = binned_bitsets_from_file( open( in_fname ) )
+
+lens = read_len( open( len_fname ) )
+
+# chrom_col, start_col, end_col = 0, 1, 2
+# 
+# last_chrom = None
+# last_bitset = None
+# bitsets = dict() 
+# for line in open( in_fname ):
+#     fields = line.split()
+#     chrom = fields[chrom_col]
+#     if chrom != last_chrom:
+#         if chrom not in bitsets:
+#             bitsets[chrom] = BitSet( lens[chrom] ) 
+#         last_chrom = chrom
+#         last_bitset = bitsets[chrom]
+#     start, end = int( fields[start_col] ), int( fields[end_col] )
+#     if start > end: warn( "Interval start after end!" )
+#     last_bitset.set_range( start, end-start )
+
+for chrom in lens:
+    if chrom in bitsets:
+        bits = bitsets[chrom]
+        bits.invert()
+        len = lens[chrom]
+        end = 0
+        while 1:
+            start = bits.next_set( end )
+            if start == bits.size: break
+            end = bits.next_clear( start )
+            if end > len: end = len
+            print "%s\t%d\t%d" % ( chrom, start, end )
+            if end == len: break
+    else:
+        print "%s\t%d\t%d" % ( chrom, 0, lens[chrom] )
 
 usage: %prog bed_file_1 bed_file_2
     -m, --mincols=N: Require this much overlap 
+    -u, --upstream_pad=N: upstream interval padding
+    -d, --downstream_pad=N: downstream interval padding
+    -v, --reverse: Print regions that DO NOT overlap
 """
 import sys
 from warnings import warn
-from bx.bitset import BinnedBitSet
+from bx.bitset import binned_bitsets_from_file
 import cookbook.doc_optparse
 
+mincols = 1
+upstream_pad = 0
+downstream_pad = 0
+
 options, args = cookbook.doc_optparse.parse( __doc__ )
 try:
     if options.mincols: mincols = int( options.mincols )
-    else: mincols = 1
+    if options.upstream_pad: upstream_pad = int( options.upstream_pad )
+    if options.downstream_pad: downstream_pad = int( options.downstream_pad )
+    reverse = bool( options.reverse )
     in_fname, in2_fname = args
 except:
     cookbook.doc_optparse.exit()
 
+# Read first bed into some bitsets
 
-last_chrom = None
-last_bitset = None
-bitsets = dict() 
-
-for line in open( in2_fname ):
-    fields = line.split()
-    if fields[0] != last_chrom:
-        if fields[0] not in bitsets:
-            bitsets[fields[0]] = BinnedBitSet() 
-        last_chrom = fields[0]
-        last_bitset = bitsets[fields[0]]
-    start, end = int( fields[1] ), int( fields[2] )
-    if start > end: warn( "Bed interval start after end!" )
-    last_bitset.set_range( start, end-start )
+bitsets = binned_bitsets_from_file( open( in_fname ) )
 
 # Read second BED and intersect
 
     start, end = int( fields[1] ), int( fields[2] )
     if start > end: warn( "Bed interval start after end!" )
     if bitsets[fields[0]].count_range( start, end-start ) >= mincols:
+        if not reverse: print line,
+    elif reverse:
         print line,

bed_intersect_basewise.py

+#!/usr/bin/env python
+
+"""
+Find regions of first bed file that overlap regions in a second bed file
+
+usage: %prog bed_file_1 bed_file_2
+"""
+import sys
+from warnings import warn
+from bx.bitset import BinnedBitSet
+import cookbook.doc_optparse
+
+def read_bed( f ):
+    last_chrom = None
+    last_bitset = None
+    bitsets = dict() 
+    for line in f:
+        fields = line.split()
+        if fields[0] != last_chrom:
+            if fields[0] not in bitsets:
+                bitsets[fields[0]] = BinnedBitSet() 
+            last_chrom = fields[0]
+            last_bitset = bitsets[fields[0]]
+        start, end = int( fields[1] ), int( fields[2] )
+        if start > end: warn( "Bed interval start after end!" )
+        last_bitset.set_range( start, end-start )
+    return bitsets   
+
+options, args = cookbook.doc_optparse.parse( __doc__ )
+try:
+    in_fname, in2_fname = args
+except:
+    cookbook.doc_optparse.exit()
+
+bits1 = read_bed( open( in_fname ) )
+bits2 = read_bed( open( in2_fname ) )
+
+bitsets = dict()
+
+for key in bits1:
+    if key in bits2:
+        bits1[key].iand( bits2[key] )
+        bitsets[key] = bits1[key]
+
+for chrom in bitsets:
+    bits = bitsets[chrom]
+    end = 0
+    while 1:
+        start = bits.next_set( end )
+        if start == bits.size: break
+        end = bits.next_clear( start )
+        print "%s\t%d\t%d" % ( chrom, start, end )

bx/binned_array.py

 MAX=512*1024*1024 
 
 class BinnedArray( object ):
-    def __init__( self, granularity=1024, default=NaN, max_size=MAX ):
-        self.max_size = MAX
-        self.bin_size = int( math.ceil( ( max_size / granularity ) ) )
+    def __init__( self, bin_size=512*1024, default=NaN, max_size=MAX ):
+        self.max_size = max_size
+        self.bin_size = bin_size
         self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
         self.bins = [ None ] * self.nbins
         self.default = default
         return index // self.bin_size, index % self.bin_size
     def init_bin( self, index ):
         # self.bins[index] = zeros( self.bin_size ) * self.default
-        self.bins[index] = array( [ self.default ] * self.bin_size, typecode="f" )
-    def __getitem__( self, key ):
+        self.bins[index] = resize( array(self.default, typecode="f"), (self.bin_size,) )
+    def get( self, key ):
         bin, offset = self.get_bin_offset( key )
         if self.bins[bin]:
             return self.bins[bin][offset]
         else:
             return self.default
-    def __setitem__( self, key, value ):
+    def set( self, key, value ):
         bin, offset = self.get_bin_offset( key )
         if not self.bins[bin]: self.init_bin( bin )
         self.bins[bin][offset] = value
-
+    def get_range( self, start, end ):
+        size = end - start
+        assert size >= 0
+        rval = []
+        while size > 0:
+            bin, offset = self.get_bin_offset( start )
+            delta = self.bin_size - offset
+            if self.bins[bin] is None:
+                if delta < size:
+                    rval.append( resize( array(self.default, typecode="f"), (delta,) ) )
+                    size -= delta
+                    start += delta
+                else:
+                    rval.append( resize( array(self.default, typecode="f"), (size,) ) )
+                    size = 0
+            else:
+                if delta < size:
+                    rval.append( self.bins[bin][offset:offset+delta] )
+                    size -= delta
+                    start += delta
+                else:
+                    rval.append( self.bins[bin][offset:offset+size] )
+                    size = 0
+        return concatenate( rval )
+    def __getitem__( self, key ):
+        if isinstance( key, slice ):
+            start, stop, stride = key.indices( self.max_size )
+            assert stride == 1, "Slices with strides are not supported"
+            return self.get_range( start, stop )
+        else:
+            return self.get( key )
+    def __setitem__( self, key, value ):
+        return self.set( key, value )
+    
+    
 if __name__ == "__main__":
     source = []
     for i in range( 13 ):
     # Verify
     for i in range( len( source ) ):
         assert source[i] == target[i], "No match, index: %d, source: %d, target: %d" % ( i, source[i], target[i] )
+    # Verfiy with slices
+    for i in range( 10 ):
+        a = int( random() * len( source ) )
+        b = int( random() * len( source ) )
+        if b < a: a, b = b, a
+        assert source[a:b] == target[a:b], "No match, index: %d:%d, source: %s, target: %s" % \
+            ( a, b, ",".join( map( str, source[a:a+10] ) ), ",".join( map( str, target[a:a+10] ) ) )
     def ior( self, BinnedBitSet other ):
         binBitsOr( self.bb, other.bb )
     def invert( self ):
-        binBitsNot( self.bb )
+        binBitsNot( self.bb )
+        
+## ---- Utility functions ---------------------------------------------
+
+from warnings import warn
+
+def binned_bitsets_from_file( f, chrom_col=0, start_col=1, end_col=2, upstream_pad=0, downstream_pad=0 ):
+    """Read a file into a dictionary of bitsets"""
+    last_chrom = None
+    last_bitset = None
+    bitsets = dict() 
+    for line in f:
+        fields = line.split()
+        chrom = fields[chrom_col]
+        if chrom != last_chrom:
+            if chrom not in bitsets:
+                bitsets[chrom] = BinnedBitSet() 
+            last_chrom = chrom
+            last_bitset = bitsets[chrom]
+        start, end = int( fields[start_col] ), int( fields[end_col] )
+        if upstream_pad: start = max( 0, start - upstream_pad )
+        if downstream_pad: end = min( MAX, end + downstream_pad )
+        if start > end: warn( "Interval start after end!" )
+        last_bitset.set_range( start, end-start )
+    return bitsets 
 import cookbook.doc_optparse
 
 import bx.align.maf
+import bx.align as align
 from bx import misc
 import bx.seq.nib
 import os
 
 def do_interval( sources, index, out, ref_src, start, end, seq_db ):
 
-    assert sources[0].split('.')[0] == ref_src.split('.')[0]
+    assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" % ( sources[0].split('.')[0], ref_src.split('.')[0] )
 
     base_len = end - start
         
     # print len( blocks )
     # print blocks[0]
     
+    ref_src_size = None
     for i, block in enumerate( blocks ):
         ref = block.get_component_by_src_start( ref_src )
         ref_src_size = ref.src_size
 
     for ss, ee, index in intervals_from_mask( mask ):
         if index < 0:
-            tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref.src ] ) ).get( start+ss, ee-ss ) )
+            tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).get( start+ss, ee-ss ) )
             for row in tiled[1:]:
-                row.append( "*" * ( ee - ss ) )
+                row.append( "-" * ( ee - ss ) )
         else:
             slice_start = start + ss
             slice_end = start + ee
         text = "".join( tiled[i] )
         size = len( text ) - text.count( "-" )
         if i == 0:
+            if ref_src_size is None: ref_src_size = bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).length
             c = align.Component( ref_src, start, end-start, "+", ref_src_size, text )
         else:
             c = align.Component( name + ".fake", 0, size, "?", size, text )
 aggregate_scores_in_intervals.py
 align_print_template.py
 bed_intersect.py
+bed_intersect_basewise.py
 bed_merge_overlapping.py
 bed_coverage.py
+bed_complement.py
 find_in_sorted_file.py
 int_seqs_to_char_strings.py
 interval_count_intersections.py