Commits

James Taylor committed e630c6c

"bed_merge_overlapping" merges all overlapping intervals in a bed file. It uses projection onto a bitset
like featureBits, but the bitset is broken up into bins which makes things much faster for smaller data
sets.

  • Participants
  • Parent commits 58c08d7

Comments (0)

Files changed (2)

File bed_merge_overlapping.py

+#!/usr/bin/env python
+
+"""
+Merge any overlapping regions of a bed file.
+Uses bitset projection similar to 'featureBits'
+"""
+
+import sys
+from bx.bitset import BinnedBitSet
+
+in_fname, out_fname = sys.argv[1:]
+
+last_chrom = None
+last_bitset = None
+bitsets = dict() 
+
+for line in open( in_fname ):
+    fields = line.split()
+    if fields[0] != last_chrom:
+        if fields[0] not in bitsets:
+            bitsets[fields[0]] = BinnedBitSet()
+        last_chrom = fields[0]
+        last_bitset = bitsets[fields[0]]
+    start, end = int( fields[1] ), int( fields[2] )
+    assert start <= end, "Bed interval start must be less than end"
+    last_bitset.set_range( start, end-start )
+
+for chrom in bitsets:
+    bits = bitsets[chrom]
+    end = 0
+    while 1:
+        start = bits.next_set( end )
+        if start is None: break
+        end = bits.next_clear( start )
+        print "%s\t%d\t%d" % ( chrom, start, end )

File bx/bitset.pyx

     cdef count_in_range( self, int start, int count ):
         return bitCountRange( self.bits, start, count )
 
-    cdef int find_next_set( self, int start ):
+    cdef int next_set( self, int start ):
         return bitFindSet( self.bits, start, self.bitCount )
     
-    cdef int find_next_clear( self, int start ):
+    cdef int next_clear( self, int start ):
         return bitFindClear( self.bits, start, self.bitCount )
 
     cdef iand( self, BitSet other ):
 
 class BinnedBitSet:
     def __init__( self, int granularity=1024, int max_size=MAX ):
-        self.max_size = MAX
+        self.max_size = max_size
         self.bin_size = int( math.ceil( ( max_size / granularity ) ) )
         self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
         self.bins = [ 0 ] * self.nbins
         bin, offset = self.get_bin_offset( index )
         if self.bins[bin] == 0 or self.bins[bin] == 1: self.init_bin( bin )
         self.bins[bin].set( offset )            
-        
+    def set_range( self, start, size ):
+        cdef BitSet bin
+        while size > 0:
+            bin_index, offset = self.get_bin_offset( start )
+            if self.bins[bin_index] == 0 or self.bins[bin_index] == 1: self.init_bin( bin_index )
+            bin = self.bins[bin_index]
+            bin_size = self.bin_size
+            amount = bin_size - offset
+            if amount < size:
+                bin.set_range( offset, amount )
+                size = size - amount
+                start = start + amount
+            else:
+                bin.set_range( offset, size )
+                size = 0
+    def next_set( self, start ):
+        cdef BitSet bin
+        bin_index, offset = self.get_bin_offset( start )
+        while bin_index < self.nbins:
+            if self.bins[bin_index] == 0:
+                bin_index = bin_index + 1
+                offset = 0
+                continue
+            bin = self.bins[bin_index]
+            ns = bin.next_set( offset )
+            if ns < self.bin_size:
+                return (bin_index*self.bin_size) + ns
+            else:
+                bin_index = bin_index + 1
+                offset = 0
+        else:
+            return None
+    def next_clear( self, start ):
+        cdef BitSet bin
+        bin_index, offset = self.get_bin_offset( start )
+        while bin_index < self.nbins:
+            if self.bins[bin_index] == 0:
+                bin_index = bin_index + 1
+                offset = 0
+                continue
+            bin = self.bins[bin_index]
+            ns = bin.next_clear( offset )
+            if ns < self.bin_size:
+                return (bin_index*self.bin_size) + ns
+            else:
+                bin_index = bin_index + 1
+                offset = 0
+        else:
+            return None