1. James Taylor
  2. bx-python

Commits

James Taylor  committed 246a539

Fixed a bug in count_range for BinnedBitset
Added bed_coverage, bed_intersect, and bed_merge_overlapping (all using binned bitsets)

  • Participants
  • Parent commits bd7be90
  • Branches default

Comments (0)

Files changed (8)

File aggregate_scores_in_intervals.py

View file
 import psyco_full
 import bx.wiggle
 from bx.binned_array import BinnedArray
+from fpconst import isNaN
 import cookbook.doc_optparse
+import misc
 
 def read_scores( f ):
     scores_by_chrom = dict()
     except:
         cookbook.doc_optparse.exit()
 
-    scores_by_chrom = read_scores( open( sys.argv[1] ) )
+    scores_by_chrom = read_scores( misc.open_compressed( sys.argv[1] ) )
     for line in open( sys.argv[2] ):
         fields = line.split()
         chrom, start, stop = fields[0], int( fields[1] ), int( fields[2] )
         for i in range( start, stop ):
             if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                 score = scores_by_chrom[chrom][i]
-                total += score
-                count += 1
-                max_score = max( score, max_score )
-                min_score = min( score, min_score )
+                if not isNaN( score ):
+                    total += score
+                    count += 1
+                    max_score = max( score, max_score )
+                    min_score = min( score, min_score )
         if total > 0:
             avg = total/count
         else:

File bed_coverage.py

View file
+#!/usr/bin/env python
+
+"""
+Print number of bases covered by intervals in a bed file
+
+usage: %prog bed files ...
+"""
+
+import psyco_full
+import sys
+from bx.bitset import BinnedBitSet
+from itertools import *
+
+bed_filenames = sys.argv[1:]
+if bed_filenames:
+    input = chain( * imap( open, bed_filenames ) )
+else:
+    input = sys.stdin
+
+last_chrom = None
+last_bitset = None
+bitsets = dict() 
+
+for line in input:
+        if line.startswith("#") or line.startswith("track"): continue
+        fields = line.split()
+        if fields[0] != last_chrom:
+            if fields[0] not in bitsets:
+                bitsets[fields[0]] = BinnedBitSet()
+            last_chrom = fields[0]
+            last_bitset = bitsets[fields[0]]
+        start, end = int( fields[1] ), int( fields[2] )
+        if start > end: print >>sys.stderr, "Bed interval start after end: " + line.strip()
+        last_bitset.set_range( start, end-start )
+
+total = 0
+for chrom in bitsets:
+    total += bitsets[chrom].count_range( 0, bitsets[chrom].size )
+
+print total    

File bed_intersect.py

View file
+#!/usr/bin/env python
+
+"""
+Find regions of one bed file that overlap regions in a second bed file
+usage: %prog bed_file_1 bed_file_2
+"""
+import sys
+from warnings import warn
+from bx.bitset import BinnedBitSet
+
+in_fname, in2_fname = sys.argv[1:]
+
+last_chrom = None
+last_bitset = None
+bitsets = dict() 
+
+for line in open( in2_fname ):
+    fields = line.split()
+    if fields[0] != last_chrom:
+        if fields[0] not in bitsets:
+            bitsets[fields[0]] = BinnedBitSet() 
+        last_chrom = fields[0]
+        last_bitset = bitsets[fields[0]]
+    start, end = int( fields[1] ), int( fields[2] )
+    if start > end: warn( "Bed interval start after end!" )
+    last_bitset.set_range( start, end-start )
+
+# Read second BED and intersect
+
+for line in open( in_fname ):
+    fields = line.split()
+    if fields[0] not in bitsets: continue
+    start, end = int( fields[1] ), int( fields[2] )
+    if start > end: warn( "Bed interval start after end!" )
+    ns = bitsets[fields[0]].next_set( start )
+    if ns is not None and ns < end: 
+        print line,

File bed_merge_overlapping.py

View file
 #!/usr/bin/env python
 
 """
-Merge any overlapping regions of a bed file.
-Uses bitset projection similar to 'featureBits'
+Merge any overlapping regions of bed files.
+
+usage: %prog bed files ...
 """
 
+import psyco_full
 import sys
 from bx.bitset import BinnedBitSet
 
-in_fname, out_fname = sys.argv[1:]
+bed_filenames = sys.argv[1:]
 
 last_chrom = None
 last_bitset = None
 bitsets = dict() 
 
-for line in open( in_fname ):
-    fields = line.split()
-    if fields[0] != last_chrom:
-        if fields[0] not in bitsets:
-            bitsets[fields[0]] = BinnedBitSet()
-        last_chrom = fields[0]
-        last_bitset = bitsets[fields[0]]
-    start, end = int( fields[1] ), int( fields[2] )
-    assert start <= end, "Bed interval start must be less than end"
-    last_bitset.set_range( start, end-start )
+for fname in bed_filenames:
+    for line in open( fname ):
+        if line.startswith("#") or line.startswith("track"): continue
+        fields = line.split()
+        if fields[0] != last_chrom:
+            if fields[0] not in bitsets:
+                bitsets[fields[0]] = BinnedBitSet()
+            last_chrom = fields[0]
+            last_bitset = bitsets[fields[0]]
+        start, end = int( fields[1] ), int( fields[2] )
+        if start > end: print >>sys.stderr, "Bed interval start after end: " + line.strip()
+        last_bitset.set_range( start, end-start )
 
 for chrom in bitsets:
     bits = bitsets[chrom]
     end = 0
     while 1:
         start = bits.next_set( end )
-        if start is None: break
+        if start == bits.size: break
         end = bits.next_clear( start )
         print "%s\t%d\t%d" % ( chrom, start, end )

File bx/binned_array.py

View file
 
 import math
 
+from fpconst import *
 from Numeric import *
 from RandomArray import *
 
 MAX=512*1024*1024 
 
 class BinnedArray( object ):
-    def __init__( self, granularity=1024, default=None, max_size=MAX ):
+    def __init__( self, granularity=1024, default=NaN, max_size=MAX ):
         self.max_size = MAX
         self.bin_size = int( math.ceil( ( max_size / granularity ) ) )
         self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
         return index // self.bin_size, index % self.bin_size
     def init_bin( self, index ):
         # self.bins[index] = zeros( self.bin_size ) * self.default
-        self.bins[index] = [ self.default ] * self.bin_size
+        self.bins[index] = array( [ self.default ] * self.bin_size, typecode="f" )
     def __getitem__( self, key ):
         bin, offset = self.get_bin_offset( key )
         if self.bins[bin]:

File bx/bitset.pyx

View file
     def __dealloc__( self ):
         bitFree( & self.bits )
 
-    cdef set( self, int index ):
+    property size:
+        def __get__( self ):
+            return self.bitCount
+
+    def set( self, index ):
         bitSetOne( self.bits, index )
 
-    cdef clear( self, int index ):
+    def clear( self, index ):
         bitClearOne( self.bits, index )
 
-    cdef set_range( self, int start, int count ):   
+    def set_range( self, start, count ):   
         bitSetRange( self.bits, start, count )
 
     ## cdef clear_range( self, int start, int count ):
     ##    bitClearRange( self.bits, start, count )
 
-    cdef int get( self, int index ):
+    def get( self, index ):
         return bitReadOne( self.bits, index );
     
-    cdef count_in_range( self, int start, int count ):
+    def count_range( self, start, count ):
         return bitCountRange( self.bits, start, count )
 
-    cdef int next_set( self, int start ):
-        return bitFindSet( self.bits, start, self.bitCount )
+    def next_set( self, start, end=None ):
+        if end == None: end = self.bitCount
+        return bitFindSet( self.bits, start, end )
     
-    cdef int next_clear( self, int start ):
-        return bitFindClear( self.bits, start, self.bitCount )
+    def next_clear( self, start, end=None ):
+        if end == None: end = self.bitCount
+        return bitFindClear( self.bits, start, end )
 
-    cdef iand( self, BitSet other ):
+    def iand( self, BitSet other ):
         bitAnd( self.bits, other.bits, self.bitCount )
         
-    cdef ior( self, BitSet other ): 
+    def ior( self, BitSet other ): 
         bitOr( self.bits, other.bits, self.bitCount )
 
-    cdef bitXor( self, BitSet other ): 
+    def bitXor( self, BitSet other ): 
         bitXor( self.bits, other.bits, self.bitCount )
 
-    cdef invert( self ):
+    def invert( self ):
         bitNot( self.bits, self.bitCount)
 
     ## ---- Python "Operator Overloading" ----
         
-    cdef __getitem__( self, int index ):
+    def __getitem__( self, index ):
         return self.get( index )
 
-    cdef __iand__( self, BitSet other ):
+    def __iand__( self, other ):
         self.iand( other )
+        return self
 
-    cdef __ior__( self, BitSet other ):
+    def __ior__( self, other ):
         self.ior( other )
+        return self
 
-    cdef __invert__( self ):
+    def __invert__( self ):
         self.invert()
+        return self
         
 MAX=512*1024*1024 
 
     def next_set( self, start ):
         return binBitsFindSet( self.bb, start )
     def next_clear( self, start ):
-        return binBitsFindClear( self.bb, size )
-
+        return binBitsFindClear( self.bb, start )
+    property size:
+        def __get__( self ):
+            return self.bb.size

File scripts.list

View file
 aggregate_scores_in_intervals.py
 align_print_template.py
+bed_intersect.py
+bed_merge_overlapping.py
+bed_coverage.py
 find_in_sorted_file.py
 int_seqs_to_char_strings.py
 interval_count_intersections.py

File src/binBits.c

View file
     {
         int bin = binBitsGetBin( bb, start );  
         int offset = binBitsGetOffset( bb, start );
-        if ( bb->bins[bin] == NULL ) continue;
         delta = bb->bin_size - offset;
-        if ( delta < size )
+        if ( bb->bins[bin] == NULL )
         {
-            count += bitCountRange( bb->bins[bin], start, delta );
+            start += delta;
+            size -= delta;
+        }
+        else if ( delta < size )
+        {
+            count += bitCountRange( bb->bins[bin], offset, delta );
             size -= delta;
             start += delta;
         }
         else
         {
-            count += bitCountRange( bb->bins[bin], start, size );
+            count += bitCountRange( bb->bins[bin], offset, size );
             size = 0;
         } 
     }
+    return count;
 }
 
 int binBitsFindSet( struct BinBits *bb, int start )