Commits

James Taylor  committed bd7be90

C implementation of binned bitsets.

  • Participants
  • Parent commits d9ae19f

Comments (0)

Files changed (5)

File bx/bitset.pyx

     ## # Print part or all of bit map as a string of 0s and 1s.  Mostly useful for
     ## void bitPrint(Bits *a, int startIx, int bitCount, FILE* out)
 
+cdef extern from "binBits.h":
+    struct BinBits:
+        int size
+        int bin_size
+        int nbins
+        Bits ** bins
+    BinBits* binBitsAlloc( int size, int granularity )
+    void binBitsFree( BinBits * bb )
+    int binBitsReadOne( BinBits * bb, int pos )
+    void binBitsSetOne( BinBits * bb, int pos )
+    void binBitsSetRange( BinBits *bb, int start, int size )
+    int binBitsCountRange( BinBits *bb, int start, int size )
+    int binBitsFindSet( BinBits *bb, int start )
+    int binBitsFindClear( BinBits *bb, int start )
+
+
 cdef class BitSet:
     cdef Bits * bits
     cdef int bitCount
     cdef __invert__( self ):
         self.invert()
         
-        
-
 MAX=512*1024*1024 
 
-import math
+cdef class BinnedBitSet:
+    cdef BinBits * bb
+    def __new__( self, int size=MAX, int granularity=1024 ):
+        self.bb = binBitsAlloc( size, granularity )
+    def __dealloc( self ):
+        binBitsFree( self.bb );
+    def __getitem__( self, pos ):
+        return binBitsReadOne( self.bb, pos )
+    def set( self, pos ):
+        binBitsSetOne( self.bb, pos )
+    def set_range( self, int start, size ):
+        binBitsSetRange( self.bb, start, size )
+    def count_range( self, start, size ):
+        return binBitsCountRange( self.bb, start, size )
+    def next_set( self, start ):
+        return binBitsFindSet( self.bb, start )
+    def next_clear( self, start ):
+        return binBitsFindClear( self.bb, size )
 
-class BinnedBitSet:
-    def __init__( self, int granularity=1024, int max_size=MAX ):
-        self.max_size = max_size
-        self.bin_size = int( math.ceil( ( max_size / granularity ) ) )
-        self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
-        self.bins = [ 0 ] * self.nbins
-    def get_bin_offset( self, index ):
-        return int( index / self.bin_size ), index % self.bin_size
-    def init_bin( self, index ):
-        self.bins[index] = BitSet( self.bin_size )
-    def __getitem__( self, key ):
-        bin, offset = self.get_bin_offset( key )
-        if self.bins[bin] == 0:
-            return 0
-        elif self.bins[bin] == 1:
-            return 1
-        else:
-            return self.bins[bin].get( offset )
-    def set( self, index ):
-        bin, offset = self.get_bin_offset( index )
-        if self.bins[bin] == 0 or self.bins[bin] == 1: self.init_bin( bin )
-        self.bins[bin].set( offset )            
-    def set_range( self, start, size ):
-        cdef BitSet bin
-        while size > 0:
-            bin_index, offset = self.get_bin_offset( start )
-            if self.bins[bin_index] == 0 or self.bins[bin_index] == 1: self.init_bin( bin_index )
-            bin = self.bins[bin_index]
-            bin_size = self.bin_size
-            amount = bin_size - offset
-            if amount < size:
-                bin.set_range( offset, amount )
-                size = size - amount
-                start = start + amount
-            else:
-                bin.set_range( offset, size )
-                size = 0
-    def count_range( self, start, size ):
-        cdef BitSet bin
-        count = 0
-        while size > 0:
-            bin_index, offset = self.get_bin_offset( start )
-            if self.bins[bin_index] == 0 or self.bins[bin_index] == 1: continue
-            bin = self.bins[bin_index]
-            bin_size = self.bin_size
-            amount = bin_size - offset
-            if amount < size:
-                count = count + bin.count_in_range( offset, amount )
-                size = size - amount
-                start = start + amount
-            else:
-                count = count + bin.count_in_range( offset, size )
-                size = 0
-    def next_set( self, start ):
-        cdef BitSet bin
-        bin_index, offset = self.get_bin_offset( start )
-        while bin_index < self.nbins:
-            if self.bins[bin_index] == 0:
-                bin_index = bin_index + 1
-                offset = 0
-                continue
-            bin = self.bins[bin_index]
-            ns = bin.next_set( offset )
-            if ns < self.bin_size:
-                return (bin_index*self.bin_size) + ns
-            else:
-                bin_index = bin_index + 1
-                offset = 0
-        else:
-            return None
-    def next_clear( self, start ):
-        cdef BitSet bin
-        bin_index, offset = self.get_bin_offset( start )
-        while bin_index < self.nbins:
-            if self.bins[bin_index] == 0:
-                bin_index = bin_index + 1
-                offset = 0
-                continue
-            bin = self.bins[bin_index]
-            ns = bin.next_clear( offset )
-            if ns < self.bin_size:
-                return (bin_index*self.bin_size) + ns
-            else:
-                bin_index = bin_index + 1
-                offset = 0
-        else:
-            return None
         py_modules = py_packages,
         packages = packages,
         scripts = open( "scripts.list" ).read().split(),
-        ext_modules=[ Extension( "bx.bitset", [ "bx/bitset.pyx" ] + [ JK_LIB + f for f in bitset_deps ], include_dirs=[JK_INC] ) ],
+        ext_modules=[ Extension( "bx.bitset", [ "bx/bitset.pyx", "src/binBits.c" ] + [ JK_LIB + f for f in bitset_deps ], include_dirs=[JK_INC, "src"] ) ],
         cmdclass = {'build_ext': build_ext}
      )

File src/binBits.c

+#include "common.h"
+#include "bits.h"
+#include "binBits.h"
+
+struct BinBits* binBitsAlloc( int size, int granularity )
+{
+    struct BinBits * bb;
+    AllocVar(bb);
+    bb->size = size;
+    bb->bin_size = (int) ceil( size / (float) granularity );
+    bb->nbins = (int) ceil( size / (float) bb->bin_size );
+    AllocArray( bb->bins, bb->nbins );
+    return bb;
+}
+
+void binBitsFree( struct BinBits *bb )
+{
+  int i;
+  for ( i = 0; i < bb->nbins; i++ )
+  {
+      if ( bb->bins[i] != NULL )
+      {
+          bitFree( bb->bins[i] );
+      }
+      freeMem( bb );
+  }
+}
+    
+inline int binBitsGetBin( struct BinBits * bb, int pos )
+{
+    return pos / bb->bin_size;
+}
+
+inline int binBitsGetOffset( struct BinBits * bb, int pos )
+{
+    return pos % bb->bin_size;
+}
+
+boolean binBitsReadOne( struct BinBits * bb, int pos )
+{
+    int bin = binBitsGetBin( bb, pos );
+    if ( bb->bins[bin] )
+    {
+        return bitReadOne( bb->bins[bin], binBitsGetOffset( bb, pos ) );
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+void binBitsSetOne( struct BinBits * bb, int pos )
+{
+    int bin = binBitsGetBin( bb, pos );  
+    int offset = binBitsGetOffset( bb, pos );
+    if ( bb->bins[bin] == NULL )
+    {
+        bb->bins[bin] = bitAlloc( bb->bin_size );
+    }
+    bitSetOne( bb->bins[bin], offset );
+}
+
+void binBitsSetRange( struct BinBits *bb, int start, int size )
+{
+    int bin, offset, delta;
+    while ( size > 0 )
+    {
+        int bin = binBitsGetBin( bb, start );  
+        int offset = binBitsGetOffset( bb, start );
+        if ( bb->bins[bin] == NULL )
+        {
+            bb->bins[bin] = bitAlloc( bb->bin_size );   
+        }
+        delta = bb->bin_size - offset;
+        if ( delta < size )
+        {
+            bitSetRange( bb->bins[bin], offset, delta );
+            size -= delta;
+            start += delta;
+        }
+        else
+        {
+            bitSetRange( bb->bins[bin], offset, size );
+            size = 0;
+        }
+    }
+}
+
+int binBitsCountRange( struct BinBits *bb, int start, int size )
+{
+    int delta;
+    int count = 0;
+    while ( size > 0 )
+    {
+        int bin = binBitsGetBin( bb, start );  
+        int offset = binBitsGetOffset( bb, start );
+        if ( bb->bins[bin] == NULL ) continue;
+        delta = bb->bin_size - offset;
+        if ( delta < size )
+        {
+            count += bitCountRange( bb->bins[bin], start, delta );
+            size -= delta;
+            start += delta;
+        }
+        else
+        {
+            count += bitCountRange( bb->bins[bin], start, size );
+            size = 0;
+        } 
+    }
+}
+
+int binBitsFindSet( struct BinBits *bb, int start )
+{
+    int ns;
+    int bin = binBitsGetBin( bb, start );  
+    int offset = binBitsGetOffset( bb, start );
+    while ( bin < bb->nbins )
+    {
+        if ( bb->bins[bin] != NULL )
+        {
+            ns = bitFindSet( bb->bins[bin], offset, bb->bin_size );
+            if ( ns < bb->bin_size )
+            {
+                return bin * bb->bin_size + ns;
+            }
+        }
+        bin += 1;
+        offset = 0;
+    }
+    return bb->size;
+}
+
+int binBitsFindClear( struct BinBits *bb, int start )
+{
+    int ns;
+    int bin = binBitsGetBin( bb, start );  
+    int offset = binBitsGetOffset( bb, start );
+    while ( bin < bb->nbins )
+    {
+        if ( bb->bins[bin] == NULL )
+        {
+            return bin*bb->bin_size;
+        }
+        else
+        {
+            ns = bitFindClear( bb->bins[bin], offset, bb->bin_size );
+            if ( ns < bb->bin_size )
+            {
+                return bin*bb->bin_size + ns;
+            }
+        }
+        bin += 1;
+        offset = 0;
+    }
+    return bb->size;
+}

File src/binBits.h

+#ifndef BINBITS_H
+#define BINBITS_H
+
+#include "common.h"
+#include "bits.h"
+
+struct BinBits
+{
+    int size;
+    int bin_size;
+    int nbins;
+    Bits ** bins;
+};
+
+struct BinBits* binBitsAlloc( int size, int granularity );
+void binBitsFree( struct BinBits *bb );
+boolean binBitsReadOne( struct BinBits * bb, int pos );
+void binBitsSetOne( struct BinBits * bb, int pos );
+void binBitsSetRange( struct BinBits *bb, int start, int size );
+int binBitsCountRange( struct BinBits *bb, int start, int size );
+int binBitsFindSet( struct BinBits *bb, int start );
+int binBitsFindClear( struct BinBits *bb, int start );
+
+#endif

File wiggle_to_simple.py

 else: out_file = sys.stdout
 
 for fields in bx.wiggle.Reader( in_file ):
-    print " ".join( line )
+    print " ".join( map( str, fields ) )
 
 in_file.close()
 out_file.close()