Commits

James Taylor committed 1aaece0

First steps toward bigbed/bigwig reading -- working read-only implementations of bPlusTree and cirTree, bbiFile can find and parse summaries

Comments (0)

Files changed (9)

lib/bx/bbi/__init__.py

+"""
+Support for the UCSC "Big Binary Indexed" file formats (bigWig and bigBed)
+"""

lib/bx/bbi/bbi_file.pyx

+"""
+Core implementation for reading UCSC "big binary indexed" files.
+
+There isn't really any specification for the format beyond the code, so this
+mirrors Jim Kent's 'bbiRead.c' mostly. 
+"""
+
+from bpt_file cimport BPTFile
+from cir_tree cimport CIRTreeFile
+from types cimport *
+
+from bx.misc.binary_file import BinaryFileReader
+from cStringIO import StringIO
+import zlib
+
+# Signatures for bbi related file types
+
+DEF big_wig_sig = 0x888FFC26
+DEF big_bed_sig = 0x8789F2EB
+
+# Some record sizes for parsing
+DEF summary_on_disk_size = 32
+
+cdef class BBIFile:
+    """
+    A "big binary indexed" file. Stores blocks of raw data and numeric 
+    summaries of that data at different levels of aggregation ("zoom levels").
+    Generic enough to accommodate both wiggle and bed data. 
+    """
+    # Probably a PyFileObject, or any seekable file-like
+    cdef object file
+    # A BinaryFileReader created from file
+    cdef object reader
+    # The magic number or type signature (whether the file is bigWig or bigBed or...)
+    cdef public bits32 magic
+    # Is the file byteswapped relative to our native byte order?
+    cdef boolean is_byteswapped
+    # The index to the chromosomes, an embedded BPT file
+    cdef BPTFile chrom_bpt
+    # Version number
+    cdef public bits16 version
+    # Number of zoom levels
+    cdef public bits16 zoom_levels
+    # Offset to chromosome index
+    cdef bits64 chrom_tree_offset
+    # Offset to unzoomed data
+    cdef bits64 unzoomed_data_offset
+    # Offset to unzoomed index
+    cdef bits64 unzoomed_index_offset
+    # If bed, number of columns
+    cdef bits16 field_count
+    cdef bits16 defined_field_count
+    # Offset to an embedded string containing "AutoSQL" format data that defines the columns
+    cdef bits64 as_offset
+    # Offset to total summary information (if any)
+    cdef bits64 total_summary_offset
+    # Size of uncompression buffer, 0 if no compression
+    cdef bits32 uncompress_buf_size
+    # Unzoomed data index
+    cdef CIRTreeFile unzoomed_cir
+    # Zoom levels list
+    cdef public object level_list
+
+    def __init__( self, file=None, expected_sig=None, type_name=None ):
+        if file is not None:
+            self.open( file, expected_sig, type_name )
+
+    def open( self, file, expected_sig, type_name ):
+        """
+        Initialize from an existing bbi file, signature (magic) must be passed
+        in since this is generic. 
+        """
+        self.file = file
+        # Open the file in a BinaryFileReader, handles magic and byteswapping
+        self.reader = reader = BinaryFileReader( file, expected_sig )
+        self.magic = expected_sig
+        self.is_byteswapped = self.reader.byteswap_needed
+        # Read header stuff
+        self.version = reader.read_uint16()
+        self.zoom_levels = reader.read_uint16()
+        self.chrom_tree_offset = reader.read_uint64()
+        self.unzoomed_data_offset = reader.read_uint64()
+        self.unzoomed_index_offset = reader.read_uint64()
+        self.field_count = reader.read_uint16()
+        self.defined_field_count = reader.read_uint16()
+        self.as_offset = reader.read_uint64()
+        self.total_summary_offset = reader.read_uint64()
+        self.uncompress_buf_size = reader.read_uint32()
+        # Skip reserved
+        reader.seek( 64 )
+        # Read zoom headers
+        self.level_list = []
+        for i from 0 <= i < self.zoom_levels:
+            level = ZoomLevel() 
+            level.bbi_file = self
+            level.reduction_level = reader.read_uint32()
+            level.reserved = reader.read_uint32()
+            level.data_offset = reader.read_uint64()
+            level.index_offset = reader.read_uint64()
+            self.level_list.append( level )
+        # Initialize and attach embedded BPTFile containing chromosome names and ids
+        reader.seek( self.chrom_tree_offset )
+        self.chrom_bpt = BPTFile( file=self.file )
+
+    def _get_chrom_id_and_size( self, chrom ):
+        """
+        Lookup id and size from the chromosome named `chrom`
+        """
+        bytes = self.chrom_bpt.find( chrom )
+        if bytes is not None:
+            # The value is two 32 bit uints, use the BPT's reader for checking byteswapping
+            chrom_id, chrom_size = self.chrom_bpt.reader.unpack( "II", bytes )
+            return chrom_id, chrom_size
+        else:
+            return None
+
+cdef class ZoomLevel:
+    cdef BBIFile bbi_file
+    cdef public bits32 reduction_level
+    cdef bits32 reserved
+    cdef public bits64 data_offset
+    cdef public bits64 index_offset
+
+    def summaries_in_region( self, bits32 chrom_id, bits32 start, bits32 end ):
+        cdef CIRTreeFile ctf
+        cdef int item_count
+        rval = []
+        reader = self.bbi_file.reader
+        reader.seek( self.index_offset )
+        ctf = CIRTreeFile( reader.file )
+        block_list = ctf.find_overlapping_blocks( chrom_id, start, end )
+        for offset, size in block_list:
+            # Seek to and read all data for the block
+            reader.seek( offset )
+            block_data = reader.read( size )
+            # Might need to uncompress
+            if self.bbi_file.uncompress_buf_size > 0:
+                ## block_data = zlib.decompress( block_data, buf_size = self.bbi_file.uncompress_buf_size )
+                block_data = zlib.decompress( block_data )
+            block_size = len( block_data )
+            # The block should be a bunch of summaries. 
+            assert block_size % summary_on_disk_size == 0
+            item_count = block_size / summary_on_disk_size
+            # Create another reader just for the block, shouldn't be too expensive
+            block_reader = BinaryFileReader( StringIO( block_data ), is_little_endian=reader.is_little_endian )
+            for i from 0 <= i < item_count:
+                ## NOTE: Look carefully at bbiRead again to be sure the endian
+                ##       conversion here is all correct. It looks like it is 
+                ##       just pushing raw data into memory and not swapping
+                summary = Summary()
+                summary.chrom_id = block_reader.read_uint32()
+                summary.start = block_reader.read_uint32()
+                summary.end = block_reader.read_uint32()
+                summary.valid_count = block_reader.read_uint32()
+                summary.min_val = block_reader.read_float()
+                summary.max_val = block_reader.read_float()
+                summary.sum_data = block_reader.read_float()
+                summary.sum_squares = block_reader.read_float()
+                rval.append( summary )
+        return rval
+            
+
+
+    
+cdef class ChromIDSize:
+    cdef bits32 chrom_id
+    cdef bits32 chrom_size
+
+cdef class ChromInfo:
+    cdef object name
+    cdef bits32 id
+    cdef bits32 size
+
+# Skipping 'struct bbiChromUsage'
+
+cdef enum summary_type:
+    mean = 0
+    max = 1
+    min = 2
+    coverage = 3
+    sd = 4
+    
+cdef class Summary:
+    cdef public bits32 chrom_id
+    cdef public bits32 start
+    cdef public bits32 end
+    cdef public bits32 valid_count
+    cdef public float min_val
+    cdef public float max_val
+    cdef public float sum_data
+    cdef public float sum_squares
+    
+cdef class Interval:
+    cdef bits32 start
+    cdef bits32 end
+    cdef double val
+
+cdef class SummaryElement:
+    cdef bits64 valid_count
+    cdef double min_val
+    cdef double max_val
+    cdef double sum_data
+    cdef double sum_squares
+
+

lib/bx/bbi/bpt_file.pxd

+from bx.misc.binary_file import BinaryFileReader
+
+from types cimport *
+
+cdef class BPTFile:
+    """
+    On disk B+ tree compatible with Jim Kent's bPlusTree.c
+    """
+    cdef object file
+    cdef object reader
+    cdef boolean is_byteswapped
+    cdef bits32 block_size
+    cdef bits32 key_size
+    cdef bits32 value_size
+    cdef bits64 item_count
+    cdef bits64 root_offset

lib/bx/bbi/bpt_file.pyx

+from bx.misc.binary_file import BinaryFileReader
+
+DEF bpt_sig = 0x78CA8C91
+
+# bptFileHeaderSize = 32
+# bptBlockHeaderSize = 4
+
+cdef class BPTFile:
+    """
+    On disk B+ tree compatible with Jim Kent's bPlusTree.c
+    """
+
+    def __init__( self, file=None ):
+        if file is not None:
+            self.attach( file )
+
+    def attach( self, file ):
+        """
+        Attach to an open file
+        """
+        self.file = file
+        self.reader = reader = BinaryFileReader( file, bpt_sig )
+        self.is_byteswapped = self.reader.byteswap_needed
+        # Read header stuff
+        self.block_size = reader.read_uint32()
+        self.key_size = reader.read_uint32()
+        self.value_size = reader.read_uint32()
+        self.item_count = reader.read_uint64()
+        reader.skip( 8 )
+        self.root_offset = reader.tell()
+
+    def r_find( self, bits64 block_start, key ):
+        """
+        Recursively seek the value matching key under the subtree starting
+        at file offset `block_start`
+        """
+        cdef UBYTE is_leaf
+        cdef bits16 child_count
+        cdef bits64 offset
+        self.reader.seek( block_start )
+        # Block header
+        is_leaf = self.reader.read_uint8()
+        self.reader.read_uint8()
+        child_count = self.reader.read_uint16()
+        if is_leaf:
+            for i from 0 <= i < child_count:
+                node_key = self.reader.read( self.key_size )
+                node_value = self.reader.read( self.value_size )
+                if node_key == key:
+                    return node_value
+            return None
+        else:
+            # Read and discard first key, store offset
+            self.reader.read( self.key_size )
+            offset = self.reader.read_bits64()
+            # Loop until correct subtree is found
+            for i from 0 <= i < child_count:
+                node_key = self.reader.read( self.key_size )
+                if node_key < key:
+                    break
+                offset = self.reader.read_bits64()
+            return self.r_find( offset, key )
+
+    def find( self, key ):
+        """
+        Find the value matching `key` (a bytestring). Returns the matching
+        value as a bytestring if found, or None
+        """
+        # Key is greater than key_size, must not be a match
+        if len( key ) > self.key_size:
+            return None
+        # Key is less than key_size, right pad with 0 bytes
+        if len( key ) < self.key_size:
+            key += ( '\0' * self.key_size - len( key ) )
+        # Call the recursive finder
+        return self.r_find( self.root_offset, key )

lib/bx/bbi/cir_tree.pxd

+from types cimport *
+
+cdef class CIRTreeFile:
+    cdef object file
+    cdef object reader
+    cdef bits64 root_offset
+    cdef bits32 block_size
+    cdef bits64 item_count
+    cdef bits32 start_chrom_ix
+    cdef bits32 start_base
+    cdef bits32 end_chrom_ix
+    cdef bits32 end_base
+    cdef bits64 file_size
+    cdef bits32 items_per_slot

lib/bx/bbi/cir_tree.pyx

+from bx.misc.binary_file import BinaryFileReader
+
+DEF cir_tree_sig = 0x2468ACE0
+
+cdef int ovcmp( bits32 a_hi, bits32 a_lo, bits32 b_hi, bits32 b_lo ):
+    if a_hi < b_hi: 
+        return 1
+    elif a_hi > b_hi:
+        return -1
+    else:
+        if a_lo < b_lo:
+            return 1
+        elif a_lo > b_lo:
+            return -1
+        else:
+            return 0
+
+cdef overlaps( qchrom, qstart, qend, rstartchrom, rstartbase, rendchrom, rendbase ):
+    return ( ovcmp( qchrom, qstart, rendchrom, rendbase ) > 0 ) and \
+           ( ovcmp( qchrom, qend, rstartchrom, rstartbase < 0 ) )
+
+cdef class CIRTreeFile:
+
+    def __init__( self, file=None ):
+        if file is not None:
+            self.attach( file )
+
+    def attach( self, file ):
+        """
+        Attach to an open file
+        """
+        self.file = file
+        self.reader = reader = BinaryFileReader( file, cir_tree_sig )
+        # Header
+        self.block_size = reader.read_uint32()
+        self.item_count = reader.read_uint64()
+        self.start_chrom_ix  = reader.read_uint32()
+        self.start_base = reader.read_uint32()
+        self.end_chrom_ix = reader.read_uint32()
+        self.end_base = reader.read_uint32()
+        self.file_size = reader.read_uint64()
+        self.items_per_slot = reader.read_uint32()
+        # Skip reserved
+        reader.read_uint32()
+        # Save root
+        self.root_offset = reader.tell()
+
+    def r_find_overlapping( self, int level, bits64 index_file_offset, bits32 chrom_ix, bits32 start, bits32 end, object rval ):
+        cdef UBYTE is_leaf
+        cdef bits16 child_count
+        reader = self.reader
+        reader.seek( index_file_offset )
+        # Block header
+        is_leaf = reader.read_uint8()
+        reader.read_uint8()
+        child_count = reader.read_uint16()
+        # Read block
+        if is_leaf:
+            self.r_find_overlapping_leaf( level, index_file_offset, chrom_ix, start, end, rval, child_count, reader )
+        else:
+            self.r_find_overlapping_parent( level, index_file_offset, chrom_ix, start, end, rval, child_count, reader )
+
+    def r_find_overlapping_leaf( self, int level, bits64 index_file_offset, bits32 chrom_ix, bits32 start, bits32 end, object rval, 
+                                bits16 child_count, object reader ):
+        cdef bits32 start_chrom_ix, start_base, end_chrom_ix, end_base
+        cdef bits64 offset
+        cdef bits64 size
+        for i from 0 <= i < child_count:
+            start_chrom_ix = reader.read_uint32()
+            start_base = reader.read_uint32()
+            end_chrom_ix = reader.read_uint32()
+            end_base = reader.read_uint32()
+            offset = reader.read_uint64()
+            size = reader.read_uint64()
+            if overlaps( chrom_ix, start, end, start_chrom_ix, start_base, end_chrom_ix, end_base ):
+                rval.append( ( offset, size ) )
+
+    def r_find_overlapping_parent( self, int level, bits64 index_file_offset, bits32 chrom_ix, bits32 start, bits32 end, object rval, 
+                                  bits16 child_count, object reader ):
+        # Read and cache offsets for all children to avoid excessive seeking
+        ## cdef bits32 start_chrom_ix[child_count], start_base[child_count], end_chrom_ix[child_count], end_base[child_count]
+        ## cdef bits64 offset[child_count]
+        start_chrom_ix = []; start_base = []; end_chrom_ix = []; end_base = []
+        offset = []
+        for i from 0 <= i < child_count:
+            ## start_chrom_ix[i] = reader.read_bits32()
+            ## start_base[i] = reader.read_bits32()
+            ## end_chrom_ix[i] = reader.read_bits32()
+            ## end_base[i] = reader.read_bits32()
+            ## offset[i] = reader.read_bits64()
+            start_chrom_ix.append( reader.read_uint32() )
+            start_base.append( reader.read_uint32() )
+            end_chrom_ix.append( reader.read_uint32() )
+            end_base.append( reader.read_uint32() )
+            offset.append( reader.read_uint64() )
+        # Now recurse
+        for i from 0 <= i < child_count:
+            if overlaps( chrom_ix, start, end, start_chrom_ix[i], start_base[i], end_chrom_ix[i], end_base[i] ):
+                self.r_find_overlapping( level + 1, offset[i], chrom_ix, start, end, rval )
+
+    def find_overlapping_blocks( self, bits32 chrom_ix, bits32 start, bits32 end ):
+        rval = []
+        self.r_find_overlapping( 0, self.root_offset, chrom_ix, start, end, rval )
+        return rval

lib/bx/bbi/types.pxd

+ctypedef unsigned char UBYTE 
+ctypedef signed char BYTE 
+ctypedef unsigned short UWORD 
+ctypedef short WORD         
+ctypedef unsigned long long bits64 
+ctypedef unsigned bits32    
+ctypedef unsigned short bits16
+ctypedef unsigned char bits8
+ctypedef int signed32
+
+ctypedef bint boolean
+

lib/bx/misc/binary_file.py

             elif struct.unpack( "<I", bytes )[0] == magic:
                 self.is_little_endian = True
             else:
-                raise BadMagic( "File does not have expected magic number" )                
+                raise BadMagicNumber( "File does not have expected magic number: %x != %x or %x" \
+                        % ( magic,struct.unpack( ">I", bytes )[0], struct.unpack( "<I", bytes )[0] ) )
         # Set endian code
         if self.is_little_endian:
             self.endian_code = "<"
         
     def read_uint64( self ):
         return self.read_and_unpack( "Q", 8 )[0]
+
+    def read_float( self ):
+        return self.read_and_unpack( "f", 4 )[0]
         
         
 class BinaryFileWriter( object ):
             # Sparse arrays with summaries organized as trees on disk
             extensions.append( Extension( "bx.arrays.array_tree", [ "lib/bx/arrays/array_tree.pyx" ], include_dirs=[numpy.get_include()] ) )  
 
-        # Reading UCSC wiggle format
+        # Reading UCSC bed and wiggle formats
         extensions.append( Extension( "bx.arrays.bed", [ "lib/bx/arrays/bed.pyx" ] ) )
+        extensions.append( Extension( "bx.arrays.wiggle", [ "lib/bx/arrays/wiggle.pyx" ] ) )
 
-        # Reading UCSC wiggle format
-        extensions.append( Extension( "bx.arrays.wiggle", [ "lib/bx/arrays/wiggle.pyx" ] ) )
+        # Reading UCSC "big binary index" files
+        extensions.append( Extension( "bx.bbi.bpt_file", [ "lib/bx/bbi/bpt_file.pyx" ] ) )
+        extensions.append( Extension( "bx.bbi.cir_tree", [ "lib/bx/bbi/cir_tree.pyx" ] ) )
+        extensions.append( Extension( "bx.bbi.bbi_file", [ "lib/bx/bbi/bbi_file.pyx" ] ) )
 
         # CpG masking
         extensions.append( Extension( "bx.align.sitemask._cpg", \