Commits

James Taylor committed 494ed09

Basic support for writing and reading dictionaries connected to files in CDB format. Should be completely compatible with DJB's files as long as written in little endian. However, this is set up to be embedded in a larger file (e.g. to provide an index from chromosome names to offsets where chromosome indexes are stored in interval_index_file; this would be more efficient if there are a large number of chromosomes [contigs]).

Comments (0)

Files changed (3)

lib/bx/misc/binary_file.py

 
 import numpy
 import struct
+import sys
 
 ## Standard size:
 ## short is 8 bits
     Currently this is not heavily optimized (it uses the `struct` module to
     unpack)
     """    
-    def __init__( self, file, magic = None ):
-        self.is_little_endian = False
-        self.endian_code = ">"
+    def __init__( self, file, magic = None, is_little_endian = False ):
+        self.is_little_endian = is_little_endian
         self.file = file
         if magic is not None:
             # Attempt to read magic number and check endianness
                 pass
             elif struct.unpack( "<I", bytes )[0] == magic:
                 self.is_little_endian = True
-                self.endian_code = "<"
             else:
-                raise BadMagic( "File does not have expected magic number" )
+                raise BadMagic( "File does not have expected magic number" )                
+        # Set endian code
+        if self.is_little_endian:
+            self.endian_code = "<"
+            self.byteswap_needed = ( sys.byteorder != "little" )
+        else:
+            self.endian_code = ">"
+            self.byteswap_needed = ( sys.byteorder != "big" )
         
     def unpack( self, format, buffer, byte_count=None ):
         pattern = "%s%s" % ( self.endian_code, format )
         return ''.join( rval )
         
     def read_raw_array( self, dtype, size ):
-        return numpy.fromfile( self.file, dtype=dtype, count=size )
+        a = numpy.fromfile( self.file, dtype=dtype, count=size )
+        if self.byteswap_needed:
+            a.byteswap()
+        return a
     
     def read( self, byte_count=1 ):
         return self.file.read( byte_count )
     Currently this is not heavily optimized (it uses the `struct` module to
     unpack)
     """    
-    def __init__( self, file, magic = None ):
-        # Always write in network byte order
-        self.endian_code = ">"
+    def __init__( self, file, magic = None, is_little_endian = False ):
+        self.is_little_endian = is_little_endian
+        if self.is_little_endian:
+            self.endian_code = "<"
+        else:
+            self.endian_code = ">"
         self.file = file
         if magic is not None:
             self.write_uint32( magic )

lib/bx/misc/cdb.py

+from UserDict import DictMixin
+
+from bx.misc.binary_file import BinaryFileReader, BinaryFileWriter
+import numpy
+
def cdbhash( s ):
    """
    DJB's hash function as used by the CDB format (h = h * 33 ^ c per
    character, starting from 5381), masked to 32 bits.

    Written as an explicit loop rather than `reduce` with a `0xffffffffL`
    long literal: the `L` suffix is Python-2-only syntax (and unneeded even
    there), and builtin `reduce` does not exist on Python 3.
    """
    h = 5381
    for c in s:
        h = ( ( ( h << 5 ) + h ) ^ ord( c ) ) & 0xffffffff
    return h
+
class FileCDBDict( DictMixin ):
    """
    Read-only dictionary view of a CDB hash structure stored in a file.
    Currently only supports access by key (getitem).

    The structure may be embedded at any offset in a larger file; reading
    starts from the file's current position.

    NOTE: The keys method could be implemented by scanning the main table.
    """
    def __init__( self, file, is_little_endian=True ):
        """
        `file`: open file-like object positioned at the start of the CDB data
        `is_little_endian`: byte order the structure was written in (DJB's
                            cdb files are little endian)
        """
        self.io = BinaryFileReader( file, is_little_endian=is_little_endian )
        self.header_offset = self.io.tell()
        # Read the whole 256-entry header up front (only 2k); each entry is
        # a ( subtable offset, subtable size-in-cells ) pair
        self.header = []
        for i in range( 256 ):
            self.header.append( ( self.io.read_uint32(), self.io.read_uint32() ) )
    def __getitem__( self, key ):
        hash = cdbhash( key )
        # The 8 LSBs of the hash select one of the 256 subtables
        subtable_offset = self.header[ hash % 256 ][0]
        subtable_size = self.header[ hash % 256 ][1]
        if subtable_size == 0:
            raise KeyError( key )
        # Seek into subtable and probe linearly from the slot selected by
        # the remaining hash bits
        start = ( hash >> 8 )
        for i in range( subtable_size ):
            # Each cell is two uint32s ( hash, pair offset ) = 8 bytes
            offset = subtable_offset + ( ( start + i ) % subtable_size ) * 8
            self.io.seek( offset )
            h = self.io.read_uint32()
            p = self.io.read_uint32()
            # Hit an empty bin, no match for key
            if p == 0:
                raise KeyError( key )
            # Hash matches, need to check full key
            if h == hash:
                self.io.seek( p )
                klen = self.io.read_uint32()
                vlen = self.io.read_uint32()
                k = self.io.read( klen )
                if k == key:
                    v = self.io.read( vlen )
                    return v
        else:
            # Visited every slot and no match (should never happen since
            # there are empty slots by construction)
            raise KeyError( key )

    @classmethod
    def to_file( Class, dict, file, is_little_endian=True ):
        """
        For constructing a CDB structure in a file. Able to calculate size on
        disk and write to a file.

        NOTE: This requires the keys and values to be byte strings; support
              for encoding specific value types should be added to this
              wrapper. The file position is left at the end of the header,
              not the end of the data.
        """
        io = BinaryFileWriter( file, is_little_endian=is_little_endian )
        start_offset = io.tell()
        # Header is of fixed length (256 entries of 8 bytes); skip it now
        # and come back to fill it in once the offsets are known
        io.seek( start_offset + ( 8 * 256 ) )
        # For each item, key and value length (written as length prefixed
        # strings). We also calculate the subtables on this pass.
        subtables = [ [] for i in range(256) ]
        for key, value in dict.iteritems():
            pair_offset = io.tell()
            io.write_uint32( len( key ) )
            io.write_uint32( len( value ) )
            io.write( key )
            io.write( value )
            hash = cdbhash( key )
            subtables[ hash % 256 ].append( ( hash, pair_offset ) )
        # Save the offset where the subtables will start
        subtable_offset = io.tell()
        # Write subtables
        for subtable in subtables:
            if len( subtable ) > 0:
                # Construct hashtable to be twice the size of the number of
                # items in the subtable (so there are always empty cells to
                # terminate unsuccessful probes), and build it in memory
                ncells = len( subtable ) * 2
                cells = [ (0,0) for i in range( ncells ) ]
                for hash, pair_offset in subtable:
                    index = ( hash >> 8 ) % ncells
                    while cells[index][1] != 0:
                        index = ( index + 1 ) % ncells
                    # Guaranteed to find an empty cell since ncells > items
                    cells[index] = ( hash, pair_offset )
                # Write subtable
                for hash, pair_offset in cells:
                    io.write_uint32( hash )
                    io.write_uint32( pair_offset )
        # Go back and write the header
        io.seek( start_offset )
        index = subtable_offset
        for subtable in subtables:
            io.write_uint32( index )
            # FIX: was `len( subtable * 2 )` — same value, but it built a
            # throwaway doubled list just to take its length
            io.write_uint32( len( subtable ) * 2 )
            # For each cell in the subtable, a hash and a pointer to a value
            index += ( len( subtable ) * 2 ) * 8

lib/bx/misc/cdb_tests.py

+from bx.misc.cdb import *
+from tempfile import NamedTemporaryFile
+
def test():
    """
    Round-trip a 10000-item dictionary through the on-disk CDB format and
    verify every key looks up to its original value.
    """
    data = {}
    for i in range( 10000 ):
        data[ 'foo' + str( i ) ] = 'bar' + str( i )

    # Open temporary file and remember its name
    tmp = NamedTemporaryFile()
    tmp_name = tmp.name

    # Serialize the dictionary into the temporary file
    FileCDBDict.to_file( data, tmp )
    tmp.flush()

    # Re-open from disk through a second handle
    reader = open( tmp_name )
    cdb_dict = FileCDBDict( reader )

    for key, value in data.iteritems():
        assert cdb_dict[ key ] == value

    # Close everything (deletes the temporary file)
    reader.close()
    tmp.close()
+