Source

bx-python / lib / bx / misc / seeklzop.py

Full commit
James Taylor 6d55ed5 






Nate Coraor a6bb496 


James Taylor 1424204 
Nate Coraor a6bb496 

James Taylor 1424204 
James Taylor 6d55ed5 
































































































James Taylor e7e07d4 


James Taylor 6d55ed5 







James Taylor 3df62ac 
James Taylor 6d55ed5 










































James Taylor e7e07d4 
"""
Semi-random access to bz2 compressed data.
"""

import os
import bisect
import sys
try:
    import pkg_resources
    pkg_resources.require( 'python_lzo' )
    import lzo
except:
    pass

import struct

from bx_extras import lrucache   
from cStringIO import StringIO
    
class SeekableLzopFile( object ):
    """
    Filelike object supporting read-only semi-random access to bz2 compressed
    files for which an offset table (bz2t) has been generated by `bzip-table`.
    """
    
    def __init__( self, filename, table_filename, block_cache_size=0, **kwargs ):
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        self.file = open( self.filename, "r" )
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache( block_cache_size )
        else:
            self.cache = None
        
    def init_table( self ):
        self.block_size = None
        self.block_info = []
        # Position of corresponding block in compressed file (in bytes)
        for line in open( self.table_filename ):
            fields = line.split()
            if fields[0] == "s":
                self.block_size = int( fields[1] )
            if fields[0] == "o":
                offset = int( fields[1] )
                compressed_size = int( fields[2] )
                size = int( fields[3] )
                self.block_info.append( ( offset, compressed_size, size ) )
        self.nblocks = len( self.block_info )
        
    def close( self ):
        self.file.close()
        
    def load_block( self, index ):
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        else:      
            offset, csize, size = self.block_info[ index ]
            # Get the block of compressed data
            self.file.seek( offset )
            data = self.file.read( csize )
            # Need to prepend a header for python-lzo module (silly)
            data = ''.join( ( '\xf0', struct.pack( "!I", size ), data ) )
            value = lzo.decompress( data )
            if self.cache is not None:
                self.cache[index] = value
            return value
        
    def fix_dirty( self ):
        chunk, offset = self.get_block_and_offset( self.file_pos )
        if self.current_block_index != chunk:
            self.current_block = StringIO( self.load_block( chunk ) )
            self.current_block.read( offset )
            self.current_block_index = chunk
        else:
            self.current_block.seek( offset )
        self.dirty = False
        
    def get_block_and_offset( self, index ):
        return int( index // self.block_size ), int( index % self.block_size )

    def seek( self, offset, whence=0 ):
        """
        Move the file pointer to a particular offset.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception( "seek from end not supported" )
            ## target_pos = self.size - offset
        else:
            raise Exception( "Invalid `whence` argument: %r", whence )
        # Check if this is a noop
        if target_pos == self.file_pos:
            return    
        # Verify it is valid
        ## assert 0 <= target_pos < self.size, "Attempt to seek outside file"
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the bzip2 file
        self.dirty = True
        
    def tell( self ):
        return self.file_pos
        
    def readline( self ):
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while 1:
            line = self.current_block.readline()
            self.file_pos += len( line )
            rval.append( line )
            if len( line ) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                self.current_block_index += 1
                self.current_block = StringIO( self.load_block( self.current_block_index ) )      
        return "".join( rval ) 
            
    def next( self ):
        line = self.readline()
        if line == "":
            raise StopIteration
            
    def __iter__( self ):
        return self

# --- Factor out ---        
        
MAGIC="\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

F_ADLER32_D     = 0x00000001L
F_ADLER32_C     = 0x00000002L
F_H_EXTRA_FIELD = 0x00000040L
F_H_GMTDIFF     = 0x00000080L
F_CRC32_D       = 0x00000100L
F_CRC32_C       = 0x00000200L
F_MULTIPART     = 0x00000400L
F_H_FILTER      = 0x00000800L
F_H_CRC32       = 0x00001000L

assert struct.calcsize( "!H" ) == 2
assert struct.calcsize( "!I" ) == 4

class UnpackWrapper( object ):
    def __init__( self, file ):
        self.file = file
    def read( self, amt ):
        return self.file.read( amt )
    def get( self, fmt ):
        t = struct.unpack( fmt, self.file.read( struct.calcsize( fmt ) ) )
        return t[0]