Source

bx-python / lib / bx / seq / nib.py

Full commit
"""
Classes to support nib files.

:Author: James Taylor (james@bx.psu.edu), Bob Harris (rsharris@bx.psu.edu)

A nib sequence is a sequence of DNA, using the 10 character alphabet A,C,G,T,N
(upper and lower case).  The file is packed as 4 bits per character.

nib file format
---------------

Fields can be in big- or little-endian format;  they must match the endianess
of the magic number.

============ =========== ======================================================
offset 0x00: 6B E9 3D 3A big endian magic number (3A 3D E9 6B => little endian)
offset 0x04: xx xx xx xx length of data sequence (counted in characters)
offset 0x08:  ...        data sequence;  most significant nybble in each
                         byte is first in sequence
============ =========== ======================================================
"""

from __future__ import division

from bx.seq.seq import SeqFile,SeqReader
import sys, struct, string, math

import _nib

NIB_MAGIC_NUMBER = 0x6BE93D3A
NIB_MAGIC_NUMBER_SWAP = 0x3A3DE96B
NIB_MAGIC_SIZE = 4
NIB_LENGTH_SIZE = 4

class NibFile(SeqFile):

    def __init__(self, file, revcomp=False, name="", gap=None):
        SeqFile.__init__(self,file,revcomp,name,gap)

        self.byte_order = ">"
        magic = struct.unpack(">L", file.read(NIB_MAGIC_SIZE))[0]
        if (magic != NIB_MAGIC_NUMBER):
            if magic == NIB_MAGIC_NUMBER_SWAP: self.byte_order = "<"
            else: raise Exception("Not a NIB file")
        self.magic = magic
        self.length = struct.unpack("%sL" % self.byte_order, file.read(NIB_LENGTH_SIZE))[0]

    def raw_fetch(self, start, length):
        # Check parameters
        assert start >= 0, "Start must be greater than 0"
        assert length >= 0, "Length must be greater than 0"
        assert start + length <= self.length, "Interval beyond end of sequence"
        # Read block of bytes containing sequence
        block_start = int(math.floor(start / 2))
        block_end = int(math.floor((start + length - 1) / 2))
        block_len = block_end + 1 - block_start
        self.file.seek(NIB_MAGIC_SIZE + NIB_LENGTH_SIZE + block_start)
        raw = self.file.read(block_len)
        # Unpack compressed block into a character string and return
        return _nib.translate_raw_data( raw, start, length  )

class NibReader(SeqReader):
    
    def __init__(self, file, revcomp=False, name="", gap=None):
        SeqReader.__init__(self,file,revcomp,name,gap)

    def next(self):
        if (self.seqs_read != 0): return  # nib files have just one sequence
        seq = NibFile(self.file,self.revcomp,self.name,self.gap)
        self.seqs_read += 1
        return seq


class NibWriter(object):

    def __init__(self,file):
        self.file = file

    def write(self,seq):
        assert (False), "NibWriter.write() is not implemented yet"

    def close(self):
        self.file.close()