Commits

James Taylor  committed 320555a

Importing everything

  • Participants
  • Parent commits 075807e

Comments (0)

Files changed (67)

File aggregate_scores_in_intervals.py

+#!/usr/bin/env python
+
+"""
+usage: %prog score_file interval_file [out_file] [options] 
+"""
+
+from __future__ import division
+
+import sys
+import psyco_full
+import bx.wiggle
+from bx.binned_array import BinnedArray
+import cookbook.doc_optparse
+
+def read_scores( f ):
+    """Read a wiggle file into a dict mapping chrom name -> BinnedArray of
+    per-position scores (positions never written read back as the
+    BinnedArray default)."""
+    scores_by_chrom = dict()
+    for chrom, pos, val in bx.wiggle.Reader( f ):
+        if chrom not in scores_by_chrom:
+            scores_by_chrom[chrom] = BinnedArray()
+        scores_by_chrom[chrom][pos] = val
+    return scores_by_chrom
+
+def main():
+    """Aggregate wiggle scores over intervals.
+
+    For each interval line (chrom start stop ...) prints:
+    chrom start stop mean min max -- with "nan" for all three statistics
+    when no position in the interval had a (truthy) score.
+    """
+
+    # Parse command line
+    options, args = cookbook.doc_optparse.parse( __doc__ )
+    try:
+        score_file = open( args[0] )
+        interval_file = open( args[1] )
+        if len( args ) > 2:
+            out_file = open( args[2], 'w' )
+        else:
+            out_file = sys.stdout
+    except:
+        # NOTE(review): bare except swallows every error here; presumably
+        # doc_optparse.exit() prints usage and exits -- confirm.
+        cookbook.doc_optparse.exit()
+
+    # NOTE(review): these re-open sys.argv[1]/argv[2] instead of using the
+    # score_file/interval_file handles opened above -- the two can disagree
+    # if any options precede the positional arguments.
+    scores_by_chrom = read_scores( open( sys.argv[1] ) )
+    for line in open( sys.argv[2] ):
+        fields = line.split()
+        chrom, start, stop = fields[0], int( fields[1] ), int( fields[2] )
+        total = 0
+        count = 0
+        min_score = 100000000
+        max_score = -100000000
+        for i in range( start, stop ):
+            # Only truthy scores count: a stored score of 0 is skipped
+            # by this test as well as missing positions.
+            if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
+                score = scores_by_chrom[chrom][i]
+                total += score
+                count += 1
+                max_score = max( score, max_score )
+                min_score = min( score, min_score )
+        if total > 0:
+            avg = total/count
+        else:
+            avg = "nan"
+            min_score = "nan"
+            max_score = "nan"
+            
+        print >> out_file, chrom, start, stop, avg, min_score, max_score
+
+    score_file.close()
+    interval_file.close()
+    out_file.close()
+
+if __name__ == "__main__": main()

File align_print_template.py

+#!/usr/bin/env python
+
+"""
+Read a MAF from standard input and print span of one component 
+
+usage: %prog template [options]
+    -f, --format = maf: Input format, maf (default) or axt
+"""
+
+from __future__ import division
+
+import psyco_full
+
+import sys
+import cookbook.doc_optparse
+from bx import align
+
+from Cheetah.Template import Template
+
+def __main__():
+    """Render the Cheetah template named on the command line once per
+    alignment block read from stdin (maf by default, or axt via --format)."""
+
+    # Parse command line arguments
+    options, args = cookbook.doc_optparse.parse( __doc__ )
+
+    try:
+        template = Template( args[0] )
+        format = options.format
+        if not format: format = "maf"
+    except:
+        # NOTE(review): bare except; presumably exits with usage -- confirm.
+        cookbook.doc_optparse.exit()
+
+    reader = align.get_reader( format, sys.stdin ) 
+
+    for a in reader: 
+        # Expose the alignment and its component list to the template
+        # namespace as $a and $c.
+        template.a = a
+        template.c = a.components
+        print template
+
+if __name__ == "__main__": __main__()

File bx/__init__.py

Empty file added.

File bx/align/__init__.py

+from align.core import *

File bx/align/axt.py

+from align import *
+
+"""
+Provides a reader for pairwise alignments in AXT format
+"""
+
+class Reader:
+    """Iterator over pairwise alignment blocks in an AXT file, producing
+    bx.align Alignment objects; next() returns None at end of file."""
+
+    def __init__( self, file ):
+        self.file = file
+
+    def next( self ):
+        # Read block
+        line = readline( self.file )
+        if not line: return
+        fields = line.split()
+        seq1 = readline( self.file )
+        seq2 = readline( self.file )
+        blank = readline( self.file )
+        # Build 2 component alignment
+        alignment = Alignment()
+        component = Component()
+        # AXT coordinates are 1-based; convert start to 0-based.
+        component.src = fields[1]
+        component.start = int( fields[2] ) - 1
+        end = int( fields[3] )
+        component.size = end - component.start 
+        component.strand = "+"
+        component.text = seq1.strip()
+        alignment.add_component( component )
+        component = Component()
+        component.src = fields[4]
+        component.start = int( fields[5] ) - 1
+        end = int( fields[6] )
+        component.size = end - component.start
+        component.strand = fields[7]
+        component.text = seq2.strip()
+        # NOTE(review): the second component is never added to the
+        # alignment (no alignment.add_component( component ) here), so
+        # returned alignments carry only the first component.
+        
+        return alignment
+
+    def __iter__( self ):
+        # Bridge the None-at-EOF convention to the iterator protocol.
+        while 1:
+            v = self.next()
+            if not v: break
+            yield v
+
+    def close( self ):
+        self.file.close()
+
+# Helper methods
+
+def readline( file, skip_blank=False ):
+    """Read a line from provided file, skipping comment ('#') lines; blank
+    lines are skipped only when skip_blank is True. Returns None at EOF."""
+    while 1:
+        line = file.readline()
+        if not line: return None 
+        if line[0] != '#' and not ( skip_blank and line.isspace() ):
+            return line

File bx/align/core.py

+import random
+
+# DNA reverse complement table
+DNA_COMP = "                                             -                  " \
+           " TVGH  CD  M KN   YSA BWXR       tvgh  cd  m kn   ysa bwxr      " \
+           "                                                                " \
+           "                                                                "
+
+class Alignment( object ):
+    """An alignment block: an ordered list of Component objects that all
+    share the same text length, plus a score and free-form attributes."""
+
+    def __init__( self, score=0, attributes={} ):
+        # NOTE(review): the score/attributes arguments are ignored -- the
+        # fields below are always reset to 0 and {}.  slice(),
+        # reverse_complement() and limit_to_species() pass values here
+        # that are silently dropped.
+        self.score = 0
+        self.text_size = 0
+        self.attributes = {}
+        self.components = []
+
+    def add_component( self, component ):
+        self.components.append( component )
+        # The first component fixes the column count; later ones must
+        # match it.  NOTE(review): raising a string is deprecated
+        # Python 2 style.
+        if self.text_size == 0: self.text_size = len( component.text )
+        elif self.text_size != len( component.text ): raise "Components must have same text length"
+
+    def __str__( self ):
+        # Render in MAF-style "a ..." / "s ..." form.
+        s = "a score=" + str( self.score )
+        for key in self.attributes: 
+            s += " %s=%s" % ( key, self.attributes[key] )
+        s += "\n"
+        # Components
+        for c in self.components: 
+            s += str( c )
+            s += "\n"
+        return s
+
+    def get_component_by_src( self, src ):
+        # Exact match on component src; None when absent.
+        for c in self.components:
+            if c.src == src: return c
+        return None
+
+    def get_component_by_src_start( self, src ):
+        # Prefix match on component src; None when absent.
+        for c in self.components:
+            if c.src.startswith( src ): return c
+        return None    
+
+    def slice( self, start, end ):
+        # start/end are alignment *column* indices, not sequence coords.
+        new = Alignment( score=self.score, attributes=self.attributes )
+        for component in self.components:
+            new.components.append( component.slice( start, end ) )
+        new.text_size = end - start
+        return new
+    
+    def reverse_complement( self ):
+        new = Alignment( score=self.score, attributes=self.attributes )
+        for component in self.components:
+            new.components.append( component.reverse_complement() )
+        new.text_size = self.text_size
+        return new
+    
+    def slice_by_component( self, component_index, start, end ):
+        # component_index may be an int index, a src string, or a
+        # Component instance; start/end are coordinates on that component.
+        if type( component_index ) == type( 0 ):
+            ref = self.components[ component_index ]
+        elif type( component_index ) == type( "" ):
+            ref = self.get_component_by_src( component_index )
+        elif type( component_index ) == Component:
+            ref = component_index
+        else:
+            raise ValueError( "can't figure out what to do" )
+        start_col = ref.coord_to_col( start )  
+        end_col = ref.coord_to_col( end )  
+        return self.slice( start_col, end_col )
+        
+    def column_iter( self ):
+        # Yields one list per column holding each component's symbol.
+        for i in range( self.text_size ):
+            yield [ c.text[i] for c in self.components ]
+
+    def limit_to_species( self, species ):
+        # Keep only components whose src prefix (before '.') is listed.
+        new = Alignment( score=self.score, attributes=self.attributes )
+        new.text_size = self.text_size
+        for component in self.components:
+            if component.src.split('.')[0] in species:
+                new.add_component( component )
+        return new
+
+    def remove_all_gap_columns( self ):
+        """
+        Remove any columns containing only gaps from alignment components,
+        text of components is modified IN PLACE.
+        """
+        seqs = [ list( c.text ) for c in self.components ]
+        i = 0
+        text_size = self.text_size
+        while i < text_size:
+            all_gap = True
+            for seq in seqs:
+                if seq[i] != '-': all_gap = False
+            if all_gap:
+                # Deleting column i shifts the rest left, so i stays put.
+                for seq in seqs: del seq[i]
+                text_size -= 1
+            else:
+                i += 1
+        for i in range( len( self.components ) ):
+            self.components[i].text = ''.join( seqs[i] )
+        self.text_size = text_size
+    
+class Component( object ):
+    """One row of an alignment: a source sequence name, start/size within
+    that source, strand, source length, and the (gapped) alignment text."""
+
+    def __init__( self, src='', start=0, size=0, strand=None, src_size=0, text='' ):
+        self.src = src
+        self.start = start
+        self.size = size
+        self.strand = strand
+        self.src_size = src_size
+        self.text = text
+
+    def __str__( self ):
+        # MAF "s" line layout.
+        return "s %s %d %d %s %d %s" % ( self.src, self.start, 
+                                           self.size, self.strand, 
+                                           self.src_size, self.text )
+
+    def get_end( self ):
+        # Exclusive end coordinate within the source sequence.
+        return self.start + self.size
+    end = property( fget=get_end )
+
+    def get_forward_strand_start( self ):
+        if self.strand == '-': return self.src_size - self.end
+        else: return self.start
+    forward_strand_start = property( fget=get_forward_strand_start )
+        
+    def get_forward_strand_end( self ):
+        if self.strand == '-': return self.src_size - self.start
+        else: return self.end
+    forward_strand_end = property( fget=get_forward_strand_end)
+
+    def reverse_complement( self ):
+        # NOTE(review): expected src_size - self.end here, mirroring the
+        # forward_strand_* conversions above -- confirm.
+        start = self.src_size - self.start 
+        if self.strand == "+": strand = "-"
+        else: strand = "+"
+        # NOTE(review): text is complemented via DNA_COMP but not
+        # reversed -- confirm intended.
+        text = self.text.translate( DNA_COMP )
+        return Component( self.src, start, self.size, strand, self.src_size, text )
+
+    def slice( self, start, end ):
+        # start/end are column indices into the gapped text.
+        new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self.src_size )
+        new.text = self.text[start:end]
+
+        #for i in range( 0, start ):
+        #    if self.text[i] != '-': new.start += 1
+        #for c in new.text:
+        #    if c != '-': new.size += 1
+        # Equivalent to the commented loops above: advance start by the
+        # non-gap symbols skipped; size is the non-gap symbols kept.
+        new.start += start - self.text.count( '-', 0, start )
+        new.size = len( new.text ) - new.text.count( '-' )
+
+        return new
+
+    def slice_by_coord( self, start, end ):
+        # start/end are source-sequence coordinates; convert to columns.
+        start_col = self.coord_to_col( start )  
+        end_col = self.coord_to_col( end )  
+        return self.slice( start_col, end_col )
+    
+    def coord_to_col( self, pos ):
+        # Map a source coordinate to an alignment column index.
+        # NOTE(review): raising a string is deprecated Python 2 style.
+        if pos < self.start or pos > self.get_end():
+            raise "Range error: %d not in %d-%d" % ( pos, self.start, self.get_end() )
+        return self.py_coord_to_col( pos )
+
+    def weave_coord_to_col( self, pos ):
+        # C-accelerated variant.  NOTE(review): `weave` (scipy.weave) is
+        # not imported in this module as committed -- calling this would
+        # raise NameError.
+        text = self.text
+        text_size = len( self.text )
+        start = self.start
+        pos = pos
+        return weave.inline( """
+                                int col;
+                                int i;
+                                const char * ctext = text.c_str();
+                                for ( col = 0, i = start - 1; col < text_size; ++col )
+                                    if ( text[ col ] != '-' && ++i == pos ) break;
+                                return_val = col;
+                             """, 
+                             ['text', 'text_size', 'start', 'pos' ] )
+
+    def py_coord_to_col( self, pos ):
+        # Pure-Python fallback; repeats the caller's range check.
+        if pos < self.start or pos > self.get_end():
+            raise "Range error: %d not in %d-%d" % ( pos, self.start, self.get_end() )
+        i = self.start
+        col = 0
+        text = self.text
+        while i < pos:
+            if text[col] != '-': i += 1
+            col += 1 
+        return col
+
+def get_reader( format, infile ):
+    """Return a Reader over infile for the named format ("maf" or "axt");
+    raises a (deprecated Python 2 style) string exception otherwise."""
+    import align.axt, align.maf
+    if format == "maf": return align.maf.Reader( infile )
+    elif format == "axt": return align.axt.Reader( infile )
+    else: raise "Unknown alignment format %s" % format
+
+def shuffle_columns( a ):
+    """Randomize the columns of an alignment"""
+    # One permutation is shared by all components, so the columns stay
+    # aligned with each other after shuffling.  Modifies `a` in place.
+    mask = range( a.text_size )
+    random.shuffle( mask )
+    for c in a.components:
+        c.text = ''.join( [ c.text[i] for i in mask ] )
+
+
+

File bx/align/maf.py

+from align import *
+
+import itertools
+import interval_index_file
+
+# Tools for dealing with multiple alignments in MAF format
+
+class MultiIndexed( object ):
+    """Similar to 'indexed' but wraps more than one maf_file"""
+    def __init__( self, maf_filenames, keep_open=False ):
+        # NOTE(review): keep_open is accepted but never forwarded to the
+        # underlying Indexed instances.
+        self.indexes = [ Indexed( maf_file, maf_file + ".index" ) for maf_file in maf_filenames ]
+    def get( self, src, start, end ):
+        # Concatenate overlap-query results from every wrapped index.
+        blocks = []
+        for index in self.indexes: blocks += index.get( src, start, end )
+        return blocks
+    
+class Indexed( object ):
+    """Indexed access to a maf using overlap queries, requires an index file"""
+
+    def __init__( self, maf_filename, index_filename=None, keep_open=False ):
+        if index_filename is None: index_filename = maf_filename + ".index"
+        self.indexes = interval_index_file.Indexes( filename=index_filename )
+        self.maf_filename = maf_filename
+        # keep_open caches one file handle; otherwise reopen per lookup.
+        if keep_open: 
+            self.f = open( maf_filename )
+        else:
+            self.f = None
+
+    def get( self, src, start, end ):
+        # Returns a lazy iterator of alignment blocks overlapping the
+        # range (imap over the stored byte offsets).
+        intersections = self.indexes.find( src, start, end )
+        return itertools.imap( self.get_maf_at_offset, [ val for start, end, val in intersections ] )
+
+    def get_maf_at_offset( self, offset ):
+        # offset is the byte position of an alignment block in the maf.
+        if self.f:
+            self.f.seek( offset )
+            return read_next_maf( self.f ) 
+        else:
+            f = open( self.maf_filename )
+            try:
+                f.seek( offset )
+                return read_next_maf( f ) 
+            finally:
+                f.close()
+            
+class Reader( object ):
+    """Iterate over all maf blocks in a file in order"""
+    
+    def __init__( self, file ):
+        self.file = file
+        # Read and verify maf header, store any attributes
+        # NOTE(review): raising a string is deprecated Python 2 style.
+        fields = self.file.readline().split()
+        if fields[0] != '##maf': raise "File does not have MAF header"
+        self.attributes = parse_attributes( fields[1:] )
+
+    def next( self ):
+        # Returns None at end of file.
+        return read_next_maf( self.file )
+
+    def __iter__( self ):
+        return ReaderIter( self )
+
+    def close( self ):
+        self.file.close()
+
+class ReaderIter( object ):
+    """Adapts Reader.next()'s None-at-EOF convention to the iterator
+    protocol by raising StopIteration."""
+    def __init__( self, reader ):
+        self.reader = reader
+    def __iter__( self ): 
+        return self
+    def next( self ):
+        v = self.reader.next()
+        if not v: raise StopIteration
+        return v
+
+class Writer( object ):
+    """Write alignment blocks in MAF format; the ##maf header line is
+    emitted at construction time."""
+
+    def __init__( self, file, attributes={} ):
+        # NOTE(review): mutable default argument -- inserting 'version'
+        # below mutates the shared default dict across Writer instances.
+        self.file = file
+        # Write header, Webb's maf code wants version first, we accommodate
+        if not attributes.has_key('version'): attributes['version'] = 1
+        self.file.write( "##maf version=%s" % attributes['version'] )
+        for key in attributes: 
+            if key == 'version': continue
+            self.file.writelines( " %s=%s" % ( key, attributes[key] ) )
+        self.file.write( "\n" )
+
+    def write( self, alignment ):
+        self.file.write( "a score=" + str( alignment.score ) )
+        for key in alignment.attributes:
+            self.file.write( " %s=%s" % ( key, alignment.attributes[key] ) )
+        self.file.write( "\n" )
+        # Components
+        rows = []
+        for c in alignment.components:
+            rows.append( ( "s", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.text ) )
+        # Left-align the first two columns, right-align the rest.
+        self.file.write( format_tabular( rows, "llrrrrr" ) )
+        self.file.write( "\n" )
+
+    def close( self ):
+        self.file.close()
+
+# ---- Helper methods ---------------------------------------------------------
+
+def read_next_maf( file ):
+        """Read one alignment block from file; returns None at EOF."""
+        alignment = Alignment()
+        # Attributes line
+        line = readline( file, skip_blank=True )
+        if not line: return None
+        fields = line.split() 
+        if fields[0] != 'a': raise "Expected 'a ...' line"
+        alignment.attributes = parse_attributes( fields[1:] )
+        # NOTE(review): score stays the string produced by
+        # parse_attributes; it is never converted to a number.
+        alignment.score = alignment.attributes['score']
+        del alignment.attributes['score']
+        # Sequence lines
+        while 1:
+            line = readline( file )
+            # EOF or Blank line terminates alignment components
+            if not line or line.isspace(): break
+            # NOTE(review): duplicates the isspace() test above --
+            # this line is unreachable.
+            if line.isspace(): break 
+            # Verify
+            fields = line.split()
+            if fields[0] != 's': raise "Expected 's ...' line"
+            # Parse 
+            component = Component()
+            component.src = fields[1]
+            component.start = int( fields[2] )
+            component.size = int( fields[3] )
+            component.strand = fields[4]
+            component.src_size = int( fields[5] )
+            if len(fields) > 6: component.text = fields[6].strip()
+            # Add to set
+            alignment.add_component( component )
+        return alignment
+
+def readline( file, skip_blank=False ):
+    """Read a line from provided file, skipping comment ('#') lines; blank
+    lines are skipped only when skip_blank is True. Returns None at EOF."""
+    while 1:
+        line = file.readline()
+        if not line: return None 
+        if line[0] != '#' and not ( skip_blank and line.isspace() ):
+            return line
+
+def parse_attributes( fields ):
+    """Parse list of key=value strings into a dict"""
+    attributes = {}
+    for field in fields:
+        # NOTE(review): a value containing '=' is truncated at its first
+        # '=' since only pair[1] is kept.
+        pair = field.split( '=' )
+        attributes[ pair[0] ] = pair[1]
+    return attributes
+
+def format_tabular( rows, align=None ):
+    """Format rows (sequences of strings) as space-padded columns; align
+    is an optional per-column string of 'l'/'r' flags (default right)."""
+    if len( rows ) == 0: return ""
+    # Column widths: the widest cell in each column across all rows.
+    lengths = [ len( col ) for col in rows[ 0 ] ]
+    for row in rows[1:]:
+        for i in range( 0, len( row ) ):
+            lengths[ i ] = max( lengths[ i ], len( row[ i ] ) )
+    rval = ""
+    for row in rows:
+        for i in range( 0, len( row ) ):
+            if align and align[ i ] == "l":
+                rval += row[ i ].ljust( lengths[ i ] )
+            else:
+                rval += row[ i ].rjust( lengths[ i ] )
+            rval += " "
+        rval += "\n"
+    return rval
+        

File bx/alphabet.py

+from numarray import *
+
+import seq_numarray
+
+DNA_BASE=5
+
+## FIXME: Is DNA_BASE really neccesary? This should be able to map from
+##        any integer alphabet to any other.
+
+class Mapping( object ):
+    """Maps integer-encoded alignment columns (base-DNA_BASE digits) to
+    symbols of a smaller alphabet via a lookup table read from a file."""
+    def __init__( self, f=None ):
+        if f: self.read_from_file( f )
+
+    def read_from_file( self, f ):
+        # Each line: <column-string> <symbol-int>.  All keys must share
+        # the same length (the number of alignment rows).
+        align_count = None
+        max_symbol = 0
+        reverse_map = {}
+        for line in f:
+            ( key, value ) = line.split()
+            if not align_count: 
+                align_count = len( key )
+                self.table = zeros( DNA_BASE ** align_count )
+            else:
+                assert align_count == len( key )
+            index = seq_numarray.DNA.translate_alignment_column( key )
+            self.table[ index ] = int( value )
+            reverse_map.setdefault( int( value ), [] ).append( key )
+            max_symbol = max( self.table[ index ], max_symbol )
+        self.align_count = align_count
+        self.symbol_count = max_symbol + 1
+        # reverse_table[i] lists every column string mapped to symbol i.
+        self.reverse_table = [ reverse_map[ index ] for index in range( 0, self.symbol_count ) ]
+
+    def collapse( self, a, b ):
+        # Merge symbol b into a; the last symbol is renumbered to b so
+        # the symbol range stays dense.
+        copy = Mapping()
+        copy.align_count = self.align_count
+        copy.symbol_count = self.symbol_count - 1
+        table = self.table.copy()
+        for i in range( len( table ) ):
+            if table[i] == b: table[i] = a
+            elif table[i] == copy.symbol_count: table[i] = b
+        copy.table = table
+        return copy
+        
+    def translate_alignment( self, seqs ):
+        # Encode raw sequences to column indices, then map to symbols.
+        return self.translate( seq_numarray.DNA.translate_alignment( seqs ) )
+
+    def translate( self, seq ):
+        # Vectorized table lookup (numarray take).
+        return take( self.table, seq )
+
+    def reverse( self, seq ):
+        return [ self.reverse_table[ index ] for index in seq ]

File bx/binned_array.py

+from __future__ import division
+
+import math
+
+from Numeric import *
+from RandomArray import *
+
+MAX=512*1024*1024 
+
+class BinnedArray( object ):
+    """Sparse array of up to max_size values; storage is allocated one
+    bin at a time on first write, and unset positions read back as
+    `default`."""
+    def __init__( self, granularity=1024, default=None, max_size=MAX ):
+        # NOTE(review): stores the constant MAX rather than the max_size
+        # argument (the attribute is unused below, so only introspection
+        # is affected).
+        self.max_size = MAX
+        self.bin_size = int( math.ceil( ( max_size / granularity ) ) )
+        self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
+        self.bins = [ None ] * self.nbins
+        self.default = default
+    def get_bin_offset( self, index ):
+        # (which bin, position within that bin)
+        return index // self.bin_size, index % self.bin_size
+    def init_bin( self, index ):
+        # self.bins[index] = zeros( self.bin_size ) * self.default
+        self.bins[index] = [ self.default ] * self.bin_size
+    def __getitem__( self, key ):
+        bin, offset = self.get_bin_offset( key )
+        if self.bins[bin]:
+            return self.bins[bin][offset]
+        else:
+            return self.default
+    def __setitem__( self, key, value ):
+        bin, offset = self.get_bin_offset( key )
+        if not self.bins[bin]: self.init_bin( bin )
+        self.bins[bin][offset] = value
+
+if __name__ == "__main__":
+    # Self-test: build a random sparse source array (random,
+    # random_integers, concatenate, zeros come from the Numeric /
+    # RandomArray star imports) and verify every value reads back.
+    source = []
+    for i in range( 13 ):
+        if random() < 0.5:
+            source = concatenate( ( source, random_integers( 10, 0, 9456 ) ) )
+        else:
+            source = concatenate( ( source, zeros( 8972 ) ) )
+    # Set on target
+    target = BinnedArray( 128, 0, len( source ) )
+    for i in range( len( source ) ):
+        if source[i] > 0:
+            target[i] = source[i]
+    # Verify
+    for i in range( len( source ) ):
+        assert source[i] == target[i], "No match, index: %d, source: %d, target: %d" % ( i, source[i], target[i] )

File bx/bitset.pyx

+cdef extern from "bits.h":
+    # C declarations for the UCSC-style bit-array routines wrapped by
+    # the BitSet class below.
+
+    ctypedef unsigned char Bits
+    # Allocate bits. 
+    Bits * bitAlloc( int bitCount )
+    # Clone bits. 
+    Bits * bitClone(Bits* orig, int bitCount )
+    # Free bits. 
+    void bitFree(Bits **pB)
+    # Set a single bit. 
+    void bitSetOne(Bits *b, int bitIx)
+    # Clear a single bit. 
+    void bitClearOne(Bits *b, int bitIx)
+    # Set a range of bits. 
+    void bitSetRange(Bits *b, int startIx, int bitCount)
+    # Read a single bit. 
+    int bitReadOne(Bits *b, int bitIx)
+    # Count number of bits set in range. 
+    int bitCountRange(Bits *b, int startIx, int bitCount)
+    # Find the index of the the next set bit. 
+    int bitFindSet(Bits *b, int startIx, int bitCount)
+    # Find the index of the the next clear bit. 
+    int bitFindClear(Bits *b, int startIx, int bitCount)
+    # Clear many bits. 
+    void bitClear(Bits *b, int bitCount)
+    # And two bitmaps.  Put result in a. 
+    void bitAnd(Bits *a, Bits *b, int bitCount)
+    # Or two bitmaps.  Put result in a. 
+    void bitOr(Bits *a, Bits *b, int bitCount)
+    # Xor two bitmaps.  Put result in a. 
+    void bitXor(Bits *a, Bits *b, int bitCount)
+    # Flip all bits in a. 
+    void bitNot(Bits *a, int bitCount)
+    ## # Print part or all of bit map as a string of 0s and 1s.  Mostly useful for
+    ## void bitPrint(Bits *a, int startIx, int bitCount, FILE* out)
+
+cdef class BitSet:
+    # Thin wrapper over the bits.h bit array; size fixed at construction.
+    cdef Bits * bits
+    cdef int bitCount
+    
+    def __new__( self, int bitCount ):
+        self.bitCount = bitCount
+        self.bits = bitAlloc( bitCount )
+
+    def __dealloc__( self ):
+        bitFree( & self.bits )
+
+    cdef set( self, int index ):
+        bitSetOne( self.bits, index )
+
+    cdef clear( self, int index ):
+        bitClearOne( self.bits, index )
+
+    cdef set_range( self, int start, int count ):   
+        bitSetRange( self.bits, start, count )
+
+    ## cdef clear_range( self, int start, int count ):
+    ##    bitClearRange( self.bits, start, count )
+
+    cdef int get( self, int index ):
+        return bitReadOne( self.bits, index );
+    
+    cdef count_in_range( self, int start, int count ):
+        return bitCountRange( self.bits, start, count )
+
+    cdef int find_next_set( self, int start ):
+        return bitFindSet( self.bits, start, self.bitCount )
+    
+    cdef int find_next_clear( self, int start ):
+        return bitFindClear( self.bits, start, self.bitCount )
+
+    # NOTE(review): 'and', 'or' and 'not' are Python keywords -- method
+    # names like these are unlikely to compile; confirm against the
+    # Pyrex version in use.
+    cdef and( self, BitSet other ):
+        bitAnd( self.bits, other.bits, self.bitCount )
+        
+    cdef or( self, BitSet other ): 
+        bitOr( self.bits, other.bits, self.bitCount )
+
+    cdef bitXor( self, BitSet other ): 
+        bitXor( self.bits, other.bits, self.bitCount )
+
+    cdef not( self ):
+        bitNot( self.bits, self.bitCount)
+
+    ## ---- Python "Operator Overloading" ----
+        
+    # NOTE(review): special methods declared 'cdef' are not exposed to
+    # Python-level operators -- these presumably need 'def'; confirm.
+    cdef __getitem__( self, int index ):
+        return self.get( index )
+
+    cdef __iand__( self, BitSet other ):
+        self.and( other )
+
+    cdef __ior__( self, BitSet other ):
+        self.or( other )
+
+    cdef __invert__( self ):
+        # NOTE(review): attribute reference only -- nothing is called
+        # (missing parentheses), and 'bitNot' is not a method of this
+        # class; confirm intent.
+        self.bitNot
+        
+        
+
+MAX=512*1024*1024 
+
+import math
+
+class BinnedBitSet:
+    """Sparse bitset: a bin is 0 (nothing set yet) or a BitSet; 1 is
+    treated as all-set on read."""
+    def __init__( self, int granularity=1024, int max_size=MAX ):
+        # NOTE(review): stores the constant MAX, ignoring max_size (same
+        # pattern as BinnedArray in bx/binned_array.py).
+        self.max_size = MAX
+        self.bin_size = int( math.ceil( ( max_size / granularity ) ) )
+        self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
+        self.bins = [ 0 ] * self.nbins
+    def get_bin_offset( self, index ):
+        return int( index / self.bin_size ), index % self.bin_size
+    def init_bin( self, index ):
+        self.bins[index] = BitSet( self.bin_size )
+    def __getitem__( self, key ):
+        bin, offset = self.get_bin_offset( key )
+        if self.bins[bin] == 0:
+            return 0
+        elif self.bins[bin] == 1:
+            return 1
+        else:
+            return self.bins[bin].get( offset )
+    def set( self, index ):
+        bin, offset = self.get_bin_offset( index )
+        # NOTE(review): a bin equal to 1 (all set) is replaced by an
+        # empty BitSet here, losing its bits -- confirm nothing ever
+        # assigns 1 to a bin.
+        if self.bins[bin] == 0 or self.bins[bin] == 1: self.init_bin( bin )
+        self.bins[bin].set( offset )            
+        
+
+# Copyright (C) 2003, 2004 by BiRC -- Bioinformatics Research Center
+#                                     University of Aarhus, Denmark
+#                                     Contact: Thomas Mailund <mailund@birc.dk>
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at
+# your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
+# USA.
+
+"""
+A parser for FASTA files.
+
+Copyright (C) 2003, 2004 by BiRC -- Bioinformatics Research Center
+                                    University of Aarhus, Denmark
+                                    Contact: Thomas Mailund <mailund@birc.dk>
+"""
+
+from __future__ import generators
+
+class MisformedInput:
+    "Exception raised when the input file does not look like a fasta file."
+    # Old-style class used as an exception (Python 2 idiom).
+    pass
+
+class FastaRecord:
+    "Wrapper around a fasta record."
+    def __init__(self, header, sequence):
+        "Create a record with the given header and sequence."
+        self.header = header
+        self.sequence = sequence
+
+    def __str__(self):
+        # Round-trips back to fasta text (sequence on a single line).
+        return '>'+self.header+'\n'+self.sequence
+        
+
+def _fasta_itr_from_file(file):
+    "Provide an iteration through the fasta records in file."
+
+    # First line must be a '>' header.
+    # NOTE(review): an empty file or a blank line raises IndexError
+    # (not MisformedInput) because line[0] is indexed unchecked.
+    h = file.readline()[:-1]
+    if h[0] != '>':
+        raise MisformedInput()
+    h = h[1:]
+
+    seq = []
+    for line in file:
+        line = line[:-1] # remove newline
+
+        if line[0] == '>':
+            # Emit the record accumulated so far, start the next one.
+            yield FastaRecord(h,''.join(seq))
+
+            h = line[1:]
+            seq = []
+            continue
+
+        seq += [line]
+
+    yield FastaRecord(h,''.join(seq))
+
+
+def _fasta_itr_from_name(fname):
+    "Provide an iteration through the fasta records in the file named fname. "
+    # NOTE(review): the file is only closed if the generator is run to
+    # exhaustion.
+    f = open(fname)
+    for rec in _fasta_itr_from_file(f):
+        yield rec
+    f.close()
+
+
+def _fasta_itr(src):
+    """Provide an iteration through the fasta records in file `src'.
+    
+    Here `src' can be either a file object or the name of a file.
+    """
+    # NOTE(review): the builtin `file` type only exists in Python 2.
+    if type(src) == str:
+        return _fasta_itr_from_name(src)
+    elif type(src) == file:
+        return _fasta_itr_from_file(src)
+    else:
+        raise TypeError
+
+def fasta_get_by_name(itr,name):
+    "Return the record in itr with the given name."
+    # Linear scan; consumes the iterator up to (and including) the match.
+    for rec in itr:
+        if rec.header == name:
+            return rec
+    return None
+
+class fasta_itr:
+    "An iterator through a sequence of fasta records."
+    def __init__(self,src):
+        "Create an iterator through the records in src."
+        self.__itr = _fasta_itr(src)
+
+    def __iter__(self):
+        return self
+    def next(self):
+        return self.__itr.next()
+
+    def __getitem__(self,name):
+        # Lookup by record header, e.g. fasta_itr(f)["bar"].
+        return fasta_get_by_name(iter(self),name)
+
+class fasta_slice:
+    """Provide an iteration through the fasta records in file `src', from
+    index `start' to index `stop'.
+
+    Here `src' can be either a file object or the name of a file.
+    """
+    def __init__(self, src, start, stop):
+        """Provide an iteration through the fasta records in file `src', from
+        index `start' to index `stop'.
+
+        Here `src' can be either a file object or the name of a file.
+        """
+        self.__itr = _fasta_itr(src)
+        self.__current = 0
+        self.__start = start
+        self.__stop = stop
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        while self.__current < self.__start:
+            # skip past first records until we get to `start'
+            self.__itr.next()
+            self.__current += 1
+
+        if self.__current >= self.__stop:
+            # stop after `stop'
+            raise StopIteration
+
+        self.__current += 1
+        return self.__itr.next()
+
+    def __getitem__(self,name):
+        # Name lookup restricted to the [start, stop) window.
+        return fasta_get_by_name(iter(self),name)
+
+def get_sequence(src,name):
+    "Return the record in src with the given name."
+    # Convenience wrapper: scans the whole file for a single record.
+    return fasta_itr(src)[name]
+
+
+# TESTING...
+if __name__ == '__main__':
+    # Manual smoke test: expects one argument, a fasta file containing at
+    # least three records, one of them named "bar".
+    import sys
+    if len(sys.argv) != 2:
+        print "wrong programmer error"
+        sys.exit(2)
+
+    print 'iterating through all sequences in input file'
+    for rec in fasta_itr(sys.argv[1]):
+        print rec
+    print
+
+    #print 'input sequences (terminated with ^D)'
+    #for rec in fasta_itr(sys.stdin):
+    #    print rec
+    #print
+
+    print 'iterating through input, from the second sequence'
+    for rec in fasta_slice(sys.argv[1], 1, 3):
+        print rec
+    print
+
+    print 'the sequence for "bar"'
+    print fasta_itr(sys.argv[1])["bar"]
+    print fasta_slice(sys.argv[1],0,3)["bar"]
+    print get_sequence(sys.argv[1],"bar")
+    print
+
+
+    

File bx/interval_index_file.py

+from bisect import *
+from struct import *
+
+__all__ = [ 'Indexes', 'Index' ]
+
+MAGIC=0x2cff800a
+VERSION=0
+
+MIN=0
+MAX=512*1024*1024
+
+BIN_OFFSETS = [ 512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0 ]
+BIN_FIRST_SHIFT = 17
+BIN_NEXT_SHIFT = 3
+
+def bin_for_range( start, end):
+    """Find the smallest bin that can contain interval (start,end)"""
+    # UCSC-style hierarchical binning: walk up the bin levels until the
+    # interval's endpoints land in the same bin.
+    start_bin, end_bin = start, end - 1
+    start_bin >>= BIN_FIRST_SHIFT
+    end_bin >>= BIN_FIRST_SHIFT
+    for offset in BIN_OFFSETS:
+        if start_bin == end_bin: 
+            return offset + start_bin
+        else:
+            start_bin >>= BIN_NEXT_SHIFT
+            end_bin >>= BIN_NEXT_SHIFT
+    # NOTE(review): string exception, and the %d placeholders are never
+    # filled in (missing "% ( start, end )").
+    raise "Interval (%d,%d) out of range"
+         
class Indexes:
    """A set of indexes, each identified by a unique name"""

    def __init__( self, filename=None ):
        self.indexes = dict()
        if filename is not None: self.open( filename )

    def add( self, name, start, end, val ):
        """Add interval (start,end) -> val to the index called `name',
        creating that index on first use."""
        if name not in self.indexes:
            self.indexes[name] = Index()
        self.indexes[name].add( start, end, val ) 

    def get( self, name ):
        """Return the Index for `name', lazily loading it from disk when
        this Indexes was opened from a file.  Raises KeyError for an
        unknown name."""
        if self.indexes[name] is None:
            self.indexes[name] = Index( filename=self.filename, offset=self.offsets[name] )
        return self.indexes[name]

    def find( self, name, start, end ):
        """Find intervals overlapping [start,end) in the index called
        `name'; an unknown name yields no hits rather than an error."""
        # Only swallow the KeyError from an unknown name; the previous bare
        # `except' also hid real I/O and format errors
        try: return self.get( name ).find( start, end )
        except KeyError: return []

    def open( self, filename ):
        """Read the header of an index file, recording each named index's
        offset but deferring loading of the index data until `get'."""
        self.filename = filename
        self.offsets = dict()
        f = open( filename )
        magic, version, length = read_packed( f, ">3I" )
        if magic != MAGIC:
            # String exceptions are illegal; raise a real exception type
            raise ValueError( "File does not have expected header" )
        if version > VERSION:
            # `warn' was previously called but never imported (NameError)
            import warnings
            warnings.warn( "File claims version %d, I don't known anything about versions beyond %d. Attempting to continue" % ( version, VERSION ) )
        for i in range( length ):
            key_len = read_packed( f, ">I" )
            key = f.read( key_len )
            offset = read_packed( f, ">I" )
            # None marks an index whose data has not been loaded yet
            self.indexes[ key ] = None
            self.offsets[ key ] = offset
        f.close()

    def write( self, f ):
        """Serialize all indexes to open file `f': a header naming each
        index and giving its offset, followed by the indexes in key order."""
        keys = self.indexes.keys()
        keys.sort()   
        # First determine the size of the header
        base = calcsize( ">3I" )
        for key in keys:
            key = str( key )
            base += calcsize( ">I" )
            base += len( key )
            base += calcsize( ">I" )
        # Now actually write the header
        write_packed( f, ">3I", MAGIC, VERSION, len( self.indexes ) )
        for key in keys:
            key = str( key )
            # Write the string prefixed by its length (pascal!)
            write_packed( f, ">I", len( key ) )
            f.write( key )
            # Write offset 
            write_packed( f, ">I", base )
            base += self.indexes[key].bytes_required()
        # And finally write each index in order
        for key in keys:
            self.indexes[key].write( f )
+
class Index:
    """Binned index over intervals in [min, max), one list of
    ( start, end, val ) tuples per bin; each interval lives in the smallest
    bin that fully contains it.  May be built in memory (`new') or attached
    to a file (`open'), in which case bins are loaded lazily."""

    def __init__( self, min=MIN, max=MAX, filename=None, offset=0 ):
        if filename is None: 
            self.new( min, max )
        else: 
            self.open( filename, offset )

    def new( self, min, max ):
        """Create an empty index for intervals in the range min, max"""
        # Ensure the range will fit given the shifting strategy
        assert MIN <= min <= max <= MAX
        self.min = min
        self.max = max
        # Determine the largest bin we will actually use
        self.bin_count = bin_for_range( max - 1, max ) + 1
        # Create empty bins
        self.bins = [ [] for i in range( self.bin_count ) ]

    def open( self, filename, offset ):
        """Attach to an on-disk index located at byte `offset' of
        `filename'; bin contents are not read until first needed."""
        self.filename = filename
        self.offset = offset
        # Open the file and seek to where we expect our header
        f = open( filename )
        f.seek( offset )        
        # Read min/max
        min, max = read_packed( f, ">2I" )
        self.new( min, max )
        # Read bin indexes
        self.bin_offsets = []
        self.bin_sizes = []
        for i in range( self.bin_count ):
            o, s = read_packed( f, ">2I" )
            self.bin_offsets.append( o )
            self.bin_sizes.append( s )
        # Initialize bins to None, indicating that they need to be loaded
        self.bins = [ None for i in range( self.bin_count ) ]

    def add( self, start, end, val ):
        """Add the interval (start,end) with associated value val to the index"""
        insort( self.bins[ bin_for_range( start, end ) ], ( start, end, val ) )

    def find( self, start, end ):
        """Return a sorted list of ( start, end, val ) tuples for every
        stored interval overlapping [start, end)."""
        rval = []
        start_bin = ( max( start, self.min ) ) >> BIN_FIRST_SHIFT
        end_bin = ( min( end, self.max ) - 1 ) >> BIN_FIRST_SHIFT
        # Walk each level of the bin hierarchy, coarsening the bin range
        for offset in BIN_OFFSETS:
            for i in range( start_bin + offset, end_bin + offset + 1 ):
                if self.bins[i] is None: self.load_bin( i )
                # Iterate over bin and insert any overlapping elements into return value
                for el_start, el_end, val in self.bins[i]:
                    if el_start < end and el_end > start:
                        insort_right( rval, ( el_start, el_end, val ) )
            start_bin >>= BIN_NEXT_SHIFT
            end_bin >>= BIN_NEXT_SHIFT
        return rval

    def iterate( self ):
        """Yield every stored ( start, end, val ) tuple, bin by bin."""
        for i in range( self.bin_count ):
            if self.bins[i] is None: self.load_bin( i )
            for entry in self.bins[i]:  yield entry

    def load_bin( self, index ):
        """Read the contents of bin `index' from the attached file."""
        bin = []
        f = open( self.filename )
        f.seek( self.bin_offsets[index] )
        # One big read for happy NFS
        item_size = calcsize( ">3I" )
        buffer = f.read( self.bin_sizes[index] * item_size )
        for i in range( self.bin_sizes[index] ):
            bin.append( unpack( ">3I", buffer[ i*item_size : (i+1)*item_size ] ) )
        self.bins[index] = bin
        f.close()

    def write( self, f ):
        """Serialize this index to open file `f': min/max header, bin
        offset/size table, then the bin contents.
        NOTE(review): assumes every bin is materialized (not None) -- an
        index opened from disk must be fully loaded before writing."""
        # Write min/max
        write_packed( f, ">2I", self.min, self.max )
        # Write table of bin sizes and offsets
        base = f.tell() + self.bin_count * calcsize( ">2I" )
        for bin in self.bins:
            write_packed( f, ">2I", base, len( bin ) )
            base += len( bin ) * calcsize( ">3I" )
        # Write contents of each bin
        for bin in self.bins:
            for start, end, val in bin:
                write_packed( f, ">3I", start, end, val )

    def bytes_required( self ):
        """Number of bytes this index will occupy when written by `write'."""
        rval = calcsize( ">2I" )
        rval += self.bin_count * calcsize( ">2I" )
        for bin in self.bins:
            rval += len( bin ) * calcsize( ">3I" )
        return rval 
+
def write_packed( f, pattern, *vals ):
    """Pack `vals' according to struct format `pattern' and write the
    resulting bytes to file `f'."""
    data = pack( pattern, *vals )
    f.write( data )
+
def read_packed( f, pattern ):
    """Read exactly calcsize(pattern) bytes from `f' and unpack them.
    A single value is returned bare rather than as a 1-tuple."""
    values = unpack( pattern, f.read( calcsize( pattern ) ) )
    if len( values ) == 1:
        return values[0]
    return values
+
+

File bx/intervals/__init__.py

+
+__all__ = [ "Interval", "IntervalWithValue", "Intersecter" ]
+
+class Interval( object ):
+    """Basic interval, any object with start and end properties will work as well"""
+    def __init__( self, start, end, value=None ):
+        self.start = start
+        self.end = end
+        self.value = value
+    def __cmp__( self, other ):
+        return cmp( self.start, other.start ) or cmp( self.end, other.end )
+    def __repr__( self ):
+        if self.value:
+            return "IntervalWithValue( %d, %d, %s )" % ( self.start, self.end, repr( self.value ) )
+        else:
+            return "Interval( %d, %d )" % ( self.start, self.end )
+
class Intersecter( object ):
    """Data structure for performing window intersect queries on a set of 
       intervals. Algorithm details naively copied from Scott Schwartz's 
       'nxrpts.c'.

       Usage
       =====
       
       >>> intersecter = Intersecter()

       Add intervals, the only requirement is that the interval have integer
       start and end attributes:

       >>> intersecter.add_interval( Interval( 0,  10 ) )
       >>> intersecter.add_interval( Interval( 3,  7 ) )
       >>> intersecter.add_interval( Interval( 3,  40 ) )
       >>> intersecter.add_interval( Interval( 10, 50 ) )

       Perform queries:

       >>> intersecter.find( 2, 5 )
       [Interval( 3, 40 ), Interval( 0, 10 ), Interval( 3, 7 )]
       >>> intersecter.find( 10, 100 )
       [Interval( 3, 40 ), Interval( 10, 50 ), Interval( 0, 10 )]
       >>> intersecter.find( 100, 200 )
       []
       
       """

    # Sub-ranges smaller than this are scanned linearly instead of being
    # partitioned further
    THRESH = 4 

    # ---- Basic API ----------------------------------------------------------

    def __init__( self ):
        """Initialize"""
        self.intervals = []
        # dirty marks that prepare() must run before the next query
        self.dirty = True

    def add_interval( self, interval ):
        """Add an interval to the stored set"""
        assert interval.start < interval.end, "Intervals must have length >= 1" 
        self.dirty = True
        self.intervals.append( interval )

    def add( self, start, end, value=None ):    
        """Convenience: wrap ( start, end, value ) in an Interval and add it."""
        self.add_interval( Interval( start, end, value ) )
        
    def find( self, start, end ):
        """Return a list of all stored intervals intersecting [start,end)"""
        result = []
        self.find_func( 0, len( self.intervals ), start, end, result.append )
        return result

    # ---- Preprocessing ------------------------------------------------------

    def prepare( self ):
        """Build the sorted index arrays and the recursive partition used by
        queries; runs automatically on the first query after a change."""
        n_intervals = len( self.intervals )
        # starts is filled with the indices of intervals sorted by start position
        self.starts = range( 0, n_intervals )
        self.starts.sort( lambda a, b: cmp( self.intervals[ a ].start, self.intervals[ b ].start ) )
        # ends is filled with the indices of intervals sorted by end position
        self.ends = range( 0, n_intervals )
        self.ends.sort( lambda a, b: cmp( self.intervals[ a ].end, self.intervals[ b ].end ) )
        # centers gets filled my partition, initialize it to zeros
        self.centers = [0] * n_intervals
        # Partition recursively
        self.partition( 0, n_intervals )
        self.check_partition( 0, n_intervals )
        # Ready for queries
        self.dirty = False

    def partition( self, lo, hi ):
        """Recursively partition [lo,hi) around the median end position
        `mid': intervals ending below it, intervals crossing it, and
        intervals starting above it (see check_partition for the exact
        invariants).  The crossing sub-range [p,q) is recorded in
        self.centers at positions center-1 and center."""
        if hi - lo < Intersecter.THRESH: return
        center = ( lo + hi ) // 2
        mid = self.intervals[ self.ends[ center ] ].end
        # self.ends[lo:center] is correct, separate away self.ends[q:hi)
        q = center + 1
        g = lo
        for i in range( q, hi ):
            if self.intervals[ self.ends[ i ] ].start > mid:
                self.centers[ g ] = self.ends[ i ]
                g += 1
            else:
                self.ends[ q ] = self.ends[ i ]
                q += 1
        for i in range( q, hi ):
            self.ends[ i ] = self.centers[ lo - q + i ]
        # self.starts[q:hi) is correct, separate away self.starts[lo:p)
        p = q
        g = lo
        i = p - 1
        while i >= lo:
            if self.intervals[ self.starts[ i ] ].end < mid:
                self.centers[ g ] = self.starts[ i ]
                g += 1
            else:
                p -= 1
                self.starts[ p ] = self.starts[ i ]
            i -= 1
        for i in range( lo, p ):
            self.starts[ i ] = self.centers[ p + lo - 1 - i ]
        # Center
        self.centers[ center - 1 ] = p
        self.centers[ center ] = q
        # Recurse
        self.partition( lo, p )
        self.partition( q, hi ) 

    # ---- Find implementation ------------------------------------------------

    def find_func( self, lo, hi, start, end, report_func ):
        """For each stored interval i that intersects [start, end) call report_func( i )"""
        if self.dirty: self.prepare() 
        if hi - lo < Intersecter.THRESH: 
            self.small_find( lo, hi, start, end, report_func )
            return
        p, q, m = self.parts( lo, hi )
        if start > m:
            # Query lies entirely right of the partition point m
            j = q - 1
            while j >= p and self.intervals[ self.ends[ j ] ].end >= start:
                report_func( self.intervals[ self.ends[ j ] ] )
                j -= 1
            self.find_func( q, hi, start, end, report_func )
        elif end < m:
            # Query lies entirely left of the partition point m
            j = p
            while j < q and self.intervals[ self.starts[ j ] ].start <= end:
                report_func( self.intervals[ self.starts[ j ] ] )
                j += 1
            self.find_func( lo, p, start, end, report_func )
        else:
            # Query spans m: every interval crossing m matches
            for j in range( p, q ):
                report_func( self.intervals[ self.ends[ j ] ] )
            self.left_find( lo, p, start, end, report_func )
            self.right_find( q, hi, start, end, report_func )
   
    def small_find( self, lo, hi, start, end, report_func ):
        """Linear-scan fallback for sub-ranges smaller than THRESH."""
        for i in range( lo, hi ):
            interval = self.intervals[ self.starts[ i ] ]
            if start <= interval.end and interval.start <= end:
                report_func( interval )

    def left_find( self, lo, hi, start, end, report_func ):
        """Iterative descent over a sub-range left of the query's spanning
        point, reporting intervals that still reach the query."""
        while hi - lo >= Intersecter.THRESH:
            p, q, m = self.parts( lo, hi )
            if start > m:
                j = q - 1
                while j >= p and self.intervals[ self.ends[ j ] ].end >= start:
                    report_func( self.intervals[ self.ends[ j ] ] )
                    j -= 1
                lo = q
            else:
                for j in range( p, hi ):
                    report_func( self.intervals[ self.ends[ j ] ] )
                hi = p
        self.small_find( lo, hi, start, end, report_func )

    def right_find( self, lo, hi, start, end, report_func ):
        """Mirror image of left_find for a sub-range right of the query."""
        while hi - lo >= Intersecter.THRESH:
            p, q, m = self.parts( lo, hi )
            if end < m:
                j = p
                while j < q and self.intervals[ self.starts[ j ] ].start <= end:
                    report_func( self.intervals[ self.starts[ j ] ] )
                    j += 1
                hi = p
            else:
                for j in range( lo, q ):
                    report_func( self.intervals[ self.starts[ j ] ] )
                lo = q
        self.small_find( lo, hi, start, end, report_func )
            
    def parts( self, lo, hi ):
        """Return ( p, q, m ) for sub-range [lo,hi): the crossing range
        [p,q) recorded by partition() and the partition end position m."""
        center = ( lo + hi ) // 2
        p = self.centers[ center - 1 ]
        q = self.centers[ center ]
        m = self.intervals[ self.ends[ center ] ].end
        return p, q, m

    # ---- Testing ------------------------------------------------------------

    def check_partition( self, lo, hi ):
        """Assert the invariants that partition() is supposed to establish."""
        if hi - lo < Intersecter.THRESH:
            for i in range( lo, hi - 1 ):
                assert self.intervals[ self.starts[ i ] ].start <= self.intervals[ self.starts[ i + 1 ] ].start
                assert self.intervals[ self.ends[ i ] ].end <= self.intervals[ self.ends[ i + 1 ] ].end
        else:
            p, q, m = self.parts( lo, hi )
            for i in range( lo, p ): 
                assert self.intervals[ self.ends[ i ] ].end < m
            for i in range( q, hi ): 
                assert self.intervals[ self.starts[ i ] ].start > m
            for i in range( p, q ): 
                assert self.intervals[ self.starts[ i ] ].start <= m
                assert self.intervals[ self.ends[ i ] ].end >= m
            self.check_partition( lo, p )
            self.check_partition( q, hi )
+
+# ---- Doctest ----------------------------------------------------------------
+
def _test():
    """Run the doctests in this module (e.g. the Intersecter usage example)."""
    import doctest, intervals
    return doctest.testmod( intervals )

if __name__ == "__main__":
    _test()
+

File bx/matrix_io.py

+"""
+Functions for reading and writing 2d matrices in the format used
+by SVDPACK / SVDLIBC and also easily parsed by R
+"""
+
+from numarray import *
+
+import struct
+
# File header: two network-order (big-endian) longs giving (nrow, ncol)
header_pattern = "!ll" 
header_pattern_size = struct.calcsize( header_pattern )
+
def write_dense_binary( a, f ):
    """Write the 2d array `a` to the file `f` in dense binary format:
    a header of two network-order longs (nrow, ncol) followed by every
    entry, row by row, as network-order floats."""
    assert len( a.shape ) == 2, "Only 2 dimensional arrays allowed"
    nrow, ncol = a.shape
    # Write shape first as two longs
    f.write( struct.pack( header_pattern, nrow, ncol) )    
    # Write each entry as a float. array.tofile does not allow control of byte order
    # (removed an unused row_pat_size local that was computed but never read)
    row_pat = "!%df" % ncol
    for i in range( nrow ):
        f.write( struct.pack( row_pat, *a[i] ) )
+   
def read_dense_binary( f ):
    """Read a 2d Float32 array (as written by write_dense_binary) from
    file `f` and return it.

    Raises IOError if the file ends before nrow*ncol values are read."""
    nrow, ncol = struct.unpack( header_pattern, f.read( header_pattern_size ) )
    # 4 bytes per 32 bit float entry
    nbytes = nrow * ncol * 4 
    buf  = f.read( nbytes )
    # String exceptions are illegal; raise a real exception on a short read
    if len( buf ) != nbytes:
        raise IOError( "Didn't read expected number of bytes" )
    # fromstring builds the array directly from the buffer, so the zeros()
    # scratch array the original allocated (and never used) is gone
    return fromstring( buf, Float32, ( nrow, ncol ) )    
+
class DBRowReader( object ):
    """Read a 2d Float32 array from file `f` and return it row by row"""
    def __init__( self, f ):
        self.f = f
        # Header: two network-order longs giving the matrix dimensions
        self.nrow, self.ncol = struct.unpack( header_pattern, f.read( header_pattern_size ) )
        # Pre-computed struct pattern (and size) for one row of floats
        self.row_pat = "!%df" % self.ncol
        self.row_pat_size = struct.calcsize( self.row_pat )
        self.rows_read = 0
    def close( self ):
        """Close the underlying file."""
        self.f.close()
    def next( self ):
        """Return the next row as an array; StopIteration after nrow rows."""
        if self.rows_read >= self.nrow: raise StopIteration
        self.rows_read += 1
        return array( struct.unpack( self.row_pat, self.f.read( self.row_pat_size ) ) )
    def __iter__( self ): return self
+
class DBRowWriter( object ):
    """Write a 2d Float32 array of known shape (nrow x ncol) to file `f`
    row by row, in the same dense binary format as write_dense_binary."""
    def __init__( self, nrow, ncol, f ):
        self.nrow = nrow
        self.ncol = ncol
        self.f = f
        # Pre-computed struct pattern (and size) for one row of floats
        self.row_pat = "!%df" % self.ncol
        self.row_pat_size = struct.calcsize( self.row_pat )
        self.rows_written = 0
        # Write shape first as two longs
        self.f.write( struct.pack( header_pattern, self.nrow, self.ncol) )    
    def write( self, row ):
        """Write one row; its length must be exactly ncol."""
        # Message fixed: the original said "Too many columns" even when the
        # row was too short
        assert len( row ) == self.ncol, "Wrong number of columns in row"
        assert self.rows_written < self.nrow, "Too many rows written"
        # Write each entry as a float. array.tofile does not allow control of byte order
        self.f.write( struct.pack( self.row_pat, *row ) )
        self.rows_written += 1
    def close( self ):
        """Verify exactly nrow rows were written, then close the file."""
        assert self.rows_written == self.nrow
        self.f.close()
+
def test():
    """Round-trip a random 500x300 matrix through the dense binary format
    and verify the result matches the input."""
    import numarray.random_array, StringIO
    f = StringIO.StringIO()
    a = numarray.random_array.random( ( 500, 300 ) )
    write_dense_binary( a, f )
    f.seek( 0 )
    b = read_dense_binary( f )
    # allclose rather than exact equality: values pass through 32 bit floats
    assert allclose( a, b )
    


if __name__ == "__main__": test()
import bz2, gzip

def open_compressed( filename, mode='r' ):
    """Open `filename', transparently decompressing based on its extension
    (".bz2" or ".gz"); any other name is opened as a plain file."""
    if filename.endswith( ".bz2" ):
        opener = bz2.BZ2File
    elif filename.endswith( ".gz" ):
        opener = gzip.GzipFile
    else:
        opener = file
    return opener( filename, mode )

File bx/seq/__init__.py

Empty file added.

File bx/seq/nib.py

+from __future__ import division
+
+import sys, struct, string, math
+
NIB_MAGIC_NUMBER = 0x6BE93D3A
NIB_MAGIC_NUMBER_SWAP = 0x3A3DE96B
NIB_MAGIC_SIZE = 4
NIB_LENGTH_SIZE = 4
# Nibble value -> base character; values 8-15 are the lower-case variants
NIB_I2C_TABLE = "TCAGNXXXtcagnxxx"

READ_CHUNK_SIZE=1024*1024

class NibFile( object ):
    """Reader for UCSC ".nib" files: a 4 byte magic number, a 4 byte
    sequence length, then the sequence packed two bases per byte."""
    def __init__( self, file ):
        self.byte_order = ">" 
        magic = struct.unpack( ">L", file.read( NIB_MAGIC_SIZE ) )[0]
        if magic != NIB_MAGIC_NUMBER: 
            # A byte-swapped magic number means the file was written with
            # the opposite endianness
            if magic == NIB_MAGIC_NUMBER_SWAP: self.byte_order = "<"  
            # String exceptions are illegal; raise a real exception type
            else: raise ValueError( "Not a NIB file" )
        self.file = file
        self.magic = magic
        self.length = struct.unpack( "%sL" % self.byte_order, file.read( NIB_LENGTH_SIZE ) )[0]
    def get( self, start, length ):
        """Return `length' bases starting at 0-based position `start' as a string."""
        assert start >= 0
        assert start + length - 1 < self.length
        # Read block of bytes containing sequence
        block_start = int( math.floor( start / 2 ) )
        block_end = int( math.floor( ( start + length - 1 ) / 2 ) )
        block_len = block_end + 1 - block_start
        self.file.seek( NIB_MAGIC_SIZE + NIB_LENGTH_SIZE + block_start )
        result = []
        raw = self.file.read( block_len )
        data = struct.unpack( "%s%dB" % ( self.byte_order, block_len ), raw )
        # Translate to character representation
        for value in data:
            result.append( NIB_I2C_TABLE[ ( value >> 4 ) & 0xF ] )
            result.append( NIB_I2C_TABLE[ ( value >> 0 ) & 0xF ] )
        # Trim if start / end are odd 
        if start & 1: del result[ 0 ]
        if ( start + length ) & 1: del result[ -1 ]
        # Return as string
        return string.join( result, '' )

File bx/seq/test.fa

+>h
+TGGAGGCATTTGTGATTCAATAGATGCAGAAAGAAACCTTCCTAGAGCTGGCGTTCTCTAACTAAAAGTGGAAAGTTCTG
+AGGAATGAGGACTGTTATAAATCCCACCCCACACCGCACCTTCTCCAGGGAAGTTTCATGGCCGTGAAGAGGACAGAAAG
+TGAGAACCAAGATggaactgaataaacaagcttcacactgttagtttccccat
+atgcttaccttcccacagatgccaaccttggaggcctaagaggcctagaatattatcctttgtctgatcatttctctaca
+aatttattgttctttgttaagatgctacataagcccaaattctaaccacccctttgagttacccatcatcaagtttc
+tcccatgtg

File bx/seq/test.nib

Binary file added.

File bx/seq/test_nib.py

+import unittest
+import nib
+
+# Same sequence data as stored in test.nib
+
+test_seq = "TGGAGGCATTTGTGATTCAATAGATGCAGAAAGAAACCTTCCTAGAGCTGGCGTTCTCTAACTAAAAGTGGAAAGTTCTGAGGAATGAGGACTGTTATAAATCCCACCCCACACCGCACCTTCTCCAGGGAAGTTTCATGGCCGTGAAGAGGACAGAAAGTGAGAACCAAGATggaactgaataaacaagcttcacactgttagtttccccatatgcttaccttcccacagatgccaaccttggaggcctaagaggcctagaatattatcctttgtctgatcatttctctacaaatttattgttctttgttaagatgctacataagcccaaattctaaccacccctttgagttacccatcatcaagtttctcccatgtg"
+
+test_seq_len = len( test_seq )
+
class NIBTestCase( unittest.TestCase ):
    """Exercises NibFile.get against the known reference sequence.
    Requires the data file "test.nib" in the current working directory."""

    def test_get( self ):
        nibfile = nib.NibFile( file( "test.nib" ) )
        # Try all combinations of even / odd boundaries
        do_test_get( nibfile, 0, 10 )
        do_test_get( nibfile, 1, 10 )
        do_test_get( nibfile, 0, 11 )
        do_test_get( nibfile, 1, 11 )
        # Test near end of file also
        do_test_get( nibfile, test_seq_len - 10, 10 )
        do_test_get( nibfile, test_seq_len - 11, 11 )
+
def do_test_get( nibfile, start, length ):
    """Assert that nibfile.get( start, length ) matches the reference
    sequence.  (Parameter renamed from `len', which shadowed the builtin.)"""
    assert nibfile.get( start, length ) == test_seq[start:start+length]

if __name__ == "__main__": unittest.main()

File bx/seq_numarray.py

+#!/usr/bin/env python2.3
+
+from numarray import *
+from numarray.ieeespecial import *
+from cookbook.attribute import *
+
+import string
+
class SeqTranslater( object ):
    """Encapsulates a mapping from chars to integers"""
    # The docstring must be the *first* statement of the class body to
    # become __doc__; previously it followed the attribute() call and was
    # silently discarded
    attribute( table=None, base=0 )
    def __init__( self, mapping ):
        """Initialize translation table from a list in which the 
           characters of each list item map to the same number"""
        # 256-entry char-code -> value table; -1 marks an invalid character
        table = zeros( 256 ) - 1
        reverse_mapping = []
        for value, chars in enumerate( mapping ):
            for ch in chars:
                put( table, array( ch, 'b' ), value )
            # First character of each group is the canonical one for reversal
            reverse_mapping.append( chars[ 0 ] )
        self.table = table 
        self.reverse_mapping = reverse_mapping
        self.base = len( mapping )
    def translate( self, seq ):
        """Convert a character sequence to a single integer array"""
        int_seq = take( self.table, array( seq, 'b' ) )
        # String exceptions are illegal; raise a real exception type
        if -1 in int_seq: raise ValueError( "Invalid character in sequence" )
        return int_seq
    def translate_alignment( self, seqs ):   
        """Convert the rows of a multiple alignment to a single integer array"""
        if len( seqs ) < 1: return None
        rval = zeros( len( seqs[ 0 ] ) )
        # Each row contributes one base-`base' digit of the combined value
        factor = 1
        for seq in seqs:
            seq_ints = self.translate( seq )
            rval += ( seq_ints * factor )
            factor *= self.base
        return rval
    def translate_alignment_column( self, col, allow_invalid=False ):
        """Encode one alignment column (a sequence of chars) as a single
        integer; with allow_invalid, returns -1 instead of raising."""
        value = 0
        factor = 1
        for ch in col:
            row_value = self.table[ int( array( ch, 'b' )[0] ) ]
            if row_value == -1: 
                if allow_invalid: return -1
                else: raise ValueError( "Invalid character" )
            value += ( row_value * factor )
            factor *= self.base
        return value
    def reverse_alignment_column( self, align_count, value ):
        """Decode an integer from translate_alignment_column back into a
        column string of `align_count' characters."""
        col = []
        for i in range( 0, align_count ):
            # Floor division (identical to `/' on Python 2 ints) extracts
            # digit i of `value' in base `base'
            index = ( value // self.base ** i ) % self.base 
            col.append( self.reverse_mapping[ index ] )
        return string.join( col, '' )

DNA = SeqTranslater( ( 'Aa', 'Cc', 'Gg', 'Tt', '-NnBb' ) )

File bx/wiggle.py

+#!/usr/bin/env python
+
def parse_header( line ):
    """Parse a wiggle declaration line ("variableStep ..."/"fixedStep ...")
    into a dict of key -> value strings; the leading line-type token is
    skipped."""
    fields = line.split()[1:]
    pairs = [ field.split( '=' ) for field in fields ]
    return dict( pairs )
+
def Reader( f ):
    """Iterate over a wiggle file yielding ( chrom, position, value ) for
    every individual base covered.  Handles BED-style 4 column lines as
    well as `variableStep' and `fixedStep' declaration blocks."""

    current_chrom = None
    current_pos = None
    current_step = None

    # Data lines are treated as BED-style until a declaration says otherwise
    mode = "bed"

    for line in f:
        if line.startswith( "track" ) or line.startswith( "#" ) or line.isspace():
            continue
        elif line.startswith( "variableStep" ):
            header = parse_header( line )
            current_chrom = header['chrom']
            current_pos = None
            current_step = None
            # `span' is the number of bases each value covers
            if 'span' in header: current_span = int( header['span'] )
            else: current_span = 1
            mode = "variableStep"
        elif line.startswith( "fixedStep" ):
            header = parse_header( line )
            current_chrom = header['chrom']
            current_pos = int( header['start'] )
            current_step = int( header['step'] )
            if 'span' in header: current_span = int( header['span'] )
            else: current_span = 1
            mode = "fixedStep"
        elif mode == "bed":
            fields = line.split()
            chrom = fields[0]
            start = int( fields[1] )
            end = int( fields[2] )
            val = float( fields[3] )
            for i in range( start, end ):
                yield chrom, i, val
        elif mode == "variableStep": 
            fields = line.split()
            pos = int( fields[0] )
            val = float( fields[1] )
            for i in range( current_span ):
                yield current_chrom, pos+i, val
        elif mode == "fixedStep":
            val = float( line.split()[0] )
            pos = current_pos
            for i in range( current_span ):
                yield current_chrom, pos+i, val
            # Successive fixedStep values are `step' bases apart; `span' only
            # controls how many bases each value covers (previously advanced
            # by span, which is wrong whenever span != step)
            current_pos += current_step
        else:
            # String exceptions are illegal; raise a real exception type
            raise ValueError( "Unexpected input line: %s" % line.strip() )

File find_in_sorted_file.py

+#!/usr/bin/env python2.3
+
+import sys
+
+max_cats = 1000
+
class Finder:
    """Index a sorted, whitespace-delimited file of "position score" lines
    for fast range queries by sampling `segments - 1' evenly spaced byte
    offsets."""
    def __init__( self, file, segments ):
        self.file = file
        self.segments = segments
        self.make_index()
    def make_index( self ):
        """Sample the file at evenly spaced offsets, recording the first
        complete line's position value and byte offset at each sample.
        (Previously operated on the module-level global `file' rather than
        self.file, so the class only worked inside this script.)"""
        self.values = []
        self.positions = []
        
        self.file.seek( 0, 2 )
        end = self.file.tell()

        # Floor division: `/' would yield a float under true division
        step = end // ( self.segments - 1 )

        for i in range( 0, self.segments - 1 ):
            self.file.seek( i * step, 0 )
            # Discard a (likely partial) line so we start on a line boundary
            self.file.readline()
            position = self.file.tell()
            fields = self.file.readline().split()
            self.values.append( int( fields[ 0 ] ) )
            self.positions.append( position )

    def scores_in_range( self, start, end ):
        """Return [ ( position, score_string ) ] for all lines whose
        position falls in the closed range [start, end]."""
        # Seek to the last sampled offset known to be at or before `start'
        position = self.positions[ -1 ]
        for i in range( 1, len( self.values ) ):
            if self.values[ i ] > start:
                position = self.positions[ i - 1 ]
                break
        self.file.seek( position, 0 )
        result = []
        while 1:
            line = self.file.readline()
            if line == "": break
            fields = line.split()

            pos = int( fields[ 0 ] )

            if pos < start: continue
            if pos > end: break

            result.append( ( pos, fields[1] ) )

        return result
+
# Script entry: usage is <sorted_file> <start> <end>.  Prints the
# (position, score) pairs in the range, thinned so that at most ~max_cats
# positions are emitted.
file = open( sys.argv[ 1 ] )

finder = Finder( file, 100 )

scores = finder.scores_in_range( int( sys.argv[2] ), int( sys.argv[3] ) )

# NOTE(review): raises IndexError when no scores fall in the range --
# confirm whether an empty result should be handled gracefully
rng = scores[-1][0] - scores[0][0]

if rng > max_cats:
    stride = rng // max_cats
else:
    stride = 1

# Thin the output by keeping only positions divisible by the stride
for score in scores:
    if score[0] % stride == 0:
        print score[0], score[1]

File int_seqs_to_char_strings.py

+#!/usr/bin/env python2.3
+
+from itertools import *
+
+import sys
+
# Output alphabet: digits then letters.  NOTE(review): the digit '9' is
# absent from this table -- confirm that is intentional.
table = "012345678ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

def main():
    """Read whitespace-separated integer sequences from stdin and print
    each as a compact string, mapping integer i to table[i].  Blank lines
    are skipped; a ValueError is raised if any integer has no table entry."""
    for line in sys.stdin:
        ints = [ int( f ) for f in line.split() ]
        # Skip blank lines instead of crashing on max() of an empty list
        if not ints: continue
        # `>=': index len(table) is already out of range (was `>', an
        # off-by-one that let an IndexError through); also raise a real
        # exception instead of an illegal string exception
        if max( ints ) >= len( table ):
            raise ValueError( "Alphabet size too large!" )
        print( str.join( '', [ table[i] for i in ints ] ) )

if __name__ == "__main__": main()

File interval_count_intersections.py

+#!/usr/bin/env python
+
+"""
+Read two lists of intervals (with chromosomes) and count the number of entries
+in the second set that intersect any entry in the first set.
+"""
+
+from __future__ import division
+
+import psyco_full
+
+from bx import align.maf
+from bx import intervals
+from bx import misc
+import string
+import sys
+
def main():
    """Count entries of the second interval file (sys.argv[2]) that
    intersect at least one interval of the first file (sys.argv[1])."""

    # One Intersecter per chromosome
    intersecters = {}

    # Read ranges

    for chr, start, end in read_intervals( misc.open_compressed( sys.argv[1] ) ):
        if not intersecters.has_key( chr ): intersecters[ chr ] = intervals.Intersecter()
        intersecters[ chr ].add_interval( intervals.Interval( start, end ) )

    # Count intersection

    total = 0

    for chr, start, end in read_intervals( misc.open_compressed( sys.argv[2] ) ):
        if intersecters.has_key( chr ):
            intersection = intersecters[ chr ].find( start, end )
            if intersection: 
                #print chr, intersection
                # Count each query interval once, however many hits it has
                total += 1

    print total
+
def read_intervals( input ):
    """Yield ( chrom, start, end ) for each whitespace-delimited line of
    `input'; columns beyond the third are ignored."""
    for line in input:
        parts = line.split()
        yield parts[0], int( parts[1] ), int( parts[2] )
+
+main()

File line_select.py

+#!/usr/bin/env python2.3
+
+"""
+Read a feature file containing a 0 or 1 on each line, output 
+all lines from stdin for which the feature file is 1
+
+usage: %prog feature_file < ...
+"""
+
+import psyco_full
+
+import sys
+from bx import align.maf
+
def __main__():
    """Filter stdin: print only the lines whose corresponding entry in the
    feature file equals `match' (optional second argument, default 1)."""

    feature_file = sys.argv[1]

    if len( sys.argv ) > 2:
        match = int( sys.argv[2] )
    else:
        match = 1
    
    # One integer (0/1 or category) per input line
    feature_vector = [ int( line ) for line in file( feature_file ) ]

    # NOTE(review): raises IndexError if stdin has more lines than the
    # feature file -- confirm the two inputs are guaranteed to be parallel
    for index, line in enumerate( sys.stdin ):
        if feature_vector[ index ] == match: print line,

if __name__ == "__main__": __main__()

File maf_build_index.py

+#!/usr/bin/env python2.3
+
+"""
+Build an index file for a set of MAF alignment blocks.
+
+If index_file is not provided maf_file.index is used.
+
+usage: %prog maf_file index_file
+"""
+
+import psyco_full
+
+import cookbook.doc_optparse
+
+from bx import interval_index_file
+import sys
+
+from bx import align.maf
+
def main():
    """Build an interval index for a MAF file: for every component of every
    block, record (src, forward-strand start/end) -> byte offset of the
    block, then write the index to maf_file + ".index" (or the explicit
    second argument)."""

    # Parse command line

    options, args = cookbook.doc_optparse.parse( __doc__ )

    try:
        # Use the positional arguments the parser produced rather than raw
        # sys.argv, so consumed options cannot shift the file names
        maf_file = args[0]
        if len( args ) > 1: index_file = args[1]
        else: index_file = maf_file + ".index" 
    except IndexError:
        # Missing required argument: show usage (was a bare except, which
        # also hid unrelated errors)
        cookbook.doc_optparse.exit()

    maf_reader = align.maf.Reader( open( maf_file ) )

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right
    while 1:
        pos = maf_reader.file.tell()
        block = maf_reader.next()
        if block is None: break
        for c in block.components:
            indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos )

    out = open( index_file, 'w' )
    indexes.write( out )
    out.close()

if __name__ == "__main__": main()
+#!/usr/bin/env python2.3
+
+"""
+Chops alignments in a MAF file to piece of a specified length. A random set of
+non overlapping chunks of exactly the specified chop length will be produced
+"""
+
+import sys
+
+import sys, random
+from bx import align.maf
+from optparse import OptionParser
+
def main():
    """Read a MAF from stdin, chop each block into chunks of the requested
    length (-l/--length, default 100), and write the chunks to stdout."""

    # Parse command line arguments

    parser = OptionParser()
    parser.add_option( "-l", "--length", action="store", type="int", default=100, help="" )

    ( options, args ) = parser.parse_args()

    length = options.length
    maf_reader = align.maf.Reader( sys.stdin )
    maf_writer = align.maf.Writer( sys.stdout )

    for m in maf_reader:
        for chopped in chop( m, length ):
            maf_writer.write( chopped )
+
def chop( m, length ):
    """Cut alignment `m` into non-overlapping slices of exactly `length` columns.

    The maf_length % length leftover columns are discarded at randomly
    chosen gaps before/between/after the chunks, so chunk positions are
    randomized but never overlap.  Slices in which any component ends up
    empty (see check_len) are dropped from the result.
    """
    maf_length = m.text_size
    chunk_count = maf_length // length
    lost_bases = maf_length % length
    # Distribute the leftover columns randomly over the chunk_count + 1 gaps.
    skip_amounts = [0] * ( chunk_count + 1 )
    for i in range( 0, lost_bases ): skip_amounts[ random.randrange( 0, chunk_count + 1 ) ] += 1
    start = 0
    rval = []
    for i in range( 0, chunk_count ):
        start += skip_amounts[ i ]
        # Slice once and reuse it -- the original sliced the same region twice.
        n = m.slice( start, start + length )
        if check_len( n ): rval.append( n )
        start += length
    return rval
+
def check_len( a ):
    """Return False when any component of alignment `a` is empty, else True."""
    result = True
    for component in a.components:
        if component.size == 0:
            result = False
            break
    return result
+    
+
+if __name__ == "__main__": main()

File maf_chunk.py

+#!/usr/bin/env python2.3
+
+"""
+Read a MAF from stdin and break into a set of mafs containing 
+no more than a certain number of columns
+"""
+
usage = "usage: %prog chunk_size out_dir"

import sys
# "from bx import align.maf" is a SyntaxError in every Python version;
# bind the package and load the submodule so align.maf.<name> resolves.
from bx import align
import bx.align.maf
from optparse import OptionParser
import psyco_full
import random

# Sentinel for "no minimum seen yet": under Python 2 any int compares
# less than any str, so the string "inf" behaves as +infinity in min().
INF = "inf"
+
def __main__():
    """Split a MAF stream from stdin into numbered chunk files in out_dir.

    A new chunk is started whenever adding the next block's column count
    would push the running total past chunk_size.  For every chunk that
    is written, the min/max reference (first-component) coordinates seen
    while it was open are appended to out_dir/intervals.txt.  With
    --prob, each chunk is written only with that probability; skipped
    chunks still consume a chunk number.
    """

    # Parse command line arguments
    parser = OptionParser( usage=usage )
    parser.add_option( "--prob", action="store", default=None, type="float",
                       help="Probability of writing a given chunk" )

    ( options, args ) = parser.parse_args()

    chunk_size = int( args[0] )
    out_dir = args[1]
    prob = options.prob

    maf_reader = align.maf.Reader( sys.stdin )

    maf_writer = None

    count = 0
    current_chunk = -1

    # INF is the string "inf", which acts as +infinity under Python 2's
    # mixed-type comparison rules (int < str).
    chunk_min = INF
    chunk_max = 0

    write_current_chunk = True

    interval_file = file( "%s/intervals.txt" % out_dir, "w" )

    for m in maf_reader:
        chunk_min = min( chunk_min, m.components[0].start )
        chunk_max = max( chunk_max, m.components[0].end )
        if not maf_writer or count + m.text_size > chunk_size:
            current_chunk += 1
            # Finish the last chunk
            if maf_writer:
                maf_writer.close()
                interval_file.write( "%s %s\n" % ( chunk_min, chunk_max ) )
                # NOTE(review): min/max are reset only when the previous
                # chunk was actually written, so an interval following a
                # probabilistically skipped chunk also spans the skipped
                # blocks -- confirm this is intended.
                chunk_min = INF
                chunk_max = 0
            # Decide if the new chunk will be written
            if prob: write_current_chunk = bool( random.random() <= prob )
            else: write_current_chunk = True
            if write_current_chunk:
                maf_writer = align.maf.Writer( file( "%s/%09d.maf" % ( out_dir, current_chunk ), "w" ) )
            else:
                maf_writer = None
            count = 0
        if maf_writer: maf_writer.write( m )
        # Count reference-species bases rather than alignment columns;
        # the chunk-size test above deliberately(?) still uses text_size.
        count += m.components[0].size

    if maf_writer:
        maf_writer.close()
        interval_file.write( "%s %s\n" % ( chunk_min, chunk_max ) )
    # Always close the interval file -- the original closed it only when
    # the final chunk had a live writer, leaking/never flushing it when
    # the last chunk was skipped or the input was empty.
    interval_file.close()
+
+if __name__ == "__main__": __main__()

File maf_count.py

+#!/usr/bin/env python2.3
+
+"""
+Read a MAF from standard input and print counts of alignments, bases, or columns. 
+
+usage: %prog [options]
+   -c, --cols: count alignment columns rather than number of alignments
+   -b, --bases: count bases in first species rather than number of alignments
+   -s, --skip=N: when counting bases, skip this base
+   -e, --each: print a count for each alignment rather than whole file
+   -r, --ref=N: reference sequence (first by default, 0..n)
+"""
+
import cookbook.doc_optparse
import sys

# "from bx import align.maf" is a SyntaxError in every Python version;
# bind the package and load the submodule so align.maf.<name> resolves.
from bx import align
import bx.align.maf
+
+def __main__():
+
+    options, args = cookbook.doc_optparse.parse( __doc__ )
+
+    try:
+        if options.cols: action = "cols"