Commits

Bob Harris committed 5e5d804

(1) corrected reverse_complement (2) brought axt support up-to-snuff with maf (3) which necessitated changes to maf and core (4) added stubs for lav

  • Participants
  • Parent commits e630c6c

Comments (0)

Files changed (3)

File bx/align/axt.py

 from align import *
 
-"""
-Provides a reader for pairwise alignments in AXT format
-"""
+import itertools
+import interval_index_file
 
-class Reader:
+# Tools for dealing with pairwise alignments in AXT format
 
-    def __init__( self, file ):
+class MultiIndexed( object ):
+    """Similar to 'indexed' but wraps more than one axt_file"""
+    def __init__( self, axt_filenames, keep_open=False ):
+        self.indexes = [ Indexed( axt_file, axt_file + ".index" ) for axt_file in axt_filenames ]
+    def get( self, src, start, end ):
+        blocks = []
+        for index in self.indexes: blocks += index.get( src, start, end )
+        return blocks
+
+
+class Indexed( object ):
+    """Indexed access to a axt using overlap queries, requires an index file"""
+
+    def __init__( self, axt_filename, index_filename=None, keep_open=False, species_to_lengths=None ):
+        if index_filename is None: index_filename = axt_filename + ".index"
+        self.indexes = interval_index_file.Indexes( filename=index_filename )
+        self.axt_filename = axt_filename
+        self.species_to_lengths = species_to_lengths
+        if keep_open:
+            self.f = open( axt_filename )
+        else:
+            self.f = None
+
+    def get( self, src, start, end ):
+        intersections = self.indexes.find( src, start, end )
+        return itertools.imap( self.get_axt_at_offset, [ val for start, end, val in intersections ] )
+
+    def get_axt_at_offset( self, offset ):
+        if self.f:
+            self.f.seek( offset )
+            return read_next_axt( self.f, self.species_to_lengths )
+        else:
+            f = open( self.axt_filename )
+            try:
+                f.seek( offset )
+                return read_next_axt( f, self.species_to_lengths )
+            finally:
+                f.close()
+
+class Reader( object ):
+    """Iterate over all axt blocks in a file in order"""
+    
+    def __init__( self, file, species_to_lengths=None ):
         self.file = file
+        self.species_to_lengths = species_to_lengths
+        self.attributes = {}
 
     def next( self ):
-        # Read block
-        line = readline( self.file )
-        if not line: return
-        fields = line.split()
-        seq1 = readline( self.file )
-        seq2 = readline( self.file )
-        blank = readline( self.file )
-        # Build 2 component alignment
-        alignment = Alignment()
-        component = Component()
-        component.src = fields[1]
-        component.start = int( fields[2] ) - 1
-        end = int( fields[3] )
-        component.size = end - component.start 
-        component.strand = "+"
-        component.text = seq1.strip()
-        alignment.add_component( component )
-        component = Component()
-        component.src = fields[4]
-        component.start = int( fields[5] ) - 1
-        end = int( fields[6] )
-        component.size = end - component.start
-        component.strand = fields[7]
-        component.text = seq2.strip()
-        
-        return alignment
+        return read_next_axt( self.file, self.species_to_lengths )
 
     def __iter__( self ):
-        while 1:
-            v = self.next()
-            if not v: break
-            yield v
+        return ReaderIter( self )
 
     def close( self ):
         self.file.close()
 
-# Helper methods
+class ReaderIter( object ):
+    def __init__( self, reader ):
+        self.reader = reader
+    def __iter__( self ): 
+        return self
+    def next( self ):
+        v = self.reader.next()
+        if not v: raise StopIteration
+        return v
+
+class Writer( object ):
+
+    def __init__( self, file, attributes={} ):
+        self.file = file
+
+    def write( self, alignment ):
+        if (len(alignment.components) != 2):
+            raise "%d-component alignment is not compatible with axt" % \
+                   len(alignment.components)
+        c1 = alignment.components[0]
+        c2 = alignment.components[1]
+
+        if c1.strand != "+":
+        	c1 = c1.reverse_complement()
+        	c2 = c2.reverse_complement()
+
+        spec1,chr1 = src_split( c1.src )
+        spec2,chr2 = src_split( c2.src )
+
+        self.file.write( "0 %s %d %d %s %d %d %s %d\n" % \
+              (chr1,c1.start+1,c1.start+c1.size,
+               chr2,c2.start+1,c2.start+c2.size,c2.strand,
+               alignment.score))
+        self.file.write( "%s\n" % c1.text )
+        self.file.write( "%s\n" % c2.text )
+        self.file.write( "\n" )
+
+    def close( self ):
+        self.file.close()
+
+# ---- Helper methods ---------------------------------------------------------
+
+# typical axt block:
+#   0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+#   TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+#   TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+# start and stop are origin-1, inclusive
+# first species is always on plus strand
+# when second species is on minus strand, start and stop are counted from sequence end
+
+def read_next_axt( file, species_to_lengths=None ):
+    line = readline( file, skip_blank=True )
+    if not line: return
+    fields = line.split()
+    if (len(fields) != 9): raise "bad axt-block header: %s" % line
+    seq1 = readline( file )
+    if not line or line.isspace(): raise "incomplete axt-block; header: %s" % line
+    seq2 = readline( file )
+    if not line or line.isspace(): raise "incomplete axt-block; header: %s" % line
+    # Build 2 component alignment
+    alignment = Alignment(species_to_lengths=species_to_lengths)
+    alignment.score = fields[8]
+    # Build component for species 1
+    component = Component()
+    component.src = "species1." + fields[1]
+    component.start = int( fields[2] ) - 1    # (axt intervals are origin-1
+    end = int( fields[3] )                    #  and inclusive on both ends)
+    component.size = end - component.start
+    component.strand = "+"
+    component.text = seq1.strip()
+    alignment.add_component( component )
+    # Build component for species 2
+    component = Component()
+    component.src = "species2." + fields[4]
+    component.start = int( fields[5] ) - 1
+    end = int( fields[6] )
+    component.size = end - component.start
+    component.strand = fields[7]
+    component.text = seq2.strip()
+    alignment.add_component( component )
+    return alignment
 
 def readline( file, skip_blank=False ):
     """Read a line from provided file, skipping any blank or comment lines"""
     while 1:
         line = file.readline()
-        if not line: return None 
+        if not line: return None
         if line[0] != '#' and not ( skip_blank and line.isspace() ):
             return line
+

File bx/align/core.py

     def __init__( self, score=0, attributes={} ):
         self.score = 0
         self.text_size = 0
-        self.attributes = {}
+        self.attributes = attributes
+        if species_to_lengths == None: self.species_to_lengths = {}
+        else: self.species_to_lengths = species_to_lengths
         self.components = []
 
     def add_component( self, component ):
+        component.alignment = self
         self.components.append( component )
         if self.text_size == 0: self.text_size = len( component.text )
         elif self.text_size != len( component.text ): raise "Components must have same text length"
             s += "\n"
         return s
 
+    def src_size( self, src ):
+        species,chrom = src_split( src )
+        if species == None: raise "no src_size (%s not of form species.chrom)" % src
+        if species not in self.species_to_lengths: raise "no src_size (no length file for %s)" % species
+        chrom_to_length = self.species_to_lengths[species]
+        if type( chrom_to_length ) == type( "" ):  # (if it's a file name)
+            chrom_to_length = read_lengths_file( chrom_to_length )
+            self.species_to_lengths[species] = chrom_to_length
+        if chrom not in chrom_to_length: "no src_size (%s has no length for %s)" % ( species, chrom )
+        return chrom_to_length[chrom]
+
     def get_component_by_src( self, src ):
         for c in self.components:
             if c.src == src: return c
     
 class Component( object ):
 
-    def __init__( self, src='', start=0, size=0, strand=None, src_size=0, text='' ):
+    def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ):
+        self.alignment = None
         self.src = src
-        self.start = start
-        self.size = size
-        self.strand = strand
-        self.src_size = src_size
+        self.start = start          # Nota Bene:  start,size,strand are as they
+        self.size = size            # .. appear in a MAF file-- origin-zero, end
+        self.strand = strand        # .. excluded, and minus strand counts from
+        self._src_size = src_size   # .. end of sequence
         self.text = text
 
     def __str__( self ):
         return self.start + self.size
     end = property( fget=get_end )
 
+    def get_src_size( self ):
+        if self._src_size == None:
+            if self.alignment == None: raise "component has no src_size"
+            self._src_size = self.alignment.src_size( self.src )
+        return self._src_size
+    def set_src_size( self,src_size ):
+        self._src_size = src_size
+    src_size = property( fget=get_src_size, fset=set_src_size )
+
     def get_forward_strand_start( self ):
         if self.strand == '-': return self.src_size - self.end
         else: return self.start
         start = self.src_size - self.start 
         if self.strand == "+": strand = "-"
         else: strand = "+"
-        text = self.text.translate( DNA_COMP )
-        return Component( self.src, start, self.size, strand, self.src_size, text )
+        comp = [ch for ch in self.text.translate(DNA_COMP)]
+        comp.reverse()
+        text = "".join(comp)
+        new = Component( self.src, start, self.size, strand, self._src_size, text )
+        new.alignment = self.alignment
+        return new
 
     def slice( self, start, end ):
-        new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self.src_size )
+        new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size )
+        new.alignment = self.alignment
         new.text = self.text[start:end]
 
         #for i in range( 0, start ):
             col += 1 
         return col
 
-def get_reader( format, infile ):
+def get_reader( format, infile, species_to_lengths=None ):
     import align.axt, align.maf
-    if format == "maf": return align.maf.Reader( infile )
-    elif format == "axt": return align.axt.Reader( infile )
+    if format == "maf": return align.maf.Reader( infile, species_to_lengths )
+    elif format == "axt": return align.axt.Reader( infile, species_to_lengths )
+    elif format == "lav": return align.lav.Reader( infile, species_to_lengths )
+    else: raise "Unknown alignment format %s" % format
+
+def get_writer( format, outfile, attributes={} ):
+    import align.axt, align.maf
+    if format == "maf": return align.maf.Writer( outfile, attributes )
+    elif format == "axt": return align.axt.Writer( outfile, attributes )
+    elif format == "lav": return align.lav.Writer( outfile, attributes )
+    else: raise "Unknown alignment format %s" % format
+
+def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ):
+    import align.axt, align.maf
+    if format == "maf": return align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths )
+    elif format == "axt": return align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths )
+    elif format == "lav": return align.lav.Indexed( filename, index_filename, keep_open, species_to_lengths )
     else: raise "Unknown alignment format %s" % format
 
 def shuffle_columns( a ):
     for c in a.components:
         c.text = ''.join( [ c.text[i] for i in mask ] )
 
+def src_split( src ): # splits src into species,chrom
+    dot = src.rfind( "." )
+    if dot == -1: return None,src
+    else:         return src[:dot],src[dot+1:]
 
+# improvement: lengths file should probably be another class
 
+def read_lengths_file( name ):
+    chrom_to_length = {}
+    f = file ( name, "rt" )
+    for line in f:
+        line = line.strip()
+        if line == '' or line[0] == '#': continue
+        try:
+            fields = line.split()
+            if len(fields) != 2: raise
+            chrom = fields[0]
+            length = int( fields[1] )
+        except:
+            raise "bad length file line: %s" % line
+        if chrom in chrom_to_length and length != chrom_to_length[chrom]:
+            raise "%s has more than one length!" % chrom
+        chrom_to_length[chrom] = length
+    f.close()
+    return chrom_to_length
+

File bx/align/maf.py

 class Indexed( object ):
     """Indexed access to a maf using overlap queries, requires an index file"""
 
-    def __init__( self, maf_filename, index_filename=None, keep_open=False ):
+    def __init__( self, maf_filename, index_filename=None, keep_open=False, species_to_lengths=None ):
         if index_filename is None: index_filename = maf_filename + ".index"
         self.indexes = interval_index_file.Indexes( filename=index_filename )
         self.maf_filename = maf_filename
+        self.species_to_lengths = species_to_lengths
         if keep_open: 
             self.f = open( maf_filename )
         else:
     def get_maf_at_offset( self, offset ):
         if self.f:
             self.f.seek( offset )
-            return read_next_maf( self.f ) 
+            return read_next_maf( self.f, self.species_to_lengths )
         else:
             f = open( self.maf_filename )
             try:
                 f.seek( offset )
-                return read_next_maf( f ) 
+                return read_next_maf( f, self.species_to_lengths ) 
             finally:
                 f.close()
             
 class Reader( object ):
     """Iterate over all maf blocks in a file in order"""
     
-    def __init__( self, file ):
+    def __init__( self, file, species_to_lengths=None ):
         self.file = file
+        self.species_to_lengths = species_to_lengths
         # Read and verify maf header, store any attributes
         fields = self.file.readline().split()
         if fields[0] != '##maf': raise "File does not have MAF header"
         self.attributes = parse_attributes( fields[1:] )
 
     def next( self ):
-        return read_next_maf( self.file )
+        return read_next_maf( self.file, self.species_to_lengths )
 
     def __iter__( self ):
         return ReaderIter( self )
 
 # ---- Helper methods ---------------------------------------------------------
 
-def read_next_maf( file ):
-        alignment = Alignment()
-        # Attributes line
-        line = readline( file, skip_blank=True )
-        if not line: return None
-        fields = line.split() 
-        if fields[0] != 'a': raise "Expected 'a ...' line"
-        alignment.attributes = parse_attributes( fields[1:] )
-        alignment.score = alignment.attributes['score']
-        del alignment.attributes['score']
-        # Sequence lines
-        while 1:
-            line = readline( file )
-            # EOF or Blank line terminates alignment components
-            if not line or line.isspace(): break
-            if line.isspace(): break 
-            # Verify
-            fields = line.split()
-            if fields[0] != 's': raise "Expected 's ...' line"
-            # Parse 
-            component = Component()
-            component.src = fields[1]
-            component.start = int( fields[2] )
-            component.size = int( fields[3] )
-            component.strand = fields[4]
-            component.src_size = int( fields[5] )
-            if len(fields) > 6: component.text = fields[6].strip()
-            # Add to set
-            alignment.add_component( component )
-        return alignment
+def read_next_maf( file, species_to_lengths=None ):
+    alignment = Alignment(species_to_lengths=species_to_lengths)
+    # Attributes line
+    line = readline( file, skip_blank=True )
+    if not line: return None
+    fields = line.split() 
+    if fields[0] != 'a': raise "Expected 'a ...' line"
+    alignment.attributes = parse_attributes( fields[1:] )
+    alignment.score = alignment.attributes['score']
+    del alignment.attributes['score']
+    # Sequence lines
+    while 1:
+        line = readline( file )
+        # EOF or Blank line terminates alignment components
+        if not line or line.isspace(): break
+        if line.isspace(): break 
+        # Verify
+        fields = line.split()
+        if fields[0] != 's': raise "Expected 's ...' line"
+        # Parse 
+        component = Component()
+        component.src = fields[1]
+        component.start = int( fields[2] )
+        component.size = int( fields[3] )
+        component.strand = fields[4]
+        component.src_size = int( fields[5] )
+        if len(fields) > 6: component.text = fields[6].strip()
+        # Add to set
+        alignment.add_component( component )
+    return alignment
 
 def readline( file, skip_blank=False ):
     """Read a line from provided file, skipping any blank or comment lines"""