Commits

Bob Harris committed 0129da8

(1) added matrix scores for upper to loweer as per blastz (2) added support for asymmetirc alphabets (different alphabet for text1 than text2) and symbols outside the normal ASCII character set

  • Participants
  • Parent commits a522a93

Comments (0)

Files changed (2)

File bx/align/score.py

 from Numeric import *
 
 class ScoringScheme( object ):
-    def __init__( self, gap_open, gap_extend, input_size=128, default=-100, typecode=Int ):
-        self.table = zeros( (input_size, input_size), typecode=typecode )
+    def __init__( self, gap_open, gap_extend, default=-100, alphabet1="ACGT", alphabet2=None, gap1="-", gap2=None, text1_range=128, text2_range=None, typecode=Int ):
+        if (text2_range == None): text2_range = text1_range
+        if (alphabet2 == None): alphabet2 = alphabet1
+        if (gap2 == None): gap2 = gap1
+        if type(alphabet1) == str: alphabet1 = [ch for ch in alphabet1]
+        if type(alphabet2) == str: alphabet2 = [ch for ch in alphabet2]
+        self.table = zeros( (text1_range, text2_range), typecode=typecode )
         self.table *= default
         self.gap_open = gap_open
         self.gap_extend = gap_extend
+        self.gap1 = gap1
+        self.gap2 = gap2
+        self.alphabet1 = alphabet1
+        self.alphabet2 = alphabet2
     def set_score( self, a, b, val ):
         self.table[a,b] = val
-    
-def build_scoring_scheme( s, gap_open, gap_extend ):
+
+def build_scoring_scheme( s, gap_open, gap_extend, gap1="-", gap2=None ):
     """
     Initialize scoring scheme from a blastz style text blob, first line
     specifies the bases for each row/col, subsequent lines contain the
-    corresponding scores.
+    corresponding scores.  Slaw extensions allow for unusual and/or
+    asymmetric alphabets.  Symbols can be two digit hex, and each row
+    begins with symbol.  Note that a row corresponds to a symbol in text1
+    and a column to a symbol in text2.
+
+    examples:
+    
+       blastz                       slaw
+
+          A    C    G    T               01   02    A    C    G    T
+         91 -114  -31 -123          01  200 -200  -50  100  -50  100
+       -114  100 -125  -31          02 -200  200  100  -50  100  -50
+        -31 -125  100 -114
+       -123  -31 -114   91
     """
-    ss = ScoringScheme( gap_open, gap_extend )
+    # perform initial parse to determine alphabets and locate scores
+    bad_matrix = "invalid scoring matrix"
     lines = s.split( "\n" )
-    chars = lines[0].split()
-    for i, line in enumerate( lines[1:] ):
-        for j, score in enumerate( map( int, line.split() ) ):
-            ss.set_score( ord( chars[i].lower() ), ord( chars[j].lower() ), score )
-            ss.set_score( ord( chars[i].upper() ), ord( chars[j].upper() ), score )
+    rows  = []
+    symbols2 = lines.pop(0).split()
+    symbols1 = None
+    rows_have_syms = False
+    a_la_blastz = True
+    for i, line in enumerate( lines ):
+        row_scores = line.split()
+        if len( row_scores ) == len( symbols2 ):        # blastz-style row
+            if symbols1 == None:
+                if len( lines ) != len( symbols2 ):
+                    raise bad_matrix
+                symbols1 = symbols2
+            elif (rows_have_syms):
+                raise bad_matrix
+        elif len( row_scores ) == len( symbols2 ) + 1:  # row starts with symbol
+            if symbols1 == None:
+                symbols1 = []
+                rows_have_syms = True
+                a_la_blastz = False
+            elif not rows_have_syms:
+                raise bad_matrix
+            symbols1.append( row_scores.pop(0) )
+        else:
+            raise bad_matrix
+        rows.append( row_scores )
+    # convert alphabets from strings to characters
+    try:
+        alphabet1 = [sym_to_char( sym ) for sym in symbols1]
+        alphabet2 = [sym_to_char( sym ) for sym in symbols2]
+    except ValueError:
+        raise bad_matrix
+    if (alphabet1 != symbols1) or (alphabet2 != symbols2):
+        a_la_blastz = False
+    if a_la_blastz:
+        alphabet1 = [ch.upper() for ch in alphabet1]
+        alphabet2 = [ch.upper() for ch in alphabet2]
+    # create appropriately sized matrix
+    text1_range = text2_range = 128
+    if ord( max( alphabet1 ) ) >= 128: text1_range = 256
+    if ord( max( alphabet2 ) ) >= 128: text2_range = 256
+    ss = ScoringScheme( gap_open, gap_extend, alphabet1=alphabet1, alphabet2=alphabet2, gap1=gap1, gap2=gap2, text1_range=text1_range, text2_range=text2_range )
+    # fill matrix
+    for i, row_scores in enumerate( rows ):
+        for j, score in enumerate( map( int, row_scores ) ):
+            ss.set_score( ord( alphabet1[i] ), ord( alphabet2[j] ), score )
+            if a_la_blastz:
+                ss.set_score( ord( alphabet1[i].upper() ), ord( alphabet2[j].lower() ), score )
+                ss.set_score( ord( alphabet1[i].lower() ), ord( alphabet2[j].upper() ), score )
+                ss.set_score( ord( alphabet1[i].lower() ), ord( alphabet2[j].lower() ), score )
     return ss
-            
+
+# convert possible two-char symbol to a single character
+def sym_to_char( sym ):
+    if   len( sym ) == 1: return sym
+    elif len( sym ) != 2: raise ValueError
+    else:                 return chr(int(sym,base=16))
+
 def score_alignment( scoring_scheme, a ):
     score = 0
     ncomps = len( a.components )
         a = text1[i]
         b = text2[i]
         # Ignore gap/gap pair
-        if a == '-' and b == '-': 
+        if a == scoring_scheme.gap1 and b == scoring_scheme.gap2: 
             continue
         # Gap in first species
-        elif a == '-':
+        elif a == scoring_scheme.gap1:
             rval -= scoring_scheme.gap_extend
             if not last_gap_a:
                rval -= scoring_scheme.gap_open
                last_gap_a = True
                last_gap_b = False
         # Gap in second species
-        elif b == '-':
+        elif b == scoring_scheme.gap2:
             rval -= scoring_scheme.gap_extend
             if not last_gap_b:
                rval -= scoring_scheme.gap_open
     base) in text1.
     """
     if skip_ref_gaps:
-        rval = zeros( len( text1 ) - text1.count( '-' ) )
+        rval = zeros( len( text1 ) - text1.count( scoring_scheme.gap1 ) )
     else:
         rval = zeros( len( text1 ) )
     score = 0
         a = text1[i]
         b = text2[i]
         # Ignore gap/gap pair
-        if a == '-' and b == '-': 
+        if a == scoring_scheme.gap1 and b == scoring_scheme.gap2: 
             continue
         # Gap in first species
-        elif a == '-':
+        elif a == scoring_scheme.gap1:
             score -= scoring_scheme.gap_extend
             if not last_gap_a:
                score -= scoring_scheme.gap_open
                last_gap_a = True
                last_gap_b = False
         # Gap in second species
-        elif b == '-':
+        elif b == scoring_scheme.gap2:
             score -= scoring_scheme.gap_extend
             if not last_gap_b:
                score -= scoring_scheme.gap_open
         else:   
             score += scoring_scheme.table[ord(a),ord(b)]
             last_gap_a = last_gap_b = False
-        if not( skip_ref_gaps ) or a != '-':
+        if not( skip_ref_gaps ) or a != scoring_scheme.gap1:
             rval[pos] = score
             pos += 1
     return rval
                                   91 -114  -31 -123
                                 -114  100 -125  -31
                                  -31 -125  100 -114
-                                -123  -31 -114   91 """, 400, 30 )
+                                -123  -31 -114   91 """, 400, 30 )

File bx/align/score_tests.py

 s rheMac1.SCAFFOLD45837 26063 33 -     31516 TGTGTGATTAATGCCTGAGATTGTGTGAAGTAA-------
 """
 
+nonsymm_scheme = bx.align.score.build_scoring_scheme ( """  A    C    G    T
+                                                           91    0  -31 -123
+                                                         -114  100 -125  -31
+                                                          -31 -125  100 -114
+                                                         -123  -31 -114   91 """, 400, 30 )
+
+aligns_for_nonsymm_scheme = [ ( "AAAACCCCGGGGTTTT",
+                                "ACGTACGTACGTACGT", 
+                                -580 )
+                            ]
+
+
+asymm_scheme = bx.align.score.build_scoring_scheme ( """    01   02    A    C    G    T
+                                                       01  200 -200  -50  100  -50  100
+                                                       02 -200  200  100  -50  100  -50 """,
+                                                       0, 0, gap1='\x00' )
+
+aligns_for_asymm_scheme = [ ( "\x01\x01\x01\x01\x01\x01",
+                              "ACGT\x01\x02", 
+                              100 )
+                          ]
+
+
 class BasicTests( unittest.TestCase ):
     def test_scoring_text( self ):
         ss = bx.align.score.hox70
                            cumsum( array( [ -430, -30, -30, -30, -30, -31, 91, 91, -123 ] ) ) )
         self.assertEquals( bx.align.score.accumulate_scores( ss, "-----CTTT", "CTTAGTTTA", skip_ref_gaps=True ),
                            cumsum( array( [ -581, 91, 91, -123 ] ) ) )
-            
+
+    def test_nonsymm_scoring( self ):
+        ss = nonsymm_scheme
+        for t1, t2, score in aligns_for_nonsymm_scheme:
+            self.assertEquals( bx.align.score.score_texts( ss, t1, t2 ), score )
+
+    def test_asymm_scoring( self ):
+        ss = asymm_scheme
+        for t1, t2, score in aligns_for_asymm_scheme:
+            self.assertEquals( bx.align.score.score_texts( ss, t1, t2 ), score )
+   
 test_classes = [ BasicTests ]
-suite = unittest.TestSuite( [ unittest.makeSuite( c ) for c in test_classes ] )
+suite = unittest.TestSuite( [ unittest.makeSuite( c ) for c in test_classes ] )