Commits

David King  committed 22f4b4c

Added a maf reader for the pwm scorer

  • Participants
  • Parent commits 3e77999

Comments (0)

Files changed (2)

File pwm/position_weight_matrix.py

         self.nrows = len(seqrows)
         ncol = None
         for rownum,row in enumerate(self.rows):
-            if ncol == None: ncol = len(row)
-            elif ncol != len(row):
-                raise "Align: __init__:alignment block:row %d does not have %d columns, it has %d" % (rownum,ncol,len(row))
+            try:
+                if ncol == None: ncol = len(row)
+                elif ncol != len(row):
+                    raise "Align: __init__:alignment block:row %d does not have %d columns, it has %d" % (rownum,ncol,len(row))
+            except:
+                print row
+                raise ''
         self.ncols = ncol
         self.dims = (self.nrows,self.ncols)
 
                 position_rsq.append( r / smallN )
         return position_rsq 
 
-    def score_align_gaps (self, align):
-
-        # a blank score matrix
-        nrows,ncols = align.dims
-        scoremax = AlignScoreMatrix( align ).matrix
-
-        for ir in range(nrows):
-            for pos in range(ncols):
-                if align.rows[ir][pos] == '-': scoremax[ir][pos] = 1
-                else: scoremax[ir][pos] = 0
-
-        return scoremax
-
-    def score_align (self,align):
+    def score_align (self,align,gapmask=None,byPosition=True):
 
         # a blank score matrix
         nrows,ncols = align.dims
 
         minSeqLen = len( self )
         for ir in range(nrows):
+
+            # row is missing data
+            if isnan(align.rows[ir][0]): continue
+
             for start in range(ncols):
                 if align.rows[ir][start] == '-': continue
                 elif align.rows[ir][start] == 'n': continue
                     if len(subseq) == minSeqLen: 
                         end = ic+1
 
+                        #forward
                         scores = self.score_seq( subseq )
                         raw,forward_score = scores[0]
-
+                        #reverse
                         scores = self.score_reverse_seq( subseq )
                         raw,reverse_score = scores[0]
 
                         score = max(forward_score, reverse_score)
 
                         # replace the alignment positions with the result
-                        if True:
+                        if byPosition:
                             scoremax[ir][start] = score
                         else:
+                        # replace positions matching the width of the pwm
                             for i in range(start,end):
                                 if isnan(scoremax[ir][i]): scoremax[ir][i] = score
                                 elif score > scoremax[ir][i]: scoremax[ir][i] = score
-                        #print subseq,scoremax
-                        #for x in range(len(scoremax)):
-                            #for y in range(len(scoremax[x])):
-                                #print "%.2f\t" % scoremax[x][y],
-                            #print
-            #print
-        gapmask = self.score_align_gaps(align)
+        # mask gap characters
+        if gapmask == None:
+            gapmask = score_align_gaps(align)
         putmask( scoremax, gapmask, float('nan') )
         return scoremax
         
     def __setitem__ (self,key,value):
         self.rows[key] = value
 
+def score_align_gaps (align):
+    # a blank score matrix
+    nrows,ncols = align.dims
+    scoremax = AlignScoreMatrix( align ).matrix
+    for ir in range(nrows):
+        # row is missing data
+        if isnan(align.rows[ir][0]): continue
+        # scan for gaps
+        for pos in range(ncols):
+            if align.rows[ir][pos] == '-': scoremax[ir][pos] = 1
+            else: scoremax[ir][pos] = 0
+    return scoremax
+
 #-----------
 #
 # WeightMatrix Reader--
 class Reader (object):
     """Iterate over all interesting weight matrices in a file"""
 
-    def __init__ (self,file,tfIds=None,name=None,format='transfac',background=None):
+    def __init__ (self,file,tfIds=None,name=None,format='basic',background=None):
         self.tfIds      = tfIds
         self.file       = file
         self.name       = name
         # clean up
         self.tfToPwm = None
 
-def isnan(m):
-    return not_equal(m,m)
+def isnan(x):
+    #return ieeespecial.isnan(x)
+    if x==x: return False
+    return True
 
 def reverse_complement (nukes):
     return nukes[::-1].translate(PositionWeightMatrix.complementMap)

File pwm/pwm_score_maf.py

+#!/usr/bin/python2.4
+import sys,os
+import align.maf
+import position_weight_matrix as pwmx
+from numarray import *
+
+def isnan(x):
+    #return ieeespecial.isnan(x)
+    if x==x: return False
+    return True
+
+NaN = float('nan')
+#NaN = ieeespecial.nan
+#Inf = ieeespecial.plus_inf
+#NInf = ieeespecial.minus_inf
+
+def main():
+
+    pwm_file = sys.argv[1]
+    splist = sys.argv[2]
+    if len(sys.argv) ==4: 
+        inmaf = open(sys.argv[3])
+    else:
+        inmaf = sys.stdin
+
+    # read alignment species
+    species = []
+    for sp in splist.split(','):
+        species.append( sp )
+
+    # read weight matrices
+    pwm = {}
+    for wm in pwmx.Reader(open( pwm_file )):
+        pwm[ wm.id ] = wm
+
+    fbunch = {}
+    for scoremax in MafScorer(pwm, species, inmaf):
+        for k,matrix in scoremax.items():
+            fname = k + '.mx'
+            if fname not in fbunch:
+                fbunch[fname] = open(fname,'w')
+                print >>sys.stderr,"Writing",fname
+
+            for i in range( len(matrix)):
+                for j in range( len(matrix[i])):
+                    print >>fbunch[fname], "%.2f" % matrix[i][j],
+                print >>fbunch[fname]
+
+    for file in fbunch.values():
+        file.close()
+
+def MafScorer(pwm,species,inmaf):
+
+    for maf in align.maf.Reader( inmaf ):
+        # expand block rows to full
+        mafBlockSpecies = [specName.src.split('.')[0] for specName in maf.components]
+        alignlist = []
+        for sp in species:
+            try:
+                i = mafBlockSpecies.index( sp )
+                alignlist.append( maf.components[i].text )
+            except ValueError:
+                alignlist.append( [ NaN for n in range(len(maf.components[0].text)) ] )
+        alignrows = pwmx.Align( alignlist )
+        scoremax = {}
+        # record gap positions
+        filter = pwmx.score_align_gaps( alignrows )
+        # score pwm models
+        for model in pwm.keys():
+            scoremax[model] = pwm[model].score_align( alignrows, filter )
+        yield scoremax
+
+if __name__ == '__main__': main()