James Taylor avatar James Taylor committed 03723eb

motif.pwm: Scoring sequences with gaps,

Comments (0)

Files changed (4)

lib/bx/motif/_pwm.c

-/* Generated by Cython 0.9.6.14 on Sat May 31 17:10:21 2008 */
+/* Generated by Cython 0.9.6.14 on Tue Jun  3 21:12:07 2008 */
 
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
  *                 break
  *             score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]             # <<<<<<<<<<<<<< 
  *         else:
- *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score */
+ *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score
+ */
       __pyx_v_score += (((npy_float32 *)((__pyx_v_matrix->data + (__pyx_v_j * (__pyx_v_matrix->strides[0]))) + (__pyx_v_char_index * (__pyx_v_matrix->strides[1]))))[0]);
     }
     /*else*/ {
  *             score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]
  *         else:
  *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score             # <<<<<<<<<<<<<< 
+ *             
+ * def score_string_with_gaps( ndarray matrix, ndarray char_to_index, object string, ndarray rval ):
  */
       (((npy_float32 *)(__pyx_v_rval->data + (__pyx_v_i * (__pyx_v_rval->strides[0]))))[0]) = __pyx_v_score;
     }
   return __pyx_r;
 }
 
+/* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":50
+ *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score
+ *             
+ * def score_string_with_gaps( ndarray matrix, ndarray char_to_index, object string, ndarray rval ):             # <<<<<<<<<<<<<< 
+ *     """
+ *     Score each position in string `string` using the scoring matrix `matrix`.
+ */
+
+static char __pyx_k_1[] = "-";
+static char __pyx_k_2[] = "-";
+
+static PyObject *__pyx_pf_2bx_5motif_4_pwm_score_string_with_gaps(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_2bx_5motif_4_pwm_score_string_with_gaps[] = "\n    Score each position in string `string` using the scoring matrix `matrix`.\n    Characters in the string are mapped to columns in the matrix by `char_to_index`\n    and the score for each position is stored in `rval`.\n\n    matrix *must* be a 2d array of type float32\n    char_to_index *must* be a 1d array of type int16\n    rval *must* be a 1d array of type float32 and the same length as string\n    ";
+static PyObject *__pyx_pf_2bx_5motif_4_pwm_score_string_with_gaps(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyArrayObject *__pyx_v_matrix = 0;
+  PyArrayObject *__pyx_v_char_to_index = 0;
+  PyObject *__pyx_v_string = 0;
+  PyArrayObject *__pyx_v_rval = 0;
+  char *__pyx_v_buffer;
+  Py_ssize_t __pyx_v_len;
+  float __pyx_v_score;
+  int __pyx_v_i;
+  int __pyx_v_j;
+  int __pyx_v_string_pos;
+  int __pyx_v_matrix_width;
+  npy_int16 __pyx_v_char_index;
+  int __pyx_v_stop;
+  PyObject *__pyx_r;
+  int __pyx_1;
+  int __pyx_2;
+  static char *__pyx_argnames[] = {"matrix","char_to_index","string","rval",0};
+  if (likely(!__pyx_kwds) && likely(PyTuple_GET_SIZE(__pyx_args) == 4)) {
+    __pyx_v_matrix = ((PyArrayObject *)PyTuple_GET_ITEM(__pyx_args, 0));
+    __pyx_v_char_to_index = ((PyArrayObject *)PyTuple_GET_ITEM(__pyx_args, 1));
+    __pyx_v_string = PyTuple_GET_ITEM(__pyx_args, 2);
+    __pyx_v_rval = ((PyArrayObject *)PyTuple_GET_ITEM(__pyx_args, 3));
+  }
+  else {
+    if (unlikely(!PyArg_ParseTupleAndKeywords(__pyx_args, __pyx_kwds, "OOOO", __pyx_argnames, &__pyx_v_matrix, &__pyx_v_char_to_index, &__pyx_v_string, &__pyx_v_rval))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L2;}
+  }
+  goto __pyx_L3;
+  __pyx_L2:;
+  __Pyx_AddTraceback("bx.motif._pwm.score_string_with_gaps");
+  return NULL;
+  __pyx_L3:;
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_matrix), __pyx_ptype_2bx_5motif_4_pwm_ndarray, 1, "matrix", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1;}
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_char_to_index), __pyx_ptype_2bx_5motif_4_pwm_ndarray, 1, "char_to_index", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1;}
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_rval), __pyx_ptype_2bx_5motif_4_pwm_ndarray, 1, "rval", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1;}
+
+  /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":64
+ *     cdef float score
+ *     cdef int i, j, string_pos
+ *     cdef int matrix_width = matrix.dimensions[0]             # <<<<<<<<<<<<<< 
+ *     cdef npy_int16 char_index
+ *     # Get input string as character pointer
+ */
+  __pyx_v_matrix_width = (__pyx_v_matrix->dimensions[0]);
+
+
+  /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":67
+ *     cdef npy_int16 char_index
+ *     # Get input string as character pointer
+ *     PyString_AsStringAndSize( string, &buffer, &len )             # <<<<<<<<<<<<<< 
+ *     # Loop over each position in the string 
+ *     cdef int stop = len - matrix.dimensions[0] + 1
+ */
+  __pyx_1 = PyString_AsStringAndSize(__pyx_v_string, (&__pyx_v_buffer), (&__pyx_v_len)); if (unlikely(__pyx_1 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 67; __pyx_clineno = __LINE__; goto __pyx_L1;}
+
+  /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":69
+ *     PyString_AsStringAndSize( string, &buffer, &len )
+ *     # Loop over each position in the string 
+ *     cdef int stop = len - matrix.dimensions[0] + 1             # <<<<<<<<<<<<<< 
+ *     for i from 0 <= i < stop:
+ *         if buffer[i] == '-':
+ */
+  __pyx_v_stop = ((__pyx_v_len - (__pyx_v_matrix->dimensions[0])) + 1);
+
+
+  /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":70
+ *     # Loop over each position in the string 
+ *     cdef int stop = len - matrix.dimensions[0] + 1
+ *     for i from 0 <= i < stop:             # <<<<<<<<<<<<<< 
+ *         if buffer[i] == '-':
+ *             # Never start scoring at a gap
+ */
+  for (__pyx_v_i = 0; __pyx_v_i < __pyx_v_stop; __pyx_v_i++) {
+
+    /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":71
+ *     cdef int stop = len - matrix.dimensions[0] + 1
+ *     for i from 0 <= i < stop:
+ *         if buffer[i] == '-':             # <<<<<<<<<<<<<< 
+ *             # Never start scoring at a gap
+ *             continue
+ */
+    __pyx_2 = ((__pyx_v_buffer[__pyx_v_i]) == '-');
+    if (__pyx_2) {
+      goto __pyx_L4;
+      goto __pyx_L6;
+    }
+    __pyx_L6:;
+
+    /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":74
+ *             # Never start scoring at a gap
+ *             continue
+ *         score = 0.0             # <<<<<<<<<<<<<< 
+ *         string_pos = i
+ *         for j from 0 <= j < matrix_width:
+ */
+    __pyx_v_score = 0.0;
+
+    /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":75
+ *             continue
+ *         score = 0.0
+ *         string_pos = i             # <<<<<<<<<<<<<< 
+ *         for j from 0 <= j < matrix_width:
+ *             # Advance to the next non-gap character
+ */
+    __pyx_v_string_pos = __pyx_v_i;
+
+    /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":76
+ *         score = 0.0
+ *         string_pos = i
+ *         for j from 0 <= j < matrix_width:             # <<<<<<<<<<<<<< 
+ *             # Advance to the next non-gap character
+ *             while buffer[string_pos] == '-' and string_pos < len:
+ */
+    for (__pyx_v_j = 0; __pyx_v_j < __pyx_v_matrix_width; __pyx_v_j++) {
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":78
+ *         for j from 0 <= j < matrix_width:
+ *             # Advance to the next non-gap character
+ *             while buffer[string_pos] == '-' and string_pos < len:             # <<<<<<<<<<<<<< 
+ *                 string_pos += 1
+ *             # Ran out of non-gap characters, no more scoring is possible
+ */
+      while (1) {
+        __pyx_2 = ((__pyx_v_buffer[__pyx_v_string_pos]) == '-');
+        if (__pyx_2) {
+          __pyx_2 = (__pyx_v_string_pos < __pyx_v_len);
+        }
+        if (!__pyx_2) break;
+
+        /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":79
+ *             # Advance to the next non-gap character
+ *             while buffer[string_pos] == '-' and string_pos < len:
+ *                 string_pos += 1             # <<<<<<<<<<<<<< 
+ *             # Ran out of non-gap characters, no more scoring is possible
+ *             if string_pos == len:
+ */
+        __pyx_v_string_pos += 1;
+      }
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":81
+ *                 string_pos += 1
+ *             # Ran out of non-gap characters, no more scoring is possible
+ *             if string_pos == len:             # <<<<<<<<<<<<<< 
+ *                 return
+ *             # Find character for position and score
+ */
+      __pyx_2 = (__pyx_v_string_pos == __pyx_v_len);
+      if (__pyx_2) {
+
+        /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":82
+ *             # Ran out of non-gap characters, no more scoring is possible
+ *             if string_pos == len:
+ *                 return             # <<<<<<<<<<<<<< 
+ *             # Find character for position and score
+ *             char_index = ( <npy_int16 *> ( char_to_index.data + buffer[string_pos] * char_to_index.strides[0] ) )[0]
+ */
+        __pyx_r = Py_None; Py_INCREF(Py_None);
+        goto __pyx_L0;
+        goto __pyx_L11;
+      }
+      __pyx_L11:;
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":84
+ *                 return
+ *             # Find character for position and score
+ *             char_index = ( <npy_int16 *> ( char_to_index.data + buffer[string_pos] * char_to_index.strides[0] ) )[0]             # <<<<<<<<<<<<<< 
+ *             if char_index < 0: 
+ *                 break
+ */
+      __pyx_v_char_index = (((npy_int16 *)(__pyx_v_char_to_index->data + ((__pyx_v_buffer[__pyx_v_string_pos]) * (__pyx_v_char_to_index->strides[0]))))[0]);
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":85
+ *             # Find character for position and score
+ *             char_index = ( <npy_int16 *> ( char_to_index.data + buffer[string_pos] * char_to_index.strides[0] ) )[0]
+ *             if char_index < 0:             # <<<<<<<<<<<<<< 
+ *                 break
+ *             score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]
+ */
+      __pyx_2 = (__pyx_v_char_index < 0);
+      if (__pyx_2) {
+        goto __pyx_L8;
+        goto __pyx_L12;
+      }
+      __pyx_L12:;
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":87
+ *             if char_index < 0: 
+ *                 break
+ *             score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]             # <<<<<<<<<<<<<< 
+ *             # Matched a character, move forward
+ *             string_pos += 1
+ */
+      __pyx_v_score += (((npy_float32 *)((__pyx_v_matrix->data + (__pyx_v_j * (__pyx_v_matrix->strides[0]))) + (__pyx_v_char_index * (__pyx_v_matrix->strides[1]))))[0]);
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":89
+ *             score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]
+ *             # Matched a character, move forward
+ *             string_pos += 1             # <<<<<<<<<<<<<< 
+ *         else:
+ *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score */
+      __pyx_v_string_pos += 1;
+    }
+    /*else*/ {
+
+      /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":91
+ *             string_pos += 1
+ *         else:
+ *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score             # <<<<<<<<<<<<<< 
+ */
+      (((npy_float32 *)(__pyx_v_rval->data + (__pyx_v_i * (__pyx_v_rval->strides[0]))))[0]) = __pyx_v_score;
+    }
+    __pyx_L8:;
+    __pyx_L4:;
+  }
+
+  __pyx_r = Py_None; Py_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1:;
+  __Pyx_AddTraceback("bx.motif._pwm.score_string_with_gaps");
+  __pyx_r = NULL;
+  __pyx_L0:;
+  return __pyx_r;
+}
+
 static struct PyMethodDef __pyx_methods[] = {
   {"score_string", (PyCFunction)__pyx_pf_2bx_5motif_4_pwm_score_string, METH_VARARGS|METH_KEYWORDS, __pyx_doc_2bx_5motif_4_pwm_score_string},
+  {"score_string_with_gaps", (PyCFunction)__pyx_pf_2bx_5motif_4_pwm_score_string_with_gaps, METH_VARARGS|METH_KEYWORDS, __pyx_doc_2bx_5motif_4_pwm_score_string_with_gaps},
   {0, 0, 0, 0}
 };
 
   /*--- Function import code ---*/
   /*--- Execution code ---*/
 
-  /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":20
- *     ctypedef float npy_float32
- * 
- * def score_string( ndarray matrix, ndarray char_to_index, object string, ndarray rval ):             # <<<<<<<<<<<<<< 
+  /* "/Users/james/projects/bx-python/code/trunk/lib/bx/motif/_pwm.pyx":50
+ *             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score
+ *             
+ * def score_string_with_gaps( ndarray matrix, ndarray char_to_index, object string, ndarray rval ):             # <<<<<<<<<<<<<< 
  *     """
  *     Score each position in string `string` using the scoring matrix `matrix`.
  */

lib/bx/motif/_pwm.pyx

                 break
             score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]
         else:
+            ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score
+            
+def score_string_with_gaps( ndarray matrix, ndarray char_to_index, object string, ndarray rval ):
+    """
+    Score each position in string `string` using the scoring matrix `matrix`.
+    Characters in the string are mapped to columns in the matrix by `char_to_index`
+    and the score for each position is stored in `rval`.
+
+    matrix *must* be a 2d array of type float32
+    char_to_index *must* be a 1d array of type int16
+    rval *must* be a 1d array of type float32 and the same length as string
+    """
+    cdef char *buffer
+    cdef Py_ssize_t len
+    cdef float score
+    cdef int i, j, string_pos
+    cdef int matrix_width = matrix.dimensions[0]
+    cdef npy_int16 char_index
+    # Get input string as character pointer
+    PyString_AsStringAndSize( string, &buffer, &len )
+    # Loop over each position in the string 
+    cdef int stop = len - matrix.dimensions[0] + 1
+    for i from 0 <= i < stop:
+        if buffer[i] == '-':
+            # Never start scoring at a gap
+            continue
+        score = 0.0
+        string_pos = i
+        for j from 0 <= j < matrix_width:
+            # Advance to the next non-gap character
+            while buffer[string_pos] == '-' and string_pos < len:
+                string_pos += 1
+            # Ran out of non-gap characters, no more scoring is possible
+            if string_pos == len:
+                return
+            # Find character for position and score
+            char_index = ( <npy_int16 *> ( char_to_index.data + buffer[string_pos] * char_to_index.strides[0] ) )[0]
+            if char_index < 0: 
+                break
+            score += ( <npy_float32*> ( matrix.data + j * matrix.strides[0] + char_index * matrix.strides[1] ) )[0]
+            # Matched a character, move forward
+            string_pos += 1
+        else:
             ( <npy_float32*> ( rval.data + i * rval.strides[0] ) )[0] = score

lib/bx/motif/pwm.py

         rval = zeros( len( string ), float32 )
         rval[:] = nan
         _pwm.score_string( self.values, self.char_to_index, string, rval )
+        return rval
+        
+    def score_string_with_gaps( self, string ):
+        """
+        Score each valid position in `string` using this scoring matrix. 
+        Positions which were not scored are set to nan. Gap characters are
+        ignored (matrices score across them).
+        """
+        rval = zeros( len( string ), float32 )
+        rval[:] = nan
+        _pwm.score_string_with_gaps( self.values, self.char_to_index, string, rval )
         return rval

lib/bx/motif/pwm_tests.py

 import pwm
-from numpy import allclose
+from numpy import allclose, isnan
 
 def test_create():
     m = pwm.FrequencyMatrix.from_rows( ['A','C','G','T'], get_ctcf_rows() )
     assert allclose( rc.score_string( "TGCCTGCCTCTGTAGGCTCC" )[0], 0.723828315735 )
     assert allclose( rc.score_string( "GTTGCCAGTTGGGGGAAGCA" )[0], -126.99407196 )
     assert allclose( rc.score_string( "GCAGACACCAGGTGGTTCAG" )[0], -86.9560623169 )
+    # Nothing valid
+    assert isnan( sm.score_string_with_gaps( "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" ) ).all()
+    # Too short
+    assert isnan( sm.score_string( "TTTT" ) ).all()
+
+def test_scoring_with_gaps():
+    m = pwm.FrequencyMatrix.from_rows( ['A','C','G','T'], get_ctcf_rows() )
+    # Stormo method
+    sm = m.to_stormo_scoring_matrix()
+    # Forward matches
+    assert allclose( sm.score_string_with_gaps( "GTTGCCAGT----TGGGGGAAGCATTT---AA" )[0], 4.65049839 )
+    assert allclose( sm.score_string_with_gaps( "GCAGA--CACCAGGTGG--TTCAG---" )[0], 1.60168743 )
+    assert allclose( sm.score_string_with_gaps( "----GTTGCCAGTTGGGGGAAGCA" )[4], 4.65049839 )
+    assert allclose( sm.score_string_with_gaps( "TTT--GTT--GCCA--GTTGGGG-G-A-A-G-C-A-" )[5], 4.65049839 )
+    assert isnan( sm.score_string_with_gaps( "TTT--GTT--GCCA--GTTGGGG-G-A-A-G-C-A-" )[4] )
+    # Nothing valid
+    assert isnan( sm.score_string_with_gaps( "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" ) ).all()
+    assert isnan( sm.score_string_with_gaps( "------------------------------------" ) ).all()
+    # Too short
+    assert isnan( sm.score_string_with_gaps( "TTTT" ) ).all()
+    assert isnan( sm.score_string_with_gaps( "TTTT----" ) ).all()
+
 
 def get_ctcf_rows():
     """
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.