Commits

Anonymous committed a22acbe

merge of Clement Pernet's patch:
--------------------------------------
Intermediate progress to the matmul based LQUP implementation project:

* Introduces offset for matrices, updates mzd_copy, mzd_nul_naiv, all the matmul code, to be consistent with odd offsets
* introduces the triangular system solving with matrix: left and right looking with upper and lower triangular matrices.
* add a test_trsm test in the test suite

Comments (0)

Files changed (8)

src/brilliantrussian.c

   return _mzd_mul_m4rm(C, A, B, k, FALSE);
 }
 
-packedmatrix *_mzd_mul_m4rm_old(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k, int clear) {
-  size_t i,j, a_nr, a_nc, b_nc;
-  size_t truerow;
-  unsigned int x;
-
-
-  a_nr = A->nrows;
-  a_nc = A->ncols;
-  b_nc = B->ncols;
-
-  if (b_nc < RADIX-10) {
-    return mzd_mul_naiv(C, A, B);
-  }
-
-  size_t wide = C->width;
-
-  /* clear first */
-  if (clear) {
-    for (i=0; i<C->nrows; i++) {
-      truerow = C->rowswap[i];
-      for (j=0; j<C->width; j++) {
-  	C->values[truerow + j] = 0;
-     }
-    }
-  }
-
-  const size_t blocksize = MZD_MUL_BLOCKSIZE;
-
-  if (k == 0) {
-    k = m4ri_opt_k(blocksize, a_nc, b_nc);
-  }
-
-  packedmatrix *T = mzd_init(TWOPOW(k), b_nc);
-  size_t *L = (size_t *)m4ri_mm_malloc(TWOPOW(k) * sizeof(size_t));
-
-  size_t s, start;
-  const size_t end = a_nc/k;
-
-    for (start=0; start + blocksize <= a_nr; start += blocksize) {
-      for(i=0; i < end; i++) {
-        mzd_make_table( B, i*k, 0, k, T, L);
-        for(s = 0; s < blocksize; s++) {
-          j = start + s;
-          x = L[ (int)mzd_read_bits(A, j, i*k, k) ];
-          word * const c = C->values + C->rowswap[j];
-          const word * const t = T->values + T->rowswap[x];
-          for(size_t ii=0; ii<wide ; ii++)
-            c[ii] ^= t[ii];
-        }
-      }
-    }
-  
-    for(i=0; i < a_nc/k; i++) {
-      mzd_make_table( B, i*k, 0, k, T, L);
-      for(s = 0; s < a_nr-start; s++) {
-        j = start + s;
-        x = L[ (int)mzd_read_bits(A, j, i*k, k) ];
-        /* mzd_combine( C,j,0, C,j,0,  T,x,0); */
-        word *c = C->values + C->rowswap[j];
-      const word *t = T->values + T->rowswap[x];
-      for(size_t ii=0; ii<wide ; ii++)
-        c[ii] ^= t[ii];
-      }
-    }
-    
-    /* handle rest */
-    if (a_nc%k) {
-      mzd_make_table( B, a_nc/k * k , 0, a_nc%k, T, L);
-    
-    for(j = 0; j<a_nr; j++) {
-      x = L[ (int)mzd_read_bits(A, j, i*k, a_nc%k) ];
-      mzd_combine(C,j,0, C,j,0, T,x,0);
-    }
-    }
-    
-    mzd_free(T);
-    m4ri_mm_free(L);
-    return C;
-}
-
 #ifdef HAVE_SSE2
 static inline void _mzd_combine8(word *c, word *t1, word *t2, word *t3, word *t4, word *t5, word *t6, word *t7, word *t8, int wide) {
   size_t i;
   size_t b_nc = B->ncols;
 
   if (b_nc < RADIX-10) {
-    return mzd_mul_naiv(C, A, B);
+    return _mzd_mul_naiv(C, A, B, clear);
   }
 
   size_t wide = C->width;
   if (clear) {
     for (i=0; i<C->nrows; i++) {
       truerow = C->rowswap[i];
-      for (j=0; j<C->width; j++) {
+      for (j=0; j<C->width-1; j++) {
   	C->values[truerow + j] = 0;
-     }
+      }
     }
   }
+  //   C->values[truerow + j] &= ((ONE << (RADIX - (C->ncols % RADIX))) - 1);
+
+  
 
   const size_t blocksize = MZD_MUL_BLOCKSIZE;
 

src/packedmatrix.c

   window->width = (window->offset + ncols) / RADIX;
   if ((window->offset + ncols) % RADIX)
     window->width++;
-  
   window->values = m->values;
 
 #ifdef HAVE_OPENMP
 }
 
 void mzd_print_matrix( const packedmatrix *M ) {
-  assert(M->offset == 0);
-
+  
   size_t i, j;
   char temp[SAFECHAR];
   word *row;
   for (i=0; i< M->nrows; i++ ) {
     printf("[ ");
     row = M->values + M->rowswap[i];
-    for (j=0; j< M->ncols/RADIX; j++) {
+    for (j=0; j< (M->ncols+M->offset)/RADIX; j++) {
       m4ri_word_to_str(temp, row[j], 1);
       printf("%s ", temp);
     }
     row = row + M->width - 1;
-    for (j=0; j< (size_t)(M->ncols%RADIX); j++) {
+    for (j=0; j< (size_t)((M->ncols+M->offset)%RADIX); j++) {
       printf("%d", (int)GET_BIT(*row, j));
       if (((j % 4)==3) && (j!=RADIX-1))
         printf(":");
     temp = DST->values + DST->rowswap[i];
     for (j=0; j < eol; j+=RADIX) {
       for (k=0; k<RADIX; k++) {
-        *temp |= ((word)mzd_read_bit(A, j+k, i))<<(RADIX-1-k);
+        *temp |= ((word)mzd_read_bit(A, j+k, i+A->offset))<<(RADIX-1-k);
       }
       temp++;
     }
     j = A->nrows - (A->nrows%RADIX);
     for (k=0; k<(size_t)(A->nrows%RADIX); k++) {
-      *temp |= ((word)mzd_read_bit(A, j+k, i))<<(RADIX-1-k);
+      *temp |= ((word)mzd_read_bit(A, j+k, i+A->offset))<<(RADIX-1-k);
     }
   }
+  DST->offset = 0;
   return DST;
 }
 
   return _mzd_transpose(DST, A);
 }
 
-packedmatrix *mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
-  assert(A->offset == 0);
-  assert(B->offset == 0);
+packedmatrix *mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B, const int clear) {
+  //assert(A->offset == 0);
+  //assert(B->offset == 0);
 
   packedmatrix *BT = mzd_transpose(NULL, B);
 
       m4ri_die("mzd_mul_naiv: Provided return matrix has wrong dimensions.\n");
     }
   }
-  _mzd_mul_naiv(C, A, BT);
+  _mzd_mul_naiv(C, A, BT, clear);
   mzd_free (BT);
   return C;
 }
 
 
 
-packedmatrix *_mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
-  assert(A->offset == 0);
-  assert(B->offset == 0);
-  assert(C->offset == 0);
+packedmatrix *_mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B, const int clear) {
+  //assert(A->offset == 0);
+  //assert(B->offset == 0);
+  //assert(C->offset == 0);
 
   size_t i, j, k, ii, eol;
   word *a, *b, *c;
       a = A->values + A->rowswap[i];
       c = C->values + C->rowswap[i];
       for (j=0; j<RADIX*eol; j+=RADIX) {
-        for (k=0; k<RADIX; k++) {
+	for (k=0; k<RADIX; k++) {
           b = B->values + B->rowswap[j+k];
           parity[k] = a[0] & b[0];
           for (ii=wide-1; ii>=1; ii--)
-          parity[k] ^= a[ii] & b[ii];
+	    parity[k] ^= a[ii] & b[ii];
         }
         c[j/RADIX] ^= parity64(parity);
       }
 }
 
 packedmatrix *mzd_copy(packedmatrix *n, const packedmatrix *p) {
-  if (n == NULL) {
-    n = mzd_init(p->nrows, p->ncols);
-  } else {
-    if (n == p) {
-      return n;
-    } else if (n->nrows < p->nrows || n->ncols < p->ncols) {
-      m4ri_die("mzd_copy: Target matrix is too small.");
+
+  if (!p->offset){
+    if (n == NULL) {
+      n = mzd_init(p->nrows, p->ncols);
+    } else {
+      if (n == p) {
+	return n;
+      } else if (n->nrows < p->nrows || n->ncols < p->ncols) {
+	m4ri_die("mzd_copy: Target matrix is too small.");
+      }
+    }
+    size_t i, j, p_truerow, n_truerow;
+
+    if (p->ncols < RADIX) {
+      // All columns fit in one word
+      word mask = (ONE << (RADIX - p->ncols))-1;
+
+      for (i=0; i<p->nrows; i++) {
+	p_truerow = p->rowswap[i];
+	n_truerow = n->rowswap[i];
+	n->values[n_truerow] = (n->values[n_truerow] & mask) | (p->values[p_truerow] & ~mask);
+      }
+    } else {
+
+      int r = p->ncols % RADIX;
+      word mask_end = (ONE << (RADIX - r)) - 1;
+
+      for (i=0; i<p->nrows; i++) {
+	p_truerow = p->rowswap[i];
+	n_truerow = n->rowswap[i];
+	for (j=0; j<p->width-1; j++) {
+	  n->values[n_truerow + j] = p->values[p_truerow + j];
+	}
+	n->values[n_truerow + j] = (n->values[n_truerow + j] & mask_end) | (p->values[p_truerow + j] & ~mask_end);
+      }
+    }
+  } else { // p->offset > 0
+    if (n == NULL) {
+      n = mzd_init(p->nrows, p->ncols+ p->offset);
+      n->ncols -= p->offset;
+    } else {
+      if (n == p) {
+	return n;
+      } else if (n->nrows < p->nrows || n->ncols < p->ncols) {
+	m4ri_die("mzd_copy: Target matrix is too small.");
+      }
+    }
+    size_t i, j, p_truerow, n_truerow;
+    int trailingdim =  RADIX - p->ncols - p->offset;
+
+    if (trailingdim >= 0) {
+      // All columns fit in one word
+      word mask = ((ONE << p->ncols) - 1) << trailingdim;
+      for (i=0; i<p->nrows; i++) {
+	p_truerow = p->rowswap[i];
+	n_truerow = n->rowswap[i];
+	n->values[n_truerow] = (n->values[n_truerow] & ~mask) | (p->values[p_truerow] & mask);
+      }
+    } else {
+      int r = (p->ncols + p->offset) % RADIX;
+      word mask_begin = (ONE << (RADIX - p->offset)) - 1;
+      word mask_end = (ONE << (RADIX - r)) - 1;
+      for (i=0; i<p->nrows; i++) {
+	p_truerow = p->rowswap[i];
+	n_truerow = n->rowswap[i];
+	n->values[n_truerow] = (n->values[n_truerow] & ~mask_begin) | (p->values[p_truerow] & mask_begin);
+	for (j=1; j<p->width-1; j++) {
+	  n->values[n_truerow + j] = p->values[p_truerow + j];
+	}
+	n->values[n_truerow + j] = (n->values[n_truerow + j] & mask_end) | (p->values[p_truerow + j] & ~mask_end);
+      }
     }
   }
-  size_t i, j, p_truerow, n_truerow;
+  n->offset = p->offset;
+  n->width=p->width;
   
-  for (i=0; i<p->nrows; i++) {
-    p_truerow = p->rowswap[i];
-    n_truerow = n->rowswap[i];
-    for (j=0; j<p->width; j++) {
-      n->values[n_truerow + j] = p->values[p_truerow + j];
-    }
-  }
-
   return n;
 }
 

src/packedmatrix.h

 void mzd_free(packedmatrix *A);
 
 
-packedmatrix *mzd_init_window(const packedmatrix *M, const size_t lowr, const size_t lowc, const size_t highr, const size_t highc);
-
 /**
  * \brief Create a window/view into the matrix M.
  *
  */
 
 static inline BIT mzd_read_bit(const packedmatrix *M, const size_t row, const size_t col ) {
-  return GET_BIT(M->values[ M->rowswap[row] + col/RADIX ], col%RADIX);
+  return GET_BIT(M->values[ M->rowswap[row] + (col+M->offset)/RADIX ], (col+M->offset) % RADIX);
 }
 
 /**
 
 static inline void mzd_write_bit(packedmatrix *M, const size_t row, const size_t col, const BIT value) {
   if (value==1)
-    SET_BIT(M->values[ M->rowswap[row] + col/RADIX ], col % RADIX);
+    SET_BIT(M->values[ M->rowswap[row] + (col+M->offset)/RADIX ], (col+M->offset) % RADIX);
   else
-    CLR_BIT(M->values[ M->rowswap[row] + col/RADIX ], col % RADIX);
+    CLR_BIT(M->values[ M->rowswap[row] + (col+M->offset)/RADIX ], (col+M->offset) % RADIX);
 }
 
 /**
  */
 
 static inline void mzd_xor_block(packedmatrix *M, const size_t row, const size_t col, const word value) {
-  size_t block=col/RADIX;
+  size_t block=(col+M->offset)/RADIX;
   size_t truerow=M->rowswap[row];
 
   word *entry=M->values + block + truerow;
  */
 
 static inline void mzd_write_block(packedmatrix *M, const size_t row, const size_t col, const word value) {
-  M->values[ M->rowswap[row] + col/RADIX ] = value;
+  M->values[ M->rowswap[row] + (col+M->offset)/RADIX ] = value;
 }
 
 /**
  */
 
 static inline word mzd_read_block(const packedmatrix *M, const size_t row, const size_t col ) {
-  return M->values[ M->rowswap[row] + col/RADIX ];
+  return M->values[ M->rowswap[row] + (col+M->offset)/RADIX ];
 }
 
 /**
  * \param C Preallocated product matrix, may be NULL for automatic creation.
  * \param A Input matrix A.
  * \param B Input matrix B.
+ * \param clear Whether to clear C before accumulating AB
  *
  * \note Normally, if you will multiply several times by b, it is
  * smarter to calculate bT yourself, and keep it, and then use the
  *
  * \wordoffset
  */
-packedmatrix *mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B);
+packedmatrix *mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B, const int clear);
 
 /**
  * \brief Naive cubic matrix multiplication with the pre-transposed B.
  * \param C Preallocated product matrix.
  * \param A Input matrix A.
  * \param B Pre-transposed input matrix B.
+ * \param clear Whether to clear C before accumulating AB
  *
  * \wordoffset
  */
 
-packedmatrix *_mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B);
+packedmatrix *_mzd_mul_naiv(packedmatrix *C, const packedmatrix *A, const packedmatrix *B, const int clear);
 
 /**
  * \brief Fill matrix M with uniformly distributed bits.
   /* there are two possible situations. Either all bits are in one
    * word or they are spread across two words. */
 
-  if ( (y%RADIX + n - 1) < RADIX ) {
+  if ( ((y+M->offset)%RADIX + n - 1) < RADIX ) {
     /* everything happens in one word here */
-    temp =  M->values[ y / RADIX + truerow ]; /* get the value */
-    temp <<= y%RADIX; /* clear upper bits */
+    temp =  M->values[ (y+M->offset) / RADIX + truerow ]; /* get the value */
+    temp <<= (y+M->offset)%RADIX; /* clear upper bits */
     temp >>= RADIX - n; /* clear lower bits and move to correct position.*/
     return temp;
 
   } else {
     /* two words are affected */
-    const size_t block = y / RADIX + truerow; /* correct block */
-    const size_t spot = (y + n ) % RADIX; /* correct offset */
+    const size_t block = (y+M->offset) / RADIX + truerow; /* correct block */
+    const size_t spot = (y + +M->offset+n ) % RADIX; /* correct offset */
     /* make room by shifting spot times to the right, and add stuff from the second word */
     temp = (M->values[block] << spot) | ( M->values[block + 1] >> (RADIX - spot) ); 
     return (temp << (RADIX-n)) >> (RADIX-n); /* clear upper bits and return */
   /* there are two possible situations. Either all bits are in one
    * word or they are spread across two words. */
 
-  if ( (y%RADIX + n - 1) < RADIX ) {
+  if ( ((y+M->offset)%RADIX + n - 1) < RADIX ) {
     /* everything happens in one word here */
-    temp =  M->values +  y / RADIX + truerow;
-    *temp |= values<<(RADIX-(y%RADIX)-n);
+    temp =  M->values +  (y+M->offset) / RADIX + truerow;
+    *temp |= values<<(RADIX-((y+M->offset)%RADIX)-n);
 
   } else {
     /* two words are affected */
-    const size_t block = y / RADIX + truerow; /* correct block */
-    const size_t spot = (y + n ) % RADIX; /* correct offset */
+    const size_t block = (y+M->offset) / RADIX + truerow; /* correct block */
+    const size_t spot = (y +M->offset+ n ) % RADIX; /* correct offset */
     M->values[block] |= values >> (spot);
     M->values[block + 1] |= values<<(RADIX-spot);
   }
   /* there are two possible situations. Either all bits are in one
    * word or they are spread across two words. */
 
-  if ( (y%RADIX + n - 1) < RADIX ) {
+  if ( ((y+M->offset)%RADIX + n - 1) < RADIX ) {
     /* everything happens in one word here */
-    temp =  M->values[ y / RADIX + truerow ];
-    temp <<= y%RADIX; /* clear upper bits */
+    temp =  M->values[ (y+M->offset) / RADIX + truerow ];
+    temp <<= (y+M->offset)%RADIX; /* clear upper bits */
     temp >>= RADIX-n; /* clear lower bits and move to correct position.*/
-    temp <<= RADIX-n - y%RADIX;
-    M->values[ y / RADIX + truerow ] ^= temp;
+    temp <<= RADIX-n - (y+M->offset)%RADIX;
+    M->values[ (y+M->offset) / RADIX + truerow ] ^= temp;
   } else {
     /* two words are affected */
-    const size_t block = y / RADIX + truerow; /* correct block */
-    const size_t spot = (y + n ) % RADIX; /* correct offset */
+    const size_t block = (y+M->offset) / RADIX + truerow; /* correct block */
+    const size_t spot = (y+M->offset + n ) % RADIX; /* correct offset */
     M->values[block] ^= M->values[block] & ((ONE<<(n-spot))-1);
     M->values[block+1] ^= (M->values[block+1]>>(RADIX-spot))<<(RADIX-spot);
   }
 #include "misc.h"
 #include "parity.h"
 #define CLOSER(a,b,target) (abs((long)a-(long)target)<abs((long)b-(long)target))
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
 
 #ifdef HAVE_OPENMP
 #include <omp.h>
     /* we copy the matrix first since it is only constant memory
        overhead and improves data locality, if you remove it make sure
        there are no speed regressions */
-    packedmatrix *Cbar = mzd_copy(NULL, C);
-    Cbar = mzd_addmul_m4rm(Cbar, A, B, 0);
+    packedmatrix *Cbar = mzd_copy (NULL, C);
+
+    mzd_addmul_m4rm (Cbar, A, B, 0);
     mzd_copy(C, Cbar);
     mzd_free(Cbar);
     return C;
 
 packedmatrix *_mzd_addmul (packedmatrix *C, packedmatrix *A, packedmatrix *B, int cutoff){
   /**
-   * Assumes that B and C are aligned in the same manner(as in a Schur complement)
-   * TODO: add addmul_even after each weird call 
+   * Assumes that B and C are aligned in the same manner (as in a Schur complement)
    */
   
   if (!A->offset){
-    if (!B->offset){
-
+    if (!B->offset) /* A even, B even */
       return _mzd_addmul_even (C, A, B, cutoff);
-
+    else {  /* A even, B weird */
+      size_t bnc = RADIX - B->offset;
+      if (B->ncols <= bnc){
+	_mzd_addmul_even_weird  (C,  A, B, cutoff);
+      } else {
+	packedmatrix * B0 = mzd_init_window (B, 0, 0, B->nrows, bnc);
+	packedmatrix * C0 = mzd_init_window (C, 0, 0, C->nrows, bnc);
+	packedmatrix * B1 = mzd_init_window (B, 0, bnc, B->nrows, B->ncols);
+	packedmatrix * C1 = mzd_init_window (C, 0, bnc, C->nrows, C->ncols);
+	_mzd_addmul_even_weird  (C0,  A, B0, cutoff);
+	_mzd_addmul_even (C1, A, B1, cutoff);
+	mzd_free_window (B0); mzd_free_window (B1);
+	mzd_free_window (C0); mzd_free_window (C1);
+      }
+    }
+  } else if (B->offset) { /* A weird, B weird */
+    size_t anc = RADIX - A->offset;
+    size_t bnc = RADIX - B->offset;
+    if (B->ncols <= bnc){
+      if (A->ncols <= anc)
+	_mzd_addmul_weird_weird (C, A, B, cutoff);
+      else {
+	packedmatrix * A0  = mzd_init_window (A, 0, 0, A->nrows, anc);
+	packedmatrix * A1  = mzd_init_window (A, 0, anc, A->nrows, A->ncols);
+	packedmatrix * B0  = mzd_init_window (B, 0, 0, anc, B->ncols);
+	packedmatrix * B1  = mzd_init_window (B, anc, 0, B->nrows, B->ncols);
+	_mzd_addmul_weird_weird (C, A0, B0, cutoff);
+	_mzd_addmul_even_weird  (C, A1, B1, cutoff);
+	mzd_free_window (A0);  mzd_free_window (A1);
+	mzd_free_window (B0);  mzd_free_window (B1);
+      }
+    } else if (A->ncols <= anc) {
+      packedmatrix * B0 = mzd_init_window (B, 0, 0, B->nrows, bnc);
+      packedmatrix * B1 = mzd_init_window (B, 0, bnc, B->nrows, B->ncols);
+      packedmatrix * C0 = mzd_init_window (C, 0, 0, C->nrows, bnc);
+      packedmatrix * C1 = mzd_init_window (C, 0, bnc, C->nrows, C->ncols);
+      _mzd_addmul_weird_weird (C0, A, B0, cutoff);
+      _mzd_addmul_weird_even  (C1, A, B1, cutoff);
+      mzd_free_window (B0); mzd_free_window (B1);
+      mzd_free_window (C0); mzd_free_window (C1);
     } else {
-      size_t bnc = RADIX - B->offset;
-      packedmatrix * B0 = mzd_init_window (B, 0, 0, B->nrows, bnc);
-      packedmatrix * C0 = mzd_init_window (C, 0, 0, C->nrows, bnc);
-      _mzd_addmul_even_weird  (C0,  A, B0, cutoff);
-      mzd_free (B0);
-      mzd_free (C0);
-    }
-  } else {
-    if (B->offset) {
-      size_t anc = RADIX - A->offset;
-      size_t bnc = RADIX - B->offset;
       packedmatrix * A0  = mzd_init_window (A, 0, 0, A->nrows, anc);
       packedmatrix * A1  = mzd_init_window (A, 0, anc, A->nrows, A->ncols);
       packedmatrix * B00 = mzd_init_window (B, 0, 0, anc, bnc);
       packedmatrix * B01 = mzd_init_window (B, 0, bnc, anc, B->ncols);
       packedmatrix * B10 = mzd_init_window (B, anc, 0, B->nrows, bnc);
+      packedmatrix * B11 = mzd_init_window (B, anc, bnc, B->nrows, B->ncols);
       packedmatrix * C0 = mzd_init_window (C, 0, 0, C->nrows, bnc);
       packedmatrix * C1 = mzd_init_window (C, 0, bnc, C->nrows, C->ncols);
+      
+      _mzd_addmul_weird_weird (C0, A0, B00, cutoff);
+      _mzd_addmul_even_weird  (C0,  A1, B10, cutoff);
+      _mzd_addmul_weird_even  (C1,  A0, B01, cutoff);
+      _mzd_addmul_even  (C1,  A1, B11, cutoff);
 
-      _mzd_addmul_weird_weird (C0, A0, B00, cutoff);
-      _mzd_addmul_weird_even  (C1,  A0, B01, cutoff);
-      _mzd_addmul_even_weird  (C0,  A1, B10, cutoff);
-
-      mzd_free (A0);  mzd_free (A1);
-      mzd_free (C0);  mzd_free (C1);
-      mzd_free (B00); mzd_free (B01); mzd_free (B10);
-
+      mzd_free_window (A0);  mzd_free_window (A1);
+      mzd_free_window (C0);  mzd_free_window (C1);
+      mzd_free_window (B00); mzd_free_window (B01);
+      mzd_free_window (B10); mzd_free_window (B11);
+    }
+  } else { /* A weird, B even */
+    int anc = RADIX - A->offset;
+    if (A->ncols <= anc){
+      _mzd_addmul_weird_even  (C,  A, B, cutoff);
     } else {
-      size_t anc = RADIX - A->offset;
       packedmatrix * A0  = mzd_init_window (A, 0, 0, A->nrows, anc);
-      packedmatrix * B0 = mzd_init_window (B, 0, 0, anc, B->ncols);
-      _mzd_addmul_weird_even  (C,  A0, B0, cutoff);
-      mzd_free (A0);
-      mzd_free (B0);
+      packedmatrix * A1  = mzd_init_window (A, 0, anc, A->nrows, A->ncols);
+      packedmatrix * B0  = mzd_init_window (B, 0, 0, anc, B->ncols);
+      packedmatrix * B1  = mzd_init_window (B, anc, 0, B->nrows, B->ncols);
+      _mzd_addmul_weird_even (C, A0, B0, cutoff);
+      _mzd_addmul_even  (C, A1, B1, cutoff);
+      mzd_free_window (A0); mzd_free_window (A1);
+      mzd_free_window (B0); mzd_free_window (B1);
     }
   }
   return C;
 }
 
 packedmatrix *_mzd_addmul_weird_even (packedmatrix *C, packedmatrix *A, packedmatrix *B, int cutoff){
-  packedmatrix * tmp = mzd_init (A->nrows, RADIX - A->offset);
-  for (size_t i=0; i < A->nrows; ++i)
-    tmp->values [i] = (A->values [A->rowswap [i]] << A->offset);
+  packedmatrix * tmp = mzd_init (A->nrows, MIN(RADIX- A->offset, A->ncols));
+  for (size_t i=0; i < A->nrows; ++i){
+    tmp->values [tmp->rowswap[i]] = (A->values [A->rowswap [i]] << A->offset);
+  }
   _mzd_addmul_even (C, tmp, B, cutoff);
   mzd_free(tmp);
   return C;
 }
 
  packedmatrix *_mzd_addmul_even_weird (packedmatrix *C, packedmatrix *A, packedmatrix *B, int cutoff){
-  packedmatrix * tmp = mzd_init (B->nrows, RADIX);
-  for (size_t i=0; i < B->nrows; ++i)
-    tmp->values [tmp->rowswap[i]] = RIGHTMOST_BITS (B->values [B->rowswap [i]], RADIX - B->offset);
-  _mzd_addmul_even (C, A, tmp, cutoff);
-  return C;
+   packedmatrix * tmp = mzd_init (B->nrows, RADIX);
+   size_t offset = C->offset;
+   size_t cncols = C->ncols;
+   C->offset=0;
+   C->ncols = RADIX;
+   word mask = ((ONE << B->ncols) - 1) << (RADIX-B->offset - B->ncols);
+   for (size_t i=0; i < B->nrows; ++i)
+     tmp->values [tmp->rowswap[i]] = B->values [B->rowswap [i]] & mask;
+   _mzd_addmul_even (C, A, tmp, cutoff);
+   C->offset=offset;
+   C->ncols = cncols;
+   mzd_free (tmp);
+   return C;
 }
 
  packedmatrix* _mzd_addmul_weird_weird (packedmatrix* C, packedmatrix* A, packedmatrix *B, int cutoff){
-   packedmatrix *BT = mzd_transpose(NULL, B);
+   packedmatrix *BT;
+   word* temp;
+   BT = mzd_init( B->ncols, B->nrows );
+   
+   for (int i = 0; i < B->ncols; ++i) {
+     temp = BT->values + BT->rowswap[i];
+     for (size_t k = 0; k < B->nrows; k++) {
+      *temp |= ((word)mzd_read_bit (B, k, i)) << (RADIX-1-k-A->offset);
+
+     }
+   }
+   
    word parity[64];
    for (size_t i = 0; i < 64; i++) {
      parity[i] = 0;
    }
-   for (size_t i = 0; i < RADIX; ++i) {
+   for (size_t i = 0; i < A->nrows; ++i) {
      word * a = A->values + A->rowswap[i];
      word * c = C->values + C->rowswap[i];
-     for (size_t k=RADIX-1; k>=BT->offset; k--) {
+     for (size_t k=0; k< C->ncols; k++) {
        word *b = BT->values + BT->rowswap[k];
-       parity[k] = (*a) & (*b);
+       parity[k+C->offset] = (*a) & (*b);
      }
-     *c ^= parity64(parity);
+     word par = parity64(parity);
+     *c ^= par;//parity64(parity);
    }
+   mzd_free (BT);
    return C;
  }
 
 }
 
 void _mzd_trsm_upper_right (packedmatrix *U, packedmatrix *B, const int cutoff) {
-
+  size_t nb = B->ncols;
+  size_t mb = B->nrows;
+  size_t n1 = RADIX-B->offset;
+  if (nb <= n1)
+    _mzd_trsm_upper_right_weird (U, B, cutoff);
+  else{
   /**
    \verbatim  
      _________ 
    * Their column dimension is lower than 64
    * The first column of U01, U11, B1 are aligned to words.
    */
-
-  size_t nb = B->ncols;
-  size_t mb = B->nrows;
-  size_t n1 = RADIX-B->offset;
-  packedmatrix *B0  = mzd_init_window (B,  0,  0, mb, n1);
-  packedmatrix *B1  = mzd_init_window (B,  0, n1, mb, nb);
-  packedmatrix *U00 = mzd_init_window (U,  0,  0, n1, n1);
-  packedmatrix *U01 = mzd_init_window (U,  0, n1, n1, nb);
-  packedmatrix *U11 = mzd_init_window (U, n1, n1, nb, nb);
-  
-  _mzd_trsm_upper_right_weird (U00, B0, cutoff);
-  mzd_addmul (B1, B0, U01, cutoff);
-  _mzd_trsm_upper_right_even (U11, B1, cutoff);
-  
-  mzd_free_window(B0);
-  mzd_free_window(B1);
-  
-  mzd_free_window(U00);
-  mzd_free_window(U01);
-  mzd_free_window(U11);
+    packedmatrix *B0  = mzd_init_window (B,  0,  0, mb, n1);
+    packedmatrix *B1  = mzd_init_window (B,  0, n1, mb, nb);
+    packedmatrix *U00 = mzd_init_window (U,  0,  0, n1, n1);
+    packedmatrix *U01 = mzd_init_window (U,  0, n1, n1, nb);
+    packedmatrix *U11 = mzd_init_window (U, n1, n1, nb, nb);
+    
+    _mzd_trsm_upper_right_weird (U00, B0, cutoff);
+    mzd_addmul (B1, B0, U01, cutoff);
+    _mzd_trsm_upper_right_even (U11, B1, cutoff);
+    
+    mzd_free_window(B0);
+    mzd_free_window(B1);
+    
+    mzd_free_window(U00);
+    mzd_free_window(U01);
+    mzd_free_window(U11);
+  }
 }
 
 /**
     
     register word ucol = 0;
     for (size_t k=0; k<i; ++k) {
-      if (GET_BIT (U->values[U->rowswap[k]], i + offset))
+      if (GET_BIT (U->values[U->rowswap[k]], i + U->offset))
 	SET_BIT (ucol, k+offset);
     }
-    
     /* doing 64 dotproducts at a time, to use the parity64 parallelism */
     size_t giantstep;
     word tmp[64];
       for (size_t babystep = 0; babystep < RADIX; ++babystep)
 	  if (GET_BIT (dotprod, babystep))
 	    FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i + offset);
-      }  
-      
-      for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
-	tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
-      for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
-	tmp [babystep] = 0;
+      }      
+    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep){
+      tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 
-      word dotprod = parity64 (tmp);
-      for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
-	if (GET_BIT (dotprod, babystep))
-	  FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i + offset);
     }
+    for (size_t babystep = mb-giantstep; babystep < 64; ++babystep){
+      tmp [babystep] = 0;
+    }
+
+    word dotprod = parity64 (tmp);
+    
+    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
+      if (GET_BIT (dotprod, babystep))
+	FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i + offset);
+  }
 }
 
 /**
- * Variant where U and B start at an odd bit position
+ * Variant where U and B start at an even bit position
  * Assumes that U->ncols < 64
  */
 void _mzd_trsm_upper_right_even (packedmatrix *U, packedmatrix *B, const int cutoff) {
     mzd_free_window(U11);
   }
 }
+
+/* Lower Left Implementations */
+
+void mzd_trsm_lower_left (packedmatrix *L, packedmatrix *B, const int cutoff) {
+  if(L->ncols != B->nrows)
+    m4ri_die("mzd_trsm_lower_left: L ncols (%d) need to match B nrows (%d).\n", L->ncols, B->nrows);
+  if(L->nrows != L->ncols)
+    m4ri_die("mzd_trsm_lower_left: L must be square and is found to be (%d) x (%d).\n", L->nrows, L->ncols);
+  
+  if (cutoff <= 0)
+    m4ri_die("mzd_trsm_lower_left: cutoff must be > 0.\n");
+
+  _mzd_trsm_lower_left (L, B, cutoff);
+}
+
+void _mzd_trsm_lower_left (packedmatrix *L, packedmatrix *B, const int cutoff) {
+
+
+  if (!L->offset)
+    _mzd_trsm_lower_left_even (L, B, cutoff);
+  else{
+    size_t nb = B->ncols;
+    size_t mb = B->nrows;
+    size_t m1 = RADIX - L->offset;
+    if (mb <= m1)
+      _mzd_trsm_lower_left_weird (L, B, cutoff);
+    else{
+      /**
+       *  
+       * |\           ______
+       * | \         |      |
+       * |  \        |  B0  |
+       * |L00\       |      |
+       * |____\      |______|
+       * |    |\     |      |
+       * |    | \    |      |
+       * |    |  \   |  B1  |
+       * |L10 |L11\  |      |
+       * |____|____\ |______|
+       * 
+       * L00 L10 and B0 are possibly located at uneven locations.
+       * Their column dimension is lower than 64
+       * The first column of L01, L11, B1 are aligned to words.
+       */
+      
+      packedmatrix *B0  = mzd_init_window (B,  0,  0, m1, nb);
+      packedmatrix *B1  = mzd_init_window (B,  m1, 0, mb, nb);
+      packedmatrix *L00 = mzd_init_window (L,  0,  0, m1, m1);
+      packedmatrix *L10 = mzd_init_window (L,  m1, 0, mb, m1);
+      packedmatrix *L11 = mzd_init_window (L, m1, m1, mb, mb);
+      
+      _mzd_trsm_lower_left_weird (L00, B0, cutoff);
+      mzd_addmul (B1, L10, B0, cutoff);
+      _mzd_trsm_lower_left_even (L11, B1, cutoff);
+    
+      mzd_free_window(B0);
+      mzd_free_window(B1);
+      
+      mzd_free_window(L00);
+      mzd_free_window(L10);
+      mzd_free_window(L11);
+    }
+  }
+}
+
+/**
+ * Variant where L and B start at an odd bit position
+ * Assumes that L->ncols < 64
+ */
+void _mzd_trsm_lower_left_weird (packedmatrix *L, packedmatrix *B, const int cutoff) {
+
+  size_t mb = B->nrows;
+  size_t nb = B->ncols;
+  size_t Boffset = B->offset;
+  size_t nbrest = (nb + Boffset) % RADIX;
+  if (nb + B->offset >= RADIX) {
+
+    // Large B
+    word mask1;
+
+    if (Boffset)
+      mask1 = ((ONE << (RADIX - B->offset)) - 1) ;
+    else
+      mask1 = ~0;
+    word mask2 = ~((ONE << (RADIX - nbrest)) - 1);
+
+    for (size_t i=1; i < mb; ++i) {
+      
+      /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1}  */
+      // Need to be optimized !!!
+      size_t Lidx = L->rowswap[i];
+      size_t Bidx = B->rowswap[i];
+
+      for (size_t k=0; k<i; ++k) {
+	if (GET_BIT (L->values [Lidx], k + L->offset)){
+	  B->values [Bidx] ^= B->values [B->rowswap [k]] & mask1;
+	  for (size_t j = 1; j < B->width-1; ++j)
+	    B->values [Bidx + j] ^= B->values [B->rowswap [k] + j];
+	  B->values [Bidx + B->width - 1] ^= B->values [B->rowswap [k] + B->width - 1] & mask2;
+	}
+      }
+    }
+  } else { // Small B
+
+    word mask = ((ONE << nb) - 1) ;
+    mask <<= (RADIX-nb-B->offset);
+
+    for (size_t i=1; i < mb; ++i) {
+      /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1}  */
+      // Need to be optimized !!!
+      size_t Lidx = L->rowswap[i];
+      size_t Bidx = B->rowswap[i];
+
+      for (size_t k=0; k<i; ++k) {
+	if (GET_BIT (L->values [Lidx], k + L->offset)){
+	  B->values [Bidx] ^= B->values [B->rowswap [k]] & mask;
+	}
+      }
+    }
+  }
+}
+
+/**
+ * Variant where L and B start at an odd bit position
+ * Assumes that L->ncols < 64
+ */
+void _mzd_trsm_lower_left_even (packedmatrix *L, packedmatrix *B, const int cutoff) {
+
+  size_t mb = B->nrows;
+  size_t nb = B->ncols;
+  size_t Boffset = B->offset;
+  size_t nbrest = (nb + Boffset) % RADIX;
+
+  if (mb <= RADIX){
+    /* base case */
+
+    if (nb + B->offset >= RADIX) {
+      // B is large
+      word mask1;
+      if (Boffset)
+	mask1 = ((ONE << (RADIX - B->offset)) - 1) ;
+      else
+	mask1 = ~0;
+      word mask2 = ~((ONE << (RADIX - nbrest)) - 1);
+
+      for (size_t i=1; i < mb; ++i) {
+	/* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1}  */
+	// Need to be optimized !!!
+	size_t Lidx = L->rowswap[i];
+	size_t Bidx = B->rowswap[i];
+
+	for (size_t k=0; k<i; ++k) {
+	  if (GET_BIT (L->values [Lidx], k)){
+	    B->values [Bidx] ^= B->values [B->rowswap [k]] & mask1;
+	    for (size_t j = 1; j < B->width-1; ++j)
+	      B->values [Bidx + j] ^= B->values [B->rowswap [k] + j];
+	    B->values [Bidx + B->width - 1] ^= B->values [B->rowswap [k] + B->width - 1] & mask2;
+	  }
+	}
+      }
+    } else { // B is small
+      word mask = ((ONE << nb) - 1) ;
+      mask <<= (RADIX-nb-B->offset);
+      for (size_t i=1; i < mb; ++i) {
+	/* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1}  */
+	// Need to be optimized !!!
+	size_t Lidx = L->rowswap [i];
+	size_t Bidx = B->rowswap [i];
+
+	for (size_t k=0; k<i; ++k) {
+	  if (GET_BIT (L->values [Lidx], k)){
+	    B->values [Bidx] ^= B->values [B->rowswap[k]] & mask;
+	  }
+	}
+      }
+    }
+  } else {
+    size_t mb1 = (((mb-1) / RADIX + 1) >> 1) * RADIX;
+
+    packedmatrix *B0 = mzd_init_window(B,  0,     0,   mb1, nb);
+    packedmatrix *B1 = mzd_init_window(B, mb1,    0,   mb,  nb);
+    packedmatrix *L00 = mzd_init_window(L, 0,     0, mb1, mb1);
+    packedmatrix *L10 = mzd_init_window(L, mb1,   0, mb, mb1);
+    packedmatrix *L11 = mzd_init_window(L, mb1, mb1, mb, mb);
+
+    _mzd_trsm_lower_left_even (L00, B0, cutoff);
+
+    _mzd_addmul (B1, L10, B0, cutoff);
+
+    _mzd_trsm_lower_left_even (L11, B1, cutoff);
+
+    mzd_free_window(B0);
+    mzd_free_window(B1);
+
+    mzd_free_window(L00);
+    mzd_free_window(L10);
+    mzd_free_window(L11);
+  }
+}
  * \brief TRiangular System solving with matrix.
  *
  * Solves X U = B where X and B are matrices, and U is upper triangular.
- * This version assumes that the matrices are at an even position on
- * the RADIX grid and that their dimension is a multiple of RADIX.
  * X is stored inplace on B
  * * 
  * This is the wrapper function including bounds checks. See
  * \param B Input matrix, being overwritten by the solution matrix X
  * \param cutoff Minimal dimension for Strassen recursion.
  *
- * \internal
  */
 
 void mzd_trsm_upper_right (packedmatrix *U, packedmatrix *B, const int cutoff);
 
 void _mzd_trsm_upper_right_weird (packedmatrix *U, packedmatrix *B, const int cutoff);
 
+/**
+ * \brief TRiangular System solving with matrix.
+ *
+ * Solves L X = B where X and B are matrices, and L is lower triangular.
+ *  X is stored inplace on B
+ *  
+ * This is the wrapper function including bounds checks. See
+ * _mzd_trsm_lower_left for implementation details.
+ *
+ * \param L Input lower triangular matrix.
+ * \param B Input matrix, being overwritten by the solution matrix X
+ * \param cutoff Minimal dimension for Strassen recursion.
+ *
+ */
+
+void mzd_trsm_lower_left (packedmatrix *L, packedmatrix *B, const int cutoff);
+
+/**
+ * \brief TRiangular System solving with matrix.
+ *
+ * Solves L X = B where X and B are matrices, and L is lower triangular.
+ * X is stored inplace on B
+ *
+ * \param L Input lower triangular matrix.
+ * \param B Input matrix, being overwritten by the solution matrix X
+ * \param cutoff Minimal dimension for Strassen recursion.
+ *
+ * \internal
+ */
+void _mzd_trsm_lower_left (packedmatrix *L, packedmatrix *B, const int cutoff);
+
+void _mzd_trsm_lower_left_even (packedmatrix *L, packedmatrix *B, const int cutoff);
+
+void _mzd_trsm_lower_left_weird (packedmatrix *L, packedmatrix *B, const int cutoff);
+
 #endif

testsuite/test_multiplication.c

   D = mzd_mul_m4rm(    NULL, A, B, k);
 
   /* E = A*B via naiv cubic multiplication */
-  E = mzd_mul_naiv(    NULL, A, B);
+  E = mzd_mul_naiv(    NULL, A, B, TRUE);
 
   mzd_free(A);
   mzd_free(B);

testsuite/test_trsm.c

+#include <stdlib.h>
+#include "m4ri.h"
+
+
+int testTRSMUpperRight (int m, int n, int offset){
+  packedmatrix* Ubase = mzd_init (2048,2048);
+  packedmatrix* Bbase = mzd_init (2048,2048);
+  mzd_randomize (Ubase);
+  mzd_randomize (Bbase);
+  packedmatrix* Bbasecopy = mzd_copy (NULL, Bbase);
+
+  packedmatrix* U = mzd_init_window (Ubase, 0, offset, n, offset + n);
+  packedmatrix* B = mzd_init_window (Bbase, 0, offset, m, offset + n);
+  packedmatrix* W = mzd_copy (NULL, B);
+
+  size_t i,j;
+  for (i=0; i<n; ++i){
+    for (j=0; j<i;++j)
+      mzd_write_bit(U,i,j, 0);
+    mzd_write_bit(U,i,i, 1);
+  }    
+  mzd_trsm_upper_right (U, B, 2048);
+
+  mzd_addmul(W, B, U, 2048);
+
+  int status = 0;
+  for ( i=0; i<m; ++i)
+    for ( j=0; j<n; ++j){
+      if (mzd_read_bit (W,i,j)){
+	printf("W [%d,%d] = %d\n", i,j, mzd_read_bit (W,i,j));
+	status = 1;
+      }
+    }
+  if (status){
+    printf("W = \n");
+    mzd_print_matrix(W);
+  }
+
+  // Verifiying that nothing has been changed around the submatrices
+  mzd_addmul(W, B, U, 2048);
+  mzd_copy (B, W);
+
+  for ( i=0; i<2048; ++i)
+    for ( j=0; j<2048/RADIX; ++j){
+      if (Bbase->values [Bbase->rowswap[i] + j] != Bbasecopy->values [Bbasecopy->rowswap[i] + j]){
+	printf("Bbase [%d,%d] = %llX\n", i,j, Bbase->values[Bbase->rowswap[i]+j]);
+	printf("Bbasecopy [%d,%d] = %llX\n", i,j, Bbasecopy->values[Bbasecopy->rowswap[i]+j]);
+	status = 1;
+      }
+    }
+  if (status){
+    printf("Bbase = \n");
+    mzd_print_matrix(Bbase);
+    printf("Bbasecopy] = \n");
+    mzd_print_matrix(Bbasecopy);
+  }
+  mzd_free_window (U);
+  mzd_free_window (B);
+  mzd_free (W);
+  mzd_free(Ubase);
+  mzd_free(Bbase);
+  mzd_free(Bbasecopy);
+
+  if (!status)
+    printf("passed\n");
+  return status;
+}
+
+int testTRSMLowerLeft (int m, int n, int offsetL, int offsetB){
+  packedmatrix* Lbase = mzd_init (2048,2048);
+  packedmatrix* Bbase = mzd_init (2048,2048);
+  mzd_randomize (Lbase);
+  mzd_randomize (Bbase);
+  packedmatrix* Bbasecopy = mzd_copy (NULL, Bbase);
+
+  packedmatrix* L = mzd_init_window (Lbase, 0, offsetL, m, offsetL + m);
+  packedmatrix* B = mzd_init_window (Bbase, 0, offsetB, m, offsetB + n);
+  packedmatrix* W = mzd_copy (NULL, B);
+
+  size_t i,j;
+  for (i=0; i<m; ++i){
+    for (j=i+1; j<m;++j)
+      mzd_write_bit(L,i,j, 0);
+    mzd_write_bit(L,i,i, 1);
+  }    
+  mzd_trsm_lower_left (L, B, 2048);
+  
+  mzd_addmul(W, L, B, 2048);
+
+  int status = 0;
+  for ( i=0; i<m; ++i)
+    for ( j=0; j<n; ++j){
+      if (mzd_read_bit (W,i,j)){
+	printf("W [%d,%d] = %d\n", i,j, mzd_read_bit (W,i,j));
+	status = 1;
+      }
+    }
+  if (status){
+    printf("W = \n");
+    mzd_print_matrix(W);
+    printf("L = \n");
+    mzd_print_matrix(L);
+    printf("B = \n");
+    mzd_print_matrix(B);
+  }
+
+  // Verifiying that nothing has been changed around the submatrices
+  mzd_addmul(W, L, B, 2048);
+
+  mzd_copy (B, W);
+
+  for ( i=0; i<2048; ++i)
+    for ( j=0; j<2048/RADIX; ++j){
+      if (Bbase->values [Bbase->rowswap[i] + j] != Bbasecopy->values [Bbasecopy->rowswap[i] + j]){
+	printf("Bbase     [%d,%d] = %llX\n", i,j, Bbase->values[Bbase->rowswap[i]+j]);
+	printf("Bbasecopy [%d,%d] = %llX\n", i,j, Bbasecopy->values[Bbasecopy->rowswap[i]+j]);
+	status = 1;
+      }
+    }
+  mzd_free_window (L);
+  mzd_free_window (B);
+  mzd_free_window (W);
+  mzd_free(Lbase);
+  mzd_free(Bbase);
+  mzd_free(Bbasecopy);
+
+  if (!status)
+    printf("passed\n");
+  return status;
+}
+
+int main(int argc, char **argv) {
+  m4ri_build_all_codes();
+
+  int status = 0;
+
+  printf("UpperRight: small, even placed ... ");
+  status += testTRSMUpperRight (57, 10, 0);
+  printf("UpperRight: large, even placed ... ");
+  status += testTRSMUpperRight (57, 150, 0);
+  printf("UpperRight: small, odd placed ... ");
+  status += testTRSMUpperRight (57, 3, 4);
+  printf("UpperRight: medium, odd placed ... ");
+  status += testTRSMUpperRight (57, 4, 62);
+  printf("UpperRight: large, odd placed ... ");
+  status += testTRSMUpperRight (57, 80, 60);
+  printf("UpperRight: larger, odd placed ... ");
+  status += testTRSMUpperRight (1577, 1802, 189);
+
+  printf("LowerLeft: small L even, small B even ... ");
+  status += testTRSMLowerLeft (10, 20, 0, 0);
+  printf("LowerLeft: small L even, large B even ... ");
+  status += testTRSMLowerLeft (10, 80, 0, 0);
+
+  printf("LowerLeft: small L even, small B odd ... ");
+  status += testTRSMLowerLeft (10, 20, 0, 15);
+  printf("LowerLeft: small L even, large B odd ... ");
+  status += testTRSMLowerLeft (10, 80, 0, 15);
+
+  printf("LowerLeft: small L odd, small B even ... ");
+  status += testTRSMLowerLeft (10, 20, 15, 0);
+  printf("LowerLeft: small L odd, large B even ... ");
+  status += testTRSMLowerLeft (10, 80, 15, 0);
+
+  printf("LowerLeft: small L odd, small B odd ... ");
+  status += testTRSMLowerLeft (10, 20, 15, 20);
+  printf("LowerLeft: small L odd, large B odd ... ");
+  status += testTRSMLowerLeft (10, 80, 15, 20);
+
+  printf("LowerLeft: large L even, small B even ... ");
+  status += testTRSMLowerLeft (70, 20, 0, 0);
+  printf("LowerLeft: large L even, large B even ... ");
+  status += testTRSMLowerLeft (70, 80, 0, 0);
+
+  printf("LowerLeft: large L even, small B odd ... ");
+  status += testTRSMLowerLeft (70, 10, 0, 15);
+  printf("LowerLeft: large L even, large B odd ... ");
+  status += testTRSMLowerLeft (70, 80, 0, 15);
+
+  printf("LowerLeft: large L odd, small B even ... ");
+  status += testTRSMLowerLeft (70, 20, 15, 0);
+  printf("LowerLeft: large L odd, large B even ... ");
+  status += testTRSMLowerLeft (70, 80, 15, 0);
+
+  printf("LowerLeft: large L odd, small B odd ... ");
+  status += testTRSMLowerLeft (70, 20, 15, 20);
+  printf("LowerLeft: large L odd, large B odd ... ");
+  status += testTRSMLowerLeft (70, 80, 15, 20);
+
+  printf("LowerLeft: larger L odd, larger B odd ... ");
+  status += testTRSMLowerLeft (770, 1600, 75, 89);
+  printf("LowerLeft: larger L odd, larger B odd ... ");
+  status += testTRSMLowerLeft (1764, 1345, 198, 123);
+
+  
+  if (!status) {
+    printf("All tests passed.\n");
+  }
+
+  m4ri_destroy_all_codes();
+  return 0;
+}