Commits

Martin Albrecht  committed c214e44

refactoreded packedmatrix to allow more than one malloc call to allocate the matrix
also renamed packedmatrix to mzd_t and permutation to mzp_t

  • Participants
  • Parent commits 8ce6774

Comments (0)

Files changed (30)

File src/brilliantrussian.c

  * for inclusion.
  */
 
-static inline int _mzd_gauss_submatrix_full(packedmatrix *A, size_t r, size_t c, size_t end_row, int k) {
+static inline int _mzd_gauss_submatrix_full(mzd_t *A, size_t r, size_t c, size_t end_row, int k) {
   assert(k<=RADIX);
   size_t i,j,l;
   size_t start_row = r;
  * for inclusion.
  */
 
-static inline int _mzd_gauss_submatrix(packedmatrix *A, size_t r, size_t c, size_t end_row, int k) {
+static inline int _mzd_gauss_submatrix(mzd_t *A, size_t r, size_t c, size_t end_row, int k) {
   size_t i,j,l;
   size_t start_row = r;
   int found;
  * for inclusion.
  */
 
-static inline int _mzd_gauss_submatrix_top(packedmatrix *A, size_t r, size_t c, int k) {
+static inline int _mzd_gauss_submatrix_top(mzd_t *A, size_t r, size_t c, int k) {
   size_t j,l;
   size_t start_row = r;
   for (j=c; j<c+k; j++) {
   return k;
 }
 
-static inline void _mzd_copy_back_rows(packedmatrix *A, packedmatrix *U, size_t r, size_t c, size_t k) {
+static inline void _mzd_copy_back_rows(mzd_t *A, mzd_t *U, size_t r, size_t c, size_t k) {
   size_t startblock = c/RADIX;
   size_t width = A->width - startblock;
   size_t i, j;
   for (i=0 ; i < k ; i++) {
-    const word * const src = U->values + U->rowswap[i] + startblock;
-    word *const dst = A->values + A->rowswap[r+i] + startblock;
+    const word * const src = U->rows[i] + startblock;
+    word *const dst = A->rows[r+i] + startblock;
     for (j=0; j< width; j++) {
       dst[j] = src[j];
     }
   }
 }
 
-void mzd_make_table( packedmatrix *M, size_t r, size_t c, int k, packedmatrix *T, size_t *L) {
+void mzd_make_table( mzd_t *M, size_t r, size_t c, int k, mzd_t *T, size_t *L) {
+  assert(T->blocks[1].size == 0);
   const size_t homeblock= c/RADIX;
   size_t i, j, rowneeded, id;
   size_t twokay= TWOPOW(k);
 
   word *ti, *ti1, *m;
 
-  ti1 = T->values + homeblock;
+  ti1 = T->rows[0] + homeblock;
   ti = ti1 + T->width;
 #ifdef HAVE_SSE2
   unsigned long incw = 0;
       ti+=incw; ti1+=incw;
 #endif
     } else {
-      m = M->values + M->rowswap[rowneeded] + homeblock;
+      m = M->rows[rowneeded] + homeblock;
 
       /* Duff's device loop unrolling */
       register int n = (wide + 7) / 8;
   }
 }
 
-void mzd_process_rows(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T, size_t *L) {
+void mzd_process_rows(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T, size_t *L) {
   size_t r;
   const size_t blocknum=startcol/RADIX;
   size_t wide = M->width - blocknum;
     word bm = ONE << ((RADIX - startcol - 1) % RADIX);
 
     for (r=startrow; r+2<=stoprow; r+=2) {
-      word *t = T->values + T->rowswap[1] + blocknum;
-      word *m0 = M->values + M->rowswap[r+0] + blocknum;
-      word *m1 = M->values + M->rowswap[r+1] + blocknum;
+      word *t  = T->rows[1] + blocknum;
+      word *m0 = M->rows[r+0] + blocknum;
+      word *m1 = M->rows[r+1] + blocknum;
       register int n = (wide + 7) / 8;
 
       if(*m0 & bm) {
 
     for( ; r<stoprow; r++) {
       const int x0 = L[ (int)mzd_read_bits(M, r, startcol, k) ];
-      word *m0 = M->values + M->rowswap[r] + blocknum;
-      word *t0 = T->values + T->rowswap[x0] + blocknum;
+      word *m0 = M->rows[r] + blocknum;
+      word *t0 = T->rows[x0] + blocknum;
       
       register int n = (wide + 7) / 8;
       switch (wide % 8) {
     const int x0 = L[ (int)mzd_read_bits(M, r+0, startcol, k) ];
     const int x1 = L[ (int)mzd_read_bits(M, r+1, startcol, k) ];
     
-    word *m0 = M->values + M->rowswap[r+0] + blocknum;
-    word *t0 = T->values + T->rowswap[x0] + blocknum;
+    word *m0 = M->rows[r+0] + blocknum;
+    word *t0 = T->rows[x0] + blocknum;
 
-    word *m1 = M->values + M->rowswap[r+1] + blocknum;
-    word *t1 = T->values + T->rowswap[x1] + blocknum;
+    word *m1 = M->rows[r+1] + blocknum;
+    word *t1 = T->rows[x1] + blocknum;
 
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
 
   for( ; r<stoprow; r++) {
     const int x0 = L[ (int)mzd_read_bits(M, r, startcol, k) ];
-    word *m0 = M->values + M->rowswap[r] + blocknum;
-    word *t0 = T->values + T->rowswap[x0] + blocknum;
+    word *m0 = M->rows[r] + blocknum;
+    word *t0 = T->rows[x0] + blocknum;
 
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
   }
 }
 
-void mzd_process_rows2(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1) {
+void mzd_process_rows2(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1) {
   size_t r;
   const size_t blocknum=startcol/RADIX;
   const size_t wide = M->width - blocknum;
     const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)];
     if(x0 == 0 && x1 == 0)
       continue;
-    word * m0 = M->values + M->rowswap[r] + blocknum;
-    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
-    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
+    word * m0 = M->rows[r] + blocknum;
+    const word *t0 = T0->rows[x0] + blocknum;
+    const word *t1 = T1->rows[x1] + blocknum;
 
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
   }
 }
 
-void mzd_process_rows3(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2) {
+void mzd_process_rows3(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1, mzd_t *T2, size_t *L2) {
   size_t r;
   const size_t blocknum=startcol/RADIX;
   const size_t wide = M->width - blocknum;
     if(x0 == 0 && x1 == 0 && x2 == 0) 
       continue;
 
-    word * m0 = M->values + M->rowswap[r] + blocknum;
-    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
-    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
-    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
+    word * m0 = M->rows[r] + blocknum;
+    const word *t0 = T0->rows[x0] + blocknum;
+    const word *t1 = T1->rows[x1] + blocknum;
+    const word *t2 = T2->rows[x2] + blocknum;
 
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
   }
 }
 
-void mzd_process_rows4(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
-                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3) {
+void mzd_process_rows4(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
+                       mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1, mzd_t *T2, size_t *L2, mzd_t *T3, size_t *L3) {
   size_t r;
   const size_t blocknum=startcol/RADIX;
   const size_t wide = M->width - blocknum;
     if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0) 
       continue;
 
-    word * m0 = M->values + M->rowswap[r] + blocknum;
-    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
-    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
-    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
-    const word *t3 = T3->values + T3->rowswap[x3] + blocknum;
+    word * m0 = M->rows[r] + blocknum;
+    const word *t0 = T0->rows[x0] + blocknum;
+    const word *t1 = T1->rows[x1] + blocknum;
+    const word *t2 = T2->rows[x2] + blocknum;
+    const word *t3 = T3->rows[x3] + blocknum;
     
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
   }
 }
 
-void mzd_process_rows5(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
-                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3,
-                       packedmatrix *T4, size_t *L4) {
+void mzd_process_rows5(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
+                       mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1, mzd_t *T2, size_t *L2, mzd_t *T3, size_t *L3,
+                       mzd_t *T4, size_t *L4) {
   size_t r;
   const size_t blocknum=startcol/RADIX;
   const size_t wide = M->width - blocknum;
     if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0) 
       continue;
 
-    word * m0 = M->values + M->rowswap[r] + blocknum;
-    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
-    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
-    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
-    const word *t3 = T3->values + T3->rowswap[x3] + blocknum;
-    const word *t4 = T4->values + T4->rowswap[x4] + blocknum;
+    word * m0 = M->rows[r] + blocknum;
+    const word *t0 = T0->rows[x0] + blocknum;
+    const word *t1 = T1->rows[x1] + blocknum;
+    const word *t2 = T2->rows[x2] + blocknum;
+    const word *t3 = T3->rows[x3] + blocknum;
+    const word *t4 = T4->rows[x4] + blocknum;
     
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
   }
 }
 
-void mzd_process_rows6(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
-                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3,
-                       packedmatrix *T4, size_t *L4, packedmatrix *T5, size_t *L5) {
+void mzd_process_rows6(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
+                       mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1, mzd_t *T2, size_t *L2, mzd_t *T3, size_t *L3,
+                       mzd_t *T4, size_t *L4, mzd_t *T5, size_t *L5) {
   size_t r;
   const size_t blocknum=startcol/RADIX;
   const size_t wide = M->width - blocknum;
     if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0) 
       continue;
 
-    word * m0 = M->values + M->rowswap[r] + blocknum;
-    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
-    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
-    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
-    const word *t3 = T3->values + T3->rowswap[x3] + blocknum;
-    const word *t4 = T4->values + T4->rowswap[x4] + blocknum;
-    const word *t5 = T5->values + T5->rowswap[x5] + blocknum;
+    word * m0 = M->rows[r] + blocknum;
+    const word *t0 = T0->rows[x0] + blocknum;
+    const word *t1 = T1->rows[x1] + blocknum;
+    const word *t2 = T2->rows[x2] + blocknum;
+    const word *t3 = T3->rows[x3] + blocknum;
+    const word *t4 = T4->rows[x4] + blocknum;
+    const word *t5 = T5->rows[x5] + blocknum;
 
     register int n = (wide + 7) / 8;
     switch (wide % 8) {
   }
 }
 
-size_t mzd_echelonize_m4ri(packedmatrix *A, int full, int k) {
+size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {
   /**
    * \par General algorithm
    * \li Step 1.Denote the first column to be processed in a given
   /*printf("k: %d\n",k);*/
   int kk = 6*k;
 
-  packedmatrix *U  = mzd_init(kk, A->ncols);
-  packedmatrix *T0 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T1 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T2 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T3 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T4 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T5 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *U  = mzd_init(kk, A->ncols);
+  mzd_t *T0 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T1 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T2 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T3 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T4 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T5 = mzd_init(TWOPOW(k), A->ncols);
   size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   return r;
 }
 
-void mzd_top_echelonize_m4ri(packedmatrix *A, int k) {
+void mzd_top_echelonize_m4ri(mzd_t *A, int k) {
   const size_t ncols = A->ncols; 
   size_t r = 0;
   size_t c = 0;
   }
   int kk = 4*k;
 
-  packedmatrix *T0 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T1 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T2 = mzd_init(TWOPOW(k), A->ncols);
-  packedmatrix *T3 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T0 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T1 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T2 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T3 = mzd_init(TWOPOW(k), A->ncols);
   size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   m4ri_mm_free(L3);
 }
 
-packedmatrix *mzd_invert_m4ri(packedmatrix *m, packedmatrix *I, int k) {
-  packedmatrix *big = mzd_concat(NULL, m, I);
+mzd_t *mzd_invert_m4ri(mzd_t *m, mzd_t *I, int k) {
+  mzd_t *big = mzd_concat(NULL, m, I);
   size_t size=m->ncols;
   if (k == 0) {
     k = m4ri_opt_k(m->nrows, m->ncols, 0);
   }
   size_t twokay=TWOPOW(k);
   size_t i;
-  packedmatrix *T=mzd_init(twokay, size*2);
+  mzd_t *T=mzd_init(twokay, size*2);
   size_t *L=(size_t *)m4ri_mm_malloc(twokay * sizeof(size_t));
-  packedmatrix *answer;
+  mzd_t *answer;
   
   mzd_echelonize_m4ri(big, TRUE, k);
   
   return answer;
 }
 
-packedmatrix *mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) {
+mzd_t *mzd_mul_m4rm(mzd_t *C, mzd_t *A, mzd_t *B, int k) {
   size_t a = A->nrows;
   size_t c = B->ncols;
 
   return _mzd_mul_m4rm(C, A, B, k, TRUE);
 }
 
-packedmatrix *mzd_addmul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) {
+mzd_t *mzd_addmul_m4rm(mzd_t *C, mzd_t *A, mzd_t *B, int k) {
   size_t a = A->nrows;
   size_t c = B->ncols;
 
 #define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide)
 #endif //M4RM_GRAY8
 
-packedmatrix *_mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k, int clear) {
+mzd_t *_mzd_mul_m4rm(mzd_t *C, mzd_t *A, mzd_t *B, int k, int clear) {
   /**
    * The algorithm proceeds as follows:
    * 
   size_t *buffer = (size_t*)m4ri_mm_malloc(8 * TWOPOW(k) * sizeof(size_t));
 #endif
 
-  packedmatrix *T1 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T1 = mzd_init(TWOPOW(k), b_nc);
   size_t *L1 = buffer;
-  packedmatrix *T2 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T2 = mzd_init(TWOPOW(k), b_nc);
   size_t *L2 = buffer + 1*TWOPOW(k);
-  packedmatrix *T3 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T3 = mzd_init(TWOPOW(k), b_nc);
   size_t *L3 = buffer + 2*TWOPOW(k);
-  packedmatrix *T4 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T4 = mzd_init(TWOPOW(k), b_nc);
   size_t *L4 = buffer + 3*TWOPOW(k);
 
 #ifdef M4RM_GRAY8
-  packedmatrix *T5 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T5 = mzd_init(TWOPOW(k), b_nc);
   size_t *L5 = buffer + 4*TWOPOW(k);
-  packedmatrix *T6 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T6 = mzd_init(TWOPOW(k), b_nc);
   size_t *L6 = buffer + 5*TWOPOW(k);
-  packedmatrix *T7 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T7 = mzd_init(TWOPOW(k), b_nc);
   size_t *L7 = buffer + 6*TWOPOW(k);
-  packedmatrix *T8 = mzd_init(TWOPOW(k), b_nc);
+  mzd_t *T8 = mzd_init(TWOPOW(k), b_nc);
   size_t *L8 = buffer + 7*TWOPOW(k);
 #endif
 
         x7 = L7[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k, k) ];
         x8 = L8[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k+k, k) ];
 #endif
-        c = C->values + C->rowswap[j];
-        t1 = T1->values + T1->rowswap[x1];
-        t2 = T2->values + T2->rowswap[x2];
-        t3 = T3->values + T3->rowswap[x3];
-        t4 = T4->values + T4->rowswap[x4];
+        c = C->rows[j];
+        t1 = T1->rows[x1];
+        t2 = T2->rows[x2];
+        t3 = T3->rows[x3];
+        t4 = T4->rows[x4];
 #ifdef M4RM_GRAY8
-        t5 = T5->values + T5->rowswap[x5];
-        t6 = T6->values + T6->rowswap[x6];
-        t7 = T7->values + T7->rowswap[x7];
-        t8 = T8->values + T8->rowswap[x8];
+        t5 = T5->rows[x5];
+        t6 = T6->rows[x6];
+        t7 = T7->rows[x7];
+        t8 = T8->rows[x8];
 #endif
         _MZD_COMBINE;
       }
       x7 = L7[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k, k) ];
       x8 = L8[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k+k, k) ];
 #endif
-      c = C->values + C->rowswap[j];
-      t1 = T1->values + T1->rowswap[x1];
-      t2 = T2->values + T2->rowswap[x2];
-      t3 = T3->values + T3->rowswap[x3];
-      t4 = T4->values + T4->rowswap[x4];
+      c = C->rows[j];
+      t1 = T1->rows[x1];
+      t2 = T2->rows[x2];
+      t3 = T3->rows[x3];
+      t4 = T4->rows[x4];
 #ifdef M4RM_GRAY8
-      t5 = T5->values + T5->rowswap[x5];
-      t6 = T6->values + T6->rowswap[x6];
-      t7 = T7->values + T7->rowswap[x7];
-      t8 = T8->values + T8->rowswap[x8];
+      t5 = T5->rows[x5];
+      t6 = T6->rows[x6];
+      t7 = T7->rows[x7];
+      t8 = T8->rows[x8];
 #endif
       _MZD_COMBINE;
     }
       mzd_make_table( B, i*k, 0, k, T1, L1);
       for(j = 0; j<a_nr; j++) {
         x1 = L1[ (int)mzd_read_bits(A, j, i*k, k) ];
-        c = C->values + C->rowswap[j];
-        t1 = T1->values + T1->rowswap[x1];
+        c = C->rows[j];
+        t1 = T1->rows[x1];
         for(ii=0; ii<wide; ii++) {
           c[ii] ^= t1[ii];
         }
       mzd_make_table( B, a_nc/k * k , 0, a_nc%k, T1, L1);
       for(j = 0; j<a_nr; j++) {
         x1 = L1[ (int)mzd_read_bits(A, j, i*k, a_nc%k) ];
-        c = C->values + C->rowswap[j];
-        t1 = T1->values + T1->rowswap[x1];
+        c = C->rows[j];
+        t1 = T1->rows[x1];
         for(ii=0; ii<wide; ii++) {
           c[ii] ^= t1[ii];
         }

File src/brilliantrussian.h

  * \wordoffset
  */
 
-void mzd_make_table( packedmatrix *M, size_t r, size_t c, int k, packedmatrix *T, size_t *L);
+void mzd_make_table( mzd_t *M, size_t r, size_t c, int k, mzd_t *T, size_t *L);
 
 /**
  * \brief The function looks up k bits from position i,startcol in
  * \wordoffset
  */
 
-void mzd_process_rows(packedmatrix *M, size_t startrow, size_t endrow, size_t startcol, int k, packedmatrix *T, size_t *L);
+void mzd_process_rows(mzd_t *M, size_t startrow, size_t endrow, size_t startcol, int k, mzd_t *T, size_t *L);
 
 /**
  * \brief Same as mzd_process_rows but works with two Gray code tables
  * \wordoffset
  */
 
-void mzd_process_rows2(packedmatrix *M, size_t startrow, size_t endrow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1);
+void mzd_process_rows2(mzd_t *M, size_t startrow, size_t endrow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1);
 
 /**
  * \brief Same as mzd_process_rows but works with three Gray code tables
  * \wordoffset
  */
 
-void mzd_process_rows3(packedmatrix *M, size_t startrow, size_t endrow, size_t startcol, int k, 
-                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1,
-                       packedmatrix *T2, size_t *L2);
+void mzd_process_rows3(mzd_t *M, size_t startrow, size_t endrow, size_t startcol, int k, 
+                       mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1,
+                       mzd_t *T2, size_t *L2);
 
 /**
  * \brief Same as mzd_process_rows but works with four Gray code tables
  * \wordoffset
  */
 
-void mzd_process_rows4(packedmatrix *M, size_t startrow, size_t endrow, size_t startcol, int k,
-                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1,
-                       packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3);
+void mzd_process_rows4(mzd_t *M, size_t startrow, size_t endrow, size_t startcol, int k,
+                       mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1,
+                       mzd_t *T2, size_t *L2, mzd_t *T3, size_t *L3);
 
 /**
  * \brief Matrix elimination using the 'Method of the Four Russians'
  * \return Rank of A.
  */
 
-size_t mzd_echelonize_m4ri(packedmatrix *M, int full, int k);
+size_t mzd_echelonize_m4ri(mzd_t *M, int full, int k);
 
 /**
  * \brief Given a matrix in upper triangular form compute the reduced row
  * \note This function isn't as optimized as it should be.
  */
 
-void mzd_top_echelonize_m4ri(packedmatrix *M, int k);
+void mzd_top_echelonize_m4ri(mzd_t *M, int k);
 
 /**
  * \brief Invert the matrix M using Konrod's method. 
  * must be free'd using mzd_free() once not needed anymore.
  */
 
-packedmatrix *mzd_invert_m4ri(packedmatrix *M, packedmatrix *I, int k);
+mzd_t *mzd_invert_m4ri(mzd_t *M, mzd_t *I, int k);
 
 /**
  * \brief Matrix multiplication using Konrod's method, i.e. compute C
  * \return Pointer to C.
  */
 
-packedmatrix *mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k);
+mzd_t *mzd_mul_m4rm(mzd_t *C, mzd_t *A, mzd_t *B, int k);
 
 
 /**
  * \return Pointer to C.
  */
 
-packedmatrix *mzd_addmul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k);
+mzd_t *mzd_addmul_m4rm(mzd_t *C, mzd_t *A, mzd_t *B, int k);
 
 /**
  * \brief Matrix multiplication using Konrod's method, i.e. compute C such
  * \return Pointer to C.
  */
 
-packedmatrix *_mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k, int clear);
+mzd_t *_mzd_mul_m4rm(mzd_t *C, mzd_t *A, mzd_t *B, int k, int clear);
 
 /**
  * \brief If defined 8 Gray code tables are used in parallel.
 #include "strassen.h"
 #include "lqup.h"
 
-size_t mzd_pluq (packedmatrix *A, permutation * P, permutation * Q, const int cutoff) {
+size_t mzd_pluq (mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
   if (P->length != A->nrows)
     m4ri_die("mzd_pluq: Permutation P length (%d) must match A nrows (%d)\n",P->length, A->nrows);
   if (Q->length != A->ncols)
   return _mzd_pluq(A, P, Q, cutoff);
 }
 
-size_t _mzd_pluq(packedmatrix *A, permutation * P, permutation * Q, const int cutoff) {
+size_t _mzd_pluq(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
   size_t nrows = A->nrows;
   size_t ncols = A->ncols;
 
 
     size_t i, j;
     size_t n1 = (((ncols - 1) / RADIX + 1) >> 1) * RADIX;
-    packedmatrix *A0  = mzd_init_window(A,  0,  0, nrows,    n1);
-    packedmatrix *A1  = mzd_init_window(A,  0, n1, nrows, ncols);
+    mzd_t *A0  = mzd_init_window(A,  0,  0, nrows,    n1);
+    mzd_t *A1  = mzd_init_window(A,  0, n1, nrows, ncols);
 
     size_t r1, r2;
     /* First recursive call */
      *   ------------------------------------------
      */
 
-    packedmatrix *A00 = mzd_init_window(A,  0,  0, r1, r1);
-    packedmatrix *A10 = mzd_init_window(A, r1,  0, nrows, r1);
-    packedmatrix *A01 = mzd_init_window(A,  0, n1, r1, ncols);
-    packedmatrix *A11 = mzd_init_window(A, r1, n1, nrows, ncols);
+    mzd_t *A00 = mzd_init_window(A,  0,  0, r1, r1);
+    mzd_t *A10 = mzd_init_window(A, r1,  0, nrows, r1);
+    mzd_t *A01 = mzd_init_window(A,  0, n1, r1, ncols);
+    mzd_t *A11 = mzd_init_window(A, r1, n1, nrows, ncols);
 
     if (r1) {
       /* Computation of the Schur complement */
     }
 
     /* Second recursive call */
-    permutation *P2 = mzp_init_window(P, r1, nrows);
-    permutation *Q2 = mzp_init_window(Q, n1, ncols);
+    mzp_t *P2 = mzp_init_window(P, r1, nrows);
+    mzp_t *Q2 = mzp_init_window(Q, n1, ncols);
 
     r2 = _mzd_pluq(A11, P2, Q2, cutoff);
 
      * // Update Q
      * for (i = 0; i < ncols - n1; ++i)
      *  Q2->values[i] += n1;
-     * permutation * Q2b = mzp_init_window(Q, r1, ncols);
-     * packedmatrix* A01b = mzd_init_window (A, 0, r1, r1, ncols);
-     * packedmatrix* A11b = mzd_init_window (A, r1, r1, nrows, ncols);
+     * mzp_t * Q2b = mzp_init_window(Q, r1, ncols);
+     * mzd_t* A01b = mzd_init_window (A, 0, r1, r1, ncols);
+     * mzd_t* A11b = mzd_init_window (A, r1, r1, nrows, ncols);
      * mzd_col_block_rotate (A01b, 0, n1-r1, n1 - r1 + r2, 1);
      * mzd_col_block_rotate (A11b, 0, n1-r1, n1 - r1 + r2, 1);
      * // Update Q
     mzd_apply_p_right_trans(A11, Q2);
 
 
-    permutation *tmp = mzp_init(A->ncols);
+    mzp_t *tmp = mzp_init(A->ncols);
     for(i=0, j=n1; j<n1+r2; i++, j++) {
       //mzd_col_swap(A, r1 + i, n1 + Q2->values[i]);
       tmp->values[r1+i] = Q2->values[i] + n1;
 }
 
 
-size_t _mzd_pluq_naive(packedmatrix *A, permutation *P, permutation *Q)  {
+size_t _mzd_pluq_naive(mzd_t *A, mzp_t *P, mzp_t *Q)  {
   size_t i, j, l, curr_pos;
   int found;
 
   return curr_pos;
 }
  
-size_t mzd_echelonize_pluq(packedmatrix *A, int full) {
-  permutation *P = mzp_init(A->nrows);
-  permutation *Q = mzp_init(A->ncols);
+size_t mzd_echelonize_pluq(mzd_t *A, int full) {
+  mzp_t *P = mzp_init(A->nrows);
+  mzp_t *Q = mzp_init(A->ncols);
 
   size_t r = mzd_pluq(A, P, Q, 0);
 
   if(full) {
-    packedmatrix *U = mzd_init_window(A, 0, 0, r, r);
-    packedmatrix *B = mzd_init_window(A, 0, r, r, A->ncols);
+    mzd_t *U = mzd_init_window(A, 0, 0, r, r);
+    mzd_t *B = mzd_init_window(A, 0, r, r, A->ncols);
     if(r!=A->ncols) 
       mzd_trsm_upper_left(U, B, 0);
     if(r!=0) 
   }
   
   if(r!=A->nrows) {
-    packedmatrix *R = mzd_init_window(A, r, 0, A->nrows, A->ncols);
+    mzd_t *R = mzd_init_window(A, r, 0, A->nrows, A->ncols);
     mzd_set_ui(R, 0);
     mzd_free_window(R);
   }
  * \return Rank of A.
  */
 
-size_t mzd_pluq(packedmatrix *A, permutation *P, permutation * Q, const int cutoff);
+size_t mzd_pluq(mzd_t *A, mzp_t *P, mzp_t * Q, const int cutoff);
 
 /**
  * \brief PLUQ matrix decomposition.
  * See mzd_pluq() for details.
  *
  * \param A Input matrix
- * \param P Output row permutation matrix
- * \param Q Output column permutation matrix
+ * \param P Output row mzp_t matrix
+ * \param Q Output column mzp_t matrix
  * \param cutoff Minimal dimension for Strassen recursion.
  *
  * \sa mzd_pluq()
  * \return Rank of A.
  */
 
-size_t _mzd_pluq(packedmatrix *A, permutation * P, permutation * Q, const int cutoff);
+size_t _mzd_pluq(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff);
 
 /**
  * \brief PLUQ matrix decomposition (naive base case).
  * See mzd_pluq() for details.
  * 
  * \param A Input matrix
- * \param P Output row permutation matrix
- * \param Q Output column permutation matrix
+ * \param P Output row mzp_t matrix
+ * \param Q Output column mzp_t matrix
  *
  * \sa mzd_pluq()
  *
  * \return Rank of A.
  */
 
-size_t _mzd_pluq_naive(packedmatrix *A, permutation * P, permutation * Q);
+size_t _mzd_pluq_naive(mzd_t *A, mzp_t * P, mzp_t * Q);
 
 /**
  * \brief (Reduced) row echelon form using PLUQ factorisation.
  */
 
 
-size_t mzd_echelonize_pluq(packedmatrix *A, int full);
+size_t mzd_echelonize_pluq(mzd_t *A, int full);
 
 #endif
 
 
 /* blocks of memory we like to keep around for later re-use */
-mm_block m4ri_mmc_cache[M4RI_MMC_NBLOCKS];
+mmb_t m4ri_mmc_cache[M4RI_MMC_NBLOCKS];
 
 void m4ri_die(const char *errormessage, ...) {
   va_list lst;
 }
 
 /**
+ * \brief Maximum number of bytes allocated in one malloc() call.
+ */
+
+#define MM_MAX_MALLOC ((1ULL)<<31)
+
+/**
  * \brief Enable memory block cache (default: disabled)
  */
 //#define ENABLE_MMC
 
+
 /**
  * \brief Number of blocks that are cached.
  */
    */
   void *data;
 
-} mm_block;
+} mmb_t;
 
 /**
  * The actual memory block cache.
  */
 
-extern mm_block m4ri_mmc_cache[M4RI_MMC_NBLOCKS];
+extern mmb_t m4ri_mmc_cache[M4RI_MMC_NBLOCKS];
 
 /**
  * \brief Return handle for locale memory management cache.
  * \attention Not thread safe.
  */
 
-static inline mm_block *m4ri_mmc_handle(void) {
+static inline mmb_t *m4ri_mmc_handle(void) {
   return m4ri_mmc_cache;
 }
 
 
 static inline void *m4ri_mmc_malloc(size_t size) {
 #ifdef ENABLE_MMC
-  mm_block *mm = m4ri_mmc_handle();
+  mmb_t *mm = m4ri_mmc_handle();
   if (size <= M4RI_MMC_THRESHOLD) {
     size_t i;
     for (i=0; i<M4RI_MMC_NBLOCKS; i++) {
 static inline void m4ri_mmc_free(void *condemned, size_t size) {
 #ifdef ENABLE_MMC
   static size_t j = 0;
-  mm_block *mm = m4ri_mmc_handle();
+  mmb_t *mm = m4ri_mmc_handle();
   if (size < M4RI_MMC_THRESHOLD) {
     size_t i;
     for(i=0; i<M4RI_MMC_NBLOCKS; i++) {
  */
 
 static inline void m4ri_mmc_cleanup(void) {
-  mm_block *mm = m4ri_mmc_handle();
+  mmb_t *mm = m4ri_mmc_handle();
   size_t i;
   for(i=0; i < M4RI_MMC_NBLOCKS; i++) {
     if (mm[i].size)
   }
 }
 
-
 #endif //MISC_H

File src/packedmatrix.c

 
 #define SAFECHAR (int)(RADIX+RADIX/3)
 
-packedmatrix *mzd_init(size_t r, size_t c) {
-  packedmatrix *newmatrix;
-  size_t i;
+mzd_t *mzd_init(size_t r, size_t c) {
+  mzd_t *A;
+  size_t i,j;
 
 #ifdef HAVE_OPENMP
 #pragma omp critical
 {
 #endif
-  newmatrix=(packedmatrix *)m4ri_mmc_malloc(sizeof(packedmatrix));
+  A=(mzd_t *)m4ri_mmc_malloc(sizeof(mzd_t));
 #ifdef HAVE_OPENMP
  }
 #endif
 
-  newmatrix->width=DIV_CEIL(c,RADIX);
+  A->width=DIV_CEIL(c,RADIX);
 
 #ifdef HAVE_SSE2
   int incw = 0;
   /* make sure each row is 16-byte aligned */
-  if (newmatrix->width & 1) {
-    newmatrix->width++;
+  if (A->width & 1) {
+    A->width++;
     incw = 1;
   }
 #endif
 
-  newmatrix->ncols=c;
-  newmatrix->nrows=r;
-  newmatrix->offset = 0;
-#ifdef HAVE_OPENMP
-#pragma omp critical
-{
-#endif
-  newmatrix->values=(word *)m4ri_mmc_calloc( (newmatrix->width)*r, sizeof(word) );
-#ifdef HAVE_OPENMP
- }
-#endif
+  A->ncols=c;
+  A->nrows=r;
+  A->offset = 0;
 
 #ifdef HAVE_OPENMP
 #pragma omp critical
 {
 #endif
-  newmatrix->rowswap=(size_t *)m4ri_mmc_malloc( r * sizeof(size_t) );
+  A->rows=(word **)m4ri_mmc_malloc( r * sizeof(word*) );
 #ifdef HAVE_OPENMP
  }
 #endif
 
-  /* Rowswap does not contain the rowswap index i but the correct
-   * offset in the values table. Rowswap is exclusively used to access
-   * elements in that table and this speeds up computation a
-   * little.
-   */
 
-  for (i=0; i<r; i++) { 
-    newmatrix->rowswap[i]=i*(newmatrix->width); 
-  }
-
-#ifdef HAVE_SSE2
-  if (incw) {
-    newmatrix->width--;
-  }
-#endif
-
-  return newmatrix;
-}
-
-packedmatrix *mzd_init_window (const packedmatrix *m, size_t lowr, size_t lowc, size_t highr, size_t highc) {
-  size_t nrows, ncols, i, offset; 
-  packedmatrix *window;
 #ifdef HAVE_OPENMP
 #pragma omp critical
 {
 #endif
-  window = (packedmatrix *)m4ri_mmc_malloc(sizeof(packedmatrix));
+
+  if(A->width) {
+    /* we allow more than one malloc call so he have to be a bit clever
+       here */
+    
+    const size_t bytes_per_row = A->width*sizeof(word);
+    const size_t max_rows_per_block = MM_MAX_MALLOC/bytes_per_row;
+    assert(max_rows_per_block);
+    size_t rest = r % max_rows_per_block;
+    
+    size_t nblocks = (rest == 0) ? r / max_rows_per_block : r / max_rows_per_block + 1;
+    A->blocks = m4ri_mmc_calloc(nblocks + 1, sizeof(mmb_t));
+    for(i=0; i<nblocks-1; i++) {
+      A->blocks[i].size = MM_MAX_MALLOC;
+      A->blocks[i].data = m4ri_mmc_calloc(MM_MAX_MALLOC,1);
+      for(j=0; j<max_rows_per_block; j++) 
+        A->rows[max_rows_per_block*i + j] = ((word*)A->blocks[i].data) + j*A->width;
+    }
+    if(rest==0)
+      rest = max_rows_per_block;
+
+    A->blocks[nblocks-1].size = rest * bytes_per_row;
+    A->blocks[nblocks-1].data = m4ri_mmc_calloc(rest, bytes_per_row);
+    for(j=0; j<rest; j++) {
+      A->rows[max_rows_per_block*(nblocks-1) + j] = (word*)(A->blocks[nblocks-1].data) + j*A->width;
+    }
+  }
+#ifdef HAVE_OPENMP
+ }
+#endif
+
+
+#ifdef HAVE_SSE2
+  if (incw) {
+    A->width--;
+  }
+#endif
+
+  return A;
+}
+
+mzd_t *mzd_init_window (const mzd_t *m, size_t lowr, size_t lowc, size_t highr, size_t highc) {
+  size_t nrows, ncols, i, offset; 
+  mzd_t *window;
+#ifdef HAVE_OPENMP
+#pragma omp critical
+{
+#endif
+  window = (mzd_t *)m4ri_mmc_malloc(sizeof(mzd_t));
 #ifdef HAVE_OPENMP
 }
 #endif
   window->width = (window->offset + ncols) / RADIX;
   if ((window->offset + ncols) % RADIX)
     window->width++;
-  window->values = m->values;
+  window->blocks = NULL;
+  window->blocksizes = NULL;
 
 #ifdef HAVE_OPENMP
 #pragma omp critical
 {
 #endif
-  window->rowswap = (size_t *)m4ri_mmc_malloc( nrows * sizeof(size_t));
+  window->rows = (word **)m4ri_mmc_malloc( nrows * sizeof(word*));
 #ifdef HAVE_OPENMP
 }
 #endif
   for(i=0; i<nrows; i++) {
-    window->rowswap[i] = m->rowswap[lowr + i] + offset;
+    window->rows[i] = m->rows[lowr + i] + offset;
   }
   
   return window;
 }
 
 
-void mzd_free( packedmatrix *condemned) {
+void mzd_free( mzd_t *A) {
 #ifdef HAVE_OPENMP
 #pragma omp critical
 {
 #endif
-  m4ri_mmc_free(condemned->values, condemned->width*condemned->nrows*sizeof(word));
-  m4ri_mmc_free(condemned->rowswap, condemned->nrows * sizeof(size_t));
-  m4ri_mmc_free(condemned, sizeof(packedmatrix));
+  m4ri_mmc_free(A->rows, A->nrows * sizeof(word*));
+  if(A->width && A->blocks) {
+    size_t i;
+    for(i=0; A->blocks[i].size; i++) {
+      m4ri_mmc_free(A->blocks[i].data, A->blocks[i].size);
+    }
+    m4ri_mmc_free(A->blocks, (i+1) * sizeof(mmb_t));
+  }
+  m4ri_mmc_free(A, sizeof(mzd_t));
 #ifdef HAVE_OPENMP
 }
 #endif
 }
 
-void mzd_free_window( packedmatrix *condemned) {
+void mzd_free_window( mzd_t *A) {
 #ifdef HAVE_OPENMP
 #pragma omp critical
 {
 #endif
-  m4ri_mmc_free(condemned->rowswap, condemned->nrows * sizeof(size_t));
-  m4ri_mmc_free(condemned, sizeof(packedmatrix));
+  m4ri_mmc_free(A->rows, A->nrows * sizeof(word*));
+  m4ri_mmc_free(A, sizeof(mzd_t));
 #ifdef HAVE_OPENMP
 }
 #endif
 }
 
-void mzd_print( const packedmatrix *M ) {
+void mzd_print( const mzd_t *M ) {
   size_t i, j, wide;
   char temp[SAFECHAR];
   word *row;
 
   for (i=0; i< M->nrows; i++ ) {
     printf("[");
-    row = M->values + M->rowswap[i];
+    row = M->rows[i];
     if(M->offset == 0) {
       for (j=0; j< M->width-1; j++) {
         m4ri_word_to_str(temp, row[j], 1);
   }
 }
 
-void mzd_print_tight( const packedmatrix *M ) {
+void mzd_print_tight( const mzd_t *M ) {
   assert(M->offset == 0);
 
   size_t i, j;
 
   for (i=0; i< M->nrows; i++ ) {
     printf("[");
-    row = M->values + M->rowswap[i];
+    row = M->rows[i];
     for (j=0; j< M->ncols/RADIX; j++) {
       m4ri_word_to_str(temp, row[j], 0);
       printf("%s", temp);
   }
 }
 
-void mzd_row_add( packedmatrix *m, size_t sourcerow, size_t destrow) {
+void mzd_row_add( mzd_t *m, size_t sourcerow, size_t destrow) {
   mzd_row_add_offset(m, destrow, sourcerow, 0);
 }
 
-int mzd_gauss_delayed(packedmatrix *M, size_t startcol, int full) {
+int mzd_gauss_delayed(mzd_t *M, size_t startcol, int full) {
   assert(M->offset == 0);
   size_t i,j;
   size_t start; 
   return pivots;
 }
 
-int mzd_echelonize_naive(packedmatrix *m, int full) { 
+int mzd_echelonize_naive(mzd_t *m, int full) { 
   return mzd_gauss_delayed(m, 0, full); 
 }
 
-static inline packedmatrix *_mzd_transpose_direct(packedmatrix *DST, const packedmatrix *A) {
+static inline mzd_t *_mzd_transpose_direct(mzd_t *DST, const mzd_t *A) {
   size_t i,j,k, eol;
   word *temp;
 
   }
 
   for (i=0; i<DST->nrows; i++) {
-    temp = DST->values + DST->rowswap[i];
+    temp = DST->rows[i];
     for (j=0; j < eol; j+=RADIX) {
       for (k=0; k<RADIX; k++) {
         *temp |= ((word)mzd_read_bit(A, j+k, i+A->offset))<<(RADIX-1-k);
   return DST;
 }
 
-static inline packedmatrix *_mzd_transpose(packedmatrix *DST, const packedmatrix *X) {
+static inline mzd_t *_mzd_transpose(mzd_t *DST, const mzd_t *X) {
   assert(X->offset == 0);
 
   const size_t nr = X->nrows;
   const size_t cutoff = 256; /* 256 seems optimal */
 
   if(nr <= cutoff || nc <= cutoff) {
-    packedmatrix *x = mzd_copy(NULL, X);
+    mzd_t *x = mzd_copy(NULL, X);
     _mzd_transpose_direct(DST, x);
     mzd_free(x);
     return DST;
   const size_t nr2 = RADIX*(X->nrows/(2*RADIX));
   const size_t nc2 = RADIX*(X->ncols/(2*RADIX));
 
-  packedmatrix *A = mzd_init_window(X,    0,   0, nr2, nc2);
-  packedmatrix *B = mzd_init_window(X,    0, nc2, nr2,  nc);
-  packedmatrix *C = mzd_init_window(X,  nr2,   0,  nr, nc2);
-  packedmatrix *D = mzd_init_window(X,  nr2, nc2,  nr,  nc);
+  mzd_t *A = mzd_init_window(X,    0,   0, nr2, nc2);
+  mzd_t *B = mzd_init_window(X,    0, nc2, nr2,  nc);
+  mzd_t *C = mzd_init_window(X,  nr2,   0,  nr, nc2);
+  mzd_t *D = mzd_init_window(X,  nr2, nc2,  nr,  nc);
 
-  packedmatrix *AT = mzd_init_window(DST,   0,   0, nc2, nr2);
-  packedmatrix *CT = mzd_init_window(DST,   0, nr2, nc2,  nr);
-  packedmatrix *BT = mzd_init_window(DST, nc2,   0,  nc, nr2);
-  packedmatrix *DT = mzd_init_window(DST, nc2, nr2,  nc,  nr);
+  mzd_t *AT = mzd_init_window(DST,   0,   0, nc2, nr2);
+  mzd_t *CT = mzd_init_window(DST,   0, nr2, nc2,  nr);
+  mzd_t *BT = mzd_init_window(DST, nc2,   0,  nc, nr2);
+  mzd_t *DT = mzd_init_window(DST, nc2, nr2,  nc,  nr);
 
   _mzd_transpose(AT, A);
   _mzd_transpose(BT, B);
   return DST;
 }
 
-packedmatrix *mzd_transpose(packedmatrix *DST, const packedmatrix *A) {
+mzd_t *mzd_transpose(mzd_t *DST, const mzd_t *A) {
   if (DST == NULL) {
     DST = mzd_init( A->ncols, A->nrows );
   } else {
     return _mzd_transpose(DST, A);
 }
 
-packedmatrix *mzd_mul_naive(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
+mzd_t *mzd_mul_naive(mzd_t *C, const mzd_t *A, const mzd_t *B) {
   if (C==NULL) {
     C=mzd_init(A->nrows, B->ncols);
   } else {
     }
   }
   if(B->ncols < RADIX-10) { /* this cutoff is rather arbitrary */
-    packedmatrix *BT = mzd_transpose(NULL, B);
+    mzd_t *BT = mzd_transpose(NULL, B);
     _mzd_mul_naive(C, A, BT, 1);
     mzd_free (BT);
   } else {
   return C;
 }
 
-packedmatrix *mzd_addmul_naive(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
+mzd_t *mzd_addmul_naive(mzd_t *C, const mzd_t *A, const mzd_t *B) {
   if (C->nrows != A->nrows || C->ncols != B->ncols) {
     m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n");
   }
 
   if(B->ncols < RADIX-10) { /* this cutoff is rather arbitrary */
-    packedmatrix *BT = mzd_transpose(NULL, B);
+    mzd_t *BT = mzd_transpose(NULL, B);
     _mzd_mul_naive(C, A, BT, 0);
     mzd_free (BT);
   } else {
   return C;
 }
 
-packedmatrix *_mzd_mul_naive(packedmatrix *C, const packedmatrix *A, const packedmatrix *B, const int clear) {
+mzd_t *_mzd_mul_naive(mzd_t *C, const mzd_t *A, const mzd_t *B, const int clear) {
   assert(A->offset == 0);
   assert(B->offset == 0);
   assert(C->offset == 0);
 
   if (clear) {
     for (i=0; i<C->nrows; i++) {
-      size_t truerow = C->rowswap[i];
       for (j=0; j<C->width-1; j++) {
-  	C->values[truerow + j] = 0;
+  	C->rows[i][j] = 0;
       }
-      C->values[truerow + j] &= ~LEFT_BITMASK(C->ncols);
+      C->rows[i][j] &= ~LEFT_BITMASK(C->ncols);
     }
   }
 
   size_t start;
   for (start = 0; start + blocksize <= C->nrows; start += blocksize) {
     for (i=start; i<start+blocksize; i++) {
-      a = A->values + A->rowswap[i];
-      c = C->values + C->rowswap[i];
+      a = A->rows[i];
+      c = C->rows[i];
       for (j=0; j<RADIX*eol; j+=RADIX) {
 	for (k=0; k<RADIX; k++) {
-          b = B->values + B->rowswap[j+k];
+          b = B->rows[j+k];
           parity[k] = a[0] & b[0];
           for (ii=wide-1; ii>=1; ii--)
 	    parity[k] ^= a[ii] & b[ii];
       
       if (eol != C->width) {
         for (k=0; k<(int)(C->ncols%RADIX); k++) {
-          b = B->values + B->rowswap[RADIX*eol+k];
+          b = B->rows[RADIX*eol+k];
           parity[k] = a[0] & b[0];
           for (ii=1; ii<A->width; ii++)
             parity[k] ^= a[ii] & b[ii];
   }
 
   for (i=C->nrows - (C->nrows%blocksize); i<C->nrows; i++) {
-    a = A->values + A->rowswap[i];
-    c = C->values + C->rowswap[i];
+    a = A->rows[i];
+    c = C->rows[i];
     for (j=0; j<RADIX*eol; j+=RADIX) {
       for (k=0; k<RADIX; k++) {
-        b = B->values + B->rowswap[j+k];
+        b = B->rows[j+k];
         parity[k] = a[0] & b[0];
         for (ii=wide-1; ii>=1; ii--)
           parity[k] ^= a[ii] & b[ii];
     
     if (eol != C->width) {
       for (k=0; k<(int)(C->ncols%RADIX); k++) {
-        b = B->values + B->rowswap[RADIX*eol+k];
+        b = B->rows[RADIX*eol+k];
         parity[k] = a[0] & b[0];
         for (ii=1; ii<A->width; ii++)
           parity[k] ^= a[ii] & b[ii];
   return C;
 }
 
-packedmatrix *_mzd_mul_va(packedmatrix *C, const packedmatrix *v, const packedmatrix *A, const int clear) {
+mzd_t *_mzd_mul_va(mzd_t *C, const mzd_t *v, const mzd_t *A, const int clear) {
   assert(C->offset == 0);
   assert(A->offset == 0);
   assert(v->offset == 0);
   return C;
 }
 
-void mzd_randomize(packedmatrix *A) {
+void mzd_randomize(mzd_t *A) {
   size_t i, j;
   assert(A->offset == 0);
 
   }
 }
 
-void mzd_set_ui( packedmatrix *A, unsigned int value) {
+void mzd_set_ui( mzd_t *A, unsigned int value) {
   size_t i,j;
   size_t stop = MIN(A->nrows, A->ncols);
 
     }
   } else {
     for (i=0; i<A->nrows; i++) {
-      size_t truerow = A->rowswap[i];
-      A->values[truerow] &= ~mask_begin;
+      word *row = A->rows[i];
+      row[0] &= ~mask_begin;
       for(j=1 ; j<A->width-1; j++)
-        A->values[truerow + j] = 0;
-      A->values[truerow + A->width - 1] &= ~mask_end;
+        row[j] = 0;
+      row[A->width - 1] &= ~mask_end;
     }
   }
 
   }
 }
 
-BIT mzd_equal(const packedmatrix *A, const packedmatrix *B) {
+BIT mzd_equal(const mzd_t *A, const mzd_t *B) {
   assert(A->offset == 0);
   assert(B->offset == 0);
 
 
   for (i=0; i< A->nrows; i++) {
     for (j=0; j< A->width; j++) {
-      if (A->values[A->rowswap[i] + j] != B->values[B->rowswap[i] + j])
+      if (A->rows[i][j] != B->rows[i][j])
 	return FALSE;
     }
   }
   return TRUE;
 }
 
-int mzd_cmp(const packedmatrix *A, const packedmatrix *B) {
+int mzd_cmp(const mzd_t *A, const mzd_t *B) {
   assert(A->offset == 0);
   assert(B->offset == 0);
 
 
   for(i=0; i < A->nrows ; i++) {
     for(j=0 ; j< A->width ; j++) {
-      if ( A->values[A->rowswap[i] + j] < B->values[B->rowswap[i] + j])
+      if ( A->rows[i][j] < B->rows[i][j] )
 	return -1;
-      else if( A->values[A->rowswap[i] + j] > B->values[B->rowswap[i] + j])
+      else if( A->rows[i][j] > B->rows[i][j] )
 	return 1;
     }
   }
   return 0;
 }
 
-packedmatrix *mzd_copy(packedmatrix *N, const packedmatrix *P) {
+mzd_t *mzd_copy(mzd_t *N, const mzd_t *P) {
   if (N == P)
     return N;
 
       if (N->nrows < P->nrows || N->ncols < P->ncols)
 	m4ri_die("mzd_copy: Target matrix is too small.");
     }
-    size_t i, j, p_truerow, n_truerow;
+    size_t i, j;
+    word *p_truerow, *n_truerow;
     const size_t wide = P->width-1; 
     word mask = LEFT_BITMASK(P->ncols);
     for (i=0; i<P->nrows; i++) {
-      p_truerow = P->rowswap[i];
-      n_truerow = N->rowswap[i];
+      p_truerow = P->rows[i];
+      n_truerow = N->rows[i];
       for (j=0; j<wide; j++) {
-        N->values[n_truerow + j] = P->values[p_truerow + j];
+       n_truerow[j] = p_truerow[j];
       }
-      N->values[n_truerow + wide] = (N->values[n_truerow + wide] & ~mask) | (P->values[p_truerow +  wide] & mask);
+      n_truerow[wide] = (n_truerow[wide] & ~mask) | (p_truerow[wide] & mask);
     }
   } else { // P->offset > 0
     if (N == NULL) {
 }
 
 /* This is sometimes called augment */
-packedmatrix *mzd_concat(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
+mzd_t *mzd_concat(mzd_t *C, const mzd_t *A, const mzd_t *B) {
   assert(A->offset == 0);
   assert(B->offset == 0);
-  size_t i, j, src_truerow, dst_truerow;
+  size_t i, j;
+  word *src_truerow, *dst_truerow;
   
   if (A->nrows != B->nrows) {
     m4ri_die("mzd_concat: Bad arguments to concat!\n");
   }
 
   for (i=0; i<A->nrows; i++) {
-    dst_truerow = C->rowswap[i];
-    src_truerow = A->rowswap[i];
+    dst_truerow = C->rows[i];
+    src_truerow = A->rows[i];
     for (j=0; j <A->width; j++) {
-      C->values[dst_truerow + j] = A->values[src_truerow + j];
+      dst_truerow[j] = src_truerow[j];
     }
   }
 
   return C;
 }
 
-packedmatrix *mzd_stack(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
+mzd_t *mzd_stack(mzd_t *C, const mzd_t *A, const mzd_t *B) {
   assert(A->offset == 0);
   assert(B->offset == 0);
-  size_t i, j, src_truerow, dst_truerow;
+  size_t i, j;
+  word *src_truerow, *dst_truerow;
 
   if (A->ncols != B->ncols) {
     m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n",A->ncols, B->ncols);
   }
   
   for(i=0; i<A->nrows; i++) {
-    src_truerow = A->rowswap[i];
-    dst_truerow = C->rowswap[i];
+    src_truerow = A->rows[i];
+    dst_truerow = C->rows[i];
     for (j=0; j<A->width; j++) {
-      C->values[dst_truerow + j] = A->values[src_truerow + j]; 
+      dst_truerow[j] = src_truerow[j]; 
     }
   }
 
   for(i=0; i<B->nrows; i++) {
-    dst_truerow = C->rowswap[A->nrows + i];
-    src_truerow = B->rowswap[i];
+    dst_truerow = C->rows[A->nrows + i];
+    src_truerow = B->rows[i];
     for (j=0; j<B->width; j++) {
-      C->values[dst_truerow + j] = B->values[src_truerow + j]; 
+      dst_truerow[j] = src_truerow[j]; 
     }
   }
   return C;
 }
 
-packedmatrix *mzd_invert_naive(packedmatrix *INV, packedmatrix *A, const packedmatrix *I) {
+mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t *A, const mzd_t *I) {
   assert(A->offset == 0);
-  packedmatrix *H;
+  mzd_t *H;
   int x;
 
   H = mzd_concat(NULL, A, I);
   return INV;
 }
 
-packedmatrix *mzd_add(packedmatrix *ret, const packedmatrix *left, const packedmatrix *right) {
+mzd_t *mzd_add(mzd_t *ret, const mzd_t *left, const mzd_t *right) {
   if (left->nrows != right->nrows || left->ncols != right->ncols) {
     m4ri_die("mzd_add: rows and columns must match.\n");
   }
   return _mzd_add(ret, left, right);
 }
 
-packedmatrix *_mzd_add(packedmatrix *C, const packedmatrix *A, const packedmatrix *B) {
+mzd_t *_mzd_add(mzd_t *C, const mzd_t *A, const mzd_t *B) {
   size_t i;
   size_t nrows = MIN(MIN(A->nrows, B->nrows), C->nrows);
-  const packedmatrix *tmp;
+  const mzd_t *tmp;
 
   if (C == B) { //swap
     tmp = A;
   return C;
 }
 
-packedmatrix *mzd_submatrix(packedmatrix *S, const packedmatrix *M, const size_t startrow, const size_t startcol, const size_t endrow, const size_t endcol) {
+mzd_t *mzd_submatrix(mzd_t *S, const mzd_t *M, const size_t startrow, const size_t startcol, const size_t endrow, const size_t endcol) {
   size_t nrows, ncols, i, colword, x, y, block, spot, startword;
-  size_t truerow;
+  word *truerow;
   word temp  = 0;
   
   nrows = endrow - startrow;
   if ((M->offset + startcol)%RADIX == 0) {
     if(ncols/RADIX) {
       for(x = startrow, i=0; i<nrows; i++, x++) {
-        memcpy(S->values + S->rowswap[i], M->values + M->rowswap[x] + startword, 8*(ncols/RADIX));
+        memcpy(S->rows[i], M->rows[x] + startword, 8*(ncols/RADIX));
       }
     }
     if (ncols%RADIX) {
       for(x = startrow, i=0; i<nrows; i++, x++) {
         /* process remaining bits */
-	temp = M->values[M->rowswap[x] + startword + ncols/RADIX] & LEFT_BITMASK(ncols);
-	S->values[S->rowswap[i] + ncols/RADIX] = temp;
+	temp = M->rows[x][startword + ncols/RADIX] & LEFT_BITMASK(ncols);
+	S->rows[i][ncols/RADIX] = temp;
       } 
     }
     /* startcol is not the beginning of a word */
   } else { 
     spot = (M->offset + startcol) % RADIX;
     for(x = startrow, i=0; i<nrows; i++, x+=1) {
-      truerow = M->rowswap[x];
+      truerow = M->rows[x];
 
       /* process full words first */
       for(colword=0; colword<(int)(ncols/RADIX); colword++) {
-	block = truerow + colword + startword;
-	temp = (M->values[block] << (spot)) | (M->values[block + 1] >> (RADIX-spot) ); 
-	S->values[S->rowswap[i] + colword] = temp;
+	block = colword + startword;
+	temp = (truerow[block] << (spot)) | (truerow[block + 1] >> (RADIX-spot) ); 
+	S->rows[i][colword] = temp;
       }
       /* process remaining bits (lazy)*/
       colword = ncols/RADIX;
   return S;
 }
 
-void mzd_combine( packedmatrix * C, const size_t c_row, const size_t c_startblock,
-		  const packedmatrix * A, const size_t a_row, const size_t a_startblock, 
-		  const packedmatrix * B, const size_t b_row, const size_t b_startblock) {
+void mzd_combine( mzd_t * C, const size_t c_row, const size_t c_startblock,
+		  const mzd_t * A, const size_t a_row, const size_t a_startblock, 
+		  const mzd_t * B, const size_t b_row, const size_t b_startblock) {
 
   size_t i;
   if(C->offset || A->offset || B->offset) {
 
   size_t wide = A->width - a_startblock;
 
-  word *a = A->values + a_startblock + A->rowswap[a_row];
-  word *b = B->values + b_startblock + B->rowswap[b_row];
+  word *a = a_startblock + A->rows[a_row];
+  word *b = b_startblock + B->rows[b_row];
   
   if( C == A && a_row == c_row && a_startblock == c_startblock) {
 #ifdef HAVE_SSE2
     return;
     
   } else { /* C != A */
-    word *c = C->values + c_startblock + C->rowswap[c_row];
+    word *c = c_startblock + C->rows[c_row];
 
     /* this is a corner case triggered by Strassen multiplication
        which assumes certain (virtual) matrix sizes */
 }
 
 
-void mzd_col_swap(packedmatrix *M, const size_t cola, const size_t colb) {
+void mzd_col_swap(mzd_t *M, const size_t cola, const size_t colb) {
   if (cola == colb)
     return;
 
     const word ai = RADIX - a_bit - 1;
     const word bi = RADIX - b_bit - 1;
     for (i=0; i<M->nrows; i++) {
-      base = (M->values + M->rowswap[i] + a_word);
+      base = (M->rows[i] + a_word);
       register word b = *base;
       register word x = ((b >> ai) ^ (b >> bi)) & 1; // XOR temporary
       *base = b ^ ((x << ai) | (x << bi));
     const size_t offset = a_bit - b_bit;
 
     for (i=0; i<M->nrows; i++) {
-      base = M->values + M->rowswap[i];
+      base = M->rows[i];
       a = *(base + a_word);
       b = *(base + b_word);
 
   } else {
     const size_t offset = b_bit - a_bit;
     for (i=0; i<M->nrows; i++) {
-      base = M->values + M->rowswap[i];
+      base = M->rows[i];
 
       a = *(base + a_word);
       b = *(base + b_word);
 }
 
 
-int mzd_is_zero(packedmatrix *A) {
+int mzd_is_zero(mzd_t *A) {
   /* Could be improved: stopping as the first non zero value is found (status!=0)*/
   size_t mb = A->nrows;
   size_t nb = A->ncols;
     word mask_end = LEFT_BITMASK(nbrest);
     size_t i;
     for (i=0; i<mb; ++i) {
-        status |= A->values [A->rowswap [i]] & mask_begin;
+        status |= A->rows[i][0] & mask_begin;
         size_t j;
         for ( j = 1; j < A->width-1; ++j)
-            status |= A->values [A->rowswap [i] + j];
-        status |= A->values [A->rowswap [i] + A->width - 1] & mask_end;
+            status |= A->rows[i][j];
+        status |= A->rows[i][A->width - 1] & mask_end;
     }
   } else {
           // Small A
 
     size_t i;
     for (i=0; i < mb; ++i) {
-        status |= A->values [A->rowswap [i]] & mask;
+        status |= A->rows[i][0] & mask;
     }
   }
   
   return !status;
 }
 
-void mzd_copy_row(packedmatrix* B, size_t i, const packedmatrix* A, size_t j) {
+void mzd_copy_row(mzd_t* B, size_t i, const mzd_t* A, size_t j) {
   assert(B->offset == A->offset);
   assert(B->ncols >= A->ncols);
   size_t k;
   const size_t width= MIN(B->width, A->width) - 1;
 
-  word* a=A->values + A->rowswap[j];
-  word* b=B->values + B->rowswap[i];
+  word* a = A->rows[j];
+  word* b = B->rows[i];
  
   word mask_begin = RIGHT_BITMASK(RADIX - A->offset);
   word mask_end = LEFT_BITMASK( (A->offset + A->ncols)%RADIX );
 }
 
 
-void mzd_row_clear_offset(packedmatrix *M, size_t row, size_t coloffset) {
+void mzd_row_clear_offset(mzd_t *M, size_t row, size_t coloffset) {
   coloffset += M->offset;
   size_t startblock= coloffset/RADIX;
   size_t i;
 
   /* make sure to start clearing at coloffset */
   if (coloffset%RADIX) {
-    temp = M->values[M->rowswap[row] + startblock];
+    temp = M->rows[row][startblock];
     temp &= RIGHT_BITMASK(RADIX - coloffset);
   } else {
     temp = 0;
   }
-  M->values[M->rowswap[row] + startblock] = temp;
+  M->rows[row][startblock] = temp;
   temp=0;
   for ( i=startblock+1; i < M->width; i++ ) {
-    M->values[M->rowswap[row] + i] = temp;
+    M->rows[row][i] = temp;
   }
 }
 
 
-int mzd_find_pivot(packedmatrix *A, size_t start_row, size_t start_col, size_t *r, size_t *c) { 
+int mzd_find_pivot(mzd_t *A, size_t start_row, size_t start_col, size_t *r, size_t *c) { 
   assert(A->offset == 0);
   register size_t i = start_row;
   register size_t j = start_col;
     const size_t word_offset = start_col / RADIX;
     const word mask_begin = RIGHT_BITMASK(RADIX-bit_offset);
     for(i=start_row; i<nrows; i++) {
-      const word curr_data = A->values[A->rowswap[i] + word_offset] & mask_begin;
+      const word curr_data = A->rows[i][word_offset] & mask_begin;
       if (curr_data > data && leftmost_bit(curr_data) > leftmost_bit(data)) {
         row_candidate = i;
         data = curr_data;
     /* handle complete words */
     for(j=word_offset + 1; j<A->width - 1; j++) {
       for(i=start_row; i<nrows; i++) {
-        const word curr_data = A->values[A->rowswap[i] + j];
+        const word curr_data = A->rows[i][j];
         if (curr_data > data && leftmost_bit(curr_data) > leftmost_bit(data)) {
           row_candidate = i;
           data = curr_data;
     const word mask_end = LEFT_BITMASK(end_offset);
     j = A->width-1;
     for(i=start_row; i<nrows; i++) {
-      const word curr_data = A->values[A->rowswap[i] + j] & mask_end;
+      const word curr_data = A->rows[i][j] & mask_end;
       if (curr_data > data && leftmost_bit(curr_data) > leftmost_bit(data)) {
         row_candidate = i;
         data = curr_data;
    return (int)n;
 }
 
-double mzd_density(packedmatrix *A, int res) {
+double mzd_density(mzd_t *A, int res) {
   long count = 0;
   long total = 0;
   
     return ((double)count)/(A->ncols * A->nrows);
   } else {
     for(size_t i=0; i<A->nrows; i++) {
-      size_t truerow = A->rowswap[i];
+      word *truerow = A->rows[i];
       for(size_t j = A->offset; j<RADIX; j++)
         if(mzd_read_bit(A, i, j))
           count++;
       total += (long)RADIX - A->offset;
       for(size_t j=1; j<A->width-1; j+=res) {
-        count += m4ri_bitcount(A->values[truerow + j]);
+        count += m4ri_bitcount(truerow[j]);
         total += RADIX;
       }
       for(size_t j = 0; j < (A->offset + A->ncols)%RADIX; j++)

File src/packedmatrix.h

 
 typedef struct {
   /**
-   * Contains the actual values packed into words of size RADIX.
+   * Contains pointers to the actual blocks of memory containing the
+   * values packed into words of size RADIX.
    */
 
-  word *values;
+  mmb_t *blocks;
+
+  /**
+   * Size of each block in bytes.
+   */
+
+  size_t *blocksizes;
 
   /**
    * Number of rows.
   size_t offset;
   
   /**
-   * Offsets to each row, so e.g. the first word of the i-th row
-   * is m->values[m->rowswap[i]]
+   * Address of first word in each row, so the first word of row i is
+   * is m->rows[i]
    */
 
-  size_t *rowswap;
+  word **rows;
 
-} packedmatrix;
+} mzd_t;
 
 /**
  * \brief Create a new matrix of dimension r x c.
  *
  */
 
-packedmatrix *mzd_init(const size_t r, const size_t c);
+mzd_t *mzd_init(const size_t r, const size_t c);
 
 /**
  * \brief Free a matrix created with mzd_init.
  * \param A Matrix
  */
 
-void mzd_free(packedmatrix *A);
+void mzd_free(mzd_t *A);
 
 
 /**
  *
  */
 
-packedmatrix *mzd_init_window(const packedmatrix *M, const size_t lowr, const size_t lowc, const size_t highr, const size_t highc);
+mzd_t *mzd_init_window(const mzd_t *M, const size_t lowr, const size_t lowc, const size_t highr, const size_t highc);
 
 /**
  * \brief Free a matrix window created with mzd_init_window.
  * \param A Matrix
  */
 
-void mzd_free_window(packedmatrix *A);
+void mzd_free_window(mzd_t *A);
 
 /**
  * \brief Swap the two rows rowa and rowb.
  * \param rowb Row index.
  */
  
-static inline void mzd_row_swap(packedmatrix *M, const size_t rowa, const size_t rowb) {
+static inline void mzd_row_swap(mzd_t *M, const size_t rowa, const size_t rowb) {
   if(rowa == rowb)
     return;
   size_t i;
   size_t width = M->width - 1;
-  word *a = M->values + M->rowswap[rowa];
-  word *b = M->values + M->rowswap[rowb];
+  word *a = M->rows[rowa];
+  word *b = M->rows[rowb];
   word tmp; 
   word mask_begin = RIGHT_BITMASK(RADIX - M->offset);
   word mask_end = LEFT_BITMASK( (M->offset + M->ncols)%RADIX );
  * \param j Source row index.
  */
 
-void mzd_copy_row(packedmatrix* B, size_t i, const packedmatrix* A, size_t j);
+void mzd_copy_row(mzd_t* B, size_t i, const mzd_t* A, size_t j);
 
 /**
  * \brief Swap the two columns cola and colb.
  * \param colb Column index.
  */
  
-void mzd_col_swap(packedmatrix *M, const size_t cola, const size_t colb);
+void mzd_col_swap(mzd_t *M, const size_t cola, const size_t colb);
 
 /**
  * \brief Swap the two columns cola and colb but only between start_row and stop_row.
  * \param stop_row Row index (exclusive).
  */
  
-static inline void mzd_col_swap_in_rows(packedmatrix *M, const size_t cola, const size_t colb, const size_t start_row, const size_t stop_row) {
+static inline void mzd_col_swap_in_rows(mzd_t *M, const size_t cola, const size_t colb, const size_t start_row, const size_t stop_row) {
   if (cola == colb)
     return;
 
     const word ai = RADIX - a_bit - 1;
     const word bi = RADIX - b_bit - 1;
     for (i=start_row; i<stop_row; i++) {
-      base = (M->values + M->rowswap[i] + a_word);
+      base = (M->rows[i] + a_word);
       register word b = *base;
       register word x = ((b >> ai) ^ (b >> bi)) & 1; // XOR temporary
       *base = b ^ ((x << ai) | (x << bi));
   if(a_bit > b_bit) {
     const size_t offset = a_bit - b_bit;
     for (i=start_row; i<stop_row; i++) {
-      base = M->values + M->rowswap[i];
+      base = M->rows[i];
       a = *(base + a_word);
       b = *(base + b_word);
 
   } else {
     const size_t offset = b_bit - a_bit;
     for (i=start_row; i<stop_row; i++) {
-      base = M->values + M->rowswap[i];
+      base = M->rows[i];
       a = *(base + a_word);
       b = *(base + b_word);
 
  *
  */
 
-static inline BIT mzd_read_bit(const packedmatrix *M, const size_t row, const size_t col ) {
-  return GET_BIT(M->values[ M->rowswap[row] + (col+M->offset)/RADIX ], (col+M->offset) % RADIX);
+static inline BIT mzd_read_bit(const mzd_t *M, const size_t row, const size_t col ) {
+  return GET_BIT(M->rows[row][(col+M->offset)/RADIX], (col+M->offset) % RADIX);
 }
 
 /**
  *
  */
 
-static inline void mzd_write_bit(packedmatrix *M, const size_t row, const size_t col, const BIT value) {
+static inline void mzd_write_bit(mzd_t *M, const size_t row, const size_t col, const BIT value) {
   if (value==1)
-    SET_BIT(M->values[ M->rowswap[row] + (col+M->offset)/RADIX ], (col+M->offset) % RADIX);
+    SET_BIT(M->rows[row][(col+M->offset)/RADIX], (col+M->offset) % RADIX);
   else
-    CLR_BIT(M->values[ M->rowswap[row] + (col+M->offset)/RADIX ], (col+M->offset) % RADIX);
+    CLR_BIT(M->rows[row][(col+M->offset)/RADIX], (col+M->offset) % RADIX);
 }
 
 /**
  * \param M Matrix
  */
 
-void mzd_print(const packedmatrix *M );
+void mzd_print(const mzd_t *M);
 
 /**
  * \brief Print the matrix to stdout.
  * \param M Matrix
  */
 
-void mzd_print_tight(const packedmatrix *M );
+void mzd_print_tight(const mzd_t *M);
 
 /**
  * \brief Add the rows sourcerow and destrow and stores the total in the row
  * \param coloffset Column offset
  */
 
-/*void mzd_row_add_offset(packedmatrix *M, size_t dstrow, size_t srcrow, size_t coloffset); */
-static inline void mzd_row_add_offset(packedmatrix *M, size_t dstrow, size_t srcrow, size_t coloffset) {
+/*void mzd_row_add_offset(mzd_t *M, size_t dstrow, size_t srcrow, size_t coloffset); */
+static inline void mzd_row_add_offset(mzd_t *M, size_t dstrow, size_t srcrow, size_t coloffset) {
   coloffset += M->offset;
   const size_t startblock= coloffset/RADIX;
   size_t wide = M->width - startblock;
-  word *src = M->values + M->rowswap[srcrow] + startblock;
-  word *dst = M->values + M->rowswap[dstrow] + startblock;
+  word *src = M->rows[srcrow] + startblock;
+  word *dst = M->rows[dstrow] + startblock;
 
   word temp = *src++;
   if (coloffset%RADIX)
  * \note this can be done much faster with mzd_combine.
  */
 
-void mzd_row_add(packedmatrix *M, const size_t sourcerow, const size_t destrow);
+void mzd_row_add(mzd_t *M, const size_t sourcerow, const size_t destrow);
 
 /**
  * \brief Transpose a matrix.
  * \param A Matrix
  */
 
-packedmatrix *mzd_transpose(packedmatrix *DST, const packedmatrix *A );
+mzd_t *mzd_transpose(mzd_t *DST, const mzd_t *A );
 
 /**
  * \brief Naive cubic matrix multiplication.
  * function called _mzd_mul_naive
  *
  */
-packedmatrix *mzd_mul_naive(packedmatrix *C, const packedmatrix *A, const packedmatrix *B);
+mzd_t *mzd_mul_naive(mzd_t *C, const mzd_t *A, const mzd_t *B);
 
 /**
  * \brief Naive cubic matrix multiplication and addition
  * function called _mzd_mul_naive
  */
 
-packedmatrix *mzd_addmul_naive(packedmatrix *C, const packedmatrix *A, const packedmatrix *B);
+mzd_t *mzd_addmul_naive(mzd_t *C, const mzd_t *A, const mzd_t *B);
 
 /**
  * \brief Naive cubic matrix multiplication with the pre-transposed B.
  * \param clear Whether to clear C before accumulating AB
  */
 
-packedmatrix *_mzd_mul_naive(packedmatrix *C, const packedmatrix *A, const packedmatrix *B, const int clear);
+mzd_t *_mzd_mul_naive(mzd_t *C, const mzd_t *A, const mzd_t *B, const int clear);
 
 /**
  * \brief Matrix multiplication optimized for v*A where v is a vector.
  * \param clear If set clear C first, otherwise add result to C.
  *
  */
-packedmatrix *_mzd_mul_va(packedmatrix *C, const packedmatrix *v, const packedmatrix *A, const int clear);
+mzd_t *_mzd_mul_va(mzd_t *C, const mzd_t *v, const mzd_t *A, const int clear);
 
 /**
  * \brief Fill matrix M with uniformly distributed bits.
  * \wordoffset
  */
 
-void mzd_randomize(packedmatrix *M);
+void mzd_randomize(mzd_t *M);
 
 /**
  * \brief Set the matrix M to the value equivalent to the integer
  * \param value Either 0 or 1
  */
 
-void mzd_set_ui(packedmatrix *M, const unsigned value);
+void mzd_set_ui(mzd_t *M, const unsigned value);
 
 /**
  * \brief Gaussian elimination.
  * \wordoffset
  */
 
-int mzd_gauss_delayed(packedmatrix *M, const size_t startcol, const int full);
+int mzd_gauss_delayed(mzd_t *M, const size_t startcol, const int full);
 
 /**
  * \brief Gaussian elimination.
  * \sa mzd_echelonize_m4ri(), mzd_echelonize_pluq()
  */
 
-int mzd_echelonize_naive(packedmatrix *M, const int full);
+int mzd_echelonize_naive(mzd_t *M, const int full);
 
 /**
  * \brief Return TRUE if A == B.
  * \wordoffset
  */
 
-BIT mzd_equal(const packedmatrix *A, const packedmatrix *B );
+BIT mzd_equal(const mzd_t *A, const mzd_t *B );
 
 /**
  * \brief Return -1,0,1 if if A < B, A == B or A > B respectively.
  * \wordoffset
  */
 
-int mzd_cmp(const packedmatrix *A, const packedmatrix *B);
+int mzd_cmp(const mzd_t *A, const mzd_t *B);
 
 /**
  * \brief Copy matrix  A to DST.
  * \wordoffset
  */
 
-packedmatrix *mzd_copy(packedmatrix *DST, const packedmatrix *A);
+mzd_t *mzd_copy(mzd_t *DST, const mzd_t *A);
 
 /**
  * \brief Concatenate B to A and write the result to C.
  * 
  * That is,
  *
-\verbatim
-[ A ], [ B ] -> [ A  B ] = C
-\endverbatim
+ \verbatim
+ [ A ], [ B ] -> [ A  B ] = C
+ \endverbatim
  *
  * The inputs are not modified but a new matrix is created.
  *
  * \wordoffset
  */
 
-packedmatrix *mzd_concat(packedmatrix *C, const packedmatrix *A, const packedmatrix *B);
+mzd_t *mzd_concat(mzd_t *C, const mzd_t *A, const mzd_t *B);
 
 /**
  * \brief Stack A on top of B and write the result to C.
  *
  * That is, 
  *
-\verbatim
-[ A ], [ B ] -> [ A ] = C
-                [ B ]
-\endverbatim
+ \verbatim
+ [ A ], [ B ] -> [ A ] = C
+                 [ B ]
+ \endverbatim
  *
  * The inputs are not modified but a new matrix is created.
  *
  * \wordoffset
  */
 
-packedmatrix *mzd_stack(packedmatrix *C, const packedmatrix *A, const packedmatrix *B);
+mzd_t *mzd_stack(mzd_t *C, const mzd_t *A, const mzd_t *B);
 
 /**
  * \brief Copy a submatrix.
  * \param highr stop row (this row is \em not included)
  * \param highc stop column (this column is \em not included)
  */
-packedmatrix *mzd_submatrix(packedmatrix *S, const packedmatrix *M, const size_t lowr, const size_t lowc, const size_t highr, const size_t highc);
+mzd_t *mzd_submatrix(mzd_t *S, const mzd_t *M, const size_t lowr, const size_t lowc, const size_t highr, const size_t highc);
 
 /**
  * \brief Invert the matrix target using Gaussian elimination. 
  * \wordoffset
  */
 
-packedmatrix *mzd_invert_naive(packedmatrix *INV, packedmatrix *A, const packedmatrix *I);
+mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t *A, const mzd_t *I);