# Commits

committed 244dcf4

implemented heuristic algorithm which starts with M4RI and switches to PLS based
decomposition when the remaining matrix has a density of > 0.15.

# src/brilliantrussian.c

` `
` #include "brilliantrussian.h"`
` #include "grayflex.h"`
`-`
`+#include "lqup.h"`
` `
` /**`
`  * \brief Perform Gaussian reduction to reduced row echelon form on a`
`   }`
` }`
` `
`-size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {`
`+size_t _mzd_echelonize_m4ri(mzd_t *A, const int full, int k, int heuristic, const double threshold) {`
`   /**`
`    * \par General algorithm`
`    * \li Step 1.Denote the first column to be processed in a given`
`    * 2.5 coeffcient to 3.`
`    *`
`    * \attention This function implements a variant of the algorithm`
`-   * described above.`
`+   * described above. If heuristic is true, then this algorithm, will`
`+   * switch to PLUQ based echelon form computation once the density`
`+   * reaches the threshold.`
`    */`
` `
`   const size_t ncols = A->ncols; `
`     if ( (6*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 )`
`       k -= 1;`
`   }`
`-  /*printf("k: %d\n",k);*/`
`   int kk = 6*k;`
` `
`   mzd_t *U  = mzd_init(kk, A->ncols);`
`   size_t *L4 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
`   size_t *L5 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
` `
`+  size_t last_check = 0;`
`+`
`+  if (heuristic) {`
`+    if (c<A->ncols && r< A->nrows && _mzd_density(A,32, 0, 0) >= threshold) {`
`+      //#ifndef NDEBUG`
`+      printf("switching at %zu x %zu (%.5f)\n",r,c,mzd_density(A,32));`
`+      //#endif`
`+      mzd_t *Abar = mzd_init_window(A, r, (c/RADIX)*RADIX, A->nrows, A->ncols);`
`+      r += mzd_echelonize_pluq(Abar, full);`
`+      mzd_free(Abar);`
`+      c = ncols;`
`+    }`
`+  }`
`+`
`   while(c<ncols) {`
`+    if (heuristic && c > (last_check + 256)) {`
`+      last_check = c;`
`+      if (c<A->ncols && r< A->nrows && _mzd_density(A,32, r, c) >= threshold) {`
`+        //#ifndef NDEBUG`
`+        printf("switching at %zu x %zu (%.5f)\n",r,c,_mzd_density(A,32,r,c));`
`+        //#endif`
`+        mzd_t *Abar = mzd_init_window(A, r, (c/RADIX)*RADIX, A->nrows, A->ncols);`
`+        if (!full) {`
`+          r += mzd_echelonize_pluq(Abar, full);`
`+        } else {`
`+          size_t r2 = mzd_echelonize_pluq(Abar, full);`
`+          if (r>0)`
`+            _mzd_top_echelonize_m4ri(A, 0, r, c, r);`
`+          r += r2;`
`+        }`
`+        mzd_free(Abar);`
`+        c = ncols;`
`+        break;`
`+      }`
`+    }`
`+`
`     if(c+kk > A->ncols) {`
`       kk = ncols - c;`
`     }`
`   return r;`
` }`
` `
`-void mzd_top_echelonize_m4ri(mzd_t *A, int k) {`
`+size_t _mzd_top_echelonize_m4ri(mzd_t *A, int k, size_t r, size_t c, size_t max_r) {`
`   const size_t ncols = A->ncols; `
`-  size_t r = 0;`
`-  size_t c = 0;`
`   int kbar = 0;`
` `
`   if (k == 0) {`
`-    k = m4ri_opt_k(A->nrows, A->ncols, 0);`
`-    if (k>5) {`
`-      k -= 4;`
`-    }`
`+    k = m4ri_opt_k(max_r, A->ncols, 0);`
`+    if (k>=7)`
`+      k = 7;`
`+    if ( (6*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 )`
`+      k -= 1;`
`   }`
`-  int kk = 4*k;`
`+  int kk = 6*k;`
` `
`+  mzd_t *U  = mzd_init(kk, A->ncols);`
`   mzd_t *T0 = mzd_init(TWOPOW(k), A->ncols);`
`   mzd_t *T1 = mzd_init(TWOPOW(k), A->ncols);`
`   mzd_t *T2 = mzd_init(TWOPOW(k), A->ncols);`
`   mzd_t *T3 = mzd_init(TWOPOW(k), A->ncols);`
`+  mzd_t *T4 = mzd_init(TWOPOW(k), A->ncols);`
`+  mzd_t *T5 = mzd_init(TWOPOW(k), A->ncols);`
`   size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
`   size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
`   size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
`   size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
`+  size_t *L4 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
`+  size_t *L5 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));`
` `
`   while(c<ncols) {`
`     if(c+kk > A->ncols) {`
`       kk = ncols - c;`
`     }`
`-    kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk);`
`+    kbar = _mzd_gauss_submatrix_full(A, r, c, MIN(A->nrows,r+kk), kk);`
` `
`-    if (kbar>3*k) {`
`+    if (kbar>5*k) {`
`+      const int rem = kbar%6;`
`+      const int ka = kbar/6 + ((rem>=5) ? 1 : 0);`
`+      const int kb = kbar/6 + ((rem>=4) ? 1 : 0);`
`+      const int kc = kbar/6 + ((rem>=3) ? 1 : 0);`
`+      const int kd = kbar/6 + ((rem>=2) ? 1 : 0);`
`+      const int ke = kbar/6 + ((rem>=1) ? 1 : 0);;`
`+      const int kf = kbar/6;`
`+`
`+      mzd_make_table(A, r, c, ka, T0, L0);`
`+      mzd_make_table(A, r+ka, c, kb, T1, L1);`
`+      mzd_make_table(A, r+ka+kb, c, kc, T2, L2);`
`+      mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);`
`+      mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4);`
`+      mzd_make_table(A, r+ka+kb+kc+kd+ke, c, kf, T5, L5);`
`+      mzd_process_rows6(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5);`
`+`
`+  } else if (kbar>4*k) { `
`+      const int rem = kbar%5;`
`+      const int ka = kbar/5 + ((rem>=4) ? 1 : 0);`
`+      const int kb = kbar/5 + ((rem>=3) ? 1 : 0);`
`+      const int kc = kbar/5 + ((rem>=2) ? 1 : 0);`
`+      const int kd = kbar/5 + ((rem>=1) ? 1 : 0);`
`+      const int ke = kbar/5;`
`+`
`+      mzd_make_table(A, r, c, ka, T0, L0);`
`+      mzd_make_table(A, r+ka, c, kb, T1, L1);`
`+      mzd_make_table(A, r+ka+kb, c, kc, T2, L2);`
`+      mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);`
`+      mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4);`
`+      mzd_process_rows5(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4);`
`+      `
`+    } else if (kbar>3*k) {`
`       const int rem = kbar%4;`
`       const int ka = kbar/4 + ((rem>=3) ? 1 : 0);`
`       const int kb = kbar/4 + ((rem>=2) ? 1 : 0);`
`       const int kc = kbar/4 + ((rem>=1) ? 1 : 0);`
`       const int kd = kbar/4;`
`+`
`       mzd_make_table(A, r, c, ka, T0, L0);`
`       mzd_make_table(A, r+ka, c, kb, T1, L1);`
`       mzd_make_table(A, r+ka+kb, c, kc, T2, L2);`
`       mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);`
`-      mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);`
`+      mzd_process_rows4(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);`
`       `
`     } else if (kbar>2*k) {`
`-      int rem = kbar%3;`
`-      int ka = kbar/3 + ((rem>=2) ? 1 : 0);`
`-      int kb = kbar/3 + ((rem>=1) ? 1 : 0);`
`-      int kc = kbar/3;`
`+      const int rem = kbar%3;`
`+      const int ka = kbar/3 + ((rem>=2) ? 1 : 0);`
`+      const int kb = kbar/3 + ((rem>=1) ? 1 : 0);`
`+      const int kc = kbar/3;`
`+`
`       mzd_make_table(A, r, c, ka, T0, L0);`
`       mzd_make_table(A, r+ka, c, kb, T1, L1);`
`       mzd_make_table(A, r+ka+kb, c, kc, T2, L2);`
`-      mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2);`
`+      mzd_process_rows3(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2);`
`       `
`     } else if (kbar>k) {`
`       const int ka = kbar/2;`
`       const int kb = kbar - ka;`
`       mzd_make_table(A, r, c, ka, T0, L0);`
`       mzd_make_table(A, r+ka, c, kb, T1, L1);`
`-      mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1);`
`+      mzd_process_rows2(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1);`
`       `
`     } else if(kbar > 0) {`
`       mzd_make_table(A, r, c, kbar, T0, L0);`
`-      mzd_process_rows(A, 0, r, c, kbar, T0, L0);`
`+      mzd_process_rows(A, 0, MIN(r, max_r), c, kbar, T0, L0);`
`     }`
`+`
`     r += kbar;`
`     c += kbar;`
`     if(kk!=kbar) {`
`   m4ri_mm_free(L2);`
`   mzd_free(T3);`
`   m4ri_mm_free(L3);`
`+  mzd_free(T4);`
`+  m4ri_mm_free(L4);`
`+  mzd_free(T5);`
`+  m4ri_mm_free(L5);`
`+  mzd_free(U);`
`+  return r;`
` }`
` `
`+void mzd_top_echelonize_m4ri(mzd_t *M, int k) {`
`+  _mzd_top_echelonize_m4ri(M,k,0,0,M->nrows);`
`+}`
`+`
`+`
` mzd_t *mzd_invert_m4ri(mzd_t *m, mzd_t *I, int k) {`
`   mzd_t *big = mzd_concat(NULL, m, I);`
`   size_t size=m->ncols;`
`   return C;`
` }`
` `
`+`
`+size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {`
`+  return _mzd_echelonize_m4ri(A, full, k, 1, 0.15);`
`+}`
`+`
`+size_t mzd_echelonize(mzd_t *A, int full) {`
`+  return _mzd_echelonize_m4ri(A, full, 0, 1, 0.15);`
`+}`

# src/brilliantrussian.h

`  *`
`  * \wordoffset`
`  *`
`- * \note This function isn't as optimized as it should be.`
`  */`
` `
` void mzd_top_echelonize_m4ri(mzd_t *M, int k);`
` `
` /**`
`+ * \brief Given a matrix in upper triangular form compute the reduced`
`+ * row echelon form of that matrix but only start to do anything for`
`+ * the pivot at (r,c).`
`+ * `
`+ * \param M Matrix to be reduced.`
`+ * \param k M4RI parameter, may be 0 for auto-choose.`
`+ * \param r Row index.`
`+ * \paran c Column index.`
`+ *`
`+ * \param max_r Only clear top max_r rows.`
`+ * \wordoffset`
`+ *`
`+ */`
`+`
`+size_t _mzd_top_echelonize_m4ri(mzd_t *A, int k, size_t r, size_t c, size_t max_r);`
`+`
`+/**`
`  * \brief Invert the matrix M using Konrod's method. `
`  * `
`  * To avoid recomputing the identity matrix over and over again, I may`
` `
` #define M4RM_GRAY8`
` `
`+/**`
`+ *`
`+ */`
`+`
`+size_t mzd_echelonize(mzd_t *A, int full);`
`+`
` #endif //BRILLIANTRUSSIAN_H`

# src/lqup.c

` `
`     size_t i, j;`
`     size_t n1 = (((ncols - 1) / RADIX + 1) >> 1) * RADIX;`
`+`
`     mzd_t *A0  = mzd_init_window(A,  0,  0, nrows,    n1);`
`     mzd_t *A1  = mzd_init_window(A,  0, n1, nrows, ncols);`
` `

# src/packedmatrix.c

`    return (int)n;`
` }`
` `
`-double mzd_density(mzd_t *A, int res) {`
`-  long count = 0;`
`-  long total = 0;`
`+`
`+double _mzd_density(mzd_t *A, int res, size_t r, size_t c) {`
`+  size_t count = 0;`
`+  size_t total = 0;`
`   `
`   if(res == 0)`
`     res = (int)(A->width/100.0);`
`     res = 1;`
` `
`   if(A->width == 1) {`
`-    for(size_t i=0; i<A->nrows; i++)`
`-      for(size_t j=0; j<A->ncols; j++)`
`+    for(size_t i=r; i<A->nrows; i++)`
`+      for(size_t j=c; j<A->ncols; j++)`
`         if(mzd_read_bit(A, i, j))`
`           count++;`
`     return ((double)count)/(A->ncols * A->nrows);`
`-  } else {`
`-    for(size_t i=0; i<A->nrows; i++) {`
`-      word *truerow = A->rows[i];`
`-      for(size_t j = A->offset; j<RADIX; j++)`
`-        if(mzd_read_bit(A, i, j))`
`-          count++;`
`-      total += (long)RADIX - A->offset;`
`-      for(size_t j=1; j<A->width-1; j+=res) {`
`+  }`
`+`
`+  for(size_t i=r; i<A->nrows; i++) {`
`+    word *truerow = A->rows[i];`
`+    for(size_t j = c; j< RADIX-A->offset; j++)`
`+      if(mzd_read_bit(A, i, j))`
`+        count++;`
`+    total += RADIX - A->offset;`
`+`
`+    for(size_t j=MAX(1,((A->offset+c)/RADIX)); j<A->width-1; j+=res) {`
`         count += m4ri_bitcount(truerow[j]);`
`         total += RADIX;`
`-      }`
`-      for(size_t j = 0; j < (A->offset + A->ncols)%RADIX; j++)`
`-        if(mzd_read_bit(A, i, j))`
`-          count++;`
`-      total += (A->offset + A->ncols)%RADIX;`
`     }`
`+    for(size_t j = 0; j < (A->offset + A->ncols)%RADIX; j++)`
`+      if(mzd_read_bit(A, i, j+ RADIX*((A->offset + A->ncols)/RADIX)))`
`+        count++;`
`+    total += (A->offset + A->ncols)%RADIX;`
`   }`
`+`
`   return ((double)count)/(total);`
` }`
` `
`+double mzd_density(mzd_t *A, int res) {`
`+  return _mzd_density(A, res, 0, 0);`
`+}`
` `
` size_t mzd_first_zero_row(mzd_t *A) {`
`   word mask_begin = RIGHT_BITMASK(RADIX-A->offset);`

# src/packedmatrix.h

` double mzd_density(mzd_t *A, int res);`
` `
` /**`
`+ * \brief Return the number of nonzero entries divided by nrows *`
`+ * ncols considering only the submatrix starting at (r,c).`
`+ *`
`+ * If res = 0 then 100 samples per row are made, if res > 0 the`
`+ * function takes res sized steps within each row (res = 1 uses every`
`+ * word).`
`+ *`
`+ * \param A Matrix`
`+ * \param res Resolution of sampling`
`+ * \param r Row to start counting`
`+ * \param c Column to start counting`
`+ */`
`+`
`+double _mzd_density(mzd_t *A, int res, size_t r, size_t c);`
`+`
`+`
`+/**`
`  * \brief Return the first row with all zero entries.`
`  *`
`  * If no such row can be found returns nrows.`