Commits

Anonymous committed 244dcf4

implemented heuristic algorithm which starts with M4RI and switches to PLS based
decomposition when the remaining matrix has a density of > 0.15.

Comments (0)

Files changed (5)

src/brilliantrussian.c

 
 #include "brilliantrussian.h"
 #include "grayflex.h"
-
+#include "lqup.h"
 
 /**
  * \brief Perform Gaussian reduction to reduced row echelon form on a
   }
 }
 
-size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {
+size_t _mzd_echelonize_m4ri(mzd_t *A, const int full, int k, int heuristic, const double threshold) {
   /**
    * \par General algorithm
    * \li Step 1.Denote the first column to be processed in a given
    * 2.5 coeffcient to 3.
    *
    * \attention This function implements a variant of the algorithm
-   * described above.
+   * described above. If heuristic is true, then this algorithm, will
+   * switch to PLUQ based echelon form computation once the density
+   * reaches the threshold.
    */
 
   const size_t ncols = A->ncols; 
     if ( (6*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 )
       k -= 1;
   }
-  /*printf("k: %d\n",k);*/
   int kk = 6*k;
 
   mzd_t *U  = mzd_init(kk, A->ncols);
   size_t *L4 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L5 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
 
+  size_t last_check = 0;
+
+  if (heuristic) {
+    if (c<A->ncols && r< A->nrows && _mzd_density(A,32, 0, 0) >= threshold) {
+      //#ifndef NDEBUG
+      printf("switching at %zu x %zu (%.5f)\n",r,c,mzd_density(A,32));
+      //#endif
+      mzd_t *Abar = mzd_init_window(A, r, (c/RADIX)*RADIX, A->nrows, A->ncols);
+      r += mzd_echelonize_pluq(Abar, full);
+      mzd_free(Abar);
+      c = ncols;
+    }
+  }
+
   while(c<ncols) {
+    if (heuristic && c > (last_check + 256)) {
+      last_check = c;
+      if (c<A->ncols && r< A->nrows && _mzd_density(A,32, r, c) >= threshold) {
+        //#ifndef NDEBUG
+        printf("switching at %zu x %zu (%.5f)\n",r,c,_mzd_density(A,32,r,c));
+        //#endif
+        mzd_t *Abar = mzd_init_window(A, r, (c/RADIX)*RADIX, A->nrows, A->ncols);
+        if (!full) {
+          r += mzd_echelonize_pluq(Abar, full);
+        } else {
+          size_t r2 = mzd_echelonize_pluq(Abar, full);
+          if (r>0)
+            _mzd_top_echelonize_m4ri(A, 0, r, c, r);
+          r += r2;
+        }
+        mzd_free(Abar);
+        c = ncols;
+        break;
+      }
+    }
+
     if(c+kk > A->ncols) {
       kk = ncols - c;
     }
   return r;
 }
 
-void mzd_top_echelonize_m4ri(mzd_t *A, int k) {
+size_t _mzd_top_echelonize_m4ri(mzd_t *A, int k, size_t r, size_t c, size_t max_r) {
   const size_t ncols = A->ncols; 
-  size_t r = 0;
-  size_t c = 0;
   int kbar = 0;
 
   if (k == 0) {
-    k = m4ri_opt_k(A->nrows, A->ncols, 0);
-    if (k>5) {
-      k -= 4;
-    }
+    k = m4ri_opt_k(max_r, A->ncols, 0);
+    if (k>=7)
+      k = 7;
+    if ( (6*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 )
+      k -= 1;
   }
-  int kk = 4*k;
+  int kk = 6*k;
 
+  mzd_t *U  = mzd_init(kk, A->ncols);
   mzd_t *T0 = mzd_init(TWOPOW(k), A->ncols);
   mzd_t *T1 = mzd_init(TWOPOW(k), A->ncols);
   mzd_t *T2 = mzd_init(TWOPOW(k), A->ncols);
   mzd_t *T3 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T4 = mzd_init(TWOPOW(k), A->ncols);
+  mzd_t *T5 = mzd_init(TWOPOW(k), A->ncols);
   size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
   size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
+  size_t *L4 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
+  size_t *L5 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
 
   while(c<ncols) {
     if(c+kk > A->ncols) {
       kk = ncols - c;
     }
-    kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk);
+    kbar = _mzd_gauss_submatrix_full(A, r, c, MIN(A->nrows,r+kk), kk);
 
-    if (kbar>3*k) {
+    if (kbar>5*k) {
+      const int rem = kbar%6;
+      const int ka = kbar/6 + ((rem>=5) ? 1 : 0);
+      const int kb = kbar/6 + ((rem>=4) ? 1 : 0);
+      const int kc = kbar/6 + ((rem>=3) ? 1 : 0);
+      const int kd = kbar/6 + ((rem>=2) ? 1 : 0);
+      const int ke = kbar/6 + ((rem>=1) ? 1 : 0);;
+      const int kf = kbar/6;
+
+      mzd_make_table(A, r, c, ka, T0, L0);
+      mzd_make_table(A, r+ka, c, kb, T1, L1);
+      mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
+      mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
+      mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4);
+      mzd_make_table(A, r+ka+kb+kc+kd+ke, c, kf, T5, L5);
+      mzd_process_rows6(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5);
+
+  } else if (kbar>4*k) { 
+      const int rem = kbar%5;
+      const int ka = kbar/5 + ((rem>=4) ? 1 : 0);
+      const int kb = kbar/5 + ((rem>=3) ? 1 : 0);
+      const int kc = kbar/5 + ((rem>=2) ? 1 : 0);
+      const int kd = kbar/5 + ((rem>=1) ? 1 : 0);
+      const int ke = kbar/5;
+
+      mzd_make_table(A, r, c, ka, T0, L0);
+      mzd_make_table(A, r+ka, c, kb, T1, L1);
+      mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
+      mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
+      mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4);
+      mzd_process_rows5(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4);
+      
+    } else if (kbar>3*k) {
       const int rem = kbar%4;
       const int ka = kbar/4 + ((rem>=3) ? 1 : 0);
       const int kb = kbar/4 + ((rem>=2) ? 1 : 0);
       const int kc = kbar/4 + ((rem>=1) ? 1 : 0);
       const int kd = kbar/4;
+
       mzd_make_table(A, r, c, ka, T0, L0);
       mzd_make_table(A, r+ka, c, kb, T1, L1);
       mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
       mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
-      mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);
+      mzd_process_rows4(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);
       
     } else if (kbar>2*k) {
-      int rem = kbar%3;
-      int ka = kbar/3 + ((rem>=2) ? 1 : 0);
-      int kb = kbar/3 + ((rem>=1) ? 1 : 0);
-      int kc = kbar/3;
+      const int rem = kbar%3;
+      const int ka = kbar/3 + ((rem>=2) ? 1 : 0);
+      const int kb = kbar/3 + ((rem>=1) ? 1 : 0);
+      const int kc = kbar/3;
+
       mzd_make_table(A, r, c, ka, T0, L0);
       mzd_make_table(A, r+ka, c, kb, T1, L1);
       mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
-      mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2);
+      mzd_process_rows3(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2);
       
     } else if (kbar>k) {
       const int ka = kbar/2;
       const int kb = kbar - ka;
       mzd_make_table(A, r, c, ka, T0, L0);
       mzd_make_table(A, r+ka, c, kb, T1, L1);
-      mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1);
+      mzd_process_rows2(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1);
       
     } else if(kbar > 0) {
       mzd_make_table(A, r, c, kbar, T0, L0);
-      mzd_process_rows(A, 0, r, c, kbar, T0, L0);
+      mzd_process_rows(A, 0, MIN(r, max_r), c, kbar, T0, L0);
     }
+
     r += kbar;
     c += kbar;
     if(kk!=kbar) {
   m4ri_mm_free(L2);
   mzd_free(T3);
   m4ri_mm_free(L3);
+  mzd_free(T4);
+  m4ri_mm_free(L4);
+  mzd_free(T5);
+  m4ri_mm_free(L5);
+  mzd_free(U);
+  return r;
 }
 
+void mzd_top_echelonize_m4ri(mzd_t *M, int k) {
+  _mzd_top_echelonize_m4ri(M,k,0,0,M->nrows);
+}
+
+
 mzd_t *mzd_invert_m4ri(mzd_t *m, mzd_t *I, int k) {
   mzd_t *big = mzd_concat(NULL, m, I);
   size_t size=m->ncols;
   return C;
 }
 
+
+size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {
+  return _mzd_echelonize_m4ri(A, full, k, 1, 0.15);
+}
+
+size_t mzd_echelonize(mzd_t *A, int full) {
+  return _mzd_echelonize_m4ri(A, full, 0, 1, 0.15);
+}

src/brilliantrussian.h

  *
  * \wordoffset
  *
- * \note This function isn't as optimized as it should be.
  */
 
 void mzd_top_echelonize_m4ri(mzd_t *M, int k);
 
 /**
+ * \brief Given a matrix in upper triangular form compute the reduced
+ * row echelon form of that matrix but only start to do anything for
+ * the pivot at (r,c).
+ * 
+ * \param M Matrix to be reduced.
+ * \param k M4RI parameter, may be 0 for auto-choose.
+ * \param r Row index.
+ * \paran c Column index.
+ *
+ * \param max_r Only clear top max_r rows.
+ * \wordoffset
+ *
+ */
+
+size_t _mzd_top_echelonize_m4ri(mzd_t *A, int k, size_t r, size_t c, size_t max_r);
+
+/**
  * \brief Invert the matrix M using Konrod's method. 
  * 
  * To avoid recomputing the identity matrix over and over again, I may
 
 #define M4RM_GRAY8
 
+/**
+ *
+ */
+
+size_t mzd_echelonize(mzd_t *A, int full);
+
 #endif //BRILLIANTRUSSIAN_H
 
     size_t i, j;
     size_t n1 = (((ncols - 1) / RADIX + 1) >> 1) * RADIX;
+
     mzd_t *A0  = mzd_init_window(A,  0,  0, nrows,    n1);
     mzd_t *A1  = mzd_init_window(A,  0, n1, nrows, ncols);
 

src/packedmatrix.c

    return (int)n;
 }
 
-double mzd_density(mzd_t *A, int res) {
-  long count = 0;
-  long total = 0;
+
+double _mzd_density(mzd_t *A, int res, size_t r, size_t c) {
+  size_t count = 0;
+  size_t total = 0;
   
   if(res == 0)
     res = (int)(A->width/100.0);
     res = 1;
 
   if(A->width == 1) {
-    for(size_t i=0; i<A->nrows; i++)
-      for(size_t j=0; j<A->ncols; j++)
+    for(size_t i=r; i<A->nrows; i++)
+      for(size_t j=c; j<A->ncols; j++)
         if(mzd_read_bit(A, i, j))
           count++;
     return ((double)count)/(A->ncols * A->nrows);
-  } else {
-    for(size_t i=0; i<A->nrows; i++) {
-      word *truerow = A->rows[i];
-      for(size_t j = A->offset; j<RADIX; j++)
-        if(mzd_read_bit(A, i, j))
-          count++;
-      total += (long)RADIX - A->offset;
-      for(size_t j=1; j<A->width-1; j+=res) {
+  }
+
+  for(size_t i=r; i<A->nrows; i++) {
+    word *truerow = A->rows[i];
+    for(size_t j = c; j< RADIX-A->offset; j++)
+      if(mzd_read_bit(A, i, j))
+        count++;
+    total += RADIX - A->offset;
+
+    for(size_t j=MAX(1,((A->offset+c)/RADIX)); j<A->width-1; j+=res) {
         count += m4ri_bitcount(truerow[j]);
         total += RADIX;
-      }
-      for(size_t j = 0; j < (A->offset + A->ncols)%RADIX; j++)
-        if(mzd_read_bit(A, i, j))
-          count++;
-      total += (A->offset + A->ncols)%RADIX;
     }
+    for(size_t j = 0; j < (A->offset + A->ncols)%RADIX; j++)
+      if(mzd_read_bit(A, i, j+ RADIX*((A->offset + A->ncols)/RADIX)))
+        count++;
+    total += (A->offset + A->ncols)%RADIX;
   }
+
   return ((double)count)/(total);
 }
 
+double mzd_density(mzd_t *A, int res) {
+  return _mzd_density(A, res, 0, 0);
+}
 
 size_t mzd_first_zero_row(mzd_t *A) {
   word mask_begin = RIGHT_BITMASK(RADIX-A->offset);

src/packedmatrix.h

 double mzd_density(mzd_t *A, int res);
 
 /**
+ * \brief Return the number of nonzero entries divided by nrows *
+ * ncols considering only the submatrix starting at (r,c).
+ *
+ * If res = 0 then 100 samples per row are made, if res > 0 the
+ * function takes res sized steps within each row (res = 1 uses every
+ * word).
+ *
+ * \param A Matrix
+ * \param res Resolution of sampling
+ * \param r Row to start counting
+ * \param c Column to start counting
+ */
+
+double _mzd_density(mzd_t *A, int res, size_t r, size_t c);
+
+
+/**
  * \brief Return the first row with all zero entries.
  *
  * If no such row can be found returns nrows.