1. CarloWood
  2. M4RI

Commits

Martin Albrecht  committed 7607790

* renamed LQUP functions and filenames to PLS
* added echelonform.[c|h] files, which provide highlevel echelon forms

  • Participants
  • Parent commits 244dcf4
  • Branches default

Comments (0)

Files changed (23)

File Makefile.am

View file
 
 lib_LTLIBRARIES = libm4ri.la
 
-libm4ri_la_SOURCES = src/brilliantrussian.c src/misc.c src/packedmatrix.c src/grayflex.c src/strassen.c src/permutation.c src/trsm.c src/lqup.c src/solve.c src/pluq_mmpf.c
+libm4ri_la_SOURCES = src/brilliantrussian.c src/misc.c src/packedmatrix.c src/grayflex.c src/strassen.c src/permutation.c src/trsm.c src/pls.c src/solve.c src/pls_mmpf.c src/echelonform.c
 
 pkgincludesubdir = $(includedir)/m4ri
-pkgincludesub_HEADERS = src/m4ri.h src/brilliantrussian.h src/misc.h src/packedmatrix.h src/grayflex.h src/strassen.h src/parity.h src/permutation.h src/config.h src/trsm.h src/lqup.h src/solve.h  src/pluq_mmpf.h
+pkgincludesub_HEADERS = src/m4ri.h src/brilliantrussian.h src/misc.h src/packedmatrix.h src/grayflex.h src/strassen.h src/parity.h src/permutation.h src/config.h src/trsm.h src/pls.h src/solve.h src/pls_mmpf.h src/echelonform.h
 
 #libm4ri_la_LDFLAGS = -version-info 0:0:0
-libm4ri_la_LDFLAGS = -release 0.0.20091101
+libm4ri_la_LDFLAGS = -release 0.0.20010628
 
-check_PROGRAMS=test_multiplication test_elimination test_trsm test_lqup test_solve test_kernel
+check_PROGRAMS=test_multiplication test_elimination test_trsm test_pls test_solve test_kernel
 test_multiplication_SOURCES=testsuite/test_multiplication.c
 test_multiplication_LDFLAGS=-lm4ri -lm
 
 test_trsm_SOURCES=testsuite/test_trsm.c
 test_trsm_LDFLAGS=-lm4ri -lm
 
-test_lqup_SOURCES=testsuite/test_lqup.c
-test_lqup_LDFLAGS=-lm4ri -lm
+test_pls_SOURCES=testsuite/test_pluq.c
+test_pls_LDFLAGS=-lm4ri -lm
 
 test_solve_SOURCES=testsuite/test_solve.c
 test_solve_LDFLAGS=-lm4ri -lm
 test_kernel_SOURCES=testsuite/test_kernel.c
 test_kernel_LDFLAGS=-lm4ri -lm
 
-TESTS = test_multiplication test_elimination test_trsm test_lqup test_solve test_kernel
+TESTS = test_multiplication test_elimination test_trsm test_pls test_solve test_kernel

File m4ri.vcproj

View file
-<?xml version="1.0" encoding="UTF-8"?>
+<?xml version="1.0" encoding="UTF-8"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
 	Version="9.00"
 				>
 			</File>
 			<File
-				RelativePath=".\src\lqup.h"
+				RelativePath=".\src\pls.h"
 				>
 			</File>
 			<File
 				>
 			</File>
 			<File
-				RelativePath=".\src\lqup.c"
+				RelativePath=".\src\pls.c"
 				>
 			</File>
 			<File

File src/brilliantrussian.c

View file
 *                 M4RI: Linear Algebra over GF(2)
 *
 *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
-*    Copyright (C) 2008 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+*    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
 *
 *  Distributed under the terms of the GNU General Public License (GPL) 
 *  version 2 or higher.
 
 #include "brilliantrussian.h"
 #include "grayflex.h"
-#include "lqup.h"
+#include "echelonform.h"
 
 /**
  * \brief Perform Gaussian reduction to reduced row echelon form on a
 
   if (heuristic) {
     if (c<A->ncols && r< A->nrows && _mzd_density(A,32, 0, 0) >= threshold) {
-      //#ifndef NDEBUG
-      printf("switching at %zu x %zu (%.5f)\n",r,c,mzd_density(A,32));
-      //#endif
       mzd_t *Abar = mzd_init_window(A, r, (c/RADIX)*RADIX, A->nrows, A->ncols);
       r += mzd_echelonize_pluq(Abar, full);
       mzd_free(Abar);
     if (heuristic && c > (last_check + 256)) {
       last_check = c;
       if (c<A->ncols && r< A->nrows && _mzd_density(A,32, r, c) >= threshold) {
-        //#ifndef NDEBUG
-        printf("switching at %zu x %zu (%.5f)\n",r,c,_mzd_density(A,32,r,c));
-        //#endif
         mzd_t *Abar = mzd_init_window(A, r, (c/RADIX)*RADIX, A->nrows, A->ncols);
         if (!full) {
           r += mzd_echelonize_pluq(Abar, full);
   return C;
 }
 
-
-size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {
-  return _mzd_echelonize_m4ri(A, full, k, 1, 0.15);
-}
-
-size_t mzd_echelonize(mzd_t *A, int full) {
-  return _mzd_echelonize_m4ri(A, full, 0, 1, 0.15);
-}

File src/brilliantrussian.h

View file
  *                 M4RI:  Linear Algebra over GF(2)
  *
  *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
- *    Copyright (C) 2008 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+ *    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
  *
  *  Distributed under the terms of the GNU General Public License (GPL)
  *  version 2 or higher.
                        mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1,
                        mzd_t *T2, size_t *L2, mzd_t *T3, size_t *L3);
 
+
 /**
  * \brief Matrix elimination using the 'Method of the Four Russians'
  * (M4RI).
  *
+ * The M4RI algorithm was proposed in Gregory Bard; Accelerating
+ * Cryptanalysis with the Method of Four Russians; 2006;
+ * http://eprint.iacr.org/2006/251
+ *
+ * Our implementatation is discussed in in Martin Albrecht and Clément
+ * Pernet; Efficient Decomposition of Dense Matrices over GF(2);
+ * http://arxiv.org/abs/1006.1744
+ * 
  * \param M Matrix to be reduced.
  * \param full Return the reduced row echelon form, not only upper triangular form.
  * \param k M4RI parameter, may be 0 for auto-choose.
  * \return Rank of A.
  */
 
-size_t mzd_echelonize_m4ri(mzd_t *M, int full, int k);
+size_t _mzd_echelonize_m4ri(mzd_t *A, const int full, int k, int heuristic, const double threshold);
 
 /**
  * \brief Given a matrix in upper triangular form compute the reduced row
  * row echelon form of that matrix but only start to do anything for
  * the pivot at (r,c).
  * 
- * \param M Matrix to be reduced.
+ * \param A Matrix to be reduced.
  * \param k M4RI parameter, may be 0 for auto-choose.
  * \param r Row index.
- * \paran c Column index.
+ * \param c Column index.
+ * \param max_r Only clear top max_r rows.
  *
- * \param max_r Only clear top max_r rows.
  * \wordoffset
  *
  */
  * \return Inverse of M
  *
  * \note This function allocates a new matrix for the inverse which
- * must be free'd using mzd_free() once not needed anymore.
+ * must be free'd using mzd_free() once it is not needed anymore.
  */
 
 mzd_t *mzd_invert_m4ri(mzd_t *M, mzd_t *I, int k);
 /**
  * Set C to C + AB using Konrod's method.
  *
+ * This is the convenient wrapper function, please see _mzd_mul_m4rm
+ * for authors and implementation details.
+ *
  * \param C Preallocated product matrix, may be NULL for zero matrix.
  * \param A Input matrix A
  * \param B Input matrix B
  * \brief Matrix multiplication using Konrod's method, i.e. compute C such
  * that C == AB.
  * 
- * This is the actual implementation.
+ * This is the actual implementation. 
+ * 
+ * This function is described in Martin Albrecht, Gregory Bard and
+ * William Hart; Efficient Multiplication of Dense Matrices over
+ * GF(2); pre-print available at http://arxiv.org/abs/0811.1714
  *
  * \param C Preallocated product matrix.
  * \param A Input matrix A
  * \param clear clear the matrix C first
  *
  * \author Martin Albrecht -- initial implementation
- * \author William Hart -- block matrix implementation, use of several Gray code tables, general speed-ups
+ * \author William Hart -- block matrix implementation, use of several
+ * Gray code tables, general speed-ups
  *
  * \wordoffset
  *
 
 #define M4RM_GRAY8
 
-/**
- *
- */
-
-size_t mzd_echelonize(mzd_t *A, int full);
-
 #endif //BRILLIANTRUSSIAN_H

File src/echelonform.c

View file
+/*******************************************************************
+*
+*                 M4RI: Linear Algebra over GF(2)
+*
+*    Copyright (C) 2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+*
+*  Distributed under the terms of the GNU General Public License (GPL) 
+*  version 2 or higher.
+*
+*    This code is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*  The full text of the GPL is available at:
+*
+*                  http://www.gnu.org/licenses/
+*
+********************************************************************/
+
+#include "echelonform.h"
+#include "brilliantrussian.h"
+#include "pls.h"
+#include "trsm.h"
+
+size_t mzd_echelonize(mzd_t *A, int full) {
+  return _mzd_echelonize_m4ri(A, full, 0, 1, ECHELONFORM_CROSSOVER_DENSITY);
+}
+
+size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) {
+  return _mzd_echelonize_m4ri(A, full, k, 0, 1.0);
+}
+
+size_t mzd_echelonize_pluq(mzd_t *A, int full) {
+  mzp_t *P = mzp_init(A->nrows);
+  mzp_t *Q = mzp_init(A->ncols);
+
+  size_t r = mzd_pluq(A, P, Q, 0);
+
+  if(full) {
+    mzd_t *U = mzd_init_window(A, 0, 0, r, r);
+    mzd_t *B = mzd_init_window(A, 0, r, r, A->ncols);
+    if(r!=A->ncols) 
+      mzd_trsm_upper_left(U, B, 0);
+    if(r!=0) 
+      mzd_set_ui(U, 0);
+    for(size_t i=0; i<r; i++)
+      mzd_write_bit(A, i, i, 1);
+    mzd_free_window(U);
+    mzd_free_window(B);
+
+  } else {
+    for(size_t i=0; i<r; i++) {
+      for(size_t j=0; j< i; j+=RADIX) {
+        const size_t length = MIN(RADIX, i-j);
+        mzd_clear_bits(A, i, j, length);
+      }
+    }
+  }
+
+  if(r!=0) {
+    mzd_t *A0 = mzd_init_window(A, 0, 0, r, A->ncols);
+    mzd_apply_p_right(A0, Q);
+    mzd_free_window(A0);
+  } else {
+    mzd_apply_p_right(A, Q);
+  }
+
+  if(r!=A->nrows) {
+    mzd_t *R = mzd_init_window(A, r, 0, A->nrows, A->ncols);
+    mzd_set_ui(R, 0);
+    mzd_free_window(R);
+  }
+  
+  mzp_free(P);
+  mzp_free(Q);
+  return r;
+}

File src/echelonform.h

View file
+/**
+ * \file echelonform.h
+ * \brief Row echelon forms
+ *
+ * \author Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+ */
+
+#ifndef ECHELONFORM_H
+#define ECHELONFORM_H
+
+/*******************************************************************
+*
+*                M4RI: Linear Algebra over GF(2)
+*
+*    Copyright (C) 2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+*
+*  Distributed under the terms of the GNU General Public License (GPL)
+*  version 2 or higher.
+*
+*    This code is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*  The full text of the GPL is available at:
+*
+*                  http://www.gnu.org/licenses/
+*
+********************************************************************/
+
+#include "packedmatrix.h"
+
+/**
+ * Density at which we switch to PLS decomposition.
+ */
+
+#define ECHELONFORM_CROSSOVER_DENSITY 0.15
+
+/**
+ * \brief (Reduced) row echelon form.
+ *
+ * This function will 
+ *
+ * \param A Matrix.
+ * \param full Return the reduced row echelon form, not only upper triangular form.
+ *
+ * \wordoffset
+ *
+ * \return Rank of A.
+ */
+
+size_t mzd_echelonize(mzd_t *A, int full);
+
+/**
+ * \brief (Reduced) row echelon form using PLUQ factorisation.
+ *
+ * \param A Matrix.
+ * \param full Return the reduced row echelon form, not only upper triangular form.
+ *
+ * \wordoffset
+ *
+ * \sa mzd_pluq()
+ *
+ * \return Rank of A.
+ */
+
+size_t mzd_echelonize_pluq(mzd_t *A, int full);
+
+/**
+ * \brief Matrix elimination using the 'Method of the Four Russians' (M4RI).
+ *
+ * This is a wrapper function for _mzd_echelonize_m4ri()
+ * 
+ * \param A Matrix to be reduced.
+ * \param full Return the reduced row echelon form, not only upper triangular form.
+ * \param k M4RI parameter, may be 0 for auto-choose.
+ *
+ * \wordoffset
+ *
+ * \sa _mzd_echelonize_m4ri()
+ * 
+ * \return Rank of A.
+ */
+
+size_t mzd_echelonize_m4ri(mzd_t *A, int full, int k);
+
+#endif //ECHELONFORM_H

File src/lqup.c

-/*******************************************************************
-*
-*                 M4RI: Linear Algebra over GF(2)
-*
-*    Copyright (C) 2008 Clement Pernet <clement.pernet@gmail.com>
-*
-*  Distributed under the terms of the GNU General Public License (GPL)
-*  version 2 or higher.
-*
-*    This code is distributed in the hope that it will be useful,
-*    but WITHOUT ANY WARRANTY; without even the implied warranty of
-*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-*    General Public License for more details.
-*
-*  The full text of the GPL is available at:
-*
-*                  http://www.gnu.org/licenses/
-*
-********************************************************************/
-
-#include "misc.h"
-#include "packedmatrix.h"
-#include "trsm.h"
-#include "parity.h"
-#include <stdio.h>
-#include "pluq_mmpf.h"
-#include "strassen.h"
-#include "lqup.h"
-
-size_t mzd_lqup (mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
-  if (P->length != A->nrows)
-    m4ri_die("mzd_lqup: Permutation P length (%d) must match A nrows (%d)\n",P->length, A->nrows);
-  if (Q->length != A->ncols)
-    m4ri_die("mzd_lqup: Permutation Q length (%d) must match A ncols (%d)\n",Q->length, A->ncols);
-  return _mzd_lqup(A, P, Q, cutoff);
-}
-
-size_t mzd_pluq (mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
-  if (P->length != A->nrows)
-    m4ri_die("mzd_pluq: Permutation P length (%d) must match A nrows (%d)\n",P->length, A->nrows);
-  if (Q->length != A->ncols)
-    m4ri_die("mzd_pluq: Permutation Q length (%d) must match A ncols (%d)\n",Q->length, A->ncols);
-  size_t r = _mzd_pluq(A, P, Q, cutoff);
-  return r;
-}
-
-
-size_t _mzd_pluq(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
-  size_t r  = _mzd_lqup(A,P,Q,cutoff);
-  if(r && r<A->nrows) {
-    mzd_t *A0 = mzd_init_window(A,0,0,r,A->ncols);
-    mzd_apply_p_right_trans_tri(A0, Q);
-    mzd_free_window(A0);
-  } else {
-    mzd_apply_p_right_trans_tri(A, Q);
-  }
-  return r;
-}
-
-size_t _mzd_lqup(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
-  size_t ncols = A->ncols;
-
-#if 1
-  size_t nrows = mzd_first_zero_row(A);
-  for(size_t i=nrows; i<A->nrows; i++)
-    P->values[i] = i;
-  for(size_t i=0; i<A->ncols; i++)
-    Q->values[i] = i;
-  if(!nrows) {
-    return 0;
-  }
-#else
-  size_t nrows = A->nrows;
-#endif
-
-  if (ncols <= RADIX || A->width*A->nrows <= LQUP_CUTOFF) {
-/*   if(ncols <= PLUQ_CUTOFF) { */
-    /* this improves data locality and runtime considerably */
-    mzd_t *Abar = mzd_copy(NULL, A);
-    size_t r = _mzd_lqup_mmpf(Abar, P, Q, 0);
-    //size_t r = _mzd_lqup_naive(Abar, P, Q);
-    mzd_copy(A, Abar);
-    mzd_free(Abar);
-    return r;
-  }
-
-  {
-    /* Block divide and conquer algorithm */
-    
-    /*                     n1
-     *   ------------------------------------------
-     *   | A0              | A1                   |
-     *   |                 |                      |
-     *   |                 |                      |
-     *   |                 |                      |
-     *   ------------------------------------------
-     */
-
-    size_t i, j;
-    size_t n1 = (((ncols - 1) / RADIX + 1) >> 1) * RADIX;
-
-    mzd_t *A0  = mzd_init_window(A,  0,  0, nrows,    n1);
-    mzd_t *A1  = mzd_init_window(A,  0, n1, nrows, ncols);
-
-    size_t r1, r2;
-    /* First recursive call */
-
-    mzp_t *P1 = mzp_init_window(P, 0, nrows);
-    mzp_t *Q1 = mzp_init_window(Q, 0, A0->ncols);
-    r1 = _mzd_lqup(A0, P1, Q1, cutoff);
-
-    /*           r1           n1
-     *   ------------------------------------------
-     *   | A00    |           | A01               |
-     *   |        |           |                   |
-     * r1------------------------------------------ 
-     * 
-     *   | A10    |           | A11               |
-     *   |        |           |                   |
-     *   ------------------------------------------
-     */
-
-    mzd_t *A00 = mzd_init_window(A,  0,  0, r1, r1);
-    mzd_t *A10 = mzd_init_window(A, r1,  0, nrows, r1);
-    mzd_t *A01 = mzd_init_window(A,  0, n1, r1, ncols);
-    mzd_t *A11 = mzd_init_window(A, r1, n1, nrows, ncols);
-
-
-    if (r1) {
-      /* Computation of the Schur complement */
-      mzd_apply_p_left(A1, P1);
-      _mzd_trsm_lower_left(A00, A01, cutoff);
-      mzd_addmul(A11, A10, A01, cutoff);
-    }
-    mzp_free_window(P1);
-    mzp_free_window(Q1);
-
-    /* Second recursive call */
-    mzp_t *P2 = mzp_init_window(P, r1, nrows);
-    mzp_t *Q2 = mzp_init_window(Q, n1, ncols);
-
-    r2 = _mzd_lqup(A11, P2, Q2, cutoff);
-
-    /*           n
-     *   -------------------
-     *   |      A0b        |
-     *   |                 |
-     *   r1-----------------
-     *   |      A1b        |
-     *   |                 |
-     *   -------------------
-     */
-
-    /* Update A10 */
-    mzd_apply_p_left(A10, P2);
-
-    /* Update P */
-    for (i = 0; i < nrows - r1; ++i)
-      P2->values[i] += r1;
-    
-    // Update the A0b block (permutation + rotation)
-    for(i=0, j=n1; j<ncols; i++, j++) 
-      Q2->values[i] += n1;
-    
-    for(i=n1, j=r1; i<n1+r2; i++, j++)
-      Q->values[j] = Q->values[i];
-
-    /* Compressing L */
-
-    mzp_t *shift = mzp_init(A->ncols);
-    if (r1 < n1){
-      for (i=r1,j=n1;i<r1+r2;i++,j++){
-	mzd_col_swap_in_rows(A, i, j, i, r1+r2);
-        shift->values[i] = j;
-      }
-    }
-    mzd_apply_p_right_trans_even_capped(A, shift, r1+r2, 0);
-    mzp_free(shift);
-
-    mzp_free_window(Q2);
-    mzp_free_window(P2);
-
-    mzd_free_window(A0);
-    mzd_free_window(A1);
-    mzd_free_window(A00);
-    mzd_free_window(A01);
-    mzd_free_window(A10);
-    mzd_free_window(A11);
-
-    return r1 + r2;
-  }
-}
-
-
-size_t _mzd_pluq_naive(mzd_t *A, mzp_t *P, mzp_t *Q)  {
-  size_t i, j, l, curr_pos;
-  int found;
-
-  curr_pos = 0;
-
-  for (curr_pos = 0; curr_pos < A->ncols; ) {
-    found = 0;
-    /* search for some pivot */
-    for (j = curr_pos; j < A->ncols; j++) {
-      for (i = curr_pos; i< A->nrows; i++ ) {
-	if (mzd_read_bit(A, i, j)) {
-          found = 1;
-          break;
-        }
-      }
-      if(found)
-        break;
-    }
-    
-    if(found) {
-      P->values[curr_pos] = i;
-      Q->values[curr_pos] = j;
-      mzd_row_swap(A, curr_pos, i);
-      mzd_col_swap(A, curr_pos, j);
-          
-      /* clear below but preserve transformation matrix */
-      if (curr_pos +1 < A->ncols){
-	for(l=curr_pos+1; l<A->nrows; l++) {
-	  if (mzd_read_bit(A, l, curr_pos)) {
-	    mzd_row_add_offset(A, l, curr_pos, curr_pos+1);
-	  }
-	}
-      }
-      curr_pos++;
-    } else {
-      break;
-    }
-  }
-  for (i=curr_pos; i<A->nrows; ++i)
-    P->values[i]=i;
-  for (i=curr_pos; i<A->ncols; ++i)
-    Q->values[i]=i;
-  return curr_pos;
-}
- 
-size_t _mzd_lqup_naive(mzd_t *A, mzp_t *P, mzp_t *Q)  {
-  size_t i, j, l, col_pos, row_pos;
-  int found;
-  col_pos = 0;
-  row_pos = 0;
-
-    /* search for some pivot */
-  for (; row_pos<A->nrows && col_pos < A->ncols; ) {
-    found = 0;
-    for (j = col_pos; j < A->ncols; j++) {
-      for (i = row_pos; i< A->nrows; i++ ) {
-	if (mzd_read_bit(A, i, j)) {
-          found = 1;
-          break;
-        }
-      }
-      if(found)
-        break;
-    }
-    if(found) {
-      P->values[row_pos] = i;
-      Q->values[row_pos] = j;
-      mzd_row_swap(A, row_pos, i);
-      //mzd_col_swap(A, curr_pos, j);
-          
-      /* clear below but preserve transformation matrix */
-      if (j+1 < A->ncols){
-	for(l=row_pos+1; l<A->nrows; l++) {
-	  if (mzd_read_bit(A, l, j)) {
-	    mzd_row_add_offset(A, l, row_pos, j+1);
-	  }
-	}
-      }
-      row_pos++;
-      col_pos=j+1;
-    } else {
-      break;
-    }
-  }
-  for (i=row_pos; i<A->nrows; ++i)
-    P->values[i]=i;
-  for (i=row_pos; i<A->ncols; ++i)
-    Q->values[i]=i;
-
-  /* Now compressing L*/
-  for (j = 0; j<row_pos; ++j){
-    if (Q->values[j]>j){
-      // To be optimized by a copy_row function
-      mzd_col_swap_in_rows (A,Q->values[j], j, j, A->nrows);
-    }
-  }
-
-  return row_pos;
-}
- 
-size_t mzd_echelonize_pluq(mzd_t *A, int full) {
-  mzp_t *P = mzp_init(A->nrows);
-  mzp_t *Q = mzp_init(A->ncols);
-
-  size_t r = mzd_pluq(A, P, Q, 0);
-
-  if(full) {
-    mzd_t *U = mzd_init_window(A, 0, 0, r, r);
-    mzd_t *B = mzd_init_window(A, 0, r, r, A->ncols);
-    if(r!=A->ncols) 
-      mzd_trsm_upper_left(U, B, 0);
-    if(r!=0) 
-      mzd_set_ui(U, 0);
-    for(size_t i=0; i<r; i++)
-      mzd_write_bit(A, i, i, 1);
-    mzd_free_window(U);
-    mzd_free_window(B);
-
-  } else {
-    for(size_t i=0; i<r; i++) {
-      for(size_t j=0; j< i; j+=RADIX) {
-        const size_t length = MIN(RADIX, i-j);
-        mzd_clear_bits(A, i, j, length);
-      }
-    }
-  }
-
-  if(r!=0) {
-    mzd_t *A0 = mzd_init_window(A, 0, 0, r, A->ncols);
-    mzd_apply_p_right(A0, Q);
-    mzd_free_window(A0);
-  } else {
-    mzd_apply_p_right(A, Q);
-  }
-
-  if(r!=A->nrows) {
-    mzd_t *R = mzd_init_window(A, r, 0, A->nrows, A->ncols);
-    mzd_set_ui(R, 0);
-    mzd_free_window(R);
-  }
-  
-  mzp_free(P);
-  mzp_free(Q);
-  return r;
-}

File src/lqup.h

-/**
- * \file lqup.h
- *
- * \brief PLUQ matrix decomposition routines.
- *
- * \author Clement Pernet <clement.pernet@gmail.com>
- * 
- * \note This file should be called pluq.h and will be renamed in the
- * future.
- */
-
-
-#ifndef PLUQ_H
-#define PLUQ_H
-/*******************************************************************
-*
-*                 M4RI: Linear Algebra over GF(2)
-*
-*    Copyright (C) 2008 Clement Pernet <clement.pernet@gmail.com>
-*
-*  Distributed under the terms of the GNU General Public License (GPL)
-*  version 2 or higher.
-*
-*    This code is distributed in the hope that it will be useful,
-*    but WITHOUT ANY WARRANTY; without even the implied warranty of
-*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-*    General Public License for more details.
-*
-*  The full text of the GPL is available at:
-*
-*                  http://www.gnu.org/licenses/
-*
-********************************************************************/
-
-#include "misc.h"
-#include "packedmatrix.h"
-
-/**
- * Crossover point for PLUQ factorization.
- */
-
-#define LQUP_CUTOFF MIN(524288,CPU_L2_CACHE>>3)
-
-/**
- * \brief PLUQ matrix decomposition.
- *
- * If (P,L,U,Q) satisfy PLUQ = A, it returns (P, L, U, Q^T).
- *
- * P and Q must be preallocated but they don't have to be
- * identity permutations. If cutoff is zero a value is chosen
- * automatically. It is recommended to set cutoff to zero for most
- * applications.
- *
- * The row echelon form (not reduced) can be read from the upper
- * triangular matrix U. See mzd_echelonize_pluq() for details.
- * 
- * This is the wrapper function including bounds checks. See
- * _mzd_pluq() for implementation details.
- *
- * \param A Input m x n matrix
- * \param P Output row permutation of length m
- * \param Q Output column permutation matrix of length n
- * \param cutoff Minimal dimension for Strassen recursion.
- *
- * \sa _mzd_pluq() _mzd_pluq_mmpf() mzd_echelonize_pluq()
- *
- * \wordoffset
- *
- * \return Rank of A.
- */
-
-size_t mzd_pluq(mzd_t *A, mzp_t *P, mzp_t * Q, const int cutoff);
-
-
-/**
- * \brief LQUP matrix decomposition.
- *
- * Computes the transposed LQUP matrix decomposition using a block
- * recursive algorithm.
- *
- * If (L,Q,U,P) satisfy LQUP = A^T, it returns (L, Q^T, U, P).
- *
- * P and Q must be preallocated but they don't have to be
- * identity permutations. If cutoff is zero a value is chosen
- * automatically. It is recommended to set cutoff to zero for most
- * applications.
- *
- * This is the wrapper function including bounds checks. See
- * _mzd_lqup() for implementation details.
- *
- * \param A Input m x n matrix
- * \param P Output row permutation of length m
- * \param Q Output column permutation matrix of length n
- * \param cutoff Minimal dimension for Strassen recursion.
- *
- * \sa _mzd_lqup() _mzd_pluq() _mzd_pluq_mmpf() mzd_echelonize_pluq()
- *
- * \wordoffset
- *
- * \return Rank of A.
- */
-
-size_t mzd_lqup(mzd_t *A, mzp_t *P, mzp_t * Q, const int cutoff);
-
-/**
- * \brief PLUQ matrix decomposition.
- *
- * See mzd_pluq() for details.
- *
- * \param A Input matrix
- * \param P Output row mzp_t matrix
- * \param Q Output column mzp_t matrix
- * \param cutoff Minimal dimension for Strassen recursion.
- *
- * \sa mzd_pluq()
- *
- * \wordoffset
- * \return Rank of A.
- */
-
-size_t _mzd_pluq(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff);
-
-/**
- * \brief LQUP matrix decomposition.
- *
- * See mzd_lqup() for details.
- *
- * \param A Input matrix
- * \param P Output row mzp_t matrix
- * \param Qt Output column mzp_t matrix
- * \param cutoff Minimal dimension for Strassen recursion.
- *
- * \sa mzd_lqup()
- *
- * \wordoffset
- * \return Rank of A.
- */
-
-size_t _mzd_lqup(mzd_t *A, mzp_t * P, mzp_t * Qt, const int cutoff);
-
-/**
- * \brief PLUQ matrix decomposition (naive base case).
- *
- * See mzd_pluq() for details.
- * 
- * \param A Input matrix
- * \param P Output row mzp_t matrix
- * \param Q Output column mzp_t matrix
- *
- * \sa mzd_pluq()
- *
- * \wordoffset
- * \return Rank of A.
- */
-
-size_t _mzd_pluq_naive(mzd_t *A, mzp_t * P, mzp_t * Q);
-
-/**
- * \brief LQUP matrix decomposition (naive base case).
- *
- * See mzd_lqup() for details.
- * 
- * \param A Input matrix
- * \param P Output row mzp_t matrix
- * \param Qt Output column mzp_t matrix
- *
- * \sa mzd_lqup()
- *
- * \wordoffset
- * \return Rank of A.
- */
-
-size_t _mzd_lqup_naive(mzd_t *A, mzp_t *P, mzp_t *Qt);
-
-/**
- * \brief (Reduced) row echelon form using PLUQ factorisation.
- *
- * \param A Matrix.
- * \param full Return the reduced row echelon form, not only upper triangular form.
- *
- * \wordoffset
- *
- * \sa mzd_pluq()
- *
- * \return Rank of A.
- */
-
-
-size_t mzd_echelonize_pluq(mzd_t *A, int full);
-
-#endif

File src/m4ri.h

View file
 #include "grayflex.h"
 #include "parity.h"
 #include "trsm.h"
-#include "lqup.h"
-#include "pluq_mmpf.h"
+#include "pls.h"
+#include "pls_mmpf.h"
 #include "solve.h"
+#include "echelonform.h"
 
 #ifdef __cplusplus
 }

File src/packedmatrix.h

View file
 *                M4RI: Linear Algebra over GF(2)
 *
 *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
-*    Copyright (C) 2008 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+*    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
 *
 *  Distributed under the terms of the GNU General Public License (GPL)
 *  version 2 or higher.

File src/pls.c

View file
+/*******************************************************************
+*
+*                 M4RI: Linear Algebra over GF(2)
+*
+*    Copyright (C) 2008 Clement Pernet <clement.pernet@gmail.com>
+*
+*  Distributed under the terms of the GNU General Public License (GPL)
+*  version 2 or higher.
+*
+*    This code is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*  The full text of the GPL is available at:
+*
+*                  http://www.gnu.org/licenses/
+*
+********************************************************************/
+
+#include "misc.h"
+#include "packedmatrix.h"
+#include "trsm.h"
+#include "parity.h"
+#include <stdio.h>
+#include "pls_mmpf.h"
+#include "strassen.h"
+#include "pls.h"
+
+size_t mzd_pls (mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
+  if (P->length != A->nrows)
+    m4ri_die("mzd_pls: Permutation P length (%d) must match A nrows (%d)\n",P->length, A->nrows);
+  if (Q->length != A->ncols)
+    m4ri_die("mzd_pls: Permutation Q length (%d) must match A ncols (%d)\n",Q->length, A->ncols);
+  return _mzd_pls(A, P, Q, cutoff);
+}
+
+size_t mzd_pluq (mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
+  if (P->length != A->nrows)
+    m4ri_die("mzd_pluq: Permutation P length (%d) must match A nrows (%d)\n",P->length, A->nrows);
+  if (Q->length != A->ncols)
+    m4ri_die("mzd_pluq: Permutation Q length (%d) must match A ncols (%d)\n",Q->length, A->ncols);
+  size_t r = _mzd_pluq(A, P, Q, cutoff);
+  return r;
+}
+
+
+size_t _mzd_pluq(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
+  size_t r  = _mzd_pls(A,P,Q,cutoff);
+  if(r && r<A->nrows) {
+    mzd_t *A0 = mzd_init_window(A,0,0,r,A->ncols);
+    mzd_apply_p_right_trans_tri(A0, Q);
+    mzd_free_window(A0);
+  } else {
+    mzd_apply_p_right_trans_tri(A, Q);
+  }
+  return r;
+}
+
+size_t _mzd_pls(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff) {
+  size_t ncols = A->ncols;
+
+#if 1
+  size_t nrows = mzd_first_zero_row(A);
+  for(size_t i=nrows; i<A->nrows; i++)
+    P->values[i] = i;
+  for(size_t i=0; i<A->ncols; i++)
+    Q->values[i] = i;
+  if(!nrows) {
+    return 0;
+  }
+#else
+  size_t nrows = A->nrows;
+#endif
+
+  if (ncols <= RADIX || A->width*A->nrows <= PLS_CUTOFF) {
+/*   if(ncols <= PLUQ_CUTOFF) { */
+    /* this improves data locality and runtime considerably */
+    mzd_t *Abar = mzd_copy(NULL, A);
+    size_t r = _mzd_pls_mmpf(Abar, P, Q, 0);
+    //size_t r = _mzd_pls_naive(Abar, P, Q);
+    mzd_copy(A, Abar);
+    mzd_free(Abar);
+    return r;
+  }
+
+  {
+    /* Block divide and conquer algorithm */
+    
+    /*                     n1
+     *   ------------------------------------------
+     *   | A0              | A1                   |
+     *   |                 |                      |
+     *   |                 |                      |
+     *   |                 |                      |
+     *   ------------------------------------------
+     */
+
+    size_t i, j;
+    size_t n1 = (((ncols - 1) / RADIX + 1) >> 1) * RADIX;
+
+    mzd_t *A0  = mzd_init_window(A,  0,  0, nrows,    n1);
+    mzd_t *A1  = mzd_init_window(A,  0, n1, nrows, ncols);
+
+    size_t r1, r2;
+    /* First recursive call */
+
+    mzp_t *P1 = mzp_init_window(P, 0, nrows);
+    mzp_t *Q1 = mzp_init_window(Q, 0, A0->ncols);
+    r1 = _mzd_pls(A0, P1, Q1, cutoff);
+
+    /*           r1           n1
+     *   ------------------------------------------
+     *   | A00    |           | A01               |
+     *   |        |           |                   |
+     * r1------------------------------------------ 
+     * 
+     *   | A10    |           | A11               |
+     *   |        |           |                   |
+     *   ------------------------------------------
+     */
+
+    mzd_t *A00 = mzd_init_window(A,  0,  0, r1, r1);
+    mzd_t *A10 = mzd_init_window(A, r1,  0, nrows, r1);
+    mzd_t *A01 = mzd_init_window(A,  0, n1, r1, ncols);
+    mzd_t *A11 = mzd_init_window(A, r1, n1, nrows, ncols);
+
+
+    if (r1) {
+      /* Computation of the Schur complement */
+      mzd_apply_p_left(A1, P1);
+      _mzd_trsm_lower_left(A00, A01, cutoff);
+      mzd_addmul(A11, A10, A01, cutoff);
+    }
+    mzp_free_window(P1);
+    mzp_free_window(Q1);
+
+    /* Second recursive call */
+    mzp_t *P2 = mzp_init_window(P, r1, nrows);
+    mzp_t *Q2 = mzp_init_window(Q, n1, ncols);
+
+    r2 = _mzd_pls(A11, P2, Q2, cutoff);
+
+    /*           n
+     *   -------------------
+     *   |      A0b        |
+     *   |                 |
+     *   r1-----------------
+     *   |      A1b        |
+     *   |                 |
+     *   -------------------
+     */
+
+    /* Update A10 */
+    mzd_apply_p_left(A10, P2);
+
+    /* Update P */
+    for (i = 0; i < nrows - r1; ++i)
+      P2->values[i] += r1;
+    
+    // Update the A0b block (permutation + rotation)
+    for(i=0, j=n1; j<ncols; i++, j++) 
+      Q2->values[i] += n1;
+    
+    for(i=n1, j=r1; i<n1+r2; i++, j++)
+      Q->values[j] = Q->values[i];
+
+    /* Compressing L */
+
+    mzp_t *shift = mzp_init(A->ncols);
+    if (r1 < n1){
+      for (i=r1,j=n1;i<r1+r2;i++,j++){
+	mzd_col_swap_in_rows(A, i, j, i, r1+r2);
+        shift->values[i] = j;
+      }
+    }
+    mzd_apply_p_right_trans_even_capped(A, shift, r1+r2, 0);
+    mzp_free(shift);
+
+    mzp_free_window(Q2);
+    mzp_free_window(P2);
+
+    mzd_free_window(A0);
+    mzd_free_window(A1);
+    mzd_free_window(A00);
+    mzd_free_window(A01);
+    mzd_free_window(A10);
+    mzd_free_window(A11);
+
+    return r1 + r2;
+  }
+}
+
+
+size_t _mzd_pluq_naive(mzd_t *A, mzp_t *P, mzp_t *Q)  {
+  size_t i, j, l, curr_pos;
+  int found;
+
+  curr_pos = 0;
+
+  for (curr_pos = 0; curr_pos < A->ncols; ) {
+    found = 0;
+    /* search for some pivot */
+    for (j = curr_pos; j < A->ncols; j++) {
+      for (i = curr_pos; i< A->nrows; i++ ) {
+	if (mzd_read_bit(A, i, j)) {
+          found = 1;
+          break;
+        }
+      }
+      if(found)
+        break;
+    }
+    
+    if(found) {
+      P->values[curr_pos] = i;
+      Q->values[curr_pos] = j;
+      mzd_row_swap(A, curr_pos, i);
+      mzd_col_swap(A, curr_pos, j);
+          
+      /* clear below but preserve transformation matrix */
+      if (curr_pos +1 < A->ncols){
+	for(l=curr_pos+1; l<A->nrows; l++) {
+	  if (mzd_read_bit(A, l, curr_pos)) {
+	    mzd_row_add_offset(A, l, curr_pos, curr_pos+1);
+	  }
+	}
+      }
+      curr_pos++;
+    } else {
+      break;
+    }
+  }
+  for (i=curr_pos; i<A->nrows; ++i)
+    P->values[i]=i;
+  for (i=curr_pos; i<A->ncols; ++i)
+    Q->values[i]=i;
+  return curr_pos;
+}
+ 
+size_t _mzd_pls_naive(mzd_t *A, mzp_t *P, mzp_t *Q)  {
+  size_t i, j, l, col_pos, row_pos;
+  int found;
+  col_pos = 0;
+  row_pos = 0;
+
+    /* search for some pivot */
+  for (; row_pos<A->nrows && col_pos < A->ncols; ) {
+    found = 0;
+    for (j = col_pos; j < A->ncols; j++) {
+      for (i = row_pos; i< A->nrows; i++ ) {
+	if (mzd_read_bit(A, i, j)) {
+          found = 1;
+          break;
+        }
+      }
+      if(found)
+        break;
+    }
+    if(found) {
+      P->values[row_pos] = i;
+      Q->values[row_pos] = j;
+      mzd_row_swap(A, row_pos, i);
+      //mzd_col_swap(A, curr_pos, j);
+          
+      /* clear below but preserve transformation matrix */
+      if (j+1 < A->ncols){
+	for(l=row_pos+1; l<A->nrows; l++) {
+	  if (mzd_read_bit(A, l, j)) {
+	    mzd_row_add_offset(A, l, row_pos, j+1);
+	  }
+	}
+      }
+      row_pos++;
+      col_pos=j+1;
+    } else {
+      break;
+    }
+  }
+  for (i=row_pos; i<A->nrows; ++i)
+    P->values[i]=i;
+  for (i=row_pos; i<A->ncols; ++i)
+    Q->values[i]=i;
+
+  /* Now compressing L*/
+  for (j = 0; j<row_pos; ++j){
+    if (Q->values[j]>j){
+      // To be optimized by a copy_row function
+      mzd_col_swap_in_rows (A,Q->values[j], j, j, A->nrows);
+    }
+  }
+
+  return row_pos;
+}
+
+

File src/pls.h

View file
+/**
+ * \file pls.h
+ *
+ * \brief PLS and PLUQ matrix decomposition routines.
+ *
+ * \author Clement Pernet <clement.pernet@gmail.com>
+ * 
+ */
+
+#ifndef PLUQ_H
+#define PLUQ_H
+/*******************************************************************
+*
+*                 M4RI: Linear Algebra over GF(2)
+*
+*    Copyright (C) 2008, 2009 Clement Pernet <clement.pernet@gmail.com>
+*
+*  Distributed under the terms of the GNU General Public License (GPL)
+*  version 2 or higher.
+*
+*    This code is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*  The full text of the GPL is available at:
+*
+*                  http://www.gnu.org/licenses/
+*
+********************************************************************/
+
+#include "misc.h"
+#include "packedmatrix.h"
+#include "permutation.h"
+
+/**
+ * Crossover point for PLUQ factorization.
+ */
+
+#define PLS_CUTOFF MIN(524288,CPU_L2_CACHE>>3)
+
+/**
+ * \brief PLUQ matrix decomposition.
+ *
+ * Returns (P,L,U,Q) satisfying PLUQ = A where P and Q are two
+ * permutation matrices, of dimension respectively m x m and n x n, L
+ * is m x r unit lower triangular and U is r x n upper triangular.
+ *
+ * P and Q must be preallocated but they don't have to be
+ * identity permutations. If cutoff is zero a value is chosen
+ * automatically. It is recommended to set cutoff to zero for most
+ * applications.
+ *
+ * The row echelon form (not reduced) can be read from the upper
+ * triangular matrix U. See mzd_echelonize_pluq() for details.
+ * 
+ * This is the wrapper function including bounds checks. See
+ * _mzd_pluq() for implementation details.
+ *
+ * \param A Input m x n matrix
+ * \param P Output row permutation of length m
+ * \param Q Output column permutation matrix of length n
+ * \param cutoff Minimal dimension for Strassen recursion.
+ *
+ * \sa _mzd_pluq() _mzd_pluq_mmpf() mzd_echelonize_pluq()
+ *
+ * \wordoffset
+ *
+ * \return Rank of A.
+ */
+
+size_t mzd_pluq(mzd_t *A, mzp_t *P, mzp_t * Q, const int cutoff);
+
+
+/**
+ * \brief PLS matrix decomposition.
+ *
+ * Computes the PLS matrix decomposition using a block recursive
+ * algorithm.
+ *
+ * Returns (P,L,S) satisfying PLS = A where P is a permutation matrix
+ * of dimension m x m, L is m x r unit lower triangular and S is an r
+ * x n matrix which is upper triangular except that its columns are
+ * permuted, that is S = UQ for U r x n upper triangular and Q is a n
+ * x n permutation matrix. The matrix L and S are stored in place over
+ * A.
+ *
+ * P and Q must be preallocated but they don't have to be
+ * identity permutations. If cutoff is zero a value is chosen
+ * automatically. It is recommended to set cutoff to zero for most
+ * applications.
+ *
+ * This is the wrapper function including bounds checks. See
+ * _mzd_pls() for implementation details.
+ *
+ * \param A Input m x n matrix
+ * \param P Output row permutation of length m
+ * \param Q Output column permutation matrix of length n
+ * \param cutoff Minimal dimension for Strassen recursion.
+ *
+ * \sa _mzd_pls() _mzd_pluq() _mzd_pluq_mmpf() mzd_echelonize_pluq()
+ *
+ * \wordoffset
+ *
+ * \return Rank of A.
+ */
+
+size_t mzd_pls(mzd_t *A, mzp_t *P, mzp_t * Q, const int cutoff);
+
+/**
+ * \brief PLUQ matrix decomposition.
+ *
+ * See mzd_pluq() for details.
+ *
+ * \param A Input matrix
+ * \param P Output row mzp_t matrix
+ * \param Q Output column mzp_t matrix
+ * \param cutoff Minimal dimension for Strassen recursion.
+ *
+ * \sa mzd_pluq()
+ *
+ * \wordoffset
+ * \return Rank of A.
+ */
+
+size_t _mzd_pluq(mzd_t *A, mzp_t * P, mzp_t * Q, const int cutoff);
+
+/**
+ * \brief PLS matrix decomposition.
+ *
+ * See mzd_pls() for details.
+ *
+ * \param A Input matrix
+ * \param P Output row mzp_t matrix
+ * \param Qt Output column mzp_t matrix
+ * \param cutoff Minimal dimension for Strassen recursion.
+ *
+ * \sa mzd_pls()
+ *
+ * \wordoffset
+ * \return Rank of A.
+ */
+
+size_t _mzd_pls(mzd_t *A, mzp_t * P, mzp_t * Qt, const int cutoff);
+
+/**
+ * \brief PLUQ matrix decomposition (naive base case).
+ *
+ * See mzd_pluq() for details.
+ * 
+ * \param A Input matrix
+ * \param P Output row mzp_t matrix
+ * \param Q Output column mzp_t matrix
+ *
+ * \sa mzd_pluq()
+ *
+ * \wordoffset
+ * \return Rank of A.
+ */
+
+size_t _mzd_pluq_naive(mzd_t *A, mzp_t * P, mzp_t * Q);
+
+/**
+ * \brief PLS matrix decomposition (naive base case).
+ *
+ * See mzd_pls() for details.
+ * 
+ * \param A Input matrix
+ * \param P Output row mzp_t matrix
+ * \param Qt Output column mzp_t matrix
+ *
+ * \sa mzd_pls()
+ *
+ * \wordoffset
+ * \return Rank of A.
+ */
+
+size_t _mzd_pls_naive(mzd_t *A, mzp_t *P, mzp_t *Qt);
+
+#endif

File src/pls_mmpf.c

View file
+/*******************************************************************
+*
+*                 M4RI: Linear Algebra over GF(2)
+*
+*    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+*
+*  Distributed under the terms of the GNU General Public License (GPL) 
+*  version 2 or higher.
+*
+*    This code is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*  The full text of the GPL is available at:
+*
+*                  http://www.gnu.org/licenses/
+*
+********************************************************************/
+
+#include <assert.h>
+
+#include "misc.h"
+
+#ifdef HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+#include "pls_mmpf.h"
+#include "brilliantrussian.h"
+#include "grayflex.h"
+
+static inline size_t _max_value(size_t *data, size_t length) {
+  size_t max = 0;
+  for(size_t i=0; i<length; i++) {
+    max = MAX(max, data[i]);
+  }
+  return max;
+}
+
+size_t _mzd_pls_submatrix(mzd_t *A, size_t start_row, size_t stop_row, size_t start_col, int k, mzp_t *P, mzp_t *Q, size_t *done, size_t *done_row)  {
+  size_t i, l, curr_pos;
+  int found;
+
+  word bm[4*MAXKAY];
+  size_t os[4*MAXKAY];
+
+  for(curr_pos=0; curr_pos < (size_t)k; curr_pos++) {
+    os[curr_pos] = (start_col+curr_pos)/RADIX;
+    bm[curr_pos] = ONE<<(RADIX-(start_col+curr_pos)%RADIX-1);
+    found = 0;
+    /* search for some pivot */
+    for(i = start_row + curr_pos; i < stop_row; i++) {
+      const word tmp = mzd_read_bits(A, i, start_col, curr_pos+1);
+      if(tmp) {
+        word *Arow = A->rows[i];
+        /* clear before but preserve transformation matrix */
+        for (l=0; l<curr_pos; l++)
+          if(done[l] < i) {
+            if(Arow[os[l]] & bm[l])
+              mzd_row_add_offset(A, i, start_row + l, start_col + l + 1);
+            done[l] = i; /* encode up to which row we added for l already */
+          }
+        if(mzd_read_bit(A, i, start_col + curr_pos)) {
+          found = 1;
+          break;
+        }
+      }
+    }
+    if(!found) {
+      break;
+    }
+
+    P->values[start_row + curr_pos] = i;
+    mzd_row_swap(A, i, start_row + curr_pos);
+
+    Q->values[start_row + curr_pos] = start_col + curr_pos;
+    done[curr_pos] = i;
+  }
+  
+  /* finish submatrix */
+  *done_row = _max_value(done, curr_pos);
+  for(size_t c2=0; c2<curr_pos && start_col + c2 < A->ncols -1; c2++)
+    for(size_t r2=done[c2]+1; r2<=*done_row; r2++)
+      if(mzd_read_bit(A, r2, start_col + c2))
+        mzd_row_add_offset(A, r2, start_row + c2, start_col + c2 + 1);
+  return curr_pos;
+}
+
+/* create a table of all 2^k linear combinations */
+void mzd_make_table_pls( mzd_t *M, size_t r, size_t c, int k, mzd_t *T, size_t *L) {
+  assert(T->blocks[1].size == 0);
+  const size_t blockoffset= c/RADIX;
+  size_t i, rowneeded;
+  size_t twokay= TWOPOW(k);
+  size_t wide = T->width - blockoffset;
+
+  word *ti, *ti1, *m;
+
+  ti1 = T->rows[0] + blockoffset;
+  ti = ti1 + T->width;
+#ifdef HAVE_SSE2
+  unsigned long incw = 0;
+  if (T->width & 1) incw = 1;
+  ti += incw;
+#endif
+
+  L[0]=0;
+  for (i=1; i<twokay; i++) {
+    rowneeded = r + codebook[k]->inc[i-1];
+    m = M->rows[rowneeded] + blockoffset;
+
+    /* Duff's device loop unrolling */
+    register int n = (wide + 7) / 8;
+    switch (wide % 8) {
+    case 0: do { *(ti++) = *(m++) ^ *(ti1++);
+    case 7:      *(ti++) = *(m++) ^ *(ti1++);
+    case 6:      *(ti++) = *(m++) ^ *(ti1++);
+    case 5:      *(ti++) = *(m++) ^ *(ti1++);
+    case 4:      *(ti++) = *(m++) ^ *(ti1++);
+    case 3:      *(ti++) = *(m++) ^ *(ti1++);
+    case 2:      *(ti++) = *(m++) ^ *(ti1++);
+    case 1:      *(ti++) = *(m++) ^ *(ti1++);
+      } while (--n > 0);
+    }
+#ifdef HAVE_SSE2
+    ti+=incw; ti1+=incw;
+#endif
+    ti += blockoffset;
+    ti1 += blockoffset;
+
+    /* U is a basis but not the canonical basis, so we need to read what
+       element we just created from T*/
+    L[(int)mzd_read_bits(T,i,c,k)] = i;
+    
+  }
+  /* We need fix the table to update the transformation matrix
+     correctly; e.g. if the first row has [1 0 1] and we clear a row
+     below with [1 0 1] we need to encode that this row is cleared by
+     adding the first row only ([1 0 0]).*/
+  for(i=1; i < twokay; i++) {
+    const word correction = (word)codebook[k]->ord[i];
+    mzd_xor_bits(T, i,c, k, correction);
+  }
+}
+
+void mzd_process_rows2_pls(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1) {
+  size_t r;
+  const int ka = k/2;
+  const int kb = k-k/2;
+  const size_t blocknuma=startcol/RADIX;
+  const size_t blocknumb=(startcol+ka)/RADIX;
+  const size_t blockoffset = blocknumb - blocknuma;
+  size_t wide = M->width - blocknuma;
+
+  if(wide < 3) {
+    mzd_process_rows(M, startrow, stoprow, startcol, ka, T0, L0);
+    mzd_process_rows(M, startrow, stoprow, startcol + ka, kb, T1, L1);
+    return;
+  }
+
+  wide -= 2;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
+#endif
+  for(r=startrow; r<stoprow; r++) {
+    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka) ];
+    word *t0 = T0->rows[x0] + blocknuma;
+    word *m0 = M->rows[r+0] + blocknuma;
+    m0[0] ^= t0[0];
+    m0[1] ^= t0[1];
+    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb) ];
+    word *t1 = T1->rows[x1] + blocknumb;
+    for(size_t i=blockoffset; i<2; i++) {
+      m0[i] ^= t1[i-blockoffset];
+    }
+
+    t0+=2;
+    t1+=2-blockoffset;
+    m0+=2;
+
+    register int n = (wide + 7) / 8;
+    switch (wide % 8) {
+    case 0: do { *m0++ ^= *t0++ ^ *t1++;
+      case 7:    *m0++ ^= *t0++ ^ *t1++;
+      case 6:    *m0++ ^= *t0++ ^ *t1++;
+      case 5:    *m0++ ^= *t0++ ^ *t1++;
+      case 4:    *m0++ ^= *t0++ ^ *t1++;
+      case 3:    *m0++ ^= *t0++ ^ *t1++;
+      case 2:    *m0++ ^= *t0++ ^ *t1++;
+      case 1:    *m0++ ^= *t0++ ^ *t1++;
+      } while (--n > 0);
+    }
+  }
+}
+
+void mzd_process_rows3_pls(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1, mzd_t *T2, size_t *L2) {
+  size_t r;
+
+  const int rem = k%3;
+  const int ka = k/3 + ((rem>=2) ? 1 : 0);
+  const int kb = k/3 + ((rem>=1) ? 1 : 0);
+  const int kc = k/3;
+  const size_t blocknuma=startcol/RADIX;
+  const size_t blocknumb=(startcol+ka)/RADIX;
+  const size_t blocknumc=(startcol+ka+kb)/RADIX;
+  const size_t blockoffsetb = blocknumb - blocknuma;
+  const size_t blockoffsetc = blocknumc - blocknuma;
+  size_t wide = M->width - blocknuma;
+
+  if(wide < 4) {
+    mzd_process_rows(M, startrow, stoprow, startcol, ka, T0, L0);
+    mzd_process_rows(M, startrow, stoprow, startcol + ka, kb, T1, L1);
+    mzd_process_rows(M, startrow, stoprow, startcol + ka + kb, kc, T2, L2);
+    return;
+  }
+
+  wide -= 3;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
+#endif
+  for(r=startrow; r<stoprow; r++) {
+    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka) ];
+    word *t0 = T0->rows[x0] + blocknuma;
+    word *m0 = M->rows[r] + blocknuma;
+    m0[0] ^= t0[0];
+    m0[1] ^= t0[1];
+    m0[2] ^= t0[2];
+
+    t0+=3;
+
+    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb) ];
+    word *t1 = T1->rows[x1] + blocknumb;
+    for(size_t i=blockoffsetb; i<3; i++) {
+      m0[i] ^= t1[i-blockoffsetb];
+    }
+    t1+=3-blockoffsetb;
+
+    const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc) ];
+    word *t2 = T2->rows[x2] + blocknumc;
+    for(size_t i=blockoffsetc; i<3; i++) {
+      m0[i] ^= t2[i-blockoffsetc];
+    }
+    t2+=3-blockoffsetc;
+
+    m0+=3;
+
+    register int n = (wide + 7) / 8;
+    switch (wide % 8) {
+    case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 7:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 6:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 5:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 4:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 3:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 2:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      case 1:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
+      } while (--n > 0);
+    }
+  }
+}
+
+void mzd_process_rows4_pls(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1, mzd_t *T2, size_t *L2, mzd_t *T3, size_t *L3) {
+  size_t r;
+
+  const int rem = k%4;
+  const int ka = k/4 + ((rem>=3) ? 1 : 0);
+  const int kb = k/4 + ((rem>=2) ? 1 : 0);
+  const int kc = k/4 + ((rem>=1) ? 1 : 0);
+  const int kd = k/4;
+  const size_t blocknuma=startcol/RADIX;
+  const size_t blocknumb=(startcol+ka)/RADIX;
+  const size_t blocknumc=(startcol+ka+kb)/RADIX;
+  const size_t blocknumd=(startcol+ka+kb+kc)/RADIX;
+  const size_t blockoffsetb = blocknumb - blocknuma;
+  const size_t blockoffsetc = blocknumc - blocknuma;
+  const size_t blockoffsetd = blocknumd - blocknuma;
+  size_t wide = M->width - blocknuma;
+
+  if(wide < 5) {
+    mzd_process_rows(M, startrow, stoprow, startcol, ka, T0, L0);
+    mzd_process_rows(M, startrow, stoprow, startcol + ka, kb, T1, L1);
+    mzd_process_rows(M, startrow, stoprow, startcol + ka + kb, kc, T2, L2);
+    mzd_process_rows(M, startrow, stoprow, startcol + ka + kb + kc, kd, T3, L3);
+    return;
+  }
+  wide -= 4;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
+#endif
+  for(r=startrow; r<stoprow; r++) {
+    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka) ];
+    word *t0 = T0->rows[x0] + blocknuma;
+    word *m0 = M->rows[r] + blocknuma;
+    m0[0] ^= t0[0];
+    m0[1] ^= t0[1];
+    m0[2] ^= t0[2];
+    m0[3] ^= t0[3];
+
+    t0+=4;
+
+    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb) ];
+    word *t1 = T1->rows[x1] + blocknumb;
+    for(size_t i=blockoffsetb; i<4; i++) {
+      m0[i] ^= t1[i-blockoffsetb];
+    }
+    t1+=4-blockoffsetb;
+
+    const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc) ];
+    word *t2 = T2->rows[x2] + blocknumc;
+    for(size_t i=blockoffsetc; i<4; i++) {
+      m0[i] ^= t2[i-blockoffsetc];
+    }
+    t2+=4-blockoffsetc;
+
+    const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd) ];
+    word *t3 = T3->rows[x3] + blocknumd;
+    for(size_t i=blockoffsetd; i<4; i++) {
+      m0[i] ^= t3[i-blockoffsetd];
+    }
+    t3+=4-blockoffsetd;
+
+    m0+=4;
+
+    register int n = (wide + 7) / 8;
+    switch (wide % 8) {
+    case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 7:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 6:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 5:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 4:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 3:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 2:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      case 1:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
+      } while (--n > 0);
+    }
+  }
+}
+
+/* extract U from A for table creation */
+mzd_t *_mzd_pls_to_u(mzd_t *U, mzd_t *A, size_t r, size_t c, int k) {
+  /* this function call is now rather cheap, but it could be avoided
+     completetly if needed */
+  assert(U->offset == 0);
+  assert(A->offset == 0);
+  size_t i, j;
+  size_t startcol = (c/RADIX)*RADIX;
+  mzd_submatrix(U, A, r, 0, r+k, A->ncols);
+
+  for(i=0; i<(size_t)k; i++)
+    for(j=startcol; j<c+i; j++) 
+      mzd_write_bit(U, i, j,  0);
+  return U;
+}
+
+/* method of many people factorisation */
+size_t _mzd_pls_mmpf(mzd_t *A, mzp_t * P, mzp_t * Q, int k) {
+  assert(A->offset == 0);
+  const size_t nrows = A->nrows;//mzd_first_zero_row(A);
+  const size_t ncols = A->ncols; 
+  size_t curr_row = 0;
+  size_t curr_col = 0;
+  size_t kbar = 0;
+  size_t done_row = 0;
+
+  if(k == 0) {
+    k = m4ri_opt_k(nrows, ncols, 0);
+    if (k>=7)
+      k = 7;
+    if ( (4*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 )
+      k -= 1;
+  }
+
+  int kk = 4*k;
+
+  for(size_t i = 0; i<ncols; i++) 
+    Q->values[i] = i;
+
+  for(size_t i = 0; i<A->nrows; i++)
+    P->values[i] = i;
+
+  mzd_t *T0 = mzd_init(TWOPOW(k), ncols);
+  mzd_t *T1 = mzd_init(TWOPOW(k), ncols);
+  mzd_t *T2 = mzd_init(TWOPOW(k), ncols);
+  mzd_t *T3 = mzd_init(TWOPOW(k), ncols);
+  mzd_t *U = mzd_init(kk, ncols);
+
+  size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
+  size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
+  size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
+  size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
+  size_t *done = (size_t *)m4ri_mm_malloc(kk * sizeof(size_t));
+
+  while(curr_col < ncols && curr_row < nrows) {
+    if(curr_col + kk > ncols)
+      kk = ncols - curr_col;
+
+    /* 1. compute LQUP factorisation for a kxk submatrix */
+    kbar = _mzd_pls_submatrix(A, curr_row, nrows, curr_col, kk, P, Q, done, &done_row);
+
+    /* 2. extract U */
+    _mzd_pls_to_u(U, A, curr_row, curr_col, kbar);
+
+    if(kbar > (size_t)3*k) {
+      const int rem = kbar%4;
+  
+      const int ka = kbar/4 + ((rem>=3) ? 1 : 0);
+      const int kb = kbar/4 + ((rem>=2) ? 1 : 0);
+      const int kc = kbar/4 + ((rem>=1) ? 1 : 0);
+      const int kd = kbar/4;
+
+      if (kbar==kk) {
+        /* 2. generate table T */
+        mzd_make_table_pls(U, 0,          curr_col,          ka, T0, L0);
+        mzd_make_table_pls(U, 0+ka,       curr_col+ka,       kb, T1, L1);
+        mzd_make_table_pls(U, 0+ka+kb,    curr_col+ka+kb,    kc, T2, L2);
+        mzd_make_table_pls(U, 0+ka+kb+kc, curr_col+ka+kb+kc, kd, T3, L3);
+        /* 3. use that table to process remaining rows below */
+        mzd_process_rows4_pls(A, done_row + 1, nrows, curr_col, kbar, T0, L0, T1, L1, T2, L2, T3, L3);
+      } else {
+        curr_col += 1; 
+      }
+
+    } else if(kbar > (size_t)2*k) {
+      const int rem = kbar%3;
+
+      const int ka = kbar/3 + ((rem>=2) ? 1 : 0);
+      const int kb = kbar/3 + ((rem>=1) ? 1 : 0);
+      const int kc = kbar/3;
+
+      if (kbar==kk) {
+        /* 2. generate table T */
+        mzd_make_table_pls(U, 0,       curr_col,       ka, T0, L0);
+        mzd_make_table_pls(U, 0+ka,    curr_col+ka,    kb, T1, L1);
+        mzd_make_table_pls(U, 0+ka+kb, curr_col+ka+kb, kc, T2, L2);
+        /* 3. use that table to process remaining rows below */
+        mzd_process_rows3_pls(A, done_row + 1, nrows, curr_col, kbar, T0, L0, T1, L1, T2, L2);
+      } else {
+        curr_col += 1; 
+      }
+
+    } else if(kbar > (size_t)k) {
+      const int ka = kbar/2;
+      const int kb = kbar - ka;
+
+      if(kbar==kk) {
+        /* 2. generate table T */
+        mzd_make_table_pls(U, 0,    curr_col,    ka, T0, L0);
+        mzd_make_table_pls(U, 0+ka, curr_col+ka, kb, T1, L1);
+        /* 3. use that table to process remaining rows below */
+        mzd_process_rows2_pls(A, done_row + 1, nrows, curr_col, kbar, T0, L0, T1, L1);
+      } else {
+        curr_col += 1; 
+      }
+
+    } else if(kbar > 0) {
+
+      if(kbar==kk) {
+        /* 2. generate table T */
+        mzd_make_table_pls(U, 0, curr_col, kbar, T0, L0);
+        /* 3. use that table to process remaining rows below */
+        mzd_process_rows(A, done_row + 1, nrows, curr_col, kbar, T0, L0);
+      } else {
+        curr_col += 1; 
+      }
+
+    } else {
+      curr_col += 1;
+      size_t i = curr_row;
+      size_t j = curr_col;
+      int found = mzd_find_pivot(A, curr_row, curr_col, &i, &j);
+      if(found) {
+        P->values[curr_row] = i;
+        Q->values[curr_row] = j;
+        mzd_row_swap(A, curr_row, i);
+        const size_t wrd = j/RADIX;
+        const word bm = ONE<<(RADIX-(j%RADIX)-1);
+        for(size_t l = curr_row+1; l<nrows; l++)
+          if(A->rows[l][wrd] & bm)
+            mzd_row_add_offset(A, l, curr_row, j + 1);
+        curr_col = j + 1;
+        curr_row++;
+      } else {
+        break;
+      }
+    }
+    curr_col += kbar;
+    curr_row += kbar;
+    if (kbar > 0)
+      if (kbar == kk && kk < 4*k)
+        kk = kbar + 1;
+      else
+        kk = kbar;
+    else if(kk>2)
+      kk = kk/2;
+  }
+
+  /* Now compressing L*/
+  for (size_t j = 0; j<curr_row; ++j){
+    if (Q->values[j]>j) {
+      mzd_col_swap_in_rows(A,Q->values[j], j, j, curr_row);
+    }
+  }
+  mzp_t *Qbar = mzp_init_window(Q,0,curr_row);
+  mzd_apply_p_right_trans_even_capped(A, Qbar, curr_row, 0);
+  mzp_free_window(Qbar);
+
+  mzd_free(U);
+  mzd_free(T0);
+  mzd_free(T1);
+  mzd_free(T2);
+  mzd_free(T3);
+  m4ri_mm_free(L0);
+  m4ri_mm_free(L1);
+  m4ri_mm_free(L2);
+  m4ri_mm_free(L3);
+  m4ri_mm_free(done);
+  return curr_row;
+}
+
+size_t _mzd_pluq_mmpf(mzd_t *A, mzp_t * P, mzp_t * Q, const int k) {
+  size_t r  = _mzd_pls_mmpf(A,P,Q,k);
+  mzd_apply_p_right_trans_tri(A, Q);
+  return r;
+}

File src/pls_mmpf.h

View file
+/**
+ * \file pls_mmpf.h
+ * \brief PLS and PLUQ factorization using Gray codes.
+ *
+ * \author Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+ * 
+ * \example testsuite/test_pluq.c
+ */
+
+#ifndef LQUP_MMPF_H
+#define LQUP_MMPF_H
+ /*******************************************************************
+ *
+ *                 M4RI:  Linear Algebra over GF(2)
+ *
+ *    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
+ *
+ *  Distributed under the terms of the GNU General Public License (GPL)
+ *  version 2 or higher.
+ *
+ *    This code is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *    General Public License for more details.
+ *
+ *  The full text of the GPL is available at:
+ *
+ *                  http://www.gnu.org/licenses/
+ *
+ ********************************************************************/
+
+#include "packedmatrix.h"
+#include "permutation.h"
+
+/**
+ * \brief PLS matrix decomposition of A using Gray codes.
+ *
+ * Returns (P,L,S) satisfying PLS = A where P is a permutation matrix
+ * of dimension m x m, L is m x r unit lower triangular and S is an r
+ * x n matrix which is upper triangular except that its columns are
+ * permuted, that is S = UQ for U r x n upper triangular and Q is a n
+ * x n permutation matrix. The matrix L and S are stored in place over
+ * A.
+ *
+ * \param A Matrix.
+ * \param P Preallocated row permutation.
+ * \param Q Preallocated column permutation.
+ * \param k Size of Gray code tables.
+ *
+ * \wordoffset
+ *
+ * \return Rank of A.
+ */
+
+size_t _mzd_pls_mmpf(mzd_t *A, mzp_t * P, mzp_t * Q, int k);
+
+/**
+ * \brief PLUQ matrix decomposition of A using Gray codes.
+ *
+ * Returns (P,L,U,Q) satisfying PLUQ = A where P and Q are two
+ * permutation matrices, of dimension respectively m x m and n x n, L
+ * is m x r unit lower triangular and U is r x n upper triangular.
+ *
+ * \param A Matrix.
+ * \param P Preallocated row permutation.
+ * \param Q Preallocated column permutation.
+ * \param k Size of Gray code tables.
+ *
+ * \wordoffset
+ *
+ * \return Rank of A.
+ */
+
+size_t _mzd_pluq_mmpf(mzd_t *A, mzp_t * P, mzp_t * Q, int k);
+
+
+/**
+ * \brief PLS matrix decomposition of a submatrix for up to k columns
+ * starting at (r,c).
+ *
+ * Updates P and Q and modifies A in place. The buffer done afterwards
+ * holds how far a particular row was already eliminated.
+ *
+ * \param A Matrix.
+ * \param start_row Row Offset.
+ * \param stop_row Up to which row the matrix should be processed (exclusive).
+ * \param start_col Column Offset.
+ * \param k Size of Gray code tables.
+ * \param P Preallocated row permutation.
+ * \param Q Preallocated column permutation.
+ * \param done Preallocated temporary buffer.
+ * \param done_row Stores the last row which is already reduced processed after function execution.
+ *
+ * \retval kbar Maximum k for which a pivot could be found.
+ */
+
+size_t _mzd_pls_submatrix(mzd_t *A, size_t start_row, size_t stop_row, size_t start_col, int k, mzp_t *P, mzp_t *Q, size_t *done, size_t *done_row);
+
+#endif //LQUP_MMPF_H

File src/pluq_mmpf.c

-/*******************************************************************
-*
-*                 M4RI: Linear Algebra over GF(2)
-*
-*    Copyright (C) 2008 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
-*
-*  Distributed under the terms of the GNU General Public License (GPL) 
-*  version 2 or higher.
-*
-*    This code is distributed in the hope that it will be useful,
-*    but WITHOUT ANY WARRANTY; without even the implied warranty of
-*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-*    General Public License for more details.
-*
-*  The full text of the GPL is available at:
-*
-*                  http://www.gnu.org/licenses/
-*
-********************************************************************/
-
-#include <assert.h>
-
-#include "misc.h"
-
-#ifdef HAVE_SSE2
-#include <emmintrin.h>
-#endif
-
-#include "pluq_mmpf.h"
-#include "brilliantrussian.h"
-#include "grayflex.h"
-
-static inline size_t _max_value(size_t *data, size_t length) {
-  size_t max = 0;
-  for(size_t i=0; i<length; i++) {
-    max = MAX(max, data[i]);
-  }
-  return max;
-}
-
-size_t _mzd_lqup_submatrix(mzd_t *A, size_t start_row, size_t stop_row, size_t start_col, int k, mzp_t *P, mzp_t *Q, size_t *done, size_t *done_row)  {
-  size_t i, l, curr_pos;
-  int found;
-
-  word bm[4*MAXKAY];
-  size_t os[4*MAXKAY];
-
-  for(curr_pos=0; curr_pos < (size_t)k; curr_pos++) {
-    os[curr_pos] = (start_col+curr_pos)/RADIX;
-    bm[curr_pos] = ONE<<(RADIX-(start_col+curr_pos)%RADIX-1);
-    found = 0;
-    /* search for some pivot */
-    for(i = start_row + curr_pos; i < stop_row; i++) {
-      const word tmp = mzd_read_bits(A, i, start_col, curr_pos+1);
-      if(tmp) {
-        word *Arow = A->rows[i];
-        /* clear before but preserve transformation matrix */
-        for (l=0; l<curr_pos; l++)
-          if(done[l] < i) {
-            if(Arow[os[l]] & bm[l])
-              mzd_row_add_offset(A, i, start_row + l, start_col + l + 1);
-            done[l] = i; /* encode up to which row we added for l already */
-          }
-        if(mzd_read_bit(A, i, start_col + curr_pos)) {
-          found = 1;
-          break;
-        }
-      }
-    }
-    if(!found) {
-      break;
-    }
-
-    P->values[start_row + curr_pos] = i;
-    mzd_row_swap(A, i, start_row + curr_pos);
-
-    Q->values[start_row + curr_pos] = start_col + curr_pos;
-    done[curr_pos] = i;
-  }
-  
-  /* finish submatrix */
-  *done_row = _max_value(done, curr_pos);
-  for(size_t c2=0; c2<curr_pos && start_col + c2 < A->ncols -1; c2++)
-    for(size_t r2=done[c2]+1; r2<=*done_row; r2++)
-      if(mzd_read_bit(A, r2, start_col + c2))
-        mzd_row_add_offset(A, r2, start_row + c2, start_col + c2 + 1);
-  return curr_pos;
-}
-
-/* create a table of all 2^k linear combinations */
-void mzd_make_table_lqup( mzd_t *M, size_t r, size_t c, int k, mzd_t *T, size_t *L) {
-  assert(T->blocks[1].size == 0);
-  const size_t blockoffset= c/RADIX;
-  size_t i, rowneeded;
-  size_t twokay= TWOPOW(k);
-  size_t wide = T->width - blockoffset;
-
-  word *ti, *ti1, *m;
-
-  ti1 = T->rows[0] + blockoffset;
-  ti = ti1 + T->width;
-#ifdef HAVE_SSE2
-  unsigned long incw = 0;
-  if (T->width & 1) incw = 1;
-  ti += incw;
-#endif
-
-  L[0]=0;
-  for (i=1; i<twokay; i++) {
-    rowneeded = r + codebook[k]->inc[i-1];
-    m = M->rows[rowneeded] + blockoffset;
-
-    /* Duff's device loop unrolling */
-    register int n = (wide + 7) / 8;
-    switch (wide % 8) {
-    case 0: do { *(ti++) = *(m++) ^ *(ti1++);
-    case 7:      *(ti++) = *(m++) ^ *(ti1++);
-    case 6:      *(ti++) = *(m++) ^ *(ti1++);
-    case 5:      *(ti++) = *(m++) ^ *(ti1++);
-    case 4:      *(ti++) = *(m++) ^ *(ti1++);
-    case 3:      *(ti++) = *(m++) ^ *(ti1++);
-    case 2:      *(ti++) = *(m++) ^ *(ti1++);
-    case 1:      *(ti++) = *(m++) ^ *(ti1++);
-      } while (--n > 0);
-    }
-#ifdef HAVE_SSE2
-    ti+=incw; ti1+=incw;
-#endif
-    ti += blockoffset;
-    ti1 += blockoffset;
-
-    /* U is a basis but not the canonical basis, so we need to read what
-       element we just created from T*/
-    L[(int)mzd_read_bits(T,i,c,k)] = i;
-    
-  }
-  /* We need fix the table to update the transformation matrix
-     correctly; e.g. if the first row has [1 0 1] and we clear a row
-     below with [1 0 1] we need to encode that this row is cleared by
-     adding the first row only ([1 0 0]).*/
-  for(i=1; i < twokay; i++) {
-    const word correction = (word)codebook[k]->ord[i];
-    mzd_xor_bits(T, i,c, k, correction);
-  }
-}
-
-void mzd_process_rows2_lqup(mzd_t *M, size_t startrow, size_t stoprow, size_t startcol, int k, mzd_t *T0, size_t *L0, mzd_t *T1, size_t *L1) {
-  size_t r;
-  const int ka = k/2;
-  const int kb = k-k/2;
-  const size_t blocknuma=startcol/RADIX;
-  const size_t blocknumb=(startcol+ka)/RADIX;
-  const size_t blockoffset = blocknumb - blocknuma;
-  size_t wide = M->width - blocknuma;
-
-  if(wide < 3) {
-    mzd_process_rows(M, startrow, stoprow, startcol, ka, T0, L0);
-    mzd_process_rows(M, startrow, stoprow, startcol + ka, kb, T1, L1);
-    return;
-  }
-
-  wide -= 2;
-#ifdef HAVE_OPENMP
-#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
-#endif
-  for(r=startrow; r<stoprow; r++) {
-    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka) ];
-    word *t0 = T0->rows[x0] + blocknuma;
-    word *m0 = M->rows[r+0] + blocknuma;
-    m0[0] ^= t0[0];