malb / M4RI (http://m4ri.sagemath.org/)

M4RI is a library for fast arithmetic with dense matrices over F2. It was started by Gregory Bard, is maintained by Martin Albrecht. Several people contributed to it. The name M4RI comes from the first implemented algorithm: The "Method of the Four Russians" inversion algorithm published by Gregory Bard. This algorithm in turn is named after the "Method of the Four Russians" multiplication algorithm which is probably better referred to as Kronrod's method. M4RI is used by the Sage mathematics software and the PolyBoRi library. M4RI is available under the General Public License Version 2 or later (GPLv2+).

Clone this repository (size: 730.9 KB): HTTPS / SSH
$ hg clone http://bitbucket.org/malb/m4ri/
commit 221: 378b655e4717
parent 220: 2ed374681d29
branch: default
use fast pivot searching code in mzd_reduce_m4ri
Martin Albrecht / malb
15 months ago
M4RI / src / brilliantrussian.c
    #   Introduced
1
0214554f1aee
/*******************************************************************
2
0214554f1aee
*
3
0214554f1aee
*                 M4RI: Linear Algebra over GF(2)
4
0214554f1aee
*
5
0214554f1aee
*    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
6
0214554f1aee
*    Copyright (C) 2008 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
7
0214554f1aee
*
8
0214554f1aee
*  Distributed under the terms of the GNU General Public License (GPL) 
9
0214554f1aee
*  version 2 or higher.
10
0214554f1aee
*
11
0214554f1aee
*    This code is distributed in the hope that it will be useful,
12
0214554f1aee
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
13
0214554f1aee
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
0214554f1aee
*    General Public License for more details.
15
0214554f1aee
*
16
0214554f1aee
*  The full text of the GPL is available at:
17
0214554f1aee
*
18
0214554f1aee
*                  http://www.gnu.org/licenses/
19
0214554f1aee
*
20
0214554f1aee
********************************************************************/
21
2dd7e018b179
22
2dd7e018b179
#include "misc.h"
23
2dd7e018b179
24
35191cb07327
#ifdef HAVE_SSE2
25
35191cb07327
#include <emmintrin.h>
26
35191cb07327
#endif
27
01d3fae4e843
28
fa74a8495f94
#include <assert.h>
29
fa74a8495f94
30
01d3fae4e843
#include "brilliantrussian.h"
31
35191cb07327
#include "grayflex.h"
32
01d3fae4e843
33
35191cb07327
34
f63ff83556ef
/**
35
762af6cbe158
 * \brief Perform Gaussian reduction to reduced row echelon form on a
36
762af6cbe158
 * submatrix.
37
130dff8d3482
 * 
38
130dff8d3482
 * The submatrix has dimension at most k starting at r x c of A. Checks
39
130dff8d3482
 * for pivot rows up to row endrow (exclusive). Terminates as soon as
40
130dff8d3482
 * finding a pivot column fails.
41
f63ff83556ef
 *
42
130dff8d3482
 * \param A Matrix.
43
130dff8d3482
 * \param r First row.
44
130dff8d3482
 * \param c First column.
45
130dff8d3482
 * \param k Maximal dimension of identity matrix to produce.
46
762af6cbe158
 * \param end_row Maximal row index (exclusive) for rows to consider
47
130dff8d3482
 * for inclusion.
48
f63ff83556ef
 */
49
01d3fae4e843
50
1b4de1a6098b
static inline int _mzd_gauss_submatrix_full(packedmatrix *A, size_t r, size_t c, size_t end_row, int k) {
51
1b4de1a6098b
  size_t i,j,l;
52
1b4de1a6098b
  size_t start_row = r;
53
f63ff83556ef
  int found;
54
f63ff83556ef
  for (j=c; j<c+k; j++) {
55
f63ff83556ef
    found = 0;
56
762af6cbe158
    for (i=start_row; i< end_row; i++) {
57
762af6cbe158
      /* first we need to clear the first columns */
58
c717af9bd812
      if(mzd_read_bits(A,i,c,j-c))
59
c717af9bd812
        for (l=0; l<j-c; l++)
60
c717af9bd812
          if (mzd_read_bit(A, i, c+l))
61
c717af9bd812
            mzd_row_add_offset(A, i, r+l, c+l);
62
f63ff83556ef
      
63
f63ff83556ef
      /* pivot? */
64
f63ff83556ef
      if (mzd_read_bit(A, i, j)) {
65
f63ff83556ef
        mzd_row_swap(A, i, start_row);
66
f63ff83556ef
        /* clear above */
67
f63ff83556ef
        for (l=r; l<start_row; l++) {
68
f63ff83556ef
          if (mzd_read_bit(A, l, j)) {
69
762af6cbe158
            mzd_row_add_offset(A, l, start_row, j);
70
f63ff83556ef
          }
71
f63ff83556ef
        }
72
f63ff83556ef
        start_row++;
73
f63ff83556ef
        found = 1;
74
f63ff83556ef
        break;
75
f63ff83556ef
      }
76
f63ff83556ef
    }
77
f63ff83556ef
    if (found==0) {
78
f63ff83556ef
      return j - c;
79
f63ff83556ef
    }
80
f63ff83556ef
  }
81
f63ff83556ef
  return j - c;
82
f63ff83556ef
}
83
f63ff83556ef
84
762af6cbe158
/**
85
762af6cbe158
 * \brief Perform Gaussian reduction to upper triangular matrix on a
86
762af6cbe158
 * submatrix.
87
762af6cbe158
 * 
88
762af6cbe158
 * The submatrix has dimension at most k starting at r x c of A. Checks
89
762af6cbe158
 * for pivot rows up to row end_row (exclusive). Terminates as soon as
90
762af6cbe158
 * finding a pivot column fails.
91
762af6cbe158
 *
92
762af6cbe158
 * \param A Matrix.
93
762af6cbe158
 * \param r First row.
94
762af6cbe158
 * \param c First column.
95
762af6cbe158
 * \param k Maximal dimension of identity matrix to produce.
96
762af6cbe158
 * \param end_row Maximal row index (exclusive) for rows to consider
97
762af6cbe158
 * for inclusion.
98
762af6cbe158
 */
99
762af6cbe158
100
1b4de1a6098b
static inline int _mzd_gauss_submatrix(packedmatrix *A, size_t r, size_t c, size_t end_row, int k) {
101
1b4de1a6098b
  size_t i,j,l;
102
1b4de1a6098b
  size_t start_row = r;
103
762af6cbe158
  int found;
104
762af6cbe158
  for (j=c; j<c+k; j++) {
105
762af6cbe158
    found = 0;
106
762af6cbe158
    for (i=start_row; i< end_row; i++) {
107
762af6cbe158
      /* first we need to clear the first columns */
108
762af6cbe158
      for (l=0; l<j-c; l++)
109
762af6cbe158
        if (mzd_read_bit(A, i, c+l))
110
762af6cbe158
          mzd_row_add_offset(A, i, r+l, c+l);
111
762af6cbe158
      
112
762af6cbe158
      /* pivot? */
113
762af6cbe158
      if (mzd_read_bit(A, i, j)) {
114
762af6cbe158
        mzd_row_swap(A, i, start_row);
115
762af6cbe158
        start_row++;
116
762af6cbe158
        found = 1;
117
762af6cbe158
        break;
118
762af6cbe158
      }
119
762af6cbe158
    }
120
762af6cbe158
    if (found==0) {
121
762af6cbe158
      return j - c;
122
762af6cbe158
    }
123
762af6cbe158
  }
124
762af6cbe158
  return j - c;
125
762af6cbe158
}
126
762af6cbe158
127
762af6cbe158
/**
128
762af6cbe158
 * \brief Given a submatrix in upper triangular form compute the
129
762af6cbe158
 * reduced row echelon form.
130
762af6cbe158
 * 
131
762af6cbe158
 * The submatrix has dimension at most k starting at r x c of A. Checks
132
762af6cbe158
 * for pivot rows up to row end_row (exclusive). Terminates as soon as
133
762af6cbe158
 * finding a pivot column fails.
134
762af6cbe158
 *
135
762af6cbe158
 * \param A Matrix.
136
762af6cbe158
 * \param r First row.
137
762af6cbe158
 * \param c First column.
138
762af6cbe158
 * \param k Maximal dimension of identity matrix to produce.
139
762af6cbe158
 * \param end_row Maximal row index (exclusive) for rows to consider
140
762af6cbe158
 * for inclusion.
141
762af6cbe158
 */
142
762af6cbe158
143
1b4de1a6098b
static inline int _mzd_gauss_submatrix_top(packedmatrix *A, size_t r, size_t c, int k) {
144
1b4de1a6098b
  size_t j,l;
145
1b4de1a6098b
  size_t start_row = r;
146
762af6cbe158
  for (j=c; j<c+k; j++) {
147
762af6cbe158
    for (l=r; l<start_row; l++) {
148
762af6cbe158
      if (mzd_read_bit(A, l, j)) {
149
762af6cbe158
        mzd_row_add_offset(A, l, start_row, j);
150
762af6cbe158
      }
151
762af6cbe158
    }
152
762af6cbe158
    start_row++;
153
762af6cbe158
  }
154
762af6cbe158
  return k;
155
762af6cbe158
}
156
762af6cbe158
157
762af6cbe158
static inline void _mzd_copy_back_rows(packedmatrix *A, packedmatrix *U, size_t r, size_t c, size_t k) {
158
762af6cbe158
  size_t startblock = c/RADIX;
159
762af6cbe158
  size_t width = A->width - startblock;
160
762af6cbe158
  size_t i, j;
161
762af6cbe158
  for (i=0 ; i < k ; i++) {
162
762af6cbe158
    const word * const src = U->values + U->rowswap[i] + startblock;
163
762af6cbe158
    word *const dst = A->values + A->rowswap[r+i] + startblock;
164
762af6cbe158
    for (j=0; j< width; j++) {
165
762af6cbe158
      dst[j] = src[j];
166
762af6cbe158
    }
167
762af6cbe158
  }
168
762af6cbe158
}
169
762af6cbe158
170
1b4de1a6098b
void mzd_make_table( packedmatrix *M, size_t r, size_t c, int k, packedmatrix *T, size_t *L) {
171
1b4de1a6098b
  const size_t homeblock= c/RADIX;
172
1b4de1a6098b
  size_t i, j, rowneeded, id;
173
1b4de1a6098b
  size_t twokay= TWOPOW(k);
174
1b4de1a6098b
  size_t wide = T->width - homeblock;
175
01d3fae4e843
176
76716a73940a
  word *ti, *ti1, *m;
177
76716a73940a
178
76716a73940a
  ti1 = T->values + homeblock;
179
4a2f5e7fe65d
  ti = ti1 + T->width;
180
067ff9f2f927
#ifdef HAVE_SSE2
181
067ff9f2f927
  unsigned long incw = 0;
182
067ff9f2f927
  if (T->width & 1) incw = 1;
183
067ff9f2f927
  ti += incw;
184
067ff9f2f927
#endif
185
4a2f5e7fe65d
186
35191cb07327
  L[0]=0;
187
76716a73940a
  for (i=1; i<twokay; i++) {
188
570d0958b479
    rowneeded = r + codebook[k]->inc[i-1];
189
76716a73940a
    id = codebook[k]->ord[i];
190
76716a73940a
    L[id] = i;
191
76716a73940a
    if (rowneeded >= M->nrows) {
192
966a82c78d39
      for (j = 0; j < wide; j++) {
193
4a2f5e7fe65d
        *ti++ = *ti1++;
194
76716a73940a
      }
195
067ff9f2f927
#ifdef HAVE_SSE2
196
067ff9f2f927
      ti+=incw; ti1+=incw;
197
067ff9f2f927
#endif
198
76716a73940a
    } else {
199
76716a73940a
      m = M->values + M->rowswap[rowneeded] + homeblock;
200
4a2f5e7fe65d
201
e67a2e7c4c4b
      /* Duff's device loop unrolling */
202
e67a2e7c4c4b
      register int n = (wide + 7) / 8;
203
e67a2e7c4c4b
      switch (wide % 8) {
204
e67a2e7c4c4b
      case 0: do { *(ti++) = *(m++) ^ *(ti1++);
205
e67a2e7c4c4b
      case 7:      *(ti++) = *(m++) ^ *(ti1++);
206
e67a2e7c4c4b
      case 6:      *(ti++) = *(m++) ^ *(ti1++);
207
e67a2e7c4c4b
      case 5:      *(ti++) = *(m++) ^ *(ti1++);
208
e67a2e7c4c4b
      case 4:      *(ti++) = *(m++) ^ *(ti1++);
209
e67a2e7c4c4b
      case 3:      *(ti++) = *(m++) ^ *(ti1++);
210
e67a2e7c4c4b
      case 2:      *(ti++) = *(m++) ^ *(ti1++);
211
e67a2e7c4c4b
      case 1:      *(ti++) = *(m++) ^ *(ti1++);
212
e67a2e7c4c4b
        } while (--n > 0);
213
4a2f5e7fe65d
      }
214
067ff9f2f927
#ifdef HAVE_SSE2
215
067ff9f2f927
      ti+=incw; ti1+=incw;
216
067ff9f2f927
#endif
217
1ceebbfe39a8
      ti += homeblock;
218
1ceebbfe39a8
      ti1 += homeblock;
219
76716a73940a
    }
220
b3bd008958dc
  }
221
01d3fae4e843
}
222
01d3fae4e843
223
1b4de1a6098b
void mzd_process_rows(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T, size_t *L) {
224
f9845bc6e485
  size_t r;
225
1b4de1a6098b
  const size_t blocknum=startcol/RADIX;
226
1b4de1a6098b
  size_t wide = M->width - blocknum;
227
3016fb391eab
228
5b6bab6a102a
  if(k==1) {
229
5b6bab6a102a
    word bm = ONE << ((RADIX - startcol - 1) % RADIX);
230
5b6bab6a102a
231
5b6bab6a102a
    for (r=startrow; r+2<=stoprow; r+=2) {
232
5b6bab6a102a
      word *t = T->values + T->rowswap[1] + blocknum;
233
5b6bab6a102a
      word *m0 = M->values + M->rowswap[r+0] + blocknum;
234
5b6bab6a102a
      word *m1 = M->values + M->rowswap[r+1] + blocknum;
235
5b6bab6a102a
      register int n = (wide + 7) / 8;
236
5b6bab6a102a
237
5b6bab6a102a
      if(*m0 & bm) {
238
5b6bab6a102a
        if(*m1 & bm) {
239
5b6bab6a102a
          switch (wide % 8) {
240
5b6bab6a102a
          case 0: do { *m0++ ^= *t; *m1++ ^= *t++;
241
5b6bab6a102a
          case 7:    *m0++ ^= *t; *m1++ ^= *t++;
242
5b6bab6a102a
          case 6:    *m0++ ^= *t; *m1++ ^= *t++;
243
5b6bab6a102a
          case 5:    *m0++ ^= *t; *m1++ ^= *t++;
244
5b6bab6a102a
          case 4:    *m0++ ^= *t; *m1++ ^= *t++;
245
5b6bab6a102a
          case 3:    *m0++ ^= *t; *m1++ ^= *t++;
246
5b6bab6a102a
          case 2:    *m0++ ^= *t; *m1++ ^= *t++;
247
5b6bab6a102a
          case 1:    *m0++ ^= *t; *m1++ ^= *t++;
248
5b6bab6a102a
            } while (--n > 0);
249
5b6bab6a102a
          }
250
5b6bab6a102a
        } else {
251
5b6bab6a102a
          switch (wide % 8) {
252
5b6bab6a102a
          case 0: do { *m0++ ^= *t++;
253
5b6bab6a102a
          case 7:    *m0++ ^= *t++;
254
5b6bab6a102a
          case 6:    *m0++ ^= *t++;
255
5b6bab6a102a
          case 5:    *m0++ ^= *t++;
256
5b6bab6a102a
          case 4:    *m0++ ^= *t++;
257
5b6bab6a102a
          case 3:    *m0++ ^= *t++;
258
5b6bab6a102a
          case 2:    *m0++ ^= *t++;
259
5b6bab6a102a
          case 1:    *m0++ ^= *t++;
260
5b6bab6a102a
            } while (--n > 0);
261
5b6bab6a102a
          }
262
5b6bab6a102a
        }
263
5b6bab6a102a
      } else if(*m1 & bm) {
264
5b6bab6a102a
          switch (wide % 8) {
265
5b6bab6a102a
          case 0: do { *m1++ ^= *t++;
266
5b6bab6a102a
          case 7:    *m1++ ^= *t++;
267
5b6bab6a102a
          case 6:    *m1++ ^= *t++;
268
5b6bab6a102a
          case 5:    *m1++ ^= *t++;
269
5b6bab6a102a
          case 4:    *m1++ ^= *t++;
270
5b6bab6a102a
          case 3:    *m1++ ^= *t++;
271
5b6bab6a102a
          case 2:    *m1++ ^= *t++;
272
5b6bab6a102a
          case 1:    *m1++ ^= *t++;
273
5b6bab6a102a
            } while (--n > 0);
274
5b6bab6a102a
          }
275
5b6bab6a102a
      }
276
5b6bab6a102a
    }
277
5b6bab6a102a
278
5b6bab6a102a
    for( ; r<stoprow; r++) {
279
5b6bab6a102a
      const int x0 = L[ (int)mzd_read_bits(M, r, startcol, k) ];
280
5b6bab6a102a
      word *m0 = M->values + M->rowswap[r] + blocknum;
281
5b6bab6a102a
      word *t0 = T->values + T->rowswap[x0] + blocknum;
282
5b6bab6a102a
      
283
5b6bab6a102a
      register int n = (wide + 7) / 8;
284
5b6bab6a102a
      switch (wide % 8) {
285
5b6bab6a102a
      case 0: do { *m0++ ^= *t0++;
286
5b6bab6a102a
        case 7:    *m0++ ^= *t0++;
287
5b6bab6a102a
        case 6:    *m0++ ^= *t0++;
288
5b6bab6a102a
        case 5:    *m0++ ^= *t0++;
289
5b6bab6a102a
        case 4:    *m0++ ^= *t0++;
290
5b6bab6a102a
        case 3:    *m0++ ^= *t0++;
291
5b6bab6a102a
        case 2:    *m0++ ^= *t0++;
292
5b6bab6a102a
        case 1:    *m0++ ^= *t0++;
293
5b6bab6a102a
        } while (--n > 0);
294
5b6bab6a102a
      }
295
5b6bab6a102a
    }
296
5b6bab6a102a
    return;
297
5b6bab6a102a
  }
298
5b6bab6a102a
299
3de1c2fcfd95
  for (r=startrow; r+2<=stoprow; r+=2) {
300
41bd093d9fc0
    const int x0 = L[ (int)mzd_read_bits(M, r+0, startcol, k) ];
301
41bd093d9fc0
    const int x1 = L[ (int)mzd_read_bits(M, r+1, startcol, k) ];
302
a50e736b16b7
    
303
a50e736b16b7
    word *m0 = M->values + M->rowswap[r+0] + blocknum;
304
130dff8d3482
    word *t0 = T->values + T->rowswap[x0] + blocknum;
305
130dff8d3482
306
a50e736b16b7
    word *m1 = M->values + M->rowswap[r+1] + blocknum;
307
a50e736b16b7
    word *t1 = T->values + T->rowswap[x1] + blocknum;
308
01d3fae4e843
309
130dff8d3482
    register int n = (wide + 7) / 8;
310
130dff8d3482
    switch (wide % 8) {
311
130dff8d3482
    case 0: do { *m0++ ^= *t0++; *m1++ ^= *t1++;
312
130dff8d3482
      case 7:    *m0++ ^= *t0++; *m1++ ^= *t1++;
313
130dff8d3482
      case 6:    *m0++ ^= *t0++; *m1++ ^= *t1++;
314
130dff8d3482
      case 5:    *m0++ ^= *t0++; *m1++ ^= *t1++;
315
130dff8d3482
      case 4:    *m0++ ^= *t0++; *m1++ ^= *t1++;
316
130dff8d3482
      case 3:    *m0++ ^= *t0++; *m1++ ^= *t1++;
317
130dff8d3482
      case 2:    *m0++ ^= *t0++; *m1++ ^= *t1++;
318
130dff8d3482
      case 1:    *m0++ ^= *t0++; *m1++ ^= *t1++;
319
130dff8d3482
      } while (--n > 0);
320
01d3fae4e843
    }
321
130dff8d3482
  }
322
130dff8d3482
323
a50e736b16b7
  for( ; r<stoprow; r++) {
324
41bd093d9fc0
    const int x0 = L[ (int)mzd_read_bits(M, r, startcol, k) ];
325
a50e736b16b7
    word *m0 = M->values + M->rowswap[r] + blocknum;
326
a50e736b16b7
    word *t0 = T->values + T->rowswap[x0] + blocknum;
327
130dff8d3482
328
130dff8d3482
    register int n = (wide + 7) / 8;
329
130dff8d3482
    switch (wide % 8) {
330
130dff8d3482
    case 0: do { *m0++ ^= *t0++;
331
130dff8d3482
      case 7:    *m0++ ^= *t0++;
332
130dff8d3482
      case 6:    *m0++ ^= *t0++;
333
130dff8d3482
      case 5:    *m0++ ^= *t0++;
334
130dff8d3482
      case 4:    *m0++ ^= *t0++;
335
130dff8d3482
      case 3:    *m0++ ^= *t0++;
336
130dff8d3482
      case 2:    *m0++ ^= *t0++;
337
130dff8d3482
      case 1:    *m0++ ^= *t0++;
338
130dff8d3482
      } while (--n > 0);
339
130dff8d3482
    }
340
130dff8d3482
  }
341
130dff8d3482
}
342
130dff8d3482
343
1b4de1a6098b
void mzd_process_rows2(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1) {
344
f9845bc6e485
  size_t r;
345
1b4de1a6098b
  const size_t blocknum=startcol/RADIX;
346
1b4de1a6098b
  const size_t wide = M->width - blocknum;
347
130dff8d3482
348
130dff8d3482
  const int ka = k/2;
349
130dff8d3482
  const int kb = k-k/2;
350
130dff8d3482
351
e4b83893013e
#ifdef HAVE_OPENMP
352
e4b83893013e
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
353
e4b83893013e
#endif
354
130dff8d3482
  for(r=startrow; r<stoprow; r++) {
355
41bd093d9fc0
    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)];
356
41bd093d9fc0
    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)];
357
1208d145f7d2
    if(x0 == 0 && x1 == 0)
358
1208d145f7d2
      continue;
359
130dff8d3482
    word * m0 = M->values + M->rowswap[r] + blocknum;
360
130dff8d3482
    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
361
130dff8d3482
    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
362
130dff8d3482
363
130dff8d3482
    register int n = (wide + 7) / 8;
364
130dff8d3482
    switch (wide % 8) {
365
130dff8d3482
    case 0: do { *m0++ ^= *t0++ ^ *t1++;
366
130dff8d3482
      case 7:    *m0++ ^= *t0++ ^ *t1++;
367
130dff8d3482
      case 6:    *m0++ ^= *t0++ ^ *t1++;
368
130dff8d3482
      case 5:    *m0++ ^= *t0++ ^ *t1++;
369
130dff8d3482
      case 4:    *m0++ ^= *t0++ ^ *t1++;
370
130dff8d3482
      case 3:    *m0++ ^= *t0++ ^ *t1++;
371
130dff8d3482
      case 2:    *m0++ ^= *t0++ ^ *t1++;
372
130dff8d3482
      case 1:    *m0++ ^= *t0++ ^ *t1++;
373
130dff8d3482
      } while (--n > 0);
374
130dff8d3482
    }
375
130dff8d3482
  }
376
130dff8d3482
}
377
130dff8d3482
378
1b4de1a6098b
void mzd_process_rows3(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2) {
379
f9845bc6e485
  size_t r;
380
1b4de1a6098b
  const size_t blocknum=startcol/RADIX;
381
1b4de1a6098b
  const size_t wide = M->width - blocknum;
382
130dff8d3482
383
130dff8d3482
  int rem = k%3;
384
130dff8d3482
  
385
130dff8d3482
  const int ka = k/3 + ((rem>=2) ? 1 : 0);
386
130dff8d3482
  const int kb = k/3 + ((rem>=1) ? 1 : 0);
387
130dff8d3482
  const int kc = k/3;
388
130dff8d3482
389
e4b83893013e
#ifdef HAVE_OPENMP
390
e4b83893013e
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
391
e4b83893013e
#endif
392
130dff8d3482
  for(r=startrow; r<stoprow; r++) {
393
41bd093d9fc0
    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)];
394
41bd093d9fc0
    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)];
395
41bd093d9fc0
    const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)];
396
1208d145f7d2
    if(x0 == 0 && x1 == 0 && x2 == 0) 
397
1208d145f7d2
      continue;
398
1208d145f7d2
399
130dff8d3482
    word * m0 = M->values + M->rowswap[r] + blocknum;
400
130dff8d3482
    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
401
130dff8d3482
    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
402
130dff8d3482
    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
403
130dff8d3482
404
130dff8d3482
    register int n = (wide + 7) / 8;
405
130dff8d3482
    switch (wide % 8) {
406
130dff8d3482
    case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
407
130dff8d3482
      case 7:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
408
130dff8d3482
      case 6:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
409
130dff8d3482
      case 5:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
410
130dff8d3482
      case 4:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
411
130dff8d3482
      case 3:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
412
130dff8d3482
      case 2:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
413
130dff8d3482
      case 1:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++;
414
130dff8d3482
      } while (--n > 0);
415
130dff8d3482
    }
416
a50e736b16b7
  }
417
01d3fae4e843
}
418
01d3fae4e843
419
1b4de1a6098b
void mzd_process_rows4(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
420
1b4de1a6098b
                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3) {
421
f9845bc6e485
  size_t r;
422
1b4de1a6098b
  const size_t blocknum=startcol/RADIX;
423
1b4de1a6098b
  const size_t wide = M->width - blocknum;
424
52e22dd25d90
425
52e22dd25d90
  int rem = k%4;
426
52e22dd25d90
  
427
52e22dd25d90
  const int ka = k/4 + ((rem>=3) ? 1 : 0);
428
52e22dd25d90
  const int kb = k/4 + ((rem>=2) ? 1 : 0);
429
52e22dd25d90
  const int kc = k/4 + ((rem>=1) ? 1 : 0);
430
52e22dd25d90
  const int kd = k/4;
431
52e22dd25d90
432
e4b83893013e
#ifdef HAVE_OPENMP
433
e4b83893013e
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
434
e4b83893013e
#endif
435
52e22dd25d90
  for(r=startrow; r<stoprow; r++) {
436
41bd093d9fc0
    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)];
437
41bd093d9fc0
    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)];
438
41bd093d9fc0
    const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)];
439
41bd093d9fc0
    const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd)];
440
1208d145f7d2
    if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0) 
441
1208d145f7d2
      continue;
442
1208d145f7d2
443
52e22dd25d90
    word * m0 = M->values + M->rowswap[r] + blocknum;
444
52e22dd25d90
    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
445
52e22dd25d90
    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
446
52e22dd25d90
    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
447
52e22dd25d90
    const word *t3 = T3->values + T3->rowswap[x3] + blocknum;
448
1b4de1a6098b
    
449
52e22dd25d90
    register int n = (wide + 7) / 8;
450
52e22dd25d90
    switch (wide % 8) {
451
52e22dd25d90
    case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
452
52e22dd25d90
      case 7:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
453
52e22dd25d90
      case 6:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
454
52e22dd25d90
      case 5:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
455
52e22dd25d90
      case 4:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
456
52e22dd25d90
      case 3:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
457
52e22dd25d90
      case 2:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
458
52e22dd25d90
      case 1:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++;
459
52e22dd25d90
      } while (--n > 0);
460
52e22dd25d90
    }
461
52e22dd25d90
  }
462
52e22dd25d90
}
463
52e22dd25d90
464
1b4de1a6098b
void mzd_process_rows5(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
465
1b4de1a6098b
                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3,
466
1b4de1a6098b
                       packedmatrix *T4, size_t *L4) {
467
f9845bc6e485
  size_t r;
468
1b4de1a6098b
  const size_t blocknum=startcol/RADIX;
469
1b4de1a6098b
  const size_t wide = M->width - blocknum;
470
1b4de1a6098b
  int rem = k%5;
471
1b4de1a6098b
  
472
1b4de1a6098b
  const int ka = k/5 + ((rem>=4) ? 1 : 0);
473
1b4de1a6098b
  const int kb = k/5 + ((rem>=3) ? 1 : 0);
474
1b4de1a6098b
  const int kc = k/5 + ((rem>=2) ? 1 : 0);
475
1b4de1a6098b
  const int kd = k/5 + ((rem>=1) ? 1 : 0);
476
1b4de1a6098b
  const int ke = k/5;
477
1b4de1a6098b
478
e4b83893013e
#ifdef HAVE_OPENMP
479
e4b83893013e
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
480
e4b83893013e
#endif
481
1b4de1a6098b
  for(r=startrow; r<stoprow; r++) {
482
34b9e9154944
    
483
1b4de1a6098b
    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)];
484
1b4de1a6098b
    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)];
485
1b4de1a6098b
    const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)];
486
1b4de1a6098b
    const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd)];
487
1b4de1a6098b
    const int x4 = L4[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc+kd, ke)];
488
1b4de1a6098b
489
1b4de1a6098b
    if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0) 
490
1b4de1a6098b
      continue;
491
1b4de1a6098b
492
1b4de1a6098b
    word * m0 = M->values + M->rowswap[r] + blocknum;
493
1b4de1a6098b
    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
494
1b4de1a6098b
    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
495
1b4de1a6098b
    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
496
1b4de1a6098b
    const word *t3 = T3->values + T3->rowswap[x3] + blocknum;
497
1b4de1a6098b
    const word *t4 = T4->values + T4->rowswap[x4] + blocknum;
498
1b4de1a6098b
    
499
1b4de1a6098b
    register int n = (wide + 7) / 8;
500
1b4de1a6098b
    switch (wide % 8) {
501
1b4de1a6098b
    case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
502
1b4de1a6098b
      case 7:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
503
1b4de1a6098b
      case 6:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
504
1b4de1a6098b
      case 5:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
505
1b4de1a6098b
      case 4:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
506
1b4de1a6098b
      case 3:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
507
1b4de1a6098b
      case 2:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
508
1b4de1a6098b
      case 1:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
509
1b4de1a6098b
      } while (--n > 0);
510
1b4de1a6098b
    }
511
1b4de1a6098b
  }
512
1b4de1a6098b
}
513
1b4de1a6098b
514
1b4de1a6098b
void mzd_process_rows6(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, 
515
1b4de1a6098b
                       packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3,
516
1b4de1a6098b
                       packedmatrix *T4, size_t *L4, packedmatrix *T5, size_t *L5) {
517
f9845bc6e485
  size_t r;
518
1b4de1a6098b
  const size_t blocknum=startcol/RADIX;
519
1b4de1a6098b
  const size_t wide = M->width - blocknum;
520
1b4de1a6098b
521
1b4de1a6098b
  int rem = k%6;
522
1b4de1a6098b
  
523
1b4de1a6098b
  const int ka = k/6 + ((rem>=5) ? 1 : 0);
524
1b4de1a6098b
  const int kb = k/6 + ((rem>=4) ? 1 : 0);
525
1b4de1a6098b
  const int kc = k/6 + ((rem>=3) ? 1 : 0);
526
1b4de1a6098b
  const int kd = k/6 + ((rem>=2) ? 1 : 0);
527
1b4de1a6098b
  const int ke = k/6 + ((rem>=1) ? 1 : 0);;
528
1b4de1a6098b
  const int kf = k/6;
529
1b4de1a6098b
530
e4b83893013e
#ifdef HAVE_OPENMP
531
e4b83893013e
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128)
532
e4b83893013e
#endif
533
1b4de1a6098b
  for(r=startrow; r<stoprow; r++) {
534
1b4de1a6098b
    const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)];
535
1b4de1a6098b
    const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)];
536
1b4de1a6098b
    const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)];
537
1b4de1a6098b
    const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd)];
538
1b4de1a6098b
    const int x4 = L4[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc+kd, ke)];
539
1b4de1a6098b
    const int x5 = L5[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc+kd+ke, kf)];
540
f9845bc6e485
    
541
1b4de1a6098b
    if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0) 
542
1b4de1a6098b
      continue;
543
34b9e9154944
544
1b4de1a6098b
    word * m0 = M->values + M->rowswap[r] + blocknum;
545
1b4de1a6098b
    const word *t0 = T0->values + T0->rowswap[x0] + blocknum;
546
1b4de1a6098b
    const word *t1 = T1->values + T1->rowswap[x1] + blocknum;
547
1b4de1a6098b
    const word *t2 = T2->values + T2->rowswap[x2] + blocknum;
548
1b4de1a6098b
    const word *t3 = T3->values + T3->rowswap[x3] + blocknum;
549
1b4de1a6098b
    const word *t4 = T4->values + T4->rowswap[x4] + blocknum;
550
1b4de1a6098b
    const word *t5 = T5->values + T5->rowswap[x5] + blocknum;
551
34b9e9154944
552
1b4de1a6098b
    register int n = (wide + 7) / 8;
553
1b4de1a6098b
    switch (wide % 8) {
554
34b9e9154944
      case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
555
1b4de1a6098b
      case 7:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
556
1b4de1a6098b
      case 6:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
557
1b4de1a6098b
      case 5:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
558
1b4de1a6098b
      case 4:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
559
1b4de1a6098b
      case 3:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
560
1b4de1a6098b
      case 2:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
561
1b4de1a6098b
      case 1:    *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
562
1b4de1a6098b
      } while (--n > 0);
563
1b4de1a6098b
    }
564
1b4de1a6098b
  }
565
1b4de1a6098b
}
566
1b4de1a6098b
567
1b4de1a6098b
int mzd_reduce_m4ri(packedmatrix *A, int full, int k, packedmatrix *T, size_t *L) {
568
35191cb07327
  /**
569
35191cb07327
   * The algorithm works as follows:
570
757222db298a
   *
571
35191cb07327
   * Step 1.Denote the first column to be processed in a given
572
35191cb07327
   * iteration as \f$a_i\f$. Then, perform Gaussian elimination on the
573
35191cb07327
   * first \f$3k\f$ rows after and including the \f$i\f$-th row to
574
35191cb07327
   * produce an identity matrix in \f$a_{i,i} ... a_{i+k-1,i+k-1},\f$
575
35191cb07327
   * and zeroes in \f$a_{i+k,i} ... a_{i+3k-1,i+k-1}\f$.
576
f63ff83556ef
   *
577
35191cb07327
   * Step 2. Construct a table consisting of the \f$2^k\f$ binary strings of
578
35191cb07327
   * length k in a Gray code.  Thus with only \f$2^k\f$ vector
579
35191cb07327
   * additions, all possible linear combinations of these k rows
580
35191cb07327
   * have been precomputed.
581
f63ff83556ef
   *
582
f63ff83556ef
   *
583
35191cb07327
   * Step 3. One can rapidly process the remaining rows from \f$i +
584
35191cb07327
   * 3k\f$ until row \f$m\f$ (the last row) by using the table. For
585
fa74a8495f94
   * example, suppose the \f$j\f$-th row has entries \f$a_{j,i}
586
35191cb07327
   * ... a_{j,i+k-1}\f$ in the columns being processed. Selecting the
587
35191cb07327
   * row of the table associated with this k-bit string, and adding it
588
35191cb07327
   * to row j will force the k columns to zero, and adjust the
589
18f7108f161c
   * remaining columns from \f$ i + k\f$ to n in the appropriate way,
590
35191cb07327
   * as if Gaussian elimination had been performed.
591
f63ff83556ef
   *
592
35191cb07327
   * Step 4. While the above form of the algorithm will reduce a
593
35191cb07327
   * system of boolean linear equations to unit upper triangular form,
594
35191cb07327
   * and thus permit a system to be solved with back substitution, the
595
35191cb07327
   * M4RI algorithm can also be used to invert a matrix, or put the
596
35191cb07327
   * system into reduced row echelon form (RREF). Simply run Step 3
597
35191cb07327
   * on rows \f$0 ... i-1\f$ as well as on rows \f$i + 3k
598
35191cb07327
   * ... m\f$. This only affects the complexity slightly, changing the
599
35191cb07327
   * 2.5 coeffcient to 3
600
01d3fae4e843
   */
601
01d3fae4e843
602
1b4de1a6098b
  const size_t ncols = A->ncols; 
603
1b4de1a6098b
  size_t r = 0;
604
1b4de1a6098b
  size_t c = 0;
605
c8fb3be4c910
  int kbar = 0;
606
c8fb3be4c910
607
570d0958b479
  if (k == 0) {
608
570d0958b479
    k = m4ri_opt_k(A->nrows, A->ncols, 0);
609
e15a37476955
    if (k>=7)
610
e15a37476955
      k = 7;
611
e15a37476955
    if ( (6*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 )
612
e15a37476955
      k -= 1;
613
570d0958b479
  }
614
f9845bc6e485
  /*printf("k: %d\n",k);*/
615
1b4de1a6098b
  int kk = 6*k;
616
8e4bbe385050
617
762af6cbe158
  packedmatrix *U  = mzd_init(kk, A->ncols);
618
130dff8d3482
  packedmatrix *T0 = mzd_init(TWOPOW(k), A->ncols);
619
130dff8d3482
  packedmatrix *T1 = mzd_init(TWOPOW(k), A->ncols);
620
130dff8d3482
  packedmatrix *T2 = mzd_init(TWOPOW(k), A->ncols);
621
52e22dd25d90
  packedmatrix *T3 = mzd_init(TWOPOW(k), A->ncols);
622
1b4de1a6098b
  packedmatrix *T4 = mzd_init(TWOPOW(k), A->ncols);
623
1b4de1a6098b
  packedmatrix *T5 = mzd_init(TWOPOW(k), A->ncols);
624
1b4de1a6098b
  size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
625
1b4de1a6098b
  size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
626
1b4de1a6098b
  size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
627
1b4de1a6098b
  size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
628
1b4de1a6098b
  size_t *L4 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
629
1b4de1a6098b
  size_t *L5 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
630
570d0958b479
631
570d0958b479
  while(c<ncols) {
632
8e4bbe385050
    if(c+kk > A->ncols) {
633
8e4bbe385050
      kk = ncols - c;
634
c8fb3be4c910
    }
635
762af6cbe158
    if (full) {
636
762af6cbe158
      kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk);
637
762af6cbe158
    } else {
638
762af6cbe158
      kbar = _mzd_gauss_submatrix(A, r, c, A->nrows, kk);
639
378b655e4717
      /* this isn't necessary, adapt make_table */
640
762af6cbe158
      U = mzd_submatrix(U, A, r, 0, r+kbar, A->ncols);
641
762af6cbe158
      _mzd_gauss_submatrix_top(A, r, c, kbar);
642
762af6cbe158
    }
643
c8fb3be4c910
644
1b4de1a6098b
    if (kbar>5*k) {
645
1b4de1a6098b
      const int rem = kbar%6;
646
1b4de1a6098b
      const int ka = kbar/6 + ((rem>=5) ? 1 : 0);
647
1b4de1a6098b
      const int kb = kbar/6 + ((rem>=4) ? 1 : 0);
648
1b4de1a6098b
      const int kc = kbar/6 + ((rem>=3) ? 1 : 0);
649
1b4de1a6098b
      const int kd = kbar/6 + ((rem>=2) ? 1 : 0);
650
1b4de1a6098b
      const int ke = kbar/6 + ((rem>=1) ? 1 : 0);;
651
1b4de1a6098b
      const int kf = kbar/6;
652
1b4de1a6098b
653
2ed374681d29
      if(full || kbar==kk) {
654
2ed374681d29
        mzd_make_table(A, r, c, ka, T0, L0);
655
2ed374681d29
        mzd_make_table(A, r+ka, c, kb, T1, L1);
656
2ed374681d29
        mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
657
2ed374681d29
        mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
658
2ed374681d29
        mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4);
659
2ed374681d29
        mzd_make_table(A, r+ka+kb+kc+kd+ke, c, kf, T5, L5);
660
2ed374681d29
      }
661
2ed374681d29
      if(kbar==kk)
662
2ed374681d29
        mzd_process_rows6(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5);
663
1b4de1a6098b
      if(full)
664
1b4de1a6098b
        mzd_process_rows6(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5);
665
1b4de1a6098b
666
2ed374681d29
  } else if (kbar>4*k) { 
667
1b4de1a6098b
      const int rem = kbar%5;
668
1b4de1a6098b
      const int ka = kbar/5 + ((rem>=4) ? 1 : 0);
669
1b4de1a6098b
      const int kb = kbar/5 + ((rem>=3) ? 1 : 0);
670
1b4de1a6098b
      const int kc = kbar/5 + ((rem>=2) ? 1 : 0);
671
1b4de1a6098b
      const int kd = kbar/5 + ((rem>=1) ? 1 : 0);
672
1b4de1a6098b
      const int ke = kbar/5;
673
2ed374681d29
      if(full || kbar==kk) {
674
2ed374681d29
        mzd_make_table(A, r, c, ka, T0, L0);
675
2ed374681d29
        mzd_make_table(A, r+ka, c, kb, T1, L1);
676
2ed374681d29
        mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
677
2ed374681d29
        mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
678
2ed374681d29
        mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4);
679
2ed374681d29
      }
680
2ed374681d29
      if(kbar==kk)
681
2ed374681d29
        mzd_process_rows5(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4);
682
1b4de1a6098b
      if(full)
683
1b4de1a6098b
        mzd_process_rows5(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4);
684
1b4de1a6098b
      
685
1b4de1a6098b
    } else if (kbar>3*k) {
686
52e22dd25d90
      const int rem = kbar%4;
687
52e22dd25d90
      const int ka = kbar/4 + ((rem>=3) ? 1 : 0);
688
52e22dd25d90
      const int kb = kbar/4 + ((rem>=2) ? 1 : 0);
689
52e22dd25d90
      const int kc = kbar/4 + ((rem>=1) ? 1 : 0);
690
52e22dd25d90
      const int kd = kbar/4;
691
2ed374681d29
      if(full || kbar==kk) {
692
2ed374681d29
        mzd_make_table(A, r, c, ka, T0, L0);
693
2ed374681d29
        mzd_make_table(A, r+ka, c, kb, T1, L1);
694
2ed374681d29
        mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
695
2ed374681d29
        mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
696
2ed374681d29
      }
697
2ed374681d29
      if(kbar==kk)
698
2ed374681d29
        mzd_process_rows4(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);
699
52e22dd25d90
      if(full)
700
52e22dd25d90
        mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);
701
52e22dd25d90
      
702
52e22dd25d90
    } else if (kbar>2*k) {
703
130dff8d3482
      int rem = kbar%3;
704
130dff8d3482
      int ka = kbar/3 + ((rem>=2) ? 1 : 0);
705
130dff8d3482
      int kb = kbar/3 + ((rem>=1) ? 1 : 0);
706
130dff8d3482
      int kc = kbar/3;
707
2ed374681d29
      if(full || kbar==kk) {
708
2ed374681d29
        mzd_make_table(A, r, c, ka, T0, L0);
709
2ed374681d29
        mzd_make_table(A, r+ka, c, kb, T1, L1);
710
2ed374681d29
        mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
711
2ed374681d29
      }
712
2ed374681d29
      if(kbar==kk)
713
2ed374681d29
        mzd_process_rows3(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2);
714
130dff8d3482
      if(full)
715
130dff8d3482
        mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2);
716
130dff8d3482
      
717
130dff8d3482
    } else if (kbar>k) {
718
130dff8d3482
      const int ka = kbar/2;
719
130dff8d3482
      const int kb = kbar - ka;
720
2ed374681d29
      if(full || kbar==kk) {
721
2ed374681d29
        mzd_make_table(A, r, c, ka, T0, L0);
722
2ed374681d29
        mzd_make_table(A, r+ka, c, kb, T1, L1);
723
2ed374681d29
      }
724
2ed374681d29
      if(kbar==kk)
725
2ed374681d29
        mzd_process_rows2(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1);
726
130dff8d3482
      if(full)
727
130dff8d3482
        mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1);
728
130dff8d3482
      
729
130dff8d3482
    } else if(kbar > 0) {
730
2ed374681d29
      if(full || kbar==kk) {
731
2ed374681d29
        mzd_make_table(A, r, c, kbar, T0, L0);
732
2ed374681d29
      }
733
2ed374681d29
      if(kbar==kk)
734
2ed374681d29
        mzd_process_rows(A, r+kbar, A->nrows, c, kbar, T0, L0);
735
130dff8d3482
      if(full)
736
130dff8d3482
        mzd_process_rows(A, 0, r, c, kbar, T0, L0);
737
881892b8ade8
    }
738
762af6cbe158
739
762af6cbe158
    if (!full) {
740
762af6cbe158
      _mzd_copy_back_rows(A, U, r, c, kbar);
741
762af6cbe158
    }
742
762af6cbe158
743
c8fb3be4c910
    r += kbar;
744
c8fb3be4c910
    c += kbar;
745
8e4bbe385050
    if(kk!=kbar) {
746
378b655e4717
      size_t cbar;
747
378b655e4717
      size_t rbar;
748
378b655e4717
      if (mzd_find_pivot(A, r, c, &rbar, &cbar)) {
749
378b655e4717
        c = cbar;
750
378b655e4717
        mzd_row_swap(A, r, rbar);
751
378b655e4717
      } else {
752
378b655e4717
        break;
753
378b655e4717
      }
754
378b655e4717
      //c++;
755
c8fb3be4c910
    }
756
c8fb3be4c910
  }
757
570d0958b479
758
8e4bbe385050
  mzd_free(T0);
759
8e4bbe385050
  m4ri_mm_free(L0);
760
8e4bbe385050
  mzd_free(T1);
761
8e4bbe385050
  m4ri_mm_free(L1);
762
130dff8d3482
  mzd_free(T2);
763
130dff8d3482
  m4ri_mm_free(L2);
764
52e22dd25d90
  mzd_free(T3);
765
52e22dd25d90
  m4ri_mm_free(L3);
766
1b4de1a6098b
  mzd_free(T4);
767
1b4de1a6098b
  m4ri_mm_free(L4);
768
1b4de1a6098b
  mzd_free(T5);
769
1b4de1a6098b
  m4ri_mm_free(L5);
770
762af6cbe158
  mzd_free(U);
771
570d0958b479
  return r;
772
c8fb3be4c910
}
773
c8fb3be4c910
774
1b4de1a6098b
void mzd_top_reduce_m4ri(packedmatrix *A, int k, packedmatrix *T, size_t *L) {
775
1b4de1a6098b
  const size_t ncols = A->ncols; 
776
1b4de1a6098b
  size_t r = 0;
777
1b4de1a6098b
  size_t c = 0;
778
f63ff83556ef
  int kbar = 0;
779
f63ff83556ef
780
757222db298a
  if (k == 0) {
781
f63ff83556ef
    k = m4ri_opt_k(A->nrows, A->ncols, 0);
782
afd18c34a693
    if (k>5) {
783
afd18c34a693
      k -= 4;
784
a93796ab261d
    }
785
757222db298a
  }
786
a93796ab261d
  int kk = 4*k;
787
f63ff83556ef
788
130dff8d3482
  packedmatrix *T0 = mzd_init(TWOPOW(k), A->ncols);
789
130dff8d3482
  packedmatrix *T1 = mzd_init(TWOPOW(k), A->ncols);
790
a93796ab261d
  packedmatrix *T2 = mzd_init(TWOPOW(k), A->ncols);
791
a93796ab261d
  packedmatrix *T3 = mzd_init(TWOPOW(k), A->ncols);
792
1b4de1a6098b
  size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
793
1b4de1a6098b
  size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
794
1b4de1a6098b
  size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
795
1b4de1a6098b
  size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t));
796
f63ff83556ef
797
f63ff83556ef
  while(c<ncols) {
798
f63ff83556ef
    if(c+kk > A->ncols) {
799
f63ff83556ef
      kk = ncols - c;
800
757222db298a
    }
801
762af6cbe158
    kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk);
802
f63ff83556ef
803
a93796ab261d
    if (kbar>3*k) {
804
a93796ab261d
      const int rem = kbar%4;
805
a93796ab261d
      const int ka = kbar/4 + ((rem>=3) ? 1 : 0);
806
a93796ab261d
      const int kb = kbar/4 + ((rem>=2) ? 1 : 0);
807
a93796ab261d
      const int kc = kbar/4 + ((rem>=1) ? 1 : 0);
808
a93796ab261d
      const int kd = kbar/4;
809
a93796ab261d
      mzd_make_table(A, r, c, ka, T0, L0);
810
a93796ab261d
      mzd_make_table(A, r+ka, c, kb, T1, L1);
811
a93796ab261d
      mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
812
a93796ab261d
      mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3);
813
a93796ab261d
      mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3);
814
a93796ab261d
      
815
a93796ab261d
    } else if (kbar>2*k) {
816
a93796ab261d
      int rem = kbar%3;
817
a93796ab261d
      int ka = kbar/3 + ((rem>=2) ? 1 : 0);
818
a93796ab261d
      int kb = kbar/3 + ((rem>=1) ? 1 : 0);
819
a93796ab261d
      int kc = kbar/3;
820
a93796ab261d
      mzd_make_table(A, r, c, ka, T0, L0);
821
a93796ab261d
      mzd_make_table(A, r+ka, c, kb, T1, L1);
822
a93796ab261d
      mzd_make_table(A, r+ka+kb, c, kc, T2, L2);
823
a93796ab261d
      mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2);
824
a93796ab261d
      
825
a93796ab261d
    } else if (kbar>k) {
826
a93796ab261d
      const int ka = kbar/2;
827
a93796ab261d
      const int kb = kbar - ka;
828
a93796ab261d
      mzd_make_table(A, r, c, ka, T0, L0);
829
a93796ab261d
      mzd_make_table(A, r+ka, c, kb, T1, L1);
830
a93796ab261d
      mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1);
831
a93796ab261d
      
832
a93796ab261d
    } else if(kbar > 0) {
833
a93796ab261d
      mzd_make_table(A, r, c, kbar, T0, L0);
834
a93796ab261d
      mzd_process_rows(A, 0, r, c, kbar, T0, L0);
835
f63ff83556ef
    }
836
f63ff83556ef
    r += kbar;
837
f63ff83556ef
    c += kbar;
838
f63ff83556ef
    if(kk!=kbar) {
839
f63ff83556ef
      c++;
840
757222db298a
    }
841
757222db298a
  }
842
f63ff83556ef
843
f63ff83556ef
  mzd_free(T0);
844
f63ff83556ef
  m4ri_mm_free(L0);
845
f63ff83556ef
  mzd_free(T1);
846
f63ff83556ef
  m4ri_mm_free(L1);
847
a93796ab261d
  mzd_free(T2);
848
a93796ab261d
  m4ri_mm_free(L2);
849
a93796ab261d
  mzd_free(T3);
850
a93796ab261d
  m4ri_mm_free(L3);
851
01d3fae4e843
}
852
01d3fae4e843
853
d358a5c4c9fd
packedmatrix *mzd_invert_m4ri(packedmatrix *m, packedmatrix *I, int k) {
854
d358a5c4c9fd
  packedmatrix *big = mzd_concat(NULL, m, I);
855
1b4de1a6098b
  size_t size=m->ncols;
856
757222db298a
  if (k == 0) {
857
35191cb07327
    k = m4ri_opt_k(m->nrows, m->ncols, 0);
858
757222db298a
  }
859
1b4de1a6098b
  size_t twokay=TWOPOW(k);
860
1b4de1a6098b
  size_t i;
861
1ceebbfe39a8
  packedmatrix *T=mzd_init(twokay, size*2);
862
1b4de1a6098b
  size_t *L=(size_t *)m4ri_mm_malloc(twokay * sizeof(size_t));
863
01d3fae4e843
  packedmatrix *answer;
864
757222db298a
  
865
1ceebbfe39a8
  mzd_reduce_m4ri(big, TRUE, k, T, L);
866
757222db298a
  
867
01d3fae4e843
  for(i=0; i < size; i++) {
868
35191cb07327
    if (!mzd_read_bit(big, i,i )) {
869
35191cb07327
      answer = NULL;
870
01d3fae4e843
      break;
871
01d3fae4e843
    }
872
01d3fae4e843
  }
873
01d3fae4e843
  if (i == size)
874
d358a5c4c9fd
    answer=mzd_submatrix(NULL, big, 0, size, size, size*2);
875
757222db298a
  
876
1ceebbfe39a8
  m4ri_mm_free(L);
877
1ceebbfe39a8
  mzd_free(T);
878
35191cb07327
  mzd_free(big);
879
01d3fae4e843
  
880
01d3fae4e843
  return answer;
881
01d3fae4e843
}
882
01d3fae4e843
883
35191cb07327
packedmatrix *mzd_mul_m4rm_t(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) {
884
8fd17136fa20
  packedmatrix *AT, *BT, *CT;
885
01d3fae4e843
  
886
18f7108f161c
  if(A->ncols != B->nrows) 
887
18f7108f161c
    m4ri_die("mzd_mul_m4rm_t: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows);
888
01d3fae4e843
  
889
35191cb07327
  AT = mzd_transpose(NULL, A);
890
35191cb07327
  BT = mzd_transpose(NULL, B);
891
757222db298a
  
892
18f7108f161c
  CT = mzd_init(B->ncols, A->nrows);
893
67c79e0249fe
  CT = _mzd_mul_m4rm(CT, BT, AT, k, 0);
894
757222db298a
  
895
35191cb07327
  mzd_free(AT);
896
35191cb07327
  mzd_free(BT);
897
01d3fae4e843
898
35191cb07327
  C = mzd_transpose(C, CT);
899
35191cb07327
  mzd_free(CT);
900
01d3fae4e843
  return C;
901
01d3fae4e843
}
902
01d3fae4e843
903
89e907bfb554
packedmatrix *mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) {
904
1b4de1a6098b
  size_t a = A->nrows;
905
1b4de1a6098b
  size_t c = B->ncols;
906
18f7108f161c
907
18f7108f161c
  if(A->ncols != B->nrows) 
908
130dff8d3482
    m4ri_die("mzd_mul_m4rm: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows);
909
18f7108f161c
  if (C == NULL) {
910
18f7108f161c
    C = mzd_init(a, c);
911
18f7108f161c
  } else {
912
18f7108f161c
    if (C->nrows != a || C->ncols != c)
913
18f7108f161c
      m4ri_die("mzd_mul_m4rm: C (%d x %d) has wrong dimensions.\n", C->nrows, C->ncols);
914
18f7108f161c
  }
915
67c79e0249fe
  return _mzd_mul_m4rm(C, A, B, k, TRUE);
916
18f7108f161c
}
917
18f7108f161c
918
89e907bfb554
packedmatrix *mzd_addmul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) {
919
1b4de1a6098b
  size_t a = A->nrows;
920
1b4de1a6098b
  size_t c = B->ncols;
921
18f7108f161c
922
18f7108f161c
  if(A->ncols != B->nrows) 
923
18f7108f161c
    m4ri_die("mzd_mul_m4rm A ncols (%d) need to match B nrows (%d) .\n", A->ncols, B->nrows);
924
18f7108f161c
  if (C == NULL) {
925
18f7108f161c
    C = mzd_init(a, c);
926
18f7108f161c
  } else {
927
18f7108f161c
    if (C->nrows != a || C->ncols != c)
928
18f7108f161c
      m4ri_die("mzd_mul_m4rm: C has wrong dimensions.\n");
929
18f7108f161c
  }
930
67c79e0249fe
  return _mzd_mul_m4rm(C, A, B, k, FALSE);
931
18f7108f161c
}
932
18f7108f161c
933
5404680bea3f
#ifdef HAVE_SSE2
934
0244d50cc50b
static inline void _mzd_combine8(word *c, word *t1, word *t2, word *t3, word *t4, word *t5, word *t6, word *t7, word *t8, int wide) {
935
1b4de1a6098b
  size_t i;
936
5404680bea3f
  /* assuming t1 ... t8 are aligned, but c might not be */
937
5404680bea3f
  if (ALIGNMENT(c,16)==0) {
938
5404680bea3f
    __m128i *__c = (__m128i*)c;
939
5404680bea3f
    __m128i *__t1 = (__m128i*)t1;
940
5404680bea3f
    __m128i *__t2 = (__m128i*)t2;
941
5404680bea3f
    __m128i *__t3 = (__m128i*)t3;
942
5404680bea3f
    __m128i *__t4 = (__m128i*)t4;
943
5404680bea3f
    __m128i *__t5 = (__m128i*)t5;
944
5404680bea3f
    __m128i *__t6 = (__m128i*)t6;
945
5404680bea3f
    __m128i *__t7 = (__m128i*)t7;
946
5404680bea3f
    __m128i *__t8 = (__m128i*)t8;
947
5404680bea3f
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xF);
948
5404680bea3f
    __m128i xmm1;
949
5404680bea3f
    
950
5404680bea3f
    while(__c < eof) {
951
5404680bea3f
      xmm1 = _mm_xor_si128(*__c, *__t1++);
952
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
953
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
954
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
955
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t5++);
956
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t6++);
957
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t7++);
958
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t8++);
959
5404680bea3f
      *__c++ = xmm1;
960
5404680bea3f
    }
961
5404680bea3f
    c  = (word*)__c;
962
5404680bea3f
    t1 = (word*)__t1;
963
5404680bea3f
    t2 = (word*)__t2;
964
5404680bea3f
    t3 = (word*)__t3;
965
5404680bea3f
    t4 = (word*)__t4;
966
5404680bea3f
    t5 = (word*)__t5;
967
5404680bea3f
    t6 = (word*)__t6;
968
5404680bea3f
    t7 = (word*)__t7;
969
5404680bea3f
    t8 = (word*)__t8;
970
5404680bea3f
    wide = ((sizeof(word)*wide)%16)/sizeof(word);
971
5404680bea3f
  }
972
5404680bea3f
  for(i=0; i<wide; i++) {
973
5404680bea3f
    c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
974
5404680bea3f
  }
975
5404680bea3f
}
976
0244d50cc50b
#else
977
0244d50cc50b
978
0244d50cc50b
#define _mzd_combine8(c,t1,t2,t3,t4,t5,t6,t7,t8,wide) for(ii=0; ii<wide ; ii++) c[ii] ^= t1[ii] ^ t2[ii] ^ t3[ii] ^ t4[ii] ^ t5[ii] ^ t6[ii] ^ t7[ii] ^ t8[ii]
979
0244d50cc50b
980
5404680bea3f
#endif
981
5404680bea3f
982
5404680bea3f
#ifdef HAVE_SSE2
983
0244d50cc50b
static inline void _mzd_combine4(word *c, word *t1, word *t2, word *t3, word *t4, size_t wide) {
984
1b4de1a6098b
  size_t i;
985
0244d50cc50b
  /* assuming t1 ... t4 are aligned, but c might not be */
986
5404680bea3f
  if (ALIGNMENT(c,16)==0) {
987
5404680bea3f
    __m128i *__c = (__m128i*)c;
988
5404680bea3f
    __m128i *__t1 = (__m128i*)t1;
989
5404680bea3f
    __m128i *__t2 = (__m128i*)t2;
990
5404680bea3f
    __m128i *__t3 = (__m128i*)t3;
991
5404680bea3f
    __m128i *__t4 = (__m128i*)t4;
992
5404680bea3f
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xF);
993
5404680bea3f
    __m128i xmm1;
994
5404680bea3f
    
995
5404680bea3f
    while(__c < eof) {
996
5404680bea3f
      xmm1 = _mm_xor_si128(*__c, *__t1++);
997
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
998
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
999
5404680bea3f
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
1000
5404680bea3f
      *__c++ = xmm1;
1001
5404680bea3f
    }
1002
5404680bea3f
    c  = (word*)__c;
1003
5404680bea3f
    t1 = (word*)__t1;
1004
5404680bea3f
    t2 = (word*)__t2;
1005
5404680bea3f
    t3 = (word*)__t3;
1006
5404680bea3f
    t4 = (word*)__t4;
1007
5404680bea3f
    wide = ((sizeof(word)*wide)%16)/sizeof(word);
1008
5404680bea3f
  }
1009
5404680bea3f
  for(i=0; i<wide; i++) {
1010
5404680bea3f
    c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i];
1011
5404680bea3f
  }
1012
5404680bea3f
}
1013
0244d50cc50b
#else
1014
0244d50cc50b
1015
0244d50cc50b
#define _mzd_combine4(c, t1, t2, t3, t4, wide) for(ii=0; ii<wide ; ii++) c[ii] ^= t1[ii] ^ t2[ii] ^ t3[ii] ^ t4[ii]
1016
0244d50cc50b
1017
5404680bea3f
#endif //HAVE_SSE2
1018
5404680bea3f
1019
5404680bea3f
#ifdef HAVE_SSE2
1020
0244d50cc50b
static inline void _mzd_combine2(word *c, word *t1, word *t2, size_t wide) {
1021
0244d50cc50b
  size_t i;
1022
0244d50cc50b
  /* assuming t1 ... t2 are aligned, but c might not be */
1023
0244d50cc50b
  if (ALIGNMENT(c,16)==0) {
1024
0244d50cc50b
    __m128i *__c = (__m128i*)c;
1025
0244d50cc50b
    __m128i *__t1 = (__m128i*)t1;
1026
0244d50cc50b
    __m128i *__t2 = (__m128i*)t2;
1027
0244d50cc50b
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xF);
1028
0244d50cc50b
    __m128i xmm1;
1029
0244d50cc50b
    
1030
0244d50cc50b
    while(__c < eof) {
1031
0244d50cc50b
      xmm1 = _mm_xor_si128(*__c, *__t1++);
1032
0244d50cc50b
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
1033
0244d50cc50b
      *__c++ = xmm1;
1034
0244d50cc50b
    }
1035
0244d50cc50b
    c  = (word*)__c;
1036
0244d50cc50b
    t1 = (word*)__t1;
1037
0244d50cc50b
    t2 = (word*)__t2;
1038
0244d50cc50b
    wide = ((sizeof(word)*wide)%16)/sizeof(word);
1039
0244d50cc50b
  }
1040
0244d50cc50b
  for(i=0; i<wide; i++) {
1041
0244d50cc50b
    c[i] ^= t1[i] ^ t2[i];
1042
0244d50cc50b
  }
1043
0244d50cc50b
}
1044
0244d50cc50b
#else
1045
0244d50cc50b
1046
0244d50cc50b
#define _mzd_combine2(c, t1, t2, wide) for(ii=0; ii<wide ; ii++) c[ii] ^= t1[ii] ^ t2[ii]
1047
0244d50cc50b
1048
0244d50cc50b
#endif //HAVE_SSE2
1049
0244d50cc50b
1050
5404680bea3f
1051
51c42f6de1ea
#ifdef M4RM_GRAY8
1052
0244d50cc50b
#define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide)
1053
51c42f6de1ea
#else //M4RM_GRAY8
1054
0244d50cc50b
#define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide)
1055
51c42f6de1ea
#endif //M4RM_GRAY8
1056
5404680bea3f
1057
67c79e0249fe
packedmatrix *_mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k, int clear) {
1058
f63ff83556ef
  /**
1059
f63ff83556ef
   * The algorithm proceeds as follows:
1060
f63ff83556ef
   * 
1061
f63ff83556ef
   * Step 1. Make a Gray code table of all the \f$2^k\f$ linear combinations
1062
f63ff83556ef
   * of the \f$k\f$ rows of \f$B_i\f$.  Call the \f$x\f$-th row
1063
f63ff83556ef
   * \f$T_x\f$.
1064
f63ff83556ef
   *
1065
f63ff83556ef
   * Step 2. Read the entries 
1066
f63ff83556ef
   *    \f$a_{j,(i-1)k+1}, a_{j,(i-1)k+2} , ... , a_{j,(i-1)k+k}.\f$
1067
f63ff83556ef
   *
1068
f63ff83556ef
   * Let \f$x\f$ be the \f$k\f$ bit binary number formed by the
1069
f63ff83556ef
   * concatenation of \f$a_{j,(i-1)k+1}, ... , a_{j,ik}\f$.
1070
f63ff83556ef
   *
1071
f63ff83556ef
   * Step 3. for \f$h = 1,2, ... , c\f$ do
1072
f63ff83556ef
   *   calculate \f$C_{jh} = C_{jh} + T_{xh}\f$.
1073
f63ff83556ef
   */
1074
a9653d7bf249
  assert(C->offset==0);
1075
1b4de1a6098b
  size_t i,j;
1076
1b4de1a6098b
  size_t ii;
1077
48be69fd8a7a
  unsigned int x1, x2, x3, x4;
1078
48be69fd8a7a
  word *t1, *t2, *t3, *t4;
1079
48be69fd8a7a
1080
51c42f6de1ea
#ifdef M4RM_GRAY8
1081
48be69fd8a7a
  unsigned int x5, x6, x7, x8;
1082
48be69fd8a7a
  word *t5, *t6, *t7, *t8;
1083
48be69fd8a7a
#endif
1084
48be69fd8a7a
1085
48be69fd8a7a
  word *c;
1086
c9090f72dd57
1087
1b4de1a6098b
  size_t a_nr = A->nrows;
1088
1b4de1a6098b
  size_t a_nc = A->ncols;
1089
1b4de1a6098b
  size_t b_nc = B->ncols;
1090
c9090f72dd57
1091
c9090f72dd57
  if (b_nc < RADIX-10) {
1092
eb19d7b1b678
    if(clear)
1093
bc105b372f12
      return mzd_mul_naive(C, A, B);
1094
eb19d7b1b678
    else
1095
bc105b372f12
      return mzd_addmul_naive(C, A, B);
1096
a9653d7bf249
  } else if (a_nr < 16) {
1097
a9653d7bf249
    return _mzd_mul_va(C, A, B, clear);
1098
c9090f72dd57
  }
1099
c9090f72dd57
1100
1b4de1a6098b
  size_t wide = C->width;
1101
c9090f72dd57
1102
c9090f72dd57
  /* clear first */
1103
c9090f72dd57
  if (clear) {
1104
a9653d7bf249
    mzd_set_ui(C, 0);
1105
c9090f72dd57
  }
1106
c9090f72dd57
1107
1b4de1a6098b
  const size_t blocksize = MZD_MUL_BLOCKSIZE;
1108
c9090f72dd57
1109
c9090f72dd57
  if (k == 0) {
1110
c9090f72dd57
    k = m4ri_opt_k(blocksize, a_nc, b_nc);
1111
51c42f6de1ea
#ifdef M4RM_GRAY8
1112
48253acb1b66
    if (k>3)
1113
48253acb1b66
      k -= 2;
1114
48253acb1b66
    /* reduce k further if that has a chance of hitting L1 */
1115
48253acb1b66
    const size_t tsize = (int)(0.8*(TWOPOW(k) * b_nc));
1116
48253acb1b66
    if(tsize > CPU_L1_CACHE && tsize/2 <= CPU_L1_CACHE)
1117
48253acb1b66
      k -= 1;
1118
48be69fd8a7a
#else
1119
48be69fd8a7a
    if (k>2)
1120
48be69fd8a7a
      k -= 1;
1121
48be69fd8a7a
#endif
1122
c9090f72dd57
  }
1123
762af6cbe158
1124
762af6cbe158
#ifndef M4RM_GRAY8
1125
f9845bc6e485
  size_t *buffer = (size_t*)m4ri_mm_malloc(4 * TWOPOW(k) * sizeof(size_t));
1126
762af6cbe158
#else
1127
f9845bc6e485
  size_t *buffer = (size_t*)m4ri_mm_malloc(8 * TWOPOW(k) * sizeof(size_t));
1128
762af6cbe158
#endif
1129
762af6cbe158
1130
c9090f72dd57
  packedmatrix *T1 = mzd_init(TWOPOW(k), b_nc);
1131
1b4de1a6098b
  size_t *L1 = buffer;
1132
c9090f72dd57
  packedmatrix *T2 = mzd_init(TWOPOW(k), b_nc);
1133
1b4de1a6098b
  size_t *L2 = buffer + 1*TWOPOW(k);
1134
411816376bce
  packedmatrix *T3 = mzd_init(TWOPOW(k), b_nc);
1135
1b4de1a6098b
  size_t *L3 = buffer + 2*TWOPOW(k);
1136
411816376bce
  packedmatrix *T4 = mzd_init(TWOPOW(k), b_nc);
1137
1b4de1a6098b
  size_t *L4 = buffer + 3*TWOPOW(k);
1138
48be69fd8a7a
1139
51c42f6de1ea
#ifdef M4RM_GRAY8
1140
411816376bce
  packedmatrix *T5 = mzd_init(TWOPOW(k), b_nc);
1141
1b4de1a6098b
  size_t *L5 = buffer + 4*TWOPOW(k);
1142
411816376bce
  packedmatrix *T6 = mzd_init(TWOPOW(k), b_nc);
1143
1b4de1a6098b
  size_t *L6 = buffer + 5*TWOPOW(k);
1144
411816376bce
  packedmatrix *T7 = mzd_init(TWOPOW(k), b_nc);
1145
1b4de1a6098b
  size_t *L7 = buffer + 6*TWOPOW(k);
1146
411816376bce
  packedmatrix *T8 = mzd_init(TWOPOW(k), b_nc);
1147
1b4de1a6098b
  size_t *L8 = buffer + 7*TWOPOW(k);
1148
48be69fd8a7a
#endif
1149
c9090f72dd57
1150
c9090f72dd57
  /* process stuff that fits into multiple of k first, but blockwise (babystep-giantstep)*/
1151
f9845bc6e485
  size_t babystep, giantstep;
1152
51c42f6de1ea
#ifdef M4RM_GRAY8
1153
411816376bce
  const int kk = 8*k;
1154
48be69fd8a7a
#else
1155
48be69fd8a7a
  const int kk = 4*k;
1156
48be69fd8a7a
#endif
1157
f9845bc6e485
  const size_t end = a_nc/kk;
1158
c9090f72dd57
1159
c9090f72dd57
  for (giantstep=0; giantstep + blocksize <= a_nr; giantstep += blocksize) {
1160
c9090f72dd57
    for(i=0; i < end; i++) {
1161
570d0958b479
      mzd_make_table( B, i*kk, 0, k, T1, L1);
1162
570d0958b479
      mzd_make_table( B, i*kk+k, 0, k, T2, L2);
1163
570d0958b479
      mzd_make_table( B, i*kk+k+k, 0, k, T3, L3);
1164
570d0958b479
      mzd_make_table( B, i*kk+k+k+k, 0, k, T4, L4);
1165
51c42f6de1ea
#ifdef M4RM_GRAY8
1166
570d0958b479
      mzd_make_table( B, i*kk+k+k+k+k, 0, k, T5, L5);
1167
570d0958b479
      mzd_make_table( B, i*kk+k+k+k+k+k, 0, k, T6, L6);
1168
570d0958b479
      mzd_make_table( B, i*kk+k+k+k+k+k+k, 0, k, T7, L7);
1169
570d0958b479
      mzd_make_table( B, i*kk+k+k+k+k+k+k+k, 0, k, T8, L8);
1170
48be69fd8a7a
#endif   
1171
0244d50cc50b
1172
c9090f72dd57
      for(babystep = 0; babystep < blocksize; babystep++) {
1173
c9090f72dd57
        j = giantstep + babystep;
1174
41bd093d9fc0
        x1 = L1[ (int)mzd_read_bits(A, j, i*kk, k) ];
1175
41bd093d9fc0
        x2 = L2[ (int)mzd_read_bits(A, j, i*kk+k, k) ];
1176
41bd093d9fc0
        x3 = L3[ (int)mzd_read_bits(A, j, i*kk+k+k, k) ];
1177
41bd093d9fc0
        x4 = L4[ (int)mzd_read_bits(A, j, i*kk+k+k+k, k) ];
1178
0244d50cc50b
#ifdef M4RM_GRAY8
1179
41bd093d9fc0
        x5 = L5[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k, k) ];
1180
41bd093d9fc0
        x6 = L6[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k, k) ];
1181
41bd093d9fc0
        x7 = L7[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k, k) ];
1182
41bd093d9fc0
        x8 = L8[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k+k, k) ];
1183
48be69fd8a7a
#endif
1184
c9090f72dd57
        c = C->values + C->rowswap[j];
1185
c9090f72dd57
        t1 = T1->values + T1->rowswap[x1];
1186
c9090f72dd57
        t2 = T2->values + T2->rowswap[x2];
1187
411816376bce
        t3 = T3->values + T3->rowswap[x3];
1188
411816376bce
        t4 = T4->values + T4->rowswap[x4];
1189
51c42f6de1ea
#ifdef M4RM_GRAY8
1190
411816376bce
        t5 = T5->values + T5->rowswap[x5];
1191
411816376bce
        t6 = T6->values + T6->rowswap[x6];
1192
411816376bce
        t7 = T7->values + T7->rowswap[x7];
1193
411816376bce
        t8 = T8->values + T8->rowswap[x8];
1194
48be69fd8a7a
#endif
1195
5404680bea3f
        _MZD_COMBINE;
1196
c9090f72dd57
      }
1197
c9090f72dd57
    }
1198
c9090f72dd57
  }
1199
c9090f72dd57
  
1200
c9090f72dd57
  for(i=0; i < end; i++) {
1201
570d0958b479
    mzd_make_table( B, i*kk, 0, k, T1, L1);
1202
570d0958b479
    mzd_make_table( B, i*kk+k, 0, k, T2, L2);
1203
570d0958b479
    mzd_make_table( B, i*kk+k+k, 0, k, T3, L3);
1204
570d0958b479
    mzd_make_table( B, i*kk+k+k+k, 0, k, T4, L4);
1205
51c42f6de1ea
#ifdef M4RM_GRAY8
1206
570d0958b479
    mzd_make_table( B, i*kk+k+k+k+k, 0, k, T5, L5);
1207
570d0958b479
    mzd_make_table( B, i*kk+k+k+k+k+k, 0, k, T6, L6);
1208
570d0958b479
    mzd_make_table( B, i*kk+k+k+k+k+k+k, 0, k, T7, L7);
1209
570d0958b479
    mzd_make_table( B, i*kk+k+k+k+k+k+k+k, 0, k, T8, L8);
1210
48be69fd8a7a
#endif
1211
c9090f72dd57
    for(babystep = 0; babystep < a_nr - giantstep; babystep++) {
1212
c9090f72dd57
      j = giantstep + babystep;
1213
41bd093d9fc0
      x1 = L1[ (int)mzd_read_bits(A, j, i*kk, k) ];
1214
41bd093d9fc0
      x2 = L2[ (int)mzd_read_bits(A, j, i*kk+k, k) ];
1215
41bd093d9fc0
      x3 = L3[ (int)mzd_read_bits(A, j, i*kk+k+k, k) ];
1216
41bd093d9fc0
      x4 = L4[ (int)mzd_read_bits(A, j, i*kk+k+k+k, k) ];
1217
51c42f6de1ea
#ifdef M4RM_GRAY8
1218
41bd093d9fc0
      x5 = L5[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k, k) ];
1219
41bd093d9fc0
      x6 = L6[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k, k) ];
1220
41bd093d9fc0
      x7 = L7[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k, k) ];
1221
41bd093d9fc0
      x8 = L8[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k+k, k) ];
1222
48be69fd8a7a
#endif
1223
c9090f72dd57
      c = C->values + C->rowswap[j];
1224
c9090f72dd57
      t1 = T1->values + T1->rowswap[x1];
1225
c9090f72dd57
      t2 = T2->values + T2->rowswap[x2];
1226
411816376bce
      t3 = T3->values + T3->rowswap[x3];
1227
411816376bce
      t4 = T4->values + T4->rowswap[x4];
1228
51c42f6de1ea
#ifdef M4RM_GRAY8
1229
411816376bce
      t5 = T5->values + T5->rowswap[x5];
1230
411816376bce
      t6 = T6->values + T6->rowswap[x6];
1231
411816376bce
      t7 = T7->values + T7->rowswap[x7];
1232
411816376bce
      t8 = T8->values + T8->rowswap[x8];
1233
48be69fd8a7a
#endif
1234
5404680bea3f
      _MZD_COMBINE;
1235
c9090f72dd57
    }
1236
c9090f72dd57
  }
1237
c9090f72dd57
1238
ff9a7b521b90
  /* handle stuff that doesn't fit into multiple of kk */
1239
c9090f72dd57
  if (a_nc%kk) {
1240
ff9a7b521b90
    for (i=end*kk/k; i < (a_nc)/k; i++) {
1241
570d0958b479
      mzd_make_table( B, i*k, 0, k, T1, L1);
1242
ff9a7b521b90
      for(j = 0; j<a_nr; j++) {
1243
41bd093d9fc0
        x1 = L1[ (int)mzd_read_bits(A, j, i*k, k) ];
1244
ff9a7b521b90
        c = C->values + C->rowswap[j];
1245
ff9a7b521b90
        t1 = T1->values + T1->rowswap[x1];
1246
ff9a7b521b90
        for(ii=0; ii<wide; ii++) {
1247
ff9a7b521b90
          c[ii] ^= t1[ii];
1248
ff9a7b521b90
        }
1249
ff9a7b521b90
      }
1250
ff9a7b521b90
    }
1251
ff9a7b521b90
    /* handle stuff that doesn't fit into multiple of k */
1252
ff9a7b521b90
    if (a_nc%k) {
1253
570d0958b479
      mzd_make_table( B, a_nc/k * k , 0, a_nc%k, T1, L1);
1254
ff9a7b521b90
      for(j = 0; j<a_nr; j++) {
1255
41bd093d9fc0
        x1 = L1[ (int)mzd_read_bits(A, j, i*k, a_nc%k) ];
1256
ff9a7b521b90
        c = C->values + C->rowswap[j];
1257
ff9a7b521b90
        t1 = T1->values + T1->rowswap[x1];
1258
ff9a7b521b90
        for(ii=0; ii<wide; ii++) {
1259
ff9a7b521b90
          c[ii] ^= t1[ii];
1260
ff9a7b521b90
        }
1261
ff9a7b521b90
      }
1262
c9090f72dd57
    }
1263
c9090f72dd57
  }
1264
c9090f72dd57
1265
c9090f72dd57
  mzd_free(T1);
1266
c9090f72dd57
  mzd_free(T2);
1267
411816376bce
  mzd_free(T3);
1268
411816376bce
  mzd_free(T4);
1269
51c42f6de1ea
#ifdef M4RM_GRAY8
1270
411816376bce
  mzd_free(T5);
1271
411816376bce
  mzd_free(T6);
1272
411816376bce
  mzd_free(T7);
1273
411816376bce
  mzd_free(T8);
1274
48be69fd8a7a
#endif
1275
762af6cbe158
  m4ri_mm_free(buffer);
1276
c9090f72dd57
  return C;
1277
c9090f72dd57
}
1278
c9090f72dd57