1
|
0214554f1aee
|
/******************************************************************* |
2
|
0214554f1aee
|
* |
3
|
0214554f1aee
|
* M4RI: Linear Algebra over GF(2) |
4
|
0214554f1aee
|
* |
5
|
0214554f1aee
|
* Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu> |
6
|
0214554f1aee
|
* Copyright (C) 2008 Martin Albrecht <M.R.Albrecht@rhul.ac.uk> |
7
|
0214554f1aee
|
* |
8
|
0214554f1aee
|
* Distributed under the terms of the GNU General Public License (GPL) |
9
|
0214554f1aee
|
* version 2 or higher. |
10
|
0214554f1aee
|
* |
11
|
0214554f1aee
|
* This code is distributed in the hope that it will be useful, |
12
|
0214554f1aee
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
0214554f1aee
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14
|
0214554f1aee
|
* General Public License for more details. |
15
|
0214554f1aee
|
* |
16
|
0214554f1aee
|
* The full text of the GPL is available at: |
17
|
0214554f1aee
|
* |
18
|
0214554f1aee
|
* http://www.gnu.org/licenses/ |
19
|
0214554f1aee
|
* |
20
|
0214554f1aee
|
********************************************************************/ |
21
|
2dd7e018b179
|
|
22
|
2dd7e018b179
|
#include "misc.h" |
23
|
2dd7e018b179
|
|
24
|
35191cb07327
|
#ifdef HAVE_SSE2 |
25
|
35191cb07327
|
#include <emmintrin.h> |
26
|
35191cb07327
|
#endif |
27
|
01d3fae4e843
|
|
28
|
fa74a8495f94
|
#include <assert.h> |
29
|
fa74a8495f94
|
|
30
|
01d3fae4e843
|
#include "brilliantrussian.h" |
31
|
35191cb07327
|
#include "grayflex.h" |
32
|
01d3fae4e843
|
|
33
|
35191cb07327
|
|
34
|
f63ff83556ef
|
/** |
35
|
762af6cbe158
|
* \brief Perform Gaussian reduction to reduced row echelon form on a |
36
|
762af6cbe158
|
* submatrix. |
37
|
130dff8d3482
|
* |
38
|
130dff8d3482
|
* The submatrix has dimension at most k starting at r x c of A. Checks |
39
|
130dff8d3482
|
* for pivot rows up to row endrow (exclusive). Terminates as soon as |
40
|
130dff8d3482
|
* finding a pivot column fails. |
41
|
f63ff83556ef
|
* |
42
|
130dff8d3482
|
* \param A Matrix. |
43
|
130dff8d3482
|
* \param r First row. |
44
|
130dff8d3482
|
* \param c First column. |
45
|
130dff8d3482
|
* \param k Maximal dimension of identity matrix to produce. |
46
|
762af6cbe158
|
* \param end_row Maximal row index (exclusive) for rows to consider |
47
|
130dff8d3482
|
* for inclusion. |
48
|
f63ff83556ef
|
*/ |
49
|
01d3fae4e843
|
|
50
|
1b4de1a6098b
|
static inline int _mzd_gauss_submatrix_full(packedmatrix *A, size_t r, size_t c, size_t end_row, int k) { |
51
|
1b4de1a6098b
|
size_t i,j,l; |
52
|
1b4de1a6098b
|
size_t start_row = r; |
53
|
f63ff83556ef
|
int found; |
54
|
f63ff83556ef
|
for (j=c; j<c+k; j++) { |
55
|
f63ff83556ef
|
found = 0; |
56
|
762af6cbe158
|
for (i=start_row; i< end_row; i++) { |
57
|
762af6cbe158
|
/* first we need to clear the first columns */ |
58
|
c717af9bd812
|
if(mzd_read_bits(A,i,c,j-c)) |
59
|
c717af9bd812
|
for (l=0; l<j-c; l++) |
60
|
c717af9bd812
|
if (mzd_read_bit(A, i, c+l)) |
61
|
c717af9bd812
|
mzd_row_add_offset(A, i, r+l, c+l); |
62
|
f63ff83556ef
|
|
63
|
f63ff83556ef
|
/* pivot? */ |
64
|
f63ff83556ef
|
if (mzd_read_bit(A, i, j)) { |
65
|
f63ff83556ef
|
mzd_row_swap(A, i, start_row); |
66
|
f63ff83556ef
|
/* clear above */ |
67
|
f63ff83556ef
|
for (l=r; l<start_row; l++) { |
68
|
f63ff83556ef
|
if (mzd_read_bit(A, l, j)) { |
69
|
762af6cbe158
|
mzd_row_add_offset(A, l, start_row, j); |
70
|
f63ff83556ef
|
} |
71
|
f63ff83556ef
|
} |
72
|
f63ff83556ef
|
start_row++; |
73
|
f63ff83556ef
|
found = 1; |
74
|
f63ff83556ef
|
break; |
75
|
f63ff83556ef
|
} |
76
|
f63ff83556ef
|
} |
77
|
f63ff83556ef
|
if (found==0) { |
78
|
f63ff83556ef
|
return j - c; |
79
|
f63ff83556ef
|
} |
80
|
f63ff83556ef
|
} |
81
|
f63ff83556ef
|
return j - c; |
82
|
f63ff83556ef
|
} |
83
|
f63ff83556ef
|
|
84
|
762af6cbe158
|
/** |
85
|
762af6cbe158
|
* \brief Perform Gaussian reduction to upper triangular matrix on a |
86
|
762af6cbe158
|
* submatrix. |
87
|
762af6cbe158
|
* |
88
|
762af6cbe158
|
* The submatrix has dimension at most k starting at r x c of A. Checks |
89
|
762af6cbe158
|
* for pivot rows up to row end_row (exclusive). Terminates as soon as |
90
|
762af6cbe158
|
* finding a pivot column fails. |
91
|
762af6cbe158
|
* |
92
|
762af6cbe158
|
* \param A Matrix. |
93
|
762af6cbe158
|
* \param r First row. |
94
|
762af6cbe158
|
* \param c First column. |
95
|
762af6cbe158
|
* \param k Maximal dimension of identity matrix to produce. |
96
|
762af6cbe158
|
* \param end_row Maximal row index (exclusive) for rows to consider |
97
|
762af6cbe158
|
* for inclusion. |
98
|
762af6cbe158
|
*/ |
99
|
762af6cbe158
|
|
100
|
1b4de1a6098b
|
static inline int _mzd_gauss_submatrix(packedmatrix *A, size_t r, size_t c, size_t end_row, int k) { |
101
|
1b4de1a6098b
|
size_t i,j,l; |
102
|
1b4de1a6098b
|
size_t start_row = r; |
103
|
762af6cbe158
|
int found; |
104
|
762af6cbe158
|
for (j=c; j<c+k; j++) { |
105
|
762af6cbe158
|
found = 0; |
106
|
762af6cbe158
|
for (i=start_row; i< end_row; i++) { |
107
|
762af6cbe158
|
/* first we need to clear the first columns */ |
108
|
762af6cbe158
|
for (l=0; l<j-c; l++) |
109
|
762af6cbe158
|
if (mzd_read_bit(A, i, c+l)) |
110
|
762af6cbe158
|
mzd_row_add_offset(A, i, r+l, c+l); |
111
|
762af6cbe158
|
|
112
|
762af6cbe158
|
/* pivot? */ |
113
|
762af6cbe158
|
if (mzd_read_bit(A, i, j)) { |
114
|
762af6cbe158
|
mzd_row_swap(A, i, start_row); |
115
|
762af6cbe158
|
start_row++; |
116
|
762af6cbe158
|
found = 1; |
117
|
762af6cbe158
|
break; |
118
|
762af6cbe158
|
} |
119
|
762af6cbe158
|
} |
120
|
762af6cbe158
|
if (found==0) { |
121
|
762af6cbe158
|
return j - c; |
122
|
762af6cbe158
|
} |
123
|
762af6cbe158
|
} |
124
|
762af6cbe158
|
return j - c; |
125
|
762af6cbe158
|
} |
126
|
762af6cbe158
|
|
127
|
762af6cbe158
|
/** |
128
|
762af6cbe158
|
* \brief Given a submatrix in upper triangular form compute the |
129
|
762af6cbe158
|
* reduced row echelon form. |
130
|
762af6cbe158
|
* |
131
|
762af6cbe158
|
* The submatrix has dimension at most k starting at r x c of A. Checks |
132
|
762af6cbe158
|
* for pivot rows up to row end_row (exclusive). Terminates as soon as |
133
|
762af6cbe158
|
* finding a pivot column fails. |
134
|
762af6cbe158
|
* |
135
|
762af6cbe158
|
* \param A Matrix. |
136
|
762af6cbe158
|
* \param r First row. |
137
|
762af6cbe158
|
* \param c First column. |
138
|
762af6cbe158
|
* \param k Maximal dimension of identity matrix to produce. |
139
|
762af6cbe158
|
* \param end_row Maximal row index (exclusive) for rows to consider |
140
|
762af6cbe158
|
* for inclusion. |
141
|
762af6cbe158
|
*/ |
142
|
762af6cbe158
|
|
143
|
1b4de1a6098b
|
static inline int _mzd_gauss_submatrix_top(packedmatrix *A, size_t r, size_t c, int k) { |
144
|
1b4de1a6098b
|
size_t j,l; |
145
|
1b4de1a6098b
|
size_t start_row = r; |
146
|
762af6cbe158
|
for (j=c; j<c+k; j++) { |
147
|
762af6cbe158
|
for (l=r; l<start_row; l++) { |
148
|
762af6cbe158
|
if (mzd_read_bit(A, l, j)) { |
149
|
762af6cbe158
|
mzd_row_add_offset(A, l, start_row, j); |
150
|
762af6cbe158
|
} |
151
|
762af6cbe158
|
} |
152
|
762af6cbe158
|
start_row++; |
153
|
762af6cbe158
|
} |
154
|
762af6cbe158
|
return k; |
155
|
762af6cbe158
|
} |
156
|
762af6cbe158
|
|
157
|
762af6cbe158
|
static inline void _mzd_copy_back_rows(packedmatrix *A, packedmatrix *U, size_t r, size_t c, size_t k) { |
158
|
762af6cbe158
|
size_t startblock = c/RADIX; |
159
|
762af6cbe158
|
size_t width = A->width - startblock; |
160
|
762af6cbe158
|
size_t i, j; |
161
|
762af6cbe158
|
for (i=0 ; i < k ; i++) { |
162
|
762af6cbe158
|
const word * const src = U->values + U->rowswap[i] + startblock; |
163
|
762af6cbe158
|
word *const dst = A->values + A->rowswap[r+i] + startblock; |
164
|
762af6cbe158
|
for (j=0; j< width; j++) { |
165
|
762af6cbe158
|
dst[j] = src[j]; |
166
|
762af6cbe158
|
} |
167
|
762af6cbe158
|
} |
168
|
762af6cbe158
|
} |
169
|
762af6cbe158
|
|
170
|
1b4de1a6098b
|
void mzd_make_table( packedmatrix *M, size_t r, size_t c, int k, packedmatrix *T, size_t *L) { |
171
|
1b4de1a6098b
|
const size_t homeblock= c/RADIX; |
172
|
1b4de1a6098b
|
size_t i, j, rowneeded, id; |
173
|
1b4de1a6098b
|
size_t twokay= TWOPOW(k); |
174
|
1b4de1a6098b
|
size_t wide = T->width - homeblock; |
175
|
01d3fae4e843
|
|
176
|
76716a73940a
|
word *ti, *ti1, *m; |
177
|
76716a73940a
|
|
178
|
76716a73940a
|
ti1 = T->values + homeblock; |
179
|
4a2f5e7fe65d
|
ti = ti1 + T->width; |
180
|
067ff9f2f927
|
#ifdef HAVE_SSE2 |
181
|
067ff9f2f927
|
unsigned long incw = 0; |
182
|
067ff9f2f927
|
if (T->width & 1) incw = 1; |
183
|
067ff9f2f927
|
ti += incw; |
184
|
067ff9f2f927
|
#endif |
185
|
4a2f5e7fe65d
|
|
186
|
35191cb07327
|
L[0]=0; |
187
|
76716a73940a
|
for (i=1; i<twokay; i++) { |
188
|
570d0958b479
|
rowneeded = r + codebook[k]->inc[i-1]; |
189
|
76716a73940a
|
id = codebook[k]->ord[i]; |
190
|
76716a73940a
|
L[id] = i; |
191
|
76716a73940a
|
if (rowneeded >= M->nrows) { |
192
|
966a82c78d39
|
for (j = 0; j < wide; j++) { |
193
|
4a2f5e7fe65d
|
*ti++ = *ti1++; |
194
|
76716a73940a
|
} |
195
|
067ff9f2f927
|
#ifdef HAVE_SSE2 |
196
|
067ff9f2f927
|
ti+=incw; ti1+=incw; |
197
|
067ff9f2f927
|
#endif |
198
|
76716a73940a
|
} else { |
199
|
76716a73940a
|
m = M->values + M->rowswap[rowneeded] + homeblock; |
200
|
4a2f5e7fe65d
|
|
201
|
e67a2e7c4c4b
|
/* Duff's device loop unrolling */ |
202
|
e67a2e7c4c4b
|
register int n = (wide + 7) / 8; |
203
|
e67a2e7c4c4b
|
switch (wide % 8) { |
204
|
e67a2e7c4c4b
|
case 0: do { *(ti++) = *(m++) ^ *(ti1++); |
205
|
e67a2e7c4c4b
|
case 7: *(ti++) = *(m++) ^ *(ti1++); |
206
|
e67a2e7c4c4b
|
case 6: *(ti++) = *(m++) ^ *(ti1++); |
207
|
e67a2e7c4c4b
|
case 5: *(ti++) = *(m++) ^ *(ti1++); |
208
|
e67a2e7c4c4b
|
case 4: *(ti++) = *(m++) ^ *(ti1++); |
209
|
e67a2e7c4c4b
|
case 3: *(ti++) = *(m++) ^ *(ti1++); |
210
|
e67a2e7c4c4b
|
case 2: *(ti++) = *(m++) ^ *(ti1++); |
211
|
e67a2e7c4c4b
|
case 1: *(ti++) = *(m++) ^ *(ti1++); |
212
|
e67a2e7c4c4b
|
} while (--n > 0); |
213
|
4a2f5e7fe65d
|
} |
214
|
067ff9f2f927
|
#ifdef HAVE_SSE2 |
215
|
067ff9f2f927
|
ti+=incw; ti1+=incw; |
216
|
067ff9f2f927
|
#endif |
217
|
1ceebbfe39a8
|
ti += homeblock; |
218
|
1ceebbfe39a8
|
ti1 += homeblock; |
219
|
76716a73940a
|
} |
220
|
b3bd008958dc
|
} |
221
|
01d3fae4e843
|
} |
222
|
01d3fae4e843
|
|
223
|
1b4de1a6098b
|
void mzd_process_rows(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T, size_t *L) { |
224
|
f9845bc6e485
|
size_t r; |
225
|
1b4de1a6098b
|
const size_t blocknum=startcol/RADIX; |
226
|
1b4de1a6098b
|
size_t wide = M->width - blocknum; |
227
|
3016fb391eab
|
|
228
|
5b6bab6a102a
|
if(k==1) { |
229
|
5b6bab6a102a
|
word bm = ONE << ((RADIX - startcol - 1) % RADIX); |
230
|
5b6bab6a102a
|
|
231
|
5b6bab6a102a
|
for (r=startrow; r+2<=stoprow; r+=2) { |
232
|
5b6bab6a102a
|
word *t = T->values + T->rowswap[1] + blocknum; |
233
|
5b6bab6a102a
|
word *m0 = M->values + M->rowswap[r+0] + blocknum; |
234
|
5b6bab6a102a
|
word *m1 = M->values + M->rowswap[r+1] + blocknum; |
235
|
5b6bab6a102a
|
register int n = (wide + 7) / 8; |
236
|
5b6bab6a102a
|
|
237
|
5b6bab6a102a
|
if(*m0 & bm) { |
238
|
5b6bab6a102a
|
if(*m1 & bm) { |
239
|
5b6bab6a102a
|
switch (wide % 8) { |
240
|
5b6bab6a102a
|
case 0: do { *m0++ ^= *t; *m1++ ^= *t++; |
241
|
5b6bab6a102a
|
case 7: *m0++ ^= *t; *m1++ ^= *t++; |
242
|
5b6bab6a102a
|
case 6: *m0++ ^= *t; *m1++ ^= *t++; |
243
|
5b6bab6a102a
|
case 5: *m0++ ^= *t; *m1++ ^= *t++; |
244
|
5b6bab6a102a
|
case 4: *m0++ ^= *t; *m1++ ^= *t++; |
245
|
5b6bab6a102a
|
case 3: *m0++ ^= *t; *m1++ ^= *t++; |
246
|
5b6bab6a102a
|
case 2: *m0++ ^= *t; *m1++ ^= *t++; |
247
|
5b6bab6a102a
|
case 1: *m0++ ^= *t; *m1++ ^= *t++; |
248
|
5b6bab6a102a
|
} while (--n > 0); |
249
|
5b6bab6a102a
|
} |
250
|
5b6bab6a102a
|
} else { |
251
|
5b6bab6a102a
|
switch (wide % 8) { |
252
|
5b6bab6a102a
|
case 0: do { *m0++ ^= *t++; |
253
|
5b6bab6a102a
|
case 7: *m0++ ^= *t++; |
254
|
5b6bab6a102a
|
case 6: *m0++ ^= *t++; |
255
|
5b6bab6a102a
|
case 5: *m0++ ^= *t++; |
256
|
5b6bab6a102a
|
case 4: *m0++ ^= *t++; |
257
|
5b6bab6a102a
|
case 3: *m0++ ^= *t++; |
258
|
5b6bab6a102a
|
case 2: *m0++ ^= *t++; |
259
|
5b6bab6a102a
|
case 1: *m0++ ^= *t++; |
260
|
5b6bab6a102a
|
} while (--n > 0); |
261
|
5b6bab6a102a
|
} |
262
|
5b6bab6a102a
|
} |
263
|
5b6bab6a102a
|
} else if(*m1 & bm) { |
264
|
5b6bab6a102a
|
switch (wide % 8) { |
265
|
5b6bab6a102a
|
case 0: do { *m1++ ^= *t++; |
266
|
5b6bab6a102a
|
case 7: *m1++ ^= *t++; |
267
|
5b6bab6a102a
|
case 6: *m1++ ^= *t++; |
268
|
5b6bab6a102a
|
case 5: *m1++ ^= *t++; |
269
|
5b6bab6a102a
|
case 4: *m1++ ^= *t++; |
270
|
5b6bab6a102a
|
case 3: *m1++ ^= *t++; |
271
|
5b6bab6a102a
|
case 2: *m1++ ^= *t++; |
272
|
5b6bab6a102a
|
case 1: *m1++ ^= *t++; |
273
|
5b6bab6a102a
|
} while (--n > 0); |
274
|
5b6bab6a102a
|
} |
275
|
5b6bab6a102a
|
} |
276
|
5b6bab6a102a
|
} |
277
|
5b6bab6a102a
|
|
278
|
5b6bab6a102a
|
for( ; r<stoprow; r++) { |
279
|
5b6bab6a102a
|
const int x0 = L[ (int)mzd_read_bits(M, r, startcol, k) ]; |
280
|
5b6bab6a102a
|
word *m0 = M->values + M->rowswap[r] + blocknum; |
281
|
5b6bab6a102a
|
word *t0 = T->values + T->rowswap[x0] + blocknum; |
282
|
5b6bab6a102a
|
|
283
|
5b6bab6a102a
|
register int n = (wide + 7) / 8; |
284
|
5b6bab6a102a
|
switch (wide % 8) { |
285
|
5b6bab6a102a
|
case 0: do { *m0++ ^= *t0++; |
286
|
5b6bab6a102a
|
case 7: *m0++ ^= *t0++; |
287
|
5b6bab6a102a
|
case 6: *m0++ ^= *t0++; |
288
|
5b6bab6a102a
|
case 5: *m0++ ^= *t0++; |
289
|
5b6bab6a102a
|
case 4: *m0++ ^= *t0++; |
290
|
5b6bab6a102a
|
case 3: *m0++ ^= *t0++; |
291
|
5b6bab6a102a
|
case 2: *m0++ ^= *t0++; |
292
|
5b6bab6a102a
|
case 1: *m0++ ^= *t0++; |
293
|
5b6bab6a102a
|
} while (--n > 0); |
294
|
5b6bab6a102a
|
} |
295
|
5b6bab6a102a
|
} |
296
|
5b6bab6a102a
|
return; |
297
|
5b6bab6a102a
|
} |
298
|
5b6bab6a102a
|
|
299
|
3de1c2fcfd95
|
for (r=startrow; r+2<=stoprow; r+=2) { |
300
|
41bd093d9fc0
|
const int x0 = L[ (int)mzd_read_bits(M, r+0, startcol, k) ]; |
301
|
41bd093d9fc0
|
const int x1 = L[ (int)mzd_read_bits(M, r+1, startcol, k) ]; |
302
|
a50e736b16b7
|
|
303
|
a50e736b16b7
|
word *m0 = M->values + M->rowswap[r+0] + blocknum; |
304
|
130dff8d3482
|
word *t0 = T->values + T->rowswap[x0] + blocknum; |
305
|
130dff8d3482
|
|
306
|
a50e736b16b7
|
word *m1 = M->values + M->rowswap[r+1] + blocknum; |
307
|
a50e736b16b7
|
word *t1 = T->values + T->rowswap[x1] + blocknum; |
308
|
01d3fae4e843
|
|
309
|
130dff8d3482
|
register int n = (wide + 7) / 8; |
310
|
130dff8d3482
|
switch (wide % 8) { |
311
|
130dff8d3482
|
case 0: do { *m0++ ^= *t0++; *m1++ ^= *t1++; |
312
|
130dff8d3482
|
case 7: *m0++ ^= *t0++; *m1++ ^= *t1++; |
313
|
130dff8d3482
|
case 6: *m0++ ^= *t0++; *m1++ ^= *t1++; |
314
|
130dff8d3482
|
case 5: *m0++ ^= *t0++; *m1++ ^= *t1++; |
315
|
130dff8d3482
|
case 4: *m0++ ^= *t0++; *m1++ ^= *t1++; |
316
|
130dff8d3482
|
case 3: *m0++ ^= *t0++; *m1++ ^= *t1++; |
317
|
130dff8d3482
|
case 2: *m0++ ^= *t0++; *m1++ ^= *t1++; |
318
|
130dff8d3482
|
case 1: *m0++ ^= *t0++; *m1++ ^= *t1++; |
319
|
130dff8d3482
|
} while (--n > 0); |
320
|
01d3fae4e843
|
} |
321
|
130dff8d3482
|
} |
322
|
130dff8d3482
|
|
323
|
a50e736b16b7
|
for( ; r<stoprow; r++) { |
324
|
41bd093d9fc0
|
const int x0 = L[ (int)mzd_read_bits(M, r, startcol, k) ]; |
325
|
a50e736b16b7
|
word *m0 = M->values + M->rowswap[r] + blocknum; |
326
|
a50e736b16b7
|
word *t0 = T->values + T->rowswap[x0] + blocknum; |
327
|
130dff8d3482
|
|
328
|
130dff8d3482
|
register int n = (wide + 7) / 8; |
329
|
130dff8d3482
|
switch (wide % 8) { |
330
|
130dff8d3482
|
case 0: do { *m0++ ^= *t0++; |
331
|
130dff8d3482
|
case 7: *m0++ ^= *t0++; |
332
|
130dff8d3482
|
case 6: *m0++ ^= *t0++; |
333
|
130dff8d3482
|
case 5: *m0++ ^= *t0++; |
334
|
130dff8d3482
|
case 4: *m0++ ^= *t0++; |
335
|
130dff8d3482
|
case 3: *m0++ ^= *t0++; |
336
|
130dff8d3482
|
case 2: *m0++ ^= *t0++; |
337
|
130dff8d3482
|
case 1: *m0++ ^= *t0++; |
338
|
130dff8d3482
|
} while (--n > 0); |
339
|
130dff8d3482
|
} |
340
|
130dff8d3482
|
} |
341
|
130dff8d3482
|
} |
342
|
130dff8d3482
|
|
343
|
1b4de1a6098b
|
void mzd_process_rows2(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1) { |
344
|
f9845bc6e485
|
size_t r; |
345
|
1b4de1a6098b
|
const size_t blocknum=startcol/RADIX; |
346
|
1b4de1a6098b
|
const size_t wide = M->width - blocknum; |
347
|
130dff8d3482
|
|
348
|
130dff8d3482
|
const int ka = k/2; |
349
|
130dff8d3482
|
const int kb = k-k/2; |
350
|
130dff8d3482
|
|
351
|
e4b83893013e
|
#ifdef HAVE_OPENMP |
352
|
e4b83893013e
|
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128) |
353
|
e4b83893013e
|
#endif |
354
|
130dff8d3482
|
for(r=startrow; r<stoprow; r++) { |
355
|
41bd093d9fc0
|
const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)]; |
356
|
41bd093d9fc0
|
const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)]; |
357
|
1208d145f7d2
|
if(x0 == 0 && x1 == 0) |
358
|
1208d145f7d2
|
continue; |
359
|
130dff8d3482
|
word * m0 = M->values + M->rowswap[r] + blocknum; |
360
|
130dff8d3482
|
const word *t0 = T0->values + T0->rowswap[x0] + blocknum; |
361
|
130dff8d3482
|
const word *t1 = T1->values + T1->rowswap[x1] + blocknum; |
362
|
130dff8d3482
|
|
363
|
130dff8d3482
|
register int n = (wide + 7) / 8; |
364
|
130dff8d3482
|
switch (wide % 8) { |
365
|
130dff8d3482
|
case 0: do { *m0++ ^= *t0++ ^ *t1++; |
366
|
130dff8d3482
|
case 7: *m0++ ^= *t0++ ^ *t1++; |
367
|
130dff8d3482
|
case 6: *m0++ ^= *t0++ ^ *t1++; |
368
|
130dff8d3482
|
case 5: *m0++ ^= *t0++ ^ *t1++; |
369
|
130dff8d3482
|
case 4: *m0++ ^= *t0++ ^ *t1++; |
370
|
130dff8d3482
|
case 3: *m0++ ^= *t0++ ^ *t1++; |
371
|
130dff8d3482
|
case 2: *m0++ ^= *t0++ ^ *t1++; |
372
|
130dff8d3482
|
case 1: *m0++ ^= *t0++ ^ *t1++; |
373
|
130dff8d3482
|
} while (--n > 0); |
374
|
130dff8d3482
|
} |
375
|
130dff8d3482
|
} |
376
|
130dff8d3482
|
} |
377
|
130dff8d3482
|
|
378
|
1b4de1a6098b
|
void mzd_process_rows3(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2) { |
379
|
f9845bc6e485
|
size_t r; |
380
|
1b4de1a6098b
|
const size_t blocknum=startcol/RADIX; |
381
|
1b4de1a6098b
|
const size_t wide = M->width - blocknum; |
382
|
130dff8d3482
|
|
383
|
130dff8d3482
|
int rem = k%3; |
384
|
130dff8d3482
|
|
385
|
130dff8d3482
|
const int ka = k/3 + ((rem>=2) ? 1 : 0); |
386
|
130dff8d3482
|
const int kb = k/3 + ((rem>=1) ? 1 : 0); |
387
|
130dff8d3482
|
const int kc = k/3; |
388
|
130dff8d3482
|
|
389
|
e4b83893013e
|
#ifdef HAVE_OPENMP |
390
|
e4b83893013e
|
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128) |
391
|
e4b83893013e
|
#endif |
392
|
130dff8d3482
|
for(r=startrow; r<stoprow; r++) { |
393
|
41bd093d9fc0
|
const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)]; |
394
|
41bd093d9fc0
|
const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)]; |
395
|
41bd093d9fc0
|
const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)]; |
396
|
1208d145f7d2
|
if(x0 == 0 && x1 == 0 && x2 == 0) |
397
|
1208d145f7d2
|
continue; |
398
|
1208d145f7d2
|
|
399
|
130dff8d3482
|
word * m0 = M->values + M->rowswap[r] + blocknum; |
400
|
130dff8d3482
|
const word *t0 = T0->values + T0->rowswap[x0] + blocknum; |
401
|
130dff8d3482
|
const word *t1 = T1->values + T1->rowswap[x1] + blocknum; |
402
|
130dff8d3482
|
const word *t2 = T2->values + T2->rowswap[x2] + blocknum; |
403
|
130dff8d3482
|
|
404
|
130dff8d3482
|
register int n = (wide + 7) / 8; |
405
|
130dff8d3482
|
switch (wide % 8) { |
406
|
130dff8d3482
|
case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
407
|
130dff8d3482
|
case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
408
|
130dff8d3482
|
case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
409
|
130dff8d3482
|
case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
410
|
130dff8d3482
|
case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
411
|
130dff8d3482
|
case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
412
|
130dff8d3482
|
case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
413
|
130dff8d3482
|
case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; |
414
|
130dff8d3482
|
} while (--n > 0); |
415
|
130dff8d3482
|
} |
416
|
a50e736b16b7
|
} |
417
|
01d3fae4e843
|
} |
418
|
01d3fae4e843
|
|
419
|
1b4de1a6098b
|
void mzd_process_rows4(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, |
420
|
1b4de1a6098b
|
packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3) { |
421
|
f9845bc6e485
|
size_t r; |
422
|
1b4de1a6098b
|
const size_t blocknum=startcol/RADIX; |
423
|
1b4de1a6098b
|
const size_t wide = M->width - blocknum; |
424
|
52e22dd25d90
|
|
425
|
52e22dd25d90
|
int rem = k%4; |
426
|
52e22dd25d90
|
|
427
|
52e22dd25d90
|
const int ka = k/4 + ((rem>=3) ? 1 : 0); |
428
|
52e22dd25d90
|
const int kb = k/4 + ((rem>=2) ? 1 : 0); |
429
|
52e22dd25d90
|
const int kc = k/4 + ((rem>=1) ? 1 : 0); |
430
|
52e22dd25d90
|
const int kd = k/4; |
431
|
52e22dd25d90
|
|
432
|
e4b83893013e
|
#ifdef HAVE_OPENMP |
433
|
e4b83893013e
|
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128) |
434
|
e4b83893013e
|
#endif |
435
|
52e22dd25d90
|
for(r=startrow; r<stoprow; r++) { |
436
|
41bd093d9fc0
|
const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)]; |
437
|
41bd093d9fc0
|
const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)]; |
438
|
41bd093d9fc0
|
const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)]; |
439
|
41bd093d9fc0
|
const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd)]; |
440
|
1208d145f7d2
|
if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0) |
441
|
1208d145f7d2
|
continue; |
442
|
1208d145f7d2
|
|
443
|
52e22dd25d90
|
word * m0 = M->values + M->rowswap[r] + blocknum; |
444
|
52e22dd25d90
|
const word *t0 = T0->values + T0->rowswap[x0] + blocknum; |
445
|
52e22dd25d90
|
const word *t1 = T1->values + T1->rowswap[x1] + blocknum; |
446
|
52e22dd25d90
|
const word *t2 = T2->values + T2->rowswap[x2] + blocknum; |
447
|
52e22dd25d90
|
const word *t3 = T3->values + T3->rowswap[x3] + blocknum; |
448
|
1b4de1a6098b
|
|
449
|
52e22dd25d90
|
register int n = (wide + 7) / 8; |
450
|
52e22dd25d90
|
switch (wide % 8) { |
451
|
52e22dd25d90
|
case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
452
|
52e22dd25d90
|
case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
453
|
52e22dd25d90
|
case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
454
|
52e22dd25d90
|
case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
455
|
52e22dd25d90
|
case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
456
|
52e22dd25d90
|
case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
457
|
52e22dd25d90
|
case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
458
|
52e22dd25d90
|
case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; |
459
|
52e22dd25d90
|
} while (--n > 0); |
460
|
52e22dd25d90
|
} |
461
|
52e22dd25d90
|
} |
462
|
52e22dd25d90
|
} |
463
|
52e22dd25d90
|
|
464
|
1b4de1a6098b
|
void mzd_process_rows5(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, |
465
|
1b4de1a6098b
|
packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3, |
466
|
1b4de1a6098b
|
packedmatrix *T4, size_t *L4) { |
467
|
f9845bc6e485
|
size_t r; |
468
|
1b4de1a6098b
|
const size_t blocknum=startcol/RADIX; |
469
|
1b4de1a6098b
|
const size_t wide = M->width - blocknum; |
470
|
1b4de1a6098b
|
int rem = k%5; |
471
|
1b4de1a6098b
|
|
472
|
1b4de1a6098b
|
const int ka = k/5 + ((rem>=4) ? 1 : 0); |
473
|
1b4de1a6098b
|
const int kb = k/5 + ((rem>=3) ? 1 : 0); |
474
|
1b4de1a6098b
|
const int kc = k/5 + ((rem>=2) ? 1 : 0); |
475
|
1b4de1a6098b
|
const int kd = k/5 + ((rem>=1) ? 1 : 0); |
476
|
1b4de1a6098b
|
const int ke = k/5; |
477
|
1b4de1a6098b
|
|
478
|
e4b83893013e
|
#ifdef HAVE_OPENMP |
479
|
e4b83893013e
|
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128) |
480
|
e4b83893013e
|
#endif |
481
|
1b4de1a6098b
|
for(r=startrow; r<stoprow; r++) { |
482
|
34b9e9154944
|
|
483
|
1b4de1a6098b
|
const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)]; |
484
|
1b4de1a6098b
|
const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)]; |
485
|
1b4de1a6098b
|
const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)]; |
486
|
1b4de1a6098b
|
const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd)]; |
487
|
1b4de1a6098b
|
const int x4 = L4[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc+kd, ke)]; |
488
|
1b4de1a6098b
|
|
489
|
1b4de1a6098b
|
if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0) |
490
|
1b4de1a6098b
|
continue; |
491
|
1b4de1a6098b
|
|
492
|
1b4de1a6098b
|
word * m0 = M->values + M->rowswap[r] + blocknum; |
493
|
1b4de1a6098b
|
const word *t0 = T0->values + T0->rowswap[x0] + blocknum; |
494
|
1b4de1a6098b
|
const word *t1 = T1->values + T1->rowswap[x1] + blocknum; |
495
|
1b4de1a6098b
|
const word *t2 = T2->values + T2->rowswap[x2] + blocknum; |
496
|
1b4de1a6098b
|
const word *t3 = T3->values + T3->rowswap[x3] + blocknum; |
497
|
1b4de1a6098b
|
const word *t4 = T4->values + T4->rowswap[x4] + blocknum; |
498
|
1b4de1a6098b
|
|
499
|
1b4de1a6098b
|
register int n = (wide + 7) / 8; |
500
|
1b4de1a6098b
|
switch (wide % 8) { |
501
|
1b4de1a6098b
|
case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
502
|
1b4de1a6098b
|
case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
503
|
1b4de1a6098b
|
case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
504
|
1b4de1a6098b
|
case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
505
|
1b4de1a6098b
|
case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
506
|
1b4de1a6098b
|
case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
507
|
1b4de1a6098b
|
case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
508
|
1b4de1a6098b
|
case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; |
509
|
1b4de1a6098b
|
} while (--n > 0); |
510
|
1b4de1a6098b
|
} |
511
|
1b4de1a6098b
|
} |
512
|
1b4de1a6098b
|
} |
513
|
1b4de1a6098b
|
|
514
|
1b4de1a6098b
|
void mzd_process_rows6(packedmatrix *M, size_t startrow, size_t stoprow, size_t startcol, int k, |
515
|
1b4de1a6098b
|
packedmatrix *T0, size_t *L0, packedmatrix *T1, size_t *L1, packedmatrix *T2, size_t *L2, packedmatrix *T3, size_t *L3, |
516
|
1b4de1a6098b
|
packedmatrix *T4, size_t *L4, packedmatrix *T5, size_t *L5) { |
517
|
f9845bc6e485
|
size_t r; |
518
|
1b4de1a6098b
|
const size_t blocknum=startcol/RADIX; |
519
|
1b4de1a6098b
|
const size_t wide = M->width - blocknum; |
520
|
1b4de1a6098b
|
|
521
|
1b4de1a6098b
|
int rem = k%6; |
522
|
1b4de1a6098b
|
|
523
|
1b4de1a6098b
|
const int ka = k/6 + ((rem>=5) ? 1 : 0); |
524
|
1b4de1a6098b
|
const int kb = k/6 + ((rem>=4) ? 1 : 0); |
525
|
1b4de1a6098b
|
const int kc = k/6 + ((rem>=3) ? 1 : 0); |
526
|
1b4de1a6098b
|
const int kd = k/6 + ((rem>=2) ? 1 : 0); |
527
|
1b4de1a6098b
|
const int ke = k/6 + ((rem>=1) ? 1 : 0);; |
528
|
1b4de1a6098b
|
const int kf = k/6; |
529
|
1b4de1a6098b
|
|
530
|
e4b83893013e
|
#ifdef HAVE_OPENMP |
531
|
e4b83893013e
|
#pragma omp parallel for private(r) shared(startrow, stoprow) schedule(dynamic,32) if(stoprow-startrow > 128) |
532
|
e4b83893013e
|
#endif |
533
|
1b4de1a6098b
|
for(r=startrow; r<stoprow; r++) { |
534
|
1b4de1a6098b
|
const int x0 = L0[ (int)mzd_read_bits(M, r, startcol, ka)]; |
535
|
1b4de1a6098b
|
const int x1 = L1[ (int)mzd_read_bits(M, r, startcol+ka, kb)]; |
536
|
1b4de1a6098b
|
const int x2 = L2[ (int)mzd_read_bits(M, r, startcol+ka+kb, kc)]; |
537
|
1b4de1a6098b
|
const int x3 = L3[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc, kd)]; |
538
|
1b4de1a6098b
|
const int x4 = L4[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc+kd, ke)]; |
539
|
1b4de1a6098b
|
const int x5 = L5[ (int)mzd_read_bits(M, r, startcol+ka+kb+kc+kd+ke, kf)]; |
540
|
f9845bc6e485
|
|
541
|
1b4de1a6098b
|
if(x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0) |
542
|
1b4de1a6098b
|
continue; |
543
|
34b9e9154944
|
|
544
|
1b4de1a6098b
|
word * m0 = M->values + M->rowswap[r] + blocknum; |
545
|
1b4de1a6098b
|
const word *t0 = T0->values + T0->rowswap[x0] + blocknum; |
546
|
1b4de1a6098b
|
const word *t1 = T1->values + T1->rowswap[x1] + blocknum; |
547
|
1b4de1a6098b
|
const word *t2 = T2->values + T2->rowswap[x2] + blocknum; |
548
|
1b4de1a6098b
|
const word *t3 = T3->values + T3->rowswap[x3] + blocknum; |
549
|
1b4de1a6098b
|
const word *t4 = T4->values + T4->rowswap[x4] + blocknum; |
550
|
1b4de1a6098b
|
const word *t5 = T5->values + T5->rowswap[x5] + blocknum; |
551
|
34b9e9154944
|
|
552
|
1b4de1a6098b
|
register int n = (wide + 7) / 8; |
553
|
1b4de1a6098b
|
switch (wide % 8) { |
554
|
34b9e9154944
|
case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
555
|
1b4de1a6098b
|
case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
556
|
1b4de1a6098b
|
case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
557
|
1b4de1a6098b
|
case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
558
|
1b4de1a6098b
|
case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
559
|
1b4de1a6098b
|
case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
560
|
1b4de1a6098b
|
case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
561
|
1b4de1a6098b
|
case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; |
562
|
1b4de1a6098b
|
} while (--n > 0); |
563
|
1b4de1a6098b
|
} |
564
|
1b4de1a6098b
|
} |
565
|
1b4de1a6098b
|
} |
566
|
1b4de1a6098b
|
|
567
|
1b4de1a6098b
|
int mzd_reduce_m4ri(packedmatrix *A, int full, int k, packedmatrix *T, size_t *L) { |
568
|
35191cb07327
|
/** |
569
|
35191cb07327
|
* The algorithm works as follows: |
570
|
757222db298a
|
* |
571
|
35191cb07327
|
* Step 1.Denote the first column to be processed in a given |
572
|
35191cb07327
|
* iteration as \f$a_i\f$. Then, perform Gaussian elimination on the |
573
|
35191cb07327
|
* first \f$3k\f$ rows after and including the \f$i\f$-th row to |
574
|
35191cb07327
|
* produce an identity matrix in \f$a_{i,i} ... a_{i+k-1,i+k-1},\f$ |
575
|
35191cb07327
|
* and zeroes in \f$a_{i+k,i} ... a_{i+3k-1,i+k-1}\f$. |
576
|
f63ff83556ef
|
* |
577
|
35191cb07327
|
* Step 2. Construct a table consisting of the \f$2^k\f$ binary strings of |
578
|
35191cb07327
|
* length k in a Gray code. Thus with only \f$2^k\f$ vector |
579
|
35191cb07327
|
* additions, all possible linear combinations of these k rows |
580
|
35191cb07327
|
* have been precomputed. |
581
|
f63ff83556ef
|
* |
582
|
f63ff83556ef
|
* |
583
|
35191cb07327
|
* Step 3. One can rapidly process the remaining rows from \f$i + |
584
|
35191cb07327
|
* 3k\f$ until row \f$m\f$ (the last row) by using the table. For |
585
|
fa74a8495f94
|
* example, suppose the \f$j\f$-th row has entries \f$a_{j,i} |
586
|
35191cb07327
|
* ... a_{j,i+k-1}\f$ in the columns being processed. Selecting the |
587
|
35191cb07327
|
* row of the table associated with this k-bit string, and adding it |
588
|
35191cb07327
|
* to row j will force the k columns to zero, and adjust the |
589
|
18f7108f161c
|
* remaining columns from \f$ i + k\f$ to n in the appropriate way, |
590
|
35191cb07327
|
* as if Gaussian elimination had been performed. |
591
|
f63ff83556ef
|
* |
592
|
35191cb07327
|
* Step 4. While the above form of the algorithm will reduce a |
593
|
35191cb07327
|
* system of boolean linear equations to unit upper triangular form, |
594
|
35191cb07327
|
* and thus permit a system to be solved with back substitution, the |
595
|
35191cb07327
|
* M4RI algorithm can also be used to invert a matrix, or put the |
596
|
35191cb07327
|
* system into reduced row echelon form (RREF). Simply run Step 3 |
597
|
35191cb07327
|
* on rows \f$0 ... i-1\f$ as well as on rows \f$i + 3k |
598
|
35191cb07327
|
* ... m\f$. This only affects the complexity slightly, changing the |
599
|
35191cb07327
|
* 2.5 coeffcient to 3 |
600
|
01d3fae4e843
|
*/ |
601
|
01d3fae4e843
|
|
602
|
1b4de1a6098b
|
const size_t ncols = A->ncols; |
603
|
1b4de1a6098b
|
size_t r = 0; |
604
|
1b4de1a6098b
|
size_t c = 0; |
605
|
c8fb3be4c910
|
int kbar = 0; |
606
|
c8fb3be4c910
|
|
607
|
570d0958b479
|
if (k == 0) { |
608
|
570d0958b479
|
k = m4ri_opt_k(A->nrows, A->ncols, 0); |
609
|
e15a37476955
|
if (k>=7) |
610
|
e15a37476955
|
k = 7; |
611
|
e15a37476955
|
if ( (6*(1<<k)*A->ncols / 8.0) > CPU_L2_CACHE / 2.0 ) |
612
|
e15a37476955
|
k -= 1; |
613
|
570d0958b479
|
} |
614
|
f9845bc6e485
|
/*printf("k: %d\n",k);*/ |
615
|
1b4de1a6098b
|
int kk = 6*k; |
616
|
8e4bbe385050
|
|
617
|
762af6cbe158
|
packedmatrix *U = mzd_init(kk, A->ncols); |
618
|
130dff8d3482
|
packedmatrix *T0 = mzd_init(TWOPOW(k), A->ncols); |
619
|
130dff8d3482
|
packedmatrix *T1 = mzd_init(TWOPOW(k), A->ncols); |
620
|
130dff8d3482
|
packedmatrix *T2 = mzd_init(TWOPOW(k), A->ncols); |
621
|
52e22dd25d90
|
packedmatrix *T3 = mzd_init(TWOPOW(k), A->ncols); |
622
|
1b4de1a6098b
|
packedmatrix *T4 = mzd_init(TWOPOW(k), A->ncols); |
623
|
1b4de1a6098b
|
packedmatrix *T5 = mzd_init(TWOPOW(k), A->ncols); |
624
|
1b4de1a6098b
|
size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
625
|
1b4de1a6098b
|
size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
626
|
1b4de1a6098b
|
size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
627
|
1b4de1a6098b
|
size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
628
|
1b4de1a6098b
|
size_t *L4 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
629
|
1b4de1a6098b
|
size_t *L5 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
630
|
570d0958b479
|
|
631
|
570d0958b479
|
while(c<ncols) { |
632
|
8e4bbe385050
|
if(c+kk > A->ncols) { |
633
|
8e4bbe385050
|
kk = ncols - c; |
634
|
c8fb3be4c910
|
} |
635
|
762af6cbe158
|
if (full) { |
636
|
762af6cbe158
|
kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk); |
637
|
762af6cbe158
|
} else { |
638
|
762af6cbe158
|
kbar = _mzd_gauss_submatrix(A, r, c, A->nrows, kk); |
639
|
378b655e4717
|
/* this isn't necessary, adapt make_table */ |
640
|
762af6cbe158
|
U = mzd_submatrix(U, A, r, 0, r+kbar, A->ncols); |
641
|
762af6cbe158
|
_mzd_gauss_submatrix_top(A, r, c, kbar); |
642
|
762af6cbe158
|
} |
643
|
c8fb3be4c910
|
|
644
|
1b4de1a6098b
|
if (kbar>5*k) { |
645
|
1b4de1a6098b
|
const int rem = kbar%6; |
646
|
1b4de1a6098b
|
const int ka = kbar/6 + ((rem>=5) ? 1 : 0); |
647
|
1b4de1a6098b
|
const int kb = kbar/6 + ((rem>=4) ? 1 : 0); |
648
|
1b4de1a6098b
|
const int kc = kbar/6 + ((rem>=3) ? 1 : 0); |
649
|
1b4de1a6098b
|
const int kd = kbar/6 + ((rem>=2) ? 1 : 0); |
650
|
1b4de1a6098b
|
const int ke = kbar/6 + ((rem>=1) ? 1 : 0);; |
651
|
1b4de1a6098b
|
const int kf = kbar/6; |
652
|
1b4de1a6098b
|
|
653
|
2ed374681d29
|
if(full || kbar==kk) { |
654
|
2ed374681d29
|
mzd_make_table(A, r, c, ka, T0, L0); |
655
|
2ed374681d29
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
656
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb, c, kc, T2, L2); |
657
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); |
658
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4); |
659
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb+kc+kd+ke, c, kf, T5, L5); |
660
|
2ed374681d29
|
} |
661
|
2ed374681d29
|
if(kbar==kk) |
662
|
2ed374681d29
|
mzd_process_rows6(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5); |
663
|
1b4de1a6098b
|
if(full) |
664
|
1b4de1a6098b
|
mzd_process_rows6(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5); |
665
|
1b4de1a6098b
|
|
666
|
2ed374681d29
|
} else if (kbar>4*k) { |
667
|
1b4de1a6098b
|
const int rem = kbar%5; |
668
|
1b4de1a6098b
|
const int ka = kbar/5 + ((rem>=4) ? 1 : 0); |
669
|
1b4de1a6098b
|
const int kb = kbar/5 + ((rem>=3) ? 1 : 0); |
670
|
1b4de1a6098b
|
const int kc = kbar/5 + ((rem>=2) ? 1 : 0); |
671
|
1b4de1a6098b
|
const int kd = kbar/5 + ((rem>=1) ? 1 : 0); |
672
|
1b4de1a6098b
|
const int ke = kbar/5; |
673
|
2ed374681d29
|
if(full || kbar==kk) { |
674
|
2ed374681d29
|
mzd_make_table(A, r, c, ka, T0, L0); |
675
|
2ed374681d29
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
676
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb, c, kc, T2, L2); |
677
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); |
678
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4); |
679
|
2ed374681d29
|
} |
680
|
2ed374681d29
|
if(kbar==kk) |
681
|
2ed374681d29
|
mzd_process_rows5(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4); |
682
|
1b4de1a6098b
|
if(full) |
683
|
1b4de1a6098b
|
mzd_process_rows5(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4); |
684
|
1b4de1a6098b
|
|
685
|
1b4de1a6098b
|
} else if (kbar>3*k) { |
686
|
52e22dd25d90
|
const int rem = kbar%4; |
687
|
52e22dd25d90
|
const int ka = kbar/4 + ((rem>=3) ? 1 : 0); |
688
|
52e22dd25d90
|
const int kb = kbar/4 + ((rem>=2) ? 1 : 0); |
689
|
52e22dd25d90
|
const int kc = kbar/4 + ((rem>=1) ? 1 : 0); |
690
|
52e22dd25d90
|
const int kd = kbar/4; |
691
|
2ed374681d29
|
if(full || kbar==kk) { |
692
|
2ed374681d29
|
mzd_make_table(A, r, c, ka, T0, L0); |
693
|
2ed374681d29
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
694
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb, c, kc, T2, L2); |
695
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); |
696
|
2ed374681d29
|
} |
697
|
2ed374681d29
|
if(kbar==kk) |
698
|
2ed374681d29
|
mzd_process_rows4(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3); |
699
|
52e22dd25d90
|
if(full) |
700
|
52e22dd25d90
|
mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3); |
701
|
52e22dd25d90
|
|
702
|
52e22dd25d90
|
} else if (kbar>2*k) { |
703
|
130dff8d3482
|
int rem = kbar%3; |
704
|
130dff8d3482
|
int ka = kbar/3 + ((rem>=2) ? 1 : 0); |
705
|
130dff8d3482
|
int kb = kbar/3 + ((rem>=1) ? 1 : 0); |
706
|
130dff8d3482
|
int kc = kbar/3; |
707
|
2ed374681d29
|
if(full || kbar==kk) { |
708
|
2ed374681d29
|
mzd_make_table(A, r, c, ka, T0, L0); |
709
|
2ed374681d29
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
710
|
2ed374681d29
|
mzd_make_table(A, r+ka+kb, c, kc, T2, L2); |
711
|
2ed374681d29
|
} |
712
|
2ed374681d29
|
if(kbar==kk) |
713
|
2ed374681d29
|
mzd_process_rows3(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2); |
714
|
130dff8d3482
|
if(full) |
715
|
130dff8d3482
|
mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2); |
716
|
130dff8d3482
|
|
717
|
130dff8d3482
|
} else if (kbar>k) { |
718
|
130dff8d3482
|
const int ka = kbar/2; |
719
|
130dff8d3482
|
const int kb = kbar - ka; |
720
|
2ed374681d29
|
if(full || kbar==kk) { |
721
|
2ed374681d29
|
mzd_make_table(A, r, c, ka, T0, L0); |
722
|
2ed374681d29
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
723
|
2ed374681d29
|
} |
724
|
2ed374681d29
|
if(kbar==kk) |
725
|
2ed374681d29
|
mzd_process_rows2(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1); |
726
|
130dff8d3482
|
if(full) |
727
|
130dff8d3482
|
mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1); |
728
|
130dff8d3482
|
|
729
|
130dff8d3482
|
} else if(kbar > 0) { |
730
|
2ed374681d29
|
if(full || kbar==kk) { |
731
|
2ed374681d29
|
mzd_make_table(A, r, c, kbar, T0, L0); |
732
|
2ed374681d29
|
} |
733
|
2ed374681d29
|
if(kbar==kk) |
734
|
2ed374681d29
|
mzd_process_rows(A, r+kbar, A->nrows, c, kbar, T0, L0); |
735
|
130dff8d3482
|
if(full) |
736
|
130dff8d3482
|
mzd_process_rows(A, 0, r, c, kbar, T0, L0); |
737
|
881892b8ade8
|
} |
738
|
762af6cbe158
|
|
739
|
762af6cbe158
|
if (!full) { |
740
|
762af6cbe158
|
_mzd_copy_back_rows(A, U, r, c, kbar); |
741
|
762af6cbe158
|
} |
742
|
762af6cbe158
|
|
743
|
c8fb3be4c910
|
r += kbar; |
744
|
c8fb3be4c910
|
c += kbar; |
745
|
8e4bbe385050
|
if(kk!=kbar) { |
746
|
378b655e4717
|
size_t cbar; |
747
|
378b655e4717
|
size_t rbar; |
748
|
378b655e4717
|
if (mzd_find_pivot(A, r, c, &rbar, &cbar)) { |
749
|
378b655e4717
|
c = cbar; |
750
|
378b655e4717
|
mzd_row_swap(A, r, rbar); |
751
|
378b655e4717
|
} else { |
752
|
378b655e4717
|
break; |
753
|
378b655e4717
|
} |
754
|
378b655e4717
|
//c++; |
755
|
c8fb3be4c910
|
} |
756
|
c8fb3be4c910
|
} |
757
|
570d0958b479
|
|
758
|
8e4bbe385050
|
mzd_free(T0); |
759
|
8e4bbe385050
|
m4ri_mm_free(L0); |
760
|
8e4bbe385050
|
mzd_free(T1); |
761
|
8e4bbe385050
|
m4ri_mm_free(L1); |
762
|
130dff8d3482
|
mzd_free(T2); |
763
|
130dff8d3482
|
m4ri_mm_free(L2); |
764
|
52e22dd25d90
|
mzd_free(T3); |
765
|
52e22dd25d90
|
m4ri_mm_free(L3); |
766
|
1b4de1a6098b
|
mzd_free(T4); |
767
|
1b4de1a6098b
|
m4ri_mm_free(L4); |
768
|
1b4de1a6098b
|
mzd_free(T5); |
769
|
1b4de1a6098b
|
m4ri_mm_free(L5); |
770
|
762af6cbe158
|
mzd_free(U); |
771
|
570d0958b479
|
return r; |
772
|
c8fb3be4c910
|
} |
773
|
c8fb3be4c910
|
|
774
|
1b4de1a6098b
|
void mzd_top_reduce_m4ri(packedmatrix *A, int k, packedmatrix *T, size_t *L) { |
775
|
1b4de1a6098b
|
const size_t ncols = A->ncols; |
776
|
1b4de1a6098b
|
size_t r = 0; |
777
|
1b4de1a6098b
|
size_t c = 0; |
778
|
f63ff83556ef
|
int kbar = 0; |
779
|
f63ff83556ef
|
|
780
|
757222db298a
|
if (k == 0) { |
781
|
f63ff83556ef
|
k = m4ri_opt_k(A->nrows, A->ncols, 0); |
782
|
afd18c34a693
|
if (k>5) { |
783
|
afd18c34a693
|
k -= 4; |
784
|
a93796ab261d
|
} |
785
|
757222db298a
|
} |
786
|
a93796ab261d
|
int kk = 4*k; |
787
|
f63ff83556ef
|
|
788
|
130dff8d3482
|
packedmatrix *T0 = mzd_init(TWOPOW(k), A->ncols); |
789
|
130dff8d3482
|
packedmatrix *T1 = mzd_init(TWOPOW(k), A->ncols); |
790
|
a93796ab261d
|
packedmatrix *T2 = mzd_init(TWOPOW(k), A->ncols); |
791
|
a93796ab261d
|
packedmatrix *T3 = mzd_init(TWOPOW(k), A->ncols); |
792
|
1b4de1a6098b
|
size_t *L0 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
793
|
1b4de1a6098b
|
size_t *L1 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
794
|
1b4de1a6098b
|
size_t *L2 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
795
|
1b4de1a6098b
|
size_t *L3 = (size_t *)m4ri_mm_calloc(TWOPOW(k), sizeof(size_t)); |
796
|
f63ff83556ef
|
|
797
|
f63ff83556ef
|
while(c<ncols) { |
798
|
f63ff83556ef
|
if(c+kk > A->ncols) { |
799
|
f63ff83556ef
|
kk = ncols - c; |
800
|
757222db298a
|
} |
801
|
762af6cbe158
|
kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk); |
802
|
f63ff83556ef
|
|
803
|
a93796ab261d
|
if (kbar>3*k) { |
804
|
a93796ab261d
|
const int rem = kbar%4; |
805
|
a93796ab261d
|
const int ka = kbar/4 + ((rem>=3) ? 1 : 0); |
806
|
a93796ab261d
|
const int kb = kbar/4 + ((rem>=2) ? 1 : 0); |
807
|
a93796ab261d
|
const int kc = kbar/4 + ((rem>=1) ? 1 : 0); |
808
|
a93796ab261d
|
const int kd = kbar/4; |
809
|
a93796ab261d
|
mzd_make_table(A, r, c, ka, T0, L0); |
810
|
a93796ab261d
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
811
|
a93796ab261d
|
mzd_make_table(A, r+ka+kb, c, kc, T2, L2); |
812
|
a93796ab261d
|
mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); |
813
|
a93796ab261d
|
mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3); |
814
|
a93796ab261d
|
|
815
|
a93796ab261d
|
} else if (kbar>2*k) { |
816
|
a93796ab261d
|
int rem = kbar%3; |
817
|
a93796ab261d
|
int ka = kbar/3 + ((rem>=2) ? 1 : 0); |
818
|
a93796ab261d
|
int kb = kbar/3 + ((rem>=1) ? 1 : 0); |
819
|
a93796ab261d
|
int kc = kbar/3; |
820
|
a93796ab261d
|
mzd_make_table(A, r, c, ka, T0, L0); |
821
|
a93796ab261d
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
822
|
a93796ab261d
|
mzd_make_table(A, r+ka+kb, c, kc, T2, L2); |
823
|
a93796ab261d
|
mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2); |
824
|
a93796ab261d
|
|
825
|
a93796ab261d
|
} else if (kbar>k) { |
826
|
a93796ab261d
|
const int ka = kbar/2; |
827
|
a93796ab261d
|
const int kb = kbar - ka; |
828
|
a93796ab261d
|
mzd_make_table(A, r, c, ka, T0, L0); |
829
|
a93796ab261d
|
mzd_make_table(A, r+ka, c, kb, T1, L1); |
830
|
a93796ab261d
|
mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1); |
831
|
a93796ab261d
|
|
832
|
a93796ab261d
|
} else if(kbar > 0) { |
833
|
a93796ab261d
|
mzd_make_table(A, r, c, kbar, T0, L0); |
834
|
a93796ab261d
|
mzd_process_rows(A, 0, r, c, kbar, T0, L0); |
835
|
f63ff83556ef
|
} |
836
|
f63ff83556ef
|
r += kbar; |
837
|
f63ff83556ef
|
c += kbar; |
838
|
f63ff83556ef
|
if(kk!=kbar) { |
839
|
f63ff83556ef
|
c++; |
840
|
757222db298a
|
} |
841
|
757222db298a
|
} |
842
|
f63ff83556ef
|
|
843
|
f63ff83556ef
|
mzd_free(T0); |
844
|
f63ff83556ef
|
m4ri_mm_free(L0); |
845
|
f63ff83556ef
|
mzd_free(T1); |
846
|
f63ff83556ef
|
m4ri_mm_free(L1); |
847
|
a93796ab261d
|
mzd_free(T2); |
848
|
a93796ab261d
|
m4ri_mm_free(L2); |
849
|
a93796ab261d
|
mzd_free(T3); |
850
|
a93796ab261d
|
m4ri_mm_free(L3); |
851
|
01d3fae4e843
|
} |
852
|
01d3fae4e843
|
|
853
|
d358a5c4c9fd
|
packedmatrix *mzd_invert_m4ri(packedmatrix *m, packedmatrix *I, int k) { |
854
|
d358a5c4c9fd
|
packedmatrix *big = mzd_concat(NULL, m, I); |
855
|
1b4de1a6098b
|
size_t size=m->ncols; |
856
|
757222db298a
|
if (k == 0) { |
857
|
35191cb07327
|
k = m4ri_opt_k(m->nrows, m->ncols, 0); |
858
|
757222db298a
|
} |
859
|
1b4de1a6098b
|
size_t twokay=TWOPOW(k); |
860
|
1b4de1a6098b
|
size_t i; |
861
|
1ceebbfe39a8
|
packedmatrix *T=mzd_init(twokay, size*2); |
862
|
1b4de1a6098b
|
size_t *L=(size_t *)m4ri_mm_malloc(twokay * sizeof(size_t)); |
863
|
01d3fae4e843
|
packedmatrix *answer; |
864
|
757222db298a
|
|
865
|
1ceebbfe39a8
|
mzd_reduce_m4ri(big, TRUE, k, T, L); |
866
|
757222db298a
|
|
867
|
01d3fae4e843
|
for(i=0; i < size; i++) { |
868
|
35191cb07327
|
if (!mzd_read_bit(big, i,i )) { |
869
|
35191cb07327
|
answer = NULL; |
870
|
01d3fae4e843
|
break; |
871
|
01d3fae4e843
|
} |
872
|
01d3fae4e843
|
} |
873
|
01d3fae4e843
|
if (i == size) |
874
|
d358a5c4c9fd
|
answer=mzd_submatrix(NULL, big, 0, size, size, size*2); |
875
|
757222db298a
|
|
876
|
1ceebbfe39a8
|
m4ri_mm_free(L); |
877
|
1ceebbfe39a8
|
mzd_free(T); |
878
|
35191cb07327
|
mzd_free(big); |
879
|
01d3fae4e843
|
|
880
|
01d3fae4e843
|
return answer; |
881
|
01d3fae4e843
|
} |
882
|
01d3fae4e843
|
|
883
|
35191cb07327
|
packedmatrix *mzd_mul_m4rm_t(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) { |
884
|
8fd17136fa20
|
packedmatrix *AT, *BT, *CT; |
885
|
01d3fae4e843
|
|
886
|
18f7108f161c
|
if(A->ncols != B->nrows) |
887
|
18f7108f161c
|
m4ri_die("mzd_mul_m4rm_t: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows); |
888
|
01d3fae4e843
|
|
889
|
35191cb07327
|
AT = mzd_transpose(NULL, A); |
890
|
35191cb07327
|
BT = mzd_transpose(NULL, B); |
891
|
757222db298a
|
|
892
|
18f7108f161c
|
CT = mzd_init(B->ncols, A->nrows); |
893
|
67c79e0249fe
|
CT = _mzd_mul_m4rm(CT, BT, AT, k, 0); |
894
|
757222db298a
|
|
895
|
35191cb07327
|
mzd_free(AT); |
896
|
35191cb07327
|
mzd_free(BT); |
897
|
01d3fae4e843
|
|
898
|
35191cb07327
|
C = mzd_transpose(C, CT); |
899
|
35191cb07327
|
mzd_free(CT); |
900
|
01d3fae4e843
|
return C; |
901
|
01d3fae4e843
|
} |
902
|
01d3fae4e843
|
|
903
|
89e907bfb554
|
packedmatrix *mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) { |
904
|
1b4de1a6098b
|
size_t a = A->nrows; |
905
|
1b4de1a6098b
|
size_t c = B->ncols; |
906
|
18f7108f161c
|
|
907
|
18f7108f161c
|
if(A->ncols != B->nrows) |
908
|
130dff8d3482
|
m4ri_die("mzd_mul_m4rm: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows); |
909
|
18f7108f161c
|
if (C == NULL) { |
910
|
18f7108f161c
|
C = mzd_init(a, c); |
911
|
18f7108f161c
|
} else { |
912
|
18f7108f161c
|
if (C->nrows != a || C->ncols != c) |
913
|
18f7108f161c
|
m4ri_die("mzd_mul_m4rm: C (%d x %d) has wrong dimensions.\n", C->nrows, C->ncols); |
914
|
18f7108f161c
|
} |
915
|
67c79e0249fe
|
return _mzd_mul_m4rm(C, A, B, k, TRUE); |
916
|
18f7108f161c
|
} |
917
|
18f7108f161c
|
|
918
|
89e907bfb554
|
packedmatrix *mzd_addmul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k) { |
919
|
1b4de1a6098b
|
size_t a = A->nrows; |
920
|
1b4de1a6098b
|
size_t c = B->ncols; |
921
|
18f7108f161c
|
|
922
|
18f7108f161c
|
if(A->ncols != B->nrows) |
923
|
18f7108f161c
|
m4ri_die("mzd_mul_m4rm A ncols (%d) need to match B nrows (%d) .\n", A->ncols, B->nrows); |
924
|
18f7108f161c
|
if (C == NULL) { |
925
|
18f7108f161c
|
C = mzd_init(a, c); |
926
|
18f7108f161c
|
} else { |
927
|
18f7108f161c
|
if (C->nrows != a || C->ncols != c) |
928
|
18f7108f161c
|
m4ri_die("mzd_mul_m4rm: C has wrong dimensions.\n"); |
929
|
18f7108f161c
|
} |
930
|
67c79e0249fe
|
return _mzd_mul_m4rm(C, A, B, k, FALSE); |
931
|
18f7108f161c
|
} |
932
|
18f7108f161c
|
|
933
|
5404680bea3f
|
#ifdef HAVE_SSE2 |
934
|
0244d50cc50b
|
static inline void _mzd_combine8(word *c, word *t1, word *t2, word *t3, word *t4, word *t5, word *t6, word *t7, word *t8, int wide) { |
935
|
1b4de1a6098b
|
size_t i; |
936
|
5404680bea3f
|
/* assuming t1 ... t8 are aligned, but c might not be */ |
937
|
5404680bea3f
|
if (ALIGNMENT(c,16)==0) { |
938
|
5404680bea3f
|
__m128i *__c = (__m128i*)c; |
939
|
5404680bea3f
|
__m128i *__t1 = (__m128i*)t1; |
940
|
5404680bea3f
|
__m128i *__t2 = (__m128i*)t2; |
941
|
5404680bea3f
|
__m128i *__t3 = (__m128i*)t3; |
942
|
5404680bea3f
|
__m128i *__t4 = (__m128i*)t4; |
943
|
5404680bea3f
|
__m128i *__t5 = (__m128i*)t5; |
944
|
5404680bea3f
|
__m128i *__t6 = (__m128i*)t6; |
945
|
5404680bea3f
|
__m128i *__t7 = (__m128i*)t7; |
946
|
5404680bea3f
|
__m128i *__t8 = (__m128i*)t8; |
947
|
5404680bea3f
|
const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xF); |
948
|
5404680bea3f
|
__m128i xmm1; |
949
|
5404680bea3f
|
|
950
|
5404680bea3f
|
while(__c < eof) { |
951
|
5404680bea3f
|
xmm1 = _mm_xor_si128(*__c, *__t1++); |
952
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t2++); |
953
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t3++); |
954
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t4++); |
955
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t5++); |
956
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t6++); |
957
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t7++); |
958
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t8++); |
959
|
5404680bea3f
|
*__c++ = xmm1; |
960
|
5404680bea3f
|
} |
961
|
5404680bea3f
|
c = (word*)__c; |
962
|
5404680bea3f
|
t1 = (word*)__t1; |
963
|
5404680bea3f
|
t2 = (word*)__t2; |
964
|
5404680bea3f
|
t3 = (word*)__t3; |
965
|
5404680bea3f
|
t4 = (word*)__t4; |
966
|
5404680bea3f
|
t5 = (word*)__t5; |
967
|
5404680bea3f
|
t6 = (word*)__t6; |
968
|
5404680bea3f
|
t7 = (word*)__t7; |
969
|
5404680bea3f
|
t8 = (word*)__t8; |
970
|
5404680bea3f
|
wide = ((sizeof(word)*wide)%16)/sizeof(word); |
971
|
5404680bea3f
|
} |
972
|
5404680bea3f
|
for(i=0; i<wide; i++) { |
973
|
5404680bea3f
|
c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i]; |
974
|
5404680bea3f
|
} |
975
|
5404680bea3f
|
} |
976
|
0244d50cc50b
|
#else |
977
|
0244d50cc50b
|
|
978
|
0244d50cc50b
|
#define _mzd_combine8(c,t1,t2,t3,t4,t5,t6,t7,t8,wide) for(ii=0; ii<wide ; ii++) c[ii] ^= t1[ii] ^ t2[ii] ^ t3[ii] ^ t4[ii] ^ t5[ii] ^ t6[ii] ^ t7[ii] ^ t8[ii] |
979
|
0244d50cc50b
|
|
980
|
5404680bea3f
|
#endif |
981
|
5404680bea3f
|
|
982
|
5404680bea3f
|
#ifdef HAVE_SSE2 |
983
|
0244d50cc50b
|
static inline void _mzd_combine4(word *c, word *t1, word *t2, word *t3, word *t4, size_t wide) { |
984
|
1b4de1a6098b
|
size_t i; |
985
|
0244d50cc50b
|
/* assuming t1 ... t4 are aligned, but c might not be */ |
986
|
5404680bea3f
|
if (ALIGNMENT(c,16)==0) { |
987
|
5404680bea3f
|
__m128i *__c = (__m128i*)c; |
988
|
5404680bea3f
|
__m128i *__t1 = (__m128i*)t1; |
989
|
5404680bea3f
|
__m128i *__t2 = (__m128i*)t2; |
990
|
5404680bea3f
|
__m128i *__t3 = (__m128i*)t3; |
991
|
5404680bea3f
|
__m128i *__t4 = (__m128i*)t4; |
992
|
5404680bea3f
|
const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xF); |
993
|
5404680bea3f
|
__m128i xmm1; |
994
|
5404680bea3f
|
|
995
|
5404680bea3f
|
while(__c < eof) { |
996
|
5404680bea3f
|
xmm1 = _mm_xor_si128(*__c, *__t1++); |
997
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t2++); |
998
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t3++); |
999
|
5404680bea3f
|
xmm1 = _mm_xor_si128(xmm1, *__t4++); |
1000
|
5404680bea3f
|
*__c++ = xmm1; |
1001
|
5404680bea3f
|
} |
1002
|
5404680bea3f
|
c = (word*)__c; |
1003
|
5404680bea3f
|
t1 = (word*)__t1; |
1004
|
5404680bea3f
|
t2 = (word*)__t2; |
1005
|
5404680bea3f
|
t3 = (word*)__t3; |
1006
|
5404680bea3f
|
t4 = (word*)__t4; |
1007
|
5404680bea3f
|
wide = ((sizeof(word)*wide)%16)/sizeof(word); |
1008
|
5404680bea3f
|
} |
1009
|
5404680bea3f
|
for(i=0; i<wide; i++) { |
1010
|
5404680bea3f
|
c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i]; |
1011
|
5404680bea3f
|
} |
1012
|
5404680bea3f
|
} |
1013
|
0244d50cc50b
|
#else |
1014
|
0244d50cc50b
|
|
1015
|
0244d50cc50b
|
#define _mzd_combine4(c, t1, t2, t3, t4, wide) for(ii=0; ii<wide ; ii++) c[ii] ^= t1[ii] ^ t2[ii] ^ t3[ii] ^ t4[ii] |
1016
|
0244d50cc50b
|
|
1017
|
5404680bea3f
|
#endif //HAVE_SSE2 |
1018
|
5404680bea3f
|
|
1019
|
5404680bea3f
|
#ifdef HAVE_SSE2 |
1020
|
0244d50cc50b
|
static inline void _mzd_combine2(word *c, word *t1, word *t2, size_t wide) { |
1021
|
0244d50cc50b
|
size_t i; |
1022
|
0244d50cc50b
|
/* assuming t1 ... t2 are aligned, but c might not be */ |
1023
|
0244d50cc50b
|
if (ALIGNMENT(c,16)==0) { |
1024
|
0244d50cc50b
|
__m128i *__c = (__m128i*)c; |
1025
|
0244d50cc50b
|
__m128i *__t1 = (__m128i*)t1; |
1026
|
0244d50cc50b
|
__m128i *__t2 = (__m128i*)t2; |
1027
|
0244d50cc50b
|
const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xF); |
1028
|
0244d50cc50b
|
__m128i xmm1; |
1029
|
0244d50cc50b
|
|
1030
|
0244d50cc50b
|
while(__c < eof) { |
1031
|
0244d50cc50b
|
xmm1 = _mm_xor_si128(*__c, *__t1++); |
1032
|
0244d50cc50b
|
xmm1 = _mm_xor_si128(xmm1, *__t2++); |
1033
|
0244d50cc50b
|
*__c++ = xmm1; |
1034
|
0244d50cc50b
|
} |
1035
|
0244d50cc50b
|
c = (word*)__c; |
1036
|
0244d50cc50b
|
t1 = (word*)__t1; |
1037
|
0244d50cc50b
|
t2 = (word*)__t2; |
1038
|
0244d50cc50b
|
wide = ((sizeof(word)*wide)%16)/sizeof(word); |
1039
|
0244d50cc50b
|
} |
1040
|
0244d50cc50b
|
for(i=0; i<wide; i++) { |
1041
|
0244d50cc50b
|
c[i] ^= t1[i] ^ t2[i]; |
1042
|
0244d50cc50b
|
} |
1043
|
0244d50cc50b
|
} |
1044
|
0244d50cc50b
|
#else |
1045
|
0244d50cc50b
|
|
1046
|
0244d50cc50b
|
#define _mzd_combine2(c, t1, t2, wide) for(ii=0; ii<wide ; ii++) c[ii] ^= t1[ii] ^ t2[ii] |
1047
|
0244d50cc50b
|
|
1048
|
0244d50cc50b
|
#endif //HAVE_SSE2 |
1049
|
0244d50cc50b
|
|
1050
|
5404680bea3f
|
|
1051
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1052
|
0244d50cc50b
|
#define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide) |
1053
|
51c42f6de1ea
|
#else //M4RM_GRAY8 |
1054
|
0244d50cc50b
|
#define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide) |
1055
|
51c42f6de1ea
|
#endif //M4RM_GRAY8 |
1056
|
5404680bea3f
|
|
1057
|
67c79e0249fe
|
packedmatrix *_mzd_mul_m4rm(packedmatrix *C, packedmatrix *A, packedmatrix *B, int k, int clear) { |
1058
|
f63ff83556ef
|
/** |
1059
|
f63ff83556ef
|
* The algorithm proceeds as follows: |
1060
|
f63ff83556ef
|
* |
1061
|
f63ff83556ef
|
* Step 1. Make a Gray code table of all the \f$2^k\f$ linear combinations |
1062
|
f63ff83556ef
|
* of the \f$k\f$ rows of \f$B_i\f$. Call the \f$x\f$-th row |
1063
|
f63ff83556ef
|
* \f$T_x\f$. |
1064
|
f63ff83556ef
|
* |
1065
|
f63ff83556ef
|
* Step 2. Read the entries |
1066
|
f63ff83556ef
|
* \f$a_{j,(i-1)k+1}, a_{j,(i-1)k+2} , ... , a_{j,(i-1)k+k}.\f$ |
1067
|
f63ff83556ef
|
* |
1068
|
f63ff83556ef
|
* Let \f$x\f$ be the \f$k\f$ bit binary number formed by the |
1069
|
f63ff83556ef
|
* concatenation of \f$a_{j,(i-1)k+1}, ... , a_{j,ik}\f$. |
1070
|
f63ff83556ef
|
* |
1071
|
f63ff83556ef
|
* Step 3. for \f$h = 1,2, ... , c\f$ do |
1072
|
f63ff83556ef
|
* calculate \f$C_{jh} = C_{jh} + T_{xh}\f$. |
1073
|
f63ff83556ef
|
*/ |
1074
|
a9653d7bf249
|
assert(C->offset==0); |
1075
|
1b4de1a6098b
|
size_t i,j; |
1076
|
1b4de1a6098b
|
size_t ii; |
1077
|
48be69fd8a7a
|
unsigned int x1, x2, x3, x4; |
1078
|
48be69fd8a7a
|
word *t1, *t2, *t3, *t4; |
1079
|
48be69fd8a7a
|
|
1080
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1081
|
48be69fd8a7a
|
unsigned int x5, x6, x7, x8; |
1082
|
48be69fd8a7a
|
word *t5, *t6, *t7, *t8; |
1083
|
48be69fd8a7a
|
#endif |
1084
|
48be69fd8a7a
|
|
1085
|
48be69fd8a7a
|
word *c; |
1086
|
c9090f72dd57
|
|
1087
|
1b4de1a6098b
|
size_t a_nr = A->nrows; |
1088
|
1b4de1a6098b
|
size_t a_nc = A->ncols; |
1089
|
1b4de1a6098b
|
size_t b_nc = B->ncols; |
1090
|
c9090f72dd57
|
|
1091
|
c9090f72dd57
|
if (b_nc < RADIX-10) { |
1092
|
eb19d7b1b678
|
if(clear) |
1093
|
bc105b372f12
|
return mzd_mul_naive(C, A, B); |
1094
|
eb19d7b1b678
|
else |
1095
|
bc105b372f12
|
return mzd_addmul_naive(C, A, B); |
1096
|
a9653d7bf249
|
} else if (a_nr < 16) { |
1097
|
a9653d7bf249
|
return _mzd_mul_va(C, A, B, clear); |
1098
|
c9090f72dd57
|
} |
1099
|
c9090f72dd57
|
|
1100
|
1b4de1a6098b
|
size_t wide = C->width; |
1101
|
c9090f72dd57
|
|
1102
|
c9090f72dd57
|
/* clear first */ |
1103
|
c9090f72dd57
|
if (clear) { |
1104
|
a9653d7bf249
|
mzd_set_ui(C, 0); |
1105
|
c9090f72dd57
|
} |
1106
|
c9090f72dd57
|
|
1107
|
1b4de1a6098b
|
const size_t blocksize = MZD_MUL_BLOCKSIZE; |
1108
|
c9090f72dd57
|
|
1109
|
c9090f72dd57
|
if (k == 0) { |
1110
|
c9090f72dd57
|
k = m4ri_opt_k(blocksize, a_nc, b_nc); |
1111
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1112
|
48253acb1b66
|
if (k>3) |
1113
|
48253acb1b66
|
k -= 2; |
1114
|
48253acb1b66
|
/* reduce k further if that has a chance of hitting L1 */ |
1115
|
48253acb1b66
|
const size_t tsize = (int)(0.8*(TWOPOW(k) * b_nc)); |
1116
|
48253acb1b66
|
if(tsize > CPU_L1_CACHE && tsize/2 <= CPU_L1_CACHE) |
1117
|
48253acb1b66
|
k -= 1; |
1118
|
48be69fd8a7a
|
#else |
1119
|
48be69fd8a7a
|
if (k>2) |
1120
|
48be69fd8a7a
|
k -= 1; |
1121
|
48be69fd8a7a
|
#endif |
1122
|
c9090f72dd57
|
} |
1123
|
762af6cbe158
|
|
1124
|
762af6cbe158
|
#ifndef M4RM_GRAY8 |
1125
|
f9845bc6e485
|
size_t *buffer = (size_t*)m4ri_mm_malloc(4 * TWOPOW(k) * sizeof(size_t)); |
1126
|
762af6cbe158
|
#else |
1127
|
f9845bc6e485
|
size_t *buffer = (size_t*)m4ri_mm_malloc(8 * TWOPOW(k) * sizeof(size_t)); |
1128
|
762af6cbe158
|
#endif |
1129
|
762af6cbe158
|
|
1130
|
c9090f72dd57
|
packedmatrix *T1 = mzd_init(TWOPOW(k), b_nc); |
1131
|
1b4de1a6098b
|
size_t *L1 = buffer; |
1132
|
c9090f72dd57
|
packedmatrix *T2 = mzd_init(TWOPOW(k), b_nc); |
1133
|
1b4de1a6098b
|
size_t *L2 = buffer + 1*TWOPOW(k); |
1134
|
411816376bce
|
packedmatrix *T3 = mzd_init(TWOPOW(k), b_nc); |
1135
|
1b4de1a6098b
|
size_t *L3 = buffer + 2*TWOPOW(k); |
1136
|
411816376bce
|
packedmatrix *T4 = mzd_init(TWOPOW(k), b_nc); |
1137
|
1b4de1a6098b
|
size_t *L4 = buffer + 3*TWOPOW(k); |
1138
|
48be69fd8a7a
|
|
1139
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1140
|
411816376bce
|
packedmatrix *T5 = mzd_init(TWOPOW(k), b_nc); |
1141
|
1b4de1a6098b
|
size_t *L5 = buffer + 4*TWOPOW(k); |
1142
|
411816376bce
|
packedmatrix *T6 = mzd_init(TWOPOW(k), b_nc); |
1143
|
1b4de1a6098b
|
size_t *L6 = buffer + 5*TWOPOW(k); |
1144
|
411816376bce
|
packedmatrix *T7 = mzd_init(TWOPOW(k), b_nc); |
1145
|
1b4de1a6098b
|
size_t *L7 = buffer + 6*TWOPOW(k); |
1146
|
411816376bce
|
packedmatrix *T8 = mzd_init(TWOPOW(k), b_nc); |
1147
|
1b4de1a6098b
|
size_t *L8 = buffer + 7*TWOPOW(k); |
1148
|
48be69fd8a7a
|
#endif |
1149
|
c9090f72dd57
|
|
1150
|
c9090f72dd57
|
/* process stuff that fits into multiple of k first, but blockwise (babystep-giantstep)*/ |
1151
|
f9845bc6e485
|
size_t babystep, giantstep; |
1152
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1153
|
411816376bce
|
const int kk = 8*k; |
1154
|
48be69fd8a7a
|
#else |
1155
|
48be69fd8a7a
|
const int kk = 4*k; |
1156
|
48be69fd8a7a
|
#endif |
1157
|
f9845bc6e485
|
const size_t end = a_nc/kk; |
1158
|
c9090f72dd57
|
|
1159
|
c9090f72dd57
|
for (giantstep=0; giantstep + blocksize <= a_nr; giantstep += blocksize) { |
1160
|
c9090f72dd57
|
for(i=0; i < end; i++) { |
1161
|
570d0958b479
|
mzd_make_table( B, i*kk, 0, k, T1, L1); |
1162
|
570d0958b479
|
mzd_make_table( B, i*kk+k, 0, k, T2, L2); |
1163
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k, 0, k, T3, L3); |
1164
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k, 0, k, T4, L4); |
1165
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1166
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k, 0, k, T5, L5); |
1167
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k+k, 0, k, T6, L6); |
1168
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k+k+k, 0, k, T7, L7); |
1169
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k+k+k+k, 0, k, T8, L8); |
1170
|
48be69fd8a7a
|
#endif |
1171
|
0244d50cc50b
|
|
1172
|
c9090f72dd57
|
for(babystep = 0; babystep < blocksize; babystep++) { |
1173
|
c9090f72dd57
|
j = giantstep + babystep; |
1174
|
41bd093d9fc0
|
x1 = L1[ (int)mzd_read_bits(A, j, i*kk, k) ]; |
1175
|
41bd093d9fc0
|
x2 = L2[ (int)mzd_read_bits(A, j, i*kk+k, k) ]; |
1176
|
41bd093d9fc0
|
x3 = L3[ (int)mzd_read_bits(A, j, i*kk+k+k, k) ]; |
1177
|
41bd093d9fc0
|
x4 = L4[ (int)mzd_read_bits(A, j, i*kk+k+k+k, k) ]; |
1178
|
0244d50cc50b
|
#ifdef M4RM_GRAY8 |
1179
|
41bd093d9fc0
|
x5 = L5[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k, k) ]; |
1180
|
41bd093d9fc0
|
x6 = L6[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k, k) ]; |
1181
|
41bd093d9fc0
|
x7 = L7[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k, k) ]; |
1182
|
41bd093d9fc0
|
x8 = L8[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k+k, k) ]; |
1183
|
48be69fd8a7a
|
#endif |
1184
|
c9090f72dd57
|
c = C->values + C->rowswap[j]; |
1185
|
c9090f72dd57
|
t1 = T1->values + T1->rowswap[x1]; |
1186
|
c9090f72dd57
|
t2 = T2->values + T2->rowswap[x2]; |
1187
|
411816376bce
|
t3 = T3->values + T3->rowswap[x3]; |
1188
|
411816376bce
|
t4 = T4->values + T4->rowswap[x4]; |
1189
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1190
|
411816376bce
|
t5 = T5->values + T5->rowswap[x5]; |
1191
|
411816376bce
|
t6 = T6->values + T6->rowswap[x6]; |
1192
|
411816376bce
|
t7 = T7->values + T7->rowswap[x7]; |
1193
|
411816376bce
|
t8 = T8->values + T8->rowswap[x8]; |
1194
|
48be69fd8a7a
|
#endif |
1195
|
5404680bea3f
|
_MZD_COMBINE; |
1196
|
c9090f72dd57
|
} |
1197
|
c9090f72dd57
|
} |
1198
|
c9090f72dd57
|
} |
1199
|
c9090f72dd57
|
|
1200
|
c9090f72dd57
|
for(i=0; i < end; i++) { |
1201
|
570d0958b479
|
mzd_make_table( B, i*kk, 0, k, T1, L1); |
1202
|
570d0958b479
|
mzd_make_table( B, i*kk+k, 0, k, T2, L2); |
1203
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k, 0, k, T3, L3); |
1204
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k, 0, k, T4, L4); |
1205
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1206
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k, 0, k, T5, L5); |
1207
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k+k, 0, k, T6, L6); |
1208
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k+k+k, 0, k, T7, L7); |
1209
|
570d0958b479
|
mzd_make_table( B, i*kk+k+k+k+k+k+k+k, 0, k, T8, L8); |
1210
|
48be69fd8a7a
|
#endif |
1211
|
c9090f72dd57
|
for(babystep = 0; babystep < a_nr - giantstep; babystep++) { |
1212
|
c9090f72dd57
|
j = giantstep + babystep; |
1213
|
41bd093d9fc0
|
x1 = L1[ (int)mzd_read_bits(A, j, i*kk, k) ]; |
1214
|
41bd093d9fc0
|
x2 = L2[ (int)mzd_read_bits(A, j, i*kk+k, k) ]; |
1215
|
41bd093d9fc0
|
x3 = L3[ (int)mzd_read_bits(A, j, i*kk+k+k, k) ]; |
1216
|
41bd093d9fc0
|
x4 = L4[ (int)mzd_read_bits(A, j, i*kk+k+k+k, k) ]; |
1217
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1218
|
41bd093d9fc0
|
x5 = L5[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k, k) ]; |
1219
|
41bd093d9fc0
|
x6 = L6[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k, k) ]; |
1220
|
41bd093d9fc0
|
x7 = L7[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k, k) ]; |
1221
|
41bd093d9fc0
|
x8 = L8[ (int)mzd_read_bits(A, j, i*kk+k+k+k+k+k+k+k, k) ]; |
1222
|
48be69fd8a7a
|
#endif |
1223
|
c9090f72dd57
|
c = C->values + C->rowswap[j]; |
1224
|
c9090f72dd57
|
t1 = T1->values + T1->rowswap[x1]; |
1225
|
c9090f72dd57
|
t2 = T2->values + T2->rowswap[x2]; |
1226
|
411816376bce
|
t3 = T3->values + T3->rowswap[x3]; |
1227
|
411816376bce
|
t4 = T4->values + T4->rowswap[x4]; |
1228
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1229
|
411816376bce
|
t5 = T5->values + T5->rowswap[x5]; |
1230
|
411816376bce
|
t6 = T6->values + T6->rowswap[x6]; |
1231
|
411816376bce
|
t7 = T7->values + T7->rowswap[x7]; |
1232
|
411816376bce
|
t8 = T8->values + T8->rowswap[x8]; |
1233
|
48be69fd8a7a
|
#endif |
1234
|
5404680bea3f
|
_MZD_COMBINE; |
1235
|
c9090f72dd57
|
} |
1236
|
c9090f72dd57
|
} |
1237
|
c9090f72dd57
|
|
1238
|
ff9a7b521b90
|
/* handle stuff that doesn't fit into multiple of kk */ |
1239
|
c9090f72dd57
|
if (a_nc%kk) { |
1240
|
ff9a7b521b90
|
for (i=end*kk/k; i < (a_nc)/k; i++) { |
1241
|
570d0958b479
|
mzd_make_table( B, i*k, 0, k, T1, L1); |
1242
|
ff9a7b521b90
|
for(j = 0; j<a_nr; j++) { |
1243
|
41bd093d9fc0
|
x1 = L1[ (int)mzd_read_bits(A, j, i*k, k) ]; |
1244
|
ff9a7b521b90
|
c = C->values + C->rowswap[j]; |
1245
|
ff9a7b521b90
|
t1 = T1->values + T1->rowswap[x1]; |
1246
|
ff9a7b521b90
|
for(ii=0; ii<wide; ii++) { |
1247
|
ff9a7b521b90
|
c[ii] ^= t1[ii]; |
1248
|
ff9a7b521b90
|
} |
1249
|
ff9a7b521b90
|
} |
1250
|
ff9a7b521b90
|
} |
1251
|
ff9a7b521b90
|
/* handle stuff that doesn't fit into multiple of k */ |
1252
|
ff9a7b521b90
|
if (a_nc%k) { |
1253
|
570d0958b479
|
mzd_make_table( B, a_nc/k * k , 0, a_nc%k, T1, L1); |
1254
|
ff9a7b521b90
|
for(j = 0; j<a_nr; j++) { |
1255
|
41bd093d9fc0
|
x1 = L1[ (int)mzd_read_bits(A, j, i*k, a_nc%k) ]; |
1256
|
ff9a7b521b90
|
c = C->values + C->rowswap[j]; |
1257
|
ff9a7b521b90
|
t1 = T1->values + T1->rowswap[x1]; |
1258
|
ff9a7b521b90
|
for(ii=0; ii<wide; ii++) { |
1259
|
ff9a7b521b90
|
c[ii] ^= t1[ii]; |
1260
|
ff9a7b521b90
|
} |
1261
|
ff9a7b521b90
|
} |
1262
|
c9090f72dd57
|
} |
1263
|
c9090f72dd57
|
} |
1264
|
c9090f72dd57
|
|
1265
|
c9090f72dd57
|
mzd_free(T1); |
1266
|
c9090f72dd57
|
mzd_free(T2); |
1267
|
411816376bce
|
mzd_free(T3); |
1268
|
411816376bce
|
mzd_free(T4); |
1269
|
51c42f6de1ea
|
#ifdef M4RM_GRAY8 |
1270
|
411816376bce
|
mzd_free(T5); |
1271
|
411816376bce
|
mzd_free(T6); |
1272
|
411816376bce
|
mzd_free(T7); |
1273
|
411816376bce
|
mzd_free(T8); |
1274
|
48be69fd8a7a
|
#endif |
1275
|
762af6cbe158
|
m4ri_mm_free(buffer); |
1276
|
c9090f72dd57
|
return C; |
1277
|
c9090f72dd57
|
} |
1278
|
c9090f72dd57
|
|