MMM.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_DENSE_MMM_H_
36 #define _BLAZE_MATH_DENSE_MMM_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/Aliases.h>
61 #include <blaze/math/shims/IsOne.h>
63 #include <blaze/math/SIMD.h>
68 #include <blaze/system/Blocking.h>
70 #include <blaze/util/Assert.h>
72 #include <blaze/util/Types.h>
74 
75 
76 namespace blaze {
77 
78 //=================================================================================================
79 //
80 // GENERAL DENSE MATRIX MULTIPLICATION KERNELS
81 //
82 //=================================================================================================
83 
84 //*************************************************************************************************
// Blocked dense matrix multiplication kernel for the DenseMatrix<MT1,false> target overload.
//
// What the visible code does:
//  1. C is reset when isDefault( beta ) holds; otherwise it is multiplied by beta unless
//     isOne( beta ) holds.
//  2. The shared dimension K = A.columns() is walked in panels of at most KBLOCK columns. Each
//     panel of A is copied into the buffer A2 and each block of B (at most JBLOCK columns) is
//     copied into the buffer B2.
//  3. Unrolled SIMD tiles multiply A2.load(i,k) with B2.load(k,j), accumulate over k, and add
//     sum( accumulator ) * alpha to the matching element of C.
//  4. If 'remainder' is set, the columns left over by step 2 are processed with plain element
//     access A2(i,k) * B2(k,j) * alpha.
//
// \param C     Target matrix, updated in place.
// \param A     Left-hand side operand (A.rows() x A.columns()).
// \param B     Right-hand side operand; A.columns() == B.rows() is asserted.
// \param alpha Factor applied to every accumulated product.
// \param beta  Factor applied to the previous contents of C.
//
// NOTE(review): the listing skips line numbers (e.g. 110 -> 115), so some statements of the
// original source file are not visible in this body.
103 template< typename MT1, typename MT2, typename MT3, typename ST >
104 void mmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
105 {
106  using ET1 = ElementType_<MT1>;
107  using ET2 = ElementType_<MT2>;
108  using ET3 = ElementType_<MT3>;
109  using SIMDType = SIMDTrait_<ET1>;
110 
115 
118 
121 
124 
125  enum : size_t { SIMDSIZE = SIMDTrait<ET1>::size };
126 
// 'remainder' is true unless both operand types report IsPadded; it enables the scalar tail below.
127  constexpr bool remainder( !IsPadded<MT2>::value || !IsPadded<MT3>::value );
128 
129  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
130  constexpr size_t JBLOCK( MMM_INNER_BLOCK_SIZE );
131 
// Both block sizes must be whole multiples of the SIMD width.
132  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
133  BLAZE_STATIC_ASSERT( JBLOCK >= SIMDSIZE && JBLOCK % SIMDSIZE == 0UL );
134 
135  const size_t M( A.rows() );
136  const size_t N( B.columns() );
137  const size_t K( A.columns() );
138 
139  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
140 
// Scratch buffers for the current panel of A and block of B. They use opposite storage-order
// flags (presumably A2 row-major and B2 column-major, so that both load() calls below read
// along k -- TODO confirm).
141  DynamicMatrix<ET2,false> A2( M, KBLOCK );
142  DynamicMatrix<ET3,true> B2( KBLOCK, JBLOCK );
143 
// Apply beta to the existing contents of C before anything is accumulated.
144  if( isDefault( beta ) ) {
145  reset( ~C );
146  }
147  else if( !isOne( beta ) ) {
148  (~C) *= beta;
149  }
150 
151  size_t kk( 0UL );
152  size_t kblock( 0UL );
153 
// Main loop over K. With 'remainder' set it only runs while at least SIMDSIZE columns are left.
154  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
155  {
// With 'remainder' set, a final partial panel is rounded down to a multiple of SIMDSIZE
// (the mask assumes SIMDSIZE is a power of two -- TODO confirm).
156  if( remainder ) {
157  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
158  }
159  else {
160  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
161  }
162 
// Row range of A used for this panel; narrowed when MT2 is flagged IsLower or IsUpper.
163  const size_t ibegin( IsLower<MT2>::value ? kk : 0UL );
164  const size_t iend ( IsUpper<MT2>::value ? kk+kblock : M );
165  const size_t isize ( iend - ibegin );
166 
167  A2 = serial( submatrix<!remainder>( A, ibegin, kk, isize, kblock ) );
168 
169  size_t jj( 0UL );
170  size_t jblock( 0UL );
171 
172  while( jj < N )
173  {
174  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
175 
// Skip blocks of B selected by the IsLower/IsUpper tests (presumably structurally zero there).
176  if( ( IsLower<MT3>::value && kk+kblock <= jj ) ||
177  ( IsUpper<MT3>::value && jj+jblock <= kk ) ) {
178  jj += jblock;
179  continue;
180  }
181 
182  B2 = serial( submatrix<!remainder>( B, kk, jj, kblock, jblock ) );
183 
184  size_t i( 0UL );
185 
// Floating point element types use tiles of 5 rows, all other element types tiles of 4 rows.
// NOTE(review): the accumulators below are declared without an initializer; presumably
// SIMDType default-constructs to zero -- confirm.
186  if( IsFloatingPoint<ET1>::value )
187  {
188  for( ; (i+5UL) <= isize; i+=5UL )
189  {
190  size_t j( 0UL );
191 
// 5x2 tile.
192  for( ; (j+2UL) <= jblock; j+=2UL )
193  {
194  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
195 
196  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
197  {
198  const SIMDType a1( A2.load(i ,k) );
199  const SIMDType a2( A2.load(i+1UL,k) );
200  const SIMDType a3( A2.load(i+2UL,k) );
201  const SIMDType a4( A2.load(i+3UL,k) );
202  const SIMDType a5( A2.load(i+4UL,k) );
203 
204  const SIMDType b1( B2.load(k,j ) );
205  const SIMDType b2( B2.load(k,j+1UL) );
206 
207  xmm1 += a1 * b1;
208  xmm2 += a1 * b2;
209  xmm3 += a2 * b1;
210  xmm4 += a2 * b2;
211  xmm5 += a3 * b1;
212  xmm6 += a3 * b2;
213  xmm7 += a4 * b1;
214  xmm8 += a4 * b2;
215  xmm9 += a5 * b1;
216  xmm10 += a5 * b2;
217  }
218 
219  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
220  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
221  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
222  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
223  (~C)(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
224  (~C)(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
225  (~C)(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
226  (~C)(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
227  (~C)(ibegin+i+4UL,jj+j ) += sum( xmm9 ) * alpha;
228  (~C)(ibegin+i+4UL,jj+j+1UL) += sum( xmm10 ) * alpha;
229  }
230 
// 5x1 tile for a single left-over column of the block.
231  if( j<jblock )
232  {
233  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
234 
235  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
236  {
237  const SIMDType a1( A2.load(i ,k) );
238  const SIMDType a2( A2.load(i+1UL,k) );
239  const SIMDType a3( A2.load(i+2UL,k) );
240  const SIMDType a4( A2.load(i+3UL,k) );
241  const SIMDType a5( A2.load(i+4UL,k) );
242 
243  const SIMDType b1( B2.load(k,j) );
244 
245  xmm1 += a1 * b1;
246  xmm2 += a2 * b1;
247  xmm3 += a3 * b1;
248  xmm4 += a4 * b1;
249  xmm5 += a5 * b1;
250  }
251 
252  (~C)(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
253  (~C)(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
254  (~C)(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
255  (~C)(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
256  (~C)(ibegin+i+4UL,jj+j) += sum( xmm5 ) * alpha;
257  }
258  }
259  }
260  else
261  {
262  for( ; (i+4UL) <= isize; i+=4UL )
263  {
264  size_t j( 0UL );
265 
// 4x2 tile.
266  for( ; (j+2UL) <= jblock; j+=2UL )
267  {
268  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
269 
270  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
271  {
272  const SIMDType a1( A2.load(i ,k) );
273  const SIMDType a2( A2.load(i+1UL,k) );
274  const SIMDType a3( A2.load(i+2UL,k) );
275  const SIMDType a4( A2.load(i+3UL,k) );
276 
277  const SIMDType b1( B2.load(k,j ) );
278  const SIMDType b2( B2.load(k,j+1UL) );
279 
280  xmm1 += a1 * b1;
281  xmm2 += a1 * b2;
282  xmm3 += a2 * b1;
283  xmm4 += a2 * b2;
284  xmm5 += a3 * b1;
285  xmm6 += a3 * b2;
286  xmm7 += a4 * b1;
287  xmm8 += a4 * b2;
288  }
289 
290  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
291  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
292  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
293  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
294  (~C)(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
295  (~C)(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
296  (~C)(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
297  (~C)(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
298  }
299 
// 4x1 tile for a single left-over column of the block.
300  if( j<jblock )
301  {
302  SIMDType xmm1, xmm2, xmm3, xmm4;
303 
304  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
305  {
306  const SIMDType a1( A2.load(i ,k) );
307  const SIMDType a2( A2.load(i+1UL,k) );
308  const SIMDType a3( A2.load(i+2UL,k) );
309  const SIMDType a4( A2.load(i+3UL,k) );
310 
311  const SIMDType b1( B2.load(k,j) );
312 
313  xmm1 += a1 * b1;
314  xmm2 += a2 * b1;
315  xmm3 += a3 * b1;
316  xmm4 += a4 * b1;
317  }
318 
319  (~C)(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
320  (~C)(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
321  (~C)(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
322  (~C)(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
323  }
324  }
325  }
326 
// Rows not covered by the 5- or 4-row tiles are handled two at a time: 2x4, 2x2, then 2x1 tiles.
327  for( ; (i+2UL) <= isize; i+=2UL )
328  {
329  size_t j( 0UL );
330 
331  for( ; (j+4UL) <= jblock; j+=4UL )
332  {
333  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
334 
335  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
336  {
337  const SIMDType a1( A2.load(i ,k) );
338  const SIMDType a2( A2.load(i+1UL,k) );
339 
340  const SIMDType b1( B2.load(k,j ) );
341  const SIMDType b2( B2.load(k,j+1UL) );
342  const SIMDType b3( B2.load(k,j+2UL) );
343  const SIMDType b4( B2.load(k,j+3UL) );
344 
345  xmm1 += a1 * b1;
346  xmm2 += a1 * b2;
347  xmm3 += a1 * b3;
348  xmm4 += a1 * b4;
349  xmm5 += a2 * b1;
350  xmm6 += a2 * b2;
351  xmm7 += a2 * b3;
352  xmm8 += a2 * b4;
353  }
354 
355  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
356  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
357  (~C)(ibegin+i ,jj+j+2UL) += sum( xmm3 ) * alpha;
358  (~C)(ibegin+i ,jj+j+3UL) += sum( xmm4 ) * alpha;
359  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm5 ) * alpha;
360  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm6 ) * alpha;
361  (~C)(ibegin+i+1UL,jj+j+2UL) += sum( xmm7 ) * alpha;
362  (~C)(ibegin+i+1UL,jj+j+3UL) += sum( xmm8 ) * alpha;
363  }
364 
365  for( ; (j+2UL) <= jblock; j+=2UL )
366  {
367  SIMDType xmm1, xmm2, xmm3, xmm4;
368 
369  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
370  {
371  const SIMDType a1( A2.load(i ,k) );
372  const SIMDType a2( A2.load(i+1UL,k) );
373 
374  const SIMDType b1( B2.load(k,j ) );
375  const SIMDType b2( B2.load(k,j+1UL) );
376 
377  xmm1 += a1 * b1;
378  xmm2 += a1 * b2;
379  xmm3 += a2 * b1;
380  xmm4 += a2 * b2;
381  }
382 
383  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
384  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
385  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
386  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
387  }
388 
389  if( j<jblock )
390  {
391  SIMDType xmm1, xmm2;
392 
393  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
394  {
395  const SIMDType a1( A2.load(i ,k) );
396  const SIMDType a2( A2.load(i+1UL,k) );
397 
398  const SIMDType b1( B2.load(k,j) );
399 
400  xmm1 += a1 * b1;
401  xmm2 += a2 * b1;
402  }
403 
404  (~C)(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
405  (~C)(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
406  }
407  }
408 
// Last single row of the panel (at most one row can remain here): 1x2 tiles, then a 1x1 tile.
409  if( i<isize )
410  {
411  size_t j( 0UL );
412 
413  for( ; (j+2UL) <= jblock; j+=2UL )
414  {
415  SIMDType xmm1, xmm2;
416 
417  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
418  {
419  const SIMDType a1( A2.load(i,k) );
420 
421  xmm1 += a1 * B2.load(k,j );
422  xmm2 += a1 * B2.load(k,j+1UL);
423  }
424 
425  (~C)(ibegin+i,jj+j ) += sum( xmm1 ) * alpha;
426  (~C)(ibegin+i,jj+j+1UL) += sum( xmm2 ) * alpha;
427  }
428 
429  if( j<jblock )
430  {
431  SIMDType xmm1;
432 
433  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
434  {
435  const SIMDType a1( A2.load(i,k) );
436 
437  xmm1 += a1 * B2.load(k,j);
438  }
439 
440  (~C)(ibegin+i,jj+j) += sum( xmm1 ) * alpha;
441  }
442  }
443 
444  jj += jblock;
445  }
446 
447  kk += kblock;
448  }
449 
// Scalar tail: the K-kk columns (fewer than SIMDSIZE) that the SIMD loop above left over.
// It mirrors the tile structure above but uses plain element access instead of SIMD loads.
450  if( remainder && kk < K )
451  {
452  const size_t ksize( K - kk );
453 
454  const size_t ibegin( IsLower<MT2>::value ? kk : 0UL );
455  const size_t isize ( M - ibegin );
456 
457  A2 = serial( submatrix( A, ibegin, kk, isize, ksize ) );
458 
459  size_t jj( 0UL );
460  size_t jblock( 0UL );
461 
462  while( jj < N )
463  {
464  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
465 
466  if( IsUpper<MT3>::value && jj+jblock <= kk ) {
467  jj += jblock;
468  continue;
469  }
470 
471  B2 = serial( submatrix( B, kk, jj, ksize, jblock ) );
472 
473  size_t i( 0UL );
474 
475  if( IsFloatingPoint<ET1>::value )
476  {
477  for( ; (i+5UL) <= isize; i+=5UL )
478  {
479  size_t j( 0UL );
480 
481  for( ; (j+2UL) <= jblock; j+=2UL ) {
482  for( size_t k=0UL; k<ksize; ++k ) {
483  (~C)(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
484  (~C)(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
485  (~C)(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
486  (~C)(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
487  (~C)(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
488  (~C)(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
489  (~C)(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
490  (~C)(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
491  (~C)(ibegin+i+4UL,jj+j ) += A2(i+4UL,k) * B2(k,j ) * alpha;
492  (~C)(ibegin+i+4UL,jj+j+1UL) += A2(i+4UL,k) * B2(k,j+1UL) * alpha;
493  }
494  }
495 
496  if( j<jblock ) {
497  for( size_t k=0UL; k<ksize; ++k ) {
498  (~C)(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
499  (~C)(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
500  (~C)(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
501  (~C)(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
502  (~C)(ibegin+i+4UL,jj+j) += A2(i+4UL,k) * B2(k,j) * alpha;
503  }
504  }
505  }
506  }
507  else
508  {
509  for( ; (i+4UL) <= isize; i+=4UL )
510  {
511  size_t j( 0UL );
512 
513  for( ; (j+2UL) <= jblock; j+=2UL ) {
514  for( size_t k=0UL; k<ksize; ++k ) {
515  (~C)(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
516  (~C)(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
517  (~C)(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
518  (~C)(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
519  (~C)(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
520  (~C)(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
521  (~C)(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
522  (~C)(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
523  }
524  }
525 
526  if( j<jblock ) {
527  for( size_t k=0UL; k<ksize; ++k ) {
528  (~C)(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
529  (~C)(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
530  (~C)(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
531  (~C)(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
532  }
533  }
534  }
535  }
536 
537  for( ; (i+2UL) <= isize; i+=2UL )
538  {
539  size_t j( 0UL );
540 
541  for( ; (j+2UL) <= jblock; j+=2UL ) {
542  for( size_t k=0UL; k<ksize; ++k ) {
543  (~C)(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
544  (~C)(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
545  (~C)(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
546  (~C)(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
547  }
548  }
549 
550  if( j<jblock ) {
551  for( size_t k=0UL; k<ksize; ++k ) {
552  (~C)(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
553  (~C)(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
554  }
555  }
556  }
557 
558  if( i<isize )
559  {
560  size_t j( 0UL );
561 
562  for( ; (j+2UL) <= jblock; j+=2UL ) {
563  for( size_t k=0UL; k<ksize; ++k ) {
564  (~C)(ibegin+i,jj+j ) += A2(i,k) * B2(k,j ) * alpha;
565  (~C)(ibegin+i,jj+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
566  }
567  }
568 
569  if( j<jblock ) {
570  for( size_t k=0UL; k<ksize; ++k ) {
571  (~C)(ibegin+i,jj+j) += A2(i,k) * B2(k,j) * alpha;
572  }
573  }
574  }
575 
576  jj += jblock;
577  }
578  }
579 }
581 //*************************************************************************************************
582 
583 
584 //*************************************************************************************************
// Blocked dense matrix multiplication kernel for the DenseMatrix<MT1,true> target overload.
//
// What the visible code does:
//  1. C is reset when isDefault( beta ) holds; otherwise it is multiplied by beta unless
//     isOne( beta ) holds.
//  2. The shared dimension K = A.columns() is walked in panels of at most KBLOCK rows of B.
//     Each panel of B is copied into the buffer B2 and each block of A (at most IBLOCK rows)
//     is copied into the buffer A2.
//  3. Unrolled SIMD tiles multiply A2.load(i,k) with B2.load(k,j), accumulate over k, and add
//     sum( accumulator ) * alpha to the matching element of C.
//  4. If 'remainder' is set, the part of K left over by step 2 is processed with plain element
//     access A2(i,k) * B2(k,j) * alpha.
//
// \param C     Target matrix, updated in place.
// \param A     Left-hand side operand (A.rows() x A.columns()).
// \param B     Right-hand side operand; A.columns() == B.rows() is asserted.
// \param alpha Factor applied to every accumulated product.
// \param beta  Factor applied to the previous contents of C.
//
// NOTE(review): the listing skips line numbers (e.g. 610 -> 615), so some statements of the
// original source file are not visible in this body.
603 template< typename MT1, typename MT2, typename MT3, typename ST >
604 void mmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
605 {
606  using ET1 = ElementType_<MT1>;
607  using ET2 = ElementType_<MT2>;
608  using ET3 = ElementType_<MT3>;
609  using SIMDType = SIMDTrait_<ET1>;
610 
615 
618 
621 
624 
625  enum : size_t { SIMDSIZE = SIMDTrait<ET1>::size };
626 
// 'remainder' is true unless both operand types report IsPadded; it enables the scalar tail below.
627  constexpr bool remainder( !IsPadded<MT2>::value || !IsPadded<MT3>::value );
628 
629  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
630  constexpr size_t IBLOCK( MMM_INNER_BLOCK_SIZE );
631 
// Both block sizes must be whole multiples of the SIMD width.
632  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
633  BLAZE_STATIC_ASSERT( IBLOCK >= SIMDSIZE && IBLOCK % SIMDSIZE == 0UL );
634 
635  const size_t M( A.rows() );
636  const size_t N( B.columns() );
637  const size_t K( A.columns() );
638 
639  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
640 
// Scratch buffers for the current block of A and panel of B. They use opposite storage-order
// flags (presumably A2 row-major and B2 column-major, so that both load() calls below read
// along k -- TODO confirm).
641  DynamicMatrix<ET2,false> A2( IBLOCK, KBLOCK );
642  DynamicMatrix<ET3,true> B2( KBLOCK, N );
643 
// Apply beta to the existing contents of C before anything is accumulated.
644  if( isDefault( beta ) ) {
645  reset( ~C );
646  }
647  else if( !isOne( beta ) ) {
648  (~C) *= beta;
649  }
650 
651  size_t kk( 0UL );
652  size_t kblock( 0UL );
653 
// Main loop over K. With 'remainder' set it only runs while at least SIMDSIZE entries are left.
654  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
655  {
// With 'remainder' set, a final partial panel is rounded down to a multiple of SIMDSIZE
// (the mask assumes SIMDSIZE is a power of two -- TODO confirm).
656  if( remainder ) {
657  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
658  }
659  else {
660  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
661  }
662 
// Column range of B used for this panel; narrowed when MT3 is flagged IsUpper or IsLower.
663  const size_t jbegin( IsUpper<MT3>::value ? kk : 0UL );
664  const size_t jend ( IsLower<MT3>::value ? kk+kblock : N );
665  const size_t jsize ( jend - jbegin );
666 
667  B2 = serial( submatrix<!remainder>( B, kk, jbegin, kblock, jsize ) );
668 
669  size_t ii( 0UL );
670  size_t iblock( 0UL );
671 
672  while( ii < M )
673  {
674  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
675 
// Skip blocks of A selected by the IsLower/IsUpper tests (presumably structurally zero there).
676  if( ( IsLower<MT2>::value && ii+iblock <= kk ) ||
677  ( IsUpper<MT2>::value && kk+kblock <= ii ) ) {
678  ii += iblock;
679  continue;
680  }
681 
682  A2 = serial( submatrix<!remainder>( A, ii, kk, iblock, kblock ) );
683 
684  size_t j( 0UL );
685 
// Floating point element types use tiles of 5 columns, all other element types tiles of 4 columns.
// NOTE(review): this test uses ET3 while the scalar tail further down tests ET1.
// NOTE(review): the accumulators below are declared without an initializer; presumably
// SIMDType default-constructs to zero -- confirm.
686  if( IsFloatingPoint<ET3>::value )
687  {
688  for( ; (j+5UL) <= jsize; j+=5UL )
689  {
690  size_t i( 0UL );
691 
// 2x5 tile.
692  for( ; (i+2UL) <= iblock; i+=2UL )
693  {
694  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
695 
696  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
697  {
698  const SIMDType a1( A2.load(i ,k) );
699  const SIMDType a2( A2.load(i+1UL,k) );
700 
701  const SIMDType b1( B2.load(k,j ) );
702  const SIMDType b2( B2.load(k,j+1UL) );
703  const SIMDType b3( B2.load(k,j+2UL) );
704  const SIMDType b4( B2.load(k,j+3UL) );
705  const SIMDType b5( B2.load(k,j+4UL) );
706 
707  xmm1 += a1 * b1;
708  xmm2 += a1 * b2;
709  xmm3 += a1 * b3;
710  xmm4 += a1 * b4;
711  xmm5 += a1 * b5;
712  xmm6 += a2 * b1;
713  xmm7 += a2 * b2;
714  xmm8 += a2 * b3;
715  xmm9 += a2 * b4;
716  xmm10 += a2 * b5;
717  }
718 
719  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
720  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
721  (~C)(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
722  (~C)(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
723  (~C)(ii+i ,jbegin+j+4UL) += sum( xmm5 ) * alpha;
724  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm6 ) * alpha;
725  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm7 ) * alpha;
726  (~C)(ii+i+1UL,jbegin+j+2UL) += sum( xmm8 ) * alpha;
727  (~C)(ii+i+1UL,jbegin+j+3UL) += sum( xmm9 ) * alpha;
728  (~C)(ii+i+1UL,jbegin+j+4UL) += sum( xmm10 ) * alpha;
729  }
730 
// 1x5 tile for a single left-over row of the block.
731  if( i<iblock )
732  {
733  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
734 
735  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
736  {
737  const SIMDType a1( A2.load(i,k) );
738 
739  xmm1 += a1 * B2.load(k,j );
740  xmm2 += a1 * B2.load(k,j+1UL);
741  xmm3 += a1 * B2.load(k,j+2UL);
742  xmm4 += a1 * B2.load(k,j+3UL);
743  xmm5 += a1 * B2.load(k,j+4UL);
744  }
745 
746  (~C)(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
747  (~C)(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
748  (~C)(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
749  (~C)(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
750  (~C)(ii+i,jbegin+j+4UL) += sum( xmm5 ) * alpha;
751  }
752  }
753  }
754  else
755  {
756  for( ; (j+4UL) <= jsize; j+=4UL )
757  {
758  size_t i( 0UL );
759 
// 2x4 tile.
760  for( ; (i+2UL) <= iblock; i+=2UL )
761  {
762  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
763 
764  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
765  {
766  const SIMDType a1( A2.load(i ,k) );
767  const SIMDType a2( A2.load(i+1UL,k) );
768 
769  const SIMDType b1( B2.load(k,j ) );
770  const SIMDType b2( B2.load(k,j+1UL) );
771  const SIMDType b3( B2.load(k,j+2UL) );
772  const SIMDType b4( B2.load(k,j+3UL) );
773 
774  xmm1 += a1 * b1;
775  xmm2 += a1 * b2;
776  xmm3 += a1 * b3;
777  xmm4 += a1 * b4;
778  xmm5 += a2 * b1;
779  xmm6 += a2 * b2;
780  xmm7 += a2 * b3;
781  xmm8 += a2 * b4;
782  }
783 
784  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
785  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
786  (~C)(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
787  (~C)(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
788  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm5 ) * alpha;
789  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
790  (~C)(ii+i+1UL,jbegin+j+2UL) += sum( xmm7 ) * alpha;
791  (~C)(ii+i+1UL,jbegin+j+3UL) += sum( xmm8 ) * alpha;
792  }
793 
// 1x4 tile for a single left-over row of the block.
794  if( i<iblock )
795  {
796  SIMDType xmm1, xmm2, xmm3, xmm4;
797 
798  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
799  {
800  const SIMDType a1( A2.load(i,k) );
801 
802  xmm1 += a1 * B2.load(k,j );
803  xmm2 += a1 * B2.load(k,j+1UL);
804  xmm3 += a1 * B2.load(k,j+2UL);
805  xmm4 += a1 * B2.load(k,j+3UL);
806  }
807 
808  (~C)(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
809  (~C)(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
810  (~C)(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
811  (~C)(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
812  }
813  }
814  }
815 
// Columns not covered by the 5- or 4-column tiles are handled two at a time: 4x2, 2x2, then 1x2 tiles.
816  for( ; (j+2UL) <= jsize; j+=2UL )
817  {
818  size_t i( 0UL );
819 
820  for( ; (i+4UL) <= iblock; i+=4UL )
821  {
822  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
823 
824  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
825  {
826  const SIMDType a1( A2.load(i ,k) );
827  const SIMDType a2( A2.load(i+1UL,k) );
828  const SIMDType a3( A2.load(i+2UL,k) );
829  const SIMDType a4( A2.load(i+3UL,k) );
830 
831  const SIMDType b1( B2.load(k,j ) );
832  const SIMDType b2( B2.load(k,j+1UL) );
833 
834  xmm1 += a1 * b1;
835  xmm2 += a1 * b2;
836  xmm3 += a2 * b1;
837  xmm4 += a2 * b2;
838  xmm5 += a3 * b1;
839  xmm6 += a3 * b2;
840  xmm7 += a4 * b1;
841  xmm8 += a4 * b2;
842  }
843 
844  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
845  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
846  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
847  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
848  (~C)(ii+i+2UL,jbegin+j ) += sum( xmm5 ) * alpha;
849  (~C)(ii+i+2UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
850  (~C)(ii+i+3UL,jbegin+j ) += sum( xmm7 ) * alpha;
851  (~C)(ii+i+3UL,jbegin+j+1UL) += sum( xmm8 ) * alpha;
852  }
853 
854  for( ; (i+2UL) <= iblock; i+=2UL )
855  {
856  SIMDType xmm1, xmm2, xmm3, xmm4;
857 
858  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
859  {
860  const SIMDType a1( A2.load(i ,k) );
861  const SIMDType a2( A2.load(i+1UL,k) );
862 
863  const SIMDType b1( B2.load(k,j ) );
864  const SIMDType b2( B2.load(k,j+1UL) );
865 
866  xmm1 += a1 * b1;
867  xmm2 += a1 * b2;
868  xmm3 += a2 * b1;
869  xmm4 += a2 * b2;
870  }
871 
872  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
873  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
874  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
875  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
876  }
877 
878  if( i<iblock )
879  {
880  SIMDType xmm1, xmm2;
881 
882  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
883  {
884  const SIMDType a1( A2.load(i,k) );
885 
886  xmm1 += a1 * B2.load(k,j );
887  xmm2 += a1 * B2.load(k,j+1UL);
888  }
889 
890  (~C)(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
891  (~C)(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
892  }
893  }
894 
// Last single column of the panel (at most one column can remain here): 2x1 tiles, then a 1x1 tile.
895  if( j<jsize )
896  {
897  size_t i( 0UL );
898 
899  for( ; (i+2UL) <= iblock; i+=2UL )
900  {
901  SIMDType xmm1, xmm2;
902 
903  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
904  {
905  const SIMDType b1( B2.load(k,j) );
906 
907  xmm1 += A2.load(i ,k) * b1;
908  xmm2 += A2.load(i+1UL,k) * b1;
909  }
910 
911  (~C)(ii+i ,jbegin+j) += sum( xmm1 ) * alpha;
912  (~C)(ii+i+1UL,jbegin+j) += sum( xmm2 ) * alpha;
913  }
914 
915  if( i<iblock )
916  {
917  SIMDType xmm1;
918 
919  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
920  {
921  xmm1 += A2.load(i,k) * B2.load(k,j);
922  }
923 
924  (~C)(ii+i,jbegin+j) += sum( xmm1 ) * alpha;
925  }
926  }
927 
928  ii += iblock;
929  }
930 
931  kk += kblock;
932  }
933 
// Scalar tail: the K-kk entries (fewer than SIMDSIZE) that the SIMD loop above left over.
// It mirrors the tile structure above but uses plain element access instead of SIMD loads.
934  if( remainder && kk < K )
935  {
936  const size_t ksize( K - kk );
937 
938  const size_t jbegin( IsUpper<MT3>::value ? kk : 0UL );
939  const size_t jsize ( N - jbegin );
940 
941  B2 = serial( submatrix( B, kk, jbegin, ksize, jsize ) );
942 
943  size_t ii( 0UL );
944  size_t iblock( 0UL );
945 
946  while( ii < M )
947  {
948  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
949 
950  if( IsLower<MT2>::value && ii+iblock <= kk ) {
951  ii += iblock;
952  continue;
953  }
954 
955  A2 = serial( submatrix( A, ii, kk, iblock, ksize ) );
956 
957  size_t j( 0UL );
958 
959  if( IsFloatingPoint<ET1>::value )
960  {
961  for( ; (j+5UL) <= jsize; j+=5UL )
962  {
963  size_t i( 0UL );
964 
965  for( ; (i+2UL) <= iblock; i+=2UL ) {
966  for( size_t k=0UL; k<ksize; ++k ) {
967  (~C)(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
968  (~C)(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
969  (~C)(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
970  (~C)(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
971  (~C)(ii+i ,jbegin+j+4UL) += A2(i ,k) * B2(k,j+4UL) * alpha;
972  (~C)(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
973  (~C)(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
974  (~C)(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
975  (~C)(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
976  (~C)(ii+i+1UL,jbegin+j+4UL) += A2(i+1UL,k) * B2(k,j+4UL) * alpha;
977  }
978  }
979 
980  if( i<iblock ) {
981  for( size_t k=0UL; k<ksize; ++k ) {
982  (~C)(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
983  (~C)(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
984  (~C)(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
985  (~C)(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
986  (~C)(ii+i,jbegin+j+4UL) += A2(i,k) * B2(k,j+4UL) * alpha;
987  }
988  }
989  }
990  }
991  else
992  {
993  for( ; (j+4UL) <= jsize; j+=4UL )
994  {
995  size_t i( 0UL );
996 
997  for( ; (i+2UL) <= iblock; i+=2UL ) {
998  for( size_t k=0UL; k<ksize; ++k ) {
999  (~C)(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
1000  (~C)(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1001  (~C)(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
1002  (~C)(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
1003  (~C)(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1004  (~C)(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1005  (~C)(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
1006  (~C)(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
1007  }
1008  }
1009 
1010  if( i<iblock ) {
1011  for( size_t k=0UL; k<ksize; ++k ) {
1012  (~C)(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
1013  (~C)(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
1014  (~C)(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
1015  (~C)(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
1016  }
1017  }
1018  }
1019  }
1020 
1021  for( ; (j+2UL) <= jsize; j+=2UL )
1022  {
1023  size_t i( 0UL );
1024 
1025  for( ; (i+2UL) <= iblock; i+=2UL ) {
1026  for( size_t k=0UL; k<ksize; ++k ) {
1027  (~C)(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
1028  (~C)(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1029  (~C)(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1030  (~C)(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1031  }
1032  }
1033 
1034  if( i<iblock ) {
1035  for( size_t k=0UL; k<ksize; ++k ) {
1036  (~C)(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
1037  (~C)(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
1038  }
1039  }
1040  }
1041 
1042  if( j<jsize )
1043  {
1044  size_t i( 0UL );
1045 
1046  for( ; (i+2UL) <= iblock; i+=2UL ) {
1047  for( size_t k=0UL; k<ksize; ++k ) {
1048  (~C)(ii+i ,jbegin+j) += A2(i ,k) * B2(k,j) * alpha;
1049  (~C)(ii+i+1UL,jbegin+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1050  }
1051  }
1052 
1053  if( i<iblock ) {
1054  for( size_t k=0UL; k<ksize; ++k ) {
1055  (~C)(ii+i,jbegin+j) += A2(i,k) * B2(k,j) * alpha;
1056  }
1057  }
1058  }
1059 
1060  ii += iblock;
1061  }
1062  }
1063 }
1065 //*************************************************************************************************
1066 
1067 
1068 //*************************************************************************************************
// Convenience overload without scaling factors: forwards to the five-argument mmm() with
// alpha = ET1(1) and beta = ET1(0), where ET1 is the element type of the target matrix.
//
// \param C Target matrix, passed through to the five-argument overload.
// \param A Left-hand side operand.
// \param B Right-hand side operand.
//
// NOTE(review): ET2 and ET3 are not referenced in the lines visible here; the listing skips
// lines 1091-1092, which presumably use them -- confirm against the original file.
1084 template< typename MT1, typename MT2, typename MT3 >
1085 inline void mmm( MT1& C, const MT2& A, const MT3& B )
1086 {
1087  using ET1 = ElementType_<MT1>;
1088  using ET2 = ElementType_<MT2>;
1089  using ET3 = ElementType_<MT3>;
1090 
1093 
1094  mmm( C, A, B, ET1(1), ET1(0) );
1095 }
1097 //*************************************************************************************************
1098 
1099 
1100 
1101 
1102 //=================================================================================================
1103 //
1104 // LOWER DENSE MATRIX MULTIPLICATION KERNELS
1105 //
1106 //=================================================================================================
1107 
1108 //*************************************************************************************************
1127 template< typename MT1, typename MT2, typename MT3, typename ST >
1128 void lmmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
1129 {
1130  using ET1 = ElementType_<MT1>;
1131  using ET2 = ElementType_<MT2>;
1132  using ET3 = ElementType_<MT3>;
1133  using SIMDType = SIMDTrait_<ET1>;
1134 
1143 
1146 
1149 
1152 
1153  enum : size_t { SIMDSIZE = SIMDTrait<ET1>::size };
1154 
1155  constexpr bool remainder( !IsPadded<MT2>::value || !IsPadded<MT3>::value );
1156 
1157  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
1158  constexpr size_t JBLOCK( MMM_INNER_BLOCK_SIZE );
1159 
1160  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
1161  BLAZE_STATIC_ASSERT( JBLOCK >= SIMDSIZE && JBLOCK % SIMDSIZE == 0UL );
1162 
1163  const size_t M( A.rows() );
1164  const size_t N( B.columns() );
1165  const size_t K( A.columns() );
1166 
1167  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
1168 
1169  DynamicMatrix<ET2,false> A2( M, KBLOCK );
1170  DynamicMatrix<ET3,true> B2( KBLOCK, JBLOCK );
1171 
1172  decltype(auto) c( derestrict( ~C ) );
1173 
1174  if( isDefault( beta ) ) {
1175  reset( c );
1176  }
1177  else if( !isOne( beta ) ) {
1178  c *= beta;
1179  }
1180 
1181  size_t kk( 0UL );
1182  size_t kblock( 0UL );
1183 
1184  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
1185  {
1186  if( remainder ) {
1187  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
1188  }
1189  else {
1190  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
1191  }
1192 
1193  const size_t ibegin( IsLower<MT2>::value ? kk : 0UL );
1194  const size_t iend ( IsUpper<MT2>::value ? kk+kblock : M );
1195  const size_t isize ( iend - ibegin );
1196 
1197  A2 = serial( submatrix<!remainder>( A, ibegin, kk, isize, kblock ) );
1198 
1199  size_t jj( 0UL );
1200  size_t jblock( 0UL );
1201 
1202  while( jj < N )
1203  {
1204  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
1205 
1206  if( ( IsLower<MT3>::value && kk+kblock <= jj ) ||
1207  ( IsUpper<MT3>::value && jj+jblock <= kk ) ) {
1208  jj += jblock;
1209  continue;
1210  }
1211 
1212  B2 = serial( submatrix<!remainder>( B, kk, jj, kblock, jblock ) );
1213 
1214  size_t i( 0UL );
1215 
1216  if( IsFloatingPoint<ET1>::value )
1217  {
1218  for( ; (i+5UL) <= isize; i+=5UL )
1219  {
1220  if( jj > ibegin+i+4UL ) continue;
1221 
1222  const size_t jend( min( ibegin+i-jj+5UL, jblock ) );
1223  size_t j( 0UL );
1224 
1225  for( ; (j+2UL) <= jend; j+=2UL )
1226  {
1227  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1228 
1229  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1230  {
1231  const SIMDType a1( A2.load(i ,k) );
1232  const SIMDType a2( A2.load(i+1UL,k) );
1233  const SIMDType a3( A2.load(i+2UL,k) );
1234  const SIMDType a4( A2.load(i+3UL,k) );
1235  const SIMDType a5( A2.load(i+4UL,k) );
1236 
1237  const SIMDType b1( B2.load(k,j ) );
1238  const SIMDType b2( B2.load(k,j+1UL) );
1239 
1240  xmm1 += a1 * b1;
1241  xmm2 += a1 * b2;
1242  xmm3 += a2 * b1;
1243  xmm4 += a2 * b2;
1244  xmm5 += a3 * b1;
1245  xmm6 += a3 * b2;
1246  xmm7 += a4 * b1;
1247  xmm8 += a4 * b2;
1248  xmm9 += a5 * b1;
1249  xmm10 += a5 * b2;
1250  }
1251 
1252  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1253  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1254  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
1255  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
1256  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
1257  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
1258  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
1259  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
1260  c(ibegin+i+4UL,jj+j ) += sum( xmm9 ) * alpha;
1261  c(ibegin+i+4UL,jj+j+1UL) += sum( xmm10 ) * alpha;
1262  }
1263 
1264  if( j<jend )
1265  {
1266  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1267 
1268  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1269  {
1270  const SIMDType a1( A2.load(i ,k) );
1271  const SIMDType a2( A2.load(i+1UL,k) );
1272  const SIMDType a3( A2.load(i+2UL,k) );
1273  const SIMDType a4( A2.load(i+3UL,k) );
1274  const SIMDType a5( A2.load(i+4UL,k) );
1275 
1276  const SIMDType b1( B2.load(k,j) );
1277 
1278  xmm1 += a1 * b1;
1279  xmm2 += a2 * b1;
1280  xmm3 += a3 * b1;
1281  xmm4 += a4 * b1;
1282  xmm5 += a5 * b1;
1283  }
1284 
1285  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
1286  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
1287  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
1288  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
1289  c(ibegin+i+4UL,jj+j) += sum( xmm5 ) * alpha;
1290  }
1291  }
1292  }
1293  else
1294  {
1295  for( ; (i+4UL) <= isize; i+=4UL )
1296  {
1297  if( jj > ibegin+i+3UL ) continue;
1298 
1299  const size_t jend( min( ibegin+i-jj+4UL, jblock ) );
1300  size_t j( 0UL );
1301 
1302  for( ; (j+2UL) <= jend; j+=2UL )
1303  {
1304  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1305 
1306  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1307  {
1308  const SIMDType a1( A2.load(i ,k) );
1309  const SIMDType a2( A2.load(i+1UL,k) );
1310  const SIMDType a3( A2.load(i+2UL,k) );
1311  const SIMDType a4( A2.load(i+3UL,k) );
1312 
1313  const SIMDType b1( B2.load(k,j ) );
1314  const SIMDType b2( B2.load(k,j+1UL) );
1315 
1316  xmm1 += a1 * b1;
1317  xmm2 += a1 * b2;
1318  xmm3 += a2 * b1;
1319  xmm4 += a2 * b2;
1320  xmm5 += a3 * b1;
1321  xmm6 += a3 * b2;
1322  xmm7 += a4 * b1;
1323  xmm8 += a4 * b2;
1324  }
1325 
1326  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1327  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1328  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
1329  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
1330  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
1331  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
1332  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
1333  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
1334  }
1335 
1336  if( j<jend )
1337  {
1338  SIMDType xmm1, xmm2, xmm3, xmm4;
1339 
1340  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1341  {
1342  const SIMDType a1( A2.load(i ,k) );
1343  const SIMDType a2( A2.load(i+1UL,k) );
1344  const SIMDType a3( A2.load(i+2UL,k) );
1345  const SIMDType a4( A2.load(i+3UL,k) );
1346 
1347  const SIMDType b1( B2.load(k,j) );
1348 
1349  xmm1 += a1 * b1;
1350  xmm2 += a2 * b1;
1351  xmm3 += a3 * b1;
1352  xmm4 += a4 * b1;
1353  }
1354 
1355  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
1356  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
1357  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
1358  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
1359  }
1360  }
1361  }
1362 
1363  for( ; (i+2UL) <= isize; i+=2UL )
1364  {
1365  if( jj > ibegin+i+1UL ) continue;
1366 
1367  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1368  size_t j( 0UL );
1369 
1370  for( ; (j+4UL) <= jend; j+=4UL )
1371  {
1372  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1373 
1374  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1375  {
1376  const SIMDType a1( A2.load(i ,k) );
1377  const SIMDType a2( A2.load(i+1UL,k) );
1378 
1379  const SIMDType b1( B2.load(k,j ) );
1380  const SIMDType b2( B2.load(k,j+1UL) );
1381  const SIMDType b3( B2.load(k,j+2UL) );
1382  const SIMDType b4( B2.load(k,j+3UL) );
1383 
1384  xmm1 += a1 * b1;
1385  xmm2 += a1 * b2;
1386  xmm3 += a1 * b3;
1387  xmm4 += a1 * b4;
1388  xmm5 += a2 * b1;
1389  xmm6 += a2 * b2;
1390  xmm7 += a2 * b3;
1391  xmm8 += a2 * b4;
1392  }
1393 
1394  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1395  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1396  c(ibegin+i ,jj+j+2UL) += sum( xmm3 ) * alpha;
1397  c(ibegin+i ,jj+j+3UL) += sum( xmm4 ) * alpha;
1398  c(ibegin+i+1UL,jj+j ) += sum( xmm5 ) * alpha;
1399  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm6 ) * alpha;
1400  c(ibegin+i+1UL,jj+j+2UL) += sum( xmm7 ) * alpha;
1401  c(ibegin+i+1UL,jj+j+3UL) += sum( xmm8 ) * alpha;
1402  }
1403 
1404  for( ; (j+2UL) <= jend; j+=2UL )
1405  {
1406  SIMDType xmm1, xmm2, xmm3, xmm4;
1407 
1408  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1409  {
1410  const SIMDType a1( A2.load(i ,k) );
1411  const SIMDType a2( A2.load(i+1UL,k) );
1412 
1413  const SIMDType b1( B2.load(k,j ) );
1414  const SIMDType b2( B2.load(k,j+1UL) );
1415 
1416  xmm1 += a1 * b1;
1417  xmm2 += a1 * b2;
1418  xmm3 += a2 * b1;
1419  xmm4 += a2 * b2;
1420  }
1421 
1422  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1423  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1424  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
1425  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
1426  }
1427 
1428  if( j<jend )
1429  {
1430  SIMDType xmm1, xmm2;
1431 
1432  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1433  {
1434  const SIMDType a1( A2.load(i ,k) );
1435  const SIMDType a2( A2.load(i+1UL,k) );
1436 
1437  const SIMDType b1( B2.load(k,j) );
1438 
1439  xmm1 += a1 * b1;
1440  xmm2 += a2 * b1;
1441  }
1442 
1443  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
1444  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
1445  }
1446  }
1447 
1448  if( i<isize && jj <= ibegin+i )
1449  {
1450  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1451  size_t j( 0UL );
1452 
1453  for( ; (j+2UL) <= jend; j+=2UL )
1454  {
1455  SIMDType xmm1, xmm2;
1456 
1457  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1458  {
1459  const SIMDType a1( A2.load(i,k) );
1460 
1461  xmm1 += a1 * B2.load(k,j );
1462  xmm2 += a1 * B2.load(k,j+1UL);
1463  }
1464 
1465  c(ibegin+i,jj+j ) += sum( xmm1 ) * alpha;
1466  c(ibegin+i,jj+j+1UL) += sum( xmm2 ) * alpha;
1467  }
1468 
1469  if( j<jend )
1470  {
1471  SIMDType xmm1;
1472 
1473  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1474  {
1475  const SIMDType a1( A2.load(i,k) );
1476 
1477  xmm1 += a1 * B2.load(k,j);
1478  }
1479 
1480  c(ibegin+i,jj+j) += sum( xmm1 ) * alpha;
1481  }
1482  }
1483 
1484  jj += jblock;
1485  }
1486 
1487  kk += kblock;
1488  }
1489 
1490  if( remainder && kk < K )
1491  {
1492  const size_t ksize( K - kk );
1493 
1494  const size_t ibegin( IsLower<MT2>::value ? kk : 0UL );
1495  const size_t isize ( M - ibegin );
1496 
1497  A2 = serial( submatrix( A, ibegin, kk, isize, ksize ) );
1498 
1499  size_t jj( 0UL );
1500  size_t jblock( 0UL );
1501 
1502  while( jj < N )
1503  {
1504  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
1505 
1506  if( IsUpper<MT3>::value && jj+jblock <= kk ) {
1507  jj += jblock;
1508  continue;
1509  }
1510 
1511  B2 = serial( submatrix( B, kk, jj, ksize, jblock ) );
1512 
1513  size_t i( 0UL );
1514 
1515  if( IsFloatingPoint<ET1>::value )
1516  {
1517  for( ; (i+5UL) <= isize; i+=5UL )
1518  {
1519  if( jj > ibegin+i+4UL ) continue;
1520 
1521  const size_t jend( min( ibegin+i-jj+5UL, jblock ) );
1522  size_t j( 0UL );
1523 
1524  for( ; (j+2UL) <= jend; j+=2UL ) {
1525  for( size_t k=0UL; k<ksize; ++k ) {
1526  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
1527  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1528  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1529  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1530  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
1531  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
1532  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
1533  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
1534  c(ibegin+i+4UL,jj+j ) += A2(i+4UL,k) * B2(k,j ) * alpha;
1535  c(ibegin+i+4UL,jj+j+1UL) += A2(i+4UL,k) * B2(k,j+1UL) * alpha;
1536  }
1537  }
1538 
1539  if( j<jend ) {
1540  for( size_t k=0UL; k<ksize; ++k ) {
1541  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
1542  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1543  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
1544  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
1545  c(ibegin+i+4UL,jj+j) += A2(i+4UL,k) * B2(k,j) * alpha;
1546  }
1547  }
1548  }
1549  }
1550  else
1551  {
1552  for( ; (i+4UL) <= isize; i+=4UL )
1553  {
1554  if( jj > ibegin+i+3UL ) continue;
1555 
1556  const size_t jend( min( ibegin+i-jj+4UL, jblock ) );
1557  size_t j( 0UL );
1558 
1559  for( ; (j+2UL) <= jend; j+=2UL ) {
1560  for( size_t k=0UL; k<ksize; ++k ) {
1561  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
1562  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1563  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1564  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1565  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
1566  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
1567  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
1568  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
1569  }
1570  }
1571 
1572  if( j<jend ) {
1573  for( size_t k=0UL; k<ksize; ++k ) {
1574  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
1575  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1576  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
1577  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
1578  }
1579  }
1580  }
1581  }
1582 
1583  for( ; (i+2UL) <= isize; i+=2UL )
1584  {
1585  if( jj > ibegin+i+1UL ) continue;
1586 
1587  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1588  size_t j( 0UL );
1589 
1590  for( ; (j+2UL) <= jend; j+=2UL ) {
1591  for( size_t k=0UL; k<ksize; ++k ) {
1592  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
1593  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1594  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1595  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1596  }
1597  }
1598 
1599  if( j<jend ) {
1600  for( size_t k=0UL; k<ksize; ++k ) {
1601  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
1602  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1603  }
1604  }
1605  }
1606 
1607  if( i<isize && jj <= ibegin+i )
1608  {
1609  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1610  size_t j( 0UL );
1611 
1612  for( ; (j+2UL) <= jend; j+=2UL ) {
1613  for( size_t k=0UL; k<ksize; ++k ) {
1614  c(ibegin+i,jj+j ) += A2(i,k) * B2(k,j ) * alpha;
1615  c(ibegin+i,jj+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
1616  }
1617  }
1618 
1619  if( j<jend ) {
1620  for( size_t k=0UL; k<ksize; ++k ) {
1621  c(ibegin+i,jj+j) += A2(i,k) * B2(k,j) * alpha;
1622  }
1623  }
1624  }
1625 
1626  jj += jblock;
1627  }
1628  }
1629 }
1631 //*************************************************************************************************
1632 
1633 
1634 //*************************************************************************************************
1653 template< typename MT1, typename MT2, typename MT3, typename ST >
1654 void lmmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
1655 {
1656  using ET1 = ElementType_<MT1>;
1657  using ET2 = ElementType_<MT2>;
1658  using ET3 = ElementType_<MT3>;
1659  using SIMDType = SIMDTrait_<ET1>;
1660 
1669 
1672 
1675 
1678 
1679  enum : size_t { SIMDSIZE = SIMDTrait<ET1>::size };
1680 
1681  constexpr bool remainder( !IsPadded<MT2>::value || !IsPadded<MT3>::value );
1682 
1683  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
1684  constexpr size_t IBLOCK( MMM_INNER_BLOCK_SIZE );
1685 
1686  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
1687  BLAZE_STATIC_ASSERT( IBLOCK >= SIMDSIZE && IBLOCK % SIMDSIZE == 0UL );
1688 
1689  const size_t M( A.rows() );
1690  const size_t N( B.columns() );
1691  const size_t K( A.columns() );
1692 
1693  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
1694 
1695  DynamicMatrix<ET2,false> A2( IBLOCK, KBLOCK );
1696  DynamicMatrix<ET3,true> B2( KBLOCK, N );
1697 
1698  decltype(auto) c( derestrict( ~C ) );
1699 
1700  if( isDefault( beta ) ) {
1701  reset( c );
1702  }
1703  else if( !isOne( beta ) ) {
1704  c *= beta;
1705  }
1706 
1707  size_t kk( 0UL );
1708  size_t kblock( 0UL );
1709 
1710  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
1711  {
1712  if( remainder ) {
1713  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
1714  }
1715  else {
1716  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
1717  }
1718 
1719  const size_t jbegin( IsUpper<MT3>::value ? kk : 0UL );
1720  const size_t jend ( IsLower<MT3>::value ? kk+kblock : N );
1721  const size_t jsize ( jend - jbegin );
1722 
1723  B2 = serial( submatrix<!remainder>( B, kk, jbegin, kblock, jsize ) );
1724 
1725  size_t ii( 0UL );
1726  size_t iblock( 0UL );
1727 
1728  while( ii < M )
1729  {
1730  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
1731 
1732  if( ( IsLower<MT2>::value && ii+iblock <= kk ) ||
1733  ( IsUpper<MT2>::value && kk+kblock <= ii ) ) {
1734  ii += iblock;
1735  continue;
1736  }
1737 
1738  A2 = serial( submatrix<!remainder>( A, ii, kk, iblock, kblock ) );
1739 
1740  size_t j( 0UL );
1741 
1742  if( IsFloatingPoint<ET3>::value )
1743  {
1744  for( ; (j+5UL) <= jsize; j+=5UL )
1745  {
1746  if( ii+iblock < jbegin ) continue;
1747 
1748  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1749 
1750  for( ; (i+2UL) <= iblock; i+=2UL )
1751  {
1752  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1753 
1754  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1755  {
1756  const SIMDType a1( A2.load(i ,k) );
1757  const SIMDType a2( A2.load(i+1UL,k) );
1758 
1759  const SIMDType b1( B2.load(k,j ) );
1760  const SIMDType b2( B2.load(k,j+1UL) );
1761  const SIMDType b3( B2.load(k,j+2UL) );
1762  const SIMDType b4( B2.load(k,j+3UL) );
1763  const SIMDType b5( B2.load(k,j+4UL) );
1764 
1765  xmm1 += a1 * b1;
1766  xmm2 += a1 * b2;
1767  xmm3 += a1 * b3;
1768  xmm4 += a1 * b4;
1769  xmm5 += a1 * b5;
1770  xmm6 += a2 * b1;
1771  xmm7 += a2 * b2;
1772  xmm8 += a2 * b3;
1773  xmm9 += a2 * b4;
1774  xmm10 += a2 * b5;
1775  }
1776 
1777  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1778  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1779  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1780  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1781  c(ii+i ,jbegin+j+4UL) += sum( xmm5 ) * alpha;
1782  c(ii+i+1UL,jbegin+j ) += sum( xmm6 ) * alpha;
1783  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm7 ) * alpha;
1784  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm8 ) * alpha;
1785  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm9 ) * alpha;
1786  c(ii+i+1UL,jbegin+j+4UL) += sum( xmm10 ) * alpha;
1787  }
1788 
1789  if( i<iblock )
1790  {
1791  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1792 
1793  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1794  {
1795  const SIMDType a1( A2.load(i,k) );
1796 
1797  xmm1 += a1 * B2.load(k,j );
1798  xmm2 += a1 * B2.load(k,j+1UL);
1799  xmm3 += a1 * B2.load(k,j+2UL);
1800  xmm4 += a1 * B2.load(k,j+3UL);
1801  xmm5 += a1 * B2.load(k,j+4UL);
1802  }
1803 
1804  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
1805  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1806  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1807  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1808  c(ii+i,jbegin+j+4UL) += sum( xmm5 ) * alpha;
1809  }
1810  }
1811  }
1812  else
1813  {
1814  for( ; (j+4UL) <= jsize; j+=4UL )
1815  {
1816  if( ii+iblock < jbegin ) continue;
1817 
1818  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1819 
1820  for( ; (i+2UL) <= iblock; i+=2UL )
1821  {
1822  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1823 
1824  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1825  {
1826  const SIMDType a1( A2.load(i ,k) );
1827  const SIMDType a2( A2.load(i+1UL,k) );
1828 
1829  const SIMDType b1( B2.load(k,j ) );
1830  const SIMDType b2( B2.load(k,j+1UL) );
1831  const SIMDType b3( B2.load(k,j+2UL) );
1832  const SIMDType b4( B2.load(k,j+3UL) );
1833 
1834  xmm1 += a1 * b1;
1835  xmm2 += a1 * b2;
1836  xmm3 += a1 * b3;
1837  xmm4 += a1 * b4;
1838  xmm5 += a2 * b1;
1839  xmm6 += a2 * b2;
1840  xmm7 += a2 * b3;
1841  xmm8 += a2 * b4;
1842  }
1843 
1844  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1845  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1846  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1847  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1848  c(ii+i+1UL,jbegin+j ) += sum( xmm5 ) * alpha;
1849  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
1850  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm7 ) * alpha;
1851  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm8 ) * alpha;
1852  }
1853 
1854  if( i<iblock )
1855  {
1856  SIMDType xmm1, xmm2, xmm3, xmm4;
1857 
1858  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1859  {
1860  const SIMDType a1( A2.load(i,k) );
1861 
1862  xmm1 += a1 * B2.load(k,j );
1863  xmm2 += a1 * B2.load(k,j+1UL);
1864  xmm3 += a1 * B2.load(k,j+2UL);
1865  xmm4 += a1 * B2.load(k,j+3UL);
1866  }
1867 
1868  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
1869  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1870  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1871  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1872  }
1873  }
1874  }
1875 
1876  for( ; (j+2UL) <= jsize; j+=2UL )
1877  {
1878  if( ii+iblock < jbegin ) continue;
1879 
1880  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1881 
1882  for( ; (i+4UL) <= iblock; i+=4UL )
1883  {
1884  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1885 
1886  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1887  {
1888  const SIMDType a1( A2.load(i ,k) );
1889  const SIMDType a2( A2.load(i+1UL,k) );
1890  const SIMDType a3( A2.load(i+2UL,k) );
1891  const SIMDType a4( A2.load(i+3UL,k) );
1892 
1893  const SIMDType b1( B2.load(k,j ) );
1894  const SIMDType b2( B2.load(k,j+1UL) );
1895 
1896  xmm1 += a1 * b1;
1897  xmm2 += a1 * b2;
1898  xmm3 += a2 * b1;
1899  xmm4 += a2 * b2;
1900  xmm5 += a3 * b1;
1901  xmm6 += a3 * b2;
1902  xmm7 += a4 * b1;
1903  xmm8 += a4 * b2;
1904  }
1905 
1906  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1907  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1908  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
1909  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
1910  c(ii+i+2UL,jbegin+j ) += sum( xmm5 ) * alpha;
1911  c(ii+i+2UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
1912  c(ii+i+3UL,jbegin+j ) += sum( xmm7 ) * alpha;
1913  c(ii+i+3UL,jbegin+j+1UL) += sum( xmm8 ) * alpha;
1914  }
1915 
1916  for( ; (i+2UL) <= iblock; i+=2UL )
1917  {
1918  SIMDType xmm1, xmm2, xmm3, xmm4;
1919 
1920  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1921  {
1922  const SIMDType a1( A2.load(i ,k) );
1923  const SIMDType a2( A2.load(i+1UL,k) );
1924 
1925  const SIMDType b1( B2.load(k,j ) );
1926  const SIMDType b2( B2.load(k,j+1UL) );
1927 
1928  xmm1 += a1 * b1;
1929  xmm2 += a1 * b2;
1930  xmm3 += a2 * b1;
1931  xmm4 += a2 * b2;
1932  }
1933 
1934  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1935  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1936  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
1937  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
1938  }
1939 
1940  if( i<iblock )
1941  {
1942  SIMDType xmm1, xmm2;
1943 
1944  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1945  {
1946  const SIMDType a1( A2.load(i,k) );
1947 
1948  xmm1 += a1 * B2.load(k,j );
1949  xmm2 += a1 * B2.load(k,j+1UL);
1950  }
1951 
1952  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
1953  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1954  }
1955  }
1956 
1957  if( j<jsize && ii+iblock >= jbegin )
1958  {
1959  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1960 
1961  for( ; (i+2UL) <= iblock; i+=2UL )
1962  {
1963  SIMDType xmm1, xmm2;
1964 
1965  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1966  {
1967  const SIMDType b1( B2.load(k,j) );
1968 
1969  xmm1 += A2.load(i ,k) * b1;
1970  xmm2 += A2.load(i+1UL,k) * b1;
1971  }
1972 
1973  c(ii+i ,jbegin+j) += sum( xmm1 ) * alpha;
1974  c(ii+i+1UL,jbegin+j) += sum( xmm2 ) * alpha;
1975  }
1976 
1977  if( i<iblock )
1978  {
1979  SIMDType xmm1;
1980 
1981  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1982  {
1983  xmm1 += A2.load(i,k) * B2.load(k,j);
1984  }
1985 
1986  c(ii+i,jbegin+j) += sum( xmm1 ) * alpha;
1987  }
1988  }
1989 
1990  ii += iblock;
1991  }
1992 
1993  kk += kblock;
1994  }
1995 
1996  if( remainder && kk < K )
1997  {
1998  const size_t ksize( K - kk );
1999 
2000  const size_t jbegin( IsUpper<MT3>::value ? kk : 0UL );
2001  const size_t jsize ( N - jbegin );
2002 
2003  B2 = serial( submatrix( B, kk, jbegin, ksize, jsize ) );
2004 
2005  size_t ii( 0UL );
2006  size_t iblock( 0UL );
2007 
2008  while( ii < M )
2009  {
2010  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
2011 
2012  if( IsLower<MT2>::value && ii+iblock <= kk ) {
2013  ii += iblock;
2014  continue;
2015  }
2016 
2017  A2 = serial( submatrix( A, ii, kk, iblock, ksize ) );
2018 
2019  size_t j( 0UL );
2020 
2021  if( IsFloatingPoint<ET1>::value )
2022  {
2023  for( ; (j+5UL) <= jsize; j+=5UL )
2024  {
2025  if( ii+iblock < jbegin ) continue;
2026 
2027  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2028 
2029  for( ; (i+2UL) <= iblock; i+=2UL ) {
2030  for( size_t k=0UL; k<ksize; ++k ) {
2031  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
2032  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2033  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
2034  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
2035  c(ii+i ,jbegin+j+4UL) += A2(i ,k) * B2(k,j+4UL) * alpha;
2036  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2037  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2038  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
2039  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
2040  c(ii+i+1UL,jbegin+j+4UL) += A2(i+1UL,k) * B2(k,j+4UL) * alpha;
2041  }
2042  }
2043 
2044  if( i<iblock ) {
2045  for( size_t k=0UL; k<ksize; ++k ) {
2046  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
2047  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2048  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
2049  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
2050  c(ii+i,jbegin+j+4UL) += A2(i,k) * B2(k,j+4UL) * alpha;
2051  }
2052  }
2053  }
2054  }
2055  else
2056  {
2057  for( ; (j+4UL) <= jsize; j+=4UL )
2058  {
2059  if( ii+iblock < jbegin ) continue;
2060 
2061  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2062 
2063  for( ; (i+2UL) <= iblock; i+=2UL ) {
2064  for( size_t k=0UL; k<ksize; ++k ) {
2065  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
2066  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2067  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
2068  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
2069  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2070  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2071  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
2072  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
2073  }
2074  }
2075 
2076  if( i<iblock ) {
2077  for( size_t k=0UL; k<ksize; ++k ) {
2078  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
2079  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2080  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
2081  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
2082  }
2083  }
2084  }
2085  }
2086 
2087  for( ; (j+2UL) <= jsize; j+=2UL )
2088  {
2089  if( ii+iblock < jbegin ) continue;
2090 
2091  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2092 
2093  for( ; (i+2UL) <= iblock; i+=2UL ) {
2094  for( size_t k=0UL; k<ksize; ++k ) {
2095  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
2096  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2097  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2098  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2099  }
2100  }
2101 
2102  if( i<iblock ) {
2103  for( size_t k=0UL; k<ksize; ++k ) {
2104  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
2105  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2106  }
2107  }
2108  }
2109 
2110  if( j<jsize )
2111  {
2112  if( ii+iblock < jbegin ) continue;
2113 
2114  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2115 
2116  for( ; (i+2UL) <= iblock; i+=2UL ) {
2117  for( size_t k=0UL; k<ksize; ++k ) {
2118  c(ii+i ,jbegin+j) += A2(i ,k) * B2(k,j) * alpha;
2119  c(ii+i+1UL,jbegin+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2120  }
2121  }
2122 
2123  if( i<iblock ) {
2124  for( size_t k=0UL; k<ksize; ++k ) {
2125  c(ii+i,jbegin+j) += A2(i,k) * B2(k,j) * alpha;
2126  }
2127  }
2128  }
2129 
2130  ii += iblock;
2131  }
2132  }
2133 }
2135 //*************************************************************************************************
2136 
2137 
2138 //*************************************************************************************************
// Convenience overload of the lower dense matrix multiplication kernel without explicit
// scaling factors. Forwards to the five-argument lmmm() overload with alpha = ET1(1) and
// beta = ET1(0), where ET1 is the element type of the target matrix C.
//
// \param C Target matrix of the multiplication.
// \param A Left-hand side operand.
// \param B Right-hand side operand.
2154 template< typename MT1, typename MT2, typename MT3 >
2155 inline void lmmm( MT1& C, const MT2& A, const MT3& B )
2156 {
2157  using ET1 = ElementType_<MT1>;
2158  using ET2 = ElementType_<MT2>;
2159  using ET3 = ElementType_<MT3>;
2160 
      // NOTE(review): ET2 and ET3 are not referenced by any statement visible here. The
      // embedded numbering jumps from 2160 to 2163, so statements using them may have been
      // elided from this listing - confirm before removing the aliases.
2163 
      // alpha = 1 and beta = 0; presumably beta = 0 satisfies isDefault() in the scaled
      // kernel, which then resets C before accumulating - confirm.
2164  lmmm( C, A, B, ET1(1), ET1(0) );
2165 }
2167 //*************************************************************************************************
2168 
2169 
2170 
2171 
2172 //=================================================================================================
2173 //
2174 // UPPER DENSE MATRIX MULTIPLICATION KERNELS
2175 //
2176 //=================================================================================================
2177 
2178 //*************************************************************************************************
2197 template< typename MT1, typename MT2, typename MT3, typename ST >
2198 void ummm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
2199 {
2200  using ET1 = ElementType_<MT1>;
2201  using ET2 = ElementType_<MT2>;
2202  using ET3 = ElementType_<MT3>;
2203  using SIMDType = SIMDTrait_<ET1>;
2204 
2213 
2216 
2219 
2222 
2223  enum : size_t { SIMDSIZE = SIMDTrait<ET1>::size };
2224 
2225  constexpr bool remainder( !IsPadded<MT2>::value || !IsPadded<MT3>::value );
2226 
2227  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
2228  constexpr size_t JBLOCK( MMM_INNER_BLOCK_SIZE );
2229 
2230  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
2231  BLAZE_STATIC_ASSERT( JBLOCK >= SIMDSIZE && JBLOCK % SIMDSIZE == 0UL );
2232 
2233  const size_t M( A.rows() );
2234  const size_t N( B.columns() );
2235  const size_t K( A.columns() );
2236 
2237  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
2238 
2239  DynamicMatrix<ET2,false> A2( M, KBLOCK );
2240  DynamicMatrix<ET3,true> B2( KBLOCK, JBLOCK );
2241 
2242  decltype(auto) c( derestrict( ~C ) );
2243 
2244  if( isDefault( beta ) ) {
2245  reset( c );
2246  }
2247  else if( !isOne( beta ) ) {
2248  c *= beta;
2249  }
2250 
2251  size_t kk( 0UL );
2252  size_t kblock( 0UL );
2253 
2254  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
2255  {
2256  if( remainder ) {
2257  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
2258  }
2259  else {
2260  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
2261  }
2262 
2263  const size_t ibegin( IsLower<MT2>::value ? kk : 0UL );
2264  const size_t iend ( IsUpper<MT2>::value ? kk+kblock : M );
2265  const size_t isize ( iend - ibegin );
2266 
2267  A2 = serial( submatrix<!remainder>( A, ibegin, kk, isize, kblock ) );
2268 
2269  size_t jj( 0UL );
2270  size_t jblock( 0UL );
2271 
2272  while( jj < N )
2273  {
2274  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
2275 
2276  if( ( IsLower<MT3>::value && kk+kblock <= jj ) ||
2277  ( IsUpper<MT3>::value && jj+jblock <= kk ) ) {
2278  jj += jblock;
2279  continue;
2280  }
2281 
2282  B2 = serial( submatrix<!remainder>( B, kk, jj, kblock, jblock ) );
2283 
2284  size_t i( 0UL );
2285 
2286  if( IsFloatingPoint<ET1>::value )
2287  {
2288  for( ; (i+5UL) <= isize; i+=5UL )
2289  {
2290  if( jj+jblock < ibegin ) continue;
2291 
2292  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2293 
2294  for( ; (j+2UL) <= jblock; j+=2UL )
2295  {
2296  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
2297 
2298  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2299  {
2300  const SIMDType a1( A2.load(i ,k) );
2301  const SIMDType a2( A2.load(i+1UL,k) );
2302  const SIMDType a3( A2.load(i+2UL,k) );
2303  const SIMDType a4( A2.load(i+3UL,k) );
2304  const SIMDType a5( A2.load(i+4UL,k) );
2305 
2306  const SIMDType b1( B2.load(k,j ) );
2307  const SIMDType b2( B2.load(k,j+1UL) );
2308 
2309  xmm1 += a1 * b1;
2310  xmm2 += a1 * b2;
2311  xmm3 += a2 * b1;
2312  xmm4 += a2 * b2;
2313  xmm5 += a3 * b1;
2314  xmm6 += a3 * b2;
2315  xmm7 += a4 * b1;
2316  xmm8 += a4 * b2;
2317  xmm9 += a5 * b1;
2318  xmm10 += a5 * b2;
2319  }
2320 
2321  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2322  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2323  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
2324  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
2325  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
2326  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
2327  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
2328  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
2329  c(ibegin+i+4UL,jj+j ) += sum( xmm9 ) * alpha;
2330  c(ibegin+i+4UL,jj+j+1UL) += sum( xmm10 ) * alpha;
2331  }
2332 
2333  if( j<jblock )
2334  {
2335  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
2336 
2337  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2338  {
2339  const SIMDType a1( A2.load(i ,k) );
2340  const SIMDType a2( A2.load(i+1UL,k) );
2341  const SIMDType a3( A2.load(i+2UL,k) );
2342  const SIMDType a4( A2.load(i+3UL,k) );
2343  const SIMDType a5( A2.load(i+4UL,k) );
2344 
2345  const SIMDType b1( B2.load(k,j) );
2346 
2347  xmm1 += a1 * b1;
2348  xmm2 += a2 * b1;
2349  xmm3 += a3 * b1;
2350  xmm4 += a4 * b1;
2351  xmm5 += a5 * b1;
2352  }
2353 
2354  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
2355  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
2356  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
2357  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
2358  c(ibegin+i+4UL,jj+j) += sum( xmm5 ) * alpha;
2359  }
2360  }
2361  }
2362  else
2363  {
2364  for( ; (i+4UL) <= isize; i+=4UL )
2365  {
2366  if( jj+jblock < ibegin ) continue;
2367 
2368  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2369 
2370  for( ; (j+2UL) <= jblock; j+=2UL )
2371  {
2372  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2373 
2374  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2375  {
2376  const SIMDType a1( A2.load(i ,k) );
2377  const SIMDType a2( A2.load(i+1UL,k) );
2378  const SIMDType a3( A2.load(i+2UL,k) );
2379  const SIMDType a4( A2.load(i+3UL,k) );
2380 
2381  const SIMDType b1( B2.load(k,j ) );
2382  const SIMDType b2( B2.load(k,j+1UL) );
2383 
2384  xmm1 += a1 * b1;
2385  xmm2 += a1 * b2;
2386  xmm3 += a2 * b1;
2387  xmm4 += a2 * b2;
2388  xmm5 += a3 * b1;
2389  xmm6 += a3 * b2;
2390  xmm7 += a4 * b1;
2391  xmm8 += a4 * b2;
2392  }
2393 
2394  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2395  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2396  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
2397  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
2398  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
2399  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
2400  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
2401  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
2402  }
2403 
2404  if( j<jblock )
2405  {
2406  SIMDType xmm1, xmm2, xmm3, xmm4;
2407 
2408  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2409  {
2410  const SIMDType a1( A2.load(i ,k) );
2411  const SIMDType a2( A2.load(i+1UL,k) );
2412  const SIMDType a3( A2.load(i+2UL,k) );
2413  const SIMDType a4( A2.load(i+3UL,k) );
2414 
2415  const SIMDType b1( B2.load(k,j) );
2416 
2417  xmm1 += a1 * b1;
2418  xmm2 += a2 * b1;
2419  xmm3 += a3 * b1;
2420  xmm4 += a4 * b1;
2421  }
2422 
2423  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
2424  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
2425  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
2426  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
2427  }
2428  }
2429  }
2430 
2431  for( ; (i+2UL) <= isize; i+=2UL )
2432  {
2433  if( jj+jblock < ibegin ) continue;
2434 
2435  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2436 
2437  for( ; (j+4UL) <= jblock; j+=4UL )
2438  {
2439  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2440 
2441  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2442  {
2443  const SIMDType a1( A2.load(i ,k) );
2444  const SIMDType a2( A2.load(i+1UL,k) );
2445 
2446  const SIMDType b1( B2.load(k,j ) );
2447  const SIMDType b2( B2.load(k,j+1UL) );
2448  const SIMDType b3( B2.load(k,j+2UL) );
2449  const SIMDType b4( B2.load(k,j+3UL) );
2450 
2451  xmm1 += a1 * b1;
2452  xmm2 += a1 * b2;
2453  xmm3 += a1 * b3;
2454  xmm4 += a1 * b4;
2455  xmm5 += a2 * b1;
2456  xmm6 += a2 * b2;
2457  xmm7 += a2 * b3;
2458  xmm8 += a2 * b4;
2459  }
2460 
2461  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2462  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2463  c(ibegin+i ,jj+j+2UL) += sum( xmm3 ) * alpha;
2464  c(ibegin+i ,jj+j+3UL) += sum( xmm4 ) * alpha;
2465  c(ibegin+i+1UL,jj+j ) += sum( xmm5 ) * alpha;
2466  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm6 ) * alpha;
2467  c(ibegin+i+1UL,jj+j+2UL) += sum( xmm7 ) * alpha;
2468  c(ibegin+i+1UL,jj+j+3UL) += sum( xmm8 ) * alpha;
2469  }
2470 
2471  for( ; (j+2UL) <= jblock; j+=2UL )
2472  {
2473  SIMDType xmm1, xmm2, xmm3, xmm4;
2474 
2475  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2476  {
2477  const SIMDType a1( A2.load(i ,k) );
2478  const SIMDType a2( A2.load(i+1UL,k) );
2479 
2480  const SIMDType b1( B2.load(k,j ) );
2481  const SIMDType b2( B2.load(k,j+1UL) );
2482 
2483  xmm1 += a1 * b1;
2484  xmm2 += a1 * b2;
2485  xmm3 += a2 * b1;
2486  xmm4 += a2 * b2;
2487  }
2488 
2489  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2490  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2491  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
2492  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
2493  }
2494 
2495  if( j<jblock )
2496  {
2497  SIMDType xmm1, xmm2;
2498 
2499  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2500  {
2501  const SIMDType a1( A2.load(i ,k) );
2502  const SIMDType a2( A2.load(i+1UL,k) );
2503 
2504  const SIMDType b1( B2.load(k,j) );
2505 
2506  xmm1 += a1 * b1;
2507  xmm2 += a2 * b1;
2508  }
2509 
2510  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
2511  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
2512  }
2513  }
2514 
2515  if( i<isize && jj+jblock >= ibegin )
2516  {
2517  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2518 
2519  for( ; (j+2UL) <= jblock; j+=2UL )
2520  {
2521  SIMDType xmm1, xmm2;
2522 
2523  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2524  {
2525  const SIMDType a1( A2.load(i,k) );
2526 
2527  xmm1 += a1 * B2.load(k,j );
2528  xmm2 += a1 * B2.load(k,j+1UL);
2529  }
2530 
2531  c(ibegin+i,jj+j ) += sum( xmm1 ) * alpha;
2532  c(ibegin+i,jj+j+1UL) += sum( xmm2 ) * alpha;
2533  }
2534 
2535  if( j<jblock )
2536  {
2537  SIMDType xmm1;
2538 
2539  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2540  {
2541  const SIMDType a1( A2.load(i,k) );
2542 
2543  xmm1 += a1 * B2.load(k,j);
2544  }
2545 
2546  c(ibegin+i,jj+j) += sum( xmm1 ) * alpha;
2547  }
2548  }
2549 
2550  jj += jblock;
2551  }
2552 
2553  kk += kblock;
2554  }
2555 
2556  if( remainder && kk < K )
2557  {
2558  const size_t ksize( K - kk );
2559 
2560  const size_t ibegin( IsLower<MT2>::value ? kk : 0UL );
2561  const size_t isize ( M - ibegin );
2562 
2563  A2 = serial( submatrix( A, ibegin, kk, isize, ksize ) );
2564 
2565  size_t jj( 0UL );
2566  size_t jblock( 0UL );
2567 
2568  while( jj < N )
2569  {
2570  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
2571 
2572  if( IsUpper<MT3>::value && jj+jblock <= kk ) {
2573  jj += jblock;
2574  continue;
2575  }
2576 
2577  B2 = serial( submatrix( B, kk, jj, ksize, jblock ) );
2578 
2579  size_t i( 0UL );
2580 
2581  if( IsFloatingPoint<ET1>::value )
2582  {
2583  for( ; (i+5UL) <= isize; i+=5UL )
2584  {
2585  if( jj+jblock < ibegin ) continue;
2586 
2587  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2588 
2589  for( ; (j+2UL) <= jblock; j+=2UL ) {
2590  for( size_t k=0UL; k<ksize; ++k ) {
2591  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
2592  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2593  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2594  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2595  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
2596  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
2597  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
2598  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
2599  c(ibegin+i+4UL,jj+j ) += A2(i+4UL,k) * B2(k,j ) * alpha;
2600  c(ibegin+i+4UL,jj+j+1UL) += A2(i+4UL,k) * B2(k,j+1UL) * alpha;
2601  }
2602  }
2603 
2604  if( j<jblock ) {
2605  for( size_t k=0UL; k<ksize; ++k ) {
2606  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
2607  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2608  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
2609  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
2610  c(ibegin+i+4UL,jj+j) += A2(i+4UL,k) * B2(k,j) * alpha;
2611  }
2612  }
2613  }
2614  }
2615  else
2616  {
2617  for( ; (i+4UL) <= isize; i+=4UL )
2618  {
2619  if( jj+jblock < ibegin ) continue;
2620 
2621  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2622 
2623  for( ; (j+2UL) <= jblock; j+=2UL ) {
2624  for( size_t k=0UL; k<ksize; ++k ) {
2625  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
2626  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2627  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2628  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2629  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
2630  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
2631  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
2632  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
2633  }
2634  }
2635 
2636  if( j<jblock ) {
2637  for( size_t k=0UL; k<ksize; ++k ) {
2638  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
2639  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2640  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
2641  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
2642  }
2643  }
2644  }
2645  }
2646 
2647  for( ; (i+2UL) <= isize; i+=2UL )
2648  {
2649  if( jj+jblock < ibegin ) continue;
2650 
2651  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2652 
2653  for( ; (j+2UL) <= jblock; j+=2UL ) {
2654  for( size_t k=0UL; k<ksize; ++k ) {
2655  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
2656  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2657  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2658  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2659  }
2660  }
2661 
2662  if( j<jblock ) {
2663  for( size_t k=0UL; k<ksize; ++k ) {
2664  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
2665  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2666  }
2667  }
2668  }
2669 
2670  if( i<isize && jj+jblock >= ibegin )
2671  {
2672  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2673 
2674  for( ; (j+2UL) <= jblock; j+=2UL ) {
2675  for( size_t k=0UL; k<ksize; ++k ) {
2676  c(ibegin+i,jj+j ) += A2(i,k) * B2(k,j ) * alpha;
2677  c(ibegin+i,jj+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2678  }
2679  }
2680 
2681  if( j<jblock ) {
2682  for( size_t k=0UL; k<ksize; ++k ) {
2683  c(ibegin+i,jj+j) += A2(i,k) * B2(k,j) * alpha;
2684  }
2685  }
2686  }
2687 
2688  jj += jblock;
2689  }
2690  }
2691 }
2693 //*************************************************************************************************
2694 
2695 
2696 //*************************************************************************************************
2715 template< typename MT1, typename MT2, typename MT3, typename ST >
2716 void ummm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
2717 {
2718  using ET1 = ElementType_<MT1>;
2719  using ET2 = ElementType_<MT2>;
2720  using ET3 = ElementType_<MT3>;
2721  using SIMDType = SIMDTrait_<ET1>;
2722 
2731 
2734 
2737 
2740 
2741  enum : size_t { SIMDSIZE = SIMDTrait<ET1>::size };
2742 
2743  constexpr bool remainder( !IsPadded<MT2>::value || !IsPadded<MT3>::value );
2744 
2745  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
2746  constexpr size_t IBLOCK( MMM_INNER_BLOCK_SIZE );
2747 
2748  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
2749  BLAZE_STATIC_ASSERT( IBLOCK >= SIMDSIZE && IBLOCK % SIMDSIZE == 0UL );
2750 
2751  const size_t M( A.rows() );
2752  const size_t N( B.columns() );
2753  const size_t K( A.columns() );
2754 
2755  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
2756 
2757  DynamicMatrix<ET2,false> A2( IBLOCK, KBLOCK );
2758  DynamicMatrix<ET3,true> B2( KBLOCK, N );
2759 
2760  decltype(auto) c( derestrict( ~C ) );
2761 
2762  if( isDefault( beta ) ) {
2763  reset( c );
2764  }
2765  else if( !isOne( beta ) ) {
2766  c *= beta;
2767  }
2768 
2769  size_t kk( 0UL );
2770  size_t kblock( 0UL );
2771 
2772  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
2773  {
2774  if( remainder ) {
2775  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
2776  }
2777  else {
2778  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
2779  }
2780 
2781  const size_t jbegin( IsUpper<MT3>::value ? kk : 0UL );
2782  const size_t jend ( IsLower<MT3>::value ? kk+kblock : N );
2783  const size_t jsize ( jend - jbegin );
2784 
2785  B2 = serial( submatrix<!remainder>( B, kk, jbegin, kblock, jsize ) );
2786 
2787  size_t ii( 0UL );
2788  size_t iblock( 0UL );
2789 
2790  while( ii < M )
2791  {
2792  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
2793 
2794  if( ( IsLower<MT2>::value && ii+iblock <= kk ) ||
2795  ( IsUpper<MT2>::value && kk+kblock <= ii ) ) {
2796  ii += iblock;
2797  continue;
2798  }
2799 
2800  A2 = serial( submatrix<!remainder>( A, ii, kk, iblock, kblock ) );
2801 
2802  size_t j( 0UL );
2803 
2804  if( IsFloatingPoint<ET3>::value )
2805  {
2806  for( ; (j+5UL) <= jsize; j+=5UL )
2807  {
2808  if( ii > jbegin+j+4UL ) continue;
2809 
2810  const size_t iend( min( iblock, jbegin+j-ii+5UL ) );
2811  size_t i( 0UL );
2812 
2813  for( ; (i+2UL) <= iend; i+=2UL )
2814  {
2815  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
2816 
2817  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2818  {
2819  const SIMDType a1( A2.load(i ,k) );
2820  const SIMDType a2( A2.load(i+1UL,k) );
2821 
2822  const SIMDType b1( B2.load(k,j ) );
2823  const SIMDType b2( B2.load(k,j+1UL) );
2824  const SIMDType b3( B2.load(k,j+2UL) );
2825  const SIMDType b4( B2.load(k,j+3UL) );
2826  const SIMDType b5( B2.load(k,j+4UL) );
2827 
2828  xmm1 += a1 * b1;
2829  xmm2 += a1 * b2;
2830  xmm3 += a1 * b3;
2831  xmm4 += a1 * b4;
2832  xmm5 += a1 * b5;
2833  xmm6 += a2 * b1;
2834  xmm7 += a2 * b2;
2835  xmm8 += a2 * b3;
2836  xmm9 += a2 * b4;
2837  xmm10 += a2 * b5;
2838  }
2839 
2840  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
2841  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2842  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2843  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2844  c(ii+i ,jbegin+j+4UL) += sum( xmm5 ) * alpha;
2845  c(ii+i+1UL,jbegin+j ) += sum( xmm6 ) * alpha;
2846  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm7 ) * alpha;
2847  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm8 ) * alpha;
2848  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm9 ) * alpha;
2849  c(ii+i+1UL,jbegin+j+4UL) += sum( xmm10 ) * alpha;
2850  }
2851 
2852  if( i<iend )
2853  {
2854  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
2855 
2856  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2857  {
2858  const SIMDType a1( A2.load(i,k) );
2859 
2860  xmm1 += a1 * B2.load(k,j );
2861  xmm2 += a1 * B2.load(k,j+1UL);
2862  xmm3 += a1 * B2.load(k,j+2UL);
2863  xmm4 += a1 * B2.load(k,j+3UL);
2864  xmm5 += a1 * B2.load(k,j+4UL);
2865  }
2866 
2867  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
2868  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2869  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2870  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2871  c(ii+i,jbegin+j+4UL) += sum( xmm5 ) * alpha;
2872  }
2873  }
2874  }
2875  else
2876  {
2877  for( ; (j+4UL) <= jsize; j+=4UL )
2878  {
2879  if( ii > jbegin+j+3UL ) continue;
2880 
2881  const size_t iend( min( iblock, jbegin+j-ii+4UL ) );
2882  size_t i( 0UL );
2883 
2884  for( ; (i+2UL) <= iend; i+=2UL )
2885  {
2886  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2887 
2888  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2889  {
2890  const SIMDType a1( A2.load(i ,k) );
2891  const SIMDType a2( A2.load(i+1UL,k) );
2892 
2893  const SIMDType b1( B2.load(k,j ) );
2894  const SIMDType b2( B2.load(k,j+1UL) );
2895  const SIMDType b3( B2.load(k,j+2UL) );
2896  const SIMDType b4( B2.load(k,j+3UL) );
2897 
2898  xmm1 += a1 * b1;
2899  xmm2 += a1 * b2;
2900  xmm3 += a1 * b3;
2901  xmm4 += a1 * b4;
2902  xmm5 += a2 * b1;
2903  xmm6 += a2 * b2;
2904  xmm7 += a2 * b3;
2905  xmm8 += a2 * b4;
2906  }
2907 
2908  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
2909  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2910  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2911  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2912  c(ii+i+1UL,jbegin+j ) += sum( xmm5 ) * alpha;
2913  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
2914  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm7 ) * alpha;
2915  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm8 ) * alpha;
2916  }
2917 
2918  if( i<iend )
2919  {
2920  SIMDType xmm1, xmm2, xmm3, xmm4;
2921 
2922  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2923  {
2924  const SIMDType a1( A2.load(i,k) );
2925 
2926  xmm1 += a1 * B2.load(k,j );
2927  xmm2 += a1 * B2.load(k,j+1UL);
2928  xmm3 += a1 * B2.load(k,j+2UL);
2929  xmm4 += a1 * B2.load(k,j+3UL);
2930  }
2931 
2932  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
2933  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2934  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2935  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2936  }
2937  }
2938  }
2939 
2940  for( ; (j+2UL) <= jsize; j+=2UL )
2941  {
2942  if( ii > jbegin+j+1UL ) continue;
2943 
2944  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
2945  size_t i( 0UL );
2946 
2947  for( ; (i+4UL) <= iend; i+=4UL )
2948  {
2949  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2950 
2951  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2952  {
2953  const SIMDType a1( A2.load(i ,k) );
2954  const SIMDType a2( A2.load(i+1UL,k) );
2955  const SIMDType a3( A2.load(i+2UL,k) );
2956  const SIMDType a4( A2.load(i+3UL,k) );
2957 
2958  const SIMDType b1( B2.load(k,j ) );
2959  const SIMDType b2( B2.load(k,j+1UL) );
2960 
2961  xmm1 += a1 * b1;
2962  xmm2 += a1 * b2;
2963  xmm3 += a2 * b1;
2964  xmm4 += a2 * b2;
2965  xmm5 += a3 * b1;
2966  xmm6 += a3 * b2;
2967  xmm7 += a4 * b1;
2968  xmm8 += a4 * b2;
2969  }
2970 
2971  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
2972  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2973  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
2974  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
2975  c(ii+i+2UL,jbegin+j ) += sum( xmm5 ) * alpha;
2976  c(ii+i+2UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
2977  c(ii+i+3UL,jbegin+j ) += sum( xmm7 ) * alpha;
2978  c(ii+i+3UL,jbegin+j+1UL) += sum( xmm8 ) * alpha;
2979  }
2980 
2981  for( ; (i+2UL) <= iend; i+=2UL )
2982  {
2983  SIMDType xmm1, xmm2, xmm3, xmm4;
2984 
2985  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2986  {
2987  const SIMDType a1( A2.load(i ,k) );
2988  const SIMDType a2( A2.load(i+1UL,k) );
2989 
2990  const SIMDType b1( B2.load(k,j ) );
2991  const SIMDType b2( B2.load(k,j+1UL) );
2992 
2993  xmm1 += a1 * b1;
2994  xmm2 += a1 * b2;
2995  xmm3 += a2 * b1;
2996  xmm4 += a2 * b2;
2997  }
2998 
2999  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
3000  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
3001  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
3002  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
3003  }
3004 
3005  if( i<iend )
3006  {
3007  SIMDType xmm1, xmm2;
3008 
3009  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
3010  {
3011  const SIMDType a1( A2.load(i,k) );
3012 
3013  xmm1 += a1 * B2.load(k,j );
3014  xmm2 += a1 * B2.load(k,j+1UL);
3015  }
3016 
3017  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
3018  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
3019  }
3020  }
3021 
3022  if( j<jsize && ii <= jbegin+j )
3023  {
3024  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
3025  size_t i( 0UL );
3026 
3027  for( ; (i+2UL) <= iend; i+=2UL )
3028  {
3029  SIMDType xmm1, xmm2;
3030 
3031  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
3032  {
3033  const SIMDType b1( B2.load(k,j) );
3034 
3035  xmm1 += A2.load(i ,k) * b1;
3036  xmm2 += A2.load(i+1UL,k) * b1;
3037  }
3038 
3039  c(ii+i ,jbegin+j) += sum( xmm1 ) * alpha;
3040  c(ii+i+1UL,jbegin+j) += sum( xmm2 ) * alpha;
3041  }
3042 
3043  if( i<iend )
3044  {
3045  SIMDType xmm1;
3046 
3047  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
3048  {
3049  xmm1 += A2.load(i,k) * B2.load(k,j);
3050  }
3051 
3052  c(ii+i,jbegin+j) += sum( xmm1 ) * alpha;
3053  }
3054  }
3055 
3056  ii += iblock;
3057  }
3058 
3059  kk += kblock;
3060  }
3061 
3062  if( remainder && kk < K )
3063  {
3064  const size_t ksize( K - kk );
3065 
3066  const size_t jbegin( IsUpper<MT3>::value ? kk : 0UL );
3067  const size_t jsize ( N - jbegin );
3068 
3069  B2 = serial( submatrix( B, kk, jbegin, ksize, jsize ) );
3070 
3071  size_t ii( 0UL );
3072  size_t iblock( 0UL );
3073 
3074  while( ii < M )
3075  {
3076  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
3077 
3078  if( IsLower<MT2>::value && ii+iblock <= kk ) {
3079  ii += iblock;
3080  continue;
3081  }
3082 
3083  A2 = serial( submatrix( A, ii, kk, iblock, ksize ) );
3084 
3085  size_t j( 0UL );
3086 
3087  if( IsFloatingPoint<ET1>::value )
3088  {
3089  for( ; (j+5UL) <= jsize; j+=5UL )
3090  {
3091  if( ii > jbegin+j+4UL ) continue;
3092 
3093  const size_t iend( min( iblock, jbegin+j-ii+5UL ) );
3094  size_t i( 0UL );
3095 
3096  for( ; (i+2UL) <= iend; i+=2UL ) {
3097  for( size_t k=0UL; k<ksize; ++k ) {
3098  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
3099  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
3100  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
3101  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
3102  c(ii+i ,jbegin+j+4UL) += A2(i ,k) * B2(k,j+4UL) * alpha;
3103  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
3104  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
3105  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
3106  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
3107  c(ii+i+1UL,jbegin+j+4UL) += A2(i+1UL,k) * B2(k,j+4UL) * alpha;
3108  }
3109  }
3110 
3111  if( i<iend ) {
3112  for( size_t k=0UL; k<ksize; ++k ) {
3113  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
3114  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
3115  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
3116  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
3117  c(ii+i,jbegin+j+4UL) += A2(i,k) * B2(k,j+4UL) * alpha;
3118  }
3119  }
3120  }
3121  }
3122  else
3123  {
3124  for( ; (j+4UL) <= jsize; j+=4UL )
3125  {
3126  if( ii > jbegin+j+3UL ) continue;
3127 
3128  const size_t iend( min( iblock, jbegin+j-ii+4UL ) );
3129  size_t i( 0UL );
3130 
3131  for( ; (i+2UL) <= iend; i+=2UL ) {
3132  for( size_t k=0UL; k<ksize; ++k ) {
3133  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
3134  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
3135  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
3136  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
3137  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
3138  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
3139  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
3140  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
3141  }
3142  }
3143 
3144  if( i<iend ) {
3145  for( size_t k=0UL; k<ksize; ++k ) {
3146  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
3147  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
3148  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
3149  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
3150  }
3151  }
3152  }
3153  }
3154 
3155  for( ; (j+2UL) <= jsize; j+=2UL )
3156  {
3157  if( ii > jbegin+j+1UL ) continue;
3158 
3159  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
3160  size_t i( 0UL );
3161 
3162  for( ; (i+2UL) <= iend; i+=2UL ) {
3163  for( size_t k=0UL; k<ksize; ++k ) {
3164  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
3165  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
3166  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
3167  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
3168  }
3169  }
3170 
3171  if( i<iend ) {
3172  for( size_t k=0UL; k<ksize; ++k ) {
3173  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
3174  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
3175  }
3176  }
3177  }
3178 
3179  if( j<jsize && ii <= jbegin+j )
3180  {
3181  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
3182  size_t i( 0UL );
3183 
3184  for( ; (i+2UL) <= iend; i+=2UL ) {
3185  for( size_t k=0UL; k<ksize; ++k ) {
3186  c(ii+i ,jbegin+j) += A2(i ,k) * B2(k,j) * alpha;
3187  c(ii+i+1UL,jbegin+j) += A2(i+1UL,k) * B2(k,j) * alpha;
3188  }
3189  }
3190 
3191  if( i<iend ) {
3192  for( size_t k=0UL; k<ksize; ++k ) {
3193  c(ii+i,jbegin+j) += A2(i,k) * B2(k,j) * alpha;
3194  }
3195  }
3196  }
3197 
3198  ii += iblock;
3199  }
3200  }
3201 }
3203 //*************************************************************************************************
3204 
3205 
3206 //*************************************************************************************************
3222 template< typename MT1, typename MT2, typename MT3 >
3223 inline void ummm( MT1& C, const MT2& A, const MT3& B )
3224 {
3225  using ET1 = ElementType_<MT1>;
3226  using ET2 = ElementType_<MT2>;
3227  using ET3 = ElementType_<MT3>;
3228 
3231 
3232  ummm( C, A, B, ET1(1), ET1(0) );
3233 }
3235 //*************************************************************************************************
3236 
3237 
3238 
3239 
3240 //=================================================================================================
3241 //
3242 // SYMMETRIC DENSE MATRIX MULTIPLICATION KERNELS
3243 //
3244 //=================================================================================================
3245 
3246 //*************************************************************************************************
3264 template< typename MT1, typename MT2, typename MT3, typename ST >
3265 void smmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha )
3266 {
3267  using ET1 = ElementType_<MT1>;
3268  using ET2 = ElementType_<MT2>;
3269  using ET3 = ElementType_<MT3>;
3270 
3275 
3278 
3281 
3284 
3285  const size_t M( A.rows() );
3286  const size_t N( B.columns() );
3287 
3288  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3289 
3290  lmmm( C, A, B, alpha, ST(0) );
3291 
3292  for( size_t ii=0UL; ii<M; ii+=BLOCK_SIZE )
3293  {
3294  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3295 
3296  for( size_t i=ii; i<iend; ++i ) {
3297  for( size_t j=i+1UL; j<iend; ++j ) {
3298  (~C)(i,j) = (~C)(j,i);
3299  }
3300  }
3301 
3302  for( size_t jj=ii+BLOCK_SIZE; jj<N; jj+=BLOCK_SIZE ) {
3303  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3304  for( size_t i=ii; i<iend; ++i ) {
3305  for( size_t j=jj; j<jend; ++j ) {
3306  (~C)(i,j) = (~C)(j,i);
3307  }
3308  }
3309  }
3310  }
3311 }
3313 //*************************************************************************************************
3314 
3315 
3316 //*************************************************************************************************
3334 template< typename MT1, typename MT2, typename MT3, typename ST >
3335 void smmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha )
3336 {
3337  using ET1 = ElementType_<MT1>;
3338  using ET2 = ElementType_<MT2>;
3339  using ET3 = ElementType_<MT3>;
3340 
3345 
3348 
3351 
3354 
3355  const size_t M( A.rows() );
3356  const size_t N( B.columns() );
3357 
3358  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3359 
3360  ummm( C, A, B, alpha, ST(0) );
3361 
3362  for( size_t jj=0UL; jj<N; jj+=BLOCK_SIZE )
3363  {
3364  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3365 
3366  for( size_t j=jj; j<jend; ++j ) {
3367  for( size_t i=jj+1UL; i<jend; ++i ) {
3368  (~C)(i,j) = (~C)(j,i);
3369  }
3370  }
3371 
3372  for( size_t ii=jj+BLOCK_SIZE; ii<M; ii+=BLOCK_SIZE ) {
3373  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3374  for( size_t j=jj; j<jend; ++j ) {
3375  for( size_t i=ii; i<iend; ++i ) {
3376  (~C)(i,j) = (~C)(j,i);
3377  }
3378  }
3379  }
3380  }
3381 }
3383 //*************************************************************************************************
3384 
3385 
3386 //*************************************************************************************************
3402 template< typename MT1, typename MT2, typename MT3 >
3403 inline void smmm( MT1& C, const MT2& A, const MT3& B )
3404 {
3405  using ET1 = ElementType_<MT1>;
3406  using ET2 = ElementType_<MT2>;
3407  using ET3 = ElementType_<MT3>;
3408 
3411 
3412  smmm( C, A, B, ET1(1) );
3413 }
3415 //*************************************************************************************************
3416 
3417 
3418 
3419 
3420 //=================================================================================================
3421 //
3422 // HERMITIAN DENSE MATRIX MULTIPLICATION KERNELS
3423 //
3424 //=================================================================================================
3425 
3426 //*************************************************************************************************
3444 template< typename MT1, typename MT2, typename MT3, typename ST >
3445 void hmmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha )
3446 {
3447  using ET1 = ElementType_<MT1>;
3448  using ET2 = ElementType_<MT2>;
3449  using ET3 = ElementType_<MT3>;
3450 
3455 
3458 
3461 
3464 
3465  const size_t M( A.rows() );
3466  const size_t N( B.columns() );
3467 
3468  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3469 
3470  lmmm( C, A, B, alpha, ST(0) );
3471 
3472  for( size_t ii=0UL; ii<M; ii+=BLOCK_SIZE )
3473  {
3474  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3475 
3476  for( size_t i=ii; i<iend; ++i ) {
3477  for( size_t j=i+1UL; j<iend; ++j ) {
3478  (~C)(i,j) = conj( (~C)(j,i) );
3479  }
3480  }
3481 
3482  for( size_t jj=ii+BLOCK_SIZE; jj<N; jj+=BLOCK_SIZE ) {
3483  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3484  for( size_t i=ii; i<iend; ++i ) {
3485  for( size_t j=jj; j<jend; ++j ) {
3486  (~C)(i,j) = conj( (~C)(j,i) );
3487  }
3488  }
3489  }
3490  }
3491 }
3493 //*************************************************************************************************
3494 
3495 
3496 //*************************************************************************************************
3514 template< typename MT1, typename MT2, typename MT3, typename ST >
3515 void hmmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha )
3516 {
3517  using ET1 = ElementType_<MT1>;
3518  using ET2 = ElementType_<MT2>;
3519  using ET3 = ElementType_<MT3>;
3520 
3525 
3528 
3531 
3534 
3535  const size_t M( A.rows() );
3536  const size_t N( B.columns() );
3537 
3538  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3539 
3540  ummm( C, A, B, alpha, ST(0) );
3541 
3542  for( size_t jj=0UL; jj<N; jj+=BLOCK_SIZE )
3543  {
3544  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3545 
3546  for( size_t j=jj; j<jend; ++j ) {
3547  for( size_t i=jj+1UL; i<jend; ++i ) {
3548  (~C)(i,j) = conj( (~C)(j,i) );
3549  }
3550  }
3551 
3552  for( size_t ii=jj+BLOCK_SIZE; ii<M; ii+=BLOCK_SIZE ) {
3553  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3554  for( size_t j=jj; j<jend; ++j ) {
3555  for( size_t i=ii; i<iend; ++i ) {
3556  (~C)(i,j) = conj( (~C)(j,i) );
3557  }
3558  }
3559  }
3560  }
3561 }
3563 //*************************************************************************************************
3564 
3565 
3566 //*************************************************************************************************
3582 template< typename MT1, typename MT2, typename MT3 >
3583 inline void hmmm( MT1& C, const MT2& A, const MT3& B )
3584 {
3585  using ET1 = ElementType_<MT1>;
3586  using ET2 = ElementType_<MT2>;
3587  using ET3 = ElementType_<MT3>;
3588 
3591 
3592  hmmm( C, A, B, ET1(1) );
3593 }
3595 //*************************************************************************************************
3596 
3597 } // namespace blaze
3598 
3599 #endif
Header file for the implementation of the Submatrix view.
Constraint on the data type.
Header file for auxiliary alias declarations.
Headerfile for the generic min algorithm.
Header file for kernel specific block sizes.
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_STRICTLY_UPPER_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a strictly upper triangular matrix type...
Definition: StrictlyUpper.h:81
Header file for basic type definitions.
Header file for the serial shim.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:265
#define BLAZE_CONSTRAINT_MUST_NOT_BE_COMPUTATION_TYPE(T)
Constraint on the data type.In case the given data type T is a computational expression (i...
Definition: Computation.h:81
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_NOT_BE_UNIUPPER_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a upper unitriangular matrix type...
Definition: UniUpper.h:81
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
Submatrix< MT, AF > submatrix(Matrix< MT, SO > &matrix, size_t row, size_t column, size_t m, size_t n)
Creating a view on a specific submatrix of the given matrix.
Definition: Submatrix.h:352
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Constraint on the data type.
Constraints on the storage order of matrix types.
Constraint on the data type.
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_ADAPTOR_TYPE(T)
Constraint on the data type.In case the given data type T is an adaptor type (as for instance LowerMa...
Definition: Adaptor.h:81
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_STRICTLY_LOWER_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a strictly lower triangular matrix type...
Definition: StrictlyLower.h:81
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_SIMD_COMBINABLE_TYPES(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 are not SIMD combinable (i...
Definition: SIMDCombinable.h:61
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Compile time assertion.
Header file for the IsFloatingPoint type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Header file for the DenseMatrix base class.
Header file for all SIMD functionality.
Header file for the IsLower type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_UPPER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is an upper triangular matrix type...
Definition: Upper.h:81
Header file for the implementation of a dynamic MxN matrix.
Constraint on the data type.
Header file for the IsPadded type trait.
Header file for the isOne shim.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
Header file for run time assertion macros.
Constraint on the data type.
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_LOWER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a lower triangular matrix type...
Definition: Lower.h:81
bool isOne(const DiagonalProxy< MT > &proxy)
Returns whether the represented element is 1.
Definition: DiagonalProxy.h:662
Header file for the isDefault shim.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_UNILOWER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a lower unitriangular matrix type...
Definition: UniLower.h:81
Constraint on the data type.
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
bool isDefault(const DiagonalProxy< MT > &proxy)
Returns whether the represented element is in default state.
Definition: DiagonalProxy.h:600
#define BLAZE_CONSTRAINT_MUST_NOT_BE_HERMITIAN_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is an Hermitian matrix type, a compilation error is created.
Definition: Hermitian.h:79
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
#define BLAZE_STATIC_ASSERT(expr)
Compile time assertion macro. In case of an invalid compile time expression, a compilation error is cr...
Definition: StaticAssert.h:112
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the data type.