Blaze  3.6
MMM.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_DENSE_MMM_H_
36 #define _BLAZE_MATH_DENSE_MMM_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/Aliases.h>
61 #include <blaze/math/shims/IsOne.h>
63 #include <blaze/math/SIMD.h>
67 #include <blaze/math/views/Check.h>
69 #include <blaze/system/Blocking.h>
71 #include <blaze/util/Assert.h>
73 #include <blaze/util/Types.h>
75 
76 
77 namespace blaze {
78 
79 //=================================================================================================
80 //
81 // GENERAL DENSE MATRIX MULTIPLICATION KERNELS
82 //
83 //=================================================================================================
84 
85 //*************************************************************************************************
/*!\brief Blocked SIMD kernel computing \f$ C=\alpha*A*B+\beta*C \f$ for a target with storage-order flag \c false.
//
// \param C The target matrix; reset if \c isDefault(beta), otherwise scaled by \a beta (unless it is one).
// \param A The left-hand side operand (M x K).
// \param B The right-hand side operand (K x N); \c A.columns()==B.rows() is only checked by an internal assert.
// \param alpha Factor applied to every accumulated dot product before it is added to \a C.
// \param beta Factor applied to the initial content of \a C.
// \return void
//
// K is walked in panels of at most KBLOCK and N in blocks of at most JBLOCK. For each panel/block
// pair the needed parts of \a A and \a B are copied into the scratch matrices A2 and B2, and the
// products are accumulated tile by tile (5x2 or 4x2, then 2x4, 2x2, 2x1, 1x2, 1x1) with SIMD loads
// along k. If either operand is not padded, the last K % SIMDSIZE entries of the k range are
// processed by the scalar loop at the end of the function.
*/
104 template< typename MT1, typename MT2, typename MT3, typename ST >
105 void mmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
106 {
107  using ET1 = ElementType_t<MT1>;
108  using ET2 = ElementType_t<MT2>;
109  using ET3 = ElementType_t<MT3>;
110  using SIMDType = SIMDTrait_t<ET1>;
111 
116 
119 
122 
125 
    // Step width of all vectorized k loops below.
126  constexpr size_t SIMDSIZE( SIMDTrait<ET1>::size );
127 
    // True if A or B is not padded: the vectorized loop then only covers multiples of SIMDSIZE
    // and the rest of K is handled by the scalar tail at the end of the function.
128  constexpr bool remainder( !IsPadded_v<MT2> || !IsPadded_v<MT3> );
129 
    // Panel (k) and block (j) extents; both must be non-zero multiples of SIMDSIZE (asserted below).
130  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
131  constexpr size_t JBLOCK( MMM_INNER_BLOCK_SIZE );
132 
133  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
134  BLAZE_STATIC_ASSERT( JBLOCK >= SIMDSIZE && JBLOCK % SIMDSIZE == 0UL );
135 
136  const size_t M( A.rows() );
137  const size_t N( B.columns() );
138  const size_t K( A.columns() );
139 
140  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
141 
    // Scratch copies of the current panel of A and the current block of B. They use different
    // storage-order flags; presumably this makes both A2.load(i,k) and B2.load(k,j) read
    // elements that are contiguous in k - TODO confirm.
142  DynamicMatrix<ET2,false> A2( M, KBLOCK );
143  DynamicMatrix<ET3,true> B2( KBLOCK, JBLOCK );
144 
    // Apply beta once up front; everything below only accumulates into C.
145  if( isDefault( beta ) ) {
146  reset( ~C );
147  }
148  else if( !isOne( beta ) ) {
149  (~C) *= beta;
150  }
151 
152  size_t kk( 0UL );
153  size_t kblock( 0UL );
154 
    // Vectorized part. With 'remainder' set the loop only runs while at least SIMDSIZE
    // entries of the k range are left.
155  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
156  {
157  if( remainder ) {
        // A partial last panel is rounded down to a multiple of SIMDSIZE
        // (the bit mask assumes SIMDSIZE is a power of two).
158  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
159  }
160  else {
161  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
162  }
163 
        // Row range processed for this panel; narrowed when MT2 is lower or upper.
164  const size_t ibegin( IsLower_v<MT2> ? kk : 0UL );
165  const size_t iend ( IsUpper_v<MT2> ? kk+kblock : M );
166  const size_t isize ( iend - ibegin );
167 
168  A2 = serial( submatrix< remainder ? unaligned : aligned >( A, ibegin, kk, isize, kblock, unchecked ) );
169 
170  size_t jj( 0UL );
171  size_t jblock( 0UL );
172 
173  while( jj < N )
174  {
175  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
176 
        // Blocks of B selected by the lower/upper predicates are skipped without
        // accumulating anything into C.
177  if( ( IsLower_v<MT3> && kk+kblock <= jj ) ||
178  ( IsUpper_v<MT3> && jj+jblock <= kk ) ) {
179  jj += jblock;
180  continue;
181  }
182 
183  B2 = serial( submatrix< remainder ? unaligned : aligned >( B, kk, jj, kblock, jblock, unchecked ) );
184 
        // i and j below are offsets inside A2/B2; C is addressed with ibegin+i and jj+j.
185  size_t i( 0UL );
186 
        // Widest tiles first: 5 rows x 2 columns for floating point ET1, 4 x 2 otherwise.
        // Rows left over by either branch are picked up by the 2-row and 1-row code further down.
187  if( IsFloatingPoint_v<ET1> )
188  {
189  for( ; (i+5UL) <= isize; i+=5UL )
190  {
191  size_t j( 0UL );
192 
193  for( ; (j+2UL) <= jblock; j+=2UL )
194  {
        // NOTE(review): the accumulators are default constructed and only ever updated
        // with +=; this assumes SIMDType default construction yields zero - confirm.
195  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
196 
197  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
198  {
199  const SIMDType a1( A2.load(i ,k) );
200  const SIMDType a2( A2.load(i+1UL,k) );
201  const SIMDType a3( A2.load(i+2UL,k) );
202  const SIMDType a4( A2.load(i+3UL,k) );
203  const SIMDType a5( A2.load(i+4UL,k) );
204 
205  const SIMDType b1( B2.load(k,j ) );
206  const SIMDType b2( B2.load(k,j+1UL) );
207 
208  xmm1 += a1 * b1;
209  xmm2 += a1 * b2;
210  xmm3 += a2 * b1;
211  xmm4 += a2 * b2;
212  xmm5 += a3 * b1;
213  xmm6 += a3 * b2;
214  xmm7 += a4 * b1;
215  xmm8 += a4 * b2;
216  xmm9 += a5 * b1;
217  xmm10 += a5 * b2;
218  }
219 
        // Each accumulator is reduced with sum() and scaled by alpha before it is added to C.
220  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
221  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
222  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
223  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
224  (~C)(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
225  (~C)(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
226  (~C)(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
227  (~C)(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
228  (~C)(ibegin+i+4UL,jj+j ) += sum( xmm9 ) * alpha;
229  (~C)(ibegin+i+4UL,jj+j+1UL) += sum( xmm10 ) * alpha;
230  }
231 
        // Odd jblock: one column is left for these five rows.
232  if( j<jblock )
233  {
234  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
235 
236  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
237  {
238  const SIMDType a1( A2.load(i ,k) );
239  const SIMDType a2( A2.load(i+1UL,k) );
240  const SIMDType a3( A2.load(i+2UL,k) );
241  const SIMDType a4( A2.load(i+3UL,k) );
242  const SIMDType a5( A2.load(i+4UL,k) );
243 
244  const SIMDType b1( B2.load(k,j) );
245 
246  xmm1 += a1 * b1;
247  xmm2 += a2 * b1;
248  xmm3 += a3 * b1;
249  xmm4 += a4 * b1;
250  xmm5 += a5 * b1;
251  }
252 
253  (~C)(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
254  (~C)(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
255  (~C)(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
256  (~C)(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
257  (~C)(ibegin+i+4UL,jj+j) += sum( xmm5 ) * alpha;
258  }
259  }
260  }
261  else
262  {
        // Same scheme with 4 rows x 2 columns, followed by a 4 x 1 tile for an odd jblock.
263  for( ; (i+4UL) <= isize; i+=4UL )
264  {
265  size_t j( 0UL );
266 
267  for( ; (j+2UL) <= jblock; j+=2UL )
268  {
269  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
270 
271  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
272  {
273  const SIMDType a1( A2.load(i ,k) );
274  const SIMDType a2( A2.load(i+1UL,k) );
275  const SIMDType a3( A2.load(i+2UL,k) );
276  const SIMDType a4( A2.load(i+3UL,k) );
277 
278  const SIMDType b1( B2.load(k,j ) );
279  const SIMDType b2( B2.load(k,j+1UL) );
280 
281  xmm1 += a1 * b1;
282  xmm2 += a1 * b2;
283  xmm3 += a2 * b1;
284  xmm4 += a2 * b2;
285  xmm5 += a3 * b1;
286  xmm6 += a3 * b2;
287  xmm7 += a4 * b1;
288  xmm8 += a4 * b2;
289  }
290 
291  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
292  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
293  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
294  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
295  (~C)(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
296  (~C)(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
297  (~C)(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
298  (~C)(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
299  }
300 
301  if( j<jblock )
302  {
303  SIMDType xmm1, xmm2, xmm3, xmm4;
304 
305  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
306  {
307  const SIMDType a1( A2.load(i ,k) );
308  const SIMDType a2( A2.load(i+1UL,k) );
309  const SIMDType a3( A2.load(i+2UL,k) );
310  const SIMDType a4( A2.load(i+3UL,k) );
311 
312  const SIMDType b1( B2.load(k,j) );
313 
314  xmm1 += a1 * b1;
315  xmm2 += a2 * b1;
316  xmm3 += a3 * b1;
317  xmm4 += a4 * b1;
318  }
319 
320  (~C)(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
321  (~C)(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
322  (~C)(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
323  (~C)(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
324  }
325  }
326  }
327 
        // Remaining row pairs: 2 x 4, then 2 x 2, then 2 x 1 tiles.
328  for( ; (i+2UL) <= isize; i+=2UL )
329  {
330  size_t j( 0UL );
331 
332  for( ; (j+4UL) <= jblock; j+=4UL )
333  {
334  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
335 
336  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
337  {
338  const SIMDType a1( A2.load(i ,k) );
339  const SIMDType a2( A2.load(i+1UL,k) );
340 
341  const SIMDType b1( B2.load(k,j ) );
342  const SIMDType b2( B2.load(k,j+1UL) );
343  const SIMDType b3( B2.load(k,j+2UL) );
344  const SIMDType b4( B2.load(k,j+3UL) );
345 
346  xmm1 += a1 * b1;
347  xmm2 += a1 * b2;
348  xmm3 += a1 * b3;
349  xmm4 += a1 * b4;
350  xmm5 += a2 * b1;
351  xmm6 += a2 * b2;
352  xmm7 += a2 * b3;
353  xmm8 += a2 * b4;
354  }
355 
356  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
357  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
358  (~C)(ibegin+i ,jj+j+2UL) += sum( xmm3 ) * alpha;
359  (~C)(ibegin+i ,jj+j+3UL) += sum( xmm4 ) * alpha;
360  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm5 ) * alpha;
361  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm6 ) * alpha;
362  (~C)(ibegin+i+1UL,jj+j+2UL) += sum( xmm7 ) * alpha;
363  (~C)(ibegin+i+1UL,jj+j+3UL) += sum( xmm8 ) * alpha;
364  }
365 
366  for( ; (j+2UL) <= jblock; j+=2UL )
367  {
368  SIMDType xmm1, xmm2, xmm3, xmm4;
369 
370  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
371  {
372  const SIMDType a1( A2.load(i ,k) );
373  const SIMDType a2( A2.load(i+1UL,k) );
374 
375  const SIMDType b1( B2.load(k,j ) );
376  const SIMDType b2( B2.load(k,j+1UL) );
377 
378  xmm1 += a1 * b1;
379  xmm2 += a1 * b2;
380  xmm3 += a2 * b1;
381  xmm4 += a2 * b2;
382  }
383 
384  (~C)(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
385  (~C)(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
386  (~C)(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
387  (~C)(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
388  }
389 
390  if( j<jblock )
391  {
392  SIMDType xmm1, xmm2;
393 
394  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
395  {
396  const SIMDType a1( A2.load(i ,k) );
397  const SIMDType a2( A2.load(i+1UL,k) );
398 
399  const SIMDType b1( B2.load(k,j) );
400 
401  xmm1 += a1 * b1;
402  xmm2 += a2 * b1;
403  }
404 
405  (~C)(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
406  (~C)(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
407  }
408  }
409 
        // Last single row: 1 x 2 tiles, then a final 1 x 1 tile.
410  if( i<isize )
411  {
412  size_t j( 0UL );
413 
414  for( ; (j+2UL) <= jblock; j+=2UL )
415  {
416  SIMDType xmm1, xmm2;
417 
418  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
419  {
420  const SIMDType a1( A2.load(i,k) );
421 
422  xmm1 += a1 * B2.load(k,j );
423  xmm2 += a1 * B2.load(k,j+1UL);
424  }
425 
426  (~C)(ibegin+i,jj+j ) += sum( xmm1 ) * alpha;
427  (~C)(ibegin+i,jj+j+1UL) += sum( xmm2 ) * alpha;
428  }
429 
430  if( j<jblock )
431  {
432  SIMDType xmm1;
433 
434  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
435  {
436  const SIMDType a1( A2.load(i,k) );
437 
438  xmm1 += a1 * B2.load(k,j);
439  }
440 
441  (~C)(ibegin+i,jj+j) += sum( xmm1 ) * alpha;
442  }
443  }
444 
445  jj += jblock;
446  }
447 
448  kk += kblock;
449  }
450 
    // Scalar tail: the vectorized loop stops once fewer than SIMDSIZE entries of the k range
    // remain, so ksize = K-kk < SIMDSIZE here. The same tiling is used, but with element access
    // A2(i,k) / B2(k,j) and direct accumulation into C instead of SIMD loads.
451  if( remainder && kk < K )
452  {
453  const size_t ksize( K - kk );
454 
        // Only the lower-A restriction narrows the row range here; rows run up to M.
455  const size_t ibegin( IsLower_v<MT2> ? kk : 0UL );
456  const size_t isize ( M - ibegin );
457 
458  A2 = serial( submatrix( A, ibegin, kk, isize, ksize, unchecked ) );
459 
460  size_t jj( 0UL );
461  size_t jblock( 0UL );
462 
463  while( jj < N )
464  {
465  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
466 
467  if( IsUpper_v<MT3> && jj+jblock <= kk ) {
468  jj += jblock;
469  continue;
470  }
471 
472  B2 = serial( submatrix( B, kk, jj, ksize, jblock, unchecked ) );
473 
474  size_t i( 0UL );
475 
476  if( IsFloatingPoint_v<ET1> )
477  {
478  for( ; (i+5UL) <= isize; i+=5UL )
479  {
480  size_t j( 0UL );
481 
482  for( ; (j+2UL) <= jblock; j+=2UL ) {
483  for( size_t k=0UL; k<ksize; ++k ) {
484  (~C)(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
485  (~C)(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
486  (~C)(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
487  (~C)(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
488  (~C)(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
489  (~C)(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
490  (~C)(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
491  (~C)(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
492  (~C)(ibegin+i+4UL,jj+j ) += A2(i+4UL,k) * B2(k,j ) * alpha;
493  (~C)(ibegin+i+4UL,jj+j+1UL) += A2(i+4UL,k) * B2(k,j+1UL) * alpha;
494  }
495  }
496 
497  if( j<jblock ) {
498  for( size_t k=0UL; k<ksize; ++k ) {
499  (~C)(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
500  (~C)(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
501  (~C)(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
502  (~C)(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
503  (~C)(ibegin+i+4UL,jj+j) += A2(i+4UL,k) * B2(k,j) * alpha;
504  }
505  }
506  }
507  }
508  else
509  {
510  for( ; (i+4UL) <= isize; i+=4UL )
511  {
512  size_t j( 0UL );
513 
514  for( ; (j+2UL) <= jblock; j+=2UL ) {
515  for( size_t k=0UL; k<ksize; ++k ) {
516  (~C)(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
517  (~C)(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
518  (~C)(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
519  (~C)(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
520  (~C)(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
521  (~C)(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
522  (~C)(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
523  (~C)(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
524  }
525  }
526 
527  if( j<jblock ) {
528  for( size_t k=0UL; k<ksize; ++k ) {
529  (~C)(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
530  (~C)(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
531  (~C)(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
532  (~C)(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
533  }
534  }
535  }
536  }
537 
538  for( ; (i+2UL) <= isize; i+=2UL )
539  {
540  size_t j( 0UL );
541 
542  for( ; (j+2UL) <= jblock; j+=2UL ) {
543  for( size_t k=0UL; k<ksize; ++k ) {
544  (~C)(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
545  (~C)(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
546  (~C)(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
547  (~C)(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
548  }
549  }
550 
551  if( j<jblock ) {
552  for( size_t k=0UL; k<ksize; ++k ) {
553  (~C)(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
554  (~C)(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
555  }
556  }
557  }
558 
559  if( i<isize )
560  {
561  size_t j( 0UL );
562 
563  for( ; (j+2UL) <= jblock; j+=2UL ) {
564  for( size_t k=0UL; k<ksize; ++k ) {
565  (~C)(ibegin+i,jj+j ) += A2(i,k) * B2(k,j ) * alpha;
566  (~C)(ibegin+i,jj+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
567  }
568  }
569 
570  if( j<jblock ) {
571  for( size_t k=0UL; k<ksize; ++k ) {
572  (~C)(ibegin+i,jj+j) += A2(i,k) * B2(k,j) * alpha;
573  }
574  }
575  }
576 
577  jj += jblock;
578  }
579  }
580 }
582 //*************************************************************************************************
583 
584 
585 //*************************************************************************************************
/*!\brief Blocked SIMD kernel computing \f$ C=\alpha*A*B+\beta*C \f$ for a target with storage-order flag \c true.
//
// \param C The target matrix; reset if \c isDefault(beta), otherwise scaled by \a beta (unless it is one).
// \param A The left-hand side operand (M x K).
// \param B The right-hand side operand (K x N); \c A.columns()==B.rows() is only checked by an internal assert.
// \param alpha Factor applied to every accumulated dot product before it is added to \a C.
// \param beta Factor applied to the initial content of \a C.
// \return void
//
// Mirror image of the overload for storage-order flag \c false: K is walked in panels of at most
// KBLOCK, a whole column range of \a B is copied into B2 once per panel, and \a A is copied into
// A2 in row blocks of at most IBLOCK. The tiles are 2x5 or 2x4, then 4x2, 2x2, 1x2, 2x1, 1x1.
// If either operand is not padded, the last K % SIMDSIZE entries of the k range are processed by
// the scalar loop at the end of the function.
*/
604 template< typename MT1, typename MT2, typename MT3, typename ST >
605 void mmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
606 {
607  using ET1 = ElementType_t<MT1>;
608  using ET2 = ElementType_t<MT2>;
609  using ET3 = ElementType_t<MT3>;
610  using SIMDType = SIMDTrait_t<ET1>;
611 
616 
619 
622 
625 
    // Step width of all vectorized k loops below.
626  constexpr size_t SIMDSIZE( SIMDTrait<ET1>::size );
627 
    // True if A or B is not padded: the vectorized loop then only covers multiples of SIMDSIZE
    // and the rest of K is handled by the scalar tail at the end of the function.
628  constexpr bool remainder( !IsPadded_v<MT2> || !IsPadded_v<MT3> );
629 
    // Panel (k) and block (i) extents; both must be non-zero multiples of SIMDSIZE (asserted below).
630  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
631  constexpr size_t IBLOCK( MMM_INNER_BLOCK_SIZE );
632 
633  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
634  BLAZE_STATIC_ASSERT( IBLOCK >= SIMDSIZE && IBLOCK % SIMDSIZE == 0UL );
635 
636  const size_t M( A.rows() );
637  const size_t N( B.columns() );
638  const size_t K( A.columns() );
639 
640  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
641 
    // Scratch copies: here B2 spans all N columns and A2 holds one row block of A.
642  DynamicMatrix<ET2,false> A2( IBLOCK, KBLOCK );
643  DynamicMatrix<ET3,true> B2( KBLOCK, N );
644 
    // Apply beta once up front; everything below only accumulates into C.
645  if( isDefault( beta ) ) {
646  reset( ~C );
647  }
648  else if( !isOne( beta ) ) {
649  (~C) *= beta;
650  }
651 
652  size_t kk( 0UL );
653  size_t kblock( 0UL );
654 
    // Vectorized part. With 'remainder' set the loop only runs while at least SIMDSIZE
    // entries of the k range are left.
655  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
656  {
657  if( remainder ) {
        // A partial last panel is rounded down to a multiple of SIMDSIZE
        // (the bit mask assumes SIMDSIZE is a power of two).
658  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
659  }
660  else {
661  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
662  }
663 
        // Column range processed for this panel; narrowed when MT3 is upper or lower.
664  const size_t jbegin( IsUpper_v<MT3> ? kk : 0UL );
665  const size_t jend ( IsLower_v<MT3> ? kk+kblock : N );
666  const size_t jsize ( jend - jbegin );
667 
668  B2 = serial( submatrix< remainder ? unaligned : aligned >( B, kk, jbegin, kblock, jsize, unchecked ) );
669 
670  size_t ii( 0UL );
671  size_t iblock( 0UL );
672 
673  while( ii < M )
674  {
675  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
676 
        // Row blocks of A selected by the lower/upper predicates are skipped without
        // accumulating anything into C.
677  if( ( IsLower_v<MT2> && ii+iblock <= kk ) ||
678  ( IsUpper_v<MT2> && kk+kblock <= ii ) ) {
679  ii += iblock;
680  continue;
681  }
682 
683  A2 = serial( submatrix< remainder ? unaligned : aligned >( A, ii, kk, iblock, kblock, unchecked ) );
684 
        // i and j below are offsets inside A2/B2; C is addressed with ii+i and jbegin+j.
685  size_t j( 0UL );
686 
        // NOTE(review): this branch tests ET3, whereas the scalar tail at the end of this
        // function tests ET1. Either branch is followed by the 2-column and 1-column code
        // below, so the choice only selects the unroll factor (5 vs 4 columns) - confirm
        // which element type was intended.
687  if( IsFloatingPoint_v<ET3> )
688  {
689  for( ; (j+5UL) <= jsize; j+=5UL )
690  {
691  size_t i( 0UL );
692 
693  for( ; (i+2UL) <= iblock; i+=2UL )
694  {
        // NOTE(review): the accumulators are default constructed and only ever updated
        // with +=; this assumes SIMDType default construction yields zero - confirm.
695  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
696 
697  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
698  {
699  const SIMDType a1( A2.load(i ,k) );
700  const SIMDType a2( A2.load(i+1UL,k) );
701 
702  const SIMDType b1( B2.load(k,j ) );
703  const SIMDType b2( B2.load(k,j+1UL) );
704  const SIMDType b3( B2.load(k,j+2UL) );
705  const SIMDType b4( B2.load(k,j+3UL) );
706  const SIMDType b5( B2.load(k,j+4UL) );
707 
708  xmm1 += a1 * b1;
709  xmm2 += a1 * b2;
710  xmm3 += a1 * b3;
711  xmm4 += a1 * b4;
712  xmm5 += a1 * b5;
713  xmm6 += a2 * b1;
714  xmm7 += a2 * b2;
715  xmm8 += a2 * b3;
716  xmm9 += a2 * b4;
717  xmm10 += a2 * b5;
718  }
719 
        // Each accumulator is reduced with sum() and scaled by alpha before it is added to C.
720  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
721  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
722  (~C)(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
723  (~C)(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
724  (~C)(ii+i ,jbegin+j+4UL) += sum( xmm5 ) * alpha;
725  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm6 ) * alpha;
726  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm7 ) * alpha;
727  (~C)(ii+i+1UL,jbegin+j+2UL) += sum( xmm8 ) * alpha;
728  (~C)(ii+i+1UL,jbegin+j+3UL) += sum( xmm9 ) * alpha;
729  (~C)(ii+i+1UL,jbegin+j+4UL) += sum( xmm10 ) * alpha;
730  }
731 
        // Odd iblock: one row is left for these five columns.
732  if( i<iblock )
733  {
734  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
735 
736  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
737  {
738  const SIMDType a1( A2.load(i,k) );
739 
740  xmm1 += a1 * B2.load(k,j );
741  xmm2 += a1 * B2.load(k,j+1UL);
742  xmm3 += a1 * B2.load(k,j+2UL);
743  xmm4 += a1 * B2.load(k,j+3UL);
744  xmm5 += a1 * B2.load(k,j+4UL);
745  }
746 
747  (~C)(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
748  (~C)(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
749  (~C)(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
750  (~C)(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
751  (~C)(ii+i,jbegin+j+4UL) += sum( xmm5 ) * alpha;
752  }
753  }
754  }
755  else
756  {
        // Same scheme with 2 rows x 4 columns, followed by a 1 x 4 tile for an odd iblock.
757  for( ; (j+4UL) <= jsize; j+=4UL )
758  {
759  size_t i( 0UL );
760 
761  for( ; (i+2UL) <= iblock; i+=2UL )
762  {
763  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
764 
765  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
766  {
767  const SIMDType a1( A2.load(i ,k) );
768  const SIMDType a2( A2.load(i+1UL,k) );
769 
770  const SIMDType b1( B2.load(k,j ) );
771  const SIMDType b2( B2.load(k,j+1UL) );
772  const SIMDType b3( B2.load(k,j+2UL) );
773  const SIMDType b4( B2.load(k,j+3UL) );
774 
775  xmm1 += a1 * b1;
776  xmm2 += a1 * b2;
777  xmm3 += a1 * b3;
778  xmm4 += a1 * b4;
779  xmm5 += a2 * b1;
780  xmm6 += a2 * b2;
781  xmm7 += a2 * b3;
782  xmm8 += a2 * b4;
783  }
784 
785  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
786  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
787  (~C)(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
788  (~C)(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
789  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm5 ) * alpha;
790  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
791  (~C)(ii+i+1UL,jbegin+j+2UL) += sum( xmm7 ) * alpha;
792  (~C)(ii+i+1UL,jbegin+j+3UL) += sum( xmm8 ) * alpha;
793  }
794 
795  if( i<iblock )
796  {
797  SIMDType xmm1, xmm2, xmm3, xmm4;
798 
799  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
800  {
801  const SIMDType a1( A2.load(i,k) );
802 
803  xmm1 += a1 * B2.load(k,j );
804  xmm2 += a1 * B2.load(k,j+1UL);
805  xmm3 += a1 * B2.load(k,j+2UL);
806  xmm4 += a1 * B2.load(k,j+3UL);
807  }
808 
809  (~C)(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
810  (~C)(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
811  (~C)(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
812  (~C)(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
813  }
814  }
815  }
816 
        // Remaining column pairs: 4 x 2, then 2 x 2, then 1 x 2 tiles.
817  for( ; (j+2UL) <= jsize; j+=2UL )
818  {
819  size_t i( 0UL );
820 
821  for( ; (i+4UL) <= iblock; i+=4UL )
822  {
823  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
824 
825  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
826  {
827  const SIMDType a1( A2.load(i ,k) );
828  const SIMDType a2( A2.load(i+1UL,k) );
829  const SIMDType a3( A2.load(i+2UL,k) );
830  const SIMDType a4( A2.load(i+3UL,k) );
831 
832  const SIMDType b1( B2.load(k,j ) );
833  const SIMDType b2( B2.load(k,j+1UL) );
834 
835  xmm1 += a1 * b1;
836  xmm2 += a1 * b2;
837  xmm3 += a2 * b1;
838  xmm4 += a2 * b2;
839  xmm5 += a3 * b1;
840  xmm6 += a3 * b2;
841  xmm7 += a4 * b1;
842  xmm8 += a4 * b2;
843  }
844 
845  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
846  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
847  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
848  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
849  (~C)(ii+i+2UL,jbegin+j ) += sum( xmm5 ) * alpha;
850  (~C)(ii+i+2UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
851  (~C)(ii+i+3UL,jbegin+j ) += sum( xmm7 ) * alpha;
852  (~C)(ii+i+3UL,jbegin+j+1UL) += sum( xmm8 ) * alpha;
853  }
854 
855  for( ; (i+2UL) <= iblock; i+=2UL )
856  {
857  SIMDType xmm1, xmm2, xmm3, xmm4;
858 
859  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
860  {
861  const SIMDType a1( A2.load(i ,k) );
862  const SIMDType a2( A2.load(i+1UL,k) );
863 
864  const SIMDType b1( B2.load(k,j ) );
865  const SIMDType b2( B2.load(k,j+1UL) );
866 
867  xmm1 += a1 * b1;
868  xmm2 += a1 * b2;
869  xmm3 += a2 * b1;
870  xmm4 += a2 * b2;
871  }
872 
873  (~C)(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
874  (~C)(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
875  (~C)(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
876  (~C)(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
877  }
878 
879  if( i<iblock )
880  {
881  SIMDType xmm1, xmm2;
882 
883  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
884  {
885  const SIMDType a1( A2.load(i,k) );
886 
887  xmm1 += a1 * B2.load(k,j );
888  xmm2 += a1 * B2.load(k,j+1UL);
889  }
890 
891  (~C)(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
892  (~C)(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
893  }
894  }
895 
        // Last single column: 2 x 1 tiles, then a final 1 x 1 tile.
896  if( j<jsize )
897  {
898  size_t i( 0UL );
899 
900  for( ; (i+2UL) <= iblock; i+=2UL )
901  {
902  SIMDType xmm1, xmm2;
903 
904  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
905  {
906  const SIMDType b1( B2.load(k,j) );
907 
908  xmm1 += A2.load(i ,k) * b1;
909  xmm2 += A2.load(i+1UL,k) * b1;
910  }
911 
912  (~C)(ii+i ,jbegin+j) += sum( xmm1 ) * alpha;
913  (~C)(ii+i+1UL,jbegin+j) += sum( xmm2 ) * alpha;
914  }
915 
916  if( i<iblock )
917  {
918  SIMDType xmm1;
919 
920  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
921  {
922  xmm1 += A2.load(i,k) * B2.load(k,j);
923  }
924 
925  (~C)(ii+i,jbegin+j) += sum( xmm1 ) * alpha;
926  }
927  }
928 
929  ii += iblock;
930  }
931 
932  kk += kblock;
933  }
934 
    // Scalar tail: the vectorized loop stops once fewer than SIMDSIZE entries of the k range
    // remain, so ksize = K-kk < SIMDSIZE here. The same tiling is used, but with element access
    // A2(i,k) / B2(k,j) and direct accumulation into C instead of SIMD loads.
935  if( remainder && kk < K )
936  {
937  const size_t ksize( K - kk );
938 
        // Only the upper-B restriction narrows the column range here; columns run up to N.
939  const size_t jbegin( IsUpper_v<MT3> ? kk : 0UL );
940  const size_t jsize ( N - jbegin );
941 
942  B2 = serial( submatrix( B, kk, jbegin, ksize, jsize, unchecked ) );
943 
944  size_t ii( 0UL );
945  size_t iblock( 0UL );
946 
947  while( ii < M )
948  {
949  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
950 
951  if( IsLower_v<MT2> && ii+iblock <= kk ) {
952  ii += iblock;
953  continue;
954  }
955 
956  A2 = serial( submatrix( A, ii, kk, iblock, ksize, unchecked ) );
957 
958  size_t j( 0UL );
959 
960  if( IsFloatingPoint_v<ET1> )
961  {
962  for( ; (j+5UL) <= jsize; j+=5UL )
963  {
964  size_t i( 0UL );
965 
966  for( ; (i+2UL) <= iblock; i+=2UL ) {
967  for( size_t k=0UL; k<ksize; ++k ) {
968  (~C)(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
969  (~C)(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
970  (~C)(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
971  (~C)(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
972  (~C)(ii+i ,jbegin+j+4UL) += A2(i ,k) * B2(k,j+4UL) * alpha;
973  (~C)(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
974  (~C)(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
975  (~C)(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
976  (~C)(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
977  (~C)(ii+i+1UL,jbegin+j+4UL) += A2(i+1UL,k) * B2(k,j+4UL) * alpha;
978  }
979  }
980 
981  if( i<iblock ) {
982  for( size_t k=0UL; k<ksize; ++k ) {
983  (~C)(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
984  (~C)(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
985  (~C)(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
986  (~C)(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
987  (~C)(ii+i,jbegin+j+4UL) += A2(i,k) * B2(k,j+4UL) * alpha;
988  }
989  }
990  }
991  }
992  else
993  {
994  for( ; (j+4UL) <= jsize; j+=4UL )
995  {
996  size_t i( 0UL );
997 
998  for( ; (i+2UL) <= iblock; i+=2UL ) {
999  for( size_t k=0UL; k<ksize; ++k ) {
1000  (~C)(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
1001  (~C)(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1002  (~C)(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
1003  (~C)(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
1004  (~C)(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1005  (~C)(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1006  (~C)(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
1007  (~C)(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
1008  }
1009  }
1010 
1011  if( i<iblock ) {
1012  for( size_t k=0UL; k<ksize; ++k ) {
1013  (~C)(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
1014  (~C)(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
1015  (~C)(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
1016  (~C)(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
1017  }
1018  }
1019  }
1020  }
1021 
1022  for( ; (j+2UL) <= jsize; j+=2UL )
1023  {
1024  size_t i( 0UL );
1025 
1026  for( ; (i+2UL) <= iblock; i+=2UL ) {
1027  for( size_t k=0UL; k<ksize; ++k ) {
1028  (~C)(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
1029  (~C)(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1030  (~C)(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1031  (~C)(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1032  }
1033  }
1034 
1035  if( i<iblock ) {
1036  for( size_t k=0UL; k<ksize; ++k ) {
1037  (~C)(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
1038  (~C)(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
1039  }
1040  }
1041  }
1042 
1043  if( j<jsize )
1044  {
1045  size_t i( 0UL );
1046 
1047  for( ; (i+2UL) <= iblock; i+=2UL ) {
1048  for( size_t k=0UL; k<ksize; ++k ) {
1049  (~C)(ii+i ,jbegin+j) += A2(i ,k) * B2(k,j) * alpha;
1050  (~C)(ii+i+1UL,jbegin+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1051  }
1052  }
1053 
1054  if( i<iblock ) {
1055  for( size_t k=0UL; k<ksize; ++k ) {
1056  (~C)(ii+i,jbegin+j) += A2(i,k) * B2(k,j) * alpha;
1057  }
1058  }
1059  }
1060 
1061  ii += iblock;
1062  }
1063  }
1064 }
1066 //*************************************************************************************************
1067 
1068 
1069 //*************************************************************************************************
/*!\brief Convenience overload that forwards to the five-argument \c mmm() with fixed scaling factors.
//
// \param C The target matrix, passed through unchanged.
// \param A The left-hand side operand, passed through unchanged.
// \param B The right-hand side operand, passed through unchanged.
// \return void
//
// The call passes \c ET1(1) as \a alpha and \c ET1(0) as \a beta, both converted to the element
// type of \a C.
*/
1085 template< typename MT1, typename MT2, typename MT3 >
1086 inline void mmm( MT1& C, const MT2& A, const MT3& B )
1087 {
1088  using ET1 = ElementType_t<MT1>;
    // NOTE(review): ET2 and ET3 are not referenced in the lines visible here. This listing skips
    // source lines 1092-1093, which may use them - confirm before removing the aliases.
1089  using ET2 = ElementType_t<MT2>;
1090  using ET3 = ElementType_t<MT3>;
1091 
1094 
1095  mmm( C, A, B, ET1(1), ET1(0) );
1096 }
1098 //*************************************************************************************************
1099 
1100 
1101 
1102 
1103 //=================================================================================================
1104 //
1105 // LOWER DENSE MATRIX MULTIPLICATION KERNELS
1106 //
1107 //=================================================================================================
1108 
1109 //*************************************************************************************************
1128 template< typename MT1, typename MT2, typename MT3, typename ST >
1129 void lmmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
1130 {
1131  using ET1 = ElementType_t<MT1>;
1132  using ET2 = ElementType_t<MT2>;
1133  using ET3 = ElementType_t<MT3>;
1134  using SIMDType = SIMDTrait_t<ET1>;
1135 
1144 
1147 
1150 
1153 
1154  constexpr size_t SIMDSIZE( SIMDTrait<ET1>::size );
1155 
1156  constexpr bool remainder( !IsPadded_v<MT2> || !IsPadded_v<MT3> );
1157 
1158  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
1159  constexpr size_t JBLOCK( MMM_INNER_BLOCK_SIZE );
1160 
1161  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
1162  BLAZE_STATIC_ASSERT( JBLOCK >= SIMDSIZE && JBLOCK % SIMDSIZE == 0UL );
1163 
1164  const size_t M( A.rows() );
1165  const size_t N( B.columns() );
1166  const size_t K( A.columns() );
1167 
1168  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
1169 
1170  DynamicMatrix<ET2,false> A2( M, KBLOCK );
1171  DynamicMatrix<ET3,true> B2( KBLOCK, JBLOCK );
1172 
1173  decltype(auto) c( derestrict( ~C ) );
1174 
1175  if( isDefault( beta ) ) {
1176  reset( c );
1177  }
1178  else if( !isOne( beta ) ) {
1179  c *= beta;
1180  }
1181 
1182  size_t kk( 0UL );
1183  size_t kblock( 0UL );
1184 
1185  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
1186  {
1187  if( remainder ) {
1188  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
1189  }
1190  else {
1191  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
1192  }
1193 
1194  const size_t ibegin( IsLower_v<MT2> ? kk : 0UL );
1195  const size_t iend ( IsUpper_v<MT2> ? kk+kblock : M );
1196  const size_t isize ( iend - ibegin );
1197 
1198  A2 = serial( submatrix< remainder ? unaligned : aligned >( A, ibegin, kk, isize, kblock, unchecked ) );
1199 
1200  size_t jj( 0UL );
1201  size_t jblock( 0UL );
1202 
1203  while( jj < N )
1204  {
1205  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
1206 
1207  if( ( IsLower_v<MT3> && kk+kblock <= jj ) ||
1208  ( IsUpper_v<MT3> && jj+jblock <= kk ) ) {
1209  jj += jblock;
1210  continue;
1211  }
1212 
1213  B2 = serial( submatrix< remainder ? unaligned : aligned >( B, kk, jj, kblock, jblock, unchecked ) );
1214 
1215  size_t i( 0UL );
1216 
1217  if( IsFloatingPoint_v<ET1> )
1218  {
1219  for( ; (i+5UL) <= isize; i+=5UL )
1220  {
1221  if( jj > ibegin+i+4UL ) continue;
1222 
1223  const size_t jend( min( ibegin+i-jj+5UL, jblock ) );
1224  size_t j( 0UL );
1225 
1226  for( ; (j+2UL) <= jend; j+=2UL )
1227  {
1228  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1229 
1230  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1231  {
1232  const SIMDType a1( A2.load(i ,k) );
1233  const SIMDType a2( A2.load(i+1UL,k) );
1234  const SIMDType a3( A2.load(i+2UL,k) );
1235  const SIMDType a4( A2.load(i+3UL,k) );
1236  const SIMDType a5( A2.load(i+4UL,k) );
1237 
1238  const SIMDType b1( B2.load(k,j ) );
1239  const SIMDType b2( B2.load(k,j+1UL) );
1240 
1241  xmm1 += a1 * b1;
1242  xmm2 += a1 * b2;
1243  xmm3 += a2 * b1;
1244  xmm4 += a2 * b2;
1245  xmm5 += a3 * b1;
1246  xmm6 += a3 * b2;
1247  xmm7 += a4 * b1;
1248  xmm8 += a4 * b2;
1249  xmm9 += a5 * b1;
1250  xmm10 += a5 * b2;
1251  }
1252 
1253  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1254  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1255  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
1256  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
1257  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
1258  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
1259  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
1260  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
1261  c(ibegin+i+4UL,jj+j ) += sum( xmm9 ) * alpha;
1262  c(ibegin+i+4UL,jj+j+1UL) += sum( xmm10 ) * alpha;
1263  }
1264 
1265  if( j<jend )
1266  {
1267  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1268 
1269  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1270  {
1271  const SIMDType a1( A2.load(i ,k) );
1272  const SIMDType a2( A2.load(i+1UL,k) );
1273  const SIMDType a3( A2.load(i+2UL,k) );
1274  const SIMDType a4( A2.load(i+3UL,k) );
1275  const SIMDType a5( A2.load(i+4UL,k) );
1276 
1277  const SIMDType b1( B2.load(k,j) );
1278 
1279  xmm1 += a1 * b1;
1280  xmm2 += a2 * b1;
1281  xmm3 += a3 * b1;
1282  xmm4 += a4 * b1;
1283  xmm5 += a5 * b1;
1284  }
1285 
1286  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
1287  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
1288  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
1289  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
1290  c(ibegin+i+4UL,jj+j) += sum( xmm5 ) * alpha;
1291  }
1292  }
1293  }
1294  else
1295  {
1296  for( ; (i+4UL) <= isize; i+=4UL )
1297  {
1298  if( jj > ibegin+i+3UL ) continue;
1299 
1300  const size_t jend( min( ibegin+i-jj+4UL, jblock ) );
1301  size_t j( 0UL );
1302 
1303  for( ; (j+2UL) <= jend; j+=2UL )
1304  {
1305  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1306 
1307  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1308  {
1309  const SIMDType a1( A2.load(i ,k) );
1310  const SIMDType a2( A2.load(i+1UL,k) );
1311  const SIMDType a3( A2.load(i+2UL,k) );
1312  const SIMDType a4( A2.load(i+3UL,k) );
1313 
1314  const SIMDType b1( B2.load(k,j ) );
1315  const SIMDType b2( B2.load(k,j+1UL) );
1316 
1317  xmm1 += a1 * b1;
1318  xmm2 += a1 * b2;
1319  xmm3 += a2 * b1;
1320  xmm4 += a2 * b2;
1321  xmm5 += a3 * b1;
1322  xmm6 += a3 * b2;
1323  xmm7 += a4 * b1;
1324  xmm8 += a4 * b2;
1325  }
1326 
1327  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1328  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1329  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
1330  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
1331  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
1332  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
1333  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
1334  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
1335  }
1336 
1337  if( j<jend )
1338  {
1339  SIMDType xmm1, xmm2, xmm3, xmm4;
1340 
1341  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1342  {
1343  const SIMDType a1( A2.load(i ,k) );
1344  const SIMDType a2( A2.load(i+1UL,k) );
1345  const SIMDType a3( A2.load(i+2UL,k) );
1346  const SIMDType a4( A2.load(i+3UL,k) );
1347 
1348  const SIMDType b1( B2.load(k,j) );
1349 
1350  xmm1 += a1 * b1;
1351  xmm2 += a2 * b1;
1352  xmm3 += a3 * b1;
1353  xmm4 += a4 * b1;
1354  }
1355 
1356  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
1357  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
1358  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
1359  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
1360  }
1361  }
1362  }
1363 
1364  for( ; (i+2UL) <= isize; i+=2UL )
1365  {
1366  if( jj > ibegin+i+1UL ) continue;
1367 
1368  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1369  size_t j( 0UL );
1370 
1371  for( ; (j+4UL) <= jend; j+=4UL )
1372  {
1373  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1374 
1375  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1376  {
1377  const SIMDType a1( A2.load(i ,k) );
1378  const SIMDType a2( A2.load(i+1UL,k) );
1379 
1380  const SIMDType b1( B2.load(k,j ) );
1381  const SIMDType b2( B2.load(k,j+1UL) );
1382  const SIMDType b3( B2.load(k,j+2UL) );
1383  const SIMDType b4( B2.load(k,j+3UL) );
1384 
1385  xmm1 += a1 * b1;
1386  xmm2 += a1 * b2;
1387  xmm3 += a1 * b3;
1388  xmm4 += a1 * b4;
1389  xmm5 += a2 * b1;
1390  xmm6 += a2 * b2;
1391  xmm7 += a2 * b3;
1392  xmm8 += a2 * b4;
1393  }
1394 
1395  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1396  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1397  c(ibegin+i ,jj+j+2UL) += sum( xmm3 ) * alpha;
1398  c(ibegin+i ,jj+j+3UL) += sum( xmm4 ) * alpha;
1399  c(ibegin+i+1UL,jj+j ) += sum( xmm5 ) * alpha;
1400  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm6 ) * alpha;
1401  c(ibegin+i+1UL,jj+j+2UL) += sum( xmm7 ) * alpha;
1402  c(ibegin+i+1UL,jj+j+3UL) += sum( xmm8 ) * alpha;
1403  }
1404 
1405  for( ; (j+2UL) <= jend; j+=2UL )
1406  {
1407  SIMDType xmm1, xmm2, xmm3, xmm4;
1408 
1409  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1410  {
1411  const SIMDType a1( A2.load(i ,k) );
1412  const SIMDType a2( A2.load(i+1UL,k) );
1413 
1414  const SIMDType b1( B2.load(k,j ) );
1415  const SIMDType b2( B2.load(k,j+1UL) );
1416 
1417  xmm1 += a1 * b1;
1418  xmm2 += a1 * b2;
1419  xmm3 += a2 * b1;
1420  xmm4 += a2 * b2;
1421  }
1422 
1423  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
1424  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
1425  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
1426  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
1427  }
1428 
1429  if( j<jend )
1430  {
1431  SIMDType xmm1, xmm2;
1432 
1433  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1434  {
1435  const SIMDType a1( A2.load(i ,k) );
1436  const SIMDType a2( A2.load(i+1UL,k) );
1437 
1438  const SIMDType b1( B2.load(k,j) );
1439 
1440  xmm1 += a1 * b1;
1441  xmm2 += a2 * b1;
1442  }
1443 
1444  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
1445  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
1446  }
1447  }
1448 
1449  if( i<isize && jj <= ibegin+i )
1450  {
1451  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1452  size_t j( 0UL );
1453 
1454  for( ; (j+2UL) <= jend; j+=2UL )
1455  {
1456  SIMDType xmm1, xmm2;
1457 
1458  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1459  {
1460  const SIMDType a1( A2.load(i,k) );
1461 
1462  xmm1 += a1 * B2.load(k,j );
1463  xmm2 += a1 * B2.load(k,j+1UL);
1464  }
1465 
1466  c(ibegin+i,jj+j ) += sum( xmm1 ) * alpha;
1467  c(ibegin+i,jj+j+1UL) += sum( xmm2 ) * alpha;
1468  }
1469 
1470  if( j<jend )
1471  {
1472  SIMDType xmm1;
1473 
1474  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1475  {
1476  const SIMDType a1( A2.load(i,k) );
1477 
1478  xmm1 += a1 * B2.load(k,j);
1479  }
1480 
1481  c(ibegin+i,jj+j) += sum( xmm1 ) * alpha;
1482  }
1483  }
1484 
1485  jj += jblock;
1486  }
1487 
1488  kk += kblock;
1489  }
1490 
1491  if( remainder && kk < K )
1492  {
1493  const size_t ksize( K - kk );
1494 
1495  const size_t ibegin( IsLower_v<MT2> ? kk : 0UL );
1496  const size_t isize ( M - ibegin );
1497 
1498  A2 = serial( submatrix( A, ibegin, kk, isize, ksize, unchecked ) );
1499 
1500  size_t jj( 0UL );
1501  size_t jblock( 0UL );
1502 
1503  while( jj < N )
1504  {
1505  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
1506 
1507  if( IsUpper_v<MT3> && jj+jblock <= kk ) {
1508  jj += jblock;
1509  continue;
1510  }
1511 
1512  B2 = serial( submatrix( B, kk, jj, ksize, jblock, unchecked ) );
1513 
1514  size_t i( 0UL );
1515 
1516  if( IsFloatingPoint_v<ET1> )
1517  {
1518  for( ; (i+5UL) <= isize; i+=5UL )
1519  {
1520  if( jj > ibegin+i+4UL ) continue;
1521 
1522  const size_t jend( min( ibegin+i-jj+5UL, jblock ) );
1523  size_t j( 0UL );
1524 
1525  for( ; (j+2UL) <= jend; j+=2UL ) {
1526  for( size_t k=0UL; k<ksize; ++k ) {
1527  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
1528  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1529  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1530  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1531  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
1532  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
1533  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
1534  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
1535  c(ibegin+i+4UL,jj+j ) += A2(i+4UL,k) * B2(k,j ) * alpha;
1536  c(ibegin+i+4UL,jj+j+1UL) += A2(i+4UL,k) * B2(k,j+1UL) * alpha;
1537  }
1538  }
1539 
1540  if( j<jend ) {
1541  for( size_t k=0UL; k<ksize; ++k ) {
1542  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
1543  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1544  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
1545  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
1546  c(ibegin+i+4UL,jj+j) += A2(i+4UL,k) * B2(k,j) * alpha;
1547  }
1548  }
1549  }
1550  }
1551  else
1552  {
1553  for( ; (i+4UL) <= isize; i+=4UL )
1554  {
1555  if( jj > ibegin+i+3UL ) continue;
1556 
1557  const size_t jend( min( ibegin+i-jj+4UL, jblock ) );
1558  size_t j( 0UL );
1559 
1560  for( ; (j+2UL) <= jend; j+=2UL ) {
1561  for( size_t k=0UL; k<ksize; ++k ) {
1562  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
1563  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1564  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1565  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1566  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
1567  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
1568  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
1569  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
1570  }
1571  }
1572 
1573  if( j<jend ) {
1574  for( size_t k=0UL; k<ksize; ++k ) {
1575  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
1576  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1577  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
1578  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
1579  }
1580  }
1581  }
1582  }
1583 
1584  for( ; (i+2UL) <= isize; i+=2UL )
1585  {
1586  if( jj > ibegin+i+1UL ) continue;
1587 
1588  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1589  size_t j( 0UL );
1590 
1591  for( ; (j+2UL) <= jend; j+=2UL ) {
1592  for( size_t k=0UL; k<ksize; ++k ) {
1593  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
1594  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
1595  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
1596  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
1597  }
1598  }
1599 
1600  if( j<jend ) {
1601  for( size_t k=0UL; k<ksize; ++k ) {
1602  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
1603  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
1604  }
1605  }
1606  }
1607 
1608  if( i<isize && jj <= ibegin+i )
1609  {
1610  const size_t jend( min( ibegin+i-jj+2UL, jblock ) );
1611  size_t j( 0UL );
1612 
1613  for( ; (j+2UL) <= jend; j+=2UL ) {
1614  for( size_t k=0UL; k<ksize; ++k ) {
1615  c(ibegin+i,jj+j ) += A2(i,k) * B2(k,j ) * alpha;
1616  c(ibegin+i,jj+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
1617  }
1618  }
1619 
1620  if( j<jend ) {
1621  for( size_t k=0UL; k<ksize; ++k ) {
1622  c(ibegin+i,jj+j) += A2(i,k) * B2(k,j) * alpha;
1623  }
1624  }
1625  }
1626 
1627  jj += jblock;
1628  }
1629  }
1630 }
1632 //*************************************************************************************************
1633 
1634 
1635 //*************************************************************************************************
1654 template< typename MT1, typename MT2, typename MT3, typename ST >
1655 void lmmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
1656 {
1657  using ET1 = ElementType_t<MT1>;
1658  using ET2 = ElementType_t<MT2>;
1659  using ET3 = ElementType_t<MT3>;
1660  using SIMDType = SIMDTrait_t<ET1>;
1661 
1670 
1673 
1676 
1679 
1680  constexpr size_t SIMDSIZE( SIMDTrait<ET1>::size );
1681 
1682  constexpr bool remainder( !IsPadded_v<MT2> || !IsPadded_v<MT3> );
1683 
1684  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
1685  constexpr size_t IBLOCK( MMM_INNER_BLOCK_SIZE );
1686 
1687  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
1688  BLAZE_STATIC_ASSERT( IBLOCK >= SIMDSIZE && IBLOCK % SIMDSIZE == 0UL );
1689 
1690  const size_t M( A.rows() );
1691  const size_t N( B.columns() );
1692  const size_t K( A.columns() );
1693 
1694  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
1695 
1696  DynamicMatrix<ET2,false> A2( IBLOCK, KBLOCK );
1697  DynamicMatrix<ET3,true> B2( KBLOCK, N );
1698 
1699  decltype(auto) c( derestrict( ~C ) );
1700 
1701  if( isDefault( beta ) ) {
1702  reset( c );
1703  }
1704  else if( !isOne( beta ) ) {
1705  c *= beta;
1706  }
1707 
1708  size_t kk( 0UL );
1709  size_t kblock( 0UL );
1710 
1711  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
1712  {
1713  if( remainder ) {
1714  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
1715  }
1716  else {
1717  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
1718  }
1719 
1720  const size_t jbegin( IsUpper_v<MT3> ? kk : 0UL );
1721  const size_t jend ( IsLower_v<MT3> ? kk+kblock : N );
1722  const size_t jsize ( jend - jbegin );
1723 
1724  B2 = serial( submatrix< remainder ? unaligned : aligned >( B, kk, jbegin, kblock, jsize, unchecked ) );
1725 
1726  size_t ii( 0UL );
1727  size_t iblock( 0UL );
1728 
1729  while( ii < M )
1730  {
1731  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
1732 
1733  if( ( IsLower_v<MT2> && ii+iblock <= kk ) ||
1734  ( IsUpper_v<MT2> && kk+kblock <= ii ) ) {
1735  ii += iblock;
1736  continue;
1737  }
1738 
1739  A2 = serial( submatrix< remainder ? unaligned : aligned >( A, ii, kk, iblock, kblock, unchecked ) );
1740 
1741  size_t j( 0UL );
1742 
1743  if( IsFloatingPoint_v<ET3> )
1744  {
1745  for( ; (j+5UL) <= jsize; j+=5UL )
1746  {
1747  if( ii+iblock < jbegin ) continue;
1748 
1749  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1750 
1751  for( ; (i+2UL) <= iblock; i+=2UL )
1752  {
1753  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1754 
1755  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1756  {
1757  const SIMDType a1( A2.load(i ,k) );
1758  const SIMDType a2( A2.load(i+1UL,k) );
1759 
1760  const SIMDType b1( B2.load(k,j ) );
1761  const SIMDType b2( B2.load(k,j+1UL) );
1762  const SIMDType b3( B2.load(k,j+2UL) );
1763  const SIMDType b4( B2.load(k,j+3UL) );
1764  const SIMDType b5( B2.load(k,j+4UL) );
1765 
1766  xmm1 += a1 * b1;
1767  xmm2 += a1 * b2;
1768  xmm3 += a1 * b3;
1769  xmm4 += a1 * b4;
1770  xmm5 += a1 * b5;
1771  xmm6 += a2 * b1;
1772  xmm7 += a2 * b2;
1773  xmm8 += a2 * b3;
1774  xmm9 += a2 * b4;
1775  xmm10 += a2 * b5;
1776  }
1777 
1778  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1779  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1780  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1781  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1782  c(ii+i ,jbegin+j+4UL) += sum( xmm5 ) * alpha;
1783  c(ii+i+1UL,jbegin+j ) += sum( xmm6 ) * alpha;
1784  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm7 ) * alpha;
1785  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm8 ) * alpha;
1786  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm9 ) * alpha;
1787  c(ii+i+1UL,jbegin+j+4UL) += sum( xmm10 ) * alpha;
1788  }
1789 
1790  if( i<iblock )
1791  {
1792  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1793 
1794  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1795  {
1796  const SIMDType a1( A2.load(i,k) );
1797 
1798  xmm1 += a1 * B2.load(k,j );
1799  xmm2 += a1 * B2.load(k,j+1UL);
1800  xmm3 += a1 * B2.load(k,j+2UL);
1801  xmm4 += a1 * B2.load(k,j+3UL);
1802  xmm5 += a1 * B2.load(k,j+4UL);
1803  }
1804 
1805  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
1806  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1807  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1808  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1809  c(ii+i,jbegin+j+4UL) += sum( xmm5 ) * alpha;
1810  }
1811  }
1812  }
1813  else
1814  {
1815  for( ; (j+4UL) <= jsize; j+=4UL )
1816  {
1817  if( ii+iblock < jbegin ) continue;
1818 
1819  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1820 
1821  for( ; (i+2UL) <= iblock; i+=2UL )
1822  {
1823  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1824 
1825  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1826  {
1827  const SIMDType a1( A2.load(i ,k) );
1828  const SIMDType a2( A2.load(i+1UL,k) );
1829 
1830  const SIMDType b1( B2.load(k,j ) );
1831  const SIMDType b2( B2.load(k,j+1UL) );
1832  const SIMDType b3( B2.load(k,j+2UL) );
1833  const SIMDType b4( B2.load(k,j+3UL) );
1834 
1835  xmm1 += a1 * b1;
1836  xmm2 += a1 * b2;
1837  xmm3 += a1 * b3;
1838  xmm4 += a1 * b4;
1839  xmm5 += a2 * b1;
1840  xmm6 += a2 * b2;
1841  xmm7 += a2 * b3;
1842  xmm8 += a2 * b4;
1843  }
1844 
1845  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1846  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1847  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1848  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1849  c(ii+i+1UL,jbegin+j ) += sum( xmm5 ) * alpha;
1850  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
1851  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm7 ) * alpha;
1852  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm8 ) * alpha;
1853  }
1854 
1855  if( i<iblock )
1856  {
1857  SIMDType xmm1, xmm2, xmm3, xmm4;
1858 
1859  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1860  {
1861  const SIMDType a1( A2.load(i,k) );
1862 
1863  xmm1 += a1 * B2.load(k,j );
1864  xmm2 += a1 * B2.load(k,j+1UL);
1865  xmm3 += a1 * B2.load(k,j+2UL);
1866  xmm4 += a1 * B2.load(k,j+3UL);
1867  }
1868 
1869  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
1870  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1871  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
1872  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
1873  }
1874  }
1875  }
1876 
1877  for( ; (j+2UL) <= jsize; j+=2UL )
1878  {
1879  if( ii+iblock < jbegin ) continue;
1880 
1881  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1882 
1883  for( ; (i+4UL) <= iblock; i+=4UL )
1884  {
1885  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1886 
1887  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1888  {
1889  const SIMDType a1( A2.load(i ,k) );
1890  const SIMDType a2( A2.load(i+1UL,k) );
1891  const SIMDType a3( A2.load(i+2UL,k) );
1892  const SIMDType a4( A2.load(i+3UL,k) );
1893 
1894  const SIMDType b1( B2.load(k,j ) );
1895  const SIMDType b2( B2.load(k,j+1UL) );
1896 
1897  xmm1 += a1 * b1;
1898  xmm2 += a1 * b2;
1899  xmm3 += a2 * b1;
1900  xmm4 += a2 * b2;
1901  xmm5 += a3 * b1;
1902  xmm6 += a3 * b2;
1903  xmm7 += a4 * b1;
1904  xmm8 += a4 * b2;
1905  }
1906 
1907  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1908  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1909  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
1910  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
1911  c(ii+i+2UL,jbegin+j ) += sum( xmm5 ) * alpha;
1912  c(ii+i+2UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
1913  c(ii+i+3UL,jbegin+j ) += sum( xmm7 ) * alpha;
1914  c(ii+i+3UL,jbegin+j+1UL) += sum( xmm8 ) * alpha;
1915  }
1916 
1917  for( ; (i+2UL) <= iblock; i+=2UL )
1918  {
1919  SIMDType xmm1, xmm2, xmm3, xmm4;
1920 
1921  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1922  {
1923  const SIMDType a1( A2.load(i ,k) );
1924  const SIMDType a2( A2.load(i+1UL,k) );
1925 
1926  const SIMDType b1( B2.load(k,j ) );
1927  const SIMDType b2( B2.load(k,j+1UL) );
1928 
1929  xmm1 += a1 * b1;
1930  xmm2 += a1 * b2;
1931  xmm3 += a2 * b1;
1932  xmm4 += a2 * b2;
1933  }
1934 
1935  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
1936  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1937  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
1938  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
1939  }
1940 
1941  if( i<iblock )
1942  {
1943  SIMDType xmm1, xmm2;
1944 
1945  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1946  {
1947  const SIMDType a1( A2.load(i,k) );
1948 
1949  xmm1 += a1 * B2.load(k,j );
1950  xmm2 += a1 * B2.load(k,j+1UL);
1951  }
1952 
1953  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
1954  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
1955  }
1956  }
1957 
1958  if( j<jsize && ii+iblock >= jbegin )
1959  {
1960  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
1961 
1962  for( ; (i+2UL) <= iblock; i+=2UL )
1963  {
1964  SIMDType xmm1, xmm2;
1965 
1966  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1967  {
1968  const SIMDType b1( B2.load(k,j) );
1969 
1970  xmm1 += A2.load(i ,k) * b1;
1971  xmm2 += A2.load(i+1UL,k) * b1;
1972  }
1973 
1974  c(ii+i ,jbegin+j) += sum( xmm1 ) * alpha;
1975  c(ii+i+1UL,jbegin+j) += sum( xmm2 ) * alpha;
1976  }
1977 
1978  if( i<iblock )
1979  {
1980  SIMDType xmm1;
1981 
1982  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
1983  {
1984  xmm1 += A2.load(i,k) * B2.load(k,j);
1985  }
1986 
1987  c(ii+i,jbegin+j) += sum( xmm1 ) * alpha;
1988  }
1989  }
1990 
1991  ii += iblock;
1992  }
1993 
1994  kk += kblock;
1995  }
1996 
1997  if( remainder && kk < K )
1998  {
1999  const size_t ksize( K - kk );
2000 
2001  const size_t jbegin( IsUpper_v<MT3> ? kk : 0UL );
2002  const size_t jsize ( N - jbegin );
2003 
2004  B2 = serial( submatrix( B, kk, jbegin, ksize, jsize, unchecked ) );
2005 
2006  size_t ii( 0UL );
2007  size_t iblock( 0UL );
2008 
2009  while( ii < M )
2010  {
2011  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
2012 
2013  if( IsLower_v<MT2> && ii+iblock <= kk ) {
2014  ii += iblock;
2015  continue;
2016  }
2017 
2018  A2 = serial( submatrix( A, ii, kk, iblock, ksize, unchecked ) );
2019 
2020  size_t j( 0UL );
2021 
2022  if( IsFloatingPoint_v<ET1> )
2023  {
2024  for( ; (j+5UL) <= jsize; j+=5UL )
2025  {
2026  if( ii+iblock < jbegin ) continue;
2027 
2028  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2029 
2030  for( ; (i+2UL) <= iblock; i+=2UL ) {
2031  for( size_t k=0UL; k<ksize; ++k ) {
2032  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
2033  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2034  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
2035  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
2036  c(ii+i ,jbegin+j+4UL) += A2(i ,k) * B2(k,j+4UL) * alpha;
2037  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2038  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2039  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
2040  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
2041  c(ii+i+1UL,jbegin+j+4UL) += A2(i+1UL,k) * B2(k,j+4UL) * alpha;
2042  }
2043  }
2044 
2045  if( i<iblock ) {
2046  for( size_t k=0UL; k<ksize; ++k ) {
2047  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
2048  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2049  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
2050  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
2051  c(ii+i,jbegin+j+4UL) += A2(i,k) * B2(k,j+4UL) * alpha;
2052  }
2053  }
2054  }
2055  }
2056  else
2057  {
2058  for( ; (j+4UL) <= jsize; j+=4UL )
2059  {
2060  if( ii+iblock < jbegin ) continue;
2061 
2062  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2063 
2064  for( ; (i+2UL) <= iblock; i+=2UL ) {
2065  for( size_t k=0UL; k<ksize; ++k ) {
2066  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
2067  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2068  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
2069  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
2070  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2071  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2072  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
2073  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
2074  }
2075  }
2076 
2077  if( i<iblock ) {
2078  for( size_t k=0UL; k<ksize; ++k ) {
2079  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
2080  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2081  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
2082  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
2083  }
2084  }
2085  }
2086  }
2087 
2088  for( ; (j+2UL) <= jsize; j+=2UL )
2089  {
2090  if( ii+iblock < jbegin ) continue;
2091 
2092  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2093 
2094  for( ; (i+2UL) <= iblock; i+=2UL ) {
2095  for( size_t k=0UL; k<ksize; ++k ) {
2096  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
2097  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2098  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2099  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2100  }
2101  }
2102 
2103  if( i<iblock ) {
2104  for( size_t k=0UL; k<ksize; ++k ) {
2105  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
2106  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2107  }
2108  }
2109  }
2110 
2111  if( j<jsize )
2112  {
2113  if( ii+iblock < jbegin ) continue;
2114 
2115  size_t i( ( ii > jbegin+j )?( 0UL ):( jbegin+j-ii ) );
2116 
2117  for( ; (i+2UL) <= iblock; i+=2UL ) {
2118  for( size_t k=0UL; k<ksize; ++k ) {
2119  c(ii+i ,jbegin+j) += A2(i ,k) * B2(k,j) * alpha;
2120  c(ii+i+1UL,jbegin+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2121  }
2122  }
2123 
2124  if( i<iblock ) {
2125  for( size_t k=0UL; k<ksize; ++k ) {
2126  c(ii+i,jbegin+j) += A2(i,k) * B2(k,j) * alpha;
2127  }
2128  }
2129  }
2130 
2131  ii += iblock;
2132  }
2133  }
2134 }
2136 //*************************************************************************************************
2137 
2138 
2139 //*************************************************************************************************
// Convenience overload of lmmm() without explicit scaling factors.
//
// Forwards C, A and B to a five-argument lmmm() overload, passing ET1(1) and ET1(0) as the
// two trailing scalar arguments, where ET1 is the element type of the target matrix C.
2155 template< typename MT1, typename MT2, typename MT3 >
2156 inline void lmmm( MT1& C, const MT2& A, const MT3& B )
2157 {
2158  using ET1 = ElementType_t<MT1>;
2159  using ET2 = ElementType_t<MT2>;  // not referenced in the visible body; presumably used by lines elided from this listing - TODO confirm
2160  using ET3 = ElementType_t<MT3>;  // not referenced in the visible body; presumably used by lines elided from this listing - TODO confirm
2161 
2164 
2165  lmmm( C, A, B, ET1(1), ET1(0) );
2166 }
2168 //*************************************************************************************************
2169 
2170 
2171 
2172 
2173 //=================================================================================================
2174 //
2175 // UPPER DENSE MATRIX MULTIPLICATION KERNELS
2176 //
2177 //=================================================================================================
2178 
2179 //*************************************************************************************************
2198 template< typename MT1, typename MT2, typename MT3, typename ST >
2199 void ummm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
2200 {
2201  using ET1 = ElementType_t<MT1>;
2202  using ET2 = ElementType_t<MT2>;
2203  using ET3 = ElementType_t<MT3>;
2204  using SIMDType = SIMDTrait_t<ET1>;
2205 
2214 
2217 
2220 
2223 
2224  constexpr size_t SIMDSIZE( SIMDTrait<ET1>::size );
2225 
2226  constexpr bool remainder( !IsPadded_v<MT2> || !IsPadded_v<MT3> );
2227 
2228  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
2229  constexpr size_t JBLOCK( MMM_INNER_BLOCK_SIZE );
2230 
2231  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
2232  BLAZE_STATIC_ASSERT( JBLOCK >= SIMDSIZE && JBLOCK % SIMDSIZE == 0UL );
2233 
2234  const size_t M( A.rows() );
2235  const size_t N( B.columns() );
2236  const size_t K( A.columns() );
2237 
2238  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
2239 
2240  DynamicMatrix<ET2,false> A2( M, KBLOCK );
2241  DynamicMatrix<ET3,true> B2( KBLOCK, JBLOCK );
2242 
2243  decltype(auto) c( derestrict( ~C ) );
2244 
2245  if( isDefault( beta ) ) {
2246  reset( c );
2247  }
2248  else if( !isOne( beta ) ) {
2249  c *= beta;
2250  }
2251 
2252  size_t kk( 0UL );
2253  size_t kblock( 0UL );
2254 
2255  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
2256  {
2257  if( remainder ) {
2258  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
2259  }
2260  else {
2261  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
2262  }
2263 
2264  const size_t ibegin( IsLower_v<MT2> ? kk : 0UL );
2265  const size_t iend ( IsUpper_v<MT2> ? kk+kblock : M );
2266  const size_t isize ( iend - ibegin );
2267 
2268  A2 = serial( submatrix< remainder ? unaligned : aligned >( A, ibegin, kk, isize, kblock, unchecked ) );
2269 
2270  size_t jj( 0UL );
2271  size_t jblock( 0UL );
2272 
2273  while( jj < N )
2274  {
2275  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
2276 
2277  if( ( IsLower_v<MT3> && kk+kblock <= jj ) ||
2278  ( IsUpper_v<MT3> && jj+jblock <= kk ) ) {
2279  jj += jblock;
2280  continue;
2281  }
2282 
2283  B2 = serial( submatrix< remainder ? unaligned : aligned >( B, kk, jj, kblock, jblock, unchecked ) );
2284 
2285  size_t i( 0UL );
2286 
2287  if( IsFloatingPoint_v<ET1> )
2288  {
2289  for( ; (i+5UL) <= isize; i+=5UL )
2290  {
2291  if( jj+jblock < ibegin ) continue;
2292 
2293  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2294 
2295  for( ; (j+2UL) <= jblock; j+=2UL )
2296  {
2297  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
2298 
2299  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2300  {
2301  const SIMDType a1( A2.load(i ,k) );
2302  const SIMDType a2( A2.load(i+1UL,k) );
2303  const SIMDType a3( A2.load(i+2UL,k) );
2304  const SIMDType a4( A2.load(i+3UL,k) );
2305  const SIMDType a5( A2.load(i+4UL,k) );
2306 
2307  const SIMDType b1( B2.load(k,j ) );
2308  const SIMDType b2( B2.load(k,j+1UL) );
2309 
2310  xmm1 += a1 * b1;
2311  xmm2 += a1 * b2;
2312  xmm3 += a2 * b1;
2313  xmm4 += a2 * b2;
2314  xmm5 += a3 * b1;
2315  xmm6 += a3 * b2;
2316  xmm7 += a4 * b1;
2317  xmm8 += a4 * b2;
2318  xmm9 += a5 * b1;
2319  xmm10 += a5 * b2;
2320  }
2321 
2322  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2323  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2324  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
2325  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
2326  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
2327  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
2328  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
2329  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
2330  c(ibegin+i+4UL,jj+j ) += sum( xmm9 ) * alpha;
2331  c(ibegin+i+4UL,jj+j+1UL) += sum( xmm10 ) * alpha;
2332  }
2333 
2334  if( j<jblock )
2335  {
2336  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
2337 
2338  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2339  {
2340  const SIMDType a1( A2.load(i ,k) );
2341  const SIMDType a2( A2.load(i+1UL,k) );
2342  const SIMDType a3( A2.load(i+2UL,k) );
2343  const SIMDType a4( A2.load(i+3UL,k) );
2344  const SIMDType a5( A2.load(i+4UL,k) );
2345 
2346  const SIMDType b1( B2.load(k,j) );
2347 
2348  xmm1 += a1 * b1;
2349  xmm2 += a2 * b1;
2350  xmm3 += a3 * b1;
2351  xmm4 += a4 * b1;
2352  xmm5 += a5 * b1;
2353  }
2354 
2355  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
2356  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
2357  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
2358  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
2359  c(ibegin+i+4UL,jj+j) += sum( xmm5 ) * alpha;
2360  }
2361  }
2362  }
2363  else
2364  {
2365  for( ; (i+4UL) <= isize; i+=4UL )
2366  {
2367  if( jj+jblock < ibegin ) continue;
2368 
2369  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2370 
2371  for( ; (j+2UL) <= jblock; j+=2UL )
2372  {
2373  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2374 
2375  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2376  {
2377  const SIMDType a1( A2.load(i ,k) );
2378  const SIMDType a2( A2.load(i+1UL,k) );
2379  const SIMDType a3( A2.load(i+2UL,k) );
2380  const SIMDType a4( A2.load(i+3UL,k) );
2381 
2382  const SIMDType b1( B2.load(k,j ) );
2383  const SIMDType b2( B2.load(k,j+1UL) );
2384 
2385  xmm1 += a1 * b1;
2386  xmm2 += a1 * b2;
2387  xmm3 += a2 * b1;
2388  xmm4 += a2 * b2;
2389  xmm5 += a3 * b1;
2390  xmm6 += a3 * b2;
2391  xmm7 += a4 * b1;
2392  xmm8 += a4 * b2;
2393  }
2394 
2395  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2396  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2397  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
2398  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
2399  c(ibegin+i+2UL,jj+j ) += sum( xmm5 ) * alpha;
2400  c(ibegin+i+2UL,jj+j+1UL) += sum( xmm6 ) * alpha;
2401  c(ibegin+i+3UL,jj+j ) += sum( xmm7 ) * alpha;
2402  c(ibegin+i+3UL,jj+j+1UL) += sum( xmm8 ) * alpha;
2403  }
2404 
2405  if( j<jblock )
2406  {
2407  SIMDType xmm1, xmm2, xmm3, xmm4;
2408 
2409  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2410  {
2411  const SIMDType a1( A2.load(i ,k) );
2412  const SIMDType a2( A2.load(i+1UL,k) );
2413  const SIMDType a3( A2.load(i+2UL,k) );
2414  const SIMDType a4( A2.load(i+3UL,k) );
2415 
2416  const SIMDType b1( B2.load(k,j) );
2417 
2418  xmm1 += a1 * b1;
2419  xmm2 += a2 * b1;
2420  xmm3 += a3 * b1;
2421  xmm4 += a4 * b1;
2422  }
2423 
2424  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
2425  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
2426  c(ibegin+i+2UL,jj+j) += sum( xmm3 ) * alpha;
2427  c(ibegin+i+3UL,jj+j) += sum( xmm4 ) * alpha;
2428  }
2429  }
2430  }
2431 
2432  for( ; (i+2UL) <= isize; i+=2UL )
2433  {
2434  if( jj+jblock < ibegin ) continue;
2435 
2436  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2437 
2438  for( ; (j+4UL) <= jblock; j+=4UL )
2439  {
2440  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2441 
2442  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2443  {
2444  const SIMDType a1( A2.load(i ,k) );
2445  const SIMDType a2( A2.load(i+1UL,k) );
2446 
2447  const SIMDType b1( B2.load(k,j ) );
2448  const SIMDType b2( B2.load(k,j+1UL) );
2449  const SIMDType b3( B2.load(k,j+2UL) );
2450  const SIMDType b4( B2.load(k,j+3UL) );
2451 
2452  xmm1 += a1 * b1;
2453  xmm2 += a1 * b2;
2454  xmm3 += a1 * b3;
2455  xmm4 += a1 * b4;
2456  xmm5 += a2 * b1;
2457  xmm6 += a2 * b2;
2458  xmm7 += a2 * b3;
2459  xmm8 += a2 * b4;
2460  }
2461 
2462  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2463  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2464  c(ibegin+i ,jj+j+2UL) += sum( xmm3 ) * alpha;
2465  c(ibegin+i ,jj+j+3UL) += sum( xmm4 ) * alpha;
2466  c(ibegin+i+1UL,jj+j ) += sum( xmm5 ) * alpha;
2467  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm6 ) * alpha;
2468  c(ibegin+i+1UL,jj+j+2UL) += sum( xmm7 ) * alpha;
2469  c(ibegin+i+1UL,jj+j+3UL) += sum( xmm8 ) * alpha;
2470  }
2471 
2472  for( ; (j+2UL) <= jblock; j+=2UL )
2473  {
2474  SIMDType xmm1, xmm2, xmm3, xmm4;
2475 
2476  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2477  {
2478  const SIMDType a1( A2.load(i ,k) );
2479  const SIMDType a2( A2.load(i+1UL,k) );
2480 
2481  const SIMDType b1( B2.load(k,j ) );
2482  const SIMDType b2( B2.load(k,j+1UL) );
2483 
2484  xmm1 += a1 * b1;
2485  xmm2 += a1 * b2;
2486  xmm3 += a2 * b1;
2487  xmm4 += a2 * b2;
2488  }
2489 
2490  c(ibegin+i ,jj+j ) += sum( xmm1 ) * alpha;
2491  c(ibegin+i ,jj+j+1UL) += sum( xmm2 ) * alpha;
2492  c(ibegin+i+1UL,jj+j ) += sum( xmm3 ) * alpha;
2493  c(ibegin+i+1UL,jj+j+1UL) += sum( xmm4 ) * alpha;
2494  }
2495 
2496  if( j<jblock )
2497  {
2498  SIMDType xmm1, xmm2;
2499 
2500  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2501  {
2502  const SIMDType a1( A2.load(i ,k) );
2503  const SIMDType a2( A2.load(i+1UL,k) );
2504 
2505  const SIMDType b1( B2.load(k,j) );
2506 
2507  xmm1 += a1 * b1;
2508  xmm2 += a2 * b1;
2509  }
2510 
2511  c(ibegin+i ,jj+j) += sum( xmm1 ) * alpha;
2512  c(ibegin+i+1UL,jj+j) += sum( xmm2 ) * alpha;
2513  }
2514  }
2515 
2516  if( i<isize && jj+jblock >= ibegin )
2517  {
2518  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2519 
2520  for( ; (j+2UL) <= jblock; j+=2UL )
2521  {
2522  SIMDType xmm1, xmm2;
2523 
2524  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2525  {
2526  const SIMDType a1( A2.load(i,k) );
2527 
2528  xmm1 += a1 * B2.load(k,j );
2529  xmm2 += a1 * B2.load(k,j+1UL);
2530  }
2531 
2532  c(ibegin+i,jj+j ) += sum( xmm1 ) * alpha;
2533  c(ibegin+i,jj+j+1UL) += sum( xmm2 ) * alpha;
2534  }
2535 
2536  if( j<jblock )
2537  {
2538  SIMDType xmm1;
2539 
2540  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2541  {
2542  const SIMDType a1( A2.load(i,k) );
2543 
2544  xmm1 += a1 * B2.load(k,j);
2545  }
2546 
2547  c(ibegin+i,jj+j) += sum( xmm1 ) * alpha;
2548  }
2549  }
2550 
2551  jj += jblock;
2552  }
2553 
2554  kk += kblock;
2555  }
2556 
2557  if( remainder && kk < K )
2558  {
2559  const size_t ksize( K - kk );
2560 
2561  const size_t ibegin( IsLower_v<MT2> ? kk : 0UL );
2562  const size_t isize ( M - ibegin );
2563 
2564  A2 = serial( submatrix( A, ibegin, kk, isize, ksize, unchecked ) );
2565 
2566  size_t jj( 0UL );
2567  size_t jblock( 0UL );
2568 
2569  while( jj < N )
2570  {
2571  jblock = ( ( jj+JBLOCK <= N )?( JBLOCK ):( N - jj ) );
2572 
2573  if( IsUpper_v<MT3> && jj+jblock <= kk ) {
2574  jj += jblock;
2575  continue;
2576  }
2577 
2578  B2 = serial( submatrix( B, kk, jj, ksize, jblock, unchecked ) );
2579 
2580  size_t i( 0UL );
2581 
2582  if( IsFloatingPoint_v<ET1> )
2583  {
2584  for( ; (i+5UL) <= isize; i+=5UL )
2585  {
2586  if( jj+jblock < ibegin ) continue;
2587 
2588  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2589 
2590  for( ; (j+2UL) <= jblock; j+=2UL ) {
2591  for( size_t k=0UL; k<ksize; ++k ) {
2592  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
2593  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2594  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2595  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2596  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
2597  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
2598  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
2599  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
2600  c(ibegin+i+4UL,jj+j ) += A2(i+4UL,k) * B2(k,j ) * alpha;
2601  c(ibegin+i+4UL,jj+j+1UL) += A2(i+4UL,k) * B2(k,j+1UL) * alpha;
2602  }
2603  }
2604 
2605  if( j<jblock ) {
2606  for( size_t k=0UL; k<ksize; ++k ) {
2607  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
2608  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2609  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
2610  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
2611  c(ibegin+i+4UL,jj+j) += A2(i+4UL,k) * B2(k,j) * alpha;
2612  }
2613  }
2614  }
2615  }
2616  else
2617  {
2618  for( ; (i+4UL) <= isize; i+=4UL )
2619  {
2620  if( jj+jblock < ibegin ) continue;
2621 
2622  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2623 
2624  for( ; (j+2UL) <= jblock; j+=2UL ) {
2625  for( size_t k=0UL; k<ksize; ++k ) {
2626  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
2627  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2628  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2629  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2630  c(ibegin+i+2UL,jj+j ) += A2(i+2UL,k) * B2(k,j ) * alpha;
2631  c(ibegin+i+2UL,jj+j+1UL) += A2(i+2UL,k) * B2(k,j+1UL) * alpha;
2632  c(ibegin+i+3UL,jj+j ) += A2(i+3UL,k) * B2(k,j ) * alpha;
2633  c(ibegin+i+3UL,jj+j+1UL) += A2(i+3UL,k) * B2(k,j+1UL) * alpha;
2634  }
2635  }
2636 
2637  if( j<jblock ) {
2638  for( size_t k=0UL; k<ksize; ++k ) {
2639  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
2640  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2641  c(ibegin+i+2UL,jj+j) += A2(i+2UL,k) * B2(k,j) * alpha;
2642  c(ibegin+i+3UL,jj+j) += A2(i+3UL,k) * B2(k,j) * alpha;
2643  }
2644  }
2645  }
2646  }
2647 
2648  for( ; (i+2UL) <= isize; i+=2UL )
2649  {
2650  if( jj+jblock < ibegin ) continue;
2651 
2652  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2653 
2654  for( ; (j+2UL) <= jblock; j+=2UL ) {
2655  for( size_t k=0UL; k<ksize; ++k ) {
2656  c(ibegin+i ,jj+j ) += A2(i ,k) * B2(k,j ) * alpha;
2657  c(ibegin+i ,jj+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
2658  c(ibegin+i+1UL,jj+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
2659  c(ibegin+i+1UL,jj+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
2660  }
2661  }
2662 
2663  if( j<jblock ) {
2664  for( size_t k=0UL; k<ksize; ++k ) {
2665  c(ibegin+i ,jj+j) += A2(i ,k) * B2(k,j) * alpha;
2666  c(ibegin+i+1UL,jj+j) += A2(i+1UL,k) * B2(k,j) * alpha;
2667  }
2668  }
2669  }
2670 
2671  if( i<isize && jj+jblock >= ibegin )
2672  {
2673  size_t j( ( jj > ibegin+i )?( 0UL ):( ibegin+i-jj ) );
2674 
2675  for( ; (j+2UL) <= jblock; j+=2UL ) {
2676  for( size_t k=0UL; k<ksize; ++k ) {
2677  c(ibegin+i,jj+j ) += A2(i,k) * B2(k,j ) * alpha;
2678  c(ibegin+i,jj+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
2679  }
2680  }
2681 
2682  if( j<jblock ) {
2683  for( size_t k=0UL; k<ksize; ++k ) {
2684  c(ibegin+i,jj+j) += A2(i,k) * B2(k,j) * alpha;
2685  }
2686  }
2687  }
2688 
2689  jj += jblock;
2690  }
2691  }
2692 }
2694 //*************************************************************************************************
2695 
2696 
2697 //*************************************************************************************************
// Column-major kernel: scales C by beta and then accumulates alpha*(A*B) into the elements of C
// that lie on or above the diagonal (plus a few elements just below it, see the row bounds
// 'iend' further down, which are computed per column group rather than per column).
//
// C      target column-major dense matrix; all accesses go through derestrict( ~C )
// A, B   left-hand and right-hand side operands; A.columns() must equal B.rows()
// alpha  factor applied to every accumulated dot product before it is added to C
// beta   factor applied to C up front: if isDefault( beta ) holds C is reset, if isOne( beta )
//        holds C is left as it is, otherwise C is multiplied by beta
//
// The k dimension is processed in panels of at most KBLOCK elements and the rows of A in blocks
// of at most IBLOCK rows. Each panel of B is copied into B2 (column-major) and each block of A
// into A2 (row-major), so the innermost loops load SIMD vectors along k from both operands.
2716 template< typename MT1, typename MT2, typename MT3, typename ST >
2717 void ummm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha, ST beta )
2718 {
2719  using ET1 = ElementType_t<MT1>;
2720  using ET2 = ElementType_t<MT2>;
2721  using ET3 = ElementType_t<MT3>;
2722  using SIMDType = SIMDTrait_t<ET1>;
2723 
2732 
2735 
2738 
2741 
2742  constexpr size_t SIMDSIZE( SIMDTrait<ET1>::size );
2743 
// If either operand is not padded, the SIMD loops only cover a multiple of SIMDSIZE elements in
// k and the scalar section at the end of the function handles the leftover k range.
2744  constexpr bool remainder( !IsPadded_v<MT2> || !IsPadded_v<MT3> );
2745 
2746  constexpr size_t KBLOCK( MMM_OUTER_BLOCK_SIZE * ( 16UL/sizeof(ET1) ) );
2747  constexpr size_t IBLOCK( MMM_INNER_BLOCK_SIZE );
2748 
2749  BLAZE_STATIC_ASSERT( KBLOCK >= SIMDSIZE && KBLOCK % SIMDSIZE == 0UL );
2750  BLAZE_STATIC_ASSERT( IBLOCK >= SIMDSIZE && IBLOCK % SIMDSIZE == 0UL );
2751 
2752  const size_t M( A.rows() );
2753  const size_t N( B.columns() );
2754  const size_t K( A.columns() );
2755 
2756  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
2757 
// Scratch copies of the current block of A and the current panel of B.
2758  DynamicMatrix<ET2,false> A2( IBLOCK, KBLOCK );
2759  DynamicMatrix<ET3,true> B2( KBLOCK, N );
2760 
2761  decltype(auto) c( derestrict( ~C ) );
2762 
// Apply beta once, before any product contribution is accumulated.
2763  if( isDefault( beta ) ) {
2764  reset( c );
2765  }
2766  else if( !isOne( beta ) ) {
2767  c *= beta;
2768  }
2769 
2770  size_t kk( 0UL );
2771  size_t kblock( 0UL );
2772 
// SIMD part: with 'remainder' set, the loop only runs while at least SIMDSIZE elements of k are left.
2773  while( kk + ( remainder ? SIMDSIZE-1UL : 0UL ) < K )
2774  {
2775  if( remainder ) {
// The last panel is cut down to a multiple of SIMDSIZE (the mask assumes SIMDSIZE is a power of two).
2776  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( ( K - kk ) & size_t(-SIMDSIZE) ) );
2777  }
2778  else {
2779  kblock = ( ( kk+KBLOCK <= K )?( KBLOCK ):( K - kk ) );
2780  }
2781 
// Column range of B used for this k panel. It is narrowed when MT3 is an upper or lower type,
// presumably because the excluded entries of B are structurally zero.
2782  const size_t jbegin( IsUpper_v<MT3> ? kk : 0UL );
2783  const size_t jend ( IsLower_v<MT3> ? kk+kblock : N );
2784  const size_t jsize ( jend - jbegin );
2785 
2786  B2 = serial( submatrix< remainder ? unaligned : aligned >( B, kk, jbegin, kblock, jsize, unchecked ) );
2787 
2788  size_t ii( 0UL );
2789  size_t iblock( 0UL );
2790 
2791  while( ii < M )
2792  {
2793  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
2794 
// Skip row blocks of A that do not overlap the current k panel for a lower/upper MT2.
2795  if( ( IsLower_v<MT2> && ii+iblock <= kk ) ||
2796  ( IsUpper_v<MT2> && kk+kblock <= ii ) ) {
2797  ii += iblock;
2798  continue;
2799  }
2800 
2801  A2 = serial( submatrix< remainder ? unaligned : aligned >( A, ii, kk, iblock, kblock, unchecked ) );
2802 
// 'j' indexes columns relative to jbegin and is shared by all column-group loops below:
// groups of five (floating point) or four columns first, then groups of two, then a single column.
2803  size_t j( 0UL );
2804 
2805  if( IsFloatingPoint_v<ET3> )
2806  {
2807  for( ; (j+5UL) <= jsize; j+=5UL )
2808  {
// The whole row block lies below the last column of this group: nothing to update.
2809  if( ii > jbegin+j+4UL ) continue;
2810 
// Rows of the block are processed up to and including the row whose index equals the
// index of the group's last column.
2811  const size_t iend( min( iblock, jbegin+j-ii+5UL ) );
2812  size_t i( 0UL );
2813 
2814  for( ; (i+2UL) <= iend; i+=2UL )
2815  {
2816  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
2817 
2818  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2819  {
2820  const SIMDType a1( A2.load(i ,k) );
2821  const SIMDType a2( A2.load(i+1UL,k) );
2822 
2823  const SIMDType b1( B2.load(k,j ) );
2824  const SIMDType b2( B2.load(k,j+1UL) );
2825  const SIMDType b3( B2.load(k,j+2UL) );
2826  const SIMDType b4( B2.load(k,j+3UL) );
2827  const SIMDType b5( B2.load(k,j+4UL) );
2828 
2829  xmm1 += a1 * b1;
2830  xmm2 += a1 * b2;
2831  xmm3 += a1 * b3;
2832  xmm4 += a1 * b4;
2833  xmm5 += a1 * b5;
2834  xmm6 += a2 * b1;
2835  xmm7 += a2 * b2;
2836  xmm8 += a2 * b3;
2837  xmm9 += a2 * b4;
2838  xmm10 += a2 * b5;
2839  }
2840 
// Reduce each SIMD accumulator to a scalar, scale by alpha and add it to the target element.
2841  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
2842  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2843  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2844  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2845  c(ii+i ,jbegin+j+4UL) += sum( xmm5 ) * alpha;
2846  c(ii+i+1UL,jbegin+j ) += sum( xmm6 ) * alpha;
2847  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm7 ) * alpha;
2848  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm8 ) * alpha;
2849  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm9 ) * alpha;
2850  c(ii+i+1UL,jbegin+j+4UL) += sum( xmm10 ) * alpha;
2851  }
2852 
// Leftover single row of the block for this column group.
2853  if( i<iend )
2854  {
2855  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
2856 
2857  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2858  {
2859  const SIMDType a1( A2.load(i,k) );
2860 
2861  xmm1 += a1 * B2.load(k,j );
2862  xmm2 += a1 * B2.load(k,j+1UL);
2863  xmm3 += a1 * B2.load(k,j+2UL);
2864  xmm4 += a1 * B2.load(k,j+3UL);
2865  xmm5 += a1 * B2.load(k,j+4UL);
2866  }
2867 
2868  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
2869  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2870  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2871  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2872  c(ii+i,jbegin+j+4UL) += sum( xmm5 ) * alpha;
2873  }
2874  }
2875  }
2876  else
2877  {
// Same scheme as above with groups of four columns for non-floating-point element types.
2878  for( ; (j+4UL) <= jsize; j+=4UL )
2879  {
2880  if( ii > jbegin+j+3UL ) continue;
2881 
2882  const size_t iend( min( iblock, jbegin+j-ii+4UL ) );
2883  size_t i( 0UL );
2884 
2885  for( ; (i+2UL) <= iend; i+=2UL )
2886  {
2887  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2888 
2889  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2890  {
2891  const SIMDType a1( A2.load(i ,k) );
2892  const SIMDType a2( A2.load(i+1UL,k) );
2893 
2894  const SIMDType b1( B2.load(k,j ) );
2895  const SIMDType b2( B2.load(k,j+1UL) );
2896  const SIMDType b3( B2.load(k,j+2UL) );
2897  const SIMDType b4( B2.load(k,j+3UL) );
2898 
2899  xmm1 += a1 * b1;
2900  xmm2 += a1 * b2;
2901  xmm3 += a1 * b3;
2902  xmm4 += a1 * b4;
2903  xmm5 += a2 * b1;
2904  xmm6 += a2 * b2;
2905  xmm7 += a2 * b3;
2906  xmm8 += a2 * b4;
2907  }
2908 
2909  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
2910  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2911  c(ii+i ,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2912  c(ii+i ,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2913  c(ii+i+1UL,jbegin+j ) += sum( xmm5 ) * alpha;
2914  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
2915  c(ii+i+1UL,jbegin+j+2UL) += sum( xmm7 ) * alpha;
2916  c(ii+i+1UL,jbegin+j+3UL) += sum( xmm8 ) * alpha;
2917  }
2918 
2919  if( i<iend )
2920  {
2921  SIMDType xmm1, xmm2, xmm3, xmm4;
2922 
2923  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2924  {
2925  const SIMDType a1( A2.load(i,k) );
2926 
2927  xmm1 += a1 * B2.load(k,j );
2928  xmm2 += a1 * B2.load(k,j+1UL);
2929  xmm3 += a1 * B2.load(k,j+2UL);
2930  xmm4 += a1 * B2.load(k,j+3UL);
2931  }
2932 
2933  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
2934  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2935  c(ii+i,jbegin+j+2UL) += sum( xmm3 ) * alpha;
2936  c(ii+i,jbegin+j+3UL) += sum( xmm4 ) * alpha;
2937  }
2938  }
2939  }
2940 
// Remaining columns in groups of two; here the rows are unrolled by four, then two, then one.
2941  for( ; (j+2UL) <= jsize; j+=2UL )
2942  {
2943  if( ii > jbegin+j+1UL ) continue;
2944 
2945  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
2946  size_t i( 0UL );
2947 
2948  for( ; (i+4UL) <= iend; i+=4UL )
2949  {
2950  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2951 
2952  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2953  {
2954  const SIMDType a1( A2.load(i ,k) );
2955  const SIMDType a2( A2.load(i+1UL,k) );
2956  const SIMDType a3( A2.load(i+2UL,k) );
2957  const SIMDType a4( A2.load(i+3UL,k) );
2958 
2959  const SIMDType b1( B2.load(k,j ) );
2960  const SIMDType b2( B2.load(k,j+1UL) );
2961 
2962  xmm1 += a1 * b1;
2963  xmm2 += a1 * b2;
2964  xmm3 += a2 * b1;
2965  xmm4 += a2 * b2;
2966  xmm5 += a3 * b1;
2967  xmm6 += a3 * b2;
2968  xmm7 += a4 * b1;
2969  xmm8 += a4 * b2;
2970  }
2971 
2972  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
2973  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
2974  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
2975  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
2976  c(ii+i+2UL,jbegin+j ) += sum( xmm5 ) * alpha;
2977  c(ii+i+2UL,jbegin+j+1UL) += sum( xmm6 ) * alpha;
2978  c(ii+i+3UL,jbegin+j ) += sum( xmm7 ) * alpha;
2979  c(ii+i+3UL,jbegin+j+1UL) += sum( xmm8 ) * alpha;
2980  }
2981 
2982  for( ; (i+2UL) <= iend; i+=2UL )
2983  {
2984  SIMDType xmm1, xmm2, xmm3, xmm4;
2985 
2986  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
2987  {
2988  const SIMDType a1( A2.load(i ,k) );
2989  const SIMDType a2( A2.load(i+1UL,k) );
2990 
2991  const SIMDType b1( B2.load(k,j ) );
2992  const SIMDType b2( B2.load(k,j+1UL) );
2993 
2994  xmm1 += a1 * b1;
2995  xmm2 += a1 * b2;
2996  xmm3 += a2 * b1;
2997  xmm4 += a2 * b2;
2998  }
2999 
3000  c(ii+i ,jbegin+j ) += sum( xmm1 ) * alpha;
3001  c(ii+i ,jbegin+j+1UL) += sum( xmm2 ) * alpha;
3002  c(ii+i+1UL,jbegin+j ) += sum( xmm3 ) * alpha;
3003  c(ii+i+1UL,jbegin+j+1UL) += sum( xmm4 ) * alpha;
3004  }
3005 
3006  if( i<iend )
3007  {
3008  SIMDType xmm1, xmm2;
3009 
3010  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
3011  {
3012  const SIMDType a1( A2.load(i,k) );
3013 
3014  xmm1 += a1 * B2.load(k,j );
3015  xmm2 += a1 * B2.load(k,j+1UL);
3016  }
3017 
3018  c(ii+i,jbegin+j ) += sum( xmm1 ) * alpha;
3019  c(ii+i,jbegin+j+1UL) += sum( xmm2 ) * alpha;
3020  }
3021  }
3022 
// Final single column, only if the row block reaches up to (or above) that column's diagonal element.
3023  if( j<jsize && ii <= jbegin+j )
3024  {
// NOTE(review): the bound uses +2UL like the two-column loop although a single column is
// handled here, so the element one row below the diagonal is updated as well - confirm intended.
3025  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
3026  size_t i( 0UL );
3027 
3028  for( ; (i+2UL) <= iend; i+=2UL )
3029  {
3030  SIMDType xmm1, xmm2;
3031 
3032  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
3033  {
3034  const SIMDType b1( B2.load(k,j) );
3035 
3036  xmm1 += A2.load(i ,k) * b1;
3037  xmm2 += A2.load(i+1UL,k) * b1;
3038  }
3039 
3040  c(ii+i ,jbegin+j) += sum( xmm1 ) * alpha;
3041  c(ii+i+1UL,jbegin+j) += sum( xmm2 ) * alpha;
3042  }
3043 
3044  if( i<iend )
3045  {
3046  SIMDType xmm1;
3047 
3048  for( size_t k=0UL; k<kblock; k+=SIMDSIZE )
3049  {
3050  xmm1 += A2.load(i,k) * B2.load(k,j);
3051  }
3052 
3053  c(ii+i,jbegin+j) += sum( xmm1 ) * alpha;
3054  }
3055  }
3056 
3057  ii += iblock;
3058  }
3059 
3060  kk += kblock;
3061  }
3062 
// Scalar part: the k range [kk,K) that the SIMD loops above did not cover (fewer than SIMDSIZE
// elements, given the loop condition above). It uses the same column grouping and row bounds,
// but accumulates element by element directly into c.
3063  if( remainder && kk < K )
3064  {
3065  const size_t ksize( K - kk );
3066 
3067  const size_t jbegin( IsUpper_v<MT3> ? kk : 0UL );
3068  const size_t jsize ( N - jbegin );
3069 
3070  B2 = serial( submatrix( B, kk, jbegin, ksize, jsize, unchecked ) );
3071 
3072  size_t ii( 0UL );
3073  size_t iblock( 0UL );
3074 
3075  while( ii < M )
3076  {
3077  iblock = ( ( ii+IBLOCK <= M )?( IBLOCK ):( M - ii ) );
3078 
3079  if( IsLower_v<MT2> && ii+iblock <= kk ) {
3080  ii += iblock;
3081  continue;
3082  }
3083 
3084  A2 = serial( submatrix( A, ii, kk, iblock, ksize, unchecked ) );
3085 
3086  size_t j( 0UL );
3087 
// NOTE(review): this branch tests ET1 whereas the SIMD part above tests ET3 - confirm both are meant to agree.
3088  if( IsFloatingPoint_v<ET1> )
3089  {
3090  for( ; (j+5UL) <= jsize; j+=5UL )
3091  {
3092  if( ii > jbegin+j+4UL ) continue;
3093 
3094  const size_t iend( min( iblock, jbegin+j-ii+5UL ) );
3095  size_t i( 0UL );
3096 
3097  for( ; (i+2UL) <= iend; i+=2UL ) {
3098  for( size_t k=0UL; k<ksize; ++k ) {
3099  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
3100  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
3101  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
3102  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
3103  c(ii+i ,jbegin+j+4UL) += A2(i ,k) * B2(k,j+4UL) * alpha;
3104  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
3105  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
3106  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
3107  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
3108  c(ii+i+1UL,jbegin+j+4UL) += A2(i+1UL,k) * B2(k,j+4UL) * alpha;
3109  }
3110  }
3111 
3112  if( i<iend ) {
3113  for( size_t k=0UL; k<ksize; ++k ) {
3114  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
3115  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
3116  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
3117  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
3118  c(ii+i,jbegin+j+4UL) += A2(i,k) * B2(k,j+4UL) * alpha;
3119  }
3120  }
3121  }
3122  }
3123  else
3124  {
3125  for( ; (j+4UL) <= jsize; j+=4UL )
3126  {
3127  if( ii > jbegin+j+3UL ) continue;
3128 
3129  const size_t iend( min( iblock, jbegin+j-ii+4UL ) );
3130  size_t i( 0UL );
3131 
3132  for( ; (i+2UL) <= iend; i+=2UL ) {
3133  for( size_t k=0UL; k<ksize; ++k ) {
3134  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
3135  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
3136  c(ii+i ,jbegin+j+2UL) += A2(i ,k) * B2(k,j+2UL) * alpha;
3137  c(ii+i ,jbegin+j+3UL) += A2(i ,k) * B2(k,j+3UL) * alpha;
3138  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
3139  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
3140  c(ii+i+1UL,jbegin+j+2UL) += A2(i+1UL,k) * B2(k,j+2UL) * alpha;
3141  c(ii+i+1UL,jbegin+j+3UL) += A2(i+1UL,k) * B2(k,j+3UL) * alpha;
3142  }
3143  }
3144 
3145  if( i<iend ) {
3146  for( size_t k=0UL; k<ksize; ++k ) {
3147  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
3148  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
3149  c(ii+i,jbegin+j+2UL) += A2(i,k) * B2(k,j+2UL) * alpha;
3150  c(ii+i,jbegin+j+3UL) += A2(i,k) * B2(k,j+3UL) * alpha;
3151  }
3152  }
3153  }
3154  }
3155 
3156  for( ; (j+2UL) <= jsize; j+=2UL )
3157  {
3158  if( ii > jbegin+j+1UL ) continue;
3159 
3160  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
3161  size_t i( 0UL );
3162 
3163  for( ; (i+2UL) <= iend; i+=2UL ) {
3164  for( size_t k=0UL; k<ksize; ++k ) {
3165  c(ii+i ,jbegin+j ) += A2(i ,k) * B2(k,j ) * alpha;
3166  c(ii+i ,jbegin+j+1UL) += A2(i ,k) * B2(k,j+1UL) * alpha;
3167  c(ii+i+1UL,jbegin+j ) += A2(i+1UL,k) * B2(k,j ) * alpha;
3168  c(ii+i+1UL,jbegin+j+1UL) += A2(i+1UL,k) * B2(k,j+1UL) * alpha;
3169  }
3170  }
3171 
3172  if( i<iend ) {
3173  for( size_t k=0UL; k<ksize; ++k ) {
3174  c(ii+i,jbegin+j ) += A2(i,k) * B2(k,j ) * alpha;
3175  c(ii+i,jbegin+j+1UL) += A2(i,k) * B2(k,j+1UL) * alpha;
3176  }
3177  }
3178  }
3179 
3180  if( j<jsize && ii <= jbegin+j )
3181  {
3182  const size_t iend( min( iblock, jbegin+j-ii+2UL ) );
3183  size_t i( 0UL );
3184 
3185  for( ; (i+2UL) <= iend; i+=2UL ) {
3186  for( size_t k=0UL; k<ksize; ++k ) {
3187  c(ii+i ,jbegin+j) += A2(i ,k) * B2(k,j) * alpha;
3188  c(ii+i+1UL,jbegin+j) += A2(i+1UL,k) * B2(k,j) * alpha;
3189  }
3190  }
3191 
3192  if( i<iend ) {
3193  for( size_t k=0UL; k<ksize; ++k ) {
3194  c(ii+i,jbegin+j) += A2(i,k) * B2(k,j) * alpha;
3195  }
3196  }
3197  }
3198 
3199  ii += iblock;
3200  }
3201  }
3202 }
3204 //*************************************************************************************************
3205 
3206 
3207 //*************************************************************************************************
3223 template< typename MT1, typename MT2, typename MT3 >
3224 inline void ummm( MT1& C, const MT2& A, const MT3& B )
3225 {
3226  using ET1 = ElementType_t<MT1>;
3227  using ET2 = ElementType_t<MT2>;
3228  using ET3 = ElementType_t<MT3>;
3229 
3232 
3233  ummm( C, A, B, ET1(1), ET1(0) );
3234 }
3236 //*************************************************************************************************
3237 
3238 
3239 
3240 
3241 //=================================================================================================
3242 //
3243 // SYMMETRIC DENSE MATRIX MULTIPLICATION KERNELS
3244 //
3245 //=================================================================================================
3246 
3247 //*************************************************************************************************
3265 template< typename MT1, typename MT2, typename MT3, typename ST >
3266 void smmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha )
3267 {
3268  using ET1 = ElementType_t<MT1>;
3269  using ET2 = ElementType_t<MT2>;
3270  using ET3 = ElementType_t<MT3>;
3271 
3276 
3279 
3282 
3285 
3286  const size_t M( A.rows() );
3287  const size_t N( B.columns() );
3288 
3289  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3290 
3291  lmmm( C, A, B, alpha, ST(0) );
3292 
3293  for( size_t ii=0UL; ii<M; ii+=BLOCK_SIZE )
3294  {
3295  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3296 
3297  for( size_t i=ii; i<iend; ++i ) {
3298  for( size_t j=i+1UL; j<iend; ++j ) {
3299  (~C)(i,j) = (~C)(j,i);
3300  }
3301  }
3302 
3303  for( size_t jj=ii+BLOCK_SIZE; jj<N; jj+=BLOCK_SIZE ) {
3304  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3305  for( size_t i=ii; i<iend; ++i ) {
3306  for( size_t j=jj; j<jend; ++j ) {
3307  (~C)(i,j) = (~C)(j,i);
3308  }
3309  }
3310  }
3311  }
3312 }
3314 //*************************************************************************************************
3315 
3316 
3317 //*************************************************************************************************
3335 template< typename MT1, typename MT2, typename MT3, typename ST >
3336 void smmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha )
3337 {
3338  using ET1 = ElementType_t<MT1>;
3339  using ET2 = ElementType_t<MT2>;
3340  using ET3 = ElementType_t<MT3>;
3341 
3346 
3349 
3352 
3355 
3356  const size_t M( A.rows() );
3357  const size_t N( B.columns() );
3358 
3359  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3360 
3361  ummm( C, A, B, alpha, ST(0) );
3362 
3363  for( size_t jj=0UL; jj<N; jj+=BLOCK_SIZE )
3364  {
3365  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3366 
3367  for( size_t j=jj; j<jend; ++j ) {
3368  for( size_t i=jj+1UL; i<jend; ++i ) {
3369  (~C)(i,j) = (~C)(j,i);
3370  }
3371  }
3372 
3373  for( size_t ii=jj+BLOCK_SIZE; ii<M; ii+=BLOCK_SIZE ) {
3374  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3375  for( size_t j=jj; j<jend; ++j ) {
3376  for( size_t i=ii; i<iend; ++i ) {
3377  (~C)(i,j) = (~C)(j,i);
3378  }
3379  }
3380  }
3381  }
3382 }
3384 //*************************************************************************************************
3385 
3386 
3387 //*************************************************************************************************
3403 template< typename MT1, typename MT2, typename MT3 >
3404 inline void smmm( MT1& C, const MT2& A, const MT3& B )
3405 {
3406  using ET1 = ElementType_t<MT1>;
3407  using ET2 = ElementType_t<MT2>;
3408  using ET3 = ElementType_t<MT3>;
3409 
3412 
3413  smmm( C, A, B, ET1(1) );
3414 }
3416 //*************************************************************************************************
3417 
3418 
3419 
3420 
3421 //=================================================================================================
3422 //
3423 // HERMITIAN DENSE MATRIX MULTIPLICATION KERNELS
3424 //
3425 //=================================================================================================
3426 
3427 //*************************************************************************************************
3445 template< typename MT1, typename MT2, typename MT3, typename ST >
3446 void hmmm( DenseMatrix<MT1,false>& C, const MT2& A, const MT3& B, ST alpha )
3447 {
3448  using ET1 = ElementType_t<MT1>;
3449  using ET2 = ElementType_t<MT2>;
3450  using ET3 = ElementType_t<MT3>;
3451 
3456 
3459 
3462 
3465 
3466  const size_t M( A.rows() );
3467  const size_t N( B.columns() );
3468 
3469  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3470 
3471  lmmm( C, A, B, alpha, ST(0) );
3472 
3473  for( size_t ii=0UL; ii<M; ii+=BLOCK_SIZE )
3474  {
3475  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3476 
3477  for( size_t i=ii; i<iend; ++i ) {
3478  for( size_t j=i+1UL; j<iend; ++j ) {
3479  (~C)(i,j) = conj( (~C)(j,i) );
3480  }
3481  }
3482 
3483  for( size_t jj=ii+BLOCK_SIZE; jj<N; jj+=BLOCK_SIZE ) {
3484  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3485  for( size_t i=ii; i<iend; ++i ) {
3486  for( size_t j=jj; j<jend; ++j ) {
3487  (~C)(i,j) = conj( (~C)(j,i) );
3488  }
3489  }
3490  }
3491  }
3492 }
3494 //*************************************************************************************************
3495 
3496 
3497 //*************************************************************************************************
3515 template< typename MT1, typename MT2, typename MT3, typename ST >
3516 void hmmm( DenseMatrix<MT1,true>& C, const MT2& A, const MT3& B, ST alpha )
3517 {
3518  using ET1 = ElementType_t<MT1>;
3519  using ET2 = ElementType_t<MT2>;
3520  using ET3 = ElementType_t<MT3>;
3521 
3526 
3529 
3532 
3535 
3536  const size_t M( A.rows() );
3537  const size_t N( B.columns() );
3538 
3539  BLAZE_INTERNAL_ASSERT( A.columns() == B.rows(), "Invalid matrix sizes detected" );
3540 
3541  ummm( C, A, B, alpha, ST(0) );
3542 
3543  for( size_t jj=0UL; jj<N; jj+=BLOCK_SIZE )
3544  {
3545  const size_t jend( min( N, jj+BLOCK_SIZE ) );
3546 
3547  for( size_t j=jj; j<jend; ++j ) {
3548  for( size_t i=jj+1UL; i<jend; ++i ) {
3549  (~C)(i,j) = conj( (~C)(j,i) );
3550  }
3551  }
3552 
3553  for( size_t ii=jj+BLOCK_SIZE; ii<M; ii+=BLOCK_SIZE ) {
3554  const size_t iend( min( M, ii+BLOCK_SIZE ) );
3555  for( size_t j=jj; j<jend; ++j ) {
3556  for( size_t i=ii; i<iend; ++i ) {
3557  (~C)(i,j) = conj( (~C)(j,i) );
3558  }
3559  }
3560  }
3561  }
3562 }
3564 //*************************************************************************************************
3565 
3566 
3567 //*************************************************************************************************
3583 template< typename MT1, typename MT2, typename MT3 >
3584 inline void hmmm( MT1& C, const MT2& A, const MT3& B )
3585 {
3586  using ET1 = ElementType_t<MT1>;
3587  using ET2 = ElementType_t<MT2>;
3588  using ET3 = ElementType_t<MT3>;
3589 
3592 
3593  hmmm( C, A, B, ET1(1) );
3594 }
3596 //*************************************************************************************************
3597 
3598 } // namespace blaze
3599 
3600 #endif
Header file for the implementation of the Submatrix view.
Constraint on the data type.
Header file for auxiliary alias declarations.
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for kernel specific block sizes.
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_STRICTLY_UPPER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a strictly upper triangular matrix type,...
Definition: StrictlyUpper.h:81
Header file for basic type definitions.
decltype(auto) submatrix(Matrix< MT, SO > &, RSAs...)
Creating a view on a specific submatrix of the given matrix.
Definition: Submatrix.h:178
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_COMPUTATION_TYPE(T)
Constraint on the data type. In case the given data type T is a computational expression (i....
Definition: Computation.h:81
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_NOT_BE_UNIUPPER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is an upper unitriangular matrix type,...
Definition: UniUpper.h:81
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
Constraint on the data type.
Constraints on the storage order of matrix types.
Constraint on the data type.
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_ADAPTOR_TYPE(T)
Constraint on the data type. In case the given data type T is an adaptor type (as for instance LowerMa...
Definition: Adaptor.h:81
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_STRICTLY_LOWER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a strictly lower triangular matrix type,...
Definition: StrictlyLower.h:81
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_SIMD_COMBINABLE_TYPES(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 are not SIMD combinable (i....
Definition: SIMDCombinable.h:61
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Compile time assertion.
Header file for the IsFloatingPoint type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2147
Header file for the DenseMatrix base class.
Header file for all SIMD functionality.
Header file for the IsLower type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_UPPER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is an upper triangular matrix type,...
Definition: Upper.h:81
Header file for the implementation of a dynamic MxN matrix.
Constraint on the data type.
Header file for the IsPadded type trait.
Header file for the isOne shim.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type,...
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for run time assertion macros.
Constraint on the data type.
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_LOWER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a lower triangular matrix type,...
Definition: Lower.h:81
bool isOne(const DiagonalProxy< MT > &proxy)
Returns whether the represented element is 1.
Definition: DiagonalProxy.h:697
Header file for the isDefault shim.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_UNILOWER_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a lower unitriangular matrix type,...
Definition: UniLower.h:81
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
Constraint on the data type.
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
bool isDefault(const DiagonalProxy< MT > &proxy)
Returns whether the represented element is in default state.
Definition: DiagonalProxy.h:635
#define BLAZE_CONSTRAINT_MUST_NOT_BE_HERMITIAN_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is an Hermitian matrix type,...
Definition: Hermitian.h:79
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
#define BLAZE_STATIC_ASSERT(expr)
Compile time assertion macro. In case of an invalid compile time expression, a compilation error is cr...
Definition: StaticAssert.h:112
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
Constraint on the data type.