35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 139 template<
typename MT1
145 class TDMatDMatMultExpr
146 :
public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
147 ,
private Computation
// Effective structure flags, derived from the SF/HF/LF/UF template flags.
// SYM: SF is set and none of HF, LF, UF is set.
172 SYM = ( SF && !( HF || LF || UF ) ),
// HERM: HF is set and neither LF nor UF is set.
173 HERM = ( HF && !( LF || UF ) ),
// LOW: LF is set, or UF is combined with SF or HF.
174 LOW = ( LF || ( ( SF || HF ) && UF ) ),
// UPP: UF is set, or LF is combined with SF or HF.
175 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time helper: `value` is true when either evaluateLeft or
// evaluateRight is set. The template parameters T1..T3 do not appear in the
// visible expression.
185 template<
typename T1,
typename T2,
typename T3 >
186 struct IsEvaluationRequired {
187 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseBlasKernel {
199 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
205 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
220 template<
typename T1,
typename T2,
typename T3 >
221 struct UseVectorizedDefaultKernel {
222 enum :
bool { value = useOptimizedKernels &&
224 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
282 MT1::simdEnabled && MT2::simdEnabled &&
// Compile-time flag: true only when neither operand requires evaluation
// (evaluateLeft and evaluateRight are both false) and both operand types
// MT1 and MT2 report smpAssignable.
287 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
288 !evaluateRight && MT2::smpAssignable };
343 :(
lhs_.columns() ) ) );
347 const size_t n(
end - begin );
367 if( i >=
lhs_.rows() ) {
370 if( j >=
rhs_.columns() ) {
382 inline size_t rows() const noexcept {
393 return rhs_.columns();
// Returns true when either operand reports being aliased with the given
// address. Note that this queries isAliased() (not canAlias()) on both
// operands.
423 template<
typename T >
424 inline bool canAlias(
const T* alias )
const noexcept {
425 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns true when the left-hand or the right-hand operand reports being
// aliased with the given address.
435 template<
typename T >
436 inline bool isAliased(
const T* alias )
const noexcept {
437 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// The expression counts as aligned only if both operands report isAligned().
447 return lhs_.isAligned() &&
rhs_.isAligned();
458 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
460 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
461 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
485 template<
typename MT
494 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
497 else if( rhs.lhs_.columns() == 0UL ) {
512 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for the assignment C = A * B.
// NOTE(review): the start of the condition and the else branch are not visible
// in this excerpt; only the size test below can be confirmed.
528 template<
typename MT3
531 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Size test: number of elements of the target compared to TDMATDMATMULT_THRESHOLD.
536 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
// Taken when the (partially visible) condition holds.
537 selectSmallAssignKernel( C, A, B );
// Presumably the else branch — confirm against the full source.
539 selectBlasAssignKernel( C, A, B );
558 template<
typename MT3
564 const size_t M( A.rows() );
565 const size_t N( B.columns() );
566 const size_t K( A.columns() );
570 for(
size_t i=0UL; i<M; ++i )
581 for(
size_t j=0UL; j<N; ++j ) {
590 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
591 :( UPP ?
max(i,kbegin) : kbegin ) )
592 :( UPP ? i : 0UL ) );
595 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
596 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
597 :( LOW ? i+1UL : N ) );
600 for(
size_t j=0UL; j<jbegin; ++j ) {
605 reset( (~C)(i,0UL) );
607 for(
size_t j=jbegin; j<jend; ++j ) {
608 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
611 for(
size_t j=jend; j<N; ++j ) {
616 reset( (~C)(i,N-1UL) );
620 for(
size_t k=kbegin+1UL; k<kend; ++k )
624 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
625 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
626 :( SYM || HERM || UPP ? i : 0UL ) );
629 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
630 :( LOW ?
min(i+1UL,k) : k ) )
631 :( LOW ? i+1UL : N ) );
633 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
636 for(
size_t j=jbegin; j<jend; ++j ) {
637 (~C)(i,j) += A(i,k) * B(k,j);
640 (~C)(i,jend) = A(i,k) * B(k,jend);
// Fills the strictly lower part of C (j < i) from the already computed
// transposed element C(j,i); for HERM the value is conjugated.
// NOTE(review): the guard around this loop is elided here — presumably it runs
// only for SYM or HERM results; confirm.
646 for(
size_t i=1UL; i<M; ++i ) {
647 for(
size_t j=0UL; j<i; ++j ) {
648 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
670 template<
typename MT3
676 const size_t M( A.rows() );
677 const size_t N( B.columns() );
678 const size_t K( A.columns() );
682 for(
size_t j=0UL; j<N; ++j )
693 for(
size_t i=0UL; i<M; ++i ) {
702 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
703 :( LOW ?
max(j,kbegin) : kbegin ) )
704 :( LOW ? j : 0UL ) );
707 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
708 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
709 :( UPP ? j+1UL : M ) );
712 for(
size_t i=0UL; i<ibegin; ++i ) {
717 reset( (~C)(0UL,j) );
719 for(
size_t i=ibegin; i<iend; ++i ) {
720 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
723 for(
size_t i=iend; i<M; ++i ) {
728 reset( (~C)(M-1UL,j) );
732 for(
size_t k=kbegin+1UL; k<kend; ++k )
736 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
737 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
738 :( SYM || HERM || LOW ? j : 0UL ) );
741 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
742 :( UPP ?
min(j+1UL,k) : k ) )
743 :( UPP ? j+1UL : M ) );
745 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
748 for(
size_t i=ibegin; i<iend; ++i ) {
749 (~C)(i,j) += A(i,k) * B(k,j);
752 (~C)(iend,j) = A(iend,k) * B(k,j);
// Fills the strictly upper part of C (i < j) from the already computed
// transposed element C(j,i); for HERM the value is conjugated.
// NOTE(review): the guard around this loop is elided here — presumably it runs
// only for SYM or HERM results; confirm.
758 for(
size_t j=1UL; j<N; ++j ) {
759 for(
size_t i=0UL; i<j; ++i ) {
760 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
782 template<
typename MT3
785 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
788 constexpr
size_t block( BLOCK_SIZE );
790 const size_t M( A.rows() );
791 const size_t N( B.columns() );
793 for(
size_t ii=0UL; ii<M; ii+=block ) {
794 const size_t iend(
min( M, ii+block ) );
795 for(
size_t jj=0UL; jj<N; jj+=block ) {
796 const size_t jend(
min( N, jj+block ) );
797 for(
size_t i=ii; i<iend; ++i )
807 for(
size_t j=jj; j<jbegin; ++j ) {
811 for(
size_t j=jbegin; j<jpos; ++j ) {
812 (~C)(i,j) = A(i,j) * B(j,j);
815 for(
size_t j=jpos; j<jend; ++j ) {
840 template<
typename MT3
843 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
846 const size_t M( A.rows() );
847 const size_t N( B.columns() );
849 for(
size_t j=0UL; j<N; ++j )
860 for(
size_t i=0UL; i<ibegin; ++i ) {
864 for(
size_t i=ibegin; i<iend; ++i ) {
865 (~C)(i,j) = A(i,j) * B(j,j);
868 for(
size_t i=iend; i<M; ++i ) {
891 template<
typename MT3
897 const size_t M( A.rows() );
898 const size_t N( B.columns() );
900 for(
size_t i=0UL; i<M; ++i )
911 for(
size_t j=0UL; j<jbegin; ++j ) {
915 for(
size_t j=jbegin; j<jend; ++j ) {
916 (~C)(i,j) = A(i,i) * B(i,j);
919 for(
size_t j=jend; j<N; ++j ) {
942 template<
typename MT3
948 constexpr
size_t block( BLOCK_SIZE );
950 const size_t M( A.rows() );
951 const size_t N( B.columns() );
953 for(
size_t jj=0UL; jj<N; jj+=block ) {
954 const size_t jend(
min( N, jj+block ) );
955 for(
size_t ii=0UL; ii<M; ii+=block ) {
956 const size_t iend(
min( M, ii+block ) );
957 for(
size_t j=jj; j<jend; ++j )
967 for(
size_t i=ii; i<ibegin; ++i ) {
971 for(
size_t i=ibegin; i<ipos; ++i ) {
972 (~C)(i,j) = A(i,i) * B(i,j);
975 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel variant that touches only the diagonal of C.
// NOTE(review): the enabling condition is elided — presumably this overload is
// selected when both A and B are diagonal; confirm.
1000 template<
typename MT3
1004 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Each diagonal element of C is the product of the matching diagonal
// elements of A and B; the loop runs over A.rows().
1008 for(
size_t i=0UL; i<A.rows(); ++i ) {
1009 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel variant that simply forwards to the default
// assignment kernel.
// NOTE(review): the return type / enabling condition is elided — presumably
// this is the non-vectorized fallback; confirm.
1029 template<
typename MT3
1033 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1035 selectDefaultAssignKernel( ~C, A, B );
1055 template<
typename MT3
1063 const size_t M( A.rows() );
1064 const size_t N( B.columns() );
1065 const size_t K( A.columns() );
1069 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1072 if( LOW && UPP && N > SIMDSIZE*3UL ) {
// Widest unrolling step: handles eight SIMD vectors of columns per iteration
// (j advances by SIMDSIZE*8). The loop condition disables this path whenever
// any of SYM, HERM, LOW or UPP is set.
1081 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1082 for(
size_t i=0UL; i<M; ++i )
// One accumulator per SIMD column chunk.
// NOTE(review): relies on default-constructed SIMDType being zero — confirm.
1095 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// kbegin/kend are computed in elided lines.
1097 for(
size_t k=kbegin; k<kend; ++k ) {
// set() presumably broadcasts the scalar A(i,k) to all lanes — confirm.
1098 const SIMDType a1(
set( A(i,k) ) );
// Multiply-accumulate with eight consecutive SIMD loads from row k of B.
1099 xmm1 += a1 * B.load(k,j );
1100 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1101 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1102 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1103 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1104 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1105 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1106 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
// Write the eight accumulated vectors to row i of C.
1109 (~C).store( i, j , xmm1 );
1110 (~C).store( i, j+SIMDSIZE , xmm2 );
1111 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1112 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1113 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1114 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1115 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1116 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1121 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1125 for( ; (i+2UL) <= M; i+=2UL )
1138 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1140 for(
size_t k=kbegin; k<kend; ++k ) {
1141 const SIMDType a1(
set( A(i ,k) ) );
1142 const SIMDType a2(
set( A(i+1UL,k) ) );
1144 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1145 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1146 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1147 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1160 (~C).store( i , j , xmm1 );
1161 (~C).store( i , j+SIMDSIZE , xmm2 );
1162 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1163 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1164 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
1165 (~C).store( i+1UL, j , xmm6 );
1166 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
1167 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1168 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1169 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1181 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1183 for(
size_t k=kbegin; k<kend; ++k ) {
1184 const SIMDType a1(
set( A(i,k) ) );
1185 xmm1 += a1 * B.load(k,j );
1186 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1187 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1188 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1189 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1192 (~C).store( i, j , xmm1 );
1193 (~C).store( i, j+SIMDSIZE , xmm2 );
1194 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1195 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1196 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1200 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1202 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
1203 size_t i( LOW ? j : 0UL );
1205 for( ; (i+2UL) <= iend; i+=2UL )
1218 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1220 for(
size_t k=kbegin; k<kend; ++k ) {
1221 const SIMDType a1(
set( A(i ,k) ) );
1222 const SIMDType a2(
set( A(i+1UL,k) ) );
1224 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1225 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1226 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1237 (~C).store( i , j , xmm1 );
1238 (~C).store( i , j+SIMDSIZE , xmm2 );
1239 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1240 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1241 (~C).store( i+1UL, j , xmm5 );
1242 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1243 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1244 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1258 for(
size_t k=kbegin; k<kend; ++k ) {
1259 const SIMDType a1(
set( A(i,k) ) );
1260 xmm1 += a1 * B.load(k,j );
1261 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1262 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1263 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1266 (~C).store( i, j , xmm1 );
1267 (~C).store( i, j+SIMDSIZE , xmm2 );
1268 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1269 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1273 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1275 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
1276 size_t i( LOW ? j : 0UL );
1278 for( ; (i+2UL) <= iend; i+=2UL )
1291 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1293 for(
size_t k=kbegin; k<kend; ++k ) {
1294 const SIMDType a1(
set( A(i ,k) ) );
1295 const SIMDType a2(
set( A(i+1UL,k) ) );
1297 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1298 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1307 (~C).store( i , j , xmm1 );
1308 (~C).store( i , j+SIMDSIZE , xmm2 );
1309 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1310 (~C).store( i+1UL, j , xmm4 );
1311 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1312 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1326 for(
size_t k=kbegin; k<kend; ++k ) {
1327 const SIMDType a1(
set( A(i,k) ) );
1328 xmm1 += a1 * B.load(k,j );
1329 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1330 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1333 (~C).store( i, j , xmm1 );
1334 (~C).store( i, j+SIMDSIZE , xmm2 );
1335 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1339 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1341 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
1342 size_t i( LOW ? j : 0UL );
1344 for( ; (i+4UL) <= iend; i+=4UL )
1357 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1359 for(
size_t k=kbegin; k<kend; ++k ) {
1360 const SIMDType a1(
set( A(i ,k) ) );
1361 const SIMDType a2(
set( A(i+1UL,k) ) );
1362 const SIMDType a3(
set( A(i+2UL,k) ) );
1363 const SIMDType a4(
set( A(i+3UL,k) ) );
1365 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1376 (~C).store( i , j , xmm1 );
1377 (~C).store( i , j+SIMDSIZE, xmm2 );
1378 (~C).store( i+1UL, j , xmm3 );
1379 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1380 (~C).store( i+2UL, j , xmm5 );
1381 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1382 (~C).store( i+3UL, j , xmm7 );
1383 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
1386 for( ; (i+3UL) <= iend; i+=3UL )
1399 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1401 for(
size_t k=kbegin; k<kend; ++k ) {
1402 const SIMDType a1(
set( A(i ,k) ) );
1403 const SIMDType a2(
set( A(i+1UL,k) ) );
1404 const SIMDType a3(
set( A(i+2UL,k) ) );
1406 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1415 (~C).store( i , j , xmm1 );
1416 (~C).store( i , j+SIMDSIZE, xmm2 );
1417 (~C).store( i+1UL, j , xmm3 );
1418 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1419 (~C).store( i+2UL, j , xmm5 );
1420 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1423 for( ; (i+2UL) <= iend; i+=2UL )
1436 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1439 for( ; (k+2UL) <= kend; k+=2UL ) {
1440 const SIMDType a1(
set( A(i ,k ) ) );
1441 const SIMDType a2(
set( A(i+1UL,k ) ) );
1442 const SIMDType a3(
set( A(i ,k+1UL) ) );
1443 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
1444 const SIMDType b1( B.load(k ,j ) );
1445 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1446 const SIMDType b3( B.load(k+1UL,j ) );
1447 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1458 for( ; k<kend; ++k ) {
1459 const SIMDType a1(
set( A(i ,k) ) );
1460 const SIMDType a2(
set( A(i+1UL,k) ) );
1462 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1469 (~C).store( i , j , xmm1+xmm5 );
1470 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
1471 (~C).store( i+1UL, j , xmm3+xmm7 );
1472 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1487 for( ; (k+2UL) <= kend; k+=2UL ) {
1488 const SIMDType a1(
set( A(i,k ) ) );
1489 const SIMDType a2(
set( A(i,k+1UL) ) );
1490 xmm1 += a1 * B.load(k ,j );
1491 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1492 xmm3 += a2 * B.load(k+1UL,j );
1493 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1496 for( ; k<kend; ++k ) {
1497 const SIMDType a1(
set( A(i,k) ) );
1498 xmm1 += a1 * B.load(k,j );
1499 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1502 (~C).store( i, j , xmm1+xmm3 );
1503 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
1507 for( ; j<jpos; j+=SIMDSIZE )
1509 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
1510 size_t i( LOW ? j : 0UL );
1512 for( ; (i+4UL) <= iend; i+=4UL )
1523 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1526 for( ; (k+2UL) <= kend; k+=2UL ) {
1528 const SIMDType b2( B.load(k+1UL,j) );
1529 xmm1 +=
set( A(i ,k ) ) * b1;
1530 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1531 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1532 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1533 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1534 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1535 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1536 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1539 for( ; k<kend; ++k ) {
1541 xmm1 +=
set( A(i ,k) ) * b1;
1542 xmm2 +=
set( A(i+1UL,k) ) * b1;
1543 xmm3 +=
set( A(i+2UL,k) ) * b1;
1544 xmm4 +=
set( A(i+3UL,k) ) * b1;
1547 (~C).store( i , j, xmm1+xmm5 );
1548 (~C).store( i+1UL, j, xmm2+xmm6 );
1549 (~C).store( i+2UL, j, xmm3+xmm7 );
1550 (~C).store( i+3UL, j, xmm4+xmm8 );
1553 for( ; (i+3UL) <= iend; i+=3UL )
1564 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1567 for( ; (k+2UL) <= kend; k+=2UL ) {
1569 const SIMDType b2( B.load(k+1UL,j) );
1570 xmm1 +=
set( A(i ,k ) ) * b1;
1571 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1572 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1573 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1574 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1575 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1578 for( ; k<kend; ++k ) {
1580 xmm1 +=
set( A(i ,k) ) * b1;
1581 xmm2 +=
set( A(i+1UL,k) ) * b1;
1582 xmm3 +=
set( A(i+2UL,k) ) * b1;
1585 (~C).store( i , j, xmm1+xmm4 );
1586 (~C).store( i+1UL, j, xmm2+xmm5 );
1587 (~C).store( i+2UL, j, xmm3+xmm6 );
1590 for( ; (i+2UL) <= iend; i+=2UL )
1604 for( ; (k+2UL) <= kend; k+=2UL ) {
1606 const SIMDType b2( B.load(k+1UL,j) );
1607 xmm1 +=
set( A(i ,k ) ) * b1;
1608 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1609 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1610 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1613 for( ; k<kend; ++k ) {
1615 xmm1 +=
set( A(i ,k) ) * b1;
1616 xmm2 +=
set( A(i+1UL,k) ) * b1;
1619 (~C).store( i , j, xmm1+xmm3 );
1620 (~C).store( i+1UL, j, xmm2+xmm4 );
1634 for( ; (k+2UL) <= K; k+=2UL ) {
1635 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1636 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1640 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1643 (~C).store( i, j, xmm1+xmm2 );
1647 for( ; remainder && j<N; ++j )
1649 size_t i( LOW && UPP ? j : 0UL );
1651 for( ; (i+2UL) <= M; i+=2UL )
1665 for(
size_t k=kbegin; k<kend; ++k ) {
1666 value1 += A(i ,k) * B(k,j);
1667 value2 += A(i+1UL,k) * B(k,j);
1670 (~C)(i ,j) = value1;
1671 (~C)(i+1UL,j) = value2;
1684 for(
size_t k=kbegin; k<K; ++k ) {
1685 value += A(i,k) * B(k,j);
// Post-processing for symmetric/Hermitian results with more than four SIMD
// vectors of columns: elements not produced by the block loops are filled in
// by mirroring.
1693 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1694 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
// jend is i rounded down to a multiple of SIMDSIZE*4.
1695 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1696 for(
size_t j=0UL; j<jend; ++j ) {
// Copy the transposed element, conjugated in the Hermitian case.
1697 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1701 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1702 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1703 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1704 for(
size_t i=0UL; i<iend; ++i ) {
1709 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1710 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1711 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1712 for(
size_t j=0UL; j<jend; ++j ) {
1736 template<
typename MT3
1744 const size_t M( A.rows() );
1745 const size_t N( B.columns() );
1746 const size_t K( A.columns() );
1750 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1753 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Widest unrolling step of the column-wise variant: handles eight SIMD vectors
// of rows per iteration (i advances by SIMDSIZE*8). The loop condition
// disables this path whenever any of SYM, HERM, LOW or UPP is set.
1762 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1763 for(
size_t j=0UL; j<N; ++j )
// One accumulator per SIMD row chunk.
// NOTE(review): relies on default-constructed SIMDType being zero — confirm.
1776 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// kbegin/kend are computed in elided lines.
1778 for(
size_t k=kbegin; k<kend; ++k ) {
// set() presumably broadcasts the scalar B(k,j) to all lanes — confirm.
1779 const SIMDType b1(
set( B(k,j) ) );
// Multiply-accumulate with eight consecutive SIMD loads from column k of A.
1780 xmm1 += A.load(i ,k) * b1;
1781 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1782 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1783 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1784 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1785 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1786 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1787 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Write the eight accumulated vectors to column j of C.
1790 (~C).store( i , j, xmm1 );
1791 (~C).store( i+SIMDSIZE , j, xmm2 );
1792 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1793 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1794 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1795 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1796 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1797 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1802 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1806 for( ; (j+2UL) <= N; j+=2UL )
1819 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1821 for(
size_t k=kbegin; k<kend; ++k ) {
1823 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1824 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1825 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1826 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1827 const SIMDType b1(
set( B(k,j ) ) );
1828 const SIMDType b2(
set( B(k,j+1UL) ) );
1841 (~C).store( i , j , xmm1 );
1842 (~C).store( i+SIMDSIZE , j , xmm2 );
1843 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1844 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1845 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1846 (~C).store( i , j+1UL, xmm6 );
1847 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1848 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1849 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1850 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1862 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1864 for(
size_t k=kbegin; k<kend; ++k ) {
1865 const SIMDType b1(
set( B(k,j) ) );
1866 xmm1 += A.load(i ,k) * b1;
1867 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1868 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1869 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1870 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1873 (~C).store( i , j, xmm1 );
1874 (~C).store( i+SIMDSIZE , j, xmm2 );
1875 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1876 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1877 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1881 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1883 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
1884 size_t j( UPP ? i : 0UL );
1886 for( ; (j+2UL) <= jend; j+=2UL )
1899 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1901 for(
size_t k=kbegin; k<kend; ++k ) {
1903 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1904 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1905 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1906 const SIMDType b1(
set( B(k,j ) ) );
1907 const SIMDType b2(
set( B(k,j+1UL) ) );
1918 (~C).store( i , j , xmm1 );
1919 (~C).store( i+SIMDSIZE , j , xmm2 );
1920 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1921 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1922 (~C).store( i , j+1UL, xmm5 );
1923 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1924 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1925 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1939 for(
size_t k=kbegin; k<kend; ++k ) {
1940 const SIMDType b1(
set( B(k,j) ) );
1941 xmm1 += A.load(i ,k) * b1;
1942 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1943 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1944 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1947 (~C).store( i , j, xmm1 );
1948 (~C).store( i+SIMDSIZE , j, xmm2 );
1949 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1950 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1954 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1956 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
1957 size_t j( UPP ? i : 0UL );
1959 for( ; (j+2UL) <= jend; j+=2UL )
1972 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1974 for(
size_t k=kbegin; k<kend; ++k ) {
1976 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1977 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1978 const SIMDType b1(
set( B(k,j ) ) );
1979 const SIMDType b2(
set( B(k,j+1UL) ) );
1988 (~C).store( i , j , xmm1 );
1989 (~C).store( i+SIMDSIZE , j , xmm2 );
1990 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1991 (~C).store( i , j+1UL, xmm4 );
1992 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1993 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2007 for(
size_t k=kbegin; k<kend; ++k ) {
2008 const SIMDType b1(
set( B(k,j) ) );
2009 xmm1 += A.load(i ,k) * b1;
2010 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2011 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2014 (~C).store( i , j, xmm1 );
2015 (~C).store( i+SIMDSIZE , j, xmm2 );
2016 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2020 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2022 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
2023 size_t j( UPP ? i : 0UL );
2025 for( ; (j+4UL) <= jend; j+=4UL )
2038 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2040 for(
size_t k=kbegin; k<kend; ++k ) {
2042 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2043 const SIMDType b1(
set( B(k,j ) ) );
2044 const SIMDType b2(
set( B(k,j+1UL) ) );
2045 const SIMDType b3(
set( B(k,j+2UL) ) );
2046 const SIMDType b4(
set( B(k,j+3UL) ) );
2057 (~C).store( i , j , xmm1 );
2058 (~C).store( i+SIMDSIZE, j , xmm2 );
2059 (~C).store( i , j+1UL, xmm3 );
2060 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2061 (~C).store( i , j+2UL, xmm5 );
2062 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2063 (~C).store( i , j+3UL, xmm7 );
2064 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2067 for( ; (j+3UL) <= jend; j+=3UL )
2080 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2082 for(
size_t k=kbegin; k<kend; ++k ) {
2084 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2085 const SIMDType b1(
set( B(k,j ) ) );
2086 const SIMDType b2(
set( B(k,j+1UL) ) );
2087 const SIMDType b3(
set( B(k,j+2UL) ) );
2096 (~C).store( i , j , xmm1 );
2097 (~C).store( i+SIMDSIZE, j , xmm2 );
2098 (~C).store( i , j+1UL, xmm3 );
2099 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2100 (~C).store( i , j+2UL, xmm5 );
2101 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2104 for( ; (j+2UL) <= jend; j+=2UL )
2117 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2120 for( ; (k+2UL) <= kend; k+=2UL ) {
2121 const SIMDType a1( A.load(i ,k ) );
2122 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2123 const SIMDType a3( A.load(i ,k+1UL) );
2124 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2125 const SIMDType b1(
set( B(k ,j ) ) );
2126 const SIMDType b2(
set( B(k ,j+1UL) ) );
2127 const SIMDType b3(
set( B(k+1UL,j ) ) );
2128 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
2139 for( ; k<kend; ++k ) {
2141 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2142 const SIMDType b1(
set( B(k,j ) ) );
2143 const SIMDType b2(
set( B(k,j+1UL) ) );
2150 (~C).store( i , j , xmm1+xmm5 );
2151 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2152 (~C).store( i , j+1UL, xmm3+xmm7 );
2153 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2168 for( ; (k+2UL) <= kend; k+=2UL ) {
2169 const SIMDType b1(
set( B(k ,j) ) );
2170 const SIMDType b2(
set( B(k+1UL,j) ) );
2171 xmm1 += A.load(i ,k ) * b1;
2172 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2173 xmm3 += A.load(i ,k+1UL) * b2;
2174 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2177 for( ; k<kend; ++k ) {
2178 const SIMDType b1(
set( B(k,j) ) );
2179 xmm1 += A.load(i ,k) * b1;
2180 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2183 (~C).store( i , j, xmm1+xmm3 );
2184 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2188 for( ; i<ipos; i+=SIMDSIZE )
2190 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
2191 size_t j( UPP ? i : 0UL );
2193 for( ; (j+4UL) <= jend; j+=4UL )
2204 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2207 for( ; (k+2UL) <= kend; k+=2UL ) {
2209 const SIMDType a2( A.load(i,k+1UL) );
2210 xmm1 += a1 *
set( B(k ,j ) );
2211 xmm2 += a1 *
set( B(k ,j+1UL) );
2212 xmm3 += a1 *
set( B(k ,j+2UL) );
2213 xmm4 += a1 *
set( B(k ,j+3UL) );
2214 xmm5 += a2 *
set( B(k+1UL,j ) );
2215 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2216 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2217 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2220 for( ; k<kend; ++k ) {
2222 xmm1 += a1 *
set( B(k,j ) );
2223 xmm2 += a1 *
set( B(k,j+1UL) );
2224 xmm3 += a1 *
set( B(k,j+2UL) );
2225 xmm4 += a1 *
set( B(k,j+3UL) );
2228 (~C).store( i, j , xmm1+xmm5 );
2229 (~C).store( i, j+1UL, xmm2+xmm6 );
2230 (~C).store( i, j+2UL, xmm3+xmm7 );
2231 (~C).store( i, j+3UL, xmm4+xmm8 );
2234 for( ; (j+3UL) <= jend; j+=3UL )
2245 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2248 for( ; (k+2UL) <= kend; k+=2UL ) {
2250 const SIMDType a2( A.load(i,k+1UL) );
2251 xmm1 += a1 *
set( B(k ,j ) );
2252 xmm2 += a1 *
set( B(k ,j+1UL) );
2253 xmm3 += a1 *
set( B(k ,j+2UL) );
2254 xmm4 += a2 *
set( B(k+1UL,j ) );
2255 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2256 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2259 for( ; k<kend; ++k ) {
2261 xmm1 += a1 *
set( B(k,j ) );
2262 xmm2 += a1 *
set( B(k,j+1UL) );
2263 xmm3 += a1 *
set( B(k,j+2UL) );
2266 (~C).store( i, j , xmm1+xmm4 );
2267 (~C).store( i, j+1UL, xmm2+xmm5 );
2268 (~C).store( i, j+2UL, xmm3+xmm6 );
2271 for( ; (j+2UL) <= jend; j+=2UL )
2285 for( ; (k+2UL) <= kend; k+=2UL ) {
2287 const SIMDType a2( A.load(i,k+1UL) );
2288 xmm1 += a1 *
set( B(k ,j ) );
2289 xmm2 += a1 *
set( B(k ,j+1UL) );
2290 xmm3 += a2 *
set( B(k+1UL,j ) );
2291 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2294 for( ; k<kend; ++k ) {
2296 xmm1 += a1 *
set( B(k,j ) );
2297 xmm2 += a1 *
set( B(k,j+1UL) );
2300 (~C).store( i, j , xmm1+xmm3 );
2301 (~C).store( i, j+1UL, xmm2+xmm4 );
2315 for( ; (k+2UL) <= K; k+=2UL ) {
2316 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2317 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2321 xmm1 += A.load(i,k) *
set( B(k,j) );
2324 (~C).store( i, j, xmm1+xmm2 );
2328 for( ; remainder && i<M; ++i )
2330 size_t j( LOW && UPP ? i : 0UL );
2332 for( ; (j+2UL) <= N; j+=2UL )
2346 for(
size_t k=kbegin; k<kend; ++k ) {
2347 value1 += A(i,k) * B(k,j );
2348 value2 += A(i,k) * B(k,j+1UL);
2351 (~C)(i,j ) = value1;
2352 (~C)(i,j+1UL) = value2;
2365 for(
size_t k=kbegin; k<K; ++k ) {
2366 value += A(i,k) * B(k,j);
// Post-processing for symmetric/Hermitian results with more than four SIMD
// vectors of rows: elements not produced by the block loops are filled in by
// mirroring.
2374 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
2375 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
// iend is j rounded down to a multiple of SIMDSIZE*4.
2376 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2377 for(
size_t i=0UL; i<iend; ++i ) {
// Copy the transposed element, conjugated in the Hermitian case.
2378 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
2382 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
2383 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2384 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2385 for(
size_t i=0UL; i<iend; ++i ) {
2390 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2391 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2392 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2393 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel variant that simply forwards to the default
// assignment kernel.
// NOTE(review): the return type / enabling condition is elided — presumably
// this is the non-vectorized fallback; confirm.
2416 template<
typename MT3
2420 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2422 selectDefaultAssignKernel( C, A, B );
2442 template<
typename MT3
2446 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS assignment kernel variant that forwards to the large-matrix kernel.
// NOTE(review): the enabling condition is elided — presumably this overload is
// used when the BLAS kernel is not applicable; confirm.
2476 template<
typename MT3
2480 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2482 selectLargeAssignKernel( C, A, B );
// BLAS-backed assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
2488 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2502 template<
typename MT3
2506 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Delegates to gemm with the scalars ET(1) and ET(0).
// NOTE(review): presumably alpha=1 and beta=0, so C is overwritten with A*B —
// confirm against gemm's parameter order.
2519 gemm( C, A, B, ET(1), ET(0) );
// Assignment path that goes through a temporary: the expression is evaluated
// via serial() into a TmpType object, and fwd(tmp) is then assigned to the
// target.
// NOTE(review): the function signature and the definition of ForwardFunctor
// are elided in this excerpt.
2539 template<
typename MT
2557 const ForwardFunctor fwd;
2559 const TmpType tmp(
serial( rhs ) );
2560 assign( ~lhs, fwd( tmp ) );
2578 template<
typename MT
2587 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2601 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for the addition assignment C += A * B.
// NOTE(review): the start of the condition and the else branch are not visible
// in this excerpt; only the size test below can be confirmed.
2617 template<
typename MT3
2620 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Size test: number of elements of the target compared to TDMATDMATMULT_THRESHOLD.
2625 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
// Taken when the (partially visible) condition holds.
2626 selectSmallAddAssignKernel( C, A, B );
// Presumably the else branch — confirm against the full source.
2628 selectBlasAddAssignKernel( C, A, B );
2647 template<
typename MT3
2653 const size_t M( A.rows() );
2654 const size_t N( B.columns() );
2655 const size_t K( A.columns() );
2659 for(
size_t i=0UL; i<M; ++i )
2669 for(
size_t k=kbegin; k<kend; ++k )
2673 ?( UPP ?
max(i,k+1UL) : k+1UL )
2674 :( UPP ?
max(i,k) : k ) )
2675 :( UPP ? i : 0UL ) );
2678 ?( LOW ?
min(i+1UL,k) : k )
2679 :( LOW ?
min(i,k)+1UL : k+1UL ) )
2680 :( LOW ? i+1UL : N ) );
2682 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
// Two-way unrolled update of row i for a fixed k.
// jnum is the number of columns in [jbegin, jend); jpos is jbegin plus jnum
// rounded down to an even count (the mask size_t(-2) clears the lowest bit).
2685 const size_t jnum( jend - jbegin );
2686 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Main loop: two columns per iteration.
2688 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2689 (~C)(i,j ) += A(i,k) * B(k,j );
2690 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Single trailing column at jpos.
// NOTE(review): the guard is elided — presumably executed only when jpos < jend; confirm.
2693 (~C)(i,jpos) += A(i,k) * B(k,jpos);
2715 template<
typename MT3
2721 const size_t M( A.rows() );
2722 const size_t N( B.columns() );
2723 const size_t K( A.columns() );
2727 for(
size_t j=0UL; j<N; ++j )
2737 for(
size_t k=kbegin; k<kend; ++k )
2741 ?( LOW ?
max(j,k+1UL) : k+1UL )
2742 :( LOW ?
max(j,k) : k ) )
2743 :( LOW ? j : 0UL ) );
2746 ?( UPP ?
min(j+1UL,k) : k )
2747 :( UPP ?
min(j,k)+1UL : k+1UL ) )
2748 :( UPP ? j+1UL : M ) );
2750 if( ( LOW || UPP ) && ibegin >= iend )
continue;
// Two-way unrolled update of column j for a fixed k.
// inum is the number of rows in [ibegin, iend); ipos is ibegin plus inum
// rounded down to an even count (the mask size_t(-2) clears the lowest bit).
2753 const size_t inum( iend - ibegin );
2754 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Main loop: two rows per iteration.
2756 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2757 (~C)(i ,j) += A(i ,k) * B(k,j);
2758 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single trailing row at ipos.
// NOTE(review): the guard is elided — presumably executed only when ipos < iend; confirm.
2761 (~C)(ipos,j) += A(ipos,k) * B(k,j);
2783 template<
typename MT3
2786 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2789 constexpr
size_t block( BLOCK_SIZE );
2791 const size_t M( A.rows() );
2792 const size_t N( B.columns() );
2794 for(
size_t ii=0UL; ii<M; ii+=block ) {
2795 const size_t iend(
min( M, ii+block ) );
2796 for(
size_t jj=0UL; jj<N; jj+=block ) {
2797 const size_t jend(
min( N, jj+block ) );
2798 for(
size_t i=ii; i<iend; ++i )
2807 for(
size_t j=jbegin; j<jpos; ++j ) {
2808 (~C)(i,j) += A(i,j) * B(j,j);
2831 template<
typename MT3
2834 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2837 const size_t M( A.rows() );
2838 const size_t N( B.columns() );
2840 for(
size_t j=0UL; j<N; ++j )
2850 const size_t inum( iend - ibegin );
2851 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2853 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2854 (~C)(i ,j) += A(i ,j) * B(j,j);
2855 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2858 (~C)(ipos,j) += A(ipos,j) * B(j,j);
2879 template<
typename MT3
2885 const size_t M( A.rows() );
2886 const size_t N( B.columns() );
2888 for(
size_t i=0UL; i<M; ++i )
2898 const size_t jnum( jend - jbegin );
2899 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2901 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2902 (~C)(i,j ) += A(i,i) * B(i,j );
2903 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2906 (~C)(i,jpos) += A(i,i) * B(i,jpos);
2927 template<
typename MT3
2933 constexpr
size_t block( BLOCK_SIZE );
2935 const size_t M( A.rows() );
2936 const size_t N( B.columns() );
2938 for(
size_t jj=0UL; jj<N; jj+=block ) {
2939 const size_t jend(
min( N, jj+block ) );
2940 for(
size_t ii=0UL; ii<M; ii+=block ) {
2941 const size_t iend(
min( M, ii+block ) );
2942 for(
size_t j=jj; j<jend; ++j )
2951 for(
size_t i=ibegin; i<ipos; ++i ) {
2952 (~C)(i,j) += A(i,i) * B(i,j);
2975 template<
typename MT3
2979 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2981 for(
size_t i=0UL; i<A.rows(); ++i ) {
2982 C(i,i) += A(i,i) * B(i,i);
// selectSmallAddAssignKernel overload that simply forwards C, A and B to
// selectDefaultAddAssignKernel. The return type / enable condition is not part
// of this excerpt.
3002 template<
typename MT3
3006 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3008 selectDefaultAddAssignKernel( C, A, B );
// Vectorized add-assignment kernel overload (its name line is not part of this
// excerpt; presumably the SIMD variant of selectSmallAddAssignKernel - TODO confirm).
// Pattern used throughout: SIMD accumulators are loaded from C, updated over k with
// set( A(i,k) ) * B.load(k,j+...), and stored back to C. The column index j is
// advanced in multiples of SIMDSIZE; panels become narrower from section to section.
// kbegin/kend and several accumulator declarations live on lines not shown here.
3028 template<
typename MT3
3036 const size_t M( A.rows() );
3037 const size_t N( B.columns() );
3038 const size_t K( A.columns() );
// jpos is N when `remainder` is false, otherwise N & size_t(-SIMDSIZE)
// (N rounded down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
3042 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Panels of 8 SIMD vectors per row; skipped entirely when LOW or UPP is set.
3049 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3050 for(
size_t i=0UL; i<M; ++i )
3064 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3065 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3066 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3067 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3068 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
3069 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
3070 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
3072 for(
size_t k=kbegin; k<kend; ++k ) {
3073 const SIMDType a1(
set( A(i,k) ) );
3074 xmm1 += a1 * B.load(k,j );
3075 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3076 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3077 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3078 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3079 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3080 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3081 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3084 (~C).store( i, j , xmm1 );
3085 (~C).store( i, j+SIMDSIZE , xmm2 );
3086 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3087 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3088 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3089 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
3090 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
3091 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Panels of 5 SIMD vectors; rows are taken in pairs first, then a single-row form
// follows. Skipped when LOW or UPP is set.
3096 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3100 for( ; (i+2UL) <= M; i+=2UL )
3113 SIMDType xmm1 ( (~C).load(i ,j ) );
3114 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
3115 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
3116 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
3117 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
3118 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
3119 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
3120 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3121 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3122 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
3124 for(
size_t k=kbegin; k<kend; ++k ) {
3125 const SIMDType a1(
set( A(i ,k) ) );
3126 const SIMDType a2(
set( A(i+1UL,k) ) );
3128 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3129 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3130 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3131 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3144 (~C).store( i , j , xmm1 );
3145 (~C).store( i , j+SIMDSIZE , xmm2 );
3146 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3147 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3148 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
3149 (~C).store( i+1UL, j , xmm6 );
3150 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
3151 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3152 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3153 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Single-row form of the 5-vector panel.
3166 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3167 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3168 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3169 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3171 for(
size_t k=kbegin; k<kend; ++k ) {
3172 const SIMDType a1(
set( A(i,k) ) );
3173 xmm1 += a1 * B.load(k,j );
3174 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3175 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3176 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3177 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3180 (~C).store( i, j , xmm1 );
3181 (~C).store( i, j+SIMDSIZE , xmm2 );
3182 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3183 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3184 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// Panels of 4 SIMD vectors; rows in pairs, then a single-row form.
// Skipped when LOW or UPP is set.
3188 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3192 for( ; (i+2UL) <= M; i+=2UL )
3206 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3207 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3208 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3209 SIMDType xmm5( (~C).load(i+1UL,j ) );
3210 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3211 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3212 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3214 for(
size_t k=kbegin; k<kend; ++k ) {
3215 const SIMDType a1(
set( A(i ,k) ) );
3216 const SIMDType a2(
set( A(i+1UL,k) ) );
3218 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3219 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3220 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3231 (~C).store( i , j , xmm1 );
3232 (~C).store( i , j+SIMDSIZE , xmm2 );
3233 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3234 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3235 (~C).store( i+1UL, j , xmm5 );
3236 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3237 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3238 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row form of the 4-vector panel.
3251 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3252 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3253 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3255 for(
size_t k=kbegin; k<kend; ++k ) {
3256 const SIMDType a1(
set( A(i,k) ) );
3257 xmm1 += a1 * B.load(k,j );
3258 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3259 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3260 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3263 (~C).store( i, j , xmm1 );
3264 (~C).store( i, j+SIMDSIZE , xmm2 );
3265 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3266 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Panels of 3 SIMD vectors; rows in pairs, then a single-row form.
// Skipped when LOW or UPP is set.
3270 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3274 for( ; (i+2UL) <= M; i+=2UL )
3288 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3289 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3290 SIMDType xmm4( (~C).load(i+1UL,j ) );
3291 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3292 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3294 for(
size_t k=kbegin; k<kend; ++k ) {
3295 const SIMDType a1(
set( A(i ,k) ) );
3296 const SIMDType a2(
set( A(i+1UL,k) ) );
3298 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3299 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3308 (~C).store( i , j , xmm1 );
3309 (~C).store( i , j+SIMDSIZE , xmm2 );
3310 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3311 (~C).store( i+1UL, j , xmm4 );
3312 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3313 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Single-row form of the 3-vector panel.
3326 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3327 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3329 for(
size_t k=kbegin; k<kend; ++k ) {
3330 const SIMDType a1(
set( A(i,k) ) );
3331 xmm1 += a1 * B.load(k,j );
3332 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3333 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3336 (~C).store( i, j , xmm1 );
3337 (~C).store( i, j+SIMDSIZE , xmm2 );
3338 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// Panels of 2 SIMD vectors; skipped only when both LOW and UPP are set.
// With UPP the row range ends at min(j+SIMDSIZE*2,M); with LOW it starts at row j.
// Rows are consumed in groups of 4, 3, 2 and finally one.
3342 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3344 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
3345 size_t i( LOW ? j : 0UL );
3347 for( ; (i+4UL) <= iend; i+=4UL )
3361 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3362 SIMDType xmm3( (~C).load(i+1UL,j ) );
3363 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3364 SIMDType xmm5( (~C).load(i+2UL,j ) );
3365 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3366 SIMDType xmm7( (~C).load(i+3UL,j ) );
3367 SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
3369 for(
size_t k=kbegin; k<kend; ++k ) {
3370 const SIMDType a1(
set( A(i ,k) ) );
3371 const SIMDType a2(
set( A(i+1UL,k) ) );
3372 const SIMDType a3(
set( A(i+2UL,k) ) );
3373 const SIMDType a4(
set( A(i+3UL,k) ) );
3375 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3386 (~C).store( i , j , xmm1 );
3387 (~C).store( i , j+SIMDSIZE, xmm2 );
3388 (~C).store( i+1UL, j , xmm3 );
3389 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3390 (~C).store( i+2UL, j , xmm5 );
3391 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3392 (~C).store( i+3UL, j , xmm7 );
3393 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
3396 for( ; (i+3UL) <= iend; i+=3UL )
3410 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3411 SIMDType xmm3( (~C).load(i+1UL,j ) );
3412 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3413 SIMDType xmm5( (~C).load(i+2UL,j ) );
3414 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3416 for(
size_t k=kbegin; k<kend; ++k ) {
3417 const SIMDType a1(
set( A(i ,k) ) );
3418 const SIMDType a2(
set( A(i+1UL,k) ) );
3419 const SIMDType a3(
set( A(i+2UL,k) ) );
3421 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3430 (~C).store( i , j , xmm1 );
3431 (~C).store( i , j+SIMDSIZE, xmm2 );
3432 (~C).store( i+1UL, j , xmm3 );
3433 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3434 (~C).store( i+2UL, j , xmm5 );
3435 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3438 for( ; (i+2UL) <= iend; i+=2UL )
3452 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3453 SIMDType xmm3( (~C).load(i+1UL,j ) );
3454 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
// k is advanced two steps at a time here; the k<kend loop below handles what is left.
3458 for( ; (k+2UL) <= kend; k+=2UL ) {
3459 const SIMDType a1(
set( A(i ,k ) ) );
3460 const SIMDType a2(
set( A(i+1UL,k ) ) );
3461 const SIMDType a3(
set( A(i ,k+1UL) ) );
3462 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
3463 const SIMDType b1( B.load(k ,j ) );
3464 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3465 const SIMDType b3( B.load(k+1UL,j ) );
3466 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3477 for( ; k<kend; ++k ) {
3478 const SIMDType a1(
set( A(i ,k) ) );
3479 const SIMDType a2(
set( A(i+1UL,k) ) );
3481 const SIMDType b2( B.load(k,j+SIMDSIZE) );
// Two accumulator sets are combined at store time.
3488 (~C).store( i , j , xmm1+xmm5 );
3489 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
3490 (~C).store( i+1UL, j , xmm3+xmm7 );
3491 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
// Single-row form of the 2-vector panel.
3504 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3508 for( ; (k+2UL) <= kend; k+=2UL ) {
3509 const SIMDType a1(
set( A(i,k ) ) );
3510 const SIMDType a2(
set( A(i,k+1UL) ) );
3511 xmm1 += a1 * B.load(k ,j );
3512 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
3513 xmm3 += a2 * B.load(k+1UL,j );
3514 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
3517 for( ; k<kend; ++k ) {
3518 const SIMDType a1(
set( A(i,k) ) );
3519 xmm1 += a1 * B.load(k,j );
3520 xmm2 += a1 * B.load(k,j+SIMDSIZE);
3523 (~C).store( i, j , xmm1+xmm3 );
3524 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
// One SIMD vector per step up to jpos. The row range is clipped to
// min(j+SIMDSIZE,M) only when both LOW and UPP are set; with LOW it starts at row j.
// Rows are consumed in groups of 4, 3, 2 and finally one.
3528 for( ; j<jpos; j+=SIMDSIZE )
3530 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
3531 size_t i( LOW ? j : 0UL );
3533 for( ; (i+4UL) <= iend; i+=4UL )
3545 SIMDType xmm2( (~C).load(i+1UL,j) );
3546 SIMDType xmm3( (~C).load(i+2UL,j) );
3547 SIMDType xmm4( (~C).load(i+3UL,j) );
3551 for( ; (k+2UL) <= kend; k+=2UL ) {
3553 const SIMDType b2( B.load(k+1UL,j) );
3554 xmm1 +=
set( A(i ,k ) ) * b1;
3555 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3556 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3557 xmm4 +=
set( A(i+3UL,k ) ) * b1;
3558 xmm5 +=
set( A(i ,k+1UL) ) * b2;
3559 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
3560 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
3561 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
3564 for( ; k<kend; ++k ) {
3566 xmm1 +=
set( A(i ,k) ) * b1;
3567 xmm2 +=
set( A(i+1UL,k) ) * b1;
3568 xmm3 +=
set( A(i+2UL,k) ) * b1;
3569 xmm4 +=
set( A(i+3UL,k) ) * b1;
3572 (~C).store( i , j, xmm1+xmm5 );
3573 (~C).store( i+1UL, j, xmm2+xmm6 );
3574 (~C).store( i+2UL, j, xmm3+xmm7 );
3575 (~C).store( i+3UL, j, xmm4+xmm8 );
3578 for( ; (i+3UL) <= iend; i+=3UL )
3590 SIMDType xmm2( (~C).load(i+1UL,j) );
3591 SIMDType xmm3( (~C).load(i+2UL,j) );
3595 for( ; (k+2UL) <= kend; k+=2UL ) {
3597 const SIMDType b2( B.load(k+1UL,j) );
3598 xmm1 +=
set( A(i ,k ) ) * b1;
3599 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3600 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3601 xmm4 +=
set( A(i ,k+1UL) ) * b2;
3602 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
3603 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
3606 for( ; k<kend; ++k ) {
3608 xmm1 +=
set( A(i ,k) ) * b1;
3609 xmm2 +=
set( A(i+1UL,k) ) * b1;
3610 xmm3 +=
set( A(i+2UL,k) ) * b1;
3613 (~C).store( i , j, xmm1+xmm4 );
3614 (~C).store( i+1UL, j, xmm2+xmm5 );
3615 (~C).store( i+2UL, j, xmm3+xmm6 );
3618 for( ; (i+2UL) <= iend; i+=2UL )
3630 SIMDType xmm2( (~C).load(i+1UL,j) );
3634 for( ; (k+2UL) <= kend; k+=2UL ) {
3636 const SIMDType b2( B.load(k+1UL,j) );
3637 xmm1 +=
set( A(i ,k ) ) * b1;
3638 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3639 xmm3 +=
set( A(i ,k+1UL) ) * b2;
3640 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
3643 for( ; k<kend; ++k ) {
3645 xmm1 +=
set( A(i ,k) ) * b1;
3646 xmm2 +=
set( A(i+1UL,k) ) * b1;
3649 (~C).store( i , j, xmm1+xmm3 );
3650 (~C).store( i+1UL, j, xmm2+xmm4 );
// Single-row, single-vector form; note that this k loop is bounded by K rather than kend.
3665 for( ; (k+2UL) <= K; k+=2UL ) {
3666 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
3667 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
3671 xmm1 +=
set( A(i,k) ) * B.load(k,j);
3674 (~C).store( i, j, xmm1+xmm2 );
// Scalar handling of the columns left after jpos; runs only when `remainder` is set.
// With UPP the row range ends at j+1; with LOW it starts at row j.
3678 for( ; remainder && j<N; ++j )
3680 const size_t iend( UPP ? j+1UL : M );
3681 size_t i( LOW ? j : 0UL );
3683 for( ; (i+2UL) <= iend; i+=2UL )
3697 for(
size_t k=kbegin; k<kend; ++k ) {
3698 value1 += A(i ,k) * B(k,j);
3699 value2 += A(i+1UL,k) * B(k,j);
// value1/value2 are declared on lines not shown; presumably seeded from C so that
// the plain assignment still accumulates - TODO confirm.
3702 (~C)(i ,j) = value1;
3703 (~C)(i+1UL,j) = value2;
3716 for(
size_t k=kbegin; k<K; ++k ) {
3717 value += A(i,k) * B(k,j);
// Vectorized add-assignment kernel overload (its name line is not part of this
// excerpt; presumably the SIMD variant of selectSmallAddAssignKernel for the other
// storage order - TODO confirm).
// Pattern used throughout: SIMD accumulators are loaded from C, updated over k with
// A.load(i+...,k) * set( B(k,j) ), and stored back to C. The row index i is advanced
// in multiples of SIMDSIZE; panels become narrower from section to section.
// kbegin/kend and several accumulator declarations live on lines not shown here.
3742 template<
typename MT3
3750 const size_t M( A.rows() );
3751 const size_t N( B.columns() );
3752 const size_t K( A.columns() );
// ipos is M when `remainder` is false, otherwise M & size_t(-SIMDSIZE)
// (M rounded down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
3756 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Panels of 8 SIMD vectors per column; skipped entirely when LOW or UPP is set.
3763 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3764 for(
size_t j=0UL; j<N; ++j )
3778 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3779 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3780 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3781 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3782 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3783 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3784 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3786 for(
size_t k=kbegin; k<kend; ++k ) {
3787 const SIMDType b1(
set( B(k,j) ) );
3788 xmm1 += A.load(i ,k) * b1;
3789 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3790 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3791 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3792 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3793 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3794 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3795 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3798 (~C).store( i , j, xmm1 );
3799 (~C).store( i+SIMDSIZE , j, xmm2 );
3800 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3801 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3802 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3803 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3804 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3805 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Panels of 5 SIMD vectors; columns are taken in pairs first, then a single-column
// form follows. Skipped when LOW or UPP is set.
3810 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3814 for( ; (j+2UL) <= N; j+=2UL )
3827 SIMDType xmm1 ( (~C).load(i ,j ) );
3828 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3829 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3830 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3831 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3832 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3833 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3834 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3835 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3836 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3838 for(
size_t k=kbegin; k<kend; ++k ) {
3840 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3841 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3842 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3843 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3844 const SIMDType b1(
set( B(k,j ) ) );
3845 const SIMDType b2(
set( B(k,j+1UL) ) );
3858 (~C).store( i , j , xmm1 );
3859 (~C).store( i+SIMDSIZE , j , xmm2 );
3860 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3861 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3862 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3863 (~C).store( i , j+1UL, xmm6 );
3864 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3865 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3866 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3867 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column form of the 5-vector panel.
3880 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3881 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3882 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3883 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3885 for(
size_t k=kbegin; k<kend; ++k ) {
3886 const SIMDType b1(
set( B(k,j) ) );
3887 xmm1 += A.load(i ,k) * b1;
3888 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3889 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3890 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3891 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3894 (~C).store( i , j, xmm1 );
3895 (~C).store( i+SIMDSIZE , j, xmm2 );
3896 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3897 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3898 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Panels of 4 SIMD vectors; columns in pairs, then a single-column form.
// Skipped when LOW or UPP is set.
3902 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3906 for( ; (j+2UL) <= N; j+=2UL )
3920 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3921 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3922 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3923 SIMDType xmm5( (~C).load(i ,j+1UL) );
3924 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3925 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3926 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3928 for(
size_t k=kbegin; k<kend; ++k ) {
3930 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3931 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3932 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3933 const SIMDType b1(
set( B(k,j ) ) );
3934 const SIMDType b2(
set( B(k,j+1UL) ) );
3945 (~C).store( i , j , xmm1 );
3946 (~C).store( i+SIMDSIZE , j , xmm2 );
3947 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3948 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3949 (~C).store( i , j+1UL, xmm5 );
3950 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3951 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3952 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column form of the 4-vector panel.
3965 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3966 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3967 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3969 for(
size_t k=kbegin; k<kend; ++k ) {
3970 const SIMDType b1(
set( B(k,j) ) );
3971 xmm1 += A.load(i ,k) * b1;
3972 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3973 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3974 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3977 (~C).store( i , j, xmm1 );
3978 (~C).store( i+SIMDSIZE , j, xmm2 );
3979 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3980 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Panels of 3 SIMD vectors; columns in pairs, then a single-column form.
// Skipped when LOW or UPP is set.
3984 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3988 for( ; (j+2UL) <= N; j+=2UL )
4002 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4003 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4004 SIMDType xmm4( (~C).load(i ,j+1UL) );
4005 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
4006 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4008 for(
size_t k=kbegin; k<kend; ++k ) {
4010 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4011 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4012 const SIMDType b1(
set( B(k,j ) ) );
4013 const SIMDType b2(
set( B(k,j+1UL) ) );
4022 (~C).store( i , j , xmm1 );
4023 (~C).store( i+SIMDSIZE , j , xmm2 );
4024 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4025 (~C).store( i , j+1UL, xmm4 );
4026 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
4027 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column form of the 3-vector panel.
4040 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4041 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4043 for(
size_t k=kbegin; k<kend; ++k ) {
4044 const SIMDType b1(
set( B(k,j) ) );
4045 xmm1 += A.load(i ,k) * b1;
4046 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4047 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4050 (~C).store( i , j, xmm1 );
4051 (~C).store( i+SIMDSIZE , j, xmm2 );
4052 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Panels of 2 SIMD vectors; skipped only when both LOW and UPP are set.
// With LOW the column range ends at min(i+SIMDSIZE*2,N); with UPP it starts at column i.
// Columns are consumed in groups of 4, 3, 2 and finally one.
4056 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4058 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
4059 size_t j( UPP ? i : 0UL );
4061 for( ; (j+4UL) <= jend; j+=4UL )
4075 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4076 SIMDType xmm3( (~C).load(i ,j+1UL) );
4077 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4078 SIMDType xmm5( (~C).load(i ,j+2UL) );
4079 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4080 SIMDType xmm7( (~C).load(i ,j+3UL) );
4081 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
4083 for(
size_t k=kbegin; k<kend; ++k ) {
4085 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4086 const SIMDType b1(
set( B(k,j ) ) );
4087 const SIMDType b2(
set( B(k,j+1UL) ) );
4088 const SIMDType b3(
set( B(k,j+2UL) ) );
4089 const SIMDType b4(
set( B(k,j+3UL) ) );
4100 (~C).store( i , j , xmm1 );
4101 (~C).store( i+SIMDSIZE, j , xmm2 );
4102 (~C).store( i , j+1UL, xmm3 );
4103 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4104 (~C).store( i , j+2UL, xmm5 );
4105 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4106 (~C).store( i , j+3UL, xmm7 );
4107 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
4110 for( ; (j+3UL) <= jend; j+=3UL )
4124 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4125 SIMDType xmm3( (~C).load(i ,j+1UL) );
4126 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4127 SIMDType xmm5( (~C).load(i ,j+2UL) );
4128 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4130 for(
size_t k=kbegin; k<kend; ++k ) {
4132 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4133 const SIMDType b1(
set( B(k,j ) ) );
4134 const SIMDType b2(
set( B(k,j+1UL) ) );
4135 const SIMDType b3(
set( B(k,j+2UL) ) );
4144 (~C).store( i , j , xmm1 );
4145 (~C).store( i+SIMDSIZE, j , xmm2 );
4146 (~C).store( i , j+1UL, xmm3 );
4147 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4148 (~C).store( i , j+2UL, xmm5 );
4149 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4152 for( ; (j+2UL) <= jend; j+=2UL )
4166 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4167 SIMDType xmm3( (~C).load(i ,j+1UL) );
4168 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
// NOTE(review): this bound is `<` while every other two-step k loop in this excerpt
// uses `(k+2UL) <= kend`. The k<kend loop below still processes whatever is left, so
// only the split between the two loops differs - confirm whether this is intentional.
4172 for( ; (k+2UL) < kend; k+=2UL ) {
4173 const SIMDType a1( A.load(i ,k ) );
4174 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
4175 const SIMDType a3( A.load(i ,k+1UL) );
4176 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
4177 const SIMDType b1(
set( B(k ,j ) ) );
4178 const SIMDType b2(
set( B(k ,j+1UL) ) );
4179 const SIMDType b3(
set( B(k+1UL,j ) ) );
4180 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
4191 for( ; k<kend; ++k ) {
4193 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4194 const SIMDType b1(
set( B(k,j ) ) );
4195 const SIMDType b2(
set( B(k,j+1UL) ) );
// Two accumulator sets are combined at store time.
4202 (~C).store( i , j , xmm1+xmm5 );
4203 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
4204 (~C).store( i , j+1UL, xmm3+xmm7 );
4205 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single-column form of the 2-vector panel.
4218 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
4222 for( ; (k+2UL) <= kend; k+=2UL ) {
4223 const SIMDType b1(
set( B(k ,j) ) );
4224 const SIMDType b2(
set( B(k+1UL,j) ) );
4225 xmm1 += A.load(i ,k ) * b1;
4226 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
4227 xmm3 += A.load(i ,k+1UL) * b2;
4228 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
4231 for( ; k<kend; ++k ) {
4232 const SIMDType b1(
set( B(k,j) ) );
4233 xmm1 += A.load(i ,k) * b1;
4234 xmm2 += A.load(i+SIMDSIZE,k) * b1;
4237 (~C).store( i , j, xmm1+xmm3 );
4238 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// One SIMD vector per step up to ipos. The column range is clipped to
// min(i+SIMDSIZE,N) only when both LOW and UPP are set; with UPP it starts at column i.
// Columns are consumed in groups of 4, 3, 2 and finally one.
4242 for( ; i<ipos; i+=SIMDSIZE )
4244 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
4245 size_t j( UPP ? i : 0UL );
4247 for( ; (j+4UL) <= jend; j+=4UL )
4259 SIMDType xmm2( (~C).load(i,j+1UL) );
4260 SIMDType xmm3( (~C).load(i,j+2UL) );
4261 SIMDType xmm4( (~C).load(i,j+3UL) );
4265 for( ; (k+2UL) <= kend; k+=2UL ) {
4267 const SIMDType a2( A.load(i,k+1UL) );
4268 xmm1 += a1 *
set( B(k ,j ) );
4269 xmm2 += a1 *
set( B(k ,j+1UL) );
4270 xmm3 += a1 *
set( B(k ,j+2UL) );
4271 xmm4 += a1 *
set( B(k ,j+3UL) );
4272 xmm5 += a2 *
set( B(k+1UL,j ) );
4273 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
4274 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
4275 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
4278 for( ; k<kend; ++k ) {
4280 xmm1 += a1 *
set( B(k,j ) );
4281 xmm2 += a1 *
set( B(k,j+1UL) );
4282 xmm3 += a1 *
set( B(k,j+2UL) );
4283 xmm4 += a1 *
set( B(k,j+3UL) );
4286 (~C).store( i, j , xmm1+xmm5 );
4287 (~C).store( i, j+1UL, xmm2+xmm6 );
4288 (~C).store( i, j+2UL, xmm3+xmm7 );
4289 (~C).store( i, j+3UL, xmm4+xmm8 );
4292 for( ; (j+3UL) <= jend; j+=3UL )
4304 SIMDType xmm2( (~C).load(i,j+1UL) );
4305 SIMDType xmm3( (~C).load(i,j+2UL) );
4309 for( ; (k+2UL) <= kend; k+=2UL ) {
4311 const SIMDType a2( A.load(i,k+1UL) );
4312 xmm1 += a1 *
set( B(k ,j ) );
4313 xmm2 += a1 *
set( B(k ,j+1UL) );
4314 xmm3 += a1 *
set( B(k ,j+2UL) );
4315 xmm4 += a2 *
set( B(k+1UL,j ) );
4316 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
4317 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
4320 for( ; k<kend; ++k ) {
4322 xmm1 += a1 *
set( B(k,j ) );
4323 xmm2 += a1 *
set( B(k,j+1UL) );
4324 xmm3 += a1 *
set( B(k,j+2UL) );
4327 (~C).store( i, j , xmm1+xmm4 );
4328 (~C).store( i, j+1UL, xmm2+xmm5 );
4329 (~C).store( i, j+2UL, xmm3+xmm6 );
4332 for( ; (j+2UL) <= jend; j+=2UL )
4344 SIMDType xmm2( (~C).load(i,j+1UL) );
4348 for( ; (k+2UL) <= kend; k+=2UL ) {
4350 const SIMDType a2( A.load(i,k+1UL) );
4351 xmm1 += a1 *
set( B(k ,j ) );
4352 xmm2 += a1 *
set( B(k ,j+1UL) );
4353 xmm3 += a2 *
set( B(k+1UL,j ) );
4354 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
4357 for( ; k<kend; ++k ) {
4359 xmm1 += a1 *
set( B(k,j ) );
4360 xmm2 += a1 *
set( B(k,j+1UL) );
4363 (~C).store( i, j , xmm1+xmm3 );
4364 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single-column, single-vector form; note that this k loop is bounded by K rather than kend.
4379 for( ; (k+2UL) <= K; k+=2UL ) {
4380 xmm1 += A.load(i,k ) *
set( B(k ,j) );
4381 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
4385 xmm1 += A.load(i,k) *
set( B(k,j) );
4388 (~C).store( i, j, xmm1+xmm2 );
// Scalar handling of the rows left after ipos; runs only when `remainder` is set.
// With LOW the column range ends at i+1; with UPP it starts at column i.
4392 for( ; remainder && i<M; ++i )
4394 const size_t jend( LOW ? i+1UL : N );
4395 size_t j( UPP ? i : 0UL );
4397 for( ; (j+2UL) <= jend; j+=2UL )
4411 for(
size_t k=kbegin; k<kend; ++k ) {
4412 value1 += A(i,k) * B(k,j );
4413 value2 += A(i,k) * B(k,j+1UL);
// value1/value2 are declared on lines not shown; presumably seeded from C so that
// the plain assignment still accumulates - TODO confirm.
4416 (~C)(i,j ) = value1;
4417 (~C)(i,j+1UL) = value2;
4430 for(
size_t k=kbegin; k<K; ++k ) {
4431 value += A(i,k) * B(k,j);
// selectLargeAddAssignKernel overload that simply forwards C, A and B to
// selectDefaultAddAssignKernel. The return type / enable condition is not part
// of this excerpt.
4455 template<
typename MT3
4459 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4461 selectDefaultAddAssignKernel( C, A, B );
// Second selectLargeAddAssignKernel overload taking the same (C, A, B) parameters.
// Its return type / enable condition and its body are not part of this excerpt.
4481 template<
typename MT3
4485 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// selectBlasAddAssignKernel overload that simply forwards C, A and B to
// selectLargeAddAssignKernel. The return type / enable condition is not part
// of this excerpt.
4511 template<
typename MT3
4515 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4517 selectLargeAddAssignKernel( C, A, B );
// selectBlasAddAssignKernel overload that appears under the preprocessor condition
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION.
// Three outcomes are visible: two paths add a temporary `tmp` to C via addAssign,
// and one path calls gemm( C, A, B, ET(1), ET(1) ). The conditions selecting between
// them and the construction of `tmp` are on lines not part of this excerpt.
4523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4537 template<
typename MT3
4541 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4548 addAssign( C, tmp );
4553 addAssign( C, tmp );
// Presumably the two ET(1) arguments are the scaling factors for A*B and for the
// existing C, which would make this an accumulate - TODO confirm against gemm's signature.
4556 gemm( C, A, B, ET(1), ET(1) );
// Fragment of a subtraction-assignment entry point (its name line is not part of
// this excerpt; presumably subAssign - TODO confirm).
// The visible check fires when the target has zero rows or zero columns, or when
// rhs.lhs_ has zero columns; the statement it guards is not visible.
// The work is then handed to selectSubAssignKernel with ~lhs, A and B.
4580 template<
typename MT
4589 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4603 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Dispatcher for the subtraction-assignment kernels.
// One visible part of the selection condition is
// C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD; the remaining parts are on lines
// not part of this excerpt. selectSmallSubAssignKernel is called under that condition;
// selectBlasSubAssignKernel is the other visible call (presumably the else branch).
4619 template<
typename MT3
4622 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4627 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4628 selectSmallSubAssignKernel( C, A, B );
4630 selectBlasSubAssignKernel( C, A, B );
// Subtraction-assignment kernel overload (name and enable condition are not part
// of this excerpt). Loop order is i (rows of A), then k, then j; each step performs
// C(i,j) -= A(i,k) * B(k,j), two columns at a time.
4649 template<
typename MT3
4655 const size_t M( A.rows() );
4656 const size_t N( B.columns() );
4657 const size_t K( A.columns() );
4661 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend are computed on lines that are not part of this excerpt.
4671 for(
size_t k=kbegin; k<kend; ++k )
// Tail of the jbegin expression (its head is not visible): the start column depends
// on k and, when UPP is set, is additionally raised to at least i.
4675 ?( UPP ?
max(i,k+1UL) : k+1UL )
4676 :( UPP ?
max(i,k) : k ) )
4677 :( UPP ? i : 0UL ) );
// Tail of the jend expression (its head is not visible): the end column depends
// on k and, when LOW is set, is additionally capped at i+1.
4680 ?( LOW ?
min(i+1UL,k) : k )
4681 :( LOW ?
min(i,k)+1UL : k+1UL ) )
4682 :( LOW ? i+1UL : N ) );
// Skip empty column ranges; only checked when LOW or UPP is set.
4684 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
4687 const size_t jnum( jend - jbegin );
// jpos = jbegin + jnum with its lowest bit cleared, i.e. the end of the paired columns.
4688 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4690 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4691 (~C)(i,j ) -= A(i,k) * B(k,j );
4692 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of column jpos; presumably guarded by an "odd column count" check that
// is not visible here - TODO confirm.
4695 (~C)(i,jpos) -= A(i,k) * B(k,jpos);
// Subtraction-assignment kernel overload (name and enable condition are not part
// of this excerpt). Loop order is j (columns of B), then k, then i; each step performs
// C(i,j) -= A(i,k) * B(k,j), two rows at a time.
4717 template<
typename MT3
4723 const size_t M( A.rows() );
4724 const size_t N( B.columns() );
4725 const size_t K( A.columns() );
4729 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend are computed on lines that are not part of this excerpt.
4739 for(
size_t k=kbegin; k<kend; ++k )
// Tail of the ibegin expression (its head is not visible): the start row depends
// on k and, when LOW is set, is additionally raised to at least j.
4743 ?( LOW ?
max(j,k+1UL) : k+1UL )
4744 :( LOW ?
max(j,k) : k ) )
4745 :( LOW ? j : 0UL ) );
// Tail of the iend expression (its head is not visible): the end row depends
// on k and, when UPP is set, is additionally capped at j+1.
4748 ?( UPP ?
min(j+1UL,k) : k )
4749 :( UPP ?
min(j,k)+1UL : k+1UL ) )
4750 :( UPP ? j+1UL : M ) );
// Skip empty row ranges; only checked when LOW or UPP is set.
4752 if( ( LOW || UPP ) && ( ibegin >= iend ) )
continue;
4755 const size_t inum( iend - ibegin );
// ipos = ibegin + inum with its lowest bit cleared, i.e. the end of the paired rows.
4756 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4758 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4759 (~C)(i ,j) -= A(i ,k) * B(k,j);
4760 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Update of row ipos; presumably guarded by an "odd row count" check that is
// not visible here - TODO confirm.
4763 (~C)(ipos,j) -= A(ipos,k) * B(k,j);
// Subtraction-assignment kernel overload (its name line is not part of this excerpt).
// Per the EnableIf_ condition below it is selected when MT4 is not diagonal and
// MT5 is diagonal. The target is traversed in tiles of `block` (= BLOCK_SIZE)
// rows by `block` columns.
4785 template<
typename MT3
4788 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4791 constexpr
size_t block( BLOCK_SIZE );
4793 const size_t M( A.rows() );
4794 const size_t N( B.columns() );
// Row tiles; iend clamps the last tile to M.
4796 for(
size_t ii=0UL; ii<M; ii+=block ) {
4797 const size_t iend(
min( M, ii+block ) );
// Column tiles; jend clamps the last tile to N.
4798 for(
size_t jj=0UL; jj<N; jj+=block ) {
4799 const size_t jend(
min( N, jj+block ) );
4800 for(
size_t i=ii; i<iend; ++i )
// jbegin/jpos are computed on lines that are not part of this excerpt.
4809 for(
size_t j=jbegin; j<jpos; ++j ) {
// Only the diagonal element B(j,j) of the right operand is read for column j.
4810 (~C)(i,j) -= A(i,j) * B(j,j);
// Subtraction-assignment kernel overload (its name line is not part of this excerpt).
// Per the EnableIf_ condition below it is selected when MT4 is not diagonal and
// MT5 is diagonal. Iterates column by column and updates two rows per step.
4833 template<
typename MT3
4836 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4839 const size_t M( A.rows() );
4840 const size_t N( B.columns() );
4842 for(
size_t j=0UL; j<N; ++j )
// ibegin/iend are computed on lines that are not part of this excerpt.
4852 const size_t inum( iend - ibegin );
// ipos = ibegin + inum with its lowest bit cleared, i.e. the end of the paired rows.
4853 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4855 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4856 (~C)(i ,j) -= A(i ,j) * B(j,j);
4857 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Update of row ipos; presumably guarded by an "odd row count" check that is
// not visible here - TODO confirm.
4860 (~C)(ipos,j) -= A(ipos,j) * B(j,j);
// Subtraction-assignment kernel overload (name and enable condition are not part
// of this excerpt). Iterates row by row, updates two columns per step and reads
// only the diagonal element A(i,i) of the left operand.
4881 template<
typename MT3
4887 const size_t M( A.rows() );
4888 const size_t N( B.columns() );
4890 for(
size_t i=0UL; i<M; ++i )
// jbegin/jend are computed on lines that are not part of this excerpt.
4900 const size_t jnum( jend - jbegin );
// jpos = jbegin + jnum with its lowest bit cleared, i.e. the end of the paired columns.
4901 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4903 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4904 (~C)(i,j ) -= A(i,i) * B(i,j );
4905 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Update of column jpos; presumably guarded by an "odd column count" check that
// is not visible here - TODO confirm.
4908 (~C)(i,jpos) -= A(i,i) * B(i,jpos);
// Subtraction-assignment kernel overload (name and enable condition are not part
// of this excerpt). Traverses the target in tiles of `block` (= BLOCK_SIZE) columns
// by `block` rows and reads only the diagonal element A(i,i) of the left operand.
4929 template<
typename MT3
4935 constexpr
size_t block( BLOCK_SIZE );
4937 const size_t M( A.rows() );
4938 const size_t N( B.columns() );
// Column tiles are the outer loop here; jend clamps the last tile to N.
4940 for(
size_t jj=0UL; jj<N; jj+=block ) {
4941 const size_t jend(
min( N, jj+block ) );
// Row tiles; iend clamps the last tile to M.
4942 for(
size_t ii=0UL; ii<M; ii+=block ) {
4943 const size_t iend(
min( M, ii+block ) );
4944 for(
size_t j=jj; j<jend; ++j )
// ibegin/ipos are computed on lines that are not part of this excerpt.
4953 for(
size_t i=ibegin; i<ipos; ++i ) {
4954 (~C)(i,j) -= A(i,i) * B(i,j);
// selectDefaultSubAssignKernel overload that touches only the diagonal of C:
// for every row index i of A it performs C(i,i) -= A(i,i) * B(i,i).
// The return type / enable condition is not part of this excerpt.
4977 template<
typename MT3
4981 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4983 for(
size_t i=0UL; i<A.rows(); ++i ) {
4984 C(i,i) -= A(i,i) * B(i,i);
// selectSmallSubAssignKernel overload that simply forwards C, A and B to
// selectDefaultSubAssignKernel. The return type / enable condition is not part
// of this excerpt.
5004 template<
typename MT3
5008 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5010 selectDefaultSubAssignKernel( C, A, B );
// Vectorized sub-assignment kernel fragment (C -= A * B). SIMD vectors are loaded from
// and stored to C and B along the column index j, while single elements A(i,k) are
// broadcast with set(). The column range is processed in progressively narrower panels
// (8, 5, 4, 3, 2 and 1 SIMD vectors wide), followed by a scalar loop for leftover columns.
// NOTE(review): the signature, the kbegin/kend computations, several accumulator
// declarations and all closing braces are elided in this excerpt (the leading numbers
// are original file line numbers) - presumably an overload of selectSmallSubAssignKernel;
// confirm against the full file.
5030 template<
typename MT3
5038 const size_t M( A.rows() );
5039 const size_t N( B.columns() );
5040 const size_t K( A.columns() );
// With `remainder` set, jpos is N with the low bits masked off by size_t(-SIMDSIZE)
// (a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise it is N.
5044 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Panel of 8 SIMD vectors per row; only taken when neither LOW nor UPP is set.
5051 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5052 for(
size_t i=0UL; i<M; ++i )
5066 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5067 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5068 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5069 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5070 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
5071 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
5072 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
// Subtract the contribution of every k from the values loaded from C.
5074 for(
size_t k=kbegin; k<kend; ++k ) {
5075 const SIMDType a1(
set( A(i,k) ) );
5076 xmm1 -= a1 * B.load(k,j );
5077 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5078 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5079 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5080 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5081 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5082 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5083 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5086 (~C).store( i, j , xmm1 );
5087 (~C).store( i, j+SIMDSIZE , xmm2 );
5088 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5089 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5090 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
5091 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
5092 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
5093 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Panel of 5 SIMD vectors; rows are handled in pairs first, then singly.
5098 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5102 for( ; (i+2UL) <= M; i+=2UL )
5115 SIMDType xmm1 ( (~C).load(i ,j ) );
5116 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
5117 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
5118 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
5119 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
5120 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
5121 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
5122 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5123 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5124 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
5126 for(
size_t k=kbegin; k<kend; ++k ) {
5127 const SIMDType a1(
set( A(i ,k) ) );
5128 const SIMDType a2(
set( A(i+1UL,k) ) );
5130 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5131 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5132 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5133 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5146 (~C).store( i , j , xmm1 );
5147 (~C).store( i , j+SIMDSIZE , xmm2 );
5148 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5149 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5150 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
5151 (~C).store( i+1UL, j , xmm6 );
5152 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
5153 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5154 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5155 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Single-row variant of the 5-vector panel.
5168 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5169 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5170 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5171 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5173 for(
size_t k=kbegin; k<kend; ++k ) {
5174 const SIMDType a1(
set( A(i,k) ) );
5175 xmm1 -= a1 * B.load(k,j );
5176 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5177 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5178 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5179 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5182 (~C).store( i, j , xmm1 );
5183 (~C).store( i, j+SIMDSIZE , xmm2 );
5184 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5185 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5186 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// Panel of 4 SIMD vectors; rows in pairs, then singly.
5190 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5194 for( ; (i+2UL) <= M; i+=2UL )
5208 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5209 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5210 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
5211 SIMDType xmm5( (~C).load(i+1UL,j ) );
5212 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
5213 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5214 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5216 for(
size_t k=kbegin; k<kend; ++k ) {
5217 const SIMDType a1(
set( A(i ,k) ) );
5218 const SIMDType a2(
set( A(i+1UL,k) ) );
5220 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5221 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5222 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5233 (~C).store( i , j , xmm1 );
5234 (~C).store( i , j+SIMDSIZE , xmm2 );
5235 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5236 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5237 (~C).store( i+1UL, j , xmm5 );
5238 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
5239 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5240 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row variant of the 4-vector panel.
5253 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5254 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5255 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5257 for(
size_t k=kbegin; k<kend; ++k ) {
5258 const SIMDType a1(
set( A(i,k) ) );
5259 xmm1 -= a1 * B.load(k,j );
5260 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5261 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5262 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5265 (~C).store( i, j , xmm1 );
5266 (~C).store( i, j+SIMDSIZE , xmm2 );
5267 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5268 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Panel of 3 SIMD vectors; rows in pairs, then singly.
5272 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5276 for( ; (i+2UL) <= M; i+=2UL )
5290 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5291 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5292 SIMDType xmm4( (~C).load(i+1UL,j ) );
5293 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
5294 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5296 for(
size_t k=kbegin; k<kend; ++k ) {
5297 const SIMDType a1(
set( A(i ,k) ) );
5298 const SIMDType a2(
set( A(i+1UL,k) ) );
5300 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5301 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5310 (~C).store( i , j , xmm1 );
5311 (~C).store( i , j+SIMDSIZE , xmm2 );
5312 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5313 (~C).store( i+1UL, j , xmm4 );
5314 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
5315 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Single-row variant of the 3-vector panel.
5328 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5329 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5331 for(
size_t k=kbegin; k<kend; ++k ) {
5332 const SIMDType a1(
set( A(i,k) ) );
5333 xmm1 -= a1 * B.load(k,j );
5334 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5335 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5338 (~C).store( i, j , xmm1 );
5339 (~C).store( i, j+SIMDSIZE , xmm2 );
5340 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// Panel of 2 SIMD vectors; skipped only when both LOW and UPP are set. The row range is
// narrowed: it starts at j when LOW is set and ends at min(j+SIMDSIZE*2UL,M) when UPP is set.
5344 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5346 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
5347 size_t i( LOW ? j : 0UL );
// Four rows at a time.
5349 for( ; (i+4UL) <= iend; i+=4UL )
5363 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5364 SIMDType xmm3( (~C).load(i+1UL,j ) );
5365 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5366 SIMDType xmm5( (~C).load(i+2UL,j ) );
5367 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5368 SIMDType xmm7( (~C).load(i+3UL,j ) );
5369 SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
5371 for(
size_t k=kbegin; k<kend; ++k ) {
5372 const SIMDType a1(
set( A(i ,k) ) );
5373 const SIMDType a2(
set( A(i+1UL,k) ) );
5374 const SIMDType a3(
set( A(i+2UL,k) ) );
5375 const SIMDType a4(
set( A(i+3UL,k) ) );
5377 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5388 (~C).store( i , j , xmm1 );
5389 (~C).store( i , j+SIMDSIZE, xmm2 );
5390 (~C).store( i+1UL, j , xmm3 );
5391 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5392 (~C).store( i+2UL, j , xmm5 );
5393 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
5394 (~C).store( i+3UL, j , xmm7 );
5395 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
// Three rows at a time.
5398 for( ; (i+3UL) <= iend; i+=3UL )
5412 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5413 SIMDType xmm3( (~C).load(i+1UL,j ) );
5414 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5415 SIMDType xmm5( (~C).load(i+2UL,j ) );
5416 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5418 for(
size_t k=kbegin; k<kend; ++k ) {
5419 const SIMDType a1(
set( A(i ,k) ) );
5420 const SIMDType a2(
set( A(i+1UL,k) ) );
5421 const SIMDType a3(
set( A(i+2UL,k) ) );
5423 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5432 (~C).store( i , j , xmm1 );
5433 (~C).store( i , j+SIMDSIZE, xmm2 );
5434 (~C).store( i+1UL, j , xmm3 );
5435 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5436 (~C).store( i+2UL, j , xmm5 );
5437 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
// Two rows at a time, with the k loop additionally unrolled by two.
5440 for( ; (i+2UL) <= iend; i+=2UL )
5454 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5455 SIMDType xmm3( (~C).load(i+1UL,j ) );
5456 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5460 for( ; (k+2UL) <= kend; k+=2UL ) {
5461 const SIMDType a1(
set( A(i ,k ) ) );
5462 const SIMDType a2(
set( A(i+1UL,k ) ) );
5463 const SIMDType a3(
set( A(i ,k+1UL) ) );
5464 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
5465 const SIMDType b1( B.load(k ,j ) );
5466 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5467 const SIMDType b3( B.load(k+1UL,j ) );
5468 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Leftover k iteration after the unrolled loop.
5479 for( ; k<kend; ++k ) {
5480 const SIMDType a1(
set( A(i ,k) ) );
5481 const SIMDType a2(
set( A(i+1UL,k) ) );
5483 const SIMDType b2( B.load(k,j+SIMDSIZE) );
// NOTE(review): xmm5..xmm8 are declared in elided lines - presumably the second set of
// accumulators of the unrolled k loop, combined here with the first set.
5490 (~C).store( i , j , xmm1+xmm5 );
5491 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
5492 (~C).store( i+1UL, j , xmm3+xmm7 );
5493 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
// Single remaining row: xmm1/xmm2 take the k terms, xmm3/xmm4 the k+1 terms.
5506 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
5510 for( ; (k+2UL) <= kend; k+=2UL ) {
5511 const SIMDType a1(
set( A(i,k ) ) );
5512 const SIMDType a2(
set( A(i,k+1UL) ) );
5513 xmm1 -= a1 * B.load(k ,j );
5514 xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
5515 xmm3 -= a2 * B.load(k+1UL,j );
5516 xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
5519 for( ; k<kend; ++k ) {
5520 const SIMDType a1(
set( A(i,k) ) );
5521 xmm1 -= a1 * B.load(k,j );
5522 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
5525 (~C).store( i, j , xmm1+xmm3 );
5526 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
// Panel of a single SIMD vector; rows in groups of 4, 3, 2 and finally 1.
5530 for( ; j<jpos; j+=SIMDSIZE )
5532 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
5533 size_t i( LOW ? j : 0UL );
5535 for( ; (i+4UL) <= iend; i+=4UL )
5547 SIMDType xmm2( (~C).load(i+1UL,j) );
5548 SIMDType xmm3( (~C).load(i+2UL,j) );
5549 SIMDType xmm4( (~C).load(i+3UL,j) );
5553 for( ; (k+2UL) <= kend; k+=2UL ) {
5555 const SIMDType b2( B.load(k+1UL,j) );
5556 xmm1 -=
set( A(i ,k ) ) * b1;
5557 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5558 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5559 xmm4 -=
set( A(i+3UL,k ) ) * b1;
5560 xmm5 -=
set( A(i ,k+1UL) ) * b2;
5561 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
5562 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
5563 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
5566 for( ; k<kend; ++k ) {
5568 xmm1 -=
set( A(i ,k) ) * b1;
5569 xmm2 -=
set( A(i+1UL,k) ) * b1;
5570 xmm3 -=
set( A(i+2UL,k) ) * b1;
5571 xmm4 -=
set( A(i+3UL,k) ) * b1;
// Combine the k and k+1 accumulators per row before storing.
5574 (~C).store( i , j, xmm1+xmm5 );
5575 (~C).store( i+1UL, j, xmm2+xmm6 );
5576 (~C).store( i+2UL, j, xmm3+xmm7 );
5577 (~C).store( i+3UL, j, xmm4+xmm8 );
5580 for( ; (i+3UL) <= iend; i+=3UL )
5592 SIMDType xmm2( (~C).load(i+1UL,j) );
5593 SIMDType xmm3( (~C).load(i+2UL,j) );
5597 for( ; (k+2UL) <= kend; k+=2UL ) {
5599 const SIMDType b2( B.load(k+1UL,j) );
5600 xmm1 -=
set( A(i ,k ) ) * b1;
5601 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5602 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5603 xmm4 -=
set( A(i ,k+1UL) ) * b2;
5604 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
5605 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
5608 for( ; k<kend; ++k ) {
5610 xmm1 -=
set( A(i ,k) ) * b1;
5611 xmm2 -=
set( A(i+1UL,k) ) * b1;
5612 xmm3 -=
set( A(i+2UL,k) ) * b1;
5615 (~C).store( i , j, xmm1+xmm4 );
5616 (~C).store( i+1UL, j, xmm2+xmm5 );
5617 (~C).store( i+2UL, j, xmm3+xmm6 );
5620 for( ; (i+2UL) <= iend; i+=2UL )
5632 SIMDType xmm2( (~C).load(i+1UL,j) );
5636 for( ; (k+2UL) <= kend; k+=2UL ) {
5638 const SIMDType b2( B.load(k+1UL,j) );
5639 xmm1 -=
set( A(i ,k ) ) * b1;
5640 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5641 xmm3 -=
set( A(i ,k+1UL) ) * b2;
5642 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
5645 for( ; k<kend; ++k ) {
5647 xmm1 -=
set( A(i ,k) ) * b1;
5648 xmm2 -=
set( A(i+1UL,k) ) * b1;
5651 (~C).store( i , j, xmm1+xmm3 );
5652 (~C).store( i+1UL, j, xmm2+xmm4 );
// Single remaining row; this k loop runs up to K rather than a kend bound.
5667 for( ; (k+2UL) <= K; k+=2UL ) {
5668 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
5669 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
5673 xmm1 -=
set( A(i,k) ) * B.load(k,j);
5676 (~C).store( i, j, xmm1+xmm2 );
// Scalar loop over the columns beyond jpos; only runs when `remainder` is set.
5680 for( ; remainder && j<N; ++j )
5682 const size_t iend( UPP ? j+1UL : M );
5683 size_t i( LOW ? j : 0UL );
// Two rows per iteration using plain element access.
5685 for( ; (i+2UL) <= iend; i+=2UL )
5699 for(
size_t k=kbegin; k<kend; ++k ) {
5700 value1 -= A(i ,k) * B(k,j);
5701 value2 -= A(i+1UL,k) * B(k,j);
5704 (~C)(i ,j) = value1;
5705 (~C)(i+1UL,j) = value2;
// Last single row.
5718 for(
size_t k=kbegin; k<K; ++k ) {
5719 value -= A(i,k) * B(k,j);
// Vectorized sub-assignment kernel fragment (C -= A * B). SIMD vectors are loaded from
// and stored to C and A along the row index i, while single elements B(k,j) are
// broadcast with set(). The row range is processed in progressively narrower panels
// (8, 5, 4, 3, 2 and 1 SIMD vectors tall), followed by a scalar loop for leftover rows.
// NOTE(review): the signature, the kbegin/kend computations, several accumulator
// declarations and all closing braces are elided in this excerpt (the leading numbers
// are original file line numbers) - presumably an overload of selectSmallSubAssignKernel;
// confirm against the full file.
5744 template<
typename MT3
5752 const size_t M( A.rows() );
5753 const size_t N( B.columns() );
5754 const size_t K( A.columns() );
// With `remainder` set, ipos is M with the low bits masked off by size_t(-SIMDSIZE)
// (a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise it is M.
5758 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Panel of 8 SIMD vectors per column; only taken when neither LOW nor UPP is set.
5765 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5766 for(
size_t j=0UL; j<N; ++j )
5780 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5781 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5782 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5783 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5784 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
5785 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
5786 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
// Subtract the contribution of every k from the values loaded from C.
5788 for(
size_t k=kbegin; k<kend; ++k ) {
5789 const SIMDType b1(
set( B(k,j) ) );
5790 xmm1 -= A.load(i ,k) * b1;
5791 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5792 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5793 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5794 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5795 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
5796 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
5797 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
5800 (~C).store( i , j, xmm1 );
5801 (~C).store( i+SIMDSIZE , j, xmm2 );
5802 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5803 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5804 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
5805 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
5806 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
5807 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Panel of 5 SIMD vectors; columns are handled in pairs first, then singly.
5812 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5816 for( ; (j+2UL) <= N; j+=2UL )
5829 SIMDType xmm1 ( (~C).load(i ,j ) );
5830 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
5831 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
5832 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
5833 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
5834 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
5835 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
5836 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5837 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5838 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
5840 for(
size_t k=kbegin; k<kend; ++k ) {
5842 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5843 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5844 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5845 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5846 const SIMDType b1(
set( B(k,j ) ) );
5847 const SIMDType b2(
set( B(k,j+1UL) ) );
5860 (~C).store( i , j , xmm1 );
5861 (~C).store( i+SIMDSIZE , j , xmm2 );
5862 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5863 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5864 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
5865 (~C).store( i , j+1UL, xmm6 );
5866 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
5867 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
5868 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
5869 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column variant of the 5-vector panel.
5882 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5883 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5884 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5885 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5887 for(
size_t k=kbegin; k<kend; ++k ) {
5888 const SIMDType b1(
set( B(k,j) ) );
5889 xmm1 -= A.load(i ,k) * b1;
5890 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5891 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5892 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5893 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5896 (~C).store( i , j, xmm1 );
5897 (~C).store( i+SIMDSIZE , j, xmm2 );
5898 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5899 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5900 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Panel of 4 SIMD vectors; columns in pairs, then singly.
5904 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5908 for( ; (j+2UL) <= N; j+=2UL )
5922 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
5923 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
5924 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
5925 SIMDType xmm5( (~C).load(i ,j+1UL) );
5926 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
5927 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5928 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5930 for(
size_t k=kbegin; k<kend; ++k ) {
5932 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5933 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5934 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5935 const SIMDType b1(
set( B(k,j ) ) );
5936 const SIMDType b2(
set( B(k,j+1UL) ) );
5947 (~C).store( i , j , xmm1 );
5948 (~C).store( i+SIMDSIZE , j , xmm2 );
5949 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5950 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5951 (~C).store( i , j+1UL, xmm5 );
5952 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
5953 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
5954 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector panel.
5967 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5968 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5969 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5971 for(
size_t k=kbegin; k<kend; ++k ) {
5972 const SIMDType b1(
set( B(k,j) ) );
5973 xmm1 -= A.load(i ,k) * b1;
5974 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5975 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5976 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5979 (~C).store( i , j, xmm1 );
5980 (~C).store( i+SIMDSIZE , j, xmm2 );
5981 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5982 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Panel of 3 SIMD vectors; columns in pairs, then singly.
5986 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5990 for( ; (j+2UL) <= N; j+=2UL )
6004 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
6005 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
6006 SIMDType xmm4( (~C).load(i ,j+1UL) );
6007 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
6008 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
6010 for(
size_t k=kbegin; k<kend; ++k ) {
6012 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6013 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6014 const SIMDType b1(
set( B(k,j ) ) );
6015 const SIMDType b2(
set( B(k,j+1UL) ) );
6024 (~C).store( i , j , xmm1 );
6025 (~C).store( i+SIMDSIZE , j , xmm2 );
6026 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
6027 (~C).store( i , j+1UL, xmm4 );
6028 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
6029 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column variant of the 3-vector panel.
6042 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
6043 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
6045 for(
size_t k=kbegin; k<kend; ++k ) {
6046 const SIMDType b1(
set( B(k,j) ) );
6047 xmm1 -= A.load(i ,k) * b1;
6048 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6049 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6052 (~C).store( i , j, xmm1 );
6053 (~C).store( i+SIMDSIZE , j, xmm2 );
6054 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Panel of 2 SIMD vectors; skipped only when both LOW and UPP are set. The column range
// is narrowed: it starts at i when UPP is set and ends at min(i+SIMDSIZE*2UL,N) when LOW is set.
6058 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6060 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
6061 size_t j( UPP ? i : 0UL );
// Four columns at a time.
6063 for( ; (j+4UL) <= jend; j+=4UL )
6077 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6078 SIMDType xmm3( (~C).load(i ,j+1UL) );
6079 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6080 SIMDType xmm5( (~C).load(i ,j+2UL) );
6081 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6082 SIMDType xmm7( (~C).load(i ,j+3UL) );
6083 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
6085 for(
size_t k=kbegin; k<kend; ++k ) {
6087 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6088 const SIMDType b1(
set( B(k,j ) ) );
6089 const SIMDType b2(
set( B(k,j+1UL) ) );
6090 const SIMDType b3(
set( B(k,j+2UL) ) );
6091 const SIMDType b4(
set( B(k,j+3UL) ) );
6102 (~C).store( i , j , xmm1 );
6103 (~C).store( i+SIMDSIZE, j , xmm2 );
6104 (~C).store( i , j+1UL, xmm3 );
6105 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6106 (~C).store( i , j+2UL, xmm5 );
6107 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
6108 (~C).store( i , j+3UL, xmm7 );
6109 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
// Three columns at a time.
6112 for( ; (j+3UL) <= jend; j+=3UL )
6126 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6127 SIMDType xmm3( (~C).load(i ,j+1UL) );
6128 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6129 SIMDType xmm5( (~C).load(i ,j+2UL) );
6130 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6132 for(
size_t k=kbegin; k<kend; ++k ) {
6134 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6135 const SIMDType b1(
set( B(k,j ) ) );
6136 const SIMDType b2(
set( B(k,j+1UL) ) );
6137 const SIMDType b3(
set( B(k,j+2UL) ) );
6146 (~C).store( i , j , xmm1 );
6147 (~C).store( i+SIMDSIZE, j , xmm2 );
6148 (~C).store( i , j+1UL, xmm3 );
6149 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6150 (~C).store( i , j+2UL, xmm5 );
6151 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns at a time, with the k loop additionally unrolled by two.
6154 for( ; (j+2UL) <= jend; j+=2UL )
6168 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6169 SIMDType xmm3( (~C).load(i ,j+1UL) );
6170 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6174 for( ; (k+2UL) <= kend; k+=2UL ) {
6175 const SIMDType a1( A.load(i ,k ) );
6176 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6177 const SIMDType a3( A.load(i ,k+1UL) );
6178 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6179 const SIMDType b1(
set( B(k ,j ) ) );
6180 const SIMDType b2(
set( B(k ,j+1UL) ) );
6181 const SIMDType b3(
set( B(k+1UL,j ) ) );
6182 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k iteration after the unrolled loop.
6193 for( ; k<kend; ++k ) {
6195 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6196 const SIMDType b1(
set( B(k,j ) ) );
6197 const SIMDType b2(
set( B(k,j+1UL) ) );
// NOTE(review): xmm5..xmm8 are declared in elided lines - presumably the second set of
// accumulators of the unrolled k loop, combined here with the first set.
6204 (~C).store( i , j , xmm1+xmm5 );
6205 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
6206 (~C).store( i , j+1UL, xmm3+xmm7 );
6207 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single remaining column: xmm1/xmm2 take the k terms, xmm3/xmm4 the k+1 terms.
6220 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
6224 for( ; (k+2UL) <= kend; k+=2UL ) {
6225 const SIMDType b1(
set( B(k ,j) ) );
6226 const SIMDType b2(
set( B(k+1UL,j) ) );
6227 xmm1 -= A.load(i ,k ) * b1;
6228 xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
6229 xmm3 -= A.load(i ,k+1UL) * b2;
6230 xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
6233 for( ; k<kend; ++k ) {
6234 const SIMDType b1(
set( B(k,j) ) );
6235 xmm1 -= A.load(i ,k) * b1;
6236 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6239 (~C).store( i , j, xmm1+xmm3 );
6240 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// Panel of a single SIMD vector; columns in groups of 4, 3, 2 and finally 1.
6244 for( ; i<ipos; i+=SIMDSIZE )
6246 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
6247 size_t j( UPP ? i : 0UL );
6249 for( ; (j+4UL) <= jend; j+=4UL )
6261 SIMDType xmm2( (~C).load(i,j+1UL) );
6262 SIMDType xmm3( (~C).load(i,j+2UL) );
6263 SIMDType xmm4( (~C).load(i,j+3UL) );
6267 for( ; (k+2UL) <= kend; k+=2UL ) {
6269 const SIMDType a2( A.load(i,k+1UL) );
6270 xmm1 -= a1 *
set( B(k ,j ) );
6271 xmm2 -= a1 *
set( B(k ,j+1UL) );
6272 xmm3 -= a1 *
set( B(k ,j+2UL) );
6273 xmm4 -= a1 *
set( B(k ,j+3UL) );
6274 xmm5 -= a2 *
set( B(k+1UL,j ) );
6275 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
6276 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
6277 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
6280 for( ; k<kend; ++k ) {
6282 xmm1 -= a1 *
set( B(k,j ) );
6283 xmm2 -= a1 *
set( B(k,j+1UL) );
6284 xmm3 -= a1 *
set( B(k,j+2UL) );
6285 xmm4 -= a1 *
set( B(k,j+3UL) );
// Combine the k and k+1 accumulators per column before storing.
6288 (~C).store( i, j , xmm1+xmm5 );
6289 (~C).store( i, j+1UL, xmm2+xmm6 );
6290 (~C).store( i, j+2UL, xmm3+xmm7 );
6291 (~C).store( i, j+3UL, xmm4+xmm8 );
6294 for( ; (j+3UL) <= jend; j+=3UL )
6306 SIMDType xmm2( (~C).load(i,j+1UL) );
6307 SIMDType xmm3( (~C).load(i,j+2UL) );
6311 for( ; (k+2UL) <= kend; k+=2UL ) {
6313 const SIMDType a2( A.load(i,k+1UL) );
6314 xmm1 -= a1 *
set( B(k ,j ) );
6315 xmm2 -= a1 *
set( B(k ,j+1UL) );
6316 xmm3 -= a1 *
set( B(k ,j+2UL) );
6317 xmm4 -= a2 *
set( B(k+1UL,j ) );
6318 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
6319 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
6322 for( ; k<kend; ++k ) {
6324 xmm1 -= a1 *
set( B(k,j ) );
6325 xmm2 -= a1 *
set( B(k,j+1UL) );
6326 xmm3 -= a1 *
set( B(k,j+2UL) );
6329 (~C).store( i, j , xmm1+xmm4 );
6330 (~C).store( i, j+1UL, xmm2+xmm5 );
6331 (~C).store( i, j+2UL, xmm3+xmm6 );
6334 for( ; (j+2UL) <= jend; j+=2UL )
6346 SIMDType xmm2( (~C).load(i,j+1UL) );
6350 for( ; (k+2UL) <= kend; k+=2UL ) {
6352 const SIMDType a2( A.load(i,k+1UL) );
6353 xmm1 -= a1 *
set( B(k ,j ) );
6354 xmm2 -= a1 *
set( B(k ,j+1UL) );
6355 xmm3 -= a2 *
set( B(k+1UL,j ) );
6356 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
6359 for( ; k<kend; ++k ) {
6361 xmm1 -= a1 *
set( B(k,j ) );
6362 xmm2 -= a1 *
set( B(k,j+1UL) );
6365 (~C).store( i, j , xmm1+xmm3 );
6366 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single remaining column; this k loop runs up to K rather than a kend bound.
6381 for( ; (k+2UL) <= K; k+=2UL ) {
6382 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
6383 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
6387 xmm1 -= A.load(i,k) *
set( B(k,j) );
6390 (~C).store( i, j, xmm1+xmm2 );
// Scalar loop over the rows beyond ipos; only runs when `remainder` is set.
6394 for( ; remainder && i<M; ++i )
6396 const size_t jend( LOW ? i+1UL : N );
6397 size_t j( UPP ? i : 0UL );
// Two columns per iteration using plain element access.
6399 for( ; (j+2UL) <= jend; j+=2UL )
6413 for(
size_t k=kbegin; k<kend; ++k ) {
6414 value1 -= A(i,k) * B(k,j );
6415 value2 -= A(i,k) * B(k,j+1UL);
6418 (~C)(i,j ) = value1;
6419 (~C)(i,j+1UL) = value2;
// Last single column.
6432 for(
size_t k=kbegin; k<K; ++k ) {
6433 value -= A(i,k) * B(k,j);
// Large-matrix sub-assignment kernel overload that simply forwards to
// selectDefaultSubAssignKernel with the same arguments.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
6457 template<
typename MT3
6461 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6463 selectDefaultSubAssignKernel( C, A, B );
// Second selectLargeSubAssignKernel overload.
// NOTE(review): its enabling condition and entire body are elided in this excerpt -
// presumably the vectorized large-matrix path; confirm against the full file.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
6483 template<
typename MT3
6487 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS sub-assignment kernel overload that simply forwards to selectLargeSubAssignKernel
// with the same arguments.
// NOTE(review): presumably the fallback used when the BLAS kernel is not applicable -
// the enabling condition is not visible in this excerpt.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
6513 template<
typename MT3
6517 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6519 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed selectBlasSubAssignKernel overload, compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
// Two branches sub-assign a temporary `tmp` to C; the remaining branch calls gemm with
// the scalar arguments ET(-1) and ET(1).
// NOTE(review): the branch conditions and the construction of `tmp` are elided in this
// excerpt; ET(-1)/ET(1) presumably act as the gemm alpha/beta factors (C = C - A*B).
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
6525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6539 template<
typename MT3
6543 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6550 subAssign( C, tmp );
6555 subAssign( C, tmp );
6558 gemm( C, A, B, ET(-1), ET(1) );
// Fragment of a function template that finishes by calling schurAssign( ~lhs, tmp ).
// NOTE(review): the signature and the creation of `tmp` are elided in this excerpt -
// presumably the Schur-product assignment entry point; confirm against the full file.
6582 template<
typename MT
6596 schurAssign( ~lhs, tmp );
// Fragment of an assignment entry point with two special cases: an empty target
// (zero rows or zero columns) and a zero inner dimension (rhs.lhs_.columns() == 0).
// NOTE(review): the signature, the actions taken in each branch and the general path
// are elided in this excerpt; the function name cannot be confirmed from here.
6629 template<
typename MT
6639 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
6642 else if( rhs.lhs_.columns() == 0UL ) {
// Fragment of a function template that creates a ForwardFunctor instance and a
// temporary `tmp` of type TmpType constructed from rhs.
// NOTE(review): the signature and how `fwd` and `tmp` are consumed are elided in this
// excerpt; the function name cannot be confirmed from here.
6678 template<
typename MT
6697 const ForwardFunctor fwd;
6699 const TmpType tmp( rhs );
// Fragment of an assignment entry point that special-cases an empty target or a zero
// inner dimension with a single combined check.
// NOTE(review): the signature and both branches are elided in this excerpt; the
// function name cannot be confirmed from here.
6721 template<
typename MT
6731 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fragment of an assignment entry point that special-cases an empty target or a zero
// inner dimension with a single combined check.
// NOTE(review): the signature and both branches are elided in this excerpt; the
// function name cannot be confirmed from here.
6770 template<
typename MT
6780 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Start of a function template parameterized on the target matrix type MT.
// NOTE(review): everything after the first template parameter is elided in this
// excerpt, so neither its name nor its behavior can be stated from here.
6816 template<
typename MT
// Head of the DMatScalarMultExpr specialization for a TDMatDMatMultExpr operand scaled
// by a value of type ST. It derives publicly from the MatScalarMultExpr/DenseMatrix
// CRTP base shown below.
// NOTE(review): the remaining template parameters, the class keyword and any further
// base classes are elided in this excerpt.
6876 template<
typename MT1
6884 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
// Compile-time structure flags derived from the template flags SF, HF, LF and UF:
//  - SYM  is SF unless any of HF, LF or UF is also set;
//  - HERM is HF unless LF or UF is also set;
//  - LOW  is LF, or UF combined with SF or HF;
//  - UPP  is UF, or LF combined with SF or HF.
// NOTE(review): the enclosing enum declaration is elided in this excerpt.
6914 SYM = ( SF && !( HF || LF || UF ) ),
6915 HERM = ( HF && !( LF || UF ) ),
6916 LOW = ( LF || ( ( SF || HF ) && UF ) ),
6917 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time helper whose `value` is true when either evaluateLeft or evaluateRight
// is set. The template parameters T1, T2 and T3 do not appear in the visible
// expression.
6926 template<
typename T1,
typename T2,
typename T3 >
6927 struct IsEvaluationRequired {
6928 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time helper deciding whether the BLAS kernel may be used for the types
// T1..T4. The visible conditions require BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION to be enabled and T1, T2 and T3 to be
// SIMD-enabled.
// NOTE(review): further conditions of the conjunction are elided in this excerpt.
6936 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6937 struct UseBlasKernel {
6938 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
6944 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper deciding whether the vectorized default kernel may be used for
// the types T1..T4. The visible conditions require useOptimizedKernels and T1, T2 and
// T3 to be SIMD-enabled.
// NOTE(review): further conditions of the conjunction are elided in this excerpt.
6958 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6959 struct UseVectorizedDefaultKernel {
6960 enum :
bool { value = useOptimizedKernels &&
6964 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compilation flags of the expression.
// NOTE(review): the first line is the tail of an elided declaration - presumably the
// simdEnabled flag, which requires both operand types to be SIMD-enabled.
7018 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable holds only if neither operand needs evaluation and both operand types
// are themselves SMP-assignable.
7024 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
7025 !evaluateRight && MT2::smpAssignable };
// Element access: the element (i,j) of the wrapped expression multiplied by the scalar.
// NOTE(review): the enclosing function header is elided - presumably operator()(i,j).
7055 return matrix_(i,j) * scalar_;
// Bounds-checked element access: both indices are compared against the dimensions of
// the wrapped expression before delegating to (*this)(i,j).
// NOTE(review): the function header and the bodies of the two checks are elided -
// presumably at(i,j) throwing on an out-of-range index; confirm against the full file.
7068 if( i >= matrix_.rows() ) {
7071 if( j >= matrix_.columns() ) {
7074 return (*
this)(i,j);
// Returns the number of rows of the wrapped matrix expression.
7083 inline size_t rows()
const {
7084 return matrix_.rows();
// Returns the number of columns of the wrapped matrix expression.
7093 inline size_t columns()
const {
7094 return matrix_.columns();
// Returns whether the expression can alias with the given address; the query is
// delegated to the wrapped matrix expression.
//
// \param alias The address to be checked.
7124 template<
typename T >
7125 inline bool canAlias(
const T* alias )
const {
7126 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address; the query is
// delegated to the wrapped matrix expression.
//
// \param alias The address to be checked.
7136 template<
typename T >
7137 inline bool isAliased(
const T* alias )
const {
7138 return matrix_.isAliased( alias );
// Alignment query, delegated to the wrapped matrix expression.
// NOTE(review): the enclosing function header is elided - presumably isAligned().
7148 return matrix_.isAligned();
// Tail of a boolean condition: the element count rows()*columns() must be at least
// SMP_TDMATDMATMULT_THRESHOLD, and additionally either BLAS matrix/matrix
// multiplication is disabled or the element count is below TDMATDMATMULT_THRESHOLD.
// NOTE(review): the function header and the leading part of the expression are elided -
// presumably canSMPAssign(); confirm against the full file.
7159 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7161 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7162 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// Fragment of the assignment entry point for the scaled product. An empty target and a
// zero inner dimension (left.columns() == 0) are special-cased; otherwise the work is
// dispatched to selectAssignKernel with the operands A and B and the stored scalar.
// NOTE(review): the signature, the branch bodies and the setup of left/A/B are elided
// in this excerpt - presumably the assign() friend function.
7184 template<
typename MT
7196 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7199 else if( left.columns() == 0UL ) {
7214 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment: calls either selectSmallAssignKernel or
// selectBlasAssignKernel. The visible part of the condition compares the element count
// of C against TDMATDMATMULT_THRESHOLD.
// NOTE(review): the remaining template parameters and the first part of the condition
// are elided in this excerpt.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// \param scalar The scaling factor.
7229 template<
typename MT3
7233 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7238 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7239 selectSmallAssignKernel( C, A, B, scalar );
7241 selectBlasAssignKernel( C, A, B, scalar );
// Scaled default assignment kernel fragment that works row by row. For each row i the
// first k step (kbegin) assigns the products, the following k steps accumulate into C,
// and the computed range is finally multiplied by `scalar`. A trailing double loop
// copies (or conjugates, when HERM is set) C(j,i) into C(i,j) for j < i.
// NOTE(review): the signature, the kbegin/kend setup, the leading parts of the
// jbegin/jend ternaries and all guarding conditions are elided in this excerpt -
// presumably an overload of selectDefaultAssignKernel.
7259 template<
typename MT3
7266 const size_t M( A.rows() );
7267 const size_t N( B.columns() );
7268 const size_t K( A.columns() );
7272 for(
size_t i=0UL; i<M; ++i )
7283 for(
size_t j=0UL; j<N; ++j ) {
7292 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
7293 :( UPP ?
max(i,kbegin) : kbegin ) )
7294 :( UPP ? i : 0UL ) );
7297 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
7298 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
7299 :( LOW ? i+1UL : N ) );
// Elements left of the computed range (loop body elided).
7302 for(
size_t j=0UL; j<jbegin; ++j ) {
7307 reset( (~C)(i,0UL) );
// First k step: plain assignment of the products.
7309 for(
size_t j=jbegin; j<jend; ++j ) {
7310 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Elements right of the computed range (loop body elided).
7313 for(
size_t j=jend; j<N; ++j ) {
7318 reset( (~C)(i,N-1UL) );
// Remaining k steps: accumulate into the already initialized elements.
7322 for(
size_t k=kbegin+1UL; k<kend; ++k )
7326 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
7327 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
7328 :( SYM || HERM || UPP ? i : 0UL ) );
7331 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
7332 :( LOW ?
min(i+1UL,k) : k ) )
7333 :( LOW ? i+1UL : N ) );
// Skip this k when the structure flags leave an empty column range.
7335 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7338 for(
size_t j=jbegin; j<jend; ++j ) {
7339 (~C)(i,j) += A(i,k) * B(k,j);
// The element at jend is assigned rather than accumulated (its guard is elided).
7342 (~C)(i,jend) = A(i,k) * B(k,jend);
// Scaling pass over the computed range of row i.
7349 :( SYM || HERM || UPP ? i : 0UL ) );
7352 :( LOW ? i+1UL : N ) );
7354 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7357 for(
size_t j=jbegin; j<jend; ++j ) {
7358 (~C)(i,j) *= scalar;
// Fill the part below the diagonal from the part above it (guarding condition elided).
7364 for(
size_t i=1UL; i<M; ++i ) {
7365 for(
size_t j=0UL; j<i; ++j ) {
7366 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Scaled default assignment kernel fragment that works column by column. For each
// column j the first k step (kbegin) assigns the products, the following k steps
// accumulate into C, and the computed range is finally multiplied by `scalar`. A
// trailing double loop copies (or conjugates, when HERM is set) C(j,i) into C(i,j)
// for i < j.
// NOTE(review): the signature, the kbegin/kend setup, the leading parts of the
// ibegin/iend ternaries and all guarding conditions are elided in this excerpt -
// presumably an overload of selectDefaultAssignKernel.
7387 template<
typename MT3
7394 const size_t M( A.rows() );
7395 const size_t N( B.columns() );
7396 const size_t K( A.columns() );
7400 for(
size_t j=0UL; j<N; ++j )
7411 for(
size_t i=0UL; i<M; ++i ) {
7420 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
7421 :( LOW ?
max(j,kbegin) : kbegin ) )
7422 :( LOW ? j : 0UL ) );
7425 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
7426 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
7427 :( UPP ? j+1UL : M ) );
// Elements above the computed range (loop body elided).
7430 for(
size_t i=0UL; i<ibegin; ++i ) {
7435 reset( (~C)(0UL,j) );
// First k step: plain assignment of the products.
7437 for(
size_t i=ibegin; i<iend; ++i ) {
7438 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Elements below the computed range (loop body elided).
7441 for(
size_t i=iend; i<M; ++i ) {
7446 reset( (~C)(M-1UL,j) );
// Remaining k steps: accumulate into the already initialized elements.
7450 for(
size_t k=kbegin+1UL; k<kend; ++k )
7454 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
7455 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
7456 :( SYM || HERM || LOW ? j : 0UL ) );
7459 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
7460 :( UPP ?
min(j+1UL,k) : k ) )
7461 :( UPP ? j+1UL : M ) );
// Skip this k when the structure flags leave an empty row range.
7463 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7466 for(
size_t i=ibegin; i<iend; ++i ) {
7467 (~C)(i,j) += A(i,k) * B(k,j);
// The element at iend is assigned rather than accumulated (its guard is elided).
7470 (~C)(iend,j) = A(iend,k) * B(k,j);
// Scaling pass over the computed range of column j.
7477 :( SYM || HERM || LOW ? j : 0UL ) );
7480 :( UPP ? j+1UL : M ) );
7482 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7485 for(
size_t i=ibegin; i<iend; ++i ) {
7486 (~C)(i,j) *= scalar;
// Fill the part above the diagonal from the part below it (guarding condition elided).
7492 for(
size_t j=1UL; j<N; ++j ) {
7493 for(
size_t i=0UL; i<j; ++i ) {
7494 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Blocked scaled assignment kernel fragment for a non-diagonal left operand and a
// diagonal right operand (see the EnableIf_ condition below). C is traversed in tiles
// of at most BLOCK_SIZE x BLOCK_SIZE (row tiles outermost); inside the computed column
// range each element becomes A(i,j) * B(j,j) * scalar.
// NOTE(review): the function name, parameter list, the jbegin/jpos setup and the bodies
// of the two outer-range loops are elided in this excerpt.
7515 template<
typename MT3
7519 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7522 constexpr
size_t block( BLOCK_SIZE );
7524 const size_t M( A.rows() );
7525 const size_t N( B.columns() );
// Outer tiling over rows; min() clamps the last tile to the matrix edge.
7527 for(
size_t ii=0UL; ii<M; ii+=block ) {
7528 const size_t iend(
min( M, ii+block ) );
// Inner tiling over columns.
7529 for(
size_t jj=0UL; jj<N; jj+=block ) {
7530 const size_t jend(
min( N, jj+block ) );
7531 for(
size_t i=ii; i<iend; ++i )
// Tile columns before the computed range (loop body elided).
7541 for(
size_t j=jj; j<jbegin; ++j ) {
7545 for(
size_t j=jbegin; j<jpos; ++j ) {
7546 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Tile columns after the computed range (loop body elided).
7549 for(
size_t j=jpos; j<jend; ++j ) {
// Scaled assignment kernel fragment for a non-diagonal left operand and a diagonal
// right operand (see the EnableIf_ condition below), iterating column by column. Inside
// the computed row range each element becomes A(i,j) * B(j,j) * scalar.
// NOTE(review): the function name, parameter list, the ibegin/iend setup and the bodies
// of the two outer-range loops are elided in this excerpt.
7573 template<
typename MT3
7577 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7580 const size_t M( A.rows() );
7581 const size_t N( B.columns() );
7583 for(
size_t j=0UL; j<N; ++j )
// Rows before the computed range (loop body elided).
7594 for(
size_t i=0UL; i<ibegin; ++i ) {
7598 for(
size_t i=ibegin; i<iend; ++i ) {
7599 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Rows after the computed range (loop body elided).
7602 for(
size_t i=iend; i<M; ++i ) {
// Scaled assignment kernel fragment in which only the diagonal of A is read, iterating
// row by row. Inside the computed column range each element becomes
// A(i,i) * B(i,j) * scalar.
// NOTE(review): the signature, the enabling condition, the jbegin/jend setup and the
// bodies of the two outer-range loops are elided in this excerpt.
7624 template<
typename MT3
7631 const size_t M( A.rows() );
7632 const size_t N( B.columns() );
7634 for(
size_t i=0UL; i<M; ++i )
// Columns before the computed range (loop body elided).
7645 for(
size_t j=0UL; j<jbegin; ++j ) {
7649 for(
size_t j=jbegin; j<jend; ++j ) {
7650 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Columns after the computed range (loop body elided).
7653 for(
size_t j=jend; j<N; ++j ) {
// Blocked scaled assignment kernel fragment in which only the diagonal of A is read.
// C is traversed in tiles of at most BLOCK_SIZE x BLOCK_SIZE (column tiles outermost);
// inside the computed row range each element becomes A(i,i) * B(i,j) * scalar.
// NOTE(review): the signature, the enabling condition, the ibegin/ipos setup and the
// bodies of the two outer-range loops are elided in this excerpt.
7675 template<
typename MT3
7682 constexpr
size_t block( BLOCK_SIZE );
7684 const size_t M( A.rows() );
7685 const size_t N( B.columns() );
// Outer tiling over columns; min() clamps the last tile to the matrix edge.
7687 for(
size_t jj=0UL; jj<N; jj+=block ) {
7688 const size_t jend(
min( N, jj+block ) );
// Inner tiling over rows.
7689 for(
size_t ii=0UL; ii<M; ii+=block ) {
7690 const size_t iend(
min( M, ii+block ) );
7691 for(
size_t j=jj; j<jend; ++j )
// Tile rows before the computed range (loop body elided).
7701 for(
size_t i=ii; i<ibegin; ++i ) {
7705 for(
size_t i=ibegin; i<ipos; ++i ) {
7706 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Tile rows after the computed range (loop body elided).
7709 for(
size_t i=ipos; i<iend; ++i ) {
// Scaled default assignment kernel overload that writes only diagonal entries:
// C(i,i) = A(i,i) * B(i,i) * scalar for every row index of A.
// NOTE(review): the enabling condition and any statement preceding the loop are elided -
// presumably both operands are diagonal; confirm against the full file.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// \param scalar The scaling factor.
7733 template<
typename MT3
7738 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7742 for(
size_t i=0UL; i<A.rows(); ++i ) {
7743 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assign kernel that simply forwards to the default assign kernel
// with unchanged arguments. The enable condition selecting this overload is not
// visible in this excerpt.
7762 template<
typename MT3
7767 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7769 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assign kernel computing C = ( A * B ) * scalar.
// SIMD vectors run along the column index j: packed values are loaded from B
// via B.load(k,j) and written to C via store(i,j), while single elements of A
// are broadcast with set(). The function name and enable-condition lines are
// not visible in this excerpt (presumably an overload of
// selectSmallAssignKernel - confirm). Many accumulator updates and the
// computation of kbegin/kend are likewise not visible.
7788 template<
typename MT3
7797 const size_t M( A.rows() );
7798 const size_t N( B.columns() );
7799 const size_t K( A.columns() );
// When `remainder` is set, jpos is N rounded down to a multiple of SIMDSIZE;
// columns [jpos,N) are then handled by the scalar loop near the end.
// (The definition of `remainder` is not visible here.)
7803 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// The scalar factor broadcast into a SIMD value, applied once per store.
7806 const SIMDType factor(
set( scalar ) );
// Branch taken when both LOW and UPP hold; its body is not visible here.
7808 if( LOW && UPP && N > SIMDSIZE*3UL ) {
// Column panels of 8 SIMD vectors, one row at a time. Only used when the result
// has none of the SYM/HERM/LOW/UPP properties.
7817 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7818 for(
size_t i=0UL; i<M; ++i )
7831 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7833 for(
size_t k=kbegin; k<kend; ++k ) {
7834 const SIMDType a1(
set( A(i,k) ) );
7835 xmm1 += a1 * B.load(k,j );
7836 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7837 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7838 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7839 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7840 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7841 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7842 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7845 (~C).store( i, j , xmm1 * factor );
7846 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7847 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7848 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7849 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7850 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
7851 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
7852 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
// Column panels of 5 SIMD vectors: two rows at a time first, then a
// single-row tail. Same structural restriction as the 8-vector panel.
7857 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7861 for( ; (i+2UL) <= M; i+=2UL )
7874 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7876 for(
size_t k=kbegin; k<kend; ++k ) {
7877 const SIMDType a1(
set( A(i ,k) ) );
7878 const SIMDType a2(
set( A(i+1UL,k) ) );
7880 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7881 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7882 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7883 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
// xmm1..xmm5 belong to row i, xmm6..xmm10 to row i+1.
7896 (~C).store( i , j , xmm1 * factor );
7897 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7898 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7899 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7900 (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
7901 (~C).store( i+1UL, j , xmm6 * factor );
7902 (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
7903 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
7904 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
7905 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
// Single-row tail of the 5-vector panel.
7917 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7919 for(
size_t k=kbegin; k<kend; ++k ) {
7920 const SIMDType a1(
set( A(i,k) ) );
7921 xmm1 += a1 * B.load(k,j );
7922 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7923 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7924 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7925 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7928 (~C).store( i, j , xmm1 * factor );
7929 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7930 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7931 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7932 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
// Column panels of 4 SIMD vectors; skipped when both LOW and UPP hold.
7936 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// For SYM/HERM/UPP results the row range stops at the end of the panel; for LOW
// results it starts at row j. Rows outside this range are not computed here.
7938 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
7939 size_t i( LOW ? j : 0UL );
7941 for( ; (i+2UL) <= iend; i+=2UL )
7954 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7956 for(
size_t k=kbegin; k<kend; ++k ) {
7957 const SIMDType a1(
set( A(i ,k) ) );
7958 const SIMDType a2(
set( A(i+1UL,k) ) );
7960 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7961 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7962 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7973 (~C).store( i , j , xmm1 * factor );
7974 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7975 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7976 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7977 (~C).store( i+1UL, j , xmm5 * factor );
7978 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
7979 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
7980 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
// Single-row tail of the 4-vector panel.
7994 for(
size_t k=kbegin; k<kend; ++k ) {
7995 const SIMDType a1(
set( A(i,k) ) );
7996 xmm1 += a1 * B.load(k,j );
7997 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7998 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7999 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8002 (~C).store( i, j , xmm1 * factor );
8003 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8004 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8005 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
// Column panels of 3 SIMD vectors, same row-range rules as the 4-vector panel.
8009 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8011 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
8012 size_t i( LOW ? j : 0UL );
8014 for( ; (i+2UL) <= iend; i+=2UL )
8027 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8029 for(
size_t k=kbegin; k<kend; ++k ) {
8030 const SIMDType a1(
set( A(i ,k) ) );
8031 const SIMDType a2(
set( A(i+1UL,k) ) );
8033 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8034 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8043 (~C).store( i , j , xmm1 * factor );
8044 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
8045 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8046 (~C).store( i+1UL, j , xmm4 * factor );
8047 (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8048 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
// Single-row tail of the 3-vector panel.
8062 for(
size_t k=kbegin; k<kend; ++k ) {
8063 const SIMDType a1(
set( A(i,k) ) );
8064 xmm1 += a1 * B.load(k,j );
8065 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8066 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8069 (~C).store( i, j , xmm1 * factor );
8070 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8071 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
// Column panels of 2 SIMD vectors: rows are processed 4, then 3, then 2 at a
// time, followed by a single-row tail.
8075 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8077 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
8078 size_t i( LOW ? j : 0UL );
8080 for( ; (i+4UL) <= iend; i+=4UL )
8093 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8095 for(
size_t k=kbegin; k<kend; ++k ) {
8096 const SIMDType a1(
set( A(i ,k) ) );
8097 const SIMDType a2(
set( A(i+1UL,k) ) );
8098 const SIMDType a3(
set( A(i+2UL,k) ) );
8099 const SIMDType a4(
set( A(i+3UL,k) ) );
8101 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8112 (~C).store( i , j , xmm1 * factor );
8113 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8114 (~C).store( i+1UL, j , xmm3 * factor );
8115 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8116 (~C).store( i+2UL, j , xmm5 * factor );
8117 (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8118 (~C).store( i+3UL, j , xmm7 * factor );
8119 (~C).store( i+3UL, j+SIMDSIZE, xmm8 * factor );
8122 for( ; (i+3UL) <= iend; i+=3UL )
8135 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8137 for(
size_t k=kbegin; k<kend; ++k ) {
8138 const SIMDType a1(
set( A(i ,k) ) );
8139 const SIMDType a2(
set( A(i+1UL,k) ) );
8140 const SIMDType a3(
set( A(i+2UL,k) ) );
8142 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8151 (~C).store( i , j , xmm1 * factor );
8152 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8153 (~C).store( i+1UL, j , xmm3 * factor );
8154 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8155 (~C).store( i+2UL, j , xmm5 * factor );
8156 (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8159 for( ; (i+2UL) <= iend; i+=2UL )
8172 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// k is unrolled by two: the second k step accumulates into xmm5..xmm8, which
// are added to xmm1..xmm4 at store time.
8175 for( ; (k+2UL) <= kend; k+=2UL ) {
8176 const SIMDType a1(
set( A(i ,k ) ) );
8177 const SIMDType a2(
set( A(i+1UL,k ) ) );
8178 const SIMDType a3(
set( A(i ,k+1UL) ) );
8179 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
8180 const SIMDType b1( B.load(k ,j ) );
8181 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8182 const SIMDType b3( B.load(k+1UL,j ) );
8183 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Leftover k step after the unrolled loop.
8194 for( ; k<kend; ++k ) {
8195 const SIMDType a1(
set( A(i ,k) ) );
8196 const SIMDType a2(
set( A(i+1UL,k) ) );
8198 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8205 (~C).store( i , j , (xmm1+xmm5) * factor );
8206 (~C).store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
8207 (~C).store( i+1UL, j , (xmm3+xmm7) * factor );
8208 (~C).store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
// Single-row tail of the 2-vector panel, k again unrolled by two.
8223 for( ; (k+2UL) <= kend; k+=2UL ) {
8224 const SIMDType a1(
set( A(i,k ) ) );
8225 const SIMDType a2(
set( A(i,k+1UL) ) );
8226 xmm1 += a1 * B.load(k ,j );
8227 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8228 xmm3 += a2 * B.load(k+1UL,j );
8229 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8232 for( ; k<kend; ++k ) {
8233 const SIMDType a1(
set( A(i,k) ) );
8234 xmm1 += a1 * B.load(k,j );
8235 xmm2 += a1 * B.load(k,j+SIMDSIZE);
8238 (~C).store( i, j , (xmm1+xmm3) * factor );
8239 (~C).store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
// Column panels of a single SIMD vector: rows 4, 3, 2 at a time, then one row.
// Every inner k loop is unrolled by two with a single-step leftover loop.
8243 for( ; j<jpos; j+=SIMDSIZE )
8245 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
8246 size_t i( LOW ? j : 0UL );
8248 for( ; (i+4UL) <= iend; i+=4UL )
8259 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8262 for( ; (k+2UL) <= kend; k+=2UL ) {
8264 const SIMDType b2( B.load(k+1UL,j) );
8265 xmm1 +=
set( A(i ,k ) ) * b1;
8266 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8267 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8268 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8269 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8270 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8271 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8272 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
8275 for( ; k<kend; ++k ) {
8277 xmm1 +=
set( A(i ,k) ) * b1;
8278 xmm2 +=
set( A(i+1UL,k) ) * b1;
8279 xmm3 +=
set( A(i+2UL,k) ) * b1;
8280 xmm4 +=
set( A(i+3UL,k) ) * b1;
8283 (~C).store( i , j, (xmm1+xmm5) * factor );
8284 (~C).store( i+1UL, j, (xmm2+xmm6) * factor );
8285 (~C).store( i+2UL, j, (xmm3+xmm7) * factor );
8286 (~C).store( i+3UL, j, (xmm4+xmm8) * factor );
8289 for( ; (i+3UL) <= iend; i+=3UL )
8300 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8303 for( ; (k+2UL) <= kend; k+=2UL ) {
8305 const SIMDType b2( B.load(k+1UL,j) );
8306 xmm1 +=
set( A(i ,k ) ) * b1;
8307 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8308 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8309 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8310 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8311 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
8314 for( ; k<kend; ++k ) {
8316 xmm1 +=
set( A(i ,k) ) * b1;
8317 xmm2 +=
set( A(i+1UL,k) ) * b1;
8318 xmm3 +=
set( A(i+2UL,k) ) * b1;
8321 (~C).store( i , j, (xmm1+xmm4) * factor );
8322 (~C).store( i+1UL, j, (xmm2+xmm5) * factor );
8323 (~C).store( i+2UL, j, (xmm3+xmm6) * factor );
8326 for( ; (i+2UL) <= iend; i+=2UL )
8340 for( ; (k+2UL) <= kend; k+=2UL ) {
8342 const SIMDType b2( B.load(k+1UL,j) );
8343 xmm1 +=
set( A(i ,k ) ) * b1;
8344 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8345 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8346 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
8349 for( ; k<kend; ++k ) {
8351 xmm1 +=
set( A(i ,k) ) * b1;
8352 xmm2 +=
set( A(i+1UL,k) ) * b1;
8355 (~C).store( i , j, (xmm1+xmm3) * factor );
8356 (~C).store( i+1UL, j, (xmm2+xmm4) * factor );
// Single-row tail of the 1-vector panel; here the k loop runs up to K.
8370 for( ; (k+2UL) <= K; k+=2UL ) {
8371 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8372 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
8376 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8379 (~C).store( i, j, (xmm1+xmm2) * factor );
// Scalar (non-SIMD) handling of the leftover columns [jpos,N), two rows at a
// time and then a single-row tail.
8383 for( ; remainder && j<N; ++j )
8385 size_t i( LOW && UPP ? j : 0UL );
8387 for( ; (i+2UL) <= M; i+=2UL )
8401 for(
size_t k=kbegin; k<kend; ++k ) {
8402 value1 += A(i ,k) * B(k,j);
8403 value2 += A(i+1UL,k) * B(k,j);
8406 (~C)(i ,j) = value1 * scalar;
8407 (~C)(i+1UL,j) = value2 * scalar;
8420 for(
size_t k=kbegin; k<K; ++k ) {
8421 value += A(i,k) * B(k,j);
8424 (~C)(i,j) = value * scalar;
// Post-processing of the part of C that the panels above skipped.
// SYM/HERM: elements C(i,j) with j below the start of row i's 4*SIMDSIZE-aligned
// block are copied (conjugated for HERM) from the mirrored element C(j,i).
8429 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
8430 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8431 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8432 for(
size_t j=0UL; j<jend; ++j ) {
8433 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Strictly lower result: visits elements (i,j) with i below the start of column
// j's block; the loop body is not visible here (presumably a reset - confirm).
8437 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
8438 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
8439 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
8440 for(
size_t i=0UL; i<iend; ++i ) {
// Strictly upper result: visits elements (i,j) with j below the start of row
// i's block; the loop body is not visible here (presumably a reset - confirm).
8445 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
8446 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8447 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8448 for(
size_t j=0UL; j<jend; ++j ) {
// Vectorized small-matrix assign kernel computing C = ( A * B ) * scalar.
// SIMD vectors run along the row index i: packed values are loaded from A via
// A.load(i,k) and written to C via store(i,j), while single elements of B are
// broadcast with set(). The function name and enable-condition lines are not
// visible in this excerpt (presumably an overload of selectSmallAssignKernel -
// confirm). Many accumulator updates and the computation of kbegin/kend are
// likewise not visible.
8471 template<
typename MT3
8480 const size_t M( A.rows() );
8481 const size_t N( B.columns() );
8482 const size_t K( A.columns() );
// When `remainder` is set, ipos is M rounded down to a multiple of SIMDSIZE;
// rows [ipos,M) are then handled by the scalar loop near the end.
// (The definition of `remainder` is not visible here.)
8486 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// The scalar factor broadcast into a SIMD value, applied once per store.
8489 const SIMDType factor(
set( scalar ) );
// Branch taken when both LOW and UPP hold; its body is not visible here.
8491 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Row panels of 8 SIMD vectors, one column at a time. Only used when the result
// has none of the SYM/HERM/LOW/UPP properties.
8500 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8501 for(
size_t j=0UL; j<N; ++j )
8514 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8516 for(
size_t k=kbegin; k<kend; ++k ) {
8517 const SIMDType b1(
set( B(k,j) ) );
8518 xmm1 += A.load(i ,k) * b1;
8519 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8520 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8521 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8522 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8523 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8524 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8525 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8528 (~C).store( i , j, xmm1 * factor );
8529 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8530 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8531 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8532 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8533 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
8534 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
8535 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
// Row panels of 5 SIMD vectors: two columns at a time first, then a
// single-column tail. Same structural restriction as the 8-vector panel.
8540 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8544 for( ; (j+2UL) <= N; j+=2UL )
8557 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8559 for(
size_t k=kbegin; k<kend; ++k ) {
8561 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8562 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8563 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8564 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8565 const SIMDType b1(
set( B(k,j ) ) );
8566 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 belong to column j, xmm6..xmm10 to column j+1.
8579 (~C).store( i , j , xmm1 * factor );
8580 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8581 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8582 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8583 (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
8584 (~C).store( i , j+1UL, xmm6 * factor );
8585 (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
8586 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8587 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8588 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
// Single-column tail of the 5-vector panel.
8600 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8602 for(
size_t k=kbegin; k<kend; ++k ) {
8603 const SIMDType b1(
set( B(k,j) ) );
8604 xmm1 += A.load(i ,k) * b1;
8605 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8606 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8607 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8608 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8611 (~C).store( i , j, xmm1 * factor );
8612 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8613 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8614 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8615 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
// Row panels of 4 SIMD vectors; skipped when both LOW and UPP hold.
8619 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// For SYM/HERM/LOW results the column range stops at the end of the panel; for
// UPP results it starts at column i. Columns outside this range are not
// computed here.
8621 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
8622 size_t j( UPP ? i : 0UL );
8624 for( ; (j+2UL) <= jend; j+=2UL )
8637 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8639 for(
size_t k=kbegin; k<kend; ++k ) {
8641 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8642 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8643 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8644 const SIMDType b1(
set( B(k,j ) ) );
8645 const SIMDType b2(
set( B(k,j+1UL) ) );
8656 (~C).store( i , j , xmm1 * factor );
8657 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8658 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8659 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8660 (~C).store( i , j+1UL, xmm5 * factor );
8661 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
8662 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8663 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
// Single-column tail of the 4-vector panel.
8677 for(
size_t k=kbegin; k<kend; ++k ) {
8678 const SIMDType b1(
set( B(k,j) ) );
8679 xmm1 += A.load(i ,k) * b1;
8680 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8681 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8682 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8685 (~C).store( i , j, xmm1 * factor );
8686 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8687 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8688 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
// Row panels of 3 SIMD vectors, same column-range rules as the 4-vector panel.
8692 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8694 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
8695 size_t j( UPP ? i : 0UL );
8697 for( ; (j+2UL) <= jend; j+=2UL )
8710 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8712 for(
size_t k=kbegin; k<kend; ++k ) {
8714 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8715 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8716 const SIMDType b1(
set( B(k,j ) ) );
8717 const SIMDType b2(
set( B(k,j+1UL) ) );
8726 (~C).store( i , j , xmm1 * factor );
8727 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8728 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8729 (~C).store( i , j+1UL, xmm4 * factor );
8730 (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
8731 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
// Single-column tail of the 3-vector panel.
8745 for(
size_t k=kbegin; k<kend; ++k ) {
8746 const SIMDType b1(
set( B(k,j) ) );
8747 xmm1 += A.load(i ,k) * b1;
8748 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8749 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8752 (~C).store( i , j, xmm1 * factor );
8753 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8754 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
// Row panels of 2 SIMD vectors: columns are processed 4, then 3, then 2 at a
// time, followed by a single-column tail.
8758 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8760 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
8761 size_t j( UPP ? i : 0UL );
8763 for( ; (j+4UL) <= jend; j+=4UL )
8776 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8778 for(
size_t k=kbegin; k<kend; ++k ) {
8780 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8781 const SIMDType b1(
set( B(k,j ) ) );
8782 const SIMDType b2(
set( B(k,j+1UL) ) );
8783 const SIMDType b3(
set( B(k,j+2UL) ) );
8784 const SIMDType b4(
set( B(k,j+3UL) ) );
8795 (~C).store( i , j , xmm1 * factor );
8796 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8797 (~C).store( i , j+1UL, xmm3 * factor );
8798 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8799 (~C).store( i , j+2UL, xmm5 * factor );
8800 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8801 (~C).store( i , j+3UL, xmm7 * factor );
8802 (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
8805 for( ; (j+3UL) <= jend; j+=3UL )
8818 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8820 for(
size_t k=kbegin; k<kend; ++k ) {
8822 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8823 const SIMDType b1(
set( B(k,j ) ) );
8824 const SIMDType b2(
set( B(k,j+1UL) ) );
8825 const SIMDType b3(
set( B(k,j+2UL) ) );
8834 (~C).store( i , j , xmm1 * factor );
8835 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8836 (~C).store( i , j+1UL, xmm3 * factor );
8837 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8838 (~C).store( i , j+2UL, xmm5 * factor );
8839 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8842 for( ; (j+2UL) <= jend; j+=2UL )
8855 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// k is unrolled by two: the second k step accumulates into xmm5..xmm8, which
// are added to xmm1..xmm4 at store time.
8858 for( ; (k+2UL) <= kend; k+=2UL ) {
8859 const SIMDType a1( A.load(i ,k ) );
8860 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8861 const SIMDType a3( A.load(i ,k+1UL) );
8862 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8863 const SIMDType b1(
set( B(k ,j ) ) );
8864 const SIMDType b2(
set( B(k ,j+1UL) ) );
8865 const SIMDType b3(
set( B(k+1UL,j ) ) );
8866 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k step after the unrolled loop.
8877 for( ; k<kend; ++k ) {
8879 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8880 const SIMDType b1(
set( B(k,j ) ) );
8881 const SIMDType b2(
set( B(k,j+1UL) ) );
8888 (~C).store( i , j , (xmm1+xmm5) * factor );
8889 (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
8890 (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
8891 (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
// Single-column tail of the 2-vector panel, k again unrolled by two.
8906 for( ; (k+2UL) <= kend; k+=2UL ) {
8907 const SIMDType b1(
set( B(k ,j) ) );
8908 const SIMDType b2(
set( B(k+1UL,j) ) );
8909 xmm1 += A.load(i ,k ) * b1;
8910 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8911 xmm3 += A.load(i ,k+1UL) * b2;
8912 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8915 for( ; k<kend; ++k ) {
8916 const SIMDType b1(
set( B(k,j) ) );
8917 xmm1 += A.load(i ,k) * b1;
8918 xmm2 += A.load(i+SIMDSIZE,k) * b1;
8921 (~C).store( i , j, (xmm1+xmm3) * factor );
8922 (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
// Row panels of a single SIMD vector: columns 4, 3, 2 at a time, then one
// column. The inner k loops are unrolled by two with a single-step leftover.
8926 for( ; i<ipos; i+=SIMDSIZE )
8928 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
8929 size_t j( UPP ? i : 0UL );
8931 for( ; (j+4UL) <= jend; j+=4UL )
8942 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8945 for( ; (k+2UL) <= kend; k+=2UL ) {
8947 const SIMDType a2( A.load(i,k+1UL) );
8948 xmm1 += a1 *
set( B(k ,j ) );
8949 xmm2 += a1 *
set( B(k ,j+1UL) );
8950 xmm3 += a1 *
set( B(k ,j+2UL) );
8951 xmm4 += a1 *
set( B(k ,j+3UL) );
8952 xmm5 += a2 *
set( B(k+1UL,j ) );
8953 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8954 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8955 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
8958 for( ; k<kend; ++k ) {
8960 xmm1 += a1 *
set( B(k,j ) );
8961 xmm2 += a1 *
set( B(k,j+1UL) );
8962 xmm3 += a1 *
set( B(k,j+2UL) );
8963 xmm4 += a1 *
set( B(k,j+3UL) );
8966 (~C).store( i, j , (xmm1+xmm5) * factor );
8967 (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
8968 (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
8969 (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
8972 for( ; (j+3UL) <= jend; j+=3UL )
8983 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8986 for( ; (k+2UL) <= kend; k+=2UL ) {
8988 const SIMDType a2( A.load(i,k+1UL) );
8989 xmm1 += a1 *
set( B(k ,j ) );
8990 xmm2 += a1 *
set( B(k ,j+1UL) );
8991 xmm3 += a1 *
set( B(k ,j+2UL) );
8992 xmm4 += a2 *
set( B(k+1UL,j ) );
8993 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
8994 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
8997 for( ; k<kend; ++k ) {
8999 xmm1 += a1 *
set( B(k,j ) );
9000 xmm2 += a1 *
set( B(k,j+1UL) );
9001 xmm3 += a1 *
set( B(k,j+2UL) );
9004 (~C).store( i, j , (xmm1+xmm4) * factor );
9005 (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
9006 (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
9009 for( ; (j+2UL) <= jend; j+=2UL )
// NOTE(review): in this two-column section the single-step k loop comes BEFORE
// the two-step unrolled loop, the reverse of every sibling section in this
// kernel. Neither loop has an init clause, so if both share one k counter
// (its declaration is not visible here) the first loop runs k up to kend and
// the unrolled loop below never executes, leaving xmm3/xmm4 without any
// contribution. Results would be unaffected, but the unrolling would be dead
// code - confirm against the full file and swap the two loops if so.
9023 for( ; k<kend; ++k ) {
9025 xmm1 += a1 *
set( B(k,j ) );
9026 xmm2 += a1 *
set( B(k,j+1UL) );
9029 for( ; (k+2UL) <= kend; k+=2UL ) {
9031 const SIMDType a2( A.load(i,k+1UL) );
9032 xmm1 += a1 *
set( B(k ,j ) );
9033 xmm2 += a1 *
set( B(k ,j+1UL) );
9034 xmm3 += a2 *
set( B(k+1UL,j ) );
9035 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
9038 (~C).store( i, j , (xmm1+xmm3) * factor );
9039 (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
// Single-column tail of the 1-vector panel; here the k loop runs up to K.
9053 for( ; (k+2UL) <= K; k+=2UL ) {
9054 xmm1 += A.load(i,k ) *
set( B(k ,j) );
9055 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
9059 xmm1 += A.load(i,k) *
set( B(k,j) );
9062 (~C).store( i, j, (xmm1+xmm2) * factor );
// Scalar (non-SIMD) handling of the leftover rows [ipos,M), two columns at a
// time and then a single-column tail.
9066 for( ; remainder && i<M; ++i )
9068 size_t j( LOW && UPP ? i : 0UL );
9070 for( ; (j+2UL) <= N; j+=2UL )
9084 for(
size_t k=kbegin; k<kend; ++k ) {
9085 value1 += A(i,k) * B(k,j );
9086 value2 += A(i,k) * B(k,j+1UL);
9089 (~C)(i,j ) = value1 * scalar;
9090 (~C)(i,j+1UL) = value2 * scalar;
9103 for(
size_t k=kbegin; k<K; ++k ) {
9104 value += A(i,k) * B(k,j);
9107 (~C)(i,j) = value * scalar;
// Post-processing of the part of C that the panels above skipped.
// SYM/HERM: elements C(i,j) with i below the start of column j's
// 4*SIMDSIZE-aligned block are copied (conjugated for HERM) from C(j,i).
9112 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
9113 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9114 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9115 for(
size_t i=0UL; i<iend; ++i ) {
9116 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Strictly lower result: visits elements (i,j) with i below the start of column
// j's block; the loop body is not visible here (presumably a reset - confirm).
9120 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
9121 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9122 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9123 for(
size_t i=0UL; i<iend; ++i ) {
// Strictly upper result: visits elements (i,j) with j below the start of row
// i's block; the loop body is not visible here (presumably a reset - confirm).
9128 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
9129 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
9130 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
9131 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assign kernel that simply forwards to the default assign kernel
// with unchanged arguments. The enable condition selecting this overload is not
// visible in this excerpt.
9153 template<
typename MT3
9158 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9160 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assign kernel that dispatches to exactly one of the
// smmm/hmmm/lmmm/ummm/mmm routines. The conditions choosing between the five
// calls are not visible in this excerpt (presumably the SYM/HERM/LOW/UPP flags,
// in that order - confirm).
9179 template<
typename MT3
9184 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// smmm and hmmm take only the scaling factor ...
9187 smmm( C, A, B, scalar );
9189 hmmm( C, A, B, scalar );
// ... whereas lmmm, ummm and mmm receive an additional ST2(0) argument
// (presumably the factor applied to the existing content of C - confirm).
9191 lmmm( C, A, B, scalar, ST2(0) );
9193 ummm( C, A, B, scalar, ST2(0) );
9195 mmm( C, A, B, scalar, ST2(0) );
// BLAS assign kernel fallback: forwards to the large-matrix assign kernel with
// unchanged arguments. The enable condition selecting this overload is not
// visible in this excerpt.
9213 template<
typename MT3
9218 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9220 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-backed assign kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled. Three code paths are
// visible; the conditions choosing between them and any statements preceding
// the trmm calls are not visible in this excerpt.
9225 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 9239 template<
typename MT3
9244 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular multiplication with A applied from the left; the triangle is
// chosen by whether MT4 is lower.
9250 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Triangular multiplication with B applied from the right; the triangle is
// chosen by whether MT5 is lower.
9254 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General multiplication with the scalar as first factor and ET(0) as second.
9257 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled product through a temporary: the expression is
// evaluated serially into a TmpType object, which is then passed through a
// ForwardFunctor and assigned to the target. The function name, signature and
// the definitions of ForwardFunctor/TmpType are not visible in this excerpt.
9275 template<
typename MT
9293 const ForwardFunctor fwd;
// serial() evaluates rhs into the temporary without parallel execution.
9295 const TmpType tmp(
serial( rhs ) );
9296 assign( ~lhs, fwd( tmp ) );
// Addition assignment of the scaled product to a matrix. The function name and
// signature are not visible in this excerpt (presumably addAssign - confirm).
9312 template<
typename MT
// Degenerate sizes: an empty target or a zero inner dimension. The body of this
// branch is not visible here (presumably an early return - confirm).
9324 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel dispatcher, passing the scalar of the expression.
9338 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Dispatcher for the addition-assignment kernels: chooses between the small
// kernel and the BLAS kernel. Only the last part of the selection condition is
// visible in this excerpt.
9353 template<
typename MT3
9357 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Targets with fewer elements than TDMATDMATMULT_THRESHOLD go to the small
// kernel (the preceding parts of the condition are not visible here) ...
9362 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9363 selectSmallAddAssignKernel( C, A, B, scalar );
// ... everything else goes to the BLAS kernel (the `else` line itself is not
// visible here).
9365 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel that adds a previously built temporary to
// C. The construction of `tmp` and the enable condition are not visible in this
// excerpt (presumably tmp holds the evaluated scaled product - confirm).
9383 template<
typename MT3
9388 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9391 addAssign( C, tmp );
// Scaled default addition-assignment kernel for a general left operand and a
// diagonal right operand (enabled when MT4 is not diagonal and MT5 is
// diagonal), traversing C in square tiles. The function name line is not
// visible in this excerpt.
9409 template<
typename MT3
9413 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Edge length of the square tiles used to traverse C.
9416 constexpr
size_t block( BLOCK_SIZE );
9418 const size_t M( A.rows() );
9419 const size_t N( B.columns() );
// Row tiles [ii,iend) outside, column tiles [jj,jend) inside.
9421 for(
size_t ii=0UL; ii<M; ii+=block ) {
9422 const size_t iend(
min( M, ii+block ) );
9423 for(
size_t jj=0UL; jj<N; jj+=block ) {
9424 const size_t jend(
min( N, jj+block ) );
9425 for(
size_t i=ii; i<iend; ++i )
// Only columns [jbegin,jpos) are updated (their computation is not visible
// here); B is diagonal, so only B(j,j) contributes to C(i,j).
9434 for(
size_t j=jbegin; j<jpos; ++j ) {
9435 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Scaled default addition-assignment kernel for a general left operand and a
// diagonal right operand (enabled when MT4 is not diagonal and MT5 is
// diagonal), traversing C column by column with the row loop unrolled by two.
// The function name line is not visible in this excerpt.
9457 template<
typename MT3
9461 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
9464 const size_t M( A.rows() );
9465 const size_t N( B.columns() );
9467 for(
size_t j=0UL; j<N; ++j )
// inum rows are updated in column j; ipos marks the end of the largest even
// number of them (inum with its lowest bit cleared). The computation of
// ibegin/iend is not visible here.
9477 const size_t inum( iend - ibegin );
9478 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
9480 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9481 (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
9482 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Leftover row when inum is odd (the guarding condition is not visible here).
9485 (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Scaled default addition-assignment kernel that uses only the diagonal element
// A(i,i) of the left operand, traversing C row by row with the column loop
// unrolled by two. The enable condition and function name lines are not visible
// in this excerpt.
9505 template<
typename MT3
9512 const size_t M( A.rows() );
9513 const size_t N( B.columns() );
9515 for(
size_t i=0UL; i<M; ++i )
// jnum columns are updated in row i; jpos marks the end of the largest even
// number of them (jnum with its lowest bit cleared). The computation of
// jbegin/jend is not visible here.
9525 const size_t jnum( jend - jbegin );
9526 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
9528 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9529 (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
9530 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Leftover column when jnum is odd (the guarding condition is not visible here).
9533 (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Scaled default addition-assignment kernel that uses only the diagonal element
// A(i,i) of the left operand, traversing C in square tiles with the column
// tiles in the outer loop. The enable condition and function name lines are not
// visible in this excerpt.
9553 template<
typename MT3
// Edge length of the square tiles used to traverse C.
9560 constexpr
size_t block( BLOCK_SIZE );
9562 const size_t M( A.rows() );
9563 const size_t N( B.columns() );
// Column tiles [jj,jend) outside, row tiles [ii,iend) inside.
9565 for(
size_t jj=0UL; jj<N; jj+=block ) {
9566 const size_t jend(
min( N, jj+block ) );
9567 for(
size_t ii=0UL; ii<M; ii+=block ) {
9568 const size_t iend(
min( M, ii+block ) );
9569 for(
size_t j=jj; j<jend; ++j )
// Only rows [ibegin,ipos) are updated (their computation is not visible here).
9578 for(
size_t i=ibegin; i<ipos; ++i ) {
9579 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Scaled default addition-assignment kernel that touches only the diagonal of
// C: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
// The enable condition is not visible in this excerpt (presumably both operands
// are diagonal - confirm).
9601 template<
typename MT3
9606 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9608 for(
size_t i=0UL; i<A.rows(); ++i ) {
9609 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition-assignment kernel that simply forwards to the default
// addition-assignment kernel with unchanged arguments. The enable condition
// selecting this overload is not visible in this excerpt.
9628 template<
typename MT3
9633 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9635 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized addition-assignment kernel computing C += ( A * B ) * scalar, with SIMD
// loads/stores running along the columns (index j) of B and C and single elements of A
// broadcast via set(). The columns are consumed in panels of 8, 5, 4, 3, 2 and finally 1
// SIMD vector(s); per panel, the rows are processed in small groups. Each group
// accumulates its dot products over k in SIMD accumulators (xmm*) and finally stores
// C.load() + accumulator * factor back into C.
// NOTE(review): the function name/signature, the enabling constraint and the definitions
// of remainder, j, i, kbegin and kend are not part of this excerpt; presumably this is
// the row-major overload of the vectorized small kernel.
9654 template<
typename MT3
9663 const size_t M( A.rows() );
9664 const size_t N( B.columns() );
9665 const size_t K( A.columns() );
// End of the column range handled with SIMD operations: if remainder is set, the low
// bits of N are masked off via size_t(-SIMDSIZE); otherwise all N columns are covered.
9669 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// The scalar factor broadcast into a SIMD vector.
9672 const SIMDType factor(
set( scalar ) );
// Panels of 8 SIMD vectors, one row at a time (only if neither LOW nor UPP is set).
9678 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9679 for(
size_t i=0UL; i<M; ++i )
9692 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9694 for(
size_t k=kbegin; k<kend; ++k ) {
9695 const SIMDType a1(
set( A(i,k) ) );
9696 xmm1 += a1 * B.load(k,j );
9697 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9698 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9699 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9700 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9701 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9702 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9703 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
9706 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9707 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9708 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9709 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9710 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9711 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
9712 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
9713 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
// Panels of 5 SIMD vectors (only if neither LOW nor UPP is set): two rows at a time,
// followed by the code for a single row.
9718 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
9722 for( ; (i+2UL) <= M; i+=2UL )
9735 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9737 for(
size_t k=kbegin; k<kend; ++k ) {
9738 const SIMDType a1(
set( A(i ,k) ) );
9739 const SIMDType a2(
set( A(i+1UL,k) ) );
9741 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9742 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9743 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9744 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9757 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9758 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9759 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9760 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9761 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
9762 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
9763 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
9764 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
9765 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
9766 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
9778 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9780 for(
size_t k=kbegin; k<kend; ++k ) {
9781 const SIMDType a1(
set( A(i,k) ) );
9782 xmm1 += a1 * B.load(k,j );
9783 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9784 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9785 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9786 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9789 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9790 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9791 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9792 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9793 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
// Panels of 4 SIMD vectors (only if neither LOW nor UPP is set): two rows at a time,
// followed by the code for a single row.
9797 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9801 for( ; (i+2UL) <= M; i+=2UL )
9814 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9816 for(
size_t k=kbegin; k<kend; ++k ) {
9817 const SIMDType a1(
set( A(i ,k) ) );
9818 const SIMDType a2(
set( A(i+1UL,k) ) );
9820 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9821 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9822 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9833 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9834 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9835 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9836 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9837 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
9838 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
9839 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
9840 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
9854 for(
size_t k=kbegin; k<kend; ++k ) {
9855 const SIMDType a1(
set( A(i,k) ) );
9856 xmm1 += a1 * B.load(k,j );
9857 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9858 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9859 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9862 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9863 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9864 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9865 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
// Panels of 3 SIMD vectors (only if neither LOW nor UPP is set): two rows at a time,
// followed by the code for a single row.
9869 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
9873 for( ; (i+2UL) <= M; i+=2UL )
9886 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9888 for(
size_t k=kbegin; k<kend; ++k ) {
9889 const SIMDType a1(
set( A(i ,k) ) );
9890 const SIMDType a2(
set( A(i+1UL,k) ) );
9892 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9893 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9902 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9903 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9904 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9905 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
9906 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
9907 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
9921 for(
size_t k=kbegin; k<kend; ++k ) {
9922 const SIMDType a1(
set( A(i,k) ) );
9923 xmm1 += a1 * B.load(k,j );
9924 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9925 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9928 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9929 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9930 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
// Panels of 2 SIMD vectors (skipped if both LOW and UPP are set). The row range is
// limited by iend for UPP and starts at row j for LOW. Rows are processed in groups
// of 4, 3 and 2, followed by the code for a single row.
9934 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9936 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
9937 size_t i( LOW ? j : 0UL );
9939 for( ; (i+4UL) <= iend; i+=4UL )
9952 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9954 for(
size_t k=kbegin; k<kend; ++k ) {
9955 const SIMDType a1(
set( A(i ,k) ) );
9956 const SIMDType a2(
set( A(i+1UL,k) ) );
9957 const SIMDType a3(
set( A(i+2UL,k) ) );
9958 const SIMDType a4(
set( A(i+3UL,k) ) );
9960 const SIMDType b2( B.load(k,j+SIMDSIZE) );
9971 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9972 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
9973 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9974 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
9975 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
9976 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
9977 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
9978 (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
9981 for( ; (i+3UL) <= iend; i+=3UL )
9994 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9996 for(
size_t k=kbegin; k<kend; ++k ) {
9997 const SIMDType a1(
set( A(i ,k) ) );
9998 const SIMDType a2(
set( A(i+1UL,k) ) );
9999 const SIMDType a3(
set( A(i+2UL,k) ) );
10000 const SIMDType b1( B.load(k,j ) );
10001 const SIMDType b2( B.load(k,j+SIMDSIZE) );
10010 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10011 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
10012 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
10013 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10014 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
10015 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
// Two rows at a time; the k loop is unrolled by two with a second set of accumulators
// (xmm5..xmm8) whose sums are merged with xmm1..xmm4 in the final stores.
10018 for( ; (i+2UL) <= iend; i+=2UL )
10031 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10032 size_t k( kbegin );
10034 for( ; (k+2UL) <= kend; k+=2UL ) {
10035 const SIMDType a1(
set( A(i ,k ) ) );
10036 const SIMDType a2(
set( A(i+1UL,k ) ) );
10037 const SIMDType a3(
set( A(i ,k+1UL) ) );
10038 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
10039 const SIMDType b1( B.load(k ,j ) );
10040 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
10041 const SIMDType b3( B.load(k+1UL,j ) );
10042 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
10053 for( ; k<kend; ++k ) {
10054 const SIMDType a1(
set( A(i ,k) ) );
10055 const SIMDType a2(
set( A(i+1UL,k) ) );
10056 const SIMDType b1( B.load(k,j ) );
10057 const SIMDType b2( B.load(k,j+SIMDSIZE) );
10064 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10065 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
10066 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + (xmm3+xmm7) * factor );
10067 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
// Single row: xmm1/xmm2 accumulate the products for k, xmm3/xmm4 those for k+1.
10080 size_t k( kbegin );
10082 for( ; (k+2UL) <= kend; k+=2UL ) {
10083 const SIMDType a1(
set( A(i,k ) ) );
10084 const SIMDType a2(
set( A(i,k+1UL) ) );
10085 xmm1 += a1 * B.load(k ,j );
10086 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
10087 xmm3 += a2 * B.load(k+1UL,j );
10088 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
10091 for( ; k<kend; ++k ) {
10092 const SIMDType a1(
set( A(i,k) ) );
10093 xmm1 += a1 * B.load(k,j );
10094 xmm2 += a1 * B.load(k,j+SIMDSIZE);
10097 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10098 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
// Panels of a single SIMD vector for all remaining columns below jpos. Rows are
// processed in groups of 4, 3 and 2, followed by the code for a single row; in all
// groups the k loop is unrolled by two with separate accumulators for k and k+1.
10102 for( ; j<jpos; j+=SIMDSIZE )
10104 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
10105 size_t i( LOW ? j : 0UL );
10107 for( ; (i+4UL) <= iend; i+=4UL )
10118 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10119 size_t k( kbegin );
10121 for( ; (k+2UL) <= kend; k+=2UL ) {
10122 const SIMDType b1( B.load(k ,j) );
10123 const SIMDType b2( B.load(k+1UL,j) );
10124 xmm1 +=
set( A(i ,k ) ) * b1;
10125 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10126 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10127 xmm4 +=
set( A(i+3UL,k ) ) * b1;
10128 xmm5 +=
set( A(i ,k+1UL) ) * b2;
10129 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
10130 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
10131 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
10134 for( ; k<kend; ++k ) {
10136 xmm1 +=
set( A(i ,k) ) * b1;
10137 xmm2 +=
set( A(i+1UL,k) ) * b1;
10138 xmm3 +=
set( A(i+2UL,k) ) * b1;
10139 xmm4 +=
set( A(i+3UL,k) ) * b1;
10142 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm5) * factor );
10143 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm6) * factor );
10144 (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm7) * factor );
10145 (~C).store( i+3UL, j, (~C).load(i+3UL,j) + (xmm4+xmm8) * factor );
10148 for( ; (i+3UL) <= iend; i+=3UL )
10159 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10160 size_t k( kbegin );
10162 for( ; (k+2UL) <= kend; k+=2UL ) {
10163 const SIMDType b1( B.load(k ,j) );
10164 const SIMDType b2( B.load(k+1UL,j) );
10165 xmm1 +=
set( A(i ,k ) ) * b1;
10166 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10167 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10168 xmm4 +=
set( A(i ,k+1UL) ) * b2;
10169 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
10170 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
10173 for( ; k<kend; ++k ) {
10175 xmm1 +=
set( A(i ,k) ) * b1;
10176 xmm2 +=
set( A(i+1UL,k) ) * b1;
10177 xmm3 +=
set( A(i+2UL,k) ) * b1;
10180 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm4) * factor );
10181 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm5) * factor );
10182 (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm6) * factor );
10185 for( ; (i+2UL) <= iend; i+=2UL )
10197 size_t k( kbegin );
10199 for( ; (k+2UL) <= kend; k+=2UL ) {
10200 const SIMDType b1( B.load(k ,j) );
10201 const SIMDType b2( B.load(k+1UL,j) );
10202 xmm1 +=
set( A(i ,k ) ) * b1;
10203 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10204 xmm3 +=
set( A(i ,k+1UL) ) * b2;
10205 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
10208 for( ; k<kend; ++k ) {
10210 xmm1 +=
set( A(i ,k) ) * b1;
10211 xmm2 +=
set( A(i+1UL,k) ) * b1;
10214 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10215 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm4) * factor );
// Single row. NOTE(review): unlike the groups above, this k loop runs up to K instead
// of kend; the reason is not visible in this excerpt.
10227 size_t k( kbegin );
10229 for( ; (k+2UL) <= K; k+=2UL ) {
10230 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
10231 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
10234 for( ; k<K; ++k ) {
10235 xmm1 +=
set( A(i,k) ) * B.load(k,j);
10238 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
// Scalar (non-SIMD) handling of the remaining columns up to N if remainder is set:
// two rows at a time, followed by the code for a single row.
10242 for( ; remainder && j<N; ++j )
10244 const size_t iend( UPP ? j+1UL : M );
10245 size_t i( LOW ? j : 0UL );
10247 for( ; (i+2UL) <= iend; i+=2UL )
10261 for(
size_t k=kbegin; k<kend; ++k ) {
10262 value1 += A(i ,k) * B(k,j);
10263 value2 += A(i+1UL,k) * B(k,j);
10266 (~C)(i ,j) += value1 * scalar;
10267 (~C)(i+1UL,j) += value2 * scalar;
10280 for(
size_t k=kbegin; k<K; ++k ) {
10281 value += A(i,k) * B(k,j);
10284 (~C)(i,j) += value * scalar;
// Vectorized addition-assignment kernel computing C += ( A * B ) * scalar, with SIMD
// loads/stores running along the rows (index i) of A and C and single elements of B
// broadcast via set(). The rows are consumed in panels of 8, 5, 4, 3, 2 and finally 1
// SIMD vector(s); per panel, the columns are processed in small groups. Each group
// accumulates its dot products over k in SIMD accumulators (xmm*) and finally stores
// C.load() + accumulator * factor back into C.
// NOTE(review): the function name/signature, the enabling constraint and the definitions
// of remainder, i, j, kbegin and kend are not part of this excerpt; presumably this is
// the column-major overload of the vectorized small kernel.
10305 template<
typename MT3
10314 const size_t M( A.rows() );
10315 const size_t N( B.columns() );
10316 const size_t K( A.columns() );
// End of the row range handled with SIMD operations: if remainder is set, M is rounded
// down to a multiple of SIMDSIZE (checked by the assertion below); otherwise all M rows
// are covered.
10320 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
10321 BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// The scalar factor broadcast into a SIMD vector.
10323 const SIMDType factor(
set( scalar ) );
// Panels of 8 SIMD vectors, one column at a time (only if neither LOW nor UPP is set).
10329 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10330 for(
size_t j=0UL; j<N; ++j )
10343 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10345 for(
size_t k=kbegin; k<kend; ++k ) {
10346 const SIMDType b1(
set( B(k,j) ) );
10347 xmm1 += A.load(i ,k) * b1;
10348 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10349 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10350 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10351 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10352 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
10353 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
10354 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
10357 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10358 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10359 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10360 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10361 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10362 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
10363 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
10364 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// Panels of 5 SIMD vectors (only if neither LOW nor UPP is set): two columns at a time,
// followed by the code for a single column.
10369 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
10373 for( ; (j+2UL) <= N; j+=2UL )
10386 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10388 for(
size_t k=kbegin; k<kend; ++k ) {
10389 const SIMDType a1( A.load(i ,k) );
10390 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10391 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10392 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10393 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
10394 const SIMDType b1(
set( B(k,j ) ) );
10395 const SIMDType b2(
set( B(k,j+1UL) ) );
10408 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10409 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10410 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10411 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10412 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
10413 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
10414 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
10415 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10416 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10417 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
10429 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10431 for(
size_t k=kbegin; k<kend; ++k ) {
10432 const SIMDType b1(
set( B(k,j) ) );
10433 xmm1 += A.load(i ,k) * b1;
10434 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10435 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10436 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10437 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10440 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10441 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10442 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10443 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10444 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
// Panels of 4 SIMD vectors (only if neither LOW nor UPP is set): two columns at a time,
// followed by the code for a single column.
10448 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10452 for( ; (j+2UL) <= N; j+=2UL )
10465 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10467 for(
size_t k=kbegin; k<kend; ++k ) {
10468 const SIMDType a1( A.load(i ,k) );
10469 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10470 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10471 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10472 const SIMDType b1(
set( B(k,j ) ) );
10473 const SIMDType b2(
set( B(k,j+1UL) ) );
10484 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10485 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10486 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10487 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10488 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
10489 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
10490 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10491 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
10505 for(
size_t k=kbegin; k<kend; ++k ) {
10506 const SIMDType b1(
set( B(k,j) ) );
10507 xmm1 += A.load(i ,k) * b1;
10508 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10509 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10510 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10513 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10514 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10515 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10516 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// Panels of 3 SIMD vectors (only if neither LOW nor UPP is set): two columns at a time,
// followed by the code for a single column.
10520 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
10524 for( ; (j+2UL) <= N; j+=2UL )
10537 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10539 for(
size_t k=kbegin; k<kend; ++k ) {
10540 const SIMDType a1( A.load(i ,k) );
10541 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10542 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10543 const SIMDType b1(
set( B(k,j ) ) );
10544 const SIMDType b2(
set( B(k,j+1UL) ) );
10553 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10554 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10555 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10556 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
10557 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
10558 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
10572 for(
size_t k=kbegin; k<kend; ++k ) {
10573 const SIMDType b1(
set( B(k,j) ) );
10574 xmm1 += A.load(i ,k) * b1;
10575 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10576 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10579 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10580 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10581 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
// Panels of 2 SIMD vectors (skipped if both LOW and UPP are set). The column range is
// limited by jend for LOW and starts at column i for UPP. Columns are processed in
// groups of 4, 3 and 2, followed by the code for a single column.
10585 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10587 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
10588 size_t j( UPP ? i : 0UL );
10590 for( ; (j+4UL) <= jend; j+=4UL )
10603 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10605 for(
size_t k=kbegin; k<kend; ++k ) {
10606 const SIMDType a1( A.load(i ,k) );
10607 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10608 const SIMDType b1(
set( B(k,j ) ) );
10609 const SIMDType b2(
set( B(k,j+1UL) ) );
10610 const SIMDType b3(
set( B(k,j+2UL) ) );
10611 const SIMDType b4(
set( B(k,j+3UL) ) );
10622 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10623 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10624 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10625 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10626 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10627 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10628 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
10629 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
// Three columns at a time. NOTE(review): eight accumulators are declared here, but the
// stores visible in this excerpt only use xmm1..xmm6.
10632 for( ; (j+3UL) <= jend; j+=3UL )
10645 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10647 for(
size_t k=kbegin; k<kend; ++k ) {
10648 const SIMDType a1( A.load(i ,k) );
10649 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10650 const SIMDType b1(
set( B(k,j ) ) );
10651 const SIMDType b2(
set( B(k,j+1UL) ) );
10652 const SIMDType b3(
set( B(k,j+2UL) ) );
10661 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10662 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10663 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10664 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10665 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10666 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
// Two columns at a time; the k loop is unrolled by two with a second set of accumulators
// (xmm5..xmm8) whose sums are merged with xmm1..xmm4 in the final stores.
10669 for( ; (j+2UL) <= jend; j+=2UL )
10682 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10683 size_t k( kbegin );
10685 for( ; (k+2UL) <= kend; k+=2UL ) {
10686 const SIMDType a1( A.load(i ,k ) );
10687 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
10688 const SIMDType a3( A.load(i ,k+1UL) );
10689 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
10690 const SIMDType b1(
set( B(k ,j ) ) );
10691 const SIMDType b2(
set( B(k ,j+1UL) ) );
10692 const SIMDType b3(
set( B(k+1UL,j ) ) );
10693 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
10704 for( ; k<kend; ++k ) {
10705 const SIMDType a1( A.load(i ,k) );
10706 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10707 const SIMDType b1(
set( B(k,j ) ) );
10708 const SIMDType b2(
set( B(k,j+1UL) ) );
10715 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10716 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
10717 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
10718 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// Single column: xmm1/xmm2 accumulate the products for k, xmm3/xmm4 those for k+1.
10731 size_t k( kbegin );
10733 for( ; (k+2UL) <= kend; k+=2UL ) {
10734 const SIMDType b1(
set( B(k ,j) ) );
10735 const SIMDType b2(
set( B(k+1UL,j) ) );
10736 xmm1 += A.load(i ,k ) * b1;
10737 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
10738 xmm3 += A.load(i ,k+1UL) * b2;
10739 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
10742 for( ; k<kend; ++k ) {
10743 const SIMDType b1(
set( B(k,j) ) );
10744 xmm1 += A.load(i ,k) * b1;
10745 xmm2 += A.load(i+SIMDSIZE,k) * b1;
10748 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10749 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
// Panels of a single SIMD vector for all remaining rows below ipos. Columns are
// processed in groups of 4, 3 and 2, followed by the code for a single column; in all
// groups the k loop is unrolled by two with separate accumulators for k and k+1.
10753 for( ; i<ipos; i+=SIMDSIZE )
10755 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
10756 size_t j( UPP ? i : 0UL );
10758 for( ; (j+4UL) <= jend; j+=4UL )
10769 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10770 size_t k( kbegin );
10772 for( ; (k+2UL) <= kend; k+=2UL ) {
10773 const SIMDType a1( A.load(i,k ) );
10774 const SIMDType a2( A.load(i,k+1UL) );
10775 xmm1 += a1 *
set( B(k ,j ) );
10776 xmm2 += a1 *
set( B(k ,j+1UL) );
10777 xmm3 += a1 *
set( B(k ,j+2UL) );
10778 xmm4 += a1 *
set( B(k ,j+3UL) );
10779 xmm5 += a2 *
set( B(k+1UL,j ) );
10780 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
10781 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
10782 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
10785 for( ; k<kend; ++k ) {
10787 xmm1 += a1 *
set( B(k,j ) );
10788 xmm2 += a1 *
set( B(k,j+1UL) );
10789 xmm3 += a1 *
set( B(k,j+2UL) );
10790 xmm4 += a1 *
set( B(k,j+3UL) );
10793 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
10794 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
10795 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
10796 (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
10799 for( ; (j+3UL) <= jend; j+=3UL )
10810 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10811 size_t k( kbegin );
10813 for( ; (k+2UL) <= kend; k+=2UL ) {
10814 const SIMDType a1( A.load(i,k ) );
10815 const SIMDType a2( A.load(i,k+1UL) );
10816 xmm1 += a1 *
set( B(k ,j ) );
10817 xmm2 += a1 *
set( B(k ,j+1UL) );
10818 xmm3 += a1 *
set( B(k ,j+2UL) );
10819 xmm4 += a2 *
set( B(k+1UL,j ) );
10820 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
10821 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
10824 for( ; k<kend; ++k ) {
10826 xmm1 += a1 *
set( B(k,j ) );
10827 xmm2 += a1 *
set( B(k,j+1UL) );
10828 xmm3 += a1 *
set( B(k,j+2UL) );
10831 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
10832 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
10833 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
10836 for( ; (j+2UL) <= jend; j+=2UL )
10848 size_t k( kbegin );
10850 for( ; (k+2UL) <= kend; k+=2UL ) {
10851 const SIMDType a1( A.load(i,k ) );
10852 const SIMDType a2( A.load(i,k+1UL) );
10853 xmm1 += a1 *
set( B(k ,j ) );
10854 xmm2 += a1 *
set( B(k ,j+1UL) );
10855 xmm3 += a2 *
set( B(k+1UL,j ) );
10856 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
10859 for( ; k<kend; ++k ) {
10861 xmm1 += a1 *
set( B(k,j ) );
10862 xmm2 += a1 *
set( B(k,j+1UL) );
10865 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10866 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
// Single column. NOTE(review): unlike the groups above, this k loop runs up to K instead
// of kend; the reason is not visible in this excerpt.
10878 size_t k( kbegin );
10880 for( ; (k+2UL) <= K; k+=2UL ) {
10881 xmm1 += A.load(i,k ) *
set( B(k ,j) );
10882 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
10885 for( ; k<K; ++k ) {
10886 xmm1 += A.load(i,k) *
set( B(k,j) );
10889 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
// Scalar (non-SIMD) handling of the remaining rows up to M if remainder is set:
// two columns at a time, followed by the code for a single column.
10893 for( ; remainder && i<M; ++i )
10895 const size_t jend( LOW ? i+1UL : N );
10896 size_t j( UPP ? i : 0UL );
10898 for( ; (j+2UL) <= jend; j+=2UL )
10912 for(
size_t k=kbegin; k<kend; ++k ) {
10913 value1 += A(i,k) * B(k,j );
10914 value2 += A(i,k) * B(k,j+1UL);
10917 (~C)(i,j ) += value1 * scalar;
10918 (~C)(i,j+1UL) += value2 * scalar;
10931 for(
size_t k=kbegin; k<K; ++k ) {
10932 value += A(i,k) * B(k,j);
10935 (~C)(i,j) += value * scalar;
// Large-matrix addition-assignment kernel that simply forwards all arguments to
// selectDefaultAddAssignKernel().
// NOTE(review): the enabling constraint is not part of this excerpt; presumably this is
// the fallback used when the vectorized large kernel is not applicable.
10955 template<
typename MT3
10960 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10962 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition-assignment kernel that delegates to one of the lmmm(), ummm()
// or mmm() routines, passing the given scalar and ST2(1) as the two scaling factors.
// NOTE(review): the conditions selecting between the three calls are not part of this
// excerpt; presumably they depend on the LOW and UPP flags.
10981 template<
typename MT3
10986 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10989 lmmm( C, A, B, scalar, ST2(1) );
10991 ummm( C, A, B, scalar, ST2(1) );
10993 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition-assignment kernel that simply forwards all arguments to
// selectLargeAddAssignKernel().
// NOTE(review): the enabling constraint is not part of this excerpt; presumably this is
// the fallback used when no BLAS kernel is applicable.
11011 template<
typename MT3
11016 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11018 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based addition-assignment kernel; only compiled if BLAS mode and the use of BLAS
// for matrix/matrix multiplications are both enabled. Three code paths are visible:
//  - trmm() with A applied from the left to a temporary, which is then added to C,
//  - trmm() with B applied from the right to a temporary, which is then added to C,
//  - gemm() updating C directly with the factors ET(scalar) and ET(1).
// For both trmm() calls the lower/upper flag is chosen via IsLower of the triangular
// operand.
// NOTE(review): the conditions selecting between the paths and the definitions of tmp
// and ET are not part of this excerpt.
11023 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 11037 template<
typename MT3
11042 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11048 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11049 addAssign( C, tmp );
11053 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11054 addAssign( C, tmp );
11057 gemm( C, A, B, ET(scalar), ET(1) );
// Entry point of the subtraction assignment of the scaled matrix product to a dense
// matrix. The degenerate case of an empty target or an empty inner dimension is checked
// first; afterwards the work is delegated to selectSubAssignKernel() together with the
// scalar stored in the right-hand side expression.
// NOTE(review): the function name/signature, the body of the size check and the
// definitions of left, A and B are not part of this excerpt; the name used for this
// block is an assumption.
11079 template<
typename MT
11091 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
11105 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Selects the subtraction-assignment kernel: the small kernel is used when the visible
// size condition (number of target elements below TDMATDMATMULT_THRESHOLD) is part of a
// satisfied condition, otherwise the BLAS kernel is used.
// NOTE(review): the leading part of the condition is not part of this excerpt.
11120 template<
typename MT3
11124 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11129 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11130 selectSmallSubAssignKernel( C, A, B, scalar );
11132 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel that subtracts a temporary from the target via
// subAssign().
// NOTE(review): the enabling constraint and the definition of tmp are not part of this
// excerpt; presumably tmp holds the evaluated scaled product A * B * scalar.
11150 template<
typename MT3
11155 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11158 subAssign( C, tmp );
// Blocked default subtraction-assignment kernel for a general left operand A and a
// diagonal right operand B (selected via the EnableIf_ constraint below). The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements (rows in the outer tile loop,
// columns in the inner tile loop) and every visited element is updated as
//    C(i,j) -= A(i,j) * B(j,j) * scalar
// NOTE(review): the function name/parameter list and the definitions of jbegin/jpos are
// not part of this excerpt.
11176 template<
typename MT3
11180 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
11183 constexpr
size_t block( BLOCK_SIZE );
11185 const size_t M( A.rows() );
11186 const size_t N( B.columns() );
// Outer tile loop over the rows; iend is clamped to M for the last tile.
11188 for(
size_t ii=0UL; ii<M; ii+=block ) {
11189 const size_t iend(
min( M, ii+block ) );
// Inner tile loop over the columns; jend is clamped to N for the last tile.
11190 for(
size_t jj=0UL; jj<N; jj+=block ) {
11191 const size_t jend(
min( N, jj+block ) );
// Process the current tile row by row.
11192 for(
size_t i=ii; i<iend; ++i )
11201 for(
size_t j=jbegin; j<jpos; ++j ) {
11202 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a general left operand A and a diagonal
// right operand B (selected via the EnableIf_ constraint below). Only the diagonal
// element B(j,j) contributes to column j, so each target element is updated as
//    C(i,j) -= A(i,j) * B(j,j) * scalar
// NOTE(review): the function name/parameter list and the definitions of ibegin/iend are
// not part of this excerpt; they presumably restrict the row range for triangular A.
11224 template<
typename MT3
11228 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
11231 const size_t M( A.rows() );
11232 const size_t N( B.columns() );
// Traverse the target column by column.
11234 for(
size_t j=0UL; j<N; ++j )
// inum is the number of rows to process; ipos clears the lowest bit of inum so that the
// range [ibegin,ipos) can be processed two rows at a time.
11244 const size_t inum( iend - ibegin );
11245 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two-way unrolled update of the rows in [ibegin,ipos).
11247 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
11248 (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
11249 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update of the single leftover row in case of an odd number of rows.
11251 if( ipos < iend ) {
11252 (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel that reads only the diagonal of the left
// operand A: every element of row i of the target is updated as
//    C(i,j) -= A(i,i) * B(i,j) * scalar
// NOTE(review): the enabling constraint, the signature and the definitions of jbegin/jend
// are not part of this excerpt; presumably this is the overload for a diagonal A.
11272 template<
typename MT3
11279 const size_t M( A.rows() );
11280 const size_t N( B.columns() );
// Traverse the target row by row.
11282 for(
size_t i=0UL; i<M; ++i )
// jnum is the number of columns to process; jpos clears the lowest bit of jnum so that
// the range [jbegin,jpos) can be processed two columns at a time.
11292 const size_t jnum( jend - jbegin );
11293 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two-way unrolled update of the columns in [jbegin,jpos).
11295 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
11296 (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
11297 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update of the single leftover column in case of an odd number of columns.
11299 if( jpos < jend ) {
11300 (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Blocked default subtraction-assignment kernel that reads only the diagonal of the
// left operand A. The target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements
// (columns in the outer tile loop, rows in the inner tile loop) and every visited
// element is updated as
//    C(i,j) -= A(i,i) * B(i,j) * scalar
// NOTE(review): the enabling constraint, the signature and the definitions of ibegin/ipos
// are not part of this excerpt.
11320 template<
typename MT3
11327 constexpr
size_t block( BLOCK_SIZE );
11329 const size_t M( A.rows() );
11330 const size_t N( B.columns() );
// Outer tile loop over the columns; jend is clamped to N for the last tile.
11332 for(
size_t jj=0UL; jj<N; jj+=block ) {
11333 const size_t jend(
min( N, jj+block ) );
// Inner tile loop over the rows; iend is clamped to M for the last tile.
11334 for(
size_t ii=0UL; ii<M; ii+=block ) {
11335 const size_t iend(
min( M, ii+block ) );
// Process the current tile column by column.
11336 for(
size_t j=jj; j<jend; ++j )
11345 for(
size_t i=ibegin; i<ipos; ++i ) {
11346 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction-assignment kernel that touches only the diagonal of the target:
//    C(i,i) -= A(i,i) * B(i,i) * scalar   for all i in [0,A.rows())
// NOTE(review): the enabling constraint is not part of this excerpt; presumably this is
// the overload for two diagonal operands.
11368 template<
typename MT3
11373 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11375 for(
size_t i=0UL; i<A.rows(); ++i ) {
11376 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel that simply forwards all arguments to
// selectDefaultSubAssignKernel().
// NOTE(review): the enabling constraint is not part of this excerpt; presumably this is
// the fallback used when the vectorized small kernel is not applicable.
11395 template<
typename MT3
11400 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11402 selectDefaultSubAssignKernel( C, A, B, scalar );
// Fragment of a vectorized subtraction-assignment kernel (function name, the
// definitions of remainder/kbegin/kend and the accumulation statements of several
// loops are not part of this excerpt). SIMD vectors are loaded from B and from the
// target along the column index j (B.load(k,j), (~C).load(i,j)), elements of A are
// broadcast with set(), and every store writes load(C) - accumulator * factor.
11421 template<
typename MT3
11430 const size_t M( A.rows() );
11431 const size_t N( B.columns() );
11432 const size_t K( A.columns() );
// jpos is the end of the vectorized column range: with remainder set, N is masked
// with size_t(-SIMDSIZE); the assertion states the expected result N - N % SIMDSIZE.
11436 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
11437 BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// The scalar is broadcast once and reused by every store below.
11439 const SIMDType factor(
set( scalar ) );
// Panel of 8 SIMD vectors per row; only entered when neither LOW nor UPP is set.
11445 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
11446 for(
size_t i=0UL; i<M; ++i )
11459 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11461 for(
size_t k=kbegin; k<kend; ++k ) {
11462 const SIMDType a1(
set( A(i,k) ) );
11463 xmm1 += a1 * B.load(k,j );
11464 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11465 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11466 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11467 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11468 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
11469 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
11470 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
11473 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11474 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11475 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11476 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11477 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11478 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
11479 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
11480 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
// Panel of 5 SIMD vectors; rows are processed in pairs first, then a single row.
11485 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
11489 for( ; (i+2UL) <= M; i+=2UL )
11502 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11504 for(
size_t k=kbegin; k<kend; ++k ) {
11505 const SIMDType a1(
set( A(i ,k) ) );
11506 const SIMDType a2(
set( A(i+1UL,k) ) );
11507 const SIMDType b1( B.load(k,j ) );
11508 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11509 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11510 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11511 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
11524 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11525 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11526 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11527 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11528 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
11529 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
11530 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
11531 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
11532 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
11533 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
11545 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11547 for(
size_t k=kbegin; k<kend; ++k ) {
11548 const SIMDType a1(
set( A(i,k) ) );
11549 xmm1 += a1 * B.load(k,j );
11550 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11551 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11552 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11553 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11556 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11557 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11558 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11559 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11560 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
// Panel of 4 SIMD vectors; row pairs first, then a single row.
11564 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
11568 for( ; (i+2UL) <= M; i+=2UL )
11581 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11583 for(
size_t k=kbegin; k<kend; ++k ) {
11584 const SIMDType a1(
set( A(i ,k) ) );
11585 const SIMDType a2(
set( A(i+1UL,k) ) );
11586 const SIMDType b1( B.load(k,j ) );
11587 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11588 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11589 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11600 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11601 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11602 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11603 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11604 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
11605 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
11606 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
11607 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
11621 for(
size_t k=kbegin; k<kend; ++k ) {
11622 const SIMDType a1(
set( A(i,k) ) );
11623 xmm1 += a1 * B.load(k,j );
11624 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11625 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11626 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11629 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11630 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11631 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11632 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
// Panel of 3 SIMD vectors; row pairs first, then a single row.
11636 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
11640 for( ; (i+2UL) <= M; i+=2UL )
11653 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11655 for(
size_t k=kbegin; k<kend; ++k ) {
11656 const SIMDType a1(
set( A(i ,k) ) );
11657 const SIMDType a2(
set( A(i+1UL,k) ) );
11658 const SIMDType b1( B.load(k,j ) );
11659 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11660 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11669 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11670 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11671 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11672 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
11673 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
11674 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
11688 for(
size_t k=kbegin; k<kend; ++k ) {
11689 const SIMDType a1(
set( A(i,k) ) );
11690 xmm1 += a1 * B.load(k,j );
11691 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11692 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11695 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11696 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11697 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
// Panel of 2 SIMD vectors; skipped only when LOW and UPP are both set. The row
// range starts at j for LOW and is capped at min(j+SIMDSIZE*2,M) for UPP.
11701 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
11703 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
11704 size_t i( LOW ? j : 0UL );
// Four rows per iteration.
11706 for( ; (i+4UL) <= iend; i+=4UL )
11719 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11721 for(
size_t k=kbegin; k<kend; ++k ) {
11722 const SIMDType a1(
set( A(i ,k) ) );
11723 const SIMDType a2(
set( A(i+1UL,k) ) );
11724 const SIMDType a3(
set( A(i+2UL,k) ) );
11725 const SIMDType a4(
set( A(i+3UL,k) ) );
11726 const SIMDType b1( B.load(k,j ) );
11727 const SIMDType b2( B.load(k,j+SIMDSIZE) );
11738 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11739 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11740 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11741 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11742 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11743 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11744 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
11745 (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
// Three rows per iteration.
11748 for( ; (i+3UL) <= iend; i+=3UL )
11761 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11763 for(
size_t k=kbegin; k<kend; ++k ) {
11764 const SIMDType a1(
set( A(i ,k) ) );
11765 const SIMDType a2(
set( A(i+1UL,k) ) );
11766 const SIMDType a3(
set( A(i+2UL,k) ) );
11767 const SIMDType b1( B.load(k,j ) );
11768 const SIMDType b2( B.load(k,j+SIMDSIZE) );
11777 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11778 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11779 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11780 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11781 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11782 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
// Two rows per iteration; the k loop is additionally unrolled by two and the
// stores add the two groups of accumulators (e.g. xmm1+xmm5) before scaling.
11785 for( ; (i+2UL) <= iend; i+=2UL )
11798 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11799 size_t k( kbegin );
11801 for( ; (k+2UL) <= kend; k+=2UL ) {
11802 const SIMDType a1(
set( A(i ,k ) ) );
11803 const SIMDType a2(
set( A(i+1UL,k ) ) );
11804 const SIMDType a3(
set( A(i ,k+1UL) ) );
11805 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
11806 const SIMDType b1( B.load(k ,j ) );
11807 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
11808 const SIMDType b3( B.load(k+1UL,j ) );
11809 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Remaining k index when the k range has odd length.
11820 for( ; k<kend; ++k ) {
11821 const SIMDType a1(
set( A(i ,k) ) );
11822 const SIMDType a2(
set( A(i+1UL,k) ) );
11823 const SIMDType b1( B.load(k,j ) );
11824 const SIMDType b2( B.load(k,j+SIMDSIZE) );
11831 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
11832 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
11833 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - (xmm3+xmm7) * factor );
11834 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
// Single remaining row of the 2-vector panel.
11847 size_t k( kbegin );
11849 for( ; (k+2UL) <= kend; k+=2UL ) {
11850 const SIMDType a1(
set( A(i,k ) ) );
11851 const SIMDType a2(
set( A(i,k+1UL) ) );
11852 xmm1 += a1 * B.load(k ,j );
11853 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
11854 xmm3 += a2 * B.load(k+1UL,j );
11855 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
11858 for( ; k<kend; ++k ) {
11859 const SIMDType a1(
set( A(i,k) ) );
11860 xmm1 += a1 * B.load(k,j );
11861 xmm2 += a1 * B.load(k,j+SIMDSIZE);
11864 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
11865 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
// Single SIMD vector per row for the remaining vectorizable columns; runs for
// every LOW/UPP combination.
11869 for( ; j<jpos; j+=SIMDSIZE )
11871 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
11872 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k unrolled by two.
11874 for( ; (i+4UL) <= iend; i+=4UL )
11885 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11886 size_t k( kbegin );
11888 for( ; (k+2UL) <= kend; k+=2UL ) {
11889 const SIMDType b1( B.load(k ,j) );
11890 const SIMDType b2( B.load(k+1UL,j) );
11891 xmm1 +=
set( A(i ,k ) ) * b1;
11892 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11893 xmm3 +=
set( A(i+2UL,k ) ) * b1;
11894 xmm4 +=
set( A(i+3UL,k ) ) * b1;
11895 xmm5 +=
set( A(i ,k+1UL) ) * b2;
11896 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
11897 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
11898 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
// The declaration of b1 for this remainder loop is on a line that is not shown.
11901 for( ; k<kend; ++k ) {
11903 xmm1 +=
set( A(i ,k) ) * b1;
11904 xmm2 +=
set( A(i+1UL,k) ) * b1;
11905 xmm3 +=
set( A(i+2UL,k) ) * b1;
11906 xmm4 +=
set( A(i+3UL,k) ) * b1;
11909 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm5) * factor );
11910 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm6) * factor );
11911 (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm7) * factor );
11912 (~C).store( i+3UL, j, (~C).load(i+3UL,j) - (xmm4+xmm8) * factor );
// Three rows per iteration.
11915 for( ; (i+3UL) <= iend; i+=3UL )
11926 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11927 size_t k( kbegin );
11929 for( ; (k+2UL) <= kend; k+=2UL ) {
11930 const SIMDType b1( B.load(k ,j) );
11931 const SIMDType b2( B.load(k+1UL,j) );
11932 xmm1 +=
set( A(i ,k ) ) * b1;
11933 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11934 xmm3 +=
set( A(i+2UL,k ) ) * b1;
11935 xmm4 +=
set( A(i ,k+1UL) ) * b2;
11936 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
11937 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
11940 for( ; k<kend; ++k ) {
11942 xmm1 +=
set( A(i ,k) ) * b1;
11943 xmm2 +=
set( A(i+1UL,k) ) * b1;
11944 xmm3 +=
set( A(i+2UL,k) ) * b1;
11947 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm4) * factor );
11948 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm5) * factor );
11949 (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm6) * factor );
// Two rows per iteration.
11952 for( ; (i+2UL) <= iend; i+=2UL )
11964 size_t k( kbegin );
11966 for( ; (k+2UL) <= kend; k+=2UL ) {
11967 const SIMDType b1( B.load(k ,j) );
11968 const SIMDType b2( B.load(k+1UL,j) );
11969 xmm1 +=
set( A(i ,k ) ) * b1;
11970 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11971 xmm3 +=
set( A(i ,k+1UL) ) * b2;
11972 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
11975 for( ; k<kend; ++k ) {
11977 xmm1 +=
set( A(i ,k) ) * b1;
11978 xmm2 +=
set( A(i+1UL,k) ) * b1;
11981 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
11982 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm4) * factor );
// Single remaining row; note that this k loop runs up to K rather than kend.
11994 size_t k( kbegin );
11996 for( ; (k+2UL) <= K; k+=2UL ) {
11997 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
11998 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
12001 for( ; k<K; ++k ) {
12002 xmm1 +=
set( A(i,k) ) * B.load(k,j);
12005 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
// Element-wise handling of the columns in [jpos,N); only entered when remainder
// is set. The row range is [j,...) for LOW and capped at j+1 for UPP.
12009 for( ; remainder && j<N; ++j )
12011 const size_t iend( UPP ? j+1UL : M );
12012 size_t i( LOW ? j : 0UL );
// Two rows per iteration using plain element access.
12014 for( ; (i+2UL) <= iend; i+=2UL )
12028 for(
size_t k=kbegin; k<kend; ++k ) {
12029 value1 += A(i ,k) * B(k,j);
12030 value2 += A(i+1UL,k) * B(k,j);
12033 (~C)(i ,j) -= value1 * scalar;
12034 (~C)(i+1UL,j) -= value2 * scalar;
// Single remaining row.
12047 for(
size_t k=kbegin; k<K; ++k ) {
12048 value += A(i,k) * B(k,j);
12051 (~C)(i,j) -= value * scalar;
// Fragment of a vectorized subtraction-assignment kernel (function name, the
// definitions of remainder/kbegin/kend and the accumulation statements of several
// loops are not part of this excerpt). SIMD vectors are loaded from A and from the
// target along the row index i (A.load(i,k), (~C).load(i,j)), elements of B are
// broadcast with set(), and every store writes load(C) - accumulator * factor.
12072 template<
typename MT3
12081 const size_t M( A.rows() );
12082 const size_t N( B.columns() );
12083 const size_t K( A.columns() );
// ipos is the end of the vectorized row range: with remainder set, M is masked
// with size_t(-SIMDSIZE); the assertion states the expected result M - M % SIMDSIZE.
12087 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
12088 BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// The scalar is broadcast once and reused by every store below.
12090 const SIMDType factor(
set( scalar ) );
// Panel of 8 SIMD vectors per column; only entered when neither LOW nor UPP is set.
12096 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
12097 for(
size_t j=0UL; j<N; ++j )
12110 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12112 for(
size_t k=kbegin; k<kend; ++k ) {
12113 const SIMDType b1(
set( B(k,j) ) );
12114 xmm1 += A.load(i ,k) * b1;
12115 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12116 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12117 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12118 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12119 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
12120 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
12121 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
12124 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12125 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12126 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12127 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12128 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12129 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
12130 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
12131 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
// Panel of 5 SIMD vectors; columns are processed in pairs first, then a single column.
12136 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
12140 for( ; (j+2UL) <= N; j+=2UL )
12153 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12155 for(
size_t k=kbegin; k<kend; ++k ) {
12156 const SIMDType a1( A.load(i ,k) );
12157 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12158 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12159 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12160 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
12161 const SIMDType b1(
set( B(k,j ) ) );
12162 const SIMDType b2(
set( B(k,j+1UL) ) );
12175 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12176 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12177 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12178 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12179 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
12180 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
12181 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
12182 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12183 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12184 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
12196 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12198 for(
size_t k=kbegin; k<kend; ++k ) {
12199 const SIMDType b1(
set( B(k,j) ) );
12200 xmm1 += A.load(i ,k) * b1;
12201 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12202 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12203 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12204 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12207 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12208 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12209 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12210 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12211 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
// Panel of 4 SIMD vectors; column pairs first, then a single column.
12215 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
12219 for( ; (j+2UL) <= N; j+=2UL )
12232 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12234 for(
size_t k=kbegin; k<kend; ++k ) {
12235 const SIMDType a1( A.load(i ,k) );
12236 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12237 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12238 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12239 const SIMDType b1(
set( B(k,j ) ) );
12240 const SIMDType b2(
set( B(k,j+1UL) ) );
12251 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12252 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12253 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12254 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12255 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
12256 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
12257 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12258 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
12272 for(
size_t k=kbegin; k<kend; ++k ) {
12273 const SIMDType b1(
set( B(k,j) ) );
12274 xmm1 += A.load(i ,k) * b1;
12275 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12276 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12277 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12280 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12281 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12282 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12283 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
// Panel of 3 SIMD vectors; column pairs first, then a single column.
12287 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
12291 for( ; (j+2UL) <= N; j+=2UL )
12304 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12306 for(
size_t k=kbegin; k<kend; ++k ) {
12307 const SIMDType a1( A.load(i ,k) );
12308 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12309 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12310 const SIMDType b1(
set( B(k,j ) ) );
12311 const SIMDType b2(
set( B(k,j+1UL) ) );
12320 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12321 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12322 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12323 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
12324 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
12325 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
12339 for(
size_t k=kbegin; k<kend; ++k ) {
12340 const SIMDType b1(
set( B(k,j) ) );
12341 xmm1 += A.load(i ,k) * b1;
12342 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12343 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12346 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12347 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12348 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
// Panel of 2 SIMD vectors; skipped only when LOW and UPP are both set. The column
// range starts at i for UPP and is capped at min(i+SIMDSIZE*2,N) for LOW.
12352 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
12354 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
12355 size_t j( UPP ? i : 0UL );
// Four columns per iteration.
12357 for( ; (j+4UL) <= jend; j+=4UL )
12370 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12372 for(
size_t k=kbegin; k<kend; ++k ) {
12373 const SIMDType a1( A.load(i ,k) );
12374 const SIMDType a2( A.load(i+SIMDSIZE,k) );
12375 const SIMDType b1(
set( B(k,j ) ) );
12376 const SIMDType b2(
set( B(k,j+1UL) ) );
12377 const SIMDType b3(
set( B(k,j+2UL) ) );
12378 const SIMDType b4(
set( B(k,j+3UL) ) );
12389 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12390 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12391 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12392 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12393 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12394 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12395 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
12396 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
// Three columns per iteration.
12399 for( ; (j+3UL) <= jend; j+=3UL )
12412 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12414 for(
size_t k=kbegin; k<kend; ++k ) {
12415 const SIMDType a1( A.load(i ,k) );
12416 const SIMDType a2( A.load(i+SIMDSIZE,k) );
12417 const SIMDType b1(
set( B(k,j ) ) );
12418 const SIMDType b2(
set( B(k,j+1UL) ) );
12419 const SIMDType b3(
set( B(k,j+2UL) ) );
12428 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12429 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12430 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12431 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12432 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12433 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
// Two columns per iteration; the k loop is additionally unrolled by two and the
// stores add the two groups of accumulators (e.g. xmm1+xmm5) before scaling.
12436 for( ; (j+2UL) <= jend; j+=2UL )
12449 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12450 size_t k( kbegin );
12452 for( ; (k+2UL) <= kend; k+=2UL ) {
12453 const SIMDType a1( A.load(i ,k ) );
12454 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
12455 const SIMDType a3( A.load(i ,k+1UL) );
12456 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
12457 const SIMDType b1(
set( B(k ,j ) ) );
12458 const SIMDType b2(
set( B(k ,j+1UL) ) );
12459 const SIMDType b3(
set( B(k+1UL,j ) ) );
12460 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remaining k index when the k range has odd length.
12471 for( ; k<kend; ++k ) {
12472 const SIMDType a1( A.load(i ,k) );
12473 const SIMDType a2( A.load(i+SIMDSIZE,k) );
12474 const SIMDType b1(
set( B(k,j ) ) );
12475 const SIMDType b2(
set( B(k,j+1UL) ) );
12482 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
12483 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
12484 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
12485 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
// Single remaining column of the 2-vector panel.
12498 size_t k( kbegin );
12500 for( ; (k+2UL) <= kend; k+=2UL ) {
12501 const SIMDType b1(
set( B(k ,j) ) );
12502 const SIMDType b2(
set( B(k+1UL,j) ) );
12503 xmm1 += A.load(i ,k ) * b1;
12504 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
12505 xmm3 += A.load(i ,k+1UL) * b2;
12506 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
12509 for( ; k<kend; ++k ) {
12510 const SIMDType b1(
set( B(k,j) ) );
12511 xmm1 += A.load(i ,k) * b1;
12512 xmm2 += A.load(i+SIMDSIZE,k) * b1;
12515 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
12516 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
// Single SIMD vector per column for the remaining vectorizable rows; runs for
// every LOW/UPP combination.
12520 for( ; i<ipos; i+=SIMDSIZE )
12522 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
12523 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two.
12525 for( ; (j+4UL) <= jend; j+=4UL )
12536 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12537 size_t k( kbegin );
12539 for( ; (k+2UL) <= kend; k+=2UL ) {
12540 const SIMDType a1( A.load(i,k ) );
12541 const SIMDType a2( A.load(i,k+1UL) );
12542 xmm1 += a1 *
set( B(k ,j ) );
12543 xmm2 += a1 *
set( B(k ,j+1UL) );
12544 xmm3 += a1 *
set( B(k ,j+2UL) );
12545 xmm4 += a1 *
set( B(k ,j+3UL) );
12546 xmm5 += a2 *
set( B(k+1UL,j ) );
12547 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
12548 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
12549 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// The declaration of a1 for this remainder loop is on a line that is not shown.
12552 for( ; k<kend; ++k ) {
12554 xmm1 += a1 *
set( B(k,j ) );
12555 xmm2 += a1 *
set( B(k,j+1UL) );
12556 xmm3 += a1 *
set( B(k,j+2UL) );
12557 xmm4 += a1 *
set( B(k,j+3UL) );
12560 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
12561 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
12562 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
12563 (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
// Three columns per iteration.
12566 for( ; (j+3UL) <= jend; j+=3UL )
12577 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12578 size_t k( kbegin );
12580 for( ; (k+2UL) <= kend; k+=2UL ) {
12581 const SIMDType a1( A.load(i,k ) );
12582 const SIMDType a2( A.load(i,k+1UL) );
12583 xmm1 += a1 *
set( B(k ,j ) );
12584 xmm2 += a1 *
set( B(k ,j+1UL) );
12585 xmm3 += a1 *
set( B(k ,j+2UL) );
12586 xmm4 += a2 *
set( B(k+1UL,j ) );
12587 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
12588 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
12591 for( ; k<kend; ++k ) {
12593 xmm1 += a1 *
set( B(k,j ) );
12594 xmm2 += a1 *
set( B(k,j+1UL) );
12595 xmm3 += a1 *
set( B(k,j+2UL) );
12598 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
12599 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
12600 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
// Two columns per iteration.
12603 for( ; (j+2UL) <= jend; j+=2UL )
12615 size_t k( kbegin );
12617 for( ; (k+2UL) <= kend; k+=2UL ) {
12618 const SIMDType a1( A.load(i,k ) );
12619 const SIMDType a2( A.load(i,k+1UL) );
12620 xmm1 += a1 *
set( B(k ,j ) );
12621 xmm2 += a1 *
set( B(k ,j+1UL) );
12622 xmm3 += a2 *
set( B(k+1UL,j ) );
12623 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
12626 for( ; k<kend; ++k ) {
12628 xmm1 += a1 *
set( B(k,j ) );
12629 xmm2 += a1 *
set( B(k,j+1UL) );
12632 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
12633 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
// Single remaining column; note that this k loop runs up to K rather than kend.
12645 size_t k( kbegin );
12647 for( ; (k+2UL) <= K; k+=2UL ) {
12648 xmm1 += A.load(i,k ) *
set( B(k ,j) );
12649 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
12652 for( ; k<K; ++k ) {
12653 xmm1 += A.load(i,k) *
set( B(k,j) );
12656 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
// Element-wise handling of the rows in [ipos,M); only entered when remainder is
// set. The column range is [i,...) for UPP and capped at i+1 for LOW.
12660 for( ; remainder && i<M; ++i )
12662 const size_t jend( LOW ? i+1UL : N );
12663 size_t j( UPP ? i : 0UL );
// Two columns per iteration using plain element access.
12665 for( ; (j+2UL) <= jend; j+=2UL )
12679 for(
size_t k=kbegin; k<kend; ++k ) {
12680 value1 += A(i,k) * B(k,j );
12681 value2 += A(i,k) * B(k,j+1UL);
12684 (~C)(i,j ) -= value1 * scalar;
12685 (~C)(i,j+1UL) -= value2 * scalar;
// Single remaining column.
12698 for(
size_t k=kbegin; k<K; ++k ) {
12699 value += A(i,k) * B(k,j);
12702 (~C)(i,j) -= value * scalar;
// Overload of selectLargeSubAssignKernel that forwards all arguments to
// selectDefaultSubAssignKernel. The enable condition selecting this overload is
// not part of this excerpt.
12722 template<
typename MT3
12727 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12729 selectDefaultSubAssignKernel( C, A, B, scalar );
// Overload of selectLargeSubAssignKernel that delegates to one of lmmm(), ummm()
// or mmm(); the conditions choosing between the three calls are on lines that are
// not shown. Each call passes the negated scalar and ST2(1) as trailing factors
// (presumably alpha = -scalar and beta = 1, i.e. C = C - scalar*A*B -- confirm
// against the declarations of these helpers).
12748 template<
typename MT3
12753 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12756 lmmm( C, A, B, -scalar, ST2(1) );
12758 ummm( C, A, B, -scalar, ST2(1) );
12760 mmm( C, A, B, -scalar, ST2(1) );
// Overload of selectBlasSubAssignKernel that forwards all arguments to
// selectLargeSubAssignKernel. The enable condition selecting this overload is not
// part of this excerpt.
12778 template<
typename MT3
12783 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12785 selectLargeSubAssignKernel( C, A, B, scalar );
// Overload of selectBlasSubAssignKernel that is compiled only when BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled. Three paths are
// visible: a trmm() call with A on the left side, a trmm() call with B on the right
// side, and a gemm() call. The conditions choosing between them and the
// construction of tmp are on lines that are not shown.
12790 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 12804 template<
typename MT3
12809 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The triangle flag follows IsLower<MT4>; the scaled result in tmp is then
// subtracted from C.
12815 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12816 subAssign( C, tmp );
// Same scheme with B applied from the right; the triangle flag follows IsLower<MT5>.
12820 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12821 subAssign( C, tmp );
// General case: negated scalar and a factor of one (presumably alpha and beta of
// the BLAS gemm operation, giving C = C - scalar*A*B -- confirm against the wrapper).
12824 gemm( C, A, B, ET(-scalar), ET(1) );
// Fragment of a function template (name and signature not shown). The only visible
// statement applies schurAssign() to the target with a temporary tmp whose
// construction is on lines that are not part of this excerpt.
12846 template<
typename MT
12860 schurAssign( ~lhs, tmp );
// Fragment of a function template (name and signature not shown). Visible are two
// special cases: a target with zero rows or zero columns, and a left operand with
// zero columns. The actions taken in each branch are on lines that are not shown.
12891 template<
typename MT
12904 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
12907 else if( left.columns() == 0UL ) {
// Fragment of a function template (name and signature not shown). It creates a
// ForwardFunctor object and a temporary of type TmpType constructed from rhs; how
// both are used is on lines that are not part of this excerpt.
12941 template<
typename MT
12960 const ForwardFunctor fwd;
12962 const TmpType tmp( rhs );
// Fragment of a function template (name and signature not shown). The visible
// condition is true when the target has zero rows or columns or the left operand
// has zero columns; the branch body is not part of this excerpt.
12982 template<
typename MT
12995 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Fragment of a function template (name and signature not shown). The visible
// condition is true when the target has zero rows or columns or the left operand
// has zero columns; the branch body is not part of this excerpt.
13032 template<
typename MT
13045 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Start of a further function template; everything after the first template
// parameter is missing from this excerpt.
13079 template<
typename MT
// Start of an inline function template with a deduced (decltype(auto)) return
// type; its name, parameters and body are missing from this excerpt.
13162 template<
typename MT1
13164 inline decltype(
auto)
// Fragment of a function template (name not shown; presumably one of the decl*()
// overloads for this expression -- confirm against the full header). It builds its
// result of type ReturnType from the left and right operand of the expression dm.
13212 template<
typename MT1
13227 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Fragment of a function template (name not shown; presumably one of the decl*()
// overloads for this expression -- confirm against the full header). It builds its
// result of type ReturnType from the left and right operand of the expression dm.
13258 template<
typename MT1
13273 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Fragment of a function template (name not shown; presumably one of the decl*()
// overloads for this expression -- confirm against the full header). It builds its
// result of type ReturnType from the left and right operand of the expression dm.
13304 template<
typename MT1
13319 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Fragment of a function template (name not shown; presumably one of the decl*()
// overloads for this expression -- confirm against the full header). It builds its
// result of type ReturnType from the left and right operand of the expression dm.
13350 template<
typename MT1
13365 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Fragment of a function template (name not shown; presumably one of the decl*()
// overloads for this expression -- confirm against the full header). It builds its
// result of type ReturnType from the left and right operand of the expression dm.
13396 template<
typename MT1
13411 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for index 0UL of the multiplication expression: the value is
// inherited from Size<MT1,0UL>, i.e. from the left-hand operand type.
13427 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13428 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13429 :
public Size<MT1,0UL>
// Size specialization for index 1UL of the multiplication expression: the value is
// inherited from Size<MT2,1UL>, i.e. from the right-hand operand type.
13432 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13433 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13434 :
public Size<MT2,1UL>
// IsAligned specialization: the expression counts as aligned only if both operand
// types MT1 and MT2 are aligned.
13450 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13451 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13452 :
public And< IsAligned<MT1>, IsAligned<MT2> >
// IsSymmetric specialization: a disjunction of several alternatives. Visible are
// the SF flag itself and the combination LF && UF. The middle alternative begins on
// a line that is not shown and involves IsBuiltin of the element type of the
// expression instantiated with only the HF flag set.
13468 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13469 struct IsSymmetric< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13470 :
public Or< Bool<SF>
13472 , IsBuiltin< ElementType_< TDMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
13473 , And< Bool<LF>, Bool<UF> > >
// IsHermitian partial specialization for expressions whose HF template argument is
// fixed to true (SF, LF and UF remain free). The base class / body is on lines that
// are not part of this excerpt.
13489 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
13490 struct IsHermitian< TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
// IsLower specialization. The expression is lower if any of the following holds:
//  - the LF flag is set,
//  - both operand types are lower,
//  - SF or HF is set and both operand types are upper.
13507 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13508 struct IsLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13509 :
public Or< Bool<LF>
13510 , And< IsLower<MT1>, IsLower<MT2> >
13511 , And< Or< Bool<SF>, Bool<HF> >
13512 , IsUpper<MT1>, IsUpper<MT2> > >
// IsUniLower specialization. The expression is unilower if both operand types are
// unilower, or if SF or HF is set and both operand types are uniupper.
13528 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13529 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13530 :
public Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
13531 , And< Or< Bool<SF>, Bool<HF> >
13532 , IsUniUpper<MT1>, IsUniUpper<MT2> > >
// Type trait specialization whose struct line is missing from this excerpt
// (presumably IsStrictlyLower, judging by the base class -- confirm against the
// full header). The value is true if one operand type is strictly lower and the
// other is lower, or if SF or HF is set and one operand type is strictly upper
// while the other is upper.
13548 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13550 :
public Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13551 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
13552 , And< Or< Bool<SF>, Bool<HF> >
13553 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13554 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >
// IsUpper specialization. The expression is upper if any of the following holds:
//  - the UF flag is set,
//  - both operand types are upper,
//  - SF or HF is set and both operand types are lower.
13570 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13571 struct IsUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13572 :
public Or< Bool<UF>
13573 , And< IsUpper<MT1>, IsUpper<MT2> >
13574 , And< Or< Bool<SF>, Bool<HF> >
13575 , IsLower<MT1>, IsLower<MT2> > >
// IsUniUpper specialization. The expression is uniupper if both operand types are
// uniupper, or if SF or HF is set and both operand types are unilower.
13591 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13592 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13593 :
public Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
13594 , And< Or< Bool<SF>, Bool<HF> >
13595 , IsUniLower<MT1>, IsUniLower<MT2> > >
// IsStrictlyUpper specialization for TDMatDMatMultExpr.
// The trait is the logical OR of three conditions:
//  - MT1 is strictly upper and MT2 is upper;
//  - MT2 is strictly upper and MT1 is upper;
//  - SF or HF (symmetric/Hermitian flag) is set and one operand type is strictly
//    lower while the other is lower.
// The 'struct' header (listing line 13612) was missing between the template header
// and the base-class clause and is restored here.
13611 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13612 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13613 :
public Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13614 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
13615 , And< Or< Bool<SF>, Bool<HF> >
13616 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13617 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:174
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:71
Constraint on the data type.
Header file for kernel specific block sizes.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:270
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:264
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:468
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:544
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:469
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:173
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:617
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:534
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:276
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1026
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:402
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:412
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
Base class for sparse matrices. The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:260
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:392
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:148
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:261
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:424
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:107
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:263
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Not class template.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:258
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:382
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:273
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1026
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:86
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:616
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:430
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:456
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:107
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:436
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:366
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:262
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:110
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:259
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:175
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:302
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1028
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:172
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:267
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1028
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:446
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:317
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time logical 'or' evaluation. The Or alias declaration performs at compile time a logical 'or'...
Definition: Or.h:76
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:908
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.