35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 141 template<
typename MT1
147 class TDMatDMatMultExpr
148 :
public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
149 ,
private Computation
// Effective structure flags derived from the SF/HF/LF/UF flags:
//  - SYM  is set only when SF is set and none of HF, LF, UF is set.
//  - HERM is set only when HF is set and neither LF nor UF is set.
//  - LOW  is set when LF is set, or when SF or HF is combined with UF.
//  - UPP  is set when UF is set, or when SF or HF is combined with LF.
// Consequently SF/HF together with a single triangular flag sets both LOW and UPP.
174 SYM = ( SF && !( HF || LF || UF ) ),
175 HERM = ( HF && !( LF || UF ) ),
176 LOW = ( LF || ( ( SF || HF ) && UF ) ),
177 UPP = ( UF || ( ( SF || HF ) && LF ) )
187 template<
typename T1,
typename T2,
typename T3 >
188 struct IsEvaluationRequired {
189 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
199 template<
typename T1,
typename T2,
typename T3 >
200 struct UseBlasKernel {
201 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
207 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
222 template<
typename T1,
typename T2,
typename T3 >
223 struct UseVectorizedDefaultKernel {
224 enum :
bool { value = useOptimizedKernels &&
226 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
284 MT1::simdEnabled && MT2::simdEnabled &&
289 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
290 !evaluateRight && MT2::smpAssignable };
345 :(
lhs_.columns() ) ) );
349 const size_t n(
end - begin );
368 if( i >=
lhs_.rows() ) {
371 if( j >=
rhs_.columns() ) {
383 inline size_t rows() const noexcept {
394 return rhs_.columns();
// Returns whether the expression can alias with the given address \a alias.
// The answer is true as soon as either operand (lhs_ or rhs_) reports being
// aliased with it; note that this queries isAliased() on the operands, not
// canAlias().
424 template<
typename T >
425 inline bool canAlias(
const T* alias )
const noexcept {
426 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns whether the expression is aliased with the given address \a alias,
// i.e. whether the left-hand or the right-hand operand reports being aliased
// with it.
436 template<
typename T >
437 inline bool isAliased(
const T* alias )
const noexcept {
438 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
448 return lhs_.isAligned() &&
rhs_.isAligned();
459 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
461 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
462 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
486 template<
typename MT
495 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
498 else if( rhs.lhs_.columns() == 0UL ) {
513 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
529 template<
typename MT3
532 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
537 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
538 selectSmallAssignKernel( C, A, B );
540 selectBlasAssignKernel( C, A, B );
559 template<
typename MT3
565 const size_t M( A.rows() );
566 const size_t N( B.columns() );
567 const size_t K( A.columns() );
571 for(
size_t i=0UL; i<M; ++i )
582 for(
size_t j=0UL; j<N; ++j ) {
591 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
592 :( UPP ?
max(i,kbegin) : kbegin ) )
593 :( UPP ? i : 0UL ) );
596 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
597 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
598 :( LOW ? i+1UL : N ) );
601 for(
size_t j=0UL; j<jbegin; ++j ) {
606 reset( (~C)(i,0UL) );
608 for(
size_t j=jbegin; j<jend; ++j ) {
609 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
612 for(
size_t j=jend; j<N; ++j ) {
617 reset( (~C)(i,N-1UL) );
621 for(
size_t k=kbegin+1UL; k<kend; ++k )
625 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
626 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
627 :( SYM || HERM || UPP ? i : 0UL ) );
630 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
631 :( LOW ?
min(i+1UL,k) : k ) )
632 :( LOW ? i+1UL : N ) );
634 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
637 for(
size_t j=jbegin; j<jend; ++j ) {
638 (~C)(i,j) += A(i,k) * B(k,j);
641 (~C)(i,jend) = A(i,k) * B(k,jend);
// Fill every element below the diagonal (j < i) from its mirrored element
// (j,i) above the diagonal; the mirrored value is conjugated when HERM is set
// and copied unchanged otherwise.
647 for(
size_t i=1UL; i<M; ++i ) {
648 for(
size_t j=0UL; j<i; ++j ) {
649 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
671 template<
typename MT3
677 const size_t M( A.rows() );
678 const size_t N( B.columns() );
679 const size_t K( A.columns() );
683 for(
size_t j=0UL; j<N; ++j )
694 for(
size_t i=0UL; i<M; ++i ) {
703 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
704 :( LOW ?
max(j,kbegin) : kbegin ) )
705 :( LOW ? j : 0UL ) );
708 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
709 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
710 :( UPP ? j+1UL : M ) );
713 for(
size_t i=0UL; i<ibegin; ++i ) {
718 reset( (~C)(0UL,j) );
720 for(
size_t i=ibegin; i<iend; ++i ) {
721 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
724 for(
size_t i=iend; i<M; ++i ) {
729 reset( (~C)(M-1UL,j) );
733 for(
size_t k=kbegin+1UL; k<kend; ++k )
737 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
738 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
739 :( SYM || HERM || LOW ? j : 0UL ) );
742 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
743 :( UPP ?
min(j+1UL,k) : k ) )
744 :( UPP ? j+1UL : M ) );
746 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
749 for(
size_t i=ibegin; i<iend; ++i ) {
750 (~C)(i,j) += A(i,k) * B(k,j);
753 (~C)(iend,j) = A(iend,k) * B(k,j);
// Fill every element above the diagonal (i < j) from its mirrored element
// (j,i) below the diagonal; the mirrored value is conjugated when HERM is set
// and copied unchanged otherwise.
759 for(
size_t j=1UL; j<N; ++j ) {
760 for(
size_t i=0UL; i<j; ++i ) {
761 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
783 template<
typename MT3
786 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
789 constexpr
size_t block( BLOCK_SIZE );
791 const size_t M( A.rows() );
792 const size_t N( B.columns() );
794 for(
size_t ii=0UL; ii<M; ii+=block ) {
795 const size_t iend(
min( M, ii+block ) );
796 for(
size_t jj=0UL; jj<N; jj+=block ) {
797 const size_t jend(
min( N, jj+block ) );
798 for(
size_t i=ii; i<iend; ++i )
808 for(
size_t j=jj; j<jbegin; ++j ) {
812 for(
size_t j=jbegin; j<jpos; ++j ) {
813 (~C)(i,j) = A(i,j) * B(j,j);
816 for(
size_t j=jpos; j<jend; ++j ) {
841 template<
typename MT3
844 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
847 const size_t M( A.rows() );
848 const size_t N( B.columns() );
850 for(
size_t j=0UL; j<N; ++j )
861 for(
size_t i=0UL; i<ibegin; ++i ) {
865 for(
size_t i=ibegin; i<iend; ++i ) {
866 (~C)(i,j) = A(i,j) * B(j,j);
869 for(
size_t i=iend; i<M; ++i ) {
892 template<
typename MT3
898 const size_t M( A.rows() );
899 const size_t N( B.columns() );
901 for(
size_t i=0UL; i<M; ++i )
912 for(
size_t j=0UL; j<jbegin; ++j ) {
916 for(
size_t j=jbegin; j<jend; ++j ) {
917 (~C)(i,j) = A(i,i) * B(i,j);
920 for(
size_t j=jend; j<N; ++j ) {
943 template<
typename MT3
949 constexpr
size_t block( BLOCK_SIZE );
951 const size_t M( A.rows() );
952 const size_t N( B.columns() );
954 for(
size_t jj=0UL; jj<N; jj+=block ) {
955 const size_t jend(
min( N, jj+block ) );
956 for(
size_t ii=0UL; ii<M; ii+=block ) {
957 const size_t iend(
min( M, ii+block ) );
958 for(
size_t j=jj; j<jend; ++j )
968 for(
size_t i=ii; i<ibegin; ++i ) {
972 for(
size_t i=ibegin; i<ipos; ++i ) {
973 (~C)(i,j) = A(i,i) * B(i,j);
976 for(
size_t i=ipos; i<iend; ++i ) {
1001 template<
typename MT3
1005 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Only the diagonal of C is written: each C(i,i) becomes the product of the
// matching diagonal elements of A and B. Off-diagonal elements are not touched
// by this loop.
1009 for(
size_t i=0UL; i<A.rows(); ++i ) {
1010 C(i,i) = A(i,i) * B(i,i);
1030 template<
typename MT3
1034 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1036 selectDefaultAssignKernel( ~C, A, B );
1056 template<
typename MT3
1064 const size_t M( A.rows() );
1065 const size_t N( B.columns() );
1066 const size_t K( A.columns() );
1070 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1073 if( LOW && UPP && N > SIMDSIZE*3UL ) {
1082 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1083 for(
size_t i=0UL; i<M; ++i )
1096 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1098 for(
size_t k=kbegin; k<kend; ++k ) {
1099 const SIMDType a1(
set( A(i,k) ) );
1100 xmm1 += a1 * B.load(k,j );
1101 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1102 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1103 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1104 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1105 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1106 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1107 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1110 (~C).store( i, j , xmm1 );
1111 (~C).store( i, j+SIMDSIZE , xmm2 );
1112 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1113 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1114 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1115 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1116 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1117 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1122 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1126 for( ; (i+2UL) <= M; i+=2UL )
1139 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1141 for(
size_t k=kbegin; k<kend; ++k ) {
1142 const SIMDType a1(
set( A(i ,k) ) );
1143 const SIMDType a2(
set( A(i+1UL,k) ) );
1145 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1146 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1147 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1148 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1161 (~C).store( i , j , xmm1 );
1162 (~C).store( i , j+SIMDSIZE , xmm2 );
1163 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1164 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1165 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
1166 (~C).store( i+1UL, j , xmm6 );
1167 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
1168 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1169 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1170 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1182 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1184 for(
size_t k=kbegin; k<kend; ++k ) {
1185 const SIMDType a1(
set( A(i,k) ) );
1186 xmm1 += a1 * B.load(k,j );
1187 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1188 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1189 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1190 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1193 (~C).store( i, j , xmm1 );
1194 (~C).store( i, j+SIMDSIZE , xmm2 );
1195 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1196 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1197 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1201 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1203 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
1204 size_t i( LOW ? j : 0UL );
1206 for( ; (i+2UL) <= iend; i+=2UL )
1219 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1221 for(
size_t k=kbegin; k<kend; ++k ) {
1222 const SIMDType a1(
set( A(i ,k) ) );
1223 const SIMDType a2(
set( A(i+1UL,k) ) );
1225 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1226 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1227 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1238 (~C).store( i , j , xmm1 );
1239 (~C).store( i , j+SIMDSIZE , xmm2 );
1240 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1241 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1242 (~C).store( i+1UL, j , xmm5 );
1243 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1244 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1245 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1259 for(
size_t k=kbegin; k<kend; ++k ) {
1260 const SIMDType a1(
set( A(i,k) ) );
1261 xmm1 += a1 * B.load(k,j );
1262 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1263 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1264 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1267 (~C).store( i, j , xmm1 );
1268 (~C).store( i, j+SIMDSIZE , xmm2 );
1269 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1270 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1274 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1276 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
1277 size_t i( LOW ? j : 0UL );
1279 for( ; (i+2UL) <= iend; i+=2UL )
1292 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1294 for(
size_t k=kbegin; k<kend; ++k ) {
1295 const SIMDType a1(
set( A(i ,k) ) );
1296 const SIMDType a2(
set( A(i+1UL,k) ) );
1298 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1299 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1308 (~C).store( i , j , xmm1 );
1309 (~C).store( i , j+SIMDSIZE , xmm2 );
1310 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1311 (~C).store( i+1UL, j , xmm4 );
1312 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1313 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1327 for(
size_t k=kbegin; k<kend; ++k ) {
1328 const SIMDType a1(
set( A(i,k) ) );
1329 xmm1 += a1 * B.load(k,j );
1330 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1331 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1334 (~C).store( i, j , xmm1 );
1335 (~C).store( i, j+SIMDSIZE , xmm2 );
1336 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1340 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1342 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
1343 size_t i( LOW ? j : 0UL );
1345 for( ; (i+4UL) <= iend; i+=4UL )
1358 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1360 for(
size_t k=kbegin; k<kend; ++k ) {
1361 const SIMDType a1(
set( A(i ,k) ) );
1362 const SIMDType a2(
set( A(i+1UL,k) ) );
1363 const SIMDType a3(
set( A(i+2UL,k) ) );
1364 const SIMDType a4(
set( A(i+3UL,k) ) );
1366 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1377 (~C).store( i , j , xmm1 );
1378 (~C).store( i , j+SIMDSIZE, xmm2 );
1379 (~C).store( i+1UL, j , xmm3 );
1380 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1381 (~C).store( i+2UL, j , xmm5 );
1382 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1383 (~C).store( i+3UL, j , xmm7 );
1384 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
1387 for( ; (i+3UL) <= iend; i+=3UL )
1400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1402 for(
size_t k=kbegin; k<kend; ++k ) {
1403 const SIMDType a1(
set( A(i ,k) ) );
1404 const SIMDType a2(
set( A(i+1UL,k) ) );
1405 const SIMDType a3(
set( A(i+2UL,k) ) );
1407 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1416 (~C).store( i , j , xmm1 );
1417 (~C).store( i , j+SIMDSIZE, xmm2 );
1418 (~C).store( i+1UL, j , xmm3 );
1419 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1420 (~C).store( i+2UL, j , xmm5 );
1421 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1424 for( ; (i+2UL) <= iend; i+=2UL )
1437 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1440 for( ; (k+2UL) <= kend; k+=2UL ) {
1441 const SIMDType a1(
set( A(i ,k ) ) );
1442 const SIMDType a2(
set( A(i+1UL,k ) ) );
1443 const SIMDType a3(
set( A(i ,k+1UL) ) );
1444 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
1445 const SIMDType b1( B.load(k ,j ) );
1446 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1447 const SIMDType b3( B.load(k+1UL,j ) );
1448 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1459 for( ; k<kend; ++k ) {
1460 const SIMDType a1(
set( A(i ,k) ) );
1461 const SIMDType a2(
set( A(i+1UL,k) ) );
1463 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1470 (~C).store( i , j , xmm1+xmm5 );
1471 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
1472 (~C).store( i+1UL, j , xmm3+xmm7 );
1473 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1488 for( ; (k+2UL) <= kend; k+=2UL ) {
1489 const SIMDType a1(
set( A(i,k ) ) );
1490 const SIMDType a2(
set( A(i,k+1UL) ) );
1491 xmm1 += a1 * B.load(k ,j );
1492 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1493 xmm3 += a2 * B.load(k+1UL,j );
1494 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1497 for( ; k<kend; ++k ) {
1498 const SIMDType a1(
set( A(i,k) ) );
1499 xmm1 += a1 * B.load(k,j );
1500 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1503 (~C).store( i, j , xmm1+xmm3 );
1504 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
1508 for( ; j<jpos; j+=SIMDSIZE )
1510 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
1511 size_t i( LOW ? j : 0UL );
1513 for( ; (i+4UL) <= iend; i+=4UL )
1524 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1527 for( ; (k+2UL) <= kend; k+=2UL ) {
1529 const SIMDType b2( B.load(k+1UL,j) );
1530 xmm1 +=
set( A(i ,k ) ) * b1;
1531 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1532 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1533 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1534 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1535 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1536 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1537 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1540 for( ; k<kend; ++k ) {
1542 xmm1 +=
set( A(i ,k) ) * b1;
1543 xmm2 +=
set( A(i+1UL,k) ) * b1;
1544 xmm3 +=
set( A(i+2UL,k) ) * b1;
1545 xmm4 +=
set( A(i+3UL,k) ) * b1;
1548 (~C).store( i , j, xmm1+xmm5 );
1549 (~C).store( i+1UL, j, xmm2+xmm6 );
1550 (~C).store( i+2UL, j, xmm3+xmm7 );
1551 (~C).store( i+3UL, j, xmm4+xmm8 );
1554 for( ; (i+3UL) <= iend; i+=3UL )
1565 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1568 for( ; (k+2UL) <= kend; k+=2UL ) {
1570 const SIMDType b2( B.load(k+1UL,j) );
1571 xmm1 +=
set( A(i ,k ) ) * b1;
1572 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1573 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1574 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1575 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1576 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1579 for( ; k<kend; ++k ) {
1581 xmm1 +=
set( A(i ,k) ) * b1;
1582 xmm2 +=
set( A(i+1UL,k) ) * b1;
1583 xmm3 +=
set( A(i+2UL,k) ) * b1;
1586 (~C).store( i , j, xmm1+xmm4 );
1587 (~C).store( i+1UL, j, xmm2+xmm5 );
1588 (~C).store( i+2UL, j, xmm3+xmm6 );
1591 for( ; (i+2UL) <= iend; i+=2UL )
1605 for( ; (k+2UL) <= kend; k+=2UL ) {
1607 const SIMDType b2( B.load(k+1UL,j) );
1608 xmm1 +=
set( A(i ,k ) ) * b1;
1609 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1610 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1611 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1614 for( ; k<kend; ++k ) {
1616 xmm1 +=
set( A(i ,k) ) * b1;
1617 xmm2 +=
set( A(i+1UL,k) ) * b1;
1620 (~C).store( i , j, xmm1+xmm3 );
1621 (~C).store( i+1UL, j, xmm2+xmm4 );
1635 for( ; (k+2UL) <= K; k+=2UL ) {
1636 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1637 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1641 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1644 (~C).store( i, j, xmm1+xmm2 );
1648 for( ; remainder && j<N; ++j )
1650 size_t i( LOW && UPP ? j : 0UL );
1652 for( ; (i+2UL) <= M; i+=2UL )
1666 for(
size_t k=kbegin; k<kend; ++k ) {
1667 value1 += A(i ,k) * B(k,j);
1668 value2 += A(i+1UL,k) * B(k,j);
1671 (~C)(i ,j) = value1;
1672 (~C)(i+1UL,j) = value2;
1685 for(
size_t k=kbegin; k<K; ++k ) {
1686 value += A(i,k) * B(k,j);
// Post-processing for the symmetric/Hermitian case, applied only when N exceeds
// four SIMD widths: for each row i starting at SIMDSIZE*4, jend is i rounded
// down to a multiple of SIMDSIZE*4, and the elements (i,0..jend-1) are filled
// from their mirrored elements (j,i), conjugated when HERM is set.
1694 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1695 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1696 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1697 for(
size_t j=0UL; j<jend; ++j ) {
1698 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1702 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1703 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1704 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1705 for(
size_t i=0UL; i<iend; ++i ) {
1710 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1711 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1712 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1713 for(
size_t j=0UL; j<jend; ++j ) {
1737 template<
typename MT3
1745 const size_t M( A.rows() );
1746 const size_t N( B.columns() );
1747 const size_t K( A.columns() );
1751 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1754 if( LOW && UPP && M > SIMDSIZE*3UL ) {
1763 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1764 for(
size_t j=0UL; j<N; ++j )
1777 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1779 for(
size_t k=kbegin; k<kend; ++k ) {
1780 const SIMDType b1(
set( B(k,j) ) );
1781 xmm1 += A.load(i ,k) * b1;
1782 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1783 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1784 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1785 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1786 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1787 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1788 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1791 (~C).store( i , j, xmm1 );
1792 (~C).store( i+SIMDSIZE , j, xmm2 );
1793 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1794 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1795 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1796 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1797 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1798 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1803 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1807 for( ; (j+2UL) <= N; j+=2UL )
1820 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1822 for(
size_t k=kbegin; k<kend; ++k ) {
1824 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1825 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1826 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1827 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1828 const SIMDType b1(
set( B(k,j ) ) );
1829 const SIMDType b2(
set( B(k,j+1UL) ) );
1842 (~C).store( i , j , xmm1 );
1843 (~C).store( i+SIMDSIZE , j , xmm2 );
1844 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1845 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1846 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1847 (~C).store( i , j+1UL, xmm6 );
1848 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1849 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1850 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1851 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1863 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1865 for(
size_t k=kbegin; k<kend; ++k ) {
1866 const SIMDType b1(
set( B(k,j) ) );
1867 xmm1 += A.load(i ,k) * b1;
1868 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1869 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1870 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1871 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1874 (~C).store( i , j, xmm1 );
1875 (~C).store( i+SIMDSIZE , j, xmm2 );
1876 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1877 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1878 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1882 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1884 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
1885 size_t j( UPP ? i : 0UL );
1887 for( ; (j+2UL) <= jend; j+=2UL )
1900 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1902 for(
size_t k=kbegin; k<kend; ++k ) {
1904 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1905 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1906 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1907 const SIMDType b1(
set( B(k,j ) ) );
1908 const SIMDType b2(
set( B(k,j+1UL) ) );
1919 (~C).store( i , j , xmm1 );
1920 (~C).store( i+SIMDSIZE , j , xmm2 );
1921 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1922 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1923 (~C).store( i , j+1UL, xmm5 );
1924 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1925 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1926 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1940 for(
size_t k=kbegin; k<kend; ++k ) {
1941 const SIMDType b1(
set( B(k,j) ) );
1942 xmm1 += A.load(i ,k) * b1;
1943 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1944 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1945 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1948 (~C).store( i , j, xmm1 );
1949 (~C).store( i+SIMDSIZE , j, xmm2 );
1950 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1951 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1955 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1957 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
1958 size_t j( UPP ? i : 0UL );
1960 for( ; (j+2UL) <= jend; j+=2UL )
1973 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1975 for(
size_t k=kbegin; k<kend; ++k ) {
1977 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1978 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1979 const SIMDType b1(
set( B(k,j ) ) );
1980 const SIMDType b2(
set( B(k,j+1UL) ) );
1989 (~C).store( i , j , xmm1 );
1990 (~C).store( i+SIMDSIZE , j , xmm2 );
1991 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1992 (~C).store( i , j+1UL, xmm4 );
1993 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1994 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2008 for(
size_t k=kbegin; k<kend; ++k ) {
2009 const SIMDType b1(
set( B(k,j) ) );
2010 xmm1 += A.load(i ,k) * b1;
2011 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2012 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2015 (~C).store( i , j, xmm1 );
2016 (~C).store( i+SIMDSIZE , j, xmm2 );
2017 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2021 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2023 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
2024 size_t j( UPP ? i : 0UL );
2026 for( ; (j+4UL) <= jend; j+=4UL )
2039 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2041 for(
size_t k=kbegin; k<kend; ++k ) {
2043 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2044 const SIMDType b1(
set( B(k,j ) ) );
2045 const SIMDType b2(
set( B(k,j+1UL) ) );
2046 const SIMDType b3(
set( B(k,j+2UL) ) );
2047 const SIMDType b4(
set( B(k,j+3UL) ) );
2058 (~C).store( i , j , xmm1 );
2059 (~C).store( i+SIMDSIZE, j , xmm2 );
2060 (~C).store( i , j+1UL, xmm3 );
2061 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2062 (~C).store( i , j+2UL, xmm5 );
2063 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2064 (~C).store( i , j+3UL, xmm7 );
2065 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2068 for( ; (j+3UL) <= jend; j+=3UL )
2081 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2083 for(
size_t k=kbegin; k<kend; ++k ) {
2085 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2086 const SIMDType b1(
set( B(k,j ) ) );
2087 const SIMDType b2(
set( B(k,j+1UL) ) );
2088 const SIMDType b3(
set( B(k,j+2UL) ) );
2097 (~C).store( i , j , xmm1 );
2098 (~C).store( i+SIMDSIZE, j , xmm2 );
2099 (~C).store( i , j+1UL, xmm3 );
2100 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2101 (~C).store( i , j+2UL, xmm5 );
2102 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2105 for( ; (j+2UL) <= jend; j+=2UL )
2118 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2121 for( ; (k+2UL) <= kend; k+=2UL ) {
2122 const SIMDType a1( A.load(i ,k ) );
2123 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2124 const SIMDType a3( A.load(i ,k+1UL) );
2125 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2126 const SIMDType b1(
set( B(k ,j ) ) );
2127 const SIMDType b2(
set( B(k ,j+1UL) ) );
2128 const SIMDType b3(
set( B(k+1UL,j ) ) );
2129 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
2140 for( ; k<kend; ++k ) {
2142 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2143 const SIMDType b1(
set( B(k,j ) ) );
2144 const SIMDType b2(
set( B(k,j+1UL) ) );
2151 (~C).store( i , j , xmm1+xmm5 );
2152 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2153 (~C).store( i , j+1UL, xmm3+xmm7 );
2154 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2169 for( ; (k+2UL) <= kend; k+=2UL ) {
2170 const SIMDType b1(
set( B(k ,j) ) );
2171 const SIMDType b2(
set( B(k+1UL,j) ) );
2172 xmm1 += A.load(i ,k ) * b1;
2173 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2174 xmm3 += A.load(i ,k+1UL) * b2;
2175 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2178 for( ; k<kend; ++k ) {
2179 const SIMDType b1(
set( B(k,j) ) );
2180 xmm1 += A.load(i ,k) * b1;
2181 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2184 (~C).store( i , j, xmm1+xmm3 );
2185 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2189 for( ; i<ipos; i+=SIMDSIZE )
2191 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
2192 size_t j( UPP ? i : 0UL );
2194 for( ; (j+4UL) <= jend; j+=4UL )
2205 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2208 for( ; (k+2UL) <= kend; k+=2UL ) {
2210 const SIMDType a2( A.load(i,k+1UL) );
2211 xmm1 += a1 *
set( B(k ,j ) );
2212 xmm2 += a1 *
set( B(k ,j+1UL) );
2213 xmm3 += a1 *
set( B(k ,j+2UL) );
2214 xmm4 += a1 *
set( B(k ,j+3UL) );
2215 xmm5 += a2 *
set( B(k+1UL,j ) );
2216 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2217 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2218 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2221 for( ; k<kend; ++k ) {
2223 xmm1 += a1 *
set( B(k,j ) );
2224 xmm2 += a1 *
set( B(k,j+1UL) );
2225 xmm3 += a1 *
set( B(k,j+2UL) );
2226 xmm4 += a1 *
set( B(k,j+3UL) );
2229 (~C).store( i, j , xmm1+xmm5 );
2230 (~C).store( i, j+1UL, xmm2+xmm6 );
2231 (~C).store( i, j+2UL, xmm3+xmm7 );
2232 (~C).store( i, j+3UL, xmm4+xmm8 );
2235 for( ; (j+3UL) <= jend; j+=3UL )
2246 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2249 for( ; (k+2UL) <= kend; k+=2UL ) {
2251 const SIMDType a2( A.load(i,k+1UL) );
2252 xmm1 += a1 *
set( B(k ,j ) );
2253 xmm2 += a1 *
set( B(k ,j+1UL) );
2254 xmm3 += a1 *
set( B(k ,j+2UL) );
2255 xmm4 += a2 *
set( B(k+1UL,j ) );
2256 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2257 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2260 for( ; k<kend; ++k ) {
2262 xmm1 += a1 *
set( B(k,j ) );
2263 xmm2 += a1 *
set( B(k,j+1UL) );
2264 xmm3 += a1 *
set( B(k,j+2UL) );
2267 (~C).store( i, j , xmm1+xmm4 );
2268 (~C).store( i, j+1UL, xmm2+xmm5 );
2269 (~C).store( i, j+2UL, xmm3+xmm6 );
2272 for( ; (j+2UL) <= jend; j+=2UL )
2286 for( ; (k+2UL) <= kend; k+=2UL ) {
2288 const SIMDType a2( A.load(i,k+1UL) );
2289 xmm1 += a1 *
set( B(k ,j ) );
2290 xmm2 += a1 *
set( B(k ,j+1UL) );
2291 xmm3 += a2 *
set( B(k+1UL,j ) );
2292 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2295 for( ; k<kend; ++k ) {
2297 xmm1 += a1 *
set( B(k,j ) );
2298 xmm2 += a1 *
set( B(k,j+1UL) );
2301 (~C).store( i, j , xmm1+xmm3 );
2302 (~C).store( i, j+1UL, xmm2+xmm4 );
2316 for( ; (k+2UL) <= K; k+=2UL ) {
2317 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2318 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2322 xmm1 += A.load(i,k) *
set( B(k,j) );
2325 (~C).store( i, j, xmm1+xmm2 );
2329 for( ; remainder && i<M; ++i )
2331 size_t j( LOW && UPP ? i : 0UL );
2333 for( ; (j+2UL) <= N; j+=2UL )
2347 for(
size_t k=kbegin; k<kend; ++k ) {
2348 value1 += A(i,k) * B(k,j );
2349 value2 += A(i,k) * B(k,j+1UL);
2352 (~C)(i,j ) = value1;
2353 (~C)(i,j+1UL) = value2;
2366 for(
size_t k=kbegin; k<K; ++k ) {
2367 value += A(i,k) * B(k,j);
2375 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
2376 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2377 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2378 for(
size_t i=0UL; i<iend; ++i ) {
2379 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
2383 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
2384 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2385 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2386 for(
size_t i=0UL; i<iend; ++i ) {
2391 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2392 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2393 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2394 for(
size_t j=0UL; j<jend; ++j ) {
2417 template<
typename MT3
2421 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2423 selectDefaultAssignKernel( C, A, B );
2443 template<
typename MT3
2447 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2477 template<
typename MT3
2481 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2483 selectLargeAssignKernel( C, A, B );
2489 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2503 template<
typename MT3
2507 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2520 gemm( C, A, B, ET(1), ET(0) );
2540 template<
typename MT
2558 const ForwardFunctor fwd;
2560 const TmpType tmp(
serial( rhs ) );
2561 assign( ~lhs, fwd( tmp ) );
2579 template<
typename MT
2588 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2602 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2618 template<
typename MT3
2621 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2626 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2627 selectSmallAddAssignKernel( C, A, B );
2629 selectBlasAddAssignKernel( C, A, B );
2648 template<
typename MT3
2654 const size_t M( A.rows() );
2655 const size_t N( B.columns() );
2656 const size_t K( A.columns() );
2660 for(
size_t i=0UL; i<M; ++i )
2670 for(
size_t k=kbegin; k<kend; ++k )
2674 ?( UPP ?
max(i,k+1UL) : k+1UL )
2675 :( UPP ?
max(i,k) : k ) )
2676 :( UPP ? i : 0UL ) );
2679 ?( LOW ?
min(i+1UL,k) : k )
2680 :( LOW ?
min(i,k)+1UL : k+1UL ) )
2681 :( LOW ? i+1UL : N ) );
2683 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
2686 const size_t jnum( jend - jbegin );
2687 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2689 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2690 (~C)(i,j ) += A(i,k) * B(k,j );
2691 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2694 (~C)(i,jpos) += A(i,k) * B(k,jpos);
2716 template<
typename MT3
2722 const size_t M( A.rows() );
2723 const size_t N( B.columns() );
2724 const size_t K( A.columns() );
2728 for(
size_t j=0UL; j<N; ++j )
2738 for(
size_t k=kbegin; k<kend; ++k )
2742 ?( LOW ?
max(j,k+1UL) : k+1UL )
2743 :( LOW ?
max(j,k) : k ) )
2744 :( LOW ? j : 0UL ) );
2747 ?( UPP ?
min(j+1UL,k) : k )
2748 :( UPP ?
min(j,k)+1UL : k+1UL ) )
2749 :( UPP ? j+1UL : M ) );
2751 if( ( LOW || UPP ) && ibegin >= iend )
continue;
2754 const size_t inum( iend - ibegin );
2755 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2757 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2758 (~C)(i ,j) += A(i ,k) * B(k,j);
2759 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2762 (~C)(ipos,j) += A(ipos,k) * B(k,j);
2784 template<
typename MT3
2787 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2790 constexpr
size_t block( BLOCK_SIZE );
2792 const size_t M( A.rows() );
2793 const size_t N( B.columns() );
2795 for(
size_t ii=0UL; ii<M; ii+=block ) {
2796 const size_t iend(
min( M, ii+block ) );
2797 for(
size_t jj=0UL; jj<N; jj+=block ) {
2798 const size_t jend(
min( N, jj+block ) );
2799 for(
size_t i=ii; i<iend; ++i )
2808 for(
size_t j=jbegin; j<jpos; ++j ) {
2809 (~C)(i,j) += A(i,j) * B(j,j);
2832 template<
typename MT3
2835 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2838 const size_t M( A.rows() );
2839 const size_t N( B.columns() );
2841 for(
size_t j=0UL; j<N; ++j )
2851 const size_t inum( iend - ibegin );
2852 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2854 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2855 (~C)(i ,j) += A(i ,j) * B(j,j);
2856 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2859 (~C)(ipos,j) += A(ipos,j) * B(j,j);
2880 template<
typename MT3
2886 const size_t M( A.rows() );
2887 const size_t N( B.columns() );
2889 for(
size_t i=0UL; i<M; ++i )
2899 const size_t jnum( jend - jbegin );
2900 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2902 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2903 (~C)(i,j ) += A(i,i) * B(i,j );
2904 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2907 (~C)(i,jpos) += A(i,i) * B(i,jpos);
2928 template<
typename MT3
2934 constexpr
size_t block( BLOCK_SIZE );
2936 const size_t M( A.rows() );
2937 const size_t N( B.columns() );
2939 for(
size_t jj=0UL; jj<N; jj+=block ) {
2940 const size_t jend(
min( N, jj+block ) );
2941 for(
size_t ii=0UL; ii<M; ii+=block ) {
2942 const size_t iend(
min( M, ii+block ) );
2943 for(
size_t j=jj; j<jend; ++j )
2952 for(
size_t i=ibegin; i<ipos; ++i ) {
2953 (~C)(i,j) += A(i,i) * B(i,j);
// selectDefaultAddAssignKernel overload that touches only diagonal elements:
// for every row index i of A it performs C(i,i) += A(i,i) * B(i,i).
// NOTE(review): the enabling constraint and return type are on lines not
// visible in this excerpt; presumably this is the overload for two diagonal
// operands — confirm.
2976 template<
typename MT3
2980 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A.rows() is used as the length of the diagonal.
2982 for(
size_t i=0UL; i<A.rows(); ++i ) {
2983 C(i,i) += A(i,i) * B(i,i);
// selectSmallAddAssignKernel overload that simply forwards C, A and B to
// selectDefaultAddAssignKernel.
// NOTE(review): the enabling constraint and return type are on lines not
// visible in this excerpt; presumably this is the fallback used when the
// vectorized small kernel is not applicable — confirm.
3003 template<
typename MT3
3007 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3009 selectDefaultAddAssignKernel( C, A, B );
// Vectorized add-assignment kernel that runs SIMD along the columns of C:
// each step loads SIMD vectors from C(i,j...), accumulates
// set( A(i,k) ) * B.load(k,j...) over k and stores the result back to C.
// Column strips of 8, 5, 4, 3, 2 and 1 SIMD vectors are tried in that order,
// followed by a scalar loop for the remaining columns.
// NOTE(review): the function name, constraint, parameter list, the
// declarations of `remainder`, `j`, `i`, kbegin/kend and some accumulators
// are on lines not visible in this excerpt; presumably this is the
// vectorized selectSmallAddAssignKernel for row-major targets — confirm.
3029 template<
typename MT3
3037 const size_t M( A.rows() );
3038 const size_t N( B.columns() );
3039 const size_t K( A.columns() );
// When `remainder` is set, jpos is N masked with size_t(-SIMDSIZE), i.e. N
// rounded down to a multiple of SIMDSIZE (assumes SIMDSIZE is a power of
// two); otherwise the SIMD loops cover all N columns.
3043 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// 8 SIMD vectors of columns per step, one row at a time. Skipped when LOW or
// UPP is set.
3050 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3051 for(
size_t i=0UL; i<M; ++i )
3065 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3066 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3067 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3068 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3069 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
3070 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
3071 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
3073 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast the scalar A(i,k) to all SIMD lanes.
3074 const SIMDType a1(
set( A(i,k) ) );
3075 xmm1 += a1 * B.load(k,j );
3076 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3077 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3078 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3079 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3080 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3081 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3082 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3085 (~C).store( i, j , xmm1 );
3086 (~C).store( i, j+SIMDSIZE , xmm2 );
3087 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3088 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3089 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3090 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
3091 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
3092 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// 5 SIMD vectors of columns per step; rows are processed in pairs first.
3097 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3101 for( ; (i+2UL) <= M; i+=2UL )
3114 SIMDType xmm1 ( (~C).load(i ,j ) );
3115 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
3116 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
3117 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
3118 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
3119 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
3120 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
3121 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3122 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3123 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
3125 for(
size_t k=kbegin; k<kend; ++k ) {
3126 const SIMDType a1(
set( A(i ,k) ) );
3127 const SIMDType a2(
set( A(i+1UL,k) ) );
3129 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3130 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3131 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3132 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3145 (~C).store( i , j , xmm1 );
3146 (~C).store( i , j+SIMDSIZE , xmm2 );
3147 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3148 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3149 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
3150 (~C).store( i+1UL, j , xmm6 );
3151 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
3152 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3153 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3154 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Single-row variant of the 5-vector strip (its guard is on a line not shown).
3167 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3168 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3169 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3170 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3172 for(
size_t k=kbegin; k<kend; ++k ) {
3173 const SIMDType a1(
set( A(i,k) ) );
3174 xmm1 += a1 * B.load(k,j );
3175 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3176 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3177 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3178 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3181 (~C).store( i, j , xmm1 );
3182 (~C).store( i, j+SIMDSIZE , xmm2 );
3183 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3184 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3185 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// 4 SIMD vectors of columns per step; rows are processed in pairs first.
3189 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3193 for( ; (i+2UL) <= M; i+=2UL )
3207 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3208 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3209 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3210 SIMDType xmm5( (~C).load(i+1UL,j ) );
3211 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3212 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3213 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3215 for(
size_t k=kbegin; k<kend; ++k ) {
3216 const SIMDType a1(
set( A(i ,k) ) );
3217 const SIMDType a2(
set( A(i+1UL,k) ) );
3219 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3220 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3221 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3232 (~C).store( i , j , xmm1 );
3233 (~C).store( i , j+SIMDSIZE , xmm2 );
3234 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3235 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3236 (~C).store( i+1UL, j , xmm5 );
3237 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3238 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3239 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row variant of the 4-vector strip (its guard is on a line not shown).
3252 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3253 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3254 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3256 for(
size_t k=kbegin; k<kend; ++k ) {
3257 const SIMDType a1(
set( A(i,k) ) );
3258 xmm1 += a1 * B.load(k,j );
3259 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3260 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3261 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3264 (~C).store( i, j , xmm1 );
3265 (~C).store( i, j+SIMDSIZE , xmm2 );
3266 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3267 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// 3 SIMD vectors of columns per step; rows are processed in pairs first.
3271 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3275 for( ; (i+2UL) <= M; i+=2UL )
3289 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3290 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3291 SIMDType xmm4( (~C).load(i+1UL,j ) );
3292 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3293 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3295 for(
size_t k=kbegin; k<kend; ++k ) {
3296 const SIMDType a1(
set( A(i ,k) ) );
3297 const SIMDType a2(
set( A(i+1UL,k) ) );
3299 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3300 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3309 (~C).store( i , j , xmm1 );
3310 (~C).store( i , j+SIMDSIZE , xmm2 );
3311 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3312 (~C).store( i+1UL, j , xmm4 );
3313 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3314 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Single-row variant of the 3-vector strip (its guard is on a line not shown).
3327 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3328 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3330 for(
size_t k=kbegin; k<kend; ++k ) {
3331 const SIMDType a1(
set( A(i,k) ) );
3332 xmm1 += a1 * B.load(k,j );
3333 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3334 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3337 (~C).store( i, j , xmm1 );
3338 (~C).store( i, j+SIMDSIZE , xmm2 );
3339 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// 2 SIMD vectors of columns per step. Skipped only when both LOW and UPP are
// set; the row range is restricted by UPP (upper bound) and LOW (start at j).
3343 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3345 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
3346 size_t i( LOW ? j : 0UL );
// Four rows per iteration.
3348 for( ; (i+4UL) <= iend; i+=4UL )
3362 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3363 SIMDType xmm3( (~C).load(i+1UL,j ) );
3364 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3365 SIMDType xmm5( (~C).load(i+2UL,j ) );
3366 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3367 SIMDType xmm7( (~C).load(i+3UL,j ) );
3368 SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
3370 for(
size_t k=kbegin; k<kend; ++k ) {
3371 const SIMDType a1(
set( A(i ,k) ) );
3372 const SIMDType a2(
set( A(i+1UL,k) ) );
3373 const SIMDType a3(
set( A(i+2UL,k) ) );
3374 const SIMDType a4(
set( A(i+3UL,k) ) );
3376 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3387 (~C).store( i , j , xmm1 );
3388 (~C).store( i , j+SIMDSIZE, xmm2 );
3389 (~C).store( i+1UL, j , xmm3 );
3390 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3391 (~C).store( i+2UL, j , xmm5 );
3392 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3393 (~C).store( i+3UL, j , xmm7 );
3394 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
// Three rows per iteration.
3397 for( ; (i+3UL) <= iend; i+=3UL )
3411 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3412 SIMDType xmm3( (~C).load(i+1UL,j ) );
3413 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3414 SIMDType xmm5( (~C).load(i+2UL,j ) );
3415 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3417 for(
size_t k=kbegin; k<kend; ++k ) {
3418 const SIMDType a1(
set( A(i ,k) ) );
3419 const SIMDType a2(
set( A(i+1UL,k) ) );
3420 const SIMDType a3(
set( A(i+2UL,k) ) );
3422 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3431 (~C).store( i , j , xmm1 );
3432 (~C).store( i , j+SIMDSIZE, xmm2 );
3433 (~C).store( i+1UL, j , xmm3 );
3434 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3435 (~C).store( i+2UL, j , xmm5 );
3436 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
// Two rows per iteration, with k unrolled by two.
3439 for( ; (i+2UL) <= iend; i+=2UL )
3453 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3454 SIMDType xmm3( (~C).load(i+1UL,j ) );
3455 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3459 for( ; (k+2UL) <= kend; k+=2UL ) {
3460 const SIMDType a1(
set( A(i ,k ) ) );
3461 const SIMDType a2(
set( A(i+1UL,k ) ) );
3462 const SIMDType a3(
set( A(i ,k+1UL) ) );
3463 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
3464 const SIMDType b1( B.load(k ,j ) );
3465 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3466 const SIMDType b3( B.load(k+1UL,j ) );
3467 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Cleanup for the k values not consumed by the unrolled loop above.
3478 for( ; k<kend; ++k ) {
3479 const SIMDType a1(
set( A(i ,k) ) );
3480 const SIMDType a2(
set( A(i+1UL,k) ) );
3482 const SIMDType b2( B.load(k,j+SIMDSIZE) );
// Each output is stored as the sum of two accumulators (xmm5..xmm8 are
// declared and updated on lines not shown here).
3489 (~C).store( i , j , xmm1+xmm5 );
3490 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
3491 (~C).store( i+1UL, j , xmm3+xmm7 );
3492 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
// Single-row variant of the 2-vector strip (its guard is on a line not shown).
3505 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3509 for( ; (k+2UL) <= kend; k+=2UL ) {
3510 const SIMDType a1(
set( A(i,k ) ) );
3511 const SIMDType a2(
set( A(i,k+1UL) ) );
3512 xmm1 += a1 * B.load(k ,j );
3513 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
3514 xmm3 += a2 * B.load(k+1UL,j );
3515 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
3518 for( ; k<kend; ++k ) {
3519 const SIMDType a1(
set( A(i,k) ) );
3520 xmm1 += a1 * B.load(k,j );
3521 xmm2 += a1 * B.load(k,j+SIMDSIZE);
3524 (~C).store( i, j , xmm1+xmm3 );
3525 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
// 1 SIMD vector of columns per step, for whatever is left below jpos.
3529 for( ; j<jpos; j+=SIMDSIZE )
3531 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
3532 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k unrolled by two.
3534 for( ; (i+4UL) <= iend; i+=4UL )
3546 SIMDType xmm2( (~C).load(i+1UL,j) );
3547 SIMDType xmm3( (~C).load(i+2UL,j) );
3548 SIMDType xmm4( (~C).load(i+3UL,j) );
3552 for( ; (k+2UL) <= kend; k+=2UL ) {
3554 const SIMDType b2( B.load(k+1UL,j) );
3555 xmm1 +=
set( A(i ,k ) ) * b1;
3556 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3557 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3558 xmm4 +=
set( A(i+3UL,k ) ) * b1;
3559 xmm5 +=
set( A(i ,k+1UL) ) * b2;
3560 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
3561 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
3562 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
3565 for( ; k<kend; ++k ) {
3567 xmm1 +=
set( A(i ,k) ) * b1;
3568 xmm2 +=
set( A(i+1UL,k) ) * b1;
3569 xmm3 +=
set( A(i+2UL,k) ) * b1;
3570 xmm4 +=
set( A(i+3UL,k) ) * b1;
3573 (~C).store( i , j, xmm1+xmm5 );
3574 (~C).store( i+1UL, j, xmm2+xmm6 );
3575 (~C).store( i+2UL, j, xmm3+xmm7 );
3576 (~C).store( i+3UL, j, xmm4+xmm8 );
// Three rows per iteration, k unrolled by two.
3579 for( ; (i+3UL) <= iend; i+=3UL )
3591 SIMDType xmm2( (~C).load(i+1UL,j) );
3592 SIMDType xmm3( (~C).load(i+2UL,j) );
3596 for( ; (k+2UL) <= kend; k+=2UL ) {
3598 const SIMDType b2( B.load(k+1UL,j) );
3599 xmm1 +=
set( A(i ,k ) ) * b1;
3600 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3601 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3602 xmm4 +=
set( A(i ,k+1UL) ) * b2;
3603 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
3604 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
3607 for( ; k<kend; ++k ) {
3609 xmm1 +=
set( A(i ,k) ) * b1;
3610 xmm2 +=
set( A(i+1UL,k) ) * b1;
3611 xmm3 +=
set( A(i+2UL,k) ) * b1;
3614 (~C).store( i , j, xmm1+xmm4 );
3615 (~C).store( i+1UL, j, xmm2+xmm5 );
3616 (~C).store( i+2UL, j, xmm3+xmm6 );
// Two rows per iteration, k unrolled by two.
3619 for( ; (i+2UL) <= iend; i+=2UL )
3631 SIMDType xmm2( (~C).load(i+1UL,j) );
3635 for( ; (k+2UL) <= kend; k+=2UL ) {
3637 const SIMDType b2( B.load(k+1UL,j) );
3638 xmm1 +=
set( A(i ,k ) ) * b1;
3639 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3640 xmm3 +=
set( A(i ,k+1UL) ) * b2;
3641 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
3644 for( ; k<kend; ++k ) {
3646 xmm1 +=
set( A(i ,k) ) * b1;
3647 xmm2 +=
set( A(i+1UL,k) ) * b1;
3650 (~C).store( i , j, xmm1+xmm3 );
3651 (~C).store( i+1UL, j, xmm2+xmm4 );
// Single leftover row (guard not shown); here k runs up to K rather than kend.
3666 for( ; (k+2UL) <= K; k+=2UL ) {
3667 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
3668 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
3672 xmm1 +=
set( A(i,k) ) * B.load(k,j);
3675 (~C).store( i, j, xmm1+xmm2 );
// Scalar cleanup for the columns in [jpos,N); only runs when `remainder` is set.
3679 for( ; remainder && j<N; ++j )
3681 const size_t iend( UPP ? j+1UL : M );
3682 size_t i( LOW ? j : 0UL );
// Two rows per iteration.
3684 for( ; (i+2UL) <= iend; i+=2UL )
3698 for(
size_t k=kbegin; k<kend; ++k ) {
3699 value1 += A(i ,k) * B(k,j);
3700 value2 += A(i+1UL,k) * B(k,j);
// value1/value2 are written back by plain assignment; presumably they were
// initialised from C on lines not shown — confirm.
3703 (~C)(i ,j) = value1;
3704 (~C)(i+1UL,j) = value2;
// Single leftover row (guard not shown).
3717 for(
size_t k=kbegin; k<K; ++k ) {
3718 value += A(i,k) * B(k,j);
// Vectorized add-assignment kernel that runs SIMD along the rows of C:
// each step loads SIMD vectors from C(i...,j), accumulates
// A.load(i...,k) * set( B(k,j) ) over k and stores the result back to C.
// Row strips of 8, 5, 4, 3, 2 and 1 SIMD vectors are tried in that order,
// followed by a scalar loop for the remaining rows.
// NOTE(review): the function name, constraint, parameter list, the
// declarations of `remainder`, `i`, `j`, kbegin/kend and some accumulators
// are on lines not visible in this excerpt; presumably this is the
// vectorized selectSmallAddAssignKernel for column-major targets — confirm.
3743 template<
typename MT3
3751 const size_t M( A.rows() );
3752 const size_t N( B.columns() );
3753 const size_t K( A.columns() );
// When `remainder` is set, ipos is M masked with size_t(-SIMDSIZE), i.e. M
// rounded down to a multiple of SIMDSIZE (assumes SIMDSIZE is a power of
// two); otherwise the SIMD loops cover all M rows.
3757 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// 8 SIMD vectors of rows per step, one column at a time. Skipped when LOW or
// UPP is set.
3764 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3765 for(
size_t j=0UL; j<N; ++j )
3779 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3780 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3781 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3782 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3783 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3784 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3785 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3787 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast the scalar B(k,j) to all SIMD lanes.
3788 const SIMDType b1(
set( B(k,j) ) );
3789 xmm1 += A.load(i ,k) * b1;
3790 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3791 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3792 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3793 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3794 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3795 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3796 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3799 (~C).store( i , j, xmm1 );
3800 (~C).store( i+SIMDSIZE , j, xmm2 );
3801 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3802 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3803 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3804 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3805 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3806 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// 5 SIMD vectors of rows per step; columns are processed in pairs first.
3811 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3815 for( ; (j+2UL) <= N; j+=2UL )
3828 SIMDType xmm1 ( (~C).load(i ,j ) );
3829 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3830 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3831 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3832 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3833 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3834 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3835 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3836 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3837 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3839 for(
size_t k=kbegin; k<kend; ++k ) {
3841 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3842 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3843 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3844 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3845 const SIMDType b1(
set( B(k,j ) ) );
3846 const SIMDType b2(
set( B(k,j+1UL) ) );
3859 (~C).store( i , j , xmm1 );
3860 (~C).store( i+SIMDSIZE , j , xmm2 );
3861 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3862 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3863 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3864 (~C).store( i , j+1UL, xmm6 );
3865 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3866 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3867 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3868 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column variant of the 5-vector strip (its guard is on a line not shown).
3881 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3882 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3883 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3884 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3886 for(
size_t k=kbegin; k<kend; ++k ) {
3887 const SIMDType b1(
set( B(k,j) ) );
3888 xmm1 += A.load(i ,k) * b1;
3889 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3890 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3891 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3892 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3895 (~C).store( i , j, xmm1 );
3896 (~C).store( i+SIMDSIZE , j, xmm2 );
3897 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3898 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3899 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// 4 SIMD vectors of rows per step; columns are processed in pairs first.
3903 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3907 for( ; (j+2UL) <= N; j+=2UL )
3921 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3922 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3923 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3924 SIMDType xmm5( (~C).load(i ,j+1UL) );
3925 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3926 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3927 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3929 for(
size_t k=kbegin; k<kend; ++k ) {
3931 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3932 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3933 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3934 const SIMDType b1(
set( B(k,j ) ) );
3935 const SIMDType b2(
set( B(k,j+1UL) ) );
3946 (~C).store( i , j , xmm1 );
3947 (~C).store( i+SIMDSIZE , j , xmm2 );
3948 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3949 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3950 (~C).store( i , j+1UL, xmm5 );
3951 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3952 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3953 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector strip (its guard is on a line not shown).
3966 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3967 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3968 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3970 for(
size_t k=kbegin; k<kend; ++k ) {
3971 const SIMDType b1(
set( B(k,j) ) );
3972 xmm1 += A.load(i ,k) * b1;
3973 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3974 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3975 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3978 (~C).store( i , j, xmm1 );
3979 (~C).store( i+SIMDSIZE , j, xmm2 );
3980 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3981 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// 3 SIMD vectors of rows per step; columns are processed in pairs first.
3985 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3989 for( ; (j+2UL) <= N; j+=2UL )
4003 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4004 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4005 SIMDType xmm4( (~C).load(i ,j+1UL) );
4006 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
4007 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4009 for(
size_t k=kbegin; k<kend; ++k ) {
4011 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4012 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4013 const SIMDType b1(
set( B(k,j ) ) );
4014 const SIMDType b2(
set( B(k,j+1UL) ) );
4023 (~C).store( i , j , xmm1 );
4024 (~C).store( i+SIMDSIZE , j , xmm2 );
4025 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4026 (~C).store( i , j+1UL, xmm4 );
4027 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
4028 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column variant of the 3-vector strip (its guard is on a line not shown).
4041 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4042 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4044 for(
size_t k=kbegin; k<kend; ++k ) {
4045 const SIMDType b1(
set( B(k,j) ) );
4046 xmm1 += A.load(i ,k) * b1;
4047 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4048 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4051 (~C).store( i , j, xmm1 );
4052 (~C).store( i+SIMDSIZE , j, xmm2 );
4053 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// 2 SIMD vectors of rows per step. Skipped only when both LOW and UPP are
// set; the column range is restricted by LOW (upper bound) and UPP (start at i).
4057 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4059 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
4060 size_t j( UPP ? i : 0UL );
// Four columns per iteration.
4062 for( ; (j+4UL) <= jend; j+=4UL )
4076 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4077 SIMDType xmm3( (~C).load(i ,j+1UL) );
4078 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4079 SIMDType xmm5( (~C).load(i ,j+2UL) );
4080 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4081 SIMDType xmm7( (~C).load(i ,j+3UL) );
4082 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
4084 for(
size_t k=kbegin; k<kend; ++k ) {
4086 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4087 const SIMDType b1(
set( B(k,j ) ) );
4088 const SIMDType b2(
set( B(k,j+1UL) ) );
4089 const SIMDType b3(
set( B(k,j+2UL) ) );
4090 const SIMDType b4(
set( B(k,j+3UL) ) );
4101 (~C).store( i , j , xmm1 );
4102 (~C).store( i+SIMDSIZE, j , xmm2 );
4103 (~C).store( i , j+1UL, xmm3 );
4104 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4105 (~C).store( i , j+2UL, xmm5 );
4106 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4107 (~C).store( i , j+3UL, xmm7 );
4108 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
4111 for( ; (j+3UL) <= jend; j+=3UL )
4125 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4126 SIMDType xmm3( (~C).load(i ,j+1UL) );
4127 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4128 SIMDType xmm5( (~C).load(i ,j+2UL) );
4129 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4131 for(
size_t k=kbegin; k<kend; ++k ) {
4133 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4134 const SIMDType b1(
set( B(k,j ) ) );
4135 const SIMDType b2(
set( B(k,j+1UL) ) );
4136 const SIMDType b3(
set( B(k,j+2UL) ) );
4145 (~C).store( i , j , xmm1 );
4146 (~C).store( i+SIMDSIZE, j , xmm2 );
4147 (~C).store( i , j+1UL, xmm3 );
4148 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4149 (~C).store( i , j+2UL, xmm5 );
4150 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration, with k unrolled by two.
4153 for( ; (j+2UL) <= jend; j+=2UL )
4167 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4168 SIMDType xmm3( (~C).load(i ,j+1UL) );
4169 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
// `<=` (as in every other 2-way k-unrolled loop of this kernel) lets the last
// complete pair k,k+1 go through the unrolled path; k+1 < kend still holds,
// and the cleanup loop below only has to handle a single trailing k.
4173 for( ; (k+2UL) <= kend; k+=2UL ) {
4174 const SIMDType a1( A.load(i ,k ) );
4175 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
4176 const SIMDType a3( A.load(i ,k+1UL) );
4177 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
4178 const SIMDType b1(
set( B(k ,j ) ) );
4179 const SIMDType b2(
set( B(k ,j+1UL) ) );
4180 const SIMDType b3(
set( B(k+1UL,j ) ) );
4181 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Cleanup for the k values not consumed by the unrolled loop above.
4192 for( ; k<kend; ++k ) {
4194 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4195 const SIMDType b1(
set( B(k,j ) ) );
4196 const SIMDType b2(
set( B(k,j+1UL) ) );
// Each output is stored as the sum of two accumulators (xmm5..xmm8 are
// declared and updated on lines not shown here).
4203 (~C).store( i , j , xmm1+xmm5 );
4204 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
4205 (~C).store( i , j+1UL, xmm3+xmm7 );
4206 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single-column variant of the 2-vector strip (its guard is on a line not shown).
4219 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
4223 for( ; (k+2UL) <= kend; k+=2UL ) {
4224 const SIMDType b1(
set( B(k ,j) ) );
4225 const SIMDType b2(
set( B(k+1UL,j) ) );
4226 xmm1 += A.load(i ,k ) * b1;
4227 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
4228 xmm3 += A.load(i ,k+1UL) * b2;
4229 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
4232 for( ; k<kend; ++k ) {
4233 const SIMDType b1(
set( B(k,j) ) );
4234 xmm1 += A.load(i ,k) * b1;
4235 xmm2 += A.load(i+SIMDSIZE,k) * b1;
4238 (~C).store( i , j, xmm1+xmm3 );
4239 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// 1 SIMD vector of rows per step, for whatever is left below ipos.
4243 for( ; i<ipos; i+=SIMDSIZE )
4245 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
4246 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two.
4248 for( ; (j+4UL) <= jend; j+=4UL )
4260 SIMDType xmm2( (~C).load(i,j+1UL) );
4261 SIMDType xmm3( (~C).load(i,j+2UL) );
4262 SIMDType xmm4( (~C).load(i,j+3UL) );
4266 for( ; (k+2UL) <= kend; k+=2UL ) {
4268 const SIMDType a2( A.load(i,k+1UL) );
4269 xmm1 += a1 *
set( B(k ,j ) );
4270 xmm2 += a1 *
set( B(k ,j+1UL) );
4271 xmm3 += a1 *
set( B(k ,j+2UL) );
4272 xmm4 += a1 *
set( B(k ,j+3UL) );
4273 xmm5 += a2 *
set( B(k+1UL,j ) );
4274 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
4275 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
4276 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
4279 for( ; k<kend; ++k ) {
4281 xmm1 += a1 *
set( B(k,j ) );
4282 xmm2 += a1 *
set( B(k,j+1UL) );
4283 xmm3 += a1 *
set( B(k,j+2UL) );
4284 xmm4 += a1 *
set( B(k,j+3UL) );
4287 (~C).store( i, j , xmm1+xmm5 );
4288 (~C).store( i, j+1UL, xmm2+xmm6 );
4289 (~C).store( i, j+2UL, xmm3+xmm7 );
4290 (~C).store( i, j+3UL, xmm4+xmm8 );
// Three columns per iteration, k unrolled by two.
4293 for( ; (j+3UL) <= jend; j+=3UL )
4305 SIMDType xmm2( (~C).load(i,j+1UL) );
4306 SIMDType xmm3( (~C).load(i,j+2UL) );
4310 for( ; (k+2UL) <= kend; k+=2UL ) {
4312 const SIMDType a2( A.load(i,k+1UL) );
4313 xmm1 += a1 *
set( B(k ,j ) );
4314 xmm2 += a1 *
set( B(k ,j+1UL) );
4315 xmm3 += a1 *
set( B(k ,j+2UL) );
4316 xmm4 += a2 *
set( B(k+1UL,j ) );
4317 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
4318 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
4321 for( ; k<kend; ++k ) {
4323 xmm1 += a1 *
set( B(k,j ) );
4324 xmm2 += a1 *
set( B(k,j+1UL) );
4325 xmm3 += a1 *
set( B(k,j+2UL) );
4328 (~C).store( i, j , xmm1+xmm4 );
4329 (~C).store( i, j+1UL, xmm2+xmm5 );
4330 (~C).store( i, j+2UL, xmm3+xmm6 );
// Two columns per iteration, k unrolled by two.
4333 for( ; (j+2UL) <= jend; j+=2UL )
4345 SIMDType xmm2( (~C).load(i,j+1UL) );
4349 for( ; (k+2UL) <= kend; k+=2UL ) {
4351 const SIMDType a2( A.load(i,k+1UL) );
4352 xmm1 += a1 *
set( B(k ,j ) );
4353 xmm2 += a1 *
set( B(k ,j+1UL) );
4354 xmm3 += a2 *
set( B(k+1UL,j ) );
4355 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
4358 for( ; k<kend; ++k ) {
4360 xmm1 += a1 *
set( B(k,j ) );
4361 xmm2 += a1 *
set( B(k,j+1UL) );
4364 (~C).store( i, j , xmm1+xmm3 );
4365 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single leftover column (guard not shown); here k runs up to K rather than kend.
4380 for( ; (k+2UL) <= K; k+=2UL ) {
4381 xmm1 += A.load(i,k ) *
set( B(k ,j) );
4382 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
4386 xmm1 += A.load(i,k) *
set( B(k,j) );
4389 (~C).store( i, j, xmm1+xmm2 );
// Scalar cleanup for the rows in [ipos,M); only runs when `remainder` is set.
4393 for( ; remainder && i<M; ++i )
4395 const size_t jend( LOW ? i+1UL : N );
4396 size_t j( UPP ? i : 0UL );
// Two columns per iteration.
4398 for( ; (j+2UL) <= jend; j+=2UL )
4412 for(
size_t k=kbegin; k<kend; ++k ) {
4413 value1 += A(i,k) * B(k,j );
4414 value2 += A(i,k) * B(k,j+1UL);
// value1/value2 are written back by plain assignment; presumably they were
// initialised from C on lines not shown — confirm.
4417 (~C)(i,j ) = value1;
4418 (~C)(i,j+1UL) = value2;
// Single leftover column (guard not shown).
4431 for(
size_t k=kbegin; k<K; ++k ) {
4432 value += A(i,k) * B(k,j);
// selectLargeAddAssignKernel overload that simply forwards C, A and B to
// selectDefaultAddAssignKernel.
// NOTE(review): the enabling constraint and return type are on lines not
// visible in this excerpt; presumably this is the fallback used when the
// vectorized large kernel is not applicable — confirm.
4456 template<
typename MT3
4460 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4462 selectDefaultAddAssignKernel( C, A, B );
// Second selectLargeAddAssignKernel overload taking the target C and the
// operands A and B.
// NOTE(review): its enabling constraint and its entire body are on lines not
// visible in this excerpt; presumably this is the vectorized large-matrix
// path — confirm against the full source.
4482 template<
typename MT3
4486 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// selectBlasAddAssignKernel overload that simply forwards C, A and B to
// selectLargeAddAssignKernel.
// NOTE(review): the enabling constraint and return type are on lines not
// visible in this excerpt; presumably this is the fallback used when the
// BLAS kernel cannot be applied — confirm.
4512 template<
typename MT3
4516 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4518 selectLargeAddAssignKernel( C, A, B );
// selectBlasAddAssignKernel overload that is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set.
// Two paths add a temporary `tmp` to C via addAssign(); the third calls
// gemm() directly on C, A and B.
// NOTE(review): the branch conditions and the construction of `tmp` are on
// lines not visible in this excerpt; presumably the first two branches handle
// triangular operands — confirm.
4524 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4538 template<
typename MT3
4542 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4549 addAssign( C, tmp );
4554 addAssign( C, tmp );
// Both scalar factors are ET(1); presumably this yields C = A*B + C — confirm
// against the parameter order of gemm().
4557 gemm( C, A, B, ET(1), ET(1) );
// Entry point for the subtraction assignment of the product to a target
// matrix `lhs`: after an emptiness check on the result dimensions and the
// inner dimension it dispatches to selectSubAssignKernel().
// NOTE(review): the function name, signature and the definitions of A and B
// are on lines not visible in this excerpt; presumably this is the subAssign
// friend function — confirm.
4581 template<
typename MT
// Nothing to compute when the target has no rows/columns or the inner
// dimension (columns of the left operand) is zero; the body of this branch is
// not visible here.
4590 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4604 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the subtraction-assignment kernel for C -= A * B: the small kernel
// when the visible size test (or a preceding condition on a line not shown)
// holds, the BLAS kernel otherwise.
4620 template<
typename MT3
4623 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Size test: the number of elements of C is compared against
// TDMATDMATMULT_THRESHOLD. The first part of the condition is not visible.
4628 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4629 selectSmallSubAssignKernel( C, A, B );
4631 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel that walks C row by row: for each
// (i,k) pair it subtracts A(i,k) * B(k,j) from C(i,j) over a column range
// [jbegin,jend), two columns per inner iteration.
// NOTE(review): the function name, constraint, parameter list, the
// computation of kbegin/kend and the leading conditions of the jbegin/jend
// expressions are on lines not visible in this excerpt.
4650 template<
typename MT3
4656 const size_t M( A.rows() );
4657 const size_t N( B.columns() );
4658 const size_t K( A.columns() );
4662 for(
size_t i=0UL; i<M; ++i )
4672 for(
size_t k=kbegin; k<kend; ++k )
// Visible tail of the jbegin initialiser: when UPP is set the start column is
// never below i; the k-dependent alternatives are selected by conditions on
// lines not shown.
4676 ?( UPP ?
max(i,k+1UL) : k+1UL )
4677 :( UPP ?
max(i,k) : k ) )
4678 :( UPP ? i : 0UL ) );
// Visible tail of the jend initialiser: when LOW is set the end column never
// exceeds i+1; the k-dependent alternatives are selected by conditions on
// lines not shown.
4681 ?( LOW ?
min(i+1UL,k) : k )
4682 :( LOW ?
min(i,k)+1UL : k+1UL ) )
4683 :( LOW ? i+1UL : N ) );
// With LOW or UPP set the column range may be empty; skip it so that
// jend - jbegin below cannot wrap around.
4685 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
// jpos = jbegin + (column count rounded down to an even number).
4688 const size_t jnum( jend - jbegin );
4689 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4691 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4692 (~C)(i,j ) -= A(i,k) * B(k,j );
4693 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update for the leftover column at jpos (its guard is on a line not shown).
4696 (~C)(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction-assignment kernel that walks C column by column: for
// each (j,k) pair it subtracts A(i,k) * B(k,j) from C(i,j) over a row range
// [ibegin,iend), two rows per inner iteration.
// NOTE(review): the function name, constraint, parameter list, the
// computation of kbegin/kend and the leading conditions of the ibegin/iend
// expressions are on lines not visible in this excerpt.
4718 template<
typename MT3
4724 const size_t M( A.rows() );
4725 const size_t N( B.columns() );
4726 const size_t K( A.columns() );
4730 for(
size_t j=0UL; j<N; ++j )
4740 for(
size_t k=kbegin; k<kend; ++k )
// Visible tail of the ibegin initialiser: when LOW is set the start row is
// never below j; the k-dependent alternatives are selected by conditions on
// lines not shown.
4744 ?( LOW ?
max(j,k+1UL) : k+1UL )
4745 :( LOW ?
max(j,k) : k ) )
4746 :( LOW ? j : 0UL ) );
// Visible tail of the iend initialiser: when UPP is set the end row never
// exceeds j+1; the k-dependent alternatives are selected by conditions on
// lines not shown.
4749 ?( UPP ?
min(j+1UL,k) : k )
4750 :( UPP ?
min(j,k)+1UL : k+1UL ) )
4751 :( UPP ? j+1UL : M ) );
// With LOW or UPP set the row range may be empty; skip it so that
// iend - ibegin below cannot wrap around.
4753 if( ( LOW || UPP ) && ( ibegin >= iend ) )
continue;
// ipos = ibegin + (row count rounded down to an even number).
4756 const size_t inum( iend - ibegin );
4757 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4759 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4760 (~C)(i ,j) -= A(i ,k) * B(k,j);
4761 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Update for the leftover row at ipos (its guard is on a line not shown).
4764 (~C)(ipos,j) -= A(ipos,k) * B(k,j);
// Subtraction-assignment kernel overload that is enabled when MT4 is not
// diagonal and MT5 is diagonal (see the EnableIf_ constraint below). Only the
// diagonal element B(j,j) is read for column j, so the update is
// C(i,j) -= A(i,j) * B(j,j). The index space is walked in block x block tiles.
// NOTE(review): the function name, its parameter list and the computation of
// jbegin/jpos are on lines not visible in this excerpt.
4786 template<
typename MT3
4789 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Tile edge length, taken from BLOCK_SIZE.
4792 constexpr
size_t block( BLOCK_SIZE );
4794 const size_t M( A.rows() );
4795 const size_t N( B.columns() );
// Outer tiling over rows; iend clamps the last (possibly partial) tile to M.
4797 for(
size_t ii=0UL; ii<M; ii+=block ) {
4798 const size_t iend(
min( M, ii+block ) );
// Inner tiling over columns; jend clamps the last tile to N.
4799 for(
size_t jj=0UL; jj<N; jj+=block ) {
4800 const size_t jend(
min( N, jj+block ) );
4801 for(
size_t i=ii; i<iend; ++i )
// Column range [jbegin,jpos) for the current row (bounds set on lines not shown).
4810 for(
size_t j=jbegin; j<jpos; ++j ) {
4811 (~C)(i,j) -= A(i,j) * B(j,j);
// Subtraction-assignment kernel overload enabled when MT4 is not diagonal and
// MT5 is diagonal (see the EnableIf_ constraint below). Traverses C column by
// column and subtracts A(i,j) * B(j,j), processing two rows per inner
// iteration.
// NOTE(review): the function name, its parameter list and the computation of
// ibegin/iend are on lines not visible in this excerpt.
4834 template<
typename MT3
4837 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4840 const size_t M( A.rows() );
4841 const size_t N( B.columns() );
4843 for(
size_t j=0UL; j<N; ++j )
// ipos = ibegin + (row count rounded down to an even number): end of the
// 2-way unrolled part.
4853 const size_t inum( iend - ibegin );
4854 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4856 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4857 (~C)(i ,j) -= A(i ,j) * B(j,j);
4858 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Update for the leftover row at ipos (its guard is on a line not shown).
4861 (~C)(ipos,j) -= A(ipos,j) * B(j,j);
// Subtraction-assignment kernel that reads only the diagonal element A(i,i)
// of the left operand: C(i,j) -= A(i,i) * B(i,j). Traverses C row by row and
// handles two columns per inner iteration.
// NOTE(review): the enabling constraint, function name, parameter list and
// the computation of jbegin/jend are on lines not visible in this excerpt;
// presumably this is the "diagonal MT4, non-diagonal MT5" overload — confirm.
4882 template<
typename MT3
4888 const size_t M( A.rows() );
4889 const size_t N( B.columns() );
4891 for(
size_t i=0UL; i<M; ++i )
// jpos = jbegin + (column count rounded down to an even number): end of the
// 2-way unrolled part.
4901 const size_t jnum( jend - jbegin );
4902 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4904 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4905 (~C)(i,j ) -= A(i,i) * B(i,j );
4906 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Update for the leftover column at jpos (its guard is on a line not shown).
4909 (~C)(i,jpos) -= A(i,i) * B(i,jpos);
// Subtraction-assignment kernel that reads only the diagonal element A(i,i)
// of the left operand: C(i,j) -= A(i,i) * B(i,j). The index space is walked
// in block x block tiles, columns outermost.
// NOTE(review): the enabling constraint, function name, parameter list and
// the computation of ibegin/ipos are on lines not visible in this excerpt.
4930 template<
typename MT3
// Tile edge length, taken from BLOCK_SIZE.
4936 constexpr
size_t block( BLOCK_SIZE );
4938 const size_t M( A.rows() );
4939 const size_t N( B.columns() );
// Outer tiling over columns; jend clamps the last (possibly partial) tile to N.
4941 for(
size_t jj=0UL; jj<N; jj+=block ) {
4942 const size_t jend(
min( N, jj+block ) );
// Inner tiling over rows; iend clamps the last tile to M.
4943 for(
size_t ii=0UL; ii<M; ii+=block ) {
4944 const size_t iend(
min( M, ii+block ) );
4945 for(
size_t j=jj; j<jend; ++j )
// Row range [ibegin,ipos) for the current column (bounds set on lines not shown).
4954 for(
size_t i=ibegin; i<ipos; ++i ) {
4955 (~C)(i,j) -= A(i,i) * B(i,j);
// selectDefaultSubAssignKernel overload that touches only diagonal elements:
// for every row index i of A it performs C(i,i) -= A(i,i) * B(i,i).
// NOTE(review): the enabling constraint and return type are on lines not
// visible in this excerpt; presumably this is the overload for two diagonal
// operands — confirm.
4978 template<
typename MT3
4982 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A.rows() is used as the length of the diagonal.
4984 for(
size_t i=0UL; i<A.rows(); ++i ) {
4985 C(i,i) -= A(i,i) * B(i,i);
// selectSmallSubAssignKernel overload that simply forwards C, A and B to
// selectDefaultSubAssignKernel.
// NOTE(review): the enabling constraint and return type are on lines not
// visible in this excerpt; presumably this is the fallback used when the
// vectorized small kernel is not applicable — confirm.
5005 template<
typename MT3
5009 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5011 selectDefaultSubAssignKernel( C, A, B );
// Vectorized subtraction-assignment kernel fragment (SIMD along the columns of C and B).
// Each tier below loads SIMD vectors of C, subtracts set( A(i,k) ) * B.load(k,...) for the
// k range, and stores the vectors back into C.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the lines between them (function name, kbegin/kend setup, several xmm
// declarations and updates, closing braces) are not visible here.
5031 template<
typename MT3
5039 const size_t M( A.rows() );
5040 const size_t N( B.columns() );
5041 const size_t K( A.columns() );
// When remainder is set, jpos is N with its low bits masked off (N & -SIMDSIZE), i.e. N
// rounded down to a multiple of SIMDSIZE assuming SIMDSIZE is a power of two - TODO confirm;
// otherwise jpos is N.
5045 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Tier: eight SIMD vectors (SIMDSIZE*8 columns) per row i; skipped when LOW or UPP is set.
5052 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5053 for(
size_t i=0UL; i<M; ++i )
5067 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5068 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5069 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5070 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5071 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
5072 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
5073 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
// a1 holds A(i,k) in every SIMD lane; it scales the eight vectors of row k of B.
5075 for(
size_t k=kbegin; k<kend; ++k ) {
5076 const SIMDType a1(
set( A(i,k) ) );
5077 xmm1 -= a1 * B.load(k,j );
5078 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5079 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5080 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5081 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5082 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5083 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5084 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5087 (~C).store( i, j , xmm1 );
5088 (~C).store( i, j+SIMDSIZE , xmm2 );
5089 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5090 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5091 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
5092 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
5093 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
5094 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Tier: five SIMD vectors per row, two rows (i and i+1) per iteration; skipped when LOW or
// UPP is set.
5099 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5103 for( ; (i+2UL) <= M; i+=2UL )
5116 SIMDType xmm1 ( (~C).load(i ,j ) );
5117 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
5118 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
5119 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
5120 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
5121 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
5122 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
5123 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5124 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5125 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
// The B vectors of row k are loaded once and shared by both rows of A (the xmm updates are
// on lines that are not shown).
5127 for(
size_t k=kbegin; k<kend; ++k ) {
5128 const SIMDType a1(
set( A(i ,k) ) );
5129 const SIMDType a2(
set( A(i+1UL,k) ) );
5131 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5132 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5133 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5134 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5147 (~C).store( i , j , xmm1 );
5148 (~C).store( i , j+SIMDSIZE , xmm2 );
5149 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5150 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5151 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
5152 (~C).store( i+1UL, j , xmm6 );
5153 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
5154 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5155 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5156 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Same five-vector update for a single row i; presumably the leftover row when the two-row
// loop above cannot take another pair - TODO confirm.
5169 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5170 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5171 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5172 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5174 for(
size_t k=kbegin; k<kend; ++k ) {
5175 const SIMDType a1(
set( A(i,k) ) );
5176 xmm1 -= a1 * B.load(k,j );
5177 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5178 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5179 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5180 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5183 (~C).store( i, j , xmm1 );
5184 (~C).store( i, j+SIMDSIZE , xmm2 );
5185 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5186 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5187 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// Tier: four SIMD vectors per row, two rows per iteration; skipped when LOW or UPP is set.
5191 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5195 for( ; (i+2UL) <= M; i+=2UL )
5209 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5210 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5211 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
5212 SIMDType xmm5( (~C).load(i+1UL,j ) );
5213 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
5214 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5215 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5217 for(
size_t k=kbegin; k<kend; ++k ) {
5218 const SIMDType a1(
set( A(i ,k) ) );
5219 const SIMDType a2(
set( A(i+1UL,k) ) );
5221 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5222 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5223 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5234 (~C).store( i , j , xmm1 );
5235 (~C).store( i , j+SIMDSIZE , xmm2 );
5236 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5237 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5238 (~C).store( i+1UL, j , xmm5 );
5239 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
5240 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5241 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Same four-vector update for a single row i.
5254 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5255 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5256 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5258 for(
size_t k=kbegin; k<kend; ++k ) {
5259 const SIMDType a1(
set( A(i,k) ) );
5260 xmm1 -= a1 * B.load(k,j );
5261 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5262 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5263 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5266 (~C).store( i, j , xmm1 );
5267 (~C).store( i, j+SIMDSIZE , xmm2 );
5268 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5269 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Tier: three SIMD vectors per row, two rows per iteration; skipped when LOW or UPP is set.
5273 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5277 for( ; (i+2UL) <= M; i+=2UL )
5291 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5292 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5293 SIMDType xmm4( (~C).load(i+1UL,j ) );
5294 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
5295 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5297 for(
size_t k=kbegin; k<kend; ++k ) {
5298 const SIMDType a1(
set( A(i ,k) ) );
5299 const SIMDType a2(
set( A(i+1UL,k) ) );
5301 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5302 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5311 (~C).store( i , j , xmm1 );
5312 (~C).store( i , j+SIMDSIZE , xmm2 );
5313 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5314 (~C).store( i+1UL, j , xmm4 );
5315 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
5316 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Same three-vector update for a single row i.
5329 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5330 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5332 for(
size_t k=kbegin; k<kend; ++k ) {
5333 const SIMDType a1(
set( A(i,k) ) );
5334 xmm1 -= a1 * B.load(k,j );
5335 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5336 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5339 (~C).store( i, j , xmm1 );
5340 (~C).store( i, j+SIMDSIZE , xmm2 );
5341 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// Tier: two SIMD vectors per row; this tier runs unless both LOW and UPP are set.
5345 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Row range is narrowed by the structure flags: the end is clamped to j+SIMDSIZE*2 when UPP
// is set, and the start is j instead of 0 when LOW is set.
5347 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
5348 size_t i( LOW ? j : 0UL );
// Four rows per iteration.
5350 for( ; (i+4UL) <= iend; i+=4UL )
5364 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5365 SIMDType xmm3( (~C).load(i+1UL,j ) );
5366 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5367 SIMDType xmm5( (~C).load(i+2UL,j ) );
5368 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5369 SIMDType xmm7( (~C).load(i+3UL,j ) );
5370 SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
5372 for(
size_t k=kbegin; k<kend; ++k ) {
5373 const SIMDType a1(
set( A(i ,k) ) );
5374 const SIMDType a2(
set( A(i+1UL,k) ) );
5375 const SIMDType a3(
set( A(i+2UL,k) ) );
5376 const SIMDType a4(
set( A(i+3UL,k) ) );
5378 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5389 (~C).store( i , j , xmm1 );
5390 (~C).store( i , j+SIMDSIZE, xmm2 );
5391 (~C).store( i+1UL, j , xmm3 );
5392 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5393 (~C).store( i+2UL, j , xmm5 );
5394 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
5395 (~C).store( i+3UL, j , xmm7 );
5396 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
// Three rows per iteration.
5399 for( ; (i+3UL) <= iend; i+=3UL )
5413 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5414 SIMDType xmm3( (~C).load(i+1UL,j ) );
5415 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5416 SIMDType xmm5( (~C).load(i+2UL,j ) );
5417 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5419 for(
size_t k=kbegin; k<kend; ++k ) {
5420 const SIMDType a1(
set( A(i ,k) ) );
5421 const SIMDType a2(
set( A(i+1UL,k) ) );
5422 const SIMDType a3(
set( A(i+2UL,k) ) );
5424 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5433 (~C).store( i , j , xmm1 );
5434 (~C).store( i , j+SIMDSIZE, xmm2 );
5435 (~C).store( i+1UL, j , xmm3 );
5436 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5437 (~C).store( i+2UL, j , xmm5 );
5438 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
// Two rows per iteration, with the k loop unrolled by two.
5441 for( ; (i+2UL) <= iend; i+=2UL )
5455 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5456 SIMDType xmm3( (~C).load(i+1UL,j ) );
5457 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5461 for( ; (k+2UL) <= kend; k+=2UL ) {
5462 const SIMDType a1(
set( A(i ,k ) ) );
5463 const SIMDType a2(
set( A(i+1UL,k ) ) );
5464 const SIMDType a3(
set( A(i ,k+1UL) ) );
5465 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
5466 const SIMDType b1( B.load(k ,j ) );
5467 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5468 const SIMDType b3( B.load(k+1UL,j ) );
5469 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Remaining single k step when the k range has odd length.
5480 for( ; k<kend; ++k ) {
5481 const SIMDType a1(
set( A(i ,k) ) );
5482 const SIMDType a2(
set( A(i+1UL,k) ) );
5484 const SIMDType b2( B.load(k,j+SIMDSIZE) );
// The stored value is the sum of two accumulators per vector; xmm5..xmm8 are presumably the
// accumulators of the k+1 half of the unrolled loop (their declaration and updates are on
// lines that are not shown) - TODO confirm.
5491 (~C).store( i , j , xmm1+xmm5 );
5492 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
5493 (~C).store( i+1UL, j , xmm3+xmm7 );
5494 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
// Single row, k loop unrolled by two; xmm3/xmm4 collect the k+1 terms and are added on store.
5507 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
5511 for( ; (k+2UL) <= kend; k+=2UL ) {
5512 const SIMDType a1(
set( A(i,k ) ) );
5513 const SIMDType a2(
set( A(i,k+1UL) ) );
5514 xmm1 -= a1 * B.load(k ,j );
5515 xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
5516 xmm3 -= a2 * B.load(k+1UL,j );
5517 xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
5520 for( ; k<kend; ++k ) {
5521 const SIMDType a1(
set( A(i,k) ) );
5522 xmm1 -= a1 * B.load(k,j );
5523 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
5526 (~C).store( i, j , xmm1+xmm3 );
5527 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
// Tier: one SIMD vector per row for the remaining full-width column groups.
5531 for( ; j<jpos; j+=SIMDSIZE )
// NOTE(review): the two-vector tier above clamps iend when UPP alone is set, whereas this
// tier clamps only when both LOW and UPP are set - confirm that the difference is intended.
5533 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
5534 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k loop unrolled by two.
5536 for( ; (i+4UL) <= iend; i+=4UL )
5548 SIMDType xmm2( (~C).load(i+1UL,j) );
5549 SIMDType xmm3( (~C).load(i+2UL,j) );
5550 SIMDType xmm4( (~C).load(i+3UL,j) );
5554 for( ; (k+2UL) <= kend; k+=2UL ) {
5556 const SIMDType b2( B.load(k+1UL,j) );
5557 xmm1 -=
set( A(i ,k ) ) * b1;
5558 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5559 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5560 xmm4 -=
set( A(i+3UL,k ) ) * b1;
5561 xmm5 -=
set( A(i ,k+1UL) ) * b2;
5562 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
5563 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
5564 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
5567 for( ; k<kend; ++k ) {
5569 xmm1 -=
set( A(i ,k) ) * b1;
5570 xmm2 -=
set( A(i+1UL,k) ) * b1;
5571 xmm3 -=
set( A(i+2UL,k) ) * b1;
5572 xmm4 -=
set( A(i+3UL,k) ) * b1;
5575 (~C).store( i , j, xmm1+xmm5 );
5576 (~C).store( i+1UL, j, xmm2+xmm6 );
5577 (~C).store( i+2UL, j, xmm3+xmm7 );
5578 (~C).store( i+3UL, j, xmm4+xmm8 );
// Three rows per iteration, k loop unrolled by two.
5581 for( ; (i+3UL) <= iend; i+=3UL )
5593 SIMDType xmm2( (~C).load(i+1UL,j) );
5594 SIMDType xmm3( (~C).load(i+2UL,j) );
5598 for( ; (k+2UL) <= kend; k+=2UL ) {
5600 const SIMDType b2( B.load(k+1UL,j) );
5601 xmm1 -=
set( A(i ,k ) ) * b1;
5602 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5603 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5604 xmm4 -=
set( A(i ,k+1UL) ) * b2;
5605 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
5606 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
5609 for( ; k<kend; ++k ) {
5611 xmm1 -=
set( A(i ,k) ) * b1;
5612 xmm2 -=
set( A(i+1UL,k) ) * b1;
5613 xmm3 -=
set( A(i+2UL,k) ) * b1;
5616 (~C).store( i , j, xmm1+xmm4 );
5617 (~C).store( i+1UL, j, xmm2+xmm5 );
5618 (~C).store( i+2UL, j, xmm3+xmm6 );
// Two rows per iteration, k loop unrolled by two.
5621 for( ; (i+2UL) <= iend; i+=2UL )
5633 SIMDType xmm2( (~C).load(i+1UL,j) );
5637 for( ; (k+2UL) <= kend; k+=2UL ) {
5639 const SIMDType b2( B.load(k+1UL,j) );
5640 xmm1 -=
set( A(i ,k ) ) * b1;
5641 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5642 xmm3 -=
set( A(i ,k+1UL) ) * b2;
5643 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
5646 for( ; k<kend; ++k ) {
5648 xmm1 -=
set( A(i ,k) ) * b1;
5649 xmm2 -=
set( A(i+1UL,k) ) * b1;
5652 (~C).store( i , j, xmm1+xmm3 );
5653 (~C).store( i+1UL, j, xmm2+xmm4 );
// Single row; note that this k loop is bounded by K rather than by a kend variable.
5668 for( ; (k+2UL) <= K; k+=2UL ) {
5669 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
5670 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
5674 xmm1 -=
set( A(i,k) ) * B.load(k,j);
5677 (~C).store( i, j, xmm1+xmm2 );
// Element-wise loop over the columns left after the SIMD tiers; entered only when remainder
// is set.
5681 for( ; remainder && j<N; ++j )
// Row range: ends at j+1 when UPP is set (otherwise M) and starts at j when LOW is set
// (otherwise 0).
5683 const size_t iend( UPP ? j+1UL : M );
5684 size_t i( LOW ? j : 0UL );
// Two rows per iteration using scalar accumulators value1/value2 (declared on lines that are
// not shown).
5686 for( ; (i+2UL) <= iend; i+=2UL )
5700 for(
size_t k=kbegin; k<kend; ++k ) {
5701 value1 -= A(i ,k) * B(k,j);
5702 value2 -= A(i+1UL,k) * B(k,j);
5705 (~C)(i ,j) = value1;
5706 (~C)(i+1UL,j) = value2;
// Single row, scalar accumulator; this k loop runs up to K.
5719 for(
size_t k=kbegin; k<K; ++k ) {
5720 value -= A(i,k) * B(k,j);
// Vectorized subtraction-assignment kernel fragment (SIMD along the rows of C and A).
// Each tier below loads SIMD vectors of C, subtracts A.load(...,k) * set( B(k,j) ) for the
// k range, and stores the vectors back into C.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the lines between them (function name, kbegin/kend setup, several xmm
// declarations and updates, closing braces) are not visible here.
5745 template<
typename MT3
5753 const size_t M( A.rows() );
5754 const size_t N( B.columns() );
5755 const size_t K( A.columns() );
// When remainder is set, ipos is M with its low bits masked off (M & -SIMDSIZE), i.e. M
// rounded down to a multiple of SIMDSIZE assuming SIMDSIZE is a power of two - TODO confirm;
// otherwise ipos is M.
5759 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Tier: eight SIMD vectors (SIMDSIZE*8 rows) per column j; skipped when LOW or UPP is set.
5766 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5767 for(
size_t j=0UL; j<N; ++j )
5781 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5782 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5783 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5784 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5785 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
5786 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
5787 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
// b1 holds B(k,j) in every SIMD lane; it scales the eight vectors of column k of A.
5789 for(
size_t k=kbegin; k<kend; ++k ) {
5790 const SIMDType b1(
set( B(k,j) ) );
5791 xmm1 -= A.load(i ,k) * b1;
5792 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5793 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5794 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5795 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5796 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
5797 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
5798 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
5801 (~C).store( i , j, xmm1 );
5802 (~C).store( i+SIMDSIZE , j, xmm2 );
5803 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5804 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5805 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
5806 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
5807 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
5808 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Tier: five SIMD vectors per column, two columns (j and j+1) per iteration; skipped when
// LOW or UPP is set.
5813 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5817 for( ; (j+2UL) <= N; j+=2UL )
5830 SIMDType xmm1 ( (~C).load(i ,j ) );
5831 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
5832 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
5833 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
5834 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
5835 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
5836 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
5837 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5838 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5839 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
// The A vectors of column k are loaded once and shared by both columns of B (the xmm updates
// are on lines that are not shown).
5841 for(
size_t k=kbegin; k<kend; ++k ) {
5843 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5844 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5845 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5846 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5847 const SIMDType b1(
set( B(k,j ) ) );
5848 const SIMDType b2(
set( B(k,j+1UL) ) );
5861 (~C).store( i , j , xmm1 );
5862 (~C).store( i+SIMDSIZE , j , xmm2 );
5863 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5864 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5865 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
5866 (~C).store( i , j+1UL, xmm6 );
5867 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
5868 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
5869 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
5870 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Same five-vector update for a single column j; presumably the leftover column when the
// two-column loop above cannot take another pair - TODO confirm.
5883 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5884 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5885 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5886 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5888 for(
size_t k=kbegin; k<kend; ++k ) {
5889 const SIMDType b1(
set( B(k,j) ) );
5890 xmm1 -= A.load(i ,k) * b1;
5891 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5892 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5893 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5894 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5897 (~C).store( i , j, xmm1 );
5898 (~C).store( i+SIMDSIZE , j, xmm2 );
5899 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5900 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5901 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Tier: four SIMD vectors per column, two columns per iteration; skipped when LOW or UPP is
// set.
5905 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5909 for( ; (j+2UL) <= N; j+=2UL )
5923 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
5924 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
5925 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
5926 SIMDType xmm5( (~C).load(i ,j+1UL) );
5927 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
5928 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5929 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5931 for(
size_t k=kbegin; k<kend; ++k ) {
5933 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5934 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5935 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5936 const SIMDType b1(
set( B(k,j ) ) );
5937 const SIMDType b2(
set( B(k,j+1UL) ) );
5948 (~C).store( i , j , xmm1 );
5949 (~C).store( i+SIMDSIZE , j , xmm2 );
5950 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5951 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5952 (~C).store( i , j+1UL, xmm5 );
5953 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
5954 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
5955 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Same four-vector update for a single column j.
5968 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5969 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5970 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5972 for(
size_t k=kbegin; k<kend; ++k ) {
5973 const SIMDType b1(
set( B(k,j) ) );
5974 xmm1 -= A.load(i ,k) * b1;
5975 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5976 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5977 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5980 (~C).store( i , j, xmm1 );
5981 (~C).store( i+SIMDSIZE , j, xmm2 );
5982 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5983 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Tier: three SIMD vectors per column, two columns per iteration; skipped when LOW or UPP is
// set.
5987 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5991 for( ; (j+2UL) <= N; j+=2UL )
6005 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
6006 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
6007 SIMDType xmm4( (~C).load(i ,j+1UL) );
6008 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
6009 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
6011 for(
size_t k=kbegin; k<kend; ++k ) {
6013 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6014 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6015 const SIMDType b1(
set( B(k,j ) ) );
6016 const SIMDType b2(
set( B(k,j+1UL) ) );
6025 (~C).store( i , j , xmm1 );
6026 (~C).store( i+SIMDSIZE , j , xmm2 );
6027 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
6028 (~C).store( i , j+1UL, xmm4 );
6029 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
6030 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Same three-vector update for a single column j.
6043 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
6044 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
6046 for(
size_t k=kbegin; k<kend; ++k ) {
6047 const SIMDType b1(
set( B(k,j) ) );
6048 xmm1 -= A.load(i ,k) * b1;
6049 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6050 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6053 (~C).store( i , j, xmm1 );
6054 (~C).store( i+SIMDSIZE , j, xmm2 );
6055 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Tier: two SIMD vectors per column; this tier runs unless both LOW and UPP are set.
6059 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range is narrowed by the structure flags: the end is clamped to i+SIMDSIZE*2 when
// LOW is set, and the start is i instead of 0 when UPP is set.
6061 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
6062 size_t j( UPP ? i : 0UL );
// Four columns per iteration.
6064 for( ; (j+4UL) <= jend; j+=4UL )
6078 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6079 SIMDType xmm3( (~C).load(i ,j+1UL) );
6080 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6081 SIMDType xmm5( (~C).load(i ,j+2UL) );
6082 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6083 SIMDType xmm7( (~C).load(i ,j+3UL) );
6084 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
6086 for(
size_t k=kbegin; k<kend; ++k ) {
6088 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6089 const SIMDType b1(
set( B(k,j ) ) );
6090 const SIMDType b2(
set( B(k,j+1UL) ) );
6091 const SIMDType b3(
set( B(k,j+2UL) ) );
6092 const SIMDType b4(
set( B(k,j+3UL) ) );
6103 (~C).store( i , j , xmm1 );
6104 (~C).store( i+SIMDSIZE, j , xmm2 );
6105 (~C).store( i , j+1UL, xmm3 );
6106 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6107 (~C).store( i , j+2UL, xmm5 );
6108 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
6109 (~C).store( i , j+3UL, xmm7 );
6110 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
6113 for( ; (j+3UL) <= jend; j+=3UL )
6127 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6128 SIMDType xmm3( (~C).load(i ,j+1UL) );
6129 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6130 SIMDType xmm5( (~C).load(i ,j+2UL) );
6131 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6133 for(
size_t k=kbegin; k<kend; ++k ) {
6135 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6136 const SIMDType b1(
set( B(k,j ) ) );
6137 const SIMDType b2(
set( B(k,j+1UL) ) );
6138 const SIMDType b3(
set( B(k,j+2UL) ) );
6147 (~C).store( i , j , xmm1 );
6148 (~C).store( i+SIMDSIZE, j , xmm2 );
6149 (~C).store( i , j+1UL, xmm3 );
6150 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6151 (~C).store( i , j+2UL, xmm5 );
6152 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration, with the k loop unrolled by two.
6155 for( ; (j+2UL) <= jend; j+=2UL )
6169 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6170 SIMDType xmm3( (~C).load(i ,j+1UL) );
6171 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6175 for( ; (k+2UL) <= kend; k+=2UL ) {
6176 const SIMDType a1( A.load(i ,k ) );
6177 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6178 const SIMDType a3( A.load(i ,k+1UL) );
6179 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6180 const SIMDType b1(
set( B(k ,j ) ) );
6181 const SIMDType b2(
set( B(k ,j+1UL) ) );
6182 const SIMDType b3(
set( B(k+1UL,j ) ) );
6183 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remaining single k step when the k range has odd length.
6194 for( ; k<kend; ++k ) {
6196 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6197 const SIMDType b1(
set( B(k,j ) ) );
6198 const SIMDType b2(
set( B(k,j+1UL) ) );
// The stored value is the sum of two accumulators per vector; xmm5..xmm8 are presumably the
// accumulators of the k+1 half of the unrolled loop (their declaration and updates are on
// lines that are not shown) - TODO confirm.
6205 (~C).store( i , j , xmm1+xmm5 );
6206 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
6207 (~C).store( i , j+1UL, xmm3+xmm7 );
6208 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single column, k loop unrolled by two; xmm3/xmm4 collect the k+1 terms and are added on
// store.
6221 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
6225 for( ; (k+2UL) <= kend; k+=2UL ) {
6226 const SIMDType b1(
set( B(k ,j) ) );
6227 const SIMDType b2(
set( B(k+1UL,j) ) );
6228 xmm1 -= A.load(i ,k ) * b1;
6229 xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
6230 xmm3 -= A.load(i ,k+1UL) * b2;
6231 xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
6234 for( ; k<kend; ++k ) {
6235 const SIMDType b1(
set( B(k,j) ) );
6236 xmm1 -= A.load(i ,k) * b1;
6237 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6240 (~C).store( i , j, xmm1+xmm3 );
6241 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// Tier: one SIMD vector per column for the remaining full-width row groups.
6245 for( ; i<ipos; i+=SIMDSIZE )
// NOTE(review): the two-vector tier above clamps jend when LOW alone is set, whereas this
// tier clamps only when both LOW and UPP are set - confirm that the difference is intended.
6247 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
6248 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k loop unrolled by two.
6250 for( ; (j+4UL) <= jend; j+=4UL )
6262 SIMDType xmm2( (~C).load(i,j+1UL) );
6263 SIMDType xmm3( (~C).load(i,j+2UL) );
6264 SIMDType xmm4( (~C).load(i,j+3UL) );
6268 for( ; (k+2UL) <= kend; k+=2UL ) {
6270 const SIMDType a2( A.load(i,k+1UL) );
6271 xmm1 -= a1 *
set( B(k ,j ) );
6272 xmm2 -= a1 *
set( B(k ,j+1UL) );
6273 xmm3 -= a1 *
set( B(k ,j+2UL) );
6274 xmm4 -= a1 *
set( B(k ,j+3UL) );
6275 xmm5 -= a2 *
set( B(k+1UL,j ) );
6276 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
6277 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
6278 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
6281 for( ; k<kend; ++k ) {
6283 xmm1 -= a1 *
set( B(k,j ) );
6284 xmm2 -= a1 *
set( B(k,j+1UL) );
6285 xmm3 -= a1 *
set( B(k,j+2UL) );
6286 xmm4 -= a1 *
set( B(k,j+3UL) );
6289 (~C).store( i, j , xmm1+xmm5 );
6290 (~C).store( i, j+1UL, xmm2+xmm6 );
6291 (~C).store( i, j+2UL, xmm3+xmm7 );
6292 (~C).store( i, j+3UL, xmm4+xmm8 );
// Three columns per iteration, k loop unrolled by two.
6295 for( ; (j+3UL) <= jend; j+=3UL )
6307 SIMDType xmm2( (~C).load(i,j+1UL) );
6308 SIMDType xmm3( (~C).load(i,j+2UL) );
6312 for( ; (k+2UL) <= kend; k+=2UL ) {
6314 const SIMDType a2( A.load(i,k+1UL) );
6315 xmm1 -= a1 *
set( B(k ,j ) );
6316 xmm2 -= a1 *
set( B(k ,j+1UL) );
6317 xmm3 -= a1 *
set( B(k ,j+2UL) );
6318 xmm4 -= a2 *
set( B(k+1UL,j ) );
6319 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
6320 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
6323 for( ; k<kend; ++k ) {
6325 xmm1 -= a1 *
set( B(k,j ) );
6326 xmm2 -= a1 *
set( B(k,j+1UL) );
6327 xmm3 -= a1 *
set( B(k,j+2UL) );
6330 (~C).store( i, j , xmm1+xmm4 );
6331 (~C).store( i, j+1UL, xmm2+xmm5 );
6332 (~C).store( i, j+2UL, xmm3+xmm6 );
// Two columns per iteration, k loop unrolled by two.
6335 for( ; (j+2UL) <= jend; j+=2UL )
6347 SIMDType xmm2( (~C).load(i,j+1UL) );
6351 for( ; (k+2UL) <= kend; k+=2UL ) {
6353 const SIMDType a2( A.load(i,k+1UL) );
6354 xmm1 -= a1 *
set( B(k ,j ) );
6355 xmm2 -= a1 *
set( B(k ,j+1UL) );
6356 xmm3 -= a2 *
set( B(k+1UL,j ) );
6357 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
6360 for( ; k<kend; ++k ) {
6362 xmm1 -= a1 *
set( B(k,j ) );
6363 xmm2 -= a1 *
set( B(k,j+1UL) );
6366 (~C).store( i, j , xmm1+xmm3 );
6367 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single column; note that this k loop is bounded by K rather than by a kend variable.
6382 for( ; (k+2UL) <= K; k+=2UL ) {
6383 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
6384 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
6388 xmm1 -= A.load(i,k) *
set( B(k,j) );
6391 (~C).store( i, j, xmm1+xmm2 );
// Element-wise loop over the rows left after the SIMD tiers; entered only when remainder is
// set.
6395 for( ; remainder && i<M; ++i )
// Column range: ends at i+1 when LOW is set (otherwise N) and starts at i when UPP is set
// (otherwise 0).
6397 const size_t jend( LOW ? i+1UL : N );
6398 size_t j( UPP ? i : 0UL );
// Two columns per iteration using scalar accumulators value1/value2 (declared on lines that
// are not shown).
6400 for( ; (j+2UL) <= jend; j+=2UL )
6414 for(
size_t k=kbegin; k<kend; ++k ) {
6415 value1 -= A(i,k) * B(k,j );
6416 value2 -= A(i,k) * B(k,j+1UL);
6419 (~C)(i,j ) = value1;
6420 (~C)(i,j+1UL) = value2;
// Single column, scalar accumulator; this k loop runs up to K.
6433 for(
size_t k=kbegin; k<K; ++k ) {
6434 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel overload that simply forwards its three
// arguments to selectDefaultSubAssignKernel.
// NOTE(review): this listing is an excerpt; the remaining template parameters and the
// return type / enabling condition are on lines that are not shown.
6458 template<
typename MT3
6462 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6464 selectDefaultSubAssignKernel( C, A, B );
// Second selectLargeSubAssignKernel overload with the same (C, A, B) parameter list.
// NOTE(review): this listing is an excerpt; the enabling condition and the whole body are on
// lines that are not shown. Presumably this is the vectorized counterpart of the forwarding
// overload - TODO confirm.
6484 template<
typename MT3
6488 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS-path subtraction-assignment kernel overload that simply forwards its three arguments
// to selectLargeSubAssignKernel.
// NOTE(review): this listing is an excerpt; the remaining template parameters and the
// return type / enabling condition are on lines that are not shown.
6514 template<
typename MT3
6518 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6520 selectLargeSubAssignKernel( C, A, B );
// selectBlasSubAssignKernel overload that follows the preprocessor condition
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the branch conditions and the construction of tmp are not visible here.
6526 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6540 template<
typename MT3
6544 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two paths go through a temporary tmp and subtract it from C via subAssign (the conditions
// selecting them are not shown).
6551 subAssign( C, tmp );
6556 subAssign( C, tmp );
// Remaining path: gemm is called with the scalars ET(-1) and ET(1); presumably these are the
// alpha and beta factors, giving C = C - A*B - TODO confirm against the gemm wrapper.
6559 gemm( C, A, B, ET(-1), ET(1) );
// Fragment of a function template whose visible statement applies schurAssign to ~lhs and a
// value named tmp.
// NOTE(review): this listing is an excerpt; the function signature and the construction of
// tmp are on lines that are not shown.
6583 template<
typename MT
6597 schurAssign( ~lhs, tmp );
// Fragment of an assignment function template; only its two special-case checks are visible.
// NOTE(review): this listing is an excerpt; the function name, the branch bodies and the
// general case are on lines that are not shown.
6630 template<
typename MT
// Case 1: the target has zero rows or zero columns.
6640 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the left operand of the product has zero columns (empty inner dimension).
6643 else if( rhs.lhs_.columns() == 0UL ) {
// Fragment of an assignment function template: a ForwardFunctor object fwd is created and
// the expression rhs is materialized into a temporary of type TmpType.
// NOTE(review): this listing is an excerpt; the function name, the definition of TmpType and
// the use of fwd and tmp are on lines that are not shown.
6679 template<
typename MT
6698 const ForwardFunctor fwd;
6700 const TmpType tmp( rhs );
// Fragment of an assignment function template; only its guard condition is visible.
// NOTE(review): this listing is an excerpt; the function name and the rest of the body are
// on lines that are not shown.
6722 template<
typename MT
// Guard: the target has zero rows or columns, or the product's inner dimension
// (rhs.lhs_.columns()) is zero.
6732 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fragment of an assignment function template; only its guard condition is visible.
// NOTE(review): this listing is an excerpt; the function name and the rest of the body are
// on lines that are not shown.
6771 template<
typename MT
// Guard: the target has zero rows or columns, or the product's inner dimension
// (rhs.lhs_.columns()) is zero.
6781 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6817 template<
typename MT
// Header fragment of a class template that derives publicly from
// MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<...>, ST, true >, true > >.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the remaining template parameters, the class name line and the enum
// header are not visible here.
6877 template<
typename MT1
6885 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
// Structure flags derived from SF/HF/LF/UF:
//  - SYM : SF is set and none of HF, LF, UF is set
//  - HERM: HF is set and neither LF nor UF is set
//  - LOW : LF is set, or UF is set together with SF or HF
//  - UPP : UF is set, or LF is set together with SF or HF
6915 SYM = ( SF && !( HF || LF || UF ) ),
6916 HERM = ( HF && !( LF || UF ) ),
6917 LOW = ( LF || ( ( SF || HF ) && UF ) ),
6918 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time helper: value is true when either evaluateLeft or evaluateRight is set.
// The template parameters T1, T2 and T3 are not used by the visible expression.
6927 template<
typename T1,
typename T2,
typename T3 >
6928 struct IsEvaluationRequired {
6929 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time helper deciding whether a BLAS kernel is used for the types T1..T4.
// The visible part of the condition requires BLAZE_BLAS_MODE,
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION and SIMD support in T1, T2 and T3.
// NOTE(review): this listing is an excerpt; further terms of the condition are on lines that
// are not shown.
6937 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6938 struct UseBlasKernel {
6939 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
6945 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper deciding whether the vectorized default kernel is used for T1..T4.
// The visible part of the condition requires useOptimizedKernels and SIMD support in T1, T2
// and T3.
// NOTE(review): this listing is an excerpt; further terms of the condition are on lines that
// are not shown.
6959 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6960 struct UseVectorizedDefaultKernel {
6961 enum :
bool { value = useOptimizedKernels &&
6965 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Fragment of a compile-time condition that requires SIMD support in both MT1 and MT2
// (the start and the remaining terms of the expression are on lines that are not shown).
7019 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable: true only when neither operand requires evaluation (evaluateLeft and
// evaluateRight are both false) and both MT1 and MT2 are themselves SMP-assignable.
7025 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
7026 !evaluateRight && MT2::smpAssignable };
// Element access fragment: returns element (i,j) of the wrapped expression matrix_
// multiplied by scalar_ (the enclosing function signature is on a line that is not shown).
7056 return matrix_(i,j) * scalar_;
// Bounds-checked element access fragment: the row index i is checked against matrix_.rows()
// and the column index j against matrix_.columns() before delegating to (*this)(i,j).
// NOTE(review): this listing is an excerpt; the function signature and the bodies of the two
// checks are on lines that are not shown.
7069 if( i >= matrix_.rows() ) {
7072 if( j >= matrix_.columns() ) {
7075 return (*
this)(i,j);
// Returns the number of rows of the wrapped expression matrix_.
7084 inline size_t rows()
const {
7085 return matrix_.rows();
// Returns the number of columns of the wrapped expression matrix_.
7094 inline size_t columns()
const {
7095 return matrix_.columns();
// Aliasing query: forwards the address alias to matrix_.canAlias() and returns its result.
7125 template<
typename T >
7126 inline bool canAlias(
const T* alias )
const {
7127 return matrix_.canAlias( alias );
// Aliasing query: forwards the address alias to matrix_.isAliased() and returns its result.
7137 template<
typename T >
7138 inline bool isAliased(
const T* alias )
const {
7139 return matrix_.isAliased( alias );
// Alignment query fragment: returns the result of matrix_.isAligned() (the enclosing
// function signature is on a line that is not shown).
7149 return matrix_.isAligned();
// Fragment of a boolean expression built from two size tests on rows()*columns():
// one alternative of the first group holds when BLAS matrix multiplication is disabled or the
// element count is below TDMATDMATMULT_THRESHOLD, and the result additionally requires the
// element count to be at least SMP_TDMATDMATMULT_THRESHOLD.
// NOTE(review): this listing is an excerpt; the function signature and the start of the
// expression are on lines that are not shown.
7160 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7162 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7163 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// Fragment of an assignment function template for the scaled product: two special cases are
// checked before the work is handed to selectAssignKernel together with rhs.scalar_.
// NOTE(review): this listing is an excerpt; the function name, the definitions of left, A
// and B, and the branch bodies are on lines that are not shown.
7185 template<
typename MT
// Case 1: the target has zero rows or zero columns.
7197 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the left operand has zero columns (empty inner dimension).
7200 else if( left.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the scalar factor.
7215 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatch for the scaled assignment C = A * B * scalar.
// The visible part of the condition tests C.rows()*C.columns() against
// TDMATDMATMULT_THRESHOLD; when the condition holds the small kernel is called, and the BLAS
// kernel call presumably forms the else branch (the `else` is on a line that is not shown) -
// TODO confirm.
// NOTE(review): this listing is an excerpt; the remaining template parameters and the start
// of the condition are on lines that are not shown.
7230 template<
typename MT3
7234 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7239 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7240 selectSmallAssignKernel( C, A, B, scalar );
7242 selectBlasAssignKernel( C, A, B, scalar );
// Default scaled assignment kernel fragment, processed row by row: each row of C is
// initialized from the first k term, accumulated over the remaining k, multiplied by scalar,
// and finally the part below the diagonal is filled from the part above it.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the function name, the kbegin/kend/jbegin/jend declarations, several
// conditions and loop bodies, and the closing braces are not visible here.
7260 template<
typename MT3
7267 const size_t M( A.rows() );
7268 const size_t N( B.columns() );
7269 const size_t K( A.columns() );
7273 for(
size_t i=0UL; i<M; ++i )
7284 for(
size_t j=0UL; j<N; ++j ) {
// Column range for the first term; the alternatives depend on UPP/LOW and on conditions
// that are on lines not shown.
7293 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
7294 :( UPP ?
max(i,kbegin) : kbegin ) )
7295 :( UPP ? i : 0UL ) );
7298 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
7299 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
7300 :( LOW ? i+1UL : N ) );
// Columns before jbegin (loop body not shown).
7303 for(
size_t j=0UL; j<jbegin; ++j ) {
7308 reset( (~C)(i,0UL) );
// First term (k = kbegin): plain assignment initializes C(i,j) on [jbegin,jend).
7310 for(
size_t j=jbegin; j<jend; ++j ) {
7311 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend on (loop body not shown).
7314 for(
size_t j=jend; j<N; ++j ) {
7319 reset( (~C)(i,N-1UL) );
// Remaining terms: accumulate A(i,k) * B(k,j) for k in (kbegin, kend).
7323 for(
size_t k=kbegin+1UL; k<kend; ++k )
7327 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
7328 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
7329 :( SYM || HERM || UPP ? i : 0UL ) );
7332 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
7333 :( LOW ?
min(i+1UL,k) : k ) )
7334 :( LOW ? i+1UL : N ) );
// With any structure flag set the column range can be empty; such k are skipped.
7336 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7339 for(
size_t j=jbegin; j<jend; ++j ) {
7340 (~C)(i,j) += A(i,k) * B(k,j);
// Assignment (not accumulation) at column jend; presumably guarded by a condition on a line
// that is not shown - TODO confirm.
7343 (~C)(i,jend) = A(i,k) * B(k,jend);
// Scaling pass: the computed column range of row i is multiplied by scalar.
7350 :( SYM || HERM || UPP ? i : 0UL ) );
7353 :( LOW ? i+1UL : N ) );
7355 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7358 for(
size_t j=jbegin; j<jend; ++j ) {
7359 (~C)(i,j) *= scalar;
// Fill the strictly lower part from the upper part: C(i,j) for j<i is copied from C(j,i),
// conjugated when HERM is set.
7365 for(
size_t i=1UL; i<M; ++i ) {
7366 for(
size_t j=0UL; j<i; ++j ) {
7367 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Default scaled assignment kernel fragment, processed column by column: each column of C is
// initialized from the first k term, accumulated over the remaining k, multiplied by scalar,
// and finally the part above the diagonal is filled from the part below it.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the function name, the kbegin/kend/ibegin/iend declarations, several
// conditions and loop bodies, and the closing braces are not visible here.
7388 template<
typename MT3
7395 const size_t M( A.rows() );
7396 const size_t N( B.columns() );
7397 const size_t K( A.columns() );
7401 for(
size_t j=0UL; j<N; ++j )
7412 for(
size_t i=0UL; i<M; ++i ) {
// Row range for the first term; the alternatives depend on LOW/UPP and on conditions that
// are on lines not shown.
7421 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
7422 :( LOW ?
max(j,kbegin) : kbegin ) )
7423 :( LOW ? j : 0UL ) );
7426 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
7427 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
7428 :( UPP ? j+1UL : M ) );
// Rows before ibegin (loop body not shown).
7431 for(
size_t i=0UL; i<ibegin; ++i ) {
7436 reset( (~C)(0UL,j) );
// First term (k = kbegin): plain assignment initializes C(i,j) on [ibegin,iend).
7438 for(
size_t i=ibegin; i<iend; ++i ) {
7439 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on (loop body not shown).
7442 for(
size_t i=iend; i<M; ++i ) {
7447 reset( (~C)(M-1UL,j) );
// Remaining terms: accumulate A(i,k) * B(k,j) for k in (kbegin, kend).
7451 for(
size_t k=kbegin+1UL; k<kend; ++k )
7455 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
7456 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
7457 :( SYM || HERM || LOW ? j : 0UL ) );
7460 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
7461 :( UPP ?
min(j+1UL,k) : k ) )
7462 :( UPP ? j+1UL : M ) );
// With any structure flag set the row range can be empty; such k are skipped.
7464 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7467 for(
size_t i=ibegin; i<iend; ++i ) {
7468 (~C)(i,j) += A(i,k) * B(k,j);
// Assignment (not accumulation) at row iend; presumably guarded by a condition on a line
// that is not shown - TODO confirm.
7471 (~C)(iend,j) = A(iend,k) * B(k,j);
// Scaling pass: the computed row range of column j is multiplied by scalar.
7478 :( SYM || HERM || LOW ? j : 0UL ) );
7481 :( UPP ? j+1UL : M ) );
7483 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7486 for(
size_t i=ibegin; i<iend; ++i ) {
7487 (~C)(i,j) *= scalar;
// Fill the strictly upper part from the lower part: C(i,j) for i<j is copied from C(j,i),
// conjugated when HERM is set.
7493 for(
size_t j=1UL; j<N; ++j ) {
7494 for(
size_t i=0UL; i<j; ++i ) {
7495 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Blocked scaled assignment kernel fragment for the case selected by the EnableIf_ condition
// below (MT4 is not diagonal, MT5 is diagonal): C(i,j) = A(i,j) * B(j,j) * scalar, traversed
// in tiles of block x block elements (block is initialized from BLOCK_SIZE).
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the function name, the jbegin/jpos setup and the bodies of the first and
// last inner loops are not visible here.
7516 template<
typename MT3
7520 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7523 constexpr
size_t block( BLOCK_SIZE );
7525 const size_t M( A.rows() );
7526 const size_t N( B.columns() );
// Outer tiling over rows; iend is clamped to M for the last (possibly partial) tile.
7528 for(
size_t ii=0UL; ii<M; ii+=block ) {
7529 const size_t iend(
min( M, ii+block ) );
// Inner tiling over columns; jend is clamped to N for the last (possibly partial) tile.
7530 for(
size_t jj=0UL; jj<N; jj+=block ) {
7531 const size_t jend(
min( N, jj+block ) );
7532 for(
size_t i=ii; i<iend; ++i )
// Each tile row is split into [jj,jbegin), [jbegin,jpos) and [jpos,jend); only the middle
// range receives the product (the bodies of the outer two loops are not shown).
7542 for(
size_t j=jj; j<jbegin; ++j ) {
7546 for(
size_t j=jbegin; j<jpos; ++j ) {
7547 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
7550 for(
size_t j=jpos; j<jend; ++j ) {
// Scaled assignment kernel fragment for the case selected by the EnableIf_ condition below
// (MT4 is not diagonal, MT5 is diagonal), processed column by column:
// C(i,j) = A(i,j) * B(j,j) * scalar.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the function name, the ibegin/iend setup and the bodies of the first and
// last inner loops are not visible here.
7574 template<
typename MT3
7578 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7581 const size_t M( A.rows() );
7582 const size_t N( B.columns() );
7584 for(
size_t j=0UL; j<N; ++j )
// Each column is split into [0,ibegin), [ibegin,iend) and [iend,M); only the middle range
// receives the product (the bodies of the outer two loops are not shown).
7595 for(
size_t i=0UL; i<ibegin; ++i ) {
7599 for(
size_t i=ibegin; i<iend; ++i ) {
7600 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
7603 for(
size_t i=iend; i<M; ++i ) {
// Scaled assignment kernel fragment in which only the diagonal entry A(i,i) of the left
// operand is read, processed row by row: C(i,j) = A(i,i) * B(i,j) * scalar.
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the function name, enabling condition, jbegin/jend setup and the bodies
// of the first and last inner loops are not visible here.
7625 template<
typename MT3
7632 const size_t M( A.rows() );
7633 const size_t N( B.columns() );
7635 for(
size_t i=0UL; i<M; ++i )
// Each row is split into [0,jbegin), [jbegin,jend) and [jend,N); only the middle range
// receives the product (the bodies of the outer two loops are not shown).
7646 for(
size_t j=0UL; j<jbegin; ++j ) {
7650 for(
size_t j=jbegin; j<jend; ++j ) {
7651 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
7654 for(
size_t j=jend; j<N; ++j ) {
// Blocked scaled assignment kernel fragment in which only the diagonal entry A(i,i) of the
// left operand is read: C(i,j) = A(i,i) * B(i,j) * scalar, traversed in tiles of
// block x block elements (block is initialized from BLOCK_SIZE).
// NOTE(review): this listing is an excerpt; the leading numbers are line numbers of the
// original file, and the function name, enabling condition, ibegin/ipos setup and the bodies
// of the first and last inner loops are not visible here.
7676 template<
typename MT3
7683 constexpr
size_t block( BLOCK_SIZE );
7685 const size_t M( A.rows() );
7686 const size_t N( B.columns() );
// Outer tiling over columns; jend is clamped to N for the last (possibly partial) tile.
7688 for(
size_t jj=0UL; jj<N; jj+=block ) {
7689 const size_t jend(
min( N, jj+block ) );
// Inner tiling over rows; iend is clamped to M for the last (possibly partial) tile.
7690 for(
size_t ii=0UL; ii<M; ii+=block ) {
7691 const size_t iend(
min( M, ii+block ) );
7692 for(
size_t j=jj; j<jend; ++j )
// Each tile column is split into [ii,ibegin), [ibegin,ipos) and [ipos,iend); only the middle
// range receives the product (the bodies of the outer two loops are not shown).
7702 for(
size_t i=ii; i<ibegin; ++i ) {
7706 for(
size_t i=ibegin; i<ipos; ++i ) {
7707 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
7710 for(
size_t i=ipos; i<iend; ++i ) {
// Scaled assignment kernel overload whose visible loop writes only the diagonal of C:
// for every row index i of A, C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): this listing is an excerpt; the remaining template parameters, the return
// type / enabling condition and any statements before the loop are on lines that are not
// shown. Presumably this is the overload for two diagonal operands - TODO confirm.
7734 template<
typename MT3
7739 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7743 for(
size_t i=0UL; i<A.rows(); ++i ) {
7744 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel that performs no work of its own and simply
// forwards all arguments to selectDefaultAssignKernel.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor
// NOTE(review): the enable-if condition is not visible in this excerpt;
// presumably this is the non-vectorized fallback overload - confirm.
7763 template<
typename MT3
7768 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7770 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel for C = A * B * scalar that
// vectorizes along the column index j: elements of A are broadcast with set(),
// SIMDSIZE-wide packs of B are read with B.load(k,j+...), and results go to
// (~C).store(i,j+...). Column panels are processed in tiers of decreasing width
// (8, 5, 4, 3, 2, 1 times SIMDSIZE), followed by a scalar remainder.
// NOTE(review): the function name, parameter list and enable-if condition are
// not visible in this excerpt; presumably a selectSmallAssignKernel overload.
7789 template<
typename MT3
// M x N is the result size, K the shared inner dimension.
7798 const size_t M( A.rows() );
7799 const size_t N( B.columns() );
7800 const size_t K( A.columns() );
// End of the columns handled by the SIMD tiers: N rounded down to a multiple of
// SIMDSIZE when `remainder` is set, otherwise N itself.
7804 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// The scalar broadcast once; it is multiplied into every stored pack.
7807 const SIMDType factor(
set( scalar ) );
// Special case for results flagged both LOW and UPP with more than three SIMD
// packs of columns (body not visible; presumably the target is reset up front).
7809 if( LOW && UPP && N > SIMDSIZE*3UL ) {
// Tier 1: panels of 8*SIMDSIZE columns, one row at a time. Only taken when the
// result is neither SYM, HERM, LOW nor UPP.
7818 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7819 for(
size_t i=0UL; i<M; ++i )
// One accumulator per SIMD pack of the panel.
// NOTE(review): relies on SIMDType default construction yielding zero - confirm.
7832 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate over k in [kbegin,kend) (bounds defined on lines not visible here).
7834 for(
size_t k=kbegin; k<kend; ++k ) {
7835 const SIMDType a1(
set( A(i,k) ) );
7836 xmm1 += a1 * B.load(k,j );
7837 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7838 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7839 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7840 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7841 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7842 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7843 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
// Scale and write the eight packs of row i.
7846 (~C).store( i, j , xmm1 * factor );
7847 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7848 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7849 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7850 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7851 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
7852 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
7853 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
// Tier 2: panels of 5*SIMDSIZE columns, same structural restriction as tier 1.
7858 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
// Two rows per iteration: xmm1..xmm5 belong to row i, xmm6..xmm10 to row i+1.
7862 for( ; (i+2UL) <= M; i+=2UL )
7875 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7877 for(
size_t k=kbegin; k<kend; ++k ) {
7878 const SIMDType a1(
set( A(i ,k) ) );
7879 const SIMDType a2(
set( A(i+1UL,k) ) );
// Packs of B shared by both rows (the b1 load and the accumulation statements
// are on lines not visible in this excerpt).
7881 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7882 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7883 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7884 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7897 (~C).store( i , j , xmm1 * factor );
7898 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7899 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7900 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7901 (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
7902 (~C).store( i+1UL, j , xmm6 * factor );
7903 (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
7904 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
7905 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
7906 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
// Single leftover row of the 5*SIMDSIZE panel (its guarding condition is on a
// line not visible here).
7918 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7920 for(
size_t k=kbegin; k<kend; ++k ) {
7921 const SIMDType a1(
set( A(i,k) ) );
7922 xmm1 += a1 * B.load(k,j );
7923 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7924 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7925 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7926 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7929 (~C).store( i, j , xmm1 * factor );
7930 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7931 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7932 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7933 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
// Tier 3 of the j-vectorized kernel: panels of 4*SIMDSIZE columns. Skipped when
// the result is flagged both LOW and UPP.
7937 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Row range of the panel: for SYM/HERM/UPP results rows stop at the end of the
// panel (clamped to M); for LOW results rows start at j instead of 0.
7939 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
7940 size_t i( LOW ? j : 0UL );
// Two rows per iteration: xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
7942 for( ; (i+2UL) <= iend; i+=2UL )
7955 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7957 for(
size_t k=kbegin; k<kend; ++k ) {
7958 const SIMDType a1(
set( A(i ,k) ) );
7959 const SIMDType a2(
set( A(i+1UL,k) ) );
// Packs of B shared by both rows (the b1 load and the accumulation statements
// are on lines not visible in this excerpt).
7961 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7962 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7963 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7974 (~C).store( i , j , xmm1 * factor );
7975 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7976 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7977 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7978 (~C).store( i+1UL, j , xmm5 * factor );
7979 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
7980 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
7981 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
// Single leftover row of the 4*SIMDSIZE panel (guard and accumulator
// declaration are on lines not visible here).
7995 for(
size_t k=kbegin; k<kend; ++k ) {
7996 const SIMDType a1(
set( A(i,k) ) );
7997 xmm1 += a1 * B.load(k,j );
7998 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7999 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8000 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8003 (~C).store( i, j , xmm1 * factor );
8004 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8005 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8006 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
// Tier 4: panels of 3*SIMDSIZE columns, same row-range rules as the tier above.
8010 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8012 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
8013 size_t i( LOW ? j : 0UL );
// Two rows per iteration: xmm1..xmm3 belong to row i, xmm4..xmm6 to row i+1.
8015 for( ; (i+2UL) <= iend; i+=2UL )
8028 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8030 for(
size_t k=kbegin; k<kend; ++k ) {
8031 const SIMDType a1(
set( A(i ,k) ) );
8032 const SIMDType a2(
set( A(i+1UL,k) ) );
8034 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8035 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8044 (~C).store( i , j , xmm1 * factor );
8045 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
8046 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8047 (~C).store( i+1UL, j , xmm4 * factor );
8048 (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8049 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
// Single leftover row of the 3*SIMDSIZE panel.
8063 for(
size_t k=kbegin; k<kend; ++k ) {
8064 const SIMDType a1(
set( A(i,k) ) );
8065 xmm1 += a1 * B.load(k,j );
8066 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8067 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8070 (~C).store( i, j , xmm1 * factor );
8071 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8072 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
// Tier 5 of the j-vectorized kernel: panels of 2*SIMDSIZE columns. Rows are
// consumed four, three, two and finally one at a time.
8076 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Row range of the panel: for SYM/HERM/UPP results rows stop at the end of the
// panel (clamped to M); for LOW results rows start at j instead of 0.
8078 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
8079 size_t i( LOW ? j : 0UL );
// Four rows per iteration: two accumulators (one per SIMD pack) for each row.
8081 for( ; (i+4UL) <= iend; i+=4UL )
8094 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8096 for(
size_t k=kbegin; k<kend; ++k ) {
8097 const SIMDType a1(
set( A(i ,k) ) );
8098 const SIMDType a2(
set( A(i+1UL,k) ) );
8099 const SIMDType a3(
set( A(i+2UL,k) ) );
8100 const SIMDType a4(
set( A(i+3UL,k) ) );
// Second pack of B (the b1 load and the accumulation statements are on lines
// not visible in this excerpt).
8102 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8113 (~C).store( i , j , xmm1 * factor );
8114 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8115 (~C).store( i+1UL, j , xmm3 * factor );
8116 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8117 (~C).store( i+2UL, j , xmm5 * factor );
8118 (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8119 (~C).store( i+3UL, j , xmm7 * factor );
8120 (~C).store( i+3UL, j+SIMDSIZE, xmm8 * factor );
// Three rows per iteration.
8123 for( ; (i+3UL) <= iend; i+=3UL )
8136 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8138 for(
size_t k=kbegin; k<kend; ++k ) {
8139 const SIMDType a1(
set( A(i ,k) ) );
8140 const SIMDType a2(
set( A(i+1UL,k) ) );
8141 const SIMDType a3(
set( A(i+2UL,k) ) );
8143 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8152 (~C).store( i , j , xmm1 * factor );
8153 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8154 (~C).store( i+1UL, j , xmm3 * factor );
8155 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8156 (~C).store( i+2UL, j , xmm5 * factor );
8157 (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
// Two rows per iteration, with the k loop unrolled two-fold: the first four
// accumulators and the last four are summed pairwise at the store
// (xmm1+xmm5, xmm2+xmm6, ...).
8160 for( ; (i+2UL) <= iend; i+=2UL )
8173 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Unrolled main loop over k; runs while at least two k values remain.
8176 for( ; (k+2UL) <= kend; k+=2UL ) {
8177 const SIMDType a1(
set( A(i ,k ) ) );
8178 const SIMDType a2(
set( A(i+1UL,k ) ) );
8179 const SIMDType a3(
set( A(i ,k+1UL) ) );
8180 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
8181 const SIMDType b1( B.load(k ,j ) );
8182 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8183 const SIMDType b3( B.load(k+1UL,j ) );
8184 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Remainder loop for a trailing k value.
8195 for( ; k<kend; ++k ) {
8196 const SIMDType a1(
set( A(i ,k) ) );
8197 const SIMDType a2(
set( A(i+1UL,k) ) );
8199 const SIMDType b2( B.load(k,j+SIMDSIZE) );
// Combine the two partial sums per element, scale, and store.
8206 (~C).store( i , j , (xmm1+xmm5) * factor );
8207 (~C).store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
8208 (~C).store( i+1UL, j , (xmm3+xmm7) * factor );
8209 (~C).store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
// Single leftover row, again with a two-fold unrolled k loop followed by a
// remainder loop; xmm1/xmm2 take k, xmm3/xmm4 take k+1.
8224 for( ; (k+2UL) <= kend; k+=2UL ) {
8225 const SIMDType a1(
set( A(i,k ) ) );
8226 const SIMDType a2(
set( A(i,k+1UL) ) );
8227 xmm1 += a1 * B.load(k ,j );
8228 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8229 xmm3 += a2 * B.load(k+1UL,j );
8230 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8233 for( ; k<kend; ++k ) {
8234 const SIMDType a1(
set( A(i,k) ) );
8235 xmm1 += a1 * B.load(k,j );
8236 xmm2 += a1 * B.load(k,j+SIMDSIZE);
8239 (~C).store( i, j , (xmm1+xmm3) * factor );
8240 (~C).store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
// Tier 6 of the j-vectorized kernel: panels of a single SIMD pack of columns.
// Rows are consumed four, three, two and finally one at a time; every path
// unrolls the k loop two-fold and finishes with a scalar-step remainder loop.
8244 for( ; j<jpos; j+=SIMDSIZE )
// Row range of the panel: for SYM/HERM/UPP results rows stop at the end of the
// panel (clamped to M); for LOW results rows start at j instead of 0.
8246 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
8247 size_t i( LOW ? j : 0UL );
// Four rows per iteration: xmm1..xmm4 accumulate the products for k,
// xmm5..xmm8 those for k+1.
8249 for( ; (i+4UL) <= iend; i+=4UL )
8260 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8263 for( ; (k+2UL) <= kend; k+=2UL ) {
// b2 is the pack of B for k+1 (the b1 load for k is on a line not visible here).
8265 const SIMDType b2( B.load(k+1UL,j) );
8266 xmm1 +=
set( A(i ,k ) ) * b1;
8267 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8268 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8269 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8270 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8271 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8272 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8273 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
// Remainder loop for a trailing k value.
8276 for( ; k<kend; ++k ) {
8278 xmm1 +=
set( A(i ,k) ) * b1;
8279 xmm2 +=
set( A(i+1UL,k) ) * b1;
8280 xmm3 +=
set( A(i+2UL,k) ) * b1;
8281 xmm4 +=
set( A(i+3UL,k) ) * b1;
// Combine the two partial sums per row, scale, and store.
8284 (~C).store( i , j, (xmm1+xmm5) * factor );
8285 (~C).store( i+1UL, j, (xmm2+xmm6) * factor );
8286 (~C).store( i+2UL, j, (xmm3+xmm7) * factor );
8287 (~C).store( i+3UL, j, (xmm4+xmm8) * factor );
// Three rows per iteration: xmm1..xmm3 for k, xmm4..xmm6 for k+1.
8290 for( ; (i+3UL) <= iend; i+=3UL )
8301 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8304 for( ; (k+2UL) <= kend; k+=2UL ) {
8306 const SIMDType b2( B.load(k+1UL,j) );
8307 xmm1 +=
set( A(i ,k ) ) * b1;
8308 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8309 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8310 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8311 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8312 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
8315 for( ; k<kend; ++k ) {
8317 xmm1 +=
set( A(i ,k) ) * b1;
8318 xmm2 +=
set( A(i+1UL,k) ) * b1;
8319 xmm3 +=
set( A(i+2UL,k) ) * b1;
8322 (~C).store( i , j, (xmm1+xmm4) * factor );
8323 (~C).store( i+1UL, j, (xmm2+xmm5) * factor );
8324 (~C).store( i+2UL, j, (xmm3+xmm6) * factor );
// Two rows per iteration: xmm1/xmm2 for k, xmm3/xmm4 for k+1.
8327 for( ; (i+2UL) <= iend; i+=2UL )
8341 for( ; (k+2UL) <= kend; k+=2UL ) {
8343 const SIMDType b2( B.load(k+1UL,j) );
8344 xmm1 +=
set( A(i ,k ) ) * b1;
8345 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8346 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8347 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
8350 for( ; k<kend; ++k ) {
8352 xmm1 +=
set( A(i ,k) ) * b1;
8353 xmm2 +=
set( A(i+1UL,k) ) * b1;
8356 (~C).store( i , j, (xmm1+xmm3) * factor );
8357 (~C).store( i+1UL, j, (xmm2+xmm4) * factor );
// Single leftover row. Note that this path bounds k by K (the full inner
// dimension) rather than by a kend value.
8371 for( ; (k+2UL) <= K; k+=2UL ) {
8372 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8373 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
// Trailing k value (its loop header is on a line not visible in this excerpt).
8377 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8380 (~C).store( i, j, (xmm1+xmm2) * factor );
// Scalar remainder of the j-vectorized kernel: columns past the SIMD tiers are
// computed element-wise, but only when `remainder` is set.
8384 for( ; remainder && j<N; ++j )
// Rows start at j only for results flagged both LOW and UPP, otherwise at 0.
8386 size_t i( LOW && UPP ? j : 0UL );
// Two rows per iteration, each with its own scalar accumulator.
8388 for( ; (i+2UL) <= M; i+=2UL )
8402 for(
size_t k=kbegin; k<kend; ++k ) {
8403 value1 += A(i ,k) * B(k,j);
8404 value2 += A(i+1UL,k) * B(k,j);
8407 (~C)(i ,j) = value1 * scalar;
8408 (~C)(i+1UL,j) = value2 * scalar;
// Single leftover row; this path bounds k by K rather than by kend.
8421 for(
size_t k=kbegin; k<K; ++k ) {
8422 value += A(i,k) * B(k,j);
8425 (~C)(i,j) = value * scalar;
// Post-processing of the part the SIMD tiers skipped for structured results.
// SYM/HERM: fill C(i,j) for rows i >= 4*SIMDSIZE and columns below the largest
// multiple of 4*SIMDSIZE not exceeding i by mirroring (conjugating for HERM)
// the element C(j,i).
8430 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
8431 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8432 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8433 for(
size_t j=0UL; j<jend; ++j ) {
8434 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW only: visit rows above the block boundary for every column j >= 4*SIMDSIZE
// (loop body not visible in this excerpt; presumably resets those elements).
8438 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
8439 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
8440 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
8441 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: visit columns left of the block boundary for every row
// i >= 4*SIMDSIZE (loop body not visible; presumably resets those elements).
8446 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
8447 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8448 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8449 for(
size_t j=0UL; j<jend; ++j ) {
// Vectorized small-matrix assignment kernel for C = A * B * scalar that
// vectorizes along the row index i: SIMDSIZE-wide packs of A are read with
// A.load(i+...,k), elements of B are broadcast with set(), and results go to
// (~C).store(i+...,j). Row panels are processed in tiers of decreasing height
// (8, 5, 4, 3, 2, 1 times SIMDSIZE), followed by a scalar remainder.
// NOTE(review): the function name, parameter list and enable-if condition are
// not visible in this excerpt; presumably a selectSmallAssignKernel overload.
8472 template<
typename MT3
// M x N is the result size, K the shared inner dimension.
8481 const size_t M( A.rows() );
8482 const size_t N( B.columns() );
8483 const size_t K( A.columns() );
// End of the rows handled by the SIMD tiers: M rounded down to a multiple of
// SIMDSIZE when `remainder` is set, otherwise M itself.
8487 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// The scalar broadcast once; it is multiplied into every stored pack.
8490 const SIMDType factor(
set( scalar ) );
// Special case for results flagged both LOW and UPP with more than three SIMD
// packs of rows (body not visible; presumably the target is reset up front).
8492 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Tier 1: panels of 8*SIMDSIZE rows, one column at a time. Only taken when the
// result is neither SYM, HERM, LOW nor UPP.
8501 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8502 for(
size_t j=0UL; j<N; ++j )
// One accumulator per SIMD pack of the panel.
// NOTE(review): relies on SIMDType default construction yielding zero - confirm.
8515 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate over k in [kbegin,kend) (bounds defined on lines not visible here).
8517 for(
size_t k=kbegin; k<kend; ++k ) {
8518 const SIMDType b1(
set( B(k,j) ) );
8519 xmm1 += A.load(i ,k) * b1;
8520 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8521 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8522 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8523 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8524 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8525 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8526 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Scale and write the eight packs of column j.
8529 (~C).store( i , j, xmm1 * factor );
8530 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8531 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8532 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8533 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8534 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
8535 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
8536 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
// Tier 2: panels of 5*SIMDSIZE rows, same structural restriction as tier 1.
8541 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Two columns per iteration: xmm1..xmm5 belong to column j, xmm6..xmm10 to j+1.
8545 for( ; (j+2UL) <= N; j+=2UL )
8558 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8560 for(
size_t k=kbegin; k<kend; ++k ) {
// Packs of A shared by both columns (the a1 load and the accumulation
// statements are on lines not visible in this excerpt).
8562 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8563 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8564 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8565 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8566 const SIMDType b1(
set( B(k,j ) ) );
8567 const SIMDType b2(
set( B(k,j+1UL) ) );
8580 (~C).store( i , j , xmm1 * factor );
8581 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8582 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8583 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8584 (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
8585 (~C).store( i , j+1UL, xmm6 * factor );
8586 (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
8587 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8588 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8589 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
// Single leftover column of the 5*SIMDSIZE panel (its guarding condition is on
// a line not visible here).
8601 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8603 for(
size_t k=kbegin; k<kend; ++k ) {
8604 const SIMDType b1(
set( B(k,j) ) );
8605 xmm1 += A.load(i ,k) * b1;
8606 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8607 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8608 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8609 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8612 (~C).store( i , j, xmm1 * factor );
8613 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8614 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8615 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8616 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
// Tier 3 of the i-vectorized kernel: panels of 4*SIMDSIZE rows. Skipped when
// the result is flagged both LOW and UPP.
8620 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Column range of the panel: for SYM/HERM/LOW results columns stop at the end
// of the panel (clamped to N); for UPP results columns start at i instead of 0.
8622 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
8623 size_t j( UPP ? i : 0UL );
// Two columns per iteration: xmm1..xmm4 belong to column j, xmm5..xmm8 to j+1.
8625 for( ; (j+2UL) <= jend; j+=2UL )
8638 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8640 for(
size_t k=kbegin; k<kend; ++k ) {
// Packs of A shared by both columns (the a1 load and the accumulation
// statements are on lines not visible in this excerpt).
8642 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8643 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8644 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8645 const SIMDType b1(
set( B(k,j ) ) );
8646 const SIMDType b2(
set( B(k,j+1UL) ) );
8657 (~C).store( i , j , xmm1 * factor );
8658 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8659 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8660 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8661 (~C).store( i , j+1UL, xmm5 * factor );
8662 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
8663 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8664 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
// Single leftover column of the 4*SIMDSIZE panel (guard and accumulator
// declaration are on lines not visible here).
8678 for(
size_t k=kbegin; k<kend; ++k ) {
8679 const SIMDType b1(
set( B(k,j) ) );
8680 xmm1 += A.load(i ,k) * b1;
8681 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8682 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8683 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8686 (~C).store( i , j, xmm1 * factor );
8687 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8688 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8689 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
// Tier 4: panels of 3*SIMDSIZE rows, same column-range rules as the tier above.
8693 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8695 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
8696 size_t j( UPP ? i : 0UL );
// Two columns per iteration: xmm1..xmm3 belong to column j, xmm4..xmm6 to j+1.
8698 for( ; (j+2UL) <= jend; j+=2UL )
8711 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8713 for(
size_t k=kbegin; k<kend; ++k ) {
8715 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8716 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8717 const SIMDType b1(
set( B(k,j ) ) );
8718 const SIMDType b2(
set( B(k,j+1UL) ) );
8727 (~C).store( i , j , xmm1 * factor );
8728 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8729 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8730 (~C).store( i , j+1UL, xmm4 * factor );
8731 (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
8732 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
// Single leftover column of the 3*SIMDSIZE panel.
8746 for(
size_t k=kbegin; k<kend; ++k ) {
8747 const SIMDType b1(
set( B(k,j) ) );
8748 xmm1 += A.load(i ,k) * b1;
8749 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8750 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8753 (~C).store( i , j, xmm1 * factor );
8754 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8755 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
// Tier 5 of the i-vectorized kernel: panels of 2*SIMDSIZE rows. Columns are
// consumed four, three, two and finally one at a time.
8759 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range of the panel: for SYM/HERM/LOW results columns stop at the end
// of the panel (clamped to N); for UPP results columns start at i instead of 0.
8761 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
8762 size_t j( UPP ? i : 0UL );
// Four columns per iteration: two accumulators (one per SIMD pack) per column.
8764 for( ; (j+4UL) <= jend; j+=4UL )
8777 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8779 for(
size_t k=kbegin; k<kend; ++k ) {
// Second pack of A (the a1 load and the accumulation statements are on lines
// not visible in this excerpt).
8781 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8782 const SIMDType b1(
set( B(k,j ) ) );
8783 const SIMDType b2(
set( B(k,j+1UL) ) );
8784 const SIMDType b3(
set( B(k,j+2UL) ) );
8785 const SIMDType b4(
set( B(k,j+3UL) ) );
8796 (~C).store( i , j , xmm1 * factor );
8797 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8798 (~C).store( i , j+1UL, xmm3 * factor );
8799 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8800 (~C).store( i , j+2UL, xmm5 * factor );
8801 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8802 (~C).store( i , j+3UL, xmm7 * factor );
8803 (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
// Three columns per iteration.
8806 for( ; (j+3UL) <= jend; j+=3UL )
8819 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8821 for(
size_t k=kbegin; k<kend; ++k ) {
8823 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8824 const SIMDType b1(
set( B(k,j ) ) );
8825 const SIMDType b2(
set( B(k,j+1UL) ) );
8826 const SIMDType b3(
set( B(k,j+2UL) ) );
8835 (~C).store( i , j , xmm1 * factor );
8836 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8837 (~C).store( i , j+1UL, xmm3 * factor );
8838 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8839 (~C).store( i , j+2UL, xmm5 * factor );
8840 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
// Two columns per iteration, with the k loop unrolled two-fold: the first four
// accumulators and the last four are summed pairwise at the store
// (xmm1+xmm5, xmm2+xmm6, ...).
8843 for( ; (j+2UL) <= jend; j+=2UL )
8856 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Unrolled main loop over k; runs while at least two k values remain.
8859 for( ; (k+2UL) <= kend; k+=2UL ) {
8860 const SIMDType a1( A.load(i ,k ) );
8861 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8862 const SIMDType a3( A.load(i ,k+1UL) );
8863 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8864 const SIMDType b1(
set( B(k ,j ) ) );
8865 const SIMDType b2(
set( B(k ,j+1UL) ) );
8866 const SIMDType b3(
set( B(k+1UL,j ) ) );
8867 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remainder loop for a trailing k value.
8878 for( ; k<kend; ++k ) {
8880 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8881 const SIMDType b1(
set( B(k,j ) ) );
8882 const SIMDType b2(
set( B(k,j+1UL) ) );
// Combine the two partial sums per element, scale, and store.
8889 (~C).store( i , j , (xmm1+xmm5) * factor );
8890 (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
8891 (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
8892 (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
// Single leftover column, again with a two-fold unrolled k loop followed by a
// remainder loop; xmm1/xmm2 take k, xmm3/xmm4 take k+1.
8907 for( ; (k+2UL) <= kend; k+=2UL ) {
8908 const SIMDType b1(
set( B(k ,j) ) );
8909 const SIMDType b2(
set( B(k+1UL,j) ) );
8910 xmm1 += A.load(i ,k ) * b1;
8911 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8912 xmm3 += A.load(i ,k+1UL) * b2;
8913 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8916 for( ; k<kend; ++k ) {
8917 const SIMDType b1(
set( B(k,j) ) );
8918 xmm1 += A.load(i ,k) * b1;
8919 xmm2 += A.load(i+SIMDSIZE,k) * b1;
8922 (~C).store( i , j, (xmm1+xmm3) * factor );
8923 (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
// Tier 6 of the i-vectorized kernel: panels of a single SIMD pack of rows.
// Columns are consumed four, three, two and finally one at a time.
8927 for( ; i<ipos; i+=SIMDSIZE )
// Column range of the panel: for SYM/HERM/LOW results columns stop at the end
// of the panel (clamped to N); for UPP results columns start at i instead of 0.
8929 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
8930 size_t j( UPP ? i : 0UL );
// Four columns per iteration: xmm1..xmm4 accumulate the products for k,
// xmm5..xmm8 those for k+1.
8932 for( ; (j+4UL) <= jend; j+=4UL )
8943 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Two-fold unrolled main loop over k.
8946 for( ; (k+2UL) <= kend; k+=2UL ) {
// a2 is the pack of A for k+1 (the a1 load for k is on a line not visible here).
8948 const SIMDType a2( A.load(i,k+1UL) );
8949 xmm1 += a1 *
set( B(k ,j ) );
8950 xmm2 += a1 *
set( B(k ,j+1UL) );
8951 xmm3 += a1 *
set( B(k ,j+2UL) );
8952 xmm4 += a1 *
set( B(k ,j+3UL) );
8953 xmm5 += a2 *
set( B(k+1UL,j ) );
8954 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8955 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8956 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Remainder loop for a trailing k value.
8959 for( ; k<kend; ++k ) {
8961 xmm1 += a1 *
set( B(k,j ) );
8962 xmm2 += a1 *
set( B(k,j+1UL) );
8963 xmm3 += a1 *
set( B(k,j+2UL) );
8964 xmm4 += a1 *
set( B(k,j+3UL) );
// Combine the two partial sums per column, scale, and store.
8967 (~C).store( i, j , (xmm1+xmm5) * factor );
8968 (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
8969 (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
8970 (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
// Three columns per iteration: xmm1..xmm3 for k, xmm4..xmm6 for k+1.
8973 for( ; (j+3UL) <= jend; j+=3UL )
8984 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8987 for( ; (k+2UL) <= kend; k+=2UL ) {
8989 const SIMDType a2( A.load(i,k+1UL) );
8990 xmm1 += a1 *
set( B(k ,j ) );
8991 xmm2 += a1 *
set( B(k ,j+1UL) );
8992 xmm3 += a1 *
set( B(k ,j+2UL) );
8993 xmm4 += a2 *
set( B(k+1UL,j ) );
8994 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
8995 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
8998 for( ; k<kend; ++k ) {
9000 xmm1 += a1 *
set( B(k,j ) );
9001 xmm2 += a1 *
set( B(k,j+1UL) );
9002 xmm3 += a1 *
set( B(k,j+2UL) );
9005 (~C).store( i, j , (xmm1+xmm4) * factor );
9006 (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
9007 (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
// Two columns per iteration (the accumulation loops follow this header).
9010 for( ; (j+2UL) <= jend; j+=2UL )
9024 for( ; k<kend; ++k ) {
9026 xmm1 += a1 *
set( B(k,j ) );
9027 xmm2 += a1 *
set( B(k,j+1UL) );
9030 for( ; (k+2UL) <= kend; k+=2UL ) {
9032 const SIMDType a2( A.load(i,k+1UL) );
9033 xmm1 += a1 *
set( B(k ,j ) );
9034 xmm2 += a1 *
set( B(k ,j+1UL) );
9035 xmm3 += a2 *
set( B(k+1UL,j ) );
9036 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
// Store for the two-column path of the single-SIMD-row tier: the partial sums
// for k (xmm1/xmm2) and k+1 (xmm3/xmm4) are combined, scaled, and written.
9039 (~C).store( i, j , (xmm1+xmm3) * factor );
9040 (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
// Single leftover column. Note that this path bounds k by K (the full inner
// dimension) rather than by a kend value.
9054 for( ; (k+2UL) <= K; k+=2UL ) {
9055 xmm1 += A.load(i,k ) *
set( B(k ,j) );
9056 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
// Trailing k value (its loop header is on a line not visible in this excerpt).
9060 xmm1 += A.load(i,k) *
set( B(k,j) );
9063 (~C).store( i, j, (xmm1+xmm2) * factor );
// Scalar remainder: rows past the SIMD tiers are computed element-wise, but
// only when `remainder` is set.
9067 for( ; remainder && i<M; ++i )
// Columns start at i only for results flagged both LOW and UPP, otherwise at 0.
9069 size_t j( LOW && UPP ? i : 0UL );
// Two columns per iteration, each with its own scalar accumulator.
9071 for( ; (j+2UL) <= N; j+=2UL )
9085 for(
size_t k=kbegin; k<kend; ++k ) {
9086 value1 += A(i,k) * B(k,j );
9087 value2 += A(i,k) * B(k,j+1UL);
9090 (~C)(i,j ) = value1 * scalar;
9091 (~C)(i,j+1UL) = value2 * scalar;
// Single leftover column; this path bounds k by K rather than by kend.
9104 for(
size_t k=kbegin; k<K; ++k ) {
9105 value += A(i,k) * B(k,j);
9108 (~C)(i,j) = value * scalar;
// Post-processing of the part the SIMD tiers skipped for structured results.
// SYM/HERM: fill C(i,j) for columns j >= 4*SIMDSIZE and rows below the largest
// multiple of 4*SIMDSIZE not exceeding j by mirroring (conjugating for HERM)
// the element C(j,i).
9113 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
9114 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9115 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9116 for(
size_t i=0UL; i<iend; ++i ) {
9117 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW only: visit rows above the block boundary for every column j >= 4*SIMDSIZE
// (loop body not visible in this excerpt; presumably resets those elements).
9121 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
9122 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9123 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9124 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: visit columns left of the block boundary for every row
// i >= 4*SIMDSIZE (loop body not visible; presumably resets those elements).
9129 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
9130 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
9131 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
9132 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel that performs no work of its own and simply
// forwards all arguments to selectDefaultAssignKernel.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor
// NOTE(review): the enable-if condition is not visible in this excerpt;
// presumably this is the non-vectorized fallback overload - confirm.
9154 template<
typename MT3
9159 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9161 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel that delegates to one of the dedicated
// multiplication routines smmm/hmmm/lmmm/ummm/mmm.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor passed on to the selected routine
// NOTE(review): the conditions choosing between the five calls are on lines not
// visible in this excerpt; presumably they test SYM, HERM, LOW and UPP in that
// order with mmm as the general fallback - confirm.
9180 template<
typename MT3
9185 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9188 smmm( C, A, B, scalar );
9190 hmmm( C, A, B, scalar );
// The lmmm/ummm/mmm calls take an additional ST2(0) argument, whereas the
// smmm/hmmm calls above do not.
9192 lmmm( C, A, B, scalar, ST2(0) );
9194 ummm( C, A, B, scalar, ST2(0) );
9196 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel that performs no BLAS call itself and simply forwards
// all arguments to selectLargeAssignKernel.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor
// NOTE(review): the enable-if condition is not visible in this excerpt;
// presumably this overload is selected when no BLAS kernel is applicable.
9214 template<
typename MT3
9219 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9221 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based assignment kernel, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled. Issues a trmm call
// with A on the left, a trmm call with B on the right, or a general gemm call.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor, converted to ET for the BLAS call
// NOTE(review): the conditions selecting between the three calls, and any
// statement that initializes C before a trmm call, are on lines not visible in
// this excerpt - confirm against the full file.
9226 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 9240 template<
typename MT3
9245 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular multiply with A applied from the left; the triangle is chosen
// from IsLower<MT4>.
9251 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Triangular multiply with B applied from the right; the triangle is chosen
// from IsLower<MT5>.
9255 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General multiply; the trailing ET(0) is passed alongside the scale factor.
9258 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment path that goes through a temporary: the expression rhs is first
// evaluated serially into a TmpType object, and the result of applying the
// ForwardFunctor to that temporary is then assigned to the target.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt; presumably the friend assign() overload for sparse targets - confirm.
9276 template<
typename MT
9294 const ForwardFunctor fwd;
// serial() is applied to rhs before the temporary is constructed from it.
9296 const TmpType tmp(
serial( rhs ) );
9297 assign( ~lhs, fwd( tmp ) );
// Entry point that hands the operands A and B and the scalar of rhs to
// DMatScalarMultExpr::selectAddAssignKernel.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt; presumably the friend addAssign() overload for dense targets.
9313 template<
typename MT
// Guard for empty targets or an empty inner dimension (the guarded statement is
// not visible here; presumably an early return).
9325 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the scalar stored in the expression.
9339 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatcher for C += A * B * scalar: chooses between the small-matrix
// kernel and the BLAS kernel.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor
9354 template<
typename MT3
9358 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Last visible term of the selection condition: the target has fewer than
// TDMATDMATMULT_THRESHOLD elements. The preceding terms are on lines not
// visible in this excerpt.
9363 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9364 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection is used.
9366 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel that adds a previously built object `tmp`
// to C via addAssign.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor
// NOTE(review): the construction of `tmp` is on a line not visible in this
// excerpt; presumably the evaluated product A * B * scalar - confirm.
9384 template<
typename MT3
9389 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9392 addAssign( C, tmp );
// Default addition-assignment kernel, enabled (see the EnableIf_ condition
// below) when MT4 is not diagonal and MT5 is diagonal. Adds a single product
// per element: C(i,j) += A(i,j) * B(j,j) * scalar, in BLOCK_SIZE tiles.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt; presumably a selectDefaultAddAssignKernel overload - confirm.
9410 template<
typename MT3
9414 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Edge length of the square tiles.
9417 constexpr
size_t block( BLOCK_SIZE );
// Result dimensions: M rows taken from A, N columns taken from B.
9419 const size_t M( A.rows() );
9420 const size_t N( B.columns() );
// Tile loop over rows; iend clamps the last tile to M.
9422 for(
size_t ii=0UL; ii<M; ii+=block ) {
9423 const size_t iend(
min( M, ii+block ) );
// Tile loop over columns; jend clamps the last tile to N.
9424 for(
size_t jj=0UL; jj<N; jj+=block ) {
9425 const size_t jend(
min( N, jj+block ) );
// Rows of the current tile.
9426 for(
size_t i=ii; i<iend; ++i )
// Only columns in [jbegin,jpos) are updated (bounds defined on lines not
// visible here); unlike plain assignment, the rest of the tile is left alone.
9435 for(
size_t j=jbegin; j<jpos; ++j ) {
9436 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition-assignment kernel, enabled when MT4 is not diagonal and MT5
// is diagonal. Iterates column by column and adds
// C(i,j) += A(i,j) * B(j,j) * scalar, processing two rows per loop iteration.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt; presumably a selectDefaultAddAssignKernel overload - confirm.
9458 template<
typename MT3
9462 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Result dimensions: M rows taken from A, N columns taken from B.
9465 const size_t M( A.rows() );
9466 const size_t N( B.columns() );
// One pass per result column.
9468 for(
size_t j=0UL; j<N; ++j )
// inum is the number of rows to update (ibegin/iend are defined on lines not
// visible here); ipos is ibegin plus inum rounded down to an even count, i.e.
// the end of the range that can be handled in pairs.
9478 const size_t inum( iend - ibegin );
9479 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Pairwise update of rows i and i+1.
9481 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9482 (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
9483 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the odd leftover row at ipos (its guarding condition is on a line
// not visible in this excerpt; presumably `ipos < iend`).
9486 (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition-assignment kernel that reads only the diagonal of A:
// C(i,j) += A(i,i) * B(i,j) * scalar. Iterates row by row and processes two
// columns per loop iteration.
// NOTE(review): the enable-if condition, function name and parameter list are
// not visible in this excerpt; presumably the overload for a diagonal left-hand
// operand - confirm.
9506 template<
typename MT3
// Result dimensions: M rows taken from A, N columns taken from B.
9513 const size_t M( A.rows() );
9514 const size_t N( B.columns() );
// One pass per result row.
9516 for(
size_t i=0UL; i<M; ++i )
// jnum is the number of columns to update (jbegin/jend are defined on lines not
// visible here); jpos is jbegin plus jnum rounded down to an even count, i.e.
// the end of the range that can be handled in pairs.
9526 const size_t jnum( jend - jbegin );
9527 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Pairwise update of columns j and j+1.
9529 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9530 (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
9531 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the odd leftover column at jpos (its guarding condition is on a
// line not visible in this excerpt; presumably `jpos < jend`).
9534 (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition-assignment kernel that reads only the diagonal of A:
// C(i,j) += A(i,i) * B(i,j) * scalar. The iteration space is blocked into
// BLOCK_SIZE x BLOCK_SIZE tiles with the column index as the outer tile loop.
// NOTE(review): the enable-if condition, function name and parameter list are
// not visible in this excerpt; presumably the overload for a diagonal left-hand
// operand - confirm.
9554 template<
typename MT3
// Edge length of the square tiles.
9561 constexpr
size_t block( BLOCK_SIZE );
// Result dimensions: M rows taken from A, N columns taken from B.
9563 const size_t M( A.rows() );
9564 const size_t N( B.columns() );
// Tile loop over columns; jend clamps the last tile to N.
9566 for(
size_t jj=0UL; jj<N; jj+=block ) {
9567 const size_t jend(
min( N, jj+block ) );
// Tile loop over rows; iend clamps the last tile to M.
9568 for(
size_t ii=0UL; ii<M; ii+=block ) {
9569 const size_t iend(
min( M, ii+block ) );
// Columns of the current tile.
9570 for(
size_t j=jj; j<jend; ++j )
// Only rows in [ibegin,ipos) are updated (bounds defined on lines not visible
// here); unlike plain assignment, the rest of the tile is left alone.
9579 for(
size_t i=ibegin; i<ipos; ++i ) {
9580 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition-assignment kernel that only touches the diagonal of the
// result: C(i,i) += A(i,i) * B(i,i) * scalar for every row index of A.
// \param C      target matrix
// \param A      left-hand side operand (only A(i,i) is read)
// \param B      right-hand side operand (only B(i,i) is read)
// \param scalar scaling factor applied to every product
// NOTE(review): the enable-if condition is not visible in this excerpt;
// presumably this overload is selected when both operands are diagonal.
9602 template<
typename MT3
9607 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Walk the diagonal only; A.rows() bounds the index.
9609 for(
size_t i=0UL; i<A.rows(); ++i ) {
9610 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition-assignment kernel that performs no work of its own and
// simply forwards all arguments to selectDefaultAddAssignKernel.
// \param C      target matrix
// \param A      left-hand side operand
// \param B      right-hand side operand
// \param scalar scaling factor
// NOTE(review): the enable-if condition is not visible in this excerpt;
// presumably this is the non-vectorized fallback overload - confirm.
9629 template<
typename MT3
9634 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9636 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized add-assignment kernel fragment: accumulates A*B in SIMD registers that
// are loaded along the columns of B (B.load(k,j...)) and adds the result, multiplied
// by 'factor', to the matching SIMD groups of C.
// NOTE(review): the signature is not visible in this excerpt; presumably this is the
// vectorized selectSmallAddAssignKernel overload for a row-major target - confirm.
// NOTE(review): the accumulators are only ever updated with '+=' in the visible
// lines, so the code relies on SIMDType default construction yielding zero -
// confirm against the SIMD type definition.
9655 template<
typename MT3
9664 const size_t M( A.rows() );
9665 const size_t N( B.columns() );
9666 const size_t K( A.columns() );
// Upper column bound of the SIMD loops: N with the low bits masked off when
// 'remainder' is set, N itself otherwise. The mask rounds down to a multiple of
// SIMDSIZE only if SIMDSIZE is a power of two (assumed - confirm).
9670 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// SIMD value built once from the scalar; every accumulator is multiplied by it
// before being added to C.
9673 const SIMDType factor(
set( scalar ) );
// Column panels of 8 SIMD vectors, one row at a time. Skipped entirely when LOW or
// UPP is set.
9679 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9680 for(
size_t i=0UL; i<M; ++i )
9693 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// a1 holds A(i,k) in SIMD form; it is multiplied with 8 adjacent SIMD groups of
// row k of B.
9695 for(
size_t k=kbegin; k<kend; ++k ) {
9696 const SIMDType a1(
set( A(i,k) ) );
9697 xmm1 += a1 * B.load(k,j );
9698 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9699 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9700 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9701 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9702 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9703 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9704 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
// Read-modify-write of C: existing contents plus the scaled accumulators.
9707 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9708 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9709 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9710 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9711 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9712 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
9713 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
9714 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
// Column panels of 5 SIMD vectors; again only when neither LOW nor UPP is set.
9719 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
// Two rows per iteration: xmm1..xmm5 are stored to row i, xmm6..xmm10 to row i+1.
9723 for( ; (i+2UL) <= M; i+=2UL )
9736 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9738 for(
size_t k=kbegin; k<kend; ++k ) {
9739 const SIMDType a1(
set( A(i ,k) ) );
9740 const SIMDType a2(
set( A(i+1UL,k) ) );
9742 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9743 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9744 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9745 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9758 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9759 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9760 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9761 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9762 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
9763 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
9764 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
9765 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
9766 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
9767 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
// Single-row variant of the 5-vector panel (presumably the leftover row when the
// two-row loop above stops - the enclosing condition is not visible here).
9779 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9781 for(
size_t k=kbegin; k<kend; ++k ) {
9782 const SIMDType a1(
set( A(i,k) ) );
9783 xmm1 += a1 * B.load(k,j );
9784 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9785 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9786 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9787 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9790 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9791 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9792 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9793 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9794 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
// Column panels of 4 SIMD vectors; only when neither LOW nor UPP is set.
9798 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Two rows per iteration: xmm1..xmm4 are stored to row i, xmm5..xmm8 to row i+1.
9802 for( ; (i+2UL) <= M; i+=2UL )
9815 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9817 for(
size_t k=kbegin; k<kend; ++k ) {
9818 const SIMDType a1(
set( A(i ,k) ) );
9819 const SIMDType a2(
set( A(i+1UL,k) ) );
9821 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9822 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9823 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9834 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9835 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9836 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9837 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9838 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
9839 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
9840 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
9841 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
// Single-row variant of the 4-vector panel.
9855 for(
size_t k=kbegin; k<kend; ++k ) {
9856 const SIMDType a1(
set( A(i,k) ) );
9857 xmm1 += a1 * B.load(k,j );
9858 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9859 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9860 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9863 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9864 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9865 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9866 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
// Column panels of 3 SIMD vectors; only when neither LOW nor UPP is set.
9870 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
// Two rows per iteration: xmm1..xmm3 are stored to row i, xmm4..xmm6 to row i+1.
9874 for( ; (i+2UL) <= M; i+=2UL )
9887 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9889 for(
size_t k=kbegin; k<kend; ++k ) {
9890 const SIMDType a1(
set( A(i ,k) ) );
9891 const SIMDType a2(
set( A(i+1UL,k) ) );
9893 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9894 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9903 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9904 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9905 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9906 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
9907 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
9908 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
// Single-row variant of the 3-vector panel.
9922 for(
size_t k=kbegin; k<kend; ++k ) {
9923 const SIMDType a1(
set( A(i,k) ) );
9924 xmm1 += a1 * B.load(k,j );
9925 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9926 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9929 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9930 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9931 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
// Column panels of 2 SIMD vectors; skipped only when LOW and UPP are both set.
9935 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Row range for this panel: ends at min(j+SIMDSIZE*2,M) when UPP is set (otherwise
// M) and starts at row j when LOW is set (otherwise 0).
9937 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
9938 size_t i( LOW ? j : 0UL );
// Four rows per iteration; each row owns two accumulators (one per SIMD vector).
9940 for( ; (i+4UL) <= iend; i+=4UL )
9953 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9955 for(
size_t k=kbegin; k<kend; ++k ) {
9956 const SIMDType a1(
set( A(i ,k) ) );
9957 const SIMDType a2(
set( A(i+1UL,k) ) );
9958 const SIMDType a3(
set( A(i+2UL,k) ) );
9959 const SIMDType a4(
set( A(i+3UL,k) ) );
9961 const SIMDType b2( B.load(k,j+SIMDSIZE) );
9972 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9973 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
9974 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9975 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
9976 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
9977 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
9978 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
9979 (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
// Three rows per iteration.
9982 for( ; (i+3UL) <= iend; i+=3UL )
9995 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9997 for(
size_t k=kbegin; k<kend; ++k ) {
9998 const SIMDType a1(
set( A(i ,k) ) );
9999 const SIMDType a2(
set( A(i+1UL,k) ) );
10000 const SIMDType a3(
set( A(i+2UL,k) ) );
10001 const SIMDType b1( B.load(k,j ) );
10002 const SIMDType b2( B.load(k,j+SIMDSIZE) );
10011 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10012 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
10013 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
10014 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10015 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
10016 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
// Two rows per iteration with the k loop unrolled by two: the stores below add the
// accumulator pairs (xmm1+xmm5, xmm2+xmm6, ...) before scaling, so xmm5..xmm8 act as
// a second set of partial sums.
10019 for( ; (i+2UL) <= iend; i+=2UL )
10032 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10033 size_t k( kbegin );
10035 for( ; (k+2UL) <= kend; k+=2UL ) {
10036 const SIMDType a1(
set( A(i ,k ) ) );
10037 const SIMDType a2(
set( A(i+1UL,k ) ) );
10038 const SIMDType a3(
set( A(i ,k+1UL) ) );
10039 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
10040 const SIMDType b1( B.load(k ,j ) );
10041 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
10042 const SIMDType b3( B.load(k+1UL,j ) );
10043 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
// Handles the last k when the number of k iterations is odd.
10054 for( ; k<kend; ++k ) {
10055 const SIMDType a1(
set( A(i ,k) ) );
10056 const SIMDType a2(
set( A(i+1UL,k) ) );
10057 const SIMDType b1( B.load(k,j ) );
10058 const SIMDType b2( B.load(k,j+SIMDSIZE) );
10065 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10066 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
10067 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + (xmm3+xmm7) * factor );
10068 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
// Single-row variant of the 2-vector panel, k unrolled by two (xmm1/xmm2 for k,
// xmm3/xmm4 for k+1; summed at the store).
10081 size_t k( kbegin );
10083 for( ; (k+2UL) <= kend; k+=2UL ) {
10084 const SIMDType a1(
set( A(i,k ) ) );
10085 const SIMDType a2(
set( A(i,k+1UL) ) );
10086 xmm1 += a1 * B.load(k ,j );
10087 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
10088 xmm3 += a2 * B.load(k+1UL,j );
10089 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
10092 for( ; k<kend; ++k ) {
10093 const SIMDType a1(
set( A(i,k) ) );
10094 xmm1 += a1 * B.load(k,j );
10095 xmm2 += a1 * B.load(k,j+SIMDSIZE);
10098 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10099 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
// Remaining columns below jpos, one SIMD vector at a time.
10103 for( ; j<jpos; j+=SIMDSIZE )
// Row range: clipped to min(j+SIMDSIZE,M) only when LOW and UPP are both set;
// starts at row j when LOW is set.
10105 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
10106 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k unrolled by two.
10108 for( ; (i+4UL) <= iend; i+=4UL )
10119 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10120 size_t k( kbegin );
10122 for( ; (k+2UL) <= kend; k+=2UL ) {
10123 const SIMDType b1( B.load(k ,j) );
10124 const SIMDType b2( B.load(k+1UL,j) );
10125 xmm1 +=
set( A(i ,k ) ) * b1;
10126 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10127 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10128 xmm4 +=
set( A(i+3UL,k ) ) * b1;
10129 xmm5 +=
set( A(i ,k+1UL) ) * b2;
10130 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
10131 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
10132 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
// Last k for an odd iteration count (the definition of b1 for this loop is on a
// line not visible in this excerpt).
10135 for( ; k<kend; ++k ) {
10137 xmm1 +=
set( A(i ,k) ) * b1;
10138 xmm2 +=
set( A(i+1UL,k) ) * b1;
10139 xmm3 +=
set( A(i+2UL,k) ) * b1;
10140 xmm4 +=
set( A(i+3UL,k) ) * b1;
10143 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm5) * factor );
10144 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm6) * factor );
10145 (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm7) * factor );
10146 (~C).store( i+3UL, j, (~C).load(i+3UL,j) + (xmm4+xmm8) * factor );
// Three rows per iteration, k unrolled by two.
10149 for( ; (i+3UL) <= iend; i+=3UL )
10160 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10161 size_t k( kbegin );
10163 for( ; (k+2UL) <= kend; k+=2UL ) {
10164 const SIMDType b1( B.load(k ,j) );
10165 const SIMDType b2( B.load(k+1UL,j) );
10166 xmm1 +=
set( A(i ,k ) ) * b1;
10167 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10168 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10169 xmm4 +=
set( A(i ,k+1UL) ) * b2;
10170 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
10171 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
10174 for( ; k<kend; ++k ) {
10176 xmm1 +=
set( A(i ,k) ) * b1;
10177 xmm2 +=
set( A(i+1UL,k) ) * b1;
10178 xmm3 +=
set( A(i+2UL,k) ) * b1;
10181 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm4) * factor );
10182 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm5) * factor );
10183 (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm6) * factor );
// Two rows per iteration, k unrolled by two.
10186 for( ; (i+2UL) <= iend; i+=2UL )
10198 size_t k( kbegin );
10200 for( ; (k+2UL) <= kend; k+=2UL ) {
10201 const SIMDType b1( B.load(k ,j) );
10202 const SIMDType b2( B.load(k+1UL,j) );
10203 xmm1 +=
set( A(i ,k ) ) * b1;
10204 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10205 xmm3 +=
set( A(i ,k+1UL) ) * b2;
10206 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
10209 for( ; k<kend; ++k ) {
10211 xmm1 +=
set( A(i ,k) ) * b1;
10212 xmm2 +=
set( A(i+1UL,k) ) * b1;
10215 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10216 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm4) * factor );
// Single-row variant. Unlike the loops above, the k loops here run up to K (the
// full inner dimension) rather than to a kend bound.
10228 size_t k( kbegin );
10230 for( ; (k+2UL) <= K; k+=2UL ) {
10231 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
10232 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
10235 for( ; k<K; ++k ) {
10236 xmm1 +=
set( A(i,k) ) * B.load(k,j);
10239 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
// Scalar (non-SIMD) processing of the columns in [jpos,N); only entered when
// 'remainder' is set.
10243 for( ; remainder && j<N; ++j )
// Row range: ends after row j when UPP is set, starts at row j when LOW is set.
10245 const size_t iend( UPP ? j+1UL : M );
10246 size_t i( LOW ? j : 0UL );
// Two rows per iteration with plain element access.
10248 for( ; (i+2UL) <= iend; i+=2UL )
10262 for(
size_t k=kbegin; k<kend; ++k ) {
10263 value1 += A(i ,k) * B(k,j);
10264 value2 += A(i+1UL,k) * B(k,j);
10267 (~C)(i ,j) += value1 * scalar;
10268 (~C)(i+1UL,j) += value2 * scalar;
// Single-row variant of the scalar remainder.
10281 for(
size_t k=kbegin; k<K; ++k ) {
10282 value += A(i,k) * B(k,j);
10285 (~C)(i,j) += value * scalar;
// Vectorized add-assignment kernel fragment: accumulates A*B in SIMD registers that
// are loaded along the rows of A (A.load(i...,k)) and adds the result, multiplied by
// 'factor', to the matching SIMD groups of C.
// NOTE(review): the signature is not visible in this excerpt; presumably this is the
// vectorized selectSmallAddAssignKernel overload for a column-major target - confirm.
// NOTE(review): the accumulators are only ever updated with '+=' in the visible
// lines, so the code relies on SIMDType default construction yielding zero -
// confirm against the SIMD type definition.
10306 template<
typename MT3
10315 const size_t M( A.rows() );
10316 const size_t N( B.columns() );
10317 const size_t K( A.columns() );
// Upper row bound of the SIMD loops: M with the low bits masked off when 'remainder'
// is set, M itself otherwise. The assertion checks that the mask equals rounding M
// down to a multiple of SIMDSIZE.
10321 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
10322 BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// SIMD value built once from the scalar; every accumulator is multiplied by it
// before being added to C.
10324 const SIMDType factor(
set( scalar ) );
// Row panels of 8 SIMD vectors, one column at a time. Skipped entirely when LOW or
// UPP is set.
10330 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10331 for(
size_t j=0UL; j<N; ++j )
10344 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// b1 holds B(k,j) in SIMD form; it is multiplied with 8 adjacent SIMD groups of
// column k of A.
10346 for(
size_t k=kbegin; k<kend; ++k ) {
10347 const SIMDType b1(
set( B(k,j) ) );
10348 xmm1 += A.load(i ,k) * b1;
10349 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10350 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10351 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10352 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10353 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
10354 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
10355 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Read-modify-write of C: existing contents plus the scaled accumulators.
10358 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10359 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10360 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10361 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10362 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10363 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
10364 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
10365 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// Row panels of 5 SIMD vectors; again only when neither LOW nor UPP is set.
10370 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Two columns per iteration: xmm1..xmm5 are stored to column j, xmm6..xmm10 to
// column j+1.
10374 for( ; (j+2UL) <= N; j+=2UL )
10387 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10389 for(
size_t k=kbegin; k<kend; ++k ) {
10390 const SIMDType a1( A.load(i ,k) );
10391 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10392 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10393 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10394 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
10395 const SIMDType b1(
set( B(k,j ) ) );
10396 const SIMDType b2(
set( B(k,j+1UL) ) );
10409 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10410 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10411 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10412 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10413 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
10414 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
10415 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
10416 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10417 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10418 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Single-column variant of the 5-vector panel (presumably the leftover column when
// the two-column loop above stops - the enclosing condition is not visible here).
10430 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10432 for(
size_t k=kbegin; k<kend; ++k ) {
10433 const SIMDType b1(
set( B(k,j) ) );
10434 xmm1 += A.load(i ,k) * b1;
10435 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10436 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10437 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10438 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10441 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10442 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10443 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10444 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10445 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
// Row panels of 4 SIMD vectors; only when neither LOW nor UPP is set.
10449 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Two columns per iteration: xmm1..xmm4 are stored to column j, xmm5..xmm8 to
// column j+1.
10453 for( ; (j+2UL) <= N; j+=2UL )
10466 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10468 for(
size_t k=kbegin; k<kend; ++k ) {
10469 const SIMDType a1( A.load(i ,k) );
10470 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10471 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10472 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10473 const SIMDType b1(
set( B(k,j ) ) );
10474 const SIMDType b2(
set( B(k,j+1UL) ) );
10485 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10486 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10487 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10488 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10489 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
10490 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
10491 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10492 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single-column variant of the 4-vector panel.
10506 for(
size_t k=kbegin; k<kend; ++k ) {
10507 const SIMDType b1(
set( B(k,j) ) );
10508 xmm1 += A.load(i ,k) * b1;
10509 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10510 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10511 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10514 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10515 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10516 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10517 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// Row panels of 3 SIMD vectors; only when neither LOW nor UPP is set.
10521 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
// Two columns per iteration: xmm1..xmm3 are stored to column j, xmm4..xmm6 to
// column j+1.
10525 for( ; (j+2UL) <= N; j+=2UL )
10538 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10540 for(
size_t k=kbegin; k<kend; ++k ) {
10541 const SIMDType a1( A.load(i ,k) );
10542 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10543 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10544 const SIMDType b1(
set( B(k,j ) ) );
10545 const SIMDType b2(
set( B(k,j+1UL) ) );
10554 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10555 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10556 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10557 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
10558 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
10559 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Single-column variant of the 3-vector panel.
10573 for(
size_t k=kbegin; k<kend; ++k ) {
10574 const SIMDType b1(
set( B(k,j) ) );
10575 xmm1 += A.load(i ,k) * b1;
10576 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10577 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10580 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10581 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10582 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
// Row panels of 2 SIMD vectors; skipped only when LOW and UPP are both set.
10586 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range for this panel: ends at min(i+SIMDSIZE*2,N) when LOW is set
// (otherwise N) and starts at column i when UPP is set (otherwise 0).
10588 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
10589 size_t j( UPP ? i : 0UL );
// Four columns per iteration; each column owns two accumulators (one per SIMD
// vector).
10591 for( ; (j+4UL) <= jend; j+=4UL )
10604 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10606 for(
size_t k=kbegin; k<kend; ++k ) {
10607 const SIMDType a1( A.load(i ,k) );
10608 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10609 const SIMDType b1(
set( B(k,j ) ) );
10610 const SIMDType b2(
set( B(k,j+1UL) ) );
10611 const SIMDType b3(
set( B(k,j+2UL) ) );
10612 const SIMDType b4(
set( B(k,j+3UL) ) );
10623 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10624 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10625 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10626 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10627 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10628 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10629 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
10630 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
// Three columns per iteration.
// NOTE(review): eight accumulators are declared here, but only xmm1..xmm6 are
// stored below; xmm7/xmm8 look unused - confirm against the accumulation lines
// that are not visible in this excerpt.
10633 for( ; (j+3UL) <= jend; j+=3UL )
10646 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10648 for(
size_t k=kbegin; k<kend; ++k ) {
10649 const SIMDType a1( A.load(i ,k) );
10650 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10651 const SIMDType b1(
set( B(k,j ) ) );
10652 const SIMDType b2(
set( B(k,j+1UL) ) );
10653 const SIMDType b3(
set( B(k,j+2UL) ) );
10662 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10663 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10664 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10665 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10666 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10667 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
// Two columns per iteration with the k loop unrolled by two: the stores below add
// the accumulator pairs (xmm1+xmm5, xmm2+xmm6, ...) before scaling, so xmm5..xmm8
// act as a second set of partial sums.
10670 for( ; (j+2UL) <= jend; j+=2UL )
10683 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10684 size_t k( kbegin );
10686 for( ; (k+2UL) <= kend; k+=2UL ) {
10687 const SIMDType a1( A.load(i ,k ) );
10688 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
10689 const SIMDType a3( A.load(i ,k+1UL) );
10690 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
10691 const SIMDType b1(
set( B(k ,j ) ) );
10692 const SIMDType b2(
set( B(k ,j+1UL) ) );
10693 const SIMDType b3(
set( B(k+1UL,j ) ) );
10694 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Handles the last k when the number of k iterations is odd.
10705 for( ; k<kend; ++k ) {
10706 const SIMDType a1( A.load(i ,k) );
10707 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10708 const SIMDType b1(
set( B(k,j ) ) );
10709 const SIMDType b2(
set( B(k,j+1UL) ) );
10716 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10717 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
10718 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
10719 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// Single-column variant of the 2-vector panel, k unrolled by two (xmm1/xmm2 for k,
// xmm3/xmm4 for k+1; summed at the store).
10732 size_t k( kbegin );
10734 for( ; (k+2UL) <= kend; k+=2UL ) {
10735 const SIMDType b1(
set( B(k ,j) ) );
10736 const SIMDType b2(
set( B(k+1UL,j) ) );
10737 xmm1 += A.load(i ,k ) * b1;
10738 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
10739 xmm3 += A.load(i ,k+1UL) * b2;
10740 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
10743 for( ; k<kend; ++k ) {
10744 const SIMDType b1(
set( B(k,j) ) );
10745 xmm1 += A.load(i ,k) * b1;
10746 xmm2 += A.load(i+SIMDSIZE,k) * b1;
10749 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10750 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
// Remaining rows below ipos, one SIMD vector at a time.
10754 for( ; i<ipos; i+=SIMDSIZE )
// Column range: clipped to min(i+SIMDSIZE,N) only when LOW and UPP are both set;
// starts at column i when UPP is set.
10756 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
10757 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two.
10759 for( ; (j+4UL) <= jend; j+=4UL )
10770 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10771 size_t k( kbegin );
10773 for( ; (k+2UL) <= kend; k+=2UL ) {
10774 const SIMDType a1( A.load(i,k ) );
10775 const SIMDType a2( A.load(i,k+1UL) );
10776 xmm1 += a1 *
set( B(k ,j ) );
10777 xmm2 += a1 *
set( B(k ,j+1UL) );
10778 xmm3 += a1 *
set( B(k ,j+2UL) );
10779 xmm4 += a1 *
set( B(k ,j+3UL) );
10780 xmm5 += a2 *
set( B(k+1UL,j ) );
10781 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
10782 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
10783 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Last k for an odd iteration count (the definition of a1 for this loop is on a
// line not visible in this excerpt).
10786 for( ; k<kend; ++k ) {
10788 xmm1 += a1 *
set( B(k,j ) );
10789 xmm2 += a1 *
set( B(k,j+1UL) );
10790 xmm3 += a1 *
set( B(k,j+2UL) );
10791 xmm4 += a1 *
set( B(k,j+3UL) );
10794 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
10795 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
10796 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
10797 (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
// Three columns per iteration, k unrolled by two.
10800 for( ; (j+3UL) <= jend; j+=3UL )
10811 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10812 size_t k( kbegin );
10814 for( ; (k+2UL) <= kend; k+=2UL ) {
10815 const SIMDType a1( A.load(i,k ) );
10816 const SIMDType a2( A.load(i,k+1UL) );
10817 xmm1 += a1 *
set( B(k ,j ) );
10818 xmm2 += a1 *
set( B(k ,j+1UL) );
10819 xmm3 += a1 *
set( B(k ,j+2UL) );
10820 xmm4 += a2 *
set( B(k+1UL,j ) );
10821 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
10822 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
10825 for( ; k<kend; ++k ) {
10827 xmm1 += a1 *
set( B(k,j ) );
10828 xmm2 += a1 *
set( B(k,j+1UL) );
10829 xmm3 += a1 *
set( B(k,j+2UL) );
10832 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
10833 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
10834 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
// Two columns per iteration, k unrolled by two.
10837 for( ; (j+2UL) <= jend; j+=2UL )
10849 size_t k( kbegin );
10851 for( ; (k+2UL) <= kend; k+=2UL ) {
10852 const SIMDType a1( A.load(i,k ) );
10853 const SIMDType a2( A.load(i,k+1UL) );
10854 xmm1 += a1 *
set( B(k ,j ) );
10855 xmm2 += a1 *
set( B(k ,j+1UL) );
10856 xmm3 += a2 *
set( B(k+1UL,j ) );
10857 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
10860 for( ; k<kend; ++k ) {
10862 xmm1 += a1 *
set( B(k,j ) );
10863 xmm2 += a1 *
set( B(k,j+1UL) );
10866 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10867 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
// Single-column variant. Unlike the loops above, the k loops here run up to K (the
// full inner dimension) rather than to a kend bound.
10879 size_t k( kbegin );
10881 for( ; (k+2UL) <= K; k+=2UL ) {
10882 xmm1 += A.load(i,k ) *
set( B(k ,j) );
10883 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
10886 for( ; k<K; ++k ) {
10887 xmm1 += A.load(i,k) *
set( B(k,j) );
10890 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
// Scalar (non-SIMD) processing of the rows in [ipos,M); only entered when
// 'remainder' is set.
10894 for( ; remainder && i<M; ++i )
// Column range: ends after column i when LOW is set, starts at column i when UPP
// is set.
10896 const size_t jend( LOW ? i+1UL : N );
10897 size_t j( UPP ? i : 0UL );
// Two columns per iteration with plain element access.
10899 for( ; (j+2UL) <= jend; j+=2UL )
10913 for(
size_t k=kbegin; k<kend; ++k ) {
10914 value1 += A(i,k) * B(k,j );
10915 value2 += A(i,k) * B(k,j+1UL);
10918 (~C)(i,j ) += value1 * scalar;
10919 (~C)(i,j+1UL) += value2 * scalar;
// Single-column variant of the scalar remainder.
10932 for(
size_t k=kbegin; k<K; ++k ) {
10933 value += A(i,k) * B(k,j);
10936 (~C)(i,j) += value * scalar;
// Large-matrix add-assignment kernel that simply forwards all arguments to
// selectDefaultAddAssignKernel.
// NOTE(review): the enabling condition is not visible in this excerpt; presumably
// this is the fallback used when the vectorized large kernel does not apply.
10956 template<
typename MT3
10961 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10963 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix add-assignment kernel that delegates to one of the lmmm / ummm / mmm
// routines, passing the scaling factor and ST2(1) as the trailing arguments.
// NOTE(review): the conditions choosing between the three calls are not visible in
// this excerpt (presumably the LOW / UPP flags), and the meaning of the last two
// arguments (presumably the factors for the product and for the existing C) should
// be confirmed against the declarations of these routines.
10982 template<
typename MT3
10987 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10990 lmmm( C, A, B, scalar, ST2(1) );
10992 ummm( C, A, B, scalar, ST2(1) );
10994 mmm( C, A, B, scalar, ST2(1) );
// BLAS-path add-assignment kernel that simply forwards all arguments to
// selectLargeAddAssignKernel.
// NOTE(review): the enabling condition is not visible in this excerpt; presumably
// this is the fallback used when the BLAS kernel cannot be applied.
11012 template<
typename MT3
11017 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11019 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-backed add-assignment kernel fragment; compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): the branch conditions and the declaration of 'tmp' are not visible
// in this excerpt - presumably each trmm branch starts from a copy of the other
// operand; confirm.
11024 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 11038 template<
typename MT3
11043 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// trmm with A applied from the left (CblasLeft); the lower/upper flag follows
// IsLower<MT4>. The result in tmp is then added to C.
11049 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11050 addAssign( C, tmp );
// trmm with B applied from the right (CblasRight); the lower/upper flag follows
// IsLower<MT5>. The result in tmp is then added to C.
11054 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11055 addAssign( C, tmp );
// General case: a single gemm call with factors ET(scalar) and ET(1).
11058 gemm( C, A, B, ET(scalar), ET(1) );
// Subtraction-assignment entry point fragment for the scaled matrix product.
// NOTE(review): the function-name line is not visible in this excerpt; presumably
// this is the subAssign overload for a dense target - confirm.
11080 template<
typename MT
// Degenerate sizes: the target has no rows or columns, or the inner dimension
// (left.columns()) is zero. The statement guarded by this test is not visible here.
11092 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel dispatcher with the target, the two operands A and B
// (defined on lines not visible here) and the expression's scalar.
11106 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Dispatcher for the subtraction-assignment kernels: chooses between the small
// kernel and the BLAS kernel.
// NOTE(review): only the tail of the condition is visible in this excerpt (target
// element count below TDMATDMATMULT_THRESHOLD); the leading part of the condition
// and the 'else' between the two calls are on lines not shown - confirm.
//
// \param C      Target matrix.
// \param A      Left-hand side operand.
// \param B      Right-hand side operand.
// \param scalar Scaling factor.
11121 template<
typename MT3
11125 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11130 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11131 selectSmallSubAssignKernel( C, A, B, scalar );
11133 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel fragment: subtracts a temporary 'tmp' from C.
// NOTE(review): the definition of 'tmp' is not visible in this excerpt; presumably
// it holds the evaluated scaled product A * B * scalar - confirm.
11151 template<
typename MT3
11156 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11159 subAssign( C, tmp );
// Blocked subtraction-assignment kernel fragment for a non-diagonal left operand and
// a diagonal right operand (see the EnableIf_ condition below):
// C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): the line carrying the function name is not part of this excerpt;
// presumably this is a selectDefaultSubAssignKernel overload - confirm.
11177 template<
typename MT3
11181 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Edge length of the square tiles used to traverse C.
11184 constexpr
size_t block( BLOCK_SIZE );
11186 const size_t M( A.rows() );
11187 const size_t N( B.columns() );
// Outer loops step over row tiles [ii,iend) and column tiles [jj,jend); the min()
// calls clip the last tile in each direction to the matrix bounds.
11189 for(
size_t ii=0UL; ii<M; ii+=block ) {
11190 const size_t iend(
min( M, ii+block ) );
11191 for(
size_t jj=0UL; jj<N; jj+=block ) {
11192 const size_t jend(
min( N, jj+block ) );
// Within a tile, rows are visited one at a time.
11193 for(
size_t i=ii; i<iend; ++i )
// Column range [jbegin,jpos) for this row is computed on lines not visible here.
11202 for(
size_t j=jbegin; j<jpos; ++j ) {
11203 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Subtraction-assignment kernel fragment for a non-diagonal left operand and a
// diagonal right operand (see the EnableIf_ condition below).
// NOTE(review): the line carrying the function name is not part of this excerpt;
// presumably this is a selectDefaultSubAssignKernel overload - confirm.
11225 template<
typename MT3
11229 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
11232 const size_t M( A.rows() );
11233 const size_t N( B.columns() );
// Walk C column by column; only the diagonal element B(j,j) of B is read.
11235 for(
size_t j=0UL; j<N; ++j )
// inum rows are updated in column j; ipos is ibegin plus inum rounded down to an
// even count (inum & size_t(-2) clears the lowest bit) so the loop can step by two.
11245 const size_t inum( iend - ibegin );
11246 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration: C(i,j) -= A(i,j) * B(j,j) * scalar.
11248 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
11249 (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
11250 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Leftover row when inum is odd.
11252 if( ipos < iend ) {
11253 (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Subtraction-assignment kernel fragment in which only the diagonal element A(i,i)
// of the left operand is read, i.e. row i of B is scaled by A(i,i) and subtracted
// from row i of C.
// NOTE(review): the signature and enabling condition are not visible in this
// excerpt; presumably the diagonal-left / general-right overload - confirm.
11273 template<
typename MT3
11280 const size_t M( A.rows() );
11281 const size_t N( B.columns() );
// Walk C row by row.
11283 for(
size_t i=0UL; i<M; ++i )
// jnum columns are updated in row i; jpos is jbegin plus jnum rounded down to an
// even count (jnum & size_t(-2) clears the lowest bit) so the loop can step by two.
11293 const size_t jnum( jend - jbegin );
11294 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration: C(i,j) -= A(i,i) * B(i,j) * scalar.
11296 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
11297 (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
11298 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Leftover column when jnum is odd.
11300 if( jpos < jend ) {
11301 (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Blocked subtraction-assignment kernel fragment in which only the diagonal element
// A(i,i) of the left operand is read: C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): the signature and enabling condition are not visible in this
// excerpt; presumably the diagonal-left / general-right overload - confirm.
11321 template<
typename MT3
// Edge length of the square tiles used to traverse C.
11328 constexpr
size_t block( BLOCK_SIZE );
11330 const size_t M( A.rows() );
11331 const size_t N( B.columns() );
// Outer loops step over column tiles [jj,jend) and row tiles [ii,iend); the min()
// calls clip the last tile in each direction to the matrix bounds.
11333 for(
size_t jj=0UL; jj<N; jj+=block ) {
11334 const size_t jend(
min( N, jj+block ) );
11335 for(
size_t ii=0UL; ii<M; ii+=block ) {
11336 const size_t iend(
min( M, ii+block ) );
// Within a tile, columns are visited one at a time.
11337 for(
size_t j=jj; j<jend; ++j )
// Row range [ibegin,ipos) for this column is computed on lines not visible here.
11346 for(
size_t i=ibegin; i<ipos; ++i ) {
11347 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Subtraction-assignment kernel that touches only the diagonal: for every row index
// i of A, C(i,i) -= A(i,i) * B(i,i) * scalar.
// NOTE(review): the enabling condition is not visible in this excerpt; presumably
// this overload is selected when both operands are diagonal - confirm.
//
// \param C      Target matrix; only its diagonal elements are modified.
// \param A      Left-hand side operand; only A(i,i) is read.
// \param B      Right-hand side operand; only B(i,i) is read.
// \param scalar Scaling factor applied to each product.
11369 template<
typename MT3
11374 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11376 for(
size_t i=0UL; i<A.rows(); ++i ) {
11377 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel that simply forwards all arguments to
// selectDefaultSubAssignKernel.
// NOTE(review): the enabling condition is not visible in this excerpt; presumably
// this is the fallback used when the vectorized small kernel does not apply.
11396 template<
typename MT3
11401 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11403 selectDefaultSubAssignKernel( C, A, B, scalar );
11422 template<
typename MT3
11431 const size_t M( A.rows() );
11432 const size_t N( B.columns() );
11433 const size_t K( A.columns() );
11437 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
11438 BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
11440 const SIMDType factor(
set( scalar ) );
11446 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
11447 for(
size_t i=0UL; i<M; ++i )
11460 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11462 for(
size_t k=kbegin; k<kend; ++k ) {
11463 const SIMDType a1(
set( A(i,k) ) );
11464 xmm1 += a1 * B.load(k,j );
11465 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11466 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11467 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11468 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11469 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
11470 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
11471 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
11474 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11475 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11476 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11477 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11478 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11479 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
11480 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
11481 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
11486 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
11490 for( ; (i+2UL) <= M; i+=2UL )
11503 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11505 for(
size_t k=kbegin; k<kend; ++k ) {
11506 const SIMDType a1(
set( A(i ,k) ) );
11507 const SIMDType a2(
set( A(i+1UL,k) ) );
11508 const SIMDType b1( B.load(k,j ) );
11509 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11510 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11511 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11512 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
11525 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11526 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11527 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11528 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11529 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
11530 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
11531 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
11532 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
11533 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
11534 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
11546 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11548 for(
size_t k=kbegin; k<kend; ++k ) {
11549 const SIMDType a1(
set( A(i,k) ) );
11550 xmm1 += a1 * B.load(k,j );
11551 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11552 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11553 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11554 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11557 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11558 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11559 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11560 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11561 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11565 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
11569 for( ; (i+2UL) <= M; i+=2UL )
11582 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11584 for(
size_t k=kbegin; k<kend; ++k ) {
11585 const SIMDType a1(
set( A(i ,k) ) );
11586 const SIMDType a2(
set( A(i+1UL,k) ) );
11587 const SIMDType b1( B.load(k,j ) );
11588 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11589 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11590 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11601 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11602 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11603 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11604 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11605 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
11606 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
11607 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
11608 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
11622 for(
size_t k=kbegin; k<kend; ++k ) {
11623 const SIMDType a1(
set( A(i,k) ) );
11624 xmm1 += a1 * B.load(k,j );
11625 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11626 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11627 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11630 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11631 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11632 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11633 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11637 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
11641 for( ; (i+2UL) <= M; i+=2UL )
11654 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11656 for(
size_t k=kbegin; k<kend; ++k ) {
11657 const SIMDType a1(
set( A(i ,k) ) );
11658 const SIMDType a2(
set( A(i+1UL,k) ) );
11659 const SIMDType b1( B.load(k,j ) );
11660 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11661 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11670 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11671 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11672 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11673 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
11674 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
11675 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
11689 for(
size_t k=kbegin; k<kend; ++k ) {
11690 const SIMDType a1(
set( A(i,k) ) );
11691 xmm1 += a1 * B.load(k,j );
11692 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11693 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11696 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11697 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11698 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11702 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
11704 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
11705 size_t i( LOW ? j : 0UL );
11707 for( ; (i+4UL) <= iend; i+=4UL )
11720 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11722 for(
size_t k=kbegin; k<kend; ++k ) {
11723 const SIMDType a1(
set( A(i ,k) ) );
11724 const SIMDType a2(
set( A(i+1UL,k) ) );
11725 const SIMDType a3(
set( A(i+2UL,k) ) );
11726 const SIMDType a4(
set( A(i+3UL,k) ) );
11727 const SIMDType b1( B.load(k,j ) );
11728 const SIMDType b2( B.load(k,j+SIMDSIZE) );
11739 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11740 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11741 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11742 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11743 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11744 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11745 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
11746 (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
11749 for( ; (i+3UL) <= iend; i+=3UL )
11762 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11764 for(
size_t k=kbegin; k<kend; ++k ) {
11765 const SIMDType a1(
set( A(i ,k) ) );
11766 const SIMDType a2(
set( A(i+1UL,k) ) );
11767 const SIMDType a3(
set( A(i+2UL,k) ) );
11768 const SIMDType b1( B.load(k,j ) );
11769 const SIMDType b2( B.load(k,j+SIMDSIZE) );
11778 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11779 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11780 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11781 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11782 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11783 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11786 for( ; (i+2UL) <= iend; i+=2UL )
11799 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11800 size_t k( kbegin );
11802 for( ; (k+2UL) <= kend; k+=2UL ) {
11803 const SIMDType a1(
set( A(i ,k ) ) );
11804 const SIMDType a2(
set( A(i+1UL,k ) ) );
11805 const SIMDType a3(
set( A(i ,k+1UL) ) );
11806 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
11807 const SIMDType b1( B.load(k ,j ) );
11808 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
11809 const SIMDType b3( B.load(k+1UL,j ) );
11810 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
11821 for( ; k<kend; ++k ) {
11822 const SIMDType a1(
set( A(i ,k) ) );
11823 const SIMDType a2(
set( A(i+1UL,k) ) );
11824 const SIMDType b1( B.load(k,j ) );
11825 const SIMDType b2( B.load(k,j+SIMDSIZE) );
11832 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
11833 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
11834 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - (xmm3+xmm7) * factor );
11835 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
11848 size_t k( kbegin );
11850 for( ; (k+2UL) <= kend; k+=2UL ) {
11851 const SIMDType a1(
set( A(i,k ) ) );
11852 const SIMDType a2(
set( A(i,k+1UL) ) );
11853 xmm1 += a1 * B.load(k ,j );
11854 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
11855 xmm3 += a2 * B.load(k+1UL,j );
11856 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
11859 for( ; k<kend; ++k ) {
11860 const SIMDType a1(
set( A(i,k) ) );
11861 xmm1 += a1 * B.load(k,j );
11862 xmm2 += a1 * B.load(k,j+SIMDSIZE);
11865 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
11866 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
11870 for( ; j<jpos; j+=SIMDSIZE )
11872 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
11873 size_t i( LOW ? j : 0UL );
11875 for( ; (i+4UL) <= iend; i+=4UL )
11886 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11887 size_t k( kbegin );
11889 for( ; (k+2UL) <= kend; k+=2UL ) {
11890 const SIMDType b1( B.load(k ,j) );
11891 const SIMDType b2( B.load(k+1UL,j) );
11892 xmm1 +=
set( A(i ,k ) ) * b1;
11893 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11894 xmm3 +=
set( A(i+2UL,k ) ) * b1;
11895 xmm4 +=
set( A(i+3UL,k ) ) * b1;
11896 xmm5 +=
set( A(i ,k+1UL) ) * b2;
11897 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
11898 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
11899 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
11902 for( ; k<kend; ++k ) {
11904 xmm1 +=
set( A(i ,k) ) * b1;
11905 xmm2 +=
set( A(i+1UL,k) ) * b1;
11906 xmm3 +=
set( A(i+2UL,k) ) * b1;
11907 xmm4 +=
set( A(i+3UL,k) ) * b1;
11910 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm5) * factor );
11911 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm6) * factor );
11912 (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm7) * factor );
11913 (~C).store( i+3UL, j, (~C).load(i+3UL,j) - (xmm4+xmm8) * factor );
11916 for( ; (i+3UL) <= iend; i+=3UL )
11927 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11928 size_t k( kbegin );
11930 for( ; (k+2UL) <= kend; k+=2UL ) {
11931 const SIMDType b1( B.load(k ,j) );
11932 const SIMDType b2( B.load(k+1UL,j) );
11933 xmm1 +=
set( A(i ,k ) ) * b1;
11934 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11935 xmm3 +=
set( A(i+2UL,k ) ) * b1;
11936 xmm4 +=
set( A(i ,k+1UL) ) * b2;
11937 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
11938 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
11941 for( ; k<kend; ++k ) {
11943 xmm1 +=
set( A(i ,k) ) * b1;
11944 xmm2 +=
set( A(i+1UL,k) ) * b1;
11945 xmm3 +=
set( A(i+2UL,k) ) * b1;
11948 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm4) * factor );
11949 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm5) * factor );
11950 (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm6) * factor );
11953 for( ; (i+2UL) <= iend; i+=2UL )
11965 size_t k( kbegin );
11967 for( ; (k+2UL) <= kend; k+=2UL ) {
11968 const SIMDType b1( B.load(k ,j) );
11969 const SIMDType b2( B.load(k+1UL,j) );
11970 xmm1 +=
set( A(i ,k ) ) * b1;
11971 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11972 xmm3 +=
set( A(i ,k+1UL) ) * b2;
11973 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
11976 for( ; k<kend; ++k ) {
11978 xmm1 +=
set( A(i ,k) ) * b1;
11979 xmm2 +=
set( A(i+1UL,k) ) * b1;
11982 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
11983 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm4) * factor );
11995 size_t k( kbegin );
11997 for( ; (k+2UL) <= K; k+=2UL ) {
11998 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
11999 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
12002 for( ; k<K; ++k ) {
12003 xmm1 +=
set( A(i,k) ) * B.load(k,j);
12006 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
12010 for( ; remainder && j<N; ++j )
12012 const size_t iend( UPP ? j+1UL : M );
12013 size_t i( LOW ? j : 0UL );
12015 for( ; (i+2UL) <= iend; i+=2UL )
12029 for(
size_t k=kbegin; k<kend; ++k ) {
12030 value1 += A(i ,k) * B(k,j);
12031 value2 += A(i+1UL,k) * B(k,j);
12034 (~C)(i ,j) -= value1 * scalar;
12035 (~C)(i+1UL,j) -= value2 * scalar;
12048 for(
size_t k=kbegin; k<K; ++k ) {
12049 value += A(i,k) * B(k,j);
12052 (~C)(i,j) -= value * scalar;
12073 template<
typename MT3
12082 const size_t M( A.rows() );
12083 const size_t N( B.columns() );
12084 const size_t K( A.columns() );
12088 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
12089 BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
12091 const SIMDType factor(
set( scalar ) );
12097 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
12098 for(
size_t j=0UL; j<N; ++j )
12111 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12113 for(
size_t k=kbegin; k<kend; ++k ) {
12114 const SIMDType b1(
set( B(k,j) ) );
12115 xmm1 += A.load(i ,k) * b1;
12116 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12117 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12118 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12119 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12120 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
12121 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
12122 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
12125 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12126 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12127 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12128 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12129 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12130 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
12131 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
12132 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
12137 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
12141 for( ; (j+2UL) <= N; j+=2UL )
12154 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12156 for(
size_t k=kbegin; k<kend; ++k ) {
12157 const SIMDType a1( A.load(i ,k) );
12158 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12159 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12160 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12161 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
12162 const SIMDType b1(
set( B(k,j ) ) );
12163 const SIMDType b2(
set( B(k,j+1UL) ) );
12176 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12177 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12178 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12179 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12180 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
12181 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
12182 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
12183 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12184 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12185 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
12197 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12199 for(
size_t k=kbegin; k<kend; ++k ) {
12200 const SIMDType b1(
set( B(k,j) ) );
12201 xmm1 += A.load(i ,k) * b1;
12202 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12203 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12204 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12205 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12208 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12209 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12210 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12211 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12212 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12216 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
12220 for( ; (j+2UL) <= N; j+=2UL )
12233 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12235 for(
size_t k=kbegin; k<kend; ++k ) {
12236 const SIMDType a1( A.load(i ,k) );
12237 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12238 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12239 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12240 const SIMDType b1(
set( B(k,j ) ) );
12241 const SIMDType b2(
set( B(k,j+1UL) ) );
12252 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12253 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12254 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12255 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12256 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
12257 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
12258 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12259 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
12273 for(
size_t k=kbegin; k<kend; ++k ) {
12274 const SIMDType b1(
set( B(k,j) ) );
12275 xmm1 += A.load(i ,k) * b1;
12276 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12277 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12278 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12281 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12282 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12283 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12284 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12288 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
12292 for( ; (j+2UL) <= N; j+=2UL )
12305 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12307 for(
size_t k=kbegin; k<kend; ++k ) {
12308 const SIMDType a1( A.load(i ,k) );
12309 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12310 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12311 const SIMDType b1(
set( B(k,j ) ) );
12312 const SIMDType b2(
set( B(k,j+1UL) ) );
12321 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12322 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12323 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12324 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
12325 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
12326 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
12340 for(
size_t k=kbegin; k<kend; ++k ) {
12341 const SIMDType b1(
set( B(k,j) ) );
12342 xmm1 += A.load(i ,k) * b1;
12343 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12344 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12347 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12348 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12349 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12353 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
12355 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
12356 size_t j( UPP ? i : 0UL );
12358 for( ; (j+4UL) <= jend; j+=4UL )
12371 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12373 for(
size_t k=kbegin; k<kend; ++k ) {
12374 const SIMDType a1( A.load(i ,k) );
12375 const SIMDType a2( A.load(i+SIMDSIZE,k) );
12376 const SIMDType b1(
set( B(k,j ) ) );
12377 const SIMDType b2(
set( B(k,j+1UL) ) );
12378 const SIMDType b3(
set( B(k,j+2UL) ) );
12379 const SIMDType b4(
set( B(k,j+3UL) ) );
12390 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12391 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12392 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12393 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12394 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12395 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12396 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
12397 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
12400 for( ; (j+3UL) <= jend; j+=3UL )
12413 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12415 for(
size_t k=kbegin; k<kend; ++k ) {
12416 const SIMDType a1( A.load(i ,k) );
12417 const SIMDType a2( A.load(i+SIMDSIZE,k) );
12418 const SIMDType b1(
set( B(k,j ) ) );
12419 const SIMDType b2(
set( B(k,j+1UL) ) );
12420 const SIMDType b3(
set( B(k,j+2UL) ) );
12429 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12430 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12431 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12432 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12433 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12434 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12437 for( ; (j+2UL) <= jend; j+=2UL )
12450 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12451 size_t k( kbegin );
12453 for( ; (k+2UL) <= kend; k+=2UL ) {
12454 const SIMDType a1( A.load(i ,k ) );
12455 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
12456 const SIMDType a3( A.load(i ,k+1UL) );
12457 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
12458 const SIMDType b1(
set( B(k ,j ) ) );
12459 const SIMDType b2(
set( B(k ,j+1UL) ) );
12460 const SIMDType b3(
set( B(k+1UL,j ) ) );
12461 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
12472 for( ; k<kend; ++k ) {
12473 const SIMDType a1( A.load(i ,k) );
12474 const SIMDType a2( A.load(i+SIMDSIZE,k) );
12475 const SIMDType b1(
set( B(k,j ) ) );
12476 const SIMDType b2(
set( B(k,j+1UL) ) );
12483 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
12484 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
12485 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
12486 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
12499 size_t k( kbegin );
12501 for( ; (k+2UL) <= kend; k+=2UL ) {
12502 const SIMDType b1(
set( B(k ,j) ) );
12503 const SIMDType b2(
set( B(k+1UL,j) ) );
12504 xmm1 += A.load(i ,k ) * b1;
12505 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
12506 xmm3 += A.load(i ,k+1UL) * b2;
12507 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
12510 for( ; k<kend; ++k ) {
12511 const SIMDType b1(
set( B(k,j) ) );
12512 xmm1 += A.load(i ,k) * b1;
12513 xmm2 += A.load(i+SIMDSIZE,k) * b1;
12516 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
12517 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
12521 for( ; i<ipos; i+=SIMDSIZE )
12523 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
12524 size_t j( UPP ? i : 0UL );
12526 for( ; (j+4UL) <= jend; j+=4UL )
12537 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12538 size_t k( kbegin );
12540 for( ; (k+2UL) <= kend; k+=2UL ) {
12541 const SIMDType a1( A.load(i,k ) );
12542 const SIMDType a2( A.load(i,k+1UL) );
12543 xmm1 += a1 *
set( B(k ,j ) );
12544 xmm2 += a1 *
set( B(k ,j+1UL) );
12545 xmm3 += a1 *
set( B(k ,j+2UL) );
12546 xmm4 += a1 *
set( B(k ,j+3UL) );
12547 xmm5 += a2 *
set( B(k+1UL,j ) );
12548 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
12549 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
12550 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
12553 for( ; k<kend; ++k ) {
12555 xmm1 += a1 *
set( B(k,j ) );
12556 xmm2 += a1 *
set( B(k,j+1UL) );
12557 xmm3 += a1 *
set( B(k,j+2UL) );
12558 xmm4 += a1 *
set( B(k,j+3UL) );
12561 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
12562 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
12563 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
12564 (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
12567 for( ; (j+3UL) <= jend; j+=3UL )
12578 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12579 size_t k( kbegin );
12581 for( ; (k+2UL) <= kend; k+=2UL ) {
12582 const SIMDType a1( A.load(i,k ) );
12583 const SIMDType a2( A.load(i,k+1UL) );
12584 xmm1 += a1 *
set( B(k ,j ) );
12585 xmm2 += a1 *
set( B(k ,j+1UL) );
12586 xmm3 += a1 *
set( B(k ,j+2UL) );
12587 xmm4 += a2 *
set( B(k+1UL,j ) );
12588 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
12589 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
12592 for( ; k<kend; ++k ) {
12594 xmm1 += a1 *
set( B(k,j ) );
12595 xmm2 += a1 *
set( B(k,j+1UL) );
12596 xmm3 += a1 *
set( B(k,j+2UL) );
12599 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
12600 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
12601 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
12604 for( ; (j+2UL) <= jend; j+=2UL )
12616 size_t k( kbegin );
12618 for( ; (k+2UL) <= kend; k+=2UL ) {
12619 const SIMDType a1( A.load(i,k ) );
12620 const SIMDType a2( A.load(i,k+1UL) );
12621 xmm1 += a1 *
set( B(k ,j ) );
12622 xmm2 += a1 *
set( B(k ,j+1UL) );
12623 xmm3 += a2 *
set( B(k+1UL,j ) );
12624 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
12627 for( ; k<kend; ++k ) {
12629 xmm1 += a1 *
set( B(k,j ) );
12630 xmm2 += a1 *
set( B(k,j+1UL) );
12633 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
12634 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
12646 size_t k( kbegin );
12648 for( ; (k+2UL) <= K; k+=2UL ) {
12649 xmm1 += A.load(i,k ) *
set( B(k ,j) );
12650 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
12653 for( ; k<K; ++k ) {
12654 xmm1 += A.load(i,k) *
set( B(k,j) );
12657 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
12661 for( ; remainder && i<M; ++i )
12663 const size_t jend( LOW ? i+1UL : N );
12664 size_t j( UPP ? i : 0UL );
12666 for( ; (j+2UL) <= jend; j+=2UL )
12680 for(
size_t k=kbegin; k<kend; ++k ) {
12681 value1 += A(i,k) * B(k,j );
12682 value2 += A(i,k) * B(k,j+1UL);
12685 (~C)(i,j ) -= value1 * scalar;
12686 (~C)(i,j+1UL) -= value2 * scalar;
12699 for(
size_t k=kbegin; k<K; ++k ) {
12700 value += A(i,k) * B(k,j);
12703 (~C)(i,j) -= value * scalar;
// Large-matrix subtraction-assignment kernel (fallback overload).
// The visible body forwards all four arguments unchanged to
// selectDefaultSubAssignKernel(), so this overload adds no code path of its own.
// NOTE(review): the template parameter list after MT3 and the return type/constraint
// that selects this overload are not part of this listing.
12723 template<
typename MT3
12728 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12730 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel that delegates to one of the
// lmmm() / ummm() / mmm() routines.
// Every call passes the *negated* scalar followed by ST2(1).
// NOTE(review): the conditions choosing between the three calls sit on lines that are
// not part of this listing (presumably LOW selects lmmm and UPP selects ummm, with mmm
// as the general case). The last two arguments are presumably the scaling factors of
// the product and of C, which would make each call compute C = C - scalar*A*B --
// confirm against the declarations of lmmm/ummm/mmm.
12749 template<
typename MT3
12754 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12757 lmmm( C, A, B, -scalar, ST2(1) );
12759 ummm( C, A, B, -scalar, ST2(1) );
12761 mmm( C, A, B, -scalar, ST2(1) );
// BLAS subtraction-assignment kernel (fallback overload).
// The visible body forwards all four arguments unchanged to
// selectLargeSubAssignKernel(), i.e. no BLAS routine is called on this path.
// NOTE(review): the template parameter list after MT3 and the return type/constraint
// that selects this overload are not part of this listing.
12779 template<
typename MT3
12784 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12786 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction-assignment kernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
// Three code paths are visible:
//   - trmm() with A applied from the left  (CblasLeft),  then subAssign( C, tmp )
//   - trmm() with B applied from the right (CblasRight), then subAssign( C, tmp )
//   - gemm() writing directly into C
// NOTE(review): the conditions selecting a path and the definition of 'tmp' and 'ET'
// are on lines that are not part of this listing.
12791 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 12805 template<
typename MT3
12810 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular multiply of tmp by A from the left; the lower/upper flag follows
// IsLower<MT4>. The scaled result held in tmp is then subtracted from C.
12816 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12817 subAssign( C, tmp );
// Same scheme with B applied from the right; the flag follows IsLower<MT5>.
12821 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12822 subAssign( C, tmp );
// General case: the scalar is negated and ET(1) is passed last, presumably as the
// BLAS alpha/beta pair so that C = C - scalar*A*B -- confirm against the gemm wrapper.
12825 gemm( C, A, B, ET(-scalar), ET(1) );
12847 template<
typename MT
12861 schurAssign( ~lhs, tmp );
12892 template<
typename MT
12905 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
12908 else if( left.columns() == 0UL ) {
12942 template<
typename MT
12961 const ForwardFunctor fwd;
12963 const TmpType tmp( rhs );
12983 template<
typename MT
12996 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
13033 template<
typename MT
13046 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
13080 template<
typename MT
13163 template<
typename MT1
13165 inline decltype(
auto)
13213 template<
typename MT1
13228 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13259 template<
typename MT1
13274 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13305 template<
typename MT1
13320 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13351 template<
typename MT1
13366 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13397 template<
typename MT1
13412 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13428 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13429 struct Rows< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13446 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13447 struct Columns< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
// IsAligned specialization for TDMatDMatMultExpr.
// The trait value is the logical AND of IsAligned<MT1> and IsAligned<MT2>, i.e. the
// expression is reported as aligned only if both operand types are.
// The SF/HF/LF/UF flags do not take part in the result.
13464 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13465 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13466 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
13482 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13483 struct IsSymmetric< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13486 , IsBuiltin< ElementType_< TDMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
13487 , And< Bool<LF>, Bool<UF> > >::value >
13503 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
13504 struct IsHermitian< TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
// IsLower specialization for TDMatDMatMultExpr.
// Alternatives visible in the base clause:
//   - both operand types are lower (IsLower<MT1> and IsLower<MT2>), or
//   - SF or HF is set and both operand types are upper.
// NOTE(review): the first line of the base clause (the opening of the Or<> and its
// first alternative) is not part of this listing -- confirm against the full header.
13521 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13522 struct IsLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13524 , And< IsLower<MT1>, IsLower<MT2> >
13525 , And< Or< Bool<SF>, Bool<HF> >
13526 , IsUpper<MT1>, IsUpper<MT2> > >::value >
// IsUniLower specialization for TDMatDMatMultExpr.
// The trait is true if either
//   - both operand types are unilower (IsUniLower<MT1> and IsUniLower<MT2>), or
//   - SF or HF is set and both operand types are uniupper.
// The LF and UF flags do not take part in the result.
13542 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13543 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13544 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
13545 , And< Or< Bool<SF>, Bool<HF> >
13546 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
13562 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13564 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13565 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
13566 , And< Or< Bool<SF>, Bool<HF> >
13567 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13568 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
// IsUpper specialization for TDMatDMatMultExpr.
// Alternatives visible in the base clause:
//   - both operand types are upper (IsUpper<MT1> and IsUpper<MT2>), or
//   - SF or HF is set and both operand types are lower.
// NOTE(review): the first line of the base clause (the opening of the Or<> and its
// first alternative) is not part of this listing -- confirm against the full header.
13584 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13585 struct IsUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13587 , And< IsUpper<MT1>, IsUpper<MT2> >
13588 , And< Or< Bool<SF>, Bool<HF> >
13589 , IsLower<MT1>, IsLower<MT2> > >::value >
// IsUniUpper specialization for TDMatDMatMultExpr.
// The trait is true if either
//   - both operand types are uniupper (IsUniUpper<MT1> and IsUniUpper<MT2>), or
//   - SF or HF is set and both operand types are unilower.
// The LF and UF flags do not take part in the result.
13605 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13606 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13607 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
13608 , And< Or< Bool<SF>, Bool<HF> >
13609 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
// Type-trait specialization whose 'struct' line is missing from this listing (the embedded
// numbers jump from 13625 to 13627).
// NOTE(review): judging by the terms below this is presumably the IsStrictlyUpper
// specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> - confirm against the header.
// Derives from BoolConstant<...> whose value is the Or of:
//   - And< IsStrictlyUpper<MT1>, IsUpper<MT2> >,
//   - And< IsStrictlyUpper<MT2>, IsUpper<MT1> >, and
//   - the SF or HF flag being set together with either
//     And< IsStrictlyLower<MT1>, IsLower<MT2> > or And< IsStrictlyLower<MT2>, IsLower<MT1> >.
// In words: one operand is strictly triangular and the other triangular on the same side;
// the lower-side combination only counts when SF or HF is set.
13625 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13627 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13628 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
13629 , And< Or< Bool<SF>, Bool<HF> >
13630 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13631 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:272
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:266
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:469
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:547
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:470
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:620
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:537
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Base class for all matrix/scalar multiplication expression templates. The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:278
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1027
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:157
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:403
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:413
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Base class for sparse matrices. The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:262
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:393
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:148
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:263
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:425
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications. The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:110
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:265
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:260
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:383
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:275
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:158
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1027
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:175
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:619
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:264
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:457
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:108
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:437
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:367
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:174
Header file for the MatScalarMultExpr base class.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:264
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:110
Utility type for generic codes.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:261
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:304
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation. The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1029
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:269
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1029
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:447
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:319
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:176
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:742
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:177
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.