35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 162 template<
typename MT1
168 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true >
169 ,
private MatMatMultExpr
170 ,
private Computation
195 SYM = ( SF && !( HF || LF || UF ) ),
196 HERM = ( HF && !( LF || UF ) ),
197 LOW = ( LF || ( ( SF || HF ) && UF ) ),
198 UPP = ( UF || ( ( SF || HF ) && LF ) )
208 template<
typename T1,
typename T2,
typename T3 >
209 struct IsEvaluationRequired {
210 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
220 template<
typename T1,
typename T2,
typename T3 >
221 struct UseBlasKernel {
228 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
243 template<
typename T1,
typename T2,
typename T3 >
244 struct UseVectorizedDefaultKernel {
247 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
272 ,
Noop > > > > ForwardFunctor;
305 MT1::simdEnabled && MT2::simdEnabled &&
310 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
311 !evaluateRight && MT2::smpAssignable };
366 :(
lhs_.columns() ) ) );
370 const size_t n(
end - begin );
388 inline ReturnType
at(
size_t i,
size_t j )
const {
389 if( i >=
lhs_.rows() ) {
392 if( j >=
rhs_.columns() ) {
404 inline size_t rows() const noexcept {
415 return rhs_.columns();
445 template<
typename T >
446 inline bool canAlias(
const T* alias )
const noexcept {
447 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
457 template<
typename T >
458 inline bool isAliased(
const T* alias )
const noexcept {
459 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
469 return lhs_.isAligned() &&
rhs_.isAligned();
480 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
481 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
505 template<
typename MT
514 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
517 else if( rhs.lhs_.columns() == 0UL ) {
522 LT A(
serial( rhs.lhs_ ) );
523 RT B(
serial( rhs.rhs_ ) );
532 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
548 template<
typename MT3
551 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
556 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
557 selectSmallAssignKernel( C, A, B );
559 selectBlasAssignKernel( C, A, B );
578 template<
typename MT3
584 const size_t M( A.rows() );
585 const size_t N( B.columns() );
586 const size_t K( A.columns() );
590 for(
size_t i=0UL; i<M; ++i )
601 for(
size_t j=0UL; j<N; ++j ) {
610 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
611 :( UPP ?
max(i,kbegin) : kbegin ) )
612 :( UPP ? i : 0UL ) );
615 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
616 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
617 :( LOW ? i+1UL : N ) );
620 for(
size_t j=0UL; j<jbegin; ++j ) {
625 reset( (~C)(i,0UL) );
627 for(
size_t j=jbegin; j<jend; ++j ) {
628 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
631 for(
size_t j=jend; j<N; ++j ) {
636 reset( (~C)(i,N-1UL) );
640 for(
size_t k=kbegin+1UL; k<kend; ++k )
644 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
645 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
646 :( SYM || HERM || UPP ? i : 0UL ) );
649 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
650 :( LOW ?
min(i+1UL,k) : k ) )
651 :( LOW ? i+1UL : N ) );
653 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
656 for(
size_t j=jbegin; j<jend; ++j ) {
657 (~C)(i,j) += A(i,k) * B(k,j);
660 (~C)(i,jend) = A(i,k) * B(k,jend);
666 for(
size_t i=1UL; i<M; ++i ) {
667 for(
size_t j=0UL; j<i; ++j ) {
668 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
690 template<
typename MT3
696 const size_t M( A.rows() );
697 const size_t N( B.columns() );
698 const size_t K( A.columns() );
702 for(
size_t j=0UL; j<N; ++j )
713 for(
size_t i=0UL; i<M; ++i ) {
722 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
723 :( LOW ?
max(j,kbegin) : kbegin ) )
724 :( LOW ? j : 0UL ) );
727 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
728 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
729 :( UPP ? j+1UL : M ) );
732 for(
size_t i=0UL; i<ibegin; ++i ) {
737 reset( (~C)(0UL,j) );
739 for(
size_t i=ibegin; i<iend; ++i ) {
740 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
743 for(
size_t i=iend; i<M; ++i ) {
748 reset( (~C)(M-1UL,j) );
752 for(
size_t k=kbegin+1UL; k<kend; ++k )
756 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
757 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
758 :( SYM || HERM || LOW ? j : 0UL ) );
761 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
762 :( UPP ?
min(j+1UL,k) : k ) )
763 :( UPP ? j+1UL : M ) );
765 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
768 for(
size_t i=ibegin; i<iend; ++i ) {
769 (~C)(i,j) += A(i,k) * B(k,j);
772 (~C)(iend,j) = A(iend,k) * B(k,j);
778 for(
size_t j=1UL; j<N; ++j ) {
779 for(
size_t i=0UL; i<j; ++i ) {
780 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
802 template<
typename MT3
805 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
808 constexpr
size_t block( BLOCK_SIZE );
810 const size_t M( A.rows() );
811 const size_t N( B.columns() );
813 for(
size_t ii=0UL; ii<M; ii+=block ) {
814 const size_t iend(
min( M, ii+block ) );
815 for(
size_t jj=0UL; jj<N; jj+=block ) {
816 const size_t jend(
min( N, jj+block ) );
817 for(
size_t i=ii; i<iend; ++i )
827 for(
size_t j=jj; j<jbegin; ++j ) {
831 for(
size_t j=jbegin; j<jpos; ++j ) {
832 (~C)(i,j) = A(i,j) * B(j,j);
835 for(
size_t j=jpos; j<jend; ++j ) {
860 template<
typename MT3
863 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
866 const size_t M( A.rows() );
867 const size_t N( B.columns() );
869 for(
size_t j=0UL; j<N; ++j )
880 for(
size_t i=0UL; i<ibegin; ++i ) {
884 for(
size_t i=ibegin; i<iend; ++i ) {
885 (~C)(i,j) = A(i,j) * B(j,j);
888 for(
size_t i=iend; i<M; ++i ) {
911 template<
typename MT3
917 const size_t M( A.rows() );
918 const size_t N( B.columns() );
920 for(
size_t i=0UL; i<M; ++i )
931 for(
size_t j=0UL; j<jbegin; ++j ) {
935 for(
size_t j=jbegin; j<jend; ++j ) {
936 (~C)(i,j) = A(i,i) * B(i,j);
939 for(
size_t j=jend; j<N; ++j ) {
962 template<
typename MT3
968 constexpr
size_t block( BLOCK_SIZE );
970 const size_t M( A.rows() );
971 const size_t N( B.columns() );
973 for(
size_t jj=0UL; jj<N; jj+=block ) {
974 const size_t jend(
min( N, jj+block ) );
975 for(
size_t ii=0UL; ii<M; ii+=block ) {
976 const size_t iend(
min( M, ii+block ) );
977 for(
size_t j=jj; j<jend; ++j )
987 for(
size_t i=ii; i<ibegin; ++i ) {
991 for(
size_t i=ibegin; i<ipos; ++i ) {
992 (~C)(i,j) = A(i,i) * B(i,j);
995 for(
size_t i=ipos; i<iend; ++i ) {
1020 template<
typename MT3
1024 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1028 for(
size_t i=0UL; i<A.rows(); ++i ) {
1029 C(i,i) = A(i,i) * B(i,i);
1049 template<
typename MT3
1053 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1055 selectDefaultAssignKernel( ~C, A, B );
1075 template<
typename MT3
1083 const size_t M( A.rows() );
1084 const size_t N( B.columns() );
1085 const size_t K( A.columns() );
1089 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1092 if( LOW && UPP && N > SIMDSIZE*3UL ) {
1101 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1102 for(
size_t i=0UL; i<M; ++i )
1115 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1117 for(
size_t k=kbegin; k<kend; ++k ) {
1118 const SIMDType a1(
set( A(i,k) ) );
1119 xmm1 += a1 * B.load(k,j );
1120 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1121 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1122 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1123 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1124 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1125 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1126 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1129 (~C).store( i, j , xmm1 );
1130 (~C).store( i, j+SIMDSIZE , xmm2 );
1131 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1132 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1133 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1134 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1135 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1136 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1141 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1145 for( ; (i+2UL) <= M; i+=2UL )
1158 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1160 for(
size_t k=kbegin; k<kend; ++k ) {
1161 const SIMDType a1(
set( A(i ,k) ) );
1162 const SIMDType a2(
set( A(i+1UL,k) ) );
1163 const SIMDType b1( B.load(k,j ) );
1164 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1165 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1166 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1167 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1180 (~C).store( i , j , xmm1 );
1181 (~C).store( i , j+SIMDSIZE , xmm2 );
1182 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1183 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1184 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
1185 (~C).store( i+1UL, j , xmm6 );
1186 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
1187 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1188 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1189 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1201 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1203 for(
size_t k=kbegin; k<kend; ++k ) {
1204 const SIMDType a1(
set( A(i,k) ) );
1205 xmm1 += a1 * B.load(k,j );
1206 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1207 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1208 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1209 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1212 (~C).store( i, j , xmm1 );
1213 (~C).store( i, j+SIMDSIZE , xmm2 );
1214 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1215 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1216 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1220 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1222 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
1223 size_t i( LOW ? j : 0UL );
1225 for( ; (i+2UL) <= iend; i+=2UL )
1238 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1240 for(
size_t k=kbegin; k<kend; ++k ) {
1241 const SIMDType a1(
set( A(i ,k) ) );
1242 const SIMDType a2(
set( A(i+1UL,k) ) );
1243 const SIMDType b1( B.load(k,j ) );
1244 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1245 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1246 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1257 (~C).store( i , j , xmm1 );
1258 (~C).store( i , j+SIMDSIZE , xmm2 );
1259 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1260 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1261 (~C).store( i+1UL, j , xmm5 );
1262 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1263 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1264 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1276 SIMDType xmm1, xmm2, xmm3, xmm4;
1278 for(
size_t k=kbegin; k<kend; ++k ) {
1279 const SIMDType a1(
set( A(i,k) ) );
1280 xmm1 += a1 * B.load(k,j );
1281 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1282 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1283 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1286 (~C).store( i, j , xmm1 );
1287 (~C).store( i, j+SIMDSIZE , xmm2 );
1288 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1289 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1293 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1295 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
1296 size_t i( LOW ? j : 0UL );
1298 for( ; (i+2UL) <= iend; i+=2UL )
1311 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1313 for(
size_t k=kbegin; k<kend; ++k ) {
1314 const SIMDType a1(
set( A(i ,k) ) );
1315 const SIMDType a2(
set( A(i+1UL,k) ) );
1316 const SIMDType b1( B.load(k,j ) );
1317 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1318 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1327 (~C).store( i , j , xmm1 );
1328 (~C).store( i , j+SIMDSIZE , xmm2 );
1329 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1330 (~C).store( i+1UL, j , xmm4 );
1331 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1332 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1344 SIMDType xmm1, xmm2, xmm3;
1346 for(
size_t k=kbegin; k<kend; ++k ) {
1347 const SIMDType a1(
set( A(i,k) ) );
1348 xmm1 += a1 * B.load(k,j );
1349 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1350 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1353 (~C).store( i, j , xmm1 );
1354 (~C).store( i, j+SIMDSIZE , xmm2 );
1355 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1359 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1361 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
1362 size_t i( LOW ? j : 0UL );
1364 for( ; (i+2UL) <= iend; i+=2UL )
1377 SIMDType xmm1, xmm2, xmm3, xmm4;
1379 for(
size_t k=kbegin; k<kend; ++k ) {
1380 const SIMDType a1(
set( A(i ,k) ) );
1381 const SIMDType a2(
set( A(i+1UL,k) ) );
1382 const SIMDType b1( B.load(k,j ) );
1383 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1390 (~C).store( i , j , xmm1 );
1391 (~C).store( i , j+SIMDSIZE, xmm2 );
1392 (~C).store( i+1UL, j , xmm3 );
1393 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1405 SIMDType xmm1, xmm2;
1407 for(
size_t k=kbegin; k<kend; ++k ) {
1408 const SIMDType a1(
set( A(i,k) ) );
1409 xmm1 += a1 * B.load(k,j );
1410 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1413 (~C).store( i, j , xmm1 );
1414 (~C).store( i, j+SIMDSIZE, xmm2 );
1418 for( ; j<jpos; j+=SIMDSIZE )
1420 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
1421 size_t i( LOW ? j : 0UL );
1423 for( ; (i+2UL) <= iend; i+=2UL )
1434 SIMDType xmm1, xmm2;
1436 for(
size_t k=kbegin; k<kend; ++k ) {
1437 const SIMDType b1( B.load(k,j) );
1438 xmm1 +=
set( A(i ,k) ) * b1;
1439 xmm2 +=
set( A(i+1UL,k) ) * b1;
1442 (~C).store( i , j, xmm1 );
1443 (~C).store( i+1UL, j, xmm2 );
1456 for(
size_t k=kbegin; k<K; ++k ) {
1457 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1460 (~C).store( i, j, xmm1 );
1464 for( ; remainder && j<N; ++j )
1466 size_t i( LOW && UPP ? j : 0UL );
1468 for( ; (i+2UL) <= M; i+=2UL )
1482 for(
size_t k=kbegin; k<kend; ++k ) {
1483 value1 += A(i ,k) * B(k,j);
1484 value2 += A(i+1UL,k) * B(k,j);
1487 (~C)(i ,j) = value1;
1488 (~C)(i+1UL,j) = value2;
1501 for(
size_t k=kbegin; k<K; ++k ) {
1502 value += A(i,k) * B(k,j);
1510 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1511 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1512 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1513 for(
size_t j=0UL; j<jend; ++j ) {
1514 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1518 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1519 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1520 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1521 for(
size_t i=0UL; i<iend; ++i ) {
1526 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1527 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1528 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1529 for(
size_t j=0UL; j<jend; ++j ) {
1553 template<
typename MT3
1561 const size_t M( A.rows() );
1562 const size_t N( B.columns() );
1563 const size_t K( A.columns() );
1567 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1570 if( LOW && UPP && M > SIMDSIZE*3UL ) {
1579 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1580 for(
size_t j=0UL; j<N; ++j )
1593 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1595 for(
size_t k=kbegin; k<kend; ++k ) {
1596 const SIMDType b1(
set( B(k,j) ) );
1597 xmm1 += A.load(i ,k) * b1;
1598 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1599 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1600 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1601 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1602 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1603 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1604 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1607 (~C).store( i , j, xmm1 );
1608 (~C).store( i+SIMDSIZE , j, xmm2 );
1609 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1610 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1611 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1612 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1613 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1614 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1619 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1623 for( ; (j+2UL) <= N; j+=2UL )
1636 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1638 for(
size_t k=kbegin; k<kend; ++k ) {
1639 const SIMDType a1( A.load(i ,k) );
1640 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1641 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1642 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1643 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1644 const SIMDType b1(
set( B(k,j ) ) );
1645 const SIMDType b2(
set( B(k,j+1UL) ) );
1658 (~C).store( i , j , xmm1 );
1659 (~C).store( i+SIMDSIZE , j , xmm2 );
1660 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1661 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1662 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1663 (~C).store( i , j+1UL, xmm6 );
1664 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1665 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1666 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1667 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1679 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1681 for(
size_t k=kbegin; k<kend; ++k ) {
1682 const SIMDType b1(
set( B(k,j) ) );
1683 xmm1 += A.load(i ,k) * b1;
1684 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1685 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1686 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1687 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1690 (~C).store( i , j, xmm1 );
1691 (~C).store( i+SIMDSIZE , j, xmm2 );
1692 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1693 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1694 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1698 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1700 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
1701 size_t j( UPP ? i : 0UL );
1703 for( ; (j+2UL) <= jend; j+=2UL )
1716 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1718 for(
size_t k=kbegin; k<kend; ++k ) {
1719 const SIMDType a1( A.load(i ,k) );
1720 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1721 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1722 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1723 const SIMDType b1(
set( B(k,j ) ) );
1724 const SIMDType b2(
set( B(k,j+1UL) ) );
1735 (~C).store( i , j , xmm1 );
1736 (~C).store( i+SIMDSIZE , j , xmm2 );
1737 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1738 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1739 (~C).store( i , j+1UL, xmm5 );
1740 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1741 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1742 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1754 SIMDType xmm1, xmm2, xmm3, xmm4;
1756 for(
size_t k=kbegin; k<kend; ++k ) {
1757 const SIMDType b1(
set( B(k,j) ) );
1758 xmm1 += A.load(i ,k) * b1;
1759 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1760 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1761 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1764 (~C).store( i , j, xmm1 );
1765 (~C).store( i+SIMDSIZE , j, xmm2 );
1766 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1767 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1771 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1773 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
1774 size_t j( UPP ? i : 0UL );
1776 for( ; (j+2UL) <= jend; j+=2UL )
1789 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1791 for(
size_t k=kbegin; k<kend; ++k ) {
1792 const SIMDType a1( A.load(i ,k) );
1793 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1794 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1795 const SIMDType b1(
set( B(k,j ) ) );
1796 const SIMDType b2(
set( B(k,j+1UL) ) );
1805 (~C).store( i , j , xmm1 );
1806 (~C).store( i+SIMDSIZE , j , xmm2 );
1807 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1808 (~C).store( i , j+1UL, xmm4 );
1809 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1810 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1822 SIMDType xmm1, xmm2, xmm3;
1824 for(
size_t k=kbegin; k<kend; ++k ) {
1825 const SIMDType b1(
set( B(k,j) ) );
1826 xmm1 += A.load(i ,k) * b1;
1827 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1828 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1831 (~C).store( i , j, xmm1 );
1832 (~C).store( i+SIMDSIZE , j, xmm2 );
1833 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1837 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1839 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
1840 size_t j( UPP ? i : 0UL );
1842 for( ; (j+2UL) <= jend; j+=2UL )
1855 SIMDType xmm1, xmm2, xmm3, xmm4;
1857 for(
size_t k=kbegin; k<kend; ++k ) {
1858 const SIMDType a1( A.load(i ,k) );
1859 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1860 const SIMDType b1(
set( B(k,j ) ) );
1861 const SIMDType b2(
set( B(k,j+1UL) ) );
1868 (~C).store( i , j , xmm1 );
1869 (~C).store( i+SIMDSIZE, j , xmm2 );
1870 (~C).store( i , j+1UL, xmm3 );
1871 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1883 SIMDType xmm1, xmm2;
1885 for(
size_t k=kbegin; k<kend; ++k ) {
1886 const SIMDType b1(
set( B(k,j) ) );
1887 xmm1 += A.load(i ,k) * b1;
1888 xmm2 += A.load(i+SIMDSIZE,k) * b1;
1891 (~C).store( i , j, xmm1 );
1892 (~C).store( i+SIMDSIZE, j, xmm2 );
1896 for( ; i<ipos; i+=SIMDSIZE )
1898 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
1899 size_t j( UPP ? i : 0UL );
1901 for( ; (j+2UL) <= jend; j+=2UL )
1912 SIMDType xmm1, xmm2;
1914 for(
size_t k=kbegin; k<kend; ++k ) {
1915 const SIMDType a1( A.load(i,k) );
1916 xmm1 += a1 *
set( B(k,j ) );
1917 xmm2 += a1 *
set( B(k,j+1UL) );
1920 (~C).store( i, j , xmm1 );
1921 (~C).store( i, j+1UL, xmm2 );
1934 for(
size_t k=kbegin; k<K; ++k ) {
1935 xmm1 += A.load(i,k) *
set( B(k,j) );
1938 (~C).store( i, j, xmm1 );
1942 for( ; remainder && i<M; ++i )
1944 size_t j( LOW && UPP ? i : 0UL );
1946 for( ; (j+2UL) <= N; j+=2UL )
1960 for(
size_t k=kbegin; k<kend; ++k ) {
1961 value1 += A(i,k) * B(k,j );
1962 value2 += A(i,k) * B(k,j+1UL);
1965 (~C)(i,j ) = value1;
1966 (~C)(i,j+1UL) = value2;
1979 for(
size_t k=kbegin; k<K; ++k ) {
1980 value += A(i,k) * B(k,j);
1988 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1989 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1990 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1991 for(
size_t i=0UL; i<iend; ++i ) {
1992 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1996 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1997 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1998 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1999 for(
size_t i=0UL; i<iend; ++i ) {
2004 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2005 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2006 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2007 for(
size_t j=0UL; j<jend; ++j ) {
2030 template<
typename MT3
2034 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2036 selectDefaultAssignKernel( C, A, B );
2056 template<
typename MT3
2060 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2090 template<
typename MT3
2094 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2096 selectLargeAssignKernel( C, A, B );
2102 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2116 template<
typename MT3
2120 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2133 gemm( C, A, B, ET(1), ET(0) );
2153 template<
typename MT
2171 const ForwardFunctor fwd;
2173 const TmpType tmp(
serial( rhs ) );
2174 assign( ~lhs, fwd( tmp ) );
2192 template<
typename MT
2201 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2205 LT A(
serial( rhs.lhs_ ) );
2206 RT B(
serial( rhs.rhs_ ) );
2215 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2231 template<
typename MT3
2234 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2239 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2240 selectSmallAddAssignKernel( C, A, B );
2242 selectBlasAddAssignKernel( C, A, B );
2261 template<
typename MT3
2267 const size_t M( A.rows() );
2268 const size_t N( B.columns() );
2269 const size_t K( A.columns() );
2273 for(
size_t i=0UL; i<M; ++i )
2283 for(
size_t k=kbegin; k<kend; ++k )
2287 ?( UPP ?
max(i,k+1UL) : k+1UL )
2288 :( UPP ?
max(i,k) : k ) )
2289 :( UPP ? i : 0UL ) );
2292 ?( LOW ?
min(i+1UL,k) : k )
2293 :( LOW ?
min(i,k)+1UL : k+1UL ) )
2294 :( LOW ? i+1UL : N ) );
2296 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
2299 const size_t jnum( jend - jbegin );
2300 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2302 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2303 (~C)(i,j ) += A(i,k) * B(k,j );
2304 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2307 (~C)(i,jpos) += A(i,k) * B(k,jpos);
2329 template<
typename MT3
2335 const size_t M( A.rows() );
2336 const size_t N( B.columns() );
2337 const size_t K( A.columns() );
2341 for(
size_t j=0UL; j<N; ++j )
2351 for(
size_t k=kbegin; k<kend; ++k )
2355 ?( LOW ?
max(j,k+1UL) : k+1UL )
2356 :( LOW ?
max(j,k) : k ) )
2357 :( LOW ? j : 0UL ) );
2360 ?( UPP ?
min(j+1UL,k) : k )
2361 :( UPP ?
min(j,k)+1UL : k+1UL ) )
2362 :( UPP ? j+1UL : M ) );
2364 if( ( LOW || UPP ) && ibegin >= iend )
continue;
2367 const size_t inum( iend - ibegin );
2368 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2370 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2371 (~C)(i ,j) += A(i ,k) * B(k,j);
2372 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2375 (~C)(ipos,j) += A(ipos,k) * B(k,j);
2397 template<
typename MT3
2400 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2403 constexpr
size_t block( BLOCK_SIZE );
2405 const size_t M( A.rows() );
2406 const size_t N( B.columns() );
2408 for(
size_t ii=0UL; ii<M; ii+=block ) {
2409 const size_t iend(
min( M, ii+block ) );
2410 for(
size_t jj=0UL; jj<N; jj+=block ) {
2411 const size_t jend(
min( N, jj+block ) );
2412 for(
size_t i=ii; i<iend; ++i )
2421 for(
size_t j=jbegin; j<jpos; ++j ) {
2422 (~C)(i,j) += A(i,j) * B(j,j);
2445 template<
typename MT3
2448 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2451 const size_t M( A.rows() );
2452 const size_t N( B.columns() );
2454 for(
size_t j=0UL; j<N; ++j )
2464 const size_t inum( iend - ibegin );
2465 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2467 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2468 (~C)(i ,j) += A(i ,j) * B(j,j);
2469 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2472 (~C)(ipos,j) += A(ipos,j) * B(j,j);
2493 template<
typename MT3
2499 const size_t M( A.rows() );
2500 const size_t N( B.columns() );
2502 for(
size_t i=0UL; i<M; ++i )
2512 const size_t jnum( jend - jbegin );
2513 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2515 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2516 (~C)(i,j ) += A(i,i) * B(i,j );
2517 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2520 (~C)(i,jpos) += A(i,i) * B(i,jpos);
2541 template<
typename MT3
2547 constexpr
size_t block( BLOCK_SIZE );
2549 const size_t M( A.rows() );
2550 const size_t N( B.columns() );
2552 for(
size_t jj=0UL; jj<N; jj+=block ) {
2553 const size_t jend(
min( N, jj+block ) );
2554 for(
size_t ii=0UL; ii<M; ii+=block ) {
2555 const size_t iend(
min( M, ii+block ) );
2556 for(
size_t j=jj; j<jend; ++j )
2565 for(
size_t i=ibegin; i<ipos; ++i ) {
2566 (~C)(i,j) += A(i,i) * B(i,j);
2589 template<
typename MT3
2593 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2595 for(
size_t i=0UL; i<A.rows(); ++i ) {
2596 C(i,i) += A(i,i) * B(i,i);
// Small-matrix add-assignment kernel overload that performs no work of its own and
// simply forwards to selectDefaultAddAssignKernel.
// NOTE(review): the enabling constraint for this overload is not visible in this listing.
//   C  target matrix, updated in place
//   A  left-hand side operand
//   B  right-hand side operand
2616 template<
typename MT3
2620 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2622 selectDefaultAddAssignKernel( C, A, B );
// Vectorized add-assignment kernel fragment (C += A * B), SIMD along the column index j.
// For every (i,j) tile it loads the current values of C, accumulates set( A(i,k) ) * B.load(k,j)
// over k, and stores the result back into C. The j loop is tiered: 8, 5, 4, 3, 2 and 1 SIMD
// vectors per step, followed by a scalar loop for the columns beyond jpos.
// NOTE(review): the function name, the definitions of remainder, kbegin and kend, and
// several loop headers are not visible in this listing.
2642 template<
typename MT3
2650 const size_t M( A.rows() );
2651 const size_t N( B.columns() );
2652 const size_t K( A.columns() );
// With 'remainder' set, jpos is N masked with -SIMDSIZE (N rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise all N columns are vectorized.
2656 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Tier 1: eight SIMD vectors per row and step; only used when neither LOW nor UPP is set.
2663 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2664 for(
size_t i=0UL; i<M; ++i )
2677 SIMDType xmm1( (~C).load(i,j ) );
2678 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2679 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2680 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2681 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2682 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2683 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2684 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2686 for(
size_t k=kbegin; k<kend; ++k ) {
2687 const SIMDType a1(
set( A(i,k) ) );
2688 xmm1 += a1 * B.load(k,j );
2689 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2690 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2691 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2692 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2693 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2694 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2695 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2698 (~C).store( i, j , xmm1 );
2699 (~C).store( i, j+SIMDSIZE , xmm2 );
2700 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2701 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2702 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2703 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2704 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2705 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Tier 2: five SIMD vectors per step, two rows (i and i+1) at a time.
2710 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2714 for( ; (i+2UL) <= M; i+=2UL )
2727 SIMDType xmm1 ( (~C).load(i ,j ) );
2728 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
2729 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
2730 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
2731 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
2732 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
2733 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
2734 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2735 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2736 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
2738 for(
size_t k=kbegin; k<kend; ++k ) {
2739 const SIMDType a1(
set( A(i ,k) ) );
2740 const SIMDType a2(
set( A(i+1UL,k) ) );
2741 const SIMDType b1( B.load(k,j ) );
2742 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2743 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2744 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2745 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2758 (~C).store( i , j , xmm1 );
2759 (~C).store( i , j+SIMDSIZE , xmm2 );
2760 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2761 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2762 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
2763 (~C).store( i+1UL, j , xmm6 );
2764 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
2765 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2766 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2767 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Single-row variant of tier 2 (appears to handle the row left over by the two-row loop).
2779 SIMDType xmm1( (~C).load(i,j ) );
2780 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2781 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2782 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2783 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2785 for(
size_t k=kbegin; k<kend; ++k ) {
2786 const SIMDType a1(
set( A(i,k) ) );
2787 xmm1 += a1 * B.load(k,j );
2788 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2789 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2790 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2791 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2794 (~C).store( i, j , xmm1 );
2795 (~C).store( i, j+SIMDSIZE , xmm2 );
2796 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2797 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2798 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// Tier 3: four SIMD vectors per step, two rows at a time.
2802 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2806 for( ; (i+2UL) <= M; i+=2UL )
2819 SIMDType xmm1( (~C).load(i ,j ) );
2820 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2821 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2822 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2823 SIMDType xmm5( (~C).load(i+1UL,j ) );
2824 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2825 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2826 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2828 for(
size_t k=kbegin; k<kend; ++k ) {
2829 const SIMDType a1(
set( A(i ,k) ) );
2830 const SIMDType a2(
set( A(i+1UL,k) ) );
2831 const SIMDType b1( B.load(k,j ) );
2832 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2833 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2834 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2845 (~C).store( i , j , xmm1 );
2846 (~C).store( i , j+SIMDSIZE , xmm2 );
2847 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2848 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2849 (~C).store( i+1UL, j , xmm5 );
2850 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2851 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2852 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row variant of tier 3.
2864 SIMDType xmm1( (~C).load(i,j ) );
2865 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2866 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2867 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2869 for(
size_t k=kbegin; k<kend; ++k ) {
2870 const SIMDType a1(
set( A(i,k) ) );
2871 xmm1 += a1 * B.load(k,j );
2872 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2873 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2874 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2877 (~C).store( i, j , xmm1 );
2878 (~C).store( i, j+SIMDSIZE , xmm2 );
2879 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2880 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Tier 4: three SIMD vectors per step, two rows at a time.
2884 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2888 for( ; (i+2UL) <= M; i+=2UL )
2901 SIMDType xmm1( (~C).load(i ,j ) );
2902 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2903 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2904 SIMDType xmm4( (~C).load(i+1UL,j ) );
2905 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
2906 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2908 for(
size_t k=kbegin; k<kend; ++k ) {
2909 const SIMDType a1(
set( A(i ,k) ) );
2910 const SIMDType a2(
set( A(i+1UL,k) ) );
2911 const SIMDType b1( B.load(k,j ) );
2912 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2913 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2922 (~C).store( i , j , xmm1 );
2923 (~C).store( i , j+SIMDSIZE , xmm2 );
2924 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2925 (~C).store( i+1UL, j , xmm4 );
2926 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
2927 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Single-row variant of tier 4.
2939 SIMDType xmm1( (~C).load(i,j ) );
2940 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2941 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2943 for(
size_t k=kbegin; k<kend; ++k ) {
2944 const SIMDType a1(
set( A(i,k) ) );
2945 xmm1 += a1 * B.load(k,j );
2946 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2947 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2950 (~C).store( i, j , xmm1 );
2951 (~C).store( i, j+SIMDSIZE , xmm2 );
2952 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// Tier 5: two SIMD vectors per step. Skipped only when both LOW and UPP are set; the row
// range is restricted via LOW (start at j) and UPP (end at min(j+SIMDSIZE*2,M)).
2956 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2958 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
2959 size_t i( LOW ? j : 0UL );
2961 for( ; (i+2UL) <= iend; i+=2UL )
2974 SIMDType xmm1( (~C).load(i ,j ) );
2975 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2976 SIMDType xmm3( (~C).load(i+1UL,j ) );
2977 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2979 for(
size_t k=kbegin; k<kend; ++k ) {
2980 const SIMDType a1(
set( A(i ,k) ) );
2981 const SIMDType a2(
set( A(i+1UL,k) ) );
2982 const SIMDType b1( B.load(k,j ) );
2983 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2990 (~C).store( i , j , xmm1 );
2991 (~C).store( i , j+SIMDSIZE, xmm2 );
2992 (~C).store( i+1UL, j , xmm3 );
2993 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Single-row variant of tier 5.
3005 SIMDType xmm1( (~C).load(i,j ) );
3006 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3008 for(
size_t k=kbegin; k<kend; ++k ) {
3009 const SIMDType a1(
set( A(i,k) ) );
3010 xmm1 += a1 * B.load(k,j );
3011 xmm2 += a1 * B.load(k,j+SIMDSIZE);
3014 (~C).store( i, j , xmm1 );
3015 (~C).store( i, j+SIMDSIZE, xmm2 );
// Tier 6: one SIMD vector per step for the remaining vectorizable columns.
3019 for( ; j<jpos; j+=SIMDSIZE )
3021 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
3022 size_t i( LOW ? j : 0UL );
3024 for( ; (i+2UL) <= iend; i+=2UL )
3035 SIMDType xmm1( (~C).load(i ,j) );
3036 SIMDType xmm2( (~C).load(i+1UL,j) );
3038 for(
size_t k=kbegin; k<kend; ++k ) {
3039 const SIMDType b1( B.load(k,j) );
3040 xmm1 +=
set( A(i ,k) ) * b1;
3041 xmm2 +=
set( A(i+1UL,k) ) * b1;
3044 (~C).store( i , j, xmm1 );
3045 (~C).store( i+1UL, j, xmm2 );
// Single-row variant of tier 6; note that this k loop runs up to K rather than kend.
3056 SIMDType xmm1( (~C).load(i,j) );
3058 for(
size_t k=kbegin; k<K; ++k ) {
3059 xmm1 +=
set( A(i,k) ) * B.load(k,j);
3062 (~C).store( i, j, xmm1 );
// Scalar loop for the columns in [jpos,N); only entered when 'remainder' is set.
3066 for( ; remainder && j<N; ++j )
3068 const size_t iend( UPP ? j+1UL : M );
3069 size_t i( LOW ? j : 0UL );
3071 for( ; (i+2UL) <= iend; i+=2UL )
3082 ElementType value1( (~C)(i ,j) );
// NOTE(review): the next line ends with a doubled ';' (a harmless empty statement).
3083 ElementType value2( (~C)(i+1UL,j) );;
3085 for(
size_t k=kbegin; k<kend; ++k ) {
3086 value1 += A(i ,k) * B(k,j);
3087 value2 += A(i+1UL,k) * B(k,j);
3090 (~C)(i ,j) = value1;
3091 (~C)(i+1UL,j) = value2;
// Single-row scalar tail.
3102 ElementType value( (~C)(i,j) );
3104 for(
size_t k=kbegin; k<K; ++k ) {
3105 value += A(i,k) * B(k,j);
// Vectorized add-assignment kernel fragment (C += A * B), SIMD along the row index i.
// For every (i,j) tile it loads the current values of C, accumulates A.load(i,k) * set( B(k,j) )
// over k, and stores the result back into C. The i loop is tiered: 8, 5, 4, 3, 2 and 1 SIMD
// vectors per step, followed by a scalar loop for the rows beyond ipos.
// NOTE(review): the function name, the definitions of remainder, kbegin and kend, and
// several loop headers are not visible in this listing.
3130 template<
typename MT3
3138 const size_t M( A.rows() );
3139 const size_t N( B.columns() );
3140 const size_t K( A.columns() );
// With 'remainder' set, ipos is M masked with -SIMDSIZE (M rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise all M rows are vectorized.
3144 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Tier 1: eight SIMD vectors per column and step; only used when neither LOW nor UPP is set.
3151 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3152 for(
size_t j=0UL; j<N; ++j )
3165 SIMDType xmm1( (~C).load(i ,j) );
3166 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3167 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3168 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3169 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3170 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3171 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3172 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3174 for(
size_t k=kbegin; k<kend; ++k ) {
3175 const SIMDType b1(
set( B(k,j) ) );
3176 xmm1 += A.load(i ,k) * b1;
3177 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3178 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3179 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3180 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3181 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3182 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3183 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3186 (~C).store( i , j, xmm1 );
3187 (~C).store( i+SIMDSIZE , j, xmm2 );
3188 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3189 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3190 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3191 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3192 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3193 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Tier 2: five SIMD vectors per step, two columns (j and j+1) at a time.
3198 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3202 for( ; (j+2UL) <= N; j+=2UL )
3215 SIMDType xmm1 ( (~C).load(i ,j ) );
3216 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3217 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3218 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3219 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3220 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3221 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3222 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3223 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3224 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3226 for(
size_t k=kbegin; k<kend; ++k ) {
3227 const SIMDType a1( A.load(i ,k) );
3228 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3229 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3230 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3231 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3232 const SIMDType b1(
set( B(k,j ) ) );
3233 const SIMDType b2(
set( B(k,j+1UL) ) );
3246 (~C).store( i , j , xmm1 );
3247 (~C).store( i+SIMDSIZE , j , xmm2 );
3248 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3249 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3250 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3251 (~C).store( i , j+1UL, xmm6 );
3252 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3253 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3254 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3255 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column variant of tier 2 (appears to handle the column left over by the two-column loop).
3267 SIMDType xmm1( (~C).load(i ,j) );
3268 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3269 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3270 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3271 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3273 for(
size_t k=kbegin; k<kend; ++k ) {
3274 const SIMDType b1(
set( B(k,j) ) );
3275 xmm1 += A.load(i ,k) * b1;
3276 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3277 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3278 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3279 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3282 (~C).store( i , j, xmm1 );
3283 (~C).store( i+SIMDSIZE , j, xmm2 );
3284 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3285 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3286 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Tier 3: four SIMD vectors per step, two columns at a time.
3290 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3294 for( ; (j+2UL) <= N; j+=2UL )
3307 SIMDType xmm1( (~C).load(i ,j ) );
3308 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3309 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3310 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3311 SIMDType xmm5( (~C).load(i ,j+1UL) );
3312 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3313 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3314 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3316 for(
size_t k=kbegin; k<kend; ++k ) {
3317 const SIMDType a1( A.load(i ,k) );
3318 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3319 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3320 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3321 const SIMDType b1(
set( B(k,j ) ) );
3322 const SIMDType b2(
set( B(k,j+1UL) ) );
3333 (~C).store( i , j , xmm1 );
3334 (~C).store( i+SIMDSIZE , j , xmm2 );
3335 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3336 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3337 (~C).store( i , j+1UL, xmm5 );
3338 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3339 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3340 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of tier 3.
3352 SIMDType xmm1( (~C).load(i ,j) );
3353 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3354 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3355 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3357 for(
size_t k=kbegin; k<kend; ++k ) {
3358 const SIMDType b1(
set( B(k,j) ) );
3359 xmm1 += A.load(i ,k) * b1;
3360 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3361 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3362 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3365 (~C).store( i , j, xmm1 );
3366 (~C).store( i+SIMDSIZE , j, xmm2 );
3367 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3368 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Tier 4: three SIMD vectors per step, two columns at a time.
3372 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3376 for( ; (j+2UL) <= N; j+=2UL )
3389 SIMDType xmm1( (~C).load(i ,j ) );
3390 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3391 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3392 SIMDType xmm4( (~C).load(i ,j+1UL) );
3393 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3394 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3396 for(
size_t k=kbegin; k<kend; ++k ) {
3397 const SIMDType a1( A.load(i ,k) );
3398 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3399 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3400 const SIMDType b1(
set( B(k,j ) ) );
3401 const SIMDType b2(
set( B(k,j+1UL) ) );
3410 (~C).store( i , j , xmm1 );
3411 (~C).store( i+SIMDSIZE , j , xmm2 );
3412 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3413 (~C).store( i , j+1UL, xmm4 );
3414 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3415 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column variant of tier 4.
3427 SIMDType xmm1( (~C).load(i ,j) );
3428 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3429 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3431 for(
size_t k=kbegin; k<kend; ++k ) {
3432 const SIMDType b1(
set( B(k,j) ) );
3433 xmm1 += A.load(i ,k) * b1;
3434 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3435 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3438 (~C).store( i , j, xmm1 );
3439 (~C).store( i+SIMDSIZE , j, xmm2 );
3440 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Tier 5: two SIMD vectors per step. Skipped only when both LOW and UPP are set; the column
// range is restricted via UPP (start at i) and LOW (end at min(i+SIMDSIZE*2,N)).
3444 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3446 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
3447 size_t j( UPP ? i : 0UL );
3449 for( ; (j+2UL) <= jend; j+=2UL )
3462 SIMDType xmm1( (~C).load(i ,j ) );
3463 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3464 SIMDType xmm3( (~C).load(i ,j+1UL) );
3465 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3467 for(
size_t k=kbegin; k<kend; ++k ) {
3468 const SIMDType a1( A.load(i ,k) );
3469 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3470 const SIMDType b1(
set( B(k,j ) ) );
3471 const SIMDType b2(
set( B(k,j+1UL) ) );
3478 (~C).store( i , j , xmm1 );
3479 (~C).store( i+SIMDSIZE, j , xmm2 );
3480 (~C).store( i , j+1UL, xmm3 );
3481 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Single-column variant of tier 5.
3493 SIMDType xmm1( (~C).load(i ,j) );
3494 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3496 for(
size_t k=kbegin; k<kend; ++k ) {
3497 const SIMDType b1(
set( B(k,j) ) );
3498 xmm1 += A.load(i ,k) * b1;
3499 xmm2 += A.load(i+SIMDSIZE,k) * b1;
3502 (~C).store( i , j, xmm1 );
3503 (~C).store( i+SIMDSIZE, j, xmm2 );
// Tier 6: one SIMD vector per step for the remaining vectorizable rows.
3507 for( ; i<ipos; i+=SIMDSIZE )
3509 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
3510 size_t j( UPP ? i : 0UL );
3512 for( ; (j+2UL) <= jend; j+=2UL )
3523 SIMDType xmm1( (~C).load(i,j ) );
3524 SIMDType xmm2( (~C).load(i,j+1UL) );
3526 for(
size_t k=kbegin; k<kend; ++k ) {
3527 const SIMDType a1( A.load(i,k) );
3528 xmm1 += a1 *
set( B(k,j ) );
3529 xmm2 += a1 *
set( B(k,j+1UL) );
3532 (~C).store( i, j , xmm1 );
3533 (~C).store( i, j+1UL, xmm2 );
// Single-column variant of tier 6; note that this k loop runs up to K rather than kend.
3544 SIMDType xmm1( (~C).load(i,j) );
3546 for(
size_t k=kbegin; k<K; ++k ) {
3547 xmm1 += A.load(i,k) *
set( B(k,j) );
3550 (~C).store( i, j, xmm1 );
// Scalar loop for the rows in [ipos,M); only entered when 'remainder' is set.
3554 for( ; remainder && i<M; ++i )
3556 const size_t jend( LOW ? i+1UL : N );
3557 size_t j( UPP ? i : 0UL );
3559 for( ; (j+2UL) <= jend; j+=2UL )
3570 ElementType value1( (~C)(i,j ) );
3571 ElementType value2( (~C)(i,j+1UL) );
3573 for(
size_t k=kbegin; k<kend; ++k ) {
3574 value1 += A(i,k) * B(k,j );
3575 value2 += A(i,k) * B(k,j+1UL);
3578 (~C)(i,j ) = value1;
3579 (~C)(i,j+1UL) = value2;
// Single-column scalar tail.
3590 ElementType value( (~C)(i,j) );
3592 for(
size_t k=kbegin; k<K; ++k ) {
3593 value += A(i,k) * B(k,j);
// Large-matrix add-assignment kernel overload that performs no work of its own and
// simply forwards to selectDefaultAddAssignKernel.
// NOTE(review): the enabling constraint for this overload is not visible in this listing.
//   C  target matrix, updated in place
//   A  left-hand side operand
//   B  right-hand side operand
3617 template<
typename MT3
3621 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3623 selectDefaultAddAssignKernel( C, A, B );
// Second selectLargeAddAssignKernel overload. Only its parameter list is visible in this
// listing; the enabling constraint and the body are missing.
// NOTE(review): presumably the vectorized large-matrix implementation - confirm against the full file.
//   C  target matrix
//   A  left-hand side operand
//   B  right-hand side operand
3643 template<
typename MT3
3647 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS add-assignment kernel overload that does not call BLAS itself and instead
// forwards to selectLargeAddAssignKernel.
// NOTE(review): presumably the fallback used when the BLAS path is not applicable; the
// enabling constraint is not visible in this listing.
//   C  target matrix, updated in place
//   A  left-hand side operand
//   B  right-hand side operand
3673 template<
typename MT3
3677 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3679 selectLargeAddAssignKernel( C, A, B );
// BLAS-based add-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// Two branches add a temporary 'tmp' to C via addAssign(); the last visible branch calls
// gemm( C, A, B, ET(1), ET(1) ).
// NOTE(review): the branch conditions and the construction of 'tmp' are not visible in
// this listing; the two ET(1) arguments are presumably the gemm scaling factors (alpha, beta).
3685 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3699 template<
typename MT3
3703 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3710 addAssign( C, tmp );
3715 addAssign( C, tmp );
3718 gemm( C, A, B, ET(1), ET(1) );
// Entry point fragment for the subtraction assignment of the multiplication expression
// (lhs -= rhs.lhs_ * rhs.rhs_).
// NOTE(review): the function name and signature are not visible in this listing;
// 'subAssign' is inferred from the selectSubAssignKernel call below.
3742 template<
typename MT
// Nothing to compute when the target has no rows or columns, or when the inner dimension is zero.
3751 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Both operands are bound to the LT/RT types through serial() before the kernel is selected.
3755 LT A(
serial( rhs.lhs_ ) );
3756 RT B(
serial( rhs.rhs_ ) );
3765 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the subtraction-assignment kernel: selectSmallSubAssignKernel when the visible
// condition holds, selectBlasSubAssignKernel otherwise.
// NOTE(review): only the tail of the condition is visible here (target element count
// below TDMATDMATMULT_THRESHOLD); its first part is missing from this listing.
//   C  target matrix, updated in place
//   A  left-hand side operand
//   B  right-hand side operand
3781 template<
typename MT3
3784 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3789 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3790 selectSmallSubAssignKernel( C, A, B );
3792 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel fragment (C -= A * B), iterating rows i of the
// target, then the inner index k, then a column range [jbegin,jend).
// NOTE(review): the signature, the kbegin/kend definitions and the leading parts of the
// jbegin/jend expressions are not visible in this listing.
3811 template<
typename MT3
3817 const size_t M( A.rows() );
3818 const size_t N( B.columns() );
3819 const size_t K( A.columns() );
3823 for(
size_t i=0UL; i<M; ++i )
3833 for(
size_t k=kbegin; k<kend; ++k )
// Tail of the jbegin expression: the first column depends on UPP, i and k.
3837 ?( UPP ?
max(i,k+1UL) : k+1UL )
3838 :( UPP ?
max(i,k) : k ) )
3839 :( UPP ? i : 0UL ) );
// Tail of the jend expression: the end column depends on LOW, i and k.
3842 ?( LOW ?
min(i+1UL,k) : k )
3843 :( LOW ?
min(i,k)+1UL : k+1UL ) )
3844 :( LOW ? i+1UL : N ) );
// With LOW or UPP set the column range can be empty; skip this k in that case.
3846 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
// jpos is jbegin plus the even part of the column count, enabling the two-column unrolling.
3849 const size_t jnum( jend - jbegin );
3850 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3852 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3853 (~C)(i,j ) -= A(i,k) * B(k,j );
3854 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover column at jpos; presumably guarded by a jpos < jend check that is not visible here.
3857 (~C)(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction-assignment kernel fragment (C -= A * B), iterating columns j of the
// target, then the inner index k, then a row range [ibegin,iend).
// NOTE(review): the signature, the kbegin/kend definitions and the leading parts of the
// ibegin/iend expressions are not visible in this listing.
3879 template<
typename MT3
3885 const size_t M( A.rows() );
3886 const size_t N( B.columns() );
3887 const size_t K( A.columns() );
3891 for(
size_t j=0UL; j<N; ++j )
3901 for(
size_t k=kbegin; k<kend; ++k )
// Tail of the ibegin expression: the first row depends on LOW, j and k.
3905 ?( LOW ?
max(j,k+1UL) : k+1UL )
3906 :( LOW ?
max(j,k) : k ) )
3907 :( LOW ? j : 0UL ) );
// Tail of the iend expression: the end row depends on UPP, j and k.
3910 ?( UPP ?
min(j+1UL,k) : k )
3911 :( UPP ?
min(j,k)+1UL : k+1UL ) )
3912 :( UPP ? j+1UL : M ) );
// With LOW or UPP set the row range can be empty; skip this k in that case.
3914 if( ( LOW || UPP ) && ( ibegin >= iend ) )
continue;
// ipos is ibegin plus the even part of the row count, enabling the two-row unrolling.
3917 const size_t inum( iend - ibegin );
3918 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3920 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3921 (~C)(i ,j) -= A(i ,k) * B(k,j);
3922 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover row at ipos; presumably guarded by an ipos < iend check that is not visible here.
3925 (~C)(ipos,j) -= A(ipos,k) * B(k,j);
// Blocked subtraction-assignment kernel fragment (C -= A * B). Per the EnableIf_ constraint
// below it is selected when MT4 is not diagonal and MT5 is diagonal, so only B(j,j) is read
// from B. The target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
// NOTE(review): the function name, parameter list and the jbegin/jpos definitions are
// not visible in this listing.
3947 template<
typename MT3
3950 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3953 constexpr
size_t block( BLOCK_SIZE );
3955 const size_t M( A.rows() );
3956 const size_t N( B.columns() );
// Tile loop over rows; iend is clamped to M for the last (partial) tile.
3958 for(
size_t ii=0UL; ii<M; ii+=block ) {
3959 const size_t iend(
min( M, ii+block ) );
// Tile loop over columns; jend is clamped to N for the last (partial) tile.
3960 for(
size_t jj=0UL; jj<N; jj+=block ) {
3961 const size_t jend(
min( N, jj+block ) );
// Element loops inside the current tile.
3962 for(
size_t i=ii; i<iend; ++i )
3971 for(
size_t j=jbegin; j<jpos; ++j ) {
3972 (~C)(i,j) -= A(i,j) * B(j,j);
// Subtraction-assignment kernel fragment (C -= A * B). Per the EnableIf_ constraint below
// it is selected when MT4 is not diagonal and MT5 is diagonal, so only B(j,j) is read from B.
// NOTE(review): the function name, parameter list and the ibegin/iend definitions are
// not visible in this listing.
3995 template<
typename MT3
3998 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4001 const size_t M( A.rows() );
4002 const size_t N( B.columns() );
// Outer loop over the columns j of the target.
4004 for(
size_t j=0UL; j<N; ++j )
// Masking with size_t(-2) clears the lowest bit: ipos marks the end of the part of
// [ibegin,iend) that can be processed two rows at a time.
4014 const size_t inum( iend - ibegin );
4015 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4017 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4018 (~C)(i ,j) -= A(i ,j) * B(j,j);
4019 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Leftover row at ipos; presumably guarded by an ipos < iend check that is not visible here.
4022 (~C)(ipos,j) -= A(ipos,j) * B(j,j);
// Subtraction-assignment kernel fragment (C -= A * B) that reads only the diagonal element
// A(i,i) of the left operand and scales row i of B with it.
// NOTE(review): presumably the overload for a diagonal A and a non-diagonal B; the
// signature and the jbegin/jend definitions are not visible in this listing.
4043 template<
typename MT3
4049 const size_t M( A.rows() );
4050 const size_t N( B.columns() );
// Outer loop over the rows i of the target.
4052 for(
size_t i=0UL; i<M; ++i )
// jpos is jbegin plus the even part of the column count, enabling the two-column unrolling.
4062 const size_t jnum( jend - jbegin );
4063 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4065 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4066 (~C)(i,j ) -= A(i,i) * B(i,j );
4067 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Leftover column at jpos; presumably guarded by a jpos < jend check that is not visible here.
4070 (~C)(i,jpos) -= A(i,i) * B(i,jpos);
// Blocked subtraction-assignment kernel fragment (C -= A * B) that reads only A(i,i) from
// the left operand. The target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
// NOTE(review): the signature and the ibegin/ipos definitions are not visible in this listing.
4091 template<
typename MT3
4097 constexpr
size_t block( BLOCK_SIZE );
4099 const size_t M( A.rows() );
4100 const size_t N( B.columns() );
// Tile loop over columns; jend is clamped to N for the last (partial) tile.
4102 for(
size_t jj=0UL; jj<N; jj+=block ) {
4103 const size_t jend(
min( N, jj+block ) );
// Tile loop over rows; iend is clamped to M for the last (partial) tile.
4104 for(
size_t ii=0UL; ii<M; ii+=block ) {
4105 const size_t iend(
min( M, ii+block ) );
// Element loops inside the current tile.
4106 for(
size_t j=jj; j<jend; ++j )
4115 for(
size_t i=ibegin; i<ipos; ++i ) {
4116 (~C)(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel that touches only the diagonal of the target:
// C(i,i) -= A(i,i) * B(i,i) for every row index i of A.
// NOTE(review): presumably the overload for two diagonal operands; the enabling
// constraint is not visible in this listing.
//   C  target matrix, updated in place
//   A  left-hand side operand
//   B  right-hand side operand
4139 template<
typename MT3
4143 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4145 for(
size_t i=0UL; i<A.rows(); ++i ) {
4146 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel overload that performs no work of its own
// and simply forwards to selectDefaultSubAssignKernel.
// NOTE(review): the enabling constraint for this overload is not visible in this listing.
//   C  target matrix, updated in place
//   A  left-hand side operand
//   B  right-hand side operand
4166 template<
typename MT3
4170 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4172 selectDefaultSubAssignKernel( C, A, B );
// Vectorized subtraction-assignment kernel fragment (C -= A * B), SIMD along the column index j.
// For every (i,j) tile it loads the current values of C, subtracts set( A(i,k) ) * B.load(k,j)
// over k, and stores the result back into C. The j loop is tiered: 8, 5, 4, 3, 2 and 1 SIMD
// vectors per step, followed by a scalar loop for the columns beyond jpos.
// NOTE(review): the function name, the definitions of remainder, kbegin and kend, and
// several loop headers are not visible in this listing.
4192 template<
typename MT3
4200 const size_t M( A.rows() );
4201 const size_t N( B.columns() );
4202 const size_t K( A.columns() );
// With 'remainder' set, jpos is N masked with -SIMDSIZE (N rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise all N columns are vectorized.
4206 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Tier 1: eight SIMD vectors per row and step; only used when neither LOW nor UPP is set.
4213 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
4214 for(
size_t i=0UL; i<M; ++i )
4227 SIMDType xmm1( (~C).load(i,j ) );
4228 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4229 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4230 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4231 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
4232 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
4233 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
4234 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
4236 for(
size_t k=kbegin; k<kend; ++k ) {
4237 const SIMDType a1(
set( A(i,k) ) );
4238 xmm1 -= a1 * B.load(k,j );
4239 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4240 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4241 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
4242 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
4243 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
4244 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
4245 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
4248 (~C).store( i, j , xmm1 );
4249 (~C).store( i, j+SIMDSIZE , xmm2 );
4250 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4251 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4252 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
4253 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
4254 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
4255 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Tier 2: five SIMD vectors per step, two rows (i and i+1) at a time.
4260 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
4264 for( ; (i+2UL) <= M; i+=2UL )
4277 SIMDType xmm1 ( (~C).load(i ,j ) );
4278 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
4279 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
4280 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
4281 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
4282 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
4283 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
4284 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4285 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
4286 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
4288 for(
size_t k=kbegin; k<kend; ++k ) {
4289 const SIMDType a1(
set( A(i ,k) ) );
4290 const SIMDType a2(
set( A(i+1UL,k) ) );
4291 const SIMDType b1( B.load(k,j ) );
4292 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4293 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4294 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4295 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
4308 (~C).store( i , j , xmm1 );
4309 (~C).store( i , j+SIMDSIZE , xmm2 );
4310 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4311 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
4312 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
4313 (~C).store( i+1UL, j , xmm6 );
4314 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
4315 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
4316 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
4317 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Single-row variant of tier 2 (appears to handle the row left over by the two-row loop).
4329 SIMDType xmm1( (~C).load(i,j ) );
4330 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4331 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4332 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4333 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
4335 for(
size_t k=kbegin; k<kend; ++k ) {
4336 const SIMDType a1(
set( A(i,k) ) );
4337 xmm1 -= a1 * B.load(k,j );
4338 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4339 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4340 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
4341 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
4344 (~C).store( i, j , xmm1 );
4345 (~C).store( i, j+SIMDSIZE , xmm2 );
4346 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4347 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4348 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// Tier 3: four SIMD vectors per step, two rows at a time.
4352 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4356 for( ; (i+2UL) <= M; i+=2UL )
4369 SIMDType xmm1( (~C).load(i ,j ) );
4370 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
4371 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
4372 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
4373 SIMDType xmm5( (~C).load(i+1UL,j ) );
4374 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
4375 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4376 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
4378 for(
size_t k=kbegin; k<kend; ++k ) {
4379 const SIMDType a1(
set( A(i ,k) ) );
4380 const SIMDType a2(
set( A(i+1UL,k) ) );
4381 const SIMDType b1( B.load(k,j ) );
4382 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4383 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4384 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4395 (~C).store( i , j , xmm1 );
4396 (~C).store( i , j+SIMDSIZE , xmm2 );
4397 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4398 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
4399 (~C).store( i+1UL, j , xmm5 );
4400 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
4401 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
4402 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row variant of tier 3.
4414 SIMDType xmm1( (~C).load(i,j ) );
4415 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4416 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4417 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4419 for(
size_t k=kbegin; k<kend; ++k ) {
4420 const SIMDType a1(
set( A(i,k) ) );
4421 xmm1 -= a1 * B.load(k,j );
4422 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4423 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4424 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
4427 (~C).store( i, j , xmm1 );
4428 (~C).store( i, j+SIMDSIZE , xmm2 );
4429 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4430 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Tier 4: three SIMD vectors per step, two rows at a time.
4434 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4438 for( ; (i+2UL) <= M; i+=2UL )
4451 SIMDType xmm1( (~C).load(i ,j ) );
4452 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
4453 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
4454 SIMDType xmm4( (~C).load(i+1UL,j ) );
4455 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
4456 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4458 for(
size_t k=kbegin; k<kend; ++k ) {
4459 const SIMDType a1(
set( A(i ,k) ) );
4460 const SIMDType a2(
set( A(i+1UL,k) ) );
4461 const SIMDType b1( B.load(k,j ) );
4462 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4463 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4472 (~C).store( i , j , xmm1 );
4473 (~C).store( i , j+SIMDSIZE , xmm2 );
4474 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4475 (~C).store( i+1UL, j , xmm4 );
4476 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
4477 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Single-row variant of tier 4.
4489 SIMDType xmm1( (~C).load(i,j ) );
4490 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4491 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4493 for(
size_t k=kbegin; k<kend; ++k ) {
4494 const SIMDType a1(
set( A(i,k) ) );
4495 xmm1 -= a1 * B.load(k,j );
4496 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4497 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4500 (~C).store( i, j , xmm1 );
4501 (~C).store( i, j+SIMDSIZE , xmm2 );
4502 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// Tier 5: two SIMD vectors per step. Skipped only when both LOW and UPP are set; the row
// range is restricted via LOW (start at j) and UPP (end at min(j+SIMDSIZE*2,M)).
4506 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4508 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
4509 size_t i( LOW ? j : 0UL );
4511 for( ; (i+2UL) <= iend; i+=2UL )
4524 SIMDType xmm1( (~C).load(i ,j ) );
4525 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
4526 SIMDType xmm3( (~C).load(i+1UL,j ) );
4527 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
4529 for(
size_t k=kbegin; k<kend; ++k ) {
4530 const SIMDType a1(
set( A(i ,k) ) );
4531 const SIMDType a2(
set( A(i+1UL,k) ) );
4532 const SIMDType b1( B.load(k,j ) );
4533 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4540 (~C).store( i , j , xmm1 );
4541 (~C).store( i , j+SIMDSIZE, xmm2 );
4542 (~C).store( i+1UL, j , xmm3 );
4543 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Single-row variant of tier 5.
4555 SIMDType xmm1( (~C).load(i,j ) );
4556 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
4558 for(
size_t k=kbegin; k<kend; ++k ) {
4559 const SIMDType a1(
set( A(i,k) ) );
4560 xmm1 -= a1 * B.load(k,j );
4561 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
4564 (~C).store( i, j , xmm1 );
4565 (~C).store( i, j+SIMDSIZE, xmm2 );
// Tier 6: one SIMD vector per step for the remaining vectorizable columns.
4569 for( ; j<jpos; j+=SIMDSIZE )
4571 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
4572 size_t i( LOW ? j : 0UL );
4574 for( ; (i+2UL) <= iend; i+=2UL )
4585 SIMDType xmm1( (~C).load(i ,j) );
4586 SIMDType xmm2( (~C).load(i+1UL,j) );
4588 for(
size_t k=kbegin; k<kend; ++k ) {
4589 const SIMDType b1( B.load(k,j) );
4590 xmm1 -=
set( A(i ,k) ) * b1;
4591 xmm2 -=
set( A(i+1UL,k) ) * b1;
4594 (~C).store( i , j, xmm1 );
4595 (~C).store( i+1UL, j, xmm2 );
// Single-row variant of tier 6; note that this k loop runs up to K rather than kend.
4606 SIMDType xmm1( (~C).load(i,j) );
4608 for(
size_t k=kbegin; k<K; ++k ) {
4609 xmm1 -=
set( A(i,k) ) * B.load(k,j);
4612 (~C).store( i, j, xmm1 );
// Scalar loop for the columns in [jpos,N); only entered when 'remainder' is set.
4616 for( ; remainder && j<N; ++j )
4618 const size_t iend( UPP ? j+1UL : M );
4619 size_t i( LOW ? j : 0UL );
4621 for( ; (i+2UL) <= iend; i+=2UL )
4632 ElementType value1( (~C)(i ,j) );
4633 ElementType value2( (~C)(i+1UL,j) );
4635 for(
size_t k=kbegin; k<kend; ++k ) {
4636 value1 -= A(i ,k) * B(k,j);
4637 value2 -= A(i+1UL,k) * B(k,j);
4640 (~C)(i ,j) = value1;
4641 (~C)(i+1UL,j) = value2;
// Single-row scalar tail.
4652 ElementType value( (~C)(i,j) );
4654 for(
size_t k=kbegin; k<K; ++k ) {
4655 value -= A(i,k) * B(k,j);
4680 template<
typename MT3
4688 const size_t M( A.rows() );
4689 const size_t N( B.columns() );
4690 const size_t K( A.columns() );
4694 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
4701 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4702 for(
size_t j=0UL; j<N; ++j )
4715 SIMDType xmm1( (~C).load(i ,j) );
4716 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4717 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4718 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4719 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
4720 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
4721 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
4722 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
4724 for(
size_t k=kbegin; k<kend; ++k ) {
4725 const SIMDType b1(
set( B(k,j) ) );
4726 xmm1 -= A.load(i ,k) * b1;
4727 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4728 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4729 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
4730 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
4731 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
4732 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
4733 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
4736 (~C).store( i , j, xmm1 );
4737 (~C).store( i+SIMDSIZE , j, xmm2 );
4738 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4739 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4740 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
4741 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
4742 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
4743 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
4748 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
4752 for( ; (j+2UL) <= N; j+=2UL )
4765 SIMDType xmm1 ( (~C).load(i ,j ) );
4766 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
4767 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
4768 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
4769 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
4770 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
4771 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
4772 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4773 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
4774 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
4776 for(
size_t k=kbegin; k<kend; ++k ) {
4777 const SIMDType a1( A.load(i ,k) );
4778 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4779 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4780 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4781 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
4782 const SIMDType b1(
set( B(k,j ) ) );
4783 const SIMDType b2(
set( B(k,j+1UL) ) );
4796 (~C).store( i , j , xmm1 );
4797 (~C).store( i+SIMDSIZE , j , xmm2 );
4798 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4799 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
4800 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
4801 (~C).store( i , j+1UL, xmm6 );
4802 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
4803 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
4804 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
4805 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
4817 SIMDType xmm1( (~C).load(i ,j) );
4818 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4819 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4820 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4821 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
4823 for(
size_t k=kbegin; k<kend; ++k ) {
4824 const SIMDType b1(
set( B(k,j) ) );
4825 xmm1 -= A.load(i ,k) * b1;
4826 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4827 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4828 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
4829 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
4832 (~C).store( i , j, xmm1 );
4833 (~C).store( i+SIMDSIZE , j, xmm2 );
4834 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4835 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4836 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
4840 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4844 for( ; (j+2UL) <= N; j+=2UL )
4857 SIMDType xmm1( (~C).load(i ,j ) );
4858 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4859 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4860 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
4861 SIMDType xmm5( (~C).load(i ,j+1UL) );
4862 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
4863 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4864 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
4866 for(
size_t k=kbegin; k<kend; ++k ) {
4867 const SIMDType a1( A.load(i ,k) );
4868 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4869 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4870 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4871 const SIMDType b1(
set( B(k,j ) ) );
4872 const SIMDType b2(
set( B(k,j+1UL) ) );
4883 (~C).store( i , j , xmm1 );
4884 (~C).store( i+SIMDSIZE , j , xmm2 );
4885 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4886 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
4887 (~C).store( i , j+1UL, xmm5 );
4888 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
4889 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
4890 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
4902 SIMDType xmm1( (~C).load(i ,j) );
4903 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4904 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4905 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4907 for(
size_t k=kbegin; k<kend; ++k ) {
4908 const SIMDType b1(
set( B(k,j) ) );
4909 xmm1 -= A.load(i ,k) * b1;
4910 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4911 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4912 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
4915 (~C).store( i , j, xmm1 );
4916 (~C).store( i+SIMDSIZE , j, xmm2 );
4917 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4918 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4922 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4926 for( ; (j+2UL) <= N; j+=2UL )
4939 SIMDType xmm1( (~C).load(i ,j ) );
4940 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4941 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4942 SIMDType xmm4( (~C).load(i ,j+1UL) );
4943 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
4944 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4946 for(
size_t k=kbegin; k<kend; ++k ) {
4947 const SIMDType a1( A.load(i ,k) );
4948 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4949 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4950 const SIMDType b1(
set( B(k,j ) ) );
4951 const SIMDType b2(
set( B(k,j+1UL) ) );
4960 (~C).store( i , j , xmm1 );
4961 (~C).store( i+SIMDSIZE , j , xmm2 );
4962 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4963 (~C).store( i , j+1UL, xmm4 );
4964 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
4965 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4977 SIMDType xmm1( (~C).load(i ,j) );
4978 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4979 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4981 for(
size_t k=kbegin; k<kend; ++k ) {
4982 const SIMDType b1(
set( B(k,j) ) );
4983 xmm1 -= A.load(i ,k) * b1;
4984 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4985 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4988 (~C).store( i , j, xmm1 );
4989 (~C).store( i+SIMDSIZE , j, xmm2 );
4990 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4994 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4996 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
4997 size_t j( UPP ? i : 0UL );
4999 for( ; (j+2UL) <= jend; j+=2UL )
5012 SIMDType xmm1( (~C).load(i ,j ) );
5013 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
5014 SIMDType xmm3( (~C).load(i ,j+1UL) );
5015 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
5017 for(
size_t k=kbegin; k<kend; ++k ) {
5018 const SIMDType a1( A.load(i ,k) );
5019 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5020 const SIMDType b1(
set( B(k,j ) ) );
5021 const SIMDType b2(
set( B(k,j+1UL) ) );
5028 (~C).store( i , j , xmm1 );
5029 (~C).store( i+SIMDSIZE, j , xmm2 );
5030 (~C).store( i , j+1UL, xmm3 );
5031 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
5043 SIMDType xmm1( (~C).load(i ,j) );
5044 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
5046 for(
size_t k=kbegin; k<kend; ++k ) {
5047 const SIMDType b1(
set( B(k,j) ) );
5048 xmm1 -= A.load(i ,k) * b1;
5049 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
5052 (~C).store( i , j, xmm1 );
5053 (~C).store( i+SIMDSIZE, j, xmm2 );
5057 for( ; i<ipos; i+=SIMDSIZE )
5059 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
5060 size_t j( UPP ? i : 0UL );
5062 for( ; (j+2UL) <= jend; j+=2UL )
5073 SIMDType xmm1( (~C).load(i,j ) );
5074 SIMDType xmm2( (~C).load(i,j+1UL) );
5076 for(
size_t k=kbegin; k<kend; ++k ) {
5077 const SIMDType a1( A.load(i,k) );
5078 xmm1 -= a1 *
set( B(k,j ) );
5079 xmm2 -= a1 *
set( B(k,j+1UL) );
5082 (~C).store( i, j , xmm1 );
5083 (~C).store( i, j+1UL, xmm2 );
5094 SIMDType xmm1( (~C).load(i,j) );
5096 for(
size_t k=kbegin; k<K; ++k ) {
5097 xmm1 -= A.load(i,k) *
set( B(k,j) );
5100 (~C).store( i, j, xmm1 );
5104 for( ; remainder && i<M; ++i )
5106 const size_t jend( LOW ? i+1UL : N );
5107 size_t j( UPP ? i : 0UL );
5109 for( ; (j+2UL) <= jend; j+=2UL )
5120 ElementType value1( (~C)(i,j ) );
5121 ElementType value2( (~C)(i,j+1UL) );
5123 for(
size_t k=kbegin; k<kend; ++k ) {
5124 value1 -= A(i,k) * B(k,j );
5125 value2 -= A(i,k) * B(k,j+1UL);
5128 (~C)(i,j ) = value1;
5129 (~C)(i,j+1UL) = value2;
5140 ElementType value( (~C)(i,j) );
5142 for(
size_t k=kbegin; k<K; ++k ) {
5143 value -= A(i,k) * B(k,j);
// Overload of selectLargeSubAssignKernel that does no work of its own: it
// forwards C, A and B unchanged to selectDefaultSubAssignKernel.
// NOTE(review): the remaining template parameters, the return type / enabling
// condition and the braces sit on lines that are missing from this listing -
// confirm which operand types select this overload.
5167 template<
typename MT3
5171 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5173 selectDefaultSubAssignKernel( C, A, B );
// Second overload of selectLargeSubAssignKernel with the same (C, A, B)
// parameter list as the forwarding overload before it.
// NOTE(review): its enabling condition and its body are not present in this
// listing, so nothing about its computation can be stated from here.
5193 template<
typename MT3
5197 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Overload of selectBlasSubAssignKernel that simply delegates to
// selectLargeSubAssignKernel with the same three arguments.
// NOTE(review): presumably the fallback used when the BLAS kernel is not
// applicable - the enabling condition is on lines missing from this listing.
5223 template<
typename MT3
5227 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5229 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed overload of selectBlasSubAssignKernel. It is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// non-zero (see the #if on the next line).
// Three outcomes are visible: two branches finish with subAssign( C, tmp ) and
// one calls gemm directly on C.
// NOTE(review): the branch conditions and the construction of 'tmp' are on
// lines missing from this listing - confirm against the full source.
5235 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5249 template<
typename MT3
5253 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5260 subAssign( C, tmp );
5265 subAssign( C, tmp );
// Scaling factors are ET(-1) and ET(1); presumably this yields C = C - A*B
// following the usual gemm alpha/beta convention - TODO confirm.
5268 gemm( C, A, B, ET(-1), ET(1) );
5303 template<
typename MT
5313 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
5316 else if( rhs.lhs_.columns() == 0UL ) {
5352 template<
typename MT
5371 const ForwardFunctor fwd;
5373 const TmpType tmp( rhs );
5395 template<
typename MT
5405 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5444 template<
typename MT
5454 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5514 template<
typename MT1
5522 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true >
// Effective structure flags derived from the template flags SF, HF, LF and UF
// (presumably symmetric / Hermitian / lower / upper - confirm at the class
// template declaration):
//  - SYM  : SF is set and none of HF, LF, UF is set.
//  - HERM : HF is set and neither LF nor UF is set.
//  - LOW  : LF is set, or UF is combined with SF or HF.
//  - UPP  : UF is set, or LF is combined with SF or HF.
// Note that combining SF or HF with LF (or with UF) makes both LOW and UPP
// true, and then SYM and HERM are both false.
5553 SYM = ( SF && !( HF || LF || UF ) ),
5554 HERM = ( HF && !( LF || UF ) ),
5555 LOW = ( LF || ( ( SF || HF ) && UF ) ),
5556 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time helper: 'value' is true when either evaluateLeft or
// evaluateRight (both defined outside this fragment) is true.
// The template parameters T1, T2 and T3 do not appear in the visible
// expression.
5565 template<
typename T1,
typename T2,
typename T3 >
5566 struct IsEvaluationRequired {
5567 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time helper parameterized on four types (T1..T4).
// The only visible part of its condition requires T1, T2 and T3 to all report
// simdEnabled; the trailing '&&' shows that further conditions follow.
// NOTE(review): the rest of the condition and the use of T4 are on lines
// missing from this listing.
5575 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5576 struct UseBlasKernel {
5583 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper parameterized on four types (T1..T4).
// The visible part of its condition requires T1, T2 and T3 to all report
// simdEnabled; the trailing '&&' shows that further conditions follow.
// NOTE(review): the remaining conditions and the use of T4 are on lines
// missing from this listing.
5597 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5598 struct UseVectorizedDefaultKernel {
5603 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5627 ,
Noop > > > > ForwardFunctor;
5657 MT1::simdEnabled && MT2::simdEnabled &&
// Compile-time flag: true only when neither operand requires evaluation
// (evaluateLeft and evaluateRight are both false) and both operand types MT1
// and MT2 report smpAssignable themselves.
5663 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
5664 !evaluateRight && MT2::smpAssignable };
// Element access: returns element (i,j) of the wrapped expression matrix_
// multiplied by scalar_.
// No bounds check is visible in this fragment; the checked variant is at().
// NOTE(review): the declared return type here is ResultType, whereas at()
// (which returns the result of this operator) is declared with ReturnType -
// confirm which of the two is intended for a single element.
5691 inline ResultType
operator()(
size_t i,
size_t j )
const {
5694 return matrix_(i,j) * scalar_;
// Checked element access: compares i against matrix_.rows() and j against
// matrix_.columns(), then delegates to operator()(i,j).
// NOTE(review): the bodies of the two range checks are on lines missing from
// this listing; presumably they raise an out-of-range error - confirm.
5706 inline ReturnType
at(
size_t i,
size_t j )
const {
5707 if( i >= matrix_.rows() ) {
5710 if( j >= matrix_.columns() ) {
5713 return (*
this)(i,j);
// Returns the number of rows, taken directly from the wrapped expression
// matrix_.
5722 inline size_t rows()
const {
5723 return matrix_.rows();
// Returns the number of columns, taken directly from the wrapped expression
// matrix_.
5732 inline size_t columns()
const {
5733 return matrix_.columns();
// Reports whether this expression can alias with the given address.
// The answer is delegated entirely to matrix_.canAlias(); scalar_ is not
// consulted.
5763 template<
typename T >
5764 inline bool canAlias(
const T* alias )
const {
5765 return matrix_.canAlias( alias );
// Reports whether this expression is aliased with the given address.
// The answer is delegated entirely to matrix_.isAliased(); scalar_ is not
// consulted.
5775 template<
typename T >
5776 inline bool isAliased(
const T* alias )
const {
5777 return matrix_.isAliased( alias );
5787 return matrix_.isAligned();
5798 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
5799 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// Operands of the scaled expression: operator() computes
// matrix_(i,j) * scalar_, and rows()/columns() are taken from matrix_.
5805 LeftOperand matrix_;
5806 RightOperand scalar_;
5821 template<
typename MT
5833 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
5836 else if( left.columns() == 0UL ) {
5851 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment: picks between
// selectSmallAssignKernel and selectBlasAssignKernel, passing C, A, B and
// scalar through unchanged.
// The visible part of the test compares the target's element count
// (C.rows() * C.columns()) against TDMATDMATMULT_THRESHOLD.
// NOTE(review): the 'if'/'else' keywords and the first part of the condition
// are on lines missing from this listing; presumably targets below the
// threshold take the small kernel and everything else the BLAS kernel.
5866 template<
typename MT3
5870 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5875 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
5876 selectSmallAssignKernel( C, A, B, scalar );
5878 selectBlasAssignKernel( C, A, B, scalar );
5896 template<
typename MT3
5903 const size_t M( A.rows() );
5904 const size_t N( B.columns() );
5905 const size_t K( A.columns() );
5909 for(
size_t i=0UL; i<M; ++i )
5920 for(
size_t j=0UL; j<N; ++j ) {
5929 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
5930 :( UPP ?
max(i,kbegin) : kbegin ) )
5931 :( UPP ? i : 0UL ) );
5934 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
5935 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
5936 :( LOW ? i+1UL : N ) );
5939 for(
size_t j=0UL; j<jbegin; ++j ) {
5944 reset( (~C)(i,0UL) );
5946 for(
size_t j=jbegin; j<jend; ++j ) {
5947 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
5950 for(
size_t j=jend; j<N; ++j ) {
5955 reset( (~C)(i,N-1UL) );
5959 for(
size_t k=kbegin+1UL; k<kend; ++k )
5963 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
5964 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
5965 :( SYM || HERM || UPP ? i : 0UL ) );
5968 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
5969 :( LOW ?
min(i+1UL,k) : k ) )
5970 :( LOW ? i+1UL : N ) );
5972 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5975 for(
size_t j=jbegin; j<jend; ++j ) {
5976 (~C)(i,j) += A(i,k) * B(k,j);
5979 (~C)(i,jend) = A(i,k) * B(k,jend);
5986 :( SYM || HERM || UPP ? i : 0UL ) );
5989 :( LOW ? i+1UL : N ) );
5991 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5994 for(
size_t j=jbegin; j<jend; ++j ) {
5995 (~C)(i,j) *= scalar;
6001 for(
size_t i=1UL; i<M; ++i ) {
6002 for(
size_t j=0UL; j<i; ++j ) {
6003 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
6024 template<
typename MT3
6031 const size_t M( A.rows() );
6032 const size_t N( B.columns() );
6033 const size_t K( A.columns() );
6037 for(
size_t j=0UL; j<N; ++j )
6048 for(
size_t i=0UL; i<M; ++i ) {
6057 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
6058 :( LOW ?
max(j,kbegin) : kbegin ) )
6059 :( LOW ? j : 0UL ) );
6062 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
6063 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
6064 :( UPP ? j+1UL : M ) );
6067 for(
size_t i=0UL; i<ibegin; ++i ) {
6072 reset( (~C)(0UL,j) );
6074 for(
size_t i=ibegin; i<iend; ++i ) {
6075 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6078 for(
size_t i=iend; i<M; ++i ) {
6083 reset( (~C)(M-1UL,j) );
6087 for(
size_t k=kbegin+1UL; k<kend; ++k )
6091 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
6092 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
6093 :( SYM || HERM || LOW ? j : 0UL ) );
6096 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
6097 :( UPP ?
min(j+1UL,k) : k ) )
6098 :( UPP ? j+1UL : M ) );
6100 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
6103 for(
size_t i=ibegin; i<iend; ++i ) {
6104 (~C)(i,j) += A(i,k) * B(k,j);
6107 (~C)(iend,j) = A(iend,k) * B(k,j);
6114 :( SYM || HERM || LOW ? j : 0UL ) );
6117 :( UPP ? j+1UL : M ) );
6119 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
6122 for(
size_t i=ibegin; i<iend; ++i ) {
6123 (~C)(i,j) *= scalar;
6129 for(
size_t j=1UL; j<N; ++j ) {
6130 for(
size_t i=0UL; i<j; ++i ) {
6131 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
6152 template<
typename MT3
6156 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6159 constexpr
size_t block( BLOCK_SIZE );
6161 const size_t M( A.rows() );
6162 const size_t N( B.columns() );
6164 for(
size_t ii=0UL; ii<M; ii+=block ) {
6165 const size_t iend(
min( M, ii+block ) );
6166 for(
size_t jj=0UL; jj<N; jj+=block ) {
6167 const size_t jend(
min( N, jj+block ) );
6168 for(
size_t i=ii; i<iend; ++i )
6178 for(
size_t j=jj; j<jbegin; ++j ) {
6182 for(
size_t j=jbegin; j<jpos; ++j ) {
6183 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6186 for(
size_t j=jpos; j<jend; ++j ) {
6210 template<
typename MT3
6214 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6217 const size_t M( A.rows() );
6218 const size_t N( B.columns() );
6220 for(
size_t j=0UL; j<N; ++j )
6231 for(
size_t i=0UL; i<ibegin; ++i ) {
6235 for(
size_t i=ibegin; i<iend; ++i ) {
6236 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6239 for(
size_t i=iend; i<M; ++i ) {
6261 template<
typename MT3
6268 const size_t M( A.rows() );
6269 const size_t N( B.columns() );
6271 for(
size_t i=0UL; i<M; ++i )
6282 for(
size_t j=0UL; j<jbegin; ++j ) {
6286 for(
size_t j=jbegin; j<jend; ++j ) {
6287 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6290 for(
size_t j=jend; j<N; ++j ) {
6312 template<
typename MT3
6319 constexpr
size_t block( BLOCK_SIZE );
6321 const size_t M( A.rows() );
6322 const size_t N( B.columns() );
6324 for(
size_t jj=0UL; jj<N; jj+=block ) {
6325 const size_t jend(
min( N, jj+block ) );
6326 for(
size_t ii=0UL; ii<M; ii+=block ) {
6327 const size_t iend(
min( M, ii+block ) );
6328 for(
size_t j=jj; j<jend; ++j )
6338 for(
size_t i=ii; i<ibegin; ++i ) {
6342 for(
size_t i=ibegin; i<ipos; ++i ) {
6343 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6346 for(
size_t i=ipos; i<iend; ++i ) {
// Overload of selectDefaultAssignKernel whose visible loop writes only the
// diagonal of C: for every row index i of A it stores
// A(i,i) * B(i,i) * scalar into C(i,i).
// NOTE(review): presumably this is the overload for two diagonal operands;
// the enabling condition and any statements before the loop are on lines
// missing from this listing. Unlike the neighbouring kernels, C is indexed
// directly here rather than through (~C).
6370 template<
typename MT3
6375 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6379 for(
size_t i=0UL; i<A.rows(); ++i ) {
6380 C(i,i) = A(i,i) * B(i,i) * scalar;
// Overload of selectSmallAssignKernel that does no work of its own: it
// forwards C, A, B and scalar to selectDefaultAssignKernel.
// NOTE(review): the enabling condition is on lines missing from this listing.
6399 template<
typename MT3
6404 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6406 selectDefaultAssignKernel( C, A, B, scalar );
6425 template<
typename MT3
6434 const size_t M( A.rows() );
6435 const size_t N( B.columns() );
6436 const size_t K( A.columns() );
6440 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
6443 const SIMDType factor(
set( scalar ) );
6445 if( LOW && UPP && N > SIMDSIZE*3UL ) {
6454 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6455 for(
size_t i=0UL; i<M; ++i )
6468 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6470 for(
size_t k=kbegin; k<kend; ++k ) {
6471 const SIMDType a1(
set( A(i,k) ) );
6472 xmm1 += a1 * B.load(k,j );
6473 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6474 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6475 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6476 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6477 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6478 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6479 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6482 (~C).store( i, j , xmm1 * factor );
6483 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6484 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6485 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6486 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
6487 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
6488 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
6489 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
6494 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6498 for( ; (i+2UL) <= M; i+=2UL )
6511 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6513 for(
size_t k=kbegin; k<kend; ++k ) {
6514 const SIMDType a1(
set( A(i ,k) ) );
6515 const SIMDType a2(
set( A(i+1UL,k) ) );
6516 const SIMDType b1( B.load(k,j ) );
6517 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6518 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6519 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6520 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6533 (~C).store( i , j , xmm1 * factor );
6534 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6535 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6536 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
6537 (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
6538 (~C).store( i+1UL, j , xmm6 * factor );
6539 (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
6540 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
6541 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
6542 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
6554 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6556 for(
size_t k=kbegin; k<kend; ++k ) {
6557 const SIMDType a1(
set( A(i,k) ) );
6558 xmm1 += a1 * B.load(k,j );
6559 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6560 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6561 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6562 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6565 (~C).store( i, j , xmm1 * factor );
6566 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6567 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6568 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6569 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
6573 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6575 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
6576 size_t i( LOW ? j : 0UL );
6578 for( ; (i+2UL) <= iend; i+=2UL )
6591 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6593 for(
size_t k=kbegin; k<kend; ++k ) {
6594 const SIMDType a1(
set( A(i ,k) ) );
6595 const SIMDType a2(
set( A(i+1UL,k) ) );
6596 const SIMDType b1( B.load(k,j ) );
6597 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6598 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6599 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6610 (~C).store( i , j , xmm1 * factor );
6611 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6612 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6613 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
6614 (~C).store( i+1UL, j , xmm5 * factor );
6615 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
6616 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
6617 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
6629 SIMDType xmm1, xmm2, xmm3, xmm4;
6631 for(
size_t k=kbegin; k<kend; ++k ) {
6632 const SIMDType a1(
set( A(i,k) ) );
6633 xmm1 += a1 * B.load(k,j );
6634 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6635 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6636 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6639 (~C).store( i, j , xmm1 * factor );
6640 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6641 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6642 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6646 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6648 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
6649 size_t i( LOW ? j : 0UL );
6651 for( ; (i+2UL) <= iend; i+=2UL )
6664 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6666 for(
size_t k=kbegin; k<kend; ++k ) {
6667 const SIMDType a1(
set( A(i ,k) ) );
6668 const SIMDType a2(
set( A(i+1UL,k) ) );
6669 const SIMDType b1( B.load(k,j ) );
6670 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6671 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6680 (~C).store( i , j , xmm1 * factor );
6681 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6682 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6683 (~C).store( i+1UL, j , xmm4 * factor );
6684 (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
6685 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
6697 SIMDType xmm1, xmm2, xmm3;
6699 for(
size_t k=kbegin; k<kend; ++k ) {
6700 const SIMDType a1(
set( A(i,k) ) );
6701 xmm1 += a1 * B.load(k,j );
6702 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6703 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6706 (~C).store( i, j , xmm1 * factor );
6707 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6708 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6712 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6714 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
6715 size_t i( LOW ? j : 0UL );
6717 for( ; (i+2UL) <= iend; i+=2UL )
6730 SIMDType xmm1, xmm2, xmm3, xmm4;
6732 for(
size_t k=kbegin; k<kend; ++k ) {
6733 const SIMDType a1(
set( A(i ,k) ) );
6734 const SIMDType a2(
set( A(i+1UL,k) ) );
6735 const SIMDType b1( B.load(k,j ) );
6736 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6743 (~C).store( i , j , xmm1 * factor );
6744 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
6745 (~C).store( i+1UL, j , xmm3 * factor );
6746 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
6758 SIMDType xmm1, xmm2;
6760 for(
size_t k=kbegin; k<kend; ++k ) {
6761 const SIMDType a1(
set( A(i,k) ) );
6762 xmm1 += a1 * B.load(k,j );
6763 xmm2 += a1 * B.load(k,j+SIMDSIZE);
6766 (~C).store( i, j , xmm1 * factor );
6767 (~C).store( i, j+SIMDSIZE, xmm2 * factor );
6771 for( ; j<jpos; j+=SIMDSIZE )
6773 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
6774 size_t i( LOW ? j : 0UL );
6776 for( ; (i+2UL) <= iend; i+=2UL )
6787 SIMDType xmm1, xmm2;
6789 for(
size_t k=kbegin; k<kend; ++k ) {
6790 const SIMDType b1( B.load(k,j) );
6791 xmm1 +=
set( A(i ,k) ) * b1;
6792 xmm2 +=
set( A(i+1UL,k) ) * b1;
6795 (~C).store( i , j, xmm1 * factor );
6796 (~C).store( i+1UL, j, xmm2 * factor );
6809 for(
size_t k=kbegin; k<K; ++k ) {
6810 xmm1 +=
set( A(i,k) ) * B.load(k,j);
6813 (~C).store( i, j, xmm1 * factor );
6817 for( ; remainder && j<N; ++j )
6819 size_t i( LOW && UPP ? j : 0UL );
6821 for( ; (i+2UL) <= M; i+=2UL )
6835 for(
size_t k=kbegin; k<kend; ++k ) {
6836 value1 += A(i ,k) * B(k,j);
6837 value2 += A(i+1UL,k) * B(k,j);
6840 (~C)(i ,j) = value1 * scalar;
6841 (~C)(i+1UL,j) = value2 * scalar;
6854 for(
size_t k=kbegin; k<K; ++k ) {
6855 value += A(i,k) * B(k,j);
6858 (~C)(i,j) = value * scalar;
6863 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
6864 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6865 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6866 for(
size_t j=0UL; j<jend; ++j ) {
6867 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
6871 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
6872 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6873 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6874 for(
size_t i=0UL; i<iend; ++i ) {
6879 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
6880 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6881 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6882 for(
size_t j=0UL; j<jend; ++j ) {
6905 template<
typename MT3
6914 const size_t M( A.rows() );
6915 const size_t N( B.columns() );
6916 const size_t K( A.columns() );
6920 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
6923 const SIMDType factor(
set( scalar ) );
6925 if( LOW && UPP && M > SIMDSIZE*3UL ) {
6934 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6935 for(
size_t j=0UL; j<N; ++j )
6948 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6950 for(
size_t k=kbegin; k<kend; ++k ) {
6951 const SIMDType b1(
set( B(k,j) ) );
6952 xmm1 += A.load(i ,k) * b1;
6953 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6954 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6955 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6956 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6957 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6958 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6959 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6962 (~C).store( i , j, xmm1 * factor );
6963 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
6964 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
6965 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
6966 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
6967 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
6968 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
6969 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
6974 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6978 for( ; (j+2UL) <= N; j+=2UL )
6991 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6993 for(
size_t k=kbegin; k<kend; ++k ) {
6994 const SIMDType a1( A.load(i ,k) );
6995 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6996 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6997 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6998 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6999 const SIMDType b1(
set( B(k,j ) ) );
7000 const SIMDType b2(
set( B(k,j+1UL) ) );
7013 (~C).store( i , j , xmm1 * factor );
7014 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7015 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7016 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
7017 (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
7018 (~C).store( i , j+1UL, xmm6 * factor );
7019 (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
7020 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
7021 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
7022 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
7034 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7036 for(
size_t k=kbegin; k<kend; ++k ) {
7037 const SIMDType b1(
set( B(k,j) ) );
7038 xmm1 += A.load(i ,k) * b1;
7039 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7040 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7041 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7042 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7045 (~C).store( i , j, xmm1 * factor );
7046 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7047 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7048 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7049 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
7053 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7055 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
7056 size_t j( UPP ? i : 0UL );
7058 for( ; (j+2UL) <= jend; j+=2UL )
7071 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7073 for(
size_t k=kbegin; k<kend; ++k ) {
7074 const SIMDType a1( A.load(i ,k) );
7075 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7076 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7077 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7078 const SIMDType b1(
set( B(k,j ) ) );
7079 const SIMDType b2(
set( B(k,j+1UL) ) );
7090 (~C).store( i , j , xmm1 * factor );
7091 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7092 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7093 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
7094 (~C).store( i , j+1UL, xmm5 * factor );
7095 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
7096 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
7097 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
7109 SIMDType xmm1, xmm2, xmm3, xmm4;
7111 for(
size_t k=kbegin; k<kend; ++k ) {
7112 const SIMDType b1(
set( B(k,j) ) );
7113 xmm1 += A.load(i ,k) * b1;
7114 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7115 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7116 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7119 (~C).store( i , j, xmm1 * factor );
7120 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7121 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7122 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7126 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7128 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
7129 size_t j( UPP ? i : 0UL );
7131 for( ; (j+2UL) <= jend; j+=2UL )
7144 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7146 for(
size_t k=kbegin; k<kend; ++k ) {
7147 const SIMDType a1( A.load(i ,k) );
7148 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7149 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7150 const SIMDType b1(
set( B(k,j ) ) );
7151 const SIMDType b2(
set( B(k,j+1UL) ) );
7160 (~C).store( i , j , xmm1 * factor );
7161 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7162 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7163 (~C).store( i , j+1UL, xmm4 * factor );
7164 (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
7165 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
7177 SIMDType xmm1, xmm2, xmm3;
7179 for(
size_t k=kbegin; k<kend; ++k ) {
7180 const SIMDType b1(
set( B(k,j) ) );
7181 xmm1 += A.load(i ,k) * b1;
7182 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7183 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7186 (~C).store( i , j, xmm1 * factor );
7187 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7188 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7192 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7194 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
7195 size_t j( UPP ? i : 0UL );
7197 for( ; (j+2UL) <= jend; j+=2UL )
7210 SIMDType xmm1, xmm2, xmm3, xmm4;
7212 for(
size_t k=kbegin; k<kend; ++k ) {
7213 const SIMDType a1( A.load(i ,k) );
7214 const SIMDType a2( A.load(i+SIMDSIZE,k) );
7215 const SIMDType b1(
set( B(k,j ) ) );
7216 const SIMDType b2(
set( B(k,j+1UL) ) );
7223 (~C).store( i , j , xmm1 * factor );
7224 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
7225 (~C).store( i , j+1UL, xmm3 * factor );
7226 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
7238 SIMDType xmm1, xmm2;
7240 for(
size_t k=kbegin; k<kend; ++k ) {
7241 const SIMDType b1(
set( B(k,j) ) );
7242 xmm1 += A.load(i ,k) * b1;
7243 xmm2 += A.load(i+SIMDSIZE,k) * b1;
7246 (~C).store( i , j, xmm1 * factor );
7247 (~C).store( i+SIMDSIZE, j, xmm2 * factor );
7251 for( ; i<ipos; i+=SIMDSIZE )
7253 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
7254 size_t j( UPP ? i : 0UL );
7256 for( ; (j+2UL) <= jend; j+=2UL )
7267 SIMDType xmm1, xmm2;
7269 for(
size_t k=kbegin; k<kend; ++k ) {
7270 const SIMDType a1( A.load(i,k) );
7271 xmm1 += a1 *
set( B(k,j ) );
7272 xmm2 += a1 *
set( B(k,j+1UL) );
7275 (~C).store( i, j , xmm1 * factor );
7276 (~C).store( i, j+1UL, xmm2 * factor );
7289 for(
size_t k=kbegin; k<K; ++k ) {
7290 xmm1 += A.load(i,k) *
set( B(k,j) );
7293 (~C).store( i, j, xmm1 * factor );
7297 for( ; remainder && i<M; ++i )
7299 size_t j( LOW && UPP ? i : 0UL );
7301 for( ; (j+2UL) <= N; j+=2UL )
7315 for(
size_t k=kbegin; k<kend; ++k ) {
7316 value1 += A(i,k) * B(k,j );
7317 value2 += A(i,k) * B(k,j+1UL);
7320 (~C)(i,j ) = value1 * scalar;
7321 (~C)(i,j+1UL) = value2 * scalar;
7334 for(
size_t k=kbegin; k<K; ++k ) {
7335 value += A(i,k) * B(k,j);
7338 (~C)(i,j) = value * scalar;
7343 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
7344 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
7345 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
7346 for(
size_t i=0UL; i<iend; ++i ) {
7347 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
7351 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
7352 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
7353 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
7354 for(
size_t i=0UL; i<iend; ++i ) {
7359 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
7360 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
7361 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
7362 for(
size_t j=0UL; j<jend; ++j ) {
// Overload of selectLargeAssignKernel that does no work of its own: it
// forwards C, A, B and scalar to selectDefaultAssignKernel.
// NOTE(review): the enabling condition is on lines missing from this listing.
7384 template<
typename MT3
7389 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7391 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assign kernel overload that hands the scaled product to one of the
// smmm / hmmm / lmmm / ummm / mmm routines.
// NOTE(review): the conditions selecting between the five calls are on elided lines;
// presumably they test SYM / HERM / LOW / UPP in that order -- confirm.
7410 template<
typename MT3
7415 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7418 smmm( C, A, B, scalar );
7420 hmmm( C, A, B, scalar );
// The extra ST2(0) argument is only passed to lmmm/ummm/mmm; presumably it is the
// factor applied to the previous contents of C (0 = overwrite) -- TODO confirm.
7422 lmmm( C, A, B, scalar, ST2(0) );
7424 ummm( C, A, B, scalar, ST2(0) );
7426 mmm( C, A, B, scalar, ST2(0) );
// BLAS assign kernel overload that does not call BLAS at all: it forwards the scaled
// product to the large-matrix assign kernel.
// NOTE(review): the enabling condition is elided; presumably this overload is chosen
// when the operand types are not BLAS compatible -- confirm.
7444 template<
typename MT3
7449 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7451 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-backed assign kernel for the scaled product; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): the branch conditions and any statements preceding the trmm calls
// are elided in this listing; presumably trmm is used when A (resp. B) is triangular
// and C has been initialised with the other operand beforehand -- confirm.
7456 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7470 template<
typename MT3
7475 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A passed as the left-hand operand; lower/upper flag derived from IsLower<MT4>.
7481 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// B passed as the right-hand operand; lower/upper flag derived from IsLower<MT5>.
7485 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case: gemm with ET(scalar) and ET(0) as the two scalar arguments
// (presumably alpha and beta, i.e. C is overwritten -- confirm against the wrapper).
7488 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled product through an intermediate temporary.
// NOTE(review): the function signature and the definition of TmpType are elided;
// presumably this is the overload for targets that cannot be written by the dense
// kernels directly -- confirm.
7506 template<
typename MT
// Functor applied to the evaluated temporary before it is assigned.
7524 const ForwardFunctor fwd;
// Evaluate the whole expression into a temporary using serial().
7526 const TmpType tmp(
serial( rhs ) );
7527 assign( ~lhs, fwd( tmp ) );
// Entry point for the addition assignment of the scaled product to lhs.
// NOTE(review): signature, the definitions of left/A/B and the body of the early-out
// branch are elided from this listing.
7543 template<
typename MT
// Early-out when the target has no rows or columns, or when left.columns() is zero
// (presumably the branch just returns -- its body is not visible here).
7555 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection, passing the expression's scalar factor.
7569 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses the addition assignment kernel for C += A * B * scalar.
// NOTE(review): the first part of the selection condition is elided; only the size
// test against TDMATDMATMULT_THRESHOLD is visible.
7584 template<
typename MT3
7588 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Targets with fewer than TDMATDMATMULT_THRESHOLD elements take the small kernel ...
7593 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7594 selectSmallAddAssignKernel( C, A, B, scalar );
// ... everything else goes to the BLAS kernel selection.
7596 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel: evaluates the scaled product into a temporary
// and adds that temporary to C.
// NOTE(review): the enabling condition for this overload is elided; presumably it is
// the general case where neither operand is diagonal -- confirm.
7614 template<
typename MT3
7619 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// serial() is applied to the whole expression A * B * scalar before it is stored.
7621 const ResultType tmp(
serial( A * B * scalar ) );
7622 addAssign( C, tmp );
// Default addition assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition): each updated element only needs A(i,j) and the
// diagonal entry B(j,j). The index space is walked in BLOCK_SIZE x BLOCK_SIZE tiles.
// NOTE(review): the function name/parameter line and the definitions of jbegin/jpos
// are elided from this listing.
7640 template<
typename MT3
7644 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7647 constexpr
size_t block( BLOCK_SIZE );
7649 const size_t M( A.rows() );
7650 const size_t N( B.columns() );
// Outer tiling over rows (ii) and columns (jj); iend/jend clamp the last tile.
7652 for(
size_t ii=0UL; ii<M; ii+=block ) {
7653 const size_t iend(
min( M, ii+block ) );
7654 for(
size_t jj=0UL; jj<N; jj+=block ) {
7655 const size_t jend(
min( N, jj+block ) );
7656 for(
size_t i=ii; i<iend; ++i )
// jbegin/jpos are computed on elided lines (presumably a structure-dependent
// sub-range of [jj,jend) -- confirm).
7665 for(
size_t j=jbegin; j<jpos; ++j ) {
7666 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition), iterating column by column with the row loop
// unrolled by two.
// NOTE(review): the function name/parameter line and the definitions of ibegin/iend
// are elided from this listing.
7688 template<
typename MT3
7692 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7695 const size_t M( A.rows() );
7696 const size_t N( B.columns() );
7698 for(
size_t j=0UL; j<N; ++j )
// inum & size_t(-2) clears the lowest bit, so ipos is ibegin plus the largest even
// count not exceeding inum.
7708 const size_t inum( iend - ibegin );
7709 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
7711 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7712 (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
7713 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the single row at ipos (presumably guarded by an elided
// "ipos < iend" check for an odd row count -- confirm).
7716 (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition assignment kernel that only reads the diagonal of A
// (A(i,i) * B(i,j)), iterating row by row with the column loop unrolled by two.
// NOTE(review): the enabling condition, the function name/parameter line and the
// definitions of jbegin/jend are elided; presumably this is the diagonal-A /
// non-diagonal-B overload -- confirm.
7736 template<
typename MT3
7743 const size_t M( A.rows() );
7744 const size_t N( B.columns() );
7746 for(
size_t i=0UL; i<M; ++i )
// jnum & size_t(-2) clears the lowest bit, so jpos is jbegin plus the largest even
// count not exceeding jnum.
7756 const size_t jnum( jend - jbegin );
7757 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
7759 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7760 (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
7761 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the single column at jpos (presumably guarded by an elided
// "jpos < jend" check for an odd column count -- confirm).
7764 (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition assignment kernel that only reads the diagonal of A
// (A(i,i) * B(i,j)), walking the index space in BLOCK_SIZE x BLOCK_SIZE tiles with
// the column tile as the outermost loop.
// NOTE(review): the enabling condition, the function name/parameter line and the
// definitions of ibegin/ipos are elided; presumably this is the diagonal-A /
// non-diagonal-B overload for the other storage order of C -- confirm.
7784 template<
typename MT3
7791 constexpr
size_t block( BLOCK_SIZE );
7793 const size_t M( A.rows() );
7794 const size_t N( B.columns() );
// Outer tiling over columns (jj) and rows (ii); jend/iend clamp the last tile.
7796 for(
size_t jj=0UL; jj<N; jj+=block ) {
7797 const size_t jend(
min( N, jj+block ) );
7798 for(
size_t ii=0UL; ii<M; ii+=block ) {
7799 const size_t iend(
min( M, ii+block ) );
7800 for(
size_t j=jj; j<jend; ++j )
// ibegin/ipos are computed on elided lines (presumably a structure-dependent
// sub-range of [ii,iend) -- confirm).
7809 for(
size_t i=ibegin; i<ipos; ++i ) {
7810 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition assignment kernel that touches only the diagonal of C: each
// C(i,i) receives A(i,i) * B(i,i) * scalar.
// NOTE(review): the enabling condition is elided; presumably this overload is
// selected when both A and B are diagonal -- confirm.
7832 template<
typename MT3
7837 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A single loop over the rows of A is sufficient; off-diagonal elements of C are
// left untouched.
7839 for(
size_t i=0UL; i<A.rows(); ++i ) {
7840 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel overload without a dedicated
// implementation: it forwards to the default addition assignment kernel.
// NOTE(review): the enabling condition is elided; presumably this is the
// non-vectorized fallback -- confirm.
7859 template<
typename MT3
7864 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7866 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix addition assignment kernel, C += A * B * scalar, with the
// SIMD direction along the column index j: B and C are accessed through
// load(.,j)/store(.,j) while single elements A(i,k) are turned into SIMD vectors
// with set().
// The column range is consumed in panels of 8, 5, 4, 3, 2 and 1 SIMD vectors,
// followed by a scalar loop for the columns at and beyond jpos.
// NOTE(review): the function name/parameter line, the definition of "remainder",
// all kbegin/kend computations, the accumulation statements of the two-row loops
// and every closing brace are elided from this listing.
7885 template<
typename MT3
7894 const size_t M( A.rows() );
7895 const size_t N( B.columns() );
7896 const size_t K( A.columns() );
// With "remainder" set, jpos is N with the low bits masked off by -SIMDSIZE
// (N rounded down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two);
// otherwise the SIMD loops cover all N columns.
7900 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// The scaling factor as a SIMD vector; applied once per accumulator after the k loop.
7903 const SIMDType factor(
set( scalar ) );
// Panel of 8 SIMD vectors, one row at a time; only taken when neither LOW nor UPP.
// (Presumably wrapped in an elided element-type check -- confirm.)
7909 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7910 for(
size_t i=0UL; i<M; ++i )
// Accumulators rely on SIMDType's default construction (presumably zero -- confirm).
7923 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7925 for(
size_t k=kbegin; k<kend; ++k ) {
7926 const SIMDType a1(
set( A(i,k) ) );
7927 xmm1 += a1 * B.load(k,j );
7928 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7929 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7930 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7931 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7932 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7933 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7934 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
// Read-modify-write of C: previous contents plus the scaled accumulator.
7937 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7938 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
7939 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
7940 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
7941 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
7942 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
7943 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
7944 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
// Panel of 5 SIMD vectors; rows are processed in pairs, then a single trailing row.
7949 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7953 for( ; (i+2UL) <= M; i+=2UL )
// xmm1..xmm5 belong to row i, xmm6..xmm10 to row i+1 (see the stores below).
7966 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7968 for(
size_t k=kbegin; k<kend; ++k ) {
7969 const SIMDType a1(
set( A(i ,k) ) );
7970 const SIMDType a2(
set( A(i+1UL,k) ) );
7971 const SIMDType b1( B.load(k,j ) );
7972 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7973 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7974 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7975 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7988 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7989 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
7990 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
7991 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
7992 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
7993 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
7994 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
7995 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
7996 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
7997 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
// Single-row variant of the 5-vector panel (presumably the odd last row -- confirm).
8009 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8011 for(
size_t k=kbegin; k<kend; ++k ) {
8012 const SIMDType a1(
set( A(i,k) ) );
8013 xmm1 += a1 * B.load(k,j );
8014 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8015 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8016 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8017 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8020 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8021 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8022 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8023 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
8024 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
// Panel of 4 SIMD vectors; rows in pairs, then a single trailing row.
8028 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8032 for( ; (i+2UL) <= M; i+=2UL )
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
8045 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8047 for(
size_t k=kbegin; k<kend; ++k ) {
8048 const SIMDType a1(
set( A(i ,k) ) );
8049 const SIMDType a2(
set( A(i+1UL,k) ) );
8050 const SIMDType b1( B.load(k,j ) );
8051 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8052 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8053 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8064 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8065 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
8066 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
8067 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
8068 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8069 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
8070 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
8071 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
// Single-row variant of the 4-vector panel.
8083 SIMDType xmm1, xmm2, xmm3, xmm4;
8085 for(
size_t k=kbegin; k<kend; ++k ) {
8086 const SIMDType a1(
set( A(i,k) ) );
8087 xmm1 += a1 * B.load(k,j );
8088 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8089 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8090 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8093 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8094 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8095 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8096 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
// Panel of 3 SIMD vectors; rows in pairs, then a single trailing row.
8100 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8104 for( ; (i+2UL) <= M; i+=2UL )
// xmm1..xmm3 belong to row i, xmm4..xmm6 to row i+1.
8117 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8119 for(
size_t k=kbegin; k<kend; ++k ) {
8120 const SIMDType a1(
set( A(i ,k) ) );
8121 const SIMDType a2(
set( A(i+1UL,k) ) );
8122 const SIMDType b1( B.load(k,j ) );
8123 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8124 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8133 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8134 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
8135 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
8136 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
8137 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
8138 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
// Single-row variant of the 3-vector panel.
8150 SIMDType xmm1, xmm2, xmm3;
8152 for(
size_t k=kbegin; k<kend; ++k ) {
8153 const SIMDType a1(
set( A(i,k) ) );
8154 xmm1 += a1 * B.load(k,j );
8155 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8156 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8159 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8160 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8161 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
// Panel of 2 SIMD vectors; skipped only when both LOW and UPP are set. From here on
// the row range depends on the structure flags: it starts at row j when LOW is set
// and is capped at min(j+SIMDSIZE*2,M) when UPP is set.
8165 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8167 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
8168 size_t i( LOW ? j : 0UL );
8170 for( ; (i+2UL) <= iend; i+=2UL )
// xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1.
8183 SIMDType xmm1, xmm2, xmm3, xmm4;
8185 for(
size_t k=kbegin; k<kend; ++k ) {
8186 const SIMDType a1(
set( A(i ,k) ) );
8187 const SIMDType a2(
set( A(i+1UL,k) ) );
8188 const SIMDType b1( B.load(k,j ) );
8189 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8196 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8197 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
8198 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8199 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
// Single-row variant of the 2-vector panel.
8211 SIMDType xmm1, xmm2;
8213 for(
size_t k=kbegin; k<kend; ++k ) {
8214 const SIMDType a1(
set( A(i,k) ) );
8215 xmm1 += a1 * B.load(k,j );
8216 xmm2 += a1 * B.load(k,j+SIMDSIZE);
8219 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8220 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
// Panel of a single SIMD vector; runs for every structure combination. The row range
// is capped at min(j+SIMDSIZE,M) only when both LOW and UPP are set.
8224 for( ; j<jpos; j+=SIMDSIZE )
8226 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
8227 size_t i( LOW ? j : 0UL );
8229 for( ; (i+2UL) <= iend; i+=2UL )
// xmm1 belongs to row i, xmm2 to row i+1; both share the same B vector.
8240 SIMDType xmm1, xmm2;
8242 for(
size_t k=kbegin; k<kend; ++k ) {
8243 const SIMDType b1( B.load(k,j) );
8244 xmm1 +=
set( A(i ,k) ) * b1;
8245 xmm2 +=
set( A(i+1UL,k) ) * b1;
8248 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8249 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Single-row variant; note that this k loop runs up to K rather than a local kend.
8262 for(
size_t k=kbegin; k<K; ++k ) {
8263 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8266 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar loop for the columns from jpos to N, executed only when "remainder" is set.
// Row range: starts at j when LOW is set, ends at j+1 when UPP is set.
8270 for( ; remainder && j<N; ++j )
8272 const size_t iend( UPP ? j+1UL : M );
8273 size_t i( LOW ? j : 0UL );
8275 for( ; (i+2UL) <= iend; i+=2UL )
8289 for(
size_t k=kbegin; k<kend; ++k ) {
8290 value1 += A(i ,k) * B(k,j);
8291 value2 += A(i+1UL,k) * B(k,j);
// In the scalar path the factor is the plain "scalar", not the SIMD "factor".
8294 (~C)(i ,j) += value1 * scalar;
8295 (~C)(i+1UL,j) += value2 * scalar;
8308 for(
size_t k=kbegin; k<K; ++k ) {
8309 value += A(i,k) * B(k,j);
8312 (~C)(i,j) += value * scalar;
// Vectorized small-matrix addition assignment kernel, C += A * B * scalar, with the
// SIMD direction along the row index i: A and C are accessed through
// load(i,.)/store(i,.) while single elements B(k,j) are turned into SIMD vectors
// with set().
// The row range is consumed in panels of 8, 5, 4, 3, 2 and 1 SIMD vectors, followed
// by a scalar loop for the rows at and beyond ipos.
// NOTE(review): the function name/parameter line, the definition of "remainder",
// all kbegin/kend computations, the accumulation statements of the two-column loops
// and every closing brace are elided from this listing.
8333 template<
typename MT3
8342 const size_t M( A.rows() );
8343 const size_t N( B.columns() );
8344 const size_t K( A.columns() );
// With "remainder" set, ipos is M with the low bits masked off by -SIMDSIZE
// (M rounded down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two);
// otherwise the SIMD loops cover all M rows.
8348 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// The scaling factor as a SIMD vector; applied once per accumulator after the k loop.
8351 const SIMDType factor(
set( scalar ) );
// Panel of 8 SIMD vectors, one column at a time; only taken when neither LOW nor UPP.
// (Presumably wrapped in an elided element-type check -- confirm.)
8357 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8358 for(
size_t j=0UL; j<N; ++j )
// Accumulators rely on SIMDType's default construction (presumably zero -- confirm).
8371 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8373 for(
size_t k=kbegin; k<kend; ++k ) {
8374 const SIMDType b1(
set( B(k,j) ) );
8375 xmm1 += A.load(i ,k) * b1;
8376 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8377 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8378 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8379 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8380 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8381 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8382 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Read-modify-write of C: previous contents plus the scaled accumulator.
8385 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8386 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8387 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8388 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8389 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
8390 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
8391 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
8392 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// Panel of 5 SIMD vectors; columns are processed in pairs, then a single trailing
// column.
8397 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8401 for( ; (j+2UL) <= N; j+=2UL )
// xmm1..xmm5 belong to column j, xmm6..xmm10 to column j+1 (see the stores below).
8414 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8416 for(
size_t k=kbegin; k<kend; ++k ) {
8417 const SIMDType a1( A.load(i ,k) );
8418 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8419 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8420 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8421 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8422 const SIMDType b1(
set( B(k,j ) ) );
8423 const SIMDType b2(
set( B(k,j+1UL) ) );
8436 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8437 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8438 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8439 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
8440 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
8441 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
8442 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
8443 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
8444 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
8445 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Single-column variant of the 5-vector panel (presumably the odd last column --
// confirm).
8457 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8459 for(
size_t k=kbegin; k<kend; ++k ) {
8460 const SIMDType b1(
set( B(k,j) ) );
8461 xmm1 += A.load(i ,k) * b1;
8462 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8463 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8464 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8465 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8468 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8469 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8470 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8471 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8472 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
// Panel of 4 SIMD vectors; columns in pairs, then a single trailing column.
8476 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8480 for( ; (j+2UL) <= N; j+=2UL )
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
8493 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8495 for(
size_t k=kbegin; k<kend; ++k ) {
8496 const SIMDType a1( A.load(i ,k) );
8497 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8498 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8499 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8500 const SIMDType b1(
set( B(k,j ) ) );
8501 const SIMDType b2(
set( B(k,j+1UL) ) );
8512 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8513 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8514 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8515 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
8516 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8517 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
8518 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
8519 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single-column variant of the 4-vector panel.
8531 SIMDType xmm1, xmm2, xmm3, xmm4;
8533 for(
size_t k=kbegin; k<kend; ++k ) {
8534 const SIMDType b1(
set( B(k,j) ) );
8535 xmm1 += A.load(i ,k) * b1;
8536 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8537 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8538 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8541 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8542 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8543 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8544 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// Panel of 3 SIMD vectors; columns in pairs, then a single trailing column.
8548 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8552 for( ; (j+2UL) <= N; j+=2UL )
// xmm1..xmm3 belong to column j, xmm4..xmm6 to column j+1.
8565 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8567 for(
size_t k=kbegin; k<kend; ++k ) {
8568 const SIMDType a1( A.load(i ,k) );
8569 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8570 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8571 const SIMDType b1(
set( B(k,j ) ) );
8572 const SIMDType b2(
set( B(k,j+1UL) ) );
8581 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8582 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8583 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8584 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
8585 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
8586 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Single-column variant of the 3-vector panel.
8598 SIMDType xmm1, xmm2, xmm3;
8600 for(
size_t k=kbegin; k<kend; ++k ) {
8601 const SIMDType b1(
set( B(k,j) ) );
8602 xmm1 += A.load(i ,k) * b1;
8603 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8604 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8607 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8608 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8609 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
// Panel of 2 SIMD vectors; skipped only when both LOW and UPP are set. From here on
// the column range depends on the structure flags: it starts at column i when UPP is
// set and is capped at min(i+SIMDSIZE*2,N) when LOW is set.
8613 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8615 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
8616 size_t j( UPP ? i : 0UL );
8618 for( ; (j+2UL) <= jend; j+=2UL )
// xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1.
8631 SIMDType xmm1, xmm2, xmm3, xmm4;
8633 for(
size_t k=kbegin; k<kend; ++k ) {
8634 const SIMDType a1( A.load(i ,k) );
8635 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8636 const SIMDType b1(
set( B(k,j ) ) );
8637 const SIMDType b2(
set( B(k,j+1UL) ) );
8644 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8645 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
8646 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8647 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
// Single-column variant of the 2-vector panel.
8659 SIMDType xmm1, xmm2;
8661 for(
size_t k=kbegin; k<kend; ++k ) {
8662 const SIMDType b1(
set( B(k,j) ) );
8663 xmm1 += A.load(i ,k) * b1;
8664 xmm2 += A.load(i+SIMDSIZE,k) * b1;
8667 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8668 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
// Panel of a single SIMD vector; runs for every structure combination. The column
// range is capped at min(i+SIMDSIZE,N) only when both LOW and UPP are set.
8672 for( ; i<ipos; i+=SIMDSIZE )
8674 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
8675 size_t j( UPP ? i : 0UL );
8677 for( ; (j+2UL) <= jend; j+=2UL )
// xmm1 belongs to column j, xmm2 to column j+1; both share the same A vector.
8688 SIMDType xmm1, xmm2;
8690 for(
size_t k=kbegin; k<kend; ++k ) {
8691 const SIMDType a1( A.load(i,k) );
8692 xmm1 += a1 *
set( B(k,j ) );
8693 xmm2 += a1 *
set( B(k,j+1UL) );
8696 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8697 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// Single-column variant; note that this k loop runs up to K rather than a local kend.
8710 for(
size_t k=kbegin; k<K; ++k ) {
8711 xmm1 += A.load(i,k) *
set( B(k,j) );
8714 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar loop for the rows from ipos to M, executed only when "remainder" is set.
// Column range: starts at i when UPP is set, ends at i+1 when LOW is set.
8718 for( ; remainder && i<M; ++i )
8720 const size_t jend( LOW ? i+1UL : N );
8721 size_t j( UPP ? i : 0UL );
8723 for( ; (j+2UL) <= jend; j+=2UL )
8737 for(
size_t k=kbegin; k<kend; ++k ) {
8738 value1 += A(i,k) * B(k,j );
8739 value2 += A(i,k) * B(k,j+1UL);
// In the scalar path the factor is the plain "scalar", not the SIMD "factor".
8742 (~C)(i,j ) += value1 * scalar;
8743 (~C)(i,j+1UL) += value2 * scalar;
8756 for(
size_t k=kbegin; k<K; ++k ) {
8757 value += A(i,k) * B(k,j);
8760 (~C)(i,j) += value * scalar;
// Large-matrix addition assignment kernel overload without a dedicated
// implementation: it forwards to the default addition assignment kernel.
// NOTE(review): the enabling condition is elided; presumably this is the
// non-vectorized fallback -- confirm.
8780 template<
typename MT3
8785 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8787 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel overload that hands the scaled product to
// one of the lmmm / ummm / mmm routines.
// NOTE(review): the conditions selecting between the three calls are elided;
// presumably they test LOW and UPP -- confirm.
8806 template<
typename MT3
8811 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// ST2(1) is passed as the trailing argument; presumably the factor applied to the
// existing contents of C (1 = accumulate into C) -- TODO confirm.
8814 lmmm( C, A, B, scalar, ST2(1) );
8816 ummm( C, A, B, scalar, ST2(1) );
8818 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition assignment kernel overload that does not call BLAS at all: it
// forwards to the large-matrix addition assignment kernel.
// NOTE(review): the enabling condition is elided; presumably this overload is chosen
// when the operand types are not BLAS compatible -- confirm.
8836 template<
typename MT3
8841 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8843 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-backed addition assignment kernel for the scaled product; only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): the branch conditions and the declaration/initialisation of "tmp"
// are elided in this listing; presumably trmm is used when A (resp. B) is triangular
// and tmp starts out as a copy of the other operand -- confirm.
8848 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8862 template<
typename MT3
8867 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The trmm result is produced in tmp and then added to C in a separate step.
8873 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8874 addAssign( C, tmp );
8878 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8879 addAssign( C, tmp );
// General case: gemm writes into C directly with ET(scalar) and ET(1) as the two
// scalar arguments (presumably alpha and beta, i.e. C is accumulated -- confirm).
8882 gemm( C, A, B, ET(scalar), ET(1) );
// Entry point for the subtraction assignment of the scaled product to lhs.
// NOTE(review): signature, the definitions of left/A/B and the body of the early-out
// branch are elided from this listing.
8904 template<
typename MT
// Early-out when the target has no rows or columns, or when left.columns() is zero
// (presumably the branch just returns -- its body is not visible here).
8916 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection, passing the expression's scalar factor.
8930 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses the subtraction assignment kernel for C -= A * B * scalar.
// NOTE(review): the first part of the selection condition is elided; only the size
// test against TDMATDMATMULT_THRESHOLD is visible.
8945 template<
typename MT3
8949 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Targets with fewer than TDMATDMATMULT_THRESHOLD elements take the small kernel ...
8954 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
8955 selectSmallSubAssignKernel( C, A, B, scalar );
// ... everything else goes to the BLAS kernel selection.
8957 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel: evaluates the scaled product into a
// temporary and subtracts that temporary from C.
// NOTE(review): the enabling condition for this overload is elided; presumably it is
// the general case where neither operand is diagonal -- confirm.
8975 template<
typename MT3
8980 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// serial() is applied to the whole expression A * B * scalar before it is stored.
8982 const ResultType tmp(
serial( A * B * scalar ) );
8983 subAssign( C, tmp );
// Default subtraction assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition): each updated element only needs A(i,j) and the
// diagonal entry B(j,j). The index space is walked in BLOCK_SIZE x BLOCK_SIZE tiles.
// NOTE(review): the function name/parameter line and the definitions of jbegin/jpos
// are elided from this listing.
9001 template<
typename MT3
9005 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
9008 constexpr
size_t block( BLOCK_SIZE );
9010 const size_t M( A.rows() );
9011 const size_t N( B.columns() );
// Outer tiling over rows (ii) and columns (jj); iend/jend clamp the last tile.
9013 for(
size_t ii=0UL; ii<M; ii+=block ) {
9014 const size_t iend(
min( M, ii+block ) );
9015 for(
size_t jj=0UL; jj<N; jj+=block ) {
9016 const size_t jend(
min( N, jj+block ) );
9017 for(
size_t i=ii; i<iend; ++i )
// jbegin/jpos are computed on elided lines (presumably a structure-dependent
// sub-range of [jj,jend) -- confirm).
9026 for(
size_t j=jbegin; j<jpos; ++j ) {
9027 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition), iterating column by column with the row loop
// unrolled by two.
// NOTE(review): the function name/parameter line and the definitions of ibegin/iend
// are elided from this listing.
9049 template<
typename MT3
9053 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
9056 const size_t M( A.rows() );
9057 const size_t N( B.columns() );
9059 for(
size_t j=0UL; j<N; ++j )
// inum & size_t(-2) clears the lowest bit, so ipos is ibegin plus the largest even
// count not exceeding inum.
9069 const size_t inum( iend - ibegin );
9070 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
9072 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9073 (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9074 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update of the single row at ipos (presumably guarded by an elided
// "ipos < iend" check for an odd row count -- confirm).
9077 (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel that only reads the diagonal of A
// (A(i,i) * B(i,j)), iterating row by row with the column loop unrolled by two.
// NOTE(review): the enabling condition, the function name/parameter line and the
// definitions of jbegin/jend are elided; presumably this is the diagonal-A /
// non-diagonal-B overload -- confirm.
9097 template<
typename MT3
9104 const size_t M( A.rows() );
9105 const size_t N( B.columns() );
9107 for(
size_t i=0UL; i<M; ++i )
// jnum & size_t(-2) clears the lowest bit, so jpos is jbegin plus the largest even
// count not exceeding jnum.
9117 const size_t jnum( jend - jbegin );
9118 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
9120 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9121 (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9122 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update of the single column at jpos (presumably guarded by an elided
// "jpos < jend" check for an odd column count -- confirm).
9125 (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction assignment kernel that only reads the diagonal of A
// (A(i,i) * B(i,j)), walking the index space in BLOCK_SIZE x BLOCK_SIZE tiles with
// the column tile as the outermost loop.
// NOTE(review): the enabling condition, the function name/parameter line and the
// definitions of ibegin/ipos are elided; presumably this is the diagonal-A /
// non-diagonal-B overload for the other storage order of C -- confirm.
9145 template<
typename MT3
9152 constexpr
size_t block( BLOCK_SIZE );
9154 const size_t M( A.rows() );
9155 const size_t N( B.columns() );
// Outer tiling over columns (jj) and rows (ii); jend/iend clamp the last tile.
9157 for(
size_t jj=0UL; jj<N; jj+=block ) {
9158 const size_t jend(
min( N, jj+block ) );
9159 for(
size_t ii=0UL; ii<M; ii+=block ) {
9160 const size_t iend(
min( M, ii+block ) );
9161 for(
size_t j=jj; j<jend; ++j )
// ibegin/ipos are computed on elided lines (presumably a structure-dependent
// sub-range of [ii,iend) -- confirm).
9170 for(
size_t i=ibegin; i<ipos; ++i ) {
9171 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction assignment kernel that touches only the diagonal of C: each
// C(i,i) is reduced by A(i,i) * B(i,i) * scalar.
// NOTE(review): the enabling condition is elided; presumably this overload is
// selected when both A and B are diagonal -- confirm.
9193 template<
typename MT3
9198 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A single loop over the rows of A is sufficient; off-diagonal elements of C are
// left untouched.
9200 for(
size_t i=0UL; i<A.rows(); ++i ) {
9201 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel overload without a dedicated
// implementation: it forwards to the default subtraction assignment kernel.
// NOTE(review): the enabling condition is elided; presumably this is the
// non-vectorized fallback -- confirm.
9220 template<
typename MT3
9225 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9227 selectDefaultSubAssignKernel( C, A, B, scalar );
9246 template<
typename MT3
9255 const size_t M( A.rows() );
9256 const size_t N( B.columns() );
9257 const size_t K( A.columns() );
9261 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
9264 const SIMDType factor(
set( scalar ) );
9270 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9271 for(
size_t i=0UL; i<M; ++i )
9284 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9286 for(
size_t k=kbegin; k<kend; ++k ) {
9287 const SIMDType a1(
set( A(i,k) ) );
9288 xmm1 += a1 * B.load(k,j );
9289 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9290 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9291 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9292 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9293 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9294 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9295 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
9298 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9299 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9300 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9301 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9302 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
9303 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
9304 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
9305 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
9310 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
9314 for( ; (i+2UL) <= M; i+=2UL )
9327 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9329 for(
size_t k=kbegin; k<kend; ++k ) {
9330 const SIMDType a1(
set( A(i ,k) ) );
9331 const SIMDType a2(
set( A(i+1UL,k) ) );
9332 const SIMDType b1( B.load(k,j ) );
9333 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9334 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9335 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9336 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9349 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9350 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9351 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9352 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
9353 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
9354 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
9355 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
9356 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
9357 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
9358 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
9370 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9372 for(
size_t k=kbegin; k<kend; ++k ) {
9373 const SIMDType a1(
set( A(i,k) ) );
9374 xmm1 += a1 * B.load(k,j );
9375 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9376 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9377 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9378 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9381 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9382 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9383 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9384 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9385 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
9389 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9393 for( ; (i+2UL) <= M; i+=2UL )
9406 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9408 for(
size_t k=kbegin; k<kend; ++k ) {
9409 const SIMDType a1(
set( A(i ,k) ) );
9410 const SIMDType a2(
set( A(i+1UL,k) ) );
9411 const SIMDType b1( B.load(k,j ) );
9412 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9413 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9414 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9425 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9426 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9427 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9428 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
9429 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9430 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
9431 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
9432 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
9444 SIMDType xmm1, xmm2, xmm3, xmm4;
9446 for(
size_t k=kbegin; k<kend; ++k ) {
9447 const SIMDType a1(
set( A(i,k) ) );
9448 xmm1 += a1 * B.load(k,j );
9449 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9450 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9451 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9454 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9455 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9456 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9457 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9461 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
9465 for( ; (i+2UL) <= M; i+=2UL )
9478 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9480 for(
size_t k=kbegin; k<kend; ++k ) {
9481 const SIMDType a1(
set( A(i ,k) ) );
9482 const SIMDType a2(
set( A(i+1UL,k) ) );
9483 const SIMDType b1( B.load(k,j ) );
9484 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9485 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9494 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9495 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9496 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9497 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
9498 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
9499 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
9511 SIMDType xmm1, xmm2, xmm3;
9513 for(
size_t k=kbegin; k<kend; ++k ) {
9514 const SIMDType a1(
set( A(i,k) ) );
9515 xmm1 += a1 * B.load(k,j );
9516 xmm2 += a1 * B.load(k,j+SIMDSIZE );
9517 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9520 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9521 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9522 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9526 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9528 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
9529 size_t i( LOW ? j : 0UL );
9531 for( ; (i+2UL) <= iend; i+=2UL )
9544 SIMDType xmm1, xmm2, xmm3, xmm4;
9546 for(
size_t k=kbegin; k<kend; ++k ) {
9547 const SIMDType a1(
set( A(i ,k) ) );
9548 const SIMDType a2(
set( A(i+1UL,k) ) );
9549 const SIMDType b1( B.load(k,j ) );
9550 const SIMDType b2( B.load(k,j+SIMDSIZE) );
9557 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9558 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
9559 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
9560 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
9572 SIMDType xmm1, xmm2;
9574 for(
size_t k=kbegin; k<kend; ++k ) {
9575 const SIMDType a1(
set( A(i,k) ) );
9576 xmm1 += a1 * B.load(k,j );
9577 xmm2 += a1 * B.load(k,j+SIMDSIZE);
9580 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9581 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
9585 for( ; j<jpos; j+=SIMDSIZE )
9587 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
9588 size_t i( LOW ? j : 0UL );
9590 for( ; (i+2UL) <= iend; i+=2UL )
9601 SIMDType xmm1, xmm2;
9603 for(
size_t k=kbegin; k<kend; ++k ) {
9604 const SIMDType b1( B.load(k,j) );
9605 xmm1 +=
set( A(i ,k) ) * b1;
9606 xmm2 +=
set( A(i+1UL,k) ) * b1;
9609 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9610 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
9623 for(
size_t k=kbegin; k<K; ++k ) {
9624 xmm1 +=
set( A(i,k) ) * B.load(k,j);
9627 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
9631 for( ; remainder && j<N; ++j )
9633 const size_t iend( UPP ? j+1UL : M );
9634 size_t i( LOW ? j : 0UL );
9636 for( ; (i+2UL) <= iend; i+=2UL )
9650 for(
size_t k=kbegin; k<kend; ++k ) {
9651 value1 += A(i ,k) * B(k,j);
9652 value2 += A(i+1UL,k) * B(k,j);
9655 (~C)(i ,j) -= value1 * scalar;
9656 (~C)(i+1UL,j) -= value2 * scalar;
9669 for(
size_t k=kbegin; k<K; ++k ) {
9670 value += A(i,k) * B(k,j);
9673 (~C)(i,j) -= value * scalar;
9694 template<
typename MT3
9703 const size_t M( A.rows() );
9704 const size_t N( B.columns() );
9705 const size_t K( A.columns() );
9709 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
9712 const SIMDType factor(
set( scalar ) );
9718 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
9719 for(
size_t j=0UL; j<N; ++j )
9732 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9734 for(
size_t k=kbegin; k<kend; ++k ) {
9735 const SIMDType b1(
set( B(k,j) ) );
9736 xmm1 += A.load(i ,k) * b1;
9737 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9738 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9739 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9740 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
9741 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
9742 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
9743 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
9746 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9747 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9748 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9749 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
9750 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
9751 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
9752 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
9753 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
9758 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
9762 for( ; (j+2UL) <= N; j+=2UL )
9775 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9777 for(
size_t k=kbegin; k<kend; ++k ) {
9778 const SIMDType a1( A.load(i ,k) );
9779 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9780 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9781 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
9782 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
9783 const SIMDType b1(
set( B(k,j ) ) );
9784 const SIMDType b2(
set( B(k,j+1UL) ) );
9797 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9798 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
9799 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
9800 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
9801 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
9802 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
9803 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
9804 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
9805 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
9806 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
9818 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9820 for(
size_t k=kbegin; k<kend; ++k ) {
9821 const SIMDType b1(
set( B(k,j) ) );
9822 xmm1 += A.load(i ,k) * b1;
9823 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9824 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9825 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9826 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
9829 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9830 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9831 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9832 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
9833 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
9837 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
9841 for( ; (j+2UL) <= N; j+=2UL )
9854 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9856 for(
size_t k=kbegin; k<kend; ++k ) {
9857 const SIMDType a1( A.load(i ,k) );
9858 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9859 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9860 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
9861 const SIMDType b1(
set( B(k,j ) ) );
9862 const SIMDType b2(
set( B(k,j+1UL) ) );
9873 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9874 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
9875 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
9876 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
9877 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
9878 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
9879 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
9880 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
9892 SIMDType xmm1, xmm2, xmm3, xmm4;
9894 for(
size_t k=kbegin; k<kend; ++k ) {
9895 const SIMDType b1(
set( B(k,j) ) );
9896 xmm1 += A.load(i ,k) * b1;
9897 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9898 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9899 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9902 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9903 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9904 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9905 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
9909 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
9913 for( ; (j+2UL) <= N; j+=2UL )
9926 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9928 for(
size_t k=kbegin; k<kend; ++k ) {
9929 const SIMDType a1( A.load(i ,k) );
9930 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9931 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9932 const SIMDType b1(
set( B(k,j ) ) );
9933 const SIMDType b2(
set( B(k,j+1UL) ) );
9942 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9943 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
9944 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
9945 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
9946 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
9947 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
9959 SIMDType xmm1, xmm2, xmm3;
9961 for(
size_t k=kbegin; k<kend; ++k ) {
9962 const SIMDType b1(
set( B(k,j) ) );
9963 xmm1 += A.load(i ,k) * b1;
9964 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9965 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9968 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9969 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9970 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9974 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
9976 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
9977 size_t j( UPP ? i : 0UL );
9979 for( ; (j+2UL) <= jend; j+=2UL )
9992 SIMDType xmm1, xmm2, xmm3, xmm4;
9994 for(
size_t k=kbegin; k<kend; ++k ) {
9995 const SIMDType a1( A.load(i ,k) );
9996 const SIMDType a2( A.load(i+SIMDSIZE,k) );
9997 const SIMDType b1(
set( B(k,j ) ) );
9998 const SIMDType b2(
set( B(k,j+1UL) ) );
10005 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10006 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
10007 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10008 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
10020 SIMDType xmm1, xmm2;
10022 for(
size_t k=kbegin; k<kend; ++k ) {
10023 const SIMDType b1(
set( B(k,j) ) );
10024 xmm1 += A.load(i ,k) * b1;
10025 xmm2 += A.load(i+SIMDSIZE,k) * b1;
10028 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10029 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
10033 for( ; i<ipos; i+=SIMDSIZE )
10035 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
10036 size_t j( UPP ? i : 0UL );
10038 for( ; (j+2UL) <= jend; j+=2UL )
10049 SIMDType xmm1, xmm2;
10051 for(
size_t k=kbegin; k<kend; ++k ) {
10052 const SIMDType a1( A.load(i,k) );
10053 xmm1 += a1 *
set( B(k,j ) );
10054 xmm2 += a1 *
set( B(k,j+1UL) );
10057 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10058 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
10071 for(
size_t k=kbegin; k<K; ++k ) {
10072 xmm1 += A.load(i,k) *
set( B(k,j) );
10075 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10079 for( ; remainder && i<M; ++i )
10081 const size_t jend( LOW ? i+1UL : N );
10082 size_t j( UPP ? i : 0UL );
10084 for( ; (j+2UL) <= jend; j+=2UL )
10098 for(
size_t k=kbegin; k<kend; ++k ) {
10099 value1 += A(i,k) * B(k,j );
10100 value2 += A(i,k) * B(k,j+1UL);
10103 (~C)(i,j ) -= value1 * scalar;
10104 (~C)(i,j+1UL) -= value2 * scalar;
10117 for(
size_t k=kbegin; k<K; ++k ) {
10118 value += A(i,k) * B(k,j);
10121 (~C)(i,j) -= value * scalar;
10141 template<
typename MT3
10146 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10148 selectDefaultSubAssignKernel( C, A, B, scalar );
10167 template<
typename MT3
10172 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10175 lmmm( C, A, B, -scalar, ST2(1) );
10177 ummm( C, A, B, -scalar, ST2(1) );
10179 mmm( C, A, B, -scalar, ST2(1) );
10197 template<
typename MT3
10202 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10204 selectLargeSubAssignKernel( C, A, B, scalar );
10209 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 10223 template<
typename MT3
10228 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10234 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10235 subAssign( C, tmp );
10239 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10240 subAssign( C, tmp );
10243 gemm( C, A, B, ET(-scalar), ET(1) );
10276 template<
typename MT
10289 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
10292 else if( left.columns() == 0UL ) {
10326 template<
typename MT
10345 const ForwardFunctor fwd;
10347 const TmpType tmp( rhs );
10367 template<
typename MT
10380 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
10417 template<
typename MT
10430 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
10513 template<
typename T1
10562 template<
typename MT1
10608 template<
typename MT1
10654 template<
typename MT1
10700 template<
typename MT1
10746 template<
typename MT1
10777 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10778 struct Rows< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Rows<MT1>
10794 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10795 struct Columns< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Columns<MT2>
10811 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10812 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10813 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
10829 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10830 struct IsSymmetric< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10833 , IsBuiltin< ElementType_< TDMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
10834 , And< Bool<LF>, Bool<UF> > >::value >
10850 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
10851 struct IsHermitian< TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
10868 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10869 struct IsLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10871 , And< IsLower<MT1>, IsLower<MT2> >
10872 , And< Or< Bool<SF>, Bool<HF> >
10873 , IsUpper<MT1>, IsUpper<MT2> > >::value >
10889 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10890 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10891 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
10892 , And< Or< Bool<SF>, Bool<HF> >
10893 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
10909 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10911 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
10912 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
10913 , And< Or< Bool<SF>, Bool<HF> >
10914 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
10915 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
10931 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10932 struct IsUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10934 , And< IsUpper<MT1>, IsUpper<MT2> >
10935 , And< Or< Bool<SF>, Bool<HF> >
10936 , IsLower<MT1>, IsLower<MT2> > >::value >
10952 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10953 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10954 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
10955 , And< Or< Bool<SF>, Bool<HF> >
10956 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
10972 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10974 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
10975 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
10976 , And< Or< Bool<SF>, Bool<HF> >
10977 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
10978 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
10994 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
11012 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
11030 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11048 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11066 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11083 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11100 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11117 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11134 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11151 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
bool AF >
11166 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
11167 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11180 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
typename SubmatrixExprTrait< MT, AF >::Type SubmatrixExprTrait_
Auxiliary alias declaration for the SubmatrixExprTrait type trait. The SubmatrixExprTrait_ alias decla...
Definition: SubmatrixExprTrait.h:134
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Evaluation of the expression type of a dense matrix declherm operation. Via this type trait it is poss...
Definition: TDMatDeclHermExprTrait.h:75
Compile time check for row vector types. This type trait tests whether or not the given template argum...
Definition: IsRowVector.h:80
const DMatForEachExpr< MT, Conj, SO > conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatForEachExpr.h:1214
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Evaluation of the expression type of a dense matrix decllow operation. Via this type trait it is possi...
Definition: TDMatDeclLowExprTrait.h:75
Header file for kernel specific block sizes.
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:195
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:488
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:281
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:560
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:489
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:633
Header file for the dense matrix multiplication kernels.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:299
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:290
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:550
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Base class for all matrix/scalar multiplication expression templates. The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:66
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Evaluation of the expression type of a sparse vector/transpose dense matrix multiplication. Via this type trait it is possible to evaluate the resulting expression type of a sparse vector/transpose dense matrix multiplication. Given the transpose sparse vector type VT and the column-major dense matrix type MT, the nested type Type corresponds to the resulting expression type. In case either VT is not a transpose sparse vector type or MT is not a column-major dense matrix type, the resulting data type Type is set to INVALID_TYPE.
Definition: TSVecTDMatMultExprTrait.h:81
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
DisableIf_< IsSymmetric< MT >, const DMatDeclSymExpr< MT, SO > > declsym(const DenseMatrix< MT, SO > &dm)
Declares the given non-symmetric dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:841
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the TDMatDeclDiagExprTrait class template.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:424
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:434
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:198
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:285
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:119
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:176
Constraint on the data type.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:283
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:414
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:138
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
DisableIf_< IsHermitian< MT >, const DMatDeclHermExpr< MT, SO > > declherm(const DenseMatrix< MT, SO > &dm)
Declares the given non-Hermitian dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:841
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:179
Header file for the DisableIf class template.
Compile time check for dense vector types.This type trait tests whether or not the given template par...
Definition: IsDenseVector.h:78
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:446
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:83
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Header file for the TSVecTDMatMultExprTrait class template.
Evaluation of the expression type of a dense matrix declupp operation.Via this type trait it is possi...
Definition: TDMatDeclUppExprTrait.h:75
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:123
Header file for the TDMatSVecMultExprTrait class template.
Header file for the TDMatDeclHermExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:293
Header file for the Not class template.
Header file for the TDMatDeclUppExprTrait class template.
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:404
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:177
Evaluation of the expression type of a dense matrix decldiag operation.Via this type trait it is poss...
Definition: TDMatDeclDiagExprTrait.h:75
Compile time check for sparse vector types.This type trait tests whether or not the given template pa...
Definition: IsSparseVector.h:78
Evaluation of the expression type of a submatrix operation. Via this type trait it is possible to...
Definition: SubmatrixExprTrait.h:80
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Compile time check for column vector types.This type trait tests whether or not the given template ar...
Definition: IsColumnVector.h:80
Evaluation of the expression type of a dense matrix declsym operation.Via this type trait it is possi...
Definition: TDMatDeclSymExprTrait.h:75
Constraints on the storage order of matrix types.
typename TDVecDMatMultExprTrait< VT, MT >::Type TDVecDMatMultExprTrait_
Auxiliary alias declaration for the TDVecDMatMultExprTrait class template.The TDVecDMatMultExprTrait_...
Definition: TDVecDMatMultExprTrait.h:119
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:196
Header file for the exception macros of the math module.
DisableIf_< IsLower< MT >, const DMatDeclLowExpr< MT, SO > > decllow(const DenseMatrix< MT, SO > &dm)
Declares the given non-lower dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:842
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Evaluation of the expression type of a row operation. Via this type trait it is possible to evalu...
Definition: RowExprTrait.h:79
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:632
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:260
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:478
Compile time check for dense matrix types.This type trait tests whether or not the given template par...
Definition: IsDenseMatrix.h:78
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:284
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:458
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:388
Header file for the conjugate shim.
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Evaluation of the expression type of a transpose dense matrix/sparse vector multiplication. Via this type trait it is possible to evaluate the resulting expression type of a transpose dense matrix/sparse vector multiplication. Given the column-major dense matrix type MT and the non-transpose sparse vector type VT, the nested type Type corresponds to the resulting expression type. In case either MT is not a column-major dense matrix type or VT is not a non-transpose sparse vector type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDMatSVecMultExprTrait.h:79
Header file for the IsSIMDCombinable type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:197
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TDMatDMatMultExpr< MT1, MT2, SF, HF, LF, UF > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:279
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:287
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:83
Utility type for generic codes.
Header file for the TDMatDeclLowExprTrait class template.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:325
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:282
Compile time type negation. The Not class template negates the given compile time condition. In case the given condition would evaluate to true, the nested member enumeration is set to false and vice versa.
Definition: Not.h:70
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
Header file for the TDMatDeclSymExprTrait class template.
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename TDMatDVecMultExprTrait< MT, VT >::Type TDMatDVecMultExprTrait_
Auxiliary alias declaration for the TDMatDVecMultExprTrait class template.The TDMatDVecMultExprTrait_...
Definition: TDMatDVecMultExprTrait.h:120
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:468
Evaluation of the expression type of a dense vector/transpose dense matrix multiplication. Via this type trait it is possible to evaluate the resulting expression type of a dense vector/transpose dense matrix multiplication. Given the transpose dense vector type VT and the column-major dense matrix type MT, the nested type Type corresponds to the resulting expression type. In case either VT is not a transpose dense vector type or MT is not a column-major dense matrix type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDVecTDMatMultExprTrait.h:79
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:174
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:340
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:296
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
Evaluation of the expression type of a transpose dense matrix/dense vector multiplication. Via this type trait it is possible to evaluate the resulting expression type of a transpose dense matrix/dense vector multiplication. Given the column-major dense matrix type MT and the non-transpose dense vector type VT, the nested type Type corresponds to the resulting expression type. In case either MT is not a column-major dense matrix type or VT is not a non-transpose dense vector type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDMatDVecMultExprTrait.h:79
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:286
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:76
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:175
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:677
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7505
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
DisableIf_< IsDiagonal< MT >, const DMatDeclDiagExpr< MT, SO > > decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given non-diagonal dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:841
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:178
DisableIf_< IsUpper< MT >, const DMatDeclUppExpr< MT, SO > > declupp(const DenseMatrix< MT, SO > &dm)
Declares the given non-upper dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:842
Evaluation of the expression type of a column operation. Via this type trait it is possible to ev...
Definition: ColumnExprTrait.h:78
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.