35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 160 template<
typename MT1
193 SYM = ( SF && !( HF || LF || UF ) ),
194 HERM = ( HF && !( LF || UF ) ),
195 LOW = ( LF || ( ( SF || HF ) && UF ) ),
196 UPP = ( UF || ( ( SF || HF ) && LF ) )
206 template<
typename T1,
typename T2,
typename T3 >
207 struct IsEvaluationRequired {
208 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
218 template<
typename T1,
typename T2,
typename T3 >
219 struct UseBlasKernel {
226 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
241 template<
typename T1,
typename T2,
typename T3 >
242 struct UseVectorizedDefaultKernel {
245 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
270 ,
Noop > > > > ForwardFunctor;
303 MT1::simdEnabled && MT2::simdEnabled &&
308 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
309 !evaluateRight && MT2::smpAssignable };
364 :(
lhs_.columns() ) ) );
368 const size_t n(
end - begin );
// Checked element access at position (i,j). The row index is compared against
// lhs_.rows() and the column index against rhs_.columns().
// NOTE(review): the handling of an out-of-range index is not shown here —
// presumably an exception is raised; confirm.
386 inline ReturnType
at(
size_t i,
size_t j )
const {
387 if( i >=
lhs_.rows() ) {
390 if( j >=
rhs_.columns() ) {
402 inline size_t rows() const noexcept {
413 return rhs_.columns();
// Returns true when either operand reports being aliased with the given address.
// Note that this queries isAliased() on both operands (not canAlias()), so the
// result is the same as that of isAliased() below.
443 template<
typename T >
444 inline bool canAlias(
const T* alias )
const noexcept {
445 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns true when the left-hand or the right-hand operand is aliased with the
// given address; the two operand queries are combined with a logical OR.
455 template<
typename T >
456 inline bool isAliased(
const T* alias )
const noexcept {
457 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
467 return lhs_.isAligned() &&
rhs_.isAligned();
478 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
479 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
// Entry point that assigns the product to a target matrix (lhs).
// Two degenerate cases are tested before any kernel runs: a target with zero rows
// or zero columns, and an inner dimension of zero (rhs.lhs_.columns() == 0UL).
// NOTE(review): the bodies of both branches are not shown — presumably an early
// return and a reset of the target, respectively; confirm.
503 template<
typename MT
512 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
515 else if( rhs.
lhs_.columns() == 0UL ) {
// Regular case: hand the target and the operands A and B to the kernel dispatcher.
530 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for the plain assignment C = A * B.
// The visible part of the condition compares the element count of C
// (rows * columns) against DMATTDMATMULT_THRESHOLD; the small kernel is called
// directly after that condition and the BLAS kernel after it.
// NOTE(review): the start of the condition and the else keyword are not shown —
// presumably the BLAS kernel is the else branch; confirm.
546 template<
typename MT3
549 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
552 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
553 selectSmallAssignKernel( C, A, B );
555 selectBlasAssignKernel( C, A, B );
// Scalar (non-vectorized) assignment kernel computing C = A * B with the row
// index i as the outer loop.
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAssignKernel; confirm.
574 template<
typename MT3
// M x K times K x N: M and N are the dimensions of the result, K the inner dimension.
580 const size_t M( A.rows() );
581 const size_t N( B.columns() );
582 const size_t K( A.columns() );
// Rows [0,ibegin) lie outside the computed range and are visited element by element.
// NOTE(review): the loop body is not shown — presumably the elements are reset; confirm.
594 for(
size_t i=0UL; i<ibegin; ++i ) {
595 for(
size_t j=0UL; j<N; ++j ) {
// Rows [ibegin,iend) hold the actual products. For each row a column window
// [jbegin,jend) is derived from the SYM/HERM/LOW/UPP flags: with SYM, HERM or UPP
// the window starts at the diagonal (i), with LOW it ends right after it (i+1).
599 for(
size_t i=ibegin; i<iend; ++i )
606 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
607 :( SYM || HERM || UPP ? i : 0UL ) ) );
613 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
614 :( LOW ? i+1UL : N ) ) );
// An inverted window (jbegin > jend) can only occur for structured results; the
// complete row is then walked separately.
616 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
617 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of the window; for SYM/HERM the walk starts at the diagonal
// because the part below it is mirrored in the final loop nest.
625 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
628 for(
size_t j=jbegin; j<jend; ++j )
// Dot product over [kbegin,kend): the first product initializes C(i,j), the
// remaining products are accumulated.
648 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
649 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
650 (~C)(i,j) += A(i,k) * B(k,j);
// Columns behind the window.
653 for(
size_t j=jend; j<N; ++j ) {
// Rows [iend,M) lie outside the computed range.
657 for(
size_t i=iend; i<M; ++i ) {
658 for(
size_t j=0UL; j<N; ++j ) {
// Mirror step: every element below the diagonal (j < i) is copied from its
// transposed position (j,i), conjugated when HERM is set.
// NOTE(review): the guard around this loop nest is not shown — presumably
// SYM || HERM; confirm.
664 for(
size_t i=1UL; i<M; ++i ) {
665 for(
size_t j=0UL; j<i; ++j ) {
666 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Scalar (non-vectorized) assignment kernel computing C = A * B with the column
// index j as the outer loop; the counterpart of the variant that iterates rows first.
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAssignKernel; confirm.
688 template<
typename MT3
// M x K times K x N: M and N are the dimensions of the result, K the inner dimension.
694 const size_t M( A.rows() );
695 const size_t N( B.columns() );
696 const size_t K( A.columns() );
// Columns [0,jbegin) lie outside the computed range and are visited element by element.
// NOTE(review): the loop body is not shown — presumably the elements are reset; confirm.
708 for(
size_t j=0UL; j<jbegin; ++j ) {
709 for(
size_t i=0UL; i<M; ++i ) {
// Columns [jbegin,jend) hold the actual products. For each column a row window
// [ibegin,iend) is derived from the SYM/HERM/LOW/UPP flags: with SYM, HERM or LOW
// the window starts at the diagonal (j), with UPP it ends right after it (j+1).
713 for(
size_t j=jbegin; j<jend; ++j )
720 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
721 :( SYM || HERM || LOW ? j : 0UL ) ) );
727 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
728 :( UPP ? j+1UL : M ) ) );
// An inverted window (ibegin > iend) can only occur for structured results; the
// complete column is then walked separately.
730 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
731 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of the window; for SYM/HERM the walk starts at the diagonal
// because the part above it is mirrored in the final loop nest.
739 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
742 for(
size_t i=ibegin; i<iend; ++i )
// Dot product over [kbegin,kend): the first product initializes C(i,j), the
// remaining products are accumulated.
762 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
763 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
764 (~C)(i,j) += A(i,k) * B(k,j);
// Rows behind the window.
767 for(
size_t i=iend; i<M; ++i ) {
// Columns [jend,N) lie outside the computed range.
771 for(
size_t j=jend; j<N; ++j ) {
772 for(
size_t i=0UL; i<M; ++i ) {
// Mirror step: every element above the diagonal (i < j) is copied from its
// transposed position (j,i), conjugated when HERM is set.
// NOTE(review): the guard around this loop nest is not shown — presumably
// SYM || HERM; confirm.
778 for(
size_t j=1UL; j<N; ++j ) {
779 for(
size_t i=0UL; i<j; ++i ) {
780 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Assignment kernel enabled when MT4 (type of A) is not diagonal and MT5 (type
// of B) is diagonal. Because only B(j,j) is read from B, each result element is
// the single product A(i,j) * B(j,j).
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAssignKernel; confirm.
802 template<
typename MT3
805 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
808 const size_t M( A.rows() );
809 const size_t N( B.columns() );
// Row-wise traversal; each row is split into [0,jbegin), [jbegin,jend) and [jend,N).
811 for(
size_t i=0UL; i<M; ++i )
// Leading columns (loop body not shown — presumably reset; confirm).
822 for(
size_t j=0UL; j<jbegin; ++j ) {
// Computed columns.
826 for(
size_t j=jbegin; j<jend; ++j ) {
827 (~C)(i,j) = A(i,j) * B(j,j);
// Trailing columns (loop body not shown — presumably reset; confirm).
830 for(
size_t j=jend; j<N; ++j ) {
// Blocked assignment kernel enabled when MT4 (type of A) is not diagonal and MT5
// (type of B) is diagonal. The result is traversed in tiles of BLOCK_SIZE x
// BLOCK_SIZE; each computed element is the single product A(i,j) * B(j,j).
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAssignKernel; confirm.
853 template<
typename MT3
856 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
859 constexpr
size_t block( BLOCK_SIZE );
861 const size_t M( A.rows() );
862 const size_t N( B.columns() );
// Tile loops: jj/ii are the tile origins, jend/iend the tile ends clamped to N and M.
864 for(
size_t jj=0UL; jj<N; jj+=block ) {
865 const size_t jend(
min( N, jj+block ) );
866 for(
size_t ii=0UL; ii<M; ii+=block ) {
867 const size_t iend(
min( M, ii+block ) );
// Within a tile the columns are walked one by one; the rows of each column are
// split into [ii,ibegin), [ibegin,ipos) and [ipos,iend).
868 for(
size_t j=jj; j<jend; ++j )
// Leading rows of the tile (loop body not shown — presumably reset; confirm).
878 for(
size_t i=ii; i<ibegin; ++i ) {
// Computed rows of the tile.
882 for(
size_t i=ibegin; i<ipos; ++i ) {
883 (~C)(i,j) = A(i,j) * B(j,j);
// Trailing rows of the tile (loop body not shown — presumably reset; confirm).
886 for(
size_t i=ipos; i<iend; ++i ) {
// Blocked assignment kernel that reads only the diagonal of A: each computed
// element is the single product A(i,i) * B(i,j). The result is traversed in
// tiles of BLOCK_SIZE x BLOCK_SIZE.
// NOTE(review): neither the function name nor the enabling condition is shown in
// this span — presumably the overload for a diagonal A and non-diagonal B; confirm.
911 template<
typename MT3
917 constexpr
size_t block( BLOCK_SIZE );
919 const size_t M( A.rows() );
920 const size_t N( B.columns() );
// Tile loops: ii/jj are the tile origins, iend/jend the tile ends clamped to M and N.
922 for(
size_t ii=0UL; ii<M; ii+=block ) {
923 const size_t iend(
min( M, ii+block ) );
924 for(
size_t jj=0UL; jj<N; jj+=block ) {
925 const size_t jend(
min( N, jj+block ) );
// Within a tile the rows are walked one by one; the columns of each row are
// split into [jj,jbegin), [jbegin,jpos) and [jpos,jend).
926 for(
size_t i=ii; i<iend; ++i )
// Leading columns of the tile (loop body not shown — presumably reset; confirm).
936 for(
size_t j=jj; j<jbegin; ++j ) {
// Computed columns of the tile.
940 for(
size_t j=jbegin; j<jpos; ++j ) {
941 (~C)(i,j) = A(i,i) * B(i,j);
// Trailing columns of the tile (loop body not shown — presumably reset; confirm).
944 for(
size_t j=jpos; j<jend; ++j ) {
// Assignment kernel that reads only the diagonal of A: each computed element is
// the single product A(i,i) * B(i,j). Columns form the outer loop.
// NOTE(review): neither the function name nor the enabling condition is shown in
// this span — presumably the overload for a diagonal A and non-diagonal B; confirm.
969 template<
typename MT3
975 const size_t M( A.rows() );
976 const size_t N( B.columns() );
// Column-wise traversal; each column is split into [0,ibegin), [ibegin,iend) and [iend,M).
978 for(
size_t j=0UL; j<N; ++j )
// Leading rows (loop body not shown — presumably reset; confirm).
989 for(
size_t i=0UL; i<ibegin; ++i ) {
// Computed rows.
993 for(
size_t i=ibegin; i<iend; ++i ) {
994 (~C)(i,j) = A(i,i) * B(i,j);
// Trailing rows (loop body not shown — presumably reset; confirm).
997 for(
size_t i=iend; i<M; ++i ) {
// Assignment kernel that touches only the diagonal of the result: for every row
// index i of A, C(i,i) is set to the product A(i,i) * B(i,i).
// NOTE(review): the enabling condition is not shown — presumably both operands
// are diagonal; confirm. Any preparation of C before the loop is not shown either.
1020 template<
typename MT3
1024 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1028 for(
size_t i=0UL; i<A.rows(); ++i ) {
1029 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel overload that performs no work of its own and
// forwards C, A and B unchanged to selectDefaultAssignKernel.
// NOTE(review): the condition selecting this overload is not shown — presumably
// the case in which the vectorized kernel cannot be used; confirm.
1049 template<
typename MT3
1053 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1055 selectDefaultAssignKernel( C, A, B );
1075 template<
typename MT3
1083 const size_t M( A.rows() );
1084 const size_t N( B.columns() );
1085 const size_t K( A.columns() );
1096 for( ; !( LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
1098 const size_t jend( LOW ? i+2UL : N );
1099 size_t j( SYM || HERM || UPP ? i : 0UL );
1101 for( ; (j+4UL) <= jend; j+=4UL )
1110 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1111 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1113 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1116 for( ; k<kpos; k+=SIMDSIZE ) {
1117 const SIMDType a1( A.load(i ,k) );
1118 const SIMDType a2( A.load(i+1UL,k) );
1119 const SIMDType b1( B.load(k,j ) );
1120 const SIMDType b2( B.load(k,j+1UL) );
1121 const SIMDType b3( B.load(k,j+2UL) );
1122 const SIMDType b4( B.load(k,j+3UL) );
1133 (~C)(i ,j ) =
sum( xmm1 );
1134 (~C)(i ,j+1UL) =
sum( xmm2 );
1135 (~C)(i ,j+2UL) =
sum( xmm3 );
1136 (~C)(i ,j+3UL) =
sum( xmm4 );
1137 (~C)(i+1UL,j ) =
sum( xmm5 );
1138 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
1139 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
1140 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
1142 for( ; remainder && k<kend; ++k ) {
1143 (~C)(i ,j ) += A(i ,k) * B(k,j );
1144 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1145 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1146 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1147 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1148 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1149 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1150 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
1154 for( ; (j+2UL) <= jend; j+=2UL )
1163 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1164 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1166 SIMDType xmm1, xmm2, xmm3, xmm4;
1169 for( ; k<kpos; k+=SIMDSIZE ) {
1170 const SIMDType a1( A.load(i ,k) );
1171 const SIMDType a2( A.load(i+1UL,k) );
1172 const SIMDType b1( B.load(k,j ) );
1173 const SIMDType b2( B.load(k,j+1UL) );
1180 (~C)(i ,j ) =
sum( xmm1 );
1181 (~C)(i ,j+1UL) =
sum( xmm2 );
1182 (~C)(i+1UL,j ) =
sum( xmm3 );
1183 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1185 for( ; remainder && k<kend; ++k ) {
1186 (~C)(i ,j ) += A(i ,k) * B(k,j );
1187 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1188 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1189 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1200 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1201 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1203 SIMDType xmm1, xmm2;
1206 for( ; k<kpos; k+=SIMDSIZE ) {
1207 const SIMDType b1( B.load(k,j) );
1208 xmm1 += A.load(i ,k) * b1;
1209 xmm2 += A.load(i+1UL,k) * b1;
1212 (~C)(i ,j) =
sum( xmm1 );
1213 (~C)(i+1UL,j) =
sum( xmm2 );
1215 for( ; remainder && k<kend; ++k ) {
1216 (~C)(i ,j) += A(i ,k) * B(k,j);
1217 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1224 const size_t jend( LOW ? i+1UL : N );
1225 size_t j( SYM || HERM || UPP ? i : 0UL );
1227 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
1234 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1235 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1237 SIMDType xmm1, xmm2, xmm3, xmm4;
1240 for( ; k<kpos; k+=SIMDSIZE ) {
1241 const SIMDType a1( A.load(i,k) );
1242 xmm1 += a1 * B.load(k,j );
1243 xmm2 += a1 * B.load(k,j+1UL);
1244 xmm3 += a1 * B.load(k,j+2UL);
1245 xmm4 += a1 * B.load(k,j+3UL);
1248 (~C)(i,j ) =
sum( xmm1 );
1249 (~C)(i,j+1UL) =
sum( xmm2 );
1250 (~C)(i,j+2UL) =
sum( xmm3 );
1251 (~C)(i,j+3UL) =
sum( xmm4 );
1253 for( ; remainder && k<kend; ++k ) {
1254 (~C)(i,j ) += A(i,k) * B(k,j );
1255 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1256 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
1257 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
1261 for( ; !( LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1268 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1269 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1271 SIMDType xmm1, xmm2;
1274 for( ; k<kpos; k+=SIMDSIZE ) {
1275 const SIMDType a1( A.load(i,k) );
1276 xmm1 += a1 * B.load(k,j );
1277 xmm2 += a1 * B.load(k,j+1UL);
1280 (~C)(i,j ) =
sum( xmm1 );
1281 (~C)(i,j+1UL) =
sum( xmm2 );
1283 for( ; remainder && k<kend; ++k ) {
1284 (~C)(i,j ) += A(i,k) * B(k,j );
1285 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1289 for( ; j<jend; ++j )
1295 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
1296 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1301 for( ; k<kpos; k+=SIMDSIZE ) {
1302 xmm1 += A.load(i,k) * B.load(k,j);
1305 (~C)(i,j) =
sum( xmm1 );
1307 for( ; remainder && k<K; ++k ) {
1308 (~C)(i,j) += A(i,k) * B(k,j);
1315 for(
size_t i=2UL; i<M; ++i ) {
1316 const size_t jend( 2UL * ( i/2UL ) );
1317 for(
size_t j=0UL; j<jend; ++j ) {
1318 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1322 else if( LOW && !UPP ) {
1323 for(
size_t j=2UL; j<N; ++j ) {
1324 const size_t iend( 2UL * ( j/2UL ) );
1325 for(
size_t i=0UL; i<iend; ++i ) {
1330 else if( !LOW && UPP ) {
1331 for(
size_t i=2UL; i<M; ++i ) {
1332 const size_t jend( 2UL * ( i/2UL ) );
1333 for(
size_t j=0UL; j<jend; ++j ) {
1357 template<
typename MT3
1365 const size_t M( A.rows() );
1366 const size_t N( B.columns() );
1367 const size_t K( A.columns() );
1378 for( ; !( LOW &&
UPP ) && (i+4UL) <= M; i+=4UL )
1380 const size_t jend( SYM || HERM || LOW ? i+4UL : N );
1381 size_t j( UPP ? i : 0UL );
1383 for( ; (j+2UL) <= jend; j+=2UL )
1392 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1393 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1395 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1398 for( ; k<kpos; k+=SIMDSIZE ) {
1399 const SIMDType a1( A.load(i ,k) );
1400 const SIMDType a2( A.load(i+1UL,k) );
1401 const SIMDType a3( A.load(i+2UL,k) );
1402 const SIMDType a4( A.load(i+3UL,k) );
1403 const SIMDType b1( B.load(k,j ) );
1404 const SIMDType b2( B.load(k,j+1UL) );
1415 (~C)(i ,j ) =
sum( xmm1 );
1416 (~C)(i ,j+1UL) =
sum( xmm2 );
1417 (~C)(i+1UL,j ) =
sum( xmm3 );
1418 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1419 (~C)(i+2UL,j ) =
sum( xmm5 );
1420 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
1421 (~C)(i+3UL,j ) =
sum( xmm7 );
1422 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
1424 for( ; remainder && k<kend; ++k ) {
1425 (~C)(i ,j ) += A(i ,k) * B(k,j );
1426 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1427 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1428 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1429 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1430 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1431 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1432 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
1443 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1444 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1446 SIMDType xmm1, xmm2, xmm3, xmm4;
1449 for( ; k<kpos; k+=SIMDSIZE ) {
1450 const SIMDType b1( B.load(k,j) );
1451 xmm1 += A.load(i ,k) * b1;
1452 xmm2 += A.load(i+1UL,k) * b1;
1453 xmm3 += A.load(i+2UL,k) * b1;
1454 xmm4 += A.load(i+3UL,k) * b1;
1457 (~C)(i ,j) =
sum( xmm1 );
1458 (~C)(i+1UL,j) =
sum( xmm2 );
1459 (~C)(i+2UL,j) =
sum( xmm3 );
1460 (~C)(i+3UL,j) =
sum( xmm4 );
1462 for( ; remainder && k<kend; ++k ) {
1463 (~C)(i ,j) += A(i ,k) * B(k,j);
1464 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1465 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
1466 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
1471 for( ; !( LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
1475 for( ; (j+2UL) <= N; j+=2UL )
1484 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1485 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1487 SIMDType xmm1, xmm2, xmm3, xmm4;
1490 for( ; k<kpos; k+=SIMDSIZE ) {
1491 const SIMDType a1( A.load(i ,k) );
1492 const SIMDType a2( A.load(i+1UL,k) );
1493 const SIMDType b1( B.load(k,j ) );
1494 const SIMDType b2( B.load(k,j+1UL) );
1501 (~C)(i ,j ) =
sum( xmm1 );
1502 (~C)(i ,j+1UL) =
sum( xmm2 );
1503 (~C)(i+1UL,j ) =
sum( xmm3 );
1504 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1506 for( ; remainder && k<kend; ++k ) {
1507 (~C)(i ,j ) += A(i ,k) * B(k,j );
1508 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1509 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1510 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1521 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1522 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1524 SIMDType xmm1, xmm2;
1527 for( ; k<kpos; k+=SIMDSIZE ) {
1528 const SIMDType b1( B.load(k,j) );
1529 xmm1 += A.load(i ,k) * b1;
1530 xmm2 += A.load(i+1UL,k) * b1;
1533 (~C)(i ,j) =
sum( xmm1 );
1534 (~C)(i+1UL,j) =
sum( xmm2 );
1536 for( ; remainder && k<kend; ++k ) {
1537 (~C)(i ,j) += A(i ,k) * B(k,j);
1538 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1545 const size_t jend( LOW && UPP ? i+1UL : N );
1546 size_t j( LOW && UPP ? i : 0UL );
1548 for( ; !( LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1555 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1556 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1558 SIMDType xmm1, xmm2;
1561 for( ; k<kpos; k+=SIMDSIZE ) {
1562 const SIMDType a1( A.load(i,k) );
1563 xmm1 += a1 * B.load(k,j );
1564 xmm2 += a1 * B.load(k,j+1UL);
1567 (~C)(i,j ) =
sum( xmm1 );
1568 (~C)(i,j+1UL) =
sum( xmm2 );
1570 for( ; remainder && k<kend; ++k ) {
1571 (~C)(i,j ) += A(i,k) * B(k,j );
1572 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1576 for( ; j<jend; ++j )
1582 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
1583 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1588 for( ; k<kpos; k+=SIMDSIZE ) {
1589 xmm1 += A.load(i,k) * B.load(k,j);
1592 (~C)(i,j) =
sum( xmm1 );
1594 for( ; remainder && k<K; ++k ) {
1595 (~C)(i,j) += A(i,k) * B(k,j);
1601 if( ( SYM || HERM ) && ( N > 4UL ) ) {
1602 for(
size_t j=4UL; j<N; ++j ) {
1603 const size_t iend( 4UL * ( j/4UL ) );
1604 for(
size_t i=0UL; i<iend; ++i ) {
1605 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1609 else if( LOW && !UPP ) {
1610 for(
size_t j=4UL; j<N; ++j ) {
1611 const size_t iend( 4UL * ( j/4UL ) );
1612 for(
size_t i=0UL; i<iend; ++i ) {
1617 else if( !LOW && UPP ) {
1618 for(
size_t i=4UL; i<N; ++i ) {
1619 const size_t jend( 4UL * ( i/4UL ) );
1620 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel overload that performs no work of its own and
// forwards C, A and B unchanged to selectDefaultAssignKernel.
// NOTE(review): the condition selecting this overload is not shown — presumably
// the case in which the vectorized kernel cannot be used; confirm.
1643 template<
typename MT3
1647 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1649 selectDefaultAssignKernel( C, A, B );
1669 template<
typename MT3
1673 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS assignment kernel overload that does not call into BLAS: it forwards C, A
// and B unchanged to selectLargeAssignKernel.
// NOTE(review): the condition selecting this overload is not shown — presumably
// the case in which the BLAS kernel is not applicable; confirm.
1703 template<
typename MT3
1707 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1709 selectLargeAssignKernel( C, A, B );
// BLAS-backed assignment kernel. It is compiled only when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
// The visible call passes the scalars ET(1) and ET(0) to gemm.
// NOTE(review): presumably these are alpha = 1 and beta = 0, i.e. C is overwritten
// with A * B — confirm against the gemm wrapper's parameter order.
1715 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1729 template<
typename MT3
1733 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1746 gemm( C, A, B, ET(1), ET(0) );
// Assignment through an intermediate temporary: the expression rhs is evaluated
// via serial( rhs ) into a temporary of type TmpType, and the temporary, passed
// through the ForwardFunctor, is then assigned to the target.
// NOTE(review): the function name and the type of lhs are not shown in this span
// — presumably the overload for sparse targets; confirm.
1766 template<
typename MT
1784 const ForwardFunctor fwd;
1786 const TmpType tmp(
serial( rhs ) );
1787 assign( ~lhs, fwd( tmp ) );
// Entry point that adds the product to a target matrix (lhs).
// A single check covers all degenerate cases: a target with zero rows or zero
// columns, or an inner dimension of zero (rhs.lhs_.columns() == 0UL).
// NOTE(review): the branch body is not shown — presumably an early return, since
// adding an empty product leaves the target unchanged; confirm.
1805 template<
typename MT
1814 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
// Regular case: hand the target and the operands A and B to the kernel dispatcher.
1828 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for the addition assignment C += A * B.
// The visible part of the condition compares the element count of C
// (rows * columns) against DMATTDMATMULT_THRESHOLD; the small kernel is called
// directly after that condition and the BLAS kernel after it.
// NOTE(review): the start of the condition and the else keyword are not shown —
// presumably the BLAS kernel is the else branch; confirm.
1844 template<
typename MT3
1847 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1850 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1851 selectSmallAddAssignKernel( C, A, B );
1853 selectBlasAddAssignKernel( C, A, B );
// Scalar (non-vectorized) addition assignment kernel computing C += A * B with
// the row index i as the outer loop. Elements outside the computed windows are
// not visited by any loop shown here.
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAddAssignKernel; confirm.
1872 template<
typename MT3
// M x K times K x N: M and N are the dimensions of the result, K the inner dimension.
1878 const size_t M( A.rows() );
1879 const size_t N( B.columns() );
1880 const size_t K( A.columns() );
// For each row in [ibegin,iend) a column window [jbegin,jend) is derived from the
// LOW/UPP flags: with UPP the window starts at the diagonal (i), with LOW it ends
// right after it (i+1).
1892 for(
size_t i=ibegin; i<iend; ++i )
1899 ?( UPP ?
max( i, 1UL ) : 1UL )
1900 :( UPP ? i : 0UL ) ) );
1906 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
1907 :( LOW ? i+1UL : N ) ) );
// An inverted window contributes nothing to this row, so the row is skipped.
1909 if( ( LOW || UPP ) && ( jbegin > jend ) )
continue;
1912 for(
size_t j=jbegin; j<jend; ++j )
// The k-range [kbegin,kend) is processed two products per iteration; kpos marks
// the end of the even-sized part (knum with its lowest bit cleared).
1932 const size_t knum( kend - kbegin );
1933 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1935 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1936 (~C)(i,j) += A(i,k ) * B(k ,j);
1937 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover product for an odd-sized k-range.
// NOTE(review): the guard is not shown — presumably kpos < kend; confirm.
1940 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Scalar (non-vectorized) addition assignment kernel computing C += A * B with
// the column index j as the outer loop. Elements outside the computed windows
// are not visited by any loop shown here.
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAddAssignKernel; confirm.
1962 template<
typename MT3
// M x K times K x N: M and N are the dimensions of the result, K the inner dimension.
1968 const size_t M( A.rows() );
1969 const size_t N( B.columns() );
1970 const size_t K( A.columns() );
// For each column in [jbegin,jend) a row window [ibegin,iend) is derived from the
// LOW/UPP flags: with LOW the window starts at the diagonal (j), with UPP it ends
// right after it (j+1).
1982 for(
size_t j=jbegin; j<jend; ++j )
1989 ?( LOW ?
max( j, 1UL ) : 1UL )
1990 :( LOW ? j : 0UL ) ) );
1996 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
1997 :( UPP ? j+1UL : M ) ) );
// An inverted window contributes nothing to this column, so the column is skipped.
1999 if( ( LOW || UPP ) && ( ibegin > iend ) )
continue;
2002 for(
size_t i=ibegin; i<iend; ++i )
// The k-range [kbegin,kend) is processed two products per iteration; kpos marks
// the end of the even-sized part (knum with its lowest bit cleared).
2022 const size_t knum( kend - kbegin );
2023 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
2025 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2026 (~C)(i,j) += A(i,k ) * B(k ,j);
2027 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover product for an odd-sized k-range.
// NOTE(review): the guard is not shown — presumably kpos < kend; confirm.
2030 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Addition assignment kernel enabled when MT4 (type of A) is not diagonal and
// MT5 (type of B) is diagonal. Because only B(j,j) is read from B, each update
// is the single product A(i,j) * B(j,j).
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAddAssignKernel; confirm.
2052 template<
typename MT3
2055 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2058 const size_t M( A.rows() );
2059 const size_t N( B.columns() );
2061 for(
size_t i=0UL; i<M; ++i )
// The column range [jbegin,jend) is processed two columns per iteration; jpos
// marks the end of the even-sized part (jnum with its lowest bit cleared).
2071 const size_t jnum( jend - jbegin );
2072 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2074 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2075 (~C)(i,j ) += A(i,j ) * B(j ,j );
2076 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column for an odd-sized range.
// NOTE(review): the guard is not shown — presumably jpos < jend; confirm.
2079 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Blocked addition assignment kernel enabled when MT4 (type of A) is not diagonal
// and MT5 (type of B) is diagonal. The result is traversed in tiles of BLOCK_SIZE
// x BLOCK_SIZE; each update is the single product A(i,j) * B(j,j).
// NOTE(review): the function name is not shown in this span — presumably an
// overload of selectDefaultAddAssignKernel; confirm.
2100 template<
typename MT3
2103 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2106 constexpr
size_t block( BLOCK_SIZE );
2108 const size_t M( A.rows() );
2109 const size_t N( B.columns() );
// Tile loops: jj/ii are the tile origins, jend/iend the tile ends clamped to N and M.
2111 for(
size_t jj=0UL; jj<N; jj+=block ) {
2112 const size_t jend(
min( N, jj+block ) );
2113 for(
size_t ii=0UL; ii<M; ii+=block ) {
2114 const size_t iend(
min( M, ii+block ) );
2115 for(
size_t j=jj; j<jend; ++j )
// Only the rows in [ibegin,ipos) of the current column receive a contribution.
2124 for(
size_t i=ibegin; i<ipos; ++i ) {
2125 (~C)(i,j) += A(i,j) * B(j,j);
// Blocked addition assignment kernel that reads only the diagonal of A: each
// update is the single product A(i,i) * B(i,j). The result is traversed in tiles
// of BLOCK_SIZE x BLOCK_SIZE.
// NOTE(review): neither the function name nor the enabling condition is shown in
// this span — presumably the overload for a diagonal A and non-diagonal B; confirm.
2148 template<
typename MT3
2154 constexpr
size_t block( BLOCK_SIZE );
2156 const size_t M( A.rows() );
2157 const size_t N( B.columns() );
// Tile loops: ii/jj are the tile origins, iend/jend the tile ends clamped to M and N.
2159 for(
size_t ii=0UL; ii<M; ii+=block ) {
2160 const size_t iend(
min( M, ii+block ) );
2161 for(
size_t jj=0UL; jj<N; jj+=block ) {
2162 const size_t jend(
min( N, jj+block ) );
2163 for(
size_t i=ii; i<iend; ++i )
// Only the columns in [jbegin,jpos) of the current row receive a contribution.
2172 for(
size_t j=jbegin; j<jpos; ++j ) {
2173 (~C)(i,j) += A(i,i) * B(i,j);
// Addition assignment kernel that reads only the diagonal of A: each update is
// the single product A(i,i) * B(i,j). Columns form the outer loop.
// NOTE(review): neither the function name nor the enabling condition is shown in
// this span — presumably the overload for a diagonal A and non-diagonal B; confirm.
2196 template<
typename MT3
2202 const size_t M( A.rows() );
2203 const size_t N( B.columns() );
2205 for(
size_t j=0UL; j<N; ++j )
// The row range [ibegin,iend) is processed two rows per iteration; ipos marks the
// end of the even-sized part (inum with its lowest bit cleared).
2215 const size_t inum( iend - ibegin );
2216 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2218 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2219 (~C)(i ,j) += A(i ,i ) * B(i ,j);
2220 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row for an odd-sized range.
// NOTE(review): the guard is not shown — presumably ipos < iend; confirm.
2223 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Addition assignment kernel that touches only the diagonal of the result: for
// every row index i of A, the product A(i,i) * B(i,i) is added to C(i,i).
// NOTE(review): the enabling condition is not shown — presumably both operands
// are diagonal; confirm.
2244 template<
typename MT3
2248 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2250 for(
size_t i=0UL; i<A.rows(); ++i ) {
2251 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel overload that performs no work of its
// own and forwards C, A and B unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the condition selecting this overload is not shown — presumably
// the case in which the vectorized kernel cannot be used; confirm.
2271 template<
typename MT3
2275 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2277 selectDefaultAddAssignKernel( C, A, B );
2297 template<
typename MT3
2305 const size_t M( A.rows() );
2306 const size_t N( B.columns() );
2307 const size_t K( A.columns() );
2313 for( ; (i+2UL) <= M; i+=2UL )
2315 const size_t jend( LOW ? i+2UL : N );
2316 size_t j( UPP ? i : 0UL );
2318 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
2327 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2328 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2330 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2333 for( ; k<kpos; k+=SIMDSIZE ) {
2334 const SIMDType a1( A.load(i ,k) );
2335 const SIMDType a2( A.load(i+1UL,k) );
2336 const SIMDType b1( B.load(k,j ) );
2337 const SIMDType b2( B.load(k,j+1UL) );
2338 const SIMDType b3( B.load(k,j+2UL) );
2339 const SIMDType b4( B.load(k,j+3UL) );
2350 (~C)(i ,j ) +=
sum( xmm1 );
2351 (~C)(i ,j+1UL) +=
sum( xmm2 );
2352 (~C)(i ,j+2UL) +=
sum( xmm3 );
2353 (~C)(i ,j+3UL) +=
sum( xmm4 );
2354 (~C)(i+1UL,j ) +=
sum( xmm5 );
2355 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
2356 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
2357 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
2359 for( ; remainder && k<kend; ++k ) {
2360 (~C)(i ,j ) += A(i ,k) * B(k,j );
2361 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2362 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2363 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2364 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2365 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2366 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2367 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
2371 for( ; (j+2UL) <= jend; j+=2UL )
2380 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2381 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2383 SIMDType xmm1, xmm2, xmm3, xmm4;
2386 for( ; k<kpos; k+=SIMDSIZE ) {
2387 const SIMDType a1( A.load(i ,k) );
2388 const SIMDType a2( A.load(i+1UL,k) );
2389 const SIMDType b1( B.load(k,j ) );
2390 const SIMDType b2( B.load(k,j+1UL) );
2397 (~C)(i ,j ) +=
sum( xmm1 );
2398 (~C)(i ,j+1UL) +=
sum( xmm2 );
2399 (~C)(i+1UL,j ) +=
sum( xmm3 );
2400 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2402 for( ; remainder && k<kend; ++k ) {
2403 (~C)(i ,j ) += A(i ,k) * B(k,j );
2404 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2405 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2406 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2417 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2418 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2420 SIMDType xmm1, xmm2;
2423 for( ; k<kpos; k+=SIMDSIZE ) {
2424 const SIMDType b1( B.load(k,j) );
2425 xmm1 += A.load(i ,k) * b1;
2426 xmm2 += A.load(i+1UL,k) * b1;
2429 (~C)(i ,j) +=
sum( xmm1 );
2430 (~C)(i+1UL,j) +=
sum( xmm2 );
2432 for( ; remainder && k<kend; ++k ) {
2433 (~C)(i ,j) += A(i ,k) * B(k,j);
2434 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2441 const size_t jend( LOW ? i+1UL : N );
2442 size_t j( UPP ? i : 0UL );
2444 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
2451 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2452 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2454 SIMDType xmm1, xmm2, xmm3, xmm4;
2457 for( ; k<kpos; k+=SIMDSIZE ) {
2458 const SIMDType a1( A.load(i,k) );
2459 xmm1 += a1 * B.load(k,j );
2460 xmm2 += a1 * B.load(k,j+1UL);
2461 xmm3 += a1 * B.load(k,j+2UL);
2462 xmm4 += a1 * B.load(k,j+3UL);
2465 (~C)(i,j ) +=
sum( xmm1 );
2466 (~C)(i,j+1UL) +=
sum( xmm2 );
2467 (~C)(i,j+2UL) +=
sum( xmm3 );
2468 (~C)(i,j+3UL) +=
sum( xmm4 );
2470 for( ; remainder && k<kend; ++k ) {
2471 (~C)(i,j ) += A(i,k) * B(k,j );
2472 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2473 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
2474 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
2478 for( ; (j+2UL) <= jend; j+=2UL )
2485 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2486 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2488 SIMDType xmm1, xmm2;
2491 for( ; k<kpos; k+=SIMDSIZE ) {
2492 const SIMDType a1( A.load(i,k) );
2493 xmm1 += a1 * B.load(k,j );
2494 xmm2 += a1 * B.load(k,j+1UL);
2497 (~C)(i,j ) +=
sum( xmm1 );
2498 (~C)(i,j+1UL) +=
sum( xmm2 );
2500 for( ; remainder && k<kend; ++k ) {
2501 (~C)(i,j ) += A(i,k) * B(k,j );
2502 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2512 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
2513 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2518 for( ; k<kpos; k+=SIMDSIZE ) {
2519 xmm1 += A.load(i,k) * B.load(k,j);
2522 (~C)(i,j) +=
sum( xmm1 );
2524 for( ; remainder && k<K; ++k ) {
2525 (~C)(i,j) += A(i,k) * B(k,j);
2548 template<
typename MT3
2556 const size_t M( A.rows() );
2557 const size_t N( B.columns() );
2558 const size_t K( A.columns() );
2564 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
2568 for( ; (j+2UL) <= N; j+=2UL )
2577 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2578 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2580 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2583 for( ; k<kpos; k+=SIMDSIZE ) {
2584 const SIMDType a1( A.load(i ,k) );
2585 const SIMDType a2( A.load(i+1UL,k) );
2586 const SIMDType a3( A.load(i+2UL,k) );
2587 const SIMDType a4( A.load(i+3UL,k) );
2588 const SIMDType b1( B.load(k,j ) );
2589 const SIMDType b2( B.load(k,j+1UL) );
2600 (~C)(i ,j ) +=
sum( xmm1 );
2601 (~C)(i ,j+1UL) +=
sum( xmm2 );
2602 (~C)(i+1UL,j ) +=
sum( xmm3 );
2603 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2604 (~C)(i+2UL,j ) +=
sum( xmm5 );
2605 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
2606 (~C)(i+3UL,j ) +=
sum( xmm7 );
2607 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
2609 for( ; remainder && k<kend; ++k ) {
2610 (~C)(i ,j ) += A(i ,k) * B(k,j );
2611 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2612 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2613 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2614 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2615 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2616 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2617 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
2628 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2629 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2631 SIMDType xmm1, xmm2, xmm3, xmm4;
2634 for( ; k<kpos; k+=SIMDSIZE ) {
2635 const SIMDType b1( B.load(k,j) );
2636 xmm1 += A.load(i ,k) * b1;
2637 xmm2 += A.load(i+1UL,k) * b1;
2638 xmm3 += A.load(i+2UL,k) * b1;
2639 xmm4 += A.load(i+3UL,k) * b1;
2642 (~C)(i ,j) +=
sum( xmm1 );
2643 (~C)(i+1UL,j) +=
sum( xmm2 );
2644 (~C)(i+2UL,j) +=
sum( xmm3 );
2645 (~C)(i+3UL,j) +=
sum( xmm4 );
2647 for( ; remainder && k<kend; ++k ) {
2648 (~C)(i ,j) += A(i ,k) * B(k,j);
2649 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2650 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
2651 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
2656 for( ; (i+2UL) <= M; i+=2UL )
2658 const size_t jend( LOW ? i+2UL : N );
2659 size_t j( UPP ? i : 0UL );
2661 for( ; (j+2UL) <= jend; j+=2UL )
2670 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2671 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2673 SIMDType xmm1, xmm2, xmm3, xmm4;
2676 for( ; k<kpos; k+=SIMDSIZE ) {
2677 const SIMDType a1( A.load(i ,k) );
2678 const SIMDType a2( A.load(i+1UL,k) );
2679 const SIMDType b1( B.load(k,j ) );
2680 const SIMDType b2( B.load(k,j+1UL) );
2687 (~C)(i ,j ) +=
sum( xmm1 );
2688 (~C)(i ,j+1UL) +=
sum( xmm2 );
2689 (~C)(i+1UL,j ) +=
sum( xmm3 );
2690 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2692 for( ; remainder && k<kend; ++k ) {
2693 (~C)(i ,j ) += A(i ,k) * B(k,j );
2694 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2695 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2696 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2707 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2708 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2710 SIMDType xmm1, xmm2;
2713 for( ; k<kpos; k+=SIMDSIZE ) {
2714 const SIMDType b1( B.load(k,j) );
2715 xmm1 += A.load(i ,k) * b1;
2716 xmm2 += A.load(i+1UL,k) * b1;
2719 (~C)(i ,j) +=
sum( xmm1 );
2720 (~C)(i+1UL,j) +=
sum( xmm2 );
2722 for( ; remainder && k<kend; ++k ) {
2723 (~C)(i ,j) += A(i ,k) * B(k,j);
2724 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2731 const size_t jend( LOW ? i+1UL : N );
2732 size_t j( UPP ? i : 0UL );
2734 for( ; (j+2UL) <= jend; j+=2UL )
2741 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2742 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2744 SIMDType xmm1, xmm2;
2747 for( ; k<kpos; k+=SIMDSIZE ) {
2748 const SIMDType a1( A.load(i,k) );
2749 xmm1 += a1 * B.load(k,j );
2750 xmm2 += a1 * B.load(k,j+1UL);
2753 (~C)(i,j ) +=
sum( xmm1 );
2754 (~C)(i,j+1UL) +=
sum( xmm2 );
2756 for( ; remainder && k<kend; ++k ) {
2757 (~C)(i,j ) += A(i,k) * B(k,j );
2758 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2768 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
2769 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2774 for( ; k<kpos; k+=SIMDSIZE ) {
2775 xmm1 += A.load(i,k) * B.load(k,j);
2778 (~C)(i,j) +=
sum( xmm1 );
2780 for( ; remainder && k<K; ++k ) {
2781 (~C)(i,j) += A(i,k) * B(k,j);
// Large-matrix addition assignment kernel overload that performs no work of its
// own and forwards C, A and B unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the condition selecting this overload is not shown — presumably
// the case in which the vectorized kernel cannot be used; confirm.
2803 template<
typename MT3
2807 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2809 selectDefaultAddAssignKernel( C, A, B );
2829 template<
typename MT3
2833 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS addition assignment kernel overload that does not call into BLAS: it
// forwards C, A and B unchanged to selectLargeAddAssignKernel.
// NOTE(review): the condition selecting this overload is not shown — presumably
// the case in which the BLAS kernel is not applicable; confirm.
2859 template<
typename MT3
2863 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2865 selectLargeAddAssignKernel( C, A, B );
// BLAS-backed addition assignment kernel. It is compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
// Three paths are visible: two of them add a previously formed temporary (tmp)
// to C via addAssign, the third calls gemm with the scalars ET(1) and ET(1).
// NOTE(review): the conditions selecting the paths and the construction of tmp
// are not shown. Presumably the gemm scalars are alpha = 1 and beta = 1, i.e.
// C = A * B + C — confirm against the gemm wrapper's parameter order.
2871 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2885 template<
typename MT3
2889 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2896 addAssign( C, tmp );
2901 addAssign( C, tmp );
2904 gemm( C, A, B, ET(1), ET(1) );
// Entry point that subtracts the product from a target matrix (lhs).
// A single check covers all degenerate cases: a target with zero rows or zero
// columns, or an inner dimension of zero (rhs.lhs_.columns() == 0UL).
// NOTE(review): the branch body is not shown — presumably an early return, since
// subtracting an empty product leaves the target unchanged; confirm.
2928 template<
typename MT
2937 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
// Regular case: hand the target and the operands A and B to the kernel dispatcher.
2951 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatch for the subtraction assignment C -= A * B.
// The visible part of the condition compares the element count of C
// (rows * columns) against DMATTDMATMULT_THRESHOLD; the small kernel is called
// directly after that condition and the BLAS kernel after it.
// NOTE(review): the start of the condition and the else keyword are not shown —
// presumably the BLAS kernel is the else branch; confirm.
2967 template<
typename MT3
2970 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2973 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2974 selectSmallSubAssignKernel( C, A, B );
2976 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel, iterating rows in the outer
// loop and columns in the inner loop.
// NOTE(review): the signature and several bound computations (ibegin, iend,
// kbegin, kend) are not visible in this excerpt.
2995 template<
typename MT3
// M x K times K x N product.
3001 const size_t M( A.rows() );
3002 const size_t N( B.columns() );
3003 const size_t K( A.columns() );
3015 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin, jend) for row i; for UPP/LOW results the range is
// clipped at the diagonal (max/min against i).
3022 ?( UPP ?
max( i, 1UL ) : 1UL )
3023 :( UPP ? i : 0UL ) ) );
3029 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
3030 :( LOW ? i+1UL : N ) ) );
// For triangular results the clipped range may be empty - skip this row.
3032 if( ( LOW || UPP ) && ( jbegin > jend ) )
continue;
3035 for(
size_t j=jbegin; j<jend; ++j )
// kpos marks the end of the part of [kbegin,kend) that can be processed two
// elements at a time: 'knum & size_t(-2)' clears the lowest bit of knum.
3055 const size_t knum( kend - kbegin );
3056 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Inner product, unrolled by two.
3058 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3059 (~C)(i,j) -= A(i,k ) * B(k ,j);
3060 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover element at kpos. NOTE(review): the guard around this statement is
// not visible here - presumably it runs only when knum is odd.
3063 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default (scalar) subtraction-assignment kernel, iterating columns in the
// outer loop and rows in the inner loop (mirror image of the row-outer
// variant).
// NOTE(review): the signature and the computations of jbegin, jend, kbegin and
// kend are not visible in this excerpt.
3085 template<
typename MT3
3091 const size_t M( A.rows() );
3092 const size_t N( B.columns() );
3093 const size_t K( A.columns() );
3105 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin, iend) for column j; clipped at the diagonal for LOW/UPP
// results.
3112 ?( LOW ?
max( j, 1UL ) : 1UL )
3113 :( LOW ? j : 0UL ) ) );
3119 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
3120 :( UPP ? j+1UL : M ) ) );
// Empty clipped range - nothing to do for this column.
3122 if( ( LOW || UPP ) && ( ibegin > iend ) )
continue;
3125 for(
size_t i=ibegin; i<iend; ++i )
// 'knum & size_t(-2)' rounds the number of k-iterations down to an even count.
3145 const size_t knum( kend - kbegin );
3146 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Inner product, unrolled by two.
3148 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3149 (~C)(i,j) -= A(i,k ) * B(k ,j);
3150 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover element at kpos. NOTE(review): the guard is not visible here -
// presumably executed only for an odd knum.
3153 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction-assignment kernel enabled when the right operand type
// MT5 is diagonal and the left operand type MT4 is not. Only the diagonal
// element B(j,j) contributes to column j, so no inner k-loop is needed.
// NOTE(review): the function name and the computation of jbegin/jend are not
// visible in this excerpt.
3175 template<
typename MT3
3178 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3181 const size_t M( A.rows() );
3182 const size_t N( B.columns() );
3184 for(
size_t i=0UL; i<M; ++i )
// Round the column count down to an even number for the two-way unrolled loop.
3194 const size_t jnum( jend - jbegin );
3195 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3197 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3198 (~C)(i,j ) -= A(i,j ) * B(j ,j );
3199 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at jpos. NOTE(review): guard not visible - presumably only
// for an odd jnum.
3202 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Blocked default subtraction-assignment kernel for a non-diagonal left operand
// and a diagonal right operand. The result is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE, with columns in the outer tile loop.
// NOTE(review): the function name and the computation of ibegin/ipos are not
// visible in this excerpt.
3223 template<
typename MT3
3226 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3229 constexpr
size_t block( BLOCK_SIZE );
3231 const size_t M( A.rows() );
3232 const size_t N( B.columns() );
// Tile loops; jend/iend clamp the last tile to the matrix bounds.
3234 for(
size_t jj=0UL; jj<N; jj+=block ) {
3235 const size_t jend(
min( N, jj+block ) );
3236 for(
size_t ii=0UL; ii<M; ii+=block ) {
3237 const size_t iend(
min( M, ii+block ) );
3238 for(
size_t j=jj; j<jend; ++j )
// Only the diagonal element B(j,j) contributes to column j.
3247 for(
size_t i=ibegin; i<ipos; ++i ) {
3248 (~C)(i,j) -= A(i,j) * B(j,j);
// Blocked default subtraction-assignment kernel that uses only the diagonal
// element A(i,i) of the left operand. The result is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE, with rows in the outer tile loop.
// NOTE(review): the signature/enabling condition and the computation of
// jbegin/jpos are not visible in this excerpt - presumably the overload for a
// diagonal left operand.
3271 template<
typename MT3
3277 constexpr
size_t block( BLOCK_SIZE );
3279 const size_t M( A.rows() );
3280 const size_t N( B.columns() );
// Tile loops; iend/jend clamp the last tile to the matrix bounds.
3282 for(
size_t ii=0UL; ii<M; ii+=block ) {
3283 const size_t iend(
min( M, ii+block ) );
3284 for(
size_t jj=0UL; jj<N; jj+=block ) {
3285 const size_t jend(
min( N, jj+block ) );
3286 for(
size_t i=ii; i<iend; ++i )
// Row i of the product is row i of B scaled by A(i,i).
3295 for(
size_t j=jbegin; j<jpos; ++j ) {
3296 (~C)(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel that uses only the diagonal element
// A(i,i) of the left operand, iterating columns in the outer loop.
// NOTE(review): the signature/enabling condition and the computation of
// ibegin/iend are not visible in this excerpt.
3319 template<
typename MT3
3325 const size_t M( A.rows() );
3326 const size_t N( B.columns() );
3328 for(
size_t j=0UL; j<N; ++j )
// Round the row count down to an even number for the two-way unrolled loop.
3338 const size_t inum( iend - ibegin );
3339 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3341 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3342 (~C)(i ,j) -= A(i ,i ) * B(i ,j);
3343 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at ipos. NOTE(review): guard not visible - presumably only for
// an odd inum.
3346 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel touching only the main diagonal:
// every C(i,i) is reduced by A(i,i) * B(i,i), for i in [0, A.rows()).
// NOTE(review): the enabling condition is not visible in this excerpt -
// presumably both operands are diagonal.
3367 template<
typename MT3
3371 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3373 for(
size_t i=0UL; i<A.rows(); ++i ) {
3374 C(i,i) -= A(i,i) * B(i,i);
// Fallback overload of the "small" subtraction-assignment kernel: delegates to
// the default subtraction-assignment kernel, passing ~C together with A and B.
3394 template<
typename MT3
3398 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3400 selectDefaultSubAssignKernel( ~C, A, B );
// Vectorized "small" subtraction-assignment kernel (C -= A*B).
// The result is processed in register tiles of decreasing size: pairs of rows
// with 4, 2 and 1 columns, then single rows with 4, 2 and 1 columns. Each tile
// accumulates SIMD partial products over k, reduces them with sum() and
// subtracts the result from C; a scalar loop handles the k-remainder.
// NOTE(review): the signature, the definitions of 'remainder', 'i', 'k',
// 'kbegin'/'kend' and the xmm accumulation statements of the larger tiles are
// not visible in this excerpt.
3420 template<
typename MT3
3428 const size_t M( A.rows() );
3429 const size_t N( B.columns() );
3430 const size_t K( A.columns() );
// ---- Two rows at a time ----
3436 for( ; (i+2UL) <= M; i+=2UL )
// Column range is clipped at the diagonal for LOW/UPP results.
3438 const size_t jend( LOW ? i+2UL : N );
3439 size_t j( UPP ? i : 0UL );
// 2x4 tile (skipped entirely when the result is both LOW and UPP).
3441 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop is required (checked by the assertion below).
3450 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3451 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3453 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3456 for( ; k<kpos; k+=SIMDSIZE ) {
3457 const SIMDType a1( A.load(i ,k) );
3458 const SIMDType a2( A.load(i+1UL,k) );
3459 const SIMDType b1( B.load(k,j ) );
3460 const SIMDType b2( B.load(k,j+1UL) );
3461 const SIMDType b3( B.load(k,j+2UL) );
3462 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of the eight accumulators into the 2x4 tile of C.
3473 (~C)(i ,j ) -=
sum( xmm1 );
3474 (~C)(i ,j+1UL) -=
sum( xmm2 );
3475 (~C)(i ,j+2UL) -=
sum( xmm3 );
3476 (~C)(i ,j+3UL) -=
sum( xmm4 );
3477 (~C)(i+1UL,j ) -=
sum( xmm5 );
3478 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
3479 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
3480 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// Scalar tail for the k-values not covered by the SIMD loop.
3482 for( ; remainder && k<kend; ++k ) {
3483 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3484 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3485 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3486 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3487 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3488 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3489 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3490 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile.
3494 for( ; (j+2UL) <= jend; j+=2UL )
3503 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3504 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3506 SIMDType xmm1, xmm2, xmm3, xmm4;
3509 for( ; k<kpos; k+=SIMDSIZE ) {
3510 const SIMDType a1( A.load(i ,k) );
3511 const SIMDType a2( A.load(i+1UL,k) );
3512 const SIMDType b1( B.load(k,j ) );
3513 const SIMDType b2( B.load(k,j+1UL) );
3520 (~C)(i ,j ) -=
sum( xmm1 );
3521 (~C)(i ,j+1UL) -=
sum( xmm2 );
3522 (~C)(i+1UL,j ) -=
sum( xmm3 );
3523 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3525 for( ; remainder && k<kend; ++k ) {
3526 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3527 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3528 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3529 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile (remaining single column for the current pair of rows).
3540 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3541 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3543 SIMDType xmm1, xmm2;
3546 for( ; k<kpos; k+=SIMDSIZE ) {
3547 const SIMDType b1( B.load(k,j) );
3548 xmm1 += A.load(i ,k) * b1;
3549 xmm2 += A.load(i+1UL,k) * b1;
3552 (~C)(i ,j) -=
sum( xmm1 );
3553 (~C)(i+1UL,j) -=
sum( xmm2 );
3555 for( ; remainder && k<kend; ++k ) {
3556 (~C)(i ,j) -= A(i ,k) * B(k,j);
3557 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Remaining single row ----
3564 const size_t jend( LOW ? i+1UL : N );
3565 size_t j( UPP ? i : 0UL );
// 1x4 tile.
3567 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
3574 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3575 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3577 SIMDType xmm1, xmm2, xmm3, xmm4;
3580 for( ; k<kpos; k+=SIMDSIZE ) {
3581 const SIMDType a1( A.load(i,k) );
3582 xmm1 += a1 * B.load(k,j );
3583 xmm2 += a1 * B.load(k,j+1UL);
3584 xmm3 += a1 * B.load(k,j+2UL);
3585 xmm4 += a1 * B.load(k,j+3UL);
3588 (~C)(i,j ) -=
sum( xmm1 );
3589 (~C)(i,j+1UL) -=
sum( xmm2 );
3590 (~C)(i,j+2UL) -=
sum( xmm3 );
3591 (~C)(i,j+3UL) -=
sum( xmm4 );
3593 for( ; remainder && k<kend; ++k ) {
3594 (~C)(i,j ) -= A(i,k) * B(k,j );
3595 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3596 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3597 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1x2 tile.
3601 for( ; (j+2UL) <= jend; j+=2UL )
3608 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3609 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3611 SIMDType xmm1, xmm2;
3614 for( ; k<kpos; k+=SIMDSIZE ) {
3615 const SIMDType a1( A.load(i,k) );
3616 xmm1 += a1 * B.load(k,j );
3617 xmm2 += a1 * B.load(k,j+1UL);
3620 (~C)(i,j ) -=
sum( xmm1 );
3621 (~C)(i,j+1UL) -=
sum( xmm2 );
3623 for( ; remainder && k<kend; ++k ) {
3624 (~C)(i,j ) -= A(i,k) * B(k,j );
3625 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 tile: here the k-range ends at K rather than at a clipped kend.
3635 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
3636 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3641 for( ; k<kpos; k+=SIMDSIZE ) {
3642 xmm1 += A.load(i,k) * B.load(k,j);
3645 (~C)(i,j) -=
sum( xmm1 );
3647 for( ; remainder && k<K; ++k ) {
3648 (~C)(i,j) -= A(i,k) * B(k,j);
// Second vectorized "small" subtraction-assignment kernel (C -= A*B).
// Tiles are taller than in the preceding variant: four rows with 2 and 1
// columns (only for results that are neither LOW nor UPP), then two rows with
// 2 and 1 columns, then single rows with 2 and 1 columns. Each tile accumulates
// SIMD partial products over k, reduces with sum(), subtracts from C, and
// finishes with a scalar loop for the k-remainder.
// NOTE(review): the signature, the definitions of 'remainder', 'i', 'j', 'k',
// 'kbegin'/'kend' and the xmm accumulation statements of the larger tiles are
// not visible in this excerpt - presumably the overload for a column-major
// target.
3671 template<
typename MT3
3679 const size_t M( A.rows() );
3680 const size_t N( B.columns() );
3681 const size_t K( A.columns() );
// ---- Four rows at a time (general, non-triangular results only) ----
3687 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile.
3691 for( ; (j+2UL) <= N; j+=2UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop is required (checked by the assertion below).
3700 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3701 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3706 for( ; k<kpos; k+=SIMDSIZE ) {
3707 const SIMDType a1( A.load(i ,k) );
3708 const SIMDType a2( A.load(i+1UL,k) );
3709 const SIMDType a3( A.load(i+2UL,k) );
3710 const SIMDType a4( A.load(i+3UL,k) );
3711 const SIMDType b1( B.load(k,j ) );
3712 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of the eight accumulators into the 4x2 tile of C.
3723 (~C)(i ,j ) -=
sum( xmm1 );
3724 (~C)(i ,j+1UL) -=
sum( xmm2 );
3725 (~C)(i+1UL,j ) -=
sum( xmm3 );
3726 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3727 (~C)(i+2UL,j ) -=
sum( xmm5 );
3728 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
3729 (~C)(i+3UL,j ) -=
sum( xmm7 );
3730 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// Scalar tail for the k-values not covered by the SIMD loop.
3732 for( ; remainder && k<kend; ++k ) {
3733 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3734 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3735 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3736 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3737 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3738 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3739 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3740 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile (remaining single column for the current four rows).
3751 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3752 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3754 SIMDType xmm1, xmm2, xmm3, xmm4;
3757 for( ; k<kpos; k+=SIMDSIZE ) {
3758 const SIMDType b1( B.load(k,j) );
3759 xmm1 += A.load(i ,k) * b1;
3760 xmm2 += A.load(i+1UL,k) * b1;
3761 xmm3 += A.load(i+2UL,k) * b1;
3762 xmm4 += A.load(i+3UL,k) * b1;
3765 (~C)(i ,j) -=
sum( xmm1 );
3766 (~C)(i+1UL,j) -=
sum( xmm2 );
3767 (~C)(i+2UL,j) -=
sum( xmm3 );
3768 (~C)(i+3UL,j) -=
sum( xmm4 );
3770 for( ; remainder && k<kend; ++k ) {
3771 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3772 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3773 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3774 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
// ---- Two rows at a time ----
3779 for( ; (i+2UL) <= M; i+=2UL )
// Column range is clipped at the diagonal for LOW/UPP results.
3781 const size_t jend( LOW ? i+2UL : N );
3782 size_t j( UPP ? i : 0UL );
// 2x2 tile.
3784 for( ; (j+2UL) <= jend; j+=2UL )
3793 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3794 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3796 SIMDType xmm1, xmm2, xmm3, xmm4;
3799 for( ; k<kpos; k+=SIMDSIZE ) {
3800 const SIMDType a1( A.load(i ,k) );
3801 const SIMDType a2( A.load(i+1UL,k) );
3802 const SIMDType b1( B.load(k,j ) );
3803 const SIMDType b2( B.load(k,j+1UL) );
3810 (~C)(i ,j ) -=
sum( xmm1 );
3811 (~C)(i ,j+1UL) -=
sum( xmm2 );
3812 (~C)(i+1UL,j ) -=
sum( xmm3 );
3813 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3815 for( ; remainder && k<kend; ++k ) {
3816 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3817 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3818 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3819 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile.
3830 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3831 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3833 SIMDType xmm1, xmm2;
3836 for( ; k<kpos; k+=SIMDSIZE ) {
3837 const SIMDType b1( B.load(k,j) );
3838 xmm1 += A.load(i ,k) * b1;
3839 xmm2 += A.load(i+1UL,k) * b1;
3842 (~C)(i ,j) -=
sum( xmm1 );
3843 (~C)(i+1UL,j) -=
sum( xmm2 );
3845 for( ; remainder && k<kend; ++k ) {
3846 (~C)(i ,j) -= A(i ,k) * B(k,j);
3847 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Remaining single row ----
3854 const size_t jend( LOW ? i+1UL : N );
3855 size_t j( UPP ? i : 0UL );
// 1x2 tile.
3857 for( ; (j+2UL) <= jend; j+=2UL )
3864 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3865 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3867 SIMDType xmm1, xmm2;
3870 for( ; k<kpos; k+=SIMDSIZE ) {
3871 const SIMDType a1( A.load(i,k) );
3872 xmm1 += a1 * B.load(k,j );
3873 xmm2 += a1 * B.load(k,j+1UL);
3876 (~C)(i,j ) -=
sum( xmm1 );
3877 (~C)(i,j+1UL) -=
sum( xmm2 );
3879 for( ; remainder && k<kend; ++k ) {
3880 (~C)(i,j ) -= A(i,k) * B(k,j );
3881 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 tile: here the k-range ends at K rather than at a clipped kend.
3891 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
3892 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3897 for( ; k<kpos; k+=SIMDSIZE ) {
3898 xmm1 += A.load(i,k) * B.load(k,j);
3901 (~C)(i,j) -=
sum( xmm1 );
3903 for( ; remainder && k<K; ++k ) {
3904 (~C)(i,j) -= A(i,k) * B(k,j);
// Fallback overload of the "large" subtraction-assignment kernel: delegates to
// the default subtraction-assignment kernel, passing ~C together with A and B.
3926 template<
typename MT3
3930 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3932 selectDefaultSubAssignKernel( ~C, A, B );
// Second overload of the "large" subtraction-assignment kernel with the same
// (C, A, B) parameter list as the fallback overload.
// NOTE(review): the body and the enabling condition are not visible in this
// excerpt - presumably the vectorized variant; confirm against the full file.
3952 template<
typename MT3
3956 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Fallback overload of the BLAS subtraction-assignment kernel: delegates to
// the "large" subtraction-assignment kernel with the same operands.
3982 template<
typename MT3
3986 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3988 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed overload of the subtraction-assignment kernel; only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set.
// Three alternatives are visible: two paths that subtract a previously built
// temporary 'tmp' from C, and a gemm call with factors ET(-1) and ET(1).
// NOTE(review): the conditions selecting between the paths and the
// construction of 'tmp' are not visible in this excerpt.
3994 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4008 template<
typename MT3
4012 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4019 subAssign( C, tmp );
4024 subAssign( C, tmp );
// Presumably computes C = (-1)*A*B + 1*C following the usual gemm(alpha, beta)
// convention - confirm against the gemm wrapper's documentation.
4027 gemm( C, A, B, ET(-1), ET(1) );
// Assignment entry point with two separate early-exit checks.
// NOTE(review): the function name, signature and the bodies of both branches
// are not visible in this excerpt - presumably an SMP assignment to a dense
// matrix; confirm against the full file.
4061 template<
typename MT
// Case 1: the target matrix has no rows or no columns.
4071 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product (columns of the left operand) is
// zero.
4074 else if( rhs.
lhs_.columns() == 0UL ) {
// Assignment entry point that materializes the expression first.
// A ForwardFunctor instance and a temporary of type TmpType, constructed from
// rhs, are created. NOTE(review): the function name, signature and the final
// assignment statement are not visible in this excerpt.
4109 template<
typename MT
4128 const ForwardFunctor fwd;
4130 const TmpType tmp( rhs );
// Compound-assignment entry point with a single combined early-exit check.
// NOTE(review): the function name, signature and the remaining body are not
// visible in this excerpt - presumably the SMP addition assignment.
4152 template<
typename MT
// Nothing to do if the target is empty or the inner dimension of the product
// is zero.
4162 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
// Compound-assignment entry point with a single combined early-exit check.
// NOTE(review): the function name, signature and the remaining body are not
// visible in this excerpt - presumably the SMP subtraction assignment.
4201 template<
typename MT
// Nothing to do if the target is empty or the inner dimension of the product
// is zero.
4211 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
4271 template<
typename MT1
4279 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false >
// Structure flags derived from the SF/HF/LF/UF template flags.
// SYM holds only if SF is set and none of HF, LF, UF is set.
4310 SYM = ( SF && !( HF || LF || UF ) ),
// HERM holds only if HF is set and neither LF nor UF is set.
4311 HERM = ( HF && !( LF || UF ) ),
// LOW holds if LF is set, or if SF/HF is combined with UF.
4312 LOW = ( LF || ( ( SF || HF ) && UF ) ),
// UPP holds if UF is set, or if SF/HF is combined with LF.
// Consequently SF/HF combined with either triangular flag sets both LOW and
// UPP.
4313 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time predicate: true if either operand has to be evaluated
// (evaluateLeft or evaluateRight). The three template parameters do not appear
// in the visible value expression.
4322 template<
typename T1,
typename T2,
typename T3 >
4323 struct IsEvaluationRequired {
4324 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate over four types used to decide whether a BLAS kernel
// is applicable. The only visible part of the condition requires T1, T2 and T3
// to be SIMD-enabled. NOTE(review): the remaining conditions are not visible
// in this excerpt.
4332 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4333 struct UseBlasKernel {
4340 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time predicate over four types used to decide whether a vectorized
// default kernel is applicable. The only visible part of the condition requires
// T1, T2 and T3 to be SIMD-enabled. NOTE(review): the remaining conditions are
// not visible in this excerpt.
4354 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4355 struct UseVectorizedDefaultKernel {
4358 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4382 ,
Noop > > > > ForwardFunctor;
4412 MT1::simdEnabled && MT2::simdEnabled &&
4418 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4419 !evaluateRight && MT2::smpAssignable };
// 2D element access: returns element (i,j) of the wrapped matrix expression
// multiplied by the stored scalar. No bounds check is visible in this excerpt.
4446 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4449 return matrix_(i,j) * scalar_;
// Checked element access: validates i against the row count and j against the
// column count of the wrapped expression, then forwards to operator().
// NOTE(review): the bodies of the two range checks are not visible in this
// excerpt - presumably they raise an out-of-range error.
4461 inline ReturnType
at(
size_t i,
size_t j )
const {
4462 if( i >= matrix_.rows() ) {
4465 if( j >= matrix_.columns() ) {
4468 return (*
this)(i,j);
// Returns the number of rows of the scaled product, forwarded from the wrapped
// matrix expression. Marked noexcept for consistency with rows() of the
// unscaled multiplication expression, which is declared noexcept and is the
// only call made here (assumes matrix_ is that expression type - confirm).
4477 inline size_t rows()
const noexcept {
4478 return matrix_.rows();
// Returns the number of columns of the scaled product, forwarded from the
// wrapped matrix expression. Marked noexcept because the body only forwards a
// size query. NOTE(review): assumes columns() of the wrapped expression type is
// itself noexcept - confirm before merging.
4487 inline size_t columns()
const noexcept {
4488 return matrix_.columns();
// Returns whether the expression can alias with the given address; the query is
// forwarded unchanged to the wrapped matrix expression. Marked noexcept for
// consistency with canAlias() of the unscaled multiplication expression, which
// is declared noexcept (assumes matrix_ is that expression type - confirm).
4518 template<
typename T >
4519 inline bool canAlias(
const T* alias )
const noexcept {
4520 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address; the query
// is forwarded unchanged to the wrapped matrix expression. Marked noexcept for
// consistency with isAliased() of the unscaled multiplication expression, which
// is declared noexcept (assumes matrix_ is that expression type - confirm).
4530 template<
typename T >
4531 inline bool isAliased(
const T* alias )
const noexcept {
4532 return matrix_.isAliased( alias );
4542 return matrix_.isAligned();
4553 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4554 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
4560 LeftOperand matrix_;
4561 RightOperand scalar_;
// Entry point of the assignment of the scaled product to a dense matrix.
// NOTE(review): the function name, signature, branch bodies and the setup of
// 'left', A and B are not visible in this excerpt.
4576 template<
typename MT
// Case 1: the target matrix has no rows or no columns.
4588 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
4591 else if( left.columns() == 0UL ) {
// General case: dispatch to the kernel selection, passing the scalar factor.
4606 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses the assignment kernel for C = scalar * A * B.
// The small kernel is taken on the first branch, the BLAS kernel otherwise.
// The visible part of the condition compares the number of elements of C
// against DMATTDMATMULT_THRESHOLD; the first half of the condition is not
// visible in this excerpt.
4621 template<
typename MT3
4625 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4628 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4629 selectSmallAssignKernel( C, A, B, scalar );
4631 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar) assignment kernel for C = scalar * A * B, iterating rows in
// the outer loop. Rows are split into three ranges: [0,ibegin), [ibegin,iend)
// and [iend,M); only the middle range computes products.
// NOTE(review): the signature, the bound computations and the bodies of the
// loops over the outer ranges are not visible in this excerpt - presumably
// those loops reset the elements.
4649 template<
typename MT3
4656 const size_t M( A.rows() );
4657 const size_t N( B.columns() );
4658 const size_t K( A.columns() );
// Rows before the computed range.
4670 for(
size_t i=0UL; i<ibegin; ++i ) {
4671 for(
size_t j=0UL; j<N; ++j ) {
// Rows with computed elements.
4675 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin, jend) for row i; SYM/HERM/UPP results start at the
// diagonal, LOW results end at the diagonal.
4682 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
4683 :( SYM || HERM || UPP ? i : 0UL ) ) );
4689 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
4690 :( LOW ? i+1UL : N ) ) );
// Empty clipped range: the whole row is handled by the loop below.
4692 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
4693 for(
size_t j=0UL; j<N; ++j ) {
// Columns left of the computed range; for SYM/HERM the part below the
// diagonal is skipped here and filled by the mirroring loop at the end.
4701 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
4704 for(
size_t j=jbegin; j<jend; ++j )
// Inner product: initialize with the first term, accumulate the rest, then
// apply the scalar once.
4724 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4725 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4726 (~C)(i,j) += A(i,k) * B(k,j);
4728 (~C)(i,j) *= scalar;
// Columns right of the computed range.
4730 for(
size_t j=jend; j<N; ++j ) {
// Rows after the computed range.
4734 for(
size_t i=iend; i<M; ++i ) {
4735 for(
size_t j=0UL; j<N; ++j ) {
// Mirror the upper part into the strictly lower part (conjugated for HERM).
// NOTE(review): the guard around this loop is not visible - presumably
// SYM || HERM.
4741 for(
size_t i=1UL; i<M; ++i ) {
4742 for(
size_t j=0UL; j<i; ++j ) {
4743 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Default (scalar) assignment kernel for C = scalar * A * B, iterating columns
// in the outer loop. Columns are split into three ranges: [0,jbegin),
// [jbegin,jend) and [jend,N); only the middle range computes products.
// NOTE(review): the signature, the bound computations and the bodies of the
// loops over the outer ranges are not visible in this excerpt - presumably
// those loops reset the elements.
4764 template<
typename MT3
4771 const size_t M( A.rows() );
4772 const size_t N( B.columns() );
4773 const size_t K( A.columns() );
// Columns before the computed range.
4785 for(
size_t j=0UL; j<jbegin; ++j ) {
4786 for(
size_t i=0UL; i<M; ++i ) {
// Columns with computed elements.
4790 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin, iend) for column j; SYM/HERM/LOW results start at the
// diagonal, UPP results end at the diagonal.
4797 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
4798 :( SYM || HERM || LOW ? j : 0UL ) ) );
4804 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
4805 :( UPP ? j+1UL : M ) ) );
// Empty clipped range: the whole column is handled by the loop below.
4807 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
4808 for(
size_t i=0UL; i<M; ++i ) {
// Rows above the computed range; for SYM/HERM the part above the diagonal is
// skipped here and filled by the mirroring loop at the end.
4816 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
4819 for(
size_t i=ibegin; i<iend; ++i )
// Inner product: initialize with the first term, accumulate the rest, then
// apply the scalar once.
4839 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4840 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4841 (~C)(i,j) += A(i,k) * B(k,j);
4843 (~C)(i,j) *= scalar;
// Rows below the computed range.
4845 for(
size_t i=iend; i<M; ++i ) {
// Columns after the computed range.
4849 for(
size_t j=jend; j<N; ++j ) {
4850 for(
size_t i=0UL; i<M; ++i ) {
// Mirror the lower part into the strictly upper part (conjugated for HERM).
// NOTE(review): the guard around this loop is not visible - presumably
// SYM || HERM.
4856 for(
size_t j=1UL; j<N; ++j ) {
4857 for(
size_t i=0UL; i<j; ++i ) {
4858 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Default assignment kernel for C = scalar * A * B, enabled when the right
// operand type MT5 is diagonal and the left operand type MT4 is not. Each row
// is split into the column ranges [0,jbegin), [jbegin,jend) and [jend,N); only
// the middle range is computed, using the diagonal element B(j,j).
// NOTE(review): the function name, the computation of jbegin/jend and the
// bodies of the two outer-range loops are not visible - presumably they reset
// the elements.
4879 template<
typename MT3
4883 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4886 const size_t M( A.rows() );
4887 const size_t N( B.columns() );
4889 for(
size_t i=0UL; i<M; ++i )
4900 for(
size_t j=0UL; j<jbegin; ++j ) {
4904 for(
size_t j=jbegin; j<jend; ++j ) {
4905 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4908 for(
size_t j=jend; j<N; ++j ) {
// Blocked default assignment kernel for C = scalar * A * B with a non-diagonal
// left operand and a diagonal right operand. The result is traversed in tiles
// of BLOCK_SIZE x BLOCK_SIZE with columns in the outer tile loop. Within a
// tile column, rows are split into [ii,ibegin), [ibegin,ipos) and [ipos,iend);
// only the middle range is computed.
// NOTE(review): the function name, the computation of ibegin/ipos and the
// bodies of the two outer-range loops are not visible - presumably they reset
// the elements.
4930 template<
typename MT3
4934 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4937 constexpr
size_t block( BLOCK_SIZE );
4939 const size_t M( A.rows() );
4940 const size_t N( B.columns() );
// Tile loops; jend/iend clamp the last tile to the matrix bounds.
4942 for(
size_t jj=0UL; jj<N; jj+=block ) {
4943 const size_t jend(
min( N, jj+block ) );
4944 for(
size_t ii=0UL; ii<M; ii+=block ) {
4945 const size_t iend(
min( M, ii+block ) );
4946 for(
size_t j=jj; j<jend; ++j )
4956 for(
size_t i=ii; i<ibegin; ++i ) {
4960 for(
size_t i=ibegin; i<ipos; ++i ) {
4961 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4964 for(
size_t i=ipos; i<iend; ++i ) {
// Blocked default assignment kernel for C = scalar * A * B that uses only the
// diagonal element A(i,i) of the left operand. The result is traversed in
// tiles of BLOCK_SIZE x BLOCK_SIZE with rows in the outer tile loop. Within a
// tile row, columns are split into [jj,jbegin), [jbegin,jpos) and [jpos,jend);
// only the middle range is computed.
// NOTE(review): the signature/enabling condition, the computation of
// jbegin/jpos and the bodies of the two outer-range loops are not visible -
// presumably the overload for a diagonal left operand, resetting the rest.
4988 template<
typename MT3
4995 constexpr
size_t block( BLOCK_SIZE );
4997 const size_t M( A.rows() );
4998 const size_t N( B.columns() );
// Tile loops; iend/jend clamp the last tile to the matrix bounds.
5000 for(
size_t ii=0UL; ii<M; ii+=block ) {
5001 const size_t iend(
min( M, ii+block ) );
5002 for(
size_t jj=0UL; jj<N; jj+=block ) {
5003 const size_t jend(
min( N, jj+block ) );
5004 for(
size_t i=ii; i<iend; ++i )
5014 for(
size_t j=jj; j<jbegin; ++j ) {
5018 for(
size_t j=jbegin; j<jpos; ++j ) {
5019 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
5022 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel for C = scalar * A * B that uses only the diagonal
// element A(i,i) of the left operand, iterating columns in the outer loop.
// Each column is split into the row ranges [0,ibegin), [ibegin,iend) and
// [iend,M); only the middle range is computed.
// NOTE(review): the signature/enabling condition, the computation of
// ibegin/iend and the bodies of the two outer-range loops are not visible in
// this excerpt - presumably they reset the elements.
5046 template<
typename MT3
5053 const size_t M( A.rows() );
5054 const size_t N( B.columns() );
5056 for(
size_t j=0UL; j<N; ++j )
5067 for(
size_t i=0UL; i<ibegin; ++i ) {
5071 for(
size_t i=ibegin; i<iend; ++i ) {
5072 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
5075 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel writing only the main diagonal:
// C(i,i) = A(i,i) * B(i,i) * scalar for i in [0, A.rows()).
// NOTE(review): the enabling condition and any statement preceding the loop
// are not visible in this excerpt - presumably both operands are diagonal and
// C is reset first.
5097 template<
typename MT3
5102 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5106 for(
size_t i=0UL; i<A.rows(); ++i ) {
5107 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of the "small" assignment kernel: delegates to the default
// assignment kernel with the same operands and scalar.
5126 template<
typename MT3
5131 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5133 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized "small" assignment kernel for C = scalar * A * B.
// The result is processed in register tiles of decreasing size: pairs of rows
// with 4, 2 and 1 columns, then single rows with 4, 2 and 1 columns. Each tile
// accumulates SIMD partial products over k, writes sum(xmm) * scalar into C,
// and then adds the scalar-scaled k-remainder terms. Trailing loops fill the
// parts of C that the tiles did not compute.
// NOTE(review): the signature, the definitions of 'remainder', 'i', 'k',
// 'kbegin'/'kend', the xmm accumulation statements of the larger tiles and
// several guards/bodies of the trailing loops are not visible in this excerpt.
5152 template<
typename MT3
5161 const size_t M( A.rows() );
5162 const size_t N( B.columns() );
5163 const size_t K( A.columns() );
// ---- Two rows at a time (skipped when the result is both LOW and UPP) ----
5174 for( ; !( LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// Column range: SYM/HERM/UPP results start at the diagonal, LOW results end
// just past it.
5176 const size_t jend( LOW ? i+2UL : N );
5177 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x4 tile.
5179 for( ; (j+4UL) <= jend; j+=4UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop is required (checked by the assertion below).
5188 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5189 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5191 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5194 for( ; k<kpos; k+=SIMDSIZE ) {
5195 const SIMDType a1( A.load(i ,k) );
5196 const SIMDType a2( A.load(i+1UL,k) );
5197 const SIMDType b1( B.load(k,j ) );
5198 const SIMDType b2( B.load(k,j+1UL) );
5199 const SIMDType b3( B.load(k,j+2UL) );
5200 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction, scaled, written (not accumulated) into C.
5211 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5212 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5213 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
5214 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
5215 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
5216 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
5217 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
5218 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar tail: each remaining term is scaled individually and added.
5220 for( ; remainder && k<kend; ++k ) {
5221 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5222 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5223 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5224 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5225 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5226 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5227 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
5228 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile.
5232 for( ; (j+2UL) <= jend; j+=2UL )
5241 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5242 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5244 SIMDType xmm1, xmm2, xmm3, xmm4;
5247 for( ; k<kpos; k+=SIMDSIZE ) {
5248 const SIMDType a1( A.load(i ,k) );
5249 const SIMDType a2( A.load(i+1UL,k) );
5250 const SIMDType b1( B.load(k,j ) );
5251 const SIMDType b2( B.load(k,j+1UL) );
5258 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5259 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5260 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5261 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5263 for( ; remainder && k<kend; ++k ) {
5264 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5265 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5266 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5267 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile.
5278 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5279 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5281 SIMDType xmm1, xmm2;
5284 for( ; k<kpos; k+=SIMDSIZE ) {
5285 const SIMDType b1( B.load(k,j) );
5286 xmm1 += A.load(i ,k) * b1;
5287 xmm2 += A.load(i+1UL,k) * b1;
5290 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5291 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5293 for( ; remainder && k<kend; ++k ) {
5294 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5295 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Remaining single rows ----
5302 const size_t jend( LOW ? i+1UL : N );
5303 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x4 tile.
5305 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
5312 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5313 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5315 SIMDType xmm1, xmm2, xmm3, xmm4;
5318 for( ; k<kpos; k+=SIMDSIZE ) {
5319 const SIMDType a1( A.load(i,k) );
5320 xmm1 += a1 * B.load(k,j );
5321 xmm2 += a1 * B.load(k,j+1UL);
5322 xmm3 += a1 * B.load(k,j+2UL);
5323 xmm4 += a1 * B.load(k,j+3UL);
5326 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5327 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5328 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
5329 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
5331 for( ; remainder && k<kend; ++k ) {
5332 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5333 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5334 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5335 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile.
5339 for( ; !( LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
5346 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5347 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5349 SIMDType xmm1, xmm2;
5352 for( ; k<kpos; k+=SIMDSIZE ) {
5353 const SIMDType a1( A.load(i,k) );
5354 xmm1 += a1 * B.load(k,j );
5355 xmm2 += a1 * B.load(k,j+1UL);
5358 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5359 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5361 for( ; remainder && k<kend; ++k ) {
5362 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5363 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: here the k-range ends at K rather than at a clipped kend.
5367 for( ; j<jend; ++j )
5373 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
5374 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5379 for( ; k<kpos; k+=SIMDSIZE ) {
5380 xmm1 += A.load(i,k) * B.load(k,j);
5383 (~C)(i,j) =
sum( xmm1 ) * scalar;
5385 for( ; remainder && k<K; ++k ) {
5386 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// ---- Post-processing of the part not computed by the tiles ----
// Mirror loop: for row i, columns [0, 2*(i/2)) are copied from the transposed
// position (conjugated for HERM). NOTE(review): the guard of this first branch
// is not visible - presumably SYM || HERM.
5393 for(
size_t i=2UL; i<M; ++i ) {
5394 const size_t jend( 2UL * ( i/2UL ) );
5395 for(
size_t j=0UL; j<jend; ++j ) {
5396 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Strictly lower result: loop over the elements above the 2x2 diagonal blocks.
// NOTE(review): loop body not visible - presumably resets the elements.
5400 else if( LOW && !UPP ) {
5401 for(
size_t j=2UL; j<N; ++j ) {
5402 const size_t iend( 2UL * ( j/2UL ) );
5403 for(
size_t i=0UL; i<iend; ++i ) {
// Strictly upper result: loop over the elements below the 2x2 diagonal blocks.
// NOTE(review): loop body not visible - presumably resets the elements.
5408 else if( !LOW && UPP ) {
5409 for(
size_t i=2UL; i<M; ++i ) {
5410 const size_t jend( 2UL * ( i/2UL ) );
5411 for(
size_t j=0UL; j<jend; ++j ) {
// Second vectorized "small" assignment kernel for C = scalar * A * B.
// Tiles are taller than in the preceding variant: four rows with 2 and 1
// columns (only for results without SYM/HERM/LOW/UPP structure), then two rows
// with 2 and 1 columns, then single rows with 2 and 1 columns. Each tile
// accumulates SIMD partial products over k, writes sum(xmm) * scalar into C,
// and then adds the scalar-scaled k-remainder terms.
// NOTE(review): the signature, the definitions of 'remainder', 'i', 'j', 'k',
// 'kbegin'/'kend', the xmm accumulation statements of the larger tiles and the
// guard of the final mirroring loop are not visible in this excerpt -
// presumably the overload for a column-major target.
5434 template<
typename MT3
5443 const size_t M( A.rows() );
5444 const size_t N( B.columns() );
5445 const size_t K( A.columns() );
// ---- Four rows at a time (general results only) ----
5456 for( ; !SYM && !HERM && !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile.
5460 for( ; (j+2UL) <= N; j+=2UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop is required (checked by the assertion below).
5469 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5470 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5472 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5475 for( ; k<kpos; k+=SIMDSIZE ) {
5476 const SIMDType a1( A.load(i ,k) );
5477 const SIMDType a2( A.load(i+1UL,k) );
5478 const SIMDType a3( A.load(i+2UL,k) );
5479 const SIMDType a4( A.load(i+3UL,k) );
5480 const SIMDType b1( B.load(k,j ) );
5481 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction, scaled, written (not accumulated) into C.
5492 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5493 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5494 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5495 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5496 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
5497 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5498 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
5499 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar tail: each remaining term is scaled individually and added.
5501 for( ; remainder && k<kend; ++k ) {
5502 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5503 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5504 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5505 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5506 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5507 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5508 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5509 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile.
5520 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5521 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5523 SIMDType xmm1, xmm2, xmm3, xmm4;
5526 for( ; k<kpos; k+=SIMDSIZE ) {
5527 const SIMDType b1( B.load(k,j) );
5528 xmm1 += A.load(i ,k) * b1;
5529 xmm2 += A.load(i+1UL,k) * b1;
5530 xmm3 += A.load(i+2UL,k) * b1;
5531 xmm4 += A.load(i+3UL,k) * b1;
5534 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5535 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5536 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
5537 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
5539 for( ; remainder && k<kend; ++k ) {
5540 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5541 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5542 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5543 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// ---- Two rows at a time ----
5548 for( ; (i+2UL) <= M; i+=2UL )
// Column range: SYM/HERM/UPP results start at the diagonal, LOW results end
// just past it.
5550 const size_t jend( LOW ? i+2UL : N );
5551 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x2 tile.
5553 for( ; (j+2UL) <= jend; j+=2UL )
5562 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5563 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5565 SIMDType xmm1, xmm2, xmm3, xmm4;
5568 for( ; k<kpos; k+=SIMDSIZE ) {
5569 const SIMDType a1( A.load(i ,k) );
5570 const SIMDType a2( A.load(i+1UL,k) );
5571 const SIMDType b1( B.load(k,j ) );
5572 const SIMDType b2( B.load(k,j+1UL) );
5579 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5580 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5581 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5582 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5584 for( ; remainder && k<kend; ++k ) {
5585 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5586 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5587 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5588 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile.
5599 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5600 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5602 SIMDType xmm1, xmm2;
5605 for( ; k<kpos; k+=SIMDSIZE ) {
5606 const SIMDType b1( B.load(k,j) );
5607 xmm1 += A.load(i ,k) * b1;
5608 xmm2 += A.load(i+1UL,k) * b1;
5611 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5612 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5614 for( ; remainder && k<kend; ++k ) {
5615 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5616 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Remaining single row ----
5623 const size_t jend( LOW ? i+1UL : N );
5624 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x2 tile.
5626 for( ; (j+2UL) <= jend; j+=2UL )
5633 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5634 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5636 SIMDType xmm1, xmm2;
5639 for( ; k<kpos; k+=SIMDSIZE ) {
5640 const SIMDType a1( A.load(i,k) );
5641 xmm1 += a1 * B.load(k,j );
5642 xmm2 += a1 * B.load(k,j+1UL);
5645 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5646 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5648 for( ; remainder && k<kend; ++k ) {
5649 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5650 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: here the k-range ends at K rather than at a clipped kend.
5660 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
5661 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5666 for( ; k<kpos; k+=SIMDSIZE ) {
5667 xmm1 += A.load(i,k) * B.load(k,j);
5670 (~C)(i,j) =
sum( xmm1 ) * scalar;
5672 for( ; remainder && k<K; ++k ) {
5673 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Mirror the upper part into the strictly lower part (conjugated for HERM).
// NOTE(review): the guard around this loop is not visible - presumably
// SYM || HERM.
5680 for(
size_t j=0UL; j<N; ++j ) {
5681 for(
size_t i=j+1UL; i<M; ++i ) {
5682 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Fallback overload of the "large" assignment kernel: delegates to the default
// assignment kernel with the same operands and scalar.
5703 template<
typename MT3
5708 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5710 selectDefaultAssignKernel( C, A, B, scalar );
// Second overload of the "large" assignment kernel. Exactly one of five
// multiplication routines is visible per branch: smmm, hmmm, lmmm, ummm or mmm.
// lmmm, ummm and mmm additionally receive ST2(0) as a second factor.
// NOTE(review): the conditions selecting the branch are not visible in this
// excerpt - presumably SYM, HERM, LOW, UPP and the general case respectively.
5729 template<
typename MT3
5734 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5737 smmm( C, A, B, scalar );
5739 hmmm( C, A, B, scalar );
5741 lmmm( C, A, B, scalar, ST2(0) );
5743 ummm( C, A, B, scalar, ST2(0) );
5745 mmm( C, A, B, scalar, ST2(0) );
5763 template<
typename MT3
5768 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5770 selectLargeAssignKernel( C, A, B, scalar );
5775 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5789 template<
typename MT3
5794 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5800 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5804 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5807 gemm( C, A, B, ET(scalar), ET(0) );
5825 template<
typename MT
5843 const ForwardFunctor fwd;
5845 const TmpType tmp(
serial( rhs ) );
5846 assign( ~lhs, fwd( tmp ) );
5862 template<
typename MT
5874 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5888 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
5903 template<
typename MT3
5907 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5910 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5911 selectSmallAddAssignKernel( C, A, B, scalar );
5913 selectBlasAddAssignKernel( C, A, B, scalar );
5931 template<
typename MT3
5936 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5938 const ResultType tmp(
serial( A * B * scalar ) );
5939 addAssign( C, tmp );
5957 template<
typename MT3
5961 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
5964 const size_t M( A.rows() );
5965 const size_t N( B.columns() );
5967 for(
size_t i=0UL; i<M; ++i )
5977 const size_t jnum( jend - jbegin );
5978 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
5980 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5981 (~C)(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5982 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
5985 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6005 template<
typename MT3
6009 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6012 constexpr
size_t block( BLOCK_SIZE );
6014 const size_t M( A.rows() );
6015 const size_t N( B.columns() );
6017 for(
size_t jj=0UL; jj<N; jj+=block ) {
6018 const size_t jend(
min( N, jj+block ) );
6019 for(
size_t ii=0UL; ii<M; ii+=block ) {
6020 const size_t iend(
min( M, ii+block ) );
6021 for(
size_t j=jj; j<jend; ++j )
6030 for(
size_t i=ibegin; i<ipos; ++i ) {
6031 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
6053 template<
typename MT3
6060 constexpr
size_t block( BLOCK_SIZE );
6062 const size_t M( A.rows() );
6063 const size_t N( B.columns() );
6065 for(
size_t ii=0UL; ii<M; ii+=block ) {
6066 const size_t iend(
min( M, ii+block ) );
6067 for(
size_t jj=0UL; jj<N; jj+=block ) {
6068 const size_t jend(
min( N, jj+block ) );
6069 for(
size_t i=ii; i<iend; ++i )
6078 for(
size_t j=jbegin; j<jpos; ++j ) {
6079 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
6101 template<
typename MT3
6108 const size_t M( A.rows() );
6109 const size_t N( B.columns() );
6111 for(
size_t j=0UL; j<N; ++j )
6121 const size_t inum( iend - ibegin );
6122 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6124 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6125 (~C)(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6126 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6129 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6149 template<
typename MT3
6154 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6156 for(
size_t i=0UL; i<A.rows(); ++i ) {
6157 C(i,i) += A(i,i) * B(i,i) * scalar;
6176 template<
typename MT3
6181 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6183 selectDefaultAddAssignKernel( C, A, B, scalar );
6202 template<
typename MT3
6211 const size_t M( A.rows() );
6212 const size_t N( B.columns() );
6213 const size_t K( A.columns() );
6219 for( ; (i+2UL) <= M; i+=2UL )
6221 const size_t jend( LOW ? i+2UL : N );
6222 size_t j( UPP ? i : 0UL );
6224 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
6233 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6234 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6236 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6239 for( ; k<kpos; k+=SIMDSIZE ) {
6240 const SIMDType a1( A.load(i ,k) );
6241 const SIMDType a2( A.load(i+1UL,k) );
6242 const SIMDType b1( B.load(k,j ) );
6243 const SIMDType b2( B.load(k,j+1UL) );
6244 const SIMDType b3( B.load(k,j+2UL) );
6245 const SIMDType b4( B.load(k,j+3UL) );
6256 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6257 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6258 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
6259 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
6260 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
6261 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
6262 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
6263 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
6265 for( ; remainder && k<kend; ++k ) {
6266 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6267 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6268 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6269 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6270 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6271 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6272 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6273 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
6277 for( ; (j+2UL) <= jend; j+=2UL )
6286 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6287 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6289 SIMDType xmm1, xmm2, xmm3, xmm4;
6292 for( ; k<kpos; k+=SIMDSIZE ) {
6293 const SIMDType a1( A.load(i ,k) );
6294 const SIMDType a2( A.load(i+1UL,k) );
6295 const SIMDType b1( B.load(k,j ) );
6296 const SIMDType b2( B.load(k,j+1UL) );
6303 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6304 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6305 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6306 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6308 for( ; remainder && k<kend; ++k ) {
6309 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6310 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6311 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6312 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6323 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6324 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6326 SIMDType xmm1, xmm2;
6329 for( ; k<kpos; k+=SIMDSIZE ) {
6330 const SIMDType b1( B.load(k,j) );
6331 xmm1 += A.load(i ,k) * b1;
6332 xmm2 += A.load(i+1UL,k) * b1;
6335 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6336 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6338 for( ; remainder && k<kend; ++k ) {
6339 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6340 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6347 const size_t jend( LOW ? i+1UL : N );
6348 size_t j( UPP ? i : 0UL );
6350 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
6357 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6358 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6360 SIMDType xmm1, xmm2, xmm3, xmm4;
6363 for( ; k<kpos; k+=SIMDSIZE ) {
6364 const SIMDType a1( A.load(i,k) );
6365 xmm1 += a1 * B.load(k,j );
6366 xmm2 += a1 * B.load(k,j+1UL);
6367 xmm3 += a1 * B.load(k,j+2UL);
6368 xmm4 += a1 * B.load(k,j+3UL);
6371 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6372 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6373 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
6374 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
6376 for( ; remainder && k<kend; ++k ) {
6377 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6378 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6379 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6380 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
6384 for( ; (j+2UL) <= jend; j+=2UL )
6391 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6392 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6394 SIMDType xmm1, xmm2;
6397 for( ; k<kpos; k+=SIMDSIZE ) {
6398 const SIMDType a1( A.load(i,k) );
6399 xmm1 += a1 * B.load(k,j );
6400 xmm2 += a1 * B.load(k,j+1UL);
6403 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6404 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6406 for( ; remainder && k<kend; ++k ) {
6407 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6408 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6418 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
6419 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6424 for( ; k<kpos; k+=SIMDSIZE ) {
6425 xmm1 += A.load(i,k) * B.load(k,j);
6428 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6430 for( ; remainder && k<K; ++k ) {
6431 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
6453 template<
typename MT3
6462 const size_t M( A.rows() );
6463 const size_t N( B.columns() );
6464 const size_t K( A.columns() );
6470 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
6474 for( ; (j+2UL) <= N; j+=2UL )
6483 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6484 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6486 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6489 for( ; k<kpos; k+=SIMDSIZE ) {
6490 const SIMDType a1( A.load(i ,k) );
6491 const SIMDType a2( A.load(i+1UL,k) );
6492 const SIMDType a3( A.load(i+2UL,k) );
6493 const SIMDType a4( A.load(i+3UL,k) );
6494 const SIMDType b1( B.load(k,j ) );
6495 const SIMDType b2( B.load(k,j+1UL) );
6506 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6507 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6508 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6509 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6510 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6511 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6512 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6513 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
6515 for( ; remainder && k<kend; ++k ) {
6516 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6517 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6518 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6519 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6520 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6521 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6522 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6523 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
6534 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6535 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6537 SIMDType xmm1, xmm2, xmm3, xmm4;
6540 for( ; k<kpos; k+=SIMDSIZE ) {
6541 const SIMDType b1( B.load(k,j) );
6542 xmm1 += A.load(i ,k) * b1;
6543 xmm2 += A.load(i+1UL,k) * b1;
6544 xmm3 += A.load(i+2UL,k) * b1;
6545 xmm4 += A.load(i+3UL,k) * b1;
6548 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6549 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6550 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
6551 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
6553 for( ; remainder && k<kend; ++k ) {
6554 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6555 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6556 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6557 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
6562 for( ; (i+2UL) <= M; i+=2UL )
6564 const size_t jend( LOW ? i+2UL : N );
6565 size_t j( UPP ? i : 0UL );
6567 for( ; (j+2UL) <= jend; j+=2UL )
6576 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6577 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6579 SIMDType xmm1, xmm2, xmm3, xmm4;
6582 for( ; k<kpos; k+=SIMDSIZE ) {
6583 const SIMDType a1( A.load(i ,k) );
6584 const SIMDType a2( A.load(i+1UL,k) );
6585 const SIMDType b1( B.load(k,j ) );
6586 const SIMDType b2( B.load(k,j+1UL) );
6593 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6594 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6595 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6596 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6598 for( ; remainder && k<kend; ++k ) {
6599 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6600 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6601 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6602 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6613 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6614 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6616 SIMDType xmm1, xmm2;
6619 for( ; k<kpos; k+=SIMDSIZE ) {
6620 const SIMDType b1( B.load(k,j) );
6621 xmm1 += A.load(i ,k) * b1;
6622 xmm2 += A.load(i+1UL,k) * b1;
6625 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6626 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6628 for( ; remainder && k<kend; ++k ) {
6629 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6630 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6637 const size_t jend( LOW ? i+1UL : N );
6638 size_t j( UPP ? i : 0UL );
6640 for( ; (j+2UL) <= jend; j+=2UL )
6647 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6648 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6650 SIMDType xmm1, xmm2;
6653 for( ; k<kpos; k+=SIMDSIZE ) {
6654 const SIMDType a1( A.load(i,k) );
6655 xmm1 += a1 * B.load(k,j );
6656 xmm2 += a1 * B.load(k,j+1UL);
6659 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6660 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6662 for( ; remainder && k<kend; ++k ) {
6663 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6664 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6674 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
6675 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6680 for( ; k<kpos; k+=SIMDSIZE ) {
6681 xmm1 += A.load(i,k) * B.load(k,j);
6684 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6686 for( ; remainder && k<K; ++k ) {
6687 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
6708 template<
typename MT3
6713 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6715 selectDefaultAddAssignKernel( C, A, B, scalar );
6734 template<
typename MT3
6739 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6742 lmmm( C, A, B, scalar, ST2(1) );
6744 ummm( C, A, B, scalar, ST2(1) );
6746 mmm( C, A, B, scalar, ST2(1) );
6764 template<
typename MT3
6769 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6771 selectLargeAddAssignKernel( C, A, B, scalar );
6776 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6790 template<
typename MT3
6795 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6801 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6802 addAssign( C, tmp );
6806 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6807 addAssign( C, tmp );
6810 gemm( C, A, B, ET(scalar), ET(1) );
6832 template<
typename MT
6844 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6858 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
6873 template<
typename MT3
6877 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6880 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6881 selectSmallSubAssignKernel( C, A, B, scalar );
6883 selectBlasSubAssignKernel( C, A, B, scalar );
6901 template<
typename MT3
6906 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6908 const ResultType tmp(
serial( A * B * scalar ) );
6909 subAssign( C, tmp );
6927 template<
typename MT3
6931 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6934 const size_t M( A.rows() );
6935 const size_t N( B.columns() );
6937 for(
size_t i=0UL; i<M; ++i )
6947 const size_t jnum( jend - jbegin );
6948 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6950 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6951 (~C)(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6952 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6955 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
6975 template<
typename MT3
6979 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6982 constexpr
size_t block( BLOCK_SIZE );
6984 const size_t M( A.rows() );
6985 const size_t N( B.columns() );
6987 for(
size_t jj=0UL; jj<N; jj+=block ) {
6988 const size_t jend(
min( N, jj+block ) );
6989 for(
size_t ii=0UL; ii<M; ii+=block ) {
6990 const size_t iend(
min( M, ii+block ) );
6991 for(
size_t j=jj; j<jend; ++j )
7000 for(
size_t i=ibegin; i<ipos; ++i ) {
7001 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
7024 template<
typename MT3
7031 constexpr
size_t block( BLOCK_SIZE );
7033 const size_t M( A.rows() );
7034 const size_t N( B.columns() );
7036 for(
size_t ii=0UL; ii<M; ii+=block ) {
7037 const size_t iend(
min( M, ii+block ) );
7038 for(
size_t jj=0UL; jj<N; jj+=block ) {
7039 const size_t jend(
min( N, jj+block ) );
7040 for(
size_t i=ii; i<iend; ++i )
7049 for(
size_t j=jbegin; j<jpos; ++j ) {
7050 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
7073 template<
typename MT3
7080 const size_t M( A.rows() );
7081 const size_t N( B.columns() );
7083 for(
size_t j=0UL; j<N; ++j )
7093 const size_t inum( iend - ibegin );
7094 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7096 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7097 (~C)(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7098 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7101 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7121 template<
typename MT3
7126 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7128 for(
size_t i=0UL; i<A.rows(); ++i ) {
7129 C(i,i) -= A(i,i) * B(i,i) * scalar;
7148 template<
typename MT3
7153 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7155 selectDefaultSubAssignKernel( C, A, B, scalar );
7174 template<
typename MT3
7183 const size_t M( A.rows() );
7184 const size_t N( B.columns() );
7185 const size_t K( A.columns() );
7191 for( ; (i+2UL) <= M; i+=2UL )
7193 const size_t jend( LOW ? i+2UL : N );
7194 size_t j( UPP ? i : 0UL );
7196 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
7205 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7206 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7208 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7211 for( ; k<kpos; k+=SIMDSIZE ) {
7212 const SIMDType a1( A.load(i ,k) );
7213 const SIMDType a2( A.load(i+1UL,k) );
7214 const SIMDType b1( B.load(k,j ) );
7215 const SIMDType b2( B.load(k,j+1UL) );
7216 const SIMDType b3( B.load(k,j+2UL) );
7217 const SIMDType b4( B.load(k,j+3UL) );
7228 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7229 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7230 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
7231 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
7232 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
7233 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
7234 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
7235 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
7237 for( ; remainder && k<kend; ++k ) {
7238 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7239 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7240 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7241 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7242 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7243 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7244 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7245 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
7249 for( ; (j+2UL) <= jend; j+=2UL )
7258 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7259 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7261 SIMDType xmm1, xmm2, xmm3, xmm4;
7264 for( ; k<kpos; k+=SIMDSIZE ) {
7265 const SIMDType a1( A.load(i ,k) );
7266 const SIMDType a2( A.load(i+1UL,k) );
7267 const SIMDType b1( B.load(k,j ) );
7268 const SIMDType b2( B.load(k,j+1UL) );
7275 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7276 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7277 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7278 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7280 for( ; remainder && k<kend; ++k ) {
7281 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7282 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7283 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7284 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7295 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7296 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7298 SIMDType xmm1, xmm2;
7301 for( ; k<kpos; k+=SIMDSIZE ) {
7302 const SIMDType b1( B.load(k,j) );
7303 xmm1 += A.load(i ,k) * b1;
7304 xmm2 += A.load(i+1UL,k) * b1;
7307 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7308 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7310 for( ; remainder && k<kend; ++k ) {
7311 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7312 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7319 const size_t jend( LOW ? i+1UL : N );
7320 size_t j( UPP ? i : 0UL );
7322 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
7329 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7330 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7332 SIMDType xmm1, xmm2, xmm3, xmm4;
7335 for( ; k<kpos; k+=SIMDSIZE ) {
7336 const SIMDType a1( A.load(i,k) );
7337 xmm1 += a1 * B.load(k,j );
7338 xmm2 += a1 * B.load(k,j+1UL);
7339 xmm3 += a1 * B.load(k,j+2UL);
7340 xmm4 += a1 * B.load(k,j+3UL);
7343 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7344 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7345 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
7346 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
7348 for( ; remainder && k<kend; ++k ) {
7349 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7350 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7351 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7352 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
7356 for( ; (j+2UL) <= jend; j+=2UL )
7363 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7364 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7366 SIMDType xmm1, xmm2;
7369 for( ; k<kpos; k+=SIMDSIZE ) {
7370 const SIMDType a1( A.load(i,k) );
7371 xmm1 += a1 * B.load(k,j );
7372 xmm2 += a1 * B.load(k,j+1UL);
7375 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7376 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7378 for( ; remainder && k<kend; ++k ) {
7379 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7380 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7390 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
7391 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7396 for( ; k<kpos; k+=SIMDSIZE ) {
7397 xmm1 += A.load(i,k) * B.load(k,j);
7400 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7402 for( ; remainder && k<K; ++k ) {
7403 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7425 template<
typename MT3
7434 const size_t M( A.rows() );
7435 const size_t N( B.columns() );
7436 const size_t K( A.columns() );
7442 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
7446 for( ; (j+2UL) <= N; j+=2UL )
7455 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7456 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7458 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7461 for( ; k<kpos; k+=SIMDSIZE )
7463 const SIMDType a1( A.load(i ,k) );
7464 const SIMDType a2( A.load(i+1UL,k) );
7465 const SIMDType a3( A.load(i+2UL,k) );
7466 const SIMDType a4( A.load(i+3UL,k) );
7467 const SIMDType b1( B.load(k,j ) );
7468 const SIMDType b2( B.load(k,j+1UL) );
7479 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7480 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7481 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7482 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7483 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7484 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7485 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7486 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
7488 for( ; remainder && k<kend; ++k ) {
7489 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7490 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7491 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7492 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7493 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7494 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7495 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7496 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
7507 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7508 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7510 SIMDType xmm1, xmm2, xmm3, xmm4;
7513 for( ; k<kpos; k+=SIMDSIZE ) {
7514 const SIMDType b1( B.load(k,j) );
7515 xmm1 += A.load(i ,k) * b1;
7516 xmm2 += A.load(i+1UL,k) * b1;
7517 xmm3 += A.load(i+2UL,k) * b1;
7518 xmm4 += A.load(i+3UL,k) * b1;
7521 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7522 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7523 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
7524 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
7526 for( ; remainder && k<kend; ++k ) {
7527 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7528 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7529 (~C)(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7530 (~C)(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
7535 for( ; (i+2UL) <= M; i+=2UL )
7537 const size_t jend( LOW ? i+2UL : N );
7538 size_t j( UPP ? i : 0UL );
7540 for( ; (j+2UL) <= jend; j+=2UL )
7549 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7550 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7552 SIMDType xmm1, xmm2, xmm3, xmm4;
7555 for( ; k<kpos; k+=SIMDSIZE ) {
7556 const SIMDType a1( A.load(i ,k) );
7557 const SIMDType a2( A.load(i+1UL,k) );
7558 const SIMDType b1( B.load(k,j ) );
7559 const SIMDType b2( B.load(k,j+1UL) );
7566 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7567 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7568 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7569 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7571 for( ; remainder && k<kend; ++k ) {
7572 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7573 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7574 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7575 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7586 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7587 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7589 SIMDType xmm1, xmm2;
7592 for( ; k<kpos; k+=SIMDSIZE ) {
7593 const SIMDType b1( B.load(k,j) );
7594 xmm1 += A.load(i ,k) * b1;
7595 xmm2 += A.load(i+1UL,k) * b1;
7598 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7599 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7601 for( ; remainder && k<kend; ++k ) {
7602 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7603 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7610 const size_t jend( LOW ? i+1UL : N );
7611 size_t j( UPP ? i : 0UL );
7613 for( ; (j+2UL) <= jend; j+=2UL )
7620 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7621 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7623 SIMDType xmm1, xmm2;
7626 for( ; k<kpos; k+=SIMDSIZE ) {
7627 const SIMDType a1( A.load(i,k) );
7628 xmm1 += a1 * B.load(k,j );
7629 xmm2 += a1 * B.load(k,j+1UL);
7632 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7633 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7635 for( ; remainder && k<kend; ++k ) {
7636 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7637 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7647 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
7648 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7653 for( ; k<kpos; k+=SIMDSIZE ) {
7654 xmm1 += A.load(i,k) * B.load(k,j);
7657 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7659 for( ; remainder && k<K; ++k ) {
7660 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7681 template<
typename MT3
7686 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7688 selectDefaultSubAssignKernel( C, A, B, scalar );
7707 template<
typename MT3
7712 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7715 lmmm( C, A, B, -scalar, ST2(1) );
7717 ummm( C, A, B, -scalar, ST2(1) );
7719 mmm( C, A, B, -scalar, ST2(1) );
7737 template<
typename MT3
7742 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7744 selectLargeSubAssignKernel( C, A, B, scalar );
7749 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7763 template<
typename MT3
7768 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7774 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7775 subAssign( C, tmp );
7779 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7780 subAssign( C, tmp );
7783 gemm( C, A, B, ET(-scalar), ET(1) );
7816 template<
typename MT
7829 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7832 else if( left.columns() == 0UL ) {
7866 template<
typename MT
7885 const ForwardFunctor fwd;
7887 const TmpType tmp( rhs );
7907 template<
typename MT
7920 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7957 template<
typename MT
7970 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8053 template<
typename T1
8102 template<
typename MT1
8148 template<
typename MT1
8194 template<
typename MT1
8240 template<
typename MT1
8286 template<
typename MT1
8317 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8318 struct Rows< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Rows<MT1>
8334 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8335 struct Columns< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Columns<MT2>
8351 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8352 struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8353 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
8369 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8370 struct IsSymmetric< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8373 , IsBuiltin< ElementType_< DMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
8374 , And< Bool<LF>, Bool<UF> > >::value >
8390 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
8391 struct IsHermitian< DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
8408 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8409 struct IsLower< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8411 , And< IsLower<MT1>, IsLower<MT2> >
8412 , And< Or< Bool<SF>, Bool<HF> >
8413 , IsUpper<MT1>, IsUpper<MT2> > >::value >
8429 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8430 struct IsUniLower< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8431 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
8432 , And< Or< Bool<SF>, Bool<HF> >
8433 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
8449 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8451 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
8452 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
8453 , And< Or< Bool<SF>, Bool<HF> >
8454 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8455 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
8471 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8472 struct IsUpper< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8474 , And< IsUpper<MT1>, IsUpper<MT2> >
8475 , And< Or< Bool<SF>, Bool<HF> >
8476 , IsLower<MT1>, IsLower<MT2> > >::value >
8492 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8493 struct IsUniUpper< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8494 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
8495 , And< Or< Bool<SF>, Bool<HF> >
8496 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
8512 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8514 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8515 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
8516 , And< Or< Bool<SF>, Bool<HF> >
8517 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
8518 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
8534 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
8552 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
8570 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8588 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8606 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8623 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8640 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8657 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8674 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8691 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
bool AF >
8706 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8707 struct RowExprTrait< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8720 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
typename SubmatrixExprTrait< MT, AF >::Type SubmatrixExprTrait_
Auxiliary alias declaration for the SubmatrixExprTrait type trait.The SubmatrixExprTrait_ alias decla...
Definition: SubmatrixExprTrait.h:134
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:288
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Evaluation of the expression type of a dense matrix declupp operation.Via this type trait it is possi...
Definition: DMatDeclUppExprTrait.h:75
Compile time check for row vector types.This type trait tests whether or not the given template argum...
Definition: IsRowVector.h:80
const DMatForEachExpr< MT, Conj, SO > conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatForEachExpr.h:1214
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE const complex< int8_t > sum(const SIMDcint8 &a) noexcept
Returns the sum of all elements in the 8-bit integral complex SIMD vector.
Definition: Reduction.h:63
Header file for the DMatDeclDiagExprTrait class template.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:560
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:386
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:633
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:550
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:66
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:177
Header file for the TDVecSMatMultExprTrait class template.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
DisableIf_< IsSymmetric< MT >, const DMatDeclSymExpr< MT, SO > > declsym(const DenseMatrix< MT, SO > &dm)
Declares the given non-symmetric dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:841
Evaluation of the expression type of a dense matrix/dense vector multiplication.Via this type trait i...
Definition: DMatDVecMultExprTrait.h:78
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:119
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:486
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:402
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:323
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:284
Evaluation of the expression type of a dense matrix declsym operation.Via this type trait it is possi...
Definition: DMatDeclSymExprTrait.h:75
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
DisableIf_< IsHermitian< MT >, const DMatDeclHermExpr< MT, SO > > declherm(const DenseMatrix< MT, SO > &dm)
Declares the given non-Hermitian dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:841
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:176
Header file for the DisableIf class template.
Compile time check for dense vector types.This type trait tests whether or not the given template par...
Definition: IsDenseVector.h:78
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:196
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:83
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:297
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:294
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:193
Header file for the Or class template.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:456
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:123
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:174
Header file for the DenseMatrix base class.
Header file for the DMatDeclLowExprTrait class template.
Header file for the Columns type trait.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Evaluation of the expression type of a dense matrix/sparse vector multiplication.Via this type trait ...
Definition: DMatSVecMultExprTrait.h:80
Compile time check for sparse vector types.This type trait tests whether or not the given template pa...
Definition: IsSparseVector.h:78
Evaluation of the expression type of a submatrix operation. Via this type trait it is possible to...
Definition: SubmatrixExprTrait.h:80
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:444
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Header file for the DMatDeclUppExprTrait class template.
Header file for the DMatDeclSymExprTrait class template.
Compile time check for column vector types.This type trait tests whether or not the given template ar...
Definition: IsColumnVector.h:80
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:195
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
DisableIf_< IsLower< MT >, const DMatDeclLowExpr< MT, SO > > decllow(const DenseMatrix< MT, SO > &dm)
Declares the given non-lower dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:842
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:487
Evaluation of the expression type of a row operation. Via this type trait it is possible to evalu...
Definition: RowExprTrait.h:79
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:632
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:260
Header file for the DeclDiag functor.
Evaluation of the expression type of a dense matrix declherm operation.Via this type trait it is poss...
Definition: DMatDeclHermExprTrait.h:75
Compile time check for dense matrix types.This type trait tests whether or not the given template par...
Definition: IsDenseMatrix.h:78
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:281
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:279
Header file for the conjugate shim.
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:466
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSIMDCombinable type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
typename TDVecTDMatMultExprTrait< VT, MT >::Type TDVecTDMatMultExprTrait_
Auxiliary alias declaration for the TDVecTDMatMultExprTrait class template.The TDVecTDMatMultExprTrai...
Definition: TDVecTDMatMultExprTrait.h:120
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:173
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:83
Utility type for generic codes.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:283
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:175
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
Compile time type negation. The Not class template negates the given compile time condition. In case the given condition would evaluate to true, the nested member enumeration is set to false and vice versa.
Definition: Not.h:70
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:432
Header file for the DMatDeclHermExprTrait class template.
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename DMatDVecMultExprTrait< MT, VT >::Type DMatDVecMultExprTrait_
Auxiliary alias declaration for the DMatDVecMultExprTrait class template.The DMatDVecMultExprTrait_ a...
Definition: DMatDVecMultExprTrait.h:119
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:282
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:172
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:285
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:422
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
Evaluation of the expression type of a dense vector/dense matrix multiplication.Via this type trait i...
Definition: TDVecDMatMultExprTrait.h:78
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:338
Evaluation of the expression type of a sparse vector/dense matrix multiplication.Via this type trait ...
Definition: TSVecDMatMultExprTrait.h:78
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:291
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
DMatTDMatMultExpr< MT1, MT2, SF, HF, LF, UF > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:277
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:76
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:280
Evaluation of the expression type of a dense matrix decllow operation.Via this type trait it is possi...
Definition: DMatDeclLowExprTrait.h:75
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Evaluation of the expression type of a dense matrix decldiag operation.Via this type trait it is poss...
Definition: DMatDeclDiagExprTrait.h:75
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications.The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:166
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:194
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:412
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:677
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7505
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
DisableIf_< IsDiagonal< MT >, const DMatDeclDiagExpr< MT, SO > > decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given non-diagonal dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:841
DisableIf_< IsUpper< MT >, const DMatDeclUppExpr< MT, SO > > declupp(const DenseMatrix< MT, SO > &dm)
Declares the given non-upper dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:842
Evaluation of the expression type of a column operation. Via this type trait it is possible to ev...
Definition: ColumnExprTrait.h:78
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the DeclSym functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:476
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.