// NOTE(review): the leading integers on these lines look like line numbers of the original
// header that were fused into the text; the lines in between are not visible in this view.
// Include guard of the header, followed by the start of the expression class template.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 136 template<
typename MT1
143 :
// Base class: MatMatMultExpr wrapped around DenseMatrix< DMatTDMatMultExpr<...>, false >.
public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Effective structure flags derived from the SF/HF/LF/UF template flags:
//  - SYM : SF is set and none of HF, LF, UF is set.
//  - HERM: HF is set and neither LF nor UF is set.
//  - LOW : LF is set, or UF is combined with SF or HF.
//  - UPP : UF is set, or LF is combined with SF or HF.
169 SYM = ( SF && !( HF || LF || UF ) ),
170 HERM = ( HF && !( LF || UF ) ),
171 LOW = ( LF || ( ( SF || HF ) && UF ) ),
172 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time helper: 'value' is true as soon as evaluateLeft or evaluateRight is set.
// The template parameters T1, T2, T3 do not appear in the visible condition.
182 template<
typename T1,
typename T2,
typename T3 >
183 struct IsEvaluationRequired {
184 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time helper deciding whether a BLAS kernel may be used for the types T1, T2, T3.
// Visible requirements: BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// enabled and all three types are SIMD-enabled. The remainder of the condition is not
// visible in this view.
194 template<
typename T1,
typename T2,
typename T3 >
195 struct UseBlasKernel {
196 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
202 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper deciding whether the vectorized default kernel may be used for the
// types T1, T2, T3. Visible requirements: useOptimizedKernels is set and all three types are
// SIMD-enabled. The remainder of the condition is not visible in this view.
217 template<
typename T1,
typename T2,
typename T3 >
218 struct UseVectorizedDefaultKernel {
219 enum :
bool { value = useOptimizedKernels &&
221 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Part of a compile-time condition requiring both operand types to be SIMD-enabled
// (the start and end of that condition are not visible in this view).
279 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable: true only if neither operand requires evaluation (evaluateLeft /
// evaluateRight) and both operand types are themselves SMP-assignable.
284 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
285 !evaluateRight && MT2::smpAssignable };
// Fragments of the element-access and size functions; their signatures are mostly not
// visible in this view.
// Tail of a conditional expression whose last alternative is lhs_.columns().
340 :(
lhs_.columns() ) ) );
// n is the length of the range [begin, end).
344 const size_t n(
end - begin );
// Range checks: row index i against lhs_.rows(), column index j against rhs_.columns().
// The action taken on failure is not visible here.
364 if( i >=
lhs_.rows() ) {
367 if( j >=
rhs_.columns() ) {
// Start of rows(); its body is not visible in this view.
379 inline size_t rows() const noexcept {
// Returns the number of columns of the right-hand side operand
// (presumably the body of columns() - the enclosing signature is not visible).
390 return rhs_.columns();
// Reports whether the expression can alias with the given address: true if either operand
// reports isAliased( alias ). Note that both operands are queried via isAliased(), not
// canAlias().
420 template<
typename T >
421 inline bool canAlias(
const T* alias )
const noexcept {
422 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address: true if either the
// left-hand side or the right-hand side operand reports isAliased( alias ).
432 template<
typename T >
433 inline bool isAliased(
const T* alias )
const noexcept {
434 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// The expression counts as aligned only if both operands are aligned
// (the enclosing signature is not visible in this view).
444 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of a size-based condition (its enclosing function is not visible here):
//  - either BLAS matrix/matrix multiplication is disabled, or the result size
//    rows()*columns() is below DMATTDMATMULT_THRESHOLD (further alternatives not visible),
//  - and the result size reaches SMP_DMATTDMATMULT_THRESHOLD.
455 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
457 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
458 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
// Fragment of the assignment of the multiplication expression 'rhs' to the target 'lhs'
// (signature not visible in this view).
482 template<
typename MT
// Special case 1: the target has no rows or no columns (handling not visible here).
491 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension (columns of the left operand) is zero
// (handling not visible here).
494 else if( rhs.
lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B to the kernel selection.
509 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel selection for C = A * B. The visible part of the condition compares the number of
// elements of C against DMATTDMATMULT_THRESHOLD: one branch calls the small kernel, the
// other the BLAS kernel selection. The rest of the condition and the 'else' are not
// visible in this view.
525 template<
typename MT3
528 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
531 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
532 selectSmallAssignKernel( C, A, B );
534 selectBlasAssignKernel( C, A, B );
// Fragment of a scalar (non-SIMD) kernel computing C = A * B row by row. The signature and
// the definitions of ibegin, iend, kbegin and kend are not visible in this view
// (presumably a general selectDefaultAssignKernel overload - confirm against the full file).
553 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
559 const size_t M( A.rows() );
560 const size_t N( B.columns() );
561 const size_t K( A.columns() );
// Rows before ibegin are visited completely, without any product (loop body not visible).
573 for(
size_t i=0UL; i<ibegin; ++i ) {
574 for(
size_t j=0UL; j<N; ++j ) {
// Rows [ibegin, iend): only the columns [jbegin, jend) receive a computed product.
578 for(
size_t i=ibegin; i<iend; ++i )
// Tail of the jbegin expression: with SYM, HERM or UPP the column range starts at the
// diagonal (i), and not before column 1 in the first alternative.
585 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
586 :( SYM || HERM || UPP ? i : 0UL ) ) );
// Tail of the jend expression: with LOW the column range ends just after the diagonal
// (i+1), capped at N-1 in the first alternative.
592 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
593 :( LOW ? i+1UL : N ) ) );
// Empty column range for a structured result: the whole row is visited instead
// (loop body not visible).
595 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
596 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of jbegin; for SYM/HERM the part left of the diagonal is left out here
// and filled by the mirror step at the end (loop body not visible).
604 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
607 for(
size_t j=jbegin; j<jend; ++j )
// Inner product over [kbegin, kend): the first term initializes the element, the
// remaining terms are accumulated.
627 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
628 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
629 (~C)(i,j) += A(i,k) * B(k,j);
// Columns behind jend (loop body not visible).
632 for(
size_t j=jend; j<N; ++j ) {
// Rows from iend on are visited completely (loop body not visible).
636 for(
size_t i=iend; i<M; ++i ) {
637 for(
size_t j=0UL; j<N; ++j ) {
// Mirror step: every element below the diagonal is copied from its transposed position,
// conjugated for HERM. The guarding condition is not visible in this view.
643 for(
size_t i=1UL; i<M; ++i ) {
644 for(
size_t j=0UL; j<i; ++j ) {
645 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Fragment of a scalar (non-SIMD) kernel computing C = A * B column by column; this is the
// counterpart of a row-wise kernel with the roles of i and j exchanged. The signature and
// the definitions of jbegin, jend, kbegin and kend are not visible in this view.
667 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
673 const size_t M( A.rows() );
674 const size_t N( B.columns() );
675 const size_t K( A.columns() );
// Columns before jbegin are visited completely, without any product (loop body not visible).
687 for(
size_t j=0UL; j<jbegin; ++j ) {
688 for(
size_t i=0UL; i<M; ++i ) {
// Columns [jbegin, jend): only the rows [ibegin, iend) receive a computed product.
692 for(
size_t j=jbegin; j<jend; ++j )
// Tail of the ibegin expression: with SYM, HERM or LOW the row range starts at the
// diagonal (j), and not before row 1 in the first alternative.
699 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
700 :( SYM || HERM || LOW ? j : 0UL ) ) );
// Tail of the iend expression: with UPP the row range ends just after the diagonal (j+1),
// capped at M-1 in the first alternative.
706 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
707 :( UPP ? j+1UL : M ) ) );
// Empty row range for a structured result: the whole column is visited instead
// (loop body not visible).
709 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
710 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of ibegin; for SYM/HERM the part above the diagonal is left out here and
// filled by the mirror step at the end (loop body not visible).
718 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
721 for(
size_t i=ibegin; i<iend; ++i )
// Inner product over [kbegin, kend): the first term initializes the element, the
// remaining terms are accumulated.
741 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
742 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
743 (~C)(i,j) += A(i,k) * B(k,j);
// Rows behind iend (loop body not visible).
746 for(
size_t i=iend; i<M; ++i ) {
// Columns from jend on are visited completely (loop body not visible).
750 for(
size_t j=jend; j<N; ++j ) {
751 for(
size_t i=0UL; i<M; ++i ) {
// Mirror step: every element above the diagonal is copied from its transposed position,
// conjugated for HERM. The guarding condition is not visible in this view.
757 for(
size_t j=1UL; j<N; ++j ) {
758 for(
size_t i=0UL; i<j; ++i ) {
759 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Fragment of a default kernel for C = A * B that is enabled when A (MT4) is not diagonal
// and B (MT5) is diagonal. The function name and the definitions of jbegin and jend are not
// visible in this view.
781 template<
typename MT3
784 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
787 const size_t M( A.rows() );
788 const size_t N( B.columns() );
// Row-wise traversal of C.
790 for(
size_t i=0UL; i<M; ++i )
// Columns in front of jbegin (loop body not visible).
801 for(
size_t j=0UL; j<jbegin; ++j ) {
// Since B is diagonal, only B(j,j) contributes: each element is a single product.
805 for(
size_t j=jbegin; j<jend; ++j ) {
806 (~C)(i,j) = A(i,j) * B(j,j);
// Columns from jend on (loop body not visible).
809 for(
size_t j=jend; j<N; ++j ) {
// Fragment of a blocked default kernel for C = A * B that is enabled when A (MT4) is not
// diagonal and B (MT5) is diagonal. The function name and the definitions of ibegin and
// ipos are not visible in this view.
832 template<
typename MT3
835 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Edge length of the square tiles used for the traversal.
838 constexpr
size_t block( BLOCK_SIZE );
840 const size_t M( A.rows() );
841 const size_t N( B.columns() );
// C is traversed in tiles of block x block elements; jend and iend clip the last tile in
// each direction to the matrix size.
843 for(
size_t jj=0UL; jj<N; jj+=block ) {
844 const size_t jend(
min( N, jj+block ) );
845 for(
size_t ii=0UL; ii<M; ii+=block ) {
846 const size_t iend(
min( M, ii+block ) );
847 for(
size_t j=jj; j<jend; ++j )
// Rows of the tile in front of ibegin (loop body not visible).
857 for(
size_t i=ii; i<ibegin; ++i ) {
// Since B is diagonal, only B(j,j) contributes: each element is a single product.
861 for(
size_t i=ibegin; i<ipos; ++i ) {
862 (~C)(i,j) = A(i,j) * B(j,j);
// Rows of the tile from ipos on (loop body not visible).
865 for(
size_t i=ipos; i<iend; ++i ) {
// Fragment of a blocked kernel for C = A * B in which only the diagonal element A(i,i) of
// the left operand is used (presumably the overload for a diagonal left operand; the
// signature is not visible in this view). jbegin and jpos are defined in lines not shown.
890 template<
typename MT3
// Edge length of the square tiles used for the traversal.
896 constexpr
size_t block( BLOCK_SIZE );
898 const size_t M( A.rows() );
899 const size_t N( B.columns() );
// C is traversed in tiles of block x block elements; iend and jend clip the last tile in
// each direction to the matrix size.
901 for(
size_t ii=0UL; ii<M; ii+=block ) {
902 const size_t iend(
min( M, ii+block ) );
903 for(
size_t jj=0UL; jj<N; jj+=block ) {
904 const size_t jend(
min( N, jj+block ) );
905 for(
size_t i=ii; i<iend; ++i )
// Columns of the tile in front of jbegin (loop body not visible).
915 for(
size_t j=jj; j<jbegin; ++j ) {
// Each element is the single product of A's diagonal element of row i with B(i,j).
919 for(
size_t j=jbegin; j<jpos; ++j ) {
920 (~C)(i,j) = A(i,i) * B(i,j);
// Columns of the tile from jpos on (loop body not visible).
923 for(
size_t j=jpos; j<jend; ++j ) {
// Fragment of a column-wise kernel for C = A * B in which only the diagonal element A(i,i)
// of the left operand is used (presumably the overload for a diagonal left operand; the
// signature is not visible in this view). ibegin and iend are defined in lines not shown.
948 template<
typename MT3
954 const size_t M( A.rows() );
955 const size_t N( B.columns() );
957 for(
size_t j=0UL; j<N; ++j )
// Rows in front of ibegin (loop body not visible).
968 for(
size_t i=0UL; i<ibegin; ++i ) {
// Each element is the single product of A's diagonal element of row i with B(i,j).
972 for(
size_t i=ibegin; i<iend; ++i ) {
973 (~C)(i,j) = A(i,i) * B(i,j);
// Rows from iend on (loop body not visible).
976 for(
size_t i=iend; i<M; ++i ) {
// Default kernel overload that writes only the diagonal of C: C(i,i) = A(i,i) * B(i,i) for
// every row of A (presumably the overload for two diagonal operands; its enabling
// condition is not visible in this view).
999 template<
typename MT3
1003 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1007 for(
size_t i=0UL; i<A.rows(); ++i ) {
1008 C(i,i) = A(i,i) * B(i,i);
// Overload of the small-matrix kernel that simply forwards to the default kernel
// (its enabling condition is not visible in this view).
1028 template<
typename MT3
1032 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1034 selectDefaultAssignKernel( C, A, B );
// Fragment of a SIMD kernel that overwrites C with A * B using register tiles of C
// (presumably the vectorized selectSmallAssignKernel overload; the signature is not visible
// in this view). For each tile, SIMD accumulators are filled along k with A.load/B.load and
// reduced with sum(); 'remainder', kbegin and kend are defined in lines not shown.
1054 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
1062 const size_t M( A.rows() );
1063 const size_t N( B.columns() );
1064 const size_t K( A.columns() );
// Main loop: two rows of C per iteration. It is skipped completely when LOW && UPP.
1075 for( ; !( LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// Column range of this row pair: it ends at i+2 for LOW and starts at i for SYM, HERM or UPP.
1077 const size_t jend( LOW ? i+2UL : N );
1078 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x4 tile of C.
1080 for( ; (j+4UL) <= jend; j+=4UL )
// kpos: end of the SIMD loop. With a scalar remainder loop, kend is rounded down to a
// multiple of SIMDSIZE (as checked by the assertion); otherwise kend is used directly.
1089 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1090 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
1092 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1095 for( ; k<kpos; k+=SIMDSIZE ) {
1097 const SIMDType a2( A.load(i+1UL,k) );
1099 const SIMDType b2( B.load(k,j+1UL) );
1100 const SIMDType b3( B.load(k,j+2UL) );
1101 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of each accumulator yields the corresponding element of C.
1112 (~C)(i ,j ) =
sum( xmm1 );
1113 (~C)(i ,j+1UL) =
sum( xmm2 );
1114 (~C)(i ,j+2UL) =
sum( xmm3 );
1115 (~C)(i ,j+3UL) =
sum( xmm4 );
1116 (~C)(i+1UL,j ) =
sum( xmm5 );
1117 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
1118 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
1119 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend) that the SIMD loop did not cover.
1121 for( ; remainder && k<kend; ++k ) {
1122 (~C)(i ,j ) += A(i ,k) * B(k,j );
1123 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1124 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1125 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1126 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1127 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1128 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1129 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile of C for the columns left over by the 2x4 loop.
1133 for( ; (j+2UL) <= jend; j+=2UL )
1142 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1143 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1148 for( ; k<kpos; k+=SIMDSIZE ) {
1150 const SIMDType a2( A.load(i+1UL,k) );
1152 const SIMDType b2( B.load(k,j+1UL) );
1159 (~C)(i ,j ) =
sum( xmm1 );
1160 (~C)(i ,j+1UL) =
sum( xmm2 );
1161 (~C)(i+1UL,j ) =
sum( xmm3 );
1162 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1164 for( ; remainder && k<kend; ++k ) {
1165 (~C)(i ,j ) += A(i ,k) * B(k,j );
1166 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1167 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1168 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: a single column j for both rows (the enclosing condition is not visible).
1179 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1180 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1185 for( ; k<kpos; k+=SIMDSIZE ) {
1187 xmm1 += A.load(i ,k) * b1;
1188 xmm2 += A.load(i+1UL,k) * b1;
1191 (~C)(i ,j) =
sum( xmm1 );
1192 (~C)(i+1UL,j) =
sum( xmm2 );
1194 for( ; remainder && k<kend; ++k ) {
1195 (~C)(i ,j) += A(i ,k) * B(k,j);
1196 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single-row part (its loop header is not visible): the column range ends at i+1 for LOW
// and starts at i for SYM, HERM or UPP.
1203 const size_t jend( LOW ? i+1UL : N );
1204 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x4 tile; skipped when LOW && UPP.
1206 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
1213 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1214 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1219 for( ; k<kpos; k+=SIMDSIZE ) {
1221 xmm1 += a1 * B.load(k,j );
1222 xmm2 += a1 * B.load(k,j+1UL);
1223 xmm3 += a1 * B.load(k,j+2UL);
1224 xmm4 += a1 * B.load(k,j+3UL);
1227 (~C)(i,j ) =
sum( xmm1 );
1228 (~C)(i,j+1UL) =
sum( xmm2 );
1229 (~C)(i,j+2UL) =
sum( xmm3 );
1230 (~C)(i,j+3UL) =
sum( xmm4 );
1232 for( ; remainder && k<kend; ++k ) {
1233 (~C)(i,j ) += A(i,k) * B(k,j );
1234 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1235 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
1236 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tile; skipped when LOW && UPP.
1240 for( ; !( LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1247 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1248 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1253 for( ; k<kpos; k+=SIMDSIZE ) {
1255 xmm1 += a1 * B.load(k,j );
1256 xmm2 += a1 * B.load(k,j+1UL);
1259 (~C)(i,j ) =
sum( xmm1 );
1260 (~C)(i,j+1UL) =
sum( xmm2 );
1262 for( ; remainder && k<kend; ++k ) {
1263 (~C)(i,j ) += A(i,k) * B(k,j );
1264 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Single element of C; here the k range is bounded by K instead of kend.
1268 for( ; j<jend; ++j )
1274 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
1275 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1280 for( ; k<kpos; k+=SIMDSIZE ) {
1281 xmm1 += A.load(i,k) * B.load(k,j);
1284 (~C)(i,j) =
sum( xmm1 );
1286 for( ; remainder && k<K; ++k ) {
1287 (~C)(i,j) += A(i,k) * B(k,j);
// Post-processing, first case (its condition is not visible): for every row i >= 2 the
// columns [0, 2*(i/2)) are copied from the transposed position, conjugated for HERM.
// 2*(i/2) is i rounded down to an even number, matching the two-row tiling above.
1294 for(
size_t i=2UL; i<M; ++i ) {
1295 const size_t jend( 2UL * ( i/2UL ) );
1296 for(
size_t j=0UL; j<jend; ++j ) {
1297 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Post-processing for a purely lower result: visits the elements (i,j) with j >= 2 and
// i < 2*(j/2) (loop body not visible).
1301 else if( LOW && !UPP ) {
1302 for(
size_t j=2UL; j<N; ++j ) {
1303 const size_t iend( 2UL * ( j/2UL ) );
1304 for(
size_t i=0UL; i<iend; ++i ) {
// Post-processing for a purely upper result: visits the elements (i,j) with i >= 2 and
// j < 2*(i/2) (loop body not visible).
1309 else if( !LOW && UPP ) {
1310 for(
size_t i=2UL; i<M; ++i ) {
1311 const size_t jend( 2UL * ( i/2UL ) );
1312 for(
size_t j=0UL; j<jend; ++j ) {
// Fragment of a second SIMD kernel that overwrites C with A * B, this time with tiles of
// four rows by two columns first (presumably the vectorized selectSmallAssignKernel overload
// for the other storage order of C; the signature is not visible in this view).
// 'remainder', kbegin and kend are defined in lines not shown.
1336 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
1344 const size_t M( A.rows() );
1345 const size_t N( B.columns() );
1346 const size_t K( A.columns() );
// Main loop: four rows of C per iteration. It is skipped completely when LOW && UPP.
1357 for( ; !( LOW &&
UPP ) && (i+4UL) <= M; i+=4UL )
// Column range of this row group: it ends at i+4 for SYM, HERM or LOW and starts at i for UPP.
1359 const size_t jend( SYM || HERM || LOW ? i+4UL : N );
1360 size_t j( UPP ? i : 0UL );
// 4x2 tile of C.
1362 for( ; (j+2UL) <= jend; j+=2UL )
// kpos: end of the SIMD loop. With a scalar remainder loop, kend is rounded down to a
// multiple of SIMDSIZE (as checked by the assertion); otherwise kend is used directly.
1371 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1372 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
1374 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1377 for( ; k<kpos; k+=SIMDSIZE ) {
1379 const SIMDType a2( A.load(i+1UL,k) );
1380 const SIMDType a3( A.load(i+2UL,k) );
1381 const SIMDType a4( A.load(i+3UL,k) );
1383 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of each accumulator yields the corresponding element of C.
1394 (~C)(i ,j ) =
sum( xmm1 );
1395 (~C)(i ,j+1UL) =
sum( xmm2 );
1396 (~C)(i+1UL,j ) =
sum( xmm3 );
1397 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1398 (~C)(i+2UL,j ) =
sum( xmm5 );
1399 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
1400 (~C)(i+3UL,j ) =
sum( xmm7 );
1401 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend) that the SIMD loop did not cover.
1403 for( ; remainder && k<kend; ++k ) {
1404 (~C)(i ,j ) += A(i ,k) * B(k,j );
1405 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1406 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1407 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1408 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1409 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1410 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1411 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: a single column j for all four rows (the enclosing condition is not visible).
1422 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1423 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1428 for( ; k<kpos; k+=SIMDSIZE ) {
1430 xmm1 += A.load(i ,k) * b1;
1431 xmm2 += A.load(i+1UL,k) * b1;
1432 xmm3 += A.load(i+2UL,k) * b1;
1433 xmm4 += A.load(i+3UL,k) * b1;
1436 (~C)(i ,j) =
sum( xmm1 );
1437 (~C)(i+1UL,j) =
sum( xmm2 );
1438 (~C)(i+2UL,j) =
sum( xmm3 );
1439 (~C)(i+3UL,j) =
sum( xmm4 );
1441 for( ; remainder && k<kend; ++k ) {
1442 (~C)(i ,j) += A(i ,k) * B(k,j);
1443 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1444 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
1445 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
// Two rows of C per iteration for the rows left over by the four-row loop.
1450 for( ; !( LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// 2x2 tile of C.
// NOTE(review): this loop is bounded by N, whereas the 4x2 loop above is bounded by a
// structure-dependent jend; the initialization of j is not visible here - confirm that the
// full column range is intended for structured results.
1454 for( ; (j+2UL) <= N; j+=2UL )
1463 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1464 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1469 for( ; k<kpos; k+=SIMDSIZE ) {
1471 const SIMDType a2( A.load(i+1UL,k) );
1473 const SIMDType b2( B.load(k,j+1UL) );
1480 (~C)(i ,j ) =
sum( xmm1 );
1481 (~C)(i ,j+1UL) =
sum( xmm2 );
1482 (~C)(i+1UL,j ) =
sum( xmm3 );
1483 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1485 for( ; remainder && k<kend; ++k ) {
1486 (~C)(i ,j ) += A(i ,k) * B(k,j );
1487 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1488 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1489 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: a single column j for both rows (the enclosing condition is not visible).
1500 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1501 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1506 for( ; k<kpos; k+=SIMDSIZE ) {
1508 xmm1 += A.load(i ,k) * b1;
1509 xmm2 += A.load(i+1UL,k) * b1;
1512 (~C)(i ,j) =
sum( xmm1 );
1513 (~C)(i+1UL,j) =
sum( xmm2 );
1515 for( ; remainder && k<kend; ++k ) {
1516 (~C)(i ,j) += A(i ,k) * B(k,j);
1517 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single-row part (its loop header is not visible): when LOW && UPP only the column i
// itself is in range, otherwise the full range [0, N).
1524 const size_t jend( LOW && UPP ? i+1UL : N );
1525 size_t j( LOW && UPP ? i : 0UL );
// 1x2 tile; skipped when LOW && UPP.
1527 for( ; !( LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1534 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1535 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1540 for( ; k<kpos; k+=SIMDSIZE ) {
1542 xmm1 += a1 * B.load(k,j );
1543 xmm2 += a1 * B.load(k,j+1UL);
1546 (~C)(i,j ) =
sum( xmm1 );
1547 (~C)(i,j+1UL) =
sum( xmm2 );
1549 for( ; remainder && k<kend; ++k ) {
1550 (~C)(i,j ) += A(i,k) * B(k,j );
1551 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Single element of C; here the k range is bounded by K instead of kend.
1555 for( ; j<jend; ++j )
1561 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
1562 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1567 for( ; k<kpos; k+=SIMDSIZE ) {
1568 xmm1 += A.load(i,k) * B.load(k,j);
1571 (~C)(i,j) =
sum( xmm1 );
1573 for( ; remainder && k<K; ++k ) {
1574 (~C)(i,j) += A(i,k) * B(k,j);
// Post-processing for SYM/HERM results with more than four columns: for every column
// j >= 4 the rows [0, 4*(j/4)) are copied from the transposed position, conjugated for
// HERM. 4*(j/4) is j rounded down to a multiple of four, matching the four-row tiling.
1580 if( ( SYM || HERM ) && ( N > 4UL ) ) {
1581 for(
size_t j=4UL; j<N; ++j ) {
1582 const size_t iend( 4UL * ( j/4UL ) );
1583 for(
size_t i=0UL; i<iend; ++i ) {
1584 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Post-processing for a purely lower result: visits the elements (i,j) with j >= 4 and
// i < 4*(j/4) (loop body not visible).
1588 else if( LOW && !UPP ) {
1589 for(
size_t j=4UL; j<N; ++j ) {
1590 const size_t iend( 4UL * ( j/4UL ) );
1591 for(
size_t i=0UL; i<iend; ++i ) {
// Post-processing for a purely upper result: visits the elements (i,j) with i >= 4 and
// j < 4*(i/4) (loop body not visible).
// NOTE(review): the row index i is bounded by N rather than M; presumably the result is
// square in the structured cases - confirm.
1596 else if( !LOW && UPP ) {
1597 for(
size_t i=4UL; i<N; ++i ) {
1598 const size_t jend( 4UL * ( i/4UL ) );
1599 for(
size_t j=0UL; j<jend; ++j ) {
// Overload of the large-matrix kernel that simply forwards to the default kernel
// (its enabling condition is not visible in this view).
1622 template<
typename MT3
1626 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1628 selectDefaultAssignKernel( C, A, B );
// Second overload of the large-matrix kernel for C, A and B; neither its enabling condition
// nor its body is visible in this view.
1648 template<
typename MT3
1652 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Overload of the BLAS kernel selection that falls back to the large-matrix kernel
// (its enabling condition is not visible in this view).
1682 template<
typename MT3
1686 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1688 selectLargeAssignKernel( C, A, B );
// BLAS-based overload; it is only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled.
1694 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1708 template<
typename MT3
1712 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// gemm is called with the scalars ET(1) and ET(0) - presumably alpha = 1 and beta = 0,
// i.e. C is overwritten with A * B; confirm against the gemm wrapper.
1725 gemm( C, A, B, ET(1), ET(0) );
// Fragment of an assignment that goes through a temporary (signature and the definitions
// of ForwardFunctor and TmpType are not visible in this view).
1745 template<
typename MT
1763 const ForwardFunctor fwd;
// The expression is first evaluated serially into a temporary of type TmpType ...
1765 const TmpType tmp(
serial( rhs ) );
// ... which is then passed through the forward functor and assigned to the target.
1766 assign( ~lhs, fwd( tmp ) );
// Fragment of the addition assignment of the multiplication expression 'rhs' to the target
// 'lhs' (signature not visible in this view).
1784 template<
typename MT
// Special case: the target has no rows or no columns, or the inner dimension (columns of
// the left operand) is zero (handling not visible here).
1793 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B to the kernel selection.
1807 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for C += A * B. The visible part of the condition compares the number
// of elements of C against DMATTDMATMULT_THRESHOLD: one branch calls the small kernel, the
// other the BLAS kernel selection. The rest of the condition and the 'else' are not
// visible in this view.
1823 template<
typename MT3
1826 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1829 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1830 selectSmallAddAssignKernel( C, A, B );
1832 selectBlasAddAssignKernel( C, A, B );
// Fragment of a scalar (non-SIMD) kernel computing C += A * B row by row (presumably a
// general selectDefaultAddAssignKernel overload; the signature and the definitions of
// ibegin, iend, kbegin and kend are not visible in this view).
1851 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
1857 const size_t M( A.rows() );
1858 const size_t N( B.columns() );
1859 const size_t K( A.columns() );
// Only rows [ibegin, iend) and, per row, columns [jbegin, jend) are updated.
1871 for(
size_t i=ibegin; i<iend; ++i )
// Tail of the jbegin expression: with UPP the column range starts at the diagonal (i),
// and not before column 1 in the first alternative.
1878 ?( UPP ?
max( i, 1UL ) : 1UL )
1879 :( UPP ? i : 0UL ) ) );
// Tail of the jend expression: with LOW the column range ends just after the diagonal
// (i+1), capped at N-1 in the first alternative.
1885 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
1886 :( LOW ? i+1UL : N ) ) );
// Nothing to add for this row if the column range of a triangular result is empty.
1888 if( ( LOW || UPP ) && ( jbegin > jend ) )
continue;
1891 for(
size_t j=jbegin; j<jend; ++j )
// The k loop is unrolled by two: kpos is kbegin plus the number of terms rounded down to
// an even count.
1911 const size_t knum( kend - kbegin );
1912 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1914 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1915 (~C)(i,j) += A(i,k ) * B(k ,j);
1916 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover term at kpos (presumably guarded by a check that kpos < kend, which is not
// visible in this view).
1919 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Fragment of a scalar (non-SIMD) kernel computing C += A * B column by column; this is
// the counterpart of a row-wise kernel with the roles of i and j exchanged. The signature
// and the definitions of jbegin, jend, kbegin and kend are not visible in this view.
1941 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
1947 const size_t M( A.rows() );
1948 const size_t N( B.columns() );
1949 const size_t K( A.columns() );
// Only columns [jbegin, jend) and, per column, rows [ibegin, iend) are updated.
1961 for(
size_t j=jbegin; j<jend; ++j )
// Tail of the ibegin expression: with LOW the row range starts at the diagonal (j), and
// not before row 1 in the first alternative.
1968 ?( LOW ?
max( j, 1UL ) : 1UL )
1969 :( LOW ? j : 0UL ) ) );
// Tail of the iend expression: with UPP the row range ends just after the diagonal (j+1),
// capped at M-1 in the first alternative.
1975 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
1976 :( UPP ? j+1UL : M ) ) );
// Nothing to add for this column if the row range of a triangular result is empty.
1978 if( ( LOW || UPP ) && ( ibegin > iend ) )
continue;
1981 for(
size_t i=ibegin; i<iend; ++i )
// The k loop is unrolled by two: kpos is kbegin plus the number of terms rounded down to
// an even count.
2001 const size_t knum( kend - kbegin );
2002 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
2004 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2005 (~C)(i,j) += A(i,k ) * B(k ,j);
2006 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover term at kpos (presumably guarded by a check that kpos < kend, which is not
// visible in this view).
2009 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Fragment of a default kernel for C += A * B that is enabled when A (MT4) is not diagonal
// and B (MT5) is diagonal. The function name and the definitions of jbegin and jend are not
// visible in this view.
2031 template<
typename MT3
2034 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2037 const size_t M( A.rows() );
2038 const size_t N( B.columns() );
2040 for(
size_t i=0UL; i<M; ++i )
// The column loop is unrolled by two: jpos is jbegin plus the number of columns rounded
// down to an even count.
2050 const size_t jnum( jend - jbegin );
2051 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Since B is diagonal, only B(j,j) contributes to column j.
2053 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2054 (~C)(i,j ) += A(i,j ) * B(j ,j );
2055 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at jpos (presumably guarded by a check that jpos < jend, which is not
// visible in this view).
2058 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Fragment of a blocked default kernel for C += A * B that is enabled when A (MT4) is not
// diagonal and B (MT5) is diagonal. The function name and the definitions of ibegin and
// ipos are not visible in this view.
2079 template<
typename MT3
2082 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
// Edge length of the square tiles used for the traversal.
2085 constexpr
size_t block( BLOCK_SIZE );
2087 const size_t M( A.rows() );
2088 const size_t N( B.columns() );
// C is traversed in tiles of block x block elements; jend and iend clip the last tile in
// each direction to the matrix size.
2090 for(
size_t jj=0UL; jj<N; jj+=block ) {
2091 const size_t jend(
min( N, jj+block ) );
2092 for(
size_t ii=0UL; ii<M; ii+=block ) {
2093 const size_t iend(
min( M, ii+block ) );
2094 for(
size_t j=jj; j<jend; ++j )
// Since B is diagonal, only B(j,j) contributes to column j.
2103 for(
size_t i=ibegin; i<ipos; ++i ) {
2104 (~C)(i,j) += A(i,j) * B(j,j);
// Fragment of a blocked kernel for C += A * B in which only the diagonal element A(i,i) of
// the left operand is used (presumably the overload for a diagonal left operand; the
// signature is not visible in this view). jbegin and jpos are defined in lines not shown.
2127 template<
typename MT3
// Edge length of the square tiles used for the traversal.
2133 constexpr
size_t block( BLOCK_SIZE );
2135 const size_t M( A.rows() );
2136 const size_t N( B.columns() );
// C is traversed in tiles of block x block elements; iend and jend clip the last tile in
// each direction to the matrix size.
2138 for(
size_t ii=0UL; ii<M; ii+=block ) {
2139 const size_t iend(
min( M, ii+block ) );
2140 for(
size_t jj=0UL; jj<N; jj+=block ) {
2141 const size_t jend(
min( N, jj+block ) );
2142 for(
size_t i=ii; i<iend; ++i )
// Each update is the single product of A's diagonal element of row i with B(i,j).
2151 for(
size_t j=jbegin; j<jpos; ++j ) {
2152 (~C)(i,j) += A(i,i) * B(i,j);
// Fragment of a column-wise kernel for C += A * B in which only the diagonal element A(i,i)
// of the left operand is used (presumably the overload for a diagonal left operand; the
// signature is not visible in this view). ibegin and iend are defined in lines not shown.
2175 template<
typename MT3
2181 const size_t M( A.rows() );
2182 const size_t N( B.columns() );
2184 for(
size_t j=0UL; j<N; ++j )
// The row loop is unrolled by two: ipos is ibegin plus the number of rows rounded down to
// an even count.
2194 const size_t inum( iend - ibegin );
2195 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2197 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2198 (~C)(i ,j) += A(i ,i ) * B(i ,j);
2199 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at ipos (presumably guarded by a check that ipos < iend, which is not
// visible in this view).
2202 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default kernel overload that updates only the diagonal of C: C(i,i) += A(i,i) * B(i,i)
// for every row of A (presumably the overload for two diagonal operands; its enabling
// condition is not visible in this view).
2223 template<
typename MT3
2227 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2229 for(
size_t i=0UL; i<A.rows(); ++i ) {
2230 C(i,i) += A(i,i) * B(i,i);
// Overload of the small-matrix add-assign kernel that simply forwards to the default
// add-assign kernel (its enabling condition is not visible in this view).
2250 template<
typename MT3
2254 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2256 selectDefaultAddAssignKernel( C, A, B );
// Fragment of a SIMD kernel that adds A * B to C using register tiles of C (presumably the
// vectorized selectSmallAddAssignKernel overload; the signature is not visible in this
// view). For each tile, SIMD accumulators are filled along k with A.load/B.load, reduced
// with sum() and added to C; 'remainder', kbegin and kend are defined in lines not shown.
2276 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
2284 const size_t M( A.rows() );
2285 const size_t N( B.columns() );
2286 const size_t K( A.columns() );
// Main loop: two rows of C per iteration.
2292 for( ; (i+2UL) <= M; i+=2UL )
// Column range of this row pair: it ends at i+2 for LOW and starts at i for UPP.
2294 const size_t jend( LOW ? i+2UL : N );
2295 size_t j( UPP ? i : 0UL );
// 2x4 tile of C; skipped when LOW && UPP.
2297 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// kpos: end of the SIMD loop. With a scalar remainder loop, kend is rounded down to a
// multiple of SIMDSIZE (as checked by the assertion); otherwise kend is used directly.
2306 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2307 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
2309 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2312 for( ; k<kpos; k+=SIMDSIZE ) {
2314 const SIMDType a2( A.load(i+1UL,k) );
2316 const SIMDType b2( B.load(k,j+1UL) );
2317 const SIMDType b3( B.load(k,j+2UL) );
2318 const SIMDType b4( B.load(k,j+3UL) );
// The horizontal sum of each accumulator is added to the existing element of C.
2329 (~C)(i ,j ) +=
sum( xmm1 );
2330 (~C)(i ,j+1UL) +=
sum( xmm2 );
2331 (~C)(i ,j+2UL) +=
sum( xmm3 );
2332 (~C)(i ,j+3UL) +=
sum( xmm4 );
2333 (~C)(i+1UL,j ) +=
sum( xmm5 );
2334 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
2335 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
2336 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend) that the SIMD loop did not cover.
2338 for( ; remainder && k<kend; ++k ) {
2339 (~C)(i ,j ) += A(i ,k) * B(k,j );
2340 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2341 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2342 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2343 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2344 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2345 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2346 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile of C for the columns left over by the 2x4 loop.
2350 for( ; (j+2UL) <= jend; j+=2UL )
2359 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2360 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2365 for( ; k<kpos; k+=SIMDSIZE ) {
2367 const SIMDType a2( A.load(i+1UL,k) );
2369 const SIMDType b2( B.load(k,j+1UL) );
2376 (~C)(i ,j ) +=
sum( xmm1 );
2377 (~C)(i ,j+1UL) +=
sum( xmm2 );
2378 (~C)(i+1UL,j ) +=
sum( xmm3 );
2379 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2381 for( ; remainder && k<kend; ++k ) {
2382 (~C)(i ,j ) += A(i ,k) * B(k,j );
2383 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2384 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2385 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: a single column j for both rows (the enclosing condition is not visible).
2396 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2397 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2402 for( ; k<kpos; k+=SIMDSIZE ) {
2404 xmm1 += A.load(i ,k) * b1;
2405 xmm2 += A.load(i+1UL,k) * b1;
2408 (~C)(i ,j) +=
sum( xmm1 );
2409 (~C)(i+1UL,j) +=
sum( xmm2 );
2411 for( ; remainder && k<kend; ++k ) {
2412 (~C)(i ,j) += A(i ,k) * B(k,j);
2413 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single-row part (its loop header is not visible): the column range ends at i+1 for LOW
// and starts at i for UPP.
2420 const size_t jend( LOW ? i+1UL : N );
2421 size_t j( UPP ? i : 0UL );
// 1x4 tile; skipped when LOW && UPP.
2423 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
2430 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2431 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2436 for( ; k<kpos; k+=SIMDSIZE ) {
2438 xmm1 += a1 * B.load(k,j );
2439 xmm2 += a1 * B.load(k,j+1UL);
2440 xmm3 += a1 * B.load(k,j+2UL);
2441 xmm4 += a1 * B.load(k,j+3UL);
2444 (~C)(i,j ) +=
sum( xmm1 );
2445 (~C)(i,j+1UL) +=
sum( xmm2 );
2446 (~C)(i,j+2UL) +=
sum( xmm3 );
2447 (~C)(i,j+3UL) +=
sum( xmm4 );
2449 for( ; remainder && k<kend; ++k ) {
2450 (~C)(i,j ) += A(i,k) * B(k,j );
2451 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2452 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
2453 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tile.
2457 for( ; (j+2UL) <= jend; j+=2UL )
2464 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2465 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2470 for( ; k<kpos; k+=SIMDSIZE ) {
2472 xmm1 += a1 * B.load(k,j );
2473 xmm2 += a1 * B.load(k,j+1UL);
2476 (~C)(i,j ) +=
sum( xmm1 );
2477 (~C)(i,j+1UL) +=
sum( xmm2 );
2479 for( ; remainder && k<kend; ++k ) {
2480 (~C)(i,j ) += A(i,k) * B(k,j );
2481 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Single element of C (the enclosing condition is not visible); here the k range is
// bounded by K instead of kend.
2491 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
2492 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2497 for( ; k<kpos; k+=SIMDSIZE ) {
2498 xmm1 += A.load(i,k) * B.load(k,j);
2501 (~C)(i,j) +=
sum( xmm1 );
2503 for( ; remainder && k<K; ++k ) {
2504 (~C)(i,j) += A(i,k) * B(k,j);
// Fragment of a second SIMD kernel that adds A * B to C, this time with tiles of four rows
// by two columns first (presumably the vectorized selectSmallAddAssignKernel overload for
// the other storage order of C; the signature is not visible in this view).
// 'remainder', kbegin and kend are defined in lines not shown.
2527 template<
typename MT3
// M x N is the size of the result, K the inner dimension.
2535 const size_t M( A.rows() );
2536 const size_t N( B.columns() );
2537 const size_t K( A.columns() );
// Four rows of C per iteration; this loop runs only for results that are neither LOW nor
// UPP, so it covers the full column range [0, N).
2543 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile of C.
2547 for( ; (j+2UL) <= N; j+=2UL )
// kpos: end of the SIMD loop. With a scalar remainder loop, kend is rounded down to a
// multiple of SIMDSIZE (as checked by the assertion); otherwise kend is used directly.
2556 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2557 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
2559 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2562 for( ; k<kpos; k+=SIMDSIZE ) {
2564 const SIMDType a2( A.load(i+1UL,k) );
2565 const SIMDType a3( A.load(i+2UL,k) );
2566 const SIMDType a4( A.load(i+3UL,k) );
2568 const SIMDType b2( B.load(k,j+1UL) );
// The horizontal sum of each accumulator is added to the existing element of C.
2579 (~C)(i ,j ) +=
sum( xmm1 );
2580 (~C)(i ,j+1UL) +=
sum( xmm2 );
2581 (~C)(i+1UL,j ) +=
sum( xmm3 );
2582 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2583 (~C)(i+2UL,j ) +=
sum( xmm5 );
2584 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
2585 (~C)(i+3UL,j ) +=
sum( xmm7 );
2586 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend) that the SIMD loop did not cover.
2588 for( ; remainder && k<kend; ++k ) {
2589 (~C)(i ,j ) += A(i ,k) * B(k,j );
2590 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2591 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2592 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2593 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2594 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2595 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2596 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: a single column j for all four rows (the enclosing condition is not visible).
2607 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2608 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2613 for( ; k<kpos; k+=SIMDSIZE ) {
2615 xmm1 += A.load(i ,k) * b1;
2616 xmm2 += A.load(i+1UL,k) * b1;
2617 xmm3 += A.load(i+2UL,k) * b1;
2618 xmm4 += A.load(i+3UL,k) * b1;
2621 (~C)(i ,j) +=
sum( xmm1 );
2622 (~C)(i+1UL,j) +=
sum( xmm2 );
2623 (~C)(i+2UL,j) +=
sum( xmm3 );
2624 (~C)(i+3UL,j) +=
sum( xmm4 );
2626 for( ; remainder && k<kend; ++k ) {
2627 (~C)(i ,j) += A(i ,k) * B(k,j);
2628 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2629 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
2630 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
// Two rows of C per iteration (all rows for LOW/UPP results, otherwise the rows left over
// by the four-row loop).
2635 for( ; (i+2UL) <= M; i+=2UL )
// Column range of this row pair: it ends at i+2 for LOW and starts at i for UPP.
2637 const size_t jend( LOW ? i+2UL : N );
2638 size_t j( UPP ? i : 0UL );
// 2x2 tile of C.
2640 for( ; (j+2UL) <= jend; j+=2UL )
2649 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2650 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2655 for( ; k<kpos; k+=SIMDSIZE ) {
2657 const SIMDType a2( A.load(i+1UL,k) );
2659 const SIMDType b2( B.load(k,j+1UL) );
2666 (~C)(i ,j ) +=
sum( xmm1 );
2667 (~C)(i ,j+1UL) +=
sum( xmm2 );
2668 (~C)(i+1UL,j ) +=
sum( xmm3 );
2669 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2671 for( ; remainder && k<kend; ++k ) {
2672 (~C)(i ,j ) += A(i ,k) * B(k,j );
2673 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2674 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2675 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: a single column j for both rows (the enclosing condition is not visible).
2686 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2687 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2692 for( ; k<kpos; k+=SIMDSIZE ) {
2694 xmm1 += A.load(i ,k) * b1;
2695 xmm2 += A.load(i+1UL,k) * b1;
2698 (~C)(i ,j) +=
sum( xmm1 );
2699 (~C)(i+1UL,j) +=
sum( xmm2 );
2701 for( ; remainder && k<kend; ++k ) {
2702 (~C)(i ,j) += A(i ,k) * B(k,j);
2703 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single-row part (its loop header is not visible): the column range ends at i+1 for LOW
// and starts at i for UPP.
2710 const size_t jend( LOW ? i+1UL : N );
2711 size_t j( UPP ? i : 0UL );
// 1x2 tile.
2713 for( ; (j+2UL) <= jend; j+=2UL )
2720 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2721 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2726 for( ; k<kpos; k+=SIMDSIZE ) {
2728 xmm1 += a1 * B.load(k,j );
2729 xmm2 += a1 * B.load(k,j+1UL);
2732 (~C)(i,j ) +=
sum( xmm1 );
2733 (~C)(i,j+1UL) +=
sum( xmm2 );
2735 for( ; remainder && k<kend; ++k ) {
2736 (~C)(i,j ) += A(i,k) * B(k,j );
2737 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Single element of C (the enclosing condition is not visible); here the k range is
// bounded by K instead of kend.
2747 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
2748 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2753 for( ; k<kpos; k+=SIMDSIZE ) {
2754 xmm1 += A.load(i,k) * B.load(k,j);
2757 (~C)(i,j) +=
sum( xmm1 );
2759 for( ; remainder && k<K; ++k ) {
2760 (~C)(i,j) += A(i,k) * B(k,j);
// Overload of the large-matrix add-assign kernel that simply forwards to the default
// add-assign kernel (its enabling condition is not visible in this view).
2782 template<
typename MT3
2786 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2788 selectDefaultAddAssignKernel( C, A, B );
// Second overload of the large-matrix add-assign kernel for C, A and B; neither its
// enabling condition nor its body is visible in this view.
2808 template<
typename MT3
2812 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Overload of the BLAS add-assign kernel selection that falls back to the large-matrix
// add-assign kernel (its enabling condition is not visible in this view).
2838 template<
typename MT3
2842 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2844 selectLargeAddAssignKernel( C, A, B );
// BLAS-based add-assign overload; it is only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled.
2850 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2864 template<
typename MT3
2868 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two branches add a temporary 'tmp' to C instead of calling gemm; the conditions and the
// construction of 'tmp' are not visible in this view.
2875 addAssign( C, tmp );
2880 addAssign( C, tmp );
// gemm is called with the scalars ET(1) and ET(1) - presumably alpha = 1 and beta = 1,
// i.e. A * B is added to C; confirm against the gemm wrapper.
2883 gemm( C, A, B, ET(1), ET(1) );
// Fragment of the subtraction assignment of the multiplication expression 'rhs' to the
// target 'lhs' (signature not visible in this view).
2907 template<
typename MT
// Special case: the target has no rows or no columns, or the inner dimension (columns of
// the left operand) is zero (handling not visible here).
2916 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B to the kernel selection.
2930 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for C -= A * B. The visible part of the condition compares the number
// of elements of C against DMATTDMATMULT_THRESHOLD: one branch calls the small kernel, the
// other the BLAS kernel selection. The rest of the condition and the 'else' are not
// visible in this view.
2946 template<
typename MT3
2949 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2952 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2953 selectSmallSubAssignKernel( C, A, B );
2955 selectBlasSubAssignKernel( C, A, B );
// Scalar (non-SIMD) subtraction assignment kernel that walks the target row
// by row: for every processed element, C(i,j) -= sum_k A(i,k) * B(k,j).
// NOTE(review): function name, signature and several bound computations are
// on lines not included in this excerpt.
2974 template<
typename MT3
// Problem dimensions: C is M x N, the inner dimension is K.
2980 const size_t M( A.rows() );
2981 const size_t N( B.columns() );
2982 const size_t K( A.columns() );
// Row loop over [ibegin,iend); both bounds are defined on lines not shown.
2994 for(
size_t i=ibegin; i<iend; ++i )
// Tail of the 'jbegin' initializer: with UPP set the first column is at
// least i; the max(...,1UL) branch applies under a condition not shown.
3001 ?( UPP ?
max( i, 1UL ) : 1UL )
3002 :( UPP ? i : 0UL ) ) );
// Tail of the 'jend' initializer: with LOW set the column range stops after
// column i; the min(...,N-1UL) branch applies under a condition not shown.
3008 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
3009 :( LOW ? i+1UL : N ) ) );
// For triangular results an inverted column range means nothing to do in
// this row.
3011 if( ( LOW || UPP ) && ( jbegin > jend ) )
continue;
3014 for(
size_t j=jbegin; j<jend; ++j )
// The k range [kbegin,kend) is processed in pairs; 'knum & size_t(-2)'
// rounds the count down to an even number.
3034 const size_t knum( kend - kbegin );
3035 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
3037 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3038 (~C)(i,j) -= A(i,k ) * B(k ,j);
3039 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover element at index kpos for an odd count (the guarding condition is
// on a line not shown).
3042 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Scalar (non-SIMD) subtraction assignment kernel that walks the target
// column by column: for every processed element,
// C(i,j) -= sum_k A(i,k) * B(k,j).
// NOTE(review): function name, signature and several bound computations are
// on lines not included in this excerpt.
3064 template<
typename MT3
// Problem dimensions: C is M x N, the inner dimension is K.
3070 const size_t M( A.rows() );
3071 const size_t N( B.columns() );
3072 const size_t K( A.columns() );
// Column loop over [jbegin,jend); both bounds are defined on lines not shown.
3084 for(
size_t j=jbegin; j<jend; ++j )
// Tail of the 'ibegin' initializer: with LOW set the first row is at least
// j; the max(...,1UL) branch applies under a condition not shown.
3091 ?( LOW ?
max( j, 1UL ) : 1UL )
3092 :( LOW ? j : 0UL ) ) );
// Tail of the 'iend' initializer: with UPP set the row range stops after
// row j; the min(...,M-1UL) branch applies under a condition not shown.
3098 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
3099 :( UPP ? j+1UL : M ) ) );
// For triangular results an inverted row range means nothing to do in this
// column.
3101 if( ( LOW || UPP ) && ( ibegin > iend ) )
continue;
3104 for(
size_t i=ibegin; i<iend; ++i )
// The k range [kbegin,kend) is processed in pairs; 'knum & size_t(-2)'
// rounds the count down to an even number.
3124 const size_t knum( kend - kbegin );
3125 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
3127 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3128 (~C)(i,j) -= A(i,k ) * B(k ,j);
3129 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover element at index kpos for an odd count (the guarding condition is
// on a line not shown).
3132 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Subtraction assignment kernel for a non-diagonal A and a diagonal B (see
// the EnableIf_ condition). Only the diagonal element B(j,j) is read for
// column j, so each target element needs a single product:
// C(i,j) -= A(i,j) * B(j,j).
// NOTE(review): function name and further enabling conditions not visible.
3154 template<
typename MT3
3157 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3160 const size_t M( A.rows() );
3161 const size_t N( B.columns() );
// Row-wise traversal of the target.
3163 for(
size_t i=0UL; i<M; ++i )
// The column range [jbegin,jend) (defined on lines not shown) is processed
// two columns at a time; 'jnum & size_t(-2)' rounds the count down to even.
3173 const size_t jnum( jend - jbegin );
3174 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3176 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3177 (~C)(i,j ) -= A(i,j ) * B(j ,j );
3178 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column jpos for an odd count (guard not shown).
3181 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Blocked subtraction assignment kernel for a non-diagonal A and a diagonal
// B (see the EnableIf_ condition): C(i,j) -= A(i,j) * B(j,j).
// The target is traversed in tiles of at most BLOCK_SIZE x BLOCK_SIZE
// elements, column blocks outermost.
// NOTE(review): function name and further enabling conditions not visible.
3202 template<
typename MT3
3205 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3208 constexpr
size_t block( BLOCK_SIZE );
3210 const size_t M( A.rows() );
3211 const size_t N( B.columns() );
// Outer loops over column blocks [jj,jend) and row blocks [ii,iend); the
// min() calls clip the last block to the matrix size.
3213 for(
size_t jj=0UL; jj<N; jj+=block ) {
3214 const size_t jend(
min( N, jj+block ) );
3215 for(
size_t ii=0UL; ii<M; ii+=block ) {
3216 const size_t iend(
min( M, ii+block ) );
3217 for(
size_t j=jj; j<jend; ++j )
// Rows [ibegin,ipos) of the current column; both bounds are computed on
// lines not shown in this excerpt.
3226 for(
size_t i=ibegin; i<ipos; ++i ) {
3227 (~C)(i,j) -= A(i,j) * B(j,j);
// Blocked subtraction assignment kernel that reads only the diagonal of A:
// C(i,j) -= A(i,i) * B(i,j). The target is traversed in tiles of at most
// BLOCK_SIZE x BLOCK_SIZE elements, row blocks outermost.
// NOTE(review): function name and enabling condition are not visible;
// presumably this is the diagonal-A / general-B overload -- confirm.
3250 template<
typename MT3
3256 constexpr
size_t block( BLOCK_SIZE );
3258 const size_t M( A.rows() );
3259 const size_t N( B.columns() );
// Outer loops over row blocks [ii,iend) and column blocks [jj,jend); the
// min() calls clip the last block to the matrix size.
3261 for(
size_t ii=0UL; ii<M; ii+=block ) {
3262 const size_t iend(
min( M, ii+block ) );
3263 for(
size_t jj=0UL; jj<N; jj+=block ) {
3264 const size_t jend(
min( N, jj+block ) );
3265 for(
size_t i=ii; i<iend; ++i )
// Columns [jbegin,jpos) of the current row; both bounds are computed on
// lines not shown in this excerpt.
3274 for(
size_t j=jbegin; j<jpos; ++j ) {
3275 (~C)(i,j) -= A(i,i) * B(i,j);
// Column-wise subtraction assignment kernel that reads only the diagonal of
// A: C(i,j) -= A(i,i) * B(i,j).
// NOTE(review): function name and enabling condition are not visible;
// presumably this is the diagonal-A / general-B overload -- confirm.
3298 template<
typename MT3
3304 const size_t M( A.rows() );
3305 const size_t N( B.columns() );
3307 for(
size_t j=0UL; j<N; ++j )
// The row range [ibegin,iend) (defined on lines not shown) is processed two
// rows at a time; 'inum & size_t(-2)' rounds the count down to even.
3317 const size_t inum( iend - ibegin );
3318 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3320 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3321 (~C)(i ,j) -= A(i ,i ) * B(i ,j);
3322 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row ipos for an odd count (guard not shown).
3325 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Overload of selectDefaultSubAssignKernel() that touches only the diagonal
// of the target: C(i,i) -= A(i,i) * B(i,i) for every row of A.
// NOTE(review): the enabling condition is not visible; presumably this is
// the overload for two diagonal operands -- confirm.
3346 template<
typename MT3
3350 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3352 for(
size_t i=0UL; i<A.rows(); ++i ) {
3353 C(i,i) -= A(i,i) * B(i,i);
// Overload of selectSmallSubAssignKernel() that performs no work of its own
// and forwards to selectDefaultSubAssignKernel().
// NOTE(review): the condition that selects this overload is not visible.
3373 template<
typename MT3
3377 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// C is passed on through operator~ here.
3379 selectDefaultSubAssignKernel( ~C, A, B );
// SIMD subtraction assignment kernel: C(i,j) -= sum_k A(i,k) * B(k,j).
// Products along k are accumulated in SIMD registers (xmm*), reduced with
// sum() and subtracted from the target element; any k indices beyond the
// last full SIMD step are handled by a scalar loop when 'remainder' is set.
// The target is processed in tiles of 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1
// elements. LOW/UPP restrict the column range per row.
// NOTE(review): function name, signature, the definitions of 'remainder',
// 'kbegin', 'kend', 'i', 'k' and the SIMD accumulation statements are on
// lines that are not part of this excerpt.
3399 template<
typename MT3
3407 const size_t M( A.rows() );
3408 const size_t N( B.columns() );
3409 const size_t K( A.columns() );
// --- Two rows (i, i+1) at a time ---
3415 for( ; (i+2UL) <= M; i+=2UL )
// Column range of this row pair: stops at i+2 for LOW, starts at i for UPP.
3417 const size_t jend( LOW ? i+2UL : N );
3418 size_t j( UPP ? i : 0UL );
// 2x4 tile; skipped entirely when both LOW and UPP are set.
3420 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop follows; the assertion cross-checks the bit-mask computation.
3429 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3430 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
3432 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3435 for( ; k<kpos; k+=SIMDSIZE ) {
3437 const SIMDType a2( A.load(i+1UL,k) );
3439 const SIMDType b2( B.load(k,j+1UL) );
3440 const SIMDType b3( B.load(k,j+2UL) );
3441 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of each accumulator, subtracted from the target.
3452 (~C)(i ,j ) -=
sum( xmm1 );
3453 (~C)(i ,j+1UL) -=
sum( xmm2 );
3454 (~C)(i ,j+2UL) -=
sum( xmm3 );
3455 (~C)(i ,j+3UL) -=
sum( xmm4 );
3456 (~C)(i+1UL,j ) -=
sum( xmm5 );
3457 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
3458 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
3459 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// Scalar tail for the k indices in [kpos,kend).
3461 for( ; remainder && k<kend; ++k ) {
3462 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3463 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3464 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3465 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3466 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3467 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3468 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3469 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile.
3473 for( ; (j+2UL) <= jend; j+=2UL )
3482 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3483 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3488 for( ; k<kpos; k+=SIMDSIZE ) {
3490 const SIMDType a2( A.load(i+1UL,k) );
3492 const SIMDType b2( B.load(k,j+1UL) );
3499 (~C)(i ,j ) -=
sum( xmm1 );
3500 (~C)(i ,j+1UL) -=
sum( xmm2 );
3501 (~C)(i+1UL,j ) -=
sum( xmm3 );
3502 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3504 for( ; remainder && k<kend; ++k ) {
3505 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3506 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3507 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3508 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: single remaining column of the row pair (guard not shown).
3519 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3520 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3525 for( ; k<kpos; k+=SIMDSIZE ) {
3527 xmm1 += A.load(i ,k) * b1;
3528 xmm2 += A.load(i+1UL,k) * b1;
3531 (~C)(i ,j) -=
sum( xmm1 );
3532 (~C)(i+1UL,j) -=
sum( xmm2 );
3534 for( ; remainder && k<kend; ++k ) {
3535 (~C)(i ,j) -= A(i ,k) * B(k,j);
3536 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// --- Single remaining row i (enclosing loop/condition not shown) ---
3543 const size_t jend( LOW ? i+1UL : N );
3544 size_t j( UPP ? i : 0UL );
// 1x4 tile; skipped when both LOW and UPP are set.
3546 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
3553 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3554 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3559 for( ; k<kpos; k+=SIMDSIZE ) {
3561 xmm1 += a1 * B.load(k,j );
3562 xmm2 += a1 * B.load(k,j+1UL);
3563 xmm3 += a1 * B.load(k,j+2UL);
3564 xmm4 += a1 * B.load(k,j+3UL);
3567 (~C)(i,j ) -=
sum( xmm1 );
3568 (~C)(i,j+1UL) -=
sum( xmm2 );
3569 (~C)(i,j+2UL) -=
sum( xmm3 );
3570 (~C)(i,j+3UL) -=
sum( xmm4 );
3572 for( ; remainder && k<kend; ++k ) {
3573 (~C)(i,j ) -= A(i,k) * B(k,j );
3574 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3575 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3576 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1x2 tile.
3580 for( ; (j+2UL) <= jend; j+=2UL )
3587 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3588 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3593 for( ; k<kpos; k+=SIMDSIZE ) {
3595 xmm1 += a1 * B.load(k,j );
3596 xmm2 += a1 * B.load(k,j+1UL);
3599 (~C)(i,j ) -=
sum( xmm1 );
3600 (~C)(i,j+1UL) -=
sum( xmm2 );
3602 for( ; remainder && k<kend; ++k ) {
3603 (~C)(i,j ) -= A(i,k) * B(k,j );
3604 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 element: here the SIMD range is derived from the full inner dimension
// K instead of a per-tile kend.
3614 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
3615 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3620 for( ; k<kpos; k+=SIMDSIZE ) {
3621 xmm1 += A.load(i,k) * B.load(k,j);
3624 (~C)(i,j) -=
sum( xmm1 );
3626 for( ; remainder && k<K; ++k ) {
3627 (~C)(i,j) -= A(i,k) * B(k,j);
// SIMD subtraction assignment kernel: C(i,j) -= sum_k A(i,k) * B(k,j).
// Same accumulation scheme as the other vectorized kernels (SIMD
// accumulators reduced via sum(), scalar tail loop when 'remainder' is set),
// but tiled as 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements.
// NOTE(review): function name, signature, the definitions of 'remainder',
// 'kbegin', 'kend', 'i', 'j', 'k' and the SIMD accumulation statements are
// on lines that are not part of this excerpt.
3650 template<
typename MT3
3658 const size_t M( A.rows() );
3659 const size_t N( B.columns() );
3660 const size_t K( A.columns() );
// --- Four rows at a time; used only when neither LOW nor UPP is set ---
3666 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile over the full column range.
3670 for( ; (j+2UL) <= N; j+=2UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop follows; the assertion cross-checks the bit-mask computation.
3679 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3680 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
3682 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3685 for( ; k<kpos; k+=SIMDSIZE ) {
3687 const SIMDType a2( A.load(i+1UL,k) );
3688 const SIMDType a3( A.load(i+2UL,k) );
3689 const SIMDType a4( A.load(i+3UL,k) );
3691 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of each accumulator, subtracted from the target.
3702 (~C)(i ,j ) -=
sum( xmm1 );
3703 (~C)(i ,j+1UL) -=
sum( xmm2 );
3704 (~C)(i+1UL,j ) -=
sum( xmm3 );
3705 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3706 (~C)(i+2UL,j ) -=
sum( xmm5 );
3707 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
3708 (~C)(i+3UL,j ) -=
sum( xmm7 );
3709 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// Scalar tail for the k indices in [kpos,kend).
3711 for( ; remainder && k<kend; ++k ) {
3712 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3713 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3714 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3715 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3716 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3717 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3718 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3719 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: single remaining column of the four-row group (guard not shown).
3730 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3731 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3736 for( ; k<kpos; k+=SIMDSIZE ) {
3738 xmm1 += A.load(i ,k) * b1;
3739 xmm2 += A.load(i+1UL,k) * b1;
3740 xmm3 += A.load(i+2UL,k) * b1;
3741 xmm4 += A.load(i+3UL,k) * b1;
3744 (~C)(i ,j) -=
sum( xmm1 );
3745 (~C)(i+1UL,j) -=
sum( xmm2 );
3746 (~C)(i+2UL,j) -=
sum( xmm3 );
3747 (~C)(i+3UL,j) -=
sum( xmm4 );
3749 for( ; remainder && k<kend; ++k ) {
3750 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3751 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3752 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3753 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
// --- Two rows (i, i+1) at a time ---
3758 for( ; (i+2UL) <= M; i+=2UL )
// Column range of this row pair: stops at i+2 for LOW, starts at i for UPP.
3760 const size_t jend( LOW ? i+2UL : N );
3761 size_t j( UPP ? i : 0UL );
// 2x2 tile.
3763 for( ; (j+2UL) <= jend; j+=2UL )
3772 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3773 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3778 for( ; k<kpos; k+=SIMDSIZE ) {
3780 const SIMDType a2( A.load(i+1UL,k) );
3782 const SIMDType b2( B.load(k,j+1UL) );
3789 (~C)(i ,j ) -=
sum( xmm1 );
3790 (~C)(i ,j+1UL) -=
sum( xmm2 );
3791 (~C)(i+1UL,j ) -=
sum( xmm3 );
3792 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3794 for( ; remainder && k<kend; ++k ) {
3795 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3796 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3797 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3798 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: single remaining column of the row pair (guard not shown).
3809 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3810 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3815 for( ; k<kpos; k+=SIMDSIZE ) {
3817 xmm1 += A.load(i ,k) * b1;
3818 xmm2 += A.load(i+1UL,k) * b1;
3821 (~C)(i ,j) -=
sum( xmm1 );
3822 (~C)(i+1UL,j) -=
sum( xmm2 );
3824 for( ; remainder && k<kend; ++k ) {
3825 (~C)(i ,j) -= A(i ,k) * B(k,j);
3826 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// --- Single remaining row i (enclosing loop/condition not shown) ---
3833 const size_t jend( LOW ? i+1UL : N );
3834 size_t j( UPP ? i : 0UL );
// 1x2 tile.
3836 for( ; (j+2UL) <= jend; j+=2UL )
3843 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3844 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3849 for( ; k<kpos; k+=SIMDSIZE ) {
3851 xmm1 += a1 * B.load(k,j );
3852 xmm2 += a1 * B.load(k,j+1UL);
3855 (~C)(i,j ) -=
sum( xmm1 );
3856 (~C)(i,j+1UL) -=
sum( xmm2 );
3858 for( ; remainder && k<kend; ++k ) {
3859 (~C)(i,j ) -= A(i,k) * B(k,j );
3860 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 element: here the SIMD range is derived from the full inner dimension
// K instead of a per-tile kend.
3870 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
3871 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3876 for( ; k<kpos; k+=SIMDSIZE ) {
3877 xmm1 += A.load(i,k) * B.load(k,j);
3880 (~C)(i,j) -=
sum( xmm1 );
3882 for( ; remainder && k<K; ++k ) {
3883 (~C)(i,j) -= A(i,k) * B(k,j);
// Overload of selectLargeSubAssignKernel() that performs no work of its own
// and forwards to selectDefaultSubAssignKernel().
// NOTE(review): the condition that selects this overload is not visible.
3905 template<
typename MT3
3909 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// C is passed on through operator~ here.
3911 selectDefaultSubAssignKernel( ~C, A, B );
// Second overload of selectLargeSubAssignKernel() taking the target C and
// the two operands A and B.
// NOTE(review): only the signature is visible here; the body and the
// enabling condition lie on lines that are not part of this excerpt.
3931 template<
typename MT3
3935 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Overload of selectBlasSubAssignKernel() that does not call BLAS itself but
// forwards C, A and B to selectLargeSubAssignKernel().
// NOTE(review): the condition that selects this overload is not visible.
3961 template<
typename MT3
3965 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Delegate to the large-matrix kernel.
3967 selectLargeSubAssignKernel( C, A, B );
// BLAS-based overload of selectBlasSubAssignKernel(). It is compiled only
// when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// are enabled (see the #if below).
3973 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3987 template<
typename MT3
3991 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two code paths subtract a temporary 'tmp' from C; how 'tmp' is built and
// which conditions select these paths is not visible in this excerpt.
3998 subAssign( C, tmp );
4003 subAssign( C, tmp );
// Remaining path: a single gemm() call with the scalar factors ET(-1) and
// ET(1) (presumably alpha and beta, i.e. C = -1*A*B + 1*C -- confirm against
// the gemm() wrapper).
4006 gemm( C, A, B, ET(-1), ET(1) );
// Fragment of an assignment function that applies schurAssign() to the
// target with a temporary 'tmp'.
// NOTE(review): the function signature and the construction of 'tmp' are on
// lines not shown; presumably 'tmp' holds the evaluated product -- confirm.
4030 template<
typename MT
4044 schurAssign( ~lhs, tmp );
// Fragment of an assignment entry point (name and signature not visible in
// this excerpt). Only its two degenerate-size checks can be seen.
4076 template<
typename MT
// Case 1: the target has no rows or no columns (branch body not shown).
4086 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero (branch body not shown).
4089 else if( rhs.
lhs_.columns() == 0UL ) {
// Fragment of an assignment function (name and signature not visible) that
// works through a temporary instead of a multiplication kernel.
4124 template<
typename MT
// Functor object; its use is on lines not shown here.
4143 const ForwardFunctor fwd;
// The complete right-hand side expression is evaluated into a temporary of
// type TmpType (type definition not visible).
4145 const TmpType tmp( rhs );
// Fragment of an assignment entry point (name and signature not visible in
// this excerpt). Only its degenerate-size check can be seen.
4167 template<
typename MT
// Nothing to compute if the target has no rows or columns or if the inner
// dimension of the product is zero (branch body not shown; presumably an
// early return).
4177 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
// Fragment of an assignment entry point (name and signature not visible in
// this excerpt). Only its degenerate-size check can be seen.
4216 template<
typename MT
// Nothing to compute if the target has no rows or columns or if the inner
// dimension of the product is zero (branch body not shown; presumably an
// early return).
4226 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
4262 template<
typename MT
// Class header of the DMatScalarMultExpr specialization whose matrix operand
// is a DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> and whose scalar type is ST,
// i.e. a scaled dense matrix product. The class derives from
// MatScalarMultExpr< DenseMatrix< ..., false > >.
// NOTE(review): the full template parameter list is not visible here.
4322 template<
typename MT1
4330 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
// Compile-time structure flags derived from the template flags SF, HF, LF
// and UF (enclosing enum declaration not visible in this excerpt):
//  - SYM:  SF is set and none of HF, LF, UF is set.
//  - HERM: HF is set and neither LF nor UF is set.
//  - LOW:  LF is set, or SF/HF is combined with UF.
//  - UPP:  UF is set, or SF/HF is combined with LF.
4360 SYM = ( SF && !( HF || LF || UF ) ),
4361 HERM = ( HF && !( LF || UF ) ),
4362 LOW = ( LF || ( ( SF || HF ) && UF ) ),
4363 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Helper trait: 'value' is true if either operand requires an intermediate
// evaluation (evaluateLeft or evaluateRight, both defined outside this
// excerpt). The template parameters T1, T2, T3 do not appear in the visible
// expression.
4372 template<
typename T1,
typename T2,
typename T3 >
4373 struct IsEvaluationRequired {
4374 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Helper trait deciding whether a BLAS kernel can be used for the given
// types. The visible conditions require BLAS mode and BLAS matrix/matrix
// multiplication to be enabled and T1, T2 and T3 to be SIMD-enabled.
// NOTE(review): further conditions (and any use of T4) are on lines that are
// not part of this excerpt.
4382 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4383 struct UseBlasKernel {
4384 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4390 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait deciding whether the vectorized default kernels can be used
// for the given types. The visible conditions require useOptimizedKernels
// and T1, T2 and T3 to be SIMD-enabled.
// NOTE(review): further conditions (and any use of T4) are on lines that are
// not part of this excerpt.
4404 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4405 struct UseVectorizedDefaultKernel {
4406 enum :
bool { value = useOptimizedKernels &&
4408 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4464 MT1::simdEnabled && MT2::simdEnabled &&
// Compilation switch for SMP assignments: the expression is SMP-assignable
// only if neither operand requires an intermediate evaluation and both
// operand types are themselves SMP-assignable.
4470 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4471 !evaluateRight && MT2::smpAssignable };
// Element access (signature not visible in this excerpt): the element (i,j)
// of the wrapped matrix expression multiplied by the stored scalar.
4501 return matrix_(i,j) * scalar_;
// Checked element access (signature not visible in this excerpt).
// Row index out of range; the branch body is not shown (presumably an
// exception is thrown -- confirm).
4514 if( i >= matrix_.rows() ) {
// Column index out of range; branch body not shown either.
4517 if( j >= matrix_.columns() ) {
// Both indices are valid: delegate to the unchecked access operator.
4520 return (*
this)(i,j);
// Returns the number of rows of the expression, which is the number of rows
// of the wrapped matrix operand.
4529 inline size_t rows()
const {
4530 return matrix_.rows();
// Returns the number of columns of the expression, which is the number of
// columns of the wrapped matrix operand.
4539 inline size_t columns()
const {
4540 return matrix_.columns();
// Returns whether the expression can alias with the given address.
// The query is forwarded unchanged to the wrapped matrix operand.
//
// \param alias The address to be checked.
4570 template<
typename T >
4571 inline bool canAlias(
const T* alias )
const {
4572 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address.
// The query is forwarded unchanged to the wrapped matrix operand.
//
// \param alias The address to be checked.
4582 template<
typename T >
4583 inline bool isAliased(
const T* alias )
const {
4584 return matrix_.isAliased( alias );
// Alignment query (signature not visible in this excerpt): forwarded to the
// wrapped matrix operand.
4594 return matrix_.isAligned();
// Tail of the SMP-assignment predicate (signature and the first part of the
// condition are not visible in this excerpt). Visible parts: either BLAS
// matrix/matrix multiplication is disabled or the result has fewer than
// DMATTDMATMULT_THRESHOLD elements (possibly among further alternatives on
// lines not shown) ...
4605 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4607 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
// ... and the result has at least SMP_DMATTDMATMULT_THRESHOLD elements.
4608 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
// Entry point of the assignment of the scaled matrix product to a dense
// matrix (function name/signature not visible here; inferred from the
// selectAssignKernel() dispatch below -- confirm).
4630 template<
typename MT
// Case 1: the target has no rows or no columns (branch body not shown).
4642 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero (branch body not shown;
// presumably the target is reset -- confirm).
4645 else if( left.columns() == 0UL ) {
// Regular case: dispatch to the kernel selection with the operands A and B
// (defined on lines not shown) and the scalar factor of the expression.
4660 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses the kernel for the scaled assignment C = scalar * A * B.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// \param scalar The scaling factor.
4675 template<
typename MT3
4679 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Size criterion: targets with fewer than DMATTDMATMULT_THRESHOLD elements
// use the small kernel. The first part of this condition is on a line that
// is not visible in this excerpt.
4682 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4683 selectSmallAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection is used.
4685 selectBlasAssignKernel( C, A, B, scalar );
// Scalar (non-SIMD) kernel for the scaled assignment, walking the target row
// by row: each computed element is the dot product of row i of A and column
// j of B over [kbegin,kend), multiplied by 'scalar'.
// NOTE(review): function name, signature, the bodies of several loops and
// the definitions of ibegin/iend/kbegin/kend are on lines not shown.
// Presumably the loops with hidden bodies reset the elements outside the
// computed range -- confirm.
4703 template<
typename MT3
// Problem dimensions: C is M x N, the inner dimension is K.
4710 const size_t M( A.rows() );
4711 const size_t N( B.columns() );
4712 const size_t K( A.columns() );
// Rows before ibegin (loop body not shown).
4724 for(
size_t i=0UL; i<ibegin; ++i ) {
4725 for(
size_t j=0UL; j<N; ++j ) {
// Rows [ibegin,iend).
4729 for(
size_t i=ibegin; i<iend; ++i )
// Tail of the 'jbegin' initializer: for symmetric, Hermitian or upper
// results the first computed column is at least i.
4736 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
4737 :( SYM || HERM || UPP ? i : 0UL ) ) );
// Tail of the 'jend' initializer: for lower results the computed range stops
// after column i.
4743 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
4744 :( LOW ? i+1UL : N ) ) );
// Inverted column range for a structured result: the whole row is handled by
// the loop below (body not shown).
4746 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
4747 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of the computed range; for SYM/HERM this starts at i, so
// the part left of the diagonal is skipped here (body not shown).
4755 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
// Computed range: initialize with the first product, accumulate the rest,
// then apply the scalar factor once.
4758 for(
size_t j=jbegin; j<jend; ++j )
4778 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4779 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4780 (~C)(i,j) += A(i,k) * B(k,j);
4782 (~C)(i,j) *= scalar;
// Columns behind the computed range (body not shown).
4784 for(
size_t j=jend; j<N; ++j ) {
// Rows from iend on (body not shown).
4788 for(
size_t i=iend; i<M; ++i ) {
4789 for(
size_t j=0UL; j<N; ++j ) {
// Fill the part below the diagonal from the already computed part above it:
// conjugated for HERM, plain copy otherwise (the enclosing condition is not
// shown; presumably SYM || HERM).
4795 for(
size_t i=1UL; i<M; ++i ) {
4796 for(
size_t j=0UL; j<i; ++j ) {
4797 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Scalar (non-SIMD) kernel for the scaled assignment, walking the target
// column by column: each computed element is the dot product of row i of A
// and column j of B over [kbegin,kend), multiplied by 'scalar'.
// NOTE(review): function name, signature, the bodies of several loops and
// the definitions of jbegin/jend/kbegin/kend are on lines not shown.
// Presumably the loops with hidden bodies reset the elements outside the
// computed range -- confirm.
4818 template<
typename MT3
// Problem dimensions: C is M x N, the inner dimension is K.
4825 const size_t M( A.rows() );
4826 const size_t N( B.columns() );
4827 const size_t K( A.columns() );
// Columns before jbegin (loop body not shown).
4839 for(
size_t j=0UL; j<jbegin; ++j ) {
4840 for(
size_t i=0UL; i<M; ++i ) {
// Columns [jbegin,jend).
4844 for(
size_t j=jbegin; j<jend; ++j )
// Tail of the 'ibegin' initializer: for symmetric, Hermitian or lower
// results the first computed row is at least j.
4851 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
4852 :( SYM || HERM || LOW ? j : 0UL ) ) );
// Tail of the 'iend' initializer: for upper results the computed range stops
// after row j.
4858 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
4859 :( UPP ? j+1UL : M ) ) );
// Inverted row range for a structured result: the whole column is handled by
// the loop below (body not shown).
4861 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
4862 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of the computed range; for SYM/HERM this starts at j, so the
// part above the diagonal is skipped here (body not shown).
4870 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
// Computed range: initialize with the first product, accumulate the rest,
// then apply the scalar factor once.
4873 for(
size_t i=ibegin; i<iend; ++i )
4893 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4894 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4895 (~C)(i,j) += A(i,k) * B(k,j);
4897 (~C)(i,j) *= scalar;
// Rows behind the computed range (body not shown).
4899 for(
size_t i=iend; i<M; ++i ) {
// Columns from jend on (body not shown).
4903 for(
size_t j=jend; j<N; ++j ) {
4904 for(
size_t i=0UL; i<M; ++i ) {
// Fill the part above the diagonal from the already computed part below it:
// conjugated for HERM, plain copy otherwise (the enclosing condition is not
// shown; presumably SYM || HERM).
4910 for(
size_t j=1UL; j<N; ++j ) {
4911 for(
size_t i=0UL; i<j; ++i ) {
4912 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Scaled assignment kernel for a non-diagonal A and a diagonal B (see the
// EnableIf_ condition). Only B(j,j) is read for column j:
// C(i,j) = A(i,j) * B(j,j) * scalar within [jbegin,jend).
// NOTE(review): function name, the definitions of jbegin/jend and the bodies
// of the leading/trailing loops are not visible (presumably they reset the
// elements outside the computed range -- confirm).
4933 template<
typename MT3
4937 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4940 const size_t M( A.rows() );
4941 const size_t N( B.columns() );
4943 for(
size_t i=0UL; i<M; ++i )
// Columns in front of the computed range (body not shown).
4954 for(
size_t j=0UL; j<jbegin; ++j ) {
// Computed range.
4958 for(
size_t j=jbegin; j<jend; ++j ) {
4959 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Columns behind the computed range (body not shown).
4962 for(
size_t j=jend; j<N; ++j ) {
// Blocked scaled assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition): C(i,j) = A(i,j) * B(j,j) * scalar.
// The target is traversed in tiles of at most BLOCK_SIZE x BLOCK_SIZE
// elements, column blocks outermost.
// NOTE(review): function name, the definitions of ibegin/ipos and the bodies
// of the leading/trailing loops are not visible (presumably they reset the
// elements outside the computed range -- confirm).
4984 template<
typename MT3
4988 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4991 constexpr
size_t block( BLOCK_SIZE );
4993 const size_t M( A.rows() );
4994 const size_t N( B.columns() );
// Outer loops over column blocks [jj,jend) and row blocks [ii,iend); the
// min() calls clip the last block to the matrix size.
4996 for(
size_t jj=0UL; jj<N; jj+=block ) {
4997 const size_t jend(
min( N, jj+block ) );
4998 for(
size_t ii=0UL; ii<M; ii+=block ) {
4999 const size_t iend(
min( M, ii+block ) );
5000 for(
size_t j=jj; j<jend; ++j )
// Rows of the tile in front of the computed range (body not shown).
5010 for(
size_t i=ii; i<ibegin; ++i ) {
// Computed range.
5014 for(
size_t i=ibegin; i<ipos; ++i ) {
5015 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Rows of the tile behind the computed range (body not shown).
5018 for(
size_t i=ipos; i<iend; ++i ) {
// Blocked scaled assignment kernel that reads only the diagonal of A:
// C(i,j) = A(i,i) * B(i,j) * scalar. The target is traversed in tiles of at
// most BLOCK_SIZE x BLOCK_SIZE elements, row blocks outermost.
// NOTE(review): function name, enabling condition, the definitions of
// jbegin/jpos and the bodies of the leading/trailing loops are not visible
// (presumably diagonal-A overload; hidden bodies presumably reset -- confirm).
5042 template<
typename MT3
5049 constexpr
size_t block( BLOCK_SIZE );
5051 const size_t M( A.rows() );
5052 const size_t N( B.columns() );
// Outer loops over row blocks [ii,iend) and column blocks [jj,jend); the
// min() calls clip the last block to the matrix size.
5054 for(
size_t ii=0UL; ii<M; ii+=block ) {
5055 const size_t iend(
min( M, ii+block ) );
5056 for(
size_t jj=0UL; jj<N; jj+=block ) {
5057 const size_t jend(
min( N, jj+block ) );
5058 for(
size_t i=ii; i<iend; ++i )
// Columns of the tile in front of the computed range (body not shown).
5068 for(
size_t j=jj; j<jbegin; ++j ) {
// Computed range.
5072 for(
size_t j=jbegin; j<jpos; ++j ) {
5073 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Columns of the tile behind the computed range (body not shown).
5076 for(
size_t j=jpos; j<jend; ++j ) {
// Column-wise scaled assignment kernel that reads only the diagonal of A:
// C(i,j) = A(i,i) * B(i,j) * scalar within [ibegin,iend).
// NOTE(review): function name, enabling condition, the definitions of
// ibegin/iend and the bodies of the leading/trailing loops are not visible
// (presumably diagonal-A overload; hidden bodies presumably reset -- confirm).
5100 template<
typename MT3
5107 const size_t M( A.rows() );
5108 const size_t N( B.columns() );
5110 for(
size_t j=0UL; j<N; ++j )
// Rows in front of the computed range (body not shown).
5121 for(
size_t i=0UL; i<ibegin; ++i ) {
// Computed range.
5125 for(
size_t i=ibegin; i<iend; ++i ) {
5126 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Rows behind the computed range (body not shown).
5129 for(
size_t i=iend; i<M; ++i ) {
// Overload of selectDefaultAssignKernel() that writes only the diagonal of
// the target: C(i,i) = A(i,i) * B(i,i) * scalar for every row of A.
// NOTE(review): the enabling condition and any statements between the
// signature and the loop are not visible; presumably this is the overload
// for two diagonal operands -- confirm.
5151 template<
typename MT3
5156 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5160 for(
size_t i=0UL; i<A.rows(); ++i ) {
5161 C(i,i) = A(i,i) * B(i,i) * scalar;
// Overload of selectSmallAssignKernel() that performs no work of its own and
// forwards all arguments to selectDefaultAssignKernel().
// NOTE(review): the condition that selects this overload is not visible.
5180 template<
typename MT3
5185 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5187 selectDefaultAssignKernel( C, A, B, scalar );
// SIMD kernel for the scaled assignment C(i,j) = scalar * sum_k A(i,k)*B(k,j).
// Products along k are accumulated in SIMD registers (xmm*); the reduced sum
// times 'scalar' is stored into the target, and any k indices beyond the
// last full SIMD step are added by a scalar loop (each term scaled) when
// 'remainder' is set. Tiles of 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 elements are
// used. For SYM/HERM/UPP only columns from i on are computed per row; the
// rest is completed after the main loops.
// NOTE(review): function name, signature, the definitions of 'remainder',
// 'kbegin', 'kend', 'i', 'k' and the SIMD accumulation statements are on
// lines that are not part of this excerpt.
5206 template<
typename MT3
5215 const size_t M( A.rows() );
5216 const size_t N( B.columns() );
5217 const size_t K( A.columns() );
// --- Two rows (i, i+1) at a time; skipped when both LOW and UPP are set ---
5228 for( ; !( LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// Column range of this row pair: stops at i+2 for LOW, starts at i for
// SYM/HERM/UPP.
5230 const size_t jend( LOW ? i+2UL : N );
5231 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x4 tile.
5233 for( ; (j+4UL) <= jend; j+=4UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop follows; the assertion cross-checks the bit-mask computation.
5242 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5243 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
5245 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5248 for( ; k<kpos; k+=SIMDSIZE ) {
5250 const SIMDType a2( A.load(i+1UL,k) );
5252 const SIMDType b2( B.load(k,j+1UL) );
5253 const SIMDType b3( B.load(k,j+2UL) );
5254 const SIMDType b4( B.load(k,j+3UL) );
// Store the reduced, scaled sums (plain assignment, not accumulation).
5265 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5266 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5267 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
5268 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
5269 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
5270 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
5271 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
5272 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar tail for the k indices in [kpos,kend); added on top of the stored
// values.
5274 for( ; remainder && k<kend; ++k ) {
5275 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5276 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5277 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5278 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5279 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5280 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5281 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
5282 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile.
5286 for( ; (j+2UL) <= jend; j+=2UL )
5295 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5296 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5301 for( ; k<kpos; k+=SIMDSIZE ) {
5303 const SIMDType a2( A.load(i+1UL,k) );
5305 const SIMDType b2( B.load(k,j+1UL) );
5312 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5313 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5314 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5315 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5317 for( ; remainder && k<kend; ++k ) {
5318 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5319 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5320 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5321 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: single remaining column of the row pair (guard not shown).
5332 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5333 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5338 for( ; k<kpos; k+=SIMDSIZE ) {
5340 xmm1 += A.load(i ,k) * b1;
5341 xmm2 += A.load(i+1UL,k) * b1;
5344 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5345 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5347 for( ; remainder && k<kend; ++k ) {
5348 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5349 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// --- Single row i (enclosing loop/condition not shown) ---
5356 const size_t jend( LOW ? i+1UL : N );
5357 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x4 tile; skipped when both LOW and UPP are set.
5359 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
5366 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5367 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5372 for( ; k<kpos; k+=SIMDSIZE ) {
5374 xmm1 += a1 * B.load(k,j );
5375 xmm2 += a1 * B.load(k,j+1UL);
5376 xmm3 += a1 * B.load(k,j+2UL);
5377 xmm4 += a1 * B.load(k,j+3UL);
5380 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5381 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5382 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
5383 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
5385 for( ; remainder && k<kend; ++k ) {
5386 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5387 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5388 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5389 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile; skipped when both LOW and UPP are set.
5393 for( ; !( LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
5400 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5401 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5406 for( ; k<kpos; k+=SIMDSIZE ) {
5408 xmm1 += a1 * B.load(k,j );
5409 xmm2 += a1 * B.load(k,j+1UL);
5412 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5413 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5415 for( ; remainder && k<kend; ++k ) {
5416 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5417 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 elements for all remaining columns; here the SIMD range is derived
// from the full inner dimension K instead of a per-tile kend.
5421 for( ; j<jend; ++j )
5427 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
5428 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5433 for( ; k<kpos; k+=SIMDSIZE ) {
5434 xmm1 += A.load(i,k) * B.load(k,j);
5437 (~C)(i,j) =
sum( xmm1 ) * scalar;
5439 for( ; remainder && k<K; ++k ) {
5440 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// --- Completion of the elements not computed above ---
// First visible case (condition not shown; presumably SYM || HERM): for each
// row i >= 2 the columns [0, 2*(i/2)) are filled from the transposed
// position, conjugated for HERM.
5447 for(
size_t i=2UL; i<M; ++i ) {
5448 const size_t jend( 2UL * ( i/2UL ) );
5449 for(
size_t j=0UL; j<jend; ++j ) {
5450 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Strictly lower result: loop over rows [0, 2*(j/2)) of each column j >= 2
// (body not shown; presumably a reset -- confirm).
5454 else if( LOW && !UPP ) {
5455 for(
size_t j=2UL; j<N; ++j ) {
5456 const size_t iend( 2UL * ( j/2UL ) );
5457 for(
size_t i=0UL; i<iend; ++i ) {
// Strictly upper result: loop over columns [0, 2*(i/2)) of each row i >= 2
// (body not shown; presumably a reset -- confirm).
5462 else if( !LOW && UPP ) {
5463 for(
size_t i=2UL; i<M; ++i ) {
5464 const size_t jend( 2UL * ( i/2UL ) );
5465 for(
size_t j=0UL; j<jend; ++j ) {
// SIMD kernel for the scaled assignment C(i,j) = scalar * sum_k A(i,k)*B(k,j).
// Same scheme as the other vectorized scaled kernel (SIMD accumulators,
// reduced sum times 'scalar' stored, scalar tail loop when 'remainder' is
// set), but tiled as 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements.
// NOTE(review): function name, signature, the definitions of 'remainder',
// 'kbegin', 'kend', 'i', 'j', 'k' and the SIMD accumulation statements are
// on lines that are not part of this excerpt.
5488 template<
typename MT3
5497 const size_t M( A.rows() );
5498 const size_t N( B.columns() );
5499 const size_t K( A.columns() );
// --- Four rows at a time; only for results without any structure flag ---
5510 for( ; !SYM && !HERM && !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile over the full column range.
5514 for( ; (j+2UL) <= N; j+=2UL )
// kpos: kend rounded down to a multiple of SIMDSIZE when a scalar remainder
// loop follows; the assertion cross-checks the bit-mask computation.
5523 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5524 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
5526 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5529 for( ; k<kpos; k+=SIMDSIZE ) {
5531 const SIMDType a2( A.load(i+1UL,k) );
5532 const SIMDType a3( A.load(i+2UL,k) );
5533 const SIMDType a4( A.load(i+3UL,k) );
5535 const SIMDType b2( B.load(k,j+1UL) );
// Store the reduced, scaled sums (plain assignment, not accumulation).
5546 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5547 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5548 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5549 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5550 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
5551 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5552 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
5553 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar tail for the k indices in [kpos,kend); added on top of the stored
// values.
5555 for( ; remainder && k<kend; ++k ) {
5556 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5557 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5558 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5559 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5560 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5561 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5562 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5563 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile: single remaining column of the four-row group (guard not shown).
5574 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5575 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5580 for( ; k<kpos; k+=SIMDSIZE ) {
5582 xmm1 += A.load(i ,k) * b1;
5583 xmm2 += A.load(i+1UL,k) * b1;
5584 xmm3 += A.load(i+2UL,k) * b1;
5585 xmm4 += A.load(i+3UL,k) * b1;
5588 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5589 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5590 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
5591 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
5593 for( ; remainder && k<kend; ++k ) {
5594 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5595 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5596 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5597 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// --- Two rows (i, i+1) at a time ---
5602 for( ; (i+2UL) <= M; i+=2UL )
// Column range of this row pair: stops at i+2 for LOW, starts at i for
// SYM/HERM/UPP.
5604 const size_t jend( LOW ? i+2UL : N );
5605 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x2 tile.
5607 for( ; (j+2UL) <= jend; j+=2UL )
5616 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5617 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5622 for( ; k<kpos; k+=SIMDSIZE ) {
5624 const SIMDType a2( A.load(i+1UL,k) );
5626 const SIMDType b2( B.load(k,j+1UL) );
5633 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5634 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5635 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5636 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5638 for( ; remainder && k<kend; ++k ) {
5639 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5640 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5641 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5642 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: single remaining column of the row pair (guard not shown).
5653 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5654 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5659 for( ; k<kpos; k+=SIMDSIZE ) {
5661 xmm1 += A.load(i ,k) * b1;
5662 xmm2 += A.load(i+1UL,k) * b1;
5665 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5666 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5668 for( ; remainder && k<kend; ++k ) {
5669 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5670 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// --- Single remaining row i (enclosing loop/condition not shown) ---
5677 const size_t jend( LOW ? i+1UL : N );
5678 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x2 tile.
5680 for( ; (j+2UL) <= jend; j+=2UL )
5687 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5688 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5693 for( ; k<kpos; k+=SIMDSIZE ) {
5695 xmm1 += a1 * B.load(k,j );
5696 xmm2 += a1 * B.load(k,j+1UL);
5699 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5700 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5702 for( ; remainder && k<kend; ++k ) {
5703 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5704 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 element: here the SIMD range is derived from the full inner dimension
// K instead of a per-tile kend.
5714 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
5715 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5720 for( ; k<kpos; k+=SIMDSIZE ) {
5721 xmm1 += A.load(i,k) * B.load(k,j);
5724 (~C)(i,j) =
sum( xmm1 ) * scalar;
5726 for( ; remainder && k<K; ++k ) {
5727 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Completion step (condition not shown; presumably SYM || HERM): every
// element below the diagonal is filled from the transposed position,
// conjugated for HERM.
5734 for(
size_t j=0UL; j<N; ++j ) {
5735 for(
size_t i=j+1UL; i<M; ++i ) {
5736 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Overload of selectLargeAssignKernel() that performs no work of its own and
// forwards all arguments to selectDefaultAssignKernel().
// NOTE(review): the condition that selects this overload is not visible.
5757 template<
typename MT3
5762 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5764 selectDefaultAssignKernel( C, A, B, scalar );
// Overload of selectLargeAssignKernel() that dispatches to one of five
// multiplication routines.
// NOTE(review): the conditions selecting each call are on lines not shown;
// judging by the routine names they presumably correspond to the SYM, HERM,
// LOW, UPP and general cases, in that order -- confirm.
5783 template<
typename MT3
5788 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// smmm() and hmmm() take only the scaling factor.
5791 smmm( C, A, B, scalar );
5793 hmmm( C, A, B, scalar );
// lmmm(), ummm() and mmm() take an additional ST2(0) argument (presumably
// the factor applied to the previous content of C -- confirm).
5795 lmmm( C, A, B, scalar, ST2(0) );
5797 ummm( C, A, B, scalar, ST2(0) );
5799 mmm( C, A, B, scalar, ST2(0) );
// Overload of selectBlasAssignKernel() that does not call BLAS itself but
// forwards all arguments to selectLargeAssignKernel().
// NOTE(review): the condition that selects this overload is not visible.
5817 template<
typename MT3
5822 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5824 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based overload of selectBlasAssignKernel(). It is compiled only when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// enabled (see the #if below).
// NOTE(review): the conditions selecting each of the three calls and any
// preparation of C before the trmm() calls are on lines not shown.
5829 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5843 template<
typename MT3
5848 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular multiplication with A applied from the left; the triangle is
// chosen from IsLower<MT4>.
5854 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Triangular multiplication with B applied from the right; the triangle is
// chosen from IsLower<MT5>.
5858 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case: gemm() with the factors ET(scalar) and ET(0) (presumably
// alpha and beta, i.e. C = scalar*A*B + 0*C -- confirm).
5861 gemm( C, A, B, ET(scalar), ET(0) );
// Fragment of an assignment function (name and signature not visible) that
// works through a temporary instead of a multiplication kernel.
5879 template<
typename MT
5897 const ForwardFunctor fwd;
// The right-hand side expression is evaluated serially into a temporary of
// type TmpType (type definition not visible).
5899 const TmpType tmp(
serial( rhs ) );
// The temporary is passed through the functor and assigned to the target.
5900 assign( ~lhs, fwd( tmp ) );
// Entry point of the addition assignment of the scaled matrix product to a
// dense matrix (function name/signature not visible here; inferred from the
// selectAddAssignKernel() dispatch below -- confirm).
5916 template<
typename MT
// Degenerate sizes: the target has no rows or no columns, or the inner
// dimension is zero. The branch body is not shown; presumably it returns
// without doing any work.
5928 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the operands A and B (defined on
// lines not shown) and the scalar factor of the expression.
5942 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses the kernel for the scaled addition assignment C += scalar * A * B.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// \param scalar The scaling factor.
5957 template<
typename MT3
5961 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Size criterion: targets with fewer than DMATTDMATMULT_THRESHOLD elements
// use the small kernel. The first part of this condition is on a line that
// is not visible in this excerpt.
5964 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5965 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection is used.
5967 selectBlasAddAssignKernel( C, A, B, scalar );
// Overload of selectDefaultAddAssignKernel() for the scaled product that
// adds a temporary 'tmp' to the target.
// NOTE(review): the construction of 'tmp' and the enabling condition are on
// lines not shown; presumably 'tmp' holds the evaluated scaled product --
// confirm.
5985 template<
typename MT3
5990 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5993 addAssign( C, tmp );
6011 template<
typename MT3
6015 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6018 const size_t M( A.rows() );
6019 const size_t N( B.columns() );
6021 for(
size_t i=0UL; i<M; ++i )
6031 const size_t jnum( jend - jbegin );
6032 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6034 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6035 (~C)(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6036 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6039 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6059 template<
typename MT3
6063 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6066 constexpr
size_t block( BLOCK_SIZE );
6068 const size_t M( A.rows() );
6069 const size_t N( B.columns() );
6071 for(
size_t jj=0UL; jj<N; jj+=block ) {
6072 const size_t jend(
min( N, jj+block ) );
6073 for(
size_t ii=0UL; ii<M; ii+=block ) {
6074 const size_t iend(
min( M, ii+block ) );
6075 for(
size_t j=jj; j<jend; ++j )
6084 for(
size_t i=ibegin; i<ipos; ++i ) {
6085 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
6107 template<
typename MT3
6114 constexpr
size_t block( BLOCK_SIZE );
6116 const size_t M( A.rows() );
6117 const size_t N( B.columns() );
6119 for(
size_t ii=0UL; ii<M; ii+=block ) {
6120 const size_t iend(
min( M, ii+block ) );
6121 for(
size_t jj=0UL; jj<N; jj+=block ) {
6122 const size_t jend(
min( N, jj+block ) );
6123 for(
size_t i=ii; i<iend; ++i )
6132 for(
size_t j=jbegin; j<jpos; ++j ) {
6133 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
6155 template<
typename MT3
6162 const size_t M( A.rows() );
6163 const size_t N( B.columns() );
6165 for(
size_t j=0UL; j<N; ++j )
6175 const size_t inum( iend - ibegin );
6176 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6178 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6179 (~C)(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6180 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6183 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6203 template<
typename MT3
6208 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6210 for(
size_t i=0UL; i<A.rows(); ++i ) {
6211 C(i,i) += A(i,i) * B(i,i) * scalar;
6230 template<
typename MT3
6235 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6237 selectDefaultAddAssignKernel( C, A, B, scalar );
6256 template<
typename MT3
6265 const size_t M( A.rows() );
6266 const size_t N( B.columns() );
6267 const size_t K( A.columns() );
6273 for( ; (i+2UL) <= M; i+=2UL )
6275 const size_t jend( LOW ? i+2UL : N );
6276 size_t j( UPP ? i : 0UL );
6278 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
6287 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6288 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6290 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6293 for( ; k<kpos; k+=SIMDSIZE ) {
6295 const SIMDType a2( A.load(i+1UL,k) );
6297 const SIMDType b2( B.load(k,j+1UL) );
6298 const SIMDType b3( B.load(k,j+2UL) );
6299 const SIMDType b4( B.load(k,j+3UL) );
6310 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6311 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6312 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
6313 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
6314 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
6315 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
6316 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
6317 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
6319 for( ; remainder && k<kend; ++k ) {
6320 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6321 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6322 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6323 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6324 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6325 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6326 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6327 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
6331 for( ; (j+2UL) <= jend; j+=2UL )
6340 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6341 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6346 for( ; k<kpos; k+=SIMDSIZE ) {
6348 const SIMDType a2( A.load(i+1UL,k) );
6350 const SIMDType b2( B.load(k,j+1UL) );
6357 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6358 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6359 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6360 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6362 for( ; remainder && k<kend; ++k ) {
6363 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6364 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6365 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6366 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6377 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6378 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6383 for( ; k<kpos; k+=SIMDSIZE ) {
6385 xmm1 += A.load(i ,k) * b1;
6386 xmm2 += A.load(i+1UL,k) * b1;
6389 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6390 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6392 for( ; remainder && k<kend; ++k ) {
6393 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6394 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6401 const size_t jend( LOW ? i+1UL : N );
6402 size_t j( UPP ? i : 0UL );
6404 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
6411 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6412 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6417 for( ; k<kpos; k+=SIMDSIZE ) {
6419 xmm1 += a1 * B.load(k,j );
6420 xmm2 += a1 * B.load(k,j+1UL);
6421 xmm3 += a1 * B.load(k,j+2UL);
6422 xmm4 += a1 * B.load(k,j+3UL);
6425 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6426 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6427 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
6428 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
6430 for( ; remainder && k<kend; ++k ) {
6431 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6432 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6433 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6434 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
6438 for( ; (j+2UL) <= jend; j+=2UL )
6445 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6446 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6451 for( ; k<kpos; k+=SIMDSIZE ) {
6453 xmm1 += a1 * B.load(k,j );
6454 xmm2 += a1 * B.load(k,j+1UL);
6457 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6458 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6460 for( ; remainder && k<kend; ++k ) {
6461 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6462 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6472 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
6473 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6478 for( ; k<kpos; k+=SIMDSIZE ) {
6479 xmm1 += A.load(i,k) * B.load(k,j);
6482 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6484 for( ; remainder && k<K; ++k ) {
6485 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
6507 template<
typename MT3
6516 const size_t M( A.rows() );
6517 const size_t N( B.columns() );
6518 const size_t K( A.columns() );
6524 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
6528 for( ; (j+2UL) <= N; j+=2UL )
6537 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6538 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6540 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6543 for( ; k<kpos; k+=SIMDSIZE ) {
6545 const SIMDType a2( A.load(i+1UL,k) );
6546 const SIMDType a3( A.load(i+2UL,k) );
6547 const SIMDType a4( A.load(i+3UL,k) );
6549 const SIMDType b2( B.load(k,j+1UL) );
6560 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6561 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6562 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6563 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6564 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6565 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6566 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6567 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
6569 for( ; remainder && k<kend; ++k ) {
6570 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6571 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6572 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6573 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6574 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6575 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6576 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6577 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
6588 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6589 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6594 for( ; k<kpos; k+=SIMDSIZE ) {
6596 xmm1 += A.load(i ,k) * b1;
6597 xmm2 += A.load(i+1UL,k) * b1;
6598 xmm3 += A.load(i+2UL,k) * b1;
6599 xmm4 += A.load(i+3UL,k) * b1;
6602 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6603 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6604 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
6605 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
6607 for( ; remainder && k<kend; ++k ) {
6608 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6609 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6610 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6611 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
6616 for( ; (i+2UL) <= M; i+=2UL )
6618 const size_t jend( LOW ? i+2UL : N );
6619 size_t j( UPP ? i : 0UL );
6621 for( ; (j+2UL) <= jend; j+=2UL )
6630 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6631 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6636 for( ; k<kpos; k+=SIMDSIZE ) {
6638 const SIMDType a2( A.load(i+1UL,k) );
6640 const SIMDType b2( B.load(k,j+1UL) );
6647 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6648 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6649 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6650 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6652 for( ; remainder && k<kend; ++k ) {
6653 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6654 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6655 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6656 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6667 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6668 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6673 for( ; k<kpos; k+=SIMDSIZE ) {
6675 xmm1 += A.load(i ,k) * b1;
6676 xmm2 += A.load(i+1UL,k) * b1;
6679 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6680 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6682 for( ; remainder && k<kend; ++k ) {
6683 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6684 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6691 const size_t jend( LOW ? i+1UL : N );
6692 size_t j( UPP ? i : 0UL );
6694 for( ; (j+2UL) <= jend; j+=2UL )
6701 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6702 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6707 for( ; k<kpos; k+=SIMDSIZE ) {
6709 xmm1 += a1 * B.load(k,j );
6710 xmm2 += a1 * B.load(k,j+1UL);
6713 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6714 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6716 for( ; remainder && k<kend; ++k ) {
6717 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6718 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6728 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
6729 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6734 for( ; k<kpos; k+=SIMDSIZE ) {
6735 xmm1 += A.load(i,k) * B.load(k,j);
6738 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6740 for( ; remainder && k<K; ++k ) {
6741 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
6762 template<
typename MT3
6767 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6769 selectDefaultAddAssignKernel( C, A, B, scalar );
6788 template<
typename MT3
6793 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6796 lmmm( C, A, B, scalar, ST2(1) );
6798 ummm( C, A, B, scalar, ST2(1) );
6800 mmm( C, A, B, scalar, ST2(1) );
6818 template<
typename MT3
6823 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6825 selectLargeAddAssignKernel( C, A, B, scalar );
6830 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6844 template<
typename MT3
6849 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6855 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6856 addAssign( C, tmp );
6860 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6861 addAssign( C, tmp );
6864 gemm( C, A, B, ET(scalar), ET(1) );
6886 template<
typename MT
6898 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6912 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
6927 template<
typename MT3
6931 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6934 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6935 selectSmallSubAssignKernel( C, A, B, scalar );
6937 selectBlasSubAssignKernel( C, A, B, scalar );
6955 template<
typename MT3
6960 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6963 subAssign( C, tmp );
6981 template<
typename MT3
6985 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6988 const size_t M( A.rows() );
6989 const size_t N( B.columns() );
6991 for(
size_t i=0UL; i<M; ++i )
7001 const size_t jnum( jend - jbegin );
7002 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7004 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7005 (~C)(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7006 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
7009 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
7029 template<
typename MT3
7033 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7036 constexpr
size_t block( BLOCK_SIZE );
7038 const size_t M( A.rows() );
7039 const size_t N( B.columns() );
7041 for(
size_t jj=0UL; jj<N; jj+=block ) {
7042 const size_t jend(
min( N, jj+block ) );
7043 for(
size_t ii=0UL; ii<M; ii+=block ) {
7044 const size_t iend(
min( M, ii+block ) );
7045 for(
size_t j=jj; j<jend; ++j )
7054 for(
size_t i=ibegin; i<ipos; ++i ) {
7055 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
7078 template<
typename MT3
7085 constexpr
size_t block( BLOCK_SIZE );
7087 const size_t M( A.rows() );
7088 const size_t N( B.columns() );
7090 for(
size_t ii=0UL; ii<M; ii+=block ) {
7091 const size_t iend(
min( M, ii+block ) );
7092 for(
size_t jj=0UL; jj<N; jj+=block ) {
7093 const size_t jend(
min( N, jj+block ) );
7094 for(
size_t i=ii; i<iend; ++i )
7103 for(
size_t j=jbegin; j<jpos; ++j ) {
7104 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
7127 template<
typename MT3
7134 const size_t M( A.rows() );
7135 const size_t N( B.columns() );
7137 for(
size_t j=0UL; j<N; ++j )
7147 const size_t inum( iend - ibegin );
7148 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7150 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7151 (~C)(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7152 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7155 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7175 template<
typename MT3
7180 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7182 for(
size_t i=0UL; i<A.rows(); ++i ) {
7183 C(i,i) -= A(i,i) * B(i,i) * scalar;
7202 template<
typename MT3
7207 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7209 selectDefaultSubAssignKernel( C, A, B, scalar );
7228 template<
typename MT3
7237 const size_t M( A.rows() );
7238 const size_t N( B.columns() );
7239 const size_t K( A.columns() );
7245 for( ; (i+2UL) <= M; i+=2UL )
7247 const size_t jend( LOW ? i+2UL : N );
7248 size_t j( UPP ? i : 0UL );
7250 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
7259 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7260 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7262 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7265 for( ; k<kpos; k+=SIMDSIZE ) {
7267 const SIMDType a2( A.load(i+1UL,k) );
7269 const SIMDType b2( B.load(k,j+1UL) );
7270 const SIMDType b3( B.load(k,j+2UL) );
7271 const SIMDType b4( B.load(k,j+3UL) );
7282 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7283 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7284 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
7285 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
7286 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
7287 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
7288 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
7289 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
7291 for( ; remainder && k<kend; ++k ) {
7292 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7293 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7294 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7295 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7296 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7297 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7298 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7299 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
7303 for( ; (j+2UL) <= jend; j+=2UL )
7312 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7313 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7318 for( ; k<kpos; k+=SIMDSIZE ) {
7320 const SIMDType a2( A.load(i+1UL,k) );
7322 const SIMDType b2( B.load(k,j+1UL) );
7329 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7330 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7331 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7332 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7334 for( ; remainder && k<kend; ++k ) {
7335 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7336 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7337 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7338 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7349 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7350 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7355 for( ; k<kpos; k+=SIMDSIZE ) {
7357 xmm1 += A.load(i ,k) * b1;
7358 xmm2 += A.load(i+1UL,k) * b1;
7361 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7362 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7364 for( ; remainder && k<kend; ++k ) {
7365 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7366 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7373 const size_t jend( LOW ? i+1UL : N );
7374 size_t j( UPP ? i : 0UL );
7376 for( ; !( LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
7383 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7384 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7389 for( ; k<kpos; k+=SIMDSIZE ) {
7391 xmm1 += a1 * B.load(k,j );
7392 xmm2 += a1 * B.load(k,j+1UL);
7393 xmm3 += a1 * B.load(k,j+2UL);
7394 xmm4 += a1 * B.load(k,j+3UL);
7397 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7398 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7399 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
7400 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
7402 for( ; remainder && k<kend; ++k ) {
7403 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7404 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7405 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7406 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
7410 for( ; (j+2UL) <= jend; j+=2UL )
7417 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7418 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7423 for( ; k<kpos; k+=SIMDSIZE ) {
7425 xmm1 += a1 * B.load(k,j );
7426 xmm2 += a1 * B.load(k,j+1UL);
7429 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7430 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7432 for( ; remainder && k<kend; ++k ) {
7433 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7434 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7444 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
7445 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7450 for( ; k<kpos; k+=SIMDSIZE ) {
7451 xmm1 += A.load(i,k) * B.load(k,j);
7454 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7456 for( ; remainder && k<K; ++k ) {
7457 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7479 template<
typename MT3
7488 const size_t M( A.rows() );
7489 const size_t N( B.columns() );
7490 const size_t K( A.columns() );
7496 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
7500 for( ; (j+2UL) <= N; j+=2UL )
7509 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7510 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7512 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7515 for( ; k<kpos; k+=SIMDSIZE )
7518 const SIMDType a2( A.load(i+1UL,k) );
7519 const SIMDType a3( A.load(i+2UL,k) );
7520 const SIMDType a4( A.load(i+3UL,k) );
7522 const SIMDType b2( B.load(k,j+1UL) );
7533 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7534 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7535 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7536 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7537 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7538 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7539 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7540 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
7542 for( ; remainder && k<kend; ++k ) {
7543 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7544 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7545 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7546 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7547 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7548 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7549 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7550 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
7561 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7562 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7567 for( ; k<kpos; k+=SIMDSIZE ) {
7569 xmm1 += A.load(i ,k) * b1;
7570 xmm2 += A.load(i+1UL,k) * b1;
7571 xmm3 += A.load(i+2UL,k) * b1;
7572 xmm4 += A.load(i+3UL,k) * b1;
7575 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7576 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7577 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
7578 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
7580 for( ; remainder && k<kend; ++k ) {
7581 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7582 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7583 (~C)(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7584 (~C)(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
7589 for( ; (i+2UL) <= M; i+=2UL )
7591 const size_t jend( LOW ? i+2UL : N );
7592 size_t j( UPP ? i : 0UL );
7594 for( ; (j+2UL) <= jend; j+=2UL )
7603 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7604 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7609 for( ; k<kpos; k+=SIMDSIZE ) {
7611 const SIMDType a2( A.load(i+1UL,k) );
7613 const SIMDType b2( B.load(k,j+1UL) );
7620 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7621 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7622 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7623 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7625 for( ; remainder && k<kend; ++k ) {
7626 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7627 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7628 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7629 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7640 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7641 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7646 for( ; k<kpos; k+=SIMDSIZE ) {
7648 xmm1 += A.load(i ,k) * b1;
7649 xmm2 += A.load(i+1UL,k) * b1;
7652 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7653 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7655 for( ; remainder && k<kend; ++k ) {
7656 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7657 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7664 const size_t jend( LOW ? i+1UL : N );
7665 size_t j( UPP ? i : 0UL );
7667 for( ; (j+2UL) <= jend; j+=2UL )
7674 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7675 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7680 for( ; k<kpos; k+=SIMDSIZE ) {
7682 xmm1 += a1 * B.load(k,j );
7683 xmm2 += a1 * B.load(k,j+1UL);
7686 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7687 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7689 for( ; remainder && k<kend; ++k ) {
7690 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7691 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7701 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
7702 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7707 for( ; k<kpos; k+=SIMDSIZE ) {
7708 xmm1 += A.load(i,k) * B.load(k,j);
7711 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7713 for( ; remainder && k<K; ++k ) {
7714 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7735 template<
typename MT3
7740 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7742 selectDefaultSubAssignKernel( C, A, B, scalar );
7761 template<
typename MT3
7766 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7769 lmmm( C, A, B, -scalar, ST2(1) );
7771 ummm( C, A, B, -scalar, ST2(1) );
7773 mmm( C, A, B, -scalar, ST2(1) );
7791 template<
typename MT3
7796 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7798 selectLargeSubAssignKernel( C, A, B, scalar );
7803 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7817 template<
typename MT3
7822 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7828 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7829 subAssign( C, tmp );
7833 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7834 subAssign( C, tmp );
7837 gemm( C, A, B, ET(-scalar), ET(1) );
7859 template<
typename MT
7873 schurAssign( ~lhs, tmp );
7904 template<
typename MT
7917 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7920 else if( left.columns() == 0UL ) {
7954 template<
typename MT
7973 const ForwardFunctor fwd;
7975 const TmpType tmp( rhs );
7995 template<
typename MT
8008 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8045 template<
typename MT
8058 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8092 template<
typename MT
8175 template<
typename MT1
8177 inline decltype(
auto)
8225 template<
typename MT1
8271 template<
typename MT1
8317 template<
typename MT1
8363 template<
typename MT1
8409 template<
typename MT1
8440 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8441 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
8442 :
public Size<MT1,0UL>
8445 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8446 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
8447 :
public Size<MT2,1UL>
8463 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8464 struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8465 :
public And< IsAligned<MT1>, IsAligned<MT2> >
8481 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8482 struct IsSymmetric< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8483 :
public Or< Bool<SF>
8485 , IsBuiltin< ElementType_< DMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
8486 , And< Bool<LF>, Bool<UF> > >
8502 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
8503 struct IsHermitian< DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
8520 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8521 struct IsLower< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8522 :
public Or< Bool<LF>
8523 , And< IsLower<MT1>, IsLower<MT2> >
8524 , And< Or< Bool<SF>, Bool<HF> >
8525 , IsUpper<MT1>, IsUpper<MT2> > >
8541 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8542 struct IsUniLower< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8543 :
public Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
8544 , And< Or< Bool<SF>, Bool<HF> >
8545 , IsUniUpper<MT1>, IsUniUpper<MT2> > >
8561 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8563 :
public Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
8564 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
8565 , And< Or< Bool<SF>, Bool<HF> >
8566 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8567 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >
8583 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8584 struct IsUpper< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8585 :
public Or< Bool<UF>
8586 , And< IsUpper<MT1>, IsUpper<MT2> >
8587 , And< Or< Bool<SF>, Bool<HF> >
8588 , IsLower<MT1>, IsLower<MT2> > >
8604 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8605 struct IsUniUpper< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8606 :
public Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
8607 , And< Or< Bool<SF>, Bool<HF> >
8608 , IsUniLower<MT1>, IsUniLower<MT2> > >
8624 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8626 :
public Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8627 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
8628 , And< Or< Bool<SF>, Bool<HF> >
8629 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
8630 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:71
Constraint on the data type.
Header file for kernel specific block sizes.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:264
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:261
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:544
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:363
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:617
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:534
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1026
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:169
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:465
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:379
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:299
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:255
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:256
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:433
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:107
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:270
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:170
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:257
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1026
Header file for the IsLower type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:421
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:260
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:466
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:616
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:430
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:267
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:107
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
Header file for the conjugate shim.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:258
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:443
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:259
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:148
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:409
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1028
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:273
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:399
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1028
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:314
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time logical 'or' evaluation.The Or alias declaration performs at compile time a logical 'or'...
Definition: Or.h:76
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation.The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the DeclHerm functor.
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications.The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:142
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:171
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:389
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:908
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:453
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:172
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.