35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 164 template<
typename MT1
// Derived structure flags. SF/HF/LF/UF are declared outside this view
// (presumably the symmetric/Hermitian/lower/upper template flags — confirm).
// SYM holds only when SF is set and none of HF, LF, UF is.
197 SYM = ( SF && !( HF || LF || UF ) ),
// HERM holds only when HF is set and neither LF nor UF is.
198 HERM = ( HF && !( LF || UF ) ),
// LOW holds when LF is set, or when UF is combined with SF or HF.
199 LOW = ( LF || ( ( SF || HF ) && UF ) ),
// UPP holds when UF is set, or when LF is combined with SF or HF.
200 UPP = ( UF || ( ( SF || HF ) && LF ) )
212 template<
typename T1,
typename T2,
typename T3 >
213 struct CanExploitSymmetry {
// Compile-time predicate over three types: `value` is true when at least one
// of evaluateLeft / evaluateRight is set AND CanExploitSymmetry<T1,T2,T3>::value
// is false.
226 template<
typename T1,
typename T2,
typename T3 >
227 struct IsEvaluationRequired {
228 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
229 !CanExploitSymmetry<T1,T2,T3>::value };
239 template<
typename T1,
typename T2,
typename T3 >
240 struct UseBlasKernel {
247 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
262 template<
typename T1,
typename T2,
typename T3 >
263 struct UseVectorizedDefaultKernel {
266 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
291 ,
Noop > > > > ForwardFunctor;
324 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable is true only when neither operand requires evaluation
// (evaluateLeft and evaluateRight both false) and both MT1 and MT2 report
// smpAssignable themselves.
329 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
330 !evaluateRight && MT2::smpAssignable };
385 :(
lhs_.columns() ) ) );
389 const size_t n(
end - begin );
// Checked element access: row index i is compared against lhs_.rows() and
// column index j against rhs_.columns(). What happens on an out-of-range index
// and how the element is produced is not visible in this excerpt.
407 inline ReturnType
at(
size_t i,
size_t j )
const {
// Row index outside the left operand's row count.
408 if( i >=
lhs_.rows() ) {
// Column index outside the right operand's column count.
411 if( j >=
rhs_.columns() ) {
423 inline size_t rows() const noexcept {
434 return rhs_.columns();
// Returns true when either operand reports that it can alias with the given
// address; the query is forwarded to lhs_.canAlias() and rhs_.canAlias().
464 template<
typename T >
465 inline bool canAlias(
const T* alias )
const noexcept {
466 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns true when either operand reports that it is aliased with the given
// address; the query is forwarded to lhs_.isAliased() and rhs_.isAliased().
476 template<
typename T >
477 inline bool isAliased(
const T* alias )
const noexcept {
478 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
488 return lhs_.isAligned() &&
rhs_.isAligned();
499 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
500 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
524 template<
typename MT
534 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
537 else if( rhs.
lhs_.columns() == 0UL ) {
552 DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for C = A * B. The opening of the condition and the branch
// keyword before the BLAS call are not visible in this excerpt.
568 template<
typename MT3
571 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Outside debug mode, a right-hand operand with at most SIMDSIZE*10 columns
// selects the small kernel ...
574 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
// ... as does a target with fewer than DMATDMATMULT_THRESHOLD elements.
575 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
576 selectSmallAssignKernel( C, A, B );
// Presumably the alternative branch of the condition above — confirm.
578 selectBlasAssignKernel( C, A, B );
597 template<
typename MT3
601 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
603 const size_t M( A.rows() );
604 const size_t N( B.columns() );
605 const size_t K( A.columns() );
609 for(
size_t i=0UL; i<M; ++i )
620 for(
size_t j=0UL; j<N; ++j ) {
629 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
630 :( UPP ?
max(i,kbegin) : kbegin ) )
631 :( UPP ? i : 0UL ) );
634 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
635 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
636 :( LOW ? i+1UL : N ) );
639 for(
size_t j=0UL; j<jbegin; ++j ) {
646 for(
size_t j=jbegin; j<jend; ++j ) {
647 C(i,j) = A(i,kbegin) * B(kbegin,j);
650 for(
size_t j=jend; j<N; ++j ) {
659 for(
size_t k=kbegin+1UL; k<kend; ++k )
663 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
664 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
665 :( SYM || HERM || UPP ? i : 0UL ) );
668 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
669 :( LOW ?
min(i+1UL,k) : k ) )
670 :( LOW ? i+1UL : N ) );
672 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
675 for(
size_t j=jbegin; j<jend; ++j ) {
676 C(i,j) += A(i,k) * B(k,j);
679 C(i,jend) = A(i,k) * B(k,jend);
// Fills every element below the diagonal (j < i) from its transposed
// counterpart C(j,i); the value is conjugated when HERM is set.
// NOTE(review): the condition guarding this loop is not visible here.
685 for(
size_t i=1UL; i<M; ++i ) {
686 for(
size_t j=0UL; j<i; ++j ) {
687 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel overload constrained on MT4 not satisfying
// IsDiagonal and MT5 satisfying IsDiagonal. Within [jbegin,jend) each target
// element is a single product A(i,j) * B(j,j).
709 template<
typename MT3
712 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
713 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
717 const size_t M( A.rows() );
718 const size_t N( B.columns() );
720 for(
size_t i=0UL; i<M; ++i )
// Columns before jbegin; loop body not visible (presumably reset — confirm).
731 for(
size_t j=0UL; j<jbegin; ++j ) {
// Only the diagonal element B(j,j) contributes to column j.
735 for(
size_t j=jbegin; j<jend; ++j ) {
736 C(i,j) = A(i,j) * B(j,j);
// Columns from jend onward; loop body not visible (presumably reset — confirm).
739 for(
size_t j=jend; j<N; ++j ) {
762 template<
typename MT3
766 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
770 const size_t M( A.rows() );
771 const size_t N( B.columns() );
773 for(
size_t i=0UL; i<M; ++i )
784 for(
size_t j=0UL; j<jbegin; ++j ) {
788 for(
size_t j=jbegin; j<jend; ++j ) {
789 C(i,j) = A(i,i) * B(i,j);
792 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel overload that writes only the diagonal entries of
// C, each as the product of the matching diagonal entries of A and B.
// The enabling condition for this overload is not visible in this excerpt.
815 template<
typename MT3
819 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
825 for(
size_t i=0UL; i<A.rows(); ++i ) {
826 C(i,i) = A(i,i) * B(i,i);
845 template<
typename MT3
849 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
851 selectDefaultAssignKernel( C, A, B );
871 template<
typename MT3
879 const size_t M( A.rows() );
880 const size_t N( B.columns() );
881 const size_t K( A.columns() );
885 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
888 if( LOW && UPP && N > SIMDSIZE*3UL ) {
897 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
898 for(
size_t i=0UL; i<M; ++i )
911 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
913 for(
size_t k=kbegin; k<kend; ++k ) {
914 const SIMDType a1(
set( A(i,k) ) );
915 xmm1 += a1 * B.load(k,j );
916 xmm2 += a1 * B.load(k,j+SIMDSIZE );
917 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
918 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
919 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
920 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
921 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
922 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
925 (~C).store( i, j , xmm1 );
926 (~C).store( i, j+SIMDSIZE , xmm2 );
927 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
928 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
929 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
930 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
931 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
932 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
937 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
941 for( ; (i+2UL) <= M; i+=2UL )
954 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
956 for(
size_t k=kbegin; k<kend; ++k ) {
957 const SIMDType a1(
set( A(i ,k) ) );
958 const SIMDType a2(
set( A(i+1UL,k) ) );
959 const SIMDType b1( B.load(k,j ) );
960 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
961 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
962 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
963 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
976 (~C).store( i , j , xmm1 );
977 (~C).store( i , j+SIMDSIZE , xmm2 );
978 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
979 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
980 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
981 (~C).store( i+1UL, j , xmm6 );
982 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
983 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
984 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
985 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
997 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
999 for(
size_t k=kbegin; k<kend; ++k ) {
1000 const SIMDType a1(
set( A(i,k) ) );
1001 xmm1 += a1 * B.load(k,j );
1002 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1003 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1004 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1005 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1008 (~C).store( i, j , xmm1 );
1009 (~C).store( i, j+SIMDSIZE , xmm2 );
1010 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1011 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1012 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1016 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1018 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
1019 size_t i( LOW ? j : 0UL );
1021 for( ; (i+2UL) <= iend; i+=2UL )
1034 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1036 for(
size_t k=kbegin; k<kend; ++k ) {
1037 const SIMDType a1(
set( A(i ,k) ) );
1038 const SIMDType a2(
set( A(i+1UL,k) ) );
1039 const SIMDType b1( B.load(k,j ) );
1040 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1041 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1042 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1053 (~C).store( i , j , xmm1 );
1054 (~C).store( i , j+SIMDSIZE , xmm2 );
1055 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1056 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1057 (~C).store( i+1UL, j , xmm5 );
1058 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1059 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1060 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1072 SIMDType xmm1, xmm2, xmm3, xmm4;
1074 for(
size_t k=kbegin; k<kend; ++k ) {
1075 const SIMDType a1(
set( A(i,k) ) );
1076 xmm1 += a1 * B.load(k,j );
1077 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1078 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1079 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1082 (~C).store( i, j , xmm1 );
1083 (~C).store( i, j+SIMDSIZE , xmm2 );
1084 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1085 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1089 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1091 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
1092 size_t i( LOW ? j : 0UL );
1094 for( ; (i+2UL) <= iend; i+=2UL )
1107 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1109 for(
size_t k=kbegin; k<kend; ++k ) {
1110 const SIMDType a1(
set( A(i ,k) ) );
1111 const SIMDType a2(
set( A(i+1UL,k) ) );
1112 const SIMDType b1( B.load(k,j ) );
1113 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1114 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1123 (~C).store( i , j , xmm1 );
1124 (~C).store( i , j+SIMDSIZE , xmm2 );
1125 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1126 (~C).store( i+1UL, j , xmm4 );
1127 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1128 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1140 SIMDType xmm1, xmm2, xmm3;
1142 for(
size_t k=kbegin; k<kend; ++k ) {
1143 const SIMDType a1(
set( A(i,k) ) );
1144 xmm1 += a1 * B.load(k,j );
1145 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1146 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1149 (~C).store( i, j , xmm1 );
1150 (~C).store( i, j+SIMDSIZE , xmm2 );
1151 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1155 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1157 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
1158 size_t i( LOW ? j : 0UL );
1160 for( ; (i+2UL) <= iend; i+=2UL )
1173 SIMDType xmm1, xmm2, xmm3, xmm4;
1175 for(
size_t k=kbegin; k<kend; ++k ) {
1176 const SIMDType a1(
set( A(i ,k) ) );
1177 const SIMDType a2(
set( A(i+1UL,k) ) );
1178 const SIMDType b1( B.load(k,j ) );
1179 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1186 (~C).store( i , j , xmm1 );
1187 (~C).store( i , j+SIMDSIZE, xmm2 );
1188 (~C).store( i+1UL, j , xmm3 );
1189 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1201 SIMDType xmm1, xmm2;
1203 for(
size_t k=kbegin; k<kend; ++k ) {
1204 const SIMDType a1(
set( A(i,k) ) );
1205 xmm1 += a1 * B.load(k,j );
1206 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1209 (~C).store( i, j , xmm1 );
1210 (~C).store( i, j+SIMDSIZE, xmm2 );
1214 for( ; j<jpos; j+=SIMDSIZE )
1216 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
1217 size_t i( LOW ? j : 0UL );
1219 for( ; (i+2UL) <= iend; i+=2UL )
1230 SIMDType xmm1, xmm2;
1232 for(
size_t k=kbegin; k<kend; ++k ) {
1233 const SIMDType b1( B.load(k,j) );
1234 xmm1 +=
set( A(i ,k) ) * b1;
1235 xmm2 +=
set( A(i+1UL,k) ) * b1;
1238 (~C).store( i , j, xmm1 );
1239 (~C).store( i+1UL, j, xmm2 );
1252 for(
size_t k=kbegin; k<K; ++k ) {
1253 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1256 (~C).store( i, j, xmm1 );
1260 for( ; remainder && j<N; ++j )
1262 size_t i( LOW && UPP ? j : 0UL );
1264 for( ; (i+2UL) <= M; i+=2UL )
1278 for(
size_t k=kbegin; k<kend; ++k ) {
1279 value1 += A(i ,k) * B(k,j);
1280 value2 += A(i+1UL,k) * B(k,j);
1283 (~C)(i ,j) = value1;
1284 (~C)(i+1UL,j) = value2;
1297 for(
size_t k=kbegin; k<K; ++k ) {
1298 value += A(i,k) * B(k,j);
// For SYM/HERM results wider than SIMDSIZE*4 columns: copy elements from the
// transposed position (conjugated when HERM) into rows i >= SIMDSIZE*4.
1306 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1307 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
// jend is i rounded down to a multiple of SIMDSIZE*4, so only columns in
// blocks strictly before the block containing i are filled here.
1308 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1309 for(
size_t j=0UL; j<jend; ++j ) {
1310 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1314 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1315 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1316 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1317 for(
size_t i=0UL; i<iend; ++i ) {
1322 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1323 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1324 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1325 for(
size_t j=0UL; j<jend; ++j ) {
1349 template<
typename MT3
1360 const ForwardFunctor fwd;
1364 assign( ~C, fwd( tmp * B ) );
1368 assign( ~C, fwd( A * tmp ) );
1370 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1372 assign( ~C, fwd( tmp * B ) );
1376 assign( ~C, fwd( A * tmp ) );
1395 template<
typename MT3
1399 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1401 selectDefaultAssignKernel( C, A, B );
1420 template<
typename MT3
1424 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1453 template<
typename MT3
1457 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1459 selectLargeAssignKernel( C, A, B );
// BLAS-backed assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
1465 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1478 template<
typename MT3
1482 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalars ET(1) and ET(0) are passed to gemm; presumably alpha=1, beta=0 in
// the usual BLAS convention, i.e. C is overwritten with A*B — confirm.
1495 gemm( C, A, B, ET(1), ET(0) );
1515 template<
typename MT
1534 const ForwardFunctor fwd;
1536 const TmpType tmp(
serial( rhs ) );
1537 assign( ~lhs, fwd( tmp ) );
1557 template<
typename MT >
1568 const ForwardFunctor fwd;
1593 template<
typename MT
1603 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
1617 DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for C += A * B. The opening of the condition and the branch
// keyword before the BLAS call are not visible in this excerpt.
1633 template<
typename MT3
1636 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Outside debug mode, a right-hand operand with at most SIMDSIZE*10 columns
// selects the small kernel ...
1639 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
// ... as does a target with fewer than DMATDMATMULT_THRESHOLD elements.
1640 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1641 selectSmallAddAssignKernel( C, A, B );
// Presumably the alternative branch of the condition above — confirm.
1643 selectBlasAddAssignKernel( C, A, B );
// Default addition-assignment kernel: for each row i and inner index k, adds
// A(i,k) * B(k,j) to C(i,j) over a column range [jbegin,jend).
// The declarations of kbegin/kend/jbegin/jend are only partly visible here.
1662 template<
typename MT3
1666 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1668 const size_t M( A.rows() );
1669 const size_t N( B.columns() );
1670 const size_t K( A.columns() );
1674 for(
size_t i=0UL; i<M; ++i )
1684 for(
size_t k=kbegin; k<kend; ++k )
// Tail of the jbegin initializer: with UPP the lower bound is additionally
// clamped to be at least i.
1688 ?( UPP ?
max(i,k+1UL) : k+1UL )
1689 :( UPP ?
max(i,k) : k ) )
1690 :( UPP ? i : 0UL ) );
// Tail of the jend initializer: with LOW the upper bound is additionally
// clamped to at most i+1.
1693 ?( LOW ?
min(i+1UL,k) : k )
1694 :( LOW ?
min(i,k)+1UL : k+1UL ) )
1695 :( LOW ? i+1UL : N ) );
// With a triangular result the clamped range can be empty; skip this k then.
1697 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
// jpos = jbegin + (range length rounded down to an even count), so the loop
// below can process columns two at a time.
1700 const size_t jnum( jend - jbegin );
1701 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1703 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1704 C(i,j ) += A(i,k) * B(k,j );
1705 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update for the single column at jpos; its guard is not visible in this
// excerpt (presumably taken only when jpos < jend — confirm).
1708 C(i,jpos) += A(i,k) * B(k,jpos);
1730 template<
typename MT3
1733 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
1734 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1738 const size_t M( A.rows() );
1739 const size_t N( B.columns() );
1741 for(
size_t i=0UL; i<M; ++i )
1751 const size_t jnum( jend - jbegin );
1752 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1754 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1755 C(i,j ) += A(i,j ) * B(j ,j );
1756 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1759 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
1780 template<
typename MT3
1784 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1788 const size_t M( A.rows() );
1789 const size_t N( B.columns() );
1791 for(
size_t i=0UL; i<M; ++i )
1801 const size_t jnum( jend - jbegin );
1802 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1804 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1805 C(i,j ) += A(i,i) * B(i,j );
1806 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
1809 C(i,jpos) += A(i,i) * B(i,jpos);
// Default addition-assignment kernel overload that updates only the diagonal
// entries of C, adding the product of the matching diagonal entries of A and B.
// The enabling condition for this overload is not visible in this excerpt.
1830 template<
typename MT3
1834 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1838 for(
size_t i=0UL; i<A.rows(); ++i ) {
1839 C(i,i) += A(i,i) * B(i,i);
1859 template<
typename MT3
1863 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1865 selectDefaultAddAssignKernel( C, A, B );
1885 template<
typename MT3
1893 const size_t M( A.rows() );
1894 const size_t N( B.columns() );
1895 const size_t K( A.columns() );
1899 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1906 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1907 for(
size_t i=0UL; i<M; ++i )
1920 SIMDType xmm1( (~C).load(i,j ) );
1921 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
1922 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
1923 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
1924 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
1925 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
1926 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
1927 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
1929 for(
size_t k=kbegin; k<kend; ++k ) {
1930 const SIMDType a1(
set( A(i,k) ) );
1931 xmm1 += a1 * B.load(k,j );
1932 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1933 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1934 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1935 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1936 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1937 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1938 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1941 (~C).store( i, j , xmm1 );
1942 (~C).store( i, j+SIMDSIZE , xmm2 );
1943 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1944 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1945 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1946 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1947 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1948 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1953 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1957 for( ; (i+2UL) <= M; i+=2UL )
1970 SIMDType xmm1 ( (~C).load(i ,j ) );
1971 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
1972 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
1973 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
1974 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
1975 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
1976 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
1977 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
1978 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
1979 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
1981 for(
size_t k=kbegin; k<kend; ++k ) {
1982 const SIMDType a1(
set( A(i ,k) ) );
1983 const SIMDType a2(
set( A(i+1UL,k) ) );
1984 const SIMDType b1( B.load(k,j ) );
1985 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1986 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1987 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1988 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2001 (~C).store( i , j , xmm1 );
2002 (~C).store( i , j+SIMDSIZE , xmm2 );
2003 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2004 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2005 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
2006 (~C).store( i+1UL, j , xmm6 );
2007 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
2008 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2009 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2010 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2022 SIMDType xmm1( (~C).load(i,j ) );
2023 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2024 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2025 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2026 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2028 for(
size_t k=kbegin; k<kend; ++k ) {
2029 const SIMDType a1(
set( A(i,k) ) );
2030 xmm1 += a1 * B.load(k,j );
2031 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2032 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2033 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2034 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2037 (~C).store( i, j , xmm1 );
2038 (~C).store( i, j+SIMDSIZE , xmm2 );
2039 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2040 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2041 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2045 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2049 for( ; (i+2UL) <= M; i+=2UL )
2062 SIMDType xmm1( (~C).load(i ,j ) );
2063 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2064 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2065 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2066 SIMDType xmm5( (~C).load(i+1UL,j ) );
2067 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2068 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2069 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2071 for(
size_t k=kbegin; k<kend; ++k ) {
2072 const SIMDType a1(
set( A(i ,k) ) );
2073 const SIMDType a2(
set( A(i+1UL,k) ) );
2074 const SIMDType b1( B.load(k,j ) );
2075 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2076 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2077 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2088 (~C).store( i , j , xmm1 );
2089 (~C).store( i , j+SIMDSIZE , xmm2 );
2090 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2091 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2092 (~C).store( i+1UL, j , xmm5 );
2093 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2094 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2095 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2107 SIMDType xmm1( (~C).load(i,j ) );
2108 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2109 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2110 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2112 for(
size_t k=kbegin; k<kend; ++k ) {
2113 const SIMDType a1(
set( A(i,k) ) );
2114 xmm1 += a1 * B.load(k,j );
2115 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2116 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2117 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2120 (~C).store( i, j , xmm1 );
2121 (~C).store( i, j+SIMDSIZE , xmm2 );
2122 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2123 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2127 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2131 for( ; (i+2UL) <= M; i+=2UL )
2144 SIMDType xmm1( (~C).load(i ,j ) );
2145 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2146 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2147 SIMDType xmm4( (~C).load(i+1UL,j ) );
2148 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
2149 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2151 for(
size_t k=kbegin; k<kend; ++k ) {
2152 const SIMDType a1(
set( A(i ,k) ) );
2153 const SIMDType a2(
set( A(i+1UL,k) ) );
2154 const SIMDType b1( B.load(k,j ) );
2155 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2156 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2165 (~C).store( i , j , xmm1 );
2166 (~C).store( i , j+SIMDSIZE , xmm2 );
2167 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2168 (~C).store( i+1UL, j , xmm4 );
2169 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
2170 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2182 SIMDType xmm1( (~C).load(i,j ) );
2183 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2184 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2186 for(
size_t k=kbegin; k<kend; ++k ) {
2187 const SIMDType a1(
set( A(i,k) ) );
2188 xmm1 += a1 * B.load(k,j );
2189 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2190 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2193 (~C).store( i, j , xmm1 );
2194 (~C).store( i, j+SIMDSIZE , xmm2 );
2195 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2199 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2201 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
2202 size_t i( LOW ? j : 0UL );
2204 for( ; (i+2UL) <= iend; i+=2UL )
2217 SIMDType xmm1( (~C).load(i ,j ) );
2218 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2219 SIMDType xmm3( (~C).load(i+1UL,j ) );
2220 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2222 for(
size_t k=kbegin; k<kend; ++k ) {
2223 const SIMDType a1(
set( A(i ,k) ) );
2224 const SIMDType a2(
set( A(i+1UL,k) ) );
2225 const SIMDType b1( B.load(k,j ) );
2226 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2233 (~C).store( i , j , xmm1 );
2234 (~C).store( i , j+SIMDSIZE, xmm2 );
2235 (~C).store( i+1UL, j , xmm3 );
2236 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2248 SIMDType xmm1( (~C).load(i,j ) );
2249 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2251 for(
size_t k=kbegin; k<kend; ++k ) {
2252 const SIMDType a1(
set( A(i,k) ) );
2253 xmm1 += a1 * B.load(k,j );
2254 xmm2 += a1 * B.load(k,j+SIMDSIZE);
2257 (~C).store( i, j , xmm1 );
2258 (~C).store( i, j+SIMDSIZE, xmm2 );
// Remaining full SIMD columns, one SIMD width per iteration.
2262 for( ; j<jpos; j+=SIMDSIZE )
// NOTE(review): the row bound is tightened only when LOW && UPP both hold,
// whereas the two-SIMD-wide loop and the scalar remainder loop of this kernel
// tighten it for UPP alone. Confirm whether `UPP ? ... : M` was intended;
// as written, an upper (non-lower) result iterates all M rows here.
2264 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
// For a lower result the rows above the current column are skipped.
2265 size_t i( LOW ? j : 0UL );
2267 for( ; (i+2UL) <= iend; i+=2UL )
2278 SIMDType xmm1( (~C).load(i ,j) );
2279 SIMDType xmm2( (~C).load(i+1UL,j) );
2281 for(
size_t k=kbegin; k<kend; ++k ) {
2282 const SIMDType b1( B.load(k,j) );
2283 xmm1 +=
set( A(i ,k) ) * b1;
2284 xmm2 +=
set( A(i+1UL,k) ) * b1;
2287 (~C).store( i , j, xmm1 );
2288 (~C).store( i+1UL, j, xmm2 );
2299 SIMDType xmm1( (~C).load(i,j) );
2301 for(
size_t k=kbegin; k<K; ++k ) {
2302 xmm1 +=
set( A(i,k) ) * B.load(k,j);
2305 (~C).store( i, j, xmm1 );
2309 for( ; remainder && j<N; ++j )
2311 const size_t iend( UPP ? j+1UL : M );
2312 size_t i( LOW ? j : 0UL );
2314 for( ; (i+2UL) <= iend; i+=2UL )
// Scalar remainder path for two rows at once: start from the current target
// values, accumulate the products over k, then write the sums back.
2325 ElementType value1( (~C)(i ,j) );
// NOTE(review): stray second ';' at the end of the next line (an empty
// statement); harmless but should be removed.
2326 ElementType value2( (~C)(i+1UL,j) );;
2328 for(
size_t k=kbegin; k<kend; ++k ) {
2329 value1 += A(i ,k) * B(k,j);
2330 value2 += A(i+1UL,k) * B(k,j);
2333 (~C)(i ,j) = value1;
2334 (~C)(i+1UL,j) = value2;
2345 ElementType value( (~C)(i,j) );
2347 for(
size_t k=kbegin; k<K; ++k ) {
2348 value += A(i,k) * B(k,j);
2373 template<
typename MT3
2384 const ForwardFunctor fwd;
2388 addAssign( ~C, fwd( tmp * B ) );
2392 addAssign( ~C, fwd( A * tmp ) );
2394 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2396 addAssign( ~C, fwd( tmp * B ) );
2400 addAssign( ~C, fwd( A * tmp ) );
2420 template<
typename MT3
2424 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2426 selectDefaultAddAssignKernel( C, A, B );
2446 template<
typename MT3
2450 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2476 template<
typename MT3
2480 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2482 selectLargeAddAssignKernel( C, A, B );
// BLAS-backed addition-assignment kernel; compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
2488 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2502 template<
typename MT3
2506 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two paths add a temporary `tmp` to C; the conditions selecting them and the
// construction of `tmp` are not visible in this excerpt.
2513 addAssign( C, tmp );
2518 addAssign( C, tmp );
// Scalars ET(1) and ET(1) are passed to gemm; presumably alpha=1, beta=1 in
// the usual BLAS convention, i.e. A*B is accumulated into C — confirm.
2521 gemm( C, A, B, ET(1), ET(1) );
2543 template<
typename MT >
2554 const ForwardFunctor fwd;
2583 template<
typename MT
2593 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
2607 DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatch for C -= A * B. The opening of the condition and the branch
// keyword before the BLAS call are not visible in this excerpt.
2623 template<
typename MT3
2626 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Outside debug mode, a right-hand operand with at most SIMDSIZE*10 columns
// selects the small kernel ...
2629 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
// ... as does a target with fewer than DMATDMATMULT_THRESHOLD elements.
2630 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2631 selectSmallSubAssignKernel( C, A, B );
// Presumably the alternative branch of the condition above — confirm.
2633 selectBlasSubAssignKernel( C, A, B );
2652 template<
typename MT3
2656 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2658 const size_t M( A.rows() );
2659 const size_t N( B.columns() );
2660 const size_t K( A.columns() );
2664 for(
size_t i=0UL; i<M; ++i )
2674 for(
size_t k=kbegin; k<kend; ++k )
2678 ?( UPP ?
max(i,k+1UL) : k+1UL )
2679 :( UPP ?
max(i,k) : k ) )
2680 :( UPP ? i : 0UL ) );
2683 ?( LOW ?
min(i+1UL,k) : k )
2684 :( LOW ?
min(i,k)+1UL : k+1UL ) )
2685 :( LOW ? i+1UL : N ) );
2687 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
2690 const size_t jnum( jend - jbegin );
2691 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2693 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2694 C(i,j ) -= A(i,k) * B(k,j );
2695 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
2698 C(i,jpos) -= A(i,k) * B(k,jpos);
2720 template<
typename MT3
2723 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2724 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2728 const size_t M( A.rows() );
2729 const size_t N( B.columns() );
2731 for(
size_t i=0UL; i<M; ++i )
2741 const size_t jnum( jend - jbegin );
2742 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2744 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2745 C(i,j ) -= A(i,j ) * B(j ,j );
2746 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
2749 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
2770 template<
typename MT3
2774 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2778 const size_t M( A.rows() );
2779 const size_t N( B.columns() );
2781 for(
size_t i=0UL; i<M; ++i )
2791 const size_t jnum( jend - jbegin );
2792 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2794 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2795 C(i,j ) -= A(i,i) * B(i,j );
2796 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
2799 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel overload that updates only the diagonal
// entries of C, subtracting the product of the matching diagonal entries of
// A and B. The enabling condition for this overload is not visible here.
2820 template<
typename MT3
2824 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2828 for(
size_t i=0UL; i<A.rows(); ++i ) {
2829 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel overload that simply forwards to
// selectDefaultSubAssignKernel with the same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
2849 template<
typename MT3
2853 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2855 selectDefaultSubAssignKernel( C, A, B );
// SIMD subtraction kernel: for a block of columns starting at j, the current values of C
// are loaded, A(i,k) * B(k,j..) is subtracted for every k in [kbegin,kend), and the result
// is stored back to C. Progressively narrower column blocks (8, 5, 4, 3, 2, 1 SIMD vectors)
// are tried, followed by a scalar loop for leftover columns.
// NOTE(review): the function name/signature and the definitions of j, kbegin, kend and
// remainder are on lines missing from this extract; presumably this is the vectorized
// selectSmallSubAssignKernel overload -- confirm.
2875 template<
typename MT3
2883 const size_t M( A.rows() );
2884 const size_t N( B.columns() );
2885 const size_t K( A.columns() );
// When 'remainder' is set, jpos is N masked with -SIMDSIZE (N rounded down to a multiple
// of SIMDSIZE if SIMDSIZE is a power of two); otherwise all N columns go through SIMD.
2889 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// 8 SIMD vectors per row, one row per iteration; skipped when LOW or UPP is set.
2896 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2897 for(
size_t i=0UL; i<M; ++i )
2910 SIMDType xmm1( (~C).load(i,j ) );
2911 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2912 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2913 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2914 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2915 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2916 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2917 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2919 for(
size_t k=kbegin; k<kend; ++k ) {
2920 const SIMDType a1(
set( A(i,k) ) );
2921 xmm1 -= a1 * B.load(k,j );
2922 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
2923 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
2924 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
2925 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
2926 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
2927 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
2928 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
2931 (~C).store( i, j , xmm1 );
2932 (~C).store( i, j+SIMDSIZE , xmm2 );
2933 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2934 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2935 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2936 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2937 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2938 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// 5 SIMD vectors per row: rows are taken in pairs (i, i+1), then a single-row variant
// follows (its guard is not visible; presumably for a leftover odd row).
2943 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2947 for( ; (i+2UL) <= M; i+=2UL )
2960 SIMDType xmm1 ( (~C).load(i ,j ) );
2961 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
2962 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
2963 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
2964 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
2965 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
2966 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
2967 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2968 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2969 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
// The accumulation statements combining a1/a2 with b1..b5 are not visible in this extract.
2971 for(
size_t k=kbegin; k<kend; ++k ) {
2972 const SIMDType a1(
set( A(i ,k) ) );
2973 const SIMDType a2(
set( A(i+1UL,k) ) );
2974 const SIMDType b1( B.load(k,j ) );
2975 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2976 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2977 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2978 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2991 (~C).store( i , j , xmm1 );
2992 (~C).store( i , j+SIMDSIZE , xmm2 );
2993 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2994 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2995 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
2996 (~C).store( i+1UL, j , xmm6 );
2997 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
2998 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2999 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3000 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
// Single-row variant of the 5-vector block.
3012 SIMDType xmm1( (~C).load(i,j ) );
3013 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3014 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3015 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3016 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3018 for(
size_t k=kbegin; k<kend; ++k ) {
3019 const SIMDType a1(
set( A(i,k) ) );
3020 xmm1 -= a1 * B.load(k,j );
3021 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3022 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3023 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3024 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3027 (~C).store( i, j , xmm1 );
3028 (~C).store( i, j+SIMDSIZE , xmm2 );
3029 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3030 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3031 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
// 4 SIMD vectors per row, rows in pairs, then a single-row variant.
3035 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3039 for( ; (i+2UL) <= M; i+=2UL )
3052 SIMDType xmm1( (~C).load(i ,j ) );
3053 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3054 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3055 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3056 SIMDType xmm5( (~C).load(i+1UL,j ) );
3057 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3058 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3059 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3061 for(
size_t k=kbegin; k<kend; ++k ) {
3062 const SIMDType a1(
set( A(i ,k) ) );
3063 const SIMDType a2(
set( A(i+1UL,k) ) );
3064 const SIMDType b1( B.load(k,j ) );
3065 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3066 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3067 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3078 (~C).store( i , j , xmm1 );
3079 (~C).store( i , j+SIMDSIZE , xmm2 );
3080 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3081 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3082 (~C).store( i+1UL, j , xmm5 );
3083 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3084 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3085 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row variant of the 4-vector block.
3097 SIMDType xmm1( (~C).load(i,j ) );
3098 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3099 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3100 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3102 for(
size_t k=kbegin; k<kend; ++k ) {
3103 const SIMDType a1(
set( A(i,k) ) );
3104 xmm1 -= a1 * B.load(k,j );
3105 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3106 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3107 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3110 (~C).store( i, j , xmm1 );
3111 (~C).store( i, j+SIMDSIZE , xmm2 );
3112 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3113 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// 3 SIMD vectors per row, rows in pairs, then a single-row variant.
3117 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3121 for( ; (i+2UL) <= M; i+=2UL )
3134 SIMDType xmm1( (~C).load(i ,j ) );
3135 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3136 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3137 SIMDType xmm4( (~C).load(i+1UL,j ) );
3138 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3139 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3141 for(
size_t k=kbegin; k<kend; ++k ) {
3142 const SIMDType a1(
set( A(i ,k) ) );
3143 const SIMDType a2(
set( A(i+1UL,k) ) );
3144 const SIMDType b1( B.load(k,j ) );
3145 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3146 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3155 (~C).store( i , j , xmm1 );
3156 (~C).store( i , j+SIMDSIZE , xmm2 );
3157 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3158 (~C).store( i+1UL, j , xmm4 );
3159 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3160 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
// Single-row variant of the 3-vector block.
3172 SIMDType xmm1( (~C).load(i,j ) );
3173 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3174 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3176 for(
size_t k=kbegin; k<kend; ++k ) {
3177 const SIMDType a1(
set( A(i,k) ) );
3178 xmm1 -= a1 * B.load(k,j );
3179 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3180 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3183 (~C).store( i, j , xmm1 );
3184 (~C).store( i, j+SIMDSIZE , xmm2 );
3185 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
// 2 SIMD vectors per row; runs unless both LOW and UPP are set. The row range is
// restricted: rows start at j when LOW, and end at min(j+SIMDSIZE*2,M) when UPP.
3189 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3191 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
3192 size_t i( LOW ? j : 0UL );
3194 for( ; (i+2UL) <= iend; i+=2UL )
3207 SIMDType xmm1( (~C).load(i ,j ) );
3208 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3209 SIMDType xmm3( (~C).load(i+1UL,j ) );
3210 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3212 for(
size_t k=kbegin; k<kend; ++k ) {
3213 const SIMDType a1(
set( A(i ,k) ) );
3214 const SIMDType a2(
set( A(i+1UL,k) ) );
3215 const SIMDType b1( B.load(k,j ) );
3216 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3223 (~C).store( i , j , xmm1 );
3224 (~C).store( i , j+SIMDSIZE, xmm2 );
3225 (~C).store( i+1UL, j , xmm3 );
3226 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Single-row variant of the 2-vector block.
3238 SIMDType xmm1( (~C).load(i,j ) );
3239 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3241 for(
size_t k=kbegin; k<kend; ++k ) {
3242 const SIMDType a1(
set( A(i,k) ) );
3243 xmm1 -= a1 * B.load(k,j );
3244 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
3247 (~C).store( i, j , xmm1 );
3248 (~C).store( i, j+SIMDSIZE, xmm2 );
// One SIMD vector per row for the remaining vectorizable columns. The upper row bound is
// only tightened to min(j+SIMDSIZE,M) when both LOW and UPP are set.
3252 for( ; j<jpos; j+=SIMDSIZE )
3254 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
3255 size_t i( LOW ? j : 0UL );
3257 for( ; (i+2UL) <= iend; i+=2UL )
3268 SIMDType xmm1( (~C).load(i ,j) );
3269 SIMDType xmm2( (~C).load(i+1UL,j) );
3271 for(
size_t k=kbegin; k<kend; ++k ) {
3272 const SIMDType b1( B.load(k,j) );
3273 xmm1 -=
set( A(i ,k) ) * b1;
3274 xmm2 -=
set( A(i+1UL,k) ) * b1;
3277 (~C).store( i , j, xmm1 );
3278 (~C).store( i+1UL, j, xmm2 );
// Single-row variant; note that k runs up to K here rather than to a kend bound.
3289 SIMDType xmm1( (~C).load(i,j) );
3291 for(
size_t k=kbegin; k<K; ++k ) {
3292 xmm1 -=
set( A(i,k) ) * B.load(k,j);
3295 (~C).store( i, j, xmm1 );
// Scalar loop for the columns in [jpos,N); only executed when 'remainder' is set.
// Rows start at j when LOW and end at j+1 when UPP.
3299 for( ; remainder && j<N; ++j )
3301 const size_t iend( UPP ? j+1UL : M );
3302 size_t i( LOW ? j : 0UL );
3304 for( ; (i+2UL) <= iend; i+=2UL )
3315 ElementType value1( (~C)(i ,j) );
3316 ElementType value2( (~C)(i+1UL,j) );
3318 for(
size_t k=kbegin; k<kend; ++k ) {
3319 value1 -= A(i ,k) * B(k,j);
3320 value2 -= A(i+1UL,k) * B(k,j);
3323 (~C)(i ,j) = value1;
3324 (~C)(i+1UL,j) = value2;
// Single-row scalar variant (k runs up to K); the write-back of 'value' is not visible.
3335 ElementType value( (~C)(i,j) );
3337 for(
size_t k=kbegin; k<K; ++k ) {
3338 value -= A(i,k) * B(k,j);
// Subtraction kernel that does not compute the product itself: one operand is replaced by
// a temporary 'tmp' and the rewritten product, passed through the ForwardFunctor, is
// handed back to subAssign().
// NOTE(review): the function name, the definition of 'tmp' and the first two conditions
// are on lines missing from this extract; presumably a selectSmallSubAssignKernel overload
// for a differently ordered target -- confirm.
3363 template<
typename MT3
3374 const ForwardFunctor fwd;
3378 subAssign( ~C, fwd( tmp * B ) );
3382 subAssign( ~C, fwd( A * tmp ) );
// Fallback: when A has no more elements than B, the 'tmp * B' form is used; the final
// 'A * tmp' form presumably belongs to the else branch (not visible).
3384 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3386 subAssign( ~C, fwd( tmp * B ) );
3390 subAssign( ~C, fwd( A * tmp ) );
// Large-matrix subtraction-assignment kernel overload that forwards to
// selectDefaultSubAssignKernel with the same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
3410 template<
typename MT3
3414 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3416 selectDefaultSubAssignKernel( C, A, B );
// Second overload of selectLargeSubAssignKernel with the same parameter list.
// NOTE(review): both its enabling condition and its body are on lines missing from this
// extract, so nothing about its behavior can be stated from here.
3436 template<
typename MT3
3440 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS-path subtraction-assignment kernel overload that forwards to
// selectLargeSubAssignKernel with the same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
3466 template<
typename MT3
3470 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3472 selectLargeSubAssignKernel( C, A, B );
// selectBlasSubAssignKernel overload that is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): in this extract the #if directive and the template header ended up on one
// line; the branch conditions and the construction of 'tmp' are not visible.
3478 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3492 template<
typename MT3
3496 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two branches subtract a previously computed temporary from C.
3503 subAssign( C, tmp );
3508 subAssign( C, tmp );
// The remaining branch calls gemm with the factors ET(-1) and ET(1); presumably these are
// the product and target scaling factors, i.e. C = C - A*B -- TODO confirm against the
// gemm wrapper's parameter order.
3511 gemm( C, A, B, ET(-1), ET(1) );
3533 template<
typename MT >
3544 const ForwardFunctor fwd;
3583 template<
typename MT
3593 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3596 else if( rhs.
lhs_.columns() == 0UL ) {
3631 template<
typename MT
3650 const ForwardFunctor fwd;
3652 const TmpType tmp( rhs );
3673 template<
typename MT >
3684 const ForwardFunctor fwd;
3712 template<
typename MT
3722 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
3756 template<
typename MT >
3767 const ForwardFunctor fwd;
3799 template<
typename MT
3809 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
3843 template<
typename MT >
3854 const ForwardFunctor fwd;
3907 template<
typename MT1
3915 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false >
// Structure flags derived from the SF/HF/LF/UF flags (the enclosing enum header is not
// visible in this extract):
//  - SYM : SF is set and none of HF, LF, UF is set.
//  - HERM: HF is set and neither LF nor UF is set.
//  - LOW : LF is set, or UF is combined with SF or HF.
//  - UPP : UF is set, or LF is combined with SF or HF.
3946 SYM = ( SF && !( HF || LF || UF ) ),
3947 HERM = ( HF && !( LF || UF ) ),
3948 LOW = ( LF || ( ( SF || HF ) && UF ) ),
3949 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Helper struct template over three types T1, T2, T3; its 'value' member is consumed by
// IsEvaluationRequired. NOTE(review): the body is not visible in this extract.
3960 template<
typename T1,
typename T2,
typename T3 >
3961 struct CanExploitSymmetry {
// Compile-time predicate: 'value' is true when at least one of evaluateLeft/evaluateRight
// is set and CanExploitSymmetry<T1,T2,T3>::value is false.
3972 template<
typename T1,
typename T2,
typename T3 >
3973 struct IsEvaluationRequired {
3974 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
3975 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time predicate over four types. The only visible part of its condition requires
// simdEnabled for T1, T2 and T3; the remaining terms are on lines missing from this
// extract.
3983 template<
typename T1,
typename T2,
typename T3,
typename T4 >
3984 struct UseBlasKernel {
3991 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time predicate over four types. The only visible part of its condition requires
// simdEnabled for T1, T2 and T3; the remaining terms are on lines missing from this
// extract.
4005 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4006 struct UseVectorizedDefaultKernel {
4009 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4033 ,
Noop > > > > ForwardFunctor;
4065 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable is true only when neither operand requires evaluation (evaluateLeft and
// evaluateRight are both false) and both MT1 and MT2 are themselves smpAssignable.
4071 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4072 !evaluateRight && MT2::smpAssignable };
// Element access: returns element (i,j) of the wrapped expression matrix_ multiplied by
// scalar_. No bounds check is visible in this extract (see at() for the checked variant).
4099 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4102 return matrix_(i,j) * scalar_;
// Checked element access: tests i against matrix_.rows() and j against matrix_.columns()
// before delegating to operator()(i,j).
// NOTE(review): the bodies of the two range checks are not visible in this extract;
// presumably they throw -- confirm the exception type.
4114 inline ReturnType
at(
size_t i,
size_t j )
const {
4115 if( i >= matrix_.rows() ) {
4118 if( j >= matrix_.columns() ) {
4121 return (*
this)(i,j);
// Returns the number of rows of the wrapped expression matrix_.
4130 inline size_t rows()
const {
4131 return matrix_.rows();
// Returns the number of columns of the wrapped expression matrix_.
4140 inline size_t columns()
const {
4141 return matrix_.columns();
// Forwards the alias query to the wrapped expression: returns matrix_.canAlias( alias ).
4171 template<
typename T >
4172 inline bool canAlias(
const T* alias )
const {
4173 return matrix_.canAlias( alias );
// Forwards the alias query to the wrapped expression: returns matrix_.isAliased( alias ).
4183 template<
typename T >
4184 inline bool isAliased(
const T* alias )
const {
4185 return matrix_.isAliased( alias );
4195 return matrix_.isAligned();
4206 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
4207 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
// Operands of the scaled expression: matrix_ is the left operand (its elements are
// multiplied by scalar_ in operator()), scalar_ is the right operand.
4213 LeftOperand matrix_;
4214 RightOperand scalar_;
// Entry point that dispatches to selectAssignKernel with the target ~lhs, the operands A
// and B, and the scalar rhs.scalar_.
// Two special cases are tested first: an empty target (zero rows or columns) and a zero
// inner dimension (left.columns() == 0); their bodies are not visible in this extract.
// NOTE(review): the function name/signature and the definitions of left, A and B are on
// missing lines; presumably this is the assign() overload for dense targets -- confirm.
4229 template<
typename MT
4242 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4245 else if( left.columns() == 0UL ) {
4260 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses between the small and the BLAS assignment kernel for C = (A*B)*scalar.
// Visible parts of the condition: the small kernel is used when the build is not in debug
// mode and B has at most SIMDSIZE*10 columns, or when C has fewer than
// DMATDMATMULT_THRESHOLD elements. The first term of the condition and the 'else' are on
// lines missing from this extract.
4275 template<
typename MT3
4279 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4282 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
4283 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
4284 selectSmallAssignKernel( C, A, B, scalar );
4286 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-SIMD) assignment kernel for the scaled product.
// For each row i the visible code (1) initializes C(i,j) from the first k index kbegin,
// (2) accumulates the remaining k indices, and (3) processes a final column range
// (presumably applying the scalar -- that statement is not visible).
// NOTE(review): the enabling condition, the definitions of kbegin/kend and the conditions
// around most loops are on lines missing from this extract.
4304 template<
typename MT3
4309 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4311 const size_t M( A.rows() );
4312 const size_t N( B.columns() );
4313 const size_t K( A.columns() );
4317 for(
size_t i=0UL; i<M; ++i )
// Loop over the complete row; its body and guard are not visible (presumably resets the
// row when there is nothing to multiply).
4328 for(
size_t j=0UL; j<N; ++j ) {
// Column range [jbegin,jend) for k == kbegin (tail of the conditional expressions only).
4337 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
4338 :( UPP ?
max(i,kbegin) : kbegin ) )
4339 :( UPP ? i : 0UL ) );
4342 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
4343 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
4344 :( LOW ? i+1UL : N ) );
// Columns before jbegin (body not visible).
4347 for(
size_t j=0UL; j<jbegin; ++j ) {
// First contribution is assigned, not accumulated.
4354 for(
size_t j=jbegin; j<jend; ++j ) {
4355 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend on (body not visible).
4358 for(
size_t j=jend; j<N; ++j ) {
4363 reset( C(i,N-1UL) );
// Remaining k indices are accumulated into C.
4367 for(
size_t k=kbegin+1UL; k<kend; ++k )
4371 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
4372 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
4373 :( SYM || HERM || UPP ? i : 0UL ) );
4376 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
4377 :( LOW ?
min(i+1UL,k) : k ) )
4378 :( LOW ? i+1UL : N ) );
// Skip empty column ranges for structured results.
4380 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
4383 for(
size_t j=jbegin; j<jend; ++j ) {
4384 C(i,j) += A(i,k) * B(k,j);
// Element at jend is assigned (not accumulated); its guard is not visible.
4387 C(i,jend) = A(i,k) * B(k,jend);
4394 :( SYM || HERM || UPP ? i : 0UL ) );
4397 :( LOW ? i+1UL : N ) );
4399 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
// Final pass over [jbegin,jend); body not visible.
4402 for(
size_t j=jbegin; j<jend; ++j ) {
// Fills the elements below the diagonal from their mirrored counterparts C(j,i)
// (conjugated when HERM); the guard of this loop nest is not visible.
4409 for(
size_t i=1UL; i<M; ++i ) {
4410 for(
size_t j=0UL; j<i; ++j ) {
4411 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel; this overload is enabled when MT4 (type of A) is not diagonal
// and MT5 (type of B) is diagonal. Within [jbegin,jend) each element is set to
// A(i,j) * B(j,j) * scalar.
// NOTE(review): the definitions of jbegin/jend and the bodies of the first and last column
// loops are on lines missing from this extract.
4432 template<
typename MT3
4436 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4437 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4441 const size_t M( A.rows() );
4442 const size_t N( B.columns() );
4444 for(
size_t i=0UL; i<M; ++i )
// Columns before jbegin (body not visible).
4455 for(
size_t j=0UL; j<jbegin; ++j ) {
4459 for(
size_t j=jbegin; j<jend; ++j ) {
4460 C(i,j) = A(i,j) * B(j,j) * scalar;
// Columns from jend on (body not visible).
4463 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel overload that reads only the diagonal element A(i,i): within
// [jbegin,jend) each element is set to A(i,i) * B(i,j) * scalar.
// NOTE(review): the enabling condition, the definitions of jbegin/jend and the bodies of
// the first and last column loops are on lines missing from this extract.
4485 template<
typename MT3
4490 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4494 const size_t M( A.rows() );
4495 const size_t N( B.columns() );
4497 for(
size_t i=0UL; i<M; ++i )
// Columns before jbegin (body not visible).
4508 for(
size_t j=0UL; j<jbegin; ++j ) {
4512 for(
size_t j=jbegin; j<jend; ++j ) {
4513 C(i,j) = A(i,i) * B(i,j) * scalar;
// Columns from jend on (body not visible).
4516 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel overload that touches only diagonal elements: for every row i
// of A, C(i,i) is set to A(i,i) * B(i,i) * scalar.
// NOTE(review): the enabling condition and any statements before the loop are on lines
// missing from this extract.
4538 template<
typename MT3
4543 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4549 for(
size_t i=0UL; i<A.rows(); ++i ) {
4550 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel overload that forwards to selectDefaultAssignKernel with
// the same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
4569 template<
typename MT3
4574 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4576 selectDefaultAssignKernel( C, A, B, scalar );
// SIMD assignment kernel for the scaled product: for a block of columns starting at j the
// products A(i,k) * B(k,j..) are accumulated over k in SIMD registers, multiplied by
// 'factor' (the scalar broadcast to a SIMD vector) and stored to C. Progressively narrower
// column blocks (8, 5, 4, 3, 2, 1 SIMD vectors) are tried, followed by a scalar loop for
// leftover columns and a final pass that fills the part of C not computed directly.
// NOTE(review): the function name/signature and the definitions of j, kbegin, kend and
// remainder are on lines missing from this extract; presumably this is the vectorized
// selectSmallAssignKernel overload -- confirm.
4595 template<
typename MT3
4604 const size_t M( A.rows() );
4605 const size_t N( B.columns() );
4606 const size_t K( A.columns() );
// When 'remainder' is set, jpos is N masked with -SIMDSIZE (N rounded down to a multiple
// of SIMDSIZE if SIMDSIZE is a power of two); otherwise all N columns go through SIMD.
4610 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
4613 const SIMDType factor(
set( scalar ) );
// Special case for results that are both LOW and UPP with more than SIMDSIZE*3 columns;
// the body is not visible in this extract.
4615 if( LOW && UPP && N > SIMDSIZE*3UL ) {
// 8 SIMD vectors per row, one row per iteration; only for unstructured results.
4624 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
4625 for(
size_t i=0UL; i<M; ++i )
4638 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4640 for(
size_t k=kbegin; k<kend; ++k ) {
4641 const SIMDType a1(
set( A(i,k) ) );
4642 xmm1 += a1 * B.load(k,j );
4643 xmm2 += a1 * B.load(k,j+SIMDSIZE );
4644 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
4645 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
4646 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
4647 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
4648 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
4649 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
4652 (~C).store( i, j , xmm1 * factor );
4653 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4654 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4655 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
4656 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
4657 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
4658 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
4659 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
// 5 SIMD vectors per row, rows in pairs, then a single-row variant.
4664 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
4668 for( ; (i+2UL) <= M; i+=2UL )
4681 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
// The accumulation statements combining a1/a2 with b1..b5 are not visible in this extract.
4683 for(
size_t k=kbegin; k<kend; ++k ) {
4684 const SIMDType a1(
set( A(i ,k) ) );
4685 const SIMDType a2(
set( A(i+1UL,k) ) );
4686 const SIMDType b1( B.load(k,j ) );
4687 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4688 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4689 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4690 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
4703 (~C).store( i , j , xmm1 * factor );
4704 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
4705 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
4706 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
4707 (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
4708 (~C).store( i+1UL, j , xmm6 * factor );
4709 (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
4710 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
4711 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
4712 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
// Single-row variant of the 5-vector block.
4724 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
4726 for(
size_t k=kbegin; k<kend; ++k ) {
4727 const SIMDType a1(
set( A(i,k) ) );
4728 xmm1 += a1 * B.load(k,j );
4729 xmm2 += a1 * B.load(k,j+SIMDSIZE );
4730 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
4731 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
4732 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
4735 (~C).store( i, j , xmm1 * factor );
4736 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4737 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4738 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
4739 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
// 4 SIMD vectors per row; runs unless both LOW and UPP are set. Rows start at j when LOW
// and end at min(j+SIMDSIZE*4,M) when SYM, HERM or UPP is set.
4743 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4745 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
4746 size_t i( LOW ? j : 0UL );
4748 for( ; (i+2UL) <= iend; i+=2UL )
4761 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4763 for(
size_t k=kbegin; k<kend; ++k ) {
4764 const SIMDType a1(
set( A(i ,k) ) );
4765 const SIMDType a2(
set( A(i+1UL,k) ) );
4766 const SIMDType b1( B.load(k,j ) );
4767 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4768 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4769 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4780 (~C).store( i , j , xmm1 * factor );
4781 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
4782 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
4783 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
4784 (~C).store( i+1UL, j , xmm5 * factor );
4785 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
4786 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
4787 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
// Single-row variant of the 4-vector block.
4799 SIMDType xmm1, xmm2, xmm3, xmm4;
4801 for(
size_t k=kbegin; k<kend; ++k ) {
4802 const SIMDType a1(
set( A(i,k) ) );
4803 xmm1 += a1 * B.load(k,j );
4804 xmm2 += a1 * B.load(k,j+SIMDSIZE );
4805 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
4806 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
4809 (~C).store( i, j , xmm1 * factor );
4810 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4811 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4812 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
// 3 SIMD vectors per row, same row-range restriction scheme as above.
4816 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4818 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
4819 size_t i( LOW ? j : 0UL );
4821 for( ; (i+2UL) <= iend; i+=2UL )
4834 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
4836 for(
size_t k=kbegin; k<kend; ++k ) {
4837 const SIMDType a1(
set( A(i ,k) ) );
4838 const SIMDType a2(
set( A(i+1UL,k) ) );
4839 const SIMDType b1( B.load(k,j ) );
4840 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4841 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4850 (~C).store( i , j , xmm1 * factor );
4851 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
4852 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
4853 (~C).store( i+1UL, j , xmm4 * factor );
4854 (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
4855 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
// Single-row variant of the 3-vector block.
4867 SIMDType xmm1, xmm2, xmm3;
4869 for(
size_t k=kbegin; k<kend; ++k ) {
4870 const SIMDType a1(
set( A(i,k) ) );
4871 xmm1 += a1 * B.load(k,j );
4872 xmm2 += a1 * B.load(k,j+SIMDSIZE );
4873 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
4876 (~C).store( i, j , xmm1 * factor );
4877 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4878 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
// 2 SIMD vectors per row.
4882 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4884 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
4885 size_t i( LOW ? j : 0UL );
4887 for( ; (i+2UL) <= iend; i+=2UL )
4900 SIMDType xmm1, xmm2, xmm3, xmm4;
4902 for(
size_t k=kbegin; k<kend; ++k ) {
4903 const SIMDType a1(
set( A(i ,k) ) );
4904 const SIMDType a2(
set( A(i+1UL,k) ) );
4905 const SIMDType b1( B.load(k,j ) );
4906 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4913 (~C).store( i , j , xmm1 * factor );
4914 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
4915 (~C).store( i+1UL, j , xmm3 * factor );
4916 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
// Single-row variant of the 2-vector block.
4928 SIMDType xmm1, xmm2;
4930 for(
size_t k=kbegin; k<kend; ++k ) {
4931 const SIMDType a1(
set( A(i,k) ) );
4932 xmm1 += a1 * B.load(k,j );
4933 xmm2 += a1 * B.load(k,j+SIMDSIZE);
4936 (~C).store( i, j , xmm1 * factor );
4937 (~C).store( i, j+SIMDSIZE, xmm2 * factor );
// One SIMD vector per row for the remaining vectorizable columns.
4941 for( ; j<jpos; j+=SIMDSIZE )
4943 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
4944 size_t i( LOW ? j : 0UL );
4946 for( ; (i+2UL) <= iend; i+=2UL )
4957 SIMDType xmm1, xmm2;
4959 for(
size_t k=kbegin; k<kend; ++k ) {
4960 const SIMDType b1( B.load(k,j) );
4961 xmm1 +=
set( A(i ,k) ) * b1;
4962 xmm2 +=
set( A(i+1UL,k) ) * b1;
4965 (~C).store( i , j, xmm1 * factor );
4966 (~C).store( i+1UL, j, xmm2 * factor );
// Single-row variant; k runs up to K here rather than to a kend bound. The declaration of
// xmm1 for this variant is on a line missing from this extract.
4979 for(
size_t k=kbegin; k<K; ++k ) {
4980 xmm1 +=
set( A(i,k) ) * B.load(k,j);
4983 (~C).store( i, j, xmm1 * factor );
// Scalar loop for the columns in [jpos,N); only executed when 'remainder' is set.
// Rows start at j only when both LOW and UPP are set.
4987 for( ; remainder && j<N; ++j )
4989 size_t i( LOW && UPP ? j : 0UL );
4991 for( ; (i+2UL) <= M; i+=2UL )
5005 for(
size_t k=kbegin; k<kend; ++k ) {
5006 value1 += A(i ,k) * B(k,j);
5007 value2 += A(i+1UL,k) * B(k,j);
5010 (~C)(i ,j) = value1 * scalar;
5011 (~C)(i+1UL,j) = value2 * scalar;
5024 for(
size_t k=kbegin; k<K; ++k ) {
5025 value += A(i,k) * B(k,j);
5028 (~C)(i,j) = value * scalar;
// Post-processing for structured results with more than SIMDSIZE*4 columns.
// SYM/HERM: for rows i >= SIMDSIZE*4, the first jend columns (i rounded down to a multiple
// of SIMDSIZE*4) are copied from the mirrored element C(j,i), conjugated when HERM.
5033 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
5034 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5035 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5036 for(
size_t j=0UL; j<jend; ++j ) {
5037 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW only: iterates the elements (i,j) with i < iend for columns j >= SIMDSIZE*4; the
// loop body is not visible (presumably resets them -- confirm).
5041 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
5042 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5043 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
5044 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: iterates the elements (i,j) with j < jend for rows i >= SIMDSIZE*4; the loop
// body is not visible (presumably resets them -- confirm).
5049 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
5050 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5051 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5052 for(
size_t j=0UL; j<jend; ++j ) {
// Assignment kernel that does not compute the product itself: one operand is replaced by a
// temporary 'tmp' and the rewritten, scaled product (passed through the ForwardFunctor) is
// handed back to assign().
// NOTE(review): the function name, the definition of 'tmp' and the first two conditions
// are on lines missing from this extract; presumably a selectSmallAssignKernel overload
// for a differently ordered target -- confirm.
5075 template<
typename MT3
5087 const ForwardFunctor fwd;
5091 assign( ~C, fwd( tmp * B ) * scalar );
5095 assign( ~C, fwd( A * tmp ) * scalar );
// Fallback: when A has no more elements than B, the 'tmp * B' form is used; the final
// 'A * tmp' form presumably belongs to the else branch (not visible).
5097 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
5099 assign( ~C, fwd( tmp * B ) * scalar );
5103 assign( ~C, fwd( A * tmp ) * scalar );
// Large-matrix assignment kernel overload that forwards to selectDefaultAssignKernel with
// the same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
5122 template<
typename MT3
5127 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5129 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel overload that delegates to one of five multiplication
// routines: smmm, hmmm, lmmm, ummm or mmm. The last three additionally receive ST2(0).
// NOTE(review): the conditions selecting each call are on lines missing from this extract;
// presumably they test SYM, HERM, LOW and UPP in that order, and ST2(0) is presumably the
// scaling applied to the previous content of C -- confirm both.
5148 template<
typename MT3
5153 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5156 smmm( C, A, B, scalar );
5158 hmmm( C, A, B, scalar );
5160 lmmm( C, A, B, scalar, ST2(0) );
5162 ummm( C, A, B, scalar, ST2(0) );
5164 mmm( C, A, B, scalar, ST2(0) );
// BLAS-path assignment kernel overload that forwards to selectLargeAssignKernel with the
// same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
5182 template<
typename MT3
5187 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5189 selectLargeAssignKernel( C, A, B, scalar );
// selectBlasAssignKernel overload that is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): in this extract the #if directive and the template header ended up on one
// line; the branch conditions and the statements preparing C before the trmm calls are not
// visible.
5194 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5208 template<
typename MT3
5213 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Left-sided triangular multiply with A; the lower/upper flag follows IsLower<MT4>.
5219 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Right-sided triangular multiply with B; the lower/upper flag follows IsLower<MT5>.
5223 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case: gemm with the factors ET(scalar) and ET(0); presumably the product scaling
// and the target scaling, i.e. C = scalar * A * B -- TODO confirm the parameter order.
5226 gemm( C, A, B, ET(scalar), ET(0) );
// Evaluates rhs serially into a temporary of type TmpType and assigns that temporary,
// passed through the ForwardFunctor, to the target ~lhs.
// NOTE(review): the function name/signature is on lines missing from this extract;
// presumably an assign() overload for a target that cannot take the product directly --
// confirm.
5244 template<
typename MT
5263 const ForwardFunctor fwd;
5265 const TmpType tmp(
serial( rhs ) );
5266 assign( ~lhs, fwd( tmp ) );
// Restructures the product before assigning it: one of the operands is transposed
// (trans(left) * right, or left * trans(right)), the result is passed through the
// ForwardFunctor, scaled by rhs.scalar_ and assigned to ~lhs.
// NOTE(review): the function name/signature, the definitions of left/right and the
// condition choosing between the two forms are on lines missing from this extract.
5284 template<
typename MT >
5295 const ForwardFunctor fwd;
5303 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
5305 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
// Entry point that dispatches to selectAddAssignKernel with the target ~lhs, the operands
// A and B, and the scalar rhs.scalar_.
// A single test covers the degenerate cases (target without rows or columns, or zero inner
// dimension left.columns() == 0); its body is not visible in this extract.
// NOTE(review): the function name/signature and the definitions of left, A and B are on
// missing lines; presumably this is the addAssign() overload for dense targets -- confirm.
5321 template<
typename MT
5334 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5348 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses between the small and the BLAS addition-assignment kernel for C += (A*B)*scalar.
// Visible parts of the condition: the small kernel is used when the build is not in debug
// mode and B has at most SIMDSIZE*10 columns, or when C has fewer than
// DMATDMATMULT_THRESHOLD elements. The first term of the condition and the 'else' are on
// lines missing from this extract.
5363 template<
typename MT3
5367 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5370 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
5371 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5372 selectSmallAddAssignKernel( C, A, B, scalar );
5374 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel overload: the scaled product A * B * scalar is first
// evaluated serially into a temporary of type ResultType, which is then added to C.
// NOTE(review): the enabling condition is not visible in this extract.
5392 template<
typename MT3
5397 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5399 const ResultType tmp(
serial( A * B * scalar ) );
5400 addAssign( C, tmp );
// Default addition-assignment kernel; this overload is enabled when MT4 (type of A) is not
// diagonal and MT5 (type of B) is diagonal. Only B(j,j) is read, so C(i,j) is increased by
// A(i,j) * B(j,j) * scalar.
// NOTE(review): the definitions of jbegin/jend are on lines missing from this extract.
5418 template<
typename MT3
5422 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
5423 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5427 const size_t M( A.rows() );
5428 const size_t N( B.columns() );
5430 for(
size_t i=0UL; i<M; ++i )
// jpos is jbegin plus the column count jnum rounded down to an even number.
5440 const size_t jnum( jend - jbegin );
5441 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
5443 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5444 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5445 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Single column at jpos; its guard is not visible -- presumably only for odd jnum.
5448 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition-assignment kernel overload that reads only the diagonal element A(i,i):
// C(i,j) is increased by A(i,i) * B(i,j) * scalar.
// NOTE(review): the enabling condition and the definitions of jbegin/jend are on lines
// missing from this extract.
5468 template<
typename MT3
5473 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5477 const size_t M( A.rows() );
5478 const size_t N( B.columns() );
5480 for(
size_t i=0UL; i<M; ++i )
// jpos is jbegin plus the column count jnum rounded down to an even number.
5490 const size_t jnum( jend - jbegin );
5491 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
5493 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5494 C(i,j ) += A(i,i) * B(i,j ) * scalar;
5495 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Single column at jpos; its guard is not visible -- presumably only for odd jnum.
5498 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition-assignment kernel overload that touches only diagonal elements: for
// every row i of A, C(i,i) is increased by A(i,i) * B(i,i) * scalar.
// NOTE(review): the enabling condition is not visible in this extract.
5518 template<
typename MT3
5523 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5527 for(
size_t i=0UL; i<A.rows(); ++i ) {
5528 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition-assignment kernel overload that forwards to
// selectDefaultAddAssignKernel with the same arguments.
// NOTE(review): the condition selecting this overload is not visible in this extract.
5547 template<
typename MT3
5552 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5554 selectDefaultAddAssignKernel( C, A, B, scalar );
5573 template<
typename MT3
5582 const size_t M( A.rows() );
5583 const size_t N( B.columns() );
5584 const size_t K( A.columns() );
5588 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
5591 const SIMDType factor(
set( scalar ) );
5597 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5598 for(
size_t i=0UL; i<M; ++i )
5611 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5613 for(
size_t k=kbegin; k<kend; ++k ) {
5614 const SIMDType a1(
set( A(i,k) ) );
5615 xmm1 += a1 * B.load(k,j );
5616 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5617 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5618 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5619 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5620 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
5621 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
5622 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
5625 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5626 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5627 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5628 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
5629 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
5630 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
5631 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
5632 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
5637 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5641 for( ; (i+2UL) <= M; i+=2UL )
5654 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5656 for(
size_t k=kbegin; k<kend; ++k ) {
5657 const SIMDType a1(
set( A(i ,k) ) );
5658 const SIMDType a2(
set( A(i+1UL,k) ) );
5659 const SIMDType b1( B.load(k,j ) );
5660 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5661 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5662 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5663 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5676 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5677 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
5678 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
5679 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
5680 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
5681 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
5682 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
5683 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
5684 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
5685 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
5697 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5699 for(
size_t k=kbegin; k<kend; ++k ) {
5700 const SIMDType a1(
set( A(i,k) ) );
5701 xmm1 += a1 * B.load(k,j );
5702 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5703 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5704 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5705 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5708 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5709 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5710 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5711 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
5712 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
5716 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5720 for( ; (i+2UL) <= M; i+=2UL )
5733 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5735 for(
size_t k=kbegin; k<kend; ++k ) {
5736 const SIMDType a1(
set( A(i ,k) ) );
5737 const SIMDType a2(
set( A(i+1UL,k) ) );
5738 const SIMDType b1( B.load(k,j ) );
5739 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5740 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5741 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5752 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5753 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
5754 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
5755 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
5756 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5757 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
5758 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
5759 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
5771 SIMDType xmm1, xmm2, xmm3, xmm4;
5773 for(
size_t k=kbegin; k<kend; ++k ) {
5774 const SIMDType a1(
set( A(i,k) ) );
5775 xmm1 += a1 * B.load(k,j );
5776 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5777 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5778 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5781 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5782 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5783 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5784 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
5788 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5792 for( ; (i+2UL) <= M; i+=2UL )
5805 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5807 for(
size_t k=kbegin; k<kend; ++k ) {
5808 const SIMDType a1(
set( A(i ,k) ) );
5809 const SIMDType a2(
set( A(i+1UL,k) ) );
5810 const SIMDType b1( B.load(k,j ) );
5811 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5812 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5821 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5822 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
5823 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
5824 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
5825 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
5826 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
5838 SIMDType xmm1, xmm2, xmm3;
5840 for(
size_t k=kbegin; k<kend; ++k ) {
5841 const SIMDType a1(
set( A(i,k) ) );
5842 xmm1 += a1 * B.load(k,j );
5843 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5844 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5847 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5848 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5849 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5853 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5855 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
5856 size_t i( LOW ? j : 0UL );
5858 for( ; (i+2UL) <= iend; i+=2UL )
5871 SIMDType xmm1, xmm2, xmm3, xmm4;
5873 for(
size_t k=kbegin; k<kend; ++k ) {
5874 const SIMDType a1(
set( A(i ,k) ) );
5875 const SIMDType a2(
set( A(i+1UL,k) ) );
5876 const SIMDType b1( B.load(k,j ) );
5877 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5884 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5885 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
5886 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5887 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
5899 SIMDType xmm1, xmm2;
5901 for(
size_t k=kbegin; k<kend; ++k ) {
5902 const SIMDType a1(
set( A(i,k) ) );
5903 xmm1 += a1 * B.load(k,j );
5904 xmm2 += a1 * B.load(k,j+SIMDSIZE);
5907 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5908 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
5912 for( ; j<jpos; j+=SIMDSIZE )
5914 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
5915 size_t i( LOW ? j : 0UL );
5917 for( ; (i+2UL) <= iend; i+=2UL )
5928 SIMDType xmm1, xmm2;
5930 for(
size_t k=kbegin; k<kend; ++k ) {
5931 const SIMDType b1( B.load(k,j) );
5932 xmm1 +=
set( A(i ,k) ) * b1;
5933 xmm2 +=
set( A(i+1UL,k) ) * b1;
5936 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5937 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
5950 for(
size_t k=kbegin; k<K; ++k ) {
5951 xmm1 +=
set( A(i,k) ) * B.load(k,j);
5954 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5958 for( ; remainder && j<N; ++j )
5960 const size_t iend( UPP ? j+1UL : M );
5961 size_t i( LOW ? j : 0UL );
5963 for( ; (i+2UL) <= iend; i+=2UL )
5977 for(
size_t k=kbegin; k<kend; ++k ) {
5978 value1 += A(i ,k) * B(k,j);
5979 value2 += A(i+1UL,k) * B(k,j);
5982 (~C)(i ,j) += value1 * scalar;
5983 (~C)(i+1UL,j) += value2 * scalar;
5996 for(
size_t k=kbegin; k<K; ++k ) {
5997 value += A(i,k) * B(k,j);
6000 (~C)(i,j) += value * scalar;
6021 template<
typename MT3
6033 const ForwardFunctor fwd;
6037 addAssign( ~C, fwd( tmp * B ) * scalar );
6041 addAssign( ~C, fwd( A * tmp ) * scalar );
6043 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6045 addAssign( ~C, fwd( tmp * B ) * scalar );
6049 addAssign( ~C, fwd( A * tmp ) * scalar );
6068 template<
typename MT3
6073 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6075 selectDefaultAddAssignKernel( C, A, B, scalar );
6094 template<
typename MT3
6099 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6102 lmmm( C, A, B, scalar, ST2(1) );
6104 ummm( C, A, B, scalar, ST2(1) );
6106 mmm( C, A, B, scalar, ST2(1) );
6124 template<
typename MT3
6129 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6131 selectLargeAddAssignKernel( C, A, B, scalar );
6136 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6150 template<
typename MT3
6155 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6161 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6162 addAssign( C, tmp );
6166 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6167 addAssign( C, tmp );
6170 gemm( C, A, B, ET(scalar), ET(1) );
6190 template<
typename MT >
6201 const ForwardFunctor fwd;
6209 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
6211 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
6231 template<
typename MT
6244 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6258 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
6273 template<
typename MT3
6277 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6280 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
6281 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6282 selectSmallSubAssignKernel( C, A, B, scalar );
6284 selectBlasSubAssignKernel( C, A, B, scalar );
6302 template<
typename MT3
6307 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6309 const ResultType tmp(
serial( A * B * scalar ) );
6310 subAssign( C, tmp );
6328 template<
typename MT3
6332 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6333 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6337 const size_t M( A.rows() );
6338 const size_t N( B.columns() );
6340 for(
size_t i=0UL; i<M; ++i )
6350 const size_t jnum( jend - jbegin );
6351 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6353 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6354 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6355 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6358 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
6378 template<
typename MT3
6383 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6387 const size_t M( A.rows() );
6388 const size_t N( B.columns() );
6390 for(
size_t i=0UL; i<M; ++i )
6400 const size_t jnum( jend - jbegin );
6401 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6403 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6404 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
6405 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
6408 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
6428 template<
typename MT3
6433 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6437 for(
size_t i=0UL; i<A.rows(); ++i ) {
6438 C(i,i) -= A(i,i) * B(i,i) * scalar;
6457 template<
typename MT3
6462 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6464 selectDefaultSubAssignKernel( C, A, B, scalar );
6483 template<
typename MT3
6492 const size_t M( A.rows() );
6493 const size_t N( B.columns() );
6494 const size_t K( A.columns() );
6498 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
6501 const SIMDType factor(
set( scalar ) );
6507 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6508 for(
size_t i=0UL; i<M; ++i )
6521 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6523 for(
size_t k=kbegin; k<kend; ++k ) {
6524 const SIMDType a1(
set( A(i,k) ) );
6525 xmm1 += a1 * B.load(k,j );
6526 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6527 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6528 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6529 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6530 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6531 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6532 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6535 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6536 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6537 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6538 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
6539 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
6540 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
6541 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
6542 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
6547 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6551 for( ; (i+2UL) <= M; i+=2UL )
6564 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6566 for(
size_t k=kbegin; k<kend; ++k ) {
6567 const SIMDType a1(
set( A(i ,k) ) );
6568 const SIMDType a2(
set( A(i+1UL,k) ) );
6569 const SIMDType b1( B.load(k,j ) );
6570 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6571 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6572 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6573 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6586 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6587 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
6588 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
6589 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
6590 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
6591 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
6592 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
6593 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
6594 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
6595 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
6607 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6609 for(
size_t k=kbegin; k<kend; ++k ) {
6610 const SIMDType a1(
set( A(i,k) ) );
6611 xmm1 += a1 * B.load(k,j );
6612 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6613 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6614 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6615 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6618 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6619 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6620 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6621 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
6622 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
6626 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6630 for( ; (i+2UL) <= M; i+=2UL )
6643 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6645 for(
size_t k=kbegin; k<kend; ++k ) {
6646 const SIMDType a1(
set( A(i ,k) ) );
6647 const SIMDType a2(
set( A(i+1UL,k) ) );
6648 const SIMDType b1( B.load(k,j ) );
6649 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6650 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6651 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6662 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6663 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
6664 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
6665 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
6666 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
6667 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
6668 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
6669 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
6681 SIMDType xmm1, xmm2, xmm3, xmm4;
6683 for(
size_t k=kbegin; k<kend; ++k ) {
6684 const SIMDType a1(
set( A(i,k) ) );
6685 xmm1 += a1 * B.load(k,j );
6686 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6687 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6688 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6691 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6692 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6693 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6694 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
6698 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6702 for( ; (i+2UL) <= M; i+=2UL )
6715 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6717 for(
size_t k=kbegin; k<kend; ++k ) {
6718 const SIMDType a1(
set( A(i ,k) ) );
6719 const SIMDType a2(
set( A(i+1UL,k) ) );
6720 const SIMDType b1( B.load(k,j ) );
6721 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6722 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6731 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6732 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
6733 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
6734 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
6735 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
6736 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
6748 SIMDType xmm1, xmm2, xmm3;
6750 for(
size_t k=kbegin; k<kend; ++k ) {
6751 const SIMDType a1(
set( A(i,k) ) );
6752 xmm1 += a1 * B.load(k,j );
6753 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6754 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6757 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6758 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6759 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6763 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6765 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
6766 size_t i( LOW ? j : 0UL );
6768 for( ; (i+2UL) <= iend; i+=2UL )
6781 SIMDType xmm1, xmm2, xmm3, xmm4;
6783 for(
size_t k=kbegin; k<kend; ++k ) {
6784 const SIMDType a1(
set( A(i ,k) ) );
6785 const SIMDType a2(
set( A(i+1UL,k) ) );
6786 const SIMDType b1( B.load(k,j ) );
6787 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6794 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6795 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
6796 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
6797 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
6809 SIMDType xmm1, xmm2;
6811 for(
size_t k=kbegin; k<kend; ++k ) {
6812 const SIMDType a1(
set( A(i,k) ) );
6813 xmm1 += a1 * B.load(k,j );
6814 xmm2 += a1 * B.load(k,j+SIMDSIZE);
6817 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6818 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
6822 for( ; j<jpos; j+=SIMDSIZE )
6824 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
6825 size_t i( LOW ? j : 0UL );
6827 for( ; (i+2UL) <= iend; i+=2UL )
6838 SIMDType xmm1, xmm2;
6840 for(
size_t k=kbegin; k<kend; ++k ) {
6841 const SIMDType b1( B.load(k,j) );
6842 xmm1 +=
set( A(i ,k) ) * b1;
6843 xmm2 +=
set( A(i+1UL,k) ) * b1;
6846 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6847 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
6860 for(
size_t k=kbegin; k<K; ++k ) {
6861 xmm1 +=
set( A(i,k) ) * B.load(k,j);
6864 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
6868 for( ; remainder && j<N; ++j )
6870 const size_t iend( UPP ? j+1UL : M );
6871 size_t i( LOW ? j : 0UL );
6873 for( ; (i+2UL) <= iend; i+=2UL )
6887 for(
size_t k=kbegin; k<kend; ++k ) {
6888 value1 += A(i ,k) * B(k,j);
6889 value2 += A(i+1UL,k) * B(k,j);
6892 (~C)(i ,j) -= value1 * scalar;
6893 (~C)(i+1UL,j) -= value2 * scalar;
6906 for(
size_t k=kbegin; k<K; ++k ) {
6907 value += A(i,k) * B(k,j);
6910 (~C)(i,j) -= value * scalar;
6930 template<
typename MT3
6942 const ForwardFunctor fwd;
6946 subAssign( ~C, fwd( tmp * B ) * scalar );
6950 subAssign( ~C, fwd( A * tmp ) * scalar );
6952 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6954 subAssign( ~C, fwd( tmp * B ) * scalar );
6958 subAssign( ~C, fwd( A * tmp ) * scalar );
6977 template<
typename MT3
6982 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6984 selectDefaultSubAssignKernel( C, A, B, scalar );
7003 template<
typename MT3
7008 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7011 lmmm( C, A, B, -scalar, ST2(1) );
7013 ummm( C, A, B, -scalar, ST2(1) );
7015 mmm( C, A, B, -scalar, ST2(1) );
7033 template<
typename MT3
7038 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7040 selectLargeSubAssignKernel( C, A, B, scalar );
7045 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7059 template<
typename MT3
7064 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7070 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7071 subAssign( C, tmp );
7075 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7076 subAssign( C, tmp );
7079 gemm( C, A, B, ET(-scalar), ET(1) );
7099 template<
typename MT >
7110 const ForwardFunctor fwd;
7118 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
7120 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
7151 template<
typename MT
7164 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7167 else if( left.columns() == 0UL ) {
7201 template<
typename MT
7220 const ForwardFunctor fwd;
7222 const TmpType tmp( rhs );
7241 template<
typename MT >
7252 const ForwardFunctor fwd;
7281 template<
typename MT
7294 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7326 template<
typename MT >
7337 const ForwardFunctor fwd;
7370 template<
typename MT
7383 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7415 template<
typename MT >
7426 const ForwardFunctor fwd;
7502 template<
typename T1
7549 template<
typename MT1
7593 template<
typename MT1
7637 template<
typename MT1
7681 template<
typename MT1
7725 template<
typename MT1
7756 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7757 struct Rows< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Rows<MT1>
7773 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7774 struct Columns< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Columns<MT2>
7790 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7791 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7792 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7808 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7809 struct IsSymmetric< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7812 , IsBuiltin< ElementType_< DMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
7813 , And< Bool<LF>, Bool<UF> > >::value >
7829 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
7830 struct IsHermitian< DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
7847 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7848 struct IsLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7850 , And< IsLower<MT1>, IsLower<MT2> >
7851 , And< Or< Bool<SF>, Bool<HF> >
7852 , IsUpper<MT1>, IsUpper<MT2> > >::value >
7868 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7869 struct IsUniLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7870 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
7871 , And< Or< Bool<SF>, Bool<HF> >
7872 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
7888 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7890 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7891 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
7892 , And< Or< Bool<SF>, Bool<HF> >
7893 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7894 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
7910 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7911 struct IsUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7913 , And< IsUpper<MT1>, IsUpper<MT2> >
7914 , And< Or< Bool<SF>, Bool<HF> >
7915 , IsLower<MT1>, IsLower<MT2> > >::value >
7931 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7932 struct IsUniUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7933 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
7934 , And< Or< Bool<SF>, Bool<HF> >
7935 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
7951 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7953 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7954 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
7955 , And< Or< Bool<SF>, Bool<HF> >
7956 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7957 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
7973 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
7991 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
8009 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8027 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8045 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8062 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8079 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8096 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8113 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8130 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
bool AF >
8145 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8146 struct RowExprTrait< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8159 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
typename SubmatrixExprTrait< MT, AF >::Type SubmatrixExprTrait_
Auxiliary alias declaration for the SubmatrixExprTrait type trait. The SubmatrixExprTrait_ alias decla...
Definition: SubmatrixExprTrait.h:134
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:344
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:179
Evaluation of the expression type of a dense matrix declupp operation. Via this type trait it is possi...
Definition: DMatDeclUppExprTrait.h:75
Compile time check for row vector types. This type trait tests whether or not the given template argum...
Definition: IsRowVector.h:80
const DMatForEachExpr< MT, Conj, SO > conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatForEachExpr.h:1214
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:433
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:300
Header file for the IsUniUpper type trait.
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Header file for the SparseVector base class.
Header file for the DMatDeclDiagExprTrait class template.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:312
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:507
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:305
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:560
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:198
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:633
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:318
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:550
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:477
Header file for the IsIntegral type trait.
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:309
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:66
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
DMatDMatMultExpr< MT1, MT2, SF, HF, LF, UF > This
Type of this DMatDMatMultExpr instance.
Definition: DMatDMatMultExpr.h:298
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:306
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
DisableIf_< IsSymmetric< MT >, const DMatDeclSymExpr< MT, SO > > declsym(const DenseMatrix< MT, SO > &dm)
Declares the given non-symmetric dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:841
Evaluation of the expression type of a dense matrix/dense vector multiplication.Via this type trait i...
Definition: DMatDVecMultExprTrait.h:78
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Expression object for dense matrix-dense matrix multiplications.The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:170
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:453
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:199
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:181
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:119
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Evaluation of the expression type of a dense matrix declsym operation.Via this type trait it is possi...
Definition: DMatDeclSymExprTrait.h:75
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
DisableIf_< IsHermitian< MT >, const DMatDeclHermExpr< MT, SO > > declherm(const DenseMatrix< MT, SO > &dm)
Declares the given non-Hermitian dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:841
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Header file for the DisableIf class template.
Compile time check for dense vector types.This type trait tests whether or not the given template par...
Definition: IsDenseVector.h:78
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:178
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:83
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:123
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:197
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:176
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the DMatDeclLowExprTrait class template.
Header file for the Columns type trait.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:497
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Evaluation of the expression type of a dense matrix/sparse vector multiplication.Via this type trait ...
Definition: DMatSVecMultExprTrait.h:80
Compile time check for sparse vector types.This type trait tests whether or not the given template pa...
Definition: IsSparseVector.h:78
Evaluation of the expression type of a submatrix operation. Via this type trait it is possible to...
Definition: SubmatrixExprTrait.h:80
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:301
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:359
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Header file for the DMatDeclUppExprTrait class template.
Header file for the DMatDeclSymExprTrait class template.
Compile time check for column vector types.This type trait tests whether or not the given template ar...
Definition: IsColumnVector.h:80
Constraints on the storage order of matrix types.
typename TDVecDMatMultExprTrait< VT, MT >::Type TDVecDMatMultExprTrait_
Auxiliary alias declaration for the TDVecDMatMultExprTrait class template.The TDVecDMatMultExprTrait_...
Definition: TDVecDMatMultExprTrait.h:119
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
DisableIf_< IsLower< MT >, const DMatDeclLowExpr< MT, SO > > decllow(const DenseMatrix< MT, SO > &dm)
Declares the given non-lower dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:842
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Evaluation of the expression type of a row operation. Via this type trait it is possible to evalu...
Definition: RowExprTrait.h:79
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:632
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:260
Header file for the DeclDiag functor.
Evaluation of the expression type of a dense matrix declherm operation.Via this type trait it is poss...
Definition: DMatDeclHermExprTrait.h:75
Compile time check for dense matrix types.This type trait tests whether or not the given template par...
Definition: IsDenseMatrix.h:78
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:302
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:180
Header file for the conjugate shim.
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSIMDCombinable type trait.
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:407
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:303
Utility type for generic codes.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
Compile time type negation. The Not class template negates the given compile time condition. In case the given condition would evaluate to true, the nested member enumeration is set to false and vice versa.
Definition: Not.h:70
Header file for the DMatDeclHermExprTrait class template.
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:94
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:304
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:465
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:315
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:177
typename DMatDVecMultExprTrait< MT, VT >::Type DMatDVecMultExprTrait_
Auxiliary alias declaration for the DMatDVecMultExprTrait class template.The DMatDVecMultExprTrait_ a...
Definition: DMatDVecMultExprTrait.h:119
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:443
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
Evaluation of the expression type of a dense vector/dense matrix multiplication.Via this type trait i...
Definition: TDVecDMatMultExprTrait.h:78
Evaluation of the expression type of a sparse vector/dense matrix multiplication.Via this type trait ...
Definition: TSVecDMatMultExprTrait.h:78
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:733
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:487
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:76
Evaluation of the expression type of a dense matrix decllow operation.Via this type trait it is possi...
Definition: DMatDeclLowExprTrait.h:75
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Evaluation of the expression type of a dense matrix decldiag operation.Via this type trait it is poss...
Definition: DMatDeclDiagExprTrait.h:75
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the TSVecDMatMultExprTrait class template.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:423
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:677
Header file for the IsResizable type trait.
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:7505
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:200
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:508
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
DisableIf_< IsDiagonal< MT >, const DMatDeclDiagExpr< MT, SO > > decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given non-diagonal dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:841
DisableIf_< IsUpper< MT >, const DMatDeclUppExpr< MT, SO > > declupp(const DenseMatrix< MT, SO > &dm)
Declares the given non-upper dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:842
Evaluation of the expression type of a column operation. Via this type trait it is possible to ev...
Definition: ColumnExprTrait.h:78
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.