35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 162 template<
typename MT1
168 class TDMatTDMatMultExpr :
public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true >
169 ,
private MatMatMultExpr
170 ,
private Computation
// Effective structure flags, derived from the SF/HF/LF/UF flags:
//  - SYM:  SF is set and none of HF, LF, UF is set.
//  - HERM: HF is set and neither LF nor UF is set.
//  - LOW:  LF is set, or SF/HF is combined with UF.
//  - UPP:  UF is set, or SF/HF is combined with LF.
// LOW and UPP are not mutually exclusive: both are true when, e.g., LF and UF
// are both set. The kernels in this file test `LOW && UPP` explicitly.
195 SYM = ( SF && !( HF || LF || UF ) ),
196 HERM = ( HF && !( LF || UF ) ),
197 LOW = ( LF || ( ( SF || HF ) && UF ) ),
198 UPP = ( UF || ( ( SF || HF ) && LF ) )
210 template<
typename T1,
typename T2,
typename T3 >
211 struct CanExploitSymmetry {
// Compile-time trait. As written, `value` is true when at least one of
// evaluateLeft / evaluateRight is set AND CanExploitSymmetry<T1,T2,T3>::value
// is true.
// NOTE(review): the CanExploitSymmetry term is not negated here. Confirm the
// intended polarity against the overloads constrained by this trait (not
// visible in this excerpt); if a separate overload is enabled by
// CanExploitSymmetry itself, this trait would presumably need the complement.
224 template<
typename T1,
typename T2,
typename T3 >
225 struct IsEvaluationRequired {
226 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
227 CanExploitSymmetry<T1,T2,T3>::value };
237 template<
typename T1,
typename T2,
typename T3 >
238 struct UseBlasKernel {
245 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
260 template<
typename T1,
typename T2,
typename T3 >
261 struct UseVectorizedDefaultKernel {
264 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
289 ,
Noop > > > > ForwardFunctor;
322 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable is true only when neither operand needs evaluation
// (evaluateLeft and evaluateRight are both false) and both operand types
// MT1 and MT2 report smpAssignable themselves.
327 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
328 !evaluateRight && MT2::smpAssignable };
383 :(
lhs_.columns() ) ) );
387 const size_t n(
end - begin );
// Checked element access.
// i is validated against the row count of the left operand and j against the
// column count of the right operand. The bodies of both checks are not visible
// in this excerpt (presumably they report an out-of-range index -- confirm).
405 inline ReturnType
at(
size_t i,
size_t j )
const {
406 if( i >=
lhs_.rows() ) {
409 if( j >=
rhs_.columns() ) {
421 inline size_t rows() const noexcept {
432 return rhs_.columns();
// Returns true when either operand reports being aliased with the given
// address. Note that this forwards to the operands' isAliased(), not to their
// canAlias().
462 template<
typename T >
463 inline bool canAlias(
const T* alias )
const noexcept {
464 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns true when either the left or the right operand reports being aliased
// with the given address.
474 template<
typename T >
475 inline bool isAliased(
const T* alias )
const noexcept {
476 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
486 return lhs_.isAligned() &&
rhs_.isAligned();
497 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
498 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
// Assignment of the product expression `rhs` to the dense matrix `lhs`.
// Two degenerate cases are tested before any kernel runs: an empty target
// (zero rows or zero columns), and a zero inner dimension
// (rhs.lhs_.columns() == 0). Their branch bodies are not visible here.
// Otherwise both operands are bound to A and B through serial() and the work
// is handed to selectAssignKernel().
// NOTE(review): serial() presumably forces non-parallel evaluation of the
// operands -- confirm against its definition.
522 template<
typename MT
532 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
535 else if( rhs.lhs_.columns() == 0UL ) {
540 LT A(
serial( rhs.lhs_ ) );
541 RT B(
serial( rhs.rhs_ ) );
550 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for C = A * B.
// The visible conditions pick the small kernel when the build is not in debug
// mode and A has at most SIMDSIZE*10 rows, or when C has fewer elements than
// TDMATTDMATMULT_THRESHOLD. The first operand of the `||` chain is on a line
// not shown in this excerpt. The BLAS kernel call is presumably the else
// branch (the `else` itself is not visible).
566 template<
typename MT3
569 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
572 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
573 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
574 selectSmallAssignKernel( C, A, B );
576 selectBlasAssignKernel( C, A, B );
// Default (scalar) kernel for C = A * B, processed one column j of C at a time.
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
// The definitions of kbegin/kend and the outer selecting conditions of the
// ibegin/iend ternaries are on lines not shown in this excerpt.
595 template<
typename MT3
599 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
601 const size_t M( A.rows() );
602 const size_t N( B.columns() );
603 const size_t K( A.columns() );
607 for(
size_t j=0UL; j<N; ++j )
618 for(
size_t i=0UL; i<M; ++i ) {
// Row range [ibegin, iend) touched by the first inner index kbegin. When LOW
// is set the lower bound is additionally clamped to j; when UPP is set the
// upper bound is additionally clamped relative to j.
627 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
628 :( LOW ?
max(j,kbegin) : kbegin ) )
629 :( LOW ? j : 0UL ) );
632 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
633 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
634 :( UPP ? j+1UL : M ) );
// Three passes over column j: rows before ibegin, rows in [ibegin, iend), and
// rows from iend to M. Only the middle pass has a visible body: it initializes
// C(i,j) with the product for k == kbegin (plain assignment, not accumulation).
637 for(
size_t i=0UL; i<ibegin; ++i ) {
644 for(
size_t i=ibegin; i<iend; ++i ) {
645 C(i,j) = A(i,kbegin) * B(kbegin,j);
648 for(
size_t i=iend; i<M; ++i ) {
// Remaining inner indices k in (kbegin, kend): accumulate into column j.
657 for(
size_t k=kbegin+1UL; k<kend; ++k )
661 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
662 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
663 :( SYM || HERM || LOW ? j : 0UL ) );
666 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
667 :( UPP ?
min(j+1UL,k) : k ) )
668 :( UPP ? j+1UL : M ) );
// For structured results the clamped range can be inverted; skip this k then.
// An empty range (ibegin == iend) is not skipped by this test.
670 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
673 for(
size_t i=ibegin; i<iend; ++i ) {
674 C(i,j) += A(i,k) * B(k,j);
// The element at row iend is assigned (not accumulated). The condition
// guarding this statement is not visible in this excerpt.
677 C(iend,j) = A(iend,k) * B(k,j);
// Mirror pass: every element above the diagonal (i < j) is filled from its
// transposed counterpart C(j,i), conjugated when HERM is set. The guard
// enabling this pass is not visible in this excerpt.
683 for(
size_t j=1UL; j<N; ++j ) {
684 for(
size_t i=0UL; i<j; ++i ) {
685 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for C = A * B, enabled when MT4 (A) is not diagonal and MT5
// (B) is diagonal. Only the diagonal element B(j,j) is read for column j, so
// each column of C is the matching column of A scaled by B(j,j).
// The ibegin/iend definitions and the bodies of the first and last row passes
// are not visible in this excerpt.
707 template<
typename MT3
710 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
711 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
715 const size_t M( A.rows() );
716 const size_t N( B.columns() );
718 for(
size_t j=0UL; j<N; ++j )
729 for(
size_t i=0UL; i<ibegin; ++i ) {
733 for(
size_t i=ibegin; i<iend; ++i ) {
734 C(i,j) = A(i,j) * B(j,j);
737 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = A * B that reads only the diagonal elements A(i,i) of
// the left operand: row i of C is row i of B scaled by A(i,i).
// The enabling condition of this overload, the ibegin/iend definitions and the
// bodies of the first and last row passes are not visible in this excerpt.
760 template<
typename MT3
764 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
768 const size_t M( A.rows() );
769 const size_t N( B.columns() );
771 for(
size_t j=0UL; j<N; ++j )
782 for(
size_t i=0UL; i<ibegin; ++i ) {
786 for(
size_t i=ibegin; i<iend; ++i ) {
787 C(i,j) = A(i,i) * B(i,j);
790 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = A * B that touches only diagonal elements: C(i,i) is
// the product of A(i,i) and B(i,i) for every row index of A.
// The enabling condition of this overload and any preparation of the
// off-diagonal elements of C are not visible in this excerpt.
813 template<
typename MT3
817 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
823 for(
size_t i=0UL; i<A.rows(); ++i ) {
824 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel overload that simply forwards to the default kernel.
// The condition selecting this overload is not visible in this excerpt.
844 template<
typename MT3
848 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
850 selectDefaultAssignKernel( C, A, B );
// Small-matrix kernel that rewrites the product around a temporary `tmp` and
// re-dispatches through assign(), wrapping the new product in ForwardFunctor.
// Four branches are visible: two use A * tmp and two use tmp * B. The only
// visible condition compares element counts: when B has no more elements than
// A, the A * tmp form is used; otherwise tmp * B.
// The construction of `tmp` and the first two branch conditions are on lines
// not shown in this excerpt.
870 template<
typename MT3
881 const ForwardFunctor fwd;
885 assign( ~C, fwd( A * tmp ) );
889 assign( ~C, fwd( tmp * B ) );
891 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
893 assign( ~C, fwd( A * tmp ) );
897 assign( ~C, fwd( tmp * B ) );
// Vectorized small-matrix kernel for C = A * B (opening part).
// M = rows of A, N = columns of B, K = columns of A.
// The function signature, the definitions of `remainder`, `i`, `kbegin` and
// `kend`, and all closing braces are on lines not shown in this excerpt.
918 template<
typename MT3
926 const size_t M( A.rows() );
927 const size_t N( B.columns() );
928 const size_t K( A.columns() );
// ipos is the row bound for the SIMD loops: M itself, or -- when `remainder`
// is set -- M with its low bits masked off by size_t(-SIMDSIZE). That masking
// rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two
// (TODO confirm).
932 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Special preparation when both LOW and UPP hold and M exceeds three SIMD
// widths. The body is not visible in this excerpt.
935 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Widest tier: taken only for unstructured results (none of SYM, HERM, LOW,
// UPP). Each iteration handles eight consecutive SIMD-wide row blocks of one
// column j.
944 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
945 for(
size_t j=0UL; j<N; ++j )
958 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// For every inner index k, the scalar B(k,j) is turned into a SIMD value via
// set() (presumably a broadcast -- confirm) and multiplied with eight row
// blocks loaded from column k of A.
960 for(
size_t k=kbegin; k<kend; ++k ) {
961 const SIMDType b1(
set( B(k,j) ) );
962 xmm1 += A.load(i ,k) * b1;
963 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
964 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
965 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
966 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
967 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
968 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
969 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Write the eight accumulated row blocks back to column j of C.
972 (~C).store( i , j, xmm1 );
973 (~C).store( i+SIMDSIZE , j, xmm2 );
974 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
975 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
976 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
977 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
978 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
979 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
984 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
988 for( ; (j+2UL) <= N; j+=2UL )
1001 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1003 for(
size_t k=kbegin; k<kend; ++k ) {
1004 const SIMDType a1( A.load(i ,k) );
1005 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1006 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1007 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1008 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1009 const SIMDType b1(
set( B(k,j ) ) );
1010 const SIMDType b2(
set( B(k,j+1UL) ) );
1023 (~C).store( i , j , xmm1 );
1024 (~C).store( i+SIMDSIZE , j , xmm2 );
1025 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1026 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1027 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1028 (~C).store( i , j+1UL, xmm6 );
1029 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1030 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1031 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1032 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1044 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1046 for(
size_t k=kbegin; k<kend; ++k ) {
1047 const SIMDType b1(
set( B(k,j) ) );
1048 xmm1 += A.load(i ,k) * b1;
1049 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1050 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1051 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1052 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1055 (~C).store( i , j, xmm1 );
1056 (~C).store( i+SIMDSIZE , j, xmm2 );
1057 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1058 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1059 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1063 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1065 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
1066 size_t j( UPP ? i : 0UL );
1068 for( ; (j+2UL) <= jend; j+=2UL )
1081 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1083 for(
size_t k=kbegin; k<kend; ++k ) {
1084 const SIMDType a1( A.load(i ,k) );
1085 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1086 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1087 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1088 const SIMDType b1(
set( B(k,j ) ) );
1089 const SIMDType b2(
set( B(k,j+1UL) ) );
1100 (~C).store( i , j , xmm1 );
1101 (~C).store( i+SIMDSIZE , j , xmm2 );
1102 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1103 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1104 (~C).store( i , j+1UL, xmm5 );
1105 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1106 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1107 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1119 SIMDType xmm1, xmm2, xmm3, xmm4;
1121 for(
size_t k=kbegin; k<kend; ++k ) {
1122 const SIMDType b1(
set( B(k,j) ) );
1123 xmm1 += A.load(i ,k) * b1;
1124 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1125 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1126 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1129 (~C).store( i , j, xmm1 );
1130 (~C).store( i+SIMDSIZE , j, xmm2 );
1131 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1132 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1136 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1138 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
1139 size_t j( UPP ? i : 0UL );
1141 for( ; (j+2UL) <= jend; j+=2UL )
1154 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1156 for(
size_t k=kbegin; k<kend; ++k ) {
1157 const SIMDType a1( A.load(i ,k) );
1158 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1159 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1160 const SIMDType b1(
set( B(k,j ) ) );
1161 const SIMDType b2(
set( B(k,j+1UL) ) );
1170 (~C).store( i , j , xmm1 );
1171 (~C).store( i+SIMDSIZE , j , xmm2 );
1172 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1173 (~C).store( i , j+1UL, xmm4 );
1174 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1175 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1187 SIMDType xmm1, xmm2, xmm3;
1189 for(
size_t k=kbegin; k<kend; ++k ) {
1190 const SIMDType b1(
set( B(k,j) ) );
1191 xmm1 += A.load(i ,k) * b1;
1192 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1193 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1196 (~C).store( i , j, xmm1 );
1197 (~C).store( i+SIMDSIZE , j, xmm2 );
1198 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1202 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1204 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
1205 size_t j( UPP ? i : 0UL );
1207 for( ; (j+2UL) <= jend; j+=2UL )
1220 SIMDType xmm1, xmm2, xmm3, xmm4;
1222 for(
size_t k=kbegin; k<kend; ++k ) {
1223 const SIMDType a1( A.load(i ,k) );
1224 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1225 const SIMDType b1(
set( B(k,j ) ) );
1226 const SIMDType b2(
set( B(k,j+1UL) ) );
1233 (~C).store( i , j , xmm1 );
1234 (~C).store( i+SIMDSIZE, j , xmm2 );
1235 (~C).store( i , j+1UL, xmm3 );
1236 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1248 SIMDType xmm1, xmm2;
1250 for(
size_t k=kbegin; k<kend; ++k ) {
1251 const SIMDType b1(
set( B(k,j) ) );
1252 xmm1 += A.load(i ,k) * b1;
1253 xmm2 += A.load(i+SIMDSIZE,k) * b1;
1256 (~C).store( i , j, xmm1 );
1257 (~C).store( i+SIMDSIZE, j, xmm2 );
1261 for( ; i<ipos; i+=SIMDSIZE )
1263 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
1264 size_t j( UPP ? i : 0UL );
1266 for( ; (j+2UL) <= jend; j+=2UL )
1277 SIMDType xmm1, xmm2;
1279 for(
size_t k=kbegin; k<kend; ++k ) {
1280 const SIMDType a1( A.load(i,k) );
1281 xmm1 += a1 *
set( B(k,j ) );
1282 xmm2 += a1 *
set( B(k,j+1UL) );
1285 (~C).store( i, j , xmm1 );
1286 (~C).store( i, j+1UL, xmm2 );
1299 for(
size_t k=kbegin; k<K; ++k ) {
1300 xmm1 += A.load(i,k) *
set( B(k,j) );
1303 (~C).store( i, j, xmm1 );
// Scalar tail of the vectorized small kernel: when `remainder` is set, the rows
// from the current i up to M that the SIMD loops did not cover are computed
// element by element. Columns start at i when both LOW and UPP hold, else at 0.
1307 for( ; remainder && i<M; ++i )
1309 size_t j( LOW && UPP ? i : 0UL );
// Two columns at a time ...
1311 for( ; (j+2UL) <= N; j+=2UL )
1325 for(
size_t k=kbegin; k<kend; ++k ) {
1326 value1 += A(i,k) * B(k,j );
1327 value2 += A(i,k) * B(k,j+1UL);
1330 (~C)(i,j ) = value1;
1331 (~C)(i,j+1UL) = value2;
// ... then a single-column pass whose inner loop runs up to K rather than kend.
// The loop header selecting the column is not visible in this excerpt.
1344 for(
size_t k=kbegin; k<K; ++k ) {
1345 value += A(i,k) * B(k,j);
// Post-processing for structured results, applied only when M exceeds four
// SIMD widths. In the first two branches iend is j rounded down to a multiple
// of SIMDSIZE*4; in the third, jend is i rounded down the same way.
// SYM/HERM: rows [0, iend) of column j are filled from the transposed element
// C(j,i), conjugated when HERM is set.
1353 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1354 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1355 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1356 for(
size_t i=0UL; i<iend; ++i ) {
1357 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW only: visits rows [0, iend) of each column j >= SIMDSIZE*4. The loop body
// is not visible in this excerpt.
1361 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1362 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1363 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1364 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: visits columns [0, jend) of each row i >= SIMDSIZE*4. The loop body
// is not visible in this excerpt.
1369 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
1370 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1371 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1372 for(
size_t j=0UL; j<jend; ++j ) {
1395 template<
typename MT3
1399 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1401 selectDefaultAssignKernel( C, A, B );
1421 template<
typename MT3
1425 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1455 template<
typename MT3
1459 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1461 selectLargeAssignKernel( C, A, B );
// BLAS-backed kernel for C = A * B; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
// The visible call passes ET(1) and ET(0) as the last two gemm() arguments.
// NOTE(review): presumably these are the alpha/beta scaling factors, i.e.
// C = 1*A*B + 0*C -- confirm against the gemm() wrapper's signature.
// Other branches of this overload are not visible in this excerpt.
1467 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1481 template<
typename MT3
1485 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1498 gemm( C, A, B, ET(1), ET(0) );
// Assignment overload that first materializes the whole product expression
// into a temporary of type TmpType (the operand is wrapped in serial()), then
// assigns the temporary, passed through ForwardFunctor, to lhs.
// The signature and the definition of TmpType are not visible in this excerpt.
1518 template<
typename MT
1537 const ForwardFunctor fwd;
1539 const TmpType tmp(
serial( rhs ) );
1540 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment: rewrites the product using transposed operands and
// re-dispatches through assign(). Three variants are visible: both operands
// transposed, only the left operand transposed, and only the right operand
// transposed. The conditions choosing between them are on lines not shown in
// this excerpt.
1560 template<
typename MT >
1571 const ForwardFunctor fwd;
1574 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
1576 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
1578 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Addition assignment of the product expression `rhs` to the dense matrix
// `lhs`. A single test covers all degenerate cases: an empty target or a zero
// inner dimension. The branch body is not visible here (presumably an early
// return, since adding an empty product changes nothing -- confirm).
// Otherwise both operands are bound to A and B through serial() and the work
// is handed to selectAddAssignKernel().
1596 template<
typename MT
1606 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1610 LT A(
serial( rhs.lhs_ ) );
1611 RT B(
serial( rhs.rhs_ ) );
1620 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for C += A * B.
// The visible conditions pick the small kernel when the build is not in debug
// mode and A has at most SIMDSIZE*10 rows, or when C has fewer elements than
// TDMATTDMATMULT_THRESHOLD. The first operand of the `||` chain is on a line
// not shown in this excerpt. The BLAS kernel call is presumably the else
// branch (the `else` itself is not visible).
1636 template<
typename MT3
1639 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1642 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1643 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1644 selectSmallAddAssignKernel( C, A, B );
1646 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) kernel for C += A * B, processed column by column.
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
// The definitions of kbegin/kend and the outer selecting conditions of the
// ibegin/iend ternaries are on lines not shown in this excerpt.
1665 template<
typename MT3
1669 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1671 const size_t M( A.rows() );
1672 const size_t N( B.columns() );
1673 const size_t K( A.columns() );
1677 for(
size_t j=0UL; j<N; ++j )
1687 for(
size_t k=kbegin; k<kend; ++k )
// Row range [ibegin, iend) for this (j,k) pair; LOW clamps the lower bound to
// j, UPP clamps the upper bound relative to j.
1691 ?( LOW ?
max(j,k+1UL) : k+1UL )
1692 :( LOW ?
max(j,k) : k ) )
1693 :( LOW ? j : 0UL ) );
1696 ?( UPP ?
min(j+1UL,k) : k )
1697 :( UPP ?
min(j,k)+1UL : k+1UL ) )
1698 :( UPP ? j+1UL : M ) );
// For triangular results an empty or inverted range contributes nothing.
1700 if( ( LOW || UPP ) && ibegin >= iend )
continue;
// Rows are handled in pairs: ipos is ibegin plus the row count rounded down
// to an even number, so [ibegin, ipos) is covered by the 2x-unrolled loop.
1703 const size_t inum( iend - ibegin );
1704 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1706 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1707 C(i ,j) += A(i ,k) * B(k,j);
1708 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Leftover single row at ipos. The guard for this statement is not visible in
// this excerpt (presumably it runs only when ipos < iend -- confirm).
1711 C(ipos,j) += A(ipos,k) * B(k,j);
// Default kernel for C += A * B, enabled when MT4 (A) is not diagonal and MT5
// (B) is diagonal. Only B(j,j) is read for column j, so column j of A scaled by
// B(j,j) is added to column j of C.
// The ibegin/iend definitions are on lines not shown in this excerpt.
1733 template<
typename MT3
1736 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
1737 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1741 const size_t M( A.rows() );
1742 const size_t N( B.columns() );
1744 for(
size_t j=0UL; j<N; ++j )
// Rows are handled in pairs; ipos marks the end of the even-sized part.
1754 const size_t inum( iend - ibegin );
1755 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1757 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1758 C(i ,j) += A(i ,j) * B(j,j);
1759 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Leftover single row at ipos; its guard is not visible in this excerpt.
1762 C(ipos,j) += A(ipos,j) * B(j,j);
// Default kernel for C += A * B that reads only the diagonal elements A(i,i) of
// the left operand: row i of B scaled by A(i,i) is added to row i of C.
// The enabling condition of this overload and the ibegin/iend definitions are
// on lines not shown in this excerpt.
1783 template<
typename MT3
1787 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1791 const size_t M( A.rows() );
1792 const size_t N( B.columns() );
1794 for(
size_t j=0UL; j<N; ++j )
// Rows are handled in pairs; ipos marks the end of the even-sized part.
1804 const size_t inum( iend - ibegin );
1805 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1807 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1808 C(i ,j) += A(i ,i ) * B(i ,j);
1809 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover single row at ipos; its guard is not visible in this excerpt.
1812 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default kernel for C += A * B that touches only diagonal elements: the
// product of A(i,i) and B(i,i) is added to C(i,i) for every row index of A.
// The enabling condition of this overload is not visible in this excerpt.
1833 template<
typename MT3
1837 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1841 for(
size_t i=0UL; i<A.rows(); ++i ) {
1842 C(i,i) += A(i,i) * B(i,i);
1862 template<
typename MT3
1866 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1868 selectDefaultAddAssignKernel( C, A, B );
1888 template<
typename MT3
1899 const ForwardFunctor fwd;
1903 addAssign( ~C, fwd( A * tmp ) );
1907 addAssign( ~C, fwd( tmp * B ) );
1909 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1911 addAssign( ~C, fwd( A * tmp ) );
1915 addAssign( ~C, fwd( tmp * B ) );
1936 template<
typename MT3
1944 const size_t M( A.rows() );
1945 const size_t N( B.columns() );
1946 const size_t K( A.columns() );
1950 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1957 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1958 for(
size_t j=0UL; j<N; ++j )
1971 SIMDType xmm1( (~C).load(i ,j) );
1972 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
1973 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
1974 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
1975 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
1976 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
1977 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
1978 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
1980 for(
size_t k=kbegin; k<kend; ++k ) {
1981 const SIMDType b1(
set( B(k,j) ) );
1982 xmm1 += A.load(i ,k) * b1;
1983 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1984 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1985 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1986 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1987 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1988 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1989 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1992 (~C).store( i , j, xmm1 );
1993 (~C).store( i+SIMDSIZE , j, xmm2 );
1994 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1995 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1996 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1997 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1998 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1999 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
2004 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2008 for( ; (j+2UL) <= N; j+=2UL )
2021 SIMDType xmm1 ( (~C).load(i ,j ) );
2022 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
2023 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
2024 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
2025 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
2026 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
2027 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
2028 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2029 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2030 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
2032 for(
size_t k=kbegin; k<kend; ++k ) {
2033 const SIMDType a1( A.load(i ,k) );
2034 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2035 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2036 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2037 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2038 const SIMDType b1(
set( B(k,j ) ) );
2039 const SIMDType b2(
set( B(k,j+1UL) ) );
2052 (~C).store( i , j , xmm1 );
2053 (~C).store( i+SIMDSIZE , j , xmm2 );
2054 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2055 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2056 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
2057 (~C).store( i , j+1UL, xmm6 );
2058 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
2059 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2060 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2061 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2073 SIMDType xmm1( (~C).load(i ,j) );
2074 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2075 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2076 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2077 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2079 for(
size_t k=kbegin; k<kend; ++k ) {
2080 const SIMDType b1(
set( B(k,j) ) );
2081 xmm1 += A.load(i ,k) * b1;
2082 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2083 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2084 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2085 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2088 (~C).store( i , j, xmm1 );
2089 (~C).store( i+SIMDSIZE , j, xmm2 );
2090 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2091 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2092 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2096 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2100 for( ; (j+2UL) <= N; j+=2UL )
2113 SIMDType xmm1( (~C).load(i ,j ) );
2114 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2115 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2116 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2117 SIMDType xmm5( (~C).load(i ,j+1UL) );
2118 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2119 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2120 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2122 for(
size_t k=kbegin; k<kend; ++k ) {
2123 const SIMDType a1( A.load(i ,k) );
2124 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2125 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2126 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2127 const SIMDType b1(
set( B(k,j ) ) );
2128 const SIMDType b2(
set( B(k,j+1UL) ) );
2139 (~C).store( i , j , xmm1 );
2140 (~C).store( i+SIMDSIZE , j , xmm2 );
2141 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2142 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2143 (~C).store( i , j+1UL, xmm5 );
2144 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2145 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2146 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2158 SIMDType xmm1( (~C).load(i ,j) );
2159 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2160 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2161 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2163 for(
size_t k=kbegin; k<kend; ++k ) {
2164 const SIMDType b1(
set( B(k,j) ) );
2165 xmm1 += A.load(i ,k) * b1;
2166 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2167 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2168 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2171 (~C).store( i , j, xmm1 );
2172 (~C).store( i+SIMDSIZE , j, xmm2 );
2173 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2174 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2178 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2182 for( ; (j+2UL) <= N; j+=2UL )
2195 SIMDType xmm1( (~C).load(i ,j ) );
2196 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2197 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2198 SIMDType xmm4( (~C).load(i ,j+1UL) );
2199 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
2200 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2202 for(
size_t k=kbegin; k<kend; ++k ) {
2203 const SIMDType a1( A.load(i ,k) );
2204 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2205 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2206 const SIMDType b1(
set( B(k,j ) ) );
2207 const SIMDType b2(
set( B(k,j+1UL) ) );
2216 (~C).store( i , j , xmm1 );
2217 (~C).store( i+SIMDSIZE , j , xmm2 );
2218 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2219 (~C).store( i , j+1UL, xmm4 );
2220 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
2221 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2233 SIMDType xmm1( (~C).load(i ,j) );
2234 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2235 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2237 for(
size_t k=kbegin; k<kend; ++k ) {
2238 const SIMDType b1(
set( B(k,j) ) );
2239 xmm1 += A.load(i ,k) * b1;
2240 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2241 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2244 (~C).store( i , j, xmm1 );
2245 (~C).store( i+SIMDSIZE , j, xmm2 );
2246 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2250 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2252 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
2253 size_t j( UPP ? i : 0UL );
2255 for( ; (j+2UL) <= jend; j+=2UL )
2268 SIMDType xmm1( (~C).load(i ,j ) );
2269 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2270 SIMDType xmm3( (~C).load(i ,j+1UL) );
2271 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2273 for(
size_t k=kbegin; k<kend; ++k ) {
2274 const SIMDType a1( A.load(i ,k) );
2275 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2276 const SIMDType b1(
set( B(k,j ) ) );
2277 const SIMDType b2(
set( B(k,j+1UL) ) );
2284 (~C).store( i , j , xmm1 );
2285 (~C).store( i+SIMDSIZE, j , xmm2 );
2286 (~C).store( i , j+1UL, xmm3 );
2287 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2299 SIMDType xmm1( (~C).load(i ,j) );
2300 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2302 for(
size_t k=kbegin; k<kend; ++k ) {
2303 const SIMDType b1(
set( B(k,j) ) );
2304 xmm1 += A.load(i ,k) * b1;
2305 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2308 (~C).store( i , j, xmm1 );
2309 (~C).store( i+SIMDSIZE, j, xmm2 );
2313 for( ; i<ipos; i+=SIMDSIZE )
2315 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
2316 size_t j( UPP ? i : 0UL );
2318 for( ; (j+2UL) <= jend; j+=2UL )
2329 SIMDType xmm1( (~C).load(i,j ) );
2330 SIMDType xmm2( (~C).load(i,j+1UL) );
2332 for(
size_t k=kbegin; k<kend; ++k ) {
2333 const SIMDType a1( A.load(i,k) );
2334 xmm1 += a1 *
set( B(k,j ) );
2335 xmm2 += a1 *
set( B(k,j+1UL) );
2338 (~C).store( i, j , xmm1 );
2339 (~C).store( i, j+1UL, xmm2 );
2350 SIMDType xmm1( (~C).load(i,j) );
2352 for(
size_t k=kbegin; k<K; ++k ) {
2353 xmm1 += A.load(i,k) *
set( B(k,j) );
2356 (~C).store( i, j, xmm1 );
2360 for( ; remainder && i<M; ++i )
2362 const size_t jend( LOW ? i+1UL : N );
2363 size_t j( UPP ? i : 0UL );
2365 for( ; (j+2UL) <= jend; j+=2UL )
2376 ElementType value1( (~C)(i,j ) );
2377 ElementType value2( (~C)(i,j+1UL) );
2379 for(
size_t k=kbegin; k<kend; ++k ) {
2380 value1 += A(i,k) * B(k,j );
2381 value2 += A(i,k) * B(k,j+1UL);
2384 (~C)(i,j ) = value1;
2385 (~C)(i,j+1UL) = value2;
2396 ElementType value( (~C)(i,j) );
2398 for(
size_t k=kbegin; k<K; ++k ) {
2399 value += A(i,k) * B(k,j);
2423 template<
typename MT3
2427 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2429 selectDefaultAddAssignKernel( C, A, B );
2449 template<
typename MT3
2453 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2479 template<
typename MT3
2483 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2485 selectLargeAddAssignKernel( C, A, B );
// BLAS-backed kernel for C += A * B; compiled only when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
// Two visible paths add a temporary `tmp` to C via addAssign(); the third calls
// gemm() with ET(1) for both trailing arguments. The construction of `tmp` and
// the conditions selecting each path are on lines not shown in this excerpt.
// NOTE(review): presumably the trailing gemm() arguments are alpha/beta, i.e.
// C = 1*A*B + 1*C -- confirm against the gemm() wrapper's signature.
2491 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2505 template<
typename MT3
2509 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2516 addAssign( C, tmp );
2521 addAssign( C, tmp );
2524 gemm( C, A, B, ET(1), ET(1) );
2546 template<
typename MT >
2557 const ForwardFunctor fwd;
2560 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
2562 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
2564 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
2586 template<
typename MT
2596 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2600 LT A(
serial( rhs.lhs_ ) );
2601 RT B(
serial( rhs.rhs_ ) );
2610 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2626 template<
typename MT3
2629 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2632 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
2633 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2634 selectSmallSubAssignKernel( C, A, B );
2636 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) kernel for C -= A * B, processed column by column.
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
// The definitions of kbegin/kend and the outer selecting conditions of the
// ibegin/iend ternaries are on lines not shown in this excerpt.
2655 template<
typename MT3
2659 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2661 const size_t M( A.rows() );
2662 const size_t N( B.columns() );
2663 const size_t K( A.columns() );
2667 for(
size_t j=0UL; j<N; ++j )
2677 for(
size_t k=kbegin; k<kend; ++k )
// Row range [ibegin, iend) for this (j,k) pair; LOW clamps the lower bound to
// j, UPP clamps the upper bound relative to j.
2681 ?( LOW ?
max(j,k+1UL) : k+1UL )
2682 :( LOW ?
max(j,k) : k ) )
2683 :( LOW ? j : 0UL ) );
2686 ?( UPP ?
min(j+1UL,k) : k )
2687 :( UPP ?
min(j,k)+1UL : k+1UL ) )
2688 :( UPP ? j+1UL : M ) );
// For triangular results an empty or inverted range contributes nothing.
2690 if( ( LOW || UPP ) && ( ibegin >= iend ) )
continue;
// Rows are handled in pairs: ipos is ibegin plus the row count rounded down
// to an even number, so [ibegin, ipos) is covered by the 2x-unrolled loop.
2693 const size_t inum( iend - ibegin );
2694 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2696 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2697 C(i ,j) -= A(i ,k) * B(k,j);
2698 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover single row at ipos. The guard for this statement is not visible in
// this excerpt (presumably it runs only when ipos < iend -- confirm).
2701 C(ipos,j) -= A(ipos,k) * B(k,j);
2723 template<
typename MT3
2726 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
2727 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2731 const size_t M( A.rows() );
2732 const size_t N( B.columns() );
2734 for(
size_t j=0UL; j<N; ++j )
2744 const size_t inum( iend - ibegin );
2745 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2747 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2748 C(i ,j) -= A(i ,j) * B(j,j);
2749 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
2752 C(ipos,j) -= A(ipos,j) * B(j,j);
2773 template<
typename MT3
2777 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2781 const size_t M( A.rows() );
2782 const size_t N( B.columns() );
2784 for(
size_t j=0UL; j<N; ++j )
2794 const size_t inum( iend - ibegin );
2795 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2797 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2798 C(i ,j) -= A(i ,i ) * B(i ,j);
2799 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
2802 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
2823 template<
typename MT3
2827 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2831 for(
size_t i=0UL; i<A.rows(); ++i ) {
2832 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel overload that simply forwards to
// selectDefaultSubAssignKernel( C, A, B ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
2852 template<
typename MT3
2856 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2858 selectDefaultSubAssignKernel( C, A, B );
// Fragment of a subtraction-assignment kernel whose name and signature are on lines
// missing from this excerpt. Every visible branch subtracts a product in which one of
// the two operands has been replaced by a temporary `tmp` (A * tmp or tmp * B), passed
// through the ForwardFunctor `fwd`.
// NOTE(review): the type and initialisation of `tmp` and the first branch conditions
// are not visible; confirm against the full header.
2878 template<
typename MT3
2889 const ForwardFunctor fwd;
2893 subAssign( ~C, fwd( A * tmp ) );
2897 subAssign( ~C, fwd( tmp * B ) );
// Branch taken when B has no more elements than A (rows*columns comparison).
2899 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2901 subAssign( ~C, fwd( A * tmp ) );
2905 subAssign( ~C, fwd( tmp * B ) );
// Vectorized subtraction-assignment kernel (its name and signature are on lines missing
// from this excerpt). The visible statements walk the rows of C in chunks of 8, 5, 4, 3,
// 2 and 1 SIMD vectors; for each chunk they load the current values of C, subtract
// A.load(...) * set( B(k,j) ) for k in [kbegin,kend), and store the result back to C.
// Rows left over after the SIMD part are handled element-wise at the end.
// NOTE(review): kbegin/kend, `remainder`, the initial value of i and most braces are
// defined on lines that are not part of this excerpt; confirm against the full header.
2926 template<
typename MT3
2934 const size_t M( A.rows() );
2935 const size_t N( B.columns() );
2936 const size_t K( A.columns() );
// ipos is M when `remainder` is false, otherwise M & size_t(-SIMDSIZE) (presumably M
// rounded down to a multiple of SIMDSIZE -- assumes SIMDSIZE is a power of two).
2940 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Row chunk of 8 SIMD vectors, one column j at a time; skipped when LOW or UPP is set.
2947 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2948 for(
size_t j=0UL; j<N; ++j )
2961 SIMDType xmm1( (~C).load(i ,j) );
2962 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2963 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2964 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2965 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2966 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
2967 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
2968 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
2970 for(
size_t k=kbegin; k<kend; ++k ) {
2971 const SIMDType b1(
set( B(k,j) ) );
2972 xmm1 -= A.load(i ,k) * b1;
2973 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
2974 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
2975 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
2976 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
2977 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
2978 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
2979 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
2982 (~C).store( i , j, xmm1 );
2983 (~C).store( i+SIMDSIZE , j, xmm2 );
2984 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2985 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2986 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2987 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
2988 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
2989 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Row chunk of 5 SIMD vectors; columns are processed in pairs (j, j+1) first.
2994 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2998 for( ; (j+2UL) <= N; j+=2UL )
3011 SIMDType xmm1 ( (~C).load(i ,j ) );
3012 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3013 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3014 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3015 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3016 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3017 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3018 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3019 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3020 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
// NOTE(review): the statements updating xmm1..xmm10 from a1..a5 and b1/b2 are on lines
// missing from this excerpt.
3022 for(
size_t k=kbegin; k<kend; ++k ) {
3023 const SIMDType a1( A.load(i ,k) );
3024 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3025 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3026 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3027 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3028 const SIMDType b1(
set( B(k,j ) ) );
3029 const SIMDType b2(
set( B(k,j+1UL) ) );
3042 (~C).store( i , j , xmm1 );
3043 (~C).store( i+SIMDSIZE , j , xmm2 );
3044 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3045 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3046 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3047 (~C).store( i , j+1UL, xmm6 );
3048 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3049 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3050 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3051 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Same 5-vector row chunk for a single column j (presumably the column left over after
// the pairwise loop; the guarding condition is not visible here).
3063 SIMDType xmm1( (~C).load(i ,j) );
3064 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3065 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3066 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3067 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3069 for(
size_t k=kbegin; k<kend; ++k ) {
3070 const SIMDType b1(
set( B(k,j) ) );
3071 xmm1 -= A.load(i ,k) * b1;
3072 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3073 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3074 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3075 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3078 (~C).store( i , j, xmm1 );
3079 (~C).store( i+SIMDSIZE , j, xmm2 );
3080 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3081 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3082 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Row chunk of 4 SIMD vectors; columns in pairs, then a single column.
3086 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3090 for( ; (j+2UL) <= N; j+=2UL )
3103 SIMDType xmm1( (~C).load(i ,j ) );
3104 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3105 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3106 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3107 SIMDType xmm5( (~C).load(i ,j+1UL) );
3108 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3109 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3110 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3112 for(
size_t k=kbegin; k<kend; ++k ) {
3113 const SIMDType a1( A.load(i ,k) );
3114 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3115 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3116 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3117 const SIMDType b1(
set( B(k,j ) ) );
3118 const SIMDType b2(
set( B(k,j+1UL) ) );
3129 (~C).store( i , j , xmm1 );
3130 (~C).store( i+SIMDSIZE , j , xmm2 );
3131 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3132 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3133 (~C).store( i , j+1UL, xmm5 );
3134 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3135 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3136 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3148 SIMDType xmm1( (~C).load(i ,j) );
3149 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3150 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3151 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3153 for(
size_t k=kbegin; k<kend; ++k ) {
3154 const SIMDType b1(
set( B(k,j) ) );
3155 xmm1 -= A.load(i ,k) * b1;
3156 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3157 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3158 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3161 (~C).store( i , j, xmm1 );
3162 (~C).store( i+SIMDSIZE , j, xmm2 );
3163 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3164 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Row chunk of 3 SIMD vectors; columns in pairs, then a single column.
3168 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3172 for( ; (j+2UL) <= N; j+=2UL )
3185 SIMDType xmm1( (~C).load(i ,j ) );
3186 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3187 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3188 SIMDType xmm4( (~C).load(i ,j+1UL) );
3189 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3190 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3192 for(
size_t k=kbegin; k<kend; ++k ) {
3193 const SIMDType a1( A.load(i ,k) );
3194 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3195 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3196 const SIMDType b1(
set( B(k,j ) ) );
3197 const SIMDType b2(
set( B(k,j+1UL) ) );
3206 (~C).store( i , j , xmm1 );
3207 (~C).store( i+SIMDSIZE , j , xmm2 );
3208 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3209 (~C).store( i , j+1UL, xmm4 );
3210 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3211 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3223 SIMDType xmm1( (~C).load(i ,j) );
3224 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3225 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3227 for(
size_t k=kbegin; k<kend; ++k ) {
3228 const SIMDType b1(
set( B(k,j) ) );
3229 xmm1 -= A.load(i ,k) * b1;
3230 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3231 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3234 (~C).store( i , j, xmm1 );
3235 (~C).store( i+SIMDSIZE , j, xmm2 );
3236 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Row chunk of 2 SIMD vectors; skipped only when both LOW and UPP are set. The column
// range is restricted: it ends at min(i+SIMDSIZE*2,N) when LOW and starts at i when UPP.
3240 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3242 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
3243 size_t j( UPP ? i : 0UL );
3245 for( ; (j+2UL) <= jend; j+=2UL )
3258 SIMDType xmm1( (~C).load(i ,j ) );
3259 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3260 SIMDType xmm3( (~C).load(i ,j+1UL) );
3261 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3263 for(
size_t k=kbegin; k<kend; ++k ) {
3264 const SIMDType a1( A.load(i ,k) );
3265 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3266 const SIMDType b1(
set( B(k,j ) ) );
3267 const SIMDType b2(
set( B(k,j+1UL) ) );
3274 (~C).store( i , j , xmm1 );
3275 (~C).store( i+SIMDSIZE, j , xmm2 );
3276 (~C).store( i , j+1UL, xmm3 );
3277 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3289 SIMDType xmm1( (~C).load(i ,j) );
3290 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3292 for(
size_t k=kbegin; k<kend; ++k ) {
3293 const SIMDType b1(
set( B(k,j) ) );
3294 xmm1 -= A.load(i ,k) * b1;
3295 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3298 (~C).store( i , j, xmm1 );
3299 (~C).store( i+SIMDSIZE, j, xmm2 );
// Row chunk of a single SIMD vector; runs unconditionally up to ipos. The column range
// ends at min(i+SIMDSIZE,N) only when both LOW and UPP are set, and starts at i when UPP.
3303 for( ; i<ipos; i+=SIMDSIZE )
3305 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
3306 size_t j( UPP ? i : 0UL );
3308 for( ; (j+2UL) <= jend; j+=2UL )
3319 SIMDType xmm1( (~C).load(i,j ) );
3320 SIMDType xmm2( (~C).load(i,j+1UL) );
3322 for(
size_t k=kbegin; k<kend; ++k ) {
3323 const SIMDType a1( A.load(i,k) );
3324 xmm1 -= a1 *
set( B(k,j ) );
3325 xmm2 -= a1 *
set( B(k,j+1UL) );
3328 (~C).store( i, j , xmm1 );
3329 (~C).store( i, j+1UL, xmm2 );
3340 SIMDType xmm1( (~C).load(i,j) );
// Note: this single-column loop runs k up to K rather than to a kend bound.
3342 for(
size_t k=kbegin; k<K; ++k ) {
3343 xmm1 -= A.load(i,k) *
set( B(k,j) );
3346 (~C).store( i, j, xmm1 );
// Element-wise handling of the rows from the current i up to M, executed only when
// `remainder` is set. Columns end at i+1 when LOW and start at i when UPP.
3350 for( ; remainder && i<M; ++i )
3352 const size_t jend( LOW ? i+1UL : N );
3353 size_t j( UPP ? i : 0UL );
3355 for( ; (j+2UL) <= jend; j+=2UL )
3366 ElementType value1( (~C)(i,j ) );
3367 ElementType value2( (~C)(i,j+1UL) );
3369 for(
size_t k=kbegin; k<kend; ++k ) {
3370 value1 -= A(i,k) * B(k,j );
3371 value2 -= A(i,k) * B(k,j+1UL);
3374 (~C)(i,j ) = value1;
3375 (~C)(i,j+1UL) = value2;
3386 ElementType value( (~C)(i,j) );
3388 for(
size_t k=kbegin; k<K; ++k ) {
3389 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel overload that forwards to
// selectDefaultSubAssignKernel( C, A, B ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
3413 template<
typename MT3
3417 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3419 selectDefaultSubAssignKernel( C, A, B );
// Second selectLargeSubAssignKernel overload.
// NOTE(review): its enabling condition and its entire body are on lines missing from
// this excerpt; nothing about its behavior can be stated from here.
3439 template<
typename MT3
3443 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS subtraction-assignment kernel overload that forwards to
// selectLargeSubAssignKernel( C, A, B ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
3469 template<
typename MT3
3473 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3475 selectLargeSubAssignKernel( C, A, B );
// selectBlasSubAssignKernel overload compiled only when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled. Two visible branches
// subtract a temporary `tmp` from C via subAssign(); the remaining branch calls gemm()
// with the scalar arguments ET(-1) and ET(1).
// NOTE(review): the branch conditions and the construction of `tmp` are not visible
// here; presumably gemm computes C = (-1)*A*B + 1*C -- confirm against the wrapper.
3481 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3495 template<
typename MT3
3499 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3506 subAssign( C, tmp );
3511 subAssign( C, tmp );
3514 gemm( C, A, B, ET(-1), ET(1) );
// Fragment of a subtraction-assignment function (name and signature not visible in this
// excerpt). Each visible branch subtracts a product in which one or both operands of
// `rhs` are transposed via trans(), passed through the ForwardFunctor `fwd`.
// NOTE(review): the conditions choosing between the three branches are not visible.
3537 template<
typename MT >
3548 const ForwardFunctor fwd;
3551 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
3553 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
3555 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Fragment of an assignment function (name, signature and branch bodies not visible in
// this excerpt). It distinguishes an empty target (zero rows or zero columns) from the
// case in which the left operand of `rhs` has zero columns.
3588 template<
typename MT
3598 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3601 else if( rhs.lhs_.columns() == 0UL ) {
// Fragment of an assignment function (name and signature not visible in this excerpt).
// It evaluates the expression `rhs` into a temporary of type TmpType; how `tmp` and
// `fwd` are used afterwards is not visible here.
3637 template<
typename MT
3656 const ForwardFunctor fwd;
3658 const TmpType tmp( rhs );
// Fragment of a function template (name, signature and remaining body not visible in
// this excerpt); only the creation of the ForwardFunctor `fwd` is shown.
3679 template<
typename MT >
3690 const ForwardFunctor fwd;
// Fragment of an assignment function (name and signature not visible in this excerpt).
// The visible guard covers an empty target (zero rows or columns) and a left operand
// with zero columns in a single condition; the guarded body is not visible.
3718 template<
typename MT
3728 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fragment of a function template (name, signature and remaining body not visible in
// this excerpt); only the creation of the ForwardFunctor `fwd` is shown.
3763 template<
typename MT >
3774 const ForwardFunctor fwd;
// Fragment of an assignment function (name and signature not visible in this excerpt).
// The visible guard covers an empty target (zero rows or columns) and a left operand
// with zero columns in a single condition; the guarded body is not visible.
3806 template<
typename MT
3816 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fragment of a function template (name, signature and remaining body not visible in
// this excerpt); only the creation of the ForwardFunctor `fwd` is shown.
3851 template<
typename MT >
3862 const ForwardFunctor fwd;
3915 template<
typename MT1
3923 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true >
// Structure flags derived from the template flags SF, HF, LF and UF:
//  - SYM  is set when SF is set and none of HF, LF, UF is set;
//  - HERM is set when HF is set and neither LF nor UF is set;
//  - LOW  is set when LF is set, or when UF is combined with SF or HF;
//  - UPP  is set when UF is set, or when LF is combined with SF or HF.
// NOTE(review): the enclosing enum declaration is on lines missing from this excerpt.
3954 SYM = ( SF && !( HF || LF || UF ) ),
3955 HERM = ( HF && !( LF || UF ) ),
3956 LOW = ( LF || ( ( SF || HF ) && UF ) ),
3957 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Helper trait parameterised on three types.
// NOTE(review): the expression defining its `value` is on lines missing from this excerpt.
3968 template<
typename T1,
typename T2,
typename T3 >
3969 struct CanExploitSymmetry {
// Helper trait: `value` is true when at least one of evaluateLeft / evaluateRight is set
// and CanExploitSymmetry<T1,T2,T3>::value is false.
3980 template<
typename T1,
typename T2,
typename T3 >
3981 struct IsEvaluationRequired {
3982 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
3983 !CanExploitSymmetry<T1,T2,T3>::value };
// Helper trait parameterised on four types. The only visible part of its condition
// requires T1, T2 and T3 to be SIMD-enabled.
// NOTE(review): the remaining terms of the condition are not visible in this excerpt.
3991 template<
typename T1,
typename T2,
typename T3,
typename T4 >
3992 struct UseBlasKernel {
3999 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait parameterised on four types. The only visible part of its condition
// requires T1, T2 and T3 to be SIMD-enabled.
// NOTE(review): the remaining terms of the condition are not visible in this excerpt.
4013 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4014 struct UseVectorizedDefaultKernel {
4017 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Tail of the ForwardFunctor type definition; the visible part shows Noop as the
// innermost fallback of a nested selection.
// NOTE(review): the selecting conditions and alternatives are not visible in this excerpt.
4041 ,
Noop > > > > ForwardFunctor;
// Fragment of a compile-time condition requiring both operand types to be SIMD-enabled
// (the rest of the expression is not visible in this excerpt).
4071 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable: set only when neither operand needs evaluation (evaluateLeft and
// evaluateRight both false) and both operand types are themselves smpAssignable.
4077 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4078 !evaluateRight && MT2::smpAssignable };
// Element access: returns the element (i,j) of the wrapped matrix expression multiplied
// by the stored scalar.
// NOTE(review): any statements between the signature and the return (e.g. index checks)
// are on lines missing from this excerpt.
4105 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4108 return matrix_(i,j) * scalar_;
// Checked element access: tests i against matrix_.rows() and j against
// matrix_.columns(), then returns (*this)(i,j).
// NOTE(review): the action taken on an out-of-range index is on lines missing from
// this excerpt (presumably an exception is thrown -- confirm against the full header).
4120 inline ReturnType
at(
size_t i,
size_t j )
const {
4121 if( i >= matrix_.rows() ) {
4124 if( j >= matrix_.columns() ) {
4127 return (*
this)(i,j);
// Returns the number of rows of the wrapped matrix expression.
4136 inline size_t rows()
const {
4137 return matrix_.rows();
// Returns the number of columns of the wrapped matrix expression.
4146 inline size_t columns()
const {
4147 return matrix_.columns();
// Forwards the alias query for the given address to matrix_.canAlias().
4177 template<
typename T >
4178 inline bool canAlias(
const T* alias )
const {
4179 return matrix_.canAlias( alias );
// Forwards the alias query for the given address to matrix_.isAliased().
4189 template<
typename T >
4190 inline bool isAliased(
const T* alias )
const {
4191 return matrix_.isAliased( alias );
// Body fragment of a member whose signature is not visible in this excerpt: it returns
// the alignment status reported by the wrapped matrix expression.
4201 return matrix_.isAligned();
// Fragment of a boolean expression (enclosing function not visible in this excerpt --
// presumably the SMP-assignment check; TODO confirm). The visible terms compare the
// element count rows()*columns() against TDMATTDMATMULT_THRESHOLD (less-than) and
// SMP_TDMATTDMATMULT_THRESHOLD (greater-or-equal).
4212 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4213 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
// Data members: matrix_ is the wrapped matrix expression (left operand) and scalar_ the
// scalar factor (right operand), as used by operator() above.
4219 LeftOperand matrix_;
4220 RightOperand scalar_;
// Fragment of an assignment function (name and signature not visible in this excerpt).
// It distinguishes an empty target (zero rows or columns) and a left operand with zero
// columns, and otherwise dispatches to selectAssignKernel() with the evaluated operands
// A and B and the stored scalar.
// NOTE(review): the bodies of the two guard branches and the definitions of `left`,
// A and B are on lines missing from this excerpt.
4235 template<
typename MT
4248 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4251 else if( left.columns() == 0UL ) {
4266 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatch for the scaled assignment: selects selectSmallAssignKernel() or
// selectBlasAssignKernel(). The visible terms of the small-kernel condition are: not in
// debug mode and A.rows() <= SIMDSIZE*10, or C.rows()*C.columns() below
// TDMATTDMATMULT_THRESHOLD.
// NOTE(review): the first term of the condition and the if/else keywords are on lines
// missing from this excerpt.
4281 template<
typename MT3
4285 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4288 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
4289 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4290 selectSmallAssignKernel( C, A, B, scalar );
4292 selectBlasAssignKernel( C, A, B, scalar );
// Default kernel of the scaled assignment (enabling condition not visible in this
// excerpt). Visible steps per column j: rows [ibegin,iend) are initialised with the
// k = kbegin contribution A(i,kbegin) * B(kbegin,j), the contributions of the remaining
// k in (kbegin,kend) are accumulated, and a final pass over [ibegin,iend) follows whose
// body is not visible (presumably the multiplication by `scalar` -- TODO confirm).
// NOTE(review): kbegin/kend, the heads of the ibegin/iend expressions, the reset loops'
// bodies and most conditions/braces are on lines missing from this excerpt.
4310 template<
typename MT3
4315 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4317 const size_t M( A.rows() );
4318 const size_t N( B.columns() );
4319 const size_t K( A.columns() );
4323 for(
size_t j=0UL; j<N; ++j )
4334 for(
size_t i=0UL; i<M; ++i ) {
// Tails of the ibegin / iend expressions for the first contribution (k = kbegin); the
// LOW / UPP flags clamp the row range relative to the column index j.
4343 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
4344 :( LOW ?
max(j,kbegin) : kbegin ) )
4345 :( LOW ? j : 0UL ) );
4348 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
4349 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
4350 :( UPP ? j+1UL : M ) );
4353 for(
size_t i=0UL; i<ibegin; ++i ) {
// First contribution is assigned (not accumulated) to the rows [ibegin,iend).
4360 for(
size_t i=ibegin; i<iend; ++i ) {
4361 C(i,j) = A(i,kbegin) * B(kbegin,j);
4364 for(
size_t i=iend; i<M; ++i ) {
4369 reset( C(M-1UL,j) );
// Remaining contributions k = kbegin+1 .. kend-1.
4373 for(
size_t k=kbegin+1UL; k<kend; ++k )
4377 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
4378 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
4379 :( SYM || HERM || LOW ? j : 0UL ) );
4382 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
4383 :( UPP ?
min(j+1UL,k) : k ) )
4384 :( UPP ? j+1UL : M ) );
// Skip this k when the structure flags leave an inverted row range.
4386 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
4389 for(
size_t i=ibegin; i<iend; ++i ) {
4390 C(i,j) += A(i,k) * B(k,j);
// Row iend receives its first contribution by assignment; the condition guarding this
// statement is not visible in this excerpt.
4393 C(iend,j) = A(iend,k) * B(k,j);
4400 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
4403 :( UPP ? j+1UL : M ) );
4405 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
4408 for(
size_t i=ibegin; i<iend; ++i ) {
// Copies C(j,i) into C(i,j) for all i < j, conjugated when HERM is set; the guard
// enclosing this loop is not visible in this excerpt.
4415 for(
size_t j=1UL; j<N; ++j ) {
4416 for(
size_t i=0UL; i<j; ++i ) {
4417 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel of the scaled assignment, enabled (see the EnableIf_ below) when A is
// not diagonal and B is diagonal. For the rows [ibegin,iend) of each column j it assigns
// C(i,j) = A(i,j) * B(j,j) * scalar.
// NOTE(review): the computation of ibegin/iend and the bodies of the loops over
// [0,ibegin) and [iend,M) are on lines missing from this excerpt (presumably they reset
// the elements outside the computed range -- TODO confirm).
4438 template<
typename MT3
4442 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
4443 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4447 const size_t M( A.rows() );
4448 const size_t N( B.columns() );
4450 for(
size_t j=0UL; j<N; ++j )
4461 for(
size_t i=0UL; i<ibegin; ++i ) {
4465 for(
size_t i=ibegin; i<iend; ++i ) {
4466 C(i,j) = A(i,j) * B(j,j) * scalar;
4469 for(
size_t i=iend; i<M; ++i ) {
// Default kernel of the scaled assignment whose visible statement reads only the
// diagonal of A: C(i,j) = A(i,i) * B(i,j) * scalar for the rows [ibegin,iend) of each
// column j.
// NOTE(review): the enabling condition, the computation of ibegin/iend and the bodies
// of the loops over [0,ibegin) and [iend,M) are on lines missing from this excerpt.
4491 template<
typename MT3
4496 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4500 const size_t M( A.rows() );
4501 const size_t N( B.columns() );
4503 for(
size_t j=0UL; j<N; ++j )
4514 for(
size_t i=0UL; i<ibegin; ++i ) {
4518 for(
size_t i=ibegin; i<iend; ++i ) {
4519 C(i,j) = A(i,i) * B(i,j) * scalar;
4522 for(
size_t i=iend; i<M; ++i ) {
// Default kernel of the scaled assignment that writes only diagonal elements:
// C(i,i) = A(i,i) * B(i,i) * scalar for every row index i of A.
// NOTE(review): the enabling condition and any statements preceding the loop are on
// lines missing from this excerpt.
4544 template<
typename MT3
4549 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4555 for(
size_t i=0UL; i<A.rows(); ++i ) {
4556 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix kernel overload of the scaled assignment that forwards to
// selectDefaultAssignKernel( C, A, B, scalar ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
4575 template<
typename MT3
4580 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4582 selectDefaultAssignKernel( C, A, B, scalar );
// Fragment of a scaled assignment kernel whose name and signature are on lines missing
// from this excerpt. Every visible branch assigns a scaled product in which one of the
// operands has been replaced by a temporary `tmp` (A * tmp or tmp * B), passed through
// the ForwardFunctor `fwd`.
// NOTE(review): the type and initialisation of `tmp` and the first branch conditions
// are not visible; confirm against the full header.
4601 template<
typename MT3
4613 const ForwardFunctor fwd;
4617 assign( ~C, fwd( A * tmp ) * scalar );
4621 assign( ~C, fwd( tmp * B ) * scalar );
// Branch taken when B has no more elements than A (rows*columns comparison).
4623 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
4625 assign( ~C, fwd( A * tmp ) * scalar );
4629 assign( ~C, fwd( tmp * B ) * scalar );
// Vectorized kernel of the scaled assignment (its name and signature are on lines
// missing from this excerpt). The visible statements walk the rows of C in chunks of
// 8, 5, 4, 3, 2 and 1 SIMD vectors, accumulate A.load(...) * set( B(k,j) ) for k in
// [kbegin,kend) into freshly declared SIMD accumulators, and store accumulator * factor
// into C, where `factor` is the broadcast scalar. Rows left over after the SIMD part are
// handled element-wise, and a final section fills in elements by mirroring.
// NOTE(review): kbegin/kend, `remainder`, the initial value of i and most braces are
// defined on lines that are not part of this excerpt; confirm against the full header.
4649 template<
typename MT3
4658 const size_t M( A.rows() );
4659 const size_t N( B.columns() );
4660 const size_t K( A.columns() );
// ipos is M when `remainder` is false, otherwise M & size_t(-SIMDSIZE) (presumably M
// rounded down to a multiple of SIMDSIZE -- assumes SIMDSIZE is a power of two).
4664 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
4667 const SIMDType factor(
set( scalar ) );
// Special handling when both LOW and UPP are set and M exceeds three SIMD vectors; the
// guarded body is not visible in this excerpt.
4669 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Row chunk of 8 SIMD vectors; only taken when none of SYM, HERM, LOW, UPP is set.
4678 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4679 for(
size_t j=0UL; j<N; ++j )
4692 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4694 for(
size_t k=kbegin; k<kend; ++k ) {
4695 const SIMDType b1(
set( B(k,j) ) );
4696 xmm1 += A.load(i ,k) * b1;
4697 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4698 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4699 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4700 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
4701 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
4702 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
4703 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
4706 (~C).store( i , j, xmm1 * factor );
4707 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4708 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4709 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
4710 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
4711 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
4712 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
4713 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
// Row chunk of 5 SIMD vectors; columns in pairs (j, j+1), then a single column.
4718 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
4722 for( ; (j+2UL) <= N; j+=2UL )
4735 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
// NOTE(review): the statements updating xmm1..xmm10 from a1..a5 and b1/b2 are on lines
// missing from this excerpt.
4737 for(
size_t k=kbegin; k<kend; ++k ) {
4738 const SIMDType a1( A.load(i ,k) );
4739 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4740 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4741 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4742 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
4743 const SIMDType b1(
set( B(k,j ) ) );
4744 const SIMDType b2(
set( B(k,j+1UL) ) );
4757 (~C).store( i , j , xmm1 * factor );
4758 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
4759 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
4760 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
4761 (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
4762 (~C).store( i , j+1UL, xmm6 * factor );
4763 (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
4764 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
4765 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
4766 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
4778 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
4780 for(
size_t k=kbegin; k<kend; ++k ) {
4781 const SIMDType b1(
set( B(k,j) ) );
4782 xmm1 += A.load(i ,k) * b1;
4783 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4784 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4785 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4786 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
4789 (~C).store( i , j, xmm1 * factor );
4790 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4791 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4792 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
4793 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
// Row chunk of 4 SIMD vectors; skipped only when both LOW and UPP are set. Columns end
// at min(i+SIMDSIZE*4,N) when SYM, HERM or LOW is set and start at i when UPP is set.
4797 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4799 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
4800 size_t j( UPP ? i : 0UL );
4802 for( ; (j+2UL) <= jend; j+=2UL )
4815 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4817 for(
size_t k=kbegin; k<kend; ++k ) {
4818 const SIMDType a1( A.load(i ,k) );
4819 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4820 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4821 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4822 const SIMDType b1(
set( B(k,j ) ) );
4823 const SIMDType b2(
set( B(k,j+1UL) ) );
4834 (~C).store( i , j , xmm1 * factor );
4835 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
4836 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
4837 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
4838 (~C).store( i , j+1UL, xmm5 * factor );
4839 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
4840 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
4841 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
4853 SIMDType xmm1, xmm2, xmm3, xmm4;
4855 for(
size_t k=kbegin; k<kend; ++k ) {
4856 const SIMDType b1(
set( B(k,j) ) );
4857 xmm1 += A.load(i ,k) * b1;
4858 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4859 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4860 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4863 (~C).store( i , j, xmm1 * factor );
4864 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4865 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4866 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
// Row chunk of 3 SIMD vectors (no structure-flag guard on this loop).
4870 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4872 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
4873 size_t j( UPP ? i : 0UL );
4875 for( ; (j+2UL) <= jend; j+=2UL )
4888 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
4890 for(
size_t k=kbegin; k<kend; ++k ) {
4891 const SIMDType a1( A.load(i ,k) );
4892 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4893 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4894 const SIMDType b1(
set( B(k,j ) ) );
4895 const SIMDType b2(
set( B(k,j+1UL) ) );
4904 (~C).store( i , j , xmm1 * factor );
4905 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
4906 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
4907 (~C).store( i , j+1UL, xmm4 * factor );
4908 (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
4909 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
4921 SIMDType xmm1, xmm2, xmm3;
4923 for(
size_t k=kbegin; k<kend; ++k ) {
4924 const SIMDType b1(
set( B(k,j) ) );
4925 xmm1 += A.load(i ,k) * b1;
4926 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4927 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4930 (~C).store( i , j, xmm1 * factor );
4931 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4932 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
// Row chunk of 2 SIMD vectors.
4936 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4938 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
4939 size_t j( UPP ? i : 0UL );
4941 for( ; (j+2UL) <= jend; j+=2UL )
4954 SIMDType xmm1, xmm2, xmm3, xmm4;
4956 for(
size_t k=kbegin; k<kend; ++k ) {
4957 const SIMDType a1( A.load(i ,k) );
4958 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4959 const SIMDType b1(
set( B(k,j ) ) );
4960 const SIMDType b2(
set( B(k,j+1UL) ) );
4967 (~C).store( i , j , xmm1 * factor );
4968 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
4969 (~C).store( i , j+1UL, xmm3 * factor );
4970 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
4982 SIMDType xmm1, xmm2;
4984 for(
size_t k=kbegin; k<kend; ++k ) {
4985 const SIMDType b1(
set( B(k,j) ) );
4986 xmm1 += A.load(i ,k) * b1;
4987 xmm2 += A.load(i+SIMDSIZE,k) * b1;
4990 (~C).store( i , j, xmm1 * factor );
4991 (~C).store( i+SIMDSIZE, j, xmm2 * factor );
// Row chunk of a single SIMD vector.
4995 for( ; i<ipos; i+=SIMDSIZE )
4997 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
4998 size_t j( UPP ? i : 0UL );
5000 for( ; (j+2UL) <= jend; j+=2UL )
5011 SIMDType xmm1, xmm2;
5013 for(
size_t k=kbegin; k<kend; ++k ) {
5014 const SIMDType a1( A.load(i,k) );
5015 xmm1 += a1 *
set( B(k,j ) );
5016 xmm2 += a1 *
set( B(k,j+1UL) );
5019 (~C).store( i, j , xmm1 * factor );
5020 (~C).store( i, j+1UL, xmm2 * factor );
// Note: this single-column loop runs k up to K rather than to a kend bound.
5033 for(
size_t k=kbegin; k<K; ++k ) {
5034 xmm1 += A.load(i,k) *
set( B(k,j) );
5037 (~C).store( i, j, xmm1 * factor );
// Element-wise handling of the rows from the current i up to M, executed only when
// `remainder` is set. Columns start at i only when both LOW and UPP are set.
5041 for( ; remainder && i<M; ++i )
5043 size_t j( LOW && UPP ? i : 0UL );
5045 for( ; (j+2UL) <= N; j+=2UL )
5059 for(
size_t k=kbegin; k<kend; ++k ) {
5060 value1 += A(i,k) * B(k,j );
5061 value2 += A(i,k) * B(k,j+1UL);
5064 (~C)(i,j ) = value1 * scalar;
5065 (~C)(i,j+1UL) = value2 * scalar;
5078 for(
size_t k=kbegin; k<K; ++k ) {
5079 value += A(i,k) * B(k,j);
5082 (~C)(i,j) = value * scalar;
// Post-processing for matrices larger than four SIMD vectors. For SYM/HERM the elements
// C(i,j) with i below iend (j rounded down to a multiple of SIMDSIZE*4) are copied from
// C(j,i), conjugated when HERM is set.
5087 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
5088 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5089 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
5090 for(
size_t i=0UL; i<iend; ++i ) {
5091 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW-only and UPP-only cases iterate over the corresponding block ranges; the
// statements inside their inner loops are not visible in this excerpt.
5095 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
5096 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5097 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
5098 for(
size_t i=0UL; i<iend; ++i ) {
5103 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
5104 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5105 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5106 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix kernel overload of the scaled assignment that forwards to
// selectDefaultAssignKernel( C, A, B, scalar ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
5128 template<
typename MT3
5133 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5135 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix kernel overload of the scaled assignment that dispatches to one of the
// routines smmm, hmmm, lmmm, ummm or mmm. smmm and hmmm receive only `scalar`; lmmm,
// ummm and mmm additionally receive ST2(0) as a second scalar argument.
// NOTE(review): the conditions selecting among the five calls are on lines missing from
// this excerpt (presumably the SYM / HERM / LOW / UPP flags -- TODO confirm).
5154 template<
typename MT3
5159 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5162 smmm( C, A, B, scalar );
5164 hmmm( C, A, B, scalar );
5166 lmmm( C, A, B, scalar, ST2(0) );
5168 ummm( C, A, B, scalar, ST2(0) );
5170 mmm( C, A, B, scalar, ST2(0) );
// BLAS kernel overload of the scaled assignment that forwards to
// selectLargeAssignKernel( C, A, B, scalar ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
5188 template<
typename MT3
5193 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5195 selectLargeAssignKernel( C, A, B, scalar );
// selectBlasAssignKernel overload compiled only when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled. Visible calls: trmm()
// with A on the left or B on the right (lower/upper chosen via IsLower of the respective
// operand type) and gemm() with the scalar arguments ET(scalar) and ET(0).
// NOTE(review): the branch conditions and the statements preparing C before the trmm()
// calls are on lines missing from this excerpt.
5200 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5214 template<
typename MT3
5219 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5225 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5229 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5232 gemm( C, A, B, ET(scalar), ET(0) );
// Fragment of an assignment function (name and signature not visible in this excerpt).
// It evaluates serial( rhs ) into a temporary of type TmpType and assigns that
// temporary, passed through the ForwardFunctor `fwd`, to the target.
5250 template<
typename MT
5269 const ForwardFunctor fwd;
5271 const TmpType tmp(
serial( rhs ) );
5272 assign( ~lhs, fwd( tmp ) );
// Fragment of an assignment function (name and signature not visible in this excerpt).
// The visible branches assign a scaled product in which one operand is transposed via
// trans(), passed through the ForwardFunctor `fwd` and multiplied by rhs.scalar_.
// NOTE(review): the branch conditions, any further branches and the definitions of
// `left` / `right` are on lines missing from this excerpt.
5290 template<
typename MT >
5301 const ForwardFunctor fwd;
5309 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
5311 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
// Fragment of an addition-assignment function (name and signature not visible in this
// excerpt). A single guard covers an empty target (zero rows or columns) and a left
// operand with zero columns; otherwise the work is dispatched to
// selectAddAssignKernel() with the operands A and B and the stored scalar.
// NOTE(review): the guarded body and the definitions of `left`, A and B are on lines
// missing from this excerpt.
5327 template<
typename MT
5340 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5354 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatch for the scaled addition assignment: selects
// selectSmallAddAssignKernel() or selectBlasAddAssignKernel(). The visible terms of the
// small-kernel condition are: not in debug mode and A.rows() <= SIMDSIZE*10, or
// C.rows()*C.columns() below TDMATTDMATMULT_THRESHOLD.
// NOTE(review): the first term of the condition and the if/else keywords are on lines
// missing from this excerpt.
5369 template<
typename MT3
5373 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5376 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
5377 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5378 selectSmallAddAssignKernel( C, A, B, scalar );
5380 selectBlasAddAssignKernel( C, A, B, scalar );
// Default kernel of the scaled addition assignment: evaluates serial( A * B * scalar )
// into a temporary of type ResultType and adds that temporary to C.
// NOTE(review): the enabling condition of this overload is not visible in this excerpt.
5398 template<
typename MT3
5403 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5405 const ResultType tmp(
serial( A * B * scalar ) );
5406 addAssign( C, tmp );
// Default kernel of the scaled addition assignment, enabled (see the EnableIf_ below)
// when A is not diagonal and B is diagonal. For every column j the visible statements
// compute C(i,j) += A(i,j) * B(j,j) * scalar.
// NOTE(review): this excerpt omits lines of the original definition (remaining template
// parameters, braces, the computation of ibegin/iend); confirm against the full header.
5424 template<
typename MT3
5428 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
5429 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5433 const size_t M( A.rows() );
5434 const size_t N( B.columns() );
5436 for(
size_t j=0UL; j<N; ++j )
5446 const size_t inum( iend - ibegin );
// ipos = ibegin plus inum with its lowest bit cleared; rows are processed in pairs.
5447 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5449 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5450 C(i ,j) += A(i ,j) * B(j,j) * scalar;
5451 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the single row at ipos; the condition guarding it is not visible here.
5454 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default kernel of the scaled addition assignment whose visible statements read only
// the diagonal of A: C(i,j) += A(i,i) * B(i,j) * scalar.
// NOTE(review): the enabling condition, the computation of ibegin/iend and the braces
// are on lines missing from this excerpt.
5474 template<
typename MT3
5479 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5483 const size_t M( A.rows() );
5484 const size_t N( B.columns() );
5486 for(
size_t j=0UL; j<N; ++j )
5496 const size_t inum( iend - ibegin );
// ipos = ibegin plus inum with its lowest bit cleared; rows are processed in pairs.
5497 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5499 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5500 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5501 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the single row at ipos; the condition guarding it is not visible here.
5504 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default kernel of the scaled addition assignment that touches only diagonal elements:
// C(i,i) += A(i,i) * B(i,i) * scalar for every row index i of A.
// NOTE(review): the enabling condition is not visible in this excerpt.
5524 template<
typename MT3
5529 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5533 for(
size_t i=0UL; i<A.rows(); ++i ) {
5534 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix kernel overload of the scaled addition assignment that forwards to
// selectDefaultAddAssignKernel( C, A, B, scalar ).
// NOTE(review): the condition selecting this overload is not visible in this excerpt.
5553 template<
typename MT3
5558 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5560 selectDefaultAddAssignKernel( C, A, B, scalar );
5579 template<
typename MT3
5591 const ForwardFunctor fwd;
5595 addAssign( ~C, fwd( A * tmp ) * scalar );
5599 addAssign( ~C, fwd( tmp * B ) * scalar );
5601 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5603 addAssign( ~C, fwd( A * tmp ) * scalar );
5607 addAssign( ~C, fwd( tmp * B ) * scalar );
5627 template<
typename MT3
5636 const size_t M( A.rows() );
5637 const size_t N( B.columns() );
5638 const size_t K( A.columns() );
5642 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
5645 const SIMDType factor(
set( scalar ) );
5651 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5652 for(
size_t j=0UL; j<N; ++j )
5665 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5667 for(
size_t k=kbegin; k<kend; ++k ) {
5668 const SIMDType b1(
set( B(k,j) ) );
5669 xmm1 += A.load(i ,k) * b1;
5670 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5671 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5672 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5673 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5674 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5675 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5676 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5679 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5680 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5681 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5682 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
5683 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
5684 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
5685 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
5686 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
5691 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5695 for( ; (j+2UL) <= N; j+=2UL )
5708 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5710 for(
size_t k=kbegin; k<kend; ++k ) {
5711 const SIMDType a1( A.load(i ,k) );
5712 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5713 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5714 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5715 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5716 const SIMDType b1(
set( B(k,j ) ) );
5717 const SIMDType b2(
set( B(k,j+1UL) ) );
5730 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5731 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
5732 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
5733 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
5734 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
5735 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
5736 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
5737 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
5738 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
5739 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
5751 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5753 for(
size_t k=kbegin; k<kend; ++k ) {
5754 const SIMDType b1(
set( B(k,j) ) );
5755 xmm1 += A.load(i ,k) * b1;
5756 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5757 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5758 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5759 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5762 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5763 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5764 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5765 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
5766 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
5770 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5774 for( ; (j+2UL) <= N; j+=2UL )
5787 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5789 for(
size_t k=kbegin; k<kend; ++k ) {
5790 const SIMDType a1( A.load(i ,k) );
5791 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5792 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5793 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5794 const SIMDType b1(
set( B(k,j ) ) );
5795 const SIMDType b2(
set( B(k,j+1UL) ) );
5806 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5807 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
5808 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
5809 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
5810 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5811 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
5812 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
5813 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
5825 SIMDType xmm1, xmm2, xmm3, xmm4;
5827 for(
size_t k=kbegin; k<kend; ++k ) {
5828 const SIMDType b1(
set( B(k,j) ) );
5829 xmm1 += A.load(i ,k) * b1;
5830 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5831 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5832 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5835 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5836 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5837 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5838 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
5842 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5846 for( ; (j+2UL) <= N; j+=2UL )
5859 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5861 for(
size_t k=kbegin; k<kend; ++k ) {
5862 const SIMDType a1( A.load(i ,k) );
5863 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5864 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5865 const SIMDType b1(
set( B(k,j ) ) );
5866 const SIMDType b2(
set( B(k,j+1UL) ) );
5875 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5876 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
5877 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
5878 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
5879 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
5880 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
5892 SIMDType xmm1, xmm2, xmm3;
5894 for(
size_t k=kbegin; k<kend; ++k ) {
5895 const SIMDType b1(
set( B(k,j) ) );
5896 xmm1 += A.load(i ,k) * b1;
5897 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5898 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5901 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5902 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5903 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5907 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5909 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
5910 size_t j( UPP ? i : 0UL );
5912 for( ; (j+2UL) <= jend; j+=2UL )
5925 SIMDType xmm1, xmm2, xmm3, xmm4;
5927 for(
size_t k=kbegin; k<kend; ++k ) {
5928 const SIMDType a1( A.load(i ,k) );
5929 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5930 const SIMDType b1(
set( B(k,j ) ) );
5931 const SIMDType b2(
set( B(k,j+1UL) ) );
5938 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5939 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
5940 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5941 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
5953 SIMDType xmm1, xmm2;
5955 for(
size_t k=kbegin; k<kend; ++k ) {
5956 const SIMDType b1(
set( B(k,j) ) );
5957 xmm1 += A.load(i ,k) * b1;
5958 xmm2 += A.load(i+SIMDSIZE,k) * b1;
5961 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5962 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
5966 for( ; i<ipos; i+=SIMDSIZE )
5968 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
5969 size_t j( UPP ? i : 0UL );
5971 for( ; (j+2UL) <= jend; j+=2UL )
5982 SIMDType xmm1, xmm2;
5984 for(
size_t k=kbegin; k<kend; ++k ) {
5985 const SIMDType a1( A.load(i,k) );
5986 xmm1 += a1 *
set( B(k,j ) );
5987 xmm2 += a1 *
set( B(k,j+1UL) );
5990 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5991 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
6004 for(
size_t k=kbegin; k<K; ++k ) {
6005 xmm1 += A.load(i,k) *
set( B(k,j) );
6008 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
6012 for( ; remainder && i<M; ++i )
6014 const size_t jend( LOW ? i+1UL : N );
6015 size_t j( UPP ? i : 0UL );
6017 for( ; (j+2UL) <= jend; j+=2UL )
6031 for(
size_t k=kbegin; k<kend; ++k ) {
6032 value1 += A(i,k) * B(k,j );
6033 value2 += A(i,k) * B(k,j+1UL);
6036 (~C)(i,j ) += value1 * scalar;
6037 (~C)(i,j+1UL) += value2 * scalar;
6050 for(
size_t k=kbegin; k<K; ++k ) {
6051 value += A(i,k) * B(k,j);
6054 (~C)(i,j) += value * scalar;
6074 template<
typename MT3
6079 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6081 selectDefaultAddAssignKernel( C, A, B, scalar );
6100 template<
typename MT3
6105 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6108 lmmm( C, A, B, scalar, ST2(1) );
6110 ummm( C, A, B, scalar, ST2(1) );
6112 mmm( C, A, B, scalar, ST2(1) );
6131 template<
typename MT3
6136 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6138 selectLargeAddAssignKernel( C, A, B, scalar );
6143 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6157 template<
typename MT3
6162 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6168 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6169 addAssign( C, tmp );
6173 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6174 addAssign( C, tmp );
6177 gemm( C, A, B, ET(scalar), ET(1) );
6198 template<
typename MT >
6209 const ForwardFunctor fwd;
6217 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
6219 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
6239 template<
typename MT
6252 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6266 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
6281 template<
typename MT3
6285 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6288 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6289 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6290 selectSmallSubAssignKernel( C, A, B, scalar );
6292 selectBlasSubAssignKernel( C, A, B, scalar );
6310 template<
typename MT3
6315 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6317 const ResultType tmp(
serial( A * B * scalar ) );
6318 subAssign( C, tmp );
6336 template<
typename MT3
6340 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6341 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6345 const size_t M( A.rows() );
6346 const size_t N( B.columns() );
6348 for(
size_t j=0UL; j<N; ++j )
6358 const size_t inum( iend - ibegin );
6359 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6361 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6362 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
6363 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
6366 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
6386 template<
typename MT3
6391 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6395 const size_t M( A.rows() );
6396 const size_t N( B.columns() );
6398 for(
size_t j=0UL; j<N; ++j )
6408 const size_t inum( iend - ibegin );
6409 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6411 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6412 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6413 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6416 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
6436 template<
typename MT3
6441 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6445 for(
size_t i=0UL; i<A.rows(); ++i ) {
6446 C(i,i) -= A(i,i) * B(i,i) * scalar;
6465 template<
typename MT3
6470 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6472 selectDefaultSubAssignKernel( C, A, B, scalar );
6491 template<
typename MT3
6503 const ForwardFunctor fwd;
6507 subAssign( ~C, fwd( A * tmp ) * scalar );
6511 subAssign( ~C, fwd( tmp * B ) * scalar );
6513 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6515 subAssign( ~C, fwd( A * tmp ) * scalar );
6519 subAssign( ~C, fwd( tmp * B ) * scalar );
6539 template<
typename MT3
6548 const size_t M( A.rows() );
6549 const size_t N( B.columns() );
6550 const size_t K( A.columns() );
6554 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
6557 const SIMDType factor(
set( scalar ) );
6563 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6564 for(
size_t j=0UL; j<N; ++j )
6577 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6579 for(
size_t k=kbegin; k<kend; ++k ) {
6580 const SIMDType b1(
set( B(k,j) ) );
6581 xmm1 += A.load(i ,k) * b1;
6582 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6583 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6584 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6585 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6586 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6587 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6588 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6591 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6592 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6593 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6594 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
6595 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
6596 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
6597 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
6598 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
6603 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6607 for( ; (j+2UL) <= N; j+=2UL )
6620 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6622 for(
size_t k=kbegin; k<kend; ++k ) {
6623 const SIMDType a1( A.load(i ,k) );
6624 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6625 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6626 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6627 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6628 const SIMDType b1(
set( B(k,j ) ) );
6629 const SIMDType b2(
set( B(k,j+1UL) ) );
6642 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6643 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
6644 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
6645 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
6646 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
6647 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
6648 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
6649 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
6650 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
6651 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
6663 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6665 for(
size_t k=kbegin; k<kend; ++k ) {
6666 const SIMDType b1(
set( B(k,j) ) );
6667 xmm1 += A.load(i ,k) * b1;
6668 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6669 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6670 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6671 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6674 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6675 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6676 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6677 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
6678 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
6682 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6686 for( ; (j+2UL) <= N; j+=2UL )
6699 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6701 for(
size_t k=kbegin; k<kend; ++k ) {
6702 const SIMDType a1( A.load(i ,k) );
6703 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6704 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6705 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6706 const SIMDType b1(
set( B(k,j ) ) );
6707 const SIMDType b2(
set( B(k,j+1UL) ) );
6718 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6719 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
6720 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
6721 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
6722 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
6723 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
6724 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
6725 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
6737 SIMDType xmm1, xmm2, xmm3, xmm4;
6739 for(
size_t k=kbegin; k<kend; ++k ) {
6740 const SIMDType b1(
set( B(k,j) ) );
6741 xmm1 += A.load(i ,k) * b1;
6742 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6743 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6744 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6747 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6748 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6749 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6750 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
6754 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6758 for( ; (j+2UL) <= N; j+=2UL )
6771 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6773 for(
size_t k=kbegin; k<kend; ++k ) {
6774 const SIMDType a1( A.load(i ,k) );
6775 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6776 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6777 const SIMDType b1(
set( B(k,j ) ) );
6778 const SIMDType b2(
set( B(k,j+1UL) ) );
6787 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6788 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
6789 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
6790 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
6791 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
6792 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
6804 SIMDType xmm1, xmm2, xmm3;
6806 for(
size_t k=kbegin; k<kend; ++k ) {
6807 const SIMDType b1(
set( B(k,j) ) );
6808 xmm1 += A.load(i ,k) * b1;
6809 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6810 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6813 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6814 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6815 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6819 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6821 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
6822 size_t j( UPP ? i : 0UL );
6824 for( ; (j+2UL) <= jend; j+=2UL )
6837 SIMDType xmm1, xmm2, xmm3, xmm4;
6839 for(
size_t k=kbegin; k<kend; ++k ) {
6840 const SIMDType a1( A.load(i ,k) );
6841 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6842 const SIMDType b1(
set( B(k,j ) ) );
6843 const SIMDType b2(
set( B(k,j+1UL) ) );
6850 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6851 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
6852 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
6853 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
6865 SIMDType xmm1, xmm2;
6867 for(
size_t k=kbegin; k<kend; ++k ) {
6868 const SIMDType b1(
set( B(k,j) ) );
6869 xmm1 += A.load(i ,k) * b1;
6870 xmm2 += A.load(i+SIMDSIZE,k) * b1;
6873 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6874 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
6878 for( ; i<ipos; i+=SIMDSIZE )
6880 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
6881 size_t j( UPP ? i : 0UL );
6883 for( ; (j+2UL) <= jend; j+=2UL )
6894 SIMDType xmm1, xmm2;
6896 for(
size_t k=kbegin; k<kend; ++k ) {
6897 const SIMDType a1( A.load(i,k) );
6898 xmm1 += a1 *
set( B(k,j ) );
6899 xmm2 += a1 *
set( B(k,j+1UL) );
6902 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6903 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
6916 for(
size_t k=kbegin; k<K; ++k ) {
6917 xmm1 += A.load(i,k) *
set( B(k,j) );
6920 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
6924 for( ; remainder && i<M; ++i )
6926 const size_t jend( LOW ? i+1UL : N );
6927 size_t j( UPP ? i : 0UL );
6929 for( ; (j+2UL) <= jend; j+=2UL )
6943 for(
size_t k=kbegin; k<kend; ++k ) {
6944 value1 += A(i,k) * B(k,j );
6945 value2 += A(i,k) * B(k,j+1UL);
6948 (~C)(i,j ) -= value1 * scalar;
6949 (~C)(i,j+1UL) -= value2 * scalar;
6962 for(
size_t k=kbegin; k<K; ++k ) {
6963 value += A(i,k) * B(k,j);
6966 (~C)(i,j) -= value * scalar;
6986 template<
typename MT3
6991 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6993 selectDefaultSubAssignKernel( C, A, B, scalar );
7012 template<
typename MT3
7017 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7020 lmmm( C, A, B, -scalar, ST2(1) );
7022 ummm( C, A, B, -scalar, ST2(1) );
7024 mmm( C, A, B, -scalar, ST2(1) );
7043 template<
typename MT3
7048 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7050 selectLargeSubAssignKernel( C, A, B, scalar );
7055 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7069 template<
typename MT3
7074 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7080 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7081 subAssign( C, tmp );
7085 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7086 subAssign( C, tmp );
7089 gemm( C, A, B, ET(-scalar), ET(1) );
7109 template<
typename MT >
7120 const ForwardFunctor fwd;
7128 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
7130 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
7161 template<
typename MT
7174 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7177 else if( left.columns() == 0UL ) {
7211 template<
typename MT
7230 const ForwardFunctor fwd;
7232 const TmpType tmp( rhs );
7251 template<
typename MT >
7262 const ForwardFunctor fwd;
7291 template<
typename MT
7304 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7337 template<
typename MT >
7348 const ForwardFunctor fwd;
7381 template<
typename MT
7394 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7427 template<
typename MT >
7438 const ForwardFunctor fwd;
7514 template<
typename T1
7561 template<
typename MT1
7605 template<
typename MT1
7649 template<
typename MT1
7693 template<
typename MT1
7737 template<
typename MT1
7768 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7769 struct Rows< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Rows<MT1>
7785 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7786 struct Columns< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > :
public Columns<MT2>
7802 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7803 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7804 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7820 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7821 struct IsSymmetric< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7824 , IsBuiltin< ElementType_< TDMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
7825 , And< Bool<LF>, Bool<UF> > >::value >
7841 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
7842 struct IsHermitian< TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
7859 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7860 struct IsLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7862 , And< IsLower<MT1>, IsLower<MT2> >
7863 , And< Or< Bool<SF>, Bool<HF> >
7864 , IsUpper<MT1>, IsUpper<MT2> > >::value >
7880 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7881 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7882 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
7883 , And< Or< Bool<SF>, Bool<HF> >
7884 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
7900 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7902 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7903 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
7904 , And< Or< Bool<SF>, Bool<HF> >
7905 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7906 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
7922 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7923 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7925 , And< IsUpper<MT1>, IsUpper<MT2> >
7926 , And< Or< Bool<SF>, Bool<HF> >
7927 , IsLower<MT1>, IsLower<MT2> > >::value >
7943 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7944 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
7945 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
7946 , And< Or< Bool<SF>, Bool<HF> >
7947 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
7963 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
7965 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7966 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
7967 , And< Or< Bool<SF>, Bool<HF> >
7968 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7969 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
7985 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
8003 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
typename VT >
8021 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8039 template<
typename VT,
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8057 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8074 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8091 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8108 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8125 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8142 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF,
bool AF >
8157 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8158 struct RowExprTrait< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8171 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
typename SubmatrixExprTrait< MT, AF >::Type SubmatrixExprTrait_
Auxiliary alias declaration for the SubmatrixExprTrait type trait. The SubmatrixExprTrait_ alias decla...
Definition: SubmatrixExprTrait.h:134
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Evaluation of the expression type of a dense matrix declherm operation. Via this type trait it is poss...
Definition: TDMatDeclHermExprTrait.h:75
Compile time check for row vector types. This type trait tests whether or not the given template argum...
Definition: IsRowVector.h:80
const DMatForEachExpr< MT, Conj, SO > conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatForEachExpr.h:1214
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:310
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:174
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Evaluation of the expression type of a dense matrix decllow operation. Via this type trait it is possi...
Definition: TDMatDeclLowExprTrait.h:75
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:307
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:505
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:304
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:196
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:495
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:177
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:560
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:298
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:178
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:633
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:550
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:66
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:342
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Evaluation of the expression type of a sparse vector/transpose dense matrix multiplication. Via this type trait it is possible to evaluate the resulting expression type of a sparse vector/transpose dense matrix multiplication. Given the transpose sparse vector type VT and the column-major dense matrix type MT, the nested type Type corresponds to the resulting expression type. In case either VT is not a transpose sparse vector type or MT is not a column-major dense matrix type, the resulting data type Type is set to INVALID_TYPE.
Definition: TSVecTDMatMultExprTrait.h:81
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
DisableIf_< IsSymmetric< MT >, const DMatDeclSymExpr< MT, SO > > declsym(const DenseMatrix< MT, SO > &dm)
Declares the given non-symmetric dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:841
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:144
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:195
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:313
Header file for the TDMatDeclDiagExprTrait class template.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:441
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:119
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
DisableIf_< IsHermitian< MT >, const DMatDeclHermExpr< MT, SO > > declherm(const DenseMatrix< MT, SO > &dm)
Declares the given non-Hermitian dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:841
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:198
Header file for the DisableIf class template.
Compile time check for dense vector types.This type trait tests whether or not the given template par...
Definition: IsDenseVector.h:78
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:451
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:300
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:83
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Header file for the TSVecTDMatMultExprTrait class template.
Evaluation of the expression type of a dense matrix declupp operation.Via this type trait it is possi...
Definition: TDMatDeclUppExprTrait.h:75
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:123
Header file for the TDMatSVecMultExprTrait class template.
Header file for the TDMatDeclHermExprTrait class template.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:475
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:197
Header file for the TDMatDeclUppExprTrait class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Evaluation of the expression type of a dense matrix decldiag operation.Via this type trait it is poss...
Definition: TDMatDeclDiagExprTrait.h:75
Compile time check for sparse vector types.This type trait tests whether or not the given template pa...
Definition: IsSparseVector.h:78
Evaluation of the expression type of a submatrix operation. Via this type trait it is possible to...
Definition: SubmatrixExprTrait.h:80
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:303
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Compile time check for column vector types.This type trait tests whether or not the given template ar...
Definition: IsColumnVector.h:80
Evaluation of the expression type of a dense matrix declsym operation.Via this type trait it is possi...
Definition: TDMatDeclSymExprTrait.h:75
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
DisableIf_< IsLower< MT >, const DMatDeclLowExpr< MT, SO > > decllow(const DenseMatrix< MT, SO > &dm)
Declares the given non-lower dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:842
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Evaluation of the expression type of a row operation. Via this type trait it is possible to evalu...
Definition: RowExprTrait.h:79
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:632
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:260
Header file for the DeclDiag functor.
Compile time check for dense matrix types.This type trait tests whether or not the given template par...
Definition: IsDenseMatrix.h:78
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:179
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:506
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:316
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
Header file for the conjugate shim.
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Evaluation of the expression type of a transpose dense matrix/sparse vector multiplication. Via this type trait it is possible to evaluate the resulting expression type of a transpose dense matrix/sparse vector multiplication. Given the column-major dense matrix type MT and the non-transpose sparse vector type VT, the nested type Type corresponds to the resulting expression type. In case either MT is not a column-major dense matrix type or VT is not a non-transpose sparse vector type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDMatSVecMultExprTrait.h:79
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:431
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
typename TDVecTDMatMultExprTrait< VT, MT >::Type TDVecTDMatMultExprTrait_
Auxiliary alias declaration for the TDVecTDMatMultExprTrait class template.The TDVecTDMatMultExprTrai...
Definition: TDVecTDMatMultExprTrait.h:120
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:301
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:83
Utility type for generic codes.
Header file for the TDMatDeclLowExprTrait class template.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:175
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:405
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
Compile time type negation. The Not class template negates the given compile time condition. In case the given condition would evaluate to true, the nested member enumeration is set to false and vice versa:
Definition: Not.h:70
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:94
TDMatTDMatMultExpr< MT1, MT2, SF, HF, LF, UF > This
Type of this TDMatTDMatMultExpr instance.
Definition: TDMatTDMatMultExpr.h:296
Constraint on the data type.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:302
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
Header file for the TDMatDeclSymExprTrait class template.
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename TDMatDVecMultExprTrait< MT, VT >::Type TDMatDVecMultExprTrait_
Auxiliary alias declaration for the TDMatDVecMultExprTrait class template.The TDMatDVecMultExprTrait_...
Definition: TDMatDVecMultExprTrait.h:120
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:176
Evaluation of the expression type of a dense vector/transpose dense matrix multiplication. Via this type trait it is possible to evaluate the resulting expression type of a dense vector/transpose dense matrix multiplication. Given the transpose dense vector type VT and the column-major dense matrix type MT, the nested type Type corresponds to the resulting expression type. In case either VT is not a transpose dense vector type or MT is not a column-major dense matrix type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDVecTDMatMultExprTrait.h:79
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:299
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:733
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDMatDVecMultExprTrait class template.
Evaluation of the expression type of a transpose dense matrix/dense vector multiplication. Via this type trait it is possible to evaluate the resulting expression type of a transpose dense matrix/dense vector multiplication. Given the column-major dense matrix type MT and the non-transpose dense vector type VT, the nested type Type corresponds to the resulting expression type. In case either MT is not a column-major dense matrix type or VT is not a non-transpose dense vector type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDMatDVecMultExprTrait.h:79
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:421
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:463
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:76
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:357
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the IsColumnVector type trait.
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:485
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:677
Header file for the IsResizable type trait.
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7505
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
DisableIf_< IsDiagonal< MT >, const DMatDeclDiagExpr< MT, SO > > decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given non-diagonal dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:841
DisableIf_< IsUpper< MT >, const DMatDeclUppExpr< MT, SO > > declupp(const DenseMatrix< MT, SO > &dm)
Declares the given non-upper dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:842
Evaluation of the expression type of a column operation. Via this type trait it is possible to ev...
Definition: ColumnExprTrait.h:78
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.