35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 145 template<
typename MT1
152 :
public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
178 SYM = ( SF && !( HF || LF || UF ) ),
179 HERM = ( HF && !( LF || UF ) ),
180 LOW = ( LF || ( ( SF || HF ) && UF ) ),
181 UPP = ( UF || ( ( SF || HF ) && LF ) )
193 template<
typename T1,
typename T2,
typename T3 >
194 struct CanExploitSymmetry {
207 template<
typename T1,
typename T2,
typename T3 >
208 struct IsEvaluationRequired {
209 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
210 !CanExploitSymmetry<T1,T2,T3>::value };
220 template<
typename T1,
typename T2,
typename T3 >
221 struct UseBlasKernel {
222 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
228 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
243 template<
typename T1,
typename T2,
typename T3 >
244 struct UseVectorizedDefaultKernel {
245 enum :
bool { value = useOptimizedKernels &&
247 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
305 MT1::simdEnabled && MT2::simdEnabled &&
310 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
311 !evaluateRight && MT2::smpAssignable };
366 :(
lhs_.columns() ) ) );
370 const size_t n(
end - begin );
389 if( i >=
lhs_.rows() ) {
392 if( j >=
rhs_.columns() ) {
404 inline size_t rows() const noexcept {
415 return rhs_.columns();
445 template<
typename T >
446 inline bool canAlias(
const T* alias )
const noexcept {
447 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
457 template<
typename T >
458 inline bool isAliased(
const T* alias )
const noexcept {
459 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
469 return lhs_.isAligned() &&
rhs_.isAligned();
480 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
482 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
483 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
507 template<
typename MT
517 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
520 else if( rhs.
lhs_.columns() == 0UL ) {
535 DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
551 template<
typename MT3
554 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
557 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
558 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
559 selectSmallAssignKernel( C, A, B );
561 selectBlasAssignKernel( C, A, B );
580 template<
typename MT3
584 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
586 const size_t M( A.rows() );
587 const size_t N( B.columns() );
588 const size_t K( A.columns() );
592 for(
size_t i=0UL; i<M; ++i )
603 for(
size_t j=0UL; j<N; ++j ) {
612 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
613 :( UPP ?
max(i,kbegin) : kbegin ) )
614 :( UPP ? i : 0UL ) );
617 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
618 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
619 :( LOW ? i+1UL : N ) );
622 for(
size_t j=0UL; j<jbegin; ++j ) {
629 for(
size_t j=jbegin; j<jend; ++j ) {
630 C(i,j) = A(i,kbegin) * B(kbegin,j);
633 for(
size_t j=jend; j<N; ++j ) {
642 for(
size_t k=kbegin+1UL; k<kend; ++k )
646 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
647 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
648 :( SYM || HERM || UPP ? i : 0UL ) );
651 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
652 :( LOW ?
min(i+1UL,k) : k ) )
653 :( LOW ? i+1UL : N ) );
655 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
658 for(
size_t j=jbegin; j<jend; ++j ) {
659 C(i,j) += A(i,k) * B(k,j);
662 C(i,jend) = A(i,k) * B(k,jend);
668 for(
size_t i=1UL; i<M; ++i ) {
669 for(
size_t j=0UL; j<i; ++j ) {
670 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
692 template<
typename MT3
695 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
696 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
700 const size_t M( A.rows() );
701 const size_t N( B.columns() );
703 for(
size_t i=0UL; i<M; ++i )
714 for(
size_t j=0UL; j<jbegin; ++j ) {
718 for(
size_t j=jbegin; j<jend; ++j ) {
719 C(i,j) = A(i,j) * B(j,j);
722 for(
size_t j=jend; j<N; ++j ) {
745 template<
typename MT3
749 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
753 const size_t M( A.rows() );
754 const size_t N( B.columns() );
756 for(
size_t i=0UL; i<M; ++i )
767 for(
size_t j=0UL; j<jbegin; ++j ) {
771 for(
size_t j=jbegin; j<jend; ++j ) {
772 C(i,j) = A(i,i) * B(i,j);
775 for(
size_t j=jend; j<N; ++j ) {
798 template<
typename MT3
802 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
808 for(
size_t i=0UL; i<A.rows(); ++i ) {
809 C(i,i) = A(i,i) * B(i,i);
828 template<
typename MT3
832 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
834 selectDefaultAssignKernel( C, A, B );
854 template<
typename MT3
862 const size_t M( A.rows() );
863 const size_t N( B.columns() );
864 const size_t K( A.columns() );
868 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
871 if( LOW && UPP && N > SIMDSIZE*3UL ) {
880 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
881 for(
size_t i=0UL; i<M; ++i )
894 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
896 for(
size_t k=kbegin; k<kend; ++k ) {
898 xmm1 += a1 * B.load(k,j );
899 xmm2 += a1 * B.load(k,j+SIMDSIZE );
900 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
901 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
902 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
903 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
904 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
905 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
908 (~C).store( i, j , xmm1 );
909 (~C).store( i, j+SIMDSIZE , xmm2 );
910 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
911 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
912 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
913 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
914 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
915 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
920 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
924 for( ; (i+2UL) <= M; i+=2UL )
937 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
939 for(
size_t k=kbegin; k<kend; ++k ) {
940 const SIMDType a1(
set( A(i ,k) ) );
941 const SIMDType a2(
set( A(i+1UL,k) ) );
943 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
944 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
945 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
946 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
959 (~C).store( i , j , xmm1 );
960 (~C).store( i , j+SIMDSIZE , xmm2 );
961 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
962 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
963 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
964 (~C).store( i+1UL, j , xmm6 );
965 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
966 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
967 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
968 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
980 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
982 for(
size_t k=kbegin; k<kend; ++k ) {
984 xmm1 += a1 * B.load(k,j );
985 xmm2 += a1 * B.load(k,j+SIMDSIZE );
986 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
987 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
988 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
991 (~C).store( i, j , xmm1 );
992 (~C).store( i, j+SIMDSIZE , xmm2 );
993 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
994 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
995 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
999 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1001 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
1002 size_t i( LOW ? j : 0UL );
1004 for( ; (i+2UL) <= iend; i+=2UL )
1017 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1019 for(
size_t k=kbegin; k<kend; ++k ) {
1020 const SIMDType a1(
set( A(i ,k) ) );
1021 const SIMDType a2(
set( A(i+1UL,k) ) );
1023 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1024 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1025 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1036 (~C).store( i , j , xmm1 );
1037 (~C).store( i , j+SIMDSIZE , xmm2 );
1038 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1039 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1040 (~C).store( i+1UL, j , xmm5 );
1041 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1042 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1043 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1057 for(
size_t k=kbegin; k<kend; ++k ) {
1058 const SIMDType a1(
set( A(i,k) ) );
1059 xmm1 += a1 * B.load(k,j );
1060 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1061 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1062 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1065 (~C).store( i, j , xmm1 );
1066 (~C).store( i, j+SIMDSIZE , xmm2 );
1067 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1068 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1072 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1074 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
1075 size_t i( LOW ? j : 0UL );
1077 for( ; (i+2UL) <= iend; i+=2UL )
1090 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1092 for(
size_t k=kbegin; k<kend; ++k ) {
1093 const SIMDType a1(
set( A(i ,k) ) );
1094 const SIMDType a2(
set( A(i+1UL,k) ) );
1096 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1097 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1106 (~C).store( i , j , xmm1 );
1107 (~C).store( i , j+SIMDSIZE , xmm2 );
1108 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1109 (~C).store( i+1UL, j , xmm4 );
1110 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1111 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1125 for(
size_t k=kbegin; k<kend; ++k ) {
1126 const SIMDType a1(
set( A(i,k) ) );
1127 xmm1 += a1 * B.load(k,j );
1128 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1129 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1132 (~C).store( i, j , xmm1 );
1133 (~C).store( i, j+SIMDSIZE , xmm2 );
1134 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1138 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1140 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
1141 size_t i( LOW ? j : 0UL );
1143 for( ; (i+4UL) <= iend; i+=4UL )
1156 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1158 for(
size_t k=kbegin; k<kend; ++k ) {
1159 const SIMDType a1(
set( A(i ,k) ) );
1160 const SIMDType a2(
set( A(i+1UL,k) ) );
1161 const SIMDType a3(
set( A(i+2UL,k) ) );
1162 const SIMDType a4(
set( A(i+3UL,k) ) );
1164 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1175 (~C).store( i , j , xmm1 );
1176 (~C).store( i , j+SIMDSIZE, xmm2 );
1177 (~C).store( i+1UL, j , xmm3 );
1178 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1179 (~C).store( i+2UL, j , xmm5 );
1180 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1181 (~C).store( i+3UL, j , xmm7 );
1182 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
1185 for( ; (i+3UL) <= iend; i+=3UL )
1198 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1200 for(
size_t k=kbegin; k<kend; ++k ) {
1201 const SIMDType a1(
set( A(i ,k) ) );
1202 const SIMDType a2(
set( A(i+1UL,k) ) );
1203 const SIMDType a3(
set( A(i+2UL,k) ) );
1205 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1214 (~C).store( i , j , xmm1 );
1215 (~C).store( i , j+SIMDSIZE, xmm2 );
1216 (~C).store( i+1UL, j , xmm3 );
1217 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1218 (~C).store( i+2UL, j , xmm5 );
1219 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1222 for( ; (i+2UL) <= iend; i+=2UL )
1235 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1238 for( ; (k+2UL) <= kend; k+=2UL ) {
1239 const SIMDType a1(
set( A(i ,k ) ) );
1240 const SIMDType a2(
set( A(i+1UL,k ) ) );
1241 const SIMDType a3(
set( A(i ,k+1UL) ) );
1242 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
1243 const SIMDType b1( B.load(k ,j ) );
1244 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1245 const SIMDType b3( B.load(k+1UL,j ) );
1246 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1257 for( ; k<kend; ++k ) {
1258 const SIMDType a1(
set( A(i ,k) ) );
1259 const SIMDType a2(
set( A(i+1UL,k) ) );
1261 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1268 (~C).store( i , j , xmm1+xmm5 );
1269 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
1270 (~C).store( i+1UL, j , xmm3+xmm7 );
1271 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1286 for( ; (k+2UL) <= kend; k+=2UL ) {
1287 const SIMDType a1(
set( A(i,k ) ) );
1288 const SIMDType a2(
set( A(i,k+1UL) ) );
1289 xmm1 += a1 * B.load(k ,j );
1290 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1291 xmm3 += a2 * B.load(k+1UL,j );
1292 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1295 for( ; k<kend; ++k ) {
1296 const SIMDType a1(
set( A(i,k) ) );
1297 xmm1 += a1 * B.load(k,j );
1298 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1301 (~C).store( i, j , xmm1+xmm3 );
1302 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
1306 for( ; j<jpos; j+=SIMDSIZE )
1308 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
1309 size_t i( LOW ? j : 0UL );
1311 for( ; (i+4UL) <= iend; i+=4UL )
1322 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1325 for( ; (k+2UL) <= kend; k+=2UL ) {
1327 const SIMDType b2( B.load(k+1UL,j) );
1328 xmm1 +=
set( A(i ,k ) ) * b1;
1329 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1330 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1331 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1332 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1333 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1334 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1335 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1338 for( ; k<kend; ++k ) {
1340 xmm1 +=
set( A(i ,k) ) * b1;
1341 xmm2 +=
set( A(i+1UL,k) ) * b1;
1342 xmm3 +=
set( A(i+2UL,k) ) * b1;
1343 xmm4 +=
set( A(i+3UL,k) ) * b1;
1346 (~C).store( i , j, xmm1+xmm5 );
1347 (~C).store( i+1UL, j, xmm2+xmm6 );
1348 (~C).store( i+2UL, j, xmm3+xmm7 );
1349 (~C).store( i+3UL, j, xmm4+xmm8 );
1352 for( ; (i+3UL) <= iend; i+=3UL )
1363 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1366 for( ; (k+2UL) <= kend; k+=2UL ) {
1368 const SIMDType b2( B.load(k+1UL,j) );
1369 xmm1 +=
set( A(i ,k ) ) * b1;
1370 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1371 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1372 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1373 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1374 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1377 for( ; k<kend; ++k ) {
1379 xmm1 +=
set( A(i ,k) ) * b1;
1380 xmm2 +=
set( A(i+1UL,k) ) * b1;
1381 xmm3 +=
set( A(i+2UL,k) ) * b1;
1384 (~C).store( i , j, xmm1+xmm4 );
1385 (~C).store( i+1UL, j, xmm2+xmm5 );
1386 (~C).store( i+2UL, j, xmm3+xmm6 );
1389 for( ; (i+2UL) <= iend; i+=2UL )
1403 for( ; (k+2UL) <= kend; k+=2UL ) {
1405 const SIMDType b2( B.load(k+1UL,j) );
1406 xmm1 +=
set( A(i ,k ) ) * b1;
1407 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1408 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1409 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1412 for( ; k<kend; ++k ) {
1414 xmm1 +=
set( A(i ,k) ) * b1;
1415 xmm2 +=
set( A(i+1UL,k) ) * b1;
1418 (~C).store( i , j, xmm1+xmm3 );
1419 (~C).store( i+1UL, j, xmm2+xmm4 );
1433 for( ; (k+2UL) <= K; k+=2UL ) {
1434 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1435 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1439 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1442 (~C).store( i, j, xmm1+xmm2 );
1446 for( ; remainder && j<N; ++j )
1448 size_t i( LOW && UPP ? j : 0UL );
1450 for( ; (i+2UL) <= M; i+=2UL )
1464 for(
size_t k=kbegin; k<kend; ++k ) {
1465 value1 += A(i ,k) * B(k,j);
1466 value2 += A(i+1UL,k) * B(k,j);
1469 (~C)(i ,j) = value1;
1470 (~C)(i+1UL,j) = value2;
1483 for(
size_t k=kbegin; k<K; ++k ) {
1484 value += A(i,k) * B(k,j);
1492 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1493 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1494 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1495 for(
size_t j=0UL; j<jend; ++j ) {
1496 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
1500 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1501 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1502 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1503 for(
size_t i=0UL; i<iend; ++i ) {
1508 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1509 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1510 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1511 for(
size_t j=0UL; j<jend; ++j ) {
1535 template<
typename MT3
1546 const ForwardFunctor fwd;
1550 assign( ~C, fwd( tmp * B ) );
1554 assign( ~C, fwd( A * tmp ) );
1556 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1558 assign( ~C, fwd( tmp * B ) );
1562 assign( ~C, fwd( A * tmp ) );
1581 template<
typename MT3
1585 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1587 selectDefaultAssignKernel( C, A, B );
1606 template<
typename MT3
1610 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1639 template<
typename MT3
1643 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1645 selectLargeAssignKernel( C, A, B );
1651 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1664 template<
typename MT3
1668 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1681 gemm( C, A, B, ET(1), ET(0) );
1701 template<
typename MT
1720 const ForwardFunctor fwd;
1722 const TmpType tmp(
serial( rhs ) );
1723 assign( ~lhs, fwd( tmp ) );
1743 template<
typename MT >
1754 const ForwardFunctor fwd;
1779 template<
typename MT
1789 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
1803 DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1819 template<
typename MT3
1822 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1825 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
1826 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1827 selectSmallAddAssignKernel( C, A, B );
1829 selectBlasAddAssignKernel( C, A, B );
1848 template<
typename MT3
1852 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1854 const size_t M( A.rows() );
1855 const size_t N( B.columns() );
1856 const size_t K( A.columns() );
1860 for(
size_t i=0UL; i<M; ++i )
1870 for(
size_t k=kbegin; k<kend; ++k )
1874 ?( UPP ?
max(i,k+1UL) : k+1UL )
1875 :( UPP ?
max(i,k) : k ) )
1876 :( UPP ? i : 0UL ) );
1879 ?( LOW ?
min(i+1UL,k) : k )
1880 :( LOW ?
min(i,k)+1UL : k+1UL ) )
1881 :( LOW ? i+1UL : N ) );
1883 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
1886 const size_t jnum( jend - jbegin );
1887 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1889 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1890 C(i,j ) += A(i,k) * B(k,j );
1891 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1894 C(i,jpos) += A(i,k) * B(k,jpos);
1916 template<
typename MT3
1919 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
1920 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1924 const size_t M( A.rows() );
1925 const size_t N( B.columns() );
1927 for(
size_t i=0UL; i<M; ++i )
1937 const size_t jnum( jend - jbegin );
1938 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1940 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1941 C(i,j ) += A(i,j ) * B(j ,j );
1942 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1945 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
1966 template<
typename MT3
1970 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1974 const size_t M( A.rows() );
1975 const size_t N( B.columns() );
1977 for(
size_t i=0UL; i<M; ++i )
1987 const size_t jnum( jend - jbegin );
1988 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1990 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1991 C(i,j ) += A(i,i) * B(i,j );
1992 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
1995 C(i,jpos) += A(i,i) * B(i,jpos);
2016 template<
typename MT3
2020 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2024 for(
size_t i=0UL; i<A.rows(); ++i ) {
2025 C(i,i) += A(i,i) * B(i,i);
2045 template<
typename MT3
2049 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2051 selectDefaultAddAssignKernel( C, A, B );
2071 template<
typename MT3
2079 const size_t M( A.rows() );
2080 const size_t N( B.columns() );
2081 const size_t K( A.columns() );
2085 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
2092 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2093 for(
size_t i=0UL; i<M; ++i )
2107 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2108 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2109 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2110 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2111 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2112 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2113 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2115 for(
size_t k=kbegin; k<kend; ++k ) {
2116 const SIMDType a1(
set( A(i,k) ) );
2117 xmm1 += a1 * B.load(k,j );
2118 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2119 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2120 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2121 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2122 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2123 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2124 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2127 (~C).store( i, j , xmm1 );
2128 (~C).store( i, j+SIMDSIZE , xmm2 );
2129 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2130 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2131 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2132 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2133 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2134 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
2139 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2143 for( ; (i+2UL) <= M; i+=2UL )
2156 SIMDType xmm1 ( (~C).load(i ,j ) );
2157 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
2158 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
2159 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
2160 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
2161 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
2162 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
2163 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2164 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2165 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
2167 for(
size_t k=kbegin; k<kend; ++k ) {
2168 const SIMDType a1(
set( A(i ,k) ) );
2169 const SIMDType a2(
set( A(i+1UL,k) ) );
2171 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2172 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2173 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2174 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2187 (~C).store( i , j , xmm1 );
2188 (~C).store( i , j+SIMDSIZE , xmm2 );
2189 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2190 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2191 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
2192 (~C).store( i+1UL, j , xmm6 );
2193 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
2194 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2195 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2196 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2209 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2210 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2211 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2212 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2214 for(
size_t k=kbegin; k<kend; ++k ) {
2215 const SIMDType a1(
set( A(i,k) ) );
2216 xmm1 += a1 * B.load(k,j );
2217 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2218 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2219 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2220 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2223 (~C).store( i, j , xmm1 );
2224 (~C).store( i, j+SIMDSIZE , xmm2 );
2225 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2226 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2227 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2231 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2235 for( ; (i+2UL) <= M; i+=2UL )
2249 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2250 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2251 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2252 SIMDType xmm5( (~C).load(i+1UL,j ) );
2253 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2254 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2255 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2257 for(
size_t k=kbegin; k<kend; ++k ) {
2258 const SIMDType a1(
set( A(i ,k) ) );
2259 const SIMDType a2(
set( A(i+1UL,k) ) );
2261 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2262 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2263 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2274 (~C).store( i , j , xmm1 );
2275 (~C).store( i , j+SIMDSIZE , xmm2 );
2276 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2277 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2278 (~C).store( i+1UL, j , xmm5 );
2279 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2280 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2281 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2294 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2295 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2296 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2298 for(
size_t k=kbegin; k<kend; ++k ) {
2299 const SIMDType a1(
set( A(i,k) ) );
2300 xmm1 += a1 * B.load(k,j );
2301 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2302 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2303 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2306 (~C).store( i, j , xmm1 );
2307 (~C).store( i, j+SIMDSIZE , xmm2 );
2308 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2309 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2313 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2317 for( ; (i+2UL) <= M; i+=2UL )
2331 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2332 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2333 SIMDType xmm4( (~C).load(i+1UL,j ) );
2334 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
2335 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2337 for(
size_t k=kbegin; k<kend; ++k ) {
2338 const SIMDType a1(
set( A(i ,k) ) );
2339 const SIMDType a2(
set( A(i+1UL,k) ) );
2341 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2342 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2351 (~C).store( i , j , xmm1 );
2352 (~C).store( i , j+SIMDSIZE , xmm2 );
2353 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2354 (~C).store( i+1UL, j , xmm4 );
2355 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
2356 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2369 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2370 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2372 for(
size_t k=kbegin; k<kend; ++k ) {
2373 const SIMDType a1(
set( A(i,k) ) );
2374 xmm1 += a1 * B.load(k,j );
2375 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2376 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2379 (~C).store( i, j , xmm1 );
2380 (~C).store( i, j+SIMDSIZE , xmm2 );
2381 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2385 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2387 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
2388 size_t i( LOW ? j : 0UL );
2390 for( ; (i+4UL) <= iend; i+=4UL )
2404 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2405 SIMDType xmm3( (~C).load(i+1UL,j ) );
2406 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2407 SIMDType xmm5( (~C).load(i+2UL,j ) );
2408 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
2409 SIMDType xmm7( (~C).load(i+3UL,j ) );
2410 SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
2412 for(
size_t k=kbegin; k<kend; ++k ) {
2413 const SIMDType a1(
set( A(i ,k) ) );
2414 const SIMDType a2(
set( A(i+1UL,k) ) );
2415 const SIMDType a3(
set( A(i+2UL,k) ) );
2416 const SIMDType a4(
set( A(i+3UL,k) ) );
2418 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2429 (~C).store( i , j , xmm1 );
2430 (~C).store( i , j+SIMDSIZE, xmm2 );
2431 (~C).store( i+1UL, j , xmm3 );
2432 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2433 (~C).store( i+2UL, j , xmm5 );
2434 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
2435 (~C).store( i+3UL, j , xmm7 );
2436 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
2439 for( ; (i+3UL) <= iend; i+=3UL )
2453 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2454 SIMDType xmm3( (~C).load(i+1UL,j ) );
2455 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2456 SIMDType xmm5( (~C).load(i+2UL,j ) );
2457 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
2459 for(
size_t k=kbegin; k<kend; ++k ) {
2460 const SIMDType a1(
set( A(i ,k) ) );
2461 const SIMDType a2(
set( A(i+1UL,k) ) );
2462 const SIMDType a3(
set( A(i+2UL,k) ) );
2464 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2473 (~C).store( i , j , xmm1 );
2474 (~C).store( i , j+SIMDSIZE, xmm2 );
2475 (~C).store( i+1UL, j , xmm3 );
2476 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2477 (~C).store( i+2UL, j , xmm5 );
2478 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
2481 for( ; (i+2UL) <= iend; i+=2UL )
2495 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2496 SIMDType xmm3( (~C).load(i+1UL,j ) );
2497 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2501 for( ; (k+2UL) <= kend; k+=2UL ) {
2502 const SIMDType a1(
set( A(i ,k ) ) );
2503 const SIMDType a2(
set( A(i+1UL,k ) ) );
2504 const SIMDType a3(
set( A(i ,k+1UL) ) );
2505 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
2506 const SIMDType b1( B.load(k ,j ) );
2507 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
2508 const SIMDType b3( B.load(k+1UL,j ) );
2509 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
2520 for( ; k<kend; ++k ) {
2521 const SIMDType a1(
set( A(i ,k) ) );
2522 const SIMDType a2(
set( A(i+1UL,k) ) );
2524 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2531 (~C).store( i , j , xmm1+xmm5 );
2532 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
2533 (~C).store( i+1UL, j , xmm3+xmm7 );
2534 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
2547 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2551 for( ; (k+2UL) <= kend; k+=2UL ) {
2552 const SIMDType a1(
set( A(i,k ) ) );
2553 const SIMDType a2(
set( A(i,k+1UL) ) );
2554 xmm1 += a1 * B.load(k ,j );
2555 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
2556 xmm3 += a2 * B.load(k+1UL,j );
2557 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
2560 for( ; k<kend; ++k ) {
2561 const SIMDType a1(
set( A(i,k) ) );
2562 xmm1 += a1 * B.load(k,j );
2563 xmm2 += a1 * B.load(k,j+SIMDSIZE);
2566 (~C).store( i, j , xmm1+xmm3 );
2567 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
2571 for( ; j<jpos; j+=SIMDSIZE )
2573 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
2574 size_t i( LOW ? j : 0UL );
2576 for( ; (i+4UL) <= iend; i+=4UL )
2588 SIMDType xmm2( (~C).load(i+1UL,j) );
2589 SIMDType xmm3( (~C).load(i+2UL,j) );
2590 SIMDType xmm4( (~C).load(i+3UL,j) );
2594 for( ; (k+2UL) <= kend; k+=2UL ) {
2596 const SIMDType b2( B.load(k+1UL,j) );
2597 xmm1 +=
set( A(i ,k ) ) * b1;
2598 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2599 xmm3 +=
set( A(i+2UL,k ) ) * b1;
2600 xmm4 +=
set( A(i+3UL,k ) ) * b1;
2601 xmm5 +=
set( A(i ,k+1UL) ) * b2;
2602 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
2603 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
2604 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
2607 for( ; k<kend; ++k ) {
2609 xmm1 +=
set( A(i ,k) ) * b1;
2610 xmm2 +=
set( A(i+1UL,k) ) * b1;
2611 xmm3 +=
set( A(i+2UL,k) ) * b1;
2612 xmm4 +=
set( A(i+3UL,k) ) * b1;
2615 (~C).store( i , j, xmm1+xmm5 );
2616 (~C).store( i+1UL, j, xmm2+xmm6 );
2617 (~C).store( i+2UL, j, xmm3+xmm7 );
2618 (~C).store( i+3UL, j, xmm4+xmm8 );
2621 for( ; (i+3UL) <= iend; i+=3UL )
2633 SIMDType xmm2( (~C).load(i+1UL,j) );
2634 SIMDType xmm3( (~C).load(i+2UL,j) );
2638 for( ; (k+2UL) <= kend; k+=2UL ) {
2640 const SIMDType b2( B.load(k+1UL,j) );
2641 xmm1 +=
set( A(i ,k ) ) * b1;
2642 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2643 xmm3 +=
set( A(i+2UL,k ) ) * b1;
2644 xmm4 +=
set( A(i ,k+1UL) ) * b2;
2645 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
2646 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
2649 for( ; k<kend; ++k ) {
2651 xmm1 +=
set( A(i ,k) ) * b1;
2652 xmm2 +=
set( A(i+1UL,k) ) * b1;
2653 xmm3 +=
set( A(i+2UL,k) ) * b1;
2656 (~C).store( i , j, xmm1+xmm4 );
2657 (~C).store( i+1UL, j, xmm2+xmm5 );
2658 (~C).store( i+2UL, j, xmm3+xmm6 );
2661 for( ; (i+2UL) <= iend; i+=2UL )
2673 SIMDType xmm2( (~C).load(i+1UL,j) );
2677 for( ; (k+2UL) <= kend; k+=2UL ) {
2679 const SIMDType b2( B.load(k+1UL,j) );
2680 xmm1 +=
set( A(i ,k ) ) * b1;
2681 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2682 xmm3 +=
set( A(i ,k+1UL) ) * b2;
2683 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
2686 for( ; k<kend; ++k ) {
2688 xmm1 +=
set( A(i ,k) ) * b1;
2689 xmm2 +=
set( A(i+1UL,k) ) * b1;
2692 (~C).store( i , j, xmm1+xmm3 );
2693 (~C).store( i+1UL, j, xmm2+xmm4 );
2708 for( ; (k+2UL) <= K; k+=2UL ) {
2709 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
2710 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
2714 xmm1 +=
set( A(i,k) ) * B.load(k,j);
2717 (~C).store( i, j, xmm1+xmm2 );
2721 for( ; remainder && j<N; ++j )
2723 const size_t iend( UPP ? j+1UL : M );
2724 size_t i( LOW ? j : 0UL );
2726 for( ; (i+2UL) <= iend; i+=2UL )
2740 for(
size_t k=kbegin; k<kend; ++k ) {
2741 value1 += A(i ,k) * B(k,j);
2742 value2 += A(i+1UL,k) * B(k,j);
2745 (~C)(i ,j) = value1;
2746 (~C)(i+1UL,j) = value2;
2759 for(
size_t k=kbegin; k<K; ++k ) {
2760 value += A(i,k) * B(k,j);
2785 template<
typename MT3
2796 const ForwardFunctor fwd;
2800 addAssign( ~C, fwd( tmp * B ) );
2804 addAssign( ~C, fwd( A * tmp ) );
2806 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2808 addAssign( ~C, fwd( tmp * B ) );
2812 addAssign( ~C, fwd( A * tmp ) );
// Large-kernel stage of the addition assignment (fallback overload).
// The only statement visible in this excerpt forwards C, A and B unchanged to
// selectDefaultAddAssignKernel(), i.e. no dedicated large kernel is used here.
// NOTE(review): the return type / enabling condition of this overload is elided
// from this excerpt - confirm which operand types select it.
2832 template<
typename MT3
2836 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Delegate to the default kernel with the same three operands.
2838 selectDefaultAddAssignKernel( C, A, B );
2858 template<
typename MT3
2862 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS stage of the addition assignment (fallback overload).
// The visible body does not call any BLAS routine; it forwards C, A and B
// unchanged to selectLargeAddAssignKernel().
// NOTE(review): the enabling condition is elided from this excerpt - presumably
// this overload is chosen when the BLAS kernel cannot be used; confirm.
2888 template<
typename MT3
2892 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Fall through to the large-matrix kernel selection.
2894 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition assignment kernel. This overload is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate
// to true in the preprocessor condition below.
// Three outcomes are visible: two paths that add a temporary `tmp` to C via
// addAssign(), and one path that calls gemm() directly on C, A and B.
// NOTE(review): the conditions selecting between these paths and the
// construction of `tmp` are elided from this excerpt.
2900 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2914 template<
typename MT3
2918 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// First path: accumulate a previously built temporary into C.
2925 addAssign( C, tmp );
// Second path: same accumulation for a (presumably differently built) temporary.
2930 addAssign( C, tmp );
// Remaining path: gemm with both scalar arguments equal to ET(1).
// NOTE(review): presumably this computes C = 1*A*B + 1*C - confirm against gemm's parameter order.
2933 gemm( C, A, B, ET(1), ET(1) );
2955 template<
typename MT >
2966 const ForwardFunctor fwd;
2995 template<
typename MT
3005 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
3019 DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Entry point for choosing the subtraction assignment kernel for C -= A * B.
// Two targets are visible: selectSmallSubAssignKernel() and
// selectBlasSubAssignKernel().
// NOTE(review): the opening of the condition (the `if(` line and its first
// operand) and the `else` keyword are elided from this excerpt.
3035 template<
typename MT3
3038 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Visible part of the condition: outside debug mode, B has at most
// SIMDSIZE*10 columns ...
3041 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
// ... or the number of elements of C is below DMATDMATMULT_THRESHOLD.
3042 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
// Condition holds: use the small-matrix kernel.
3043 selectSmallSubAssignKernel( C, A, B );
// Otherwise (presumably the else branch): use the BLAS kernel selection.
3045 selectBlasSubAssignKernel( C, A, B );
// Default (scalar, non-SIMD) subtraction assignment kernel: C(i,j) -= A(i,k) * B(k,j).
// Loop order is i (rows of A), then k (inner dimension), then j (columns), with
// the j loop processing two columns per iteration.
// NOTE(review): the declarations of kbegin/kend and of jbegin/jend are elided
// from this excerpt; only the tails of their initializer expressions are visible.
3064 template<
typename MT3
3068 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K columns of A.
3070 const size_t M( A.rows() );
3071 const size_t N( B.columns() );
3072 const size_t K( A.columns() );
// Outer loop over the rows of A.
3076 for(
size_t i=0UL; i<M; ++i )
// Inner-dimension loop over the index range [kbegin,kend).
3086 for(
size_t k=kbegin; k<kend; ++k )
// Tail of a nested conditional initializer (presumably jbegin): the start
// column is raised to at least i whenever UPP is set.
3090 ?( UPP ?
max(i,k+1UL) : k+1UL )
3091 :( UPP ?
max(i,k) : k ) )
3092 :( UPP ? i : 0UL ) );
// Tail of a nested conditional initializer (presumably jend): the end
// column is capped relative to i whenever LOW is set, otherwise it can reach N.
3095 ?( LOW ?
min(i+1UL,k) : k )
3096 :( LOW ?
min(i,k)+1UL : k+1UL ) )
3097 :( LOW ? i+1UL : N ) );
// With a LOW/UPP restriction the column range may be empty; skip this k then.
3099 if( ( LOW || UPP ) && ( jbegin >= jend ) )
continue;
// jnum: number of columns to update; jpos: jbegin plus jnum with its lowest
// bit cleared (mask size_t(-2)), i.e. the end of the two-at-a-time loop.
3102 const size_t jnum( jend - jbegin );
3103 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
3105 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3106 C(i,j ) -= A(i,k) * B(k,j );
3107 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Single update at column jpos.
// NOTE(review): the guard for this statement is elided - presumably it only
// runs when jpos < jend (odd jnum); confirm.
3110 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction assignment kernel, enabled (via EnableIf_) when MT4 is
// not diagonal and MT5 is diagonal. Only the diagonal element B(j,j) is read
// from B, so each update is C(i,j) -= A(i,j) * B(j,j) with no inner k loop.
// NOTE(review): the definitions of jbegin and jend are elided from this excerpt.
3132 template<
typename MT3
3135 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3136 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3140 const size_t M( A.rows() );
3141 const size_t N( B.columns() );
// Loop over the rows of A.
3143 for(
size_t i=0UL; i<M; ++i )
// jnum: number of columns in [jbegin,jend); jpos: jbegin plus jnum with its
// lowest bit cleared, i.e. the end of the two-at-a-time loop.
3153 const size_t jnum( jend - jbegin );
3154 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration, each scaled by its own diagonal element of B.
3156 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3157 C(i,j ) -= A(i,j ) * B(j ,j );
3158 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Single update at column jpos.
// NOTE(review): guard elided - presumably only executed when jpos < jend; confirm.
3161 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction assignment kernel that reads only the diagonal element
// A(i,i) of the left operand: C(i,j) -= A(i,i) * B(i,j), with no inner k loop.
// NOTE(review): the enabling condition of this overload is elided from this
// excerpt - presumably it requires MT4 to be diagonal and MT5 not; confirm.
// The definitions of jbegin and jend are elided as well.
3182 template<
typename MT3
3186 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3190 const size_t M( A.rows() );
3191 const size_t N( B.columns() );
// Loop over the rows of A.
3193 for(
size_t i=0UL; i<M; ++i )
// jnum: number of columns in [jbegin,jend); jpos: jbegin plus jnum with its
// lowest bit cleared, i.e. the end of the two-at-a-time loop.
3203 const size_t jnum( jend - jbegin );
3204 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration, both scaled by the same diagonal element A(i,i).
3206 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3207 C(i,j ) -= A(i,i) * B(i,j );
3208 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Single update at column jpos.
// NOTE(review): guard elided - presumably only executed when jpos < jend; confirm.
3211 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction assignment kernel that touches only diagonal elements:
// for every row index i of A, C(i,i) -= A(i,i) * B(i,i).
// NOTE(review): the enabling condition of this overload is elided from this
// excerpt - presumably both MT4 and MT5 are diagonal; confirm.
3232 template<
typename MT3
3236 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// One update per diagonal element; the loop bound is the number of rows of A.
3240 for(
size_t i=0UL; i<A.rows(); ++i ) {
3241 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix stage of the subtraction assignment (fallback overload).
// The only visible statement forwards C, A and B unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the enabling condition is elided from this excerpt - presumably
// this overload is used when the vectorized small kernel is not applicable.
3261 template<
typename MT3
3265 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Delegate to the default kernel.
3267 selectDefaultSubAssignKernel( C, A, B );
3287 template<
typename MT3
3295 const size_t M( A.rows() );
3296 const size_t N( B.columns() );
3297 const size_t K( A.columns() );
3301 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
3308 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3309 for(
size_t i=0UL; i<M; ++i )
3323 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3324 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3325 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3326 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3327 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
3328 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
3329 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
3331 for(
size_t k=kbegin; k<kend; ++k ) {
3332 const SIMDType a1(
set( A(i,k) ) );
3333 xmm1 -= a1 * B.load(k,j );
3334 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3335 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3336 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3337 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3338 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
3339 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
3340 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
3343 (~C).store( i, j , xmm1 );
3344 (~C).store( i, j+SIMDSIZE , xmm2 );
3345 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3346 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3347 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3348 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
3349 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
3350 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
3355 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3359 for( ; (i+2UL) <= M; i+=2UL )
3372 SIMDType xmm1 ( (~C).load(i ,j ) );
3373 SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
3374 SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
3375 SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
3376 SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
3377 SIMDType xmm6 ( (~C).load(i+1UL,j ) );
3378 SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
3379 SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3380 SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3381 SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
3383 for(
size_t k=kbegin; k<kend; ++k ) {
3384 const SIMDType a1(
set( A(i ,k) ) );
3385 const SIMDType a2(
set( A(i+1UL,k) ) );
3387 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3388 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3389 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3390 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3403 (~C).store( i , j , xmm1 );
3404 (~C).store( i , j+SIMDSIZE , xmm2 );
3405 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3406 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3407 (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
3408 (~C).store( i+1UL, j , xmm6 );
3409 (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
3410 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3411 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3412 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3425 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3426 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3427 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3428 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3430 for(
size_t k=kbegin; k<kend; ++k ) {
3431 const SIMDType a1(
set( A(i,k) ) );
3432 xmm1 -= a1 * B.load(k,j );
3433 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3434 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3435 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3436 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3439 (~C).store( i, j , xmm1 );
3440 (~C).store( i, j+SIMDSIZE , xmm2 );
3441 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3442 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3443 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3447 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3451 for( ; (i+2UL) <= M; i+=2UL )
3465 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3466 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3467 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3468 SIMDType xmm5( (~C).load(i+1UL,j ) );
3469 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3470 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3471 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3473 for(
size_t k=kbegin; k<kend; ++k ) {
3474 const SIMDType a1(
set( A(i ,k) ) );
3475 const SIMDType a2(
set( A(i+1UL,k) ) );
3477 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3478 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3479 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3490 (~C).store( i , j , xmm1 );
3491 (~C).store( i , j+SIMDSIZE , xmm2 );
3492 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3493 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3494 (~C).store( i+1UL, j , xmm5 );
3495 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3496 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3497 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3510 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3511 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3512 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3514 for(
size_t k=kbegin; k<kend; ++k ) {
3515 const SIMDType a1(
set( A(i,k) ) );
3516 xmm1 -= a1 * B.load(k,j );
3517 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3518 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3519 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3522 (~C).store( i, j , xmm1 );
3523 (~C).store( i, j+SIMDSIZE , xmm2 );
3524 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3525 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3529 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3533 for( ; (i+2UL) <= M; i+=2UL )
3547 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3548 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3549 SIMDType xmm4( (~C).load(i+1UL,j ) );
3550 SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3551 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3553 for(
size_t k=kbegin; k<kend; ++k ) {
3554 const SIMDType a1(
set( A(i ,k) ) );
3555 const SIMDType a2(
set( A(i+1UL,k) ) );
3557 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3558 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3567 (~C).store( i , j , xmm1 );
3568 (~C).store( i , j+SIMDSIZE , xmm2 );
3569 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3570 (~C).store( i+1UL, j , xmm4 );
3571 (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3572 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3585 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3586 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3588 for(
size_t k=kbegin; k<kend; ++k ) {
3589 const SIMDType a1(
set( A(i,k) ) );
3590 xmm1 -= a1 * B.load(k,j );
3591 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3592 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3595 (~C).store( i, j , xmm1 );
3596 (~C).store( i, j+SIMDSIZE , xmm2 );
3597 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3601 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3603 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
3604 size_t i( LOW ? j : 0UL );
3606 for( ; (i+4UL) <= iend; i+=4UL )
3620 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3621 SIMDType xmm3( (~C).load(i+1UL,j ) );
3622 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3623 SIMDType xmm5( (~C).load(i+2UL,j ) );
3624 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3625 SIMDType xmm7( (~C).load(i+3UL,j ) );
3626 SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
3628 for(
size_t k=kbegin; k<kend; ++k ) {
3629 const SIMDType a1(
set( A(i ,k) ) );
3630 const SIMDType a2(
set( A(i+1UL,k) ) );
3631 const SIMDType a3(
set( A(i+2UL,k) ) );
3632 const SIMDType a4(
set( A(i+3UL,k) ) );
3634 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3645 (~C).store( i , j , xmm1 );
3646 (~C).store( i , j+SIMDSIZE, xmm2 );
3647 (~C).store( i+1UL, j , xmm3 );
3648 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3649 (~C).store( i+2UL, j , xmm5 );
3650 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3651 (~C).store( i+3UL, j , xmm7 );
3652 (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
3655 for( ; (i+3UL) <= iend; i+=3UL )
3669 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3670 SIMDType xmm3( (~C).load(i+1UL,j ) );
3671 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3672 SIMDType xmm5( (~C).load(i+2UL,j ) );
3673 SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3675 for(
size_t k=kbegin; k<kend; ++k ) {
3676 const SIMDType a1(
set( A(i ,k) ) );
3677 const SIMDType a2(
set( A(i+1UL,k) ) );
3678 const SIMDType a3(
set( A(i+2UL,k) ) );
3680 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3689 (~C).store( i , j , xmm1 );
3690 (~C).store( i , j+SIMDSIZE, xmm2 );
3691 (~C).store( i+1UL, j , xmm3 );
3692 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3693 (~C).store( i+2UL, j , xmm5 );
3694 (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3697 for( ; (i+2UL) <= iend; i+=2UL )
3711 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3712 SIMDType xmm3( (~C).load(i+1UL,j ) );
3713 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3717 for( ; (k+2UL) <= kend; k+=2UL ) {
3718 const SIMDType a1(
set( A(i ,k ) ) );
3719 const SIMDType a2(
set( A(i+1UL,k ) ) );
3720 const SIMDType a3(
set( A(i ,k+1UL) ) );
3721 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
3722 const SIMDType b1( B.load(k ,j ) );
3723 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3724 const SIMDType b3( B.load(k+1UL,j ) );
3725 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3736 for( ; k<kend; ++k ) {
3737 const SIMDType a1(
set( A(i ,k) ) );
3738 const SIMDType a2(
set( A(i+1UL,k) ) );
3740 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3747 (~C).store( i , j , xmm1+xmm5 );
3748 (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
3749 (~C).store( i+1UL, j , xmm3+xmm7 );
3750 (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3763 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3767 for( ; (k+2UL) <= kend; k+=2UL ) {
3768 const SIMDType a1(
set( A(i,k ) ) );
3769 const SIMDType a2(
set( A(i,k+1UL) ) );
3770 xmm1 -= a1 * B.load(k ,j );
3771 xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
3772 xmm3 -= a2 * B.load(k+1UL,j );
3773 xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
3776 for( ; k<kend; ++k ) {
3777 const SIMDType a1(
set( A(i,k) ) );
3778 xmm1 -= a1 * B.load(k,j );
3779 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
3782 (~C).store( i, j , xmm1+xmm3 );
3783 (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
3787 for( ; j<jpos; j+=SIMDSIZE )
3789 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
3790 size_t i( LOW ? j : 0UL );
3792 for( ; (i+4UL) <= iend; i+=4UL )
3804 SIMDType xmm2( (~C).load(i+1UL,j) );
3805 SIMDType xmm3( (~C).load(i+2UL,j) );
3806 SIMDType xmm4( (~C).load(i+3UL,j) );
3810 for( ; (k+2UL) <= kend; k+=2UL ) {
3812 const SIMDType b2( B.load(k+1UL,j) );
3813 xmm1 -=
set( A(i ,k ) ) * b1;
3814 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3815 xmm3 -=
set( A(i+2UL,k ) ) * b1;
3816 xmm4 -=
set( A(i+3UL,k ) ) * b1;
3817 xmm5 -=
set( A(i ,k+1UL) ) * b2;
3818 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
3819 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
3820 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
3823 for( ; k<kend; ++k ) {
3825 xmm1 -=
set( A(i ,k) ) * b1;
3826 xmm2 -=
set( A(i+1UL,k) ) * b1;
3827 xmm3 -=
set( A(i+2UL,k) ) * b1;
3828 xmm4 -=
set( A(i+3UL,k) ) * b1;
3831 (~C).store( i , j, xmm1+xmm5 );
3832 (~C).store( i+1UL, j, xmm2+xmm6 );
3833 (~C).store( i+2UL, j, xmm3+xmm7 );
3834 (~C).store( i+3UL, j, xmm4+xmm8 );
3837 for( ; (i+3UL) <= iend; i+=3UL )
3849 SIMDType xmm2( (~C).load(i+1UL,j) );
3850 SIMDType xmm3( (~C).load(i+2UL,j) );
3854 for( ; (k+2UL) <= kend; k+=2UL ) {
3856 const SIMDType b2( B.load(k+1UL,j) );
3857 xmm1 -=
set( A(i ,k ) ) * b1;
3858 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3859 xmm3 -=
set( A(i+2UL,k ) ) * b1;
3860 xmm4 -=
set( A(i ,k+1UL) ) * b2;
3861 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
3862 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
3865 for( ; k<kend; ++k ) {
3867 xmm1 -=
set( A(i ,k) ) * b1;
3868 xmm2 -=
set( A(i+1UL,k) ) * b1;
3869 xmm3 -=
set( A(i+2UL,k) ) * b1;
3872 (~C).store( i , j, xmm1+xmm4 );
3873 (~C).store( i+1UL, j, xmm2+xmm5 );
3874 (~C).store( i+2UL, j, xmm3+xmm6 );
3877 for( ; (i+2UL) <= iend; i+=2UL )
3889 SIMDType xmm2( (~C).load(i+1UL,j) );
3893 for( ; (k+2UL) <= kend; k+=2UL ) {
3895 const SIMDType b2( B.load(k+1UL,j) );
3896 xmm1 -=
set( A(i ,k ) ) * b1;
3897 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3898 xmm3 -=
set( A(i ,k+1UL) ) * b2;
3899 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
3902 for( ; k<kend; ++k ) {
3904 xmm1 -=
set( A(i ,k) ) * b1;
3905 xmm2 -=
set( A(i+1UL,k) ) * b1;
3908 (~C).store( i , j, xmm1+xmm3 );
3909 (~C).store( i+1UL, j, xmm2+xmm4 );
3924 for( ; (k+2UL) <= K; k+=2UL ) {
3925 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
3926 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
3930 xmm1 -=
set( A(i,k) ) * B.load(k,j);
3933 (~C).store( i, j, xmm1+xmm2 );
3937 for( ; remainder && j<N; ++j )
3939 const size_t iend( UPP ? j+1UL : M );
3940 size_t i( LOW ? j : 0UL );
3942 for( ; (i+2UL) <= iend; i+=2UL )
3956 for(
size_t k=kbegin; k<kend; ++k ) {
3957 value1 -= A(i ,k) * B(k,j);
3958 value2 -= A(i+1UL,k) * B(k,j);
3961 (~C)(i ,j) = value1;
3962 (~C)(i+1UL,j) = value2;
3975 for(
size_t k=kbegin; k<K; ++k ) {
3976 value -= A(i,k) * B(k,j);
4001 template<
typename MT3
4012 const ForwardFunctor fwd;
4016 subAssign( ~C, fwd( tmp * B ) );
4020 subAssign( ~C, fwd( A * tmp ) );
4022 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4024 subAssign( ~C, fwd( tmp * B ) );
4028 subAssign( ~C, fwd( A * tmp ) );
// Large-kernel stage of the subtraction assignment (fallback overload).
// The only visible statement forwards C, A and B unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the return type / enabling condition of this overload is elided
// from this excerpt - confirm which operand types select it.
4048 template<
typename MT3
4052 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Delegate to the default kernel.
4054 selectDefaultSubAssignKernel( C, A, B );
4074 template<
typename MT3
4078 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS stage of the subtraction assignment (fallback overload).
// The visible body does not call any BLAS routine; it forwards C, A and B
// unchanged to selectLargeSubAssignKernel().
// NOTE(review): the enabling condition is elided from this excerpt - presumably
// this overload is chosen when the BLAS kernel cannot be used; confirm.
4104 template<
typename MT3
4108 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Fall through to the large-matrix kernel selection.
4110 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction assignment kernel. This overload is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate
// to true in the preprocessor condition below.
// Three outcomes are visible: two paths that subtract a temporary `tmp` from C
// via subAssign(), and one path that calls gemm() directly on C, A and B.
// NOTE(review): the conditions selecting between these paths and the
// construction of `tmp` are elided from this excerpt.
4116 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4130 template<
typename MT3
4134 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// First path: subtract a previously built temporary from C.
4141 subAssign( C, tmp );
// Second path: same subtraction for a (presumably differently built) temporary.
4146 subAssign( C, tmp );
// Remaining path: gemm with scalar arguments ET(-1) and ET(1).
// NOTE(review): presumably this computes C = -1*A*B + 1*C - confirm against gemm's parameter order.
4149 gemm( C, A, B, ET(-1), ET(1) );
4171 template<
typename MT >
4182 const ForwardFunctor fwd;
4211 template<
typename MT
4225 schurAssign( ~lhs, tmp );
4257 template<
typename MT
4267 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4270 else if( rhs.
lhs_.columns() == 0UL ) {
4305 template<
typename MT
4324 const ForwardFunctor fwd;
4326 const TmpType tmp( rhs );
4347 template<
typename MT >
4358 const ForwardFunctor fwd;
4386 template<
typename MT
4396 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
4430 template<
typename MT >
4441 const ForwardFunctor fwd;
4473 template<
typename MT
4483 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.
lhs_.columns() == 0UL ) {
4517 template<
typename MT >
4528 const ForwardFunctor fwd;
4557 template<
typename MT
4617 template<
typename MT1
4625 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4655 SYM = ( SF && !( HF || LF || UF ) ),
4656 HERM = ( HF && !( LF || UF ) ),
4657 LOW = ( LF || ( ( SF || HF ) && UF ) ),
4658 UPP = ( UF || ( ( SF || HF ) && LF ) )
4669 template<
typename T1,
typename T2,
typename T3 >
4670 struct CanExploitSymmetry {
// Compile-time helper with a single boolean constant `value`.
// `value` is true when at least one of evaluateLeft / evaluateRight is set and
// CanExploitSymmetry<T1,T2,T3>::value is false.
// NOTE(review): evaluateLeft, evaluateRight and CanExploitSymmetry are defined
// outside this block; their meaning is not established by the lines shown here.
4681 template<
typename T1,
typename T2,
typename T3 >
4682 struct IsEvaluationRequired {
4683 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
4684 !CanExploitSymmetry<T1,T2,T3>::value };
4692 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4693 struct UseBlasKernel {
4694 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4700 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4714 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4715 struct UseVectorizedDefaultKernel {
4716 enum :
bool { value = useOptimizedKernels &&
4718 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4774 MT1::simdEnabled && MT2::simdEnabled &&
4780 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4781 !evaluateRight && MT2::smpAssignable };
4811 return matrix_(i,j) * scalar_;
4824 if( i >= matrix_.rows() ) {
4827 if( j >= matrix_.columns() ) {
4830 return (*
this)(i,j);
// Returns the number of rows reported by the wrapped expression matrix_.
4839 inline size_t rows()
const {
4840 return matrix_.rows();
// Returns the number of columns reported by the wrapped expression matrix_.
4849 inline size_t columns()
const {
4850 return matrix_.columns();
// Alias query: forwards the given address to matrix_.canAlias() and returns
// its result unchanged.
// \param alias  The address to be checked.
// \return Whatever matrix_.canAlias( alias ) returns.
4880 template<
typename T >
4881 inline bool canAlias(
const T* alias )
const {
4882 return matrix_.canAlias( alias );
// Alias query: forwards the given address to matrix_.isAliased() and returns
// its result unchanged.
// \param alias  The address to be checked.
// \return Whatever matrix_.isAliased( alias ) returns.
4892 template<
typename T >
4893 inline bool isAliased(
const T* alias )
const {
4894 return matrix_.isAliased( alias );
4904 return matrix_.isAligned();
4915 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4917 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
4918 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
4940 template<
typename MT
4953 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4956 else if( left.columns() == 0UL ) {
4971 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Entry point for choosing the kernel of the scaled assignment (C, A, B plus a
// scalar factor). Two targets are visible: selectSmallAssignKernel() and
// selectBlasAssignKernel(); the scalar is passed through to both.
// NOTE(review): the opening of the condition (the `if(` line and its first
// operand) and the `else` keyword are elided from this excerpt.
4986 template<
typename MT3
4990 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Visible part of the condition: outside debug mode, B has at most
// SIMDSIZE*10 columns ...
4993 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
// ... or the number of elements of C is below DMATDMATMULT_THRESHOLD.
4994 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
// Condition holds: use the small-matrix kernel.
4995 selectSmallAssignKernel( C, A, B, scalar );
// Otherwise (presumably the else branch): use the BLAS kernel selection.
4997 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-SIMD) kernel of the scaled assignment. For each row i
// the visible code writes the first product term with '=' (k = kbegin),
// accumulates the remaining terms with '+=' (k = kbegin+1 .. kend-1), and at
// the end copies C(j,i) (or its conjugate when HERM) into C(i,j) for j < i.
// NOTE(review): the declarations of kbegin/kend and jbegin/jend, several loop
// bodies and all guards are elided from this excerpt. `scalar` does not occur
// in any visible statement; presumably it is applied in one of the elided loop
// bodies - confirm.
5015 template<
typename MT3
5020 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M x K times K x N: M rows of A, N columns of B, K columns of A.
5022 const size_t M( A.rows() );
5023 const size_t N( B.columns() );
5024 const size_t K( A.columns() );
// Outer loop over the rows of A.
5028 for(
size_t i=0UL; i<M; ++i )
// Full sweep over all N columns of row i (body elided from this excerpt).
5039 for(
size_t j=0UL; j<N; ++j ) {
// Tail of a nested conditional initializer (presumably jbegin) for the first
// term k = kbegin: raised to at least i whenever UPP is set.
5048 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
5049 :( UPP ?
max(i,kbegin) : kbegin ) )
5050 :( UPP ? i : 0UL ) );
// Tail of a nested conditional initializer (presumably jend) for the first
// term: capped relative to i whenever LOW is set, otherwise it can reach N.
5053 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
5054 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
5055 :( LOW ? i+1UL : N ) );
// Columns in front of the active range (body elided).
5058 for(
size_t j=0UL; j<jbegin; ++j ) {
// Active range: initialise C(i,j) with the first product term.
5065 for(
size_t j=jbegin; j<jend; ++j ) {
5066 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns behind the active range (body elided).
5069 for(
size_t j=jend; j<N; ++j ) {
// Explicit reset of the last element of row i (guard elided).
5074 reset( C(i,N-1UL) );
// Remaining inner-dimension terms, starting after kbegin.
5078 for(
size_t k=kbegin+1UL; k<kend; ++k )
// Per-k start column (presumably jbegin): raised to at least i whenever SYM,
// HERM or UPP is set.
5082 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
5083 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
5084 :( SYM || HERM || UPP ? i : 0UL ) );
// Per-k end column (presumably jend): capped at i+1 whenever LOW is set.
5087 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
5088 :( LOW ?
min(i+1UL,k) : k ) )
5089 :( LOW ? i+1UL : N ) );
// With a structural restriction the range may be inverted; skip this k then.
// Note the strict '>' here: jbegin == jend is not skipped.
5091 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
// Accumulate the k-th product term over [jbegin,jend).
5094 for(
size_t j=jbegin; j<jend; ++j ) {
5095 C(i,j) += A(i,k) * B(k,j);
// Element at column jend is assigned (not accumulated) from the k-th term.
// NOTE(review): the guard of this statement is elided - confirm when it runs.
5098 C(i,jend) = A(i,k) * B(k,jend);
// Tails of a second pair of range initializers (presumably jbegin / jend for a
// final pass over row i).
5105 :( SYM || HERM || UPP ? i : 0UL ) );
5108 :( LOW ? i+1UL : N ) );
5110 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
// Pass over [jbegin,jend) whose body is elided.
// NOTE(review): presumably this is where the scalar factor is applied - confirm.
5113 for(
size_t j=jbegin; j<jend; ++j ) {
// Fill the strictly lower part from the upper part: C(i,j) for j < i becomes
// C(j,i), conjugated when HERM is set.
// NOTE(review): the guard around this loop nest is elided - presumably it only
// runs for SYM or HERM; confirm.
5120 for(
size_t i=1UL; i<M; ++i ) {
5121 for(
size_t j=0UL; j<i; ++j ) {
5122 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel of the scaled assignment, enabled (via EnableIf_) when MT4 is
// not diagonal and MT5 is diagonal. Only the diagonal element B(j,j) is read
// from B: C(i,j) = A(i,j) * B(j,j) * scalar for j in [jbegin,jend).
// NOTE(review): the definitions of jbegin/jend and the bodies of the two
// surrounding column loops are elided from this excerpt.
5143 template<
typename MT3
5147 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
5148 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5152 const size_t M( A.rows() );
5153 const size_t N( B.columns() );
// Loop over the rows of A.
5155 for(
size_t i=0UL; i<M; ++i )
// Columns in front of the active range (body elided).
5166 for(
size_t j=0UL; j<jbegin; ++j ) {
// Active range: product of A(i,j), the diagonal element of B and the scalar.
5170 for(
size_t j=jbegin; j<jend; ++j ) {
5171 C(i,j) = A(i,j) * B(j,j) * scalar;
// Columns behind the active range (body elided).
5174 for(
size_t j=jend; j<N; ++j ) {
// Default kernel of the scaled assignment that reads only the diagonal element
// A(i,i) of the left operand: C(i,j) = A(i,i) * B(i,j) * scalar for j in
// [jbegin,jend).
// NOTE(review): the enabling condition of this overload is elided from this
// excerpt - presumably MT4 is diagonal and MT5 is not; confirm. The definitions
// of jbegin/jend and the bodies of the two surrounding column loops are elided.
5196 template<
typename MT3
5201 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5205 const size_t M( A.rows() );
5206 const size_t N( B.columns() );
// Loop over the rows of A.
5208 for(
size_t i=0UL; i<M; ++i )
// Columns in front of the active range (body elided).
5219 for(
size_t j=0UL; j<jbegin; ++j ) {
// Active range: diagonal element of A times row i of B times the scalar.
5223 for(
size_t j=jbegin; j<jend; ++j ) {
5224 C(i,j) = A(i,i) * B(i,j) * scalar;
// Columns behind the active range (body elided).
5227 for(
size_t j=jend; j<N; ++j ) {
// Default kernel of the scaled assignment that touches only diagonal elements:
// for every row index i of A, C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): the enabling condition is elided from this excerpt - presumably
// both MT4 and MT5 are diagonal. Lines between the signature and the loop are
// elided as well, so any preparation of C (e.g. a reset) is not visible here.
5249 template<
typename MT3
5254 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// One assignment per diagonal element; the loop bound is the number of rows of A.
5260 for(
size_t i=0UL; i<A.rows(); ++i ) {
5261 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix stage of the scaled assignment (fallback overload).
// The only visible statement forwards C, A, B and the scalar unchanged to
// selectDefaultAssignKernel().
// NOTE(review): the enabling condition is elided from this excerpt - presumably
// this overload is used when the vectorized small kernel is not applicable.
5280 template<
typename MT3
5285 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Delegate to the default kernel.
5287 selectDefaultAssignKernel( C, A, B, scalar );
5306 template<
typename MT3
5315 const size_t M( A.rows() );
5316 const size_t N( B.columns() );
5317 const size_t K( A.columns() );
5321 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
5324 const SIMDType factor(
set( scalar ) );
5326 if( LOW && UPP && N > SIMDSIZE*3UL ) {
5335 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5336 for(
size_t i=0UL; i<M; ++i )
5349 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5351 for(
size_t k=kbegin; k<kend; ++k ) {
5352 const SIMDType a1(
set( A(i,k) ) );
5353 xmm1 += a1 * B.load(k,j );
5354 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5355 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5356 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5357 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5358 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
5359 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
5360 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
5363 (~C).store( i, j , xmm1 * factor );
5364 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5365 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5366 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5367 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5368 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
5369 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
5370 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
5375 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5379 for( ; (i+2UL) <= M; i+=2UL )
5392 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5394 for(
size_t k=kbegin; k<kend; ++k ) {
5395 const SIMDType a1(
set( A(i ,k) ) );
5396 const SIMDType a2(
set( A(i+1UL,k) ) );
5398 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5399 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5400 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5401 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5414 (~C).store( i , j , xmm1 * factor );
5415 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
5416 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5417 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5418 (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
5419 (~C).store( i+1UL, j , xmm6 * factor );
5420 (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
5421 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
5422 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
5423 (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
5435 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5437 for(
size_t k=kbegin; k<kend; ++k ) {
5438 const SIMDType a1(
set( A(i,k) ) );
5439 xmm1 += a1 * B.load(k,j );
5440 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5441 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5442 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5443 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5446 (~C).store( i, j , xmm1 * factor );
5447 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5448 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5449 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5450 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5454 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5456 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*4UL,M) : M );
5457 size_t i( LOW ? j : 0UL );
5459 for( ; (i+2UL) <= iend; i+=2UL )
5472 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5474 for(
size_t k=kbegin; k<kend; ++k ) {
5475 const SIMDType a1(
set( A(i ,k) ) );
5476 const SIMDType a2(
set( A(i+1UL,k) ) );
5478 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5479 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5480 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5491 (~C).store( i , j , xmm1 * factor );
5492 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
5493 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5494 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5495 (~C).store( i+1UL, j , xmm5 * factor );
5496 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
5497 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
5498 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
5512 for(
size_t k=kbegin; k<kend; ++k ) {
5513 const SIMDType a1(
set( A(i,k) ) );
5514 xmm1 += a1 * B.load(k,j );
5515 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5516 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5517 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5520 (~C).store( i, j , xmm1 * factor );
5521 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5522 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5523 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5527 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5529 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*3UL,M) : M );
5530 size_t i( LOW ? j : 0UL );
5532 for( ; (i+2UL) <= iend; i+=2UL )
5545 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5547 for(
size_t k=kbegin; k<kend; ++k ) {
5548 const SIMDType a1(
set( A(i ,k) ) );
5549 const SIMDType a2(
set( A(i+1UL,k) ) );
5551 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5552 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5561 (~C).store( i , j , xmm1 * factor );
5562 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
5563 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5564 (~C).store( i+1UL, j , xmm4 * factor );
5565 (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
5566 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
5580 for(
size_t k=kbegin; k<kend; ++k ) {
5581 const SIMDType a1(
set( A(i,k) ) );
5582 xmm1 += a1 * B.load(k,j );
5583 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5584 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5587 (~C).store( i, j , xmm1 * factor );
5588 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5589 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5593 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5595 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE*2UL,M) : M );
5596 size_t i( LOW ? j : 0UL );
5598 for( ; (i+4UL) <= iend; i+=4UL )
5611 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5613 for(
size_t k=kbegin; k<kend; ++k ) {
5614 const SIMDType a1(
set( A(i ,k) ) );
5615 const SIMDType a2(
set( A(i+1UL,k) ) );
5616 const SIMDType a3(
set( A(i+2UL,k) ) );
5617 const SIMDType a4(
set( A(i+3UL,k) ) );
5619 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5630 (~C).store( i , j , xmm1 * factor );
5631 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
5632 (~C).store( i+1UL, j , xmm3 * factor );
5633 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5634 (~C).store( i+2UL, j , xmm5 * factor );
5635 (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5636 (~C).store( i+3UL, j , xmm7 * factor );
5637 (~C).store( i+3UL, j+SIMDSIZE, xmm8 * factor );
5640 for( ; (i+3UL) <= iend; i+=3UL )
5653 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5655 for(
size_t k=kbegin; k<kend; ++k ) {
5656 const SIMDType a1(
set( A(i ,k) ) );
5657 const SIMDType a2(
set( A(i+1UL,k) ) );
5658 const SIMDType a3(
set( A(i+2UL,k) ) );
5660 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5669 (~C).store( i , j , xmm1 * factor );
5670 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
5671 (~C).store( i+1UL, j , xmm3 * factor );
5672 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5673 (~C).store( i+2UL, j , xmm5 * factor );
5674 (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5677 for( ; (i+2UL) <= iend; i+=2UL )
5690 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5693 for( ; (k+2UL) <= kend; k+=2UL ) {
5694 const SIMDType a1(
set( A(i ,k ) ) );
5695 const SIMDType a2(
set( A(i+1UL,k ) ) );
5696 const SIMDType a3(
set( A(i ,k+1UL) ) );
5697 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
5698 const SIMDType b1( B.load(k ,j ) );
5699 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5700 const SIMDType b3( B.load(k+1UL,j ) );
5701 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5712 for( ; k<kend; ++k ) {
5713 const SIMDType a1(
set( A(i ,k) ) );
5714 const SIMDType a2(
set( A(i+1UL,k) ) );
5716 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5723 (~C).store( i , j , (xmm1+xmm5) * factor );
5724 (~C).store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
5725 (~C).store( i+1UL, j , (xmm3+xmm7) * factor );
5726 (~C).store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
5741 for( ; (k+2UL) <= kend; k+=2UL ) {
5742 const SIMDType a1(
set( A(i,k ) ) );
5743 const SIMDType a2(
set( A(i,k+1UL) ) );
5744 xmm1 += a1 * B.load(k ,j );
5745 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
5746 xmm3 += a2 * B.load(k+1UL,j );
5747 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
5750 for( ; k<kend; ++k ) {
5751 const SIMDType a1(
set( A(i,k) ) );
5752 xmm1 += a1 * B.load(k,j );
5753 xmm2 += a1 * B.load(k,j+SIMDSIZE);
5756 (~C).store( i, j , (xmm1+xmm3) * factor );
5757 (~C).store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
5761 for( ; j<jpos; j+=SIMDSIZE )
5763 const size_t iend( SYM || HERM || UPP ?
min(j+SIMDSIZE,M) : M );
5764 size_t i( LOW ? j : 0UL );
5766 for( ; (i+4UL) <= iend; i+=4UL )
5777 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5780 for( ; (k+2UL) <= kend; k+=2UL ) {
5782 const SIMDType b2( B.load(k+1UL,j) );
5783 xmm1 +=
set( A(i ,k ) ) * b1;
5784 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5785 xmm3 +=
set( A(i+2UL,k ) ) * b1;
5786 xmm4 +=
set( A(i+3UL,k ) ) * b1;
5787 xmm5 +=
set( A(i ,k+1UL) ) * b2;
5788 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
5789 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
5790 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
5793 for( ; k<kend; ++k ) {
5795 xmm1 +=
set( A(i ,k) ) * b1;
5796 xmm2 +=
set( A(i+1UL,k) ) * b1;
5797 xmm3 +=
set( A(i+2UL,k) ) * b1;
5798 xmm4 +=
set( A(i+3UL,k) ) * b1;
5801 (~C).store( i , j, (xmm1+xmm5) * factor );
5802 (~C).store( i+1UL, j, (xmm2+xmm6) * factor );
5803 (~C).store( i+2UL, j, (xmm3+xmm7) * factor );
5804 (~C).store( i+3UL, j, (xmm4+xmm8) * factor );
5807 for( ; (i+3UL) <= iend; i+=3UL )
5818 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5821 for( ; (k+2UL) <= kend; k+=2UL ) {
5823 const SIMDType b2( B.load(k+1UL,j) );
5824 xmm1 +=
set( A(i ,k ) ) * b1;
5825 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5826 xmm3 +=
set( A(i+2UL,k ) ) * b1;
5827 xmm4 +=
set( A(i ,k+1UL) ) * b2;
5828 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
5829 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
5832 for( ; k<kend; ++k ) {
5834 xmm1 +=
set( A(i ,k) ) * b1;
5835 xmm2 +=
set( A(i+1UL,k) ) * b1;
5836 xmm3 +=
set( A(i+2UL,k) ) * b1;
5839 (~C).store( i , j, (xmm1+xmm4) * factor );
5840 (~C).store( i+1UL, j, (xmm2+xmm5) * factor );
5841 (~C).store( i+2UL, j, (xmm3+xmm6) * factor );
5844 for( ; (i+2UL) <= iend; i+=2UL )
5858 for( ; (k+2UL) <= kend; k+=2UL ) {
5860 const SIMDType b2( B.load(k+1UL,j) );
5861 xmm1 +=
set( A(i ,k ) ) * b1;
5862 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5863 xmm3 +=
set( A(i ,k+1UL) ) * b2;
5864 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
5867 for( ; k<kend; ++k ) {
5869 xmm1 +=
set( A(i ,k) ) * b1;
5870 xmm2 +=
set( A(i+1UL,k) ) * b1;
5873 (~C).store( i , j, (xmm1+xmm3) * factor );
5874 (~C).store( i+1UL, j, (xmm2+xmm4) * factor );
5888 for( ; (k+2UL) <= K; k+=2UL ) {
5889 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
5890 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
5894 xmm1 +=
set( A(i,k) ) * B.load(k,j);
5897 (~C).store( i, j, (xmm1+xmm2) * factor );
5901 for( ; remainder && j<N; ++j )
5903 size_t i( LOW && UPP ? j : 0UL );
5905 for( ; (i+2UL) <= M; i+=2UL )
5919 for(
size_t k=kbegin; k<kend; ++k ) {
5920 value1 += A(i ,k) * B(k,j);
5921 value2 += A(i+1UL,k) * B(k,j);
5924 (~C)(i ,j) = value1 * scalar;
5925 (~C)(i+1UL,j) = value2 * scalar;
5938 for(
size_t k=kbegin; k<K; ++k ) {
5939 value += A(i,k) * B(k,j);
5942 (~C)(i,j) = value * scalar;
5947 if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
5948 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5949 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5950 for(
size_t j=0UL; j<jend; ++j ) {
5951 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
5955 else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
5956 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5957 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
5958 for(
size_t i=0UL; i<iend; ++i ) {
5963 else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
5964 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5965 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5966 for(
size_t j=0UL; j<jend; ++j ) {
5989 template<
typename MT3
6001 const ForwardFunctor fwd;
6005 assign( ~C, fwd( tmp * B ) * scalar );
6009 assign( ~C, fwd( A * tmp ) * scalar );
6011 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6013 assign( ~C, fwd( tmp * B ) * scalar );
6017 assign( ~C, fwd( A * tmp ) * scalar );
// Large-kernel stage of the scaled assignment (fallback overload).
// The only visible statement forwards C, A, B and the scalar unchanged to
// selectDefaultAssignKernel().
// NOTE(review): the return type / enabling condition of this overload is elided
// from this excerpt - confirm which operand types select it.
6036 template<
typename MT3
6041 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Delegate to the default kernel.
6043 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, blocked overload: hands the product of A and
// B, scaled by scalar, to one of the smmm/hmmm/lmmm/ummm/mmm routines.
// NOTE(review): the conditions choosing between the five calls are on lines
// missing from this listing (presumably an if/else chain on SYM, HERM, LOW,
// UPP) - confirm against the full file.
6062 template<
typename MT3
6067 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6070 smmm( C, A, B, scalar );
6072 hmmm( C, A, B, scalar );
// The three calls below pass ST2(0) as a trailing argument; the add-/sub-assign
// counterparts pass ST2(1) instead (presumably the weight given to the existing
// contents of C - confirm).
6074 lmmm( C, A, B, scalar, ST2(0) );
6076 ummm( C, A, B, scalar, ST2(0) );
6078 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel, fallback overload: no BLAS call is made here, the
// arguments are forwarded unchanged to selectLargeAssignKernel().
// NOTE(review): the enable condition for this overload is not visible in this
// listing.
6096 template<
typename MT3
6101 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6103 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assignment kernel, BLAS-backed overload. Only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
// The scalar is converted to ET before being passed to the BLAS wrappers.
// NOTE(review): the branch conditions between the three calls and the
// statements that initialise C before each trmm() are on lines missing from
// this listing.
6108 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6122 template<
typename MT3
6127 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// trmm with A applied from the left; the triangle flag is CblasLower when
// IsLower<MT4>::value holds and CblasUpper otherwise.
6133 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// trmm with B applied from the right; triangle flag chosen from IsLower<MT5>.
6137 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General product via gemm with ET(scalar) and ET(0) as the two scaling
// arguments.
6140 gemm( C, A, B, ET(scalar), ET(0) );
6158 template<
typename MT
6177 const ForwardFunctor fwd;
6179 const TmpType tmp(
serial( rhs ) );
6180 assign( ~lhs, fwd( tmp ) );
6198 template<
typename MT >
6209 const ForwardFunctor fwd;
6217 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
6219 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
6235 template<
typename MT
6248 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6262 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatch for the addition assignment of a scaled matrix product:
// chooses between selectSmallAddAssignKernel() and selectBlasAddAssignKernel().
// NOTE(review): the opening of the condition and the `else` keyword are on
// lines missing from this listing; only the last two terms are visible.
6277 template<
typename MT3
6281 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Visible terms: (non-debug build and B has at most SIMDSIZE*10 columns) or
// (C holds fewer than DMATDMATMULT_THRESHOLD elements).
6284 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
6285 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6286 selectSmallAddAssignKernel( C, A, B, scalar );
6288 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel (overload whose enable condition is not
// visible here): adds `tmp` to C via addAssign().
// NOTE(review): `tmp` is declared on a line missing from this listing;
// presumably it holds the evaluated A * B * scalar - confirm.
6306 template<
typename MT3
6311 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6314 addAssign( C, tmp );
// Default addition-assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition below). Because only B(j,j) is read from B,
// every updated element is C(i,j) += A(i,j) * B(j,j) * scalar.
// NOTE(review): jbegin/jend (the column range per row) are computed on lines
// missing from this listing.
6332 template<
typename MT3
6336 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6337 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6341 const size_t M( A.rows() );
6342 const size_t N( B.columns() );
6344 for(
size_t i=0UL; i<M; ++i )
6354 const size_t jnum( jend - jbegin );
// jpos = jbegin plus jnum rounded down to an even count (mask size_t(-2)
// clears the lowest bit), i.e. the end of the two-column unrolled loop.
6355 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
6357 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6358 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6359 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Leftover column at jpos. NOTE(review): its guard is on a missing line
// (presumably jpos < jend) - confirm.
6362 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition-assignment kernel that reads only the diagonal element
// A(i,i) from A: every updated element is C(i,j) += A(i,i) * B(i,j) * scalar.
// NOTE(review): the EnableIf_ condition of this overload and the computation
// of jbegin/jend are on lines missing from this listing; presumably this is
// the diagonal-A / non-diagonal-B case - confirm.
6382 template<
typename MT3
6387 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6391 const size_t M( A.rows() );
6392 const size_t N( B.columns() );
6394 for(
size_t i=0UL; i<M; ++i )
6404 const size_t jnum( jend - jbegin );
// jpos = jbegin plus jnum rounded down to an even count; end of the unrolled
// two-column loop.
6405 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
6407 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6408 C(i,j ) += A(i,i) * B(i,j ) * scalar;
6409 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Leftover column at jpos. NOTE(review): its guard is on a missing line
// (presumably jpos < jend) - confirm.
6412 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition-assignment kernel that touches only diagonal elements:
// for each row index i of A, C(i,i) += A(i,i) * B(i,i) * scalar.
// NOTE(review): the EnableIf_ condition is on a line missing from this
// listing; presumably both A and B are diagonal - confirm.
6432 template<
typename MT3
6437 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6441 for(
size_t i=0UL; i<A.rows(); ++i ) {
6442 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition-assignment kernel, fallback overload: forwards all
// arguments unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the enable condition that selects this overload is not
// visible in this listing.
6461 template<
typename MT3
6466 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6468 selectDefaultAddAssignKernel( C, A, B, scalar );
6487 template<
typename MT3
6496 const size_t M( A.rows() );
6497 const size_t N( B.columns() );
6498 const size_t K( A.columns() );
6502 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
6505 const SIMDType factor(
set( scalar ) );
6511 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6512 for(
size_t i=0UL; i<M; ++i )
6525 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6527 for(
size_t k=kbegin; k<kend; ++k ) {
6528 const SIMDType a1(
set( A(i,k) ) );
6529 xmm1 += a1 * B.load(k,j );
6530 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6531 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6532 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6533 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6534 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6535 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6536 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6539 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6540 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6541 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6542 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6543 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6544 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
6545 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
6546 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
6551 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6555 for( ; (i+2UL) <= M; i+=2UL )
6568 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6570 for(
size_t k=kbegin; k<kend; ++k ) {
6571 const SIMDType a1(
set( A(i ,k) ) );
6572 const SIMDType a2(
set( A(i+1UL,k) ) );
6574 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6575 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6576 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6577 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6590 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6591 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
6592 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6593 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6594 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
6595 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
6596 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
6597 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
6598 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
6599 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
6611 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6613 for(
size_t k=kbegin; k<kend; ++k ) {
6614 const SIMDType a1(
set( A(i,k) ) );
6615 xmm1 += a1 * B.load(k,j );
6616 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6617 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6618 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6619 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6622 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6623 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6624 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6625 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6626 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6630 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6634 for( ; (i+2UL) <= M; i+=2UL )
6647 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6649 for(
size_t k=kbegin; k<kend; ++k ) {
6650 const SIMDType a1(
set( A(i ,k) ) );
6651 const SIMDType a2(
set( A(i+1UL,k) ) );
6653 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6654 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6655 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6666 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6667 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
6668 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6669 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6670 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
6671 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
6672 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
6673 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
6687 for(
size_t k=kbegin; k<kend; ++k ) {
6688 const SIMDType a1(
set( A(i,k) ) );
6689 xmm1 += a1 * B.load(k,j );
6690 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6691 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6692 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6695 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6696 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6697 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6698 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6702 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6706 for( ; (i+2UL) <= M; i+=2UL )
6719 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6721 for(
size_t k=kbegin; k<kend; ++k ) {
6722 const SIMDType a1(
set( A(i ,k) ) );
6723 const SIMDType a2(
set( A(i+1UL,k) ) );
6725 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6726 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6735 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6736 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
6737 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6738 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
6739 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
6740 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
6754 for(
size_t k=kbegin; k<kend; ++k ) {
6755 const SIMDType a1(
set( A(i,k) ) );
6756 xmm1 += a1 * B.load(k,j );
6757 xmm2 += a1 * B.load(k,j+SIMDSIZE );
6758 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6761 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6762 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6763 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6767 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6769 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
6770 size_t i( LOW ? j : 0UL );
6772 for( ; (i+4UL) <= iend; i+=4UL )
6785 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6787 for(
size_t k=kbegin; k<kend; ++k ) {
6788 const SIMDType a1(
set( A(i ,k) ) );
6789 const SIMDType a2(
set( A(i+1UL,k) ) );
6790 const SIMDType a3(
set( A(i+2UL,k) ) );
6791 const SIMDType a4(
set( A(i+3UL,k) ) );
6793 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6804 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6805 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
6806 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6807 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
6808 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6809 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
6810 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
6811 (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
6814 for( ; (i+3UL) <= iend; i+=3UL )
6827 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6829 for(
size_t k=kbegin; k<kend; ++k ) {
6830 const SIMDType a1(
set( A(i ,k) ) );
6831 const SIMDType a2(
set( A(i+1UL,k) ) );
6832 const SIMDType a3(
set( A(i+2UL,k) ) );
6834 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6843 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6844 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
6845 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6846 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
6847 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6848 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
6851 for( ; (i+2UL) <= iend; i+=2UL )
6864 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6867 for( ; (k+2UL) <= kend; k+=2UL ) {
6868 const SIMDType a1(
set( A(i ,k ) ) );
6869 const SIMDType a2(
set( A(i+1UL,k ) ) );
6870 const SIMDType a3(
set( A(i ,k+1UL) ) );
6871 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
6872 const SIMDType b1( B.load(k ,j ) );
6873 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
6874 const SIMDType b3( B.load(k+1UL,j ) );
6875 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
6886 for( ; k<kend; ++k ) {
6887 const SIMDType a1(
set( A(i ,k) ) );
6888 const SIMDType a2(
set( A(i+1UL,k) ) );
6890 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6897 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
6898 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
6899 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + (xmm3+xmm7) * factor );
6900 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
6915 for( ; (k+2UL) <= kend; k+=2UL ) {
6916 const SIMDType a1(
set( A(i,k ) ) );
6917 const SIMDType a2(
set( A(i,k+1UL) ) );
6918 xmm1 += a1 * B.load(k ,j );
6919 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
6920 xmm3 += a2 * B.load(k+1UL,j );
6921 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
6924 for( ; k<kend; ++k ) {
6925 const SIMDType a1(
set( A(i,k) ) );
6926 xmm1 += a1 * B.load(k,j );
6927 xmm2 += a1 * B.load(k,j+SIMDSIZE);
6930 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
6931 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
6935 for( ; j<jpos; j+=SIMDSIZE )
6937 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
6938 size_t i( LOW ? j : 0UL );
6940 for( ; (i+4UL) <= iend; i+=4UL )
6951 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6954 for( ; (k+2UL) <= kend; k+=2UL ) {
6956 const SIMDType b2( B.load(k+1UL,j) );
6957 xmm1 +=
set( A(i ,k ) ) * b1;
6958 xmm2 +=
set( A(i+1UL,k ) ) * b1;
6959 xmm3 +=
set( A(i+2UL,k ) ) * b1;
6960 xmm4 +=
set( A(i+3UL,k ) ) * b1;
6961 xmm5 +=
set( A(i ,k+1UL) ) * b2;
6962 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
6963 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
6964 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
6967 for( ; k<kend; ++k ) {
6969 xmm1 +=
set( A(i ,k) ) * b1;
6970 xmm2 +=
set( A(i+1UL,k) ) * b1;
6971 xmm3 +=
set( A(i+2UL,k) ) * b1;
6972 xmm4 +=
set( A(i+3UL,k) ) * b1;
6975 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm5) * factor );
6976 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm6) * factor );
6977 (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm7) * factor );
6978 (~C).store( i+3UL, j, (~C).load(i+3UL,j) + (xmm4+xmm8) * factor );
6981 for( ; (i+3UL) <= iend; i+=3UL )
6992 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6995 for( ; (k+2UL) <= kend; k+=2UL ) {
6997 const SIMDType b2( B.load(k+1UL,j) );
6998 xmm1 +=
set( A(i ,k ) ) * b1;
6999 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7000 xmm3 +=
set( A(i+2UL,k ) ) * b1;
7001 xmm4 +=
set( A(i ,k+1UL) ) * b2;
7002 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
7003 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
7006 for( ; k<kend; ++k ) {
7008 xmm1 +=
set( A(i ,k) ) * b1;
7009 xmm2 +=
set( A(i+1UL,k) ) * b1;
7010 xmm3 +=
set( A(i+2UL,k) ) * b1;
7013 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm4) * factor );
7014 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm5) * factor );
7015 (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm6) * factor );
7018 for( ; (i+2UL) <= iend; i+=2UL )
7032 for( ; (k+2UL) <= kend; k+=2UL ) {
7034 const SIMDType b2( B.load(k+1UL,j) );
7035 xmm1 +=
set( A(i ,k ) ) * b1;
7036 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7037 xmm3 +=
set( A(i ,k+1UL) ) * b2;
7038 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
7041 for( ; k<kend; ++k ) {
7043 xmm1 +=
set( A(i ,k) ) * b1;
7044 xmm2 +=
set( A(i+1UL,k) ) * b1;
7047 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
7048 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm4) * factor );
7062 for( ; (k+2UL) <= K; k+=2UL ) {
7063 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
7064 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
7068 xmm1 +=
set( A(i,k) ) * B.load(k,j);
7071 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
7075 for( ; remainder && j<N; ++j )
7077 const size_t iend( UPP ? j+1UL : M );
7078 size_t i( LOW ? j : 0UL );
7080 for( ; (i+2UL) <= iend; i+=2UL )
7094 for(
size_t k=kbegin; k<kend; ++k ) {
7095 value1 += A(i ,k) * B(k,j);
7096 value2 += A(i+1UL,k) * B(k,j);
7099 (~C)(i ,j) += value1 * scalar;
7100 (~C)(i+1UL,j) += value2 * scalar;
7113 for(
size_t k=kbegin; k<K; ++k ) {
7114 value += A(i,k) * B(k,j);
7117 (~C)(i,j) += value * scalar;
7138 template<
typename MT3
7150 const ForwardFunctor fwd;
7154 addAssign( ~C, fwd( tmp * B ) * scalar );
7158 addAssign( ~C, fwd( A * tmp ) * scalar );
7160 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7162 addAssign( ~C, fwd( tmp * B ) * scalar );
7166 addAssign( ~C, fwd( A * tmp ) * scalar );
// Large-matrix addition-assignment kernel, fallback overload: forwards all
// arguments unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the enable condition that selects this overload is not
// visible in this listing.
7185 template<
typename MT3
7190 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7192 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition-assignment kernel, blocked overload: calls one of
// lmmm/ummm/mmm with scalar and a trailing ST2(1) (the plain-assignment
// counterpart passes ST2(0) in the same position).
// NOTE(review): the conditions choosing between the three calls are on lines
// missing from this listing (presumably LOW / UPP) - confirm.
7211 template<
typename MT3
7216 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7219 lmmm( C, A, B, scalar, ST2(1) );
7221 ummm( C, A, B, scalar, ST2(1) );
7223 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition-assignment kernel, fallback overload: no BLAS call is made
// here, the arguments are forwarded unchanged to selectLargeAddAssignKernel().
// NOTE(review): the enable condition for this overload is not visible in this
// listing.
7241 template<
typename MT3
7246 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7248 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS addition-assignment kernel, BLAS-backed overload. Only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// non-zero.
// NOTE(review): the branch conditions and the declaration/initialisation of
// `tmp` are on lines missing from this listing.
7253 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7267 template<
typename MT3
7272 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// trmm operates on the temporary with A applied from the left (triangle flag
// from IsLower<MT4>); the temporary is then added to C.
7278 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7279 addAssign( C, tmp );
// Same pattern with B applied from the right (triangle flag from
// IsLower<MT5>).
7283 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7284 addAssign( C, tmp );
// General product: gemm is called directly on C with ET(scalar) and ET(1) as
// the two scaling arguments, so no temporary is used on this path.
7287 gemm( C, A, B, ET(scalar), ET(1) );
7307 template<
typename MT >
7318 const ForwardFunctor fwd;
7326 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
7328 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
7348 template<
typename MT
7361 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7375 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatch for the subtraction assignment of a scaled matrix product:
// chooses between selectSmallSubAssignKernel() and selectBlasSubAssignKernel().
// NOTE(review): the opening of the condition and the `else` keyword are on
// lines missing from this listing; only the last two terms are visible.
7390 template<
typename MT3
7394 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Visible terms: (non-debug build and B has at most SIMDSIZE*10 columns) or
// (C holds fewer than DMATDMATMULT_THRESHOLD elements).
7397 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
7398 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
7399 selectSmallSubAssignKernel( C, A, B, scalar );
7401 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel (overload whose enable condition is
// not visible here): subtracts `tmp` from C via subAssign().
// NOTE(review): `tmp` is declared on a line missing from this listing;
// presumably it holds the evaluated A * B * scalar - confirm.
7419 template<
typename MT3
7424 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7427 subAssign( C, tmp );
// Default subtraction-assignment kernel for a non-diagonal A and a diagonal B
// (see the EnableIf_ condition below). Because only B(j,j) is read from B,
// every updated element is C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): jbegin/jend (the column range per row) are computed on lines
// missing from this listing.
7445 template<
typename MT3
7449 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7450 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7454 const size_t M( A.rows() );
7455 const size_t N( B.columns() );
7457 for(
size_t i=0UL; i<M; ++i )
7467 const size_t jnum( jend - jbegin );
// jpos = jbegin plus jnum rounded down to an even count (mask size_t(-2)
// clears the lowest bit), i.e. the end of the two-column unrolled loop.
7468 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
7470 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7471 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7472 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Leftover column at jpos. NOTE(review): its guard is on a missing line
// (presumably jpos < jend) - confirm.
7475 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel that reads only the diagonal element
// A(i,i) from A: every updated element is C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): the EnableIf_ condition of this overload and the computation
// of jbegin/jend are on lines missing from this listing; presumably this is
// the diagonal-A / non-diagonal-B case - confirm.
7495 template<
typename MT3
7500 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7504 const size_t M( A.rows() );
7505 const size_t N( B.columns() );
7507 for(
size_t i=0UL; i<M; ++i )
7517 const size_t jnum( jend - jbegin );
// jpos = jbegin plus jnum rounded down to an even count; end of the unrolled
// two-column loop.
7518 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
7520 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7521 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
7522 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Leftover column at jpos. NOTE(review): its guard is on a missing line
// (presumably jpos < jend) - confirm.
7525 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction-assignment kernel that touches only diagonal elements:
// for each row index i of A, C(i,i) -= A(i,i) * B(i,i) * scalar.
// NOTE(review): the EnableIf_ condition is on a line missing from this
// listing; presumably both A and B are diagonal - confirm.
7545 template<
typename MT3
7550 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7554 for(
size_t i=0UL; i<A.rows(); ++i ) {
7555 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel, fallback overload: forwards all
// arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the enable condition that selects this overload is not
// visible in this listing.
7574 template<
typename MT3
7579 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7581 selectDefaultSubAssignKernel( C, A, B, scalar );
7600 template<
typename MT3
7609 const size_t M( A.rows() );
7610 const size_t N( B.columns() );
7611 const size_t K( A.columns() );
7615 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
7618 const SIMDType factor(
set( scalar ) );
7624 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7625 for(
size_t i=0UL; i<M; ++i )
7638 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7640 for(
size_t k=kbegin; k<kend; ++k ) {
7641 const SIMDType a1(
set( A(i,k) ) );
7642 xmm1 += a1 * B.load(k,j );
7643 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7644 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7645 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7646 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7647 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7648 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7649 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7652 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7653 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7654 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7655 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7656 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7657 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
7658 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
7659 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
7664 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7668 for( ; (i+2UL) <= M; i+=2UL )
7681 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7683 for(
size_t k=kbegin; k<kend; ++k ) {
7684 const SIMDType a1(
set( A(i ,k) ) );
7685 const SIMDType a2(
set( A(i+1UL,k) ) );
7687 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7688 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7689 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7690 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7703 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7704 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
7705 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7706 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7707 (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
7708 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
7709 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
7710 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
7711 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
7712 (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
7724 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7726 for(
size_t k=kbegin; k<kend; ++k ) {
7727 const SIMDType a1(
set( A(i,k) ) );
7728 xmm1 += a1 * B.load(k,j );
7729 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7730 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7731 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7732 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7735 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7736 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7737 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7738 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7739 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7743 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7747 for( ; (i+2UL) <= M; i+=2UL )
7760 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7762 for(
size_t k=kbegin; k<kend; ++k ) {
7763 const SIMDType a1(
set( A(i ,k) ) );
7764 const SIMDType a2(
set( A(i+1UL,k) ) );
7766 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7767 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7768 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7779 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7780 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
7781 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7782 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7783 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
7784 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
7785 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
7786 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
7800 for(
size_t k=kbegin; k<kend; ++k ) {
7801 const SIMDType a1(
set( A(i,k) ) );
7802 xmm1 += a1 * B.load(k,j );
7803 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7804 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7805 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7808 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7809 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7810 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7811 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7815 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
7819 for( ; (i+2UL) <= M; i+=2UL )
7832 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7834 for(
size_t k=kbegin; k<kend; ++k ) {
7835 const SIMDType a1(
set( A(i ,k) ) );
7836 const SIMDType a2(
set( A(i+1UL,k) ) );
7838 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7839 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7848 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7849 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
7850 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7851 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
7852 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
7853 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
7867 for(
size_t k=kbegin; k<kend; ++k ) {
7868 const SIMDType a1(
set( A(i,k) ) );
7869 xmm1 += a1 * B.load(k,j );
7870 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7871 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7874 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7875 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7876 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7880 for( ; !( LOW &&
UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7882 const size_t iend( UPP ?
min(j+SIMDSIZE*2UL,M) : M );
7883 size_t i( LOW ? j : 0UL );
7885 for( ; (i+4UL) <= iend; i+=4UL )
7898 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7900 for(
size_t k=kbegin; k<kend; ++k ) {
7901 const SIMDType a1(
set( A(i ,k) ) );
7902 const SIMDType a2(
set( A(i+1UL,k) ) );
7903 const SIMDType a3(
set( A(i+2UL,k) ) );
7904 const SIMDType a4(
set( A(i+3UL,k) ) );
7906 const SIMDType b2( B.load(k,j+SIMDSIZE) );
7917 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7918 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
7919 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7920 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
7921 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7922 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
7923 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
7924 (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
7927 for( ; (i+3UL) <= iend; i+=3UL )
7940 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7942 for(
size_t k=kbegin; k<kend; ++k ) {
7943 const SIMDType a1(
set( A(i ,k) ) );
7944 const SIMDType a2(
set( A(i+1UL,k) ) );
7945 const SIMDType a3(
set( A(i+2UL,k) ) );
7947 const SIMDType b2( B.load(k,j+SIMDSIZE) );
7956 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7957 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
7958 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7959 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
7960 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7961 (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
7964 for( ; (i+2UL) <= iend; i+=2UL )
7977 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7980 for( ; (k+2UL) <= kend; k+=2UL ) {
7981 const SIMDType a1(
set( A(i ,k ) ) );
7982 const SIMDType a2(
set( A(i+1UL,k ) ) );
7983 const SIMDType a3(
set( A(i ,k+1UL) ) );
7984 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
7985 const SIMDType b1( B.load(k ,j ) );
7986 const SIMDType b2( B.load(k ,j+SIMDSIZE) );
7987 const SIMDType b3( B.load(k+1UL,j ) );
7988 const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
7999 for( ; k<kend; ++k ) {
8000 const SIMDType a1(
set( A(i ,k) ) );
8001 const SIMDType a2(
set( A(i+1UL,k) ) );
8003 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8010 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
8011 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
8012 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - (xmm3+xmm7) * factor );
8013 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
8028 for( ; (k+2UL) <= kend; k+=2UL ) {
8029 const SIMDType a1(
set( A(i,k ) ) );
8030 const SIMDType a2(
set( A(i,k+1UL) ) );
8031 xmm1 += a1 * B.load(k ,j );
8032 xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8033 xmm3 += a2 * B.load(k+1UL,j );
8034 xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8037 for( ; k<kend; ++k ) {
8038 const SIMDType a1(
set( A(i,k) ) );
8039 xmm1 += a1 * B.load(k,j );
8040 xmm2 += a1 * B.load(k,j+SIMDSIZE);
8043 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
8044 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
8048 for( ; j<jpos; j+=SIMDSIZE )
8050 const size_t iend( LOW && UPP ?
min(j+SIMDSIZE,M) : M );
8051 size_t i( LOW ? j : 0UL );
8053 for( ; (i+4UL) <= iend; i+=4UL )
8064 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8067 for( ; (k+2UL) <= kend; k+=2UL ) {
8069 const SIMDType b2( B.load(k+1UL,j) );
8070 xmm1 +=
set( A(i ,k ) ) * b1;
8071 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8072 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8073 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8074 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8075 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8076 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8077 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
8080 for( ; k<kend; ++k ) {
8082 xmm1 +=
set( A(i ,k) ) * b1;
8083 xmm2 +=
set( A(i+1UL,k) ) * b1;
8084 xmm3 +=
set( A(i+2UL,k) ) * b1;
8085 xmm4 +=
set( A(i+3UL,k) ) * b1;
8088 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm5) * factor );
8089 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm6) * factor );
8090 (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm7) * factor );
8091 (~C).store( i+3UL, j, (~C).load(i+3UL,j) - (xmm4+xmm8) * factor );
8094 for( ; (i+3UL) <= iend; i+=3UL )
8105 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8108 for( ; (k+2UL) <= kend; k+=2UL ) {
8110 const SIMDType b2( B.load(k+1UL,j) );
8111 xmm1 +=
set( A(i ,k ) ) * b1;
8112 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8113 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8114 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8115 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8116 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
8119 for( ; k<kend; ++k ) {
8121 xmm1 +=
set( A(i ,k) ) * b1;
8122 xmm2 +=
set( A(i+1UL,k) ) * b1;
8123 xmm3 +=
set( A(i+2UL,k) ) * b1;
8126 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm4) * factor );
8127 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm5) * factor );
8128 (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm6) * factor );
8131 for( ; (i+2UL) <= iend; i+=2UL )
8145 for( ; (k+2UL) <= kend; k+=2UL ) {
8147 const SIMDType b2( B.load(k+1UL,j) );
8148 xmm1 +=
set( A(i ,k ) ) * b1;
8149 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8150 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8151 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
8154 for( ; k<kend; ++k ) {
8156 xmm1 +=
set( A(i ,k) ) * b1;
8157 xmm2 +=
set( A(i+1UL,k) ) * b1;
8160 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
8161 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm4) * factor );
8175 for( ; (k+2UL) <= K; k+=2UL ) {
8176 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8177 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
8181 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8184 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
8188 for( ; remainder && j<N; ++j )
8190 const size_t iend( UPP ? j+1UL : M );
8191 size_t i( LOW ? j : 0UL );
8193 for( ; (i+2UL) <= iend; i+=2UL )
8207 for(
size_t k=kbegin; k<kend; ++k ) {
8208 value1 += A(i ,k) * B(k,j);
8209 value2 += A(i+1UL,k) * B(k,j);
8212 (~C)(i ,j) -= value1 * scalar;
8213 (~C)(i+1UL,j) -= value2 * scalar;
8226 for(
size_t k=kbegin; k<K; ++k ) {
8227 value += A(i,k) * B(k,j);
8230 (~C)(i,j) -= value * scalar;
8250 template<
typename MT3
8262 const ForwardFunctor fwd;
8266 subAssign( ~C, fwd( tmp * B ) * scalar );
8270 subAssign( ~C, fwd( A * tmp ) * scalar );
8272 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
8274 subAssign( ~C, fwd( tmp * B ) * scalar );
8278 subAssign( ~C, fwd( A * tmp ) * scalar );
// Large-matrix subtraction-assignment kernel, fallback overload: forwards all
// arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the enable condition that selects this overload is not
// visible in this listing.
8297 template<
typename MT3
8302 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8304 selectDefaultSubAssignKernel( C, A, B, scalar );
8323 template<
typename MT3
8328 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8331 lmmm( C, A, B, -scalar, ST2(1) );
8333 ummm( C, A, B, -scalar, ST2(1) );
8335 mmm( C, A, B, -scalar, ST2(1) );
8353 template<
typename MT3
8358 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8360 selectLargeSubAssignKernel( C, A, B, scalar );
8365 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8379 template<
typename MT3
8384 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8390 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8391 subAssign( C, tmp );
8395 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8396 subAssign( C, tmp );
8399 gemm( C, A, B, ET(-scalar), ET(1) );
8419 template<
typename MT >
8430 const ForwardFunctor fwd;
8438 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
8440 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
8460 template<
typename MT
8474 schurAssign( ~lhs, tmp );
8505 template<
typename MT
8518 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8521 else if( left.columns() == 0UL ) {
8555 template<
typename MT
8574 const ForwardFunctor fwd;
8576 const TmpType tmp( rhs );
8595 template<
typename MT >
8606 const ForwardFunctor fwd;
8635 template<
typename MT
8648 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8680 template<
typename MT >
8691 const ForwardFunctor fwd;
8724 template<
typename MT
8737 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8769 template<
typename MT >
8780 const ForwardFunctor fwd;
8810 template<
typename MT
8890 template<
typename MT1
8892 inline decltype(
auto)
8938 template<
typename MT1
8982 template<
typename MT1
9026 template<
typename MT1
9070 template<
typename MT1
9114 template<
typename MT1
9145 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9146 struct Rows< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9163 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9164 struct Columns< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
// IsAligned specialization for the dense matrix/dense matrix multiplication expression.
// The expression is reported as aligned only if both operand types MT1 and MT2 are
// aligned (logical And of the two IsAligned traits). The SF/HF/LF/UF flags are part of
// the specialized type but do not take part in the result.
9181 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9182 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9183 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
9199 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9200 struct IsSymmetric< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9203 , IsBuiltin< ElementType_< DMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
9204 , And< Bool<LF>, Bool<UF> > >::value >
9220 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
9221 struct IsHermitian< DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
9238 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9239 struct IsLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9241 , And< IsLower<MT1>, IsLower<MT2> >
9242 , And< Or< Bool<SF>, Bool<HF> >
9243 , IsUpper<MT1>, IsUpper<MT2> > >::value >
// IsUniLower specialization for the dense matrix/dense matrix multiplication expression.
// The value is true in either of two cases:
//   1. both operands MT1 and MT2 are lower unitriangular, or
//   2. the expression carries the symmetric (SF) or Hermitian (HF) flag and both
//      operands are upper unitriangular.
// NOTE(review): case 2 presumably relies on a symmetric/Hermitian upper unitriangular
// product having no off-diagonal entries, which makes it lower unitriangular as well.
// The LF and UF flags are not consulted here.
9259 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9260 struct IsUniLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9261 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
9262 , And< Or< Bool<SF>, Bool<HF> >
9263 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
// Type trait specialization for the dense matrix/dense matrix multiplication expression.
// NOTE(review): the struct header (listing line 9280) is missing from this listing;
// judging by the condition below this is presumably the IsStrictlyLower trait -- confirm.
// The value is true in any of the following cases:
//   1. MT1 is strictly lower and MT2 is lower,
//   2. MT2 is strictly lower and MT1 is lower,
//   3. the expression carries the symmetric (SF) or Hermitian (HF) flag and one operand
//      is strictly upper while the other one is upper.
// The LF and UF flags are not consulted here.
9279 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9281 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9282 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
9283 , And< Or< Bool<SF>, Bool<HF> >
9284 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9285 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
9301 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9302 struct IsUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9304 , And< IsUpper<MT1>, IsUpper<MT2> >
9305 , And< Or< Bool<SF>, Bool<HF> >
9306 , IsLower<MT1>, IsLower<MT2> > >::value >
// IsUniUpper specialization for the dense matrix/dense matrix multiplication expression.
// The value is true in either of two cases:
//   1. both operands MT1 and MT2 are upper unitriangular, or
//   2. the expression carries the symmetric (SF) or Hermitian (HF) flag and both
//      operands are lower unitriangular.
// NOTE(review): case 2 presumably relies on a symmetric/Hermitian lower unitriangular
// product having no off-diagonal entries, which makes it upper unitriangular as well.
// The LF and UF flags are not consulted here.
9322 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9323 struct IsUniUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9324 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
9325 , And< Or< Bool<SF>, Bool<HF> >
9326 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
// Type trait specialization for the dense matrix/dense matrix multiplication expression.
// NOTE(review): the struct header (listing line 9343) is missing from this listing;
// judging by the condition below this is presumably the IsStrictlyUpper trait -- confirm.
// The value is true in any of the following cases:
//   1. MT1 is strictly upper and MT2 is upper,
//   2. MT2 is strictly upper and MT1 is upper,
//   3. the expression carries the symmetric (SF) or Hermitian (HF) flag and one operand
//      is strictly lower while the other one is lower.
// The LF and UF flags are not consulted here.
9342 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9344 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9345 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
9346 , And< Or< Bool<SF>, Bool<HF> >
9347 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9348 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:325
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:290
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:414
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:180
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:286
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Header file for the SparseVector base class.
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:490
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:547
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:299
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:620
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:537
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:458
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1027
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:284
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Expression object for dense matrix-dense matrix multiplications.The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:151
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:434
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:179
Header file for the IsFloat type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:281
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:285
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:181
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:283
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:478
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1027
Header file for the IsLower type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:340
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:619
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:264
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:108
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:293
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:178
Header file for the MatScalarMultExpr base class.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:388
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:110
Utility type for generic codes.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1029
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:101
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:287
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:446
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:424
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:296
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:790
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1029
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:468
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:404
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:742
Header file for the IsResizable type trait.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:491
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:282
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.