35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 143 template<
typename MT1
149 class TDMatTDMatMultExpr
150 :
public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
151 ,
private Computation
// Effective structure flags of the product, derived from the SF/HF/LF/UF
// template flags of TDMatTDMatMultExpr. LOW and UPP can both be true at the
// same time; the kernels test the combination `LOW && UPP` explicitly.
// SYM: SF is set and none of HF, LF, UF is set.
176 SYM = ( SF && !( HF || LF || UF ) ),
// HERM: HF is set and neither LF nor UF is set.
177 HERM = ( HF && !( LF || UF ) ),
// LOW: LF is set, or UF is set together with SF or HF.
178 LOW = ( LF || ( ( SF || HF ) && UF ) ),
// UPP: UF is set, or LF is set together with SF or HF.
179 UPP = ( UF || ( ( SF || HF ) && LF ) )
191 template<
typename T1,
typename T2,
typename T3 >
192 struct CanExploitSymmetry {
// Compile-time trait that selects the evaluation strategy in which the
// operands are evaluated before the multiplication is carried out.
// T1, T2, T3 are forwarded unchanged to CanExploitSymmetry.
205 template<
typename T1,
typename T2,
typename T3 >
206 struct IsEvaluationRequired {
// value is true only if at least one operand needs an intermediate evaluation
// (evaluateLeft / evaluateRight) AND the symmetry shortcut is NOT available.
// The negation keeps this trait and CanExploitSymmetry mutually exclusive.
// NOTE(review): presumably overloads are constrained on each of the two
// traits; without the negation both would match at once - confirm.
207 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
208 !CanExploitSymmetry<T1,T2,T3>::value };
218 template<
typename T1,
typename T2,
typename T3 >
219 struct UseBlasKernel {
220 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
226 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
241 template<
typename T1,
typename T2,
typename T3 >
242 struct UseVectorizedDefaultKernel {
243 enum :
bool { value = useOptimizedKernels &&
245 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
303 MT1::simdEnabled && MT2::simdEnabled &&
// Compilation switch for SMP assignment of the expression: it is true only if
// neither operand needs an intermediate evaluation (evaluateLeft and
// evaluateRight are both false) and both operand types MT1 and MT2 report
// smpAssignable themselves.
308 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
309 !evaluateRight && MT2::smpAssignable };
364 :(
lhs_.columns() ) ) );
368 const size_t n(
end - begin );
387 if( i >=
lhs_.rows() ) {
390 if( j >=
rhs_.columns() ) {
402 inline size_t rows() const noexcept {
413 return rhs_.columns();
// Reports whether the expression can alias with the given address.
// \param alias  Address to check; it is passed on unchanged to both operands.
// \return true if the left-hand operand (lhs_) or the right-hand operand
//         (rhs_) reports isAliased( alias ), false otherwise.
443 template<
typename T >
444 inline bool canAlias(
const T* alias )
const noexcept {
// The check delegates to isAliased() of the operands, not to their canAlias().
445 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address.
// \param alias  Address to check; it is passed on unchanged to both operands.
// \return true if the left-hand operand (lhs_) or the right-hand operand
//         (rhs_) reports isAliased( alias ), false otherwise.
455 template<
typename T >
456 inline bool isAliased(
const T* alias )
const noexcept {
// Short-circuits: rhs_ is only queried when lhs_ is not aliased.
457 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
467 return lhs_.isAligned() &&
rhs_.isAligned();
478 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
480 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
481 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
505 template<
typename MT
515 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
518 else if( rhs.lhs_.columns() == 0UL ) {
533 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the assignment C = A * B.
// \param C  Target matrix of the assignment.
// \param A  Left-hand side operand of the multiplication.
// \param B  Right-hand side operand of the multiplication.
// Among its conditions, the small kernel (selectSmallAssignKernel) is chosen
// when BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows, or when the
// number of elements of C is below TDMATTDMATMULT_THRESHOLD. The other path
// calls selectBlasAssignKernel.
549 template<
typename MT3
552 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
555 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
556 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
557 selectSmallAssignKernel( C, A, B );
559 selectBlasAssignKernel( C, A, B );
578 template<
typename MT3
582 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
584 const size_t M( A.rows() );
585 const size_t N( B.columns() );
586 const size_t K( A.columns() );
590 for(
size_t j=0UL; j<N; ++j )
601 for(
size_t i=0UL; i<M; ++i ) {
610 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
611 :( LOW ?
max(j,kbegin) : kbegin ) )
612 :( LOW ? j : 0UL ) );
615 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
616 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
617 :( UPP ? j+1UL : M ) );
620 for(
size_t i=0UL; i<ibegin; ++i ) {
627 for(
size_t i=ibegin; i<iend; ++i ) {
628 C(i,j) = A(i,kbegin) * B(kbegin,j);
631 for(
size_t i=iend; i<M; ++i ) {
640 for(
size_t k=kbegin+1UL; k<kend; ++k )
644 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
645 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
646 :( SYM || HERM || LOW ? j : 0UL ) );
649 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
650 :( UPP ?
min(j+1UL,k) : k ) )
651 :( UPP ? j+1UL : M ) );
653 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
656 for(
size_t i=ibegin; i<iend; ++i ) {
657 C(i,j) += A(i,k) * B(k,j);
660 C(iend,j) = A(iend,k) * B(k,j);
// Mirror step: every element above the diagonal (i<j) is filled from the
// element C(j,i) below the diagonal; for HERM the complex conjugate of that
// element is stored instead of the element itself.
// NOTE(review): presumably guarded by SYM || HERM - confirm against the
// enclosing condition.
666 for(
size_t j=1UL; j<N; ++j ) {
667 for(
size_t i=0UL; i<j; ++i ) {
668 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
690 template<
typename MT3
693 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
694 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
698 const size_t M( A.rows() );
699 const size_t N( B.columns() );
701 for(
size_t j=0UL; j<N; ++j )
712 for(
size_t i=0UL; i<ibegin; ++i ) {
716 for(
size_t i=ibegin; i<iend; ++i ) {
717 C(i,j) = A(i,j) * B(j,j);
720 for(
size_t i=iend; i<M; ++i ) {
743 template<
typename MT3
747 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
751 const size_t M( A.rows() );
752 const size_t N( B.columns() );
754 for(
size_t j=0UL; j<N; ++j )
765 for(
size_t i=0UL; i<ibegin; ++i ) {
769 for(
size_t i=ibegin; i<iend; ++i ) {
770 C(i,j) = A(i,i) * B(i,j);
773 for(
size_t i=iend; i<M; ++i ) {
796 template<
typename MT3
800 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
806 for(
size_t i=0UL; i<A.rows(); ++i ) {
807 C(i,i) = A(i,i) * B(i,i);
827 template<
typename MT3
831 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
833 selectDefaultAssignKernel( C, A, B );
853 template<
typename MT3
864 const ForwardFunctor fwd;
868 assign( ~C, fwd( A * tmp ) );
872 assign( ~C, fwd( tmp * B ) );
874 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
876 assign( ~C, fwd( A * tmp ) );
880 assign( ~C, fwd( tmp * B ) );
901 template<
typename MT3
909 const size_t M( A.rows() );
910 const size_t N( B.columns() );
911 const size_t K( A.columns() );
915 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
918 if( LOW && UPP && M > SIMDSIZE*3UL ) {
927 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
928 for(
size_t j=0UL; j<N; ++j )
941 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
943 for(
size_t k=kbegin; k<kend; ++k ) {
945 xmm1 += A.load(i ,k) * b1;
946 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
947 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
948 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
949 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
950 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
951 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
952 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
955 (~C).store( i , j, xmm1 );
956 (~C).store( i+SIMDSIZE , j, xmm2 );
957 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
958 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
959 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
960 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
961 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
962 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
967 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
971 for( ; (j+2UL) <= N; j+=2UL )
984 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
986 for(
size_t k=kbegin; k<kend; ++k ) {
988 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
989 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
990 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
991 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
992 const SIMDType b1(
set( B(k,j ) ) );
993 const SIMDType b2(
set( B(k,j+1UL) ) );
1006 (~C).store( i , j , xmm1 );
1007 (~C).store( i+SIMDSIZE , j , xmm2 );
1008 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1009 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1010 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1011 (~C).store( i , j+1UL, xmm6 );
1012 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1013 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1014 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1015 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1027 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1029 for(
size_t k=kbegin; k<kend; ++k ) {
1030 const SIMDType b1(
set( B(k,j) ) );
1031 xmm1 += A.load(i ,k) * b1;
1032 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1033 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1034 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1035 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1038 (~C).store( i , j, xmm1 );
1039 (~C).store( i+SIMDSIZE , j, xmm2 );
1040 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1041 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1042 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1046 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1048 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
1049 size_t j( UPP ? i : 0UL );
1051 for( ; (j+2UL) <= jend; j+=2UL )
1064 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1066 for(
size_t k=kbegin; k<kend; ++k ) {
1068 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1069 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1070 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1071 const SIMDType b1(
set( B(k,j ) ) );
1072 const SIMDType b2(
set( B(k,j+1UL) ) );
1083 (~C).store( i , j , xmm1 );
1084 (~C).store( i+SIMDSIZE , j , xmm2 );
1085 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1086 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1087 (~C).store( i , j+1UL, xmm5 );
1088 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1089 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1090 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1104 for(
size_t k=kbegin; k<kend; ++k ) {
1105 const SIMDType b1(
set( B(k,j) ) );
1106 xmm1 += A.load(i ,k) * b1;
1107 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1108 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1109 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1112 (~C).store( i , j, xmm1 );
1113 (~C).store( i+SIMDSIZE , j, xmm2 );
1114 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1115 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1119 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1121 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
1122 size_t j( UPP ? i : 0UL );
1124 for( ; (j+2UL) <= jend; j+=2UL )
1137 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1139 for(
size_t k=kbegin; k<kend; ++k ) {
1141 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1142 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1143 const SIMDType b1(
set( B(k,j ) ) );
1144 const SIMDType b2(
set( B(k,j+1UL) ) );
1153 (~C).store( i , j , xmm1 );
1154 (~C).store( i+SIMDSIZE , j , xmm2 );
1155 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1156 (~C).store( i , j+1UL, xmm4 );
1157 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1158 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1172 for(
size_t k=kbegin; k<kend; ++k ) {
1173 const SIMDType b1(
set( B(k,j) ) );
1174 xmm1 += A.load(i ,k) * b1;
1175 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1176 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1179 (~C).store( i , j, xmm1 );
1180 (~C).store( i+SIMDSIZE , j, xmm2 );
1181 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1185 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1187 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
1188 size_t j( UPP ? i : 0UL );
1190 for( ; (j+4UL) <= jend; j+=4UL )
1203 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1205 for(
size_t k=kbegin; k<kend; ++k ) {
1207 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1208 const SIMDType b1(
set( B(k,j ) ) );
1209 const SIMDType b2(
set( B(k,j+1UL) ) );
1210 const SIMDType b3(
set( B(k,j+2UL) ) );
1211 const SIMDType b4(
set( B(k,j+3UL) ) );
1222 (~C).store( i , j , xmm1 );
1223 (~C).store( i+SIMDSIZE, j , xmm2 );
1224 (~C).store( i , j+1UL, xmm3 );
1225 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1226 (~C).store( i , j+2UL, xmm5 );
1227 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1228 (~C).store( i , j+3UL, xmm7 );
1229 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
1232 for( ; (j+3UL) <= jend; j+=3UL )
1245 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1247 for(
size_t k=kbegin; k<kend; ++k ) {
1249 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1250 const SIMDType b1(
set( B(k,j ) ) );
1251 const SIMDType b2(
set( B(k,j+1UL) ) );
1252 const SIMDType b3(
set( B(k,j+2UL) ) );
1261 (~C).store( i , j , xmm1 );
1262 (~C).store( i+SIMDSIZE, j , xmm2 );
1263 (~C).store( i , j+1UL, xmm3 );
1264 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1265 (~C).store( i , j+2UL, xmm5 );
1266 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1269 for( ; (j+2UL) <= jend; j+=2UL )
1282 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1285 for( ; (k+2UL) <= kend; k+=2UL ) {
1286 const SIMDType a1( A.load(i ,k ) );
1287 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
1288 const SIMDType a3( A.load(i ,k+1UL) );
1289 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
1290 const SIMDType b1(
set( B(k ,j ) ) );
1291 const SIMDType b2(
set( B(k ,j+1UL) ) );
1292 const SIMDType b3(
set( B(k+1UL,j ) ) );
1293 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
1304 for( ; k<kend; ++k ) {
1306 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1307 const SIMDType b1(
set( B(k,j ) ) );
1308 const SIMDType b2(
set( B(k,j+1UL) ) );
1315 (~C).store( i , j , xmm1+xmm5 );
1316 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
1317 (~C).store( i , j+1UL, xmm3+xmm7 );
1318 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
1333 for( ; (k+2UL) <= kend; k+=2UL ) {
1334 const SIMDType b1(
set( B(k ,j) ) );
1335 const SIMDType b2(
set( B(k+1UL,j) ) );
1336 xmm1 += A.load(i ,k ) * b1;
1337 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
1338 xmm3 += A.load(i ,k+1UL) * b2;
1339 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
1342 for( ; k<kend; ++k ) {
1343 const SIMDType b1(
set( B(k,j) ) );
1344 xmm1 += A.load(i ,k) * b1;
1345 xmm2 += A.load(i+SIMDSIZE,k) * b1;
1348 (~C).store( i , j, xmm1+xmm3 );
1349 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
1353 for( ; i<ipos; i+=SIMDSIZE )
1355 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
1356 size_t j( UPP ? i : 0UL );
1358 for( ; (j+4UL) <= jend; j+=4UL )
1369 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1372 for( ; (k+2UL) <= kend; k+=2UL ) {
1374 const SIMDType a2( A.load(i,k+1UL) );
1375 xmm1 += a1 *
set( B(k ,j ) );
1376 xmm2 += a1 *
set( B(k ,j+1UL) );
1377 xmm3 += a1 *
set( B(k ,j+2UL) );
1378 xmm4 += a1 *
set( B(k ,j+3UL) );
1379 xmm5 += a2 *
set( B(k+1UL,j ) );
1380 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
1381 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
1382 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
1385 for( ; k<kend; ++k ) {
1387 xmm1 += a1 *
set( B(k,j ) );
1388 xmm2 += a1 *
set( B(k,j+1UL) );
1389 xmm3 += a1 *
set( B(k,j+2UL) );
1390 xmm4 += a1 *
set( B(k,j+3UL) );
1393 (~C).store( i, j , xmm1+xmm5 );
1394 (~C).store( i, j+1UL, xmm2+xmm6 );
1395 (~C).store( i, j+2UL, xmm3+xmm7 );
1396 (~C).store( i, j+3UL, xmm4+xmm8 );
1399 for( ; (j+3UL) <= jend; j+=3UL )
1410 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1413 for( ; (k+2UL) <= kend; k+=2UL ) {
1415 const SIMDType a2( A.load(i,k+1UL) );
1416 xmm1 += a1 *
set( B(k ,j ) );
1417 xmm2 += a1 *
set( B(k ,j+1UL) );
1418 xmm3 += a1 *
set( B(k ,j+2UL) );
1419 xmm4 += a2 *
set( B(k+1UL,j ) );
1420 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
1421 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
1424 for( ; k<kend; ++k ) {
1426 xmm1 += a1 *
set( B(k,j ) );
1427 xmm2 += a1 *
set( B(k,j+1UL) );
1428 xmm3 += a1 *
set( B(k,j+2UL) );
1431 (~C).store( i, j , xmm1+xmm4 );
1432 (~C).store( i, j+1UL, xmm2+xmm5 );
1433 (~C).store( i, j+2UL, xmm3+xmm6 );
1436 for( ; (j+2UL) <= jend; j+=2UL )
1450 for( ; (k+2UL) <= kend; k+=2UL ) {
1452 const SIMDType a2( A.load(i,k+1UL) );
1453 xmm1 += a1 *
set( B(k ,j ) );
1454 xmm2 += a1 *
set( B(k ,j+1UL) );
1455 xmm3 += a2 *
set( B(k+1UL,j ) );
1456 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
1459 for( ; k<kend; ++k ) {
1461 xmm1 += a1 *
set( B(k,j ) );
1462 xmm2 += a1 *
set( B(k,j+1UL) );
1465 (~C).store( i, j , xmm1+xmm3 );
1466 (~C).store( i, j+1UL, xmm2+xmm4 );
1480 for( ; (k+2UL) <= K; k+=2UL ) {
1481 xmm1 += A.load(i,k ) *
set( B(k ,j) );
1482 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
1486 xmm1 += A.load(i,k) *
set( B(k,j) );
1489 (~C).store( i, j, xmm1+xmm2 );
1493 for( ; remainder && i<M; ++i )
1495 size_t j( LOW && UPP ? i : 0UL );
1497 for( ; (j+2UL) <= N; j+=2UL )
1511 for(
size_t k=kbegin; k<kend; ++k ) {
1512 value1 += A(i,k) * B(k,j );
1513 value2 += A(i,k) * B(k,j+1UL);
1516 (~C)(i,j ) = value1;
1517 (~C)(i,j+1UL) = value2;
1530 for(
size_t k=kbegin; k<K; ++k ) {
1531 value += A(i,k) * B(k,j);
// Post-processing of the vectorized kernel; all three branches require
// M > SIMDSIZE*4 and work on blocks of SIMDSIZE*4 rows/columns.
// Symmetric/Hermitian result: for every column j >= SIMDSIZE*4 the rows
// 0..iend-1 are copied from the mirrored element C(j,i) (conjugated for HERM).
// iend is j rounded down to a multiple of SIMDSIZE*4.
1539 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1540 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1541 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1542 for(
size_t i=0UL; i<iend; ++i ) {
1543 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// Lower (but not upper) result: visits the same index range as the branch
// above, i.e. rows 0..iend-1 of every column j >= SIMDSIZE*4.
1547 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1548 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1549 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1550 for(
size_t i=0UL; i<iend; ++i ) {
// Upper (but not lower) result: visits columns 0..jend-1 of every row
// i >= SIMDSIZE*4, where jend is i rounded down to a multiple of SIMDSIZE*4.
1555 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
1556 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1557 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1558 for(
size_t j=0UL; j<jend; ++j ) {
1581 template<
typename MT3
1585 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1587 selectDefaultAssignKernel( C, A, B );
1607 template<
typename MT3
1611 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1641 template<
typename MT3
1645 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1647 selectLargeAssignKernel( C, A, B );
1653 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1667 template<
typename MT3
1671 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1684 gemm( C, A, B, ET(1), ET(0) );
1704 template<
typename MT
1723 const ForwardFunctor fwd;
1725 const TmpType tmp(
serial( rhs ) );
1726 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment: instead of evaluating rhs.lhs_ * rhs.rhs_
// directly, the product is rebuilt with transposed operands and handed to
// assign() through the forwarding functor fwd.
// NOTE(review): the conditions selecting between the three variants are
// presumably based on which operand is symmetric - confirm.
1746 template<
typename MT >
1757 const ForwardFunctor fwd;
// Variant 1: both operands are transposed.
1760 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Variant 2: only the left-hand side operand is transposed.
1762 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Variant 3: only the right-hand side operand is transposed.
1764 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
1782 template<
typename MT
1792 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1806 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1822 template<
typename MT3
1825 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1828 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1829 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1830 selectSmallAddAssignKernel( C, A, B );
1832 selectBlasAddAssignKernel( C, A, B );
1851 template<
typename MT3
1855 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1857 const size_t M( A.rows() );
1858 const size_t N( B.columns() );
1859 const size_t K( A.columns() );
1863 for(
size_t j=0UL; j<N; ++j )
1873 for(
size_t k=kbegin; k<kend; ++k )
1877 ?( LOW ?
max(j,k+1UL) : k+1UL )
1878 :( LOW ?
max(j,k) : k ) )
1879 :( LOW ? j : 0UL ) );
1882 ?( UPP ?
min(j+1UL,k) : k )
1883 :( UPP ?
min(j,k)+1UL : k+1UL ) )
1884 :( UPP ? j+1UL : M ) );
1886 if( ( LOW || UPP ) && ibegin >= iend )
continue;
1889 const size_t inum( iend - ibegin );
1890 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1892 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1893 C(i ,j) += A(i ,k) * B(k,j);
1894 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1897 C(ipos,j) += A(ipos,k) * B(k,j);
1919 template<
typename MT3
1922 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
1923 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1927 const size_t M( A.rows() );
1928 const size_t N( B.columns() );
1930 for(
size_t j=0UL; j<N; ++j )
1940 const size_t inum( iend - ibegin );
1941 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1943 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1944 C(i ,j) += A(i ,j) * B(j,j);
1945 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1948 C(ipos,j) += A(ipos,j) * B(j,j);
1969 template<
typename MT3
1973 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1977 const size_t M( A.rows() );
1978 const size_t N( B.columns() );
1980 for(
size_t j=0UL; j<N; ++j )
1990 const size_t inum( iend - ibegin );
1991 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1993 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1994 C(i ,j) += A(i ,i ) * B(i ,j);
1995 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
1998 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2019 template<
typename MT3
2023 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2027 for(
size_t i=0UL; i<A.rows(); ++i ) {
2028 C(i,i) += A(i,i) * B(i,i);
2048 template<
typename MT3
2052 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2054 selectDefaultAddAssignKernel( C, A, B );
2074 template<
typename MT3
2085 const ForwardFunctor fwd;
2089 addAssign( ~C, fwd( A * tmp ) );
2093 addAssign( ~C, fwd( tmp * B ) );
2095 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2097 addAssign( ~C, fwd( A * tmp ) );
2101 addAssign( ~C, fwd( tmp * B ) );
2122 template<
typename MT3
2130 const size_t M( A.rows() );
2131 const size_t N( B.columns() );
2132 const size_t K( A.columns() );
2136 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
2143 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2144 for(
size_t j=0UL; j<N; ++j )
2158 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2159 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2160 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2161 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2162 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
2163 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
2164 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
2166 for(
size_t k=kbegin; k<kend; ++k ) {
2167 const SIMDType b1(
set( B(k,j) ) );
2168 xmm1 += A.load(i ,k) * b1;
2169 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2170 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2171 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2172 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2173 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2174 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2175 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2178 (~C).store( i , j, xmm1 );
2179 (~C).store( i+SIMDSIZE , j, xmm2 );
2180 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2181 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2182 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2183 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
2184 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
2185 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
2190 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2194 for( ; (j+2UL) <= N; j+=2UL )
2207 SIMDType xmm1 ( (~C).load(i ,j ) );
2208 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
2209 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
2210 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
2211 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
2212 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
2213 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
2214 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2215 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2216 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
2218 for(
size_t k=kbegin; k<kend; ++k ) {
2220 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2221 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2222 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2223 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2224 const SIMDType b1(
set( B(k,j ) ) );
2225 const SIMDType b2(
set( B(k,j+1UL) ) );
2238 (~C).store( i , j , xmm1 );
2239 (~C).store( i+SIMDSIZE , j , xmm2 );
2240 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2241 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2242 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
2243 (~C).store( i , j+1UL, xmm6 );
2244 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
2245 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2246 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2247 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2260 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2261 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2262 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2263 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2265 for(
size_t k=kbegin; k<kend; ++k ) {
2266 const SIMDType b1(
set( B(k,j) ) );
2267 xmm1 += A.load(i ,k) * b1;
2268 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2269 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2270 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2271 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2274 (~C).store( i , j, xmm1 );
2275 (~C).store( i+SIMDSIZE , j, xmm2 );
2276 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2277 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2278 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2282 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2286 for( ; (j+2UL) <= N; j+=2UL )
2300 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2301 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2302 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2303 SIMDType xmm5( (~C).load(i ,j+1UL) );
2304 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2305 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2306 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2308 for(
size_t k=kbegin; k<kend; ++k ) {
2310 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2311 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2312 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2313 const SIMDType b1(
set( B(k,j ) ) );
2314 const SIMDType b2(
set( B(k,j+1UL) ) );
2325 (~C).store( i , j , xmm1 );
2326 (~C).store( i+SIMDSIZE , j , xmm2 );
2327 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2328 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2329 (~C).store( i , j+1UL, xmm5 );
2330 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2331 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2332 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2345 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2346 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2347 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2349 for(
size_t k=kbegin; k<kend; ++k ) {
2350 const SIMDType b1(
set( B(k,j) ) );
2351 xmm1 += A.load(i ,k) * b1;
2352 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2353 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2354 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2357 (~C).store( i , j, xmm1 );
2358 (~C).store( i+SIMDSIZE , j, xmm2 );
2359 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2360 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2364 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2368 for( ; (j+2UL) <= N; j+=2UL )
2382 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2383 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2384 SIMDType xmm4( (~C).load(i ,j+1UL) );
2385 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
2386 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2388 for(
size_t k=kbegin; k<kend; ++k ) {
2390 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2391 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2392 const SIMDType b1(
set( B(k,j ) ) );
2393 const SIMDType b2(
set( B(k,j+1UL) ) );
2402 (~C).store( i , j , xmm1 );
2403 (~C).store( i+SIMDSIZE , j , xmm2 );
2404 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2405 (~C).store( i , j+1UL, xmm4 );
2406 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
2407 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2420 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2421 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2423 for(
size_t k=kbegin; k<kend; ++k ) {
2424 const SIMDType b1(
set( B(k,j) ) );
2425 xmm1 += A.load(i ,k) * b1;
2426 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2427 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2430 (~C).store( i , j, xmm1 );
2431 (~C).store( i+SIMDSIZE , j, xmm2 );
2432 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2436 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2438 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
2439 size_t j( UPP ? i : 0UL );
2441 for( ; (j+4UL) <= jend; j+=4UL )
2455 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2456 SIMDType xmm3( (~C).load(i ,j+1UL) );
2457 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2458 SIMDType xmm5( (~C).load(i ,j+2UL) );
2459 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2460 SIMDType xmm7( (~C).load(i ,j+3UL) );
2461 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
2463 for(
size_t k=kbegin; k<kend; ++k ) {
2465 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2466 const SIMDType b1(
set( B(k,j ) ) );
2467 const SIMDType b2(
set( B(k,j+1UL) ) );
2468 const SIMDType b3(
set( B(k,j+2UL) ) );
2469 const SIMDType b4(
set( B(k,j+3UL) ) );
2480 (~C).store( i , j , xmm1 );
2481 (~C).store( i+SIMDSIZE, j , xmm2 );
2482 (~C).store( i , j+1UL, xmm3 );
2483 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2484 (~C).store( i , j+2UL, xmm5 );
2485 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2486 (~C).store( i , j+3UL, xmm7 );
2487 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2490 for( ; (j+3UL) <= jend; j+=3UL )
2504 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2505 SIMDType xmm3( (~C).load(i ,j+1UL) );
2506 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2507 SIMDType xmm5( (~C).load(i ,j+2UL) );
2508 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2510 for(
size_t k=kbegin; k<kend; ++k ) {
2512 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2513 const SIMDType b1(
set( B(k,j ) ) );
2514 const SIMDType b2(
set( B(k,j+1UL) ) );
2515 const SIMDType b3(
set( B(k,j+2UL) ) );
2524 (~C).store( i , j , xmm1 );
2525 (~C).store( i+SIMDSIZE, j , xmm2 );
2526 (~C).store( i , j+1UL, xmm3 );
2527 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2528 (~C).store( i , j+2UL, xmm5 );
2529 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns (j, j+1) of the current panel of 2*SIMDSIZE rows are updated
// per iteration. xmm2..xmm4 start from the current contents of C; the partial
// sums of the k-unrolled loop are combined in the final stores.
2532 for( ; (j+2UL) <= jend; j+=2UL )
2546 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2547 SIMDType xmm3( (~C).load(i ,j+1UL) );
2548 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
// Two k-steps per iteration. The bound is inclusive (k and k+1 are both valid
// while k+2 <= kend), matching the other unrolled k-loops of the kernels; with
// a strict '<' a final complete pair would fall through to the remainder loop.
2552 for( ; (k+2UL) <= kend; k+=2UL ) {
2553 const SIMDType a1( A.load(i ,k ) );
2554 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2555 const SIMDType a3( A.load(i ,k+1UL) );
2556 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2557 const SIMDType b1(
set( B(k ,j ) ) );
2558 const SIMDType b2(
set( B(k ,j+1UL) ) );
2559 const SIMDType b3(
set( B(k+1UL,j ) ) );
2560 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remainder loop: handles the last k when the number of k-steps is odd.
2571 for( ; k<kend; ++k ) {
2573 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2574 const SIMDType b1(
set( B(k,j ) ) );
2575 const SIMDType b2(
set( B(k,j+1UL) ) );
// Combine the two partial accumulators of each target position and write back.
2582 (~C).store( i , j , xmm1+xmm5 );
2583 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2584 (~C).store( i , j+1UL, xmm3+xmm7 );
2585 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2598 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2602 for( ; (k+2UL) <= kend; k+=2UL ) {
2603 const SIMDType b1(
set( B(k ,j) ) );
2604 const SIMDType b2(
set( B(k+1UL,j) ) );
2605 xmm1 += A.load(i ,k ) * b1;
2606 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2607 xmm3 += A.load(i ,k+1UL) * b2;
2608 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2611 for( ; k<kend; ++k ) {
2612 const SIMDType b1(
set( B(k,j) ) );
2613 xmm1 += A.load(i ,k) * b1;
2614 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2617 (~C).store( i , j, xmm1+xmm3 );
2618 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2622 for( ; i<ipos; i+=SIMDSIZE )
2624 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
2625 size_t j( UPP ? i : 0UL );
2627 for( ; (j+4UL) <= jend; j+=4UL )
2639 SIMDType xmm2( (~C).load(i,j+1UL) );
2640 SIMDType xmm3( (~C).load(i,j+2UL) );
2641 SIMDType xmm4( (~C).load(i,j+3UL) );
2645 for( ; (k+2UL) <= kend; k+=2UL ) {
2647 const SIMDType a2( A.load(i,k+1UL) );
2648 xmm1 += a1 *
set( B(k ,j ) );
2649 xmm2 += a1 *
set( B(k ,j+1UL) );
2650 xmm3 += a1 *
set( B(k ,j+2UL) );
2651 xmm4 += a1 *
set( B(k ,j+3UL) );
2652 xmm5 += a2 *
set( B(k+1UL,j ) );
2653 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2654 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2655 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2658 for( ; k<kend; ++k ) {
2660 xmm1 += a1 *
set( B(k,j ) );
2661 xmm2 += a1 *
set( B(k,j+1UL) );
2662 xmm3 += a1 *
set( B(k,j+2UL) );
2663 xmm4 += a1 *
set( B(k,j+3UL) );
2666 (~C).store( i, j , xmm1+xmm5 );
2667 (~C).store( i, j+1UL, xmm2+xmm6 );
2668 (~C).store( i, j+2UL, xmm3+xmm7 );
2669 (~C).store( i, j+3UL, xmm4+xmm8 );
2672 for( ; (j+3UL) <= jend; j+=3UL )
2684 SIMDType xmm2( (~C).load(i,j+1UL) );
2685 SIMDType xmm3( (~C).load(i,j+2UL) );
2689 for( ; (k+2UL) <= kend; k+=2UL ) {
2691 const SIMDType a2( A.load(i,k+1UL) );
2692 xmm1 += a1 *
set( B(k ,j ) );
2693 xmm2 += a1 *
set( B(k ,j+1UL) );
2694 xmm3 += a1 *
set( B(k ,j+2UL) );
2695 xmm4 += a2 *
set( B(k+1UL,j ) );
2696 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2697 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2700 for( ; k<kend; ++k ) {
2702 xmm1 += a1 *
set( B(k,j ) );
2703 xmm2 += a1 *
set( B(k,j+1UL) );
2704 xmm3 += a1 *
set( B(k,j+2UL) );
2707 (~C).store( i, j , xmm1+xmm4 );
2708 (~C).store( i, j+1UL, xmm2+xmm5 );
2709 (~C).store( i, j+2UL, xmm3+xmm6 );
2712 for( ; (j+2UL) <= jend; j+=2UL )
2724 SIMDType xmm2( (~C).load(i,j+1UL) );
2728 for( ; (k+2UL) <= kend; k+=2UL ) {
2730 const SIMDType a2( A.load(i,k+1UL) );
2731 xmm1 += a1 *
set( B(k ,j ) );
2732 xmm2 += a1 *
set( B(k ,j+1UL) );
2733 xmm3 += a2 *
set( B(k+1UL,j ) );
2734 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2737 for( ; k<kend; ++k ) {
2739 xmm1 += a1 *
set( B(k,j ) );
2740 xmm2 += a1 *
set( B(k,j+1UL) );
2743 (~C).store( i, j , xmm1+xmm3 );
2744 (~C).store( i, j+1UL, xmm2+xmm4 );
2759 for( ; (k+2UL) <= K; k+=2UL ) {
2760 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2761 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2765 xmm1 += A.load(i,k) *
set( B(k,j) );
2768 (~C).store( i, j, xmm1+xmm2 );
2772 for( ; remainder && i<M; ++i )
2774 const size_t jend( LOW ? i+1UL : N );
2775 size_t j( UPP ? i : 0UL );
2777 for( ; (j+2UL) <= jend; j+=2UL )
2791 for(
size_t k=kbegin; k<kend; ++k ) {
2792 value1 += A(i,k) * B(k,j );
2793 value2 += A(i,k) * B(k,j+1UL);
2796 (~C)(i,j ) = value1;
2797 (~C)(i,j+1UL) = value2;
2810 for(
size_t k=kbegin; k<K; ++k ) {
2811 value += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel, fallback overload.
// It does no work of its own: C, A and B are handed unchanged to the default
// addition-assignment kernel.
2835 template<
typename MT3
2839 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2841 selectDefaultAddAssignKernel( C, A, B );
// Second overload of the large-matrix addition-assignment kernel (same name and
// parameter list as the fallback overload).
// NOTE(review): the body and the selecting condition are not part of this excerpt;
// presumably this is the vectorized variant - confirm against the full file.
2861 template<
typename MT3
2865 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS addition-assignment kernel, fallback overload.
// Forwards C, A and B to the large-matrix addition-assignment kernel.
2891 template<
typename MT3
2895 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2897 selectLargeAddAssignKernel( C, A, B );
// BLAS-backed addition-assignment kernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
2903 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2917 template<
typename MT3
2921 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two paths add a temporary `tmp` to C. How `tmp` is built and which condition
// selects each path is not visible in this excerpt.
2928 addAssign( C, tmp );
2933 addAssign( C, tmp );
// Remaining path: gemm with both scalar arguments equal to one.
// NOTE(review): presumably alpha=1 and beta=1, i.e. C = A*B + C - confirm against
// the gemm wrapper's parameter order.
2936 gemm( C, A, B, ET(1), ET(1) );
// Addition assignment that re-dispatches to addAssign() with one or both operands
// transposed; each product is passed through the ForwardFunctor before it is added.
// NOTE(review): the function signature and the conditions choosing between the
// three variants are not visible in this excerpt.
2958 template<
typename MT >
2969 const ForwardFunctor fwd;
// Variant 1: both operands transposed.
2972 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Variant 2: only the left operand transposed.
2974 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Variant 3: only the right operand transposed.
2976 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Subtraction assignment of the product to a target matrix.
// NOTE(review): the function signature is not visible in this excerpt.
2998 template<
typename MT
// Guard for an empty target or a zero inner dimension (the guarded statement is
// not visible here).
3008 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hand the operands A and B (prepared on lines not shown) to the kernel selection.
3022 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the subtraction-assignment kernel for C -= A*B.
3038 template<
typename MT3
3041 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Visible part of the condition: outside debug mode A has at most SIMDSIZE*10
// rows, or the target has fewer than TDMATTDMATMULT_THRESHOLD elements.
// (The first part of the condition is not part of this excerpt.)
3044 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3045 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
// Condition met: small kernel.
3046 selectSmallSubAssignKernel( C, A, B );
// Otherwise: BLAS kernel selection.
3048 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel: C(i,j) -= A(i,k) * B(k,j).
// Loop order is j (columns of B), then k, then i.
3067 template<
typename MT3
3071 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N product.
3073 const size_t M( A.rows() );
3074 const size_t N( B.columns() );
3075 const size_t K( A.columns() );
3079 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend are computed on lines that are not part of this excerpt.
3089 for(
size_t k=kbegin; k<kend; ++k )
// Tail of the ibegin expression: with LOW set the first row is clamped to at
// least j. (The leading conditions of the expression are not visible.)
3093 ?( LOW ?
max(j,k+1UL) : k+1UL )
3094 :( LOW ?
max(j,k) : k ) )
3095 :( LOW ? j : 0UL ) );
// Tail of the iend expression: with UPP set the last row is clamped to at most
// j+1. (The leading conditions of the expression are not visible.)
3098 ?( UPP ?
min(j+1UL,k) : k )
3099 :( UPP ?
min(j,k)+1UL : k+1UL ) )
3100 :( UPP ? j+1UL : M ) );
// With a triangular target an empty row range contributes nothing for this k.
3102 if( ( LOW || UPP ) && ( ibegin >= iend ) )
continue;
// ipos = ibegin + (row count rounded down to an even number); rows are then
// processed two at a time.
3105 const size_t inum( iend - ibegin );
3106 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3108 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3109 C(i ,j) -= A(i ,k) * B(k,j);
3110 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Row ipos.
// NOTE(review): presumably guarded by a check that an odd row remains; the
// guard is not visible in this excerpt.
3113 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel, overload enabled when MT4 is not
// diagonal and MT5 is diagonal. Only B(j,j) is read from B, so column j of C is
// reduced by column j of A scaled with that diagonal element.
3135 template<
typename MT3
3138 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3139 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3143 const size_t M( A.rows() );
3144 const size_t N( B.columns() );
3146 for(
size_t j=0UL; j<N; ++j )
// ibegin/iend are computed on lines that are not part of this excerpt.
// ipos = ibegin + (row count rounded down to an even number).
3156 const size_t inum( iend - ibegin );
3157 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3159 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3160 C(i ,j) -= A(i ,j) * B(j,j);
3161 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Row ipos. NOTE(review): presumably guarded by an "odd row left" check that is
// not visible here.
3164 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel overload that reads only the diagonal of
// A: row i of C is reduced by row i of B scaled with A(i,i).
// NOTE(review): the enabling condition is not visible in this excerpt; presumably
// this is the "A diagonal, B general" case - confirm.
3185 template<
typename MT3
3189 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3193 const size_t M( A.rows() );
3194 const size_t N( B.columns() );
3196 for(
size_t j=0UL; j<N; ++j )
// ibegin/iend are computed on lines that are not part of this excerpt.
// ipos = ibegin + (row count rounded down to an even number).
3206 const size_t inum( iend - ibegin );
3207 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3209 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3210 C(i ,j) -= A(i ,i ) * B(i ,j);
3211 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Row ipos. NOTE(review): presumably guarded by an "odd row left" check that is
// not visible here.
3214 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel overload that touches only diagonal
// elements: C(i,i) -= A(i,i) * B(i,i) for every row of A.
// NOTE(review): the enabling condition is not visible; presumably both operands
// are diagonal - confirm.
3235 template<
typename MT3
3239 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3243 for(
size_t i=0UL; i<A.rows(); ++i ) {
3244 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, fallback overload.
// Forwards C, A and B to the default subtraction-assignment kernel.
3264 template<
typename MT3
3268 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3270 selectDefaultSubAssignKernel( C, A, B );
// Subtraction-assignment overload that replaces one operand by a temporary `tmp`
// and re-dispatches to subAssign().
// NOTE(review): the signature, the construction of `tmp` and the first two
// conditions are not visible in this excerpt.
3290 template<
typename MT3
3301 const ForwardFunctor fwd;
// `tmp` stands in for B here ...
3305 subAssign( ~C, fwd( A * tmp ) );
// ... and for A here.
3309 subAssign( ~C, fwd( tmp * B ) );
// Size-based choice: when B has no more elements than A, the temporary replaces
// B; otherwise it replaces A.
3311 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3313 subAssign( ~C, fwd( A * tmp ) );
3317 subAssign( ~C, fwd( tmp * B ) );
// Prologue of the vectorized subtraction-assignment kernel.
// NOTE(review): the function signature is not visible in this excerpt.
3338 template<
typename MT3
// M x K times K x N product.
3346 const size_t M( A.rows() );
3347 const size_t N( B.columns() );
3348 const size_t K( A.columns() );
// Last row index covered by the SIMD loops: M rounded down to a multiple of
// SIMDSIZE when `remainder` is set (the mask relies on SIMDSIZE being a power of
// two), otherwise M itself. `remainder` is defined on a line not shown here.
3352 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Tier 1: eight SIMD vectors (SIMDSIZE*8 rows) per step, one column of C at a
// time. Only taken when neither LOW nor UPP is set.
3359 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3360 for(
size_t j=0UL; j<N; ++j )
// Accumulators start from the current contents of C (the load of xmm1 is on a
// line not shown in this excerpt).
3374 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3375 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3376 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3377 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3378 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3379 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3380 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
// B(k,j) is broadcast once per k and multiplied with eight row segments of A.
3382 for(
size_t k=kbegin; k<kend; ++k ) {
3383 const SIMDType b1(
set( B(k,j) ) );
3384 xmm1 -= A.load(i ,k) * b1;
3385 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3386 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3387 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3388 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3389 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3390 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3391 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
// Write the updated column segment back to C.
3394 (~C).store( i , j, xmm1 );
3395 (~C).store( i+SIMDSIZE , j, xmm2 );
3396 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3397 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3398 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3399 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3400 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3401 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Tier 2: five SIMD vectors (SIMDSIZE*5 rows) per step. Only taken when neither
// LOW nor UPP is set.
3406 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Columns in pairs: xmm1..xmm5 hold column j, xmm6..xmm10 hold column j+1.
3410 for( ; (j+2UL) <= N; j+=2UL )
3423 SIMDType xmm1 ( (~C).load(i ,j ) );
3424 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3425 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3426 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3427 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3428 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3429 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3430 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3431 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3432 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
// Row segments of A and the two broadcast B values for this k. The load of a1
// and the accumulator updates are on lines not shown in this excerpt.
3434 for(
size_t k=kbegin; k<kend; ++k ) {
3436 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3437 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3438 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3439 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3440 const SIMDType b1(
set( B(k,j ) ) );
3441 const SIMDType b2(
set( B(k,j+1UL) ) );
3454 (~C).store( i , j , xmm1 );
3455 (~C).store( i+SIMDSIZE , j , xmm2 );
3456 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3457 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3458 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3459 (~C).store( i , j+1UL, xmm6 );
3460 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3461 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3462 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3463 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column variant of the same tier (its enclosing condition and the load
// of xmm1 are not visible here).
3476 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3477 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3478 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3479 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3481 for(
size_t k=kbegin; k<kend; ++k ) {
3482 const SIMDType b1(
set( B(k,j) ) );
3483 xmm1 -= A.load(i ,k) * b1;
3484 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3485 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3486 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3487 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3490 (~C).store( i , j, xmm1 );
3491 (~C).store( i+SIMDSIZE , j, xmm2 );
3492 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3493 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3494 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Tier 3: four SIMD vectors (SIMDSIZE*4 rows) per step. Only taken when neither
// LOW nor UPP is set.
3498 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Columns in pairs: xmm1..xmm4 hold column j, xmm5..xmm8 hold column j+1
// (the load of xmm1 is on a line not shown in this excerpt).
3502 for( ; (j+2UL) <= N; j+=2UL )
3516 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3517 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3518 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3519 SIMDType xmm5( (~C).load(i ,j+1UL) );
3520 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3521 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3522 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
// Row segments of A and the two broadcast B values for this k. The load of a1
// and the accumulator updates are not visible here.
3524 for(
size_t k=kbegin; k<kend; ++k ) {
3526 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3527 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3528 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3529 const SIMDType b1(
set( B(k,j ) ) );
3530 const SIMDType b2(
set( B(k,j+1UL) ) );
3541 (~C).store( i , j , xmm1 );
3542 (~C).store( i+SIMDSIZE , j , xmm2 );
3543 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3544 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3545 (~C).store( i , j+1UL, xmm5 );
3546 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3547 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3548 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of the same tier (enclosing condition not visible).
3561 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3562 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3563 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3565 for(
size_t k=kbegin; k<kend; ++k ) {
3566 const SIMDType b1(
set( B(k,j) ) );
3567 xmm1 -= A.load(i ,k) * b1;
3568 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3569 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3570 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3573 (~C).store( i , j, xmm1 );
3574 (~C).store( i+SIMDSIZE , j, xmm2 );
3575 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3576 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Tier 4: three SIMD vectors (SIMDSIZE*3 rows) per step. Only taken when neither
// LOW nor UPP is set.
3580 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
// Columns in pairs: xmm1..xmm3 hold column j, xmm4..xmm6 hold column j+1
// (the load of xmm1 is on a line not shown in this excerpt).
3584 for( ; (j+2UL) <= N; j+=2UL )
3598 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3599 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3600 SIMDType xmm4( (~C).load(i ,j+1UL) );
3601 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3602 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
// Row segments of A and the two broadcast B values for this k. The load of a1
// and the accumulator updates are not visible here.
3604 for(
size_t k=kbegin; k<kend; ++k ) {
3606 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3607 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3608 const SIMDType b1(
set( B(k,j ) ) );
3609 const SIMDType b2(
set( B(k,j+1UL) ) );
3618 (~C).store( i , j , xmm1 );
3619 (~C).store( i+SIMDSIZE , j , xmm2 );
3620 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3621 (~C).store( i , j+1UL, xmm4 );
3622 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3623 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column variant of the same tier (enclosing condition not visible).
3636 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3637 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3639 for(
size_t k=kbegin; k<kend; ++k ) {
3640 const SIMDType b1(
set( B(k,j) ) );
3641 xmm1 -= A.load(i ,k) * b1;
3642 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3643 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3646 (~C).store( i , j, xmm1 );
3647 (~C).store( i+SIMDSIZE , j, xmm2 );
3648 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Tier 5: two SIMD vectors (SIMDSIZE*2 rows) per step. Skipped only when LOW and
// UPP are both set.
3652 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range for this row block: with LOW it ends at min(i+SIMDSIZE*2, N),
// with UPP it starts at column i.
3654 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
3655 size_t j( UPP ? i : 0UL );
// Four columns at a time (the load of xmm1, the load of a1 and the accumulator
// updates are on lines not shown in this excerpt).
3657 for( ; (j+4UL) <= jend; j+=4UL )
3671 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3672 SIMDType xmm3( (~C).load(i ,j+1UL) );
3673 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3674 SIMDType xmm5( (~C).load(i ,j+2UL) );
3675 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3676 SIMDType xmm7( (~C).load(i ,j+3UL) );
3677 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
3679 for(
size_t k=kbegin; k<kend; ++k ) {
3681 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3682 const SIMDType b1(
set( B(k,j ) ) );
3683 const SIMDType b2(
set( B(k,j+1UL) ) );
3684 const SIMDType b3(
set( B(k,j+2UL) ) );
3685 const SIMDType b4(
set( B(k,j+3UL) ) );
3696 (~C).store( i , j , xmm1 );
3697 (~C).store( i+SIMDSIZE, j , xmm2 );
3698 (~C).store( i , j+1UL, xmm3 );
3699 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3700 (~C).store( i , j+2UL, xmm5 );
3701 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3702 (~C).store( i , j+3UL, xmm7 );
3703 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
// Three columns at a time.
3706 for( ; (j+3UL) <= jend; j+=3UL )
3720 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3721 SIMDType xmm3( (~C).load(i ,j+1UL) );
3722 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3723 SIMDType xmm5( (~C).load(i ,j+2UL) );
3724 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3726 for(
size_t k=kbegin; k<kend; ++k ) {
3728 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3729 const SIMDType b1(
set( B(k,j ) ) );
3730 const SIMDType b2(
set( B(k,j+1UL) ) );
3731 const SIMDType b3(
set( B(k,j+2UL) ) );
3740 (~C).store( i , j , xmm1 );
3741 (~C).store( i+SIMDSIZE, j , xmm2 );
3742 (~C).store( i , j+1UL, xmm3 );
3743 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3744 (~C).store( i , j+2UL, xmm5 );
3745 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns at a time. Here k is additionally unrolled by two: the even/odd k
// contributions go to separate accumulators (xmm1..4 and xmm5..8) that are
// summed at the store. The declarations of xmm5..xmm8 are not visible here.
3748 for( ; (j+2UL) <= jend; j+=2UL )
3762 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3763 SIMDType xmm3( (~C).load(i ,j+1UL) );
3764 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3768 for( ; (k+2UL) <= kend; k+=2UL ) {
3769 const SIMDType a1( A.load(i ,k ) );
3770 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
3771 const SIMDType a3( A.load(i ,k+1UL) );
3772 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
3773 const SIMDType b1(
set( B(k ,j ) ) );
3774 const SIMDType b2(
set( B(k ,j+1UL) ) );
3775 const SIMDType b3(
set( B(k+1UL,j ) ) );
3776 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k when the k range has odd length.
3787 for( ; k<kend; ++k ) {
3789 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3790 const SIMDType b1(
set( B(k,j ) ) );
3791 const SIMDType b2(
set( B(k,j+1UL) ) );
3798 (~C).store( i , j , xmm1+xmm5 );
3799 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
3800 (~C).store( i , j+1UL, xmm3+xmm7 );
3801 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single remaining column, again with k unrolled by two (enclosing condition
// and the declarations of xmm1, xmm3, xmm4 are not visible here).
3814 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3818 for( ; (k+2UL) <= kend; k+=2UL ) {
3819 const SIMDType b1(
set( B(k ,j) ) );
3820 const SIMDType b2(
set( B(k+1UL,j) ) );
3821 xmm1 -= A.load(i ,k ) * b1;
3822 xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
3823 xmm3 -= A.load(i ,k+1UL) * b2;
3824 xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
3827 for( ; k<kend; ++k ) {
3828 const SIMDType b1(
set( B(k,j) ) );
3829 xmm1 -= A.load(i ,k) * b1;
3830 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3833 (~C).store( i , j, xmm1+xmm3 );
3834 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// Tier 6: one SIMD vector (SIMDSIZE rows) per step, covering whatever rows the
// wider tiers left below ipos.
3838 for( ; i<ipos; i+=SIMDSIZE )
// Column range: limited to min(i+SIMDSIZE, N) only when LOW and UPP are both
// set; with UPP it starts at column i.
3840 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
3841 size_t j( UPP ? i : 0UL );
// Four columns at a time, k unrolled by two. Even-k products go to xmm1..xmm4,
// odd-k products to xmm5..xmm8; the pairs are summed at the store. The
// declarations of xmm1, xmm5..xmm8 and the loads of a1 are not visible here.
3843 for( ; (j+4UL) <= jend; j+=4UL )
3855 SIMDType xmm2( (~C).load(i,j+1UL) );
3856 SIMDType xmm3( (~C).load(i,j+2UL) );
3857 SIMDType xmm4( (~C).load(i,j+3UL) );
3861 for( ; (k+2UL) <= kend; k+=2UL ) {
3863 const SIMDType a2( A.load(i,k+1UL) );
3864 xmm1 -= a1 *
set( B(k ,j ) );
3865 xmm2 -= a1 *
set( B(k ,j+1UL) );
3866 xmm3 -= a1 *
set( B(k ,j+2UL) );
3867 xmm4 -= a1 *
set( B(k ,j+3UL) );
3868 xmm5 -= a2 *
set( B(k+1UL,j ) );
3869 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
3870 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
3871 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
// Leftover k when the k range has odd length.
3874 for( ; k<kend; ++k ) {
3876 xmm1 -= a1 *
set( B(k,j ) );
3877 xmm2 -= a1 *
set( B(k,j+1UL) );
3878 xmm3 -= a1 *
set( B(k,j+2UL) );
3879 xmm4 -= a1 *
set( B(k,j+3UL) );
3882 (~C).store( i, j , xmm1+xmm5 );
3883 (~C).store( i, j+1UL, xmm2+xmm6 );
3884 (~C).store( i, j+2UL, xmm3+xmm7 );
3885 (~C).store( i, j+3UL, xmm4+xmm8 );
// Three columns at a time, same even/odd k split (xmm1..3 / xmm4..6).
3888 for( ; (j+3UL) <= jend; j+=3UL )
3900 SIMDType xmm2( (~C).load(i,j+1UL) );
3901 SIMDType xmm3( (~C).load(i,j+2UL) );
3905 for( ; (k+2UL) <= kend; k+=2UL ) {
3907 const SIMDType a2( A.load(i,k+1UL) );
3908 xmm1 -= a1 *
set( B(k ,j ) );
3909 xmm2 -= a1 *
set( B(k ,j+1UL) );
3910 xmm3 -= a1 *
set( B(k ,j+2UL) );
3911 xmm4 -= a2 *
set( B(k+1UL,j ) );
3912 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
3913 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
3916 for( ; k<kend; ++k ) {
3918 xmm1 -= a1 *
set( B(k,j ) );
3919 xmm2 -= a1 *
set( B(k,j+1UL) );
3920 xmm3 -= a1 *
set( B(k,j+2UL) );
3923 (~C).store( i, j , xmm1+xmm4 );
3924 (~C).store( i, j+1UL, xmm2+xmm5 );
3925 (~C).store( i, j+2UL, xmm3+xmm6 );
// Two columns at a time, same even/odd k split (xmm1..2 / xmm3..4).
3928 for( ; (j+2UL) <= jend; j+=2UL )
3940 SIMDType xmm2( (~C).load(i,j+1UL) );
3944 for( ; (k+2UL) <= kend; k+=2UL ) {
3946 const SIMDType a2( A.load(i,k+1UL) );
3947 xmm1 -= a1 *
set( B(k ,j ) );
3948 xmm2 -= a1 *
set( B(k ,j+1UL) );
3949 xmm3 -= a2 *
set( B(k+1UL,j ) );
3950 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
3953 for( ; k<kend; ++k ) {
3955 xmm1 -= a1 *
set( B(k,j ) );
3956 xmm2 -= a1 *
set( B(k,j+1UL) );
3959 (~C).store( i, j , xmm1+xmm3 );
3960 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single remaining column.
// NOTE(review): this k loop is bounded by K while the sibling loops use kend;
// the definitions for this column are not visible here - confirm they agree.
3975 for( ; (k+2UL) <= K; k+=2UL ) {
3976 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
3977 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
3981 xmm1 -= A.load(i,k) *
set( B(k,j) );
3984 (~C).store( i, j, xmm1+xmm2 );
// Scalar clean-up: rows in [ipos, M) that did not fill a SIMD vector. Only runs
// when `remainder` is set.
3988 for( ; remainder && i<M; ++i )
// Column range: with LOW it ends after the diagonal (i+1), with UPP it starts at
// the diagonal (column i).
3990 const size_t jend( LOW ? i+1UL : N );
3991 size_t j( UPP ? i : 0UL );
// Two columns per iteration; value1/value2 are declared on lines not shown here
// and are written back to C after the k loop.
3993 for( ; (j+2UL) <= jend; j+=2UL )
4007 for(
size_t k=kbegin; k<kend; ++k ) {
4008 value1 -= A(i,k) * B(k,j );
4009 value2 -= A(i,k) * B(k,j+1UL);
4012 (~C)(i,j ) = value1;
4013 (~C)(i,j+1UL) = value2;
// Single remaining column; this loop runs up to K.
4026 for(
size_t k=kbegin; k<K; ++k ) {
4027 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel, fallback overload.
// Forwards C, A and B to the default subtraction-assignment kernel.
4051 template<
typename MT3
4055 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4057 selectDefaultSubAssignKernel( C, A, B );
// Second overload of the large-matrix subtraction-assignment kernel (same name
// and parameter list as the fallback overload).
// NOTE(review): the body and the selecting condition are not part of this excerpt;
// presumably this is the vectorized variant - confirm against the full file.
4077 template<
typename MT3
4081 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS subtraction-assignment kernel, fallback overload.
// Forwards C, A and B to the large-matrix subtraction-assignment kernel.
4107 template<
typename MT3
4111 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4113 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed subtraction-assignment kernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
4119 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4133 template<
typename MT3
4137 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Two paths subtract a temporary `tmp` from C. How `tmp` is built and which
// condition selects each path is not visible in this excerpt.
4144 subAssign( C, tmp );
4149 subAssign( C, tmp );
// Remaining path: gemm with scalar arguments -1 and 1.
// NOTE(review): presumably alpha=-1 and beta=1, i.e. C = C - A*B - confirm against
// the gemm wrapper's parameter order.
4152 gemm( C, A, B, ET(-1), ET(1) );
// Subtraction assignment that re-dispatches to subAssign() with one or both
// operands transposed; each product is passed through the ForwardFunctor first.
// NOTE(review): the function signature and the conditions choosing between the
// three variants are not visible in this excerpt.
4175 template<
typename MT >
4186 const ForwardFunctor fwd;
// Variant 1: both operands transposed.
4189 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Variant 2: only the left operand transposed.
4191 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Variant 3: only the right operand transposed.
4193 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Schur-product assignment: a temporary `tmp` is passed to schurAssign() together
// with the target.
// NOTE(review): the signature and the construction of `tmp` are not visible in
// this excerpt; presumably `tmp` is the evaluated product - confirm.
4215 template<
typename MT
4229 schurAssign( ~lhs, tmp );
4258 template<
typename MT
4268 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4271 else if( rhs.lhs_.columns() == 0UL ) {
4307 template<
typename MT
4326 const ForwardFunctor fwd;
4328 const TmpType tmp( rhs );
4349 template<
typename MT >
4360 const ForwardFunctor fwd;
4388 template<
typename MT
4398 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4433 template<
typename MT >
4444 const ForwardFunctor fwd;
4476 template<
typename MT
4486 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4521 template<
typename MT >
4532 const ForwardFunctor fwd;
4562 template<
typename MT
// Specialization of DMatScalarMultExpr for a scaled TDMatTDMatMultExpr (the
// product expression multiplied by a scalar of type ST); it derives from
// MatScalarMultExpr.
4622 template<
typename MT1
4630 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
// Effective structure flags derived from the SF/HF/LF/UF template flags:
//  - SYM  : SF is set and none of HF, LF, UF is set
//  - HERM : HF is set and neither LF nor UF is set
//  - LOW  : LF is set, or UF is set together with SF or HF
//  - UPP  : UF is set, or LF is set together with SF or HF
4660 SYM = ( SF && !( HF || LF || UF ) ),
4661 HERM = ( HF && !( LF || UF ) ),
4662 LOW = ( LF || ( ( SF || HF ) && UF ) ),
4663 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Compile-time helper over three types T1, T2, T3. Its `value` is consumed by
// IsEvaluationRequired.
// NOTE(review): the body is not part of this excerpt, so the exact condition
// cannot be documented here.
4674 template<
typename T1,
typename T2,
typename T3 >
4675 struct CanExploitSymmetry {
// Compile-time helper: `value` is true when at least one operand needs an
// intermediate evaluation (evaluateLeft or evaluateRight) AND symmetry cannot be
// exploited for the given type triple (CanExploitSymmetry is false).
4686 template<
typename T1,
typename T2,
typename T3 >
4687 struct IsEvaluationRequired {
4688 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
4689 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time helper over four types deciding whether a BLAS kernel may be used.
// Visible requirements: BLAS mode and BLAS matrix/matrix multiplication are
// enabled, and T1, T2 and T3 are all SIMD-enabled. Further conditions exist on
// lines that are not part of this excerpt.
4697 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4698 struct UseBlasKernel {
4699 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4705 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper over four types deciding whether the vectorized default
// kernel may be used. Visible requirements: useOptimizedKernels is set, and T1,
// T2 and T3 are all SIMD-enabled. Further conditions exist on lines that are not
// part of this excerpt.
4719 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4720 struct UseVectorizedDefaultKernel {
4721 enum :
bool { value = useOptimizedKernels &&
4723 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Fragment of a compile-time flag that requires both operand types to be
// SIMD-enabled (the flag's name and remaining terms are not visible here).
4777 MT1::simdEnabled && MT2::simdEnabled &&
// The expression is SMP-assignable only if neither operand needs an intermediate
// evaluation and both operand types are themselves SMP-assignable.
4783 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4784 !evaluateRight && MT2::smpAssignable };
4814 return matrix_(i,j) * scalar_;
4827 if( i >= matrix_.rows() ) {
4830 if( j >= matrix_.columns() ) {
4833 return (*
this)(i,j);
// Returns the number of rows, taken from the wrapped matrix expression.
4842 inline size_t rows()
const {
4843 return matrix_.rows();
// Returns the number of columns, taken from the wrapped matrix expression.
4852 inline size_t columns()
const {
4853 return matrix_.columns();
// Reports whether the expression can alias with the object at address `alias`.
// The query is delegated unchanged to the wrapped matrix expression.
4883 template<
typename T >
4884 inline bool canAlias(
const T* alias )
const {
4885 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the object at address `alias`.
// The query is delegated unchanged to the wrapped matrix expression.
4895 template<
typename T >
4896 inline bool isAliased(
const T* alias )
const {
4897 return matrix_.isAliased( alias );
4907 return matrix_.isAligned();
4918 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4920 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4921 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
// Assignment of the scaled product to a target matrix.
// NOTE(review): the function signature is not visible in this excerpt.
4943 template<
typename MT
// Empty target: guarded branch (its statement is not visible here).
4956 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Zero inner dimension is handled separately (statement not visible here).
4959 else if( left.columns() == 0UL ) {
// Hand the operands A and B (prepared on lines not shown) plus the scalar to the
// kernel selection.
4974 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses the assignment kernel for C = scalar * A * B.
4989 template<
typename MT3
4993 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Visible part of the condition: outside debug mode A has at most SIMDSIZE*10
// rows, or the target has fewer than TDMATTDMATMULT_THRESHOLD elements.
// (The first part of the condition is not part of this excerpt.)
4996 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
4997 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
// Condition met: small kernel.
4998 selectSmallAssignKernel( C, A, B, scalar );
// Otherwise: BLAS kernel selection.
5000 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar) assignment kernel for C = scalar * A * B, processed column by
// column. For each column j the visible phases are: initialise with the k=kbegin
// product, accumulate the remaining k, then a final pass over the row range
// (its statement is not visible here; presumably the scaling by `scalar`).
5018 template<
typename MT3
5023 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M x K times K x N product.
5025 const size_t M( A.rows() );
5026 const size_t N( B.columns() );
5027 const size_t K( A.columns() );
5031 for(
size_t j=0UL; j<N; ++j )
// Whole-column loop whose body and guarding condition are not visible here.
5042 for(
size_t i=0UL; i<M; ++i ) {
// Phase 1 row range (tails of the ibegin/iend expressions): LOW clamps the
// start to at least j, UPP clamps the end to at most j+1.
5051 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
5052 :( LOW ?
max(j,kbegin) : kbegin ) )
5053 :( LOW ? j : 0UL ) );
5056 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
5057 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
5058 :( UPP ? j+1UL : M ) );
// Rows before ibegin (loop body not visible here).
5061 for(
size_t i=0UL; i<ibegin; ++i ) {
// Rows in [ibegin, iend) are initialised with the first product term.
5068 for(
size_t i=ibegin; i<iend; ++i ) {
5069 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on (loop body not visible here).
5072 for(
size_t i=iend; i<M; ++i ) {
// Resets the last row's element of this column (guarding condition not visible).
5077 reset( C(M-1UL,j) );
// Phase 2: accumulate the remaining k terms.
5081 for(
size_t k=kbegin+1UL; k<kend; ++k )
// Row range for this k: SYM, HERM or LOW clamp the start to at least j, UPP
// clamps the end to at most j+1.
5085 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
5086 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
5087 :( SYM || HERM || LOW ? j : 0UL ) );
5090 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
5091 :( UPP ?
min(j+1UL,k) : k ) )
5092 :( UPP ? j+1UL : M ) );
// Nothing to do for this k when the structured row range is inverted.
5094 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5097 for(
size_t i=ibegin; i<iend; ++i ) {
5098 C(i,j) += A(i,k) * B(k,j);
// Row iend is assigned rather than accumulated (guarding condition not visible
// in this excerpt).
5101 C(iend,j) = A(iend,k) * B(k,j);
// Phase 3 row range (tails of the ibegin/iend expressions).
5108 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5111 :( UPP ? j+1UL : M ) );
5113 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
// Final pass over the row range (loop body not visible here).
5116 for(
size_t i=ibegin; i<iend; ++i ) {
// Mirror step: every element above the diagonal is filled from its transposed
// counterpart, conjugated when HERM is set (guarding condition not visible).
5123 for(
size_t j=1UL; j<N; ++j ) {
5124 for(
size_t i=0UL; i<j; ++i ) {
5125 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel, overload enabled when MT4 is not diagonal and MT5
// is diagonal: C(i,j) = A(i,j) * B(j,j) * scalar inside the row range of each
// column.
5146 template<
typename MT3
5150 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
5151 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5155 const size_t M( A.rows() );
5156 const size_t N( B.columns() );
5158 for(
size_t j=0UL; j<N; ++j )
// ibegin/iend are computed on lines that are not part of this excerpt.
// Rows before ibegin (loop body not visible here).
5169 for(
size_t i=0UL; i<ibegin; ++i ) {
5173 for(
size_t i=ibegin; i<iend; ++i ) {
5174 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend on (loop body not visible here).
5177 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel overload that reads only the diagonal of A:
// C(i,j) = A(i,i) * B(i,j) * scalar inside the row range of each column.
// NOTE(review): the enabling condition is not visible in this excerpt; presumably
// this is the "A diagonal, B general" case - confirm.
5199 template<
typename MT3
5204 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5208 const size_t M( A.rows() );
5209 const size_t N( B.columns() );
5211 for(
size_t j=0UL; j<N; ++j )
// ibegin/iend are computed on lines that are not part of this excerpt.
// Rows before ibegin (loop body not visible here).
5222 for(
size_t i=0UL; i<ibegin; ++i ) {
5226 for(
size_t i=ibegin; i<iend; ++i ) {
5227 C(i,j) = A(i,i) * B(i,j) * scalar;
// Rows from iend on (loop body not visible here).
5230 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel overload that writes only diagonal elements:
// C(i,i) = A(i,i) * B(i,i) * scalar for every row of A.
// NOTE(review): the enabling condition and any preceding reset of C are not
// visible; presumably both operands are diagonal - confirm.
5252 template<
typename MT3
5257 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5263 for(
size_t i=0UL; i<A.rows(); ++i ) {
5264 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel for the scaled product, fallback overload.
// Forwards C, A, B and the scalar to the default assignment kernel.
5283 template<
typename MT3
5288 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5290 selectDefaultAssignKernel( C, A, B, scalar );
// Assignment overload that replaces one operand by a temporary `tmp` and
// re-dispatches to assign() with the scaled product.
// NOTE(review): the signature, the construction of `tmp` and the first two
// conditions are not visible in this excerpt.
5309 template<
typename MT3
5321 const ForwardFunctor fwd;
// `tmp` stands in for B here ...
5325 assign( ~C, fwd( A * tmp ) * scalar );
// ... and for A here.
5329 assign( ~C, fwd( tmp * B ) * scalar );
// Size-based choice: when B has no more elements than A, the temporary replaces
// B; otherwise it replaces A.
5331 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5333 assign( ~C, fwd( A * tmp ) * scalar );
5337 assign( ~C, fwd( tmp * B ) * scalar );
// Prologue of the vectorized assignment kernel for the scaled product.
// NOTE(review): the function signature is not visible in this excerpt.
5357 template<
typename MT3
// M x K times K x N product.
5366 const size_t M( A.rows() );
5367 const size_t N( B.columns() );
5368 const size_t K( A.columns() );
// Last row index covered by the SIMD loops: M rounded down to a multiple of
// SIMDSIZE when `remainder` is set (the mask relies on SIMDSIZE being a power of
// two), otherwise M itself. `remainder` is defined on a line not shown here.
5372 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// The scalar broadcast into a SIMD vector; applied once at every store.
5375 const SIMDType factor(
set( scalar ) );
// Special case for LOW and UPP both set with more than SIMDSIZE*3 rows (the
// guarded statement is not visible in this excerpt).
5377 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Tier 1: eight SIMD vectors (SIMDSIZE*8 rows) per step, one column at a time.
// Only taken when none of SYM, HERM, LOW, UPP is set.
5386 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5387 for(
size_t j=0UL; j<N; ++j )
// Fresh accumulators for this column segment.
5400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// B(k,j) is broadcast once per k and multiplied with eight row segments of A.
5402 for(
size_t k=kbegin; k<kend; ++k ) {
5403 const SIMDType b1(
set( B(k,j) ) );
5404 xmm1 += A.load(i ,k) * b1;
5405 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5406 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5407 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5408 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5409 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5410 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5411 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Scale and write the column segment to C.
5414 (~C).store( i , j, xmm1 * factor );
5415 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5416 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5417 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5418 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5419 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5420 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5421 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
// Tier 2: five SIMD vectors (SIMDSIZE*5 rows) per step. Only taken when none of
// SYM, HERM, LOW, UPP is set.
5426 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Columns in pairs: xmm1..xmm5 are stored to column j, xmm6..xmm10 to column j+1.
5430 for( ; (j+2UL) <= N; j+=2UL )
5443 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
// Row segments of A and the two broadcast B values for this k. The load of a1
// and the accumulator updates are on lines not shown in this excerpt.
5445 for(
size_t k=kbegin; k<kend; ++k ) {
5447 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5448 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5449 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5450 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5451 const SIMDType b1(
set( B(k,j ) ) );
5452 const SIMDType b2(
set( B(k,j+1UL) ) );
5465 (~C).store( i , j , xmm1 * factor );
5466 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5467 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5468 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5469 (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5470 (~C).store( i , j+1UL, xmm6 * factor );
5471 (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5472 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5473 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5474 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
// Single-column variant of the same tier (enclosing condition not visible).
5486 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5488 for(
size_t k=kbegin; k<kend; ++k ) {
5489 const SIMDType b1(
set( B(k,j) ) );
5490 xmm1 += A.load(i ,k) * b1;
5491 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5492 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5493 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5494 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5497 (~C).store( i , j, xmm1 * factor );
5498 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5499 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5500 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5501 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
// Tier 3: four SIMD vectors (SIMDSIZE*4 rows) per step. Skipped only when LOW
// and UPP are both set.
5505 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Column range for this row block: with SYM, HERM or LOW it ends at
// min(i+SIMDSIZE*4, N); with UPP it starts at column i.
5507 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
5508 size_t j( UPP ? i : 0UL );
// Columns in pairs: xmm1..xmm4 are stored to column j, xmm5..xmm8 to column j+1.
5510 for( ; (j+2UL) <= jend; j+=2UL )
5523 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The load of a1 and the accumulator updates are not visible in this excerpt.
5525 for(
size_t k=kbegin; k<kend; ++k ) {
5527 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5528 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5529 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5530 const SIMDType b1(
set( B(k,j ) ) );
5531 const SIMDType b2(
set( B(k,j+1UL) ) );
5542 (~C).store( i , j , xmm1 * factor );
5543 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5544 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5545 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5546 (~C).store( i , j+1UL, xmm5 * factor );
5547 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5548 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5549 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
// Single-column variant of the same tier (enclosing condition and accumulator
// declarations not visible here).
5563 for(
size_t k=kbegin; k<kend; ++k ) {
5564 const SIMDType b1(
set( B(k,j) ) );
5565 xmm1 += A.load(i ,k) * b1;
5566 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5567 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5568 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5571 (~C).store( i , j, xmm1 * factor );
5572 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5573 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5574 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
// Tier 4: three SIMD vectors (SIMDSIZE*3 rows) per step.
5578 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
// Column range for this row block: with SYM, HERM or LOW it ends at
// min(i+SIMDSIZE*3, N); with UPP it starts at column i.
5580 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
5581 size_t j( UPP ? i : 0UL );
// Columns in pairs: xmm1..xmm3 are stored to column j, xmm4..xmm6 to column j+1.
5583 for( ; (j+2UL) <= jend; j+=2UL )
5596 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
// The load of a1 and the accumulator updates are not visible in this excerpt.
5598 for(
size_t k=kbegin; k<kend; ++k ) {
5600 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5601 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5602 const SIMDType b1(
set( B(k,j ) ) );
5603 const SIMDType b2(
set( B(k,j+1UL) ) );
5612 (~C).store( i , j , xmm1 * factor );
5613 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5614 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5615 (~C).store( i , j+1UL, xmm4 * factor );
5616 (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5617 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
// Single-column variant of the same tier (enclosing condition and accumulator
// declarations not visible here).
5631 for(
size_t k=kbegin; k<kend; ++k ) {
5632 const SIMDType b1(
set( B(k,j) ) );
5633 xmm1 += A.load(i ,k) * b1;
5634 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5635 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5638 (~C).store( i , j, xmm1 * factor );
5639 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5640 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
// Tier 5: two SIMD vectors (SIMDSIZE*2 rows) per step.
5644 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range for this row block: with SYM, HERM or LOW it ends at
// min(i+SIMDSIZE*2, N); with UPP it starts at column i.
5646 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
5647 size_t j( UPP ? i : 0UL );
// Four columns at a time (the load of a1 and the accumulator updates are on
// lines not shown in this excerpt).
5649 for( ; (j+4UL) <= jend; j+=4UL )
5662 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5664 for(
size_t k=kbegin; k<kend; ++k ) {
5666 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5667 const SIMDType b1(
set( B(k,j ) ) );
5668 const SIMDType b2(
set( B(k,j+1UL) ) );
5669 const SIMDType b3(
set( B(k,j+2UL) ) );
5670 const SIMDType b4(
set( B(k,j+3UL) ) );
5681 (~C).store( i , j , xmm1 * factor );
5682 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5683 (~C).store( i , j+1UL, xmm3 * factor );
5684 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5685 (~C).store( i , j+2UL, xmm5 * factor );
5686 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5687 (~C).store( i , j+3UL, xmm7 * factor );
5688 (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
// Three columns at a time.
5691 for( ; (j+3UL) <= jend; j+=3UL )
5704 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5706 for(
size_t k=kbegin; k<kend; ++k ) {
5708 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5709 const SIMDType b1(
set( B(k,j ) ) );
5710 const SIMDType b2(
set( B(k,j+1UL) ) );
5711 const SIMDType b3(
set( B(k,j+2UL) ) );
5720 (~C).store( i , j , xmm1 * factor );
5721 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5722 (~C).store( i , j+1UL, xmm3 * factor );
5723 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5724 (~C).store( i , j+2UL, xmm5 * factor );
5725 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
// Two columns at a time. Here k is additionally unrolled by two: even-k and
// odd-k contributions use separate accumulators (xmm1..4 and xmm5..8) that are
// summed before scaling at the store.
5728 for( ; (j+2UL) <= jend; j+=2UL )
5741 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5744 for( ; (k+2UL) <= kend; k+=2UL ) {
5745 const SIMDType a1( A.load(i ,k ) );
5746 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
5747 const SIMDType a3( A.load(i ,k+1UL) );
5748 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
5749 const SIMDType b1(
set( B(k ,j ) ) );
5750 const SIMDType b2(
set( B(k ,j+1UL) ) );
5751 const SIMDType b3(
set( B(k+1UL,j ) ) );
5752 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k when the k range has odd length.
5763 for( ; k<kend; ++k ) {
5765 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5766 const SIMDType b1(
set( B(k,j ) ) );
5767 const SIMDType b2(
set( B(k,j+1UL) ) );
5774 (~C).store( i , j , (xmm1+xmm5) * factor );
5775 (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
5776 (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
5777 (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
// Single remaining column, again with k unrolled by two (enclosing condition
// and accumulator declarations not visible here).
5792 for( ; (k+2UL) <= kend; k+=2UL ) {
5793 const SIMDType b1(
set( B(k ,j) ) );
5794 const SIMDType b2(
set( B(k+1UL,j) ) );
5795 xmm1 += A.load(i ,k ) * b1;
5796 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
5797 xmm3 += A.load(i ,k+1UL) * b2;
5798 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
5801 for( ; k<kend; ++k ) {
5802 const SIMDType b1(
set( B(k,j) ) );
5803 xmm1 += A.load(i ,k) * b1;
5804 xmm2 += A.load(i+SIMDSIZE,k) * b1;
5807 (~C).store( i , j, (xmm1+xmm3) * factor );
5808 (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
// Tier 6: one SIMD vector (SIMDSIZE rows) per step, covering whatever rows the
// wider tiers left below ipos.
5812 for( ; i<ipos; i+=SIMDSIZE )
// Column range for this row block: with SYM, HERM or LOW it ends at
// min(i+SIMDSIZE, N); with UPP it starts at column i.
5814 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
5815 size_t j( UPP ? i : 0UL );
// Four columns at a time, k unrolled by two. Even-k products go to xmm1..xmm4,
// odd-k products to xmm5..xmm8; the pairs are summed before scaling at the
// store. The loads of a1 are on lines not shown in this excerpt.
5817 for( ; (j+4UL) <= jend; j+=4UL )
5828 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5831 for( ; (k+2UL) <= kend; k+=2UL ) {
5833 const SIMDType a2( A.load(i,k+1UL) );
5834 xmm1 += a1 *
set( B(k ,j ) );
5835 xmm2 += a1 *
set( B(k ,j+1UL) );
5836 xmm3 += a1 *
set( B(k ,j+2UL) );
5837 xmm4 += a1 *
set( B(k ,j+3UL) );
5838 xmm5 += a2 *
set( B(k+1UL,j ) );
5839 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
5840 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
5841 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Leftover k when the k range has odd length.
5844 for( ; k<kend; ++k ) {
5846 xmm1 += a1 *
set( B(k,j ) );
5847 xmm2 += a1 *
set( B(k,j+1UL) );
5848 xmm3 += a1 *
set( B(k,j+2UL) );
5849 xmm4 += a1 *
set( B(k,j+3UL) );
5852 (~C).store( i, j , (xmm1+xmm5) * factor );
5853 (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
5854 (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
5855 (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
// Three columns at a time, same even/odd k split (xmm1..3 / xmm4..6).
5858 for( ; (j+3UL) <= jend; j+=3UL )
5869 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5872 for( ; (k+2UL) <= kend; k+=2UL ) {
5874 const SIMDType a2( A.load(i,k+1UL) );
5875 xmm1 += a1 *
set( B(k ,j ) );
5876 xmm2 += a1 *
set( B(k ,j+1UL) );
5877 xmm3 += a1 *
set( B(k ,j+2UL) );
5878 xmm4 += a2 *
set( B(k+1UL,j ) );
5879 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
5880 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
5883 for( ; k<kend; ++k ) {
5885 xmm1 += a1 *
set( B(k,j ) );
5886 xmm2 += a1 *
set( B(k,j+1UL) );
5887 xmm3 += a1 *
set( B(k,j+2UL) );
5890 (~C).store( i, j , (xmm1+xmm4) * factor );
5891 (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
5892 (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
// Two columns at a time for the single-SIMD-vector row tier.
// k is unrolled by two: even-k products accumulate in xmm1/xmm2, odd-k products
// in xmm3/xmm4, and the pairs are summed before scaling at the store.
5895 for( ; (j+2UL) <= jend; j+=2UL )
// The unrolled loop has to run first. Previously the single-step loop came first
// and consumed every k, so the unrolled loop never executed and xmm3/xmm4 were
// dead. The order now matches the sibling 3- and 4-column loops; only the
// floating-point summation order changes.
5909 for( ; (k+2UL) <= kend; k+=2UL ) {
5911 const SIMDType a2( A.load(i,k+1UL) );
5912 xmm1 += a1 *
set( B(k ,j ) );
5913 xmm2 += a1 *
set( B(k ,j+1UL) );
5914 xmm3 += a2 *
set( B(k+1UL,j ) );
5915 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
// Leftover k when the k range has odd length.
5918 for( ; k<kend; ++k ) {
5920 xmm1 += a1 *
set( B(k,j ) );
5921 xmm2 += a1 *
set( B(k,j+1UL) );
5924 (~C).store( i, j , (xmm1+xmm3) * factor );
5925 (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
// Single remaining column of the single-SIMD-vector row tier, k unrolled by two
// with separate even/odd accumulators (xmm1/xmm2) summed before scaling.
// NOTE(review): this k loop is bounded by K while the sibling loops use kend;
// the definitions for this column are not visible here - confirm they agree.
5939 for( ; (k+2UL) <= K; k+=2UL ) {
5940 xmm1 += A.load(i,k ) *
set( B(k ,j) );
5941 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
// Leftover k (its loop header is on a line not shown in this excerpt).
5945 xmm1 += A.load(i,k) *
set( B(k,j) );
5948 (~C).store( i, j, (xmm1+xmm2) * factor );
// Scalar clean-up: rows in [ipos, M) that did not fill a SIMD vector. Only runs
// when `remainder` is set.
5952 for( ; remainder && i<M; ++i )
// Columns start at the diagonal only when LOW and UPP are both set.
5954 size_t j( LOW && UPP ? i : 0UL );
// Two columns per iteration; value1/value2 are declared on lines not shown here.
5956 for( ; (j+2UL) <= N; j+=2UL )
5970 for(
size_t k=kbegin; k<kend; ++k ) {
5971 value1 += A(i,k) * B(k,j );
5972 value2 += A(i,k) * B(k,j+1UL);
// The scalar is applied once, at the write to C.
5975 (~C)(i,j ) = value1 * scalar;
5976 (~C)(i,j+1UL) = value2 * scalar;
// Single remaining column; this loop runs up to K.
5989 for(
size_t k=kbegin; k<K; ++k ) {
5990 value += A(i,k) * B(k,j);
5993 (~C)(i,j) = value * scalar;
// Post-processing of the part of C that the structured row tiers did not write.
// Blocks are SIMDSIZE*4 wide; the bound is the index rounded down to a multiple
// of SIMDSIZE*4.
// SYM/HERM: fill the upper part from the transposed element, conjugated for HERM.
5998 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
5999 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6000 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6001 for(
size_t i=0UL; i<iend; ++i ) {
6002 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW only: same index pattern over the upper part (loop body not visible here).
6006 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
6007 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6008 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6009 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: mirrored index pattern over the lower part (loop body not visible).
6014 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
6015 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6016 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6017 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel for the scaled product, fallback overload.
// Forwards C, A, B and the scalar to the default assignment kernel.
6039 template<
typename MT3
6044 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6046 selectDefaultAssignKernel( C, A, B, scalar );
// Second overload of selectLargeAssignKernel: dispatches the scaled product to one
// of the kernels smmm, hmmm, lmmm, ummm or mmm.
// NOTE(review): the branch conditions between the calls are not visible here;
// presumably they test the SYM / HERM / LOW / UPP flags of the expression -- confirm.
6065 template<
typename MT3
6070 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6073 smmm( C, A, B, scalar );
6075 hmmm( C, A, B, scalar );
// The lower, upper and general kernels take an extra trailing ST2(0) argument
// (the add-assign counterpart in this file passes ST2(1) in the same position).
6077 lmmm( C, A, B, scalar, ST2(0) );
6079 ummm( C, A, B, scalar, ST2(0) );
6081 mmm( C, A, B, scalar, ST2(0) );
// Fallback overload of selectBlasAssignKernel: forwards all arguments to
// selectLargeAssignKernel.
// NOTE(review): the condition that selects this overload is not visible here.
6099 template<
typename MT3
6104 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6106 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-backed overload of selectBlasAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): the conditions choosing between the three calls below, and any
// statement that initializes C before the trmm calls, are not visible here.
6111 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6125 template<
typename MT3
6130 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular multiply with A on the left; CblasLower is passed when
// IsLower<MT4>::value holds, CblasUpper otherwise.
6136 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Triangular multiply with B on the right; lower/upper chosen from IsLower<MT5>.
6140 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General multiply with ET(scalar) and ET(0) as the two scalar arguments.
6143 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment path that goes through a temporary: rhs is wrapped in serial() and
// evaluated into a TmpType object, which is then passed through the forward
// functor and assigned to the target.
// NOTE(review): the function signature and the TmpType / ForwardFunctor
// definitions are not visible in this excerpt.
6161 template<
typename MT
6180 const ForwardFunctor fwd;
6182 const TmpType tmp(
serial( rhs ) );
6183 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment: instead of evaluating the product as written, one of
// the two operands is transposed and the scaled product is assigned through the
// forward functor.
// NOTE(review): the signature, the definitions of `left` / `right`, and the
// condition selecting between the two assign calls are not visible here.
6201 template<
typename MT >
6212 const ForwardFunctor fwd;
// Variant with the left operand transposed.
6220 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
// Variant with the right operand transposed.
6222 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
// Entry point of the addition assignment for the scaled product.
// NOTE(review): the signature and the evaluation of the operands into A and B are
// not visible in this excerpt.
6238 template<
typename MT
// Degenerate case: the target has no rows or no columns, or the inner dimension
// (left.columns()) is zero. The body of this branch is not visible; presumably
// it returns without touching the target -- confirm.
6251 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection together with the scalar of the expression.
6265 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses between the "small" and the BLAS add-assign kernels.
// The visible part of the condition selects the small kernel when
//   - debug mode is off and A has at most SIMDSIZE*10 rows, or
//   - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// NOTE(review): the opening of the if-condition (and any further alternatives it
// contains) is not visible in this excerpt.
6280 template<
typename MT3
6284 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6287 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6288 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6289 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise fall through to the BLAS kernel selection.
6291 selectBlasAddAssignKernel( C, A, B, scalar );
// Default add-assign kernel: adds a previously computed object `tmp` to C.
// NOTE(review): the statement that builds `tmp` is not visible here; presumably it
// holds the evaluated scaled product -- confirm.
6309 template<
typename MT3
6314 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6317 addAssign( C, tmp );
// Default add-assign kernel for the case "A is not diagonal, B is diagonal"
// (see the EnableIf_ condition below). Only the diagonal element B(j,j) of
// column j is used, so each C(i,j) receives A(i,j) * B(j,j) * scalar.
// NOTE(review): the definitions of ibegin / iend and the guard in front of the
// trailing single-element update are not visible in this excerpt.
6335 template<
typename MT3
6339 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6340 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6344 const size_t M( A.rows() );
6345 const size_t N( B.columns() );
6347 for(
size_t j=0UL; j<N; ++j )
// inum rows are processed in pairs; masking with size_t(-2) clears the lowest
// bit, so ipos marks the end of the paired part of the row range.
6357 const size_t inum( iend - ibegin );
6358 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6360 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6361 C(i ,j) += A(i ,j) * B(j,j) * scalar;
6362 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update for the row at ipos (the leftover row when inum is odd).
6365 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default add-assign kernel that reads only diagonal elements A(i,i) of the left
// operand: each C(i,j) receives A(i,i) * B(i,j) * scalar.
// NOTE(review): the enable-if condition is not visible; from the access pattern
// this is presumably the "A diagonal, B not diagonal" overload -- confirm. The
// definitions of ibegin / iend and the guard of the trailing update are elided too.
6385 template<
typename MT3
6390 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6394 const size_t M( A.rows() );
6395 const size_t N( B.columns() );
6397 for(
size_t j=0UL; j<N; ++j )
// Rows are processed in pairs; ipos is ibegin plus inum with its lowest bit cleared.
6407 const size_t inum( iend - ibegin );
6408 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6410 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6411 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6412 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for the row at ipos (the leftover row when inum is odd).
6415 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default add-assign kernel that touches diagonal elements only: for every row
// index i of A, C(i,i) receives A(i,i) * B(i,i) * scalar.
// NOTE(review): the enable-if condition is not visible; presumably this is the
// overload for two diagonal operands -- confirm.
6435 template<
typename MT3
6440 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6444 for(
size_t i=0UL; i<A.rows(); ++i ) {
6445 C(i,i) += A(i,i) * B(i,i) * scalar;
// Fallback overload of selectSmallAddAssignKernel: forwards all arguments to
// selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload is not visible here.
6464 template<
typename MT3
6469 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6471 selectDefaultAddAssignKernel( C, A, B, scalar );
// Add-assign kernel that replaces one operand by a temporary `tmp` and then adds
// either A * tmp or tmp * B (scaled, passed through the forward functor) to C.
// NOTE(review): the function name, the construction of `tmp` in each branch and the
// first branch conditions are not visible in this excerpt.
6490 template<
typename MT3
6502 const ForwardFunctor fwd;
6506 addAssign( ~C, fwd( A * tmp ) * scalar );
6510 addAssign( ~C, fwd( tmp * B ) * scalar );
// Size-based choice: when B has no more elements than A, the "A * tmp" form is
// used, otherwise the "tmp * B" form.
6512 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6514 addAssign( ~C, fwd( A * tmp ) * scalar );
6518 addAssign( ~C, fwd( tmp * B ) * scalar );
// Vectorized add-assign kernel for the scaled product: C += scalar * ( A * B ).
// Rows of C are handled in strips of decreasing height (8, 5, 4, 3, 2 and 1 times
// SIMDSIZE), followed by a scalar loop for the rows left over. In every strip the
// kernel accumulates A.load(...) * set( B(k,j) ) over k, multiplies the sum by
// `factor` and adds it to the values loaded from C before storing them back.
// NOTE(review): the function name/signature, the definition of `remainder`, the
// declaration of i and the kbegin / kend definitions are not visible in this excerpt.
6538 template<
typename MT3
6547 const size_t M( A.rows() );
6548 const size_t N( B.columns() );
6549 const size_t K( A.columns() );
// End of the SIMD-processed row range: with `remainder` set, M is masked with
// size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two); otherwise all M rows are processed with SIMD.
6553 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// SIMD operand built from the scalar, used to scale every accumulated sum.
6556 const SIMDType factor(
set( scalar ) );
// Pass 1: strips of 8*SIMDSIZE rows, one column at a time. Only taken when neither
// LOW nor UPP is set.
6562 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6563 for(
size_t j=0UL; j<N; ++j )
6576 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6578 for(
size_t k=kbegin; k<kend; ++k ) {
6579 const SIMDType b1(
set( B(k,j) ) );
6580 xmm1 += A.load(i ,k) * b1;
6581 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6582 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6583 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6584 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6585 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6586 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6587 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6590 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6591 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6592 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6593 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6594 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6595 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
6596 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
6597 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// Pass 2: strips of 5*SIMDSIZE rows; columns are taken two at a time, then a
// single-column block handles what is left.
6602 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6606 for( ; (j+2UL) <= N; j+=2UL )
6619 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6621 for(
size_t k=kbegin; k<kend; ++k ) {
6623 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6624 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6625 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6626 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6627 const SIMDType b1(
set( B(k,j ) ) );
6628 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 belong to column j, xmm6..xmm10 to column j+1.
6641 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6642 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6643 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6644 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6645 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
6646 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
6647 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
6648 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6649 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6650 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Single-column block of the 5*SIMDSIZE pass.
6662 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6664 for(
size_t k=kbegin; k<kend; ++k ) {
6665 const SIMDType b1(
set( B(k,j) ) );
6666 xmm1 += A.load(i ,k) * b1;
6667 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6668 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6669 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6670 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6673 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6674 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6675 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6676 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6677 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
// Pass 3: strips of 4*SIMDSIZE rows; two columns at a time, then one column.
6681 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6685 for( ; (j+2UL) <= N; j+=2UL )
6698 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6700 for(
size_t k=kbegin; k<kend; ++k ) {
6702 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6703 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6704 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6705 const SIMDType b1(
set( B(k,j ) ) );
6706 const SIMDType b2(
set( B(k,j+1UL) ) );
6717 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6718 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6719 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6720 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6721 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6722 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
6723 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6724 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single-column block of the 4*SIMDSIZE pass.
6738 for(
size_t k=kbegin; k<kend; ++k ) {
6739 const SIMDType b1(
set( B(k,j) ) );
6740 xmm1 += A.load(i ,k) * b1;
6741 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6742 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6743 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6746 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6747 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6748 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6749 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// Pass 4: strips of 3*SIMDSIZE rows; two columns at a time, then one column.
6753 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6757 for( ; (j+2UL) <= N; j+=2UL )
6770 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6772 for(
size_t k=kbegin; k<kend; ++k ) {
6774 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6775 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6776 const SIMDType b1(
set( B(k,j ) ) );
6777 const SIMDType b2(
set( B(k,j+1UL) ) );
6786 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6787 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6788 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6789 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
6790 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
6791 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Single-column block of the 3*SIMDSIZE pass.
6805 for(
size_t k=kbegin; k<kend; ++k ) {
6806 const SIMDType b1(
set( B(k,j) ) );
6807 xmm1 += A.load(i ,k) * b1;
6808 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6809 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6812 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6813 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6814 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
// Pass 5: strips of 2*SIMDSIZE rows. Skipped only when both LOW and UPP are set.
// The column range is restricted by the flags: it ends at min(i+SIMDSIZE*2,N) when
// LOW is set and starts at i when UPP is set.
6818 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6820 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
6821 size_t j( UPP ? i : 0UL );
// Four columns at a time.
6823 for( ; (j+4UL) <= jend; j+=4UL )
6836 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6838 for(
size_t k=kbegin; k<kend; ++k ) {
6840 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6841 const SIMDType b1(
set( B(k,j ) ) );
6842 const SIMDType b2(
set( B(k,j+1UL) ) );
6843 const SIMDType b3(
set( B(k,j+2UL) ) );
6844 const SIMDType b4(
set( B(k,j+3UL) ) );
6855 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6856 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6857 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6858 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6859 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6860 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6861 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6862 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
// Three columns at a time.
6865 for( ; (j+3UL) <= jend; j+=3UL )
// NOTE(review): eight accumulators are declared here, but the stores of this
// three-column block only read xmm1..xmm6, and the matching block of the
// subtraction kernel in this file declares six. xmm7 / xmm8 look unused --
// confirm against the elided accumulation lines and drop them if so.
6878 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6880 for(
size_t k=kbegin; k<kend; ++k ) {
6882 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6883 const SIMDType b1(
set( B(k,j ) ) );
6884 const SIMDType b2(
set( B(k,j+1UL) ) );
6885 const SIMDType b3(
set( B(k,j+2UL) ) );
6894 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6895 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6896 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6897 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6898 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6899 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
// Two columns at a time; the k loop is unrolled by two, using a second set of
// accumulators (xmm5..xmm8) that is summed with the first before the store.
6902 for( ; (j+2UL) <= jend; j+=2UL )
6915 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6918 for( ; (k+2UL) <= kend; k+=2UL ) {
6919 const SIMDType a1( A.load(i ,k ) );
6920 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6921 const SIMDType a3( A.load(i ,k+1UL) );
6922 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6923 const SIMDType b1(
set( B(k ,j ) ) );
6924 const SIMDType b2(
set( B(k ,j+1UL) ) );
6925 const SIMDType b3(
set( B(k+1UL,j ) ) );
6926 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k iteration of the unrolled loop.
6937 for( ; k<kend; ++k ) {
6939 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6940 const SIMDType b1(
set( B(k,j ) ) );
6941 const SIMDType b2(
set( B(k,j+1UL) ) );
6948 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
6949 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
6950 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
6951 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// Single remaining column, again with the k loop unrolled by two.
6966 for( ; (k+2UL) <= kend; k+=2UL ) {
6967 const SIMDType b1(
set( B(k ,j) ) );
6968 const SIMDType b2(
set( B(k+1UL,j) ) );
6969 xmm1 += A.load(i ,k ) * b1;
6970 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
6971 xmm3 += A.load(i ,k+1UL) * b2;
6972 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
6975 for( ; k<kend; ++k ) {
6976 const SIMDType b1(
set( B(k,j) ) );
6977 xmm1 += A.load(i ,k) * b1;
6978 xmm2 += A.load(i+SIMDSIZE,k) * b1;
6981 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
6982 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
// Pass 6: strips of SIMDSIZE rows. The column range ends at min(i+SIMDSIZE,N) only
// when both LOW and UPP are set, and starts at i when UPP is set.
6986 for( ; i<ipos; i+=SIMDSIZE )
6988 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
6989 size_t j( UPP ? i : 0UL );
// Four columns at a time, k unrolled by two.
6991 for( ; (j+4UL) <= jend; j+=4UL )
7002 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7005 for( ; (k+2UL) <= kend; k+=2UL ) {
7007 const SIMDType a2( A.load(i,k+1UL) );
7008 xmm1 += a1 *
set( B(k ,j ) );
7009 xmm2 += a1 *
set( B(k ,j+1UL) );
7010 xmm3 += a1 *
set( B(k ,j+2UL) );
7011 xmm4 += a1 *
set( B(k ,j+3UL) );
7012 xmm5 += a2 *
set( B(k+1UL,j ) );
7013 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
7014 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
7015 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
7018 for( ; k<kend; ++k ) {
7020 xmm1 += a1 *
set( B(k,j ) );
7021 xmm2 += a1 *
set( B(k,j+1UL) );
7022 xmm3 += a1 *
set( B(k,j+2UL) );
7023 xmm4 += a1 *
set( B(k,j+3UL) );
7026 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
7027 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
7028 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
7029 (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
// Three columns at a time, k unrolled by two.
7032 for( ; (j+3UL) <= jend; j+=3UL )
7043 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7046 for( ; (k+2UL) <= kend; k+=2UL ) {
7048 const SIMDType a2( A.load(i,k+1UL) );
7049 xmm1 += a1 *
set( B(k ,j ) );
7050 xmm2 += a1 *
set( B(k ,j+1UL) );
7051 xmm3 += a1 *
set( B(k ,j+2UL) );
7052 xmm4 += a2 *
set( B(k+1UL,j ) );
7053 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
7054 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
7057 for( ; k<kend; ++k ) {
7059 xmm1 += a1 *
set( B(k,j ) );
7060 xmm2 += a1 *
set( B(k,j+1UL) );
7061 xmm3 += a1 *
set( B(k,j+2UL) );
7064 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
7065 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
7066 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
// Two columns at a time, k unrolled by two.
7069 for( ; (j+2UL) <= jend; j+=2UL )
7083 for( ; (k+2UL) <= kend; k+=2UL ) {
7085 const SIMDType a2( A.load(i,k+1UL) );
7086 xmm1 += a1 *
set( B(k ,j ) );
7087 xmm2 += a1 *
set( B(k ,j+1UL) );
7088 xmm3 += a2 *
set( B(k+1UL,j ) );
7089 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
7092 for( ; k<kend; ++k ) {
7094 xmm1 += a1 *
set( B(k,j ) );
7095 xmm2 += a1 *
set( B(k,j+1UL) );
7098 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
7099 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
// Single remaining column; note that this k loop is bounded by K, not kend.
7113 for( ; (k+2UL) <= K; k+=2UL ) {
7114 xmm1 += A.load(i,k ) *
set( B(k ,j) );
7115 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
7119 xmm1 += A.load(i,k) *
set( B(k,j) );
7122 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
// Scalar loop for the rows not covered by the SIMD passes (only when `remainder`
// is set). The column range ends at i+1 when LOW is set and starts at i when UPP
// is set.
7126 for( ; remainder && i<M; ++i )
7128 const size_t jend( LOW ? i+1UL : N );
7129 size_t j( UPP ? i : 0UL );
// Two columns at a time.
7131 for( ; (j+2UL) <= jend; j+=2UL )
7145 for(
size_t k=kbegin; k<kend; ++k ) {
7146 value1 += A(i,k) * B(k,j );
7147 value2 += A(i,k) * B(k,j+1UL);
7150 (~C)(i,j ) += value1 * scalar;
7151 (~C)(i,j+1UL) += value2 * scalar;
// Single remaining column.
7164 for(
size_t k=kbegin; k<K; ++k ) {
7165 value += A(i,k) * B(k,j);
7168 (~C)(i,j) += value * scalar;
// Fallback overload of selectLargeAddAssignKernel: forwards all arguments to
// selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload is not visible here.
7188 template<
typename MT3
7193 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7195 selectDefaultAddAssignKernel( C, A, B, scalar );
// Second overload of selectLargeAddAssignKernel: dispatches to lmmm, ummm or mmm,
// each called with the scalar and a trailing ST2(1) (the plain-assign counterpart
// in this file passes ST2(0) in that position).
// NOTE(review): the conditions choosing between the three calls are not visible;
// presumably they test the LOW / UPP flags -- confirm.
7214 template<
typename MT3
7219 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7222 lmmm( C, A, B, scalar, ST2(1) );
7224 ummm( C, A, B, scalar, ST2(1) );
7226 mmm( C, A, B, scalar, ST2(1) );
// Fallback overload of selectBlasAddAssignKernel: forwards all arguments to
// selectLargeAddAssignKernel.
// NOTE(review): the condition that selects this overload is not visible here.
7245 template<
typename MT3
7250 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7252 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-backed overload of selectBlasAddAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
// NOTE(review): the construction of `tmp` and the conditions choosing between the
// three paths below are not visible in this excerpt.
7257 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7271 template<
typename MT3
7276 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular path with A on the left: the scaled product is formed in `tmp`
// (lower/upper chosen from IsLower<MT4>) and then added to C.
7282 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7283 addAssign( C, tmp );
// Triangular path with B on the right (lower/upper chosen from IsLower<MT5>).
7287 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7288 addAssign( C, tmp );
// General path: gemm is called directly on C with ET(scalar) and ET(1).
7291 gemm( C, A, B, ET(scalar), ET(1) );
// Restructuring addition assignment: one of the two operands is transposed and the
// scaled product is add-assigned through the forward functor.
// NOTE(review): the signature, the definitions of `left` / `right`, and the
// condition selecting between the two addAssign calls are not visible here.
7312 template<
typename MT >
7323 const ForwardFunctor fwd;
// Variant with the left operand transposed.
7331 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
// Variant with the right operand transposed.
7333 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
// Entry point of the subtraction assignment for the scaled product.
// NOTE(review): the signature and the evaluation of the operands into A and B are
// not visible in this excerpt.
7353 template<
typename MT
// Degenerate case: the target has no rows or no columns, or the inner dimension
// (left.columns()) is zero. The body of this branch is not visible; presumably
// it returns without touching the target -- confirm.
7366 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection together with the scalar of the expression.
7380 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses between the "small" and the BLAS sub-assign kernels.
// The visible part of the condition selects the small kernel when
//   - debug mode is off and A has at most SIMDSIZE*10 rows, or
//   - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// NOTE(review): the opening of the if-condition (and any further alternatives it
// contains) is not visible in this excerpt.
7395 template<
typename MT3
7399 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7402 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
7403 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7404 selectSmallSubAssignKernel( C, A, B, scalar );
// Otherwise fall through to the BLAS kernel selection.
7406 selectBlasSubAssignKernel( C, A, B, scalar );
// Default sub-assign kernel: subtracts a previously computed object `tmp` from C.
// NOTE(review): the statement that builds `tmp` is not visible here; presumably it
// holds the evaluated scaled product -- confirm.
7424 template<
typename MT3
7429 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7432 subAssign( C, tmp );
// Default sub-assign kernel for the case "A is not diagonal, B is diagonal"
// (see the EnableIf_ condition below). Only the diagonal element B(j,j) of
// column j is used, so A(i,j) * B(j,j) * scalar is subtracted from each C(i,j).
// NOTE(review): the definitions of ibegin / iend and the guard in front of the
// trailing single-element update are not visible in this excerpt.
7450 template<
typename MT3
7454 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7455 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7459 const size_t M( A.rows() );
7460 const size_t N( B.columns() );
7462 for(
size_t j=0UL; j<N; ++j )
// inum rows are processed in pairs; masking with size_t(-2) clears the lowest
// bit, so ipos marks the end of the paired part of the row range.
7472 const size_t inum( iend - ibegin );
7473 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7475 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7476 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7477 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update for the row at ipos (the leftover row when inum is odd).
7480 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default sub-assign kernel that reads only diagonal elements A(i,i) of the left
// operand: A(i,i) * B(i,j) * scalar is subtracted from each C(i,j).
// NOTE(review): the enable-if condition is not visible; from the access pattern
// this is presumably the "A diagonal, B not diagonal" overload -- confirm. The
// definitions of ibegin / iend and the guard of the trailing update are elided too.
7500 template<
typename MT3
7505 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7509 const size_t M( A.rows() );
7510 const size_t N( B.columns() );
7512 for(
size_t j=0UL; j<N; ++j )
// Rows are processed in pairs; ipos is ibegin plus inum with its lowest bit cleared.
7522 const size_t inum( iend - ibegin );
7523 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7525 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7526 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7527 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for the row at ipos (the leftover row when inum is odd).
7530 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default sub-assign kernel that touches diagonal elements only: for every row
// index i of A, A(i,i) * B(i,i) * scalar is subtracted from C(i,i).
// NOTE(review): the enable-if condition is not visible; presumably this is the
// overload for two diagonal operands -- confirm.
7550 template<
typename MT3
7555 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7559 for(
size_t i=0UL; i<A.rows(); ++i ) {
7560 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Fallback overload of selectSmallSubAssignKernel: forwards all arguments to
// selectDefaultSubAssignKernel.
// NOTE(review): the condition that selects this overload is not visible here.
7579 template<
typename MT3
7584 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7586 selectDefaultSubAssignKernel( C, A, B, scalar );
// Sub-assign kernel that replaces one operand by a temporary `tmp` and then
// subtracts either A * tmp or tmp * B (scaled, passed through the forward functor)
// from C.
// NOTE(review): the function name, the construction of `tmp` in each branch and the
// first branch conditions are not visible in this excerpt.
7605 template<
typename MT3
7617 const ForwardFunctor fwd;
7621 subAssign( ~C, fwd( A * tmp ) * scalar );
7625 subAssign( ~C, fwd( tmp * B ) * scalar );
// Size-based choice: when B has no more elements than A, the "A * tmp" form is
// used, otherwise the "tmp * B" form.
7627 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7629 subAssign( ~C, fwd( A * tmp ) * scalar );
7633 subAssign( ~C, fwd( tmp * B ) * scalar );
// Vectorized sub-assign kernel for the scaled product: C -= scalar * ( A * B ).
// Rows of C are handled in strips of decreasing height (8, 5, 4, 3, 2 and 1 times
// SIMDSIZE), followed by a scalar loop for the rows left over. In every strip the
// kernel accumulates A.load(...) * set( B(k,j) ) over k, multiplies the sum by
// `factor` and subtracts it from the values loaded from C before storing them back.
// NOTE(review): the function name/signature, the definition of `remainder`, the
// declaration of i and the kbegin / kend definitions are not visible in this excerpt.
7653 template<
typename MT3
7662 const size_t M( A.rows() );
7663 const size_t N( B.columns() );
7664 const size_t K( A.columns() );
// End of the SIMD-processed row range: with `remainder` set, M is masked with
// size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two); otherwise all M rows are processed with SIMD.
7668 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// SIMD operand built from the scalar, used to scale every accumulated sum.
7671 const SIMDType factor(
set( scalar ) );
// Pass 1: strips of 8*SIMDSIZE rows, one column at a time. Only taken when neither
// LOW nor UPP is set.
7677 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7678 for(
size_t j=0UL; j<N; ++j )
7691 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7693 for(
size_t k=kbegin; k<kend; ++k ) {
7694 const SIMDType b1(
set( B(k,j) ) );
7695 xmm1 += A.load(i ,k) * b1;
7696 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7697 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7698 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7699 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7700 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7701 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7702 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7705 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7706 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7707 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7708 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7709 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7710 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
7711 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
7712 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
// Pass 2: strips of 5*SIMDSIZE rows; columns are taken two at a time, then a
// single-column block handles what is left.
7717 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
7721 for( ; (j+2UL) <= N; j+=2UL )
7734 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7736 for(
size_t k=kbegin; k<kend; ++k ) {
7738 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7739 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7740 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7741 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7742 const SIMDType b1(
set( B(k,j ) ) );
7743 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 belong to column j, xmm6..xmm10 to column j+1.
7756 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7757 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7758 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7759 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7760 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
7761 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
7762 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
7763 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7764 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7765 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
// Single-column block of the 5*SIMDSIZE pass.
7777 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7779 for(
size_t k=kbegin; k<kend; ++k ) {
7780 const SIMDType b1(
set( B(k,j) ) );
7781 xmm1 += A.load(i ,k) * b1;
7782 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7783 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7784 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7785 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7788 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7789 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7790 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7791 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7792 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
// Pass 3: strips of 4*SIMDSIZE rows; two columns at a time, then one column.
7796 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7800 for( ; (j+2UL) <= N; j+=2UL )
7813 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7815 for(
size_t k=kbegin; k<kend; ++k ) {
7817 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7818 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7819 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7820 const SIMDType b1(
set( B(k,j ) ) );
7821 const SIMDType b2(
set( B(k,j+1UL) ) );
7832 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7833 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7834 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7835 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7836 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7837 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
7838 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
7839 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// Single-column block of the 4*SIMDSIZE pass.
7853 for(
size_t k=kbegin; k<kend; ++k ) {
7854 const SIMDType b1(
set( B(k,j) ) );
7855 xmm1 += A.load(i ,k) * b1;
7856 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7857 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7858 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7861 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7862 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7863 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7864 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
// Pass 4: strips of 3*SIMDSIZE rows; two columns at a time, then one column.
7868 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7872 for( ; (j+2UL) <= N; j+=2UL )
7885 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7887 for(
size_t k=kbegin; k<kend; ++k ) {
7889 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7890 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7891 const SIMDType b1(
set( B(k,j ) ) );
7892 const SIMDType b2(
set( B(k,j+1UL) ) );
7901 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7902 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7903 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7904 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
7905 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
7906 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
// Single-column block of the 3*SIMDSIZE pass.
7920 for(
size_t k=kbegin; k<kend; ++k ) {
7921 const SIMDType b1(
set( B(k,j) ) );
7922 xmm1 += A.load(i ,k) * b1;
7923 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7924 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7927 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7928 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7929 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
// Pass 5: strips of 2*SIMDSIZE rows. Skipped only when both LOW and UPP are set.
// The column range is restricted by the flags: it ends at min(i+SIMDSIZE*2,N) when
// LOW is set and starts at i when UPP is set.
7933 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7935 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
7936 size_t j( UPP ? i : 0UL );
// Four columns at a time.
7938 for( ; (j+4UL) <= jend; j+=4UL )
7951 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7953 for(
size_t k=kbegin; k<kend; ++k ) {
7955 const SIMDType a2( A.load(i+SIMDSIZE,k) );
7956 const SIMDType b1(
set( B(k,j ) ) );
7957 const SIMDType b2(
set( B(k,j+1UL) ) );
7958 const SIMDType b3(
set( B(k,j+2UL) ) );
7959 const SIMDType b4(
set( B(k,j+3UL) ) );
7970 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7971 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
7972 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7973 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
7974 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7975 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
7976 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7977 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
// Three columns at a time (six accumulators: two row vectors times three columns).
7980 for( ; (j+3UL) <= jend; j+=3UL )
7993 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7995 for(
size_t k=kbegin; k<kend; ++k ) {
7997 const SIMDType a2( A.load(i+SIMDSIZE,k) );
7998 const SIMDType b1(
set( B(k,j ) ) );
7999 const SIMDType b2(
set( B(k,j+1UL) ) );
8000 const SIMDType b3(
set( B(k,j+2UL) ) );
8009 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
8010 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
8011 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
8012 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8013 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
8014 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
// Two columns at a time; the k loop is unrolled by two, using a second set of
// accumulators (xmm5..xmm8) that is summed with the first before the store.
8017 for( ; (j+2UL) <= jend; j+=2UL )
8030 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8033 for( ; (k+2UL) <= kend; k+=2UL ) {
8034 const SIMDType a1( A.load(i ,k ) );
8035 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8036 const SIMDType a3( A.load(i ,k+1UL) );
8037 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8038 const SIMDType b1(
set( B(k ,j ) ) );
8039 const SIMDType b2(
set( B(k ,j+1UL) ) );
8040 const SIMDType b3(
set( B(k+1UL,j ) ) );
8041 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k iteration of the unrolled loop.
8052 for( ; k<kend; ++k ) {
8054 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8055 const SIMDType b1(
set( B(k,j ) ) );
8056 const SIMDType b2(
set( B(k,j+1UL) ) );
8063 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
8064 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
8065 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
8066 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
// Single remaining column, again with the k loop unrolled by two.
8081 for( ; (k+2UL) <= kend; k+=2UL ) {
8082 const SIMDType b1(
set( B(k ,j) ) );
8083 const SIMDType b2(
set( B(k+1UL,j) ) );
8084 xmm1 += A.load(i ,k ) * b1;
8085 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8086 xmm3 += A.load(i ,k+1UL) * b2;
8087 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8090 for( ; k<kend; ++k ) {
8091 const SIMDType b1(
set( B(k,j) ) );
8092 xmm1 += A.load(i ,k) * b1;
8093 xmm2 += A.load(i+SIMDSIZE,k) * b1;
8096 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
8097 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
// Pass 6: strips of SIMDSIZE rows. The column range ends at min(i+SIMDSIZE,N) only
// when both LOW and UPP are set, and starts at i when UPP is set.
8101 for( ; i<ipos; i+=SIMDSIZE )
8103 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
8104 size_t j( UPP ? i : 0UL );
// Four columns at a time, k unrolled by two.
8106 for( ; (j+4UL) <= jend; j+=4UL )
8117 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8120 for( ; (k+2UL) <= kend; k+=2UL ) {
8122 const SIMDType a2( A.load(i,k+1UL) );
8123 xmm1 += a1 *
set( B(k ,j ) );
8124 xmm2 += a1 *
set( B(k ,j+1UL) );
8125 xmm3 += a1 *
set( B(k ,j+2UL) );
8126 xmm4 += a1 *
set( B(k ,j+3UL) );
8127 xmm5 += a2 *
set( B(k+1UL,j ) );
8128 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8129 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8130 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
8133 for( ; k<kend; ++k ) {
8135 xmm1 += a1 *
set( B(k,j ) );
8136 xmm2 += a1 *
set( B(k,j+1UL) );
8137 xmm3 += a1 *
set( B(k,j+2UL) );
8138 xmm4 += a1 *
set( B(k,j+3UL) );
8141 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
8142 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
8143 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
8144 (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
// Three columns at a time, k unrolled by two.
8147 for( ; (j+3UL) <= jend; j+=3UL )
8158 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8161 for( ; (k+2UL) <= kend; k+=2UL ) {
8163 const SIMDType a2( A.load(i,k+1UL) );
8164 xmm1 += a1 *
set( B(k ,j ) );
8165 xmm2 += a1 *
set( B(k ,j+1UL) );
8166 xmm3 += a1 *
set( B(k ,j+2UL) );
8167 xmm4 += a2 *
set( B(k+1UL,j ) );
8168 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
8169 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
8172 for( ; k<kend; ++k ) {
8174 xmm1 += a1 *
set( B(k,j ) );
8175 xmm2 += a1 *
set( B(k,j+1UL) );
8176 xmm3 += a1 *
set( B(k,j+2UL) );
8179 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
8180 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
8181 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
// Two columns at a time, k unrolled by two.
8184 for( ; (j+2UL) <= jend; j+=2UL )
8198 for( ; (k+2UL) <= kend; k+=2UL ) {
8200 const SIMDType a2( A.load(i,k+1UL) );
8201 xmm1 += a1 *
set( B(k ,j ) );
8202 xmm2 += a1 *
set( B(k ,j+1UL) );
8203 xmm3 += a2 *
set( B(k+1UL,j ) );
8204 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
8207 for( ; k<kend; ++k ) {
8209 xmm1 += a1 *
set( B(k,j ) );
8210 xmm2 += a1 *
set( B(k,j+1UL) );
8213 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
8214 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
// Single remaining column; note that this k loop is bounded by K, not kend.
8228 for( ; (k+2UL) <= K; k+=2UL ) {
8229 xmm1 += A.load(i,k ) *
set( B(k ,j) );
8230 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
8234 xmm1 += A.load(i,k) *
set( B(k,j) );
8237 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
// Scalar loop for the rows not covered by the SIMD passes (only when `remainder`
// is set). The column range ends at i+1 when LOW is set and starts at i when UPP
// is set.
8241 for( ; remainder && i<M; ++i )
8243 const size_t jend( LOW ? i+1UL : N );
8244 size_t j( UPP ? i : 0UL );
// Two columns at a time.
8246 for( ; (j+2UL) <= jend; j+=2UL )
8260 for(
size_t k=kbegin; k<kend; ++k ) {
8261 value1 += A(i,k) * B(k,j );
8262 value2 += A(i,k) * B(k,j+1UL);
8265 (~C)(i,j ) -= value1 * scalar;
8266 (~C)(i,j+1UL) -= value2 * scalar;
// Single remaining column.
8279 for(
size_t k=kbegin; k<K; ++k ) {
8280 value += A(i,k) * B(k,j);
8283 (~C)(i,j) -= value * scalar;
// Fallback overload of selectLargeSubAssignKernel: forwards all arguments to
// selectDefaultSubAssignKernel.
// NOTE(review): the condition that selects this overload is not visible here.
8303 template<
typename MT3
8308 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8310 selectDefaultSubAssignKernel( C, A, B, scalar );
8329 template<
typename MT3
8334 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8337 lmmm( C, A, B, -scalar, ST2(1) );
8339 ummm( C, A, B, -scalar, ST2(1) );
8341 mmm( C, A, B, -scalar, ST2(1) );
8360 template<
typename MT3
8365 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8367 selectLargeSubAssignKernel( C, A, B, scalar );
8372 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8386 template<
typename MT3
8391 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8397 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8398 subAssign( C, tmp );
8402 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8403 subAssign( C, tmp );
8406 gemm( C, A, B, ET(-scalar), ET(1) );
8426 template<
typename MT >
8437 const ForwardFunctor fwd;
8445 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
8447 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
8467 template<
typename MT
8481 schurAssign( ~lhs, tmp );
8512 template<
typename MT
8525 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8528 else if( left.columns() == 0UL ) {
8562 template<
typename MT
8581 const ForwardFunctor fwd;
8583 const TmpType tmp( rhs );
8602 template<
typename MT >
8613 const ForwardFunctor fwd;
8642 template<
typename MT
8655 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8688 template<
typename MT >
8699 const ForwardFunctor fwd;
8732 template<
typename MT
8745 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8778 template<
typename MT >
8789 const ForwardFunctor fwd;
8819 template<
typename MT
8899 template<
typename MT1
8901 inline decltype(
auto)
8947 template<
typename MT1
8962 return ReturnType( dm.leftOperand(), dm.rightOperand() );
8991 template<
typename MT1
9006 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9035 template<
typename MT1
9050 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9079 template<
typename MT1
9094 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9123 template<
typename MT1
9138 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9154 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9155 struct Rows< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9172 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9173 struct Columns< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9190 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9191 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9192 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
9208 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9209 struct IsSymmetric< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9212 , IsBuiltin< ElementType_< TDMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
9213 , And< Bool<LF>, Bool<UF> > >::value >
9229 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
9230 struct IsHermitian< TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
9247 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9248 struct IsLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9250 , And< IsLower<MT1>, IsLower<MT2> >
9251 , And< Or< Bool<SF>, Bool<HF> >
9252 , IsUpper<MT1>, IsUpper<MT2> > >::value >
9268 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9269 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9270 :
public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
9271 , And< Or< Bool<SF>, Bool<HF> >
9272 , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
9288 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9290 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9291 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
9292 , And< Or< Bool<SF>, Bool<HF> >
9293 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9294 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
9310 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9311 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9313 , And< IsUpper<MT1>, IsUpper<MT2> >
9314 , And< Or< Bool<SF>, Bool<HF> >
9315 , IsLower<MT1>, IsLower<MT2> > >::value >
9331 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9332 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9333 :
public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
9334 , And< Or< Bool<SF>, Bool<HF> >
9335 , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
9351 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9353 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9354 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
9355 , And< Or< Bool<SF>, Bool<HF> >
9356 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9357 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:488
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:476
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:297
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:547
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:288
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:177
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:620
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:537
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:323
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1027
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:154
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:280
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:422
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:176
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:279
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:283
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:432
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:110
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:281
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:456
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:179
Header file for the Columns type trait.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1027
Header file for the IsLower type trait.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:160
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:178
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:282
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:619
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:264
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:489
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:108
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:159
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:412
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:294
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:386
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1029
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:101
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:790
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1029
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:402
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:444
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:338
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:466
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:742
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:285
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:284
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:291