// Include guard for this header, followed by the opening of the TDMatTDMatMultExpr class template.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 141 template<
typename MT1
147 class TDMatTDMatMultExpr
148 :
public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
149 ,
private Computation
// Derived structure flags computed from the SF/HF/LF/UF template arguments
// (presumably symmetric/Hermitian/lower/upper -- confirm against the template header):
//   SYM  : SF set and none of HF, LF, UF set.
//   HERM : HF set and neither LF nor UF set.
//   LOW  : LF set, or UF combined with SF or HF.
//   UPP  : UF set, or LF combined with SF or HF.
// Note that LOW and UPP can both be true at once (e.g. SF together with LF or UF).
174 SYM = ( SF && !( HF || LF || UF ) ),
175 HERM = ( HF && !( LF || UF ) ),
176 LOW = ( LF || ( ( SF || HF ) && UF ) ),
177 UPP = ( UF || ( ( SF || HF ) && LF ) )
// Helper trait over three types. Its nested `value` is defined on lines not shown in this
// excerpt; the trait is consumed by IsEvaluationRequired.
189 template<
typename T1,
typename T2,
typename T3 >
190 struct CanExploitSymmetry {
// Helper trait: `value` is true when at least one operand needs an intermediate evaluation
// (evaluateLeft || evaluateRight) AND the symmetry-exploiting strategy does NOT apply.
// The negation matters: without it this trait would only be true in exactly the cases
// where CanExploitSymmetry is true, so the "evaluate operands" path and the
// "exploit symmetry" path could never be selected exclusively of each other.
// NOTE(review): the overloads constrained by these two traits are outside this excerpt --
// confirm they are meant to be mutually exclusive.
203 template<
typename T1,
typename T2,
typename T3 >
204 struct IsEvaluationRequired {
205 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
206 !CanExploitSymmetry<T1,T2,T3>::value };
// Helper trait deciding whether a BLAS kernel may be used for the type triple (T1,T2,T3).
// Visible conditions: BLAS mode and BLAS matrix/matrix multiplication are both enabled,
// and all three types report simdEnabled. Further conditions lie on lines not shown here.
216 template<
typename T1,
typename T2,
typename T3 >
217 struct UseBlasKernel {
218 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
224 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait deciding whether the vectorized default kernel may be used for (T1,T2,T3).
// Visible conditions: useOptimizedKernels is set and all three types report simdEnabled.
// Further conditions lie on lines not shown here.
239 template<
typename T1,
typename T2,
typename T3 >
240 struct UseVectorizedDefaultKernel {
241 enum :
bool { value = useOptimizedKernels &&
243 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Tail of a compile-time flag expression that requires both operand types to be SIMD-enabled
// (the flag's name and remaining terms are on lines not shown here).
301 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable: true only if neither operand needs an intermediate evaluation and both
// operand types are themselves smpAssignable.
306 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
307 !evaluateRight && MT2::smpAssignable };
// Fragment of an element-access routine (presumably operator(); signature not shown).
// Final alternative of a bound computation: falls back to the number of columns of lhs_.
362 :(
lhs_.columns() ) ) );
// Length of the index range [begin, end) used for the element computation.
366 const size_t n(
end - begin );
// Fragment of a bounds-checked access routine (presumably at(); signature not shown).
// Row index is validated against the row count of the left operand ...
386 if( i >=
lhs_.rows() ) {
// ... and the column index against the column count of the right operand.
// The actions taken on failure are on lines not shown here.
389 if( j >=
rhs_.columns() ) {
// rows(): noexcept size accessor; its body is on a line not shown in this excerpt.
401 inline size_t rows() const noexcept {
// Return statement of a following accessor (presumably columns(); its signature is not shown):
// the value is the column count of the right-hand operand.
412 return rhs_.columns();
// canAlias(): reports whether the expression can alias with the given address.
// Returns true if either operand reports isAliased() for `alias`.
// Note that this delegates to the operands' isAliased(), not to a canAlias() of theirs.
442 template<
typename T >
443 inline bool canAlias(
const T* alias )
const noexcept {
444 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// isAliased(): returns true if either the left or the right operand reports that it is
// aliased with the given address.
454 template<
typename T >
455 inline bool isAliased(
const T* alias )
const noexcept {
456 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Return statement of an alignment query (presumably isAligned(); signature not shown):
// the expression counts as aligned only if both operands are aligned.
466 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of a boolean expression (presumably the body of canSMPAssign(); signature and
// some terms are not shown). Visible logic:
//  - one alternative of an OR-group is that BLAS matrix/matrix multiplication is disabled,
//  - another is that rows()*columns() is below TDMATTDMATMULT_THRESHOLD,
//  - and, ANDed with that group, rows()*columns() must reach SMP_TDMATTDMATMULT_THRESHOLD.
477 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
479 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
480 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
// Fragment of the assignment of the product to a target matrix `lhs`
// (presumably a friend assign(); the signature is not shown).
504 template<
typename MT
// Degenerate target (no rows or no columns): handled by an early branch whose body is not shown.
514 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Zero inner dimension: handled by a separate branch whose body is not shown.
517 else if( rhs.lhs_.columns() == 0UL ) {
// General case: dispatch to the kernel selection with operands A and B
// (A and B are set up on lines not shown here).
532 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// selectAssignKernel(): chooses between the small-matrix kernel and the BLAS kernel path
// for C = A * B.
548 template<
typename MT3
551 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Small kernel is chosen when (outside debug mode) A has at most 10*SIMDSIZE rows, or when
// the target has fewer than TDMATTDMATMULT_THRESHOLD elements. A further alternative of
// this condition is on a line not shown here.
554 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
555 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
556 selectSmallAssignKernel( C, A, B );
// Otherwise the BLAS kernel selection is used.
558 selectBlasAssignKernel( C, A, B );
// selectDefaultAssignKernel() (general operands): scalar reference kernel for C = A * B.
// Works column by column on C; for each column j the first contribution (k == kbegin) is
// written with '=' and the remaining k contributions are accumulated with '+='.
// kbegin/kend and the heads of the ibegin/iend declarations are on lines not shown here.
577 template<
typename MT3
581 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
583 const size_t M( A.rows() );
584 const size_t N( B.columns() );
585 const size_t K( A.columns() );
// Outer loop over the columns of C.
589 for(
size_t j=0UL; j<N; ++j )
// Loop over all rows of column j (its body is not shown here).
600 for(
size_t i=0UL; i<M; ++i ) {
// Row range start for the first k: when LOW is set the start is clamped to at least j.
609 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
610 :( LOW ?
max(j,kbegin) : kbegin ) )
611 :( LOW ? j : 0UL ) );
// Row range end for the first k: when UPP is set the end is clamped to at most j+1.
614 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
615 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
616 :( UPP ? j+1UL : M ) );
// Rows before ibegin (loop body not shown).
619 for(
size_t i=0UL; i<ibegin; ++i ) {
// Rows in [ibegin, iend): initialise C(i,j) with the k == kbegin product.
626 for(
size_t i=ibegin; i<iend; ++i ) {
627 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on (loop body not shown).
630 for(
size_t i=iend; i<M; ++i ) {
// Remaining inner-dimension indices: accumulate into the already initialised column.
639 for(
size_t k=kbegin+1UL; k<kend; ++k )
// Row range start for this k; clamped to at least j when SYM, HERM or LOW is set.
643 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
644 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
645 :( SYM || HERM || LOW ? j : 0UL ) );
// Row range end for this k; clamped to at most j+1 when UPP is set.
648 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
649 :( UPP ?
min(j+1UL,k) : k ) )
650 :( UPP ? j+1UL : M ) );
// With a structure flag set the clamped range may be inverted; skip this k in that case.
// (An empty range, ibegin == iend, is deliberately NOT skipped here.)
652 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
655 for(
size_t i=ibegin; i<iend; ++i ) {
656 C(i,j) += A(i,k) * B(k,j);
// Element at row iend is written with '=' rather than '+='; the condition guarding this
// statement is on a line not shown here.
659 C(iend,j) = A(iend,k) * B(k,j);
// Mirror step: fills C(i,j) for i < j from C(j,i), conjugated when HERM is set.
// The condition guarding this step is on a line not shown here.
665 for(
size_t j=1UL; j<N; ++j ) {
666 for(
size_t i=0UL; i<j; ++i ) {
667 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// selectDefaultAssignKernel() overload enabled when A is not diagonal and B is diagonal.
// Each column j of C is column j of A scaled by the diagonal element B(j,j).
// ibegin/iend are computed on lines not shown here.
689 template<
typename MT3
692 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
693 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
697 const size_t M( A.rows() );
698 const size_t N( B.columns() );
700 for(
size_t j=0UL; j<N; ++j )
// Rows before ibegin (loop body not shown).
711 for(
size_t i=0UL; i<ibegin; ++i ) {
// Rows in [ibegin, iend): one multiplication per element.
715 for(
size_t i=ibegin; i<iend; ++i ) {
716 C(i,j) = A(i,j) * B(j,j);
// Rows from iend on (loop body not shown).
719 for(
size_t i=iend; i<M; ++i ) {
// selectDefaultAssignKernel() overload that multiplies using only the diagonal of A:
// C(i,j) = A(i,i) * B(i,j). Presumably the "A diagonal, B not diagonal" case; the
// enabling condition is on a line not shown here. ibegin/iend are also not shown.
742 template<
typename MT3
746 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
750 const size_t M( A.rows() );
751 const size_t N( B.columns() );
753 for(
size_t j=0UL; j<N; ++j )
// Rows before ibegin (loop body not shown).
764 for(
size_t i=0UL; i<ibegin; ++i ) {
// Rows in [ibegin, iend): row i of B scaled by the diagonal element A(i,i).
768 for(
size_t i=ibegin; i<iend; ++i ) {
769 C(i,j) = A(i,i) * B(i,j);
// Rows from iend on (loop body not shown).
772 for(
size_t i=iend; i<M; ++i ) {
// selectDefaultAssignKernel() overload that only touches the diagonal:
// C(i,i) = A(i,i) * B(i,i) for every row of A. Presumably the "both operands diagonal"
// case; the enabling condition is on a line not shown here.
795 template<
typename MT3
799 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
805 for(
size_t i=0UL; i<A.rows(); ++i ) {
806 C(i,i) = A(i,i) * B(i,i);
// selectSmallAssignKernel() fallback overload: simply forwards to the default assign kernel.
// The condition under which this overload is selected is on a line not shown here.
826 template<
typename MT3
830 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
832 selectDefaultAssignKernel( C, A, B );
// Kernel overload that converts one operand into a temporary `tmp` and re-issues the
// product through assign() (presumably a selectSmallAssignKernel overload; the signature,
// the creation of `tmp`, and the first two branch conditions are not shown here).
852 template<
typename MT3
// Functor applied to the re-issued product before it is assigned.
863 const ForwardFunctor fwd;
// Branch using a temporary in place of B.
867 assign( ~C, fwd( A * tmp ) );
// Branch using a temporary in place of A.
871 assign( ~C, fwd( tmp * B ) );
// Size-based choice: if B has no more elements than A, the temporary replaces B ...
873 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
875 assign( ~C, fwd( A * tmp ) );
// ... otherwise it replaces A.
879 assign( ~C, fwd( tmp * B ) );
// Start of the SIMD small-matrix assign kernel (presumably a selectSmallAssignKernel
// overload; its signature is on lines not shown here).
900 template<
typename MT3
908 const size_t M( A.rows() );
909 const size_t N( B.columns() );
910 const size_t K( A.columns() );
// ipos: end of the row range handled with SIMD loads/stores. When `remainder` is set, M is
// masked down with -SIMDSIZE (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two -- TODO confirm); otherwise the full M is used.
914 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Special handling when both LOW and UPP are set and M exceeds 3*SIMDSIZE
// (the action taken is on a line not shown here).
917 if( LOW && UPP && M > SIMDSIZE*3UL ) {
// Row panel of 8*SIMDSIZE rows, one column of C at a time. Only used when none of
// SYM/HERM/LOW/UPP is set. For each column j, eight accumulators collect
// A.load(row chunk, k) * b1 over k and are then stored to C.
// kbegin/kend and b1 are defined on lines not shown here.
926 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
927 for(
size_t j=0UL; j<N; ++j )
940 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
942 for(
size_t k=kbegin; k<kend; ++k ) {
944 xmm1 += A.load(i ,k) * b1;
945 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
946 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
947 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
948 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
949 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
950 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
951 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
// Write the eight accumulated row chunks of column j.
954 (~C).store( i , j, xmm1 );
955 (~C).store( i+SIMDSIZE , j, xmm2 );
956 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
957 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
958 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
959 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
960 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
961 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Row panel of 5*SIMDSIZE rows. Only used when none of SYM/HERM/LOW/UPP is set.
// Columns are processed two at a time, then a single-column tail.
// kbegin/kend, a1 and the accumulation statements of the two-column step are not shown here.
966 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Two columns per step: ten accumulators (five row chunks x two columns).
970 for( ; (j+2UL) <= N; j+=2UL )
983 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
985 for(
size_t k=kbegin; k<kend; ++k ) {
987 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
988 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
989 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
990 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
// Broadcast the two B elements of row k.
991 const SIMDType b1(
set( B(k,j ) ) );
992 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 go to column j, xmm6..xmm10 to column j+1.
1005 (~C).store( i , j , xmm1 );
1006 (~C).store( i+SIMDSIZE , j , xmm2 );
1007 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1008 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1009 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1010 (~C).store( i , j+1UL, xmm6 );
1011 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1012 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1013 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1014 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column tail of this panel (its guard is on a line not shown here).
1026 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1028 for(
size_t k=kbegin; k<kend; ++k ) {
1029 const SIMDType b1(
set( B(k,j) ) );
1030 xmm1 += A.load(i ,k) * b1;
1031 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1032 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1033 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1034 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1037 (~C).store( i , j, xmm1 );
1038 (~C).store( i+SIMDSIZE , j, xmm2 );
1039 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1040 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1041 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Row panel of 4*SIMDSIZE rows. Skipped when both LOW and UPP are set.
// kbegin/kend, a1 and the accumulation statements of the two-column step are not shown here.
1045 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Column range for this panel: limited to the panel's last row index + 1 when SYM, HERM or
// LOW is set, and started at i when UPP is set.
1047 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
1048 size_t j( UPP ? i : 0UL );
// Two columns per step: eight accumulators (four row chunks x two columns).
1050 for( ; (j+2UL) <= jend; j+=2UL )
1063 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1065 for(
size_t k=kbegin; k<kend; ++k ) {
1067 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1068 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1069 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1070 const SIMDType b1(
set( B(k,j ) ) );
1071 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm4 go to column j, xmm5..xmm8 to column j+1.
1082 (~C).store( i , j , xmm1 );
1083 (~C).store( i+SIMDSIZE , j , xmm2 );
1084 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1085 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1086 (~C).store( i , j+1UL, xmm5 );
1087 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1088 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1089 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column tail of this panel (guard and accumulator declarations not shown here).
1103 for(
size_t k=kbegin; k<kend; ++k ) {
1104 const SIMDType b1(
set( B(k,j) ) );
1105 xmm1 += A.load(i ,k) * b1;
1106 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1107 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1108 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1111 (~C).store( i , j, xmm1 );
1112 (~C).store( i+SIMDSIZE , j, xmm2 );
1113 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1114 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Row panel of 3*SIMDSIZE rows (no structure-flag restriction on entering this panel).
// kbegin/kend, a1 and the accumulation statements of the two-column step are not shown here.
1118 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
// Column range: limited when SYM, HERM or LOW is set; started at i when UPP is set.
1120 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
1121 size_t j( UPP ? i : 0UL );
// Two columns per step: six accumulators (three row chunks x two columns).
1123 for( ; (j+2UL) <= jend; j+=2UL )
1136 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1138 for(
size_t k=kbegin; k<kend; ++k ) {
1140 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1141 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1142 const SIMDType b1(
set( B(k,j ) ) );
1143 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm3 go to column j, xmm4..xmm6 to column j+1.
1152 (~C).store( i , j , xmm1 );
1153 (~C).store( i+SIMDSIZE , j , xmm2 );
1154 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1155 (~C).store( i , j+1UL, xmm4 );
1156 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1157 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column tail of this panel (guard and accumulator declarations not shown here).
1171 for(
size_t k=kbegin; k<kend; ++k ) {
1172 const SIMDType b1(
set( B(k,j) ) );
1173 xmm1 += A.load(i ,k) * b1;
1174 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1175 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1178 (~C).store( i , j, xmm1 );
1179 (~C).store( i+SIMDSIZE , j, xmm2 );
1180 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Row panel of 2*SIMDSIZE rows. Columns are processed in steps of 4, then 3, then 2, then a
// single-column tail. kbegin/kend, a1 and most accumulation statements are not shown here.
1184 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range: limited when SYM, HERM or LOW is set; started at i when UPP is set.
1186 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
1187 size_t j( UPP ? i : 0UL );
// Four columns per step: eight accumulators (two row chunks x four columns).
1189 for( ; (j+4UL) <= jend; j+=4UL )
1202 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1204 for(
size_t k=kbegin; k<kend; ++k ) {
1206 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1207 const SIMDType b1(
set( B(k,j ) ) );
1208 const SIMDType b2(
set( B(k,j+1UL) ) );
1209 const SIMDType b3(
set( B(k,j+2UL) ) );
1210 const SIMDType b4(
set( B(k,j+3UL) ) );
// Accumulators are stored pairwise per column (odd = first row chunk, even = second).
1221 (~C).store( i , j , xmm1 );
1222 (~C).store( i+SIMDSIZE, j , xmm2 );
1223 (~C).store( i , j+1UL, xmm3 );
1224 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1225 (~C).store( i , j+2UL, xmm5 );
1226 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1227 (~C).store( i , j+3UL, xmm7 );
1228 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
// Three columns per step.
1231 for( ; (j+3UL) <= jend; j+=3UL )
1244 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1246 for(
size_t k=kbegin; k<kend; ++k ) {
1248 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1249 const SIMDType b1(
set( B(k,j ) ) );
1250 const SIMDType b2(
set( B(k,j+1UL) ) );
1251 const SIMDType b3(
set( B(k,j+2UL) ) );
1260 (~C).store( i , j , xmm1 );
1261 (~C).store( i+SIMDSIZE, j , xmm2 );
1262 (~C).store( i , j+1UL, xmm3 );
1263 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1264 (~C).store( i , j+2UL, xmm5 );
1265 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns per step, with the k loop additionally unrolled by two.
1268 for( ; (j+2UL) <= jend; j+=2UL )
1281 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Unrolled k loop: handles k and k+1 together.
1284 for( ; (k+2UL) <= kend; k+=2UL ) {
1285 const SIMDType a1( A.load(i ,k ) );
1286 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
1287 const SIMDType a3( A.load(i ,k+1UL) );
1288 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
1289 const SIMDType b1(
set( B(k ,j ) ) );
1290 const SIMDType b2(
set( B(k ,j+1UL) ) );
1291 const SIMDType b3(
set( B(k+1UL,j ) ) );
1292 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remainder k loop: at most one leftover index after the unrolled loop.
1303 for( ; k<kend; ++k ) {
1305 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1306 const SIMDType b1(
set( B(k,j ) ) );
1307 const SIMDType b2(
set( B(k,j+1UL) ) );
// The two partial sums per output chunk are combined at store time.
1314 (~C).store( i , j , xmm1+xmm5 );
1315 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
1316 (~C).store( i , j+1UL, xmm3+xmm7 );
1317 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single-column tail, again with the k loop unrolled by two
// (guard and accumulator declarations not shown here).
1332 for( ; (k+2UL) <= kend; k+=2UL ) {
1333 const SIMDType b1(
set( B(k ,j) ) );
1334 const SIMDType b2(
set( B(k+1UL,j) ) );
1335 xmm1 += A.load(i ,k ) * b1;
1336 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
1337 xmm3 += A.load(i ,k+1UL) * b2;
1338 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
1341 for( ; k<kend; ++k ) {
1342 const SIMDType b1(
set( B(k,j) ) );
1343 xmm1 += A.load(i ,k) * b1;
1344 xmm2 += A.load(i+SIMDSIZE,k) * b1;
1347 (~C).store( i , j, xmm1+xmm3 );
1348 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// Row panel of a single SIMDSIZE chunk. Columns are processed in steps of 4, 3, 2 and a
// single-column tail; every k loop is unrolled by two with a one-step remainder loop.
// kbegin/kend and the a1 loads are on lines not shown here.
1352 for( ; i<ipos; i+=SIMDSIZE )
// Column range: limited when SYM, HERM or LOW is set; started at i when UPP is set.
1354 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
1355 size_t j( UPP ? i : 0UL );
// Four columns per step: xmm1..4 collect the k terms, xmm5..8 the k+1 terms.
1357 for( ; (j+4UL) <= jend; j+=4UL )
1368 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1371 for( ; (k+2UL) <= kend; k+=2UL ) {
1373 const SIMDType a2( A.load(i,k+1UL) );
1374 xmm1 += a1 *
set( B(k ,j ) );
1375 xmm2 += a1 *
set( B(k ,j+1UL) );
1376 xmm3 += a1 *
set( B(k ,j+2UL) );
1377 xmm4 += a1 *
set( B(k ,j+3UL) );
1378 xmm5 += a2 *
set( B(k+1UL,j ) );
1379 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
1380 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
1381 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Remainder k loop.
1384 for( ; k<kend; ++k ) {
1386 xmm1 += a1 *
set( B(k,j ) );
1387 xmm2 += a1 *
set( B(k,j+1UL) );
1388 xmm3 += a1 *
set( B(k,j+2UL) );
1389 xmm4 += a1 *
set( B(k,j+3UL) );
// Combine the two partial sums per column when storing.
1392 (~C).store( i, j , xmm1+xmm5 );
1393 (~C).store( i, j+1UL, xmm2+xmm6 );
1394 (~C).store( i, j+2UL, xmm3+xmm7 );
1395 (~C).store( i, j+3UL, xmm4+xmm8 );
// Three columns per step: xmm1..3 collect the k terms, xmm4..6 the k+1 terms.
1398 for( ; (j+3UL) <= jend; j+=3UL )
1409 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1412 for( ; (k+2UL) <= kend; k+=2UL ) {
1414 const SIMDType a2( A.load(i,k+1UL) );
1415 xmm1 += a1 *
set( B(k ,j ) );
1416 xmm2 += a1 *
set( B(k ,j+1UL) );
1417 xmm3 += a1 *
set( B(k ,j+2UL) );
1418 xmm4 += a2 *
set( B(k+1UL,j ) );
1419 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
1420 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
1423 for( ; k<kend; ++k ) {
1425 xmm1 += a1 *
set( B(k,j ) );
1426 xmm2 += a1 *
set( B(k,j+1UL) );
1427 xmm3 += a1 *
set( B(k,j+2UL) );
1430 (~C).store( i, j , xmm1+xmm4 );
1431 (~C).store( i, j+1UL, xmm2+xmm5 );
1432 (~C).store( i, j+2UL, xmm3+xmm6 );
// Two columns per step: xmm1..2 collect the k terms, xmm3..4 the k+1 terms.
1435 for( ; (j+2UL) <= jend; j+=2UL )
1449 for( ; (k+2UL) <= kend; k+=2UL ) {
1451 const SIMDType a2( A.load(i,k+1UL) );
1452 xmm1 += a1 *
set( B(k ,j ) );
1453 xmm2 += a1 *
set( B(k ,j+1UL) );
1454 xmm3 += a2 *
set( B(k+1UL,j ) );
1455 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
1458 for( ; k<kend; ++k ) {
1460 xmm1 += a1 *
set( B(k,j ) );
1461 xmm2 += a1 *
set( B(k,j+1UL) );
1464 (~C).store( i, j , xmm1+xmm3 );
1465 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single-column tail: here the k loop runs up to K (not a per-column kend).
1479 for( ; (k+2UL) <= K; k+=2UL ) {
1480 xmm1 += A.load(i,k ) *
set( B(k ,j) );
1481 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
// Leftover k term (its loop header is on a line not shown here).
1485 xmm1 += A.load(i,k) *
set( B(k,j) );
1488 (~C).store( i, j, xmm1+xmm2 );
// Scalar handling of the rows left over after the SIMD panels; only runs when `remainder`
// is set. kbegin/kend and the declarations of value1/value2/value are not shown here.
1492 for( ; remainder && i<M; ++i )
// Columns start at i only when both LOW and UPP are set, otherwise at 0.
1494 size_t j( LOW && UPP ? i : 0UL );
// Two columns per step: two scalar dot products sharing the same A(i,k).
1496 for( ; (j+2UL) <= N; j+=2UL )
1510 for(
size_t k=kbegin; k<kend; ++k ) {
1511 value1 += A(i,k) * B(k,j );
1512 value2 += A(i,k) * B(k,j+1UL);
1515 (~C)(i,j ) = value1;
1516 (~C)(i,j+1UL) = value2;
// Single-column tail: the k loop runs up to K.
1529 for(
size_t k=kbegin; k<K; ++k ) {
1530 value += A(i,k) * B(k,j);
// Post-processing of the elements the SIMD panels did not compute. Applies only when M
// exceeds 4*SIMDSIZE; the visited index range is everything strictly outside the
// 4*SIMDSIZE-aligned block containing the diagonal.
// SYM/HERM: mirror C(j,i) into C(i,j), conjugated for HERM.
1538 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1539 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
// iend: j rounded down to a multiple of 4*SIMDSIZE.
1540 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1541 for(
size_t i=0UL; i<iend; ++i ) {
1542 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
// LOW only: visits the elements above those blocks (the per-element action is not shown here).
1546 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1547 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1548 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1549 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: visits the elements below those blocks (the per-element action is not shown here).
1554 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
1555 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
// jend: i rounded down to a multiple of 4*SIMDSIZE.
1556 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1557 for(
size_t j=0UL; j<jend; ++j ) {
// selectLargeAssignKernel() fallback overload: forwards to the default assign kernel.
// The condition under which this overload is selected is on a line not shown here.
1580 template<
typename MT3
1584 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1586 selectDefaultAssignKernel( C, A, B );
// Second selectLargeAssignKernel() overload; its enabling condition and body are on lines
// not shown in this excerpt.
1606 template<
typename MT3
1610 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// selectBlasAssignKernel() fallback overload: forwards to the large-matrix assign kernel.
// The condition under which this overload is selected is on a line not shown here.
1640 template<
typename MT3
1644 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1646 selectLargeAssignKernel( C, A, B );
// BLAS-backed selectBlasAssignKernel() overload; only compiled when BLAS mode and BLAS
// matrix/matrix multiplication are both enabled.
1652 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1666 template<
typename MT3
1670 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// gemm call with scalar arguments ET(1) and ET(0) (presumably alpha = 1, beta = 0, i.e. C is
// overwritten with A*B -- confirm against the gemm wrapper). Other branches are not shown here.
1683 gemm( C, A, B, ET(1), ET(0) );
// Assignment overload that first evaluates the whole product serially into a temporary of
// type TmpType and then assigns the (forwarded) temporary to the target.
// The signature and the definition of TmpType are on lines not shown here.
1703 template<
typename MT
1722 const ForwardFunctor fwd;
1724 const TmpType tmp(
serial( rhs ) );
1725 assign( ~lhs, fwd( tmp ) );
// Assignment overload that restructures the product by transposing operands before
// assigning. Three variants are visible; the conditions selecting between them are on lines
// not shown here (presumably based on which operands are symmetric -- TODO confirm).
1745 template<
typename MT >
1756 const ForwardFunctor fwd;
// Both operands transposed.
1759 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand transposed.
1761 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Only the right operand transposed.
1763 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Fragment of the addition assignment of the product to a target matrix `lhs`
// (presumably a friend addAssign(); the signature is not shown).
1781 template<
typename MT
// Nothing to add if the target is empty or the inner dimension is zero
// (the branch body is on a line not shown here).
1791 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// General case: dispatch to the add-assign kernel selection.
1805 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// selectAddAssignKernel(): chooses between the small-matrix kernel and the BLAS kernel path
// for C += A * B.
1821 template<
typename MT3
1824 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Small kernel is chosen when (outside debug mode) A has at most 10*SIMDSIZE rows, or when
// the target has fewer than TDMATTDMATMULT_THRESHOLD elements. A further alternative of
// this condition is on a line not shown here.
1827 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1828 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1829 selectSmallAddAssignKernel( C, A, B );
// Otherwise the BLAS kernel selection is used.
1831 selectBlasAddAssignKernel( C, A, B );
// selectDefaultAddAssignKernel() (general operands): scalar reference kernel for C += A * B.
// For each column j and inner index k, a row range [ibegin, iend) is updated, two rows per
// iteration. kbegin/kend and the heads of the ibegin/iend declarations are not shown here.
1850 template<
typename MT3
1854 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1856 const size_t M( A.rows() );
1857 const size_t N( B.columns() );
1858 const size_t K( A.columns() );
1862 for(
size_t j=0UL; j<N; ++j )
1872 for(
size_t k=kbegin; k<kend; ++k )
// Row range start; clamped to at least j when LOW is set.
1876 ?( LOW ?
max(j,k+1UL) : k+1UL )
1877 :( LOW ?
max(j,k) : k ) )
1878 :( LOW ? j : 0UL ) );
// Row range end; clamped to at most j+1 when UPP is set.
1881 ?( UPP ?
min(j+1UL,k) : k )
1882 :( UPP ?
min(j,k)+1UL : k+1UL ) )
1883 :( UPP ? j+1UL : M ) );
// With LOW or UPP the clamped range may be empty or inverted; skip this k then.
1885 if( ( LOW || UPP ) && ibegin >= iend )
continue;
// ipos: ibegin plus the largest even number not exceeding the range length, i.e. the end of
// the part that can be processed two rows at a time.
1888 const size_t inum( iend - ibegin );
1889 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1891 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1892 C(i ,j) += A(i ,k) * B(k,j);
1893 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Odd leftover row (its guard is on a line not shown here).
1896 C(ipos,j) += A(ipos,k) * B(k,j);
// selectDefaultAddAssignKernel() overload enabled when A is not diagonal and B is diagonal.
// Adds column j of A scaled by B(j,j) to column j of C, two rows per iteration.
// ibegin/iend are computed on lines not shown here.
1918 template<
typename MT3
1921 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
1922 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1926 const size_t M( A.rows() );
1927 const size_t N( B.columns() );
1929 for(
size_t j=0UL; j<N; ++j )
// ipos: end of the part of [ibegin, iend) that can be processed in pairs.
1939 const size_t inum( iend - ibegin );
1940 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1942 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1943 C(i ,j) += A(i ,j) * B(j,j);
1944 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Odd leftover row (its guard is on a line not shown here).
1947 C(ipos,j) += A(ipos,j) * B(j,j);
// selectDefaultAddAssignKernel() overload that uses only the diagonal of A:
// C(i,j) += A(i,i) * B(i,j), two rows per iteration. Presumably the "A diagonal, B not
// diagonal" case; the enabling condition and ibegin/iend are on lines not shown here.
1968 template<
typename MT3
1972 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1976 const size_t M( A.rows() );
1977 const size_t N( B.columns() );
1979 for(
size_t j=0UL; j<N; ++j )
// ipos: end of the part of [ibegin, iend) that can be processed in pairs.
1989 const size_t inum( iend - ibegin );
1990 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1992 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1993 C(i ,j) += A(i ,i ) * B(i ,j);
1994 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Odd leftover row (its guard is on a line not shown here).
1997 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// selectDefaultAddAssignKernel() overload that only touches the diagonal:
// C(i,i) += A(i,i) * B(i,i) for every row of A. Presumably the "both operands diagonal"
// case; the enabling condition is on a line not shown here.
2018 template<
typename MT3
2022 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2026 for(
size_t i=0UL; i<A.rows(); ++i ) {
2027 C(i,i) += A(i,i) * B(i,i);
// selectSmallAddAssignKernel() fallback overload: forwards to the default add-assign kernel.
// The condition under which this overload is selected is on a line not shown here.
2047 template<
typename MT3
2051 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2053 selectDefaultAddAssignKernel( C, A, B );
// Kernel overload that converts one operand into a temporary `tmp` and re-issues the
// product through addAssign() (presumably a selectSmallAddAssignKernel overload; the
// signature, the creation of `tmp`, and the first two branch conditions are not shown here).
2073 template<
typename MT3
// Functor applied to the re-issued product before it is add-assigned.
2084 const ForwardFunctor fwd;
// Branch using a temporary in place of B.
2088 addAssign( ~C, fwd( A * tmp ) );
// Branch using a temporary in place of A.
2092 addAssign( ~C, fwd( tmp * B ) );
// Size-based choice: if B has no more elements than A, the temporary replaces B ...
2094 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2096 addAssign( ~C, fwd( A * tmp ) );
// ... otherwise it replaces A.
2100 addAssign( ~C, fwd( tmp * B ) );
// Start of the SIMD small-matrix add-assign kernel (presumably a selectSmallAddAssignKernel
// overload; its signature is on lines not shown here).
2121 template<
typename MT3
2129 const size_t M( A.rows() );
2130 const size_t N( B.columns() );
2131 const size_t K( A.columns() );
// ipos: end of the row range handled with SIMD loads/stores. When `remainder` is set, M is
// masked down with -SIMDSIZE (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two -- TODO confirm); otherwise the full M is used.
2135 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Row panel of 8*SIMDSIZE rows, one column of C at a time. Only used when neither LOW nor
// UPP is set. The accumulators start from the current contents of C, so storing them back
// realises C += A*B. The xmm1 load and kbegin/kend are on lines not shown here.
2142 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2143 for(
size_t j=0UL; j<N; ++j )
2157 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2158 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2159 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2160 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2161 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
2162 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
2163 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
2165 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast B(k,j) once and reuse it for all eight row chunks.
2166 const SIMDType b1(
set( B(k,j) ) );
2167 xmm1 += A.load(i ,k) * b1;
2168 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2169 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2170 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2171 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2172 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2173 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2174 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2177 (~C).store( i , j, xmm1 );
2178 (~C).store( i+SIMDSIZE , j, xmm2 );
2179 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2180 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2181 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2182 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
2183 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
2184 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Row panel of 5*SIMDSIZE rows. Only used when neither LOW nor UPP is set.
// Columns are processed two at a time, then a single-column tail. Accumulators are seeded
// from C. kbegin/kend, a1 and the accumulation statements of the two-column step are not
// shown here.
2189 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2193 for( ; (j+2UL) <= N; j+=2UL )
// xmm1..xmm5 hold column j, xmm6..xmm10 hold column j+1.
2206 SIMDType xmm1 ( (~C).load(i ,j ) );
2207 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
2208 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
2209 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
2210 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
2211 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
2212 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
2213 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2214 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2215 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
2217 for(
size_t k=kbegin; k<kend; ++k ) {
2219 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2220 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2221 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2222 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2223 const SIMDType b1(
set( B(k,j ) ) );
2224 const SIMDType b2(
set( B(k,j+1UL) ) );
2237 (~C).store( i , j , xmm1 );
2238 (~C).store( i+SIMDSIZE , j , xmm2 );
2239 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2240 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2241 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
2242 (~C).store( i , j+1UL, xmm6 );
2243 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
2244 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2245 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2246 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column tail of this panel (its guard and the xmm1 load are not shown here).
2259 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2260 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2261 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2262 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2264 for(
size_t k=kbegin; k<kend; ++k ) {
2265 const SIMDType b1(
set( B(k,j) ) );
2266 xmm1 += A.load(i ,k) * b1;
2267 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2268 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2269 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2270 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2273 (~C).store( i , j, xmm1 );
2274 (~C).store( i+SIMDSIZE , j, xmm2 );
2275 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2276 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2277 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
// Row panel of 4*SIMDSIZE rows. Only used when neither LOW nor UPP is set, so all N columns
// are visited. Accumulators are seeded from C. kbegin/kend, a1, the xmm1 loads and the
// accumulation statements of the two-column step are not shown here.
2281 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2285 for( ; (j+2UL) <= N; j+=2UL )
// xmm1..xmm4 hold column j, xmm5..xmm8 hold column j+1.
2299 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2300 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2301 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2302 SIMDType xmm5( (~C).load(i ,j+1UL) );
2303 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2304 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2305 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2307 for(
size_t k=kbegin; k<kend; ++k ) {
2309 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2310 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2311 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2312 const SIMDType b1(
set( B(k,j ) ) );
2313 const SIMDType b2(
set( B(k,j+1UL) ) );
2324 (~C).store( i , j , xmm1 );
2325 (~C).store( i+SIMDSIZE , j , xmm2 );
2326 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2327 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2328 (~C).store( i , j+1UL, xmm5 );
2329 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2330 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2331 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column tail of this panel (its guard and the xmm1 load are not shown here).
2344 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2345 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2346 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2348 for(
size_t k=kbegin; k<kend; ++k ) {
2349 const SIMDType b1(
set( B(k,j) ) );
2350 xmm1 += A.load(i ,k) * b1;
2351 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2352 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2353 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2356 (~C).store( i , j, xmm1 );
2357 (~C).store( i+SIMDSIZE , j, xmm2 );
2358 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2359 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Row panel of 3*SIMDSIZE rows. Only used when neither LOW nor UPP is set, so all N columns
// are visited. Accumulators are seeded from C. kbegin/kend, a1, the xmm1 loads and the
// accumulation statements of the two-column step are not shown here.
2363 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2367 for( ; (j+2UL) <= N; j+=2UL )
// xmm1..xmm3 hold column j, xmm4..xmm6 hold column j+1.
2381 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2382 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2383 SIMDType xmm4( (~C).load(i ,j+1UL) );
2384 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
2385 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2387 for(
size_t k=kbegin; k<kend; ++k ) {
2389 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2390 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2391 const SIMDType b1(
set( B(k,j ) ) );
2392 const SIMDType b2(
set( B(k,j+1UL) ) );
2401 (~C).store( i , j , xmm1 );
2402 (~C).store( i+SIMDSIZE , j , xmm2 );
2403 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2404 (~C).store( i , j+1UL, xmm4 );
2405 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
2406 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column tail of this panel (its guard and the xmm1 load are not shown here).
2419 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2420 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2422 for(
size_t k=kbegin; k<kend; ++k ) {
2423 const SIMDType b1(
set( B(k,j) ) );
2424 xmm1 += A.load(i ,k) * b1;
2425 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2426 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2429 (~C).store( i , j, xmm1 );
2430 (~C).store( i+SIMDSIZE , j, xmm2 );
2431 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
// Row panel of 2*SIMDSIZE rows of the SIMD add-assign kernel (C += A*B).
// Skipped when both LOW and UPP are set. Columns are processed in steps of 4, then 3,
// then 2, then a single-column tail. Accumulators are seeded from the current contents of
// C, so storing them back realises the "+=". kbegin/kend, a1, the xmm1 loads and most
// accumulation statements are on lines not shown in this excerpt.
2435 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range: limited to the panel's last row index + 1 when LOW is set, and started at
// i when UPP is set.
2437 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
2438 size_t j( UPP ? i : 0UL );
// Four columns per step: eight accumulators (two row chunks x four columns).
2440 for( ; (j+4UL) <= jend; j+=4UL )
2454 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2455 SIMDType xmm3( (~C).load(i ,j+1UL) );
2456 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2457 SIMDType xmm5( (~C).load(i ,j+2UL) );
2458 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2459 SIMDType xmm7( (~C).load(i ,j+3UL) );
2460 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
2462 for(
size_t k=kbegin; k<kend; ++k ) {
2464 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2465 const SIMDType b1(
set( B(k,j ) ) );
2466 const SIMDType b2(
set( B(k,j+1UL) ) );
2467 const SIMDType b3(
set( B(k,j+2UL) ) );
2468 const SIMDType b4(
set( B(k,j+3UL) ) );
2479 (~C).store( i , j , xmm1 );
2480 (~C).store( i+SIMDSIZE, j , xmm2 );
2481 (~C).store( i , j+1UL, xmm3 );
2482 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2483 (~C).store( i , j+2UL, xmm5 );
2484 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2485 (~C).store( i , j+3UL, xmm7 );
2486 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
// Three columns per step.
2489 for( ; (j+3UL) <= jend; j+=3UL )
2503 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2504 SIMDType xmm3( (~C).load(i ,j+1UL) );
2505 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2506 SIMDType xmm5( (~C).load(i ,j+2UL) );
2507 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2509 for(
size_t k=kbegin; k<kend; ++k ) {
2511 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2512 const SIMDType b1(
set( B(k,j ) ) );
2513 const SIMDType b2(
set( B(k,j+1UL) ) );
2514 const SIMDType b3(
set( B(k,j+2UL) ) );
2523 (~C).store( i , j , xmm1 );
2524 (~C).store( i+SIMDSIZE, j , xmm2 );
2525 (~C).store( i , j+1UL, xmm3 );
2526 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2527 (~C).store( i , j+2UL, xmm5 );
2528 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
// Two columns per step, with the k loop additionally unrolled by two. xmm1..xmm4 are seeded
// from C; xmm5..xmm8 (declared on a line not shown here) hold the second partial sums.
2531 for( ; (j+2UL) <= jend; j+=2UL )
2545 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2546 SIMDType xmm3( (~C).load(i ,j+1UL) );
2547 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
// Unrolled k loop handling k and k+1 together. The bound is inclusive ("<=") so that an
// even-length k range is consumed entirely here, exactly like the other unrolled-by-two
// k loops of this kernel; the scalar loop below then only sees at most one leftover index.
2551 for( ; (k+2UL) <= kend; k+=2UL ) {
2552 const SIMDType a1( A.load(i ,k ) );
2553 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2554 const SIMDType a3( A.load(i ,k+1UL) );
2555 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2556 const SIMDType b1(
set( B(k ,j ) ) );
2557 const SIMDType b2(
set( B(k ,j+1UL) ) );
2558 const SIMDType b3(
set( B(k+1UL,j ) ) );
2559 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remainder k loop.
2570 for( ; k<kend; ++k ) {
2572 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2573 const SIMDType b1(
set( B(k,j ) ) );
2574 const SIMDType b2(
set( B(k,j+1UL) ) );
// The two partial sums per output chunk are combined at store time.
2581 (~C).store( i , j , xmm1+xmm5 );
2582 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2583 (~C).store( i , j+1UL, xmm3+xmm7 );
2584 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single-column tail, k loop unrolled by two (its guard, the xmm1 load and the
// declarations of xmm3/xmm4 are not shown here).
2597 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2601 for( ; (k+2UL) <= kend; k+=2UL ) {
2602 const SIMDType b1(
set( B(k ,j) ) );
2603 const SIMDType b2(
set( B(k+1UL,j) ) );
2604 xmm1 += A.load(i ,k ) * b1;
2605 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2606 xmm3 += A.load(i ,k+1UL) * b2;
2607 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2610 for( ; k<kend; ++k ) {
2611 const SIMDType b1(
set( B(k,j) ) );
2612 xmm1 += A.load(i ,k) * b1;
2613 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2616 (~C).store( i , j, xmm1+xmm3 );
2617 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
// Row panel of a single SIMDSIZE chunk of the SIMD add-assign kernel. Columns are processed
// in steps of 4, 3, 2 and a single-column tail; every k loop is unrolled by two with a
// one-step remainder loop. Accumulators seeded from C carry the "+=". kbegin/kend, the a1
// loads, the xmm1 loads and the declarations of the second partial sums are not shown here.
2621 for( ; i<ipos; i+=SIMDSIZE )
// Column range: limited to i+SIMDSIZE only when BOTH LOW and UPP are set; started at i when
// UPP is set.
2623 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
2624 size_t j( UPP ? i : 0UL );
// Four columns per step: xmm1..4 collect the k terms, xmm5..8 the k+1 terms.
2626 for( ; (j+4UL) <= jend; j+=4UL )
2638 SIMDType xmm2( (~C).load(i,j+1UL) );
2639 SIMDType xmm3( (~C).load(i,j+2UL) );
2640 SIMDType xmm4( (~C).load(i,j+3UL) );
2644 for( ; (k+2UL) <= kend; k+=2UL ) {
2646 const SIMDType a2( A.load(i,k+1UL) );
2647 xmm1 += a1 *
set( B(k ,j ) );
2648 xmm2 += a1 *
set( B(k ,j+1UL) );
2649 xmm3 += a1 *
set( B(k ,j+2UL) );
2650 xmm4 += a1 *
set( B(k ,j+3UL) );
2651 xmm5 += a2 *
set( B(k+1UL,j ) );
2652 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2653 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2654 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Remainder k loop.
2657 for( ; k<kend; ++k ) {
2659 xmm1 += a1 *
set( B(k,j ) );
2660 xmm2 += a1 *
set( B(k,j+1UL) );
2661 xmm3 += a1 *
set( B(k,j+2UL) );
2662 xmm4 += a1 *
set( B(k,j+3UL) );
// Combine the two partial sums per column when storing.
2665 (~C).store( i, j , xmm1+xmm5 );
2666 (~C).store( i, j+1UL, xmm2+xmm6 );
2667 (~C).store( i, j+2UL, xmm3+xmm7 );
2668 (~C).store( i, j+3UL, xmm4+xmm8 );
// Three columns per step: xmm1..3 collect the k terms, xmm4..6 the k+1 terms.
2671 for( ; (j+3UL) <= jend; j+=3UL )
2683 SIMDType xmm2( (~C).load(i,j+1UL) );
2684 SIMDType xmm3( (~C).load(i,j+2UL) );
2688 for( ; (k+2UL) <= kend; k+=2UL ) {
2690 const SIMDType a2( A.load(i,k+1UL) );
2691 xmm1 += a1 *
set( B(k ,j ) );
2692 xmm2 += a1 *
set( B(k ,j+1UL) );
2693 xmm3 += a1 *
set( B(k ,j+2UL) );
2694 xmm4 += a2 *
set( B(k+1UL,j ) );
2695 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2696 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2699 for( ; k<kend; ++k ) {
2701 xmm1 += a1 *
set( B(k,j ) );
2702 xmm2 += a1 *
set( B(k,j+1UL) );
2703 xmm3 += a1 *
set( B(k,j+2UL) );
2706 (~C).store( i, j , xmm1+xmm4 );
2707 (~C).store( i, j+1UL, xmm2+xmm5 );
2708 (~C).store( i, j+2UL, xmm3+xmm6 );
// Two columns per step: xmm1..2 collect the k terms, xmm3..4 the k+1 terms.
2711 for( ; (j+2UL) <= jend; j+=2UL )
2723 SIMDType xmm2( (~C).load(i,j+1UL) );
2727 for( ; (k+2UL) <= kend; k+=2UL ) {
2729 const SIMDType a2( A.load(i,k+1UL) );
2730 xmm1 += a1 *
set( B(k ,j ) );
2731 xmm2 += a1 *
set( B(k ,j+1UL) );
2732 xmm3 += a2 *
set( B(k+1UL,j ) );
2733 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2736 for( ; k<kend; ++k ) {
2738 xmm1 += a1 *
set( B(k,j ) );
2739 xmm2 += a1 *
set( B(k,j+1UL) );
2742 (~C).store( i, j , xmm1+xmm3 );
2743 (~C).store( i, j+1UL, xmm2+xmm4 );
// Single-column tail: here the k loop runs up to K (not a per-column kend).
2758 for( ; (k+2UL) <= K; k+=2UL ) {
2759 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2760 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
// Leftover k term (its loop header is on a line not shown here).
2764 xmm1 += A.load(i,k) *
set( B(k,j) );
2767 (~C).store( i, j, xmm1+xmm2 );
// Scalar handling of the rows left over after the SIMD panels of the add-assign kernel;
// only runs when `remainder` is set. kbegin/kend and the declarations of
// value1/value2/value are not shown here.
2771 for( ; remainder && i<M; ++i )
// Column range: ends at i+1 when LOW is set; starts at i when UPP is set.
2773 const size_t jend( LOW ? i+1UL : N );
2774 size_t j( UPP ? i : 0UL );
// Two columns per step: two scalar dot products sharing the same A(i,k).
2776 for( ; (j+2UL) <= jend; j+=2UL )
2790 for(
size_t k=kbegin; k<kend; ++k ) {
2791 value1 += A(i,k) * B(k,j );
2792 value2 += A(i,k) * B(k,j+1UL);
// Plain '=' is used when writing back; for this to realise "+=", value1/value2 must have
// been initialised from C on the lines not shown here -- TODO confirm.
2795 (~C)(i,j ) = value1;
2796 (~C)(i,j+1UL) = value2;
// Single-column tail: the k loop runs up to K.
2809 for(
size_t k=kbegin; k<K; ++k ) {
2810 value += A(i,k) * B(k,j);
// selectLargeAddAssignKernel() fallback overload: forwards to the default add-assign kernel.
// The condition under which this overload is selected is on a line not shown here.
2834 template<
typename MT3
2838 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2840 selectDefaultAddAssignKernel( C, A, B );
2860 template<
typename MT3
2864 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS-stage kernel for the addition assignment C += A * B (forwarding overload).
// Used when the BLAS path is not taken (selection condition not visible here);
// it simply delegates to selectLargeAddAssignKernel().
2890 template<
typename MT3
2894 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2896 selectLargeAddAssignKernel( C, A, B );
// BLAS-backed kernel for the addition assignment C += A * B. Only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set.
// Two branches build a temporary `tmp` (construction not visible in this excerpt)
// and add it to C; the remaining branch calls gemm() directly.
2902 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2916 template<
typename MT3
2920 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2927 addAssign( C, tmp );
2932 addAssign( C, tmp );
// NOTE(review): the two ET(1) arguments are presumably the BLAS alpha and beta
// factors, i.e. C = 1*A*B + 1*C -- confirm against the gemm() wrapper signature.
2935 gemm( C, A, B, ET(1), ET(1) );
2957 template<
typename MT >
2968 const ForwardFunctor fwd;
2971 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
2973 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
2975 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
2997 template<
typename MT
3007 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3021 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the subtraction assignment C -= A * B.
// The visible part of the size test picks the small kernel when, outside debug
// mode, A has at most SIMDSIZE*10 rows, or when the target holds fewer than
// TDMATTDMATMULT_THRESHOLD elements; otherwise the BLAS-stage kernel is used.
// The first clause of the condition lies outside this excerpt.
3037 template<
typename MT3
3040 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3043 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3044 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3045 selectSmallSubAssignKernel( C, A, B );
3047 selectBlasSubAssignKernel( C, A, B );
// Default (non-vectorized) kernel for the subtraction assignment C -= A * B.
// Loop order is j (columns of C) -> k (inner dimension) -> i (rows of C).
// The row range [ibegin,iend) is narrowed per (j,k) pair depending on the
// LOW/UPP flags and on structural properties of A whose tests are not visible
// in this excerpt.
3066 template<
typename MT3
3070 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3072 const size_t M( A.rows() );
3073 const size_t N( B.columns() );
3074 const size_t K( A.columns() );
3078 for(
size_t j=0UL; j<N; ++j )
3088 for(
size_t k=kbegin; k<kend; ++k )
3092 ?( LOW ?
max(j,k+1UL) : k+1UL )
3093 :( LOW ?
max(j,k) : k ) )
3094 :( LOW ? j : 0UL ) );
3097 ?( UPP ?
min(j+1UL,k) : k )
3098 :( UPP ?
min(j,k)+1UL : k+1UL ) )
3099 :( UPP ? j+1UL : M ) );
// For triangular targets the computed row range may be empty; skip this k then.
3101 if( ( LOW || UPP ) && ( ibegin >= iend ) )
continue;
// Rows are processed in pairs: ipos is ibegin plus the row count rounded down
// to an even number (inum & size_t(-2) clears the lowest bit).
3104 const size_t inum( iend - ibegin );
3105 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3107 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3108 C(i ,j) -= A(i ,k) * B(k,j);
3109 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Trailing odd row; presumably guarded by an `ipos < iend` check that is not
// visible in this excerpt.
3112 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default kernel for C -= A * B when the right operand is diagonal and the left
// operand is not (see the EnableIf_ condition). Only the diagonal element B(j,j)
// is read, so column j of C is updated by column j of A scaled by B(j,j).
3134 template<
typename MT3
3137 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
3138 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3142 const size_t M( A.rows() );
3143 const size_t N( B.columns() );
3145 for(
size_t j=0UL; j<N; ++j )
// Rows are processed in pairs up to ipos (row count rounded down to even).
3155 const size_t inum( iend - ibegin );
3156 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3158 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3159 C(i ,j) -= A(i ,j) * B(j,j);
3160 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Trailing odd row; the guarding condition is not visible in this excerpt.
3163 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default kernel for C -= A * B that reads only the diagonal of A: row i of C is
// updated by row i of B scaled by A(i,i).
// NOTE(review): the enabling condition is not visible in this excerpt; the access
// pattern suggests this is the overload for a diagonal left operand -- confirm.
3184 template<
typename MT3
3188 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3192 const size_t M( A.rows() );
3193 const size_t N( B.columns() );
3195 for(
size_t j=0UL; j<N; ++j )
// Rows are processed in pairs up to ipos (row count rounded down to even).
3205 const size_t inum( iend - ibegin );
3206 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3208 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3209 C(i ,j) -= A(i ,i ) * B(i ,j);
3210 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Trailing odd row; the guarding condition is not visible in this excerpt.
3213 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default kernel for C -= A * B that touches diagonal elements only: for every
// row index i of A, C(i,i) is reduced by A(i,i) * B(i,i).
// NOTE(review): presumably the overload for two diagonal operands; the enabling
// condition is not visible in this excerpt -- confirm.
3234 template<
typename MT3
3238 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3242 for(
size_t i=0UL; i<A.rows(); ++i ) {
3243 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix kernel for the subtraction assignment C -= A * B (forwarding
// overload): delegates unchanged to selectDefaultSubAssignKernel(). The
// condition selecting this overload is not visible in this excerpt.
3263 template<
typename MT3
3267 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3269 selectDefaultSubAssignKernel( C, A, B );
3289 template<
typename MT3
3300 const ForwardFunctor fwd;
3304 subAssign( ~C, fwd( A * tmp ) );
3308 subAssign( ~C, fwd( tmp * B ) );
3310 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3312 subAssign( ~C, fwd( A * tmp ) );
3316 subAssign( ~C, fwd( tmp * B ) );
3337 template<
typename MT3
3345 const size_t M( A.rows() );
3346 const size_t N( B.columns() );
3347 const size_t K( A.columns() );
3351 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
3358 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3359 for(
size_t j=0UL; j<N; ++j )
3373 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3374 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3375 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3376 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3377 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3378 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3379 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3381 for(
size_t k=kbegin; k<kend; ++k ) {
3382 const SIMDType b1(
set( B(k,j) ) );
3383 xmm1 -= A.load(i ,k) * b1;
3384 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3385 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3386 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3387 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3388 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3389 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3390 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
3393 (~C).store( i , j, xmm1 );
3394 (~C).store( i+SIMDSIZE , j, xmm2 );
3395 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3396 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3397 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3398 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3399 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3400 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3405 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3409 for( ; (j+2UL) <= N; j+=2UL )
3422 SIMDType xmm1 ( (~C).load(i ,j ) );
3423 SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3424 SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3425 SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3426 SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3427 SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3428 SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3429 SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3430 SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3431 SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3433 for(
size_t k=kbegin; k<kend; ++k ) {
3435 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3436 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3437 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3438 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3439 const SIMDType b1(
set( B(k,j ) ) );
3440 const SIMDType b2(
set( B(k,j+1UL) ) );
3453 (~C).store( i , j , xmm1 );
3454 (~C).store( i+SIMDSIZE , j , xmm2 );
3455 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3456 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3457 (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3458 (~C).store( i , j+1UL, xmm6 );
3459 (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3460 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3461 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3462 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3475 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3476 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3477 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3478 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3480 for(
size_t k=kbegin; k<kend; ++k ) {
3481 const SIMDType b1(
set( B(k,j) ) );
3482 xmm1 -= A.load(i ,k) * b1;
3483 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3484 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3485 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3486 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3489 (~C).store( i , j, xmm1 );
3490 (~C).store( i+SIMDSIZE , j, xmm2 );
3491 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3492 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3493 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3497 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3501 for( ; (j+2UL) <= N; j+=2UL )
3515 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3516 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3517 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3518 SIMDType xmm5( (~C).load(i ,j+1UL) );
3519 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3520 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3521 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3523 for(
size_t k=kbegin; k<kend; ++k ) {
3525 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3526 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3527 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3528 const SIMDType b1(
set( B(k,j ) ) );
3529 const SIMDType b2(
set( B(k,j+1UL) ) );
3540 (~C).store( i , j , xmm1 );
3541 (~C).store( i+SIMDSIZE , j , xmm2 );
3542 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3543 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3544 (~C).store( i , j+1UL, xmm5 );
3545 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3546 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3547 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3560 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3561 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3562 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3564 for(
size_t k=kbegin; k<kend; ++k ) {
3565 const SIMDType b1(
set( B(k,j) ) );
3566 xmm1 -= A.load(i ,k) * b1;
3567 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3568 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3569 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3572 (~C).store( i , j, xmm1 );
3573 (~C).store( i+SIMDSIZE , j, xmm2 );
3574 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3575 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3579 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3583 for( ; (j+2UL) <= N; j+=2UL )
3597 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3598 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3599 SIMDType xmm4( (~C).load(i ,j+1UL) );
3600 SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3601 SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3603 for(
size_t k=kbegin; k<kend; ++k ) {
3605 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3606 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3607 const SIMDType b1(
set( B(k,j ) ) );
3608 const SIMDType b2(
set( B(k,j+1UL) ) );
3617 (~C).store( i , j , xmm1 );
3618 (~C).store( i+SIMDSIZE , j , xmm2 );
3619 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3620 (~C).store( i , j+1UL, xmm4 );
3621 (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3622 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3635 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3636 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3638 for(
size_t k=kbegin; k<kend; ++k ) {
3639 const SIMDType b1(
set( B(k,j) ) );
3640 xmm1 -= A.load(i ,k) * b1;
3641 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3642 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3645 (~C).store( i , j, xmm1 );
3646 (~C).store( i+SIMDSIZE , j, xmm2 );
3647 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3651 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3653 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
3654 size_t j( UPP ? i : 0UL );
3656 for( ; (j+4UL) <= jend; j+=4UL )
3670 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3671 SIMDType xmm3( (~C).load(i ,j+1UL) );
3672 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3673 SIMDType xmm5( (~C).load(i ,j+2UL) );
3674 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3675 SIMDType xmm7( (~C).load(i ,j+3UL) );
3676 SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
3678 for(
size_t k=kbegin; k<kend; ++k ) {
3680 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3681 const SIMDType b1(
set( B(k,j ) ) );
3682 const SIMDType b2(
set( B(k,j+1UL) ) );
3683 const SIMDType b3(
set( B(k,j+2UL) ) );
3684 const SIMDType b4(
set( B(k,j+3UL) ) );
3695 (~C).store( i , j , xmm1 );
3696 (~C).store( i+SIMDSIZE, j , xmm2 );
3697 (~C).store( i , j+1UL, xmm3 );
3698 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3699 (~C).store( i , j+2UL, xmm5 );
3700 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3701 (~C).store( i , j+3UL, xmm7 );
3702 (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
3705 for( ; (j+3UL) <= jend; j+=3UL )
3719 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3720 SIMDType xmm3( (~C).load(i ,j+1UL) );
3721 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3722 SIMDType xmm5( (~C).load(i ,j+2UL) );
3723 SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3725 for(
size_t k=kbegin; k<kend; ++k ) {
3727 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3728 const SIMDType b1(
set( B(k,j ) ) );
3729 const SIMDType b2(
set( B(k,j+1UL) ) );
3730 const SIMDType b3(
set( B(k,j+2UL) ) );
3739 (~C).store( i , j , xmm1 );
3740 (~C).store( i+SIMDSIZE, j , xmm2 );
3741 (~C).store( i , j+1UL, xmm3 );
3742 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3743 (~C).store( i , j+2UL, xmm5 );
3744 (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3747 for( ; (j+2UL) <= jend; j+=2UL )
3761 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3762 SIMDType xmm3( (~C).load(i ,j+1UL) );
3763 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3767 for( ; (k+2UL) <= kend; k+=2UL ) {
3768 const SIMDType a1( A.load(i ,k ) );
3769 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
3770 const SIMDType a3( A.load(i ,k+1UL) );
3771 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
3772 const SIMDType b1(
set( B(k ,j ) ) );
3773 const SIMDType b2(
set( B(k ,j+1UL) ) );
3774 const SIMDType b3(
set( B(k+1UL,j ) ) );
3775 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
3786 for( ; k<kend; ++k ) {
3788 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3789 const SIMDType b1(
set( B(k,j ) ) );
3790 const SIMDType b2(
set( B(k,j+1UL) ) );
3797 (~C).store( i , j , xmm1+xmm5 );
3798 (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
3799 (~C).store( i , j+1UL, xmm3+xmm7 );
3800 (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
3813 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3817 for( ; (k+2UL) <= kend; k+=2UL ) {
3818 const SIMDType b1(
set( B(k ,j) ) );
3819 const SIMDType b2(
set( B(k+1UL,j) ) );
3820 xmm1 -= A.load(i ,k ) * b1;
3821 xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
3822 xmm3 -= A.load(i ,k+1UL) * b2;
3823 xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
3826 for( ; k<kend; ++k ) {
3827 const SIMDType b1(
set( B(k,j) ) );
3828 xmm1 -= A.load(i ,k) * b1;
3829 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3832 (~C).store( i , j, xmm1+xmm3 );
3833 (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
3837 for( ; i<ipos; i+=SIMDSIZE )
3839 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
3840 size_t j( UPP ? i : 0UL );
3842 for( ; (j+4UL) <= jend; j+=4UL )
3854 SIMDType xmm2( (~C).load(i,j+1UL) );
3855 SIMDType xmm3( (~C).load(i,j+2UL) );
3856 SIMDType xmm4( (~C).load(i,j+3UL) );
3860 for( ; (k+2UL) <= kend; k+=2UL ) {
3862 const SIMDType a2( A.load(i,k+1UL) );
3863 xmm1 -= a1 *
set( B(k ,j ) );
3864 xmm2 -= a1 *
set( B(k ,j+1UL) );
3865 xmm3 -= a1 *
set( B(k ,j+2UL) );
3866 xmm4 -= a1 *
set( B(k ,j+3UL) );
3867 xmm5 -= a2 *
set( B(k+1UL,j ) );
3868 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
3869 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
3870 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
3873 for( ; k<kend; ++k ) {
3875 xmm1 -= a1 *
set( B(k,j ) );
3876 xmm2 -= a1 *
set( B(k,j+1UL) );
3877 xmm3 -= a1 *
set( B(k,j+2UL) );
3878 xmm4 -= a1 *
set( B(k,j+3UL) );
3881 (~C).store( i, j , xmm1+xmm5 );
3882 (~C).store( i, j+1UL, xmm2+xmm6 );
3883 (~C).store( i, j+2UL, xmm3+xmm7 );
3884 (~C).store( i, j+3UL, xmm4+xmm8 );
3887 for( ; (j+3UL) <= jend; j+=3UL )
3899 SIMDType xmm2( (~C).load(i,j+1UL) );
3900 SIMDType xmm3( (~C).load(i,j+2UL) );
3904 for( ; (k+2UL) <= kend; k+=2UL ) {
3906 const SIMDType a2( A.load(i,k+1UL) );
3907 xmm1 -= a1 *
set( B(k ,j ) );
3908 xmm2 -= a1 *
set( B(k ,j+1UL) );
3909 xmm3 -= a1 *
set( B(k ,j+2UL) );
3910 xmm4 -= a2 *
set( B(k+1UL,j ) );
3911 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
3912 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
3915 for( ; k<kend; ++k ) {
3917 xmm1 -= a1 *
set( B(k,j ) );
3918 xmm2 -= a1 *
set( B(k,j+1UL) );
3919 xmm3 -= a1 *
set( B(k,j+2UL) );
3922 (~C).store( i, j , xmm1+xmm4 );
3923 (~C).store( i, j+1UL, xmm2+xmm5 );
3924 (~C).store( i, j+2UL, xmm3+xmm6 );
3927 for( ; (j+2UL) <= jend; j+=2UL )
3939 SIMDType xmm2( (~C).load(i,j+1UL) );
3943 for( ; (k+2UL) <= kend; k+=2UL ) {
3945 const SIMDType a2( A.load(i,k+1UL) );
3946 xmm1 -= a1 *
set( B(k ,j ) );
3947 xmm2 -= a1 *
set( B(k ,j+1UL) );
3948 xmm3 -= a2 *
set( B(k+1UL,j ) );
3949 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
3952 for( ; k<kend; ++k ) {
3954 xmm1 -= a1 *
set( B(k,j ) );
3955 xmm2 -= a1 *
set( B(k,j+1UL) );
3958 (~C).store( i, j , xmm1+xmm3 );
3959 (~C).store( i, j+1UL, xmm2+xmm4 );
3974 for( ; (k+2UL) <= K; k+=2UL ) {
3975 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
3976 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
3980 xmm1 -= A.load(i,k) *
set( B(k,j) );
3983 (~C).store( i, j, xmm1+xmm2 );
3987 for( ; remainder && i<M; ++i )
3989 const size_t jend( LOW ? i+1UL : N );
3990 size_t j( UPP ? i : 0UL );
3992 for( ; (j+2UL) <= jend; j+=2UL )
4006 for(
size_t k=kbegin; k<kend; ++k ) {
4007 value1 -= A(i,k) * B(k,j );
4008 value2 -= A(i,k) * B(k,j+1UL);
4011 (~C)(i,j ) = value1;
4012 (~C)(i,j+1UL) = value2;
4025 for(
size_t k=kbegin; k<K; ++k ) {
4026 value -= A(i,k) * B(k,j);
// Large-matrix kernel for the subtraction assignment C -= A * B (forwarding
// overload): delegates unchanged to selectDefaultSubAssignKernel(). The
// condition selecting this overload is not visible in this excerpt.
4050 template<
typename MT3
4054 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4056 selectDefaultSubAssignKernel( C, A, B );
4076 template<
typename MT3
4080 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS-stage kernel for the subtraction assignment C -= A * B (forwarding
// overload): used when the BLAS path is not taken (selection condition not
// visible here) and delegates to selectLargeSubAssignKernel().
4106 template<
typename MT3
4110 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4112 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed kernel for the subtraction assignment C -= A * B. Only compiled
// when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// set. Two branches build a temporary `tmp` (construction not visible in this
// excerpt) and subtract it from C; the remaining branch calls gemm() directly.
4118 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4132 template<
typename MT3
4136 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4143 subAssign( C, tmp );
4148 subAssign( C, tmp );
// NOTE(review): ET(-1) and ET(1) are presumably the BLAS alpha and beta factors,
// i.e. C = -1*A*B + 1*C -- confirm against the gemm() wrapper signature.
4151 gemm( C, A, B, ET(-1), ET(1) );
4174 template<
typename MT >
4185 const ForwardFunctor fwd;
4188 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
4190 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
4192 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
4214 template<
typename MT
4228 schurAssign( ~lhs, tmp );
4257 template<
typename MT
4267 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4270 else if( rhs.lhs_.columns() == 0UL ) {
4306 template<
typename MT
4325 const ForwardFunctor fwd;
4327 const TmpType tmp( rhs );
4348 template<
typename MT >
4359 const ForwardFunctor fwd;
4387 template<
typename MT
4397 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4432 template<
typename MT >
4443 const ForwardFunctor fwd;
4475 template<
typename MT
4485 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4520 template<
typename MT >
4531 const ForwardFunctor fwd;
4561 template<
typename MT
4621 template<
typename MT1
4629 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4659 SYM = ( SF && !( HF || LF || UF ) ),
4660 HERM = ( HF && !( LF || UF ) ),
4661 LOW = ( LF || ( ( SF || HF ) && UF ) ),
4662 UPP = ( UF || ( ( SF || HF ) && LF ) )
4673 template<
typename T1,
typename T2,
typename T3 >
4674 struct CanExploitSymmetry {
4685 template<
typename T1,
typename T2,
typename T3 >
4686 struct IsEvaluationRequired {
4687 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
4688 !CanExploitSymmetry<T1,T2,T3>::value };
4696 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4697 struct UseBlasKernel {
4698 enum :
bool { value =
BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4704 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4718 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4719 struct UseVectorizedDefaultKernel {
4720 enum :
bool { value = useOptimizedKernels &&
4722 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4776 MT1::simdEnabled && MT2::simdEnabled &&
4782 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4783 !evaluateRight && MT2::smpAssignable };
4813 return matrix_(i,j) * scalar_;
4826 if( i >= matrix_.rows() ) {
4829 if( j >= matrix_.columns() ) {
4832 return (*
this)(i,j);
4841 inline size_t rows()
const {
4842 return matrix_.rows();
4851 inline size_t columns()
const {
4852 return matrix_.columns();
4882 template<
typename T >
4883 inline bool canAlias(
const T* alias )
const {
4884 return matrix_.canAlias( alias );
4894 template<
typename T >
4895 inline bool isAliased(
const T* alias )
const {
4896 return matrix_.isAliased( alias );
4906 return matrix_.isAligned();
4917 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4919 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4920 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
4942 template<
typename MT
4955 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4958 else if( left.columns() == 0UL ) {
4973 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment C = scalar * A * B.
// The visible part of the size test picks the small kernel when, outside debug
// mode, A has at most SIMDSIZE*10 rows, or when the target holds fewer than
// TDMATTDMATMULT_THRESHOLD elements; otherwise the BLAS-stage kernel is used.
// The first clause of the condition lies outside this excerpt.
4988 template<
typename MT3
4992 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4995 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
4996 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4997 selectSmallAssignKernel( C, A, B, scalar );
4999 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for the scaled assignment C = scalar * A * B.
// Each column j of C is built in stages: the first inner index kbegin initialises
// the column with plain assignment, the remaining inner indices accumulate into
// it, and a final pass walks the touched rows once more.
// NOTE(review): the body of that final pass is not visible in this excerpt;
// presumably it applies `scalar` -- confirm.
// For SYM/HERM results the kernel ends by mirroring the part below the diagonal
// into the part above it.
5017 template<
typename MT3
5022 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5024 const size_t M( A.rows() );
5025 const size_t N( B.columns() );
5026 const size_t K( A.columns() );
5030 for(
size_t j=0UL; j<N; ++j )
5041 for(
size_t i=0UL; i<M; ++i ) {
5050 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
5051 :( LOW ?
max(j,kbegin) : kbegin ) )
5052 :( LOW ? j : 0UL ) );
5055 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
5056 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
5057 :( UPP ? j+1UL : M ) );
5060 for(
size_t i=0UL; i<ibegin; ++i ) {
// First contribution (k == kbegin) overwrites C instead of accumulating, so the
// target does not need to be zeroed beforehand inside [ibegin,iend).
5067 for(
size_t i=ibegin; i<iend; ++i ) {
5068 C(i,j) = A(i,kbegin) * B(kbegin,j);
5071 for(
size_t i=iend; i<M; ++i ) {
5076 reset( C(M-1UL,j) );
// Remaining inner indices accumulate on top of the initial contribution.
5080 for(
size_t k=kbegin+1UL; k<kend; ++k )
5084 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
5085 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
5086 :( SYM || HERM || LOW ? j : 0UL ) );
5089 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
5090 :( UPP ?
min(j+1UL,k) : k ) )
5091 :( UPP ? j+1UL : M ) );
5093 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5096 for(
size_t i=ibegin; i<iend; ++i ) {
5097 C(i,j) += A(i,k) * B(k,j);
// Row iend is assigned rather than accumulated; presumably guarded by a condition
// on the structure of A that is not visible in this excerpt.
5100 C(iend,j) = A(iend,k) * B(k,j);
5107 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5110 :( UPP ? j+1UL : M ) );
5112 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5115 for(
size_t i=ibegin; i<iend; ++i ) {
// Mirror pass: fill C(i,j) for i<j from C(j,i), conjugated for Hermitian results.
5122 for(
size_t j=1UL; j<N; ++j ) {
5123 for(
size_t i=0UL; i<j; ++i ) {
5124 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for C = scalar * A * B when the right operand is diagonal and
// the left operand is not (see the EnableIf_ condition). Within [ibegin,iend)
// each element is A(i,j) * B(j,j) * scalar; rows before ibegin and from iend on
// are handled by separate loops whose bodies are not visible in this excerpt
// (presumably they reset the elements -- confirm).
5145 template<
typename MT3
5149 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
5150 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5154 const size_t M( A.rows() );
5155 const size_t N( B.columns() );
5157 for(
size_t j=0UL; j<N; ++j )
5168 for(
size_t i=0UL; i<ibegin; ++i ) {
5172 for(
size_t i=ibegin; i<iend; ++i ) {
5173 C(i,j) = A(i,j) * B(j,j) * scalar;
5176 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = scalar * A * B that reads only the diagonal of A: within
// [ibegin,iend) each element is A(i,i) * B(i,j) * scalar. Rows before ibegin and
// from iend on are handled by separate loops whose bodies are not visible here.
// NOTE(review): the enabling condition is not visible in this excerpt; the access
// pattern suggests this is the overload for a diagonal left operand -- confirm.
5198 template<
typename MT3
5203 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5207 const size_t M( A.rows() );
5208 const size_t N( B.columns() );
5210 for(
size_t j=0UL; j<N; ++j )
5221 for(
size_t i=0UL; i<ibegin; ++i ) {
5225 for(
size_t i=ibegin; i<iend; ++i ) {
5226 C(i,j) = A(i,i) * B(i,j) * scalar;
5229 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = scalar * A * B that writes diagonal elements only: for
// every row index i of A, C(i,i) becomes A(i,i) * B(i,i) * scalar.
// NOTE(review): presumably the overload for two diagonal operands, with the
// off-diagonal part of C cleared in lines not visible in this excerpt -- confirm.
5251 template<
typename MT3
5256 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5262 for(
size_t i=0UL; i<A.rows(); ++i ) {
5263 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix kernel for the scaled assignment C = scalar * A * B (forwarding
// overload): delegates unchanged to selectDefaultAssignKernel(). The condition
// selecting this overload is not visible in this excerpt.
5282 template<
typename MT3
5287 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5289 selectDefaultAssignKernel( C, A, B, scalar );
5308 template<
typename MT3
5320 const ForwardFunctor fwd;
5324 assign( ~C, fwd( A * tmp ) * scalar );
5328 assign( ~C, fwd( tmp * B ) * scalar );
5330 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5332 assign( ~C, fwd( A * tmp ) * scalar );
5336 assign( ~C, fwd( tmp * B ) * scalar );
5356 template<
typename MT3
5365 const size_t M( A.rows() );
5366 const size_t N( B.columns() );
5367 const size_t K( A.columns() );
5371 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
5374 const SIMDType factor(
set( scalar ) );
5376 if( LOW && UPP && M > SIMDSIZE*3UL ) {
5385 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5386 for(
size_t j=0UL; j<N; ++j )
5399 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5401 for(
size_t k=kbegin; k<kend; ++k ) {
5402 const SIMDType b1(
set( B(k,j) ) );
5403 xmm1 += A.load(i ,k) * b1;
5404 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5405 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5406 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5407 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5408 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5409 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5410 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5413 (~C).store( i , j, xmm1 * factor );
5414 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5415 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5416 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5417 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5418 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5419 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5420 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
5425 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5429 for( ; (j+2UL) <= N; j+=2UL )
5442 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5444 for(
size_t k=kbegin; k<kend; ++k ) {
5446 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5447 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5448 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5449 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5450 const SIMDType b1(
set( B(k,j ) ) );
5451 const SIMDType b2(
set( B(k,j+1UL) ) );
5464 (~C).store( i , j , xmm1 * factor );
5465 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5466 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5467 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5468 (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5469 (~C).store( i , j+1UL, xmm6 * factor );
5470 (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5471 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5472 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5473 (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5485 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5487 for(
size_t k=kbegin; k<kend; ++k ) {
5488 const SIMDType b1(
set( B(k,j) ) );
5489 xmm1 += A.load(i ,k) * b1;
5490 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5491 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5492 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5493 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5496 (~C).store( i , j, xmm1 * factor );
5497 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5498 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5499 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5500 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5504 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5506 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*4UL,N) : N );
5507 size_t j( UPP ? i : 0UL );
5509 for( ; (j+2UL) <= jend; j+=2UL )
5522 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5524 for(
size_t k=kbegin; k<kend; ++k ) {
5526 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5527 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5528 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5529 const SIMDType b1(
set( B(k,j ) ) );
5530 const SIMDType b2(
set( B(k,j+1UL) ) );
5541 (~C).store( i , j , xmm1 * factor );
5542 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5543 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5544 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5545 (~C).store( i , j+1UL, xmm5 * factor );
5546 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5547 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5548 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5562 for(
size_t k=kbegin; k<kend; ++k ) {
5563 const SIMDType b1(
set( B(k,j) ) );
5564 xmm1 += A.load(i ,k) * b1;
5565 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5566 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5567 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5570 (~C).store( i , j, xmm1 * factor );
5571 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5572 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5573 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5577 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5579 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*3UL,N) : N );
5580 size_t j( UPP ? i : 0UL );
5582 for( ; (j+2UL) <= jend; j+=2UL )
5595 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5597 for(
size_t k=kbegin; k<kend; ++k ) {
5599 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5600 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5601 const SIMDType b1(
set( B(k,j ) ) );
5602 const SIMDType b2(
set( B(k,j+1UL) ) );
5611 (~C).store( i , j , xmm1 * factor );
5612 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5613 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5614 (~C).store( i , j+1UL, xmm4 * factor );
5615 (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5616 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5630 for(
size_t k=kbegin; k<kend; ++k ) {
5631 const SIMDType b1(
set( B(k,j) ) );
5632 xmm1 += A.load(i ,k) * b1;
5633 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5634 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5637 (~C).store( i , j, xmm1 * factor );
5638 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5639 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5643 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5645 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE*2UL,N) : N );
5646 size_t j( UPP ? i : 0UL );
5648 for( ; (j+4UL) <= jend; j+=4UL )
5661 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5663 for(
size_t k=kbegin; k<kend; ++k ) {
5665 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5666 const SIMDType b1(
set( B(k,j ) ) );
5667 const SIMDType b2(
set( B(k,j+1UL) ) );
5668 const SIMDType b3(
set( B(k,j+2UL) ) );
5669 const SIMDType b4(
set( B(k,j+3UL) ) );
5680 (~C).store( i , j , xmm1 * factor );
5681 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5682 (~C).store( i , j+1UL, xmm3 * factor );
5683 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5684 (~C).store( i , j+2UL, xmm5 * factor );
5685 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5686 (~C).store( i , j+3UL, xmm7 * factor );
5687 (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
5690 for( ; (j+3UL) <= jend; j+=3UL )
5703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5705 for(
size_t k=kbegin; k<kend; ++k ) {
5707 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5708 const SIMDType b1(
set( B(k,j ) ) );
5709 const SIMDType b2(
set( B(k,j+1UL) ) );
5710 const SIMDType b3(
set( B(k,j+2UL) ) );
5719 (~C).store( i , j , xmm1 * factor );
5720 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5721 (~C).store( i , j+1UL, xmm3 * factor );
5722 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5723 (~C).store( i , j+2UL, xmm5 * factor );
5724 (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5727 for( ; (j+2UL) <= jend; j+=2UL )
5740 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5743 for( ; (k+2UL) <= kend; k+=2UL ) {
5744 const SIMDType a1( A.load(i ,k ) );
5745 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
5746 const SIMDType a3( A.load(i ,k+1UL) );
5747 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
5748 const SIMDType b1(
set( B(k ,j ) ) );
5749 const SIMDType b2(
set( B(k ,j+1UL) ) );
5750 const SIMDType b3(
set( B(k+1UL,j ) ) );
5751 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
5762 for( ; k<kend; ++k ) {
5764 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5765 const SIMDType b1(
set( B(k,j ) ) );
5766 const SIMDType b2(
set( B(k,j+1UL) ) );
5773 (~C).store( i , j , (xmm1+xmm5) * factor );
5774 (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
5775 (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
5776 (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5791 for( ; (k+2UL) <= kend; k+=2UL ) {
5792 const SIMDType b1(
set( B(k ,j) ) );
5793 const SIMDType b2(
set( B(k+1UL,j) ) );
5794 xmm1 += A.load(i ,k ) * b1;
5795 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
5796 xmm3 += A.load(i ,k+1UL) * b2;
5797 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
5800 for( ; k<kend; ++k ) {
5801 const SIMDType b1(
set( B(k,j) ) );
5802 xmm1 += A.load(i ,k) * b1;
5803 xmm2 += A.load(i+SIMDSIZE,k) * b1;
5806 (~C).store( i , j, (xmm1+xmm3) * factor );
5807 (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
5811 for( ; i<ipos; i+=SIMDSIZE )
5813 const size_t jend( SYM || HERM || LOW ?
min(i+SIMDSIZE,N) : N );
5814 size_t j( UPP ? i : 0UL );
5816 for( ; (j+4UL) <= jend; j+=4UL )
5827 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5830 for( ; (k+2UL) <= kend; k+=2UL ) {
5832 const SIMDType a2( A.load(i,k+1UL) );
5833 xmm1 += a1 *
set( B(k ,j ) );
5834 xmm2 += a1 *
set( B(k ,j+1UL) );
5835 xmm3 += a1 *
set( B(k ,j+2UL) );
5836 xmm4 += a1 *
set( B(k ,j+3UL) );
5837 xmm5 += a2 *
set( B(k+1UL,j ) );
5838 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
5839 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
5840 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
5843 for( ; k<kend; ++k ) {
5845 xmm1 += a1 *
set( B(k,j ) );
5846 xmm2 += a1 *
set( B(k,j+1UL) );
5847 xmm3 += a1 *
set( B(k,j+2UL) );
5848 xmm4 += a1 *
set( B(k,j+3UL) );
5851 (~C).store( i, j , (xmm1+xmm5) * factor );
5852 (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
5853 (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
5854 (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
5857 for( ; (j+3UL) <= jend; j+=3UL )
5868 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5871 for( ; (k+2UL) <= kend; k+=2UL ) {
5873 const SIMDType a2( A.load(i,k+1UL) );
5874 xmm1 += a1 *
set( B(k ,j ) );
5875 xmm2 += a1 *
set( B(k ,j+1UL) );
5876 xmm3 += a1 *
set( B(k ,j+2UL) );
5877 xmm4 += a2 *
set( B(k+1UL,j ) );
5878 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
5879 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
5882 for( ; k<kend; ++k ) {
5884 xmm1 += a1 *
set( B(k,j ) );
5885 xmm2 += a1 *
set( B(k,j+1UL) );
5886 xmm3 += a1 *
set( B(k,j+2UL) );
5889 (~C).store( i, j , (xmm1+xmm4) * factor );
5890 (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
5891 (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
5894 for( ; (j+2UL) <= jend; j+=2UL )
5908 for( ; k<kend; ++k ) {
5910 xmm1 += a1 *
set( B(k,j ) );
5911 xmm2 += a1 *
set( B(k,j+1UL) );
5914 for( ; (k+2UL) <= kend; k+=2UL ) {
5916 const SIMDType a2( A.load(i,k+1UL) );
5917 xmm1 += a1 *
set( B(k ,j ) );
5918 xmm2 += a1 *
set( B(k ,j+1UL) );
5919 xmm3 += a2 *
set( B(k+1UL,j ) );
5920 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
5923 (~C).store( i, j , (xmm1+xmm3) * factor );
5924 (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
5938 for( ; (k+2UL) <= K; k+=2UL ) {
5939 xmm1 += A.load(i,k ) *
set( B(k ,j) );
5940 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
5944 xmm1 += A.load(i,k) *
set( B(k,j) );
5947 (~C).store( i, j, (xmm1+xmm2) * factor );
5951 for( ; remainder && i<M; ++i )
5953 size_t j( LOW && UPP ? i : 0UL );
5955 for( ; (j+2UL) <= N; j+=2UL )
5969 for(
size_t k=kbegin; k<kend; ++k ) {
5970 value1 += A(i,k) * B(k,j );
5971 value2 += A(i,k) * B(k,j+1UL);
5974 (~C)(i,j ) = value1 * scalar;
5975 (~C)(i,j+1UL) = value2 * scalar;
5988 for(
size_t k=kbegin; k<K; ++k ) {
5989 value += A(i,k) * B(k,j);
5992 (~C)(i,j) = value * scalar;
5997 if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
5998 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5999 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6000 for(
size_t i=0UL; i<iend; ++i ) {
6001 (~C)(i,j) = HERM ?
conj( (~C)(j,i) ) : (~C)(j,i);
6005 else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
6006 for(
size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6007 const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6008 for(
size_t i=0UL; i<iend; ++i ) {
6013 else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
6014 for(
size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6015 const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6016 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix kernel for the scaled assignment C = scalar * A * B (forwarding
// overload): delegates unchanged to selectDefaultAssignKernel(). The condition
// selecting this overload is not visible in this excerpt.
6038 template<
typename MT3
6043 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6045 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assign kernel, dispatching overload: hands the scaled product over to one
// of the smmm / hmmm / lmmm / ummm / mmm routines.
// NOTE(review): the conditions that choose between the five calls are on lines not
// visible here -- presumably the SYM / HERM / LOW / UPP flags; confirm.
6064 template<
typename MT3
6069 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6072 smmm( C, A, B, scalar );
6074 hmmm( C, A, B, scalar );
// lmmm / ummm / mmm take ST2(0) as an extra trailing argument.
// NOTE(review): presumably the factor applied to the previous contents of C
// (0 = overwrite); confirm against the kernel declarations.
6076 lmmm( C, A, B, scalar, ST2(0) );
6078 ummm( C, A, B, scalar, ST2(0) );
6080 mmm( C, A, B, scalar, ST2(0) );
// BLAS assign kernel, forwarding overload: no BLAS call is made here, the arguments are
// passed on unchanged to selectLargeAssignKernel.
// NOTE(review): the enabling condition is not visible -- presumably used when the BLAS
// kernel cannot be applied to the given types; confirm.
6098 template<
typename MT3
6103 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6105 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assign kernel, BLAS-backed overload. Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the #if below).
// Issues one of three calls: trmm with A on the left, trmm with B on the right, or gemm.
// NOTE(review): the branch conditions and any preparation of C before the trmm calls
// are on lines not visible here -- presumably the trmm paths are taken for a triangular
// A or B respectively; confirm.
6110 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6124 template<
typename MT3
6129 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Left-side trmm: the lower/upper flag is chosen from IsLower<MT4>.
6135 trmm( C, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Right-side trmm: the lower/upper flag is chosen from IsLower<MT5>.
6139 trmm( C, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case: gemm with ET(scalar) and ET(0) as the two scalar arguments.
6142 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment through a temporary: rhs is evaluated serially into a TmpType object and
// the result is assigned to ~lhs after being passed through the ForwardFunctor.
// NOTE(review): the function signature is on lines not visible here -- presumably the
// assign() overload for sparse targets; confirm.
6160 template<
typename MT
6179 const ForwardFunctor fwd;
6181 const TmpType tmp(
serial( rhs ) );
6182 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment: re-expresses the scaled product with one operand transposed
// and assigns it to ~lhs. Either ( trans(left) * right ) * scalar_ or
// ( left * trans(right) ) * scalar_ is used, each passed through the ForwardFunctor.
// NOTE(review): the signature and the condition choosing between the two forms are on
// lines not visible here -- presumably it depends on which operand is symmetric; confirm.
6200 template<
typename MT >
6211 const ForwardFunctor fwd;
6219 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
6221 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
// Addition assignment of the scaled product to ~lhs.
// NOTE(review): the signature and the definitions of left, A and B are on lines not
// visible here; the function name is inferred from the kernel it calls.
6237 template<
typename MT
// Degenerate sizes: an empty target or a zero inner dimension is special-cased
// (the body of this branch is not visible -- presumably an early return).
6250 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Regular case: delegate to the add-assign kernel selection with the scalar factor.
6264 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses between the "small" and the BLAS add-assign kernel for C += scalar * A * B.
// The small kernel is taken when (outside debug mode) A has at most SIMDSIZE*10 rows, or
// when the number of elements of C is below TDMATTDMATMULT_THRESHOLD; otherwise the BLAS
// kernel selection is called.
// NOTE(review): the first part of the condition (original line 6285) is not visible.
6279 template<
typename MT3
6283 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6286 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6287 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6288 selectSmallAddAssignKernel( C, A, B, scalar );
6290 selectBlasAddAssignKernel( C, A, B, scalar );
// Default add-assign kernel: adds a temporary `tmp` onto C via addAssign.
// NOTE(review): the construction of `tmp` (original line 6315) and the enabling
// condition are not visible -- presumably tmp holds the evaluated scaled product of
// A and B; confirm.
6308 template<
typename MT3
6313 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6316 addAssign( C, tmp );
// Default add-assign kernel for a non-diagonal A and a diagonal B (see the EnableIf_
// condition below). Only the diagonal element B(j,j) is read, so every column j of C
// is updated as C(i,j) += A(i,j) * B(j,j) * scalar.
6334 template<
typename MT3
6338 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
6339 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6343 const size_t M( A.rows() );
6344 const size_t N( B.columns() );
6346 for(
size_t j=0UL; j<N; ++j )
// ibegin / iend are defined on lines not visible here (NOTE(review): presumably the
// row range that can be non-zero for column j; confirm).
// ipos = ibegin + largest even number <= inum, i.e. the end of the pairwise loop.
6356 const size_t inum( iend - ibegin );
6357 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6359 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6360 C(i ,j) += A(i ,j) * B(j,j) * scalar;
6361 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Leftover single row (its guard is not visible -- presumably `ipos < iend`).
6364 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default add-assign kernel that reads only the diagonal of A: every element is updated
// as C(i,j) += A(i,i) * B(i,j) * scalar.
// NOTE(review): the enabling condition is not visible -- presumably diagonal A with a
// non-diagonal B; confirm.
6384 template<
typename MT3
6389 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6393 const size_t M( A.rows() );
6394 const size_t N( B.columns() );
6396 for(
size_t j=0UL; j<N; ++j )
// ibegin / iend are defined on lines not visible here.
// ipos = ibegin + largest even number <= inum, i.e. the end of the pairwise loop.
6406 const size_t inum( iend - ibegin );
6407 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6409 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6410 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6411 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover single row (its guard is not visible -- presumably `ipos < iend`).
6414 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default add-assign kernel that touches only diagonal elements:
// C(i,i) += A(i,i) * B(i,i) * scalar for every row i of A.
// NOTE(review): the enabling condition is not visible -- presumably both A and B are
// diagonal; confirm.
6434 template<
typename MT3
6439 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6443 for(
size_t i=0UL; i<A.rows(); ++i ) {
6444 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix add-assign kernel, forwarding overload: passes its arguments unchanged to
// selectDefaultAddAssignKernel.
// NOTE(review): the enabling condition is not visible -- presumably the case where the
// vectorized kernel cannot be used; confirm.
6463 template<
typename MT3
6468 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6470 selectDefaultAddAssignKernel( C, A, B, scalar );
// Add-assign kernel that replaces one operand by a temporary `tmp` and then add-assigns
// the rebuilt scaled product ( A * tmp or tmp * B ) to ~C through the ForwardFunctor.
// NOTE(review): the signature, the construction of `tmp` and the first two branch
// conditions are on lines not visible here -- presumably this is the row-major
// vectorized selectSmallAddAssignKernel overload and tmp is a storage-order-converted
// copy of B or A; confirm.
6489 template<
typename MT3
6501 const ForwardFunctor fwd;
6505 addAssign( ~C, fwd( A * tmp ) * scalar );
6509 addAssign( ~C, fwd( tmp * B ) * scalar );
// Size-based choice: this branch is taken when B has no more elements than A.
6511 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6513 addAssign( ~C, fwd( A * tmp ) * scalar );
6517 addAssign( ~C, fwd( tmp * B ) * scalar );
// SIMD add-assign kernel computing C += scalar * A * B.
// For a tile of rows of C, products A.load(.,k) * set( B(k,j) ) are accumulated over k
// into SIMDType accumulators; the accumulators are multiplied by `factor`
// ( = set( scalar ) ) and added to the values loaded from C before being stored back.
// Row tiles of 8, 5, 4, 3, 2 and 1 SIMD vectors are tried in that order, followed by a
// scalar loop over the rows from ipos up to M.
// NOTE(review): the signature, the definitions of `remainder`, `i`, `kbegin`, `kend`
// and most accumulation statements are on lines not visible in this excerpt;
// presumably this is the column-major vectorized selectSmallAddAssignKernel overload.
6537 template<
typename MT3
6546 const size_t M( A.rows() );
6547 const size_t N( B.columns() );
6548 const size_t K( A.columns() );
// ipos: end of the SIMD-processed row range. With `remainder` set, M is masked with
// size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two -- TODO confirm).
6552 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
6555 const SIMDType factor(
set( scalar ) );
// --- Tile: 8 SIMD vectors of rows x 1 column (only when neither LOW nor UPP). ---
6561 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6562 for(
size_t j=0UL; j<N; ++j )
6575 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6577 for(
size_t k=kbegin; k<kend; ++k ) {
6578 const SIMDType b1(
set( B(k,j) ) );
6579 xmm1 += A.load(i ,k) * b1;
6580 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6581 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6582 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6583 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6584 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6585 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6586 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6589 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6590 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6591 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6592 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6593 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6594 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
6595 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
6596 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// --- Tile: 5 SIMD vectors of rows (only when neither LOW nor UPP). ---
6601 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Two columns per iteration: xmm1..xmm5 are stored to column j, xmm6..xmm10 to j+1.
6605 for( ; (j+2UL) <= N; j+=2UL )
6618 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6620 for(
size_t k=kbegin; k<kend; ++k ) {
6622 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6623 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6624 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6625 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6626 const SIMDType b1(
set( B(k,j ) ) );
6627 const SIMDType b2(
set( B(k,j+1UL) ) );
6640 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6641 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6642 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6643 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6644 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
6645 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
6646 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
6647 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6648 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6649 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Single-column variant of the 5-vector tile (its guard is not visible here).
6661 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6663 for(
size_t k=kbegin; k<kend; ++k ) {
6664 const SIMDType b1(
set( B(k,j) ) );
6665 xmm1 += A.load(i ,k) * b1;
6666 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6667 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6668 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6669 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6672 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6673 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6674 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6675 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6676 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
// --- Tile: 4 SIMD vectors of rows (only when neither LOW nor UPP). ---
6680 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Two columns per iteration: xmm1..xmm4 are stored to column j, xmm5..xmm8 to j+1.
6684 for( ; (j+2UL) <= N; j+=2UL )
6697 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6699 for(
size_t k=kbegin; k<kend; ++k ) {
6701 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6702 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6703 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6704 const SIMDType b1(
set( B(k,j ) ) );
6705 const SIMDType b2(
set( B(k,j+1UL) ) );
6716 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6717 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6718 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6719 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6720 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6721 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
6722 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6723 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single-column variant of the 4-vector tile (guard and declarations not visible).
6737 for(
size_t k=kbegin; k<kend; ++k ) {
6738 const SIMDType b1(
set( B(k,j) ) );
6739 xmm1 += A.load(i ,k) * b1;
6740 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6741 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6742 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6745 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6746 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6747 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6748 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// --- Tile: 3 SIMD vectors of rows (only when neither LOW nor UPP). ---
6752 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
// Two columns per iteration: xmm1..xmm3 are stored to column j, xmm4..xmm6 to j+1.
6756 for( ; (j+2UL) <= N; j+=2UL )
6769 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6771 for(
size_t k=kbegin; k<kend; ++k ) {
6773 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6774 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6775 const SIMDType b1(
set( B(k,j ) ) );
6776 const SIMDType b2(
set( B(k,j+1UL) ) );
6785 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6786 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6787 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6788 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
6789 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
6790 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Single-column variant of the 3-vector tile (guard and declarations not visible).
6804 for(
size_t k=kbegin; k<kend; ++k ) {
6805 const SIMDType b1(
set( B(k,j) ) );
6806 xmm1 += A.load(i ,k) * b1;
6807 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6808 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6811 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6812 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6813 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
// --- Tile: 2 SIMD vectors of rows (skipped only when both LOW and UPP are set). ---
6817 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range: capped at i+SIMDSIZE*2 when LOW is set, started at i when UPP is set.
6819 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
6820 size_t j( UPP ? i : 0UL );
// Four columns per iteration.
6822 for( ; (j+4UL) <= jend; j+=4UL )
6835 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6837 for(
size_t k=kbegin; k<kend; ++k ) {
6839 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6840 const SIMDType b1(
set( B(k,j ) ) );
6841 const SIMDType b2(
set( B(k,j+1UL) ) );
6842 const SIMDType b3(
set( B(k,j+2UL) ) );
6843 const SIMDType b4(
set( B(k,j+3UL) ) );
6854 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6855 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6856 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6857 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6858 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6859 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6860 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6861 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
// Three columns per iteration.
// NOTE(review): eight accumulators are declared but only xmm1..xmm6 appear in the
// visible stores below -- xmm7 / xmm8 look unused here; confirm.
6864 for( ; (j+3UL) <= jend; j+=3UL )
6877 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6879 for(
size_t k=kbegin; k<kend; ++k ) {
6881 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6882 const SIMDType b1(
set( B(k,j ) ) );
6883 const SIMDType b2(
set( B(k,j+1UL) ) );
6884 const SIMDType b3(
set( B(k,j+2UL) ) );
6893 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6894 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6895 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6896 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6897 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6898 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
// Two columns per iteration; k is stepped by two first and then by one, so the partial
// sums are split over two accumulator sets that are added together before the store.
6901 for( ; (j+2UL) <= jend; j+=2UL )
6914 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6917 for( ; (k+2UL) <= kend; k+=2UL ) {
6918 const SIMDType a1( A.load(i ,k ) );
6919 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6920 const SIMDType a3( A.load(i ,k+1UL) );
6921 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6922 const SIMDType b1(
set( B(k ,j ) ) );
6923 const SIMDType b2(
set( B(k ,j+1UL) ) );
6924 const SIMDType b3(
set( B(k+1UL,j ) ) );
6925 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
6936 for( ; k<kend; ++k ) {
6938 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6939 const SIMDType b1(
set( B(k,j ) ) );
6940 const SIMDType b2(
set( B(k,j+1UL) ) );
6947 (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
6948 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
6949 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
6950 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// Single leftover column of the 2-vector tile (guard and declarations not visible).
6965 for( ; (k+2UL) <= kend; k+=2UL ) {
6966 const SIMDType b1(
set( B(k ,j) ) );
6967 const SIMDType b2(
set( B(k+1UL,j) ) );
6968 xmm1 += A.load(i ,k ) * b1;
6969 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
6970 xmm3 += A.load(i ,k+1UL) * b2;
6971 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
6974 for( ; k<kend; ++k ) {
6975 const SIMDType b1(
set( B(k,j) ) );
6976 xmm1 += A.load(i ,k) * b1;
6977 xmm2 += A.load(i+SIMDSIZE,k) * b1;
6980 (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
6981 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
// --- Tile: 1 SIMD vector of rows; handles whatever is left below ipos. ---
6985 for( ; i<ipos; i+=SIMDSIZE )
// Column range: capped at i+SIMDSIZE only when both LOW and UPP are set; started at i
// when UPP is set.
6987 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
6988 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two (xmm1..4 and xmm5..8 are summed).
6990 for( ; (j+4UL) <= jend; j+=4UL )
7001 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7004 for( ; (k+2UL) <= kend; k+=2UL ) {
7006 const SIMDType a2( A.load(i,k+1UL) );
7007 xmm1 += a1 *
set( B(k ,j ) );
7008 xmm2 += a1 *
set( B(k ,j+1UL) );
7009 xmm3 += a1 *
set( B(k ,j+2UL) );
7010 xmm4 += a1 *
set( B(k ,j+3UL) );
7011 xmm5 += a2 *
set( B(k+1UL,j ) );
7012 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
7013 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
7014 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
7017 for( ; k<kend; ++k ) {
7019 xmm1 += a1 *
set( B(k,j ) );
7020 xmm2 += a1 *
set( B(k,j+1UL) );
7021 xmm3 += a1 *
set( B(k,j+2UL) );
7022 xmm4 += a1 *
set( B(k,j+3UL) );
7025 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
7026 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
7027 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
7028 (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
// Three columns per iteration, k unrolled by two (xmm1..3 and xmm4..6 are summed).
7031 for( ; (j+3UL) <= jend; j+=3UL )
7042 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7045 for( ; (k+2UL) <= kend; k+=2UL ) {
7047 const SIMDType a2( A.load(i,k+1UL) );
7048 xmm1 += a1 *
set( B(k ,j ) );
7049 xmm2 += a1 *
set( B(k ,j+1UL) );
7050 xmm3 += a1 *
set( B(k ,j+2UL) );
7051 xmm4 += a2 *
set( B(k+1UL,j ) );
7052 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
7053 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
7056 for( ; k<kend; ++k ) {
7058 xmm1 += a1 *
set( B(k,j ) );
7059 xmm2 += a1 *
set( B(k,j+1UL) );
7060 xmm3 += a1 *
set( B(k,j+2UL) );
7063 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
7064 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
7065 (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
// Two columns per iteration, k unrolled by two (xmm1..2 and xmm3..4 are summed).
7068 for( ; (j+2UL) <= jend; j+=2UL )
7082 for( ; (k+2UL) <= kend; k+=2UL ) {
7084 const SIMDType a2( A.load(i,k+1UL) );
7085 xmm1 += a1 *
set( B(k ,j ) );
7086 xmm2 += a1 *
set( B(k ,j+1UL) );
7087 xmm3 += a2 *
set( B(k+1UL,j ) );
7088 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
7091 for( ; k<kend; ++k ) {
7093 xmm1 += a1 *
set( B(k,j ) );
7094 xmm2 += a1 *
set( B(k,j+1UL) );
7097 (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
7098 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
// Single leftover column; here the k loop runs up to K rather than kend.
7112 for( ; (k+2UL) <= K; k+=2UL ) {
7113 xmm1 += A.load(i,k ) *
set( B(k ,j) );
7114 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
7118 xmm1 += A.load(i,k) *
set( B(k,j) );
7121 (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
// --- Scalar loop over the rows from ipos to M (only when `remainder` is set). ---
7125 for( ; remainder && i<M; ++i )
// Column range: ends at the diagonal (i+1) when LOW is set, starts at i when UPP is set.
7127 const size_t jend( LOW ? i+1UL : N );
7128 size_t j( UPP ? i : 0UL );
// Two columns per iteration using plain element access.
7130 for( ; (j+2UL) <= jend; j+=2UL )
7144 for(
size_t k=kbegin; k<kend; ++k ) {
7145 value1 += A(i,k) * B(k,j );
7146 value2 += A(i,k) * B(k,j+1UL);
7149 (~C)(i,j ) += value1 * scalar;
7150 (~C)(i,j+1UL) += value2 * scalar;
// Single leftover column; the k loop runs up to K.
7163 for(
size_t k=kbegin; k<K; ++k ) {
7164 value += A(i,k) * B(k,j);
7167 (~C)(i,j) += value * scalar;
// Large-matrix add-assign kernel, forwarding overload: passes its arguments unchanged
// to selectDefaultAddAssignKernel.
// NOTE(review): the enabling condition is not visible -- presumably the case where the
// vectorized kernel cannot be used; confirm.
7187 template<
typename MT3
7192 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7194 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix add-assign kernel, dispatching overload: calls one of lmmm / ummm / mmm
// with `scalar` and ST2(1) as the two scalar arguments.
// NOTE(review): the conditions choosing between the three calls are not visible --
// presumably the LOW / UPP flags; the trailing ST2(1) presumably keeps the existing
// contents of C (accumulate); confirm.
7213 template<
typename MT3
7218 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7221 lmmm( C, A, B, scalar, ST2(1) );
7223 ummm( C, A, B, scalar, ST2(1) );
7225 mmm( C, A, B, scalar, ST2(1) );
// BLAS add-assign kernel, forwarding overload: no BLAS call is made here, the arguments
// are passed on unchanged to selectLargeAddAssignKernel.
// NOTE(review): the enabling condition is not visible -- presumably used when the BLAS
// kernel cannot be applied to the given types; confirm.
7244 template<
typename MT3
7249 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7251 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS add-assign kernel, BLAS-backed overload. Only compiled when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the #if below).
// The trmm paths work on a temporary `tmp` that is then add-assigned to C; the gemm path
// operates on C directly with ET(scalar) and ET(1) as the two scalar arguments.
// NOTE(review): the branch conditions and the construction of `tmp` are on lines not
// visible here -- presumably tmp is a copy of the non-triangular operand; confirm.
7256 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7270 template<
typename MT3
7275 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Left-side trmm with A; the lower/upper flag is chosen from IsLower<MT4>.
7281 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7282 addAssign( C, tmp );
// Right-side trmm with B; the lower/upper flag is chosen from IsLower<MT5>.
7286 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7287 addAssign( C, tmp );
// General case.
7290 gemm( C, A, B, ET(scalar), ET(1) );
// Restructuring addition assignment: re-expresses the scaled product with one operand
// transposed and add-assigns it to ~lhs. Either ( trans(left) * right ) * scalar_ or
// ( left * trans(right) ) * scalar_ is used, each passed through the ForwardFunctor.
// NOTE(review): the signature and the condition choosing between the two forms are on
// lines not visible here -- presumably it depends on which operand is symmetric; confirm.
7311 template<
typename MT >
7322 const ForwardFunctor fwd;
7330 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
7332 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
// Subtraction assignment of the scaled product to ~lhs.
// NOTE(review): the signature and the definitions of left, A and B are on lines not
// visible here; the function name is inferred from the kernel it calls.
7352 template<
typename MT
// Degenerate sizes: an empty target or a zero inner dimension is special-cased
// (the body of this branch is not visible -- presumably an early return).
7365 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Regular case: delegate to the sub-assign kernel selection with the scalar factor.
7379 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.
scalar_ );
// Chooses between the "small" and the BLAS sub-assign kernel for C -= scalar * A * B.
// The small kernel is taken when (outside debug mode) A has at most SIMDSIZE*10 rows, or
// when the number of elements of C is below TDMATTDMATMULT_THRESHOLD; otherwise the BLAS
// kernel selection is called.
// NOTE(review): the first part of the condition (original line 7400) is not visible.
7394 template<
typename MT3
7398 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7401 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
7402 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7403 selectSmallSubAssignKernel( C, A, B, scalar );
7405 selectBlasSubAssignKernel( C, A, B, scalar );
// Default sub-assign kernel: subtracts a temporary `tmp` from C via subAssign.
// NOTE(review): the construction of `tmp` (original line 7430) and the enabling
// condition are not visible -- presumably tmp holds the evaluated scaled product of
// A and B; confirm.
7423 template<
typename MT3
7428 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7431 subAssign( C, tmp );
// Default sub-assign kernel for a non-diagonal A and a diagonal B (see the EnableIf_
// condition below). Only the diagonal element B(j,j) is read, so every column j of C
// is updated as C(i,j) -= A(i,j) * B(j,j) * scalar.
7449 template<
typename MT3
7453 static inline EnableIf_< And< Not< IsDiagonal<MT4> >,
IsDiagonal<MT5> > >
7454 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7458 const size_t M( A.rows() );
7459 const size_t N( B.columns() );
7461 for(
size_t j=0UL; j<N; ++j )
// ibegin / iend are defined on lines not visible here (NOTE(review): presumably the
// row range that can be non-zero for column j; confirm).
// ipos = ibegin + largest even number <= inum, i.e. the end of the pairwise loop.
7471 const size_t inum( iend - ibegin );
7472 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
7474 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7475 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7476 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Leftover single row (its guard is not visible -- presumably `ipos < iend`).
7479 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default sub-assign kernel that reads only the diagonal of A: every element is updated
// as C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): the enabling condition is not visible -- presumably diagonal A with a
// non-diagonal B; confirm.
7499 template<
typename MT3
7504 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7508 const size_t M( A.rows() );
7509 const size_t N( B.columns() );
7511 for(
size_t j=0UL; j<N; ++j )
// ibegin / iend are defined on lines not visible here.
// ipos = ibegin + largest even number <= inum, i.e. the end of the pairwise loop.
7521 const size_t inum( iend - ibegin );
7522 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
7524 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7525 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7526 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover single row (its guard is not visible -- presumably `ipos < iend`).
7529 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default sub-assign kernel that touches only diagonal elements:
// C(i,i) -= A(i,i) * B(i,i) * scalar for every row i of A.
// NOTE(review): the enabling condition is not visible -- presumably both A and B are
// diagonal; confirm.
7549 template<
typename MT3
7554 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7558 for(
size_t i=0UL; i<A.rows(); ++i ) {
7559 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix sub-assign kernel, forwarding overload: passes its arguments unchanged to
// selectDefaultSubAssignKernel.
// NOTE(review): the enabling condition is not visible -- presumably the case where the
// vectorized kernel cannot be used; confirm.
7578 template<
typename MT3
7583 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7585 selectDefaultSubAssignKernel( C, A, B, scalar );
// Sub-assign kernel that replaces one operand by a temporary `tmp` and then sub-assigns
// the rebuilt scaled product ( A * tmp or tmp * B ) to ~C through the ForwardFunctor.
// NOTE(review): the signature, the construction of `tmp` and the first two branch
// conditions are on lines not visible here -- presumably this is the row-major
// vectorized selectSmallSubAssignKernel overload and tmp is a storage-order-converted
// copy of B or A; confirm.
7604 template<
typename MT3
7616 const ForwardFunctor fwd;
7620 subAssign( ~C, fwd( A * tmp ) * scalar );
7624 subAssign( ~C, fwd( tmp * B ) * scalar );
// Size-based choice: this branch is taken when B has no more elements than A.
7626 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7628 subAssign( ~C, fwd( A * tmp ) * scalar );
7632 subAssign( ~C, fwd( tmp * B ) * scalar );
// SIMD sub-assign kernel computing C -= scalar * A * B.
// For a tile of rows of C, products A.load(.,k) * set( B(k,j) ) are accumulated over k
// into SIMDType accumulators; the accumulators are multiplied by `factor`
// ( = set( scalar ) ) and subtracted from the values loaded from C before being stored
// back. Row tiles of 8, 5, 4, 3, 2 and 1 SIMD vectors are tried in that order, followed
// by a scalar loop over the rows from ipos up to M.
// NOTE(review): the signature, the definitions of `remainder`, `i`, `kbegin`, `kend`
// and most accumulation statements are on lines not visible in this excerpt;
// presumably this is the column-major vectorized selectSmallSubAssignKernel overload.
7652 template<
typename MT3
7661 const size_t M( A.rows() );
7662 const size_t N( B.columns() );
7663 const size_t K( A.columns() );
// ipos: end of the SIMD-processed row range. With `remainder` set, M is masked with
// size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two -- TODO confirm).
7667 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
7670 const SIMDType factor(
set( scalar ) );
// --- Tile: 8 SIMD vectors of rows x 1 column (only when neither LOW nor UPP). ---
7676 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7677 for(
size_t j=0UL; j<N; ++j )
7690 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7692 for(
size_t k=kbegin; k<kend; ++k ) {
7693 const SIMDType b1(
set( B(k,j) ) );
7694 xmm1 += A.load(i ,k) * b1;
7695 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7696 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7697 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7698 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7699 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7700 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7701 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7704 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7705 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7706 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7707 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7708 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7709 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
7710 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
7711 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
// --- Tile: 5 SIMD vectors of rows (only when neither LOW nor UPP). ---
7716 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
// Two columns per iteration: xmm1..xmm5 are stored to column j, xmm6..xmm10 to j+1.
7720 for( ; (j+2UL) <= N; j+=2UL )
7733 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7735 for(
size_t k=kbegin; k<kend; ++k ) {
7737 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7738 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7739 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7740 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7741 const SIMDType b1(
set( B(k,j ) ) );
7742 const SIMDType b2(
set( B(k,j+1UL) ) );
7755 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7756 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7757 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7758 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7759 (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
7760 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
7761 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
7762 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7763 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7764 (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
// Single-column variant of the 5-vector tile (its guard is not visible here).
7776 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7778 for(
size_t k=kbegin; k<kend; ++k ) {
7779 const SIMDType b1(
set( B(k,j) ) );
7780 xmm1 += A.load(i ,k) * b1;
7781 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7782 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7783 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7784 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7787 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7788 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7789 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7790 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7791 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
// --- Tile: 4 SIMD vectors of rows (only when neither LOW nor UPP). ---
7795 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Two columns per iteration: xmm1..xmm4 are stored to column j, xmm5..xmm8 to j+1.
7799 for( ; (j+2UL) <= N; j+=2UL )
7812 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7814 for(
size_t k=kbegin; k<kend; ++k ) {
7816 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7817 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7818 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7819 const SIMDType b1(
set( B(k,j ) ) );
7820 const SIMDType b2(
set( B(k,j+1UL) ) );
7831 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7832 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7833 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7834 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7835 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7836 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
7837 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
7838 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// Single-column variant of the 4-vector tile (guard and declarations not visible).
7852 for(
size_t k=kbegin; k<kend; ++k ) {
7853 const SIMDType b1(
set( B(k,j) ) );
7854 xmm1 += A.load(i ,k) * b1;
7855 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7856 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7857 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7860 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7861 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7862 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7863 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
// --- Tile: 3 SIMD vectors of rows (only when neither LOW nor UPP). ---
7867 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
// Two columns per iteration: xmm1..xmm3 are stored to column j, xmm4..xmm6 to j+1.
7871 for( ; (j+2UL) <= N; j+=2UL )
7884 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7886 for(
size_t k=kbegin; k<kend; ++k ) {
7888 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7889 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7890 const SIMDType b1(
set( B(k,j ) ) );
7891 const SIMDType b2(
set( B(k,j+1UL) ) );
7900 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7901 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7902 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7903 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
7904 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
7905 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
// Single-column variant of the 3-vector tile (guard and declarations not visible).
7919 for(
size_t k=kbegin; k<kend; ++k ) {
7920 const SIMDType b1(
set( B(k,j) ) );
7921 xmm1 += A.load(i ,k) * b1;
7922 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7923 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7926 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7927 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7928 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
// --- Tile: 2 SIMD vectors of rows (skipped only when both LOW and UPP are set). ---
7932 for( ; !( LOW &&
UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Column range: capped at i+SIMDSIZE*2 when LOW is set, started at i when UPP is set.
7934 const size_t jend( LOW ?
min(i+SIMDSIZE*2UL,N) : N );
7935 size_t j( UPP ? i : 0UL );
// Four columns per iteration.
7937 for( ; (j+4UL) <= jend; j+=4UL )
7950 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7952 for(
size_t k=kbegin; k<kend; ++k ) {
7954 const SIMDType a2( A.load(i+SIMDSIZE,k) );
7955 const SIMDType b1(
set( B(k,j ) ) );
7956 const SIMDType b2(
set( B(k,j+1UL) ) );
7957 const SIMDType b3(
set( B(k,j+2UL) ) );
7958 const SIMDType b4(
set( B(k,j+3UL) ) );
7969 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7970 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
7971 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7972 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
7973 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7974 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
7975 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7976 (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
// Three columns per iteration.
7979 for( ; (j+3UL) <= jend; j+=3UL )
7992 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7994 for(
size_t k=kbegin; k<kend; ++k ) {
7996 const SIMDType a2( A.load(i+SIMDSIZE,k) );
7997 const SIMDType b1(
set( B(k,j ) ) );
7998 const SIMDType b2(
set( B(k,j+1UL) ) );
7999 const SIMDType b3(
set( B(k,j+2UL) ) );
8008 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
8009 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
8010 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
8011 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8012 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
8013 (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
// Two columns per iteration; k is stepped by two first and then by one, so the partial
// sums are split over two accumulator sets that are added together before the store.
8016 for( ; (j+2UL) <= jend; j+=2UL )
8029 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8032 for( ; (k+2UL) <= kend; k+=2UL ) {
8033 const SIMDType a1( A.load(i ,k ) );
8034 const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8035 const SIMDType a3( A.load(i ,k+1UL) );
8036 const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8037 const SIMDType b1(
set( B(k ,j ) ) );
8038 const SIMDType b2(
set( B(k ,j+1UL) ) );
8039 const SIMDType b3(
set( B(k+1UL,j ) ) );
8040 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
8051 for( ; k<kend; ++k ) {
8053 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8054 const SIMDType b1(
set( B(k,j ) ) );
8055 const SIMDType b2(
set( B(k,j+1UL) ) );
8062 (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
8063 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
8064 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
8065 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
// Single leftover column of the 2-vector tile (guard and declarations not visible).
8080 for( ; (k+2UL) <= kend; k+=2UL ) {
8081 const SIMDType b1(
set( B(k ,j) ) );
8082 const SIMDType b2(
set( B(k+1UL,j) ) );
8083 xmm1 += A.load(i ,k ) * b1;
8084 xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8085 xmm3 += A.load(i ,k+1UL) * b2;
8086 xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8089 for( ; k<kend; ++k ) {
8090 const SIMDType b1(
set( B(k,j) ) );
8091 xmm1 += A.load(i ,k) * b1;
8092 xmm2 += A.load(i+SIMDSIZE,k) * b1;
8095 (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
8096 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
// --- Tile: 1 SIMD vector of rows; handles whatever is left below ipos. ---
8100 for( ; i<ipos; i+=SIMDSIZE )
// Column range: capped at i+SIMDSIZE only when both LOW and UPP are set; started at i
// when UPP is set.
8102 const size_t jend( LOW && UPP ?
min(i+SIMDSIZE,N) : N );
8103 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two (xmm1..4 and xmm5..8 are summed).
8105 for( ; (j+4UL) <= jend; j+=4UL )
8116 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8119 for( ; (k+2UL) <= kend; k+=2UL ) {
8121 const SIMDType a2( A.load(i,k+1UL) );
8122 xmm1 += a1 *
set( B(k ,j ) );
8123 xmm2 += a1 *
set( B(k ,j+1UL) );
8124 xmm3 += a1 *
set( B(k ,j+2UL) );
8125 xmm4 += a1 *
set( B(k ,j+3UL) );
8126 xmm5 += a2 *
set( B(k+1UL,j ) );
8127 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8128 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8129 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
8132 for( ; k<kend; ++k ) {
8134 xmm1 += a1 *
set( B(k,j ) );
8135 xmm2 += a1 *
set( B(k,j+1UL) );
8136 xmm3 += a1 *
set( B(k,j+2UL) );
8137 xmm4 += a1 *
set( B(k,j+3UL) );
8140 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
8141 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
8142 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
8143 (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
// Three columns per iteration, k unrolled by two (xmm1..3 and xmm4..6 are summed).
8146 for( ; (j+3UL) <= jend; j+=3UL )
8157 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8160 for( ; (k+2UL) <= kend; k+=2UL ) {
8162 const SIMDType a2( A.load(i,k+1UL) );
8163 xmm1 += a1 *
set( B(k ,j ) );
8164 xmm2 += a1 *
set( B(k ,j+1UL) );
8165 xmm3 += a1 *
set( B(k ,j+2UL) );
8166 xmm4 += a2 *
set( B(k+1UL,j ) );
8167 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
8168 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
8171 for( ; k<kend; ++k ) {
8173 xmm1 += a1 *
set( B(k,j ) );
8174 xmm2 += a1 *
set( B(k,j+1UL) );
8175 xmm3 += a1 *
set( B(k,j+2UL) );
8178 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
8179 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
8180 (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
// Two columns per iteration, k unrolled by two (xmm1..2 and xmm3..4 are summed).
8183 for( ; (j+2UL) <= jend; j+=2UL )
8197 for( ; (k+2UL) <= kend; k+=2UL ) {
8199 const SIMDType a2( A.load(i,k+1UL) );
8200 xmm1 += a1 *
set( B(k ,j ) );
8201 xmm2 += a1 *
set( B(k ,j+1UL) );
8202 xmm3 += a2 *
set( B(k+1UL,j ) );
8203 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
8206 for( ; k<kend; ++k ) {
8208 xmm1 += a1 *
set( B(k,j ) );
8209 xmm2 += a1 *
set( B(k,j+1UL) );
8212 (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
8213 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
// Single leftover column; here the k loop runs up to K rather than kend.
8227 for( ; (k+2UL) <= K; k+=2UL ) {
8228 xmm1 += A.load(i,k ) *
set( B(k ,j) );
8229 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
8233 xmm1 += A.load(i,k) *
set( B(k,j) );
8236 (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
// --- Scalar loop over the rows from ipos to M (only when `remainder` is set). ---
8240 for( ; remainder && i<M; ++i )
// Column range: ends at the diagonal (i+1) when LOW is set, starts at i when UPP is set.
8242 const size_t jend( LOW ? i+1UL : N );
8243 size_t j( UPP ? i : 0UL );
// Two columns per iteration using plain element access.
8245 for( ; (j+2UL) <= jend; j+=2UL )
8259 for(
size_t k=kbegin; k<kend; ++k ) {
8260 value1 += A(i,k) * B(k,j );
8261 value2 += A(i,k) * B(k,j+1UL);
8264 (~C)(i,j ) -= value1 * scalar;
8265 (~C)(i,j+1UL) -= value2 * scalar;
// Single leftover column; the k loop runs up to K.
8278 for(
size_t k=kbegin; k<K; ++k ) {
8279 value += A(i,k) * B(k,j);
8282 (~C)(i,j) -= value * scalar;
// Large-matrix sub-assign kernel, forwarding overload: passes its arguments unchanged
// to selectDefaultSubAssignKernel.
// NOTE(review): the enabling condition is not visible -- presumably the case where the
// vectorized kernel cannot be used; confirm.
8302 template<
typename MT3
8307 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8309 selectDefaultSubAssignKernel( C, A, B, scalar );
// selectLargeSubAssignKernel() -- overload that hands the work to one of the
// lmmm() / ummm() / mmm() kernels. All three calls receive C, A, B followed by
// -scalar and ST2(1).
// NOTE(review): presumably the two trailing arguments are the factors applied to the
// product and to the existing content of C (i.e. C = -scalar*A*B + 1*C) -- confirm
// against the kernel declarations. The conditions that choose between the three calls
// are on lines missing from this listing.
8328 template<
typename MT3
8333 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8336 lmmm( C, A, B, -scalar, ST2(1) );
8338 ummm( C, A, B, -scalar, ST2(1) );
8340 mmm( C, A, B, -scalar, ST2(1) );
// selectBlasSubAssignKernel() -- forwarding overload.
// Passes C, A, B and scalar unchanged to selectLargeSubAssignKernel().
// NOTE(review): the remaining template parameters, the return type and any enabling
// condition sit on lines that are missing from this listing -- confirm in the full header.
8359 template<
typename MT3
8364 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8366 selectLargeSubAssignKernel( C, A, B, scalar );
// selectBlasSubAssignKernel() -- overload compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the #if below).
// Three code paths are visible:
//   1. trmm() on a temporary 'tmp' with A applied from the left (CblasLeft); the
//      triangle flag is CblasLower when IsLower<MT4>::value holds, otherwise CblasUpper;
//      the factor is ET(scalar). The temporary is then subtracted via subAssign( C, tmp ).
//   2. The same with B applied from the right (CblasRight), keyed on IsLower<MT5>.
//   3. gemm( C, A, B, ET(-scalar), ET(1) ).
// NOTE(review): the conditions selecting a path and the construction of 'tmp' are on
// lines missing from this listing; presumably the trmm paths are taken for triangular
// operands -- confirm in the full header.
8371 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8385 template<
typename MT3
8390 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8396 trmm( tmp, A, CblasLeft, (
IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8397 subAssign( C, tmp );
8401 trmm( tmp, B, CblasRight, (
IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8402 subAssign( C, tmp );
8405 gemm( C, A, B, ET(-scalar), ET(1) );
8425 template<
typename MT >
8436 const ForwardFunctor fwd;
8444 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.
scalar_ );
8446 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.
scalar_ );
8466 template<
typename MT
8480 schurAssign( ~lhs, tmp );
8511 template<
typename MT
8524 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8527 else if( left.columns() == 0UL ) {
8561 template<
typename MT
8580 const ForwardFunctor fwd;
8582 const TmpType tmp( rhs );
8601 template<
typename MT >
8612 const ForwardFunctor fwd;
8641 template<
typename MT
8654 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8687 template<
typename MT >
8698 const ForwardFunctor fwd;
8731 template<
typename MT
8744 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8777 template<
typename MT >
8788 const ForwardFunctor fwd;
8818 template<
typename MT
8898 template<
typename MT1
8900 inline decltype(
auto)
8946 template<
typename MT1
8961 return ReturnType( dm.leftOperand(), dm.rightOperand() );
8990 template<
typename MT1
9005 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9034 template<
typename MT1
9049 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9078 template<
typename MT1
9093 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9122 template<
typename MT1
9137 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size trait specialization for TDMatTDMatMultExpr, dimension index 0UL.
// Derives from Size<MT1,0UL>, so the value is whatever the left-hand operand
// type MT1 reports for that dimension.
9153 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9154 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9155 :
public Size<MT1,0UL>
// Size trait specialization for TDMatTDMatMultExpr, dimension index 1UL.
// Derives from Size<MT2,1UL>, so the value is whatever the right-hand operand
// type MT2 reports for that dimension.
9158 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9159 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9160 :
public Size<MT2,1UL>
// IsAligned trait specialization for TDMatTDMatMultExpr.
// The expression type counts as aligned only if both operand types do:
// And< IsAligned<MT1>, IsAligned<MT2> >.
9176 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9177 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9178 :
public And< IsAligned<MT1>, IsAligned<MT2> >
// IsSymmetric trait specialization for TDMatTDMatMultExpr.
// The base is an Or<> over three alternatives:
//   - Bool<SF>: the symmetric flag is set;
//   - an alternative ending in IsBuiltin<> of the element type of the
//     <MT1,MT2,false,true,false,false> instantiation;
//   - And< Bool<LF>, Bool<UF> >: both the lower and the upper flag are set.
// NOTE(review): the line that opens the middle alternative is missing from this listing;
// presumably it combines Bool<HF> with the IsBuiltin<> check -- confirm in the full header.
9194 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9195 struct IsSymmetric< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9196 :
public Or< Bool<SF>
9198 , IsBuiltin< ElementType_< TDMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
9199 , And< Bool<LF>, Bool<UF> > >
// IsHermitian trait -- partial specialization that matches only when the fourth
// template argument of TDMatTDMatMultExpr (the HF flag position) is 'true'; SF, LF
// and UF remain free parameters.
// NOTE(review): the base class / body of this specialization is not visible in this
// listing -- confirm the resulting value in the full header.
9215 template<
typename MT1,
typename MT2,
bool SF,
bool LF,
bool UF >
9216 struct IsHermitian< TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
// IsLower trait specialization for TDMatTDMatMultExpr.
// The expression type is treated as lower if any of the following holds:
//   - the LF flag is set;
//   - both operand types are lower (IsLower<MT1> and IsLower<MT2>);
//   - SF or HF is set and both operand types are upper (IsUpper<MT1> and IsUpper<MT2>).
9233 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9234 struct IsLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9235 :
public Or< Bool<LF>
9236 , And< IsLower<MT1>, IsLower<MT2> >
9237 , And< Or< Bool<SF>, Bool<HF> >
9238 , IsUpper<MT1>, IsUpper<MT2> > >
// IsUniLower trait specialization for TDMatTDMatMultExpr.
// The expression type is treated as lower unitriangular if either holds:
//   - both operand types are IsUniLower;
//   - SF or HF is set and both operand types are IsUniUpper.
// Unlike IsLower, the LF flag on its own is not part of this condition.
9254 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9255 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9256 :
public Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
9257 , And< Or< Bool<SF>, Bool<HF> >
9258 , IsUniUpper<MT1>, IsUniUpper<MT2> > >
// Trait specialization whose 'struct' line is missing from this listing.
// NOTE(review): the base expression suggests this is the IsStrictlyLower specialization
// for TDMatTDMatMultExpr -- confirm in the full header.
// The base is true if any of the following holds:
//   - MT1 is strictly lower and MT2 is lower;
//   - MT2 is strictly lower and MT1 is lower;
//   - SF or HF is set and the mirrored condition holds for the upper variants
//     (one operand strictly upper, the other upper).
9274 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9276 :
public Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9277 , And< IsStrictlyLower<MT2>, IsLower<MT1> >
9278 , And< Or< Bool<SF>, Bool<HF> >
9279 , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9280 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >
// IsUpper trait specialization for TDMatTDMatMultExpr.
// The expression type is treated as upper if any of the following holds:
//   - the UF flag is set;
//   - both operand types are upper (IsUpper<MT1> and IsUpper<MT2>);
//   - SF or HF is set and both operand types are lower (IsLower<MT1> and IsLower<MT2>).
9296 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9297 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9298 :
public Or< Bool<UF>
9299 , And< IsUpper<MT1>, IsUpper<MT2> >
9300 , And< Or< Bool<SF>, Bool<HF> >
9301 , IsLower<MT1>, IsLower<MT2> > >
// IsUniUpper trait specialization for TDMatTDMatMultExpr.
// The expression type is treated as upper unitriangular if either holds:
//   - both operand types are IsUniUpper;
//   - SF or HF is set and both operand types are IsUniLower.
// Unlike IsUpper, the UF flag on its own is not part of this condition.
9317 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9318 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9319 :
public Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
9320 , And< Or< Bool<SF>, Bool<HF> >
9321 , IsUniLower<MT1>, IsUniLower<MT2> > >
// Trait specialization whose 'struct' line is missing from this listing.
// NOTE(review): the base expression suggests this is the IsStrictlyUpper specialization
// for TDMatTDMatMultExpr -- confirm in the full header.
// The base is true if any of the following holds:
//   - MT1 is strictly upper and MT2 is upper;
//   - MT2 is strictly upper and MT1 is upper;
//   - SF or HF is set and the mirrored condition holds for the lower variants
//     (one operand strictly lower, the other lower).
9337 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9339 :
public Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9340 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
9341 , And< Or< Bool<SF>, Bool<HF> >
9342 , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9343 , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:487
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:475
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:295
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:544
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:153
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:286
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:617
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:534
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:321
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1026
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:154
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:278
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:421
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:177
Header file for the IsComplexDouble type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:277
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:281
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:431
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:107
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:279
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:455
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1026
Header file for the IsLower type trait.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:176
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:86
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:280
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:616
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:430
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:488
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:107
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:411
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:174
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:292
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:385
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation. The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1028
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:101
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:154
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:789
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1028
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time logical 'or' evaluation.The Or alias declaration performs at compile time a logical 'or'...
Definition: Or.h:76
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:401
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:443
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:336
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation.The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:465
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:175
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:908
Header file for the IsResizable type trait.
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:283
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:282
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:289