35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
// Expression class header. The class derives publicly from
// DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true > (the class passes itself as
// the first template argument) and privately from MatMatMultExpr and Computation.
// NOTE(review): judging by the name this represents a transpose-dense-matrix /
// dense-matrix product - confirm against the full header.
// NOTE(review): only MT1 is visible in this template header, but MT2 is used
// in the base class list, so a second parameter is presumably declared on a
// line that is not part of this excerpt.
120 template<
typename MT1
122 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
123 ,
private MatMatMultExpr
124 ,
private Computation
// Compile-time predicate: 'value' is set when evaluateLeft or evaluateRight is
// set. The template parameters T1-T3 do not appear in the visible expression.
// It is used further down as the EnableIf condition of friend functions.
152 template<
typename T1,
typename T2,
typename T3 >
153 struct IsEvaluationRequired {
154 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate that gates the single precision BLAS kernel (the
// overloads that call sgemm): all three matrix types must be vectorizable and
// IsFloat must hold for each of their element types.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseSinglePrecisionKernel {
167 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
168 IsFloat<typename T1::ElementType>::value &&
169 IsFloat<typename T2::ElementType>::value &&
170 IsFloat<typename T3::ElementType>::value };
// Compile-time predicate that gates the double precision BLAS kernel (the
// overloads that call dgemm): all three matrix types must be vectorizable and
// IsDouble must hold for each of their element types.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseDoublePrecisionKernel {
183 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
184 IsDouble<typename T1::ElementType>::value &&
185 IsDouble<typename T2::ElementType>::value &&
186 IsDouble<typename T3::ElementType>::value };
// Compile-time predicate that gates the single precision complex BLAS kernel
// (the overloads that call cgemm): all three matrix types must be vectorizable
// and each element type must be exactly complex<float>.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseSinglePrecisionComplexKernel {
// Element type every operand is compared against.
199 typedef complex<float> Type;
201 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
202 IsSame<typename T1::ElementType,Type>::value &&
203 IsSame<typename T2::ElementType,Type>::value &&
204 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate that gates the double precision complex BLAS kernel
// (the overloads that call zgemm): all three matrix types must be vectorizable
// and each element type must be exactly complex<double>.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
215 template<
typename T1,
typename T2,
typename T3 >
216 struct UseDoublePrecisionComplexKernel {
// Element type every operand is compared against.
217 typedef complex<double> Type;
219 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
220 IsSame<typename T1::ElementType,Type>::value &&
221 IsSame<typename T2::ElementType,Type>::value &&
222 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the fallback path: set when BLAZE_BLAS_MODE is
// off, or when none of the four BLAS kernel predicates above applies to the
// given type combination.
232 template<
typename T1,
typename T2,
typename T3 >
233 struct UseDefaultKernel {
234 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
235 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
236 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
237 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the vectorized default kernels: all three matrix
// types must be vectorizable, share one element type, and IntrinsicTrait of
// that element type must report addition, subtraction and multiplication.
247 template<
typename T1,
typename T2,
typename T3 >
248 struct UseVectorizedDefaultKernel {
249 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
250 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
251 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
252 IntrinsicTrait<typename T1::ElementType>::addition &&
253 IntrinsicTrait<typename T1::ElementType>::subtraction &&
254 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Vectorization flag of the expression; starts from the vectorizable flags of
// both operands (the remainder of the condition is not visible in this excerpt).
285 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
// SMP assignment flag: set only if neither operand has its evaluate flag set
// and both operand types are themselves smpAssignable.
291 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
292 !evaluateRight && MT2::smpAssignable };
// Fragment of the element computation (presumably operator()(i,j) - the
// function header is not visible): accumulates products of elements of row i
// of lhs_ and column j of rhs_ into tmp.
// Nothing is accumulated when the inner dimension is zero.
322 if(
lhs_.columns() != 0UL ) {
// 'end' is ((columns-1) rounded down to an even number) + 1, i.e. always odd.
// The loop below starts at k=1 and advances by two, so k+1 never exceeds
// columns-1.
323 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
325 for(
size_t k=1UL; k<
end; k+=2UL ) {
// Second product of the two-way unrolled step (the line handling index k
// itself is not visible in this excerpt).
327 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// One index is left over when 'end' is still a valid inner index.
329 if( end <
lhs_.columns() ) {
// Forwards the column count of the right-hand side operand (presumably the
// body of columns(); the function header is not visible in this excerpt).
357 return rhs_.columns();
// Alias check fragment: reports true if either operand reports being aliased
// with the given address.
// NOTE(review): the function name is not visible; presumably canAlias().
387 template<
typename T >
389 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Alias check fragment: reports true if either operand reports being aliased
// with the given address.
// NOTE(review): the function name is not visible; presumably isAliased().
399 template<
typename T >
401 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// The expression counts as aligned only if both operands report being aligned
// (presumably the body of isAligned(); the header is not visible).
411 return lhs_.isAligned() &&
rhs_.isAligned();
// Single precision BLAS wrapper. Forwards C, A, B together with the scaling
// factors alpha and beta to CBLAS; per the BLAS convention the routines compute
// C = alpha*A*B + beta*C.
// NOTE(review): the branch conditions and the name of the final call are not
// visible in this excerpt; the sibling wrappers show the same three-way
// structure (symm / symm / gemm).
450 template<
typename MT3
453 static inline void sgemm( MT3& C,
const MT4& A,
const MT5& B,
float alpha,
float beta )
// Sizes and leading dimensions are converted from the matrix accessors to int
// through boost::numeric_cast.
455 using boost::numeric_cast;
461 const int M ( numeric_cast<int>( A.rows() ) );
462 const int N ( numeric_cast<int>( B.columns() ) );
463 const int K ( numeric_cast<int>( A.columns() ) );
464 const int lda( numeric_cast<int>( A.spacing() ) );
465 const int ldb( numeric_cast<int>( B.spacing() ) );
466 const int ldc( numeric_cast<int>( C.spacing() ) );
// Symmetric-matrix routine, row-major layout, A passed as the left operand.
469 cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper,
470 M, N, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// Symmetric-matrix routine, column-major layout, B passed first as the
// right-side symmetric operand.
473 cblas_ssymm( CblasColMajor, CblasRight, CblasLower,
474 M, N, alpha, B.data(), ldb, A.data(), lda, beta, C.data(), ldc );
// Argument list of the general fallback call (its first line is not visible).
480 M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// Double precision BLAS wrapper. Forwards C, A, B together with the scaling
// factors alpha and beta to CBLAS; per the BLAS convention the routines compute
// C = alpha*A*B + beta*C.
// NOTE(review): the condition of the first branch is not visible in this excerpt.
504 template<
typename MT3
507 static inline void dgemm( MT3& C,
const MT4& A,
const MT5& B,
double alpha,
double beta )
// Sizes and leading dimensions are converted from the matrix accessors to int
// through boost::numeric_cast.
509 using boost::numeric_cast;
515 const int M ( numeric_cast<int>( A.rows() ) );
516 const int N ( numeric_cast<int>( B.columns() ) );
517 const int K ( numeric_cast<int>( A.columns() ) );
518 const int lda( numeric_cast<int>( A.spacing() ) );
519 const int ldb( numeric_cast<int>( B.spacing() ) );
520 const int ldc( numeric_cast<int>( C.spacing() ) );
// Symmetric-matrix routine, row-major layout, A passed as the left operand.
523 cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper,
524 M, N, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// B is symmetric and C is column-major: B is passed first as the right-side
// symmetric operand.
526 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
527 cblas_dsymm( CblasColMajor, CblasRight, CblasLower,
528 M, N, alpha, B.data(), ldb, A.data(), lda, beta, C.data(), ldc );
// General case: layout follows the storage order of C. For a row-major C the
// operand A is flagged as transposed, for a column-major C the operand B is.
531 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
532 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
533 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
534 M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// Single precision complex BLAS wrapper. Forwards C, A, B and the scaling
// factors to CBLAS; per the BLAS convention the routines compute
// C = alpha*A*B + beta*C. The complex scalars are handed over by address.
558 template<
typename MT3
561 static inline void cgemm( MT3& C,
const MT4& A,
const MT5& B,
562 complex<float> alpha, complex<float> beta )
// Sizes and leading dimensions are converted from the matrix accessors to int
// through boost::numeric_cast.
564 using boost::numeric_cast;
573 const int M ( numeric_cast<int>( A.rows() ) );
574 const int N ( numeric_cast<int>( B.columns() ) );
575 const int K ( numeric_cast<int>( A.columns() ) );
576 const int lda( numeric_cast<int>( A.spacing() ) );
577 const int ldb( numeric_cast<int>( B.spacing() ) );
578 const int ldc( numeric_cast<int>( C.spacing() ) );
// A is symmetric and C is row-major: symmetric routine with A on the left.
580 if( IsSymmetric<MT4>::value && IsRowMajorMatrix<MT3>::value ) {
581 cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper,
582 M, N, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// B is symmetric and C is column-major: symmetric routine with B passed first
// as the right-side operand.
584 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
585 cblas_csymm( CblasColMajor, CblasRight, CblasLower,
586 M, N, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc );
// General case: layout follows the storage order of C. For a row-major C the
// operand A is flagged as transposed, for a column-major C the operand B is.
589 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
590 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
591 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
592 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS wrapper. Forwards C, A, B and the scaling
// factors to CBLAS; per the BLAS convention the routines compute
// C = alpha*A*B + beta*C. The complex scalars are handed over by address.
616 template<
typename MT3
619 static inline void zgemm( MT3& C,
const MT4& A,
const MT5& B,
620 complex<double> alpha, complex<double> beta )
// Sizes and leading dimensions are converted from the matrix accessors to int
// through boost::numeric_cast.
622 using boost::numeric_cast;
631 const int M ( numeric_cast<int>( A.rows() ) );
632 const int N ( numeric_cast<int>( B.columns() ) );
633 const int K ( numeric_cast<int>( A.columns() ) );
634 const int lda( numeric_cast<int>( A.spacing() ) );
635 const int ldb( numeric_cast<int>( B.spacing() ) );
636 const int ldc( numeric_cast<int>( C.spacing() ) );
// A is symmetric and C is row-major: symmetric routine with A on the left.
638 if( IsSymmetric<MT4>::value && IsRowMajorMatrix<MT3>::value ) {
639 cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper,
640 M, N, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// B is symmetric and C is column-major: symmetric routine with B passed first
// as the right-side operand.
642 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
643 cblas_zsymm( CblasColMajor, CblasRight, CblasLower,
644 M, N, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc );
// General case: layout follows the storage order of C. For a row-major C the
// operand A is flagged as transposed, for a column-major C the operand B is.
647 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
648 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
649 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
650 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of the assignment of the product to a dense matrix (function header
// not visible). Two degenerate cases are tested before the kernel dispatch;
// what each branch does is not visible in this excerpt.
670 template<
typename MT
// Case 1: the target matrix has no rows or no columns.
679 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
682 else if( rhs.lhs_.columns() == 0UL ) {
// Regular case: hand the target and the operands A and B (declared on lines
// not visible here) to the kernel selection.
697 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel selection for the assignment C = A*B: forwards to either the default
// kernel or the BLAS kernel family.
// NOTE(review): the condition choosing between the two calls is not visible
// in this excerpt.
713 template<
typename MT3
716 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
719 TDMatDMatMultExpr::selectDefaultAssignKernel( C, A, B );
721 TDMatDMatMultExpr::selectBlasAssignKernel( C, A, B );
// Scalar default kernel for C = A*B, selected whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold.
740 template<
typename MT3
743 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
744 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
746 const size_t M( A.rows() );
747 const size_t N( B.columns() );
748 const size_t K( A.columns() );
750 for(
size_t i=0UL; i<M; ++i ) {
// The k=0 products initialise row i of C by plain assignment, so previous
// contents of C are overwritten rather than accumulated.
751 for(
size_t j=0UL; j<N; ++j ) {
752 C(i,j) = A(i,0UL) * B(0UL,j);
// The remaining inner indices are accumulated on top, walking j in the
// innermost loop.
754 for(
size_t k=1UL; k<K; ++k ) {
755 for(
size_t j=0UL; j<N; ++j ) {
756 C(i,j) += A(i,k) * B(k,j);
// Vectorized default kernel for C = A*B with a DenseMatrix<MT3,false> target
// (presumably row-major - confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The kernel works in tiers of decreasing width: blocks of IT::size columns of
// B are loaded as intrinsic vectors, multiplied with values from A and summed
// in the xmm accumulators, which are finally stored to C.
// NOTE(review): the column-loop headers, the declarations of a1/a2/b1..b4 and
// most store calls are not visible in this excerpt.
778 template<
typename MT3
781 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
782 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// IT::size is the number of elements per intrinsic vector.
784 typedef IntrinsicTrait<ElementType> IT;
786 const size_t M( A.rows() );
787 const size_t N( B.columns() );
788 const size_t K( A.columns() );
// Tier 1: one row of C, eight vectors of columns per pass.
793 for(
size_t i=0UL; i<M; ++i ) {
794 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
795 for(
size_t k=0UL; k<K; ++k ) {
797 xmm1 = xmm1 + a1 * B.load(k,j );
798 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
799 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
800 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
801 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
802 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
803 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
804 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
806 (~C).
store( i, j , xmm1 );
// Tier 2: two rows of C at once, four vectors per row
// (xmm1-4 for row i, xmm5-8 for row i+1).
818 for( ; (i+2UL) <= M; i+=2UL ) {
819 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
820 for(
size_t k=0UL; k<K; ++k ) {
827 xmm1 = xmm1 + a1 * b1;
828 xmm2 = xmm2 + a1 * b2;
829 xmm3 = xmm3 + a1 * b3;
830 xmm4 = xmm4 + a1 * b4;
831 xmm5 = xmm5 + a2 * b1;
832 xmm6 = xmm6 + a2 * b2;
833 xmm7 = xmm7 + a2 * b3;
834 xmm8 = xmm8 + a2 * b4;
836 (~C).
store( i , j , xmm1 );
840 (~C).
store( i+1UL, j , xmm5 );
// Single-row pass with four vectors (presumably the leftover row of tier 2).
847 for(
size_t k=0UL; k<K; ++k ) {
849 xmm1 = xmm1 + a1 * B.load(k,j );
850 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
851 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
852 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
854 (~C).
store( i, j , xmm1 );
// Tier 3: two rows at once, two vectors per row.
862 for( ; (i+2UL) <= M; i+=2UL ) {
864 for(
size_t k=0UL; k<K; ++k ) {
869 xmm1 = xmm1 + a1 * b1;
870 xmm2 = xmm2 + a1 * b2;
871 xmm3 = xmm3 + a2 * b1;
872 xmm4 = xmm4 + a2 * b2;
874 (~C).
store( i , j , xmm1 );
876 (~C).
store( i+1UL, j , xmm3 );
// Single-row pass with two vectors.
881 for(
size_t k=0UL; k<K; ++k ) {
883 xmm1 = xmm1 + a1 * B.load(k,j );
884 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
886 (~C).
store( i, j , xmm1 );
// Tier 4: two rows at once, a single vector of columns; the A values are
// expanded with set() directly in the update.
892 for( ; (i+2UL) <= M; i+=2UL ) {
894 for(
size_t k=0UL; k<K; ++k ) {
896 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
897 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
899 (~C).
store( i , j, xmm1 );
900 (~C).
store( i+1UL, j, xmm2 );
// Single-row pass with a single vector.
904 for(
size_t k=0UL; k<K; ++k ) {
905 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
907 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for C = A*B with a DenseMatrix<MT3,true> target
// (presumably column-major - confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Mirror image of the <MT3,false> overload: here blocks of IT::size rows of A
// are loaded as intrinsic vectors and multiplied with values taken from B.
// NOTE(review): the row-loop headers, the declarations of a1..a4/b1/b2 and
// most store calls are not visible in this excerpt.
928 template<
typename MT3
931 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
932 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// IT::size is the number of elements per intrinsic vector.
934 typedef IntrinsicTrait<ElementType> IT;
936 const size_t M( A.rows() );
937 const size_t N( B.columns() );
938 const size_t K( A.columns() );
// Tier 1: one column of C, eight vectors of rows per pass.
943 for(
size_t j=0UL; j<N; ++j ) {
944 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
945 for(
size_t k=0UL; k<K; ++k ) {
947 xmm1 = xmm1 + A.load(i ,k) * b1;
948 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
949 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
950 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
951 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
952 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
953 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
954 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
956 (~C).
store( i , j, xmm1 );
// Tier 2: two columns of C at once, four vectors per column
// (xmm1-4 for column j, xmm5-8 for column j+1).
968 for( ; (j+2UL) <= N; j+=2UL ) {
969 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
970 for(
size_t k=0UL; k<K; ++k ) {
977 xmm1 = xmm1 + a1 * b1;
978 xmm2 = xmm2 + a2 * b1;
979 xmm3 = xmm3 + a3 * b1;
980 xmm4 = xmm4 + a4 * b1;
981 xmm5 = xmm5 + a1 * b2;
982 xmm6 = xmm6 + a2 * b2;
983 xmm7 = xmm7 + a3 * b2;
984 xmm8 = xmm8 + a4 * b2;
986 (~C).
store( i , j , xmm1 );
990 (~C).
store( i , j+1UL, xmm5 );
// Single-column pass with four vectors (presumably the leftover column of tier 2).
997 for(
size_t k=0UL; k<K; ++k ) {
999 xmm1 = xmm1 + A.load(i ,k) * b1;
1000 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1001 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1002 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1004 (~C).
store( i , j, xmm1 );
// Tier 3: two columns at once, two vectors per column.
1012 for( ; (j+2UL) <= N; j+=2UL ) {
1014 for(
size_t k=0UL; k<K; ++k ) {
1019 xmm1 = xmm1 + a1 * b1;
1020 xmm2 = xmm2 + a2 * b1;
1021 xmm3 = xmm3 + a1 * b2;
1022 xmm4 = xmm4 + a2 * b2;
1024 (~C).
store( i , j , xmm1 );
1026 (~C).
store( i , j+1UL, xmm3 );
// Single-column pass with two vectors.
1031 for(
size_t k=0UL; k<K; ++k ) {
1033 xmm1 = xmm1 + A.load(i ,k) * b1;
1034 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
1036 (~C).
store( i , j, xmm1 );
// Tier 4: two columns at once, a single vector of rows; the B values are
// expanded with set() directly in the update.
1042 for( ; (j+2UL) <= N; j+=2UL ) {
1044 for(
size_t k=0UL; k<K; ++k ) {
1046 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1047 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1049 (~C).
store( i, j , xmm1 );
1050 (~C).
store( i, j+1UL, xmm2 );
// Single-column pass with a single vector.
1054 for(
size_t k=0UL; k<K; ++k ) {
1055 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1057 (~C).
store( i, j, xmm1 );
// BLAS-family overload chosen when UseDefaultKernel<MT3,MT4,MT5> holds, i.e.
// no BLAS routine applies: simply falls back to the default assignment kernel.
1078 template<
typename MT3
1081 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1082 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1084 selectDefaultAssignKernel( C, A, B );
// Single precision BLAS assignment C = A*B: calls sgemm with alpha = 1 and
// beta = 0, so the previous contents of C do not contribute to the result.
1104 template<
typename MT3
1107 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1108 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1110 sgemm( C, A, B, 1.0F, 0.0F );
// Double precision BLAS assignment C = A*B: calls dgemm with alpha = 1 and
// beta = 0, so the previous contents of C do not contribute to the result.
1131 template<
typename MT3
1134 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1135 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1137 dgemm( C, A, B, 1.0, 0.0 );
// Single precision complex BLAS assignment C = A*B: calls cgemm with
// alpha = (1,0) and beta = (0,0), so the previous contents of C do not
// contribute to the result.
1158 template<
typename MT3
1161 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1162 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1164 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// Double precision complex BLAS assignment C = A*B: calls zgemm with
// alpha = (1,0) and beta = (0,0), so the previous contents of C do not
// contribute to the result.
1185 template<
typename MT3
1188 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1189 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1191 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Fragment of an assignment that goes through a temporary (function header
// not visible; presumably the overload for sparse targets - confirm).
1210 template<
typename MT
// The temporary uses ResultType or OppositeType depending on the flag SO.
1216 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// The expression is evaluated through serial() into the temporary.
1228 const TmpType tmp(
serial( rhs ) );
// Fragment of the addition assignment of the product to a dense matrix
// (function header not visible).
1247 template<
typename MT
// Degenerate case: an empty target or a zero inner dimension (the body of
// this branch is not visible in this excerpt).
1256 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Regular case: hand the target and the operands A and B (declared on lines
// not visible here) to the kernel selection.
1270 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for the addition assignment C += A*B: forwards to either
// the default kernel or the BLAS kernel family.
// NOTE(review): the condition choosing between the two calls is not visible
// in this excerpt.
1286 template<
typename MT3
1289 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1292 TDMatDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1294 TDMatDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
// Scalar default kernel for C += A*B, selected whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold.
1313 template<
typename MT3
1316 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1317 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1319 const size_t M( A.rows() );
1320 const size_t N( B.columns() );
1321 const size_t K( A.columns() );
// N rounded down to an even number: upper bound of the two-way unrolled
// column loop.
1324 const size_t end( N &
size_t(-2) );
1326 for(
size_t i=0UL; i<M; ++i ) {
1327 for(
size_t k=0UL; k<K; ++k ) {
// Two columns of C are updated per iteration.
1328 for(
size_t j=0UL; j<
end; j+=2UL ) {
1329 C(i,j ) += A(i,k) * B(k,j );
1330 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of the column at index 'end' (presumably guarded by a check for odd
// N on a line that is not visible in this excerpt).
1333 C(i,end) += A(i,k) * B(k,end);
// Vectorized default kernel for C += A*B with a DenseMatrix<MT3,false> target
// (presumably row-major - confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Works in tiers of decreasing width: blocks of IT::size columns of B are
// loaded as intrinsic vectors, multiplied with values from A and added to the
// xmm accumulators, which are finally stored to C.
// NOTE(review): the accumulator declarations (presumably initialised from the
// current contents of C), the column-loop headers, the declarations of
// a1/a2/b1..b4 and most store calls are not visible in this excerpt.
1355 template<
typename MT3
1358 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1359 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// IT::size is the number of elements per intrinsic vector.
1361 typedef IntrinsicTrait<ElementType> IT;
1363 const size_t M( A.rows() );
1364 const size_t N( B.columns() );
1365 const size_t K( A.columns() );
// Tier 1: one row of C, eight vectors of columns per pass.
1370 for(
size_t i=0UL; i<M; ++i ) {
1379 for(
size_t k=0UL; k<K; ++k ) {
1381 xmm1 = xmm1 + a1 * B.load(k,j );
1382 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1383 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1384 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1385 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
1386 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
1387 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
1388 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
1390 (~C).
store( i, j , xmm1 );
// Tier 2: two rows of C at once, four vectors per row
// (xmm1-4 for row i, xmm5-8 for row i+1).
1402 for( ; (i+2UL) <= M; i+=2UL ) {
1411 for(
size_t k=0UL; k<K; ++k ) {
1418 xmm1 = xmm1 + a1 * b1;
1419 xmm2 = xmm2 + a1 * b2;
1420 xmm3 = xmm3 + a1 * b3;
1421 xmm4 = xmm4 + a1 * b4;
1422 xmm5 = xmm5 + a2 * b1;
1423 xmm6 = xmm6 + a2 * b2;
1424 xmm7 = xmm7 + a2 * b3;
1425 xmm8 = xmm8 + a2 * b4;
1427 (~C).
store( i , j , xmm1 );
1431 (~C).
store( i+1UL, j , xmm5 );
// Single-row pass with four vectors (presumably the leftover row of tier 2).
1441 for(
size_t k=0UL; k<K; ++k ) {
1443 xmm1 = xmm1 + a1 * B.load(k,j );
1444 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1445 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1446 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1448 (~C).
store( i, j , xmm1 );
// Tier 3: two rows at once, two vectors per row.
1456 for( ; (i+2UL) <= M; i+=2UL ) {
1461 for(
size_t k=0UL; k<K; ++k ) {
1466 xmm1 = xmm1 + a1 * b1;
1467 xmm2 = xmm2 + a1 * b2;
1468 xmm3 = xmm3 + a2 * b1;
1469 xmm4 = xmm4 + a2 * b2;
1471 (~C).
store( i , j , xmm1 );
1473 (~C).
store( i+1UL, j , xmm3 );
// Single-row pass with two vectors.
1479 for(
size_t k=0UL; k<K; ++k ) {
1481 xmm1 = xmm1 + a1 * B.load(k,j );
1482 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
1484 (~C).
store( i, j , xmm1 );
// Tier 4: two rows at once, a single vector of columns; the A values are
// expanded with set() directly in the update.
1490 for( ; (i+2UL) <= M; i+=2UL ) {
1493 for(
size_t k=0UL; k<K; ++k ) {
1495 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1496 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1498 (~C).
store( i , j, xmm1 );
1499 (~C).
store( i+1UL, j, xmm2 );
// Single-row pass with a single vector.
1503 for(
size_t k=0UL; k<K; ++k ) {
1504 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1506 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for C += A*B with a DenseMatrix<MT3,true> target
// (presumably column-major - confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Mirror image of the <MT3,false> overload: blocks of IT::size rows of A are
// loaded as intrinsic vectors and multiplied with values taken from B.
// NOTE(review): the accumulator declarations (presumably initialised from the
// current contents of C), the row-loop headers, the declarations of
// a1..a4/b1/b2 and most store calls are not visible in this excerpt.
1527 template<
typename MT3
1530 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1531 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// IT::size is the number of elements per intrinsic vector.
1533 typedef IntrinsicTrait<ElementType> IT;
1535 const size_t M( A.rows() );
1536 const size_t N( B.columns() );
1537 const size_t K( A.columns() );
// Tier 1: one column of C, eight vectors of rows per pass.
1542 for(
size_t j=0UL; j<N; ++j ) {
1551 for(
size_t k=0UL; k<K; ++k ) {
1553 xmm1 = xmm1 + A.load(i ,k) * b1;
1554 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1555 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1556 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1557 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
1558 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
1559 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
1560 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
1562 (~C).
store( i , j, xmm1 );
// Tier 2: two columns of C at once, four vectors per column
// (xmm1-4 for column j, xmm5-8 for column j+1).
1574 for( ; (j+2UL) <= N; j+=2UL ) {
1583 for(
size_t k=0UL; k<K; ++k ) {
1590 xmm1 = xmm1 + a1 * b1;
1591 xmm2 = xmm2 + a2 * b1;
1592 xmm3 = xmm3 + a3 * b1;
1593 xmm4 = xmm4 + a4 * b1;
1594 xmm5 = xmm5 + a1 * b2;
1595 xmm6 = xmm6 + a2 * b2;
1596 xmm7 = xmm7 + a3 * b2;
1597 xmm8 = xmm8 + a4 * b2;
1599 (~C).
store( i , j , xmm1 );
1603 (~C).
store( i , j+1UL, xmm5 );
// Single-column pass with four vectors (presumably the leftover column of tier 2).
1613 for(
size_t k=0UL; k<K; ++k ) {
1615 xmm1 = xmm1 + A.load(i ,k) * b1;
1616 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1617 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1618 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1620 (~C).
store( i , j, xmm1 );
// Tier 3: two columns at once, two vectors per column.
1628 for( ; (j+2UL) <= N; j+=2UL ) {
1633 for(
size_t k=0UL; k<K; ++k ) {
1638 xmm1 = xmm1 + a1 * b1;
1639 xmm2 = xmm2 + a2 * b1;
1640 xmm3 = xmm3 + a1 * b2;
1641 xmm4 = xmm4 + a2 * b2;
1643 (~C).
store( i , j , xmm1 );
1645 (~C).
store( i , j+1UL, xmm3 );
// Single-column pass with two vectors.
1651 for(
size_t k=0UL; k<K; ++k ) {
1653 xmm1 = xmm1 + A.load(i ,k) * b1;
1654 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
1656 (~C).
store( i , j, xmm1 );
// Tier 4: two columns at once, a single vector of rows; the B values are
// expanded with set() directly in the update.
1662 for( ; (j+2UL) <= N; j+=2UL ) {
1665 for(
size_t k=0UL; k<K; ++k ) {
1667 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1668 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1670 (~C).
store( i, j , xmm1 );
1671 (~C).
store( i, j+1UL, xmm2 );
// Single-column pass with a single vector.
1675 for(
size_t k=0UL; k<K; ++k ) {
1676 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1678 (~C).
store( i, j, xmm1 );
// BLAS-family overload chosen when UseDefaultKernel<MT3,MT4,MT5> holds, i.e.
// no BLAS routine applies: falls back to the default addition assignment kernel.
1699 template<
typename MT3
1702 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1703 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1705 selectDefaultAddAssignKernel( C, A, B );
// Single precision BLAS addition assignment C += A*B: calls sgemm with
// alpha = 1 and beta = 1, so the product is added to the existing C.
1725 template<
typename MT3
1728 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1729 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1731 sgemm( C, A, B, 1.0F, 1.0F );
// Double precision BLAS addition assignment C += A*B: calls dgemm with
// alpha = 1 and beta = 1, so the product is added to the existing C.
1752 template<
typename MT3
1755 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1756 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1758 dgemm( C, A, B, 1.0, 1.0 );
// Single precision complex BLAS addition assignment C += A*B: calls cgemm
// with alpha = (1,0) and beta = (1,0), so the product is added to the existing C.
1779 template<
typename MT3
1782 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1783 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1785 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// Double precision complex BLAS addition assignment C += A*B: calls zgemm
// with alpha = (1,0) and beta = (1,0), so the product is added to the existing C.
1806 template<
typename MT3
1809 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1810 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1812 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Fragment of the subtraction assignment of the product to a dense matrix
// (function header not visible).
1835 template<
typename MT
// Degenerate case: an empty target or a zero inner dimension (the body of
// this branch is not visible in this excerpt).
1844 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Regular case: hand the target and the operands A and B (declared on lines
// not visible here) to the kernel selection.
1858 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for the subtraction assignment C -= A*B: forwards to
// either the default kernel or the BLAS kernel family.
// NOTE(review): the condition choosing between the two calls is not visible
// in this excerpt.
1874 template<
typename MT3
1877 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1880 TDMatDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1882 TDMatDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
// Scalar default kernel for C -= A*B, selected whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold.
1901 template<
typename MT3
1904 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1905 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1907 const size_t M( A.rows() );
1908 const size_t N( B.columns() );
1909 const size_t K( A.columns() );
// N rounded down to an even number: upper bound of the two-way unrolled
// column loop.
1912 const size_t end( N &
size_t(-2) );
1914 for(
size_t i=0UL; i<M; ++i ) {
1915 for(
size_t k=0UL; k<K; ++k ) {
// Two columns of C are updated per iteration.
1916 for(
size_t j=0UL; j<
end; j+=2UL ) {
1917 C(i,j ) -= A(i,k) * B(k,j );
1918 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of the column at index 'end' (presumably guarded by a check for odd
// N on a line that is not visible in this excerpt).
1921 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default kernel for C -= A*B with a DenseMatrix<MT3,false> target
// (presumably row-major - confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Works in tiers of decreasing width: blocks of IT::size columns of B are
// loaded as intrinsic vectors, multiplied with values from A and subtracted
// from the xmm accumulators, which are finally stored to C.
// NOTE(review): the accumulator declarations (presumably initialised from the
// current contents of C), the column-loop headers, the declarations of
// a1/a2/b1..b4 and most store calls are not visible in this excerpt.
1943 template<
typename MT3
1946 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1947 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// IT::size is the number of elements per intrinsic vector.
1949 typedef IntrinsicTrait<ElementType> IT;
1951 const size_t M( A.rows() );
1952 const size_t N( B.columns() );
1953 const size_t K( A.columns() );
// Tier 1: one row of C, eight vectors of columns per pass.
1958 for(
size_t i=0UL; i<M; ++i ) {
1967 for(
size_t k=0UL; k<K; ++k ) {
1969 xmm1 = xmm1 - a1 * B.load(k,j );
1970 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
1971 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
1972 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
1973 xmm5 = xmm5 - a1 * B.load(k,j+
IT::size*4UL);
1974 xmm6 = xmm6 - a1 * B.load(k,j+
IT::size*5UL);
1975 xmm7 = xmm7 - a1 * B.load(k,j+
IT::size*6UL);
1976 xmm8 = xmm8 - a1 * B.load(k,j+
IT::size*7UL);
1978 (~C).
store( i, j , xmm1 );
// Tier 2: two rows of C at once, four vectors per row
// (xmm1-4 for row i, xmm5-8 for row i+1).
1990 for( ; (i+2UL) <= M; i+=2UL ) {
1999 for(
size_t k=0UL; k<K; ++k ) {
2006 xmm1 = xmm1 - a1 * b1;
2007 xmm2 = xmm2 - a1 * b2;
2008 xmm3 = xmm3 - a1 * b3;
2009 xmm4 = xmm4 - a1 * b4;
2010 xmm5 = xmm5 - a2 * b1;
2011 xmm6 = xmm6 - a2 * b2;
2012 xmm7 = xmm7 - a2 * b3;
2013 xmm8 = xmm8 - a2 * b4;
2015 (~C).
store( i , j , xmm1 );
2019 (~C).
store( i+1UL, j , xmm5 );
// Single-row pass with four vectors (presumably the leftover row of tier 2).
2029 for(
size_t k=0UL; k<K; ++k ) {
2031 xmm1 = xmm1 - a1 * B.load(k,j );
2032 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
2033 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
2034 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
2036 (~C).
store( i, j , xmm1 );
// Tier 3: two rows at once, two vectors per row.
2044 for( ; (i+2UL) <= M; i+=2UL ) {
2049 for(
size_t k=0UL; k<K; ++k ) {
2054 xmm1 = xmm1 - a1 * b1;
2055 xmm2 = xmm2 - a1 * b2;
2056 xmm3 = xmm3 - a2 * b1;
2057 xmm4 = xmm4 - a2 * b2;
2059 (~C).
store( i , j , xmm1 );
2061 (~C).
store( i+1UL, j , xmm3 );
// Single-row pass with two vectors.
2067 for(
size_t k=0UL; k<K; ++k ) {
2069 xmm1 = xmm1 - a1 * B.load(k,j );
2070 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size);
2072 (~C).
store( i, j , xmm1 );
// Tier 4: two rows at once, a single vector of columns; the A values are
// expanded with set() directly in the update.
2078 for( ; (i+2UL) <= M; i+=2UL ) {
2081 for(
size_t k=0UL; k<K; ++k ) {
2083 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
2084 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
2086 (~C).
store( i , j, xmm1 );
2087 (~C).
store( i+1UL, j, xmm2 );
// Single-row pass with a single vector.
2091 for(
size_t k=0UL; k<K; ++k ) {
2092 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
2094 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for C -= A*B with a DenseMatrix<MT3,true> target
// (presumably column-major - confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Mirror image of the <MT3,false> overload: blocks of IT::size rows of A are
// loaded as intrinsic vectors and multiplied with values taken from B.
// NOTE(review): the accumulator declarations (presumably initialised from the
// current contents of C), the row-loop headers, the declarations of
// a1..a4/b1/b2 and most store calls are not visible in this excerpt.
2115 template<
typename MT3
2118 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2119 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// IT::size is the number of elements per intrinsic vector.
2121 typedef IntrinsicTrait<ElementType> IT;
2123 const size_t M( A.rows() );
2124 const size_t N( B.columns() );
2125 const size_t K( A.columns() );
// Tier 1: one column of C, eight vectors of rows per pass.
2130 for(
size_t j=0UL; j<N; ++j ) {
2139 for(
size_t k=0UL; k<K; ++k ) {
2141 xmm1 = xmm1 - A.load(i ,k) * b1;
2142 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
2143 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
2144 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
2145 xmm5 = xmm5 - A.load(i+
IT::size*4UL,k) * b1;
2146 xmm6 = xmm6 - A.load(i+
IT::size*5UL,k) * b1;
2147 xmm7 = xmm7 - A.load(i+
IT::size*6UL,k) * b1;
2148 xmm8 = xmm8 - A.load(i+
IT::size*7UL,k) * b1;
2150 (~C).
store( i , j, xmm1 );
// Tier 2: two columns of C at once, four vectors per column
// (xmm1-4 for column j, xmm5-8 for column j+1).
2162 for( ; (j+2UL) <= N; j+=2UL ) {
2171 for(
size_t k=0UL; k<K; ++k ) {
2178 xmm1 = xmm1 - a1 * b1;
2179 xmm2 = xmm2 - a2 * b1;
2180 xmm3 = xmm3 - a3 * b1;
2181 xmm4 = xmm4 - a4 * b1;
2182 xmm5 = xmm5 - a1 * b2;
2183 xmm6 = xmm6 - a2 * b2;
2184 xmm7 = xmm7 - a3 * b2;
2185 xmm8 = xmm8 - a4 * b2;
2187 (~C).
store( i , j , xmm1 );
2191 (~C).
store( i , j+1UL, xmm5 );
// Single-column pass with four vectors (presumably the leftover column of tier 2).
2201 for(
size_t k=0UL; k<K; ++k ) {
2203 xmm1 = xmm1 - A.load(i ,k) * b1;
2204 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
2205 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
2206 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
2208 (~C).
store( i , j, xmm1 );
// Tier 3: two columns at once, two vectors per column.
2216 for( ; (j+2UL) <= N; j+=2UL ) {
2221 for(
size_t k=0UL; k<K; ++k ) {
2226 xmm1 = xmm1 - a1 * b1;
2227 xmm2 = xmm2 - a2 * b1;
2228 xmm3 = xmm3 - a1 * b2;
2229 xmm4 = xmm4 - a2 * b2;
2231 (~C).
store( i , j , xmm1 );
2233 (~C).
store( i , j+1UL, xmm3 );
// Single-column pass with two vectors.
2239 for(
size_t k=0UL; k<K; ++k ) {
2241 xmm1 = xmm1 - A.load(i ,k) * b1;
2242 xmm2 = xmm2 - A.load(i+
IT::size,k) * b1;
2244 (~C).
store( i , j, xmm1 );
// Tier 4: two columns at once, a single vector of rows; the B values are
// expanded with set() directly in the update.
2250 for( ; (j+2UL) <= N; j+=2UL ) {
2253 for(
size_t k=0UL; k<K; ++k ) {
2255 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2256 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2258 (~C).
store( i, j , xmm1 );
2259 (~C).
store( i, j+1UL, xmm2 );
// Single-column pass with a single vector.
2263 for(
size_t k=0UL; k<K; ++k ) {
2264 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
2266 (~C).
store( i, j, xmm1 );
// BLAS-family overload chosen when UseDefaultKernel<MT3,MT4,MT5> holds, i.e.
// no BLAS routine applies: falls back to the default subtraction assignment kernel.
2287 template<
typename MT3
2290 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2291 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2293 selectDefaultSubAssignKernel( C, A, B );
// Single precision BLAS subtraction assignment C -= A*B: calls sgemm with
// alpha = -1 and beta = 1, so the negated product is added to the existing C.
2313 template<
typename MT3
2316 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2317 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2319 sgemm( C, A, B, -1.0F, 1.0F );
// Double precision BLAS subtraction assignment C -= A*B: calls dgemm with
// alpha = -1 and beta = 1, so the negated product is added to the existing C.
2340 template<
typename MT3
2343 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2344 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2346 dgemm( C, A, B, -1.0, 1.0 );
// Single precision complex BLAS subtraction assignment C -= A*B: calls cgemm
// with alpha = (-1,0) and beta = (1,0), so the negated product is added to
// the existing C.
2367 template<
typename MT3
2370 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2371 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2373 cgemm( C, A, B, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// Double precision complex BLAS subtraction assignment C -= A*B: calls zgemm
// with alpha = (-1,0) and beta = (1,0), so the negated product is added to
// the existing C.
2394 template<
typename MT3
2397 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2398 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2400 zgemm( C, A, B, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Fragment of a friend function that is only enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name is not visible; presumably the SMP variant
// of the dense assignment - confirm.
2434 template<
typename MT
2436 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Case 1: the target matrix has no rows or no columns.
2444 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
2447 else if( rhs.lhs_.columns() == 0UL ) {
// Fragment of a friend function that is only enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds and that goes through a temporary.
// NOTE(review): the function name is not visible; presumably the SMP variant
// of the assignment via temporary - confirm.
2483 template<
typename MT
2485 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// The temporary uses ResultType or OppositeType depending on the flag SO.
2490 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// The expression is evaluated into the temporary by construction.
2502 const TmpType tmp( rhs );
// Fragment of a friend function that is only enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name is not visible; presumably the SMP variant
// of the addition assignment - confirm.
2524 template<
typename MT
2526 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Degenerate case: an empty target or a zero inner dimension.
2534 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fragment of a friend function that is only enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name is not visible; presumably the SMP variant
// of the subtraction assignment - confirm.
2573 template<
typename MT
2575 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Degenerate case: an empty target or a zero inner dimension.
2583 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Header fragment of a class for the product of a TDMatDMatMultExpr<MT1,MT2>
// with a scalar of type ST. It derives publicly from
// DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
// and privately from MatScalarMultExpr and Computation.
// NOTE(review): the line carrying the class name is not visible; judging by
// the base class this is a specialization of DMatScalarMultExpr - confirm.
2643 template<
typename MT1
2647 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2648 ,
private MatScalarMultExpr
2649 ,
private Computation
// Shorthand for the wrapped matrix product expression type.
2653 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// Set when the left operand type is a computation or requires evaluation.
2665 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Set when the right operand type is a computation or requires evaluation.
2670 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time predicate: 'value' is set when evaluateLeft or evaluateRight is
// set, i.e. when at least one operand type is a computation or requires
// evaluation. T1-T3 do not appear in the visible expression.
2678 template<
typename T1,
typename T2,
typename T3 >
2679 struct IsEvaluationRequired {
2680 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate for the single precision BLAS kernel of the scaled
// product: all three matrix types must be vectorizable with element types for
// which IsFloat holds, and the fourth type T4 must not be complex.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
2689 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2690 struct UseSinglePrecisionKernel {
2692 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2693 IsFloat<typename T1::ElementType>::value &&
2694 IsFloat<typename T2::ElementType>::value &&
2695 IsFloat<typename T3::ElementType>::value &&
2696 !IsComplex<T4>::value };
// Compile-time predicate for the double precision BLAS kernel of the scaled
// product: all three matrix types must be vectorizable with element types for
// which IsDouble holds, and the fourth type T4 must not be complex.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
2705 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2706 struct UseDoublePrecisionKernel {
2708 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2709 IsDouble<typename T1::ElementType>::value &&
2710 IsDouble<typename T2::ElementType>::value &&
2711 IsDouble<typename T3::ElementType>::value &&
2712 !IsComplex<T4>::value };
// Compile-time predicate for the single precision complex BLAS kernel of the
// scaled product: all three matrix types must be vectorizable and each element
// type must be exactly complex<float>. Unlike the real-valued predicates this
// one takes no scalar type parameter.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
2721 template<
typename T1,
typename T2,
typename T3 >
2722 struct UseSinglePrecisionComplexKernel {
// Element type every operand is compared against.
2723 typedef complex<float> Type;
2725 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2726 IsSame<typename T1::ElementType,Type>::value &&
2727 IsSame<typename T2::ElementType,Type>::value &&
2728 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the double precision complex BLAS kernel of the
// scaled product: all three matrix types must be vectorizable and each element
// type must be exactly complex<double>. Unlike the real-valued predicates this
// one takes no scalar type parameter.
// NOTE(review): the line that opens the enum is not visible in this excerpt.
2737 template<
typename T1,
typename T2,
typename T3 >
2738 struct UseDoublePrecisionComplexKernel {
// Element type every operand is compared against.
2739 typedef complex<double> Type;
2741 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2742 IsSame<typename T1::ElementType,Type>::value &&
2743 IsSame<typename T2::ElementType,Type>::value &&
2744 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the fallback path of the scaled product: set when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel predicates
// applies. T4 is forwarded only to the two real-valued predicates.
2752 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2753 struct UseDefaultKernel {
2754 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2755 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2756 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2757 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the vectorized default kernels of the scaled
// product: all three matrix types must be vectorizable and share one element
// type, T4 must be that same type, and IntrinsicTrait of the element type must
// report addition, subtraction and multiplication.
2765 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2766 struct UseVectorizedDefaultKernel {
2767 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2768 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2769 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2770 IsSame<typename T1::ElementType,T4>::value &&
2771 IntrinsicTrait<typename T1::ElementType>::addition &&
2772 IntrinsicTrait<typename T1::ElementType>::subtraction &&
2773 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression: the matrix product MMM scaled by ST.
2779 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type: MultTrait of RES and the scalar type (RES is declared on a
// line that is not visible in this excerpt).
2780 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic vector type that belongs to ElementType.
2784 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the matrix product expression.
2789 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Types used for the two operands of the inner product: const RT1 / const RT2
// when the corresponding evaluate flag is set, CT1 / CT2 otherwise.
2795 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
2798 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: both operands vectorizable, both element types and the
// scalar type identical, and intrinsic addition and multiplication available.
2803 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2804 IsSame<ET1,ET2>::value &&
2805 IsSame<ET1,ST>::value &&
2806 IntrinsicTrait<ET1>::addition &&
2807 IntrinsicTrait<ET1>::multiplication };
// SMP assignment flag: set only if neither operand has its evaluate flag set
// and both operand types are themselves smpAssignable.
2810 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
2811 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix product expression and the scalar factor
// (the initializer list and body are not visible in this excerpt).
2820 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access fragment (function header not visible): the element (i,j)
// of the product expression multiplied by the stored scalar.
2836 return matrix_(i,j) * scalar_;
2845 inline size_t rows()
const {
2846 return matrix_.rows();
2855 inline size_t columns()
const {
2856 return matrix_.columns();
2886 template<
typename T >
2887 inline bool canAlias(
const T* alias )
const {
2888 return matrix_.canAlias( alias );
2898 template<
typename T >
2899 inline bool isAliased(
const T* alias )
const {
2900 return matrix_.isAliased( alias );
2910 return matrix_.isAligned();
2920 typename MMM::RightOperand B( matrix_.rightOperand() );
// BLAS back end for single-precision (float) operands.
// Hands C, A and B (through data() and spacing()) together with the scaling factors
// alpha and beta to cblas_ssymm or cblas_sgemm.
// NOTE(review): this excerpt omits the remaining template parameters and the braces /
// else keyword surrounding the three CBLAS calls.
2949 template<
typename MT3
2952 static inline void sgemm( MT3& C,
const MT4& A,
const MT5& B,
float alpha,
float beta )
2954 using boost::numeric_cast;
// Extents (rows of A, columns of B, columns of A) and the spacing of each operand,
// converted to int with numeric_cast for the CBLAS interface.
2960 const int M ( numeric_cast<int>( A.rows() ) );
2961 const int N ( numeric_cast<int>( B.columns() ) );
2962 const int K ( numeric_cast<int>( A.columns() ) );
2963 const int lda( numeric_cast<int>( A.spacing() ) );
2964 const int ldb( numeric_cast<int>( B.spacing() ) );
2965 const int ldc( numeric_cast<int>( C.spacing() ) );
// A symmetric and C row-major: symmetric multiply with A on the left (CblasLeft, CblasUpper).
2967 if( IsSymmetric<MT4>::value && IsRowMajorMatrix<MT3>::value ) {
2968 cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper,
2969 M, N, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// B symmetric and C column-major: symmetric multiply with B passed first (CblasRight, CblasLower).
2971 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
2972 cblas_ssymm( CblasColMajor, CblasRight, CblasLower,
2973 M, N, alpha, B.data(), ldb, A.data(), lda, beta, C.data(), ldc );
// Remaining case: general multiply. The layout follows C's storage order; for a row-major C
// the transpose flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
2976 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2977 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2978 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2979 M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// BLAS back end for double-precision (double) operands.
// Hands C, A and B (through data() and spacing()) together with the scaling factors
// alpha and beta to cblas_dsymm or cblas_dgemm.
// NOTE(review): this excerpt omits the remaining template parameters and the braces /
// else keyword surrounding the three CBLAS calls.
3001 template<
typename MT3
3004 static inline void dgemm( MT3& C,
const MT4& A,
const MT5& B,
double alpha,
double beta )
3006 using boost::numeric_cast;
// Extents (rows of A, columns of B, columns of A) and the spacing of each operand,
// converted to int with numeric_cast for the CBLAS interface.
3012 const int M ( numeric_cast<int>( A.rows() ) );
3013 const int N ( numeric_cast<int>( B.columns() ) );
3014 const int K ( numeric_cast<int>( A.columns() ) );
3015 const int lda( numeric_cast<int>( A.spacing() ) );
3016 const int ldb( numeric_cast<int>( B.spacing() ) );
3017 const int ldc( numeric_cast<int>( C.spacing() ) );
// A symmetric and C row-major: symmetric multiply with A on the left (CblasLeft, CblasUpper).
3019 if( IsSymmetric<MT4>::value && IsRowMajorMatrix<MT3>::value ) {
3020 cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper,
3021 M, N, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// B symmetric and C column-major: symmetric multiply with B passed first (CblasRight, CblasLower).
3023 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
3024 cblas_dsymm( CblasColMajor, CblasRight, CblasLower,
3025 M, N, alpha, B.data(), ldb, A.data(), lda, beta, C.data(), ldc );
// Remaining case: general multiply. The layout follows C's storage order; for a row-major C
// the transpose flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3028 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3029 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3030 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3031 M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc );
// BLAS back end for complex<float> operands.
// Hands C, A and B (through data() and spacing()) to cblas_csymm or cblas_cgemm.
// alpha and beta are received by value and passed on by address (&alpha, &beta).
// NOTE(review): this excerpt omits the remaining template parameters and the braces /
// else keyword surrounding the three CBLAS calls.
3053 template<
typename MT3
3056 static inline void cgemm( MT3& C,
const MT4& A,
const MT5& B,
3057 complex<float> alpha, complex<float> beta )
3059 using boost::numeric_cast;
// Extents (rows of A, columns of B, columns of A) and the spacing of each operand,
// converted to int with numeric_cast for the CBLAS interface.
3068 const int M ( numeric_cast<int>( A.rows() ) );
3069 const int N ( numeric_cast<int>( B.columns() ) );
3070 const int K ( numeric_cast<int>( A.columns() ) );
3071 const int lda( numeric_cast<int>( A.spacing() ) );
3072 const int ldb( numeric_cast<int>( B.spacing() ) );
3073 const int ldc( numeric_cast<int>( C.spacing() ) );
// A symmetric and C row-major: symmetric multiply with A on the left (CblasLeft, CblasUpper).
3075 if( IsSymmetric<MT4>::value && IsRowMajorMatrix<MT3>::value ) {
3076 cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper,
3077 M, N, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// B symmetric and C column-major: symmetric multiply with B passed first (CblasRight, CblasLower).
3079 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
3080 cblas_csymm( CblasColMajor, CblasRight, CblasLower,
3081 M, N, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc );
// Remaining case: general multiply. The layout follows C's storage order; for a row-major C
// the transpose flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3084 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3085 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3086 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3087 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS back end for complex<double> operands.
// Hands C, A and B (through data() and spacing()) to cblas_zsymm or cblas_zgemm.
// alpha and beta are received by value and passed on by address (&alpha, &beta).
// NOTE(review): this excerpt omits the remaining template parameters and the braces /
// else keyword surrounding the three CBLAS calls.
3109 template<
typename MT3
3112 static inline void zgemm( MT3& C,
const MT4& A,
const MT5& B,
3113 complex<double> alpha, complex<double> beta )
3115 using boost::numeric_cast;
// Extents (rows of A, columns of B, columns of A) and the spacing of each operand,
// converted to int with numeric_cast for the CBLAS interface.
3124 const int M ( numeric_cast<int>( A.rows() ) );
3125 const int N ( numeric_cast<int>( B.columns() ) );
3126 const int K ( numeric_cast<int>( A.columns() ) );
3127 const int lda( numeric_cast<int>( A.spacing() ) );
3128 const int ldb( numeric_cast<int>( B.spacing() ) );
3129 const int ldc( numeric_cast<int>( C.spacing() ) );
// A symmetric and C row-major: symmetric multiply with A on the left (CblasLeft, CblasUpper).
3131 if( IsSymmetric<MT4>::value && IsRowMajorMatrix<MT3>::value ) {
3132 cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper,
3133 M, N, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// B symmetric and C column-major: symmetric multiply with B passed first (CblasRight, CblasLower).
3135 else if( IsSymmetric<MT5>::value && IsColumnMajorMatrix<MT3>::value ) {
3136 cblas_zsymm( CblasColMajor, CblasRight, CblasLower,
3137 M, N, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc );
// Remaining case: general multiply. The layout follows C's storage order; for a row-major C
// the transpose flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3140 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3141 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3142 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3143 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3161 template<
typename MT
3163 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3170 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3171 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3173 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3176 else if( left.columns() == 0UL ) {
3191 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the scaled assignment: forwards (C, A, B, scalar) either to
// selectDefaultAssignKernel or to selectBlasAssignKernel.
// NOTE(review): the condition that chooses between the two calls is not part of this
// excerpt - confirm against the full header.
3206 template<
typename MT3
3210 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3213 DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
3215 DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
3233 template<
typename MT3
3237 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3238 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3240 for(
size_t i=0UL; i<A.rows(); ++i ) {
3241 for(
size_t k=0UL; k<B.columns(); ++k ) {
3242 C(i,k) = A(i,0UL) * B(0UL,k);
3244 for(
size_t j=1UL; j<A.columns(); ++j ) {
3245 for(
size_t k=0UL; k<B.columns(); ++k ) {
3246 C(i,k) += A(i,j) * B(j,k);
3249 for(
size_t k=0UL; k<B.columns(); ++k ) {
3270 template<
typename MT3
3274 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3275 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3277 typedef IntrinsicTrait<ElementType> IT;
3279 const size_t M( A.rows() );
3280 const size_t N( B.columns() );
3281 const size_t K( A.columns() );
3288 for(
size_t i=0UL; i<M; ++i ) {
3289 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3290 for(
size_t k=0UL; k<K; ++k ) {
3292 xmm1 = xmm1 + a1 * B.load(k,j );
3293 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
3294 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
3295 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
3296 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
3297 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
3298 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
3299 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
3301 (~C).
store( i, j , xmm1 * factor );
3313 for( ; (i+2UL) <= M; i+=2UL ) {
3314 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3315 for(
size_t k=0UL; k<K; ++k ) {
3322 xmm1 = xmm1 + a1 * b1;
3323 xmm2 = xmm2 + a1 * b2;
3324 xmm3 = xmm3 + a1 * b3;
3325 xmm4 = xmm4 + a1 * b4;
3326 xmm5 = xmm5 + a2 * b1;
3327 xmm6 = xmm6 + a2 * b2;
3328 xmm7 = xmm7 + a2 * b3;
3329 xmm8 = xmm8 + a2 * b4;
3331 (~C).
store( i , j , xmm1 * factor );
3335 (~C).
store( i+1UL, j , xmm5 * factor );
3342 for(
size_t k=0UL; k<K; ++k ) {
3344 xmm1 = xmm1 + a1 * B.load(k,j );
3345 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
3346 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
3347 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
3349 (~C).
store( i, j , xmm1 * factor );
3357 for( ; (i+2UL) <= M; i+=2UL ) {
3359 for(
size_t k=0UL; k<K; ++k ) {
3364 xmm1 = xmm1 + a1 * b1;
3365 xmm2 = xmm2 + a1 * b2;
3366 xmm3 = xmm3 + a2 * b1;
3367 xmm4 = xmm4 + a2 * b2;
3369 (~C).
store( i , j , xmm1 * factor );
3371 (~C).
store( i+1UL, j , xmm3 * factor );
3376 for(
size_t k=0UL; k<K; ++k ) {
3378 xmm1 = xmm1 + a1 * B.load(k,j );
3379 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
3381 (~C).
store( i, j , xmm1 * factor );
3387 for( ; (i+2UL) <= M; i+=2UL ) {
3389 for(
size_t k=0UL; k<K; ++k ) {
3391 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3392 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3394 (~C).
store( i , j, xmm1 * factor );
3395 (~C).
store( i+1UL, j, xmm2 * factor );
3399 for(
size_t k=0UL; k<K; ++k ) {
3400 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3402 (~C).
store( i, j, xmm1 * factor );
3422 template<
typename MT3
3426 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3427 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3429 typedef IntrinsicTrait<ElementType> IT;
3431 const size_t M( A.rows() );
3432 const size_t N( B.columns() );
3433 const size_t K( A.columns() );
3440 for(
size_t j=0UL; j<N; ++j ) {
3441 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3442 for(
size_t k=0UL; k<K; ++k ) {
3444 xmm1 = xmm1 + A.load(i ,k) * b1;
3445 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
3446 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
3447 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
3448 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
3449 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
3450 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
3451 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
3453 (~C).
store( i , j, xmm1 * factor );
3465 for( ; (j+2UL) <= N; j+=2UL ) {
3466 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3467 for(
size_t k=0UL; k<K; ++k ) {
3474 xmm1 = xmm1 + a1 * b1;
3475 xmm2 = xmm2 + a2 * b1;
3476 xmm3 = xmm3 + a3 * b1;
3477 xmm4 = xmm4 + a4 * b1;
3478 xmm5 = xmm5 + a1 * b2;
3479 xmm6 = xmm6 + a2 * b2;
3480 xmm7 = xmm7 + a3 * b2;
3481 xmm8 = xmm8 + a4 * b2;
3483 (~C).
store( i , j , xmm1 * factor );
3487 (~C).
store( i , j+1UL, xmm5 * factor );
3494 for(
size_t k=0UL; k<K; ++k ) {
3496 xmm1 = xmm1 + A.load(i ,k) * b1;
3497 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
3498 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
3499 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
3501 (~C).
store( i , j, xmm1 * factor );
3509 for( ; (j+2UL) <= N; j+=2UL ) {
3511 for(
size_t k=0UL; k<K; ++k ) {
3516 xmm1 = xmm1 + a1 * b1;
3517 xmm2 = xmm2 + a2 * b1;
3518 xmm3 = xmm3 + a1 * b2;
3519 xmm4 = xmm4 + a2 * b2;
3521 (~C).
store( i , j , xmm1 * factor );
3523 (~C).
store( i , j+1UL, xmm3 * factor );
3528 for(
size_t k=0UL; k<K; ++k ) {
3530 xmm1 = xmm1 + A.load(i ,k) * b1;
3531 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
3533 (~C).
store( i , j, xmm1 * factor );
3539 for( ; (j+2UL) <= N; j+=2UL ) {
3541 for(
size_t k=0UL; k<K; ++k ) {
3543 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3544 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3546 (~C).
store( i, j , xmm1 * factor );
3547 (~C).
store( i, j+1UL, xmm2 * factor );
3551 for(
size_t k=0UL; k<K; ++k ) {
3552 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3554 (~C).
store( i, j, xmm1 * factor );
// BLAS-path assignment, fallback overload: enabled (EnableIf) when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, and simply forwards to selectDefaultAssignKernel.
3574 template<
typename MT3
3578 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3579 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3581 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS-path assignment, single-precision overload: enabled (EnableIf) when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds. Calls sgemm with alpha = scalar and
// beta = 0.0F (in BLAS gemm semantics a zero beta means the old contents of C do not contribute).
3600 template<
typename MT3
3604 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3605 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3607 sgemm( C, A, B, scalar, 0.0F );
// BLAS-path assignment, double-precision overload: enabled (EnableIf) when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds. Calls dgemm with alpha = scalar and
// beta = 0.0 (in BLAS gemm semantics a zero beta means the old contents of C do not contribute).
3627 template<
typename MT3
3631 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3632 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3634 dgemm( C, A, B, scalar, 0.0 );
// BLAS-path assignment, complex<float> overload: enabled (EnableIf) when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds. Calls cgemm with
// alpha = complex<float>( scalar, 0.0F ), i.e. scalar as the real part, and beta = (0,0).
3654 template<
typename MT3
3658 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3659 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3661 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS-path assignment, complex<double> overload: enabled (EnableIf) when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds. Calls zgemm with
// alpha = complex<double>( scalar, 0.0 ), i.e. scalar as the real part, and beta = (0,0).
3681 template<
typename MT3
3685 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3686 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3688 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
3705 template<
typename MT
3707 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3711 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3723 const TmpType tmp(
serial( rhs ) );
3740 template<
typename MT
3742 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3749 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3750 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3752 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
3766 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the scaled addition assignment: forwards (C, A, B, scalar) either
// to selectDefaultAddAssignKernel or to selectBlasAddAssignKernel.
// NOTE(review): the condition that chooses between the two calls is not part of this
// excerpt - confirm against the full header.
3781 template<
typename MT3
3785 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3788 DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3790 DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
3808 template<
typename MT3
3812 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3813 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3834 template<
typename MT3
3838 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3839 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3841 typedef IntrinsicTrait<ElementType> IT;
3843 const size_t M( A.rows() );
3844 const size_t N( B.columns() );
3845 const size_t K( A.columns() );
3852 for(
size_t i=0UL; i<M; ++i ) {
3853 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3854 for(
size_t k=0UL; k<K; ++k ) {
3856 xmm1 = xmm1 + a1 * B.load(k,j );
3857 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
3858 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
3859 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
3860 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
3861 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
3862 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
3863 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
3865 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3877 for( ; (i+2UL) <= M; i+=2UL ) {
3878 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3879 for(
size_t k=0UL; k<K; ++k ) {
3886 xmm1 = xmm1 + a1 * b1;
3887 xmm2 = xmm2 + a1 * b2;
3888 xmm3 = xmm3 + a1 * b3;
3889 xmm4 = xmm4 + a1 * b4;
3890 xmm5 = xmm5 + a2 * b1;
3891 xmm6 = xmm6 + a2 * b2;
3892 xmm7 = xmm7 + a2 * b3;
3893 xmm8 = xmm8 + a2 * b4;
3895 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3899 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3906 for(
size_t k=0UL; k<K; ++k ) {
3908 xmm1 = xmm1 + a1 * B.load(k,j );
3909 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
3910 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
3911 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
3913 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3921 for( ; (i+2UL) <= M; i+=2UL ) {
3923 for(
size_t k=0UL; k<K; ++k ) {
3928 xmm1 = xmm1 + a1 * b1;
3929 xmm2 = xmm2 + a1 * b2;
3930 xmm3 = xmm3 + a2 * b1;
3931 xmm4 = xmm4 + a2 * b2;
3933 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3935 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3940 for(
size_t k=0UL; k<K; ++k ) {
3942 xmm1 = xmm1 + a1 * B.load(k,j );
3943 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
3945 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3951 for( ; (i+2UL) <= M; i+=2UL ) {
3953 for(
size_t k=0UL; k<K; ++k ) {
3955 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3956 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3958 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3959 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
3963 for(
size_t k=0UL; k<K; ++k ) {
3964 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3966 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
3986 template<
typename MT3
3990 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3991 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3993 typedef IntrinsicTrait<ElementType> IT;
3995 const size_t M( A.rows() );
3996 const size_t N( B.columns() );
3997 const size_t K( A.columns() );
4004 for(
size_t j=0UL; j<N; ++j ) {
4005 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4006 for(
size_t k=0UL; k<K; ++k ) {
4008 xmm1 = xmm1 + A.load(i ,k) * b1;
4009 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
4010 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
4011 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
4012 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
4013 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
4014 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
4015 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
4017 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
4029 for( ; (j+2UL) <= N; j+=2UL ) {
4030 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4031 for(
size_t k=0UL; k<K; ++k ) {
4038 xmm1 = xmm1 + a1 * b1;
4039 xmm2 = xmm2 + a2 * b1;
4040 xmm3 = xmm3 + a3 * b1;
4041 xmm4 = xmm4 + a4 * b1;
4042 xmm5 = xmm5 + a1 * b2;
4043 xmm6 = xmm6 + a2 * b2;
4044 xmm7 = xmm7 + a3 * b2;
4045 xmm8 = xmm8 + a4 * b2;
4047 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
4051 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
4058 for(
size_t k=0UL; k<K; ++k ) {
4060 xmm1 = xmm1 + A.load(i ,k) * b1;
4061 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
4062 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
4063 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
4065 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
4073 for( ; (j+2UL) <= N; j+=2UL ) {
4075 for(
size_t k=0UL; k<K; ++k ) {
4080 xmm1 = xmm1 + a1 * b1;
4081 xmm2 = xmm2 + a2 * b1;
4082 xmm3 = xmm3 + a1 * b2;
4083 xmm4 = xmm4 + a2 * b2;
4085 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
4087 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
4092 for(
size_t k=0UL; k<K; ++k ) {
4094 xmm1 = xmm1 + A.load(i ,k) * b1;
4095 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
4097 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
4103 for( ; (j+2UL) <= N; j+=2UL ) {
4105 for(
size_t k=0UL; k<K; ++k ) {
4107 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4108 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4110 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
4111 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
4115 for(
size_t k=0UL; k<K; ++k ) {
4116 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4118 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// BLAS-path addition assignment, fallback overload: enabled (EnableIf) when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, and simply forwards to selectDefaultAddAssignKernel.
4138 template<
typename MT3
4142 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4143 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4145 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS-path addition assignment, single-precision overload: enabled (EnableIf) when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds. Calls sgemm with alpha = scalar and
// beta = 1.0F (in BLAS gemm semantics a unit beta adds the product onto the existing C).
4164 template<
typename MT3
4168 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4169 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4171 sgemm( C, A, B, scalar, 1.0F );
// BLAS-path addition assignment, double-precision overload: enabled (EnableIf) when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds. Calls dgemm with alpha = scalar and
// beta = 1.0 (in BLAS gemm semantics a unit beta adds the product onto the existing C).
4191 template<
typename MT3
4195 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4196 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4198 dgemm( C, A, B, scalar, 1.0 );
// BLAS-path addition assignment, complex<float> overload: enabled (EnableIf) when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds. Calls cgemm with
// alpha = complex<float>( scalar, 0.0F ) and beta = complex<float>( 1.0F, 0.0F ).
4218 template<
typename MT3
4222 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4223 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4225 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-path addition assignment, complex<double> overload: enabled (EnableIf) when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds. Calls zgemm with
// alpha = complex<double>( scalar, 0.0 ) and beta = complex<double>( 1.0, 0.0 ).
4245 template<
typename MT3
4249 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4250 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4252 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
4273 template<
typename MT
4275 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4282 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4283 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4285 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
4299 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the scaled subtraction assignment: forwards (C, A, B, scalar) either
// to selectDefaultSubAssignKernel or to selectBlasSubAssignKernel.
// NOTE(review): the condition that chooses between the two calls is not part of this
// excerpt - confirm against the full header.
4314 template<
typename MT3
4318 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4321 DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
4323 DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
4341 template<
typename MT3
4345 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4346 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4367 template<
typename MT3
4371 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4372 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4374 typedef IntrinsicTrait<ElementType> IT;
4376 const size_t M( A.rows() );
4377 const size_t N( B.columns() );
4378 const size_t K( A.columns() );
4385 for(
size_t i=0UL; i<M; ++i ) {
4386 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4387 for(
size_t k=0UL; k<K; ++k ) {
4389 xmm1 = xmm1 + a1 * B.load(k,j );
4390 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
4391 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
4392 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
4393 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
4394 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
4395 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
4396 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
4398 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4410 for( ; (i+2UL) <= M; i+=2UL ) {
4411 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4412 for(
size_t k=0UL; k<K; ++k ) {
4419 xmm1 = xmm1 + a1 * b1;
4420 xmm2 = xmm2 + a1 * b2;
4421 xmm3 = xmm3 + a1 * b3;
4422 xmm4 = xmm4 + a1 * b4;
4423 xmm5 = xmm5 + a2 * b1;
4424 xmm6 = xmm6 + a2 * b2;
4425 xmm7 = xmm7 + a2 * b3;
4426 xmm8 = xmm8 + a2 * b4;
4428 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4432 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
4439 for(
size_t k=0UL; k<K; ++k ) {
4441 xmm1 = xmm1 + a1 * B.load(k,j );
4442 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
4443 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
4444 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
4446 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4454 for( ; (i+2UL) <= M; i+=2UL ) {
4456 for(
size_t k=0UL; k<K; ++k ) {
4461 xmm1 = xmm1 + a1 * b1;
4462 xmm2 = xmm2 + a1 * b2;
4463 xmm3 = xmm3 + a2 * b1;
4464 xmm4 = xmm4 + a2 * b2;
4466 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4468 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
4473 for(
size_t k=0UL; k<K; ++k ) {
4475 xmm1 = xmm1 + a1 * B.load(k,j );
4476 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
4478 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4484 for( ; (i+2UL) <= M; i+=2UL ) {
4486 for(
size_t k=0UL; k<K; ++k ) {
4488 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
4489 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
4491 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4492 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
4496 for(
size_t k=0UL; k<K; ++k ) {
4497 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
4499 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
4519 template<
typename MT3
4523 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4524 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4526 typedef IntrinsicTrait<ElementType> IT;
4528 const size_t M( A.rows() );
4529 const size_t N( B.columns() );
4530 const size_t K( A.columns() );
4537 for(
size_t j=0UL; j<N; ++j ) {
4538 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4539 for(
size_t k=0UL; k<K; ++k ) {
4541 xmm1 = xmm1 + A.load(i ,k) * b1;
4542 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
4543 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
4544 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
4545 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
4546 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
4547 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
4548 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
4550 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4562 for( ; (j+2UL) <= N; j+=2UL ) {
4563 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4564 for(
size_t k=0UL; k<K; ++k ) {
4571 xmm1 = xmm1 + a1 * b1;
4572 xmm2 = xmm2 + a2 * b1;
4573 xmm3 = xmm3 + a3 * b1;
4574 xmm4 = xmm4 + a4 * b1;
4575 xmm5 = xmm5 + a1 * b2;
4576 xmm6 = xmm6 + a2 * b2;
4577 xmm7 = xmm7 + a3 * b2;
4578 xmm8 = xmm8 + a4 * b2;
4580 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4584 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4591 for(
size_t k=0UL; k<K; ++k ) {
4593 xmm1 = xmm1 + A.load(i ,k) * b1;
4594 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
4595 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
4596 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
4598 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4606 for( ; (j+2UL) <= N; j+=2UL ) {
4608 for(
size_t k=0UL; k<K; ++k ) {
4613 xmm1 = xmm1 + a1 * b1;
4614 xmm2 = xmm2 + a2 * b1;
4615 xmm3 = xmm3 + a1 * b2;
4616 xmm4 = xmm4 + a2 * b2;
4618 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4620 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4625 for(
size_t k=0UL; k<K; ++k ) {
4627 xmm1 = xmm1 + A.load(i ,k) * b1;
4628 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
4630 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4636 for( ; (j+2UL) <= N; j+=2UL ) {
4638 for(
size_t k=0UL; k<K; ++k ) {
4640 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4641 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4643 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4644 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
4648 for(
size_t k=0UL; k<K; ++k ) {
4649 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4651 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// BLAS-path subtraction assignment, fallback overload: enabled (EnableIf) when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, and simply forwards to selectDefaultSubAssignKernel.
4671 template<
typename MT3
4675 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4676 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4678 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-path subtraction assignment, single-precision overload: enabled (EnableIf) when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds. The subtraction is expressed by negating
// the scalar: sgemm is called with alpha = -scalar and beta = 1.0F.
4697 template<
typename MT3
4701 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4702 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4704 sgemm( C, A, B, -scalar, 1.0F );
// BLAS-path subtraction assignment, double-precision overload: enabled (EnableIf) when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds. The subtraction is expressed by negating
// the scalar: dgemm is called with alpha = -scalar and beta = 1.0.
4724 template<
typename MT3
4728 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4729 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4731 dgemm( C, A, B, -scalar, 1.0 );
// BLAS-path subtraction assignment, complex<float> overload: enabled (EnableIf) when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds. Calls cgemm with
// alpha = complex<float>( -scalar, 0.0F ) and beta = complex<float>( 1.0F, 0.0F ).
4751 template<
typename MT3
4755 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4756 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4758 cgemm( C, A, B, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-path subtraction assignment, complex<double> overload: enabled (EnableIf) when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds. Calls zgemm with
// alpha = complex<double>( -scalar, 0.0 ) and beta = complex<double>( 1.0, 0.0 ).
4778 template<
typename MT3
4782 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4783 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4785 zgemm( C, A, B, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
4817 template<
typename MT
4819 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4820 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4827 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4828 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4830 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4833 else if( left.columns() == 0UL ) {
4867 template<
typename MT
4869 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4870 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4874 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
4886 const TmpType tmp( rhs );
4906 template<
typename MT
4908 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4909 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4916 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4917 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4919 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
4956 template<
typename MT
4958 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4959 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4966 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4967 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4969 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5051 template<
typename T1
5053 inline const TDMatDMatMultExpr<T1,T2>
5059 throw std::invalid_argument(
"Matrix sizes do not match" );
5076 template<
typename MT1,
typename MT2 >
5094 template<
typename MT1,
typename MT2 >
5096 :
public Columns<MT2>
5112 template<
typename MT1,
typename MT2 >
5114 :
public IsTrue< IsLower<MT1>::value && IsLower<MT2>::value >
5130 template<
typename MT1,
typename MT2 >
5132 :
public IsTrue< IsUpper<MT1>::value && IsUpper<MT2>::value >
5148 template<
typename MT1,
typename MT2,
typename VT >
5153 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5154 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
5155 IsDenseVector<VT>::value && IsColumnVector<VT>::value
5156 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
5157 , INVALID_TYPE >::Type Type;
5166 template<
typename MT1,
typename MT2,
typename VT >
5171 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5172 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
5173 IsSparseVector<VT>::value && IsColumnVector<VT>::value
5174 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
5175 , INVALID_TYPE >::Type Type;
5184 template<
typename VT,
typename MT1,
typename MT2 >
5189 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
5190 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5191 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
5192 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
5193 , INVALID_TYPE >::Type Type;
5202 template<
typename VT,
typename MT1,
typename MT2 >
5207 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
5208 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5209 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
5210 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
5211 , INVALID_TYPE >::Type Type;
5220 template<
typename MT1,
typename MT2,
bool AF >
5225 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
5226 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
5235 template<
typename MT1,
typename MT2 >
5240 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
5249 template<
typename MT1,
typename MT2 >
5254 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:356
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B * C$ ).
Definition: DMatDMatMultExpr.h:4838
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:429
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:258
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:258
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:205
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:410
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2478
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:257
BLAZE_ALWAYS_INLINE EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:224
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:129
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:255
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:376
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:695
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:265
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:159
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:125
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:128
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:132
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:133
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:301
Header file for the multiplication trait.
Header file for the IsSymmetric type trait.
Header file for the IsDouble type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:277
Compile time check for row-major matrix types. This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:104
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:266
Header file for the DenseMatrix base class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the Columns type trait.
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold. This threshold specifie...
Definition: Thresholds.h:880
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:274
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:261
Header file for the IsLower type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:280
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices. This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2476
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:271
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:267
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:211
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:130
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
Header file for run time assertion macros.
Compile time check for column-major matrix types. This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:104
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:142
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:420
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
BLAZE_ALWAYS_INLINE void reset(const NonNumericProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: NonNumericProxy.h:833
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:131
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:264
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:400
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:366
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:256
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:263
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2473
Header file for the IsTrue value trait.
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:346
Header file for the IsUpper type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:268
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:430
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:316
Constraint on the data type.
BLAZE_ALWAYS_INLINE EnableIf< IsIntegral< T > >::Type store(T *address, const typename Store< T, sizeof(T)>::Type &value)
Aligned store of a vector of integral values.
Definition: Store.h:225
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:262
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:388
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:849