35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
112 template<
typename MT1
131 template<
typename T1,
typename T2,
typename T3 >
132 struct UseSinglePrecisionKernel {
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseDoublePrecisionKernel {
// Compile-time selector for the single-precision complex BLAS kernel.
// `value` is non-zero only when the ElementType of T1, T2 and T3 is, in
// every case, exactly complex<float> (tested with IsSame).
160 template<
typename T1,
typename T2,
typename T3 >
161 struct UseSinglePrecisionComplexKernel {
// Element type all three operand types are compared against.
162 typedef complex<float> Type;
163 enum { value = IsSame<typename T1::ElementType,Type>::value &&
164 IsSame<typename T2::ElementType,Type>::value &&
165 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the double-precision complex BLAS kernel.
// `value` is non-zero only when the ElementType of T1, T2 and T3 is, in
// every case, exactly complex<double> (tested with IsSame).
176 template<
typename T1,
typename T2,
typename T3 >
177 struct UseDoublePrecisionComplexKernel {
// Element type all three operand types are compared against.
178 typedef complex<double> Type;
179 enum { value = IsSame<typename T1::ElementType,Type>::value &&
180 IsSame<typename T2::ElementType,Type>::value &&
181 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the non-BLAS (default) kernel.
// `value` is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none
// of the four BLAS selectors (single, double, complex single, complex
// double precision) is satisfied for the type triple T1, T2, T3.
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseDefaultKernel {
193 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
194 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
195 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
196 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the vectorized default kernel.
// `value` is non-zero only when all of the following hold:
//  - T1, T2 and T3 each report `vectorizable`,
//  - T2 and T3 have the same ElementType as T1,
//  - IntrinsicTrait of that element type reports both `addition` and
//    `multiplication`.
206 template<
typename T1,
typename T2,
typename T3 >
207 struct UseVectorizedDefaultKernel {
208 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
209 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
210 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
211 IntrinsicTrait<typename T1::ElementType>::addition &&
212 IntrinsicTrait<typename T1::ElementType>::multiplication };
243 enum { vectorizable = 0 };
273 if(
lhs_.columns() != 0UL ) {
274 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
276 for(
size_t k=1UL; k<end; k+=2UL ) {
278 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
280 if( end <
lhs_.columns() ) {
308 return rhs_.columns();
338 template<
typename T >
340 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
350 template<
typename T >
352 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
372 template<
typename MT3
381 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
384 else if( rhs.
lhs_.columns() == 0UL ) {
400 DMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
402 DMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Default (scalar loop) assignment kernel: writes the product of A and B
// into C element by element.
//  C  target matrix, written through C(i,j)
//  A  left operand, read through A(i,k)
//  B  right operand, read through B(k,j)
// NOTE(review): A(i,0UL) / B(0UL,j) are read unconditionally inside the row
// loop, so the inner dimension K is presumably guaranteed non-zero by the
// caller - confirm.
420 template<
typename MT3
424 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N; extents are taken from the operands, not from C.
426 const size_t M( A.rows() );
427 const size_t N( B.columns() );
428 const size_t K( A.columns() );
430 for(
size_t i=0UL; i<M; ++i ) {
// First pass over row i: plain assignment of the k == 0 term, so the
// previous contents of C are overwritten rather than accumulated into.
431 for(
size_t j=0UL; j<N; ++j ) {
432 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining terms k = 1 .. K-1 are added on top of the first pass.
434 for(
size_t k=1UL; k<K; ++k ) {
435 for(
size_t j=0UL; j<N; ++j ) {
436 C(i,j) += A(i,k) * B(k,j);
// Vectorized default assignment kernel for a DenseMatrix<MT3,false> target;
// enabled (EnableIf) when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The columns of C are walked in blocks of 8, 4, 2 and finally 1 times
// IT::size; each block accumulates into IntrinsicType variables and writes
// them to C with store().
// NOTE(review): the declarations of j, i, a1, a2 and b1..b4 and some
// enclosing conditions are on lines not visible in this listing.
// NOTE(review): the accumulators are declared without an initializer and then
// used as `xmm = xmm + ...`; this presumably relies on IntrinsicType
// default-constructing to zero - confirm.
458 template<
typename MT3
461 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
462 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// IT::size is used as the column stride of one load()/store() - presumably
// the number of elements per intrinsic vector; TODO confirm.
464 typedef IntrinsicTrait<ElementType> IT;
466 const size_t M( A.rows() );
467 const size_t N( B.columns() );
468 const size_t K( A.columns() );
// Column blocks of 8*IT::size: one row of C at a time, eight accumulators.
472 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
473 for(
size_t i=0UL; i<M; ++i ) {
474 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
475 for(
size_t k=0UL; k<K; ++k ) {
477 xmm1 = xmm1 + a1 * B.load(k,j );
478 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
479 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
480 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
481 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
482 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
483 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
484 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
486 (~C).
store( i, j , xmm1 );
487 (~C).
store( i, j+IT::size , xmm2 );
488 (~C).
store( i, j+IT::size*2UL, xmm3 );
489 (~C).
store( i, j+IT::size*3UL, xmm4 );
490 (~C).
store( i, j+IT::size*4UL, xmm5 );
491 (~C).
store( i, j+IT::size*5UL, xmm6 );
492 (~C).
store( i, j+IT::size*6UL, xmm7 );
493 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Column blocks of 4*IT::size: rows are taken in pairs (i, i+1), with
// xmm1..xmm4 feeding row i and xmm5..xmm8 feeding row i+1.
496 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
498 for( ; (i+2UL) <= M; i+=2UL ) {
499 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
500 for(
size_t k=0UL; k<K; ++k ) {
507 xmm1 = xmm1 + a1 * b1;
508 xmm2 = xmm2 + a1 * b2;
509 xmm3 = xmm3 + a1 * b3;
510 xmm4 = xmm4 + a1 * b4;
511 xmm5 = xmm5 + a2 * b1;
512 xmm6 = xmm6 + a2 * b2;
513 xmm7 = xmm7 + a2 * b3;
514 xmm8 = xmm8 + a2 * b4;
516 (~C).
store( i , j , xmm1 );
517 (~C).
store( i , j+IT::size , xmm2 );
518 (~C).
store( i , j+IT::size*2UL, xmm3 );
519 (~C).
store( i , j+IT::size*3UL, xmm4 );
520 (~C).
store( i+1UL, j , xmm5 );
521 (~C).
store( i+1UL, j+IT::size , xmm6 );
522 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
523 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single-row variant of the 4*IT::size block - presumably the leftover row
// when M is odd; the guarding condition is not visible here.
527 for(
size_t k=0UL; k<K; ++k ) {
529 xmm1 = xmm1 + a1 * B.load(k,j );
530 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
531 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
532 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
534 (~C).
store( i, j , xmm1 );
535 (~C).
store( i, j+IT::size , xmm2 );
536 (~C).
store( i, j+IT::size*2UL, xmm3 );
537 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Column blocks of 2*IT::size: rows in pairs, two accumulators per row.
540 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
542 for( ; (i+2UL) <= M; i+=2UL ) {
544 for(
size_t k=0UL; k<K; ++k ) {
549 xmm1 = xmm1 + a1 * b1;
550 xmm2 = xmm2 + a1 * b2;
551 xmm3 = xmm3 + a2 * b1;
552 xmm4 = xmm4 + a2 * b2;
554 (~C).
store( i , j , xmm1 );
555 (~C).
store( i , j+IT::size, xmm2 );
556 (~C).
store( i+1UL, j , xmm3 );
557 (~C).
store( i+1UL, j+IT::size, xmm4 );
// Single-row variant of the 2*IT::size block (guard not visible here).
561 for(
size_t k=0UL; k<K; ++k ) {
563 xmm1 = xmm1 + a1 * B.load(k,j );
564 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
566 (~C).
store( i, j , xmm1 );
567 (~C).
store( i, j+IT::size, xmm2 );
// Final column block of one IT::size: rows in pairs, one accumulator per
// row, with the element of A broadcast through set().
572 for( ; (i+2UL) <= M; i+=2UL ) {
574 for(
size_t k=0UL; k<K; ++k ) {
576 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
577 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
579 (~C).
store( i , j, xmm1 );
580 (~C).
store( i+1UL, j, xmm2 );
// Single-row variant of the final column block (guard not visible here).
584 for(
size_t k=0UL; k<K; ++k ) {
585 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
587 (~C).
store( i, j, xmm1 );
// Vectorized default assignment kernel overload for a DenseMatrix<MT3,true>
// target; enabled (EnableIf) when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// holds.
// NOTE(review): only the branch conditions are visible in this listing; what
// each branch does is on lines not shown.
608 template<
typename MT3
611 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
612 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// Case 1: only the right operand type is resizable.
617 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
// Case 2: only the left operand type is resizable.
621 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
// Case 3: otherwise decide by size - A has no more elements than B.
625 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
// Fallback overload of the BLAS assignment dispatch: enabled (EnableIf) when
// UseDefaultKernel<MT3,MT4,MT5> holds, it performs no BLAS call and simply
// forwards C, A and B to selectDefaultAssignKernel.
650 template<
typename MT3
653 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
654 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
656 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel for single-precision operands; enabled (EnableIf)
// when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// Calls cblas_sgemm with alpha = 1.0F and beta = 0.0F, i.e. C = A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
676 template<
typename MT3
679 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
680 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
682 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
688 const int M ( numeric_cast<int>( A.rows() ) );
689 const int N ( numeric_cast<int>( B.columns() ) );
690 const int K ( numeric_cast<int>( A.columns() ) );
691 const int lda( numeric_cast<int>( A.spacing() ) );
692 const int ldb( numeric_cast<int>( B.spacing() ) );
693 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
695 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
696 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
697 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
698 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel for double-precision operands; enabled (EnableIf)
// when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// Calls cblas_dgemm with alpha = 1.0 and beta = 0.0, i.e. C = A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
719 template<
typename MT3
722 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
723 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
725 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
731 const int M ( numeric_cast<int>( A.rows() ) );
732 const int N ( numeric_cast<int>( B.columns() ) );
733 const int K ( numeric_cast<int>( A.columns() ) );
734 const int lda( numeric_cast<int>( A.spacing() ) );
735 const int ldb( numeric_cast<int>( B.spacing() ) );
736 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
738 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
739 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
740 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
741 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel for complex<float> operands; enabled (EnableIf)
// when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// Calls cblas_cgemm with alpha = (1,0) and beta = (0,0), i.e. C = A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
762 template<
typename MT3
765 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
766 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
768 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
777 const int M ( numeric_cast<int>( A.rows() ) );
778 const int N ( numeric_cast<int>( B.columns() ) );
779 const int K ( numeric_cast<int>( A.columns() ) );
780 const int lda( numeric_cast<int>( A.spacing() ) );
781 const int ldb( numeric_cast<int>( B.spacing() ) );
782 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to the routine by address.
783 const complex<float> alpha( 1.0F, 0.0F );
784 const complex<float> beta ( 0.0F, 0.0F );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
786 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
787 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
788 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
789 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel for complex<double> operands; enabled (EnableIf)
// when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// Calls cblas_zgemm with alpha = (1,0) and beta = (0,0), i.e. C = A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
810 template<
typename MT3
813 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
814 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
816 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
825 const int M ( numeric_cast<int>( A.rows() ) );
826 const int N ( numeric_cast<int>( B.columns() ) );
827 const int K ( numeric_cast<int>( A.columns() ) );
828 const int lda( numeric_cast<int>( A.spacing() ) );
829 const int ldb( numeric_cast<int>( B.spacing() ) );
830 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to the routine by address.
831 const complex<double> alpha( 1.0, 0.0 );
832 const complex<double> beta ( 0.0, 0.0 );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
834 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
835 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
836 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
837 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
855 template<
typename MT
861 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
873 const TmpType tmp( rhs );
892 template<
typename MT3
901 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
916 DMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
918 DMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Default (scalar loop) addition-assignment kernel: adds the product of A
// and B onto the existing contents of C. Disabled (DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//  C  target matrix, updated through C(i,j) +=
//  A  left operand, read through A(i,k)
//  B  right operand, read through B(k,j)
937 template<
typename MT3
940 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
941 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
943 const size_t M( A.rows() );
944 const size_t N( B.columns() );
945 const size_t K( A.columns() );
// N with its lowest bit cleared: the largest even column count <= N.
948 const size_t end( N &
size_t(-2) );
950 for(
size_t i=0UL; i<M; ++i ) {
951 for(
size_t k=0UL; k<K; ++k ) {
// Column loop unrolled by two over the even part [0, end).
952 for(
size_t j=0UL; j<end; j+=2UL ) {
953 C(i,j ) += A(i,k) * B(k,j );
954 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of column `end` - presumably guarded by an `end < N` check on a
// line not visible in this listing (odd N only); TODO confirm.
957 C(i,end) += A(i,k) * B(k,end);
// Vectorized default addition-assignment kernel for a DenseMatrix<MT3,false>
// target; enabled (EnableIf) when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// holds. The columns of C are walked in blocks of 8, 4, 2 and finally 1
// times IT::size; each block adds products onto xmm accumulators and writes
// them to C with store().
// NOTE(review): the declarations and initial values of the xmm accumulators,
// as well as j, i, a1, a2 and b1..b4, are on lines not visible in this
// listing - presumably the accumulators start from the current contents of
// C; TODO confirm.
979 template<
typename MT3
982 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
983 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// IT::size is used as the column stride of one load()/store() - presumably
// the number of elements per intrinsic vector; TODO confirm.
985 typedef IntrinsicTrait<ElementType> IT;
987 const size_t M( A.rows() );
988 const size_t N( B.columns() );
989 const size_t K( A.columns() );
// Column blocks of 8*IT::size: one row of C at a time, eight accumulators.
993 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
994 for(
size_t i=0UL; i<M; ++i ) {
1003 for(
size_t k=0UL; k<K; ++k ) {
1005 xmm1 = xmm1 + a1 * B.load(k,j );
1006 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1007 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1008 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1009 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1010 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1011 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1012 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1014 (~C).
store( i, j , xmm1 );
1015 (~C).
store( i, j+IT::size , xmm2 );
1016 (~C).
store( i, j+IT::size*2UL, xmm3 );
1017 (~C).
store( i, j+IT::size*3UL, xmm4 );
1018 (~C).
store( i, j+IT::size*4UL, xmm5 );
1019 (~C).
store( i, j+IT::size*5UL, xmm6 );
1020 (~C).
store( i, j+IT::size*6UL, xmm7 );
1021 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Column blocks of 4*IT::size: rows are taken in pairs (i, i+1), with
// xmm1..xmm4 feeding row i and xmm5..xmm8 feeding row i+1.
1024 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1026 for( ; (i+2UL) <= M; i+=2UL ) {
1035 for(
size_t k=0UL; k<K; ++k ) {
1042 xmm1 = xmm1 + a1 * b1;
1043 xmm2 = xmm2 + a1 * b2;
1044 xmm3 = xmm3 + a1 * b3;
1045 xmm4 = xmm4 + a1 * b4;
1046 xmm5 = xmm5 + a2 * b1;
1047 xmm6 = xmm6 + a2 * b2;
1048 xmm7 = xmm7 + a2 * b3;
1049 xmm8 = xmm8 + a2 * b4;
1051 (~C).
store( i , j , xmm1 );
1052 (~C).
store( i , j+IT::size , xmm2 );
1053 (~C).
store( i , j+IT::size*2UL, xmm3 );
1054 (~C).
store( i , j+IT::size*3UL, xmm4 );
1055 (~C).
store( i+1UL, j , xmm5 );
1056 (~C).
store( i+1UL, j+IT::size , xmm6 );
1057 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1058 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single-row variant of the 4*IT::size block - presumably the leftover row
// when M is odd; the guarding condition is not visible here.
1065 for(
size_t k=0UL; k<K; ++k ) {
1067 xmm1 = xmm1 + a1 * B.load(k,j );
1068 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1069 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1070 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1072 (~C).
store( i, j , xmm1 );
1073 (~C).
store( i, j+IT::size , xmm2 );
1074 (~C).
store( i, j+IT::size*2UL, xmm3 );
1075 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Column blocks of 2*IT::size: rows in pairs, two accumulators per row.
1078 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1080 for( ; (i+2UL) <= M; i+=2UL ) {
1085 for(
size_t k=0UL; k<K; ++k ) {
1090 xmm1 = xmm1 + a1 * b1;
1091 xmm2 = xmm2 + a1 * b2;
1092 xmm3 = xmm3 + a2 * b1;
1093 xmm4 = xmm4 + a2 * b2;
1095 (~C).
store( i , j , xmm1 );
1096 (~C).
store( i , j+IT::size, xmm2 );
1097 (~C).
store( i+1UL, j , xmm3 );
1098 (~C).
store( i+1UL, j+IT::size, xmm4 );
// Single-row variant of the 2*IT::size block (guard not visible here).
1103 for(
size_t k=0UL; k<K; ++k ) {
1105 xmm1 = xmm1 + a1 * B.load(k,j );
1106 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1108 (~C).
store( i, j , xmm1 );
1109 (~C).
store( i, j+IT::size, xmm2 );
// Final column block of one IT::size: rows in pairs, one accumulator per
// row, with the element of A broadcast through set().
1114 for( ; (i+2UL) <= M; i+=2UL ) {
1117 for(
size_t k=0UL; k<K; ++k ) {
1119 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1120 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1122 (~C).
store( i , j, xmm1 );
1123 (~C).
store( i+1UL, j, xmm2 );
// Single-row variant of the final column block (guard not visible here).
1127 for(
size_t k=0UL; k<K; ++k ) {
1128 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1130 (~C).
store( i, j, xmm1 );
// Vectorized default addition-assignment kernel overload for a
// DenseMatrix<MT3,true> target; enabled (EnableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// NOTE(review): only the branch conditions are visible in this listing; what
// each branch does is on lines not shown.
1151 template<
typename MT3
1154 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1155 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// Case 1: only the right operand type is resizable.
1160 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
// Case 2: only the left operand type is resizable.
1164 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
// Case 3: otherwise decide by size - A has no more elements than B.
1168 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
// Fallback overload of the BLAS addition-assignment dispatch: enabled
// (EnableIf) when UseDefaultKernel<MT3,MT4,MT5> holds, it performs no BLAS
// call and simply forwards C, A and B to selectDefaultAddAssignKernel.
1194 template<
typename MT3
1197 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1198 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1200 selectDefaultAddAssignKernel( C, A, B );
// BLAS addition-assignment kernel for single-precision operands; enabled
// (EnableIf) when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// Calls cblas_sgemm with alpha = 1.0F and beta = 1.0F, i.e. C += A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1220 template<
typename MT3
1223 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1224 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1226 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1232 const int M ( numeric_cast<int>( A.rows() ) );
1233 const int N ( numeric_cast<int>( B.columns() ) );
1234 const int K ( numeric_cast<int>( A.columns() ) );
1235 const int lda( numeric_cast<int>( A.spacing() ) );
1236 const int ldb( numeric_cast<int>( B.spacing() ) );
1237 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1239 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1240 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1241 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1242 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition-assignment kernel for double-precision operands; enabled
// (EnableIf) when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// Calls cblas_dgemm with alpha = 1.0 and beta = 1.0, i.e. C += A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1263 template<
typename MT3
1266 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1267 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1269 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1275 const int M ( numeric_cast<int>( A.rows() ) );
1276 const int N ( numeric_cast<int>( B.columns() ) );
1277 const int K ( numeric_cast<int>( A.columns() ) );
1278 const int lda( numeric_cast<int>( A.spacing() ) );
1279 const int ldb( numeric_cast<int>( B.spacing() ) );
1280 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1282 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1283 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1284 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1285 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition-assignment kernel for complex<float> operands; enabled
// (EnableIf) when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// Calls cblas_cgemm with alpha = (1,0) and beta = (1,0), i.e. C += A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1306 template<
typename MT3
1309 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1310 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1312 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1321 const int M ( numeric_cast<int>( A.rows() ) );
1322 const int N ( numeric_cast<int>( B.columns() ) );
1323 const int K ( numeric_cast<int>( A.columns() ) );
1324 const int lda( numeric_cast<int>( A.spacing() ) );
1325 const int ldb( numeric_cast<int>( B.spacing() ) );
1326 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to the routine by address.
1327 const complex<float> alpha( 1.0F, 0.0F );
1328 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1330 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1331 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1332 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1333 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition-assignment kernel for complex<double> operands; enabled
// (EnableIf) when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// Calls cblas_zgemm with alpha = (1,0) and beta = (1,0), i.e. C += A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1354 template<
typename MT3
1357 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1358 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1360 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1369 const int M ( numeric_cast<int>( A.rows() ) );
1370 const int N ( numeric_cast<int>( B.columns() ) );
1371 const int K ( numeric_cast<int>( A.columns() ) );
1372 const int lda( numeric_cast<int>( A.spacing() ) );
1373 const int ldb( numeric_cast<int>( B.spacing() ) );
1374 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to the routine by address.
1375 const complex<double> alpha( 1.0, 0.0 );
1376 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1378 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1379 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1380 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1381 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1404 template<
typename MT3
1413 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1428 DMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1430 DMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Default (scalar loop) subtraction-assignment kernel: subtracts the product
// of A and B from the existing contents of C. Disabled (DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//  C  target matrix, updated through C(i,j) -=
//  A  left operand, read through A(i,k)
//  B  right operand, read through B(k,j)
1449 template<
typename MT3
1452 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1453 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1455 const size_t M( A.rows() );
1456 const size_t N( B.columns() );
1457 const size_t K( A.columns() );
// N with its lowest bit cleared: the largest even column count <= N.
1460 const size_t end( N &
size_t(-2) );
1462 for(
size_t i=0UL; i<M; ++i ) {
1463 for(
size_t k=0UL; k<K; ++k ) {
// Column loop unrolled by two over the even part [0, end).
1464 for(
size_t j=0UL; j<end; j+=2UL ) {
1465 C(i,j ) -= A(i,k) * B(k,j );
1466 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of column `end` - presumably guarded by an `end < N` check on a
// line not visible in this listing (odd N only); TODO confirm.
1469 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default subtraction-assignment kernel for a
// DenseMatrix<MT3,false> target; enabled (EnableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The columns of C are walked
// in blocks of 8, 4, 2 and finally 1 times IT::size; each block subtracts
// products from xmm accumulators and writes them to C with store().
// NOTE(review): the declarations and initial values of the xmm accumulators,
// as well as j, i, a1, a2 and b1..b4, are on lines not visible in this
// listing - presumably the accumulators start from the current contents of
// C; TODO confirm.
1491 template<
typename MT3
1494 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1495 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// IT::size is used as the column stride of one load()/store() - presumably
// the number of elements per intrinsic vector; TODO confirm.
1497 typedef IntrinsicTrait<ElementType> IT;
1499 const size_t M( A.rows() );
1500 const size_t N( B.columns() );
1501 const size_t K( A.columns() );
// Column blocks of 8*IT::size: one row of C at a time, eight accumulators.
1505 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1506 for(
size_t i=0UL; i<M; ++i ) {
1515 for(
size_t k=0UL; k<K; ++k ) {
1517 xmm1 = xmm1 - a1 * B.load(k,j );
1518 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1519 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1520 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1521 xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1522 xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1523 xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1524 xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1526 (~C).
store( i, j , xmm1 );
1527 (~C).
store( i, j+IT::size , xmm2 );
1528 (~C).
store( i, j+IT::size*2UL, xmm3 );
1529 (~C).
store( i, j+IT::size*3UL, xmm4 );
1530 (~C).
store( i, j+IT::size*4UL, xmm5 );
1531 (~C).
store( i, j+IT::size*5UL, xmm6 );
1532 (~C).
store( i, j+IT::size*6UL, xmm7 );
1533 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Column blocks of 4*IT::size: rows are taken in pairs (i, i+1), with
// xmm1..xmm4 feeding row i and xmm5..xmm8 feeding row i+1.
1536 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1538 for( ; (i+2UL) <= M; i+=2UL ) {
1547 for(
size_t k=0UL; k<K; ++k ) {
1554 xmm1 = xmm1 - a1 * b1;
1555 xmm2 = xmm2 - a1 * b2;
1556 xmm3 = xmm3 - a1 * b3;
1557 xmm4 = xmm4 - a1 * b4;
1558 xmm5 = xmm5 - a2 * b1;
1559 xmm6 = xmm6 - a2 * b2;
1560 xmm7 = xmm7 - a2 * b3;
1561 xmm8 = xmm8 - a2 * b4;
1563 (~C).
store( i , j , xmm1 );
1564 (~C).
store( i , j+IT::size , xmm2 );
1565 (~C).
store( i , j+IT::size*2UL, xmm3 );
1566 (~C).
store( i , j+IT::size*3UL, xmm4 );
1567 (~C).
store( i+1UL, j , xmm5 );
1568 (~C).
store( i+1UL, j+IT::size , xmm6 );
1569 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1570 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single-row variant of the 4*IT::size block - presumably the leftover row
// when M is odd; the guarding condition is not visible here.
1577 for(
size_t k=0UL; k<K; ++k ) {
1579 xmm1 = xmm1 - a1 * B.load(k,j );
1580 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1581 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1582 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1584 (~C).
store( i, j , xmm1 );
1585 (~C).
store( i, j+IT::size , xmm2 );
1586 (~C).
store( i, j+IT::size*2UL, xmm3 );
1587 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Column blocks of 2*IT::size: rows in pairs, two accumulators per row.
1590 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1592 for( ; (i+2UL) <= M; i+=2UL ) {
1597 for(
size_t k=0UL; k<K; ++k ) {
1602 xmm1 = xmm1 - a1 * b1;
1603 xmm2 = xmm2 - a1 * b2;
1604 xmm3 = xmm3 - a2 * b1;
1605 xmm4 = xmm4 - a2 * b2;
1607 (~C).
store( i , j , xmm1 );
1608 (~C).
store( i , j+IT::size, xmm2 );
1609 (~C).
store( i+1UL, j , xmm3 );
1610 (~C).
store( i+1UL, j+IT::size, xmm4 );
// Single-row variant of the 2*IT::size block (guard not visible here).
1615 for(
size_t k=0UL; k<K; ++k ) {
1617 xmm1 = xmm1 - a1 * B.load(k,j );
1618 xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
1620 (~C).
store( i, j , xmm1 );
1621 (~C).
store( i, j+IT::size, xmm2 );
// Final column block of one IT::size: rows in pairs, one accumulator per
// row, with the element of A broadcast through set().
1626 for( ; (i+2UL) <= M; i+=2UL ) {
1629 for(
size_t k=0UL; k<K; ++k ) {
1631 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1632 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1634 (~C).
store( i , j, xmm1 );
1635 (~C).
store( i+1UL, j, xmm2 );
// Single-row variant of the final column block (guard not visible here).
1639 for(
size_t k=0UL; k<K; ++k ) {
1640 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
1642 (~C).
store( i, j, xmm1 );
// Vectorized default subtraction-assignment kernel overload for a
// DenseMatrix<MT3,true> target; enabled (EnableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// NOTE(review): only the branch conditions are visible in this listing; what
// each branch does is on lines not shown.
1663 template<
typename MT3
1666 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1667 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// Case 1: only the right operand type is resizable.
1672 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
// Case 2: only the left operand type is resizable.
1676 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
// Case 3: otherwise decide by size - A has no more elements than B.
1680 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
// Fallback overload of the BLAS subtraction-assignment dispatch: enabled
// (EnableIf) when UseDefaultKernel<MT3,MT4,MT5> holds, it performs no BLAS
// call and simply forwards C, A and B to selectDefaultSubAssignKernel.
1706 template<
typename MT3
1709 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1710 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1712 selectDefaultSubAssignKernel( C, A, B );
// BLAS subtraction-assignment kernel for single-precision operands; enabled
// (EnableIf) when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// Calls cblas_sgemm with alpha = -1.0F and beta = 1.0F, i.e. C -= A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1732 template<
typename MT3
1735 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1736 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1738 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1744 const int M ( numeric_cast<int>( A.rows() ) );
1745 const int N ( numeric_cast<int>( B.columns() ) );
1746 const int K ( numeric_cast<int>( A.columns() ) );
1747 const int lda( numeric_cast<int>( A.spacing() ) );
1748 const int ldb( numeric_cast<int>( B.spacing() ) );
1749 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1751 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1752 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1753 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1754 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction-assignment kernel for double-precision operands; enabled
// (EnableIf) when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// Calls cblas_dgemm with alpha = -1.0 and beta = 1.0, i.e. C -= A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1775 template<
typename MT3
1778 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1779 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1781 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1787 const int M ( numeric_cast<int>( A.rows() ) );
1788 const int N ( numeric_cast<int>( B.columns() ) );
1789 const int K ( numeric_cast<int>( A.columns() ) );
1790 const int lda( numeric_cast<int>( A.spacing() ) );
1791 const int ldb( numeric_cast<int>( B.spacing() ) );
1792 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1794 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1795 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1796 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1797 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction-assignment kernel for complex<float> operands; enabled
// (EnableIf) when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// Calls cblas_cgemm with alpha = (-1,0) and beta = (1,0), i.e. C -= A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1818 template<
typename MT3
1821 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1822 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1824 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1833 const int M ( numeric_cast<int>( A.rows() ) );
1834 const int N ( numeric_cast<int>( B.columns() ) );
1835 const int K ( numeric_cast<int>( A.columns() ) );
1836 const int lda( numeric_cast<int>( A.spacing() ) );
1837 const int ldb( numeric_cast<int>( B.spacing() ) );
1838 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to the routine by address.
1839 const complex<float> alpha( -1.0F, 0.0F );
1840 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1842 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1843 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1844 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1845 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction-assignment kernel for complex<double> operands; enabled
// (EnableIf) when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// Calls cblas_zgemm with alpha = (-1,0) and beta = (1,0), i.e. C -= A * B
// (GEMM computes alpha*op(A)*op(B) + beta*C).
1866 template<
typename MT3
1869 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1870 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1872 using boost::numeric_cast;
// Extents and spacing() values are converted to the int arguments CBLAS
// takes; spacing() is what gets passed in the leading-dimension slots.
1881 const int M ( numeric_cast<int>( A.rows() ) );
1882 const int N ( numeric_cast<int>( B.columns() ) );
1883 const int K ( numeric_cast<int>( A.columns() ) );
1884 const int lda( numeric_cast<int>( A.spacing() ) );
1885 const int ldb( numeric_cast<int>( B.spacing() ) );
1886 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to the routine by address.
1887 const complex<double> alpha( -1.0, 0.0 );
1888 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of the target type MT3; both
// operands are passed as CblasNoTrans for a row-major target and as
// CblasTrans otherwise.
1890 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1891 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1892 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1893 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1939 template<
typename MT1
1943 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
1944 ,
private MatScalarMultExpr
1945 ,
private Computation
1949 typedef DMatDMatMultExpr<MT1,MT2> MMM;
// Compile-time selector for the single-precision BLAS kernel of the scaled
// product. `value` is non-zero only when IsFloat holds for the ElementType
// of T1, T2 and T3 and IsComplex does not hold for T4.
// NOTE(review): T4 is presumably the scalar type of the expression - confirm
// against the call sites.
1962 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1963 struct UseSinglePrecisionKernel {
1964 enum { value = IsFloat<typename T1::ElementType>::value &&
1965 IsFloat<typename T2::ElementType>::value &&
1966 IsFloat<typename T3::ElementType>::value &&
1967 !IsComplex<T4>::value };
// Compile-time selector for the double-precision BLAS kernel of the scaled
// product. `value` is non-zero only when IsDouble holds for the ElementType
// of T1, T2 and T3 and IsComplex does not hold for T4.
// NOTE(review): T4 is presumably the scalar type of the expression - confirm
// against the call sites.
1976 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1977 struct UseDoublePrecisionKernel {
1978 enum { value = IsDouble<typename T1::ElementType>::value &&
1979 IsDouble<typename T2::ElementType>::value &&
1980 IsDouble<typename T3::ElementType>::value &&
1981 !IsComplex<T4>::value };
// Compile-time selector for the single-precision complex BLAS kernel of the
// scaled product. `value` is non-zero only when the ElementType of T1, T2
// and T3 is, in every case, exactly complex<float> (tested with IsSame).
1990 template<
typename T1,
typename T2,
typename T3 >
1991 struct UseSinglePrecisionComplexKernel {
// Element type all three operand types are compared against.
1992 typedef complex<float> Type;
1993 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1994 IsSame<typename T2::ElementType,Type>::value &&
1995 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the double-precision complex BLAS kernel of the
// scaled product. `value` is non-zero only when the ElementType of T1, T2
// and T3 is, in every case, exactly complex<double> (tested with IsSame).
2004 template<
typename T1,
typename T2,
typename T3 >
2005 struct UseDoublePrecisionComplexKernel {
// Element type all three operand types are compared against.
2006 typedef complex<double> Type;
2007 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2008 IsSame<typename T2::ElementType,Type>::value &&
2009 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the non-BLAS (default) kernel of the scaled
// product. `value` is non-zero when BLAZE_BLAS_MODE evaluates to false, or
// when none of the four BLAS selectors is satisfied. The real-valued
// selectors are queried with all four types, the complex ones with T1..T3.
2017 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2018 struct UseDefaultKernel {
2019 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2020 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2021 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2022 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the vectorized default kernel of the scaled
// product. `value` is non-zero only when all of the following hold:
//  - T1, T2 and T3 each report `vectorizable`,
//  - T2 and T3 have the same ElementType as T1,
//  - T4 is exactly that element type as well,
//  - IntrinsicTrait of that element type reports both `addition` and
//    `multiplication`.
2030 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2031 struct UseVectorizedDefaultKernel {
2032 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2033 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2034 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2035 IsSame<typename T1::ElementType,T4>::value &&
2036 IntrinsicTrait<typename T1::ElementType>::addition &&
2037 IntrinsicTrait<typename T1::ElementType>::multiplication };
2043 typedef DMatScalarMultExpr<MMM,ST,false>
This;
2044 typedef typename MultTrait<RES,ST>::Type
ResultType;
2048 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
2053 typedef const DMatDMatMultExpr<MT1,MT2>
LeftOperand;
2059 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2062 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
2067 enum { vectorizable = 0 };
2076 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
2092 return matrix_(i,j) * scalar_;
2101 inline size_t rows()
const {
2111 inline size_t columns()
const {
// Aliasing query: returns whatever the wrapped matrix_ expression reports
// for the given address; the scalar factor takes no part in the answer.
//  alias  address to be checked
2142 template<
typename T >
2143 inline bool canAlias(
const T* alias )
const {
2144 return matrix_.canAlias( alias );
// Aliasing query: returns whatever the wrapped matrix_ expression reports
// for the given address; the scalar factor takes no part in the answer.
//  alias  address to be checked
2154 template<
typename T >
2155 inline bool isAliased(
const T* alias )
const {
2156 return matrix_.isAliased( alias );
// Assignment of a scaled matrix product (rhs) to a dense matrix (lhs).
// NOTE(review): the branch bodies, the definitions of A and B, and the
// condition that chooses between the two kernel calls are on lines not
// visible in this listing.
2175 template<
typename MT3
2177 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The two operands of the wrapped matrix product.
2184 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2185 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns.
2187 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero.
2190 else if( left.columns() == 0UL ) {
// Both kernels receive the target, the two operands and the scalar factor.
2206 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2208 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (scalar loop) assignment kernel of the scaled product; disabled
// (DisableIf) when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//  C       target matrix, written through C(i,j)
//  A       left operand, read through A(i,k)
//  B       right operand, read through B(k,j)
//  scalar  scaling factor (not referenced on any line visible here)
// NOTE(review): A(i,0UL) / B(0UL,j) are read unconditionally inside the row
// loop, so the inner dimension K is presumably guaranteed non-zero by the
// caller - confirm.
2226 template<
typename MT3
2230 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2231 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2233 const size_t M( A.rows() );
2234 const size_t N( B.columns() );
2235 const size_t K( A.columns() );
2237 for(
size_t i=0UL; i<M; ++i ) {
// First pass over row i: plain assignment of the k == 0 term.
2238 for(
size_t j=0UL; j<N; ++j ) {
2239 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining terms k = 1 .. K-1 are added on top of the first pass.
2241 for(
size_t k=1UL; k<K; ++k ) {
2242 for(
size_t j=0UL; j<N; ++j ) {
2243 C(i,j) += A(i,k) * B(k,j);
// Further pass over the columns of row i; its body is not visible in this
// listing - presumably it applies `scalar` to C(i,j); TODO confirm.
2246 for(
size_t j=0UL; j<N; ++j ) {
2267 template<
typename MT3
2271 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2272 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2274 typedef IntrinsicTrait<ElementType> IT;
2276 const size_t M( A.rows() );
2277 const size_t N( B.columns() );
2278 const size_t K( A.columns() );
2284 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2285 for(
size_t i=0UL; i<M; ++i ) {
2286 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2287 for(
size_t k=0UL; k<K; ++k ) {
2289 xmm1 = xmm1 + a1 * B.load(k,j );
2290 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2291 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2292 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2293 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2294 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2295 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2296 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2298 (~C).
store( i, j , xmm1 * factor );
2299 (~C).
store( i, j+IT::size , xmm2 * factor );
2300 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
2301 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
2302 (~C).
store( i, j+IT::size*4UL, xmm5 * factor );
2303 (~C).
store( i, j+IT::size*5UL, xmm6 * factor );
2304 (~C).
store( i, j+IT::size*6UL, xmm7 * factor );
2305 (~C).
store( i, j+IT::size*7UL, xmm8 * factor );
2308 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2310 for( ; (i+2UL) <= M; i+=2UL ) {
2311 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2312 for(
size_t k=0UL; k<K; ++k ) {
2319 xmm1 = xmm1 + a1 * b1;
2320 xmm2 = xmm2 + a1 * b2;
2321 xmm3 = xmm3 + a1 * b3;
2322 xmm4 = xmm4 + a1 * b4;
2323 xmm5 = xmm5 + a2 * b1;
2324 xmm6 = xmm6 + a2 * b2;
2325 xmm7 = xmm7 + a2 * b3;
2326 xmm8 = xmm8 + a2 * b4;
2328 (~C).
store( i , j , xmm1 * factor );
2329 (~C).
store( i , j+IT::size , xmm2 * factor );
2330 (~C).
store( i , j+IT::size*2UL, xmm3 * factor );
2331 (~C).
store( i , j+IT::size*3UL, xmm4 * factor );
2332 (~C).
store( i+1UL, j , xmm5 * factor );
2333 (~C).
store( i+1UL, j+IT::size , xmm6 * factor );
2334 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 * factor );
2335 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 * factor );
2339 for(
size_t k=0UL; k<K; ++k ) {
2341 xmm1 = xmm1 + a1 * B.load(k,j );
2342 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2343 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2344 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2346 (~C).
store( i, j , xmm1 * factor );
2347 (~C).
store( i, j+IT::size , xmm2 * factor );
2348 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
2349 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
2352 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2354 for( ; (i+2UL) <= M; i+=2UL ) {
2356 for(
size_t k=0UL; k<K; ++k ) {
2361 xmm1 = xmm1 + a1 * b1;
2362 xmm2 = xmm2 + a1 * b2;
2363 xmm3 = xmm3 + a2 * b1;
2364 xmm4 = xmm4 + a2 * b2;
2366 (~C).
store( i , j , xmm1 * factor );
2367 (~C).
store( i , j+IT::size, xmm2 * factor );
2368 (~C).
store( i+1UL, j , xmm3 * factor );
2369 (~C).
store( i+1UL, j+IT::size, xmm4 * factor );
2373 for(
size_t k=0UL; k<K; ++k ) {
2375 xmm1 = xmm1 + a1 * B.load(k,j );
2376 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2378 (~C).
store( i, j , xmm1 * factor );
2379 (~C).
store( i, j+IT::size, xmm2 * factor );
2384 for( ; (i+2UL) <= M; i+=2UL ) {
2386 for(
size_t k=0UL; k<K; ++k ) {
2388 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2389 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2391 (~C).
store( i , j, xmm1 * factor );
2392 (~C).
store( i+1UL, j, xmm2 * factor );
2396 for(
size_t k=0UL; k<K; ++k ) {
2397 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2399 (~C).
store( i, j, xmm1 * factor );
2419 template<
typename MT3
2423 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2424 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2429 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2431 assign( ~C, tmp * B * scalar );
2433 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2435 assign( ~C, A * tmp * scalar );
2437 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2439 assign( ~C, tmp * B * scalar );
2443 assign( ~C, A * tmp * scalar );
2462 template<
typename MT3
2466 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2467 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2469 selectDefaultAssignKernel( C, A, B, scalar );
2488 template<
typename MT3
2492 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2493 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2495 using boost::numeric_cast;
2501 const int M ( numeric_cast<int>( A.rows() ) );
2502 const int N ( numeric_cast<int>( B.columns() ) );
2503 const int K ( numeric_cast<int>( A.columns() ) );
2504 const int lda( numeric_cast<int>( A.spacing() ) );
2505 const int ldb( numeric_cast<int>( B.spacing() ) );
2506 const int ldc( numeric_cast<int>( C.spacing() ) );
2508 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2509 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2510 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2511 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2531 template<
typename MT3
2535 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2536 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2538 using boost::numeric_cast;
2544 const int M ( numeric_cast<int>( A.rows() ) );
2545 const int N ( numeric_cast<int>( B.columns() ) );
2546 const int K ( numeric_cast<int>( A.columns() ) );
2547 const int lda( numeric_cast<int>( A.spacing() ) );
2548 const int ldb( numeric_cast<int>( B.spacing() ) );
2549 const int ldc( numeric_cast<int>( C.spacing() ) );
2551 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2552 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2553 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2554 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
2574 template<
typename MT3
2578 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2579 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2581 using boost::numeric_cast;
2590 const int M ( numeric_cast<int>( A.rows() ) );
2591 const int N ( numeric_cast<int>( B.columns() ) );
2592 const int K ( numeric_cast<int>( A.columns() ) );
2593 const int lda( numeric_cast<int>( A.spacing() ) );
2594 const int ldb( numeric_cast<int>( B.spacing() ) );
2595 const int ldc( numeric_cast<int>( C.spacing() ) );
2596 const complex<float> alpha( scalar );
2597 const complex<float> beta ( 0.0F, 0.0F );
2599 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2600 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2601 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2602 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2622 template<
typename MT3
2626 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2627 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2629 using boost::numeric_cast;
2638 const int M ( numeric_cast<int>( A.rows() ) );
2639 const int N ( numeric_cast<int>( B.columns() ) );
2640 const int K ( numeric_cast<int>( A.columns() ) );
2641 const int lda( numeric_cast<int>( A.spacing() ) );
2642 const int ldb( numeric_cast<int>( B.spacing() ) );
2643 const int ldc( numeric_cast<int>( C.spacing() ) );
2644 const complex<double> alpha( scalar );
2645 const complex<double> beta ( 0.0, 0.0 );
2647 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2648 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2649 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2650 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2666 template<
typename MT
2668 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
2672 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2684 const TmpType tmp( rhs );
2701 template<
typename MT3
2703 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2710 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2711 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2713 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
2728 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2730 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2748 template<
typename MT3
2752 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2753 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2774 template<
typename MT3
2778 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2779 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2781 typedef IntrinsicTrait<ElementType> IT;
2783 const size_t M( A.rows() );
2784 const size_t N( B.columns() );
2785 const size_t K( A.columns() );
2791 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2792 for(
size_t i=0UL; i<M; ++i ) {
2793 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2794 for(
size_t k=0UL; k<K; ++k ) {
2796 xmm1 = xmm1 + a1 * B.load(k,j );
2797 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2798 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2799 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2800 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2801 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2802 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2803 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2805 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
2806 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
2807 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
2808 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
2809 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
2810 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
2811 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
2812 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
2815 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2817 for( ; (i+2UL) <= M; i+=2UL ) {
2818 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2819 for(
size_t k=0UL; k<K; ++k ) {
2826 xmm1 = xmm1 + a1 * b1;
2827 xmm2 = xmm2 + a1 * b2;
2828 xmm3 = xmm3 + a1 * b3;
2829 xmm4 = xmm4 + a1 * b4;
2830 xmm5 = xmm5 + a2 * b1;
2831 xmm6 = xmm6 + a2 * b2;
2832 xmm7 = xmm7 + a2 * b3;
2833 xmm8 = xmm8 + a2 * b4;
2835 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
2836 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
2837 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
2838 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
2839 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
2840 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
2841 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
2842 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
2846 for(
size_t k=0UL; k<K; ++k ) {
2848 xmm1 = xmm1 + a1 * B.load(k,j );
2849 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2850 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2851 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2853 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
2854 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
2855 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
2856 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
2859 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2861 for( ; (i+2UL) <= M; i+=2UL ) {
2863 for(
size_t k=0UL; k<K; ++k ) {
2868 xmm1 = xmm1 + a1 * b1;
2869 xmm2 = xmm2 + a1 * b2;
2870 xmm3 = xmm3 + a2 * b1;
2871 xmm4 = xmm4 + a2 * b2;
2873 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
2874 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
2875 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
2876 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
2880 for(
size_t k=0UL; k<K; ++k ) {
2882 xmm1 = xmm1 + a1 * B.load(k,j );
2883 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2885 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
2886 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
2891 for( ; (i+2UL) <= M; i+=2UL ) {
2893 for(
size_t k=0UL; k<K; ++k ) {
2895 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2896 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2898 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
2899 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
2903 for(
size_t k=0UL; k<K; ++k ) {
2904 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2906 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
2926 template<
typename MT3
2930 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2931 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2936 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2940 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2944 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2969 template<
typename MT3
2973 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2974 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2976 selectDefaultAddAssignKernel( C, A, B, scalar );
2995 template<
typename MT3
2999 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3000 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3002 using boost::numeric_cast;
3008 const int M ( numeric_cast<int>( A.rows() ) );
3009 const int N ( numeric_cast<int>( B.columns() ) );
3010 const int K ( numeric_cast<int>( A.columns() ) );
3011 const int lda( numeric_cast<int>( A.spacing() ) );
3012 const int ldb( numeric_cast<int>( B.spacing() ) );
3013 const int ldc( numeric_cast<int>( C.spacing() ) );
3015 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3016 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3017 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3018 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3038 template<
typename MT3
3042 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3043 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3045 using boost::numeric_cast;
3051 const int M ( numeric_cast<int>( A.rows() ) );
3052 const int N ( numeric_cast<int>( B.columns() ) );
3053 const int K ( numeric_cast<int>( A.columns() ) );
3054 const int lda( numeric_cast<int>( A.spacing() ) );
3055 const int ldb( numeric_cast<int>( B.spacing() ) );
3056 const int ldc( numeric_cast<int>( C.spacing() ) );
3058 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3059 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3060 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3061 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3081 template<
typename MT3
3085 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3086 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3088 using boost::numeric_cast;
3097 const int M ( numeric_cast<int>( A.rows() ) );
3098 const int N ( numeric_cast<int>( B.columns() ) );
3099 const int K ( numeric_cast<int>( A.columns() ) );
3100 const int lda( numeric_cast<int>( A.spacing() ) );
3101 const int ldb( numeric_cast<int>( B.spacing() ) );
3102 const int ldc( numeric_cast<int>( C.spacing() ) );
3103 const complex<float> alpha( scalar );
3104 const complex<float> beta ( 1.0F, 0.0F );
3106 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3107 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3108 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3109 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3129 template<
typename MT3
3133 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3134 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3136 using boost::numeric_cast;
3145 const int M ( numeric_cast<int>( A.rows() ) );
3146 const int N ( numeric_cast<int>( B.columns() ) );
3147 const int K ( numeric_cast<int>( A.columns() ) );
3148 const int lda( numeric_cast<int>( A.spacing() ) );
3149 const int ldb( numeric_cast<int>( B.spacing() ) );
3150 const int ldc( numeric_cast<int>( C.spacing() ) );
3151 const complex<double> alpha( scalar );
3152 const complex<double> beta ( 1.0, 0.0 );
3154 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3155 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3156 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3157 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3178 template<
typename MT3
3180 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
3187 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3188 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3190 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
3205 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3207 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3225 template<
typename MT3
3229 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3230 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3251 template<
typename MT3
3255 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3256 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3258 typedef IntrinsicTrait<ElementType> IT;
3260 const size_t M( A.rows() );
3261 const size_t N( B.columns() );
3262 const size_t K( A.columns() );
3268 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3269 for(
size_t i=0UL; i<M; ++i ) {
3270 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3271 for(
size_t k=0UL; k<K; ++k ) {
3273 xmm1 = xmm1 + a1 * B.load(k,j );
3274 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3275 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3276 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3277 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3278 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3279 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3280 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3282 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
3283 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
3284 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
3285 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
3286 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
3287 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
3288 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
3289 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
3292 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3294 for( ; (i+2UL) <= M; i+=2UL ) {
3295 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3296 for(
size_t k=0UL; k<K; ++k ) {
3303 xmm1 = xmm1 + a1 * b1;
3304 xmm2 = xmm2 + a1 * b2;
3305 xmm3 = xmm3 + a1 * b3;
3306 xmm4 = xmm4 + a1 * b4;
3307 xmm5 = xmm5 + a2 * b1;
3308 xmm6 = xmm6 + a2 * b2;
3309 xmm7 = xmm7 + a2 * b3;
3310 xmm8 = xmm8 + a2 * b4;
3312 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
3313 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
3314 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
3315 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
3316 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
3317 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
3318 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
3319 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
3323 for(
size_t k=0UL; k<K; ++k ) {
3325 xmm1 = xmm1 + a1 * B.load(k,j );
3326 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3327 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3328 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3330 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
3331 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
3332 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
3333 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
3336 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3338 for( ; (i+2UL) <= M; i+=2UL ) {
3340 for(
size_t k=0UL; k<K; ++k ) {
3345 xmm1 = xmm1 + a1 * b1;
3346 xmm2 = xmm2 + a1 * b2;
3347 xmm3 = xmm3 + a2 * b1;
3348 xmm4 = xmm4 + a2 * b2;
3350 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
3351 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
3352 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
3353 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
3357 for(
size_t k=0UL; k<K; ++k ) {
3359 xmm1 = xmm1 + a1 * B.load(k,j );
3360 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3362 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
3363 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
3368 for( ; (i+2UL) <= M; i+=2UL ) {
3370 for(
size_t k=0UL; k<K; ++k ) {
3372 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3373 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3375 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
3376 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
3380 for(
size_t k=0UL; k<K; ++k ) {
3381 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3383 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
3403 template<
typename MT3
3407 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3408 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3413 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3417 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3421 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3446 template<
typename MT3
3450 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3451 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3453 selectDefaultSubAssignKernel( C, A, B, scalar );
3472 template<
typename MT3
3476 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3477 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3479 using boost::numeric_cast;
3485 const int M ( numeric_cast<int>( A.rows() ) );
3486 const int N ( numeric_cast<int>( B.columns() ) );
3487 const int K ( numeric_cast<int>( A.columns() ) );
3488 const int lda( numeric_cast<int>( A.spacing() ) );
3489 const int ldb( numeric_cast<int>( B.spacing() ) );
3490 const int ldc( numeric_cast<int>( C.spacing() ) );
3492 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3493 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3494 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3495 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3515 template<
typename MT3
3519 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3520 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3522 using boost::numeric_cast;
3528 const int M ( numeric_cast<int>( A.rows() ) );
3529 const int N ( numeric_cast<int>( B.columns() ) );
3530 const int K ( numeric_cast<int>( A.columns() ) );
3531 const int lda( numeric_cast<int>( A.spacing() ) );
3532 const int ldb( numeric_cast<int>( B.spacing() ) );
3533 const int ldc( numeric_cast<int>( C.spacing() ) );
3535 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3536 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3537 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3538 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3558 template<
typename MT3
3562 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3563 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3565 using boost::numeric_cast;
3574 const int M ( numeric_cast<int>( A.rows() ) );
3575 const int N ( numeric_cast<int>( B.columns() ) );
3576 const int K ( numeric_cast<int>( A.columns() ) );
3577 const int lda( numeric_cast<int>( A.spacing() ) );
3578 const int ldb( numeric_cast<int>( B.spacing() ) );
3579 const int ldc( numeric_cast<int>( C.spacing() ) );
3580 const complex<float> alpha( -scalar );
3581 const complex<float> beta ( 1.0F, 0.0F );
3583 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3584 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3585 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3586 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3606 template<
typename MT3
3610 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3611 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3613 using boost::numeric_cast;
3622 const int M ( numeric_cast<int>( A.rows() ) );
3623 const int N ( numeric_cast<int>( B.columns() ) );
3624 const int K ( numeric_cast<int>( A.columns() ) );
3625 const int lda( numeric_cast<int>( A.spacing() ) );
3626 const int ldb( numeric_cast<int>( B.spacing() ) );
3627 const int ldc( numeric_cast<int>( C.spacing() ) );
3628 const complex<double> alpha( -scalar );
3629 const complex<double> beta ( 1.0, 0.0 );
3631 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3632 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3633 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3634 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3700 template<
typename T1
3702 inline const DMatDMatMultExpr<T1,T2>
3708 throw std::invalid_argument(
"Matrix sizes do not match" );
3725 template<
typename MT1,
typename MT2,
typename VT >
3730 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3731 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3732 IsDenseVector<VT>::value && IsColumnVector<VT>::value
3733 ,
typename DMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
3734 , INVALID_TYPE >::Type Type;
3743 template<
typename MT1,
typename MT2,
typename VT >
3748 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3749 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3750 IsSparseVector<VT>::value && IsColumnVector<VT>::value
3751 ,
typename DMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
3752 , INVALID_TYPE >::Type Type;
3761 template<
typename VT,
typename MT1,
typename MT2 >
3766 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
3767 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3768 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3769 ,
typename TDVecDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3770 , INVALID_TYPE >::Type Type;
3779 template<
typename VT,
typename MT1,
typename MT2 >
3784 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
3785 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3786 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3787 ,
typename TDVecDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3788 , INVALID_TYPE >::Type Type;
3797 template<
typename MT1,
typename MT2 >
3802 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1>::Type
3803 ,
typename SubmatrixExprTrait<const MT2>::Type >::Type Type;
3812 template<
typename MT1,
typename MT2 >
3817 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
3826 template<
typename MT1,
typename MT2 >
3831 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:123
Constraint on the data type.
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4512
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:3703
Header file for the SparseVector base class.
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:196
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:232
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:297
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
DMatDMatMultExpr< MT1, MT2 > This
Type of this DMatDMatMultExpr instance.
Definition: DMatDMatMultExpr.h:219
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:223
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2375
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:248
Header file for the DenseVector base class.
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:246
Compile time check for double precision floating point types. This type trait tests whether or not the...
Definition: IsDouble.h:75
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:229
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Expression object for dense matrix-dense matrix multiplications. The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:114
CompressedMatrix< Type, false > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:2371
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
SelectType< IsComputation< MT1 >::value, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:235
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:317
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDMatMultExpr.h:224
Header file for the DenseMatrix base class.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
const size_t DMATDMATMULT_THRESHOLD
Row-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the thr...
Definition: Thresholds.h:119
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:252
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2373
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:221
Header file for the EnableIf class template.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:339
Header file for the IsNumeric type trait.
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:327
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:267
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:358
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:648
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:220
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:239
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:226
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:307
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:222
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:225
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:247
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:120
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2370
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the TSVecDMatMultExprTrait class template.
SelectType< IsComputation< MT2 >::value, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:238
Header file for the complex data type.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:122
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:121
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Compile time check for single precision floating point types. This type trait tests whether or not the...
Definition: IsFloat.h:75
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:359
Header file for the IsResizable type trait.
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Operand matrix_
The dense matrix containing the submatrix.
Definition: DenseSubmatrix.h:2792
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
void store(float *address, const sse_float_t &value)
Aligned store of a vector of 'float' values.
Definition: Store.h:242
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:351
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.