35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
114 template<
typename MT1
// Compile-time switch used with EnableIf/DisableIf to pick between the two
// selectXxxAssignKernel overloads. `value` is non-zero when either of the
// enclosing-scope flags evaluateLeft / evaluateRight is set; the template
// parameters T1-T3 do not take part in the visible expression.
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseSMPAssignKernel {
147 enum { value = evaluateLeft || evaluateRight };
// Compile-time switch for the cblas_sgemm-based kernels: `value` is non-zero
// only when the element types of all three matrix types satisfy IsFloat.
157 template<
typename T1,
typename T2,
typename T3 >
158 struct UseSinglePrecisionKernel {
159 enum { value = IsFloat<typename T1::ElementType>::value &&
160 IsFloat<typename T2::ElementType>::value &&
161 IsFloat<typename T3::ElementType>::value };
// Compile-time switch for the cblas_dgemm-based kernels: `value` is non-zero
// only when the element types of all three matrix types satisfy IsDouble.
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDoublePrecisionKernel {
173 enum { value = IsDouble<typename T1::ElementType>::value &&
174 IsDouble<typename T2::ElementType>::value &&
175 IsDouble<typename T3::ElementType>::value };
// Compile-time switch for the cblas_cgemm-based kernels: `value` is non-zero
// only when all three element types are exactly complex<float>.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseSinglePrecisionComplexKernel {
// Element type every operand must match exactly (checked with IsSame).
188 typedef complex<float> Type;
189 enum { value = IsSame<typename T1::ElementType,Type>::value &&
190 IsSame<typename T2::ElementType,Type>::value &&
191 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the cblas_zgemm-based kernels: `value` is non-zero
// only when all three element types are exactly complex<double>.
202 template<
typename T1,
typename T2,
typename T3 >
203 struct UseDoublePrecisionComplexKernel {
// Element type every operand must match exactly (checked with IsSame).
204 typedef complex<double> Type;
205 enum { value = IsSame<typename T1::ElementType,Type>::value &&
206 IsSame<typename T2::ElementType,Type>::value &&
207 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS fallback: `value` is non-zero when
// BLAZE_BLAS_MODE evaluates to false, or when none of the four BLAS kernel
// switches (float, double, complex<float>, complex<double>) applies to the
// given type triple.
217 template<
typename T1,
typename T2,
typename T3 >
218 struct UseDefaultKernel {
219 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
220 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
221 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
222 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch between the scalar and the intrinsic-based default
// kernels. `value` is non-zero only when all three types are flagged
// vectorizable, share one element type, and IntrinsicTrait reports both
// addition and multiplication for that element type.
232 template<
typename T1,
typename T2,
typename T3 >
233 struct UseVectorizedDefaultKernel {
234 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
235 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
236 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
237 IntrinsicTrait<typename T1::ElementType>::addition &&
238 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Vectorization flag of the expression; requires both operand types to be
// vectorizable (the remaining conditions continue on lines not visible here).
269 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
// The expression is SMP-assignable only when neither operand flag
// (evaluateLeft / evaluateRight) is set.
275 enum { smpAssignable = !evaluateLeft && !evaluateRight };
// Fragment of a single-element evaluation that accumulates products
// lhs_(i,k) * rhs_(k,j) into `tmp` (the enclosing signature and several
// statements fall on lines not visible here).
// Only accumulate when the inner dimension is non-empty.
305 if(
lhs_.columns() != 0UL ) {
// size_t(-2) has every bit set except the lowest, so the mask clears bit 0:
// `end` equals columns() when that count is odd and columns()-1 when it is even.
306 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// Walk k in steps of two starting at 1; the visible statement handles the
// k+1 term of each pair.
308 for(
size_t k=1UL; k<end; k+=2UL ) {
310 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// True only for an even column count, i.e. when one trailing index remains.
312 if( end <
lhs_.columns() ) {
// Forwards the column count of the right-hand operand (enclosing signature
// not visible here).
340 return rhs_.columns();
// Alias query templated on T: reports true when either operand's isAliased()
// accepts `alias`. The function name and parameter list fall on a line not
// visible here.
370 template<
typename T >
372 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Second alias query with the same visible body as the one above it: true
// when either operand's isAliased() accepts `alias`. The function name and
// parameter list fall on a line not visible here.
382 template<
typename T >
384 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Alignment query: true only when both operands report isAligned()
// (enclosing signature not visible here).
394 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of the assignment entry point that evaluates the product into a
// target `lhs`. The signature and the branch bodies are on lines not visible
// here.
426 template<
typename MT
// Empty target: special-cased before any kernel is selected.
435 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Empty inner dimension (left operand has no columns): also special-cased.
// NOTE(review): presumably the target is reset here - confirm in the full file.
438 else if( rhs.
lhs_.columns() == 0UL ) {
// A and B are defined on lines not visible here; dispatch to the kernel
// selection overloads.
453 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel selection for C = A * B. Forwards to either the default kernel or
// the BLAS kernel family; the condition choosing between the two calls is on
// a line not visible here.
469 template<
typename MT3
473 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
476 DMatTDMatMultExpr::selectDefaultAssignKernel( C, A, B );
478 DMatTDMatMultExpr::selectBlasAssignKernel( C, A, B );
// Overload of selectAssignKernel enabled when UseSMPAssignKernel<MT3,MT4,MT5>
// is set. Its body is on lines not visible here.
494 template<
typename MT3
497 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
498 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalar default kernel for C = A * B, selected when
// UseVectorizedDefaultKernel is NOT set for the type triple.
//   C : target matrix, written element-wise via C(i,j)
//   A : left operand,  read via A(i,k)
//   B : right operand, read via B(k,j)
519 template<
typename MT3
522 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
523 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N.
525 const size_t M( A.rows() );
526 const size_t N( B.columns() );
527 const size_t K( A.columns() );
529 for(
size_t i=0UL; i<M; ++i ) {
// First pass initialises row i of C with the k == 0 term, so C needs no
// prior reset. This reads A(i,0) and B(0,j), i.e. it assumes K >= 1.
// NOTE(review): the caller special-cases an empty inner dimension - confirm
// that path never reaches this kernel.
530 for(
size_t j=0UL; j<N; ++j ) {
531 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining terms k = 1 .. K-1 are accumulated on top.
533 for(
size_t k=1UL; k<K; ++k ) {
534 for(
size_t j=0UL; j<N; ++j ) {
535 C(i,j) += A(i,k) * B(k,j);
// Intrinsic-based default kernel for C = A * B, overload for targets of type
// DenseMatrix<MT3,false>; enabled when UseVectorizedDefaultKernel is set.
// NOTE(review): the `false` flag presumably denotes a row-major target - confirm.
// C is tiled as 2x4, 2x2 and 2x1 blocks for row pairs, then 1x4, 1x2 and 1x1
// for a remaining row. Each tile element is an inner product over k computed
// in steps of IT::size and reduced with sum().
// Declarations of i, j, the operand registers a1.., b1.. and most accumulator
// declarations are on lines not visible here.
// NOTE(review): the k loops have no remainder handling, so they appear to rely
// on loads past K being safe (padding) - confirm.
557 template<
typename MT3
560 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
561 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
563 typedef IntrinsicTrait<ElementType> IT;
565 const size_t M( A.rows() );
566 const size_t N( B.columns() );
567 const size_t K( A.columns() );
// Two rows of C at a time.
571 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators, one per target element.
573 for( ; (j+4UL) <= N; j+=4UL ) {
574 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
575 for(
size_t k=0UL; k<K; k+=IT::size ) {
582 xmm1 = xmm1 + a1 * b1;
583 xmm2 = xmm2 + a1 * b2;
584 xmm3 = xmm3 + a1 * b3;
585 xmm4 = xmm4 + a1 * b4;
586 xmm5 = xmm5 + a2 * b1;
587 xmm6 = xmm6 + a2 * b2;
588 xmm7 = xmm7 + a2 * b3;
589 xmm8 = xmm8 + a2 * b4;
591 (~C)(i ,j ) =
sum( xmm1 );
592 (~C)(i ,j+1UL) =
sum( xmm2 );
593 (~C)(i ,j+2UL) =
sum( xmm3 );
594 (~C)(i ,j+3UL) =
sum( xmm4 );
595 (~C)(i+1UL,j ) =
sum( xmm5 );
596 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
597 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
598 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
// 2x2 tile for remaining column pairs.
600 for( ; (j+2UL) <= N; j+=2UL ) {
602 for(
size_t k=0UL; k<K; k+=IT::size ) {
607 xmm1 = xmm1 + a1 * b1;
608 xmm2 = xmm2 + a1 * b2;
609 xmm3 = xmm3 + a2 * b1;
610 xmm4 = xmm4 + a2 * b2;
612 (~C)(i ,j ) =
sum( xmm1 );
613 (~C)(i ,j+1UL) =
sum( xmm2 );
614 (~C)(i+1UL,j ) =
sum( xmm3 );
615 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
// 2x1 tile: one remaining column for the row pair.
619 for(
size_t k=0UL; k<K; k+=IT::size ) {
621 xmm1 = xmm1 + A.load(i ,k) * b1;
622 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
624 (~C)(i ,j) =
sum( xmm1 );
625 (~C)(i+1UL,j) =
sum( xmm2 );
// Single remaining row: 1x4 tiles.
630 for( ; (j+4UL) <= N; j+=4UL ) {
632 for(
size_t k=0UL; k<K; k+=IT::size ) {
634 xmm1 = xmm1 + a1 * B.load(k,j );
635 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
636 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
637 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
639 (~C)(i,j ) =
sum( xmm1 );
640 (~C)(i,j+1UL) =
sum( xmm2 );
641 (~C)(i,j+2UL) =
sum( xmm3 );
642 (~C)(i,j+3UL) =
sum( xmm4 );
// Single remaining row: 1x2 tiles.
644 for( ; (j+2UL) <= N; j+=2UL ) {
646 for(
size_t k=0UL; k<K; k+=IT::size ) {
648 xmm1 = xmm1 + a1 * B.load(k,j );
649 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
651 (~C)(i,j ) =
sum( xmm1 );
652 (~C)(i,j+1UL) =
sum( xmm2 );
// Single remaining element (1x1).
656 for(
size_t k=0UL; k<K; k+=IT::size ) {
657 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
659 (~C)(i,j) =
sum( xmm1 );
// Intrinsic-based default kernel for C = A * B, overload for targets of type
// DenseMatrix<MT3,true>; enabled when UseVectorizedDefaultKernel is set.
// NOTE(review): the `true` flag presumably denotes a column-major target - confirm.
// C is tiled as 4x2 and 4x1 blocks for groups of four rows, then 2x2 and 2x1
// for a remaining row pair, then 1x2 and 1x1 for a remaining row. Each tile
// element is an inner product over k computed in steps of IT::size and
// reduced with sum().
// Declarations of i, j, the operand registers a1.., b1.. and most accumulator
// declarations are on lines not visible here.
// NOTE(review): the k loops have no remainder handling, so they appear to rely
// on loads past K being safe (padding) - confirm.
680 template<
typename MT3
683 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
684 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
686 typedef IntrinsicTrait<ElementType> IT;
688 const size_t M( A.rows() );
689 const size_t N( B.columns() );
690 const size_t K( A.columns() );
// Four rows of C at a time.
694 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators, one per target element.
696 for( ; (j+2UL) <= N; j+=2UL ) {
697 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
698 for(
size_t k=0UL; k<K; k+=IT::size ) {
705 xmm1 = xmm1 + a1 * b1;
706 xmm2 = xmm2 + a1 * b2;
707 xmm3 = xmm3 + a2 * b1;
708 xmm4 = xmm4 + a2 * b2;
709 xmm5 = xmm5 + a3 * b1;
710 xmm6 = xmm6 + a3 * b2;
711 xmm7 = xmm7 + a4 * b1;
712 xmm8 = xmm8 + a4 * b2;
714 (~C)(i ,j ) =
sum( xmm1 );
715 (~C)(i ,j+1UL) =
sum( xmm2 );
716 (~C)(i+1UL,j ) =
sum( xmm3 );
717 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
718 (~C)(i+2UL,j ) =
sum( xmm5 );
719 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
720 (~C)(i+3UL,j ) =
sum( xmm7 );
721 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
// 4x1 tile: one remaining column for the four-row group.
725 for(
size_t k=0UL; k<K; k+=IT::size ) {
727 xmm1 = xmm1 + A.load(i ,k) * b1;
728 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
729 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
730 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
732 (~C)(i ,j) =
sum( xmm1 );
733 (~C)(i+1UL,j) =
sum( xmm2 );
734 (~C)(i+2UL,j) =
sum( xmm3 );
735 (~C)(i+3UL,j) =
sum( xmm4 );
// Remaining row pair: 2x2 tiles.
738 for( ; (i+2UL) <= M; i+=2UL ) {
740 for( ; (j+2UL) <= N; j+=2UL ) {
742 for(
size_t k=0UL; k<K; k+=IT::size ) {
747 xmm1 = xmm1 + a1 * b1;
748 xmm2 = xmm2 + a1 * b2;
749 xmm3 = xmm3 + a2 * b1;
750 xmm4 = xmm4 + a2 * b2;
752 (~C)(i ,j ) =
sum( xmm1 );
753 (~C)(i ,j+1UL) =
sum( xmm2 );
754 (~C)(i+1UL,j ) =
sum( xmm3 );
755 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
// 2x1 tile: one remaining column for the row pair.
759 for(
size_t k=0UL; k<K; k+=IT::size ) {
761 xmm1 = xmm1 + A.load(i ,k) * b1;
762 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
764 (~C)(i ,j) =
sum( xmm1 );
765 (~C)(i+1UL,j) =
sum( xmm2 );
// Single remaining row: 1x2 tiles.
770 for( ; (j+2UL) <= N; j+=2UL ) {
772 for(
size_t k=0UL; k<K; k+=IT::size ) {
774 xmm1 = xmm1 + a1 * B.load(k,j );
775 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
777 (~C)(i,j ) =
sum( xmm1 );
778 (~C)(i,j+1UL) =
sum( xmm2 );
// Single remaining element (1x1).
782 for(
size_t k=0UL; k<K; k+=IT::size ) {
783 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
785 (~C)(i,j) =
sum( xmm1 );
// Fallback member of the BLAS kernel family for C = A * B: enabled when
// UseDefaultKernel is set (BLAS mode off or no matching BLAS element type)
// and simply forwards to the default kernel.
806 template<
typename MT3
809 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
810 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
812 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for C = A * B with float elements (cblas_sgemm); enabled when
// UseSinglePrecisionKernel is set for the type triple.
832 template<
typename MT3
835 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
836 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
838 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
844 const int M ( numeric_cast<int>( A.rows() ) );
845 const int N ( numeric_cast<int>( B.columns() ) );
846 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
847 const int lda( numeric_cast<int>( A.spacing() ) );
848 const int ldb( numeric_cast<int>( B.spacing() ) );
849 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
// alpha = 1, beta = 0: C is overwritten with the product.
851 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
852 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
853 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
854 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for C = A * B with double elements (cblas_dgemm); enabled when
// UseDoublePrecisionKernel is set for the type triple.
875 template<
typename MT3
878 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
879 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
881 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
887 const int M ( numeric_cast<int>( A.rows() ) );
888 const int N ( numeric_cast<int>( B.columns() ) );
889 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
890 const int lda( numeric_cast<int>( A.spacing() ) );
891 const int ldb( numeric_cast<int>( B.spacing() ) );
892 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
// alpha = 1, beta = 0: C is overwritten with the product.
894 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
895 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
896 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
897 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for C = A * B with complex<float> elements (cblas_cgemm);
// enabled when UseSinglePrecisionComplexKernel is set for the type triple.
918 template<
typename MT3
921 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
922 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
924 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
933 const int M ( numeric_cast<int>( A.rows() ) );
934 const int N ( numeric_cast<int>( B.columns() ) );
935 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
936 const int lda( numeric_cast<int>( A.spacing() ) );
937 const int ldb( numeric_cast<int>( B.spacing() ) );
938 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes its scalars by address: alpha = 1+0i, beta = 0+0i,
// so C is overwritten with the product.
939 const complex<float> alpha( 1.0F, 0.0F );
940 const complex<float> beta ( 0.0F, 0.0F );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
942 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
943 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
944 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
945 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C = A * B with complex<double> elements (cblas_zgemm);
// enabled when UseDoublePrecisionComplexKernel is set for the type triple.
966 template<
typename MT3
969 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
970 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
972 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
981 const int M ( numeric_cast<int>( A.rows() ) );
982 const int N ( numeric_cast<int>( B.columns() ) );
983 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
984 const int lda( numeric_cast<int>( A.spacing() ) );
985 const int ldb( numeric_cast<int>( B.spacing() ) );
986 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes its scalars by address: alpha = 1+0i, beta = 0+0i,
// so C is overwritten with the product.
987 const complex<double> alpha( 1.0, 0.0 );
988 const complex<double> beta ( 0.0, 0.0 );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
990 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
991 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
992 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
993 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of an overload that first materialises the product in a temporary.
// The signature and the statements using `tmp` are on lines not visible here.
1011 template<
typename MT
// Temporary type chosen by the flag SO: OppositeType or ResultType.
1017 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// Evaluate the whole expression `rhs` into the temporary.
1029 const TmpType tmp( rhs );
// Fragment of the addition-assignment entry point (lhs += A * B). The
// signature and the branch body are on lines not visible here.
1048 template<
typename MT
// Empty target or empty inner dimension: special-cased before any kernel is
// selected.
1057 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// A and B are defined on lines not visible here.
1071 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for C += A * B, overload used when UseSMPAssignKernel is
// NOT set. Forwards to either the default kernel or the BLAS kernel family;
// the condition choosing between the two calls is on a line not visible here.
1087 template<
typename MT3
1090 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1091 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1094 DMatTDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1096 DMatTDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
// Overload of selectAddAssignKernel enabled when
// UseSMPAssignKernel<MT3,MT4,MT5> is set. Its body is on lines not visible here.
1112 template<
typename MT3
1115 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1116 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalar default kernel for C += A * B, selected when
// UseVectorizedDefaultKernel is NOT set for the type triple.
//   C : target matrix, updated element-wise via C(i,j)
//   A : left operand,  read via A(i,k)
//   B : right operand, read via B(k,j)
1137 template<
typename MT3
1140 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1141 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N.
1143 const size_t M( A.rows() );
1144 const size_t N( B.columns() );
1145 const size_t K( A.columns() );
// N rounded down to an even number: size_t(-2) masks out the lowest bit.
1148 const size_t end( N &
size_t(-2) );
1150 for(
size_t i=0UL; i<M; ++i ) {
1151 for(
size_t k=0UL; k<K; ++k ) {
// Inner j loop unrolled by two over the even part of the row.
1152 for(
size_t j=0UL; j<end; j+=2UL ) {
1153 C(i,j ) += A(i,k) * B(k,j );
1154 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Trailing column for odd N (the guarding condition is on a line not
// visible here).
1157 C(i,end) += A(i,k) * B(k,end);
// Intrinsic-based default kernel for C += A * B, overload for targets of type
// DenseMatrix<MT3,false>; enabled when UseVectorizedDefaultKernel is set.
// NOTE(review): the `false` flag presumably denotes a row-major target - confirm.
// C is tiled as 2x4, 2x2 and 2x1 blocks for row pairs, then 1x4, 1x2 and 1x1
// for a remaining row. Each inner product over k is computed in steps of
// IT::size, reduced with sum() and added onto the existing element of C.
// Declarations of i, j, the operand registers a1.., b1.. and most accumulator
// declarations are on lines not visible here.
// NOTE(review): the k loops have no remainder handling, so they appear to rely
// on loads past K being safe (padding) - confirm.
1179 template<
typename MT3
1182 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1183 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1185 typedef IntrinsicTrait<ElementType> IT;
1187 const size_t M( A.rows() );
1188 const size_t N( B.columns() );
1189 const size_t K( A.columns() );
// Two rows of C at a time.
1193 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators, one per target element.
1195 for( ; (j+4UL) <= N; j+=4UL ) {
1196 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1197 for(
size_t k=0UL; k<K; k+=IT::size ) {
1204 xmm1 = xmm1 + a1 * b1;
1205 xmm2 = xmm2 + a1 * b2;
1206 xmm3 = xmm3 + a1 * b3;
1207 xmm4 = xmm4 + a1 * b4;
1208 xmm5 = xmm5 + a2 * b1;
1209 xmm6 = xmm6 + a2 * b2;
1210 xmm7 = xmm7 + a2 * b3;
1211 xmm8 = xmm8 + a2 * b4;
1213 (~C)(i ,j ) +=
sum( xmm1 );
1214 (~C)(i ,j+1UL) +=
sum( xmm2 );
1215 (~C)(i ,j+2UL) +=
sum( xmm3 );
1216 (~C)(i ,j+3UL) +=
sum( xmm4 );
1217 (~C)(i+1UL,j ) +=
sum( xmm5 );
1218 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
1219 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
1220 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
// 2x2 tile for remaining column pairs.
1222 for( ; (j+2UL) <= N; j+=2UL ) {
1224 for(
size_t k=0UL; k<K; k+=IT::size ) {
1229 xmm1 = xmm1 + a1 * b1;
1230 xmm2 = xmm2 + a1 * b2;
1231 xmm3 = xmm3 + a2 * b1;
1232 xmm4 = xmm4 + a2 * b2;
1234 (~C)(i ,j ) +=
sum( xmm1 );
1235 (~C)(i ,j+1UL) +=
sum( xmm2 );
1236 (~C)(i+1UL,j ) +=
sum( xmm3 );
1237 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
// 2x1 tile: one remaining column for the row pair.
1241 for(
size_t k=0UL; k<K; k+=IT::size ) {
1243 xmm1 = xmm1 + A.load(i ,k) * b1;
1244 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1246 (~C)(i ,j) +=
sum( xmm1 );
1247 (~C)(i+1UL,j) +=
sum( xmm2 );
// Single remaining row: 1x4 tiles.
1252 for( ; (j+4UL) <= N; j+=4UL ) {
1254 for(
size_t k=0UL; k<K; k+=IT::size ) {
1256 xmm1 = xmm1 + a1 * B.load(k,j );
1257 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1258 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1259 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1261 (~C)(i,j ) +=
sum( xmm1 );
1262 (~C)(i,j+1UL) +=
sum( xmm2 );
1263 (~C)(i,j+2UL) +=
sum( xmm3 );
1264 (~C)(i,j+3UL) +=
sum( xmm4 );
// Single remaining row: 1x2 tiles.
1266 for( ; (j+2UL) <= N; j+=2UL ) {
1268 for(
size_t k=0UL; k<K; k+=IT::size ) {
1270 xmm1 = xmm1 + a1 * B.load(k,j );
1271 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1273 (~C)(i,j ) +=
sum( xmm1 );
1274 (~C)(i,j+1UL) +=
sum( xmm2 );
// Single remaining element (1x1).
1278 for(
size_t k=0UL; k<K; k+=IT::size ) {
1279 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1281 (~C)(i,j) +=
sum( xmm1 );
// Intrinsic-based default kernel for C += A * B, overload for targets of type
// DenseMatrix<MT3,true>; enabled when UseVectorizedDefaultKernel is set.
// NOTE(review): the `true` flag presumably denotes a column-major target - confirm.
// C is tiled as 4x2 and 4x1 blocks for groups of four rows, then 2x2 and 2x1
// for a remaining row pair, then 1x2 and 1x1 for a remaining row. Each inner
// product over k is computed in steps of IT::size, reduced with sum() and
// added onto the existing element of C.
// Declarations of i, j, the operand registers a1.., b1.. and most accumulator
// declarations are on lines not visible here.
// NOTE(review): the k loops have no remainder handling, so they appear to rely
// on loads past K being safe (padding) - confirm.
1302 template<
typename MT3
1305 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1306 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1308 typedef IntrinsicTrait<ElementType> IT;
1310 const size_t M( A.rows() );
1311 const size_t N( B.columns() );
1312 const size_t K( A.columns() );
// Four rows of C at a time.
1316 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators, one per target element.
1318 for( ; (j+2UL) <= N; j+=2UL ) {
1319 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1320 for(
size_t k=0UL; k<K; k+=IT::size ) {
1327 xmm1 = xmm1 + a1 * b1;
1328 xmm2 = xmm2 + a1 * b2;
1329 xmm3 = xmm3 + a2 * b1;
1330 xmm4 = xmm4 + a2 * b2;
1331 xmm5 = xmm5 + a3 * b1;
1332 xmm6 = xmm6 + a3 * b2;
1333 xmm7 = xmm7 + a4 * b1;
1334 xmm8 = xmm8 + a4 * b2;
1336 (~C)(i ,j ) +=
sum( xmm1 );
1337 (~C)(i ,j+1UL) +=
sum( xmm2 );
1338 (~C)(i+1UL,j ) +=
sum( xmm3 );
1339 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
1340 (~C)(i+2UL,j ) +=
sum( xmm5 );
1341 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
1342 (~C)(i+3UL,j ) +=
sum( xmm7 );
1343 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
// 4x1 tile: one remaining column for the four-row group.
1347 for(
size_t k=0UL; k<K; k+=IT::size ) {
1349 xmm1 = xmm1 + A.load(i ,k) * b1;
1350 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1351 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1352 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1354 (~C)(i ,j) +=
sum( xmm1 );
1355 (~C)(i+1UL,j) +=
sum( xmm2 );
1356 (~C)(i+2UL,j) +=
sum( xmm3 );
1357 (~C)(i+3UL,j) +=
sum( xmm4 );
// Remaining row pair: 2x2 tiles.
1360 for( ; (i+2UL) <= M; i+=2UL ) {
1362 for( ; (j+2UL) <= N; j+=2UL ) {
1364 for(
size_t k=0UL; k<K; k+=IT::size ) {
1369 xmm1 = xmm1 + a1 * b1;
1370 xmm2 = xmm2 + a1 * b2;
1371 xmm3 = xmm3 + a2 * b1;
1372 xmm4 = xmm4 + a2 * b2;
1374 (~C)(i ,j ) +=
sum( xmm1 );
1375 (~C)(i ,j+1UL) +=
sum( xmm2 );
1376 (~C)(i+1UL,j ) +=
sum( xmm3 );
1377 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
// 2x1 tile: one remaining column for the row pair.
1381 for(
size_t k=0UL; k<K; k+=IT::size ) {
1383 xmm1 = xmm1 + A.load(i ,k) * b1;
1384 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1386 (~C)(i ,j) +=
sum( xmm1 );
1387 (~C)(i+1UL,j) +=
sum( xmm2 );
// Single remaining row: 1x2 tiles.
1392 for( ; (j+2UL) <= N; j+=2UL ) {
1394 for(
size_t k=0UL; k<K; k+=IT::size ) {
1396 xmm1 = xmm1 + a1 * B.load(k,j );
1397 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1399 (~C)(i,j ) +=
sum( xmm1 );
1400 (~C)(i,j+1UL) +=
sum( xmm2 );
// Single remaining element (1x1).
1404 for(
size_t k=0UL; k<K; k+=IT::size ) {
1405 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1407 (~C)(i,j) +=
sum( xmm1 );
// Fallback member of the BLAS kernel family for C += A * B: enabled when
// UseDefaultKernel is set (BLAS mode off or no matching BLAS element type)
// and simply forwards to the default add-assign kernel.
1428 template<
typename MT3
1431 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1432 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1434 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B with float elements (cblas_sgemm); enabled when
// UseSinglePrecisionKernel is set for the type triple.
1454 template<
typename MT3
1457 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1458 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1460 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
1466 const int M ( numeric_cast<int>( A.rows() ) );
1467 const int N ( numeric_cast<int>( B.columns() ) );
1468 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
1469 const int lda( numeric_cast<int>( A.spacing() ) );
1470 const int ldb( numeric_cast<int>( B.spacing() ) );
1471 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
// alpha = 1, beta = 1: the product is added to the existing contents of C.
1473 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1474 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1475 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1476 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C += A * B with double elements (cblas_dgemm); enabled when
// UseDoublePrecisionKernel is set for the type triple.
1497 template<
typename MT3
1500 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1501 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1503 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
1509 const int M ( numeric_cast<int>( A.rows() ) );
1510 const int N ( numeric_cast<int>( B.columns() ) );
1511 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
1512 const int lda( numeric_cast<int>( A.spacing() ) );
1513 const int ldb( numeric_cast<int>( B.spacing() ) );
1514 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
// alpha = 1, beta = 1: the product is added to the existing contents of C.
1516 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1517 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1518 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1519 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C += A * B with complex<float> elements (cblas_cgemm);
// enabled when UseSinglePrecisionComplexKernel is set for the type triple.
1540 template<
typename MT3
1543 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1544 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1546 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
1555 const int M ( numeric_cast<int>( A.rows() ) );
1556 const int N ( numeric_cast<int>( B.columns() ) );
1557 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
1558 const int lda( numeric_cast<int>( A.spacing() ) );
1559 const int ldb( numeric_cast<int>( B.spacing() ) );
1560 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes its scalars by address: alpha = 1+0i, beta = 1+0i,
// so the product is added to the existing contents of C.
1561 const complex<float> alpha( 1.0F, 0.0F );
1562 const complex<float> beta ( 1.0F, 0.0F );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
1564 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1565 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1566 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1567 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C += A * B with complex<double> elements (cblas_zgemm);
// enabled when UseDoublePrecisionComplexKernel is set for the type triple.
1588 template<
typename MT3
1591 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1592 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1594 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
1603 const int M ( numeric_cast<int>( A.rows() ) );
1604 const int N ( numeric_cast<int>( B.columns() ) );
1605 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
1606 const int lda( numeric_cast<int>( A.spacing() ) );
1607 const int ldb( numeric_cast<int>( B.spacing() ) );
1608 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes its scalars by address: alpha = 1+0i, beta = 1+0i,
// so the product is added to the existing contents of C.
1609 const complex<double> alpha( 1.0, 0.0 );
1610 const complex<double> beta ( 1.0, 0.0 );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
1612 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1613 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1614 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1615 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of the subtraction-assignment entry point (lhs -= A * B). The
// signature and the branch body are on lines not visible here.
1638 template<
typename MT
// Empty target or empty inner dimension: special-cased before any kernel is
// selected.
1647 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// A and B are defined on lines not visible here.
1661 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for C -= A * B, overload used when UseSMPAssignKernel is
// NOT set. Forwards to either the default kernel or the BLAS kernel family;
// the condition choosing between the two calls is on a line not visible here.
1677 template<
typename MT3
1680 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1681 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1684 DMatTDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1686 DMatTDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
// Overload of selectSubAssignKernel enabled when
// UseSMPAssignKernel<MT3,MT4,MT5> is set. Its body is on lines not visible here.
1702 template<
typename MT3
1705 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1706 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalar default kernel for C -= A * B, selected when
// UseVectorizedDefaultKernel is NOT set for the type triple.
//   C : target matrix, updated element-wise via C(i,j)
//   A : left operand,  read via A(i,k)
//   B : right operand, read via B(k,j)
1727 template<
typename MT3
1730 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1731 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N.
1733 const size_t M( A.rows() );
1734 const size_t N( B.columns() );
1735 const size_t K( A.columns() );
// N rounded down to an even number: size_t(-2) masks out the lowest bit.
1738 const size_t end( N &
size_t(-2) );
1740 for(
size_t i=0UL; i<M; ++i ) {
1741 for(
size_t k=0UL; k<K; ++k ) {
// Inner j loop unrolled by two over the even part of the row.
1742 for(
size_t j=0UL; j<end; j+=2UL ) {
1743 C(i,j ) -= A(i,k) * B(k,j );
1744 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Trailing column for odd N (the guarding condition is on a line not
// visible here).
1747 C(i,end) -= A(i,k) * B(k,end);
// Intrinsic-based default kernel for C -= A * B, overload for targets of type
// DenseMatrix<MT3,false>; enabled when UseVectorizedDefaultKernel is set.
// NOTE(review): the `false` flag presumably denotes a row-major target - confirm.
// C is tiled as 2x4, 2x2 and 2x1 blocks for row pairs, then 1x4, 1x2 and 1x1
// for a remaining row. Each inner product over k is accumulated positively in
// steps of IT::size; only the reduced sum() is subtracted from C.
// Declarations of i, j, the operand registers a1.., b1.. and most accumulator
// declarations are on lines not visible here.
// NOTE(review): the k loops have no remainder handling, so they appear to rely
// on loads past K being safe (padding) - confirm.
1769 template<
typename MT3
1772 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1773 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1775 typedef IntrinsicTrait<ElementType> IT;
1777 const size_t M( A.rows() );
1778 const size_t N( B.columns() );
1779 const size_t K( A.columns() );
// Two rows of C at a time.
1783 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators, one per target element.
1785 for( ; (j+4UL) <= N; j+=4UL ) {
1786 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1787 for(
size_t k=0UL; k<K; k+=IT::size ) {
1794 xmm1 = xmm1 + a1 * b1;
1795 xmm2 = xmm2 + a1 * b2;
1796 xmm3 = xmm3 + a1 * b3;
1797 xmm4 = xmm4 + a1 * b4;
1798 xmm5 = xmm5 + a2 * b1;
1799 xmm6 = xmm6 + a2 * b2;
1800 xmm7 = xmm7 + a2 * b3;
1801 xmm8 = xmm8 + a2 * b4;
1803 (~C)(i ,j ) -=
sum( xmm1 );
1804 (~C)(i ,j+1UL) -=
sum( xmm2 );
1805 (~C)(i ,j+2UL) -=
sum( xmm3 );
1806 (~C)(i ,j+3UL) -=
sum( xmm4 );
1807 (~C)(i+1UL,j ) -=
sum( xmm5 );
1808 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
1809 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
1810 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// 2x2 tile for remaining column pairs.
1812 for( ; (j+2UL) <= N; j+=2UL ) {
1814 for(
size_t k=0UL; k<K; k+=IT::size ) {
1819 xmm1 = xmm1 + a1 * b1;
1820 xmm2 = xmm2 + a1 * b2;
1821 xmm3 = xmm3 + a2 * b1;
1822 xmm4 = xmm4 + a2 * b2;
1824 (~C)(i ,j ) -=
sum( xmm1 );
1825 (~C)(i ,j+1UL) -=
sum( xmm2 );
1826 (~C)(i+1UL,j ) -=
sum( xmm3 );
1827 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
// 2x1 tile: one remaining column for the row pair.
1831 for(
size_t k=0UL; k<K; k+=IT::size ) {
1833 xmm1 = xmm1 + A.load(i ,k) * b1;
1834 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1836 (~C)(i ,j) -=
sum( xmm1 );
1837 (~C)(i+1UL,j) -=
sum( xmm2 );
// Single remaining row: 1x4 tiles.
1842 for( ; (j+4UL) <= N; j+=4UL ) {
1844 for(
size_t k=0UL; k<K; k+=IT::size ) {
1846 xmm1 = xmm1 + a1 * B.load(k,j );
1847 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1848 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1849 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1851 (~C)(i,j ) -=
sum( xmm1 );
1852 (~C)(i,j+1UL) -=
sum( xmm2 );
1853 (~C)(i,j+2UL) -=
sum( xmm3 );
1854 (~C)(i,j+3UL) -=
sum( xmm4 );
// Single remaining row: 1x2 tiles.
1856 for( ; (j+2UL) <= N; j+=2UL ) {
1858 for(
size_t k=0UL; k<K; k+=IT::size ) {
1860 xmm1 = xmm1 + a1 * B.load(k,j );
1861 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1863 (~C)(i,j ) -=
sum( xmm1 );
1864 (~C)(i,j+1UL) -=
sum( xmm2 );
// Single remaining element (1x1).
1868 for(
size_t k=0UL; k<K; k+=IT::size ) {
1869 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1871 (~C)(i,j) -=
sum( xmm1 );
// Intrinsic-based default kernel for C -= A * B, overload for targets of type
// DenseMatrix<MT3,true>; enabled when UseVectorizedDefaultKernel is set.
// NOTE(review): the `true` flag presumably denotes a column-major target - confirm.
// C is tiled as 4x2 and 4x1 blocks for groups of four rows, then 2x2 and 2x1
// for a remaining row pair, then 1x2 and 1x1 for a remaining row. Each inner
// product over k is accumulated positively in steps of IT::size; only the
// reduced sum() is subtracted from C.
// Declarations of i, j, the operand registers a1.., b1.. and most accumulator
// declarations are on lines not visible here.
// NOTE(review): the k loops have no remainder handling, so they appear to rely
// on loads past K being safe (padding) - confirm.
1892 template<
typename MT3
1895 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1896 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1898 typedef IntrinsicTrait<ElementType> IT;
1900 const size_t M( A.rows() );
1901 const size_t N( B.columns() );
1902 const size_t K( A.columns() );
// Four rows of C at a time.
1906 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators, one per target element.
1908 for( ; (j+2UL) <= N; j+=2UL ) {
1909 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1910 for(
size_t k=0UL; k<K; k+=IT::size ) {
1917 xmm1 = xmm1 + a1 * b1;
1918 xmm2 = xmm2 + a1 * b2;
1919 xmm3 = xmm3 + a2 * b1;
1920 xmm4 = xmm4 + a2 * b2;
1921 xmm5 = xmm5 + a3 * b1;
1922 xmm6 = xmm6 + a3 * b2;
1923 xmm7 = xmm7 + a4 * b1;
1924 xmm8 = xmm8 + a4 * b2;
1926 (~C)(i ,j ) -=
sum( xmm1 );
1927 (~C)(i ,j+1UL) -=
sum( xmm2 );
1928 (~C)(i+1UL,j ) -=
sum( xmm3 );
1929 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
1930 (~C)(i+2UL,j ) -=
sum( xmm5 );
1931 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
1932 (~C)(i+3UL,j ) -=
sum( xmm7 );
1933 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// 4x1 tile: one remaining column for the four-row group.
1937 for(
size_t k=0UL; k<K; k+=IT::size ) {
1939 xmm1 = xmm1 + A.load(i ,k) * b1;
1940 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1941 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1942 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1944 (~C)(i ,j) -=
sum( xmm1 );
1945 (~C)(i+1UL,j) -=
sum( xmm2 );
1946 (~C)(i+2UL,j) -=
sum( xmm3 );
1947 (~C)(i+3UL,j) -=
sum( xmm4 );
// Remaining row pair: 2x2 tiles.
1950 for( ; (i+2UL) <= M; i+=2UL ) {
1952 for( ; (j+2UL) <= N; j+=2UL ) {
1954 for(
size_t k=0UL; k<K; k+=IT::size ) {
1959 xmm1 = xmm1 + a1 * b1;
1960 xmm2 = xmm2 + a1 * b2;
1961 xmm3 = xmm3 + a2 * b1;
1962 xmm4 = xmm4 + a2 * b2;
1964 (~C)(i ,j ) -=
sum( xmm1 );
1965 (~C)(i ,j+1UL) -=
sum( xmm2 );
1966 (~C)(i+1UL,j ) -=
sum( xmm3 );
1967 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
// 2x1 tile: one remaining column for the row pair.
1971 for(
size_t k=0UL; k<K; k+=IT::size ) {
1973 xmm1 = xmm1 + A.load(i ,k) * b1;
1974 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1976 (~C)(i ,j) -=
sum( xmm1 );
1977 (~C)(i+1UL,j) -=
sum( xmm2 );
// Single remaining row: 1x2 tiles.
1982 for( ; (j+2UL) <= N; j+=2UL ) {
1984 for(
size_t k=0UL; k<K; k+=IT::size ) {
1986 xmm1 = xmm1 + a1 * B.load(k,j );
1987 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1989 (~C)(i,j ) -=
sum( xmm1 );
1990 (~C)(i,j+1UL) -=
sum( xmm2 );
// Single remaining element (1x1).
1994 for(
size_t k=0UL; k<K; k+=IT::size ) {
1995 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1997 (~C)(i,j) -=
sum( xmm1 );
// Fallback member of the BLAS kernel family for C -= A * B: enabled when
// UseDefaultKernel is set (BLAS mode off or no matching BLAS element type)
// and simply forwards to the default sub-assign kernel.
2018 template<
typename MT3
2021 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2022 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2024 selectDefaultSubAssignKernel( C, A, B );
// BLAS kernel for C -= A * B with float elements (cblas_sgemm); enabled when
// UseSinglePrecisionKernel is set for the type triple.
2044 template<
typename MT3
2047 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2048 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2050 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
2056 const int M ( numeric_cast<int>( A.rows() ) );
2057 const int N ( numeric_cast<int>( B.columns() ) );
2058 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
2059 const int lda( numeric_cast<int>( A.spacing() ) );
2060 const int ldb( numeric_cast<int>( B.spacing() ) );
2061 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
// alpha = -1, beta = 1: the product is subtracted from the existing C.
2063 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2064 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2065 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2066 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C -= A * B with double elements (cblas_dgemm); enabled when
// UseDoublePrecisionKernel is set for the type triple.
2087 template<
typename MT3
2090 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2091 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2093 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
2099 const int M ( numeric_cast<int>( A.rows() ) );
2100 const int N ( numeric_cast<int>( B.columns() ) );
2101 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
2102 const int lda( numeric_cast<int>( A.spacing() ) );
2103 const int ldb( numeric_cast<int>( B.spacing() ) );
2104 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
// alpha = -1, beta = 1: the product is subtracted from the existing C.
2106 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2107 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2108 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2109 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C -= A * B with complex<float> elements (cblas_cgemm);
// enabled when UseSinglePrecisionComplexKernel is set for the type triple.
2130 template<
typename MT3
2133 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2134 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2136 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
2145 const int M ( numeric_cast<int>( A.rows() ) );
2146 const int N ( numeric_cast<int>( B.columns() ) );
2147 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
2148 const int lda( numeric_cast<int>( A.spacing() ) );
2149 const int ldb( numeric_cast<int>( B.spacing() ) );
2150 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes its scalars by address: alpha = -1+0i, beta = 1+0i,
// so the product is subtracted from the existing contents of C.
2151 const complex<float> alpha( -1.0F, 0.0F );
2152 const complex<float> beta ( 1.0F, 0.0F );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
2154 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2155 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2156 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2157 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C -= A * B with complex<double> elements (cblas_zgemm);
// enabled when UseDoublePrecisionComplexKernel is set for the type triple.
2178 template<
typename MT3
2181 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2182 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2184 using boost::numeric_cast;
// CBLAS takes int dimensions; the size_t values go through numeric_cast
// (Boost's range-checked conversion) instead of a plain cast.
2193 const int M ( numeric_cast<int>( A.rows() ) );
2194 const int N ( numeric_cast<int>( B.columns() ) );
2195 const int K ( numeric_cast<int>( A.columns() ) );
// spacing() of each matrix is passed as its leading dimension.
2196 const int lda( numeric_cast<int>( A.spacing() ) );
2197 const int ldb( numeric_cast<int>( B.spacing() ) );
2198 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes its scalars by address: alpha = -1+0i, beta = 1+0i,
// so the product is subtracted from the existing contents of C.
2199 const complex<double> alpha( -1.0, 0.0 );
2200 const complex<double> beta ( 1.0, 0.0 );
// Layout follows the target C: for a row-major C, A is passed untransposed
// and B transposed; otherwise the two transpose flags are swapped.
2202 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2203 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2204 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2205 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a class that models a scaled product: the base-class argument
// names it DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >. The
// class head itself and the remaining template parameters are on lines not
// visible here.
2251 template<
typename MT1
2255 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
2256 ,
private MatScalarMultExpr
2257 ,
private Computation
// Shorthand for the wrapped matrix/matrix product expression type.
2261 typedef DMatTDMatMultExpr<MT1,MT2> MMM;
// Operand flags: set when the operand type is a computation or requires
// evaluation. They feed UseSMPAssignKernel in this class.
2273 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
2278 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Helper trait used with EnableIf/DisableIf to choose between the two overloads of the
// select*AssignKernel functions. The template arguments T1-T4 are not inspected:
// value is set as soon as one of the two operands needs evaluation.
2285 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2286 struct UseSMPAssignKernel {
2287 enum { value = evaluateLeft || evaluateRight };
// Helper trait for the single precision BLAS kernel: value is set when the element types
// of T1, T2 and T3 are all float (IsFloat) and the scalar type T4 is not complex.
2296 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2297 struct UseSinglePrecisionKernel {
2298 enum { value = IsFloat<typename T1::ElementType>::value &&
2299 IsFloat<typename T2::ElementType>::value &&
2300 IsFloat<typename T3::ElementType>::value &&
2301 !IsComplex<T4>::value };
// Helper trait for the double precision BLAS kernel: value is set when the element types
// of T1, T2 and T3 are all double (IsDouble) and the scalar type T4 is not complex.
2310 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2311 struct UseDoublePrecisionKernel {
2312 enum { value = IsDouble<typename T1::ElementType>::value &&
2313 IsDouble<typename T2::ElementType>::value &&
2314 IsDouble<typename T3::ElementType>::value &&
2315 !IsComplex<T4>::value };
// Helper trait for the single precision complex BLAS kernel: value is set when the
// element types of T1, T2 and T3 are all exactly complex<float>. The scalar type is
// not part of this check.
2324 template<
typename T1,
typename T2,
typename T3 >
2325 struct UseSinglePrecisionComplexKernel {
2326 typedef complex<float> Type;
2327 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2328 IsSame<typename T2::ElementType,Type>::value &&
2329 IsSame<typename T3::ElementType,Type>::value };
// Helper trait for the double precision complex BLAS kernel: value is set when the
// element types of T1, T2 and T3 are all exactly complex<double>. The scalar type is
// not part of this check.
2338 template<
typename T1,
typename T2,
typename T3 >
2339 struct UseDoublePrecisionComplexKernel {
2340 typedef complex<double> Type;
2341 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2342 IsSame<typename T2::ElementType,Type>::value &&
2343 IsSame<typename T3::ElementType,Type>::value };
// Helper trait for the fallback to the default kernels: value is set when BLAZE_BLAS_MODE
// evaluates to false, or when none of the four BLAS kernel traits (single, double,
// single complex, double complex) applies to the given types.
2351 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2352 struct UseDefaultKernel {
2353 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2354 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2355 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2356 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Helper trait for the vectorized default kernels: value is set when T1, T2 and T3 are all
// vectorizable, share one element type that is also identical to the scalar type T4, and
// IntrinsicTrait reports both addition and multiplication for that element type.
2364 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2365 struct UseVectorizedDefaultKernel {
2366 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2367 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2368 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2369 IsSame<typename T1::ElementType,T4>::value &&
2370 IntrinsicTrait<typename T1::ElementType>::addition &&
2371 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member type definitions and compile time flags of the expression class.
// NOTE(review): RES, RT1, RT2, CT1, CT2, ET1, ET2 and ElementType are declared on lines
// that are not part of this excerpt.
// Type of this expression instance.
2377 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type: MultTrait applied to RES and the scalar type.
2378 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType via IntrinsicTrait.
2382 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the left-hand side operand: the wrapped matrix multiplication expression.
2387 typedef const DMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// Operand types used inside the kernels, selected depending on evaluateLeft/evaluateRight.
2393 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
2396 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// The expression is vectorizable if both operand types are vectorizable, both element
// types and the scalar type are identical, and IntrinsicTrait reports addition and
// multiplication for the element type.
2401 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2402 IsSame<ET1,ET2>::value &&
2403 IsSame<ET1,ST>::value &&
2404 IntrinsicTrait<ET1>::addition &&
2405 IntrinsicTrait<ET1>::multiplication };
// The flag is set only if neither operand requires evaluation.
2408 enum { smpAssignable = !evaluateLeft && !evaluateRight };
// Constructor taking the matrix multiplication expression and the scalar factor.
// NOTE(review): the member initializers and body are not part of this excerpt.
2417 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Fragment of an element access member (signature not visible in this excerpt):
// returns element (i,j) of the wrapped expression multiplied by the stored scalar.
2433 return matrix_(i,j) * scalar_;
// Returns the number of rows, forwarded from the wrapped matrix expression.
2442 inline size_t rows()
const {
2443 return matrix_.rows();
// Returns the number of columns, forwarded from the wrapped matrix expression.
2452 inline size_t columns()
const {
2453 return matrix_.columns();
// Alias query: forwards the given address to canAlias() of the wrapped matrix expression
// and returns its result. The scalar is not taken into account.
2483 template<
typename T >
2484 inline bool canAlias(
const T* alias )
const {
2485 return matrix_.canAlias( alias );
// Alias query: forwards the given address to isAliased() of the wrapped matrix expression
// and returns its result. The scalar is not taken into account.
2495 template<
typename T >
2496 inline bool isAliased(
const T* alias )
const {
2497 return matrix_.isAliased( alias );
// Fragment of a member whose signature is not visible in this excerpt: forwards the
// alignment query to the wrapped matrix expression.
2507 return matrix_.isAligned();
// Fragment of a further member (signature not visible): obtains the left-hand side
// operand of the wrapped multiplication expression.
2517 typename MMM::LeftOperand A( matrix_.leftOperand() );
// Assignment of the scaled matrix product (rhs) to a dense matrix (lhs).
// NOTE(review): the bodies of the two special-case branches and the declarations of
// A and B are not part of this excerpt.
2539 template<
typename MT3
2541 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Obtain both operands of the wrapped multiplication expression.
2548 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2549 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special cases: empty target matrix, or an inner dimension of zero.
2551 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2554 else if( left.columns() == 0UL ) {
// General case: dispatch to the kernel selection, passing the scalar factor along.
2569 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the assignment C = scalar * A * B; this overload is active when
// UseSMPAssignKernel<MT3,MT4,MT5,ST2> is not set.
// NOTE(review): the condition that chooses between the two calls below is not part of
// this excerpt.
2584 template<
typename MT3
2588 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
2589 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2592 DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
2594 DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
// Kernel selection for the assignment C = scalar * A * B; this overload is active when
// UseSMPAssignKernel<MT3,MT4,MT5,ST2> is set.
// NOTE(review): the function body is not part of this excerpt.
2609 template<
typename MT3
2613 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
2614 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Default (non-vectorized) assignment kernel for C = scalar * A * B; active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set.
2634 template<
typename MT3
2638 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2639 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The target matrix is computed row by row.
2641 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Initialize row i of C with the first term (j == 0) of every dot product.
2642 for(
size_t k=0UL; k<B.columns(); ++k ) {
2643 C(i,k) = A(i,0UL) * B(0UL,k);
// Accumulate the remaining terms (j >= 1).
2645 for(
size_t j=1UL; j<A.columns(); ++j ) {
2646 for(
size_t k=0UL; k<B.columns(); ++k ) {
2647 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i. NOTE(review): the loop body is not part of this excerpt;
// presumably it applies the scalar factor - confirm.
2650 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default assignment kernel for a row-major target matrix (C = scalar * A * B);
// active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set.
// Every target element is computed as a dot product: the k loops advance in steps of
// IT::size, accumulate products of loaded intrinsic vectors in the xmm accumulators, and
// the accumulators are finally reduced with sum() and multiplied by the scalar.
// NOTE(review): the declarations of i, j, the a*/b* operands and some accumulators, as
// well as the guards around the remainder sections, are not part of this excerpt.
2671 template<
typename MT3
2675 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2676 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2678 typedef IntrinsicTrait<ElementType> IT;
2680 const size_t M( A.rows() );
2681 const size_t N( B.columns() );
2682 const size_t K( A.columns() );
// Two rows of C per iteration.
2686 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators, one per target element.
2688 for( ; (j+4UL) <= N; j+=4UL ) {
2689 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2690 for(
size_t k=0UL; k<K; k+=IT::size ) {
2697 xmm1 = xmm1 + a1 * b1;
2698 xmm2 = xmm2 + a1 * b2;
2699 xmm3 = xmm3 + a1 * b3;
2700 xmm4 = xmm4 + a1 * b4;
2701 xmm5 = xmm5 + a2 * b1;
2702 xmm6 = xmm6 + a2 * b2;
2703 xmm7 = xmm7 + a2 * b3;
2704 xmm8 = xmm8 + a2 * b4;
2706 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2707 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2708 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
2709 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
2710 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
2711 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
2712 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
2713 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// 2x2 tile.
2715 for( ; (j+2UL) <= N; j+=2UL ) {
2717 for(
size_t k=0UL; k<K; k+=IT::size ) {
2722 xmm1 = xmm1 + a1 * b1;
2723 xmm2 = xmm2 + a1 * b2;
2724 xmm3 = xmm3 + a2 * b1;
2725 xmm4 = xmm4 + a2 * b2;
2727 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2728 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2729 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2730 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2x1 tile: a single column for the two current rows.
2734 for(
size_t k=0UL; k<K; k+=IT::size ) {
2736 xmm1 = xmm1 + A.load(i ,k) * b1;
2737 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2739 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2740 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single-row section, 1x4 tile.
2745 for( ; (j+4UL) <= N; j+=4UL ) {
2747 for(
size_t k=0UL; k<K; k+=IT::size ) {
2749 xmm1 = xmm1 + a1 * B.load(k,j );
2750 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2751 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
2752 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
2754 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2755 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
2756 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
2757 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
// 1x2 tile.
2759 for( ; (j+2UL) <= N; j+=2UL ) {
2761 for(
size_t k=0UL; k<K; k+=IT::size ) {
2763 xmm1 = xmm1 + a1 * B.load(k,j );
2764 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2766 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2767 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single element.
2771 for(
size_t k=0UL; k<K; k+=IT::size ) {
2772 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2774 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Vectorized default assignment kernel for a column-major target matrix
// (C = scalar * A * B); active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set.
// Every target element is computed as a dot product: the k loops advance in steps of
// IT::size, accumulate products of loaded intrinsic vectors in the xmm accumulators, and
// the accumulators are finally reduced with sum() and multiplied by the scalar.
// NOTE(review): the declarations of i, j, the a*/b* operands and some accumulators, as
// well as the guards around the remainder sections, are not part of this excerpt.
2794 template<
typename MT3
2798 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2799 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2801 typedef IntrinsicTrait<ElementType> IT;
2803 const size_t M( A.rows() );
2804 const size_t N( B.columns() );
2805 const size_t K( A.columns() );
// Four rows of C per iteration.
2809 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators, one per target element.
2811 for( ; (j+2UL) <= N; j+=2UL ) {
2812 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2813 for(
size_t k=0UL; k<K; k+=IT::size ) {
2820 xmm1 = xmm1 + a1 * b1;
2821 xmm2 = xmm2 + a1 * b2;
2822 xmm3 = xmm3 + a2 * b1;
2823 xmm4 = xmm4 + a2 * b2;
2824 xmm5 = xmm5 + a3 * b1;
2825 xmm6 = xmm6 + a3 * b2;
2826 xmm7 = xmm7 + a4 * b1;
2827 xmm8 = xmm8 + a4 * b2;
2829 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2830 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2831 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2832 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
2833 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
2834 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
2835 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
2836 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// 4x1 tile: a single column for the four current rows.
2840 for(
size_t k=0UL; k<K; k+=IT::size ) {
2842 xmm1 = xmm1 + A.load(i ,k) * b1;
2843 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2844 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
2845 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
2847 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2848 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
2849 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
2850 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
// Two rows of C per iteration.
2853 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile.
2855 for( ; (j+2UL) <= N; j+=2UL ) {
2857 for(
size_t k=0UL; k<K; k+=IT::size ) {
2862 xmm1 = xmm1 + a1 * b1;
2863 xmm2 = xmm2 + a1 * b2;
2864 xmm3 = xmm3 + a2 * b1;
2865 xmm4 = xmm4 + a2 * b2;
2867 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2868 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2869 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2870 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2x1 tile.
2874 for(
size_t k=0UL; k<K; k+=IT::size ) {
2876 xmm1 = xmm1 + A.load(i ,k) * b1;
2877 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2879 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2880 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single-row section, 1x2 tile.
2885 for( ; (j+2UL) <= N; j+=2UL ) {
2887 for(
size_t k=0UL; k<K; k+=IT::size ) {
2889 xmm1 = xmm1 + a1 * B.load(k,j );
2890 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2892 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2893 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single element.
2897 for(
size_t k=0UL; k<K; k+=IT::size ) {
2898 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2900 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Fallback overload of the BLAS assignment kernel: active when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is set and simply forwards all arguments to the
// default assignment kernel.
2920 template<
typename MT3
2924 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2925 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2927 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS-based assignment kernel for single precision operands (C = scalar * A * B);
// active when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is set.
2946 template<
typename MT3
2950 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2951 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2953 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
2959 const int M ( numeric_cast<int>( A.rows() ) );
2960 const int N ( numeric_cast<int>( B.columns() ) );
2961 const int K ( numeric_cast<int>( A.columns() ) );
2962 const int lda( numeric_cast<int>( A.spacing() ) );
2963 const int ldb( numeric_cast<int>( B.spacing() ) );
2964 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is passed as alpha; beta = 0 so the previous content of C does not
// contribute. For a row-major target A is CblasNoTrans and B is CblasTrans, otherwise
// A is CblasTrans and B is CblasNoTrans.
2966 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2967 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2968 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2969 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS-based assignment kernel for double precision operands (C = scalar * A * B);
// active when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is set.
2989 template<
typename MT3
2993 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2994 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2996 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3002 const int M ( numeric_cast<int>( A.rows() ) );
3003 const int N ( numeric_cast<int>( B.columns() ) );
3004 const int K ( numeric_cast<int>( A.columns() ) );
3005 const int lda( numeric_cast<int>( A.spacing() ) );
3006 const int ldb( numeric_cast<int>( B.spacing() ) );
3007 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is passed as alpha; beta = 0 so the previous content of C does not
// contribute. For a row-major target A is CblasNoTrans and B is CblasTrans, otherwise
// A is CblasTrans and B is CblasNoTrans.
3009 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3010 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3011 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3012 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS-based assignment kernel for single precision complex operands
// (C = scalar * A * B); active when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is set.
3032 template<
typename MT3
3036 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3037 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3039 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3048 const int M ( numeric_cast<int>( A.rows() ) );
3049 const int N ( numeric_cast<int>( B.columns() ) );
3050 const int K ( numeric_cast<int>( A.columns() ) );
3051 const int lda( numeric_cast<int>( A.spacing() ) );
3052 const int ldb( numeric_cast<int>( B.spacing() ) );
3053 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is the scalar converted to complex<float>; beta = 0 so the previous content
// of C does not contribute. Both are handed to cblas_cgemm by address.
3054 const complex<float> alpha( scalar );
3055 const complex<float> beta ( 0.0F, 0.0F );
3057 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3058 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3059 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3060 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-based assignment kernel for double precision complex operands
// (C = scalar * A * B); active when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is set.
3080 template<
typename MT3
3084 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3085 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3087 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3096 const int M ( numeric_cast<int>( A.rows() ) );
3097 const int N ( numeric_cast<int>( B.columns() ) );
3098 const int K ( numeric_cast<int>( A.columns() ) );
3099 const int lda( numeric_cast<int>( A.spacing() ) );
3100 const int ldb( numeric_cast<int>( B.spacing() ) );
3101 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is the scalar converted to complex<double>; beta = 0 so the previous content
// of C does not contribute. Both are handed to cblas_zgemm by address.
3102 const complex<double> alpha( scalar );
3103 const complex<double> beta ( 0.0, 0.0 );
3105 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3106 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3107 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3108 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product (rhs) to a sparse matrix (lhs).
// The expression is first evaluated into a temporary whose type is chosen via SelectType
// from OppositeType and ResultType depending on the storage order flag SO.
// NOTE(review): the statement that transfers tmp to lhs is not part of this excerpt.
3125 template<
typename MT
3127 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3131 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
3143 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product (rhs) to a dense matrix (lhs).
// NOTE(review): the body of the early-exit branch and the declarations of A and B are
// not part of this excerpt.
3160 template<
typename MT3
3162 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Obtain both operands of the wrapped multiplication expression.
3169 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3170 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target matrix or an inner dimension of zero.
3172 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// General case: dispatch to the kernel selection, passing the scalar factor along.
3186 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += scalar * A * B; this overload is
// active when UseSMPAssignKernel<MT3,MT4,MT5,ST2> is not set.
// NOTE(review): the condition that chooses between the two calls below is not part of
// this excerpt.
3201 template<
typename MT3
3205 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3206 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3209 DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3211 DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
// Kernel selection for the addition assignment C += scalar * A * B; this overload is
// active when UseSMPAssignKernel<MT3,MT4,MT5,ST2> is set.
// NOTE(review): the function body is not part of this excerpt.
3226 template<
typename MT3
3230 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3231 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Default (non-vectorized) addition assignment kernel; active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set.
// NOTE(review): the function body is not part of this excerpt.
3251 template<
typename MT3
3255 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3256 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for a row-major target matrix
// (C += scalar * A * B); active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set.
// Every contribution is computed as a dot product: the k loops advance in steps of
// IT::size, accumulate products of loaded intrinsic vectors in the xmm accumulators, and
// the accumulators are reduced with sum(), scaled and added to the target element.
// NOTE(review): the declarations of i, j, the a*/b* operands and some accumulators, as
// well as the guards around the remainder sections, are not part of this excerpt.
3277 template<
typename MT3
3281 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3282 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3284 typedef IntrinsicTrait<ElementType> IT;
3286 const size_t M( A.rows() );
3287 const size_t N( B.columns() );
3288 const size_t K( A.columns() );
// Two rows of C per iteration.
3292 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators, one per target element.
3294 for( ; (j+4UL) <= N; j+=4UL ) {
3295 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3296 for(
size_t k=0UL; k<K; k+=IT::size ) {
3303 xmm1 = xmm1 + a1 * b1;
3304 xmm2 = xmm2 + a1 * b2;
3305 xmm3 = xmm3 + a1 * b3;
3306 xmm4 = xmm4 + a1 * b4;
3307 xmm5 = xmm5 + a2 * b1;
3308 xmm6 = xmm6 + a2 * b2;
3309 xmm7 = xmm7 + a2 * b3;
3310 xmm8 = xmm8 + a2 * b4;
3312 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3313 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3314 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
3315 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
3316 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
3317 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
3318 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
3319 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// 2x2 tile.
3321 for( ; (j+2UL) <= N; j+=2UL ) {
3323 for(
size_t k=0UL; k<K; k+=IT::size ) {
3328 xmm1 = xmm1 + a1 * b1;
3329 xmm2 = xmm2 + a1 * b2;
3330 xmm3 = xmm3 + a2 * b1;
3331 xmm4 = xmm4 + a2 * b2;
3333 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3334 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3335 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3336 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// 2x1 tile: a single column for the two current rows.
3340 for(
size_t k=0UL; k<K; k+=IT::size ) {
3342 xmm1 = xmm1 + A.load(i ,k) * b1;
3343 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3345 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3346 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Single-row section, 1x4 tile.
3351 for( ; (j+4UL) <= N; j+=4UL ) {
3353 for(
size_t k=0UL; k<K; k+=IT::size ) {
3355 xmm1 = xmm1 + a1 * B.load(k,j );
3356 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3357 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3358 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3360 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3361 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
3362 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
3363 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
// 1x2 tile.
3365 for( ; (j+2UL) <= N; j+=2UL ) {
3367 for(
size_t k=0UL; k<K; k+=IT::size ) {
3369 xmm1 = xmm1 + a1 * B.load(k,j );
3370 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3372 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3373 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Single element.
3377 for(
size_t k=0UL; k<K; k+=IT::size ) {
3378 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3380 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Vectorized default addition assignment kernel for a column-major target matrix
// (C += scalar * A * B); active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set.
// Every contribution is computed as a dot product: the k loops advance in steps of
// IT::size, accumulate products of loaded intrinsic vectors in the xmm accumulators, and
// the accumulators are reduced with sum(), scaled and added to the target element.
// NOTE(review): the declarations of i, j, the a*/b* operands and some accumulators, as
// well as the guards around the remainder sections, are not part of this excerpt.
3400 template<
typename MT3
3404 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3405 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3407 typedef IntrinsicTrait<ElementType> IT;
3409 const size_t M( A.rows() );
3410 const size_t N( B.columns() );
3411 const size_t K( A.columns() );
// Four rows of C per iteration.
3415 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators, one per target element.
3417 for( ; (j+2UL) <= N; j+=2UL ) {
3418 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3419 for(
size_t k=0UL; k<K; k+=IT::size ) {
3426 xmm1 = xmm1 + a1 * b1;
3427 xmm2 = xmm2 + a1 * b2;
3428 xmm3 = xmm3 + a2 * b1;
3429 xmm4 = xmm4 + a2 * b2;
3430 xmm5 = xmm5 + a3 * b1;
3431 xmm6 = xmm6 + a3 * b2;
3432 xmm7 = xmm7 + a4 * b1;
3433 xmm8 = xmm8 + a4 * b2;
3435 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3436 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3437 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3438 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
3439 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
3440 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
3441 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
3442 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// 4x1 tile: a single column for the four current rows.
3446 for(
size_t k=0UL; k<K; k+=IT::size ) {
3448 xmm1 = xmm1 + A.load(i ,k) * b1;
3449 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3450 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
3451 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
3453 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3454 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
3455 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
3456 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
// Two rows of C per iteration.
3459 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile.
3461 for( ; (j+2UL) <= N; j+=2UL ) {
3463 for(
size_t k=0UL; k<K; k+=IT::size ) {
3468 xmm1 = xmm1 + a1 * b1;
3469 xmm2 = xmm2 + a1 * b2;
3470 xmm3 = xmm3 + a2 * b1;
3471 xmm4 = xmm4 + a2 * b2;
3473 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3474 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3475 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3476 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// 2x1 tile.
3480 for(
size_t k=0UL; k<K; k+=IT::size ) {
3482 xmm1 = xmm1 + A.load(i ,k) * b1;
3483 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3485 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3486 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Single-row section, 1x2 tile.
3491 for( ; (j+2UL) <= N; j+=2UL ) {
3493 for(
size_t k=0UL; k<K; k+=IT::size ) {
3495 xmm1 = xmm1 + a1 * B.load(k,j );
3496 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3498 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3499 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Single element.
3503 for(
size_t k=0UL; k<K; k+=IT::size ) {
3504 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3506 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Fallback overload of the BLAS addition assignment kernel: active when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is set and simply forwards all arguments to the
// default addition assignment kernel.
3526 template<
typename MT3
3530 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3531 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3533 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS-based addition assignment kernel for single precision operands
// (C += scalar * A * B); active when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is set.
3552 template<
typename MT3
3556 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3557 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3559 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3565 const int M ( numeric_cast<int>( A.rows() ) );
3566 const int N ( numeric_cast<int>( B.columns() ) );
3567 const int K ( numeric_cast<int>( A.columns() ) );
3568 const int lda( numeric_cast<int>( A.spacing() ) );
3569 const int ldb( numeric_cast<int>( B.spacing() ) );
3570 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is passed as alpha; beta = 1 keeps the existing content of C so the
// scaled product is added to it.
3572 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3573 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3574 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3575 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-based addition assignment kernel for double precision operands
// (C += scalar * A * B); active when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is set.
3595 template<
typename MT3
3599 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3600 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3602 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3608 const int M ( numeric_cast<int>( A.rows() ) );
3609 const int N ( numeric_cast<int>( B.columns() ) );
3610 const int K ( numeric_cast<int>( A.columns() ) );
3611 const int lda( numeric_cast<int>( A.spacing() ) );
3612 const int ldb( numeric_cast<int>( B.spacing() ) );
3613 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is passed as alpha; beta = 1 keeps the existing content of C so the
// scaled product is added to it.
3615 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3616 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3617 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3618 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-based addition assignment kernel for single precision complex operands
// (C += scalar * A * B); active when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is set.
3638 template<
typename MT3
3642 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3643 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3645 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3654 const int M ( numeric_cast<int>( A.rows() ) );
3655 const int N ( numeric_cast<int>( B.columns() ) );
3656 const int K ( numeric_cast<int>( A.columns() ) );
3657 const int lda( numeric_cast<int>( A.spacing() ) );
3658 const int ldb( numeric_cast<int>( B.spacing() ) );
3659 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is the scalar converted to complex<float>; beta = 1 keeps the existing content
// of C. Both are handed to cblas_cgemm by address.
3660 const complex<float> alpha( scalar );
3661 const complex<float> beta ( 1.0F, 0.0F );
3663 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3664 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3665 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3666 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-based addition assignment kernel for double precision complex operands
// (C += scalar * A * B); active when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is set.
3686 template<
typename MT3
3690 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3691 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3693 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
3702 const int M ( numeric_cast<int>( A.rows() ) );
3703 const int N ( numeric_cast<int>( B.columns() ) );
3704 const int K ( numeric_cast<int>( A.columns() ) );
3705 const int lda( numeric_cast<int>( A.spacing() ) );
3706 const int ldb( numeric_cast<int>( B.spacing() ) );
3707 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is the scalar converted to complex<double>; beta = 1 keeps the existing content
// of C. Both are handed to cblas_zgemm by address.
3708 const complex<double> alpha( scalar );
3709 const complex<double> beta ( 1.0, 0.0 );
3711 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3712 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3713 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3714 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product (rhs) to a dense matrix (lhs).
// NOTE(review): the body of the early-exit branch and the declarations of A and B are
// not part of this excerpt.
3735 template<
typename MT3
3737 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Obtain both operands of the wrapped multiplication expression.
3744 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3745 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target matrix or an inner dimension of zero.
3747 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// General case: dispatch to the kernel selection, passing the scalar factor along.
3761 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= scalar * A * B; this overload is
// active when UseSMPAssignKernel<MT3,MT4,MT5,ST2> is not set.
// NOTE(review): the condition that chooses between the two calls below is not part of
// this excerpt.
3776 template<
typename MT3
3780 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3781 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3784 DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
3786 DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
// Kernel selection for the subtraction assignment C -= scalar * A * B; this overload is
// active when UseSMPAssignKernel<MT3,MT4,MT5,ST2> is set.
// NOTE(review): the function body is not part of this excerpt.
3801 template<
typename MT3
3805 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3806 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Default (non-vectorized) subtraction assignment kernel; active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set.
// NOTE(review): the function body is not part of this excerpt.
3826 template<
typename MT3
3830 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3831 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction assignment kernel for a row-major target matrix
// (C -= scalar * A * B); active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set.
// Every contribution is computed as a dot product: the k loops advance in steps of
// IT::size, accumulate products of loaded intrinsic vectors in the xmm accumulators, and
// the accumulators are reduced with sum(), scaled and subtracted from the target element.
// NOTE(review): the declarations of i, j, the a*/b* operands and some accumulators, as
// well as the guards around the remainder sections, are not part of this excerpt.
3852 template<
typename MT3
3856 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3857 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3859 typedef IntrinsicTrait<ElementType> IT;
3861 const size_t M( A.rows() );
3862 const size_t N( B.columns() );
3863 const size_t K( A.columns() );
// Two rows of C per iteration.
3867 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators, one per target element.
3869 for( ; (j+4UL) <= N; j+=4UL ) {
3870 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3871 for(
size_t k=0UL; k<K; k+=IT::size ) {
3878 xmm1 = xmm1 + a1 * b1;
3879 xmm2 = xmm2 + a1 * b2;
3880 xmm3 = xmm3 + a1 * b3;
3881 xmm4 = xmm4 + a1 * b4;
3882 xmm5 = xmm5 + a2 * b1;
3883 xmm6 = xmm6 + a2 * b2;
3884 xmm7 = xmm7 + a2 * b3;
3885 xmm8 = xmm8 + a2 * b4;
3887 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3888 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3889 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
3890 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
3891 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
3892 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
3893 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
3894 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// 2x2 tile.
3896 for( ; (j+2UL) <= N; j+=2UL ) {
3898 for(
size_t k=0UL; k<K; k+=IT::size ) {
3903 xmm1 = xmm1 + a1 * b1;
3904 xmm2 = xmm2 + a1 * b2;
3905 xmm3 = xmm3 + a2 * b1;
3906 xmm4 = xmm4 + a2 * b2;
3908 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3909 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3910 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
3911 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// 2x1 tile: a single column for the two current rows.
3915 for(
size_t k=0UL; k<K; k+=IT::size ) {
3917 xmm1 = xmm1 + A.load(i ,k) * b1;
3918 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3920 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
3921 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Single-row section, 1x4 tile.
3926 for( ; (j+4UL) <= N; j+=4UL ) {
3928 for(
size_t k=0UL; k<K; k+=IT::size ) {
3930 xmm1 = xmm1 + a1 * B.load(k,j );
3931 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3932 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3933 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3935 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
3936 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
3937 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
3938 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
// 1x2 tile.
3940 for( ; (j+2UL) <= N; j+=2UL ) {
3942 for(
size_t k=0UL; k<K; k+=IT::size ) {
3944 xmm1 = xmm1 + a1 * B.load(k,j );
3945 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3947 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
3948 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Single element.
3952 for(
size_t k=0UL; k<K; k+=IT::size ) {
3953 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3955 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Vectorized default subtraction assignment kernel for a column-major target matrix
// (C -= scalar * A * B); active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set.
// Every contribution is computed as a dot product: the k loops advance in steps of
// IT::size, accumulate products of loaded intrinsic vectors in the xmm accumulators, and
// the accumulators are reduced with sum(), scaled and subtracted from the target element.
// NOTE(review): the declarations of i, j, the a*/b* operands and some accumulators, as
// well as the guards around the remainder sections, are not part of this excerpt.
3975 template<
typename MT3
3979 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3980 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3982 typedef IntrinsicTrait<ElementType> IT;
3984 const size_t M( A.rows() );
3985 const size_t N( B.columns() );
3986 const size_t K( A.columns() );
// Four rows of C per iteration.
3990 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators, one per target element.
3992 for( ; (j+2UL) <= N; j+=2UL ) {
3993 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3994 for(
size_t k=0UL; k<K; k+=IT::size ) {
4001 xmm1 = xmm1 + a1 * b1;
4002 xmm2 = xmm2 + a1 * b2;
4003 xmm3 = xmm3 + a2 * b1;
4004 xmm4 = xmm4 + a2 * b2;
4005 xmm5 = xmm5 + a3 * b1;
4006 xmm6 = xmm6 + a3 * b2;
4007 xmm7 = xmm7 + a4 * b1;
4008 xmm8 = xmm8 + a4 * b2;
4010 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
4011 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
4012 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
4013 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
4014 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
4015 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
4016 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
4017 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// 4x1 tile: a single column for the four current rows.
4021 for(
size_t k=0UL; k<K; k+=IT::size ) {
4023 xmm1 = xmm1 + A.load(i ,k) * b1;
4024 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
4025 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
4026 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
4028 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
4029 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
4030 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
4031 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
// Two rows of C per iteration.
4034 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile.
4036 for( ; (j+2UL) <= N; j+=2UL ) {
4038 for(
size_t k=0UL; k<K; k+=IT::size ) {
4043 xmm1 = xmm1 + a1 * b1;
4044 xmm2 = xmm2 + a1 * b2;
4045 xmm3 = xmm3 + a2 * b1;
4046 xmm4 = xmm4 + a2 * b2;
4048 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
4049 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
4050 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
4051 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// 2x1 tile.
4055 for(
size_t k=0UL; k<K; k+=IT::size ) {
4057 xmm1 = xmm1 + A.load(i ,k) * b1;
4058 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
4060 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
4061 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Single-row section, 1x2 tile.
4066 for( ; (j+2UL) <= N; j+=2UL ) {
4068 for(
size_t k=0UL; k<K; k+=IT::size ) {
4070 xmm1 = xmm1 + a1 * B.load(k,j );
4071 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
4073 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
4074 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Single element.
4078 for(
size_t k=0UL; k<K; k+=IT::size ) {
4079 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
4081 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Fallback overload of the BLAS subtraction assignment kernel: active when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is set and simply forwards all arguments to the
// default subtraction assignment kernel.
4101 template<
typename MT3
4105 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4106 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4108 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-based subtraction assignment kernel for single precision operands
// (C -= scalar * A * B); active when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is set.
4127 template<
typename MT3
4131 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4132 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4134 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
4140 const int M ( numeric_cast<int>( A.rows() ) );
4141 const int N ( numeric_cast<int>( B.columns() ) );
4142 const int K ( numeric_cast<int>( A.columns() ) );
4143 const int lda( numeric_cast<int>( A.spacing() ) );
4144 const int ldb( numeric_cast<int>( B.spacing() ) );
4145 const int ldc( numeric_cast<int>( C.spacing() ) );
// The negated scalar is passed as alpha; beta = 1 keeps the existing content of C so
// the scaled product is subtracted from it.
4147 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4148 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4149 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4150 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-based subtraction assignment kernel for double precision operands
// (C -= scalar * A * B); active when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is set.
4170 template<
typename MT3
4174 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4175 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4177 using boost::numeric_cast;
// Dimensions and spacings (passed as the BLAS leading dimensions) are converted
// to int via the checked numeric_cast.
4183 const int M ( numeric_cast<int>( A.rows() ) );
4184 const int N ( numeric_cast<int>( B.columns() ) );
4185 const int K ( numeric_cast<int>( A.columns() ) );
4186 const int lda( numeric_cast<int>( A.spacing() ) );
4187 const int ldb( numeric_cast<int>( B.spacing() ) );
4188 const int ldc( numeric_cast<int>( C.spacing() ) );
// The negated scalar is passed as alpha; beta = 1 keeps the existing content of C so
// the scaled product is subtracted from it.
4190 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4191 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4192 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4193 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS variant of the scaled subtraction assignment kernel.
// Enabled through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds. The work
// is delegated to cblas_cgemm with alpha = -scalar and beta = (1,0); under the GEMM contract
// (C := alpha*op(A)*op(B) + beta*C) this subtracts scalar*A*B from the current content of C.
// NOTE(review): the remaining template parameters and the function braces are missing from
// this listing.
4213 template<
typename MT3
4217 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4218 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4220 using boost::numeric_cast;
// CBLAS takes its sizes and leading dimensions as int; numeric_cast performs the
// conversion from the values returned by rows()/columns()/spacing() with a range check.
4229 const int M ( numeric_cast<int>( A.rows() ) );
4230 const int N ( numeric_cast<int>( B.columns() ) );
4231 const int K ( numeric_cast<int>( A.columns() ) );
4232 const int lda( numeric_cast<int>( A.spacing() ) );
4233 const int ldb( numeric_cast<int>( B.spacing() ) );
4234 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are materialized as locals because they are handed to
// cblas_cgemm by address (&alpha, &beta) rather than by value.
4235 const complex<float> alpha( -scalar );
4236 const complex<float> beta ( 1.0F, 0.0F );
// The layout and both transposition flags are chosen from the storage order of the target
// type MT3: row-major => (CblasRowMajor, A not transposed, B transposed), otherwise
// (CblasColMajor, A transposed, B not transposed).
4238 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4239 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4240 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4241 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS variant of the scaled subtraction assignment kernel.
// Enabled through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds. The work
// is delegated to cblas_zgemm with alpha = -scalar and beta = (1,0); under the GEMM contract
// (C := alpha*op(A)*op(B) + beta*C) this subtracts scalar*A*B from the current content of C.
// NOTE(review): the remaining template parameters and the function braces are missing from
// this listing.
4261 template<
typename MT3
4265 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4266 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4268 using boost::numeric_cast;
// CBLAS takes its sizes and leading dimensions as int; numeric_cast performs the
// conversion from the values returned by rows()/columns()/spacing() with a range check.
4277 const int M ( numeric_cast<int>( A.rows() ) );
4278 const int N ( numeric_cast<int>( B.columns() ) );
4279 const int K ( numeric_cast<int>( A.columns() ) );
4280 const int lda( numeric_cast<int>( A.spacing() ) );
4281 const int ldb( numeric_cast<int>( B.spacing() ) );
4282 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are materialized as locals because they are handed to
// cblas_zgemm by address (&alpha, &beta) rather than by value.
4283 const complex<double> alpha( -scalar );
4284 const complex<double> beta ( 1.0, 0.0 );
// The layout and both transposition flags are chosen from the storage order of the target
// type MT3: row-major => (CblasRowMajor, A not transposed, B transposed), otherwise
// (CblasColMajor, A transposed, B not transposed).
4286 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4287 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4288 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4289 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a const DMatTDMatMultExpr<T1,T2> expression object.
// It reports a failure by throwing std::invalid_argument with the message
// "Matrix sizes do not match".
// NOTE(review): the function name, its parameters and the condition guarding the throw are
// missing from this listing; presumably this is the operator*() overload for a row-major
// and a column-major dense matrix that checks lhs.columns() against rhs.rows() -- confirm.
4358 template<
typename T1
4360 inline const DMatTDMatMultExpr<T1,T2>
4366 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait fragment for multiplying a dense matrix product by a dense column vector.
// Type is selected at compile time: if MT1 is a row-major dense matrix, MT2 a column-major
// dense matrix and VT a dense column vector, it is the result type of MT1 * ( MT2 * VT ),
// i.e. the right-hand matrix/vector product is formed first; otherwise it is INVALID_TYPE.
// NOTE(review): the enclosing struct header is missing from this listing.
4383 template<
typename MT1,
typename MT2,
typename VT >
4388 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4389 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4390 IsDenseVector<VT>::value && IsColumnVector<VT>::value
4391 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
4392 , INVALID_TYPE >::Type Type;
// Expression trait fragment for multiplying a dense matrix product by a sparse column vector.
// Type is selected at compile time: if MT1 is a row-major dense matrix, MT2 a column-major
// dense matrix and VT a sparse column vector, it is the result type of MT1 * ( MT2 * VT ),
// where the inner product uses TDMatSVecMultExprTrait and the outer one
// DMatDVecMultExprTrait; otherwise it is INVALID_TYPE.
// NOTE(review): the enclosing struct header is missing from this listing.
4401 template<
typename MT1,
typename MT2,
typename VT >
4406 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4407 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4408 IsSparseVector<VT>::value && IsColumnVector<VT>::value
4409 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
4410 , INVALID_TYPE >::Type Type;
// Expression trait fragment for multiplying a dense row vector by a dense matrix product.
// Type is selected at compile time: if VT is a dense row vector, MT1 a row-major dense
// matrix and MT2 a column-major dense matrix, it is the result type of ( VT * MT1 ) * MT2,
// i.e. the left-hand vector/matrix product is formed first; otherwise it is INVALID_TYPE.
// NOTE(review): the enclosing struct header is missing from this listing.
4419 template<
typename VT,
typename MT1,
typename MT2 >
4424 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
4425 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4426 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4427 ,
typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4428 , INVALID_TYPE >::Type Type;
// Expression trait fragment for multiplying a sparse row vector by a dense matrix product.
// Type is selected at compile time: if VT is a sparse row vector, MT1 a row-major dense
// matrix and MT2 a column-major dense matrix, it is the result type of ( VT * MT1 ) * MT2,
// where the inner product uses TSVecDMatMultExprTrait and the outer one
// TDVecTDMatMultExprTrait; otherwise it is INVALID_TYPE.
// NOTE(review): the enclosing struct header is missing from this listing.
4437 template<
typename VT,
typename MT1,
typename MT2 >
4442 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
4443 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4444 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4445 ,
typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4446 , INVALID_TYPE >::Type Type;
// Submatrix expression trait fragment for the matrix product.
// Type is the multiplication result type of the submatrix types of both operands: each
// operand is taken as const and uses the same AF flag.
// NOTE(review): the enclosing struct header is missing from this listing; AF is presumably
// the alignment flag of the submatrix -- confirm.
4455 template<
typename MT1,
typename MT2,
bool AF >
4460 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
4461 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Row expression trait fragment for the matrix product.
// Type is the multiplication result type of a row of the (const) left operand MT1 and the
// complete right operand MT2.
// NOTE(review): the enclosing struct header is missing from this listing.
4470 template<
typename MT1,
typename MT2 >
4475 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Column expression trait fragment for the matrix product.
// Type is the multiplication result type of the complete left operand MT1 and a column of
// the (const) right operand MT2.
// NOTE(review): the enclosing struct header is missing from this listing.
4484 template<
typename MT1,
typename MT2 >
4489 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:403
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:247
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:255
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4579
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B * C$ ).
Definition: DMatDMatMultExpr.h:4075
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:249
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:151
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:299
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:197
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:393
Header file for the sparse matrix SMP implementation.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2384
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:249
DMatTDMatMultExpr< MT1, MT2 > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:245
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:247
Header file for the TDVecSMatMultExprTrait class template.
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:124
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:126
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:284
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:121
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:339
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:252
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TDMatSVecMultExprTrait class template.
Header file for the dense matrix SMP implementation.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:383
Header file for the DenseMatrix base class.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:123
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:127
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2382
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
const size_t SMP_DMATTDMATMULT_THRESHOLD
SMP row-major dense matrix/column-major dense matrix multiplication threshold. This threshold represen...
Definition: Thresholds.h:446
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatTDMatMultExpr.h:250
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:329
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:91
Header file for the IsNumeric type trait.
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:246
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:239
const size_t DMATTDMATMULT_THRESHOLD
Row-major dense matrix/column-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:142
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:371
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:125
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:349
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:248
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:251
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2379
Header file for basic type definitions.
Header file for the IsComplex type trait.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:412
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:116
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:122
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:413
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:359
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:248
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:261
Constraint on the data type.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:264
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:258
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.