22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// Expression class template parameterized on its operand matrix types
// (MT1 here; MT2 is referenced by the aliases below).
95 template<
typename MT1
// Shorthand aliases for the ResultType (RT1/RT2) and CompositeType (CT1/CT2)
// exported by the left (MT1) and right (MT2) operand types.
103 typedef typename MT1::ResultType
RT1;
104 typedef typename MT2::ResultType
RT2;
105 typedef typename MT1::CompositeType
CT1;
106 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate over three types. UseDefaultKernel negates its
// ::value, and the cblas_sgemm-based selectBlas*Kernel overloads use it as
// their EnableIf condition.
114 template<
typename T1,
typename T2,
typename T3 >
115 struct UseSinglePrecisionKernel {
// Compile-time predicate over three types. UseDefaultKernel negates its
// ::value, and the cblas_dgemm-based selectBlas*Kernel overloads use it as
// their EnableIf condition.
128 template<
typename T1,
typename T2,
typename T3 >
129 struct UseDoublePrecisionKernel {
// Compile-time predicate: value is nonzero only when the ElementType of all
// three types T1, T2 and T3 is exactly complex<float>. Used as the EnableIf
// condition of the cblas_cgemm-based kernels.
143 template<
typename T1,
typename T2,
typename T3 >
144 struct UseSinglePrecisionComplexKernel {
// Element type that all three matrix types must share.
145 typedef complex<float> Type;
146 enum { value = IsSame<typename T1::ElementType,Type>::value &&
147 IsSame<typename T2::ElementType,Type>::value &&
148 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: value is nonzero only when the ElementType of all
// three types T1, T2 and T3 is exactly complex<double>. Used as the EnableIf
// condition of the cblas_zgemm-based kernels.
159 template<
typename T1,
typename T2,
typename T3 >
160 struct UseDoublePrecisionComplexKernel {
// Element type that all three matrix types must share.
161 typedef complex<double> Type;
162 enum { value = IsSame<typename T1::ElementType,Type>::value &&
163 IsSame<typename T2::ElementType,Type>::value &&
164 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS kernels: value is nonzero
// when BLAZE_BLAS_MODE evaluates to false, or when none of the four BLAS
// predicates (single, double, complex<float>, complex<double>) applies to
// the type triple T1, T2, T3.
174 template<
typename T1,
typename T2,
typename T3 >
175 struct UseDefaultKernel {
176 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
177 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
178 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
179 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate selecting the intrinsic-based default kernels.
// value is nonzero only if all three types report themselves vectorizable,
// all three share the same ElementType, and IntrinsicTrait of that element
// type reports both addition and multiplication.
189 template<
typename T1,
typename T2,
typename T3 >
190 struct UseVectorizedDefaultKernel {
191 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
192 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
193 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
194 IntrinsicTrait<typename T1::ElementType>::addition &&
195 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flag: this expression reports vectorizable == 0. The same
// member name is read as T::vectorizable by UseVectorizedDefaultKernel.
226 enum { vectorizable = 0 };
// Element computation: accumulates products lhs_(i,k) * rhs_(k,j) into tmp.
// Nothing is accumulated here when the left operand has no columns.
256 if(
lhs_.columns() != 0UL ) {
// end is the largest odd value that does not exceed lhs_.columns()
// (1 for 1 or 2 columns, 3 for 3 or 4 columns, ...).
257 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// Starting at k=1, the loop advances two k-indices per iteration.
259 for(
size_t k=1UL; k<end; k+=2UL ) {
261 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// True exactly when columns() is even, i.e. index `end` is still unprocessed.
263 if( end <
lhs_.columns() ) {
// The reported column count is that of the right-hand side operand.
291 return rhs_.columns();
// Alias query, templated on the pointee type T of the candidate address.
// The result is true if either operand reports that it can alias `alias`.
321 template<
typename T >
323 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Alias query, templated on the pointee type T of the candidate address.
// The result is true if either operand reports that it is aliased with `alias`.
333 template<
typename T >
335 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Assignment of the matrix product expression `rhs` to the dense target `lhs`.
355 template<
typename MT3
// Special case 1: the target has no rows or no columns.
364 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension (columns of the left operand) is zero.
367 else if( rhs.
lhs_.columns() == 0UL ) {
// General case: one of the two kernel families is invoked on the operands
// A and B. NOTE(review): the condition choosing between the default and
// the BLAS path is not part of these lines - confirm against the full function.
383 DMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
385 DMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Scalar (non-intrinsic) kernel computing C = A * B element by element.
//   C  target matrix, written through C(i,j)
//   A  left operand,  read through A(i,k)
//   B  right operand, read through B(k,j)
// A(i,0UL) and B(0UL,j) are read unconditionally, so K >= 1 is assumed.
// NOTE(review): the zero-inner-dimension check in assign() appears to
// guarantee this - confirm.
403 template<
typename MT3
407 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
409 const size_t M( A.rows() );
410 const size_t N( B.columns() );
411 const size_t K( A.columns() );
413 for(
size_t i=0UL; i<M; ++i ) {
// First pass (k == 0) overwrites row i of C instead of accumulating into it.
414 for(
size_t j=0UL; j<N; ++j ) {
415 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining k values are accumulated on top of the initial product.
417 for(
size_t k=1UL; k<K; ++k ) {
418 for(
size_t j=0UL; j<N; ++j ) {
419 C(i,j) += A(i,k) * B(k,j);
// Intrinsic-based kernel for C = A * B, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and the target is a
// DenseMatrix with storage-order flag `false`.
// Columns are consumed in blocks of IT::size*8, IT::size*4 and IT::size*2
// elements; each block accumulates products over k in the xmm variables and
// then stores the accumulators into C.
441 template<
typename MT3
444 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
445 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
447 typedef IntrinsicTrait<ElementType> IT;
449 const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably so that padded row storage is covered - confirm.
450 const size_t N( B.spacing() );
451 const size_t K( A.columns() );
// Column blocks of 8 intrinsic vectors, one row of C per iteration.
455 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
456 for(
size_t i=0UL; i<M; ++i ) {
457 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
458 for(
size_t k=0UL; k<K; ++k ) {
460 xmm1 = xmm1 + a1 * B.get(k,j );
461 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
462 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
463 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
464 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
465 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
466 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
467 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
469 store( &(~C)(i,j ), xmm1 );
470 store( &(~C)(i,j+IT::size ), xmm2 );
471 store( &(~C)(i,j+IT::size*2UL), xmm3 );
472 store( &(~C)(i,j+IT::size*3UL), xmm4 );
473 store( &(~C)(i,j+IT::size*4UL), xmm5 );
474 store( &(~C)(i,j+IT::size*5UL), xmm6 );
475 store( &(~C)(i,j+IT::size*6UL), xmm7 );
476 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column blocks of 4 intrinsic vectors; rows are taken in pairs (i, i+1).
479 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
481 for( ; (i+2UL) <= M; i+=2UL ) {
482 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
483 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
490 xmm1 = xmm1 + a1 * b1;
491 xmm2 = xmm2 + a1 * b2;
492 xmm3 = xmm3 + a1 * b3;
493 xmm4 = xmm4 + a1 * b4;
494 xmm5 = xmm5 + a2 * b1;
495 xmm6 = xmm6 + a2 * b2;
496 xmm7 = xmm7 + a2 * b3;
497 xmm8 = xmm8 + a2 * b4;
499 store( &(~C)(i ,j ), xmm1 );
500 store( &(~C)(i ,j+IT::size ), xmm2 );
501 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
502 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
503 store( &(~C)(i+1UL,j ), xmm5 );
504 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
505 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
506 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row variant of the 4-vector block (stores only to row i).
510 for(
size_t k=0UL; k<K; ++k ) {
512 xmm1 = xmm1 + a1 * B.get(k,j );
513 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
514 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
515 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
517 store( &(~C)(i,j ), xmm1 );
518 store( &(~C)(i,j+IT::size ), xmm2 );
519 store( &(~C)(i,j+IT::size*2UL), xmm3 );
520 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column blocks of 2 intrinsic vectors; rows are taken in pairs (i, i+1).
523 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
525 for( ; (i+2UL) <= M; i+=2UL ) {
527 for(
size_t k=0UL; k<K; ++k ) {
532 xmm1 = xmm1 + a1 * b1;
533 xmm2 = xmm2 + a1 * b2;
534 xmm3 = xmm3 + a2 * b1;
535 xmm4 = xmm4 + a2 * b2;
537 store( &(~C)(i ,j ), xmm1 );
538 store( &(~C)(i ,j+IT::size), xmm2 );
539 store( &(~C)(i+1UL,j ), xmm3 );
540 store( &(~C)(i+1UL,j+IT::size), xmm4 );
// Single-row variant of the 2-vector block.
544 for(
size_t k=0UL; k<K; ++k ) {
546 xmm1 = xmm1 + a1 * B.get(k,j );
547 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
549 store( &(~C)(i,j ), xmm1 );
550 store( &(~C)(i,j+IT::size), xmm2 );
// One intrinsic vector per column position; rows in pairs, with A(i,k)
// expanded through set() before the multiplication.
555 for( ; (i+2UL) <= M; i+=2UL ) {
557 for(
size_t k=0UL; k<K; ++k ) {
559 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
560 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
562 store( &(~C)(i ,j), xmm1 );
563 store( &(~C)(i+1UL,j), xmm2 );
// Single-row variant of the one-vector case.
567 for(
size_t k=0UL; k<K; ++k ) {
568 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
570 store( &(~C)(i,j), xmm1 );
// Vectorized default kernel overload for a target DenseMatrix with
// storage-order flag `true`. It builds a temporary of one operand in its
// OppositeType. The choice of operand is:
//   - A, if only B (MT5) is resizable;
//   - B, if only A (MT4) is resizable;
//   - otherwise the operand with fewer elements (A on a tie).
591 template<
typename MT3
594 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
595 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
600 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
601 const typename MT4::OppositeType tmp( A );
604 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
605 const typename MT5::OppositeType tmp( B );
// Both or neither resizable: decide by element count.
608 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
609 const typename MT4::OppositeType tmp( A );
613 const typename MT5::OppositeType tmp( B );
// BLAS-selector overload used when UseDefaultKernel<MT3,MT4,MT5> holds:
// no BLAS routine is called; the work is forwarded unchanged to
// selectDefaultAssignKernel( C, A, B ).
633 template<
typename MT3
636 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
637 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
639 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for C = A * B, enabled by UseSinglePrecisionKernel: delegates
// to cblas_sgemm with alpha = 1.0F and beta = 0.0F, so the previous contents
// of C do not contribute to the result.
659 template<
typename MT3
662 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
663 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
665 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
671 const int M ( numeric_cast<int>( A.rows() ) );
672 const int N ( numeric_cast<int>( B.columns() ) );
673 const int K ( numeric_cast<int>( A.columns() ) );
674 const int lda( numeric_cast<int>( A.spacing() ) );
675 const int ldb( numeric_cast<int>( B.spacing() ) );
676 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
678 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
679 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
680 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
681 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for C = A * B, enabled by UseDoublePrecisionKernel: delegates
// to cblas_dgemm with alpha = 1.0 and beta = 0.0, so the previous contents
// of C do not contribute to the result.
702 template<
typename MT3
705 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
706 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
708 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
714 const int M ( numeric_cast<int>( A.rows() ) );
715 const int N ( numeric_cast<int>( B.columns() ) );
716 const int K ( numeric_cast<int>( A.columns() ) );
717 const int lda( numeric_cast<int>( A.spacing() ) );
718 const int ldb( numeric_cast<int>( B.spacing() ) );
719 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
721 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
722 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
723 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
724 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for C = A * B, enabled by UseSinglePrecisionComplexKernel:
// delegates to cblas_cgemm with alpha = (1,0) and beta = (0,0), so the
// previous contents of C do not contribute to the result.
745 template<
typename MT3
748 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
749 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
751 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
760 const int M ( numeric_cast<int>( A.rows() ) );
761 const int N ( numeric_cast<int>( B.columns() ) );
762 const int K ( numeric_cast<int>( A.columns() ) );
763 const int lda( numeric_cast<int>( A.spacing() ) );
764 const int ldb( numeric_cast<int>( B.spacing() ) );
765 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
766 const complex<float> alpha( 1.0F, 0.0F );
767 const complex<float> beta ( 0.0F, 0.0F );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
769 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
770 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
771 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
772 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C = A * B, enabled by UseDoublePrecisionComplexKernel:
// delegates to cblas_zgemm with alpha = (1,0) and beta = (0,0), so the
// previous contents of C do not contribute to the result.
793 template<
typename MT3
796 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
797 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
799 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
808 const int M ( numeric_cast<int>( A.rows() ) );
809 const int N ( numeric_cast<int>( B.columns() ) );
810 const int K ( numeric_cast<int>( A.columns() ) );
811 const int lda( numeric_cast<int>( A.spacing() ) );
812 const int ldb( numeric_cast<int>( B.spacing() ) );
813 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
814 const complex<double> alpha( 1.0, 0.0 );
815 const complex<double> beta ( 0.0, 0.0 );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
817 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
818 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
819 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
820 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment path that first evaluates the whole expression into a temporary.
838 template<
typename MT
// The temporary's type is picked by SelectType from OppositeType and
// ResultType depending on the storage-order flag SO.
// NOTE(review): presumably OppositeType when SO is true - confirm.
844 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// Constructing tmp from rhs materializes the product.
856 const TmpType tmp( rhs );
// Addition assignment of the matrix product expression `rhs` to the dense
// target `lhs`.
875 template<
typename MT3
// Degenerate case: an empty target or a zero inner dimension.
884 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// General case: one of the two kernel families is invoked on A and B.
// NOTE(review): the condition choosing between the default and the BLAS
// path is not part of these lines - confirm against the full function.
899 DMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
901 DMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Scalar (non-intrinsic) kernel computing C += A * B; selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
920 template<
typename MT3
923 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
924 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
926 const size_t M( A.rows() );
927 const size_t N( B.columns() );
928 const size_t K( A.columns() );
// end is N rounded down to the nearest even value.
931 const size_t end( N &
size_t(-2) );
933 for(
size_t i=0UL; i<M; ++i ) {
934 for(
size_t k=0UL; k<K; ++k ) {
// Two columns of C are updated per iteration.
935 for(
size_t j=0UL; j<end; j+=2UL ) {
936 C(i,j ) += A(i,k) * B(k,j );
937 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Column index `end` is only within range when N is odd.
940 C(i,end) += A(i,k) * B(k,end);
// Intrinsic-based kernel for C += A * B, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and the target is a
// DenseMatrix with storage-order flag `false`.
// Columns are consumed in blocks of IT::size*8, IT::size*4 and IT::size*2
// elements; the xmm accumulators are increased by the products over k and
// then stored into C.
962 template<
typename MT3
965 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
966 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
968 typedef IntrinsicTrait<ElementType> IT;
970 const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably so that padded row storage is covered - confirm.
971 const size_t N( B.spacing() );
972 const size_t K( A.columns() );
// Column blocks of 8 intrinsic vectors, one row of C per iteration.
976 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
977 for(
size_t i=0UL; i<M; ++i ) {
986 for(
size_t k=0UL; k<K; ++k ) {
988 xmm1 = xmm1 + a1 * B.get(k,j );
989 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
990 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
991 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
992 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
993 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
994 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
995 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
997 store( &(~C)(i,j ), xmm1 );
998 store( &(~C)(i,j+IT::size ), xmm2 );
999 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1000 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1001 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1002 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1003 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1004 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column blocks of 4 intrinsic vectors; rows are taken in pairs (i, i+1).
1007 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1009 for( ; (i+2UL) <= M; i+=2UL ) {
1018 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
1025 xmm1 = xmm1 + a1 * b1;
1026 xmm2 = xmm2 + a1 * b2;
1027 xmm3 = xmm3 + a1 * b3;
1028 xmm4 = xmm4 + a1 * b4;
1029 xmm5 = xmm5 + a2 * b1;
1030 xmm6 = xmm6 + a2 * b2;
1031 xmm7 = xmm7 + a2 * b3;
1032 xmm8 = xmm8 + a2 * b4;
1034 store( &(~C)(i ,j ), xmm1 );
1035 store( &(~C)(i ,j+IT::size ), xmm2 );
1036 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1037 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1038 store( &(~C)(i+1UL,j ), xmm5 );
1039 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1040 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1041 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row variant of the 4-vector block (stores only to row i).
1048 for(
size_t k=0UL; k<K; ++k ) {
1050 xmm1 = xmm1 + a1 * B.get(k,j );
1051 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1052 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1053 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1055 store( &(~C)(i,j ), xmm1 );
1056 store( &(~C)(i,j+IT::size ), xmm2 );
1057 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1058 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column blocks of 2 intrinsic vectors; rows are taken in pairs (i, i+1).
1061 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1063 for( ; (i+2UL) <= M; i+=2UL ) {
1068 for(
size_t k=0UL; k<K; ++k ) {
1073 xmm1 = xmm1 + a1 * b1;
1074 xmm2 = xmm2 + a1 * b2;
1075 xmm3 = xmm3 + a2 * b1;
1076 xmm4 = xmm4 + a2 * b2;
1078 store( &(~C)(i ,j ), xmm1 );
1079 store( &(~C)(i ,j+IT::size), xmm2 );
1080 store( &(~C)(i+1UL,j ), xmm3 );
1081 store( &(~C)(i+1UL,j+IT::size), xmm4 );
// Single-row variant of the 2-vector block.
1086 for(
size_t k=0UL; k<K; ++k ) {
1088 xmm1 = xmm1 + a1 * B.get(k,j );
1089 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1091 store( &(~C)(i,j ), xmm1 );
1092 store( &(~C)(i,j+IT::size), xmm2 );
// One intrinsic vector per column position; rows in pairs, with A(i,k)
// expanded through set() before the multiplication.
1097 for( ; (i+2UL) <= M; i+=2UL ) {
1100 for(
size_t k=0UL; k<K; ++k ) {
1102 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1103 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1105 store( &(~C)(i ,j), xmm1 );
1106 store( &(~C)(i+1UL,j), xmm2 );
// Single-row variant of the one-vector case.
1110 for(
size_t k=0UL; k<K; ++k ) {
1111 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
1113 store( &(~C)(i,j), xmm1 );
// Vectorized default add-assign overload for a target DenseMatrix with
// storage-order flag `true`. It builds a temporary of one operand in its
// OppositeType. The choice of operand is:
//   - A, if only B (MT5) is resizable;
//   - B, if only A (MT4) is resizable;
//   - otherwise the operand with fewer elements (A on a tie).
1134 template<
typename MT3
1137 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1138 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1143 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1144 const typename MT4::OppositeType tmp( A );
1147 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1148 const typename MT5::OppositeType tmp( B );
// Both or neither resizable: decide by element count.
1151 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1152 const typename MT4::OppositeType tmp( A );
1156 const typename MT5::OppositeType tmp( B );
// BLAS-selector overload used when UseDefaultKernel<MT3,MT4,MT5> holds:
// no BLAS routine is called; the work is forwarded unchanged to
// selectDefaultAddAssignKernel( C, A, B ).
1177 template<
typename MT3
1180 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1181 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1183 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B, enabled by UseSinglePrecisionKernel: delegates
// to cblas_sgemm with alpha = 1.0F and beta = 1.0F, so the product is added
// to the existing contents of C.
1203 template<
typename MT3
1206 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1207 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1209 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1215 const int M ( numeric_cast<int>( A.rows() ) );
1216 const int N ( numeric_cast<int>( B.columns() ) );
1217 const int K ( numeric_cast<int>( A.columns() ) );
1218 const int lda( numeric_cast<int>( A.spacing() ) );
1219 const int ldb( numeric_cast<int>( B.spacing() ) );
1220 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1222 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1223 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1224 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1225 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C += A * B, enabled by UseDoublePrecisionKernel: delegates
// to cblas_dgemm with alpha = 1.0 and beta = 1.0, so the product is added
// to the existing contents of C.
1246 template<
typename MT3
1249 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1250 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1252 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1258 const int M ( numeric_cast<int>( A.rows() ) );
1259 const int N ( numeric_cast<int>( B.columns() ) );
1260 const int K ( numeric_cast<int>( A.columns() ) );
1261 const int lda( numeric_cast<int>( A.spacing() ) );
1262 const int ldb( numeric_cast<int>( B.spacing() ) );
1263 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1265 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1266 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1267 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1268 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C += A * B, enabled by UseSinglePrecisionComplexKernel:
// delegates to cblas_cgemm with alpha = (1,0) and beta = (1,0), so the
// product is added to the existing contents of C.
1289 template<
typename MT3
1292 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1293 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1295 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1304 const int M ( numeric_cast<int>( A.rows() ) );
1305 const int N ( numeric_cast<int>( B.columns() ) );
1306 const int K ( numeric_cast<int>( A.columns() ) );
1307 const int lda( numeric_cast<int>( A.spacing() ) );
1308 const int ldb( numeric_cast<int>( B.spacing() ) );
1309 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1310 const complex<float> alpha( 1.0F, 0.0F );
1311 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1313 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1314 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1315 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1316 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C += A * B, enabled by UseDoublePrecisionComplexKernel:
// delegates to cblas_zgemm with alpha = (1,0) and beta = (1,0), so the
// product is added to the existing contents of C.
1337 template<
typename MT3
1340 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1341 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1343 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1352 const int M ( numeric_cast<int>( A.rows() ) );
1353 const int N ( numeric_cast<int>( B.columns() ) );
1354 const int K ( numeric_cast<int>( A.columns() ) );
1355 const int lda( numeric_cast<int>( A.spacing() ) );
1356 const int ldb( numeric_cast<int>( B.spacing() ) );
1357 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1358 const complex<double> alpha( 1.0, 0.0 );
1359 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1361 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1362 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1363 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1364 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the matrix product expression `rhs` to the
// dense target `lhs`.
1387 template<
typename MT3
// Degenerate case: an empty target or a zero inner dimension.
1396 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// General case: one of the two kernel families is invoked on A and B.
// NOTE(review): the condition choosing between the default and the BLAS
// path is not part of these lines - confirm against the full function.
1411 DMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1413 DMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Scalar (non-intrinsic) kernel computing C -= A * B; selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
1432 template<
typename MT3
1435 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1436 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1438 const size_t M( A.rows() );
1439 const size_t N( B.columns() );
1440 const size_t K( A.columns() );
// end is N rounded down to the nearest even value.
1443 const size_t end( N &
size_t(-2) );
1445 for(
size_t i=0UL; i<M; ++i ) {
1446 for(
size_t k=0UL; k<K; ++k ) {
// Two columns of C are updated per iteration.
1447 for(
size_t j=0UL; j<end; j+=2UL ) {
1448 C(i,j ) -= A(i,k) * B(k,j );
1449 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Column index `end` is only within range when N is odd.
1452 C(i,end) -= A(i,k) * B(k,end);
// Intrinsic-based kernel for C -= A * B, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and the target is a
// DenseMatrix with storage-order flag `false`.
// Columns are consumed in blocks of IT::size*8, IT::size*4 and IT::size*2
// elements; the xmm accumulators are decreased by the products over k and
// then stored into C.
1474 template<
typename MT3
1477 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1478 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1480 typedef IntrinsicTrait<ElementType> IT;
1482 const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably so that padded row storage is covered - confirm.
1483 const size_t N( B.spacing() );
1484 const size_t K( A.columns() );
// Column blocks of 8 intrinsic vectors, one row of C per iteration.
1488 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1489 for(
size_t i=0UL; i<M; ++i ) {
1498 for(
size_t k=0UL; k<K; ++k ) {
1500 xmm1 = xmm1 - a1 * B.get(k,j );
1501 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1502 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1503 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1504 xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1505 xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1506 xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1507 xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1509 store( &(~C)(i,j ), xmm1 );
1510 store( &(~C)(i,j+IT::size ), xmm2 );
1511 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1512 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1513 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1514 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1515 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1516 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column blocks of 4 intrinsic vectors; rows are taken in pairs (i, i+1).
1519 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1521 for( ; (i+2UL) <= M; i+=2UL ) {
1530 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
1537 xmm1 = xmm1 - a1 * b1;
1538 xmm2 = xmm2 - a1 * b2;
1539 xmm3 = xmm3 - a1 * b3;
1540 xmm4 = xmm4 - a1 * b4;
1541 xmm5 = xmm5 - a2 * b1;
1542 xmm6 = xmm6 - a2 * b2;
1543 xmm7 = xmm7 - a2 * b3;
1544 xmm8 = xmm8 - a2 * b4;
1546 store( &(~C)(i ,j ), xmm1 );
1547 store( &(~C)(i ,j+IT::size ), xmm2 );
1548 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1549 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1550 store( &(~C)(i+1UL,j ), xmm5 );
1551 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1552 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1553 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row variant of the 4-vector block (stores only to row i).
1560 for(
size_t k=0UL; k<K; ++k ) {
1562 xmm1 = xmm1 - a1 * B.get(k,j );
1563 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1564 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1565 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1567 store( &(~C)(i,j ), xmm1 );
1568 store( &(~C)(i,j+IT::size ), xmm2 );
1569 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1570 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column blocks of 2 intrinsic vectors; rows are taken in pairs (i, i+1).
1573 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1575 for( ; (i+2UL) <= M; i+=2UL ) {
1580 for(
size_t k=0UL; k<K; ++k ) {
1585 xmm1 = xmm1 - a1 * b1;
1586 xmm2 = xmm2 - a1 * b2;
1587 xmm3 = xmm3 - a2 * b1;
1588 xmm4 = xmm4 - a2 * b2;
1590 store( &(~C)(i ,j ), xmm1 );
1591 store( &(~C)(i ,j+IT::size), xmm2 );
1592 store( &(~C)(i+1UL,j ), xmm3 );
1593 store( &(~C)(i+1UL,j+IT::size), xmm4 );
// Single-row variant of the 2-vector block.
1598 for(
size_t k=0UL; k<K; ++k ) {
1600 xmm1 = xmm1 - a1 * B.get(k,j );
1601 xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1603 store( &(~C)(i,j ), xmm1 );
1604 store( &(~C)(i,j+IT::size), xmm2 );
// One intrinsic vector per column position; rows in pairs, with A(i,k)
// expanded through set() before the multiplication.
1609 for( ; (i+2UL) <= M; i+=2UL ) {
1612 for(
size_t k=0UL; k<K; ++k ) {
1614 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1615 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1617 store( &(~C)(i ,j), xmm1 );
1618 store( &(~C)(i+1UL,j), xmm2 );
// Single-row variant of the one-vector case.
1622 for(
size_t k=0UL; k<K; ++k ) {
1623 xmm1 = xmm1 -
set( A(i,k) ) * B.get(k,j);
1625 store( &(~C)(i,j), xmm1 );
// Vectorized default sub-assign overload for a target DenseMatrix with
// storage-order flag `true`. It builds a temporary of one operand in its
// OppositeType. The choice of operand is:
//   - A, if only B (MT5) is resizable;
//   - B, if only A (MT4) is resizable;
//   - otherwise the operand with fewer elements (A on a tie).
1646 template<
typename MT3
1649 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1650 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1655 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1656 const typename MT4::OppositeType tmp( A );
1659 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1660 const typename MT5::OppositeType tmp( B );
// Both or neither resizable: decide by element count.
1663 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1664 const typename MT4::OppositeType tmp( A );
1668 const typename MT5::OppositeType tmp( B );
// BLAS-selector overload used when UseDefaultKernel<MT3,MT4,MT5> holds:
// no BLAS routine is called; the work is forwarded unchanged to
// selectDefaultSubAssignKernel( C, A, B ).
1689 template<
typename MT3
1692 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1693 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1695 selectDefaultSubAssignKernel( C, A, B );
// BLAS kernel for C -= A * B, enabled by UseSinglePrecisionKernel: delegates
// to cblas_sgemm with alpha = -1.0F and beta = 1.0F, so the negated product
// is added to the existing contents of C.
1715 template<
typename MT3
1718 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1719 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1721 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1727 const int M ( numeric_cast<int>( A.rows() ) );
1728 const int N ( numeric_cast<int>( B.columns() ) );
1729 const int K ( numeric_cast<int>( A.columns() ) );
1730 const int lda( numeric_cast<int>( A.spacing() ) );
1731 const int ldb( numeric_cast<int>( B.spacing() ) );
1732 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1734 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1735 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1736 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1737 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled by UseDoublePrecisionKernel: delegates
// to cblas_dgemm with alpha = -1.0 and beta = 1.0, so the negated product
// is added to the existing contents of C.
1758 template<
typename MT3
1761 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1762 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1764 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1770 const int M ( numeric_cast<int>( A.rows() ) );
1771 const int N ( numeric_cast<int>( B.columns() ) );
1772 const int K ( numeric_cast<int>( A.columns() ) );
1773 const int lda( numeric_cast<int>( A.spacing() ) );
1774 const int ldb( numeric_cast<int>( B.spacing() ) );
1775 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1777 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1778 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1779 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1780 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled by UseSinglePrecisionComplexKernel:
// delegates to cblas_cgemm with alpha = (-1,0) and beta = (1,0), so the
// negated product is added to the existing contents of C.
1801 template<
typename MT3
1804 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1805 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1807 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1816 const int M ( numeric_cast<int>( A.rows() ) );
1817 const int N ( numeric_cast<int>( B.columns() ) );
1818 const int K ( numeric_cast<int>( A.columns() ) );
1819 const int lda( numeric_cast<int>( A.spacing() ) );
1820 const int ldb( numeric_cast<int>( B.spacing() ) );
1821 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1822 const complex<float> alpha( -1.0F, 0.0F );
1823 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1825 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1826 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1827 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1828 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled by UseDoublePrecisionComplexKernel:
// delegates to cblas_zgemm with alpha = (-1,0) and beta = (1,0), so the
// negated product is added to the existing contents of C.
1849 template<
typename MT3
1852 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1853 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1855 using boost::numeric_cast;
// Dimensions and leading dimensions are converted from size_t to the int
// arguments of the CBLAS interface via the range-checked numeric_cast.
1864 const int M ( numeric_cast<int>( A.rows() ) );
1865 const int N ( numeric_cast<int>( B.columns() ) );
1866 const int K ( numeric_cast<int>( A.columns() ) );
1867 const int lda( numeric_cast<int>( A.spacing() ) );
1868 const int ldb( numeric_cast<int>( B.spacing() ) );
1869 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1870 const complex<double> alpha( -1.0, 0.0 );
1871 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of the target type MT3.
// If MT3 is not row-major, both A and B are passed with CblasTrans.
1873 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1874 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1875 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1876 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Expression class for a scaled matrix product: a DMatScalarMultExpr whose
// matrix operand is a DMatDMatMultExpr<MT1,MT2> and whose scalar type is ST.
// It derives publicly from DenseMatrix and privately from the Expression and
// Computation tag classes.
1922 template<
typename MT1
1926 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
1927 ,
private Expression
1928 ,
private Computation
// MMM is the wrapped matrix product expression type, RES its result type.
1932 typedef DMatDMatMultExpr<MT1,MT2> MMM;
1933 typedef typename MMM::ResultType RES;
// Result (RT1/RT2) and composite (CT1/CT2) types of the product's operands.
1934 typedef typename MT1::ResultType
RT1;
1935 typedef typename MT2::ResultType
RT2;
1936 typedef typename MT1::CompositeType
CT1;
1937 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate for the scaled product: value is nonzero only when
// IsFloat holds for the ElementType of T1, T2 and T3 and the fourth type T4
// is not a complex type.
1945 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1946 struct UseSinglePrecisionKernel {
1947 enum { value = IsFloat<typename T1::ElementType>::value &&
1948 IsFloat<typename T2::ElementType>::value &&
1949 IsFloat<typename T3::ElementType>::value &&
1950 !IsComplex<T4>::value };
// Compile-time predicate for the scaled product: value is nonzero only when
// IsDouble holds for the ElementType of T1, T2 and T3 and the fourth type T4
// is not a complex type.
1959 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1960 struct UseDoublePrecisionKernel {
1961 enum { value = IsDouble<typename T1::ElementType>::value &&
1962 IsDouble<typename T2::ElementType>::value &&
1963 IsDouble<typename T3::ElementType>::value &&
1964 !IsComplex<T4>::value };
// Compile-time predicate: value is nonzero only when the ElementType of all
// three types T1, T2 and T3 is exactly complex<float>. Unlike the real-valued
// predicates of this class, it takes no scalar type parameter.
1973 template<
typename T1,
typename T2,
typename T3 >
1974 struct UseSinglePrecisionComplexKernel {
// Element type that all three matrix types must share.
1975 typedef complex<float> Type;
1976 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1977 IsSame<typename T2::ElementType,Type>::value &&
1978 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: value is nonzero only when the ElementType of all
// three types T1, T2 and T3 is exactly complex<double>. Unlike the
// real-valued predicates of this class, it takes no scalar type parameter.
1987 template<
typename T1,
typename T2,
typename T3 >
1988 struct UseDoublePrecisionComplexKernel {
// Element type that all three matrix types must share.
1989 typedef complex<double> Type;
1990 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1991 IsSame<typename T2::ElementType,Type>::value &&
1992 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS kernels for the scaled
// product: value is nonzero when BLAZE_BLAS_MODE evaluates to false, or when
// none of the four BLAS predicates applies. The real-valued predicates also
// receive the scalar type T4; the complex ones only see T1, T2 and T3.
2000 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2001 struct UseDefaultKernel {
2002 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2003 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2004 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2005 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate selecting the intrinsic-based default kernels for
// the scaled product. value is nonzero only if all three matrix types report
// themselves vectorizable, share one ElementType, the scalar type T4 is that
// same element type, and IntrinsicTrait of the element type reports both
// addition and multiplication.
2013 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2014 struct UseVectorizedDefaultKernel {
2015 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2016 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2017 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2018 IsSame<typename T1::ElementType,T4>::value &&
2019 IntrinsicTrait<typename T1::ElementType>::addition &&
2020 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression instance.
2026 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type: MultTrait applied to the product's result type and the scalar type.
2027 typedef typename MultTrait<RES,ST>::Type
ResultType;
2028 typedef typename ResultType::OppositeType
OppositeType;
2030 typedef typename ResultType::ElementType
ElementType;
// Intrinsic type associated with ElementType through IntrinsicTrait.
2031 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left-hand operand of the scaling is the (const) matrix product expression.
2036 typedef const DMatDMatMultExpr<MT1,MT2>
LeftOperand;
// LT / RT: SelectType chooses between `const RT1`/`const RT2` and `CT1`/`CT2`
// based on whether MT1 / MT2 is a computation.
// NOTE(review): presumably the const result type is picked for computations,
// the composite type otherwise - confirm.
2042 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2045 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// Compilation flag: this expression reports vectorizable == 0.
2050 enum { vectorizable = 0 };
// Constructor taking the matrix product expression and the scalar factor.
// Declared explicit, so no implicit conversion through it is possible.
2059 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element (i,j) of the wrapped product expression, multiplied by the stored scalar.
2075 return matrix_(i,j) * scalar_;
// Returns the number of rows, as reported by the wrapped product expression.
2084 inline size_t rows()
const {
2085 return matrix_.rows();
// Returns the number of columns, as reported by the wrapped product expression.
2094 inline size_t columns()
const {
2095 return matrix_.columns();
// Alias query: forwards to the wrapped product expression and returns
// whether it can alias the given address.
//   alias  address to be checked
2125 template<
typename T >
2126 inline bool canAlias(
const T* alias )
const {
2127 return matrix_.canAlias( alias );
// Alias query: forwards to the wrapped product expression and returns
// whether it is aliased with the given address.
//   alias  address to be checked
2137 template<
typename T >
2138 inline bool isAliased(
const T* alias )
const {
2139 return matrix_.isAliased( alias );
// Assignment of the scaled matrix product `rhs` to the dense target `lhs`.
2158 template<
typename MT3
2160 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
2167 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2168 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case 1: the target has no rows or no columns.
2170 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension (columns of the left operand) is zero.
2173 else if( left.columns() == 0UL ) {
// General case: one of the two kernel families is invoked; the scalar factor
// is passed along as a fourth argument.
// NOTE(review): the condition choosing between the default and the BLAS
// path is not part of these lines - confirm against the full function.
2189 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2191 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Scalar (non-intrinsic) kernel for the scaled product; selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
//   C       target matrix
//   A, B    left and right operand of the matrix product
//   scalar  scaling factor
// A(i,0UL) and B(0UL,j) are read unconditionally, so K >= 1 is assumed.
// NOTE(review): the zero-inner-dimension check in assign() appears to
// guarantee this - confirm.
2209 template<
typename MT3
2213 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2214 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2216 const size_t M( A.rows() );
2217 const size_t N( B.columns() );
2218 const size_t K( A.columns() );
2220 for(
size_t i=0UL; i<M; ++i ) {
// First pass (k == 0) overwrites row i of C instead of accumulating into it.
2221 for(
size_t j=0UL; j<N; ++j ) {
2222 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining k values are accumulated on top of the initial product.
2224 for(
size_t k=1UL; k<K; ++k ) {
2225 for(
size_t j=0UL; j<N; ++j ) {
2226 C(i,j) += A(i,k) * B(k,j);
// NOTE(review): this trailing pass over the columns is presumably where
// `scalar` is applied to row i of C - confirm.
2229 for(
size_t j=0UL; j<N; ++j ) {
// Vectorized default assignment kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The columns are processed in tiers of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements. Products are accumulated over k in the xmm variables and multiplied
// by `factor` when they are stored into C.
// NOTE(review): the declarations of `factor`, `a1`/`a2`, `b1`..`b4`, `i` and `j`, as well as
// several accumulator declarations, are on lines not visible in this excerpt.
2250 template<
typename MT3
2254 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2255 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2257 typedef IntrinsicTrait<ElementType> IT;
2259 const size_t M( A.rows() );
// N is taken from B.spacing() rather than B.columns() — presumably the padded row
// length, so whole intrinsic vectors can be processed; confirm.
2260 const size_t N( B.spacing() );
2261 const size_t K( A.columns() );
// Tier 1: eight intrinsic vectors per step, one row of C at a time.
2267 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2268 for(
size_t i=0UL; i<M; ++i ) {
2269 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2270 for(
size_t k=0UL; k<K; ++k ) {
2272 xmm1 = xmm1 + a1 * B.get(k,j );
2273 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2274 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2275 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2276 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2277 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2278 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2279 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2281 store( &(~C)(i,j ), xmm1 * factor );
2282 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2283 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2284 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2285 store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2286 store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2287 store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2288 store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
// Tier 2: four intrinsic vectors per step; rows are handled in pairs, then a single
// remaining row.
2291 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2293 for( ; (i+2UL) <= M; i+=2UL ) {
2294 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2295 for(
size_t k=0UL; k<K; ++k ) {
2302 xmm1 = xmm1 + a1 * b1;
2303 xmm2 = xmm2 + a1 * b2;
2304 xmm3 = xmm3 + a1 * b3;
2305 xmm4 = xmm4 + a1 * b4;
2306 xmm5 = xmm5 + a2 * b1;
2307 xmm6 = xmm6 + a2 * b2;
2308 xmm7 = xmm7 + a2 * b3;
2309 xmm8 = xmm8 + a2 * b4;
2311 store( &(~C)(i ,j ), xmm1 * factor );
2312 store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2313 store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2314 store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2315 store( &(~C)(i+1UL,j ), xmm5 * factor );
2316 store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2317 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2318 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
2322 for(
size_t k=0UL; k<K; ++k ) {
2324 xmm1 = xmm1 + a1 * B.get(k,j );
2325 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2326 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2327 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2329 store( &(~C)(i,j ), xmm1 * factor );
2330 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2331 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2332 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
// Tier 3: two intrinsic vectors per step; rows in pairs, then a single remaining row.
2335 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2337 for( ; (i+2UL) <= M; i+=2UL ) {
2339 for(
size_t k=0UL; k<K; ++k ) {
2344 xmm1 = xmm1 + a1 * b1;
2345 xmm2 = xmm2 + a1 * b2;
2346 xmm3 = xmm3 + a2 * b1;
2347 xmm4 = xmm4 + a2 * b2;
2349 store( &(~C)(i ,j ), xmm1 * factor );
2350 store( &(~C)(i ,j+IT::size), xmm2 * factor );
2351 store( &(~C)(i+1UL,j ), xmm3 * factor );
2352 store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
2356 for(
size_t k=0UL; k<K; ++k ) {
2358 xmm1 = xmm1 + a1 * B.get(k,j );
2359 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2361 store( &(~C)(i,j ), xmm1 * factor );
2362 store( &(~C)(i,j+IT::size), xmm2 * factor );
// Tier 4: a single intrinsic vector per step (the enclosing j loop header is not
// visible in this excerpt); rows in pairs, then a single remaining row.
2367 for( ; (i+2UL) <= M; i+=2UL ) {
2369 for(
size_t k=0UL; k<K; ++k ) {
2371 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2372 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2374 store( &(~C)(i ,j), xmm1 * factor );
2375 store( &(~C)(i+1UL,j), xmm2 * factor );
2379 for(
size_t k=0UL; k<K; ++k ) {
2380 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2382 store( &(~C)(i,j), xmm1 * factor );
// Vectorized default assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// No loop nest is written here: one operand is copied into its OppositeType and the
// assignment is re-dispatched on the resulting expression.
// Operand choice: if only MT5 is resizable, A is converted; if only MT4 is resizable,
// B is converted; otherwise the operand with fewer elements is converted (A on a tie).
2402 template<
typename MT3
2406 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2407 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2412 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2413 const typename MT4::OppositeType tmp( A );
2414 assign( ~C, tmp * B * scalar );
2416 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2417 const typename MT5::OppositeType tmp( B );
2418 assign( ~C, A * tmp * scalar );
// Both or neither resizable: decide by element count.
2420 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2421 const typename MT4::OppositeType tmp( A );
2422 assign( ~C, tmp * B * scalar );
2425 const typename MT5::OppositeType tmp( B );
2426 assign( ~C, A * tmp * scalar );
// Fallback of the BLAS kernel selection: when UseDefaultKernel<MT3,MT4,MT5,ST2> holds,
// the call is simply forwarded to selectDefaultAssignKernel with the same arguments.
2445 template<
typename MT3
2449 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2450 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2452 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS assignment kernel selected by UseSinglePrecisionKernel: C = scalar * A * B via
// cblas_sgemm. alpha is `scalar` and beta is 0.0F, so the previous content of C does not
// contribute to the result.
2471 template<
typename MT3
2475 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2476 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2478 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted from size_t to the int
// arguments of the CBLAS interface through numeric_cast.
2484 const int M ( numeric_cast<int>( A.rows() ) );
2485 const int N ( numeric_cast<int>( B.columns() ) );
2486 const int K ( numeric_cast<int>( A.columns() ) );
2487 const int lda( numeric_cast<int>( A.spacing() ) );
2488 const int ldb( numeric_cast<int>( B.spacing() ) );
2489 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transpose flags are all derived from the storage order of
// the target type MT3: row-major/NoTrans for a row-major target, column-major/Trans
// otherwise.
2491 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2492 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2493 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2494 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel selected by UseDoublePrecisionKernel: C = scalar * A * B via
// cblas_dgemm. alpha is `scalar` and beta is 0.0, so the previous content of C does not
// contribute to the result.
2514 template<
typename MT3
2518 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2519 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2521 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
2527 const int M ( numeric_cast<int>( A.rows() ) );
2528 const int N ( numeric_cast<int>( B.columns() ) );
2529 const int K ( numeric_cast<int>( A.columns() ) );
2530 const int lda( numeric_cast<int>( A.spacing() ) );
2531 const int ldb( numeric_cast<int>( B.spacing() ) );
2532 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transpose flags follow the storage order of the target type MT3.
2534 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2535 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2536 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2537 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel selected by UseSinglePrecisionComplexKernel (which depends on the
// three matrix types only): C = scalar * A * B via cblas_cgemm. The scalar is converted to
// complex<float> for alpha; beta is (0,0), so the previous content of C does not contribute.
2557 template<
typename MT3
2561 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2562 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2564 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
2574 const int M ( numeric_cast<int>( A.rows() ) );
2575 const int N ( numeric_cast<int>( B.columns() ) );
2576 const int K ( numeric_cast<int>( A.columns() ) );
2577 const int lda( numeric_cast<int>( A.spacing() ) );
2578 const int ldb( numeric_cast<int>( B.spacing() ) );
2579 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
2580 const complex<float> alpha( scalar );
2581 const complex<float> beta ( 0.0F, 0.0F );
// Storage order and both transpose flags follow the storage order of the target type MT3.
2583 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2584 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2585 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2586 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel selected by UseDoublePrecisionComplexKernel (which depends on the
// three matrix types only): C = scalar * A * B via cblas_zgemm. The scalar is converted to
// complex<double> for alpha; beta is (0,0), so the previous content of C does not contribute.
2606 template<
typename MT3
2610 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2611 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2613 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
2623 const int M ( numeric_cast<int>( A.rows() ) );
2624 const int N ( numeric_cast<int>( B.columns() ) );
2625 const int K ( numeric_cast<int>( A.columns() ) );
2626 const int lda( numeric_cast<int>( A.spacing() ) );
2627 const int ldb( numeric_cast<int>( B.spacing() ) );
2628 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
2629 const complex<double> alpha( scalar );
2630 const complex<double> beta ( 0.0, 0.0 );
// Storage order and both transpose flags follow the storage order of the target type MT3.
2632 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2633 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2634 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2635 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled multiplication to a sparse matrix. The expression is first
// evaluated into a temporary whose type is OppositeType when SO is set and ResultType
// otherwise.
// NOTE(review): the statement that transfers `tmp` into `lhs` is not visible here.
2651 template<
typename MT
2653 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
2657 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2669 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix/matrix multiplication to a dense matrix.
// Fetches both operands of the wrapped multiplication, treats an empty target or a zero
// inner dimension as one special case, and otherwise forwards to either the default or
// the BLAS add-assign kernel selection.
// NOTE(review): the body of the special case, the evaluation of `left`/`right` into A and
// B, and the condition choosing between the two kernels are not visible here.
2686 template<
typename MT3
2688 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2695 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2696 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2698 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
2713 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2715 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default add-assign kernel; active when UseVectorizedDefaultKernel does
// not apply (DisableIf).
// NOTE(review): the function body is not visible in this excerpt.
2733 template<
typename MT3
2737 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2738 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default add-assign kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Same tiering as the plain assignment kernel (8, 4, 2, then 1 intrinsic vector(s) of
// IT::size elements per step); each store writes load(C) + accumulator * factor, i.e. the
// scaled product is added to the current content of C.
// NOTE(review): the declarations of `factor`, `a1`/`a2`, `b1`..`b4`, `i` and `j`, as well as
// several accumulator declarations, are on lines not visible in this excerpt.
2759 template<
typename MT3
2763 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2764 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2766 typedef IntrinsicTrait<ElementType> IT;
2768 const size_t M( A.rows() );
// N is taken from B.spacing() rather than B.columns() — presumably the padded row
// length, so whole intrinsic vectors can be processed; confirm.
2769 const size_t N( B.spacing() );
2770 const size_t K( A.columns() );
// Tier 1: eight intrinsic vectors per step, one row of C at a time.
2776 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2777 for(
size_t i=0UL; i<M; ++i ) {
2778 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2779 for(
size_t k=0UL; k<K; ++k ) {
2781 xmm1 = xmm1 + a1 * B.get(k,j );
2782 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2783 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2784 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2785 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2786 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2787 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2788 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2790 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2791 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
2792 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
2793 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
2794 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
2795 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
2796 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
2797 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
// Tier 2: four intrinsic vectors per step; rows are handled in pairs, then a single
// remaining row.
2800 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2802 for( ; (i+2UL) <= M; i+=2UL ) {
2803 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2804 for(
size_t k=0UL; k<K; ++k ) {
2811 xmm1 = xmm1 + a1 * b1;
2812 xmm2 = xmm2 + a1 * b2;
2813 xmm3 = xmm3 + a1 * b3;
2814 xmm4 = xmm4 + a1 * b4;
2815 xmm5 = xmm5 + a2 * b1;
2816 xmm6 = xmm6 + a2 * b2;
2817 xmm7 = xmm7 + a2 * b3;
2818 xmm8 = xmm8 + a2 * b4;
2820 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2821 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
2822 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
2823 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
2824 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
2825 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
2826 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
2827 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
2831 for(
size_t k=0UL; k<K; ++k ) {
2833 xmm1 = xmm1 + a1 * B.get(k,j );
2834 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2835 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2836 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2838 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2839 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
2840 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
2841 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
// Tier 3: two intrinsic vectors per step; rows in pairs, then a single remaining row.
2844 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2846 for( ; (i+2UL) <= M; i+=2UL ) {
2848 for(
size_t k=0UL; k<K; ++k ) {
2853 xmm1 = xmm1 + a1 * b1;
2854 xmm2 = xmm2 + a1 * b2;
2855 xmm3 = xmm3 + a2 * b1;
2856 xmm4 = xmm4 + a2 * b2;
2858 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2859 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
2860 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
2861 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
2865 for(
size_t k=0UL; k<K; ++k ) {
2867 xmm1 = xmm1 + a1 * B.get(k,j );
2868 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2870 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2871 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
// Tier 4: a single intrinsic vector per step (the enclosing j loop header is not
// visible in this excerpt); rows in pairs, then a single remaining row.
2876 for( ; (i+2UL) <= M; i+=2UL ) {
2878 for(
size_t k=0UL; k<K; ++k ) {
2880 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2881 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2883 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2884 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) + xmm2 * factor );
2888 for(
size_t k=0UL; k<K; ++k ) {
2889 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2891 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Vectorized default add-assign kernel for a column-major target (DenseMatrix<MT3,true>).
// One operand is copied into its OppositeType. Operand choice: if only MT5 is resizable,
// A is converted; if only MT4 is resizable, B is converted; otherwise the operand with
// fewer elements is converted (A on a tie).
// NOTE(review): the statements that consume `tmp` are not visible in this excerpt;
// presumably they re-dispatch addAssign on the converted expression — confirm.
2911 template<
typename MT3
2915 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2916 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2921 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2922 const typename MT4::OppositeType tmp( A );
2925 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2926 const typename MT5::OppositeType tmp( B );
2929 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2930 const typename MT4::OppositeType tmp( A );
2934 const typename MT5::OppositeType tmp( B );
// Fallback of the BLAS add-assign kernel selection: when UseDefaultKernel<MT3,MT4,MT5,ST2>
// holds, the call is simply forwarded to selectDefaultAddAssignKernel.
2954 template<
typename MT3
2958 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2959 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2961 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS add-assign kernel selected by UseSinglePrecisionKernel: C += scalar * A * B via
// cblas_sgemm. alpha is `scalar` and beta is 1.0F, so the product is accumulated onto the
// existing content of C.
2980 template<
typename MT3
2984 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2985 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2987 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
2993 const int M ( numeric_cast<int>( A.rows() ) );
2994 const int N ( numeric_cast<int>( B.columns() ) );
2995 const int K ( numeric_cast<int>( A.columns() ) );
2996 const int lda( numeric_cast<int>( A.spacing() ) );
2997 const int ldb( numeric_cast<int>( B.spacing() ) );
2998 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3000 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3001 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3002 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3003 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS add-assign kernel selected by UseDoublePrecisionKernel: C += scalar * A * B via
// cblas_dgemm. alpha is `scalar` and beta is 1.0, so the product is accumulated onto the
// existing content of C.
3023 template<
typename MT3
3027 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3028 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3030 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3036 const int M ( numeric_cast<int>( A.rows() ) );
3037 const int N ( numeric_cast<int>( B.columns() ) );
3038 const int K ( numeric_cast<int>( A.columns() ) );
3039 const int lda( numeric_cast<int>( A.spacing() ) );
3040 const int ldb( numeric_cast<int>( B.spacing() ) );
3041 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3043 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3044 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3045 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3046 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS add-assign kernel selected by UseSinglePrecisionComplexKernel (which depends on the
// three matrix types only): C += scalar * A * B via cblas_cgemm. The scalar is converted to
// complex<float> for alpha; beta is (1,0), so the product is accumulated onto C.
3066 template<
typename MT3
3070 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3071 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3073 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3083 const int M ( numeric_cast<int>( A.rows() ) );
3084 const int N ( numeric_cast<int>( B.columns() ) );
3085 const int K ( numeric_cast<int>( A.columns() ) );
3086 const int lda( numeric_cast<int>( A.spacing() ) );
3087 const int ldb( numeric_cast<int>( B.spacing() ) );
3088 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3089 const complex<float> alpha( scalar );
3090 const complex<float> beta ( 1.0F, 0.0F );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3092 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3093 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3094 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3095 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS add-assign kernel selected by UseDoublePrecisionComplexKernel (which depends on the
// three matrix types only): C += scalar * A * B via cblas_zgemm. The scalar is converted to
// complex<double> for alpha; beta is (1,0), so the product is accumulated onto C.
3115 template<
typename MT3
3119 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3120 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3122 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3132 const int M ( numeric_cast<int>( A.rows() ) );
3133 const int N ( numeric_cast<int>( B.columns() ) );
3134 const int K ( numeric_cast<int>( A.columns() ) );
3135 const int lda( numeric_cast<int>( A.spacing() ) );
3136 const int ldb( numeric_cast<int>( B.spacing() ) );
3137 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3138 const complex<double> alpha( scalar );
3139 const complex<double> beta ( 1.0, 0.0 );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3141 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3142 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3143 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3144 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix/matrix multiplication to a dense matrix.
// Fetches both operands of the wrapped multiplication, treats an empty target or a zero
// inner dimension as one special case, and otherwise forwards to either the default or
// the BLAS sub-assign kernel selection.
// NOTE(review): the body of the special case, the evaluation of `left`/`right` into A and
// B, and the condition choosing between the two kernels are not visible here.
3165 template<
typename MT3
3167 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
3174 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3175 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3177 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
3192 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3194 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default sub-assign kernel; active when UseVectorizedDefaultKernel does
// not apply (DisableIf).
// NOTE(review): the function body is not visible in this excerpt.
3212 template<
typename MT3
3216 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3217 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default sub-assign kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Same tiering as the plain assignment kernel (8, 4, 2, then 1 intrinsic vector(s) of
// IT::size elements per step); each store writes load(C) - accumulator * factor, i.e. the
// scaled product is subtracted from the current content of C.
// NOTE(review): the declarations of `factor`, `a1`/`a2`, `b1`..`b4`, `i` and `j`, as well as
// several accumulator declarations, are on lines not visible in this excerpt.
3238 template<
typename MT3
3242 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3243 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3245 typedef IntrinsicTrait<ElementType> IT;
3247 const size_t M( A.rows() );
// N is taken from B.spacing() rather than B.columns() — presumably the padded row
// length, so whole intrinsic vectors can be processed; confirm.
3248 const size_t N( B.spacing() );
3249 const size_t K( A.columns() );
// Tier 1: eight intrinsic vectors per step, one row of C at a time.
3255 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3256 for(
size_t i=0UL; i<M; ++i ) {
3257 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3258 for(
size_t k=0UL; k<K; ++k ) {
3260 xmm1 = xmm1 + a1 * B.get(k,j );
3261 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3262 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3263 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3264 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3265 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3266 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3267 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3269 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3270 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3271 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3272 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3273 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3274 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3275 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3276 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
// Tier 2: four intrinsic vectors per step; rows are handled in pairs, then a single
// remaining row.
3279 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3281 for( ; (i+2UL) <= M; i+=2UL ) {
3282 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3283 for(
size_t k=0UL; k<K; ++k ) {
3290 xmm1 = xmm1 + a1 * b1;
3291 xmm2 = xmm2 + a1 * b2;
3292 xmm3 = xmm3 + a1 * b3;
3293 xmm4 = xmm4 + a1 * b4;
3294 xmm5 = xmm5 + a2 * b1;
3295 xmm6 = xmm6 + a2 * b2;
3296 xmm7 = xmm7 + a2 * b3;
3297 xmm8 = xmm8 + a2 * b4;
3299 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3300 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3301 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3302 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3303 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3304 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3305 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3306 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
3310 for(
size_t k=0UL; k<K; ++k ) {
3312 xmm1 = xmm1 + a1 * B.get(k,j );
3313 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3314 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3315 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3317 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3318 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3319 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3320 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
// Tier 3: two intrinsic vectors per step; rows in pairs, then a single remaining row.
3323 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3325 for( ; (i+2UL) <= M; i+=2UL ) {
3327 for(
size_t k=0UL; k<K; ++k ) {
3332 xmm1 = xmm1 + a1 * b1;
3333 xmm2 = xmm2 + a1 * b2;
3334 xmm3 = xmm3 + a2 * b1;
3335 xmm4 = xmm4 + a2 * b2;
3337 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3338 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3339 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3340 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
3344 for(
size_t k=0UL; k<K; ++k ) {
3346 xmm1 = xmm1 + a1 * B.get(k,j );
3347 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3349 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3350 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
// Tier 4: a single intrinsic vector per step (the enclosing j loop header is not
// visible in this excerpt); rows in pairs, then a single remaining row.
3355 for( ; (i+2UL) <= M; i+=2UL ) {
3357 for(
size_t k=0UL; k<K; ++k ) {
3359 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3360 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3362 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3363 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) - xmm2 * factor );
3367 for(
size_t k=0UL; k<K; ++k ) {
3368 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3370 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Vectorized default sub-assign kernel for a column-major target (DenseMatrix<MT3,true>).
// One operand is copied into its OppositeType. Operand choice: if only MT5 is resizable,
// A is converted; if only MT4 is resizable, B is converted; otherwise the operand with
// fewer elements is converted (A on a tie).
// NOTE(review): the statements that consume `tmp` are not visible in this excerpt;
// presumably they re-dispatch subAssign on the converted expression — confirm.
3390 template<
typename MT3
3394 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3395 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3400 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3401 const typename MT4::OppositeType tmp( A );
3404 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3405 const typename MT5::OppositeType tmp( B );
3408 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3409 const typename MT4::OppositeType tmp( A );
3413 const typename MT5::OppositeType tmp( B );
// Fallback of the BLAS sub-assign kernel selection: when UseDefaultKernel<MT3,MT4,MT5,ST2>
// holds, the call is simply forwarded to selectDefaultSubAssignKernel.
3433 template<
typename MT3
3437 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3438 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3440 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS sub-assign kernel selected by UseSinglePrecisionKernel: C -= scalar * A * B via
// cblas_sgemm. The subtraction is expressed as alpha = -scalar with beta = 1.0F.
3459 template<
typename MT3
3463 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3464 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3466 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3472 const int M ( numeric_cast<int>( A.rows() ) );
3473 const int N ( numeric_cast<int>( B.columns() ) );
3474 const int K ( numeric_cast<int>( A.columns() ) );
3475 const int lda( numeric_cast<int>( A.spacing() ) );
3476 const int ldb( numeric_cast<int>( B.spacing() ) );
3477 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3479 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3480 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3481 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3482 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS sub-assign kernel selected by UseDoublePrecisionKernel: C -= scalar * A * B via
// cblas_dgemm. The subtraction is expressed as alpha = -scalar with beta = 1.0.
3502 template<
typename MT3
3506 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3507 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3509 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3515 const int M ( numeric_cast<int>( A.rows() ) );
3516 const int N ( numeric_cast<int>( B.columns() ) );
3517 const int K ( numeric_cast<int>( A.columns() ) );
3518 const int lda( numeric_cast<int>( A.spacing() ) );
3519 const int ldb( numeric_cast<int>( B.spacing() ) );
3520 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3522 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3523 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3524 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3525 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS sub-assign kernel selected by UseSinglePrecisionComplexKernel (which depends on the
// three matrix types only): C -= scalar * A * B via cblas_cgemm. alpha is the negated
// scalar converted to complex<float>; beta is (1,0).
3545 template<
typename MT3
3549 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3550 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3552 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3562 const int M ( numeric_cast<int>( A.rows() ) );
3563 const int N ( numeric_cast<int>( B.columns() ) );
3564 const int K ( numeric_cast<int>( A.columns() ) );
3565 const int lda( numeric_cast<int>( A.spacing() ) );
3566 const int ldb( numeric_cast<int>( B.spacing() ) );
3567 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3568 const complex<float> alpha( -scalar );
3569 const complex<float> beta ( 1.0F, 0.0F );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3571 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3572 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3573 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3574 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS sub-assign kernel selected by UseDoublePrecisionComplexKernel (which depends on the
// three matrix types only): C -= scalar * A * B via cblas_zgemm. alpha is the negated
// scalar converted to complex<double>; beta is (1,0).
3594 template<
typename MT3
3598 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3599 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3601 using boost::numeric_cast;
// Sizes and leading dimensions (spacing) are converted to int through numeric_cast.
3611 const int M ( numeric_cast<int>( A.rows() ) );
3612 const int N ( numeric_cast<int>( B.columns() ) );
3613 const int K ( numeric_cast<int>( A.columns() ) );
3614 const int lda( numeric_cast<int>( A.spacing() ) );
3615 const int ldb( numeric_cast<int>( B.spacing() ) );
3616 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3617 const complex<double> alpha( -scalar );
3618 const complex<double> beta ( 1.0, 0.0 );
// Storage order and both transpose flags follow the storage order of the target type MT3.
3620 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3621 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3622 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3623 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a DMatDMatMultExpr<T1,T2> expression object.
// Throws std::invalid_argument with the message "Matrix sizes do not match".
// NOTE(review): the function name, parameter list and the condition guarding the throw
// are not visible in this excerpt; presumably this is the multiplication operator and the
// check compares the inner dimensions — confirm.
3688 template<
typename T1
3690 inline const DMatDMatMultExpr<T1,T2>
3696 throw std::invalid_argument(
"Matrix sizes do not match" );
// Row view on a matrix/matrix multiplication expression. The row is not taken from the
// product itself: the row `index` of the left operand is multiplied by the complete right
// operand.
3723 template<
typename MT1
3725 inline typename RowExprTrait< DMatDMatMultExpr<MT1,MT2> >::Type
3726 row(
const DMatDMatMultExpr<MT1,MT2>& dm,
size_t index )
3730 return row( dm.leftOperand(), index ) * dm.rightOperand();
// Column view on a matrix/matrix multiplication expression. The column is not taken from
// the product itself: the complete left operand is multiplied by column `index` of the
// right operand.
3748 template<
typename MT1
3750 inline typename ColumnExprTrait< DMatDMatMultExpr<MT1,MT2> >::Type
3751 column(
const DMatDMatMultExpr<MT1,MT2>& dm,
size_t index )
3755 return dm.leftOperand() *
column( dm.rightOperand(), index );
// Expression trait for (MT1 * MT2) * dense vector. If MT1 and MT2 are row-major dense
// matrices and VT is a non-transpose dense vector, Type is the expression type of
// MT1 * ( MT2 * VT ), i.e. the matrix/vector product is nested on the right; otherwise
// Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in this excerpt.
3771 template<
typename MT1,
typename MT2,
typename VT >
3776 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3777 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3778 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3779 ,
typename DMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
3780 , INVALID_TYPE >::Type Type;
// Expression trait for (MT1 * MT2) * sparse vector. If MT1 and MT2 are row-major dense
// matrices and VT is a non-transpose sparse vector, Type is the expression type of
// MT1 * ( MT2 * VT ), where the inner product uses DMatSVecMultExprTrait and the outer one
// DMatDVecMultExprTrait; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in this excerpt.
3789 template<
typename MT1,
typename MT2,
typename VT >
3794 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3795 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3796 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3797 ,
typename DMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
3798 , INVALID_TYPE >::Type Type;
// Expression trait for transpose dense vector * (MT1 * MT2). If VT is a transpose dense
// vector and MT1, MT2 are row-major dense matrices, Type is the expression type of
// ( VT * MT1 ) * MT2, i.e. the vector/matrix product is nested on the left; otherwise
// Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in this excerpt.
3807 template<
typename VT,
typename MT1,
typename MT2 >
3812 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
3813 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3814 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3815 ,
typename TDVecDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3816 , INVALID_TYPE >::Type Type;
// Expression trait for transpose sparse vector * (MT1 * MT2). If VT is a transpose sparse
// vector and MT1, MT2 are row-major dense matrices, Type is the expression type of
// ( VT * MT1 ) * MT2, where the inner product uses TSVecDMatMultExprTrait and the outer
// one TDVecDMatMultExprTrait; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in this excerpt.
3825 template<
typename VT,
typename MT1,
typename MT2 >
3830 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
3831 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3832 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3833 ,
typename TDVecDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3834 , INVALID_TYPE >::Type Type;
// Trait whose Type is the expression type of ( row of const MT1 ) * MT2, obtained via
// MultExprTrait.
// NOTE(review): the struct header is not visible in this excerpt; presumably this is the
// RowExprTrait specialization for DMatDMatMultExpr<MT1,MT2> — confirm.
3843 template<
typename MT1,
typename MT2 >
3848 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Trait whose Type is the expression type of MT1 * ( column of const MT2 ), obtained via
// MultExprTrait.
// NOTE(review): the struct header is not visible in this excerpt; presumably this is the
// ColumnExprTrait specialization for DMatDMatMultExpr<MT1,MT2> — confirm.
3857 template<
typename MT1,
typename MT2 >
3862 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;