22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
97 template<
typename MT1
// Shorthand typedefs for the nested ResultType and CompositeType of the two
// operand types MT1 and MT2.
105 typedef typename MT1::ResultType
RT1;
106 typedef typename MT2::ResultType
RT2;
107 typedef typename MT1::CompositeType
CT1;
108 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate over three types (T1, T2, T3). It gates the
// cblas_sgemm-based selectBlas*Kernel overloads through EnableIf and is
// negated inside UseDefaultKernel.
// NOTE(review): the member list is not visible in this excerpt -- confirm the
// exact condition against the full header.
116 template<
typename T1,
typename T2,
typename T3 >
117 struct UseSinglePrecisionKernel {
// Compile-time predicate over three types (T1, T2, T3). It gates the
// cblas_dgemm-based selectBlas*Kernel overloads through EnableIf and is
// negated inside UseDefaultKernel.
// NOTE(review): the member list is not visible in this excerpt -- confirm the
// exact condition against the full header.
130 template<
typename T1,
typename T2,
typename T3 >
131 struct UseDoublePrecisionKernel {
// Compile-time predicate: value is nonzero only if the ElementType of all
// three types T1, T2 and T3 is exactly complex<float>. Used via EnableIf to
// select the cblas_cgemm-based kernels.
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseSinglePrecisionComplexKernel {
147 typedef complex<float> Type;
148 enum { value = IsSame<typename T1::ElementType,Type>::value &&
149 IsSame<typename T2::ElementType,Type>::value &&
150 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: value is nonzero only if the ElementType of all
// three types T1, T2 and T3 is exactly complex<double>. Used via EnableIf to
// select the cblas_zgemm-based kernels.
161 template<
typename T1,
typename T2,
typename T3 >
162 struct UseDoublePrecisionComplexKernel {
163 typedef complex<double> Type;
164 enum { value = IsSame<typename T1::ElementType,Type>::value &&
165 IsSame<typename T2::ElementType,Type>::value &&
166 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS fallback: value is nonzero
// when BLAZE_BLAS_MODE evaluates to false, or when none of the four
// precision-specific predicates (single, double, complex<float>,
// complex<double>) holds for the type triple.
176 template<
typename T1,
typename T2,
typename T3 >
177 struct UseDefaultKernel {
178 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
179 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
180 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
181 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the intrinsic-based default kernels: value is
// nonzero only if all three types report `vectorizable`, T2 and T3 share T1's
// ElementType, and IntrinsicTrait for that element type reports both
// `addition` and `multiplication`.
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseVectorizedDefaultKernel {
193 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
194 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
195 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
196 IntrinsicTrait<typename T1::ElementType>::addition &&
197 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flag: the expression object itself is reported as not
// vectorizable (value 0).
228 enum { vectorizable = 0 };
// Fragment of the element-access computation: accumulates products of
// lhs_(i,*) and rhs_(*,j) into `tmp`, two k-indices per loop iteration.
// NOTE(review): the signature, the initialization of `tmp`, the first
// statement of the loop body and the tail statement are not visible in this
// excerpt -- confirm against the full header.
// Only executed when the inner dimension (lhs_.columns()) is non-zero.
258 if(
lhs_.columns() != 0UL ) {
// `end` is the largest odd value that is <= lhs_.columns():
// (columns-1) is rounded down to an even number, then 1 is added.
259 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// Steps through k = 1, 3, 5, ... below `end`; the visible statement handles
// index k+1 of each pair.
261 for(
size_t k=1UL; k<end; k+=2UL ) {
263 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// True exactly when lhs_.columns() is even, i.e. one index (`end`) remains
// after the paired loop.
265 if( end <
lhs_.columns() ) {
// The column count reported here is taken from the right-hand operand rhs_.
293 return rhs_.columns();
// Returns true if either operand (lhs_ or rhs_) reports that it can alias
// with the given address `alias`; the query is simply forwarded to both.
// NOTE(review): the function signature line is not visible in this excerpt.
323 template<
typename T >
325 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns true if either operand (lhs_ or rhs_) reports that it is aliased
// with the given address `alias`; the query is simply forwarded to both.
// NOTE(review): the function signature line is not visible in this excerpt.
335 template<
typename T >
337 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Fragment of the assignment entry point for the matrix product `rhs` to the
// target `lhs`. Degenerate shapes are checked first, then the work is handed
// to either the default or the BLAS kernel selector with the target (~lhs) and
// the two operands A and B.
// NOTE(review): the signature, the bodies of the two early branches, the
// definitions of A and B and the condition choosing between the two kernel
// calls are not visible in this excerpt.
357 template<
typename MT3
// Case 1: the target has no rows or no columns.
366 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
369 else if( rhs.
lhs_.columns() == 0UL ) {
385 DMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
387 DMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Scalar (non-intrinsic) default kernel computing C = A * B.
//   C : target matrix, written through C(i,j)
//   A : left operand, read through A(i,k)
//   B : right operand, read through B(k,j)
// For every row i the row of C is first initialized with the k = 0 term and
// then the remaining terms k = 1 .. K-1 are accumulated, so B is traversed
// row by row in the innermost loop.
// The k = 0 term is read unconditionally, so the kernel requires K >= 1.
// NOTE(review): the visible caller tests rhs.lhs_.columns() == 0UL before
// dispatching; confirm that branch never reaches this kernel.
// NOTE(review): the return type / enabling condition line is not visible.
405 template<
typename MT3
409 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
411 const size_t M( A.rows() );
412 const size_t N( B.columns() );
413 const size_t K( A.columns() );
415 for(
size_t i=0UL; i<M; ++i ) {
// Initialize row i of C with the first term of each dot product.
416 for(
size_t j=0UL; j<N; ++j ) {
417 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulate the remaining terms.
419 for(
size_t k=1UL; k<K; ++k ) {
420 for(
size_t j=0UL; j<N; ++j ) {
421 C(i,j) += A(i,k) * B(k,j);
// Intrinsic-based default kernel computing C = A * B for a target of type
// DenseMatrix<MT3,false>; only enabled when UseVectorizedDefaultKernel holds
// for (MT3, MT4, MT5).
// The columns are processed in blocks of IT::size*8, IT::size*4 and
// IT::size*2 intrinsic-wide groups (further remainder handling follows); the
// 4- and 2-wide blocks additionally process two rows of A per iteration.
// Each xmm accumulator holds one IT::size-wide slice of a row of C and is
// written back with store() after the k loop.
// NOTE(review): several lines (declarations of j, i, a1, a2, b1..b4 and some
// loop headers) are not visible in this excerpt.
443 template<
typename MT3
446 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
447 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
449 typedef IntrinsicTrait<ElementType> IT;
451 const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably spacing() includes padding so that whole
// intrinsic vectors can be processed -- confirm.
452 const size_t N( B.spacing() );
453 const size_t K( A.columns() );
// Column block of 8 intrinsic vectors, one row of A at a time.
457 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
458 for(
size_t i=0UL; i<M; ++i ) {
// NOTE(review): the accumulators are declared without an initializer; this
// assumes IntrinsicType default-constructs to zero -- confirm.
459 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
460 for(
size_t k=0UL; k<K; ++k ) {
462 xmm1 = xmm1 + a1 * B.get(k,j );
463 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
464 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
465 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
466 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
467 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
468 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
469 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
471 store( &(~C)(i,j ), xmm1 );
472 store( &(~C)(i,j+IT::size ), xmm2 );
473 store( &(~C)(i,j+IT::size*2UL), xmm3 );
474 store( &(~C)(i,j+IT::size*3UL), xmm4 );
475 store( &(~C)(i,j+IT::size*4UL), xmm5 );
476 store( &(~C)(i,j+IT::size*5UL), xmm6 );
477 store( &(~C)(i,j+IT::size*6UL), xmm7 );
478 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column block of 4 intrinsic vectors; rows are processed in pairs (i, i+1),
// xmm1..xmm4 belonging to row i and xmm5..xmm8 to row i+1.
481 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
483 for( ; (i+2UL) <= M; i+=2UL ) {
484 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
485 for(
size_t k=0UL; k<K; ++k ) {
492 xmm1 = xmm1 + a1 * b1;
493 xmm2 = xmm2 + a1 * b2;
494 xmm3 = xmm3 + a1 * b3;
495 xmm4 = xmm4 + a1 * b4;
496 xmm5 = xmm5 + a2 * b1;
497 xmm6 = xmm6 + a2 * b2;
498 xmm7 = xmm7 + a2 * b3;
499 xmm8 = xmm8 + a2 * b4;
501 store( &(~C)(i ,j ), xmm1 );
502 store( &(~C)(i ,j+IT::size ), xmm2 );
503 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
504 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
505 store( &(~C)(i+1UL,j ), xmm5 );
506 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
507 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
508 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row variant of the 4-vector block.
// NOTE(review): presumably handles a leftover odd row; the guarding
// condition is not visible.
512 for(
size_t k=0UL; k<K; ++k ) {
514 xmm1 = xmm1 + a1 * B.get(k,j );
515 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
516 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
517 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
519 store( &(~C)(i,j ), xmm1 );
520 store( &(~C)(i,j+IT::size ), xmm2 );
521 store( &(~C)(i,j+IT::size*2UL), xmm3 );
522 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column block of 2 intrinsic vectors, rows again in pairs.
525 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
527 for( ; (i+2UL) <= M; i+=2UL ) {
529 for(
size_t k=0UL; k<K; ++k ) {
534 xmm1 = xmm1 + a1 * b1;
535 xmm2 = xmm2 + a1 * b2;
536 xmm3 = xmm3 + a2 * b1;
537 xmm4 = xmm4 + a2 * b2;
539 store( &(~C)(i ,j ), xmm1 );
540 store( &(~C)(i ,j+IT::size), xmm2 );
541 store( &(~C)(i+1UL,j ), xmm3 );
542 store( &(~C)(i+1UL,j+IT::size), xmm4 );
546 for(
size_t k=0UL; k<K; ++k ) {
548 xmm1 = xmm1 + a1 * B.get(k,j );
549 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
551 store( &(~C)(i,j ), xmm1 );
552 store( &(~C)(i,j+IT::size), xmm2 );
// Single intrinsic vector per step; set( A(i,k) ) supplies the A element as
// an intrinsic operand.
// NOTE(review): the enclosing j loop header is not visible.
557 for( ; (i+2UL) <= M; i+=2UL ) {
559 for(
size_t k=0UL; k<K; ++k ) {
561 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
562 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
564 store( &(~C)(i ,j), xmm1 );
565 store( &(~C)(i+1UL,j), xmm2 );
569 for(
size_t k=0UL; k<K; ++k ) {
570 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
572 store( &(~C)(i,j), xmm1 );
// Vectorized default kernel overload for a target of type
// DenseMatrix<MT3,true>. One of the two operands is first converted into a
// temporary of its OppositeType; which one is decided as follows:
//   - A is not resizable but B is      -> convert A
//   - A is resizable but B is not      -> convert B
//   - otherwise, the operand with the smaller (or equal) element count
//     rows()*columns() is converted (A on a tie).
// NOTE(review): what is done with `tmp` in each branch is not visible in this
// excerpt.
593 template<
typename MT3
596 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
597 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
602 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
603 const typename MT4::OppositeType tmp( A );
606 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
607 const typename MT5::OppositeType tmp( B );
610 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
611 const typename MT4::OppositeType tmp( A );
615 const typename MT5::OppositeType tmp( B );
// Fallback overload of the BLAS kernel selector: when UseDefaultKernel holds
// for (MT3, MT4, MT5) no BLAS routine is used and the call is forwarded
// unchanged to selectDefaultAssignKernel.
635 template<
typename MT3
638 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
639 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
641 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for C = A * B, enabled when UseSinglePrecisionKernel holds.
// Dimensions and spacings are converted to int with boost::numeric_cast and
// passed to cblas_sgemm with alpha = 1.0F and beta = 0.0F.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type): row-major targets use
// CblasRowMajor / CblasNoTrans, all others CblasColMajor / CblasTrans.
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
661 template<
typename MT3
664 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
665 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
667 using boost::numeric_cast;
673 const int M ( numeric_cast<int>( A.rows() ) );
674 const int N ( numeric_cast<int>( B.columns() ) );
675 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from spacing().
676 const int lda( numeric_cast<int>( A.spacing() ) );
677 const int ldb( numeric_cast<int>( B.spacing() ) );
678 const int ldc( numeric_cast<int>( C.spacing() ) );
680 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
681 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
682 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
683 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when UseDoublePrecisionKernel holds.
// Dimensions and spacings are converted to int with boost::numeric_cast and
// passed to cblas_dgemm with alpha = 1.0 and beta = 0.0.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type): row-major targets use
// CblasRowMajor / CblasNoTrans, all others CblasColMajor / CblasTrans.
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
704 template<
typename MT3
707 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
708 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
710 using boost::numeric_cast;
716 const int M ( numeric_cast<int>( A.rows() ) );
717 const int N ( numeric_cast<int>( B.columns() ) );
718 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from spacing().
719 const int lda( numeric_cast<int>( A.spacing() ) );
720 const int ldb( numeric_cast<int>( B.spacing() ) );
721 const int ldc( numeric_cast<int>( C.spacing() ) );
723 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
724 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
725 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
726 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when UseSinglePrecisionComplexKernel
// holds (all element types are complex<float>).
// Dimensions and spacings are converted to int with boost::numeric_cast; the
// complex scaling factors alpha = (1,0) and beta = (0,0) are passed to
// cblas_cgemm by address.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
747 template<
typename MT3
750 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
751 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
753 using boost::numeric_cast;
762 const int M ( numeric_cast<int>( A.rows() ) );
763 const int N ( numeric_cast<int>( B.columns() ) );
764 const int K ( numeric_cast<int>( A.columns() ) );
765 const int lda( numeric_cast<int>( A.spacing() ) );
766 const int ldb( numeric_cast<int>( B.spacing() ) );
767 const int ldc( numeric_cast<int>( C.spacing() ) );
768 const complex<float> alpha( 1.0F, 0.0F );
769 const complex<float> beta ( 0.0F, 0.0F );
771 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
772 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
773 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
774 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when UseDoublePrecisionComplexKernel
// holds (all element types are complex<double>).
// Dimensions and spacings are converted to int with boost::numeric_cast; the
// complex scaling factors alpha = (1,0) and beta = (0,0) are passed to
// cblas_zgemm by address.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
795 template<
typename MT3
798 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
799 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
801 using boost::numeric_cast;
810 const int M ( numeric_cast<int>( A.rows() ) );
811 const int N ( numeric_cast<int>( B.columns() ) );
812 const int K ( numeric_cast<int>( A.columns() ) );
813 const int lda( numeric_cast<int>( A.spacing() ) );
814 const int ldb( numeric_cast<int>( B.spacing() ) );
815 const int ldc( numeric_cast<int>( C.spacing() ) );
816 const complex<double> alpha( 1.0, 0.0 );
817 const complex<double> beta ( 0.0, 0.0 );
819 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
820 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
821 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
822 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a function that first evaluates the product expression `rhs`
// into a temporary `tmp`. The temporary's type is chosen at compile time from
// OppositeType and ResultType through SelectType, keyed on SO.
// NOTE(review): the signature and the use of `tmp` are not visible in this
// excerpt; presumably SO is the storage order of the target -- confirm.
840 template<
typename MT
846 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
858 const TmpType tmp( rhs );
// Fragment of the addition-assignment entry point (lhs += product). If the
// target has no rows or columns, or the inner dimension of the product is
// zero, the first branch is taken; otherwise the work is handed to the
// default or the BLAS add-assign kernel selector.
// NOTE(review): the signature, the body of the early branch, the definitions
// of A and B and the condition choosing between the two kernel calls are not
// visible in this excerpt.
877 template<
typename MT3
886 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
901 DMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
903 DMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Scalar (non-intrinsic) default kernel computing C += A * B; disabled when
// UseVectorizedDefaultKernel holds for (MT3, MT4, MT5).
//   C : target matrix, updated through C(i,j)
//   A : left operand, read through A(i,k)
//   B : right operand, read through B(k,j)
// The innermost j loop is unrolled by two.
922 template<
typename MT3
925 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
926 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
928 const size_t M( A.rows() );
929 const size_t N( B.columns() );
930 const size_t K( A.columns() );
// `end` is N rounded down to the nearest even number.
933 const size_t end( N &
size_t(-2) );
935 for(
size_t i=0UL; i<M; ++i ) {
936 for(
size_t k=0UL; k<K; ++k ) {
937 for(
size_t j=0UL; j<end; j+=2UL ) {
938 C(i,j ) += A(i,k) * B(k,j );
939 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Handles column index `end`.
// NOTE(review): presumably guarded by an `end < N` check (odd N) on a line
// not visible in this excerpt -- confirm.
942 C(i,end) += A(i,k) * B(k,end);
// Intrinsic-based default kernel computing C += A * B for a target of type
// DenseMatrix<MT3,false>; only enabled when UseVectorizedDefaultKernel holds
// for (MT3, MT4, MT5).
// The columns are processed in blocks of IT::size*8, IT::size*4 and
// IT::size*2 intrinsic-wide groups (further remainder handling follows); the
// 4- and 2-wide blocks additionally process two rows of A per iteration.
// Each xmm accumulator is updated over k and written back to C with store().
// NOTE(review): the declarations and initial values of the xmm accumulators
// (presumably loaded from C), of j, i, a1, a2, b1..b4 and some loop headers
// are not visible in this excerpt.
964 template<
typename MT3
967 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
968 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
970 typedef IntrinsicTrait<ElementType> IT;
972 const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
973 const size_t N( B.spacing() );
974 const size_t K( A.columns() );
// Column block of 8 intrinsic vectors, one row of A at a time.
978 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
979 for(
size_t i=0UL; i<M; ++i ) {
988 for(
size_t k=0UL; k<K; ++k ) {
990 xmm1 = xmm1 + a1 * B.get(k,j );
991 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
992 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
993 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
994 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
995 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
996 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
997 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
999 store( &(~C)(i,j ), xmm1 );
1000 store( &(~C)(i,j+IT::size ), xmm2 );
1001 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1002 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1003 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1004 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1005 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1006 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column block of 4 intrinsic vectors; rows are processed in pairs (i, i+1),
// xmm1..xmm4 belonging to row i and xmm5..xmm8 to row i+1.
1009 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1011 for( ; (i+2UL) <= M; i+=2UL ) {
1020 for(
size_t k=0UL; k<K; ++k ) {
1027 xmm1 = xmm1 + a1 * b1;
1028 xmm2 = xmm2 + a1 * b2;
1029 xmm3 = xmm3 + a1 * b3;
1030 xmm4 = xmm4 + a1 * b4;
1031 xmm5 = xmm5 + a2 * b1;
1032 xmm6 = xmm6 + a2 * b2;
1033 xmm7 = xmm7 + a2 * b3;
1034 xmm8 = xmm8 + a2 * b4;
1036 store( &(~C)(i ,j ), xmm1 );
1037 store( &(~C)(i ,j+IT::size ), xmm2 );
1038 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1039 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1040 store( &(~C)(i+1UL,j ), xmm5 );
1041 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1042 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1043 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row variant of the 4-vector block.
// NOTE(review): presumably handles a leftover odd row; the guarding
// condition is not visible.
1050 for(
size_t k=0UL; k<K; ++k ) {
1052 xmm1 = xmm1 + a1 * B.get(k,j );
1053 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1054 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1055 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1057 store( &(~C)(i,j ), xmm1 );
1058 store( &(~C)(i,j+IT::size ), xmm2 );
1059 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1060 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column block of 2 intrinsic vectors, rows again in pairs.
1063 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1065 for( ; (i+2UL) <= M; i+=2UL ) {
1070 for(
size_t k=0UL; k<K; ++k ) {
1075 xmm1 = xmm1 + a1 * b1;
1076 xmm2 = xmm2 + a1 * b2;
1077 xmm3 = xmm3 + a2 * b1;
1078 xmm4 = xmm4 + a2 * b2;
1080 store( &(~C)(i ,j ), xmm1 );
1081 store( &(~C)(i ,j+IT::size), xmm2 );
1082 store( &(~C)(i+1UL,j ), xmm3 );
1083 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1088 for(
size_t k=0UL; k<K; ++k ) {
1090 xmm1 = xmm1 + a1 * B.get(k,j );
1091 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1093 store( &(~C)(i,j ), xmm1 );
1094 store( &(~C)(i,j+IT::size), xmm2 );
// Single intrinsic vector per step; set( A(i,k) ) supplies the A element as
// an intrinsic operand.
// NOTE(review): the enclosing j loop header is not visible.
1099 for( ; (i+2UL) <= M; i+=2UL ) {
1102 for(
size_t k=0UL; k<K; ++k ) {
1104 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1105 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1107 store( &(~C)(i ,j), xmm1 );
1108 store( &(~C)(i+1UL,j), xmm2 );
1112 for(
size_t k=0UL; k<K; ++k ) {
1113 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
1115 store( &(~C)(i,j), xmm1 );
// Vectorized default add-assign kernel overload for a target of type
// DenseMatrix<MT3,true>. One of the two operands is first converted into a
// temporary of its OppositeType; which one is decided as follows:
//   - A is not resizable but B is      -> convert A
//   - A is resizable but B is not      -> convert B
//   - otherwise, the operand with the smaller (or equal) element count
//     rows()*columns() is converted (A on a tie).
// NOTE(review): what is done with `tmp` in each branch is not visible in this
// excerpt.
1136 template<
typename MT3
1139 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1140 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1145 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1146 const typename MT4::OppositeType tmp( A );
1149 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1150 const typename MT5::OppositeType tmp( B );
1153 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1154 const typename MT4::OppositeType tmp( A );
1158 const typename MT5::OppositeType tmp( B );
// Fallback overload of the BLAS add-assign kernel selector: when
// UseDefaultKernel holds for (MT3, MT4, MT5) no BLAS routine is used and the
// call is forwarded unchanged to selectDefaultAddAssignKernel.
1179 template<
typename MT3
1182 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1183 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1185 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B, enabled when UseSinglePrecisionKernel holds.
// Dimensions and spacings are converted to int with boost::numeric_cast and
// passed to cblas_sgemm with alpha = 1.0F and beta = 1.0F (beta = 1 keeps the
// existing content of C in the sum).
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1205 template<
typename MT3
1208 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1209 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1211 using boost::numeric_cast;
1217 const int M ( numeric_cast<int>( A.rows() ) );
1218 const int N ( numeric_cast<int>( B.columns() ) );
1219 const int K ( numeric_cast<int>( A.columns() ) );
1220 const int lda( numeric_cast<int>( A.spacing() ) );
1221 const int ldb( numeric_cast<int>( B.spacing() ) );
1222 const int ldc( numeric_cast<int>( C.spacing() ) );
1224 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1225 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1226 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1227 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when UseDoublePrecisionKernel holds.
// Dimensions and spacings are converted to int with boost::numeric_cast and
// passed to cblas_dgemm with alpha = 1.0 and beta = 1.0 (beta = 1 keeps the
// existing content of C in the sum).
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1248 template<
typename MT3
1251 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1252 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1254 using boost::numeric_cast;
1260 const int M ( numeric_cast<int>( A.rows() ) );
1261 const int N ( numeric_cast<int>( B.columns() ) );
1262 const int K ( numeric_cast<int>( A.columns() ) );
1263 const int lda( numeric_cast<int>( A.spacing() ) );
1264 const int ldb( numeric_cast<int>( B.spacing() ) );
1265 const int ldc( numeric_cast<int>( C.spacing() ) );
1267 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1268 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1269 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1270 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when UseSinglePrecisionComplexKernel
// holds (all element types are complex<float>).
// Dimensions and spacings are converted to int with boost::numeric_cast; the
// complex scaling factors alpha = (1,0) and beta = (1,0) are passed to
// cblas_cgemm by address.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1291 template<
typename MT3
1294 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1295 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1297 using boost::numeric_cast;
1306 const int M ( numeric_cast<int>( A.rows() ) );
1307 const int N ( numeric_cast<int>( B.columns() ) );
1308 const int K ( numeric_cast<int>( A.columns() ) );
1309 const int lda( numeric_cast<int>( A.spacing() ) );
1310 const int ldb( numeric_cast<int>( B.spacing() ) );
1311 const int ldc( numeric_cast<int>( C.spacing() ) );
1312 const complex<float> alpha( 1.0F, 0.0F );
1313 const complex<float> beta ( 1.0F, 0.0F );
1315 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1316 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1317 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1318 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when UseDoublePrecisionComplexKernel
// holds (all element types are complex<double>).
// Dimensions and spacings are converted to int with boost::numeric_cast; the
// complex scaling factors alpha = (1,0) and beta = (1,0) are passed to
// cblas_zgemm by address.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1339 template<
typename MT3
1342 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1343 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1345 using boost::numeric_cast;
1354 const int M ( numeric_cast<int>( A.rows() ) );
1355 const int N ( numeric_cast<int>( B.columns() ) );
1356 const int K ( numeric_cast<int>( A.columns() ) );
1357 const int lda( numeric_cast<int>( A.spacing() ) );
1358 const int ldb( numeric_cast<int>( B.spacing() ) );
1359 const int ldc( numeric_cast<int>( C.spacing() ) );
1360 const complex<double> alpha( 1.0, 0.0 );
1361 const complex<double> beta ( 1.0, 0.0 );
1363 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1364 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1365 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1366 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of the subtraction-assignment entry point (lhs -= product). If the
// target has no rows or columns, or the inner dimension of the product is
// zero, the first branch is taken; otherwise the work is handed to the
// default or the BLAS sub-assign kernel selector.
// NOTE(review): the signature, the body of the early branch, the definitions
// of A and B and the condition choosing between the two kernel calls are not
// visible in this excerpt.
1389 template<
typename MT3
1398 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1413 DMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1415 DMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Scalar (non-intrinsic) default kernel computing C -= A * B; disabled when
// UseVectorizedDefaultKernel holds for (MT3, MT4, MT5).
//   C : target matrix, updated through C(i,j)
//   A : left operand, read through A(i,k)
//   B : right operand, read through B(k,j)
// The innermost j loop is unrolled by two.
1434 template<
typename MT3
1437 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1438 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1440 const size_t M( A.rows() );
1441 const size_t N( B.columns() );
1442 const size_t K( A.columns() );
// `end` is N rounded down to the nearest even number.
1445 const size_t end( N &
size_t(-2) );
1447 for(
size_t i=0UL; i<M; ++i ) {
1448 for(
size_t k=0UL; k<K; ++k ) {
1449 for(
size_t j=0UL; j<end; j+=2UL ) {
1450 C(i,j ) -= A(i,k) * B(k,j );
1451 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Handles column index `end`.
// NOTE(review): presumably guarded by an `end < N` check (odd N) on a line
// not visible in this excerpt -- confirm.
1454 C(i,end) -= A(i,k) * B(k,end);
// Intrinsic-based default kernel computing C -= A * B for a target of type
// DenseMatrix<MT3,false>; only enabled when UseVectorizedDefaultKernel holds
// for (MT3, MT4, MT5).
// The columns are processed in blocks of IT::size*8, IT::size*4 and
// IT::size*2 intrinsic-wide groups (further remainder handling follows); the
// 4- and 2-wide blocks additionally process two rows of A per iteration.
// Each xmm accumulator has the products subtracted over k and is written back
// to C with store().
// NOTE(review): the declarations and initial values of the xmm accumulators
// (presumably loaded from C), of j, i, a1, a2, b1..b4 and some loop headers
// are not visible in this excerpt.
1476 template<
typename MT3
1479 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1480 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1482 typedef IntrinsicTrait<ElementType> IT;
1484 const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
1485 const size_t N( B.spacing() );
1486 const size_t K( A.columns() );
// Column block of 8 intrinsic vectors, one row of A at a time.
1490 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1491 for(
size_t i=0UL; i<M; ++i ) {
1500 for(
size_t k=0UL; k<K; ++k ) {
1502 xmm1 = xmm1 - a1 * B.get(k,j );
1503 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1504 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1505 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1506 xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1507 xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1508 xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1509 xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1511 store( &(~C)(i,j ), xmm1 );
1512 store( &(~C)(i,j+IT::size ), xmm2 );
1513 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1514 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1515 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1516 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1517 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1518 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column block of 4 intrinsic vectors; rows are processed in pairs (i, i+1),
// xmm1..xmm4 belonging to row i and xmm5..xmm8 to row i+1.
1521 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1523 for( ; (i+2UL) <= M; i+=2UL ) {
1532 for(
size_t k=0UL; k<K; ++k ) {
1539 xmm1 = xmm1 - a1 * b1;
1540 xmm2 = xmm2 - a1 * b2;
1541 xmm3 = xmm3 - a1 * b3;
1542 xmm4 = xmm4 - a1 * b4;
1543 xmm5 = xmm5 - a2 * b1;
1544 xmm6 = xmm6 - a2 * b2;
1545 xmm7 = xmm7 - a2 * b3;
1546 xmm8 = xmm8 - a2 * b4;
1548 store( &(~C)(i ,j ), xmm1 );
1549 store( &(~C)(i ,j+IT::size ), xmm2 );
1550 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1551 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1552 store( &(~C)(i+1UL,j ), xmm5 );
1553 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1554 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1555 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row variant of the 4-vector block.
// NOTE(review): presumably handles a leftover odd row; the guarding
// condition is not visible.
1562 for(
size_t k=0UL; k<K; ++k ) {
1564 xmm1 = xmm1 - a1 * B.get(k,j );
1565 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1566 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1567 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1569 store( &(~C)(i,j ), xmm1 );
1570 store( &(~C)(i,j+IT::size ), xmm2 );
1571 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1572 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column block of 2 intrinsic vectors, rows again in pairs.
1575 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1577 for( ; (i+2UL) <= M; i+=2UL ) {
1582 for(
size_t k=0UL; k<K; ++k ) {
1587 xmm1 = xmm1 - a1 * b1;
1588 xmm2 = xmm2 - a1 * b2;
1589 xmm3 = xmm3 - a2 * b1;
1590 xmm4 = xmm4 - a2 * b2;
1592 store( &(~C)(i ,j ), xmm1 );
1593 store( &(~C)(i ,j+IT::size), xmm2 );
1594 store( &(~C)(i+1UL,j ), xmm3 );
1595 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1600 for(
size_t k=0UL; k<K; ++k ) {
1602 xmm1 = xmm1 - a1 * B.get(k,j );
1603 xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1605 store( &(~C)(i,j ), xmm1 );
1606 store( &(~C)(i,j+IT::size), xmm2 );
// Single intrinsic vector per step; set( A(i,k) ) supplies the A element as
// an intrinsic operand.
// NOTE(review): the enclosing j loop header is not visible.
1611 for( ; (i+2UL) <= M; i+=2UL ) {
1614 for(
size_t k=0UL; k<K; ++k ) {
1616 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1617 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1619 store( &(~C)(i ,j), xmm1 );
1620 store( &(~C)(i+1UL,j), xmm2 );
1624 for(
size_t k=0UL; k<K; ++k ) {
1625 xmm1 = xmm1 -
set( A(i,k) ) * B.get(k,j);
1627 store( &(~C)(i,j), xmm1 );
// Vectorized default sub-assign kernel overload for a target of type
// DenseMatrix<MT3,true>. One of the two operands is first converted into a
// temporary of its OppositeType; which one is decided as follows:
//   - A is not resizable but B is      -> convert A
//   - A is resizable but B is not      -> convert B
//   - otherwise, the operand with the smaller (or equal) element count
//     rows()*columns() is converted (A on a tie).
// NOTE(review): what is done with `tmp` in each branch is not visible in this
// excerpt.
1648 template<
typename MT3
1651 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1652 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1657 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1658 const typename MT4::OppositeType tmp( A );
1661 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1662 const typename MT5::OppositeType tmp( B );
1665 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1666 const typename MT4::OppositeType tmp( A );
1670 const typename MT5::OppositeType tmp( B );
// Fallback overload of the BLAS sub-assign kernel selector: when
// UseDefaultKernel holds for (MT3, MT4, MT5) no BLAS routine is used and the
// call is forwarded unchanged to selectDefaultSubAssignKernel.
1691 template<
typename MT3
1694 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1695 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1697 selectDefaultSubAssignKernel( C, A, B );
// BLAS kernel for C -= A * B, enabled when UseSinglePrecisionKernel holds.
// Dimensions and spacings are converted to int with boost::numeric_cast and
// passed to cblas_sgemm with alpha = -1.0F (subtracts the product) and
// beta = 1.0F (keeps the existing content of C).
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1717 template<
typename MT3
1720 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1721 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1723 using boost::numeric_cast;
1729 const int M ( numeric_cast<int>( A.rows() ) );
1730 const int N ( numeric_cast<int>( B.columns() ) );
1731 const int K ( numeric_cast<int>( A.columns() ) );
1732 const int lda( numeric_cast<int>( A.spacing() ) );
1733 const int ldb( numeric_cast<int>( B.spacing() ) );
1734 const int ldc( numeric_cast<int>( C.spacing() ) );
1736 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1737 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1738 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1739 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled when UseDoublePrecisionKernel holds.
// Dimensions and spacings are converted to int with boost::numeric_cast and
// passed to cblas_dgemm with alpha = -1.0 (subtracts the product) and
// beta = 1.0 (keeps the existing content of C).
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1760 template<
typename MT3
1763 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1764 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1766 using boost::numeric_cast;
1772 const int M ( numeric_cast<int>( A.rows() ) );
1773 const int N ( numeric_cast<int>( B.columns() ) );
1774 const int K ( numeric_cast<int>( A.columns() ) );
1775 const int lda( numeric_cast<int>( A.spacing() ) );
1776 const int ldb( numeric_cast<int>( B.spacing() ) );
1777 const int ldc( numeric_cast<int>( C.spacing() ) );
1779 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1780 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1781 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1782 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled when UseSinglePrecisionComplexKernel
// holds (all element types are complex<float>).
// Dimensions and spacings are converted to int with boost::numeric_cast; the
// complex scaling factors alpha = (-1,0) and beta = (1,0) are passed to
// cblas_cgemm by address.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1803 template<
typename MT3
1806 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1807 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1809 using boost::numeric_cast;
1818 const int M ( numeric_cast<int>( A.rows() ) );
1819 const int N ( numeric_cast<int>( B.columns() ) );
1820 const int K ( numeric_cast<int>( A.columns() ) );
1821 const int lda( numeric_cast<int>( A.spacing() ) );
1822 const int ldb( numeric_cast<int>( B.spacing() ) );
1823 const int ldc( numeric_cast<int>( C.spacing() ) );
1824 const complex<float> alpha( -1.0F, 0.0F );
1825 const complex<float> beta ( 1.0F, 0.0F );
1827 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1828 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1829 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1830 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled when UseDoublePrecisionComplexKernel
// holds (all element types are complex<double>).
// Dimensions and spacings are converted to int with boost::numeric_cast; the
// complex scaling factors alpha = (-1,0) and beta = (1,0) are passed to
// cblas_zgemm by address.
// The storage-order argument and both transposition flags depend only on
// IsRowMajorMatrix<MT3> (the target type).
// NOTE(review): presumably valid because A and B always share one fixed
// storage order in this expression -- confirm.
1851 template<
typename MT3
1854 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1855 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1857 using boost::numeric_cast;
1866 const int M ( numeric_cast<int>( A.rows() ) );
1867 const int N ( numeric_cast<int>( B.columns() ) );
1868 const int K ( numeric_cast<int>( A.columns() ) );
1869 const int lda( numeric_cast<int>( A.spacing() ) );
1870 const int ldb( numeric_cast<int>( B.spacing() ) );
1871 const int ldc( numeric_cast<int>( C.spacing() ) );
1872 const complex<double> alpha( -1.0, 0.0 );
1873 const complex<double> beta ( 1.0, 0.0 );
1875 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1876 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1877 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1878 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1924 template<
typename MT1
1928 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
1929 ,
private MatScalarMultExpr
1930 ,
private Computation
// Shorthand typedefs: MMM is the wrapped matrix product expression type, RES
// its ResultType; RT1/RT2 and CT1/CT2 are the ResultType and CompositeType of
// the two operand types MT1 and MT2.
1934 typedef DMatDMatMultExpr<MT1,MT2> MMM;
1935 typedef typename MMM::ResultType RES;
1936 typedef typename MT1::ResultType
RT1;
1937 typedef typename MT2::ResultType
RT2;
1938 typedef typename MT1::CompositeType
CT1;
1939 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate for the scaled product: value is nonzero only if
// IsFloat holds for the ElementType of T1, T2 and T3 and the fourth type T4
// is not a complex type.
// NOTE(review): T4 is presumably the scalar type of the scaling factor --
// confirm against the callers.
1947 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1948 struct UseSinglePrecisionKernel {
1949 enum { value = IsFloat<typename T1::ElementType>::value &&
1950 IsFloat<typename T2::ElementType>::value &&
1951 IsFloat<typename T3::ElementType>::value &&
1952 !IsComplex<T4>::value };
// Compile-time predicate for the scaled product: value is nonzero only if
// IsDouble holds for the ElementType of T1, T2 and T3 and the fourth type T4
// is not a complex type.
// NOTE(review): T4 is presumably the scalar type of the scaling factor --
// confirm against the callers.
1961 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1962 struct UseDoublePrecisionKernel {
1963 enum { value = IsDouble<typename T1::ElementType>::value &&
1964 IsDouble<typename T2::ElementType>::value &&
1965 IsDouble<typename T3::ElementType>::value &&
1966 !IsComplex<T4>::value };
// Compile-time predicate: value is nonzero only if the ElementType of all
// three types T1, T2 and T3 is exactly complex<float>. Unlike the real-valued
// predicates in this class it takes no fourth (scalar) type.
1975 template<
typename T1,
typename T2,
typename T3 >
1976 struct UseSinglePrecisionComplexKernel {
1977 typedef complex<float> Type;
1978 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1979 IsSame<typename T2::ElementType,Type>::value &&
1980 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: value is nonzero only if the ElementType of all
// three types T1, T2 and T3 is exactly complex<double>. Unlike the
// real-valued predicates in this class it takes no fourth (scalar) type.
1989 template<
typename T1,
typename T2,
typename T3 >
1990 struct UseDoublePrecisionComplexKernel {
1991 typedef complex<double> Type;
1992 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1993 IsSame<typename T2::ElementType,Type>::value &&
1994 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS fallback for the scaled
// product: value is nonzero when BLAZE_BLAS_MODE evaluates to false, or when
// none of the four precision-specific predicates holds. T4 is forwarded only
// to the two real-valued predicates.
2002 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2003 struct UseDefaultKernel {
2004 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2005 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2006 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2007 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the intrinsic-based default kernels of the
// scaled product: value is nonzero only if T1, T2 and T3 all report
// `vectorizable`, T2 and T3 share T1's ElementType, T4 is that same element
// type, and IntrinsicTrait for it reports both `addition` and
// `multiplication`.
2015 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2016 struct UseVectorizedDefaultKernel {
2017 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2018 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2019 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2020 IsSame<typename T1::ElementType,T4>::value &&
2021 IntrinsicTrait<typename T1::ElementType>::addition &&
2022 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type definitions of the scaled matrix product expression.
//   This          : type of this expression specialization
//   ResultType    : MultTrait result of the product's result type RES and the
//                   scalar type ST
//   OppositeType  : nested OppositeType of ResultType
//   ElementType   : nested ElementType of ResultType
//   IntrinsicType : intrinsic type associated with ElementType
//   LeftOperand   : the wrapped matrix product expression (const)
//   LT / RT       : chosen per operand via SelectType, keyed on whether the
//                   operand type is a computation (IsComputation), from
//                   `const RTn` and `CTn`
// NOTE(review): a few typedef lines between the visible ones are not part of
// this excerpt.
2028 typedef DMatScalarMultExpr<MMM,ST,false>
This;
2029 typedef typename MultTrait<RES,ST>::Type
ResultType;
2030 typedef typename ResultType::OppositeType
OppositeType;
2032 typedef typename ResultType::ElementType
ElementType;
2033 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
2038 typedef const DMatDMatMultExpr<MT1,MT2>
LeftOperand;
2044 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2047 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// Compilation flag: the scaled product expression itself is reported as not
// vectorizable (value 0).
2052 enum { vectorizable = 0 };
// Constructor taking the matrix product expression and the scalar factor.
// NOTE(review): the initializer list and body are not visible in this
// excerpt; presumably they initialize matrix_ and scalar_ -- confirm.
2061 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element (i,j) of the scaled product: the element of the wrapped product
// expression multiplied by the stored scalar.
2077 return matrix_(i,j) * scalar_;
// Returns the number of rows, forwarded from the wrapped product expression.
2086 inline size_t rows()
const {
2087 return matrix_.rows();
// Returns the number of columns, forwarded from the wrapped product
// expression.
2096 inline size_t columns()
const {
2097 return matrix_.columns();
// Returns whether the expression can alias with the given address `alias`;
// the query is forwarded to the wrapped product expression matrix_.
2127 template<
typename T >
2128 inline bool canAlias(
const T* alias )
const {
2129 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address `alias`;
// the query is forwarded to the wrapped product expression matrix_.
2139 template<
typename T >
2140 inline bool isAliased(
const T* alias )
const {
2141 return matrix_.isAliased( alias );
// Assignment of a scaled matrix product to the dense matrix `lhs`.
// The two operands of the wrapped product are fetched through leftOperand()
// and rightOperand(). Degenerate shapes are checked first, then the work is
// handed to either the default or the BLAS kernel selector together with the
// scalar factor rhs.scalar_.
// NOTE(review): the bodies of the two early branches, the definitions of A
// and B and the condition choosing between the two kernel calls are not
// visible in this excerpt.
2160 template<
typename MT3
2162 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2169 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2170 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Case 1: the target has no rows or no columns.
2172 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
2175 else if( left.columns() == 0UL ) {
2191 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2193 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Scalar (non-intrinsic) default kernel for the scaled product; disabled when
// UseVectorizedDefaultKernel holds for (MT3, MT4, MT5, ST2).
//   C      : target matrix, written through C(i,j)
//   A      : left operand, read through A(i,k)
//   B      : right operand, read through B(k,j)
//   scalar : scaling factor
// For every row i the row of C is first initialized with the k = 0 term, then
// the terms k = 1 .. K-1 are accumulated, and a final pass over the row
// follows.
// The k = 0 term is read unconditionally, so the kernel requires K >= 1.
// NOTE(review): the body of the final j loop is not visible in this excerpt;
// presumably it applies `scalar` to C(i,j) -- confirm.
2211 template<
typename MT3
2215 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2216 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2218 const size_t M( A.rows() );
2219 const size_t N( B.columns() );
2220 const size_t K( A.columns() );
2222 for(
size_t i=0UL; i<M; ++i ) {
// Initialize row i of C with the first term of each dot product.
2223 for(
size_t j=0UL; j<N; ++j ) {
2224 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulate the remaining terms.
2226 for(
size_t k=1UL; k<K; ++k ) {
2227 for(
size_t j=0UL; j<N; ++j ) {
2228 C(i,j) += A(i,k) * B(k,j);
// Final pass over row i.
2231 for(
size_t j=0UL; j<N; ++j ) {
// Vectorized default assignment kernel for DenseMatrix<MT3,false> targets; enabled
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// Each tier accumulates A*B in intrinsic registers over k and stores the
// accumulators multiplied by 'factor' into C.
// NOTE(review): declarations of j, i, a1, a2, b1..b4 and factor are on lines not
// shown here; factor is presumably the scalar broadcast to an intrinsic - confirm.
2252 template<
typename MT3
2256 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2257 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2259 typedef IntrinsicTrait<ElementType> IT;
// Note that N is taken from B.spacing(), not B.columns().
2261 const size_t M( A.rows() );
2262 const size_t N( B.spacing() );
2263 const size_t K( A.columns() );
// Tier 1: column blocks of 8 intrinsic vectors, one row of C at a time.
2269 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2270 for(
size_t i=0UL; i<M; ++i ) {
2271 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2272 for(
size_t k=0UL; k<K; ++k ) {
2274 xmm1 = xmm1 + a1 * B.get(k,j );
2275 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2276 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2277 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2278 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2279 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2280 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2281 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2283 store( &(~C)(i,j ), xmm1 * factor );
2284 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2285 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2286 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2287 store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2288 store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2289 store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2290 store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
// Tier 2: column blocks of 4 intrinsic vectors, two rows of C at a time
// (xmm1-4 belong to row i, xmm5-8 to row i+1).
2293 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2295 for( ; (i+2UL) <= M; i+=2UL ) {
2296 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2297 for(
size_t k=0UL; k<K; ++k ) {
2304 xmm1 = xmm1 + a1 * b1;
2305 xmm2 = xmm2 + a1 * b2;
2306 xmm3 = xmm3 + a1 * b3;
2307 xmm4 = xmm4 + a1 * b4;
2308 xmm5 = xmm5 + a2 * b1;
2309 xmm6 = xmm6 + a2 * b2;
2310 xmm7 = xmm7 + a2 * b3;
2311 xmm8 = xmm8 + a2 * b4;
2313 store( &(~C)(i ,j ), xmm1 * factor );
2314 store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2315 store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2316 store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2317 store( &(~C)(i+1UL,j ), xmm5 * factor );
2318 store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2319 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2320 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
// Single-row variant of tier 2; presumably handles a leftover row - guard not shown, confirm.
2324 for(
size_t k=0UL; k<K; ++k ) {
2326 xmm1 = xmm1 + a1 * B.get(k,j );
2327 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2328 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2329 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2331 store( &(~C)(i,j ), xmm1 * factor );
2332 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2333 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2334 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
// Tier 3: column blocks of 2 intrinsic vectors, two rows of C at a time.
2337 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2339 for( ; (i+2UL) <= M; i+=2UL ) {
2341 for(
size_t k=0UL; k<K; ++k ) {
2346 xmm1 = xmm1 + a1 * b1;
2347 xmm2 = xmm2 + a1 * b2;
2348 xmm3 = xmm3 + a2 * b1;
2349 xmm4 = xmm4 + a2 * b2;
2351 store( &(~C)(i ,j ), xmm1 * factor );
2352 store( &(~C)(i ,j+IT::size), xmm2 * factor );
2353 store( &(~C)(i+1UL,j ), xmm3 * factor );
2354 store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
// Single-row variant of tier 3.
2358 for(
size_t k=0UL; k<K; ++k ) {
2360 xmm1 = xmm1 + a1 * B.get(k,j );
2361 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2363 store( &(~C)(i,j ), xmm1 * factor );
2364 store( &(~C)(i,j+IT::size), xmm2 * factor );
// Tier 4: a single intrinsic vector per step, two rows at a time
// (the enclosing loop over j is on a line not shown here).
2369 for( ; (i+2UL) <= M; i+=2UL ) {
2371 for(
size_t k=0UL; k<K; ++k ) {
2373 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2374 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2376 store( &(~C)(i ,j), xmm1 * factor );
2377 store( &(~C)(i+1UL,j), xmm2 * factor );
// Single-row variant of tier 4.
2381 for(
size_t k=0UL; k<K; ++k ) {
2382 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2384 store( &(~C)(i,j), xmm1 * factor );
// Vectorized default assignment kernel for DenseMatrix<MT3,true> targets.
// Instead of computing directly, one operand is copied into its OppositeType
// and the scaled product is re-submitted through assign().
2404 template<
typename MT3
2408 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2409 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only B is resizable: convert A.
2414 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2415 const typename MT4::OppositeType tmp( A );
2416 assign( ~C, tmp * B * scalar );
// Only A is resizable: convert B.
2418 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2419 const typename MT5::OppositeType tmp( B );
2420 assign( ~C, A * tmp * scalar );
// Otherwise convert the operand with fewer elements (A on a tie).
2422 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2423 const typename MT4::OppositeType tmp( A );
2424 assign( ~C, tmp * B * scalar );
2427 const typename MT5::OppositeType tmp( B );
2428 assign( ~C, A * tmp * scalar );
// BLAS assignment kernel fallback: when UseDefaultKernel<MT3,MT4,MT5,ST2> applies,
// the call is simply forwarded to selectDefaultAssignKernel().
2447 template<
typename MT3
2451 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2452 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2454 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS assignment kernel for the single precision case (cblas_sgemm).
// With alpha = scalar and beta = 0.0F the call computes C = scalar * A * B.
2473 template<
typename MT3
2477 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2478 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2480 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
2486 const int M ( numeric_cast<int>( A.rows() ) );
2487 const int N ( numeric_cast<int>( B.columns() ) );
2488 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the spacing() of each matrix.
2489 const int lda( numeric_cast<int>( A.spacing() ) );
2490 const int ldb( numeric_cast<int>( B.spacing() ) );
2491 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
2493 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2494 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2495 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2496 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel for the double precision case (cblas_dgemm).
// With alpha = scalar and beta = 0.0 the call computes C = scalar * A * B.
2516 template<
typename MT3
2520 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2521 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2523 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
2529 const int M ( numeric_cast<int>( A.rows() ) );
2530 const int N ( numeric_cast<int>( B.columns() ) );
2531 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the spacing() of each matrix.
2532 const int lda( numeric_cast<int>( A.spacing() ) );
2533 const int ldb( numeric_cast<int>( B.spacing() ) );
2534 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
2536 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2537 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2538 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2539 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel for the single precision complex case (cblas_cgemm).
// alpha is built from the scalar and beta is zero, so C = scalar * A * B.
2559 template<
typename MT3
2563 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2564 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2566 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
2575 const int M ( numeric_cast<int>( A.rows() ) );
2576 const int N ( numeric_cast<int>( B.columns() ) );
2577 const int K ( numeric_cast<int>( A.columns() ) );
2578 const int lda( numeric_cast<int>( A.spacing() ) );
2579 const int ldb( numeric_cast<int>( B.spacing() ) );
2580 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
2581 const complex<float> alpha( scalar );
2582 const complex<float> beta ( 0.0F, 0.0F );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
2584 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2585 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2586 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2587 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel for the double precision complex case (cblas_zgemm).
// alpha is built from the scalar and beta is zero, so C = scalar * A * B.
2607 template<
typename MT3
2611 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2612 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2614 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
2623 const int M ( numeric_cast<int>( A.rows() ) );
2624 const int N ( numeric_cast<int>( B.columns() ) );
2625 const int K ( numeric_cast<int>( A.columns() ) );
2626 const int lda( numeric_cast<int>( A.spacing() ) );
2627 const int ldb( numeric_cast<int>( B.spacing() ) );
2628 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
2629 const complex<double> alpha( scalar );
2630 const complex<double> beta ( 0.0, 0.0 );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
2632 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2633 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2634 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2635 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is first evaluated into a dense temporary.
2651 template<
typename MT
2653 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type is chosen by SelectType from OppositeType and ResultType based on SO.
2657 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// Evaluate rhs; presumably tmp is then assigned to lhs on lines not shown - confirm.
2669 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix (lhs += rhs).
// NOTE(review): parts of the template header and body are not visible in this excerpt.
2686 template<
typename MT3
2688 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped product expression.
2695 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2696 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: any of the three product dimensions is zero.
2698 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel, passing the scalar along.
// NOTE(review): the condition choosing between the two calls is on lines not shown here.
2713 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2715 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) addition assignment kernel; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not apply (DisableIf).
// NOTE(review): the function body is not visible in this excerpt.
2733 template<
typename MT3
2737 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2738 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for DenseMatrix<MT3,false> targets.
// Each tier accumulates A*B in intrinsic registers over k, then stores
// load(C) + accumulator * factor back into C.
// NOTE(review): declarations of j, i, a1, a2, b1..b4 and factor are on lines not
// shown here; factor is presumably the scalar broadcast to an intrinsic - confirm.
2759 template<
typename MT3
2763 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2764 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2766 typedef IntrinsicTrait<ElementType> IT;
// Note that N is taken from B.spacing(), not B.columns().
2768 const size_t M( A.rows() );
2769 const size_t N( B.spacing() );
2770 const size_t K( A.columns() );
// Tier 1: column blocks of 8 intrinsic vectors, one row of C at a time.
2776 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2777 for(
size_t i=0UL; i<M; ++i ) {
2778 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2779 for(
size_t k=0UL; k<K; ++k ) {
2781 xmm1 = xmm1 + a1 * B.get(k,j );
2782 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2783 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2784 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2785 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2786 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2787 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2788 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2790 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2791 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
2792 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
2793 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
2794 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
2795 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
2796 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
2797 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
// Tier 2: column blocks of 4 intrinsic vectors, two rows of C at a time
// (xmm1-4 belong to row i, xmm5-8 to row i+1).
2800 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2802 for( ; (i+2UL) <= M; i+=2UL ) {
2803 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2804 for(
size_t k=0UL; k<K; ++k ) {
2811 xmm1 = xmm1 + a1 * b1;
2812 xmm2 = xmm2 + a1 * b2;
2813 xmm3 = xmm3 + a1 * b3;
2814 xmm4 = xmm4 + a1 * b4;
2815 xmm5 = xmm5 + a2 * b1;
2816 xmm6 = xmm6 + a2 * b2;
2817 xmm7 = xmm7 + a2 * b3;
2818 xmm8 = xmm8 + a2 * b4;
2820 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2821 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
2822 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
2823 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
2824 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
2825 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
2826 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
2827 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
// Single-row variant of tier 2; presumably handles a leftover row - guard not shown, confirm.
2831 for(
size_t k=0UL; k<K; ++k ) {
2833 xmm1 = xmm1 + a1 * B.get(k,j );
2834 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2835 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2836 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2838 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2839 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
2840 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
2841 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
// Tier 3: column blocks of 2 intrinsic vectors, two rows of C at a time.
2844 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2846 for( ; (i+2UL) <= M; i+=2UL ) {
2848 for(
size_t k=0UL; k<K; ++k ) {
2853 xmm1 = xmm1 + a1 * b1;
2854 xmm2 = xmm2 + a1 * b2;
2855 xmm3 = xmm3 + a2 * b1;
2856 xmm4 = xmm4 + a2 * b2;
2858 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2859 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
2860 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
2861 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
// Single-row variant of tier 3.
2865 for(
size_t k=0UL; k<K; ++k ) {
2867 xmm1 = xmm1 + a1 * B.get(k,j );
2868 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2870 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2871 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
// Tier 4: a single intrinsic vector per step, two rows at a time
// (the enclosing loop over j is on a line not shown here).
2876 for( ; (i+2UL) <= M; i+=2UL ) {
2878 for(
size_t k=0UL; k<K; ++k ) {
2880 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2881 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2883 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2884 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) + xmm2 * factor );
// Single-row variant of tier 4.
2888 for(
size_t k=0UL; k<K; ++k ) {
2889 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2891 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Vectorized default addition assignment kernel for DenseMatrix<MT3,true> targets.
// One operand is copied into its OppositeType; the follow-up calls that use
// tmp are on lines not shown in this excerpt.
2911 template<
typename MT3
2915 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2916 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only B is resizable: convert A.
2921 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2922 const typename MT4::OppositeType tmp( A );
// Only A is resizable: convert B.
2925 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2926 const typename MT5::OppositeType tmp( B );
// Otherwise convert the operand with fewer elements (A on a tie).
2929 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2930 const typename MT4::OppositeType tmp( A );
2934 const typename MT5::OppositeType tmp( B );
// BLAS addition assignment kernel fallback: when UseDefaultKernel<MT3,MT4,MT5,ST2>
// applies, the call is simply forwarded to selectDefaultAddAssignKernel().
2954 template<
typename MT3
2958 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2959 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2961 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS addition assignment kernel for the single precision case (cblas_sgemm).
// With alpha = scalar and beta = 1.0F the call computes C += scalar * A * B.
2980 template<
typename MT3
2984 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2985 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2987 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
2993 const int M ( numeric_cast<int>( A.rows() ) );
2994 const int N ( numeric_cast<int>( B.columns() ) );
2995 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the spacing() of each matrix.
2996 const int lda( numeric_cast<int>( A.spacing() ) );
2997 const int ldb( numeric_cast<int>( B.spacing() ) );
2998 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3000 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3001 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3002 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3003 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition assignment kernel for the double precision case (cblas_dgemm).
// With alpha = scalar and beta = 1.0 the call computes C += scalar * A * B.
3023 template<
typename MT3
3027 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3028 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3030 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3036 const int M ( numeric_cast<int>( A.rows() ) );
3037 const int N ( numeric_cast<int>( B.columns() ) );
3038 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the spacing() of each matrix.
3039 const int lda( numeric_cast<int>( A.spacing() ) );
3040 const int ldb( numeric_cast<int>( B.spacing() ) );
3041 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3043 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3044 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3045 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3046 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition assignment kernel for the single precision complex case (cblas_cgemm).
// alpha is built from the scalar and beta is (1,0), so C += scalar * A * B.
3066 template<
typename MT3
3070 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3071 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3073 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3082 const int M ( numeric_cast<int>( A.rows() ) );
3083 const int N ( numeric_cast<int>( B.columns() ) );
3084 const int K ( numeric_cast<int>( A.columns() ) );
3085 const int lda( numeric_cast<int>( A.spacing() ) );
3086 const int ldb( numeric_cast<int>( B.spacing() ) );
3087 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3088 const complex<float> alpha( scalar );
3089 const complex<float> beta ( 1.0F, 0.0F );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3091 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3092 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3093 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3094 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition assignment kernel for the double precision complex case (cblas_zgemm).
// alpha is built from the scalar and beta is (1,0), so C += scalar * A * B.
3114 template<
typename MT3
3118 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3119 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3121 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3130 const int M ( numeric_cast<int>( A.rows() ) );
3131 const int N ( numeric_cast<int>( B.columns() ) );
3132 const int K ( numeric_cast<int>( A.columns() ) );
3133 const int lda( numeric_cast<int>( A.spacing() ) );
3134 const int ldb( numeric_cast<int>( B.spacing() ) );
3135 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3136 const complex<double> alpha( scalar );
3137 const complex<double> beta ( 1.0, 0.0 );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3139 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3140 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3141 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3142 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product to a dense matrix (lhs -= rhs).
// NOTE(review): parts of the template header and body are not visible in this excerpt.
3163 template<
typename MT3
3165 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped product expression.
3172 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3173 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: any of the three product dimensions is zero.
3175 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel, passing the scalar along.
// NOTE(review): the condition choosing between the two calls is on lines not shown here.
3190 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3192 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) subtraction assignment kernel; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not apply (DisableIf).
// NOTE(review): the function body is not visible in this excerpt.
3210 template<
typename MT3
3214 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3215 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction assignment kernel for DenseMatrix<MT3,false> targets.
// Each tier accumulates A*B in intrinsic registers over k, then stores
// load(C) - accumulator * factor back into C.
// NOTE(review): declarations of j, i, a1, a2, b1..b4 and factor are on lines not
// shown here; factor is presumably the scalar broadcast to an intrinsic - confirm.
3236 template<
typename MT3
3240 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3241 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3243 typedef IntrinsicTrait<ElementType> IT;
// Note that N is taken from B.spacing(), not B.columns().
3245 const size_t M( A.rows() );
3246 const size_t N( B.spacing() );
3247 const size_t K( A.columns() );
// Tier 1: column blocks of 8 intrinsic vectors, one row of C at a time.
3253 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3254 for(
size_t i=0UL; i<M; ++i ) {
3255 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3256 for(
size_t k=0UL; k<K; ++k ) {
3258 xmm1 = xmm1 + a1 * B.get(k,j );
3259 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3260 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3261 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3262 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3263 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3264 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3265 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3267 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3268 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3269 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3270 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3271 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3272 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3273 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3274 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
// Tier 2: column blocks of 4 intrinsic vectors, two rows of C at a time
// (xmm1-4 belong to row i, xmm5-8 to row i+1).
3277 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3279 for( ; (i+2UL) <= M; i+=2UL ) {
3280 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3281 for(
size_t k=0UL; k<K; ++k ) {
3288 xmm1 = xmm1 + a1 * b1;
3289 xmm2 = xmm2 + a1 * b2;
3290 xmm3 = xmm3 + a1 * b3;
3291 xmm4 = xmm4 + a1 * b4;
3292 xmm5 = xmm5 + a2 * b1;
3293 xmm6 = xmm6 + a2 * b2;
3294 xmm7 = xmm7 + a2 * b3;
3295 xmm8 = xmm8 + a2 * b4;
3297 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3298 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3299 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3300 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3301 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3302 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3303 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3304 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
// Single-row variant of tier 2; presumably handles a leftover row - guard not shown, confirm.
3308 for(
size_t k=0UL; k<K; ++k ) {
3310 xmm1 = xmm1 + a1 * B.get(k,j );
3311 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3312 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3313 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3315 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3316 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3317 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3318 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
// Tier 3: column blocks of 2 intrinsic vectors, two rows of C at a time.
3321 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3323 for( ; (i+2UL) <= M; i+=2UL ) {
3325 for(
size_t k=0UL; k<K; ++k ) {
3330 xmm1 = xmm1 + a1 * b1;
3331 xmm2 = xmm2 + a1 * b2;
3332 xmm3 = xmm3 + a2 * b1;
3333 xmm4 = xmm4 + a2 * b2;
3335 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3336 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3337 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3338 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
// Single-row variant of tier 3.
3342 for(
size_t k=0UL; k<K; ++k ) {
3344 xmm1 = xmm1 + a1 * B.get(k,j );
3345 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3347 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3348 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
// Tier 4: a single intrinsic vector per step, two rows at a time
// (the enclosing loop over j is on a line not shown here).
3353 for( ; (i+2UL) <= M; i+=2UL ) {
3355 for(
size_t k=0UL; k<K; ++k ) {
3357 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3358 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3360 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3361 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) - xmm2 * factor );
// Single-row variant of tier 4.
3365 for(
size_t k=0UL; k<K; ++k ) {
3366 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3368 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Vectorized default subtraction assignment kernel for DenseMatrix<MT3,true> targets.
// One operand is copied into its OppositeType; the follow-up calls that use
// tmp are on lines not shown in this excerpt.
3388 template<
typename MT3
3392 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3393 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only B is resizable: convert A.
3398 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3399 const typename MT4::OppositeType tmp( A );
// Only A is resizable: convert B.
3402 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3403 const typename MT5::OppositeType tmp( B );
// Otherwise convert the operand with fewer elements (A on a tie).
3406 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3407 const typename MT4::OppositeType tmp( A );
3411 const typename MT5::OppositeType tmp( B );
// BLAS subtraction assignment kernel fallback: when UseDefaultKernel<MT3,MT4,MT5,ST2>
// applies, the call is simply forwarded to selectDefaultSubAssignKernel().
3431 template<
typename MT3
3435 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3436 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3438 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS subtraction assignment kernel for the single precision case (cblas_sgemm).
// With alpha = -scalar and beta = 1.0F the call computes C -= scalar * A * B.
3457 template<
typename MT3
3461 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3462 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3464 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3470 const int M ( numeric_cast<int>( A.rows() ) );
3471 const int N ( numeric_cast<int>( B.columns() ) );
3472 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the spacing() of each matrix.
3473 const int lda( numeric_cast<int>( A.spacing() ) );
3474 const int ldb( numeric_cast<int>( B.spacing() ) );
3475 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3477 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3478 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3479 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3480 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction assignment kernel for the double precision case (cblas_dgemm).
// With alpha = -scalar and beta = 1.0 the call computes C -= scalar * A * B.
3500 template<
typename MT3
3504 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3505 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3507 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3513 const int M ( numeric_cast<int>( A.rows() ) );
3514 const int N ( numeric_cast<int>( B.columns() ) );
3515 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the spacing() of each matrix.
3516 const int lda( numeric_cast<int>( A.spacing() ) );
3517 const int ldb( numeric_cast<int>( B.spacing() ) );
3518 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3520 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3521 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3522 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3523 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction assignment kernel for the single precision complex case (cblas_cgemm).
// alpha is built from -scalar and beta is (1,0), so C -= scalar * A * B.
3543 template<
typename MT3
3547 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3548 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3550 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3559 const int M ( numeric_cast<int>( A.rows() ) );
3560 const int N ( numeric_cast<int>( B.columns() ) );
3561 const int K ( numeric_cast<int>( A.columns() ) );
3562 const int lda( numeric_cast<int>( A.spacing() ) );
3563 const int ldb( numeric_cast<int>( B.spacing() ) );
3564 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3565 const complex<float> alpha( -scalar );
3566 const complex<float> beta ( 1.0F, 0.0F );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3568 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3569 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3570 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3571 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction assignment kernel for the double precision complex case (cblas_zgemm).
// alpha is built from -scalar and beta is (1,0), so C -= scalar * A * B.
3591 template<
typename MT3
3595 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3596 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3598 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs a range-checked conversion from size_t.
3607 const int M ( numeric_cast<int>( A.rows() ) );
3608 const int N ( numeric_cast<int>( B.columns() ) );
3609 const int K ( numeric_cast<int>( A.columns() ) );
3610 const int lda( numeric_cast<int>( A.spacing() ) );
3611 const int ldb( numeric_cast<int>( B.spacing() ) );
3612 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address.
3613 const complex<double> alpha( -scalar );
3614 const complex<double> beta ( 1.0, 0.0 );
// Layout and both transpose flags are all derived from IsRowMajorMatrix<MT3>.
3616 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3617 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3618 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3619 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a DMatDMatMultExpr<T1,T2> expression object.
// NOTE(review): function name, parameters and the size check are on lines not
// shown here; presumably this is the matrix multiplication operator - confirm.
3685 template<
typename T1
3687 inline const DMatDMatMultExpr<T1,T2>
// Raised when the operand sizes are incompatible (condition not visible in this excerpt).
3693 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait for multiplying a dense matrix product by a dense vector.
// If MT1 and MT2 are dense row-major matrices and VT is a dense non-transpose
// vector, Type is the expression type of MT1 * ( MT2 * VT ); the alternative
// given to SelectType is INVALID_TYPE.
// NOTE(review): the struct header is on lines not shown in this excerpt.
3710 template<
typename MT1,
typename MT2,
typename VT >
3715 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3716 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3717 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3718 ,
typename DMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
3719 , INVALID_TYPE >::Type Type;
// Expression trait for multiplying a dense matrix product by a sparse vector.
// If MT1 and MT2 are dense row-major matrices and VT is a sparse non-transpose
// vector, Type is the expression type of MT1 * ( MT2 * VT ), where the inner
// product uses DMatSVecMultExprTrait and the outer one DMatDVecMultExprTrait;
// the alternative given to SelectType is INVALID_TYPE.
// NOTE(review): the struct header is on lines not shown in this excerpt.
3728 template<
typename MT1,
typename MT2,
typename VT >
3733 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3734 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3735 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3736 ,
typename DMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
3737 , INVALID_TYPE >::Type Type;
// Expression trait for multiplying a dense transpose vector by a dense matrix product.
// If VT is a dense transpose vector and MT1, MT2 are dense row-major matrices,
// Type is the expression type of ( VT * MT1 ) * MT2; the alternative given to
// SelectType is INVALID_TYPE.
// NOTE(review): the struct header is on lines not shown in this excerpt.
3746 template<
typename VT,
typename MT1,
typename MT2 >
3751 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
3752 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3753 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3754 ,
typename TDVecDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3755 , INVALID_TYPE >::Type Type;
// Expression trait for multiplying a sparse transpose vector by a dense matrix product.
// If VT is a sparse transpose vector and MT1, MT2 are dense row-major matrices,
// Type is the expression type of ( VT * MT1 ) * MT2, where the inner product uses
// TSVecDMatMultExprTrait and the outer one TDVecDMatMultExprTrait; the alternative
// given to SelectType is INVALID_TYPE.
// NOTE(review): the struct header is on lines not shown in this excerpt.
3764 template<
typename VT,
typename MT1,
typename MT2 >
3769 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
3770 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3771 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3772 ,
typename TDVecDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3773 , INVALID_TYPE >::Type Type;
// Trait typedef: Type is the multiplication expression type of
// ( row expression of const MT1 ) * MT2.
// NOTE(review): the struct header is on lines not shown; presumably this is the
// row-view trait of a matrix product - confirm.
3782 template<
typename MT1,
typename MT2 >
3787 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Trait typedef: Type is the multiplication expression type of
// MT1 * ( column expression of const MT2 ).
// NOTE(review): the struct header is on lines not shown; presumably this is the
// column-view trait of a matrix product - confirm.
3796 template<
typename MT1,
typename MT2 >
3801 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;