22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// Header of the TDMatTDMatMultExpr expression class template (excerpt). The
// class derives publicly from DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
// and privately from MatMatMultExpr.
// NOTE(review): the declaration of the second template parameter (MT2) and the
// opening brace of the class are on lines not visible in this excerpt.
94 template<
typename MT1
96 class TDMatTDMatMultExpr :
public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
97 ,
private MatMatMultExpr
// Shorthand aliases for nested types of the two operand types:
// RT1/RT2 = ResultType, ET1/ET2 = ElementType, CT1/CT2 = CompositeType.
102 typedef typename MT1::ResultType
RT1;
103 typedef typename MT2::ResultType
RT2;
104 typedef typename MT1::ElementType
ET1;
105 typedef typename MT2::ElementType
ET2;
106 typedef typename MT1::CompositeType
CT1;
107 typedef typename MT2::CompositeType
CT2;
// Compile-time trait over three types (T1, T2, T3). Its `value` is consulted by
// UseDefaultKernel and by the EnableIf guards of the cblas_sgemm-based kernels.
// NOTE(review): the body of this struct is not visible in this excerpt, so the
// exact condition cannot be stated here.
115 template<
typename T1,
typename T2,
typename T3 >
116 struct UseSinglePrecisionKernel {
// Compile-time trait over three types (T1, T2, T3). Its `value` is consulted by
// UseDefaultKernel and by the EnableIf guards of the cblas_dgemm-based kernels.
// NOTE(review): the body of this struct is not visible in this excerpt, so the
// exact condition cannot be stated here.
129 template<
typename T1,
typename T2,
typename T3 >
130 struct UseDoublePrecisionKernel {
// Compile-time trait: `value` is non-zero only if the ElementType of all three
// types T1, T2 and T3 is exactly complex<float>. Used as the EnableIf guard of
// the cblas_cgemm-based kernels.
144 template<
typename T1,
typename T2,
typename T3 >
145 struct UseSinglePrecisionComplexKernel {
// The element type all three matrices must share.
146 typedef complex<float> Type;
147 enum { value = IsSame<typename T1::ElementType,Type>::value &&
148 IsSame<typename T2::ElementType,Type>::value &&
149 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait: `value` is non-zero only if the ElementType of all three
// types T1, T2 and T3 is exactly complex<double>. Used as the EnableIf guard of
// the cblas_zgemm-based kernels.
160 template<
typename T1,
typename T2,
typename T3 >
161 struct UseDoublePrecisionComplexKernel {
// The element type all three matrices must share.
162 typedef complex<double> Type;
163 enum { value = IsSame<typename T1::ElementType,Type>::value &&
164 IsSame<typename T2::ElementType,Type>::value &&
165 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait selecting the non-BLAS fallback: `value` is non-zero if
// BLAZE_BLAS_MODE evaluates to false, or if none of the four BLAS traits
// (single/double precision, single/double precision complex) applies to the
// type triple (T1, T2, T3).
175 template<
typename T1,
typename T2,
typename T3 >
176 struct UseDefaultKernel {
177 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
178 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
179 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
180 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time trait selecting the vectorized default kernels. `value` is
// non-zero only if all of the following hold:
//  - T1, T2 and T3 each report a non-zero `vectorizable` member,
//  - all three share the same ElementType,
//  - IntrinsicTrait of that element type reports both `addition` and
//    `multiplication` as available.
190 template<
typename T1,
typename T2,
typename T3 >
191 struct UseVectorizedDefaultKernel {
192 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
193 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
194 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
195 IntrinsicTrait<typename T1::ElementType>::addition &&
196 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compile-time flag of the multiplication expression itself, fixed to 0. The
// UseVectorizedDefaultKernel trait reads a member of this name from its
// argument types.
227 enum { vectorizable = 0 };
257 if(
lhs_.columns() != 0UL ) {
258 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
260 for(
size_t k=1UL; k<end; k+=2UL ) {
262 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
264 if( end <
lhs_.columns() ) {
292 return rhs_.columns();
322 template<
typename T >
324 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
334 template<
typename T >
336 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
356 template<
typename MT
365 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
368 else if( rhs.lhs_.columns() == 0UL ) {
384 TDMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
386 TDMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Default (non-BLAS, non-vectorized) assignment kernel: computes C = A * B with
// plain element access.
// NOTE(review): the remaining template parameters (MT4, MT5) and the return
// type are declared on lines not visible in this excerpt.
405 template<
typename MT3
409 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Dimensions of the (M x K) * (K x N) product.
411 const size_t M( A.rows() );
412 const size_t N( B.columns() );
413 const size_t K( A.columns() );
415 for(
size_t i=0UL; i<M; ++i ) {
// Row i of C is first initialized with the k == 0 contribution. Index 0 is
// read unconditionally here, so this kernel relies on K >= 1
// (presumably guaranteed by the caller's empty-operand check -- TODO confirm).
416 for(
size_t j=0UL; j<N; ++j ) {
417 C(i,j) = A(i,0UL) * B(0UL,j);
// The contributions for k = 1 .. K-1 are then accumulated on top.
419 for(
size_t k=1UL; k<K; ++k ) {
420 for(
size_t j=0UL; j<N; ++j ) {
421 C(i,j) += A(i,k) * B(k,j);
// Vectorized default assignment kernel for a target of type
// DenseMatrix<MT3,false> (storage-order flag false; presumably row-major --
// TODO confirm). Instead of multiplying directly, one of the two operands is
// converted to its OppositeType in a temporary.
// NOTE(review): the statements that consume `tmp` are on lines not visible in
// this excerpt.
443 template<
typename MT3
446 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
447 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Choice of the operand to convert:
//  1. only MT4 is resizable            -> convert B
//  2. only MT5 is resizable            -> convert A
//  3. otherwise, B has no more elements than A -> convert B
//  4. otherwise                        -> convert A
452 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
453 const typename MT5::OppositeType tmp( B );
456 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
457 const typename MT4::OppositeType tmp( A );
460 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
461 const typename MT5::OppositeType tmp( B );
465 const typename MT4::OppositeType tmp( A );
// Vectorized default assignment kernel for a target of type
// DenseMatrix<MT3,true>. The kernel walks the first index i in steps that are
// multiples of IT::size (8x, 4x, 2x and finally a remainder section),
// accumulates products in IntrinsicType variables and writes them to C via
// store().
// NOTE(review): several lines are not visible in this excerpt, among them the
// declaration of i, the definitions of a1..a4 / b1 / b2, some accumulator
// declarations and the header of the final remainder loop.
486 template<
typename MT3
489 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
490 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
492 typedef IntrinsicTrait<ElementType> IT;
// Note that M is taken from A.spacing(), not A.rows()
// (presumably so that whole vector widths including padding are processed --
// TODO confirm).
494 const size_t M( A.spacing() );
495 const size_t N( B.columns() );
496 const size_t K( A.columns() );
// Section 1: blocks of 8*IT::size elements along i, one column j at a time.
500 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
501 for(
size_t j=0UL; j<N; ++j ) {
// NOTE(review): the accumulators are declared without an explicit initializer;
// confirm that IntrinsicType default-constructs to zero.
502 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
503 for(
size_t k=0UL; k<K; ++k ) {
505 xmm1 = xmm1 + A.get(i ,k) * b1;
506 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
507 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
508 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
509 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
510 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
511 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
512 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
514 store( &(~C)(i ,j), xmm1 );
515 store( &(~C)(i+IT::size ,j), xmm2 );
516 store( &(~C)(i+IT::size*2UL,j), xmm3 );
517 store( &(~C)(i+IT::size*3UL,j), xmm4 );
518 store( &(~C)(i+IT::size*4UL,j), xmm5 );
519 store( &(~C)(i+IT::size*5UL,j), xmm6 );
520 store( &(~C)(i+IT::size*6UL,j), xmm7 );
521 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Section 2: blocks of 4*IT::size elements along i; columns are handled two at
// a time (xmm1..xmm4 for column j, xmm5..xmm8 for column j+1), followed by a
// single-column pass for the column left over.
524 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
526 for( ; (j+2UL) <= N; j+=2UL ) {
527 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
528 for(
size_t k=0UL; k<K; ++k ) {
535 xmm1 = xmm1 + a1 * b1;
536 xmm2 = xmm2 + a2 * b1;
537 xmm3 = xmm3 + a3 * b1;
538 xmm4 = xmm4 + a4 * b1;
539 xmm5 = xmm5 + a1 * b2;
540 xmm6 = xmm6 + a2 * b2;
541 xmm7 = xmm7 + a3 * b2;
542 xmm8 = xmm8 + a4 * b2;
544 store( &(~C)(i ,j ), xmm1 );
545 store( &(~C)(i+IT::size ,j ), xmm2 );
546 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
547 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
548 store( &(~C)(i ,j+1UL), xmm5 );
549 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
550 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
551 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
555 for(
size_t k=0UL; k<K; ++k ) {
557 xmm1 = xmm1 + A.get(i ,k) * b1;
558 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
559 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
560 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
562 store( &(~C)(i ,j), xmm1 );
563 store( &(~C)(i+IT::size ,j), xmm2 );
564 store( &(~C)(i+IT::size*2UL,j), xmm3 );
565 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Section 3: blocks of 2*IT::size elements along i, again two columns at a
// time followed by a single-column pass.
568 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
570 for( ; (j+2UL) <= N; j+=2UL ) {
572 for(
size_t k=0UL; k<K; ++k ) {
577 xmm1 = xmm1 + a1 * b1;
578 xmm2 = xmm2 + a2 * b1;
579 xmm3 = xmm3 + a1 * b2;
580 xmm4 = xmm4 + a2 * b2;
582 store( &(~C)(i ,j ), xmm1 );
583 store( &(~C)(i+IT::size,j ), xmm2 );
584 store( &(~C)(i ,j+1UL), xmm3 );
585 store( &(~C)(i+IT::size,j+1UL), xmm4 );
589 for(
size_t k=0UL; k<K; ++k ) {
591 xmm1 = xmm1 + A.get(i ,k) * b1;
592 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
594 store( &(~C)(i ,j), xmm1 );
595 store( &(~C)(i+IT::size,j), xmm2 );
// Section 4: remainder along i, one vector per step (the enclosing i-loop
// header is not visible here). Elements of B are broadcast with set().
600 for( ; (j+2UL) <= N; j+=2UL ) {
602 for(
size_t k=0UL; k<K; ++k ) {
604 xmm1 = xmm1 + a1 *
set( B(k,j ) );
605 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
607 store( &(~C)(i,j ), xmm1 );
608 store( &(~C)(i,j+1UL), xmm2 );
612 for(
size_t k=0UL; k<K; ++k ) {
613 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
615 store( &(~C)(i,j), xmm1 );
// BLAS assignment kernel, fallback overload: enabled when UseDefaultKernel
// holds for (MT3, MT4, MT5) and simply forwards to selectDefaultAssignKernel.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
636 template<
typename MT3
639 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
640 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
642 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel, single precision overload (enabled via
// UseSinglePrecisionKernel): evaluates C = A * B with cblas_sgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
662 template<
typename MT3
665 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
666 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
668 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
674 const int M ( numeric_cast<int>( A.rows() ) );
675 const int N ( numeric_cast<int>( B.columns() ) );
676 const int K ( numeric_cast<int>( A.columns() ) );
677 const int lda( numeric_cast<int>( A.spacing() ) );
678 const int ldb( numeric_cast<int>( B.spacing() ) );
679 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1, beta = 0: the previous contents of C do not contribute. Layout and
// transposition flags follow the storage order of the target: row-major C ->
// CblasRowMajor with both operands flagged CblasTrans, otherwise CblasColMajor
// with CblasNoTrans.
681 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
682 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
683 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
684 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel, double precision overload (enabled via
// UseDoublePrecisionKernel): evaluates C = A * B with cblas_dgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
705 template<
typename MT3
708 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
709 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
711 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
717 const int M ( numeric_cast<int>( A.rows() ) );
718 const int N ( numeric_cast<int>( B.columns() ) );
719 const int K ( numeric_cast<int>( A.columns() ) );
720 const int lda( numeric_cast<int>( A.spacing() ) );
721 const int ldb( numeric_cast<int>( B.spacing() ) );
722 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1, beta = 0: the previous contents of C do not contribute. Layout and
// transposition flags are derived from the storage order of the target MT3.
724 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
725 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
726 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
727 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel, complex<float> overload (enabled via
// UseSinglePrecisionComplexKernel): evaluates C = A * B with cblas_cgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
748 template<
typename MT3
751 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
752 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
754 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
763 const int M ( numeric_cast<int>( A.rows() ) );
764 const int N ( numeric_cast<int>( B.columns() ) );
765 const int K ( numeric_cast<int>( A.columns() ) );
766 const int lda( numeric_cast<int>( A.spacing() ) );
767 const int ldb( numeric_cast<int>( B.spacing() ) );
768 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to CBLAS by address: alpha = 1, beta = 0.
// NOTE(review): unlike the add-/sub-assign variants of this kernel, alpha and
// beta are not declared const here.
769 complex<float> alpha( 1.0F, 0.0F );
770 complex<float> beta ( 0.0F, 0.0F );
// Layout and transposition flags are derived from the storage order of MT3.
772 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
773 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
774 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
775 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel, complex<double> overload (enabled via
// UseDoublePrecisionComplexKernel): evaluates C = A * B with cblas_zgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
796 template<
typename MT3
799 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
800 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
802 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
811 const int M ( numeric_cast<int>( A.rows() ) );
812 const int N ( numeric_cast<int>( B.columns() ) );
813 const int K ( numeric_cast<int>( A.columns() ) );
814 const int lda( numeric_cast<int>( A.spacing() ) );
815 const int ldb( numeric_cast<int>( B.spacing() ) );
816 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to CBLAS by address: alpha = 1, beta = 0.
// NOTE(review): unlike the add-/sub-assign variants of this kernel, alpha and
// beta are not declared const here.
817 complex<double> alpha( 1.0, 0.0 );
818 complex<double> beta ( 0.0, 0.0 );
// Layout and transposition flags are derived from the storage order of MT3.
820 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
821 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
822 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
823 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
842 template<
typename MT
848 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
860 const TmpType tmp( rhs );
879 template<
typename MT
888 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
903 TDMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
905 TDMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Default (non-BLAS, non-vectorized) addition assignment kernel: C += A * B.
// Disabled when UseVectorizedDefaultKernel holds for (MT3, MT4, MT5).
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
924 template<
typename MT3
927 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
928 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Dimensions of the (M x K) * (K x N) product.
930 const size_t M( A.rows() );
931 const size_t N( B.columns() );
932 const size_t K( A.columns() );
// N rounded down to an even number: the j-loop below handles two columns per
// iteration.
935 const size_t end( N &
size_t(-2) );
937 for(
size_t i=0UL; i<M; ++i ) {
938 for(
size_t k=0UL; k<K; ++k ) {
939 for(
size_t j=0UL; j<end; j+=2UL ) {
940 C(i,j ) += A(i,k) * B(k,j );
941 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Left-over column when N is odd (presumably guarded by a check on
// `end < N` that is not visible in this excerpt -- TODO confirm).
944 C(i,end) += A(i,k) * B(k,end);
// Vectorized default addition assignment kernel for a target of type
// DenseMatrix<MT3,false> (storage-order flag false; presumably row-major --
// TODO confirm). One of the two operands is converted to its OppositeType in a
// temporary.
// NOTE(review): the statements that consume `tmp` are on lines not visible in
// this excerpt.
966 template<
typename MT3
969 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
970 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Choice of the operand to convert:
//  1. only MT4 is resizable            -> convert B
//  2. only MT5 is resizable            -> convert A
//  3. otherwise, B has no more elements than A -> convert B
//  4. otherwise                        -> convert A
975 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
976 const typename MT5::OppositeType tmp( B );
979 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
980 const typename MT4::OppositeType tmp( A );
983 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
984 const typename MT5::OppositeType tmp( B );
988 const typename MT4::OppositeType tmp( A );
// Vectorized default addition assignment kernel for a target of type
// DenseMatrix<MT3,true>. The kernel walks the first index i in steps that are
// multiples of IT::size (8x, 4x, 2x and finally a remainder section), adds
// products onto IntrinsicType accumulators and writes them to C via store().
// NOTE(review): several lines are not visible in this excerpt, among them the
// declaration of i, the declaration and initial values of the xmm
// accumulators (presumably loaded from C -- TODO confirm), the definitions of
// a1..a4 / b1 / b2 and the header of the final remainder loop.
1009 template<
typename MT3
1012 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1013 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1015 typedef IntrinsicTrait<ElementType> IT;
// Note that M is taken from A.spacing(), not A.rows().
1017 const size_t M( A.spacing() );
1018 const size_t N( B.columns() );
1019 const size_t K( A.columns() );
// Section 1: blocks of 8*IT::size elements along i, one column j at a time.
1023 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1024 for(
size_t j=0UL; j<N; ++j ) {
1033 for(
size_t k=0UL; k<K; ++k ) {
1035 xmm1 = xmm1 + A.get(i ,k) * b1;
1036 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1037 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1038 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1039 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1040 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1041 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1042 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1044 store( &(~C)(i ,j), xmm1 );
1045 store( &(~C)(i+IT::size ,j), xmm2 );
1046 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1047 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1048 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1049 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1050 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1051 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Section 2: blocks of 4*IT::size elements along i; columns are handled two at
// a time (xmm1..xmm4 for column j, xmm5..xmm8 for column j+1), followed by a
// single-column pass.
1054 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1056 for( ; (j+2UL) <= N; j+=2UL ) {
1065 for(
size_t k=0UL; k<K; ++k ) {
1072 xmm1 = xmm1 + a1 * b1;
1073 xmm2 = xmm2 + a2 * b1;
1074 xmm3 = xmm3 + a3 * b1;
1075 xmm4 = xmm4 + a4 * b1;
1076 xmm5 = xmm5 + a1 * b2;
1077 xmm6 = xmm6 + a2 * b2;
1078 xmm7 = xmm7 + a3 * b2;
1079 xmm8 = xmm8 + a4 * b2;
1081 store( &(~C)(i ,j ), xmm1 );
1082 store( &(~C)(i+IT::size ,j ), xmm2 );
1083 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1084 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1085 store( &(~C)(i ,j+1UL), xmm5 );
1086 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1087 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1088 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1095 for(
size_t k=0UL; k<K; ++k ) {
1097 xmm1 = xmm1 + A.get(i ,k) * b1;
1098 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1099 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1100 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1102 store( &(~C)(i ,j), xmm1 );
1103 store( &(~C)(i+IT::size ,j), xmm2 );
1104 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1105 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Section 3: blocks of 2*IT::size elements along i, again two columns at a
// time followed by a single-column pass.
1108 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1110 for( ; (j+2UL) <= N; j+=2UL ) {
1115 for(
size_t k=0UL; k<K; ++k ) {
1120 xmm1 = xmm1 + a1 * b1;
1121 xmm2 = xmm2 + a2 * b1;
1122 xmm3 = xmm3 + a1 * b2;
1123 xmm4 = xmm4 + a2 * b2;
1125 store( &(~C)(i ,j ), xmm1 );
1126 store( &(~C)(i+IT::size,j ), xmm2 );
1127 store( &(~C)(i ,j+1UL), xmm3 );
1128 store( &(~C)(i+IT::size,j+1UL), xmm4 );
1133 for(
size_t k=0UL; k<K; ++k ) {
1135 xmm1 = xmm1 + A.get(i ,k) * b1;
1136 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1138 store( &(~C)(i ,j), xmm1 );
1139 store( &(~C)(i+IT::size,j), xmm2 );
// Section 4: remainder along i, one vector per step (the enclosing i-loop
// header is not visible here). Elements of B are broadcast with set().
1144 for( ; (j+2UL) <= N; j+=2UL ) {
1147 for(
size_t k=0UL; k<K; ++k ) {
1149 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1150 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1152 store( &(~C)(i,j ), xmm1 );
1153 store( &(~C)(i,j+1UL), xmm2 );
1157 for(
size_t k=0UL; k<K; ++k ) {
1158 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
1160 store( &(~C)(i,j), xmm1 );
// BLAS addition assignment kernel, fallback overload: enabled when
// UseDefaultKernel holds for (MT3, MT4, MT5) and simply forwards to
// selectDefaultAddAssignKernel.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1181 template<
typename MT3
1184 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1185 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1187 selectDefaultAddAssignKernel( C, A, B );
// BLAS addition assignment kernel, single precision overload (enabled via
// UseSinglePrecisionKernel): evaluates C += A * B with cblas_sgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1207 template<
typename MT3
1210 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1211 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1213 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1219 const int M ( numeric_cast<int>( A.rows() ) );
1220 const int N ( numeric_cast<int>( B.columns() ) );
1221 const int K ( numeric_cast<int>( A.columns() ) );
1222 const int lda( numeric_cast<int>( A.spacing() ) );
1223 const int ldb( numeric_cast<int>( B.spacing() ) );
1224 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1, beta = 1: the product is added onto the existing contents of C.
// Layout and transposition flags are derived from the storage order of MT3.
1226 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1227 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1228 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1229 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition assignment kernel, double precision overload (enabled via
// UseDoublePrecisionKernel): evaluates C += A * B with cblas_dgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1250 template<
typename MT3
1253 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1254 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1256 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1262 const int M ( numeric_cast<int>( A.rows() ) );
1263 const int N ( numeric_cast<int>( B.columns() ) );
1264 const int K ( numeric_cast<int>( A.columns() ) );
1265 const int lda( numeric_cast<int>( A.spacing() ) );
1266 const int ldb( numeric_cast<int>( B.spacing() ) );
1267 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1, beta = 1: the product is added onto the existing contents of C.
// Layout and transposition flags are derived from the storage order of MT3.
1269 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1270 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1271 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1272 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition assignment kernel, complex<float> overload (enabled via
// UseSinglePrecisionComplexKernel): evaluates C += A * B with cblas_cgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1293 template<
typename MT3
1296 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1297 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1299 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1308 const int M ( numeric_cast<int>( A.rows() ) );
1309 const int N ( numeric_cast<int>( B.columns() ) );
1310 const int K ( numeric_cast<int>( A.columns() ) );
1311 const int lda( numeric_cast<int>( A.spacing() ) );
1312 const int ldb( numeric_cast<int>( B.spacing() ) );
1313 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors, passed to CBLAS by address: alpha = 1, beta = 1, so
// the product is added onto the existing contents of C.
1314 const complex<float> alpha( 1.0F, 0.0F );
1315 const complex<float> beta ( 1.0F, 0.0F );
// Layout and transposition flags are derived from the storage order of MT3.
1317 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1318 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1319 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1320 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition assignment kernel, complex<double> overload (enabled via
// UseDoublePrecisionComplexKernel): evaluates C += A * B with cblas_zgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1341 template<
typename MT3
1344 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1345 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1347 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1356 const int M ( numeric_cast<int>( A.rows() ) );
1357 const int N ( numeric_cast<int>( B.columns() ) );
1358 const int K ( numeric_cast<int>( A.columns() ) );
1359 const int lda( numeric_cast<int>( A.spacing() ) );
1360 const int ldb( numeric_cast<int>( B.spacing() ) );
1361 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors, passed to CBLAS by address: alpha = 1, beta = 1, so
// the product is added onto the existing contents of C.
1362 const complex<double> alpha( 1.0, 0.0 );
1363 const complex<double> beta ( 1.0, 0.0 );
// Layout and transposition flags are derived from the storage order of MT3.
1365 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1366 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1367 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1368 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1391 template<
typename MT
1400 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1415 TDMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1417 TDMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Default (non-BLAS, non-vectorized) subtraction assignment kernel:
// C -= A * B. Disabled when UseVectorizedDefaultKernel holds for
// (MT3, MT4, MT5).
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1436 template<
typename MT3
1439 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1440 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Dimensions of the (M x K) * (K x N) product.
1442 const size_t M( A.rows() );
1443 const size_t N( B.columns() );
1444 const size_t K( A.columns() );
// N rounded down to an even number: the j-loop below handles two columns per
// iteration.
1447 const size_t end( N &
size_t(-2) );
1449 for(
size_t i=0UL; i<M; ++i ) {
1450 for(
size_t k=0UL; k<K; ++k ) {
1451 for(
size_t j=0UL; j<end; j+=2UL ) {
1452 C(i,j ) -= A(i,k) * B(k,j );
1453 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Left-over column when N is odd (presumably guarded by a check on
// `end < N` that is not visible in this excerpt -- TODO confirm).
1456 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default subtraction assignment kernel for a target of type
// DenseMatrix<MT3,false> (storage-order flag false; presumably row-major --
// TODO confirm). One of the two operands is converted to its OppositeType in a
// temporary.
// NOTE(review): the statements that consume `tmp` are on lines not visible in
// this excerpt.
1478 template<
typename MT3
1481 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1482 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Choice of the operand to convert:
//  1. only MT4 is resizable            -> convert B
//  2. only MT5 is resizable            -> convert A
//  3. otherwise, B has no more elements than A -> convert B
//  4. otherwise                        -> convert A
1487 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1488 const typename MT5::OppositeType tmp( B );
1491 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1492 const typename MT4::OppositeType tmp( A );
1495 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1496 const typename MT5::OppositeType tmp( B );
1500 const typename MT4::OppositeType tmp( A );
// Vectorized default subtraction assignment kernel for a target of type
// DenseMatrix<MT3,true>. The kernel walks the first index i in steps that are
// multiples of IT::size (8x, 4x, 2x and finally a remainder section),
// subtracts products from IntrinsicType accumulators and writes them to C via
// store().
// NOTE(review): several lines are not visible in this excerpt, among them the
// declaration of i, the declaration and initial values of the xmm
// accumulators (presumably loaded from C -- TODO confirm), the definitions of
// a1..a4 / b1 / b2 and the header of the final remainder loop.
1521 template<
typename MT3
1524 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1525 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1527 typedef IntrinsicTrait<ElementType> IT;
// Note that M is taken from A.spacing(), not A.rows().
1529 const size_t M( A.spacing() );
1530 const size_t N( B.columns() );
1531 const size_t K( A.columns() );
// Section 1: blocks of 8*IT::size elements along i, one column j at a time.
1535 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1536 for(
size_t j=0UL; j<N; ++j ) {
1545 for(
size_t k=0UL; k<K; ++k ) {
1547 xmm1 = xmm1 - A.get(i ,k) * b1;
1548 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1549 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1550 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1551 xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1552 xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1553 xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1554 xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1556 store( &(~C)(i ,j), xmm1 );
1557 store( &(~C)(i+IT::size ,j), xmm2 );
1558 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1559 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1560 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1561 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1562 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1563 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Section 2: blocks of 4*IT::size elements along i; columns are handled two at
// a time (xmm1..xmm4 for column j, xmm5..xmm8 for column j+1), followed by a
// single-column pass.
1566 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1568 for( ; (j+2UL) <= N; j+=2UL ) {
1577 for(
size_t k=0UL; k<K; ++k ) {
1584 xmm1 = xmm1 - a1 * b1;
1585 xmm2 = xmm2 - a2 * b1;
1586 xmm3 = xmm3 - a3 * b1;
1587 xmm4 = xmm4 - a4 * b1;
1588 xmm5 = xmm5 - a1 * b2;
1589 xmm6 = xmm6 - a2 * b2;
1590 xmm7 = xmm7 - a3 * b2;
1591 xmm8 = xmm8 - a4 * b2;
1593 store( &(~C)(i ,j ), xmm1 );
1594 store( &(~C)(i+IT::size ,j ), xmm2 );
1595 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1596 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1597 store( &(~C)(i ,j+1UL), xmm5 );
1598 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1599 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1600 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1607 for(
size_t k=0UL; k<K; ++k ) {
1609 xmm1 = xmm1 - A.get(i ,k) * b1;
1610 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1611 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1612 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1614 store( &(~C)(i ,j), xmm1 );
1615 store( &(~C)(i+IT::size ,j), xmm2 );
1616 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1617 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Section 3: blocks of 2*IT::size elements along i, again two columns at a
// time followed by a single-column pass.
1620 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1622 for( ; (j+2UL) <= N; j+=2UL ) {
1627 for(
size_t k=0UL; k<K; ++k ) {
1632 xmm1 = xmm1 - a1 * b1;
1633 xmm2 = xmm2 - a2 * b1;
1634 xmm3 = xmm3 - a1 * b2;
1635 xmm4 = xmm4 - a2 * b2;
1637 store( &(~C)(i ,j ), xmm1 );
1638 store( &(~C)(i+IT::size,j ), xmm2 );
1639 store( &(~C)(i ,j+1UL), xmm3 );
1640 store( &(~C)(i+IT::size,j+1UL), xmm4 );
1645 for(
size_t k=0UL; k<K; ++k ) {
1647 xmm1 = xmm1 - A.get(i ,k) * b1;
1648 xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
1650 store( &(~C)(i ,j), xmm1 );
1651 store( &(~C)(i+IT::size,j), xmm2 );
// Section 4: remainder along i, one vector per step (the enclosing i-loop
// header is not visible here). Elements of B are broadcast with set().
1656 for( ; (j+2UL) <= N; j+=2UL ) {
1659 for(
size_t k=0UL; k<K; ++k ) {
1661 xmm1 = xmm1 - a1 *
set( B(k,j ) );
1662 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
1664 store( &(~C)(i,j ), xmm1 );
1665 store( &(~C)(i,j+1UL), xmm2 );
1669 for(
size_t k=0UL; k<K; ++k ) {
1670 xmm1 = xmm1 - A.get(i,k) *
set( B(k,j) );
1672 store( &(~C)(i,j), xmm1 );
// BLAS subtraction assignment kernel, fallback overload: enabled when
// UseDefaultKernel holds for (MT3, MT4, MT5) and simply forwards to
// selectDefaultSubAssignKernel.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1693 template<
typename MT3
1696 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1697 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1699 selectDefaultSubAssignKernel( C, A, B );
// BLAS subtraction assignment kernel, single precision overload (enabled via
// UseSinglePrecisionKernel): evaluates C -= A * B with cblas_sgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1719 template<
typename MT3
1722 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1723 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1725 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1731 const int M ( numeric_cast<int>( A.rows() ) );
1732 const int N ( numeric_cast<int>( B.columns() ) );
1733 const int K ( numeric_cast<int>( A.columns() ) );
1734 const int lda( numeric_cast<int>( A.spacing() ) );
1735 const int ldb( numeric_cast<int>( B.spacing() ) );
1736 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1, beta = 1: the product is subtracted from the existing contents
// of C. Layout and transposition flags are derived from the storage order of
// MT3.
1738 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1739 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1740 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1741 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction assignment kernel, double precision overload (enabled via
// UseDoublePrecisionKernel): evaluates C -= A * B with cblas_dgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1762 template<
typename MT3
1765 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1766 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1768 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1774 const int M ( numeric_cast<int>( A.rows() ) );
1775 const int N ( numeric_cast<int>( B.columns() ) );
1776 const int K ( numeric_cast<int>( A.columns() ) );
1777 const int lda( numeric_cast<int>( A.spacing() ) );
1778 const int ldb( numeric_cast<int>( B.spacing() ) );
1779 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1, beta = 1: the product is subtracted from the existing contents
// of C. Layout and transposition flags are derived from the storage order of
// MT3.
1781 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1782 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1783 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1784 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction assignment kernel, complex<float> overload (enabled via
// UseSinglePrecisionComplexKernel): evaluates C -= A * B with cblas_cgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1805 template<
typename MT3
1808 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1809 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1811 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1820 const int M ( numeric_cast<int>( A.rows() ) );
1821 const int N ( numeric_cast<int>( B.columns() ) );
1822 const int K ( numeric_cast<int>( A.columns() ) );
1823 const int lda( numeric_cast<int>( A.spacing() ) );
1824 const int ldb( numeric_cast<int>( B.spacing() ) );
1825 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors, passed to CBLAS by address: alpha = -1, beta = 1,
// so the product is subtracted from the existing contents of C.
1826 const complex<float> alpha( -1.0F, 0.0F );
1827 const complex<float> beta ( 1.0F, 0.0F );
// Layout and transposition flags are derived from the storage order of MT3.
1829 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1830 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1831 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1832 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction assignment kernel, complex<double> overload (enabled via
// UseDoublePrecisionComplexKernel): evaluates C -= A * B with cblas_zgemm.
// NOTE(review): template parameters MT4/MT5 are declared on lines not visible
// in this excerpt.
1853 template<
typename MT3
1856 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1857 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// numeric_cast performs a range-checked conversion from size_t to the int
// arguments expected by the CBLAS interface.
1859 using boost::numeric_cast;
// Product dimensions and leading dimensions (taken from spacing()).
1868 const int M ( numeric_cast<int>( A.rows() ) );
1869 const int N ( numeric_cast<int>( B.columns() ) );
1870 const int K ( numeric_cast<int>( A.columns() ) );
1871 const int lda( numeric_cast<int>( A.spacing() ) );
1872 const int ldb( numeric_cast<int>( B.spacing() ) );
1873 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors, passed to CBLAS by address: alpha = -1, beta = 1,
// so the product is subtracted from the existing contents of C.
1874 const complex<double> alpha( -1.0, 0.0 );
1875 const complex<double> beta ( 1.0, 0.0 );
// Layout and transposition flags are derived from the storage order of MT3.
1877 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1878 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1879 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1880 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Header of a class template (excerpt) that derives publicly from
// DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
// and privately from MatScalarMultExpr and Computation, i.e. the scaled form
// of the matrix/matrix multiplication expression.
// NOTE(review): the remaining template parameters (MT2, ST) and the line
// naming the class itself are not visible in this excerpt; presumably this is
// a partial specialization of DMatScalarMultExpr -- TODO confirm.
1926 template<
typename MT1
1930 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
1931 ,
private MatScalarMultExpr
1932 ,
private Computation
// MMM = the wrapped multiplication expression, RES = its result type;
// RT1/RT2 and CT1/CT2 = result and composite types of the two operands.
1936 typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
1937 typedef typename MMM::ResultType RES;
1938 typedef typename MT1::ResultType
RT1;
1939 typedef typename MT2::ResultType
RT2;
1940 typedef typename MT1::CompositeType
CT1;
1941 typedef typename MT2::CompositeType
CT2;
// Compile-time trait for the scaled expression: `value` is non-zero only if
// IsFloat holds for the ElementType of T1, T2 and T3 and the scalar type T4 is
// not a complex type (IsComplex<T4> is false).
1949 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1950 struct UseSinglePrecisionKernel {
1951 enum { value = IsFloat<typename T1::ElementType>::value &&
1952 IsFloat<typename T2::ElementType>::value &&
1953 IsFloat<typename T3::ElementType>::value &&
1954 !IsComplex<T4>::value };
// Compile-time trait for the scaled expression: `value` is non-zero only if
// IsDouble holds for the ElementType of T1, T2 and T3 and the scalar type T4
// is not a complex type (IsComplex<T4> is false).
1963 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1964 struct UseDoublePrecisionKernel {
1965 enum { value = IsDouble<typename T1::ElementType>::value &&
1966 IsDouble<typename T2::ElementType>::value &&
1967 IsDouble<typename T3::ElementType>::value &&
1968 !IsComplex<T4>::value };
// Compile-time trait for the scaled expression: `value` is non-zero only if
// the ElementType of T1, T2 and T3 is exactly complex<float>. The scalar type
// is not part of this condition.
1977 template<
typename T1,
typename T2,
typename T3 >
1978 struct UseSinglePrecisionComplexKernel {
// The element type all three matrices must share.
1979 typedef complex<float> Type;
1980 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1981 IsSame<typename T2::ElementType,Type>::value &&
1982 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait for the scaled expression: `value` is non-zero only if
// the ElementType of T1, T2 and T3 is exactly complex<double>. The scalar type
// is not part of this condition.
1991 template<
typename T1,
typename T2,
typename T3 >
1992 struct UseDoublePrecisionComplexKernel {
// The element type all three matrices must share.
1993 typedef complex<double> Type;
1994 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1995 IsSame<typename T2::ElementType,Type>::value &&
1996 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait selecting the non-BLAS fallback for the scaled
// expression: `value` is non-zero if BLAZE_BLAS_MODE evaluates to false, or if
// none of the four BLAS traits applies. Only the two real-valued traits take
// the scalar type T4 into account.
2004 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2005 struct UseDefaultKernel {
2006 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2007 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2008 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2009 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time trait selecting the vectorized default kernels of the scaled
// expression. `value` is non-zero only if all of the following hold:
//  - T1, T2 and T3 each report a non-zero `vectorizable` member,
//  - all three share the same ElementType,
//  - the scalar type T4 is that same element type,
//  - IntrinsicTrait of the element type reports both `addition` and
//    `multiplication` as available.
2017 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2018 struct UseVectorizedDefaultKernel {
2019 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2020 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2021 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2022 IsSame<typename T1::ElementType,T4>::value &&
2023 IntrinsicTrait<typename T1::ElementType>::addition &&
2024 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type aliases of the scaled multiplication expression.
// This: the expression type itself.
2030 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// ResultType: obtained from MultTrait of the product's result type RES and the
// scalar type ST; OppositeType, ElementType and IntrinsicType are derived from
// it.
2031 typedef typename MultTrait<RES,ST>::Type
ResultType;
2032 typedef typename ResultType::OppositeType
OppositeType;
2034 typedef typename ResultType::ElementType
ElementType;
2035 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// LeftOperand: the wrapped matrix/matrix multiplication expression.
2040 typedef const TDMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// LT / RT: types used for the two matrix operands during evaluation. The
// choice between `const RTn` and `CTn` is keyed on IsComputation<MTn>
// (presumably the result type is selected when the operand is itself a
// computation -- TODO confirm SelectType's argument order).
2046 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2049 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// Compile-time flag of this expression, fixed to 0.
2054 enum { vectorizable = 0 };
2063 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
2079 return matrix_(i,j) * scalar_;
// Returns the number of rows, forwarded from the wrapped expression matrix_.
2088 inline size_t rows()
const {
2089 return matrix_.rows();
// Returns the number of columns, forwarded from the wrapped expression
// matrix_.
2098 inline size_t columns()
const {
2099 return matrix_.columns();
// Alias query for the address `alias`: the answer is delegated unchanged to
// matrix_.canAlias().
2129 template<
typename T >
2130 inline bool canAlias(
const T* alias )
const {
2131 return matrix_.canAlias( alias );
// Alias query for the address `alias`: the answer is delegated unchanged to
// matrix_.isAliased().
2141 template<
typename T >
2142 inline bool isAliased(
const T* alias )
const {
2143 return matrix_.isAliased( alias );
// Assignment of the scaled matrix/matrix product `rhs` to the dense matrix
// `lhs`. The two operands of the wrapped multiplication are extracted and the
// work is dispatched either to selectDefaultAssignKernel or to
// selectBlasAssignKernel, passing the scalar along.
// NOTE(review): the second template parameter (SO), the bodies of the two
// early-exit branches, the definitions of A and B, and the condition choosing
// between the two kernels are on lines not visible in this excerpt.
2162 template<
typename MT3
2164 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the underlying matrix/matrix multiplication.
2171 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2172 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special cases: empty target, and an inner dimension of zero.
2174 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2177 else if( left.columns() == 0UL ) {
2193 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2195 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-BLAS, non-vectorized) assignment kernel of the scaled
// expression. Disabled when UseVectorizedDefaultKernel holds for
// (MT3, MT4, MT5, ST2).
// NOTE(review): template parameters MT4/MT5/ST2 are declared on lines not
// visible in this excerpt.
2213 template<
typename MT3
2217 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2218 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Here k runs over the columns of B/C and j over the inner dimension.
2220 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Row i of C is first initialized with the j == 0 contribution (index 0 is
// read unconditionally, so the inner dimension must be at least 1) ...
2221 for(
size_t k=0UL; k<B.columns(); ++k ) {
2222 C(i,k) = A(i,0UL) * B(0UL,k);
// ... then the contributions for j = 1 .. A.columns()-1 are accumulated.
2224 for(
size_t j=1UL; j<A.columns(); ++j ) {
2225 for(
size_t k=0UL; k<B.columns(); ++k ) {
2226 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i; its body is not visible in this excerpt (presumably
// the multiplication by `scalar` -- TODO confirm).
2229 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized-path default assignment kernel for a DenseMatrix<MT3,false> target.
// Instead of computing the product directly, one operand is copied into a
// temporary of its OppositeType and the assignment is re-issued with that temporary.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
2250 template<
typename MT3
2254 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2255 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A's type is resizable: convert B.
2260 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2261 const typename MT5::OppositeType tmp( B );
2262 assign( ~C, A * tmp * scalar );
// Only B's type is resizable: convert A.
2264 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2265 const typename MT4::OppositeType tmp( A );
2266 assign( ~C, tmp * B * scalar );
// Otherwise convert the operand with fewer elements (B on a tie).
2268 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2269 const typename MT5::OppositeType tmp( B );
2270 assign( ~C, A * tmp * scalar );
2273 const typename MT4::OppositeType tmp( A );
2274 assign( ~C, tmp * B * scalar );
// Vectorized default assignment kernel for a DenseMatrix<MT3,true> target.
// Each accumulator sums products of A and B over the inner index k and is
// multiplied by `factor` before being stored into C. Rows are handled in blocks
// of 8, 4 and 2 intrinsic vectors (IT::size elements each), then one vector at a time.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
// NOTE(review): `factor`, `a1..a4`, `b1` and `b2` are presumably intrinsic values
// built from `scalar`, A and B - confirm against their declarations.
2293 template<
typename MT3
2297 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2298 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2300 typedef IntrinsicTrait<ElementType> IT;
// The row loop bound M is A.spacing(), not A.rows().
// NOTE(review): presumably the spacing includes padding so whole vectors can be processed - confirm.
2302 const size_t M( A.spacing() );
2303 const size_t N( B.columns() );
2304 const size_t K( A.columns() );
// Row blocks of 8 intrinsic vectors, one column of C at a time.
2310 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2311 for(
size_t j=0UL; j<N; ++j ) {
2312 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2313 for(
size_t k=0UL; k<K; ++k ) {
2315 xmm1 = xmm1 + A.get(i ,k) * b1;
2316 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2317 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2318 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2319 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2320 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2321 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2322 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2324 store( &(~C)(i ,j), xmm1 * factor );
2325 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2326 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2327 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2328 store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2329 store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2330 store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2331 store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
// Row blocks of 4 intrinsic vectors; columns of C are taken two at a time,
// followed by a single-column pass.
2334 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2336 for( ; (j+2UL) <= N; j+=2UL ) {
2337 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2338 for(
size_t k=0UL; k<K; ++k ) {
2345 xmm1 = xmm1 + a1 * b1;
2346 xmm2 = xmm2 + a2 * b1;
2347 xmm3 = xmm3 + a3 * b1;
2348 xmm4 = xmm4 + a4 * b1;
2349 xmm5 = xmm5 + a1 * b2;
2350 xmm6 = xmm6 + a2 * b2;
2351 xmm7 = xmm7 + a3 * b2;
2352 xmm8 = xmm8 + a4 * b2;
2354 store( &(~C)(i ,j ), xmm1 * factor );
2355 store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2356 store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2357 store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2358 store( &(~C)(i ,j+1UL), xmm5 * factor );
2359 store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2360 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2361 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
2365 for(
size_t k=0UL; k<K; ++k ) {
2367 xmm1 = xmm1 + A.get(i ,k) * b1;
2368 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2369 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2370 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2372 store( &(~C)(i ,j), xmm1 * factor );
2373 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2374 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2375 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
// Row blocks of 2 intrinsic vectors; again two columns at a time, then one.
2378 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2380 for( ; (j+2UL) <= N; j+=2UL ) {
2382 for(
size_t k=0UL; k<K; ++k ) {
2387 xmm1 = xmm1 + a1 * b1;
2388 xmm2 = xmm2 + a2 * b1;
2389 xmm3 = xmm3 + a1 * b2;
2390 xmm4 = xmm4 + a2 * b2;
2392 store( &(~C)(i ,j ), xmm1 * factor );
2393 store( &(~C)(i+IT::size,j ), xmm2 * factor );
2394 store( &(~C)(i ,j+1UL), xmm3 * factor );
2395 store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
2399 for(
size_t k=0UL; k<K; ++k ) {
2401 xmm1 = xmm1 + A.get(i ,k) * b1;
2402 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2404 store( &(~C)(i ,j), xmm1 * factor );
2405 store( &(~C)(i+IT::size,j), xmm2 * factor );
// Single intrinsic vector of rows; each element of B is expanded with set().
2410 for( ; (j+2UL) <= N; j+=2UL ) {
2412 for(
size_t k=0UL; k<K; ++k ) {
2414 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2415 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2417 store( &(~C)(i,j ), xmm1 * factor );
2418 store( &(~C)(i,j+1UL), xmm2 * factor );
2422 for(
size_t k=0UL; k<K; ++k ) {
2423 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2425 store( &(~C)(i,j), xmm1 * factor );
// BLAS-selector fallback for assignment: when UseDefaultKernel<MT3,MT4,MT5,ST2>
// holds, the call is forwarded unchanged to selectDefaultAssignKernel.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
2445 template<
typename MT3
2449 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2450 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2452 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS-backed assignment kernel based on cblas_sgemm; enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, passed to gemm as alpha.
2471 template<
typename MT3
2475 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2476 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2478 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
2484 const int M ( numeric_cast<int>( A.rows() ) );
2485 const int N ( numeric_cast<int>( B.columns() ) );
2486 const int K ( numeric_cast<int>( A.columns() ) );
2487 const int lda( numeric_cast<int>( A.spacing() ) );
2488 const int ldb( numeric_cast<int>( B.spacing() ) );
2489 const int ldc( numeric_cast<int>( C.spacing() ) );
// A row-major MT3 selects CblasRowMajor with both operands flagged CblasTrans;
// otherwise CblasColMajor with CblasNoTrans. beta is 0.0F, so the previous
// contents of C do not contribute to the result.
2491 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2492 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2493 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2494 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS-backed assignment kernel based on cblas_dgemm; enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, passed to gemm as alpha.
2514 template<
typename MT3
2518 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2519 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2521 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
2527 const int M ( numeric_cast<int>( A.rows() ) );
2528 const int N ( numeric_cast<int>( B.columns() ) );
2529 const int K ( numeric_cast<int>( A.columns() ) );
2530 const int lda( numeric_cast<int>( A.spacing() ) );
2531 const int ldb( numeric_cast<int>( B.spacing() ) );
2532 const int ldc( numeric_cast<int>( C.spacing() ) );
// A row-major MT3 selects CblasRowMajor with both operands flagged CblasTrans;
// otherwise CblasColMajor with CblasNoTrans. beta is 0.0, so the previous
// contents of C do not contribute to the result.
2534 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2535 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2536 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2537 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS-backed assignment kernel based on cblas_cgemm; enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, converted to complex<float> and passed as alpha.
2557 template<
typename MT3
2561 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2562 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2564 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
2573 const int M ( numeric_cast<int>( A.rows() ) );
2574 const int N ( numeric_cast<int>( B.columns() ) );
2575 const int K ( numeric_cast<int>( A.columns() ) );
2576 const int lda( numeric_cast<int>( A.spacing() ) );
2577 const int ldb( numeric_cast<int>( B.spacing() ) );
2578 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes alpha and beta by address; beta is zero, so the
// previous contents of C do not contribute to the result.
2579 const complex<float> alpha( scalar );
2580 const complex<float> beta ( 0.0F, 0.0F );
// A row-major MT3 selects CblasRowMajor with both operands flagged CblasTrans;
// otherwise CblasColMajor with CblasNoTrans.
2582 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2583 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2584 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2585 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed assignment kernel based on cblas_zgemm; enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, converted to complex<double> and passed as alpha.
2605 template<
typename MT3
2609 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2610 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2612 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
2621 const int M ( numeric_cast<int>( A.rows() ) );
2622 const int N ( numeric_cast<int>( B.columns() ) );
2623 const int K ( numeric_cast<int>( A.columns() ) );
2624 const int lda( numeric_cast<int>( A.spacing() ) );
2625 const int ldb( numeric_cast<int>( B.spacing() ) );
2626 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm takes alpha and beta by address; beta is zero, so the
// previous contents of C do not contribute to the result.
2627 const complex<double> alpha( scalar );
2628 const complex<double> beta ( 0.0, 0.0 );
// A row-major MT3 selects CblasRowMajor with both operands flagged CblasTrans;
// otherwise CblasColMajor with CblasNoTrans.
2630 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2631 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2632 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2633 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is first evaluated into a temporary of type TmpType, which is
// chosen between ResultType and OppositeType according to the target's SO flag.
// \param lhs The target sparse matrix.
// \param rhs The scaled product expression to be assigned.
2650 template<
typename MT
2652 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
2656 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
2668 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix.
// \param lhs The target dense matrix.
// \param rhs The scaled product expression to be added.
2685 template<
typename MT3
2687 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
2694 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2695 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension.
2697 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Kernel dispatch: either the default kernel or the BLAS kernel selector is
// invoked, each receiving the target, both operands and the scalar.
2712 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2714 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default addition assignment kernel; participates in overload
// resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
2732 template<
typename MT3
2736 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2737 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized-path default addition assignment kernel for a DenseMatrix<MT3,false>
// target. One operand is copied into a temporary of its OppositeType.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
// NOTE(review): presumably each branch re-issues the addition assignment with `tmp` - confirm.
2758 template<
typename MT3
2762 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2763 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A's type is resizable: convert B.
2768 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2769 const typename MT5::OppositeType tmp( B );
// Only B's type is resizable: convert A.
2772 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2773 const typename MT4::OppositeType tmp( A );
// Otherwise convert the operand with fewer elements (B on a tie).
2776 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2777 const typename MT5::OppositeType tmp( B );
2781 const typename MT4::OppositeType tmp( A );
// Vectorized default addition assignment kernel for a DenseMatrix<MT3,true> target.
// Each accumulator sums products of A and B over the inner index k; the scaled
// accumulator is added to the value loaded from C and the sum is stored back.
// Rows are handled in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements
// each), then one vector at a time.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
// NOTE(review): `factor`, `a1..a4`, `b1` and `b2` are presumably intrinsic values
// built from `scalar`, A and B - confirm against their declarations.
2801 template<
typename MT3
2805 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2806 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2808 typedef IntrinsicTrait<ElementType> IT;
// The row loop bound M is A.spacing(), not A.rows().
2810 const size_t M( A.spacing() );
2811 const size_t N( B.columns() );
2812 const size_t K( A.columns() );
// Row blocks of 8 intrinsic vectors, one column of C at a time.
2818 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2819 for(
size_t j=0UL; j<N; ++j ) {
2820 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2821 for(
size_t k=0UL; k<K; ++k ) {
2823 xmm1 = xmm1 + A.get(i ,k) * b1;
2824 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2825 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2826 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2827 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2828 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2829 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2830 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2832 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2833 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
2834 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
2835 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
2836 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
2837 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
2838 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
2839 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
// Row blocks of 4 intrinsic vectors; columns of C are taken two at a time,
// followed by a single-column pass.
2842 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2844 for( ; (j+2UL) <= N; j+=2UL ) {
2845 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2846 for(
size_t k=0UL; k<K; ++k ) {
2853 xmm1 = xmm1 + a1 * b1;
2854 xmm2 = xmm2 + a2 * b1;
2855 xmm3 = xmm3 + a3 * b1;
2856 xmm4 = xmm4 + a4 * b1;
2857 xmm5 = xmm5 + a1 * b2;
2858 xmm6 = xmm6 + a2 * b2;
2859 xmm7 = xmm7 + a3 * b2;
2860 xmm8 = xmm8 + a4 * b2;
2862 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2863 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
2864 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
2865 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
2866 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
2867 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
2868 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
2869 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
2873 for(
size_t k=0UL; k<K; ++k ) {
2875 xmm1 = xmm1 + A.get(i ,k) * b1;
2876 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2877 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2878 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2880 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2881 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
2882 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
2883 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
// Row blocks of 2 intrinsic vectors; again two columns at a time, then one.
2886 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2888 for( ; (j+2UL) <= N; j+=2UL ) {
2890 for(
size_t k=0UL; k<K; ++k ) {
2895 xmm1 = xmm1 + a1 * b1;
2896 xmm2 = xmm2 + a2 * b1;
2897 xmm3 = xmm3 + a1 * b2;
2898 xmm4 = xmm4 + a2 * b2;
2900 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2901 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
2902 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
2903 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
2907 for(
size_t k=0UL; k<K; ++k ) {
2909 xmm1 = xmm1 + A.get(i ,k) * b1;
2910 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2912 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2913 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
// Single intrinsic vector of rows; each element of B is expanded with set().
2918 for( ; (j+2UL) <= N; j+=2UL ) {
2920 for(
size_t k=0UL; k<K; ++k ) {
2922 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2923 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2925 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2926 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) + xmm2 * factor );
2930 for(
size_t k=0UL; k<K; ++k ) {
2931 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2933 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// BLAS-selector fallback for addition assignment: when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, the call is forwarded unchanged to
// selectDefaultAddAssignKernel.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
2953 template<
typename MT3
2957 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2958 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2960 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS-backed addition assignment kernel based on cblas_sgemm; enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, passed to gemm as alpha.
2979 template<
typename MT3
2983 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2984 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2986 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
2992 const int M ( numeric_cast<int>( A.rows() ) );
2993 const int N ( numeric_cast<int>( B.columns() ) );
2994 const int K ( numeric_cast<int>( A.columns() ) );
2995 const int lda( numeric_cast<int>( A.spacing() ) );
2996 const int ldb( numeric_cast<int>( B.spacing() ) );
2997 const int ldc( numeric_cast<int>( C.spacing() ) );
// beta is 1.0F, so the scaled product is accumulated onto the existing contents of C.
2999 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3000 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3001 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3002 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed addition assignment kernel based on cblas_dgemm; enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, passed to gemm as alpha.
3022 template<
typename MT3
3026 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3027 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3029 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3035 const int M ( numeric_cast<int>( A.rows() ) );
3036 const int N ( numeric_cast<int>( B.columns() ) );
3037 const int K ( numeric_cast<int>( A.columns() ) );
3038 const int lda( numeric_cast<int>( A.spacing() ) );
3039 const int ldb( numeric_cast<int>( B.spacing() ) );
3040 const int ldc( numeric_cast<int>( C.spacing() ) );
// beta is 1.0, so the scaled product is accumulated onto the existing contents of C.
3042 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3043 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3044 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3045 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed addition assignment kernel based on cblas_cgemm; enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, converted to complex<float> and passed as alpha.
3065 template<
typename MT3
3069 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3070 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3072 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3081 const int M ( numeric_cast<int>( A.rows() ) );
3082 const int N ( numeric_cast<int>( B.columns() ) );
3083 const int K ( numeric_cast<int>( A.columns() ) );
3084 const int lda( numeric_cast<int>( A.spacing() ) );
3085 const int ldb( numeric_cast<int>( B.spacing() ) );
3086 const int ldc( numeric_cast<int>( C.spacing() ) );
// beta is (1,0), so the scaled product is accumulated onto the existing contents of C.
3087 const complex<float> alpha( scalar );
3088 const complex<float> beta ( 1.0F, 0.0F );
3090 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3091 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3092 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3093 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed addition assignment kernel based on cblas_zgemm; enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor, converted to complex<double> and passed as alpha.
3113 template<
typename MT3
3117 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3118 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3120 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3129 const int M ( numeric_cast<int>( A.rows() ) );
3130 const int N ( numeric_cast<int>( B.columns() ) );
3131 const int K ( numeric_cast<int>( A.columns() ) );
3132 const int lda( numeric_cast<int>( A.spacing() ) );
3133 const int ldb( numeric_cast<int>( B.spacing() ) );
3134 const int ldc( numeric_cast<int>( C.spacing() ) );
// beta is (1,0), so the scaled product is accumulated onto the existing contents of C.
3135 const complex<double> alpha( scalar );
3136 const complex<double> beta ( 1.0, 0.0 );
3138 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3139 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3140 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3141 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product to a dense matrix.
// \param lhs The target dense matrix.
// \param rhs The scaled product expression to be subtracted.
3162 template<
typename MT3
3164 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
3171 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3172 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension.
3174 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Kernel dispatch: either the default kernel or the BLAS kernel selector is
// invoked, each receiving the target, both operands and the scalar.
3189 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3191 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default subtraction assignment kernel; participates in overload
// resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
3209 template<
typename MT3
3213 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3214 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized-path default subtraction assignment kernel for a DenseMatrix<MT3,false>
// target. One operand is copied into a temporary of its OppositeType.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
// NOTE(review): presumably each branch re-issues the subtraction assignment with `tmp` - confirm.
3235 template<
typename MT3
3239 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3240 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A's type is resizable: convert B.
3245 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3246 const typename MT5::OppositeType tmp( B );
// Only B's type is resizable: convert A.
3249 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3250 const typename MT4::OppositeType tmp( A );
// Otherwise convert the operand with fewer elements (B on a tie).
3253 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3254 const typename MT5::OppositeType tmp( B );
3258 const typename MT4::OppositeType tmp( A );
// Vectorized default subtraction assignment kernel for a DenseMatrix<MT3,true> target.
// Each accumulator sums products of A and B over the inner index k; the scaled
// accumulator is subtracted from the value loaded from C and the difference is
// stored back. Rows are handled in blocks of 8, 4 and 2 intrinsic vectors
// (IT::size elements each), then one vector at a time.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
// NOTE(review): `factor`, `a1..a4`, `b1` and `b2` are presumably intrinsic values
// built from `scalar`, A and B - confirm against their declarations.
3278 template<
typename MT3
3282 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3283 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3285 typedef IntrinsicTrait<ElementType> IT;
// The row loop bound M is A.spacing(), not A.rows().
3287 const size_t M( A.spacing() );
3288 const size_t N( B.columns() );
3289 const size_t K( A.columns() );
// Row blocks of 8 intrinsic vectors, one column of C at a time.
3295 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3296 for(
size_t j=0UL; j<N; ++j ) {
3297 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3298 for(
size_t k=0UL; k<K; ++k ) {
3300 xmm1 = xmm1 + A.get(i ,k) * b1;
3301 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3302 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3303 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3304 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3305 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3306 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3307 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3309 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3310 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3311 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3312 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
3313 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
3314 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
3315 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
3316 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
// Row blocks of 4 intrinsic vectors; columns of C are taken two at a time,
// followed by a single-column pass.
3319 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3321 for( ; (j+2UL) <= N; j+=2UL ) {
3322 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3323 for(
size_t k=0UL; k<K; ++k ) {
3330 xmm1 = xmm1 + a1 * b1;
3331 xmm2 = xmm2 + a2 * b1;
3332 xmm3 = xmm3 + a3 * b1;
3333 xmm4 = xmm4 + a4 * b1;
3334 xmm5 = xmm5 + a1 * b2;
3335 xmm6 = xmm6 + a2 * b2;
3336 xmm7 = xmm7 + a3 * b2;
3337 xmm8 = xmm8 + a4 * b2;
3339 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3340 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
3341 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
3342 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
3343 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
3344 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
3345 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
3346 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
3350 for(
size_t k=0UL; k<K; ++k ) {
3352 xmm1 = xmm1 + A.get(i ,k) * b1;
3353 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3354 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3355 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3357 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3358 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3359 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3360 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
// Row blocks of 2 intrinsic vectors; again two columns at a time, then one.
3363 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3365 for( ; (j+2UL) <= N; j+=2UL ) {
3367 for(
size_t k=0UL; k<K; ++k ) {
3372 xmm1 = xmm1 + a1 * b1;
3373 xmm2 = xmm2 + a2 * b1;
3374 xmm3 = xmm3 + a1 * b2;
3375 xmm4 = xmm4 + a2 * b2;
3377 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3378 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
3379 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
3380 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
3384 for(
size_t k=0UL; k<K; ++k ) {
3386 xmm1 = xmm1 + A.get(i ,k) * b1;
3387 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3389 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3390 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
// Single intrinsic vector of rows; each element of B is expanded with set().
3395 for( ; (j+2UL) <= N; j+=2UL ) {
3397 for(
size_t k=0UL; k<K; ++k ) {
3399 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3400 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3402 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3403 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) - xmm2 * factor );
3407 for(
size_t k=0UL; k<K; ++k ) {
3408 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
3410 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// BLAS-selector fallback for subtraction assignment: when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, the call is forwarded unchanged to
// selectDefaultSubAssignKernel.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor.
3430 template<
typename MT3
3434 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3435 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3437 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction assignment kernel based on cblas_sgemm; enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor; its negation is passed to gemm as alpha.
3456 template<
typename MT3
3460 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3461 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3463 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3469 const int M ( numeric_cast<int>( A.rows() ) );
3470 const int N ( numeric_cast<int>( B.columns() ) );
3471 const int K ( numeric_cast<int>( A.columns() ) );
3472 const int lda( numeric_cast<int>( A.spacing() ) );
3473 const int ldb( numeric_cast<int>( B.spacing() ) );
3474 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is -scalar and beta is 1.0F: the scaled product is subtracted from the
// existing contents of C.
3476 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3477 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3478 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3479 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction assignment kernel based on cblas_dgemm; enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor; its negation is passed to gemm as alpha.
3499 template<
typename MT3
3503 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3504 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3506 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3512 const int M ( numeric_cast<int>( A.rows() ) );
3513 const int N ( numeric_cast<int>( B.columns() ) );
3514 const int K ( numeric_cast<int>( A.columns() ) );
3515 const int lda( numeric_cast<int>( A.spacing() ) );
3516 const int ldb( numeric_cast<int>( B.spacing() ) );
3517 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is -scalar and beta is 1.0: the scaled product is subtracted from the
// existing contents of C.
3519 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3520 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3521 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3522 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction assignment kernel based on cblas_cgemm; enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor; its negation is converted to complex<float> and passed as alpha.
3542 template<
typename MT3
3546 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3547 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3549 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3558 const int M ( numeric_cast<int>( A.rows() ) );
3559 const int N ( numeric_cast<int>( B.columns() ) );
3560 const int K ( numeric_cast<int>( A.columns() ) );
3561 const int lda( numeric_cast<int>( A.spacing() ) );
3562 const int ldb( numeric_cast<int>( B.spacing() ) );
3563 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is -scalar and beta is (1,0): the scaled product is subtracted from the
// existing contents of C.
3564 const complex<float> alpha( -scalar );
3565 const complex<float> beta ( 1.0F, 0.0F );
3567 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3568 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3569 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3570 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction assignment kernel based on cblas_zgemm; enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// \param C The target matrix.
// \param A The left-hand side product operand.
// \param B The right-hand side product operand.
// \param scalar The scaling factor; its negation is converted to complex<double> and passed as alpha.
3590 template<
typename MT3
3594 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3595 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3597 using boost::numeric_cast;
// Extents and spacings are converted from size_t to the int arguments gemm takes.
3606 const int M ( numeric_cast<int>( A.rows() ) );
3607 const int N ( numeric_cast<int>( B.columns() ) );
3608 const int K ( numeric_cast<int>( A.columns() ) );
3609 const int lda( numeric_cast<int>( A.spacing() ) );
3610 const int ldb( numeric_cast<int>( B.spacing() ) );
3611 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha is -scalar and beta is (1,0): the scaled product is subtracted from the
// existing contents of C.
3612 const complex<double> alpha( -scalar );
3613 const complex<double> beta ( 1.0, 0.0 );
3615 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3616 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3617 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3618 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a TDMatTDMatMultExpr<T1,T2> expression object.
// \exception std::invalid_argument Matrix sizes do not match.
// NOTE(review): presumably this is the multiplication operator for two
// column-major dense matrices - confirm against the signature.
3684 template<
typename T1
3686 inline const TDMatTDMatMultExpr<T1,T2>
3692 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait for multiplying a matrix product (MT1, MT2) with a dense vector VT.
3709 template<
typename MT1,
typename MT2,
typename VT >
// Requires both matrices to be dense and column-major and VT to be a dense,
// non-transpose vector; the nested traits describe MT1 * ( MT2 * VT ).
// INVALID_TYPE is the alternative when the requirements are not met.
3714 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3715 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3716 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3717 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
3718 , INVALID_TYPE >::Type Type;
// Expression trait for multiplying a matrix product (MT1, MT2) with a sparse vector VT.
3727 template<
typename MT1,
typename MT2,
typename VT >
// Requires both matrices to be dense and column-major and VT to be a sparse,
// non-transpose vector; the nested traits describe MT1 * ( MT2 * VT ), where the
// inner step is a matrix/sparse-vector trait and the outer step a
// matrix/dense-vector trait. INVALID_TYPE is the alternative otherwise.
3732 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3733 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3734 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3735 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
3736 , INVALID_TYPE >::Type Type;
// Expression trait for multiplying a transpose dense vector VT with a matrix product (MT1, MT2).
3745 template<
typename VT,
typename MT1,
typename MT2 >
// Requires VT to be a dense transpose vector and both matrices to be dense and
// column-major; the nested traits describe ( VT * MT1 ) * MT2.
// INVALID_TYPE is the alternative when the requirements are not met.
3750 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
3751 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3752 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
3753 ,
typename TDVecTDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3754 , INVALID_TYPE >::Type Type;
// Expression trait for multiplying a transpose sparse vector VT with a matrix product (MT1, MT2).
3763 template<
typename VT,
typename MT1,
typename MT2 >
// Requires VT to be a sparse transpose vector and both matrices to be dense and
// column-major; the nested traits describe ( VT * MT1 ) * MT2, where the inner
// step is a sparse-vector/matrix trait and the outer step a dense-vector/matrix
// trait. INVALID_TYPE is the alternative otherwise.
3768 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
3769 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3770 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
3771 ,
typename TSVecTDMatMultExprTrait_placeholder_do_not_use
3772 , INVALID_TYPE >::Type Type;
// Trait typedef built from a row of the left product operand: Type is the
// MultExprTrait of ( RowExprTrait<const MT1>::Type, MT2 ).
3781 template<
typename MT1,
typename MT2 >
3786 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Trait typedef built from a column of the right product operand: Type is the
// MultExprTrait of ( MT1, ColumnExprTrait<const MT2>::Type ).
3795 template<
typename MT1,
typename MT2 >
3800 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;