22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// Expression class for the product of two dense matrix operands (MT1, MT2).
// It derives from DenseMatrix instantiated on its own type, with `true` as the
// second template argument.
// NOTE(review): the declaration of the MT2 template parameter and the opening
// brace of the class are not visible in this excerpt.
92 template<
typename MT1
94 class TDMatTDMatMultExpr :
public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
// Shorthand aliases for the nested result, element and composite types of the
// two operand matrix types.
100 typedef typename MT1::ResultType
RT1;
101 typedef typename MT2::ResultType
RT2;
102 typedef typename MT1::ElementType
ET1;
103 typedef typename MT2::ElementType
ET2;
104 typedef typename MT1::CompositeType
CT1;
105 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate over three types. Within this file it is consulted by
// UseDefaultKernel and gates the cblas_sgemm based kernels through EnableIf.
// NOTE(review): the definition of its `value` is not visible in this excerpt.
113 template<
typename T1,
typename T2,
typename T3 >
114 struct UseSinglePrecisionKernel {
// Compile-time predicate over three types. Within this file it is consulted by
// UseDefaultKernel and gates the cblas_dgemm based kernels through EnableIf.
// NOTE(review): the definition of its `value` is not visible in this excerpt.
127 template<
typename T1,
typename T2,
typename T3 >
128 struct UseDoublePrecisionKernel {
// Compile-time predicate: `value` is set only when IsSame reports that the
// ElementType of each of T1, T2 and T3 is exactly complex<float>.
// It gates the cblas_cgemm based kernels in this class.
142 template<
typename T1,
typename T2,
typename T3 >
143 struct UseSinglePrecisionComplexKernel {
144 typedef complex<float> Type;
145 enum { value = IsSame<typename T1::ElementType,Type>::value &&
146 IsSame<typename T2::ElementType,Type>::value &&
147 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: `value` is set only when IsSame reports that the
// ElementType of each of T1, T2 and T3 is exactly complex<double>.
// It gates the cblas_zgemm based kernels in this class.
158 template<
typename T1,
typename T2,
typename T3 >
159 struct UseDoublePrecisionComplexKernel {
160 typedef complex<double> Type;
161 enum { value = IsSame<typename T1::ElementType,Type>::value &&
162 IsSame<typename T2::ElementType,Type>::value &&
163 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS code path: `value` is set when
// BLAZE_BLAS_MODE evaluates to zero, or when none of the four
// precision-specific kernel predicates holds for (T1,T2,T3).
173 template<
typename T1,
typename T2,
typename T3 >
174 struct UseDefaultKernel {
175 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
176 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
177 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
178 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the intrinsics-based default kernels. `value` is
// set only when all three types flag themselves `vectorizable`, all three share
// the same ElementType, and IntrinsicTrait reports both `addition` and
// `multiplication` for that element type.
188 template<
typename T1,
typename T2,
typename T3 >
189 struct UseVectorizedDefaultKernel {
190 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
191 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
192 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
193 IntrinsicTrait<typename T1::ElementType>::addition &&
194 IntrinsicTrait<typename T1::ElementType>::multiplication };
// The product expression reports itself as not vectorizable (flag fixed to 0).
225 enum { vectorizable = 0 };
// Fragment of an element computation: products lhs_(i,k) * rhs_(k,j) are
// accumulated into tmp with k advancing two steps per iteration.
// NOTE(review): the enclosing signature, the declaration and initialization of
// tmp, and the first statement of the loop body are not visible in this excerpt.
255 if(
lhs_.columns() != 0UL ) {
// end is the largest odd value <= lhs_.columns(): (columns-1) with its lowest
// bit cleared, plus one. This guarantees k+1 < columns() inside the loop.
256 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
258 for(
size_t k=1UL; k<end; k+=2UL ) {
260 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// Holds only when columns() is even, i.e. when index `end` has not been
// processed by the loop above (branch body not visible in this excerpt).
262 if( end <
lhs_.columns() ) {
// The value returned is the column count of the right-hand operand.
// NOTE(review): the enclosing function signature is not visible in this excerpt.
290 return rhs_.columns();
// Alias query templated on T: true when either operand reports, through its
// isAliased(), that it is aliased with `alias`.
// NOTE(review): the function name and signature are not visible in this excerpt.
320 template<
typename T >
322 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Alias query templated on T: true when either operand reports, through its
// isAliased(), that it is aliased with `alias`.
// NOTE(review): the function name and signature are not visible in this excerpt.
332 template<
typename T >
334 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Fragment of the assignment of a product expression `rhs` to the target `lhs`.
// NOTE(review): the signature and the definitions of A and B are not visible.
354 template<
typename MT
// Branch for an empty target matrix (body not visible in this excerpt).
363 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Separate branch for a zero inner dimension (body not visible in this excerpt).
366 else if( rhs.lhs_.columns() == 0UL ) {
// Kernel dispatch. Presumably these two calls are alternatives; the condition
// that selects between them is not visible in this excerpt.
382 TDMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
384 TDMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Plain-loop kernel computing C = A * B.
// For every row i the row of C is first overwritten with the k = 0 term and
// the terms for k = 1 .. K-1 are then accumulated on top of it.
// A(i,0UL) and B(0UL,j) are read unconditionally, so K >= 1 is assumed here.
// NOTE(review): the return type line and remaining template parameters are not
// visible in this excerpt.
403 template<
typename MT3
407 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
409 const size_t M( A.rows() );
410 const size_t N( B.columns() );
411 const size_t K( A.columns() );
413 for(
size_t i=0UL; i<M; ++i ) {
// Initialization pass: C(i,:) = A(i,0) * B(0,:).
414 for(
size_t j=0UL; j<N; ++j ) {
415 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulation passes for the remaining inner indices.
417 for(
size_t k=1UL; k<K; ++k ) {
418 for(
size_t j=0UL; j<N; ++j ) {
419 C(i,j) += A(i,k) * B(k,j);
// Vectorized-path overload for a DenseMatrix<MT3,false> target. It builds a
// temporary of one operand in that operand's OppositeType:
//  - exactly one operand type resizable: the non-resizable operand is converted;
//  - otherwise: B is converted when it has no more elements than A, else A.
// NOTE(review): the statements that use `tmp` are not visible in this excerpt.
441 template<
typename MT3
444 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
445 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
450 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
451 const typename MT5::OppositeType tmp( B );
454 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
455 const typename MT4::OppositeType tmp( A );
// Neither or both resizable: pick the operand with fewer elements (ties go to B).
458 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
459 const typename MT5::OppositeType tmp( B );
463 const typename MT4::OppositeType tmp( A );
// Vectorized-path overload for a DenseMatrix<MT3,true> target, computing
// C = A * B. Rows are handled in tiers of 8, 4 and 2 packets of IT::size
// elements (plus a last tier whose loop header is not visible here). Products
// are accumulated in xmm variables and written to C with store().
// NOTE(review): M is taken from A.spacing() rather than A.rows() - presumably
// so that whole packets including padding are processed; confirm.
// NOTE(review): the xmm accumulators are declared without an initializer in
// the visible lines; this relies on IntrinsicType default construction -
// confirm that it yields zero.
// NOTE(review): declarations of i, j, a1..a4, b1, b2 are not visible here.
484 template<
typename MT3
487 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
488 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
490 typedef IntrinsicTrait<ElementType> IT;
492 const size_t M( A.spacing() );
493 const size_t N( B.columns() );
494 const size_t K( A.columns() );
// Tier 1: eight packets of rows per iteration, one column of B at a time.
498 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
499 for(
size_t j=0UL; j<N; ++j ) {
500 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
501 for(
size_t k=0UL; k<K; ++k ) {
503 xmm1 = xmm1 + A.get(i ,k) * b1;
504 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
505 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
506 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
507 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
508 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
509 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
510 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
512 store( &(~C)(i ,j), xmm1 );
513 store( &(~C)(i+IT::size ,j), xmm2 );
514 store( &(~C)(i+IT::size*2UL,j), xmm3 );
515 store( &(~C)(i+IT::size*3UL,j), xmm4 );
516 store( &(~C)(i+IT::size*4UL,j), xmm5 );
517 store( &(~C)(i+IT::size*5UL,j), xmm6 );
518 store( &(~C)(i+IT::size*6UL,j), xmm7 );
519 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Tier 2: four packets of rows per iteration, two columns of B at a time.
522 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
524 for( ; (j+2UL) <= N; j+=2UL ) {
525 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
526 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
533 xmm1 = xmm1 + a1 * b1;
534 xmm2 = xmm2 + a2 * b1;
535 xmm3 = xmm3 + a3 * b1;
536 xmm4 = xmm4 + a4 * b1;
537 xmm5 = xmm5 + a1 * b2;
538 xmm6 = xmm6 + a2 * b2;
539 xmm7 = xmm7 + a3 * b2;
540 xmm8 = xmm8 + a4 * b2;
542 store( &(~C)(i ,j ), xmm1 );
543 store( &(~C)(i+IT::size ,j ), xmm2 );
544 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
545 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
546 store( &(~C)(i ,j+1UL), xmm5 );
547 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
548 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
549 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single remaining column of tier 2 (its guard is not visible in this excerpt).
553 for(
size_t k=0UL; k<K; ++k ) {
555 xmm1 = xmm1 + A.get(i ,k) * b1;
556 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
557 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
558 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
560 store( &(~C)(i ,j), xmm1 );
561 store( &(~C)(i+IT::size ,j), xmm2 );
562 store( &(~C)(i+IT::size*2UL,j), xmm3 );
563 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Tier 3: two packets of rows per iteration, two columns of B at a time.
566 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
568 for( ; (j+2UL) <= N; j+=2UL ) {
570 for(
size_t k=0UL; k<K; ++k ) {
575 xmm1 = xmm1 + a1 * b1;
576 xmm2 = xmm2 + a2 * b1;
577 xmm3 = xmm3 + a1 * b2;
578 xmm4 = xmm4 + a2 * b2;
580 store( &(~C)(i ,j ), xmm1 );
581 store( &(~C)(i+IT::size,j ), xmm2 );
582 store( &(~C)(i ,j+1UL), xmm3 );
583 store( &(~C)(i+IT::size,j+1UL), xmm4 );
// Single remaining column of tier 3 (its guard is not visible in this excerpt).
587 for(
size_t k=0UL; k<K; ++k ) {
589 xmm1 = xmm1 + A.get(i ,k) * b1;
590 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
592 store( &(~C)(i ,j), xmm1 );
593 store( &(~C)(i+IT::size,j), xmm2 );
// Last tier: one packet of rows, two columns of B at a time; each scalar
// B(k,j) is turned into a packet with set(). The enclosing row loop header is
// not visible in this excerpt.
598 for( ; (j+2UL) <= N; j+=2UL ) {
600 for(
size_t k=0UL; k<K; ++k ) {
602 xmm1 = xmm1 + a1 *
set( B(k,j ) );
603 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
605 store( &(~C)(i,j ), xmm1 );
606 store( &(~C)(i,j+1UL), xmm2 );
// Single remaining column of the last tier (guard not visible in this excerpt).
610 for(
size_t k=0UL; k<K; ++k ) {
611 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
613 store( &(~C)(i,j), xmm1 );
// BLAS-dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5> holds:
// it simply forwards to selectDefaultAssignKernel with the same arguments.
634 template<
typename MT3
637 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
638 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
640 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for C = A * B, enabled when UseSinglePrecisionKernel<MT3,MT4,MT5>
// holds. Forwards to cblas_sgemm with alpha = 1 and beta = 0, so the previous
// content of C does not contribute to the result.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
660 template<
typename MT3
663 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
664 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
666 using boost::numeric_cast;
672 const int M ( numeric_cast<int>( A.rows() ) );
673 const int N ( numeric_cast<int>( B.columns() ) );
674 const int K ( numeric_cast<int>( A.columns() ) );
675 const int lda( numeric_cast<int>( A.spacing() ) );
676 const int ldb( numeric_cast<int>( B.spacing() ) );
677 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags follow the target type MT3: a row-major
// target uses CblasRowMajor with both operands flagged CblasTrans, any other
// target uses CblasColMajor with CblasNoTrans.
679 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
680 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
681 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
682 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when UseDoublePrecisionKernel<MT3,MT4,MT5>
// holds. Forwards to cblas_dgemm with alpha = 1 and beta = 0, so the previous
// content of C does not contribute to the result.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
703 template<
typename MT3
706 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
707 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
709 using boost::numeric_cast;
715 const int M ( numeric_cast<int>( A.rows() ) );
716 const int N ( numeric_cast<int>( B.columns() ) );
717 const int K ( numeric_cast<int>( A.columns() ) );
718 const int lda( numeric_cast<int>( A.spacing() ) );
719 const int ldb( numeric_cast<int>( B.spacing() ) );
720 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags follow the target type MT3: a row-major
// target uses CblasRowMajor with both operands flagged CblasTrans, any other
// target uses CblasColMajor with CblasNoTrans.
722 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
723 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
724 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
725 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when all three element types are
// complex<float> (UseSinglePrecisionComplexKernel). Forwards to cblas_cgemm
// with alpha = (1,0) and beta = (0,0); both scalars are passed by address.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
746 template<
typename MT3
749 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
750 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
752 using boost::numeric_cast;
761 const int M ( numeric_cast<int>( A.rows() ) );
762 const int N ( numeric_cast<int>( B.columns() ) );
763 const int K ( numeric_cast<int>( A.columns() ) );
764 const int lda( numeric_cast<int>( A.spacing() ) );
765 const int ldb( numeric_cast<int>( B.spacing() ) );
766 const int ldc( numeric_cast<int>( C.spacing() ) );
// NOTE(review): alpha/beta are non-const here, while the add-assign and
// sub-assign complex kernels in this file declare them const.
767 complex<float> alpha( 1.0F, 0.0F );
768 complex<float> beta ( 0.0F, 0.0F );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
770 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
771 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
772 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
773 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when all three element types are
// complex<double> (UseDoublePrecisionComplexKernel). Forwards to cblas_zgemm
// with alpha = (1,0) and beta = (0,0); both scalars are passed by address.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
794 template<
typename MT3
797 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
798 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
800 using boost::numeric_cast;
809 const int M ( numeric_cast<int>( A.rows() ) );
810 const int N ( numeric_cast<int>( B.columns() ) );
811 const int K ( numeric_cast<int>( A.columns() ) );
812 const int lda( numeric_cast<int>( A.spacing() ) );
813 const int ldb( numeric_cast<int>( B.spacing() ) );
814 const int ldc( numeric_cast<int>( C.spacing() ) );
// NOTE(review): alpha/beta are non-const here, while the add-assign and
// sub-assign complex kernels in this file declare them const.
815 complex<double> alpha( 1.0, 0.0 );
816 complex<double> beta ( 0.0, 0.0 );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
818 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
819 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
820 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
821 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a function that first evaluates the expression `rhs` into a
// temporary. The temporary's type is picked by SelectType from ResultType and
// OppositeType depending on SO.
// NOTE(review): the signature and the use of `tmp` are not visible here.
840 template<
typename MT
846 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
858 const TmpType tmp( rhs );
// Fragment of the addition assignment of a product expression to `lhs`.
// NOTE(review): the signature and the definitions of A and B are not visible.
877 template<
typename MT
// Branch for an empty target or a zero inner dimension (body not visible here).
886 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Kernel dispatch. Presumably these two calls are alternatives; the condition
// that selects between them is not visible in this excerpt.
901 TDMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
903 TDMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Plain-loop kernel computing C += A * B; used when UseVectorizedDefaultKernel
// does not hold (DisableIf). The column loop is unrolled by two.
922 template<
typename MT3
925 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
926 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
928 const size_t M( A.rows() );
929 const size_t N( B.columns() );
930 const size_t K( A.columns() );
// end = N with its lowest bit cleared, i.e. N rounded down to an even count.
933 const size_t end( N &
size_t(-2) );
935 for(
size_t i=0UL; i<M; ++i ) {
936 for(
size_t k=0UL; k<K; ++k ) {
937 for(
size_t j=0UL; j<end; j+=2UL ) {
938 C(i,j ) += A(i,k) * B(k,j );
939 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of column `end`, the one left over when N is odd.
// NOTE(review): the guard around this statement is not visible in this excerpt.
942 C(i,end) += A(i,k) * B(k,end);
// Vectorized-path overload for a DenseMatrix<MT3,false> target. It builds a
// temporary of one operand in that operand's OppositeType:
//  - exactly one operand type resizable: the non-resizable operand is converted;
//  - otherwise: B is converted when it has no more elements than A, else A.
// NOTE(review): the statements that use `tmp` are not visible in this excerpt.
964 template<
typename MT3
967 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
968 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
973 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
974 const typename MT5::OppositeType tmp( B );
977 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
978 const typename MT4::OppositeType tmp( A );
// Neither or both resizable: pick the operand with fewer elements (ties go to B).
981 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
982 const typename MT5::OppositeType tmp( B );
986 const typename MT4::OppositeType tmp( A );
// Vectorized-path overload for a DenseMatrix<MT3,true> target of the addition
// assignment kernel. Rows are handled in tiers of 8, 4 and 2 packets of
// IT::size elements (plus a last tier whose loop header is not visible here).
// Products are added onto xmm variables which are then written to C with store().
// NOTE(review): M is taken from A.spacing() rather than A.rows() - presumably
// so that whole packets including padding are processed; confirm.
// NOTE(review): the declarations of the xmm variables are not visible in this
// excerpt; presumably they start from the current content of C - confirm.
1007 template<
typename MT3
1010 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1011 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1013 typedef IntrinsicTrait<ElementType> IT;
1015 const size_t M( A.spacing() );
1016 const size_t N( B.columns() );
1017 const size_t K( A.columns() );
// Tier 1: eight packets of rows per iteration, one column of B at a time.
1021 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1022 for(
size_t j=0UL; j<N; ++j ) {
1031 for(
size_t k=0UL; k<K; ++k ) {
1033 xmm1 = xmm1 + A.get(i ,k) * b1;
1034 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1035 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1036 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1037 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1038 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1039 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1040 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1042 store( &(~C)(i ,j), xmm1 );
1043 store( &(~C)(i+IT::size ,j), xmm2 );
1044 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1045 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1046 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1047 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1048 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1049 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Tier 2: four packets of rows per iteration, two columns of B at a time.
1052 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1054 for( ; (j+2UL) <= N; j+=2UL ) {
1063 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
1070 xmm1 = xmm1 + a1 * b1;
1071 xmm2 = xmm2 + a2 * b1;
1072 xmm3 = xmm3 + a3 * b1;
1073 xmm4 = xmm4 + a4 * b1;
1074 xmm5 = xmm5 + a1 * b2;
1075 xmm6 = xmm6 + a2 * b2;
1076 xmm7 = xmm7 + a3 * b2;
1077 xmm8 = xmm8 + a4 * b2;
1079 store( &(~C)(i ,j ), xmm1 );
1080 store( &(~C)(i+IT::size ,j ), xmm2 );
1081 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1082 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1083 store( &(~C)(i ,j+1UL), xmm5 );
1084 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1085 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1086 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single remaining column of tier 2 (its guard is not visible in this excerpt).
1093 for(
size_t k=0UL; k<K; ++k ) {
1095 xmm1 = xmm1 + A.get(i ,k) * b1;
1096 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1097 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1098 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1100 store( &(~C)(i ,j), xmm1 );
1101 store( &(~C)(i+IT::size ,j), xmm2 );
1102 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1103 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Tier 3: two packets of rows per iteration, two columns of B at a time.
1106 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1108 for( ; (j+2UL) <= N; j+=2UL ) {
1113 for(
size_t k=0UL; k<K; ++k ) {
1118 xmm1 = xmm1 + a1 * b1;
1119 xmm2 = xmm2 + a2 * b1;
1120 xmm3 = xmm3 + a1 * b2;
1121 xmm4 = xmm4 + a2 * b2;
1123 store( &(~C)(i ,j ), xmm1 );
1124 store( &(~C)(i+IT::size,j ), xmm2 );
1125 store( &(~C)(i ,j+1UL), xmm3 );
1126 store( &(~C)(i+IT::size,j+1UL), xmm4 );
// Single remaining column of tier 3 (its guard is not visible in this excerpt).
1131 for(
size_t k=0UL; k<K; ++k ) {
1133 xmm1 = xmm1 + A.get(i ,k) * b1;
1134 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1136 store( &(~C)(i ,j), xmm1 );
1137 store( &(~C)(i+IT::size,j), xmm2 );
// Last tier: one packet of rows, two columns of B at a time; each scalar
// B(k,j) is turned into a packet with set(). The enclosing row loop header is
// not visible in this excerpt.
1142 for( ; (j+2UL) <= N; j+=2UL ) {
1145 for(
size_t k=0UL; k<K; ++k ) {
1147 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1148 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1150 store( &(~C)(i,j ), xmm1 );
1151 store( &(~C)(i,j+1UL), xmm2 );
// Single remaining column of the last tier (guard not visible in this excerpt).
1155 for(
size_t k=0UL; k<K; ++k ) {
1156 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
1158 store( &(~C)(i,j), xmm1 );
// BLAS-dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5> holds:
// it simply forwards to selectDefaultAddAssignKernel with the same arguments.
1179 template<
typename MT3
1182 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1183 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1185 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B, enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5> holds. Forwards to cblas_sgemm with
// alpha = 1 and beta = 1, so the product is added to the existing content of C.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1205 template<
typename MT3
1208 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1209 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1211 using boost::numeric_cast;
1217 const int M ( numeric_cast<int>( A.rows() ) );
1218 const int N ( numeric_cast<int>( B.columns() ) );
1219 const int K ( numeric_cast<int>( A.columns() ) );
1220 const int lda( numeric_cast<int>( A.spacing() ) );
1221 const int ldb( numeric_cast<int>( B.spacing() ) );
1222 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1224 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1225 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1226 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1227 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5> holds. Forwards to cblas_dgemm with
// alpha = 1 and beta = 1, so the product is added to the existing content of C.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1248 template<
typename MT3
1251 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1252 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1254 using boost::numeric_cast;
1260 const int M ( numeric_cast<int>( A.rows() ) );
1261 const int N ( numeric_cast<int>( B.columns() ) );
1262 const int K ( numeric_cast<int>( A.columns() ) );
1263 const int lda( numeric_cast<int>( A.spacing() ) );
1264 const int ldb( numeric_cast<int>( B.spacing() ) );
1265 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1267 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1268 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1269 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1270 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when all three element types are
// complex<float> (UseSinglePrecisionComplexKernel). Forwards to cblas_cgemm
// with alpha = (1,0) and beta = (1,0); both scalars are passed by address.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1291 template<
typename MT3
1294 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1295 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1297 using boost::numeric_cast;
1306 const int M ( numeric_cast<int>( A.rows() ) );
1307 const int N ( numeric_cast<int>( B.columns() ) );
1308 const int K ( numeric_cast<int>( A.columns() ) );
1309 const int lda( numeric_cast<int>( A.spacing() ) );
1310 const int ldb( numeric_cast<int>( B.spacing() ) );
1311 const int ldc( numeric_cast<int>( C.spacing() ) );
1312 const complex<float> alpha( 1.0F, 0.0F );
1313 const complex<float> beta ( 1.0F, 0.0F );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1315 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1316 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1317 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1318 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when all three element types are
// complex<double> (UseDoublePrecisionComplexKernel). Forwards to cblas_zgemm
// with alpha = (1,0) and beta = (1,0); both scalars are passed by address.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1339 template<
typename MT3
1342 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1343 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1345 using boost::numeric_cast;
1354 const int M ( numeric_cast<int>( A.rows() ) );
1355 const int N ( numeric_cast<int>( B.columns() ) );
1356 const int K ( numeric_cast<int>( A.columns() ) );
1357 const int lda( numeric_cast<int>( A.spacing() ) );
1358 const int ldb( numeric_cast<int>( B.spacing() ) );
1359 const int ldc( numeric_cast<int>( C.spacing() ) );
1360 const complex<double> alpha( 1.0, 0.0 );
1361 const complex<double> beta ( 1.0, 0.0 );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1363 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1364 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1365 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1366 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of the subtraction assignment of a product expression to `lhs`.
// NOTE(review): the signature and the definitions of A and B are not visible.
1389 template<
typename MT
// Branch for an empty target or a zero inner dimension (body not visible here).
1398 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Kernel dispatch. Presumably these two calls are alternatives; the condition
// that selects between them is not visible in this excerpt.
1413 TDMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1415 TDMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Plain-loop kernel computing C -= A * B; used when UseVectorizedDefaultKernel
// does not hold (DisableIf). The column loop is unrolled by two.
1434 template<
typename MT3
1437 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1438 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1440 const size_t M( A.rows() );
1441 const size_t N( B.columns() );
1442 const size_t K( A.columns() );
// end = N with its lowest bit cleared, i.e. N rounded down to an even count.
1445 const size_t end( N &
size_t(-2) );
1447 for(
size_t i=0UL; i<M; ++i ) {
1448 for(
size_t k=0UL; k<K; ++k ) {
1449 for(
size_t j=0UL; j<end; j+=2UL ) {
1450 C(i,j ) -= A(i,k) * B(k,j );
1451 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of column `end`, the one left over when N is odd.
// NOTE(review): the guard around this statement is not visible in this excerpt.
1454 C(i,end) -= A(i,k) * B(k,end);
// Vectorized-path overload for a DenseMatrix<MT3,false> target. It builds a
// temporary of one operand in that operand's OppositeType:
//  - exactly one operand type resizable: the non-resizable operand is converted;
//  - otherwise: B is converted when it has no more elements than A, else A.
// NOTE(review): the statements that use `tmp` are not visible in this excerpt.
1476 template<
typename MT3
1479 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1480 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1485 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1486 const typename MT5::OppositeType tmp( B );
1489 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1490 const typename MT4::OppositeType tmp( A );
// Neither or both resizable: pick the operand with fewer elements (ties go to B).
1493 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1494 const typename MT5::OppositeType tmp( B );
1498 const typename MT4::OppositeType tmp( A );
// Vectorized-path overload for a DenseMatrix<MT3,true> target of the
// subtraction assignment kernel. Rows are handled in tiers of 8, 4 and 2
// packets of IT::size elements (plus a last tier whose loop header is not
// visible here). Products are subtracted from xmm variables which are then
// written to C with store().
// NOTE(review): M is taken from A.spacing() rather than A.rows() - presumably
// so that whole packets including padding are processed; confirm.
// NOTE(review): the declarations of the xmm variables are not visible in this
// excerpt; presumably they start from the current content of C - confirm.
1519 template<
typename MT3
1522 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1523 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1525 typedef IntrinsicTrait<ElementType> IT;
1527 const size_t M( A.spacing() );
1528 const size_t N( B.columns() );
1529 const size_t K( A.columns() );
// Tier 1: eight packets of rows per iteration, one column of B at a time.
1533 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1534 for(
size_t j=0UL; j<N; ++j ) {
1543 for(
size_t k=0UL; k<K; ++k ) {
1545 xmm1 = xmm1 - A.get(i ,k) * b1;
1546 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1547 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1548 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1549 xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1550 xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1551 xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1552 xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1554 store( &(~C)(i ,j), xmm1 );
1555 store( &(~C)(i+IT::size ,j), xmm2 );
1556 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1557 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1558 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1559 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1560 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1561 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Tier 2: four packets of rows per iteration, two columns of B at a time.
1564 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1566 for( ; (j+2UL) <= N; j+=2UL ) {
1575 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
1582 xmm1 = xmm1 - a1 * b1;
1583 xmm2 = xmm2 - a2 * b1;
1584 xmm3 = xmm3 - a3 * b1;
1585 xmm4 = xmm4 - a4 * b1;
1586 xmm5 = xmm5 - a1 * b2;
1587 xmm6 = xmm6 - a2 * b2;
1588 xmm7 = xmm7 - a3 * b2;
1589 xmm8 = xmm8 - a4 * b2;
1591 store( &(~C)(i ,j ), xmm1 );
1592 store( &(~C)(i+IT::size ,j ), xmm2 );
1593 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1594 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1595 store( &(~C)(i ,j+1UL), xmm5 );
1596 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1597 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1598 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single remaining column of tier 2 (its guard is not visible in this excerpt).
1605 for(
size_t k=0UL; k<K; ++k ) {
1607 xmm1 = xmm1 - A.get(i ,k) * b1;
1608 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1609 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1610 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1612 store( &(~C)(i ,j), xmm1 );
1613 store( &(~C)(i+IT::size ,j), xmm2 );
1614 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1615 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Tier 3: two packets of rows per iteration, two columns of B at a time.
1618 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1620 for( ; (j+2UL) <= N; j+=2UL ) {
1625 for(
size_t k=0UL; k<K; ++k ) {
1630 xmm1 = xmm1 - a1 * b1;
1631 xmm2 = xmm2 - a2 * b1;
1632 xmm3 = xmm3 - a1 * b2;
1633 xmm4 = xmm4 - a2 * b2;
1635 store( &(~C)(i ,j ), xmm1 );
1636 store( &(~C)(i+IT::size,j ), xmm2 );
1637 store( &(~C)(i ,j+1UL), xmm3 );
1638 store( &(~C)(i+IT::size,j+1UL), xmm4 );
// Single remaining column of tier 3 (its guard is not visible in this excerpt).
1643 for(
size_t k=0UL; k<K; ++k ) {
1645 xmm1 = xmm1 - A.get(i ,k) * b1;
1646 xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
1648 store( &(~C)(i ,j), xmm1 );
1649 store( &(~C)(i+IT::size,j), xmm2 );
// Last tier: one packet of rows, two columns of B at a time; each scalar
// B(k,j) is turned into a packet with set(). The enclosing row loop header is
// not visible in this excerpt.
1654 for( ; (j+2UL) <= N; j+=2UL ) {
1657 for(
size_t k=0UL; k<K; ++k ) {
1659 xmm1 = xmm1 - a1 *
set( B(k,j ) );
1660 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
1662 store( &(~C)(i,j ), xmm1 );
1663 store( &(~C)(i,j+1UL), xmm2 );
// Single remaining column of the last tier (guard not visible in this excerpt).
1667 for(
size_t k=0UL; k<K; ++k ) {
1668 xmm1 = xmm1 - A.get(i,k) *
set( B(k,j) );
1670 store( &(~C)(i,j), xmm1 );
// BLAS-dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5> holds:
// it simply forwards to selectDefaultSubAssignKernel with the same arguments.
1691 template<
typename MT3
1694 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1695 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1697 selectDefaultSubAssignKernel( C, A, B );
// BLAS kernel for C -= A * B, enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5> holds. Forwards to cblas_sgemm with
// alpha = -1 and beta = 1, so the product is subtracted from the existing C.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1717 template<
typename MT3
1720 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1721 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1723 using boost::numeric_cast;
1729 const int M ( numeric_cast<int>( A.rows() ) );
1730 const int N ( numeric_cast<int>( B.columns() ) );
1731 const int K ( numeric_cast<int>( A.columns() ) );
1732 const int lda( numeric_cast<int>( A.spacing() ) );
1733 const int ldb( numeric_cast<int>( B.spacing() ) );
1734 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1736 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1737 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1738 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1739 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5> holds. Forwards to cblas_dgemm with
// alpha = -1 and beta = 1, so the product is subtracted from the existing C.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1760 template<
typename MT3
1763 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1764 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1766 using boost::numeric_cast;
1772 const int M ( numeric_cast<int>( A.rows() ) );
1773 const int N ( numeric_cast<int>( B.columns() ) );
1774 const int K ( numeric_cast<int>( A.columns() ) );
1775 const int lda( numeric_cast<int>( A.spacing() ) );
1776 const int ldb( numeric_cast<int>( B.spacing() ) );
1777 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1779 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1780 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1781 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1782 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled when all three element types are
// complex<float> (UseSinglePrecisionComplexKernel). Forwards to cblas_cgemm
// with alpha = (-1,0) and beta = (1,0); both scalars are passed by address.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1803 template<
typename MT3
1806 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1807 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1809 using boost::numeric_cast;
1818 const int M ( numeric_cast<int>( A.rows() ) );
1819 const int N ( numeric_cast<int>( B.columns() ) );
1820 const int K ( numeric_cast<int>( A.columns() ) );
1821 const int lda( numeric_cast<int>( A.spacing() ) );
1822 const int ldb( numeric_cast<int>( B.spacing() ) );
1823 const int ldc( numeric_cast<int>( C.spacing() ) );
1824 const complex<float> alpha( -1.0F, 0.0F );
1825 const complex<float> beta ( 1.0F, 0.0F );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1827 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1828 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1829 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1830 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C -= A * B, enabled when all three element types are
// complex<double> (UseDoublePrecisionComplexKernel). Forwards to cblas_zgemm
// with alpha = (-1,0) and beta = (1,0); both scalars are passed by address.
// Sizes and leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is a range-checked conversion.
1851 template<
typename MT3
1854 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1855 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1857 using boost::numeric_cast;
1866 const int M ( numeric_cast<int>( A.rows() ) );
1867 const int N ( numeric_cast<int>( B.columns() ) );
1868 const int K ( numeric_cast<int>( A.columns() ) );
1869 const int lda( numeric_cast<int>( A.spacing() ) );
1870 const int ldb( numeric_cast<int>( B.spacing() ) );
1871 const int ldc( numeric_cast<int>( C.spacing() ) );
1872 const complex<double> alpha( -1.0, 0.0 );
1873 const complex<double> beta ( 1.0, 0.0 );
// Layout and both transpose flags follow the target type MT3 (row-major target:
// CblasRowMajor + CblasTrans for both operands; otherwise CblasColMajor + CblasNoTrans).
1875 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1876 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1877 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1878 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Head of a class whose base is DenseMatrix instantiated on
// DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, i.e. a matrix
// product scaled by a value of type ST. It additionally derives privately from
// Expression and Computation.
// NOTE(review): the remaining template parameters and the class-key line are
// not visible in this excerpt.
1924 template<
typename MT1
1928 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
1929 ,
private Expression
1930 ,
private Computation
// MMM names the wrapped matrix product expression, RES its result type; the
// remaining aliases shorten nested types of the two operand matrix types.
1934 typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
1935 typedef typename MMM::ResultType RES;
1936 typedef typename MT1::ResultType
RT1;
1937 typedef typename MT2::ResultType
RT2;
1938 typedef typename MT1::CompositeType
CT1;
1939 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate: `value` is set only when IsFloat holds for the
// ElementType of each of T1, T2 and T3 and IsComplex does not hold for the
// fourth type T4.
1947 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1948 struct UseSinglePrecisionKernel {
1949 enum { value = IsFloat<typename T1::ElementType>::value &&
1950 IsFloat<typename T2::ElementType>::value &&
1951 IsFloat<typename T3::ElementType>::value &&
1952 !IsComplex<T4>::value };
// Compile-time predicate: `value` is set only when IsDouble holds for the
// ElementType of each of T1, T2 and T3 and IsComplex does not hold for the
// fourth type T4.
1961 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1962 struct UseDoublePrecisionKernel {
1963 enum { value = IsDouble<typename T1::ElementType>::value &&
1964 IsDouble<typename T2::ElementType>::value &&
1965 IsDouble<typename T3::ElementType>::value &&
1966 !IsComplex<T4>::value };
// Compile-time predicate: `value` is set only when IsSame reports that the
// ElementType of each of T1, T2 and T3 is exactly complex<float>.
1975 template<
typename T1,
typename T2,
typename T3 >
1976 struct UseSinglePrecisionComplexKernel {
1977 typedef complex<float> Type;
1978 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1979 IsSame<typename T2::ElementType,Type>::value &&
1980 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: `value` is set only when IsSame reports that the
// ElementType of each of T1, T2 and T3 is exactly complex<double>.
1989 template<
typename T1,
typename T2,
typename T3 >
1990 struct UseDoublePrecisionComplexKernel {
1991 typedef complex<double> Type;
1992 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1993 IsSame<typename T2::ElementType,Type>::value &&
1994 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS code path: `value` is set when
// BLAZE_BLAS_MODE evaluates to zero, or when none of the four
// precision-specific kernel predicates holds. The scalar type T4 is only
// forwarded to the two real-valued predicates.
2002 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2003 struct UseDefaultKernel {
2004 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2005 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2006 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2007 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the intrinsics-based default kernels. `value` is
// set only when all three matrix types flag themselves `vectorizable`, all
// three share the same ElementType, the scalar type T4 is that same type, and
// IntrinsicTrait reports both `addition` and `multiplication` for it.
2015 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2016 struct UseVectorizedDefaultKernel {
2017 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2018 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2019 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2020 IsSame<typename T1::ElementType,T4>::value &&
2021 IntrinsicTrait<typename T1::ElementType>::addition &&
2022 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type aliases of the scaled product expression.
// This: the expression type itself.
2028 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// ResultType is obtained from MultTrait applied to the product's result type
// and the scalar type; OppositeType, ElementType and IntrinsicType derive from it.
2029 typedef typename MultTrait<RES,ST>::Type
ResultType;
2030 typedef typename ResultType::OppositeType
OppositeType;
2032 typedef typename ResultType::ElementType
ElementType;
2033 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the (const) matrix product expression.
2038 typedef const TDMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// LT / RT: types selected by SelectType for the two matrix operands, keyed on
// whether the operand type is a computation (IsComputation): the candidates
// are the const result type and the composite type.
2044 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2047 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// The scaled product expression reports itself as not vectorizable (flag fixed to 0).
2052 enum { vectorizable = 0 };
// Constructor taking the matrix product expression and the scalar factor.
// Declared explicit, so it is not used for implicit conversions.
// NOTE(review): the member initializers and body are not visible in this excerpt.
2061 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element (i,j) of the scaled product: the product's element times the scalar.
// NOTE(review): the enclosing function signature is not visible in this excerpt.
2077 return matrix_(i,j) * scalar_;
// Returns the number of rows, forwarded from the wrapped matrix expression.
2086 inline size_t rows()
const {
2087 return matrix_.rows();
// Returns the number of columns, forwarded from the wrapped matrix expression.
2096 inline size_t columns()
const {
2097 return matrix_.columns();
// Alias query: forwards the question whether the expression can alias with the
// object at address `alias` to the wrapped matrix expression.
2127 template<
typename T >
2128 inline bool canAlias(
const T* alias )
const {
2129 return matrix_.canAlias( alias );
// Alias query: forwards the question whether the expression is aliased with
// the object at address `alias` to the wrapped matrix expression.
2139 template<
typename T >
2140 inline bool isAliased(
const T* alias )
const {
2141 return matrix_.isAliased( alias );
// Assignment of the scaled product expression `rhs` to the dense matrix `lhs`.
// NOTE(review): the definitions of A and B and parts of the control flow are
// not visible in this excerpt.
2160 template<
typename MT3
2162 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
2169 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2170 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Branch for an empty target matrix (body not visible in this excerpt).
2172 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Separate branch for a zero inner dimension (body not visible in this excerpt).
2175 else if( left.columns() == 0UL ) {
// Kernel dispatch with the scalar passed along. Presumably these two calls are
// alternatives; the selecting condition is not visible in this excerpt.
2191 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2193 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Plain-loop kernel for the scaled product, used when
// UseVectorizedDefaultKernel does not hold (DisableIf).
// Note the index naming in this kernel: j runs over the inner dimension
// (A.columns()) and k over the columns of B.
// For every row i, row i of C is first overwritten with the j = 0 term, the
// remaining terms are accumulated, and a final pass over the columns follows.
// A(i,0UL) and B(0UL,k) are read unconditionally, so A.columns() >= 1 is assumed.
2211 template<
typename MT3
2215 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2216 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2218 for(
size_t i=0UL; i<A.rows(); ++i ) {
2219 for(
size_t k=0UL; k<B.columns(); ++k ) {
2220 C(i,k) = A(i,0UL) * B(0UL,k);
2222 for(
size_t j=1UL; j<A.columns(); ++j ) {
2223 for(
size_t k=0UL; k<B.columns(); ++k ) {
2224 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i. NOTE(review): its body is not visible in this
// excerpt; presumably it applies `scalar` to C(i,k) - confirm.
2227 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default assignment kernel for DenseMatrix<MT3,false> targets.
// One operand is copied into its OppositeType and the assignment is re-issued
// on the resulting product expression.
2248 template<
typename MT3
2252 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2253 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: convert B.
2258 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2259 const typename MT5::OppositeType tmp( B );
2260 assign( ~C, A * tmp * scalar );
// Only B is resizable: convert A.
2262 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2263 const typename MT4::OppositeType tmp( A );
2264 assign( ~C, tmp * B * scalar );
// Otherwise convert the operand with fewer elements (B on a tie).
2266 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2267 const typename MT5::OppositeType tmp( B );
2268 assign( ~C, A * tmp * scalar );
// Remaining case (A has fewer elements than B): convert A.
2271 const typename MT4::OppositeType tmp( A );
2272 assign( ~C, tmp * B * scalar );
// Vectorized default assignment kernel for DenseMatrix<MT3,true> targets.
// Works on intrinsic-sized chunks along the first index of A and C, in tiers of
// 8, 4 and 2 chunks followed by a remainder tier.
// Declarations of 'i', 'j', 'b1', 'b2', 'a1'..'a4' and 'factor' are on lines
// not shown in this excerpt.
// NOTE(review): 'factor' is presumably the scalar broadcast to an intrinsic
// value - confirm against the missing declaration.
2291 template<
typename MT3
2295 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2296 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2298 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is taken from A.spacing() rather than A.rows() - presumably
// to cover padded storage; confirm.
2300 const size_t M( A.spacing() );
2301 const size_t N( B.columns() );
2302 const size_t K( A.columns() );
// Tier 1: eight intrinsic chunks of i per step, one column j at a time.
2308 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2309 for(
size_t j=0UL; j<N; ++j ) {
2310 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate over the inner dimension k.
2311 for(
size_t k=0UL; k<K; ++k ) {
2313 xmm1 = xmm1 + A.get(i ,k) * b1;
2314 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2315 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2316 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2317 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2318 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2319 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2320 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Write the accumulators, multiplied by 'factor', into C.
2322 store( &(~C)(i ,j), xmm1 * factor );
2323 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2324 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2325 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2326 store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2327 store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2328 store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2329 store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
// Tier 2: four intrinsic chunks of i per step, two columns j at a time.
2332 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2334 for( ; (j+2UL) <= N; j+=2UL ) {
2335 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2336 for(
size_t k=0UL; k<K; ++k ) {
// xmm1-4 accumulate column j (b1), xmm5-8 accumulate column j+1 (b2).
2343 xmm1 = xmm1 + a1 * b1;
2344 xmm2 = xmm2 + a2 * b1;
2345 xmm3 = xmm3 + a3 * b1;
2346 xmm4 = xmm4 + a4 * b1;
2347 xmm5 = xmm5 + a1 * b2;
2348 xmm6 = xmm6 + a2 * b2;
2349 xmm7 = xmm7 + a3 * b2;
2350 xmm8 = xmm8 + a4 * b2;
2352 store( &(~C)(i ,j ), xmm1 * factor );
2353 store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2354 store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2355 store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2356 store( &(~C)(i ,j+1UL), xmm5 * factor );
2357 store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2358 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2359 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
// Tier 2, single-column part (the guarding condition is not shown here).
2363 for(
size_t k=0UL; k<K; ++k ) {
2365 xmm1 = xmm1 + A.get(i ,k) * b1;
2366 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2367 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2368 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2370 store( &(~C)(i ,j), xmm1 * factor );
2371 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2372 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2373 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
// Tier 3: two intrinsic chunks of i per step, two columns j at a time.
2376 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2378 for( ; (j+2UL) <= N; j+=2UL ) {
2380 for(
size_t k=0UL; k<K; ++k ) {
2385 xmm1 = xmm1 + a1 * b1;
2386 xmm2 = xmm2 + a2 * b1;
2387 xmm3 = xmm3 + a1 * b2;
2388 xmm4 = xmm4 + a2 * b2;
2390 store( &(~C)(i ,j ), xmm1 * factor );
2391 store( &(~C)(i+IT::size,j ), xmm2 * factor );
2392 store( &(~C)(i ,j+1UL), xmm3 * factor );
2393 store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
// Tier 3, single-column part (the guarding condition is not shown here).
2397 for(
size_t k=0UL; k<K; ++k ) {
2399 xmm1 = xmm1 + A.get(i ,k) * b1;
2400 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2402 store( &(~C)(i ,j), xmm1 * factor );
2403 store( &(~C)(i+IT::size,j), xmm2 * factor );
// Remainder tier: a single intrinsic chunk of i, two columns j at a time.
// Elements of B are broadcast via set().
2408 for( ; (j+2UL) <= N; j+=2UL ) {
2410 for(
size_t k=0UL; k<K; ++k ) {
2412 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2413 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2415 store( &(~C)(i,j ), xmm1 * factor );
2416 store( &(~C)(i,j+1UL), xmm2 * factor );
// Remainder tier, single-column part (the guarding condition is not shown here).
2420 for(
size_t k=0UL; k<K; ++k ) {
2421 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2423 store( &(~C)(i,j), xmm1 * factor );
// Fallback overload of the BLAS assignment kernel: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, it simply forwards all arguments to
// selectDefaultAssignKernel().
2443 template<
typename MT3
2447 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2448 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2450 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS assignment kernel, enabled via UseSinglePrecisionKernel; calls
// cblas_sgemm with alpha = scalar and beta = 0.0F.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor.
2469 template<
typename MT3
2473 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2474 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2476 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
2482 const int M ( numeric_cast<int>( A.rows() ) );
2483 const int N ( numeric_cast<int>( B.columns() ) );
2484 const int K ( numeric_cast<int>( A.columns() ) );
2485 const int lda( numeric_cast<int>( A.spacing() ) );
2486 const int ldb( numeric_cast<int>( B.spacing() ) );
2487 const int ldc( numeric_cast<int>( C.spacing() ) );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
2489 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2490 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2491 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2492 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel, enabled via UseDoublePrecisionKernel; calls
// cblas_dgemm with alpha = scalar and beta = 0.0.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor.
2512 template<
typename MT3
2516 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2517 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2519 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
2525 const int M ( numeric_cast<int>( A.rows() ) );
2526 const int N ( numeric_cast<int>( B.columns() ) );
2527 const int K ( numeric_cast<int>( A.columns() ) );
2528 const int lda( numeric_cast<int>( A.spacing() ) );
2529 const int ldb( numeric_cast<int>( B.spacing() ) );
2530 const int ldc( numeric_cast<int>( C.spacing() ) );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
2532 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2533 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2534 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2535 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel, enabled via UseSinglePrecisionComplexKernel; calls
// cblas_cgemm with alpha = scalar and beta = (0,0).
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor.
2555 template<
typename MT3
2559 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2560 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2562 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
2572 const int M ( numeric_cast<int>( A.rows() ) );
2573 const int N ( numeric_cast<int>( B.columns() ) );
2574 const int K ( numeric_cast<int>( A.columns() ) );
2575 const int lda( numeric_cast<int>( A.spacing() ) );
2576 const int ldb( numeric_cast<int>( B.spacing() ) );
2577 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives alpha and beta by address.
2578 const complex<float> alpha( scalar );
2579 const complex<float> beta ( 0.0F, 0.0F );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
2581 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2582 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2583 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2584 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel, enabled via UseDoublePrecisionComplexKernel; calls
// cblas_zgemm with alpha = scalar and beta = (0,0).
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor.
2604 template<
typename MT3
2608 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2609 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2611 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
2621 const int M ( numeric_cast<int>( A.rows() ) );
2622 const int N ( numeric_cast<int>( B.columns() ) );
2623 const int K ( numeric_cast<int>( A.columns() ) );
2624 const int lda( numeric_cast<int>( A.spacing() ) );
2625 const int ldb( numeric_cast<int>( B.spacing() ) );
2626 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives alpha and beta by address.
2627 const complex<double> alpha( scalar );
2628 const complex<double> beta ( 0.0, 0.0 );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
2630 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2631 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2632 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2633 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product to a sparse matrix.
//   lhs: target sparse matrix.
//   rhs: scaled product expression to evaluate.
2650 template<
typename MT
2652 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type chosen between ResultType and OppositeType based on SO.
2656 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Construct the temporary from the expression; the statement that uses 'tmp'
// is not shown in this excerpt.
2668 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix.
//   lhs: target dense matrix.
//   rhs: scaled product expression to add.
2685 template<
typename MT3
2687 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix/matrix product.
2694 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2695 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension (branch body not shown).
2697 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the default or the BLAS-capable kernel. A and B are declared on
// lines not shown here, as is the condition choosing between the two calls.
2712 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2714 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) addition assignment kernel; this overload is
// disabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Only the signature is visible in this excerpt; the body is not shown.
2732 template<
typename MT3
2736 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2737 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for DenseMatrix<MT3,false>
// targets. One operand is copied into its OppositeType; the statements that
// consume 'tmp' are not shown in this excerpt.
2758 template<
typename MT3
2762 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2763 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: convert B.
2768 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2769 const typename MT5::OppositeType tmp( B );
// Only B is resizable: convert A.
2772 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2773 const typename MT4::OppositeType tmp( A );
// Otherwise convert the operand with fewer elements (B on a tie).
2776 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2777 const typename MT5::OppositeType tmp( B );
// Remaining case: convert A.
2781 const typename MT4::OppositeType tmp( A );
// Vectorized default addition assignment kernel for DenseMatrix<MT3,true>
// targets. Works on intrinsic-sized chunks along the first index of A and C, in
// tiers of 8, 4 and 2 chunks followed by a remainder tier. Each result is added
// to the value loaded from C before being stored back.
// Declarations of 'i', 'j', 'b1', 'b2', 'a1'..'a4' and 'factor' are on lines
// not shown in this excerpt.
// NOTE(review): 'factor' is presumably the scalar broadcast to an intrinsic
// value - confirm against the missing declaration.
2801 template<
typename MT3
2805 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2806 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2808 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is taken from A.spacing() rather than A.rows() - presumably
// to cover padded storage; confirm.
2810 const size_t M( A.spacing() );
2811 const size_t N( B.columns() );
2812 const size_t K( A.columns() );
// Tier 1: eight intrinsic chunks of i per step, one column j at a time.
2818 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2819 for(
size_t j=0UL; j<N; ++j ) {
2820 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate over the inner dimension k.
2821 for(
size_t k=0UL; k<K; ++k ) {
2823 xmm1 = xmm1 + A.get(i ,k) * b1;
2824 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2825 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2826 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2827 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2828 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2829 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2830 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// C = C + accumulator * factor, chunk by chunk.
2832 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2833 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
2834 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
2835 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
2836 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
2837 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
2838 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
2839 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
// Tier 2: four intrinsic chunks of i per step, two columns j at a time.
2842 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2844 for( ; (j+2UL) <= N; j+=2UL ) {
2845 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2846 for(
size_t k=0UL; k<K; ++k ) {
// xmm1-4 accumulate column j (b1), xmm5-8 accumulate column j+1 (b2).
2853 xmm1 = xmm1 + a1 * b1;
2854 xmm2 = xmm2 + a2 * b1;
2855 xmm3 = xmm3 + a3 * b1;
2856 xmm4 = xmm4 + a4 * b1;
2857 xmm5 = xmm5 + a1 * b2;
2858 xmm6 = xmm6 + a2 * b2;
2859 xmm7 = xmm7 + a3 * b2;
2860 xmm8 = xmm8 + a4 * b2;
2862 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2863 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
2864 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
2865 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
2866 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
2867 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
2868 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
2869 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
// Tier 2, single-column part (the guarding condition is not shown here).
2873 for(
size_t k=0UL; k<K; ++k ) {
2875 xmm1 = xmm1 + A.get(i ,k) * b1;
2876 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2877 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2878 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2880 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2881 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
2882 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
2883 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
// Tier 3: two intrinsic chunks of i per step, two columns j at a time.
2886 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2888 for( ; (j+2UL) <= N; j+=2UL ) {
2890 for(
size_t k=0UL; k<K; ++k ) {
2895 xmm1 = xmm1 + a1 * b1;
2896 xmm2 = xmm2 + a2 * b1;
2897 xmm3 = xmm3 + a1 * b2;
2898 xmm4 = xmm4 + a2 * b2;
2900 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2901 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
2902 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
2903 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
// Tier 3, single-column part (the guarding condition is not shown here).
2907 for(
size_t k=0UL; k<K; ++k ) {
2909 xmm1 = xmm1 + A.get(i ,k) * b1;
2910 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2912 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2913 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
// Remainder tier: a single intrinsic chunk of i, two columns j at a time.
// Elements of B are broadcast via set().
2918 for( ; (j+2UL) <= N; j+=2UL ) {
2920 for(
size_t k=0UL; k<K; ++k ) {
2922 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2923 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2925 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2926 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) + xmm2 * factor );
// Remainder tier, single-column part (the guarding condition is not shown here).
2930 for(
size_t k=0UL; k<K; ++k ) {
2931 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2933 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Fallback overload of the BLAS addition assignment kernel: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, it simply forwards all arguments to
// selectDefaultAddAssignKernel().
2953 template<
typename MT3
2957 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2958 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2960 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS addition assignment kernel, enabled via UseSinglePrecisionKernel; calls
// cblas_sgemm with alpha = scalar and beta = 1.0F so the existing content of C
// is kept and the scaled product is added to it.
2979 template<
typename MT3
2983 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2984 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2986 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
2992 const int M ( numeric_cast<int>( A.rows() ) );
2993 const int N ( numeric_cast<int>( B.columns() ) );
2994 const int K ( numeric_cast<int>( A.columns() ) );
2995 const int lda( numeric_cast<int>( A.spacing() ) );
2996 const int ldb( numeric_cast<int>( B.spacing() ) );
2997 const int ldc( numeric_cast<int>( C.spacing() ) );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
2999 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3000 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3001 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3002 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition assignment kernel, enabled via UseDoublePrecisionKernel; calls
// cblas_dgemm with alpha = scalar and beta = 1.0 so the existing content of C
// is kept and the scaled product is added to it.
3022 template<
typename MT3
3026 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3027 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3029 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3035 const int M ( numeric_cast<int>( A.rows() ) );
3036 const int N ( numeric_cast<int>( B.columns() ) );
3037 const int K ( numeric_cast<int>( A.columns() ) );
3038 const int lda( numeric_cast<int>( A.spacing() ) );
3039 const int ldb( numeric_cast<int>( B.spacing() ) );
3040 const int ldc( numeric_cast<int>( C.spacing() ) );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3042 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3043 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3044 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3045 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition assignment kernel, enabled via UseSinglePrecisionComplexKernel;
// calls cblas_cgemm with alpha = scalar and beta = (1,0) so the existing content
// of C is kept and the scaled product is added to it.
3065 template<
typename MT3
3069 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3070 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3072 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3082 const int M ( numeric_cast<int>( A.rows() ) );
3083 const int N ( numeric_cast<int>( B.columns() ) );
3084 const int K ( numeric_cast<int>( A.columns() ) );
3085 const int lda( numeric_cast<int>( A.spacing() ) );
3086 const int ldb( numeric_cast<int>( B.spacing() ) );
3087 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives alpha and beta by address.
3088 const complex<float> alpha( scalar );
3089 const complex<float> beta ( 1.0F, 0.0F );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3091 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3092 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3093 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3094 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition assignment kernel, enabled via UseDoublePrecisionComplexKernel;
// calls cblas_zgemm with alpha = scalar and beta = (1,0) so the existing content
// of C is kept and the scaled product is added to it.
3114 template<
typename MT3
3118 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3119 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3121 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3131 const int M ( numeric_cast<int>( A.rows() ) );
3132 const int N ( numeric_cast<int>( B.columns() ) );
3133 const int K ( numeric_cast<int>( A.columns() ) );
3134 const int lda( numeric_cast<int>( A.spacing() ) );
3135 const int ldb( numeric_cast<int>( B.spacing() ) );
3136 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives alpha and beta by address.
3137 const complex<double> alpha( scalar );
3138 const complex<double> beta ( 1.0, 0.0 );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3140 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3141 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3142 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3143 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product to a dense matrix.
//   lhs: target dense matrix.
//   rhs: scaled product expression to subtract.
3164 template<
typename MT3
3166 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix/matrix product.
3173 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3174 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension (branch body not shown).
3176 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the default or the BLAS-capable kernel. A and B are declared on
// lines not shown here, as is the condition choosing between the two calls.
3191 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3193 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) subtraction assignment kernel; this overload is
// disabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Only the signature is visible in this excerpt; the body is not shown.
3211 template<
typename MT3
3215 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3216 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction assignment kernel for DenseMatrix<MT3,false>
// targets. One operand is copied into its OppositeType; the statements that
// consume 'tmp' are not shown in this excerpt.
3237 template<
typename MT3
3241 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3242 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: convert B.
3247 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3248 const typename MT5::OppositeType tmp( B );
// Only B is resizable: convert A.
3251 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3252 const typename MT4::OppositeType tmp( A );
// Otherwise convert the operand with fewer elements (B on a tie).
3255 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3256 const typename MT5::OppositeType tmp( B );
// Remaining case: convert A.
3260 const typename MT4::OppositeType tmp( A );
// Vectorized default subtraction assignment kernel for DenseMatrix<MT3,true>
// targets. Works on intrinsic-sized chunks along the first index of A and C, in
// tiers of 8, 4 and 2 chunks followed by a remainder tier. Each result is
// subtracted from the value loaded from C before being stored back.
// Declarations of 'i', 'j', 'b1', 'b2', 'a1'..'a4' and 'factor' are on lines
// not shown in this excerpt.
// NOTE(review): 'factor' is presumably the scalar broadcast to an intrinsic
// value - confirm against the missing declaration.
3280 template<
typename MT3
3284 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3285 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3287 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is taken from A.spacing() rather than A.rows() - presumably
// to cover padded storage; confirm.
3289 const size_t M( A.spacing() );
3290 const size_t N( B.columns() );
3291 const size_t K( A.columns() );
// Tier 1: eight intrinsic chunks of i per step, one column j at a time.
3297 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3298 for(
size_t j=0UL; j<N; ++j ) {
3299 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate over the inner dimension k.
3300 for(
size_t k=0UL; k<K; ++k ) {
3302 xmm1 = xmm1 + A.get(i ,k) * b1;
3303 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3304 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3305 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3306 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3307 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3308 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3309 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// C = C - accumulator * factor, chunk by chunk.
3311 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3312 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3313 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3314 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
3315 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
3316 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
3317 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
3318 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
// Tier 2: four intrinsic chunks of i per step, two columns j at a time.
3321 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3323 for( ; (j+2UL) <= N; j+=2UL ) {
3324 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3325 for(
size_t k=0UL; k<K; ++k ) {
// xmm1-4 accumulate column j (b1), xmm5-8 accumulate column j+1 (b2).
3332 xmm1 = xmm1 + a1 * b1;
3333 xmm2 = xmm2 + a2 * b1;
3334 xmm3 = xmm3 + a3 * b1;
3335 xmm4 = xmm4 + a4 * b1;
3336 xmm5 = xmm5 + a1 * b2;
3337 xmm6 = xmm6 + a2 * b2;
3338 xmm7 = xmm7 + a3 * b2;
3339 xmm8 = xmm8 + a4 * b2;
3341 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3342 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
3343 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
3344 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
3345 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
3346 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
3347 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
3348 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
// Tier 2, single-column part (the guarding condition is not shown here).
3352 for(
size_t k=0UL; k<K; ++k ) {
3354 xmm1 = xmm1 + A.get(i ,k) * b1;
3355 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3356 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3357 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3359 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3360 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3361 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3362 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
// Tier 3: two intrinsic chunks of i per step, two columns j at a time.
3365 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3367 for( ; (j+2UL) <= N; j+=2UL ) {
3369 for(
size_t k=0UL; k<K; ++k ) {
3374 xmm1 = xmm1 + a1 * b1;
3375 xmm2 = xmm2 + a2 * b1;
3376 xmm3 = xmm3 + a1 * b2;
3377 xmm4 = xmm4 + a2 * b2;
3379 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3380 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
3381 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
3382 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
// Tier 3, single-column part (the guarding condition is not shown here).
3386 for(
size_t k=0UL; k<K; ++k ) {
3388 xmm1 = xmm1 + A.get(i ,k) * b1;
3389 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3391 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3392 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
// Remainder tier: a single intrinsic chunk of i, two columns j at a time.
// Elements of B are broadcast via set().
3397 for( ; (j+2UL) <= N; j+=2UL ) {
3399 for(
size_t k=0UL; k<K; ++k ) {
3401 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3402 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3404 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3405 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) - xmm2 * factor );
// Remainder tier, single-column part (the guarding condition is not shown here).
3409 for(
size_t k=0UL; k<K; ++k ) {
3410 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
3412 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Fallback overload of the BLAS subtraction assignment kernel: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, it simply forwards all arguments to
// selectDefaultSubAssignKernel().
3432 template<
typename MT3
3436 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3437 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3439 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS subtraction assignment kernel, enabled via UseSinglePrecisionKernel;
// calls cblas_sgemm with alpha = -scalar and beta = 1.0F so the scaled product
// is subtracted from the existing content of C.
3458 template<
typename MT3
3462 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3463 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3465 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3471 const int M ( numeric_cast<int>( A.rows() ) );
3472 const int N ( numeric_cast<int>( B.columns() ) );
3473 const int K ( numeric_cast<int>( A.columns() ) );
3474 const int lda( numeric_cast<int>( A.spacing() ) );
3475 const int ldb( numeric_cast<int>( B.spacing() ) );
3476 const int ldc( numeric_cast<int>( C.spacing() ) );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3478 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3479 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3480 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3481 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction assignment kernel, enabled via UseDoublePrecisionKernel;
// calls cblas_dgemm with alpha = -scalar and beta = 1.0 so the scaled product
// is subtracted from the existing content of C.
3501 template<
typename MT3
3505 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3506 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3508 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3514 const int M ( numeric_cast<int>( A.rows() ) );
3515 const int N ( numeric_cast<int>( B.columns() ) );
3516 const int K ( numeric_cast<int>( A.columns() ) );
3517 const int lda( numeric_cast<int>( A.spacing() ) );
3518 const int ldb( numeric_cast<int>( B.spacing() ) );
3519 const int ldc( numeric_cast<int>( C.spacing() ) );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3521 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3522 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3523 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3524 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction assignment kernel, enabled via
// UseSinglePrecisionComplexKernel; calls cblas_cgemm with alpha = -scalar and
// beta = (1,0) so the scaled product is subtracted from the existing content of C.
3544 template<
typename MT3
3548 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3549 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3551 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3561 const int M ( numeric_cast<int>( A.rows() ) );
3562 const int N ( numeric_cast<int>( B.columns() ) );
3563 const int K ( numeric_cast<int>( A.columns() ) );
3564 const int lda( numeric_cast<int>( A.spacing() ) );
3565 const int ldb( numeric_cast<int>( B.spacing() ) );
3566 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives alpha and beta by address; alpha carries the
// negated scalar.
3567 const complex<float> alpha( -scalar );
3568 const complex<float> beta ( 1.0F, 0.0F );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3570 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3571 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3572 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3573 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction assignment kernel, enabled via
// UseDoublePrecisionComplexKernel; calls cblas_zgemm with alpha = -scalar and
// beta = (1,0) so the scaled product is subtracted from the existing content of C.
3593 template<
typename MT3
3597 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3598 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3600 using boost::numeric_cast;
// Convert dimensions and leading dimensions (spacing) to the int arguments
// expected by the CBLAS call, using boost::numeric_cast.
3610 const int M ( numeric_cast<int>( A.rows() ) );
3611 const int N ( numeric_cast<int>( B.columns() ) );
3612 const int K ( numeric_cast<int>( A.columns() ) );
3613 const int lda( numeric_cast<int>( A.spacing() ) );
3614 const int ldb( numeric_cast<int>( B.spacing() ) );
3615 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives alpha and beta by address; alpha carries the
// negated scalar.
3616 const complex<double> alpha( -scalar );
3617 const complex<double> beta ( 1.0, 0.0 );
// For a row-major target: CblasRowMajor layout with both operands flagged
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
3619 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3620 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3621 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3622 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a TDMatTDMatMultExpr<T1,T2> expression object.
// NOTE(review): the function name, its parameters and the size check guarding
// the throw are on lines not shown here - presumably the matrix multiplication
// operator; confirm.
3687 template<
typename T1
3689 inline const TDMatTDMatMultExpr<T1,T2>
// Raises std::invalid_argument with a fixed message on a size mismatch.
3695 throw std::invalid_argument(
"Matrix sizes do not match" );
// Row view of a matrix/matrix product expression.
//   dm:    the product expression.
//   index: index of the requested row.
// Returns the requested row of the left operand multiplied by the right
// operand, so the full product is not formed here.
3723 template<
typename MT1
3725 inline typename RowExprTrait< TDMatTDMatMultExpr<MT1,MT2> >::Type
3726 row(
const TDMatTDMatMultExpr<MT1,MT2>& dm,
size_t index )
3730 return row( dm.leftOperand(), index ) * dm.rightOperand();
// Column view of a matrix/matrix product expression.
//   dm:    the product expression.
//   index: index of the requested column.
// Returns the left operand multiplied by the requested column of the right
// operand, so the full product is not formed here.
3749 template<
typename MT1
3751 inline typename ColumnExprTrait< TDMatTDMatMultExpr<MT1,MT2> >::Type
3752 column(
const TDMatTDMatMultExpr<MT1,MT2>& dm,
size_t index )
3756 return dm.leftOperand() *
column( dm.rightOperand(), index );
// Trait specialization for a (matrix * matrix) product multiplied by a vector VT.
// (The struct header naming the specialization is on lines not shown here.)
// When MT1 and MT2 are dense column-major matrices and VT is a dense,
// non-transpose vector, Type is the trait result of MT1 * ( MT2 * VT );
// otherwise Type is INVALID_TYPE.
3772 template<
typename MT1,
typename MT2,
typename VT >
3777 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3778 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3779 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3780 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
3781 , INVALID_TYPE >::Type Type;
// Trait specialization for a (matrix * matrix) product multiplied by a vector VT.
// (The struct header naming the specialization is on lines not shown here.)
// When MT1 and MT2 are dense column-major matrices and VT is a sparse,
// non-transpose vector, Type is the trait result of MT1 * ( MT2 * VT ), where
// the inner product uses TDMatSVecMultExprTrait; otherwise Type is INVALID_TYPE.
3790 template<
typename MT1,
typename MT2,
typename VT >
3795 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3796 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3797 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3798 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
3799 , INVALID_TYPE >::Type Type;
// Trait specialization for a vector VT multiplied by a (matrix * matrix) product.
// (The struct header naming the specialization is on lines not shown here.)
// When VT is a dense transpose vector and MT1, MT2 are dense column-major
// matrices, Type is the trait result of ( VT * MT1 ) * MT2; otherwise Type is
// INVALID_TYPE.
3808 template<
typename VT,
typename MT1,
typename MT2 >
3813 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
3814 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3815 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
3816 ,
typename TDVecTDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3817 , INVALID_TYPE >::Type Type;
// Trait specialization for a vector VT multiplied by a (matrix * matrix) product.
// (The struct header naming the specialization is on lines not shown here.)
// When VT is a sparse transpose vector and MT1, MT2 are dense column-major
// matrices, Type is the trait result of ( VT * MT1 ) * MT2, where the inner
// product uses TSVecTDMatMultExprTrait; otherwise Type is INVALID_TYPE.
3826 template<
typename VT,
typename MT1,
typename MT2 >
3831 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
3832 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3833 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
3834 ,
typename TDVecTDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3835 , INVALID_TYPE >::Type Type;
// Trait specialization whose Type is the multiplication expression type of
// (row of const MT1) * MT2.
// NOTE(review): the struct header is not shown here - presumably the
// RowExprTrait specialization for the matrix product; confirm.
3844 template<
typename MT1,
typename MT2 >
3849 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Trait specialization whose Type is the multiplication expression type of
// MT1 * (column of const MT2).
// NOTE(review): the struct header is not shown here - presumably the
// ColumnExprTrait specialization for the matrix product; confirm.
3858 template<
typename MT1,
typename MT2 >
3863 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;