22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
89 template<
typename MT1
// Shorthand aliases for the nested types of the two operand types MT1 and MT2.
// RT1 / RT2: the operands' ResultType.
97 typedef typename MT1::ResultType
RT1;
98 typedef typename MT2::ResultType
RT2;
// ET1 / ET2: the operands' ElementType.
99 typedef typename MT1::ElementType
ET1;
100 typedef typename MT2::ElementType
ET2;
// CT1 / CT2: the operands' CompositeType.
101 typedef typename MT1::CompositeType
CT1;
102 typedef typename MT2::CompositeType
CT2;
110 template<
typename T1,
typename T2,
typename T3 >
111 struct UseSinglePrecisionKernel {
124 template<
typename T1,
typename T2,
typename T3 >
125 struct UseDoublePrecisionKernel {
// Compile-time selector for the single precision complex kernel.
// 'value' is non-zero only if IsSame reports the element types of T1, T2 and T3
// to be complex<float>. The selectBlas*Kernel overloads guarded by this trait
// call cblas_cgemm.
139 template<
typename T1,
typename T2,
typename T3 >
140 struct UseSinglePrecisionComplexKernel {
141 typedef complex<float> Type;
142 enum { value = IsSame<typename T1::ElementType,Type>::value &&
143 IsSame<typename T2::ElementType,Type>::value &&
144 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the double precision complex kernel.
// 'value' is non-zero only if IsSame reports the element types of T1, T2 and T3
// to be complex<double>. The selectBlas*Kernel overloads guarded by this trait
// call cblas_zgemm.
155 template<
typename T1,
typename T2,
typename T3 >
156 struct UseDoublePrecisionComplexKernel {
157 typedef complex<double> Type;
158 enum { value = IsSame<typename T1::ElementType,Type>::value &&
159 IsSame<typename T2::ElementType,Type>::value &&
160 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the default (non-BLAS) kernel.
// 'value' is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none of the
// four BLAS kernel selectors (single/double precision, real/complex) applies to
// the type combination T1, T2, T3.
170 template<
typename T1,
typename T2,
typename T3 >
171 struct UseDefaultKernel {
172 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
173 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
174 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
175 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the vectorized default kernel.
// 'value' is non-zero only if all of the following hold:
//  - T1, T2 and T3 each report 'vectorizable',
//  - IsSame reports T2's and T3's element types to match T1's element type,
//  - IntrinsicTrait of that element type reports both 'addition' and 'multiplication'.
185 template<
typename T1,
typename T2,
typename T3 >
186 struct UseVectorizedDefaultKernel {
187 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
188 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
189 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
190 IntrinsicTrait<typename T1::ElementType>::addition &&
191 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flag: this expression type declares itself as not vectorizable (value 0).
222 enum { vectorizable = 0 };
// Element evaluation (excerpt): accumulates products lhs_(i,k) * rhs_(k,j) into tmp.
// The accumulation is only entered when the inner dimension (columns of lhs_) is non-zero.
255 if(
lhs_.columns() != 0UL ) {
// size_t(-2) has every bit set except the lowest one, so the mask rounds
// (columns-1) down to an even number; adding one makes 'end' the largest odd
// value that does not exceed the number of columns.
256 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// k starts at 1 and advances by two, i.e. the indices k and k+1 form one step.
258 for(
size_t k=1UL; k<end; k+=2UL ) {
// Second product of the pair (index k+1).
260 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// True when the number of columns is even, i.e. one index is left after the paired loop.
// NOTE(review): the body of this branch is not part of this excerpt.
262 if( end <
lhs_.columns() ) {
290 return rhs_.columns();
320 template<
typename T >
343 template<
typename MT
350 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
353 else if( rhs.
lhs_.columns() == 0UL ) {
369 TDMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
371 TDMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Default (non-vectorized, non-BLAS) assignment kernel: writes the product of A and B
// into C using plain element access.
//   C  target matrix, every element C(i,j) with i < A.rows(), j < B.columns() is overwritten
//   A  first operand
//   B  second operand
390 template<
typename MT3
394 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M and N are the result dimensions, K is the inner dimension.
396 const size_t M( A.rows() );
397 const size_t N( B.columns() );
398 const size_t K( A.columns() );
400 for(
size_t i=0UL; i<M; ++i ) {
// First pass initializes row i of C with the k == 0 products (plain assignment,
// so previous contents of C are not read).
// NOTE(review): A(i,0UL) and B(0UL,j) are accessed unconditionally, so this kernel
// relies on K > 0 — presumably guaranteed by the caller; confirm.
401 for(
size_t j=0UL; j<N; ++j ) {
402 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining inner indices k = 1 .. K-1 are accumulated onto row i of C.
404 for(
size_t k=1UL; k<K; ++k ) {
405 for(
size_t j=0UL; j<N; ++j ) {
406 C(i,j) += A(i,k) * B(k,j);
// Vectorized default assignment kernel for a row-major target (DenseMatrix<MT3,false>).
// Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// One of the two operands is copied into a temporary of its OppositeType; the branches
// below decide which one.
// NOTE(review): the statements that consume 'tmp' are not part of this excerpt.
428 template<
typename MT3
431 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
432 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Only A is resizable: B is the operand that gets converted.
437 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
438 const typename MT5::OppositeType tmp( B );
// Only B is resizable: A is the operand that gets converted.
441 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
442 const typename MT4::OppositeType tmp( A );
// Otherwise the operand with the smaller element count (rows * columns) is converted;
// on a tie B is chosen.
445 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
446 const typename MT5::OppositeType tmp( B );
450 const typename MT4::OppositeType tmp( A );
// Vectorized default assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The row index i is consumed in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements
// each), followed by single-vector steps. Every block accumulates products of vectors
// read via A.get(...) with values of B over k and stores the accumulators to C.
// NOTE(review): the declarations of i, j, a1..a4 and b1/b2 are not part of this excerpt.
471 template<
typename MT3
474 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
475 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
477 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is taken from A.spacing(), not A.rows() — presumably the padded
// column length so that only whole vectors are processed; confirm.
479 const size_t M( A.spacing() );
480 const size_t N( B.columns() );
481 const size_t K( A.columns() );
// Row blocks of 8 vectors, one column j of C per iteration.
485 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
486 for(
size_t j=0UL; j<N; ++j ) {
487 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
488 for(
size_t k=0UL; k<K; ++k ) {
490 xmm1 = xmm1 + A.get(i ,k) * b1;
491 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
492 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
493 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
494 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
495 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
496 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
497 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Write the eight accumulators to column j of C.
499 store( &(~C)(i ,j), xmm1 );
500 store( &(~C)(i+IT::size ,j), xmm2 );
501 store( &(~C)(i+IT::size*2UL,j), xmm3 );
502 store( &(~C)(i+IT::size*3UL,j), xmm4 );
503 store( &(~C)(i+IT::size*4UL,j), xmm5 );
504 store( &(~C)(i+IT::size*5UL,j), xmm6 );
505 store( &(~C)(i+IT::size*6UL,j), xmm7 );
506 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Row blocks of 4 vectors; columns are processed two at a time (j and j+1).
509 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
511 for( ; (j+2UL) <= N; j+=2UL ) {
512 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
513 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
520 xmm1 = xmm1 + a1 * b1;
521 xmm2 = xmm2 + a2 * b1;
522 xmm3 = xmm3 + a3 * b1;
523 xmm4 = xmm4 + a4 * b1;
524 xmm5 = xmm5 + a1 * b2;
525 xmm6 = xmm6 + a2 * b2;
526 xmm7 = xmm7 + a3 * b2;
527 xmm8 = xmm8 + a4 * b2;
529 store( &(~C)(i ,j ), xmm1 );
530 store( &(~C)(i+IT::size ,j ), xmm2 );
531 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
532 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
533 store( &(~C)(i ,j+1UL), xmm5 );
534 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
535 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
536 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single column j with four row vectors.
540 for(
size_t k=0UL; k<K; ++k ) {
542 xmm1 = xmm1 + A.get(i ,k) * b1;
543 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
544 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
545 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
547 store( &(~C)(i ,j), xmm1 );
548 store( &(~C)(i+IT::size ,j), xmm2 );
549 store( &(~C)(i+IT::size*2UL,j), xmm3 );
550 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Row blocks of 2 vectors; columns are processed two at a time.
553 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
555 for( ; (j+2UL) <= N; j+=2UL ) {
557 for(
size_t k=0UL; k<K; ++k ) {
562 xmm1 = xmm1 + a1 * b1;
563 xmm2 = xmm2 + a2 * b1;
564 xmm3 = xmm3 + a1 * b2;
565 xmm4 = xmm4 + a2 * b2;
567 store( &(~C)(i ,j ), xmm1 );
568 store( &(~C)(i+IT::size,j ), xmm2 );
569 store( &(~C)(i ,j+1UL), xmm3 );
570 store( &(~C)(i+IT::size,j+1UL), xmm4 );
// Single column j with two row vectors.
574 for(
size_t k=0UL; k<K; ++k ) {
576 xmm1 = xmm1 + A.get(i ,k) * b1;
577 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
579 store( &(~C)(i ,j), xmm1 );
580 store( &(~C)(i+IT::size,j), xmm2 );
// One row vector, two columns at a time (the enclosing row loop is not part of this excerpt).
585 for( ; (j+2UL) <= N; j+=2UL ) {
587 for(
size_t k=0UL; k<K; ++k ) {
// set() turns the scalar B(k,j) into an intrinsic vector operand.
589 xmm1 = xmm1 + a1 *
set( B(k,j ) );
590 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
592 store( &(~C)(i,j ), xmm1 );
593 store( &(~C)(i,j+1UL), xmm2 );
// One row vector, single column j.
597 for(
size_t k=0UL; k<K; ++k ) {
598 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
600 store( &(~C)(i,j), xmm1 );
// Fallback of the BLAS assignment dispatch: selected when UseDefaultKernel<MT3,MT4,MT5>
// holds and simply forwards C, A and B to selectDefaultAssignKernel.
621 template<
typename MT3
624 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
625 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
627 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel for single precision operands: computes the product of A and B
// into C via cblas_sgemm. Selected when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
647 template<
typename MT3
650 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
651 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
653 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
659 const int M ( numeric_cast<int>( A.rows() ) );
660 const int N ( numeric_cast<int>( B.columns() ) );
661 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
662 const int lda( numeric_cast<int>( A.spacing() ) );
663 const int ldb( numeric_cast<int>( B.spacing() ) );
664 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0F and beta = 0.0F: the previous contents of C do not contribute.
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
666 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
667 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
668 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
669 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel for double precision operands: computes the product of A and B
// into C via cblas_dgemm. Selected when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
690 template<
typename MT3
693 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
694 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
696 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
702 const int M ( numeric_cast<int>( A.rows() ) );
703 const int N ( numeric_cast<int>( B.columns() ) );
704 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
705 const int lda( numeric_cast<int>( A.spacing() ) );
706 const int ldb( numeric_cast<int>( B.spacing() ) );
707 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0 and beta = 0.0: the previous contents of C do not contribute.
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
709 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
710 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
711 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
712 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel for complex<float> operands: computes the product of A and B
// into C via cblas_cgemm. Selected when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
733 template<
typename MT3
736 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
737 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
739 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
748 const int M ( numeric_cast<int>( A.rows() ) );
749 const int N ( numeric_cast<int>( B.columns() ) );
750 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
751 const int lda( numeric_cast<int>( A.spacing() ) );
752 const int ldb( numeric_cast<int>( B.spacing() ) );
753 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are handed to CBLAS by address: alpha = (1,0), beta = (0,0),
// so the previous contents of C do not contribute.
// NOTE(review): alpha/beta are non-const here, while the add-assign and sub-assign
// complex kernels of this class declare them const.
754 complex<float> alpha( 1.0F, 0.0F );
755 complex<float> beta ( 0.0F, 0.0F );
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
757 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
758 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
759 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
760 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel for complex<double> operands: computes the product of A and B
// into C via cblas_zgemm. Selected when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
781 template<
typename MT3
784 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
785 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
787 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
796 const int M ( numeric_cast<int>( A.rows() ) );
797 const int N ( numeric_cast<int>( B.columns() ) );
798 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
799 const int lda( numeric_cast<int>( A.spacing() ) );
800 const int ldb( numeric_cast<int>( B.spacing() ) );
801 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are handed to CBLAS by address: alpha = (1,0), beta = (0,0),
// so the previous contents of C do not contribute.
// NOTE(review): alpha/beta are non-const here, while the add-assign and sub-assign
// complex kernels of this class declare them const.
802 complex<double> alpha( 1.0, 0.0 );
803 complex<double> beta ( 0.0, 0.0 );
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
805 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
806 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
807 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
808 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
827 template<
typename MT
831 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
843 const TmpType tmp( rhs );
862 template<
typename MT
869 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
884 TDMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
886 TDMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Default (non-vectorized, non-BLAS) addition assignment kernel: adds the product of
// A and B onto C using plain element access.
// Disabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
905 template<
typename MT3
908 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
909 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M and N are the result dimensions, K is the inner dimension.
911 const size_t M( A.rows() );
912 const size_t N( B.columns() );
913 const size_t K( A.columns() );
// size_t(-2) clears the lowest bit: 'end' is N rounded down to an even number.
916 const size_t end( N &
size_t(-2) );
918 for(
size_t i=0UL; i<M; ++i ) {
919 for(
size_t k=0UL; k<K; ++k ) {
// Column loop unrolled by two: columns j and j+1 are updated per iteration.
920 for(
size_t j=0UL; j<end; j+=2UL ) {
921 C(i,j ) += A(i,k) * B(k,j );
922 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of the column at index 'end', i.e. the one left over for odd N.
// NOTE(review): the guard around this statement is not part of this excerpt —
// presumably it is only executed when end < N; confirm.
925 C(i,end) += A(i,k) * B(k,end);
// Vectorized default addition assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// One of the two operands is copied into a temporary of its OppositeType; the branches
// below decide which one.
// NOTE(review): the statements that consume 'tmp' are not part of this excerpt.
947 template<
typename MT3
950 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
951 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Only A is resizable: B is the operand that gets converted.
956 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
957 const typename MT5::OppositeType tmp( B );
// Only B is resizable: A is the operand that gets converted.
960 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
961 const typename MT4::OppositeType tmp( A );
// Otherwise the operand with the smaller element count (rows * columns) is converted;
// on a tie B is chosen.
964 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
965 const typename MT5::OppositeType tmp( B );
969 const typename MT4::OppositeType tmp( A );
// Vectorized default addition assignment kernel for a column-major target
// (DenseMatrix<MT3,true>). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The row index i is consumed in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements
// each), followed by single-vector steps. Every block adds products of vectors read via
// A.get(...) with values of B onto the accumulators and stores them to C.
// NOTE(review): the lines that declare and initialize the xmm accumulators (as well as
// i, j, a1..a4, b1/b2) are not part of this excerpt — presumably the accumulators start
// from the current contents of C; confirm.
990 template<
typename MT3
993 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
994 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
996 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is taken from A.spacing(), not A.rows() — presumably the padded
// column length so that only whole vectors are processed; confirm.
998 const size_t M( A.spacing() );
999 const size_t N( B.columns() );
1000 const size_t K( A.columns() );
// Row blocks of 8 vectors, one column j of C per iteration.
1004 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1005 for(
size_t j=0UL; j<N; ++j ) {
1014 for(
size_t k=0UL; k<K; ++k ) {
1016 xmm1 = xmm1 + A.get(i ,k) * b1;
1017 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1018 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1019 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1020 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1021 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1022 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1023 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Write the eight accumulators to column j of C.
1025 store( &(~C)(i ,j), xmm1 );
1026 store( &(~C)(i+IT::size ,j), xmm2 );
1027 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1028 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1029 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1030 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1031 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1032 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Row blocks of 4 vectors; columns are processed two at a time (j and j+1).
1035 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1037 for( ; (j+2UL) <= N; j+=2UL ) {
1046 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
1053 xmm1 = xmm1 + a1 * b1;
1054 xmm2 = xmm2 + a2 * b1;
1055 xmm3 = xmm3 + a3 * b1;
1056 xmm4 = xmm4 + a4 * b1;
1057 xmm5 = xmm5 + a1 * b2;
1058 xmm6 = xmm6 + a2 * b2;
1059 xmm7 = xmm7 + a3 * b2;
1060 xmm8 = xmm8 + a4 * b2;
1062 store( &(~C)(i ,j ), xmm1 );
1063 store( &(~C)(i+IT::size ,j ), xmm2 );
1064 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1065 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1066 store( &(~C)(i ,j+1UL), xmm5 );
1067 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1068 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1069 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single column j with four row vectors.
1076 for(
size_t k=0UL; k<K; ++k ) {
1078 xmm1 = xmm1 + A.get(i ,k) * b1;
1079 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1080 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1081 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1083 store( &(~C)(i ,j), xmm1 );
1084 store( &(~C)(i+IT::size ,j), xmm2 );
1085 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1086 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Row blocks of 2 vectors; columns are processed two at a time.
1089 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1091 for( ; (j+2UL) <= N; j+=2UL ) {
1096 for(
size_t k=0UL; k<K; ++k ) {
1101 xmm1 = xmm1 + a1 * b1;
1102 xmm2 = xmm2 + a2 * b1;
1103 xmm3 = xmm3 + a1 * b2;
1104 xmm4 = xmm4 + a2 * b2;
1106 store( &(~C)(i ,j ), xmm1 );
1107 store( &(~C)(i+IT::size,j ), xmm2 );
1108 store( &(~C)(i ,j+1UL), xmm3 );
1109 store( &(~C)(i+IT::size,j+1UL), xmm4 );
// Single column j with two row vectors.
1114 for(
size_t k=0UL; k<K; ++k ) {
1116 xmm1 = xmm1 + A.get(i ,k) * b1;
1117 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1119 store( &(~C)(i ,j), xmm1 );
1120 store( &(~C)(i+IT::size,j), xmm2 );
// One row vector, two columns at a time (the enclosing row loop is not part of this excerpt).
1125 for( ; (j+2UL) <= N; j+=2UL ) {
1128 for(
size_t k=0UL; k<K; ++k ) {
// set() turns the scalar B(k,j) into an intrinsic vector operand.
1130 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1131 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1133 store( &(~C)(i,j ), xmm1 );
1134 store( &(~C)(i,j+1UL), xmm2 );
// One row vector, single column j.
1138 for(
size_t k=0UL; k<K; ++k ) {
1139 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
1141 store( &(~C)(i,j), xmm1 );
// Fallback of the BLAS addition assignment dispatch: selected when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards C, A and B to
// selectDefaultAddAssignKernel.
1162 template<
typename MT3
1165 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1166 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1168 selectDefaultAddAssignKernel( C, A, B );
// BLAS addition assignment kernel for single precision operands: adds the product of
// A and B onto C via cblas_sgemm. Selected when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
1188 template<
typename MT3
1191 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1192 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1194 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1200 const int M ( numeric_cast<int>( A.rows() ) );
1201 const int N ( numeric_cast<int>( B.columns() ) );
1202 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1203 const int lda( numeric_cast<int>( A.spacing() ) );
1204 const int ldb( numeric_cast<int>( B.spacing() ) );
1205 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0F and beta = 1.0F: the product is added to the existing contents of C.
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1207 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1208 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1209 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1210 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition assignment kernel for double precision operands: adds the product of
// A and B onto C via cblas_dgemm. Selected when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
1231 template<
typename MT3
1234 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1235 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1237 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1243 const int M ( numeric_cast<int>( A.rows() ) );
1244 const int N ( numeric_cast<int>( B.columns() ) );
1245 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1246 const int lda( numeric_cast<int>( A.spacing() ) );
1247 const int ldb( numeric_cast<int>( B.spacing() ) );
1248 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0 and beta = 1.0: the product is added to the existing contents of C.
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1250 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1251 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1252 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1253 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition assignment kernel for complex<float> operands: adds the product of
// A and B onto C via cblas_cgemm.
// Selected when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
1274 template<
typename MT3
1277 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1278 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1280 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1289 const int M ( numeric_cast<int>( A.rows() ) );
1290 const int N ( numeric_cast<int>( B.columns() ) );
1291 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1292 const int lda( numeric_cast<int>( A.spacing() ) );
1293 const int ldb( numeric_cast<int>( B.spacing() ) );
1294 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are handed to CBLAS by address: alpha = (1,0), beta = (1,0),
// so the product is added to the existing contents of C.
1295 const complex<float> alpha( 1.0F, 0.0F );
1296 const complex<float> beta ( 1.0F, 0.0F );
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1298 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1299 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1300 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1301 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition assignment kernel for complex<double> operands: adds the product of
// A and B onto C via cblas_zgemm.
// Selected when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
1322 template<
typename MT3
1325 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1326 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1328 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1337 const int M ( numeric_cast<int>( A.rows() ) );
1338 const int N ( numeric_cast<int>( B.columns() ) );
1339 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1340 const int lda( numeric_cast<int>( A.spacing() ) );
1341 const int ldb( numeric_cast<int>( B.spacing() ) );
1342 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are handed to CBLAS by address: alpha = (1,0), beta = (1,0),
// so the product is added to the existing contents of C.
1343 const complex<double> alpha( 1.0, 0.0 );
1344 const complex<double> beta ( 1.0, 0.0 );
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1346 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1347 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1348 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1349 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1372 template<
typename MT
1379 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1394 TDMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1396 TDMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Default (non-vectorized, non-BLAS) subtraction assignment kernel: subtracts the product
// of A and B from C using plain element access.
// Disabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
1415 template<
typename MT3
1418 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1419 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M and N are the result dimensions, K is the inner dimension.
1421 const size_t M( A.rows() );
1422 const size_t N( B.columns() );
1423 const size_t K( A.columns() );
// size_t(-2) clears the lowest bit: 'end' is N rounded down to an even number.
1426 const size_t end( N &
size_t(-2) );
1428 for(
size_t i=0UL; i<M; ++i ) {
1429 for(
size_t k=0UL; k<K; ++k ) {
// Column loop unrolled by two: columns j and j+1 are updated per iteration.
1430 for(
size_t j=0UL; j<end; j+=2UL ) {
1431 C(i,j ) -= A(i,k) * B(k,j );
1432 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of the column at index 'end', i.e. the one left over for odd N.
// NOTE(review): the guard around this statement is not part of this excerpt —
// presumably it is only executed when end < N; confirm.
1435 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default subtraction assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// One of the two operands is copied into a temporary of its OppositeType; the branches
// below decide which one.
// NOTE(review): the statements that consume 'tmp' are not part of this excerpt.
1457 template<
typename MT3
1460 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1461 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Only A is resizable: B is the operand that gets converted.
1466 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1467 const typename MT5::OppositeType tmp( B );
// Only B is resizable: A is the operand that gets converted.
1470 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1471 const typename MT4::OppositeType tmp( A );
// Otherwise the operand with the smaller element count (rows * columns) is converted;
// on a tie B is chosen.
1474 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1475 const typename MT5::OppositeType tmp( B );
1479 const typename MT4::OppositeType tmp( A );
// Vectorized default subtraction assignment kernel for a column-major target
// (DenseMatrix<MT3,true>). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The row index i is consumed in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements
// each), followed by single-vector steps. Every block subtracts products of vectors read
// via A.get(...) with values of B from the accumulators and stores them to C.
// NOTE(review): the lines that declare and initialize the xmm accumulators (as well as
// i, j, a1..a4, b1/b2) are not part of this excerpt — presumably the accumulators start
// from the current contents of C; confirm.
1500 template<
typename MT3
1503 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1504 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1506 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is taken from A.spacing(), not A.rows() — presumably the padded
// column length so that only whole vectors are processed; confirm.
1508 const size_t M( A.spacing() );
1509 const size_t N( B.columns() );
1510 const size_t K( A.columns() );
// Row blocks of 8 vectors, one column j of C per iteration.
1514 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1515 for(
size_t j=0UL; j<N; ++j ) {
1524 for(
size_t k=0UL; k<K; ++k ) {
1526 xmm1 = xmm1 - A.get(i ,k) * b1;
1527 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1528 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1529 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1530 xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1531 xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1532 xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1533 xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
// Write the eight accumulators to column j of C.
1535 store( &(~C)(i ,j), xmm1 );
1536 store( &(~C)(i+IT::size ,j), xmm2 );
1537 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1538 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1539 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1540 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1541 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1542 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Row blocks of 4 vectors; columns are processed two at a time (j and j+1).
1545 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1547 for( ; (j+2UL) <= N; j+=2UL ) {
1556 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
1563 xmm1 = xmm1 - a1 * b1;
1564 xmm2 = xmm2 - a2 * b1;
1565 xmm3 = xmm3 - a3 * b1;
1566 xmm4 = xmm4 - a4 * b1;
1567 xmm5 = xmm5 - a1 * b2;
1568 xmm6 = xmm6 - a2 * b2;
1569 xmm7 = xmm7 - a3 * b2;
1570 xmm8 = xmm8 - a4 * b2;
1572 store( &(~C)(i ,j ), xmm1 );
1573 store( &(~C)(i+IT::size ,j ), xmm2 );
1574 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1575 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1576 store( &(~C)(i ,j+1UL), xmm5 );
1577 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1578 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1579 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single column j with four row vectors.
1586 for(
size_t k=0UL; k<K; ++k ) {
1588 xmm1 = xmm1 - A.get(i ,k) * b1;
1589 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1590 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1591 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1593 store( &(~C)(i ,j), xmm1 );
1594 store( &(~C)(i+IT::size ,j), xmm2 );
1595 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1596 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Row blocks of 2 vectors; columns are processed two at a time.
1599 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1601 for( ; (j+2UL) <= N; j+=2UL ) {
1606 for(
size_t k=0UL; k<K; ++k ) {
1611 xmm1 = xmm1 - a1 * b1;
1612 xmm2 = xmm2 - a2 * b1;
1613 xmm3 = xmm3 - a1 * b2;
1614 xmm4 = xmm4 - a2 * b2;
1616 store( &(~C)(i ,j ), xmm1 );
1617 store( &(~C)(i+IT::size,j ), xmm2 );
1618 store( &(~C)(i ,j+1UL), xmm3 );
1619 store( &(~C)(i+IT::size,j+1UL), xmm4 );
// Single column j with two row vectors.
1624 for(
size_t k=0UL; k<K; ++k ) {
1626 xmm1 = xmm1 - A.get(i ,k) * b1;
1627 xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
1629 store( &(~C)(i ,j), xmm1 );
1630 store( &(~C)(i+IT::size,j), xmm2 );
// One row vector, two columns at a time (the enclosing row loop is not part of this excerpt).
1635 for( ; (j+2UL) <= N; j+=2UL ) {
1638 for(
size_t k=0UL; k<K; ++k ) {
// set() turns the scalar B(k,j) into an intrinsic vector operand.
1640 xmm1 = xmm1 - a1 *
set( B(k,j ) );
1641 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
1643 store( &(~C)(i,j ), xmm1 );
1644 store( &(~C)(i,j+1UL), xmm2 );
// One row vector, single column j.
1648 for(
size_t k=0UL; k<K; ++k ) {
1649 xmm1 = xmm1 - A.get(i,k) *
set( B(k,j) );
1651 store( &(~C)(i,j), xmm1 );
// Fallback of the BLAS subtraction assignment dispatch: selected when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards C, A and B to
// selectDefaultSubAssignKernel.
1672 template<
typename MT3
1675 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1676 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1678 selectDefaultSubAssignKernel( C, A, B );
// BLAS subtraction assignment kernel for single precision operands: subtracts the product
// of A and B from C via cblas_sgemm.
// Selected when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
1698 template<
typename MT3
1701 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1702 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1704 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1710 const int M ( numeric_cast<int>( A.rows() ) );
1711 const int N ( numeric_cast<int>( B.columns() ) );
1712 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1713 const int lda( numeric_cast<int>( A.spacing() ) );
1714 const int ldb( numeric_cast<int>( B.spacing() ) );
1715 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1.0F and beta = 1.0F: the negated product is added to the existing contents of C.
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1717 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1718 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1719 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1720 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction assignment kernel for double precision operands: subtracts the product
// of A and B from C via cblas_dgemm.
// Selected when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
1741 template<
typename MT3
1744 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1745 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1747 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1753 const int M ( numeric_cast<int>( A.rows() ) );
1754 const int N ( numeric_cast<int>( B.columns() ) );
1755 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1756 const int lda( numeric_cast<int>( A.spacing() ) );
1757 const int ldb( numeric_cast<int>( B.spacing() ) );
1758 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1.0 and beta = 1.0: the negated product is added to the existing contents of C.
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1760 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1761 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1762 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1763 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction assignment kernel for complex<float> operands: subtracts the product
// of A and B from C via cblas_cgemm.
// Selected when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
1784 template<
typename MT3
1787 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1788 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1790 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1799 const int M ( numeric_cast<int>( A.rows() ) );
1800 const int N ( numeric_cast<int>( B.columns() ) );
1801 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1802 const int lda( numeric_cast<int>( A.spacing() ) );
1803 const int ldb( numeric_cast<int>( B.spacing() ) );
1804 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are handed to CBLAS by address: alpha = (-1,0), beta = (1,0),
// so the negated product is added to the existing contents of C.
1805 const complex<float> alpha( -1.0F, 0.0F );
1806 const complex<float> beta ( 1.0F, 0.0F );
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1808 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1809 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1810 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1811 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction assignment kernel for complex<double> operands: subtracts the product
// of A and B from C via cblas_zgemm.
// Selected when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
1832 template<
typename MT3
1835 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1836 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1838 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t dimensions are converted
// with boost::numeric_cast.
1847 const int M ( numeric_cast<int>( A.rows() ) );
1848 const int N ( numeric_cast<int>( B.columns() ) );
1849 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from the matrices' spacing().
1850 const int lda( numeric_cast<int>( A.spacing() ) );
1851 const int ldb( numeric_cast<int>( B.spacing() ) );
1852 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are handed to CBLAS by address: alpha = (-1,0), beta = (1,0),
// so the negated product is added to the existing contents of C.
1853 const complex<double> alpha( -1.0, 0.0 );
1854 const complex<double> beta ( 1.0, 0.0 );
// For a row-major C the call uses CblasRowMajor and passes both operands with
// CblasTrans; otherwise CblasColMajor with CblasNoTrans.
1856 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1857 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1858 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1859 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1905 template<
typename MT1
1909 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
1910 ,
private Expression
1911 ,
private Computation
// Shorthand aliases used inside the scaled-product specialization.
// RES: ResultType of MMM (MMM itself is declared outside this excerpt).
1916 typedef typename MMM::ResultType RES;
// RT1 / RT2: ResultType of the operand types MT1 and MT2.
1917 typedef typename MT1::ResultType
RT1;
1918 typedef typename MT2::ResultType
RT2;
// CT1 / CT2: CompositeType of the operand types MT1 and MT2.
1919 typedef typename MT1::CompositeType
CT1;
1920 typedef typename MT2::CompositeType
CT2;
// Compile-time selector for the single precision BLAS kernel of the scaled product.
// 'value' is non-zero only if IsFloat holds for the element types of T1, T2 and T3
// and IsComplex does not hold for the scalar type T4.
1928 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1929 struct UseSinglePrecisionKernel {
1930 enum { value = IsFloat<typename T1::ElementType>::value &&
1931 IsFloat<typename T2::ElementType>::value &&
1932 IsFloat<typename T3::ElementType>::value &&
1933 !IsComplex<T4>::value };
// Compile-time selector for the double precision BLAS kernel of the scaled product.
// 'value' is non-zero only if IsDouble holds for the element types of T1, T2 and T3
// and IsComplex does not hold for the scalar type T4.
1942 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1943 struct UseDoublePrecisionKernel {
1944 enum { value = IsDouble<typename T1::ElementType>::value &&
1945 IsDouble<typename T2::ElementType>::value &&
1946 IsDouble<typename T3::ElementType>::value &&
1947 !IsComplex<T4>::value };
// Compile-time selector for the single precision complex BLAS kernel of the scaled product.
// 'value' is non-zero only if IsSame reports the element types of T1, T2 and T3 to be
// complex<float>. Unlike the real-valued selectors, the scalar type is not a parameter here.
1956 template<
typename T1,
typename T2,
typename T3 >
1957 struct UseSinglePrecisionComplexKernel {
1958 typedef complex<float> Type;
1959 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1960 IsSame<typename T2::ElementType,Type>::value &&
1961 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the double precision complex BLAS kernel of the scaled product.
// 'value' is non-zero only if IsSame reports the element types of T1, T2 and T3 to be
// complex<double>. Unlike the real-valued selectors, the scalar type is not a parameter here.
1970 template<
typename T1,
typename T2,
typename T3 >
1971 struct UseDoublePrecisionComplexKernel {
1972 typedef complex<double> Type;
1973 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1974 IsSame<typename T2::ElementType,Type>::value &&
1975 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the default (non-BLAS) kernel of the scaled product.
// 'value' is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none of the four
// BLAS kernel selectors applies. The scalar type T4 is only forwarded to the two
// real-valued selectors.
1983 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1984 struct UseDefaultKernel {
1985 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1986 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1987 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1988 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the vectorized default kernel of the scaled product.
// 'value' is non-zero only if all of the following hold:
//  - T1, T2 and T3 each report 'vectorizable',
//  - IsSame reports T2's and T3's element types and the scalar type T4 to match
//    T1's element type,
//  - IntrinsicTrait of that element type reports both 'addition' and 'multiplication'.
1996 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1997 struct UseVectorizedDefaultKernel {
1998 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1999 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2000 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2001 IsSame<typename T1::ElementType,T4>::value &&
2002 IntrinsicTrait<typename T1::ElementType>::addition &&
2003 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression specialization.
2009 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type: what MultTrait yields for the product result type RES and the scalar type ST.
2010 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Nested types forwarded from ResultType.
2011 typedef typename ResultType::OppositeType
OppositeType;
2013 typedef typename ResultType::ElementType
ElementType;
// Intrinsic (SIMD) type associated with ElementType via IntrinsicTrait.
2014 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// LT / RT: types chosen by SelectType from (const RT1, CT1) and (const RT2, CT2),
// keyed on whether MT1 / MT2 is a computation.
// NOTE(review): presumably the first alternative (const result type) is picked when the
// condition is true — confirm against SelectType.
2025 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2028 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// Compilation flag: this expression type declares itself as not vectorizable (value 0).
2033 enum { vectorizable = 0 };
// Compilation flag: aliasing capability is taken over from CanAlias of the wrapped product type MMM.
2036 enum { canAlias = CanAlias<MMM>::value };
2045 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
2061 return matrix_(i,j) * scalar_;
// Returns the number of rows, as reported by the wrapped matrix expression matrix_.
2070 inline size_t rows()
const {
2071 return matrix_.rows();
// Returns the number of columns, as reported by the wrapped matrix expression matrix_.
2080 inline size_t columns()
const {
2081 return matrix_.columns();
// Returns whether the expression is aliased with the given address.
//   alias  address to check
// The query is forwarded to matrix_.isAliased(); the result is always false when
// CanAlias<MMM>::value is false (the forwarding call is then short-circuited).
2111 template<
typename T >
2112 inline bool isAliased(
const T* alias )
const {
2113 return CanAlias<MMM>::value && matrix_.isAliased( alias );
// Assignment of the scaled matrix product to a dense matrix (excerpt).
//   lhs  target dense matrix
//   rhs  scaled product expression to be assigned
// NOTE(review): the bodies of the early-exit branches, the definitions of A and B and the
// condition that chooses between the default and the BLAS kernel are not part of this excerpt.
2132 template<
typename MT3
2134 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
2139 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2140 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns.
2142 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero.
2145 else if( left.columns() == 0UL ) {
// Both kernels receive the target, the two operands and the scalar factor.
2161 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2163 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized, non-BLAS) assignment kernel of the scaled product.
// Disabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//   C       target matrix
//   A, B    operands of the matrix product
//   scalar  scaling factor
// Here i indexes rows of A, j the inner dimension and k columns of B.
2181 template<
typename MT3
2185 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2186 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2188 for(
size_t i=0UL; i<A.rows(); ++i ) {
// First pass initializes row i of C with the products for inner index 0.
// NOTE(review): A(i,0UL) and B(0UL,k) are accessed unconditionally, so this kernel
// relies on A.columns() > 0 — presumably guaranteed by the caller; confirm.
2189 for(
size_t k=0UL; k<B.columns(); ++k ) {
2190 C(i,k) = A(i,0UL) * B(0UL,k);
// Remaining inner indices j = 1 .. A.columns()-1 are accumulated onto row i of C.
2192 for(
size_t j=1UL; j<A.columns(); ++j ) {
2193 for(
size_t k=0UL; k<B.columns(); ++k ) {
2194 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i.
// NOTE(review): the loop body is not part of this excerpt — presumably it applies
// 'scalar' to C(i,k); confirm.
2197 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default assignment kernel of the scaled product for a row-major target
// (DenseMatrix<MT3,false>). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// One operand is copied into a temporary of its OppositeType and the scaled product is then
// re-dispatched through assign() with that temporary in place of the original operand.
2218 template<
typename MT3
2222 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2223 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: B is the operand that gets converted.
2228 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2229 const typename MT5::OppositeType tmp( B );
2230 assign( ~C, A * tmp * scalar );
// Only B is resizable: A is the operand that gets converted.
2232 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2233 const typename MT4::OppositeType tmp( A );
2234 assign( ~C, tmp * B * scalar );
// Otherwise the operand with the smaller element count (rows * columns) is converted;
// on a tie B is chosen.
2236 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2237 const typename MT5::OppositeType tmp( B );
2238 assign( ~C, A * tmp * scalar );
2241 const typename MT4::OppositeType tmp( A );
2242 assign( ~C, tmp * B * scalar );
// Vectorized default assignment kernel for targets of type DenseMatrix<MT3,true>.
// Each group below accumulates products over k in SIMD accumulators (xmm*),
// multiplies the accumulators by `factor` and stores them into C.
// NOTE(review): the definitions of `factor`, `i`, `j`, `a1..a4`, `b1`, `b2`, some
// accumulator declarations and the guarding `if` statements are on lines that are
// not visible in this excerpt; `factor` presumably holds the broadcast scalar.
2261 template<
typename MT3
2265 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2266 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2268 typedef IntrinsicTrait<ElementType> IT;
// M is taken from A.spacing() rather than A.rows().
// NOTE(review): presumably so that whole SIMD vectors including padding are
// processed - confirm that C provides at least the same spacing.
2270 const size_t M( A.spacing() );
2271 const size_t N( B.columns() );
2272 const size_t K( A.columns() );
// Row blocks of 8 SIMD vectors (i advances by IT::size*8), one column j at a time.
2278 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2279 for(
size_t j=0UL; j<N; ++j ) {
2280 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2281 for(
size_t k=0UL; k<K; ++k ) {
2283 xmm1 = xmm1 + A.get(i ,k) * b1;
2284 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2285 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2286 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2287 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2288 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2289 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2290 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Scale the accumulated block and write it to column j of C.
2292 store( &(~C)(i ,j), xmm1 * factor );
2293 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2294 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2295 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2296 store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2297 store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2298 store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2299 store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
// Row blocks of 4 SIMD vectors, two columns (j, j+1) at a time.
2302 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2304 for( ; (j+2UL) <= N; j+=2UL ) {
2305 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2306 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
2313 xmm1 = xmm1 + a1 * b1;
2314 xmm2 = xmm2 + a2 * b1;
2315 xmm3 = xmm3 + a3 * b1;
2316 xmm4 = xmm4 + a4 * b1;
2317 xmm5 = xmm5 + a1 * b2;
2318 xmm6 = xmm6 + a2 * b2;
2319 xmm7 = xmm7 + a3 * b2;
2320 xmm8 = xmm8 + a4 * b2;
2322 store( &(~C)(i ,j ), xmm1 * factor );
2323 store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2324 store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2325 store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2326 store( &(~C)(i ,j+1UL), xmm5 * factor );
2327 store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2328 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2329 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
// Single remaining column for the 4-vector row block (guard not visible here).
2333 for(
size_t k=0UL; k<K; ++k ) {
2335 xmm1 = xmm1 + A.get(i ,k) * b1;
2336 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2337 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2338 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2340 store( &(~C)(i ,j), xmm1 * factor );
2341 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2342 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2343 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
// Row blocks of 2 SIMD vectors, two columns at a time.
2346 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2348 for( ; (j+2UL) <= N; j+=2UL ) {
2350 for(
size_t k=0UL; k<K; ++k ) {
2355 xmm1 = xmm1 + a1 * b1;
2356 xmm2 = xmm2 + a2 * b1;
2357 xmm3 = xmm3 + a1 * b2;
2358 xmm4 = xmm4 + a2 * b2;
2360 store( &(~C)(i ,j ), xmm1 * factor );
2361 store( &(~C)(i+IT::size,j ), xmm2 * factor );
2362 store( &(~C)(i ,j+1UL), xmm3 * factor );
2363 store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
// Single remaining column for the 2-vector row block (guard not visible here).
2367 for(
size_t k=0UL; k<K; ++k ) {
2369 xmm1 = xmm1 + A.get(i ,k) * b1;
2370 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2372 store( &(~C)(i ,j), xmm1 * factor );
2373 store( &(~C)(i+IT::size,j), xmm2 * factor );
// Trailing part with one SIMD vector of rows: two columns at a time; the elements
// of B are broadcast with set().
2378 for( ; (j+2UL) <= N; j+=2UL ) {
2380 for(
size_t k=0UL; k<K; ++k ) {
2382 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2383 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2385 store( &(~C)(i,j ), xmm1 * factor );
2386 store( &(~C)(i,j+1UL), xmm2 * factor );
// Trailing part: single remaining column (guard not visible here).
2390 for(
size_t k=0UL; k<K; ++k ) {
2391 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2393 store( &(~C)(i,j), xmm1 * factor );
// BLAS-dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> holds:
// no BLAS routine is used and the call is forwarded to selectDefaultAssignKernel().
2413 template<
typename MT3
2417 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2418 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2420 selectDefaultAssignKernel( C, A, B, scalar );
// Single precision BLAS assignment kernel: C = scalar * A * B via cblas_sgemm.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
2439 template<
typename MT3
2443 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2444 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2446 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
2452 const int M ( numeric_cast<int>( A.rows() ) );
2453 const int N ( numeric_cast<int>( B.columns() ) );
2454 const int K ( numeric_cast<int>( A.columns() ) );
2455 const int lda( numeric_cast<int>( A.spacing() ) );
2456 const int ldb( numeric_cast<int>( B.spacing() ) );
2457 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the storage order of C; if C is row-major, both operands are
// passed as transposed. alpha = scalar, beta = 0.0F (C is overwritten).
2459 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2460 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2461 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2462 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// Double precision BLAS assignment kernel: C = scalar * A * B via cblas_dgemm.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
2482 template<
typename MT3
2486 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2487 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2489 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
2495 const int M ( numeric_cast<int>( A.rows() ) );
2496 const int N ( numeric_cast<int>( B.columns() ) );
2497 const int K ( numeric_cast<int>( A.columns() ) );
2498 const int lda( numeric_cast<int>( A.spacing() ) );
2499 const int ldb( numeric_cast<int>( B.spacing() ) );
2500 const int ldc( numeric_cast<int>( C.spacing() ) );
// Layout follows the storage order of C; if C is row-major, both operands are
// passed as transposed. alpha = scalar, beta = 0.0 (C is overwritten).
2502 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2503 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2504 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2505 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// Single precision complex BLAS assignment kernel: C = scalar * A * B via cblas_cgemm.
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
2525 template<
typename MT3
2529 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2530 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2532 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
2542 const int M ( numeric_cast<int>( A.rows() ) );
2543 const int N ( numeric_cast<int>( B.columns() ) );
2544 const int K ( numeric_cast<int>( A.columns() ) );
2545 const int lda( numeric_cast<int>( A.spacing() ) );
2546 const int ldb( numeric_cast<int>( B.spacing() ) );
2547 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex routine receives alpha and beta by address: alpha = scalar,
// beta = 0 (C is overwritten).
2548 const complex<float> alpha( scalar );
2549 const complex<float> beta ( 0.0F, 0.0F );
// Layout follows the storage order of C; if C is row-major, both operands are
// passed as transposed.
2551 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2552 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2553 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2554 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS assignment kernel: C = scalar * A * B via cblas_zgemm.
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
2574 template<
typename MT3
2578 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2579 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2581 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
2591 const int M ( numeric_cast<int>( A.rows() ) );
2592 const int N ( numeric_cast<int>( B.columns() ) );
2593 const int K ( numeric_cast<int>( A.columns() ) );
2594 const int lda( numeric_cast<int>( A.spacing() ) );
2595 const int ldb( numeric_cast<int>( B.spacing() ) );
2596 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex routine receives alpha and beta by address: alpha = scalar,
// beta = 0 (C is overwritten).
2597 const complex<double> alpha( scalar );
2598 const complex<double> beta ( 0.0, 0.0 );
// Layout follows the storage order of C; if C is row-major, both operands are
// passed as transposed.
2600 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2601 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2602 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2603 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment path that first evaluates the whole expression into a temporary.
// NOTE(review): the function signature and the statement consuming `tmp` are not
// visible in this excerpt - presumably an assign() overload for non-dense targets.
2620 template<
typename MT
// Temporary type: ResultType if SO is set, OppositeType otherwise.
2624 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
2636 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix with storage
// order SO.
// NOTE(review): the early-return body, the evaluation of the operands into A and B
// and the kernel selection condition are not visible in this excerpt.
2653 template<
typename MT3
2655 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
2660 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2661 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension.
2663 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel.
2678 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2680 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) addition assignment kernel, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not satisfied (DisableIf).
// NOTE(review): only the signature is visible in this excerpt; the body is missing.
2698 template<
typename MT3
2702 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2703 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for targets of type
// DenseMatrix<MT3,false>. One operand is copied into a temporary of its OppositeType.
// NOTE(review): the statements that consume `tmp` in each branch and the final
// `else` line are not visible in this excerpt.
2724 template<
typename MT3
2728 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2729 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: convert B.
2734 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2735 const typename MT5::OppositeType tmp( B );
// Only B is resizable: convert A.
2738 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2739 const typename MT4::OppositeType tmp( A );
// Otherwise convert the operand with fewer elements (B on a tie).
2742 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2743 const typename MT5::OppositeType tmp( B );
// Remaining case: A is converted.
2747 const typename MT4::OppositeType tmp( A );
// Vectorized default addition assignment kernel for targets of type
// DenseMatrix<MT3,true>. Each group accumulates products over k in SIMD
// accumulators (xmm*), then writes load(C) + xmm * factor back to C.
// NOTE(review): the definitions of `factor`, `i`, `j`, `a1..a4`, `b1`, `b2`, some
// accumulator declarations and the guarding `if` statements are on lines that are
// not visible in this excerpt; `factor` presumably holds the broadcast scalar.
2767 template<
typename MT3
2771 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2772 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2774 typedef IntrinsicTrait<ElementType> IT;
// M is taken from A.spacing() rather than A.rows().
// NOTE(review): presumably so that whole SIMD vectors including padding are
// processed - confirm that C provides at least the same spacing.
2776 const size_t M( A.spacing() );
2777 const size_t N( B.columns() );
2778 const size_t K( A.columns() );
// Row blocks of 8 SIMD vectors (i advances by IT::size*8), one column j at a time.
2784 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2785 for(
size_t j=0UL; j<N; ++j ) {
2786 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2787 for(
size_t k=0UL; k<K; ++k ) {
2789 xmm1 = xmm1 + A.get(i ,k) * b1;
2790 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2791 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2792 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2793 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2794 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2795 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2796 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Add the scaled block to the current content of column j of C.
2798 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2799 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
2800 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
2801 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
2802 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
2803 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
2804 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
2805 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
// Row blocks of 4 SIMD vectors, two columns (j, j+1) at a time.
2808 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2810 for( ; (j+2UL) <= N; j+=2UL ) {
2811 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2812 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
2819 xmm1 = xmm1 + a1 * b1;
2820 xmm2 = xmm2 + a2 * b1;
2821 xmm3 = xmm3 + a3 * b1;
2822 xmm4 = xmm4 + a4 * b1;
2823 xmm5 = xmm5 + a1 * b2;
2824 xmm6 = xmm6 + a2 * b2;
2825 xmm7 = xmm7 + a3 * b2;
2826 xmm8 = xmm8 + a4 * b2;
2828 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2829 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
2830 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
2831 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
2832 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
2833 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
2834 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
2835 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
// Single remaining column for the 4-vector row block (guard not visible here).
2839 for(
size_t k=0UL; k<K; ++k ) {
2841 xmm1 = xmm1 + A.get(i ,k) * b1;
2842 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2843 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2844 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2846 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2847 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
2848 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
2849 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
// Row blocks of 2 SIMD vectors, two columns at a time.
2852 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2854 for( ; (j+2UL) <= N; j+=2UL ) {
2856 for(
size_t k=0UL; k<K; ++k ) {
2861 xmm1 = xmm1 + a1 * b1;
2862 xmm2 = xmm2 + a2 * b1;
2863 xmm3 = xmm3 + a1 * b2;
2864 xmm4 = xmm4 + a2 * b2;
2866 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2867 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
2868 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
2869 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
// Single remaining column for the 2-vector row block (guard not visible here).
2873 for(
size_t k=0UL; k<K; ++k ) {
2875 xmm1 = xmm1 + A.get(i ,k) * b1;
2876 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2878 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2879 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
// Trailing part with one SIMD vector of rows: two columns at a time; the elements
// of B are broadcast with set().
2884 for( ; (j+2UL) <= N; j+=2UL ) {
2886 for(
size_t k=0UL; k<K; ++k ) {
2888 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2889 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2891 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2892 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) + xmm2 * factor );
// Trailing part: single remaining column (guard not visible here).
2896 for(
size_t k=0UL; k<K; ++k ) {
2897 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2899 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// BLAS-dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> holds:
// no BLAS routine is used and the call is forwarded to selectDefaultAddAssignKernel().
2919 template<
typename MT3
2923 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2924 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2926 selectDefaultAddAssignKernel( C, A, B, scalar );
// Single precision BLAS addition assignment kernel: C += scalar * A * B via cblas_sgemm.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
2945 template<
typename MT3
2949 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2950 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2952 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
2958 const int M ( numeric_cast<int>( A.rows() ) );
2959 const int N ( numeric_cast<int>( B.columns() ) );
2960 const int K ( numeric_cast<int>( A.columns() ) );
2961 const int lda( numeric_cast<int>( A.spacing() ) );
2962 const int ldb( numeric_cast<int>( B.spacing() ) );
2963 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 1.0F: the product is added to the existing content of C.
2965 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2966 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2967 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2968 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS addition assignment kernel: C += scalar * A * B via cblas_dgemm.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
2988 template<
typename MT3
2992 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2993 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2995 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3001 const int M ( numeric_cast<int>( A.rows() ) );
3002 const int N ( numeric_cast<int>( B.columns() ) );
3003 const int K ( numeric_cast<int>( A.columns() ) );
3004 const int lda( numeric_cast<int>( A.spacing() ) );
3005 const int ldb( numeric_cast<int>( B.spacing() ) );
3006 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 1.0: the product is added to the existing content of C.
3008 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3009 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3010 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3011 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS addition assignment kernel:
// C += scalar * A * B via cblas_cgemm.
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
3031 template<
typename MT3
3035 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3036 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3038 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3048 const int M ( numeric_cast<int>( A.rows() ) );
3049 const int N ( numeric_cast<int>( B.columns() ) );
3050 const int K ( numeric_cast<int>( A.columns() ) );
3051 const int lda( numeric_cast<int>( A.spacing() ) );
3052 const int ldb( numeric_cast<int>( B.spacing() ) );
3053 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 1: the product is added to the existing content of C.
3054 const complex<float> alpha( scalar );
3055 const complex<float> beta ( 1.0F, 0.0F );
3057 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3058 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3059 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3060 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS addition assignment kernel:
// C += scalar * A * B via cblas_zgemm.
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
3080 template<
typename MT3
3084 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3085 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3087 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3097 const int M ( numeric_cast<int>( A.rows() ) );
3098 const int N ( numeric_cast<int>( B.columns() ) );
3099 const int K ( numeric_cast<int>( A.columns() ) );
3100 const int lda( numeric_cast<int>( A.spacing() ) );
3101 const int ldb( numeric_cast<int>( B.spacing() ) );
3102 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 1: the product is added to the existing content of C.
3103 const complex<double> alpha( scalar );
3104 const complex<double> beta ( 1.0, 0.0 );
3106 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3107 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3108 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3109 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product to a dense matrix with storage
// order SO.
// NOTE(review): the early-return body, the evaluation of the operands into A and B
// and the kernel selection condition are not visible in this excerpt.
3130 template<
typename MT3
3132 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
3137 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3138 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension.
3140 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel.
3155 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3157 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) subtraction assignment kernel, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not satisfied (DisableIf).
// NOTE(review): only the signature is visible in this excerpt; the body is missing.
3175 template<
typename MT3
3179 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3180 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction assignment kernel for targets of type
// DenseMatrix<MT3,false>. One operand is copied into a temporary of its OppositeType.
// NOTE(review): the statements that consume `tmp` in each branch and the final
// `else` line are not visible in this excerpt.
3201 template<
typename MT3
3205 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3206 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: convert B.
3211 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3212 const typename MT5::OppositeType tmp( B );
// Only B is resizable: convert A.
3215 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3216 const typename MT4::OppositeType tmp( A );
// Otherwise convert the operand with fewer elements (B on a tie).
3219 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3220 const typename MT5::OppositeType tmp( B );
// Remaining case: A is converted.
3224 const typename MT4::OppositeType tmp( A );
// Vectorized default subtraction assignment kernel for targets of type
// DenseMatrix<MT3,true>. Each group accumulates products over k in SIMD
// accumulators (xmm*), then writes load(C) - xmm * factor back to C.
// NOTE(review): the definitions of `factor`, `i`, `j`, `a1..a4`, `b1`, `b2`, some
// accumulator declarations and the guarding `if` statements are on lines that are
// not visible in this excerpt; `factor` presumably holds the broadcast scalar.
3244 template<
typename MT3
3248 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3249 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3251 typedef IntrinsicTrait<ElementType> IT;
// M is taken from A.spacing() rather than A.rows().
// NOTE(review): presumably so that whole SIMD vectors including padding are
// processed - confirm that C provides at least the same spacing.
3253 const size_t M( A.spacing() );
3254 const size_t N( B.columns() );
3255 const size_t K( A.columns() );
// Row blocks of 8 SIMD vectors (i advances by IT::size*8), one column j at a time.
3261 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3262 for(
size_t j=0UL; j<N; ++j ) {
3263 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3264 for(
size_t k=0UL; k<K; ++k ) {
3266 xmm1 = xmm1 + A.get(i ,k) * b1;
3267 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3268 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3269 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3270 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3271 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3272 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3273 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Subtract the scaled block from the current content of column j of C.
3275 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3276 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3277 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3278 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
3279 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
3280 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
3281 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
3282 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
// Row blocks of 4 SIMD vectors, two columns (j, j+1) at a time.
3285 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3287 for( ; (j+2UL) <= N; j+=2UL ) {
3288 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3289 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
3296 xmm1 = xmm1 + a1 * b1;
3297 xmm2 = xmm2 + a2 * b1;
3298 xmm3 = xmm3 + a3 * b1;
3299 xmm4 = xmm4 + a4 * b1;
3300 xmm5 = xmm5 + a1 * b2;
3301 xmm6 = xmm6 + a2 * b2;
3302 xmm7 = xmm7 + a3 * b2;
3303 xmm8 = xmm8 + a4 * b2;
3305 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3306 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
3307 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
3308 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
3309 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
3310 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
3311 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
3312 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
// Single remaining column for the 4-vector row block (guard not visible here).
3316 for(
size_t k=0UL; k<K; ++k ) {
3318 xmm1 = xmm1 + A.get(i ,k) * b1;
3319 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3320 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3321 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3323 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3324 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3325 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3326 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
// Row blocks of 2 SIMD vectors, two columns at a time.
3329 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3331 for( ; (j+2UL) <= N; j+=2UL ) {
3333 for(
size_t k=0UL; k<K; ++k ) {
3338 xmm1 = xmm1 + a1 * b1;
3339 xmm2 = xmm2 + a2 * b1;
3340 xmm3 = xmm3 + a1 * b2;
3341 xmm4 = xmm4 + a2 * b2;
3343 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3344 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
3345 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
3346 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
// Single remaining column for the 2-vector row block (guard not visible here).
3350 for(
size_t k=0UL; k<K; ++k ) {
3352 xmm1 = xmm1 + A.get(i ,k) * b1;
3353 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3355 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3356 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
// Trailing part with one SIMD vector of rows: two columns at a time; the elements
// of B are broadcast with set().
3361 for( ; (j+2UL) <= N; j+=2UL ) {
3363 for(
size_t k=0UL; k<K; ++k ) {
3365 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3366 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3368 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3369 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) - xmm2 * factor );
// Trailing part: single remaining column (guard not visible here).
3373 for(
size_t k=0UL; k<K; ++k ) {
3374 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
3376 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// BLAS-dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> holds:
// no BLAS routine is used and the call is forwarded to selectDefaultSubAssignKernel().
3396 template<
typename MT3
3400 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3401 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3403 selectDefaultSubAssignKernel( C, A, B, scalar );
// Single precision BLAS subtraction assignment kernel: C -= scalar * A * B via
// cblas_sgemm. Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
3422 template<
typename MT3
3426 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3427 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3429 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3435 const int M ( numeric_cast<int>( A.rows() ) );
3436 const int N ( numeric_cast<int>( B.columns() ) );
3437 const int K ( numeric_cast<int>( A.columns() ) );
3438 const int lda( numeric_cast<int>( A.spacing() ) );
3439 const int ldb( numeric_cast<int>( B.spacing() ) );
3440 const int ldc( numeric_cast<int>( C.spacing() ) );
// The subtraction is expressed as alpha = -scalar with beta = 1.0F.
3442 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3443 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3444 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3445 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS subtraction assignment kernel: C -= scalar * A * B via
// cblas_dgemm. Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
3465 template<
typename MT3
3469 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3470 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3472 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3478 const int M ( numeric_cast<int>( A.rows() ) );
3479 const int N ( numeric_cast<int>( B.columns() ) );
3480 const int K ( numeric_cast<int>( A.columns() ) );
3481 const int lda( numeric_cast<int>( A.spacing() ) );
3482 const int ldb( numeric_cast<int>( B.spacing() ) );
3483 const int ldc( numeric_cast<int>( C.spacing() ) );
// The subtraction is expressed as alpha = -scalar with beta = 1.0.
3485 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3486 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3487 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3488 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS subtraction assignment kernel:
// C -= scalar * A * B via cblas_cgemm.
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
3508 template<
typename MT3
3512 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3513 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3515 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3525 const int M ( numeric_cast<int>( A.rows() ) );
3526 const int N ( numeric_cast<int>( B.columns() ) );
3527 const int K ( numeric_cast<int>( A.columns() ) );
3528 const int lda( numeric_cast<int>( A.spacing() ) );
3529 const int ldb( numeric_cast<int>( B.spacing() ) );
3530 const int ldc( numeric_cast<int>( C.spacing() ) );
// The subtraction is expressed as alpha = -scalar with beta = 1.
3531 const complex<float> alpha( -scalar );
3532 const complex<float> beta ( 1.0F, 0.0F );
3534 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3535 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3536 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3537 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS subtraction assignment kernel:
// C -= scalar * A * B via cblas_zgemm.
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): lines between the using-declaration and the dimension setup are not
// visible in this excerpt.
3557 template<
typename MT3
3561 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3562 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3564 using boost::numeric_cast;
// BLAS takes int dimensions; numeric_cast performs the size_t -> int conversions
// with range checking.
3574 const int M ( numeric_cast<int>( A.rows() ) );
3575 const int N ( numeric_cast<int>( B.columns() ) );
3576 const int K ( numeric_cast<int>( A.columns() ) );
3577 const int lda( numeric_cast<int>( A.spacing() ) );
3578 const int ldb( numeric_cast<int>( B.spacing() ) );
3579 const int ldc( numeric_cast<int>( C.spacing() ) );
// The subtraction is expressed as alpha = -scalar with beta = 1.
3580 const complex<double> alpha( -scalar );
3581 const complex<double> beta ( 1.0, 0.0 );
3583 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3584 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3585 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3586 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Function template that throws std::invalid_argument("Matrix sizes do not match").
// NOTE(review): the signature and the checked condition are not visible in this
// excerpt - presumably the multiplication operator for two column-major dense
// matrices, rejecting operands with mismatching inner dimensions; confirm.
3651 template<
typename T1
3657 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait for (MT1 * MT2) * VT with a dense vector.
// Type is the result of multiplying MT1 with the result of MT2 * VT, provided MT1
// and MT2 are dense column-major matrices and VT is a dense, non-transpose vector;
// otherwise Type is INVALID_TYPE.
// NOTE(review): the specialization header naming the trait is not visible in this
// excerpt.
3674 template<
typename MT1,
typename MT2,
typename VT >
3679 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3680 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3681 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3682 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
3683 , INVALID_TYPE >::Type Type;
// Expression trait for (MT1 * MT2) * VT with a sparse vector.
// Type is the result of multiplying MT1 with the result of MT2 * VT (the inner
// product uses TDMatSVecMultExprTrait, the outer one TDMatDVecMultExprTrait),
// provided MT1 and MT2 are dense column-major matrices and VT is a sparse,
// non-transpose vector; otherwise Type is INVALID_TYPE.
// NOTE(review): the specialization header naming the trait is not visible in this
// excerpt.
3692 template<
typename MT1,
typename MT2,
typename VT >
3697 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3698 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3699 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3700 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
3701 , INVALID_TYPE >::Type Type;
// Expression trait for VT * (MT1 * MT2) with a dense transpose vector.
// Type is the result of multiplying the result of VT * MT1 with MT2, provided VT is
// a dense transpose vector and MT1 and MT2 are dense column-major matrices;
// otherwise Type is INVALID_TYPE.
// NOTE(review): the specialization header naming the trait is not visible in this
// excerpt.
3710 template<
typename VT,
typename MT1,
typename MT2 >
3715 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
3716 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3717 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
3718 ,
typename TDVecTDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3719 , INVALID_TYPE >::Type Type;
// Expression trait for VT * (MT1 * MT2) with a sparse transpose vector.
// Type is the result of multiplying the result of VT * MT1 with MT2 (the inner
// product uses TSVecTDMatMultExprTrait, the outer one TDVecTDMatMultExprTrait),
// provided VT is a sparse transpose vector and MT1 and MT2 are dense column-major
// matrices; otherwise Type is INVALID_TYPE.
// NOTE(review): the specialization header naming the trait is not visible in this
// excerpt.
3728 template<
typename VT,
typename MT1,
typename MT2 >
3733 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
3734 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
3735 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
3736 ,
typename TDVecTDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3737 , INVALID_TYPE >::Type Type;