22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
92 template<
typename MT1
// Local shorthand for the nested ResultType and CompositeType of the two operand types
// MT1 and MT2.
100 typedef typename MT1::ResultType
RT1;
101 typedef typename MT2::ResultType
RT2;
102 typedef typename MT1::CompositeType
CT1;
103 typedef typename MT2::CompositeType
CT2;
// Compile-time predicate over three types. UseDefaultKernel negates its `value`, and it
// guards the cblas_sgemm-based kernels through EnableIf.
// NOTE(review): the enum that defines `value` is not part of this excerpt.
111 template<
typename T1,
typename T2,
typename T3 >
112 struct UseSinglePrecisionKernel {
// Compile-time predicate over three types. UseDefaultKernel negates its `value`, and it
// guards the cblas_dgemm-based kernels through EnableIf.
// NOTE(review): the enum that defines `value` is not part of this excerpt.
125 template<
typename T1,
typename T2,
typename T3 >
126 struct UseDoublePrecisionKernel {
// `value` is non-zero only when the ElementType of T1, of T2 and of T3 is complex<float>.
// Used as the EnableIf condition of the cblas_cgemm-based kernels.
140 template<
typename T1,
typename T2,
typename T3 >
141 struct UseSinglePrecisionComplexKernel {
142 typedef complex<float> Type;
143 enum { value = IsSame<typename T1::ElementType,Type>::value &&
144 IsSame<typename T2::ElementType,Type>::value &&
145 IsSame<typename T3::ElementType,Type>::value };
// `value` is non-zero only when the ElementType of T1, of T2 and of T3 is complex<double>.
// Used as the EnableIf condition of the cblas_zgemm-based kernels.
156 template<
typename T1,
typename T2,
typename T3 >
157 struct UseDoublePrecisionComplexKernel {
158 typedef complex<double> Type;
159 enum { value = IsSame<typename T1::ElementType,Type>::value &&
160 IsSame<typename T2::ElementType,Type>::value &&
161 IsSame<typename T3::ElementType,Type>::value };
// `value` is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none of the four
// BLAS-kernel predicates (single, double, complex<float>, complex<double>) holds for
// (T1,T2,T3). It is the EnableIf condition of the selectBlas*Kernel overloads that simply
// forward to the default kernels.
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDefaultKernel {
173 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
174 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
175 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
176 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// `value` is non-zero only when all of the following hold:
//  - T1, T2 and T3 each report `vectorizable`,
//  - T2 and T3 have the same ElementType as T1,
//  - IntrinsicTrait of that element type reports both `addition` and `multiplication`.
// It selects between the EnableIf (vectorized) and DisableIf (scalar) default kernels.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseVectorizedDefaultKernel {
188 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
189 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
190 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
191 IntrinsicTrait<typename T1::ElementType>::addition &&
192 IntrinsicTrait<typename T1::ElementType>::multiplication };
// The product expression itself reports `vectorizable` as 0.
223 enum { vectorizable = 0 };
// Element computation fragment (the enclosing signature is not visible in this excerpt).
// The accumulation below only runs when the left operand has at least one column.
256 if(
lhs_.columns() != 0UL ) {
// size_t(-2) has every bit set except the lowest, so the mask rounds columns()-1 down to an
// even number; adding 1 yields an odd bound, which lets the loop below start at k=1 and
// consume the indices k and k+1 together.
257 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// NOTE(review): only the k+1 term is visible here; the k term and the k=0 start value of
// tmp presumably sit on lines missing from this excerpt.
259 for(
size_t k=1UL; k<end; k+=2UL ) {
261 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// True when one index (`end`) is left over after the pairwise loop; the body of this
// branch is not visible in this excerpt.
263 if( end <
lhs_.columns() ) {
// The reported column count is that of the right-hand operand.
291 return rhs_.columns();
321 template<
typename T >
// Assignment dispatch fragment (signature and branch bodies are not visible in this excerpt).
344 template<
typename MT3
// Guard: the target has no rows or no columns.
351 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Guard: the left operand of the product has zero columns, i.e. the summation index range
// used by the kernels (K = A.columns()) is empty.
354 else if( rhs.
lhs_.columns() == 0UL ) {
// Two alternative kernels receive the same arguments; the condition that picks between
// them is on lines missing from this excerpt.
370 DMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
372 DMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Default assignment kernel using plain element access: C = A * B.
// NOTE(review): the return type / enabling condition is on a line missing from this excerpt.
390 template<
typename MT3
394 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M and K are taken from A, N from B.
396 const size_t M( A.rows() );
397 const size_t N( B.columns() );
398 const size_t K( A.columns() );
400 for(
size_t i=0UL; i<M; ++i ) {
// The k = 0 term is written with plain assignment, so the previous content of C(i,j) is
// overwritten rather than read. A(i,0) and B(0,j) are accessed unconditionally here.
401 for(
size_t j=0UL; j<N; ++j ) {
402 C(i,j) = A(i,0UL) * B(0UL,j);
// All remaining k terms are accumulated onto that start value.
404 for(
size_t k=1UL; k<K; ++k ) {
405 for(
size_t j=0UL; j<N; ++j ) {
406 C(i,j) += A(i,k) * B(k,j);
// Vectorized default assignment kernel for targets of type DenseMatrix<MT3,false>, enabled
// through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declarations of j, i, a1, a2 and b1..b4 as well as several loop/branch
// headers are on lines missing from this excerpt; the comments below describe only what is
// visible.
428 template<
typename MT3
431 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
432 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
434 typedef IntrinsicTrait<ElementType> IT;
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably the padded row length so that whole SIMD vectors can be
// processed — confirm against the matrix types.
436 const size_t M( A.rows() );
437 const size_t N( B.spacing() );
438 const size_t K( A.columns() );
// Column blocks that are 8 SIMD vectors (IT::size*8 elements) wide, one row at a time.
442 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
443 for(
size_t i=0UL; i<M; ++i ) {
// The accumulators are default-constructed and then summed into.
// NOTE(review): this relies on IntrinsicType's default constructor producing zero — confirm.
444 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
445 for(
size_t k=0UL; k<K; ++k ) {
447 xmm1 = xmm1 + a1 * B.get(k,j );
448 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
449 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
450 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
451 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
452 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
453 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
454 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// The finished sums overwrite the corresponding part of row i of C.
456 store( &(~C)(i,j ), xmm1 );
457 store( &(~C)(i,j+IT::size ), xmm2 );
458 store( &(~C)(i,j+IT::size*2UL), xmm3 );
459 store( &(~C)(i,j+IT::size*3UL), xmm4 );
460 store( &(~C)(i,j+IT::size*4UL), xmm5 );
461 store( &(~C)(i,j+IT::size*5UL), xmm6 );
462 store( &(~C)(i,j+IT::size*6UL), xmm7 );
463 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column blocks that are 4 SIMD vectors wide; rows i and i+1 are handled together, so
// each b vector is combined with two row factors (a1, a2).
466 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
468 for( ; (i+2UL) <= M; i+=2UL ) {
469 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
470 for(
size_t k=0UL; k<K; ++k ) {
477 xmm1 = xmm1 + a1 * b1;
478 xmm2 = xmm2 + a1 * b2;
479 xmm3 = xmm3 + a1 * b3;
480 xmm4 = xmm4 + a1 * b4;
481 xmm5 = xmm5 + a2 * b1;
482 xmm6 = xmm6 + a2 * b2;
483 xmm7 = xmm7 + a2 * b3;
484 xmm8 = xmm8 + a2 * b4;
486 store( &(~C)(i ,j ), xmm1 );
487 store( &(~C)(i ,j+IT::size ), xmm2 );
488 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
489 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
490 store( &(~C)(i+1UL,j ), xmm5 );
491 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
492 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
493 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Same 4-vector block for a single row i (presumably the odd row left after the pair loop;
// the guarding condition is not visible).
497 for(
size_t k=0UL; k<K; ++k ) {
499 xmm1 = xmm1 + a1 * B.get(k,j );
500 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
501 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
502 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
504 store( &(~C)(i,j ), xmm1 );
505 store( &(~C)(i,j+IT::size ), xmm2 );
506 store( &(~C)(i,j+IT::size*2UL), xmm3 );
507 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column blocks that are 2 SIMD vectors wide, again rows in pairs first.
510 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
512 for( ; (i+2UL) <= M; i+=2UL ) {
514 for(
size_t k=0UL; k<K; ++k ) {
519 xmm1 = xmm1 + a1 * b1;
520 xmm2 = xmm2 + a1 * b2;
521 xmm3 = xmm3 + a2 * b1;
522 xmm4 = xmm4 + a2 * b2;
524 store( &(~C)(i ,j ), xmm1 );
525 store( &(~C)(i ,j+IT::size), xmm2 );
526 store( &(~C)(i+1UL,j ), xmm3 );
527 store( &(~C)(i+1UL,j+IT::size), xmm4 );
// Single-row variant of the 2-vector block.
531 for(
size_t k=0UL; k<K; ++k ) {
533 xmm1 = xmm1 + a1 * B.get(k,j );
534 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
536 store( &(~C)(i,j ), xmm1 );
537 store( &(~C)(i,j+IT::size), xmm2 );
// One SIMD vector per step, rows in pairs (the enclosing column loop header is not visible).
// The row factors are broadcast with set() directly from A(i,k) and A(i+1,k).
542 for( ; (i+2UL) <= M; i+=2UL ) {
544 for(
size_t k=0UL; k<K; ++k ) {
546 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
547 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
549 store( &(~C)(i ,j), xmm1 );
550 store( &(~C)(i+1UL,j), xmm2 );
// Single row, single SIMD vector.
554 for(
size_t k=0UL; k<K; ++k ) {
555 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
557 store( &(~C)(i,j), xmm1 );
// Vectorized default assignment kernel for targets of type DenseMatrix<MT3,true>.
// Each visible branch constructs a temporary of the OppositeType of one operand; what is
// done with `tmp` afterwards is on lines missing from this excerpt.
578 template<
typename MT3
581 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
582 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// MT4 is not resizable but MT5 is: the temporary is built from A.
587 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
588 const typename MT4::OppositeType tmp( A );
// MT4 is resizable but MT5 is not: the temporary is built from B.
591 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
592 const typename MT5::OppositeType tmp( B );
// Otherwise the operand with fewer elements (rows*columns) is copied; A wins a tie.
595 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
596 const typename MT4::OppositeType tmp( A );
// Remaining case (branch header not visible): the temporary is built from B.
600 const typename MT5::OppositeType tmp( B );
// BLAS dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5> holds: it performs no
// BLAS call and forwards its arguments unchanged to selectDefaultAssignKernel.
620 template<
typename MT3
623 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
624 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
626 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel enabled by UseSinglePrecisionKernel: C = A * B via cblas_sgemm.
646 template<
typename MT3
649 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
650 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
652 using boost::numeric_cast;
// The CBLAS interface takes int arguments; the size_t values are converted with
// boost::numeric_cast. Leading dimensions are taken from spacing().
658 const int M ( numeric_cast<int>( A.rows() ) );
659 const int N ( numeric_cast<int>( B.columns() ) );
660 const int K ( numeric_cast<int>( A.columns() ) );
661 const int lda( numeric_cast<int>( A.spacing() ) );
662 const int ldb( numeric_cast<int>( B.spacing() ) );
663 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from the target type MT3 only.
// alpha = 1 and beta = 0, so gemm (alpha*op(A)*op(B) + beta*C) overwrites C.
665 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
666 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
667 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
668 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel enabled by UseDoublePrecisionKernel: C = A * B via cblas_dgemm.
689 template<
typename MT3
692 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
693 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
695 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
701 const int M ( numeric_cast<int>( A.rows() ) );
702 const int N ( numeric_cast<int>( B.columns() ) );
703 const int K ( numeric_cast<int>( A.columns() ) );
704 const int lda( numeric_cast<int>( A.spacing() ) );
705 const int ldb( numeric_cast<int>( B.spacing() ) );
706 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from the target type MT3 only.
// alpha = 1 and beta = 0, so C is overwritten with the product.
708 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
709 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
710 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
711 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel enabled by UseSinglePrecisionComplexKernel: C = A * B via cblas_cgemm.
732 template<
typename MT3
735 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
736 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
738 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
747 const int M ( numeric_cast<int>( A.rows() ) );
748 const int N ( numeric_cast<int>( B.columns() ) );
749 const int K ( numeric_cast<int>( A.columns() ) );
750 const int lda( numeric_cast<int>( A.spacing() ) );
751 const int ldb( numeric_cast<int>( B.spacing() ) );
752 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = 1+0i, beta = 0+0i (overwrite C).
753 const complex<float> alpha( 1.0F, 0.0F );
754 const complex<float> beta ( 0.0F, 0.0F );
// Storage order and both transposition flags are derived from the target type MT3 only.
756 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
757 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
758 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
759 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel enabled by UseDoublePrecisionComplexKernel: C = A * B via cblas_zgemm.
780 template<
typename MT3
783 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
784 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
786 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
795 const int M ( numeric_cast<int>( A.rows() ) );
796 const int N ( numeric_cast<int>( B.columns() ) );
797 const int K ( numeric_cast<int>( A.columns() ) );
798 const int lda( numeric_cast<int>( A.spacing() ) );
799 const int ldb( numeric_cast<int>( B.spacing() ) );
800 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = 1+0i, beta = 0+0i (overwrite C).
801 const complex<double> alpha( 1.0, 0.0 );
802 const complex<double> beta ( 0.0, 0.0 );
// Storage order and both transposition flags are derived from the target type MT3 only.
804 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
805 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
806 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
807 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// NOTE(review): fragment — the enclosing function signature is not visible in this excerpt.
825 template<
typename MT
// Temporary type chosen by SelectType between OppositeType and ResultType, keyed on SO.
829 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// The expression rhs is first materialised into a temporary of that type.
841 const TmpType tmp( rhs );
// Addition-assignment dispatch fragment (signature and branch bodies are not visible).
860 template<
typename MT3
// Single guard: empty target (no rows or no columns) or a left operand without columns.
867 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Two alternative kernels receive the same arguments; the condition that picks between
// them is on lines missing from this excerpt.
882 DMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
884 DMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Default addition-assignment kernel using plain element access: C += A * B.
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
903 template<
typename MT3
906 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
907 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
909 const size_t M( A.rows() );
910 const size_t N( B.columns() );
911 const size_t K( A.columns() );
// N with its lowest bit cleared, i.e. the largest even value not exceeding N.
914 const size_t end( N &
size_t(-2) );
916 for(
size_t i=0UL; i<M; ++i ) {
917 for(
size_t k=0UL; k<K; ++k ) {
// Columns are updated two at a time up to `end`.
918 for(
size_t j=0UL; j<end; j+=2UL ) {
919 C(i,j ) += A(i,k) * B(k,j );
920 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of the column at index `end` itself.
// NOTE(review): the condition guarding this statement (presumably end < N) is not visible.
923 C(i,end) += A(i,k) * B(k,end);
// Vectorized default addition-assignment kernel for targets of type DenseMatrix<MT3,false>,
// enabled through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declarations of j, i, a1, a2, b1..b4 and of the xmm accumulators
// (including whatever initial value they receive) are on lines missing from this excerpt;
// only the accumulate-and-store pattern is visible.
945 template<
typename MT3
948 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
949 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
951 typedef IntrinsicTrait<ElementType> IT;
// N is taken from B.spacing(), not B.columns().
953 const size_t M( A.rows() );
954 const size_t N( B.spacing() );
955 const size_t K( A.columns() );
// Column blocks that are 8 SIMD vectors wide, one row at a time.
959 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
960 for(
size_t i=0UL; i<M; ++i ) {
969 for(
size_t k=0UL; k<K; ++k ) {
971 xmm1 = xmm1 + a1 * B.get(k,j );
972 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
973 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
974 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
975 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
976 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
977 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
978 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
980 store( &(~C)(i,j ), xmm1 );
981 store( &(~C)(i,j+IT::size ), xmm2 );
982 store( &(~C)(i,j+IT::size*2UL), xmm3 );
983 store( &(~C)(i,j+IT::size*3UL), xmm4 );
984 store( &(~C)(i,j+IT::size*4UL), xmm5 );
985 store( &(~C)(i,j+IT::size*5UL), xmm6 );
986 store( &(~C)(i,j+IT::size*6UL), xmm7 );
987 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column blocks that are 4 SIMD vectors wide; rows i and i+1 are handled together.
990 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
992 for( ; (i+2UL) <= M; i+=2UL ) {
1001 for(
size_t k=0UL; k<K; ++k ) {
1008 xmm1 = xmm1 + a1 * b1;
1009 xmm2 = xmm2 + a1 * b2;
1010 xmm3 = xmm3 + a1 * b3;
1011 xmm4 = xmm4 + a1 * b4;
1012 xmm5 = xmm5 + a2 * b1;
1013 xmm6 = xmm6 + a2 * b2;
1014 xmm7 = xmm7 + a2 * b3;
1015 xmm8 = xmm8 + a2 * b4;
1017 store( &(~C)(i ,j ), xmm1 );
1018 store( &(~C)(i ,j+IT::size ), xmm2 );
1019 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1020 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1021 store( &(~C)(i+1UL,j ), xmm5 );
1022 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1023 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1024 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Same 4-vector block for a single row i (guarding condition not visible).
1031 for(
size_t k=0UL; k<K; ++k ) {
1033 xmm1 = xmm1 + a1 * B.get(k,j );
1034 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1035 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1036 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1038 store( &(~C)(i,j ), xmm1 );
1039 store( &(~C)(i,j+IT::size ), xmm2 );
1040 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1041 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column blocks that are 2 SIMD vectors wide, rows in pairs first.
1044 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1046 for( ; (i+2UL) <= M; i+=2UL ) {
1051 for(
size_t k=0UL; k<K; ++k ) {
1056 xmm1 = xmm1 + a1 * b1;
1057 xmm2 = xmm2 + a1 * b2;
1058 xmm3 = xmm3 + a2 * b1;
1059 xmm4 = xmm4 + a2 * b2;
1061 store( &(~C)(i ,j ), xmm1 );
1062 store( &(~C)(i ,j+IT::size), xmm2 );
1063 store( &(~C)(i+1UL,j ), xmm3 );
1064 store( &(~C)(i+1UL,j+IT::size), xmm4 );
// Single-row variant of the 2-vector block.
1069 for(
size_t k=0UL; k<K; ++k ) {
1071 xmm1 = xmm1 + a1 * B.get(k,j );
1072 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1074 store( &(~C)(i,j ), xmm1 );
1075 store( &(~C)(i,j+IT::size), xmm2 );
// One SIMD vector per step, rows in pairs (enclosing column loop header not visible).
1080 for( ; (i+2UL) <= M; i+=2UL ) {
1083 for(
size_t k=0UL; k<K; ++k ) {
1085 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1086 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1088 store( &(~C)(i ,j), xmm1 );
1089 store( &(~C)(i+1UL,j), xmm2 );
// Single row, single SIMD vector.
1093 for(
size_t k=0UL; k<K; ++k ) {
1094 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
1096 store( &(~C)(i,j), xmm1 );
// Vectorized default addition-assignment kernel for targets of type DenseMatrix<MT3,true>.
// Each visible branch constructs a temporary of the OppositeType of one operand; what is
// done with `tmp` afterwards is on lines missing from this excerpt.
1117 template<
typename MT3
1120 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1121 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// MT4 is not resizable but MT5 is: the temporary is built from A.
1126 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1127 const typename MT4::OppositeType tmp( A );
// MT4 is resizable but MT5 is not: the temporary is built from B.
1130 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1131 const typename MT5::OppositeType tmp( B );
// Otherwise the operand with fewer elements (rows*columns) is copied; A wins a tie.
1134 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1135 const typename MT4::OppositeType tmp( A );
// Remaining case (branch header not visible): the temporary is built from B.
1139 const typename MT5::OppositeType tmp( B );
// BLAS dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5> holds: it performs no
// BLAS call and forwards its arguments unchanged to selectDefaultAddAssignKernel.
1160 template<
typename MT3
1163 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1164 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1166 selectDefaultAddAssignKernel( C, A, B );
// BLAS addition-assignment kernel enabled by UseSinglePrecisionKernel: C += A * B via cblas_sgemm.
1186 template<
typename MT3
1189 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1190 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1192 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1198 const int M ( numeric_cast<int>( A.rows() ) );
1199 const int N ( numeric_cast<int>( B.columns() ) );
1200 const int K ( numeric_cast<int>( A.columns() ) );
1201 const int lda( numeric_cast<int>( A.spacing() ) );
1202 const int ldb( numeric_cast<int>( B.spacing() ) );
1203 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from the target type MT3 only.
// alpha = 1 and beta = 1, so the product is added to the existing content of C.
1205 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1206 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1207 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1208 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition-assignment kernel enabled by UseDoublePrecisionKernel: C += A * B via cblas_dgemm.
1229 template<
typename MT3
1232 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1233 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1235 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1241 const int M ( numeric_cast<int>( A.rows() ) );
1242 const int N ( numeric_cast<int>( B.columns() ) );
1243 const int K ( numeric_cast<int>( A.columns() ) );
1244 const int lda( numeric_cast<int>( A.spacing() ) );
1245 const int ldb( numeric_cast<int>( B.spacing() ) );
1246 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from the target type MT3 only.
// alpha = 1 and beta = 1, so the product is added to the existing content of C.
1248 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1249 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1250 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1251 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition-assignment kernel enabled by UseSinglePrecisionComplexKernel:
// C += A * B via cblas_cgemm.
1272 template<
typename MT3
1275 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1276 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1278 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1287 const int M ( numeric_cast<int>( A.rows() ) );
1288 const int N ( numeric_cast<int>( B.columns() ) );
1289 const int K ( numeric_cast<int>( A.columns() ) );
1290 const int lda( numeric_cast<int>( A.spacing() ) );
1291 const int ldb( numeric_cast<int>( B.spacing() ) );
1292 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = 1+0i, beta = 1+0i (accumulate).
1293 const complex<float> alpha( 1.0F, 0.0F );
1294 const complex<float> beta ( 1.0F, 0.0F );
// Storage order and both transposition flags are derived from the target type MT3 only.
1296 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1297 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1298 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1299 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition-assignment kernel enabled by UseDoublePrecisionComplexKernel:
// C += A * B via cblas_zgemm.
1320 template<
typename MT3
1323 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1324 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1326 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1335 const int M ( numeric_cast<int>( A.rows() ) );
1336 const int N ( numeric_cast<int>( B.columns() ) );
1337 const int K ( numeric_cast<int>( A.columns() ) );
1338 const int lda( numeric_cast<int>( A.spacing() ) );
1339 const int ldb( numeric_cast<int>( B.spacing() ) );
1340 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = 1+0i, beta = 1+0i (accumulate).
1341 const complex<double> alpha( 1.0, 0.0 );
1342 const complex<double> beta ( 1.0, 0.0 );
// Storage order and both transposition flags are derived from the target type MT3 only.
1344 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1345 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1346 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1347 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction-assignment dispatch fragment (signature and branch bodies are not visible).
1370 template<
typename MT3
// Single guard: empty target (no rows or no columns) or a left operand without columns.
1377 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Two alternative kernels receive the same arguments; the condition that picks between
// them is on lines missing from this excerpt.
1392 DMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1394 DMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Default subtraction-assignment kernel using plain element access: C -= A * B.
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
1413 template<
typename MT3
1416 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1417 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1419 const size_t M( A.rows() );
1420 const size_t N( B.columns() );
1421 const size_t K( A.columns() );
// N with its lowest bit cleared, i.e. the largest even value not exceeding N.
1424 const size_t end( N &
size_t(-2) );
1426 for(
size_t i=0UL; i<M; ++i ) {
1427 for(
size_t k=0UL; k<K; ++k ) {
// Columns are updated two at a time up to `end`.
1428 for(
size_t j=0UL; j<end; j+=2UL ) {
1429 C(i,j ) -= A(i,k) * B(k,j );
1430 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of the column at index `end` itself.
// NOTE(review): the condition guarding this statement (presumably end < N) is not visible.
1433 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default subtraction-assignment kernel for targets of type DenseMatrix<MT3,false>,
// enabled through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declarations of j, i, a1, a2, b1..b4 and of the xmm accumulators
// (including whatever initial value they receive) are on lines missing from this excerpt;
// only the subtract-and-store pattern is visible.
1455 template<
typename MT3
1458 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1459 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1461 typedef IntrinsicTrait<ElementType> IT;
// N is taken from B.spacing(), not B.columns().
1463 const size_t M( A.rows() );
1464 const size_t N( B.spacing() );
1465 const size_t K( A.columns() );
// Column blocks that are 8 SIMD vectors wide, one row at a time.
1469 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1470 for(
size_t i=0UL; i<M; ++i ) {
1479 for(
size_t k=0UL; k<K; ++k ) {
1481 xmm1 = xmm1 - a1 * B.get(k,j );
1482 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1483 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1484 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1485 xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1486 xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1487 xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1488 xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1490 store( &(~C)(i,j ), xmm1 );
1491 store( &(~C)(i,j+IT::size ), xmm2 );
1492 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1493 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1494 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1495 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1496 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1497 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Column blocks that are 4 SIMD vectors wide; rows i and i+1 are handled together.
1500 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1502 for( ; (i+2UL) <= M; i+=2UL ) {
1511 for(
size_t k=0UL; k<K; ++k ) {
1518 xmm1 = xmm1 - a1 * b1;
1519 xmm2 = xmm2 - a1 * b2;
1520 xmm3 = xmm3 - a1 * b3;
1521 xmm4 = xmm4 - a1 * b4;
1522 xmm5 = xmm5 - a2 * b1;
1523 xmm6 = xmm6 - a2 * b2;
1524 xmm7 = xmm7 - a2 * b3;
1525 xmm8 = xmm8 - a2 * b4;
1527 store( &(~C)(i ,j ), xmm1 );
1528 store( &(~C)(i ,j+IT::size ), xmm2 );
1529 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1530 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1531 store( &(~C)(i+1UL,j ), xmm5 );
1532 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1533 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1534 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Same 4-vector block for a single row i (guarding condition not visible).
1541 for(
size_t k=0UL; k<K; ++k ) {
1543 xmm1 = xmm1 - a1 * B.get(k,j );
1544 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1545 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1546 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1548 store( &(~C)(i,j ), xmm1 );
1549 store( &(~C)(i,j+IT::size ), xmm2 );
1550 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1551 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Column blocks that are 2 SIMD vectors wide, rows in pairs first.
1554 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1556 for( ; (i+2UL) <= M; i+=2UL ) {
1561 for(
size_t k=0UL; k<K; ++k ) {
1566 xmm1 = xmm1 - a1 * b1;
1567 xmm2 = xmm2 - a1 * b2;
1568 xmm3 = xmm3 - a2 * b1;
1569 xmm4 = xmm4 - a2 * b2;
1571 store( &(~C)(i ,j ), xmm1 );
1572 store( &(~C)(i ,j+IT::size), xmm2 );
1573 store( &(~C)(i+1UL,j ), xmm3 );
1574 store( &(~C)(i+1UL,j+IT::size), xmm4 );
// Single-row variant of the 2-vector block.
1579 for(
size_t k=0UL; k<K; ++k ) {
1581 xmm1 = xmm1 - a1 * B.get(k,j );
1582 xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1584 store( &(~C)(i,j ), xmm1 );
1585 store( &(~C)(i,j+IT::size), xmm2 );
// One SIMD vector per step, rows in pairs (enclosing column loop header not visible).
1590 for( ; (i+2UL) <= M; i+=2UL ) {
1593 for(
size_t k=0UL; k<K; ++k ) {
1595 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1596 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1598 store( &(~C)(i ,j), xmm1 );
1599 store( &(~C)(i+1UL,j), xmm2 );
// Single row, single SIMD vector.
1603 for(
size_t k=0UL; k<K; ++k ) {
1604 xmm1 = xmm1 -
set( A(i,k) ) * B.get(k,j);
1606 store( &(~C)(i,j), xmm1 );
// Vectorized default subtraction-assignment kernel for targets of type DenseMatrix<MT3,true>.
// Each visible branch constructs a temporary of the OppositeType of one operand; what is
// done with `tmp` afterwards is on lines missing from this excerpt.
1627 template<
typename MT3
1630 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1631 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// MT4 is not resizable but MT5 is: the temporary is built from A.
1636 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1637 const typename MT4::OppositeType tmp( A );
// MT4 is resizable but MT5 is not: the temporary is built from B.
1640 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1641 const typename MT5::OppositeType tmp( B );
// Otherwise the operand with fewer elements (rows*columns) is copied; A wins a tie.
1644 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1645 const typename MT4::OppositeType tmp( A );
// Remaining case (branch header not visible): the temporary is built from B.
1649 const typename MT5::OppositeType tmp( B );
// BLAS dispatch overload enabled when UseDefaultKernel<MT3,MT4,MT5> holds: it performs no
// BLAS call and forwards its arguments unchanged to selectDefaultSubAssignKernel.
1670 template<
typename MT3
1673 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1674 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1676 selectDefaultSubAssignKernel( C, A, B );
// BLAS subtraction-assignment kernel enabled by UseSinglePrecisionKernel:
// C -= A * B via cblas_sgemm.
1696 template<
typename MT3
1699 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1700 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1702 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1708 const int M ( numeric_cast<int>( A.rows() ) );
1709 const int N ( numeric_cast<int>( B.columns() ) );
1710 const int K ( numeric_cast<int>( A.columns() ) );
1711 const int lda( numeric_cast<int>( A.spacing() ) );
1712 const int ldb( numeric_cast<int>( B.spacing() ) );
1713 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from the target type MT3 only.
// alpha = -1 and beta = 1, so the product is subtracted from the existing content of C.
1715 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1716 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1717 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1718 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction-assignment kernel enabled by UseDoublePrecisionKernel:
// C -= A * B via cblas_dgemm.
1739 template<
typename MT3
1742 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1743 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1745 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1751 const int M ( numeric_cast<int>( A.rows() ) );
1752 const int N ( numeric_cast<int>( B.columns() ) );
1753 const int K ( numeric_cast<int>( A.columns() ) );
1754 const int lda( numeric_cast<int>( A.spacing() ) );
1755 const int ldb( numeric_cast<int>( B.spacing() ) );
1756 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from the target type MT3 only.
// alpha = -1 and beta = 1, so the product is subtracted from the existing content of C.
1758 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1759 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1760 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1761 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction-assignment kernel enabled by UseSinglePrecisionComplexKernel:
// C -= A * B via cblas_cgemm.
1782 template<
typename MT3
1785 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1786 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1788 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1797 const int M ( numeric_cast<int>( A.rows() ) );
1798 const int N ( numeric_cast<int>( B.columns() ) );
1799 const int K ( numeric_cast<int>( A.columns() ) );
1800 const int lda( numeric_cast<int>( A.spacing() ) );
1801 const int ldb( numeric_cast<int>( B.spacing() ) );
1802 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = -1+0i, beta = 1+0i (subtract).
1803 const complex<float> alpha( -1.0F, 0.0F );
1804 const complex<float> beta ( 1.0F, 0.0F );
// Storage order and both transposition flags are derived from the target type MT3 only.
1806 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1807 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1808 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1809 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction-assignment kernel enabled by UseDoublePrecisionComplexKernel:
// C -= A * B via cblas_zgemm.
1830 template<
typename MT3
1833 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1834 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1836 using boost::numeric_cast;
// size_t dimensions and spacings are converted to the int arguments CBLAS expects.
1845 const int M ( numeric_cast<int>( A.rows() ) );
1846 const int N ( numeric_cast<int>( B.columns() ) );
1847 const int K ( numeric_cast<int>( A.columns() ) );
1848 const int lda( numeric_cast<int>( A.spacing() ) );
1849 const int ldb( numeric_cast<int>( B.spacing() ) );
1850 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = -1+0i, beta = 1+0i (subtract).
1851 const complex<double> alpha( -1.0, 0.0 );
1852 const complex<double> beta ( 1.0, 0.0 );
// Storage order and both transposition flags are derived from the target type MT3 only.
1854 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1855 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1856 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1857 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a class definition whose base is DenseMatrix< DMatScalarMultExpr<
// DMatDMatMultExpr<MT1,MT2>, ST, false >, false >, i.e. a scalar multiple of a dense matrix
// product. It additionally derives privately from Expression and Computation.
// NOTE(review): the remaining template parameters and the class head are not visible.
1903 template<
typename MT1
1907 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
1908 ,
private Expression
1909 ,
private Computation
// MMM is the wrapped matrix product expression and RES its ResultType; RT1/RT2 and CT1/CT2
// are the ResultType and CompositeType of the product's operand types MT1 and MT2.
1913 typedef DMatDMatMultExpr<MT1,MT2> MMM;
1914 typedef typename MMM::ResultType RES;
1915 typedef typename MT1::ResultType
RT1;
1916 typedef typename MT2::ResultType
RT2;
1917 typedef typename MT1::CompositeType
CT1;
1918 typedef typename MT2::CompositeType
CT2;
// `value` is non-zero only when IsFloat holds for the ElementType of T1, T2 and T3 and the
// fourth type T4 is not complex (IsComplex<T4> is false).
1926 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1927 struct UseSinglePrecisionKernel {
1928 enum { value = IsFloat<typename T1::ElementType>::value &&
1929 IsFloat<typename T2::ElementType>::value &&
1930 IsFloat<typename T3::ElementType>::value &&
1931 !IsComplex<T4>::value };
// `value` is non-zero only when IsDouble holds for the ElementType of T1, T2 and T3 and the
// fourth type T4 is not complex (IsComplex<T4> is false).
1940 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1941 struct UseDoublePrecisionKernel {
1942 enum { value = IsDouble<typename T1::ElementType>::value &&
1943 IsDouble<typename T2::ElementType>::value &&
1944 IsDouble<typename T3::ElementType>::value &&
1945 !IsComplex<T4>::value };
// `value` is non-zero only when the ElementType of T1, of T2 and of T3 is complex<float>.
// Unlike the real-valued predicates of this class it takes no fourth type.
1954 template<
typename T1,
typename T2,
typename T3 >
1955 struct UseSinglePrecisionComplexKernel {
1956 typedef complex<float> Type;
1957 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1958 IsSame<typename T2::ElementType,Type>::value &&
1959 IsSame<typename T3::ElementType,Type>::value };
// `value` is non-zero only when the ElementType of T1, of T2 and of T3 is complex<double>.
// Unlike the real-valued predicates of this class it takes no fourth type.
1968 template<
typename T1,
typename T2,
typename T3 >
1969 struct UseDoublePrecisionComplexKernel {
1970 typedef complex<double> Type;
1971 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1972 IsSame<typename T2::ElementType,Type>::value &&
1973 IsSame<typename T3::ElementType,Type>::value };
// `value` is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none of the four
// BLAS-kernel predicates holds. T4 is only forwarded to the two real-valued predicates;
// the complex predicates are instantiated with (T1,T2,T3).
1981 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1982 struct UseDefaultKernel {
1983 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1984 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1985 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1986 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// `value` is non-zero only when all of the following hold:
//  - T1, T2 and T3 each report `vectorizable`,
//  - T2 and T3 have the same ElementType as T1,
//  - T4 is exactly that element type,
//  - IntrinsicTrait of that element type reports both `addition` and `multiplication`.
1994 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1995 struct UseVectorizedDefaultKernel {
1996 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1997 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1998 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1999 IsSame<typename T1::ElementType,T4>::value &&
2000 IntrinsicTrait<typename T1::ElementType>::addition &&
2001 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression: scalar multiple (scalar type ST) of the product expression MMM.
2007 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type as computed by MultTrait for the product's result type RES and ST.
2008 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Nested OppositeType and ElementType of that result type.
2009 typedef typename ResultType::OppositeType
OppositeType;
2011 typedef typename ResultType::ElementType
ElementType;
// Intrinsic (SIMD) type associated with ElementType by IntrinsicTrait.
2012 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the (const) matrix product expression.
2017 typedef const DMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Selection between `const RT1` and `CT1`, keyed on IsComputation<MT1>.
// NOTE(review): presumably `const RT1` is chosen when MT1 is a computation — confirm
// against SelectType.
2023 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
// Same selection for the second operand type MT2.
2026 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// The scaled product reports `vectorizable` as 0.
2031 enum { vectorizable = 0 };
// Aliasing capability is inherited from the wrapped product expression.
2034 enum { canAlias = CanAlias<MMM>::value };
// Explicit constructor taking the matrix product expression and the scalar.
// NOTE(review): initializer list and body are not visible in this excerpt.
2043 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access fragment (signature not visible): element (i,j) of the wrapped product
// multiplied by the stored scalar.
2059 return matrix_(i,j) * scalar_;
// Returns the row count of the wrapped matrix product expression.
2068 inline size_t rows()
const {
2069 return matrix_.rows();
// Returns the column count of the wrapped matrix product expression.
2078 inline size_t columns()
const {
2079 return matrix_.columns();
// Alias query for an arbitrary pointer type T: the question is delegated unchanged to the
// wrapped matrix product expression; the scalar is not consulted.
2109 template<
typename T >
2110 inline bool isAliased(
const T* alias )
const {
2111 return matrix_.isAliased( alias );
// Assignment of the scaled matrix product to a dense matrix (friend function).
// NOTE(review): branch bodies and the definitions of A and B are not visible in this excerpt.
2130 template<
typename MT3
2132 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The two operands of the wrapped product are fetched from the inner expression.
2137 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2138 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard: the target has no rows or no columns.
2140 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Guard: the left product operand has zero columns.
2143 else if( left.columns() == 0UL ) {
// Two alternative kernels receive the same arguments plus the scalar; the condition that
// picks between them is on lines missing from this excerpt.
2159 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2161 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default assignment kernel for the scaled product, using plain element access.
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
2179 template<
typename MT3
2183 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2184 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M and K are taken from A, N from B.
2186 const size_t M( A.rows() );
2187 const size_t N( B.columns() );
2188 const size_t K( A.columns() );
2190 for(
size_t i=0UL; i<M; ++i ) {
// The k = 0 term is written with plain assignment, overwriting C(i,j).
2191 for(
size_t j=0UL; j<N; ++j ) {
2192 C(i,j) = A(i,0UL) * B(0UL,j);
// All remaining k terms are accumulated onto that start value.
2194 for(
size_t k=1UL; k<K; ++k ) {
2195 for(
size_t j=0UL; j<N; ++j ) {
2196 C(i,j) += A(i,k) * B(k,j);
// A further pass over the columns of row i.
// NOTE(review): its body is not visible; presumably this is where `scalar` is applied.
2199 for(
size_t j=0UL; j<N; ++j ) {
// Vectorized default assignment kernel for a DenseMatrix<MT3,false> target
// (presumably the row-major case; the sibling overload takes DenseMatrix<MT3,true>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// The columns are traversed in steps of 8, 4, 2 and 1 intrinsic vectors. In every
// step the products are accumulated over k in xmm variables and the accumulators
// are stored to C multiplied by `factor`.
// NOTE(review): the declarations of j, i, a1, a2, b1..b4 and factor are on lines
// not visible in this chunk; `factor` presumably holds the broadcast scalar.
2220 template<
typename MT3
2224 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2225 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2227 typedef IntrinsicTrait<ElementType> IT;
// Note that N is taken from B.spacing() and not from B.columns().
2229 const size_t M( A.rows() );
2230 const size_t N( B.spacing() );
2231 const size_t K( A.columns() );
// Step width of 8 intrinsic vectors, one row of C per iteration.
2237 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2238 for(
size_t i=0UL; i<M; ++i ) {
2239 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2240 for(
size_t k=0UL; k<K; ++k ) {
2242 xmm1 = xmm1 + a1 * B.get(k,j );
2243 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2244 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2245 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2246 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2247 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2248 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2249 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Scaled accumulators overwrite the corresponding part of row i.
2251 store( &(~C)(i,j ), xmm1 * factor );
2252 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2253 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2254 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2255 store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2256 store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2257 store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2258 store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
// Step width of 4 intrinsic vectors, two rows of C per iteration.
2261 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2263 for( ; (i+2UL) <= M; i+=2UL ) {
2264 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2265 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
2272 xmm1 = xmm1 + a1 * b1;
2273 xmm2 = xmm2 + a1 * b2;
2274 xmm3 = xmm3 + a1 * b3;
2275 xmm4 = xmm4 + a1 * b4;
2276 xmm5 = xmm5 + a2 * b1;
2277 xmm6 = xmm6 + a2 * b2;
2278 xmm7 = xmm7 + a2 * b3;
2279 xmm8 = xmm8 + a2 * b4;
2281 store( &(~C)(i ,j ), xmm1 * factor );
2282 store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2283 store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2284 store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2285 store( &(~C)(i+1UL,j ), xmm5 * factor );
2286 store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2287 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2288 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
// Single-row variant of the 4-vector step (its guard is not visible here;
// presumably it handles a remaining last row).
2292 for(
size_t k=0UL; k<K; ++k ) {
2294 xmm1 = xmm1 + a1 * B.get(k,j );
2295 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2296 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2297 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2299 store( &(~C)(i,j ), xmm1 * factor );
2300 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2301 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2302 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
// Step width of 2 intrinsic vectors, two rows of C per iteration.
2305 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2307 for( ; (i+2UL) <= M; i+=2UL ) {
2309 for(
size_t k=0UL; k<K; ++k ) {
2314 xmm1 = xmm1 + a1 * b1;
2315 xmm2 = xmm2 + a1 * b2;
2316 xmm3 = xmm3 + a2 * b1;
2317 xmm4 = xmm4 + a2 * b2;
2319 store( &(~C)(i ,j ), xmm1 * factor );
2320 store( &(~C)(i ,j+IT::size), xmm2 * factor );
2321 store( &(~C)(i+1UL,j ), xmm3 * factor );
2322 store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
// Single-row variant of the 2-vector step (guard not visible here).
2326 for(
size_t k=0UL; k<K; ++k ) {
2328 xmm1 = xmm1 + a1 * B.get(k,j );
2329 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2331 store( &(~C)(i,j ), xmm1 * factor );
2332 store( &(~C)(i,j+IT::size), xmm2 * factor );
// One intrinsic vector per step, two rows per iteration
// (the enclosing j loop header is not visible here).
2337 for( ; (i+2UL) <= M; i+=2UL ) {
2339 for(
size_t k=0UL; k<K; ++k ) {
2341 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2342 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2344 store( &(~C)(i ,j), xmm1 * factor );
2345 store( &(~C)(i+1UL,j), xmm2 * factor );
// Single-row variant of the 1-vector step (guard not visible here).
2349 for(
size_t k=0UL; k<K; ++k ) {
2350 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2352 store( &(~C)(i,j), xmm1 * factor );
// Vectorized default assignment kernel for a DenseMatrix<MT3,true> target
// (presumably the column-major case). Instead of computing the product directly,
// one operand is copied into a temporary of its OppositeType and the assignment
// is re-issued with the expression built from that temporary.
// Choice of the operand to convert:
//   - A, if A is not resizable but B is;
//   - B, if A is resizable but B is not;
//   - otherwise the operand with fewer elements (A on a tie).
2372 template<
typename MT3
2376 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2377 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2382 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2383 const typename MT4::OppositeType tmp( A );
2384 assign( ~C, tmp * B * scalar );
2386 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2387 const typename MT5::OppositeType tmp( B );
2388 assign( ~C, A * tmp * scalar );
// Both or neither resizable: decide by element count.
2390 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2391 const typename MT4::OppositeType tmp( A );
2392 assign( ~C, tmp * B * scalar );
// Fallback branch (its `else` line is not visible here): convert B.
2395 const typename MT5::OppositeType tmp( B );
2396 assign( ~C, A * tmp * scalar );
// BLAS kernel selector overload that is enabled when UseDefaultKernel<MT3,MT4,MT5,ST2>
// applies: no BLAS routine is used, the call is forwarded unchanged to the
// default assignment kernel.
2415 template<
typename MT3
2419 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2420 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2422 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS assignment kernel enabled by UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>.
// Performs the scaled product via cblas_sgemm with alpha = scalar and
// beta = 0.0F, i.e. the previous content of C does not enter the result.
2441 template<
typename MT3
2445 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2446 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2448 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
2454 const int M ( numeric_cast<int>( A.rows() ) );
2455 const int N ( numeric_cast<int>( B.columns() ) );
2456 const int K ( numeric_cast<int>( A.columns() ) );
2457 const int lda( numeric_cast<int>( A.spacing() ) );
2458 const int ldb( numeric_cast<int>( B.spacing() ) );
2459 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
2461 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2462 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2463 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2464 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel enabled by UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>.
// Performs the scaled product via cblas_dgemm with alpha = scalar and
// beta = 0.0, i.e. the previous content of C does not enter the result.
2484 template<
typename MT3
2488 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2489 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2491 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
2497 const int M ( numeric_cast<int>( A.rows() ) );
2498 const int N ( numeric_cast<int>( B.columns() ) );
2499 const int K ( numeric_cast<int>( A.columns() ) );
2500 const int lda( numeric_cast<int>( A.spacing() ) );
2501 const int ldb( numeric_cast<int>( B.spacing() ) );
2502 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
2504 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2505 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2506 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2507 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Performs the scaled product via cblas_cgemm; alpha is the scalar converted to
// complex<float>, beta is (0,0), so the previous content of C does not enter the result.
2527 template<
typename MT3
2531 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2532 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2534 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
2544 const int M ( numeric_cast<int>( A.rows() ) );
2545 const int N ( numeric_cast<int>( B.columns() ) );
2546 const int K ( numeric_cast<int>( A.columns() ) );
2547 const int lda( numeric_cast<int>( A.spacing() ) );
2548 const int ldb( numeric_cast<int>( B.spacing() ) );
2549 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to the routine by address.
2550 const complex<float> alpha( scalar );
2551 const complex<float> beta ( 0.0F, 0.0F );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
2553 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2554 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2555 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2556 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Performs the scaled product via cblas_zgemm; alpha is the scalar converted to
// complex<double>, beta is (0,0), so the previous content of C does not enter the result.
2576 template<
typename MT3
2580 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2581 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2583 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
2593 const int M ( numeric_cast<int>( A.rows() ) );
2594 const int N ( numeric_cast<int>( B.columns() ) );
2595 const int K ( numeric_cast<int>( A.columns() ) );
2596 const int lda( numeric_cast<int>( A.spacing() ) );
2597 const int ldb( numeric_cast<int>( B.spacing() ) );
2598 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to the routine by address.
2599 const complex<double> alpha( scalar );
2600 const complex<double> beta ( 0.0, 0.0 );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
2602 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2603 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2604 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2605 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a templated routine whose signature is not visible in this chunk
// (NOTE(review): presumably the assignment to a non-dense target - confirm).
// The expression `rhs` is first evaluated into a temporary; the temporary's type
// is picked via SelectType on SO between OppositeType and ResultType.
2621 template<
typename MT
2625 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2637 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix with
// storage order SO. Extracts both operands of the wrapped product, checks for
// empty dimensions and then hands over to an addition assignment kernel selector.
// NOTE(review): the definitions of A and B and the branch body are on lines not
// visible in this chunk.
2654 template<
typename MT3
2656 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Left and right operand of the wrapped matrix product.
2661 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2662 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Empty target or zero inner dimension (branch body not visible here).
2664 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Kernel dispatch: default kernels or BLAS kernels
// (the condition selecting between the two calls is not visible here).
2679 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2681 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) addition assignment kernel, selected through DisableIf
// on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
// NOTE(review): only the signature is visible in this chunk; the body is not shown.
2699 template<
typename MT3
2703 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2704 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for a DenseMatrix<MT3,false> target
// (presumably the row-major case; the sibling overload takes DenseMatrix<MT3,true>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// The columns are traversed in steps of 8, 4, 2 and 1 intrinsic vectors. In every
// step the products are accumulated over k in xmm variables; afterwards the
// accumulators multiplied by `factor` are added to the values loaded from C.
// NOTE(review): the declarations of j, i, a1, a2, b1..b4 and factor are on lines
// not visible in this chunk; `factor` presumably holds the broadcast scalar.
2725 template<
typename MT3
2729 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2730 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2732 typedef IntrinsicTrait<ElementType> IT;
// Note that N is taken from B.spacing() and not from B.columns().
2734 const size_t M( A.rows() );
2735 const size_t N( B.spacing() );
2736 const size_t K( A.columns() );
// Step width of 8 intrinsic vectors, one row of C per iteration.
2742 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2743 for(
size_t i=0UL; i<M; ++i ) {
2744 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2745 for(
size_t k=0UL; k<K; ++k ) {
2747 xmm1 = xmm1 + a1 * B.get(k,j );
2748 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2749 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2750 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2751 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2752 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2753 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2754 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Read-modify-write: C += accumulator * factor.
2756 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2757 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
2758 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
2759 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
2760 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
2761 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
2762 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
2763 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
// Step width of 4 intrinsic vectors, two rows of C per iteration.
2766 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2768 for( ; (i+2UL) <= M; i+=2UL ) {
2769 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2770 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
2777 xmm1 = xmm1 + a1 * b1;
2778 xmm2 = xmm2 + a1 * b2;
2779 xmm3 = xmm3 + a1 * b3;
2780 xmm4 = xmm4 + a1 * b4;
2781 xmm5 = xmm5 + a2 * b1;
2782 xmm6 = xmm6 + a2 * b2;
2783 xmm7 = xmm7 + a2 * b3;
2784 xmm8 = xmm8 + a2 * b4;
2786 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2787 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
2788 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
2789 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
2790 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
2791 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
2792 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
2793 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
// Single-row variant of the 4-vector step (its guard is not visible here;
// presumably it handles a remaining last row).
2797 for(
size_t k=0UL; k<K; ++k ) {
2799 xmm1 = xmm1 + a1 * B.get(k,j );
2800 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2801 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2802 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2804 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2805 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
2806 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
2807 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
// Step width of 2 intrinsic vectors, two rows of C per iteration.
2810 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2812 for( ; (i+2UL) <= M; i+=2UL ) {
2814 for(
size_t k=0UL; k<K; ++k ) {
2819 xmm1 = xmm1 + a1 * b1;
2820 xmm2 = xmm2 + a1 * b2;
2821 xmm3 = xmm3 + a2 * b1;
2822 xmm4 = xmm4 + a2 * b2;
2824 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
2825 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
2826 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
2827 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
// Single-row variant of the 2-vector step (guard not visible here).
2831 for(
size_t k=0UL; k<K; ++k ) {
2833 xmm1 = xmm1 + a1 * B.get(k,j );
2834 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2836 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
2837 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
// One intrinsic vector per step, two rows per iteration
// (the enclosing j loop header is not visible here).
2842 for( ; (i+2UL) <= M; i+=2UL ) {
2844 for(
size_t k=0UL; k<K; ++k ) {
2846 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2847 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2849 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
2850 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) + xmm2 * factor );
// Single-row variant of the 1-vector step (guard not visible here).
2854 for(
size_t k=0UL; k<K; ++k ) {
2855 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2857 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Vectorized default addition assignment kernel for a DenseMatrix<MT3,true> target
// (presumably the column-major case). One operand is copied into a temporary of
// its OppositeType.
// Choice of the operand to convert:
//   - A, if A is not resizable but B is;
//   - B, if A is resizable but B is not;
//   - otherwise the operand with fewer elements (A on a tie).
// NOTE(review): the statements that consume `tmp` in each branch are not visible
// in this chunk.
2877 template<
typename MT3
2881 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2882 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2887 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2888 const typename MT4::OppositeType tmp( A );
2891 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2892 const typename MT5::OppositeType tmp( B );
// Both or neither resizable: decide by element count.
2895 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2896 const typename MT4::OppositeType tmp( A );
// Fallback branch (its `else` line is not visible here): convert B.
2900 const typename MT5::OppositeType tmp( B );
// BLAS kernel selector overload that is enabled when UseDefaultKernel<MT3,MT4,MT5,ST2>
// applies: no BLAS routine is used, the call is forwarded unchanged to the
// default addition assignment kernel.
2920 template<
typename MT3
2924 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2925 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2927 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS addition assignment kernel enabled by UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>.
// Calls cblas_sgemm with alpha = scalar and beta = 1.0F, so the scaled product is
// added to the existing content of C.
2946 template<
typename MT3
2950 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2951 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2953 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
2959 const int M ( numeric_cast<int>( A.rows() ) );
2960 const int N ( numeric_cast<int>( B.columns() ) );
2961 const int K ( numeric_cast<int>( A.columns() ) );
2962 const int lda( numeric_cast<int>( A.spacing() ) );
2963 const int ldb( numeric_cast<int>( B.spacing() ) );
2964 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
2966 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2967 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2968 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2969 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition assignment kernel enabled by UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>.
// Calls cblas_dgemm with alpha = scalar and beta = 1.0, so the scaled product is
// added to the existing content of C.
2989 template<
typename MT3
2993 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2994 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2996 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3002 const int M ( numeric_cast<int>( A.rows() ) );
3003 const int N ( numeric_cast<int>( B.columns() ) );
3004 const int K ( numeric_cast<int>( A.columns() ) );
3005 const int lda( numeric_cast<int>( A.spacing() ) );
3006 const int ldb( numeric_cast<int>( B.spacing() ) );
3007 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3009 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3010 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3011 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3012 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition assignment kernel enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_cgemm; alpha is the scalar converted to complex<float>, beta is (1,0),
// so the scaled product is added to the existing content of C.
3032 template<
typename MT3
3036 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3037 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3039 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3049 const int M ( numeric_cast<int>( A.rows() ) );
3050 const int N ( numeric_cast<int>( B.columns() ) );
3051 const int K ( numeric_cast<int>( A.columns() ) );
3052 const int lda( numeric_cast<int>( A.spacing() ) );
3053 const int ldb( numeric_cast<int>( B.spacing() ) );
3054 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to the routine by address.
3055 const complex<float> alpha( scalar );
3056 const complex<float> beta ( 1.0F, 0.0F );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3058 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3059 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3060 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3061 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition assignment kernel enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_zgemm; alpha is the scalar converted to complex<double>, beta is (1,0),
// so the scaled product is added to the existing content of C.
3081 template<
typename MT3
3085 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3086 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3088 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3098 const int M ( numeric_cast<int>( A.rows() ) );
3099 const int N ( numeric_cast<int>( B.columns() ) );
3100 const int K ( numeric_cast<int>( A.columns() ) );
3101 const int lda( numeric_cast<int>( A.spacing() ) );
3102 const int ldb( numeric_cast<int>( B.spacing() ) );
3103 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to the routine by address.
3104 const complex<double> alpha( scalar );
3105 const complex<double> beta ( 1.0, 0.0 );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3107 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3108 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3109 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3110 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product to a dense matrix with
// storage order SO. Extracts both operands of the wrapped product, checks for
// empty dimensions and then hands over to a subtraction assignment kernel selector.
// NOTE(review): the definitions of A and B and the branch body are on lines not
// visible in this chunk.
3131 template<
typename MT3
3133 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Left and right operand of the wrapped matrix product.
3138 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3139 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Empty target or zero inner dimension (branch body not visible here).
3141 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Kernel dispatch: default kernels or BLAS kernels
// (the condition selecting between the two calls is not visible here).
3156 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3158 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default (non-vectorized) subtraction assignment kernel, selected through DisableIf
// on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
// NOTE(review): only the signature is visible in this chunk; the body is not shown.
3176 template<
typename MT3
3180 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3181 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction assignment kernel for a DenseMatrix<MT3,false> target
// (presumably the row-major case; the sibling overload takes DenseMatrix<MT3,true>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// The columns are traversed in steps of 8, 4, 2 and 1 intrinsic vectors. In every
// step the products are accumulated over k in xmm variables; afterwards the
// accumulators multiplied by `factor` are subtracted from the values loaded from C.
// NOTE(review): the declarations of j, i, a1, a2, b1..b4 and factor are on lines
// not visible in this chunk; `factor` presumably holds the broadcast scalar.
3202 template<
typename MT3
3206 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3207 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3209 typedef IntrinsicTrait<ElementType> IT;
// Note that N is taken from B.spacing() and not from B.columns().
3211 const size_t M( A.rows() );
3212 const size_t N( B.spacing() );
3213 const size_t K( A.columns() );
// Step width of 8 intrinsic vectors, one row of C per iteration.
3219 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3220 for(
size_t i=0UL; i<M; ++i ) {
3221 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3222 for(
size_t k=0UL; k<K; ++k ) {
3224 xmm1 = xmm1 + a1 * B.get(k,j );
3225 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3226 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3227 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3228 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3229 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3230 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3231 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Read-modify-write: C -= accumulator * factor.
3233 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3234 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3235 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3236 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3237 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3238 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3239 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3240 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
// Step width of 4 intrinsic vectors, two rows of C per iteration.
3243 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3245 for( ; (i+2UL) <= M; i+=2UL ) {
3246 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3247 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
3254 xmm1 = xmm1 + a1 * b1;
3255 xmm2 = xmm2 + a1 * b2;
3256 xmm3 = xmm3 + a1 * b3;
3257 xmm4 = xmm4 + a1 * b4;
3258 xmm5 = xmm5 + a2 * b1;
3259 xmm6 = xmm6 + a2 * b2;
3260 xmm7 = xmm7 + a2 * b3;
3261 xmm8 = xmm8 + a2 * b4;
3263 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3264 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3265 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3266 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3267 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3268 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3269 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3270 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
// Single-row variant of the 4-vector step (its guard is not visible here;
// presumably it handles a remaining last row).
3274 for(
size_t k=0UL; k<K; ++k ) {
3276 xmm1 = xmm1 + a1 * B.get(k,j );
3277 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3278 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3279 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3281 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3282 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3283 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3284 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
// Step width of 2 intrinsic vectors, two rows of C per iteration.
3287 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3289 for( ; (i+2UL) <= M; i+=2UL ) {
3291 for(
size_t k=0UL; k<K; ++k ) {
3296 xmm1 = xmm1 + a1 * b1;
3297 xmm2 = xmm2 + a1 * b2;
3298 xmm3 = xmm3 + a2 * b1;
3299 xmm4 = xmm4 + a2 * b2;
3301 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3302 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3303 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3304 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
// Single-row variant of the 2-vector step (guard not visible here).
3308 for(
size_t k=0UL; k<K; ++k ) {
3310 xmm1 = xmm1 + a1 * B.get(k,j );
3311 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3313 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3314 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
// One intrinsic vector per step, two rows per iteration
// (the enclosing j loop header is not visible here).
3319 for( ; (i+2UL) <= M; i+=2UL ) {
3321 for(
size_t k=0UL; k<K; ++k ) {
3323 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3324 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3326 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3327 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) - xmm2 * factor );
// Single-row variant of the 1-vector step (guard not visible here).
3331 for(
size_t k=0UL; k<K; ++k ) {
3332 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3334 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Vectorized default subtraction assignment kernel for a DenseMatrix<MT3,true> target
// (presumably the column-major case). One operand is copied into a temporary of
// its OppositeType.
// Choice of the operand to convert:
//   - A, if A is not resizable but B is;
//   - B, if A is resizable but B is not;
//   - otherwise the operand with fewer elements (A on a tie).
// NOTE(review): the statements that consume `tmp` in each branch are not visible
// in this chunk.
3354 template<
typename MT3
3358 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3359 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3364 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3365 const typename MT4::OppositeType tmp( A );
3368 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3369 const typename MT5::OppositeType tmp( B );
// Both or neither resizable: decide by element count.
3372 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3373 const typename MT4::OppositeType tmp( A );
// Fallback branch (its `else` line is not visible here): convert B.
3377 const typename MT5::OppositeType tmp( B );
// BLAS kernel selector overload that is enabled when UseDefaultKernel<MT3,MT4,MT5,ST2>
// applies: no BLAS routine is used, the call is forwarded unchanged to the
// default subtraction assignment kernel.
3397 template<
typename MT3
3401 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3402 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3404 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS subtraction assignment kernel enabled by UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>.
// Calls cblas_sgemm with alpha = -scalar and beta = 1.0F, so the scaled product is
// subtracted from the existing content of C.
3423 template<
typename MT3
3427 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3428 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3430 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3436 const int M ( numeric_cast<int>( A.rows() ) );
3437 const int N ( numeric_cast<int>( B.columns() ) );
3438 const int K ( numeric_cast<int>( A.columns() ) );
3439 const int lda( numeric_cast<int>( A.spacing() ) );
3440 const int ldb( numeric_cast<int>( B.spacing() ) );
3441 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3443 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3444 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3445 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3446 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction assignment kernel enabled by UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>.
// Calls cblas_dgemm with alpha = -scalar and beta = 1.0, so the scaled product is
// subtracted from the existing content of C.
3466 template<
typename MT3
3470 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3471 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3473 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3479 const int M ( numeric_cast<int>( A.rows() ) );
3480 const int N ( numeric_cast<int>( B.columns() ) );
3481 const int K ( numeric_cast<int>( A.columns() ) );
3482 const int lda( numeric_cast<int>( A.spacing() ) );
3483 const int ldb( numeric_cast<int>( B.spacing() ) );
3484 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3486 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3487 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3488 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3489 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction assignment kernel enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_cgemm; alpha is the negated scalar converted to complex<float>, beta is
// (1,0), so the scaled product is subtracted from the existing content of C.
3509 template<
typename MT3
3513 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3514 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3516 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3526 const int M ( numeric_cast<int>( A.rows() ) );
3527 const int N ( numeric_cast<int>( B.columns() ) );
3528 const int K ( numeric_cast<int>( A.columns() ) );
3529 const int lda( numeric_cast<int>( A.spacing() ) );
3530 const int ldb( numeric_cast<int>( B.spacing() ) );
3531 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to the routine by address.
3532 const complex<float> alpha( -scalar );
3533 const complex<float> beta ( 1.0F, 0.0F );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3535 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3536 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3537 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3538 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction assignment kernel enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_zgemm; alpha is the negated scalar converted to complex<double>, beta is
// (1,0), so the scaled product is subtracted from the existing content of C.
3558 template<
typename MT3
3562 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3563 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3565 using boost::numeric_cast;
// Dimensions and leading dimensions converted to the int arguments of the CBLAS call.
3575 const int M ( numeric_cast<int>( A.rows() ) );
3576 const int N ( numeric_cast<int>( B.columns() ) );
3577 const int K ( numeric_cast<int>( A.columns() ) );
3578 const int lda( numeric_cast<int>( A.spacing() ) );
3579 const int ldb( numeric_cast<int>( B.spacing() ) );
3580 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed to the routine by address.
3581 const complex<double> alpha( -scalar );
3582 const complex<double> beta ( 1.0, 0.0 );
// Storage order and both transposition flags are derived from IsRowMajorMatrix<MT3>.
3584 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3585 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3586 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3587 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a function template returning a DMatDMatMultExpr<T1,T2>
// (NOTE(review): presumably the multiplication operator for two dense matrices;
// its name and parameter list are not visible in this chunk).
// Reports an error by throwing std::invalid_argument; the condition guarding the
// throw is not visible here.
3652 template<
typename T1
3654 inline const DMatDMatMultExpr<T1,T2>
3658 throw std::invalid_argument(
"Matrix sizes do not match" );
// Trait fragment for the combination (matrix MT1, matrix MT2, vector VT); the
// struct header is not visible in this chunk.
// Type is built with SelectType: the condition requires MT1 and MT2 to be dense
// row-major matrices and VT to be a dense, non-transpose vector. The candidate
// types are the nested trait (MT2 with VT first, then MT1 with that result) and
// INVALID_TYPE.
3675 template<
typename MT1,
typename MT2,
typename VT >
3680 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3681 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3682 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3683 ,
typename DMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
3684 , INVALID_TYPE >::Type Type;
// Trait fragment for the combination (matrix MT1, matrix MT2, vector VT); the
// struct header is not visible in this chunk.
// Type is built with SelectType: the condition requires MT1 and MT2 to be dense
// row-major matrices and VT to be a sparse, non-transpose vector. The candidate
// types are the nested trait (MT2 with VT via DMatSVecMultExprTrait first, then
// MT1 with that result via DMatDVecMultExprTrait) and INVALID_TYPE.
3693 template<
typename MT1,
typename MT2,
typename VT >
3698 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3699 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
3700 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3701 ,
typename DMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
3702 , INVALID_TYPE >::Type Type;
// Trait fragment for the combination (vector VT, matrix MT1, matrix MT2); the
// struct header is not visible in this chunk.
// Type is built with SelectType: the condition requires VT to be a dense transpose
// vector and MT1 and MT2 to be dense row-major matrices. The candidate types are
// the nested trait (VT with MT1 first, then that result with MT2) and INVALID_TYPE.
3711 template<
typename VT,
typename MT1,
typename MT2 >
3716 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
3717 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3718 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3719 ,
typename TDVecDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3720 , INVALID_TYPE >::Type Type;
// Trait fragment for the combination (vector VT, matrix MT1, matrix MT2); the
// struct header is not visible in this chunk.
// Type is built with SelectType: the condition requires VT to be a sparse transpose
// vector and MT1 and MT2 to be dense row-major matrices. The candidate types are
// the nested trait (VT with MT1 via TSVecDMatMultExprTrait first, then that result
// with MT2 via TDVecDMatMultExprTrait) and INVALID_TYPE.
3729 template<
typename VT,
typename MT1,
typename MT2 >
3734 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
3735 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3736 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
3737 ,
typename TDVecDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
3738 , INVALID_TYPE >::Type Type;