35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
113 template<
typename MT1
115 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
116 ,
private MatMatMultExpr
117 ,
private Computation
// Compile-time helper: 'value' is non-zero when at least one of the two
// operands has to be evaluated before use (evaluateLeft / evaluateRight are
// defined outside this excerpt). The template parameter MT is not used in the
// visible expression.
// NOTE(review): presumably used to enable the SMP-specific assignment
// overloads -- confirm against the call sites.
145 template<
typename MT >
146 struct UseSMPAssign {
147 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time helper: 'value' is non-zero only if the element types of all
// three matrix types T1, T2 and T3 satisfy IsFloat. Used below as the
// EnableIf condition of the cblas_sgemm based kernels.
157 template<
typename T1,
typename T2,
typename T3 >
158 struct UseSinglePrecisionKernel {
159 enum { value = IsFloat<typename T1::ElementType>::value &&
160 IsFloat<typename T2::ElementType>::value &&
161 IsFloat<typename T3::ElementType>::value };
// Compile-time helper: 'value' is non-zero only if the element types of all
// three matrix types T1, T2 and T3 satisfy IsDouble. Used below as the
// EnableIf condition of the cblas_dgemm based kernels.
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDoublePrecisionKernel {
173 enum { value = IsDouble<typename T1::ElementType>::value &&
174 IsDouble<typename T2::ElementType>::value &&
175 IsDouble<typename T3::ElementType>::value };
// Compile-time helper: 'value' is non-zero only if the element type of each of
// T1, T2 and T3 is exactly complex<float>. Used below as the EnableIf
// condition of the cblas_cgemm based kernels.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseSinglePrecisionComplexKernel {
// Element type that all three matrices must share.
188 typedef complex<float> Type;
189 enum { value = IsSame<typename T1::ElementType,Type>::value &&
190 IsSame<typename T2::ElementType,Type>::value &&
191 IsSame<typename T3::ElementType,Type>::value };
// Compile-time helper: 'value' is non-zero only if the element type of each of
// T1, T2 and T3 is exactly complex<double>. Used below as the EnableIf
// condition of the cblas_zgemm based kernels.
202 template<
typename T1,
typename T2,
typename T3 >
203 struct UseDoublePrecisionComplexKernel {
// Element type that all three matrices must share.
204 typedef complex<double> Type;
205 enum { value = IsSame<typename T1::ElementType,Type>::value &&
206 IsSame<typename T2::ElementType,Type>::value &&
207 IsSame<typename T3::ElementType,Type>::value };
// Compile-time helper: 'value' is non-zero when the hand-written default
// kernel has to be used instead of a BLAS call. That is the case when
// BLAZE_BLAS_MODE evaluates to false, or when none of the four BLAS selector
// traits (single, double, complex<float>, complex<double>) matches the
// combination T1/T2/T3.
217 template<
typename T1,
typename T2,
typename T3 >
218 struct UseDefaultKernel {
219 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
220 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
221 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
222 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time helper: 'value' is non-zero when the intrinsics-based default
// kernels may be used. All of the following must hold:
//  - T1, T2 and T3 each report 'vectorizable',
//  - all three share the same ElementType,
//  - IntrinsicTrait of that element type reports support for addition,
//    subtraction and multiplication.
232 template<
typename T1,
typename T2,
typename T3 >
233 struct UseVectorizedDefaultKernel {
234 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
235 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
236 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
237 IntrinsicTrait<typename T1::ElementType>::addition &&
238 IntrinsicTrait<typename T1::ElementType>::subtraction &&
239 IntrinsicTrait<typename T1::ElementType>::multiplication };
270 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
276 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
277 !evaluateRight && MT2::smpAssignable };
307 if(
lhs_.columns() != 0UL ) {
308 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
310 for(
size_t k=1UL; k<end; k+=2UL ) {
312 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
314 if( end <
lhs_.columns() ) {
342 return rhs_.columns();
372 template<
typename T >
374 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
384 template<
typename T >
386 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
396 return lhs_.isAligned() &&
rhs_.isAligned();
431 template<
typename MT
440 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
443 else if( rhs.lhs_.columns() == 0UL ) {
458 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the plain assignment C = A * B.
//   C : target matrix (written)
//   A : left-hand side operand
//   B : right-hand side operand
// Forwards either to the default kernel or to the BLAS kernel family.
// NOTE(review): the condition that chooses between the two calls is not part
// of this excerpt -- confirm the threshold against the full file.
474 template<
typename MT3
477 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
480 TDMatDMatMultExpr::selectDefaultAssignKernel( C, A, B );
482 TDMatDMatMultExpr::selectBlasAssignKernel( C, A, B );
// Scalar default kernel for C = A * B. Participates in overload resolution
// only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false (DisableIf).
//   C : target matrix, every element (i,j) with i<M, j<N is overwritten
//   A : left-hand side operand (M x K)
//   B : right-hand side operand (K x N)
// NOTE(review): the first pass reads A(i,0) and B(0,j) unconditionally, so the
// kernel assumes K >= 1; the K == 0 case appears to be handled by the caller.
501 template<
typename MT3
504 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
505 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
507 const size_t M( A.rows() );
508 const size_t N( B.columns() );
509 const size_t K( A.columns() );
511 for(
size_t i=0UL; i<M; ++i ) {
// First pass (k == 0) initialises row i of C by plain assignment, so no
// separate zeroing of C is needed.
512 for(
size_t j=0UL; j<N; ++j ) {
513 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining k values are accumulated onto the initialised row.
515 for(
size_t k=1UL; k<K; ++k ) {
516 for(
size_t j=0UL; j<N; ++j ) {
517 C(i,j) += A(i,k) * B(k,j);
// Vectorized default kernel for C = A * B, overload for targets of type
// DenseMatrix<MT3,false>. Enabled only when UseVectorizedDefaultKernel holds.
// The target is processed in column blocks of 8, 4, 2 and finally 1 vector
// width (IT::size elements each); inside a block the k loop accumulates into
// vector registers which are then written back with store().
// NOTE(review): the declarations of i/j, of the broadcast values a1/a2 and of
// the loaded values b1..b4, as well as several register declarations, are not
// part of this excerpt. The block conditions compare the start of the last
// vector against N, so a store may reach past column N-1; presumably the
// matrices are padded to a multiple of IT::size -- confirm.
539 template<
typename MT3
542 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
543 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
545 typedef IntrinsicTrait<ElementType> IT;
547 const size_t M( A.rows() );
548 const size_t N( B.columns() );
549 const size_t K( A.columns() );
// Blocks of eight vectors per row, one row at a time.
553 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
554 for(
size_t i=0UL; i<M; ++i ) {
555 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
556 for(
size_t k=0UL; k<K; ++k ) {
558 xmm1 = xmm1 + a1 * B.load(k,j );
559 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
560 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
561 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
562 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
563 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
564 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
565 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
567 (~C).
store( i, j , xmm1 );
568 (~C).
store( i, j+IT::size , xmm2 );
569 (~C).
store( i, j+IT::size*2UL, xmm3 );
570 (~C).
store( i, j+IT::size*3UL, xmm4 );
571 (~C).
store( i, j+IT::size*4UL, xmm5 );
572 (~C).
store( i, j+IT::size*5UL, xmm6 );
573 (~C).
store( i, j+IT::size*6UL, xmm7 );
574 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Blocks of four vectors; rows are handled in pairs (i, i+1) first.
577 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
579 for( ; (i+2UL) <= M; i+=2UL ) {
580 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
581 for(
size_t k=0UL; k<K; ++k ) {
588 xmm1 = xmm1 + a1 * b1;
589 xmm2 = xmm2 + a1 * b2;
590 xmm3 = xmm3 + a1 * b3;
591 xmm4 = xmm4 + a1 * b4;
592 xmm5 = xmm5 + a2 * b1;
593 xmm6 = xmm6 + a2 * b2;
594 xmm7 = xmm7 + a2 * b3;
595 xmm8 = xmm8 + a2 * b4;
597 (~C).
store( i , j , xmm1 );
598 (~C).
store( i , j+IT::size , xmm2 );
599 (~C).
store( i , j+IT::size*2UL, xmm3 );
600 (~C).
store( i , j+IT::size*3UL, xmm4 );
601 (~C).
store( i+1UL, j , xmm5 );
602 (~C).
store( i+1UL, j+IT::size , xmm6 );
603 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
604 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single leftover row of the four-vector block (guard not visible here).
608 for(
size_t k=0UL; k<K; ++k ) {
610 xmm1 = xmm1 + a1 * B.load(k,j );
611 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
612 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
613 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
615 (~C).
store( i, j , xmm1 );
616 (~C).
store( i, j+IT::size , xmm2 );
617 (~C).
store( i, j+IT::size*2UL, xmm3 );
618 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Blocks of two vectors; rows in pairs, then a single leftover row.
621 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
623 for( ; (i+2UL) <= M; i+=2UL ) {
625 for(
size_t k=0UL; k<K; ++k ) {
630 xmm1 = xmm1 + a1 * b1;
631 xmm2 = xmm2 + a1 * b2;
632 xmm3 = xmm3 + a2 * b1;
633 xmm4 = xmm4 + a2 * b2;
635 (~C).
store( i , j , xmm1 );
636 (~C).
store( i , j+IT::size, xmm2 );
637 (~C).
store( i+1UL, j , xmm3 );
638 (~C).
store( i+1UL, j+IT::size, xmm4 );
642 for(
size_t k=0UL; k<K; ++k ) {
644 xmm1 = xmm1 + a1 * B.load(k,j );
645 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
647 (~C).
store( i, j , xmm1 );
648 (~C).
store( i, j+IT::size, xmm2 );
// Final single-vector column block; rows in pairs, then a single leftover row.
653 for( ; (i+2UL) <= M; i+=2UL ) {
655 for(
size_t k=0UL; k<K; ++k ) {
657 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
658 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
660 (~C).
store( i , j, xmm1 );
661 (~C).
store( i+1UL, j, xmm2 );
665 for(
size_t k=0UL; k<K; ++k ) {
666 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
668 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for C = A * B, overload for targets of type
// DenseMatrix<MT3,true>. Enabled only when UseVectorizedDefaultKernel holds.
// Mirror image of the DenseMatrix<MT3,false> overload: here the row index i
// is blocked by 8, 4, 2 and 1 vector widths (IT::size), vectors are loaded
// from A and the elements of B are the broadcast factors.
// NOTE(review): the declarations of i/j, a1..a4 and b1/b2 and several register
// declarations are not part of this excerpt. The block conditions compare the
// start of the last vector against M, so a store may reach past row M-1;
// presumably the matrices are padded to a multiple of IT::size -- confirm.
689 template<
typename MT3
692 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
693 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
695 typedef IntrinsicTrait<ElementType> IT;
697 const size_t M( A.rows() );
698 const size_t N( B.columns() );
699 const size_t K( A.columns() );
// Blocks of eight vectors per column, one column at a time.
703 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
704 for(
size_t j=0UL; j<N; ++j ) {
705 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
706 for(
size_t k=0UL; k<K; ++k ) {
708 xmm1 = xmm1 + A.load(i ,k) * b1;
709 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
710 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
711 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
712 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
713 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
714 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
715 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
717 (~C).
store( i , j, xmm1 );
718 (~C).
store( i+IT::size , j, xmm2 );
719 (~C).
store( i+IT::size*2UL, j, xmm3 );
720 (~C).
store( i+IT::size*3UL, j, xmm4 );
721 (~C).
store( i+IT::size*4UL, j, xmm5 );
722 (~C).
store( i+IT::size*5UL, j, xmm6 );
723 (~C).
store( i+IT::size*6UL, j, xmm7 );
724 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Blocks of four vectors; columns are handled in pairs (j, j+1) first.
727 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
729 for( ; (j+2UL) <= N; j+=2UL ) {
730 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
731 for(
size_t k=0UL; k<K; ++k ) {
738 xmm1 = xmm1 + a1 * b1;
739 xmm2 = xmm2 + a2 * b1;
740 xmm3 = xmm3 + a3 * b1;
741 xmm4 = xmm4 + a4 * b1;
742 xmm5 = xmm5 + a1 * b2;
743 xmm6 = xmm6 + a2 * b2;
744 xmm7 = xmm7 + a3 * b2;
745 xmm8 = xmm8 + a4 * b2;
747 (~C).
store( i , j , xmm1 );
748 (~C).
store( i+IT::size , j , xmm2 );
749 (~C).
store( i+IT::size*2UL, j , xmm3 );
750 (~C).
store( i+IT::size*3UL, j , xmm4 );
751 (~C).
store( i , j+1UL, xmm5 );
752 (~C).
store( i+IT::size , j+1UL, xmm6 );
753 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
754 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
// Single leftover column of the four-vector block (guard not visible here).
758 for(
size_t k=0UL; k<K; ++k ) {
760 xmm1 = xmm1 + A.load(i ,k) * b1;
761 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
762 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
763 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
765 (~C).
store( i , j, xmm1 );
766 (~C).
store( i+IT::size , j, xmm2 );
767 (~C).
store( i+IT::size*2UL, j, xmm3 );
768 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Blocks of two vectors; columns in pairs, then a single leftover column.
771 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
773 for( ; (j+2UL) <= N; j+=2UL ) {
775 for(
size_t k=0UL; k<K; ++k ) {
780 xmm1 = xmm1 + a1 * b1;
781 xmm2 = xmm2 + a2 * b1;
782 xmm3 = xmm3 + a1 * b2;
783 xmm4 = xmm4 + a2 * b2;
785 (~C).
store( i , j , xmm1 );
786 (~C).
store( i+IT::size, j , xmm2 );
787 (~C).
store( i , j+1UL, xmm3 );
788 (~C).
store( i+IT::size, j+1UL, xmm4 );
792 for(
size_t k=0UL; k<K; ++k ) {
794 xmm1 = xmm1 + A.load(i ,k) * b1;
795 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
797 (~C).
store( i , j, xmm1 );
798 (~C).
store( i+IT::size, j, xmm2 );
// Final single-vector row block; columns in pairs, then a single leftover column.
803 for( ; (j+2UL) <= N; j+=2UL ) {
805 for(
size_t k=0UL; k<K; ++k ) {
807 xmm1 = xmm1 + a1 *
set( B(k,j ) );
808 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
810 (~C).
store( i, j , xmm1 );
811 (~C).
store( i, j+1UL, xmm2 );
815 for(
size_t k=0UL; k<K; ++k ) {
816 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
818 (~C).
store( i, j, xmm1 );
// Fallback of the BLAS kernel family for C = A * B: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds (BLAS mode off or no matching BLAS
// routine) and simply forwards to the default kernel.
839 template<
typename MT3
842 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
843 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
845 selectDefaultAssignKernel( C, A, B );
// Single precision BLAS kernel for C = A * B (cblas_sgemm with alpha = 1 and
// beta = 0, i.e. C is overwritten). Enabled when UseSinglePrecisionKernel
// holds for MT3/MT4/MT5.
//   C : target matrix; its data() buffer is written
//   A : left-hand side operand
//   B : right-hand side operand
865 template<
typename MT3
868 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
869 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
871 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
877 const int M ( numeric_cast<int>( A.rows() ) );
878 const int N ( numeric_cast<int>( B.columns() ) );
879 const int K ( numeric_cast<int>( A.columns() ) );
880 const int lda( numeric_cast<int>( A.spacing() ) );
881 const int ldb( numeric_cast<int>( B.spacing() ) );
882 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
884 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
885 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
886 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
887 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// Double precision BLAS kernel for C = A * B (cblas_dgemm with alpha = 1 and
// beta = 0, i.e. C is overwritten). Enabled when UseDoublePrecisionKernel
// holds for MT3/MT4/MT5.
//   C : target matrix; its data() buffer is written
//   A : left-hand side operand
//   B : right-hand side operand
908 template<
typename MT3
911 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
912 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
914 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
920 const int M ( numeric_cast<int>( A.rows() ) );
921 const int N ( numeric_cast<int>( B.columns() ) );
922 const int K ( numeric_cast<int>( A.columns() ) );
923 const int lda( numeric_cast<int>( A.spacing() ) );
924 const int ldb( numeric_cast<int>( B.spacing() ) );
925 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
927 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
928 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
929 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
930 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// Single precision complex BLAS kernel for C = A * B (cblas_cgemm with
// alpha = (1,0) and beta = (0,0), i.e. C is overwritten). Enabled when
// UseSinglePrecisionComplexKernel holds for MT3/MT4/MT5.
//   C : target matrix; its data() buffer is written
//   A : left-hand side operand
//   B : right-hand side operand
951 template<
typename MT3
954 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
955 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
957 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
966 const int M ( numeric_cast<int>( A.rows() ) );
967 const int N ( numeric_cast<int>( B.columns() ) );
968 const int K ( numeric_cast<int>( A.columns() ) );
969 const int lda( numeric_cast<int>( A.spacing() ) );
970 const int ldb( numeric_cast<int>( B.spacing() ) );
971 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex routines take the scaling factors by address.
972 const complex<float> alpha( 1.0F, 0.0F );
973 const complex<float> beta ( 0.0F, 0.0F );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
975 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
976 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
977 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
978 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS kernel for C = A * B (cblas_zgemm with
// alpha = (1,0) and beta = (0,0), i.e. C is overwritten). Enabled when
// UseDoublePrecisionComplexKernel holds for MT3/MT4/MT5.
//   C : target matrix; its data() buffer is written
//   A : left-hand side operand
//   B : right-hand side operand
999 template<
typename MT3
1002 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1003 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1005 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
1014 const int M ( numeric_cast<int>( A.rows() ) );
1015 const int N ( numeric_cast<int>( B.columns() ) );
1016 const int K ( numeric_cast<int>( A.columns() ) );
1017 const int lda( numeric_cast<int>( A.spacing() ) );
1018 const int ldb( numeric_cast<int>( B.spacing() ) );
1019 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex routines take the scaling factors by address.
1020 const complex<double> alpha( 1.0, 0.0 );
1021 const complex<double> beta ( 0.0, 0.0 );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
1023 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1024 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1025 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1026 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1044 template<
typename MT
1050 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1062 const TmpType tmp(
serial( rhs ) );
1081 template<
typename MT
1090 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1104 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the addition assignment C += A * B.
//   C : target matrix (updated in place)
//   A : left-hand side operand
//   B : right-hand side operand
// Forwards either to the default kernel or to the BLAS kernel family.
// NOTE(review): the condition that chooses between the two calls is not part
// of this excerpt -- confirm the threshold against the full file.
1120 template<
typename MT3
1123 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1126 TDMatDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1128 TDMatDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
// Scalar default kernel for C += A * B. Participates in overload resolution
// only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false (DisableIf).
//   C : target matrix, updated in place
//   A : left-hand side operand (M x K)
//   B : right-hand side operand (K x N)
1147 template<
typename MT3
1150 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1151 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1153 const size_t M( A.rows() );
1154 const size_t N( B.columns() );
1155 const size_t K( A.columns() );
// Largest even number <= N (clears the lowest bit); bounds the 2x unrolled
// column loop.
1158 const size_t end( N &
size_t(-2) );
1160 for(
size_t i=0UL; i<M; ++i ) {
1161 for(
size_t k=0UL; k<K; ++k ) {
// Two columns per iteration.
1162 for(
size_t j=0UL; j<end; j+=2UL ) {
1163 C(i,j ) += A(i,k) * B(k,j );
1164 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Leftover column when N is odd.
// NOTE(review): the guarding condition is not part of this excerpt;
// presumably 'end < N' -- confirm.
1167 C(i,end) += A(i,k) * B(k,end);
// Vectorized default kernel for C += A * B, overload for targets of type
// DenseMatrix<MT3,false>. Enabled only when UseVectorizedDefaultKernel holds.
// The target is processed in column blocks of 8, 4, 2 and finally 1 vector
// width (IT::size elements each); inside a block the k loop accumulates into
// vector registers which are then written back with store().
// NOTE(review): the declaration and initialisation of the xmm registers
// (presumably loaded from C), of i/j, a1/a2 and b1..b4 are not part of this
// excerpt. A store may reach past column N-1; presumably the matrices are
// padded to a multiple of IT::size -- confirm.
1189 template<
typename MT3
1192 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1193 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1195 typedef IntrinsicTrait<ElementType> IT;
1197 const size_t M( A.rows() );
1198 const size_t N( B.columns() );
1199 const size_t K( A.columns() );
// Blocks of eight vectors per row, one row at a time.
1203 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1204 for(
size_t i=0UL; i<M; ++i ) {
1213 for(
size_t k=0UL; k<K; ++k ) {
1215 xmm1 = xmm1 + a1 * B.load(k,j );
1216 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1217 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1218 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1219 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1220 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1221 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1222 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1224 (~C).
store( i, j , xmm1 );
1225 (~C).
store( i, j+IT::size , xmm2 );
1226 (~C).
store( i, j+IT::size*2UL, xmm3 );
1227 (~C).
store( i, j+IT::size*3UL, xmm4 );
1228 (~C).
store( i, j+IT::size*4UL, xmm5 );
1229 (~C).
store( i, j+IT::size*5UL, xmm6 );
1230 (~C).
store( i, j+IT::size*6UL, xmm7 );
1231 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Blocks of four vectors; rows are handled in pairs (i, i+1) first.
1234 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1236 for( ; (i+2UL) <= M; i+=2UL ) {
1245 for(
size_t k=0UL; k<K; ++k ) {
1252 xmm1 = xmm1 + a1 * b1;
1253 xmm2 = xmm2 + a1 * b2;
1254 xmm3 = xmm3 + a1 * b3;
1255 xmm4 = xmm4 + a1 * b4;
1256 xmm5 = xmm5 + a2 * b1;
1257 xmm6 = xmm6 + a2 * b2;
1258 xmm7 = xmm7 + a2 * b3;
1259 xmm8 = xmm8 + a2 * b4;
1261 (~C).
store( i , j , xmm1 );
1262 (~C).
store( i , j+IT::size , xmm2 );
1263 (~C).
store( i , j+IT::size*2UL, xmm3 );
1264 (~C).
store( i , j+IT::size*3UL, xmm4 );
1265 (~C).
store( i+1UL, j , xmm5 );
1266 (~C).
store( i+1UL, j+IT::size , xmm6 );
1267 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1268 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single leftover row of the four-vector block (guard not visible here).
1275 for(
size_t k=0UL; k<K; ++k ) {
1277 xmm1 = xmm1 + a1 * B.load(k,j );
1278 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1279 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1280 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1282 (~C).
store( i, j , xmm1 );
1283 (~C).
store( i, j+IT::size , xmm2 );
1284 (~C).
store( i, j+IT::size*2UL, xmm3 );
1285 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Blocks of two vectors; rows in pairs, then a single leftover row.
1288 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1290 for( ; (i+2UL) <= M; i+=2UL ) {
1295 for(
size_t k=0UL; k<K; ++k ) {
1300 xmm1 = xmm1 + a1 * b1;
1301 xmm2 = xmm2 + a1 * b2;
1302 xmm3 = xmm3 + a2 * b1;
1303 xmm4 = xmm4 + a2 * b2;
1305 (~C).
store( i , j , xmm1 );
1306 (~C).
store( i , j+IT::size, xmm2 );
1307 (~C).
store( i+1UL, j , xmm3 );
1308 (~C).
store( i+1UL, j+IT::size, xmm4 );
1313 for(
size_t k=0UL; k<K; ++k ) {
1315 xmm1 = xmm1 + a1 * B.load(k,j );
1316 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1318 (~C).
store( i, j , xmm1 );
1319 (~C).
store( i, j+IT::size, xmm2 );
// Final single-vector column block; rows in pairs, then a single leftover row.
1324 for( ; (i+2UL) <= M; i+=2UL ) {
1327 for(
size_t k=0UL; k<K; ++k ) {
1329 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1330 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1332 (~C).
store( i , j, xmm1 );
1333 (~C).
store( i+1UL, j, xmm2 );
1337 for(
size_t k=0UL; k<K; ++k ) {
1338 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1340 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for C += A * B, overload for targets of type
// DenseMatrix<MT3,true>. Enabled only when UseVectorizedDefaultKernel holds.
// Mirror image of the DenseMatrix<MT3,false> overload: here the row index i
// is blocked by 8, 4, 2 and 1 vector widths (IT::size), vectors are loaded
// from A and the elements of B are the broadcast factors.
// NOTE(review): the declaration and initialisation of the xmm registers
// (presumably loaded from C), of i/j, a1..a4 and b1/b2 are not part of this
// excerpt. A store may reach past row M-1; presumably the matrices are padded
// to a multiple of IT::size -- confirm.
1361 template<
typename MT3
1364 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1365 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1367 typedef IntrinsicTrait<ElementType> IT;
1369 const size_t M( A.rows() );
1370 const size_t N( B.columns() );
1371 const size_t K( A.columns() );
// Blocks of eight vectors per column, one column at a time.
1375 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1376 for(
size_t j=0UL; j<N; ++j ) {
1385 for(
size_t k=0UL; k<K; ++k ) {
1387 xmm1 = xmm1 + A.load(i ,k) * b1;
1388 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1389 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1390 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1391 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1392 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1393 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1394 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1396 (~C).
store( i , j, xmm1 );
1397 (~C).
store( i+IT::size , j, xmm2 );
1398 (~C).
store( i+IT::size*2UL, j, xmm3 );
1399 (~C).
store( i+IT::size*3UL, j, xmm4 );
1400 (~C).
store( i+IT::size*4UL, j, xmm5 );
1401 (~C).
store( i+IT::size*5UL, j, xmm6 );
1402 (~C).
store( i+IT::size*6UL, j, xmm7 );
1403 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Blocks of four vectors; columns are handled in pairs (j, j+1) first.
1406 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1408 for( ; (j+2UL) <= N; j+=2UL ) {
1417 for(
size_t k=0UL; k<K; ++k ) {
1424 xmm1 = xmm1 + a1 * b1;
1425 xmm2 = xmm2 + a2 * b1;
1426 xmm3 = xmm3 + a3 * b1;
1427 xmm4 = xmm4 + a4 * b1;
1428 xmm5 = xmm5 + a1 * b2;
1429 xmm6 = xmm6 + a2 * b2;
1430 xmm7 = xmm7 + a3 * b2;
1431 xmm8 = xmm8 + a4 * b2;
1433 (~C).
store( i , j , xmm1 );
1434 (~C).
store( i+IT::size , j , xmm2 );
1435 (~C).
store( i+IT::size*2UL, j , xmm3 );
1436 (~C).
store( i+IT::size*3UL, j , xmm4 );
1437 (~C).
store( i , j+1UL, xmm5 );
1438 (~C).
store( i+IT::size , j+1UL, xmm6 );
1439 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
1440 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
// Single leftover column of the four-vector block (guard not visible here).
1447 for(
size_t k=0UL; k<K; ++k ) {
1449 xmm1 = xmm1 + A.load(i ,k) * b1;
1450 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1451 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1452 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1454 (~C).
store( i , j, xmm1 );
1455 (~C).
store( i+IT::size , j, xmm2 );
1456 (~C).
store( i+IT::size*2UL, j, xmm3 );
1457 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Blocks of two vectors; columns in pairs, then a single leftover column.
1460 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1462 for( ; (j+2UL) <= N; j+=2UL ) {
1467 for(
size_t k=0UL; k<K; ++k ) {
1472 xmm1 = xmm1 + a1 * b1;
1473 xmm2 = xmm2 + a2 * b1;
1474 xmm3 = xmm3 + a1 * b2;
1475 xmm4 = xmm4 + a2 * b2;
1477 (~C).
store( i , j , xmm1 );
1478 (~C).
store( i+IT::size, j , xmm2 );
1479 (~C).
store( i , j+1UL, xmm3 );
1480 (~C).
store( i+IT::size, j+1UL, xmm4 );
1485 for(
size_t k=0UL; k<K; ++k ) {
1487 xmm1 = xmm1 + A.load(i ,k) * b1;
1488 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1490 (~C).
store( i , j, xmm1 );
1491 (~C).
store( i+IT::size, j, xmm2 );
// Final single-vector row block; columns in pairs, then a single leftover column.
1496 for( ; (j+2UL) <= N; j+=2UL ) {
1499 for(
size_t k=0UL; k<K; ++k ) {
1501 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1502 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1504 (~C).
store( i, j , xmm1 );
1505 (~C).
store( i, j+1UL, xmm2 );
1509 for(
size_t k=0UL; k<K; ++k ) {
1510 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1512 (~C).
store( i, j, xmm1 );
// Fallback of the BLAS kernel family for C += A * B: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds (BLAS mode off or no matching BLAS
// routine) and simply forwards to the default addition assignment kernel.
1533 template<
typename MT3
1536 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1537 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1539 selectDefaultAddAssignKernel( C, A, B );
// Single precision BLAS kernel for C += A * B (cblas_sgemm with alpha = 1 and
// beta = 1, i.e. the product is added to the existing content of C). Enabled
// when UseSinglePrecisionKernel holds for MT3/MT4/MT5.
//   C : target matrix; its data() buffer is read and written
//   A : left-hand side operand
//   B : right-hand side operand
1559 template<
typename MT3
1562 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1563 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1565 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
1571 const int M ( numeric_cast<int>( A.rows() ) );
1572 const int N ( numeric_cast<int>( B.columns() ) );
1573 const int K ( numeric_cast<int>( A.columns() ) );
1574 const int lda( numeric_cast<int>( A.spacing() ) );
1575 const int ldb( numeric_cast<int>( B.spacing() ) );
1576 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
1578 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1579 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1580 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1581 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS kernel for C += A * B (cblas_dgemm with alpha = 1 and
// beta = 1, i.e. the product is added to the existing content of C). Enabled
// when UseDoublePrecisionKernel holds for MT3/MT4/MT5.
//   C : target matrix; its data() buffer is read and written
//   A : left-hand side operand
//   B : right-hand side operand
1602 template<
typename MT3
1605 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1606 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1608 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
1614 const int M ( numeric_cast<int>( A.rows() ) );
1615 const int N ( numeric_cast<int>( B.columns() ) );
1616 const int K ( numeric_cast<int>( A.columns() ) );
1617 const int lda( numeric_cast<int>( A.spacing() ) );
1618 const int ldb( numeric_cast<int>( B.spacing() ) );
1619 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
1621 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1622 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1623 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1624 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS kernel for C += A * B (cblas_cgemm with
// alpha = (1,0) and beta = (1,0), i.e. the product is added to the existing
// content of C). Enabled when UseSinglePrecisionComplexKernel holds.
//   C : target matrix; its data() buffer is read and written
//   A : left-hand side operand
//   B : right-hand side operand
1645 template<
typename MT3
1648 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1649 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1651 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
1660 const int M ( numeric_cast<int>( A.rows() ) );
1661 const int N ( numeric_cast<int>( B.columns() ) );
1662 const int K ( numeric_cast<int>( A.columns() ) );
1663 const int lda( numeric_cast<int>( A.spacing() ) );
1664 const int ldb( numeric_cast<int>( B.spacing() ) );
1665 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex routines take the scaling factors by address.
1666 const complex<float> alpha( 1.0F, 0.0F );
1667 const complex<float> beta ( 1.0F, 0.0F );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
1669 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1670 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1671 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1672 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS kernel for C += A * B (cblas_zgemm with
// alpha = (1,0) and beta = (1,0), i.e. the product is added to the existing
// content of C). Enabled when UseDoublePrecisionComplexKernel holds.
//   C : target matrix; its data() buffer is read and written
//   A : left-hand side operand
//   B : right-hand side operand
1693 template<
typename MT3
1696 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1697 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1699 using boost::numeric_cast;
// CBLAS takes 'int' dimensions; numeric_cast performs a range-checked
// conversion from the size types returned by the matrices.
1708 const int M ( numeric_cast<int>( A.rows() ) );
1709 const int N ( numeric_cast<int>( B.columns() ) );
1710 const int K ( numeric_cast<int>( A.columns() ) );
1711 const int lda( numeric_cast<int>( A.spacing() ) );
1712 const int ldb( numeric_cast<int>( B.spacing() ) );
1713 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex routines take the scaling factors by address.
1714 const complex<double> alpha( 1.0, 0.0 );
1715 const complex<double> beta ( 1.0, 0.0 );
// The layout passed to CBLAS follows the storage order of C. Exactly one of
// the operands is flagged as transposed: A when C is row-major, B otherwise.
1717 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1718 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1719 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1720 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1743 template<
typename MT
1752 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1766 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the subtraction assignment C -= A * B.
//   C : target matrix (updated in place)
//   A : left-hand side operand
//   B : right-hand side operand
// Forwards either to the default kernel or to the BLAS kernel family.
// NOTE(review): the condition that chooses between the two calls is not part
// of this excerpt -- confirm the threshold against the full file.
1782 template<
typename MT3
1785 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1788 TDMatDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1790 TDMatDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
// Scalar default kernel for C -= A * B. Participates in overload resolution
// only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false (DisableIf).
//   C : target matrix, updated in place
//   A : left-hand side operand (M x K)
//   B : right-hand side operand (K x N)
1809 template<
typename MT3
1812 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1813 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1815 const size_t M( A.rows() );
1816 const size_t N( B.columns() );
1817 const size_t K( A.columns() );
// Largest even number <= N (clears the lowest bit); bounds the 2x unrolled
// column loop.
1820 const size_t end( N &
size_t(-2) );
1822 for(
size_t i=0UL; i<M; ++i ) {
1823 for(
size_t k=0UL; k<K; ++k ) {
// Two columns per iteration.
1824 for(
size_t j=0UL; j<end; j+=2UL ) {
1825 C(i,j ) -= A(i,k) * B(k,j );
1826 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover column when N is odd.
// NOTE(review): the guarding condition is not part of this excerpt;
// presumably 'end < N' -- confirm.
1829 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default kernel for C -= A * B, overload for targets of type
// DenseMatrix<MT3,false>. Enabled only when UseVectorizedDefaultKernel holds.
// The target is processed in column blocks of 8, 4, 2 and finally 1 vector
// width (IT::size elements each); inside a block the k loop subtracts the
// partial products from vector registers which are then written back with
// store().
// NOTE(review): the declaration and initialisation of the xmm registers
// (presumably loaded from C), of i/j, a1/a2 and b1..b4 are not part of this
// excerpt. A store may reach past column N-1; presumably the matrices are
// padded to a multiple of IT::size -- confirm.
1851 template<
typename MT3
1854 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1855 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1857 typedef IntrinsicTrait<ElementType> IT;
1859 const size_t M( A.rows() );
1860 const size_t N( B.columns() );
1861 const size_t K( A.columns() );
// Blocks of eight vectors per row, one row at a time.
1865 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1866 for(
size_t i=0UL; i<M; ++i ) {
1875 for(
size_t k=0UL; k<K; ++k ) {
1877 xmm1 = xmm1 - a1 * B.load(k,j );
1878 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1879 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1880 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1881 xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1882 xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1883 xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1884 xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1886 (~C).
store( i, j , xmm1 );
1887 (~C).
store( i, j+IT::size , xmm2 );
1888 (~C).
store( i, j+IT::size*2UL, xmm3 );
1889 (~C).
store( i, j+IT::size*3UL, xmm4 );
1890 (~C).
store( i, j+IT::size*4UL, xmm5 );
1891 (~C).
store( i, j+IT::size*5UL, xmm6 );
1892 (~C).
store( i, j+IT::size*6UL, xmm7 );
1893 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Blocks of four vectors; rows are handled in pairs (i, i+1) first.
1896 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1898 for( ; (i+2UL) <= M; i+=2UL ) {
1907 for(
size_t k=0UL; k<K; ++k ) {
1914 xmm1 = xmm1 - a1 * b1;
1915 xmm2 = xmm2 - a1 * b2;
1916 xmm3 = xmm3 - a1 * b3;
1917 xmm4 = xmm4 - a1 * b4;
1918 xmm5 = xmm5 - a2 * b1;
1919 xmm6 = xmm6 - a2 * b2;
1920 xmm7 = xmm7 - a2 * b3;
1921 xmm8 = xmm8 - a2 * b4;
1923 (~C).
store( i , j , xmm1 );
1924 (~C).
store( i , j+IT::size , xmm2 );
1925 (~C).
store( i , j+IT::size*2UL, xmm3 );
1926 (~C).
store( i , j+IT::size*3UL, xmm4 );
1927 (~C).
store( i+1UL, j , xmm5 );
1928 (~C).
store( i+1UL, j+IT::size , xmm6 );
1929 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1930 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single leftover row of the four-vector block (guard not visible here).
1937 for(
size_t k=0UL; k<K; ++k ) {
1939 xmm1 = xmm1 - a1 * B.load(k,j );
1940 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1941 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1942 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1944 (~C).
store( i, j , xmm1 );
1945 (~C).
store( i, j+IT::size , xmm2 );
1946 (~C).
store( i, j+IT::size*2UL, xmm3 );
1947 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Blocks of two vectors; rows in pairs, then a single leftover row.
1950 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1952 for( ; (i+2UL) <= M; i+=2UL ) {
1957 for(
size_t k=0UL; k<K; ++k ) {
1962 xmm1 = xmm1 - a1 * b1;
1963 xmm2 = xmm2 - a1 * b2;
1964 xmm3 = xmm3 - a2 * b1;
1965 xmm4 = xmm4 - a2 * b2;
1967 (~C).
store( i , j , xmm1 );
1968 (~C).
store( i , j+IT::size, xmm2 );
1969 (~C).
store( i+1UL, j , xmm3 );
1970 (~C).
store( i+1UL, j+IT::size, xmm4 );
1975 for(
size_t k=0UL; k<K; ++k ) {
1977 xmm1 = xmm1 - a1 * B.load(k,j );
1978 xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
1980 (~C).
store( i, j , xmm1 );
1981 (~C).
store( i, j+IT::size, xmm2 );
// Final single-vector column block; rows in pairs, then a single leftover row.
1986 for( ; (i+2UL) <= M; i+=2UL ) {
1989 for(
size_t k=0UL; k<K; ++k ) {
1991 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1992 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1994 (~C).
store( i , j, xmm1 );
1995 (~C).
store( i+1UL, j, xmm2 );
1999 for(
size_t k=0UL; k<K; ++k ) {
2000 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
2002 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for C -= A * B, overload for targets of type
// DenseMatrix<MT3,true>. Enabled only when UseVectorizedDefaultKernel holds.
// Mirror image of the DenseMatrix<MT3,false> overload: here the row index i
// is blocked by 8, 4, 2 and 1 vector widths (IT::size), vectors are loaded
// from A and the elements of B are the broadcast factors.
// NOTE(review): the declaration and initialisation of the xmm registers
// (presumably loaded from C), of i/j, a1..a4 and b1/b2 are not part of this
// excerpt. A store may reach past row M-1; presumably the matrices are padded
// to a multiple of IT::size -- confirm.
2023 template<
typename MT3
2026 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2027 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2029 typedef IntrinsicTrait<ElementType> IT;
2031 const size_t M( A.rows() );
2032 const size_t N( B.columns() );
2033 const size_t K( A.columns() );
// Blocks of eight vectors per column, one column at a time.
2037 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2038 for(
size_t j=0UL; j<N; ++j ) {
2047 for(
size_t k=0UL; k<K; ++k ) {
2049 xmm1 = xmm1 - A.load(i ,k) * b1;
2050 xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2051 xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2052 xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2053 xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
2054 xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
2055 xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
2056 xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
2058 (~C).
store( i , j, xmm1 );
2059 (~C).
store( i+IT::size , j, xmm2 );
2060 (~C).
store( i+IT::size*2UL, j, xmm3 );
2061 (~C).
store( i+IT::size*3UL, j, xmm4 );
2062 (~C).
store( i+IT::size*4UL, j, xmm5 );
2063 (~C).
store( i+IT::size*5UL, j, xmm6 );
2064 (~C).
store( i+IT::size*6UL, j, xmm7 );
2065 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Blocks of four vectors; columns are handled in pairs (j, j+1) first.
2068 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2070 for( ; (j+2UL) <= N; j+=2UL ) {
2079 for(
size_t k=0UL; k<K; ++k ) {
2086 xmm1 = xmm1 - a1 * b1;
2087 xmm2 = xmm2 - a2 * b1;
2088 xmm3 = xmm3 - a3 * b1;
2089 xmm4 = xmm4 - a4 * b1;
2090 xmm5 = xmm5 - a1 * b2;
2091 xmm6 = xmm6 - a2 * b2;
2092 xmm7 = xmm7 - a3 * b2;
2093 xmm8 = xmm8 - a4 * b2;
2095 (~C).
store( i , j , xmm1 );
2096 (~C).
store( i+IT::size , j , xmm2 );
2097 (~C).
store( i+IT::size*2UL, j , xmm3 );
2098 (~C).
store( i+IT::size*3UL, j , xmm4 );
2099 (~C).
store( i , j+1UL, xmm5 );
2100 (~C).
store( i+IT::size , j+1UL, xmm6 );
2101 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
2102 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
// Single leftover column of the four-vector block (guard not visible here).
2109 for(
size_t k=0UL; k<K; ++k ) {
2111 xmm1 = xmm1 - A.load(i ,k) * b1;
2112 xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2113 xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2114 xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2116 (~C).
store( i , j, xmm1 );
2117 (~C).
store( i+IT::size , j, xmm2 );
2118 (~C).
store( i+IT::size*2UL, j, xmm3 );
2119 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Blocks of two vectors; columns in pairs, then a single leftover column.
2122 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2124 for( ; (j+2UL) <= N; j+=2UL ) {
2129 for(
size_t k=0UL; k<K; ++k ) {
2134 xmm1 = xmm1 - a1 * b1;
2135 xmm2 = xmm2 - a2 * b1;
2136 xmm3 = xmm3 - a1 * b2;
2137 xmm4 = xmm4 - a2 * b2;
2139 (~C).
store( i , j , xmm1 );
2140 (~C).
store( i+IT::size, j , xmm2 );
2141 (~C).
store( i , j+1UL, xmm3 );
2142 (~C).
store( i+IT::size, j+1UL, xmm4 );
2147 for(
size_t k=0UL; k<K; ++k ) {
2149 xmm1 = xmm1 - A.load(i ,k) * b1;
2150 xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
2152 (~C).
store( i , j, xmm1 );
2153 (~C).
store( i+IT::size, j, xmm2 );
// Final single-vector row block; columns in pairs, then a single leftover column.
2158 for( ; (j+2UL) <= N; j+=2UL ) {
2161 for(
size_t k=0UL; k<K; ++k ) {
2163 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2164 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2166 (~C).
store( i, j , xmm1 );
2167 (~C).
store( i, j+1UL, xmm2 );
2171 for(
size_t k=0UL; k<K; ++k ) {
2172 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
2174 (~C).
store( i, j, xmm1 );
// Fallback of the BLAS kernel family for C -= A * B: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds (BLAS mode off or no matching BLAS
// routine) and simply forwards to the default subtraction assignment kernel.
2195 template<
typename MT3
2198 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2199 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2201 selectDefaultSubAssignKernel( C, A, B );
// BLAS subtraction-assignment kernel for single precision operands.
// Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5>::value is set, i.e. the
// element types of C, A and B are all float.
// Calls cblas_sgemm with alpha = -1 and beta = 1, which updates C in place to C - A*B.
//   C: target matrix (read and written through C.data())
//   A: left-hand side operand, B: right-hand side operand
2221 template<
typename MT3
2224 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2225 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2227 using boost::numeric_cast;
// The CBLAS interface takes int sizes; numeric_cast is a range-checked conversion
// from the size_t values reported by the matrices.
2233 const int M ( numeric_cast<int>( A.rows() ) );
2234 const int N ( numeric_cast<int>( B.columns() ) );
2235 const int K ( numeric_cast<int>( A.columns() ) );
2236 const int lda( numeric_cast<int>( A.spacing() ) );
2237 const int ldb( numeric_cast<int>( B.spacing() ) );
2238 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order handed to BLAS follows the target C. For a row-major C the
// operand A is flagged as transposed and B as non-transposed; for a column-major
// C the two flags are swapped.
// NOTE(review): presumably because A is stored column-major and B row-major in
// this expression -- confirm against the operand types.
2240 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2241 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2242 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2243 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction-assignment kernel for double precision operands.
// Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5>::value is set, i.e. the
// element types of C, A and B are all double.
// Calls cblas_dgemm with alpha = -1 and beta = 1, which updates C in place to C - A*B.
//   C: target matrix (read and written through C.data())
//   A: left-hand side operand, B: right-hand side operand
2264 template<
typename MT3
2267 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2268 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2270 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
2276 const int M ( numeric_cast<int>( A.rows() ) );
2277 const int N ( numeric_cast<int>( B.columns() ) );
2278 const int K ( numeric_cast<int>( A.columns() ) );
2279 const int lda( numeric_cast<int>( A.spacing() ) );
2280 const int ldb( numeric_cast<int>( B.spacing() ) );
2281 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
2283 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2284 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2285 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2286 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction-assignment kernel for single precision complex operands.
// Enabled only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>::value is set,
// i.e. the element types of C, A and B are all complex<float>.
// Calls cblas_cgemm with alpha = (-1,0) and beta = (1,0), which updates C in place
// to C - A*B.
//   C: target matrix (read and written through C.data())
//   A: left-hand side operand, B: right-hand side operand
2307 template<
typename MT3
2310 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2311 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2313 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
2322 const int M ( numeric_cast<int>( A.rows() ) );
2323 const int N ( numeric_cast<int>( B.columns() ) );
2324 const int K ( numeric_cast<int>( A.columns() ) );
2325 const int lda( numeric_cast<int>( A.spacing() ) );
2326 const int ldb( numeric_cast<int>( B.spacing() ) );
2327 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM routines take their scaling factors by address, hence the
// named locals.
2328 const complex<float> alpha( -1.0F, 0.0F );
2329 const complex<float> beta ( 1.0F, 0.0F );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
2331 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2332 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2333 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2334 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction-assignment kernel for double precision complex operands.
// Enabled only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>::value is set,
// i.e. the element types of C, A and B are all complex<double>.
// Calls cblas_zgemm with alpha = (-1,0) and beta = (1,0), which updates C in place
// to C - A*B.
//   C: target matrix (read and written through C.data())
//   A: left-hand side operand, B: right-hand side operand
2355 template<
typename MT3
2358 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2359 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2361 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
2370 const int M ( numeric_cast<int>( A.rows() ) );
2371 const int N ( numeric_cast<int>( B.columns() ) );
2372 const int K ( numeric_cast<int>( A.columns() ) );
2373 const int lda( numeric_cast<int>( A.spacing() ) );
2374 const int ldb( numeric_cast<int>( B.spacing() ) );
2375 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM routines take their scaling factors by address, hence the
// named locals.
2376 const complex<double> alpha( -1.0, 0.0 );
2377 const complex<double> beta ( 1.0, 0.0 );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
2379 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2380 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2381 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2382 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Friend function that is only enabled when UseSMPAssign<MT>::value is set, i.e.
// when at least one of the two operands has to be evaluated first.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt; the references to `lhs` and `rhs.lhs_` suggest an assignment of the
// product expression `rhs` to the target `lhs` -- confirm.
2415 template<
typename MT
2417 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
// Special case 1: the target has no rows or no columns.
2425 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the left operand has no columns (the inner dimension is zero).
// The handling of both branches is not visible in this excerpt.
2428 else if( rhs.lhs_.columns() == 0UL ) {
// Friend function that is only enabled when UseSMPAssign<MT>::value is set.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt.
2462 template<
typename MT
2464 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
// Temporary type: ResultType or OppositeType, chosen by the compile-time flag SO.
2469 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// The expression is first evaluated into a temporary of that type.
2481 const TmpType tmp( rhs );
// Friend function that is only enabled when UseSMPAssign<MT>::value is set.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt.
2502 template<
typename MT
2504 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
// Guard for the degenerate cases: empty target or zero inner dimension. Unlike
// the guard at embedded line 2425, all three conditions share a single branch.
2512 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function that is only enabled when UseSMPAssign<MT>::value is set.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt.
2550 template<
typename MT
2552 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
// Guard for the degenerate cases: empty target or zero inner dimension.
2560 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Head of a DMatScalarMultExpr class for the case in which the scaled matrix is
// itself a TDMatDMatMultExpr<MT1,MT2>, i.e. an expression of the form
// (A*B)*scalar. The class is a dense matrix expression (public DenseMatrix base
// with flag `true`) and privately carries the MatScalarMultExpr and Computation
// tags.
2619 template<
typename MT1
2623 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2624 ,
private MatScalarMultExpr
2625 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression.
2629 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// Compile-time switch: the left operand is a computation or requires evaluation.
2641 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Compile-time switch: the right operand is a computation or requires evaluation.
2646 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Helper trait: value is set when either operand has to be evaluated first
// (evaluateLeft || evaluateRight). The template parameter MT does not take part
// in the computation of the value.
2654 template<
typename MT >
2655 struct UseSMPAssign {
2656 enum { value = ( evaluateLeft || evaluateRight ) };
// Helper trait selecting the single precision BLAS kernel: value is set when the
// element types of T1, T2 and T3 are all float and the scalar type T4 is not a
// complex type.
2665 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2666 struct UseSinglePrecisionKernel {
2667 enum { value = IsFloat<typename T1::ElementType>::value &&
2668 IsFloat<typename T2::ElementType>::value &&
2669 IsFloat<typename T3::ElementType>::value &&
2670 !IsComplex<T4>::value };
// Helper trait selecting the double precision BLAS kernel: value is set when the
// element types of T1, T2 and T3 are all double and the scalar type T4 is not a
// complex type.
2679 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2680 struct UseDoublePrecisionKernel {
2681 enum { value = IsDouble<typename T1::ElementType>::value &&
2682 IsDouble<typename T2::ElementType>::value &&
2683 IsDouble<typename T3::ElementType>::value &&
2684 !IsComplex<T4>::value };
// Helper trait selecting the single precision complex BLAS kernel: value is set
// when the element types of T1, T2 and T3 are all exactly complex<float>.
// The scalar type is not part of this trait.
2693 template<
typename T1,
typename T2,
typename T3 >
2694 struct UseSinglePrecisionComplexKernel {
2695 typedef complex<float> Type;
2696 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2697 IsSame<typename T2::ElementType,Type>::value &&
2698 IsSame<typename T3::ElementType,Type>::value };
// Helper trait selecting the double precision complex BLAS kernel: value is set
// when the element types of T1, T2 and T3 are all exactly complex<double>.
// The scalar type is not part of this trait.
2707 template<
typename T1,
typename T2,
typename T3 >
2708 struct UseDoublePrecisionComplexKernel {
2709 typedef complex<double> Type;
2710 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2711 IsSame<typename T2::ElementType,Type>::value &&
2712 IsSame<typename T3::ElementType,Type>::value };
// Helper trait selecting the default (non-BLAS) kernel: value is set when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel traits (float,
// double, complex<float>, complex<double>) applies to the given types.
2720 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2721 struct UseDefaultKernel {
2722 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2723 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2724 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2725 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Helper trait selecting the vectorized default kernel: value is set when all
// three matrix types are vectorizable, all three element types and the scalar
// type T4 are the same type, and that element type supports intrinsic addition,
// subtraction and multiplication.
2733 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2734 struct UseVectorizedDefaultKernel {
2735 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2736 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2737 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2738 IsSame<typename T1::ElementType,T4>::value &&
2739 IntrinsicTrait<typename T1::ElementType>::addition &&
2740 IntrinsicTrait<typename T1::ElementType>::subtraction &&
2741 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member type definitions and compile-time flags of the expression class.
// NOTE(review): RES, RT1, RT2, CT1, CT2, ET1, ET2 and ElementType are declared on
// lines that are not visible in this excerpt.

// Type of this expression instance.
2747 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type: multiplication trait of RES and the scalar type ST.
2748 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
2752 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the matrix/matrix multiplication expression.
2757 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left matrix operand inside the kernels: const RT1 when the
// operand has to be evaluated, CT1 otherwise.
2763 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
// Type used for the right matrix operand inside the kernels: const RT2 when the
// operand has to be evaluated, CT2 otherwise.
2766 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: both operands vectorizable, both element types and the
// scalar type identical, and intrinsic addition and multiplication available.
2771 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2772 IsSame<ET1,ET2>::value &&
2773 IsSame<ET1,ST>::value &&
2774 IntrinsicTrait<ET1>::addition &&
2775 IntrinsicTrait<ET1>::multiplication };
// SMP flag: set only when neither operand needs evaluation and both operand
// types are themselves SMP-assignable.
2778 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
2779 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix multiplication expression and the scalar factor.
// Declared explicit, so it cannot be used for implicit conversions.
// NOTE(review): the member initialization is not visible in this excerpt.
2788 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Returns element (i,j) of the wrapped multiplication expression scaled by scalar_.
// NOTE(review): the enclosing function signature is not visible in this excerpt.
2804 return matrix_(i,j) * scalar_;
// Returns the number of rows, as reported by the wrapped multiplication expression.
2813 inline size_t rows()
const {
2814 return matrix_.rows();
// Returns the number of columns, as reported by the wrapped multiplication expression.
2823 inline size_t columns()
const {
2824 return matrix_.columns();
// Returns whether the expression can alias with the given address.
// The scalar plays no part in the check; the query is delegated to the wrapped
// multiplication expression.
//   alias: address to be checked
2854 template<
typename T >
2855 inline bool canAlias(
const T* alias )
const {
2856 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address.
// The query is delegated to the wrapped multiplication expression.
//   alias: address to be checked
2866 template<
typename T >
2867 inline bool isAliased(
const T* alias )
const {
2868 return matrix_.isAliased( alias );
// Forwards the alignment query to the wrapped multiplication expression.
// NOTE(review): the enclosing function signature is not visible in this excerpt.
2878 return matrix_.isAligned();
// Binds the right-hand side operand of the wrapped multiplication expression to
// the local name B.
// NOTE(review): the enclosing function and the use of B are not visible in this
// excerpt.
2888 typename MMM::RightOperand B( matrix_.rightOperand() );
// Assignment of a scaled matrix/matrix multiplication to a dense matrix.
//   lhs: target dense matrix
//   rhs: scaled multiplication expression to be assigned
// The two operands of the inner product are extracted, degenerate sizes are
// handled up front, and the work is handed to selectAssignKernel() together with
// the scalar.
2913 template<
typename MT
2915 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
2922 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2923 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case 1: the target has no rows or no columns.
2925 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension of the product is zero.
// The handling of both branches is not visible in this excerpt.
2928 else if( left.columns() == 0UL ) {
// NOTE(review): A and B are declared on lines not visible here; presumably they
// are the (possibly evaluated) operands `left` and `right` -- confirm.
2943 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the assignment C = scalar * (A*B).
// Calls either selectDefaultAssignKernel() or selectBlasAssignKernel() with the
// same four arguments.
// NOTE(review): the condition that chooses between the two calls is not visible
// in this excerpt.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
2958 template<
typename MT3
2962 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2965 DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
2967 DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) assignment kernel; disabled whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>::value is set.
// Works one row of C at a time through element access.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
2985 template<
typename MT3
2989 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2990 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2992 for(
size_t i=0UL; i<A.rows(); ++i ) {
// The first term of the inner product initializes row i of C by assignment, so
// the previous content of C is overwritten rather than accumulated into.
2993 for(
size_t k=0UL; k<B.columns(); ++k ) {
2994 C(i,k) = A(i,0UL) * B(0UL,k);
// The remaining terms (j = 1 .. A.columns()-1) are accumulated.
2996 for(
size_t j=1UL; j<A.columns(); ++j ) {
2997 for(
size_t k=0UL; k<B.columns(); ++k ) {
2998 C(i,k) += A(i,j) * B(j,k);
// Final pass over the elements of row i.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// applies the scalar to C(i,k) -- confirm.
3001 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default assignment kernel for targets of type DenseMatrix<MT3,false>
// (storage-order flag false; row-major in Blaze's convention). Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>::value is set.
// The result is computed in strips of columns of C: products are accumulated in
// intrinsic registers over the inner dimension k and each register is multiplied
// by `factor` before it is stored.
// NOTE(review): the declarations of `factor`, `a1`, `a2`, `b1`..`b4` and of the
// loop counters `i` and `j` are on lines not visible in this excerpt; `factor` is
// presumably the scalar broadcast into an intrinsic vector -- confirm.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3022 template<
typename MT3
3026 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3027 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3029 typedef IntrinsicTrait<ElementType> IT;
3031 const size_t M( A.rows() );
3032 const size_t N( B.columns() );
3033 const size_t K( A.columns() );
// Strips of 8 intrinsic vectors (8*IT::size columns), one row of C at a time.
// NOTE(review): the loop condition only requires the eighth vector to start
// before N; presumably the operands are padded so a partial last vector is safe.
3039 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3040 for(
size_t i=0UL; i<M; ++i ) {
3041 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3042 for(
size_t k=0UL; k<K; ++k ) {
3044 xmm1 = xmm1 + a1 * B.load(k,j );
3045 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3046 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3047 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3048 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3049 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3050 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3051 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3053 (~C).
store( i, j , xmm1 * factor );
3054 (~C).
store( i, j+IT::size , xmm2 * factor );
3055 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
3056 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
3057 (~C).
store( i, j+IT::size*4UL, xmm5 * factor );
3058 (~C).
store( i, j+IT::size*5UL, xmm6 * factor );
3059 (~C).
store( i, j+IT::size*6UL, xmm7 * factor );
3060 (~C).
store( i, j+IT::size*7UL, xmm8 * factor );
// Strips of 4 intrinsic vectors; rows are processed in pairs (2x4 register
// block), followed by a pass for a single leftover row.
3063 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3065 for( ; (i+2UL) <= M; i+=2UL ) {
3066 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3067 for(
size_t k=0UL; k<K; ++k ) {
3074 xmm1 = xmm1 + a1 * b1;
3075 xmm2 = xmm2 + a1 * b2;
3076 xmm3 = xmm3 + a1 * b3;
3077 xmm4 = xmm4 + a1 * b4;
3078 xmm5 = xmm5 + a2 * b1;
3079 xmm6 = xmm6 + a2 * b2;
3080 xmm7 = xmm7 + a2 * b3;
3081 xmm8 = xmm8 + a2 * b4;
3083 (~C).
store( i , j , xmm1 * factor );
3084 (~C).
store( i , j+IT::size , xmm2 * factor );
3085 (~C).
store( i , j+IT::size*2UL, xmm3 * factor );
3086 (~C).
store( i , j+IT::size*3UL, xmm4 * factor );
3087 (~C).
store( i+1UL, j , xmm5 * factor );
3088 (~C).
store( i+1UL, j+IT::size , xmm6 * factor );
3089 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 * factor );
3090 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 * factor );
// Single leftover row of the 4-vector strip (the enclosing guard is not visible).
3094 for(
size_t k=0UL; k<K; ++k ) {
3096 xmm1 = xmm1 + a1 * B.load(k,j );
3097 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3098 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3099 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3101 (~C).
store( i, j , xmm1 * factor );
3102 (~C).
store( i, j+IT::size , xmm2 * factor );
3103 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
3104 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
// Strips of 2 intrinsic vectors; rows in pairs (2x2 register block), then a
// single leftover row.
3107 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3109 for( ; (i+2UL) <= M; i+=2UL ) {
3111 for(
size_t k=0UL; k<K; ++k ) {
3116 xmm1 = xmm1 + a1 * b1;
3117 xmm2 = xmm2 + a1 * b2;
3118 xmm3 = xmm3 + a2 * b1;
3119 xmm4 = xmm4 + a2 * b2;
3121 (~C).
store( i , j , xmm1 * factor );
3122 (~C).
store( i , j+IT::size, xmm2 * factor );
3123 (~C).
store( i+1UL, j , xmm3 * factor );
3124 (~C).
store( i+1UL, j+IT::size, xmm4 * factor );
// Single leftover row of the 2-vector strip (the enclosing guard is not visible).
3128 for(
size_t k=0UL; k<K; ++k ) {
3130 xmm1 = xmm1 + a1 * B.load(k,j );
3131 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3133 (~C).
store( i, j , xmm1 * factor );
3134 (~C).
store( i, j+IT::size, xmm2 * factor );
// Final strip of a single intrinsic vector; rows in pairs, then a single
// leftover row. Elements of A are broadcast with set().
// NOTE(review): the guard that enters this section is not visible in this excerpt.
3139 for( ; (i+2UL) <= M; i+=2UL ) {
3141 for(
size_t k=0UL; k<K; ++k ) {
3143 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3144 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3146 (~C).
store( i , j, xmm1 * factor );
3147 (~C).
store( i+1UL, j, xmm2 * factor );
3151 for(
size_t k=0UL; k<K; ++k ) {
3152 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3154 (~C).
store( i, j, xmm1 * factor );
// Vectorized default assignment kernel for targets of type DenseMatrix<MT3,true>
// (storage-order flag true; column-major in Blaze's convention). Enabled only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>::value is set.
// Mirror image of the DenseMatrix<MT3,false> overload: the result is computed in
// strips of rows of C, vectors are loaded from A and the elements of B are the
// broadcast operand. Each accumulator is multiplied by `factor` before the store.
// NOTE(review): the declarations of `factor`, `a1`..`a4`, `b1`, `b2` and of the
// loop counters `i` and `j` are on lines not visible in this excerpt; `factor` is
// presumably the scalar broadcast into an intrinsic vector -- confirm.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3174 template<
typename MT3
3178 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3179 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3181 typedef IntrinsicTrait<ElementType> IT;
3183 const size_t M( A.rows() );
3184 const size_t N( B.columns() );
3185 const size_t K( A.columns() );
// Strips of 8 intrinsic vectors (8*IT::size rows), one column of C at a time.
// NOTE(review): the loop condition only requires the eighth vector to start
// before M; presumably the operands are padded so a partial last vector is safe.
3191 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3192 for(
size_t j=0UL; j<N; ++j ) {
3193 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3194 for(
size_t k=0UL; k<K; ++k ) {
3196 xmm1 = xmm1 + A.load(i ,k) * b1;
3197 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3198 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3199 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3200 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3201 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3202 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3203 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3205 (~C).
store( i , j, xmm1 * factor );
3206 (~C).
store( i+IT::size , j, xmm2 * factor );
3207 (~C).
store( i+IT::size*2UL, j, xmm3 * factor );
3208 (~C).
store( i+IT::size*3UL, j, xmm4 * factor );
3209 (~C).
store( i+IT::size*4UL, j, xmm5 * factor );
3210 (~C).
store( i+IT::size*5UL, j, xmm6 * factor );
3211 (~C).
store( i+IT::size*6UL, j, xmm7 * factor );
3212 (~C).
store( i+IT::size*7UL, j, xmm8 * factor );
// Strips of 4 intrinsic vectors; columns are processed in pairs (4x2 register
// block), followed by a pass for a single leftover column.
3215 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3217 for( ; (j+2UL) <= N; j+=2UL ) {
3218 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3219 for(
size_t k=0UL; k<K; ++k ) {
3226 xmm1 = xmm1 + a1 * b1;
3227 xmm2 = xmm2 + a2 * b1;
3228 xmm3 = xmm3 + a3 * b1;
3229 xmm4 = xmm4 + a4 * b1;
3230 xmm5 = xmm5 + a1 * b2;
3231 xmm6 = xmm6 + a2 * b2;
3232 xmm7 = xmm7 + a3 * b2;
3233 xmm8 = xmm8 + a4 * b2;
3235 (~C).
store( i , j , xmm1 * factor );
3236 (~C).
store( i+IT::size , j , xmm2 * factor );
3237 (~C).
store( i+IT::size*2UL, j , xmm3 * factor );
3238 (~C).
store( i+IT::size*3UL, j , xmm4 * factor );
3239 (~C).
store( i , j+1UL, xmm5 * factor );
3240 (~C).
store( i+IT::size , j+1UL, xmm6 * factor );
3241 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 * factor );
3242 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 * factor );
// Single leftover column of the 4-vector strip (the enclosing guard is not visible).
3246 for(
size_t k=0UL; k<K; ++k ) {
3248 xmm1 = xmm1 + A.load(i ,k) * b1;
3249 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3250 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3251 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3253 (~C).
store( i , j, xmm1 * factor );
3254 (~C).
store( i+IT::size , j, xmm2 * factor );
3255 (~C).
store( i+IT::size*2UL, j, xmm3 * factor );
3256 (~C).
store( i+IT::size*3UL, j, xmm4 * factor );
// Strips of 2 intrinsic vectors; columns in pairs (2x2 register block), then a
// single leftover column.
3259 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3261 for( ; (j+2UL) <= N; j+=2UL ) {
3263 for(
size_t k=0UL; k<K; ++k ) {
3268 xmm1 = xmm1 + a1 * b1;
3269 xmm2 = xmm2 + a2 * b1;
3270 xmm3 = xmm3 + a1 * b2;
3271 xmm4 = xmm4 + a2 * b2;
3273 (~C).
store( i , j , xmm1 * factor );
3274 (~C).
store( i+IT::size, j , xmm2 * factor );
3275 (~C).
store( i , j+1UL, xmm3 * factor );
3276 (~C).
store( i+IT::size, j+1UL, xmm4 * factor );
// Single leftover column of the 2-vector strip (the enclosing guard is not visible).
3280 for(
size_t k=0UL; k<K; ++k ) {
3282 xmm1 = xmm1 + A.load(i ,k) * b1;
3283 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3285 (~C).
store( i , j, xmm1 * factor );
3286 (~C).
store( i+IT::size, j, xmm2 * factor );
// Final strip of a single intrinsic vector; columns in pairs, then a single
// leftover column. Elements of B are broadcast with set().
// NOTE(review): the guard that enters this section is not visible in this excerpt.
3291 for( ; (j+2UL) <= N; j+=2UL ) {
3293 for(
size_t k=0UL; k<K; ++k ) {
3295 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3296 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3298 (~C).
store( i, j , xmm1 * factor );
3299 (~C).
store( i, j+1UL, xmm2 * factor );
3303 for(
size_t k=0UL; k<K; ++k ) {
3304 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3306 (~C).
store( i, j, xmm1 * factor );
// Fallback overload of the BLAS assignment kernel selector for the scaled product.
// Enabled only when UseDefaultKernel<MT3,MT4,MT5,ST2>::value is set, i.e. BLAS mode
// is off or no BLAS-specific kernel matches the types. Forwards all four
// arguments to selectDefaultAssignKernel().
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3326 template<
typename MT3
3330 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3331 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3333 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS assignment kernel for single precision operands.
// Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>::value is set (all
// element types float, scalar type not complex).
// Calls cblas_sgemm with alpha = scalar and beta = 0, i.e. C is overwritten with
// scalar * A*B.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3352 template<
typename MT3
3356 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3357 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3359 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
3365 const int M ( numeric_cast<int>( A.rows() ) );
3366 const int N ( numeric_cast<int>( B.columns() ) );
3367 const int K ( numeric_cast<int>( A.columns() ) );
3368 const int lda( numeric_cast<int>( A.spacing() ) );
3369 const int ldb( numeric_cast<int>( B.spacing() ) );
3370 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
3372 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3373 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3374 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3375 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel for double precision operands.
// Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>::value is set (all
// element types double, scalar type not complex).
// Calls cblas_dgemm with alpha = scalar and beta = 0, i.e. C is overwritten with
// scalar * A*B.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3395 template<
typename MT3
3399 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3400 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3402 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
3408 const int M ( numeric_cast<int>( A.rows() ) );
3409 const int N ( numeric_cast<int>( B.columns() ) );
3410 const int K ( numeric_cast<int>( A.columns() ) );
3411 const int lda( numeric_cast<int>( A.spacing() ) );
3412 const int ldb( numeric_cast<int>( B.spacing() ) );
3413 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
3415 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3416 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3417 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3418 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel for single precision complex operands.
// Enabled only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>::value is set
// (all element types complex<float>); the scalar type is not part of the condition.
// Calls cblas_cgemm with alpha = scalar and beta = 0, i.e. C is overwritten with
// scalar * A*B.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3438 template<
typename MT3
3442 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3443 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3445 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
3454 const int M ( numeric_cast<int>( A.rows() ) );
3455 const int N ( numeric_cast<int>( B.columns() ) );
3456 const int K ( numeric_cast<int>( A.columns() ) );
3457 const int lda( numeric_cast<int>( A.spacing() ) );
3458 const int ldb( numeric_cast<int>( B.spacing() ) );
3459 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is converted to complex<float>; both factors are passed by address.
3460 const complex<float> alpha( scalar );
3461 const complex<float> beta ( 0.0F, 0.0F );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
3463 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3464 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3465 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3466 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel for double precision complex operands.
// Enabled only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>::value is set
// (all element types complex<double>); the scalar type is not part of the condition.
// Calls cblas_zgemm with alpha = scalar and beta = 0, i.e. C is overwritten with
// scalar * A*B.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3486 template<
typename MT3
3490 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3491 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3493 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
3502 const int M ( numeric_cast<int>( A.rows() ) );
3503 const int N ( numeric_cast<int>( B.columns() ) );
3504 const int K ( numeric_cast<int>( A.columns() ) );
3505 const int lda( numeric_cast<int>( A.spacing() ) );
3506 const int ldb( numeric_cast<int>( B.spacing() ) );
3507 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is converted to complex<double>; both factors are passed by address.
3508 const complex<double> alpha( scalar );
3509 const complex<double> beta ( 0.0, 0.0 );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
3511 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3512 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3513 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3514 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of a scaled matrix/matrix multiplication to a sparse matrix.
//   lhs: target sparse matrix
//   rhs: scaled multiplication expression to be assigned
// The expression is first evaluated into a temporary.
// NOTE(review): the statement that transfers `tmp` to `lhs` is not visible in
// this excerpt.
3531 template<
typename MT
3533 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type: ResultType or OppositeType, chosen by the compile-time flag SO.
3537 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// The expression is wrapped in serial() before the temporary is constructed from it.
3549 const TmpType tmp(
serial( rhs ) );
// Addition assignment of a scaled matrix/matrix multiplication to a dense matrix.
//   lhs: target dense matrix
//   rhs: scaled multiplication expression to be added
// The operands of the inner product are extracted, degenerate sizes are handled
// up front, and the work is handed to selectAddAssignKernel() with the scalar.
3566 template<
typename MT
3568 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3575 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3576 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for an empty target or a zero inner dimension (branch body not visible).
3578 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are declared on lines not visible here; presumably they
// are the (possibly evaluated) operands `left` and `right` -- confirm.
3592 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the addition assignment C += scalar * (A*B).
// Calls either selectDefaultAddAssignKernel() or selectBlasAddAssignKernel() with
// the same four arguments.
// NOTE(review): the condition that chooses between the two calls is not visible
// in this excerpt.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3607 template<
typename MT3
3611 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3614 DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3616 DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
// Default (non-vectorized) addition assignment kernel; disabled whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>::value is set.
// NOTE(review): the function body is not visible in this excerpt.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3634 template<
typename MT3
3638 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3639 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for targets of type
// DenseMatrix<MT3,false> (storage-order flag false; row-major in Blaze's
// convention). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>::value
// is set.
// Same blocking scheme as the corresponding assignment kernel; the difference is
// the store, which writes back the current content of C plus the accumulated
// product multiplied by `factor`.
// NOTE(review): the declarations of `factor`, `a1`, `a2`, `b1`..`b4` and of the
// loop counters `i` and `j` are on lines not visible in this excerpt; `factor` is
// presumably the scalar broadcast into an intrinsic vector -- confirm.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3660 template<
typename MT3
3664 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3665 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3667 typedef IntrinsicTrait<ElementType> IT;
3669 const size_t M( A.rows() );
3670 const size_t N( B.columns() );
3671 const size_t K( A.columns() );
// Strips of 8 intrinsic vectors (8*IT::size columns), one row of C at a time.
3677 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3678 for(
size_t i=0UL; i<M; ++i ) {
3679 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3680 for(
size_t k=0UL; k<K; ++k ) {
3682 xmm1 = xmm1 + a1 * B.load(k,j );
3683 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3684 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3685 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3686 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3687 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3688 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3689 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
// Read-modify-write: the old value of C is loaded and the scaled sum is added.
3691 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3692 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3693 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3694 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3695 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
3696 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
3697 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
3698 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
// Strips of 4 intrinsic vectors; rows in pairs (2x4 register block), then a
// single leftover row.
3701 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3703 for( ; (i+2UL) <= M; i+=2UL ) {
3704 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3705 for(
size_t k=0UL; k<K; ++k ) {
3712 xmm1 = xmm1 + a1 * b1;
3713 xmm2 = xmm2 + a1 * b2;
3714 xmm3 = xmm3 + a1 * b3;
3715 xmm4 = xmm4 + a1 * b4;
3716 xmm5 = xmm5 + a2 * b1;
3717 xmm6 = xmm6 + a2 * b2;
3718 xmm7 = xmm7 + a2 * b3;
3719 xmm8 = xmm8 + a2 * b4;
3721 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3722 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
3723 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
3724 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
3725 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3726 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
3727 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
3728 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
// Single leftover row of the 4-vector strip (the enclosing guard is not visible).
3732 for(
size_t k=0UL; k<K; ++k ) {
3734 xmm1 = xmm1 + a1 * B.load(k,j );
3735 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3736 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3737 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3739 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3740 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3741 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3742 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
// Strips of 2 intrinsic vectors; rows in pairs (2x2 register block), then a
// single leftover row.
3745 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3747 for( ; (i+2UL) <= M; i+=2UL ) {
3749 for(
size_t k=0UL; k<K; ++k ) {
3754 xmm1 = xmm1 + a1 * b1;
3755 xmm2 = xmm2 + a1 * b2;
3756 xmm3 = xmm3 + a2 * b1;
3757 xmm4 = xmm4 + a2 * b2;
3759 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3760 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
3761 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3762 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
// Single leftover row of the 2-vector strip (the enclosing guard is not visible).
3766 for(
size_t k=0UL; k<K; ++k ) {
3768 xmm1 = xmm1 + a1 * B.load(k,j );
3769 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3771 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3772 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
// Final strip of a single intrinsic vector; rows in pairs, then a single
// leftover row. Elements of A are broadcast with set().
// NOTE(review): the guard that enters this section is not visible in this excerpt.
3777 for( ; (i+2UL) <= M; i+=2UL ) {
3779 for(
size_t k=0UL; k<K; ++k ) {
3781 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3782 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3784 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3785 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
3789 for(
size_t k=0UL; k<K; ++k ) {
3790 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3792 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Vectorized default addition assignment kernel for targets of type
// DenseMatrix<MT3,true> (storage-order flag true; column-major in Blaze's
// convention). Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>::value
// is set.
// Mirror image of the DenseMatrix<MT3,false> overload: strips of rows of C,
// vectors loaded from A, elements of B as the broadcast operand. Each store
// writes back the current content of C plus the accumulated product multiplied
// by `factor`.
// NOTE(review): the declarations of `factor`, `a1`..`a4`, `b1`, `b2` and of the
// loop counters `i` and `j` are on lines not visible in this excerpt; `factor` is
// presumably the scalar broadcast into an intrinsic vector -- confirm.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3812 template<
typename MT3
3816 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3817 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3819 typedef IntrinsicTrait<ElementType> IT;
3821 const size_t M( A.rows() );
3822 const size_t N( B.columns() );
3823 const size_t K( A.columns() );
// Strips of 8 intrinsic vectors (8*IT::size rows), one column of C at a time.
3829 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3830 for(
size_t j=0UL; j<N; ++j ) {
3831 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3832 for(
size_t k=0UL; k<K; ++k ) {
3834 xmm1 = xmm1 + A.load(i ,k) * b1;
3835 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3836 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3837 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3838 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3839 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3840 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3841 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
// Read-modify-write: the old value of C is loaded and the scaled sum is added.
3843 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3844 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3845 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3846 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3847 (~C).
store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
3848 (~C).
store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
3849 (~C).
store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
3850 (~C).
store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
// Strips of 4 intrinsic vectors; columns in pairs (4x2 register block), then a
// single leftover column.
3853 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3855 for( ; (j+2UL) <= N; j+=2UL ) {
3856 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3857 for(
size_t k=0UL; k<K; ++k ) {
3864 xmm1 = xmm1 + a1 * b1;
3865 xmm2 = xmm2 + a2 * b1;
3866 xmm3 = xmm3 + a3 * b1;
3867 xmm4 = xmm4 + a4 * b1;
3868 xmm5 = xmm5 + a1 * b2;
3869 xmm6 = xmm6 + a2 * b2;
3870 xmm7 = xmm7 + a3 * b2;
3871 xmm8 = xmm8 + a4 * b2;
3873 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3874 (~C).
store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
3875 (~C).
store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
3876 (~C).
store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
3877 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
3878 (~C).
store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
3879 (~C).
store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
3880 (~C).
store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
// Single leftover column of the 4-vector strip (the enclosing guard is not visible).
3884 for(
size_t k=0UL; k<K; ++k ) {
3886 xmm1 = xmm1 + A.load(i ,k) * b1;
3887 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3888 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3889 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3891 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3892 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3893 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3894 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
// Strips of 2 intrinsic vectors; columns in pairs (2x2 register block), then a
// single leftover column.
3897 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3899 for( ; (j+2UL) <= N; j+=2UL ) {
3901 for(
size_t k=0UL; k<K; ++k ) {
3906 xmm1 = xmm1 + a1 * b1;
3907 xmm2 = xmm2 + a2 * b1;
3908 xmm3 = xmm3 + a1 * b2;
3909 xmm4 = xmm4 + a2 * b2;
3911 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3912 (~C).
store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
3913 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
3914 (~C).
store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
// Single leftover column of the 2-vector strip (the enclosing guard is not visible).
3918 for(
size_t k=0UL; k<K; ++k ) {
3920 xmm1 = xmm1 + A.load(i ,k) * b1;
3921 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3923 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3924 (~C).
store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
// Final strip of a single intrinsic vector; columns in pairs, then a single
// leftover column. Elements of B are broadcast with set().
// NOTE(review): the guard that enters this section is not visible in this excerpt.
3929 for( ; (j+2UL) <= N; j+=2UL ) {
3931 for(
size_t k=0UL; k<K; ++k ) {
3933 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3934 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3936 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3937 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
3941 for(
size_t k=0UL; k<K; ++k ) {
3942 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3944 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Fallback overload of the BLAS addition assignment kernel selector for the
// scaled product. Enabled only when UseDefaultKernel<MT3,MT4,MT5,ST2>::value is
// set, i.e. BLAS mode is off or no BLAS-specific kernel matches the types.
// Forwards all four arguments to selectDefaultAddAssignKernel().
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3964 template<
typename MT3
3968 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3969 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3971 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS addition assignment kernel for single precision operands.
// Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>::value is set (all
// element types float, scalar type not complex).
// Calls cblas_sgemm with alpha = scalar and beta = 1, i.e. scalar * A*B is added
// to the existing content of C.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
3990 template<
typename MT3
3994 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3995 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3997 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
4003 const int M ( numeric_cast<int>( A.rows() ) );
4004 const int N ( numeric_cast<int>( B.columns() ) );
4005 const int K ( numeric_cast<int>( A.columns() ) );
4006 const int lda( numeric_cast<int>( A.spacing() ) );
4007 const int ldb( numeric_cast<int>( B.spacing() ) );
4008 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
4010 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4011 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4012 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4013 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition assignment kernel for double precision operands.
// Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>::value is set (all
// element types double, scalar type not complex).
// Calls cblas_dgemm with alpha = scalar and beta = 1, i.e. scalar * A*B is added
// to the existing content of C.
//   C: target matrix, A: left operand, B: right operand, scalar: scaling factor
4033 template<
typename MT3
4037 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4038 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4040 using boost::numeric_cast;
// Range-checked size_t -> int conversions of the dimensions and leading dimensions.
4046 const int M ( numeric_cast<int>( A.rows() ) );
4047 const int N ( numeric_cast<int>( B.columns() ) );
4048 const int K ( numeric_cast<int>( A.columns() ) );
4049 const int lda( numeric_cast<int>( A.spacing() ) );
4050 const int ldb( numeric_cast<int>( B.spacing() ) );
4051 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows C; the transposition flags of A and B are swapped
// depending on whether C is row-major or column-major.
4053 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4054 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4055 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4056 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// selectBlasAddAssignKernel -- single-precision complex BLAS overload.
// Selected through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true,
// i.e. when all three element types are complex<float>.
// The dimensions and spacings are converted to int with boost::numeric_cast. alpha is a
// complex<float> built from scalar and beta is (1,0); both are passed to cblas_cgemm by
// address, so the scaled product of A and B is accumulated onto the existing C.
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4076 template<
typename MT3
4080 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4081 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4083 using boost::numeric_cast;
4092 const int M ( numeric_cast<int>( A.rows() ) );
4093 const int N ( numeric_cast<int>( B.columns() ) );
4094 const int K ( numeric_cast<int>( A.columns() ) );
4095 const int lda( numeric_cast<int>( A.spacing() ) );
4096 const int ldb( numeric_cast<int>( B.spacing() ) );
4097 const int ldc( numeric_cast<int>( C.spacing() ) );
4098 const complex<float> alpha( scalar );
4099 const complex<float> beta ( 1.0F, 0.0F );
// The layout flag follows the storage order of C (MT3). For a row-major C, A is passed as
// transposed and B as non-transposed; for any other C the two transpose flags are swapped.
4101 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4102 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4103 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4104 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// selectBlasAddAssignKernel -- double-precision complex BLAS overload.
// Selected through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true,
// i.e. when all three element types are complex<double>.
// The dimensions and spacings are converted to int with boost::numeric_cast. alpha is a
// complex<double> built from scalar and beta is (1,0); both are passed to cblas_zgemm by
// address, so the scaled product of A and B is accumulated onto the existing C.
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4124 template<
typename MT3
4128 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4129 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4131 using boost::numeric_cast;
4140 const int M ( numeric_cast<int>( A.rows() ) );
4141 const int N ( numeric_cast<int>( B.columns() ) );
4142 const int K ( numeric_cast<int>( A.columns() ) );
4143 const int lda( numeric_cast<int>( A.spacing() ) );
4144 const int ldb( numeric_cast<int>( B.spacing() ) );
4145 const int ldc( numeric_cast<int>( C.spacing() ) );
4146 const complex<double> alpha( scalar );
4147 const complex<double> beta ( 1.0, 0.0 );
// The layout flag follows the storage order of C (MT3). For a row-major C, A is passed as
// transposed and B as non-transposed; for any other C the two transpose flags are swapped.
4149 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4150 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4151 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4152 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// subAssign -- subtraction assignment of a scaled matrix product (DMatScalarMultExpr rhs)
// to the dense matrix lhs.
// Takes the left and right operands of the wrapped product rhs.matrix_, tests for an empty
// target or a zero inner dimension (left.columns() == 0UL), and otherwise dispatches to
// selectSubAssignKernel with the operands A, B and the scaling factor rhs.scalar_.
// NOTE(review): excerpted listing -- the body of the if-branch (presumably an early return)
// and the definitions of A and B are not visible here.
4173 template<
typename MT
4175 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4182 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4183 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4185 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
4199 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// selectSubAssignKernel -- chooses the kernel for the scaled subtraction assignment.
// Calls either selectDefaultSubAssignKernel or selectBlasSubAssignKernel with the same
// arguments (C, A, B, scalar).
// NOTE(review): excerpted listing -- the condition that picks between the two calls is not
// visible here.
4214 template<
typename MT3
4218 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4221 DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
4223 DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
// selectDefaultSubAssignKernel -- non-vectorized overload.
// Selected through DisableIf, i.e. when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false.
// NOTE(review): excerpted listing -- only the signature is visible; the function body is
// not part of this listing.
4241 template<
typename MT3
4245 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4246 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// selectDefaultSubAssignKernel -- vectorized overload for a DenseMatrix<MT3,false> target
// (storage-order flag false, i.e. row-major).
// Selected through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// With M = A.rows(), N = B.columns(), K = A.columns(), the columns of C are walked in
// strips of 8, 4, 2 and finally 1 intrinsic vector(s) of IT::size elements. For each strip
// the products over k are summed in the xmm accumulators and the result is written back as
// C.load(...) - xmm * factor.
// NOTE(review): excerpted listing -- the declarations of i, j, a1/a2, b1..b4 and factor
// (presumably the broadcast scalar) as well as all braces are not visible here.
4267 template<
typename MT3
4271 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4272 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4274 typedef IntrinsicTrait<ElementType> IT;
4276 const size_t M( A.rows() );
4277 const size_t N( B.columns() );
4278 const size_t K( A.columns() );
// Strip of 8 intrinsic vectors per row: one row i at a time, eight accumulators.
4284 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
4285 for(
size_t i=0UL; i<M; ++i ) {
4286 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4287 for(
size_t k=0UL; k<K; ++k ) {
4289 xmm1 = xmm1 + a1 * B.load(k,j );
4290 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4291 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4292 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4293 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
4294 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
4295 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
4296 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
4298 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4299 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4300 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4301 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
4302 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
4303 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
4304 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
4305 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
// Strip of 4 intrinsic vectors: two rows (i, i+1) at a time, then a single remaining row.
4308 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
4310 for( ; (i+2UL) <= M; i+=2UL ) {
4311 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4312 for(
size_t k=0UL; k<K; ++k ) {
4319 xmm1 = xmm1 + a1 * b1;
4320 xmm2 = xmm2 + a1 * b2;
4321 xmm3 = xmm3 + a1 * b3;
4322 xmm4 = xmm4 + a1 * b4;
4323 xmm5 = xmm5 + a2 * b1;
4324 xmm6 = xmm6 + a2 * b2;
4325 xmm7 = xmm7 + a2 * b3;
4326 xmm8 = xmm8 + a2 * b4;
4328 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4329 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
4330 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
4331 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
4332 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
4333 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
4334 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
4335 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
4339 for(
size_t k=0UL; k<K; ++k ) {
4341 xmm1 = xmm1 + a1 * B.load(k,j );
4342 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4343 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4344 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4346 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4347 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4348 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4349 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
// Strip of 2 intrinsic vectors: two rows at a time, then a single remaining row.
4352 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
4354 for( ; (i+2UL) <= M; i+=2UL ) {
4356 for(
size_t k=0UL; k<K; ++k ) {
4361 xmm1 = xmm1 + a1 * b1;
4362 xmm2 = xmm2 + a1 * b2;
4363 xmm3 = xmm3 + a2 * b1;
4364 xmm4 = xmm4 + a2 * b2;
4366 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4367 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
4368 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
4369 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
4373 for(
size_t k=0UL; k<K; ++k ) {
4375 xmm1 = xmm1 + a1 * B.load(k,j );
4376 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
4378 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4379 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
// Last single intrinsic vector: elements of A are broadcast with set(), two rows at a
// time, then a single remaining row.
4384 for( ; (i+2UL) <= M; i+=2UL ) {
4386 for(
size_t k=0UL; k<K; ++k ) {
4388 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
4389 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
4391 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4392 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
4396 for(
size_t k=0UL; k<K; ++k ) {
4397 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
4399 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// selectDefaultSubAssignKernel -- vectorized overload for a DenseMatrix<MT3,true> target
// (storage-order flag true; NOTE(review): presumably column-major -- confirm).
// Selected through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// With M = A.rows(), N = B.columns(), K = A.columns(), the rows of C are walked in strips
// of 8, 4, 2 and finally 1 intrinsic vector(s) of IT::size elements. For each strip the
// products over k are summed in the xmm accumulators and the result is written back as
// C.load(...) - xmm * factor.
// NOTE(review): excerpted listing -- the declarations of i, j, a1..a4, b1/b2 and factor
// (presumably the broadcast scalar) as well as all braces are not visible here.
4419 template<
typename MT3
4423 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4424 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4426 typedef IntrinsicTrait<ElementType> IT;
4428 const size_t M( A.rows() );
4429 const size_t N( B.columns() );
4430 const size_t K( A.columns() );
// Strip of 8 intrinsic vectors per column: one column j at a time, eight accumulators.
4436 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4437 for(
size_t j=0UL; j<N; ++j ) {
4438 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4439 for(
size_t k=0UL; k<K; ++k ) {
4441 xmm1 = xmm1 + A.load(i ,k) * b1;
4442 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4443 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4444 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4445 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4446 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4447 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4448 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
4450 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4451 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4452 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4453 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4454 (~C).
store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
4455 (~C).
store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
4456 (~C).
store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
4457 (~C).
store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
// Strip of 4 intrinsic vectors: two columns (j, j+1) at a time, then a single remaining column.
4460 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
4462 for( ; (j+2UL) <= N; j+=2UL ) {
4463 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4464 for(
size_t k=0UL; k<K; ++k ) {
4471 xmm1 = xmm1 + a1 * b1;
4472 xmm2 = xmm2 + a2 * b1;
4473 xmm3 = xmm3 + a3 * b1;
4474 xmm4 = xmm4 + a4 * b1;
4475 xmm5 = xmm5 + a1 * b2;
4476 xmm6 = xmm6 + a2 * b2;
4477 xmm7 = xmm7 + a3 * b2;
4478 xmm8 = xmm8 + a4 * b2;
4480 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4481 (~C).
store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
4482 (~C).
store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
4483 (~C).
store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
4484 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4485 (~C).
store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
4486 (~C).
store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
4487 (~C).
store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
4491 for(
size_t k=0UL; k<K; ++k ) {
4493 xmm1 = xmm1 + A.load(i ,k) * b1;
4494 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4495 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4496 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4498 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4499 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4500 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4501 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
// Strip of 2 intrinsic vectors: two columns at a time, then a single remaining column.
4504 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
4506 for( ; (j+2UL) <= N; j+=2UL ) {
4508 for(
size_t k=0UL; k<K; ++k ) {
4513 xmm1 = xmm1 + a1 * b1;
4514 xmm2 = xmm2 + a2 * b1;
4515 xmm3 = xmm3 + a1 * b2;
4516 xmm4 = xmm4 + a2 * b2;
4518 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4519 (~C).
store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
4520 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4521 (~C).
store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
4525 for(
size_t k=0UL; k<K; ++k ) {
4527 xmm1 = xmm1 + A.load(i ,k) * b1;
4528 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4530 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4531 (~C).
store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
// Last single intrinsic vector: elements of B are broadcast with set(), two columns at a
// time, then a single remaining column.
4536 for( ; (j+2UL) <= N; j+=2UL ) {
4538 for(
size_t k=0UL; k<K; ++k ) {
4540 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4541 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4543 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4544 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
4548 for(
size_t k=0UL; k<K; ++k ) {
4549 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4551 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// selectBlasSubAssignKernel -- default fallback overload.
// Selected through EnableIf when UseDefaultKernel<MT3,MT4,MT5,ST2> is true; it forwards
// C, A, B and scalar unchanged to selectDefaultSubAssignKernel().
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4571 template<
typename MT3
4575 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4576 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4578 selectDefaultSubAssignKernel( C, A, B, scalar );
// selectBlasSubAssignKernel -- single-precision BLAS overload.
// Selected through EnableIf when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
// The dimensions (M, N, K) and the spacings of A, B and C (lda, ldb, ldc) are converted
// to int with boost::numeric_cast and handed to cblas_sgemm with alpha = -scalar and
// beta = 1.0F, so the scaled product of A and B is subtracted from the existing C.
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4597 template<
typename MT3
4601 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4602 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4604 using boost::numeric_cast;
4610 const int M ( numeric_cast<int>( A.rows() ) );
4611 const int N ( numeric_cast<int>( B.columns() ) );
4612 const int K ( numeric_cast<int>( A.columns() ) );
4613 const int lda( numeric_cast<int>( A.spacing() ) );
4614 const int ldb( numeric_cast<int>( B.spacing() ) );
4615 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout flag follows the storage order of C (MT3). For a row-major C, A is passed as
// transposed and B as non-transposed; for any other C the two transpose flags are swapped.
4617 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4618 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4619 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4620 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// selectBlasSubAssignKernel -- double-precision BLAS overload.
// Selected through EnableIf when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
// The dimensions (M, N, K) and the spacings of A, B and C (lda, ldb, ldc) are converted
// to int with boost::numeric_cast and handed to cblas_dgemm with alpha = -scalar and
// beta = 1.0, so the scaled product of A and B is subtracted from the existing C.
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4640 template<
typename MT3
4644 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4645 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4647 using boost::numeric_cast;
4653 const int M ( numeric_cast<int>( A.rows() ) );
4654 const int N ( numeric_cast<int>( B.columns() ) );
4655 const int K ( numeric_cast<int>( A.columns() ) );
4656 const int lda( numeric_cast<int>( A.spacing() ) );
4657 const int ldb( numeric_cast<int>( B.spacing() ) );
4658 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout flag follows the storage order of C (MT3). For a row-major C, A is passed as
// transposed and B as non-transposed; for any other C the two transpose flags are swapped.
4660 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4661 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4662 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4663 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// selectBlasSubAssignKernel -- single-precision complex BLAS overload.
// Selected through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true,
// i.e. when all three element types are complex<float>.
// The dimensions and spacings are converted to int with boost::numeric_cast. alpha is a
// complex<float> built from -scalar and beta is (1,0); both are passed to cblas_cgemm by
// address, so the scaled product of A and B is subtracted from the existing C.
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4683 template<
typename MT3
4687 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4688 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4690 using boost::numeric_cast;
4699 const int M ( numeric_cast<int>( A.rows() ) );
4700 const int N ( numeric_cast<int>( B.columns() ) );
4701 const int K ( numeric_cast<int>( A.columns() ) );
4702 const int lda( numeric_cast<int>( A.spacing() ) );
4703 const int ldb( numeric_cast<int>( B.spacing() ) );
4704 const int ldc( numeric_cast<int>( C.spacing() ) );
4705 const complex<float> alpha( -scalar );
4706 const complex<float> beta ( 1.0F, 0.0F );
// The layout flag follows the storage order of C (MT3). For a row-major C, A is passed as
// transposed and B as non-transposed; for any other C the two transpose flags are swapped.
4708 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4709 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4710 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4711 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// selectBlasSubAssignKernel -- double-precision complex BLAS overload.
// Selected through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true,
// i.e. when all three element types are complex<double>.
// The dimensions and spacings are converted to int with boost::numeric_cast. alpha is a
// complex<double> built from -scalar and beta is (1,0); both are passed to cblas_zgemm by
// address, so the scaled product of A and B is subtracted from the existing C.
// NOTE(review): excerpted listing -- remaining template parameters and braces are not visible.
4731 template<
typename MT3
4735 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4736 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4738 using boost::numeric_cast;
4747 const int M ( numeric_cast<int>( A.rows() ) );
4748 const int N ( numeric_cast<int>( B.columns() ) );
4749 const int K ( numeric_cast<int>( A.columns() ) );
4750 const int lda( numeric_cast<int>( A.spacing() ) );
4751 const int ldb( numeric_cast<int>( B.spacing() ) );
4752 const int ldc( numeric_cast<int>( C.spacing() ) );
4753 const complex<double> alpha( -scalar );
4754 const complex<double> beta ( 1.0, 0.0 );
// The layout flag follows the storage order of C (MT3). For a row-major C, A is passed as
// transposed and B as non-transposed; for any other C the two transpose flags are swapped.
4756 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4757 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4758 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4759 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// smpAssign -- SMP assignment of a scaled matrix product (DMatScalarMultExpr rhs) to the
// dense matrix lhs. Only available when UseSMPAssign<MT> is true (EnableIf).
// Takes the left and right operands of the wrapped product rhs.matrix_ and distinguishes
// two special cases: an empty target (zero rows or columns) and a zero inner dimension
// (left.columns() == 0UL).
// NOTE(review): excerpted listing -- the bodies of both branches and the actual assignment
// are not visible here.
4790 template<
typename MT
4792 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4793 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4800 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4801 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4803 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4806 else if( left.columns() == 0UL ) {
// smpAssign -- SMP assignment of a scaled matrix product (DMatScalarMultExpr rhs) to the
// sparse matrix lhs. Only available when UseSMPAssign<MT> is true (EnableIf).
// TmpType is chosen between ResultType and OppositeType depending on the storage order SO
// of the target; rhs is first evaluated into the constant temporary tmp of that type.
// NOTE(review): excerpted listing -- the statement that assigns tmp to lhs is not visible here.
4839 template<
typename MT
4841 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4842 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4846 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
4858 const TmpType tmp( rhs );
// smpAddAssign -- SMP addition assignment of a scaled matrix product (DMatScalarMultExpr
// rhs) to the dense matrix lhs. Only available when UseSMPAssign<MT> is true (EnableIf).
// Takes the left and right operands of the wrapped product rhs.matrix_ and tests for an
// empty target or a zero inner dimension (left.columns() == 0UL).
// NOTE(review): excerpted listing -- the body of the if-branch (presumably an early return)
// and the actual addition assignment are not visible here.
4877 template<
typename MT
4879 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4880 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4887 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4888 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4890 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// smpSubAssign -- SMP subtraction assignment of a scaled matrix product (DMatScalarMultExpr
// rhs) to the dense matrix lhs. Only available when UseSMPAssign<MT> is true (EnableIf).
// Takes the left and right operands of the wrapped product rhs.matrix_ and tests for an
// empty target or a zero inner dimension (left.columns() == 0UL).
// NOTE(review): excerpted listing -- the body of the if-branch (presumably an early return)
// and the actual subtraction assignment are not visible here.
4926 template<
typename MT
4928 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4929 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4936 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4937 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4939 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Free function returning a TDMatDMatMultExpr<T1,T2> expression object.
// Throws std::invalid_argument with the message "Matrix sizes do not match".
// NOTE(review): excerpted listing -- the function name/parameter list (presumably operator*
// for a column-major left and a row-major right operand) and the size check guarding the
// throw are not visible here.
5021 template<
typename T1
5023 inline const TDMatDMatMultExpr<T1,T2>
5029 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression-trait typedef for multiplying a matrix product (MT1, MT2) with a vector VT.
// Type is TDMatDVecMultExprTrait< MT1, DMatDVecMultExprTrait<MT2,VT>::Type >::Type -- i.e.
// MT2 is combined with VT first and MT1 with that result -- when MT1 is a column-major
// dense matrix, MT2 a row-major dense matrix and VT a dense column vector; otherwise it is
// INVALID_TYPE.
// NOTE(review): excerpted listing -- the struct header naming the specialized trait is not
// visible here.
5046 template<
typename MT1,
typename MT2,
typename VT >
5051 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5052 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
5053 IsDenseVector<VT>::value && IsColumnVector<VT>::value
5054 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
5055 , INVALID_TYPE >::Type Type;
// Expression-trait typedef for multiplying a matrix product (MT1, MT2) with a sparse vector VT.
// Type is TDMatDVecMultExprTrait< MT1, DMatSVecMultExprTrait<MT2,VT>::Type >::Type -- i.e.
// MT2 is combined with VT first and MT1 with that result -- when MT1 is a column-major
// dense matrix, MT2 a row-major dense matrix and VT a sparse column vector; otherwise it is
// INVALID_TYPE.
// NOTE(review): excerpted listing -- the struct header naming the specialized trait is not
// visible here.
5064 template<
typename MT1,
typename MT2,
typename VT >
5069 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5070 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
5071 IsSparseVector<VT>::value && IsColumnVector<VT>::value
5072 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
5073 , INVALID_TYPE >::Type Type;
// Expression-trait typedef for multiplying a vector VT with a matrix product (MT1, MT2).
// Type is TDVecDMatMultExprTrait< TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type -- i.e.
// VT is combined with MT1 first and that result with MT2 -- when VT is a dense row vector,
// MT1 a column-major dense matrix and MT2 a row-major dense matrix; otherwise it is
// INVALID_TYPE.
// NOTE(review): excerpted listing -- the struct header naming the specialized trait is not
// visible here.
5082 template<
typename VT,
typename MT1,
typename MT2 >
5087 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
5088 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5089 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
5090 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
5091 , INVALID_TYPE >::Type Type;
// Expression-trait typedef for multiplying a sparse vector VT with a matrix product (MT1, MT2).
// Type is TDVecDMatMultExprTrait< TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type -- i.e.
// VT is combined with MT1 first and that result with MT2 -- when VT is a sparse row vector,
// MT1 a column-major dense matrix and MT2 a row-major dense matrix; otherwise it is
// INVALID_TYPE.
// NOTE(review): excerpted listing -- the struct header naming the specialized trait is not
// visible here.
5100 template<
typename VT,
typename MT1,
typename MT2 >
5105 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
5106 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5107 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
5108 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
5109 , INVALID_TYPE >::Type Type;
// Expression-trait typedef taking two matrix types and an alignment flag AF.
// Type is the MultExprTrait of SubmatrixExprTrait<const MT1,AF>::Type and
// SubmatrixExprTrait<const MT2,AF>::Type, i.e. the product of the submatrix types of both operands.
// NOTE(review): excerpted listing -- the struct header (presumably a SubmatrixExprTrait
// specialization for the product expression) is not visible here.
5118 template<
typename MT1,
typename MT2,
bool AF >
5123 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
5124 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Expression-trait typedef taking two matrix types.
// Type is the MultExprTrait of RowExprTrait<const MT1>::Type and MT2, i.e. the row type of
// the left operand multiplied with the right operand.
// NOTE(review): excerpted listing -- the struct header (presumably a RowExprTrait
// specialization for the product expression) is not visible here.
5133 template<
typename MT1,
typename MT2 >
5138 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Expression-trait typedef taking two matrix types.
// Type is the MultExprTrait of MT1 and ColumnExprTrait<const MT2>::Type, i.e. the left
// operand multiplied with the column type of the right operand.
// NOTE(review): excerpted listing -- the struct header (presumably a ColumnExprTrait
// specialization for the product expression) is not visible here.
5147 template<
typename MT1,
typename MT2 >
5152 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4599
EnableIf< IsIntegral< T > >::Type store(T *address, const typename Store< T, sizeof(T)>::Type &value)
Aligned store of a vector of integral values.
Definition: Store.h:223
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:341
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:4329
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:414
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:152
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:199
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:395
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2408
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:251
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:122
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:249
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:361
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:690
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:250
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:159
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:125
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:121
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:122
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:125
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:126
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:286
Header file for the multiplication trait.
Header file for the IsDouble type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:262
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:251
Header file for the DenseMatrix base class.
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold. This threshold specifie...
Definition: Thresholds.h:880
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:271
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:259
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:246
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:265
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2406
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:256
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the serial shim.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:252
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:92
Header file for the IsNumeric type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:123
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:301
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:405
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:331
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:124
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:249
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:385
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:351
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:250
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:248
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2403
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:331
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:253
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:415
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:301
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:247
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:373
Header file for the FunctionTrace class.