35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
114 template<
typename MT1
116 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
117 ,
private MatMatMultExpr
118 ,
private Computation
// Compile-time helper: 'value' is nonzero when evaluateLeft or evaluateRight is set.
// The template parameters T1, T2 and T3 do not take part in computing 'value'.
// It serves as the EnableIf/DisableIf condition on the select*AssignKernel overloads.
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseSMPAssignKernel {
147 enum { value = evaluateLeft || evaluateRight };
// Compile-time helper: 'value' is nonzero only if IsFloat holds for the element
// types of all three matrix types T1, T2 and T3. It is the EnableIf condition of
// the selectBlas*Kernel overloads that call cblas_sgemm.
157 template<
typename T1,
typename T2,
typename T3 >
158 struct UseSinglePrecisionKernel {
159 enum { value = IsFloat<typename T1::ElementType>::value &&
160 IsFloat<typename T2::ElementType>::value &&
161 IsFloat<typename T3::ElementType>::value };
// Compile-time helper: 'value' is nonzero only if IsDouble holds for the element
// types of all three matrix types T1, T2 and T3. It is the EnableIf condition of
// the selectBlas*Kernel overloads that call cblas_dgemm.
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDoublePrecisionKernel {
173 enum { value = IsDouble<typename T1::ElementType>::value &&
174 IsDouble<typename T2::ElementType>::value &&
175 IsDouble<typename T3::ElementType>::value };
// Compile-time helper: 'value' is nonzero only if the element types of T1, T2 and T3
// are all exactly complex<float> (checked with IsSame). It is the EnableIf condition
// of the selectBlas*Kernel overloads that call cblas_cgemm.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseSinglePrecisionComplexKernel {
188 typedef complex<float> Type;
189 enum { value = IsSame<typename T1::ElementType,Type>::value &&
190 IsSame<typename T2::ElementType,Type>::value &&
191 IsSame<typename T3::ElementType,Type>::value };
// Compile-time helper: 'value' is nonzero only if the element types of T1, T2 and T3
// are all exactly complex<double> (checked with IsSame). It is the EnableIf condition
// of the selectBlas*Kernel overloads that call cblas_zgemm.
202 template<
typename T1,
typename T2,
typename T3 >
203 struct UseDoublePrecisionComplexKernel {
204 typedef complex<double> Type;
205 enum { value = IsSame<typename T1::ElementType,Type>::value &&
206 IsSame<typename T2::ElementType,Type>::value &&
207 IsSame<typename T3::ElementType,Type>::value };
// Compile-time helper: 'value' is nonzero if BLAZE_BLAS_MODE evaluates to false, or if
// none of the four BLAS eligibility helpers (single, double, single complex, double
// complex) holds for T1, T2 and T3. It is the EnableIf condition of the selectBlas*Kernel
// overloads that forward to the default kernels.
// NOTE(review): with BLAZE_BLAS_MODE disabled this value is nonzero regardless of the
// element types, so a precision-specific selectBlas*Kernel overload could be viable at
// the same time - presumably those overloads are compiled out in that mode; confirm.
217 template<
typename T1,
typename T2,
typename T3 >
218 struct UseDefaultKernel {
219 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
220 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
221 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
222 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time helper: 'value' is nonzero only if
//  - T1, T2 and T3 are all flagged vectorizable,
//  - all three share the same element type, and
//  - IntrinsicTrait of that element type reports addition, subtraction and multiplication.
// It selects (EnableIf) the intrinsics-based selectDefault*Kernel overloads and
// deselects (DisableIf) the scalar ones.
232 template<
typename T1,
typename T2,
typename T3 >
233 struct UseVectorizedDefaultKernel {
234 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
235 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
236 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
237 IntrinsicTrait<typename T1::ElementType>::addition &&
238 IntrinsicTrait<typename T1::ElementType>::subtraction &&
239 IntrinsicTrait<typename T1::ElementType>::multiplication };
270 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
// Set only when neither evaluateLeft nor evaluateRight is set, i.e. the exact
// negation of UseSMPAssignKernel::value.
276 enum { smpAssignable = !evaluateLeft && !evaluateRight };
// Accumulates products lhs_(i,.) * rhs_(.,j) into tmp, two k-indices per loop step.
// Nothing is accumulated when lhs_ has no columns.
306 if(
lhs_.columns() != 0UL ) {
// 'end' is the largest odd value <= lhs_.columns(): (columns-1) is rounded down to an
// even number and 1 is added. With k starting at 1 and stepping by 2, both k and k+1
// therefore stay below lhs_.columns().
307 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// NOTE(review): the loop starts at k=1, so the k=0 term is presumably handled before
// the loop - confirm.
309 for(
size_t k=1UL; k<end; k+=2UL ) {
311 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// True exactly when lhs_.columns() is even: index 'end' is then the one remaining term.
313 if( end <
lhs_.columns() ) {
341 return rhs_.columns();
371 template<
typename T >
373 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
383 template<
typename T >
385 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
395 return lhs_.isAligned() &&
rhs_.isAligned();
427 template<
typename MT
436 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
439 else if( rhs.lhs_.columns() == 0UL ) {
454 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel selection for assigning the product A * B to C: forwards the three operands
// either to selectDefaultAssignKernel or to selectBlasAssignKernel.
// NOTE(review): confirm the criterion that decides between the two calls.
470 template<
typename MT3
474 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
477 TDMatDMatMultExpr::selectDefaultAssignKernel( C, A, B );
479 TDMatDMatMultExpr::selectBlasAssignKernel( C, A, B );
// Overload of selectAssignKernel that is enabled (EnableIf) when
// UseSMPAssignKernel<MT3,MT4,MT5> holds, i.e. when evaluateLeft or evaluateRight is set.
495 template<
typename MT3
498 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
499 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalar kernel for C = A * B, used when UseVectorizedDefaultKernel does not hold
// (DisableIf). All element access goes through operator()(row, column).
520 template<
typename MT3
523 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
524 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K is the shared inner dimension.
526 const size_t M( A.rows() );
527 const size_t N( B.columns() );
528 const size_t K( A.columns() );
530 for(
size_t i=0UL; i<M; ++i ) {
// First pass over row i: the k=0 term overwrites C(i,j), so C needs no prior reset.
// NOTE(review): reads A(i,0) and B(0,j), i.e. assumes K >= 1 - confirm callers filter K == 0.
531 for(
size_t j=0UL; j<N; ++j ) {
532 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining terms k = 1..K-1 are accumulated onto the initialized row.
534 for(
size_t k=1UL; k<K; ++k ) {
535 for(
size_t j=0UL; j<N; ++j ) {
536 C(i,j) += A(i,k) * B(k,j);
// Intrinsics-based kernel for C = A * B, enabled (EnableIf) when
// UseVectorizedDefaultKernel holds, for targets passed as DenseMatrix<MT3,false>.
// SIMD loads from B and stores to C run along the column index j; consecutive
// accumulators are IT::size columns apart. The column range is processed in strips of
// 8, 4, 2 and finally 1 SIMD vector.
558 template<
typename MT3
561 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
562 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
564 typedef IntrinsicTrait<ElementType> IT;
566 const size_t M( A.rows() );
567 const size_t N( B.columns() );
568 const size_t K( A.columns() );
// Strip of 8 SIMD vectors per row. The condition only requires the start of the 8th
// vector to be below N, so that vector may extend past column N-1.
// NOTE(review): presumably relies on padded storage - confirm.
572 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
573 for(
size_t i=0UL; i<M; ++i ) {
// NOTE(review): the accumulators have no explicit initializer; assumes default
// construction of IntrinsicType yields zero - TODO confirm.
574 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
575 for(
size_t k=0UL; k<K; ++k ) {
577 xmm1 = xmm1 + a1 * B.load(k,j );
578 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
579 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
580 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
581 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
582 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
583 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
584 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
586 (~C).
store( i, j , xmm1 );
587 (~C).
store( i, j+IT::size , xmm2 );
588 (~C).
store( i, j+IT::size*2UL, xmm3 );
589 (~C).
store( i, j+IT::size*3UL, xmm4 );
590 (~C).
store( i, j+IT::size*4UL, xmm5 );
591 (~C).
store( i, j+IT::size*5UL, xmm6 );
592 (~C).
store( i, j+IT::size*6UL, xmm7 );
593 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Strip of 4 SIMD vectors; rows are taken in pairs (i, i+1) so each loaded B vector
// is used for two rows.
596 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
598 for( ; (i+2UL) <= M; i+=2UL ) {
599 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
600 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
607 xmm1 = xmm1 + a1 * b1;
608 xmm2 = xmm2 + a1 * b2;
609 xmm3 = xmm3 + a1 * b3;
610 xmm4 = xmm4 + a1 * b4;
611 xmm5 = xmm5 + a2 * b1;
612 xmm6 = xmm6 + a2 * b2;
613 xmm7 = xmm7 + a2 * b3;
614 xmm8 = xmm8 + a2 * b4;
616 (~C).
store( i , j , xmm1 );
617 (~C).
store( i , j+IT::size , xmm2 );
618 (~C).
store( i , j+IT::size*2UL, xmm3 );
619 (~C).
store( i , j+IT::size*3UL, xmm4 );
620 (~C).
store( i+1UL, j , xmm5 );
621 (~C).
store( i+1UL, j+IT::size , xmm6 );
622 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
623 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single-row variant of the 4-vector strip (one row i, four accumulators).
627 for(
size_t k=0UL; k<K; ++k ) {
629 xmm1 = xmm1 + a1 * B.load(k,j );
630 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
631 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
632 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
634 (~C).
store( i, j , xmm1 );
635 (~C).
store( i, j+IT::size , xmm2 );
636 (~C).
store( i, j+IT::size*2UL, xmm3 );
637 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Strip of 2 SIMD vectors, again with rows in pairs followed by a single-row variant.
640 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
642 for( ; (i+2UL) <= M; i+=2UL ) {
644 for(
size_t k=0UL; k<K; ++k ) {
649 xmm1 = xmm1 + a1 * b1;
650 xmm2 = xmm2 + a1 * b2;
651 xmm3 = xmm3 + a2 * b1;
652 xmm4 = xmm4 + a2 * b2;
654 (~C).
store( i , j , xmm1 );
655 (~C).
store( i , j+IT::size, xmm2 );
656 (~C).
store( i+1UL, j , xmm3 );
657 (~C).
store( i+1UL, j+IT::size, xmm4 );
661 for(
size_t k=0UL; k<K; ++k ) {
663 xmm1 = xmm1 + a1 * B.load(k,j );
664 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
666 (~C).
store( i, j , xmm1 );
667 (~C).
store( i, j+IT::size, xmm2 );
// Single SIMD vector at column j: row pairs first, then a single row. The scalar
// A(i,k) is expanded with set() before the multiplication.
672 for( ; (i+2UL) <= M; i+=2UL ) {
674 for(
size_t k=0UL; k<K; ++k ) {
676 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
677 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
679 (~C).
store( i , j, xmm1 );
680 (~C).
store( i+1UL, j, xmm2 );
684 for(
size_t k=0UL; k<K; ++k ) {
685 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
687 (~C).
store( i, j, xmm1 );
// Intrinsics-based kernel for C = A * B, enabled (EnableIf) when
// UseVectorizedDefaultKernel holds, for targets passed as DenseMatrix<MT3,true>.
// SIMD loads from A and stores to C run along the row index i; consecutive
// accumulators are IT::size rows apart. The row range is processed in strips of
// 8, 4, 2 and finally 1 SIMD vector.
708 template<
typename MT3
711 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
712 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
714 typedef IntrinsicTrait<ElementType> IT;
716 const size_t M( A.rows() );
717 const size_t N( B.columns() );
718 const size_t K( A.columns() );
// Strip of 8 SIMD vectors per column. The condition only requires the start of the 8th
// vector to be below M, so that vector may extend past row M-1.
// NOTE(review): presumably relies on padded storage - confirm.
722 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
723 for(
size_t j=0UL; j<N; ++j ) {
// NOTE(review): the accumulators have no explicit initializer; assumes default
// construction of IntrinsicType yields zero - TODO confirm.
724 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
725 for(
size_t k=0UL; k<K; ++k ) {
727 xmm1 = xmm1 + A.load(i ,k) * b1;
728 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
729 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
730 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
731 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
732 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
733 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
734 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
736 (~C).
store( i , j, xmm1 );
737 (~C).
store( i+IT::size , j, xmm2 );
738 (~C).
store( i+IT::size*2UL, j, xmm3 );
739 (~C).
store( i+IT::size*3UL, j, xmm4 );
740 (~C).
store( i+IT::size*4UL, j, xmm5 );
741 (~C).
store( i+IT::size*5UL, j, xmm6 );
742 (~C).
store( i+IT::size*6UL, j, xmm7 );
743 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Strip of 4 SIMD vectors; columns are taken in pairs (j, j+1) so each loaded A vector
// is used for two columns.
746 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
748 for( ; (j+2UL) <= N; j+=2UL ) {
749 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
750 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (factor b1), xmm5..xmm8 to column j+1 (factor b2).
757 xmm1 = xmm1 + a1 * b1;
758 xmm2 = xmm2 + a2 * b1;
759 xmm3 = xmm3 + a3 * b1;
760 xmm4 = xmm4 + a4 * b1;
761 xmm5 = xmm5 + a1 * b2;
762 xmm6 = xmm6 + a2 * b2;
763 xmm7 = xmm7 + a3 * b2;
764 xmm8 = xmm8 + a4 * b2;
766 (~C).
store( i , j , xmm1 );
767 (~C).
store( i+IT::size , j , xmm2 );
768 (~C).
store( i+IT::size*2UL, j , xmm3 );
769 (~C).
store( i+IT::size*3UL, j , xmm4 );
770 (~C).
store( i , j+1UL, xmm5 );
771 (~C).
store( i+IT::size , j+1UL, xmm6 );
772 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
773 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector strip (one column j, four accumulators).
777 for(
size_t k=0UL; k<K; ++k ) {
779 xmm1 = xmm1 + A.load(i ,k) * b1;
780 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
781 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
782 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
784 (~C).
store( i , j, xmm1 );
785 (~C).
store( i+IT::size , j, xmm2 );
786 (~C).
store( i+IT::size*2UL, j, xmm3 );
787 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Strip of 2 SIMD vectors, again with columns in pairs followed by a single-column variant.
790 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
792 for( ; (j+2UL) <= N; j+=2UL ) {
794 for(
size_t k=0UL; k<K; ++k ) {
799 xmm1 = xmm1 + a1 * b1;
800 xmm2 = xmm2 + a2 * b1;
801 xmm3 = xmm3 + a1 * b2;
802 xmm4 = xmm4 + a2 * b2;
804 (~C).
store( i , j , xmm1 );
805 (~C).
store( i+IT::size, j , xmm2 );
806 (~C).
store( i , j+1UL, xmm3 );
807 (~C).
store( i+IT::size, j+1UL, xmm4 );
811 for(
size_t k=0UL; k<K; ++k ) {
813 xmm1 = xmm1 + A.load(i ,k) * b1;
814 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
816 (~C).
store( i , j, xmm1 );
817 (~C).
store( i+IT::size, j, xmm2 );
// Single SIMD vector at row i: column pairs first, then a single column. The scalar
// B(k,j) is expanded with set() before the multiplication.
822 for( ; (j+2UL) <= N; j+=2UL ) {
824 for(
size_t k=0UL; k<K; ++k ) {
826 xmm1 = xmm1 + a1 *
set( B(k,j ) );
827 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
829 (~C).
store( i, j , xmm1 );
830 (~C).
store( i, j+1UL, xmm2 );
834 for(
size_t k=0UL; k<K; ++k ) {
835 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
837 (~C).
store( i, j, xmm1 );
// Fallback overload of selectBlasAssignKernel, enabled (EnableIf) when UseDefaultKernel
// holds: no BLAS routine is called, the operands are forwarded unchanged to
// selectDefaultAssignKernel.
858 template<
typename MT3
861 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
862 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
864 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for C = A * B, enabled (EnableIf) when UseSinglePrecisionKernel holds
// for the three matrix types; the multiplication is delegated to cblas_sgemm.
884 template<
typename MT3
887 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
888 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
890 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
896 const int M ( numeric_cast<int>( A.rows() ) );
897 const int N ( numeric_cast<int>( B.columns() ) );
898 const int K ( numeric_cast<int>( A.columns() ) );
899 const int lda( numeric_cast<int>( A.spacing() ) );
900 const int ldb( numeric_cast<int>( B.spacing() ) );
901 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0F, beta = 0.0F: the previous contents of C do not contribute to the result.
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
903 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
904 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
905 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
906 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for C = A * B, enabled (EnableIf) when UseDoublePrecisionKernel holds
// for the three matrix types; the multiplication is delegated to cblas_dgemm.
927 template<
typename MT3
930 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
931 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
933 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
939 const int M ( numeric_cast<int>( A.rows() ) );
940 const int N ( numeric_cast<int>( B.columns() ) );
941 const int K ( numeric_cast<int>( A.columns() ) );
942 const int lda( numeric_cast<int>( A.spacing() ) );
943 const int ldb( numeric_cast<int>( B.spacing() ) );
944 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0, beta = 0.0: the previous contents of C do not contribute to the result.
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
946 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
947 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
948 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
949 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for C = A * B, enabled (EnableIf) when UseSinglePrecisionComplexKernel
// holds for the three matrix types; the multiplication is delegated to cblas_cgemm.
970 template<
typename MT3
973 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
974 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
976 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
985 const int M ( numeric_cast<int>( A.rows() ) );
986 const int N ( numeric_cast<int>( B.columns() ) );
987 const int K ( numeric_cast<int>( A.columns() ) );
988 const int lda( numeric_cast<int>( A.spacing() ) );
989 const int ldb( numeric_cast<int>( B.spacing() ) );
990 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = (1,0), beta = (0,0), so
// the previous contents of C do not contribute to the result.
991 const complex<float> alpha( 1.0F, 0.0F );
992 const complex<float> beta ( 0.0F, 0.0F );
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
994 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
995 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
996 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
997 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C = A * B, enabled (EnableIf) when UseDoublePrecisionComplexKernel
// holds for the three matrix types; the multiplication is delegated to cblas_zgemm.
1018 template<
typename MT3
1021 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1022 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1024 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
1033 const int M ( numeric_cast<int>( A.rows() ) );
1034 const int N ( numeric_cast<int>( B.columns() ) );
1035 const int K ( numeric_cast<int>( A.columns() ) );
1036 const int lda( numeric_cast<int>( A.spacing() ) );
1037 const int ldb( numeric_cast<int>( B.spacing() ) );
1038 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = (1,0), beta = (0,0), so
// the previous contents of C do not contribute to the result.
1039 const complex<double> alpha( 1.0, 0.0 );
1040 const complex<double> beta ( 0.0, 0.0 );
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
1042 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1043 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1044 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1045 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1063 template<
typename MT
1069 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1081 const TmpType tmp( rhs );
1100 template<
typename MT
1109 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1123 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for C += A * B, active (DisableIf) when UseSMPAssignKernel does not
// hold: forwards the three operands either to selectDefaultAddAssignKernel or to
// selectBlasAddAssignKernel.
// NOTE(review): confirm the criterion that decides between the two calls.
1139 template<
typename MT3
1142 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1143 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1146 TDMatDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1148 TDMatDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
// Overload of selectAddAssignKernel that is enabled (EnableIf) when
// UseSMPAssignKernel<MT3,MT4,MT5> holds, i.e. when evaluateLeft or evaluateRight is set.
1164 template<
typename MT3
1167 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1168 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalar kernel for C += A * B, used when UseVectorizedDefaultKernel does not hold
// (DisableIf). All element access goes through operator()(row, column).
1189 template<
typename MT3
1192 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1193 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K is the shared inner dimension.
1195 const size_t M( A.rows() );
1196 const size_t N( B.columns() );
1197 const size_t K( A.columns() );
// 'end' is N rounded down to an even number: the innermost loop handles two
// columns per step.
1200 const size_t end( N &
size_t(-2) );
1202 for(
size_t i=0UL; i<M; ++i ) {
1203 for(
size_t k=0UL; k<K; ++k ) {
1204 for(
size_t j=0UL; j<end; j+=2UL ) {
1205 C(i,j ) += A(i,k) * B(k,j );
1206 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Column 'end' is the one leftover column when N is odd.
// NOTE(review): confirm this statement is guarded by a check that end < N.
1209 C(i,end) += A(i,k) * B(k,end);
// Intrinsics-based kernel for C += A * B, enabled (EnableIf) when
// UseVectorizedDefaultKernel holds, for targets passed as DenseMatrix<MT3,false>.
// SIMD loads from B and stores to C run along the column index j; consecutive
// accumulators are IT::size columns apart. The column range is processed in strips of
// 8, 4, 2 and finally 1 SIMD vector.
// NOTE(review): the accumulators xmm* are set up before each k-loop, presumably from
// the current contents of C - confirm.
1231 template<
typename MT3
1234 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1235 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1237 typedef IntrinsicTrait<ElementType> IT;
1239 const size_t M( A.rows() );
1240 const size_t N( B.columns() );
1241 const size_t K( A.columns() );
// Strip of 8 SIMD vectors per row. The condition only requires the start of the 8th
// vector to be below N, so that vector may extend past column N-1.
// NOTE(review): presumably relies on padded storage - confirm.
1245 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1246 for(
size_t i=0UL; i<M; ++i ) {
1255 for(
size_t k=0UL; k<K; ++k ) {
1257 xmm1 = xmm1 + a1 * B.load(k,j );
1258 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1259 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1260 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1261 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1262 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1263 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1264 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1266 (~C).
store( i, j , xmm1 );
1267 (~C).
store( i, j+IT::size , xmm2 );
1268 (~C).
store( i, j+IT::size*2UL, xmm3 );
1269 (~C).
store( i, j+IT::size*3UL, xmm4 );
1270 (~C).
store( i, j+IT::size*4UL, xmm5 );
1271 (~C).
store( i, j+IT::size*5UL, xmm6 );
1272 (~C).
store( i, j+IT::size*6UL, xmm7 );
1273 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Strip of 4 SIMD vectors; rows are taken in pairs (i, i+1) so each loaded B vector
// is used for two rows.
1276 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1278 for( ; (i+2UL) <= M; i+=2UL ) {
1287 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
1294 xmm1 = xmm1 + a1 * b1;
1295 xmm2 = xmm2 + a1 * b2;
1296 xmm3 = xmm3 + a1 * b3;
1297 xmm4 = xmm4 + a1 * b4;
1298 xmm5 = xmm5 + a2 * b1;
1299 xmm6 = xmm6 + a2 * b2;
1300 xmm7 = xmm7 + a2 * b3;
1301 xmm8 = xmm8 + a2 * b4;
1303 (~C).
store( i , j , xmm1 );
1304 (~C).
store( i , j+IT::size , xmm2 );
1305 (~C).
store( i , j+IT::size*2UL, xmm3 );
1306 (~C).
store( i , j+IT::size*3UL, xmm4 );
1307 (~C).
store( i+1UL, j , xmm5 );
1308 (~C).
store( i+1UL, j+IT::size , xmm6 );
1309 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1310 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single-row variant of the 4-vector strip (one row i, four accumulators).
1317 for(
size_t k=0UL; k<K; ++k ) {
1319 xmm1 = xmm1 + a1 * B.load(k,j );
1320 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1321 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1322 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1324 (~C).
store( i, j , xmm1 );
1325 (~C).
store( i, j+IT::size , xmm2 );
1326 (~C).
store( i, j+IT::size*2UL, xmm3 );
1327 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Strip of 2 SIMD vectors, again with rows in pairs followed by a single-row variant.
1330 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1332 for( ; (i+2UL) <= M; i+=2UL ) {
1337 for(
size_t k=0UL; k<K; ++k ) {
1342 xmm1 = xmm1 + a1 * b1;
1343 xmm2 = xmm2 + a1 * b2;
1344 xmm3 = xmm3 + a2 * b1;
1345 xmm4 = xmm4 + a2 * b2;
1347 (~C).
store( i , j , xmm1 );
1348 (~C).
store( i , j+IT::size, xmm2 );
1349 (~C).
store( i+1UL, j , xmm3 );
1350 (~C).
store( i+1UL, j+IT::size, xmm4 );
1355 for(
size_t k=0UL; k<K; ++k ) {
1357 xmm1 = xmm1 + a1 * B.load(k,j );
1358 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1360 (~C).
store( i, j , xmm1 );
1361 (~C).
store( i, j+IT::size, xmm2 );
// Single SIMD vector at column j: row pairs first, then a single row. The scalar
// A(i,k) is expanded with set() before the multiplication.
1366 for( ; (i+2UL) <= M; i+=2UL ) {
1369 for(
size_t k=0UL; k<K; ++k ) {
1371 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1372 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1374 (~C).
store( i , j, xmm1 );
1375 (~C).
store( i+1UL, j, xmm2 );
1379 for(
size_t k=0UL; k<K; ++k ) {
1380 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1382 (~C).
store( i, j, xmm1 );
// Intrinsics-based kernel for C += A * B, enabled (EnableIf) when
// UseVectorizedDefaultKernel holds, for targets passed as DenseMatrix<MT3,true>.
// SIMD loads from A and stores to C run along the row index i; consecutive
// accumulators are IT::size rows apart. The row range is processed in strips of
// 8, 4, 2 and finally 1 SIMD vector.
// NOTE(review): the accumulators xmm* are set up before each k-loop, presumably from
// the current contents of C - confirm.
1403 template<
typename MT3
1406 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1407 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1409 typedef IntrinsicTrait<ElementType> IT;
1411 const size_t M( A.rows() );
1412 const size_t N( B.columns() );
1413 const size_t K( A.columns() );
// Strip of 8 SIMD vectors per column. The condition only requires the start of the 8th
// vector to be below M, so that vector may extend past row M-1.
// NOTE(review): presumably relies on padded storage - confirm.
1417 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1418 for(
size_t j=0UL; j<N; ++j ) {
1427 for(
size_t k=0UL; k<K; ++k ) {
1429 xmm1 = xmm1 + A.load(i ,k) * b1;
1430 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1431 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1432 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1433 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1434 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1435 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1436 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1438 (~C).
store( i , j, xmm1 );
1439 (~C).
store( i+IT::size , j, xmm2 );
1440 (~C).
store( i+IT::size*2UL, j, xmm3 );
1441 (~C).
store( i+IT::size*3UL, j, xmm4 );
1442 (~C).
store( i+IT::size*4UL, j, xmm5 );
1443 (~C).
store( i+IT::size*5UL, j, xmm6 );
1444 (~C).
store( i+IT::size*6UL, j, xmm7 );
1445 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Strip of 4 SIMD vectors; columns are taken in pairs (j, j+1) so each loaded A vector
// is used for two columns.
1448 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1450 for( ; (j+2UL) <= N; j+=2UL ) {
1459 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (factor b1), xmm5..xmm8 to column j+1 (factor b2).
1466 xmm1 = xmm1 + a1 * b1;
1467 xmm2 = xmm2 + a2 * b1;
1468 xmm3 = xmm3 + a3 * b1;
1469 xmm4 = xmm4 + a4 * b1;
1470 xmm5 = xmm5 + a1 * b2;
1471 xmm6 = xmm6 + a2 * b2;
1472 xmm7 = xmm7 + a3 * b2;
1473 xmm8 = xmm8 + a4 * b2;
1475 (~C).
store( i , j , xmm1 );
1476 (~C).
store( i+IT::size , j , xmm2 );
1477 (~C).
store( i+IT::size*2UL, j , xmm3 );
1478 (~C).
store( i+IT::size*3UL, j , xmm4 );
1479 (~C).
store( i , j+1UL, xmm5 );
1480 (~C).
store( i+IT::size , j+1UL, xmm6 );
1481 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
1482 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector strip (one column j, four accumulators).
1489 for(
size_t k=0UL; k<K; ++k ) {
1491 xmm1 = xmm1 + A.load(i ,k) * b1;
1492 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1493 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1494 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1496 (~C).
store( i , j, xmm1 );
1497 (~C).
store( i+IT::size , j, xmm2 );
1498 (~C).
store( i+IT::size*2UL, j, xmm3 );
1499 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Strip of 2 SIMD vectors, again with columns in pairs followed by a single-column variant.
1502 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1504 for( ; (j+2UL) <= N; j+=2UL ) {
1509 for(
size_t k=0UL; k<K; ++k ) {
1514 xmm1 = xmm1 + a1 * b1;
1515 xmm2 = xmm2 + a2 * b1;
1516 xmm3 = xmm3 + a1 * b2;
1517 xmm4 = xmm4 + a2 * b2;
1519 (~C).
store( i , j , xmm1 );
1520 (~C).
store( i+IT::size, j , xmm2 );
1521 (~C).
store( i , j+1UL, xmm3 );
1522 (~C).
store( i+IT::size, j+1UL, xmm4 );
1527 for(
size_t k=0UL; k<K; ++k ) {
1529 xmm1 = xmm1 + A.load(i ,k) * b1;
1530 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1532 (~C).
store( i , j, xmm1 );
1533 (~C).
store( i+IT::size, j, xmm2 );
// Single SIMD vector at row i: column pairs first, then a single column. The scalar
// B(k,j) is expanded with set() before the multiplication.
1538 for( ; (j+2UL) <= N; j+=2UL ) {
1541 for(
size_t k=0UL; k<K; ++k ) {
1543 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1544 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1546 (~C).
store( i, j , xmm1 );
1547 (~C).
store( i, j+1UL, xmm2 );
1551 for(
size_t k=0UL; k<K; ++k ) {
1552 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1554 (~C).
store( i, j, xmm1 );
// Fallback overload of selectBlasAddAssignKernel, enabled (EnableIf) when
// UseDefaultKernel holds: no BLAS routine is called, the operands are forwarded
// unchanged to selectDefaultAddAssignKernel.
1575 template<
typename MT3
1578 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1579 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1581 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B, enabled (EnableIf) when UseSinglePrecisionKernel holds
// for the three matrix types; the multiplication is delegated to cblas_sgemm.
1601 template<
typename MT3
1604 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1605 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1607 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
1613 const int M ( numeric_cast<int>( A.rows() ) );
1614 const int N ( numeric_cast<int>( B.columns() ) );
1615 const int K ( numeric_cast<int>( A.columns() ) );
1616 const int lda( numeric_cast<int>( A.spacing() ) );
1617 const int ldb( numeric_cast<int>( B.spacing() ) );
1618 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0F, beta = 1.0F: the product is added onto the existing contents of C.
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
1620 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1621 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1622 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1623 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C += A * B, enabled (EnableIf) when UseDoublePrecisionKernel holds
// for the three matrix types; the multiplication is delegated to cblas_dgemm.
1644 template<
typename MT3
1647 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1648 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1650 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
1656 const int M ( numeric_cast<int>( A.rows() ) );
1657 const int N ( numeric_cast<int>( B.columns() ) );
1658 const int K ( numeric_cast<int>( A.columns() ) );
1659 const int lda( numeric_cast<int>( A.spacing() ) );
1660 const int ldb( numeric_cast<int>( B.spacing() ) );
1661 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = 1.0, beta = 1.0: the product is added onto the existing contents of C.
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
1663 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1664 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1665 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1666 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C += A * B, enabled (EnableIf) when UseSinglePrecisionComplexKernel
// holds for the three matrix types; the multiplication is delegated to cblas_cgemm.
1687 template<
typename MT3
1690 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1691 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1693 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
1702 const int M ( numeric_cast<int>( A.rows() ) );
1703 const int N ( numeric_cast<int>( B.columns() ) );
1704 const int K ( numeric_cast<int>( A.columns() ) );
1705 const int lda( numeric_cast<int>( A.spacing() ) );
1706 const int ldb( numeric_cast<int>( B.spacing() ) );
1707 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = (1,0), beta = (1,0), so
// the product is added onto the existing contents of C.
1708 const complex<float> alpha( 1.0F, 0.0F );
1709 const complex<float> beta ( 1.0F, 0.0F );
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
1711 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1712 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1713 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1714 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C += A * B, enabled (EnableIf) when UseDoublePrecisionComplexKernel
// holds for the three matrix types; the multiplication is delegated to cblas_zgemm.
1735 template<
typename MT3
1738 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1739 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1741 using boost::numeric_cast;
// The extents and the leading dimensions (spacing()) are converted to int through
// boost::numeric_cast, which is range-checked rather than silently truncating.
1750 const int M ( numeric_cast<int>( A.rows() ) );
1751 const int N ( numeric_cast<int>( B.columns() ) );
1752 const int K ( numeric_cast<int>( A.columns() ) );
1753 const int lda( numeric_cast<int>( A.spacing() ) );
1754 const int ldb( numeric_cast<int>( B.spacing() ) );
1755 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are passed by address: alpha = (1,0), beta = (1,0), so
// the product is added onto the existing contents of C.
1756 const complex<double> alpha( 1.0, 0.0 );
1757 const complex<double> beta ( 1.0, 0.0 );
// The layout follows the target type: for a row-major MT3 the call uses CblasRowMajor
// with A flagged CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A
// CblasNoTrans and B CblasTrans.
1759 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1760 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1761 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1762 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1785 template<
typename MT
1794 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1808 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for C -= A * B, active (DisableIf) when UseSMPAssignKernel does not
// hold: forwards the three operands either to selectDefaultSubAssignKernel or to
// selectBlasSubAssignKernel.
// NOTE(review): confirm the criterion that decides between the two calls.
1824 template<
typename MT3
1827 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1828 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1831 TDMatDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1833 TDMatDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
// Overload of selectSubAssignKernel that is enabled (EnableIf) when
// UseSMPAssignKernel<MT3,MT4,MT5> holds, i.e. when evaluateLeft or evaluateRight is set.
1849 template<
typename MT3
1852 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1853 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Scalar kernel for C -= A * B, used when UseVectorizedDefaultKernel does not hold
// (DisableIf). All element access goes through operator()(row, column).
1874 template<
typename MT3
1877 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1878 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K is the shared inner dimension.
1880 const size_t M( A.rows() );
1881 const size_t N( B.columns() );
1882 const size_t K( A.columns() );
// 'end' is N rounded down to an even number: the innermost loop handles two
// columns per step.
1885 const size_t end( N &
size_t(-2) );
1887 for(
size_t i=0UL; i<M; ++i ) {
1888 for(
size_t k=0UL; k<K; ++k ) {
1889 for(
size_t j=0UL; j<end; j+=2UL ) {
1890 C(i,j ) -= A(i,k) * B(k,j );
1891 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Column 'end' is the one leftover column when N is odd.
// NOTE(review): confirm this statement is guarded by a check that end < N.
1894 C(i,end) -= A(i,k) * B(k,end);
// Intrinsics-based kernel for C -= A * B, enabled (EnableIf) when
// UseVectorizedDefaultKernel holds, for targets passed as DenseMatrix<MT3,false>.
// SIMD loads from B and stores to C run along the column index j; consecutive
// accumulators are IT::size columns apart. The column range is processed in strips of
// 8, 4, 2 and finally 1 SIMD vector; each product is subtracted from its accumulator.
// NOTE(review): the accumulators xmm* are set up before each k-loop, presumably from
// the current contents of C - confirm.
1916 template<
typename MT3
1919 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1920 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1922 typedef IntrinsicTrait<ElementType> IT;
1924 const size_t M( A.rows() );
1925 const size_t N( B.columns() );
1926 const size_t K( A.columns() );
// Strip of 8 SIMD vectors per row. The condition only requires the start of the 8th
// vector to be below N, so that vector may extend past column N-1.
// NOTE(review): presumably relies on padded storage - confirm.
1930 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1931 for(
size_t i=0UL; i<M; ++i ) {
1940 for(
size_t k=0UL; k<K; ++k ) {
1942 xmm1 = xmm1 - a1 * B.load(k,j );
1943 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1944 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1945 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1946 xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1947 xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1948 xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1949 xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1951 (~C).
store( i, j , xmm1 );
1952 (~C).
store( i, j+IT::size , xmm2 );
1953 (~C).
store( i, j+IT::size*2UL, xmm3 );
1954 (~C).
store( i, j+IT::size*3UL, xmm4 );
1955 (~C).
store( i, j+IT::size*4UL, xmm5 );
1956 (~C).
store( i, j+IT::size*5UL, xmm6 );
1957 (~C).
store( i, j+IT::size*6UL, xmm7 );
1958 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Strip of 4 SIMD vectors; rows are taken in pairs (i, i+1) so each loaded B vector
// is used for two rows.
1961 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1963 for( ; (i+2UL) <= M; i+=2UL ) {
1972 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
1979 xmm1 = xmm1 - a1 * b1;
1980 xmm2 = xmm2 - a1 * b2;
1981 xmm3 = xmm3 - a1 * b3;
1982 xmm4 = xmm4 - a1 * b4;
1983 xmm5 = xmm5 - a2 * b1;
1984 xmm6 = xmm6 - a2 * b2;
1985 xmm7 = xmm7 - a2 * b3;
1986 xmm8 = xmm8 - a2 * b4;
1988 (~C).
store( i , j , xmm1 );
1989 (~C).
store( i , j+IT::size , xmm2 );
1990 (~C).
store( i , j+IT::size*2UL, xmm3 );
1991 (~C).
store( i , j+IT::size*3UL, xmm4 );
1992 (~C).
store( i+1UL, j , xmm5 );
1993 (~C).
store( i+1UL, j+IT::size , xmm6 );
1994 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1995 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
// Single-row variant of the 4-vector strip (one row i, four accumulators).
2002 for(
size_t k=0UL; k<K; ++k ) {
2004 xmm1 = xmm1 - a1 * B.load(k,j );
2005 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
2006 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
2007 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
2009 (~C).
store( i, j , xmm1 );
2010 (~C).
store( i, j+IT::size , xmm2 );
2011 (~C).
store( i, j+IT::size*2UL, xmm3 );
2012 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Strip of 2 SIMD vectors, again with rows in pairs followed by a single-row variant.
2015 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2017 for( ; (i+2UL) <= M; i+=2UL ) {
2022 for(
size_t k=0UL; k<K; ++k ) {
2027 xmm1 = xmm1 - a1 * b1;
2028 xmm2 = xmm2 - a1 * b2;
2029 xmm3 = xmm3 - a2 * b1;
2030 xmm4 = xmm4 - a2 * b2;
2032 (~C).
store( i , j , xmm1 );
2033 (~C).
store( i , j+IT::size, xmm2 );
2034 (~C).
store( i+1UL, j , xmm3 );
2035 (~C).
store( i+1UL, j+IT::size, xmm4 );
2040 for(
size_t k=0UL; k<K; ++k ) {
2042 xmm1 = xmm1 - a1 * B.load(k,j );
2043 xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
2045 (~C).
store( i, j , xmm1 );
2046 (~C).
store( i, j+IT::size, xmm2 );
// Single SIMD vector at column j: row pairs first, then a single row. The scalar
// A(i,k) is expanded with set() before the multiplication.
2051 for( ; (i+2UL) <= M; i+=2UL ) {
2054 for(
size_t k=0UL; k<K; ++k ) {
2056 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
2057 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
2059 (~C).
store( i , j, xmm1 );
2060 (~C).
store( i+1UL, j, xmm2 );
2064 for(
size_t k=0UL; k<K; ++k ) {
2065 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
2067 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for the subtraction assignment of the product A*B
// to C, for targets of type DenseMatrix<MT3,true>. It is only enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The kernel vectorizes along the row index i (A.load(i,k), C.store(i,j)) and
// handles progressively narrower row blocks: 8, 4, 2 and finally 1 SIMD vector.
// NOTE(review): this listing omits several original lines (remaining template
// parameters, braces, the declarations of i, j, xmm*, a* and b*). The xmm*
// accumulators are presumably preloaded from C before each k-loop -- confirm.
2088 template<
typename MT3
2091 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2092 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2094 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds: i runs over A's rows, j over B's columns, k over A's columns.
2096 const size_t M( A.rows() );
2097 const size_t N( B.columns() );
2098 const size_t K( A.columns() );
// Row blocks of 8 SIMD vectors, processed one column j at a time.
2102 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2103 for(
size_t j=0UL; j<N; ++j ) {
2112 for(
size_t k=0UL; k<K; ++k ) {
2114 xmm1 = xmm1 - A.load(i ,k) * b1;
2115 xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2116 xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2117 xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2118 xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
2119 xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
2120 xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
2121 xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
// Write the eight updated vectors of column j back to C.
2123 (~C).
store( i , j, xmm1 );
2124 (~C).
store( i+IT::size , j, xmm2 );
2125 (~C).
store( i+IT::size*2UL, j, xmm3 );
2126 (~C).
store( i+IT::size*3UL, j, xmm4 );
2127 (~C).
store( i+IT::size*4UL, j, xmm5 );
2128 (~C).
store( i+IT::size*5UL, j, xmm6 );
2129 (~C).
store( i+IT::size*6UL, j, xmm7 );
2130 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Row blocks of 4 SIMD vectors; columns are handled in pairs (j, j+1) first.
2133 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2135 for( ; (j+2UL) <= N; j+=2UL ) {
2144 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
2151 xmm1 = xmm1 - a1 * b1;
2152 xmm2 = xmm2 - a2 * b1;
2153 xmm3 = xmm3 - a3 * b1;
2154 xmm4 = xmm4 - a4 * b1;
2155 xmm5 = xmm5 - a1 * b2;
2156 xmm6 = xmm6 - a2 * b2;
2157 xmm7 = xmm7 - a3 * b2;
2158 xmm8 = xmm8 - a4 * b2;
2160 (~C).
store( i , j , xmm1 );
2161 (~C).
store( i+IT::size , j , xmm2 );
2162 (~C).
store( i+IT::size*2UL, j , xmm3 );
2163 (~C).
store( i+IT::size*3UL, j , xmm4 );
2164 (~C).
store( i , j+1UL, xmm5 );
2165 (~C).
store( i+IT::size , j+1UL, xmm6 );
2166 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
2167 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
// Single column for the same 4-vector row block (presumably the odd last column).
2174 for(
size_t k=0UL; k<K; ++k ) {
2176 xmm1 = xmm1 - A.load(i ,k) * b1;
2177 xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2178 xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2179 xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2181 (~C).
store( i , j, xmm1 );
2182 (~C).
store( i+IT::size , j, xmm2 );
2183 (~C).
store( i+IT::size*2UL, j, xmm3 );
2184 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Row blocks of 2 SIMD vectors; again two columns per iteration first.
2187 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2189 for( ; (j+2UL) <= N; j+=2UL ) {
2194 for(
size_t k=0UL; k<K; ++k ) {
2199 xmm1 = xmm1 - a1 * b1;
2200 xmm2 = xmm2 - a2 * b1;
2201 xmm3 = xmm3 - a1 * b2;
2202 xmm4 = xmm4 - a2 * b2;
2204 (~C).
store( i , j , xmm1 );
2205 (~C).
store( i+IT::size, j , xmm2 );
2206 (~C).
store( i , j+1UL, xmm3 );
2207 (~C).
store( i+IT::size, j+1UL, xmm4 );
// Single column for the same 2-vector row block.
2212 for(
size_t k=0UL; k<K; ++k ) {
2214 xmm1 = xmm1 - A.load(i ,k) * b1;
2215 xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
2217 (~C).
store( i , j, xmm1 );
2218 (~C).
store( i+IT::size, j, xmm2 );
// One SIMD vector of rows (presumably the final row block); each B element is
// broadcast with set() and multiplied with the loaded vector of A.
2223 for( ; (j+2UL) <= N; j+=2UL ) {
2226 for(
size_t k=0UL; k<K; ++k ) {
2228 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2229 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2231 (~C).
store( i, j , xmm1 );
2232 (~C).
store( i, j+1UL, xmm2 );
// Single column for the single-vector row block.
2236 for(
size_t k=0UL; k<K; ++k ) {
2237 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
2239 (~C).
store( i, j, xmm1 );
// BLAS-selection overload for the subtraction assignment that is taken when
// UseDefaultKernel<MT3,MT4,MT5> holds: no BLAS routine is called, the work is
// forwarded unchanged to selectDefaultSubAssignKernel( C, A, B ).
2260 template<
typename MT3
2263 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2264 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2266 selectDefaultSubAssignKernel( C, A, B );
// BLAS overload of the subtraction assignment, enabled by
// UseSinglePrecisionKernel<MT3,MT4,MT5>. Calls cblas_sgemm with alpha = -1 and
// beta = 1, i.e. the product is subtracted from the existing contents of C.
2286 template<
typename MT3
2289 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2290 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2292 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
2298 const int M ( numeric_cast<int>( A.rows() ) );
2299 const int N ( numeric_cast<int>( B.columns() ) );
2300 const int K ( numeric_cast<int>( A.columns() ) );
2301 const int lda( numeric_cast<int>( A.spacing() ) );
2302 const int ldb( numeric_cast<int>( B.spacing() ) );
2303 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
2305 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2306 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2307 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2308 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS overload of the subtraction assignment, enabled by
// UseDoublePrecisionKernel<MT3,MT4,MT5>. Calls cblas_dgemm with alpha = -1 and
// beta = 1, i.e. the product is subtracted from the existing contents of C.
2329 template<
typename MT3
2332 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2333 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2335 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
2341 const int M ( numeric_cast<int>( A.rows() ) );
2342 const int N ( numeric_cast<int>( B.columns() ) );
2343 const int K ( numeric_cast<int>( A.columns() ) );
2344 const int lda( numeric_cast<int>( A.spacing() ) );
2345 const int ldb( numeric_cast<int>( B.spacing() ) );
2346 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
2348 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2349 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2350 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2351 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS overload of the subtraction assignment, enabled by
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5>. Calls cblas_cgemm with
// alpha = (-1,0) and beta = (1,0), so the product is subtracted from C.
2372 template<
typename MT3
2375 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2376 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2378 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
2387 const int M ( numeric_cast<int>( A.rows() ) );
2388 const int N ( numeric_cast<int>( B.columns() ) );
2389 const int K ( numeric_cast<int>( A.columns() ) );
2390 const int lda( numeric_cast<int>( A.spacing() ) );
2391 const int ldb( numeric_cast<int>( B.spacing() ) );
2392 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the named constants.
2393 const complex<float> alpha( -1.0F, 0.0F );
2394 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
2396 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2397 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2398 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2399 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS overload of the subtraction assignment, enabled by
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5>. Calls cblas_zgemm with
// alpha = (-1,0) and beta = (1,0), so the product is subtracted from C.
2420 template<
typename MT3
2423 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2424 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2426 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
2435 const int M ( numeric_cast<int>( A.rows() ) );
2436 const int N ( numeric_cast<int>( B.columns() ) );
2437 const int K ( numeric_cast<int>( A.columns() ) );
2438 const int lda( numeric_cast<int>( A.spacing() ) );
2439 const int ldb( numeric_cast<int>( B.spacing() ) );
2440 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the named constants.
2441 const complex<double> alpha( -1.0, 0.0 );
2442 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
2444 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2445 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2446 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2447 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2493 template<
typename MT1
2497 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2498 ,
private MatScalarMultExpr
2499 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
2503 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// Nonzero when the left operand type MT1 is a computation or requires evaluation.
2515 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Nonzero when the right operand type MT2 is a computation or requires evaluation.
2520 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
2527 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2528 struct UseSMPAssignKernel {
2529 enum { value = evaluateLeft || evaluateRight };
2538 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2539 struct UseSinglePrecisionKernel {
2540 enum { value = IsFloat<typename T1::ElementType>::value &&
2541 IsFloat<typename T2::ElementType>::value &&
2542 IsFloat<typename T3::ElementType>::value &&
2543 !IsComplex<T4>::value };
2552 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2553 struct UseDoublePrecisionKernel {
2554 enum { value = IsDouble<typename T1::ElementType>::value &&
2555 IsDouble<typename T2::ElementType>::value &&
2556 IsDouble<typename T3::ElementType>::value &&
2557 !IsComplex<T4>::value };
2566 template<
typename T1,
typename T2,
typename T3 >
2567 struct UseSinglePrecisionComplexKernel {
2568 typedef complex<float> Type;
2569 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2570 IsSame<typename T2::ElementType,Type>::value &&
2571 IsSame<typename T3::ElementType,Type>::value };
2580 template<
typename T1,
typename T2,
typename T3 >
2581 struct UseDoublePrecisionComplexKernel {
2582 typedef complex<double> Type;
2583 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2584 IsSame<typename T2::ElementType,Type>::value &&
2585 IsSame<typename T3::ElementType,Type>::value };
2593 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2594 struct UseDefaultKernel {
2595 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2596 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2597 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2598 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2606 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2607 struct UseVectorizedDefaultKernel {
2608 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2609 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2610 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2611 IsSame<typename T1::ElementType,T4>::value &&
2612 IntrinsicTrait<typename T1::ElementType>::addition &&
2613 IntrinsicTrait<typename T1::ElementType>::subtraction &&
2614 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression specialization.
2620 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type, obtained from MultTrait of RES and the scalar type ST
// (RES is declared on a line not shown in this listing).
2621 typedef typename MultTrait<RES,ST>::Type
ResultType;
// SIMD type belonging to ElementType (ElementType is declared on a line not shown).
2625 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the matrix/matrix product itself.
2630 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// LT / RT: operand types chosen via SelectType from the evaluateLeft /
// evaluateRight flags -- presumably "const RTx" when the flag is set and CTx
// otherwise; confirm against SelectType.
2636 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
2639 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: both operands vectorizable, ET1 equal to ET2 and to the
// scalar type ST, and IntrinsicTrait<ET1> reports addition and multiplication.
2644 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2645 IsSame<ET1,ET2>::value &&
2646 IsSame<ET1,ST>::value &&
2647 IntrinsicTrait<ET1>::addition &&
2648 IntrinsicTrait<ET1>::multiplication };
// SMP flag: set only when neither operand needs evaluation.
2651 enum { smpAssignable = !evaluateLeft && !evaluateRight };
2660 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
2676 return matrix_(i,j) * scalar_;
2685 inline size_t rows()
const {
2686 return matrix_.rows();
2695 inline size_t columns()
const {
2696 return matrix_.columns();
2726 template<
typename T >
2727 inline bool canAlias(
const T* alias )
const {
2728 return matrix_.canAlias( alias );
2738 template<
typename T >
2739 inline bool isAliased(
const T* alias )
const {
2740 return matrix_.isAliased( alias );
2750 return matrix_.isAligned();
2760 typename MMM::RightOperand B( matrix_.rightOperand() );
// Assignment of the scaled matrix product to a dense matrix lhs.
// Fetches both operands of the wrapped product, tests the two degenerate cases
// and otherwise dispatches to selectAssignKernel() with the scalar.
// NOTE(review): the bodies of both degenerate branches and the definitions of
// A and B are on lines not shown in this listing; A and B are presumably the
// (possibly evaluated) operands of type LT / RT -- confirm.
2782 template<
typename MT3
2784 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2791 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2792 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Case 1: the target has no rows or no columns.
2794 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
2797 else if( left.columns() == 0UL ) {
// General case: hand over to the kernel selection.
2812 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the assignment, used when UseSMPAssignKernel does not
// hold. Calls either the default kernel or the BLAS kernel.
// NOTE(review): the condition that chooses between the two calls is on lines
// not shown in this listing.
2827 template<
typename MT3
2831 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
2832 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2835 DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
2837 DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
// Kernel selection for the assignment, used when UseSMPAssignKernel holds
// (i.e. evaluateLeft or evaluateRight is set).
// NOTE(review): the body of this overload is not visible in this listing.
2852 template<
typename MT3
2856 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
2857 selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Non-vectorized default kernel for the assignment of the scaled product,
// used when UseVectorizedDefaultKernel does not hold. Works row by row of C.
// NOTE(review): the body of the last k-loop is on a line not shown here; it
// presumably applies the scalar to C(i,k) -- confirm.
2877 template<
typename MT3
2881 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2882 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2884 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Initialize row i of C with the j = 0 term, so C needs no prior reset.
2885 for(
size_t k=0UL; k<B.columns(); ++k ) {
2886 C(i,k) = A(i,0UL) * B(0UL,k);
// Accumulate the remaining terms j = 1 .. A.columns()-1.
2888 for(
size_t j=1UL; j<A.columns(); ++j ) {
2889 for(
size_t k=0UL; k<B.columns(); ++k ) {
2890 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i (loop body not shown in this listing).
2893 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default kernel for the assignment of the scaled product to a
// target of type DenseMatrix<MT3,false>; enabled by UseVectorizedDefaultKernel.
// The kernel vectorizes along the column index j (B.load(k,j), C.store(i,j)),
// accumulates the products over k in xmm* and stores each accumulator
// multiplied by "factor".
// NOTE(review): several original lines are missing from this listing (braces,
// the declarations of i, j, a*, b* and factor). factor is presumably the
// broadcast scalar, and the default-constructed xmm* accumulators presumably
// start at zero -- confirm both.
2914 template<
typename MT3
2918 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2919 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2921 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds: i runs over A's rows, j over B's columns, k over A's columns.
2923 const size_t M( A.rows() );
2924 const size_t N( B.columns() );
2925 const size_t K( A.columns() );
// Column blocks of 8 SIMD vectors, processed one row i at a time.
2931 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2932 for(
size_t i=0UL; i<M; ++i ) {
2933 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 for(
size_t k=0UL; k<K; ++k ) {
2936 xmm1 = xmm1 + a1 * B.load(k,j );
2937 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2938 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2939 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2940 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2941 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2942 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2943 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
// Scale the accumulated sums and overwrite the corresponding part of row i.
2945 (~C).
store( i, j , xmm1 * factor );
2946 (~C).
store( i, j+IT::size , xmm2 * factor );
2947 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
2948 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
2949 (~C).
store( i, j+IT::size*4UL, xmm5 * factor );
2950 (~C).
store( i, j+IT::size*5UL, xmm6 * factor );
2951 (~C).
store( i, j+IT::size*6UL, xmm7 * factor );
2952 (~C).
store( i, j+IT::size*7UL, xmm8 * factor );
// Column blocks of 4 SIMD vectors; rows are handled in pairs (i, i+1) first.
2955 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2957 for( ; (i+2UL) <= M; i+=2UL ) {
2958 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2959 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (a1), xmm5..xmm8 to row i+1 (a2).
2966 xmm1 = xmm1 + a1 * b1;
2967 xmm2 = xmm2 + a1 * b2;
2968 xmm3 = xmm3 + a1 * b3;
2969 xmm4 = xmm4 + a1 * b4;
2970 xmm5 = xmm5 + a2 * b1;
2971 xmm6 = xmm6 + a2 * b2;
2972 xmm7 = xmm7 + a2 * b3;
2973 xmm8 = xmm8 + a2 * b4;
2975 (~C).
store( i , j , xmm1 * factor );
2976 (~C).
store( i , j+IT::size , xmm2 * factor );
2977 (~C).
store( i , j+IT::size*2UL, xmm3 * factor );
2978 (~C).
store( i , j+IT::size*3UL, xmm4 * factor );
2979 (~C).
store( i+1UL, j , xmm5 * factor );
2980 (~C).
store( i+1UL, j+IT::size , xmm6 * factor );
2981 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 * factor );
2982 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 * factor );
// Single row for the same 4-vector column block (presumably the odd last row).
2986 for(
size_t k=0UL; k<K; ++k ) {
2988 xmm1 = xmm1 + a1 * B.load(k,j );
2989 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2990 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2991 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2993 (~C).
store( i, j , xmm1 * factor );
2994 (~C).
store( i, j+IT::size , xmm2 * factor );
2995 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
2996 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
// Column blocks of 2 SIMD vectors; again two rows per iteration first.
2999 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3001 for( ; (i+2UL) <= M; i+=2UL ) {
3003 for(
size_t k=0UL; k<K; ++k ) {
3008 xmm1 = xmm1 + a1 * b1;
3009 xmm2 = xmm2 + a1 * b2;
3010 xmm3 = xmm3 + a2 * b1;
3011 xmm4 = xmm4 + a2 * b2;
3013 (~C).
store( i , j , xmm1 * factor );
3014 (~C).
store( i , j+IT::size, xmm2 * factor );
3015 (~C).
store( i+1UL, j , xmm3 * factor );
3016 (~C).
store( i+1UL, j+IT::size, xmm4 * factor );
// Single row for the same 2-vector column block.
3020 for(
size_t k=0UL; k<K; ++k ) {
3022 xmm1 = xmm1 + a1 * B.load(k,j );
3023 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3025 (~C).
store( i, j , xmm1 * factor );
3026 (~C).
store( i, j+IT::size, xmm2 * factor );
// One SIMD vector of columns (presumably the final column block); each A
// element is broadcast with set() and multiplied with the loaded vector of B.
3031 for( ; (i+2UL) <= M; i+=2UL ) {
3033 for(
size_t k=0UL; k<K; ++k ) {
3035 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3036 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3038 (~C).
store( i , j, xmm1 * factor );
3039 (~C).
store( i+1UL, j, xmm2 * factor );
// Single row for the single-vector column block.
3043 for(
size_t k=0UL; k<K; ++k ) {
3044 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3046 (~C).
store( i, j, xmm1 * factor );
// Vectorized default kernel for the assignment of the scaled product to a
// target of type DenseMatrix<MT3,true>; enabled by UseVectorizedDefaultKernel.
// The kernel vectorizes along the row index i (A.load(i,k), C.store(i,j)),
// accumulates the products over k in xmm* and stores each accumulator
// multiplied by "factor".
// NOTE(review): several original lines are missing from this listing (braces,
// the declarations of i, j, a*, b* and factor). factor is presumably the
// broadcast scalar, and the default-constructed xmm* accumulators presumably
// start at zero -- confirm both.
3066 template<
typename MT3
3070 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3071 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3073 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds: i runs over A's rows, j over B's columns, k over A's columns.
3075 const size_t M( A.rows() );
3076 const size_t N( B.columns() );
3077 const size_t K( A.columns() );
// Row blocks of 8 SIMD vectors, processed one column j at a time.
3083 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3084 for(
size_t j=0UL; j<N; ++j ) {
3085 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3086 for(
size_t k=0UL; k<K; ++k ) {
3088 xmm1 = xmm1 + A.load(i ,k) * b1;
3089 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3090 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3091 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3092 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3093 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3094 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3095 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
// Scale the accumulated sums and overwrite the corresponding part of column j.
3097 (~C).
store( i , j, xmm1 * factor );
3098 (~C).
store( i+IT::size , j, xmm2 * factor );
3099 (~C).
store( i+IT::size*2UL, j, xmm3 * factor );
3100 (~C).
store( i+IT::size*3UL, j, xmm4 * factor );
3101 (~C).
store( i+IT::size*4UL, j, xmm5 * factor );
3102 (~C).
store( i+IT::size*5UL, j, xmm6 * factor );
3103 (~C).
store( i+IT::size*6UL, j, xmm7 * factor );
3104 (~C).
store( i+IT::size*7UL, j, xmm8 * factor );
// Row blocks of 4 SIMD vectors; columns are handled in pairs (j, j+1) first.
3107 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3109 for( ; (j+2UL) <= N; j+=2UL ) {
3110 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3111 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
3118 xmm1 = xmm1 + a1 * b1;
3119 xmm2 = xmm2 + a2 * b1;
3120 xmm3 = xmm3 + a3 * b1;
3121 xmm4 = xmm4 + a4 * b1;
3122 xmm5 = xmm5 + a1 * b2;
3123 xmm6 = xmm6 + a2 * b2;
3124 xmm7 = xmm7 + a3 * b2;
3125 xmm8 = xmm8 + a4 * b2;
3127 (~C).
store( i , j , xmm1 * factor );
3128 (~C).
store( i+IT::size , j , xmm2 * factor );
3129 (~C).
store( i+IT::size*2UL, j , xmm3 * factor );
3130 (~C).
store( i+IT::size*3UL, j , xmm4 * factor );
3131 (~C).
store( i , j+1UL, xmm5 * factor );
3132 (~C).
store( i+IT::size , j+1UL, xmm6 * factor );
3133 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 * factor );
3134 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 * factor );
// Single column for the same 4-vector row block (presumably the odd last column).
3138 for(
size_t k=0UL; k<K; ++k ) {
3140 xmm1 = xmm1 + A.load(i ,k) * b1;
3141 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3142 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3143 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3145 (~C).
store( i , j, xmm1 * factor );
3146 (~C).
store( i+IT::size , j, xmm2 * factor );
3147 (~C).
store( i+IT::size*2UL, j, xmm3 * factor );
3148 (~C).
store( i+IT::size*3UL, j, xmm4 * factor );
// Row blocks of 2 SIMD vectors; again two columns per iteration first.
3151 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3153 for( ; (j+2UL) <= N; j+=2UL ) {
3155 for(
size_t k=0UL; k<K; ++k ) {
3160 xmm1 = xmm1 + a1 * b1;
3161 xmm2 = xmm2 + a2 * b1;
3162 xmm3 = xmm3 + a1 * b2;
3163 xmm4 = xmm4 + a2 * b2;
3165 (~C).
store( i , j , xmm1 * factor );
3166 (~C).
store( i+IT::size, j , xmm2 * factor );
3167 (~C).
store( i , j+1UL, xmm3 * factor );
3168 (~C).
store( i+IT::size, j+1UL, xmm4 * factor );
// Single column for the same 2-vector row block.
3172 for(
size_t k=0UL; k<K; ++k ) {
3174 xmm1 = xmm1 + A.load(i ,k) * b1;
3175 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3177 (~C).
store( i , j, xmm1 * factor );
3178 (~C).
store( i+IT::size, j, xmm2 * factor );
// One SIMD vector of rows (presumably the final row block); each B element is
// broadcast with set() and multiplied with the loaded vector of A.
3183 for( ; (j+2UL) <= N; j+=2UL ) {
3185 for(
size_t k=0UL; k<K; ++k ) {
3187 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3188 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3190 (~C).
store( i, j , xmm1 * factor );
3191 (~C).
store( i, j+1UL, xmm2 * factor );
// Single column for the single-vector row block.
3195 for(
size_t k=0UL; k<K; ++k ) {
3196 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3198 (~C).
store( i, j, xmm1 * factor );
// BLAS-selection overload for the scaled assignment that is taken when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds: no BLAS routine is called, the work
// is forwarded unchanged to selectDefaultAssignKernel( C, A, B, scalar ).
3218 template<
typename MT3
3222 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3223 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3225 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS overload of the scaled assignment, enabled by
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>. Calls cblas_sgemm with
// alpha = scalar and beta = 0, so the previous contents of C do not contribute.
3244 template<
typename MT3
3248 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3249 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3251 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
3257 const int M ( numeric_cast<int>( A.rows() ) );
3258 const int N ( numeric_cast<int>( B.columns() ) );
3259 const int K ( numeric_cast<int>( A.columns() ) );
3260 const int lda( numeric_cast<int>( A.spacing() ) );
3261 const int ldb( numeric_cast<int>( B.spacing() ) );
3262 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3264 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3265 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3266 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3267 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS overload of the scaled assignment, enabled by
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>. Calls cblas_dgemm with
// alpha = scalar and beta = 0, so the previous contents of C do not contribute.
3287 template<
typename MT3
3291 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3292 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3294 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
3300 const int M ( numeric_cast<int>( A.rows() ) );
3301 const int N ( numeric_cast<int>( B.columns() ) );
3302 const int K ( numeric_cast<int>( A.columns() ) );
3303 const int lda( numeric_cast<int>( A.spacing() ) );
3304 const int ldb( numeric_cast<int>( B.spacing() ) );
3305 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3307 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3308 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3309 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3310 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS overload of the scaled assignment, enabled by
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5>. Calls cblas_cgemm with
// alpha built from the scalar and beta = (0,0), so the previous contents of C
// do not contribute.
3330 template<
typename MT3
3334 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3335 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3337 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
3346 const int M ( numeric_cast<int>( A.rows() ) );
3347 const int N ( numeric_cast<int>( B.columns() ) );
3348 const int K ( numeric_cast<int>( A.columns() ) );
3349 const int lda( numeric_cast<int>( A.spacing() ) );
3350 const int ldb( numeric_cast<int>( B.spacing() ) );
3351 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is converted to complex<float>; alpha and beta are passed by address.
3352 const complex<float> alpha( scalar );
3353 const complex<float> beta ( 0.0F, 0.0F );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3355 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3356 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3357 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3358 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS overload of the scaled assignment, enabled by
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5>. Calls cblas_zgemm with
// alpha built from the scalar and beta = (0,0), so the previous contents of C
// do not contribute.
3378 template<
typename MT3
3382 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3383 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3385 using boost::numeric_cast;
// Sizes and leading dimensions (spacing()) are narrowed to the int arguments
// expected by CBLAS through boost::numeric_cast.
3394 const int M ( numeric_cast<int>( A.rows() ) );
3395 const int N ( numeric_cast<int>( B.columns() ) );
3396 const int K ( numeric_cast<int>( A.columns() ) );
3397 const int lda( numeric_cast<int>( A.spacing() ) );
3398 const int ldb( numeric_cast<int>( B.spacing() ) );
3399 const int ldc( numeric_cast<int>( C.spacing() ) );
// The scalar is converted to complex<double>; alpha and beta are passed by address.
3400 const complex<double> alpha( scalar );
3401 const complex<double> beta ( 0.0, 0.0 );
// The layout argument follows the storage order of C. For a row-major C the
// flags are (CblasTrans, CblasNoTrans), otherwise (CblasNoTrans, CblasTrans).
3403 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3404 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3405 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3406 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product to a sparse matrix lhs.
// The expression is first evaluated into a temporary of type TmpType, which
// SelectType picks from ResultType and OppositeType based on SO.
// NOTE(review): the statement that transfers tmp into lhs is on a line not
// shown in this listing.
3423 template<
typename MT
3425 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3429 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3441 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix lhs.
// Fetches both operands of the wrapped product, tests the degenerate case and
// otherwise dispatches to selectAddAssignKernel() with the scalar.
// NOTE(review): the body of the degenerate branch and the definitions of A and
// B are on lines not shown in this listing; A and B are presumably the
// (possibly evaluated) operands of type LT / RT -- confirm.
3458 template<
typename MT3
3460 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
3467 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3468 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate case: empty target or zero inner dimension.
3470 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// General case: hand over to the kernel selection.
3484 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment, used when UseSMPAssignKernel
// does not hold. Calls either the default kernel or the BLAS kernel.
// NOTE(review): the condition that chooses between the two calls is on lines
// not shown in this listing.
3499 template<
typename MT3
3503 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3504 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3507 DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3509 DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
// Kernel selection for the addition assignment, used when UseSMPAssignKernel
// holds (i.e. evaluateLeft or evaluateRight is set).
// NOTE(review): the body of this overload is not visible in this listing.
3524 template<
typename MT3
3528 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3529 selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Non-vectorized default kernel for the addition assignment of the scaled
// product, used when UseVectorizedDefaultKernel does not hold.
// NOTE(review): the body of this overload is not visible in this listing.
3549 template<
typename MT3
3553 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3554 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default kernel for the addition assignment of the scaled product
// to a target of type DenseMatrix<MT3,false>; enabled by
// UseVectorizedDefaultKernel. The kernel vectorizes along the column index j,
// accumulates the products over k in xmm*, and stores the current contents of
// C plus each accumulator multiplied by "factor".
// NOTE(review): several original lines are missing from this listing (braces,
// the declarations of i, j, a*, b* and factor). factor is presumably the
// broadcast scalar, and the default-constructed xmm* accumulators presumably
// start at zero -- confirm both.
3575 template<
typename MT3
3579 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3580 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3582 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds: i runs over A's rows, j over B's columns, k over A's columns.
3584 const size_t M( A.rows() );
3585 const size_t N( B.columns() );
3586 const size_t K( A.columns() );
// Column blocks of 8 SIMD vectors, processed one row i at a time.
3592 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3593 for(
size_t i=0UL; i<M; ++i ) {
3594 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3595 for(
size_t k=0UL; k<K; ++k ) {
3597 xmm1 = xmm1 + a1 * B.load(k,j );
3598 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3599 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3600 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3601 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3602 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3603 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3604 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
// Read-modify-write: add the scaled sums to what is already stored in C.
3606 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3607 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3608 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3609 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3610 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
3611 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
3612 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
3613 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
// Column blocks of 4 SIMD vectors; rows are handled in pairs (i, i+1) first.
3616 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3618 for( ; (i+2UL) <= M; i+=2UL ) {
3619 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3620 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to row i (a1), xmm5..xmm8 to row i+1 (a2).
3627 xmm1 = xmm1 + a1 * b1;
3628 xmm2 = xmm2 + a1 * b2;
3629 xmm3 = xmm3 + a1 * b3;
3630 xmm4 = xmm4 + a1 * b4;
3631 xmm5 = xmm5 + a2 * b1;
3632 xmm6 = xmm6 + a2 * b2;
3633 xmm7 = xmm7 + a2 * b3;
3634 xmm8 = xmm8 + a2 * b4;
3636 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3637 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
3638 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
3639 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
3640 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3641 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
3642 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
3643 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
// Single row for the same 4-vector column block (presumably the odd last row).
3647 for(
size_t k=0UL; k<K; ++k ) {
3649 xmm1 = xmm1 + a1 * B.load(k,j );
3650 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3651 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3652 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3654 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3655 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3656 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3657 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
// Column blocks of 2 SIMD vectors; again two rows per iteration first.
3660 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3662 for( ; (i+2UL) <= M; i+=2UL ) {
3664 for(
size_t k=0UL; k<K; ++k ) {
3669 xmm1 = xmm1 + a1 * b1;
3670 xmm2 = xmm2 + a1 * b2;
3671 xmm3 = xmm3 + a2 * b1;
3672 xmm4 = xmm4 + a2 * b2;
3674 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3675 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
3676 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3677 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
// Single row for the same 2-vector column block.
3681 for(
size_t k=0UL; k<K; ++k ) {
3683 xmm1 = xmm1 + a1 * B.load(k,j );
3684 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3686 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3687 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
// One SIMD vector of columns (presumably the final column block); each A
// element is broadcast with set() and multiplied with the loaded vector of B.
3692 for( ; (i+2UL) <= M; i+=2UL ) {
3694 for(
size_t k=0UL; k<K; ++k ) {
3696 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3697 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3699 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3700 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Single row for the single-vector column block.
3704 for(
size_t k=0UL; k<K; ++k ) {
3705 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3707 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Vectorized default kernel for the addition assignment of the scaled product
// to a target of type DenseMatrix<MT3,true>; enabled by
// UseVectorizedDefaultKernel. The kernel vectorizes along the row index i,
// accumulates the products over k in xmm*, and stores the current contents of
// C plus each accumulator multiplied by "factor".
// NOTE(review): several original lines are missing from this listing (braces,
// the declarations of i, j, a*, b* and factor). factor is presumably the
// broadcast scalar, and the default-constructed xmm* accumulators presumably
// start at zero -- confirm both.
3727 template<
typename MT3
3731 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3732 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3734 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds: i runs over A's rows, j over B's columns, k over A's columns.
3736 const size_t M( A.rows() );
3737 const size_t N( B.columns() );
3738 const size_t K( A.columns() );
// Row blocks of 8 SIMD vectors, processed one column j at a time.
3744 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3745 for(
size_t j=0UL; j<N; ++j ) {
3746 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3747 for(
size_t k=0UL; k<K; ++k ) {
3749 xmm1 = xmm1 + A.load(i ,k) * b1;
3750 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3751 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3752 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3753 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3754 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3755 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3756 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
// Read-modify-write: add the scaled sums to what is already stored in C.
3758 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3759 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3760 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3761 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3762 (~C).
store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
3763 (~C).
store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
3764 (~C).
store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
3765 (~C).
store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
// Row blocks of 4 SIMD vectors; columns are handled in pairs (j, j+1) first.
3768 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3770 for( ; (j+2UL) <= N; j+=2UL ) {
3771 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3772 for(
size_t k=0UL; k<K; ++k ) {
// xmm1..xmm4 belong to column j (b1), xmm5..xmm8 to column j+1 (b2).
3779 xmm1 = xmm1 + a1 * b1;
3780 xmm2 = xmm2 + a2 * b1;
3781 xmm3 = xmm3 + a3 * b1;
3782 xmm4 = xmm4 + a4 * b1;
3783 xmm5 = xmm5 + a1 * b2;
3784 xmm6 = xmm6 + a2 * b2;
3785 xmm7 = xmm7 + a3 * b2;
3786 xmm8 = xmm8 + a4 * b2;
3788 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3789 (~C).
store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
3790 (~C).
store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
3791 (~C).
store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
3792 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
3793 (~C).
store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
3794 (~C).
store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
3795 (~C).
store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
// Single column for the same 4-vector row block (presumably the odd last column).
3799 for(
size_t k=0UL; k<K; ++k ) {
3801 xmm1 = xmm1 + A.load(i ,k) * b1;
3802 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3803 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3804 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3806 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3807 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3808 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3809 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
// Row blocks of 2 SIMD vectors; again two columns per iteration first.
3812 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3814 for( ; (j+2UL) <= N; j+=2UL ) {
3816 for(
size_t k=0UL; k<K; ++k ) {
3821 xmm1 = xmm1 + a1 * b1;
3822 xmm2 = xmm2 + a2 * b1;
3823 xmm3 = xmm3 + a1 * b2;
3824 xmm4 = xmm4 + a2 * b2;
3826 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3827 (~C).
store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
3828 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
3829 (~C).
store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
// Single column for the same 2-vector row block.
3833 for(
size_t k=0UL; k<K; ++k ) {
3835 xmm1 = xmm1 + A.load(i ,k) * b1;
3836 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3838 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3839 (~C).
store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
// One SIMD vector of rows (presumably the final row block); each B element is
// broadcast with set() and multiplied with the loaded vector of A.
3844 for( ; (j+2UL) <= N; j+=2UL ) {
3846 for(
size_t k=0UL; k<K; ++k ) {
3848 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3849 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3851 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3852 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// Single column for the single-vector row block.
3856 for(
size_t k=0UL; k<K; ++k ) {
3857 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3859 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// BLAS-selection overload for the scaled addition assignment that is taken
// when UseDefaultKernel<MT3,MT4,MT5,ST2> holds: no BLAS routine is called, the
// work is forwarded unchanged to selectDefaultAddAssignKernel( C, A, B, scalar ).
3879 template<
typename MT3
3883 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3884 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3886 selectDefaultAddAssignKernel( C, A, B, scalar );
// Single-precision BLAS overload of the addition-assignment kernel selection.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
// Calls cblas_sgemm with alpha = scalar and beta = 1.0F, so the scaled product
// of A and B is accumulated onto the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
3905 template<
typename MT3
3909 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3910 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3912 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
3918 const int M ( numeric_cast<int>( A.rows() ) );
3919 const int N ( numeric_cast<int>( B.columns() ) );
3920 const int K ( numeric_cast<int>( A.columns() ) );
3921 const int lda( numeric_cast<int>( A.spacing() ) );
3922 const int ldb( numeric_cast<int>( B.spacing() ) );
3923 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans.
3925 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3926 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3927 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3928 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double-precision BLAS overload of the addition-assignment kernel selection.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
// Calls cblas_dgemm with alpha = scalar and beta = 1.0, so the scaled product
// of A and B is accumulated onto the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
3948 template<
typename MT3
3952 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3953 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3955 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
3961 const int M ( numeric_cast<int>( A.rows() ) );
3962 const int N ( numeric_cast<int>( B.columns() ) );
3963 const int K ( numeric_cast<int>( A.columns() ) );
3964 const int lda( numeric_cast<int>( A.spacing() ) );
3965 const int ldb( numeric_cast<int>( B.spacing() ) );
3966 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans.
3968 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3969 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3970 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3971 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single-precision complex BLAS overload of the addition-assignment kernel
// selection. Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true.
// Calls cblas_cgemm with alpha = scalar and beta = (1,0), so the scaled product
// of A and B is accumulated onto the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
3991 template<
typename MT3
3995 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3996 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3998 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
4007 const int M ( numeric_cast<int>( A.rows() ) );
4008 const int N ( numeric_cast<int>( B.columns() ) );
4009 const int K ( numeric_cast<int>( A.columns() ) );
4010 const int lda( numeric_cast<int>( A.spacing() ) );
4011 const int ldb( numeric_cast<int>( B.spacing() ) );
4012 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM call receives alpha and beta by address, hence the named
// complex<float> locals.
4013 const complex<float> alpha( scalar );
4014 const complex<float> beta ( 1.0F, 0.0F );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans.
4016 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4017 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4018 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4019 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS overload of the addition-assignment kernel
// selection. Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true.
// Calls cblas_zgemm with alpha = scalar and beta = (1,0), so the scaled product
// of A and B is accumulated onto the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
4039 template<
typename MT3
4043 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4044 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4046 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
4055 const int M ( numeric_cast<int>( A.rows() ) );
4056 const int N ( numeric_cast<int>( B.columns() ) );
4057 const int K ( numeric_cast<int>( A.columns() ) );
4058 const int lda( numeric_cast<int>( A.spacing() ) );
4059 const int ldb( numeric_cast<int>( B.spacing() ) );
4060 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM call receives alpha and beta by address, hence the named
// complex<double> locals.
4061 const complex<double> alpha( scalar );
4062 const complex<double> beta ( 1.0, 0.0 );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans.
4064 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4065 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4066 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4067 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled multiplication expression rhs to the
// dense matrix lhs.
// NOTE(review): several lines of this function are not visible in this listing
// (remaining template parameters, the body of the size guard, and the
// definitions of A and B).
4088 template<
typename MT3
4090 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix multiplication.
4097 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4098 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for degenerate sizes: no rows or columns in the target, or an inner
// dimension of zero. The guarded statement is not visible here; presumably
// an early return - TODO confirm.
4100 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the scalar of the expression.
// A and B are declared on lines not shown; presumably the (possibly
// evaluated) left and right operands - TODO confirm.
4114 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment, available when
// UseSMPAssignKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// It calls either selectDefaultSubAssignKernel() or selectBlasSubAssignKernel()
// with the same arguments.
// NOTE(review): the condition that chooses between the two calls is on lines
// not visible in this listing.
4129 template<
typename MT3
4133 static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
4134 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4137 DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
4139 DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
// Counterpart of the overload above: this selectSubAssignKernel() overload is
// available when UseSMPAssignKernel<MT3,MT4,MT5,ST2> is true (EnableIf).
// NOTE(review): the function body is not visible in this listing, so its
// behavior cannot be described from here.
4154 template<
typename MT3
4158 static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
4159 selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Default subtraction-assignment kernel used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// NOTE(review): the function body is not visible in this listing, so its
// behavior cannot be described from here.
4179 template<
typename MT3
4183 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4184 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction-assignment kernel for a target of type
// DenseMatrix<MT3,false>, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// is true.
// Every stage follows the same pattern: accumulate products of A and B over the
// inner index k in intrinsic accumulators (xmm1..xmm8), then write back
// C - xmm * factor through load()/store().
// The column index j advances in panels of 8, 4 and 2 intrinsic vectors
// (IT::size elements each); the last stage works on a single vector.
// NOTE(review): declarations of i, j, a1/a2, b1..b4 and factor, as well as
// several loop headers and braces, are on lines not visible in this listing.
// factor is presumably the broadcast scalar - TODO confirm.
4205 template<
typename MT3
4209 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4210 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4212 typedef IntrinsicTrait<ElementType> IT;
4214 const size_t M( A.rows() );
4215 const size_t N( B.columns() );
4216 const size_t K( A.columns() );
// Stage 1: panels of eight intrinsic vectors per row, one row of C at a time.
4222 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
4223 for(
size_t i=0UL; i<M; ++i ) {
4224 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4225 for(
size_t k=0UL; k<K; ++k ) {
4227 xmm1 = xmm1 + a1 * B.load(k,j );
4228 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4229 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4230 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4231 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
4232 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
4233 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
4234 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
4236 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4237 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4238 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4239 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
4240 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
4241 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
4242 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
4243 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
// Stage 2: panels of four intrinsic vectors, two rows of C per iteration
// (xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1).
4246 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
4248 for( ; (i+2UL) <= M; i+=2UL ) {
4249 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4250 for(
size_t k=0UL; k<K; ++k ) {
4257 xmm1 = xmm1 + a1 * b1;
4258 xmm2 = xmm2 + a1 * b2;
4259 xmm3 = xmm3 + a1 * b3;
4260 xmm4 = xmm4 + a1 * b4;
4261 xmm5 = xmm5 + a2 * b1;
4262 xmm6 = xmm6 + a2 * b2;
4263 xmm7 = xmm7 + a2 * b3;
4264 xmm8 = xmm8 + a2 * b4;
4266 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4267 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
4268 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
4269 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
4270 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
4271 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
4272 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
4273 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
// Single-row variant for the same four-vector panel. The guarding condition
// is not visible here; presumably it handles a leftover row - TODO confirm.
4277 for(
size_t k=0UL; k<K; ++k ) {
4279 xmm1 = xmm1 + a1 * B.load(k,j );
4280 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4281 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4282 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4284 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4285 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4286 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4287 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
// Stage 3: panels of two intrinsic vectors, two rows of C per iteration.
4290 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
4292 for( ; (i+2UL) <= M; i+=2UL ) {
4294 for(
size_t k=0UL; k<K; ++k ) {
4299 xmm1 = xmm1 + a1 * b1;
4300 xmm2 = xmm2 + a1 * b2;
4301 xmm3 = xmm3 + a2 * b1;
4302 xmm4 = xmm4 + a2 * b2;
4304 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4305 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
4306 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
4307 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
// Single-row variant for the same two-vector panel (guard not visible).
4311 for(
size_t k=0UL; k<K; ++k ) {
4313 xmm1 = xmm1 + a1 * B.load(k,j );
4314 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
4316 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4317 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
// Stage 4: a single intrinsic vector per row, two rows of C per iteration.
// Elements of A are broadcast with set(). The enclosing loop over j is not
// visible in this listing.
4322 for( ; (i+2UL) <= M; i+=2UL ) {
4324 for(
size_t k=0UL; k<K; ++k ) {
4326 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
4327 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
4329 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4330 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// Single-row variant for the single-vector stage (guard not visible).
4334 for(
size_t k=0UL; k<K; ++k ) {
4335 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
4337 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Vectorized default subtraction-assignment kernel for a target of type
// DenseMatrix<MT3,true>, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// is true.
// Mirror image of the DenseMatrix<MT3,false> overload: here the row index i
// advances in panels of 8, 4 and 2 intrinsic vectors (IT::size elements each)
// and the vectors are loaded from A, while elements of B are used as b1/b2 or
// broadcast with set().
// Every stage accumulates over the inner index k in xmm1..xmm8 and then writes
// back C - xmm * factor through load()/store().
// NOTE(review): declarations of i, j, a1..a4, b1/b2 and factor, as well as
// several loop headers and braces, are on lines not visible in this listing.
// factor is presumably the broadcast scalar - TODO confirm.
4357 template<
typename MT3
4361 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4362 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4364 typedef IntrinsicTrait<ElementType> IT;
4366 const size_t M( A.rows() );
4367 const size_t N( B.columns() );
4368 const size_t K( A.columns() );
// Stage 1: panels of eight intrinsic vectors per column, one column of C at a
// time.
4374 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4375 for(
size_t j=0UL; j<N; ++j ) {
4376 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4377 for(
size_t k=0UL; k<K; ++k ) {
4379 xmm1 = xmm1 + A.load(i ,k) * b1;
4380 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4381 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4382 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4383 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4384 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4385 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4386 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
4388 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4389 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4390 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4391 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4392 (~C).
store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
4393 (~C).
store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
4394 (~C).
store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
4395 (~C).
store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
// Stage 2: panels of four intrinsic vectors, two columns of C per iteration
// (xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1).
4398 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
4400 for( ; (j+2UL) <= N; j+=2UL ) {
4401 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4402 for(
size_t k=0UL; k<K; ++k ) {
4409 xmm1 = xmm1 + a1 * b1;
4410 xmm2 = xmm2 + a2 * b1;
4411 xmm3 = xmm3 + a3 * b1;
4412 xmm4 = xmm4 + a4 * b1;
4413 xmm5 = xmm5 + a1 * b2;
4414 xmm6 = xmm6 + a2 * b2;
4415 xmm7 = xmm7 + a3 * b2;
4416 xmm8 = xmm8 + a4 * b2;
4418 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4419 (~C).
store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
4420 (~C).
store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
4421 (~C).
store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
4422 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4423 (~C).
store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
4424 (~C).
store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
4425 (~C).
store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
// Single-column variant for the same four-vector panel. The guarding condition
// is not visible here; presumably it handles a leftover column - TODO confirm.
4429 for(
size_t k=0UL; k<K; ++k ) {
4431 xmm1 = xmm1 + A.load(i ,k) * b1;
4432 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4433 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4434 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4436 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4437 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4438 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4439 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
// Stage 3: panels of two intrinsic vectors, two columns of C per iteration.
4442 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
4444 for( ; (j+2UL) <= N; j+=2UL ) {
4446 for(
size_t k=0UL; k<K; ++k ) {
4451 xmm1 = xmm1 + a1 * b1;
4452 xmm2 = xmm2 + a2 * b1;
4453 xmm3 = xmm3 + a1 * b2;
4454 xmm4 = xmm4 + a2 * b2;
4456 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4457 (~C).
store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
4458 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4459 (~C).
store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
// Single-column variant for the same two-vector panel (guard not visible).
4463 for(
size_t k=0UL; k<K; ++k ) {
4465 xmm1 = xmm1 + A.load(i ,k) * b1;
4466 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4468 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4469 (~C).
store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
// Stage 4: a single intrinsic vector per column, two columns of C per
// iteration. Elements of B are broadcast with set(). The enclosing loop over i
// is not visible in this listing.
4474 for( ; (j+2UL) <= N; j+=2UL ) {
4476 for(
size_t k=0UL; k<K; ++k ) {
4478 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4479 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4481 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4482 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// Single-column variant for the single-vector stage (guard not visible).
4486 for(
size_t k=0UL; k<K; ++k ) {
4487 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4489 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Fallback overload of the BLAS subtraction-assignment kernel selection.
// It participates in overload resolution when UseDefaultKernel<MT3,MT4,MT5,ST2>
// is true and forwards C, A, B and scalar unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this listing.
4509 template<
typename MT3
4513 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4514 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4516 selectDefaultSubAssignKernel( C, A, B, scalar );
// Single-precision BLAS overload of the subtraction-assignment kernel selection.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
// Calls cblas_sgemm with alpha = -scalar and beta = 1.0F, so the scaled product
// of A and B is subtracted from the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
4535 template<
typename MT3
4539 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4540 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4542 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
4548 const int M ( numeric_cast<int>( A.rows() ) );
4549 const int N ( numeric_cast<int>( B.columns() ) );
4550 const int K ( numeric_cast<int>( A.columns() ) );
4551 const int lda( numeric_cast<int>( A.spacing() ) );
4552 const int ldb( numeric_cast<int>( B.spacing() ) );
4553 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans. The negated scalar turns the GEMM accumulation into a
// subtraction.
4555 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4556 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4557 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4558 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double-precision BLAS overload of the subtraction-assignment kernel selection.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
// Calls cblas_dgemm with alpha = -scalar and beta = 1.0, so the scaled product
// of A and B is subtracted from the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
4578 template<
typename MT3
4582 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4583 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4585 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
4591 const int M ( numeric_cast<int>( A.rows() ) );
4592 const int N ( numeric_cast<int>( B.columns() ) );
4593 const int K ( numeric_cast<int>( A.columns() ) );
4594 const int lda( numeric_cast<int>( A.spacing() ) );
4595 const int ldb( numeric_cast<int>( B.spacing() ) );
4596 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans. The negated scalar turns the GEMM accumulation into a
// subtraction.
4598 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4599 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4600 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4601 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single-precision complex BLAS overload of the subtraction-assignment kernel
// selection. Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true.
// Calls cblas_cgemm with alpha = -scalar and beta = (1,0), so the scaled product
// of A and B is subtracted from the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
4621 template<
typename MT3
4625 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4626 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4628 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
4637 const int M ( numeric_cast<int>( A.rows() ) );
4638 const int N ( numeric_cast<int>( B.columns() ) );
4639 const int K ( numeric_cast<int>( A.columns() ) );
4640 const int lda( numeric_cast<int>( A.spacing() ) );
4641 const int ldb( numeric_cast<int>( B.spacing() ) );
4642 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM call receives alpha and beta by address, hence the named
// complex<float> locals. alpha carries the negated scalar.
4643 const complex<float> alpha( -scalar );
4644 const complex<float> beta ( 1.0F, 0.0F );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans.
4646 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4647 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4648 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4649 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS overload of the subtraction-assignment kernel
// selection. Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true.
// Calls cblas_zgemm with alpha = -scalar and beta = (1,0), so the scaled product
// of A and B is subtracted from the existing contents of C.
// NOTE(review): lines between the signature and the first statement are not
// visible in this listing.
4669 template<
typename MT3
4673 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4674 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4676 using boost::numeric_cast;
// The CBLAS interface takes int arguments: dimensions and leading dimensions
// (taken from spacing()) are converted with numeric_cast.
4685 const int M ( numeric_cast<int>( A.rows() ) );
4686 const int N ( numeric_cast<int>( B.columns() ) );
4687 const int K ( numeric_cast<int>( A.columns() ) );
4688 const int lda( numeric_cast<int>( A.spacing() ) );
4689 const int ldb( numeric_cast<int>( B.spacing() ) );
4690 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM call receives alpha and beta by address, hence the named
// complex<double> locals. alpha carries the negated scalar.
4691 const complex<double> alpha( -scalar );
4692 const complex<double> beta ( 1.0, 0.0 );
// The storage order of the target C selects the CBLAS layout. For a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; otherwise A is CblasNoTrans and
// B is CblasTrans.
4694 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4695 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4696 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4697 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a TDMatDMatMultExpr<T1,T2> expression object.
// It throws std::invalid_argument with the message below; the condition that
// triggers the throw is on a line not visible in this listing (presumably a
// mismatch between the operands' inner dimensions - TODO confirm).
// NOTE(review): the function name and parameter list are also not visible
// here; presumably this is the multiplication operator - TODO confirm.
4766 template<
typename T1
4768 inline const TDMatDMatMultExpr<T1,T2>
4774 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression-trait typedef for multiplying a matrix product (MT1 * MT2) by a
// dense vector VT.
// Type is the TDMatDVecMultExprTrait of MT1 and the DMatDVecMultExprTrait<MT2,VT>
// result, i.e. the type obtained when MT2 is multiplied with the vector first,
// provided MT1 is a dense column-major matrix, MT2 a dense row-major matrix and
// VT a dense column vector. Otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in
// this listing.
4791 template<
typename MT1,
typename MT2,
typename VT >
4796 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4797 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4798 IsDenseVector<VT>::value && IsColumnVector<VT>::value
4799 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4800 , INVALID_TYPE >::Type Type;
// Expression-trait typedef for multiplying a matrix product (MT1 * MT2) by a
// sparse vector VT.
// Type is the TDMatDVecMultExprTrait of MT1 and the DMatSVecMultExprTrait<MT2,VT>
// result, i.e. the type obtained when MT2 is multiplied with the vector first,
// provided MT1 is a dense column-major matrix, MT2 a dense row-major matrix and
// VT a sparse column vector. Otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in
// this listing.
4809 template<
typename MT1,
typename MT2,
typename VT >
4814 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4815 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4816 IsSparseVector<VT>::value && IsColumnVector<VT>::value
4817 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4818 , INVALID_TYPE >::Type Type;
// Expression-trait typedef for multiplying a dense row vector VT by a matrix
// product (MT1 * MT2).
// Type is the TDVecDMatMultExprTrait of the TDVecTDMatMultExprTrait<VT,MT1>
// result and MT2, i.e. the type obtained when the vector is multiplied with MT1
// first, provided VT is a dense row vector, MT1 a dense column-major matrix and
// MT2 a dense row-major matrix. Otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in
// this listing.
4827 template<
typename VT,
typename MT1,
typename MT2 >
4832 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
4833 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4834 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4835 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4836 , INVALID_TYPE >::Type Type;
// Expression-trait typedef for multiplying a sparse row vector VT by a matrix
// product (MT1 * MT2).
// Type is the TDVecDMatMultExprTrait of the TSVecTDMatMultExprTrait<VT,MT1>
// result and MT2, i.e. the type obtained when the vector is multiplied with MT1
// first, provided VT is a sparse row vector, MT1 a dense column-major matrix and
// MT2 a dense row-major matrix. Otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header of this specialization is not visible in
// this listing.
4845 template<
typename VT,
typename MT1,
typename MT2 >
4850 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
4851 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4852 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4853 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4854 , INVALID_TYPE >::Type Type;
// Expression-trait typedef parameterized on MT1, MT2 and the flag AF.
// Type is the MultExprTrait of the SubmatrixExprTrait types of const MT1 and
// const MT2, both taken with the same AF flag.
// NOTE(review): the struct header is not visible in this listing; presumably
// this is the submatrix trait of the matrix product - TODO confirm.
4863 template<
typename MT1,
typename MT2,
bool AF >
4868 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
4869 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Expression-trait typedef parameterized on MT1 and MT2.
// Type is the MultExprTrait of the RowExprTrait type of const MT1 and MT2.
// NOTE(review): the struct header is not visible in this listing; presumably
// this is the row trait of the matrix product - TODO confirm.
4878 template<
typename MT1,
typename MT2 >
4883 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Expression-trait typedef parameterized on MT1 and MT2.
// Type is the MultExprTrait of MT1 and the ColumnExprTrait type of const MT2.
// NOTE(review): the struct header is not visible in this listing; presumably
// this is the column trait of the matrix product - TODO confirm.
4892 template<
typename MT1,
typename MT2 >
4897 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4579
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:340
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:4075
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:413
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:151
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:197
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
Header file for the sparse matrix SMP implementation.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:394
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2384
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:249
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:123
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:247
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:360
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:250
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:159
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:121
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:122
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:121
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:126
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:127
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:285
Header file for the multiplication trait.
Header file for the IsDouble type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:262
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:251
Header file for the dense matrix SMP implementation.
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold. This threshold represen...
Definition: Thresholds.h:459
Header file for the DenseMatrix base class.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:259
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:246
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:265
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2382
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:256
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:252
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:91
Header file for the IsNumeric type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:124
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:404
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:239
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:125
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:249
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:384
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:350
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:248
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:248
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2379
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:330
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:253
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:414
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:300
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:247
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
void store(float *address, const sse_float_t &value)
Aligned store of a vector of 'float' values.
Definition: Store.h:242
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:372
Header file for the FunctionTrace class.