22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
91 template<
typename MT1
99 typedef typename MT1::ResultType
RT1;
100 typedef typename MT2::ResultType
RT2;
101 typedef typename MT1::CompositeType
CT1;
102 typedef typename MT2::CompositeType
CT2;
110 template<
typename T1,
typename T2,
typename T3 >
111 struct UseSinglePrecisionKernel {
124 template<
typename T1,
typename T2,
typename T3 >
125 struct UseDoublePrecisionKernel {
// Compile-time trait for kernel selection: 'value' is nonzero only when the element
// types of T1, T2 and T3 are all exactly complex<float>. The cblas_cgemm-based kernels
// in this class use it as their EnableIf condition.
139 template<
typename T1,
typename T2,
typename T3 >
140 struct UseSinglePrecisionComplexKernel {
// Element type that all three operand types must share for the trait to match.
141 typedef complex<float> Type;
142 enum { value = IsSame<typename T1::ElementType,Type>::value &&
143 IsSame<typename T2::ElementType,Type>::value &&
144 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait for kernel selection: 'value' is nonzero only when the element
// types of T1, T2 and T3 are all exactly complex<double>. The cblas_zgemm-based kernels
// in this class use it as their EnableIf condition.
155 template<
typename T1,
typename T2,
typename T3 >
156 struct UseDoublePrecisionComplexKernel {
// Element type that all three operand types must share for the trait to match.
157 typedef complex<double> Type;
158 enum { value = IsSame<typename T1::ElementType,Type>::value &&
159 IsSame<typename T2::ElementType,Type>::value &&
160 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait that selects the non-BLAS fallback: 'value' is nonzero when
// BLAZE_BLAS_MODE evaluates to false, or when none of the four BLAS kernel traits
// (single/double precision, real/complex) matches the type triple T1, T2, T3.
170 template<
typename T1,
typename T2,
typename T3 >
171 struct UseDefaultKernel {
172 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
173 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
174 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
175 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time trait that selects the intrinsic-based default kernels. 'value' is
// nonzero only when all of the following hold:
//  - T1, T2 and T3 each report 'vectorizable',
//  - all three share the same ElementType,
//  - IntrinsicTrait for that element type reports both 'addition' and 'multiplication'.
185 template<
typename T1,
typename T2,
typename T3 >
186 struct UseVectorizedDefaultKernel {
187 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
188 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
189 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
190 IntrinsicTrait<typename T1::ElementType>::addition &&
191 IntrinsicTrait<typename T1::ElementType>::multiplication };
222 enum { vectorizable = 0 };
255 if(
lhs_.columns() != 0UL ) {
256 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
258 for(
size_t k=1UL; k<end; k+=2UL ) {
260 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
262 if( end <
lhs_.columns() ) {
290 return rhs_.columns();
320 template<
typename T >
342 template<
typename MT
349 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
352 else if( rhs.
lhs_.columns() == 0UL ) {
368 DMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
370 DMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Non-vectorized default kernel for plain assignment: computes C = A * B with scalar loops.
//   C  target matrix, every element (i,j) with i < A.rows(), j < B.columns() is overwritten
//   A  left-hand side operand
//   B  right-hand side operand
// NOTE(review): index 0 of the inner dimension is read unconditionally, so this kernel
// needs A.columns() >= 1. The calling assign() tests rhs.lhs_.columns() == 0UL before
// dispatching; confirm that branch never reaches this kernel.
389 template<
typename MT3
393 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
395 const size_t M( A.rows() );
396 const size_t N( B.columns() );
397 const size_t K( A.columns() );
399 for(
size_t i=0UL; i<M; ++i ) {
// First pass over row i: initialise each C(i,j) with the k == 0 product, so the
// previous contents of C are never read.
400 for(
size_t j=0UL; j<N; ++j ) {
401 C(i,j) = A(i,0UL) * B(0UL,j);
// Remaining passes: accumulate the products for k = 1 .. K-1 into row i of C.
403 for(
size_t k=1UL; k<K; ++k ) {
404 for(
size_t j=0UL; j<N; ++j ) {
405 C(i,j) += A(i,k) * B(k,j);
427 template<
typename MT3
430 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
431 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
433 typedef IntrinsicTrait<ElementType> IT;
435 const size_t M( A.rows() );
436 const size_t N( B.columns() );
437 const size_t K( A.columns() );
441 for( ; (i+2UL) <= M; i+=2UL ) {
443 for( ; (j+4UL) <= N; j+=4UL ) {
444 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
445 for(
size_t k=0UL; k<K; k+=IT::size ) {
452 xmm1 = xmm1 + a1 * b1;
453 xmm2 = xmm2 + a1 * b2;
454 xmm3 = xmm3 + a1 * b3;
455 xmm4 = xmm4 + a1 * b4;
456 xmm5 = xmm5 + a2 * b1;
457 xmm6 = xmm6 + a2 * b2;
458 xmm7 = xmm7 + a2 * b3;
459 xmm8 = xmm8 + a2 * b4;
461 (~C)(i ,j ) =
sum( xmm1 );
462 (~C)(i ,j+1UL) =
sum( xmm2 );
463 (~C)(i ,j+2UL) =
sum( xmm3 );
464 (~C)(i ,j+3UL) =
sum( xmm4 );
465 (~C)(i+1UL,j ) =
sum( xmm5 );
466 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
467 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
468 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
470 for( ; (j+2UL) <= N; j+=2UL ) {
472 for(
size_t k=0UL; k<K; k+=IT::size ) {
477 xmm1 = xmm1 + a1 * b1;
478 xmm2 = xmm2 + a1 * b2;
479 xmm3 = xmm3 + a2 * b1;
480 xmm4 = xmm4 + a2 * b2;
482 (~C)(i ,j ) =
sum( xmm1 );
483 (~C)(i ,j+1UL) =
sum( xmm2 );
484 (~C)(i+1UL,j ) =
sum( xmm3 );
485 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
489 for(
size_t k=0UL; k<K; k+=IT::size ) {
491 xmm1 = xmm1 + A.get(i ,k) * b1;
492 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
494 (~C)(i ,j) =
sum( xmm1 );
495 (~C)(i+1UL,j) =
sum( xmm2 );
500 for( ; (j+4UL) <= N; j+=4UL ) {
502 for(
size_t k=0UL; k<K; k+=IT::size ) {
504 xmm1 = xmm1 + a1 * B.get(k,j );
505 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
506 xmm3 = xmm3 + a1 * B.get(k,j+2UL);
507 xmm4 = xmm4 + a1 * B.get(k,j+3UL);
509 (~C)(i,j ) =
sum( xmm1 );
510 (~C)(i,j+1UL) =
sum( xmm2 );
511 (~C)(i,j+2UL) =
sum( xmm3 );
512 (~C)(i,j+3UL) =
sum( xmm4 );
514 for( ; (j+2UL) <= N; j+=2UL ) {
516 for(
size_t k=0UL; k<K; k+=IT::size ) {
518 xmm1 = xmm1 + a1 * B.get(k,j );
519 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
521 (~C)(i,j ) =
sum( xmm1 );
522 (~C)(i,j+1UL) =
sum( xmm2 );
526 for(
size_t k=0UL; k<K; k+=IT::size ) {
527 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
529 (~C)(i,j) =
sum( xmm1 );
550 template<
typename MT3
553 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
554 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
556 typedef IntrinsicTrait<ElementType> IT;
558 const size_t M( A.rows() );
559 const size_t N( B.columns() );
560 const size_t K( A.columns() );
564 for( ; (i+4UL) <= M; i+=4UL ) {
566 for( ; (j+2UL) <= N; j+=2UL ) {
567 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
568 for(
size_t k=0UL; k<K; k+=IT::size ) {
575 xmm1 = xmm1 + a1 * b1;
576 xmm2 = xmm2 + a1 * b2;
577 xmm3 = xmm3 + a2 * b1;
578 xmm4 = xmm4 + a2 * b2;
579 xmm5 = xmm5 + a3 * b1;
580 xmm6 = xmm6 + a3 * b2;
581 xmm7 = xmm7 + a4 * b1;
582 xmm8 = xmm8 + a4 * b2;
584 (~C)(i ,j ) =
sum( xmm1 );
585 (~C)(i ,j+1UL) =
sum( xmm2 );
586 (~C)(i+1UL,j ) =
sum( xmm3 );
587 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
588 (~C)(i+2UL,j ) =
sum( xmm5 );
589 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
590 (~C)(i+3UL,j ) =
sum( xmm7 );
591 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
595 for(
size_t k=0UL; k<K; k+=IT::size ) {
597 xmm1 = xmm1 + A.get(i ,k) * b1;
598 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
599 xmm3 = xmm3 + A.get(i+2UL,k) * b1;
600 xmm4 = xmm4 + A.get(i+3UL,k) * b1;
602 (~C)(i ,j) =
sum( xmm1 );
603 (~C)(i+1UL,j) =
sum( xmm2 );
604 (~C)(i+2UL,j) =
sum( xmm3 );
605 (~C)(i+3UL,j) =
sum( xmm4 );
608 for( ; (i+2UL) <= M; i+=2UL ) {
610 for( ; (j+2UL) <= N; j+=2UL ) {
612 for(
size_t k=0UL; k<K; k+=IT::size ) {
617 xmm1 = xmm1 + a1 * b1;
618 xmm2 = xmm2 + a1 * b2;
619 xmm3 = xmm3 + a2 * b1;
620 xmm4 = xmm4 + a2 * b2;
622 (~C)(i ,j ) =
sum( xmm1 );
623 (~C)(i ,j+1UL) =
sum( xmm2 );
624 (~C)(i+1UL,j ) =
sum( xmm3 );
625 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
629 for(
size_t k=0UL; k<K; k+=IT::size ) {
631 xmm1 = xmm1 + A.get(i ,k) * b1;
632 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
634 (~C)(i ,j) =
sum( xmm1 );
635 (~C)(i+1UL,j) =
sum( xmm2 );
640 for( ; (j+2UL) <= N; j+=2UL ) {
642 for(
size_t k=0UL; k<K; k+=IT::size ) {
644 xmm1 = xmm1 + a1 * B.get(k,j );
645 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
647 (~C)(i,j ) =
sum( xmm1 );
648 (~C)(i,j+1UL) =
sum( xmm2 );
652 for(
size_t k=0UL; k<K; k+=IT::size ) {
653 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
655 (~C)(i,j) =
sum( xmm1 );
// Fallback overload of the BLAS assignment kernel, enabled through EnableIf on
// UseDefaultKernel<MT3,MT4,MT5>. It performs no BLAS call and simply forwards C, A and B
// to selectDefaultAssignKernel.
676 template<
typename MT3
679 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
680 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
682 selectDefaultAssignKernel( C, A, B );
// Single precision BLAS kernel for plain assignment, enabled through EnableIf on
// UseSinglePrecisionKernel<MT3,MT4,MT5>. Computes C = A * B via cblas_sgemm.
//   C  target matrix; overwritten, since beta is 0.0F
//   A  left-hand side operand
//   B  right-hand side operand
702 template<
typename MT3
705 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
706 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
708 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
714 const int M ( numeric_cast<int>( A.rows() ) );
715 const int N ( numeric_cast<int>( B.columns() ) );
716 const int K ( numeric_cast<int>( A.columns() ) );
717 const int lda( numeric_cast<int>( A.spacing() ) );
718 const int ldb( numeric_cast<int>( B.spacing() ) );
719 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// NOTE(review): presumably A is stored row-major and B column-major - confirm against
// the operand types of this expression.
// alpha = 1.0F and beta = 0.0F give C = A * B.
721 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
722 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
723 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
724 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// Double precision BLAS kernel for plain assignment, enabled through EnableIf on
// UseDoublePrecisionKernel<MT3,MT4,MT5>. Computes C = A * B via cblas_dgemm.
//   C  target matrix; overwritten, since beta is 0.0
//   A  left-hand side operand
//   B  right-hand side operand
745 template<
typename MT3
748 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
749 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
751 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
757 const int M ( numeric_cast<int>( A.rows() ) );
758 const int N ( numeric_cast<int>( B.columns() ) );
759 const int K ( numeric_cast<int>( A.columns() ) );
760 const int lda( numeric_cast<int>( A.spacing() ) );
761 const int ldb( numeric_cast<int>( B.spacing() ) );
762 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// NOTE(review): presumably A is stored row-major and B column-major - confirm against
// the operand types of this expression.
// alpha = 1.0 and beta = 0.0 give C = A * B.
764 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
765 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
766 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
767 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// Single precision complex BLAS kernel for plain assignment, enabled through EnableIf on
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5>. Computes C = A * B via cblas_cgemm.
//   C  target matrix; overwritten, since beta is (0,0)
//   A  left-hand side operand
//   B  right-hand side operand
788 template<
typename MT3
791 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
792 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
794 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
803 const int M ( numeric_cast<int>( A.rows() ) );
804 const int N ( numeric_cast<int>( B.columns() ) );
805 const int K ( numeric_cast<int>( A.columns() ) );
806 const int lda( numeric_cast<int>( A.spacing() ) );
807 const int ldb( numeric_cast<int>( B.spacing() ) );
808 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives its scaling factors by address, so they need named
// objects: alpha = (1,0), beta = (0,0), i.e. C = A * B.
809 const complex<float> alpha( 1.0F, 0.0F );
810 const complex<float> beta ( 0.0F, 0.0F );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// NOTE(review): presumably A is stored row-major and B column-major - confirm against
// the operand types of this expression.
812 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
813 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
814 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
815 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS kernel for plain assignment, enabled through EnableIf on
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5>. Computes C = A * B via cblas_zgemm.
//   C  target matrix; overwritten, since beta is (0,0)
//   A  left-hand side operand
//   B  right-hand side operand
836 template<
typename MT3
839 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
840 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
842 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
851 const int M ( numeric_cast<int>( A.rows() ) );
852 const int N ( numeric_cast<int>( B.columns() ) );
853 const int K ( numeric_cast<int>( A.columns() ) );
854 const int lda( numeric_cast<int>( A.spacing() ) );
855 const int ldb( numeric_cast<int>( B.spacing() ) );
856 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives its scaling factors by address, so they need named
// objects: alpha = (1,0), beta = (0,0), i.e. C = A * B.
857 const complex<double> alpha( 1.0, 0.0 );
858 const complex<double> beta ( 0.0, 0.0 );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// NOTE(review): presumably A is stored row-major and B column-major - confirm against
// the operand types of this expression.
860 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
861 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
862 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
863 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
881 template<
typename MT
885 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
897 const TmpType tmp( rhs );
916 template<
typename MT
923 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
938 DMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
940 DMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Non-vectorized default kernel for addition assignment: computes C += A * B with scalar
// loops. Disabled (DisableIf) whenever UseVectorizedDefaultKernel<MT3,MT4,MT5> matches.
//   C  target matrix, accumulated into
//   A  left-hand side operand
//   B  right-hand side operand
959 template<
typename MT3
962 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
963 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
965 const size_t M( A.rows() );
966 const size_t N( B.columns() );
967 const size_t K( A.columns() );
// N with its lowest bit cleared, i.e. the largest even number <= N. Columns [0,end) are
// processed two at a time.
970 const size_t end( N &
size_t(-2) );
972 for(
size_t i=0UL; i<M; ++i ) {
973 for(
size_t k=0UL; k<K; ++k ) {
// Two-way unrolled update of columns j and j+1.
974 for(
size_t j=0UL; j<end; j+=2UL ) {
975 C(i,j ) += A(i,k) * B(k,j );
976 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Tail column for odd N.
// NOTE(review): the guard for this statement is not visible in this excerpt
// (presumably 'end < N') - confirm it exists, since column 'end' is out of range
// when N is even.
979 C(i,end) += A(i,k) * B(k,end);
1001 template<
typename MT3
1004 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1005 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1007 typedef IntrinsicTrait<ElementType> IT;
1009 const size_t M( A.rows() );
1010 const size_t N( B.columns() );
1011 const size_t K( A.columns() );
1015 for( ; (i+2UL) <= M; i+=2UL ) {
1017 for( ; (j+4UL) <= N; j+=4UL ) {
1018 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1019 for(
size_t k=0UL; k<K; k+=IT::size ) {
1026 xmm1 = xmm1 + a1 * b1;
1027 xmm2 = xmm2 + a1 * b2;
1028 xmm3 = xmm3 + a1 * b3;
1029 xmm4 = xmm4 + a1 * b4;
1030 xmm5 = xmm5 + a2 * b1;
1031 xmm6 = xmm6 + a2 * b2;
1032 xmm7 = xmm7 + a2 * b3;
1033 xmm8 = xmm8 + a2 * b4;
1035 (~C)(i ,j ) +=
sum( xmm1 );
1036 (~C)(i ,j+1UL) +=
sum( xmm2 );
1037 (~C)(i ,j+2UL) +=
sum( xmm3 );
1038 (~C)(i ,j+3UL) +=
sum( xmm4 );
1039 (~C)(i+1UL,j ) +=
sum( xmm5 );
1040 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
1041 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
1042 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
1044 for( ; (j+2UL) <= N; j+=2UL ) {
1046 for(
size_t k=0UL; k<K; k+=IT::size ) {
1051 xmm1 = xmm1 + a1 * b1;
1052 xmm2 = xmm2 + a1 * b2;
1053 xmm3 = xmm3 + a2 * b1;
1054 xmm4 = xmm4 + a2 * b2;
1056 (~C)(i ,j ) +=
sum( xmm1 );
1057 (~C)(i ,j+1UL) +=
sum( xmm2 );
1058 (~C)(i+1UL,j ) +=
sum( xmm3 );
1059 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
1063 for(
size_t k=0UL; k<K; k+=IT::size ) {
1065 xmm1 = xmm1 + A.get(i ,k) * b1;
1066 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1068 (~C)(i ,j) +=
sum( xmm1 );
1069 (~C)(i+1UL,j) +=
sum( xmm2 );
1074 for( ; (j+4UL) <= N; j+=4UL ) {
1076 for(
size_t k=0UL; k<K; k+=IT::size ) {
1078 xmm1 = xmm1 + a1 * B.get(k,j );
1079 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1080 xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1081 xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1083 (~C)(i,j ) +=
sum( xmm1 );
1084 (~C)(i,j+1UL) +=
sum( xmm2 );
1085 (~C)(i,j+2UL) +=
sum( xmm3 );
1086 (~C)(i,j+3UL) +=
sum( xmm4 );
1088 for( ; (j+2UL) <= N; j+=2UL ) {
1090 for(
size_t k=0UL; k<K; k+=IT::size ) {
1092 xmm1 = xmm1 + a1 * B.get(k,j );
1093 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1095 (~C)(i,j ) +=
sum( xmm1 );
1096 (~C)(i,j+1UL) +=
sum( xmm2 );
1100 for(
size_t k=0UL; k<K; k+=IT::size ) {
1101 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1103 (~C)(i,j) +=
sum( xmm1 );
1124 template<
typename MT3
1127 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1128 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1130 typedef IntrinsicTrait<ElementType> IT;
1132 const size_t M( A.rows() );
1133 const size_t N( B.columns() );
1134 const size_t K( A.columns() );
1138 for( ; (i+4UL) <= M; i+=4UL ) {
1140 for( ; (j+2UL) <= N; j+=2UL ) {
1141 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1142 for(
size_t k=0UL; k<K; k+=IT::size ) {
1149 xmm1 = xmm1 + a1 * b1;
1150 xmm2 = xmm2 + a1 * b2;
1151 xmm3 = xmm3 + a2 * b1;
1152 xmm4 = xmm4 + a2 * b2;
1153 xmm5 = xmm5 + a3 * b1;
1154 xmm6 = xmm6 + a3 * b2;
1155 xmm7 = xmm7 + a4 * b1;
1156 xmm8 = xmm8 + a4 * b2;
1158 (~C)(i ,j ) +=
sum( xmm1 );
1159 (~C)(i ,j+1UL) +=
sum( xmm2 );
1160 (~C)(i+1UL,j ) +=
sum( xmm3 );
1161 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
1162 (~C)(i+2UL,j ) +=
sum( xmm5 );
1163 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
1164 (~C)(i+3UL,j ) +=
sum( xmm7 );
1165 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
1169 for(
size_t k=0UL; k<K; k+=IT::size ) {
1171 xmm1 = xmm1 + A.get(i ,k) * b1;
1172 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1173 xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1174 xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1176 (~C)(i ,j) +=
sum( xmm1 );
1177 (~C)(i+1UL,j) +=
sum( xmm2 );
1178 (~C)(i+2UL,j) +=
sum( xmm3 );
1179 (~C)(i+3UL,j) +=
sum( xmm4 );
1182 for( ; (i+2UL) <= M; i+=2UL ) {
1184 for( ; (j+2UL) <= N; j+=2UL ) {
1186 for(
size_t k=0UL; k<K; k+=IT::size ) {
1191 xmm1 = xmm1 + a1 * b1;
1192 xmm2 = xmm2 + a1 * b2;
1193 xmm3 = xmm3 + a2 * b1;
1194 xmm4 = xmm4 + a2 * b2;
1196 (~C)(i ,j ) +=
sum( xmm1 );
1197 (~C)(i ,j+1UL) +=
sum( xmm2 );
1198 (~C)(i+1UL,j ) +=
sum( xmm3 );
1199 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
1203 for(
size_t k=0UL; k<K; k+=IT::size ) {
1205 xmm1 = xmm1 + A.get(i ,k) * b1;
1206 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1208 (~C)(i ,j) +=
sum( xmm1 );
1209 (~C)(i+1UL,j) +=
sum( xmm2 );
1214 for( ; (j+2UL) <= N; j+=2UL ) {
1216 for(
size_t k=0UL; k<K; k+=IT::size ) {
1218 xmm1 = xmm1 + a1 * B.get(k,j );
1219 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1221 (~C)(i,j ) +=
sum( xmm1 );
1222 (~C)(i,j+1UL) +=
sum( xmm2 );
1226 for(
size_t k=0UL; k<K; k+=IT::size ) {
1227 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1229 (~C)(i,j) +=
sum( xmm1 );
// Fallback overload of the BLAS addition assignment kernel, enabled through EnableIf on
// UseDefaultKernel<MT3,MT4,MT5>. It performs no BLAS call and simply forwards C, A and B
// to selectDefaultAddAssignKernel.
1250 template<
typename MT3
1253 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1254 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1256 selectDefaultAddAssignKernel( C, A, B );
// Single precision BLAS kernel for addition assignment, enabled through EnableIf on
// UseSinglePrecisionKernel<MT3,MT4,MT5>. Computes C += A * B via cblas_sgemm.
//   C  target matrix; accumulated into, since beta is 1.0F
//   A  left-hand side operand
//   B  right-hand side operand
1276 template<
typename MT3
1279 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1280 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1282 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1288 const int M ( numeric_cast<int>( A.rows() ) );
1289 const int N ( numeric_cast<int>( B.columns() ) );
1290 const int K ( numeric_cast<int>( A.columns() ) );
1291 const int lda( numeric_cast<int>( A.spacing() ) );
1292 const int ldb( numeric_cast<int>( B.spacing() ) );
1293 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// alpha = 1.0F and beta = 1.0F give C = A * B + C.
1295 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1296 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1297 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1298 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS kernel for addition assignment, enabled through EnableIf on
// UseDoublePrecisionKernel<MT3,MT4,MT5>. Computes C += A * B via cblas_dgemm.
//   C  target matrix; accumulated into, since beta is 1.0
//   A  left-hand side operand
//   B  right-hand side operand
1319 template<
typename MT3
1322 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1323 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1325 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1331 const int M ( numeric_cast<int>( A.rows() ) );
1332 const int N ( numeric_cast<int>( B.columns() ) );
1333 const int K ( numeric_cast<int>( A.columns() ) );
1334 const int lda( numeric_cast<int>( A.spacing() ) );
1335 const int ldb( numeric_cast<int>( B.spacing() ) );
1336 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// alpha = 1.0 and beta = 1.0 give C = A * B + C.
1338 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1339 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1340 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1341 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS kernel for addition assignment, enabled through EnableIf
// on UseSinglePrecisionComplexKernel<MT3,MT4,MT5>. Computes C += A * B via cblas_cgemm.
//   C  target matrix; accumulated into, since beta is (1,0)
//   A  left-hand side operand
//   B  right-hand side operand
1362 template<
typename MT3
1365 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1366 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1368 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1377 const int M ( numeric_cast<int>( A.rows() ) );
1378 const int N ( numeric_cast<int>( B.columns() ) );
1379 const int K ( numeric_cast<int>( A.columns() ) );
1380 const int lda( numeric_cast<int>( A.spacing() ) );
1381 const int ldb( numeric_cast<int>( B.spacing() ) );
1382 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives its scaling factors by address, so they need named
// objects: alpha = (1,0), beta = (1,0), i.e. C = A * B + C.
1383 const complex<float> alpha( 1.0F, 0.0F );
1384 const complex<float> beta ( 1.0F, 0.0F );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
1386 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1387 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1388 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1389 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS kernel for addition assignment, enabled through EnableIf
// on UseDoublePrecisionComplexKernel<MT3,MT4,MT5>. Computes C += A * B via cblas_zgemm.
//   C  target matrix; accumulated into, since beta is (1,0)
//   A  left-hand side operand
//   B  right-hand side operand
1410 template<
typename MT3
1413 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1414 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1416 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1425 const int M ( numeric_cast<int>( A.rows() ) );
1426 const int N ( numeric_cast<int>( B.columns() ) );
1427 const int K ( numeric_cast<int>( A.columns() ) );
1428 const int lda( numeric_cast<int>( A.spacing() ) );
1429 const int ldb( numeric_cast<int>( B.spacing() ) );
1430 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives its scaling factors by address, so they need named
// objects: alpha = (1,0), beta = (1,0), i.e. C = A * B + C.
1431 const complex<double> alpha( 1.0, 0.0 );
1432 const complex<double> beta ( 1.0, 0.0 );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
1434 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1435 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1436 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1437 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1460 template<
typename MT
1467 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1482 DMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1484 DMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Non-vectorized default kernel for subtraction assignment: computes C -= A * B with
// scalar loops. Disabled (DisableIf) whenever UseVectorizedDefaultKernel<MT3,MT4,MT5>
// matches.
//   C  target matrix, subtracted from
//   A  left-hand side operand
//   B  right-hand side operand
1503 template<
typename MT3
1506 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1507 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1509 const size_t M( A.rows() );
1510 const size_t N( B.columns() );
1511 const size_t K( A.columns() );
// N with its lowest bit cleared, i.e. the largest even number <= N. Columns [0,end) are
// processed two at a time.
1514 const size_t end( N &
size_t(-2) );
1516 for(
size_t i=0UL; i<M; ++i ) {
1517 for(
size_t k=0UL; k<K; ++k ) {
// Two-way unrolled update of columns j and j+1.
1518 for(
size_t j=0UL; j<end; j+=2UL ) {
1519 C(i,j ) -= A(i,k) * B(k,j );
1520 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Tail column for odd N.
// NOTE(review): the guard for this statement is not visible in this excerpt
// (presumably 'end < N') - confirm it exists, since column 'end' is out of range
// when N is even.
1523 C(i,end) -= A(i,k) * B(k,end);
1545 template<
typename MT3
1548 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1549 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1551 typedef IntrinsicTrait<ElementType> IT;
1553 const size_t M( A.rows() );
1554 const size_t N( B.columns() );
1555 const size_t K( A.columns() );
1559 for( ; (i+2UL) <= M; i+=2UL ) {
1561 for( ; (j+4UL) <= N; j+=4UL ) {
1562 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1563 for(
size_t k=0UL; k<K; k+=IT::size ) {
1570 xmm1 = xmm1 + a1 * b1;
1571 xmm2 = xmm2 + a1 * b2;
1572 xmm3 = xmm3 + a1 * b3;
1573 xmm4 = xmm4 + a1 * b4;
1574 xmm5 = xmm5 + a2 * b1;
1575 xmm6 = xmm6 + a2 * b2;
1576 xmm7 = xmm7 + a2 * b3;
1577 xmm8 = xmm8 + a2 * b4;
1579 (~C)(i ,j ) -=
sum( xmm1 );
1580 (~C)(i ,j+1UL) -=
sum( xmm2 );
1581 (~C)(i ,j+2UL) -=
sum( xmm3 );
1582 (~C)(i ,j+3UL) -=
sum( xmm4 );
1583 (~C)(i+1UL,j ) -=
sum( xmm5 );
1584 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
1585 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
1586 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
1588 for( ; (j+2UL) <= N; j+=2UL ) {
1590 for(
size_t k=0UL; k<K; k+=IT::size ) {
1595 xmm1 = xmm1 + a1 * b1;
1596 xmm2 = xmm2 + a1 * b2;
1597 xmm3 = xmm3 + a2 * b1;
1598 xmm4 = xmm4 + a2 * b2;
1600 (~C)(i ,j ) -=
sum( xmm1 );
1601 (~C)(i ,j+1UL) -=
sum( xmm2 );
1602 (~C)(i+1UL,j ) -=
sum( xmm3 );
1603 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
1607 for(
size_t k=0UL; k<K; k+=IT::size ) {
1609 xmm1 = xmm1 + A.get(i ,k) * b1;
1610 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1612 (~C)(i ,j) -=
sum( xmm1 );
1613 (~C)(i+1UL,j) -=
sum( xmm2 );
1618 for( ; (j+4UL) <= N; j+=4UL ) {
1620 for(
size_t k=0UL; k<K; k+=IT::size ) {
1622 xmm1 = xmm1 + a1 * B.get(k,j );
1623 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1624 xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1625 xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1627 (~C)(i,j ) -=
sum( xmm1 );
1628 (~C)(i,j+1UL) -=
sum( xmm2 );
1629 (~C)(i,j+2UL) -=
sum( xmm3 );
1630 (~C)(i,j+3UL) -=
sum( xmm4 );
1632 for( ; (j+2UL) <= N; j+=2UL ) {
1634 for(
size_t k=0UL; k<K; k+=IT::size ) {
1636 xmm1 = xmm1 + a1 * B.get(k,j );
1637 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1639 (~C)(i,j ) -=
sum( xmm1 );
1640 (~C)(i,j+1UL) -=
sum( xmm2 );
1644 for(
size_t k=0UL; k<K; k+=IT::size ) {
1645 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1647 (~C)(i,j) -=
sum( xmm1 );
1668 template<
typename MT3
1671 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1672 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1674 typedef IntrinsicTrait<ElementType> IT;
1676 const size_t M( A.rows() );
1677 const size_t N( B.columns() );
1678 const size_t K( A.columns() );
1682 for( ; (i+4UL) <= M; i+=4UL ) {
1684 for( ; (j+2UL) <= N; j+=2UL ) {
1685 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1686 for(
size_t k=0UL; k<K; k+=IT::size ) {
1693 xmm1 = xmm1 + a1 * b1;
1694 xmm2 = xmm2 + a1 * b2;
1695 xmm3 = xmm3 + a2 * b1;
1696 xmm4 = xmm4 + a2 * b2;
1697 xmm5 = xmm5 + a3 * b1;
1698 xmm6 = xmm6 + a3 * b2;
1699 xmm7 = xmm7 + a4 * b1;
1700 xmm8 = xmm8 + a4 * b2;
1702 (~C)(i ,j ) -=
sum( xmm1 );
1703 (~C)(i ,j+1UL) -=
sum( xmm2 );
1704 (~C)(i+1UL,j ) -=
sum( xmm3 );
1705 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
1706 (~C)(i+2UL,j ) -=
sum( xmm5 );
1707 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
1708 (~C)(i+3UL,j ) -=
sum( xmm7 );
1709 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
1713 for(
size_t k=0UL; k<K; k+=IT::size ) {
1715 xmm1 = xmm1 + A.get(i ,k) * b1;
1716 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1717 xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1718 xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1720 (~C)(i ,j) -=
sum( xmm1 );
1721 (~C)(i+1UL,j) -=
sum( xmm2 );
1722 (~C)(i+2UL,j) -=
sum( xmm3 );
1723 (~C)(i+3UL,j) -=
sum( xmm4 );
1726 for( ; (i+2UL) <= M; i+=2UL ) {
1728 for( ; (j+2UL) <= N; j+=2UL ) {
1730 for(
size_t k=0UL; k<K; k+=IT::size ) {
1735 xmm1 = xmm1 + a1 * b1;
1736 xmm2 = xmm2 + a1 * b2;
1737 xmm3 = xmm3 + a2 * b1;
1738 xmm4 = xmm4 + a2 * b2;
1740 (~C)(i ,j ) -=
sum( xmm1 );
1741 (~C)(i ,j+1UL) -=
sum( xmm2 );
1742 (~C)(i+1UL,j ) -=
sum( xmm3 );
1743 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
1747 for(
size_t k=0UL; k<K; k+=IT::size ) {
1749 xmm1 = xmm1 + A.get(i ,k) * b1;
1750 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1752 (~C)(i ,j) -=
sum( xmm1 );
1753 (~C)(i+1UL,j) -=
sum( xmm2 );
1758 for( ; (j+2UL) <= N; j+=2UL ) {
1760 for(
size_t k=0UL; k<K; k+=IT::size ) {
1762 xmm1 = xmm1 + a1 * B.get(k,j );
1763 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1765 (~C)(i,j ) -=
sum( xmm1 );
1766 (~C)(i,j+1UL) -=
sum( xmm2 );
1770 for(
size_t k=0UL; k<K; k+=IT::size ) {
1771 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1773 (~C)(i,j) -=
sum( xmm1 );
// Fallback overload of the BLAS subtraction assignment kernel, enabled through EnableIf
// on UseDefaultKernel<MT3,MT4,MT5>. It performs no BLAS call and simply forwards C, A and
// B to selectDefaultSubAssignKernel.
1794 template<
typename MT3
1797 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1798 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1800 selectDefaultSubAssignKernel( C, A, B );
// Single precision BLAS kernel for subtraction assignment, enabled through EnableIf on
// UseSinglePrecisionKernel<MT3,MT4,MT5>. Computes C -= A * B via cblas_sgemm.
//   C  target matrix; kept and subtracted from, since beta is 1.0F and alpha is -1.0F
//   A  left-hand side operand
//   B  right-hand side operand
1820 template<
typename MT3
1823 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1824 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1826 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1832 const int M ( numeric_cast<int>( A.rows() ) );
1833 const int N ( numeric_cast<int>( B.columns() ) );
1834 const int K ( numeric_cast<int>( A.columns() ) );
1835 const int lda( numeric_cast<int>( A.spacing() ) );
1836 const int ldb( numeric_cast<int>( B.spacing() ) );
1837 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// alpha = -1.0F and beta = 1.0F give C = C - A * B.
1839 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1840 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1841 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1842 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS kernel for subtraction assignment, enabled through EnableIf on
// UseDoublePrecisionKernel<MT3,MT4,MT5>. Computes C -= A * B via cblas_dgemm.
//   C  target matrix; kept and subtracted from, since beta is 1.0 and alpha is -1.0
//   A  left-hand side operand
//   B  right-hand side operand
1863 template<
typename MT3
1866 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1867 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1869 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1875 const int M ( numeric_cast<int>( A.rows() ) );
1876 const int N ( numeric_cast<int>( B.columns() ) );
1877 const int K ( numeric_cast<int>( A.columns() ) );
1878 const int lda( numeric_cast<int>( A.spacing() ) );
1879 const int ldb( numeric_cast<int>( B.spacing() ) );
1880 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
// alpha = -1.0 and beta = 1.0 give C = C - A * B.
1882 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1883 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1884 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1885 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS kernel for subtraction assignment, enabled through
// EnableIf on UseSinglePrecisionComplexKernel<MT3,MT4,MT5>. Computes C -= A * B via
// cblas_cgemm.
//   C  target matrix; kept and subtracted from, since beta is (1,0) and alpha is (-1,0)
//   A  left-hand side operand
//   B  right-hand side operand
1906 template<
typename MT3
1909 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1910 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1912 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1921 const int M ( numeric_cast<int>( A.rows() ) );
1922 const int N ( numeric_cast<int>( B.columns() ) );
1923 const int K ( numeric_cast<int>( A.columns() ) );
1924 const int lda( numeric_cast<int>( A.spacing() ) );
1925 const int ldb( numeric_cast<int>( B.spacing() ) );
1926 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives its scaling factors by address, so they need named
// objects: alpha = (-1,0), beta = (1,0), i.e. C = C - A * B.
1927 const complex<float> alpha( -1.0F, 0.0F );
1928 const complex<float> beta ( 1.0F, 0.0F );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
1930 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1931 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1932 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1933 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS kernel for subtraction assignment, enabled through
// EnableIf on UseDoublePrecisionComplexKernel<MT3,MT4,MT5>. Computes C -= A * B via
// cblas_zgemm.
//   C  target matrix; kept and subtracted from, since beta is (1,0) and alpha is (-1,0)
//   A  left-hand side operand
//   B  right-hand side operand
1954 template<
typename MT3
1957 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1958 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1960 using boost::numeric_cast;
// CBLAS takes dimensions and leading dimensions as int; numeric_cast range-checks each
// conversion instead of truncating silently.
1969 const int M ( numeric_cast<int>( A.rows() ) );
1970 const int N ( numeric_cast<int>( B.columns() ) );
1971 const int K ( numeric_cast<int>( A.columns() ) );
1972 const int lda( numeric_cast<int>( A.spacing() ) );
1973 const int ldb( numeric_cast<int>( B.spacing() ) );
1974 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm call receives its scaling factors by address, so they need named
// objects: alpha = (-1,0), beta = (1,0), i.e. C = C - A * B.
1975 const complex<double> alpha( -1.0, 0.0 );
1976 const complex<double> beta ( 1.0, 0.0 );
// The storage order passed to CBLAS follows the target C. For a row-major C, A is passed
// untransposed and B transposed; for a column-major C the two flags are swapped.
1978 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1979 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1980 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1981 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2027 template<
typename MT1
2031 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
2032 ,
private Expression
2033 ,
private Computation
2037 typedef DMatTDMatMultExpr<MT1,MT2> MMM;
2038 typedef typename MMM::ResultType RES;
2039 typedef typename MT1::ResultType
RT1;
2040 typedef typename MT2::ResultType
RT2;
2041 typedef typename MT1::CompositeType
CT1;
2042 typedef typename MT2::CompositeType
CT2;
// Compile-time trait for the scaled product: 'value' is nonzero only when IsFloat holds
// for the element types of T1, T2 and T3 and the fourth type T4 is not a complex type.
// NOTE(review): T4 is presumably the scalar type of the scaled expression - confirm at
// the use sites.
2050 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2051 struct UseSinglePrecisionKernel {
2052 enum { value = IsFloat<typename T1::ElementType>::value &&
2053 IsFloat<typename T2::ElementType>::value &&
2054 IsFloat<typename T3::ElementType>::value &&
2055 !IsComplex<T4>::value };
// Compile-time trait for the scaled product: 'value' is nonzero only when IsDouble holds
// for the element types of T1, T2 and T3 and the fourth type T4 is not a complex type.
// NOTE(review): T4 is presumably the scalar type of the scaled expression - confirm at
// the use sites.
2064 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2065 struct UseDoublePrecisionKernel {
2066 enum { value = IsDouble<typename T1::ElementType>::value &&
2067 IsDouble<typename T2::ElementType>::value &&
2068 IsDouble<typename T3::ElementType>::value &&
2069 !IsComplex<T4>::value };
// Compile-time trait for the scaled product: 'value' is nonzero only when the element
// types of T1, T2 and T3 are all exactly complex<float>. Unlike the real-valued traits of
// this class it takes no fourth type parameter.
2078 template<
typename T1,
typename T2,
typename T3 >
2079 struct UseSinglePrecisionComplexKernel {
// Element type that all three operand types must share for the trait to match.
2080 typedef complex<float> Type;
2081 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2082 IsSame<typename T2::ElementType,Type>::value &&
2083 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait for the scaled product: 'value' is nonzero only when the element
// types of T1, T2 and T3 are all exactly complex<double>. Unlike the real-valued traits
// of this class it takes no fourth type parameter.
2092 template<
typename T1,
typename T2,
typename T3 >
2093 struct UseDoublePrecisionComplexKernel {
// Element type that all three operand types must share for the trait to match.
2094 typedef complex<double> Type;
2095 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2096 IsSame<typename T2::ElementType,Type>::value &&
2097 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait that selects the non-BLAS fallback for the scaled product: 'value'
// is nonzero when BLAZE_BLAS_MODE evaluates to false, or when none of the four BLAS
// kernel traits matches. T4 is forwarded only to the two real-valued traits; the complex
// traits are evaluated on T1, T2 and T3 alone.
2105 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2106 struct UseDefaultKernel {
2107 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2108 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2109 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2110 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time trait that selects the intrinsic-based default kernels of the scaled
// product. 'value' is nonzero only when all of the following hold:
//  - T1, T2 and T3 each report 'vectorizable',
//  - all three share the same ElementType,
//  - T4 is exactly that element type,
//  - IntrinsicTrait for that element type reports both 'addition' and 'multiplication'.
2118 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2119 struct UseVectorizedDefaultKernel {
2120 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2121 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2122 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2123 IsSame<typename T1::ElementType,T4>::value &&
2124 IntrinsicTrait<typename T1::ElementType>::addition &&
2125 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member type aliases of this DMatScalarMultExpr specialization (the enclosing
// class header is not part of this excerpt).
// This         : the expression type itself.
// ResultType   : result of multiplying RES by the scalar type ST (via MultTrait).
// OppositeType : ResultType::OppositeType.
// ElementType / IntrinsicType : element type of the result and the matching
//                IntrinsicTrait type.
2131 typedef DMatScalarMultExpr<MMM,ST,false>
This;
2132 typedef typename MultTrait<RES,ST>::Type
ResultType;
2133 typedef typename ResultType::OppositeType
OppositeType;
2135 typedef typename ResultType::ElementType
ElementType;
2136 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The wrapped matrix/matrix product expression is the left operand.
2141 typedef const DMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// LT: `const RT1` if MT1 is itself a computation, otherwise CT1.
2147 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
// RT: `const RT2` if MT2 is itself a computation, otherwise CT2.
2150 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// This expression never advertises itself as vectorizable.
2155 enum { vectorizable = 0 };
// Aliasing capability is inherited from the wrapped product expression.
2158 enum { canAlias = CanAlias<MMM>::value };
// Explicit constructor taking the wrapped matrix product and the scalar factor
// (initializer list and body are elided in this excerpt).
2167 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access (function header elided in this excerpt): element (i,j) of the
// wrapped expression multiplied by the stored scalar.
2183 return matrix_(i,j) * scalar_;
// Returns the number of rows of the wrapped matrix expression.
2192 inline size_t rows()
const {
2193 return matrix_.rows();
// Returns the number of columns of the wrapped matrix expression.
2202 inline size_t columns()
const {
2203 return matrix_.columns();
// Reports whether `alias` is aliased with this expression. Short-circuits to
// false when CanAlias<MMM>::value is zero; otherwise defers to the wrapped
// expression's own isAliased().
2233 template<
typename T >
2234 inline bool isAliased(
const T* alias )
const {
2235 return CanAlias<MMM>::value && matrix_.isAliased( alias );
// Assignment of the scaled matrix product `rhs` to the dense matrix `lhs`.
// Extracts both operands of the wrapped product, branches on an empty target
// and on a zero inner dimension (branch bodies are elided in this excerpt), and
// otherwise dispatches to either the default or the BLAS assign kernel with the
// stored scalar. The condition choosing between the two kernels is not visible
// here.
2254 template<
typename MT3
2256 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2261 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2262 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2264 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2267 else if( left.columns() == 0UL ) {
2283 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2285 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default assign kernel; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// For each row i of A the result row is seeded with the j == 0 term, the terms
// j >= 1 are accumulated, and a final pass over k follows (its body is elided
// in this excerpt; presumably it applies `scalar` -- confirm in the full file).
// NOTE(review): the seeding step reads A(i,0UL) and B(0UL,k) unconditionally;
// presumably the caller's `left.columns() == 0UL` branch guarantees a non-empty
// inner dimension -- confirm.
2303 template<
typename MT3
2307 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2308 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2310 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Seed row i of C with the first term of the inner product.
2311 for(
size_t k=0UL; k<B.columns(); ++k ) {
2312 C(i,k) = A(i,0UL) * B(0UL,k);
// Accumulate the remaining inner-dimension terms.
2314 for(
size_t j=1UL; j<A.columns(); ++j ) {
2315 for(
size_t k=0UL; k<B.columns(); ++k ) {
2316 C(i,k) += A(i,j) * B(j,k);
// Final per-element pass over row i (body elided in this excerpt).
2319 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default assign kernel for a row-major target
// (DenseMatrix<MT3,false>); enabled when UseVectorizedDefaultKernel holds.
// The result is computed in small register blocks: each target element has its
// own intrinsic accumulator, products are accumulated over k in steps of
// IT::size, and the accumulator is reduced with sum() and multiplied by
// `scalar` before being stored with `=`.
// Operand loads, declarations of i/j and the guards of the remainder cases are
// elided in this excerpt.
// NOTE(review): the k loops run to k<K in full IT::size steps and the
// accumulators are declared without an explicit initializer -- presumably the
// operands are padded and IntrinsicType default-initializes to zero; confirm.
2340 template<
typename MT3
2344 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2345 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2347 typedef IntrinsicTrait<ElementType> IT;
2349 const size_t M( A.rows() );
2350 const size_t N( B.columns() );
2351 const size_t K( A.columns() );
// Two rows of C per outer iteration.
2355 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 block of C: eight accumulators.
2357 for( ; (j+4UL) <= N; j+=4UL ) {
2358 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2359 for(
size_t k=0UL; k<K; k+=IT::size ) {
2366 xmm1 = xmm1 + a1 * b1;
2367 xmm2 = xmm2 + a1 * b2;
2368 xmm3 = xmm3 + a1 * b3;
2369 xmm4 = xmm4 + a1 * b4;
2370 xmm5 = xmm5 + a2 * b1;
2371 xmm6 = xmm6 + a2 * b2;
2372 xmm7 = xmm7 + a2 * b3;
2373 xmm8 = xmm8 + a2 * b4;
2375 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2376 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2377 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
2378 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
2379 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
2380 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
2381 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
2382 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// 2x2 block of C.
2384 for( ; (j+2UL) <= N; j+=2UL ) {
2386 for(
size_t k=0UL; k<K; k+=IT::size ) {
2391 xmm1 = xmm1 + a1 * b1;
2392 xmm2 = xmm2 + a1 * b2;
2393 xmm3 = xmm3 + a2 * b1;
2394 xmm4 = xmm4 + a2 * b2;
2396 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2397 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2398 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2399 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2x1 block of C (its guard is elided in this excerpt).
2403 for(
size_t k=0UL; k<K; k+=IT::size ) {
2405 xmm1 = xmm1 + A.get(i ,k) * b1;
2406 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2408 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2409 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single-row blocks follow (enclosing guard elided): 1x4 block of C.
2414 for( ; (j+4UL) <= N; j+=4UL ) {
2416 for(
size_t k=0UL; k<K; k+=IT::size ) {
2418 xmm1 = xmm1 + a1 * B.get(k,j );
2419 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2420 xmm3 = xmm3 + a1 * B.get(k,j+2UL);
2421 xmm4 = xmm4 + a1 * B.get(k,j+3UL);
2423 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2424 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
2425 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
2426 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
// 1x2 block of C.
2428 for( ; (j+2UL) <= N; j+=2UL ) {
2430 for(
size_t k=0UL; k<K; k+=IT::size ) {
2432 xmm1 = xmm1 + a1 * B.get(k,j );
2433 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2435 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2436 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single element of C (its guard is elided in this excerpt).
2440 for(
size_t k=0UL; k<K; k+=IT::size ) {
2441 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2443 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Vectorized default assign kernel for a column-major target
// (DenseMatrix<MT3,true>); enabled when UseVectorizedDefaultKernel holds.
// Same scheme as the row-major variant, but blocked the other way round: four
// rows by two columns first, then narrower blocks. Each target element gets one
// intrinsic accumulator that is reduced with sum(), scaled by `scalar` and
// stored with `=`.
// Operand loads, declarations of i/j and the guards of the remainder cases are
// elided in this excerpt.
// NOTE(review): presumably relies on padded operands and zero-initialized
// IntrinsicType accumulators, as the k loops step by IT::size up to K -- confirm.
2463 template<
typename MT3
2467 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2468 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2470 typedef IntrinsicTrait<ElementType> IT;
2472 const size_t M( A.rows() );
2473 const size_t N( B.columns() );
2474 const size_t K( A.columns() );
// Four rows of C per outer iteration.
2478 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 block of C: eight accumulators.
2480 for( ; (j+2UL) <= N; j+=2UL ) {
2481 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2482 for(
size_t k=0UL; k<K; k+=IT::size ) {
2489 xmm1 = xmm1 + a1 * b1;
2490 xmm2 = xmm2 + a1 * b2;
2491 xmm3 = xmm3 + a2 * b1;
2492 xmm4 = xmm4 + a2 * b2;
2493 xmm5 = xmm5 + a3 * b1;
2494 xmm6 = xmm6 + a3 * b2;
2495 xmm7 = xmm7 + a4 * b1;
2496 xmm8 = xmm8 + a4 * b2;
2498 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2499 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2500 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2501 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
2502 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
2503 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
2504 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
2505 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// 4x1 block of C (its guard is elided in this excerpt).
2509 for(
size_t k=0UL; k<K; k+=IT::size ) {
2511 xmm1 = xmm1 + A.get(i ,k) * b1;
2512 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2513 xmm3 = xmm3 + A.get(i+2UL,k) * b1;
2514 xmm4 = xmm4 + A.get(i+3UL,k) * b1;
2516 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2517 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
2518 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
2519 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
// Two rows of C per iteration for what the 4-row loop left over.
2522 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 block of C.
2524 for( ; (j+2UL) <= N; j+=2UL ) {
2526 for(
size_t k=0UL; k<K; k+=IT::size ) {
2531 xmm1 = xmm1 + a1 * b1;
2532 xmm2 = xmm2 + a1 * b2;
2533 xmm3 = xmm3 + a2 * b1;
2534 xmm4 = xmm4 + a2 * b2;
2536 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2537 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2538 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2539 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2x1 block of C (its guard is elided in this excerpt).
2543 for(
size_t k=0UL; k<K; k+=IT::size ) {
2545 xmm1 = xmm1 + A.get(i ,k) * b1;
2546 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2548 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2549 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single-row blocks follow (enclosing guard elided): 1x2 block of C.
2554 for( ; (j+2UL) <= N; j+=2UL ) {
2556 for(
size_t k=0UL; k<K; k+=IT::size ) {
2558 xmm1 = xmm1 + a1 * B.get(k,j );
2559 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2561 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2562 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single element of C (its guard is elided in this excerpt).
2566 for(
size_t k=0UL; k<K; k+=IT::size ) {
2567 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2569 (~C)(i,j) =
sum( xmm1 ) * scalar;
// BLAS-dispatch fallback: when UseDefaultKernel holds for the given types, the
// BLAS entry point simply forwards to selectDefaultAssignKernel.
2589 template<
typename MT3
2593 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2594 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2596 selectDefaultAssignKernel( C, A, B, scalar );
// Single-precision BLAS assign kernel (enabled by UseSinglePrecisionKernel).
// Dimensions and spacings are converted to int via boost::numeric_cast and the
// product is delegated to cblas_sgemm with alpha = scalar and beta = 0.0F, so
// the previous contents of C do not contribute to the result.
// The CBLAS order follows the storage order of C; A is passed untransposed and
// B transposed for a row-major C, and the other way round for a column-major C.
2615 template<
typename MT3
2619 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2620 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2622 using boost::numeric_cast;
2628 const int M ( numeric_cast<int>( A.rows() ) );
2629 const int N ( numeric_cast<int>( B.columns() ) );
2630 const int K ( numeric_cast<int>( A.columns() ) );
2631 const int lda( numeric_cast<int>( A.spacing() ) );
2632 const int ldb( numeric_cast<int>( B.spacing() ) );
2633 const int ldc( numeric_cast<int>( C.spacing() ) );
2635 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2636 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2637 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2638 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// Double-precision BLAS assign kernel (enabled by UseDoublePrecisionKernel).
// Dimensions and spacings are converted to int via boost::numeric_cast and the
// product is delegated to cblas_dgemm with alpha = scalar and beta = 0.0, so
// the previous contents of C do not contribute to the result.
// The CBLAS order follows the storage order of C; A is passed untransposed and
// B transposed for a row-major C, and the other way round for a column-major C.
2658 template<
typename MT3
2662 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2663 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2665 using boost::numeric_cast;
2671 const int M ( numeric_cast<int>( A.rows() ) );
2672 const int N ( numeric_cast<int>( B.columns() ) );
2673 const int K ( numeric_cast<int>( A.columns() ) );
2674 const int lda( numeric_cast<int>( A.spacing() ) );
2675 const int ldb( numeric_cast<int>( B.spacing() ) );
2676 const int ldc( numeric_cast<int>( C.spacing() ) );
2678 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2679 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2680 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2681 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// Single-precision complex BLAS assign kernel (enabled by
// UseSinglePrecisionComplexKernel). Dimensions and spacings are converted to
// int via boost::numeric_cast. alpha is built from `scalar` as complex<float>,
// beta is (0,0), and both are handed to cblas_cgemm by address, so the previous
// contents of C do not contribute to the result.
// Order/transpose flags depend on the storage order of C exactly as in the
// real-valued kernels.
2701 template<
typename MT3
2705 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2706 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2708 using boost::numeric_cast;
2718 const int M ( numeric_cast<int>( A.rows() ) );
2719 const int N ( numeric_cast<int>( B.columns() ) );
2720 const int K ( numeric_cast<int>( A.columns() ) );
2721 const int lda( numeric_cast<int>( A.spacing() ) );
2722 const int ldb( numeric_cast<int>( B.spacing() ) );
2723 const int ldc( numeric_cast<int>( C.spacing() ) );
2724 const complex<float> alpha( scalar );
2725 const complex<float> beta ( 0.0F, 0.0F );
2727 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2728 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2729 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2730 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS assign kernel (enabled by
// UseDoublePrecisionComplexKernel). Dimensions and spacings are converted to
// int via boost::numeric_cast. alpha is built from `scalar` as complex<double>,
// beta is (0,0), and both are handed to cblas_zgemm by address, so the previous
// contents of C do not contribute to the result.
// Order/transpose flags depend on the storage order of C exactly as in the
// real-valued kernels.
2750 template<
typename MT3
2754 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2755 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2757 using boost::numeric_cast;
2767 const int M ( numeric_cast<int>( A.rows() ) );
2768 const int N ( numeric_cast<int>( B.columns() ) );
2769 const int K ( numeric_cast<int>( A.columns() ) );
2770 const int lda( numeric_cast<int>( A.spacing() ) );
2771 const int ldb( numeric_cast<int>( B.spacing() ) );
2772 const int ldc( numeric_cast<int>( C.spacing() ) );
2773 const complex<double> alpha( scalar );
2774 const complex<double> beta ( 0.0, 0.0 );
2776 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2777 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2778 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2779 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Overload whose signature is elided in this excerpt (presumably the assignment
// to a non-dense target -- confirm in the full file). It evaluates `rhs` into a
// temporary whose type is OppositeType when SO is true and ResultType otherwise.
2796 template<
typename MT
2800 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2812 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product `rhs` to the dense matrix
// `lhs`. Extracts both operands of the wrapped product, has a single branch
// for an empty target or a zero inner dimension (branch body elided in this
// excerpt), and otherwise dispatches to either the default or the BLAS
// add-assign kernel with the stored scalar. The condition choosing between the
// two kernels is not visible here.
2829 template<
typename MT3
2831 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2836 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2837 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2839 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
2854 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2856 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default add-assign kernel; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// Only the signature is visible in this excerpt; the body is elided.
2874 template<
typename MT3
2878 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2879 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default add-assign kernel for a row-major target
// (DenseMatrix<MT3,false>); enabled when UseVectorizedDefaultKernel holds.
// Identical blocking to the plain assign kernel: each target element has its
// own intrinsic accumulator over k (step IT::size); the accumulator is reduced
// with sum(), scaled by `scalar` and ADDED to the existing element (`+=`).
// Operand loads, declarations of i/j and the guards of the remainder cases are
// elided in this excerpt.
// NOTE(review): presumably relies on padded operands and zero-initialized
// IntrinsicType accumulators -- confirm.
2900 template<
typename MT3
2904 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2905 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2907 typedef IntrinsicTrait<ElementType> IT;
2909 const size_t M( A.rows() );
2910 const size_t N( B.columns() );
2911 const size_t K( A.columns() );
// Two rows of C per outer iteration.
2915 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 block of C: eight accumulators.
2917 for( ; (j+4UL) <= N; j+=4UL ) {
2918 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2919 for(
size_t k=0UL; k<K; k+=IT::size ) {
2926 xmm1 = xmm1 + a1 * b1;
2927 xmm2 = xmm2 + a1 * b2;
2928 xmm3 = xmm3 + a1 * b3;
2929 xmm4 = xmm4 + a1 * b4;
2930 xmm5 = xmm5 + a2 * b1;
2931 xmm6 = xmm6 + a2 * b2;
2932 xmm7 = xmm7 + a2 * b3;
2933 xmm8 = xmm8 + a2 * b4;
2935 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
2936 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
2937 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
2938 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
2939 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
2940 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
2941 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
2942 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// 2x2 block of C.
2944 for( ; (j+2UL) <= N; j+=2UL ) {
2946 for(
size_t k=0UL; k<K; k+=IT::size ) {
2951 xmm1 = xmm1 + a1 * b1;
2952 xmm2 = xmm2 + a1 * b2;
2953 xmm3 = xmm3 + a2 * b1;
2954 xmm4 = xmm4 + a2 * b2;
2956 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
2957 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
2958 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
2959 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// 2x1 block of C (its guard is elided in this excerpt).
2963 for(
size_t k=0UL; k<K; k+=IT::size ) {
2965 xmm1 = xmm1 + A.get(i ,k) * b1;
2966 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2968 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
2969 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Single-row blocks follow (enclosing guard elided): 1x4 block of C.
2974 for( ; (j+4UL) <= N; j+=4UL ) {
2976 for(
size_t k=0UL; k<K; k+=IT::size ) {
2978 xmm1 = xmm1 + a1 * B.get(k,j );
2979 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2980 xmm3 = xmm3 + a1 * B.get(k,j+2UL);
2981 xmm4 = xmm4 + a1 * B.get(k,j+3UL);
2983 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
2984 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
2985 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
2986 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
// 1x2 block of C.
2988 for( ; (j+2UL) <= N; j+=2UL ) {
2990 for(
size_t k=0UL; k<K; k+=IT::size ) {
2992 xmm1 = xmm1 + a1 * B.get(k,j );
2993 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2995 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
2996 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Single element of C (its guard is elided in this excerpt).
3000 for(
size_t k=0UL; k<K; k+=IT::size ) {
3001 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3003 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Vectorized default add-assign kernel for a column-major target
// (DenseMatrix<MT3,true>); enabled when UseVectorizedDefaultKernel holds.
// Blocks of four rows by two columns first, then narrower blocks. Each target
// element gets one intrinsic accumulator over k (step IT::size) that is reduced
// with sum(), scaled by `scalar` and ADDED to the existing element (`+=`).
// Operand loads, declarations of i/j and the guards of the remainder cases are
// elided in this excerpt.
// NOTE(review): presumably relies on padded operands and zero-initialized
// IntrinsicType accumulators -- confirm.
3023 template<
typename MT3
3027 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3028 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3030 typedef IntrinsicTrait<ElementType> IT;
3032 const size_t M( A.rows() );
3033 const size_t N( B.columns() );
3034 const size_t K( A.columns() );
// Four rows of C per outer iteration.
3038 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 block of C: eight accumulators.
3040 for( ; (j+2UL) <= N; j+=2UL ) {
3041 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3042 for(
size_t k=0UL; k<K; k+=IT::size ) {
3049 xmm1 = xmm1 + a1 * b1;
3050 xmm2 = xmm2 + a1 * b2;
3051 xmm3 = xmm3 + a2 * b1;
3052 xmm4 = xmm4 + a2 * b2;
3053 xmm5 = xmm5 + a3 * b1;
3054 xmm6 = xmm6 + a3 * b2;
3055 xmm7 = xmm7 + a4 * b1;
3056 xmm8 = xmm8 + a4 * b2;
3058 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3059 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3060 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3061 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
3062 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
3063 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
3064 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
3065 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// 4x1 block of C (its guard is elided in this excerpt).
3069 for(
size_t k=0UL; k<K; k+=IT::size ) {
3071 xmm1 = xmm1 + A.get(i ,k) * b1;
3072 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3073 xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3074 xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3076 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3077 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
3078 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
3079 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
// Two rows of C per iteration for what the 4-row loop left over.
3082 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 block of C.
3084 for( ; (j+2UL) <= N; j+=2UL ) {
3086 for(
size_t k=0UL; k<K; k+=IT::size ) {
3091 xmm1 = xmm1 + a1 * b1;
3092 xmm2 = xmm2 + a1 * b2;
3093 xmm3 = xmm3 + a2 * b1;
3094 xmm4 = xmm4 + a2 * b2;
3096 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3097 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3098 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3099 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// 2x1 block of C (its guard is elided in this excerpt).
3103 for(
size_t k=0UL; k<K; k+=IT::size ) {
3105 xmm1 = xmm1 + A.get(i ,k) * b1;
3106 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3108 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3109 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Single-row blocks follow (enclosing guard elided): 1x2 block of C.
3114 for( ; (j+2UL) <= N; j+=2UL ) {
3116 for(
size_t k=0UL; k<K; k+=IT::size ) {
3118 xmm1 = xmm1 + a1 * B.get(k,j );
3119 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3121 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3122 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Single element of C (its guard is elided in this excerpt).
3126 for(
size_t k=0UL; k<K; k+=IT::size ) {
3127 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3129 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// BLAS-dispatch fallback for add-assign: when UseDefaultKernel holds for the
// given types, simply forwards to selectDefaultAddAssignKernel.
3149 template<
typename MT3
3153 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3154 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3156 selectDefaultAddAssignKernel( C, A, B, scalar );
// Single-precision BLAS add-assign kernel (enabled by UseSinglePrecisionKernel).
// Dimensions and spacings are converted to int via boost::numeric_cast and the
// product is delegated to cblas_sgemm with alpha = scalar and beta = 1.0F, so
// the scaled product is accumulated onto the existing contents of C.
// Order/transpose flags depend on the storage order of C: A untransposed and B
// transposed for a row-major C, the reverse for a column-major C.
3175 template<
typename MT3
3179 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3180 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3182 using boost::numeric_cast;
3188 const int M ( numeric_cast<int>( A.rows() ) );
3189 const int N ( numeric_cast<int>( B.columns() ) );
3190 const int K ( numeric_cast<int>( A.columns() ) );
3191 const int lda( numeric_cast<int>( A.spacing() ) );
3192 const int ldb( numeric_cast<int>( B.spacing() ) );
3193 const int ldc( numeric_cast<int>( C.spacing() ) );
3195 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3196 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3197 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3198 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double-precision BLAS add-assign kernel (enabled by UseDoublePrecisionKernel).
// Dimensions and spacings are converted to int via boost::numeric_cast and the
// product is delegated to cblas_dgemm with alpha = scalar and beta = 1.0, so
// the scaled product is accumulated onto the existing contents of C.
// Order/transpose flags depend on the storage order of C: A untransposed and B
// transposed for a row-major C, the reverse for a column-major C.
3218 template<
typename MT3
3222 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3223 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3225 using boost::numeric_cast;
3231 const int M ( numeric_cast<int>( A.rows() ) );
3232 const int N ( numeric_cast<int>( B.columns() ) );
3233 const int K ( numeric_cast<int>( A.columns() ) );
3234 const int lda( numeric_cast<int>( A.spacing() ) );
3235 const int ldb( numeric_cast<int>( B.spacing() ) );
3236 const int ldc( numeric_cast<int>( C.spacing() ) );
3238 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3239 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3240 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3241 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single-precision complex BLAS add-assign kernel (enabled by
// UseSinglePrecisionComplexKernel). Dimensions and spacings are converted to
// int via boost::numeric_cast. alpha is built from `scalar` as complex<float>,
// beta is (1,0), and both are handed to cblas_cgemm by address, so the scaled
// product is accumulated onto the existing contents of C.
// Order/transpose flags depend on the storage order of C exactly as in the
// real-valued kernels.
3261 template<
typename MT3
3265 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3266 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3268 using boost::numeric_cast;
3278 const int M ( numeric_cast<int>( A.rows() ) );
3279 const int N ( numeric_cast<int>( B.columns() ) );
3280 const int K ( numeric_cast<int>( A.columns() ) );
3281 const int lda( numeric_cast<int>( A.spacing() ) );
3282 const int ldb( numeric_cast<int>( B.spacing() ) );
3283 const int ldc( numeric_cast<int>( C.spacing() ) );
3284 const complex<float> alpha( scalar );
3285 const complex<float> beta ( 1.0F, 0.0F );
3287 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3288 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3289 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3290 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS add-assign kernel (enabled by
// UseDoublePrecisionComplexKernel). Dimensions and spacings are converted to
// int via boost::numeric_cast. alpha is built from `scalar` as complex<double>,
// beta is (1,0), and both are handed to cblas_zgemm by address, so the scaled
// product is accumulated onto the existing contents of C.
// Order/transpose flags depend on the storage order of C exactly as in the
// real-valued kernels.
3310 template<
typename MT3
3314 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3315 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3317 using boost::numeric_cast;
3327 const int M ( numeric_cast<int>( A.rows() ) );
3328 const int N ( numeric_cast<int>( B.columns() ) );
3329 const int K ( numeric_cast<int>( A.columns() ) );
3330 const int lda( numeric_cast<int>( A.spacing() ) );
3331 const int ldb( numeric_cast<int>( B.spacing() ) );
3332 const int ldc( numeric_cast<int>( C.spacing() ) );
3333 const complex<double> alpha( scalar );
3334 const complex<double> beta ( 1.0, 0.0 );
3336 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3337 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3338 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3339 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled matrix product `rhs` to the dense matrix
// `lhs`. Extracts both operands of the wrapped product, has a single branch
// for an empty target or a zero inner dimension (branch body elided in this
// excerpt), and otherwise dispatches to either the default or the BLAS
// sub-assign kernel with the stored scalar. The condition choosing between the
// two kernels is not visible here.
3360 template<
typename MT3
3362 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
3367 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3368 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3370 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
3385 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3387 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default sub-assign kernel; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// Only the signature is visible in this excerpt; the body is elided.
3405 template<
typename MT3
3409 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3410 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default sub-assign kernel for a row-major target
// (DenseMatrix<MT3,false>); enabled when UseVectorizedDefaultKernel holds.
// Identical blocking to the plain assign kernel: each target element has its
// own intrinsic accumulator over k (step IT::size); the accumulator is reduced
// with sum(), scaled by `scalar` and SUBTRACTED from the existing element (`-=`).
// Operand loads, declarations of i/j and the guards of the remainder cases are
// elided in this excerpt.
// NOTE(review): presumably relies on padded operands and zero-initialized
// IntrinsicType accumulators -- confirm.
3431 template<
typename MT3
3435 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3436 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3438 typedef IntrinsicTrait<ElementType> IT;
3440 const size_t M( A.rows() );
3441 const size_t N( B.columns() );
3442 const size_t K( A.columns() );
// Two rows of C per outer iteration.
3446 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 block of C: eight accumulators.
3448 for( ; (j+4UL) <= N; j+=4UL ) {
3449 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3450 for(
size_t k=0UL; k<K; k+=IT::size ) {
3457 xmm1 = xmm1 + a1 * b1;
3458 xmm2 = xmm2 + a1 * b2;
3459 xmm3 = xmm3 + a1 * b3;
3460 xmm4 = xmm4 + a1 * b4;
3461 xmm5 = xmm5 + a2 * b1;
3462 xmm6 = xmm6 + a2 * b2;
3463 xmm7 = xmm7 + a2 * b3;
3464 xmm8 = xmm8 + a2 * b4;
3466 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3467 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3468 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
3469 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
3470 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
3471 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
3472 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
3473 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// 2x2 block of C.
3475 for( ; (j+2UL) <= N; j+=2UL ) {
3477 for(
size_t k=0UL; k<K; k+=IT::size ) {
3482 xmm1 = xmm1 + a1 * b1;
3483 xmm2 = xmm2 + a1 * b2;
3484 xmm3 = xmm3 + a2 * b1;
3485 xmm4 = xmm4 + a2 * b2;
3487 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3488 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3489 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
3490 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// 2x1 block of C (its guard is elided in this excerpt).
3494 for(
size_t k=0UL; k<K; k+=IT::size ) {
3496 xmm1 = xmm1 + A.get(i ,k) * b1;
3497 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3499 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
3500 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Single-row blocks follow (enclosing guard elided): 1x4 block of C.
3505 for( ; (j+4UL) <= N; j+=4UL ) {
3507 for(
size_t k=0UL; k<K; k+=IT::size ) {
3509 xmm1 = xmm1 + a1 * B.get(k,j );
3510 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3511 xmm3 = xmm3 + a1 * B.get(k,j+2UL);
3512 xmm4 = xmm4 + a1 * B.get(k,j+3UL);
3514 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
3515 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
3516 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
3517 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
// 1x2 block of C.
3519 for( ; (j+2UL) <= N; j+=2UL ) {
3521 for(
size_t k=0UL; k<K; k+=IT::size ) {
3523 xmm1 = xmm1 + a1 * B.get(k,j );
3524 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3526 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
3527 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Single element of C (its guard is elided in this excerpt).
3531 for(
size_t k=0UL; k<K; k+=IT::size ) {
3532 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3534 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Vectorized default sub-assign kernel for a column-major target
// (DenseMatrix<MT3,true>); enabled when UseVectorizedDefaultKernel holds.
// Blocks of four rows by two columns first, then narrower blocks. Each target
// element gets one intrinsic accumulator over k (step IT::size) that is reduced
// with sum(), scaled by `scalar` and SUBTRACTED from the existing element (`-=`).
// Operand loads, declarations of i/j and the guards of the remainder cases are
// elided in this excerpt.
// NOTE(review): presumably relies on padded operands and zero-initialized
// IntrinsicType accumulators -- confirm.
3554 template<
typename MT3
3558 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3559 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3561 typedef IntrinsicTrait<ElementType> IT;
3563 const size_t M( A.rows() );
3564 const size_t N( B.columns() );
3565 const size_t K( A.columns() );
// Four rows of C per outer iteration.
3569 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 block of C: eight accumulators.
3571 for( ; (j+2UL) <= N; j+=2UL ) {
3572 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3573 for(
size_t k=0UL; k<K; k+=IT::size ) {
3580 xmm1 = xmm1 + a1 * b1;
3581 xmm2 = xmm2 + a1 * b2;
3582 xmm3 = xmm3 + a2 * b1;
3583 xmm4 = xmm4 + a2 * b2;
3584 xmm5 = xmm5 + a3 * b1;
3585 xmm6 = xmm6 + a3 * b2;
3586 xmm7 = xmm7 + a4 * b1;
3587 xmm8 = xmm8 + a4 * b2;
3589 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3590 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3591 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
3592 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
3593 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
3594 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
3595 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
3596 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// 4x1 block of C (its guard is elided in this excerpt).
3600 for(
size_t k=0UL; k<K; k+=IT::size ) {
3602 xmm1 = xmm1 + A.get(i ,k) * b1;
3603 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3604 xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3605 xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3607 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
3608 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
3609 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
3610 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
// Two rows of C per iteration for what the 4-row loop left over.
3613 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 block of C.
3615 for( ; (j+2UL) <= N; j+=2UL ) {
3617 for(
size_t k=0UL; k<K; k+=IT::size ) {
3622 xmm1 = xmm1 + a1 * b1;
3623 xmm2 = xmm2 + a1 * b2;
3624 xmm3 = xmm3 + a2 * b1;
3625 xmm4 = xmm4 + a2 * b2;
3627 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3628 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3629 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
3630 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// 2x1 block of C (its guard is elided in this excerpt).
3634 for(
size_t k=0UL; k<K; k+=IT::size ) {
3636 xmm1 = xmm1 + A.get(i ,k) * b1;
3637 xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3639 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
3640 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Single-row blocks follow (enclosing guard elided): 1x2 block of C.
3645 for( ; (j+2UL) <= N; j+=2UL ) {
3647 for(
size_t k=0UL; k<K; k+=IT::size ) {
3649 xmm1 = xmm1 + a1 * B.get(k,j );
3650 xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3652 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
3653 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Single element of C (its guard is elided in this excerpt).
3657 for(
size_t k=0UL; k<K; k+=IT::size ) {
3658 xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3660 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// BLAS-dispatch fallback for sub-assign: when UseDefaultKernel holds for the
// given types, simply forwards to selectDefaultSubAssignKernel.
3680 template<
typename MT3
3684 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3685 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3687 selectDefaultSubAssignKernel( C, A, B, scalar );
// Single-precision BLAS sub-assign kernel (enabled by UseSinglePrecisionKernel).
// Dimensions and spacings are converted to int via boost::numeric_cast and the
// product is delegated to cblas_sgemm with alpha = -scalar and beta = 1.0F, so
// the scaled product is subtracted from the existing contents of C.
// Order/transpose flags depend on the storage order of C: A untransposed and B
// transposed for a row-major C, the reverse for a column-major C.
3706 template<
typename MT3
3710 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3711 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3713 using boost::numeric_cast;
3719 const int M ( numeric_cast<int>( A.rows() ) );
3720 const int N ( numeric_cast<int>( B.columns() ) );
3721 const int K ( numeric_cast<int>( A.columns() ) );
3722 const int lda( numeric_cast<int>( A.spacing() ) );
3723 const int ldb( numeric_cast<int>( B.spacing() ) );
3724 const int ldc( numeric_cast<int>( C.spacing() ) );
3726 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3727 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3728 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3729 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double-precision BLAS sub-assign kernel (enabled by UseDoublePrecisionKernel).
// Dimensions and spacings are converted to int via boost::numeric_cast and the
// product is delegated to cblas_dgemm with alpha = -scalar and beta = 1.0, so
// the scaled product is subtracted from the existing contents of C.
// Order/transpose flags depend on the storage order of C: A untransposed and B
// transposed for a row-major C, the reverse for a column-major C.
3749 template<
typename MT3
3753 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3754 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3756 using boost::numeric_cast;
3762 const int M ( numeric_cast<int>( A.rows() ) );
3763 const int N ( numeric_cast<int>( B.columns() ) );
3764 const int K ( numeric_cast<int>( A.columns() ) );
3765 const int lda( numeric_cast<int>( A.spacing() ) );
3766 const int ldb( numeric_cast<int>( B.spacing() ) );
3767 const int ldc( numeric_cast<int>( C.spacing() ) );
3769 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3770 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3771 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3772 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single-precision complex BLAS sub-assign kernel (enabled by
// UseSinglePrecisionComplexKernel). Dimensions and spacings are converted to
// int via boost::numeric_cast. alpha is built from `-scalar` as complex<float>,
// beta is (1,0), and both are handed to cblas_cgemm by address, so the scaled
// product is subtracted from the existing contents of C.
// Order/transpose flags depend on the storage order of C exactly as in the
// real-valued kernels.
3792 template<
typename MT3
3796 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3797 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3799 using boost::numeric_cast;
3809 const int M ( numeric_cast<int>( A.rows() ) );
3810 const int N ( numeric_cast<int>( B.columns() ) );
3811 const int K ( numeric_cast<int>( A.columns() ) );
3812 const int lda( numeric_cast<int>( A.spacing() ) );
3813 const int ldb( numeric_cast<int>( B.spacing() ) );
3814 const int ldc( numeric_cast<int>( C.spacing() ) );
3815 const complex<float> alpha( -scalar );
3816 const complex<float> beta ( 1.0F, 0.0F );
3818 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3819 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3820 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3821 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS sub-assign kernel (enabled by
// UseDoublePrecisionComplexKernel). Dimensions and spacings are converted to
// int via boost::numeric_cast. alpha is built from `-scalar` as
// complex<double>, beta is (1,0), and both are handed to cblas_zgemm by
// address, so the scaled product is subtracted from the existing contents of C.
// Order/transpose flags depend on the storage order of C exactly as in the
// real-valued kernels.
3841 template<
typename MT3
3845 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3846 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3848 using boost::numeric_cast;
3858 const int M ( numeric_cast<int>( A.rows() ) );
3859 const int N ( numeric_cast<int>( B.columns() ) );
3860 const int K ( numeric_cast<int>( A.columns() ) );
3861 const int lda( numeric_cast<int>( A.spacing() ) );
3862 const int ldb( numeric_cast<int>( B.spacing() ) );
3863 const int ldc( numeric_cast<int>( C.spacing() ) );
3864 const complex<double> alpha( -scalar );
3865 const complex<double> beta ( 1.0, 0.0 );
3867 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3868 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3869 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3870 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a DMatTDMatMultExpr<T1,T2> (its name and parameter
// list are elided in this excerpt; presumably the multiplication operator for
// a dense matrix and a transpose dense matrix -- confirm in the full file).
// Throws std::invalid_argument with the message "Matrix sizes do not match";
// the triggering size check is not visible here.
3938 template<
typename T1
3940 inline const DMatTDMatMultExpr<T1,T2>
3944 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression-trait specialization (its struct header is elided in this excerpt).
// `Type` is DMatDVecMultExprTrait< MT1, TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
// when MT1 is a dense row-major matrix, MT2 a dense column-major matrix and VT
// a dense non-transpose vector; otherwise it is INVALID_TYPE.
// Presumably this restructures (A*B)*v into A*(B*v) -- confirm in the full file.
3961 template<
typename MT1,
typename MT2,
typename VT >
3966 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3967 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3968 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3969 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
3970 , INVALID_TYPE >::Type Type;
// Expression-trait specialization (its struct header is elided in this excerpt).
// `Type` is DMatDVecMultExprTrait< MT1, TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
// when MT1 is a dense row-major matrix, MT2 a dense column-major matrix and VT
// a sparse non-transpose vector; otherwise it is INVALID_TYPE.
// Presumably this restructures (A*B)*v into A*(B*v) -- confirm in the full file.
3979 template<
typename MT1,
typename MT2,
typename VT >
3984 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3985 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3986 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3987 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
3988 , INVALID_TYPE >::Type Type;
// Expression-trait specialization (its struct header is elided in this excerpt).
// `Type` is TDVecTDMatMultExprTrait< TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
// when VT is a dense transpose vector, MT1 a dense row-major matrix and MT2 a
// dense column-major matrix; otherwise it is INVALID_TYPE.
// Presumably this restructures v*(A*B) into (v*A)*B -- confirm in the full file.
3997 template<
typename VT,
typename MT1,
typename MT2 >
4002 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4003 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4004 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4005 ,
typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4006 , INVALID_TYPE >::Type Type;
// Expression-trait specialization (its struct header is elided in this excerpt).
// `Type` is TDVecTDMatMultExprTrait< TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
// when VT is a sparse transpose vector, MT1 a dense row-major matrix and MT2 a
// dense column-major matrix; otherwise it is INVALID_TYPE.
// Presumably this restructures v*(A*B) into (v*A)*B -- confirm in the full file.
4015 template<
typename VT,
typename MT1,
typename MT2 >
4020 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4021 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4022 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4023 ,
typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4024 , INVALID_TYPE >::Type Type;