35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
113 template<
typename MT1
// Compile-time switch used to enable the SMP assignment overloads (it appears as
// EnableIf< UseSMPAssign<MT> > further down in this file).
// value is non-zero when either operand flag, evaluateLeft or evaluateRight, is set.
// The template parameter MT does not take part in the visible expression.
145 template<
typename MT >
146 struct UseSMPAssign {
147 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate: true only if the element types of all three matrix
// types T1, T2 and T3 satisfy IsFloat. In this file it gates the kernels that
// call cblas_sgemm.
157 template<
typename T1,
typename T2,
typename T3 >
158 struct UseSinglePrecisionKernel {
159 enum { value = IsFloat<typename T1::ElementType>::value &&
160 IsFloat<typename T2::ElementType>::value &&
161 IsFloat<typename T3::ElementType>::value };
// Compile-time predicate: true only if the element types of all three matrix
// types T1, T2 and T3 satisfy IsDouble. In this file it gates the kernels that
// call cblas_dgemm.
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDoublePrecisionKernel {
173 enum { value = IsDouble<typename T1::ElementType>::value &&
174 IsDouble<typename T2::ElementType>::value &&
175 IsDouble<typename T3::ElementType>::value };
// Compile-time predicate: true only if the element type of each of T1, T2 and T3
// is exactly complex<float> (checked with IsSame, so no conversions are accepted).
// In this file it gates the kernels that call cblas_cgemm.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseSinglePrecisionComplexKernel {
// Element type all three matrices must share.
188 typedef complex<float> Type;
189 enum { value = IsSame<typename T1::ElementType,Type>::value &&
190 IsSame<typename T2::ElementType,Type>::value &&
191 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: true only if the element type of each of T1, T2 and T3
// is exactly complex<double> (checked with IsSame, so no conversions are accepted).
// In this file it gates the kernels that call cblas_zgemm.
202 template<
typename T1,
typename T2,
typename T3 >
203 struct UseDoublePrecisionComplexKernel {
// Element type all three matrices must share.
204 typedef complex<double> Type;
205 enum { value = IsSame<typename T1::ElementType,Type>::value &&
206 IsSame<typename T2::ElementType,Type>::value &&
207 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS fallback: true when BLAZE_BLAS_MODE
// evaluates to false, or when none of the four BLAS predicates (single, double,
// single-complex, double-complex) holds for T1, T2, T3.
217 template<
typename T1,
typename T2,
typename T3 >
218 struct UseDefaultKernel {
219 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
220 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
221 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
222 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate choosing between the scalar and the vectorized default kernels.
// True only if all three matrix types report themselves as vectorizable, all three
// share the same element type, and IntrinsicTrait of that element type reports both
// addition and multiplication.
232 template<
typename T1,
typename T2,
typename T3 >
233 struct UseVectorizedDefaultKernel {
234 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
235 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
236 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
237 IntrinsicTrait<typename T1::ElementType>::addition &&
238 IntrinsicTrait<typename T1::ElementType>::multiplication };
269 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
275 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
276 !evaluateRight && MT2::smpAssignable };
306 if(
lhs_.columns() != 0UL ) {
307 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
309 for(
size_t k=1UL; k<end; k+=2UL ) {
311 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
313 if( end <
lhs_.columns() ) {
341 return rhs_.columns();
371 template<
typename T >
373 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
383 template<
typename T >
385 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
395 return lhs_.isAligned() &&
rhs_.isAligned();
430 template<
typename MT
439 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
442 else if( rhs.
lhs_.columns() == 0UL ) {
457 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Dispatcher for the assignment C = A * B: forwards to either the default kernel or the
// BLAS kernel family.
// NOTE(review): the condition that chooses between the two calls is not visible in this
// excerpt - confirm against the full file.
473 template<
typename MT3
476 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
479 DMatTDMatMultExpr::selectDefaultAssignKernel( C, A, B );
481 DMatTDMatMultExpr::selectBlasAssignKernel( C, A, B );
// Scalar default assignment kernel, selected when UseVectorizedDefaultKernel is false.
// Writes the product of A and B into C element by element:
//   C - target matrix (overwritten), A - left operand, B - right operand.
// NOTE(review): the first pass reads A(i,0) and B(0,j) unconditionally, so this assumes
// A has at least one column - presumably guaranteed by the caller; confirm.
500 template<
typename MT3
503 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
504 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the shared (inner) dimension.
506 const size_t M( A.rows() );
507 const size_t N( B.columns() );
508 const size_t K( A.columns() );
510 for(
size_t i=0UL; i<M; ++i ) {
// Initialize row i of C with the k = 0 term, so C needs no prior reset.
511 for(
size_t j=0UL; j<N; ++j ) {
512 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulate the remaining terms k = 1 .. K-1 into row i.
514 for(
size_t k=1UL; k<K; ++k ) {
515 for(
size_t j=0UL; j<N; ++j ) {
516 C(i,j) += A(i,k) * B(k,j);
// Vectorized default assignment kernel; overload taking the target as DenseMatrix<MT3,false>
// (presumably the row-major storage order - confirm). Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Results overwrite C.
// NOTE(review): this excerpt omits several lines of the function (declarations of i, j,
// a1/a2, b1..b4 and of most accumulators, plus braces and guards between the sections),
// so the section comments below describe only what is visible.
538 template<
typename MT3
541 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
542 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
544 typedef IntrinsicTrait<ElementType> IT;
546 const size_t M( A.rows() );
547 const size_t N( B.columns() );
548 const size_t K( A.columns() );
// Rows of C are processed two at a time.
552 for( ; (i+2UL) <= M; i+=2UL ) {
// Columns four at a time: eight accumulators cover a 2x4 tile of C.
554 for( ; (j+4UL) <= N; j+=4UL ) {
// NOTE(review): accumulators carry no explicit initializer; presumably
// default-constructed to zero - confirm.
555 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The inner dimension advances by IT::size per step. NOTE(review): no remainder
// loop is visible, so this presumably relies on padded operand storage - confirm.
556 for(
size_t k=0UL; k<K; k+=IT::size ) {
563 xmm1 = xmm1 + a1 * b1;
564 xmm2 = xmm2 + a1 * b2;
565 xmm3 = xmm3 + a1 * b3;
566 xmm4 = xmm4 + a1 * b4;
567 xmm5 = xmm5 + a2 * b1;
568 xmm6 = xmm6 + a2 * b2;
569 xmm7 = xmm7 + a2 * b3;
570 xmm8 = xmm8 + a2 * b4;
// Reduce every accumulator with sum() and store it into the tile.
572 (~C)(i ,j ) =
sum( xmm1 );
573 (~C)(i ,j+1UL) =
sum( xmm2 );
574 (~C)(i ,j+2UL) =
sum( xmm3 );
575 (~C)(i ,j+3UL) =
sum( xmm4 );
576 (~C)(i+1UL,j ) =
sum( xmm5 );
577 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
578 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
579 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
// Remaining columns two at a time (2x2 tile).
581 for( ; (j+2UL) <= N; j+=2UL ) {
583 for(
size_t k=0UL; k<K; k+=IT::size ) {
588 xmm1 = xmm1 + a1 * b1;
589 xmm2 = xmm2 + a1 * b2;
590 xmm3 = xmm3 + a2 * b1;
591 xmm4 = xmm4 + a2 * b2;
593 (~C)(i ,j ) =
sum( xmm1 );
594 (~C)(i ,j+1UL) =
sum( xmm2 );
595 (~C)(i+1UL,j ) =
sum( xmm3 );
596 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
// Single column j for the current row pair (its guard is not visible here).
600 for(
size_t k=0UL; k<K; k+=IT::size ) {
602 xmm1 = xmm1 + A.load(i ,k) * b1;
603 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
605 (~C)(i ,j) =
sum( xmm1 );
606 (~C)(i+1UL,j) =
sum( xmm2 );
// Single-row section (presumably the leftover row when M is odd - guard not visible):
// columns four at a time.
611 for( ; (j+4UL) <= N; j+=4UL ) {
613 for(
size_t k=0UL; k<K; k+=IT::size ) {
615 xmm1 = xmm1 + a1 * B.load(k,j );
616 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
617 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
618 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
620 (~C)(i,j ) =
sum( xmm1 );
621 (~C)(i,j+1UL) =
sum( xmm2 );
622 (~C)(i,j+2UL) =
sum( xmm3 );
623 (~C)(i,j+3UL) =
sum( xmm4 );
// Single row, columns two at a time.
625 for( ; (j+2UL) <= N; j+=2UL ) {
627 for(
size_t k=0UL; k<K; k+=IT::size ) {
629 xmm1 = xmm1 + a1 * B.load(k,j );
630 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
632 (~C)(i,j ) =
sum( xmm1 );
633 (~C)(i,j+1UL) =
sum( xmm2 );
// Single row, single column: one element of C.
637 for(
size_t k=0UL; k<K; k+=IT::size ) {
638 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
640 (~C)(i,j) =
sum( xmm1 );
// Vectorized default assignment kernel; overload taking the target as DenseMatrix<MT3,true>
// (presumably the column-major storage order - confirm). Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Results overwrite C.
// NOTE(review): this excerpt omits several lines of the function (declarations of i, j,
// a1..a4, b1/b2 and of most accumulators, plus braces and guards between the sections),
// so the section comments below describe only what is visible.
661 template<
typename MT3
664 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
665 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
667 typedef IntrinsicTrait<ElementType> IT;
669 const size_t M( A.rows() );
670 const size_t N( B.columns() );
671 const size_t K( A.columns() );
// Rows of C are processed four at a time first.
675 for( ; (i+4UL) <= M; i+=4UL ) {
// Columns two at a time: eight accumulators cover a 4x2 tile of C.
677 for( ; (j+2UL) <= N; j+=2UL ) {
// NOTE(review): accumulators carry no explicit initializer; presumably
// default-constructed to zero - confirm.
678 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The inner dimension advances by IT::size per step. NOTE(review): no remainder
// loop is visible, so this presumably relies on padded operand storage - confirm.
679 for(
size_t k=0UL; k<K; k+=IT::size ) {
686 xmm1 = xmm1 + a1 * b1;
687 xmm2 = xmm2 + a1 * b2;
688 xmm3 = xmm3 + a2 * b1;
689 xmm4 = xmm4 + a2 * b2;
690 xmm5 = xmm5 + a3 * b1;
691 xmm6 = xmm6 + a3 * b2;
692 xmm7 = xmm7 + a4 * b1;
693 xmm8 = xmm8 + a4 * b2;
// Reduce every accumulator with sum() and store it into the tile.
695 (~C)(i ,j ) =
sum( xmm1 );
696 (~C)(i ,j+1UL) =
sum( xmm2 );
697 (~C)(i+1UL,j ) =
sum( xmm3 );
698 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
699 (~C)(i+2UL,j ) =
sum( xmm5 );
700 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
701 (~C)(i+3UL,j ) =
sum( xmm7 );
702 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
// Single column j for the current block of four rows (its guard is not visible here).
706 for(
size_t k=0UL; k<K; k+=IT::size ) {
708 xmm1 = xmm1 + A.load(i ,k) * b1;
709 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
710 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
711 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
713 (~C)(i ,j) =
sum( xmm1 );
714 (~C)(i+1UL,j) =
sum( xmm2 );
715 (~C)(i+2UL,j) =
sum( xmm3 );
716 (~C)(i+3UL,j) =
sum( xmm4 );
// Remaining rows two at a time.
719 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile of C.
721 for( ; (j+2UL) <= N; j+=2UL ) {
723 for(
size_t k=0UL; k<K; k+=IT::size ) {
728 xmm1 = xmm1 + a1 * b1;
729 xmm2 = xmm2 + a1 * b2;
730 xmm3 = xmm3 + a2 * b1;
731 xmm4 = xmm4 + a2 * b2;
733 (~C)(i ,j ) =
sum( xmm1 );
734 (~C)(i ,j+1UL) =
sum( xmm2 );
735 (~C)(i+1UL,j ) =
sum( xmm3 );
736 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
// Single column j for the current row pair (its guard is not visible here).
740 for(
size_t k=0UL; k<K; k+=IT::size ) {
742 xmm1 = xmm1 + A.load(i ,k) * b1;
743 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
745 (~C)(i ,j) =
sum( xmm1 );
746 (~C)(i+1UL,j) =
sum( xmm2 );
// Single-row section (presumably the leftover row when M is odd - guard not visible):
// columns two at a time.
751 for( ; (j+2UL) <= N; j+=2UL ) {
753 for(
size_t k=0UL; k<K; k+=IT::size ) {
755 xmm1 = xmm1 + a1 * B.load(k,j );
756 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
758 (~C)(i,j ) =
sum( xmm1 );
759 (~C)(i,j+1UL) =
sum( xmm2 );
// Single row, single column: one element of C.
763 for(
size_t k=0UL; k<K; k+=IT::size ) {
764 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
766 (~C)(i,j) =
sum( xmm1 );
// BLAS-family fallback for assignment: chosen when UseDefaultKernel<MT3,MT4,MT5> holds
// (BLAS mode off or no matching BLAS element type). It simply forwards to the default
// assignment kernel with the same arguments.
787 template<
typename MT3
790 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
791 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
793 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel for single precision operands (enabled by UseSinglePrecisionKernel).
// Calls cblas_sgemm with alpha = 1 and beta = 0, i.e. C is overwritten with the product.
813 template<
typename MT3
816 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
817 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
819 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
825 const int M ( numeric_cast<int>( A.rows() ) );
826 const int N ( numeric_cast<int>( B.columns() ) );
827 const int K ( numeric_cast<int>( A.columns() ) );
828 const int lda( numeric_cast<int>( A.spacing() ) );
829 const int ldb( numeric_cast<int>( B.spacing() ) );
830 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
832 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
833 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
834 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
835 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel for double precision operands (enabled by UseDoublePrecisionKernel).
// Calls cblas_dgemm with alpha = 1 and beta = 0, i.e. C is overwritten with the product.
856 template<
typename MT3
859 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
860 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
862 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
868 const int M ( numeric_cast<int>( A.rows() ) );
869 const int N ( numeric_cast<int>( B.columns() ) );
870 const int K ( numeric_cast<int>( A.columns() ) );
871 const int lda( numeric_cast<int>( A.spacing() ) );
872 const int ldb( numeric_cast<int>( B.spacing() ) );
873 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
875 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
876 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
877 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
878 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel for complex<float> operands (enabled by
// UseSinglePrecisionComplexKernel). Calls cblas_cgemm with alpha = (1,0) and beta = (0,0),
// i.e. C is overwritten with the product.
899 template<
typename MT3
902 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
903 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
905 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
914 const int M ( numeric_cast<int>( A.rows() ) );
915 const int N ( numeric_cast<int>( B.columns() ) );
916 const int K ( numeric_cast<int>( A.columns() ) );
917 const int lda( numeric_cast<int>( A.spacing() ) );
918 const int ldb( numeric_cast<int>( B.spacing() ) );
919 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
920 const complex<float> alpha( 1.0F, 0.0F );
921 const complex<float> beta ( 0.0F, 0.0F );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
923 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
924 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
925 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
926 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel for complex<double> operands (enabled by
// UseDoublePrecisionComplexKernel). Calls cblas_zgemm with alpha = (1,0) and beta = (0,0),
// i.e. C is overwritten with the product.
947 template<
typename MT3
950 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
951 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
953 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
962 const int M ( numeric_cast<int>( A.rows() ) );
963 const int N ( numeric_cast<int>( B.columns() ) );
964 const int K ( numeric_cast<int>( A.columns() ) );
965 const int lda( numeric_cast<int>( A.spacing() ) );
966 const int ldb( numeric_cast<int>( B.spacing() ) );
967 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
968 const complex<double> alpha( 1.0, 0.0 );
969 const complex<double> beta ( 0.0, 0.0 );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
971 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
972 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
973 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
974 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
992 template<
typename MT
998 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
1010 const TmpType tmp(
serial( rhs ) );
1029 template<
typename MT
1038 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1052 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Dispatcher for the addition assignment C += A * B: forwards to either the default
// kernel or the BLAS kernel family.
// NOTE(review): the condition that chooses between the two calls is not visible in this
// excerpt - confirm against the full file.
1068 template<
typename MT3
1071 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1074 DMatTDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1076 DMatTDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
// Scalar default addition-assignment kernel, selected when UseVectorizedDefaultKernel
// is false. Adds the product of A and B onto the existing contents of C:
//   C - target matrix (accumulated into), A - left operand, B - right operand.
1095 template<
typename MT3
1098 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1099 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the shared (inner) dimension.
1101 const size_t M( A.rows() );
1102 const size_t N( B.columns() );
1103 const size_t K( A.columns() );
// N with its lowest bit cleared: the largest even column count not exceeding N.
1106 const size_t end( N &
size_t(-2) );
1108 for(
size_t i=0UL; i<M; ++i ) {
1109 for(
size_t k=0UL; k<K; ++k ) {
// Columns are updated in pairs (manual two-way unrolling).
1110 for(
size_t j=0UL; j<end; j+=2UL ) {
1111 C(i,j ) += A(i,k) * B(k,j );
1112 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update for column index end, the column left over when N is odd.
// NOTE(review): the guard around this statement is not visible here - confirm.
1115 C(i,end) += A(i,k) * B(k,end);
// Vectorized default addition-assignment kernel; overload taking the target as
// DenseMatrix<MT3,false> (presumably the row-major storage order - confirm). Enabled only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Results are added onto C.
// NOTE(review): this excerpt omits several lines of the function (declarations of i, j,
// a1/a2, b1..b4 and of most accumulators, plus braces and guards between the sections),
// so the section comments below describe only what is visible.
1137 template<
typename MT3
1140 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1141 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1143 typedef IntrinsicTrait<ElementType> IT;
1145 const size_t M( A.rows() );
1146 const size_t N( B.columns() );
1147 const size_t K( A.columns() );
// Rows of C are processed two at a time.
1151 for( ; (i+2UL) <= M; i+=2UL ) {
// Columns four at a time: eight accumulators cover a 2x4 tile of C.
1153 for( ; (j+4UL) <= N; j+=4UL ) {
// NOTE(review): accumulators carry no explicit initializer; presumably
// default-constructed to zero - confirm.
1154 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The inner dimension advances by IT::size per step. NOTE(review): no remainder
// loop is visible, so this presumably relies on padded operand storage - confirm.
1155 for(
size_t k=0UL; k<K; k+=IT::size ) {
1162 xmm1 = xmm1 + a1 * b1;
1163 xmm2 = xmm2 + a1 * b2;
1164 xmm3 = xmm3 + a1 * b3;
1165 xmm4 = xmm4 + a1 * b4;
1166 xmm5 = xmm5 + a2 * b1;
1167 xmm6 = xmm6 + a2 * b2;
1168 xmm7 = xmm7 + a2 * b3;
1169 xmm8 = xmm8 + a2 * b4;
// Reduce every accumulator with sum() and add it onto the tile.
1171 (~C)(i ,j ) +=
sum( xmm1 );
1172 (~C)(i ,j+1UL) +=
sum( xmm2 );
1173 (~C)(i ,j+2UL) +=
sum( xmm3 );
1174 (~C)(i ,j+3UL) +=
sum( xmm4 );
1175 (~C)(i+1UL,j ) +=
sum( xmm5 );
1176 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
1177 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
1178 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
// Remaining columns two at a time (2x2 tile).
1180 for( ; (j+2UL) <= N; j+=2UL ) {
1182 for(
size_t k=0UL; k<K; k+=IT::size ) {
1187 xmm1 = xmm1 + a1 * b1;
1188 xmm2 = xmm2 + a1 * b2;
1189 xmm3 = xmm3 + a2 * b1;
1190 xmm4 = xmm4 + a2 * b2;
1192 (~C)(i ,j ) +=
sum( xmm1 );
1193 (~C)(i ,j+1UL) +=
sum( xmm2 );
1194 (~C)(i+1UL,j ) +=
sum( xmm3 );
1195 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
// Single column j for the current row pair (its guard is not visible here).
1199 for(
size_t k=0UL; k<K; k+=IT::size ) {
1201 xmm1 = xmm1 + A.load(i ,k) * b1;
1202 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1204 (~C)(i ,j) +=
sum( xmm1 );
1205 (~C)(i+1UL,j) +=
sum( xmm2 );
// Single-row section (presumably the leftover row when M is odd - guard not visible):
// columns four at a time.
1210 for( ; (j+4UL) <= N; j+=4UL ) {
1212 for(
size_t k=0UL; k<K; k+=IT::size ) {
1214 xmm1 = xmm1 + a1 * B.load(k,j );
1215 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1216 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1217 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1219 (~C)(i,j ) +=
sum( xmm1 );
1220 (~C)(i,j+1UL) +=
sum( xmm2 );
1221 (~C)(i,j+2UL) +=
sum( xmm3 );
1222 (~C)(i,j+3UL) +=
sum( xmm4 );
// Single row, columns two at a time.
1224 for( ; (j+2UL) <= N; j+=2UL ) {
1226 for(
size_t k=0UL; k<K; k+=IT::size ) {
1228 xmm1 = xmm1 + a1 * B.load(k,j );
1229 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1231 (~C)(i,j ) +=
sum( xmm1 );
1232 (~C)(i,j+1UL) +=
sum( xmm2 );
// Single row, single column: one element of C.
1236 for(
size_t k=0UL; k<K; k+=IT::size ) {
1237 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1239 (~C)(i,j) +=
sum( xmm1 );
// Vectorized default addition-assignment kernel; overload taking the target as
// DenseMatrix<MT3,true> (presumably the column-major storage order - confirm). Enabled only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Results are added onto C.
// NOTE(review): this excerpt omits several lines of the function (declarations of i, j,
// a1..a4, b1/b2 and of most accumulators, plus braces and guards between the sections),
// so the section comments below describe only what is visible.
1260 template<
typename MT3
1263 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1264 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1266 typedef IntrinsicTrait<ElementType> IT;
1268 const size_t M( A.rows() );
1269 const size_t N( B.columns() );
1270 const size_t K( A.columns() );
// Rows of C are processed four at a time first.
1274 for( ; (i+4UL) <= M; i+=4UL ) {
// Columns two at a time: eight accumulators cover a 4x2 tile of C.
1276 for( ; (j+2UL) <= N; j+=2UL ) {
// NOTE(review): accumulators carry no explicit initializer; presumably
// default-constructed to zero - confirm.
1277 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The inner dimension advances by IT::size per step. NOTE(review): no remainder
// loop is visible, so this presumably relies on padded operand storage - confirm.
1278 for(
size_t k=0UL; k<K; k+=IT::size ) {
1285 xmm1 = xmm1 + a1 * b1;
1286 xmm2 = xmm2 + a1 * b2;
1287 xmm3 = xmm3 + a2 * b1;
1288 xmm4 = xmm4 + a2 * b2;
1289 xmm5 = xmm5 + a3 * b1;
1290 xmm6 = xmm6 + a3 * b2;
1291 xmm7 = xmm7 + a4 * b1;
1292 xmm8 = xmm8 + a4 * b2;
// Reduce every accumulator with sum() and add it onto the tile.
1294 (~C)(i ,j ) +=
sum( xmm1 );
1295 (~C)(i ,j+1UL) +=
sum( xmm2 );
1296 (~C)(i+1UL,j ) +=
sum( xmm3 );
1297 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
1298 (~C)(i+2UL,j ) +=
sum( xmm5 );
1299 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
1300 (~C)(i+3UL,j ) +=
sum( xmm7 );
1301 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
// Single column j for the current block of four rows (its guard is not visible here).
1305 for(
size_t k=0UL; k<K; k+=IT::size ) {
1307 xmm1 = xmm1 + A.load(i ,k) * b1;
1308 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1309 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1310 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1312 (~C)(i ,j) +=
sum( xmm1 );
1313 (~C)(i+1UL,j) +=
sum( xmm2 );
1314 (~C)(i+2UL,j) +=
sum( xmm3 );
1315 (~C)(i+3UL,j) +=
sum( xmm4 );
// Remaining rows two at a time.
1318 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile of C.
1320 for( ; (j+2UL) <= N; j+=2UL ) {
1322 for(
size_t k=0UL; k<K; k+=IT::size ) {
1327 xmm1 = xmm1 + a1 * b1;
1328 xmm2 = xmm2 + a1 * b2;
1329 xmm3 = xmm3 + a2 * b1;
1330 xmm4 = xmm4 + a2 * b2;
1332 (~C)(i ,j ) +=
sum( xmm1 );
1333 (~C)(i ,j+1UL) +=
sum( xmm2 );
1334 (~C)(i+1UL,j ) +=
sum( xmm3 );
1335 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
// Single column j for the current row pair (its guard is not visible here).
1339 for(
size_t k=0UL; k<K; k+=IT::size ) {
1341 xmm1 = xmm1 + A.load(i ,k) * b1;
1342 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1344 (~C)(i ,j) +=
sum( xmm1 );
1345 (~C)(i+1UL,j) +=
sum( xmm2 );
// Single-row section (presumably the leftover row when M is odd - guard not visible):
// columns two at a time.
1350 for( ; (j+2UL) <= N; j+=2UL ) {
1352 for(
size_t k=0UL; k<K; k+=IT::size ) {
1354 xmm1 = xmm1 + a1 * B.load(k,j );
1355 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1357 (~C)(i,j ) +=
sum( xmm1 );
1358 (~C)(i,j+1UL) +=
sum( xmm2 );
// Single row, single column: one element of C.
1362 for(
size_t k=0UL; k<K; k+=IT::size ) {
1363 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1365 (~C)(i,j) +=
sum( xmm1 );
// BLAS-family fallback for addition assignment: chosen when UseDefaultKernel<MT3,MT4,MT5>
// holds (BLAS mode off or no matching BLAS element type). It simply forwards to the
// default addition-assignment kernel with the same arguments.
1386 template<
typename MT3
1389 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1390 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1392 selectDefaultAddAssignKernel( C, A, B );
// BLAS addition-assignment kernel for single precision operands (enabled by
// UseSinglePrecisionKernel). Calls cblas_sgemm with alpha = 1 and beta = 1, i.e. the
// product is added onto the existing contents of C.
1412 template<
typename MT3
1415 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1416 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1418 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
1424 const int M ( numeric_cast<int>( A.rows() ) );
1425 const int N ( numeric_cast<int>( B.columns() ) );
1426 const int K ( numeric_cast<int>( A.columns() ) );
1427 const int lda( numeric_cast<int>( A.spacing() ) );
1428 const int ldb( numeric_cast<int>( B.spacing() ) );
1429 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
1431 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1432 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1433 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1434 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition-assignment kernel for double precision operands (enabled by
// UseDoublePrecisionKernel). Calls cblas_dgemm with alpha = 1 and beta = 1, i.e. the
// product is added onto the existing contents of C.
1455 template<
typename MT3
1458 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1459 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1461 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
1467 const int M ( numeric_cast<int>( A.rows() ) );
1468 const int N ( numeric_cast<int>( B.columns() ) );
1469 const int K ( numeric_cast<int>( A.columns() ) );
1470 const int lda( numeric_cast<int>( A.spacing() ) );
1471 const int ldb( numeric_cast<int>( B.spacing() ) );
1472 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
1474 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1475 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1476 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1477 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition-assignment kernel for complex<float> operands (enabled by
// UseSinglePrecisionComplexKernel). Calls cblas_cgemm with alpha = (1,0) and beta = (1,0),
// i.e. the product is added onto the existing contents of C.
1498 template<
typename MT3
1501 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1502 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1504 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
1513 const int M ( numeric_cast<int>( A.rows() ) );
1514 const int N ( numeric_cast<int>( B.columns() ) );
1515 const int K ( numeric_cast<int>( A.columns() ) );
1516 const int lda( numeric_cast<int>( A.spacing() ) );
1517 const int ldb( numeric_cast<int>( B.spacing() ) );
1518 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1519 const complex<float> alpha( 1.0F, 0.0F );
1520 const complex<float> beta ( 1.0F, 0.0F );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
1522 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1523 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1524 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1525 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition-assignment kernel for complex<double> operands (enabled by
// UseDoublePrecisionComplexKernel). Calls cblas_zgemm with alpha = (1,0) and beta = (1,0),
// i.e. the product is added onto the existing contents of C.
1546 template<
typename MT3
1549 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1550 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1552 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
1561 const int M ( numeric_cast<int>( A.rows() ) );
1562 const int N ( numeric_cast<int>( B.columns() ) );
1563 const int K ( numeric_cast<int>( A.columns() ) );
1564 const int lda( numeric_cast<int>( A.spacing() ) );
1565 const int ldb( numeric_cast<int>( B.spacing() ) );
1566 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1567 const complex<double> alpha( 1.0, 0.0 );
1568 const complex<double> beta ( 1.0, 0.0 );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
1570 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1571 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1572 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1573 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1596 template<
typename MT
1605 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1619 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Dispatcher for the subtraction assignment C -= A * B: forwards to either the default
// kernel or the BLAS kernel family.
// NOTE(review): the condition that chooses between the two calls is not visible in this
// excerpt - confirm against the full file.
1635 template<
typename MT3
1638 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1641 DMatTDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1643 DMatTDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
// Scalar default subtraction-assignment kernel, selected when UseVectorizedDefaultKernel
// is false. Subtracts the product of A and B from the existing contents of C:
//   C - target matrix (updated in place), A - left operand, B - right operand.
1662 template<
typename MT3
1665 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1666 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the shared (inner) dimension.
1668 const size_t M( A.rows() );
1669 const size_t N( B.columns() );
1670 const size_t K( A.columns() );
// N with its lowest bit cleared: the largest even column count not exceeding N.
1673 const size_t end( N &
size_t(-2) );
1675 for(
size_t i=0UL; i<M; ++i ) {
1676 for(
size_t k=0UL; k<K; ++k ) {
// Columns are updated in pairs (manual two-way unrolling).
1677 for(
size_t j=0UL; j<end; j+=2UL ) {
1678 C(i,j ) -= A(i,k) * B(k,j );
1679 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update for column index end, the column left over when N is odd.
// NOTE(review): the guard around this statement is not visible here - confirm.
1682 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default subtraction-assignment kernel; overload taking the target as
// DenseMatrix<MT3,false> (presumably the row-major storage order - confirm). Enabled only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Results are subtracted from C.
// NOTE(review): this excerpt omits several lines of the function (declarations of i, j,
// a1/a2, b1..b4 and of most accumulators, plus braces and guards between the sections),
// so the section comments below describe only what is visible.
1704 template<
typename MT3
1707 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1708 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1710 typedef IntrinsicTrait<ElementType> IT;
1712 const size_t M( A.rows() );
1713 const size_t N( B.columns() );
1714 const size_t K( A.columns() );
// Rows of C are processed two at a time.
1718 for( ; (i+2UL) <= M; i+=2UL ) {
// Columns four at a time: eight accumulators cover a 2x4 tile of C.
1720 for( ; (j+4UL) <= N; j+=4UL ) {
// NOTE(review): accumulators carry no explicit initializer; presumably
// default-constructed to zero - confirm.
1721 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The inner dimension advances by IT::size per step. NOTE(review): no remainder
// loop is visible, so this presumably relies on padded operand storage - confirm.
1722 for(
size_t k=0UL; k<K; k+=IT::size ) {
// The accumulators collect the positive product; the sign is applied at the store.
1729 xmm1 = xmm1 + a1 * b1;
1730 xmm2 = xmm2 + a1 * b2;
1731 xmm3 = xmm3 + a1 * b3;
1732 xmm4 = xmm4 + a1 * b4;
1733 xmm5 = xmm5 + a2 * b1;
1734 xmm6 = xmm6 + a2 * b2;
1735 xmm7 = xmm7 + a2 * b3;
1736 xmm8 = xmm8 + a2 * b4;
// Reduce every accumulator with sum() and subtract it from the tile.
1738 (~C)(i ,j ) -=
sum( xmm1 );
1739 (~C)(i ,j+1UL) -=
sum( xmm2 );
1740 (~C)(i ,j+2UL) -=
sum( xmm3 );
1741 (~C)(i ,j+3UL) -=
sum( xmm4 );
1742 (~C)(i+1UL,j ) -=
sum( xmm5 );
1743 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
1744 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
1745 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// Remaining columns two at a time (2x2 tile).
1747 for( ; (j+2UL) <= N; j+=2UL ) {
1749 for(
size_t k=0UL; k<K; k+=IT::size ) {
1754 xmm1 = xmm1 + a1 * b1;
1755 xmm2 = xmm2 + a1 * b2;
1756 xmm3 = xmm3 + a2 * b1;
1757 xmm4 = xmm4 + a2 * b2;
1759 (~C)(i ,j ) -=
sum( xmm1 );
1760 (~C)(i ,j+1UL) -=
sum( xmm2 );
1761 (~C)(i+1UL,j ) -=
sum( xmm3 );
1762 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
// Single column j for the current row pair (its guard is not visible here).
1766 for(
size_t k=0UL; k<K; k+=IT::size ) {
1768 xmm1 = xmm1 + A.load(i ,k) * b1;
1769 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1771 (~C)(i ,j) -=
sum( xmm1 );
1772 (~C)(i+1UL,j) -=
sum( xmm2 );
// Single-row section (presumably the leftover row when M is odd - guard not visible):
// columns four at a time.
1777 for( ; (j+4UL) <= N; j+=4UL ) {
1779 for(
size_t k=0UL; k<K; k+=IT::size ) {
1781 xmm1 = xmm1 + a1 * B.load(k,j );
1782 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1783 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1784 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1786 (~C)(i,j ) -=
sum( xmm1 );
1787 (~C)(i,j+1UL) -=
sum( xmm2 );
1788 (~C)(i,j+2UL) -=
sum( xmm3 );
1789 (~C)(i,j+3UL) -=
sum( xmm4 );
// Single row, columns two at a time.
1791 for( ; (j+2UL) <= N; j+=2UL ) {
1793 for(
size_t k=0UL; k<K; k+=IT::size ) {
1795 xmm1 = xmm1 + a1 * B.load(k,j );
1796 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1798 (~C)(i,j ) -=
sum( xmm1 );
1799 (~C)(i,j+1UL) -=
sum( xmm2 );
// Single row, single column: one element of C.
1803 for(
size_t k=0UL; k<K; k+=IT::size ) {
1804 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1806 (~C)(i,j) -=
sum( xmm1 );
// Vectorized default subtraction-assignment kernel; overload taking the target as
// DenseMatrix<MT3,true> (presumably the column-major storage order - confirm). Enabled only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Results are subtracted from C.
// NOTE(review): this excerpt omits several lines of the function (declarations of i, j,
// a1..a4, b1/b2 and of most accumulators, plus braces and guards between the sections),
// so the section comments below describe only what is visible.
1827 template<
typename MT3
1830 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1831 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1833 typedef IntrinsicTrait<ElementType> IT;
1835 const size_t M( A.rows() );
1836 const size_t N( B.columns() );
1837 const size_t K( A.columns() );
// Rows of C are processed four at a time first.
1841 for( ; (i+4UL) <= M; i+=4UL ) {
// Columns two at a time: eight accumulators cover a 4x2 tile of C.
1843 for( ; (j+2UL) <= N; j+=2UL ) {
// NOTE(review): accumulators carry no explicit initializer; presumably
// default-constructed to zero - confirm.
1844 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The inner dimension advances by IT::size per step. NOTE(review): no remainder
// loop is visible, so this presumably relies on padded operand storage - confirm.
1845 for(
size_t k=0UL; k<K; k+=IT::size ) {
// The accumulators collect the positive product; the sign is applied at the store.
1852 xmm1 = xmm1 + a1 * b1;
1853 xmm2 = xmm2 + a1 * b2;
1854 xmm3 = xmm3 + a2 * b1;
1855 xmm4 = xmm4 + a2 * b2;
1856 xmm5 = xmm5 + a3 * b1;
1857 xmm6 = xmm6 + a3 * b2;
1858 xmm7 = xmm7 + a4 * b1;
1859 xmm8 = xmm8 + a4 * b2;
// Reduce every accumulator with sum() and subtract it from the tile.
1861 (~C)(i ,j ) -=
sum( xmm1 );
1862 (~C)(i ,j+1UL) -=
sum( xmm2 );
1863 (~C)(i+1UL,j ) -=
sum( xmm3 );
1864 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
1865 (~C)(i+2UL,j ) -=
sum( xmm5 );
1866 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
1867 (~C)(i+3UL,j ) -=
sum( xmm7 );
1868 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// Single column j for the current block of four rows (its guard is not visible here).
1872 for(
size_t k=0UL; k<K; k+=IT::size ) {
1874 xmm1 = xmm1 + A.load(i ,k) * b1;
1875 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1876 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1877 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1879 (~C)(i ,j) -=
sum( xmm1 );
1880 (~C)(i+1UL,j) -=
sum( xmm2 );
1881 (~C)(i+2UL,j) -=
sum( xmm3 );
1882 (~C)(i+3UL,j) -=
sum( xmm4 );
// Remaining rows two at a time.
1885 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile of C.
1887 for( ; (j+2UL) <= N; j+=2UL ) {
1889 for(
size_t k=0UL; k<K; k+=IT::size ) {
1894 xmm1 = xmm1 + a1 * b1;
1895 xmm2 = xmm2 + a1 * b2;
1896 xmm3 = xmm3 + a2 * b1;
1897 xmm4 = xmm4 + a2 * b2;
1899 (~C)(i ,j ) -=
sum( xmm1 );
1900 (~C)(i ,j+1UL) -=
sum( xmm2 );
1901 (~C)(i+1UL,j ) -=
sum( xmm3 );
1902 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
// Single column j for the current row pair (its guard is not visible here).
1906 for(
size_t k=0UL; k<K; k+=IT::size ) {
1908 xmm1 = xmm1 + A.load(i ,k) * b1;
1909 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1911 (~C)(i ,j) -=
sum( xmm1 );
1912 (~C)(i+1UL,j) -=
sum( xmm2 );
// Single-row section (presumably the leftover row when M is odd - guard not visible):
// columns two at a time.
1917 for( ; (j+2UL) <= N; j+=2UL ) {
1919 for(
size_t k=0UL; k<K; k+=IT::size ) {
1921 xmm1 = xmm1 + a1 * B.load(k,j );
1922 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1924 (~C)(i,j ) -=
sum( xmm1 );
1925 (~C)(i,j+1UL) -=
sum( xmm2 );
// Single row, single column: one element of C.
1929 for(
size_t k=0UL; k<K; k+=IT::size ) {
1930 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1932 (~C)(i,j) -=
sum( xmm1 );
// BLAS-family fallback for subtraction assignment: chosen when
// UseDefaultKernel<MT3,MT4,MT5> holds (BLAS mode off or no matching BLAS element type).
// It simply forwards to the default subtraction-assignment kernel with the same arguments.
1953 template<
typename MT3
1956 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1957 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1959 selectDefaultSubAssignKernel( C, A, B );
// BLAS subtraction-assignment kernel for single precision operands (enabled by
// UseSinglePrecisionKernel). Calls cblas_sgemm with alpha = -1 and beta = 1, i.e. the
// product is subtracted from the existing contents of C.
1979 template<
typename MT3
1982 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1983 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1985 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
1991 const int M ( numeric_cast<int>( A.rows() ) );
1992 const int N ( numeric_cast<int>( B.columns() ) );
1993 const int K ( numeric_cast<int>( A.columns() ) );
1994 const int lda( numeric_cast<int>( A.spacing() ) );
1995 const int ldb( numeric_cast<int>( B.spacing() ) );
1996 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
1998 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1999 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2000 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2001 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS subtraction-assignment kernel for double precision operands (enabled by
// UseDoublePrecisionKernel). Calls cblas_dgemm with alpha = -1 and beta = 1, i.e. the
// product is subtracted from the existing contents of C.
2022 template<
typename MT3
2025 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2026 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2028 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
2034 const int M ( numeric_cast<int>( A.rows() ) );
2035 const int N ( numeric_cast<int>( B.columns() ) );
2036 const int K ( numeric_cast<int>( A.columns() ) );
2037 const int lda( numeric_cast<int>( A.spacing() ) );
2038 const int ldb( numeric_cast<int>( B.spacing() ) );
2039 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
2041 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2042 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2043 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2044 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS subtraction-assignment kernel for complex<float> operands (enabled by
// UseSinglePrecisionComplexKernel). Calls cblas_cgemm with alpha = (-1,0) and
// beta = (1,0), i.e. the product is subtracted from the existing contents of C.
2065 template<
typename MT3
2068 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2069 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2071 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
2080 const int M ( numeric_cast<int>( A.rows() ) );
2081 const int N ( numeric_cast<int>( B.columns() ) );
2082 const int K ( numeric_cast<int>( A.columns() ) );
2083 const int lda( numeric_cast<int>( A.spacing() ) );
2084 const int ldb( numeric_cast<int>( B.spacing() ) );
2085 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
2086 const complex<float> alpha( -1.0F, 0.0F );
2087 const complex<float> beta ( 1.0F, 0.0F );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
2089 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2090 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2091 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2092 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS subtraction-assignment kernel for complex<double> operands (enabled by
// UseDoublePrecisionComplexKernel). Calls cblas_zgemm with alpha = (-1,0) and
// beta = (1,0), i.e. the product is subtracted from the existing contents of C.
2113 template<
typename MT3
2116 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2117 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2119 using boost::numeric_cast;
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs a
// range-checked conversion from the size values.
2128 const int M ( numeric_cast<int>( A.rows() ) );
2129 const int N ( numeric_cast<int>( B.columns() ) );
2130 const int K ( numeric_cast<int>( A.columns() ) );
2131 const int lda( numeric_cast<int>( A.spacing() ) );
2132 const int ldb( numeric_cast<int>( B.spacing() ) );
2133 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
2134 const complex<double> alpha( -1.0, 0.0 );
2135 const complex<double> beta ( 1.0, 0.0 );
// The layout passed to BLAS follows the target C. With a row-major C, A is passed
// untransposed and B transposed; otherwise the two flags are swapped.
2137 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2138 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2139 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2140 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2173 template<
typename MT
2175 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2183 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2186 else if( rhs.lhs_.columns() == 0UL ) {
2220 template<
typename MT
2222 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2227 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2239 const TmpType tmp( rhs );
2260 template<
typename MT
2262 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2270 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2308 template<
typename MT
2310 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2318 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Head of the DMatScalarMultExpr specialization that wraps a DMatTDMatMultExpr<MT1,MT2>
// together with a scalar of type ST (the class-head line itself is not part of this
// excerpt). It derives publicly from DenseMatrix<...> and privately from the
// MatScalarMultExpr and Computation types.
2377 template<
typename MT1
2381 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
2382 ,
private MatScalarMultExpr
2383 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
2387 typedef DMatTDMatMultExpr<MT1,MT2> MMM;
// An operand is flagged for evaluation when it is itself a computation or when its
// type requires evaluation.
2399 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
2404 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time switch for the SMP assignment overloads: value is non-zero as soon as
// either operand has to be evaluated (evaluateLeft || evaluateRight). The template
// parameter MT does not take part in the condition.
2412 template<
typename MT >
2413 struct UseSMPAssign {
2414 enum { value = ( evaluateLeft || evaluateRight ) };
// Selects the single-precision BLAS kernel: the element types of T1, T2 and T3 must all
// satisfy IsFloat, and the scalar type T4 must not be a complex type.
2423 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2424 struct UseSinglePrecisionKernel {
2425 enum { value = IsFloat<typename T1::ElementType>::value &&
2426 IsFloat<typename T2::ElementType>::value &&
2427 IsFloat<typename T3::ElementType>::value &&
2428 !IsComplex<T4>::value };
// Selects the double-precision BLAS kernel: the element types of T1, T2 and T3 must all
// satisfy IsDouble, and the scalar type T4 must not be a complex type.
2437 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2438 struct UseDoublePrecisionKernel {
2439 enum { value = IsDouble<typename T1::ElementType>::value &&
2440 IsDouble<typename T2::ElementType>::value &&
2441 IsDouble<typename T3::ElementType>::value &&
2442 !IsComplex<T4>::value };
// Selects the single-precision complex BLAS kernel: the element types of T1, T2 and T3
// must all be exactly complex<float>. The scalar type is not part of the condition.
2451 template<
typename T1,
typename T2,
typename T3 >
2452 struct UseSinglePrecisionComplexKernel {
2453 typedef complex<float> Type;
2454 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2455 IsSame<typename T2::ElementType,Type>::value &&
2456 IsSame<typename T3::ElementType,Type>::value };
// Selects the double-precision complex BLAS kernel: the element types of T1, T2 and T3
// must all be exactly complex<double>. The scalar type is not part of the condition.
2465 template<
typename T1,
typename T2,
typename T3 >
2466 struct UseDoublePrecisionComplexKernel {
2467 typedef complex<double> Type;
2468 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2469 IsSame<typename T2::ElementType,Type>::value &&
2470 IsSame<typename T3::ElementType,Type>::value };
// Selects the hand-written default kernel: true when BLAZE_BLAS_MODE evaluates to false,
// or when none of the four BLAS kernel conditions (float, double, complex<float>,
// complex<double>) applies to the given type combination.
2478 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2479 struct UseDefaultKernel {
2480 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2481 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2482 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2483 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Selects the vectorized variant of the default kernel. Requirements: all three matrix
// types are vectorizable, they share one element type, the scalar type T4 is that same
// type, and IntrinsicTrait reports addition and multiplication support for it.
2491 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2492 struct UseVectorizedDefaultKernel {
2493 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2494 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2495 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2496 IsSame<typename T1::ElementType,T4>::value &&
2497 IntrinsicTrait<typename T1::ElementType>::addition &&
2498 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member typedefs of the scaled multiplication expression. RES, RT1, RT2, CT1, CT2 and
// ElementType are declared on lines that are not part of this excerpt.
// Type of this expression object.
2504 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type: MultTrait of RES and the scalar type.
2505 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
2509 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the wrapped multiplication expression as stored operand.
2514 typedef const DMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// Working types for the two multiplication operands: const RT1/RT2 when the operand
// must be evaluated, CT1/CT2 otherwise.
2520 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
2523 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag of the expression: both operand types are vectorizable, their
// element types ET1/ET2 are identical and equal to the scalar type ST, and the element
// type supports intrinsic addition and multiplication.
2528 enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2529 IsSame<ET1,ET2>::value &&
2530 IsSame<ET1,ST>::value &&
2531 IntrinsicTrait<ET1>::addition &&
2532 IntrinsicTrait<ET1>::multiplication };
// SMP flag: set only when neither operand needs evaluation and both operand types are
// themselves SMP-assignable.
2535 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
2536 !evaluateRight && MT2::smpAssignable };
// Explicit constructor taking the multiplication expression and the scalar factor.
// The initializer list and body are not part of this excerpt.
2545 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element (i,j) of the scaled product: the wrapped expression's element times the
// stored scalar. (The enclosing function header is not visible in this excerpt.)
2561 return matrix_(i,j) * scalar_;
// Number of rows of the expression; forwarded from the wrapped multiplication.
2570 inline size_t rows()
const {
2571 return matrix_.rows();
// Number of columns of the expression; forwarded from the wrapped multiplication.
2580 inline size_t columns()
const {
2581 return matrix_.columns();
// Alias query: delegates to the wrapped multiplication expression's canAlias() with
// the given address.
2611 template<
typename T >
2612 inline bool canAlias(
const T* alias )
const {
2613 return matrix_.canAlias( alias );
// Alias query: delegates to the wrapped multiplication expression's isAliased() with
// the given address.
2623 template<
typename T >
2624 inline bool isAliased(
const T* alias )
const {
2625 return matrix_.isAliased( alias );
// Alignment query forwarded to the wrapped multiplication expression. (The enclosing
// function header is not visible in this excerpt.)
2635 return matrix_.isAligned();
// Fetches the left operand of the wrapped multiplication. The enclosing function is not
// visible in this excerpt (presumably canSMPAssign -- TODO confirm).
2645 typename MMM::LeftOperand A( matrix_.leftOperand() );
// Assignment of the scaled product to a dense matrix (lhs = scalar * (left * right)).
2670 template<
typename MT
2672 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped multiplication expression.
2679 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2680 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate cases are handled first: an empty target, then a zero inner dimension.
// The branch bodies are not visible in this excerpt.
2682 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2685 else if( left.columns() == 0UL ) {
// General case: dispatch to the kernel selection. A and B are defined on lines not
// shown here (presumably the operands in their LT/RT working types -- TODO confirm).
2700 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for C = scalar * A * B: forwards either to the default kernel or to
// the BLAS kernel family. The condition that picks between the two calls is on a line
// that is not part of this excerpt.
2715 template<
typename MT3
2719 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2722 DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
2724 DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
// Non-vectorized default kernel for C = scalar * A * B; chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// Note: the first inner loop reads A(i,0) and B(0,k) unconditionally, so it relies on
// A having at least one column; the dense assign() entry point tests
// left.columns() == 0 before dispatching.
2742 template<
typename MT3
2746 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2747 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Row by row over the result.
2749 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Initialise row i of C with the contribution of inner index 0 ...
2750 for(
size_t k=0UL; k<B.columns(); ++k ) {
2751 C(i,k) = A(i,0UL) * B(0UL,k);
// ... then accumulate the remaining inner indices.
2753 for(
size_t j=1UL; j<A.columns(); ++j ) {
2754 for(
size_t k=0UL; k<B.columns(); ++k ) {
2755 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i; its body is not visible here (presumably the scaling by
// 'scalar' -- TODO confirm).
2758 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default kernel for C = scalar * A * B with a DenseMatrix<MT3,false> target
// (presumably the row-major case -- TODO confirm); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// Each tile accumulates intrinsic partial products over k in steps of IT::size, reduces
// them with sum() and stores the scaled result into C.
// NOTE(review): no scalar remainder loop for k is visible; this looks like it depends on
// padded operand storage -- confirm. The declarations of i, j, a*, b* and some
// accumulators are on lines not shown in this excerpt.
2779 template<
typename MT3
2783 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2784 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2786 typedef IntrinsicTrait<ElementType> IT;
2788 const size_t M( A.rows() );
2789 const size_t N( B.columns() );
2790 const size_t K( A.columns() );
// Two rows of C per iteration.
2794 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators.
2796 for( ; (j+4UL) <= N; j+=4UL ) {
2797 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2798 for(
size_t k=0UL; k<K; k+=IT::size ) {
2805 xmm1 = xmm1 + a1 * b1;
2806 xmm2 = xmm2 + a1 * b2;
2807 xmm3 = xmm3 + a1 * b3;
2808 xmm4 = xmm4 + a1 * b4;
2809 xmm5 = xmm5 + a2 * b1;
2810 xmm6 = xmm6 + a2 * b2;
2811 xmm7 = xmm7 + a2 * b3;
2812 xmm8 = xmm8 + a2 * b4;
2814 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2815 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2816 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
2817 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
2818 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
2819 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
2820 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
2821 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// 2x2 tile.
2823 for( ; (j+2UL) <= N; j+=2UL ) {
2825 for(
size_t k=0UL; k<K; k+=IT::size ) {
2830 xmm1 = xmm1 + a1 * b1;
2831 xmm2 = xmm2 + a1 * b2;
2832 xmm3 = xmm3 + a2 * b1;
2833 xmm4 = xmm4 + a2 * b2;
2835 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2836 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2837 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2838 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2x1 tile (single column, two rows).
2842 for(
size_t k=0UL; k<K; k+=IT::size ) {
2844 xmm1 = xmm1 + A.load(i ,k) * b1;
2845 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2847 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2848 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single-row tiles: 1x4.
2853 for( ; (j+4UL) <= N; j+=4UL ) {
2855 for(
size_t k=0UL; k<K; k+=IT::size ) {
2857 xmm1 = xmm1 + a1 * B.load(k,j );
2858 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2859 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
2860 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
2862 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2863 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
2864 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
2865 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
// Single-row tiles: 1x2.
2867 for( ; (j+2UL) <= N; j+=2UL ) {
2869 for(
size_t k=0UL; k<K; k+=IT::size ) {
2871 xmm1 = xmm1 + a1 * B.load(k,j );
2872 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2874 (~C)(i,j ) =
sum( xmm1 ) * scalar;
2875 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single element (1x1).
2879 for(
size_t k=0UL; k<K; k+=IT::size ) {
2880 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2882 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Vectorized default kernel for C = scalar * A * B with a DenseMatrix<MT3,true> target
// (presumably the column-major case -- TODO confirm); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// Tiles are taller than wide here (4x2, 4x1, 2x2, 2x1, 1x2, 1x1). Each tile accumulates
// intrinsic partial products over k in steps of IT::size, reduces them with sum() and
// stores the scaled result into C.
// NOTE(review): no scalar remainder loop for k is visible; this looks like it depends on
// padded operand storage -- confirm. The declarations of i, j, a*, b* and some
// accumulators are on lines not shown in this excerpt.
2902 template<
typename MT3
2906 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2907 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2909 typedef IntrinsicTrait<ElementType> IT;
2911 const size_t M( A.rows() );
2912 const size_t N( B.columns() );
2913 const size_t K( A.columns() );
// Four rows of C per iteration.
2917 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators.
2919 for( ; (j+2UL) <= N; j+=2UL ) {
2920 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2921 for(
size_t k=0UL; k<K; k+=IT::size ) {
2928 xmm1 = xmm1 + a1 * b1;
2929 xmm2 = xmm2 + a1 * b2;
2930 xmm3 = xmm3 + a2 * b1;
2931 xmm4 = xmm4 + a2 * b2;
2932 xmm5 = xmm5 + a3 * b1;
2933 xmm6 = xmm6 + a3 * b2;
2934 xmm7 = xmm7 + a4 * b1;
2935 xmm8 = xmm8 + a4 * b2;
2937 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2938 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2939 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2940 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
2941 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
2942 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
2943 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
2944 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// 4x1 tile (single column, four rows).
2948 for(
size_t k=0UL; k<K; k+=IT::size ) {
2950 xmm1 = xmm1 + A.load(i ,k) * b1;
2951 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2952 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
2953 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
2955 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2956 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
2957 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
2958 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
// Two rows of C per iteration.
2961 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile.
2963 for( ; (j+2UL) <= N; j+=2UL ) {
2965 for(
size_t k=0UL; k<K; k+=IT::size ) {
2970 xmm1 = xmm1 + a1 * b1;
2971 xmm2 = xmm2 + a1 * b2;
2972 xmm3 = xmm3 + a2 * b1;
2973 xmm4 = xmm4 + a2 * b2;
2975 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
2976 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
2977 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
2978 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2x1 tile.
2982 for(
size_t k=0UL; k<K; k+=IT::size ) {
2984 xmm1 = xmm1 + A.load(i ,k) * b1;
2985 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2987 (~C)(i ,j) =
sum( xmm1 ) * scalar;
2988 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single-row tiles: 1x2.
2993 for( ; (j+2UL) <= N; j+=2UL ) {
2995 for(
size_t k=0UL; k<K; k+=IT::size ) {
2997 xmm1 = xmm1 + a1 * B.load(k,j );
2998 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3000 (~C)(i,j ) =
sum( xmm1 ) * scalar;
3001 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single element (1x1).
3005 for(
size_t k=0UL; k<K; k+=IT::size ) {
3006 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3008 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Fallback member of the BLAS assign kernel family: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is true and simply forwards to the default
// (non-BLAS) assignment kernel.
3028 template<
typename MT3
3032 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3033 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3035 selectDefaultAssignKernel( C, A, B, scalar );
// Single-precision BLAS assignment kernel (cblas_sgemm), enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true. The scalar is passed as alpha and
// beta is 0, so the call is meant to realise C = scalar * A * B.
3054 template<
typename MT3
3058 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3059 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3061 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3067 const int M ( numeric_cast<int>( A.rows() ) );
3068 const int N ( numeric_cast<int>( B.columns() ) );
3069 const int K ( numeric_cast<int>( A.columns() ) );
3070 const int lda( numeric_cast<int>( A.spacing() ) );
3071 const int ldb( numeric_cast<int>( B.spacing() ) );
3072 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3074 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3075 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3076 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3077 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// Double-precision BLAS assignment kernel (cblas_dgemm), enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true. The scalar is passed as alpha and
// beta is 0, so the call is meant to realise C = scalar * A * B.
3097 template<
typename MT3
3101 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3102 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3104 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3110 const int M ( numeric_cast<int>( A.rows() ) );
3111 const int N ( numeric_cast<int>( B.columns() ) );
3112 const int K ( numeric_cast<int>( A.columns() ) );
3113 const int lda( numeric_cast<int>( A.spacing() ) );
3114 const int ldb( numeric_cast<int>( B.spacing() ) );
3115 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3117 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3118 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3119 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3120 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// Single-precision complex BLAS assignment kernel (cblas_cgemm), enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true. alpha is built from the scalar
// and beta is (0,0), so the call is meant to realise C = scalar * A * B.
3140 template<
typename MT3
3144 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3145 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3147 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3156 const int M ( numeric_cast<int>( A.rows() ) );
3157 const int N ( numeric_cast<int>( B.columns() ) );
3158 const int K ( numeric_cast<int>( A.columns() ) );
3159 const int lda( numeric_cast<int>( A.spacing() ) );
3160 const int ldb( numeric_cast<int>( B.spacing() ) );
3161 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex gemm takes alpha and beta by address.
3162 const complex<float> alpha( scalar );
3163 const complex<float> beta ( 0.0F, 0.0F );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3165 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3166 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3167 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3168 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS assignment kernel (cblas_zgemm), enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true. alpha is built from the scalar
// and beta is (0,0), so the call is meant to realise C = scalar * A * B.
3188 template<
typename MT3
3192 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3193 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3195 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3204 const int M ( numeric_cast<int>( A.rows() ) );
3205 const int N ( numeric_cast<int>( B.columns() ) );
3206 const int K ( numeric_cast<int>( A.columns() ) );
3207 const int lda( numeric_cast<int>( A.spacing() ) );
3208 const int ldb( numeric_cast<int>( B.spacing() ) );
3209 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex gemm takes alpha and beta by address.
3210 const complex<double> alpha( scalar );
3211 const complex<double> beta ( 0.0, 0.0 );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3213 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3214 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3215 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3216 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled product to a sparse matrix. The expression is evaluated
// into a dense temporary first; the statements that use it are not visible here.
3233 template<
typename MT
3235 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type: OppositeType when SO is true, ResultType otherwise.
3239 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// rhs is wrapped in serial() before the temporary is constructed from it.
3251 const TmpType tmp(
serial( rhs ) );
// Addition assignment of the scaled product to a dense matrix
// (lhs += scalar * (left * right)).
3268 template<
typename MT
3270 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped multiplication expression.
3277 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3278 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for degenerate sizes: empty target or zero inner dimension. The guarded
// statement is not visible in this excerpt.
3280 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// General case: dispatch to the kernel selection. A and B are defined on lines not
// shown here (presumably the operands in their LT/RT working types -- TODO confirm).
3294 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for C += scalar * A * B: forwards either to the default kernel or
// to the BLAS kernel family. The condition that picks between the two calls is on a
// line that is not part of this excerpt.
3309 template<
typename MT3
3313 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3316 DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3318 DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
// Non-vectorized default kernel for the addition assignment; chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf). Only the signature
// is part of this excerpt; the body is not visible.
3336 template<
typename MT3
3340 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3341 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default kernel for C += scalar * A * B with a DenseMatrix<MT3,false> target
// (presumably the row-major case -- TODO confirm); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// Each tile accumulates intrinsic partial products over k in steps of IT::size, reduces
// them with sum() and adds the scaled result onto C.
// NOTE(review): no scalar remainder loop for k is visible; this looks like it depends on
// padded operand storage -- confirm. The declarations of i, j, a*, b* and some
// accumulators are on lines not shown in this excerpt.
3362 template<
typename MT3
3366 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3367 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3369 typedef IntrinsicTrait<ElementType> IT;
3371 const size_t M( A.rows() );
3372 const size_t N( B.columns() );
3373 const size_t K( A.columns() );
// Two rows of C per iteration.
3377 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators.
3379 for( ; (j+4UL) <= N; j+=4UL ) {
3380 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3381 for(
size_t k=0UL; k<K; k+=IT::size ) {
3388 xmm1 = xmm1 + a1 * b1;
3389 xmm2 = xmm2 + a1 * b2;
3390 xmm3 = xmm3 + a1 * b3;
3391 xmm4 = xmm4 + a1 * b4;
3392 xmm5 = xmm5 + a2 * b1;
3393 xmm6 = xmm6 + a2 * b2;
3394 xmm7 = xmm7 + a2 * b3;
3395 xmm8 = xmm8 + a2 * b4;
3397 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3398 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3399 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
3400 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
3401 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
3402 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
3403 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
3404 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// 2x2 tile.
3406 for( ; (j+2UL) <= N; j+=2UL ) {
3408 for(
size_t k=0UL; k<K; k+=IT::size ) {
3413 xmm1 = xmm1 + a1 * b1;
3414 xmm2 = xmm2 + a1 * b2;
3415 xmm3 = xmm3 + a2 * b1;
3416 xmm4 = xmm4 + a2 * b2;
3418 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3419 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3420 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3421 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// 2x1 tile (single column, two rows).
3425 for(
size_t k=0UL; k<K; k+=IT::size ) {
3427 xmm1 = xmm1 + A.load(i ,k) * b1;
3428 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3430 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3431 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Single-row tiles: 1x4.
3436 for( ; (j+4UL) <= N; j+=4UL ) {
3438 for(
size_t k=0UL; k<K; k+=IT::size ) {
3440 xmm1 = xmm1 + a1 * B.load(k,j );
3441 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3442 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3443 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3445 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3446 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
3447 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
3448 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
// Single-row tiles: 1x2.
3450 for( ; (j+2UL) <= N; j+=2UL ) {
3452 for(
size_t k=0UL; k<K; k+=IT::size ) {
3454 xmm1 = xmm1 + a1 * B.load(k,j );
3455 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3457 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3458 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Single element (1x1).
3462 for(
size_t k=0UL; k<K; k+=IT::size ) {
3463 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3465 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Vectorized default kernel for C += scalar * A * B with a DenseMatrix<MT3,true> target
// (presumably the column-major case -- TODO confirm); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// Tiles are taller than wide here (4x2, 4x1, 2x2, 2x1, 1x2, 1x1). Each tile accumulates
// intrinsic partial products over k in steps of IT::size, reduces them with sum() and
// adds the scaled result onto C.
// NOTE(review): no scalar remainder loop for k is visible; this looks like it depends on
// padded operand storage -- confirm. The declarations of i, j, a*, b* and some
// accumulators are on lines not shown in this excerpt.
3485 template<
typename MT3
3489 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3490 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3492 typedef IntrinsicTrait<ElementType> IT;
3494 const size_t M( A.rows() );
3495 const size_t N( B.columns() );
3496 const size_t K( A.columns() );
// Four rows of C per iteration.
3500 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators.
3502 for( ; (j+2UL) <= N; j+=2UL ) {
3503 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3504 for(
size_t k=0UL; k<K; k+=IT::size ) {
3511 xmm1 = xmm1 + a1 * b1;
3512 xmm2 = xmm2 + a1 * b2;
3513 xmm3 = xmm3 + a2 * b1;
3514 xmm4 = xmm4 + a2 * b2;
3515 xmm5 = xmm5 + a3 * b1;
3516 xmm6 = xmm6 + a3 * b2;
3517 xmm7 = xmm7 + a4 * b1;
3518 xmm8 = xmm8 + a4 * b2;
3520 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3521 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3522 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3523 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
3524 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
3525 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
3526 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
3527 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// 4x1 tile (single column, four rows).
3531 for(
size_t k=0UL; k<K; k+=IT::size ) {
3533 xmm1 = xmm1 + A.load(i ,k) * b1;
3534 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3535 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
3536 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
3538 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3539 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
3540 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
3541 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
// Two rows of C per iteration.
3544 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile.
3546 for( ; (j+2UL) <= N; j+=2UL ) {
3548 for(
size_t k=0UL; k<K; k+=IT::size ) {
3553 xmm1 = xmm1 + a1 * b1;
3554 xmm2 = xmm2 + a1 * b2;
3555 xmm3 = xmm3 + a2 * b1;
3556 xmm4 = xmm4 + a2 * b2;
3558 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
3559 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
3560 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
3561 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// 2x1 tile.
3565 for(
size_t k=0UL; k<K; k+=IT::size ) {
3567 xmm1 = xmm1 + A.load(i ,k) * b1;
3568 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3570 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
3571 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Single-row tiles: 1x2.
3576 for( ; (j+2UL) <= N; j+=2UL ) {
3578 for(
size_t k=0UL; k<K; k+=IT::size ) {
3580 xmm1 = xmm1 + a1 * B.load(k,j );
3581 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3583 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
3584 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Single element (1x1).
3588 for(
size_t k=0UL; k<K; k+=IT::size ) {
3589 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3591 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Fallback member of the BLAS add-assign kernel family: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is true and simply forwards to the default
// (non-BLAS) addition assignment kernel.
3611 template<
typename MT3
3615 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3616 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3618 selectDefaultAddAssignKernel( C, A, B, scalar );
// Single-precision BLAS addition assignment kernel (cblas_sgemm), enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true. The scalar is passed as alpha and
// beta is 1, so the call is meant to realise C += scalar * A * B.
3637 template<
typename MT3
3641 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3642 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3644 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3650 const int M ( numeric_cast<int>( A.rows() ) );
3651 const int N ( numeric_cast<int>( B.columns() ) );
3652 const int K ( numeric_cast<int>( A.columns() ) );
3653 const int lda( numeric_cast<int>( A.spacing() ) );
3654 const int ldb( numeric_cast<int>( B.spacing() ) );
3655 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3657 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3658 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3659 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3660 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double-precision BLAS addition assignment kernel (cblas_dgemm), enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true. The scalar is passed as alpha and
// beta is 1, so the call is meant to realise C += scalar * A * B.
3680 template<
typename MT3
3684 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3685 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3687 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3693 const int M ( numeric_cast<int>( A.rows() ) );
3694 const int N ( numeric_cast<int>( B.columns() ) );
3695 const int K ( numeric_cast<int>( A.columns() ) );
3696 const int lda( numeric_cast<int>( A.spacing() ) );
3697 const int ldb( numeric_cast<int>( B.spacing() ) );
3698 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3700 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3701 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3702 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3703 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single-precision complex BLAS addition assignment kernel (cblas_cgemm), enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true. alpha is built from the scalar
// and beta is (1,0), so the call is meant to realise C += scalar * A * B.
3723 template<
typename MT3
3727 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3728 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3730 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3739 const int M ( numeric_cast<int>( A.rows() ) );
3740 const int N ( numeric_cast<int>( B.columns() ) );
3741 const int K ( numeric_cast<int>( A.columns() ) );
3742 const int lda( numeric_cast<int>( A.spacing() ) );
3743 const int ldb( numeric_cast<int>( B.spacing() ) );
3744 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex gemm takes alpha and beta by address.
3745 const complex<float> alpha( scalar );
3746 const complex<float> beta ( 1.0F, 0.0F );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3748 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3749 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3750 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3751 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double-precision complex BLAS addition assignment kernel (cblas_zgemm), enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true. alpha is built from the scalar
// and beta is (1,0), so the call is meant to realise C += scalar * A * B.
3771 template<
typename MT3
3775 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3776 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3778 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
3787 const int M ( numeric_cast<int>( A.rows() ) );
3788 const int N ( numeric_cast<int>( B.columns() ) );
3789 const int K ( numeric_cast<int>( A.columns() ) );
3790 const int lda( numeric_cast<int>( A.spacing() ) );
3791 const int ldb( numeric_cast<int>( B.spacing() ) );
3792 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex gemm takes alpha and beta by address.
3793 const complex<double> alpha( scalar );
3794 const complex<double> beta ( 1.0, 0.0 );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
3796 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3797 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3798 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3799 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of the scaled product to a dense matrix
// (lhs -= scalar * (left * right)).
3820 template<
typename MT
3822 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped multiplication expression.
3829 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3830 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for degenerate sizes: empty target or zero inner dimension. The guarded
// statement is not visible in this excerpt.
3832 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// General case: dispatch to the kernel selection. A and B are defined on lines not
// shown here (presumably the operands in their LT/RT working types -- TODO confirm).
3846 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for C -= scalar * A * B: forwards either to the default kernel or
// to the BLAS kernel family. The condition that picks between the two calls is on a
// line that is not part of this excerpt.
3861 template<
typename MT3
3865 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3868 DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
3870 DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
// Non-vectorized default kernel for the subtraction assignment; chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf). Only the signature
// is part of this excerpt; the body is not visible.
3888 template<
typename MT3
3892 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3893 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default kernel for C -= scalar * A * B with a DenseMatrix<MT3,false> target
// (presumably the row-major case -- TODO confirm); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// Each tile accumulates intrinsic partial products over k in steps of IT::size, reduces
// them with sum() and subtracts the scaled result from C.
// NOTE(review): no scalar remainder loop for k is visible; this looks like it depends on
// padded operand storage -- confirm. The declarations of i, j, a*, b* and some
// accumulators are on lines not shown in this excerpt.
3914 template<
typename MT3
3918 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3919 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3921 typedef IntrinsicTrait<ElementType> IT;
3923 const size_t M( A.rows() );
3924 const size_t N( B.columns() );
3925 const size_t K( A.columns() );
// Two rows of C per iteration.
3929 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x4 tile: eight accumulators.
3931 for( ; (j+4UL) <= N; j+=4UL ) {
3932 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3933 for(
size_t k=0UL; k<K; k+=IT::size ) {
3940 xmm1 = xmm1 + a1 * b1;
3941 xmm2 = xmm2 + a1 * b2;
3942 xmm3 = xmm3 + a1 * b3;
3943 xmm4 = xmm4 + a1 * b4;
3944 xmm5 = xmm5 + a2 * b1;
3945 xmm6 = xmm6 + a2 * b2;
3946 xmm7 = xmm7 + a2 * b3;
3947 xmm8 = xmm8 + a2 * b4;
3949 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3950 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3951 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
3952 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
3953 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
3954 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
3955 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
3956 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// 2x2 tile.
3958 for( ; (j+2UL) <= N; j+=2UL ) {
3960 for(
size_t k=0UL; k<K; k+=IT::size ) {
3965 xmm1 = xmm1 + a1 * b1;
3966 xmm2 = xmm2 + a1 * b2;
3967 xmm3 = xmm3 + a2 * b1;
3968 xmm4 = xmm4 + a2 * b2;
3970 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
3971 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
3972 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
3973 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// 2x1 tile (single column, two rows).
3977 for(
size_t k=0UL; k<K; k+=IT::size ) {
3979 xmm1 = xmm1 + A.load(i ,k) * b1;
3980 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3982 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
3983 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Single-row tiles: 1x4.
3988 for( ; (j+4UL) <= N; j+=4UL ) {
3990 for(
size_t k=0UL; k<K; k+=IT::size ) {
3992 xmm1 = xmm1 + a1 * B.load(k,j );
3993 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3994 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3995 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3997 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
3998 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
3999 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
4000 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
// Single-row tiles: 1x2.
4002 for( ; (j+2UL) <= N; j+=2UL ) {
4004 for(
size_t k=0UL; k<K; k+=IT::size ) {
4006 xmm1 = xmm1 + a1 * B.load(k,j );
4007 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
4009 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
4010 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Single element (1x1).
4014 for(
size_t k=0UL; k<K; k+=IT::size ) {
4015 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
4017 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Vectorized default kernel for C -= scalar * A * B with a DenseMatrix<MT3,true> target
// (presumably the column-major case -- TODO confirm); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
// Tiles are taller than wide here (4x2, 4x1, 2x2, 2x1, 1x2, 1x1). Each tile accumulates
// intrinsic partial products over k in steps of IT::size, reduces them with sum() and
// subtracts the scaled result from C.
// NOTE(review): no scalar remainder loop for k is visible; this looks like it depends on
// padded operand storage -- confirm. The declarations of i, j, a*, b* and some
// accumulators are on lines not shown in this excerpt.
4037 template<
typename MT3
4041 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4042 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4044 typedef IntrinsicTrait<ElementType> IT;
4046 const size_t M( A.rows() );
4047 const size_t N( B.columns() );
4048 const size_t K( A.columns() );
// Four rows of C per iteration.
4052 for( ; (i+4UL) <= M; i+=4UL ) {
// 4x2 tile: eight accumulators.
4054 for( ; (j+2UL) <= N; j+=2UL ) {
4055 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4056 for(
size_t k=0UL; k<K; k+=IT::size ) {
4063 xmm1 = xmm1 + a1 * b1;
4064 xmm2 = xmm2 + a1 * b2;
4065 xmm3 = xmm3 + a2 * b1;
4066 xmm4 = xmm4 + a2 * b2;
4067 xmm5 = xmm5 + a3 * b1;
4068 xmm6 = xmm6 + a3 * b2;
4069 xmm7 = xmm7 + a4 * b1;
4070 xmm8 = xmm8 + a4 * b2;
4072 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
4073 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
4074 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
4075 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
4076 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
4077 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
4078 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
4079 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// 4x1 tile (single column, four rows).
4083 for(
size_t k=0UL; k<K; k+=IT::size ) {
4085 xmm1 = xmm1 + A.load(i ,k) * b1;
4086 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
4087 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
4088 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
4090 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
4091 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
4092 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
4093 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
// Two rows of C per iteration.
4096 for( ; (i+2UL) <= M; i+=2UL ) {
// 2x2 tile.
4098 for( ; (j+2UL) <= N; j+=2UL ) {
4100 for(
size_t k=0UL; k<K; k+=IT::size ) {
4105 xmm1 = xmm1 + a1 * b1;
4106 xmm2 = xmm2 + a1 * b2;
4107 xmm3 = xmm3 + a2 * b1;
4108 xmm4 = xmm4 + a2 * b2;
4110 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
4111 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
4112 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
4113 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// 2x1 tile.
4117 for(
size_t k=0UL; k<K; k+=IT::size ) {
4119 xmm1 = xmm1 + A.load(i ,k) * b1;
4120 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
4122 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
4123 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Single-row tiles: 1x2.
4128 for( ; (j+2UL) <= N; j+=2UL ) {
4130 for(
size_t k=0UL; k<K; k+=IT::size ) {
4132 xmm1 = xmm1 + a1 * B.load(k,j );
4133 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
4135 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
4136 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Single element (1x1).
4140 for(
size_t k=0UL; k<K; k+=IT::size ) {
4141 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
4143 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Fallback member of the BLAS sub-assign kernel family: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is true and simply forwards to the default
// (non-BLAS) subtraction assignment kernel.
4163 template<
typename MT3
4167 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4168 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4170 selectDefaultSubAssignKernel( C, A, B, scalar );
// Single-precision BLAS subtraction assignment kernel (cblas_sgemm), enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true. The negated scalar is passed as
// alpha and beta is 1, so the call is meant to realise C -= scalar * A * B.
4189 template<
typename MT3
4193 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4194 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4196 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
4202 const int M ( numeric_cast<int>( A.rows() ) );
4203 const int N ( numeric_cast<int>( B.columns() ) );
4204 const int K ( numeric_cast<int>( A.columns() ) );
4205 const int lda( numeric_cast<int>( A.spacing() ) );
4206 const int ldb( numeric_cast<int>( B.spacing() ) );
4207 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
4209 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4210 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4211 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4212 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double-precision BLAS subtraction assignment kernel (cblas_dgemm), enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true. The negated scalar is passed as
// alpha and beta is 1, so the call is meant to realise C -= scalar * A * B.
4232 template<
typename MT3
4236 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4237 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4239 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
4245 const int M ( numeric_cast<int>( A.rows() ) );
4246 const int N ( numeric_cast<int>( B.columns() ) );
4247 const int K ( numeric_cast<int>( A.columns() ) );
4248 const int lda( numeric_cast<int>( A.spacing() ) );
4249 const int ldb( numeric_cast<int>( B.spacing() ) );
4250 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
4252 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4253 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4254 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4255 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single-precision complex BLAS subtraction assignment kernel (cblas_cgemm), enabled
// when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true. alpha is built from the
// negated scalar and beta is (1,0), so the call is meant to realise C -= scalar * A * B.
4275 template<
typename MT3
4279 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4280 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4282 using boost::numeric_cast;
// Sizes and leading dimensions converted to int via boost::numeric_cast.
4291 const int M ( numeric_cast<int>( A.rows() ) );
4292 const int N ( numeric_cast<int>( B.columns() ) );
4293 const int K ( numeric_cast<int>( A.columns() ) );
4294 const int lda( numeric_cast<int>( A.spacing() ) );
4295 const int ldb( numeric_cast<int>( B.spacing() ) );
4296 const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex gemm takes alpha and beta by address.
4297 const complex<float> alpha( -scalar );
4298 const complex<float> beta ( 1.0F, 0.0F );
// Row-major C: A untransposed, B transposed; otherwise the flags are swapped.
4300 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4301 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4302 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4303 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
/*!\brief BLAS-based subtraction assignment kernel using the double precision complex GEMM routine.
//
// \param C The target matrix, updated in place.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to the matrix product.
//
// The overload is selected via EnableIf only if UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is
// satisfied. It forwards to cblas_zgemm() with alpha = -scalar and beta = (1,0), i.e. the scaled
// product of A and B is subtracted from the current content of C.
// NOTE(review): parts of the original function (remaining template parameters, braces and
// any checks between the numbered lines) are not contained in this listing.
*/
4323 template<
typename MT3
4327 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4328 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4330 using boost::numeric_cast;
// The CBLAS interface takes 'int' arguments; all dimensions and the leading dimensions
// (taken from spacing()) are therefore converted via numeric_cast.
4339 const int M ( numeric_cast<int>( A.rows() ) );
4340 const int N ( numeric_cast<int>( B.columns() ) );
4341 const int K ( numeric_cast<int>( A.columns() ) );
4342 const int lda( numeric_cast<int>( A.spacing() ) );
4343 const int ldb( numeric_cast<int>( B.spacing() ) );
4344 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are stored in named objects since they are handed to the
// BLAS routine by address; the negated scalar turns the GEMM update into a subtraction.
4345 const complex<double> alpha( -scalar );
4346 const complex<double> beta ( 1.0, 0.0 );
// A row-major target C selects the row-major CBLAS layout with B flagged as transposed;
// otherwise the column-major layout is used and A is flagged as transposed instead.
4348 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4349 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4350 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4351 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
/*!\brief SMP assignment of a scaled matrix multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// This overload only participates in overload resolution if UseSMPAssign<MT> is satisfied.
// NOTE(review): the statements between the numbered lines are not contained in this listing,
// so the actions taken inside the branches below cannot be confirmed from here.
*/
4382 template<
typename MT
4384 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4385 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The two operands of the wrapped matrix product are fetched from the scaled expression.
4392 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4393 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target matrix does not contain any elements.
4395 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero. In contrast to the compound
// assignments this case is tested separately (presumably to reset the target - TODO confirm).
4398 else if( left.columns() == 0UL ) {
/*!\brief SMP assignment of a scaled matrix multiplication expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// This overload only participates in overload resolution if UseSMPAssign<MT> is satisfied.
// NOTE(review): the statements following the creation of the temporary are not contained in
// this listing; presumably the temporary is assigned to lhs afterwards - TODO confirm.
*/
4431 template<
typename MT
4433 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4434 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The type of the temporary depends on the storage order flag SO of the target:
// OppositeType is chosen if SO is set, ResultType otherwise.
4438 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// The expression is evaluated into a temporary of the selected type.
4450 const TmpType tmp( rhs );
/*!\brief SMP addition assignment of a scaled matrix multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// This overload only participates in overload resolution if UseSMPAssign<MT> is satisfied.
// NOTE(review): the statements between the numbered lines are not contained in this listing,
// so the action taken inside the branch below cannot be confirmed from here.
*/
4469 template<
typename MT
4471 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4472 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The two operands of the wrapped matrix product are fetched from the scaled expression.
4479 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4480 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special cases: an empty target matrix or a zero inner dimension of the product
// (presumably nothing has to be added in these cases - TODO confirm).
4482 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
/*!\brief SMP subtraction assignment of a scaled matrix multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// This overload only participates in overload resolution if UseSMPAssign<MT> is satisfied.
// NOTE(review): the statements between the numbered lines are not contained in this listing,
// so the action taken inside the branch below cannot be confirmed from here.
*/
4518 template<
typename MT
4520 friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4521 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The two operands of the wrapped matrix product are fetched from the scaled expression.
4528 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4529 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special cases: an empty target matrix or a zero inner dimension of the product
// (presumably nothing has to be subtracted in these cases - TODO confirm).
4531 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
/*!\brief Free function creating a DMatTDMatMultExpr<T1,T2> expression object.
//
// \return The multiplication expression object.
// \exception std::invalid_argument Matrix sizes do not match.
//
// NOTE(review): the function name, its parameters and the condition guarding the exception are
// not contained in this listing; presumably this is the multiplication operator and the
// exception is raised on mismatching inner dimensions - TODO confirm.
*/
4613 template<
typename T1
4615 inline const DMatTDMatMultExpr<T1,T2>
4621 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait for the multiplication of a matrix product (operands MT1 and MT2) with a
// vector VT. If MT1 is a row-major dense matrix, MT2 is a column-major dense matrix and VT is
// a dense column vector, the nested Type is the result of multiplying MT1 with the result of
// the MT2/VT multiplication, i.e. the vector is combined with the right-hand matrix first.
// Otherwise Type is INVALID_TYPE.
// NOTE(review): the header of the specialized struct is not contained in this listing.
4638 template<
typename MT1,
typename MT2,
typename VT >
4643 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4644 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4645 IsDenseVector<VT>::value && IsColumnVector<VT>::value
4646 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
4647 , INVALID_TYPE >::Type Type;
// Expression trait for the multiplication of a matrix product (operands MT1 and MT2) with a
// vector VT. If MT1 is a row-major dense matrix, MT2 is a column-major dense matrix and VT is
// a sparse column vector, the nested Type is the result of multiplying MT1 with the result of
// the MT2/VT multiplication, i.e. the vector is combined with the right-hand matrix first.
// Otherwise Type is INVALID_TYPE.
// NOTE(review): the header of the specialized struct is not contained in this listing.
4656 template<
typename MT1,
typename MT2,
typename VT >
4661 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4662 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4663 IsSparseVector<VT>::value && IsColumnVector<VT>::value
4664 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
4665 , INVALID_TYPE >::Type Type;
// Expression trait for the multiplication of a vector VT with a matrix product (operands MT1
// and MT2). If VT is a dense row vector, MT1 is a row-major dense matrix and MT2 is a
// column-major dense matrix, the nested Type is the result of multiplying the result of the
// VT/MT1 multiplication with MT2, i.e. the vector is combined with the left-hand matrix first.
// Otherwise Type is INVALID_TYPE.
// NOTE(review): the header of the specialized struct is not contained in this listing.
4674 template<
typename VT,
typename MT1,
typename MT2 >
4679 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
4680 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4681 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4682 ,
typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4683 , INVALID_TYPE >::Type Type;
// Expression trait for the multiplication of a vector VT with a matrix product (operands MT1
// and MT2). If VT is a sparse row vector, MT1 is a row-major dense matrix and MT2 is a
// column-major dense matrix, the nested Type is the result of multiplying the result of the
// VT/MT1 multiplication with MT2, i.e. the vector is combined with the left-hand matrix first.
// Otherwise Type is INVALID_TYPE.
// NOTE(review): the header of the specialized struct is not contained in this listing.
4692 template<
typename VT,
typename MT1,
typename MT2 >
4697 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
4698 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4699 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4700 ,
typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4701 , INVALID_TYPE >::Type Type;
// Expression trait yielding the multiplication type of the submatrix types of both operands:
// SubmatrixExprTrait is applied to const MT1 and const MT2 with the same flag AF, and the two
// resulting types are combined via MultExprTrait.
// NOTE(review): the header of the specialized struct is not contained in this listing;
// presumably this describes a submatrix of the matrix product - TODO confirm.
4710 template<
typename MT1,
typename MT2,
bool AF >
4715 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
4716 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Expression trait yielding the multiplication type of the row type of the left-hand operand
// (RowExprTrait applied to const MT1) and the unmodified right-hand operand MT2.
// NOTE(review): the header of the specialized struct is not contained in this listing;
// presumably this describes a row of the matrix product - TODO confirm.
4725 template<
typename MT1,
typename MT2 >
4730 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Expression trait yielding the multiplication type of the unmodified left-hand operand MT1
// and the column type of the right-hand operand (ColumnExprTrait applied to const MT2).
// NOTE(review): the header of the specialized struct is not contained in this listing;
// presumably this describes a column of the matrix product - TODO confirm.
4739 template<
typename MT1,
typename MT2 >
4744 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:404
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:247
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:255
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4599
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:4329
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:249
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:152
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:300
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:199
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:394
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2408
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:251
DMatTDMatMultExpr< MT1, MT2 > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:245
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:249
Header file for the TDVecSMatMultExprTrait class template.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:690
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:123
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:125
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:285
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:122
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:340
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:252
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
const size_t SMP_DMATTDMATMULT_THRESHOLD
SMP row-major dense matrix/column-major dense matrix multiplication threshold. This threshold specifie...
Definition: Thresholds.h:857
Header file for the TDMatSVecMultExprTrait class template.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:384
Header file for the DenseMatrix base class.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:122
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:271
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:126
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2406
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the serial shim.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatTDMatMultExpr.h:250
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:330
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:92
Header file for the IsNumeric type trait.
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:246
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:301
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:331
const size_t DMATTDMATMULT_THRESHOLD
Row-major dense matrix/column-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:142
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:372
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:124
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:350
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:250
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:251
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2403
Header file for basic type definitions.
Header file for the IsComplex type trait.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:413
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:115
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:121
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:414
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:360
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:248
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:261
Constraint on the data type.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:264
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:258
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.