22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// Expression class TDMatDMatMultExpr. As visible here it derives publicly from
// DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true > and privately from MatMatMultExpr and
// Computation.
// NOTE(review): the declaration of MT2 is not visible in this excerpt; presumably MT1 and MT2
// are the types of the left- and right-hand side operands of the product -- confirm.
97 template<
typename MT1
99 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
100 ,
private MatMatMultExpr
101 ,
private Computation
// Shorthands for the ResultType and CompositeType nested in each of the two operand types.
105 typedef typename MT1::ResultType
RT1;
106 typedef typename MT2::ResultType
RT2;
107 typedef typename MT1::CompositeType
CT1;
108 typedef typename MT2::CompositeType
CT2;
// Compile-time trait over a triple of types (T1, T2, T3). Only its header is visible in this
// excerpt; further down it is used with EnableIf to enable the kernels calling cblas_sgemm.
116 template<
typename T1,
typename T2,
typename T3 >
117 struct UseSinglePrecisionKernel {
// Compile-time trait over a triple of types (T1, T2, T3). Only its header is visible in this
// excerpt; further down it is used with EnableIf to enable the kernels calling cblas_dgemm.
130 template<
typename T1,
typename T2,
typename T3 >
131 struct UseDoublePrecisionKernel {
// Compile-time trait: value is nonzero only if the ElementType of all three types T1, T2 and T3
// is exactly complex<float>. Used further down to enable the kernels calling cblas_cgemm.
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseSinglePrecisionComplexKernel {
147 typedef complex<float> Type;
148 enum { value = IsSame<typename T1::ElementType,Type>::value &&
149 IsSame<typename T2::ElementType,Type>::value &&
150 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait: value is nonzero only if the ElementType of all three types T1, T2 and T3
// is exactly complex<double>. Used further down to enable the kernels calling cblas_zgemm.
161 template<
typename T1,
typename T2,
typename T3 >
162 struct UseDoublePrecisionComplexKernel {
163 typedef complex<double> Type;
164 enum { value = IsSame<typename T1::ElementType,Type>::value &&
165 IsSame<typename T2::ElementType,Type>::value &&
166 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait selecting the non-BLAS fallback: value is nonzero when BLAZE_BLAS_MODE
// evaluates to false, or when none of the four BLAS kernel traits (single/double precision,
// real/complex) matches the type triple.
176 template<
typename T1,
typename T2,
typename T3 >
177 struct UseDefaultKernel {
178 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
179 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
180 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
181 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time trait gating the vectorized default kernels: value is nonzero only if all three
// types report vectorizable, T2 and T3 share the ElementType of T1, and IntrinsicTrait of that
// element type reports both addition and multiplication.
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseVectorizedDefaultKernel {
193 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
194 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
195 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
196 IntrinsicTrait<typename T1::ElementType>::addition &&
197 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flag of the expression class itself: vectorizable is fixed to 0.
// NOTE(review): presumably this tells consumers not to read the expression via intrinsic loads -- confirm.
228 enum { vectorizable = 0 };
// Fragment of the element-wise evaluation of the product (the enclosing signature, the
// declaration/initialization of tmp and parts of the loop body are not visible in this excerpt).
// The accumulation only runs when the inner dimension lhs_.columns() is nonzero.
258 if(
lhs_.columns() != 0UL ) {
// end = ((columns-1) with its lowest bit cleared) + 1, i.e. always an odd index.
259 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
// k starts at 1 and advances in steps of two; the visible statement adds the (k+1) term.
261 for(
size_t k=1UL; k<end; k+=2UL ) {
263 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// Taken when end is still a valid column index of lhs_ (one trailing term remains).
265 if( end <
lhs_.columns() ) {
// Returns the column count of the right-hand side operand (enclosing signature not visible here).
293 return rhs_.columns();
// Alias query (function name and parameter list are not visible in this excerpt): returns true
// if either operand reports being aliased with the given alias.
323 template<
typename T >
325 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Second alias query with the same visible body (function name and parameter list are not
// visible in this excerpt): true if either operand reports being aliased with the given alias.
335 template<
typename T >
337 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Fragment of the assignment of the product expression rhs to the dense target lhs (signature
// and several statements are not visible in this excerpt).
356 template<
typename MT
// Special case: the target has no rows or no columns (the branch body is not visible).
365 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero (the branch body is not visible).
368 else if( rhs.lhs_.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel; the condition choosing between the
// two calls is on a line not visible here. A and B are declared on lines not visible here.
384 TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
386 TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Scalar default assignment kernel: C = A * B, with M = A.rows(), N = B.columns() and
// K = A.columns(). Parts of the template header and the return type are not visible here.
// For each row i, C(i,j) is first overwritten with the k = 0 term and the remaining terms
// k = 1..K-1 are then accumulated. A(i,0UL) and B(0UL,j) are read unconditionally, so the kernel
// relies on K >= 1 (the calling assign fragment shows a separate branch for a zero inner dimension).
405 template<
typename MT3
409 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
411 const size_t M( A.rows() );
412 const size_t N( B.columns() );
413 const size_t K( A.columns() );
415 for(
size_t i=0UL; i<M; ++i ) {
// Initialize row i of C with the first term of each dot product.
416 for(
size_t j=0UL; j<N; ++j ) {
417 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulate the remaining terms, walking B row by row.
419 for(
size_t k=1UL; k<K; ++k ) {
420 for(
size_t j=0UL; j<N; ++j ) {
421 C(i,j) += A(i,k) * B(k,j);
// Vectorized default assignment kernel for a row-major target (DenseMatrix<MT3,false>), enabled
// through UseVectorizedDefaultKernel. N is taken from B.spacing(), not B.columns(), so the j
// loops run over the spacing of B (NOTE(review): presumably the padded row length -- confirm).
// Column blocks of 8, 4 and 2 intrinsic vectors (IT::size elements each) are handled first,
// followed by code working on a single vector; in the narrower blocks rows are processed in
// pairs with a separate path for a leftover row. The declarations of i, j, a1, a2 and b1..b4
// are on lines not visible in this excerpt.
443 template<
typename MT3
446 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
447 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
449 typedef IntrinsicTrait<ElementType> IT;
451 const size_t M( A.rows() );
452 const size_t N( B.spacing() );
453 const size_t K( A.columns() );
// Blocks of eight vectors per row.
457 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
458 for(
size_t i=0UL; i<M; ++i ) {
// Accumulators are declared without an explicit initializer.
// NOTE(review): presumably IntrinsicType default-constructs to zero -- confirm.
459 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
460 for(
size_t k=0UL; k<K; ++k ) {
462 xmm1 = xmm1 + a1 * B.get(k,j );
463 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
464 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
465 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
466 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
467 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
468 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
469 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
471 store( &(~C)(i,j ), xmm1 );
472 store( &(~C)(i,j+IT::size ), xmm2 );
473 store( &(~C)(i,j+IT::size*2UL), xmm3 );
474 store( &(~C)(i,j+IT::size*3UL), xmm4 );
475 store( &(~C)(i,j+IT::size*4UL), xmm5 );
476 store( &(~C)(i,j+IT::size*5UL), xmm6 );
477 store( &(~C)(i,j+IT::size*6UL), xmm7 );
478 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Blocks of four vectors; two rows at a time (xmm1-4 for row i, xmm5-8 for row i+1).
481 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
483 for( ; (i+2UL) <= M; i+=2UL ) {
484 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
485 for(
size_t k=0UL; k<K; ++k ) {
492 xmm1 = xmm1 + a1 * b1;
493 xmm2 = xmm2 + a1 * b2;
494 xmm3 = xmm3 + a1 * b3;
495 xmm4 = xmm4 + a1 * b4;
496 xmm5 = xmm5 + a2 * b1;
497 xmm6 = xmm6 + a2 * b2;
498 xmm7 = xmm7 + a2 * b3;
499 xmm8 = xmm8 + a2 * b4;
501 store( &(~C)(i ,j ), xmm1 );
502 store( &(~C)(i ,j+IT::size ), xmm2 );
503 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
504 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
505 store( &(~C)(i+1UL,j ), xmm5 );
506 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
507 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
508 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row path of the four-vector block (its guard is on a line not visible here).
512 for(
size_t k=0UL; k<K; ++k ) {
514 xmm1 = xmm1 + a1 * B.get(k,j );
515 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
516 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
517 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
519 store( &(~C)(i,j ), xmm1 );
520 store( &(~C)(i,j+IT::size ), xmm2 );
521 store( &(~C)(i,j+IT::size*2UL), xmm3 );
522 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Blocks of two vectors; two rows at a time, then a single-row path.
525 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
527 for( ; (i+2UL) <= M; i+=2UL ) {
529 for(
size_t k=0UL; k<K; ++k ) {
534 xmm1 = xmm1 + a1 * b1;
535 xmm2 = xmm2 + a1 * b2;
536 xmm3 = xmm3 + a2 * b1;
537 xmm4 = xmm4 + a2 * b2;
539 store( &(~C)(i ,j ), xmm1 );
540 store( &(~C)(i ,j+IT::size), xmm2 );
541 store( &(~C)(i+1UL,j ), xmm3 );
542 store( &(~C)(i+1UL,j+IT::size), xmm4 );
546 for(
size_t k=0UL; k<K; ++k ) {
548 xmm1 = xmm1 + a1 * B.get(k,j );
549 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
551 store( &(~C)(i,j ), xmm1 );
552 store( &(~C)(i,j+IT::size), xmm2 );
// Single-vector code: two rows at a time, then a single-row path (the enclosing condition on j
// is on a line not visible here). Elements of A are broadcast with set().
557 for( ; (i+2UL) <= M; i+=2UL ) {
559 for(
size_t k=0UL; k<K; ++k ) {
561 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
562 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
564 store( &(~C)(i ,j), xmm1 );
565 store( &(~C)(i+1UL,j), xmm2 );
569 for(
size_t k=0UL; k<K; ++k ) {
570 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
572 store( &(~C)(i,j), xmm1 );
// Vectorized default assignment kernel for a column-major target (DenseMatrix<MT3,true>),
// enabled through UseVectorizedDefaultKernel. Mirror image of the row-major kernel: here M is
// taken from A.spacing() (NOTE(review): presumably the padded column length -- confirm), vectors
// are read from A via A.get() and elements of B are the scalar factors. Row blocks of 8, 4 and 2
// intrinsic vectors are handled first, followed by code working on a single vector; in the
// narrower blocks columns are processed in pairs with a separate path for a leftover column.
// The declarations of i, j, a1..a4 and b1, b2 are on lines not visible in this excerpt.
593 template<
typename MT3
596 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
597 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
599 typedef IntrinsicTrait<ElementType> IT;
601 const size_t M( A.spacing() );
602 const size_t N( B.columns() );
603 const size_t K( A.columns() );
// Blocks of eight vectors per column.
607 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
608 for(
size_t j=0UL; j<N; ++j ) {
// Accumulators are declared without an explicit initializer.
// NOTE(review): presumably IntrinsicType default-constructs to zero -- confirm.
609 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
610 for(
size_t k=0UL; k<K; ++k ) {
612 xmm1 = xmm1 + A.get(i ,k) * b1;
613 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
614 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
615 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
616 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
617 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
618 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
619 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
621 store( &(~C)(i ,j), xmm1 );
622 store( &(~C)(i+IT::size ,j), xmm2 );
623 store( &(~C)(i+IT::size*2UL,j), xmm3 );
624 store( &(~C)(i+IT::size*3UL,j), xmm4 );
625 store( &(~C)(i+IT::size*4UL,j), xmm5 );
626 store( &(~C)(i+IT::size*5UL,j), xmm6 );
627 store( &(~C)(i+IT::size*6UL,j), xmm7 );
628 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Blocks of four vectors; two columns at a time (xmm1-4 for column j, xmm5-8 for column j+1).
631 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
633 for( ; (j+2UL) <= N; j+=2UL ) {
634 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
635 for(
size_t k=0UL; k<K; ++k ) {
642 xmm1 = xmm1 + a1 * b1;
643 xmm2 = xmm2 + a2 * b1;
644 xmm3 = xmm3 + a3 * b1;
645 xmm4 = xmm4 + a4 * b1;
646 xmm5 = xmm5 + a1 * b2;
647 xmm6 = xmm6 + a2 * b2;
648 xmm7 = xmm7 + a3 * b2;
649 xmm8 = xmm8 + a4 * b2;
651 store( &(~C)(i ,j ), xmm1 );
652 store( &(~C)(i+IT::size ,j ), xmm2 );
653 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
654 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
655 store( &(~C)(i ,j+1UL), xmm5 );
656 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
657 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
658 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single-column path of the four-vector block (its guard is on a line not visible here).
662 for(
size_t k=0UL; k<K; ++k ) {
664 xmm1 = xmm1 + A.get(i ,k) * b1;
665 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
666 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
667 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
669 store( &(~C)(i ,j), xmm1 );
670 store( &(~C)(i+IT::size ,j), xmm2 );
671 store( &(~C)(i+IT::size*2UL,j), xmm3 );
672 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Blocks of two vectors; two columns at a time, then a single-column path.
675 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
677 for( ; (j+2UL) <= N; j+=2UL ) {
679 for(
size_t k=0UL; k<K; ++k ) {
684 xmm1 = xmm1 + a1 * b1;
685 xmm2 = xmm2 + a2 * b1;
686 xmm3 = xmm3 + a1 * b2;
687 xmm4 = xmm4 + a2 * b2;
689 store( &(~C)(i ,j ), xmm1 );
690 store( &(~C)(i+IT::size,j ), xmm2 );
691 store( &(~C)(i ,j+1UL), xmm3 );
692 store( &(~C)(i+IT::size,j+1UL), xmm4 );
696 for(
size_t k=0UL; k<K; ++k ) {
698 xmm1 = xmm1 + A.get(i ,k) * b1;
699 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
701 store( &(~C)(i ,j), xmm1 );
702 store( &(~C)(i+IT::size,j), xmm2 );
// Single-vector code: two columns at a time, then a single-column path (the enclosing condition
// on i is on a line not visible here). Elements of B are broadcast with set().
707 for( ; (j+2UL) <= N; j+=2UL ) {
709 for(
size_t k=0UL; k<K; ++k ) {
711 xmm1 = xmm1 + a1 *
set( B(k,j ) );
712 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
714 store( &(~C)(i,j ), xmm1 );
715 store( &(~C)(i,j+1UL), xmm2 );
719 for(
size_t k=0UL; k<K; ++k ) {
720 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
722 store( &(~C)(i,j), xmm1 );
// BLAS assignment kernel overload enabled when UseDefaultKernel applies to the type triple:
// no BLAS routine is called, the work is forwarded to selectDefaultAssignKernel.
743 template<
typename MT3
746 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
747 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
749 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel enabled by UseSinglePrecisionKernel: evaluates the product with
// cblas_sgemm. Sizes and leading dimensions (taken from spacing()) are converted to int with
// boost::numeric_cast. When MT3 is row-major the call uses CblasRowMajor with A flagged
// CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A CblasNoTrans and B CblasTrans.
// alpha = 1.0F and beta = 0.0F are passed, i.e. per the GEMM contract C is overwritten with
// the product.
769 template<
typename MT3
772 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
773 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
775 using boost::numeric_cast;
781 const int M ( numeric_cast<int>( A.rows() ) );
782 const int N ( numeric_cast<int>( B.columns() ) );
783 const int K ( numeric_cast<int>( A.columns() ) );
784 const int lda( numeric_cast<int>( A.spacing() ) );
785 const int ldb( numeric_cast<int>( B.spacing() ) );
786 const int ldc( numeric_cast<int>( C.spacing() ) );
788 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
789 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
790 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
791 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel enabled by UseDoublePrecisionKernel: evaluates the product with
// cblas_dgemm. Sizes and leading dimensions (taken from spacing()) are converted to int with
// boost::numeric_cast. When MT3 is row-major the call uses CblasRowMajor with A flagged
// CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A CblasNoTrans and B CblasTrans.
// alpha = 1.0 and beta = 0.0 are passed, i.e. per the GEMM contract C is overwritten with the
// product.
812 template<
typename MT3
815 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
816 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
818 using boost::numeric_cast;
824 const int M ( numeric_cast<int>( A.rows() ) );
825 const int N ( numeric_cast<int>( B.columns() ) );
826 const int K ( numeric_cast<int>( A.columns() ) );
827 const int lda( numeric_cast<int>( A.spacing() ) );
828 const int ldb( numeric_cast<int>( B.spacing() ) );
829 const int ldc( numeric_cast<int>( C.spacing() ) );
831 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
832 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
833 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
834 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel enabled by UseSinglePrecisionComplexKernel: evaluates the product with
// cblas_cgemm. Sizes and leading dimensions (taken from spacing()) are converted to int with
// boost::numeric_cast. The scaling factors are passed by address: alpha = (1,0), beta = (0,0),
// i.e. per the GEMM contract C is overwritten with the product. Transposition flags follow the
// storage order of MT3 exactly as in the real-valued kernels.
855 template<
typename MT3
858 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
859 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
861 using boost::numeric_cast;
870 const int M ( numeric_cast<int>( A.rows() ) );
871 const int N ( numeric_cast<int>( B.columns() ) );
872 const int K ( numeric_cast<int>( A.columns() ) );
873 const int lda( numeric_cast<int>( A.spacing() ) );
874 const int ldb( numeric_cast<int>( B.spacing() ) );
875 const int ldc( numeric_cast<int>( C.spacing() ) );
876 const complex<float> alpha( 1.0F, 0.0F );
877 const complex<float> beta ( 0.0F, 0.0F );
879 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
880 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
881 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
882 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel enabled by UseDoublePrecisionComplexKernel: evaluates the product with
// cblas_zgemm. Sizes and leading dimensions (taken from spacing()) are converted to int with
// boost::numeric_cast. The scaling factors are passed by address: alpha = (1,0), beta = (0,0),
// i.e. per the GEMM contract C is overwritten with the product. Transposition flags follow the
// storage order of MT3 exactly as in the real-valued kernels.
903 template<
typename MT3
906 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
907 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
909 using boost::numeric_cast;
918 const int M ( numeric_cast<int>( A.rows() ) );
919 const int N ( numeric_cast<int>( B.columns() ) );
920 const int K ( numeric_cast<int>( A.columns() ) );
921 const int lda( numeric_cast<int>( A.spacing() ) );
922 const int ldb( numeric_cast<int>( B.spacing() ) );
923 const int ldc( numeric_cast<int>( C.spacing() ) );
924 const complex<double> alpha( 1.0, 0.0 );
925 const complex<double> beta ( 0.0, 0.0 );
927 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
928 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
929 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
930 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a further assignment overload (signature not visible in this excerpt) that first
// evaluates the expression into a temporary: TmpType is picked via SelectType from ResultType
// and OppositeType depending on SO, and tmp is constructed from rhs.
// NOTE(review): presumably tmp is then assigned to the target -- that statement is not visible.
948 template<
typename MT
954 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
966 const TmpType tmp( rhs );
// Fragment of the addition assignment of the product expression rhs to the dense target lhs
// (signature and several statements are not visible in this excerpt).
985 template<
typename MT
// Special case: the target is empty or the inner dimension is zero (branch body not visible).
994 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel; the condition choosing between the
// two calls is on a line not visible here. A and B are declared on lines not visible here.
1009 TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
1011 TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Scalar default addition-assignment kernel (C += A * B), used when UseVectorizedDefaultKernel
// does not apply (DisableIf). M = A.rows(), N = B.columns(), K = A.columns().
1030 template<
typename MT3
1033 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1034 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1036 const size_t M( A.rows() );
1037 const size_t N( B.columns() );
1038 const size_t K( A.columns() );
// end = N with its lowest bit cleared, i.e. the largest even number not exceeding N.
1041 const size_t end( N &
size_t(-2) );
1043 for(
size_t i=0UL; i<M; ++i ) {
1044 for(
size_t k=0UL; k<K; ++k ) {
// The j loop is unrolled by two.
1045 for(
size_t j=0UL; j<end; j+=2UL ) {
1046 C(i,j ) += A(i,k) * B(k,j );
1047 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Trailing column for odd N.
// NOTE(review): presumably guarded by a check such as end < N on a line not visible here -- confirm.
1050 C(i,end) += A(i,k) * B(k,end);
// Vectorized default addition-assignment kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled through UseVectorizedDefaultKernel. Same blocking as the vectorized assignment kernel:
// N is taken from B.spacing(); column blocks of 8, 4 and 2 intrinsic vectors are handled first,
// followed by code working on a single vector, with rows processed in pairs plus a single-row
// path in the narrower blocks. Products are added to the xmm accumulators, which are then
// stored into C. The declaration and initialization of the accumulators, as well as of i, j,
// a1, a2 and b1..b4, are on lines not visible in this excerpt.
// NOTE(review): presumably the accumulators are loaded from C before the k loop -- confirm.
1072 template<
typename MT3
1075 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1076 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1078 typedef IntrinsicTrait<ElementType> IT;
1080 const size_t M( A.rows() );
1081 const size_t N( B.spacing() );
1082 const size_t K( A.columns() );
// Blocks of eight vectors per row.
1086 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1087 for(
size_t i=0UL; i<M; ++i ) {
1096 for(
size_t k=0UL; k<K; ++k ) {
1098 xmm1 = xmm1 + a1 * B.get(k,j );
1099 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1100 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1101 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1102 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
1103 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
1104 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
1105 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
1107 store( &(~C)(i,j ), xmm1 );
1108 store( &(~C)(i,j+IT::size ), xmm2 );
1109 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1110 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1111 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1112 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1113 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1114 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Blocks of four vectors; two rows at a time (xmm1-4 for row i, xmm5-8 for row i+1).
1117 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1119 for( ; (i+2UL) <= M; i+=2UL ) {
1128 for(
size_t k=0UL; k<K; ++k ) {
1135 xmm1 = xmm1 + a1 * b1;
1136 xmm2 = xmm2 + a1 * b2;
1137 xmm3 = xmm3 + a1 * b3;
1138 xmm4 = xmm4 + a1 * b4;
1139 xmm5 = xmm5 + a2 * b1;
1140 xmm6 = xmm6 + a2 * b2;
1141 xmm7 = xmm7 + a2 * b3;
1142 xmm8 = xmm8 + a2 * b4;
1144 store( &(~C)(i ,j ), xmm1 );
1145 store( &(~C)(i ,j+IT::size ), xmm2 );
1146 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1147 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1148 store( &(~C)(i+1UL,j ), xmm5 );
1149 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1150 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1151 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row path of the four-vector block (its guard is on a line not visible here).
1158 for(
size_t k=0UL; k<K; ++k ) {
1160 xmm1 = xmm1 + a1 * B.get(k,j );
1161 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1162 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1163 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1165 store( &(~C)(i,j ), xmm1 );
1166 store( &(~C)(i,j+IT::size ), xmm2 );
1167 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1168 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Blocks of two vectors; two rows at a time, then a single-row path.
1171 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1173 for( ; (i+2UL) <= M; i+=2UL ) {
1178 for(
size_t k=0UL; k<K; ++k ) {
1183 xmm1 = xmm1 + a1 * b1;
1184 xmm2 = xmm2 + a1 * b2;
1185 xmm3 = xmm3 + a2 * b1;
1186 xmm4 = xmm4 + a2 * b2;
1188 store( &(~C)(i ,j ), xmm1 );
1189 store( &(~C)(i ,j+IT::size), xmm2 );
1190 store( &(~C)(i+1UL,j ), xmm3 );
1191 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1196 for(
size_t k=0UL; k<K; ++k ) {
1198 xmm1 = xmm1 + a1 * B.get(k,j );
1199 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1201 store( &(~C)(i,j ), xmm1 );
1202 store( &(~C)(i,j+IT::size), xmm2 );
// Single-vector code: two rows at a time, then a single-row path (the enclosing condition on j
// is on a line not visible here). Elements of A are broadcast with set().
1207 for( ; (i+2UL) <= M; i+=2UL ) {
1210 for(
size_t k=0UL; k<K; ++k ) {
1212 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1213 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1215 store( &(~C)(i ,j), xmm1 );
1216 store( &(~C)(i+1UL,j), xmm2 );
1220 for(
size_t k=0UL; k<K; ++k ) {
1221 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
1223 store( &(~C)(i,j), xmm1 );
// Vectorized default addition-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>), enabled through UseVectorizedDefaultKernel. Mirror image of the
// row-major variant: M is taken from A.spacing(), vectors are read from A via A.get() and
// elements of B are the scalar factors. Row blocks of 8, 4 and 2 intrinsic vectors are handled
// first, followed by code working on a single vector, with columns processed in pairs plus a
// single-column path in the narrower blocks. The declaration and initialization of the xmm
// accumulators, as well as of i, j, a1..a4 and b1, b2, are on lines not visible in this excerpt.
// NOTE(review): presumably the accumulators are loaded from C before the k loop -- confirm.
1244 template<
typename MT3
1247 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1248 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1250 typedef IntrinsicTrait<ElementType> IT;
1252 const size_t M( A.spacing() );
1253 const size_t N( B.columns() );
1254 const size_t K( A.columns() );
// Blocks of eight vectors per column.
1258 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1259 for(
size_t j=0UL; j<N; ++j ) {
1268 for(
size_t k=0UL; k<K; ++k ) {
1270 xmm1 = xmm1 + A.get(i ,k) * b1;
1271 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1272 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1273 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1274 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1275 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1276 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1277 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1279 store( &(~C)(i ,j), xmm1 );
1280 store( &(~C)(i+IT::size ,j), xmm2 );
1281 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1282 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1283 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1284 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1285 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1286 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Blocks of four vectors; two columns at a time (xmm1-4 for column j, xmm5-8 for column j+1).
1289 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1291 for( ; (j+2UL) <= N; j+=2UL ) {
1300 for(
size_t k=0UL; k<K; ++k ) {
1307 xmm1 = xmm1 + a1 * b1;
1308 xmm2 = xmm2 + a2 * b1;
1309 xmm3 = xmm3 + a3 * b1;
1310 xmm4 = xmm4 + a4 * b1;
1311 xmm5 = xmm5 + a1 * b2;
1312 xmm6 = xmm6 + a2 * b2;
1313 xmm7 = xmm7 + a3 * b2;
1314 xmm8 = xmm8 + a4 * b2;
1316 store( &(~C)(i ,j ), xmm1 );
1317 store( &(~C)(i+IT::size ,j ), xmm2 );
1318 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1319 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1320 store( &(~C)(i ,j+1UL), xmm5 );
1321 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1322 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1323 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single-column path of the four-vector block (its guard is on a line not visible here).
1330 for(
size_t k=0UL; k<K; ++k ) {
1332 xmm1 = xmm1 + A.get(i ,k) * b1;
1333 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1334 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1335 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1337 store( &(~C)(i ,j), xmm1 );
1338 store( &(~C)(i+IT::size ,j), xmm2 );
1339 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1340 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Blocks of two vectors; two columns at a time, then a single-column path.
1343 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1345 for( ; (j+2UL) <= N; j+=2UL ) {
1350 for(
size_t k=0UL; k<K; ++k ) {
1355 xmm1 = xmm1 + a1 * b1;
1356 xmm2 = xmm2 + a2 * b1;
1357 xmm3 = xmm3 + a1 * b2;
1358 xmm4 = xmm4 + a2 * b2;
1360 store( &(~C)(i ,j ), xmm1 );
1361 store( &(~C)(i+IT::size,j ), xmm2 );
1362 store( &(~C)(i ,j+1UL), xmm3 );
1363 store( &(~C)(i+IT::size,j+1UL), xmm4 );
1368 for(
size_t k=0UL; k<K; ++k ) {
1370 xmm1 = xmm1 + A.get(i ,k) * b1;
1371 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1373 store( &(~C)(i ,j), xmm1 );
1374 store( &(~C)(i+IT::size,j), xmm2 );
// Single-vector code: two columns at a time, then a single-column path (the enclosing condition
// on i is on a line not visible here). Elements of B are broadcast with set().
1379 for( ; (j+2UL) <= N; j+=2UL ) {
1382 for(
size_t k=0UL; k<K; ++k ) {
1384 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1385 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1387 store( &(~C)(i,j ), xmm1 );
1388 store( &(~C)(i,j+1UL), xmm2 );
1392 for(
size_t k=0UL; k<K; ++k ) {
1393 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
1395 store( &(~C)(i,j), xmm1 );
// BLAS addition-assignment kernel overload enabled when UseDefaultKernel applies to the type
// triple: no BLAS routine is called, the work is forwarded to selectDefaultAddAssignKernel.
1416 template<
typename MT3
1419 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1420 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1422 selectDefaultAddAssignKernel( C, A, B );
// BLAS addition-assignment kernel enabled by UseSinglePrecisionKernel: calls cblas_sgemm with
// alpha = 1.0F and beta = 1.0F, i.e. per the GEMM contract the product is added to the existing
// contents of C. Sizes and leading dimensions (taken from spacing()) are converted to int with
// boost::numeric_cast. When MT3 is row-major the call uses CblasRowMajor with A flagged
// CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A CblasNoTrans and B CblasTrans.
1442 template<
typename MT3
1445 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1446 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1448 using boost::numeric_cast;
1454 const int M ( numeric_cast<int>( A.rows() ) );
1455 const int N ( numeric_cast<int>( B.columns() ) );
1456 const int K ( numeric_cast<int>( A.columns() ) );
1457 const int lda( numeric_cast<int>( A.spacing() ) );
1458 const int ldb( numeric_cast<int>( B.spacing() ) );
1459 const int ldc( numeric_cast<int>( C.spacing() ) );
1461 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1462 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1463 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1464 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS addition-assignment kernel enabled by UseDoublePrecisionKernel: calls cblas_dgemm with
// alpha = 1.0 and beta = 1.0, i.e. per the GEMM contract the product is added to the existing
// contents of C. Sizes and leading dimensions (taken from spacing()) are converted to int with
// boost::numeric_cast. When MT3 is row-major the call uses CblasRowMajor with A flagged
// CblasTrans and B CblasNoTrans; otherwise CblasColMajor with A CblasNoTrans and B CblasTrans.
1485 template<
typename MT3
1488 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1489 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1491 using boost::numeric_cast;
1497 const int M ( numeric_cast<int>( A.rows() ) );
1498 const int N ( numeric_cast<int>( B.columns() ) );
1499 const int K ( numeric_cast<int>( A.columns() ) );
1500 const int lda( numeric_cast<int>( A.spacing() ) );
1501 const int ldb( numeric_cast<int>( B.spacing() ) );
1502 const int ldc( numeric_cast<int>( C.spacing() ) );
1504 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1505 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1506 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1507 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS addition-assignment kernel enabled by UseSinglePrecisionComplexKernel: calls cblas_cgemm
// with alpha = (1,0) and beta = (1,0) passed by address, i.e. per the GEMM contract the product
// is added to the existing contents of C. Sizes and leading dimensions (taken from spacing())
// are converted to int with boost::numeric_cast. Transposition flags follow the storage order
// of MT3 exactly as in the real-valued kernels.
1528 template<
typename MT3
1531 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1532 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1534 using boost::numeric_cast;
1543 const int M ( numeric_cast<int>( A.rows() ) );
1544 const int N ( numeric_cast<int>( B.columns() ) );
1545 const int K ( numeric_cast<int>( A.columns() ) );
1546 const int lda( numeric_cast<int>( A.spacing() ) );
1547 const int ldb( numeric_cast<int>( B.spacing() ) );
1548 const int ldc( numeric_cast<int>( C.spacing() ) );
1549 const complex<float> alpha( 1.0F, 0.0F );
1550 const complex<float> beta ( 1.0F, 0.0F );
1552 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1553 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1554 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1555 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS addition-assignment kernel enabled by UseDoublePrecisionComplexKernel: calls cblas_zgemm
// with alpha = (1,0) and beta = (1,0) passed by address, i.e. per the GEMM contract the product
// is added to the existing contents of C. Sizes and leading dimensions (taken from spacing())
// are converted to int with boost::numeric_cast. Transposition flags follow the storage order
// of MT3 exactly as in the real-valued kernels.
1576 template<
typename MT3
1579 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1580 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1582 using boost::numeric_cast;
1591 const int M ( numeric_cast<int>( A.rows() ) );
1592 const int N ( numeric_cast<int>( B.columns() ) );
1593 const int K ( numeric_cast<int>( A.columns() ) );
1594 const int lda( numeric_cast<int>( A.spacing() ) );
1595 const int ldb( numeric_cast<int>( B.spacing() ) );
1596 const int ldc( numeric_cast<int>( C.spacing() ) );
1597 const complex<double> alpha( 1.0, 0.0 );
1598 const complex<double> beta ( 1.0, 0.0 );
1600 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1601 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1602 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1603 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of the subtraction assignment of the product expression rhs to the dense target lhs
// (signature and several statements are not visible in this excerpt).
1626 template<
typename MT
// Special case: the target is empty or the inner dimension is zero (branch body not visible).
1635 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel; the condition choosing between the
// two calls is on a line not visible here. A and B are declared on lines not visible here.
1650 TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1652 TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Scalar default subtraction-assignment kernel (C -= A * B), used when
// UseVectorizedDefaultKernel does not apply (DisableIf). M = A.rows(), N = B.columns(),
// K = A.columns().
1671 template<
typename MT3
1674 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1675 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1677 const size_t M( A.rows() );
1678 const size_t N( B.columns() );
1679 const size_t K( A.columns() );
// end = N with its lowest bit cleared, i.e. the largest even number not exceeding N.
1682 const size_t end( N &
size_t(-2) );
1684 for(
size_t i=0UL; i<M; ++i ) {
1685 for(
size_t k=0UL; k<K; ++k ) {
// The j loop is unrolled by two.
1686 for(
size_t j=0UL; j<end; j+=2UL ) {
1687 C(i,j ) -= A(i,k) * B(k,j );
1688 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Trailing column for odd N.
// NOTE(review): presumably guarded by a check such as end < N on a line not visible here -- confirm.
1691 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled through UseVectorizedDefaultKernel. Same blocking as the
// vectorized addition-assignment kernel, but the products are subtracted from the xmm
// accumulators before they are stored into C. N is taken from B.spacing(). The declaration and
// initialization of the accumulators, as well as of i, j, a1, a2 and b1..b4, are on lines not
// visible in this excerpt.
// NOTE(review): presumably the accumulators are loaded from C before the k loop -- confirm.
1713 template<
typename MT3
1716 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1717 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1719 typedef IntrinsicTrait<ElementType> IT;
1721 const size_t M( A.rows() );
1722 const size_t N( B.spacing() );
1723 const size_t K( A.columns() );
// Blocks of eight vectors per row.
1727 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1728 for(
size_t i=0UL; i<M; ++i ) {
1737 for(
size_t k=0UL; k<K; ++k ) {
1739 xmm1 = xmm1 - a1 * B.get(k,j );
1740 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1741 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1742 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1743 xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1744 xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1745 xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1746 xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1748 store( &(~C)(i,j ), xmm1 );
1749 store( &(~C)(i,j+IT::size ), xmm2 );
1750 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1751 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1752 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1753 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1754 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1755 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Blocks of four vectors; two rows at a time (xmm1-4 for row i, xmm5-8 for row i+1).
1758 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1760 for( ; (i+2UL) <= M; i+=2UL ) {
1769 for(
size_t k=0UL; k<K; ++k ) {
1776 xmm1 = xmm1 - a1 * b1;
1777 xmm2 = xmm2 - a1 * b2;
1778 xmm3 = xmm3 - a1 * b3;
1779 xmm4 = xmm4 - a1 * b4;
1780 xmm5 = xmm5 - a2 * b1;
1781 xmm6 = xmm6 - a2 * b2;
1782 xmm7 = xmm7 - a2 * b3;
1783 xmm8 = xmm8 - a2 * b4;
1785 store( &(~C)(i ,j ), xmm1 );
1786 store( &(~C)(i ,j+IT::size ), xmm2 );
1787 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1788 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1789 store( &(~C)(i+1UL,j ), xmm5 );
1790 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1791 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1792 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
// Single-row path of the four-vector block (its guard is on a line not visible here).
1799 for(
size_t k=0UL; k<K; ++k ) {
1801 xmm1 = xmm1 - a1 * B.get(k,j );
1802 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1803 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1804 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1806 store( &(~C)(i,j ), xmm1 );
1807 store( &(~C)(i,j+IT::size ), xmm2 );
1808 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1809 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Blocks of two vectors; two rows at a time, then a single-row path.
1812 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1814 for( ; (i+2UL) <= M; i+=2UL ) {
1819 for(
size_t k=0UL; k<K; ++k ) {
1824 xmm1 = xmm1 - a1 * b1;
1825 xmm2 = xmm2 - a1 * b2;
1826 xmm3 = xmm3 - a2 * b1;
1827 xmm4 = xmm4 - a2 * b2;
1829 store( &(~C)(i ,j ), xmm1 );
1830 store( &(~C)(i ,j+IT::size), xmm2 );
1831 store( &(~C)(i+1UL,j ), xmm3 );
1832 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1837 for(
size_t k=0UL; k<K; ++k ) {
1839 xmm1 = xmm1 - a1 * B.get(k,j );
1840 xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1842 store( &(~C)(i,j ), xmm1 );
1843 store( &(~C)(i,j+IT::size), xmm2 );
// Single-vector code: two rows at a time, then a single-row path (the enclosing condition on j
// is on a line not visible here). Elements of A are broadcast with set().
1848 for( ; (i+2UL) <= M; i+=2UL ) {
1851 for(
size_t k=0UL; k<K; ++k ) {
1853 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1854 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1856 store( &(~C)(i ,j), xmm1 );
1857 store( &(~C)(i+1UL,j), xmm2 );
1861 for(
size_t k=0UL; k<K; ++k ) {
1862 xmm1 = xmm1 -
set( A(i,k) ) * B.get(k,j);
1864 store( &(~C)(i,j), xmm1 );
// Vectorized default subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>), enabled through UseVectorizedDefaultKernel. Mirror image of the
// row-major variant: M is taken from A.spacing(), vectors are read from A via A.get() and
// elements of B are the scalar factors; products are subtracted from the xmm accumulators
// before they are stored into C. The declaration and initialization of the accumulators, as
// well as of i, j, a1..a4 and b1, b2, are on lines not visible in this excerpt.
// NOTE(review): presumably the accumulators are loaded from C before the k loop -- confirm.
1885 template<
typename MT3
1888 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1889 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1891 typedef IntrinsicTrait<ElementType> IT;
1893 const size_t M( A.spacing() );
1894 const size_t N( B.columns() );
1895 const size_t K( A.columns() );
// Blocks of eight vectors per column.
1899 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1900 for(
size_t j=0UL; j<N; ++j ) {
1909 for(
size_t k=0UL; k<K; ++k ) {
1911 xmm1 = xmm1 - A.get(i ,k) * b1;
1912 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1913 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1914 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1915 xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1916 xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1917 xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1918 xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1920 store( &(~C)(i ,j), xmm1 );
1921 store( &(~C)(i+IT::size ,j), xmm2 );
1922 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1923 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1924 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1925 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1926 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1927 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Blocks of four vectors; two columns at a time (xmm1-4 for column j, xmm5-8 for column j+1).
1930 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1932 for( ; (j+2UL) <= N; j+=2UL ) {
1941 for(
size_t k=0UL; k<K; ++k ) {
1948 xmm1 = xmm1 - a1 * b1;
1949 xmm2 = xmm2 - a2 * b1;
1950 xmm3 = xmm3 - a3 * b1;
1951 xmm4 = xmm4 - a4 * b1;
1952 xmm5 = xmm5 - a1 * b2;
1953 xmm6 = xmm6 - a2 * b2;
1954 xmm7 = xmm7 - a3 * b2;
1955 xmm8 = xmm8 - a4 * b2;
1957 store( &(~C)(i ,j ), xmm1 );
1958 store( &(~C)(i+IT::size ,j ), xmm2 );
1959 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1960 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1961 store( &(~C)(i ,j+1UL), xmm5 );
1962 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1963 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1964 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
// Single-column path of the four-vector block (its guard is on a line not visible here).
1971 for(
size_t k=0UL; k<K; ++k ) {
1973 xmm1 = xmm1 - A.get(i ,k) * b1;
1974 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1975 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1976 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1978 store( &(~C)(i ,j), xmm1 );
1979 store( &(~C)(i+IT::size ,j), xmm2 );
1980 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1981 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Blocks of two vectors; two columns at a time, then a single-column path.
1984 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1986 for( ; (j+2UL) <= N; j+=2UL ) {
1991 for(
size_t k=0UL; k<K; ++k ) {
1996 xmm1 = xmm1 - a1 * b1;
1997 xmm2 = xmm2 - a2 * b1;
1998 xmm3 = xmm3 - a1 * b2;
1999 xmm4 = xmm4 - a2 * b2;
2001 store( &(~C)(i ,j ), xmm1 );
2002 store( &(~C)(i+IT::size,j ), xmm2 );
2003 store( &(~C)(i ,j+1UL), xmm3 );
2004 store( &(~C)(i+IT::size,j+1UL), xmm4 );
2009 for(
size_t k=0UL; k<K; ++k ) {
2011 xmm1 = xmm1 - A.get(i ,k) * b1;
2012 xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
2014 store( &(~C)(i ,j), xmm1 );
2015 store( &(~C)(i+IT::size,j), xmm2 );
// Single-vector code: two columns at a time, then a single-column path (the enclosing condition
// on i is on a line not visible here). Elements of B are broadcast with set().
2020 for( ; (j+2UL) <= N; j+=2UL ) {
2023 for(
size_t k=0UL; k<K; ++k ) {
2025 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2026 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2028 store( &(~C)(i,j ), xmm1 );
2029 store( &(~C)(i,j+1UL), xmm2 );
2033 for(
size_t k=0UL; k<K; ++k ) {
2034 xmm1 = xmm1 - A.get(i,k) *
set( B(k,j) );
2036 store( &(~C)(i,j), xmm1 );
// BLAS subtraction-assignment kernel overload enabled when UseDefaultKernel applies to the type
// triple: no BLAS routine is called, the work is forwarded to selectDefaultSubAssignKernel.
2057 template<
typename MT3
2060 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2061 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2063 selectDefaultSubAssignKernel( C, A, B );
// Single precision BLAS overload of the subtraction assignment kernel
// (enabled via UseSinglePrecisionKernel<MT3,MT4,MT5>).
// Calls cblas_sgemm with alpha = -1.0F and beta = 1.0F, i.e. following the GEMM
// contract C := alpha*op(A)*op(B) + beta*C the product is subtracted from C.
2083 template<
typename MT3
2086 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2087 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2089 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
2095 const int M ( numeric_cast<int>( A.rows() ) );
2096 const int N ( numeric_cast<int>( B.columns() ) );
2097 const int K ( numeric_cast<int>( A.columns() ) );
2098 const int lda( numeric_cast<int>( A.spacing() ) );
2099 const int ldb( numeric_cast<int>( B.spacing() ) );
2100 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
2102 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2103 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2104 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2105 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS overload of the subtraction assignment kernel
// (enabled via UseDoublePrecisionKernel<MT3,MT4,MT5>).
// Calls cblas_dgemm with alpha = -1.0 and beta = 1.0, i.e. following the GEMM
// contract C := alpha*op(A)*op(B) + beta*C the product is subtracted from C.
2126 template<
typename MT3
2129 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2130 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2132 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
2138 const int M ( numeric_cast<int>( A.rows() ) );
2139 const int N ( numeric_cast<int>( B.columns() ) );
2140 const int K ( numeric_cast<int>( A.columns() ) );
2141 const int lda( numeric_cast<int>( A.spacing() ) );
2142 const int ldb( numeric_cast<int>( B.spacing() ) );
2143 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
2145 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2146 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2147 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2148 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS overload of the subtraction assignment kernel
// (enabled via UseSinglePrecisionComplexKernel<MT3,MT4,MT5>).
// Calls cblas_cgemm with alpha = (-1,0) and beta = (1,0), so the product of A and B
// is subtracted from the current contents of C.
2169 template<
typename MT3
2172 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2173 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2175 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
2184 const int M ( numeric_cast<int>( A.rows() ) );
2185 const int N ( numeric_cast<int>( B.columns() ) );
2186 const int K ( numeric_cast<int>( A.columns() ) );
2187 const int lda( numeric_cast<int>( A.spacing() ) );
2188 const int ldb( numeric_cast<int>( B.spacing() ) );
2189 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
2190 const complex<float> alpha( -1.0F, 0.0F );
2191 const complex<float> beta ( 1.0F, 0.0F );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
2193 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2194 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2195 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2196 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS overload of the subtraction assignment kernel
// (enabled via UseDoublePrecisionComplexKernel<MT3,MT4,MT5>).
// Calls cblas_zgemm with alpha = (-1,0) and beta = (1,0), so the product of A and B
// is subtracted from the current contents of C.
2217 template<
typename MT3
2220 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2221 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2223 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
2232 const int M ( numeric_cast<int>( A.rows() ) );
2233 const int N ( numeric_cast<int>( B.columns() ) );
2234 const int K ( numeric_cast<int>( A.columns() ) );
2235 const int lda( numeric_cast<int>( A.spacing() ) );
2236 const int ldb( numeric_cast<int>( B.spacing() ) );
2237 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
2238 const complex<double> alpha( -1.0, 0.0 );
2239 const complex<double> beta ( 1.0, 0.0 );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
2241 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2242 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2243 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2244 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Specialization of DMatScalarMultExpr for a scaled TDMatDMatMultExpr<MT1,MT2>,
// i.e. a matrix product expression multiplied by a scalar of type ST.
// The specialization is itself a DenseMatrix with the storage order flag set to true
// and privately derives from the MatScalarMultExpr and Computation tag classes.
2290 template<
typename MT1
2294 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2295 ,
private MatScalarMultExpr
2296 ,
private Computation
// Shorthand types: the wrapped product expression, its result type, and the
// result/composite types of the two operands of the product.
2300 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
2301 typedef typename MMM::ResultType RES;
2302 typedef typename MT1::ResultType
RT1;
2303 typedef typename MT2::ResultType
RT2;
2304 typedef typename MT1::CompositeType
CT1;
2305 typedef typename MT2::CompositeType
CT2;
// Compile-time switch for the single precision BLAS kernel (used with sgemm below).
// value is set when the element types of T1, T2 and T3 all satisfy IsFloat and the
// fourth type T4 (instantiated with the scalar type by the kernels) is not complex.
2313 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2314 struct UseSinglePrecisionKernel {
2315 enum { value = IsFloat<typename T1::ElementType>::value &&
2316 IsFloat<typename T2::ElementType>::value &&
2317 IsFloat<typename T3::ElementType>::value &&
2318 !IsComplex<T4>::value };
// Compile-time switch for the double precision BLAS kernel (used with dgemm below).
// value is set when the element types of T1, T2 and T3 all satisfy IsDouble and the
// fourth type T4 (instantiated with the scalar type by the kernels) is not complex.
2327 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2328 struct UseDoublePrecisionKernel {
2329 enum { value = IsDouble<typename T1::ElementType>::value &&
2330 IsDouble<typename T2::ElementType>::value &&
2331 IsDouble<typename T3::ElementType>::value &&
2332 !IsComplex<T4>::value };
// Compile-time switch for the single precision complex BLAS kernel (used with cgemm
// below). value is set when the element types of T1, T2 and T3 are all exactly
// complex<float>. Unlike the real-valued switches, the scalar type is not inspected.
2341 template<
typename T1,
typename T2,
typename T3 >
2342 struct UseSinglePrecisionComplexKernel {
2343 typedef complex<float> Type;
2344 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2345 IsSame<typename T2::ElementType,Type>::value &&
2346 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the double precision complex BLAS kernel (used with zgemm
// below). value is set when the element types of T1, T2 and T3 are all exactly
// complex<double>. Unlike the real-valued switches, the scalar type is not inspected.
2355 template<
typename T1,
typename T2,
typename T3 >
2356 struct UseDoublePrecisionComplexKernel {
2357 typedef complex<double> Type;
2358 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2359 IsSame<typename T2::ElementType,Type>::value &&
2360 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS (default) kernels.
// value is set when BLAZE_BLAS_MODE is disabled, or when none of the four BLAS
// kernel switches (single, double, single complex, double complex) applies to the
// given type combination.
2368 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2369 struct UseDefaultKernel {
2370 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2371 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2372 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2373 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch for the vectorized default kernels.
// value is set when T1, T2 and T3 are all vectorizable, all three share the same
// element type, the fourth type T4 (the scalar type) is that same element type, and
// IntrinsicTrait reports both addition and multiplication for the element type.
2381 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2382 struct UseVectorizedDefaultKernel {
2383 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2384 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2385 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2386 IsSame<typename T1::ElementType,T4>::value &&
2387 IntrinsicTrait<typename T1::ElementType>::addition &&
2388 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type definitions of the scaled product expression.
// This is the type of the expression itself; ResultType is obtained from MultTrait
// for the product result type RES and the scalar type ST; OppositeType, ElementType
// and IntrinsicType are derived from ResultType.
2394 typedef DMatScalarMultExpr<MMM,ST,true>
This;
2395 typedef typename MultTrait<RES,ST>::Type
ResultType;
2396 typedef typename ResultType::OppositeType
OppositeType;
2398 typedef typename ResultType::ElementType
ElementType;
2399 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left-hand side operand of the scaling is the (const) matrix product expression.
2404 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// LT and RT are the types used for the two operands of the product: the (const)
// result type if the operand is itself a computation, its composite type otherwise.
2410 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2413 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// The expression itself is flagged as not vectorizable.
2418 enum { vectorizable = 0 };
// Constructor taking the matrix product expression and the scalar factor.
// The member initialization is on lines not shown here.
2427 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: the (i,j) element of the wrapped product expression times the scalar.
2443 return matrix_(i,j) * scalar_;
// Returns the number of rows, which is that of the wrapped product expression.
2452 inline size_t rows()
const {
2453 return matrix_.rows();
// Returns the number of columns, which is that of the wrapped product expression.
2462 inline size_t columns()
const {
2463 return matrix_.columns();
// Alias query: forwards the given address to canAlias() of the wrapped product
// expression and returns its answer unchanged.
2493 template<
typename T >
2494 inline bool canAlias(
const T* alias )
const {
2495 return matrix_.canAlias( alias );
// Alias query: forwards the given address to isAliased() of the wrapped product
// expression and returns its answer unchanged.
2505 template<
typename T >
2506 inline bool isAliased(
const T* alias )
const {
2507 return matrix_.isAliased( alias );
// Assignment of the scaled matrix product to a dense matrix.
// Fetches both operands of the wrapped product, special-cases an empty target and a
// product with zero inner dimension (the actions taken in those branches are on
// lines not shown here), and otherwise hands the target, the operands A and B and
// the scalar to either the default or the BLAS kernel.
// NOTE(review): the definitions of A and B and the condition that chooses between
// the two kernels are not visible in this excerpt.
2526 template<
typename MT3
2528 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
2535 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2536 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2538 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2541 else if( left.columns() == 0UL ) {
2557 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2559 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default assignment kernel (selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set).
// Computes the product row by row with plain element access.
// NOTE(review): A(i,0UL) and B(0UL,k) are read unconditionally, so this presumably
// relies on the caller having excluded a zero inner dimension - confirm.
2577 template<
typename MT3
2581 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2582 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2584 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Initialize row i of C with the contribution of the first inner index.
2585 for(
size_t k=0UL; k<B.columns(); ++k ) {
2586 C(i,k) = A(i,0UL) * B(0UL,k);
// Accumulate the contributions of the remaining inner indices.
2588 for(
size_t j=1UL; j<A.columns(); ++j ) {
2589 for(
size_t k=0UL; k<B.columns(); ++k ) {
2590 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i; its body is not shown here (presumably applies the scalar).
2593 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default assignment kernel for a row-major target (DenseMatrix<MT3,false>).
// Products are accumulated over k in IntrinsicType variables and each accumulator is
// multiplied by 'factor' when it is stored to C.
// NOTE(review): the definitions of 'factor', of the loop counters and of the
// a1/a2/b1..b4 operands are on lines not shown here; 'factor' is presumably the
// scalar broadcast to an intrinsic vector - confirm.
2614 template<
typename MT3
2618 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2619 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2621 typedef IntrinsicTrait<ElementType> IT;
// N is taken from B.spacing(), not B.columns(), so the column loops cover the full
// row spacing of B. NOTE(review): presumably relies on padded rows - confirm.
2623 const size_t M( A.rows() );
2624 const size_t N( B.spacing() );
2625 const size_t K( A.columns() );
// Column blocks of 8 intrinsic vectors, one row of C per inner iteration.
2631 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2632 for(
size_t i=0UL; i<M; ++i ) {
2633 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2634 for(
size_t k=0UL; k<K; ++k ) {
2636 xmm1 = xmm1 + a1 * B.get(k,j );
2637 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2638 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2639 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2640 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2641 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2642 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2643 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2645 store( &(~C)(i,j ), xmm1 * factor );
2646 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2647 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2648 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2649 store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2650 store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2651 store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2652 store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
// Column blocks of 4 intrinsic vectors, two rows of C at a time.
2655 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2657 for( ; (i+2UL) <= M; i+=2UL ) {
2658 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2659 for(
size_t k=0UL; k<K; ++k ) {
2666 xmm1 = xmm1 + a1 * b1;
2667 xmm2 = xmm2 + a1 * b2;
2668 xmm3 = xmm3 + a1 * b3;
2669 xmm4 = xmm4 + a1 * b4;
2670 xmm5 = xmm5 + a2 * b1;
2671 xmm6 = xmm6 + a2 * b2;
2672 xmm7 = xmm7 + a2 * b3;
2673 xmm8 = xmm8 + a2 * b4;
2675 store( &(~C)(i ,j ), xmm1 * factor );
2676 store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2677 store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2678 store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2679 store( &(~C)(i+1UL,j ), xmm5 * factor );
2680 store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2681 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2682 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
// Single-row variant of the 4-vector block (its guarding condition is not shown here).
2686 for(
size_t k=0UL; k<K; ++k ) {
2688 xmm1 = xmm1 + a1 * B.get(k,j );
2689 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2690 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2691 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2693 store( &(~C)(i,j ), xmm1 * factor );
2694 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2695 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2696 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
// Column blocks of 2 intrinsic vectors, two rows of C at a time.
2699 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2701 for( ; (i+2UL) <= M; i+=2UL ) {
2703 for(
size_t k=0UL; k<K; ++k ) {
2708 xmm1 = xmm1 + a1 * b1;
2709 xmm2 = xmm2 + a1 * b2;
2710 xmm3 = xmm3 + a2 * b1;
2711 xmm4 = xmm4 + a2 * b2;
2713 store( &(~C)(i ,j ), xmm1 * factor );
2714 store( &(~C)(i ,j+IT::size), xmm2 * factor );
2715 store( &(~C)(i+1UL,j ), xmm3 * factor );
2716 store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
// Single-row variant of the 2-vector block (its guarding condition is not shown here).
2720 for(
size_t k=0UL; k<K; ++k ) {
2722 xmm1 = xmm1 + a1 * B.get(k,j );
2723 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2725 store( &(~C)(i,j ), xmm1 * factor );
2726 store( &(~C)(i,j+IT::size), xmm2 * factor );
// One intrinsic vector of columns, two rows at a time; the elements of A are
// broadcast with set(). The enclosing column loop is on a line not shown here.
2731 for( ; (i+2UL) <= M; i+=2UL ) {
2733 for(
size_t k=0UL; k<K; ++k ) {
2735 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2736 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2738 store( &(~C)(i ,j), xmm1 * factor );
2739 store( &(~C)(i+1UL,j), xmm2 * factor );
// Single row, single intrinsic vector (its guarding condition is not shown here).
2743 for(
size_t k=0UL; k<K; ++k ) {
2744 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2746 store( &(~C)(i,j), xmm1 * factor );
// Vectorized default assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// Products are accumulated over k in IntrinsicType variables and each accumulator is
// multiplied by 'factor' when it is stored to C. Intrinsic vectors are read from A
// via A.get() and the elements of B are combined with them as b1/b2 or via set().
// NOTE(review): the definitions of 'factor', of the loop counters and of the
// a1..a4/b1/b2 operands are on lines not shown here; 'factor' is presumably the
// scalar broadcast to an intrinsic vector - confirm.
2766 template<
typename MT3
2770 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2771 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2773 typedef IntrinsicTrait<ElementType> IT;
// M is taken from A.spacing(), not A.rows(), so the row loops cover the full column
// spacing of A. NOTE(review): presumably relies on padded columns - confirm.
2775 const size_t M( A.spacing() );
2776 const size_t N( B.columns() );
2777 const size_t K( A.columns() );
// Row blocks of 8 intrinsic vectors, one column of C per inner iteration.
2783 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2784 for(
size_t j=0UL; j<N; ++j ) {
2785 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2786 for(
size_t k=0UL; k<K; ++k ) {
2788 xmm1 = xmm1 + A.get(i ,k) * b1;
2789 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2790 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2791 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2792 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2793 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2794 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2795 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2797 store( &(~C)(i ,j), xmm1 * factor );
2798 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2799 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2800 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2801 store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2802 store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2803 store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2804 store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
// Row blocks of 4 intrinsic vectors, two columns of C at a time.
2807 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2809 for( ; (j+2UL) <= N; j+=2UL ) {
2810 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2811 for(
size_t k=0UL; k<K; ++k ) {
2818 xmm1 = xmm1 + a1 * b1;
2819 xmm2 = xmm2 + a2 * b1;
2820 xmm3 = xmm3 + a3 * b1;
2821 xmm4 = xmm4 + a4 * b1;
2822 xmm5 = xmm5 + a1 * b2;
2823 xmm6 = xmm6 + a2 * b2;
2824 xmm7 = xmm7 + a3 * b2;
2825 xmm8 = xmm8 + a4 * b2;
2827 store( &(~C)(i ,j ), xmm1 * factor );
2828 store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2829 store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2830 store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2831 store( &(~C)(i ,j+1UL), xmm5 * factor );
2832 store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2833 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2834 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
// Single-column variant of the 4-vector block (its guarding condition is not shown here).
2838 for(
size_t k=0UL; k<K; ++k ) {
2840 xmm1 = xmm1 + A.get(i ,k) * b1;
2841 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2842 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2843 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2845 store( &(~C)(i ,j), xmm1 * factor );
2846 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2847 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2848 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
// Row blocks of 2 intrinsic vectors, two columns of C at a time.
2851 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2853 for( ; (j+2UL) <= N; j+=2UL ) {
2855 for(
size_t k=0UL; k<K; ++k ) {
2860 xmm1 = xmm1 + a1 * b1;
2861 xmm2 = xmm2 + a2 * b1;
2862 xmm3 = xmm3 + a1 * b2;
2863 xmm4 = xmm4 + a2 * b2;
2865 store( &(~C)(i ,j ), xmm1 * factor );
2866 store( &(~C)(i+IT::size,j ), xmm2 * factor );
2867 store( &(~C)(i ,j+1UL), xmm3 * factor );
2868 store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
// Single-column variant of the 2-vector block (its guarding condition is not shown here).
2872 for(
size_t k=0UL; k<K; ++k ) {
2874 xmm1 = xmm1 + A.get(i ,k) * b1;
2875 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2877 store( &(~C)(i ,j), xmm1 * factor );
2878 store( &(~C)(i+IT::size,j), xmm2 * factor );
// One intrinsic vector of rows, two columns at a time; the elements of B are
// broadcast with set(). The enclosing row loop is on a line not shown here.
2883 for( ; (j+2UL) <= N; j+=2UL ) {
2885 for(
size_t k=0UL; k<K; ++k ) {
2887 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2888 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2890 store( &(~C)(i,j ), xmm1 * factor );
2891 store( &(~C)(i,j+1UL), xmm2 * factor );
// Single column, single intrinsic vector (its guarding condition is not shown here).
2895 for(
size_t k=0UL; k<K; ++k ) {
2896 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2898 store( &(~C)(i,j), xmm1 * factor );
// Fallback overload of the BLAS-based assignment kernel for the scaled product.
// It is enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> is set and performs no BLAS
// call itself: it forwards all arguments to selectDefaultAssignKernel().
2918 template<
typename MT3
2922 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2923 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2925 selectDefaultAssignKernel( C, A, B, scalar );
// Single precision BLAS overload of the assignment kernel for the scaled product
// (enabled via UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>).
// Calls cblas_sgemm with alpha = scalar and beta = 0.0F, so following the GEMM
// contract C := alpha*op(A)*op(B) + beta*C the previous contents of C do not
// contribute to the result.
2944 template<
typename MT3
2948 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2949 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2951 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
2957 const int M ( numeric_cast<int>( A.rows() ) );
2958 const int N ( numeric_cast<int>( B.columns() ) );
2959 const int K ( numeric_cast<int>( A.columns() ) );
2960 const int lda( numeric_cast<int>( A.spacing() ) );
2961 const int ldb( numeric_cast<int>( B.spacing() ) );
2962 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
2964 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2965 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2966 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2967 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// Double precision BLAS overload of the assignment kernel for the scaled product
// (enabled via UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>).
// Calls cblas_dgemm with alpha = scalar and beta = 0.0, so following the GEMM
// contract C := alpha*op(A)*op(B) + beta*C the previous contents of C do not
// contribute to the result.
2987 template<
typename MT3
2991 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2992 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2994 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3000 const int M ( numeric_cast<int>( A.rows() ) );
3001 const int N ( numeric_cast<int>( B.columns() ) );
3002 const int K ( numeric_cast<int>( A.columns() ) );
3003 const int lda( numeric_cast<int>( A.spacing() ) );
3004 const int ldb( numeric_cast<int>( B.spacing() ) );
3005 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3007 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3008 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3009 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3010 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// Single precision complex BLAS overload of the assignment kernel for the scaled
// product (enabled via UseSinglePrecisionComplexKernel<MT3,MT4,MT5>).
// Calls cblas_cgemm with alpha constructed from the scalar and beta = (0,0), so the
// previous contents of C do not contribute to the result.
3030 template<
typename MT3
3034 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3035 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3037 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3046 const int M ( numeric_cast<int>( A.rows() ) );
3047 const int N ( numeric_cast<int>( B.columns() ) );
3048 const int K ( numeric_cast<int>( A.columns() ) );
3049 const int lda( numeric_cast<int>( A.spacing() ) );
3050 const int ldb( numeric_cast<int>( B.spacing() ) );
3051 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
3052 const complex<float> alpha( scalar );
3053 const complex<float> beta ( 0.0F, 0.0F );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3055 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3056 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3057 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3058 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS overload of the assignment kernel for the scaled
// product (enabled via UseDoublePrecisionComplexKernel<MT3,MT4,MT5>).
// Calls cblas_zgemm with alpha constructed from the scalar and beta = (0,0), so the
// previous contents of C do not contribute to the result.
3078 template<
typename MT3
3082 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3083 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3085 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3094 const int M ( numeric_cast<int>( A.rows() ) );
3095 const int N ( numeric_cast<int>( B.columns() ) );
3096 const int K ( numeric_cast<int>( A.columns() ) );
3097 const int lda( numeric_cast<int>( A.spacing() ) );
3098 const int ldb( numeric_cast<int>( B.spacing() ) );
3099 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
3100 const complex<double> alpha( scalar );
3101 const complex<double> beta ( 0.0, 0.0 );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3103 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3104 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3105 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3106 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is first evaluated into a temporary of type TmpType, which is
// selected between ResultType and OppositeType depending on the storage order SO of
// the target. The use of the temporary afterwards is on lines not shown here.
3123 template<
typename MT
3125 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
3129 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3141 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix.
// Fetches both operands of the wrapped product, special-cases an empty target or a
// zero inner dimension (the action taken in that branch is on a line not shown
// here), and otherwise hands the target, the operands A and B and the scalar to
// either the default or the BLAS addition kernel.
// NOTE(review): the definitions of A and B and the condition that chooses between
// the two kernels are not visible in this excerpt.
3158 template<
typename MT3
3160 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
3167 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3168 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3170 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
3185 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3187 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default addition assignment kernel (selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set).
// Only the signature is visible in this excerpt; the body is on lines not shown.
3205 template<
typename MT3
3209 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3210 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default addition assignment kernel for a row-major target
// (DenseMatrix<MT3,false>).
// Products are accumulated over k in IntrinsicType variables; on write-back each
// accumulator is multiplied by 'factor' and added to the value loaded from C.
// NOTE(review): the definitions of 'factor', of the loop counters and of the
// a1/a2/b1..b4 operands are on lines not shown here; 'factor' is presumably the
// scalar broadcast to an intrinsic vector - confirm.
3231 template<
typename MT3
3235 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3236 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3238 typedef IntrinsicTrait<ElementType> IT;
// N is taken from B.spacing(), not B.columns(), so the column loops cover the full
// row spacing of B. NOTE(review): presumably relies on padded rows - confirm.
3240 const size_t M( A.rows() );
3241 const size_t N( B.spacing() );
3242 const size_t K( A.columns() );
// Column blocks of 8 intrinsic vectors, one row of C per inner iteration.
3248 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3249 for(
size_t i=0UL; i<M; ++i ) {
3250 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3251 for(
size_t k=0UL; k<K; ++k ) {
3253 xmm1 = xmm1 + a1 * B.get(k,j );
3254 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3255 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3256 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3257 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3258 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3259 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3260 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3262 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3263 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3264 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3265 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3266 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
3267 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
3268 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
3269 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
// Column blocks of 4 intrinsic vectors, two rows of C at a time.
3272 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3274 for( ; (i+2UL) <= M; i+=2UL ) {
3275 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3276 for(
size_t k=0UL; k<K; ++k ) {
3283 xmm1 = xmm1 + a1 * b1;
3284 xmm2 = xmm2 + a1 * b2;
3285 xmm3 = xmm3 + a1 * b3;
3286 xmm4 = xmm4 + a1 * b4;
3287 xmm5 = xmm5 + a2 * b1;
3288 xmm6 = xmm6 + a2 * b2;
3289 xmm7 = xmm7 + a2 * b3;
3290 xmm8 = xmm8 + a2 * b4;
3292 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3293 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
3294 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
3295 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
3296 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
3297 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
3298 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
3299 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
// Single-row variant of the 4-vector block (its guarding condition is not shown here).
3303 for(
size_t k=0UL; k<K; ++k ) {
3305 xmm1 = xmm1 + a1 * B.get(k,j );
3306 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3307 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3308 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3310 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3311 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3312 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3313 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
// Column blocks of 2 intrinsic vectors, two rows of C at a time.
3316 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3318 for( ; (i+2UL) <= M; i+=2UL ) {
3320 for(
size_t k=0UL; k<K; ++k ) {
3325 xmm1 = xmm1 + a1 * b1;
3326 xmm2 = xmm2 + a1 * b2;
3327 xmm3 = xmm3 + a2 * b1;
3328 xmm4 = xmm4 + a2 * b2;
3330 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3331 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
3332 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
3333 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
// Single-row variant of the 2-vector block (its guarding condition is not shown here).
3337 for(
size_t k=0UL; k<K; ++k ) {
3339 xmm1 = xmm1 + a1 * B.get(k,j );
3340 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3342 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3343 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
// One intrinsic vector of columns, two rows at a time; the elements of A are
// broadcast with set(). The enclosing column loop is on a line not shown here.
3348 for( ; (i+2UL) <= M; i+=2UL ) {
3350 for(
size_t k=0UL; k<K; ++k ) {
3352 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3353 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3355 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3356 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) + xmm2 * factor );
// Single row, single intrinsic vector (its guarding condition is not shown here).
3360 for(
size_t k=0UL; k<K; ++k ) {
3361 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3363 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Vectorized default addition assignment kernel for a column-major target
// (DenseMatrix<MT3,true>).
// Products are accumulated over k in IntrinsicType variables; on write-back each
// accumulator is multiplied by 'factor' and added to the value loaded from C.
// NOTE(review): the definitions of 'factor', of the loop counters and of the
// a1..a4/b1/b2 operands are on lines not shown here; 'factor' is presumably the
// scalar broadcast to an intrinsic vector - confirm.
3383 template<
typename MT3
3387 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3388 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3390 typedef IntrinsicTrait<ElementType> IT;
// M is taken from A.spacing(), not A.rows(), so the row loops cover the full column
// spacing of A. NOTE(review): presumably relies on padded columns - confirm.
3392 const size_t M( A.spacing() );
3393 const size_t N( B.columns() );
3394 const size_t K( A.columns() );
// Row blocks of 8 intrinsic vectors, one column of C per inner iteration.
3400 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3401 for(
size_t j=0UL; j<N; ++j ) {
3402 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3403 for(
size_t k=0UL; k<K; ++k ) {
3405 xmm1 = xmm1 + A.get(i ,k) * b1;
3406 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3407 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3408 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3409 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3410 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3411 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3412 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3414 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3415 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3416 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3417 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3418 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
3419 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
3420 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
3421 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
// Row blocks of 4 intrinsic vectors, two columns of C at a time.
3424 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3426 for( ; (j+2UL) <= N; j+=2UL ) {
3427 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3428 for(
size_t k=0UL; k<K; ++k ) {
3435 xmm1 = xmm1 + a1 * b1;
3436 xmm2 = xmm2 + a2 * b1;
3437 xmm3 = xmm3 + a3 * b1;
3438 xmm4 = xmm4 + a4 * b1;
3439 xmm5 = xmm5 + a1 * b2;
3440 xmm6 = xmm6 + a2 * b2;
3441 xmm7 = xmm7 + a3 * b2;
3442 xmm8 = xmm8 + a4 * b2;
3444 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3445 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
3446 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
3447 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
3448 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
3449 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
3450 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
3451 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
// Single-column variant of the 4-vector block (its guarding condition is not shown here).
3455 for(
size_t k=0UL; k<K; ++k ) {
3457 xmm1 = xmm1 + A.get(i ,k) * b1;
3458 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3459 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3460 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3462 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3463 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3464 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3465 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
// Row blocks of 2 intrinsic vectors, two columns of C at a time.
3468 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3470 for( ; (j+2UL) <= N; j+=2UL ) {
3472 for(
size_t k=0UL; k<K; ++k ) {
3477 xmm1 = xmm1 + a1 * b1;
3478 xmm2 = xmm2 + a2 * b1;
3479 xmm3 = xmm3 + a1 * b2;
3480 xmm4 = xmm4 + a2 * b2;
3482 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3483 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
3484 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
3485 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
// Single-column variant of the 2-vector block (its guarding condition is not shown here).
3489 for(
size_t k=0UL; k<K; ++k ) {
3491 xmm1 = xmm1 + A.get(i ,k) * b1;
3492 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3494 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3495 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
// One intrinsic vector of rows, two columns at a time; the elements of B are
// broadcast with set(). The enclosing row loop is on a line not shown here.
3500 for( ; (j+2UL) <= N; j+=2UL ) {
3502 for(
size_t k=0UL; k<K; ++k ) {
3504 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3505 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3507 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3508 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) + xmm2 * factor );
// Single column, single intrinsic vector (its guarding condition is not shown here).
3512 for(
size_t k=0UL; k<K; ++k ) {
3513 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
3515 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Fallback overload of the BLAS-based addition assignment kernel for the scaled
// product. It is enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> is set and performs
// no BLAS call itself: it forwards all arguments to selectDefaultAddAssignKernel().
3535 template<
typename MT3
3539 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3540 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3542 selectDefaultAddAssignKernel( C, A, B, scalar );
// Single precision BLAS overload of the addition assignment kernel for the scaled
// product (enabled via UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>).
// Calls cblas_sgemm with alpha = scalar and beta = 1.0F, so following the GEMM
// contract C := alpha*op(A)*op(B) + beta*C the scaled product is added to C.
3561 template<
typename MT3
3565 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3566 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3568 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3574 const int M ( numeric_cast<int>( A.rows() ) );
3575 const int N ( numeric_cast<int>( B.columns() ) );
3576 const int K ( numeric_cast<int>( A.columns() ) );
3577 const int lda( numeric_cast<int>( A.spacing() ) );
3578 const int ldb( numeric_cast<int>( B.spacing() ) );
3579 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3581 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3582 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3583 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3584 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// Double precision BLAS overload of the addition assignment kernel for the scaled
// product (enabled via UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>).
// Calls cblas_dgemm with alpha = scalar and beta = 1.0, so following the GEMM
// contract C := alpha*op(A)*op(B) + beta*C the scaled product is added to C.
3604 template<
typename MT3
3608 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3609 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3611 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3617 const int M ( numeric_cast<int>( A.rows() ) );
3618 const int N ( numeric_cast<int>( B.columns() ) );
3619 const int K ( numeric_cast<int>( A.columns() ) );
3620 const int lda( numeric_cast<int>( A.spacing() ) );
3621 const int ldb( numeric_cast<int>( B.spacing() ) );
3622 const int ldc( numeric_cast<int>( C.spacing() ) );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3624 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3625 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3626 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3627 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// Single precision complex BLAS overload of the addition assignment kernel for the
// scaled product (enabled via UseSinglePrecisionComplexKernel<MT3,MT4,MT5>).
// Calls cblas_cgemm with alpha constructed from the scalar and beta = (1,0), so the
// scaled product is added to the current contents of C.
3647 template<
typename MT3
3651 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3652 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3654 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3663 const int M ( numeric_cast<int>( A.rows() ) );
3664 const int N ( numeric_cast<int>( B.columns() ) );
3665 const int K ( numeric_cast<int>( A.columns() ) );
3666 const int lda( numeric_cast<int>( A.spacing() ) );
3667 const int ldb( numeric_cast<int>( B.spacing() ) );
3668 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
3669 const complex<float> alpha( scalar );
3670 const complex<float> beta ( 1.0F, 0.0F );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3672 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3673 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3674 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3675 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Double precision complex BLAS overload of the addition assignment kernel for the
// scaled product (enabled via UseDoublePrecisionComplexKernel<MT3,MT4,MT5>).
// Calls cblas_zgemm with alpha constructed from the scalar and beta = (1,0), so the
// scaled product is added to the current contents of C.
3695 template<
typename MT3
3699 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3700 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3702 using boost::numeric_cast;
// CBLAS takes int dimensions; the sizes and spacings are converted with numeric_cast.
3711 const int M ( numeric_cast<int>( A.rows() ) );
3712 const int N ( numeric_cast<int>( B.columns() ) );
3713 const int K ( numeric_cast<int>( A.columns() ) );
3714 const int lda( numeric_cast<int>( A.spacing() ) );
3715 const int ldb( numeric_cast<int>( B.spacing() ) );
3716 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
3717 const complex<double> alpha( scalar );
3718 const complex<double> beta ( 1.0, 0.0 );
// The layout follows the storage order of the target C. For a row-major C the left
// operand is flagged as transposed, otherwise the right operand is.
3720 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3721 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3722 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3723 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of a scaled matrix product (rhs) to a dense matrix (lhs).
// It fetches the two operands of the wrapped product and forwards the work to one of
// the selectDefaultSubAssignKernel / selectBlasSubAssignKernel overloads.
// NOTE(review): the end of the template parameter list, the definitions of A and B,
// the body of the early-exit branch and the condition that chooses between the two
// kernels are not visible in this listing.
3744 template<
typename MT3
3746 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the matrix product stored inside the scaling expression.
3753 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3754 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate sizes: the target has no rows or no columns, or the inner dimension is
// zero (presumably nothing has to be subtracted then - confirm the branch body).
3756 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Both kernels receive the unwrapped target (~lhs), the operands A and B and the
// scalar of the scaling expression.
3771 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3773 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default subtraction-assignment kernel for the scaled matrix product. DisableIf removes
// this overload whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds, i.e. it
// is the non-vectorized counterpart of the vectorized overloads of the same name.
//   C      - target matrix
//   A, B   - left and right operand of the product
//   scalar - scaling factor
// NOTE(review): only the signature is visible in this listing; the function body and
// the rest of the template parameter list are missing.
3791 template<
typename MT3
3795 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3796 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction-assignment kernel for a target declared as
// DenseMatrix<MT3,false> (presumably the row-major case - confirm). Enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every section accumulates products of A and B over k in intrinsic accumulators and
// then subtracts accumulator * factor from the matching elements of C; the vector
// direction runs along j (columns), the panel width shrinks from 8 to 4 to 2 to 1
// vectors of IT::size elements.
// NOTE(review): the declarations of i, j, a1, a2, b1..b4, factor and some of the
// accumulators, any guarding if-statements and all closing braces are not visible in
// this listing. `factor` is presumably the broadcast of `scalar`, a1/a2 broadcasts of
// elements of A and b1..b4 vector loads from B - confirm against the full source.
3817 template<
typename MT3
3821 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3822 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3824 typedef IntrinsicTrait<ElementType> IT;
// N is B.spacing() rather than B.columns(), so the column loops below cover the whole
// spacing of a row of B (presumably including padding elements - confirm).
3826 const size_t M( A.rows() );
3827 const size_t N( B.spacing() );
3828 const size_t K( A.columns() );
// Panels of 8 vectors per row: one row i at a time, eight accumulators.
3834 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3835 for(
size_t i=0UL; i<M; ++i ) {
3836 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3837 for(
size_t k=0UL; k<K; ++k ) {
3839 xmm1 = xmm1 + a1 * B.get(k,j );
3840 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3841 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3842 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3843 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3844 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3845 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3846 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Write-back: C(i, j..) -= accumulator * factor for the eight vectors of the panel.
3848 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3849 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3850 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3851 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3852 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3853 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3854 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3855 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
// Panels of 4 vectors: rows are processed in pairs (i, i+1); xmm1..xmm4 belong to
// row i (factor a1), xmm5..xmm8 to row i+1 (factor a2).
3858 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3860 for( ; (i+2UL) <= M; i+=2UL ) {
3861 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3862 for(
size_t k=0UL; k<K; ++k ) {
3869 xmm1 = xmm1 + a1 * b1;
3870 xmm2 = xmm2 + a1 * b2;
3871 xmm3 = xmm3 + a1 * b3;
3872 xmm4 = xmm4 + a1 * b4;
3873 xmm5 = xmm5 + a2 * b1;
3874 xmm6 = xmm6 + a2 * b2;
3875 xmm7 = xmm7 + a2 * b3;
3876 xmm8 = xmm8 + a2 * b4;
// Write-back for both rows of the pair.
3878 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3879 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3880 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3881 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3882 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3883 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3884 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3885 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
// Single-row variant of the 4-vector panel (presumably for the last row when M is
// odd; the guarding condition is not visible).
3889 for(
size_t k=0UL; k<K; ++k ) {
3891 xmm1 = xmm1 + a1 * B.get(k,j );
3892 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3893 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3894 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3896 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3897 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3898 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3899 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
// Panels of 2 vectors: again two rows at a time; xmm1/xmm2 belong to row i,
// xmm3/xmm4 to row i+1.
3902 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3904 for( ; (i+2UL) <= M; i+=2UL ) {
3906 for(
size_t k=0UL; k<K; ++k ) {
3911 xmm1 = xmm1 + a1 * b1;
3912 xmm2 = xmm2 + a1 * b2;
3913 xmm3 = xmm3 + a2 * b1;
3914 xmm4 = xmm4 + a2 * b2;
3916 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3917 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3918 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3919 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
// Single-row variant of the 2-vector panel (guarding condition not visible).
3923 for(
size_t k=0UL; k<K; ++k ) {
3925 xmm1 = xmm1 + a1 * B.get(k,j );
3926 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3928 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3929 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
// Last section: a single vector per row. Two rows at a time first; the elements of A
// are broadcast with set() and multiplied with the shared vector b1.
3934 for( ; (i+2UL) <= M; i+=2UL ) {
3936 for(
size_t k=0UL; k<K; ++k ) {
3938 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3939 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3941 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3942 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) - xmm2 * factor );
// Final single row / single vector case (guarding condition not visible).
3946 for(
size_t k=0UL; k<K; ++k ) {
3947 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3949 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Vectorized default subtraction-assignment kernel for a target declared as
// DenseMatrix<MT3,true> (presumably the column-major case - confirm). Enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every section accumulates products of A and B over k in intrinsic accumulators and
// then subtracts accumulator * factor from the matching elements of C; here the vector
// direction runs along i (rows), the panel height shrinks from 8 to 4 to 2 to 1
// vectors of IT::size elements.
// NOTE(review): the declarations of i, j, a1..a4, b1, b2, factor and some of the
// accumulators, any guarding if-statements and all closing braces are not visible in
// this listing. `factor` is presumably the broadcast of `scalar`, a1..a4 vector loads
// from A and b1/b2 broadcasts of elements of B - confirm against the full source.
3969 template<
typename MT3
3973 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3974 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3976 typedef IntrinsicTrait<ElementType> IT;
// M is A.spacing() rather than A.rows(), so the row loops below cover the whole
// spacing of a column of A (presumably including padding elements - confirm).
3978 const size_t M( A.spacing() );
3979 const size_t N( B.columns() );
3980 const size_t K( A.columns() );
// Panels of 8 vectors per column: one column j at a time, eight accumulators.
3986 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3987 for(
size_t j=0UL; j<N; ++j ) {
3988 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3989 for(
size_t k=0UL; k<K; ++k ) {
3991 xmm1 = xmm1 + A.get(i ,k) * b1;
3992 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3993 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3994 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3995 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3996 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3997 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3998 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Write-back: C(i.., j) -= accumulator * factor for the eight vectors of the panel.
4000 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4001 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4002 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4003 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4004 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
4005 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
4006 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
4007 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
// Panels of 4 vectors: columns are processed in pairs (j, j+1); xmm1..xmm4 belong to
// column j (factor b1), xmm5..xmm8 to column j+1 (factor b2).
4010 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
4012 for( ; (j+2UL) <= N; j+=2UL ) {
4013 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4014 for(
size_t k=0UL; k<K; ++k ) {
4021 xmm1 = xmm1 + a1 * b1;
4022 xmm2 = xmm2 + a2 * b1;
4023 xmm3 = xmm3 + a3 * b1;
4024 xmm4 = xmm4 + a4 * b1;
4025 xmm5 = xmm5 + a1 * b2;
4026 xmm6 = xmm6 + a2 * b2;
4027 xmm7 = xmm7 + a3 * b2;
4028 xmm8 = xmm8 + a4 * b2;
// Write-back for both columns of the pair.
4030 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
4031 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
4032 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
4033 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
4034 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
4035 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
4036 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
4037 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
// Single-column variant of the 4-vector panel (presumably for the last column when N
// is odd; the guarding condition is not visible).
4041 for(
size_t k=0UL; k<K; ++k ) {
4043 xmm1 = xmm1 + A.get(i ,k) * b1;
4044 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
4045 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
4046 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
4048 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4049 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4050 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4051 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
// Panels of 2 vectors: again two columns at a time; xmm1/xmm2 belong to column j,
// xmm3/xmm4 to column j+1.
4054 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
4056 for( ; (j+2UL) <= N; j+=2UL ) {
4058 for(
size_t k=0UL; k<K; ++k ) {
4063 xmm1 = xmm1 + a1 * b1;
4064 xmm2 = xmm2 + a2 * b1;
4065 xmm3 = xmm3 + a1 * b2;
4066 xmm4 = xmm4 + a2 * b2;
4068 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
4069 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
4070 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
4071 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
// Single-column variant of the 2-vector panel (guarding condition not visible).
4075 for(
size_t k=0UL; k<K; ++k ) {
4077 xmm1 = xmm1 + A.get(i ,k) * b1;
4078 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
4080 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4081 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
// Last section: a single vector per column. Two columns at a time first; the elements
// of B are broadcast with set() and multiplied with the shared vector a1.
4086 for( ; (j+2UL) <= N; j+=2UL ) {
4088 for(
size_t k=0UL; k<K; ++k ) {
4090 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4091 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4093 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
4094 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) - xmm2 * factor );
// Final single column / single vector case (guarding condition not visible).
4098 for(
size_t k=0UL; k<K; ++k ) {
4099 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
4101 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Fallback overload of the BLAS subtraction-assignment dispatch: when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds, no BLAS routine is called and the work is
// forwarded unchanged to selectDefaultSubAssignKernel.
//   C      - target matrix
//   A, B   - left and right operand of the product
//   scalar - scaling factor
// NOTE(review): the remainder of the template parameter list and the function braces
// are not visible in this listing.
4121 template<
typename MT3
4125 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4126 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4128 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction-assignment kernel for the scaled matrix product, selected
// when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
//   C      - target matrix that the scaled product is subtracted from
//   A, B   - left and right operand of the product
//   scalar - scaling factor; its negation is handed to GEMM as alpha
// NOTE(review): the remainder of the template parameter list and the function braces
// are not visible in this listing.
4147 template<
typename MT3
4151 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4152 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4154 using boost::numeric_cast;
// GEMM expects int arguments: M and K come from A's rows/columns, N from B's columns.
4160 const int M ( numeric_cast<int>( A.rows() ) );
4161 const int N ( numeric_cast<int>( B.columns() ) );
4162 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
4163 const int lda( numeric_cast<int>( A.spacing() ) );
4164 const int ldb( numeric_cast<int>( B.spacing() ) );
4165 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -scalar and beta = 1.0F: the subtraction is expressed as an accumulation of
// the negated scaled product onto C. Layout and transpose flags follow the storage
// order of MT3 (row-major: A transposed; otherwise: B transposed).
4167 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4168 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4169 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4170 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for the scaled matrix product, selected
// when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
//   C      - target matrix that the scaled product is subtracted from
//   A, B   - left and right operand of the product
//   scalar - scaling factor; its negation is handed to GEMM as alpha
// NOTE(review): the remainder of the template parameter list and the function braces
// are not visible in this listing.
4190 template<
typename MT3
4194 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4195 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4197 using boost::numeric_cast;
// GEMM expects int arguments: M and K come from A's rows/columns, N from B's columns.
4203 const int M ( numeric_cast<int>( A.rows() ) );
4204 const int N ( numeric_cast<int>( B.columns() ) );
4205 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
4206 const int lda( numeric_cast<int>( A.spacing() ) );
4207 const int ldb( numeric_cast<int>( B.spacing() ) );
4208 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -scalar and beta = 1.0: the subtraction is expressed as an accumulation of
// the negated scaled product onto C. Layout and transpose flags follow the storage
// order of MT3 (row-major: A transposed; otherwise: B transposed).
4210 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4211 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4212 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4213 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for the scaled matrix product, selected
// when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C      - target matrix that the scaled product is subtracted from
//   A, B   - left and right operand of the product
//   scalar - scaling factor; its negation is converted to complex<float> for alpha
// NOTE(review): the remainder of the template parameter list and the function braces
// are not visible in this listing.
4233 template<
typename MT3
4237 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4238 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4240 using boost::numeric_cast;
// GEMM expects int arguments: M and K come from A's rows/columns, N from B's columns.
4249 const int M ( numeric_cast<int>( A.rows() ) );
4250 const int N ( numeric_cast<int>( B.columns() ) );
4251 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
4252 const int lda( numeric_cast<int>( A.spacing() ) );
4253 const int ldb( numeric_cast<int>( B.spacing() ) );
4254 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM receives alpha and beta by address. alpha carries the negated
// scalar, beta = (1,0) keeps the current content of C, so C ends up reduced by the
// scaled product.
4255 const complex<float> alpha( -scalar );
4256 const complex<float> beta ( 1.0F, 0.0F );
// Layout and transpose flags follow the storage order of MT3 (row-major: A transposed;
// otherwise: B transposed).
4258 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4259 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4260 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4261 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for the scaled matrix product, selected
// when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C      - target matrix that the scaled product is subtracted from
//   A, B   - left and right operand of the product
//   scalar - scaling factor; its negation is converted to complex<double> for alpha
// NOTE(review): the remainder of the template parameter list and the function braces
// are not visible in this listing.
4281 template<
typename MT3
4285 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4286 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4288 using boost::numeric_cast;
// GEMM expects int arguments: M and K come from A's rows/columns, N from B's columns.
4297 const int M ( numeric_cast<int>( A.rows() ) );
4298 const int N ( numeric_cast<int>( B.columns() ) );
4299 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
4300 const int lda( numeric_cast<int>( A.spacing() ) );
4301 const int ldb( numeric_cast<int>( B.spacing() ) );
4302 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM receives alpha and beta by address. alpha carries the negated
// scalar, beta = (1,0) keeps the current content of C, so C ends up reduced by the
// scaled product.
4303 const complex<double> alpha( -scalar );
4304 const complex<double> beta ( 1.0, 0.0 );
// Layout and transpose flags follow the storage order of MT3 (row-major: A transposed;
// otherwise: B transposed).
4306 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4307 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4308 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4309 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function that returns a TDMatDMatMultExpr<T1,T2> by const value and throws
// std::invalid_argument when the operand sizes are incompatible.
// NOTE(review): the function name, its parameters, the size check that guards the
// throw and the return statement are not visible in this listing; presumably this is
// the multiplication operator for a column-major and a row-major dense matrix - confirm.
4378 template<
typename T1
4380 inline const TDMatDMatMultExpr<T1,T2>
// Raised for mismatching operand sizes (the tested condition is not visible here).
4386 throw std::invalid_argument(
"Matrix sizes do not match" );
// Result-type selection for multiplying a matrix product (operands MT1, MT2) with a
// dense vector VT.
// NOTE(review): the struct header is not visible in this listing; presumably a
// TDMatDVecMultExprTrait specialization for TDMatDMatMultExpr<MT1,MT2> - confirm.
4403 template<
typename MT1,
typename MT2,
typename VT >
// Type is valid only if MT1 is a dense column-major matrix, MT2 a dense row-major
// matrix and VT a dense, non-transpose vector; otherwise it is INVALID_TYPE.
// The selected type is built inside-out: first MT2 * VT, then MT1 * (that result).
4408 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4409 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4410 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4411 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4412 , INVALID_TYPE >::Type Type;
// Result-type selection for multiplying a matrix product (operands MT1, MT2) with a
// sparse vector VT.
// NOTE(review): the struct header is not visible in this listing; presumably a
// TDMatSVecMultExprTrait specialization for TDMatDMatMultExpr<MT1,MT2> - confirm.
4421 template<
typename MT1,
typename MT2,
typename VT >
// Type is valid only if MT1 is a dense column-major matrix, MT2 a dense row-major
// matrix and VT a sparse, non-transpose vector; otherwise it is INVALID_TYPE.
// The selected type is built inside-out: first MT2 * VT via DMatSVecMultExprTrait,
// then MT1 * (that result) via TDMatDVecMultExprTrait.
4426 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4427 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4428 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4429 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4430 , INVALID_TYPE >::Type Type;
// Result-type selection for multiplying a dense transpose vector VT with a matrix
// product (operands MT1, MT2).
// NOTE(review): the struct header is not visible in this listing; presumably a
// TDVecDMatMultExprTrait specialization for TDMatDMatMultExpr<MT1,MT2> - confirm.
4439 template<
typename VT,
typename MT1,
typename MT2 >
// Type is valid only if VT is a dense transpose vector, MT1 a dense column-major
// matrix and MT2 a dense row-major matrix; otherwise it is INVALID_TYPE.
// The selected type is built left-to-right: first VT * MT1, then (that result) * MT2.
4444 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4445 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4446 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4447 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4448 , INVALID_TYPE >::Type Type;
// Result-type selection for multiplying a sparse transpose vector VT with a matrix
// product (operands MT1, MT2).
// NOTE(review): the struct header is not visible in this listing; presumably a
// TSVecDMatMultExprTrait specialization for TDMatDMatMultExpr<MT1,MT2> - confirm.
4457 template<
typename VT,
typename MT1,
typename MT2 >
// Type is valid only if VT is a sparse transpose vector, MT1 a dense column-major
// matrix and MT2 a dense row-major matrix; otherwise it is INVALID_TYPE.
// The selected type is built left-to-right: first VT * MT1 via TSVecTDMatMultExprTrait,
// then (that result) * MT2 via TDVecDMatMultExprTrait.
4462 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4463 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4464 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4465 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4466 , INVALID_TYPE >::Type Type;
// Type of a row view of a matrix product with operands MT1 and MT2.
// NOTE(review): the struct header is not visible in this listing; presumably a
// RowExprTrait specialization for TDMatDMatMultExpr<MT1,MT2> - confirm.
4475 template<
typename MT1,
typename MT2 >
// The row of the product is expressed as (row of const MT1) * MT2, so only the left
// operand is restricted to a row.
4480 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Type of a column view of a matrix product with operands MT1 and MT2.
// NOTE(review): the struct header is not visible in this listing; presumably a
// ColumnExprTrait specialization for TDMatDMatMultExpr<MT1,MT2> - confirm.
4489 template<
typename MT1,
typename MT2 >
// The column of the product is expressed as MT1 * (column of const MT2), so only the
// right operand is restricted to a column.
4494 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;