22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// Head of the TDMatDMatMultExpr expression class. It derives from
// DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >.
// NOTE(review): the second template parameter (MT2) and the class braces are not
// visible in this excerpt.
95 template<
typename MT1
97 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
// Short aliases for the nested ResultType / CompositeType of the two operand types.
103 typedef typename MT1::ResultType
RT1;
104 typedef typename MT2::ResultType
RT2;
105 typedef typename MT1::CompositeType
CT1;
106 typedef typename MT2::CompositeType
CT2;
// Compile-time trait over three types T1, T2, T3.
// NOTE(review): the body is not visible in this excerpt. The trait gates the
// selectBlas*Kernel overloads that call cblas_sgemm, so it presumably tests for
// float element types -- confirm against the full definition.
114 template<
typename T1,
typename T2,
typename T3 >
115 struct UseSinglePrecisionKernel {
// Compile-time trait over three types T1, T2, T3.
// NOTE(review): the body is not visible in this excerpt. The trait gates the
// selectBlas*Kernel overloads that call cblas_dgemm, so it presumably tests for
// double element types -- confirm against the full definition.
128 template<
typename T1,
typename T2,
typename T3 >
129 struct UseDoublePrecisionKernel {
// Compile-time trait: 'value' is non-zero only if the ElementType of all three
// types T1, T2 and T3 is exactly complex<float>.
143 template<
typename T1,
typename T2,
typename T3 >
144 struct UseSinglePrecisionComplexKernel {
145 typedef complex<float> Type;
146 enum { value = IsSame<typename T1::ElementType,Type>::value &&
147 IsSame<typename T2::ElementType,Type>::value &&
148 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait: 'value' is non-zero only if the ElementType of all three
// types T1, T2 and T3 is exactly complex<double>.
159 template<
typename T1,
typename T2,
typename T3 >
160 struct UseDoublePrecisionComplexKernel {
161 typedef complex<double> Type;
162 enum { value = IsSame<typename T1::ElementType,Type>::value &&
163 IsSame<typename T2::ElementType,Type>::value &&
164 IsSame<typename T3::ElementType,Type>::value };
// Compile-time trait: 'value' is non-zero if BLAZE_BLAS_MODE evaluates to false, or
// if none of the four precision-specific kernel traits (single, double,
// single complex, double complex) holds for the type triple T1, T2, T3.
174 template<
typename T1,
typename T2,
typename T3 >
175 struct UseDefaultKernel {
176 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
177 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
178 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
179 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time trait: 'value' is non-zero only if all three types report
// 'vectorizable', all three share the same ElementType, and IntrinsicTrait of that
// element type reports both 'addition' and 'multiplication'.
189 template<
typename T1,
typename T2,
typename T3 >
190 struct UseVectorizedDefaultKernel {
191 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
192 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
193 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
194 IntrinsicTrait<typename T1::ElementType>::addition &&
195 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compile-time flag, fixed to 0 here.
// NOTE(review): presumably marks the expression itself as not vectorizable -- confirm.
226 enum { vectorizable = 0 };
// Fragment of an element computation (the enclosing signature is not visible).
// The work is guarded on the inner dimension lhs_.columns() being non-zero.
// 'end' is ((columns-1) rounded down to an even number) + 1, i.e. always odd; the
// k-loop starts at 1 and advances by 2. Only the (k+1) accumulation statement is
// visible in this excerpt. The trailing 'if' covers the case end < columns.
256 if(
lhs_.columns() != 0UL ) {
257 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
259 for(
size_t k=1UL; k<end; k+=2UL ) {
261 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
263 if( end <
lhs_.columns() ) {
// Returns the column count of the right-hand side operand (signature not visible).
291 return rhs_.columns();
// Template member over T (name/signature not visible): returns true if either
// operand reports being aliased with 'alias'.
321 template<
typename T >
323 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Second template member over T with the same visible body: true if either operand
// reports being aliased with 'alias'. NOTE(review): the distinguishing signature is
// not visible in this excerpt.
333 template<
typename T >
335 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Fragment of the routine that assigns the product 'rhs' to the target 'lhs'
// (signature not visible). Visible steps:
//  - a branch for an empty target (zero rows or zero columns),
//  - a branch for a zero inner dimension (rhs.lhs_.columns() == 0),
//  - dispatch to either selectDefaultAssignKernel or selectBlasAssignKernel on the
//    operands A and B.
// NOTE(review): the bodies of the first two branches and the condition choosing
// between the two kernels are not visible in this excerpt.
354 template<
typename MT
363 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
366 else if( rhs.lhs_.columns() == 0UL ) {
382 TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
384 TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Scalar default kernel for C = A * B.
//   C : target matrix, written element-wise via C(i,j)
//   A : left operand  (M = A.rows(), K = A.columns())
//   B : right operand (N = B.columns())
// For each row i, C(i,j) is first initialised with the k = 0 term and the remaining
// terms k = 1 .. K-1 are then accumulated.
// The k = 0 term is read unconditionally, so K must be at least 1.
// NOTE(review): the calling fragment tests rhs.lhs_.columns() == 0UL before
// dispatching, presumably to guarantee this -- confirm.
// NOTE(review): the return type / enable condition of this overload is not visible.
403 template<
typename MT3
407 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
409 const size_t M( A.rows() );
410 const size_t N( B.columns() );
411 const size_t K( A.columns() );
413 for(
size_t i=0UL; i<M; ++i ) {
// Initialise row i of C with the first (k = 0) product term.
414 for(
size_t j=0UL; j<N; ++j ) {
415 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulate the remaining product terms.
417 for(
size_t k=1UL; k<K; ++k ) {
418 for(
size_t j=0UL; j<N; ++j ) {
419 C(i,j) += A(i,k) * B(k,j);
// Vectorized default kernel for C = A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target type DenseMatrix<MT3,false>.
// Column index j advances in blocks of 8, 4 and 2 intrinsic vectors (IT::size
// elements each), followed by a remainder stage. Products are accumulated in the
// xmm registers over k and then written to C with store().
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably the padded row length, so whole vectors can be
// processed -- confirm.
// NOTE(review): several lines (declarations of j, i, a1, a2, b1.. and some loop
// headers) are not visible in this excerpt.
441 template<
typename MT3
444 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
445 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
447 typedef IntrinsicTrait<ElementType> IT;
449 const size_t M( A.rows() );
450 const size_t N( B.spacing() );
451 const size_t K( A.columns() );
// Stage 1: blocks of 8 intrinsic vectors along j, one row i at a time.
455 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
456 for(
size_t i=0UL; i<M; ++i ) {
457 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
458 for(
size_t k=0UL; k<K; ++k ) {
460 xmm1 = xmm1 + a1 * B.get(k,j );
461 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
462 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
463 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
464 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
465 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
466 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
467 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
469 store( &(~C)(i,j ), xmm1 );
470 store( &(~C)(i,j+IT::size ), xmm2 );
471 store( &(~C)(i,j+IT::size*2UL), xmm3 );
472 store( &(~C)(i,j+IT::size*3UL), xmm4 );
473 store( &(~C)(i,j+IT::size*4UL), xmm5 );
474 store( &(~C)(i,j+IT::size*5UL), xmm6 );
475 store( &(~C)(i,j+IT::size*6UL), xmm7 );
476 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Stage 2: blocks of 4 intrinsic vectors along j; rows are handled in pairs
// (i, i+1), then a leftover single row.
479 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
481 for( ; (i+2UL) <= M; i+=2UL ) {
482 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
483 for(
size_t k=0UL; k<K; ++k ) {
490 xmm1 = xmm1 + a1 * b1;
491 xmm2 = xmm2 + a1 * b2;
492 xmm3 = xmm3 + a1 * b3;
493 xmm4 = xmm4 + a1 * b4;
494 xmm5 = xmm5 + a2 * b1;
495 xmm6 = xmm6 + a2 * b2;
496 xmm7 = xmm7 + a2 * b3;
497 xmm8 = xmm8 + a2 * b4;
499 store( &(~C)(i ,j ), xmm1 );
500 store( &(~C)(i ,j+IT::size ), xmm2 );
501 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
502 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
503 store( &(~C)(i+1UL,j ), xmm5 );
504 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
505 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
506 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
510 for(
size_t k=0UL; k<K; ++k ) {
512 xmm1 = xmm1 + a1 * B.get(k,j );
513 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
514 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
515 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
517 store( &(~C)(i,j ), xmm1 );
518 store( &(~C)(i,j+IT::size ), xmm2 );
519 store( &(~C)(i,j+IT::size*2UL), xmm3 );
520 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Stage 3: blocks of 2 intrinsic vectors along j; row pairs, then a leftover row.
523 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
525 for( ; (i+2UL) <= M; i+=2UL ) {
527 for(
size_t k=0UL; k<K; ++k ) {
532 xmm1 = xmm1 + a1 * b1;
533 xmm2 = xmm2 + a1 * b2;
534 xmm3 = xmm3 + a2 * b1;
535 xmm4 = xmm4 + a2 * b2;
537 store( &(~C)(i ,j ), xmm1 );
538 store( &(~C)(i ,j+IT::size), xmm2 );
539 store( &(~C)(i+1UL,j ), xmm3 );
540 store( &(~C)(i+1UL,j+IT::size), xmm4 );
544 for(
size_t k=0UL; k<K; ++k ) {
546 xmm1 = xmm1 + a1 * B.get(k,j );
547 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
549 store( &(~C)(i,j ), xmm1 );
550 store( &(~C)(i,j+IT::size), xmm2 );
// Remainder stage: one intrinsic vector along j (the enclosing j-loop header is not
// visible here); row pairs first, then a leftover single row.
// NOTE(review): set() presumably broadcasts a scalar of A into a vector -- confirm.
555 for( ; (i+2UL) <= M; i+=2UL ) {
557 for(
size_t k=0UL; k<K; ++k ) {
559 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
560 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
562 store( &(~C)(i ,j), xmm1 );
563 store( &(~C)(i+1UL,j), xmm2 );
567 for(
size_t k=0UL; k<K; ++k ) {
568 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
570 store( &(~C)(i,j), xmm1 );
// Vectorized default kernel for C = A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target type DenseMatrix<MT3,true>.
// Mirror image of the DenseMatrix<MT3,false> overload: here the row index i advances
// in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements each), and vectors are
// loaded from A via A.get().
// M is taken from A.spacing(), not A.rows().
// NOTE(review): presumably the padded column length, so whole vectors can be
// processed -- confirm.
// NOTE(review): several lines (declarations of i, j, a1.., b1.. and some loop
// headers) are not visible in this excerpt.
591 template<
typename MT3
594 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
595 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
597 typedef IntrinsicTrait<ElementType> IT;
599 const size_t M( A.spacing() );
600 const size_t N( B.columns() );
601 const size_t K( A.columns() );
// Stage 1: blocks of 8 intrinsic vectors along i, one column j at a time.
605 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
606 for(
size_t j=0UL; j<N; ++j ) {
607 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
608 for(
size_t k=0UL; k<K; ++k ) {
610 xmm1 = xmm1 + A.get(i ,k) * b1;
611 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
612 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
613 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
614 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
615 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
616 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
617 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
619 store( &(~C)(i ,j), xmm1 );
620 store( &(~C)(i+IT::size ,j), xmm2 );
621 store( &(~C)(i+IT::size*2UL,j), xmm3 );
622 store( &(~C)(i+IT::size*3UL,j), xmm4 );
623 store( &(~C)(i+IT::size*4UL,j), xmm5 );
624 store( &(~C)(i+IT::size*5UL,j), xmm6 );
625 store( &(~C)(i+IT::size*6UL,j), xmm7 );
626 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Stage 2: blocks of 4 intrinsic vectors along i; columns are handled in pairs
// (j, j+1), then a leftover single column.
629 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
631 for( ; (j+2UL) <= N; j+=2UL ) {
632 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
633 for(
size_t k=0UL; k<K; ++k ) {
640 xmm1 = xmm1 + a1 * b1;
641 xmm2 = xmm2 + a2 * b1;
642 xmm3 = xmm3 + a3 * b1;
643 xmm4 = xmm4 + a4 * b1;
644 xmm5 = xmm5 + a1 * b2;
645 xmm6 = xmm6 + a2 * b2;
646 xmm7 = xmm7 + a3 * b2;
647 xmm8 = xmm8 + a4 * b2;
649 store( &(~C)(i ,j ), xmm1 );
650 store( &(~C)(i+IT::size ,j ), xmm2 );
651 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
652 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
653 store( &(~C)(i ,j+1UL), xmm5 );
654 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
655 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
656 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
660 for(
size_t k=0UL; k<K; ++k ) {
662 xmm1 = xmm1 + A.get(i ,k) * b1;
663 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
664 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
665 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
667 store( &(~C)(i ,j), xmm1 );
668 store( &(~C)(i+IT::size ,j), xmm2 );
669 store( &(~C)(i+IT::size*2UL,j), xmm3 );
670 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Stage 3: blocks of 2 intrinsic vectors along i; column pairs, then a leftover column.
673 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
675 for( ; (j+2UL) <= N; j+=2UL ) {
677 for(
size_t k=0UL; k<K; ++k ) {
682 xmm1 = xmm1 + a1 * b1;
683 xmm2 = xmm2 + a2 * b1;
684 xmm3 = xmm3 + a1 * b2;
685 xmm4 = xmm4 + a2 * b2;
687 store( &(~C)(i ,j ), xmm1 );
688 store( &(~C)(i+IT::size,j ), xmm2 );
689 store( &(~C)(i ,j+1UL), xmm3 );
690 store( &(~C)(i+IT::size,j+1UL), xmm4 );
694 for(
size_t k=0UL; k<K; ++k ) {
696 xmm1 = xmm1 + A.get(i ,k) * b1;
697 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
699 store( &(~C)(i ,j), xmm1 );
700 store( &(~C)(i+IT::size,j), xmm2 );
// Remainder stage: one intrinsic vector along i (the enclosing i-loop header is not
// visible here); column pairs first, then a leftover single column.
// NOTE(review): set() presumably broadcasts a scalar of B into a vector -- confirm.
705 for( ; (j+2UL) <= N; j+=2UL ) {
707 for(
size_t k=0UL; k<K; ++k ) {
709 xmm1 = xmm1 + a1 *
set( B(k,j ) );
710 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
712 store( &(~C)(i,j ), xmm1 );
713 store( &(~C)(i,j+1UL), xmm2 );
717 for(
size_t k=0UL; k<K; ++k ) {
718 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
720 store( &(~C)(i,j), xmm1 );
// Fallback overload of the BLAS assignment dispatch: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to
// selectDefaultAssignKernel( C, A, B ).
741 template<
typename MT3
744 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
745 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
747 selectDefaultAssignKernel( C, A, B );
// BLAS assignment kernel, enabled when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// Dimensions (M, N, K) and leading dimensions (lda, ldb, ldc, taken from spacing())
// are converted to int with boost::numeric_cast, then cblas_sgemm is called.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
// The scalars passed are 1.0F and 0.0F.
// NOTE(review): in the CBLAS gemm argument order these are alpha = 1 and beta = 0,
// i.e. C = A * B, overwriting C -- confirm.
767 template<
typename MT3
770 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
771 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
773 using boost::numeric_cast;
779 const int M ( numeric_cast<int>( A.rows() ) );
780 const int N ( numeric_cast<int>( B.columns() ) );
781 const int K ( numeric_cast<int>( A.columns() ) );
782 const int lda( numeric_cast<int>( A.spacing() ) );
783 const int ldb( numeric_cast<int>( B.spacing() ) );
784 const int ldc( numeric_cast<int>( C.spacing() ) );
786 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
787 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
788 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
789 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS assignment kernel, enabled when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// Dimensions (M, N, K) and leading dimensions (lda, ldb, ldc, taken from spacing())
// are converted to int with boost::numeric_cast, then cblas_dgemm is called.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
// The scalars passed are 1.0 and 0.0.
// NOTE(review): in the CBLAS gemm argument order these are alpha = 1 and beta = 0,
// i.e. C = A * B, overwriting C -- confirm.
810 template<
typename MT3
813 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
814 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
816 using boost::numeric_cast;
822 const int M ( numeric_cast<int>( A.rows() ) );
823 const int N ( numeric_cast<int>( B.columns() ) );
824 const int K ( numeric_cast<int>( A.columns() ) );
825 const int lda( numeric_cast<int>( A.spacing() ) );
826 const int ldb( numeric_cast<int>( B.spacing() ) );
827 const int ldc( numeric_cast<int>( C.spacing() ) );
829 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
830 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
831 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
832 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS assignment kernel, enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (all element types complex<float>).
// Dimensions and leading dimensions are converted to int with boost::numeric_cast.
// alpha = (1,0) and beta = (0,0) are passed by address to cblas_cgemm, so the call
// computes C = A * B and overwrites C.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
853 template<
typename MT3
856 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
857 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
859 using boost::numeric_cast;
868 const int M ( numeric_cast<int>( A.rows() ) );
869 const int N ( numeric_cast<int>( B.columns() ) );
870 const int K ( numeric_cast<int>( A.columns() ) );
871 const int lda( numeric_cast<int>( A.spacing() ) );
872 const int ldb( numeric_cast<int>( B.spacing() ) );
873 const int ldc( numeric_cast<int>( C.spacing() ) );
874 const complex<float> alpha( 1.0F, 0.0F );
875 const complex<float> beta ( 0.0F, 0.0F );
877 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
878 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
879 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
880 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS assignment kernel, enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>
// holds (all element types complex<double>).
// Dimensions and leading dimensions are converted to int with boost::numeric_cast.
// alpha = (1,0) and beta = (0,0) are passed by address to cblas_zgemm, so the call
// computes C = A * B and overwrites C.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
901 template<
typename MT3
904 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
905 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
907 using boost::numeric_cast;
916 const int M ( numeric_cast<int>( A.rows() ) );
917 const int N ( numeric_cast<int>( B.columns() ) );
918 const int K ( numeric_cast<int>( A.columns() ) );
919 const int lda( numeric_cast<int>( A.spacing() ) );
920 const int ldb( numeric_cast<int>( B.spacing() ) );
921 const int ldc( numeric_cast<int>( C.spacing() ) );
922 const complex<double> alpha( 1.0, 0.0 );
923 const complex<double> beta ( 0.0, 0.0 );
925 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
926 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
927 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
928 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a routine templated on MT (signature not visible): the expression 'rhs'
// is evaluated into a const temporary 'tmp' whose type is chosen between ResultType
// and OppositeType via SelectType on SO.
// NOTE(review): presumably ResultType is picked when SO is true, and tmp is then
// assigned to the target -- confirm against the full definition.
946 template<
typename MT
952 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
964 const TmpType tmp( rhs );
// Fragment of the routine that adds the product 'rhs' to the target 'lhs'
// (signature not visible). A single guard covers an empty target (zero rows or
// columns) and a zero inner dimension (rhs.lhs_.columns() == 0). The work is then
// dispatched to selectDefaultAddAssignKernel or selectBlasAddAssignKernel.
// NOTE(review): the guard body (presumably an early return) and the condition
// choosing between the two kernels are not visible in this excerpt.
983 template<
typename MT
992 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1007 TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
1009 TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Scalar default kernel for C += A * B, used when UseVectorizedDefaultKernel does NOT
// hold for <MT3,MT4,MT5> (DisableIf).
//   M = A.rows(), N = B.columns(), K = A.columns()
// 'end' is N rounded down to an even number; the innermost j-loop is unrolled by two
// over [0, end). The final statement updates column 'end', i.e. the odd leftover.
// NOTE(review): the guard around the leftover statement (presumably 'end < N') is
// not visible in this excerpt -- confirm.
1028 template<
typename MT3
1031 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1032 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1034 const size_t M( A.rows() );
1035 const size_t N( B.columns() );
1036 const size_t K( A.columns() );
1039 const size_t end( N &
size_t(-2) );
1041 for(
size_t i=0UL; i<M; ++i ) {
1042 for(
size_t k=0UL; k<K; ++k ) {
// Two columns per iteration.
1043 for(
size_t j=0UL; j<end; j+=2UL ) {
1044 C(i,j ) += A(i,k) * B(k,j );
1045 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Leftover column when N is odd.
1048 C(i,end) += A(i,k) * B(k,end);
// Vectorized default kernel for C += A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target type DenseMatrix<MT3,false>.
// Column index j advances in blocks of 8, 4 and 2 intrinsic vectors (IT::size
// elements each), followed by a remainder stage. Products are accumulated into the
// xmm registers over k and written back to C with store().
// N is taken from B.spacing(), not B.columns().
// NOTE(review): the lines that declare/initialise the xmm accumulators are not
// visible here; for an add-assign they are presumably loaded from C first -- confirm.
1070 template<
typename MT3
1073 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1074 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1076 typedef IntrinsicTrait<ElementType> IT;
1078 const size_t M( A.rows() );
1079 const size_t N( B.spacing() );
1080 const size_t K( A.columns() );
// Stage 1: blocks of 8 intrinsic vectors along j, one row i at a time.
1084 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1085 for(
size_t i=0UL; i<M; ++i ) {
1094 for(
size_t k=0UL; k<K; ++k ) {
1096 xmm1 = xmm1 + a1 * B.get(k,j );
1097 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1098 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1099 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1100 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
1101 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
1102 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
1103 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
1105 store( &(~C)(i,j ), xmm1 );
1106 store( &(~C)(i,j+IT::size ), xmm2 );
1107 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1108 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1109 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1110 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1111 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1112 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Stage 2: blocks of 4 intrinsic vectors along j; row pairs, then a leftover row.
1115 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1117 for( ; (i+2UL) <= M; i+=2UL ) {
1126 for(
size_t k=0UL; k<K; ++k ) {
1133 xmm1 = xmm1 + a1 * b1;
1134 xmm2 = xmm2 + a1 * b2;
1135 xmm3 = xmm3 + a1 * b3;
1136 xmm4 = xmm4 + a1 * b4;
1137 xmm5 = xmm5 + a2 * b1;
1138 xmm6 = xmm6 + a2 * b2;
1139 xmm7 = xmm7 + a2 * b3;
1140 xmm8 = xmm8 + a2 * b4;
1142 store( &(~C)(i ,j ), xmm1 );
1143 store( &(~C)(i ,j+IT::size ), xmm2 );
1144 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1145 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1146 store( &(~C)(i+1UL,j ), xmm5 );
1147 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1148 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1149 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1156 for(
size_t k=0UL; k<K; ++k ) {
1158 xmm1 = xmm1 + a1 * B.get(k,j );
1159 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1160 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1161 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1163 store( &(~C)(i,j ), xmm1 );
1164 store( &(~C)(i,j+IT::size ), xmm2 );
1165 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1166 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Stage 3: blocks of 2 intrinsic vectors along j; row pairs, then a leftover row.
1169 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1171 for( ; (i+2UL) <= M; i+=2UL ) {
1176 for(
size_t k=0UL; k<K; ++k ) {
1181 xmm1 = xmm1 + a1 * b1;
1182 xmm2 = xmm2 + a1 * b2;
1183 xmm3 = xmm3 + a2 * b1;
1184 xmm4 = xmm4 + a2 * b2;
1186 store( &(~C)(i ,j ), xmm1 );
1187 store( &(~C)(i ,j+IT::size), xmm2 );
1188 store( &(~C)(i+1UL,j ), xmm3 );
1189 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1194 for(
size_t k=0UL; k<K; ++k ) {
1196 xmm1 = xmm1 + a1 * B.get(k,j );
1197 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1199 store( &(~C)(i,j ), xmm1 );
1200 store( &(~C)(i,j+IT::size), xmm2 );
// Remainder stage: one intrinsic vector along j (enclosing j-loop header not visible
// here); row pairs first, then a leftover single row.
1205 for( ; (i+2UL) <= M; i+=2UL ) {
1208 for(
size_t k=0UL; k<K; ++k ) {
1210 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1211 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1213 store( &(~C)(i ,j), xmm1 );
1214 store( &(~C)(i+1UL,j), xmm2 );
1218 for(
size_t k=0UL; k<K; ++k ) {
1219 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
1221 store( &(~C)(i,j), xmm1 );
// Vectorized default kernel for C += A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target type DenseMatrix<MT3,true>.
// Row index i advances in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements
// each), followed by a remainder stage. Vectors are loaded from A via A.get().
// M is taken from A.spacing(), not A.rows().
// NOTE(review): the lines that declare/initialise the xmm accumulators are not
// visible here; for an add-assign they are presumably loaded from C first -- confirm.
1242 template<
typename MT3
1245 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1246 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1248 typedef IntrinsicTrait<ElementType> IT;
1250 const size_t M( A.spacing() );
1251 const size_t N( B.columns() );
1252 const size_t K( A.columns() );
// Stage 1: blocks of 8 intrinsic vectors along i, one column j at a time.
1256 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1257 for(
size_t j=0UL; j<N; ++j ) {
1266 for(
size_t k=0UL; k<K; ++k ) {
1268 xmm1 = xmm1 + A.get(i ,k) * b1;
1269 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1270 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1271 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1272 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1273 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1274 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1275 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1277 store( &(~C)(i ,j), xmm1 );
1278 store( &(~C)(i+IT::size ,j), xmm2 );
1279 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1280 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1281 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1282 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1283 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1284 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Stage 2: blocks of 4 intrinsic vectors along i; column pairs, then a leftover column.
1287 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1289 for( ; (j+2UL) <= N; j+=2UL ) {
1298 for(
size_t k=0UL; k<K; ++k ) {
1305 xmm1 = xmm1 + a1 * b1;
1306 xmm2 = xmm2 + a2 * b1;
1307 xmm3 = xmm3 + a3 * b1;
1308 xmm4 = xmm4 + a4 * b1;
1309 xmm5 = xmm5 + a1 * b2;
1310 xmm6 = xmm6 + a2 * b2;
1311 xmm7 = xmm7 + a3 * b2;
1312 xmm8 = xmm8 + a4 * b2;
1314 store( &(~C)(i ,j ), xmm1 );
1315 store( &(~C)(i+IT::size ,j ), xmm2 );
1316 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1317 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1318 store( &(~C)(i ,j+1UL), xmm5 );
1319 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1320 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1321 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1328 for(
size_t k=0UL; k<K; ++k ) {
1330 xmm1 = xmm1 + A.get(i ,k) * b1;
1331 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1332 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1333 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1335 store( &(~C)(i ,j), xmm1 );
1336 store( &(~C)(i+IT::size ,j), xmm2 );
1337 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1338 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Stage 3: blocks of 2 intrinsic vectors along i; column pairs, then a leftover column.
1341 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1343 for( ; (j+2UL) <= N; j+=2UL ) {
1348 for(
size_t k=0UL; k<K; ++k ) {
1353 xmm1 = xmm1 + a1 * b1;
1354 xmm2 = xmm2 + a2 * b1;
1355 xmm3 = xmm3 + a1 * b2;
1356 xmm4 = xmm4 + a2 * b2;
1358 store( &(~C)(i ,j ), xmm1 );
1359 store( &(~C)(i+IT::size,j ), xmm2 );
1360 store( &(~C)(i ,j+1UL), xmm3 );
1361 store( &(~C)(i+IT::size,j+1UL), xmm4 );
1366 for(
size_t k=0UL; k<K; ++k ) {
1368 xmm1 = xmm1 + A.get(i ,k) * b1;
1369 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1371 store( &(~C)(i ,j), xmm1 );
1372 store( &(~C)(i+IT::size,j), xmm2 );
// Remainder stage: one intrinsic vector along i (enclosing i-loop header not visible
// here); column pairs first, then a leftover single column.
1377 for( ; (j+2UL) <= N; j+=2UL ) {
1380 for(
size_t k=0UL; k<K; ++k ) {
1382 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1383 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1385 store( &(~C)(i,j ), xmm1 );
1386 store( &(~C)(i,j+1UL), xmm2 );
1390 for(
size_t k=0UL; k<K; ++k ) {
1391 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
1393 store( &(~C)(i,j), xmm1 );
// Fallback overload of the BLAS add-assign dispatch: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to
// selectDefaultAddAssignKernel( C, A, B ).
1414 template<
typename MT3
1417 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1418 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1420 selectDefaultAddAssignKernel( C, A, B );
// BLAS add-assign kernel, enabled when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// Dimensions and leading dimensions (from spacing()) are converted to int with
// boost::numeric_cast, then cblas_sgemm is called.
// Both scalars passed are 1.0F.
// NOTE(review): in the CBLAS gemm argument order these are alpha = 1 and beta = 1,
// i.e. C = A * B + C -- confirm.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
1440 template<
typename MT3
1443 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1444 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1446 using boost::numeric_cast;
1452 const int M ( numeric_cast<int>( A.rows() ) );
1453 const int N ( numeric_cast<int>( B.columns() ) );
1454 const int K ( numeric_cast<int>( A.columns() ) );
1455 const int lda( numeric_cast<int>( A.spacing() ) );
1456 const int ldb( numeric_cast<int>( B.spacing() ) );
1457 const int ldc( numeric_cast<int>( C.spacing() ) );
1459 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1460 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1461 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1462 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS add-assign kernel, enabled when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// Dimensions and leading dimensions (from spacing()) are converted to int with
// boost::numeric_cast, then cblas_dgemm is called.
// Both scalars passed are 1.0.
// NOTE(review): in the CBLAS gemm argument order these are alpha = 1 and beta = 1,
// i.e. C = A * B + C -- confirm.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
1483 template<
typename MT3
1486 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1487 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1489 using boost::numeric_cast;
1495 const int M ( numeric_cast<int>( A.rows() ) );
1496 const int N ( numeric_cast<int>( B.columns() ) );
1497 const int K ( numeric_cast<int>( A.columns() ) );
1498 const int lda( numeric_cast<int>( A.spacing() ) );
1499 const int ldb( numeric_cast<int>( B.spacing() ) );
1500 const int ldc( numeric_cast<int>( C.spacing() ) );
1502 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1503 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1504 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1505 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS add-assign kernel, enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (all element types complex<float>).
// Dimensions and leading dimensions are converted to int with boost::numeric_cast.
// alpha = (1,0) and beta = (1,0) are passed by address to cblas_cgemm, so the call
// computes C = A * B + C.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
1526 template<
typename MT3
1529 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1530 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1532 using boost::numeric_cast;
1541 const int M ( numeric_cast<int>( A.rows() ) );
1542 const int N ( numeric_cast<int>( B.columns() ) );
1543 const int K ( numeric_cast<int>( A.columns() ) );
1544 const int lda( numeric_cast<int>( A.spacing() ) );
1545 const int ldb( numeric_cast<int>( B.spacing() ) );
1546 const int ldc( numeric_cast<int>( C.spacing() ) );
1547 const complex<float> alpha( 1.0F, 0.0F );
1548 const complex<float> beta ( 1.0F, 0.0F );
1550 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1551 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1552 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1553 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS add-assign kernel, enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>
// holds (all element types complex<double>).
// Dimensions and leading dimensions are converted to int with boost::numeric_cast.
// alpha = (1,0) and beta = (1,0) are passed by address to cblas_zgemm, so the call
// computes C = A * B + C.
// Storage order and transpose flags depend on IsRowMajorMatrix<MT3>: for a row-major
// target A is flagged CblasTrans and B CblasNoTrans; otherwise the flags are swapped.
1574 template<
typename MT3
1577 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1578 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1580 using boost::numeric_cast;
1589 const int M ( numeric_cast<int>( A.rows() ) );
1590 const int N ( numeric_cast<int>( B.columns() ) );
1591 const int K ( numeric_cast<int>( A.columns() ) );
1592 const int lda( numeric_cast<int>( A.spacing() ) );
1593 const int ldb( numeric_cast<int>( B.spacing() ) );
1594 const int ldc( numeric_cast<int>( C.spacing() ) );
1595 const complex<double> alpha( 1.0, 0.0 );
1596 const complex<double> beta ( 1.0, 0.0 );
1598 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1599 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1600 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1601 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of the routine that subtracts the product 'rhs' from the target 'lhs'
// (signature not visible). A single guard covers an empty target (zero rows or
// columns) and a zero inner dimension (rhs.lhs_.columns() == 0). The work is then
// dispatched to selectDefaultSubAssignKernel or selectBlasSubAssignKernel.
// NOTE(review): the guard body (presumably an early return) and the condition
// choosing between the two kernels are not visible in this excerpt.
1624 template<
typename MT
1633 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1648 TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1650 TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Scalar default kernel for C -= A * B, used when UseVectorizedDefaultKernel does NOT
// hold for <MT3,MT4,MT5> (DisableIf).
//   M = A.rows(), N = B.columns(), K = A.columns()
// 'end' is N rounded down to an even number; the innermost j-loop is unrolled by two
// over [0, end). The final statement updates column 'end', i.e. the odd leftover.
// NOTE(review): the guard around the leftover statement (presumably 'end < N') is
// not visible in this excerpt -- confirm.
1669 template<
typename MT3
1672 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1673 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1675 const size_t M( A.rows() );
1676 const size_t N( B.columns() );
1677 const size_t K( A.columns() );
1680 const size_t end( N &
size_t(-2) );
1682 for(
size_t i=0UL; i<M; ++i ) {
1683 for(
size_t k=0UL; k<K; ++k ) {
// Two columns per iteration.
1684 for(
size_t j=0UL; j<end; j+=2UL ) {
1685 C(i,j ) -= A(i,k) * B(k,j );
1686 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover column when N is odd.
1689 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default kernel for C -= A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target type DenseMatrix<MT3,false>.
// Column index j advances in blocks of 8, 4 and 2 intrinsic vectors (IT::size
// elements each), followed by a remainder stage. Products are subtracted from the
// xmm registers over k and written back to C with store().
// N is taken from B.spacing(), not B.columns().
// NOTE(review): the lines that declare/initialise the xmm accumulators are not
// visible here; for a sub-assign they are presumably loaded from C first -- confirm.
1711 template<
typename MT3
1714 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1715 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1717 typedef IntrinsicTrait<ElementType> IT;
1719 const size_t M( A.rows() );
1720 const size_t N( B.spacing() );
1721 const size_t K( A.columns() );
// Stage 1: blocks of 8 intrinsic vectors along j, one row i at a time.
1725 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1726 for(
size_t i=0UL; i<M; ++i ) {
1735 for(
size_t k=0UL; k<K; ++k ) {
1737 xmm1 = xmm1 - a1 * B.get(k,j );
1738 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1739 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1740 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1741 xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1742 xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1743 xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1744 xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1746 store( &(~C)(i,j ), xmm1 );
1747 store( &(~C)(i,j+IT::size ), xmm2 );
1748 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1749 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1750 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1751 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1752 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1753 store( &(~C)(i,j+IT::size*7UL), xmm8 );
// Stage 2: blocks of 4 intrinsic vectors along j; row pairs, then a leftover row.
1756 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1758 for( ; (i+2UL) <= M; i+=2UL ) {
1767 for(
size_t k=0UL; k<K; ++k ) {
1774 xmm1 = xmm1 - a1 * b1;
1775 xmm2 = xmm2 - a1 * b2;
1776 xmm3 = xmm3 - a1 * b3;
1777 xmm4 = xmm4 - a1 * b4;
1778 xmm5 = xmm5 - a2 * b1;
1779 xmm6 = xmm6 - a2 * b2;
1780 xmm7 = xmm7 - a2 * b3;
1781 xmm8 = xmm8 - a2 * b4;
1783 store( &(~C)(i ,j ), xmm1 );
1784 store( &(~C)(i ,j+IT::size ), xmm2 );
1785 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1786 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1787 store( &(~C)(i+1UL,j ), xmm5 );
1788 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1789 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1790 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1797 for(
size_t k=0UL; k<K; ++k ) {
1799 xmm1 = xmm1 - a1 * B.get(k,j );
1800 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1801 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1802 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1804 store( &(~C)(i,j ), xmm1 );
1805 store( &(~C)(i,j+IT::size ), xmm2 );
1806 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1807 store( &(~C)(i,j+IT::size*3UL), xmm4 );
// Stage 3: blocks of 2 intrinsic vectors along j; row pairs, then a leftover row.
1810 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1812 for( ; (i+2UL) <= M; i+=2UL ) {
1817 for(
size_t k=0UL; k<K; ++k ) {
1822 xmm1 = xmm1 - a1 * b1;
1823 xmm2 = xmm2 - a1 * b2;
1824 xmm3 = xmm3 - a2 * b1;
1825 xmm4 = xmm4 - a2 * b2;
1827 store( &(~C)(i ,j ), xmm1 );
1828 store( &(~C)(i ,j+IT::size), xmm2 );
1829 store( &(~C)(i+1UL,j ), xmm3 );
1830 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1835 for(
size_t k=0UL; k<K; ++k ) {
1837 xmm1 = xmm1 - a1 * B.get(k,j );
1838 xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1840 store( &(~C)(i,j ), xmm1 );
1841 store( &(~C)(i,j+IT::size), xmm2 );
// Remainder stage: one intrinsic vector along j (enclosing j-loop header not visible
// here); row pairs first, then a leftover single row.
1846 for( ; (i+2UL) <= M; i+=2UL ) {
1849 for(
size_t k=0UL; k<K; ++k ) {
1851 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1852 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1854 store( &(~C)(i ,j), xmm1 );
1855 store( &(~C)(i+1UL,j), xmm2 );
1859 for(
size_t k=0UL; k<K; ++k ) {
1860 xmm1 = xmm1 -
set( A(i,k) ) * B.get(k,j);
1862 store( &(~C)(i,j), xmm1 );
// Vectorized default kernel for C -= A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target type DenseMatrix<MT3,true>.
// Row index i advances in blocks of 8, 4 and 2 intrinsic vectors (IT::size elements
// each), followed by a remainder stage. Vectors are loaded from A via A.get().
// M is taken from A.spacing(), not A.rows().
// NOTE(review): the lines that declare/initialise the xmm accumulators are not
// visible here; for a sub-assign they are presumably loaded from C first -- confirm.
1883 template<
typename MT3
1886 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1887 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1889 typedef IntrinsicTrait<ElementType> IT;
1891 const size_t M( A.spacing() );
1892 const size_t N( B.columns() );
1893 const size_t K( A.columns() );
// Stage 1: blocks of 8 intrinsic vectors along i, one column j at a time.
1897 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1898 for(
size_t j=0UL; j<N; ++j ) {
1907 for(
size_t k=0UL; k<K; ++k ) {
1909 xmm1 = xmm1 - A.get(i ,k) * b1;
1910 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1911 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1912 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1913 xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1914 xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1915 xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1916 xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1918 store( &(~C)(i ,j), xmm1 );
1919 store( &(~C)(i+IT::size ,j), xmm2 );
1920 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1921 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1922 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1923 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1924 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1925 store( &(~C)(i+IT::size*7UL,j), xmm8 );
// Stage 2: blocks of 4 intrinsic vectors along i; column pairs, then a leftover column.
1928 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1930 for( ; (j+2UL) <= N; j+=2UL ) {
1939 for(
size_t k=0UL; k<K; ++k ) {
1946 xmm1 = xmm1 - a1 * b1;
1947 xmm2 = xmm2 - a2 * b1;
1948 xmm3 = xmm3 - a3 * b1;
1949 xmm4 = xmm4 - a4 * b1;
1950 xmm5 = xmm5 - a1 * b2;
1951 xmm6 = xmm6 - a2 * b2;
1952 xmm7 = xmm7 - a3 * b2;
1953 xmm8 = xmm8 - a4 * b2;
1955 store( &(~C)(i ,j ), xmm1 );
1956 store( &(~C)(i+IT::size ,j ), xmm2 );
1957 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1958 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1959 store( &(~C)(i ,j+1UL), xmm5 );
1960 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1961 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1962 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1969 for(
size_t k=0UL; k<K; ++k ) {
1971 xmm1 = xmm1 - A.get(i ,k) * b1;
1972 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1973 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1974 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1976 store( &(~C)(i ,j), xmm1 );
1977 store( &(~C)(i+IT::size ,j), xmm2 );
1978 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1979 store( &(~C)(i+IT::size*3UL,j), xmm4 );
// Stage 3: blocks of 2 intrinsic vectors along i; column pairs, then a leftover column.
1982 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1984 for( ; (j+2UL) <= N; j+=2UL ) {
1989 for(
size_t k=0UL; k<K; ++k ) {
1994 xmm1 = xmm1 - a1 * b1;
1995 xmm2 = xmm2 - a2 * b1;
1996 xmm3 = xmm3 - a1 * b2;
1997 xmm4 = xmm4 - a2 * b2;
1999 store( &(~C)(i ,j ), xmm1 );
2000 store( &(~C)(i+IT::size,j ), xmm2 );
2001 store( &(~C)(i ,j+1UL), xmm3 );
2002 store( &(~C)(i+IT::size,j+1UL), xmm4 );
2007 for(
size_t k=0UL; k<K; ++k ) {
2009 xmm1 = xmm1 - A.get(i ,k) * b1;
2010 xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
2012 store( &(~C)(i ,j), xmm1 );
2013 store( &(~C)(i+IT::size,j), xmm2 );
// Remainder stage: one intrinsic vector along i (enclosing i-loop header not visible
// here); column pairs first, then a leftover single column.
2018 for( ; (j+2UL) <= N; j+=2UL ) {
2021 for(
size_t k=0UL; k<K; ++k ) {
2023 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2024 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2026 store( &(~C)(i,j ), xmm1 );
2027 store( &(~C)(i,j+1UL), xmm2 );
2031 for(
size_t k=0UL; k<K; ++k ) {
2032 xmm1 = xmm1 - A.get(i,k) *
set( B(k,j) );
2034 store( &(~C)(i,j), xmm1 );
// Fallback overload of the BLAS sub-assign dispatch: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to
// selectDefaultSubAssignKernel( C, A, B ).
2055 template<
typename MT3
2058 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2059 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2061 selectDefaultSubAssignKernel( C, A, B );
// BLAS-backed subtraction assignment for single precision operands (C -= A*B).
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5> is true.
//   C  target matrix, updated in place through C.data()
//   A  left-hand side operand of the product
//   B  right-hand side operand of the product
2081 template<
typename MT3
2084 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2085 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2087 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
2093 const int M ( numeric_cast<int>( A.rows() ) );
2094 const int N ( numeric_cast<int>( B.columns() ) );
2095 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
2096 const int lda( numeric_cast<int>( A.spacing() ) );
2097 const int ldb( numeric_cast<int>( B.spacing() ) );
2098 const int ldc( numeric_cast<int>( C.spacing() ) );
// gemm evaluates C = alpha*op(A)*op(B) + beta*C; alpha = -1 and beta = 1
// therefore subtract the product from C. The storage order handed to BLAS
// follows the target type MT3: for a row-major target A is flagged as
// transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
2100 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2101 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2102 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2103 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction assignment for double precision operands (C -= A*B).
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5> is true.
//   C  target matrix, updated in place through C.data()
//   A  left-hand side operand of the product
//   B  right-hand side operand of the product
2124 template<
typename MT3
2127 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2128 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2130 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
2136 const int M ( numeric_cast<int>( A.rows() ) );
2137 const int N ( numeric_cast<int>( B.columns() ) );
2138 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
2139 const int lda( numeric_cast<int>( A.spacing() ) );
2140 const int ldb( numeric_cast<int>( B.spacing() ) );
2141 const int ldc( numeric_cast<int>( C.spacing() ) );
// gemm evaluates C = alpha*op(A)*op(B) + beta*C; alpha = -1 and beta = 1
// therefore subtract the product from C. The storage order handed to BLAS
// follows the target type MT3: for a row-major target A is flagged as
// transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
2143 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2144 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2145 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2146 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction assignment for complex<float> operands (C -= A*B).
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when
// all three element types are exactly complex<float>.
//   C  target matrix, updated in place through C.data()
//   A  left-hand side operand of the product
//   B  right-hand side operand of the product
2167 template<
typename MT3
2170 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2171 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2173 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
2182 const int M ( numeric_cast<int>( A.rows() ) );
2183 const int N ( numeric_cast<int>( B.columns() ) );
2184 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
2185 const int lda( numeric_cast<int>( A.spacing() ) );
2186 const int ldb( numeric_cast<int>( B.spacing() ) );
2187 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = -1 and beta = 1 subtract the product from the existing C.
2188 const complex<float> alpha( -1.0F, 0.0F );
2189 const complex<float> beta ( 1.0F, 0.0F );
// The storage order handed to BLAS follows the target type MT3: for a
// row-major target A is flagged as transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
2191 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2192 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2193 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2194 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction assignment for complex<double> operands (C -= A*B).
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when
// all three element types are exactly complex<double>.
//   C  target matrix, updated in place through C.data()
//   A  left-hand side operand of the product
//   B  right-hand side operand of the product
2215 template<
typename MT3
2218 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2219 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2221 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
2230 const int M ( numeric_cast<int>( A.rows() ) );
2231 const int N ( numeric_cast<int>( B.columns() ) );
2232 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
2233 const int lda( numeric_cast<int>( A.spacing() ) );
2234 const int ldb( numeric_cast<int>( B.spacing() ) );
2235 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = -1 and beta = 1 subtract the product from the existing C.
2236 const complex<double> alpha( -1.0, 0.0 );
2237 const complex<double> beta ( 1.0, 0.0 );
// The storage order handed to BLAS follows the target type MT3: for a
// row-major target A is flagged as transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
2239 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2240 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2241 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2242 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Specialization of DMatScalarMultExpr for a scaled product, as named by the
// CRTP base below: DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >.
// NOTE(review): the remaining template parameters (MT2, ST) and the class name
// line are not visible here - confirm against the full declaration.
2288 template<
typename MT1
2292 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2293 ,
private Expression
2294 ,
private Computation
// Shorthands: MMM is the wrapped matrix/matrix multiplication expression and
// RES its result type; RT1/RT2 and CT1/CT2 are the result and composite types
// of the two operands of that multiplication.
2298 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
2299 typedef typename MMM::ResultType RES;
2300 typedef typename MT1::ResultType
RT1;
2301 typedef typename MT2::ResultType
RT2;
2302 typedef typename MT1::CompositeType
CT1;
2303 typedef typename MT2::CompositeType
CT2;
// Compile-time switch for the single precision BLAS kernel (sgemm).
// value is nonzero when the element types of T1, T2 and T3 all satisfy IsFloat
// and the scalar type T4 is not complex.
2311 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2312 struct UseSinglePrecisionKernel {
2313 enum { value = IsFloat<typename T1::ElementType>::value &&
2314 IsFloat<typename T2::ElementType>::value &&
2315 IsFloat<typename T3::ElementType>::value &&
2316 !IsComplex<T4>::value };
// Compile-time switch for the double precision BLAS kernel (dgemm).
// value is nonzero when the element types of T1, T2 and T3 all satisfy
// IsDouble and the scalar type T4 is not complex.
2325 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2326 struct UseDoublePrecisionKernel {
2327 enum { value = IsDouble<typename T1::ElementType>::value &&
2328 IsDouble<typename T2::ElementType>::value &&
2329 IsDouble<typename T3::ElementType>::value &&
2330 !IsComplex<T4>::value };
// Compile-time switch for the single precision complex BLAS kernel (cgemm).
// value is nonzero only when the element types of T1, T2 and T3 are all
// exactly complex<float>. The scalar type is not part of this test.
2339 template<
typename T1,
typename T2,
typename T3 >
2340 struct UseSinglePrecisionComplexKernel {
2341 typedef complex<float> Type;
2342 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2343 IsSame<typename T2::ElementType,Type>::value &&
2344 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the double precision complex BLAS kernel (zgemm).
// value is nonzero only when the element types of T1, T2 and T3 are all
// exactly complex<double>. The scalar type is not part of this test.
2353 template<
typename T1,
typename T2,
typename T3 >
2354 struct UseDoublePrecisionComplexKernel {
2355 typedef complex<double> Type;
2356 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2357 IsSame<typename T2::ElementType,Type>::value &&
2358 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS (default) kernels.
// value is nonzero when BLAZE_BLAS_MODE is off, or when none of the four
// BLAS kernel switches (single, double, complex single, complex double)
// applies to the given matrix types T1-T3 and scalar type T4.
2366 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2367 struct UseDefaultKernel {
2368 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2369 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2370 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2371 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch between the vectorized and the scalar default kernels.
// value is nonzero when all three matrix types are vectorizable, all three
// element types and the scalar type T4 are the same type, and IntrinsicTrait
// reports both addition and multiplication for that element type.
2379 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2380 struct UseVectorizedDefaultKernel {
2381 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2382 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2383 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2384 IsSame<typename T1::ElementType,T4>::value &&
2385 IntrinsicTrait<typename T1::ElementType>::addition &&
2386 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression object.
2392 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type: what MultTrait yields for the product result type RES scaled
// by the scalar type ST.
2393 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Result type with the opposite storage order.
2394 typedef typename ResultType::OppositeType
OppositeType;
// Element type of the result and the matching intrinsic (SIMD) type.
2396 typedef typename ResultType::ElementType
ElementType;
2397 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the wrapped matrix/matrix multiplication operand.
2402 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// LT / RT: types used to hold the left and right multiplication operands.
// If an operand is itself a computation (IsComputation), its result type is
// selected; otherwise its composite type is selected.
// NOTE(review): assumes SelectType<C,A,B> picks A when C is true - confirm.
2408 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2411 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// This expression does not advertise itself as vectorizable.
2416 enum { vectorizable = 0 };
// Constructor: takes the matrix multiplication expression and the scalar.
// NOTE(review): the initializer list is not visible here; presumably it
// stores the arguments in matrix_ and scalar_ - confirm.
2425 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: the (i,j) element of the wrapped product times the scalar.
// NOTE(review): the signature of this accessor is not visible here.
2441 return matrix_(i,j) * scalar_;
// Returns the number of rows, as reported by the wrapped product expression.
2450 inline size_t rows()
const {
2451 return matrix_.rows();
// Returns the number of columns, as reported by the wrapped product expression.
2460 inline size_t columns()
const {
2461 return matrix_.columns();
// Reports whether this expression can alias with the given address.
// The scalar is not considered; the query is delegated to the wrapped
// product expression.
//   alias  address to check
2491 template<
typename T >
2492 inline bool canAlias(
const T* alias )
const {
2493 return matrix_.canAlias( alias );
// Reports whether this expression is aliased with the given address.
// The scalar is not considered; the query is delegated to the wrapped
// product expression.
//   alias  address to check
2503 template<
typename T >
2504 inline bool isAliased(
const T* alias )
const {
2505 return matrix_.isAliased( alias );
// Assignment of the scaled product to a dense matrix: lhs = scalar * (A*B).
//   lhs  dense target matrix
//   rhs  scaled multiplication expression to evaluate
2524 template<
typename MT3
2526 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped multiplication.
2533 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2534 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Two special cases are tested first: an empty target, and an inner
// dimension of zero.
// NOTE(review): the bodies of both branches are not visible here; presumably
// the first returns and the second resets lhs before returning - confirm.
2536 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
2539 else if( left.columns() == 0UL ) {
// Dispatch to either the default or the BLAS kernel family.
// NOTE(review): the declarations of A and B and the condition choosing
// between the two calls are not visible here - confirm.
2555 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2557 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Scalar (non-vectorized) default kernel for C = scalar * (A*B).
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false.
//   C       target matrix
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
2575 template<
typename MT3
2579 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2580 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Row i of C is built in passes.
2582 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Pass 1: initialise row i from the first term (inner index 0) of each sum.
// A(i,0) and B(0,k) are read unconditionally, so the inner dimension must be
// at least 1 (the dense assign() tests left.columns() == 0UL before
// dispatching).
2583 for(
size_t k=0UL; k<B.columns(); ++k ) {
2584 C(i,k) = A(i,0UL) * B(0UL,k);
// Pass 2: accumulate the remaining terms, inner index j = 1 .. A.columns()-1.
2586 for(
size_t j=1UL; j<A.columns(); ++j ) {
2587 for(
size_t k=0UL; k<B.columns(); ++k ) {
2588 C(i,k) += A(i,j) * B(j,k);
// Pass 3: a final sweep over the columns of row i.
// NOTE(review): the loop body is not visible here; presumably it applies
// `scalar` to C(i,k) - confirm.
2591 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default kernel for C = scalar * (A*B) with a row-major target
// (DenseMatrix<MT3,false>). Selected when UseVectorizedDefaultKernel holds.
//   C       row-major target matrix
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
// The columns of C are processed in blocks of 8, 4 and 2 intrinsic vectors
// (IT::size elements each) followed by a final column loop. Every block sums
// a(i,k) * B(k,j..) over k into xmm accumulators, which are multiplied by
// `factor` when stored.
// NOTE(review): the declarations of `factor`, the counters i/j, the operands
// a1/a2/b1..b4 and some loop headers are not visible here; `factor` is
// presumably `scalar` broadcast to an IntrinsicType and the xmm accumulators
// are presumably zero on construction - confirm.
2612 template<
typename MT3
2616 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2617 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2619 typedef IntrinsicTrait<ElementType> IT;
// N is B.spacing(), not B.columns(): the column loops run over the spacing
// of B (presumably the padded row length - confirm).
2621 const size_t M( A.rows() );
2622 const size_t N( B.spacing() );
2623 const size_t K( A.columns() );
// Block of 8 vectors: one row of C at a time, 8 accumulators.
2629 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2630 for(
size_t i=0UL; i<M; ++i ) {
2631 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2632 for(
size_t k=0UL; k<K; ++k ) {
2634 xmm1 = xmm1 + a1 * B.get(k,j );
2635 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2636 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2637 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2638 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2639 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2640 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2641 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2643 store( &(~C)(i,j ), xmm1 * factor );
2644 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2645 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2646 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2647 store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2648 store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2649 store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2650 store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
// Block of 4 vectors: rows are taken in pairs; xmm1-4 hold row i and
// xmm5-8 hold row i+1.
2653 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2655 for( ; (i+2UL) <= M; i+=2UL ) {
2656 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2657 for(
size_t k=0UL; k<K; ++k ) {
2664 xmm1 = xmm1 + a1 * b1;
2665 xmm2 = xmm2 + a1 * b2;
2666 xmm3 = xmm3 + a1 * b3;
2667 xmm4 = xmm4 + a1 * b4;
2668 xmm5 = xmm5 + a2 * b1;
2669 xmm6 = xmm6 + a2 * b2;
2670 xmm7 = xmm7 + a2 * b3;
2671 xmm8 = xmm8 + a2 * b4;
2673 store( &(~C)(i ,j ), xmm1 * factor );
2674 store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2675 store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2676 store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2677 store( &(~C)(i+1UL,j ), xmm5 * factor );
2678 store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2679 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2680 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
// Single-row variant of the 4-vector block (presumably the leftover row when
// M is odd - the guarding condition is not visible here).
2684 for(
size_t k=0UL; k<K; ++k ) {
2686 xmm1 = xmm1 + a1 * B.get(k,j );
2687 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2688 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2689 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2691 store( &(~C)(i,j ), xmm1 * factor );
2692 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2693 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2694 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
// Block of 2 vectors: rows in pairs; xmm1-2 hold row i, xmm3-4 row i+1.
2697 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2699 for( ; (i+2UL) <= M; i+=2UL ) {
2701 for(
size_t k=0UL; k<K; ++k ) {
2706 xmm1 = xmm1 + a1 * b1;
2707 xmm2 = xmm2 + a1 * b2;
2708 xmm3 = xmm3 + a2 * b1;
2709 xmm4 = xmm4 + a2 * b2;
2711 store( &(~C)(i ,j ), xmm1 * factor );
2712 store( &(~C)(i ,j+IT::size), xmm2 * factor );
2713 store( &(~C)(i+1UL,j ), xmm3 * factor );
2714 store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
// Single-row variant of the 2-vector block.
2718 for(
size_t k=0UL; k<K; ++k ) {
2720 xmm1 = xmm1 + a1 * B.get(k,j );
2721 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2723 store( &(~C)(i,j ), xmm1 * factor );
2724 store( &(~C)(i,j+IT::size), xmm2 * factor );
// Final single-vector columns: rows in pairs, each A element broadcast via
// set().
// NOTE(review): the enclosing column loop header is not visible here.
2729 for( ; (i+2UL) <= M; i+=2UL ) {
2731 for(
size_t k=0UL; k<K; ++k ) {
2733 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2734 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2736 store( &(~C)(i ,j), xmm1 * factor );
2737 store( &(~C)(i+1UL,j), xmm2 * factor );
// Single-row variant of the final single-vector column.
2741 for(
size_t k=0UL; k<K; ++k ) {
2742 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2744 store( &(~C)(i,j), xmm1 * factor );
// Vectorized default kernel for C = scalar * (A*B) with a column-major target
// (DenseMatrix<MT3,true>). Selected when UseVectorizedDefaultKernel holds.
//   C       column-major target matrix
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
// The rows of C are processed in blocks of 8, 4 and 2 intrinsic vectors
// (IT::size elements each) followed by a final row loop. Every block sums
// A(i..,k) * b(k,j) over k into xmm accumulators, which are multiplied by
// `factor` when stored.
// NOTE(review): the declarations of `factor`, the counters i/j, the operands
// a1..a4/b1/b2 and some loop headers are not visible here; `factor` is
// presumably `scalar` broadcast to an IntrinsicType and the xmm accumulators
// are presumably zero on construction - confirm.
2764 template<
typename MT3
2768 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2769 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2771 typedef IntrinsicTrait<ElementType> IT;
// M is A.spacing(), not A.rows(): the row loops run over the spacing of A
// (presumably the padded column length - confirm).
2773 const size_t M( A.spacing() );
2774 const size_t N( B.columns() );
2775 const size_t K( A.columns() );
// Block of 8 vectors: one column of C at a time, 8 accumulators.
2781 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2782 for(
size_t j=0UL; j<N; ++j ) {
2783 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2784 for(
size_t k=0UL; k<K; ++k ) {
2786 xmm1 = xmm1 + A.get(i ,k) * b1;
2787 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2788 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2789 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2790 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2791 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2792 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2793 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2795 store( &(~C)(i ,j), xmm1 * factor );
2796 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2797 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2798 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2799 store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2800 store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2801 store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2802 store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
// Block of 4 vectors: columns are taken in pairs; xmm1-4 hold column j and
// xmm5-8 hold column j+1.
2805 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2807 for( ; (j+2UL) <= N; j+=2UL ) {
2808 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2809 for(
size_t k=0UL; k<K; ++k ) {
2816 xmm1 = xmm1 + a1 * b1;
2817 xmm2 = xmm2 + a2 * b1;
2818 xmm3 = xmm3 + a3 * b1;
2819 xmm4 = xmm4 + a4 * b1;
2820 xmm5 = xmm5 + a1 * b2;
2821 xmm6 = xmm6 + a2 * b2;
2822 xmm7 = xmm7 + a3 * b2;
2823 xmm8 = xmm8 + a4 * b2;
2825 store( &(~C)(i ,j ), xmm1 * factor );
2826 store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2827 store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2828 store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2829 store( &(~C)(i ,j+1UL), xmm5 * factor );
2830 store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2831 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2832 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
// Single-column variant of the 4-vector block (presumably the leftover column
// when N is odd - the guarding condition is not visible here).
2836 for(
size_t k=0UL; k<K; ++k ) {
2838 xmm1 = xmm1 + A.get(i ,k) * b1;
2839 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2840 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2841 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2843 store( &(~C)(i ,j), xmm1 * factor );
2844 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2845 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2846 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
// Block of 2 vectors: columns in pairs; xmm1-2 hold column j, xmm3-4 column
// j+1.
2849 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2851 for( ; (j+2UL) <= N; j+=2UL ) {
2853 for(
size_t k=0UL; k<K; ++k ) {
2858 xmm1 = xmm1 + a1 * b1;
2859 xmm2 = xmm2 + a2 * b1;
2860 xmm3 = xmm3 + a1 * b2;
2861 xmm4 = xmm4 + a2 * b2;
2863 store( &(~C)(i ,j ), xmm1 * factor );
2864 store( &(~C)(i+IT::size,j ), xmm2 * factor );
2865 store( &(~C)(i ,j+1UL), xmm3 * factor );
2866 store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
// Single-column variant of the 2-vector block.
2870 for(
size_t k=0UL; k<K; ++k ) {
2872 xmm1 = xmm1 + A.get(i ,k) * b1;
2873 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2875 store( &(~C)(i ,j), xmm1 * factor );
2876 store( &(~C)(i+IT::size,j), xmm2 * factor );
// Final single-vector rows: columns in pairs, each B element broadcast via
// set().
// NOTE(review): the enclosing row loop header is not visible here.
2881 for( ; (j+2UL) <= N; j+=2UL ) {
2883 for(
size_t k=0UL; k<K; ++k ) {
2885 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2886 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2888 store( &(~C)(i,j ), xmm1 * factor );
2889 store( &(~C)(i,j+1UL), xmm2 * factor );
// Single-column variant of the final single-vector row.
2893 for(
size_t k=0UL; k<K; ++k ) {
2894 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2896 store( &(~C)(i,j), xmm1 * factor );
// Fallback overload of the BLAS assignment dispatcher (C = scalar * A*B).
// Enabled only when UseDefaultKernel<MT3,MT4,MT5,ST2> is true, i.e. when
// BLAZE_BLAS_MODE is off or no BLAS kernel switch applies.
//   C       target matrix
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
2916 template<
typename MT3
2920 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2921 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No BLAS routine applies: forward unchanged to the default kernel.
2923 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS-backed assignment for single precision operands (C = scalar * A*B).
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
//   C       target matrix, overwritten through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, passed to BLAS as alpha
2942 template<
typename MT3
2946 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2947 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2949 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
2955 const int M ( numeric_cast<int>( A.rows() ) );
2956 const int N ( numeric_cast<int>( B.columns() ) );
2957 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
2958 const int lda( numeric_cast<int>( A.spacing() ) );
2959 const int ldb( numeric_cast<int>( B.spacing() ) );
2960 const int ldc( numeric_cast<int>( C.spacing() ) );
// gemm evaluates C = alpha*op(A)*op(B) + beta*C; alpha = scalar and beta = 0
// overwrite C with the scaled product. The storage order handed to BLAS
// follows the target type MT3: for a row-major target A is flagged as
// transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
2962 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2963 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2964 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2965 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS-backed assignment for double precision operands (C = scalar * A*B).
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
//   C       target matrix, overwritten through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, passed to BLAS as alpha
2985 template<
typename MT3
2989 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2990 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2992 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
2998 const int M ( numeric_cast<int>( A.rows() ) );
2999 const int N ( numeric_cast<int>( B.columns() ) );
3000 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3001 const int lda( numeric_cast<int>( A.spacing() ) );
3002 const int ldb( numeric_cast<int>( B.spacing() ) );
3003 const int ldc( numeric_cast<int>( C.spacing() ) );
// gemm evaluates C = alpha*op(A)*op(B) + beta*C; alpha = scalar and beta = 0
// overwrite C with the scaled product. The storage order handed to BLAS
// follows the target type MT3: for a row-major target A is flagged as
// transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3005 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3006 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3007 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3008 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS-backed assignment for complex<float> operands (C = scalar * A*B).
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when
// all three element types are exactly complex<float>.
//   C       target matrix, overwritten through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, converted to complex<float> and used as alpha
3028 template<
typename MT3
3032 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3033 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3035 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
3045 const int M ( numeric_cast<int>( A.rows() ) );
3046 const int N ( numeric_cast<int>( B.columns() ) );
3047 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3048 const int lda( numeric_cast<int>( A.spacing() ) );
3049 const int ldb( numeric_cast<int>( B.spacing() ) );
3050 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = scalar and beta = 0 overwrite C with the scaled product.
3051 const complex<float> alpha( scalar );
3052 const complex<float> beta ( 0.0F, 0.0F );
// The storage order handed to BLAS follows the target type MT3: for a
// row-major target A is flagged as transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3054 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3055 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3056 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3057 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed assignment for complex<double> operands (C = scalar * A*B).
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when
// all three element types are exactly complex<double>.
//   C       target matrix, overwritten through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, converted to complex<double> and used as alpha
3077 template<
typename MT3
3081 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3082 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3084 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
3094 const int M ( numeric_cast<int>( A.rows() ) );
3095 const int N ( numeric_cast<int>( B.columns() ) );
3096 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3097 const int lda( numeric_cast<int>( A.spacing() ) );
3098 const int ldb( numeric_cast<int>( B.spacing() ) );
3099 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = scalar and beta = 0 overwrite C with the scaled product.
3100 const complex<double> alpha( scalar );
3101 const complex<double> beta ( 0.0, 0.0 );
// The storage order handed to BLAS follows the target type MT3: for a
// row-major target A is flagged as transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3103 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3104 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3105 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3106 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled product to a sparse matrix.
//   lhs  sparse target matrix
//   rhs  scaled multiplication expression to evaluate
3123 template<
typename MT
3125 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The temporary uses the storage order of the target: ResultType or
// OppositeType is chosen by SO.
// NOTE(review): assumes SelectType<C,A,B> picks A when C is true - confirm.
3129 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Evaluate the whole expression into the temporary.
// NOTE(review): the statement handing tmp over to lhs is not visible here.
3141 const TmpType tmp( rhs );
// Addition assignment of the scaled product to a dense matrix:
// lhs += scalar * (A*B).
//   lhs  dense target matrix
//   rhs  scaled multiplication expression to add
3158 template<
typename MT3
3160 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped multiplication.
3167 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3168 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: an empty target or an inner dimension of zero.
// NOTE(review): the branch body is not visible here; presumably it returns
// without touching lhs - confirm.
3170 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to either the default or the BLAS kernel family.
// NOTE(review): the declarations of A and B and the condition choosing
// between the two calls are not visible here - confirm.
3185 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3187 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default kernel for C += scalar * (A*B).
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false.
//   C       target matrix
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
// NOTE(review): the function body is not visible here - document the
// implementation once confirmed.
3205 template<
typename MT3
3209 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3210 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default kernel for C += scalar * (A*B) with a row-major target
// (DenseMatrix<MT3,false>). Selected when UseVectorizedDefaultKernel holds.
//   C       row-major target matrix, updated in place
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
// The columns of C are processed in blocks of 8, 4 and 2 intrinsic vectors
// (IT::size elements each) followed by a final column loop. Every block sums
// a(i,k) * B(k,j..) over k into xmm accumulators; each store then writes
// load(C) + xmm * factor, i.e. the scaled sum is added to the existing value.
// NOTE(review): the declarations of `factor`, the counters i/j, the operands
// a1/a2/b1..b4 and some loop headers are not visible here; `factor` is
// presumably `scalar` broadcast to an IntrinsicType and the xmm accumulators
// are presumably zero on construction - confirm.
3231 template<
typename MT3
3235 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3236 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3238 typedef IntrinsicTrait<ElementType> IT;
// N is B.spacing(), not B.columns(): the column loops run over the spacing
// of B (presumably the padded row length - confirm).
3240 const size_t M( A.rows() );
3241 const size_t N( B.spacing() );
3242 const size_t K( A.columns() );
// Block of 8 vectors: one row of C at a time, 8 accumulators.
3248 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3249 for(
size_t i=0UL; i<M; ++i ) {
3250 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3251 for(
size_t k=0UL; k<K; ++k ) {
3253 xmm1 = xmm1 + a1 * B.get(k,j );
3254 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3255 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3256 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3257 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3258 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3259 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3260 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3262 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3263 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3264 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3265 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3266 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
3267 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
3268 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
3269 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
// Block of 4 vectors: rows are taken in pairs; xmm1-4 hold row i and
// xmm5-8 hold row i+1.
3272 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3274 for( ; (i+2UL) <= M; i+=2UL ) {
3275 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3276 for(
size_t k=0UL; k<K; ++k ) {
3283 xmm1 = xmm1 + a1 * b1;
3284 xmm2 = xmm2 + a1 * b2;
3285 xmm3 = xmm3 + a1 * b3;
3286 xmm4 = xmm4 + a1 * b4;
3287 xmm5 = xmm5 + a2 * b1;
3288 xmm6 = xmm6 + a2 * b2;
3289 xmm7 = xmm7 + a2 * b3;
3290 xmm8 = xmm8 + a2 * b4;
3292 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3293 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
3294 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
3295 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
3296 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
3297 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
3298 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
3299 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
// Single-row variant of the 4-vector block (presumably the leftover row when
// M is odd - the guarding condition is not visible here).
3303 for(
size_t k=0UL; k<K; ++k ) {
3305 xmm1 = xmm1 + a1 * B.get(k,j );
3306 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3307 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3308 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3310 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3311 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3312 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3313 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
// Block of 2 vectors: rows in pairs; xmm1-2 hold row i, xmm3-4 row i+1.
3316 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3318 for( ; (i+2UL) <= M; i+=2UL ) {
3320 for(
size_t k=0UL; k<K; ++k ) {
3325 xmm1 = xmm1 + a1 * b1;
3326 xmm2 = xmm2 + a1 * b2;
3327 xmm3 = xmm3 + a2 * b1;
3328 xmm4 = xmm4 + a2 * b2;
3330 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3331 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
3332 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
3333 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
// Single-row variant of the 2-vector block.
3337 for(
size_t k=0UL; k<K; ++k ) {
3339 xmm1 = xmm1 + a1 * B.get(k,j );
3340 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3342 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3343 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
// Final single-vector columns: rows in pairs, each A element broadcast via
// set().
// NOTE(review): the enclosing column loop header is not visible here.
3348 for( ; (i+2UL) <= M; i+=2UL ) {
3350 for(
size_t k=0UL; k<K; ++k ) {
3352 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3353 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3355 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3356 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) + xmm2 * factor );
// Single-row variant of the final single-vector column.
3360 for(
size_t k=0UL; k<K; ++k ) {
3361 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3363 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Vectorized default kernel for C += scalar * (A*B) with a column-major
// target (DenseMatrix<MT3,true>). Selected when UseVectorizedDefaultKernel
// holds.
//   C       column-major target matrix, updated in place
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
// The rows of C are processed in blocks of 8, 4 and 2 intrinsic vectors
// (IT::size elements each) followed by a final row loop. Every block sums
// A(i..,k) * b(k,j) over k into xmm accumulators; each store then writes
// load(C) + xmm * factor, i.e. the scaled sum is added to the existing value.
// NOTE(review): the declarations of `factor`, the counters i/j, the operands
// a1..a4/b1/b2 and some loop headers are not visible here; `factor` is
// presumably `scalar` broadcast to an IntrinsicType and the xmm accumulators
// are presumably zero on construction - confirm.
3383 template<
typename MT3
3387 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3388 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3390 typedef IntrinsicTrait<ElementType> IT;
// M is A.spacing(), not A.rows(): the row loops run over the spacing of A
// (presumably the padded column length - confirm).
3392 const size_t M( A.spacing() );
3393 const size_t N( B.columns() );
3394 const size_t K( A.columns() );
// Block of 8 vectors: one column of C at a time, 8 accumulators.
3400 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3401 for(
size_t j=0UL; j<N; ++j ) {
3402 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3403 for(
size_t k=0UL; k<K; ++k ) {
3405 xmm1 = xmm1 + A.get(i ,k) * b1;
3406 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3407 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3408 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3409 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3410 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3411 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3412 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3414 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3415 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3416 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3417 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3418 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
3419 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
3420 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
3421 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
// Block of 4 vectors: columns are taken in pairs; xmm1-4 hold column j and
// xmm5-8 hold column j+1.
3424 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3426 for( ; (j+2UL) <= N; j+=2UL ) {
3427 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3428 for(
size_t k=0UL; k<K; ++k ) {
3435 xmm1 = xmm1 + a1 * b1;
3436 xmm2 = xmm2 + a2 * b1;
3437 xmm3 = xmm3 + a3 * b1;
3438 xmm4 = xmm4 + a4 * b1;
3439 xmm5 = xmm5 + a1 * b2;
3440 xmm6 = xmm6 + a2 * b2;
3441 xmm7 = xmm7 + a3 * b2;
3442 xmm8 = xmm8 + a4 * b2;
3444 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3445 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
3446 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
3447 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
3448 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
3449 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
3450 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
3451 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
// Single-column variant of the 4-vector block (presumably the leftover column
// when N is odd - the guarding condition is not visible here).
3455 for(
size_t k=0UL; k<K; ++k ) {
3457 xmm1 = xmm1 + A.get(i ,k) * b1;
3458 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3459 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3460 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3462 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3463 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3464 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3465 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
// Block of 2 vectors: columns in pairs; xmm1-2 hold column j, xmm3-4 column
// j+1.
3468 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3470 for( ; (j+2UL) <= N; j+=2UL ) {
3472 for(
size_t k=0UL; k<K; ++k ) {
3477 xmm1 = xmm1 + a1 * b1;
3478 xmm2 = xmm2 + a2 * b1;
3479 xmm3 = xmm3 + a1 * b2;
3480 xmm4 = xmm4 + a2 * b2;
3482 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3483 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
3484 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
3485 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
// Single-column variant of the 2-vector block.
3489 for(
size_t k=0UL; k<K; ++k ) {
3491 xmm1 = xmm1 + A.get(i ,k) * b1;
3492 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3494 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3495 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
// Final single-vector rows: columns in pairs, each B element broadcast via
// set().
// NOTE(review): the enclosing row loop header is not visible here.
3500 for( ; (j+2UL) <= N; j+=2UL ) {
3502 for(
size_t k=0UL; k<K; ++k ) {
3504 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3505 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3507 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3508 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) + xmm2 * factor );
// Single-column variant of the final single-vector row.
3512 for(
size_t k=0UL; k<K; ++k ) {
3513 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
3515 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Fallback overload of the BLAS addition-assignment dispatcher
// (C += scalar * A*B). Enabled only when UseDefaultKernel<MT3,MT4,MT5,ST2> is
// true, i.e. when BLAZE_BLAS_MODE is off or no BLAS kernel switch applies.
//   C       target matrix
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor
3535 template<
typename MT3
3539 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3540 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No BLAS routine applies: forward unchanged to the default kernel.
3542 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS-backed addition assignment for single precision operands
// (C += scalar * A*B).
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
//   C       target matrix, updated in place through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, passed to BLAS as alpha
3561 template<
typename MT3
3565 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3566 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3568 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
3574 const int M ( numeric_cast<int>( A.rows() ) );
3575 const int N ( numeric_cast<int>( B.columns() ) );
3576 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3577 const int lda( numeric_cast<int>( A.spacing() ) );
3578 const int ldb( numeric_cast<int>( B.spacing() ) );
3579 const int ldc( numeric_cast<int>( C.spacing() ) );
// gemm evaluates C = alpha*op(A)*op(B) + beta*C; alpha = scalar and beta = 1
// add the scaled product to the existing C. The storage order handed to BLAS
// follows the target type MT3: for a row-major target A is flagged as
// transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3581 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3582 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3583 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3584 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed addition assignment for double precision operands
// (C += scalar * A*B).
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
//   C       target matrix, updated in place through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, passed to BLAS as alpha
3604 template<
typename MT3
3608 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3609 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3611 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
3617 const int M ( numeric_cast<int>( A.rows() ) );
3618 const int N ( numeric_cast<int>( B.columns() ) );
3619 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3620 const int lda( numeric_cast<int>( A.spacing() ) );
3621 const int ldb( numeric_cast<int>( B.spacing() ) );
3622 const int ldc( numeric_cast<int>( C.spacing() ) );
// gemm evaluates C = alpha*op(A)*op(B) + beta*C; alpha = scalar and beta = 1
// add the scaled product to the existing C. The storage order handed to BLAS
// follows the target type MT3: for a row-major target A is flagged as
// transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3624 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3625 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3626 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3627 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed addition assignment for complex<float> operands
// (C += scalar * A*B).
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when
// all three element types are exactly complex<float>.
//   C       target matrix, updated in place through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, converted to complex<float> and used as alpha
3647 template<
typename MT3
3651 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3652 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3654 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
3664 const int M ( numeric_cast<int>( A.rows() ) );
3665 const int N ( numeric_cast<int>( B.columns() ) );
3666 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3667 const int lda( numeric_cast<int>( A.spacing() ) );
3668 const int ldb( numeric_cast<int>( B.spacing() ) );
3669 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = scalar and beta = 1 add the scaled product to the existing C.
3670 const complex<float> alpha( scalar );
3671 const complex<float> beta ( 1.0F, 0.0F );
// The storage order handed to BLAS follows the target type MT3: for a
// row-major target A is flagged as transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3673 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3674 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3675 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3676 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed addition assignment for complex<double> operands
// (C += scalar * A*B).
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when
// all three element types are exactly complex<double>.
//   C       target matrix, updated in place through C.data()
//   A       left-hand side operand of the product
//   B       right-hand side operand of the product
//   scalar  scaling factor, converted to complex<double> and used as alpha
3696 template<
typename MT3
3700 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3701 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3703 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast converts the size_t values with
// a range check instead of silently truncating.
3713 const int M ( numeric_cast<int>( A.rows() ) );
3714 const int N ( numeric_cast<int>( B.columns() ) );
3715 const int K ( numeric_cast<int>( A.columns() ) );
// The leading dimensions are taken from each matrix's spacing().
3716 const int lda( numeric_cast<int>( A.spacing() ) );
3717 const int ldb( numeric_cast<int>( B.spacing() ) );
3718 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = scalar and beta = 1 add the scaled product to the existing C.
3719 const complex<double> alpha( scalar );
3720 const complex<double> beta ( 1.0, 0.0 );
// The storage order handed to BLAS follows the target type MT3: for a
// row-major target A is flagged as transposed, otherwise B is.
// NOTE(review): presumably because A is column-major and B is row-major in
// this expression - confirm against the operand types.
3722 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3723 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3724 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3725 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of a scaled matrix product expression from a dense matrix.
// Extracts the two operands of the wrapped multiplication and hands the work to one
// of the subtraction kernels, passing the expression's scalar along.
//   lhs - target dense matrix
//   rhs - scaled multiplication expression to be subtracted
3746 template<
typename MT3
3748 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
3755 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3756 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for an empty target or a zero inner dimension. The guarded statement is not
// part of this excerpt (presumably an early return - confirm).
3758 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// A and B are defined on lines missing from this excerpt, as is the condition that
// chooses between the default kernel and the BLAS kernel below.
3773 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3775 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Overload of the default subtraction assignment kernel that participates only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf).
// The function body is not part of this excerpt.
//   C      - target matrix
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor
3793 template<
typename MT3
3797 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3798 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). For every processed block it accumulates products of
// A and B in intrinsic registers over k and then writes back
// "C(block) - accumulator * factor" with load/store.
// The columns of the target are covered in strips of 8, 4, 2 and finally 1 intrinsic
// vectors (IT::size elements each).
// Several declarations (the counters i and j, the operands a1/a2/b1..b4, and factor)
// are on lines missing from this excerpt.
// NOTE(review): factor presumably holds the broadcast scalar - confirm.
//   C      - row-major target matrix, updated in place
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor
3819 template<
typename MT3
3823 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3824 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3826 typedef IntrinsicTrait<ElementType> IT;
// N is taken from B.spacing(), not B.columns().
// NOTE(review): presumably so that padded rows can be handled with whole intrinsic
// loads/stores - confirm that the spacing of C matches.
3828 const size_t M( A.rows() );
3829 const size_t N( B.spacing() );
3830 const size_t K( A.columns() );
// Strip of 8 intrinsic vectors per row: one row of C at a time.
3836 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3837 for(
size_t i=0UL; i<M; ++i ) {
// The accumulators are declared without an explicit initializer.
// NOTE(review): this relies on a default-constructed IntrinsicType being zero - confirm.
3838 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3839 for(
size_t k=0UL; k<K; ++k ) {
3841 xmm1 = xmm1 + a1 * B.get(k,j );
3842 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3843 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3844 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3845 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3846 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3847 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3848 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Write-back: subtract the scaled accumulators from the current content of C.
3850 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3851 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3852 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3853 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3854 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3855 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3856 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3857 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
// Strip of 4 intrinsic vectors: two rows of C per iteration (xmm1-4 for row i,
// xmm5-8 for row i+1).
3860 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3862 for( ; (i+2UL) <= M; i+=2UL ) {
3863 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3864 for(
size_t k=0UL; k<K; ++k ) {
3871 xmm1 = xmm1 + a1 * b1;
3872 xmm2 = xmm2 + a1 * b2;
3873 xmm3 = xmm3 + a1 * b3;
3874 xmm4 = xmm4 + a1 * b4;
3875 xmm5 = xmm5 + a2 * b1;
3876 xmm6 = xmm6 + a2 * b2;
3877 xmm7 = xmm7 + a2 * b3;
3878 xmm8 = xmm8 + a2 * b4;
3880 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3881 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3882 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3883 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3884 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3885 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3886 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3887 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
// Single-row variant of the 4-vector strip; its guard is on a line missing from this
// excerpt (presumably handles a leftover last row - confirm).
3891 for(
size_t k=0UL; k<K; ++k ) {
3893 xmm1 = xmm1 + a1 * B.get(k,j );
3894 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3895 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3896 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3898 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3899 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3900 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3901 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
// Strip of 2 intrinsic vectors: two rows of C per iteration.
3904 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3906 for( ; (i+2UL) <= M; i+=2UL ) {
3908 for(
size_t k=0UL; k<K; ++k ) {
3913 xmm1 = xmm1 + a1 * b1;
3914 xmm2 = xmm2 + a1 * b2;
3915 xmm3 = xmm3 + a2 * b1;
3916 xmm4 = xmm4 + a2 * b2;
3918 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3919 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3920 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3921 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
// Single-row variant of the 2-vector strip (guard not shown in this excerpt).
3925 for(
size_t k=0UL; k<K; ++k ) {
3927 xmm1 = xmm1 + a1 * B.get(k,j );
3928 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3930 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3931 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
// Final strip of a single intrinsic vector: two rows of C per iteration. The
// condition entering this part is on a line missing from this excerpt.
3936 for( ; (i+2UL) <= M; i+=2UL ) {
3938 for(
size_t k=0UL; k<K; ++k ) {
3940 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3941 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3943 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3944 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) - xmm2 * factor );
// Single-row variant of the final strip (guard not shown in this excerpt).
3948 for(
size_t k=0UL; k<K; ++k ) {
3949 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3951 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Vectorized default subtraction assignment kernel for a column-major target
// (DenseMatrix<MT3,true>). For every processed block it accumulates products of
// A and B in intrinsic registers over k and then writes back
// "C(block) - accumulator * factor" with load/store.
// The rows of the target are covered in strips of 8, 4, 2 and finally 1 intrinsic
// vectors (IT::size elements each).
// Several declarations (the counters i and j, the operands a1..a4/b1/b2, and factor)
// are on lines missing from this excerpt.
// NOTE(review): factor presumably holds the broadcast scalar - confirm.
//   C      - column-major target matrix, updated in place
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor
3971 template<
typename MT3
3975 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3976 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3978 typedef IntrinsicTrait<ElementType> IT;
// M is taken from A.spacing(), not A.rows().
// NOTE(review): presumably so that padded columns can be handled with whole intrinsic
// loads/stores - confirm that the spacing of C matches.
3980 const size_t M( A.spacing() );
3981 const size_t N( B.columns() );
3982 const size_t K( A.columns() );
// Strip of 8 intrinsic vectors per column: one column of C at a time.
3988 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3989 for(
size_t j=0UL; j<N; ++j ) {
// The accumulators are declared without an explicit initializer.
// NOTE(review): this relies on a default-constructed IntrinsicType being zero - confirm.
3990 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3991 for(
size_t k=0UL; k<K; ++k ) {
3993 xmm1 = xmm1 + A.get(i ,k) * b1;
3994 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3995 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3996 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3997 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3998 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3999 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
4000 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Write-back: subtract the scaled accumulators from the current content of C.
4002 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4003 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4004 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4005 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4006 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
4007 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
4008 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
4009 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
// Strip of 4 intrinsic vectors: two columns of C per iteration (xmm1-4 for column j,
// xmm5-8 for column j+1).
4012 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
4014 for( ; (j+2UL) <= N; j+=2UL ) {
4015 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4016 for(
size_t k=0UL; k<K; ++k ) {
4023 xmm1 = xmm1 + a1 * b1;
4024 xmm2 = xmm2 + a2 * b1;
4025 xmm3 = xmm3 + a3 * b1;
4026 xmm4 = xmm4 + a4 * b1;
4027 xmm5 = xmm5 + a1 * b2;
4028 xmm6 = xmm6 + a2 * b2;
4029 xmm7 = xmm7 + a3 * b2;
4030 xmm8 = xmm8 + a4 * b2;
4032 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
4033 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
4034 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
4035 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
4036 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
4037 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
4038 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
4039 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
// Single-column variant of the 4-vector strip; its guard is on a line missing from
// this excerpt (presumably handles a leftover last column - confirm).
4043 for(
size_t k=0UL; k<K; ++k ) {
4045 xmm1 = xmm1 + A.get(i ,k) * b1;
4046 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
4047 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
4048 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
4050 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4051 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4052 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4053 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
// Strip of 2 intrinsic vectors: two columns of C per iteration.
4056 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
4058 for( ; (j+2UL) <= N; j+=2UL ) {
4060 for(
size_t k=0UL; k<K; ++k ) {
4065 xmm1 = xmm1 + a1 * b1;
4066 xmm2 = xmm2 + a2 * b1;
4067 xmm3 = xmm3 + a1 * b2;
4068 xmm4 = xmm4 + a2 * b2;
4070 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
4071 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
4072 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
4073 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
// Single-column variant of the 2-vector strip (guard not shown in this excerpt).
4077 for(
size_t k=0UL; k<K; ++k ) {
4079 xmm1 = xmm1 + A.get(i ,k) * b1;
4080 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
4082 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4083 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
// Final strip of a single intrinsic vector: two columns of C per iteration. The
// condition entering this part is on a line missing from this excerpt.
4088 for( ; (j+2UL) <= N; j+=2UL ) {
4090 for(
size_t k=0UL; k<K; ++k ) {
4092 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4093 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4095 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
4096 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) - xmm2 * factor );
// Single-column variant of the final strip (guard not shown in this excerpt).
4100 for(
size_t k=0UL; k<K; ++k ) {
4101 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
4103 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Fallback overload of the BLAS subtraction assignment kernel. It participates when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds and simply forwards all arguments to
// selectDefaultSubAssignKernel, so no BLAS routine is called on this path.
//   C      - target matrix
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor
4123 template<
typename MT3
4127 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4128 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4130 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction assignment kernel for the single precision case.
// Uses one cblas_sgemm call with alpha = -scalar and beta = 1.0F, i.e. the scaled
// product is subtracted from the existing content of C.
// This overload only participates when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
//   C      - target matrix, updated in place
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor; its negation is forwarded to BLAS as alpha
4149 template<
typename MT3
4153 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4154 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4156 using boost::numeric_cast;
// The CBLAS interface takes int dimensions and leading dimensions.
4162 const int M ( numeric_cast<int>( A.rows() ) );
4163 const int N ( numeric_cast<int>( B.columns() ) );
4164 const int K ( numeric_cast<int>( A.columns() ) );
4165 const int lda( numeric_cast<int>( A.spacing() ) );
4166 const int ldb( numeric_cast<int>( B.spacing() ) );
4167 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of C selects the BLAS layout. A is flagged as transposed for a
// row-major C and B for any other C.
4169 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4170 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4171 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4172 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction assignment kernel for the double precision case.
// Uses one cblas_dgemm call with alpha = -scalar and beta = 1.0, i.e. the scaled
// product is subtracted from the existing content of C.
// This overload only participates when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
//   C      - target matrix, updated in place
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor; its negation is forwarded to BLAS as alpha
4192 template<
typename MT3
4196 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4197 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4199 using boost::numeric_cast;
// The CBLAS interface takes int dimensions and leading dimensions.
4205 const int M ( numeric_cast<int>( A.rows() ) );
4206 const int N ( numeric_cast<int>( B.columns() ) );
4207 const int K ( numeric_cast<int>( A.columns() ) );
4208 const int lda( numeric_cast<int>( A.spacing() ) );
4209 const int ldb( numeric_cast<int>( B.spacing() ) );
4210 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of C selects the BLAS layout. A is flagged as transposed for a
// row-major C and B for any other C.
4212 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4213 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4214 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4215 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction assignment kernel for the single precision complex case.
// Uses one cblas_cgemm call with alpha = -scalar and beta = 1, i.e. the scaled
// product is subtracted from the existing content of C.
// This overload only participates when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C      - target matrix, updated in place
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor; its negation is converted to complex<float> and used as alpha
4235 template<
typename MT3
4239 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4240 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4242 using boost::numeric_cast;
// The CBLAS interface takes int dimensions and leading dimensions.
4252 const int M ( numeric_cast<int>( A.rows() ) );
4253 const int N ( numeric_cast<int>( B.columns() ) );
4254 const int K ( numeric_cast<int>( A.columns() ) );
4255 const int lda( numeric_cast<int>( A.spacing() ) );
4256 const int ldb( numeric_cast<int>( B.spacing() ) );
4257 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address, hence the named locals.
// The sign flip that turns the accumulation into a subtraction is applied to alpha.
4258 const complex<float> alpha( -scalar );
4259 const complex<float> beta ( 1.0F, 0.0F );
// The storage order of C selects the BLAS layout. A is flagged as transposed for a
// row-major C and B for any other C.
4261 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4262 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4263 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4264 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction assignment kernel for the double precision complex case.
// Uses one cblas_zgemm call with alpha = -scalar and beta = 1, i.e. the scaled
// product is subtracted from the existing content of C.
// This overload only participates when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C      - target matrix, updated in place
//   A      - left-hand side operand of the multiplication
//   B      - right-hand side operand of the multiplication
//   scalar - scaling factor; its negation is converted to complex<double> and used as alpha
4284 template<
typename MT3
4288 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4289 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4291 using boost::numeric_cast;
// The CBLAS interface takes int dimensions and leading dimensions.
4301 const int M ( numeric_cast<int>( A.rows() ) );
4302 const int N ( numeric_cast<int>( B.columns() ) );
4303 const int K ( numeric_cast<int>( A.columns() ) );
4304 const int lda( numeric_cast<int>( A.spacing() ) );
4305 const int ldb( numeric_cast<int>( B.spacing() ) );
4306 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routine receives alpha and beta by address, hence the named locals.
// The sign flip that turns the accumulation into a subtraction is applied to alpha.
4307 const complex<double> alpha( -scalar );
4308 const complex<double> beta ( 1.0, 0.0 );
// The storage order of C selects the BLAS layout. A is flagged as transposed for a
// row-major C and B for any other C.
4310 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4311 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4312 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4313 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function template returning a TDMatDMatMultExpr<T1,T2> expression object.
// Its name and parameter list are on lines missing from this excerpt
// (presumably the multiplication operator for the two matrix operands - confirm).
4381 template<
typename T1
4383 inline const TDMatDMatMultExpr<T1,T2>
// Throws std::invalid_argument with the message below; the condition that triggers it
// is on a line missing from this excerpt (presumably a mismatch of the operand sizes).
4389 throw std::invalid_argument(
"Matrix sizes do not match" );
// Creates a view on one row of a matrix multiplication expression.
// The row is not taken from the evaluated product: the call is pushed down to the
// left-hand operand and the resulting row is multiplied by the right-hand operand.
//   dm    - the matrix multiplication expression
//   index - index of the requested row
// Returns an expression of type RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >::Type.
4417 template<
typename MT1
4419 inline typename RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >::Type
4420 row(
const TDMatDMatMultExpr<MT1,MT2>& dm,
size_t index )
4424 return row( dm.leftOperand(), index ) * dm.rightOperand();
// Creates a view on one column of a matrix multiplication expression.
// The column is not taken from the evaluated product: the call is pushed down to the
// right-hand operand and the left-hand operand is multiplied by the resulting column.
//   dm    - the matrix multiplication expression
//   index - index of the requested column
// Returns an expression of type ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >::Type.
4443 template<
typename MT1
4445 inline typename ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >::Type
4446 column(
const TDMatDMatMultExpr<MT1,MT2>& dm,
size_t index )
4450 return dm.leftOperand() *
column( dm.rightOperand(), index );
// Expression trait typedef; the struct header is on a line missing from this excerpt
// (presumably a specialization for a matrix product times a dense vector - confirm).
// Type is the result of TDMatDVecMultExprTrait applied to MT1 and the result type of
// DMatDVecMultExprTrait<MT2,VT>, i.e. the right-hand matrix is combined with the
// vector first. This holds only if MT1 is a column-major dense matrix, MT2 a row-major
// dense matrix and VT a non-transpose dense vector; otherwise Type is INVALID_TYPE.
4466 template<
typename MT1,
typename MT2,
typename VT >
4471 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4472 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4473 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4474 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4475 , INVALID_TYPE >::Type Type;
// Expression trait typedef; the struct header is on a line missing from this excerpt
// (presumably a specialization for a matrix product times a sparse vector - confirm).
// Type is the result of TDMatDVecMultExprTrait applied to MT1 and the result type of
// DMatSVecMultExprTrait<MT2,VT>, i.e. the right-hand matrix is combined with the
// vector first. This holds only if MT1 is a column-major dense matrix, MT2 a row-major
// dense matrix and VT a non-transpose sparse vector; otherwise Type is INVALID_TYPE.
4484 template<
typename MT1,
typename MT2,
typename VT >
4489 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4490 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4491 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4492 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4493 , INVALID_TYPE >::Type Type;
// Expression trait typedef; the struct header is on a line missing from this excerpt
// (presumably a specialization for a transpose dense vector times a matrix product
// - confirm).
// Type is the result of TDVecDMatMultExprTrait applied to the result type of
// TDVecTDMatMultExprTrait<VT,MT1> and MT2, i.e. the vector is combined with the
// left-hand matrix first. This holds only if VT is a transpose dense vector, MT1 a
// column-major dense matrix and MT2 a row-major dense matrix; otherwise Type is
// INVALID_TYPE.
4502 template<
typename VT,
typename MT1,
typename MT2 >
4507 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4508 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4509 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4510 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4511 , INVALID_TYPE >::Type Type;
// Expression trait typedef; the struct header is on a line missing from this excerpt
// (presumably a specialization for a transpose sparse vector times a matrix product
// - confirm).
// Type is the result of TDVecDMatMultExprTrait applied to the result type of
// TSVecTDMatMultExprTrait<VT,MT1> and MT2, i.e. the vector is combined with the
// left-hand matrix first. This holds only if VT is a transpose sparse vector, MT1 a
// column-major dense matrix and MT2 a row-major dense matrix; otherwise Type is
// INVALID_TYPE.
4520 template<
typename VT,
typename MT1,
typename MT2 >
4525 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4526 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4527 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4528 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4529 , INVALID_TYPE >::Type Type;
// Expression trait typedef; the struct header is on a line missing from this excerpt
// (presumably the RowExprTrait specialization for the matrix product - confirm).
// Type is the multiplication result of a row of (const) MT1 with MT2, which matches
// the form of the expression returned by row() for a multiplication expression.
4538 template<
typename MT1,
typename MT2 >
4543 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Expression trait typedef; the struct header is on a line missing from this excerpt
// (presumably the ColumnExprTrait specialization for the matrix product - confirm).
// Type is the multiplication result of MT1 with a column of (const) MT2, which matches
// the form of the expression returned by column() for a multiplication expression.
4552 template<
typename MT1,
typename MT2 >
4557 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;