22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
92 template<
typename MT1
// Shorthand aliases for the nested ResultType and CompositeType of the two
// operand types MT1 and MT2.
// RT1: ResultType of MT1.
100 typedef typename MT1::ResultType
RT1;
// RT2: ResultType of MT2.
101 typedef typename MT2::ResultType
RT2;
// CT1: CompositeType of MT1.
102 typedef typename MT1::CompositeType
CT1;
// CT2: CompositeType of MT2.
103 typedef typename MT2::CompositeType
CT2;
111 template<
typename T1,
typename T2,
typename T3 >
112 struct UseSinglePrecisionKernel {
125 template<
typename T1,
typename T2,
typename T3 >
126 struct UseDoublePrecisionKernel {
// Compile-time predicate for the single precision complex BLAS kernel.
// 'value' is nonzero only if the ElementType of each of T1, T2 and T3 is exactly
// complex<float>. The cblas_cgemm-based selectBlas*Kernel overloads are enabled
// through this predicate.
140 template<
typename T1,
typename T2,
typename T3 >
141 struct UseSinglePrecisionComplexKernel {
// Element type that all three types have to share.
142 typedef complex<float> Type;
143 enum { value = IsSame<typename T1::ElementType,Type>::value &&
144 IsSame<typename T2::ElementType,Type>::value &&
145 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the double precision complex BLAS kernel.
// 'value' is nonzero only if the ElementType of each of T1, T2 and T3 is exactly
// complex<double>. The cblas_zgemm-based selectBlas*Kernel overloads are enabled
// through this predicate.
156 template<
typename T1,
typename T2,
typename T3 >
157 struct UseDoublePrecisionComplexKernel {
// Element type that all three types have to share.
158 typedef complex<double> Type;
159 enum { value = IsSame<typename T1::ElementType,Type>::value &&
160 IsSame<typename T2::ElementType,Type>::value &&
161 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for falling back to the default (non-BLAS) kernel.
// 'value' is nonzero if BLAZE_BLAS_MODE evaluates to false, or if none of the
// four precision-specific BLAS predicates (single, double, single complex,
// double complex) holds for the type triple T1, T2, T3.
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDefaultKernel {
173 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
174 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
175 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
176 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the vectorized variant of the default kernel.
// 'value' is nonzero only if all of the following hold:
//  - T1, T2 and T3 each report 'vectorizable';
//  - all three share the same ElementType (T2 and T3 are both compared to T1);
//  - IntrinsicTrait of that element type reports both 'addition' and
//    'multiplication'.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseVectorizedDefaultKernel {
188 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
189 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
190 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
191 IntrinsicTrait<typename T1::ElementType>::addition &&
192 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flag: the enclosing type reports itself as not vectorizable
// (the flag is the constant 0).
223 enum { vectorizable = 0 };
256 if(
lhs_.columns() != 0UL ) {
257 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
259 for(
size_t k=1UL; k<end; k+=2UL ) {
261 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
263 if( end <
lhs_.columns() ) {
291 return rhs_.columns();
321 template<
typename T >
// Assignment dispatch: evaluates the product held by 'rhs' into the target '~lhs'.
// Two special cases are tested before any kernel is selected; afterwards the work
// is handed to either the default or the BLAS kernel selection.
343 template<
typename MT
// Special case 1: the target has zero rows or zero columns.
350 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension (columns of the expression's lhs_ operand) is zero.
353 else if( rhs.
lhs_.columns() == 0UL ) {
// General case: one of the two kernel selections is run on the operands A and B.
369 TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
371 TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Default scalar kernel for the plain assignment C = A * B.
//
// C : target matrix, written element-wise through C(i,j).
// A : left-hand side operand, read through A(i,k).
// B : right-hand side operand, read through B(k,j).
//
// Each element C(i,j) is first overwritten with the k = 0 term A(i,0) * B(0,j) and
// the terms k = 1 .. K-1 are accumulated afterwards, so C does not have to be
// zero-initialized beforehand. Since the k = 0 term is read unconditionally, A must
// have at least one column.
390 template<
typename MT3
394 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the shared inner dimension.
396 const size_t M( A.rows() );
397 const size_t N( B.columns() );
398 const size_t K( A.columns() );
400 for(
size_t i=0UL; i<M; ++i ) {
// Initialization with the first term of the inner product (k = 0).
401 for(
size_t j=0UL; j<N; ++j ) {
402 C(i,j) = A(i,0UL) * B(0UL,j);
// Accumulation of the remaining terms; j is the innermost index.
404 for(
size_t k=1UL; k<K; ++k ) {
405 for(
size_t j=0UL; j<N; ++j ) {
406 C(i,j) += A(i,k) * B(k,j);
428 template<
typename MT3
431 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
432 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
434 typedef IntrinsicTrait<ElementType> IT;
436 const size_t M( A.rows() );
437 const size_t N( B.spacing() );
438 const size_t K( A.columns() );
442 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
443 for(
size_t i=0UL; i<M; ++i ) {
444 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
445 for(
size_t k=0UL; k<K; ++k ) {
447 xmm1 = xmm1 + a1 * B.get(k,j );
448 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
449 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
450 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
451 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
452 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
453 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
454 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
456 store( &(~C)(i,j ), xmm1 );
457 store( &(~C)(i,j+IT::size ), xmm2 );
458 store( &(~C)(i,j+IT::size*2UL), xmm3 );
459 store( &(~C)(i,j+IT::size*3UL), xmm4 );
460 store( &(~C)(i,j+IT::size*4UL), xmm5 );
461 store( &(~C)(i,j+IT::size*5UL), xmm6 );
462 store( &(~C)(i,j+IT::size*6UL), xmm7 );
463 store( &(~C)(i,j+IT::size*7UL), xmm8 );
466 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
468 for( ; (i+2UL) <= M; i+=2UL ) {
469 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
470 for(
size_t k=0UL; k<K; ++k ) {
477 xmm1 = xmm1 + a1 * b1;
478 xmm2 = xmm2 + a1 * b2;
479 xmm3 = xmm3 + a1 * b3;
480 xmm4 = xmm4 + a1 * b4;
481 xmm5 = xmm5 + a2 * b1;
482 xmm6 = xmm6 + a2 * b2;
483 xmm7 = xmm7 + a2 * b3;
484 xmm8 = xmm8 + a2 * b4;
486 store( &(~C)(i ,j ), xmm1 );
487 store( &(~C)(i ,j+IT::size ), xmm2 );
488 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
489 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
490 store( &(~C)(i+1UL,j ), xmm5 );
491 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
492 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
493 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
497 for(
size_t k=0UL; k<K; ++k ) {
499 xmm1 = xmm1 + a1 * B.get(k,j );
500 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
501 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
502 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
504 store( &(~C)(i,j ), xmm1 );
505 store( &(~C)(i,j+IT::size ), xmm2 );
506 store( &(~C)(i,j+IT::size*2UL), xmm3 );
507 store( &(~C)(i,j+IT::size*3UL), xmm4 );
510 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
512 for( ; (i+2UL) <= M; i+=2UL ) {
514 for(
size_t k=0UL; k<K; ++k ) {
519 xmm1 = xmm1 + a1 * b1;
520 xmm2 = xmm2 + a1 * b2;
521 xmm3 = xmm3 + a2 * b1;
522 xmm4 = xmm4 + a2 * b2;
524 store( &(~C)(i ,j ), xmm1 );
525 store( &(~C)(i ,j+IT::size), xmm2 );
526 store( &(~C)(i+1UL,j ), xmm3 );
527 store( &(~C)(i+1UL,j+IT::size), xmm4 );
531 for(
size_t k=0UL; k<K; ++k ) {
533 xmm1 = xmm1 + a1 * B.get(k,j );
534 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
536 store( &(~C)(i,j ), xmm1 );
537 store( &(~C)(i,j+IT::size), xmm2 );
542 for( ; (i+2UL) <= M; i+=2UL ) {
544 for(
size_t k=0UL; k<K; ++k ) {
546 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
547 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
549 store( &(~C)(i ,j), xmm1 );
550 store( &(~C)(i+1UL,j), xmm2 );
554 for(
size_t k=0UL; k<K; ++k ) {
555 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
557 store( &(~C)(i,j), xmm1 );
578 template<
typename MT3
581 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
582 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
584 typedef IntrinsicTrait<ElementType> IT;
586 const size_t M( A.spacing() );
587 const size_t N( B.columns() );
588 const size_t K( A.columns() );
592 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
593 for(
size_t j=0UL; j<N; ++j ) {
594 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
595 for(
size_t k=0UL; k<K; ++k ) {
597 xmm1 = xmm1 + A.get(i ,k) * b1;
598 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
599 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
600 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
601 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
602 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
603 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
604 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
606 store( &(~C)(i ,j), xmm1 );
607 store( &(~C)(i+IT::size ,j), xmm2 );
608 store( &(~C)(i+IT::size*2UL,j), xmm3 );
609 store( &(~C)(i+IT::size*3UL,j), xmm4 );
610 store( &(~C)(i+IT::size*4UL,j), xmm5 );
611 store( &(~C)(i+IT::size*5UL,j), xmm6 );
612 store( &(~C)(i+IT::size*6UL,j), xmm7 );
613 store( &(~C)(i+IT::size*7UL,j), xmm8 );
616 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
618 for( ; (j+2UL) <= N; j+=2UL ) {
619 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
620 for(
size_t k=0UL; k<K; ++k ) {
627 xmm1 = xmm1 + a1 * b1;
628 xmm2 = xmm2 + a2 * b1;
629 xmm3 = xmm3 + a3 * b1;
630 xmm4 = xmm4 + a4 * b1;
631 xmm5 = xmm5 + a1 * b2;
632 xmm6 = xmm6 + a2 * b2;
633 xmm7 = xmm7 + a3 * b2;
634 xmm8 = xmm8 + a4 * b2;
636 store( &(~C)(i ,j ), xmm1 );
637 store( &(~C)(i+IT::size ,j ), xmm2 );
638 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
639 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
640 store( &(~C)(i ,j+1UL), xmm5 );
641 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
642 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
643 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
647 for(
size_t k=0UL; k<K; ++k ) {
649 xmm1 = xmm1 + A.get(i ,k) * b1;
650 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
651 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
652 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
654 store( &(~C)(i ,j), xmm1 );
655 store( &(~C)(i+IT::size ,j), xmm2 );
656 store( &(~C)(i+IT::size*2UL,j), xmm3 );
657 store( &(~C)(i+IT::size*3UL,j), xmm4 );
660 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
662 for( ; (j+2UL) <= N; j+=2UL ) {
664 for(
size_t k=0UL; k<K; ++k ) {
669 xmm1 = xmm1 + a1 * b1;
670 xmm2 = xmm2 + a2 * b1;
671 xmm3 = xmm3 + a1 * b2;
672 xmm4 = xmm4 + a2 * b2;
674 store( &(~C)(i ,j ), xmm1 );
675 store( &(~C)(i+IT::size,j ), xmm2 );
676 store( &(~C)(i ,j+1UL), xmm3 );
677 store( &(~C)(i+IT::size,j+1UL), xmm4 );
681 for(
size_t k=0UL; k<K; ++k ) {
683 xmm1 = xmm1 + A.get(i ,k) * b1;
684 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
686 store( &(~C)(i ,j), xmm1 );
687 store( &(~C)(i+IT::size,j), xmm2 );
692 for( ; (j+2UL) <= N; j+=2UL ) {
694 for(
size_t k=0UL; k<K; ++k ) {
696 xmm1 = xmm1 + a1 *
set( B(k,j ) );
697 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
699 store( &(~C)(i,j ), xmm1 );
700 store( &(~C)(i,j+1UL), xmm2 );
704 for(
size_t k=0UL; k<K; ++k ) {
705 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
707 store( &(~C)(i,j), xmm1 );
// Fallback overload of the BLAS kernel selection for the assignment C = A * B.
// It is enabled when UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards the
// three arguments unchanged to selectDefaultAssignKernel.
728 template<
typename MT3
731 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
732 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
734 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for the assignment C = A * B, enabled through
// UseSinglePrecisionKernel<MT3,MT4,MT5>.
// Calls cblas_sgemm with alpha = 1.0F and beta = 0.0F, i.e. the previous content of
// C does not contribute to the result.
754 template<
typename MT3
757 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
758 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
760 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
766 const int M ( numeric_cast<int>( A.rows() ) );
767 const int N ( numeric_cast<int>( B.columns() ) );
768 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
769 const int lda( numeric_cast<int>( A.spacing() ) );
770 const int ldb( numeric_cast<int>( B.spacing() ) );
771 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
773 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
774 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
775 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
776 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for the assignment C = A * B, enabled through
// UseDoublePrecisionKernel<MT3,MT4,MT5>.
// Calls cblas_dgemm with alpha = 1.0 and beta = 0.0, i.e. the previous content of
// C does not contribute to the result.
797 template<
typename MT3
800 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
801 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
803 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
809 const int M ( numeric_cast<int>( A.rows() ) );
810 const int N ( numeric_cast<int>( B.columns() ) );
811 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
812 const int lda( numeric_cast<int>( A.spacing() ) );
813 const int ldb( numeric_cast<int>( B.spacing() ) );
814 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
816 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
817 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
818 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
819 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for the assignment C = A * B, enabled through
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_cgemm with alpha = (1,0) and beta = (0,0), i.e. the previous content
// of C does not contribute to the result.
840 template<
typename MT3
843 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
844 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
846 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
855 const int M ( numeric_cast<int>( A.rows() ) );
856 const int N ( numeric_cast<int>( B.columns() ) );
857 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
858 const int lda( numeric_cast<int>( A.spacing() ) );
859 const int ldb( numeric_cast<int>( B.spacing() ) );
860 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to cblas_cgemm by address.
861 const complex<float> alpha( 1.0F, 0.0F );
862 const complex<float> beta ( 0.0F, 0.0F );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
864 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
865 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
866 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
867 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for the assignment C = A * B, enabled through
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_zgemm with alpha = (1,0) and beta = (0,0), i.e. the previous content
// of C does not contribute to the result.
888 template<
typename MT3
891 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
892 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
894 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
903 const int M ( numeric_cast<int>( A.rows() ) );
904 const int N ( numeric_cast<int>( B.columns() ) );
905 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
906 const int lda( numeric_cast<int>( A.spacing() ) );
907 const int ldb( numeric_cast<int>( B.spacing() ) );
908 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to cblas_zgemm by address.
909 const complex<double> alpha( 1.0, 0.0 );
910 const complex<double> beta ( 0.0, 0.0 );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
912 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
913 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
914 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
915 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
933 template<
typename MT
937 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
949 const TmpType tmp( rhs );
// Addition-assignment dispatch: adds the product held by 'rhs' to the target '~lhs'.
968 template<
typename MT
// Special case: the target has zero rows or zero columns, or the inner dimension
// (columns of the expression's lhs_ operand) is zero.
975 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// General case: one of the two kernel selections is run on the operands A and B.
990 TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
992 TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Default scalar kernel for the addition assignment C += A * B.
// This overload is guarded by DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
//
// C : target matrix, updated element-wise through C(i,j).
// A : left-hand side operand, read through A(i,k).
// B : right-hand side operand, read through B(k,j).
//
// The innermost loop over the columns j is unrolled by a factor of two.
1011 template<
typename MT3
1014 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1015 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the shared inner dimension.
1017 const size_t M( A.rows() );
1018 const size_t N( B.columns() );
1019 const size_t K( A.columns() );
// N rounded down to the next even number (the lowest bit is cleared).
1022 const size_t end( N &
size_t(-2) );
1024 for(
size_t i=0UL; i<M; ++i ) {
1025 for(
size_t k=0UL; k<K; ++k ) {
// Two adjacent columns are updated per iteration.
1026 for(
size_t j=0UL; j<end; j+=2UL ) {
1027 C(i,j ) += A(i,k) * B(k,j );
1028 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Trailing column at index 'end'; it is a valid column only when N is odd.
1031 C(i,end) += A(i,k) * B(k,end);
1053 template<
typename MT3
1056 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1057 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1059 typedef IntrinsicTrait<ElementType> IT;
1061 const size_t M( A.rows() );
1062 const size_t N( B.spacing() );
1063 const size_t K( A.columns() );
1067 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1068 for(
size_t i=0UL; i<M; ++i ) {
1077 for(
size_t k=0UL; k<K; ++k ) {
1079 xmm1 = xmm1 + a1 * B.get(k,j );
1080 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1081 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1082 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1083 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
1084 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
1085 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
1086 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
1088 store( &(~C)(i,j ), xmm1 );
1089 store( &(~C)(i,j+IT::size ), xmm2 );
1090 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1091 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1092 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1093 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1094 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1095 store( &(~C)(i,j+IT::size*7UL), xmm8 );
1098 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1100 for( ; (i+2UL) <= M; i+=2UL ) {
1109 for(
size_t k=0UL; k<K; ++k ) {
1116 xmm1 = xmm1 + a1 * b1;
1117 xmm2 = xmm2 + a1 * b2;
1118 xmm3 = xmm3 + a1 * b3;
1119 xmm4 = xmm4 + a1 * b4;
1120 xmm5 = xmm5 + a2 * b1;
1121 xmm6 = xmm6 + a2 * b2;
1122 xmm7 = xmm7 + a2 * b3;
1123 xmm8 = xmm8 + a2 * b4;
1125 store( &(~C)(i ,j ), xmm1 );
1126 store( &(~C)(i ,j+IT::size ), xmm2 );
1127 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1128 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1129 store( &(~C)(i+1UL,j ), xmm5 );
1130 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1131 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1132 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1139 for(
size_t k=0UL; k<K; ++k ) {
1141 xmm1 = xmm1 + a1 * B.get(k,j );
1142 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1143 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1144 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1146 store( &(~C)(i,j ), xmm1 );
1147 store( &(~C)(i,j+IT::size ), xmm2 );
1148 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1149 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1152 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1154 for( ; (i+2UL) <= M; i+=2UL ) {
1159 for(
size_t k=0UL; k<K; ++k ) {
1164 xmm1 = xmm1 + a1 * b1;
1165 xmm2 = xmm2 + a1 * b2;
1166 xmm3 = xmm3 + a2 * b1;
1167 xmm4 = xmm4 + a2 * b2;
1169 store( &(~C)(i ,j ), xmm1 );
1170 store( &(~C)(i ,j+IT::size), xmm2 );
1171 store( &(~C)(i+1UL,j ), xmm3 );
1172 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1177 for(
size_t k=0UL; k<K; ++k ) {
1179 xmm1 = xmm1 + a1 * B.get(k,j );
1180 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1182 store( &(~C)(i,j ), xmm1 );
1183 store( &(~C)(i,j+IT::size), xmm2 );
1188 for( ; (i+2UL) <= M; i+=2UL ) {
1191 for(
size_t k=0UL; k<K; ++k ) {
1193 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1194 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1196 store( &(~C)(i ,j), xmm1 );
1197 store( &(~C)(i+1UL,j), xmm2 );
1201 for(
size_t k=0UL; k<K; ++k ) {
1202 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
1204 store( &(~C)(i,j), xmm1 );
1225 template<
typename MT3
1228 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1229 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1231 typedef IntrinsicTrait<ElementType> IT;
1233 const size_t M( A.spacing() );
1234 const size_t N( B.columns() );
1235 const size_t K( A.columns() );
1239 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1240 for(
size_t j=0UL; j<N; ++j ) {
1249 for(
size_t k=0UL; k<K; ++k ) {
1251 xmm1 = xmm1 + A.get(i ,k) * b1;
1252 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1253 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1254 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1255 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1256 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1257 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1258 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1260 store( &(~C)(i ,j), xmm1 );
1261 store( &(~C)(i+IT::size ,j), xmm2 );
1262 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1263 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1264 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1265 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1266 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1267 store( &(~C)(i+IT::size*7UL,j), xmm8 );
1270 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1272 for( ; (j+2UL) <= N; j+=2UL ) {
1281 for(
size_t k=0UL; k<K; ++k ) {
1288 xmm1 = xmm1 + a1 * b1;
1289 xmm2 = xmm2 + a2 * b1;
1290 xmm3 = xmm3 + a3 * b1;
1291 xmm4 = xmm4 + a4 * b1;
1292 xmm5 = xmm5 + a1 * b2;
1293 xmm6 = xmm6 + a2 * b2;
1294 xmm7 = xmm7 + a3 * b2;
1295 xmm8 = xmm8 + a4 * b2;
1297 store( &(~C)(i ,j ), xmm1 );
1298 store( &(~C)(i+IT::size ,j ), xmm2 );
1299 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1300 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1301 store( &(~C)(i ,j+1UL), xmm5 );
1302 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1303 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1304 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1311 for(
size_t k=0UL; k<K; ++k ) {
1313 xmm1 = xmm1 + A.get(i ,k) * b1;
1314 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1315 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1316 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1318 store( &(~C)(i ,j), xmm1 );
1319 store( &(~C)(i+IT::size ,j), xmm2 );
1320 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1321 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1324 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1326 for( ; (j+2UL) <= N; j+=2UL ) {
1331 for(
size_t k=0UL; k<K; ++k ) {
1336 xmm1 = xmm1 + a1 * b1;
1337 xmm2 = xmm2 + a2 * b1;
1338 xmm3 = xmm3 + a1 * b2;
1339 xmm4 = xmm4 + a2 * b2;
1341 store( &(~C)(i ,j ), xmm1 );
1342 store( &(~C)(i+IT::size,j ), xmm2 );
1343 store( &(~C)(i ,j+1UL), xmm3 );
1344 store( &(~C)(i+IT::size,j+1UL), xmm4 );
1349 for(
size_t k=0UL; k<K; ++k ) {
1351 xmm1 = xmm1 + A.get(i ,k) * b1;
1352 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1354 store( &(~C)(i ,j), xmm1 );
1355 store( &(~C)(i+IT::size,j), xmm2 );
1360 for( ; (j+2UL) <= N; j+=2UL ) {
1363 for(
size_t k=0UL; k<K; ++k ) {
1365 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1366 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1368 store( &(~C)(i,j ), xmm1 );
1369 store( &(~C)(i,j+1UL), xmm2 );
1373 for(
size_t k=0UL; k<K; ++k ) {
1374 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
1376 store( &(~C)(i,j), xmm1 );
// Fallback overload of the BLAS kernel selection for the addition assignment
// C += A * B. It is enabled when UseDefaultKernel<MT3,MT4,MT5> holds and simply
// forwards the three arguments unchanged to selectDefaultAddAssignKernel.
1397 template<
typename MT3
1400 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1401 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1403 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for the addition assignment C += A * B, enabled through
// UseSinglePrecisionKernel<MT3,MT4,MT5>.
// Calls cblas_sgemm with alpha = 1.0F and beta = 1.0F, so the product is added to
// the existing content of C.
1423 template<
typename MT3
1426 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1427 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
1429 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
1435 const int M ( numeric_cast<int>( A.rows() ) );
1436 const int N ( numeric_cast<int>( B.columns() ) );
1437 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
1438 const int lda( numeric_cast<int>( A.spacing() ) );
1439 const int ldb( numeric_cast<int>( B.spacing() ) );
1440 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
1442 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1443 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1444 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1445 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for the addition assignment C += A * B, enabled through
// UseDoublePrecisionKernel<MT3,MT4,MT5>.
// Calls cblas_dgemm with alpha = 1.0 and beta = 1.0, so the product is added to the
// existing content of C.
1466 template<
typename MT3
1469 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1470 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
1472 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
1478 const int M ( numeric_cast<int>( A.rows() ) );
1479 const int N ( numeric_cast<int>( B.columns() ) );
1480 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
1481 const int lda( numeric_cast<int>( A.spacing() ) );
1482 const int ldb( numeric_cast<int>( B.spacing() ) );
1483 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
1485 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1486 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1487 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1488 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for the addition assignment C += A * B, enabled through
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_cgemm with alpha = (1,0) and beta = (1,0), so the product is added to
// the existing content of C.
1509 template<
typename MT3
1512 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1513 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
1515 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
1524 const int M ( numeric_cast<int>( A.rows() ) );
1525 const int N ( numeric_cast<int>( B.columns() ) );
1526 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
1527 const int lda( numeric_cast<int>( A.spacing() ) );
1528 const int ldb( numeric_cast<int>( B.spacing() ) );
1529 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to cblas_cgemm by address.
1530 const complex<float> alpha( 1.0F, 0.0F );
1531 const complex<float> beta ( 1.0F, 0.0F );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
1533 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1534 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1535 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1536 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for the addition assignment C += A * B, enabled through
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Calls cblas_zgemm with alpha = (1,0) and beta = (1,0), so the product is added to
// the existing content of C.
1557 template<
typename MT3
1560 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1561 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
1563 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
1572 const int M ( numeric_cast<int>( A.rows() ) );
1573 const int N ( numeric_cast<int>( B.columns() ) );
1574 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
1575 const int lda( numeric_cast<int>( A.spacing() ) );
1576 const int ldb( numeric_cast<int>( B.spacing() ) );
1577 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to cblas_zgemm by address.
1578 const complex<double> alpha( 1.0, 0.0 );
1579 const complex<double> beta ( 1.0, 0.0 );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
1581 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1582 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1583 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1584 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction-assignment dispatch: subtracts the product held by 'rhs' from the
// target '~lhs'.
1607 template<
typename MT
// Special case: the target has zero rows or zero columns, or the inner dimension
// (columns of the expression's lhs_ operand) is zero.
1614 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// General case: one of the two kernel selections is run on the operands A and B.
1629 TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1631 TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Default scalar kernel for the subtraction assignment C -= A * B.
// This overload is guarded by DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
//
// C : target matrix, updated element-wise through C(i,j).
// A : left-hand side operand, read through A(i,k).
// B : right-hand side operand, read through B(k,j).
//
// The innermost loop over the columns j is unrolled by a factor of two.
1650 template<
typename MT3
1653 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1654 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the shared inner dimension.
1656 const size_t M( A.rows() );
1657 const size_t N( B.columns() );
1658 const size_t K( A.columns() );
// N rounded down to the next even number (the lowest bit is cleared).
1661 const size_t end( N &
size_t(-2) );
1663 for(
size_t i=0UL; i<M; ++i ) {
1664 for(
size_t k=0UL; k<K; ++k ) {
// Two adjacent columns are updated per iteration.
1665 for(
size_t j=0UL; j<end; j+=2UL ) {
1666 C(i,j ) -= A(i,k) * B(k,j );
1667 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Trailing column at index 'end'; it is a valid column only when N is odd.
1670 C(i,end) -= A(i,k) * B(k,end);
1692 template<
typename MT3
1695 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1696 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1698 typedef IntrinsicTrait<ElementType> IT;
1700 const size_t M( A.rows() );
1701 const size_t N( B.spacing() );
1702 const size_t K( A.columns() );
1706 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1707 for(
size_t i=0UL; i<M; ++i ) {
1716 for(
size_t k=0UL; k<K; ++k ) {
1718 xmm1 = xmm1 - a1 * B.get(k,j );
1719 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1720 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1721 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1722 xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1723 xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1724 xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1725 xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1727 store( &(~C)(i,j ), xmm1 );
1728 store( &(~C)(i,j+IT::size ), xmm2 );
1729 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1730 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1731 store( &(~C)(i,j+IT::size*4UL), xmm5 );
1732 store( &(~C)(i,j+IT::size*5UL), xmm6 );
1733 store( &(~C)(i,j+IT::size*6UL), xmm7 );
1734 store( &(~C)(i,j+IT::size*7UL), xmm8 );
1737 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1739 for( ; (i+2UL) <= M; i+=2UL ) {
1748 for(
size_t k=0UL; k<K; ++k ) {
1755 xmm1 = xmm1 - a1 * b1;
1756 xmm2 = xmm2 - a1 * b2;
1757 xmm3 = xmm3 - a1 * b3;
1758 xmm4 = xmm4 - a1 * b4;
1759 xmm5 = xmm5 - a2 * b1;
1760 xmm6 = xmm6 - a2 * b2;
1761 xmm7 = xmm7 - a2 * b3;
1762 xmm8 = xmm8 - a2 * b4;
1764 store( &(~C)(i ,j ), xmm1 );
1765 store( &(~C)(i ,j+IT::size ), xmm2 );
1766 store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1767 store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1768 store( &(~C)(i+1UL,j ), xmm5 );
1769 store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1770 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1771 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1778 for(
size_t k=0UL; k<K; ++k ) {
1780 xmm1 = xmm1 - a1 * B.get(k,j );
1781 xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1782 xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1783 xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1785 store( &(~C)(i,j ), xmm1 );
1786 store( &(~C)(i,j+IT::size ), xmm2 );
1787 store( &(~C)(i,j+IT::size*2UL), xmm3 );
1788 store( &(~C)(i,j+IT::size*3UL), xmm4 );
1791 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1793 for( ; (i+2UL) <= M; i+=2UL ) {
1798 for(
size_t k=0UL; k<K; ++k ) {
1803 xmm1 = xmm1 - a1 * b1;
1804 xmm2 = xmm2 - a1 * b2;
1805 xmm3 = xmm3 - a2 * b1;
1806 xmm4 = xmm4 - a2 * b2;
1808 store( &(~C)(i ,j ), xmm1 );
1809 store( &(~C)(i ,j+IT::size), xmm2 );
1810 store( &(~C)(i+1UL,j ), xmm3 );
1811 store( &(~C)(i+1UL,j+IT::size), xmm4 );
1816 for(
size_t k=0UL; k<K; ++k ) {
1818 xmm1 = xmm1 - a1 * B.get(k,j );
1819 xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1821 store( &(~C)(i,j ), xmm1 );
1822 store( &(~C)(i,j+IT::size), xmm2 );
1827 for( ; (i+2UL) <= M; i+=2UL ) {
1830 for(
size_t k=0UL; k<K; ++k ) {
1832 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1833 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1835 store( &(~C)(i ,j), xmm1 );
1836 store( &(~C)(i+1UL,j), xmm2 );
1840 for(
size_t k=0UL; k<K; ++k ) {
1841 xmm1 = xmm1 -
set( A(i,k) ) * B.get(k,j);
1843 store( &(~C)(i,j), xmm1 );
1864 template<
typename MT3
1867 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1868 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1870 typedef IntrinsicTrait<ElementType> IT;
1872 const size_t M( A.spacing() );
1873 const size_t N( B.columns() );
1874 const size_t K( A.columns() );
1878 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1879 for(
size_t j=0UL; j<N; ++j ) {
1888 for(
size_t k=0UL; k<K; ++k ) {
1890 xmm1 = xmm1 - A.get(i ,k) * b1;
1891 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1892 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1893 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1894 xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1895 xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1896 xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1897 xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1899 store( &(~C)(i ,j), xmm1 );
1900 store( &(~C)(i+IT::size ,j), xmm2 );
1901 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1902 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1903 store( &(~C)(i+IT::size*4UL,j), xmm5 );
1904 store( &(~C)(i+IT::size*5UL,j), xmm6 );
1905 store( &(~C)(i+IT::size*6UL,j), xmm7 );
1906 store( &(~C)(i+IT::size*7UL,j), xmm8 );
1909 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1911 for( ; (j+2UL) <= N; j+=2UL ) {
1920 for(
size_t k=0UL; k<K; ++k ) {
1927 xmm1 = xmm1 - a1 * b1;
1928 xmm2 = xmm2 - a2 * b1;
1929 xmm3 = xmm3 - a3 * b1;
1930 xmm4 = xmm4 - a4 * b1;
1931 xmm5 = xmm5 - a1 * b2;
1932 xmm6 = xmm6 - a2 * b2;
1933 xmm7 = xmm7 - a3 * b2;
1934 xmm8 = xmm8 - a4 * b2;
1936 store( &(~C)(i ,j ), xmm1 );
1937 store( &(~C)(i+IT::size ,j ), xmm2 );
1938 store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1939 store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1940 store( &(~C)(i ,j+1UL), xmm5 );
1941 store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1942 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1943 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1950 for(
size_t k=0UL; k<K; ++k ) {
1952 xmm1 = xmm1 - A.get(i ,k) * b1;
1953 xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1954 xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1955 xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1957 store( &(~C)(i ,j), xmm1 );
1958 store( &(~C)(i+IT::size ,j), xmm2 );
1959 store( &(~C)(i+IT::size*2UL,j), xmm3 );
1960 store( &(~C)(i+IT::size*3UL,j), xmm4 );
1963 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1965 for( ; (j+2UL) <= N; j+=2UL ) {
1970 for(
size_t k=0UL; k<K; ++k ) {
1975 xmm1 = xmm1 - a1 * b1;
1976 xmm2 = xmm2 - a2 * b1;
1977 xmm3 = xmm3 - a1 * b2;
1978 xmm4 = xmm4 - a2 * b2;
1980 store( &(~C)(i ,j ), xmm1 );
1981 store( &(~C)(i+IT::size,j ), xmm2 );
1982 store( &(~C)(i ,j+1UL), xmm3 );
1983 store( &(~C)(i+IT::size,j+1UL), xmm4 );
1988 for(
size_t k=0UL; k<K; ++k ) {
1990 xmm1 = xmm1 - A.get(i ,k) * b1;
1991 xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
1993 store( &(~C)(i ,j), xmm1 );
1994 store( &(~C)(i+IT::size,j), xmm2 );
1999 for( ; (j+2UL) <= N; j+=2UL ) {
2002 for(
size_t k=0UL; k<K; ++k ) {
2004 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2005 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2007 store( &(~C)(i,j ), xmm1 );
2008 store( &(~C)(i,j+1UL), xmm2 );
2012 for(
size_t k=0UL; k<K; ++k ) {
2013 xmm1 = xmm1 - A.get(i,k) *
set( B(k,j) );
2015 store( &(~C)(i,j), xmm1 );
// Fallback overload of the BLAS kernel selection for the subtraction assignment
// C -= A * B. It is enabled when UseDefaultKernel<MT3,MT4,MT5> holds and simply
// forwards the three arguments unchanged to selectDefaultSubAssignKernel.
2036 template<
typename MT3
2039 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2040 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2042 selectDefaultSubAssignKernel( C, A, B );
// BLAS kernel for the subtraction assignment C -= A * B, enabled through
// UseSinglePrecisionKernel<MT3,MT4,MT5>.
// Calls cblas_sgemm with alpha = -1.0F and beta = 1.0F, so the negated product is
// added to the existing content of C.
2062 template<
typename MT3
2065 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2066 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// All sizes are converted to the int arguments of CBLAS via the range-checked
// boost::numeric_cast.
2068 using boost::numeric_cast;
// M, N, K: rows of A, columns of B and the shared inner dimension (columns of A).
2074 const int M ( numeric_cast<int>( A.rows() ) );
2075 const int N ( numeric_cast<int>( B.columns() ) );
2076 const int K ( numeric_cast<int>( A.columns() ) );
// The spacing() of each matrix is passed as its BLAS leading dimension.
2077 const int lda( numeric_cast<int>( A.spacing() ) );
2078 const int ldb( numeric_cast<int>( B.spacing() ) );
2079 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order handed to BLAS follows the target type MT3: for a row-major C,
// A is flagged CblasTrans and B CblasNoTrans; for a column-major C the two flags
// are swapped.
2081 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2082 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2083 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2084 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction assignment, enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5> is true.
//   C  Target matrix, updated in place through C.data().
//   A  Left-hand side operand of the product.
//   B  Right-hand side operand of the product.
2105 template<
typename MT3
2108 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2109 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2111 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
2117 const int M ( numeric_cast<int>( A.rows() ) );
2118 const int N ( numeric_cast<int>( B.columns() ) );
2119 const int K ( numeric_cast<int>( A.columns() ) );
2120 const int lda( numeric_cast<int>( A.spacing() ) );
2121 const int ldb( numeric_cast<int>( B.spacing() ) );
2122 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1 and beta = 1: by the GEMM contract C becomes C - op(A)*op(B).
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
2124 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2125 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2126 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2127 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction assignment, enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when all three
// matrices have the element type complex<float>.
//   C  Target matrix, updated in place through C.data().
//   A  Left-hand side operand of the product.
//   B  Right-hand side operand of the product.
2148 template<
typename MT3
2151 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2152 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2154 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
2163 const int M ( numeric_cast<int>( A.rows() ) );
2164 const int N ( numeric_cast<int>( B.columns() ) );
2165 const int K ( numeric_cast<int>( A.columns() ) );
2166 const int lda( numeric_cast<int>( A.spacing() ) );
2167 const int ldb( numeric_cast<int>( B.spacing() ) );
2168 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex GEMM routines take alpha and beta by address, hence the named
// constants: alpha = (-1,0) subtracts the product, beta = (1,0) keeps C.
2169 const complex<float> alpha( -1.0F, 0.0F );
2170 const complex<float> beta ( 1.0F, 0.0F );
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
2172 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2173 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2174 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2175 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction assignment, enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when all three
// matrices have the element type complex<double>.
//   C  Target matrix, updated in place through C.data().
//   A  Left-hand side operand of the product.
//   B  Right-hand side operand of the product.
2196 template<
typename MT3
2199 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2200 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2202 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
2211 const int M ( numeric_cast<int>( A.rows() ) );
2212 const int N ( numeric_cast<int>( B.columns() ) );
2213 const int K ( numeric_cast<int>( A.columns() ) );
2214 const int lda( numeric_cast<int>( A.spacing() ) );
2215 const int ldb( numeric_cast<int>( B.spacing() ) );
2216 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = (-1,0) subtracts the product, beta = (1,0) keeps the old C; both
// are passed by address as the complex GEMM interface requires.
2217 const complex<double> alpha( -1.0, 0.0 );
2218 const complex<double> beta ( 1.0, 0.0 );
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
2220 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2221 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2222 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2223 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Specialization of DMatScalarMultExpr for a scaled TDMatDMatMultExpr<MT1,MT2>
// with scalar type ST. It derives publicly from DenseMatrix and privately from
// the Expression and Computation tag types.
2269 template<
typename MT1
2273 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2274 ,
private Expression
2275 ,
private Computation
// Shorthand types for the wrapped multiplication expression (MMM, declared on
// a line not shown in this chunk) and for its two matrix operands.
2280 typedef typename MMM::ResultType RES;
2281 typedef typename MT1::ResultType
RT1;
2282 typedef typename MT2::ResultType
RT2;
2283 typedef typename MT1::CompositeType
CT1;
2284 typedef typename MT2::CompositeType
CT2;
// Compile-time switch for the single precision BLAS kernel: value is nonzero
// when the element types of T1, T2 and T3 all satisfy IsFloat and the scalar
// type T4 is not complex.
2292 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2293 struct UseSinglePrecisionKernel {
2294 enum { value = IsFloat<typename T1::ElementType>::value &&
2295 IsFloat<typename T2::ElementType>::value &&
2296 IsFloat<typename T3::ElementType>::value &&
2297 !IsComplex<T4>::value };
// Compile-time switch for the double precision BLAS kernel: value is nonzero
// when the element types of T1, T2 and T3 all satisfy IsDouble and the scalar
// type T4 is not complex.
2306 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2307 struct UseDoublePrecisionKernel {
2308 enum { value = IsDouble<typename T1::ElementType>::value &&
2309 IsDouble<typename T2::ElementType>::value &&
2310 IsDouble<typename T3::ElementType>::value &&
2311 !IsComplex<T4>::value };
// Compile-time switch for the single precision complex BLAS kernel: value is
// nonzero when T1, T2 and T3 all have the element type complex<float>.
// Unlike the real-valued switches, the scalar type is not part of the test.
2320 template<
typename T1,
typename T2,
typename T3 >
2321 struct UseSinglePrecisionComplexKernel {
2322 typedef complex<float> Type;
2323 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2324 IsSame<typename T2::ElementType,Type>::value &&
2325 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the double precision complex BLAS kernel: value is
// nonzero when T1, T2 and T3 all have the element type complex<double>.
// Unlike the real-valued switches, the scalar type is not part of the test.
2334 template<
typename T1,
typename T2,
typename T3 >
2335 struct UseDoublePrecisionComplexKernel {
2336 typedef complex<double> Type;
2337 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2338 IsSame<typename T2::ElementType,Type>::value &&
2339 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS fallback: value is nonzero when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel switches
// (single, double, complex single, complex double) applies to the given types.
2347 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2348 struct UseDefaultKernel {
2349 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2350 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2351 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2352 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch between the scalar and the vectorized default kernels.
// value is nonzero only if all three matrix types report vectorizable, all
// three share one element type, the scalar type T4 is that same element type,
// and IntrinsicTrait reports both addition and multiplication for it.
2360 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2361 struct UseVectorizedDefaultKernel {
2362 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2363 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2364 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2365 IsSame<typename T1::ElementType,T4>::value &&
2366 IntrinsicTrait<typename T1::ElementType>::addition &&
2367 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression object.
2373 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type of scaling the product's result type RES by ST, as given by
// MultTrait, plus the types derived from it.
2374 typedef typename MultTrait<RES,ST>::Type
ResultType;
2375 typedef typename ResultType::OppositeType
OppositeType;
2377 typedef typename ResultType::ElementType
ElementType;
2378 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Types used to hold the left and right operand during evaluation.
// NOTE(review): SelectType presumably yields its second argument when the
// condition is true, i.e. a computation operand is held as its (const) result
// type and any other operand as its composite type -- confirm.
2389 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
2392 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// The expression itself does not advertise vectorized element access.
2397 enum { vectorizable = 0 };
// Aliasing capability is inherited from the wrapped multiplication expression.
2400 enum { canAlias = CanAlias<MMM>::value };
// Constructor taking the wrapped matrix multiplication expression and the
// scalar factor; explicit, so it never acts as an implicit conversion. The
// initializer list and body are not visible in this chunk.
2409 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Return statement of the element access (enclosing signature not visible
// here): element (i,j) of the wrapped expression multiplied by the scalar.
2425 return matrix_(i,j) * scalar_;
// Returns the number of rows, taken directly from the wrapped expression.
2434 inline size_t rows()
const {
2435 return matrix_.rows();
// Returns the number of columns, taken directly from the wrapped expression.
2444 inline size_t columns()
const {
2445 return matrix_.columns();
// Reports whether this expression aliases the object at the given address.
//   alias  Address to check.
// Returns true only if the wrapped expression type can alias at all
// (CanAlias<MMM>) and the wrapped expression reports the address as aliased.
2475 template<
typename T >
2476 inline bool isAliased(
const T* alias )
const {
2477 return CanAlias<MMM>::value && matrix_.isAliased( alias );
// Assignment of the scaled product to a dense matrix.
//   lhs  Target dense matrix.
//   rhs  Scaled multiplication expression to evaluate.
2496 template<
typename MT3
2498 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Pull the two operands out of the wrapped multiplication expression.
2503 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2504 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no elements (branch body not visible here).
2506 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension is zero (branch body not visible here).
// This is handled before any kernel runs, which matters because the scalar
// default kernel reads A(i,0) and B(0,k) unconditionally.
2509 else if( left.columns() == 0UL ) {
// Dispatch to the default or the BLAS kernel selection; A and B and the
// condition choosing between the two calls are on lines not shown here.
2525 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2527 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Scalar (non-vectorized) default kernel for the assignment of a scaled
// product; selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false.
//   C       Target matrix.
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor.
2545 template<
typename MT3
2549 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2550 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// One target row at a time.
2552 for(
size_t i=0UL; i<A.rows(); ++i ) {
// The first term of each dot product overwrites C(i,k), so the target needs
// no prior reset; this reads column 0 of A and row 0 of B unconditionally.
2553 for(
size_t k=0UL; k<B.columns(); ++k ) {
2554 C(i,k) = A(i,0UL) * B(0UL,k);
// The remaining terms (j >= 1) are accumulated on top.
2556 for(
size_t j=1UL; j<A.columns(); ++j ) {
2557 for(
size_t k=0UL; k<B.columns(); ++k ) {
2558 C(i,k) += A(i,j) * B(j,k);
// Final pass over the row; its body is not visible in this chunk.
// NOTE(review): presumably applies the scalar factor to C(i,k) -- confirm.
2561 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default kernel for the assignment of a scaled product to a
// row-major dense matrix (DenseMatrix<MT3,false>); selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
//   C       Target matrix.
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor.
// The definitions of factor, a1/a2, b1..b4 and the loop counters are on lines
// not shown in this chunk. NOTE(review): factor is presumably the scalar
// broadcast into an IntrinsicType -- confirm.
2582 template<
typename MT3
2586 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2587 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2589 typedef IntrinsicTrait<ElementType> IT;
// The column bound N is B.spacing(), not B.columns().
// NOTE(review): this assumes the row storage of B and C is padded up to the
// spacing, so whole vectors can be loaded and stored -- confirm.
2591 const size_t M( A.rows() );
2592 const size_t N( B.spacing() );
2593 const size_t K( A.columns() );
// Column blocks of eight intrinsic vectors, one target row per iteration.
2599 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2600 for(
size_t i=0UL; i<M; ++i ) {
// The accumulators get no explicit start value here; the kernel relies on
// what IntrinsicType's default construction provides (presumably zero --
// confirm).
2601 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2602 for(
size_t k=0UL; k<K; ++k ) {
2604 xmm1 = xmm1 + a1 * B.get(k,j );
2605 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2606 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2607 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2608 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2609 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2610 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2611 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// The scaling is applied once per accumulated vector, at store time.
2613 store( &(~C)(i,j ), xmm1 * factor );
2614 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2615 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2616 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2617 store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2618 store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2619 store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2620 store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
// Column blocks of four intrinsic vectors, two target rows per iteration
// (xmm1..4 belong to row i, xmm5..8 to row i+1).
2623 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2625 for( ; (i+2UL) <= M; i+=2UL ) {
2626 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2627 for(
size_t k=0UL; k<K; ++k ) {
2634 xmm1 = xmm1 + a1 * b1;
2635 xmm2 = xmm2 + a1 * b2;
2636 xmm3 = xmm3 + a1 * b3;
2637 xmm4 = xmm4 + a1 * b4;
2638 xmm5 = xmm5 + a2 * b1;
2639 xmm6 = xmm6 + a2 * b2;
2640 xmm7 = xmm7 + a2 * b3;
2641 xmm8 = xmm8 + a2 * b4;
2643 store( &(~C)(i ,j ), xmm1 * factor );
2644 store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2645 store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2646 store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2647 store( &(~C)(i+1UL,j ), xmm5 * factor );
2648 store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2649 store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2650 store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
// Single leftover row of the four-vector block (its guard is not visible here).
2654 for(
size_t k=0UL; k<K; ++k ) {
2656 xmm1 = xmm1 + a1 * B.get(k,j );
2657 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2658 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2659 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2661 store( &(~C)(i,j ), xmm1 * factor );
2662 store( &(~C)(i,j+IT::size ), xmm2 * factor );
2663 store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2664 store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
// Column blocks of two intrinsic vectors, two target rows per iteration.
2667 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2669 for( ; (i+2UL) <= M; i+=2UL ) {
2671 for(
size_t k=0UL; k<K; ++k ) {
2676 xmm1 = xmm1 + a1 * b1;
2677 xmm2 = xmm2 + a1 * b2;
2678 xmm3 = xmm3 + a2 * b1;
2679 xmm4 = xmm4 + a2 * b2;
2681 store( &(~C)(i ,j ), xmm1 * factor );
2682 store( &(~C)(i ,j+IT::size), xmm2 * factor );
2683 store( &(~C)(i+1UL,j ), xmm3 * factor );
2684 store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
// Single leftover row of the two-vector block (its guard is not visible here).
2688 for(
size_t k=0UL; k<K; ++k ) {
2690 xmm1 = xmm1 + a1 * B.get(k,j );
2691 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2693 store( &(~C)(i,j ), xmm1 * factor );
2694 store( &(~C)(i,j+IT::size), xmm2 * factor );
// Remaining columns, one intrinsic vector wide (the enclosing column loop is
// not visible here): two target rows per iteration. Elements of A are
// broadcast with set() because A is read element-wise here.
2699 for( ; (i+2UL) <= M; i+=2UL ) {
2701 for(
size_t k=0UL; k<K; ++k ) {
2703 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2704 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2706 store( &(~C)(i ,j), xmm1 * factor );
2707 store( &(~C)(i+1UL,j), xmm2 * factor );
// Single leftover row of the one-vector block (its guard is not visible here).
2711 for(
size_t k=0UL; k<K; ++k ) {
2712 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
2714 store( &(~C)(i,j), xmm1 * factor );
// Vectorized default kernel for the assignment of a scaled product to a
// column-major dense matrix (DenseMatrix<MT3,true>); selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true. It mirrors the
// row-major kernel with the roles of rows and columns exchanged: vectors run
// down the columns of A and C, and elements of B are the broadcast operands.
//   C       Target matrix.
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor.
// The definitions of factor, a1..a4, b1/b2 and the loop counters are on lines
// not shown in this chunk. NOTE(review): factor is presumably the scalar
// broadcast into an IntrinsicType -- confirm.
2734 template<
typename MT3
2738 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2739 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2741 typedef IntrinsicTrait<ElementType> IT;
// The row bound M is A.spacing(), not A.rows().
// NOTE(review): this assumes the column storage of A and C is padded up to
// the spacing, so whole vectors can be loaded and stored -- confirm.
2743 const size_t M( A.spacing() );
2744 const size_t N( B.columns() );
2745 const size_t K( A.columns() );
// Row blocks of eight intrinsic vectors, one target column per iteration.
2751 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2752 for(
size_t j=0UL; j<N; ++j ) {
// The accumulators get no explicit start value here; the kernel relies on
// what IntrinsicType's default construction provides (presumably zero --
// confirm).
2753 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2754 for(
size_t k=0UL; k<K; ++k ) {
2756 xmm1 = xmm1 + A.get(i ,k) * b1;
2757 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2758 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2759 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2760 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2761 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2762 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2763 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// The scaling is applied once per accumulated vector, at store time.
2765 store( &(~C)(i ,j), xmm1 * factor );
2766 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2767 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2768 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2769 store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2770 store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2771 store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2772 store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
// Row blocks of four intrinsic vectors, two target columns per iteration
// (xmm1..4 belong to column j, xmm5..8 to column j+1).
2775 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2777 for( ; (j+2UL) <= N; j+=2UL ) {
2778 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2779 for(
size_t k=0UL; k<K; ++k ) {
2786 xmm1 = xmm1 + a1 * b1;
2787 xmm2 = xmm2 + a2 * b1;
2788 xmm3 = xmm3 + a3 * b1;
2789 xmm4 = xmm4 + a4 * b1;
2790 xmm5 = xmm5 + a1 * b2;
2791 xmm6 = xmm6 + a2 * b2;
2792 xmm7 = xmm7 + a3 * b2;
2793 xmm8 = xmm8 + a4 * b2;
2795 store( &(~C)(i ,j ), xmm1 * factor );
2796 store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2797 store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2798 store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2799 store( &(~C)(i ,j+1UL), xmm5 * factor );
2800 store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2801 store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2802 store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
// Single leftover column of the four-vector block (its guard is not visible
// here).
2806 for(
size_t k=0UL; k<K; ++k ) {
2808 xmm1 = xmm1 + A.get(i ,k) * b1;
2809 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2810 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2811 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2813 store( &(~C)(i ,j), xmm1 * factor );
2814 store( &(~C)(i+IT::size ,j), xmm2 * factor );
2815 store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2816 store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
// Row blocks of two intrinsic vectors, two target columns per iteration.
2819 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2821 for( ; (j+2UL) <= N; j+=2UL ) {
2823 for(
size_t k=0UL; k<K; ++k ) {
2828 xmm1 = xmm1 + a1 * b1;
2829 xmm2 = xmm2 + a2 * b1;
2830 xmm3 = xmm3 + a1 * b2;
2831 xmm4 = xmm4 + a2 * b2;
2833 store( &(~C)(i ,j ), xmm1 * factor );
2834 store( &(~C)(i+IT::size,j ), xmm2 * factor );
2835 store( &(~C)(i ,j+1UL), xmm3 * factor );
2836 store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
// Single leftover column of the two-vector block (its guard is not visible
// here).
2840 for(
size_t k=0UL; k<K; ++k ) {
2842 xmm1 = xmm1 + A.get(i ,k) * b1;
2843 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2845 store( &(~C)(i ,j), xmm1 * factor );
2846 store( &(~C)(i+IT::size,j), xmm2 * factor );
// Remaining rows, one intrinsic vector tall (the enclosing row loop is not
// visible here): two target columns per iteration. Elements of B are
// broadcast with set() because B is read element-wise here.
2851 for( ; (j+2UL) <= N; j+=2UL ) {
2853 for(
size_t k=0UL; k<K; ++k ) {
2855 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2856 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2858 store( &(~C)(i,j ), xmm1 * factor );
2859 store( &(~C)(i,j+1UL), xmm2 * factor );
// Single leftover column of the one-vector block (its guard is not visible
// here).
2863 for(
size_t k=0UL; k<K; ++k ) {
2864 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
2866 store( &(~C)(i,j), xmm1 * factor );
// Fallback overload of the BLAS kernel selection for the assignment of a
// scaled product. It is enabled only when UseDefaultKernel<MT3,MT4,MT5,ST2>
// is true and forwards all arguments to selectDefaultAssignKernel().
2886 template<
typename MT3
2890 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2891 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2893 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS-backed assignment of a scaled product, enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true (float elements and a
// non-complex scalar).
//   C       Target matrix, written through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, handed to GEMM as alpha.
2912 template<
typename MT3
2916 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2917 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2919 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
2925 const int M ( numeric_cast<int>( A.rows() ) );
2926 const int N ( numeric_cast<int>( B.columns() ) );
2927 const int K ( numeric_cast<int>( A.columns() ) );
2928 const int lda( numeric_cast<int>( A.spacing() ) );
2929 const int ldb( numeric_cast<int>( B.spacing() ) );
2930 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar and beta = 0: by the GEMM contract C is overwritten with
// scalar * op(A)*op(B), so the scaling costs no extra pass over C.
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
2932 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2933 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2934 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2935 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS-backed assignment of a scaled product, enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true (double elements and a
// non-complex scalar).
//   C       Target matrix, written through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, handed to GEMM as alpha.
2955 template<
typename MT3
2959 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2960 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2962 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
2968 const int M ( numeric_cast<int>( A.rows() ) );
2969 const int N ( numeric_cast<int>( B.columns() ) );
2970 const int K ( numeric_cast<int>( A.columns() ) );
2971 const int lda( numeric_cast<int>( A.spacing() ) );
2972 const int ldb( numeric_cast<int>( B.spacing() ) );
2973 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar and beta = 0: C is overwritten with scalar * op(A)*op(B).
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
2975 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2976 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2977 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2978 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS-backed assignment of a scaled product, enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when all three
// matrices have the element type complex<float>. The scalar type is not part
// of the enabling condition.
//   C       Target matrix, written through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, converted to complex<float> for alpha.
2998 template<
typename MT3
3002 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3003 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3005 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
3015 const int M ( numeric_cast<int>( A.rows() ) );
3016 const int N ( numeric_cast<int>( B.columns() ) );
3017 const int K ( numeric_cast<int>( A.columns() ) );
3018 const int lda( numeric_cast<int>( A.spacing() ) );
3019 const int ldb( numeric_cast<int>( B.spacing() ) );
3020 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha carries the scaling factor; beta = (0,0) makes GEMM overwrite C.
// Both are passed by address as the complex GEMM interface requires.
3021 const complex<float> alpha( scalar );
3022 const complex<float> beta ( 0.0F, 0.0F );
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
3024 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3025 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3026 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3027 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed assignment of a scaled product, enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when all three
// matrices have the element type complex<double>. The scalar type is not part
// of the enabling condition.
//   C       Target matrix, written through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, converted to complex<double> for alpha.
3047 template<
typename MT3
3051 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3052 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3054 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
3064 const int M ( numeric_cast<int>( A.rows() ) );
3065 const int N ( numeric_cast<int>( B.columns() ) );
3066 const int K ( numeric_cast<int>( A.columns() ) );
3067 const int lda( numeric_cast<int>( A.spacing() ) );
3068 const int ldb( numeric_cast<int>( B.spacing() ) );
3069 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha carries the scaling factor; beta = (0,0) makes GEMM overwrite C.
3070 const complex<double> alpha( scalar );
3071 const complex<double> beta ( 0.0, 0.0 );
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
3073 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3074 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3075 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3076 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of an assignment overload whose signature is not visible in this
// chunk. NOTE(review): presumably the assignment to a sparse matrix -- confirm.
3093 template<
typename MT
// The temporary's type is chosen from ResultType and OppositeType by the
// storage order flag SO.
3097 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// The whole expression is evaluated into a temporary first; what is done with
// it afterwards is on lines not shown here.
3109 const TmpType tmp( rhs );
// Addition assignment of the scaled product to a dense matrix.
//   lhs  Target dense matrix.
//   rhs  Scaled multiplication expression to add.
3126 template<
typename MT3
3128 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Pull the two operands out of the wrapped multiplication expression.
3133 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3134 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// An empty target or an empty inner dimension share one branch here, unlike
// in plain assignment where they are tested separately (branch body not
// visible in this chunk).
3136 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the default or the BLAS kernel selection; A and B and the
// condition choosing between the two calls are on lines not shown here.
3151 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3153 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Scalar (non-vectorized) default kernel for the addition assignment of a
// scaled product; selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// is false. Only the signature is visible in this chunk.
//   C       Target matrix.
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor.
3171 template<
typename MT3
3175 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3176 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default kernel for the addition assignment of a scaled product
// to a row-major dense matrix (DenseMatrix<MT3,false>); selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true. The blocking is the
// same as in the plain assignment kernel; the difference is that every store
// first loads the current target vector and adds the scaled sum to it.
//   C       Target matrix.
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor.
// The definitions of factor, a1/a2, b1..b4 and the loop counters are on lines
// not shown in this chunk. NOTE(review): factor is presumably the scalar
// broadcast into an IntrinsicType -- confirm.
3197 template<
typename MT3
3201 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3202 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3204 typedef IntrinsicTrait<ElementType> IT;
// The column bound N is B.spacing(), not B.columns().
// NOTE(review): this assumes padded row storage in B and C -- confirm.
3206 const size_t M( A.rows() );
3207 const size_t N( B.spacing() );
3208 const size_t K( A.columns() );
// Column blocks of eight intrinsic vectors, one target row per iteration.
3214 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3215 for(
size_t i=0UL; i<M; ++i ) {
// The accumulators get no explicit start value here; the kernel relies on
// what IntrinsicType's default construction provides (presumably zero --
// confirm).
3216 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3217 for(
size_t k=0UL; k<K; ++k ) {
3219 xmm1 = xmm1 + a1 * B.get(k,j );
3220 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3221 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3222 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3223 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3224 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3225 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3226 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Read-modify-write: the old target value plus the scaled partial product.
3228 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3229 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3230 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3231 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3232 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
3233 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
3234 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
3235 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
// Column blocks of four intrinsic vectors, two target rows per iteration
// (xmm1..4 belong to row i, xmm5..8 to row i+1).
3238 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3240 for( ; (i+2UL) <= M; i+=2UL ) {
3241 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3242 for(
size_t k=0UL; k<K; ++k ) {
3249 xmm1 = xmm1 + a1 * b1;
3250 xmm2 = xmm2 + a1 * b2;
3251 xmm3 = xmm3 + a1 * b3;
3252 xmm4 = xmm4 + a1 * b4;
3253 xmm5 = xmm5 + a2 * b1;
3254 xmm6 = xmm6 + a2 * b2;
3255 xmm7 = xmm7 + a2 * b3;
3256 xmm8 = xmm8 + a2 * b4;
3258 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3259 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
3260 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
3261 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
3262 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
3263 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
3264 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
3265 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
// Single leftover row of the four-vector block (its guard is not visible here).
3269 for(
size_t k=0UL; k<K; ++k ) {
3271 xmm1 = xmm1 + a1 * B.get(k,j );
3272 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3273 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3274 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3276 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3277 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3278 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3279 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
// Column blocks of two intrinsic vectors, two target rows per iteration.
3282 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3284 for( ; (i+2UL) <= M; i+=2UL ) {
3286 for(
size_t k=0UL; k<K; ++k ) {
3291 xmm1 = xmm1 + a1 * b1;
3292 xmm2 = xmm2 + a1 * b2;
3293 xmm3 = xmm3 + a2 * b1;
3294 xmm4 = xmm4 + a2 * b2;
3296 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3297 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
3298 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
3299 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
// Single leftover row of the two-vector block (its guard is not visible here).
3303 for(
size_t k=0UL; k<K; ++k ) {
3305 xmm1 = xmm1 + a1 * B.get(k,j );
3306 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3308 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3309 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
// Remaining columns, one intrinsic vector wide (the enclosing column loop is
// not visible here): two target rows per iteration, with the elements of A
// broadcast through set().
3314 for( ; (i+2UL) <= M; i+=2UL ) {
3316 for(
size_t k=0UL; k<K; ++k ) {
3318 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3319 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3321 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3322 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) + xmm2 * factor );
// Single leftover row of the one-vector block (its guard is not visible here).
3326 for(
size_t k=0UL; k<K; ++k ) {
3327 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3329 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Vectorized default kernel for the addition assignment of a scaled product
// to a column-major dense matrix (DenseMatrix<MT3,true>); selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true. It mirrors the
// row-major kernel with rows and columns exchanged; every store first loads
// the current target vector and adds the scaled sum to it.
//   C       Target matrix.
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor.
// The definitions of factor, a1..a4, b1/b2 and the loop counters are on lines
// not shown in this chunk. NOTE(review): factor is presumably the scalar
// broadcast into an IntrinsicType -- confirm.
3349 template<
typename MT3
3353 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3354 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3356 typedef IntrinsicTrait<ElementType> IT;
// The row bound M is A.spacing(), not A.rows().
// NOTE(review): this assumes padded column storage in A and C -- confirm.
3358 const size_t M( A.spacing() );
3359 const size_t N( B.columns() );
3360 const size_t K( A.columns() );
// Row blocks of eight intrinsic vectors, one target column per iteration.
3366 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3367 for(
size_t j=0UL; j<N; ++j ) {
// The accumulators get no explicit start value here; the kernel relies on
// what IntrinsicType's default construction provides (presumably zero --
// confirm).
3368 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3369 for(
size_t k=0UL; k<K; ++k ) {
3371 xmm1 = xmm1 + A.get(i ,k) * b1;
3372 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3373 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3374 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3375 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3376 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3377 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3378 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Read-modify-write: the old target value plus the scaled partial product.
3380 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3381 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3382 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3383 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3384 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
3385 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
3386 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
3387 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
// Row blocks of four intrinsic vectors, two target columns per iteration
// (xmm1..4 belong to column j, xmm5..8 to column j+1).
3390 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3392 for( ; (j+2UL) <= N; j+=2UL ) {
3393 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3394 for(
size_t k=0UL; k<K; ++k ) {
3401 xmm1 = xmm1 + a1 * b1;
3402 xmm2 = xmm2 + a2 * b1;
3403 xmm3 = xmm3 + a3 * b1;
3404 xmm4 = xmm4 + a4 * b1;
3405 xmm5 = xmm5 + a1 * b2;
3406 xmm6 = xmm6 + a2 * b2;
3407 xmm7 = xmm7 + a3 * b2;
3408 xmm8 = xmm8 + a4 * b2;
3410 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3411 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
3412 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
3413 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
3414 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
3415 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
3416 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
3417 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
// Single leftover column of the four-vector block (its guard is not visible
// here).
3421 for(
size_t k=0UL; k<K; ++k ) {
3423 xmm1 = xmm1 + A.get(i ,k) * b1;
3424 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3425 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3426 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3428 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3429 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3430 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3431 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
// Row blocks of two intrinsic vectors, two target columns per iteration.
3434 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3436 for( ; (j+2UL) <= N; j+=2UL ) {
3438 for(
size_t k=0UL; k<K; ++k ) {
3443 xmm1 = xmm1 + a1 * b1;
3444 xmm2 = xmm2 + a2 * b1;
3445 xmm3 = xmm3 + a1 * b2;
3446 xmm4 = xmm4 + a2 * b2;
3448 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) + xmm1 * factor );
3449 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
3450 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
3451 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
// Single leftover column of the two-vector block (its guard is not visible
// here).
3455 for(
size_t k=0UL; k<K; ++k ) {
3457 xmm1 = xmm1 + A.get(i ,k) * b1;
3458 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3460 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) + xmm1 * factor );
3461 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
// Remaining rows, one intrinsic vector tall (the enclosing row loop is not
// visible here): two target columns per iteration, with the elements of B
// broadcast through set().
3466 for( ; (j+2UL) <= N; j+=2UL ) {
3468 for(
size_t k=0UL; k<K; ++k ) {
3470 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3471 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3473 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) + xmm1 * factor );
3474 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) + xmm2 * factor );
// Single leftover column of the one-vector block (its guard is not visible
// here).
3478 for(
size_t k=0UL; k<K; ++k ) {
3479 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
3481 store( &(~C)(i,j),
load( &(~C)(i,j) ) + xmm1 * factor );
// Fallback overload of the BLAS kernel selection for the addition assignment
// of a scaled product. It is enabled only when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is true and forwards all arguments to
// selectDefaultAddAssignKernel().
3501 template<
typename MT3
3505 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3506 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3508 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS-backed addition assignment of a scaled product, enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true (float elements and a
// non-complex scalar).
//   C       Target matrix, updated in place through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, handed to GEMM as alpha.
3527 template<
typename MT3
3531 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3532 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3534 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
3540 const int M ( numeric_cast<int>( A.rows() ) );
3541 const int N ( numeric_cast<int>( B.columns() ) );
3542 const int K ( numeric_cast<int>( A.columns() ) );
3543 const int lda( numeric_cast<int>( A.spacing() ) );
3544 const int ldb( numeric_cast<int>( B.spacing() ) );
3545 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar and beta = 1: by the GEMM contract C becomes
// C + scalar * op(A)*op(B).
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
3547 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3548 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3549 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3550 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed addition assignment of a scaled product, enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true (double elements and a
// non-complex scalar).
//   C       Target matrix, updated in place through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, handed to GEMM as alpha.
3570 template<
typename MT3
3574 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3575 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3577 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
3583 const int M ( numeric_cast<int>( A.rows() ) );
3584 const int N ( numeric_cast<int>( B.columns() ) );
3585 const int K ( numeric_cast<int>( A.columns() ) );
3586 const int lda( numeric_cast<int>( A.spacing() ) );
3587 const int ldb( numeric_cast<int>( B.spacing() ) );
3588 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar and beta = 1: C becomes C + scalar * op(A)*op(B).
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
3590 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3591 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3592 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3593 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed addition assignment of a scaled product, enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when all three
// matrices have the element type complex<float>. The scalar type is not part
// of the enabling condition.
//   C       Target matrix, updated in place through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, converted to complex<float> for alpha.
3613 template<
typename MT3
3617 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3618 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3620 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
3630 const int M ( numeric_cast<int>( A.rows() ) );
3631 const int N ( numeric_cast<int>( B.columns() ) );
3632 const int K ( numeric_cast<int>( A.columns() ) );
3633 const int lda( numeric_cast<int>( A.spacing() ) );
3634 const int ldb( numeric_cast<int>( B.spacing() ) );
3635 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha carries the scaling factor; beta = (1,0) keeps the old C so the
// product is added to it. Both are passed by address.
3636 const complex<float> alpha( scalar );
3637 const complex<float> beta ( 1.0F, 0.0F );
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
3639 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3640 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3641 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3642 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed addition assignment of a scaled product, enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. when all three
// matrices have the element type complex<double>. The scalar type is not part
// of the enabling condition.
//   C       Target matrix, updated in place through C.data().
//   A       Left-hand side operand.
//   B       Right-hand side operand.
//   scalar  Scaling factor, converted to complex<double> for alpha.
3662 template<
typename MT3
3666 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3667 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3669 using boost::numeric_cast;
// Range-checked size_t -> int conversions for the CBLAS interface.
3679 const int M ( numeric_cast<int>( A.rows() ) );
3680 const int N ( numeric_cast<int>( B.columns() ) );
3681 const int K ( numeric_cast<int>( A.columns() ) );
3682 const int lda( numeric_cast<int>( A.spacing() ) );
3683 const int ldb( numeric_cast<int>( B.spacing() ) );
3684 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha carries the scaling factor; beta = (1,0) keeps the old C so the
// product is added to it.
3685 const complex<double> alpha( scalar );
3686 const complex<double> beta ( 1.0, 0.0 );
// Row-major target: A transposed, B untouched; column-major target: the
// reverse.
3688 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3689 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3690 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3691 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of a scaled matrix product (DMatScalarMultExpr) to a
// dense matrix.
//   lhs  target dense matrix; accessed through (~lhs)
//   rhs  scaled product; supplies the two operands via rhs.matrix_ and the
//        scaling factor via rhs.scalar_
// NOTE(review): several lines of this definition are not visible in this
// excerpt, among them the definitions of A and B (presumably the evaluated
// left and right operand) and the condition that chooses between the two
// kernel calls below.
3712 template<
typename MT3
3714 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
3719 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3720 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Detects an empty target or an empty inner dimension; the action taken on
// this branch is not visible here (presumably an early return).
3722 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Two alternative code paths: the default kernels or the BLAS kernel selection.
3737 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3739 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default subtraction-assignment kernel for the scaled product. This overload
// is removed from overload resolution (DisableIf) whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds, so it covers the
// non-vectorizable operand combinations.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
// NOTE(review): only the signature is visible in this excerpt; the body and
// the remaining template parameters are not shown.
3757 template<
typename MT3
3761 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3762 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction-assignment kernel for a row-major target
// (C is a DenseMatrix<MT3,false>). Enabled through EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every section below follows the same pattern: accumulate products over the
// inner index k in intrinsic accumulators (xmm1..xmm8), then write back
// load(C) - accumulator * factor with store().
//   C       target matrix, updated in place
//   A, B    left and right operand of the product
//   scalar  scaling factor
// NOTE(review): several lines are not visible in this excerpt, among them the
// definitions of factor, a1, a2 and b1..b4 and some loop/guard headers.
// factor is presumably the broadcast scalar, a1/a2 broadcast elements of A and
// b1..b4 vectors loaded from B - confirm against the full source.
3783 template<
typename MT3
3787 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3788 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3790 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds. Note that N is taken from B.spacing(), not B.columns(), so the
// column loops run over the spacing of B (presumably including padding).
3792 const size_t M( A.rows() );
3793 const size_t N( B.spacing() );
3794 const size_t K( A.columns() );
// Column blocks of eight intrinsic vectors, processed one row of C at a time.
3800 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3801 for(
size_t i=0UL; i<M; ++i ) {
3802 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3803 for(
size_t k=0UL; k<K; ++k ) {
3805 xmm1 = xmm1 + a1 * B.get(k,j );
3806 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3807 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3808 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3809 xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3810 xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3811 xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3812 xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
// Subtract the scaled accumulators from the eight vectors of row i.
3814 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3815 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3816 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3817 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3818 store( &(~C)(i,j+IT::size*4UL),
load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3819 store( &(~C)(i,j+IT::size*5UL),
load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3820 store( &(~C)(i,j+IT::size*6UL),
load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3821 store( &(~C)(i,j+IT::size*7UL),
load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
// Column blocks of four intrinsic vectors.
3824 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
// Two rows per iteration: xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
3826 for( ; (i+2UL) <= M; i+=2UL ) {
3827 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3828 for(
size_t k=0UL; k<K; ++k ) {
3835 xmm1 = xmm1 + a1 * b1;
3836 xmm2 = xmm2 + a1 * b2;
3837 xmm3 = xmm3 + a1 * b3;
3838 xmm4 = xmm4 + a1 * b4;
3839 xmm5 = xmm5 + a2 * b1;
3840 xmm6 = xmm6 + a2 * b2;
3841 xmm7 = xmm7 + a2 * b3;
3842 xmm8 = xmm8 + a2 * b4;
3844 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3845 store( &(~C)(i ,j+IT::size ),
load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3846 store( &(~C)(i ,j+IT::size*2UL),
load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3847 store( &(~C)(i ,j+IT::size*3UL),
load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3848 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3849 store( &(~C)(i+1UL,j+IT::size ),
load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3850 store( &(~C)(i+1UL,j+IT::size*2UL),
load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3851 store( &(~C)(i+1UL,j+IT::size*3UL),
load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
// Single-row variant of the four-vector block (its guard is not visible here;
// presumably it handles the leftover row when M is odd).
3855 for(
size_t k=0UL; k<K; ++k ) {
3857 xmm1 = xmm1 + a1 * B.get(k,j );
3858 xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3859 xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3860 xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3862 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3863 store( &(~C)(i,j+IT::size ),
load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3864 store( &(~C)(i,j+IT::size*2UL),
load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3865 store( &(~C)(i,j+IT::size*3UL),
load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
// Column blocks of two intrinsic vectors.
3868 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
// Two rows per iteration: xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1.
3870 for( ; (i+2UL) <= M; i+=2UL ) {
3872 for(
size_t k=0UL; k<K; ++k ) {
3877 xmm1 = xmm1 + a1 * b1;
3878 xmm2 = xmm2 + a1 * b2;
3879 xmm3 = xmm3 + a2 * b1;
3880 xmm4 = xmm4 + a2 * b2;
3882 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3883 store( &(~C)(i ,j+IT::size),
load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3884 store( &(~C)(i+1UL,j ),
load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3885 store( &(~C)(i+1UL,j+IT::size),
load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
// Single-row variant of the two-vector block (guard not visible here).
3889 for(
size_t k=0UL; k<K; ++k ) {
3891 xmm1 = xmm1 + a1 * B.get(k,j );
3892 xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3894 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
3895 store( &(~C)(i,j+IT::size),
load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
// One intrinsic vector per column step (the enclosing j loop header is not
// visible here). Two rows per iteration; elements of A are broadcast via set().
3900 for( ; (i+2UL) <= M; i+=2UL ) {
3902 for(
size_t k=0UL; k<K; ++k ) {
3904 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3905 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3907 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3908 store( &(~C)(i+1UL,j),
load( &(~C)(i+1UL,j) ) - xmm2 * factor );
// Single row, single vector (guard not visible here).
3912 for(
size_t k=0UL; k<K; ++k ) {
3913 xmm1 = xmm1 +
set( A(i,k) ) * B.get(k,j);
3915 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Vectorized default subtraction-assignment kernel for a column-major target
// (C is a DenseMatrix<MT3,true>). Enabled through EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Mirror image of the row-major kernel: the vectors run along the rows of A
// and C, and every section accumulates products over the inner index k in
// intrinsic accumulators (xmm1..xmm8) before writing back
// load(C) - accumulator * factor with store().
//   C       target matrix, updated in place
//   A, B    left and right operand of the product
//   scalar  scaling factor
// NOTE(review): several lines are not visible in this excerpt, among them the
// definitions of factor, a1..a4 and b1, b2 and some loop/guard headers.
// factor is presumably the broadcast scalar, a1..a4 vectors loaded from A and
// b1/b2 broadcast elements of B - confirm against the full source.
3935 template<
typename MT3
3939 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3940 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3942 typedef IntrinsicTrait<ElementType> IT;
// Loop bounds. Note that M is taken from A.spacing(), not A.rows(), so the
// row loops run over the spacing of A (presumably including padding).
3944 const size_t M( A.spacing() );
3945 const size_t N( B.columns() );
3946 const size_t K( A.columns() );
// Row blocks of eight intrinsic vectors, processed one column of C at a time.
3952 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3953 for(
size_t j=0UL; j<N; ++j ) {
3954 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3955 for(
size_t k=0UL; k<K; ++k ) {
3957 xmm1 = xmm1 + A.get(i ,k) * b1;
3958 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3959 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3960 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3961 xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3962 xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3963 xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3964 xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
// Subtract the scaled accumulators from the eight vectors of column j.
3966 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
3967 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3968 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3969 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
3970 store( &(~C)(i+IT::size*4UL,j),
load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
3971 store( &(~C)(i+IT::size*5UL,j),
load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
3972 store( &(~C)(i+IT::size*6UL,j),
load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
3973 store( &(~C)(i+IT::size*7UL,j),
load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
// Row blocks of four intrinsic vectors.
3976 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
// Two columns per iteration: xmm1..xmm4 belong to column j, xmm5..xmm8 to j+1.
3978 for( ; (j+2UL) <= N; j+=2UL ) {
3979 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3980 for(
size_t k=0UL; k<K; ++k ) {
3987 xmm1 = xmm1 + a1 * b1;
3988 xmm2 = xmm2 + a2 * b1;
3989 xmm3 = xmm3 + a3 * b1;
3990 xmm4 = xmm4 + a4 * b1;
3991 xmm5 = xmm5 + a1 * b2;
3992 xmm6 = xmm6 + a2 * b2;
3993 xmm7 = xmm7 + a3 * b2;
3994 xmm8 = xmm8 + a4 * b2;
3996 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
3997 store( &(~C)(i+IT::size ,j ),
load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
3998 store( &(~C)(i+IT::size*2UL,j ),
load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
3999 store( &(~C)(i+IT::size*3UL,j ),
load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
4000 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
4001 store( &(~C)(i+IT::size ,j+1UL),
load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
4002 store( &(~C)(i+IT::size*2UL,j+1UL),
load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
4003 store( &(~C)(i+IT::size*3UL,j+1UL),
load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
// Single-column variant of the four-vector block (its guard is not visible
// here; presumably it handles the leftover column when N is odd).
4007 for(
size_t k=0UL; k<K; ++k ) {
4009 xmm1 = xmm1 + A.get(i ,k) * b1;
4010 xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
4011 xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
4012 xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
4014 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4015 store( &(~C)(i+IT::size ,j),
load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4016 store( &(~C)(i+IT::size*2UL,j),
load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4017 store( &(~C)(i+IT::size*3UL,j),
load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
// Row blocks of two intrinsic vectors.
4020 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
// Two columns per iteration: xmm1/xmm2 belong to column j, xmm3/xmm4 to j+1.
4022 for( ; (j+2UL) <= N; j+=2UL ) {
4024 for(
size_t k=0UL; k<K; ++k ) {
4029 xmm1 = xmm1 + a1 * b1;
4030 xmm2 = xmm2 + a2 * b1;
4031 xmm3 = xmm3 + a1 * b2;
4032 xmm4 = xmm4 + a2 * b2;
4034 store( &(~C)(i ,j ),
load( &(~C)(i ,j ) ) - xmm1 * factor );
4035 store( &(~C)(i+IT::size,j ),
load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
4036 store( &(~C)(i ,j+1UL),
load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
4037 store( &(~C)(i+IT::size,j+1UL),
load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
// Single-column variant of the two-vector block (guard not visible here).
4041 for(
size_t k=0UL; k<K; ++k ) {
4043 xmm1 = xmm1 + A.get(i ,k) * b1;
4044 xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
4046 store( &(~C)(i ,j),
load( &(~C)(i ,j) ) - xmm1 * factor );
4047 store( &(~C)(i+IT::size,j),
load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
// One intrinsic vector per row step (the enclosing i loop header is not
// visible here). Two columns per iteration; elements of B are broadcast via set().
4052 for( ; (j+2UL) <= N; j+=2UL ) {
4054 for(
size_t k=0UL; k<K; ++k ) {
4056 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4057 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4059 store( &(~C)(i,j ),
load( &(~C)(i,j ) ) - xmm1 * factor );
4060 store( &(~C)(i,j+1UL),
load( &(~C)(i,j+1UL) ) - xmm2 * factor );
// Single column, single vector (guard not visible here).
4064 for(
size_t k=0UL; k<K; ++k ) {
4065 xmm1 = xmm1 + A.get(i,k) *
set( B(k,j) );
4067 store( &(~C)(i,j),
load( &(~C)(i,j) ) - xmm1 * factor );
// Fallback of the BLAS subtraction-assignment kernel selection. Enabled
// through EnableIf when UseDefaultKernel<MT3,MT4,MT5,ST2> holds and simply
// forwards all arguments to selectDefaultSubAssignKernel.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
// NOTE(review): the remaining template parameters and the braces of this
// definition are not visible in this excerpt.
4087 template<
typename MT3
4091 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4092 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4094 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction-assignment kernel for the single precision case.
// Enabled through EnableIf when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// Calls cblas_sgemm with alpha = -scalar and beta = 1.0F, i.e. the scaled
// product of A and B is subtracted from the existing contents of C.
//   C       target matrix (read and written through C.data())
//   A, B    left and right operand of the product
//   scalar  scaling factor; its negation is forwarded as alpha
// NOTE(review): the remaining template parameters and the braces of this
// definition are not visible in this excerpt.
4113 template<
typename MT3
4117 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4118 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4120 using boost::numeric_cast;
// CBLAS expects int dimensions and leading dimensions; numeric_cast performs
// a range-checked conversion of the size values.
4126 const int M ( numeric_cast<int>( A.rows() ) );
4127 const int N ( numeric_cast<int>( B.columns() ) );
4128 const int K ( numeric_cast<int>( A.columns() ) );
4129 const int lda( numeric_cast<int>( A.spacing() ) );
4130 const int ldb( numeric_cast<int>( B.spacing() ) );
4131 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows MT3; the transposition flags of A and B are swapped
// between the row-major and the column-major case. The subtraction is
// expressed through the negated alpha.
4133 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4134 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4135 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4136 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for the double precision case.
// Enabled through EnableIf when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// Calls cblas_dgemm with alpha = -scalar and beta = 1.0, i.e. the scaled
// product of A and B is subtracted from the existing contents of C.
//   C       target matrix (read and written through C.data())
//   A, B    left and right operand of the product
//   scalar  scaling factor; its negation is forwarded as alpha
// NOTE(review): the remaining template parameters and the braces of this
// definition are not visible in this excerpt.
4156 template<
typename MT3
4160 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4161 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4163 using boost::numeric_cast;
// CBLAS expects int dimensions and leading dimensions; numeric_cast performs
// a range-checked conversion of the size values.
4169 const int M ( numeric_cast<int>( A.rows() ) );
4170 const int N ( numeric_cast<int>( B.columns() ) );
4171 const int K ( numeric_cast<int>( A.columns() ) );
4172 const int lda( numeric_cast<int>( A.spacing() ) );
4173 const int ldb( numeric_cast<int>( B.spacing() ) );
4174 const int ldc( numeric_cast<int>( C.spacing() ) );
// Storage order follows MT3; the transposition flags of A and B are swapped
// between the row-major and the column-major case. The subtraction is
// expressed through the negated alpha.
4176 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4177 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4178 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4179 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for the single precision complex
// case. Enabled through EnableIf when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds (all three element types
// are complex<float>).
// Calls cblas_cgemm with alpha = -scalar and beta = (1,0), i.e. the scaled
// product of A and B is subtracted from the existing contents of C.
//   C       target matrix (read and written through C.data())
//   A, B    left and right operand of the product
//   scalar  scaling factor; its negation is converted to complex<float>
// NOTE(review): the remaining template parameters and the braces of this
// definition are not visible in this excerpt.
4199 template<
typename MT3
4203 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4204 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4206 using boost::numeric_cast;
// CBLAS expects int dimensions and leading dimensions; numeric_cast performs
// a range-checked conversion of the size values.
4216 const int M ( numeric_cast<int>( A.rows() ) );
4217 const int N ( numeric_cast<int>( B.columns() ) );
4218 const int K ( numeric_cast<int>( A.columns() ) );
4219 const int lda( numeric_cast<int>( A.spacing() ) );
4220 const int ldb( numeric_cast<int>( B.spacing() ) );
4221 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the locals.
// The negated scalar turns the accumulation into a subtraction.
4222 const complex<float> alpha( -scalar );
4223 const complex<float> beta ( 1.0F, 0.0F );
// Storage order follows MT3; the transposition flags of A and B are swapped
// between the row-major and the column-major case.
4225 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4226 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4227 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4228 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for the double precision complex
// case. Enabled through EnableIf when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds (all three element types
// are complex<double>).
// Calls cblas_zgemm with alpha = -scalar and beta = (1,0), i.e. the scaled
// product of A and B is subtracted from the existing contents of C.
//   C       target matrix (read and written through C.data())
//   A, B    left and right operand of the product
//   scalar  scaling factor; its negation is converted to complex<double>
// NOTE(review): the remaining template parameters and the braces of this
// definition are not visible in this excerpt.
4248 template<
typename MT3
4252 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4253 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4255 using boost::numeric_cast;
// CBLAS expects int dimensions and leading dimensions; numeric_cast performs
// a range-checked conversion of the size values.
4265 const int M ( numeric_cast<int>( A.rows() ) );
4266 const int N ( numeric_cast<int>( B.columns() ) );
4267 const int K ( numeric_cast<int>( A.columns() ) );
4268 const int lda( numeric_cast<int>( A.spacing() ) );
4269 const int ldb( numeric_cast<int>( B.spacing() ) );
4270 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the locals.
// The negated scalar turns the accumulation into a subtraction.
4271 const complex<double> alpha( -scalar );
4272 const complex<double> beta ( 1.0, 0.0 );
// Storage order follows MT3; the transposition flags of A and B are swapped
// between the row-major and the column-major case.
4274 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4275 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4276 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4277 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Fragment of a function template that reports a size mismatch by throwing
// std::invalid_argument.
// NOTE(review): the function name, its parameters and the condition guarding
// the throw are not visible in this excerpt; presumably this is the
// multiplication operator that creates the matrix product expression and
// checks that the operand sizes are compatible - confirm against the full source.
4345 template<
typename T1
4351 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait fragment for a product of two matrices (MT1, MT2) applied
// to a vector VT. The nested Type is chosen by SelectType between
//   - TDMatDVecMultExprTrait< MT1, DMatDVecMultExprTrait<MT2,VT>::Type >::Type,
//     i.e. the type of MT1 * ( MT2 * VT ), and
//   - INVALID_TYPE.
// The condition requires MT1 to be a dense column-major matrix, MT2 a dense
// row-major matrix and VT a dense, non-transpose vector.
// NOTE(review): the struct header naming the specialization is not visible in
// this excerpt; SelectType presumably yields the first alternative when the
// condition is true.
4368 template<
typename MT1,
typename MT2,
typename VT >
4373 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4374 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4375 IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4376 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4377 , INVALID_TYPE >::Type Type;
// Expression trait fragment for a product of two matrices (MT1, MT2) applied
// to a sparse vector VT. The nested Type is chosen by SelectType between
//   - TDMatDVecMultExprTrait< MT1, DMatSVecMultExprTrait<MT2,VT>::Type >::Type,
//     i.e. the type of MT1 * ( MT2 * VT ), and
//   - INVALID_TYPE.
// The condition requires MT1 to be a dense column-major matrix, MT2 a dense
// row-major matrix and VT a sparse, non-transpose vector.
// NOTE(review): the struct header naming the specialization is not visible in
// this excerpt; SelectType presumably yields the first alternative when the
// condition is true.
4386 template<
typename MT1,
typename MT2,
typename VT >
4391 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4392 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4393 IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4394 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4395 , INVALID_TYPE >::Type Type;
// Expression trait fragment for a transpose dense vector VT multiplied by a
// product of two matrices (MT1, MT2). The nested Type is chosen by SelectType
// between
//   - TDVecDMatMultExprTrait< TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type,
//     i.e. the type of ( VT * MT1 ) * MT2, and
//   - INVALID_TYPE.
// The condition requires VT to be a dense transpose vector, MT1 a dense
// column-major matrix and MT2 a dense row-major matrix.
// NOTE(review): the struct header naming the specialization is not visible in
// this excerpt; SelectType presumably yields the first alternative when the
// condition is true.
4404 template<
typename VT,
typename MT1,
typename MT2 >
4409 typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4410 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4411 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4412 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4413 , INVALID_TYPE >::Type Type;
// Expression trait fragment for a transpose sparse vector VT multiplied by a
// product of two matrices (MT1, MT2). The nested Type is chosen by SelectType
// between
//   - TDVecDMatMultExprTrait< TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type,
//     i.e. the type of ( VT * MT1 ) * MT2, and
//   - INVALID_TYPE.
// The condition requires VT to be a sparse transpose vector, MT1 a dense
// column-major matrix and MT2 a dense row-major matrix.
// NOTE(review): the struct header naming the specialization is not visible in
// this excerpt; SelectType presumably yields the first alternative when the
// condition is true.
4422 template<
typename VT,
typename MT1,
typename MT2 >
4427 typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4428 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4429 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4430 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4431 , INVALID_TYPE >::Type Type;