35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
112 template<
typename MT1
114 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
115 ,
private MatMatMultExpr
116 ,
private Computation
131 template<
typename T1,
typename T2,
typename T3 >
132 struct UseSinglePrecisionKernel {
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseDoublePrecisionKernel {
// Compile-time predicate: `value` is non-zero only when the element types of
// all three matrix types T1, T2 and T3 are exactly complex<float>. In this
// file it is used with EnableIf to select the kernels that call cblas_cgemm.
160 template<
typename T1,
typename T2,
typename T3 >
161 struct UseSinglePrecisionComplexKernel {
// Element type that all three operands have to share.
162 typedef complex<float> Type;
163 enum { value = IsSame<typename T1::ElementType,Type>::value &&
164 IsSame<typename T2::ElementType,Type>::value &&
165 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: `value` is non-zero only when the element types of
// all three matrix types T1, T2 and T3 are exactly complex<double>. In this
// file it is used with EnableIf to select the kernels that call cblas_zgemm.
176 template<
typename T1,
typename T2,
typename T3 >
177 struct UseDoublePrecisionComplexKernel {
// Element type that all three operands have to share.
178 typedef complex<double> Type;
179 enum { value = IsSame<typename T1::ElementType,Type>::value &&
180 IsSame<typename T2::ElementType,Type>::value &&
181 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the non-BLAS fallback: `value` is non-zero
// when BLAZE_BLAS_MODE evaluates to false, or when none of the four BLAS
// predicates (single, double, single complex, double complex) holds for the
// type triple T1, T2, T3.
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseDefaultKernel {
193 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
194 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
195 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
196 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the vectorized default kernels: `value` is
// non-zero when all three types report `vectorizable`, T2 and T3 have the same
// element type as T1, and IntrinsicTrait of that element type reports both
// `addition` and `multiplication`.
206 template<
typename T1,
typename T2,
typename T3 >
207 struct UseVectorizedDefaultKernel {
208 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
209 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
210 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
211 IntrinsicTrait<typename T1::ElementType>::addition &&
212 IntrinsicTrait<typename T1::ElementType>::multiplication };
243 enum { vectorizable = 0 };
273 if(
lhs_.columns() != 0UL ) {
274 const size_t end( ( (
lhs_.columns()-1UL ) &
size_t(-2) ) + 1UL );
276 for(
size_t k=1UL; k<end; k+=2UL ) {
278 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
280 if( end <
lhs_.columns() ) {
308 return rhs_.columns();
338 template<
typename T >
340 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
350 template<
typename T >
352 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
371 template<
typename MT
380 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
383 else if( rhs.lhs_.columns() == 0UL ) {
399 TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
401 TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
// Scalar default kernel for the assignment C = A * B.
// C is the target matrix, A the left-hand and B the right-hand operand; all
// three are accessed element-wise through operator()( row, column ).
// NOTE(review): A(i,0UL) and B(0UL,j) are read unconditionally, so this
// presumably requires A.columns() >= 1 -- the caller appears to treat the
// zero-column case separately; confirm.
420 template<
typename MT3
424 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x N is the extent of the result, K the length of the inner dimension.
426 const size_t M( A.rows() );
427 const size_t N( B.columns() );
428 const size_t K( A.columns() );
// Each C(i,j) is first overwritten with the k == 0 product; the products for
// k = 1 .. K-1 are then accumulated onto it.
430 for(
size_t i=0UL; i<M; ++i ) {
431 for(
size_t j=0UL; j<N; ++j ) {
432 C(i,j) = A(i,0UL) * B(0UL,j);
434 for(
size_t k=1UL; k<K; ++k ) {
435 for(
size_t j=0UL; j<N; ++j ) {
436 C(i,j) += A(i,k) * B(k,j);
// Vectorized default kernel for the assignment C = A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and C is a
// DenseMatrix<MT3,false>.
// The visible loops walk the columns of C in chunks of 8, 4 and 2 intrinsic
// vectors (IT::size elements each); the remaining code stores one vector per
// row. Partial sums are kept in xmm* accumulators over the whole k loop and
// are written to C with store() afterwards.
// NOTE(review): a chunk is entered as soon as its last vector starts below N,
// so that vector may reach past column N -- presumably this relies on padded
// storage; confirm.
// NOTE(review): a1/a2/b1..b4 and the counters i and j are declared on lines
// not shown here.
458 template<
typename MT3
461 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
462 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
464 typedef IntrinsicTrait<ElementType> IT;
466 const size_t M( A.rows() );
467 const size_t N( B.columns() );
468 const size_t K( A.columns() );
// Chunks of 8 vectors, one row of C per iteration.
472 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
473 for(
size_t i=0UL; i<M; ++i ) {
474 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
475 for(
size_t k=0UL; k<K; ++k ) {
477 xmm1 = xmm1 + a1 * B.load(k,j );
478 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
479 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
480 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
481 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
482 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
483 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
484 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
486 (~C).
store( i, j , xmm1 );
487 (~C).
store( i, j+IT::size , xmm2 );
488 (~C).
store( i, j+IT::size*2UL, xmm3 );
489 (~C).
store( i, j+IT::size*3UL, xmm4 );
490 (~C).
store( i, j+IT::size*4UL, xmm5 );
491 (~C).
store( i, j+IT::size*5UL, xmm6 );
492 (~C).
store( i, j+IT::size*6UL, xmm7 );
493 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Chunks of 4 vectors; rows are processed in pairs (xmm1..4 for row i,
// xmm5..8 for row i+1), followed by a single-row variant.
496 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
498 for( ; (i+2UL) <= M; i+=2UL ) {
499 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
500 for(
size_t k=0UL; k<K; ++k ) {
507 xmm1 = xmm1 + a1 * b1;
508 xmm2 = xmm2 + a1 * b2;
509 xmm3 = xmm3 + a1 * b3;
510 xmm4 = xmm4 + a1 * b4;
511 xmm5 = xmm5 + a2 * b1;
512 xmm6 = xmm6 + a2 * b2;
513 xmm7 = xmm7 + a2 * b3;
514 xmm8 = xmm8 + a2 * b4;
516 (~C).
store( i , j , xmm1 );
517 (~C).
store( i , j+IT::size , xmm2 );
518 (~C).
store( i , j+IT::size*2UL, xmm3 );
519 (~C).
store( i , j+IT::size*3UL, xmm4 );
520 (~C).
store( i+1UL, j , xmm5 );
521 (~C).
store( i+1UL, j+IT::size , xmm6 );
522 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
523 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
527 for(
size_t k=0UL; k<K; ++k ) {
529 xmm1 = xmm1 + a1 * B.load(k,j );
530 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
531 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
532 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
534 (~C).
store( i, j , xmm1 );
535 (~C).
store( i, j+IT::size , xmm2 );
536 (~C).
store( i, j+IT::size*2UL, xmm3 );
537 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Chunks of 2 vectors; again row pairs first, then a single-row variant.
540 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
542 for( ; (i+2UL) <= M; i+=2UL ) {
544 for(
size_t k=0UL; k<K; ++k ) {
549 xmm1 = xmm1 + a1 * b1;
550 xmm2 = xmm2 + a1 * b2;
551 xmm3 = xmm3 + a2 * b1;
552 xmm4 = xmm4 + a2 * b2;
554 (~C).
store( i , j , xmm1 );
555 (~C).
store( i , j+IT::size, xmm2 );
556 (~C).
store( i+1UL, j , xmm3 );
557 (~C).
store( i+1UL, j+IT::size, xmm4 );
561 for(
size_t k=0UL; k<K; ++k ) {
563 xmm1 = xmm1 + a1 * B.load(k,j );
564 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
566 (~C).
store( i, j , xmm1 );
567 (~C).
store( i, j+IT::size, xmm2 );
// One vector per row: set( A(i,k) ) is multiplied with a vector of B; rows are
// handled in pairs and then singly.
572 for( ; (i+2UL) <= M; i+=2UL ) {
574 for(
size_t k=0UL; k<K; ++k ) {
576 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
577 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
579 (~C).
store( i , j, xmm1 );
580 (~C).
store( i+1UL, j, xmm2 );
584 for(
size_t k=0UL; k<K; ++k ) {
585 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
587 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for the assignment C = A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and C is a
// DenseMatrix<MT3,true>.
// Counterpart of the DenseMatrix<MT3,false> overload with the roles of rows
// and columns exchanged: the visible loops walk the rows of C in chunks of 8,
// 4 and 2 intrinsic vectors (IT::size elements each), vectors are loaded from
// A, and the remaining code stores one vector per column. Partial sums are
// kept in xmm* accumulators over the whole k loop and written to C with
// store() afterwards.
// NOTE(review): a chunk is entered as soon as its last vector starts below M,
// so that vector may reach past row M -- presumably this relies on padded
// storage; confirm.
// NOTE(review): a1..a4/b1/b2 and the counters i and j are declared on lines
// not shown here.
608 template<
typename MT3
611 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
612 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
614 typedef IntrinsicTrait<ElementType> IT;
616 const size_t M( A.rows() );
617 const size_t N( B.columns() );
618 const size_t K( A.columns() );
// Chunks of 8 vectors, one column of C per iteration.
622 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
623 for(
size_t j=0UL; j<N; ++j ) {
624 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
625 for(
size_t k=0UL; k<K; ++k ) {
627 xmm1 = xmm1 + A.load(i ,k) * b1;
628 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
629 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
630 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
631 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
632 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
633 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
634 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
636 (~C).
store( i , j, xmm1 );
637 (~C).
store( i+IT::size , j, xmm2 );
638 (~C).
store( i+IT::size*2UL, j, xmm3 );
639 (~C).
store( i+IT::size*3UL, j, xmm4 );
640 (~C).
store( i+IT::size*4UL, j, xmm5 );
641 (~C).
store( i+IT::size*5UL, j, xmm6 );
642 (~C).
store( i+IT::size*6UL, j, xmm7 );
643 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Chunks of 4 vectors; columns are processed in pairs (xmm1..4 for column j,
// xmm5..8 for column j+1), followed by a single-column variant.
646 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
648 for( ; (j+2UL) <= N; j+=2UL ) {
649 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
650 for(
size_t k=0UL; k<K; ++k ) {
657 xmm1 = xmm1 + a1 * b1;
658 xmm2 = xmm2 + a2 * b1;
659 xmm3 = xmm3 + a3 * b1;
660 xmm4 = xmm4 + a4 * b1;
661 xmm5 = xmm5 + a1 * b2;
662 xmm6 = xmm6 + a2 * b2;
663 xmm7 = xmm7 + a3 * b2;
664 xmm8 = xmm8 + a4 * b2;
666 (~C).
store( i , j , xmm1 );
667 (~C).
store( i+IT::size , j , xmm2 );
668 (~C).
store( i+IT::size*2UL, j , xmm3 );
669 (~C).
store( i+IT::size*3UL, j , xmm4 );
670 (~C).
store( i , j+1UL, xmm5 );
671 (~C).
store( i+IT::size , j+1UL, xmm6 );
672 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
673 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
677 for(
size_t k=0UL; k<K; ++k ) {
679 xmm1 = xmm1 + A.load(i ,k) * b1;
680 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
681 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
682 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
684 (~C).
store( i , j, xmm1 );
685 (~C).
store( i+IT::size , j, xmm2 );
686 (~C).
store( i+IT::size*2UL, j, xmm3 );
687 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Chunks of 2 vectors; again column pairs first, then a single-column variant.
690 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
692 for( ; (j+2UL) <= N; j+=2UL ) {
694 for(
size_t k=0UL; k<K; ++k ) {
699 xmm1 = xmm1 + a1 * b1;
700 xmm2 = xmm2 + a2 * b1;
701 xmm3 = xmm3 + a1 * b2;
702 xmm4 = xmm4 + a2 * b2;
704 (~C).
store( i , j , xmm1 );
705 (~C).
store( i+IT::size, j , xmm2 );
706 (~C).
store( i , j+1UL, xmm3 );
707 (~C).
store( i+IT::size, j+1UL, xmm4 );
711 for(
size_t k=0UL; k<K; ++k ) {
713 xmm1 = xmm1 + A.load(i ,k) * b1;
714 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
716 (~C).
store( i , j, xmm1 );
717 (~C).
store( i+IT::size, j, xmm2 );
// One vector per column: a vector of A is multiplied with set( B(k,j) );
// columns are handled in pairs and then singly.
722 for( ; (j+2UL) <= N; j+=2UL ) {
724 for(
size_t k=0UL; k<K; ++k ) {
726 xmm1 = xmm1 + a1 *
set( B(k,j ) );
727 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
729 (~C).
store( i, j , xmm1 );
730 (~C).
store( i, j+1UL, xmm2 );
734 for(
size_t k=0UL; k<K; ++k ) {
735 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
737 (~C).
store( i, j, xmm1 );
// Fallback overload of the BLAS kernel selection for C = A * B: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to
// selectDefaultAssignKernel with the same arguments.
758 template<
typename MT3
761 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
762 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
764 selectDefaultAssignKernel( C, A, B );
// BLAS kernel for C = A * B, enabled when UseSinglePrecisionKernel<MT3,MT4,MT5>
// holds. Forwards the product to cblas_sgemm with alpha = 1.0F and
// beta = 0.0F, i.e. the previous contents of C do not enter the result.
784 template<
typename MT3
787 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
788 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
790 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
796 const int M ( numeric_cast<int>( A.rows() ) );
797 const int N ( numeric_cast<int>( B.columns() ) );
798 const int K ( numeric_cast<int>( A.columns() ) );
799 const int lda( numeric_cast<int>( A.spacing() ) );
800 const int ldb( numeric_cast<int>( B.spacing() ) );
801 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
803 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
804 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
805 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
806 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when UseDoublePrecisionKernel<MT3,MT4,MT5>
// holds. Forwards the product to cblas_dgemm with alpha = 1.0 and beta = 0.0,
// i.e. the previous contents of C do not enter the result.
827 template<
typename MT3
830 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
831 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
833 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
839 const int M ( numeric_cast<int>( A.rows() ) );
840 const int N ( numeric_cast<int>( B.columns() ) );
841 const int K ( numeric_cast<int>( A.columns() ) );
842 const int lda( numeric_cast<int>( A.spacing() ) );
843 const int ldb( numeric_cast<int>( B.spacing() ) );
844 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
846 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
847 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
848 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
849 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds. Forwards the product to
// cblas_cgemm with alpha = (1,0) and beta = (0,0), i.e. the previous contents
// of C do not enter the result.
870 template<
typename MT3
873 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
874 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
876 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
885 const int M ( numeric_cast<int>( A.rows() ) );
886 const int N ( numeric_cast<int>( B.columns() ) );
887 const int K ( numeric_cast<int>( A.columns() ) );
888 const int lda( numeric_cast<int>( A.spacing() ) );
889 const int ldb( numeric_cast<int>( B.spacing() ) );
890 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
891 const complex<float> alpha( 1.0F, 0.0F );
892 const complex<float> beta ( 0.0F, 0.0F );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
894 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
895 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
896 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
897 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C = A * B, enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds. Forwards the product to
// cblas_zgemm with alpha = (1,0) and beta = (0,0), i.e. the previous contents
// of C do not enter the result.
918 template<
typename MT3
921 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
922 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
924 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
933 const int M ( numeric_cast<int>( A.rows() ) );
934 const int N ( numeric_cast<int>( B.columns() ) );
935 const int K ( numeric_cast<int>( A.columns() ) );
936 const int lda( numeric_cast<int>( A.spacing() ) );
937 const int ldb( numeric_cast<int>( B.spacing() ) );
938 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
939 const complex<double> alpha( 1.0, 0.0 );
940 const complex<double> beta ( 0.0, 0.0 );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
942 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
943 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
944 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
945 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
963 template<
typename MT
969 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
981 const TmpType tmp( rhs );
1000 template<
typename MT
1009 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1024 TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
1026 TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
// Scalar default kernel for the addition assignment C += A * B, used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold (DisableIf).
// C is the target matrix, A the left-hand and B the right-hand operand; all
// three are accessed element-wise through operator()( row, column ).
1045 template<
typename MT3
1048 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1049 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1051 const size_t M( A.rows() );
1052 const size_t N( B.columns() );
1053 const size_t K( A.columns() );
// N rounded down to an even number: the j loop below is unrolled by two.
1056 const size_t end( N &
size_t(-2) );
1058 for(
size_t i=0UL; i<M; ++i ) {
1059 for(
size_t k=0UL; k<K; ++k ) {
1060 for(
size_t j=0UL; j<end; j+=2UL ) {
1061 C(i,j ) += A(i,k) * B(k,j );
1062 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of column `end`, the one left over by the unrolled loop when N is odd.
// NOTE(review): the condition guarding this statement is not visible here.
1065 C(i,end) += A(i,k) * B(k,end);
// Vectorized default kernel for the addition assignment C += A * B, enabled
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and C is a
// DenseMatrix<MT3,false>.
// The visible loops walk the columns of C in chunks of 8, 4 and 2 intrinsic
// vectors (IT::size elements each); the remaining code stores one vector per
// row. Products are added onto xmm* accumulators over the whole k loop and the
// accumulators are written to C with store() afterwards.
// NOTE(review): the initialisation of the xmm* accumulators is on lines not
// shown here -- presumably they are loaded from C; confirm.
// NOTE(review): a chunk is entered as soon as its last vector starts below N,
// so that vector may reach past column N -- presumably this relies on padded
// storage; confirm.
1087 template<
typename MT3
1090 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1091 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1093 typedef IntrinsicTrait<ElementType> IT;
1095 const size_t M( A.rows() );
1096 const size_t N( B.columns() );
1097 const size_t K( A.columns() );
// Chunks of 8 vectors, one row of C per iteration.
1101 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1102 for(
size_t i=0UL; i<M; ++i ) {
1111 for(
size_t k=0UL; k<K; ++k ) {
1113 xmm1 = xmm1 + a1 * B.load(k,j );
1114 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1115 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1116 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1117 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1118 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1119 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1120 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1122 (~C).
store( i, j , xmm1 );
1123 (~C).
store( i, j+IT::size , xmm2 );
1124 (~C).
store( i, j+IT::size*2UL, xmm3 );
1125 (~C).
store( i, j+IT::size*3UL, xmm4 );
1126 (~C).
store( i, j+IT::size*4UL, xmm5 );
1127 (~C).
store( i, j+IT::size*5UL, xmm6 );
1128 (~C).
store( i, j+IT::size*6UL, xmm7 );
1129 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Chunks of 4 vectors; rows are processed in pairs (xmm1..4 for row i,
// xmm5..8 for row i+1), followed by a single-row variant.
1132 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1134 for( ; (i+2UL) <= M; i+=2UL ) {
1143 for(
size_t k=0UL; k<K; ++k ) {
1150 xmm1 = xmm1 + a1 * b1;
1151 xmm2 = xmm2 + a1 * b2;
1152 xmm3 = xmm3 + a1 * b3;
1153 xmm4 = xmm4 + a1 * b4;
1154 xmm5 = xmm5 + a2 * b1;
1155 xmm6 = xmm6 + a2 * b2;
1156 xmm7 = xmm7 + a2 * b3;
1157 xmm8 = xmm8 + a2 * b4;
1159 (~C).
store( i , j , xmm1 );
1160 (~C).
store( i , j+IT::size , xmm2 );
1161 (~C).
store( i , j+IT::size*2UL, xmm3 );
1162 (~C).
store( i , j+IT::size*3UL, xmm4 );
1163 (~C).
store( i+1UL, j , xmm5 );
1164 (~C).
store( i+1UL, j+IT::size , xmm6 );
1165 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1166 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
1173 for(
size_t k=0UL; k<K; ++k ) {
1175 xmm1 = xmm1 + a1 * B.load(k,j );
1176 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1177 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1178 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1180 (~C).
store( i, j , xmm1 );
1181 (~C).
store( i, j+IT::size , xmm2 );
1182 (~C).
store( i, j+IT::size*2UL, xmm3 );
1183 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Chunks of 2 vectors; again row pairs first, then a single-row variant.
1186 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1188 for( ; (i+2UL) <= M; i+=2UL ) {
1193 for(
size_t k=0UL; k<K; ++k ) {
1198 xmm1 = xmm1 + a1 * b1;
1199 xmm2 = xmm2 + a1 * b2;
1200 xmm3 = xmm3 + a2 * b1;
1201 xmm4 = xmm4 + a2 * b2;
1203 (~C).
store( i , j , xmm1 );
1204 (~C).
store( i , j+IT::size, xmm2 );
1205 (~C).
store( i+1UL, j , xmm3 );
1206 (~C).
store( i+1UL, j+IT::size, xmm4 );
1211 for(
size_t k=0UL; k<K; ++k ) {
1213 xmm1 = xmm1 + a1 * B.load(k,j );
1214 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1216 (~C).
store( i, j , xmm1 );
1217 (~C).
store( i, j+IT::size, xmm2 );
// One vector per row: set( A(i,k) ) is multiplied with a vector of B; rows are
// handled in pairs and then singly.
1222 for( ; (i+2UL) <= M; i+=2UL ) {
1225 for(
size_t k=0UL; k<K; ++k ) {
1227 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1228 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1230 (~C).
store( i , j, xmm1 );
1231 (~C).
store( i+1UL, j, xmm2 );
1235 for(
size_t k=0UL; k<K; ++k ) {
1236 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1238 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for the addition assignment C += A * B, enabled
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and C is a
// DenseMatrix<MT3,true>.
// Counterpart of the DenseMatrix<MT3,false> overload with the roles of rows
// and columns exchanged: the visible loops walk the rows of C in chunks of 8,
// 4 and 2 intrinsic vectors (IT::size elements each), vectors are loaded from
// A, and the remaining code stores one vector per column.
// NOTE(review): the initialisation of the xmm* accumulators is on lines not
// shown here -- presumably they are loaded from C; confirm.
// NOTE(review): a chunk is entered as soon as its last vector starts below M,
// so that vector may reach past row M -- presumably this relies on padded
// storage; confirm.
1259 template<
typename MT3
1262 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1263 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1265 typedef IntrinsicTrait<ElementType> IT;
1267 const size_t M( A.rows() );
1268 const size_t N( B.columns() );
1269 const size_t K( A.columns() );
// Chunks of 8 vectors, one column of C per iteration.
1273 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1274 for(
size_t j=0UL; j<N; ++j ) {
1283 for(
size_t k=0UL; k<K; ++k ) {
1285 xmm1 = xmm1 + A.load(i ,k) * b1;
1286 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1287 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1288 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1289 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1290 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1291 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1292 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1294 (~C).
store( i , j, xmm1 );
1295 (~C).
store( i+IT::size , j, xmm2 );
1296 (~C).
store( i+IT::size*2UL, j, xmm3 );
1297 (~C).
store( i+IT::size*3UL, j, xmm4 );
1298 (~C).
store( i+IT::size*4UL, j, xmm5 );
1299 (~C).
store( i+IT::size*5UL, j, xmm6 );
1300 (~C).
store( i+IT::size*6UL, j, xmm7 );
1301 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Chunks of 4 vectors; columns are processed in pairs (xmm1..4 for column j,
// xmm5..8 for column j+1), followed by a single-column variant.
1304 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1306 for( ; (j+2UL) <= N; j+=2UL ) {
1315 for(
size_t k=0UL; k<K; ++k ) {
1322 xmm1 = xmm1 + a1 * b1;
1323 xmm2 = xmm2 + a2 * b1;
1324 xmm3 = xmm3 + a3 * b1;
1325 xmm4 = xmm4 + a4 * b1;
1326 xmm5 = xmm5 + a1 * b2;
1327 xmm6 = xmm6 + a2 * b2;
1328 xmm7 = xmm7 + a3 * b2;
1329 xmm8 = xmm8 + a4 * b2;
1331 (~C).
store( i , j , xmm1 );
1332 (~C).
store( i+IT::size , j , xmm2 );
1333 (~C).
store( i+IT::size*2UL, j , xmm3 );
1334 (~C).
store( i+IT::size*3UL, j , xmm4 );
1335 (~C).
store( i , j+1UL, xmm5 );
1336 (~C).
store( i+IT::size , j+1UL, xmm6 );
1337 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
1338 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
1345 for(
size_t k=0UL; k<K; ++k ) {
1347 xmm1 = xmm1 + A.load(i ,k) * b1;
1348 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1349 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1350 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1352 (~C).
store( i , j, xmm1 );
1353 (~C).
store( i+IT::size , j, xmm2 );
1354 (~C).
store( i+IT::size*2UL, j, xmm3 );
1355 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Chunks of 2 vectors; again column pairs first, then a single-column variant.
1358 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1360 for( ; (j+2UL) <= N; j+=2UL ) {
1365 for(
size_t k=0UL; k<K; ++k ) {
1370 xmm1 = xmm1 + a1 * b1;
1371 xmm2 = xmm2 + a2 * b1;
1372 xmm3 = xmm3 + a1 * b2;
1373 xmm4 = xmm4 + a2 * b2;
1375 (~C).
store( i , j , xmm1 );
1376 (~C).
store( i+IT::size, j , xmm2 );
1377 (~C).
store( i , j+1UL, xmm3 );
1378 (~C).
store( i+IT::size, j+1UL, xmm4 );
1383 for(
size_t k=0UL; k<K; ++k ) {
1385 xmm1 = xmm1 + A.load(i ,k) * b1;
1386 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1388 (~C).
store( i , j, xmm1 );
1389 (~C).
store( i+IT::size, j, xmm2 );
// One vector per column: a vector of A is multiplied with set( B(k,j) );
// columns are handled in pairs and then singly.
1394 for( ; (j+2UL) <= N; j+=2UL ) {
1397 for(
size_t k=0UL; k<K; ++k ) {
1399 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1400 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1402 (~C).
store( i, j , xmm1 );
1403 (~C).
store( i, j+1UL, xmm2 );
1407 for(
size_t k=0UL; k<K; ++k ) {
1408 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1410 (~C).
store( i, j, xmm1 );
// Fallback overload of the BLAS kernel selection for C += A * B: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to
// selectDefaultAddAssignKernel with the same arguments.
1431 template<
typename MT3
1434 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1435 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1437 selectDefaultAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B, enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5> holds. Forwards to cblas_sgemm with
// alpha = 1.0F and beta = 1.0F, so the product is added onto the existing
// contents of C.
1457 template<
typename MT3
1460 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1461 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1463 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
1469 const int M ( numeric_cast<int>( A.rows() ) );
1470 const int N ( numeric_cast<int>( B.columns() ) );
1471 const int K ( numeric_cast<int>( A.columns() ) );
1472 const int lda( numeric_cast<int>( A.spacing() ) );
1473 const int ldb( numeric_cast<int>( B.spacing() ) );
1474 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
1476 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1477 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1478 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1479 M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5> holds. Forwards to cblas_dgemm with
// alpha = 1.0 and beta = 1.0, so the product is added onto the existing
// contents of C.
1500 template<
typename MT3
1503 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1504 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1506 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
1512 const int M ( numeric_cast<int>( A.rows() ) );
1513 const int N ( numeric_cast<int>( B.columns() ) );
1514 const int K ( numeric_cast<int>( A.columns() ) );
1515 const int lda( numeric_cast<int>( A.spacing() ) );
1516 const int ldb( numeric_cast<int>( B.spacing() ) );
1517 const int ldc( numeric_cast<int>( C.spacing() ) );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
1519 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1520 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1521 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1522 M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds. Forwards to cblas_cgemm
// with alpha = (1,0) and beta = (1,0), so the product is added onto the
// existing contents of C.
1543 template<
typename MT3
1546 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1547 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1549 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
1558 const int M ( numeric_cast<int>( A.rows() ) );
1559 const int N ( numeric_cast<int>( B.columns() ) );
1560 const int K ( numeric_cast<int>( A.columns() ) );
1561 const int lda( numeric_cast<int>( A.spacing() ) );
1562 const int ldb( numeric_cast<int>( B.spacing() ) );
1563 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1564 const complex<float> alpha( 1.0F, 0.0F );
1565 const complex<float> beta ( 1.0F, 0.0F );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
1567 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1568 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1569 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1570 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS kernel for C += A * B, enabled when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds. Forwards to cblas_zgemm
// with alpha = (1,0) and beta = (1,0), so the product is added onto the
// existing contents of C.
1591 template<
typename MT3
1594 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1595 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1597 using boost::numeric_cast;
// CBLAS takes int extents and leading dimensions; the size_t values are
// converted with boost::numeric_cast (a range-checked conversion). The leading
// dimensions are taken from spacing().
1606 const int M ( numeric_cast<int>( A.rows() ) );
1607 const int N ( numeric_cast<int>( B.columns() ) );
1608 const int K ( numeric_cast<int>( A.columns() ) );
1609 const int lda( numeric_cast<int>( A.spacing() ) );
1610 const int ldb( numeric_cast<int>( B.spacing() ) );
1611 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1612 const complex<double> alpha( 1.0, 0.0 );
1613 const complex<double> beta ( 1.0, 0.0 );
// The storage order of C selects the CBLAS layout: for a row-major C, A is
// flagged as transposed; otherwise B is.
1615 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1616 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1617 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1618 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1641 template<
typename MT
1650 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1665 TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1667 TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
// Scalar default kernel for the subtraction assignment C -= A * B, used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold (DisableIf).
// C is the target matrix, A the left-hand and B the right-hand operand; all
// three are accessed element-wise through operator()( row, column ).
1686 template<
typename MT3
1689 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1690 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1692 const size_t M( A.rows() );
1693 const size_t N( B.columns() );
1694 const size_t K( A.columns() );
// N rounded down to an even number: the j loop below is unrolled by two.
1697 const size_t end( N &
size_t(-2) );
1699 for(
size_t i=0UL; i<M; ++i ) {
1700 for(
size_t k=0UL; k<K; ++k ) {
1701 for(
size_t j=0UL; j<end; j+=2UL ) {
1702 C(i,j ) -= A(i,k) * B(k,j );
1703 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update of column `end`, the one left over by the unrolled loop when N is odd.
// NOTE(review): the condition guarding this statement is not visible here.
1706 C(i,end) -= A(i,k) * B(k,end);
// Vectorized default kernel for the subtraction assignment C -= A * B, enabled
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and C is a
// DenseMatrix<MT3,false>.
// The visible loops walk the columns of C in chunks of 8, 4 and 2 intrinsic
// vectors (IT::size elements each); the remaining code stores one vector per
// row. Products are subtracted from xmm* accumulators over the whole k loop
// and the accumulators are written to C with store() afterwards.
// NOTE(review): the initialisation of the xmm* accumulators is on lines not
// shown here -- presumably they are loaded from C; confirm.
// NOTE(review): a chunk is entered as soon as its last vector starts below N,
// so that vector may reach past column N -- presumably this relies on padded
// storage; confirm.
1728 template<
typename MT3
1731 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1732 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1734 typedef IntrinsicTrait<ElementType> IT;
1736 const size_t M( A.rows() );
1737 const size_t N( B.columns() );
1738 const size_t K( A.columns() );
// Chunks of 8 vectors, one row of C per iteration.
1742 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1743 for(
size_t i=0UL; i<M; ++i ) {
1752 for(
size_t k=0UL; k<K; ++k ) {
1754 xmm1 = xmm1 - a1 * B.load(k,j );
1755 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1756 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1757 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1758 xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1759 xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1760 xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1761 xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1763 (~C).
store( i, j , xmm1 );
1764 (~C).
store( i, j+IT::size , xmm2 );
1765 (~C).
store( i, j+IT::size*2UL, xmm3 );
1766 (~C).
store( i, j+IT::size*3UL, xmm4 );
1767 (~C).
store( i, j+IT::size*4UL, xmm5 );
1768 (~C).
store( i, j+IT::size*5UL, xmm6 );
1769 (~C).
store( i, j+IT::size*6UL, xmm7 );
1770 (~C).
store( i, j+IT::size*7UL, xmm8 );
// Chunks of 4 vectors; rows are processed in pairs (xmm1..4 for row i,
// xmm5..8 for row i+1), followed by a single-row variant.
1773 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1775 for( ; (i+2UL) <= M; i+=2UL ) {
1784 for(
size_t k=0UL; k<K; ++k ) {
1791 xmm1 = xmm1 - a1 * b1;
1792 xmm2 = xmm2 - a1 * b2;
1793 xmm3 = xmm3 - a1 * b3;
1794 xmm4 = xmm4 - a1 * b4;
1795 xmm5 = xmm5 - a2 * b1;
1796 xmm6 = xmm6 - a2 * b2;
1797 xmm7 = xmm7 - a2 * b3;
1798 xmm8 = xmm8 - a2 * b4;
1800 (~C).
store( i , j , xmm1 );
1801 (~C).
store( i , j+IT::size , xmm2 );
1802 (~C).
store( i , j+IT::size*2UL, xmm3 );
1803 (~C).
store( i , j+IT::size*3UL, xmm4 );
1804 (~C).
store( i+1UL, j , xmm5 );
1805 (~C).
store( i+1UL, j+IT::size , xmm6 );
1806 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 );
1807 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 );
1814 for(
size_t k=0UL; k<K; ++k ) {
1816 xmm1 = xmm1 - a1 * B.load(k,j );
1817 xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1818 xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1819 xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1821 (~C).
store( i, j , xmm1 );
1822 (~C).
store( i, j+IT::size , xmm2 );
1823 (~C).
store( i, j+IT::size*2UL, xmm3 );
1824 (~C).
store( i, j+IT::size*3UL, xmm4 );
// Chunks of 2 vectors; again row pairs first, then a single-row variant.
1827 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1829 for( ; (i+2UL) <= M; i+=2UL ) {
1834 for(
size_t k=0UL; k<K; ++k ) {
1839 xmm1 = xmm1 - a1 * b1;
1840 xmm2 = xmm2 - a1 * b2;
1841 xmm3 = xmm3 - a2 * b1;
1842 xmm4 = xmm4 - a2 * b2;
1844 (~C).
store( i , j , xmm1 );
1845 (~C).
store( i , j+IT::size, xmm2 );
1846 (~C).
store( i+1UL, j , xmm3 );
1847 (~C).
store( i+1UL, j+IT::size, xmm4 );
1852 for(
size_t k=0UL; k<K; ++k ) {
1854 xmm1 = xmm1 - a1 * B.load(k,j );
1855 xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
1857 (~C).
store( i, j , xmm1 );
1858 (~C).
store( i, j+IT::size, xmm2 );
// One vector per row: set( A(i,k) ) is multiplied with a vector of B; rows are
// handled in pairs and then singly.
1863 for( ; (i+2UL) <= M; i+=2UL ) {
1866 for(
size_t k=0UL; k<K; ++k ) {
1868 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
1869 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
1871 (~C).
store( i , j, xmm1 );
1872 (~C).
store( i+1UL, j, xmm2 );
1876 for(
size_t k=0UL; k<K; ++k ) {
1877 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
1879 (~C).
store( i, j, xmm1 );
// Vectorized default kernel for the subtraction assignment C -= A * B, enabled
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and C is a
// DenseMatrix<MT3,true>.
// Counterpart of the DenseMatrix<MT3,false> overload with the roles of rows
// and columns exchanged: the visible loops walk the rows of C in chunks of 8,
// 4 and 2 intrinsic vectors (IT::size elements each), vectors are loaded from
// A, and the remaining code stores one vector per column.
// NOTE(review): the initialisation of the xmm* accumulators is on lines not
// shown here -- presumably they are loaded from C; confirm.
// NOTE(review): a chunk is entered as soon as its last vector starts below M,
// so that vector may reach past row M -- presumably this relies on padded
// storage; confirm.
1900 template<
typename MT3
1903 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1904 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1906 typedef IntrinsicTrait<ElementType> IT;
1908 const size_t M( A.rows() );
1909 const size_t N( B.columns() );
1910 const size_t K( A.columns() );
// Chunks of 8 vectors, one column of C per iteration.
1914 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1915 for(
size_t j=0UL; j<N; ++j ) {
1924 for(
size_t k=0UL; k<K; ++k ) {
1926 xmm1 = xmm1 - A.load(i ,k) * b1;
1927 xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
1928 xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
1929 xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
1930 xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
1931 xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
1932 xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
1933 xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
1935 (~C).
store( i , j, xmm1 );
1936 (~C).
store( i+IT::size , j, xmm2 );
1937 (~C).
store( i+IT::size*2UL, j, xmm3 );
1938 (~C).
store( i+IT::size*3UL, j, xmm4 );
1939 (~C).
store( i+IT::size*4UL, j, xmm5 );
1940 (~C).
store( i+IT::size*5UL, j, xmm6 );
1941 (~C).
store( i+IT::size*6UL, j, xmm7 );
1942 (~C).
store( i+IT::size*7UL, j, xmm8 );
// Chunks of 4 vectors; columns are processed in pairs (xmm1..4 for column j,
// xmm5..8 for column j+1), followed by a single-column variant.
1945 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1947 for( ; (j+2UL) <= N; j+=2UL ) {
1956 for(
size_t k=0UL; k<K; ++k ) {
1963 xmm1 = xmm1 - a1 * b1;
1964 xmm2 = xmm2 - a2 * b1;
1965 xmm3 = xmm3 - a3 * b1;
1966 xmm4 = xmm4 - a4 * b1;
1967 xmm5 = xmm5 - a1 * b2;
1968 xmm6 = xmm6 - a2 * b2;
1969 xmm7 = xmm7 - a3 * b2;
1970 xmm8 = xmm8 - a4 * b2;
1972 (~C).
store( i , j , xmm1 );
1973 (~C).
store( i+IT::size , j , xmm2 );
1974 (~C).
store( i+IT::size*2UL, j , xmm3 );
1975 (~C).
store( i+IT::size*3UL, j , xmm4 );
1976 (~C).
store( i , j+1UL, xmm5 );
1977 (~C).
store( i+IT::size , j+1UL, xmm6 );
1978 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 );
1979 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 );
1986 for(
size_t k=0UL; k<K; ++k ) {
1988 xmm1 = xmm1 - A.load(i ,k) * b1;
1989 xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
1990 xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
1991 xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
1993 (~C).
store( i , j, xmm1 );
1994 (~C).
store( i+IT::size , j, xmm2 );
1995 (~C).
store( i+IT::size*2UL, j, xmm3 );
1996 (~C).
store( i+IT::size*3UL, j, xmm4 );
// Chunks of 2 vectors; again column pairs first, then a single-column variant.
1999 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2001 for( ; (j+2UL) <= N; j+=2UL ) {
2006 for(
size_t k=0UL; k<K; ++k ) {
2011 xmm1 = xmm1 - a1 * b1;
2012 xmm2 = xmm2 - a2 * b1;
2013 xmm3 = xmm3 - a1 * b2;
2014 xmm4 = xmm4 - a2 * b2;
2016 (~C).
store( i , j , xmm1 );
2017 (~C).
store( i+IT::size, j , xmm2 );
2018 (~C).
store( i , j+1UL, xmm3 );
2019 (~C).
store( i+IT::size, j+1UL, xmm4 );
2024 for(
size_t k=0UL; k<K; ++k ) {
2026 xmm1 = xmm1 - A.load(i ,k) * b1;
2027 xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
2029 (~C).
store( i , j, xmm1 );
2030 (~C).
store( i+IT::size, j, xmm2 );
// One vector per column: a vector of A is multiplied with set( B(k,j) );
// columns are handled in pairs and then singly.
2035 for( ; (j+2UL) <= N; j+=2UL ) {
2038 for(
size_t k=0UL; k<K; ++k ) {
2040 xmm1 = xmm1 - a1 *
set( B(k,j ) );
2041 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
2043 (~C).
store( i, j , xmm1 );
2044 (~C).
store( i, j+1UL, xmm2 );
2048 for(
size_t k=0UL; k<K; ++k ) {
2049 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
2051 (~C).
store( i, j, xmm1 );
// Fallback overload of the BLAS kernel selection for C -= A * B: enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to
// selectDefaultSubAssignKernel with the same arguments.
2072 template<
typename MT3
2075 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2076 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2078 selectDefaultSubAssignKernel( C, A, B );
// BLAS-based subtraction assignment (C -= A*B) via cblas_sgemm.
// Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5> is true.
2098 template<
typename MT3
2101 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2102 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
2104 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
2110 const int M ( numeric_cast<int>( A.rows() ) );
2111 const int N ( numeric_cast<int>( B.columns() ) );
2112 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
2113 const int lda( numeric_cast<int>( A.spacing() ) );
2114 const int ldb( numeric_cast<int>( B.spacing() ) );
2115 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1, beta = 1: C = C - op(A)*op(B).
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
// NOTE(review): this presumably reflects A being column-major and B row-major -- the
// operand storage orders are not visible in this excerpt; confirm.
2117 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2118 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2119 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2120 M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-based subtraction assignment (C -= A*B) via cblas_dgemm.
// Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5> is true.
2141 template<
typename MT3
2144 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2145 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
2147 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
2153 const int M ( numeric_cast<int>( A.rows() ) );
2154 const int N ( numeric_cast<int>( B.columns() ) );
2155 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
2156 const int lda( numeric_cast<int>( A.spacing() ) );
2157 const int ldb( numeric_cast<int>( B.spacing() ) );
2158 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1, beta = 1: C = C - op(A)*op(B).
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
2160 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2161 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2162 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2163 M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-based subtraction assignment (C -= A*B) via cblas_cgemm.
// Enabled only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. all three
// element types are complex<float>.
2184 template<
typename MT3
2187 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2188 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
2190 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
2199 const int M ( numeric_cast<int>( A.rows() ) );
2200 const int N ( numeric_cast<int>( B.columns() ) );
2201 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
2202 const int lda( numeric_cast<int>( A.spacing() ) );
2203 const int ldb( numeric_cast<int>( B.spacing() ) );
2204 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = -1 and beta = 1 give C = C - op(A)*op(B).
2205 const complex<float> alpha( -1.0F, 0.0F );
2206 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
2208 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2209 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2210 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2211 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-based subtraction assignment (C -= A*B) via cblas_zgemm.
// Enabled only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. all three
// element types are complex<double>.
2232 template<
typename MT3
2235 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2236 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
2238 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
2247 const int M ( numeric_cast<int>( A.rows() ) );
2248 const int N ( numeric_cast<int>( B.columns() ) );
2249 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
2250 const int lda( numeric_cast<int>( A.spacing() ) );
2251 const int ldb( numeric_cast<int>( B.spacing() ) );
2252 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address.
// alpha = -1 and beta = 1 give C = C - op(A)*op(B).
2253 const complex<double> alpha( -1.0, 0.0 );
2254 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
2256 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2257 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2258 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2259 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2305 template<
typename MT1
2309 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2310 ,
private MatScalarMultExpr
2311 ,
private Computation
2315 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// Compile-time switch for the single precision BLAS kernel of the scaled product.
// 'value' is true when the element types of T1, T2 and T3 are all float and the
// scalar type T4 is not a complex type.
2328 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2329 struct UseSinglePrecisionKernel {
2330 enum { value = IsFloat<typename T1::ElementType>::value &&
2331 IsFloat<typename T2::ElementType>::value &&
2332 IsFloat<typename T3::ElementType>::value &&
2333 !IsComplex<T4>::value };
// Compile-time switch for the double precision BLAS kernel of the scaled product.
// 'value' is true when the element types of T1, T2 and T3 are all double and the
// scalar type T4 is not a complex type.
2342 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2343 struct UseDoublePrecisionKernel {
2344 enum { value = IsDouble<typename T1::ElementType>::value &&
2345 IsDouble<typename T2::ElementType>::value &&
2346 IsDouble<typename T3::ElementType>::value &&
2347 !IsComplex<T4>::value };
// Compile-time switch for the single precision complex BLAS kernel.
// 'value' is true when the element types of T1, T2 and T3 are all exactly complex<float>.
// Unlike the real-valued switches of this class, the scalar type is not part of the test.
2356 template<
typename T1,
typename T2,
typename T3 >
2357 struct UseSinglePrecisionComplexKernel {
2358 typedef complex<float> Type;
2359 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2360 IsSame<typename T2::ElementType,Type>::value &&
2361 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the double precision complex BLAS kernel.
// 'value' is true when the element types of T1, T2 and T3 are all exactly complex<double>.
// Unlike the real-valued switches of this class, the scalar type is not part of the test.
2370 template<
typename T1,
typename T2,
typename T3 >
2371 struct UseDoublePrecisionComplexKernel {
2372 typedef complex<double> Type;
2373 enum { value = IsSame<typename T1::ElementType,Type>::value &&
2374 IsSame<typename T2::ElementType,Type>::value &&
2375 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS (default) kernels.
// 'value' is true when BLAZE_BLAS_MODE is zero, or when none of the four BLAS kernel
// switches (float, double, complex<float>, complex<double>) is true for the given types.
2383 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2384 struct UseDefaultKernel {
2385 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2386 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2387 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2388 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch between the vectorized and the scalar default kernels.
// 'value' is true when all three matrix types report 'vectorizable', their element types
// are identical to each other and to the scalar type T4, and IntrinsicTrait reports both
// 'addition' and 'multiplication' for that element type.
2396 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2397 struct UseVectorizedDefaultKernel {
2398 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2399 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2400 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2401 IsSame<typename T1::ElementType,T4>::value &&
2402 IntrinsicTrait<typename T1::ElementType>::addition &&
2403 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member type definitions of the scaled matrix product expression.
// NOTE(review): RES, ElementType, RT1/RT2 and CT1/CT2 are declared on lines that are not
// visible in this excerpt.

// Type of this expression object.
2409 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type: MultTrait applied to RES and the scalar type ST.
2410 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType through IntrinsicTrait.
2414 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left-hand operand of the scaling is the wrapped matrix product expression.
2419 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left operand of the product inside the kernels: selected between
// 'const RT1' and 'CT1' depending on IsComputation<MT1>.
// NOTE(review): presumably 'const RT1' (an evaluated temporary) is taken when MT1 is
// itself a computation -- confirm against SelectType.
2425 typedef typename SelectType< IsComputation<MT1>::value,
const RT1,
CT1 >::Type
LT;
// Same selection for the right operand of the product, based on IsComputation<MT2>.
2428 typedef typename SelectType< IsComputation<MT2>::value,
const RT2,
CT2 >::Type
RT;
// The expression itself is flagged as not vectorizable.
2433 enum { vectorizable = 0 };
// Constructor: takes the matrix product expression and the scalar factor.
// NOTE(review): the initializer list is not visible in this excerpt; presumably it
// stores the arguments in matrix_ and scalar_.
2442 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access (the enclosing function header is not visible in this excerpt):
// element (i,j) of the wrapped product multiplied by the scalar.
2458 return matrix_(i,j) * scalar_;
// Row count accessor (body not visible in this excerpt).
2467 inline size_t rows()
const {
// Column count accessor (body not visible in this excerpt).
2477 inline size_t columns()
const {
// Aliasing query: forwards the given address to canAlias() of the wrapped matrix product
// expression and returns its answer unchanged. The scalar is not part of the check.
2508 template<
typename T >
2509 inline bool canAlias(
const T* alias )
const {
2510 return matrix_.canAlias( alias );
// Aliasing query: forwards the given address to isAliased() of the wrapped matrix product
// expression and returns its answer unchanged. The scalar is not part of the check.
2520 template<
typename T >
2521 inline bool isAliased(
const T* alias )
const {
2522 return matrix_.isAliased( alias );
// Assignment of the scaled matrix product to a dense matrix (lhs = scalar * (A*B)).
2541 template<
typename MT3
2543 friend inline void assign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
2550 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2551 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns (branch body not visible here).
2553 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero (branch body not visible here).
2556 else if( left.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel selection, passing the scalar.
// NOTE(review): the definitions of A and B and the condition choosing between the two
// calls are on lines not visible in this excerpt.
2572 DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2574 DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Scalar (non-vectorized) default kernel for the assignment of a scaled matrix product.
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
2592 template<
typename MT3
2596 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2597 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Row by row over the result.
2599 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Initialize row i of C with the contribution of the first inner index (j = 0), so C
// does not need to be cleared beforehand.
2600 for(
size_t k=0UL; k<B.columns(); ++k ) {
2601 C(i,k) = A(i,0UL) * B(0UL,k);
// Accumulate the contributions of the remaining inner indices.
2603 for(
size_t j=1UL; j<A.columns(); ++j ) {
2604 for(
size_t k=0UL; k<B.columns(); ++k ) {
2605 C(i,k) += A(i,j) * B(j,k);
// Final pass over row i.
// NOTE(review): the loop body is not visible in this excerpt; presumably it applies
// 'scalar' to each element of the row -- confirm.
2608 for(
size_t k=0UL; k<B.columns(); ++k ) {
// Vectorized default kernel for the assignment of a scaled matrix product to a row-major
// dense matrix (DenseMatrix<MT3,false>).
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
//
// The columns of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector
// (IT::size elements each). For every block the products are accumulated over k in
// intrinsic accumulators (xmm1..xmm8) and multiplied by 'factor' when stored into C.
//
// NOTE(review): the declarations of 'factor', 'a1', 'a2', 'b1'..'b4' and the loop
// counters are on lines not visible in this excerpt; 'factor' presumably holds 'scalar'
// broadcast to an intrinsic vector.
// NOTE(review): the accumulators are declared without an initializer, so the summation
// relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): the block conditions have the form (j + IT::size*(n-1)) < N, so the last
// vector of a block may extend past column N; presumably the rows are padded accordingly
// -- confirm.
2629 template<
typename MT3
2633 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2634 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2636 typedef IntrinsicTrait<ElementType> IT;
// C is M x N, the inner dimension is K.
2638 const size_t M( A.rows() );
2639 const size_t N( B.columns() );
2640 const size_t K( A.columns() );
// Blocks of 8 vectors: one row of C at a time, 8 accumulators.
2646 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2647 for(
size_t i=0UL; i<M; ++i ) {
2648 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2649 for(
size_t k=0UL; k<K; ++k ) {
2651 xmm1 = xmm1 + a1 * B.load(k,j );
2652 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2653 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2654 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2655 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2656 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2657 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2658 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2660 (~C).
store( i, j , xmm1 * factor );
2661 (~C).
store( i, j+IT::size , xmm2 * factor );
2662 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
2663 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
2664 (~C).
store( i, j+IT::size*4UL, xmm5 * factor );
2665 (~C).
store( i, j+IT::size*5UL, xmm6 * factor );
2666 (~C).
store( i, j+IT::size*6UL, xmm7 * factor );
2667 (~C).
store( i, j+IT::size*7UL, xmm8 * factor );
// Blocks of 4 vectors: two rows of C at a time (2 x 4 accumulators).
2670 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2672 for( ; (i+2UL) <= M; i+=2UL ) {
2673 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2674 for(
size_t k=0UL; k<K; ++k ) {
2681 xmm1 = xmm1 + a1 * b1;
2682 xmm2 = xmm2 + a1 * b2;
2683 xmm3 = xmm3 + a1 * b3;
2684 xmm4 = xmm4 + a1 * b4;
2685 xmm5 = xmm5 + a2 * b1;
2686 xmm6 = xmm6 + a2 * b2;
2687 xmm7 = xmm7 + a2 * b3;
2688 xmm8 = xmm8 + a2 * b4;
2690 (~C).
store( i , j , xmm1 * factor );
2691 (~C).
store( i , j+IT::size , xmm2 * factor );
2692 (~C).
store( i , j+IT::size*2UL, xmm3 * factor );
2693 (~C).
store( i , j+IT::size*3UL, xmm4 * factor );
2694 (~C).
store( i+1UL, j , xmm5 * factor );
2695 (~C).
store( i+1UL, j+IT::size , xmm6 * factor );
2696 (~C).
store( i+1UL, j+IT::size*2UL, xmm7 * factor );
2697 (~C).
store( i+1UL, j+IT::size*3UL, xmm8 * factor );
// Single leftover row of the 4-vector block (its guarding condition is not visible here).
2701 for(
size_t k=0UL; k<K; ++k ) {
2703 xmm1 = xmm1 + a1 * B.load(k,j );
2704 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2705 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2706 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2708 (~C).
store( i, j , xmm1 * factor );
2709 (~C).
store( i, j+IT::size , xmm2 * factor );
2710 (~C).
store( i, j+IT::size*2UL, xmm3 * factor );
2711 (~C).
store( i, j+IT::size*3UL, xmm4 * factor );
// Blocks of 2 vectors: two rows of C at a time (2 x 2 accumulators).
2714 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2716 for( ; (i+2UL) <= M; i+=2UL ) {
2718 for(
size_t k=0UL; k<K; ++k ) {
2723 xmm1 = xmm1 + a1 * b1;
2724 xmm2 = xmm2 + a1 * b2;
2725 xmm3 = xmm3 + a2 * b1;
2726 xmm4 = xmm4 + a2 * b2;
2728 (~C).
store( i , j , xmm1 * factor );
2729 (~C).
store( i , j+IT::size, xmm2 * factor );
2730 (~C).
store( i+1UL, j , xmm3 * factor );
2731 (~C).
store( i+1UL, j+IT::size, xmm4 * factor );
// Single leftover row of the 2-vector block (its guarding condition is not visible here).
2735 for(
size_t k=0UL; k<K; ++k ) {
2737 xmm1 = xmm1 + a1 * B.load(k,j );
2738 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2740 (~C).
store( i, j , xmm1 * factor );
2741 (~C).
store( i, j+IT::size, xmm2 * factor );
// Last single vector of columns (the enclosing condition is not visible here): two rows
// at a time, broadcasting the elements of A with set().
2746 for( ; (i+2UL) <= M; i+=2UL ) {
2748 for(
size_t k=0UL; k<K; ++k ) {
2750 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2751 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2753 (~C).
store( i , j, xmm1 * factor );
2754 (~C).
store( i+1UL, j, xmm2 * factor );
// Single leftover row of the last vector of columns.
2758 for(
size_t k=0UL; k<K; ++k ) {
2759 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2761 (~C).
store( i, j, xmm1 * factor );
// Vectorized default kernel for the assignment of a scaled matrix product to a
// column-major dense matrix (DenseMatrix<MT3,true>).
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
//
// The rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector
// (IT::size elements each). For every block the products are accumulated over k in
// intrinsic accumulators (xmm1..xmm8) and multiplied by 'factor' when stored into C.
//
// NOTE(review): the declarations of 'factor', 'a1'..'a4', 'b1', 'b2' and the loop
// counters are on lines not visible in this excerpt; 'factor' presumably holds 'scalar'
// broadcast to an intrinsic vector.
// NOTE(review): the accumulators are declared without an initializer, so the summation
// relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): the block conditions have the form (i + IT::size*(n-1)) < M, so the last
// vector of a block may extend past row M; presumably the columns are padded accordingly
// -- confirm.
2781 template<
typename MT3
2785 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2786 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
2788 typedef IntrinsicTrait<ElementType> IT;
// C is M x N, the inner dimension is K.
2790 const size_t M( A.rows() );
2791 const size_t N( B.columns() );
2792 const size_t K( A.columns() );
// Blocks of 8 vectors: one column of C at a time, 8 accumulators.
2798 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2799 for(
size_t j=0UL; j<N; ++j ) {
2800 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2801 for(
size_t k=0UL; k<K; ++k ) {
2803 xmm1 = xmm1 + A.load(i ,k) * b1;
2804 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
2805 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
2806 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
2807 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
2808 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
2809 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
2810 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
2812 (~C).
store( i , j, xmm1 * factor );
2813 (~C).
store( i+IT::size , j, xmm2 * factor );
2814 (~C).
store( i+IT::size*2UL, j, xmm3 * factor );
2815 (~C).
store( i+IT::size*3UL, j, xmm4 * factor );
2816 (~C).
store( i+IT::size*4UL, j, xmm5 * factor );
2817 (~C).
store( i+IT::size*5UL, j, xmm6 * factor );
2818 (~C).
store( i+IT::size*6UL, j, xmm7 * factor );
2819 (~C).
store( i+IT::size*7UL, j, xmm8 * factor );
// Blocks of 4 vectors: two columns of C at a time (4 x 2 accumulators).
2822 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2824 for( ; (j+2UL) <= N; j+=2UL ) {
2825 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2826 for(
size_t k=0UL; k<K; ++k ) {
2833 xmm1 = xmm1 + a1 * b1;
2834 xmm2 = xmm2 + a2 * b1;
2835 xmm3 = xmm3 + a3 * b1;
2836 xmm4 = xmm4 + a4 * b1;
2837 xmm5 = xmm5 + a1 * b2;
2838 xmm6 = xmm6 + a2 * b2;
2839 xmm7 = xmm7 + a3 * b2;
2840 xmm8 = xmm8 + a4 * b2;
2842 (~C).
store( i , j , xmm1 * factor );
2843 (~C).
store( i+IT::size , j , xmm2 * factor );
2844 (~C).
store( i+IT::size*2UL, j , xmm3 * factor );
2845 (~C).
store( i+IT::size*3UL, j , xmm4 * factor );
2846 (~C).
store( i , j+1UL, xmm5 * factor );
2847 (~C).
store( i+IT::size , j+1UL, xmm6 * factor );
2848 (~C).
store( i+IT::size*2UL, j+1UL, xmm7 * factor );
2849 (~C).
store( i+IT::size*3UL, j+1UL, xmm8 * factor );
// Single leftover column of the 4-vector block (its guarding condition is not visible here).
2853 for(
size_t k=0UL; k<K; ++k ) {
2855 xmm1 = xmm1 + A.load(i ,k) * b1;
2856 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
2857 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
2858 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
2860 (~C).
store( i , j, xmm1 * factor );
2861 (~C).
store( i+IT::size , j, xmm2 * factor );
2862 (~C).
store( i+IT::size*2UL, j, xmm3 * factor );
2863 (~C).
store( i+IT::size*3UL, j, xmm4 * factor );
// Blocks of 2 vectors: two columns of C at a time (2 x 2 accumulators).
2866 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2868 for( ; (j+2UL) <= N; j+=2UL ) {
2870 for(
size_t k=0UL; k<K; ++k ) {
2875 xmm1 = xmm1 + a1 * b1;
2876 xmm2 = xmm2 + a2 * b1;
2877 xmm3 = xmm3 + a1 * b2;
2878 xmm4 = xmm4 + a2 * b2;
2880 (~C).
store( i , j , xmm1 * factor );
2881 (~C).
store( i+IT::size, j , xmm2 * factor );
2882 (~C).
store( i , j+1UL, xmm3 * factor );
2883 (~C).
store( i+IT::size, j+1UL, xmm4 * factor );
// Single leftover column of the 2-vector block (its guarding condition is not visible here).
2887 for(
size_t k=0UL; k<K; ++k ) {
2889 xmm1 = xmm1 + A.load(i ,k) * b1;
2890 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
2892 (~C).
store( i , j, xmm1 * factor );
2893 (~C).
store( i+IT::size, j, xmm2 * factor );
// Last single vector of rows (the enclosing condition is not visible here): two columns
// at a time, broadcasting the elements of B with set().
2898 for( ; (j+2UL) <= N; j+=2UL ) {
2900 for(
size_t k=0UL; k<K; ++k ) {
2902 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2903 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2905 (~C).
store( i, j , xmm1 * factor );
2906 (~C).
store( i, j+1UL, xmm2 * factor );
// Single leftover column of the last vector of rows.
2910 for(
size_t k=0UL; k<K; ++k ) {
2911 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
2913 (~C).
store( i, j, xmm1 * factor );
// Fallback overload of the BLAS assignment dispatch for the scaled product.
// Enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> is true; it only forwards all four
// arguments to selectDefaultAssignKernel().
2933 template<
typename MT3
2937 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2938 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
2940 selectDefaultAssignKernel( C, A, B, scalar );
// BLAS-based assignment of a scaled product (C = scalar*A*B) via cblas_sgemm.
// Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
2959 template<
typename MT3
2963 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2964 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
2966 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
2972 const int M ( numeric_cast<int>( A.rows() ) );
2973 const int N ( numeric_cast<int>( B.columns() ) );
2974 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
2975 const int lda( numeric_cast<int>( A.spacing() ) );
2976 const int ldb( numeric_cast<int>( B.spacing() ) );
2977 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 0: the previous content of C does not enter the result.
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
2979 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2980 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2981 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2982 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
// BLAS-based assignment of a scaled product (C = scalar*A*B) via cblas_dgemm.
// Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
3002 template<
typename MT3
3006 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3007 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3009 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3015 const int M ( numeric_cast<int>( A.rows() ) );
3016 const int N ( numeric_cast<int>( B.columns() ) );
3017 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3018 const int lda( numeric_cast<int>( A.spacing() ) );
3019 const int ldb( numeric_cast<int>( B.spacing() ) );
3020 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 0: the previous content of C does not enter the result.
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3022 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3023 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3024 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3025 M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
// BLAS-based assignment of a scaled product (C = scalar*A*B) via cblas_cgemm.
// Enabled only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. all three
// element types are complex<float>; the scalar type ST2 is not part of the condition.
3045 template<
typename MT3
3049 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3050 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3052 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3061 const int M ( numeric_cast<int>( A.rows() ) );
3062 const int N ( numeric_cast<int>( B.columns() ) );
3063 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3064 const int lda( numeric_cast<int>( A.spacing() ) );
3065 const int ldb( numeric_cast<int>( B.spacing() ) );
3066 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address. alpha is constructed from
// the scalar; beta = 0 means the previous content of C does not enter the result.
3067 const complex<float> alpha( scalar );
3068 const complex<float> beta ( 0.0F, 0.0F );
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3070 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3071 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3072 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3073 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-based assignment of a scaled product (C = scalar*A*B) via cblas_zgemm.
// Enabled only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. all three
// element types are complex<double>; the scalar type ST2 is not part of the condition.
3093 template<
typename MT3
3097 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3098 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3100 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3109 const int M ( numeric_cast<int>( A.rows() ) );
3110 const int N ( numeric_cast<int>( B.columns() ) );
3111 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3112 const int lda( numeric_cast<int>( A.spacing() ) );
3113 const int ldb( numeric_cast<int>( B.spacing() ) );
3114 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address. alpha is constructed from
// the scalar; beta = 0 means the previous content of C does not enter the result.
3115 const complex<double> alpha( scalar );
3116 const complex<double> beta ( 0.0, 0.0 );
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3118 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3119 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3120 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3121 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Assignment of the scaled matrix product to a sparse matrix.
3138 template<
typename MT
3140 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type: chosen between ResultType and OppositeType based on the storage order
// flag SO of the target.
3144 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Evaluate the whole expression into the temporary.
// NOTE(review): the statement that transfers 'tmp' into 'lhs' is not visible in this
// excerpt.
3156 const TmpType tmp( rhs );
// Addition assignment of the scaled matrix product to a dense matrix
// (lhs += scalar * (A*B)).
3173 template<
typename MT3
3175 friend inline void addAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
3182 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3183 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or zero inner dimension (branch body not visible here).
3185 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel selection, passing the scalar.
// NOTE(review): the definitions of A and B and the condition choosing between the two
// calls are on lines not visible in this excerpt.
3200 DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3202 DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Non-vectorized default kernel for the addition assignment of a scaled matrix product.
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf).
// NOTE(review): the function body is not visible in this excerpt.
3220 template<
typename MT3
3224 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3225 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default kernel for the addition assignment of a scaled matrix product to a
// row-major dense matrix (DenseMatrix<MT3,false>).
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
//
// The columns of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector
// (IT::size elements each). For every block the products are accumulated over k in
// intrinsic accumulators (xmm1..xmm8); on store, each accumulator is multiplied by
// 'factor' and added to the value loaded from the same position of C.
//
// NOTE(review): the declarations of 'factor', 'a1', 'a2', 'b1'..'b4' and the loop
// counters are on lines not visible in this excerpt; 'factor' presumably holds 'scalar'
// broadcast to an intrinsic vector.
// NOTE(review): the accumulators are declared without an initializer, so the summation
// relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): the block conditions have the form (j + IT::size*(n-1)) < N, so the last
// vector of a block may extend past column N; presumably the rows are padded accordingly
// -- confirm.
3246 template<
typename MT3
3250 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3251 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3253 typedef IntrinsicTrait<ElementType> IT;
// C is M x N, the inner dimension is K.
3255 const size_t M( A.rows() );
3256 const size_t N( B.columns() );
3257 const size_t K( A.columns() );
// Blocks of 8 vectors: one row of C at a time, 8 accumulators.
3263 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3264 for(
size_t i=0UL; i<M; ++i ) {
3265 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3266 for(
size_t k=0UL; k<K; ++k ) {
3268 xmm1 = xmm1 + a1 * B.load(k,j );
3269 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3270 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3271 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3272 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3273 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3274 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3275 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3277 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3278 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3279 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3280 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3281 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
3282 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
3283 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
3284 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
// Blocks of 4 vectors: two rows of C at a time (2 x 4 accumulators).
3287 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3289 for( ; (i+2UL) <= M; i+=2UL ) {
3290 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3291 for(
size_t k=0UL; k<K; ++k ) {
3298 xmm1 = xmm1 + a1 * b1;
3299 xmm2 = xmm2 + a1 * b2;
3300 xmm3 = xmm3 + a1 * b3;
3301 xmm4 = xmm4 + a1 * b4;
3302 xmm5 = xmm5 + a2 * b1;
3303 xmm6 = xmm6 + a2 * b2;
3304 xmm7 = xmm7 + a2 * b3;
3305 xmm8 = xmm8 + a2 * b4;
3307 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3308 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
3309 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
3310 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
3311 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3312 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
3313 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
3314 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
// Single leftover row of the 4-vector block (its guarding condition is not visible here).
3318 for(
size_t k=0UL; k<K; ++k ) {
3320 xmm1 = xmm1 + a1 * B.load(k,j );
3321 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3322 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3323 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3325 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3326 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3327 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3328 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
// Blocks of 2 vectors: two rows of C at a time (2 x 2 accumulators).
3331 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3333 for( ; (i+2UL) <= M; i+=2UL ) {
3335 for(
size_t k=0UL; k<K; ++k ) {
3340 xmm1 = xmm1 + a1 * b1;
3341 xmm2 = xmm2 + a1 * b2;
3342 xmm3 = xmm3 + a2 * b1;
3343 xmm4 = xmm4 + a2 * b2;
3345 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3346 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
3347 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3348 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
// Single leftover row of the 2-vector block (its guarding condition is not visible here).
3352 for(
size_t k=0UL; k<K; ++k ) {
3354 xmm1 = xmm1 + a1 * B.load(k,j );
3355 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3357 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3358 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
// Last single vector of columns (the enclosing condition is not visible here): two rows
// at a time, broadcasting the elements of A with set().
3363 for( ; (i+2UL) <= M; i+=2UL ) {
3365 for(
size_t k=0UL; k<K; ++k ) {
3367 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3368 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3370 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3371 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Single leftover row of the last vector of columns.
3375 for(
size_t k=0UL; k<K; ++k ) {
3376 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3378 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Vectorized default kernel for the addition assignment of a scaled matrix product to a
// column-major dense matrix (DenseMatrix<MT3,true>).
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true.
//
// The rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector
// (IT::size elements each). For every block the products are accumulated over k in
// intrinsic accumulators (xmm1..xmm8); on store, each accumulator is multiplied by
// 'factor' and added to the value loaded from the same position of C.
//
// NOTE(review): the declarations of 'factor', 'a1'..'a4', 'b1', 'b2' and the loop
// counters are on lines not visible in this excerpt; 'factor' presumably holds 'scalar'
// broadcast to an intrinsic vector.
// NOTE(review): the accumulators are declared without an initializer, so the summation
// relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): the block conditions have the form (i + IT::size*(n-1)) < M, so the last
// vector of a block may extend past row M; presumably the columns are padded accordingly
// -- confirm.
3398 template<
typename MT3
3402 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3403 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3405 typedef IntrinsicTrait<ElementType> IT;
// C is M x N, the inner dimension is K.
3407 const size_t M( A.rows() );
3408 const size_t N( B.columns() );
3409 const size_t K( A.columns() );
// Blocks of 8 vectors: one column of C at a time, 8 accumulators.
3415 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3416 for(
size_t j=0UL; j<N; ++j ) {
3417 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3418 for(
size_t k=0UL; k<K; ++k ) {
3420 xmm1 = xmm1 + A.load(i ,k) * b1;
3421 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3422 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3423 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3424 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3425 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3426 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3427 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3429 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3430 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3431 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3432 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3433 (~C).
store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
3434 (~C).
store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
3435 (~C).
store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
3436 (~C).
store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
// Blocks of 4 vectors: two columns of C at a time (4 x 2 accumulators).
3439 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3441 for( ; (j+2UL) <= N; j+=2UL ) {
3442 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3443 for(
size_t k=0UL; k<K; ++k ) {
3450 xmm1 = xmm1 + a1 * b1;
3451 xmm2 = xmm2 + a2 * b1;
3452 xmm3 = xmm3 + a3 * b1;
3453 xmm4 = xmm4 + a4 * b1;
3454 xmm5 = xmm5 + a1 * b2;
3455 xmm6 = xmm6 + a2 * b2;
3456 xmm7 = xmm7 + a3 * b2;
3457 xmm8 = xmm8 + a4 * b2;
3459 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3460 (~C).
store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
3461 (~C).
store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
3462 (~C).
store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
3463 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
3464 (~C).
store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
3465 (~C).
store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
3466 (~C).
store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
// Single leftover column of the 4-vector block (its guarding condition is not visible here).
3470 for(
size_t k=0UL; k<K; ++k ) {
3472 xmm1 = xmm1 + A.load(i ,k) * b1;
3473 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3474 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3475 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3477 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3478 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3479 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3480 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
// Blocks of 2 vectors: two columns of C at a time (2 x 2 accumulators).
3483 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3485 for( ; (j+2UL) <= N; j+=2UL ) {
3487 for(
size_t k=0UL; k<K; ++k ) {
3492 xmm1 = xmm1 + a1 * b1;
3493 xmm2 = xmm2 + a2 * b1;
3494 xmm3 = xmm3 + a1 * b2;
3495 xmm4 = xmm4 + a2 * b2;
3497 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3498 (~C).
store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
3499 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
3500 (~C).
store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
// Single leftover column of the 2-vector block (its guarding condition is not visible here).
3504 for(
size_t k=0UL; k<K; ++k ) {
3506 xmm1 = xmm1 + A.load(i ,k) * b1;
3507 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3509 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
3510 (~C).
store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
// Last single vector of rows (the enclosing condition is not visible here): two columns
// at a time, broadcasting the elements of B with set().
3515 for( ; (j+2UL) <= N; j+=2UL ) {
3517 for(
size_t k=0UL; k<K; ++k ) {
3519 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3520 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3522 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
3523 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// Single leftover column of the last vector of rows.
3527 for(
size_t k=0UL; k<K; ++k ) {
3528 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3530 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Fallback overload of the BLAS addition-assignment dispatch for the scaled product.
// Enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> is true; it only forwards all four
// arguments to selectDefaultAddAssignKernel().
3550 template<
typename MT3
3554 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3555 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
3557 selectDefaultAddAssignKernel( C, A, B, scalar );
// BLAS-based addition assignment of a scaled product (C += scalar*A*B) via cblas_sgemm.
// Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is true.
3576 template<
typename MT3
3580 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3581 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3583 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3589 const int M ( numeric_cast<int>( A.rows() ) );
3590 const int N ( numeric_cast<int>( B.columns() ) );
3591 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3592 const int lda( numeric_cast<int>( A.spacing() ) );
3593 const int ldb( numeric_cast<int>( B.spacing() ) );
3594 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 1: the scaled product is added to the existing content of C.
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3596 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3597 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3598 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3599 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-based addition assignment of a scaled product (C += scalar*A*B) via cblas_dgemm.
// Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is true.
3619 template<
typename MT3
3623 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3624 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3626 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3632 const int M ( numeric_cast<int>( A.rows() ) );
3633 const int N ( numeric_cast<int>( B.columns() ) );
3634 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3635 const int lda( numeric_cast<int>( A.spacing() ) );
3636 const int ldb( numeric_cast<int>( B.spacing() ) );
3637 const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = scalar, beta = 1: the scaled product is added to the existing content of C.
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3639 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3640 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3641 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3642 M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-based addition assignment of a scaled product (C += scalar*A*B) via cblas_cgemm.
// Enabled only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. all three
// element types are complex<float>; the scalar type ST2 is not part of the condition.
3662 template<
typename MT3
3666 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3667 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3669 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3678 const int M ( numeric_cast<int>( A.rows() ) );
3679 const int N ( numeric_cast<int>( B.columns() ) );
3680 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3681 const int lda( numeric_cast<int>( A.spacing() ) );
3682 const int ldb( numeric_cast<int>( B.spacing() ) );
3683 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address. alpha is constructed from
// the scalar; beta = 1 adds the scaled product to the existing content of C.
3684 const complex<float> alpha( scalar );
3685 const complex<float> beta ( 1.0F, 0.0F );
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3687 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3688 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3689 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3690 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-based addition assignment of a scaled product (C += scalar*A*B) via cblas_zgemm.
// Enabled only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true, i.e. all three
// element types are complex<double>; the scalar type ST2 is not part of the condition.
3710 template<
typename MT3
3714 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3715 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The CBLAS interface takes int dimensions; the size_t values are converted with
// boost::numeric_cast (a range-checked conversion).
3717 using boost::numeric_cast;
// Problem size: C is M x N, the inner dimension is K = A.columns().
3726 const int M ( numeric_cast<int>( A.rows() ) );
3727 const int N ( numeric_cast<int>( B.columns() ) );
3728 const int K ( numeric_cast<int>( A.columns() ) );
// Leading dimensions are taken from each operand's spacing().
3729 const int lda( numeric_cast<int>( A.spacing() ) );
3730 const int ldb( numeric_cast<int>( B.spacing() ) );
3731 const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address. alpha is constructed from
// the scalar; beta = 1 adds the scaled product to the existing content of C.
3732 const complex<double> alpha( scalar );
3733 const complex<double> beta ( 1.0, 0.0 );
// The layout argument follows the storage order of C; A is flagged transposed when C is
// row-major, B is flagged transposed when C is column-major.
3735 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3736 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3737 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3738 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Subtraction assignment of a scaled matrix product (DMatScalarMultExpr) to a dense matrix.
// NOTE(review): several original lines are absent from this listing (the definitions of A and
// B, the body of the emptiness guard and the condition that chooses between the two kernels),
// so only the visible steps are described here.
3759 template<
typename MT3
3761 friend inline void subAssign( DenseMatrix<MT3,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
3768 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3769 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard: the target has no rows or no columns, or the inner dimension (left.columns()) is zero.
3771 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch: one path runs the default kernel, the other goes through the BLAS kernel
// selection; both receive the scaling factor rhs.scalar_.
3786 DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3788 DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Default subtraction-assignment kernel for a scaled matrix product, non-vectorized overload.
// DisableIf removes this overload whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the function body is not part of this listing.
3806 template<
typename MT3
3810 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3811 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Vectorized default subtraction-assignment kernel for a row-major target (DenseMatrix<MT3,false>).
// Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Every tile of C is updated as
// C(tile) = C(tile) - (accumulated products) * factor, with IT::size elements per intrinsic vector.
// NOTE(review): the declarations of i, j, factor, a1/a2 and b1..b4 sit on lines missing from this
// listing; factor is presumably the broadcast of scalar - confirm against the full source.
// NOTE(review): the accumulators xmm1.. are only default-constructed in the visible code; this
// assumes IntrinsicType default-initializes to zero - confirm.
3832 template<
typename MT3
3836 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3837 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3839 typedef IntrinsicTrait<ElementType> IT;
3841 const size_t M( A.rows() );
3842 const size_t N( B.columns() );
3843 const size_t K( A.columns() );
// Tier 1: column blocks of 8 intrinsic vectors, one row of C per inner iteration.
// NOTE(review): the condition only requires the block's last vector to start before N, so that
// vector may reach past column N - presumably safe because of padded storage; confirm.
3849 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3850 for(
size_t i=0UL; i<M; ++i ) {
3851 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3852 for(
size_t k=0UL; k<K; ++k ) {
3854 xmm1 = xmm1 + a1 * B.load(k,j );
3855 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3856 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3857 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3858 xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3859 xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3860 xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3861 xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
// Write-back: subtract the scaled accumulators from the current contents of C.
3863 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
3864 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
3865 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
3866 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
3867 (~C).
store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
3868 (~C).
store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
3869 (~C).
store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
3870 (~C).
store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
// Tier 2: column blocks of 4 vectors; rows are processed in pairs (xmm1-4 for row i,
// xmm5-8 for row i+1).
3873 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3875 for( ; (i+2UL) <= M; i+=2UL ) {
3876 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3877 for(
size_t k=0UL; k<K; ++k ) {
3884 xmm1 = xmm1 + a1 * b1;
3885 xmm2 = xmm2 + a1 * b2;
3886 xmm3 = xmm3 + a1 * b3;
3887 xmm4 = xmm4 + a1 * b4;
3888 xmm5 = xmm5 + a2 * b1;
3889 xmm6 = xmm6 + a2 * b2;
3890 xmm7 = xmm7 + a2 * b3;
3891 xmm8 = xmm8 + a2 * b4;
3893 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
3894 (~C).
store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
3895 (~C).
store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
3896 (~C).
store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
3897 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
3898 (~C).
store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
3899 (~C).
store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
3900 (~C).
store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
// Single-row variant of the 4-vector block (presumably the leftover odd row; its guard is on
// a line missing from this listing).
3904 for(
size_t k=0UL; k<K; ++k ) {
3906 xmm1 = xmm1 + a1 * B.load(k,j );
3907 xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3908 xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3909 xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3911 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
3912 (~C).
store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
3913 (~C).
store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
3914 (~C).
store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
// Tier 3: column blocks of 2 vectors; rows are processed in pairs.
3917 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3919 for( ; (i+2UL) <= M; i+=2UL ) {
3921 for(
size_t k=0UL; k<K; ++k ) {
3926 xmm1 = xmm1 + a1 * b1;
3927 xmm2 = xmm2 + a1 * b2;
3928 xmm3 = xmm3 + a2 * b1;
3929 xmm4 = xmm4 + a2 * b2;
3931 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
3932 (~C).
store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
3933 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
3934 (~C).
store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
// Single-row variant of the 2-vector block.
3938 for(
size_t k=0UL; k<K; ++k ) {
3940 xmm1 = xmm1 + a1 * B.load(k,j );
3941 xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3943 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
3944 (~C).
store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
// Tier 4: a single vector column block; two rows per iteration, with the elements of A
// broadcast through set().
3949 for( ; (i+2UL) <= M; i+=2UL ) {
3951 for(
size_t k=0UL; k<K; ++k ) {
3953 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
3954 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
3956 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
3957 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// Single-row variant of the single-vector block.
3961 for(
size_t k=0UL; k<K; ++k ) {
3962 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
3964 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Vectorized default subtraction-assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Mirrors the row-major kernel
// with the roles of rows and columns swapped: vectors are loaded from A along i, and every tile
// of C is updated as C(tile) = C(tile) - (accumulated products) * factor.
// NOTE(review): the declarations of i, j, factor, a1..a4 and b1/b2 sit on lines missing from this
// listing; factor is presumably the broadcast of scalar - confirm against the full source.
// NOTE(review): the accumulators xmm1.. are only default-constructed in the visible code; this
// assumes IntrinsicType default-initializes to zero - confirm.
3984 template<
typename MT3
3988 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3989 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
3991 typedef IntrinsicTrait<ElementType> IT;
3993 const size_t M( A.rows() );
3994 const size_t N( B.columns() );
3995 const size_t K( A.columns() );
// Tier 1: row blocks of 8 intrinsic vectors, one column of C per inner iteration.
// NOTE(review): the condition only requires the block's last vector to start before M, so that
// vector may reach past row M - presumably safe because of padded storage; confirm.
4001 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4002 for(
size_t j=0UL; j<N; ++j ) {
4003 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4004 for(
size_t k=0UL; k<K; ++k ) {
4006 xmm1 = xmm1 + A.load(i ,k) * b1;
4007 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4008 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4009 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4010 xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4011 xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4012 xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4013 xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
// Write-back: subtract the scaled accumulators from the current contents of C.
4015 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4016 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4017 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4018 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4019 (~C).
store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
4020 (~C).
store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
4021 (~C).
store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
4022 (~C).
store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
// Tier 2: row blocks of 4 vectors; columns are processed in pairs (xmm1-4 for column j,
// xmm5-8 for column j+1).
4025 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
4027 for( ; (j+2UL) <= N; j+=2UL ) {
4028 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4029 for(
size_t k=0UL; k<K; ++k ) {
4036 xmm1 = xmm1 + a1 * b1;
4037 xmm2 = xmm2 + a2 * b1;
4038 xmm3 = xmm3 + a3 * b1;
4039 xmm4 = xmm4 + a4 * b1;
4040 xmm5 = xmm5 + a1 * b2;
4041 xmm6 = xmm6 + a2 * b2;
4042 xmm7 = xmm7 + a3 * b2;
4043 xmm8 = xmm8 + a4 * b2;
4045 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4046 (~C).
store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
4047 (~C).
store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
4048 (~C).
store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
4049 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4050 (~C).
store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
4051 (~C).
store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
4052 (~C).
store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
// Single-column variant of the 4-vector block (presumably the leftover odd column; its guard
// is on a line missing from this listing).
4056 for(
size_t k=0UL; k<K; ++k ) {
4058 xmm1 = xmm1 + A.load(i ,k) * b1;
4059 xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4060 xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4061 xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4063 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4064 (~C).
store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4065 (~C).
store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4066 (~C).
store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
// Tier 3: row blocks of 2 vectors; columns are processed in pairs.
4069 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
4071 for( ; (j+2UL) <= N; j+=2UL ) {
4073 for(
size_t k=0UL; k<K; ++k ) {
4078 xmm1 = xmm1 + a1 * b1;
4079 xmm2 = xmm2 + a2 * b1;
4080 xmm3 = xmm3 + a1 * b2;
4081 xmm4 = xmm4 + a2 * b2;
4083 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4084 (~C).
store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
4085 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4086 (~C).
store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
// Single-column variant of the 2-vector block.
4090 for(
size_t k=0UL; k<K; ++k ) {
4092 xmm1 = xmm1 + A.load(i ,k) * b1;
4093 xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4095 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
4096 (~C).
store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
// Tier 4: a single vector row block; two columns per iteration, with the elements of B
// broadcast through set().
4101 for( ; (j+2UL) <= N; j+=2UL ) {
4103 for(
size_t k=0UL; k<K; ++k ) {
4105 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4106 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4108 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
4109 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// Single-column variant of the single-vector block.
4113 for(
size_t k=0UL; k<K; ++k ) {
4114 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4116 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Fallback overload of the BLAS subtraction-assignment kernel selection.
// Enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> holds; it performs no BLAS call and forwards
// all arguments unchanged to selectDefaultSubAssignKernel.
4136 template<
typename MT3
4140 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4141 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4143 selectDefaultSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction-assignment kernel for a scaled matrix product, single precision variant.
// EnableIf keeps this overload only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// The subtraction is expressed through cblas_sgemm with alpha = -scalar and beta = 1.0F, i.e.
// the negated scaled product is accumulated into C.
4162 template<
typename MT3
4166 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4167 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4169 using boost::numeric_cast;
// Extents and leading dimensions (spacing()) are converted to int for the CBLAS interface
// through boost::numeric_cast (a range-checked conversion).
4175 const int M ( numeric_cast<int>( A.rows() ) );
4176 const int N ( numeric_cast<int>( B.columns() ) );
4177 const int K ( numeric_cast<int>( A.columns() ) );
4178 const int lda( numeric_cast<int>( A.spacing() ) );
4179 const int ldb( numeric_cast<int>( B.spacing() ) );
4180 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: CblasRowMajor with A flagged as transposed; otherwise CblasColMajor with B
// flagged as transposed.
// NOTE(review): presumably tied to A being column-major and B row-major - confirm.
4182 cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4183 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4184 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4185 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for a scaled matrix product, double precision variant.
// EnableIf keeps this overload only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// The subtraction is expressed through cblas_dgemm with alpha = -scalar and beta = 1.0, i.e.
// the negated scaled product is accumulated into C.
4205 template<
typename MT3
4209 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4210 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4212 using boost::numeric_cast;
// Extents and leading dimensions are converted to int for the CBLAS interface.
4218 const int M ( numeric_cast<int>( A.rows() ) );
4219 const int N ( numeric_cast<int>( B.columns() ) );
4220 const int K ( numeric_cast<int>( A.columns() ) );
4221 const int lda( numeric_cast<int>( A.spacing() ) );
4222 const int ldb( numeric_cast<int>( B.spacing() ) );
4223 const int ldc( numeric_cast<int>( C.spacing() ) );
// Row-major C: CblasRowMajor with A flagged as transposed; otherwise CblasColMajor with B
// flagged as transposed.
// NOTE(review): presumably tied to A being column-major and B row-major - confirm.
4225 cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4226 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4227 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4228 M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for a scaled matrix product, complex<float> variant.
// EnableIf keeps this overload only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The subtraction is expressed through cblas_cgemm with alpha = -scalar and beta = (1,0).
4248 template<
typename MT3
4252 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4253 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4255 using boost::numeric_cast;
// Extents and leading dimensions are converted to int for the CBLAS interface.
4264 const int M ( numeric_cast<int>( A.rows() ) );
4265 const int N ( numeric_cast<int>( B.columns() ) );
4266 const int K ( numeric_cast<int>( A.columns() ) );
4267 const int lda( numeric_cast<int>( A.spacing() ) );
4268 const int ldb( numeric_cast<int>( B.spacing() ) );
4269 const int ldc( numeric_cast<int>( C.spacing() ) );
// The negated scaling factor and the unit beta are locals because cblas_cgemm receives them
// by address.
4270 const complex<float> alpha( -scalar );
4271 const complex<float> beta ( 1.0F, 0.0F );
// Row-major C: CblasRowMajor with A flagged as transposed; otherwise CblasColMajor with B
// flagged as transposed.
// NOTE(review): presumably tied to A being column-major and B row-major - confirm.
4273 cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4274 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4275 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4276 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// BLAS-backed subtraction-assignment kernel for a scaled matrix product, complex<double> variant.
// EnableIf keeps this overload only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The subtraction is expressed through cblas_zgemm with alpha = -scalar and beta = (1,0).
4296 template<
typename MT3
4300 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4301 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4303 using boost::numeric_cast;
// Extents and leading dimensions are converted to int for the CBLAS interface.
4312 const int M ( numeric_cast<int>( A.rows() ) );
4313 const int N ( numeric_cast<int>( B.columns() ) );
4314 const int K ( numeric_cast<int>( A.columns() ) );
4315 const int lda( numeric_cast<int>( A.spacing() ) );
4316 const int ldb( numeric_cast<int>( B.spacing() ) );
4317 const int ldc( numeric_cast<int>( C.spacing() ) );
// The negated scaling factor and the unit beta are locals because cblas_zgemm receives them
// by address.
4318 const complex<double> alpha( -scalar );
4319 const complex<double> beta ( 1.0, 0.0 );
// Row-major C: CblasRowMajor with A flagged as transposed; otherwise CblasColMajor with B
// flagged as transposed.
// NOTE(review): presumably tied to A being column-major and B row-major - confirm.
4321 cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4322 ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4323 ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4324 M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
// Free function returning a TDMatDMatMultExpr<T1,T2> expression object; it raises
// std::invalid_argument on a size mismatch.
// NOTE(review): the function name/parameter list and the size check guarding the throw are on
// lines missing from this listing; presumably this is the operator* for a column-major times
// row-major dense matrix product - confirm against the full source.
4393 template<
typename T1
4395 inline const TDMatDMatMultExpr<T1,T2>
4401 throw std::invalid_argument(
"Matrix sizes do not match" );
// Expression trait typedef for multiplying the matrix product by a dense column vector
// (the struct header of this specialization is not part of this listing).
// Type is TDMatDVecMultExprTrait< MT1, DMatDVecMultExprTrait<MT2,VT>::Type >::Type when MT1 is a
// column-major dense matrix, MT2 a row-major dense matrix and VT a dense column vector;
// otherwise Type is INVALID_TYPE. The inner trait yields the MT2/VT product type, which then
// serves as the vector operand for MT1.
4418 template<
typename MT1,
typename MT2,
typename VT >
4423 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4424 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4425 IsDenseVector<VT>::value && IsColumnVector<VT>::value
4426 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4427 , INVALID_TYPE >::Type Type;
// Expression trait typedef for multiplying the matrix product by a sparse column vector
// (the struct header of this specialization is not part of this listing).
// Type is TDMatDVecMultExprTrait< MT1, DMatSVecMultExprTrait<MT2,VT>::Type >::Type when MT1 is a
// column-major dense matrix, MT2 a row-major dense matrix and VT a sparse column vector;
// otherwise Type is INVALID_TYPE. The inner trait yields the MT2/VT product type, which then
// serves as the vector operand for MT1.
4436 template<
typename MT1,
typename MT2,
typename VT >
4441 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4442 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4443 IsSparseVector<VT>::value && IsColumnVector<VT>::value
4444 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4445 , INVALID_TYPE >::Type Type;
// Expression trait typedef for multiplying a dense row vector by the matrix product
// (the struct header of this specialization is not part of this listing).
// Type is TDVecDMatMultExprTrait< TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type when VT is a
// dense row vector, MT1 a column-major dense matrix and MT2 a row-major dense matrix;
// otherwise Type is INVALID_TYPE. The inner trait yields the VT/MT1 product type, which then
// serves as the vector operand for MT2.
4454 template<
typename VT,
typename MT1,
typename MT2 >
4459 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
4460 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4461 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4462 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4463 , INVALID_TYPE >::Type Type;
// Expression trait typedef for multiplying a sparse row vector by the matrix product
// (the struct header of this specialization is not part of this listing).
// Type is TDVecDMatMultExprTrait< TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type when VT is a
// sparse row vector, MT1 a column-major dense matrix and MT2 a row-major dense matrix;
// otherwise Type is INVALID_TYPE. The inner trait yields the VT/MT1 product type, which then
// serves as the vector operand for MT2.
4472 template<
typename VT,
typename MT1,
typename MT2 >
4477 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
4478 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4479 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4480 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4481 , INVALID_TYPE >::Type Type;
// Trait typedef built from the submatrix types of both operands (struct header not part of this
// listing): Type is the MultExprTrait of SubmatrixExprTrait<const MT1>::Type and
// SubmatrixExprTrait<const MT2>::Type.
// NOTE(review): presumably the SubmatrixExprTrait specialization for TDMatDMatMultExpr - confirm.
4490 template<
typename MT1,
typename MT2 >
4495 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1>::Type
4496 ,
typename SubmatrixExprTrait<const MT2>::Type >::Type Type;
// Trait typedef built from a row of the left operand (struct header not part of this listing):
// Type is the MultExprTrait of RowExprTrait<const MT1>::Type and MT2.
// NOTE(review): presumably the RowExprTrait specialization for TDMatDMatMultExpr - confirm.
4505 template<
typename MT1,
typename MT2 >
4510 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Trait typedef built from a column of the right operand (struct header not part of this
// listing): Type is the MultExprTrait of MT1 and ColumnExprTrait<const MT2>::Type.
// NOTE(review): presumably the ColumnExprTrait specialization for TDMatDMatMultExpr - confirm.
4519 template<
typename MT1,
typename MT2 >
4524 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4512
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:307
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:3703
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:358
SelectType< IsComputation< MT1 >::value, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:235
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:196
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2375
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:248
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:121
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:246
Compile time check for double precision floating point types. This type trait tests whether or not the...
Definition: IsDouble.h:75
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:327
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:223
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:153
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:121
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:120
Constraint on the data type.
Header file for the MultExprTrait class template.
SelectType< IsComputation< MT2 >::value, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:238
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:122
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:123
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:252
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:224
Header file for the DenseMatrix base class.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:232
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:219
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2373
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:229
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:225
Header file for the IsNumeric type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:648
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:239
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:222
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:351
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:317
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:247
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:221
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2370
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:297
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:226
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:359
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:267
Compile time check for single precision floating point types. This type trait tests whether or not the...
Definition: IsFloat.h:75
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:220
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Operand matrix_
The dense matrix containing the submatrix.
Definition: DenseSubmatrix.h:2792
Header file for the TDVecTDMatMultExprTrait class template.
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
void store(float *address, const sse_float_t &value)
Aligned store of a vector of 'float' values.
Definition: Store.h:242
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:339
Header file for the FunctionTrace class.