35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
140 template<
typename MT1
// Compile-time helper whose `value` is set when at least one of the two
// operand flags (evaluateLeft, evaluateRight) is set. The template parameters
// T1, T2 and T3 are not referenced in the lines visible here.
// NOTE(review): the closing brace of the struct is missing from this excerpt.
172 template<
typename T1,
typename T2,
typename T3 >
173 struct IsEvaluationRequired {
174 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate used by the EnableIf/DisableIf guards of
// selectBlasAssignKernel. The conditions visible here require, all at once:
//  - T1 offers mutable data access, T2 and T3 offer const data access;
//  - neither T2 nor T3 is diagonal;
//  - all three types are vectorizable;
//  - all three element types are BLAS compatible and identical.
// NOTE(review): the line that opens the expression (between the struct header
// and the first condition) is missing from this excerpt, so additional leading
// conditions may exist -- confirm against the full file.
184 template<
typename T1,
typename T2,
typename T3 >
185 struct UseBlasKernel {
187 HasMutableDataAccess<T1>::value &&
188 HasConstDataAccess<T2>::value &&
189 HasConstDataAccess<T3>::value &&
190 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
191 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
192 IsBlasCompatible<typename T1::ElementType>::value &&
193 IsBlasCompatible<typename T2::ElementType>::value &&
194 IsBlasCompatible<typename T3::ElementType>::value &&
195 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
196 IsSame< typename T1::ElementType, typename T3::ElementType >::value };
// Compile-time predicate used by the EnableIf/DisableIf guards of the
// vectorized small/large kernels. The conditions visible here require:
//  - neither T2 nor T3 is diagonal;
//  - all three types are vectorizable;
//  - all three element types are identical;
//  - IntrinsicTrait reports both addition and multiplication for that
//    element type.
// NOTE(review): the line that opens the expression is missing from this
// excerpt, so additional leading conditions may exist.
206 template<
typename T1,
typename T2,
typename T3 >
207 struct UseVectorizedDefaultKernel {
209 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
210 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
211 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
212 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
213 IntrinsicTrait<typename T1::ElementType>::addition &&
214 IntrinsicTrait<typename T1::ElementType>::multiplication };
246 MT1::vectorizable && MT2::vectorizable &&
// The expression is SMP-assignable only when neither evaluateLeft nor
// evaluateRight is set and both operand types report smpAssignable themselves.
252 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
253 !evaluateRight && MT2::smpAssignable };
296 :(
lhs_.columns() ) ) );
298 if(
lhs_.columns() == 0UL ||
308 const size_t knum( kend - kbegin );
309 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
311 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
313 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
315 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// Bounds-checked element access: i is compared against the row count of the
// left operand and j against the column count of the right operand.
// NOTE(review): the bodies of both checks and the final return statement are
// missing from this excerpt -- presumably each check throws on an invalid
// index and the function then forwards to the unchecked access; confirm.
333 inline ReturnType
at(
size_t i,
size_t j )
const {
334 if( i >=
lhs_.rows() ) {
337 if( j >=
rhs_.columns() ) {
360 return rhs_.columns();
390 template<
typename T >
392 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
402 template<
typename T >
404 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
414 return lhs_.isAligned() &&
rhs_.isAligned();
425 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
426 (
rows() > SMP_DMATTDMATMULT_THRESHOLD );
// Assignment of the product to a dense target (the target is accessed as ~lhs).
// NOTE(review): the signature line and the definitions of A and B are missing
// from this excerpt; the name `assign` is inferred from the kernel dispatched
// below.
449 template<
typename MT
// Nothing to compute for an empty target.
458 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Inner dimension of zero: handled separately (branch body not visible here).
461 else if( rhs.
lhs_.columns() == 0UL ) {
// Hand the operands A and B over to the kernel selection.
476 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Chooses between the small-matrix kernel and the BLAS kernel for C = A * B.
// The visible part of the condition selects the small kernel when the number
// of elements of C is below DMATTDMATMULT_THRESHOLD; otherwise the BLAS
// kernel is called.
// NOTE(review): the first half of the condition and the `else` line are
// missing from this excerpt (compare selectAddAssignKernel, which additionally
// tests for diagonal operands).
492 template<
typename MT3
495 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
498 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
499 selectSmallAssignKernel( C, A, B );
501 selectBlasAssignKernel( C, A, B );
// Default (scalar) kernel for C = A * B, enabled when neither A nor B is
// diagonal. C is taken as DenseMatrix<MT3,false>; the loop nest iterates rows
// in the outer loop and columns in the inner loop (presumably the row-major
// target -- confirm).
// NOTE(review): braces, several ternary branches and the bodies of the
// boundary loops are missing from this excerpt; the comments describe only
// what is visible.
520 template<
typename MT3
523 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
524 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Product dimensions: (M x K) * (K x N).
526 const size_t M( A.rows() );
527 const size_t N( B.columns() );
528 const size_t K( A.columns() );
// Row range [ibegin,iend) handled by the main product loop; it is narrowed
// when the operands are strictly lower / strictly upper.
530 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
531 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
533 const size_t iend( ( IsStrictlyUpper<MT4>::value )
534 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
// Rows in front of the product range (loop body not visible here --
// presumably the entries are reset; confirm).
538 for(
size_t i=0UL; i<ibegin; ++i ) {
539 for(
size_t j=0UL; j<N; ++j ) {
543 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin,jend) of row i that takes part in the product,
// derived from the triangular structure of both operands.
545 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
546 ?( ( IsStrictlyUpper<MT4>::value )
547 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
548 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
549 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
550 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
551 ?( ( IsStrictlyLower<MT4>::value )
552 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
553 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
554 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
// Columns in front of jbegin (loop body not visible here).
557 for(
size_t j=0UL; j<jbegin; ++j ) {
560 for(
size_t j=jbegin; j<jend; ++j )
// Summation range [kbegin,kend) for element (i,j), restricted by the
// triangular structure of A (row i) and B (column j).
562 const size_t kbegin( ( IsUpper<MT4>::value )
563 ?( ( IsLower<MT5>::value )
564 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
565 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
566 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
567 :( ( IsLower<MT5>::value )
568 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
570 const size_t kend( ( IsLower<MT4>::value )
571 ?( ( IsUpper<MT5>::value )
572 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
573 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
574 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
575 :( ( IsUpper<MT5>::value )
576 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes C(i,j); the remaining ones accumulate.
580 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
581 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
582 (~C)(i,j) += A(i,k) * B(k,j);
// Columns behind jend (loop body not visible here).
585 for(
size_t j=jend; j<N; ++j ) {
// Rows behind the product range (loop body not visible here).
589 for(
size_t i=iend; i<M; ++i ) {
590 for(
size_t j=0UL; j<N; ++j ) {
// Default (scalar) kernel for C = A * B, enabled when neither A nor B is
// diagonal. C is taken as DenseMatrix<MT3,true>; the loop nest iterates
// columns in the outer loop and rows in the inner loop (presumably the
// column-major target -- confirm).
// NOTE(review): braces, several ternary branches and the bodies of the
// boundary loops are missing from this excerpt; the comments describe only
// what is visible.
612 template<
typename MT3
615 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
616 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// Product dimensions: (M x K) * (K x N).
618 const size_t M( A.rows() );
619 const size_t N( B.columns() );
620 const size_t K( A.columns() );
// Column range [jbegin,jend) handled by the main product loop; it is narrowed
// when the operands are strictly upper / strictly lower.
622 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
623 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
625 const size_t jend( ( IsStrictlyLower<MT5>::value )
626 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
// Columns in front of the product range (loop body not visible here --
// presumably the entries are reset; confirm).
630 for(
size_t j=0UL; j<jbegin; ++j ) {
631 for(
size_t i=0UL; i<M; ++i ) {
635 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin,iend) of column j that takes part in the product,
// derived from the triangular structure of both operands.
637 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
638 ?( ( IsStrictlyLower<MT4>::value )
639 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
640 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
641 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
642 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
643 ?( ( IsStrictlyUpper<MT4>::value )
644 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
645 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
646 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
// Rows in front of ibegin (loop body not visible here).
649 for(
size_t i=0UL; i<ibegin; ++i ) {
652 for(
size_t i=ibegin; i<iend; ++i )
// Summation range [kbegin,kend) for element (i,j), restricted by the
// triangular structure of A (row i) and B (column j).
654 const size_t kbegin( ( IsUpper<MT4>::value )
655 ?( ( IsLower<MT5>::value )
656 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
657 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
658 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
659 :( ( IsLower<MT5>::value )
660 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
662 const size_t kend( ( IsLower<MT4>::value )
663 ?( ( IsUpper<MT5>::value )
664 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
665 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
666 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
667 :( ( IsUpper<MT5>::value )
668 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes C(i,j); the remaining ones accumulate.
672 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
673 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
674 (~C)(i,j) += A(i,k) * B(k,j);
// Rows behind iend (loop body not visible here).
677 for(
size_t i=iend; i<M; ++i ) {
// Columns behind the product range (loop body not visible here).
681 for(
size_t j=jend; j<N; ++j ) {
682 for(
size_t i=0UL; i<M; ++i ) {
// Default kernel for C = A * B where B is diagonal and A is not. Because only
// B(j,j) contributes to column j, each element is a single product
// A(i,j) * B(j,j). C is taken as DenseMatrix<MT3,false>; rows are the outer
// loop.
// NOTE(review): braces, the fallback branches of jbegin/jend and the bodies of
// the two boundary loops are missing from this excerpt.
704 template<
typename MT3
707 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
708 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
710 const size_t M( A.rows() );
711 const size_t N( B.columns() );
713 for(
size_t i=0UL; i<M; ++i )
// Column range of row i derived from the triangular structure of A.
715 const size_t jbegin( ( IsUpper<MT4>::value )
716 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
718 const size_t jend( ( IsLower<MT4>::value )
719 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Only an upper A has columns in front of jbegin (loop body not visible here).
723 if( IsUpper<MT4>::value ) {
724 for(
size_t j=0UL; j<jbegin; ++j ) {
728 for(
size_t j=jbegin; j<jend; ++j ) {
729 (~C)(i,j) = A(i,j) * B(j,j);
// Only a lower A has columns behind jend (loop body not visible here).
731 if( IsLower<MT4>::value ) {
732 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = A * B where B is diagonal and A is not, with C taken
// as DenseMatrix<MT3,true>. The iteration space is tiled into blocks of
// BLOCK_SIZE x BLOCK_SIZE (jj/ii loops); inside a block each element is the
// single product A(i,j) * B(j,j).
// NOTE(review): braces, the fallback branches of ibegin/ipos and the bodies of
// the two boundary loops are missing from this excerpt.
755 template<
typename MT3
758 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
759 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
761 const size_t M( A.rows() );
762 const size_t N( B.columns() );
764 const size_t block( BLOCK_SIZE );
766 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the block to the matrix bounds.
767 const size_t jend(
min( N, jj+block ) );
768 for(
size_t ii=0UL; ii<M; ii+=block ) {
769 const size_t iend(
min( M, ii+block ) );
770 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current block, restricted by the
// triangular structure of A.
772 const size_t ibegin( ( IsLower<MT4>::value )
773 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
775 const size_t ipos( ( IsUpper<MT4>::value )
776 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
// Only a lower A has rows in front of ibegin (loop body not visible here).
779 if( IsLower<MT4>::value ) {
780 for(
size_t i=ii; i<ibegin; ++i ) {
784 for(
size_t i=ibegin; i<ipos; ++i ) {
785 (~C)(i,j) = A(i,j) * B(j,j);
// Only an upper A has rows behind ipos (loop body not visible here).
787 if( IsUpper<MT4>::value ) {
788 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for C = A * B where A is diagonal and B is not, with C taken
// as DenseMatrix<MT3,false>. The iteration space is tiled into blocks of
// BLOCK_SIZE x BLOCK_SIZE (ii/jj loops); inside a block each element is the
// single product A(i,i) * B(i,j).
// NOTE(review): braces, the fallback branches of jbegin/jpos and the bodies of
// the two boundary loops are missing from this excerpt.
813 template<
typename MT3
816 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
817 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
819 const size_t M( A.rows() );
820 const size_t N( B.columns() );
822 const size_t block( BLOCK_SIZE );
824 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the block to the matrix bounds.
825 const size_t iend(
min( M, ii+block ) );
826 for(
size_t jj=0UL; jj<N; jj+=block ) {
827 const size_t jend(
min( N, jj+block ) );
828 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current block, restricted by the
// triangular structure of B.
830 const size_t jbegin( ( IsUpper<MT5>::value )
831 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
833 const size_t jpos( ( IsLower<MT5>::value )
834 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
// Only an upper B has columns in front of jbegin (loop body not visible here).
837 if( IsUpper<MT5>::value ) {
838 for(
size_t j=jj; j<jbegin; ++j ) {
842 for(
size_t j=jbegin; j<jpos; ++j ) {
843 (~C)(i,j) = A(i,i) * B(i,j);
// Only a lower B has columns behind jpos (loop body not visible here).
845 if( IsLower<MT5>::value ) {
846 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for C = A * B where A is diagonal and B is not, with C taken
// as DenseMatrix<MT3,true>. Columns are the outer loop; each element is the
// single product A(i,i) * B(i,j).
// NOTE(review): braces, the fallback branches of ibegin/iend and the bodies of
// the two boundary loops are missing from this excerpt.
871 template<
typename MT3
874 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
875 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
877 const size_t M( A.rows() );
878 const size_t N( B.columns() );
880 for(
size_t j=0UL; j<N; ++j )
// Row range of column j derived from the triangular structure of B.
882 const size_t ibegin( ( IsLower<MT5>::value )
883 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
885 const size_t iend( ( IsUpper<MT5>::value )
886 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Only a lower B has rows in front of ibegin (loop body not visible here).
890 if( IsLower<MT5>::value ) {
891 for(
size_t i=0UL; i<ibegin; ++i ) {
895 for(
size_t i=ibegin; i<iend; ++i ) {
896 (~C)(i,j) = A(i,i) * B(i,j);
// Only an upper B has rows behind iend (loop body not visible here).
898 if( IsUpper<MT5>::value ) {
899 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = A * B where both A and B are diagonal: only the
// diagonal of C is written, element by element.
// NOTE(review): lines between the signature and the loop are missing from this
// excerpt -- presumably C is reset first so that off-diagonal entries are
// zero; confirm against the full file.
922 template<
typename MT3
925 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
926 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
930 for(
size_t i=0UL; i<A.rows(); ++i ) {
931 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5> does
// not hold: it simply forwards to the default (scalar) assignment kernel.
951 template<
typename MT3
954 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
955 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
957 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix kernel for C = A * B with C taken as
// DenseMatrix<MT3,false>, enabled when UseVectorizedDefaultKernel holds.
// C is computed in tiles: pairs of rows combined with 4, 2 and 1 columns,
// followed by a single remaining row combined with 4, 2 and 1 columns. For
// each tile the k range is accumulated in IntrinsicType variables, reduced via
// sum() and stored to C; a scalar loop adds the tail of the k range when
// `remainder` is set.
// NOTE(review): braces, index declarations, the vectorized k loop headers and
// some guards are missing from this excerpt; the comments describe only what
// is visible.
977 template<
typename MT3
980 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
981 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
983 typedef IntrinsicTrait<ElementType> IT;
985 const size_t M( A.rows() );
986 const size_t N( B.columns() );
987 const size_t K( A.columns() );
// A scalar tail loop is needed unless both operands are padded.
989 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// --- Two rows at a time ---
993 for( ; (i+2UL) <= M; i+=2UL )
// Tile: 2 rows x 4 columns.
997 for( ; (j+4UL) <= N; j+=4UL )
// kbegin is masked with size_t(-IT::size), i.e. rounded down to a multiple of
// IT::size (assumes IT::size is a power of two -- TODO confirm).
999 const size_t kbegin( ( IsUpper<MT4>::value )
1000 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1001 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1002 const size_t kend( ( IsLower<MT4>::value )
1003 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
1004 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// kpos: end of the vectorized part; rounded down only if a tail loop exists.
1006 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1009 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1013 const IntrinsicType a1( A.load(i ,k) );
1014 const IntrinsicType a2( A.load(i+1UL,k) );
1015 const IntrinsicType b1( B.load(k,j ) );
1016 const IntrinsicType b2( B.load(k,j+1UL) );
1017 const IntrinsicType b3( B.load(k,j+2UL) );
1018 const IntrinsicType b4( B.load(k,j+3UL) );
1019 xmm1 = xmm1 + a1 * b1;
1020 xmm2 = xmm2 + a1 * b2;
1021 xmm3 = xmm3 + a1 * b3;
1022 xmm4 = xmm4 + a1 * b4;
1023 xmm5 = xmm5 + a2 * b1;
1024 xmm6 = xmm6 + a2 * b2;
1025 xmm7 = xmm7 + a2 * b3;
1026 xmm8 = xmm8 + a2 * b4;
// Horizontal reduction of each accumulator into its element of C.
1029 (~C)(i ,j ) =
sum( xmm1 );
1030 (~C)(i ,j+1UL) =
sum( xmm2 );
1031 (~C)(i ,j+2UL) =
sum( xmm3 );
1032 (~C)(i ,j+3UL) =
sum( xmm4 );
1033 (~C)(i+1UL,j ) =
sum( xmm5 );
1034 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
1035 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
1036 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
// Scalar tail for the k values not covered by the vectorized part.
1038 for( ; remainder && k<kend; ++k ) {
1039 (~C)(i ,j ) += A(i ,k) * B(k,j );
1040 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1041 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1042 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1043 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1044 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1045 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1046 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// Tile: 2 rows x 2 columns.
1050 for( ; (j+2UL) <= N; j+=2UL )
1052 const size_t kbegin( ( IsUpper<MT4>::value )
1053 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1054 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1055 const size_t kend( ( IsLower<MT4>::value )
1056 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1057 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1059 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1062 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1066 const IntrinsicType a1( A.load(i ,k) );
1067 const IntrinsicType a2( A.load(i+1UL,k) );
1068 const IntrinsicType b1( B.load(k,j ) );
1069 const IntrinsicType b2( B.load(k,j+1UL) );
1070 xmm1 = xmm1 + a1 * b1;
1071 xmm2 = xmm2 + a1 * b2;
1072 xmm3 = xmm3 + a2 * b1;
1073 xmm4 = xmm4 + a2 * b2;
1076 (~C)(i ,j ) =
sum( xmm1 );
1077 (~C)(i ,j+1UL) =
sum( xmm2 );
1078 (~C)(i+1UL,j ) =
sum( xmm3 );
1079 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1081 for( ; remainder && k<kend; ++k ) {
1082 (~C)(i ,j ) += A(i ,k) * B(k,j );
1083 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1084 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1085 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// Tile: 2 rows x 1 column (the guard for a remaining column is not visible
// in this excerpt).
1091 const size_t kbegin( ( IsUpper<MT4>::value )
1092 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1093 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1094 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1096 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1099 IntrinsicType xmm1, xmm2;
1103 const IntrinsicType b1( B.load(k,j) );
1104 xmm1 = xmm1 + A.load(i ,k) * b1;
1105 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1108 (~C)(i ,j) =
sum( xmm1 );
1109 (~C)(i+1UL,j) =
sum( xmm2 );
1111 for( ; remainder && k<kend; ++k ) {
1112 (~C)(i ,j) += A(i ,k) * B(k,j);
1113 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// --- Single remaining row (its guard is not visible in this excerpt) ---
// Tile: 1 row x 4 columns.
1122 for( ; (j+4UL) <= N; j+=4UL )
1124 const size_t kbegin( ( IsUpper<MT4>::value )
1125 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1126 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1127 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
1129 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1132 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1136 const IntrinsicType a1( A.load(i,k) );
1137 xmm1 = xmm1 + a1 * B.load(k,j );
1138 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1139 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1140 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1143 (~C)(i,j ) =
sum( xmm1 );
1144 (~C)(i,j+1UL) =
sum( xmm2 );
1145 (~C)(i,j+2UL) =
sum( xmm3 );
1146 (~C)(i,j+3UL) =
sum( xmm4 );
1148 for( ; remainder && k<kend; ++k ) {
1149 (~C)(i,j ) += A(i,k) * B(k,j );
1150 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1151 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
1152 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
// Tile: 1 row x 2 columns.
1156 for( ; (j+2UL) <= N; j+=2UL )
1158 const size_t kbegin( ( IsUpper<MT4>::value )
1159 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1160 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1161 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1163 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1166 IntrinsicType xmm1, xmm2;
1170 const IntrinsicType a1( A.load(i,k) );
1171 xmm1 = xmm1 + a1 * B.load(k,j );
1172 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1175 (~C)(i,j ) =
sum( xmm1 );
1176 (~C)(i,j+1UL) =
sum( xmm2 );
1178 for( ; remainder && k<kend; ++k ) {
1179 (~C)(i,j ) += A(i,k) * B(k,j );
1180 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Last single element: here the k range always runs up to K.
1186 const size_t kbegin( ( IsUpper<MT4>::value )
1187 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1188 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1190 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
1197 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1200 (~C)(i,j) =
sum( xmm1 );
1202 for( ; remainder && k<K; ++k ) {
1203 (~C)(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix kernel for C = A * B with C taken as
// DenseMatrix<MT3,true>, enabled when UseVectorizedDefaultKernel holds.
// C is computed in tiles: groups of 4 rows combined with 2 and 1 columns,
// groups of 2 rows combined with 2 and 1 columns, and finally a single
// remaining row combined with 2 and 1 columns. For each tile the k range is
// accumulated in IntrinsicType variables, reduced via sum() and stored to C;
// a scalar loop adds the tail of the k range when `remainder` is set.
// NOTE(review): braces, index declarations, the vectorized k loop headers and
// some guards are missing from this excerpt; the comments describe only what
// is visible.
1226 template<
typename MT3
1229 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1230 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1232 typedef IntrinsicTrait<ElementType> IT;
1234 const size_t M( A.rows() );
1235 const size_t N( B.columns() );
1236 const size_t K( A.columns() );
// A scalar tail loop is needed unless both operands are padded.
1238 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// --- Four rows at a time ---
1242 for( ; (i+4UL) <= M; i+=4UL )
// Tile: 4 rows x 2 columns.
1246 for( ; (j+2UL) <= N; j+=2UL )
// kbegin is masked with size_t(-IT::size), i.e. rounded down to a multiple of
// IT::size (assumes IT::size is a power of two -- TODO confirm).
1248 const size_t kbegin( ( IsUpper<MT4>::value )
1249 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1250 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1251 const size_t kend( ( IsLower<MT4>::value )
1252 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
1253 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// kpos: end of the vectorized part; rounded down only if a tail loop exists.
1255 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1258 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1262 const IntrinsicType a1( A.load(i ,k) );
1263 const IntrinsicType a2( A.load(i+1UL,k) );
1264 const IntrinsicType a3( A.load(i+2UL,k) );
1265 const IntrinsicType a4( A.load(i+3UL,k) );
1266 const IntrinsicType b1( B.load(k,j ) );
1267 const IntrinsicType b2( B.load(k,j+1UL) );
1268 xmm1 = xmm1 + a1 * b1;
1269 xmm2 = xmm2 + a1 * b2;
1270 xmm3 = xmm3 + a2 * b1;
1271 xmm4 = xmm4 + a2 * b2;
1272 xmm5 = xmm5 + a3 * b1;
1273 xmm6 = xmm6 + a3 * b2;
1274 xmm7 = xmm7 + a4 * b1;
1275 xmm8 = xmm8 + a4 * b2;
// Horizontal reduction of each accumulator into its element of C.
1278 (~C)(i ,j ) =
sum( xmm1 );
1279 (~C)(i ,j+1UL) =
sum( xmm2 );
1280 (~C)(i+1UL,j ) =
sum( xmm3 );
1281 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1282 (~C)(i+2UL,j ) =
sum( xmm5 );
1283 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
1284 (~C)(i+3UL,j ) =
sum( xmm7 );
1285 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
// Scalar tail for the k values not covered by the vectorized part.
1287 for( ; remainder && k<kend; ++k ) {
1288 (~C)(i ,j ) += A(i ,k) * B(k,j );
1289 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1290 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1291 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1292 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1293 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1294 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1295 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// Tile: 4 rows x 1 column (the guard for a remaining column is not visible
// in this excerpt).
1301 const size_t kbegin( ( IsUpper<MT4>::value )
1302 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1303 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1304 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
1306 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1309 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1313 const IntrinsicType b1( B.load(k,j) );
1314 xmm1 = xmm1 + A.load(i ,k) * b1;
1315 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1316 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1317 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1320 (~C)(i ,j) =
sum( xmm1 );
1321 (~C)(i+1UL,j) =
sum( xmm2 );
1322 (~C)(i+2UL,j) =
sum( xmm3 );
1323 (~C)(i+3UL,j) =
sum( xmm4 );
1325 for( ; remainder && k<kend; ++k ) {
1326 (~C)(i ,j) += A(i ,k) * B(k,j);
1327 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1328 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
1329 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
// --- Two rows at a time ---
1334 for( ; (i+2UL) <= M; i+=2UL )
// Tile: 2 rows x 2 columns.
1338 for( ; (j+2UL) <= N; j+=2UL )
1340 const size_t kbegin( ( IsUpper<MT4>::value )
1341 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1342 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1343 const size_t kend( ( IsLower<MT4>::value )
1344 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1345 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1347 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1350 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1354 const IntrinsicType a1( A.load(i ,k) );
1355 const IntrinsicType a2( A.load(i+1UL,k) );
1356 const IntrinsicType b1( B.load(k,j ) );
1357 const IntrinsicType b2( B.load(k,j+1UL) );
1358 xmm1 = xmm1 + a1 * b1;
1359 xmm2 = xmm2 + a1 * b2;
1360 xmm3 = xmm3 + a2 * b1;
1361 xmm4 = xmm4 + a2 * b2;
1364 (~C)(i ,j ) =
sum( xmm1 );
1365 (~C)(i ,j+1UL) =
sum( xmm2 );
1366 (~C)(i+1UL,j ) =
sum( xmm3 );
1367 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1369 for( ; remainder && k<kend; ++k ) {
1370 (~C)(i ,j ) += A(i ,k) * B(k,j );
1371 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1372 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1373 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// Tile: 2 rows x 1 column (the guard for a remaining column is not visible
// in this excerpt).
1379 const size_t kbegin( ( IsUpper<MT4>::value )
1380 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1381 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1382 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1384 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1387 IntrinsicType xmm1, xmm2;
1391 const IntrinsicType b1( B.load(k,j) );
1392 xmm1 = xmm1 + A.load(i ,k) * b1;
1393 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1396 (~C)(i ,j) =
sum( xmm1 );
1397 (~C)(i+1UL,j) =
sum( xmm2 );
1399 for( ; remainder && k<kend; ++k ) {
1400 (~C)(i ,j) += A(i ,k) * B(k,j);
1401 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// --- Single remaining row (its guard is not visible in this excerpt) ---
// Tile: 1 row x 2 columns.
1410 for( ; (j+2UL) <= N; j+=2UL )
1412 const size_t kbegin( ( IsUpper<MT4>::value )
1413 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1414 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1415 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1417 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
1420 IntrinsicType xmm1, xmm2;
1424 const IntrinsicType a1( A.load(i,k) );
1425 xmm1 = xmm1 + a1 * B.load(k,j );
1426 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1429 (~C)(i,j ) =
sum( xmm1 );
1430 (~C)(i,j+1UL) =
sum( xmm2 );
1432 for( ; remainder && k<kend; ++k ) {
1433 (~C)(i,j ) += A(i,k) * B(k,j );
1434 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Last single element: here the k range always runs up to K.
1440 const size_t kbegin( ( IsUpper<MT4>::value )
1441 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1442 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1444 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
1451 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1454 (~C)(i,j) =
sum( xmm1 );
1456 for( ; remainder && k<K; ++k ) {
1457 (~C)(i,j) += A(i,k) * B(k,j);
// Large-matrix kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5> does
// not hold: it simply forwards to the default (scalar) assignment kernel.
1479 template<
typename MT3
1482 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1483 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1485 selectDefaultAssignKernel( C, A, B );
// Large-matrix kernel for a DenseMatrix<MT3,false> target when
// UseVectorizedDefaultKernel holds. In the lines visible here it reuses the
// vectorized small-matrix kernel rather than providing its own algorithm.
1505 template<
typename MT3
1508 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1509 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1512 selectSmallAssignKernel( ~C, A, B );
// Large-matrix kernel for a DenseMatrix<MT3,true> target when
// UseVectorizedDefaultKernel holds. In the lines visible here it reuses the
// vectorized small-matrix kernel rather than providing its own algorithm.
1532 template<
typename MT3
1535 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1536 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1539 selectSmallAssignKernel( ~C, A, B );
// Fallback used when UseBlasKernel<MT3,MT4,MT5> does not hold: the call is
// forwarded to the large-matrix kernel instead of a BLAS routine.
1558 template<
typename MT3
1561 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1562 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1564 selectLargeAssignKernel( C, A, B );
// BLAS-based kernel, enabled when UseBlasKernel<MT3,MT4,MT5> holds.
//  - A triangular: trmm with A applied from the left (CblasLeft);
//  - otherwise B triangular: trmm with B applied from the right (CblasRight);
//  - otherwise: general product via gemm with scaling factors ET(1) and ET(0).
// The lower/upper flag passed to trmm follows IsLower of the triangular operand.
// NOTE(review): the lines preceding each trmm call and the definition of ET
// are missing from this excerpt -- presumably C is first assigned the other
// operand, since trmm only receives C and the triangular matrix; confirm.
1584 template<
typename MT3
1587 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1588 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1592 if( IsTriangular<MT4>::value ) {
1594 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1596 else if( IsTriangular<MT5>::value ) {
1598 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1601 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the product to a sparse target: the expression is first
// evaluated serially into a temporary, which is then assigned to the target.
// The temporary type is picked via SelectType on SO, choosing between
// OppositeType and ResultType.
// NOTE(review): braces and intermediate lines are missing from this excerpt.
1621 template<
typename MT
1623 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
1627 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
1639 const TmpType tmp(
serial( rhs ) );
1640 assign( ~lhs, tmp );
// Addition assignment of the product to a dense target (lhs += A * B).
// NOTE(review): braces, the body of the early check and other intermediate
// lines are missing from this excerpt.
1658 template<
typename MT
1660 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Empty target or zero inner dimension: handled up front (branch body not
// visible here -- presumably an early return; confirm).
1667 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Both operands are passed through serial() before the kernel is selected.
1671 LT A(
serial( rhs.lhs_ ) );
1672 RT B(
serial( rhs.rhs_ ) );
1681 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Chooses the kernel for C += A * B: the small-matrix kernel is used when
// either operand is diagonal or when the number of elements of C is below
// DMATTDMATMULT_THRESHOLD; otherwise the BLAS kernel is called.
// NOTE(review): the `else` line between the two calls is missing from this
// excerpt.
1697 template<
typename MT3
1700 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1702 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
1703 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1704 selectSmallAddAssignKernel( C, A, B );
1706 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) kernel for C += A * B, enabled when neither A nor B is
// diagonal. C is taken as DenseMatrix<MT3,false>; rows are the outer loop.
// Unlike the plain assignment kernel, the lines visible here contain no loops
// over the entries outside the product range.
// NOTE(review): braces and several ternary branches are missing from this
// excerpt; the comments describe only what is visible.
1725 template<
typename MT3
1728 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1729 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Product dimensions: (M x K) * (K x N).
1731 const size_t M( A.rows() );
1732 const size_t N( B.columns() );
1733 const size_t K( A.columns() );
// Row range [ibegin,iend) that takes part in the product.
1735 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
1736 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
1738 const size_t iend( ( IsStrictlyUpper<MT4>::value )
1739 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
1743 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin,jend) of row i that takes part in the product.
1745 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1746 ?( ( IsStrictlyUpper<MT4>::value )
1747 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
1748 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
1749 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
1750 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
1751 ?( ( IsStrictlyLower<MT4>::value )
1752 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
1753 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
1754 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
1757 for(
size_t j=jbegin; j<jend; ++j )
// Summation range [kbegin,kend) for element (i,j).
1759 const size_t kbegin( ( IsUpper<MT4>::value )
1760 ?( ( IsLower<MT5>::value )
1761 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1762 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1763 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1764 :( ( IsLower<MT5>::value )
1765 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1767 const size_t kend( ( IsLower<MT4>::value )
1768 ?( ( IsUpper<MT5>::value )
1769 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1770 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1771 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1772 :( ( IsUpper<MT5>::value )
1773 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The k loop is unrolled by two; kpos is kbegin plus the even part of the
// trip count.
1777 const size_t knum( kend - kbegin );
1778 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1780 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1781 (~C)(i,j) += A(i,k ) * B(k ,j);
1782 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover term for an odd trip count.
// NOTE(review): the guard in front of this statement is not visible in this
// excerpt -- presumably it runs only when kpos < kend; confirm.
1785 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Default (scalar) kernel for C += A * B, enabled when neither A nor B is
// diagonal. C is taken as DenseMatrix<MT3,true>; columns are the outer loop.
// NOTE(review): braces and several ternary branches are missing from this
// excerpt; the comments describe only what is visible.
1807 template<
typename MT3
1810 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1811 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// Product dimensions: (M x K) * (K x N).
1813 const size_t M( A.rows() );
1814 const size_t N( B.columns() );
1815 const size_t K( A.columns() );
// Column range [jbegin,jend) that takes part in the product.
1817 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
1818 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
1820 const size_t jend( ( IsStrictlyLower<MT5>::value )
1821 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
1825 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin,iend) of column j that takes part in the product.
1827 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
1828 ?( ( IsStrictlyLower<MT4>::value )
1829 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
1830 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1831 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
1832 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1833 ?( ( IsStrictlyUpper<MT4>::value )
1834 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
1835 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
1836 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
1839 for(
size_t i=ibegin; i<iend; ++i )
// Summation range [kbegin,kend) for element (i,j).
1841 const size_t kbegin( ( IsUpper<MT4>::value )
1842 ?( ( IsLower<MT5>::value )
1843 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1844 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1845 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1846 :( ( IsLower<MT5>::value )
1847 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1849 const size_t kend( ( IsLower<MT4>::value )
1850 ?( ( IsUpper<MT5>::value )
1851 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1852 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1853 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1854 :( ( IsUpper<MT5>::value )
1855 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The k loop is unrolled by two; kpos is kbegin plus the even part of the
// trip count.
1859 const size_t knum( kend - kbegin );
1860 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1862 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1863 (~C)(i,j) += A(i,k ) * B(k ,j);
1864 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover term for an odd trip count.
// NOTE(review): the guard in front of this statement is not visible in this
// excerpt -- presumably it runs only when kpos < kend; confirm.
1867 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Default kernel for C += A * B where B is diagonal and A is not, with C taken
// as DenseMatrix<MT3,false>. Each element receives the single product
// A(i,j) * B(j,j); the column loop is unrolled by two.
// NOTE(review): braces and the fallback branches of jbegin/jend are missing
// from this excerpt.
1889 template<
typename MT3
1892 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1893 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1895 const size_t M( A.rows() );
1896 const size_t N( B.columns() );
1898 for(
size_t i=0UL; i<M; ++i )
// Column range of row i derived from the triangular structure of A.
1900 const size_t jbegin( ( IsUpper<MT4>::value )
1901 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1903 const size_t jend( ( IsLower<MT4>::value )
1904 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos is jbegin plus the even part of the column count.
1908 const size_t jnum( jend - jbegin );
1909 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1911 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1912 (~C)(i,j ) += A(i,j ) * B(j ,j );
1913 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column for an odd count.
// NOTE(review): the guard in front of this statement is not visible in this
// excerpt -- presumably it runs only when jpos < jend; confirm.
1916 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default kernel for C += A * B where B is diagonal and A is not, with C taken
// as DenseMatrix<MT3,true>. The iteration space is tiled into blocks of
// BLOCK_SIZE x BLOCK_SIZE (jj/ii loops); each element receives the single
// product A(i,j) * B(j,j).
// NOTE(review): braces and the fallback branches of ibegin/ipos are missing
// from this excerpt.
1937 template<
typename MT3
1940 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1941 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1943 const size_t M( A.rows() );
1944 const size_t N( B.columns() );
1946 const size_t block( BLOCK_SIZE );
1948 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the block to the matrix bounds.
1949 const size_t jend(
min( N, jj+block ) );
1950 for(
size_t ii=0UL; ii<M; ii+=block ) {
1951 const size_t iend(
min( M, ii+block ) );
1952 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current block, restricted by the
// triangular structure of A.
1954 const size_t ibegin( ( IsLower<MT4>::value )
1955 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
1957 const size_t ipos( ( IsUpper<MT4>::value )
1958 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
1961 for(
size_t i=ibegin; i<ipos; ++i ) {
1962 (~C)(i,j) += A(i,j) * B(j,j);
// Default kernel for C += A * B where A is diagonal and B is not, with C taken
// as DenseMatrix<MT3,false>. The iteration space is tiled into blocks of
// BLOCK_SIZE x BLOCK_SIZE (ii/jj loops); each element receives the single
// product A(i,i) * B(i,j).
// NOTE(review): braces and the fallback branches of jbegin/jpos are missing
// from this excerpt.
1985 template<
typename MT3
1988 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1989 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1991 const size_t M( A.rows() );
1992 const size_t N( B.columns() );
1994 const size_t block( BLOCK_SIZE );
1996 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the block to the matrix bounds.
1997 const size_t iend(
min( M, ii+block ) );
1998 for(
size_t jj=0UL; jj<N; jj+=block ) {
1999 const size_t jend(
min( N, jj+block ) );
2000 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current block, restricted by the
// triangular structure of B.
2002 const size_t jbegin( ( IsUpper<MT5>::value )
2003 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
2005 const size_t jpos( ( IsLower<MT5>::value )
2006 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
2009 for(
size_t j=jbegin; j<jpos; ++j ) {
2010 (~C)(i,j) += A(i,i) * B(i,j);
// Default kernel for C += A * B where A is diagonal and B is not, with C taken
// as DenseMatrix<MT3,true>. Each element receives the single product
// A(i,i) * B(i,j); the row loop is unrolled by two.
// NOTE(review): braces and the fallback branches of ibegin/iend are missing
// from this excerpt.
2033 template<
typename MT3
2036 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2037 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2039 const size_t M( A.rows() );
2040 const size_t N( B.columns() );
2042 for(
size_t j=0UL; j<N; ++j )
// Row range of column j derived from the triangular structure of B.
2044 const size_t ibegin( ( IsLower<MT5>::value )
2045 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2047 const size_t iend( ( IsUpper<MT5>::value )
2048 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos is ibegin plus the even part of the row count.
2052 const size_t inum( iend - ibegin );
2053 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2055 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2056 (~C)(i ,j) += A(i ,i ) * B(i ,j);
2057 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row for an odd count.
// NOTE(review): the guard in front of this statement is not visible in this
// excerpt -- presumably it runs only when ipos < iend; confirm.
2060 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition-assignment kernel for the case that both A (MT4) and B (MT5) are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) for every i in [0, A.rows()).
// C is taken as the concrete matrix type MT3, so it is indexed directly (no '~' downcast).
2081 template<
typename MT3
2084 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2085 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2087 for(
size_t i=0UL; i<A.rows(); ++i ) {
2088 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition-assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is false (DisableIf): simply forwards to the default (scalar) kernel.
2108 template<
typename MT3
2111 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2112 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2114 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel for a row-major target C
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
//
// C is updated tile by tile. For each tile the products over k are accumulated in
// IntrinsicType registers from A.load(row,k) and B.load(k,col), reduced with sum() and
// added to C; a scalar loop then finishes the k range [.., kend) when `remainder` is set.
// Tile shapes visible below, in order: 2x4, 2x2, 2x1 (two rows i, i+1), then 1x4, 1x2, 1x1
// (a single row i).
//
// In every tile:
//   kbegin - start of the k range; derived from i and/or j when A is upper and/or B is
//            lower, and masked with size_t(-IT::size) (rounds down to a multiple of
//            IT::size, assuming IT::size is a power of two).
//   kend   - end of the k range; derived from i and/or j when A is lower and/or B is upper,
//            otherwise K.
//   kpos   - kend masked with size_t(-IT::size) if `remainder`, else kend.
//
// NOTE(review): the embedded line numbers jump, so several original lines (declarations of
// i, j and k, the header of the vectorized k loop, guards in front of the narrower tiles and
// all closing braces) are not visible here; the comments describe only what is shown.
2134 template<
typename MT3
2137 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2138 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2140 typedef IntrinsicTrait<ElementType> IT;
2142 const size_t M( A.rows() );
2143 const size_t N( B.columns() );
2144 const size_t K( A.columns() );
// A scalar tail loop is required as soon as one of the two operand types is not padded.
2146 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows (i, i+1) at a time ----
2150 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i..i+1, columns j..j+3.
2154 for( ; (j+4UL) <= N; j+=4UL )
2156 const size_t kbegin( ( IsUpper<MT4>::value )
2157 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2158 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2159 const size_t kend( ( IsLower<MT4>::value )
2160 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
2161 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
2163 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
// One accumulator per element of the tile (xmm1..4: row i, xmm5..8: row i+1).
2166 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2170 const IntrinsicType a1( A.load(i ,k) );
2171 const IntrinsicType a2( A.load(i+1UL,k) );
2172 const IntrinsicType b1( B.load(k,j ) );
2173 const IntrinsicType b2( B.load(k,j+1UL) );
2174 const IntrinsicType b3( B.load(k,j+2UL) );
2175 const IntrinsicType b4( B.load(k,j+3UL) );
2176 xmm1 = xmm1 + a1 * b1;
2177 xmm2 = xmm2 + a1 * b2;
2178 xmm3 = xmm3 + a1 * b3;
2179 xmm4 = xmm4 + a1 * b4;
2180 xmm5 = xmm5 + a2 * b1;
2181 xmm6 = xmm6 + a2 * b2;
2182 xmm7 = xmm7 + a2 * b3;
2183 xmm8 = xmm8 + a2 * b4;
// Horizontal reduction of each accumulator into its element of C.
2186 (~C)(i ,j ) +=
sum( xmm1 );
2187 (~C)(i ,j+1UL) +=
sum( xmm2 );
2188 (~C)(i ,j+2UL) +=
sum( xmm3 );
2189 (~C)(i ,j+3UL) +=
sum( xmm4 );
2190 (~C)(i+1UL,j ) +=
sum( xmm5 );
2191 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
2192 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
2193 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
// Scalar tail over the remaining k (only executed if `remainder`).
2195 for( ; remainder && k<kend; ++k ) {
2196 (~C)(i ,j ) += A(i ,k) * B(k,j );
2197 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2198 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2199 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2200 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2201 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2202 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2203 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile: rows i..i+1, columns j..j+1.
2207 for( ; (j+2UL) <= N; j+=2UL )
2209 const size_t kbegin( ( IsUpper<MT4>::value )
2210 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2211 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2212 const size_t kend( ( IsLower<MT4>::value )
2213 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2214 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2216 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2219 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2223 const IntrinsicType a1( A.load(i ,k) );
2224 const IntrinsicType a2( A.load(i+1UL,k) );
2225 const IntrinsicType b1( B.load(k,j ) );
2226 const IntrinsicType b2( B.load(k,j+1UL) );
2227 xmm1 = xmm1 + a1 * b1;
2228 xmm2 = xmm2 + a1 * b2;
2229 xmm3 = xmm3 + a2 * b1;
2230 xmm4 = xmm4 + a2 * b2;
2233 (~C)(i ,j ) +=
sum( xmm1 );
2234 (~C)(i ,j+1UL) +=
sum( xmm2 );
2235 (~C)(i+1UL,j ) +=
sum( xmm3 );
2236 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2238 for( ; remainder && k<kend; ++k ) {
2239 (~C)(i ,j ) += A(i ,k) * B(k,j );
2240 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2241 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2242 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: rows i..i+1, single column j.
2248 const size_t kbegin( ( IsUpper<MT4>::value )
2249 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2250 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2251 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2253 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2256 IntrinsicType xmm1, xmm2;
2260 const IntrinsicType b1( B.load(k,j) );
2261 xmm1 = xmm1 + A.load(i ,k) * b1;
2262 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2265 (~C)(i ,j) +=
sum( xmm1 );
2266 (~C)(i+1UL,j) +=
sum( xmm2 );
2268 for( ; remainder && k<kend; ++k ) {
2269 (~C)(i ,j) += A(i ,k) * B(k,j);
2270 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single row i ----
// 1x4 tile: row i, columns j..j+3.
2278 for( ; (j+4UL) <= N; j+=4UL )
2280 const size_t kbegin( ( IsUpper<MT4>::value )
2281 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2282 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2283 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
2285 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2288 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2292 const IntrinsicType a1( A.load(i,k) );
2293 xmm1 = xmm1 + a1 * B.load(k,j );
2294 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2295 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
2296 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
2299 (~C)(i,j ) +=
sum( xmm1 );
2300 (~C)(i,j+1UL) +=
sum( xmm2 );
2301 (~C)(i,j+2UL) +=
sum( xmm3 );
2302 (~C)(i,j+3UL) +=
sum( xmm4 );
2304 for( ; remainder && k<kend; ++k ) {
2305 (~C)(i,j ) += A(i,k) * B(k,j );
2306 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2307 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
2308 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tile: row i, columns j..j+1.
2312 for( ; (j+2UL) <= N; j+=2UL )
2314 const size_t kbegin( ( IsUpper<MT4>::value )
2315 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2316 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2317 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2319 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2322 IntrinsicType xmm1, xmm2;
2326 const IntrinsicType a1( A.load(i,k) );
2327 xmm1 = xmm1 + a1 * B.load(k,j );
2328 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2331 (~C)(i,j ) +=
sum( xmm1 );
2332 (~C)(i,j+1UL) +=
sum( xmm2 );
2334 for( ; remainder && k<kend; ++k ) {
2335 (~C)(i,j ) += A(i,k) * B(k,j );
2336 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range ends at K here.
2342 const size_t kbegin( ( IsUpper<MT4>::value )
2343 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2344 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2346 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
2353 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2356 (~C)(i,j) +=
sum( xmm1 );
2358 for( ; remainder && k<K; ++k ) {
2359 (~C)(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel for a column-major target C
// (DenseMatrix<MT3,true>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
//
// C is updated tile by tile. For each tile the products over k are accumulated in
// IntrinsicType registers from A.load(row,k) and B.load(k,col), reduced with sum() and
// added to C; a scalar loop then finishes the k range [.., kend) when `remainder` is set.
// Tile shapes visible below, in order: 4x2, 4x1 (four rows), 2x2, 2x1 (two rows), then
// 1x2, 1x1 (a single row i).
//
// In every tile:
//   kbegin - start of the k range; derived from i and/or j when A is upper and/or B is
//            lower, and masked with size_t(-IT::size) (rounds down to a multiple of
//            IT::size, assuming IT::size is a power of two).
//   kend   - end of the k range; derived from i and/or j when A is lower and/or B is upper,
//            otherwise K.
//   kpos   - kend masked with size_t(-IT::size) if `remainder`, else kend.
//
// NOTE(review): the embedded line numbers jump, so several original lines (declarations of
// i, j and k, the header of the vectorized k loop, guards in front of the narrower tiles and
// all closing braces) are not visible here; the comments describe only what is shown.
2382 template<
typename MT3
2385 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2386 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2388 typedef IntrinsicTrait<ElementType> IT;
2390 const size_t M( A.rows() );
2391 const size_t N( B.columns() );
2392 const size_t K( A.columns() );
// A scalar tail loop is required as soon as one of the two operand types is not padded.
2394 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Four rows (i..i+3) at a time ----
2398 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i..i+3, columns j..j+1.
2402 for( ; (j+2UL) <= N; j+=2UL )
2404 const size_t kbegin( ( IsUpper<MT4>::value )
2405 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2406 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2407 const size_t kend( ( IsLower<MT4>::value )
2408 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
2409 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2411 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
// One accumulator per element of the tile (two per row).
2414 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2418 const IntrinsicType a1( A.load(i ,k) );
2419 const IntrinsicType a2( A.load(i+1UL,k) );
2420 const IntrinsicType a3( A.load(i+2UL,k) );
2421 const IntrinsicType a4( A.load(i+3UL,k) );
2422 const IntrinsicType b1( B.load(k,j ) );
2423 const IntrinsicType b2( B.load(k,j+1UL) );
2424 xmm1 = xmm1 + a1 * b1;
2425 xmm2 = xmm2 + a1 * b2;
2426 xmm3 = xmm3 + a2 * b1;
2427 xmm4 = xmm4 + a2 * b2;
2428 xmm5 = xmm5 + a3 * b1;
2429 xmm6 = xmm6 + a3 * b2;
2430 xmm7 = xmm7 + a4 * b1;
2431 xmm8 = xmm8 + a4 * b2;
// Horizontal reduction of each accumulator into its element of C.
2434 (~C)(i ,j ) +=
sum( xmm1 );
2435 (~C)(i ,j+1UL) +=
sum( xmm2 );
2436 (~C)(i+1UL,j ) +=
sum( xmm3 );
2437 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2438 (~C)(i+2UL,j ) +=
sum( xmm5 );
2439 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
2440 (~C)(i+3UL,j ) +=
sum( xmm7 );
2441 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
// Scalar tail over the remaining k (only executed if `remainder`).
2443 for( ; remainder && k<kend; ++k ) {
2444 (~C)(i ,j ) += A(i ,k) * B(k,j );
2445 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2446 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2447 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2448 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2449 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2450 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2451 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: rows i..i+3, single column j.
2457 const size_t kbegin( ( IsUpper<MT4>::value )
2458 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2459 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2460 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
2462 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2465 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2469 const IntrinsicType b1( B.load(k,j) );
2470 xmm1 = xmm1 + A.load(i ,k) * b1;
2471 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2472 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
2473 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
2476 (~C)(i ,j) +=
sum( xmm1 );
2477 (~C)(i+1UL,j) +=
sum( xmm2 );
2478 (~C)(i+2UL,j) +=
sum( xmm3 );
2479 (~C)(i+3UL,j) +=
sum( xmm4 );
2481 for( ; remainder && k<kend; ++k ) {
2482 (~C)(i ,j) += A(i ,k) * B(k,j);
2483 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2484 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
2485 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
// ---- Two rows (i, i+1) at a time ----
2490 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i..i+1, columns j..j+1.
2494 for( ; (j+2UL) <= N; j+=2UL )
2496 const size_t kbegin( ( IsUpper<MT4>::value )
2497 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2498 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2499 const size_t kend( ( IsLower<MT4>::value )
2500 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2501 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2503 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2506 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2510 const IntrinsicType a1( A.load(i ,k) );
2511 const IntrinsicType a2( A.load(i+1UL,k) );
2512 const IntrinsicType b1( B.load(k,j ) );
2513 const IntrinsicType b2( B.load(k,j+1UL) );
2514 xmm1 = xmm1 + a1 * b1;
2515 xmm2 = xmm2 + a1 * b2;
2516 xmm3 = xmm3 + a2 * b1;
2517 xmm4 = xmm4 + a2 * b2;
2520 (~C)(i ,j ) +=
sum( xmm1 );
2521 (~C)(i ,j+1UL) +=
sum( xmm2 );
2522 (~C)(i+1UL,j ) +=
sum( xmm3 );
2523 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2525 for( ; remainder && k<kend; ++k ) {
2526 (~C)(i ,j ) += A(i ,k) * B(k,j );
2527 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2528 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2529 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: rows i..i+1, single column j.
2535 const size_t kbegin( ( IsUpper<MT4>::value )
2536 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2537 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2538 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2540 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2543 IntrinsicType xmm1, xmm2;
2547 const IntrinsicType b1( B.load(k,j) );
2548 xmm1 = xmm1 + A.load(i ,k) * b1;
2549 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2552 (~C)(i ,j) +=
sum( xmm1 );
2553 (~C)(i+1UL,j) +=
sum( xmm2 );
2555 for( ; remainder && k<kend; ++k ) {
2556 (~C)(i ,j) += A(i ,k) * B(k,j);
2557 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single row i ----
// 1x2 tile: row i, columns j..j+1.
2566 for( ; (j+2UL) <= N; j+=2UL )
2568 const size_t kbegin( ( IsUpper<MT4>::value )
2569 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2570 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2571 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2573 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
2576 IntrinsicType xmm1, xmm2;
2580 const IntrinsicType a1( A.load(i,k) );
2581 xmm1 = xmm1 + a1 * B.load(k,j );
2582 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2585 (~C)(i,j ) +=
sum( xmm1 );
2586 (~C)(i,j+1UL) +=
sum( xmm2 );
2588 for( ; remainder && k<kend; ++k ) {
2589 (~C)(i,j ) += A(i,k) * B(k,j );
2590 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range ends at K here.
2596 const size_t kbegin( ( IsUpper<MT4>::value )
2597 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2598 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2600 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
2607 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2610 (~C)(i,j) +=
sum( xmm1 );
2612 for( ; remainder && k<K; ++k ) {
2613 (~C)(i,j) += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is false (DisableIf): simply forwards to the default (scalar) kernel.
2635 template<
typename MT3
2638 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2639 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2641 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel for a row-major target C (DenseMatrix<MT3,false>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// As visible here it has no dedicated implementation: it downcasts C with '~' and reuses
// the small-matrix kernel.
2661 template<
typename MT3
2664 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2665 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2668 selectSmallAddAssignKernel( ~C, A, B );
// Large-matrix addition-assignment kernel for a column-major target C (DenseMatrix<MT3,true>),
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// As visible here it has no dedicated implementation: it downcasts C with '~' and reuses
// the small-matrix kernel.
2688 template<
typename MT3
2691 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2692 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2695 selectSmallAddAssignKernel( ~C, A, B );
// BLAS addition-assignment entry point used when UseBlasKernel<MT3,MT4,MT5> is false
// (DisableIf): falls back to the large-matrix kernel.
2714 template<
typename MT3
2717 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2718 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2720 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5> is true.
// Three cases are visible:
//   - A triangular: trmm() is applied to `tmp` with A on the left, then tmp is added to C;
//   - B triangular: trmm() is applied to `tmp` with B on the right, then tmp is added to C;
//   - otherwise:    a single gemm() call with both scalar factors equal to ET(1).
// NOTE(review): the declarations of `ET` and `tmp` and the final `else` are on lines not
// visible in this excerpt; presumably tmp is initialised from the other operand - confirm.
2740 template<
typename MT3
2743 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2744 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2748 if( IsTriangular<MT4>::value ) {
// Left-side triangular multiply; the triangle is chosen from IsLower<MT4>.
2750 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2751 addAssign( C, tmp );
2753 else if( IsTriangular<MT5>::value ) {
// Right-side triangular multiply; the triangle is chosen from IsLower<MT5>.
2755 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2756 addAssign( C, tmp );
// General case.
2759 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of a DMatTDMatMultExpr to a dense matrix (lhs -= rhs).
// Visible steps:
//   1. a check for an empty problem (no rows or columns in lhs, or a zero inner dimension);
//   2. serial evaluation of both operands of the expression into A (LT) and B (RT);
//   3. dispatch to selectSubAssignKernel on the downcast target (~lhs).
// NOTE(review): the body of the emptiness check and the statements between steps 2 and 3
// are on lines not visible in this excerpt.
2783 template<
typename MT
2785 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
2792 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2796 LT A(
serial( rhs.lhs_ ) );
2797 RT B(
serial( rhs.rhs_ ) );
2806 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for the subtraction assignment C -= A * B.
// The small kernel is chosen when either operand is diagonal or when the number of
// elements of C (rows * columns) is below DMATTDMATMULT_THRESHOLD; the BLAS entry point
// is the alternative.
// NOTE(review): the `else` joining the two calls is presumably on a line not visible here.
2822 template<
typename MT3
2825 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2827 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
2828 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2829 selectSmallSubAssignKernel( C, A, B );
2831 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel for a row-major target C (DenseMatrix<MT3,false>).
// Enabled via EnableIf when neither A (MT4) nor B (MT5) is diagonal.
// For every visited (i,j) it subtracts the products A(i,k) * B(k,j) from C(i,j), two values
// of k per iteration. The i, j and k ranges are narrowed according to the compile-time
// IsLower/IsUpper/IsStrictlyLower/IsStrictlyUpper traits of the operands.
// NOTE(review): the embedded line numbers jump, so some original lines (e.g. the final ':'
// branches of several conditionals, the guard in front of the trailing single-k update and
// the closing braces) are not visible here.
2850 template<
typename MT3
2853 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2854 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2856 const size_t M( A.rows() );
2857 const size_t N( B.columns() );
2858 const size_t K( A.columns() );
// Row range: skips 1 leading row for a strictly lower A (2 if B is strictly lower as well
// and M > 1); trims 1 trailing row for a strictly upper A (2 if B is strictly upper too).
2860 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
2861 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
2863 const size_t iend( ( IsStrictlyUpper<MT4>::value )
2864 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
2868 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i: starts at i (+1 per strictly upper operand) when both operands are
// upper; ends at i+1 (-1 per strictly lower operand) when both operands are lower.
2870 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2871 ?( ( IsStrictlyUpper<MT4>::value )
2872 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
2873 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
2874 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
2875 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
2876 ?( ( IsStrictlyLower<MT4>::value )
2877 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
2878 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
2879 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
2882 for(
size_t j=jbegin; j<jend; ++j )
// Start of the k range: bounded below by i (A upper) and/or j (B lower), +1 if strict.
2884 const size_t kbegin( ( IsUpper<MT4>::value )
2885 ?( ( IsLower<MT5>::value )
2886 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2887 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2888 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2889 :( ( IsLower<MT5>::value )
2890 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// End of the k range: bounded above by i+1 (A lower) and/or j+1 (B upper), -1 if strict.
2892 const size_t kend( ( IsLower<MT4>::value )
2893 ?( ( IsUpper<MT5>::value )
2894 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2895 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2896 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2897 :( ( IsUpper<MT5>::value )
2898 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// kpos = kbegin plus the k count with its lowest bit cleared, i.e. the end of the paired part.
2902 const size_t knum( kend - kbegin );
2903 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two values of k per iteration.
2905 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2906 (~C)(i,j) -= A(i,k ) * B(k ,j);
2907 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Update for the single k at kpos (left over when the k count is odd).
2910 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction-assignment kernel for a column-major target C (DenseMatrix<MT3,true>).
// Enabled via EnableIf when neither A (MT4) nor B (MT5) is diagonal.
// For every visited (i,j) it subtracts the products A(i,k) * B(k,j) from C(i,j), two values
// of k per iteration. Columns form the outer loop, rows the inner one. The j, i and k ranges
// are narrowed according to the compile-time IsLower/IsUpper/IsStrictly* traits.
// NOTE(review): the embedded line numbers jump, so some original lines (e.g. the final ':'
// branches of several conditionals, the guard in front of the trailing single-k update and
// the closing braces) are not visible here.
2932 template<
typename MT3
2935 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2936 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2938 const size_t M( A.rows() );
2939 const size_t N( B.columns() );
2940 const size_t K( A.columns() );
// Column range: skips 1 leading column for a strictly upper B (2 if A is strictly upper as
// well and N > 1); trims 1 trailing column for a strictly lower B (2 if A is strictly lower too).
2942 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
2943 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
2945 const size_t jend( ( IsStrictlyLower<MT5>::value )
2946 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
2950 for(
size_t j=jbegin; j<jend; ++j )
// Row range of column j: starts at j (+1 per strictly lower operand) when both operands are
// lower; ends at j+1 (-1 per strictly upper operand) when both operands are upper.
2952 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
2953 ?( ( IsStrictlyLower<MT4>::value )
2954 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
2955 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2956 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
2957 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2958 ?( ( IsStrictlyUpper<MT4>::value )
2959 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
2960 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
2961 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
2964 for(
size_t i=ibegin; i<iend; ++i )
// Start of the k range: bounded below by i (A upper) and/or j (B lower), +1 if strict.
2966 const size_t kbegin( ( IsUpper<MT4>::value )
2967 ?( ( IsLower<MT5>::value )
2968 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2969 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2970 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2971 :( ( IsLower<MT5>::value )
2972 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// End of the k range: bounded above by i+1 (A lower) and/or j+1 (B upper), -1 if strict.
2974 const size_t kend( ( IsLower<MT4>::value )
2975 ?( ( IsUpper<MT5>::value )
2976 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2977 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2978 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2979 :( ( IsUpper<MT5>::value )
2980 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// kpos = kbegin plus the k count with its lowest bit cleared, i.e. the end of the paired part.
2984 const size_t knum( kend - kbegin );
2985 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two values of k per iteration.
2987 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2988 (~C)(i,j) -= A(i,k ) * B(k ,j);
2989 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Update for the single k at kpos (left over when the k count is odd).
2992 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction-assignment kernel for a row-major target C (DenseMatrix<MT3,false>).
// Enabled via EnableIf when A (MT4) is not diagonal and B (MT5) is diagonal.
// Only the diagonal entry B(j,j) is read, so every update is C(i,j) -= A(i,j) * B(j,j).
// Rows are visited one by one; the columns of each row are processed two at a time.
// NOTE(review): the embedded line numbers jump, so some original lines (e.g. the ':'
// branches of the conditional expressions, the guard in front of the trailing single-column
// update and the closing braces) are not visible here.
3014 template<
typename MT3
3017 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
3018 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3020 const size_t M( A.rows() );
3021 const size_t N( B.columns() );
3023 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i, narrowed when A is upper and/or lower.
3025 const size_t jbegin( ( IsUpper<MT4>::value )
3026 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
3028 const size_t jend( ( IsLower<MT4>::value )
3029 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos = jbegin plus the column count with its lowest bit cleared, i.e. the end of the paired part.
3033 const size_t jnum( jend - jbegin );
3034 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
3036 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3037 (~C)(i,j ) -= A(i,j ) * B(j ,j );
3038 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Update of the single column at jpos (left over when the column count is odd).
3041 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction-assignment kernel for a column-major target C (DenseMatrix<MT3,true>).
// Enabled via EnableIf when A (MT4) is not diagonal and B (MT5) is diagonal.
// Only the diagonal entry B(j,j) is read, so every update is C(i,j) -= A(i,j) * B(j,j).
// The index space is walked in tiles of `block` (BLOCK_SIZE) columns by `block` rows.
// NOTE(review): the embedded line numbers jump, so some original lines (e.g. the ':'
// branches of the conditional expressions and the closing braces) are not visible here.
3062 template<
typename MT3
3065 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
3066 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3068 const size_t M( A.rows() );
3069 const size_t N( B.columns() );
3071 const size_t block( BLOCK_SIZE );
// Outer tile loop over columns [jj,jend).
3073 for(
size_t jj=0UL; jj<N; jj+=block ) {
3074 const size_t jend(
min( N, jj+block ) );
// Inner tile loop over rows [ii,iend).
3075 for(
size_t ii=0UL; ii<M; ii+=block ) {
3076 const size_t iend(
min( M, ii+block ) );
3077 for(
size_t j=jj; j<jend; ++j )
// First row to update: for a lower A start at row j (j+1 if strictly lower), never before ii.
3079 const size_t ibegin( ( IsLower<MT4>::value )
3080 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// One-past-last row: for an upper A stop after row j (at j if strictly upper), never beyond iend.
3082 const size_t ipos( ( IsUpper<MT4>::value )
3083 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
3086 for(
size_t i=ibegin; i<ipos; ++i ) {
3087 (~C)(i,j) -= A(i,j) * B(j,j);
// Default subtraction-assignment kernel for a row-major target C (DenseMatrix<MT3,false>).
// Enabled via EnableIf when A (MT4) is diagonal and B (MT5) is not diagonal.
// Only the diagonal entry A(i,i) is read, so every update is C(i,j) -= A(i,i) * B(i,j).
// The index space is walked in tiles of `block` (BLOCK_SIZE) rows by `block` columns.
// NOTE(review): the embedded line numbers jump, so some original lines (e.g. the ':'
// branches of the conditional expressions and the closing braces) are not visible here.
3110 template<
typename MT3
3113 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
3114 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3116 const size_t M( A.rows() );
3117 const size_t N( B.columns() );
3119 const size_t block( BLOCK_SIZE );
// Outer tile loop over rows [ii,iend).
3121 for(
size_t ii=0UL; ii<M; ii+=block ) {
3122 const size_t iend(
min( M, ii+block ) );
// Inner tile loop over columns [jj,jend).
3123 for(
size_t jj=0UL; jj<N; jj+=block ) {
3124 const size_t jend(
min( N, jj+block ) );
3125 for(
size_t i=ii; i<iend; ++i )
// First column to update: for an upper B start at column i (i+1 if strictly upper), never before jj.
3127 const size_t jbegin( ( IsUpper<MT5>::value )
3128 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// One-past-last column: for a lower B stop after column i (at i if strictly lower), never beyond jend.
3130 const size_t jpos( ( IsLower<MT5>::value )
3131 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
3134 for(
size_t j=jbegin; j<jpos; ++j ) {
3135 (~C)(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel for a column-major target C (DenseMatrix<MT3,true>).
// Enabled via EnableIf when A (MT4) is diagonal and B (MT5) is not diagonal.
// Only the diagonal entry A(i,i) is read, so every update is C(i,j) -= A(i,i) * B(i,j).
// Columns are visited one by one; the rows of each column are processed two at a time.
// NOTE(review): the embedded line numbers jump, so some original lines (e.g. the ':'
// branches of the conditional expressions, the guard in front of the trailing single-row
// update and the closing braces) are not visible here.
3158 template<
typename MT3
3161 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
3162 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3164 const size_t M( A.rows() );
3165 const size_t N( B.columns() );
3167 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j, narrowed when B is lower and/or upper.
3169 const size_t ibegin( ( IsLower<MT5>::value )
3170 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3172 const size_t iend( ( IsUpper<MT5>::value )
3173 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin plus the row count with its lowest bit cleared, i.e. the end of the paired part.
3177 const size_t inum( iend - ibegin );
3178 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3180 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3181 (~C)(i ,j) -= A(i ,i ) * B(i ,j);
3182 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Update of the single row at ipos (left over when the row count is odd).
3185 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel for the case that both A (MT4) and B (MT5) are diagonal.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every i in [0, A.rows()).
// C is taken as the concrete matrix type MT3, so it is indexed directly (no '~' downcast).
3206 template<
typename MT3
3209 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
3210 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3212 for(
size_t i=0UL; i<A.rows(); ++i ) {
3213 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is false (DisableIf): forwards to the default (scalar) kernel.
// Note that C is passed on as ~C here, whereas the addition counterpart passes C unchanged.
3233 template<
typename MT3
3236 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3237 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3239 selectDefaultSubAssignKernel( ~C, A, B );
// Vectorized small-matrix subtraction-assignment kernel for a row-major target C
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
//
// C is updated tile by tile. For each tile the products over k are accumulated in
// IntrinsicType registers from A.load(row,k) and B.load(k,col), reduced with sum() and
// subtracted from C; a scalar loop then finishes the k range [.., kend) when `remainder`
// is set. Tile shapes visible below, in order: 2x4, 2x2, 2x1 (two rows i, i+1), then
// 1x4, 1x2, 1x1 (a single row i).
//
// In every tile:
//   kbegin - start of the k range; derived from i and/or j when A is upper and/or B is
//            lower, and masked with size_t(-IT::size) (rounds down to a multiple of
//            IT::size, assuming IT::size is a power of two).
//   kend   - end of the k range; derived from i and/or j when A is lower and/or B is upper,
//            otherwise K.
//   kpos   - kend masked with size_t(-IT::size) if `remainder`, else kend.
//
// NOTE(review): the embedded line numbers jump, so several original lines (declarations of
// i, j and k, the header of the vectorized k loop, guards in front of the narrower tiles and
// all closing braces) are not visible here; the comments describe only what is shown.
3259 template<
typename MT3
3262 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3263 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3265 typedef IntrinsicTrait<ElementType> IT;
3267 const size_t M( A.rows() );
3268 const size_t N( B.columns() );
3269 const size_t K( A.columns() );
// A scalar tail loop is required as soon as one of the two operand types is not padded.
3271 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows (i, i+1) at a time ----
3275 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i..i+1, columns j..j+3.
3279 for( ; (j+4UL) <= N; j+=4UL )
3281 const size_t kbegin( ( IsUpper<MT4>::value )
3282 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3283 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3284 const size_t kend( ( IsLower<MT4>::value )
3285 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
3286 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
3288 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
// One accumulator per element of the tile (xmm1..4: row i, xmm5..8: row i+1).
3291 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3295 const IntrinsicType a1( A.load(i ,k) );
3296 const IntrinsicType a2( A.load(i+1UL,k) );
3297 const IntrinsicType b1( B.load(k,j ) );
3298 const IntrinsicType b2( B.load(k,j+1UL) );
3299 const IntrinsicType b3( B.load(k,j+2UL) );
3300 const IntrinsicType b4( B.load(k,j+3UL) );
3301 xmm1 = xmm1 + a1 * b1;
3302 xmm2 = xmm2 + a1 * b2;
3303 xmm3 = xmm3 + a1 * b3;
3304 xmm4 = xmm4 + a1 * b4;
3305 xmm5 = xmm5 + a2 * b1;
3306 xmm6 = xmm6 + a2 * b2;
3307 xmm7 = xmm7 + a2 * b3;
3308 xmm8 = xmm8 + a2 * b4;
// Horizontal reduction of each accumulator, subtracted from its element of C.
3311 (~C)(i ,j ) -=
sum( xmm1 );
3312 (~C)(i ,j+1UL) -=
sum( xmm2 );
3313 (~C)(i ,j+2UL) -=
sum( xmm3 );
3314 (~C)(i ,j+3UL) -=
sum( xmm4 );
3315 (~C)(i+1UL,j ) -=
sum( xmm5 );
3316 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
3317 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
3318 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// Scalar tail over the remaining k (only executed if `remainder`).
3320 for( ; remainder && k<kend; ++k ) {
3321 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3322 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3323 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3324 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3325 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3326 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3327 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3328 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile: rows i..i+1, columns j..j+1.
3332 for( ; (j+2UL) <= N; j+=2UL )
3334 const size_t kbegin( ( IsUpper<MT4>::value )
3335 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3336 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3337 const size_t kend( ( IsLower<MT4>::value )
3338 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3339 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3341 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3344 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3348 const IntrinsicType a1( A.load(i ,k) );
3349 const IntrinsicType a2( A.load(i+1UL,k) );
3350 const IntrinsicType b1( B.load(k,j ) );
3351 const IntrinsicType b2( B.load(k,j+1UL) );
3352 xmm1 = xmm1 + a1 * b1;
3353 xmm2 = xmm2 + a1 * b2;
3354 xmm3 = xmm3 + a2 * b1;
3355 xmm4 = xmm4 + a2 * b2;
3358 (~C)(i ,j ) -=
sum( xmm1 );
3359 (~C)(i ,j+1UL) -=
sum( xmm2 );
3360 (~C)(i+1UL,j ) -=
sum( xmm3 );
3361 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3363 for( ; remainder && k<kend; ++k ) {
3364 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3365 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3366 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3367 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: rows i..i+1, single column j.
3373 const size_t kbegin( ( IsUpper<MT4>::value )
3374 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3375 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3376 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3378 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3381 IntrinsicType xmm1, xmm2;
3385 const IntrinsicType b1( B.load(k,j) );
3386 xmm1 = xmm1 + A.load(i ,k) * b1;
3387 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3390 (~C)(i ,j) -=
sum( xmm1 );
3391 (~C)(i+1UL,j) -=
sum( xmm2 );
3393 for( ; remainder && k<kend; ++k ) {
3394 (~C)(i ,j) -= A(i ,k) * B(k,j);
3395 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Single row i ----
// 1x4 tile: row i, columns j..j+3.
3404 for( ; (j+4UL) <= N; j+=4UL )
3406 const size_t kbegin( ( IsUpper<MT4>::value )
3407 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3408 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3409 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
3411 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3414 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3418 const IntrinsicType a1( A.load(i,k) );
3419 xmm1 = xmm1 + a1 * B.load(k,j );
3420 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3421 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3422 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3425 (~C)(i,j ) -=
sum( xmm1 );
3426 (~C)(i,j+1UL) -=
sum( xmm2 );
3427 (~C)(i,j+2UL) -=
sum( xmm3 );
3428 (~C)(i,j+3UL) -=
sum( xmm4 );
3430 for( ; remainder && k<kend; ++k ) {
3431 (~C)(i,j ) -= A(i,k) * B(k,j );
3432 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3433 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3434 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1x2 tile: row i, columns j..j+1.
3438 for( ; (j+2UL) <= N; j+=2UL )
3440 const size_t kbegin( ( IsUpper<MT4>::value )
3441 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3442 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3443 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3445 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3448 IntrinsicType xmm1, xmm2;
3452 const IntrinsicType a1( A.load(i,k) );
3453 xmm1 = xmm1 + a1 * B.load(k,j );
3454 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3457 (~C)(i,j ) -=
sum( xmm1 );
3458 (~C)(i,j+1UL) -=
sum( xmm2 );
3460 for( ; remainder && k<kend; ++k ) {
3461 (~C)(i,j ) -= A(i,k) * B(k,j );
3462 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range ends at K here.
3468 const size_t kbegin( ( IsUpper<MT4>::value )
3469 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3470 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3472 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
3479 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3482 (~C)(i,j) -=
sum( xmm1 );
3484 for( ; remainder && k<K; ++k ) {
3485 (~C)(i,j) -= A(i,k) * B(k,j);
// Vectorized "small" subtraction-assignment kernel for a target that is a
// DenseMatrix<MT3,true>; enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// holds. Each tile of C is reduced by dot products of rows of A with columns
// of B: IntrinsicType accumulators are fed from A.load(i,k) and B.load(k,j),
// collapsed with sum(), and subtracted from C. When either operand is not
// padded ('remainder'), a scalar loop finishes the k range up to kend.
// The k range is narrowed for upper/lower operands; 'x & size_t(-IT::size)'
// rounds x down to a multiple of IT::size (assuming IT::size is a power of two).
// NOTE(review): several original lines (braces, the vectorized k loops, some
// loop headers) are not visible in this excerpt.
3508 template<
typename MT3
3511 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3512 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3514 typedef IntrinsicTrait<ElementType> IT;
3516 const size_t M( A.rows() );
3517 const size_t N( B.columns() );
3518 const size_t K( A.columns() );
// A scalar tail over k is only needed if at least one operand is unpadded.
3520 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// Four rows of C per iteration.
3524 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile of C: eight accumulators.
3528 for( ; (j+2UL) <= N; j+=2UL )
3530 const size_t kbegin( ( IsUpper<MT4>::value )
3531 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3532 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3533 const size_t kend( ( IsLower<MT4>::value )
3534 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
3535 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3537 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3540 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3544 const IntrinsicType a1( A.load(i ,k) );
3545 const IntrinsicType a2( A.load(i+1UL,k) );
3546 const IntrinsicType a3( A.load(i+2UL,k) );
3547 const IntrinsicType a4( A.load(i+3UL,k) );
3548 const IntrinsicType b1( B.load(k,j ) );
3549 const IntrinsicType b2( B.load(k,j+1UL) );
3550 xmm1 = xmm1 + a1 * b1;
3551 xmm2 = xmm2 + a1 * b2;
3552 xmm3 = xmm3 + a2 * b1;
3553 xmm4 = xmm4 + a2 * b2;
3554 xmm5 = xmm5 + a3 * b1;
3555 xmm6 = xmm6 + a3 * b2;
3556 xmm7 = xmm7 + a4 * b1;
3557 xmm8 = xmm8 + a4 * b2;
3560 (~C)(i ,j ) -=
sum( xmm1 );
3561 (~C)(i ,j+1UL) -=
sum( xmm2 );
3562 (~C)(i+1UL,j ) -=
sum( xmm3 );
3563 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3564 (~C)(i+2UL,j ) -=
sum( xmm5 );
3565 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
3566 (~C)(i+3UL,j ) -=
sum( xmm7 );
3567 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// Scalar tail for the k values not covered by the vectorized part.
3569 for( ; remainder && k<kend; ++k ) {
3570 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3571 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3572 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3573 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3574 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3575 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3576 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3577 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile of C for a single column j (its enclosing header is not visible here).
3583 const size_t kbegin( ( IsUpper<MT4>::value )
3584 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3585 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3586 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
3588 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3591 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3595 const IntrinsicType b1( B.load(k,j) );
3596 xmm1 = xmm1 + A.load(i ,k) * b1;
3597 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3598 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
3599 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
3602 (~C)(i ,j) -=
sum( xmm1 );
3603 (~C)(i+1UL,j) -=
sum( xmm2 );
3604 (~C)(i+2UL,j) -=
sum( xmm3 );
3605 (~C)(i+3UL,j) -=
sum( xmm4 );
3607 for( ; remainder && k<kend; ++k ) {
3608 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3609 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3610 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3611 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
// Two rows of C per iteration.
3616 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile of C: four accumulators.
3620 for( ; (j+2UL) <= N; j+=2UL )
3622 const size_t kbegin( ( IsUpper<MT4>::value )
3623 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3624 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3625 const size_t kend( ( IsLower<MT4>::value )
3626 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3627 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3629 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3632 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3636 const IntrinsicType a1( A.load(i ,k) );
3637 const IntrinsicType a2( A.load(i+1UL,k) );
3638 const IntrinsicType b1( B.load(k,j ) );
3639 const IntrinsicType b2( B.load(k,j+1UL) );
3640 xmm1 = xmm1 + a1 * b1;
3641 xmm2 = xmm2 + a1 * b2;
3642 xmm3 = xmm3 + a2 * b1;
3643 xmm4 = xmm4 + a2 * b2;
3646 (~C)(i ,j ) -=
sum( xmm1 );
3647 (~C)(i ,j+1UL) -=
sum( xmm2 );
3648 (~C)(i+1UL,j ) -=
sum( xmm3 );
3649 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3651 for( ; remainder && k<kend; ++k ) {
3652 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3653 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3654 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3655 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile of C for a single column j (its enclosing header is not visible here).
3661 const size_t kbegin( ( IsUpper<MT4>::value )
3662 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3663 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3664 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3666 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3669 IntrinsicType xmm1, xmm2;
3673 const IntrinsicType b1( B.load(k,j) );
3674 xmm1 = xmm1 + A.load(i ,k) * b1;
3675 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3678 (~C)(i ,j) -=
sum( xmm1 );
3679 (~C)(i+1UL,j) -=
sum( xmm2 );
3681 for( ; remainder && k<kend; ++k ) {
3682 (~C)(i ,j) -= A(i ,k) * B(k,j);
3683 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// One row of C at a time: 1x2 tile.
3691 for( ; (j+2UL) <= N; j+=2UL )
3693 const size_t kbegin( ( IsUpper<MT4>::value )
3694 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3695 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3696 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3698 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
3701 IntrinsicType xmm1, xmm2;
3705 const IntrinsicType a1( A.load(i,k) );
3706 xmm1 = xmm1 + a1 * B.load(k,j );
3707 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3710 (~C)(i,j ) -=
sum( xmm1 );
3711 (~C)(i,j+1UL) -=
sum( xmm2 );
3713 for( ; remainder && k<kend; ++k ) {
3714 (~C)(i,j ) -= A(i,k) * B(k,j );
3715 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Single element C(i,j); here the k range runs up to K.
3721 const size_t kbegin( ( IsUpper<MT4>::value )
3722 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3723 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3725 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
3732 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3735 (~C)(i,j) -=
sum( xmm1 );
3737 for( ; remainder && k<K; ++k ) {
3738 (~C)(i,j) -= A(i,k) * B(k,j);
// Fallback overload of selectLargeSubAssignKernel, selected (via DisableIf)
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold: it forwards the
// target and both operands to selectDefaultSubAssignKernel.
3760 template<
typename MT3
3763 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3764 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3766 selectDefaultSubAssignKernel( ~C, A, B );
// Overload of selectLargeSubAssignKernel for a DenseMatrix<MT3,false> target,
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The only
// statement visible in this excerpt forwards to selectSmallSubAssignKernel.
3786 template<
typename MT3
3789 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3790 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3793 selectSmallSubAssignKernel( ~C, A, B );
// Overload of selectLargeSubAssignKernel for a DenseMatrix<MT3,true> target,
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The only
// statement visible in this excerpt forwards to selectSmallSubAssignKernel.
3813 template<
typename MT3
3816 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3817 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3820 selectSmallSubAssignKernel( ~C, A, B );
// Fallback overload of selectBlasSubAssignKernel, selected (via DisableIf)
// when UseBlasKernel<MT3,MT4,MT5> does not hold: it hands the work over to
// selectLargeSubAssignKernel.
3839 template<
typename MT3
3842 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3843 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3845 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction-assignment kernel, enabled when
// UseBlasKernel<MT3,MT4,MT5> holds. Three cases are visible:
//  - MT4 triangular: trmm() is applied to 'tmp' with A from the left, then
//    'tmp' is subtracted from C via subAssign();
//  - MT5 triangular: trmm() is applied to 'tmp' with B from the right, then
//    'tmp' is subtracted from C via subAssign();
//  - otherwise: a single gemm() call with the factors ET(-1) and ET(1).
// NOTE(review): the declaration/initialization of 'tmp' and the 'ET' typedef
// are not visible in this excerpt. Presumably ET(-1)/ET(1) are the alpha/beta
// factors of gemm, i.e. C = C - A*B — confirm against the gemm wrapper.
3865 template<
typename MT3
3868 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3869 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3873 if( IsTriangular<MT4>::value ) {
3875 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3876 subAssign( C, tmp );
3878 else if( IsTriangular<MT5>::value ) {
3880 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3881 subAssign( C, tmp );
3884 gemm( C, A, B, ET(-1), ET(1) );
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): its name and parameter list are not visible in this excerpt.
// The visible body distinguishes an empty target (lhs has zero rows or zero
// columns) from an empty inner dimension (rhs.lhs_.columns() == 0UL); how each
// case is handled is not shown here.
3918 template<
typename MT
3920 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3928 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3931 else if( rhs.lhs_.columns() == 0UL ) {
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): its name and parameter list are not visible in this excerpt.
// It builds a constant temporary of type TmpType from 'rhs'; TmpType is chosen
// by SelectType between OppositeType and ResultType depending on SO
// (presumably OppositeType when SO is true — confirm against SelectType).
3966 template<
typename MT
3968 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3973 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
3985 const TmpType tmp( rhs );
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): its name and parameter list are not visible in this excerpt.
// The visible guard covers the degenerate cases in one condition: a target
// with zero rows or columns, or an inner dimension (rhs.lhs_.columns()) of zero.
4007 template<
typename MT
4009 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4017 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): its name and parameter list are not visible in this excerpt.
// The visible guard covers the degenerate cases in one condition: a target
// with zero rows or columns, or an inner dimension (rhs.lhs_.columns()) of zero.
4056 template<
typename MT
4058 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4066 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Head of a class template that derives publicly from
// DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
// and privately from MatScalarMultExpr and Computation.
// NOTE(review): the line naming the class itself is not visible in this
// excerpt; the base class suggests it is the DMatScalarMultExpr specialization
// for a scaled DMatTDMatMultExpr — confirm.
4126 template<
typename MT1
4130 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
4131 ,
private MatScalarMultExpr
4132 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
4136 typedef DMatTDMatMultExpr<MT1,MT2> MMM;
// Set when MT1 is a computation or requires an intermediate evaluation.
4148 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Set when MT2 is a computation or requires an intermediate evaluation.
4153 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time flag: 'value' is set when at least one of the two operands has
// to be evaluated (evaluateLeft or evaluateRight). The template parameters
// T1, T2 and T3 do not appear in the visible expression.
4161 template<
typename T1,
typename T2,
typename T3 >
4162 struct IsEvaluationRequired {
4163 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate for the BLAS kernels with target type T1, operand
// types T2/T3 and scalar type T4. The visible conjunction requires: mutable
// data access on T1 and const data access on T2/T3, no diagonal operand, all
// three types vectorizable, BLAS-compatible and identical element types, and
// it excludes the combination of a builtin target element type with a complex
// scalar T4.
// NOTE(review): the opening of the enum (original line 4173) is not visible in
// this excerpt and may contribute a further condition.
4171 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4172 struct UseBlasKernel {
4174 HasMutableDataAccess<T1>::value &&
4175 HasConstDataAccess<T2>::value &&
4176 HasConstDataAccess<T3>::value &&
4177 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4178 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4179 IsBlasCompatible<typename T1::ElementType>::value &&
4180 IsBlasCompatible<typename T2::ElementType>::value &&
4181 IsBlasCompatible<typename T3::ElementType>::value &&
4182 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
4183 IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
4184 !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
// Compile-time predicate for the vectorized default kernels with target type
// T1, operand types T2/T3 and scalar type T4. The visible conjunction
// requires: no diagonal operand, all three types vectorizable, the element
// types of T1, T2 and T3 as well as the scalar type T4 to be identical, and
// intrinsic addition and multiplication to be available for that element type.
// NOTE(review): the opening of the enum (original line 4194) is not visible in
// this excerpt and may contribute a further condition.
4192 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4193 struct UseVectorizedDefaultKernel {
4195 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4196 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4197 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4198 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4199 IsSame<typename T1::ElementType,T4>::value &&
4200 IntrinsicTrait<typename T1::ElementType>::addition &&
4201 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression object.
4207 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type: MultTrait of RES and the scalar type ST (RES is declared
// outside this excerpt).
4208 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
4212 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the wrapped matrix/matrix multiplication expression.
4217 typedef const DMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left operand inside the kernels: 'const RT1' if the left
// operand must be evaluated, otherwise CT1 (presumably — confirm SelectType).
4223 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
// Same selection for the right operand, between 'const RT2' and CT2.
4226 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: no diagonal operand, both operands vectorizable, both
// element types and the scalar type identical, and intrinsic addition and
// multiplication available for ET1.
4231 enum { vectorizable = !IsDiagonal<MT1>::value && !IsDiagonal<MT2>::value &&
4232 MT1::vectorizable && MT2::vectorizable &&
4233 IsSame<ET1,ET2>::value &&
4234 IsSame<ET1,ST>::value &&
4235 IntrinsicTrait<ET1>::addition &&
4236 IntrinsicTrait<ET1>::multiplication };
// SMP flag: neither operand needs evaluation and both are SMP-assignable.
4239 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4240 !evaluateRight && MT2::smpAssignable };
// Explicit constructor taking the matrix multiplication expression and the
// scalar factor. NOTE(review): the initializer list/body is not visible in
// this excerpt (presumably it stores both in matrix_ and scalar_ — confirm).
4249 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns element (i,j) of the wrapped multiplication
// expression multiplied by the stored scalar.
4262 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4265 return matrix_(i,j) * scalar_;
// Checked element access: tests the row index against matrix_.rows() and the
// column index against matrix_.columns() before delegating to operator().
// NOTE(review): the statements executed for an out-of-range index are not
// visible in this excerpt (presumably an exception is thrown — confirm).
4277 inline ReturnType
at(
size_t i,
size_t j )
const {
4278 if( i >= matrix_.rows() ) {
4281 if( j >= matrix_.columns() ) {
4284 return (*
this)(i,j);
// Returns the number of rows of the wrapped multiplication expression.
4293 inline size_t rows()
const {
4294 return matrix_.rows();
// Returns the number of columns of the wrapped multiplication expression.
4303 inline size_t columns()
const {
4304 return matrix_.columns();
// Aliasing query: delegates to canAlias() of the wrapped multiplication
// expression with the given address.
4334 template<
typename T >
4335 inline bool canAlias(
const T* alias )
const {
4336 return matrix_.canAlias( alias );
// Aliasing query: delegates to isAliased() of the wrapped multiplication
// expression with the given address.
4346 template<
typename T >
4347 inline bool isAliased(
const T* alias )
const {
4348 return matrix_.isAliased( alias );
// Body fragment of a member function whose header is not visible in this
// excerpt: forwards the alignment query to the wrapped expression.
4358 return matrix_.isAligned();
// Body fragment of a second member function whose header and first condition
// line are not visible in this excerpt. The visible part compares the element
// count rows()*columns() with DMATTDMATMULT_THRESHOLD and the row count of the
// left operand A with SMP_DMATTDMATMULT_THRESHOLD.
4368 typename MMM::LeftOperand A( matrix_.leftOperand() );
4370 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4371 ( A.rows() > SMP_DMATTDMATMULT_THRESHOLD );
// Data members: the wrapped multiplication expression and the scalar factor.
4377 LeftOperand matrix_;
4378 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix. The operands of
// the wrapped multiplication are fetched as 'left' and 'right'; an empty
// target and an empty inner dimension (left.columns() == 0UL) are tested
// separately; otherwise the work is dispatched to selectAssignKernel together
// with the scalar.
// NOTE(review): the handling of the two empty cases and the definitions of
// 'A' and 'B' are not visible in this excerpt.
4393 template<
typename MT
4395 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4402 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4403 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4405 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4408 else if( left.columns() == 0UL ) {
4423 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the assignment C = scalar * (A * B). The small kernel
// is chosen when either operand is diagonal or when the number of elements of
// C is below DMATTDMATMULT_THRESHOLD; the other visible call dispatches to the
// BLAS kernel selection.
// NOTE(review): the 'else' between the two calls (original line 4447) is not
// visible in this excerpt.
4438 template<
typename MT3
4442 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4444 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
4445 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4446 selectSmallAssignKernel( C, A, B, scalar );
4448 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for C = scalar * (A * B) with a
// DenseMatrix<MT3,false> target, enabled when neither A nor B is diagonal.
// The row range [ibegin,iend) and, per row, the column range [jbegin,jend)
// bound the elements that are computed; both depend on the (strictly)
// lower/upper properties of MT4 and MT5. For each computed element the
// products A(i,k)*B(k,j) are accumulated over k in [kbegin,kend) and the sum
// is multiplied by 'scalar'.
// NOTE(review): the bodies of the loops outside these ranges and some branch
// alternatives are not visible in this excerpt (presumably the skipped
// elements are reset — confirm).
4466 template<
typename MT3
4470 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4471 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4473 const size_t M( A.rows() );
4474 const size_t N( B.columns() );
4475 const size_t K( A.columns() );
4477 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
4478 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
4480 const size_t iend( ( IsStrictlyUpper<MT4>::value )
4481 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
// Rows in front of the computed row range.
4485 for(
size_t i=0UL; i<ibegin; ++i ) {
4486 for(
size_t j=0UL; j<N; ++j ) {
// Rows inside the computed row range.
4490 for(
size_t i=ibegin; i<iend; ++i )
4492 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4493 ?( ( IsStrictlyUpper<MT4>::value )
4494 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
4495 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
4496 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
4497 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4498 ?( ( IsStrictlyLower<MT4>::value )
4499 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
4500 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
4501 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
4504 for(
size_t j=0UL; j<jbegin; ++j ) {
4507 for(
size_t j=jbegin; j<jend; ++j )
4509 const size_t kbegin( ( IsUpper<MT4>::value )
4510 ?( ( IsLower<MT5>::value )
4511 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4512 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4513 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4514 :( ( IsLower<MT5>::value )
4515 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4517 const size_t kend( ( IsLower<MT4>::value )
4518 ?( ( IsUpper<MT5>::value )
4519 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4520 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4521 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4522 :( ( IsUpper<MT5>::value )
4523 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes the element, the rest are accumulated, and
// the scaling is applied once at the end.
4527 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4528 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4529 (~C)(i,j) += A(i,k) * B(k,j);
4531 (~C)(i,j) *= scalar;
4533 for(
size_t j=jend; j<N; ++j ) {
// Rows behind the computed row range.
4537 for(
size_t i=iend; i<M; ++i ) {
4538 for(
size_t j=0UL; j<N; ++j ) {
// Default (non-vectorized) kernel for C = scalar * (A * B) with a
// DenseMatrix<MT3,true> target, enabled when neither A nor B is diagonal.
// Here the outer traversal is over columns: the column range [jbegin,jend)
// and, per column, the row range [ibegin,iend) bound the elements that are
// computed; both depend on the (strictly) lower/upper properties of MT4 and
// MT5. For each computed element the products A(i,k)*B(k,j) are accumulated
// over k in [kbegin,kend) and the sum is multiplied by 'scalar'.
// NOTE(review): the bodies of the loops outside these ranges and some branch
// alternatives are not visible in this excerpt (presumably the skipped
// elements are reset — confirm).
4559 template<
typename MT3
4563 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4564 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4566 const size_t M( A.rows() );
4567 const size_t N( B.columns() );
4568 const size_t K( A.columns() );
4570 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
4571 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
4573 const size_t jend( ( IsStrictlyLower<MT5>::value )
4574 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
// Columns in front of the computed column range.
4578 for(
size_t j=0UL; j<jbegin; ++j ) {
4579 for(
size_t i=0UL; i<M; ++i ) {
// Columns inside the computed column range.
4583 for(
size_t j=jbegin; j<jend; ++j )
4585 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4586 ?( ( IsStrictlyLower<MT4>::value )
4587 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
4588 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4589 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
4590 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4591 ?( ( IsStrictlyUpper<MT4>::value )
4592 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
4593 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
4594 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
4597 for(
size_t i=0UL; i<ibegin; ++i ) {
4600 for(
size_t i=ibegin; i<iend; ++i )
4602 const size_t kbegin( ( IsUpper<MT4>::value )
4603 ?( ( IsLower<MT5>::value )
4604 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4605 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4606 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4607 :( ( IsLower<MT5>::value )
4608 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4610 const size_t kend( ( IsLower<MT4>::value )
4611 ?( ( IsUpper<MT5>::value )
4612 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4613 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4614 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4615 :( ( IsUpper<MT5>::value )
4616 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes the element, the rest are accumulated, and
// the scaling is applied once at the end.
4620 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4621 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4622 (~C)(i,j) += A(i,k) * B(k,j);
4624 (~C)(i,j) *= scalar;
4626 for(
size_t i=iend; i<M; ++i ) {
// Columns behind the computed column range.
4630 for(
size_t j=jend; j<N; ++j ) {
4631 for(
size_t i=0UL; i<M; ++i ) {
// Default kernel for C = scalar * (A * B) with a DenseMatrix<MT3,false>
// target, enabled when B is diagonal and A is not. With a diagonal B only
// B(j,j) contributes to column j, so each computed element is
// A(i,j) * B(j,j) * scalar. Per row, [jbegin,jend) is restricted according to
// the (strictly) upper/lower property of MT4; the columns outside that range
// are visited by separate loops guarded by IsUpper/IsLower.
// NOTE(review): the bodies of those outer loops and the alternative values of
// jbegin/jend are not visible in this excerpt.
4652 template<
typename MT3
4656 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4657 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4659 const size_t M( A.rows() );
4660 const size_t N( B.columns() );
4662 for(
size_t i=0UL; i<M; ++i )
4664 const size_t jbegin( ( IsUpper<MT4>::value )
4665 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4667 const size_t jend( ( IsLower<MT4>::value )
4668 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4672 if( IsUpper<MT4>::value ) {
4673 for(
size_t j=0UL; j<jbegin; ++j ) {
4677 for(
size_t j=jbegin; j<jend; ++j ) {
4678 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4680 if( IsLower<MT4>::value ) {
4681 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = scalar * (A * B) with a DenseMatrix<MT3,true>
// target, enabled when B is diagonal and A is not. The index space is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE (jj/ii are the tile origins,
// jend/iend the clamped tile ends). Within a tile each computed element is
// A(i,j) * B(j,j) * scalar; per column, [ibegin,ipos) is restricted according
// to the (strictly) lower/upper property of MT4, and the rows of the tile
// outside that range are visited by separate loops guarded by IsLower/IsUpper.
// NOTE(review): the bodies of those outer loops and the alternative values of
// ibegin/ipos are not visible in this excerpt.
4703 template<
typename MT3
4707 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4708 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4710 const size_t M( A.rows() );
4711 const size_t N( B.columns() );
4713 const size_t block( BLOCK_SIZE );
4715 for(
size_t jj=0UL; jj<N; jj+=block ) {
4716 const size_t jend(
min( N, jj+block ) );
4717 for(
size_t ii=0UL; ii<M; ii+=block ) {
4718 const size_t iend(
min( M, ii+block ) );
4719 for(
size_t j=jj; j<jend; ++j )
4721 const size_t ibegin( ( IsLower<MT4>::value )
4722 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
4724 const size_t ipos( ( IsUpper<MT4>::value )
4725 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
4728 if( IsLower<MT4>::value ) {
4729 for(
size_t i=ii; i<ibegin; ++i ) {
4733 for(
size_t i=ibegin; i<ipos; ++i ) {
4734 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4736 if( IsUpper<MT4>::value ) {
4737 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for C = scalar * (A * B) with a DenseMatrix<MT3,false>
// target, enabled when A is diagonal and B is not. The index space is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE (ii/jj are the tile origins,
// iend/jend the clamped tile ends). With a diagonal A only A(i,i) contributes
// to row i, so each computed element is A(i,i) * B(i,j) * scalar; per row,
// [jbegin,jpos) is restricted according to the (strictly) upper/lower
// property of MT5, and the columns of the tile outside that range are visited
// by separate loops guarded by IsUpper/IsLower.
// NOTE(review): the bodies of those outer loops and the alternative values of
// jbegin/jpos are not visible in this excerpt.
4761 template<
typename MT3
4765 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4766 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4768 const size_t M( A.rows() );
4769 const size_t N( B.columns() );
4771 const size_t block( BLOCK_SIZE );
4773 for(
size_t ii=0UL; ii<M; ii+=block ) {
4774 const size_t iend(
min( M, ii+block ) );
4775 for(
size_t jj=0UL; jj<N; jj+=block ) {
4776 const size_t jend(
min( N, jj+block ) );
4777 for(
size_t i=ii; i<iend; ++i )
4779 const size_t jbegin( ( IsUpper<MT5>::value )
4780 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
4782 const size_t jpos( ( IsLower<MT5>::value )
4783 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
4786 if( IsUpper<MT5>::value ) {
4787 for(
size_t j=jj; j<jbegin; ++j ) {
4791 for(
size_t j=jbegin; j<jpos; ++j ) {
4792 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
4794 if( IsLower<MT5>::value ) {
4795 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for C = scalar * (A * B) with a DenseMatrix<MT3,true>
// target, enabled when A is diagonal and B is not. Each computed element is
// A(i,i) * B(i,j) * scalar. Per column, [ibegin,iend) is restricted according
// to the (strictly) lower/upper property of MT5; the rows outside that range
// are visited by separate loops guarded by IsLower/IsUpper.
// NOTE(review): the bodies of those outer loops and the alternative values of
// ibegin/iend are not visible in this excerpt.
4819 template<
typename MT3
4823 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4824 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4826 const size_t M( A.rows() );
4827 const size_t N( B.columns() );
4829 for(
size_t j=0UL; j<N; ++j )
4831 const size_t ibegin( ( IsLower<MT5>::value )
4832 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4834 const size_t iend( ( IsUpper<MT5>::value )
4835 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4839 if( IsLower<MT5>::value ) {
4840 for(
size_t i=0UL; i<ibegin; ++i ) {
4844 for(
size_t i=ibegin; i<iend; ++i ) {
4845 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
4847 if( IsUpper<MT5>::value ) {
4848 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = scalar * (A * B), enabled when both A and B are
// diagonal: the visible loop writes only the diagonal of C, with
// C(i,i) = A(i,i) * B(i,i) * scalar for every row of A.
// NOTE(review): original lines 4876-4878 are not visible in this excerpt
// (presumably C is reset before the loop — confirm).
4870 template<
typename MT3
4874 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4875 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4879 for(
size_t i=0UL; i<A.rows(); ++i ) {
4880 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of selectSmallAssignKernel, selected (via DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold: it forwards the
// target, both operands and the scalar to selectDefaultAssignKernel.
4899 template<
typename MT3
4903 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4904 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4906 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized "small" assignment kernel computing C = scalar * (A * B) for a
// target that is a DenseMatrix<MT3,false>; enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Each tile of C is formed
// from dot products of rows of A with columns of B: IntrinsicType accumulators
// are fed from A.load(i,k) and B.load(k,j), collapsed with sum(), scaled and
// stored. When either operand is not padded ('remainder'), a scalar loop adds
// the scaled products for the rest of the k range.
// The k range is narrowed for upper/lower operands; 'x & size_t(-IT::size)'
// rounds x down to a multiple of IT::size (assuming IT::size is a power of two).
// NOTE(review): several original lines (braces, the vectorized k loops, some
// loop headers) are not visible in this excerpt.
4925 template<
typename MT3
4929 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4930 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4932 typedef IntrinsicTrait<ElementType> IT;
4934 const size_t M( A.rows() );
4935 const size_t N( B.columns() );
4936 const size_t K( A.columns() );
// A scalar tail over k is only needed if at least one operand is unpadded.
4938 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// Two rows of C per iteration.
4942 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile of C: eight accumulators.
4946 for( ; (j+4UL) <= N; j+=4UL )
4948 const size_t kbegin( ( IsUpper<MT4>::value )
4949 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
4950 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
4951 const size_t kend( ( IsLower<MT4>::value )
4952 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
4953 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
4955 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
4958 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4962 const IntrinsicType a1( A.load(i ,k) );
4963 const IntrinsicType a2( A.load(i+1UL,k) );
4964 const IntrinsicType b1( B.load(k,j ) );
4965 const IntrinsicType b2( B.load(k,j+1UL) );
4966 const IntrinsicType b3( B.load(k,j+2UL) );
4967 const IntrinsicType b4( B.load(k,j+3UL) );
4968 xmm1 = xmm1 + a1 * b1;
4969 xmm2 = xmm2 + a1 * b2;
4970 xmm3 = xmm3 + a1 * b3;
4971 xmm4 = xmm4 + a1 * b4;
4972 xmm5 = xmm5 + a2 * b1;
4973 xmm6 = xmm6 + a2 * b2;
4974 xmm7 = xmm7 + a2 * b3;
4975 xmm8 = xmm8 + a2 * b4;
4978 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
4979 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
4980 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
4981 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
4982 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
4983 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
4984 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
4985 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar tail for the k values not covered by the vectorized part.
4987 for( ; remainder && k<kend; ++k ) {
4988 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
4989 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
4990 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
4991 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
4992 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
4993 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
4994 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
4995 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile of C: four accumulators.
4999 for( ; (j+2UL) <= N; j+=2UL )
5001 const size_t kbegin( ( IsUpper<MT4>::value )
5002 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5003 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5004 const size_t kend( ( IsLower<MT4>::value )
5005 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5006 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5008 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5011 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5015 const IntrinsicType a1( A.load(i ,k) );
5016 const IntrinsicType a2( A.load(i+1UL,k) );
5017 const IntrinsicType b1( B.load(k,j ) );
5018 const IntrinsicType b2( B.load(k,j+1UL) );
5019 xmm1 = xmm1 + a1 * b1;
5020 xmm2 = xmm2 + a1 * b2;
5021 xmm3 = xmm3 + a2 * b1;
5022 xmm4 = xmm4 + a2 * b2;
5025 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5026 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5027 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5028 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5030 for( ; remainder && k<kend; ++k ) {
5031 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5032 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5033 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5034 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile of C for a single column j (its enclosing header is not visible here).
5040 const size_t kbegin( ( IsUpper<MT4>::value )
5041 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5042 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5043 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5045 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5048 IntrinsicType xmm1, xmm2;
5052 const IntrinsicType b1( B.load(k,j) );
5053 xmm1 = xmm1 + A.load(i ,k) * b1;
5054 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5057 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5058 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5060 for( ; remainder && k<kend; ++k ) {
5061 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5062 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// One row of C at a time: 1x4 tile.
5071 for( ; (j+4UL) <= N; j+=4UL )
5073 const size_t kbegin( ( IsUpper<MT4>::value )
5074 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5075 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5076 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
5078 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5081 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5085 const IntrinsicType a1( A.load(i,k) );
5086 xmm1 = xmm1 + a1 * B.load(k,j );
5087 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5088 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
5089 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
5092 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5093 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5094 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
5095 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
5097 for( ; remainder && k<kend; ++k ) {
5098 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5099 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5100 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5101 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile of C.
5105 for( ; (j+2UL) <= N; j+=2UL )
5107 const size_t kbegin( ( IsUpper<MT4>::value )
5108 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5109 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5110 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5112 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5115 IntrinsicType xmm1, xmm2;
5119 const IntrinsicType a1( A.load(i,k) );
5120 xmm1 = xmm1 + a1 * B.load(k,j );
5121 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5124 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5125 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5127 for( ; remainder && k<kend; ++k ) {
5128 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5129 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Single element C(i,j); here the k range runs up to K.
5135 const size_t kbegin( ( IsUpper<MT4>::value )
5136 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5137 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5139 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
5146 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5149 (~C)(i,j) =
sum( xmm1 ) * scalar;
5151 for( ; remainder && k<K; ++k ) {
5152 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Vectorized "small" assignment kernel computing C = scalar * (A * B) for a
// target that is a DenseMatrix<MT3,true>; enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Each tile of C is formed
// from dot products of rows of A with columns of B: IntrinsicType accumulators
// are fed from A.load(i,k) and B.load(k,j), collapsed with sum(), scaled and
// stored. When either operand is not padded ('remainder'), a scalar loop adds
// the scaled products for the rest of the k range.
// The k range is narrowed for upper/lower operands; 'x & size_t(-IT::size)'
// rounds x down to a multiple of IT::size (assuming IT::size is a power of two).
// NOTE(review): several original lines (braces, the vectorized k loops, some
// loop headers) are not visible in this excerpt.
5174 template<
typename MT3
5178 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5179 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5181 typedef IntrinsicTrait<ElementType> IT;
5183 const size_t M( A.rows() );
5184 const size_t N( B.columns() );
5185 const size_t K( A.columns() );
// A scalar tail over k is only needed if at least one operand is unpadded.
5187 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// Four rows of C per iteration.
5191 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile of C: eight accumulators.
5195 for( ; (j+2UL) <= N; j+=2UL )
5197 const size_t kbegin( ( IsUpper<MT4>::value )
5198 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5199 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5200 const size_t kend( ( IsLower<MT4>::value )
5201 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
5202 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5204 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5207 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5211 const IntrinsicType a1( A.load(i ,k) );
5212 const IntrinsicType a2( A.load(i+1UL,k) );
5213 const IntrinsicType a3( A.load(i+2UL,k) );
5214 const IntrinsicType a4( A.load(i+3UL,k) );
5215 const IntrinsicType b1( B.load(k,j ) );
5216 const IntrinsicType b2( B.load(k,j+1UL) );
5217 xmm1 = xmm1 + a1 * b1;
5218 xmm2 = xmm2 + a1 * b2;
5219 xmm3 = xmm3 + a2 * b1;
5220 xmm4 = xmm4 + a2 * b2;
5221 xmm5 = xmm5 + a3 * b1;
5222 xmm6 = xmm6 + a3 * b2;
5223 xmm7 = xmm7 + a4 * b1;
5224 xmm8 = xmm8 + a4 * b2;
5227 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5228 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5229 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5230 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5231 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
5232 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5233 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
5234 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar tail for the k values not covered by the vectorized part.
5236 for( ; remainder && k<kend; ++k ) {
5237 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5238 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5239 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5240 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5241 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5242 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5243 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5244 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile of C for a single column j (its enclosing header is not visible here).
5250 const size_t kbegin( ( IsUpper<MT4>::value )
5251 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5252 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5253 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
5255 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5258 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5262 const IntrinsicType b1( B.load(k,j) );
5263 xmm1 = xmm1 + A.load(i ,k) * b1;
5264 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5265 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
5266 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
5269 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5270 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5271 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
5272 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
5274 for( ; remainder && k<kend; ++k ) {
5275 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5276 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5277 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5278 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// Two rows of C per iteration.
5283 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile of C: four accumulators.
5287 for( ; (j+2UL) <= N; j+=2UL )
5289 const size_t kbegin( ( IsUpper<MT4>::value )
5290 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5291 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5292 const size_t kend( ( IsLower<MT4>::value )
5293 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5294 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5296 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5299 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5303 const IntrinsicType a1( A.load(i ,k) );
5304 const IntrinsicType a2( A.load(i+1UL,k) );
5305 const IntrinsicType b1( B.load(k,j ) );
5306 const IntrinsicType b2( B.load(k,j+1UL) );
5307 xmm1 = xmm1 + a1 * b1;
5308 xmm2 = xmm2 + a1 * b2;
5309 xmm3 = xmm3 + a2 * b1;
5310 xmm4 = xmm4 + a2 * b2;
5313 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5314 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5315 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5316 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5318 for( ; remainder && k<kend; ++k ) {
5319 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5320 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5321 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5322 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile of C for a single column j (its enclosing header is not visible here).
5328 const size_t kbegin( ( IsUpper<MT4>::value )
5329 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5330 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5331 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5333 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5336 IntrinsicType xmm1, xmm2;
5340 const IntrinsicType b1( B.load(k,j) );
5341 xmm1 = xmm1 + A.load(i ,k) * b1;
5342 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5345 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5346 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5348 for( ; remainder && k<kend; ++k ) {
5349 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5350 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// One row of C at a time: 1x2 tile.
5359 for( ; (j+2UL) <= N; j+=2UL )
5361 const size_t kbegin( ( IsUpper<MT4>::value )
5362 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5363 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5364 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5366 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
5369 IntrinsicType xmm1, xmm2;
5373 const IntrinsicType a1( A.load(i,k) );
5374 xmm1 = xmm1 + a1 * B.load(k,j );
5375 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5378 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5379 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5381 for( ; remainder && k<kend; ++k ) {
5382 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5383 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Single element C(i,j); here the k range runs up to K.
5389 const size_t kbegin( ( IsUpper<MT4>::value )
5390 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5391 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5393 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
5400 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5403 (~C)(i,j) =
sum( xmm1 ) * scalar;
5405 for( ; remainder && k<K; ++k ) {
5406 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Fallback overload of selectLargeAssignKernel, selected (via DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold: it forwards the
// target, both operands and the scalar to selectDefaultAssignKernel.
5427 template<
typename MT3
5431 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5432 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5434 selectDefaultAssignKernel( C, A, B, scalar );
// selectLargeAssignKernel overload for targets of type DenseMatrix<MT3,false>,
// active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// The visible body unwraps the target with operator~ and forwards to
// selectSmallAssignKernel.
// NOTE(review): the `false` flag is presumably the storage order of the
// target - confirm against the DenseMatrix declaration.
5453 template<
typename MT3
5457 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5458 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5461 selectSmallAssignKernel( ~C, A, B, scalar );
// selectLargeAssignKernel overload for targets of type DenseMatrix<MT3,true>,
// active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// The visible body unwraps the target with operator~ and forwards to
// selectSmallAssignKernel.
// NOTE(review): the `true` flag is presumably the storage order of the
// target - confirm against the DenseMatrix declaration.
5480 template<
typename MT3
5484 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5485 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5488 selectSmallAssignKernel( ~C, A, B, scalar );
// selectBlasAssignKernel overload that is active when
// UseBlasKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf).
// It falls through to selectLargeAssignKernel with the same arguments.
5506 template<
typename MT3
5510 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5511 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5513 selectLargeAssignKernel( C, A, B, scalar );
// selectBlasAssignKernel overload that is active when
// UseBlasKernel<MT3,MT4,MT5,ST2> is satisfied (EnableIf).
// Three cases are distinguished on the operand types:
//   - A (MT4) triangular: trmm with A on the left (CblasLeft),
//   - B (MT5) triangular: trmm with B on the right (CblasRight),
//   - otherwise:          gemm on C, A and B.
// In every case the scalar is converted to ET before being passed on.
5532 template<
typename MT3
5536 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5537 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5541 if( IsTriangular<MT4>::value ) {
// The uplo argument follows IsLower<MT4>: CblasLower if set, CblasUpper otherwise.
5543 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5545 else if( IsTriangular<MT5>::value ) {
// Same selection, this time driven by IsLower<MT5>.
5547 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// NOTE(review): ET(scalar) and ET(0) are presumably the alpha and beta
// factors of the gemm wrapper (beta = 0 overwriting C) - confirm.
5550 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled matrix product (DMatScalarMultExpr) to a sparse
// matrix. The expression is first evaluated into a temporary via
// serial( rhs ), and that temporary is then assigned to the unwrapped target.
5568 template<
typename MT
5570 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Type of the temporary: chosen between OppositeType and ResultType by the
// target's SO flag.
5574 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
5586 const TmpType tmp(
serial( rhs ) );
5587 assign( ~lhs, tmp );
// Addition assignment of the scaled matrix product (DMatScalarMultExpr) to a
// dense matrix. The two operands of the wrapped product are fetched from
// rhs.matrix_, and the work is delegated to selectAddAssignKernel together
// with rhs.scalar_.
5603 template<
typename MT
5605 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5612 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5613 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate-size check: target has no rows or no columns, or the inner
// dimension (left.columns()) is zero.
5615 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5629 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += A * B * scalar.
// The condition below (an operand is diagonal, or the number of elements of
// C is below DMATTDMATMULT_THRESHOLD) is tied to selectSmallAddAssignKernel;
// selectBlasAddAssignKernel is the other candidate.
5644 template<
typename MT3
5648 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5650 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
5651 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5652 selectSmallAddAssignKernel( C, A, B, scalar );
5654 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel for the case that neither A (MT4) nor
// B (MT5) is diagonal. The scaled product A * B * scalar is evaluated via
// serial() into a temporary of type ResultType, which is then added to C.
5672 template<
typename MT3
5676 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5677 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5679 const ResultType tmp(
serial( A * B * scalar ) );
5680 addAssign( C, tmp );
// Default addition-assignment kernel for a non-diagonal A (MT4) and a
// diagonal B (MT5), with a target of type DenseMatrix<MT3,false>.
// Because only B(j,j) is read from B, each update is
//    C(i,j) += A(i,j) * B(j,j) * scalar.
// The rows of C are traversed one by one; within a row the column loop is
// unrolled by two.
5698 template<
typename MT3
5702 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5703 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5705 const size_t M( A.rows() );
5706 const size_t N( B.columns() );
5708 for(
size_t i=0UL; i<M; ++i )
// First column of row i; for an upper A it is i, or i+1 if A is strictly upper.
5710 const size_t jbegin( ( IsUpper<MT4>::value )
5711 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// One-past-last column of row i; for a lower A it is i+1, or i if A is strictly lower.
5713 const size_t jend( ( IsLower<MT4>::value )
5714 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part that can be handled in pairs
// (jnum with its lowest bit cleared).
5718 const size_t jnum( jend - jbegin );
5719 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
5721 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5722 (~C)(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5723 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the single element at column jpos (the leftover of the pairwise loop).
5726 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition-assignment kernel for a non-diagonal A (MT4) and a
// diagonal B (MT5), with a target of type DenseMatrix<MT3,true>.
// Each update is C(i,j) += A(i,j) * B(j,j) * scalar. The index space is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE: outer loop over column
// tiles (jj), inner loop over row tiles (ii).
5746 template<
typename MT3
5750 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5751 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5753 const size_t M( A.rows() );
5754 const size_t N( B.columns() );
5756 const size_t block( BLOCK_SIZE );
5758 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Column tile [jj, jend), clipped to N.
5759 const size_t jend(
min( N, jj+block ) );
5760 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Row tile [ii, iend), clipped to M.
5761 const size_t iend(
min( M, ii+block ) );
5762 for(
size_t j=jj; j<jend; ++j )
// For a lower A the row range of column j starts at j (j+1 if strictly
// lower), but never before the tile start ii.
5764 const size_t ibegin( ( IsLower<MT4>::value )
5765 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// For an upper A the row range of column j ends at j+1 (j if strictly
// upper), but never beyond the tile end iend.
5767 const size_t ipos( ( IsUpper<MT4>::value )
5768 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
5771 for(
size_t i=ibegin; i<ipos; ++i ) {
5772 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition-assignment kernel for a diagonal A (MT4) and a
// non-diagonal B (MT5), with a target of type DenseMatrix<MT3,false>.
// Because only A(i,i) is read from A, each update is
//    C(i,j) += A(i,i) * B(i,j) * scalar.
// The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE: outer
// loop over row tiles (ii), inner loop over column tiles (jj).
5794 template<
typename MT3
5798 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5799 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5801 const size_t M( A.rows() );
5802 const size_t N( B.columns() );
5804 const size_t block( BLOCK_SIZE );
5806 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Row tile [ii, iend), clipped to M.
5807 const size_t iend(
min( M, ii+block ) );
5808 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Column tile [jj, jend), clipped to N.
5809 const size_t jend(
min( N, jj+block ) );
5810 for(
size_t i=ii; i<iend; ++i )
// For an upper B the column range of row i starts at i (i+1 if strictly
// upper), but never before the tile start jj.
5812 const size_t jbegin( ( IsUpper<MT5>::value )
5813 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// For a lower B the column range of row i ends at i+1 (i if strictly
// lower), but never beyond the tile end jend.
5815 const size_t jpos( ( IsLower<MT5>::value )
5816 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
5819 for(
size_t j=jbegin; j<jpos; ++j ) {
5820 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition-assignment kernel for a diagonal A (MT4) and a
// non-diagonal B (MT5), with a target of type DenseMatrix<MT3,true>.
// Each update is C(i,j) += A(i,i) * B(i,j) * scalar. The columns of C are
// traversed one by one; within a column the row loop is unrolled by two.
5842 template<
typename MT3
5846 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5847 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5849 const size_t M( A.rows() );
5850 const size_t N( B.columns() );
5852 for(
size_t j=0UL; j<N; ++j )
// First row of column j; for a lower B it is j, or j+1 if B is strictly lower.
5854 const size_t ibegin( ( IsLower<MT5>::value )
5855 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// One-past-last row of column j; for an upper B it is j+1, or j if B is strictly upper.
5857 const size_t iend( ( IsUpper<MT5>::value )
5858 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos marks the end of the part that can be handled in pairs
// (inum with its lowest bit cleared).
5862 const size_t inum( iend - ibegin );
5863 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5865 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5866 (~C)(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5867 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the single element at row ipos (the leftover of the pairwise loop).
5870 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition-assignment kernel for the case that both A (MT4) and
// B (MT5) are diagonal. Only the diagonal of C is touched:
//    C(i,i) += A(i,i) * B(i,i) * scalar   for i in [0, A.rows()).
5890 template<
typename MT3
5894 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5895 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5897 for(
size_t i=0UL; i<A.rows(); ++i ) {
5898 C(i,i) += A(i,i) * B(i,i) * scalar;
// selectSmallAddAssignKernel overload that is active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf).
// It forwards C, A, B and scalar unchanged to selectDefaultAddAssignKernel.
5917 template<
typename MT3
5921 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5922 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5924 selectDefaultAddAssignKernel( C, A, B, scalar );
// selectSmallAddAssignKernel overload for targets of type
// DenseMatrix<MT3,false>, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//
// Computes C += A * B * scalar tile by tile. For every tile of C a k-range
// [kbegin,kend) is determined from the triangular structure of A and B,
// products of packed loads (A.load / B.load) are accumulated in
// IntrinsicType registers, each register is reduced with sum() and the
// result, multiplied by scalar, is added to C. If the operands are not both
// padded, the k values beyond the packed part are handled by a scalar loop.
//
// Tile shapes, in the order they appear: 2x4, 2x2, 2x1 (two rows at a time),
// followed by 1x4, 1x2 and 1x1 (single row).
5943 template<
typename MT3
5947 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5948 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5950 typedef IntrinsicTrait<ElementType> IT;
5952 const size_t M( A.rows() );
5953 const size_t N( B.columns() );
5954 const size_t K( A.columns() );
// A scalar tail loop is required unless both operand types are padded.
5956 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows of C per iteration ----
5960 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i, i+1 and columns j .. j+3.
5964 for( ; (j+4UL) <= N; j+=4UL )
// Start of the k-range, masked with size_t(-IT::size) (i.e. rounded down to
// a multiple of IT::size, assuming IT::size is a power of two). It is
// raised to i for an upper A and/or to j for a lower B.
5966 const size_t kbegin( ( IsUpper<MT4>::value )
5967 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5968 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// End of the k-range: limited to i+2 for a lower A and/or to j+4 for an
// upper B, otherwise K.
5969 const size_t kend( ( IsLower<MT4>::value )
5970 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
5971 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// End of the packed part: kend masked down when a scalar tail is needed.
5973 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
// One accumulator per element of the 2x4 tile.
5976 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5980 const IntrinsicType a1( A.load(i ,k) );
5981 const IntrinsicType a2( A.load(i+1UL,k) );
5982 const IntrinsicType b1( B.load(k,j ) );
5983 const IntrinsicType b2( B.load(k,j+1UL) );
5984 const IntrinsicType b3( B.load(k,j+2UL) );
5985 const IntrinsicType b4( B.load(k,j+3UL) );
5986 xmm1 = xmm1 + a1 * b1;
5987 xmm2 = xmm2 + a1 * b2;
5988 xmm3 = xmm3 + a1 * b3;
5989 xmm4 = xmm4 + a1 * b4;
5990 xmm5 = xmm5 + a2 * b1;
5991 xmm6 = xmm6 + a2 * b2;
5992 xmm7 = xmm7 + a2 * b3;
5993 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator and add the scaled result to C.
5996 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
5997 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
5998 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
5999 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
6000 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
6001 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
6002 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
6003 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// Scalar tail over the remaining k values up to kend.
6005 for( ; remainder && k<kend; ++k ) {
6006 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6007 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6008 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6009 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6010 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6011 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6012 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6013 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile: rows i, i+1 and columns j, j+1 (same scheme as the 2x4 tile).
6017 for( ; (j+2UL) <= N; j+=2UL )
6019 const size_t kbegin( ( IsUpper<MT4>::value )
6020 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6021 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6022 const size_t kend( ( IsLower<MT4>::value )
6023 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6024 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6026 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6029 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6033 const IntrinsicType a1( A.load(i ,k) );
6034 const IntrinsicType a2( A.load(i+1UL,k) );
6035 const IntrinsicType b1( B.load(k,j ) );
6036 const IntrinsicType b2( B.load(k,j+1UL) );
6037 xmm1 = xmm1 + a1 * b1;
6038 xmm2 = xmm2 + a1 * b2;
6039 xmm3 = xmm3 + a2 * b1;
6040 xmm4 = xmm4 + a2 * b2;
6043 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6044 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6045 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6046 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6048 for( ; remainder && k<kend; ++k ) {
6049 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6050 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6051 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6052 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i, i+1 and the single column j. Here kend depends only on A.
6058 const size_t kbegin( ( IsUpper<MT4>::value )
6059 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6060 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6061 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6063 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6066 IntrinsicType xmm1, xmm2;
6070 const IntrinsicType b1( B.load(k,j) );
6071 xmm1 = xmm1 + A.load(i ,k) * b1;
6072 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6075 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6076 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6078 for( ; remainder && k<kend; ++k ) {
6079 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6080 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single row i of C ----
// 1x4 tile: columns j .. j+3. Here kend depends only on B.
6089 for( ; (j+4UL) <= N; j+=4UL )
6091 const size_t kbegin( ( IsUpper<MT4>::value )
6092 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6093 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6094 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
6096 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6099 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6103 const IntrinsicType a1( A.load(i,k) );
6104 xmm1 = xmm1 + a1 * B.load(k,j );
6105 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6106 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
6107 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
6110 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6111 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6112 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
6113 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
6115 for( ; remainder && k<kend; ++k ) {
6116 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6117 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6118 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6119 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile: columns j, j+1.
6123 for( ; (j+2UL) <= N; j+=2UL )
6125 const size_t kbegin( ( IsUpper<MT4>::value )
6126 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6127 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6128 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6130 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6133 IntrinsicType xmm1, xmm2;
6137 const IntrinsicType a1( A.load(i,k) );
6138 xmm1 = xmm1 + a1 * B.load(k,j );
6139 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6142 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6143 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6145 for( ; remainder && k<kend; ++k ) {
6146 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6147 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single element (i,j). No kend is computed here; the packed
// part ends at K (masked down if a tail is needed) and the tail runs to K.
6153 const size_t kbegin( ( IsUpper<MT4>::value )
6154 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6155 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6157 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
6164 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6167 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6169 for( ; remainder && k<K; ++k ) {
6170 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// selectSmallAddAssignKernel overload for targets of type
// DenseMatrix<MT3,true>, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//
// Computes C += A * B * scalar tile by tile. For every tile of C a k-range
// [kbegin,kend) is determined from the triangular structure of A and B,
// products of packed loads (A.load / B.load) are accumulated in
// IntrinsicType registers, each register is reduced with sum() and the
// result, multiplied by scalar, is added to C. If the operands are not both
// padded, the k values beyond the packed part are handled by a scalar loop.
//
// Tile shapes, in the order they appear: 4x2, 4x1 (four rows at a time),
// 2x2, 2x1 (two rows at a time), followed by 1x2 and 1x1 (single row).
6192 template<
typename MT3
6196 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6197 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6199 typedef IntrinsicTrait<ElementType> IT;
6201 const size_t M( A.rows() );
6202 const size_t N( B.columns() );
6203 const size_t K( A.columns() );
// A scalar tail loop is required unless both operand types are padded.
6205 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Four rows of C per iteration ----
6209 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i .. i+3 and columns j, j+1.
6213 for( ; (j+2UL) <= N; j+=2UL )
// Start of the k-range, masked with size_t(-IT::size) (i.e. rounded down to
// a multiple of IT::size, assuming IT::size is a power of two). It is
// raised to i for an upper A and/or to j for a lower B.
6215 const size_t kbegin( ( IsUpper<MT4>::value )
6216 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6217 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// End of the k-range: limited to i+4 for a lower A and/or to j+2 for an
// upper B, otherwise K.
6218 const size_t kend( ( IsLower<MT4>::value )
6219 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
6220 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// End of the packed part: kend masked down when a scalar tail is needed.
6222 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
// One accumulator per element of the 4x2 tile.
6225 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6229 const IntrinsicType a1( A.load(i ,k) );
6230 const IntrinsicType a2( A.load(i+1UL,k) );
6231 const IntrinsicType a3( A.load(i+2UL,k) );
6232 const IntrinsicType a4( A.load(i+3UL,k) );
6233 const IntrinsicType b1( B.load(k,j ) );
6234 const IntrinsicType b2( B.load(k,j+1UL) );
6235 xmm1 = xmm1 + a1 * b1;
6236 xmm2 = xmm2 + a1 * b2;
6237 xmm3 = xmm3 + a2 * b1;
6238 xmm4 = xmm4 + a2 * b2;
6239 xmm5 = xmm5 + a3 * b1;
6240 xmm6 = xmm6 + a3 * b2;
6241 xmm7 = xmm7 + a4 * b1;
6242 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator and add the scaled result to C.
6245 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6246 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6247 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6248 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6249 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6250 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6251 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6252 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// Scalar tail over the remaining k values up to kend.
6254 for( ; remainder && k<kend; ++k ) {
6255 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6256 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6257 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6258 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6259 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6260 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6261 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6262 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile: rows i .. i+3 and the single column j. Here kend depends only on A.
6268 const size_t kbegin( ( IsUpper<MT4>::value )
6269 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6270 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6271 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
6273 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6276 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6280 const IntrinsicType b1( B.load(k,j) );
6281 xmm1 = xmm1 + A.load(i ,k) * b1;
6282 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6283 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
6284 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
6287 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6288 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6289 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
6290 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
6292 for( ; remainder && k<kend; ++k ) {
6293 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6294 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6295 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6296 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// ---- Two rows of C per iteration ----
6301 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i, i+1 and columns j, j+1.
6305 for( ; (j+2UL) <= N; j+=2UL )
6307 const size_t kbegin( ( IsUpper<MT4>::value )
6308 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6309 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6310 const size_t kend( ( IsLower<MT4>::value )
6311 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6312 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6314 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6317 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6321 const IntrinsicType a1( A.load(i ,k) );
6322 const IntrinsicType a2( A.load(i+1UL,k) );
6323 const IntrinsicType b1( B.load(k,j ) );
6324 const IntrinsicType b2( B.load(k,j+1UL) );
6325 xmm1 = xmm1 + a1 * b1;
6326 xmm2 = xmm2 + a1 * b2;
6327 xmm3 = xmm3 + a2 * b1;
6328 xmm4 = xmm4 + a2 * b2;
6331 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6332 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6333 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6334 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6336 for( ; remainder && k<kend; ++k ) {
6337 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6338 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6339 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6340 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i, i+1 and the single column j. Here kend depends only on A.
6346 const size_t kbegin( ( IsUpper<MT4>::value )
6347 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6348 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6349 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6351 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6354 IntrinsicType xmm1, xmm2;
6358 const IntrinsicType b1( B.load(k,j) );
6359 xmm1 = xmm1 + A.load(i ,k) * b1;
6360 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6363 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6364 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6366 for( ; remainder && k<kend; ++k ) {
6367 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6368 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single row i of C ----
// 1x2 tile: columns j, j+1. Here kend depends only on B.
6377 for( ; (j+2UL) <= N; j+=2UL )
6379 const size_t kbegin( ( IsUpper<MT4>::value )
6380 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6381 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6382 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6384 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
6387 IntrinsicType xmm1, xmm2;
6391 const IntrinsicType a1( A.load(i,k) );
6392 xmm1 = xmm1 + a1 * B.load(k,j );
6393 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6396 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6397 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6399 for( ; remainder && k<kend; ++k ) {
6400 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6401 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single element (i,j). No kend is computed here; the packed
// part ends at K (masked down if a tail is needed) and the tail runs to K.
6407 const size_t kbegin( ( IsUpper<MT4>::value )
6408 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6409 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6411 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
6418 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6421 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6423 for( ; remainder && k<K; ++k ) {
6424 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// selectLargeAddAssignKernel overload that is active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf).
// It forwards C, A, B and scalar unchanged to selectDefaultAddAssignKernel.
6445 template<
typename MT3
6449 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6450 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6452 selectDefaultAddAssignKernel( C, A, B, scalar );
// selectLargeAddAssignKernel overload for targets of type
// DenseMatrix<MT3,false>, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// The visible body unwraps the target with operator~ and forwards to
// selectSmallAddAssignKernel.
6471 template<
typename MT3
6475 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6476 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6479 selectSmallAddAssignKernel( ~C, A, B, scalar );
// selectLargeAddAssignKernel overload for targets of type
// DenseMatrix<MT3,true>, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// The visible body unwraps the target with operator~ and forwards to
// selectSmallAddAssignKernel.
6498 template<
typename MT3
6502 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6503 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6506 selectSmallAddAssignKernel( ~C, A, B, scalar );
// selectBlasAddAssignKernel overload that is active when
// UseBlasKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf).
// It falls through to selectLargeAddAssignKernel with the same arguments.
6524 template<
typename MT3
6528 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6529 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6531 selectLargeAddAssignKernel( C, A, B, scalar );
// selectBlasAddAssignKernel overload that is active when
// UseBlasKernel<MT3,MT4,MT5,ST2> is satisfied (EnableIf).
// Three cases are distinguished on the operand types:
//   - A (MT4) triangular: trmm is applied to a temporary `tmp` with A on the
//     left (CblasLeft); tmp is then added to C via addAssign,
//   - B (MT5) triangular: the same with B on the right (CblasRight),
//   - otherwise:          gemm directly on C, A and B.
// In every case the scalar is converted to ET before being passed on.
6550 template<
typename MT3
6554 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6555 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6559 if( IsTriangular<MT4>::value ) {
// The uplo argument follows IsLower<MT4>: CblasLower if set, CblasUpper otherwise.
6561 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6562 addAssign( C, tmp );
6564 else if( IsTriangular<MT5>::value ) {
// Same selection, this time driven by IsLower<MT5>.
6566 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6567 addAssign( C, tmp );
// NOTE(review): ET(scalar) and ET(1) are presumably the alpha and beta
// factors of the gemm wrapper (beta = 1 accumulating into C) - confirm.
6570 gemm( C, A, B, ET(scalar), ET(1) );
// Subtraction assignment of the scaled matrix product (DMatScalarMultExpr)
// to a dense matrix. The two operands of the wrapped product are fetched
// from rhs.matrix_, and the work is delegated to selectSubAssignKernel
// together with rhs.scalar_.
6592 template<
typename MT
6594 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6601 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6602 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate-size check: target has no rows or no columns, or the inner
// dimension (left.columns()) is zero.
6604 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6618 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= A * B * scalar.
// The condition below (an operand is diagonal, or the number of elements of
// C is below DMATTDMATMULT_THRESHOLD) is tied to selectSmallSubAssignKernel;
// selectBlasSubAssignKernel is the other candidate.
6633 template<
typename MT3
6637 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6639 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
6640 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6641 selectSmallSubAssignKernel( C, A, B, scalar );
6643 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel for the case that neither A (MT4)
// nor B (MT5) is diagonal. The scaled product A * B * scalar is evaluated
// via serial() into a temporary of type ResultType, which is then
// subtracted from C.
6661 template<
typename MT3
6665 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6666 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6668 const ResultType tmp(
serial( A * B * scalar ) );
6669 subAssign( C, tmp );
// Default subtraction-assignment kernel for a non-diagonal A (MT4) and a
// diagonal B (MT5), with a target of type DenseMatrix<MT3,false>.
// Because only B(j,j) is read from B, each update is
//    C(i,j) -= A(i,j) * B(j,j) * scalar.
// The rows of C are traversed one by one; within a row the column loop is
// unrolled by two.
6687 template<
typename MT3
6691 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6692 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6694 const size_t M( A.rows() );
6695 const size_t N( B.columns() );
6697 for(
size_t i=0UL; i<M; ++i )
// First column of row i; for an upper A it is i, or i+1 if A is strictly upper.
6699 const size_t jbegin( ( IsUpper<MT4>::value )
6700 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// One-past-last column of row i; for a lower A it is i+1, or i if A is strictly lower.
6702 const size_t jend( ( IsLower<MT4>::value )
6703 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part that can be handled in pairs
// (jnum with its lowest bit cleared).
6707 const size_t jnum( jend - jbegin );
6708 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6710 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6711 (~C)(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6712 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the single element at column jpos (the leftover of the pairwise loop).
6715 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel for a non-diagonal A (MT4) and a
// diagonal B (MT5), with a target of type DenseMatrix<MT3,true>.
// Each update is C(i,j) -= A(i,j) * B(j,j) * scalar. The index space is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE: outer loop over column
// tiles (jj), inner loop over row tiles (ii).
6735 template<
typename MT3
6739 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6740 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6742 const size_t M( A.rows() );
6743 const size_t N( B.columns() );
6745 const size_t block( BLOCK_SIZE );
6747 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Column tile [jj, jend), clipped to N.
6748 const size_t jend(
min( N, jj+block ) );
6749 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Row tile [ii, iend), clipped to M.
6750 const size_t iend(
min( M, ii+block ) );
6751 for(
size_t j=jj; j<jend; ++j )
// For a lower A the row range of column j starts at j (j+1 if strictly
// lower), but never before the tile start ii.
6753 const size_t ibegin( ( IsLower<MT4>::value )
6754 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// For an upper A the row range of column j ends at j+1 (j if strictly
// upper), but never beyond the tile end iend.
6756 const size_t ipos( ( IsUpper<MT4>::value )
6757 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
6760 for(
size_t i=ibegin; i<ipos; ++i ) {
6761 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a diagonal A (MT4) and a
// non-diagonal B (MT5), with a target of type DenseMatrix<MT3,false>.
// Because only A(i,i) is read from A, each update is
//    C(i,j) -= A(i,i) * B(i,j) * scalar.
// The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE: outer
// loop over row tiles (ii), inner loop over column tiles (jj).
6784 template<
typename MT3
6788 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6789 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6791 const size_t M( A.rows() );
6792 const size_t N( B.columns() );
6794 const size_t block( BLOCK_SIZE );
6796 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Row tile [ii, iend), clipped to M.
6797 const size_t iend(
min( M, ii+block ) );
6798 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Column tile [jj, jend), clipped to N.
6799 const size_t jend(
min( N, jj+block ) );
6800 for(
size_t i=ii; i<iend; ++i )
// For an upper B the column range of row i starts at i (i+1 if strictly
// upper), but never before the tile start jj.
6802 const size_t jbegin( ( IsUpper<MT5>::value )
6803 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// For a lower B the column range of row i ends at i+1 (i if strictly
// lower), but never beyond the tile end jend.
6805 const size_t jpos( ( IsLower<MT5>::value )
6806 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
6809 for(
size_t j=jbegin; j<jpos; ++j ) {
6810 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction-assignment kernel for a diagonal A (MT4) and a
// non-diagonal B (MT5), with a target of type DenseMatrix<MT3,true>.
// Each update is C(i,j) -= A(i,i) * B(i,j) * scalar. The columns of C are
// traversed one by one; within a column the row loop is unrolled by two.
6833 template<
typename MT3
6837 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6838 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6840 const size_t M( A.rows() );
6841 const size_t N( B.columns() );
6843 for(
size_t j=0UL; j<N; ++j )
// First row of column j; for a lower B it is j, or j+1 if B is strictly lower.
6845 const size_t ibegin( ( IsLower<MT5>::value )
6846 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// One-past-last row of column j; for an upper B it is j+1, or j if B is strictly upper.
6848 const size_t iend( ( IsUpper<MT5>::value )
6849 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos marks the end of the part that can be handled in pairs
// (inum with its lowest bit cleared).
6853 const size_t inum( iend - ibegin );
6854 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6856 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6857 (~C)(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6858 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the single element at row ipos (the leftover of the pairwise loop).
6861 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction-assignment kernel for the case that both A (MT4) and
// B (MT5) are diagonal. Only the diagonal of C is touched:
//    C(i,i) -= A(i,i) * B(i,i) * scalar   for i in [0, A.rows()).
6881 template<
typename MT3
6885 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6886 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6888 for(
size_t i=0UL; i<A.rows(); ++i ) {
6889 C(i,i) -= A(i,i) * B(i,i) * scalar;
// selectSmallSubAssignKernel overload that is active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf).
// It forwards C, A, B and scalar unchanged to selectDefaultSubAssignKernel.
6908 template<
typename MT3
6912 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6913 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6915 selectDefaultSubAssignKernel( C, A, B, scalar );
// selectSmallSubAssignKernel overload for targets of type
// DenseMatrix<MT3,false>, active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//
// Computes C -= A * B * scalar tile by tile. For every tile of C a k-range
// [kbegin,kend) is determined from the triangular structure of A and B,
// products of packed loads (A.load / B.load) are accumulated in
// IntrinsicType registers, each register is reduced with sum() and the
// result, multiplied by scalar, is subtracted from C. If the operands are
// not both padded, the k values beyond the packed part are handled by a
// scalar loop.
//
// Tile shapes, in the order they appear: 2x4, 2x2, 2x1 (two rows at a time),
// followed by 1x4, 1x2 and 1x1 (single row).
6934 template<
typename MT3
6938 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6939 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6941 typedef IntrinsicTrait<ElementType> IT;
6943 const size_t M( A.rows() );
6944 const size_t N( B.columns() );
6945 const size_t K( A.columns() );
// A scalar tail loop is required unless both operand types are padded.
6947 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows of C per iteration ----
6951 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i, i+1 and columns j .. j+3.
6955 for( ; (j+4UL) <= N; j+=4UL )
// Start of the k-range, masked with size_t(-IT::size) (i.e. rounded down to
// a multiple of IT::size, assuming IT::size is a power of two). It is
// raised to i for an upper A and/or to j for a lower B.
6957 const size_t kbegin( ( IsUpper<MT4>::value )
6958 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6959 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// End of the k-range: limited to i+2 for a lower A and/or to j+4 for an
// upper B, otherwise K.
6960 const size_t kend( ( IsLower<MT4>::value )
6961 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
6962 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// End of the packed part: kend masked down when a scalar tail is needed.
6964 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
// One accumulator per element of the 2x4 tile.
6967 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6971 const IntrinsicType a1( A.load(i ,k) );
6972 const IntrinsicType a2( A.load(i+1UL,k) );
6973 const IntrinsicType b1( B.load(k,j ) );
6974 const IntrinsicType b2( B.load(k,j+1UL) );
6975 const IntrinsicType b3( B.load(k,j+2UL) );
6976 const IntrinsicType b4( B.load(k,j+3UL) );
6977 xmm1 = xmm1 + a1 * b1;
6978 xmm2 = xmm2 + a1 * b2;
6979 xmm3 = xmm3 + a1 * b3;
6980 xmm4 = xmm4 + a1 * b4;
6981 xmm5 = xmm5 + a2 * b1;
6982 xmm6 = xmm6 + a2 * b2;
6983 xmm7 = xmm7 + a2 * b3;
6984 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator and subtract the scaled result from C.
6987 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
6988 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
6989 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
6990 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
6991 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
6992 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
6993 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
6994 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// Scalar tail over the remaining k values up to kend.
6996 for( ; remainder && k<kend; ++k ) {
6997 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
6998 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
6999 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7000 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7001 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7002 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7003 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7004 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile: rows i, i+1 and columns j, j+1 (same scheme as the 2x4 tile).
7008 for( ; (j+2UL) <= N; j+=2UL )
7010 const size_t kbegin( ( IsUpper<MT4>::value )
7011 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7012 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7013 const size_t kend( ( IsLower<MT4>::value )
7014 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7015 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7017 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7020 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7024 const IntrinsicType a1( A.load(i ,k) );
7025 const IntrinsicType a2( A.load(i+1UL,k) );
7026 const IntrinsicType b1( B.load(k,j ) );
7027 const IntrinsicType b2( B.load(k,j+1UL) );
7028 xmm1 = xmm1 + a1 * b1;
7029 xmm2 = xmm2 + a1 * b2;
7030 xmm3 = xmm3 + a2 * b1;
7031 xmm4 = xmm4 + a2 * b2;
7034 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7035 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7036 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7037 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7039 for( ; remainder && k<kend; ++k ) {
7040 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7041 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7042 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7043 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i, i+1 and the single column j. Here kend depends only on A.
7049 const size_t kbegin( ( IsUpper<MT4>::value )
7050 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7051 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7052 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7054 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7057 IntrinsicType xmm1, xmm2;
7061 const IntrinsicType b1( B.load(k,j) );
7062 xmm1 = xmm1 + A.load(i ,k) * b1;
7063 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7066 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7067 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7069 for( ; remainder && k<kend; ++k ) {
7070 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7071 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// ---- Single row i of C ----
// 1x4 tile: columns j .. j+3. Here kend depends only on B.
7080 for( ; (j+4UL) <= N; j+=4UL )
7082 const size_t kbegin( ( IsUpper<MT4>::value )
7083 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7084 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7085 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
7087 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7090 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7094 const IntrinsicType a1( A.load(i,k) );
7095 xmm1 = xmm1 + a1 * B.load(k,j );
7096 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7097 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
7098 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
7101 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7102 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7103 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
7104 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
7106 for( ; remainder && k<kend; ++k ) {
7107 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7108 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7109 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7110 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile: columns j, j+1.
7114 for( ; (j+2UL) <= N; j+=2UL )
7116 const size_t kbegin( ( IsUpper<MT4>::value )
7117 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7118 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7119 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7121 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7124 IntrinsicType xmm1, xmm2;
7128 const IntrinsicType a1( A.load(i,k) );
7129 xmm1 = xmm1 + a1 * B.load(k,j );
7130 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7133 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7134 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7136 for( ; remainder && k<kend; ++k ) {
7137 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7138 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single element (i,j). No kend is computed here; the packed
// part ends at K (masked down if a tail is needed) and the tail runs to K.
7144 const size_t kbegin( ( IsUpper<MT4>::value )
7145 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7146 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7148 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
7155 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7158 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7160 for( ; remainder && k<K; ++k ) {
7161 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7183 template<
typename MT3
7187 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7188 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7190 typedef IntrinsicTrait<ElementType> IT;
7192 const size_t M( A.rows() );
7193 const size_t N( B.columns() );
7194 const size_t K( A.columns() );
7196 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
7200 for( ; (i+4UL) <= M; i+=4UL )
7204 for( ; (j+2UL) <= N; j+=2UL )
7206 const size_t kbegin( ( IsUpper<MT4>::value )
7207 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7208 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7209 const size_t kend( ( IsLower<MT4>::value )
7210 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
7211 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7213 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7216 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7221 const IntrinsicType a1( A.load(i ,k) );
7222 const IntrinsicType a2( A.load(i+1UL,k) );
7223 const IntrinsicType a3( A.load(i+2UL,k) );
7224 const IntrinsicType a4( A.load(i+3UL,k) );
7225 const IntrinsicType b1( B.load(k,j ) );
7226 const IntrinsicType b2( B.load(k,j+1UL) );
7227 xmm1 = xmm1 + a1 * b1;
7228 xmm2 = xmm2 + a1 * b2;
7229 xmm3 = xmm3 + a2 * b1;
7230 xmm4 = xmm4 + a2 * b2;
7231 xmm5 = xmm5 + a3 * b1;
7232 xmm6 = xmm6 + a3 * b2;
7233 xmm7 = xmm7 + a4 * b1;
7234 xmm8 = xmm8 + a4 * b2;
7237 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7238 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7239 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7240 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7241 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7242 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7243 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7244 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
7246 for( ; remainder && k<kend; ++k ) {
7247 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7248 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7249 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7250 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7251 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7252 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7253 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7254 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
7260 const size_t kbegin( ( IsUpper<MT4>::value )
7261 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7262 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7263 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
7265 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7268 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7272 const IntrinsicType b1( B.load(k,j) );
7273 xmm1 = xmm1 + A.load(i ,k) * b1;
7274 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7275 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
7276 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
7279 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7280 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7281 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
7282 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
7284 for( ; remainder && k<kend; ++k ) {
7285 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7286 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7287 (~C)(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7288 (~C)(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
7293 for( ; (i+2UL) <= M; i+=2UL )
7297 for( ; (j+2UL) <= N; j+=2UL )
7299 const size_t kbegin( ( IsUpper<MT4>::value )
7300 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7301 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7302 const size_t kend( ( IsLower<MT4>::value )
7303 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7304 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7306 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7309 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7313 const IntrinsicType a1( A.load(i ,k) );
7314 const IntrinsicType a2( A.load(i+1UL,k) );
7315 const IntrinsicType b1( B.load(k,j ) );
7316 const IntrinsicType b2( B.load(k,j+1UL) );
7317 xmm1 = xmm1 + a1 * b1;
7318 xmm2 = xmm2 + a1 * b2;
7319 xmm3 = xmm3 + a2 * b1;
7320 xmm4 = xmm4 + a2 * b2;
7323 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7324 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7325 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7326 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7328 for( ; remainder && k<kend; ++k ) {
7329 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7330 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7331 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7332 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7338 const size_t kbegin( ( IsUpper<MT4>::value )
7339 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7340 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7341 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7343 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7346 IntrinsicType xmm1, xmm2;
7350 const IntrinsicType b1( B.load(k,j) );
7351 xmm1 = xmm1 + A.load(i ,k) * b1;
7352 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7355 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7356 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7358 for( ; remainder && k<kend; ++k ) {
7359 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7360 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7369 for( ; (j+2UL) <= N; j+=2UL )
7371 const size_t kbegin( ( IsUpper<MT4>::value )
7372 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7373 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7374 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7376 const size_t kpos( remainder ? ( kend &
size_t(-
IT::size) ) : kend );
7379 IntrinsicType xmm1, xmm2;
7383 const IntrinsicType a1( A.load(i,k) );
7384 xmm1 = xmm1 + a1 * B.load(k,j );
7385 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7388 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7389 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7391 for( ; remainder && k<kend; ++k ) {
7392 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7393 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7399 const size_t kbegin( ( IsUpper<MT4>::value )
7400 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7401 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7403 const size_t kpos( remainder ? ( K &
size_t(-
IT::size) ) : K );
7410 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7413 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7415 for( ; remainder && k<K; ++k ) {
7416 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7437 template<
typename MT3
7441 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7442 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7444 selectDefaultSubAssignKernel( C, A, B, scalar );
7463 template<
typename MT3
7467 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7468 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7471 selectSmallSubAssignKernel( ~C, A, B, scalar );
7490 template<
typename MT3
7494 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7495 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7498 selectSmallSubAssignKernel( ~C, A, B, scalar );
7516 template<
typename MT3
7520 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7521 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7523 selectLargeSubAssignKernel( C, A, B, scalar );
7542 template<
typename MT3
7546 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7547 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7551 if( IsTriangular<MT4>::value ) {
7553 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7554 subAssign( C, tmp );
7556 else if( IsTriangular<MT5>::value ) {
7558 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7559 subAssign( C, tmp );
7562 gemm( C, A, B, ET(-scalar), ET(1) );
7595 template<
typename MT
7597 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7598 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7605 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7606 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7608 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7611 else if( left.columns() == 0UL ) {
7645 template<
typename MT
7647 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7648 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7652 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
7664 const TmpType tmp( rhs );
7684 template<
typename MT
7686 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7687 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7694 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7695 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7697 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7734 template<
typename MT
7736 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7737 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7744 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7745 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7747 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7829 template<
typename T1
7831 inline const DMatTDMatMultExpr<T1,T2>
7855 template<
typename MT1,
typename MT2 >
7872 template<
typename MT1,
typename MT2 >
7889 template<
typename MT1,
typename MT2 >
7891 :
public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7907 template<
typename MT1,
typename MT2 >
7909 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
7925 template<
typename MT1,
typename MT2 >
7927 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7943 template<
typename MT1,
typename MT2 >
7945 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7946 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7962 template<
typename MT1,
typename MT2 >
7964 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7980 template<
typename MT1,
typename MT2 >
7982 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7998 template<
typename MT1,
typename MT2 >
8000 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8001 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
8017 template<
typename MT1,
typename MT2,
typename VT >
8022 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8023 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8024 IsDenseVector<VT>::value && IsColumnVector<VT>::value
8025 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
8026 , INVALID_TYPE >::Type Type;
8035 template<
typename MT1,
typename MT2,
typename VT >
8040 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8041 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8042 IsSparseVector<VT>::value && IsColumnVector<VT>::value
8043 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
8044 , INVALID_TYPE >::Type Type;
8053 template<
typename VT,
typename MT1,
typename MT2 >
8058 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8059 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8060 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8061 ,
typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8062 , INVALID_TYPE >::Type Type;
8071 template<
typename VT,
typename MT1,
typename MT2 >
8076 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8077 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8078 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8079 ,
typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8080 , INVALID_TYPE >::Type Type;
8089 template<
typename MT1,
typename MT2,
bool AF >
8094 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8095 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
8104 template<
typename MT1,
typename MT2 >
8109 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
8118 template<
typename MT1,
typename MT2 >
8123 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:423
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:223
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:231
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:225
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:277
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:413
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
DMatTDMatMultExpr< MT1, MT2 > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:221
Header file for the And class template.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:333
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
Header file for the TDVecSMatMultExprTrait class template.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:262
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:359
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:228
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:403
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
Header file for the Not class template.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatTDMatMultExpr.h:226
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:349
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:391
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
BLAZE_ALWAYS_INLINE int16_t sum(const simd_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:369
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:227
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:432
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:142
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:148
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:433
Header file for the IsUpper type trait.
Header file for exception macros.
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:379
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:224
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:237
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:240
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:234
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.